From dce4a407a24b04eebc6a376f8e62b41aaa7b071f Mon Sep 17 00:00:00 2001
From: Stephen Hines
Date: Thu, 29 May 2014 02:49:00 -0700
Subject: Update LLVM for 3.5 rebase (r209712).

Change-Id: I149556c940fb7dc92d075273c87ff584f400941f
---
 .arcconfig | 2 +-
 .clang-format | 1 +
 .gitignore | 3 +-
 Android.mk | 1 +
 CMakeLists.txt | 9 +-
 CREDITS.TXT | 1 +
 Makefile.rules | 25 +-
 README.txt | 1 -
 autoconf/configure.ac | 11 +-
 bindings/ocaml/Makefile | 2 +-
 bindings/ocaml/all_backends/Makefile | 21 +
 bindings/ocaml/all_backends/all_backends_ocaml.c | 32 +
 bindings/ocaml/all_backends/llvm_all_backends.ml | 10 +
 bindings/ocaml/all_backends/llvm_all_backends.mli | 11 +
 bindings/ocaml/llvm/META.llvm.in | 8 +
 bindings/python/llvm/object.py | 25 +-
 bindings/python/llvm/tests/test_object.py | 2 +-
 cmake/config-ix.cmake | 19 +-
 cmake/modules/AddLLVM.cmake | 16 +-
 cmake/modules/AddSphinxTarget.cmake | 56 +
 cmake/modules/FindSphinx.cmake | 25 +
 cmake/modules/HandleLLVMOptions.cmake | 28 +-
 cmake/modules/LLVMConfig.cmake.in | 1 +
 cmake/modules/LLVMConfigVersion.cmake.in | 14 +-
 cmake/modules/Makefile | 4 +
 configure | 12 +-
 docs/ARM-BE-bitcastfail.png | Bin 0 -> 29373 bytes
 docs/ARM-BE-bitcastsuccess.png | Bin 0 -> 41468 bytes
 docs/ARM-BE-ld1.png | Bin 0 -> 22561 bytes
 docs/ARM-BE-ldr.png | Bin 0 -> 16516 bytes
 docs/AliasAnalysis.rst | 20 +-
 docs/BigEndianNEON.rst | 205 +
 docs/BitCodeFormat.rst | 2 -
 docs/BlockFrequencyTerminology.rst | 130 +
 docs/BranchWeightMetadata.rst | 9 +-
 docs/Bugpoint.rst | 2 +-
 docs/CMake.rst | 40 +-
 docs/CMakeLists.txt | 15 +
 docs/CodeGenerator.rst | 14 +-
 docs/CodingStandards.rst | 21 +-
 docs/CommandGuide/index.rst | 1 +
 docs/CommandGuide/llvm-cov.rst | 114 +-
 docs/CommandGuide/llvm-dwarfdump.rst | 30 +
 docs/CommandGuide/llvm-symbolizer.rst | 7 +-
 docs/CommandGuide/tblgen.rst | 3 +-
 docs/CompilerWriterInfo.rst | 2 +-
 docs/DeveloperPolicy.rst | 2 +-
 docs/Extensions.rst | 31 +
 docs/GettingStartedVS.rst | 8 +-
 docs/LLVMBuild.rst | 4 +-
 docs/LangRef.rst | 187 +-
 docs/Passes.rst | 28 +-
 docs/Phabricator.rst | 9 +-
 docs/ProgrammersManual.rst | 90 +-
 docs/README.txt | 9 +
 docs/ReleaseNotes.rst | 3 +
 docs/SegmentedStacks.rst | 5 +-
 docs/TableGen/LangIntro.rst | 10 +-
 docs/TableGen/LangRef.rst | 6 +-
 docs/WritingAnLLVMBackend.rst | 2 +-
 docs/YamlIO.rst | 40 +-
 docs/index.rst | 13 +-
 examples/BrainF/BrainFDriver.cpp | 1 +
 include/llvm-c/Core.h | 54 +-
 include/llvm-c/Object.h | 1 -
 include/llvm-c/Transforms/PassManagerBuilder.h | 1 -
 include/llvm-c/lto.h | 22 +-
 include/llvm-c/module.modulemap | 5 +
 include/llvm/ADT/APFloat.h | 9 +-
 include/llvm/ADT/ArrayRef.h | 18 +-
 include/llvm/ADT/BitVector.h | 8 +-
 include/llvm/ADT/DenseMap.h | 8 +-
 include/llvm/ADT/DepthFirstIterator.h | 13 +
 include/llvm/ADT/EquivalenceClasses.h | 12 +-
 include/llvm/ADT/FoldingSet.h | 4 +-
 include/llvm/ADT/Hashing.h | 6 +-
 include/llvm/ADT/ImmutableIntervalMap.h | 248 -
 include/llvm/ADT/ImmutableMap.h | 4 +-
 include/llvm/ADT/ImmutableSet.h | 21 +-
 include/llvm/ADT/IntervalMap.h | 6 +-
 include/llvm/ADT/IntrusiveRefCntPtr.h | 8 +-
 include/llvm/ADT/OwningPtr.h | 10 +-
 include/llvm/ADT/PointerUnion.h | 46 +
 include/llvm/ADT/PostOrderIterator.h | 4 +-
 include/llvm/ADT/SCCIterator.h | 225 +-
 include/llvm/ADT/STLExtras.h | 155 +-
 include/llvm/ADT/ScopedHashTable.h | 14 +-
 include/llvm/ADT/SmallVector.h | 115 +-
 include/llvm/ADT/SparseMultiSet.h | 2 +-
 include/llvm/ADT/SparseSet.h | 2 +-
 include/llvm/ADT/Statistic.h | 2 +-
 include/llvm/ADT/StringExtras.h | 2 +-
 include/llvm/ADT/StringMap.h | 66 +-
 include/llvm/ADT/StringRef.h | 11 +-
 include/llvm/ADT/StringSwitch.h | 2 +-
 include/llvm/ADT/TinyPtrVector.h | 14 +-
 include/llvm/ADT/Triple.h | 9 +-
 include/llvm/ADT/Twine.h | 2 +-
 include/llvm/ADT/edit_distance.h | 4 +-
 include/llvm/ADT/ilist.h | 18 +-
 include/llvm/ADT/ilist_node.h | 12 +-
 include/llvm/ADT/iterator.h | 244 +
 include/llvm/ADT/iterator_range.h | 8 +
 include/llvm/Analysis/AliasAnalysis.h | 12 +-
 include/llvm/Analysis/AliasSetTracker.h | 25 +-
 include/llvm/Analysis/BlockFrequencyImpl.h | 379 -
 include/llvm/Analysis/BlockFrequencyInfo.h | 12 +-
 include/llvm/Analysis/BlockFrequencyInfoImpl.h | 1859 +++
 include/llvm/Analysis/BranchProbabilityInfo.h | 2 +-
 include/llvm/Analysis/CFG.h | 8 +-
 include/llvm/Analysis/CGSCCPassManager.h | 591 +
 include/llvm/Analysis/ConstantFolding.h | 22 +-
 include/llvm/Analysis/DOTGraphTraitsPass.h | 1 +
 include/llvm/Analysis/DependenceAnalysis.h | 13 +-
 include/llvm/Analysis/DominanceFrontier.h | 2 +-
 include/llvm/Analysis/IVUsers.h | 2 +-
 include/llvm/Analysis/InstructionSimplify.h | 192 +-
 include/llvm/Analysis/IntervalPartition.h | 6 +-
 include/llvm/Analysis/LazyCallGraph.h | 433 +-
 include/llvm/Analysis/LazyValueInfo.h | 4 +-
 include/llvm/Analysis/LibCallAliasAnalysis.h | 2 +-
 include/llvm/Analysis/LibCallSemantics.h | 2 +-
 include/llvm/Analysis/Loads.h | 7 +-
 include/llvm/Analysis/LoopInfo.h | 49 +-
 include/llvm/Analysis/LoopInfoImpl.h | 31 +-
 include/llvm/Analysis/MemoryBuiltins.h | 2 +-
 include/llvm/Analysis/MemoryDependenceAnalysis.h | 9 +-
 include/llvm/Analysis/PHITransAddr.h | 3 +-
 include/llvm/Analysis/PtrUseVisitor.h | 12 +-
 include/llvm/Analysis/RegionInfo.h | 9 +-
 include/llvm/Analysis/ScalarEvolution.h | 17 +-
 include/llvm/Analysis/ScalarEvolutionExpander.h | 6 +-
 include/llvm/Analysis/ScalarEvolutionExpressions.h | 88 +-
 include/llvm/Analysis/SparsePropagation.h | 2 +-
 include/llvm/Analysis/TargetTransformInfo.h | 6 +-
 include/llvm/Analysis/ValueTracking.h | 37 +-
 include/llvm/Bitcode/BitstreamReader.h | 12 +-
 include/llvm/Bitcode/BitstreamWriter.h | 8 +-
 include/llvm/Bitcode/LLVMBitCodes.h | 5 +-
 include/llvm/Bitcode/ReaderWriter.h | 4 +-
 include/llvm/CMakeLists.txt | 6 +
 include/llvm/CodeGen/Analysis.h | 2 +-
 include/llvm/CodeGen/AsmPrinter.h | 957 +-
 include/llvm/CodeGen/CallingConvLower.h | 56 +-
 include/llvm/CodeGen/CommandFlags.h | 32 +-
 include/llvm/CodeGen/EdgeBundles.h | 5 -
 include/llvm/CodeGen/FastISel.h | 6 +
 include/llvm/CodeGen/FunctionLoweringInfo.h | 4 +-
 include/llvm/CodeGen/GCMetadata.h | 5 +-
 include/llvm/CodeGen/GCStrategy.h | 4 +-
 include/llvm/CodeGen/ISDOpcodes.h | 5 +
 include/llvm/CodeGen/JITCodeEmitter.h | 6 +-
 include/llvm/CodeGen/LatencyPriorityQueue.h | 2 +-
 include/llvm/CodeGen/LexicalScopes.h | 218 +-
 include/llvm/CodeGen/LinkAllCodegenComponents.h | 15 +-
 include/llvm/CodeGen/LiveInterval.h | 24 +-
 include/llvm/CodeGen/LiveIntervalAnalysis.h | 6 +-
 include/llvm/CodeGen/LiveIntervalUnion.h | 6 +-
 include/llvm/CodeGen/LivePhysRegs.h | 2 +-
 include/llvm/CodeGen/LiveRangeEdit.h | 4 +-
 include/llvm/CodeGen/LiveRegMatrix.h | 3 +-
 include/llvm/CodeGen/LiveStackAnalysis.h | 2 +-
 include/llvm/CodeGen/MachineBasicBlock.h | 37 +-
 include/llvm/CodeGen/MachineBlockFrequencyInfo.h | 12 +-
 include/llvm/CodeGen/MachineCodeEmitter.h | 2 +-
 include/llvm/CodeGen/MachineCodeInfo.h | 2 +-
 include/llvm/CodeGen/MachineFrameInfo.h | 2 +-
 include/llvm/CodeGen/MachineFunction.h | 17 +-
 include/llvm/CodeGen/MachineInstr.h | 89 +-
 include/llvm/CodeGen/MachineInstrBuilder.h | 2 +-
 include/llvm/CodeGen/MachineInstrBundle.h | 2 +-
 include/llvm/CodeGen/MachineMemOperand.h | 31 +-
 include/llvm/CodeGen/MachineModuleInfo.h | 6 +-
 include/llvm/CodeGen/MachineOperand.h | 20 +-
 include/llvm/CodeGen/MachinePassRegistry.h | 4 +-
 include/llvm/CodeGen/MachinePostDominators.h | 2 +-
 include/llvm/CodeGen/MachineRegisterInfo.h | 66 +-
 include/llvm/CodeGen/MachineSSAUpdater.h | 2 +-
 include/llvm/CodeGen/MachineScheduler.h | 39 +-
 include/llvm/CodeGen/MachineTraceMetrics.h | 2 +-
 include/llvm/CodeGen/MachineValueType.h | 1 +
 include/llvm/CodeGen/PBQP/CostAllocator.h | 2 +-
 include/llvm/CodeGen/PBQP/Graph.h | 8 +-
 include/llvm/CodeGen/PBQP/RegAllocSolver.h | 4 +-
 include/llvm/CodeGen/Passes.h | 21 +-
 include/llvm/CodeGen/PseudoSourceValue.h | 31 +-
 include/llvm/CodeGen/RegAllocPBQP.h | 2 +-
 include/llvm/CodeGen/RegisterClassInfo.h | 9 +-
 include/llvm/CodeGen/RegisterPressure.h | 15 +-
 include/llvm/CodeGen/RegisterScavenging.h | 6 +-
 include/llvm/CodeGen/ResourcePriorityQueue.h | 2 +-
 include/llvm/CodeGen/ScheduleDAG.h | 71 +-
 include/llvm/CodeGen/ScheduleDAGInstrs.h | 13 +-
 include/llvm/CodeGen/ScoreboardHazardRecognizer.h | 4 +-
 include/llvm/CodeGen/SelectionDAG.h | 134 +-
 include/llvm/CodeGen/SelectionDAGISel.h | 4 +-
 include/llvm/CodeGen/SelectionDAGNodes.h | 83 +-
 include/llvm/CodeGen/SlotIndexes.h | 22 +-
 include/llvm/CodeGen/StackMaps.h | 40 +-
 include/llvm/CodeGen/StackProtector.h | 5 +-
 .../llvm/CodeGen/TargetLoweringObjectFileImpl.h | 17 +-
 include/llvm/CodeGen/TargetSchedule.h | 6 +-
 include/llvm/CodeGen/ValueTypes.h | 6 +-
 include/llvm/CodeGen/VirtRegMap.h | 2 +-
 include/llvm/DebugInfo/DIContext.h | 50 +-
 include/llvm/DebugInfo/DWARFFormValue.h | 4 +-
 include/llvm/ExecutionEngine/ExecutionEngine.h | 55 +-
 include/llvm/ExecutionEngine/JITEventListener.h | 8 +-
 include/llvm/ExecutionEngine/ObjectImage.h | 8 +
 include/llvm/ExecutionEngine/RTDyldMemoryManager.h | 2 +-
 include/llvm/ExecutionEngine/RuntimeDyld.h | 2 +-
 .../llvm/ExecutionEngine/SectionMemoryManager.h | 2 +-
 include/llvm/IR/Argument.h | 6 +-
 include/llvm/IR/Attributes.h | 14 +-
 include/llvm/IR/BasicBlock.h | 13 +-
 include/llvm/IR/CallSite.h | 13 +-
 include/llvm/IR/CallingConv.h | 8 +-
 include/llvm/IR/ConstantRange.h | 4 +-
 include/llvm/IR/Constants.h | 8 +-
 include/llvm/IR/DIBuilder.h | 54 +-
 include/llvm/IR/DataLayout.h | 17 +-
 include/llvm/IR/DebugInfo.h | 51 +-
 include/llvm/IR/DebugLoc.h | 5 +-
 include/llvm/IR/DerivedTypes.h | 6 +-
 include/llvm/IR/DiagnosticInfo.h | 192 +-
 include/llvm/IR/Dominators.h | 2 +-
 include/llvm/IR/Function.h | 18 +-
 include/llvm/IR/GVMaterializer.h | 20 +-
 include/llvm/IR/GetElementPtrTypeIterator.h | 4 +-
 include/llvm/IR/GlobalAlias.h | 47 +-
 include/llvm/IR/GlobalObject.h | 58 +
 include/llvm/IR/GlobalValue.h | 143 +-
 include/llvm/IR/GlobalVariable.h | 8 +-
 include/llvm/IR/IRBuilder.h | 70 +-
 include/llvm/IR/InstrTypes.h | 36 +-
 include/llvm/IR/Instruction.h | 6 +-
 include/llvm/IR/Instructions.h | 271 +-
 include/llvm/IR/Intrinsics.td | 5 +-
 include/llvm/IR/IntrinsicsAArch64.td | 983 +-
 include/llvm/IR/IntrinsicsARM.td | 8 +-
 include/llvm/IR/IntrinsicsARM64.td | 628 -
 include/llvm/IR/IntrinsicsNVVM.td | 920 ++
 include/llvm/IR/IntrinsicsX86.td | 197 +-
 include/llvm/IR/LLVMContext.h | 36 +-
 include/llvm/IR/LegacyPassManagers.h | 12 +-
 include/llvm/IR/LegacyPassNameParser.h | 8 +-
 include/llvm/IR/MDBuilder.h | 120 +-
 include/llvm/IR/Metadata.h | 4 +-
 include/llvm/IR/Module.h | 148 +-
 include/llvm/IR/PassManager.h | 12 +-
 include/llvm/IR/PredIteratorCache.h | 2 +-
 include/llvm/IR/SymbolTableListTraits.h | 8 +-
 include/llvm/IR/Type.h | 6 +-
 include/llvm/IR/Use.h | 4 +-
 include/llvm/IR/User.h | 37 +-
 include/llvm/IR/Value.h | 68 +-
 include/llvm/IR/ValueHandle.h | 10 +-
 include/llvm/IR/ValueMap.h | 6 +-
 include/llvm/IR/Verifier.h | 17 +-
 include/llvm/InitializePasses.h | 3 +
 include/llvm/LTO/LTOCodeGenerator.h | 42 +-
 include/llvm/LTO/LTOModule.h | 152 +-
 include/llvm/LineEditor/LineEditor.h | 4 +-
 include/llvm/LinkAllPasses.h | 17 +-
 include/llvm/MC/MCAsmInfo.h | 2 +-
 include/llvm/MC/MCAsmLayout.h | 8 +
 include/llvm/MC/MCAssembler.h | 81 +-
 include/llvm/MC/MCContext.h | 37 +-
 include/llvm/MC/MCDisassembler.h | 33 +-
 include/llvm/MC/MCDwarf.h | 18 +-
 include/llvm/MC/MCELFStreamer.h | 7 +-
 include/llvm/MC/MCELFSymbolFlags.h | 8 +-
 include/llvm/MC/MCExpr.h | 15 +-
 include/llvm/MC/MCExternalSymbolizer.h | 2 +-
 include/llvm/MC/MCFixup.h | 2 -
 include/llvm/MC/MCFunction.h | 12 +-
 include/llvm/MC/MCInst.h | 8 +-
 include/llvm/MC/MCInstPrinter.h | 5 +-
 include/llvm/MC/MCInstrDesc.h | 6 +-
 include/llvm/MC/MCInstrItineraries.h | 6 +-
 include/llvm/MC/MCModule.h | 5 +-
 include/llvm/MC/MCObjectFileInfo.h | 9 +-
 include/llvm/MC/MCObjectStreamer.h | 15 +-
 include/llvm/MC/MCParser/AsmLexer.h | 2 +-
 include/llvm/MC/MCParser/MCAsmParser.h | 2 +-
 include/llvm/MC/MCParser/MCParsedAsmOperand.h | 2 +-
 include/llvm/MC/MCRegisterInfo.h | 8 +-
 include/llvm/MC/MCSchedule.h | 21 +-
 include/llvm/MC/MCSectionCOFF.h | 5 +-
 include/llvm/MC/MCSectionELF.h | 5 +-
 include/llvm/MC/MCStreamer.h | 49 +-
 include/llvm/MC/MCSubtargetInfo.h | 13 +-
 include/llvm/MC/MCSymbol.h | 8 +-
 include/llvm/MC/MCTargetAsmParser.h | 16 +-
 include/llvm/MC/MCTargetOptions.h | 54 +
 include/llvm/MC/MCTargetOptionsCommandFlags.h | 44 +
 include/llvm/MC/MCValue.h | 12 +-
 include/llvm/MC/MCWin64EH.h | 10 +-
 include/llvm/MC/MCWinCOFFObjectWriter.h | 1 +
 include/llvm/MC/MCWinCOFFStreamer.h | 75 +
 include/llvm/MC/SubtargetFeature.h | 13 +-
 include/llvm/Object/Archive.h | 8 +-
 include/llvm/Object/Binary.h | 3 +-
 include/llvm/Object/COFF.h | 9 +-
 include/llvm/Object/COFFYAML.h | 9 +-
 include/llvm/Object/ELF.h | 112 +-
 include/llvm/Object/ELFObjectFile.h | 25 +-
 include/llvm/Object/ELFYAML.h | 47 +-
 include/llvm/Object/MachO.h | 6 +-
 include/llvm/Object/MachOUniversal.h | 7 +-
 include/llvm/Object/ObjectFile.h | 55 +-
 include/llvm/Object/StringTableBuilder.h | 59 +
 include/llvm/Object/SymbolicFile.h | 5 +-
 include/llvm/Object/YAML.h | 1 +
 include/llvm/Option/Arg.h | 13 +-
 include/llvm/Option/ArgList.h | 23 +-
 include/llvm/Option/OptSpecifier.h | 2 +
 include/llvm/Option/Option.h | 2 +-
 include/llvm/Pass.h | 3 +-
 include/llvm/PassAnalysisSupport.h | 4 +-
 include/llvm/PassRegistry.h | 2 +-
 include/llvm/PassSupport.h | 8 +-
 include/llvm/ProfileData/InstrProf.h | 1 +
 include/llvm/ProfileData/InstrProfReader.h | 107 +-
 include/llvm/ProfileData/InstrProfWriter.h | 2 +-
 include/llvm/Support/ARMBuildAttributes.h | 13 +
 include/llvm/Support/Allocator.h | 410 +-
 include/llvm/Support/ArrayRecycler.h | 4 +-
 include/llvm/Support/BlockFrequency.h | 10 -
 include/llvm/Support/BranchProbability.h | 34 +-
 include/llvm/Support/COFF.h | 6 +-
 include/llvm/Support/Casting.h | 10 +-
 include/llvm/Support/CommandLine.h | 29 +-
 include/llvm/Support/Compression.h | 7 +-
 include/llvm/Support/CrashRecoveryContext.h | 35 +-
 include/llvm/Support/Debug.h | 16 +-
 include/llvm/Support/DynamicLibrary.h | 4 +-
 include/llvm/Support/ELF.h | 17 +-
 include/llvm/Support/ErrorHandling.h | 10 +-
 include/llvm/Support/FileOutputBuffer.h | 4 -
 include/llvm/Support/FileSystem.h | 29 +-
 include/llvm/Support/FileUtilities.h | 2 +-
 include/llvm/Support/FormattedStream.h | 8 +-
 include/llvm/Support/GCOV.h | 50 +-
 include/llvm/Support/GenericDomTree.h | 45 +-
 include/llvm/Support/GenericDomTreeConstruction.h | 12 +-
 include/llvm/Support/GraphWriter.h | 9 +-
 include/llvm/Support/LEB128.h | 2 +-
 include/llvm/Support/LineIterator.h | 5 +-
 include/llvm/Support/LockFileManager.h | 12 +-
 include/llvm/Support/MachO.h | 48 +
 include/llvm/Support/ManagedStatic.h | 2 +-
 include/llvm/Support/Memory.h | 10 +-
 include/llvm/Support/MemoryBuffer.h | 36 +-
 include/llvm/Support/OnDiskHashTable.h | 571 +
 include/llvm/Support/Path.h | 5 +
 include/llvm/Support/Program.h | 26 +-
 include/llvm/Support/Regex.h | 7 +-
 include/llvm/Support/Registry.h | 15 +-
 include/llvm/Support/SMLoc.h | 4 +-
 include/llvm/Support/SaveAndRestore.h | 48 +-
 include/llvm/Support/Signals.h | 2 +-
 include/llvm/Support/SourceMgr.h | 9 +-
 include/llvm/Support/StreamableMemoryObject.h | 2 +-
 include/llvm/Support/StringPool.h | 8 +-
 include/llvm/Support/TargetRegistry.h | 85 +-
 include/llvm/Support/Timer.h | 14 +-
 include/llvm/Support/Unicode.h | 5 +
 include/llvm/Support/UnicodeCharRanges.h | 4 +
 include/llvm/Support/YAMLParser.h | 181 +-
 include/llvm/Support/YAMLTraits.h | 157 +-
 include/llvm/Support/circular_raw_ostream.h | 8 +-
 include/llvm/Support/raw_ostream.h | 15 +-
 include/llvm/Support/system_error.h | 4 +-
 include/llvm/TableGen/Error.h | 1 -
 include/llvm/TableGen/Main.h | 1 -
 include/llvm/TableGen/Record.h | 184 +-
 include/llvm/TableGen/StringMatcher.h | 12 +-
 include/llvm/TableGen/StringToOffsetTable.h | 10 +-
 include/llvm/Target/Target.td | 9 +-
 include/llvm/Target/TargetCallingConv.h | 12 +-
 include/llvm/Target/TargetCallingConv.td | 5 +
 include/llvm/Target/TargetFrameLowering.h | 6 +-
 include/llvm/Target/TargetInstrInfo.h | 18 +-
 include/llvm/Target/TargetIntrinsicInfo.h | 4 +-
 include/llvm/Target/TargetLowering.h | 247 +-
 include/llvm/Target/TargetLoweringObjectFile.h | 19 +-
 include/llvm/Target/TargetMachine.h | 87 +-
 include/llvm/Target/TargetOptions.h | 27 +-
 include/llvm/Target/TargetRegisterInfo.h | 18 +-
 include/llvm/Target/TargetSchedule.td | 2 +
 include/llvm/Target/TargetSubtargetInfo.h | 12 +-
 include/llvm/Transforms/IPO.h | 3 -
 include/llvm/Transforms/IPO/PassManagerBuilder.h | 11 +-
 include/llvm/Transforms/Instrumentation.h | 4 +-
 include/llvm/Transforms/ObjCARC.h | 1 -
 include/llvm/Transforms/Scalar.h | 24 +-
 include/llvm/Transforms/Utils/BasicBlockUtils.h | 24 +-
 include/llvm/Transforms/Utils/BuildLibCalls.h | 3 +-
 include/llvm/Transforms/Utils/Cloning.h | 38 +-
 include/llvm/Transforms/Utils/CmpInstAnalysis.h | 1 -
 include/llvm/Transforms/Utils/CodeExtractor.h | 3 +-
 include/llvm/Transforms/Utils/CtorUtils.h | 32 +
 include/llvm/Transforms/Utils/IntegerDivision.h | 8 +-
 include/llvm/Transforms/Utils/Local.h | 34 +-
 include/llvm/Transforms/Utils/LoopUtils.h | 8 +-
 include/llvm/Transforms/Utils/PromoteMemToReg.h | 2 +-
 include/llvm/Transforms/Utils/SSAUpdater.h | 12 +-
 include/llvm/Transforms/Utils/SSAUpdaterImpl.h | 16 +-
 include/llvm/Transforms/Utils/SimplifyIndVar.h | 5 +-
 include/llvm/Transforms/Utils/SimplifyLibCalls.h | 1 +
 .../llvm/Transforms/Utils/UnifyFunctionExitNodes.h | 3 +-
 include/llvm/Transforms/Utils/UnrollLoop.h | 1 -
 include/llvm/Transforms/Utils/ValueMapper.h | 31 +-
 include/llvm/Transforms/Utils/VectorUtils.h | 180 +
 include/llvm/Transforms/Vectorize.h | 3 +
 include/llvm/module.modulemap | 177 +
 include/llvm/module.modulemap.build | 5 +
 lib/Analysis/AliasAnalysis.cpp | 2 +-
 lib/Analysis/AliasAnalysisCounter.cpp | 4 +-
 lib/Analysis/AliasSetTracker.cpp | 18 +-
 lib/Analysis/Analysis.cpp | 5 +-
 lib/Analysis/Android.mk | 1 +
 lib/Analysis/BasicAliasAnalysis.cpp | 43 +-
 lib/Analysis/BlockFrequencyInfo.cpp | 13 +-
 lib/Analysis/BlockFrequencyInfoImpl.cpp | 995 ++
 lib/Analysis/BranchProbabilityInfo.cpp | 18 +-
 lib/Analysis/CFG.cpp | 8 +-
 lib/Analysis/CFGPrinter.cpp | 13 +-
 lib/Analysis/CGSCCPassManager.cpp | 167 +
 lib/Analysis/CMakeLists.txt | 2 +
 lib/Analysis/ConstantFolding.cpp | 160 +-
 lib/Analysis/CostModel.cpp | 40 +-
 lib/Analysis/Delinearization.cpp | 32 +-
 lib/Analysis/DependenceAnalysis.cpp | 164 +-
 lib/Analysis/DominanceFrontier.cpp | 4 +-
 lib/Analysis/IPA/CallGraph.cpp | 12 +-
 lib/Analysis/IPA/CallGraphSCCPass.cpp | 27 +-
 lib/Analysis/IPA/GlobalsModRef.cpp | 21 +-
 lib/Analysis/IPA/InlineCost.cpp | 64 +-
 lib/Analysis/IVUsers.cpp | 13 +-
 lib/Analysis/InstCount.cpp | 5 +-
 lib/Analysis/InstructionSimplify.cpp | 203 +-
 lib/Analysis/IntervalPartition.cpp | 2 +-
 lib/Analysis/LazyCallGraph.cpp | 673 +-
 lib/Analysis/LazyValueInfo.cpp | 21 +-
 lib/Analysis/LibCallAliasAnalysis.cpp | 2 +-
 lib/Analysis/LibCallSemantics.cpp | 4 +-
 lib/Analysis/Lint.cpp | 40 +-
 lib/Analysis/Loads.cpp | 10 +-
 lib/Analysis/LoopInfo.cpp | 30 +-
 lib/Analysis/LoopPass.cpp | 9 +-
 lib/Analysis/MemDepPrinter.cpp | 14 +-
 lib/Analysis/MemoryBuiltins.cpp | 57 +-
 lib/Analysis/MemoryDependenceAnalysis.cpp | 31 +-
 lib/Analysis/NoAliasAnalysis.cpp | 2 +-
 lib/Analysis/PHITransAddr.cpp | 44 +-
 lib/Analysis/PostDominators.cpp | 4 +-
 lib/Analysis/RegionInfo.cpp | 85 +-
 lib/Analysis/RegionPass.cpp | 18 +-
 lib/Analysis/RegionPrinter.cpp | 22 +-
 lib/Analysis/ScalarEvolution.cpp | 1167 +-
 lib/Analysis/ScalarEvolutionAliasAnalysis.cpp | 8 +-
 lib/Analysis/ScalarEvolutionExpander.cpp | 51 +-
 lib/Analysis/ScalarEvolutionNormalization.cpp | 2 +-
 lib/Analysis/SparsePropagation.cpp | 7 +-
 lib/Analysis/TargetTransformInfo.cpp | 15 +-
 lib/Analysis/TypeBasedAliasAnalysis.cpp | 30 +-
 lib/Analysis/ValueTracking.cpp | 264 +-
 lib/AsmParser/LLLexer.cpp | 19 +-
 lib/AsmParser/LLLexer.h | 4 +
 lib/AsmParser/LLParser.cpp | 302 +-
 lib/AsmParser/LLParser.h | 10 +-
 lib/AsmParser/LLToken.h | 6 +-
 lib/AsmParser/Parser.cpp | 8 +-
 lib/AsmParser/module.modulemap | 1 +
 lib/Bitcode/Reader/BitReader.cpp | 4 +-
 lib/Bitcode/Reader/BitcodeReader.cpp | 222 +-
 lib/Bitcode/Reader/BitcodeReader.h | 20 +-
 lib/Bitcode/Reader/BitstreamReader.cpp | 2 +-
 lib/Bitcode/Writer/BitWriter.cpp | 1 +
 lib/Bitcode/Writer/BitcodeWriter.cpp | 118 +-
 lib/Bitcode/module.modulemap | 1 +
 lib/CodeGen/AggressiveAntiDepBreaker.cpp | 52 +-
 lib/CodeGen/AggressiveAntiDepBreaker.h | 3 +-
 lib/CodeGen/AllocationOrder.cpp | 3 +-
 lib/CodeGen/Analysis.cpp | 6 +-
 lib/CodeGen/Android.mk | 1 +
 lib/CodeGen/AsmPrinter/ARMException.cpp | 23 +-
 lib/CodeGen/AsmPrinter/AddressPool.cpp | 45 +
 lib/CodeGen/AsmPrinter/AddressPool.h | 52 +
 lib/CodeGen/AsmPrinter/Android.mk | 12 +-
 lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 287 +-
 lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp | 74 +-
 lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 25 +-
 lib/CodeGen/AsmPrinter/CMakeLists.txt | 4 +
 lib/CodeGen/AsmPrinter/DIE.cpp | 21 +-
 lib/CodeGen/AsmPrinter/DIE.h | 37 +-
 lib/CodeGen/AsmPrinter/DIEHash.cpp | 18 +-
 lib/CodeGen/AsmPrinter/DIEHash.h | 2 +-
 .../AsmPrinter/DbgValueHistoryCalculator.cpp | 175 +
 lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h | 54 +
 lib/CodeGen/AsmPrinter/DebugLocEntry.h | 138 +-
 lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp | 39 +-
 lib/CodeGen/AsmPrinter/DwarfAccelTable.h | 29 +-
 lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 1609 +-
 lib/CodeGen/AsmPrinter/DwarfDebug.h | 211 +-
 lib/CodeGen/AsmPrinter/DwarfException.cpp | 28 +-
 lib/CodeGen/AsmPrinter/DwarfFile.cpp | 156 +
 lib/CodeGen/AsmPrinter/DwarfFile.h | 84 +
 lib/CodeGen/AsmPrinter/DwarfStringPool.cpp | 74 +
 lib/CodeGen/AsmPrinter/DwarfStringPool.h | 55 +
 lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 838 +-
 lib/CodeGen/AsmPrinter/DwarfUnit.h | 174 +-
 lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp | 27 +-
 lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h | 4 +-
 lib/CodeGen/AtomicExpandLoadLinkedPass.cpp | 337 +
 lib/CodeGen/BasicTargetTransformInfo.cpp | 90 +-
 lib/CodeGen/BranchFolding.cpp | 74 +-
 lib/CodeGen/CMakeLists.txt | 1 +
 lib/CodeGen/CalcSpillWeights.cpp | 14 +-
 lib/CodeGen/CallingConvLower.cpp | 12 +-
 lib/CodeGen/CodeGen.cpp | 1 +
 lib/CodeGen/CodeGenPrepare.cpp | 457 +-
 lib/CodeGen/CriticalAntiDepBreaker.cpp | 34 +-
 lib/CodeGen/DFAPacketizer.cpp | 4 +-
 lib/CodeGen/DeadMachineInstructionElim.cpp | 5 +-
 lib/CodeGen/DwarfEHPrepare.cpp | 11 +-
 lib/CodeGen/EarlyIfConversion.cpp | 19 +-
 lib/CodeGen/EdgeBundles.cpp | 32 +-
 lib/CodeGen/ExecutionDepsFix.cpp | 17 +-
 lib/CodeGen/ExpandISelPseudos.cpp | 3 +-
 lib/CodeGen/ExpandPostRAPseudos.cpp | 3 +-
 lib/CodeGen/GCMetadata.cpp | 17 +-
 lib/CodeGen/GCStrategy.cpp | 14 +-
 lib/CodeGen/IfConversion.cpp | 44 +-
 lib/CodeGen/InlineSpiller.cpp | 17 +-
 lib/CodeGen/InterferenceCache.cpp | 3 +-
 lib/CodeGen/InterferenceCache.h | 20 +-
 lib/CodeGen/IntrinsicLowering.cpp | 6 +-
 lib/CodeGen/LLVMTargetMachine.cpp | 63 +-
 lib/CodeGen/LatencyPriorityQueue.cpp | 11 +-
 lib/CodeGen/LexicalScopes.cpp | 142 +-
 lib/CodeGen/LiveDebugVariables.cpp | 37 +-
 lib/CodeGen/LiveInterval.cpp | 12 +-
 lib/CodeGen/LiveIntervalAnalysis.cpp | 15 +-
 lib/CodeGen/LiveIntervalUnion.cpp | 7 +-
 lib/CodeGen/LiveRangeCalc.cpp | 11 +-
 lib/CodeGen/LiveRangeCalc.h | 7 +-
 lib/CodeGen/LiveRangeEdit.cpp | 11 +-
 lib/CodeGen/LiveRegMatrix.cpp | 3 +-
 lib/CodeGen/LiveStackAnalysis.cpp | 3 +-
 lib/CodeGen/LiveVariables.cpp | 43 +-
 lib/CodeGen/LocalStackSlotAllocation.cpp | 3 +-
 lib/CodeGen/MachineBasicBlock.cpp | 52 +-
 lib/CodeGen/MachineBlockFrequencyInfo.cpp | 17 +-
 lib/CodeGen/MachineBlockPlacement.cpp | 37 +-
 lib/CodeGen/MachineBranchProbabilityInfo.cpp | 4 +-
 lib/CodeGen/MachineCSE.cpp | 3 +-
 lib/CodeGen/MachineCopyPropagation.cpp | 3 +-
 lib/CodeGen/MachineFunction.cpp | 35 +-
 lib/CodeGen/MachineFunctionAnalysis.cpp | 4 +-
 lib/CodeGen/MachineInstr.cpp | 79 +-
 lib/CodeGen/MachineLICM.cpp | 42 +-
 lib/CodeGen/MachineModuleInfo.cpp | 40 +-
 lib/CodeGen/MachinePassRegistry.cpp | 2 +-
 lib/CodeGen/MachineRegisterInfo.cpp | 18 +-
 lib/CodeGen/MachineSSAUpdater.cpp | 10 +-
 lib/CodeGen/MachineScheduler.cpp | 82 +-
 lib/CodeGen/MachineSink.cpp | 37 +-
 lib/CodeGen/MachineTraceMetrics.cpp | 78 +-
 lib/CodeGen/MachineVerifier.cpp | 126 +-
 lib/CodeGen/OptimizePHIs.cpp | 3 +-
 lib/CodeGen/PHIElimination.cpp | 24 +-
 lib/CodeGen/Passes.cpp | 24 +-
 lib/CodeGen/PeepholeOptimizer.cpp | 15 +-
 lib/CodeGen/PostRASchedulerList.cpp | 23 +-
 lib/CodeGen/ProcessImplicitDefs.cpp | 4 +-
 lib/CodeGen/PrologEpilogInserter.cpp | 20 +-
 lib/CodeGen/PseudoSourceValue.cpp | 10 +-
 lib/CodeGen/RegAllocBase.cpp | 5 +-
 lib/CodeGen/RegAllocBase.h | 3 +-
 lib/CodeGen/RegAllocBasic.cpp | 7 +-
 lib/CodeGen/RegAllocFast.cpp | 7 +-
 lib/CodeGen/RegAllocGreedy.cpp | 100 +-
 lib/CodeGen/RegAllocPBQP.cpp | 25 +-
 lib/CodeGen/RegisterClassInfo.cpp | 9 +-
 lib/CodeGen/RegisterCoalescer.cpp | 50 +-
 lib/CodeGen/RegisterCoalescer.h | 4 +-
 lib/CodeGen/RegisterPressure.cpp | 4 +-
 lib/CodeGen/RegisterScavenging.cpp | 15 +-
 lib/CodeGen/ScheduleDAG.cpp | 5 +-
 lib/CodeGen/ScheduleDAGInstrs.cpp | 134 +-
 lib/CodeGen/ScoreboardHazardRecognizer.cpp | 5 +-
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 612 +-
 lib/CodeGen/SelectionDAG/FastISel.cpp | 78 +-
 lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 7 +-
 lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 40 +-
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 137 +-
 lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 14 +-
 lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 162 +-
 lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 35 +-
 lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 +-
 lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp | 9 +-
 lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 47 +-
 lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 120 +-
 lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp | 13 +-
 lib/CodeGen/SelectionDAG/SDNodeDbgValue.h | 16 +-
 lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 33 +-
 lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 75 +-
 lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 30 +-
 lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h | 2 +-
 lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp | 11 +-
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 819 +-
 lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 510 +-
 lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 31 +-
 lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 8 +-
 lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 133 +-
 lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp | 10 +-
 lib/CodeGen/SelectionDAG/TargetLowering.cpp | 236 +-
 lib/CodeGen/ShadowStackGC.cpp | 13 +-
 lib/CodeGen/SjLjEHPrepare.cpp | 11 +-
 lib/CodeGen/SlotIndexes.cpp | 12 +-
 lib/CodeGen/SpillPlacement.cpp | 33 +-
 lib/CodeGen/SpillPlacement.h | 2 +-
 lib/CodeGen/Spiller.cpp | 4 +-
 lib/CodeGen/SplitKit.cpp | 17 +-
 lib/CodeGen/SplitKit.h | 4 +-
 lib/CodeGen/StackColoring.cpp | 32 +-
 lib/CodeGen/StackMapLivenessAnalysis.cpp | 5 +-
 lib/CodeGen/StackMaps.cpp | 289 +-
 lib/CodeGen/StackProtector.cpp | 23 +-
 lib/CodeGen/StackSlotColoring.cpp | 20 +-
 lib/CodeGen/TailDuplication.cpp | 29 +-
 lib/CodeGen/TargetInstrInfo.cpp | 46 +-
 lib/CodeGen/TargetLoweringBase.cpp | 38 +-
 lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 56 +-
 lib/CodeGen/TargetRegisterInfo.cpp | 14 +-
 lib/CodeGen/TwoAddressInstructionPass.cpp | 23 +-
 lib/CodeGen/VirtRegMap.cpp | 3 +-
 lib/CodeGen/module.modulemap | 1 +
 lib/DebugInfo/DWARFCompileUnit.h | 6 +-
 lib/DebugInfo/DWARFContext.cpp | 246 +-
 lib/DebugInfo/DWARFContext.h | 2 +-
 lib/DebugInfo/DWARFDebugAbbrev.cpp | 112 +-
 lib/DebugInfo/DWARFDebugAbbrev.h | 50 +-
 lib/DebugInfo/DWARFDebugArangeSet.h | 1 -
 lib/DebugInfo/DWARFDebugAranges.cpp | 36 +-
 lib/DebugInfo/DWARFDebugAranges.h | 17 +-
 lib/DebugInfo/DWARFDebugFrame.cpp | 72 +-
 lib/DebugInfo/DWARFDebugFrame.h | 6 +-
 lib/DebugInfo/DWARFDebugInfoEntry.cpp | 95 +-
 lib/DebugInfo/DWARFDebugInfoEntry.h | 57 +-
 lib/DebugInfo/DWARFDebugLine.cpp | 348 +-
 lib/DebugInfo/DWARFDebugLine.h | 109 +-
 lib/DebugInfo/DWARFDebugRangeList.cpp | 15 +-
 lib/DebugInfo/DWARFDebugRangeList.h | 15 +-
 lib/DebugInfo/DWARFFormValue.cpp | 10 +-
 lib/DebugInfo/DWARFTypeUnit.h | 10 +-
 lib/DebugInfo/DWARFUnit.cpp | 145 +-
 lib/DebugInfo/DWARFUnit.h | 24 +-
 lib/DebugInfo/module.modulemap | 1 +
 lib/ExecutionEngine/ExecutionEngine.cpp | 146 +-
 lib/ExecutionEngine/ExecutionEngineBindings.cpp | 15 +-
 .../IntelJITEvents/IntelJITEventListener.cpp | 10 +-
 lib/ExecutionEngine/Interpreter/Execution.cpp | 67 +-
 .../Interpreter/ExternalFunctions.cpp | 6 +-
 lib/ExecutionEngine/Interpreter/Interpreter.cpp | 2 +-
 lib/ExecutionEngine/Interpreter/Interpreter.h | 4 +-
 lib/ExecutionEngine/JIT/JIT.cpp | 26 +-
 lib/ExecutionEngine/JIT/JIT.h | 2 +-
 lib/ExecutionEngine/JIT/JITEmitter.cpp | 36 +-
 lib/ExecutionEngine/JIT/JITMemoryManager.cpp | 57 +-
 lib/ExecutionEngine/MCJIT/LLVMBuild.txt | 2 +-
 lib/ExecutionEngine/MCJIT/MCJIT.cpp | 35 +-
 lib/ExecutionEngine/MCJIT/MCJIT.h | 4 +-
 lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp | 2 +-
 .../OProfileJIT/OProfileJITEventListener.cpp | 17 +-
 .../OProfileJIT/OProfileWrapper.cpp | 3 +-
 lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp | 13 +-
 .../RuntimeDyld/ObjectImageCommon.h | 21 +-
 lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 97 +-
 lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 107 +-
 lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h | 8 +-
 lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 27 +-
 .../RuntimeDyld/RuntimeDyldMachO.cpp | 483 +-
 lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h | 60 +-
 lib/ExecutionEngine/TargetSelect.cpp | 8 +-
 lib/IR/Android.mk | 1 +
 lib/IR/AsmWriter.cpp | 100 +-
 lib/IR/Attributes.cpp | 22 +-
 lib/IR/AutoUpgrade.cpp | 69 +-
 lib/IR/BasicBlock.cpp | 20 +-
 lib/IR/CMakeLists.txt | 5 +-
 lib/IR/ConstantFold.cpp | 108 +-
 lib/IR/Constants.cpp | 65 +-
 lib/IR/ConstantsContext.h | 7 +-
 lib/IR/Core.cpp | 140 +-
 lib/IR/DIBuilder.cpp | 204 +-
 lib/IR/DataLayout.cpp | 14 +-
 lib/IR/DebugInfo.cpp | 69 +-
 lib/IR/DebugLoc.cpp | 59 +-
 lib/IR/DiagnosticInfo.cpp | 125 +
 lib/IR/Function.cpp | 24 +-
 lib/IR/GCOV.cpp | 183 +-
 lib/IR/Globals.cpp | 155 +-
 lib/IR/IRPrintingPasses.cpp | 2 +-
 lib/IR/InlineAsm.cpp | 2 +-
 lib/IR/Instruction.cpp | 147 +-
 lib/IR/Instructions.cpp | 91 +-
 lib/IR/IntrinsicInst.cpp | 4 +-
 lib/IR/LLVMContext.cpp | 36 +-
 lib/IR/LLVMContextImpl.cpp | 27 +-
 lib/IR/LLVMContextImpl.h | 10 +-
 lib/IR/LeaksContext.h | 16 +-
 lib/IR/LegacyPassManager.cpp | 67 +-
 lib/IR/MDBuilder.cpp | 139 +
 lib/IR/Mangler.cpp | 2 +-
 lib/IR/Metadata.cpp | 45 +-
 lib/IR/Module.cpp | 20 +-
 lib/IR/Pass.cpp | 12 +-
 lib/IR/PassManager.cpp | 11 +-
 lib/IR/PassRegistry.cpp | 22 +-
 lib/IR/SymbolTableListTraitsImpl.h | 4 +-
 lib/IR/Type.cpp | 28 +-
 lib/IR/Use.cpp | 6 +-
 lib/IR/Value.cpp | 86 +-
 lib/IR/ValueSymbolTable.cpp | 9 +-
 lib/IR/Verifier.cpp | 442 +-
 lib/IR/module.modulemap | 1 +
 lib/IRReader/IRReader.cpp | 14 +-
 lib/LTO/LTOCodeGenerator.cpp | 60 +-
 lib/LTO/LTOModule.cpp | 54 +-
 lib/Linker/LinkModules.cpp | 282 +-
 lib/MC/Android.mk | 4 +-
 lib/MC/CMakeLists.txt | 2 +-
 lib/MC/ELFObjectWriter.cpp | 446 +-
 lib/MC/MCAsmInfo.cpp | 6 +-
 lib/MC/MCAsmStreamer.cpp | 193 +-
 lib/MC/MCAssembler.cpp | 196 +-
 lib/MC/MCContext.cpp | 102 +-
 lib/MC/MCDisassembler.cpp | 14 -
 lib/MC/MCDisassembler/Disassembler.cpp | 27 +-
 lib/MC/MCDwarf.cpp | 111 +-
 lib/MC/MCELFStreamer.cpp | 11 +-
 lib/MC/MCExpr.cpp | 69 +-
 lib/MC/MCExternalSymbolizer.cpp | 14 +-
 lib/MC/MCFixup.cpp | 37 -
 lib/MC/MCFunction.cpp | 15 +-
 lib/MC/MCInst.cpp | 4 +-
 lib/MC/MCMachOStreamer.cpp | 27 +-
 lib/MC/MCModule.cpp | 16 +-
 lib/MC/MCModuleYAML.cpp | 25 +-
 lib/MC/MCNullStreamer.cpp | 10 +-
 lib/MC/MCObjectDisassembler.cpp | 16 +-
 lib/MC/MCObjectFileInfo.cpp | 111 +-
 lib/MC/MCObjectStreamer.cpp | 40 +-
 lib/MC/MCObjectSymbolizer.cpp | 42 +-
 lib/MC/MCParser/AsmLexer.cpp | 8 +-
 lib/MC/MCParser/AsmParser.cpp | 82 +-
 lib/MC/MCParser/COFFAsmParser.cpp | 6 +-
 lib/MC/MCParser/DarwinAsmParser.cpp | 9 +-
 lib/MC/MCParser/ELFAsmParser.cpp | 8 +-
 lib/MC/MCParser/MCAsmLexer.cpp | 2 +-
 lib/MC/MCParser/MCAsmParser.cpp | 2 +-
 lib/MC/MCRelocationInfo.cpp | 4 +-
 lib/MC/MCSectionCOFF.cpp | 5 +-
 lib/MC/MCSectionMachO.cpp | 16 +-
 lib/MC/MCStreamer.cpp | 38 +-
 lib/MC/MCSubtargetInfo.cpp | 17 +-
 lib/MC/MCTargetOptions.cpp | 19 +
 lib/MC/MCValue.cpp | 20 +-
 lib/MC/MachObjectWriter.cpp | 34 +-
 lib/MC/SubtargetFeature.cpp | 154 +-
 lib/MC/WinCOFFObjectWriter.cpp | 294 +-
 lib/MC/WinCOFFStreamer.cpp | 313 +-
 lib/Object/Android.mk | 1 +
 lib/Object/Archive.cpp | 21 +-
 lib/Object/CMakeLists.txt | 1 +
 lib/Object/COFFObjectFile.cpp | 55 +-
 lib/Object/COFFYAML.cpp | 33 +-
 lib/Object/ELF.cpp | 10 +
 lib/Object/ELFYAML.cpp | 429 +-
 lib/Object/LLVMBuild.txt | 2 +-
 lib/Object/MachOObjectFile.cpp | 135 +-
 lib/Object/MachOUniversal.cpp | 22 +-
 lib/Object/Object.cpp | 9 +-
 lib/Object/StringTableBuilder.cpp | 51 +
 lib/Option/ArgList.cpp | 87 +-
 lib/Option/OptTable.cpp | 10 +-
 lib/Option/Option.cpp | 20 +-
 lib/ProfileData/Android.mk | 33 +
 lib/ProfileData/InstrProf.cpp | 4 +-
 lib/ProfileData/InstrProfIndexed.h | 55 +
 lib/ProfileData/InstrProfReader.cpp | 165 +-
 lib/ProfileData/InstrProfWriter.cpp | 90 +-
 lib/Support/APFloat.cpp | 18 +-
 lib/Support/APInt.cpp | 15 +-
 lib/Support/Allocator.cpp | 33 +-
 lib/Support/Atomic.cpp | 1 +
 lib/Support/BlockFrequency.cpp | 95 +-
 lib/Support/BranchProbability.cpp | 55 +-
 lib/Support/CommandLine.cpp | 75 +-
 lib/Support/Compression.cpp | 35 +-
 lib/Support/CrashRecoveryContext.cpp | 25 +-
 lib/Support/DAGDeltaAlgorithm.cpp | 2 +
 lib/Support/DataExtractor.cpp | 4 +-
 lib/Support/DataStream.cpp | 5 +-
 lib/Support/Debug.cpp | 2 +-
 lib/Support/Dwarf.cpp | 44 +-
 lib/Support/DynamicLibrary.cpp | 12 +-
 lib/Support/ErrorHandling.cpp | 6 +-
 lib/Support/FileOutputBuffer.cpp | 13 +-
 lib/Support/FoldingSet.cpp | 22 +-
 lib/Support/FormattedStream.cpp | 2 +-
 lib/Support/GraphWriter.cpp | 6 +-
 lib/Support/Host.cpp | 45 +-
 lib/Support/IntervalMap.cpp | 2 +-
 lib/Support/LineIterator.cpp | 7 +-
 lib/Support/LockFileManager.cpp | 18 +-
 lib/Support/ManagedStatic.cpp | 17 +-
 lib/Support/MemoryBuffer.cpp | 120 +-
 lib/Support/Mutex.cpp | 10 +-
 lib/Support/Path.cpp | 8 +-
 lib/Support/PrettyStackTrace.cpp | 4 +-
 lib/Support/RWMutex.cpp | 14 +-
 lib/Support/Regex.cpp | 2 +-
 lib/Support/SearchForAddressOfSpecialSymbol.cpp | 2 +-
 lib/Support/SmallPtrSet.cpp | 2 +-
 lib/Support/SourceMgr.cpp | 7 +-
 lib/Support/StringMap.cpp | 10 +-
 lib/Support/StringRef.cpp | 4 +-
 lib/Support/TargetRegistry.cpp | 20 +-
 lib/Support/ThreadLocal.cpp | 4 +-
 lib/Support/Threading.cpp | 6 +-
 lib/Support/Timer.cpp | 21 +-
 lib/Support/Triple.cpp | 16 +-
 lib/Support/Unix/Memory.inc | 14 +-
 lib/Support/Unix/Path.inc | 22 +-
 lib/Support/Unix/Process.inc | 4 +-
 lib/Support/Unix/Program.inc | 23 +-
 lib/Support/Unix/Signals.inc | 32 +-
 lib/Support/Unix/TimeValue.inc | 10 +-
 lib/Support/Windows/DynamicLibrary.inc | 2 +-
 lib/Support/Windows/Process.inc | 14 +-
 lib/Support/Windows/TimeValue.inc | 19 +-
 lib/Support/YAMLParser.cpp | 44 +-
 lib/Support/YAMLTraits.cpp | 58 +-
 lib/Support/raw_ostream.cpp | 6 +-
 lib/Support/regengine.inc | 2 +-
 lib/TableGen/Main.cpp | 1 +
 lib/TableGen/Record.cpp | 190 +-
 lib/TableGen/TGLexer.cpp | 11 +-
 lib/TableGen/TGLexer.h | 2 +-
 lib/TableGen/TGParser.cpp | 398 +-
 lib/TableGen/TGParser.h | 12 +-
 lib/TableGen/module.modulemap | 1 +
 lib/Target/AArch64/AArch64.h | 45 +-
 lib/Target/AArch64/AArch64.td | 93 +-
 lib/Target/AArch64/AArch64AddressTypePromotion.cpp | 492 +
 lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp | 387 +
 lib/Target/AArch64/AArch64AsmPrinter.cpp | 652 +-
 lib/Target/AArch64/AArch64AsmPrinter.h | 76 -
 lib/Target/AArch64/AArch64BranchFixupPass.cpp | 600 -
 lib/Target/AArch64/AArch64BranchRelaxation.cpp | 510 +
 lib/Target/AArch64/AArch64CallingConv.td | 197 -
 lib/Target/AArch64/AArch64CallingConvention.td | 240 +
 .../AArch64/AArch64CleanupLocalDynamicTLSPass.cpp | 147 +
 lib/Target/AArch64/AArch64CollectLOH.cpp | 1117 ++
 lib/Target/AArch64/AArch64ConditionalCompares.cpp | 919 ++
 .../AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 134 +
 lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 749 +
 lib/Target/AArch64/AArch64FastISel.cpp | 1981 +++
 lib/Target/AArch64/AArch64FrameLowering.cpp | 1295 +-
 lib/Target/AArch64/AArch64FrameLowering.h | 125 +-
 lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 4074 +++--
 lib/Target/AArch64/AArch64ISelLowering.cpp | 11375 ++++++++----
 lib/Target/AArch64/AArch64ISelLowering.h | 701 +-
 lib/Target/AArch64/AArch64InstrAtomics.td | 364 +
 lib/Target/AArch64/AArch64InstrFormats.td | 9605 +++++++++--
 lib/Target/AArch64/AArch64InstrInfo.cpp | 2646 ++-
 lib/Target/AArch64/AArch64InstrInfo.h | 236 +-
 lib/Target/AArch64/AArch64InstrInfo.td | 10158 ++++++------
 lib/Target/AArch64/AArch64InstrNEON.td | 9476 -----------
 lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 942 ++
 lib/Target/AArch64/AArch64MCInstLower.cpp | 243 +-
 lib/Target/AArch64/AArch64MCInstLower.h | 52 +
 lib/Target/AArch64/AArch64MachineFunctionInfo.cpp | 18 -
 lib/Target/AArch64/AArch64MachineFunctionInfo.h | 188 +-
 lib/Target/AArch64/AArch64PerfectShuffle.h | 6586 ++++++++
 lib/Target/AArch64/AArch64PromoteConstant.cpp | 578 +
 lib/Target/AArch64/AArch64RegisterInfo.cpp | 452 +-
 lib/Target/AArch64/AArch64RegisterInfo.h | 104 +-
 lib/Target/AArch64/AArch64RegisterInfo.td | 733 +-
 lib/Target/AArch64/AArch64SchedA53.td | 291 +
 lib/Target/AArch64/AArch64SchedCyclone.td | 865 +
 lib/Target/AArch64/AArch64Schedule.td | 168 +-
 lib/Target/AArch64/AArch64ScheduleA53.td | 144 -
 lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 48 +-
 lib/Target/AArch64/AArch64SelectionDAGInfo.h | 17 +-
 lib/Target/AArch64/AArch64StorePairSuppress.cpp | 168 +
 lib/Target/AArch64/AArch64Subtarget.cpp | 117 +-
 lib/Target/AArch64/AArch64Subtarget.h | 91 +-
 lib/Target/AArch64/AArch64TargetMachine.cpp | 170 +-
 lib/Target/AArch64/AArch64TargetMachine.h | 74 +-
 lib/Target/AArch64/AArch64TargetObjectFile.cpp | 46 +-
 lib/Target/AArch64/AArch64TargetObjectFile.h | 35 +-
 lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 385 +-
 lib/Target/AArch64/Android.mk | 34 +-
 lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 5357 +++---
 lib/Target/AArch64/AsmParser/CMakeLists.txt | 3 +
 lib/Target/AArch64/AsmParser/LLVMBuild.txt | 2 +-
 lib/Target/AArch64/AsmParser/Makefile | 2 +-
 lib/Target/AArch64/CMakeLists.txt | 32 +-
 .../AArch64/Disassembler/AArch64Disassembler.cpp | 2556 ++-
 .../AArch64/Disassembler/AArch64Disassembler.h | 40 +
 .../Disassembler/AArch64ExternalSymbolizer.cpp | 221 +
 .../Disassembler/AArch64ExternalSymbolizer.h | 38 +
 lib/Target/AArch64/Disassembler/Android.mk | 3 +-
 lib/Target/AArch64/Disassembler/CMakeLists.txt | 11 +
 lib/Target/AArch64/Disassembler/LLVMBuild.txt | 2 +-
 lib/Target/AArch64/Disassembler/Makefile | 2 +-
 .../AArch64/InstPrinter/AArch64InstPrinter.cpp | 1567 +-
 .../AArch64/InstPrinter/AArch64InstPrinter.h | 214 +-
 lib/Target/AArch64/InstPrinter/Android.mk | 1 +
 lib/Target/AArch64/InstPrinter/CMakeLists.txt | 4 +
 lib/Target/AArch64/InstPrinter/LLVMBuild.txt | 2 +-
 lib/Target/AArch64/InstPrinter/Makefile | 2 +-
 lib/Target/AArch64/LLVMBuild.txt | 4 +-
 .../AArch64/MCTargetDesc/AArch64AddressingModes.h | 738 +
 .../AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 1009 +-
 .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 428 +-
 .../AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 49 +-
 .../AArch64/MCTargetDesc/AArch64ELFStreamer.h | 7 +-
 .../AArch64/MCTargetDesc/AArch64FixupKinds.h | 161 +-
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 67 +-
 lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h | 23 +-
 .../AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp | 914 +-
 lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp | 212 +-
 lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h | 237 +-
 .../AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 196 +-
 .../AArch64/MCTargetDesc/AArch64MCTargetDesc.h | 38 +-
 .../MCTargetDesc/AArch64MachObjectWriter.cpp | 396 +
 lib/Target/AArch64/MCTargetDesc/Android.mk | 1 +
 lib/Target/AArch64/MCTargetDesc/CMakeLists.txt | 7 +-
 lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt | 2 +-
 lib/Target/AArch64/Makefile | 21 +-
 lib/Target/AArch64/README.txt | 2 -
 .../AArch64/TargetInfo/AArch64TargetInfo.cpp | 30 +-
 lib/Target/AArch64/TargetInfo/CMakeLists.txt | 4 +
 lib/Target/AArch64/TargetInfo/LLVMBuild.txt | 2 +-
 lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 424 +-
 lib/Target/AArch64/Utils/AArch64BaseInfo.h | 592 +-
 lib/Target/AArch64/Utils/Android.mk | 15 +
 lib/Target/AArch64/Utils/LLVMBuild.txt | 2 +-
 lib/Target/AArch64/Utils/Makefile | 5 +-
 lib/Target/ARM/A15SDOptimizer.cpp | 17 +-
 lib/Target/ARM/ARM.h | 2 -
 lib/Target/ARM/ARMAsmPrinter.cpp | 163 +-
 lib/Target/ARM/ARMAsmPrinter.h | 9 +-
 lib/Target/ARM/ARMAtomicExpandPass.cpp | 406 -
 lib/Target/ARM/ARMBaseInstrInfo.cpp | 73 +-
 lib/Target/ARM/ARMBaseInstrInfo.h | 2 +-
 lib/Target/ARM/ARMBaseRegisterInfo.cpp | 18 +-
 lib/Target/ARM/ARMBaseRegisterInfo.h | 6 +-
 lib/Target/ARM/ARMCallingConv.h | 113 +-
 lib/Target/ARM/ARMCallingConv.td | 3 +
 lib/Target/ARM/ARMCodeEmitter.cpp | 9 +-
 lib/Target/ARM/ARMConstantIslandPass.cpp | 21 +-
 lib/Target/ARM/ARMExpandPseudoInsts.cpp | 62 +-
 lib/Target/ARM/ARMFastISel.cpp | 32 +-
 lib/Target/ARM/ARMFeatures.h | 6 +-
 lib/Target/ARM/ARMFrameLowering.cpp | 129 +-
 lib/Target/ARM/ARMFrameLowering.h | 2 +-
 lib/Target/ARM/ARMHazardRecognizer.cpp | 4 +-
 lib/Target/ARM/ARMHazardRecognizer.h | 2 +-
 lib/Target/ARM/ARMISelDAGToDAG.cpp | 126 +-
 lib/Target/ARM/ARMISelLowering.cpp | 888 +-
 lib/Target/ARM/ARMISelLowering.h | 28 +-
 lib/Target/ARM/ARMInstrFormats.td | 5 +-
 lib/Target/ARM/ARMInstrInfo.td | 178 +-
 lib/Target/ARM/ARMInstrNEON.td | 1326 +-
 lib/Target/ARM/ARMInstrThumb.td | 25 +-
 lib/Target/ARM/ARMInstrThumb2.td | 30 +-
 lib/Target/ARM/ARMJITInfo.cpp | 7 +-
 lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 305 +-
 lib/Target/ARM/ARMOptimizeBarriersPass.cpp | 8 +-
 lib/Target/ARM/ARMRegisterInfo.td | 8 +-
 lib/Target/ARM/ARMScheduleV6.td | 4 +-
 lib/Target/ARM/ARMSelectionDAGInfo.cpp | 51 +-
 lib/Target/ARM/ARMSubtarget.cpp | 6 +-
 lib/Target/ARM/ARMSubtarget.h | 8 +-
 lib/Target/ARM/ARMTargetMachine.cpp | 13 +-
 lib/Target/ARM/ARMTargetMachine.h | 17 +-
 lib/Target/ARM/ARMTargetObjectFile.cpp | 7 +-
 lib/Target/ARM/ARMTargetObjectFile.h | 2 +-
 lib/Target/ARM/ARMTargetTransformInfo.cpp | 7 +-
 lib/Target/ARM/Android.mk | 1 -
 lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 485 +-
 lib/Target/ARM/CMakeLists.txt | 1 -
 lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 24 +-
 lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp | 3 +-
 lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 129 +-
 lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 8 +-
 lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 92 +-
 lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp | 14 +-
 lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h | 4 +-
 lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp | 14 +-
 lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp | 3 +-
 lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 29 +-
 lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h | 10 +-
 .../ARM/MCTargetDesc/ARMMachObjectWriter.cpp | 14 +-
 lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp | 6 +-
 .../ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp | 82 +
 lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp | 46 +
 lib/Target/ARM/MCTargetDesc/Android.mk | 4 +-
 lib/Target/ARM/MCTargetDesc/CMakeLists.txt | 8 +-
 lib/Target/ARM/MLxExpansionPass.cpp | 9 +-
 lib/Target/ARM/README-Thumb.txt | 4 -
 lib/Target/ARM/Thumb1FrameLowering.cpp | 4 +-
 lib/Target/ARM/Thumb1RegisterInfo.h | 2 +-
 lib/Target/ARM/Thumb2ITBlockPass.cpp | 3 +-
 lib/Target/ARM/Thumb2SizeReduction.cpp | 14 +-
 lib/Target/ARM64/ARM64.h | 48 -
 lib/Target/ARM64/ARM64.td | 95 -
 lib/Target/ARM64/ARM64AddressTypePromotion.cpp | 496 -
 lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp | 392 -
 lib/Target/ARM64/ARM64AsmPrinter.cpp | 563 -
 lib/Target/ARM64/ARM64BranchRelaxation.cpp | 505 -
 lib/Target/ARM64/ARM64CallingConv.h | 94 -
 lib/Target/ARM64/ARM64CallingConvention.td | 210 -
 .../ARM64/ARM64CleanupLocalDynamicTLSPass.cpp | 147 -
 lib/Target/ARM64/ARM64CollectLOH.cpp | 1157 --
 lib/Target/ARM64/ARM64ConditionalCompares.cpp | 918 --
 .../ARM64/ARM64DeadRegisterDefinitionsPass.cpp | 104 -
 lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp | 737 -
 lib/Target/ARM64/ARM64FastISel.cpp | 1929 ---
 lib/Target/ARM64/ARM64FrameLowering.cpp | 816 -
 lib/Target/ARM64/ARM64FrameLowering.h | 75 -
 lib/Target/ARM64/ARM64ISelDAGToDAG.cpp | 2381 ---
 lib/Target/ARM64/ARM64ISelLowering.cpp | 7551 ---------
 lib/Target/ARM64/ARM64ISelLowering.h | 422 -
 lib/Target/ARM64/ARM64InstrAtomics.td | 293 -
 lib/Target/ARM64/ARM64InstrFormats.td | 8193 ---------
 lib/Target/ARM64/ARM64InstrInfo.cpp | 1864 ---
 lib/Target/ARM64/ARM64InstrInfo.h | 219 -
 lib/Target/ARM64/ARM64InstrInfo.td | 4458 -----
 lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp | 947 --
 lib/Target/ARM64/ARM64MCInstLower.cpp | 201 -
 lib/Target/ARM64/ARM64MCInstLower.h | 52 -
 lib/Target/ARM64/ARM64MachineFunctionInfo.h | 139 -
 lib/Target/ARM64/ARM64PerfectShuffle.h | 6586 --------
 lib/Target/ARM64/ARM64PromoteConstant.cpp | 585 -
 lib/Target/ARM64/ARM64RegisterInfo.cpp | 400 -
 lib/Target/ARM64/ARM64RegisterInfo.h | 101 -
 lib/Target/ARM64/ARM64RegisterInfo.td | 561 -
 lib/Target/ARM64/ARM64SchedCyclone.td | 852 -
 lib/Target/ARM64/ARM64Schedule.td | 92 -
 lib/Target/ARM64/ARM64SelectionDAGInfo.cpp | 57 -
 lib/Target/ARM64/ARM64SelectionDAGInfo.h | 37 -
 lib/Target/ARM64/ARM64StorePairSuppress.cpp | 167 -
 lib/Target/ARM64/ARM64Subtarget.cpp | 100 -
 lib/Target/ARM64/ARM64Subtarget.h | 87 -
 lib/Target/ARM64/ARM64TargetMachine.cpp | 157 -
 lib/Target/ARM64/ARM64TargetMachine.h | 69 -
 lib/Target/ARM64/ARM64TargetObjectFile.cpp | 52 -
 lib/Target/ARM64/ARM64TargetObjectFile.h | 40 -
 lib/Target/ARM64/ARM64TargetTransformInfo.cpp | 326 -
 lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp | 4832 ------
 lib/Target/ARM64/AsmParser/CMakeLists.txt | 6 -
 lib/Target/ARM64/AsmParser/LLVMBuild.txt | 24 -
 lib/Target/ARM64/AsmParser/Makefile | 15 -
 lib/Target/ARM64/CMakeLists.txt | 50 -
 .../ARM64/Disassembler/ARM64Disassembler.cpp | 2142 ---
 lib/Target/ARM64/Disassembler/ARM64Disassembler.h | 54 -
 lib/Target/ARM64/Disassembler/CMakeLists.txt | 13 -
 lib/Target/ARM64/Disassembler/LLVMBuild.txt | 24 -
 lib/Target/ARM64/Disassembler/Makefile | 16 -
 lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp | 1428 --
 lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h | 157 -
 lib/Target/ARM64/InstPrinter/CMakeLists.txt | 7 -
 lib/Target/ARM64/InstPrinter/LLVMBuild.txt | 24 -
 lib/Target/ARM64/InstPrinter/Makefile | 15 -
 lib/Target/ARM64/LLVMBuild.txt | 36 -
 .../ARM64/MCTargetDesc/ARM64AddressingModes.h | 758 -
 lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp | 533 -
 lib/Target/ARM64/MCTargetDesc/ARM64BaseInfo.h | 998 --
 .../ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp | 237 -
 lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp | 158 -
 lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h | 26 -
 lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h | 72 -
 lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp | 92 -
 lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h | 36 -
 .../ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp | 563 -
 lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp | 168 -
 lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h | 162 -
 .../ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp | 167 -
 lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h | 62 -
 .../ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp | 396 -
 lib/Target/ARM64/MCTargetDesc/CMakeLists.txt | 14 -
 lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt | 24 -
 lib/Target/ARM64/MCTargetDesc/Makefile | 16 -
 lib/Target/ARM64/Makefile | 25 -
 lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp | 21 -
 lib/Target/ARM64/TargetInfo/CMakeLists.txt | 7 -
 lib/Target/ARM64/TargetInfo/LLVMBuild.txt | 24 -
 lib/Target/ARM64/TargetInfo/Makefile | 15 -
 lib/Target/CppBackend/CPPBackend.cpp | 28 +-
 lib/Target/CppBackend/CPPTargetMachine.h | 14 +-
 lib/Target/Hexagon/Hexagon.td | 2 -
 lib/Target/Hexagon/HexagonAsmPrinter.cpp | 5 +-
 lib/Target/Hexagon/HexagonAsmPrinter.h | 11 +-
 lib/Target/Hexagon/HexagonCFGOptimizer.cpp | 15 +-
 lib/Target/Hexagon/HexagonCopyToCombine.cpp | 18 +-
 lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp | 6 +-
 lib/Target/Hexagon/HexagonFixupHwLoops.cpp | 8 +-
 lib/Target/Hexagon/HexagonFrameLowering.cpp | 4 +-
 lib/Target/Hexagon/HexagonFrameLowering.h | 32 +-
 lib/Target/Hexagon/HexagonHardwareLoops.cpp | 107 +-
 lib/Target/Hexagon/HexagonISelDAGToDAG.cpp | 19 +-
 lib/Target/Hexagon/HexagonISelLowering.cpp | 29 +-
 lib/Target/Hexagon/HexagonISelLowering.h | 43 +-
 lib/Target/Hexagon/HexagonInstrFormats.td | 167 +-
 lib/Target/Hexagon/HexagonInstrFormatsV4.td | 31 +-
 lib/Target/Hexagon/HexagonInstrInfo.cpp | 19 +-
 lib/Target/Hexagon/HexagonInstrInfo.h | 197 +-
 lib/Target/Hexagon/HexagonInstrInfo.td | 14 +-
 lib/Target/Hexagon/HexagonInstrInfoV4.td | 35 +-
 lib/Target/Hexagon/HexagonMachineScheduler.cpp | 20 +-
 lib/Target/Hexagon/HexagonMachineScheduler.h | 19 +-
 lib/Target/Hexagon/HexagonNewValueJump.cpp | 13 +-
 lib/Target/Hexagon/HexagonPeephole.cpp | 9 +-
 lib/Target/Hexagon/HexagonRegisterInfo.cpp | 9 +-
 lib/Target/Hexagon/HexagonRegisterInfo.h | 17 +-
 lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp | 6 +-
 lib/Target/Hexagon/HexagonSchedule.td | 51 -
 lib/Target/Hexagon/HexagonScheduleV4.td | 165 +-
 lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp | 3 +-
 lib/Target/Hexagon/HexagonSelectionDAGInfo.h | 3 +-
 .../Hexagon/HexagonSplitConst32AndConst64.cpp | 24 +-
 lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp | 10 +-
 lib/Target/Hexagon/HexagonSubtarget.cpp | 2 +
 lib/Target/Hexagon/HexagonTargetMachine.cpp | 38 +-
 lib/Target/Hexagon/HexagonTargetMachine.h | 20 +-
 lib/Target/Hexagon/HexagonVLIWPacketizer.cpp | 28 +-
 .../Hexagon/InstPrinter/HexagonInstPrinter.cpp | 3 +-
 .../Hexagon/InstPrinter/HexagonInstPrinter.h | 2 +-
 lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h | 72 +-
 .../Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp | 2 +-
 lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h | 2 +-
 lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h | 2 +-
 .../Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp | 6 +-
 lib/Target/LLVMBuild.txt | 2 +-
 .../MSP430/InstPrinter/MSP430InstPrinter.cpp | 5 +-
 lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h | 6 +-
 lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h | 2 +-
 .../MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp | 6 +-
 lib/Target/MSP430/MSP430AsmPrinter.cpp | 13 +-
 lib/Target/MSP430/MSP430BranchSelector.cpp | 7 +-
 lib/Target/MSP430/MSP430FrameLowering.cpp | 2 +-
 lib/Target/MSP430/MSP430FrameLowering.h | 22 +-
 lib/Target/MSP430/MSP430ISelDAGToDAG.cpp | 39 +-
 lib/Target/MSP430/MSP430ISelLowering.cpp | 23 +-
 lib/Target/MSP430/MSP430ISelLowering.h | 56 +-
 lib/Target/MSP430/MSP430InstrInfo.cpp | 8 +-
 lib/Target/MSP430/MSP430InstrInfo.h | 39 +-
 lib/Target/MSP430/MSP430RegisterInfo.cpp | 18 +-
 lib/Target/MSP430/MSP430RegisterInfo.h | 12 +-
 lib/Target/MSP430/MSP430SelectionDAGInfo.cpp | 3 +-
 lib/Target/MSP430/MSP430Subtarget.cpp | 6 +-
 lib/Target/MSP430/MSP430TargetMachine.cpp | 4 +-
 lib/Target/MSP430/MSP430TargetMachine.h | 16 +-
 lib/Target/Mips/Android.mk | 1 +
 lib/Target/Mips/AsmParser/LLVMBuild.txt | 2 +-
 lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 273 +-
 lib/Target/Mips/CMakeLists.txt | 2 +
 lib/Target/Mips/Disassembler/LLVMBuild.txt | 2 +-
 lib/Target/Mips/Disassembler/MipsDisassembler.cpp | 370 +-
 lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp | 5 +-
 lib/Target/Mips/InstPrinter/MipsInstPrinter.h | 6 +-
 lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp | 92 +-
 lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h | 18 +-
 .../Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 12 +
 lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h | 12 +
 lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h | 2 +-
 lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 85 +-
 lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h | 27 +-
 lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp | 3 +-
 lib/Target/Mips/MCTargetDesc/MipsMCExpr.h | 10 +-
 lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h | 2 +-
 lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp | 13 +-
 .../Mips/MCTargetDesc/MipsNaClELFStreamer.cpp | 7 +-
 .../Mips/MCTargetDesc/MipsTargetStreamer.cpp | 153 +
 lib/Target/Mips/Makefile | 2 +-
 lib/Target/Mips/MicroMipsInstrFPU.td | 14 +-
 lib/Target/Mips/MicroMipsInstrInfo.td | 23 +-
 lib/Target/Mips/Mips.td | 101 +-
 lib/Target/Mips/Mips16FrameLowering.cpp | 2 +-
 lib/Target/Mips/Mips16FrameLowering.h | 20 +-
 lib/Target/Mips/Mips16HardFloat.cpp | 14 +-
 lib/Target/Mips/Mips16HardFloat.h | 4 +-
 lib/Target/Mips/Mips16HardFloatInfo.cpp | 4 +-
 lib/Target/Mips/Mips16ISelDAGToDAG.cpp | 19 +-
 lib/Target/Mips/Mips16ISelDAGToDAG.h | 10 +-
 lib/Target/Mips/Mips16ISelLowering.cpp | 34 +-
 lib/Target/Mips/Mips16ISelLowering.h | 21 +-
 lib/Target/Mips/Mips16InstrInfo.cpp | 1 +
 lib/Target/Mips/Mips16InstrInfo.h | 50 +-
 lib/Target/Mips/Mips16RegisterInfo.cpp | 2 +
 lib/Target/Mips/Mips16RegisterInfo.h | 16 +-
 lib/Target/Mips/Mips32r6InstrFormats.td | 386 +
 lib/Target/Mips/Mips32r6InstrInfo.td | 583 +
 lib/Target/Mips/Mips64InstrInfo.td | 257 +-
 lib/Target/Mips/Mips64r6InstrInfo.td | 88 +
 lib/Target/Mips/MipsAsmPrinter.cpp | 40 +-
 lib/Target/Mips/MipsAsmPrinter.h | 32 +-
 lib/Target/Mips/MipsCallingConv.td | 4 +-
 lib/Target/Mips/MipsCodeEmitter.cpp | 46 +-
 lib/Target/Mips/MipsCondMov.td | 138 +-
 lib/Target/Mips/MipsConstantIslandPass.cpp | 20 +-
 lib/Target/Mips/MipsDelaySlotFiller.cpp | 78 +-
 lib/Target/Mips/MipsFastISel.cpp | 283 +
 lib/Target/Mips/MipsFrameLowering.cpp | 2 +-
 lib/Target/Mips/MipsFrameLowering.h | 2 +-
 lib/Target/Mips/MipsISelDAGToDAG.cpp | 12 +-
 lib/Target/Mips/MipsISelDAGToDAG.h | 12 +-
 lib/Target/Mips/MipsISelLowering.cpp | 296 +-
 lib/Target/Mips/MipsISelLowering.h | 103 +-
 lib/Target/Mips/MipsInstrFPU.td | 356 +-
 lib/Target/Mips/MipsInstrFormats.td | 33 +-
 lib/Target/Mips/MipsInstrInfo.cpp | 8 +-
 lib/Target/Mips/MipsInstrInfo.h | 50 +-
 lib/Target/Mips/MipsInstrInfo.td | 412 +-
 lib/Target/Mips/MipsJITInfo.cpp | 3 +-
 lib/Target/Mips/MipsJITInfo.h | 14 +-
 lib/Target/Mips/MipsLongBranch.cpp | 96 +-
 lib/Target/Mips/MipsMCInstLower.cpp | 68 +
 lib/Target/Mips/MipsMCInstLower.h | 8 +
 lib/Target/Mips/MipsMSAInstrInfo.td | 90 +-
 lib/Target/Mips/MipsMachineFunction.cpp | 7 +-
 lib/Target/Mips/MipsMachineFunction.h | 8 +-
 lib/Target/Mips/MipsModuleISelDAGToDAG.cpp | 2 +
 lib/Target/Mips/MipsModuleISelDAGToDAG.h | 8 +-
 lib/Target/Mips/MipsOptimizePICCall.cpp | 46 +-
 lib/Target/Mips/MipsOs16.cpp | 3 +-
 lib/Target/Mips/MipsOs16.h | 4 +-
 lib/Target/Mips/MipsRegisterInfo.cpp | 16 +-
 lib/Target/Mips/MipsRegisterInfo.h | 21 +-
 lib/Target/Mips/MipsRegisterInfo.td | 16 +
 lib/Target/Mips/MipsSEFrameLowering.cpp | 3 +-
 lib/Target/Mips/MipsSEFrameLowering.h | 14 +-
 lib/Target/Mips/MipsSEISelDAGToDAG.cpp | 15 +-
 lib/Target/Mips/MipsSEISelDAGToDAG.h | 66 +-
 lib/Target/Mips/MipsSEISelLowering.cpp | 185 +-
 lib/Target/Mips/MipsSEISelLowering.h | 36 +-
 lib/Target/Mips/MipsSEInstrInfo.cpp | 2 +-
 lib/Target/Mips/MipsSEInstrInfo.h | 48 +-
 lib/Target/Mips/MipsSERegisterInfo.cpp | 4 +-
 lib/Target/Mips/MipsSERegisterInfo.h | 12 +-
 lib/Target/Mips/MipsSelectionDAGInfo.cpp | 3 +-
 lib/Target/Mips/MipsSubtarget.cpp | 57 +-
 lib/Target/Mips/MipsSubtarget.h | 61 +-
 lib/Target/Mips/MipsTargetMachine.cpp | 12 +-
 lib/Target/Mips/MipsTargetMachine.h | 46 +-
 lib/Target/Mips/MipsTargetStreamer.h | 134 +-
 lib/Target/NVPTX/CMakeLists.txt | 3 +
 lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp | 3 +-
 lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h | 14 +-
 lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h | 6 +-
 .../NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp | 6 +-
 lib/Target/NVPTX/NVPTX.h | 3 +
 lib/Target/NVPTX/NVPTXAllocaHoisting.h | 6 +-
 lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 382 +-
 lib/Target/NVPTX/NVPTXAsmPrinter.h | 66 +-
 lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp | 2 +-
 .../NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp | 195 +
 lib/Target/NVPTX/NVPTXFrameLowering.h | 10 +-
 lib/Target/NVPTX/NVPTXGenericToNVVM.cpp | 10 +-
 lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 826 +-
 lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 15 +-
 lib/Target/NVPTX/NVPTXISelLowering.cpp | 552 +-
 lib/Target/NVPTX/NVPTXISelLowering.h | 139 +-
 lib/Target/NVPTX/NVPTXImageOptimizer.cpp | 178 +
 lib/Target/NVPTX/NVPTXInstrInfo.cpp | 7 +-
 lib/Target/NVPTX/NVPTXInstrInfo.h | 16 +-
 lib/Target/NVPTX/NVPTXIntrinsics.td | 1823 ++
 lib/Target/NVPTX/NVPTXLowerAggrCopies.h | 6 +-
 lib/Target/NVPTX/NVPTXMCExpr.cpp | 3 +-
 lib/Target/NVPTX/NVPTXMCExpr.h | 12 +-
 lib/Target/NVPTX/NVPTXMachineFunctionInfo.h | 46 +
 lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp | 6 +-
 lib/Target/NVPTX/NVPTXRegisterInfo.cpp | 21 +-
 lib/Target/NVPTX/NVPTXRegisterInfo.h | 23 +-
 lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp | 357 +
 lib/Target/NVPTX/NVPTXSection.h | 16 +-
 lib/Target/NVPTX/NVPTXSubtarget.cpp | 8 +-
 lib/Target/NVPTX/NVPTXSubtarget.h | 7 +-
 lib/Target/NVPTX/NVPTXTargetMachine.cpp | 47 +-
 lib/Target/NVPTX/NVPTXTargetMachine.h | 24 +-
 lib/Target/NVPTX/NVPTXTargetObjectFile.h | 38 +-
 lib/Target/NVPTX/NVPTXUtilities.cpp | 67 +-
 lib/Target/NVPTX/NVPTXUtilities.h | 4 +
 lib/Target/NVPTX/NVVMReflect.cpp | 14 +-
 lib/Target/PowerPC/AsmParser/LLVMBuild.txt | 4 +-
 lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp | 52 +-
 lib/Target/PowerPC/Disassembler/LLVMBuild.txt | 2 +-
 .../PowerPC/Disassembler/PPCDisassembler.cpp | 11 +-
 lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp | 3 +-
 lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h | 6 +-
 lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 20 +-
 .../PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp | 7 +-
 lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp | 4 +-
 lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h | 4 +-
 .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 5 +-
 lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp | 3 +-
 lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h | 10 +-
 .../PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 25 +-
 .../PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp | 8 +-
 lib/Target/PowerPC/PPCAsmPrinter.cpp | 82 +-
 lib/Target/PowerPC/PPCBranchSelector.cpp | 9 +-
 lib/Target/PowerPC/PPCCTRLoops.cpp | 30 +-
 lib/Target/PowerPC/PPCCodeEmitter.cpp | 10 +-
 lib/Target/PowerPC/PPCFastISel.cpp | 71 +-
 lib/Target/PowerPC/PPCFrameLowering.cpp | 102 +-
 lib/Target/PowerPC/PPCFrameLowering.h | 28 +-
 lib/Target/PowerPC/PPCHazardRecognizers.cpp | 5 +-
 lib/Target/PowerPC/PPCHazardRecognizers.h | 24 +-
 lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 164 +-
 lib/Target/PowerPC/PPCISelLowering.cpp | 308 +-
 lib/Target/PowerPC/PPCISelLowering.h | 120 +-
 lib/Target/PowerPC/PPCInstrAltivec.td | 2 +-
 lib/Target/PowerPC/PPCInstrInfo.cpp | 43 +-
 lib/Target/PowerPC/PPCInstrInfo.h | 169 +-
 lib/Target/PowerPC/PPCInstrInfo.td | 8 +-
 lib/Target/PowerPC/PPCInstrVSX.td | 2 +-
 lib/Target/PowerPC/PPCJITInfo.cpp | 3 +-
 lib/Target/PowerPC/PPCJITInfo.h | 16 +-
 lib/Target/PowerPC/PPCMCInstLower.cpp | 2 +-
 lib/Target/PowerPC/PPCRegisterInfo.cpp | 9 +-
 lib/Target/PowerPC/PPCRegisterInfo.h | 40 +-
 lib/Target/PowerPC/PPCRegisterInfo.td | 12 +
 lib/Target/PowerPC/PPCSelectionDAGInfo.cpp | 3 +-
 lib/Target/PowerPC/PPCSubtarget.cpp | 32 +-
 lib/Target/PowerPC/PPCSubtarget.h | 15 +-
 lib/Target/PowerPC/PPCTargetMachine.cpp | 36 +-
 lib/Target/PowerPC/PPCTargetMachine.h | 26 +-
 lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 187 +-
 lib/Target/R600/AMDGPU.h | 11 +-
 lib/Target/R600/AMDGPU.td | 11 +
 lib/Target/R600/AMDGPUAsmPrinter.cpp | 25 +-
 lib/Target/R600/AMDGPUAsmPrinter.h | 13 +-
 lib/Target/R600/AMDGPUCallingConv.td | 2 +-
 lib/Target/R600/AMDGPUConvertToISA.cpp | 4 +-
 lib/Target/R600/AMDGPUFrameLowering.cpp | 2 +-
 lib/Target/R600/AMDGPUFrameLowering.h | 13 +-
 lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 291 +-
 lib/Target/R600/AMDGPUISelLowering.cpp | 744 +-
 lib/Target/R600/AMDGPUISelLowering.h | 91 +-
 lib/Target/R600/AMDGPUInstrInfo.cpp | 22 +-
 lib/Target/R600/AMDGPUInstrInfo.h | 73 +-
 lib/Target/R600/AMDGPUInstrInfo.td | 15 +
 lib/Target/R600/AMDGPUInstructions.td | 49 +-
 lib/Target/R600/AMDGPUIntrinsics.td | 4 +
 lib/Target/R600/AMDGPUMCInstLower.cpp | 28 +-
 lib/Target/R600/AMDGPUMCInstLower.h | 16 +-
 lib/Target/R600/AMDGPURegisterInfo.cpp | 8 +-
 lib/Target/R600/AMDGPURegisterInfo.h | 14 +-
 lib/Target/R600/AMDGPUSubtarget.cpp | 13 +-
 lib/Target/R600/AMDGPUSubtarget.h | 15 +-
 lib/Target/R600/AMDGPUTargetMachine.cpp | 23 +-
 lib/Target/R600/AMDGPUTargetMachine.h | 27 +-
 lib/Target/R600/AMDGPUTargetTransformInfo.cpp | 26 +-
 lib/Target/R600/AMDILCFGStructurizer.cpp | 58 +-
 lib/Target/R600/AMDILISelLowering.cpp | 92 +-
 lib/Target/R600/AMDILIntrinsicInfo.cpp | 2 +-
 lib/Target/R600/AMDILIntrinsicInfo.h | 12 +-
 lib/Target/R600/AMDILIntrinsics.td | 4 -
 lib/Target/R600/CMakeLists.txt | 1 +
 lib/Target/R600/CaymanInstructions.td | 7 +-
lib/Target/R600/EvergreenInstructions.td | 18 +-
lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp | 118 +-
lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h | 13 +-
lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp | 26 +-
lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 4 +-
lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h | 2 +-
.../R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 4 +-
lib/Target/R600/MCTargetDesc/LLVMBuild.txt | 4 +-
lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 10 +-
lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp | 10 +-
lib/Target/R600/Processors.td | 2 +
lib/Target/R600/R600ClauseMergePass.cpp | 7 +-
lib/Target/R600/R600ControlFlowFinalizer.cpp | 15 +-
lib/Target/R600/R600EmitClauseMarkers.cpp | 6 +-
lib/Target/R600/R600ExpandSpecialInstrs.cpp | 6 +-
lib/Target/R600/R600ISelLowering.cpp | 150 +-
lib/Target/R600/R600ISelLowering.h | 32 +-
lib/Target/R600/R600InstrInfo.cpp | 8 +-
lib/Target/R600/R600InstrInfo.h | 76 +-
lib/Target/R600/R600Instructions.td | 6 +
lib/Target/R600/R600MachineFunctionInfo.h | 2 +-
lib/Target/R600/R600MachineScheduler.cpp | 14 +-
lib/Target/R600/R600MachineScheduler.h | 15 +-
lib/Target/R600/R600OptimizeVectorRegisters.cpp | 11 +-
lib/Target/R600/R600Packetizer.cpp | 24 +-
lib/Target/R600/R600RegisterInfo.h | 15 +-
lib/Target/R600/R600TextureIntrinsicsReplacer.cpp | 8 +-
lib/Target/R600/SIAnnotateControlFlow.cpp | 28 +-
lib/Target/R600/SIFixSGPRCopies.cpp | 23 +-
lib/Target/R600/SIISelLowering.cpp | 307 +-
lib/Target/R600/SIISelLowering.h | 36 +-
lib/Target/R600/SIInsertWaits.cpp | 8 +-
lib/Target/R600/SIInstrFormats.td | 23 +-
lib/Target/R600/SIInstrInfo.cpp | 379 +-
lib/Target/R600/SIInstrInfo.h | 66 +-
lib/Target/R600/SIInstrInfo.td | 146 +-
lib/Target/R600/SIInstructions.td | 1255 +-
lib/Target/R600/SILowerControlFlow.cpp | 10 +-
lib/Target/R600/SILowerI1Copies.cpp | 148 +
lib/Target/R600/SIMachineFunctionInfo.cpp | 57 +-
lib/Target/R600/SIMachineFunctionInfo.h | 9 +-
lib/Target/R600/SIRegisterInfo.cpp | 18 +-
lib/Target/R600/SIRegisterInfo.h | 20 +-
lib/Target/R600/SIRegisterInfo.td | 14 +-
lib/Target/R600/SITypeRewriter.cpp | 24 +-
lib/Target/Sparc/AsmParser/LLVMBuild.txt | 2 +-
lib/Target/Sparc/AsmParser/SparcAsmParser.cpp | 51 +-
lib/Target/Sparc/DelaySlotFiller.cpp | 7 +-
lib/Target/Sparc/Disassembler/LLVMBuild.txt | 2 +-
.../Sparc/Disassembler/SparcDisassembler.cpp | 29 +-
lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp | 4 +-
lib/Target/Sparc/InstPrinter/SparcInstPrinter.h | 8 +-
lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp | 16 +-
lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp | 2 +-
lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h | 14 +-
.../Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp | 5 +-
lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp | 3 +-
lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h | 10 +-
.../Sparc/MCTargetDesc/SparcMCTargetDesc.cpp | 16 +-
lib/Target/Sparc/SparcAsmPrinter.cpp | 17 +-
lib/Target/Sparc/SparcCodeEmitter.cpp | 13 +-
lib/Target/Sparc/SparcFrameLowering.cpp | 9 +-
lib/Target/Sparc/SparcFrameLowering.h | 17 +-
lib/Target/Sparc/SparcISelDAGToDAG.cpp | 12 +-
lib/Target/Sparc/SparcISelLowering.cpp | 110 +-
lib/Target/Sparc/SparcISelLowering.h | 52 +-
lib/Target/Sparc/SparcInstr64Bit.td | 4 +-
lib/Target/Sparc/SparcInstrAliases.td | 8 +-
lib/Target/Sparc/SparcInstrInfo.cpp | 15 +-
lib/Target/Sparc/SparcInstrInfo.h | 68 +-
lib/Target/Sparc/SparcJITInfo.cpp | 3 +-
lib/Target/Sparc/SparcJITInfo.h | 14 +-
lib/Target/Sparc/SparcMCInstLower.cpp | 2 +-
lib/Target/Sparc/SparcRegisterInfo.cpp | 8 +-
lib/Target/Sparc/SparcRegisterInfo.h | 15 +-
lib/Target/Sparc/SparcSelectionDAGInfo.cpp | 3 +-
lib/Target/Sparc/SparcSubtarget.cpp | 6 +-
lib/Target/Sparc/SparcTargetMachine.cpp | 4 +-
lib/Target/Sparc/SparcTargetMachine.h | 20 +-
lib/Target/Sparc/SparcTargetObjectFile.cpp | 2 +-
lib/Target/Sparc/SparcTargetStreamer.h | 8 +-
lib/Target/SystemZ/AsmParser/LLVMBuild.txt | 2 +-
lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 11 +-
.../SystemZ/Disassembler/SystemZDisassembler.cpp | 11 +-
.../SystemZ/InstPrinter/SystemZInstPrinter.cpp | 4 +-
lib/Target/SystemZ/MCTargetDesc/LLVMBuild.txt | 2 +-
.../SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp | 3 +-
.../SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp | 2 +-
.../SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp | 7 +-
lib/Target/SystemZ/SystemZElimCompare.cpp | 8 +-
lib/Target/SystemZ/SystemZFrameLowering.cpp | 2 +-
lib/Target/SystemZ/SystemZISelDAGToDAG.cpp | 32 +-
lib/Target/SystemZ/SystemZISelLowering.cpp | 54 +-
lib/Target/SystemZ/SystemZInstrFormats.td | 2 +-
lib/Target/SystemZ/SystemZInstrInfo.cpp | 24 +-
lib/Target/SystemZ/SystemZInstrInfo.h | 2 +-
lib/Target/SystemZ/SystemZLongBranch.cpp | 13 +-
lib/Target/SystemZ/SystemZRegisterInfo.cpp | 8 +-
lib/Target/SystemZ/SystemZRegisterInfo.h | 2 +-
lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp | 5 +-
lib/Target/SystemZ/SystemZShortenInst.cpp | 8 +-
lib/Target/SystemZ/SystemZSubtarget.cpp | 6 +-
lib/Target/Target.cpp | 8 -
lib/Target/TargetLoweringObjectFile.cpp | 12 +-
lib/Target/TargetMachine.cpp | 69 +-
lib/Target/TargetMachineC.cpp | 22 +-
lib/Target/TargetSubtargetInfo.cpp | 11 +-
lib/Target/X86/Android.mk | 1 -
lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp | 122 +-
lib/Target/X86/AsmParser/X86AsmInstrumentation.h | 17 +-
lib/Target/X86/AsmParser/X86AsmParser.cpp | 93 +-
lib/Target/X86/AsmParser/X86Operand.h | 6 +-
lib/Target/X86/CMakeLists.txt | 1 -
lib/Target/X86/Disassembler/Android.mk | 3 +-
lib/Target/X86/Disassembler/CMakeLists.txt | 2 +-
lib/Target/X86/Disassembler/Makefile | 4 +-
lib/Target/X86/Disassembler/X86Disassembler.cpp | 51 +-
lib/Target/X86/Disassembler/X86Disassembler.h | 17 +-
.../X86/Disassembler/X86DisassemblerDecoder.c | 1821 --
.../X86/Disassembler/X86DisassemblerDecoder.cpp | 1838 +++
.../X86/Disassembler/X86DisassemblerDecoder.h | 362 +-
.../Disassembler/X86DisassemblerDecoderCommon.h | 221 +-
lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp | 3 +-
lib/Target/X86/InstPrinter/X86ATTInstPrinter.h | 2 +
lib/Target/X86/InstPrinter/X86InstComments.cpp | 4 +-
lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp | 3 +-
lib/Target/X86/MCTargetDesc/Android.mk | 3 +-
lib/Target/X86/MCTargetDesc/CMakeLists.txt | 1 +
lib/Target/X86/MCTargetDesc/LLVMBuild.txt | 2 +-
lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 16 +-
lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 9 +-
lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 8 +-
.../X86/MCTargetDesc/X86ELFRelocationInfo.cpp | 2 +-
lib/Target/X86/MCTargetDesc/X86FixupKinds.h | 1 +
lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 8 +-
lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 19 +-
lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 42 +-
lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 9 +
.../X86/MCTargetDesc/X86MachORelocationInfo.cpp | 2 +-
.../X86/MCTargetDesc/X86MachObjectWriter.cpp | 23 +-
.../X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp | 72 +-
lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp | 51 +
lib/Target/X86/X86.h | 4 +-
lib/Target/X86/X86.td | 27 +-
lib/Target/X86/X86AsmPrinter.cpp | 192 +-
lib/Target/X86/X86AsmPrinter.h | 4 +-
lib/Target/X86/X86COFFMachineModuleInfo.cpp | 19 -
lib/Target/X86/X86COFFMachineModuleInfo.h | 46 -
lib/Target/X86/X86CallingConv.h | 27 -
lib/Target/X86/X86CallingConv.td | 10 -
lib/Target/X86/X86CodeEmitter.cpp | 9 +-
lib/Target/X86/X86FastISel.cpp | 36 +-
lib/Target/X86/X86FixupLEAs.cpp | 107 +-
lib/Target/X86/X86FloatingPoint.cpp | 5 +-
lib/Target/X86/X86FrameLowering.cpp | 91 +-
lib/Target/X86/X86FrameLowering.h | 2 +-
lib/Target/X86/X86ISelDAGToDAG.cpp | 98 +-
lib/Target/X86/X86ISelLowering.cpp | 2193 ++-
lib/Target/X86/X86ISelLowering.h | 42 +-
lib/Target/X86/X86InstrAVX512.td | 206 +-
lib/Target/X86/X86InstrBuilder.h | 3 +-
lib/Target/X86/X86InstrCompiler.td | 4 +-
lib/Target/X86/X86InstrFMA.td | 46 +-
lib/Target/X86/X86InstrFragmentsSIMD.td | 5 +-
lib/Target/X86/X86InstrInfo.cpp | 262 +-
lib/Target/X86/X86InstrInfo.h | 9 +-
lib/Target/X86/X86InstrInfo.td | 200 +-
lib/Target/X86/X86InstrMMX.td | 8 +-
lib/Target/X86/X86InstrSSE.td | 147 +-
lib/Target/X86/X86InstrSystem.td | 2 +-
lib/Target/X86/X86JITInfo.cpp | 10 +-
lib/Target/X86/X86MCInstLower.cpp | 8 +-
lib/Target/X86/X86PadShortFunction.cpp | 9 +-
lib/Target/X86/X86RegisterInfo.cpp | 8 +-
lib/Target/X86/X86RegisterInfo.h | 4 +-
lib/Target/X86/X86SchedHaswell.td | 3 +
lib/Target/X86/X86SchedSandyBridge.td | 3 +
lib/Target/X86/X86ScheduleAtom.td | 4 +
lib/Target/X86/X86ScheduleSLM.td | 849 +-
lib/Target/X86/X86SelectionDAGInfo.cpp | 36 +-
lib/Target/X86/X86Subtarget.cpp | 364 +-
lib/Target/X86/X86Subtarget.h | 11 +-
lib/Target/X86/X86TargetMachine.cpp | 33 +-
lib/Target/X86/X86TargetObjectFile.cpp | 14 +-
lib/Target/X86/X86TargetTransformInfo.cpp | 159 +-
lib/Target/X86/X86VZeroUpper.cpp | 6 +-
.../XCore/Disassembler/XCoreDisassembler.cpp | 19 +-
lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp | 3 +-
lib/Target/XCore/InstPrinter/XCoreInstPrinter.h | 4 +-
lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp | 2 +-
lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h | 2 +-
.../XCore/MCTargetDesc/XCoreMCTargetDesc.cpp | 13 +-
lib/Target/XCore/XCoreAsmPrinter.cpp | 17 +-
lib/Target/XCore/XCoreFrameLowering.cpp | 19 +-
lib/Target/XCore/XCoreFrameLowering.h | 27 +-
lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp | 4 +-
lib/Target/XCore/XCoreISelDAGToDAG.cpp | 17 +-
lib/Target/XCore/XCoreISelLowering.cpp | 125 +-
lib/Target/XCore/XCoreISelLowering.h | 51 +-
lib/Target/XCore/XCoreInstrInfo.cpp | 17 +-
lib/Target/XCore/XCoreInstrInfo.h | 76 +-
lib/Target/XCore/XCoreLowerThreadLocal.cpp | 7 +-
lib/Target/XCore/XCoreRegisterInfo.cpp | 12 +-
lib/Target/XCore/XCoreRegisterInfo.h | 15 +-
lib/Target/XCore/XCoreSelectionDAGInfo.cpp | 19 +-
lib/Target/XCore/XCoreSelectionDAGInfo.h | 4 +-
lib/Target/XCore/XCoreSubtarget.cpp | 6 +-
lib/Target/XCore/XCoreTargetMachine.cpp | 6 +-
lib/Target/XCore/XCoreTargetMachine.h | 18 +-
lib/Target/XCore/XCoreTargetObjectFile.h | 2 +-
lib/Target/XCore/XCoreTargetTransformInfo.cpp | 5 +-
lib/Transforms/Hello/Hello.cpp | 3 +-
lib/Transforms/IPO/ArgumentPromotion.cpp | 17 +-
lib/Transforms/IPO/ConstantMerge.cpp | 9 +-
lib/Transforms/IPO/DeadArgumentElimination.cpp | 8 +-
lib/Transforms/IPO/ExtractGV.cpp | 9 +-
lib/Transforms/IPO/FunctionAttrs.cpp | 28 +-
lib/Transforms/IPO/GlobalDCE.cpp | 23 +-
lib/Transforms/IPO/GlobalOpt.cpp | 269 +-
lib/Transforms/IPO/IPConstantPropagation.cpp | 11 +-
lib/Transforms/IPO/InlineAlways.cpp | 11 +-
lib/Transforms/IPO/InlineSimple.cpp | 7 +-
lib/Transforms/IPO/Inliner.cpp | 70 +-
lib/Transforms/IPO/Internalize.cpp | 10 +-
lib/Transforms/IPO/LoopExtractor.cpp | 7 +-
lib/Transforms/IPO/MergeFunctions.cpp | 734 +-
lib/Transforms/IPO/PartialInlining.cpp | 11 +-
lib/Transforms/IPO/PassManagerBuilder.cpp | 28 +-
lib/Transforms/IPO/PruneEH.cpp | 5 +-
lib/Transforms/IPO/StripDeadPrototypes.cpp | 3 +-
lib/Transforms/IPO/StripSymbols.cpp | 2 +-
lib/Transforms/InstCombine/InstCombine.h | 107 +-
lib/Transforms/InstCombine/InstCombineAddSub.cpp | 102 +-
lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 149 +-
lib/Transforms/InstCombine/InstCombineCalls.cpp | 305 +-
lib/Transforms/InstCombine/InstCombineCasts.cpp | 89 +-
lib/Transforms/InstCombine/InstCombineCompares.cpp | 390 +-
.../InstCombine/InstCombineLoadStoreAlloca.cpp | 51 +-
.../InstCombine/InstCombineMulDivRem.cpp | 128 +-
lib/Transforms/InstCombine/InstCombinePHI.cpp | 80 +-
lib/Transforms/InstCombine/InstCombineSelect.cpp | 92 +-
lib/Transforms/InstCombine/InstCombineShifts.cpp | 91 +-
.../InstCombine/InstCombineSimplifyDemanded.cpp | 85 +-
.../InstCombine/InstCombineVectorOps.cpp | 115 +-
lib/Transforms/InstCombine/InstCombineWorklist.h | 7 +-
.../InstCombine/InstructionCombining.cpp | 246 +-
.../Instrumentation/AddressSanitizer.cpp | 310 +-
lib/Transforms/Instrumentation/BoundsChecking.cpp | 9 +-
.../Instrumentation/DataFlowSanitizer.cpp | 32 +-
lib/Transforms/Instrumentation/DebugIR.cpp | 30 +-
lib/Transforms/Instrumentation/DebugIR.h | 2 +-
lib/Transforms/Instrumentation/GCOVProfiling.cpp | 45 +-
lib/Transforms/Instrumentation/MemorySanitizer.cpp | 226 +-
lib/Transforms/Instrumentation/ThreadSanitizer.cpp | 14 +-
lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h | 40 +-
lib/Transforms/ObjCARC/DependencyAnalysis.cpp | 5 +-
lib/Transforms/ObjCARC/ObjCARCAPElim.cpp | 13 +-
lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp | 3 +-
lib/Transforms/ObjCARC/ObjCARCContract.cpp | 9 +-
lib/Transforms/ObjCARC/ObjCARCExpand.cpp | 4 +-
lib/Transforms/ObjCARC/ObjCARCOpts.cpp | 49 +-
lib/Transforms/Scalar/ADCE.cpp | 3 +-
lib/Transforms/Scalar/Android.mk | 6 +-
lib/Transforms/Scalar/CMakeLists.txt | 11 +-
lib/Transforms/Scalar/ConstantHoisting.cpp | 24 +-
lib/Transforms/Scalar/ConstantProp.cpp | 5 +-
.../Scalar/CorrelatedValuePropagation.cpp | 5 +-
lib/Transforms/Scalar/DCE.cpp | 3 +-
lib/Transforms/Scalar/DeadStoreElimination.cpp | 33 +-
lib/Transforms/Scalar/EarlyCSE.cpp | 23 +-
lib/Transforms/Scalar/FlattenCFGPass.cpp | 3 +-
lib/Transforms/Scalar/GVN.cpp | 115 +-
lib/Transforms/Scalar/GlobalMerge.cpp | 8 +-
lib/Transforms/Scalar/IndVarSimplify.cpp | 80 +-
lib/Transforms/Scalar/JumpThreading.cpp | 39 +-
lib/Transforms/Scalar/LICM.cpp | 15 +-
lib/Transforms/Scalar/LoopDeletion.cpp | 3 +-
lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 65 +-
lib/Transforms/Scalar/LoopInstSimplify.cpp | 17 +-
lib/Transforms/Scalar/LoopRerollPass.cpp | 7 +-
lib/Transforms/Scalar/LoopRotation.cpp | 36 +-
lib/Transforms/Scalar/LoopStrengthReduce.cpp | 683 +-
lib/Transforms/Scalar/LoopUnrollPass.cpp | 12 +-
lib/Transforms/Scalar/LoopUnswitch.cpp | 62 +-
lib/Transforms/Scalar/LowerAtomic.cpp | 5 +-
lib/Transforms/Scalar/MemCpyOptimizer.cpp | 41 +-
lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp | 3 +-
lib/Transforms/Scalar/Reassociate.cpp | 43 +-
lib/Transforms/Scalar/Reg2Mem.cpp | 3 +-
lib/Transforms/Scalar/SCCP.cpp | 33 +-
lib/Transforms/Scalar/SROA.cpp | 106 +-
lib/Transforms/Scalar/SampleProfile.cpp | 10 +-
lib/Transforms/Scalar/Scalar.cpp | 2 +
lib/Transforms/Scalar/ScalarReplAggregates.cpp | 79 +-
lib/Transforms/Scalar/Scalarizer.cpp | 13 +-
.../Scalar/SeparateConstOffsetFromGEP.cpp | 623 +
lib/Transforms/Scalar/SimplifyCFGPass.cpp | 13 +-
lib/Transforms/Scalar/Sink.cpp | 13 +-
lib/Transforms/Scalar/StructurizeCFG.cpp | 27 +-
lib/Transforms/Scalar/TailRecursionElimination.cpp | 380 +-
lib/Transforms/Utils/AddDiscriminators.cpp | 15 +-
lib/Transforms/Utils/Android.mk | 1 +
lib/Transforms/Utils/BasicBlockUtils.cpp | 117 +-
lib/Transforms/Utils/BreakCriticalEdges.cpp | 15 +-
lib/Transforms/Utils/BuildLibCalls.cpp | 31 +-
lib/Transforms/Utils/BypassSlowDivision.cpp | 7 +-
lib/Transforms/Utils/CMakeLists.txt | 1 +
lib/Transforms/Utils/CloneFunction.cpp | 8 +-
lib/Transforms/Utils/CloneModule.cpp | 14 +-
lib/Transforms/Utils/CmpInstAnalysis.cpp | 2 +-
lib/Transforms/Utils/CodeExtractor.cpp | 20 +-
lib/Transforms/Utils/CtorUtils.cpp | 183 +
lib/Transforms/Utils/DemoteRegToStack.cpp | 14 +-
lib/Transforms/Utils/FlattenCFG.cpp | 14 +-
lib/Transforms/Utils/GlobalStatus.cpp | 4 +-
lib/Transforms/Utils/InlineFunction.cpp | 253 +-
lib/Transforms/Utils/IntegerDivision.cpp | 3 +-
lib/Transforms/Utils/LCSSA.cpp | 5 +-
lib/Transforms/Utils/Local.cpp | 84 +-
lib/Transforms/Utils/LoopSimplify.cpp | 42 +-
lib/Transforms/Utils/LoopUnroll.cpp | 28 +-
lib/Transforms/Utils/LoopUnrollRuntime.cpp | 13 +-
lib/Transforms/Utils/LowerExpectIntrinsic.cpp | 3 +-
lib/Transforms/Utils/LowerInvoke.cpp | 3 +-
lib/Transforms/Utils/LowerSwitch.cpp | 7 +-
lib/Transforms/Utils/Mem2Reg.cpp | 3 +-
lib/Transforms/Utils/ModuleUtils.cpp | 29 +-
lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 29 +-
lib/Transforms/Utils/SSAUpdater.cpp | 21 +-
lib/Transforms/Utils/SimplifyCFG.cpp | 153 +-
lib/Transforms/Utils/SimplifyIndVar.cpp | 30 +-
lib/Transforms/Utils/SimplifyInstructions.cpp | 24 +-
lib/Transforms/Utils/SimplifyLibCalls.cpp | 316 +-
lib/Transforms/Utils/SpecialCaseList.cpp | 6 +-
lib/Transforms/Utils/UnifyFunctionExitNodes.cpp | 8 +-
lib/Transforms/Utils/ValueMapper.cpp | 20 +-
lib/Transforms/Vectorize/BBVectorize.cpp | 94 +-
lib/Transforms/Vectorize/LoopVectorize.cpp | 443 +-
lib/Transforms/Vectorize/SLPVectorizer.cpp | 257 +-
shared_llvm.mk | 6 +-
test/Analysis/BlockFrequencyInfo/bad_input.ll | 50 +
test/Analysis/BlockFrequencyInfo/basic.ll | 55 +-
.../Analysis/BlockFrequencyInfo/double_backedge.ll | 27 +
test/Analysis/BlockFrequencyInfo/double_exit.ll | 165 +
test/Analysis/BlockFrequencyInfo/irreducible.ll | 421 +
.../BlockFrequencyInfo/loop_with_branch.ll | 44 +
.../nested_loop_with_branches.ll | 59 +
test/Analysis/BranchProbabilityInfo/loop.ll | 42 +-
test/Analysis/BranchProbabilityInfo/pr18705.ll | 58 +
test/Analysis/CostModel/AArch64/lit.local.cfg | 3 +
test/Analysis/CostModel/AArch64/select.ll | 38 +
test/Analysis/CostModel/AArch64/store.ll | 22 +
test/Analysis/CostModel/ARM64/lit.local.cfg | 3 -
test/Analysis/CostModel/ARM64/select.ll | 38 -
test/Analysis/CostModel/ARM64/store.ll | 22 -
test/Analysis/CostModel/PowerPC/ext.ll | 2 +-
test/Analysis/CostModel/PowerPC/insert_extract.ll | 4 +-
test/Analysis/CostModel/PowerPC/load_store.ll | 8 +-
test/Analysis/CostModel/X86/intrinsic-cost.ll | 28 +
test/Analysis/CostModel/X86/vdiv-cost.ll | 92 +
test/Analysis/CostModel/X86/vselect-cost.ll | 126 +
test/Analysis/Delinearization/a.ll | 11 -
test/Analysis/Delinearization/gcd_multiply_expr.ll | 153 +
test/Analysis/Delinearization/himeno_1.ll | 10 -
test/Analysis/Delinearization/himeno_2.ll | 10 -
.../iv_times_constant_in_subscript.ll | 45 +
test/Analysis/Delinearization/lit.local.cfg | 2 +-
.../multidim_ivs_and_integer_offsets_3d.ll | 10 -
.../multidim_ivs_and_integer_offsets_nts_3d.ll | 10 -
.../multidim_ivs_and_parameteric_offsets_3d.ll | 10 -
.../Delinearization/multidim_only_ivs_2d.ll | 5 -
.../Delinearization/multidim_only_ivs_2d_nested.ll | 2 +
.../Delinearization/multidim_only_ivs_3d.ll | 10 -
.../Delinearization/multidim_only_ivs_3d_cast.ll | 10 -
...tidim_two_accesses_different_delinearization.ll | 43 +
test/Analysis/Delinearization/undef.ll | 38 +
test/Analysis/DependenceAnalysis/Banerjee.ll | 22 +-
test/Analysis/DependenceAnalysis/GCD.ll | 16 +-
test/Analysis/LazyCallGraph/basic.ll | 50 +
test/Analysis/ScalarEvolution/max-trip-count.ll | 109 +
test/Assembler/2009-04-25-AliasGEP.ll | 8 -
test/Assembler/addrspacecast-alias.ll | 5 +-
test/Assembler/alias-addrspace.ll | 6 +
test/Assembler/alias-redefinition.ll | 7 +
test/Assembler/alias-to-alias.ll | 5 +
test/Assembler/alias-to-alias2.ll | 7 +
test/Assembler/alias-type.ll | 6 +
test/Assembler/half-constprop.ll | 2 +-
test/Assembler/half-conv.ll | 2 +-
test/Assembler/internal-hidden-alias.ll | 6 +
test/Assembler/internal-hidden-function.ll | 7 +
test/Assembler/internal-hidden-variable.ll | 4 +
test/Assembler/internal-protected-alias.ll | 6 +
test/Assembler/internal-protected-function.ll | 7 +
test/Assembler/internal-protected-variable.ll | 4 +
test/Assembler/private-hidden-alias.ll | 6 +
test/Assembler/private-hidden-function.ll | 7 +
test/Assembler/private-hidden-variable.ll | 4 +
test/Assembler/private-protected-alias.ll | 6 +
test/Assembler/private-protected-function.ll | 7 +
test/Assembler/private-protected-variable.ll | 4 +
test/Bitcode/attributes.ll | 5 +
...eprecated-linker_private-linker_private_weak.ll | 17 +
.../local-linkage-default-visibility.3.4.ll | 79 +
.../local-linkage-default-visibility.3.4.ll.bc | Bin 0 -> 924 bytes
test/Bitcode/old-aliases.ll | 22 +
test/Bitcode/old-aliases.ll.bc | Bin 0 -> 368 bytes
test/Bitcode/tailcall.ll | 17 +
test/Bitcode/upgrade-global-ctors.ll | 3 +
test/Bitcode/upgrade-global-ctors.ll.bc | Bin 0 -> 316 bytes
test/CMakeLists.txt | 1 +
test/CodeGen/AArch64/128bit_load_store.ll | 20 +-
test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll | 69 +
test/CodeGen/AArch64/adc.ll | 4 +-
test/CodeGen/AArch64/addsub-shifted.ll | 18 +-
test/CodeGen/AArch64/addsub.ll | 22 +-
test/CodeGen/AArch64/addsub_ext.ll | 8 +-
test/CodeGen/AArch64/alloca.ll | 121 +-
test/CodeGen/AArch64/analyze-branch.ll | 4 +-
test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll | 47 +
.../AArch64/arm64-2011-03-17-AsmPrinterCrash.ll | 45 +
.../arm64-2011-03-21-Unaligned-Frame-Index.ll | 12 +
test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll | 26 +
.../CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll | 31 +
.../AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll | 40 +
.../arm64-2012-05-07-DAGCombineVectorExtract.ll | 20 +
.../AArch64/arm64-2012-05-07-MemcpyAlignBug.ll | 21 +
.../AArch64/arm64-2012-05-09-LOADgot-bug.ll | 22 +
.../CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll | 50 +
test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll | 67 +
.../AArch64/arm64-2012-07-11-InstrEmitterBug.ll | 56 +
.../CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll | 19 +
.../CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll | 15 +
.../CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll | 37 +
test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll | 11 +
.../arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll | 23 +
.../arm64-2014-04-28-sqshl-uqshl-i64Contant.ll | 19 +
.../AArch64/arm64-2014-04-29-EXT-undef-mask.ll | 23 +
test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll | 67 +
test/CodeGen/AArch64/arm64-aapcs.ll | 103 +
test/CodeGen/AArch64/arm64-abi-varargs.ll | 191 +
test/CodeGen/AArch64/arm64-abi.ll | 238 +
test/CodeGen/AArch64/arm64-abi_align.ll | 532 +
test/CodeGen/AArch64/arm64-addp.ll | 32 +
test/CodeGen/AArch64/arm64-addr-mode-folding.ll | 171 +
test/CodeGen/AArch64/arm64-addr-type-promotion.ll | 82 +
test/CodeGen/AArch64/arm64-addrmode.ll | 72 +
.../AArch64/arm64-alloc-no-stack-realign.ll | 21 +
.../AArch64/arm64-alloca-frame-pointer-offset.ll | 29 +
test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll | 72 +
test/CodeGen/AArch64/arm64-ands-bad-peephole.ll | 31 +
test/CodeGen/AArch64/arm64-anyregcc-crash.ll | 19 +
test/CodeGen/AArch64/arm64-anyregcc.ll | 363 +
test/CodeGen/AArch64/arm64-arith-saturating.ll | 153 +
test/CodeGen/AArch64/arm64-arith.ll | 262 +
.../arm64-arm64-dead-def-elimination-flag.ll | 16 +
test/CodeGen/AArch64/arm64-atomic-128.ll | 225 +
test/CodeGen/AArch64/arm64-atomic.ll | 331 +
test/CodeGen/AArch64/arm64-basic-pic.ll | 54 +
.../AArch64/arm64-big-endian-bitconverts.ll | 1101 ++
test/CodeGen/AArch64/arm64-big-endian-eh.ll | 73 +
test/CodeGen/AArch64/arm64-big-endian-varargs.ll | 58 +
.../AArch64/arm64-big-endian-vector-callee.ll | 848 +
.../AArch64/arm64-big-endian-vector-caller.ll | 1100 ++
test/CodeGen/AArch64/arm64-big-imm-offsets.ll | 14 +
test/CodeGen/AArch64/arm64-big-stack.ll | 21 +
test/CodeGen/AArch64/arm64-bitfield-extract.ll | 532 +
test/CodeGen/AArch64/arm64-blockaddress.ll | 30 +
test/CodeGen/AArch64/arm64-build-vector.ll | 35 +
test/CodeGen/AArch64/arm64-call-tailcalls.ll | 91 +
test/CodeGen/AArch64/arm64-cast-opt.ll | 31 +
test/CodeGen/AArch64/arm64-ccmp-heuristics.ll | 190 +
test/CodeGen/AArch64/arm64-ccmp.ll | 289 +
test/CodeGen/AArch64/arm64-clrsb.ll | 36 +
test/CodeGen/AArch64/arm64-coalesce-ext.ll | 17 +
test/CodeGen/AArch64/arm64-code-model-large-abs.ll | 72 +
.../AArch64/arm64-collect-loh-garbage-crash.ll | 37 +
test/CodeGen/AArch64/arm64-collect-loh-str.ll | 23 +
test/CodeGen/AArch64/arm64-collect-loh.ll | 53 +
test/CodeGen/AArch64/arm64-complex-copy-noneon.ll | 21 +
test/CodeGen/AArch64/arm64-complex-ret.ll | 7 +
test/CodeGen/AArch64/arm64-const-addr.ll | 23 +
test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll | 24 +
test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll | 29 +
test/CodeGen/AArch64/arm64-copy-tuple.ll | 146 +
test/CodeGen/AArch64/arm64-crc32.ll | 71 +
test/CodeGen/AArch64/arm64-crypto.ll | 135 +
test/CodeGen/AArch64/arm64-cse.ll | 59 +
test/CodeGen/AArch64/arm64-csel.ll | 230 +
test/CodeGen/AArch64/arm64-cvt.ll | 401 +
.../AArch64/arm64-dagcombiner-convergence.ll | 19 +
.../AArch64/arm64-dagcombiner-dead-indexed-load.ll | 29 +
.../AArch64/arm64-dagcombiner-indexed-load.ll | 46 +
.../AArch64/arm64-dagcombiner-load-slicing.ll | 102 +
test/CodeGen/AArch64/arm64-dead-def-frame-index.ll | 18 +
.../CodeGen/AArch64/arm64-dead-register-def-bug.ll | 32 +
test/CodeGen/AArch64/arm64-dup.ll | 323 +
test/CodeGen/AArch64/arm64-early-ifcvt.ll | 423 +
test/CodeGen/AArch64/arm64-elf-calls.ll | 20 +
test/CodeGen/AArch64/arm64-elf-constpool.ll | 13 +
test/CodeGen/AArch64/arm64-elf-globals.ll | 115 +
test/CodeGen/AArch64/arm64-ext.ll | 118 +
test/CodeGen/AArch64/arm64-extend-int-to-fp.ll | 19 +
test/CodeGen/AArch64/arm64-extend.ll | 15 +
test/CodeGen/AArch64/arm64-extern-weak.ll | 51 +
test/CodeGen/AArch64/arm64-extload-knownzero.ll | 28 +
test/CodeGen/AArch64/arm64-extract.ll | 58 +
test/CodeGen/AArch64/arm64-extract_subvector.ll | 51 +
.../CodeGen/AArch64/arm64-fast-isel-addr-offset.ll | 47 +
test/CodeGen/AArch64/arm64-fast-isel-alloca.ll | 25 +
test/CodeGen/AArch64/arm64-fast-isel-br.ll | 155 +
test/CodeGen/AArch64/arm64-fast-isel-call.ll | 100 +
test/CodeGen/AArch64/arm64-fast-isel-conversion.ll | 442 +
test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll | 146 +
test/CodeGen/AArch64/arm64-fast-isel-gv.ll | 38 +
test/CodeGen/AArch64/arm64-fast-isel-icmp.ll | 214 +
test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll | 36 +
test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll | 135 +
.../CodeGen/AArch64/arm64-fast-isel-materialize.ll | 27 +
test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll | 68 +
test/CodeGen/AArch64/arm64-fast-isel-rem.ll | 44 +
test/CodeGen/AArch64/arm64-fast-isel-ret.ll | 63 +
test/CodeGen/AArch64/arm64-fast-isel-select.ll | 63 +
test/CodeGen/AArch64/arm64-fast-isel.ll | 95 +
test/CodeGen/AArch64/arm64-fastcc-tailcall.ll | 24 +
.../arm64-fastisel-gep-promote-before-add.ll | 18 +
test/CodeGen/AArch64/arm64-fcmp-opt.ll | 204 +
test/CodeGen/AArch64/arm64-fcopysign.ll | 51 +
.../arm64-fixed-point-scalar-cvt-dagcombine.ll | 15 +
test/CodeGen/AArch64/arm64-fmadd.ll | 92 +
test/CodeGen/AArch64/arm64-fmax.ll | 34 +
test/CodeGen/AArch64/arm64-fminv.ll | 101 +
test/CodeGen/AArch64/arm64-fmuladd.ll | 88 +
test/CodeGen/AArch64/arm64-fold-address.ll | 79 +
test/CodeGen/AArch64/arm64-fold-lsl.ll | 79 +
test/CodeGen/AArch64/arm64-fp-contract-zero.ll | 14 +
test/CodeGen/AArch64/arm64-fp-imm.ll | 32 +
test/CodeGen/AArch64/arm64-fp.ll | 8 +
test/CodeGen/AArch64/arm64-fp128-folding.ll | 17 +
test/CodeGen/AArch64/arm64-fp128.ll | 273 +
test/CodeGen/AArch64/arm64-frame-index.ll | 11 +
test/CodeGen/AArch64/arm64-frameaddr.ll | 15 +
test/CodeGen/AArch64/arm64-global-address.ll | 14 +
test/CodeGen/AArch64/arm64-hello.ll | 38 +
test/CodeGen/AArch64/arm64-i16-subreg-extract.ll | 12 +
test/CodeGen/AArch64/arm64-icmp-opt.ll | 17 +
test/CodeGen/AArch64/arm64-illegal-float-ops.ll | 295 +
test/CodeGen/AArch64/arm64-indexed-memory.ll | 351 +
.../CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll | 40 +
test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll | 6174 +++++++
test/CodeGen/AArch64/arm64-inline-asm-error-I.ll | 11 +
test/CodeGen/AArch64/arm64-inline-asm-error-J.ll | 11 +
test/CodeGen/AArch64/arm64-inline-asm-error-K.ll | 11 +
test/CodeGen/AArch64/arm64-inline-asm-error-L.ll | 11 +
test/CodeGen/AArch64/arm64-inline-asm-error-M.ll | 11 +
test/CodeGen/AArch64/arm64-inline-asm-error-N.ll | 11 +
.../AArch64/arm64-inline-asm-zero-reg-error.ll | 11 +
test/CodeGen/AArch64/arm64-inline-asm.ll | 230 +
test/CodeGen/AArch64/arm64-join-reserved.ll | 17 +
test/CodeGen/AArch64/arm64-jumptable.ll | 35 +
test/CodeGen/AArch64/arm64-large-frame.ll | 69 +
test/CodeGen/AArch64/arm64-ld1.ll | 1345 ++
test/CodeGen/AArch64/arm64-ldp.ll | 149 +
test/CodeGen/AArch64/arm64-ldur.ll | 67 +
test/CodeGen/AArch64/arm64-ldxr-stxr.ll | 270 +
test/CodeGen/AArch64/arm64-leaf.ll | 13 +
test/CodeGen/AArch64/arm64-long-shift.ll | 59 +
test/CodeGen/AArch64/arm64-memcpy-inline.ll | 112 +
test/CodeGen/AArch64/arm64-memset-inline.ll | 27 +
test/CodeGen/AArch64/arm64-memset-to-bzero.ll | 108 +
test/CodeGen/AArch64/arm64-misched-basic-A53.ll | 124 +
.../AArch64/arm64-misched-forwarding-A53.ll | 21 +
test/CodeGen/AArch64/arm64-movi.ll | 202 +
test/CodeGen/AArch64/arm64-mul.ll | 90 +
test/CodeGen/AArch64/arm64-named-reg-alloc.ll | 14 +
test/CodeGen/AArch64/arm64-named-reg-notareg.ll | 13 +
test/CodeGen/AArch64/arm64-neg.ll | 71 +
test/CodeGen/AArch64/arm64-neon-2velem-high.ll | 341 +
test/CodeGen/AArch64/arm64-neon-2velem.ll | 2853 ++++
test/CodeGen/AArch64/arm64-neon-3vdiff.ll | 1829 +++
test/CodeGen/AArch64/arm64-neon-aba-abd.ll | 236 +
test/CodeGen/AArch64/arm64-neon-across.ll | 460 +
test/CodeGen/AArch64/arm64-neon-add-pairwise.ll | 100 +
test/CodeGen/AArch64/arm64-neon-add-sub.ll | 237 +
.../AArch64/arm64-neon-compare-instructions.ll | 1191 ++
test/CodeGen/AArch64/arm64-neon-copy.ll | 1445 ++
.../AArch64/arm64-neon-copyPhysReg-tuple.ll | 48 +
test/CodeGen/AArch64/arm64-neon-mul-div.ll | 797 +
.../AArch64/arm64-neon-scalar-by-elem-mul.ll | 124 +
test/CodeGen/AArch64/arm64-neon-select_cc.ll | 206 +
test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll | 482 +
test/CodeGen/AArch64/arm64-neon-simd-shift.ll | 663 +
test/CodeGen/AArch64/arm64-neon-simd-vget.ll | 225 +
test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll | 74 +
.../AArch64/arm64-neon-vector-list-spill.ll | 175 +
test/CodeGen/AArch64/arm64-patchpoint.ll | 171 +
test/CodeGen/AArch64/arm64-pic-local-symbol.ll | 22 +
test/CodeGen/AArch64/arm64-platform-reg.ll | 26 +
test/CodeGen/AArch64/arm64-popcnt.ll | 43 +
test/CodeGen/AArch64/arm64-prefetch.ll | 88 +
test/CodeGen/AArch64/arm64-promote-const.ll | 255 +
test/CodeGen/AArch64/arm64-redzone.ll | 18 +
test/CodeGen/AArch64/arm64-reg-copy-noneon.ll | 20 +
.../AArch64/arm64-register-offset-addressing.ll | 145 +
test/CodeGen/AArch64/arm64-register-pairing.ll | 53 +
.../AArch64/arm64-regress-f128csel-flags.ll | 27 +
.../AArch64/arm64-regress-interphase-shift.ll | 33 +
test/CodeGen/AArch64/arm64-return-vector.ll | 11 +
test/CodeGen/AArch64/arm64-returnaddr.ll | 26 +
test/CodeGen/AArch64/arm64-rev.ll | 235 +
test/CodeGen/AArch64/arm64-rounding.ll | 208 +
test/CodeGen/AArch64/arm64-scaled_iv.ll | 38 +
test/CodeGen/AArch64/arm64-scvt.ll | 830 +
test/CodeGen/AArch64/arm64-shifted-sext.ll | 277 +
.../CodeGen/AArch64/arm64-simd-scalar-to-vector.ll | 22 +
test/CodeGen/AArch64/arm64-simplest-elf.ll | 18 +
test/CodeGen/AArch64/arm64-sincos.ll | 42 +
.../CodeGen/AArch64/arm64-sitofp-combine-chains.ll | 22 +
test/CodeGen/AArch64/arm64-sli-sri-opt.ll | 41 +
test/CodeGen/AArch64/arm64-smaxv.ll | 74 +
test/CodeGen/AArch64/arm64-sminv.ll | 74 +
test/CodeGen/AArch64/arm64-spill-lr.ll | 74 +
test/CodeGen/AArch64/arm64-spill.ll | 15 +
test/CodeGen/AArch64/arm64-st1.ll | 676 +
test/CodeGen/AArch64/arm64-stack-no-frame.ll | 20 +
test/CodeGen/AArch64/arm64-stackmap.ll | 288 +
test/CodeGen/AArch64/arm64-stackpointer.ll | 24 +
test/CodeGen/AArch64/arm64-stacksave.ll | 20 +
test/CodeGen/AArch64/arm64-stp.ll | 101 +
test/CodeGen/AArch64/arm64-strict-align.ll | 26 +
test/CodeGen/AArch64/arm64-stur.ll | 98 +
test/CodeGen/AArch64/arm64-subsections.ll | 5 +
test/CodeGen/AArch64/arm64-subvector-extend.ll | 141 +
.../AArch64/arm64-swizzle-tbl-i16-layout.ll | 36 +
test/CodeGen/AArch64/arm64-tbl.ll | 132 +
test/CodeGen/AArch64/arm64-this-return.ll | 83 +
test/CodeGen/AArch64/arm64-tls-darwin.ll | 18 +
test/CodeGen/AArch64/arm64-tls-dynamic-together.ll | 18 +
test/CodeGen/AArch64/arm64-tls-dynamics.ll | 135 +
test/CodeGen/AArch64/arm64-tls-execs.ll | 63 +
test/CodeGen/AArch64/arm64-trap.ll | 8 +
test/CodeGen/AArch64/arm64-trn.ll | 134 +
test/CodeGen/AArch64/arm64-trunc-store.ll | 75 +
test/CodeGen/AArch64/arm64-umaxv.ll | 92 +
test/CodeGen/AArch64/arm64-uminv.ll | 92 +
test/CodeGen/AArch64/arm64-umov.ll | 33 +
test/CodeGen/AArch64/arm64-unaligned_ldst.ll | 41 +
test/CodeGen/AArch64/arm64-uzp.ll | 107 +
test/CodeGen/AArch64/arm64-vaargs.ll | 20 +
test/CodeGen/AArch64/arm64-vabs.ll | 804 +
test/CodeGen/AArch64/arm64-vadd.ll | 941 ++
test/CodeGen/AArch64/arm64-vaddlv.ll | 26 +
test/CodeGen/AArch64/arm64-vaddv.ll | 245 +
test/CodeGen/AArch64/arm64-variadic-aapcs.ll | 143 +
test/CodeGen/AArch64/arm64-vbitwise.ll | 91 +
test/CodeGen/AArch64/arm64-vclz.ll | 109 +
test/CodeGen/AArch64/arm64-vcmp.ll | 236 +
test/CodeGen/AArch64/arm64-vcnt.ll | 56 +
test/CodeGen/AArch64/arm64-vcombine.ll | 17 +
test/CodeGen/AArch64/arm64-vcvt.ll | 686 +
test/CodeGen/AArch64/arm64-vcvt_f.ll | 82 +
test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll | 73 +
test/CodeGen/AArch64/arm64-vcvt_n.ll | 49 +
test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll | 34 +
test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll | 11 +
test/CodeGen/AArch64/arm64-vecCmpBr.ll | 207 +
test/CodeGen/AArch64/arm64-vecFold.ll | 145 +
test/CodeGen/AArch64/arm64-vector-ext.ll | 16 +
test/CodeGen/AArch64/arm64-vector-imm.ll | 134 +
test/CodeGen/AArch64/arm64-vector-insertion.ll | 33 +
test/CodeGen/AArch64/arm64-vector-ldst.ll | 601 +
test/CodeGen/AArch64/arm64-vext.ll | 464 +
test/CodeGen/AArch64/arm64-vext_reverse.ll | 172 +
test/CodeGen/AArch64/arm64-vfloatintrinsics.ll | 375 +
test/CodeGen/AArch64/arm64-vhadd.ll | 249 +
test/CodeGen/AArch64/arm64-vhsub.ll | 125 +
test/CodeGen/AArch64/arm64-virtual_base.ll | 51 +
test/CodeGen/AArch64/arm64-vmax.ll | 679 +
test/CodeGen/AArch64/arm64-vminmaxnm.ll | 68 +
test/CodeGen/AArch64/arm64-vmovn.ll | 242 +
test/CodeGen/AArch64/arm64-vmul.ll | 2036 +++
test/CodeGen/AArch64/arm64-volatile.ll | 27 +
test/CodeGen/AArch64/arm64-vpopcnt.ll | 68 +
test/CodeGen/AArch64/arm64-vqadd.ll | 332 +
test/CodeGen/AArch64/arm64-vqsub.ll | 147 +
test/CodeGen/AArch64/arm64-vselect.ll | 25 +
test/CodeGen/AArch64/arm64-vsetcc_fp.ll | 11 +
test/CodeGen/AArch64/arm64-vshift.ll | 1917 +++
test/CodeGen/AArch64/arm64-vshr.ll | 63 +
test/CodeGen/AArch64/arm64-vshuffle.ll | 115 +
test/CodeGen/AArch64/arm64-vsqrt.ll | 232 +
test/CodeGen/AArch64/arm64-vsra.ll | 150 +
test/CodeGen/AArch64/arm64-vsub.ll | 417 +
test/CodeGen/AArch64/arm64-weak-reference.ll | 10 +
test/CodeGen/AArch64/arm64-xaluo.ll | 524 +
test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll | 17 +
test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll | 49 +
test/CodeGen/AArch64/arm64-zext.ll | 11 +
test/CodeGen/AArch64/arm64-zextload-unscaled.ll | 40 +
test/CodeGen/AArch64/arm64-zip.ll | 107 +
test/CodeGen/AArch64/asm-large-immediate.ll | 10 +
test/CodeGen/AArch64/assertion-rc-mismatch.ll | 2 +-
test/CodeGen/AArch64/atomic-ops.ll | 474 +-
test/CodeGen/AArch64/basic-pic.ll | 12 +-
test/CodeGen/AArch64/bitfield-insert-0.ll | 2 +-
test/CodeGen/AArch64/bitfield-insert.ll | 25 +-
test/CodeGen/AArch64/bitfield.ll | 10 +-
test/CodeGen/AArch64/blockaddress.ll | 2 +-
test/CodeGen/AArch64/bool-loads.ll | 22 +-
test/CodeGen/AArch64/breg.ll | 4 +-
test/CodeGen/AArch64/callee-save.ll | 18 +-
test/CodeGen/AArch64/code-model-large-abs.ll | 2 +-
test/CodeGen/AArch64/compare-branch.ll | 2 +-
test/CodeGen/AArch64/concatvector-bugs.ll | 68 -
test/CodeGen/AArch64/cond-sel.ll | 38 +-
test/CodeGen/AArch64/cpus.ll | 9 +-
test/CodeGen/AArch64/directcond.ll | 24 +-
test/CodeGen/AArch64/dp-3source.ll | 2 +-
test/CodeGen/AArch64/dp1.ll | 2 +-
test/CodeGen/AArch64/dp2.ll | 24 +-
test/CodeGen/AArch64/eliminate-trunc.ll | 39 +
test/CodeGen/AArch64/extern-weak.ll | 28 +-
test/CodeGen/AArch64/extract.ll | 6 +-
test/CodeGen/AArch64/fastcc-reserved.ll | 16 +-
test/CodeGen/AArch64/fastcc.ll | 47 +-
test/CodeGen/AArch64/fcmp.ll | 2 +-
test/CodeGen/AArch64/fcvt-fixed.ll | 6 +-
test/CodeGen/AArch64/fcvt-int.ll | 2 +-
test/CodeGen/AArch64/flags-multiuse.ll | 4 +-
test/CodeGen/AArch64/floatdp_1source.ll | 2 +-
test/CodeGen/AArch64/floatdp_2source.ll | 2 +-
test/CodeGen/AArch64/fp-cond-sel.ll | 23 +-
test/CodeGen/AArch64/fp-dp3.ll | 4 +-
test/CodeGen/AArch64/fp128-folding.ll | 4 +-
test/CodeGen/AArch64/fp128.ll | 279 -
test/CodeGen/AArch64/fpimm.ll | 6 +-
test/CodeGen/AArch64/frameaddr.ll | 4 +-
test/CodeGen/AArch64/free-zext.ll | 14 +
test/CodeGen/AArch64/func-argpassing.ll | 64 +-
test/CodeGen/AArch64/func-calls.ll | 61 +-
test/CodeGen/AArch64/global-alignment.ll | 26 +-
test/CodeGen/AArch64/got-abuse.ll | 6 +-
test/CodeGen/AArch64/i1-contents.ll | 55 +
test/CodeGen/AArch64/i128-align.ll | 6 +-
test/CodeGen/AArch64/i128-shift.ll | 43 -
test/CodeGen/AArch64/illegal-float-ops.ll | 2 +-
test/CodeGen/AArch64/init-array.ll | 4 +-
.../CodeGen/AArch64/inline-asm-constraints-badI.ll | 4 +-
.../CodeGen/AArch64/inline-asm-constraints-badK.ll | 2 +-
.../AArch64/inline-asm-constraints-badK2.ll | 2 +-
.../CodeGen/AArch64/inline-asm-constraints-badL.ll | 2 +-
test/CodeGen/AArch64/inline-asm-constraints.ll | 137 -
test/CodeGen/AArch64/inline-asm-modifiers.ll | 147 -
test/CodeGen/AArch64/jump-table.ll | 10 +-
test/CodeGen/AArch64/large-consts.ll | 9 +-
test/CodeGen/AArch64/large-frame.ll | 119 -
test/CodeGen/AArch64/ldst-opt.ll | 301 +
test/CodeGen/AArch64/ldst-regoffset.ll | 80 +-
test/CodeGen/AArch64/ldst-unscaledimm.ll | 6 +-
test/CodeGen/AArch64/ldst-unsignedimm.ll | 62 +-
test/CodeGen/AArch64/lit.local.cfg | 7 +
test/CodeGen/AArch64/literal_pools.ll | 103 -
test/CodeGen/AArch64/literal_pools_float.ll | 46 +
test/CodeGen/AArch64/local_vars.ll | 21 +-
test/CodeGen/AArch64/logical-imm.ll | 2 +-
test/CodeGen/AArch64/logical_shifted_reg.ll | 6 +-
test/CodeGen/AArch64/mature-mc-support.ll | 8 +-
test/CodeGen/AArch64/misched-basic-A53.ll | 112 -
test/CodeGen/AArch64/movw-consts.ll | 32 +-
test/CodeGen/AArch64/movw-shift-encoding.ll | 9 +-
test/CodeGen/AArch64/mul-lohi.ll | 4 +-
test/CodeGen/AArch64/neon-2velem-high.ll | 331 -
test/CodeGen/AArch64/neon-2velem.ll | 2853 ----
test/CodeGen/AArch64/neon-3vdiff.ll | 1833 ---
test/CodeGen/AArch64/neon-aba-abd.ll | 236 -
test/CodeGen/AArch64/neon-across.ll | 472 -
test/CodeGen/AArch64/neon-add-pairwise.ll | 101 -
test/CodeGen/AArch64/neon-add-sub.ll | 279 -
test/CodeGen/AArch64/neon-bitwise-instructions.ll | 519 +-
test/CodeGen/AArch64/neon-bsl.ll | 235 -
test/CodeGen/AArch64/neon-compare-instructions.ll | 957 +-
test/CodeGen/AArch64/neon-copy.ll | 1402 --
test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll | 47 -
test/CodeGen/AArch64/neon-crypto.ll | 144 -
test/CodeGen/AArch64/neon-diagnostics.ll | 2 +-
test/CodeGen/AArch64/neon-extract.ll | 106 +-
test/CodeGen/AArch64/neon-facge-facgt.ll | 56 -
test/CodeGen/AArch64/neon-frsqrt-frecp.ll | 54 -
test/CodeGen/AArch64/neon-halving-add-sub.ll | 207 -
test/CodeGen/AArch64/neon-idiv.ll | 13 +
test/CodeGen/AArch64/neon-load-store-v1i32.ll | 29 -
test/CodeGen/AArch64/neon-max-min-pairwise.ll | 346 -
test/CodeGen/AArch64/neon-max-min.ll | 310 -
test/CodeGen/AArch64/neon-misc-scalar.ll | 60 -
test/CodeGen/AArch64/neon-misc.ll | 2014 ---
test/CodeGen/AArch64/neon-mov.ll | 127 +-
test/CodeGen/AArch64/neon-mul-div.ll | 754 -
test/CodeGen/AArch64/neon-perm.ll | 838 +-
test/CodeGen/AArch64/neon-rounding-halving-add.ll | 105 -
test/CodeGen/AArch64/neon-rounding-shift.ll | 121 -
test/CodeGen/AArch64/neon-saturating-add-sub.ll | 241 -
.../AArch64/neon-saturating-rounding-shift.ll | 121 -
test/CodeGen/AArch64/neon-saturating-shift.ll | 121 -
test/CodeGen/AArch64/neon-scalar-abs.ll | 61 -
test/CodeGen/AArch64/neon-scalar-add-sub.ll | 50 -
test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll | 28 +-
test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll | 124 -
test/CodeGen/AArch64/neon-scalar-compare.ll | 343 -
test/CodeGen/AArch64/neon-scalar-copy.ll | 64 +-
test/CodeGen/AArch64/neon-scalar-cvt.ll | 133 -
test/CodeGen/AArch64/neon-scalar-ext.ll | 113 -
test/CodeGen/AArch64/neon-scalar-extract-narrow.ll | 104 -
test/CodeGen/AArch64/neon-scalar-fabd.ll | 20 -
test/CodeGen/AArch64/neon-scalar-fcvt.ll | 233 -
test/CodeGen/AArch64/neon-scalar-fp-compare.ll | 282 -
test/CodeGen/AArch64/neon-scalar-mul.ll | 143 -
test/CodeGen/AArch64/neon-scalar-neg.ll | 61 -
test/CodeGen/AArch64/neon-scalar-recip.ll | 92 -
.../CodeGen/AArch64/neon-scalar-reduce-pairwise.ll | 215 -
test/CodeGen/AArch64/neon-scalar-rounding-shift.ll | 39 -
.../AArch64/neon-scalar-saturating-add-sub.ll | 242 -
.../neon-scalar-saturating-rounding-shift.ll | 94 -
.../AArch64/neon-scalar-saturating-shift.ll | 88 -
test/CodeGen/AArch64/neon-scalar-shift-imm.ll | 531 -
test/CodeGen/AArch64/neon-scalar-shift.ll | 236 -
test/CodeGen/AArch64/neon-select_cc.ll | 202 -
test/CodeGen/AArch64/neon-shift.ll | 171 -
test/CodeGen/AArch64/neon-shl-ashr-lshr.ll | 333 -
test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll | 2314 ---
test/CodeGen/AArch64/neon-simd-ldst-one.ll | 2299 ---
test/CodeGen/AArch64/neon-simd-ldst.ll | 164 -
.../AArch64/neon-simd-post-ldst-multi-elem.ll | 354 -
test/CodeGen/AArch64/neon-simd-post-ldst-one.ll | 319 -
test/CodeGen/AArch64/neon-simd-shift.ll | 1556 --
test/CodeGen/AArch64/neon-simd-tbl.ll | 828 -
test/CodeGen/AArch64/neon-simd-vget.ll | 225 -
test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll | 30 -
test/CodeGen/AArch64/neon-truncStore-extLoad.ll | 8 +-
test/CodeGen/AArch64/neon-v1i1-setcc.ll | 68 -
test/CodeGen/AArch64/neon-vector-list-spill.ll | 175 -
test/CodeGen/AArch64/nzcv-save.ll | 18 +
test/CodeGen/AArch64/pic-eh-stubs.ll | 6 +-
test/CodeGen/AArch64/ragreedy-csr.ll | 6 +-
test/CodeGen/AArch64/regress-bitcast-formals.ll | 2 +-
test/CodeGen/AArch64/regress-f128csel-flags.ll | 2 +-
test/CodeGen/AArch64/regress-fp128-livein.ll | 2 +-
test/CodeGen/AArch64/regress-tail-livereg.ll | 2 +-
test/CodeGen/AArch64/regress-tblgen-chains.ll | 13 +-
.../AArch64/regress-w29-reserved-with-fp.ll | 15 -
test/CodeGen/AArch64/regress-wzr-allocatable.ll | 41 -
test/CodeGen/AArch64/returnaddr.ll | 2 +-
test/CodeGen/AArch64/setcc-takes-i32.ll | 2 +-
test/CodeGen/AArch64/sext_inreg.ll | 198 -
test/CodeGen/AArch64/sibling-call.ll | 14 +-
test/CodeGen/AArch64/sincos-expansion.ll | 2 +-
test/CodeGen/AArch64/sincospow-vector-expansion.ll | 2 +-
test/CodeGen/AArch64/tail-call.ll | 34 +-
test/CodeGen/AArch64/tls-dynamic-together.ll | 18 -
test/CodeGen/AArch64/tls-dynamics.ll | 121 -
test/CodeGen/AArch64/tls-execs.ll | 63 -
test/CodeGen/AArch64/tst-br.ll | 12 +-
test/CodeGen/AArch64/variadic.ll | 241 -
test/CodeGen/AArch64/zero-reg.ll | 9 +-
test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll | 2 +
test/CodeGen/ARM/2010-08-04-StackVariable.ll | 2 +-
test/CodeGen/ARM/2013-05-07-ByteLoadSameAddress.ll | 2 +-
test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll | 50 +
test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll | 27 +
test/CodeGen/ARM/Windows/chkstk.ll | 24 +
test/CodeGen/ARM/Windows/frame-register.ll | 22 +
.../Windows/integer-floating-point-conversion.ll | 74 +
test/CodeGen/ARM/Windows/memset.ll | 18 +
test/CodeGen/ARM/Windows/mov32t-bundling.ll | 28 +
test/CodeGen/ARM/Windows/movw-movt-relocations.ll | 27 +
test/CodeGen/ARM/Windows/no-aeabi.ll | 22 +
test/CodeGen/ARM/Windows/pic.ll | 16 +
test/CodeGen/ARM/Windows/read-only-data.ll | 15 +
test/CodeGen/ARM/aapcs-hfa-code.ll | 111 +
test/CodeGen/ARM/aapcs-hfa.ll | 164 +
test/CodeGen/ARM/aliases.ll | 2 +-
test/CodeGen/ARM/argaddr.ll | 2 +-
test/CodeGen/ARM/atomic-64bit.ll | 131 +-
test/CodeGen/ARM/atomic-ops-v8.ll | 108 +-
test/CodeGen/ARM/available_externally.ll | 6 +-
test/CodeGen/ARM/big-endian-eh-unwind.ll | 73 +
test/CodeGen/ARM/big-endian-neon-bitconv.ll | 392 +
test/CodeGen/ARM/big-endian-vector-callee.ll | 1172 ++
test/CodeGen/ARM/big-endian-vector-caller.ll | 1369 ++
test/CodeGen/ARM/bswap16.ll | 42 +
test/CodeGen/ARM/build-attributes.ll | 10 +
test/CodeGen/ARM/dagcombine-concatvector.ll | 11 +-
test/CodeGen/ARM/debug-frame-vararg.ll | 9 +-
test/CodeGen/ARM/debug-frame.ll | 11 +-
test/CodeGen/ARM/debug-segmented-stacks.ll | 8 +-
test/CodeGen/ARM/dwarf-eh.ll | 71 +
test/CodeGen/ARM/ehabi-handlerdata-nounwind.ll | 61 +
test/CodeGen/ARM/ehabi-handlerdata.ll | 59 +
test/CodeGen/ARM/ehabi.ll | 231 +-
test/CodeGen/ARM/frame-register.ll | 38 +
test/CodeGen/ARM/func-argpassing-endian.ll | 122 +
test/CodeGen/ARM/hfa-in-contiguous-registers.ll | 94 +
test/CodeGen/ARM/hints.ll | 69 +
test/CodeGen/ARM/ifcvt-branch-weight-bug.ll | 3 +-
test/CodeGen/ARM/indirect-hidden.ll | 22 +
test/CodeGen/ARM/interrupt-attr.ll | 38 +-
test/CodeGen/ARM/intrinsics-overflow.ll | 57 +
test/CodeGen/ARM/intrinsics-v8.ll | 4 +-
test/CodeGen/ARM/longMAC.ll | 21 +-
test/CodeGen/ARM/long_shift.ll | 58 +-
test/CodeGen/ARM/memcpy-inline.ll | 28 +-
test/CodeGen/ARM/misched-copy-arm.ll | 2 +-
test/CodeGen/ARM/movt.ll | 2 +-
test/CodeGen/ARM/mul.ll | 14 +-
test/CodeGen/ARM/mvn.ll | 3 +-
test/CodeGen/ARM/named-reg-alloc.ll | 14 +
test/CodeGen/ARM/named-reg-notareg.ll | 13 +
test/CodeGen/ARM/phi.ll | 1 +
test/CodeGen/ARM/ret_i64_arg2.ll | 2 +-
test/CodeGen/ARM/ret_i64_arg3.ll | 2 +-
test/CodeGen/ARM/segmented-stacks-dynamic.ll | 16 +-
test/CodeGen/ARM/segmented-stacks.ll | 36 +-
test/CodeGen/ARM/smml.ll | 3 +-
test/CodeGen/ARM/stack-frame.ll | 8 +-
test/CodeGen/ARM/stackpointer.ll | 25 +
test/CodeGen/ARM/sub.ll | 21 +-
test/CodeGen/ARM/t2-imm.ll | 2 +-
test/CodeGen/ARM/thumb2-it-block.ll | 4 +-
test/CodeGen/ARM/trap.ll | 1 +
test/CodeGen/ARM/undefined.ll | 14 +
test/CodeGen/ARM/vcombine.ll | 39 +-
test/CodeGen/ARM/vfp-libcalls.ll | 11 +
test/CodeGen/ARM/vrev.ll | 8 +
test/CodeGen/ARM/zextload_demandedbits.ll | 2 +-
test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll | 47 -
test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll | 45 -
.../ARM64/2011-03-21-Unaligned-Frame-Index.ll | 12 -
test/CodeGen/ARM64/2011-04-21-CPSRBug.ll | 26 -
test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll | 31 -
.../CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll | 40 -
.../ARM64/2012-05-07-DAGCombineVectorExtract.ll | 20 -
test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll | 21 -
test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll | 22 -
test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll | 50 -
test/CodeGen/ARM64/2012-06-06-FPToUI.ll | 65 -
test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll | 56 -
test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll | 19 -
test/CodeGen/ARM64/2013-01-23-frem-crash.ll | 15 -
test/CodeGen/ARM64/2013-01-23-sext-crash.ll | 37 -
test/CodeGen/ARM64/2013-02-12-shufv8i8.ll | 11 -
test/CodeGen/ARM64/AdvSIMD-Scalar.ll | 38 -
test/CodeGen/ARM64/aapcs.ll | 86 -
test/CodeGen/ARM64/abi-varargs.ll | 191 -
test/CodeGen/ARM64/abi.ll | 236 -
test/CodeGen/ARM64/abi_align.ll | 529 -
test/CodeGen/ARM64/addp.ll | 32 -
test/CodeGen/ARM64/addr-mode-folding.ll | 171 -
test/CodeGen/ARM64/addr-type-promotion.ll | 82 -
test/CodeGen/ARM64/addrmode.ll | 72 -
test/CodeGen/ARM64/alloc-no-stack-realign.ll | 21 -
test/CodeGen/ARM64/alloca-frame-pointer-offset.ll | 29 -
test/CodeGen/ARM64/andCmpBrToTBZ.ll | 72 -
test/CodeGen/ARM64/anyregcc-crash.ll | 19 -
test/CodeGen/ARM64/anyregcc.ll | 363 -
test/CodeGen/ARM64/arith-saturating.ll | 153 -
test/CodeGen/ARM64/arith.ll | 262 -
test/CodeGen/ARM64/atomic-128.ll | 213 -
test/CodeGen/ARM64/atomic.ll | 343 -
test/CodeGen/ARM64/basic-pic.ll | 54 -
test/CodeGen/ARM64/big-imm-offsets.ll | 14 -
test/CodeGen/ARM64/big-stack.ll | 21 -
test/CodeGen/ARM64/bitfield-extract.ll | 406 -
test/CodeGen/ARM64/blockaddress.ll | 30 -
test/CodeGen/ARM64/build-vector.ll | 35 -
test/CodeGen/ARM64/call-tailcalls.ll | 91 -
test/CodeGen/ARM64/cast-opt.ll | 31 -
test/CodeGen/ARM64/ccmp-heuristics.ll | 190 -
test/CodeGen/ARM64/ccmp.ll | 289 -
test/CodeGen/ARM64/coalesce-ext.ll | 17 -
test/CodeGen/ARM64/code-model-large-abs.ll | 72 -
test/CodeGen/ARM64/collect-loh-garbage-crash.ll | 37 -
test/CodeGen/ARM64/collect-loh-str.ll | 23 -
test/CodeGen/ARM64/collect-loh.ll | 47 -
test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S | 17 -
test/CodeGen/ARM64/complex-ret.ll | 7 -
test/CodeGen/ARM64/convert-v2f64-v2i32.ll | 24 -
test/CodeGen/ARM64/convert-v2i32-v2f64.ll | 29 -
test/CodeGen/ARM64/copy-tuple.ll | 146 -
test/CodeGen/ARM64/crc32.ll | 71 -
test/CodeGen/ARM64/crypto.ll | 135 -
test/CodeGen/ARM64/cse.ll | 59 -
test/CodeGen/ARM64/csel.ll | 222 -
test/CodeGen/ARM64/cvt.ll | 401 -
test/CodeGen/ARM64/dagcombiner-convergence.ll | 19 -
test/CodeGen/ARM64/dagcombiner-load-slicing.ll | 102 -
test/CodeGen/ARM64/dup.ll | 322 -
test/CodeGen/ARM64/early-ifcvt.ll | 423 -
test/CodeGen/ARM64/elf-calls.ll | 20 -
test/CodeGen/ARM64/elf-constpool.ll | 13 -
test/CodeGen/ARM64/elf-globals.ll | 115 -
test/CodeGen/ARM64/ext.ll | 101 -
test/CodeGen/ARM64/extend-int-to-fp.ll | 19 -
test/CodeGen/ARM64/extend.ll | 15 -
test/CodeGen/ARM64/extern-weak.ll | 51 -
test/CodeGen/ARM64/extload-knownzero.ll | 28 -
test/CodeGen/ARM64/extract.ll | 58 -
test/CodeGen/ARM64/extract_subvector.ll | 51 -
test/CodeGen/ARM64/fast-isel-addr-offset.ll | 47 -
test/CodeGen/ARM64/fast-isel-alloca.ll | 24 -
test/CodeGen/ARM64/fast-isel-br.ll | 155 -
test/CodeGen/ARM64/fast-isel-call.ll | 91 -
test/CodeGen/ARM64/fast-isel-conversion.ll | 416 -
test/CodeGen/ARM64/fast-isel-fcmp.ll | 146 -
test/CodeGen/ARM64/fast-isel-gv.ll | 38 -
test/CodeGen/ARM64/fast-isel-icmp.ll | 214 -
test/CodeGen/ARM64/fast-isel-indirectbr.ll | 36 -
test/CodeGen/ARM64/fast-isel-intrinsic.ll | 135 -
test/CodeGen/ARM64/fast-isel-materialize.ll | 27 -
test/CodeGen/ARM64/fast-isel-noconvert.ll | 36 -
test/CodeGen/ARM64/fast-isel-rem.ll | 33 -
test/CodeGen/ARM64/fast-isel-ret.ll | 63 -
test/CodeGen/ARM64/fast-isel-select.ll | 63 -
test/CodeGen/ARM64/fast-isel.ll | 95 -
test/CodeGen/ARM64/fastcc-tailcall.ll | 24 -
.../ARM64/fastisel-gep-promote-before-add.ll | 18 -
test/CodeGen/ARM64/fcmp-opt.ll | 173 -
test/CodeGen/ARM64/fcopysign.ll | 51 -
.../ARM64/fixed-point-scalar-cvt-dagcombine.ll | 15 -
test/CodeGen/ARM64/fmadd.ll | 92 -
test/CodeGen/ARM64/fmax.ll | 21 -
test/CodeGen/ARM64/fminv.ll | 101 -
test/CodeGen/ARM64/fmuladd.ll | 88 -
test/CodeGen/ARM64/fold-address.ll | 79 -
test/CodeGen/ARM64/fold-lsl.ll | 79 -
test/CodeGen/ARM64/fp-imm.ll | 32 -
test/CodeGen/ARM64/fp.ll | 8 -
test/CodeGen/ARM64/fp128-folding.ll | 17 -
test/CodeGen/ARM64/fp128.ll | 274 -
test/CodeGen/ARM64/frame-index.ll | 11 -
test/CodeGen/ARM64/frameaddr.ll | 15 -
test/CodeGen/ARM64/global-address.ll | 14 -
test/CodeGen/ARM64/hello.ll | 38 -
test/CodeGen/ARM64/i16-subreg-extract.ll | 12 -
test/CodeGen/ARM64/icmp-opt.ll | 17 -
test/CodeGen/ARM64/illegal-float-ops.ll | 295 -
test/CodeGen/ARM64/indexed-memory.ll | 351 -
test/CodeGen/ARM64/inline-asm-error-I.ll | 11 -
test/CodeGen/ARM64/inline-asm-error-J.ll | 11 -
test/CodeGen/ARM64/inline-asm-error-K.ll | 11 -
test/CodeGen/ARM64/inline-asm-error-L.ll | 11 -
test/CodeGen/ARM64/inline-asm-error-M.ll | 11 -
test/CodeGen/ARM64/inline-asm-error-N.ll | 11 -
test/CodeGen/ARM64/inline-asm-zero-reg-error.ll | 11 -
test/CodeGen/ARM64/inline-asm.ll | 230 -
test/CodeGen/ARM64/join-reserved.ll | 17 -
test/CodeGen/ARM64/jumptable.ll | 35 -
test/CodeGen/ARM64/ld1.ll | 1345 --
test/CodeGen/ARM64/ldp.ll | 149 -
test/CodeGen/ARM64/ldur.ll | 67 -
test/CodeGen/ARM64/ldxr-stxr.ll | 143 -
test/CodeGen/ARM64/leaf-compact-unwind.ll | 161 -
test/CodeGen/ARM64/leaf.ll | 13 -
test/CodeGen/ARM64/lit.local.cfg | 11 -
test/CodeGen/ARM64/long-shift.ll | 59 -
test/CodeGen/ARM64/memcpy-inline.ll | 112 -
test/CodeGen/ARM64/memset-inline.ll | 27 -
test/CodeGen/ARM64/memset-to-bzero.ll | 101 -
test/CodeGen/ARM64/movi.ll | 202 -
test/CodeGen/ARM64/mul.ll | 90 -
test/CodeGen/ARM64/neg.ll | 71 -
test/CodeGen/ARM64/neon-compare-instructions.ll | 1191 --
test/CodeGen/ARM64/patchpoint.ll | 163 -
test/CodeGen/ARM64/platform-reg.ll | 26 -
test/CodeGen/ARM64/popcnt.ll | 43 -
test/CodeGen/ARM64/prefetch.ll | 88 -
test/CodeGen/ARM64/promote-const.ll | 255 -
test/CodeGen/ARM64/redzone.ll | 18 -
test/CodeGen/ARM64/register-offset-addressing.ll | 12 -
test/CodeGen/ARM64/register-pairing.ll | 53 -
test/CodeGen/ARM64/regress-f128csel-flags.ll | 27 -
test/CodeGen/ARM64/regress-interphase-shift.ll | 29 -
test/CodeGen/ARM64/return-vector.ll | 11 -
test/CodeGen/ARM64/returnaddr.ll | 26 -
test/CodeGen/ARM64/rev.ll | 221 -
test/CodeGen/ARM64/rounding.ll | 208 -
test/CodeGen/ARM64/scaled_iv.ll | 38 -
test/CodeGen/ARM64/scvt.ll | 830 -
test/CodeGen/ARM64/shifted-sext.ll | 277 -
test/CodeGen/ARM64/simd-scalar-to-vector.ll | 22 -
test/CodeGen/ARM64/simplest-elf.ll | 18 -
test/CodeGen/ARM64/sincos.ll | 42 -
test/CodeGen/ARM64/sitofp-combine-chains.ll | 22 -
test/CodeGen/ARM64/sli-sri-opt.ll | 41 -
test/CodeGen/ARM64/smaxv.ll | 74 -
test/CodeGen/ARM64/sminv.ll | 74 -
test/CodeGen/ARM64/spill-lr.ll | 74 -
test/CodeGen/ARM64/spill.ll | 15 -
test/CodeGen/ARM64/st1.ll | 676 -
test/CodeGen/ARM64/stack-no-frame.ll | 20 -
test/CodeGen/ARM64/stackmap.ll | 288 -
test/CodeGen/ARM64/stacksave.ll | 20 -
test/CodeGen/ARM64/stp.ll | 101 -
test/CodeGen/ARM64/strict-align.ll | 25 -
test/CodeGen/ARM64/stur.ll | 98 -
test/CodeGen/ARM64/subvector-extend.ll | 141 -
test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll | 36 -
test/CodeGen/ARM64/tbl.ll | 132 -
test/CodeGen/ARM64/this-return.ll | 83 -
test/CodeGen/ARM64/tls-darwin.ll | 18 -
test/CodeGen/ARM64/tls-dynamic-together.ll | 18 -
test/CodeGen/ARM64/tls-dynamics.ll | 135 -
test/CodeGen/ARM64/tls-execs.ll | 63 -
test/CodeGen/ARM64/trap.ll | 8 -
test/CodeGen/ARM64/trn.ll | 134 -
test/CodeGen/ARM64/trunc-store.ll | 75 -
test/CodeGen/ARM64/umaxv.ll | 92 -
test/CodeGen/ARM64/uminv.ll | 92 -
test/CodeGen/ARM64/umov.ll | 33 -
test/CodeGen/ARM64/unaligned_ldst.ll | 41 -
test/CodeGen/ARM64/uzp.ll | 107 -
test/CodeGen/ARM64/vaargs.ll | 20 -
test/CodeGen/ARM64/vabs.ll | 804 -
test/CodeGen/ARM64/vadd.ll | 941 --
test/CodeGen/ARM64/vaddlv.ll | 26 -
test/CodeGen/ARM64/vaddv.ll | 233 -
test/CodeGen/ARM64/variadic-aapcs.ll | 143 -
test/CodeGen/ARM64/vbitwise.ll | 91 -
test/CodeGen/ARM64/vclz.ll | 109 -
test/CodeGen/ARM64/vcmp.ll | 227 -
test/CodeGen/ARM64/vcnt.ll | 56 -
test/CodeGen/ARM64/vcombine.ll | 17 -
test/CodeGen/ARM64/vcvt.ll | 686 -
test/CodeGen/ARM64/vcvt_f.ll | 82 -
test/CodeGen/ARM64/vcvt_f32_su32.ll | 73 -
test/CodeGen/ARM64/vcvt_n.ll | 49 -
test/CodeGen/ARM64/vcvt_su32_f32.ll | 34 -
test/CodeGen/ARM64/vcvtxd_f32_f64.ll | 11 -
test/CodeGen/ARM64/vecCmpBr.ll | 207 -
test/CodeGen/ARM64/vecFold.ll | 145 -
test/CodeGen/ARM64/vector-ext.ll | 16 -
test/CodeGen/ARM64/vector-imm.ll | 134 -
test/CodeGen/ARM64/vector-ldst.ll | 601 -
test/CodeGen/ARM64/vext.ll | 464 -
test/CodeGen/ARM64/vfloatintrinsics.ll | 375 -
test/CodeGen/ARM64/vhadd.ll | 249 -
test/CodeGen/ARM64/vhsub.ll | 125 -
test/CodeGen/ARM64/virtual_base.ll | 51 -
test/CodeGen/ARM64/vmax.ll | 679 -
test/CodeGen/ARM64/vminmaxnm.ll | 68 -
test/CodeGen/ARM64/vmovn.ll | 242 -
test/CodeGen/ARM64/vmul.ll | 2003 ---
test/CodeGen/ARM64/volatile.ll | 27 -
test/CodeGen/ARM64/vqadd.ll | 332 -
test/CodeGen/ARM64/vqsub.ll | 147 -
test/CodeGen/ARM64/vselect.ll | 18 -
test/CodeGen/ARM64/vsetcc_fp.ll | 11 -
test/CodeGen/ARM64/vshift.ll | 1909 ---
test/CodeGen/ARM64/vshr.ll | 63 -
test/CodeGen/ARM64/vshuffle.ll | 115 -
test/CodeGen/ARM64/vsqrt.ll | 232 -
test/CodeGen/ARM64/vsra.ll | 150 -
test/CodeGen/ARM64/vsub.ll | 417 -
test/CodeGen/ARM64/weak-reference.ll | 10 -
test/CodeGen/ARM64/xaluo.ll | 524 -
test/CodeGen/ARM64/zero-cycle-regmov.ll | 17 -
test/CodeGen/ARM64/zero-cycle-zeroing.ll | 49 -
test/CodeGen/ARM64/zext.ll | 11 -
test/CodeGen/ARM64/zextload-unscaled.ll | 40 -
test/CodeGen/ARM64/zip.ll | 107 -
test/CodeGen/Hexagon/hwloop-dbg.ll | 3 +-
test/CodeGen/MSP430/fp.ll | 2 +-
test/CodeGen/Mips/2010-07-20-Switch.ll | 8 +-
test/CodeGen/Mips/Fast-ISel/nullvoid.ll | 9 +
test/CodeGen/Mips/Fast-ISel/simplestore.ll | 15 +
test/CodeGen/Mips/Fast-ISel/simplestorei.ll | 65 +
test/CodeGen/Mips/abicalls.ll | 1 +
test/CodeGen/Mips/cconv/arguments-float.ll | 222 +
test/CodeGen/Mips/cconv/arguments-fp128.ll | 51 +
.../Mips/cconv/arguments-hard-float-varargs.ll | 157 +
test/CodeGen/Mips/cconv/arguments-hard-float.ll | 211 +
test/CodeGen/Mips/cconv/arguments-hard-fp128.ll | 49 +
test/CodeGen/Mips/cconv/arguments.ll | 170 +
test/CodeGen/Mips/cconv/callee-saved-float.ll | 111 +
test/CodeGen/Mips/cconv/callee-saved.ll | 167 +
test/CodeGen/Mips/cconv/memory-layout.ll | 140 +
test/CodeGen/Mips/cconv/reserved-space.ll | 39 +
test/CodeGen/Mips/cconv/return-float.ll | 48 +
test/CodeGen/Mips/cconv/return-hard-float.ll | 46 +
test/CodeGen/Mips/cconv/return-hard-fp128.ll | 31 +
test/CodeGen/Mips/cconv/return.ll | 66 +
test/CodeGen/Mips/cconv/stack-alignment.ll | 28 +
test/CodeGen/Mips/cmov.ll | 3 +-
test/CodeGen/Mips/eh-dwarf-cfa.ll | 2 +
test/CodeGen/Mips/eh-return64.ll | 1 +
test/CodeGen/Mips/elf_eflags.ll | 3 +
test/CodeGen/Mips/elf_st_other.ll | 12 -
test/CodeGen/Mips/fabs.ll | 50 +-
test/CodeGen/Mips/fcopysign-f32-f64.ll | 1 +
test/CodeGen/Mips/fcopysign.ll | 1 +
test/CodeGen/Mips/fmadd1.ll | 15 +
test/CodeGen/Mips/fneg.ll | 27 +-
test/CodeGen/Mips/inlineasm-cnstrnt-bad-I-1.ll | 2 +-
test/CodeGen/Mips/inlineasm-cnstrnt-bad-J.ll | 2 +-
test/CodeGen/Mips/inlineasm-cnstrnt-bad-L.ll | 2 +-
test/CodeGen/Mips/inlineasm-cnstrnt-bad-N.ll | 2 +-
test/CodeGen/Mips/inlineasm-cnstrnt-bad-O.ll | 2 +-
test/CodeGen/Mips/inlineasm-cnstrnt-bad-P.ll | 2 +-
test/CodeGen/Mips/inlineasm-cnstrnt-reg.ll | 16 +-
test/CodeGen/Mips/inlineasm-cnstrnt-reg64.ll | 4 +-
test/CodeGen/Mips/inlineasm-operand-code.ll | 28 +-
test/CodeGen/Mips/inlineasm_constraint.ll | 30 +-
test/CodeGen/Mips/int-to-float-conversion.ll | 1 +
test/CodeGen/Mips/largeimmprinting.ll | 4 +-
test/CodeGen/Mips/load-store-left-right.ll | 434 +-
test/CodeGen/Mips/longbranch.ll | 154 +-
test/CodeGen/Mips/micromips-directives.ll | 16 +
test/CodeGen/Mips/micromips-long-branch.ll | 16437 -------------------
test/CodeGen/Mips/mips32r6/compatibility.ll | 9 +
test/CodeGen/Mips/mips64-f128.ll | 4 +-
test/CodeGen/Mips/mips64-sret.ll | 25 +-
test/CodeGen/Mips/mips64countleading.ll | 11 +-
test/CodeGen/Mips/mips64directive.ll | 1 +
test/CodeGen/Mips/mips64ext.ll | 3 +-
test/CodeGen/Mips/mips64fpimm0.ll | 1 +
test/CodeGen/Mips/mips64fpldst.ll | 2 +
test/CodeGen/Mips/mips64imm.ll | 1 +
test/CodeGen/Mips/mips64instrs.ll | 18 +-
test/CodeGen/Mips/mips64intldst.ll | 2 +
test/CodeGen/Mips/mips64lea.ll | 1 +
test/CodeGen/Mips/mips64load-store-left-right.ll | 73 -
test/CodeGen/Mips/mips64muldiv.ll | 1 +
test/CodeGen/Mips/mips64r6/compatibility.ll | 9 +
test/CodeGen/Mips/msa/basic_operations.ll | 358 +-
test/CodeGen/Mips/msa/basic_operations_float.ll | 117 +-
test/CodeGen/Mips/optimize-fp-math.ll | 1 +
test/CodeGen/Mips/remat-immed-load.ll | 1 +
test/CodeGen/Mips/sint-fp-store_pattern.ll | 1 +
test/CodeGen/Mips/start-asm-file.ll | 91 +
test/CodeGen/Mips/tls-alias.ll | 2 +-
test/CodeGen/Mips/unalignedload.ll | 82 +-
test/CodeGen/NVPTX/access-non-generic.ll | 91 +
test/CodeGen/NVPTX/addrspacecast-gvar.ll | 9 +
test/CodeGen/NVPTX/addrspacecast.ll | 4 +-
test/CodeGen/NVPTX/local-stack-frame.ll | 18 +-
test/CodeGen/NVPTX/surf-read.ll | 20 +
test/CodeGen/NVPTX/surf-write.ll | 16 +
test/CodeGen/NVPTX/tex-read.ll | 20 +
.../CodeGen/PowerPC/2007-11-16-landingpad-split.ll | 1 +
test/CodeGen/PowerPC/2008-07-10-SplatMiscompile.ll | 1 -
test/CodeGen/PowerPC/aa-tbaa.ll | 2 +-
test/CodeGen/PowerPC/alias.ll | 31 +
test/CodeGen/PowerPC/cc.ll | 70 +
test/CodeGen/PowerPC/ctrloop-le.ll | 3 +
test/CodeGen/PowerPC/ctrloop-lt.ll | 3 +
test/CodeGen/PowerPC/ctrloop-sh.ll | 72 +
test/CodeGen/PowerPC/dbg.ll | 3 +-
test/CodeGen/PowerPC/indexed-load.ll | 22 +
test/CodeGen/PowerPC/mcm-10.ll | 3 +-
test/CodeGen/PowerPC/mcm-11.ll | 3 +-
test/CodeGen/PowerPC/mcm-obj-2.ll | 4 +-
test/CodeGen/PowerPC/named-reg-alloc-r0.ll | 15 +
test/CodeGen/PowerPC/named-reg-alloc-r1-64.ll | 18 +
test/CodeGen/PowerPC/named-reg-alloc-r1.ll | 20 +
test/CodeGen/PowerPC/named-reg-alloc-r13-64.ll | 18 +
test/CodeGen/PowerPC/named-reg-alloc-r13.ll | 18 +
test/CodeGen/PowerPC/named-reg-alloc-r2-64.ll | 17 +
test/CodeGen/PowerPC/named-reg-alloc-r2.ll | 18 +
test/CodeGen/PowerPC/rlwimi-dyn-and.ll | 48 +
test/CodeGen/PowerPC/splat-bug.ll | 18 +
test/CodeGen/R600/32-bit-local-address-space.ll | 8 +-
test/CodeGen/R600/64bit-kernel-args.ll | 4 +-
test/CodeGen/R600/add.ll | 25 +
test/CodeGen/R600/add_i64.ll | 2 +-
test/CodeGen/R600/address-space.ll | 6 +-
test/CodeGen/R600/array-ptr-calc-i32.ll | 2 +-
test/CodeGen/R600/array-ptr-calc-i64.ll | 2 +-
test/CodeGen/R600/call.ll | 33 +
test/CodeGen/R600/extload.ll | 11 +-
test/CodeGen/R600/extract_vector_elt_i16.ll | 29 +
test/CodeGen/R600/fabs.ll | 11 +
test/CodeGen/R600/fconst64.ll | 4 +-
test/CodeGen/R600/fneg.ll | 13 +-
test/CodeGen/R600/fp_to_uint.f64.ll | 9 +
test/CodeGen/R600/gep-address-space.ll | 4 +-
test/CodeGen/R600/gv-const-addrspace-fail.ll | 58 +
test/CodeGen/R600/gv-const-addrspace.ll | 57 +-
test/CodeGen/R600/infinite-loop.ll | 2 +-
test/CodeGen/R600/insert_vector_elt.ll | 28 +-
test/CodeGen/R600/insert_vector_elt_f64.ll | 2 +-
test/CodeGen/R600/kernel-args.ll | 32 +-
test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll | 388 +-
test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll | 514 + test/CodeGen/R600/llvm.AMDGPU.imad24.ll | 21 + test/CodeGen/R600/llvm.AMDGPU.imul24.ll | 15 + test/CodeGen/R600/llvm.AMDGPU.umad24.ll | 19 + test/CodeGen/R600/llvm.AMDGPU.umul24.ll | 17 + test/CodeGen/R600/llvm.SI.tbuffer.store.ll | 8 +- test/CodeGen/R600/llvm.cos.ll | 43 +- test/CodeGen/R600/llvm.rint.f64.ll | 37 + test/CodeGen/R600/llvm.rint.ll | 49 +- test/CodeGen/R600/llvm.sin.ll | 44 +- test/CodeGen/R600/llvm.sqrt.ll | 2 +- test/CodeGen/R600/load-i1.ll | 107 + test/CodeGen/R600/local-64.ll | 52 +- test/CodeGen/R600/local-memory-two-objects.ll | 4 +- test/CodeGen/R600/loop-idiom.ll | 2 +- test/CodeGen/R600/mad_int24.ll | 17 +- test/CodeGen/R600/mad_uint24.ll | 67 +- test/CodeGen/R600/mubuf.ll | 16 +- test/CodeGen/R600/mul.ll | 63 +- test/CodeGen/R600/mul_int24.ll | 17 +- test/CodeGen/R600/mul_uint24.ll | 61 +- test/CodeGen/R600/mulhu.ll | 2 +- test/CodeGen/R600/or.ll | 4 +- test/CodeGen/R600/private-memory.ll | 8 +- test/CodeGen/R600/pv.ll | 2 +- test/CodeGen/R600/register-count-comments.ll | 2 +- test/CodeGen/R600/salu-to-valu.ll | 42 + .../R600/schedule-vs-if-nested-loop-failure.ll | 2 +- test/CodeGen/R600/selectcc.ll | 19 + test/CodeGen/R600/setcc.ll | 26 +- test/CodeGen/R600/setcc64.ll | 26 +- test/CodeGen/R600/seto.ll | 3 +- test/CodeGen/R600/setuo.ll | 3 +- test/CodeGen/R600/sext-in-reg.ll | 371 +- test/CodeGen/R600/sgpr-control-flow.ll | 27 + test/CodeGen/R600/sgpr-copy-duplicate-operand.ll | 2 +- test/CodeGen/R600/sgpr-copy.ll | 2 +- test/CodeGen/R600/si-annotate-cf-assertion.ll | 2 +- .../R600/simplify-demanded-bits-build-pair.ll | 36 + test/CodeGen/R600/smrd.ll | 28 +- test/CodeGen/R600/store-v3i64.ll | 2 +- test/CodeGen/R600/store-vector-ptrs.ll | 2 +- test/CodeGen/R600/store.ll | 66 + test/CodeGen/R600/sub.ll | 55 +- test/CodeGen/R600/trunc-store-i1.ll | 2 +- test/CodeGen/R600/trunc.ll | 7 +- test/CodeGen/R600/uaddo.ll | 17 + test/CodeGen/R600/udivrem64.ll | 82 + test/CodeGen/R600/uint_to_fp.f64.ll | 9 + test/CodeGen/R600/unaligned-load-store.ll | 2 +- test/CodeGen/R600/v_cndmask.ll | 3 +- test/CodeGen/R600/valu-i1.ll | 39 + test/CodeGen/R600/work-item-intrinsics.ll | 16 +- test/CodeGen/R600/xor.ll | 18 + test/CodeGen/R600/zero_extend.ll | 16 +- test/CodeGen/SPARC/2011-01-11-FrameAddr.ll | 6 +- test/CodeGen/SPARC/2011-01-19-DelaySlot.ll | 3 +- test/CodeGen/SPARC/64abi.ll | 4 +- test/CodeGen/SPARC/64bit.ll | 12 +- test/CodeGen/SPARC/64cond.ll | 6 +- test/CodeGen/SPARC/atomics.ll | 6 +- test/CodeGen/SPARC/exception.ll | 34 - test/CodeGen/SPARC/leafproc.ll | 6 +- test/CodeGen/SPARC/parts.ll | 6 +- test/CodeGen/SPARC/sret-secondary.ll | 8 + test/CodeGen/SystemZ/alias-01.ll | 3 - test/CodeGen/Thumb/2009-06-18-ThumbCommuteMul.ll | 4 +- test/CodeGen/Thumb/2010-06-18-SibCallCrash.ll | 2 +- test/CodeGen/Thumb/2010-07-15-debugOrdering.ll | 2 +- test/CodeGen/Thumb/DbgValueOtherTargets.test | 2 +- test/CodeGen/Thumb/barrier.ll | 6 +- test/CodeGen/Thumb/dyn-stackalloc.ll | 7 +- test/CodeGen/Thumb/fpconv.ll | 2 +- test/CodeGen/Thumb/fpow.ll | 2 +- test/CodeGen/Thumb/inlineasm-imm-thumb.ll | 2 +- test/CodeGen/Thumb/inlineasm-thumb.ll | 3 +- test/CodeGen/Thumb/ispositive.ll | 2 +- test/CodeGen/Thumb/ldr_ext.ll | 4 +- test/CodeGen/Thumb/ldr_frame.ll | 2 +- test/CodeGen/Thumb/long-setcc.ll | 9 +- test/CodeGen/Thumb/long.ll | 20 +- test/CodeGen/Thumb/long_shift.ll | 2 +- test/CodeGen/Thumb/mul.ll | 14 +- test/CodeGen/Thumb/rev.ll | 2 +- test/CodeGen/Thumb/segmented-stacks-dynamic.ll | 12 +- test/CodeGen/Thumb/segmented-stacks.ll | 36 +- 
.../Thumb/stack-coloring-without-frame-ptr.ll | 2 +- test/CodeGen/Thumb/stack-frame.ll | 5 +- test/CodeGen/Thumb/thumb-imm.ll | 6 +- test/CodeGen/Thumb/thumb-ldm.ll | 42 + test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll | 37 + test/CodeGen/Thumb/trap.ll | 2 +- test/CodeGen/Thumb/tst_teq.ll | 5 +- test/CodeGen/Thumb/vargs.ll | 11 +- test/CodeGen/Thumb2/bfi.ll | 2 +- test/CodeGen/Thumb2/bfx.ll | 2 +- test/CodeGen/Thumb2/carry.ll | 2 +- test/CodeGen/Thumb2/div.ll | 8 +- test/CodeGen/Thumb2/ifcvt-neon.ll | 2 +- test/CodeGen/Thumb2/longMACt.ll | 2 +- test/CodeGen/Thumb2/mul_const.ll | 2 +- test/CodeGen/Thumb2/segmented-stacks.ll | 8 +- test/CodeGen/Thumb2/thumb2-adc.ll | 2 +- test/CodeGen/Thumb2/thumb2-add.ll | 2 +- test/CodeGen/Thumb2/thumb2-add2.ll | 2 +- test/CodeGen/Thumb2/thumb2-add3.ll | 2 +- test/CodeGen/Thumb2/thumb2-add4.ll | 2 +- test/CodeGen/Thumb2/thumb2-add5.ll | 2 +- test/CodeGen/Thumb2/thumb2-add6.ll | 2 +- test/CodeGen/Thumb2/thumb2-and.ll | 2 +- test/CodeGen/Thumb2/thumb2-and2.ll | 2 +- test/CodeGen/Thumb2/thumb2-asr.ll | 2 +- test/CodeGen/Thumb2/thumb2-asr2.ll | 2 +- test/CodeGen/Thumb2/thumb2-bcc.ll | 2 +- test/CodeGen/Thumb2/thumb2-bfc.ll | 2 +- test/CodeGen/Thumb2/thumb2-bic.ll | 2 +- test/CodeGen/Thumb2/thumb2-clz.ll | 2 +- test/CodeGen/Thumb2/thumb2-cmn.ll | 2 +- test/CodeGen/Thumb2/thumb2-cmn2.ll | 2 +- test/CodeGen/Thumb2/thumb2-cmp.ll | 2 +- test/CodeGen/Thumb2/thumb2-cmp2.ll | 2 +- test/CodeGen/Thumb2/thumb2-eor.ll | 2 +- test/CodeGen/Thumb2/thumb2-eor2.ll | 2 +- test/CodeGen/Thumb2/thumb2-jtb.ll | 2 +- test/CodeGen/Thumb2/thumb2-ldm.ll | 2 + test/CodeGen/Thumb2/thumb2-ldr.ll | 2 +- test/CodeGen/Thumb2/thumb2-ldr_ext.ll | 18 +- test/CodeGen/Thumb2/thumb2-ldr_post.ll | 2 +- test/CodeGen/Thumb2/thumb2-ldr_pre.ll | 12 +- test/CodeGen/Thumb2/thumb2-ldrb.ll | 2 +- test/CodeGen/Thumb2/thumb2-ldrh.ll | 2 +- test/CodeGen/Thumb2/thumb2-lsl.ll | 2 +- test/CodeGen/Thumb2/thumb2-lsl2.ll | 2 +- test/CodeGen/Thumb2/thumb2-lsr.ll | 2 +- test/CodeGen/Thumb2/thumb2-lsr2.ll | 2 +- test/CodeGen/Thumb2/thumb2-lsr3.ll | 2 +- test/CodeGen/Thumb2/thumb2-mla.ll | 6 +- test/CodeGen/Thumb2/thumb2-mls.ll | 2 +- test/CodeGen/Thumb2/thumb2-mov.ll | 2 +- test/CodeGen/Thumb2/thumb2-mul.ll | 2 +- test/CodeGen/Thumb2/thumb2-mulhi.ll | 2 +- test/CodeGen/Thumb2/thumb2-mvn.ll | 2 +- test/CodeGen/Thumb2/thumb2-mvn2.ll | 2 +- test/CodeGen/Thumb2/thumb2-neg.ll | 2 +- test/CodeGen/Thumb2/thumb2-orn.ll | 3 +- test/CodeGen/Thumb2/thumb2-orn2.ll | 3 +- test/CodeGen/Thumb2/thumb2-orr.ll | 2 +- test/CodeGen/Thumb2/thumb2-orr2.ll | 3 +- test/CodeGen/Thumb2/thumb2-pack.ll | 2 +- test/CodeGen/Thumb2/thumb2-rev.ll | 2 +- test/CodeGen/Thumb2/thumb2-ror.ll | 4 +- test/CodeGen/Thumb2/thumb2-rsb.ll | 2 +- test/CodeGen/Thumb2/thumb2-rsb2.ll | 2 +- test/CodeGen/Thumb2/thumb2-sbc.ll | 2 +- test/CodeGen/Thumb2/thumb2-select.ll | 3 +- test/CodeGen/Thumb2/thumb2-select_xform.ll | 2 +- test/CodeGen/Thumb2/thumb2-shifter.ll | 4 +- test/CodeGen/Thumb2/thumb2-smla.ll | 4 +- test/CodeGen/Thumb2/thumb2-smul.ll | 2 +- test/CodeGen/Thumb2/thumb2-str.ll | 2 +- test/CodeGen/Thumb2/thumb2-str_post.ll | 2 +- test/CodeGen/Thumb2/thumb2-str_pre.ll | 2 +- test/CodeGen/Thumb2/thumb2-strb.ll | 2 +- test/CodeGen/Thumb2/thumb2-strh.ll | 2 +- test/CodeGen/Thumb2/thumb2-sub.ll | 2 +- test/CodeGen/Thumb2/thumb2-sub2.ll | 2 +- test/CodeGen/Thumb2/thumb2-sub3.ll | 2 +- test/CodeGen/Thumb2/thumb2-sub4.ll | 2 +- test/CodeGen/Thumb2/thumb2-sub5.ll | 2 +- test/CodeGen/Thumb2/thumb2-sxt-uxt.ll | 2 +- test/CodeGen/Thumb2/thumb2-sxt_rot.ll | 3 +- 
test/CodeGen/Thumb2/thumb2-teq.ll | 2 +- test/CodeGen/Thumb2/thumb2-teq2.ll | 2 +- test/CodeGen/Thumb2/thumb2-tst.ll | 2 +- test/CodeGen/Thumb2/thumb2-tst2.ll | 2 +- test/CodeGen/Thumb2/thumb2-uxt_rot.ll | 4 +- test/CodeGen/Thumb2/thumb2-uxtb.ll | 4 +- test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll | 10 +- test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll | 6 +- test/CodeGen/X86/2010-08-04-StackVariable.ll | 2 +- test/CodeGen/X86/MergeConsecutiveStores.ll | 1 + test/CodeGen/X86/aliases.ll | 2 +- test/CodeGen/X86/atom-bypass-slow-division-64.ll | 4 +- test/CodeGen/X86/avoid_complex_am.ll | 40 + test/CodeGen/X86/avx-blend.ll | 59 +- test/CodeGen/X86/avx-shuffle.ll | 26 + test/CodeGen/X86/avx.ll | 136 + test/CodeGen/X86/avx1-logical-load-folding.ll | 60 + test/CodeGen/X86/avx2-blend.ll | 11 + test/CodeGen/X86/avx2-vector-shifts.ll | 10 + test/CodeGen/X86/avx512-cvt.ll | 32 + test/CodeGen/X86/avx512-gather-scatter-intrin.ll | 206 +- test/CodeGen/X86/avx512-insert-extract.ll | 38 + test/CodeGen/X86/avx512-intrinsics.ll | 14 +- test/CodeGen/X86/avx512-mov.ll | 28 + test/CodeGen/X86/avx512-shuffle.ll | 19 + test/CodeGen/X86/blend-msb.ll | 22 +- test/CodeGen/X86/bmi.ll | 17 + test/CodeGen/X86/br-fold.ll | 18 +- test/CodeGen/X86/bswap-vector.ll | 137 +- test/CodeGen/X86/cdecl-method-return.ll | 69 - test/CodeGen/X86/cfi.ll | 27 + test/CodeGen/X86/cmp.ll | 45 +- test/CodeGen/X86/codegen-prepare-addrmode-sext.ll | 20 + test/CodeGen/X86/codegen-prepare-crash.ll | 14 + test/CodeGen/X86/codegen-prepare.ll | 1 + test/CodeGen/X86/combine-avx-intrinsics.ll | 119 + test/CodeGen/X86/combine-avx2-intrinsics.ll | 164 + test/CodeGen/X86/combine-sse2-intrinsics.ll | 53 + test/CodeGen/X86/combine-sse41-intrinsics.ll | 182 + .../X86/constant-hoisting-shift-immediate.ll | 25 + test/CodeGen/X86/divide-by-constant.ll | 8 +- test/CodeGen/X86/dllexport-x86_64.ll | 68 +- test/CodeGen/X86/dllexport.ll | 85 +- test/CodeGen/X86/expand-opaque-const.ll | 21 + test/CodeGen/X86/f16c-intrinsics.ll | 14 + test/CodeGen/X86/fma-do-not-commute.ll | 30 + test/CodeGen/X86/fold-load-vec.ll | 2 +- test/CodeGen/X86/gcc_except_table.ll | 4 - test/CodeGen/X86/global-sections.ll | 80 +- test/CodeGen/X86/indirect-hidden.ll | 43 + test/CodeGen/X86/isel-sink.ll | 1 + test/CodeGen/X86/lit.local.cfg | 2 +- test/CodeGen/X86/live-out-reg-info.ll | 2 +- test/CodeGen/X86/lower-bitcast.ll | 155 + test/CodeGen/X86/lower-vec-shift.ll | 125 + test/CodeGen/X86/lzcnt-tzcnt.ll | 447 + test/CodeGen/X86/masked-iv-safe.ll | 6 +- test/CodeGen/X86/merge_store.ll | 1 + test/CodeGen/X86/mod128.ll | 26 + test/CodeGen/X86/musttail-indirect.ll | 124 + test/CodeGen/X86/musttail-thiscall.ll | 31 + test/CodeGen/X86/musttail.ll | 90 + test/CodeGen/X86/named-reg-alloc.ll | 14 + test/CodeGen/X86/named-reg-notareg.ll | 13 + test/CodeGen/X86/no-cfi.ll | 34 - test/CodeGen/X86/peep-test-4.ll | 76 +- test/CodeGen/X86/peephole-multiple-folds.ll | 4 +- .../CodeGen/X86/ragreedy-last-chance-recoloring.ll | 13 + test/CodeGen/X86/rdtsc.ll | 53 +- test/CodeGen/X86/remat-invalid-liveness.ll | 85 + test/CodeGen/X86/ret-mmx.ll | 1 + test/CodeGen/X86/rotate3.ll | 76 - test/CodeGen/X86/segmented-stacks-dynamic.ll | 12 +- test/CodeGen/X86/segmented-stacks.ll | 89 +- test/CodeGen/X86/sse2.ll | 18 + test/CodeGen/X86/sse3.ll | 2 +- test/CodeGen/X86/sse41-blend.ll | 40 +- test/CodeGen/X86/sse41.ll | 447 +- test/CodeGen/X86/stack-protector-dbginfo.ll | 2 +- test/CodeGen/X86/stack-protector.ll | 597 +- test/CodeGen/X86/stackpointer.ll | 28 + test/CodeGen/X86/tls.ll | 87 + 
test/CodeGen/X86/vec_shuffle-41.ll | 21 + test/CodeGen/X86/vec_splat.ll | 16 + test/CodeGen/X86/vector-idiv.ll | 217 + test/CodeGen/X86/win32_sret.ll | 83 + test/CodeGen/X86/x86-64-sret-return-2.ll | 18 + test/CodeGen/XCore/epilogue_prologue.ll | 37 +- test/CodeGen/XCore/llvm-intrinsics.ll | 6 +- test/DebugInfo/2009-11-05-DeadGlobalVariable.ll | 4 +- .../DebugInfo/2009-11-06-NamelessGlobalVariable.ll | 2 +- test/DebugInfo/2010-03-19-DbgDeclare.ll | 2 +- test/DebugInfo/2010-04-06-NestedFnDbgInfo.ll | 20 +- test/DebugInfo/2010-07-19-Crash.ll | 2 +- test/DebugInfo/AArch64/cfi-frame.ll | 58 - test/DebugInfo/AArch64/lit.local.cfg | 2 +- test/DebugInfo/AArch64/struct_by_value.ll | 68 + test/DebugInfo/AArch64/variable-loc.ll | 101 - test/DebugInfo/ARM64/lit.local.cfg | 4 - test/DebugInfo/ARM64/struct_by_value.ll | 68 - test/DebugInfo/COFF/asm.ll | 4 +- test/DebugInfo/COFF/multifile.ll | 4 +- test/DebugInfo/COFF/multifunction.ll | 4 +- test/DebugInfo/COFF/simple.ll | 4 +- .../COFF/tail-call-without-lexical-scopes.ll | 2 +- test/DebugInfo/Inputs/llvm-symbolizer-dwo-test | Bin 0 -> 9579 bytes test/DebugInfo/Inputs/llvm-symbolizer-dwo-test.cc | 18 + test/DebugInfo/Mips/delay-slot.ll | 75 + test/DebugInfo/Mips/lit.local.cfg | 3 + test/DebugInfo/SystemZ/variable-loc.ll | 23 +- test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll | 4 +- test/DebugInfo/X86/2011-12-16-BadStructRef.ll | 20 +- test/DebugInfo/X86/DW_AT_byte_size.ll | 4 +- test/DebugInfo/X86/DW_AT_linkage_name.ll | 116 + test/DebugInfo/X86/DW_AT_location-reference.ll | 4 +- test/DebugInfo/X86/DW_AT_specification.ll | 16 +- test/DebugInfo/X86/arguments.ll | 3 +- test/DebugInfo/X86/array.ll | 101 + test/DebugInfo/X86/array2.ll | 107 + test/DebugInfo/X86/block-capture.ll | 26 +- test/DebugInfo/X86/coff_debug_info_type.ll | 2 +- test/DebugInfo/X86/concrete_out_of_line.ll | 52 +- test/DebugInfo/X86/cu-ranges.ll | 2 +- test/DebugInfo/X86/dbg-at-specficiation.ll | 2 +- test/DebugInfo/X86/dbg-const.ll | 2 +- test/DebugInfo/X86/dbg-declare-arg.ll | 2 +- test/DebugInfo/X86/dbg-declare.ll | 4 +- test/DebugInfo/X86/dbg-large-unsigned-const.ll | 62 - test/DebugInfo/X86/dbg-subrange.ll | 4 +- test/DebugInfo/X86/dbg-value-const-byref.ll | 15 +- test/DebugInfo/X86/dbg-value-inlined-parameter.ll | 27 +- test/DebugInfo/X86/dbg-value-location.ll | 2 +- test/DebugInfo/X86/debug-dead-local-var.ll | 51 + test/DebugInfo/X86/debug-info-blocks.ll | 21 +- test/DebugInfo/X86/debug-info-static-member.ll | 4 +- test/DebugInfo/X86/debug-loc-offset.ll | 4 +- test/DebugInfo/X86/debug-ranges-offset.ll | 241 + test/DebugInfo/X86/elf-names.ll | 6 +- test/DebugInfo/X86/empty-and-one-elem-array.ll | 10 +- test/DebugInfo/X86/ending-run.ll | 4 +- test/DebugInfo/X86/fission-ranges.ll | 2 +- test/DebugInfo/X86/formal_parameter.ll | 83 + test/DebugInfo/X86/gnu-public-names.ll | 36 +- test/DebugInfo/X86/inline-member-function.ll | 95 + test/DebugInfo/X86/inline-seldag-test.ll | 77 + test/DebugInfo/X86/instcombine-instrinsics.ll | 157 +- test/DebugInfo/X86/linkage-name.ll | 8 +- test/DebugInfo/X86/pr11300.ll | 15 +- test/DebugInfo/X86/pr12831.ll | 2 +- test/DebugInfo/X86/sret.ll | 25 +- test/DebugInfo/X86/type_units_with_addresses.ll | 151 + test/DebugInfo/constant-pointers.ll | 51 + test/DebugInfo/cross-cu-inlining.ll | 137 + test/DebugInfo/cross-cu-linkonce.ll | 74 + test/DebugInfo/cu-line-tables.ll | 7 +- test/DebugInfo/debug-info-qualifiers.ll | 2 - test/DebugInfo/dwarfdump-inlining.test | 28 - test/DebugInfo/dwarfdump-ranges.test | 10 + test/DebugInfo/dwarfdump-test.test | 56 - 
test/DebugInfo/dwarfdump-zlib.test | 12 +- test/DebugInfo/inline-scopes.ll | 130 + test/DebugInfo/inlined-vars.ll | 3 +- test/DebugInfo/llvm-symbolizer-zlib.test | 7 + test/DebugInfo/llvm-symbolizer.test | 58 +- test/DebugInfo/namespace.ll | 50 +- test/DebugInfo/namespace_function_definition.ll | 44 + .../namespace_inline_function_definition.ll | 92 + test/DebugInfo/restrict.ll | 53 + test/DebugInfo/sugared-constants.ll | 82 + test/DebugInfo/two-cus-from-same-file.ll | 3 +- test/DebugInfo/typedef.ll | 32 + test/DebugInfo/unconditional-branch.ll | 6 +- test/DebugInfo/varargs.ll | 14 +- .../RuntimeDyld/arm_secdiff_reloc.test | 1 - .../RuntimeDyld/macho_relocations.test | 1 + test/ExecutionEngine/lit.local.cfg | 7 +- test/Feature/alias2.ll | 19 + test/Feature/aliases.ll | 4 +- test/Feature/instructions.ll | 2 + .../AddressSanitizer/X86/asm_attr.ll | 20 + .../AddressSanitizer/X86/asm_mov.ll | 44 +- .../Instrumentation/AddressSanitizer/X86/asm_mov.s | 18 +- .../X86/asm_mov_no_instrumentation.s | 20 - .../AddressSanitizer/X86/asm_swap_intel.s | 71 + test/Instrumentation/AddressSanitizer/basic.ll | 33 +- .../AddressSanitizer/coverage-dbg.ll | 33 + test/Instrumentation/AddressSanitizer/coverage.ll | 14 + .../instrumentation-with-call-threshold.ll | 30 + test/Instrumentation/AddressSanitizer/test64.ll | 10 +- .../MemorySanitizer/do-not-emit-module-limits.ll | 21 + .../instrumentation-with-call-threshold.ll | 47 + test/LTO/attrs.ll | 15 + test/LTO/keep-used-puts-during-instcombine.ll | 4 +- test/Linker/Inputs/PR8300.b.ll | 2 +- test/Linker/Inputs/alias.ll | 3 + test/Linker/Inputs/cycle.ll | 2 + test/Linker/Inputs/datalayout-b.ll | 2 +- test/Linker/Inputs/old_global_ctors.3.4.bc | Bin 0 -> 368 bytes test/Linker/alias.ll | 16 + test/Linker/cycle.ll | 7 + test/Linker/debug-info-version-a.ll | 16 + test/Linker/debug-info-version-b.ll | 10 + test/Linker/global_ctors.ll | 29 + test/Linker/type-unique-odr-a.ll | 16 +- test/MC/AArch64/arm64-adr.s | 31 + test/MC/AArch64/arm64-advsimd.s | 1997 +++ test/MC/AArch64/arm64-aliases.s | 753 + test/MC/AArch64/arm64-arithmetic-encoding.s | 615 + test/MC/AArch64/arm64-arm64-fixup.s | 10 + test/MC/AArch64/arm64-basic-a64-instructions.s | 18 + test/MC/AArch64/arm64-be-datalayout.s | 4 + test/MC/AArch64/arm64-bitfield-encoding.s | 38 + test/MC/AArch64/arm64-branch-encoding.s | 159 + test/MC/AArch64/arm64-condbr-without-dots.s | 37 + test/MC/AArch64/arm64-crypto.s | 66 + test/MC/AArch64/arm64-diagno-predicate.s | 24 + test/MC/AArch64/arm64-diags.s | 392 + test/MC/AArch64/arm64-directive_loh.s | 93 + test/MC/AArch64/arm64-elf-reloc-condbr.s | 10 + test/MC/AArch64/arm64-elf-relocs.s | 249 + test/MC/AArch64/arm64-fp-encoding.s | 443 + test/MC/AArch64/arm64-large-relocs.s | 38 + test/MC/AArch64/arm64-leaf-compact-unwind.s | 208 + test/MC/AArch64/arm64-logical-encoding.s | 224 + test/MC/AArch64/arm64-mapping-across-sections.s | 28 + test/MC/AArch64/arm64-mapping-within-section.s | 23 + test/MC/AArch64/arm64-memory.s | 634 + test/MC/AArch64/arm64-nv-cond.s | 11 + test/MC/AArch64/arm64-optional-hash.s | 31 + test/MC/AArch64/arm64-separator.s | 20 + test/MC/AArch64/arm64-simd-ldst.s | 2404 +++ test/MC/AArch64/arm64-small-data-fixups.s | 24 + test/MC/AArch64/arm64-spsel-sysreg.s | 24 + test/MC/AArch64/arm64-system-encoding.s | 623 + test/MC/AArch64/arm64-target-specific-sysreg.s | 10 + test/MC/AArch64/arm64-tls-modifiers-darwin.s | 13 + test/MC/AArch64/arm64-tls-relocs.s | 320 + test/MC/AArch64/arm64-v128_lo-diagnostics.s | 11 + test/MC/AArch64/arm64-variable-exprs.s | 40 + 
test/MC/AArch64/arm64-vector-lists.s | 20 + test/MC/AArch64/arm64-verbose-vector-case.s | 19 + test/MC/AArch64/basic-a64-diagnostics.s | 861 +- test/MC/AArch64/basic-a64-instructions.s | 1845 +-- test/MC/AArch64/elf-globaladdress.ll | 2 +- test/MC/AArch64/elf-reloc-addend.s | 8 - test/MC/AArch64/elf-reloc-condbr.s | 10 - test/MC/AArch64/gicv3-regs.s | 220 +- test/MC/AArch64/lit.local.cfg | 4 +- test/MC/AArch64/neon-2velem.s | 2 +- test/MC/AArch64/neon-3vdiff.s | 2 +- test/MC/AArch64/neon-across.s | 2 +- test/MC/AArch64/neon-compare-instructions.s | 70 +- test/MC/AArch64/neon-crypto.s | 5 +- test/MC/AArch64/neon-diagnostics.s | 736 +- test/MC/AArch64/neon-extract.s | 6 +- test/MC/AArch64/neon-mov.s | 143 +- test/MC/AArch64/neon-perm.s | 2 +- test/MC/AArch64/neon-scalar-compare.s | 10 +- test/MC/AArch64/neon-scalar-dup.s | 44 +- test/MC/AArch64/neon-simd-copy.s | 72 +- test/MC/AArch64/neon-simd-ldst-multi-elem.s | 786 +- test/MC/AArch64/neon-simd-ldst-one-elem.s | 514 +- test/MC/AArch64/neon-simd-misc.s | 6 +- test/MC/AArch64/neon-simd-post-ldst-multi-elem.s | 426 +- test/MC/AArch64/neon-tbl.s | 97 +- test/MC/AArch64/noneon-diagnostics.s | 13 +- test/MC/AArch64/optional-hash.s | 2 +- test/MC/AArch64/tls-relocs.s | 301 +- test/MC/AArch64/trace-regs.s | 765 +- test/MC/ARM/Windows/mov32t-range.s | 37 + test/MC/ARM/arm-thumb-cpus-default.s | 23 +- test/MC/ARM/arm-thumb-cpus.s | 23 +- test/MC/ARM/arm_fixups.s | 6 +- test/MC/ARM/basic-thumb2-instructions.s | 6 + test/MC/ARM/big-endian-arm-fixup.s | 107 + test/MC/ARM/big-endian-thumb-fixup.s | 63 + test/MC/ARM/big-endian-thumb2-fixup.s | 49 + test/MC/ARM/coff-debugging-secrel.ll | 49 + test/MC/ARM/coff-file.s | 47 + test/MC/ARM/coff-function-type-info.ll | 45 + test/MC/ARM/coff-relocations.s | 101 + test/MC/ARM/complex-operands.s | 8 +- test/MC/ARM/diagnostics.s | 8 + test/MC/ARM/dwarf-cfi-initial-state.s | 1 + test/MC/ARM/eh-directive-save-diagnoatics.s | 41 - test/MC/ARM/eh-directive-save-diagnostics.s | 41 + test/MC/ARM/elf-thumbfunc-reloc.s | 13 +- test/MC/ARM/elf-thumbfunc.s | 12 +- test/MC/ARM/ldrd-strd-gnu-arm-bad-imm.s | 9 + test/MC/ARM/ldrd-strd-gnu-arm.s | 20 + test/MC/ARM/ldrd-strd-gnu-thumb-bad-regs.s | 10 + test/MC/ARM/ldrd-strd-gnu-thumb.s | 20 + test/MC/ARM/neon-vld-encoding.s | 2 +- test/MC/ARM/neon-vld-vst-align.s | 8354 ++++++++++ test/MC/ARM/pool.s | 1 + test/MC/ARM/symbol-variants.s | 6 + test/MC/ARM/thumb2-diagnostics.s | 18 + test/MC/ARM/thumb2-strd.s | 10 + test/MC/ARM/thumb2be-b.w-encoding.s | 9 + test/MC/ARM/thumb2be-beq.w-encoding.s | 9 + test/MC/ARM/thumb2be-movt-encoding.s | 9 + test/MC/ARM/thumb2be-movw-encoding.s | 9 + test/MC/ARM/thumb_set.s | 45 +- test/MC/ARM/udf-arm-diagnostics.s | 19 + test/MC/ARM/udf-arm.s | 11 + test/MC/ARM/udf-thumb-2-diagnostics.s | 25 + test/MC/ARM/udf-thumb-2.s | 13 + test/MC/ARM/udf-thumb-diagnostics.s | 19 + test/MC/ARM/udf-thumb.s | 11 + test/MC/ARM/vmov-vmvn-byte-replicate.s | 31 + test/MC/ARM/vmov-vmvn-illegal-cases.s | 30 + test/MC/ARM/vorr-vbic-illegal-cases.s | 42 + test/MC/ARM64/advsimd.s | 1997 --- test/MC/ARM64/aliases.s | 733 - test/MC/ARM64/arithmetic-encoding.s | 631 - test/MC/ARM64/arm64-fixup.s | 10 - test/MC/ARM64/basic-a64-instructions.s | 18 - test/MC/ARM64/bitfield-encoding.s | 30 - test/MC/ARM64/branch-encoding.s | 159 - test/MC/ARM64/crypto.s | 66 - test/MC/ARM64/diags.s | 242 - test/MC/ARM64/directive_loh.s | 93 - test/MC/ARM64/elf-relocs.s | 249 - test/MC/ARM64/fp-encoding.s | 507 - test/MC/ARM64/large-relocs.s | 38 - test/MC/ARM64/lit.local.cfg | 6 - 
test/MC/ARM64/logical-encoding.s | 224 - test/MC/ARM64/mapping-across-sections.s | 28 - test/MC/ARM64/mapping-within-section.s | 23 - test/MC/ARM64/memory.s | 634 - test/MC/ARM64/separator.s | 20 - test/MC/ARM64/simd-ldst.s | 2404 --- test/MC/ARM64/small-data-fixups.s | 24 - test/MC/ARM64/system-encoding.s | 679 - test/MC/ARM64/tls-modifiers-darwin.s | 13 - test/MC/ARM64/tls-relocs.s | 320 - test/MC/ARM64/variable-exprs.s | 40 - test/MC/AsmParser/cfi-invalid-startproc.s | 16 + test/MC/AsmParser/directive_seh.s | 8 +- test/MC/AsmParser/invalid-input-assertion.s | 9 + test/MC/AsmParser/macros-darwin-vararg.s | 8 + test/MC/AsmParser/vararg-default-value.s | 15 + test/MC/AsmParser/vararg.s | 41 + test/MC/COFF/alias.s | 2 +- test/MC/COFF/comm.ll | 4 +- test/MC/COFF/comm.s | 4 +- test/MC/COFF/directive-section-characteristics.ll | 17 + test/MC/COFF/file.s | 47 + test/MC/COFF/global_ctors_dtors.ll | 28 +- test/MC/COFF/initialised-data.ll | 7 + test/MC/COFF/invalid-def.s | 5 + test/MC/COFF/invalid-endef.s | 4 + test/MC/COFF/invalid-scl-range.s | 6 + test/MC/COFF/invalid-scl.s | 4 + test/MC/COFF/invalid-type-range.s | 6 + test/MC/COFF/invalid-type.s | 4 + test/MC/COFF/offset.s | 19 + test/MC/COFF/symbol-alias.s | 2 +- test/MC/COFF/weak-symbol.ll | 8 +- .../MC/Disassembler/AArch64/a64-ignored-fields.txt | 1 + test/MC/Disassembler/AArch64/arm64-advsimd.txt | 2283 +++ test/MC/Disassembler/AArch64/arm64-arithmetic.txt | 526 + .../AArch64/arm64-basic-a64-undefined.txt | 31 + test/MC/Disassembler/AArch64/arm64-bitfield.txt | 29 + test/MC/Disassembler/AArch64/arm64-branch.txt | 75 + .../Disassembler/AArch64/arm64-canonical-form.txt | 21 + test/MC/Disassembler/AArch64/arm64-crc32.txt | 18 + test/MC/Disassembler/AArch64/arm64-crypto.txt | 47 + .../Disassembler/AArch64/arm64-invalid-logical.txt | 6 + test/MC/Disassembler/AArch64/arm64-logical.txt | 223 + test/MC/Disassembler/AArch64/arm64-memory.txt | 564 + .../Disassembler/AArch64/arm64-non-apple-fmov.txt | 7 + test/MC/Disassembler/AArch64/arm64-scalar-fp.txt | 255 + test/MC/Disassembler/AArch64/arm64-system.txt | 62 + .../AArch64/basic-a64-instructions.txt | 1355 +- .../Disassembler/AArch64/basic-a64-undefined.txt | 67 +- .../AArch64/basic-a64-unpredictable.txt | 1 + test/MC/Disassembler/AArch64/gicv3-regs.txt | 221 +- .../AArch64/ldp-offset-predictable.txt | 1 + .../AArch64/ldp-postind.predictable.txt | 1 + .../AArch64/ldp-preind.predictable.txt | 1 + test/MC/Disassembler/AArch64/lit.local.cfg | 2 +- test/MC/Disassembler/AArch64/neon-instructions.txt | 195 +- test/MC/Disassembler/AArch64/trace-regs.txt | 733 +- test/MC/Disassembler/ARM/invalid-thumbv7.txt | 39 - test/MC/Disassembler/ARM64/advsimd.txt | 2282 --- test/MC/Disassembler/ARM64/arithmetic.txt | 522 - test/MC/Disassembler/ARM64/bitfield.txt | 29 - test/MC/Disassembler/ARM64/branch.txt | 75 - test/MC/Disassembler/ARM64/crc32.txt | 18 - test/MC/Disassembler/ARM64/crypto.txt | 47 - test/MC/Disassembler/ARM64/invalid-logical.txt | 6 - test/MC/Disassembler/ARM64/lit.local.cfg | 5 - test/MC/Disassembler/ARM64/logical.txt | 217 - test/MC/Disassembler/ARM64/memory.txt | 558 - test/MC/Disassembler/ARM64/scalar-fp.txt | 255 - test/MC/Disassembler/ARM64/system.txt | 58 - test/MC/Disassembler/Mips/mips32r6.txt | 116 + test/MC/Disassembler/Mips/mips64r6.txt | 129 + test/MC/Disassembler/Mips/msa/test_2r.txt | 17 + test/MC/Disassembler/Mips/msa/test_2r_msa64.txt | 3 + test/MC/Disassembler/Mips/msa/test_2rf.txt | 34 + test/MC/Disassembler/Mips/msa/test_3r.txt | 244 + test/MC/Disassembler/Mips/msa/test_3rf.txt | 
84 + test/MC/Disassembler/Mips/msa/test_bit.txt | 50 + test/MC/Disassembler/Mips/msa/test_ctrlregs.txt | 35 + test/MC/Disassembler/Mips/msa/test_dlsa.txt | 6 + test/MC/Disassembler/Mips/msa/test_elm.txt | 17 + test/MC/Disassembler/Mips/msa/test_elm_insert.txt | 5 + .../Mips/msa/test_elm_insert_msa64.txt | 3 + test/MC/Disassembler/Mips/msa/test_elm_insve.txt | 6 + test/MC/Disassembler/Mips/msa/test_elm_msa64.txt | 6 + test/MC/Disassembler/Mips/msa/test_i10.txt | 6 + test/MC/Disassembler/Mips/msa/test_i5.txt | 46 + test/MC/Disassembler/Mips/msa/test_i8.txt | 12 + test/MC/Disassembler/Mips/msa/test_lsa.txt | 6 + test/MC/Disassembler/Mips/msa/test_mi10.txt | 28 + test/MC/Disassembler/Mips/msa/test_vec.txt | 9 + test/MC/Disassembler/Sparc/sparc-fp.txt | 6 +- test/MC/Disassembler/X86/prefixes.txt | 4 + test/MC/Disassembler/X86/x86-32.txt | 3 + test/MC/ELF/comdat.s | 6 +- test/MC/ELF/common.s | 12 +- test/MC/ELF/comp-dir.s | 1 - test/MC/ELF/compression.s | 68 +- test/MC/ELF/file-double.s | 8 +- test/MC/ELF/gen-dwarf.s | 14 +- test/MC/ELF/lcomm.s | 4 +- test/MC/ELF/many-sections-2.s | 6 +- test/MC/ELF/noexec.s | 2 +- test/MC/ELF/offset.s | 59 + test/MC/ELF/pic-diff.s | 2 +- test/MC/ELF/pr9292.s | 4 +- test/MC/ELF/relocation-386.s | 3 + test/MC/ELF/relocation.s | 11 +- test/MC/ELF/set.s | 4 +- test/MC/ELF/strtab-suffix-opt.s | 21 + test/MC/ELF/subtraction-error.s | 8 + test/MC/ELF/symref.s | 142 - test/MC/ELF/symver.s | 142 + test/MC/ELF/tls-i386.s | 28 +- test/MC/ELF/tls.s | 14 +- test/MC/ELF/type.s | 24 +- test/MC/ELF/undef.s | 79 +- test/MC/ELF/weakref.s | 34 +- .../MachO/AArch64/darwin-ARM64-local-label-diff.s | 21 + test/MC/MachO/AArch64/darwin-ARM64-reloc.s | 157 + test/MC/MachO/AArch64/lit.local.cfg | 4 + test/MC/MachO/ARM/bad-darwin-directives.s | 23 +- .../MC/MachO/ARM64/darwin-ARM64-local-label-diff.s | 21 - test/MC/MachO/ARM64/darwin-ARM64-reloc.s | 157 - test/MC/MachO/ARM64/lit.local.cfg | 4 - test/MC/MachO/bad-darwin-x86_64-reloc-expr.s | 6 + test/MC/MachO/debug_frame.s | 1 + test/MC/MachO/temp-labels.s | 2 +- test/MC/Mips/cpload-bad.s | 15 + test/MC/Mips/cpload.s | 33 + test/MC/Mips/cpsetup.s | 64 +- test/MC/Mips/elf-N64.s | 1 + test/MC/Mips/elf-gprel-32-64.s | 3 + test/MC/Mips/elf_eflags.s | 41 +- test/MC/Mips/elf_eflags_nan2008.s | 12 + test/MC/Mips/elf_eflags_nanlegacy.s | 15 + test/MC/Mips/llvm-mc-fixup-endianness.s | 6 + test/MC/Mips/micromips-control-instructions.s | 8 +- test/MC/Mips/micromips-el-fixup-data.s | 2 +- test/MC/Mips/mips-control-instructions.s | 4 +- test/MC/Mips/mips1/invalid-mips2-wrong-error.s | 16 + test/MC/Mips/mips1/invalid-mips2.s | 23 + test/MC/Mips/mips1/invalid-mips3-wrong-error.s | 23 + test/MC/Mips/mips1/invalid-mips3.s | 65 + test/MC/Mips/mips1/invalid-mips4-wrong-error.s | 23 + test/MC/Mips/mips1/invalid-mips4.s | 82 + test/MC/Mips/mips1/invalid-mips5-wrong-error.s | 46 + test/MC/Mips/mips1/invalid-mips5.s | 83 + test/MC/Mips/mips1/valid-xfail.s | 14 +- test/MC/Mips/mips1/valid.s | 181 +- test/MC/Mips/mips2/invalid-mips3-wrong-error.s | 19 + test/MC/Mips/mips2/invalid-mips3.s | 48 + test/MC/Mips/mips2/invalid-mips32.s | 32 + test/MC/Mips/mips2/invalid-mips32r2-xfail.s | 11 + test/MC/Mips/mips2/invalid-mips32r2.s | 59 + test/MC/Mips/mips2/invalid-mips4-wrong-error.s | 14 + test/MC/Mips/mips2/invalid-mips4.s | 65 + test/MC/Mips/mips2/invalid-mips5-wrong-error.s | 46 + test/MC/Mips/mips2/invalid-mips5.s | 66 + test/MC/Mips/mips2/valid-xfail.s | 17 - test/MC/Mips/mips2/valid.s | 227 +- test/MC/Mips/mips3/invalid-mips4.s | 23 + 
test/MC/Mips/mips3/invalid-mips5-wrong-error.s | 46 + test/MC/Mips/mips3/invalid-mips5.s | 25 + test/MC/Mips/mips3/valid-xfail.s | 15 - test/MC/Mips/mips3/valid.s | 315 +- test/MC/Mips/mips32/invalid-mips32r2-xfail.s | 8 - test/MC/Mips/mips32/invalid-mips32r2.s | 13 +- test/MC/Mips/mips32/invalid-mips64.s | 9 + test/MC/Mips/mips32/valid-xfail.s | 68 +- test/MC/Mips/mips32/valid.s | 270 +- test/MC/Mips/mips32r2/invalid-mips64r2.s | 10 + test/MC/Mips/mips32r2/valid-xfail.s | 608 +- test/MC/Mips/mips32r2/valid.s | 321 +- test/MC/Mips/mips32r6/invalid-mips1-wrong-error.s | 15 + test/MC/Mips/mips32r6/invalid-mips1.s | 8 + test/MC/Mips/mips32r6/invalid-mips2-wrong-error.s | 20 + test/MC/Mips/mips32r6/invalid-mips2.s | 14 + test/MC/Mips/mips32r6/invalid-mips32-wrong-error.s | 16 + test/MC/Mips/mips32r6/relocations.s | 55 + test/MC/Mips/mips32r6/valid-xfail.s | 19 + test/MC/Mips/mips32r6/valid.s | 126 + test/MC/Mips/mips4/invalid-mips5-wrong-error.s | 46 + test/MC/Mips/mips4/invalid-mips5.s | 9 + test/MC/Mips/mips4/invalid-mips64-xfail.s | 22 - test/MC/Mips/mips4/invalid-mips64.s | 20 +- test/MC/Mips/mips4/invalid-mips64r2-xfail.s | 16 - test/MC/Mips/mips4/invalid-mips64r2.s | 25 +- test/MC/Mips/mips4/valid-xfail.s | 89 +- test/MC/Mips/mips4/valid.s | 349 +- test/MC/Mips/mips5/invalid-mips64.s | 21 + test/MC/Mips/mips5/invalid-mips64r2-xfail.s | 11 + test/MC/Mips/mips5/invalid-mips64r2.s | 43 + test/MC/Mips/mips5/valid-xfail.s | 163 +- test/MC/Mips/mips5/valid.s | 351 +- test/MC/Mips/mips64/invalid-mips64r2-xfail.s | 4 - test/MC/Mips/mips64/invalid-mips64r2.s | 32 +- test/MC/Mips/mips64/valid-xfail.s | 176 +- test/MC/Mips/mips64/valid.s | 376 +- test/MC/Mips/mips64r2/valid-xfail.s | 611 +- test/MC/Mips/mips64r2/valid.s | 414 +- test/MC/Mips/mips64r6/invalid-mips1-wrong-error.s | 15 + test/MC/Mips/mips64r6/invalid-mips1.s | 8 + test/MC/Mips/mips64r6/invalid-mips2.s | 14 + test/MC/Mips/mips64r6/invalid-mips3-wrong-error.s | 23 + test/MC/Mips/mips64r6/invalid-mips3.s | 14 + test/MC/Mips/mips64r6/invalid-mips5-wrong-error.s | 44 + test/MC/Mips/mips64r6/relocations.s | 55 + test/MC/Mips/mips64r6/valid-xfail.s | 19 + test/MC/Mips/mips64r6/valid.s | 139 + test/MC/Mips/mips_directives.s | 2 +- test/MC/Mips/mips_gprel16.s | 3 + test/MC/Mips/msa/test_2r.s | 20 - test/MC/Mips/msa/test_2r_msa64.s | 6 - test/MC/Mips/msa/test_2rf.s | 37 - test/MC/Mips/msa/test_3r.s | 247 - test/MC/Mips/msa/test_3rf.s | 87 - test/MC/Mips/msa/test_bit.s | 53 - test/MC/Mips/msa/test_cbranch.s | 20 +- test/MC/Mips/msa/test_ctrlregs.s | 38 - test/MC/Mips/msa/test_dlsa.s | 9 - test/MC/Mips/msa/test_elm.s | 20 - test/MC/Mips/msa/test_elm_insert.s | 8 - test/MC/Mips/msa/test_elm_insert_msa64.s | 6 - test/MC/Mips/msa/test_elm_insve.s | 9 - test/MC/Mips/msa/test_elm_msa64.s | 7 - test/MC/Mips/msa/test_i10.s | 10 - test/MC/Mips/msa/test_i5.s | 49 - test/MC/Mips/msa/test_i8.s | 15 - test/MC/Mips/msa/test_lsa.s | 9 - test/MC/Mips/msa/test_mi10.s | 31 - test/MC/Mips/msa/test_vec.s | 12 - test/MC/Mips/octeon-instructions.s | 20 + test/MC/PowerPC/ppc64-initial-cfa.s | 1 + test/MC/Sparc/sparc-alu-instructions.s | 4 +- test/MC/Sparc/sparc-fp-instructions.s | 12 +- test/MC/X86/avx512-encodings.s | 1160 +- test/MC/X86/x86-64.s | 2 +- test/Object/Inputs/COFF/weak-external.yaml | 43 + test/Object/Inputs/COFF/x86-64.yaml | 17 + .../Object/Inputs/macho-text-data-bss.macho-x86_64 | Bin 0 -> 844 bytes .../Inputs/macho-universal-archive.x86_64.i386 | Bin 0 -> 1656 bytes test/Object/Inputs/relocation-dynamic.elf-i386 | Bin 0 -> 1504 bytes 
test/Object/Inputs/relocation-relocatable.elf-i386 | Bin 0 -> 772 bytes test/Object/Inputs/trivial-object-test.coff-x86-64 | Bin 347 -> 437 bytes test/Object/X86/objdump-cfg-invalid-opcode.yaml | 2 +- test/Object/X86/objdump-disassembly-symbolic.test | 20 + test/Object/X86/yaml2obj-elf-x86-rel.yaml | 41 + test/Object/ar-error.test | 6 + test/Object/archive-long-index.test | 26 +- test/Object/archive-symtab.test | 24 +- test/Object/archive-toc.test | 24 +- test/Object/extract.ll | 2 +- test/Object/nm-shared-object.test | 28 +- test/Object/nm-trivial-object.test | 46 +- test/Object/nm-universal-binary.test | 23 +- test/Object/obj2yaml-coff-weak-external.test | 3 + test/Object/obj2yaml.test | 244 +- test/Object/size-trivial-macho.test | 15 + .../yaml2obj-elf-file-headers-with-e_flags.yaml | 9 +- test/Object/yaml2obj-elf-rel.yaml | 118 + test/Object/yaml2obj-elf-section-basic.yaml | 34 +- test/Object/yaml2obj-elf-section-invalid-size.yaml | 26 + test/Object/yaml2obj-readobj.test | 3 + test/Other/extract-alias.ll | 6 +- test/Other/optimization-remarks-inline.ll | 40 + test/Other/pass-pipeline-parsing.ll | 36 + test/TableGen/GeneralList.td | 1 - test/TableGen/lisp.td | 1 - test/TableGen/listconcat.td | 18 + test/TableGen/strconcat.td | 14 +- .../AddDiscriminators/no-discriminators.ll | 71 + test/Transforms/ArgumentPromotion/inalloca.ll | 4 +- .../ARM/atomic-expansion-v7.ll | 340 + .../ARM/atomic-expansion-v8.ll | 202 + .../AtomicExpandLoadLinked/ARM/lit.local.cfg | 4 + test/Transforms/BBVectorize/simple-int.ll | 376 +- .../CodeGenPrepare/X86/sink-addrspacecast.ll | 37 + test/Transforms/ConstProp/loads.ll | 34 + .../ConstantHoisting/AArch64/const-addr.ll | 23 + .../ConstantHoisting/AArch64/large-immediate.ll | 27 + .../ConstantHoisting/AArch64/lit.local.cfg | 3 + .../ConstantHoisting/PowerPC/const-base-addr.ll | 23 + .../ConstantHoisting/PowerPC/lit.local.cfg | 4 + test/Transforms/ConstantHoisting/PowerPC/masks.ll | 66 + test/Transforms/ConstantHoisting/X86/cast-inst.ll | 29 + .../ConstantHoisting/X86/delete-dead-cast-inst.ll | 10 +- .../ConstantHoisting/X86/large-immediate.ll | 27 + test/Transforms/GVN/load-pre-nonlocal.ll | 87 + .../Transforms/GlobalDCE/2009-01-05-DeadAliases.ll | 18 +- test/Transforms/GlobalDCE/global_ctors.ll | 14 + .../GlobalDCE/global_ctors_integration.ll | 45 + test/Transforms/GlobalMerge/AArch64/arm64.ll | 88 + test/Transforms/GlobalMerge/AArch64/lit.local.cfg | 4 + test/Transforms/GlobalMerge/ARM64/arm64.ll | 88 - test/Transforms/GlobalMerge/ARM64/lit.local.cfg | 4 - .../GlobalOpt/2009-02-15-BitcastAlias.ll | 2 +- test/Transforms/GlobalOpt/alias-resolve.ll | 4 +- test/Transforms/GlobalOpt/alias-used-section.ll | 4 +- test/Transforms/GlobalOpt/atexit.ll | 2 +- test/Transforms/GlobalOpt/ctor-list-opt.ll | 19 +- .../IPConstantProp/2009-09-24-byval-ptr.ll | 2 +- test/Transforms/IndVarSimplify/pr18223.ll | 30 + test/Transforms/Inline/2010-05-31-ByvalTailcall.ll | 24 - test/Transforms/Inline/always-inline.ll | 11 + test/Transforms/Inline/byval-tail-call.ll | 38 + test/Transforms/Inline/byval_lifetime.ll | 26 + test/Transforms/Inline/inline-cold.ll | 116 +- test/Transforms/Inline/inline-tail.ll | 185 +- test/Transforms/Inline/inline-vla.ll | 38 + test/Transforms/Inline/optimization-remarks.ll | 60 + test/Transforms/Inline/switch.ll | 60 + .../InstCombine/2012-04-23-Neon-Intrinsics.ll | 20 +- .../InstCombine/OverlappingInsertvalues.ll | 36 + test/Transforms/InstCombine/alloca.ll | 21 + .../InstCombine/bitcast-alias-function.ll | 24 +- test/Transforms/InstCombine/blend_x86.ll | 
55 + .../InstCombine/call-cast-target-inalloca.ll | 2 +- test/Transforms/InstCombine/div.ll | 19 + test/Transforms/InstCombine/gep-addrspace.ll | 17 +- test/Transforms/InstCombine/icmp.ll | 9 + test/Transforms/InstCombine/memcpy-from-global.ll | 67 +- test/Transforms/InstCombine/overflow-mul.ll | 164 + test/Transforms/InstCombine/pr19420.ll | 67 + test/Transforms/InstCombine/select.ll | 100 + test/Transforms/InstCombine/shift.ll | 94 +- test/Transforms/InstCombine/strlen-1.ll | 12 + test/Transforms/InstCombine/vec_demanded_elts.ll | 366 + test/Transforms/InstCombine/vec_shuffle.ll | 162 +- test/Transforms/InstSimplify/compare.ll | 126 + test/Transforms/InstSimplify/dead-code-removal.ll | 15 + .../Internalize/2009-01-05-InternalizeAliases.ll | 11 +- test/Transforms/Internalize/local-visibility.ll | 25 + test/Transforms/JumpThreading/phi-eq.ll | 2 +- .../LoopSimplify/2007-10-28-InvokeCrash.ll | 2 +- .../LoopStrengthReduce/AArch64/lit.local.cfg | 5 + .../LoopStrengthReduce/AArch64/lsr-memcpy.ll | 33 + .../LoopStrengthReduce/AArch64/lsr-memset.ll | 101 + .../LoopStrengthReduce/AArch64/req-regs.ll | 70 + .../ARM/2012-06-15-lsr-noaddrmode.ll | 2 +- .../LoopStrengthReduce/ARM/ivchain-ARM.ll | 3 +- .../LoopStrengthReduce/ARM64/lit.local.cfg | 5 - .../LoopStrengthReduce/ARM64/lsr-memcpy.ll | 33 - .../LoopStrengthReduce/ARM64/lsr-memset.ll | 101 - .../LoopStrengthReduce/X86/ivchain-X86.ll | 2 + .../dont_insert_redundant_ops.ll | 6 +- test/Transforms/LoopUnroll/X86/partial.ll | 51 +- test/Transforms/LoopUnroll/loop-remarks.ll | 25 + .../LoopVectorize/AArch64/aarch64-unroll.ll | 42 + .../LoopVectorize/AArch64/arm64-unroll.ll | 42 + .../LoopVectorize/AArch64/gather-cost.ll | 85 + .../Transforms/LoopVectorize/AArch64/lit.local.cfg | 6 + test/Transforms/LoopVectorize/ARM64/gather-cost.ll | 85 - test/Transforms/LoopVectorize/ARM64/lit.local.cfg | 6 - .../LoopVectorize/X86/metadata-enable.ll | 20 +- .../Transforms/LoopVectorize/X86/vect.omp.force.ll | 93 + .../LoopVectorize/X86/vect.omp.force.small-tc.ll | 73 + .../LoopVectorize/X86/vectorization-remarks.ll | 67 + test/Transforms/LoopVectorize/store-shuffle-bug.ll | 17 +- .../LoopVectorize/vect.omp.persistence.ll | 88 + test/Transforms/LoopVectorize/vect.stats.ll | 65 + test/Transforms/MergeFunc/crash.ll | 14 +- .../Transforms/MergeFunc/inttoptr-address-space.ll | 6 +- test/Transforms/MergeFunc/inttoptr.ll | 14 +- .../MergeFunc/mergefunc-struct-return.ll | 40 + .../Transforms/SLPVectorizer/AArch64/lit.local.cfg | 3 + .../SLPVectorizer/AArch64/mismatched-intrinsics.ll | 18 + test/Transforms/SLPVectorizer/ARM64/lit.local.cfg | 3 - .../SLPVectorizer/ARM64/mismatched-intrinsics.ll | 18 - test/Transforms/SLPVectorizer/X86/align.ll | 27 + test/Transforms/SLPVectorizer/X86/call.ll | 128 + .../SLPVectorizer/X86/consecutive-access.ll | 175 + .../SLPVectorizer/X86/continue_vectorizing.ll | 31 + test/Transforms/SLPVectorizer/X86/cse.ll | 30 + .../X86/insert-element-build-vector.ll | 62 +- test/Transforms/SLPVectorizer/X86/intrinsic.ll | 44 + .../X86/non-vectorizable-intrinsic.ll | 36 + test/Transforms/SLPVectorizer/X86/value-bug.ll | 80 + .../SeparateConstOffsetFromGEP/NVPTX/lit.local.cfg | 4 + .../NVPTX/split-gep-and-gvn.ll | 59 + .../SeparateConstOffsetFromGEP/NVPTX/split-gep.ll | 137 + test/Transforms/SimplifyCFG/extract-cost.ll | 22 + test/Transforms/TailCallElim/basic.ll | 31 + test/Verifier/aliasing-chain.ll | 6 - test/Verifier/bitcast-alias-address-space.ll | 10 - test/Verifier/global-ctors.ll | 11 + test/Verifier/inalloca3.ll | 13 + 
test/Verifier/musttail-invalid.ll | 82 + test/Verifier/musttail-valid.ll | 16 + test/Verifier/sret.ll | 7 + test/lit.cfg | 9 +- test/tools/llvm-cov/Inputs/range_based_for.gcda | Bin 0 -> 164 bytes test/tools/llvm-cov/Inputs/range_based_for.gcno | Bin 0 -> 552 bytes .../llvm-cov/Inputs/test_long_file_names.output | 8 + test/tools/llvm-cov/Inputs/test_long_paths.output | 8 + test/tools/llvm-cov/Inputs/test_missing.cpp.gcov | 77 + test/tools/llvm-cov/Inputs/test_missing.h.gcov | 6 + test/tools/llvm-cov/Inputs/test_missing.output | 8 + test/tools/llvm-cov/Inputs/test_no_output.output | 6 + test/tools/llvm-cov/lit.local.cfg | 2 +- test/tools/llvm-cov/llvm-cov.test | 19 + test/tools/llvm-cov/range_based_for.cpp | 29 + .../tools/llvm-objdump/Inputs/file-aux-record.yaml | 21 + test/tools/llvm-objdump/Inputs/file.obj.coff-arm | Bin 0 -> 374 bytes test/tools/llvm-objdump/coff-file.test | 6 + .../coff-non-null-terminated-file.test | 5 + test/tools/llvm-profdata/Inputs/no-counts.profdata | 3 + test/tools/llvm-profdata/errors.test | 11 +- test/tools/llvm-profdata/raw-two-profiles.test | 64 + test/tools/llvm-profdata/simple.test | 27 +- .../llvm-readobj/Inputs/dynamic-table-exe.x86 | Bin 0 -> 6555 bytes .../tools/llvm-readobj/Inputs/file-aux-record.yaml | 21 + .../Inputs/file-multiple-aux-records.yaml | 21 + .../llvm-readobj/coff-file-sections-reading.test | 18 + .../coff-non-null-terminated-file.test | 20 + test/tools/llvm-readobj/dynamic.test | 45 +- tools/bugpoint/BugDriver.cpp | 14 +- tools/bugpoint/BugDriver.h | 6 +- tools/bugpoint/CrashDebugger.cpp | 9 +- tools/bugpoint/ExecutionDriver.cpp | 8 +- tools/bugpoint/ExtractFunction.cpp | 24 +- tools/bugpoint/FindBugs.cpp | 2 +- tools/bugpoint/Miscompilation.cpp | 6 +- tools/bugpoint/OptimizerDriver.cpp | 16 +- tools/bugpoint/ToolRunner.cpp | 37 +- tools/bugpoint/ToolRunner.h | 16 +- tools/bugpoint/bugpoint.cpp | 2 +- tools/llc/Android.mk | 3 +- tools/llc/llc.cpp | 68 +- tools/lli/RemoteMemoryManager.cpp | 13 +- tools/lli/RemoteMemoryManager.h | 2 +- tools/lli/RemoteTarget.cpp | 4 +- tools/lli/RemoteTargetExternal.cpp | 2 + tools/lli/Unix/RPCChannel.inc | 4 +- tools/lli/lli.cpp | 17 +- tools/llvm-ar/llvm-ar.cpp | 6 +- tools/llvm-as/llvm-as.cpp | 3 +- tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp | 34 +- tools/llvm-c-test/object.c | 5 +- tools/llvm-config/CMakeLists.txt | 3 +- tools/llvm-cov/llvm-cov.cpp | 24 +- tools/llvm-diff/DiffLog.cpp | 4 +- tools/llvm-diff/DifferenceEngine.h | 2 +- tools/llvm-dis/llvm-dis.cpp | 5 +- tools/llvm-dwarfdump/llvm-dwarfdump.cpp | 54 +- tools/llvm-extract/llvm-extract.cpp | 25 +- tools/llvm-link/llvm-link.cpp | 9 +- tools/llvm-lto/llvm-lto.cpp | 30 +- tools/llvm-mc/Disassembler.cpp | 22 +- tools/llvm-mc/llvm-mc.cpp | 46 +- tools/llvm-nm/llvm-nm.cpp | 60 +- tools/llvm-objdump/MachODump.cpp | 12 +- tools/llvm-objdump/llvm-objdump.cpp | 87 +- tools/llvm-profdata/llvm-profdata.cpp | 13 +- tools/llvm-readobj/ARMAttributeParser.cpp | 102 +- tools/llvm-readobj/ARMEHABIPrinter.h | 13 +- tools/llvm-readobj/CMakeLists.txt | 9 +- tools/llvm-readobj/COFFDumper.cpp | 464 +- tools/llvm-readobj/ELFDumper.cpp | 94 +- tools/llvm-readobj/StreamWriter.h | 29 +- tools/llvm-readobj/Win64EHDumper.cpp | 328 + tools/llvm-readobj/Win64EHDumper.h | 62 + tools/llvm-rtdyld/llvm-rtdyld.cpp | 38 +- tools/llvm-shlib/Makefile | 4 +- tools/llvm-size/llvm-size.cpp | 2 +- tools/llvm-stress/llvm-stress.cpp | 6 +- tools/llvm-symbolizer/LLVMSymbolize.cpp | 79 +- tools/llvm-symbolizer/LLVMSymbolize.h | 12 +- tools/llvm-symbolizer/llvm-symbolizer.cpp | 15 +- 
tools/lto/lto.cpp | 169 +- tools/obj2yaml/CMakeLists.txt | 2 +- tools/obj2yaml/Error.cpp | 54 + tools/obj2yaml/Error.h | 42 + tools/obj2yaml/coff2yaml.cpp | 5 +- tools/obj2yaml/elf2yaml.cpp | 290 + tools/obj2yaml/obj2yaml.cpp | 45 +- tools/obj2yaml/obj2yaml.h | 7 +- tools/opt/NewPMDriver.cpp | 20 +- tools/opt/PassRegistry.def | 51 + tools/opt/Passes.cpp | 145 +- tools/opt/PrintSCC.cpp | 6 +- tools/opt/opt.cpp | 30 +- tools/yaml2obj/yaml2elf.cpp | 237 +- tools/yaml2obj/yaml2obj.cpp | 34 +- unittests/ADT/PointerUnionTest.cpp | 44 +- unittests/ADT/SCCIteratorTest.cpp | 2 +- unittests/ADT/SmallVectorTest.cpp | 63 +- unittests/ADT/StringMapTest.cpp | 88 + unittests/ADT/StringRefTest.cpp | 1 + unittests/Analysis/CMakeLists.txt | 2 + unittests/Analysis/LazyCallGraphTest.cpp | 720 + unittests/Analysis/MixedTBAATest.cpp | 77 + unittests/Bitcode/BitReaderTest.cpp | 1 + unittests/CMakeLists.txt | 1 + unittests/CodeGen/DIEHashTest.cpp | 200 +- .../JIT/IntelJITEventListenerTest.cpp | 2 +- unittests/ExecutionEngine/JIT/JITTest.cpp | 4 +- unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp | 27 + unittests/IR/CMakeLists.txt | 1 + unittests/IR/ConstantsTest.cpp | 28 + unittests/IR/InstructionsTest.cpp | 33 + unittests/IR/MDBuilderTest.cpp | 1 + unittests/IR/UserTest.cpp | 96 + unittests/IR/VerifierTest.cpp | 4 +- unittests/LineEditor/LineEditor.cpp | 1 + unittests/Linker/LinkModulesTest.cpp | 39 +- unittests/Object/CMakeLists.txt | 1 + unittests/Object/StringTableBuilderTest.cpp | 40 + unittests/Option/OptionParsingTest.cpp | 20 +- unittests/Support/AllocatorTest.cpp | 58 +- unittests/Support/BlockFrequencyTest.cpp | 147 +- unittests/Support/BranchProbabilityTest.cpp | 282 + unittests/Support/CMakeLists.txt | 4 +- unittests/Support/CompressionTest.cpp | 17 +- unittests/Support/FileOutputBufferTest.cpp | 9 +- unittests/Support/IteratorTest.cpp | 101 + unittests/Support/MemoryBufferTest.cpp | 3 +- unittests/Support/YAMLIOTest.cpp | 48 + unittests/Transforms/Utils/SpecialCaseList.cpp | 5 +- utils/FileCheck/FileCheck.cpp | 11 +- utils/FileUpdate/FileUpdate.cpp | 1 + utils/PerfectShuffle/CMakeLists.txt | 3 + utils/TableGen/AsmMatcherEmitter.cpp | 69 +- utils/TableGen/AsmWriterEmitter.cpp | 274 +- utils/TableGen/CTagsEmitter.cpp | 4 +- utils/TableGen/CallingConvEmitter.cpp | 8 +- utils/TableGen/CodeGenDAGPatterns.cpp | 138 +- utils/TableGen/CodeGenDAGPatterns.h | 29 +- utils/TableGen/CodeGenInstruction.cpp | 37 +- utils/TableGen/CodeGenInstruction.h | 10 +- utils/TableGen/CodeGenMapTable.cpp | 4 +- utils/TableGen/CodeGenRegisters.cpp | 16 +- utils/TableGen/CodeGenRegisters.h | 11 +- utils/TableGen/CodeGenSchedule.cpp | 20 +- utils/TableGen/CodeGenSchedule.h | 28 +- utils/TableGen/CodeGenTarget.cpp | 8 +- utils/TableGen/CodeGenTarget.h | 3 + utils/TableGen/DAGISelEmitter.cpp | 2 + utils/TableGen/DAGISelMatcher.cpp | 4 +- utils/TableGen/DAGISelMatcher.h | 2 +- utils/TableGen/DAGISelMatcherEmitter.cpp | 4 +- utils/TableGen/DAGISelMatcherGen.cpp | 128 +- utils/TableGen/DAGISelMatcherOpt.cpp | 23 +- utils/TableGen/DFAPacketizerEmitter.cpp | 79 +- utils/TableGen/DisassemblerEmitter.cpp | 11 +- utils/TableGen/FastISelEmitter.cpp | 6 +- utils/TableGen/FixedLenDecoderEmitter.cpp | 18 +- utils/TableGen/InstrInfoEmitter.cpp | 125 +- utils/TableGen/IntrinsicEmitter.cpp | 11 +- utils/TableGen/PseudoLoweringEmitter.cpp | 132 +- utils/TableGen/RegisterInfoEmitter.cpp | 16 +- utils/TableGen/SetTheory.cpp | 2 +- utils/TableGen/SubtargetEmitter.cpp | 44 +- utils/TableGen/X86DisassemblerShared.h | 47 +- 
utils/TableGen/X86RecognizableInstr.cpp | 2 +- utils/TableGen/module.modulemap | 4 + utils/lit/utils/check-coverage | 4 +- utils/lit/utils/check-sdist | 2 +- utils/lldbDataFormatters.py | 16 +- utils/llvm-build/llvmbuild/componentinfo.py | 2 +- utils/llvm-build/llvmbuild/main.py | 6 +- utils/release/test-release.sh | 2 +- utils/yaml-bench/YAMLBench.cpp | 2 +- 3730 files changed, 236612 insertions(+), 227481 deletions(-) create mode 100644 bindings/ocaml/all_backends/Makefile create mode 100644 bindings/ocaml/all_backends/all_backends_ocaml.c create mode 100644 bindings/ocaml/all_backends/llvm_all_backends.ml create mode 100644 bindings/ocaml/all_backends/llvm_all_backends.mli create mode 100644 cmake/modules/AddSphinxTarget.cmake create mode 100644 cmake/modules/FindSphinx.cmake create mode 100644 docs/ARM-BE-bitcastfail.png create mode 100644 docs/ARM-BE-bitcastsuccess.png create mode 100644 docs/ARM-BE-ld1.png create mode 100644 docs/ARM-BE-ldr.png create mode 100644 docs/BigEndianNEON.rst create mode 100644 docs/BlockFrequencyTerminology.rst create mode 100644 docs/CommandGuide/llvm-dwarfdump.rst create mode 100644 include/llvm-c/module.modulemap delete mode 100644 include/llvm/ADT/ImmutableIntervalMap.h create mode 100644 include/llvm/ADT/iterator.h delete mode 100644 include/llvm/Analysis/BlockFrequencyImpl.h create mode 100644 include/llvm/Analysis/BlockFrequencyInfoImpl.h create mode 100644 include/llvm/Analysis/CGSCCPassManager.h create mode 100644 include/llvm/IR/GlobalObject.h delete mode 100644 include/llvm/IR/IntrinsicsARM64.td create mode 100644 include/llvm/MC/MCTargetOptions.h create mode 100644 include/llvm/MC/MCTargetOptionsCommandFlags.h create mode 100644 include/llvm/MC/MCWinCOFFStreamer.h create mode 100644 include/llvm/Object/StringTableBuilder.h create mode 100644 include/llvm/Support/OnDiskHashTable.h create mode 100644 include/llvm/Transforms/Utils/CtorUtils.h create mode 100644 include/llvm/Transforms/Utils/VectorUtils.h create mode 100644 include/llvm/module.modulemap create mode 100644 include/llvm/module.modulemap.build create mode 100644 lib/Analysis/BlockFrequencyInfoImpl.cpp create mode 100644 lib/Analysis/CGSCCPassManager.cpp create mode 100644 lib/AsmParser/module.modulemap create mode 100644 lib/Bitcode/module.modulemap create mode 100644 lib/CodeGen/AsmPrinter/AddressPool.cpp create mode 100644 lib/CodeGen/AsmPrinter/AddressPool.h create mode 100644 lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp create mode 100644 lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h create mode 100644 lib/CodeGen/AsmPrinter/DwarfFile.cpp create mode 100644 lib/CodeGen/AsmPrinter/DwarfFile.h create mode 100644 lib/CodeGen/AsmPrinter/DwarfStringPool.cpp create mode 100644 lib/CodeGen/AsmPrinter/DwarfStringPool.h create mode 100644 lib/CodeGen/AtomicExpandLoadLinkedPass.cpp create mode 100644 lib/CodeGen/module.modulemap create mode 100644 lib/DebugInfo/module.modulemap create mode 100644 lib/IR/MDBuilder.cpp create mode 100644 lib/IR/module.modulemap delete mode 100644 lib/MC/MCFixup.cpp create mode 100644 lib/MC/MCTargetOptions.cpp create mode 100644 lib/Object/StringTableBuilder.cpp create mode 100644 lib/ProfileData/Android.mk create mode 100644 lib/ProfileData/InstrProfIndexed.h create mode 100644 lib/TableGen/module.modulemap create mode 100644 lib/Target/AArch64/AArch64AddressTypePromotion.cpp create mode 100644 lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp delete mode 100644 lib/Target/AArch64/AArch64AsmPrinter.h delete mode 100644 
lib/Target/AArch64/AArch64BranchFixupPass.cpp create mode 100644 lib/Target/AArch64/AArch64BranchRelaxation.cpp delete mode 100644 lib/Target/AArch64/AArch64CallingConv.td create mode 100644 lib/Target/AArch64/AArch64CallingConvention.td create mode 100644 lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp create mode 100644 lib/Target/AArch64/AArch64CollectLOH.cpp create mode 100644 lib/Target/AArch64/AArch64ConditionalCompares.cpp create mode 100644 lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp create mode 100644 lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp create mode 100644 lib/Target/AArch64/AArch64FastISel.cpp create mode 100644 lib/Target/AArch64/AArch64InstrAtomics.td delete mode 100644 lib/Target/AArch64/AArch64InstrNEON.td create mode 100644 lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp create mode 100644 lib/Target/AArch64/AArch64MCInstLower.h delete mode 100644 lib/Target/AArch64/AArch64MachineFunctionInfo.cpp create mode 100644 lib/Target/AArch64/AArch64PerfectShuffle.h create mode 100644 lib/Target/AArch64/AArch64PromoteConstant.cpp create mode 100644 lib/Target/AArch64/AArch64SchedA53.td create mode 100644 lib/Target/AArch64/AArch64SchedCyclone.td delete mode 100644 lib/Target/AArch64/AArch64ScheduleA53.td create mode 100644 lib/Target/AArch64/AArch64StorePairSuppress.cpp create mode 100644 lib/Target/AArch64/Disassembler/AArch64Disassembler.h create mode 100644 lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp create mode 100644 lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h create mode 100644 lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h create mode 100644 lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp delete mode 100644 lib/Target/AArch64/README.txt delete mode 100644 lib/Target/ARM/ARMAtomicExpandPass.cpp create mode 100644 lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp create mode 100644 lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp delete mode 100644 lib/Target/ARM64/ARM64.h delete mode 100644 lib/Target/ARM64/ARM64.td delete mode 100644 lib/Target/ARM64/ARM64AddressTypePromotion.cpp delete mode 100644 lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp delete mode 100644 lib/Target/ARM64/ARM64AsmPrinter.cpp delete mode 100644 lib/Target/ARM64/ARM64BranchRelaxation.cpp delete mode 100644 lib/Target/ARM64/ARM64CallingConv.h delete mode 100644 lib/Target/ARM64/ARM64CallingConvention.td delete mode 100644 lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp delete mode 100644 lib/Target/ARM64/ARM64CollectLOH.cpp delete mode 100644 lib/Target/ARM64/ARM64ConditionalCompares.cpp delete mode 100644 lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp delete mode 100644 lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp delete mode 100644 lib/Target/ARM64/ARM64FastISel.cpp delete mode 100644 lib/Target/ARM64/ARM64FrameLowering.cpp delete mode 100644 lib/Target/ARM64/ARM64FrameLowering.h delete mode 100644 lib/Target/ARM64/ARM64ISelDAGToDAG.cpp delete mode 100644 lib/Target/ARM64/ARM64ISelLowering.cpp delete mode 100644 lib/Target/ARM64/ARM64ISelLowering.h delete mode 100644 lib/Target/ARM64/ARM64InstrAtomics.td delete mode 100644 lib/Target/ARM64/ARM64InstrFormats.td delete mode 100644 lib/Target/ARM64/ARM64InstrInfo.cpp delete mode 100644 lib/Target/ARM64/ARM64InstrInfo.h delete mode 100644 lib/Target/ARM64/ARM64InstrInfo.td delete mode 100644 lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp delete mode 100644 lib/Target/ARM64/ARM64MCInstLower.cpp delete mode 100644 lib/Target/ARM64/ARM64MCInstLower.h delete 
mode 100644 lib/Target/ARM64/ARM64MachineFunctionInfo.h delete mode 100644 lib/Target/ARM64/ARM64PerfectShuffle.h delete mode 100644 lib/Target/ARM64/ARM64PromoteConstant.cpp delete mode 100644 lib/Target/ARM64/ARM64RegisterInfo.cpp delete mode 100644 lib/Target/ARM64/ARM64RegisterInfo.h delete mode 100644 lib/Target/ARM64/ARM64RegisterInfo.td delete mode 100644 lib/Target/ARM64/ARM64SchedCyclone.td delete mode 100644 lib/Target/ARM64/ARM64Schedule.td delete mode 100644 lib/Target/ARM64/ARM64SelectionDAGInfo.cpp delete mode 100644 lib/Target/ARM64/ARM64SelectionDAGInfo.h delete mode 100644 lib/Target/ARM64/ARM64StorePairSuppress.cpp delete mode 100644 lib/Target/ARM64/ARM64Subtarget.cpp delete mode 100644 lib/Target/ARM64/ARM64Subtarget.h delete mode 100644 lib/Target/ARM64/ARM64TargetMachine.cpp delete mode 100644 lib/Target/ARM64/ARM64TargetMachine.h delete mode 100644 lib/Target/ARM64/ARM64TargetObjectFile.cpp delete mode 100644 lib/Target/ARM64/ARM64TargetObjectFile.h delete mode 100644 lib/Target/ARM64/ARM64TargetTransformInfo.cpp delete mode 100644 lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp delete mode 100644 lib/Target/ARM64/AsmParser/CMakeLists.txt delete mode 100644 lib/Target/ARM64/AsmParser/LLVMBuild.txt delete mode 100644 lib/Target/ARM64/AsmParser/Makefile delete mode 100644 lib/Target/ARM64/CMakeLists.txt delete mode 100644 lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp delete mode 100644 lib/Target/ARM64/Disassembler/ARM64Disassembler.h delete mode 100644 lib/Target/ARM64/Disassembler/CMakeLists.txt delete mode 100644 lib/Target/ARM64/Disassembler/LLVMBuild.txt delete mode 100644 lib/Target/ARM64/Disassembler/Makefile delete mode 100644 lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp delete mode 100644 lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h delete mode 100644 lib/Target/ARM64/InstPrinter/CMakeLists.txt delete mode 100644 lib/Target/ARM64/InstPrinter/LLVMBuild.txt delete mode 100644 lib/Target/ARM64/InstPrinter/Makefile delete mode 100644 lib/Target/ARM64/LLVMBuild.txt delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64BaseInfo.h delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h delete mode 100644 lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp delete mode 100644 lib/Target/ARM64/MCTargetDesc/CMakeLists.txt delete mode 100644 lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt delete mode 100644 lib/Target/ARM64/MCTargetDesc/Makefile delete mode 100644 lib/Target/ARM64/Makefile delete mode 100644 lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp delete mode 100644 lib/Target/ARM64/TargetInfo/CMakeLists.txt delete mode 100644 lib/Target/ARM64/TargetInfo/LLVMBuild.txt delete mode 100644 lib/Target/ARM64/TargetInfo/Makefile create 
mode 100644 lib/Target/Mips/Mips32r6InstrFormats.td create mode 100644 lib/Target/Mips/Mips32r6InstrInfo.td create mode 100644 lib/Target/Mips/Mips64r6InstrInfo.td create mode 100644 lib/Target/Mips/MipsFastISel.cpp create mode 100644 lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp create mode 100644 lib/Target/NVPTX/NVPTXImageOptimizer.cpp create mode 100644 lib/Target/NVPTX/NVPTXMachineFunctionInfo.h create mode 100644 lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp create mode 100644 lib/Target/R600/SILowerI1Copies.cpp delete mode 100644 lib/Target/X86/Disassembler/X86DisassemblerDecoder.c create mode 100644 lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp create mode 100644 lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp delete mode 100644 lib/Target/X86/X86COFFMachineModuleInfo.cpp delete mode 100644 lib/Target/X86/X86COFFMachineModuleInfo.h create mode 100644 lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp create mode 100644 lib/Transforms/Utils/CtorUtils.cpp create mode 100644 test/Analysis/BlockFrequencyInfo/bad_input.ll create mode 100644 test/Analysis/BlockFrequencyInfo/double_backedge.ll create mode 100644 test/Analysis/BlockFrequencyInfo/double_exit.ll create mode 100644 test/Analysis/BlockFrequencyInfo/irreducible.ll create mode 100644 test/Analysis/BlockFrequencyInfo/loop_with_branch.ll create mode 100644 test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll create mode 100644 test/Analysis/BranchProbabilityInfo/pr18705.ll create mode 100644 test/Analysis/CostModel/AArch64/lit.local.cfg create mode 100644 test/Analysis/CostModel/AArch64/select.ll create mode 100644 test/Analysis/CostModel/AArch64/store.ll delete mode 100644 test/Analysis/CostModel/ARM64/lit.local.cfg delete mode 100644 test/Analysis/CostModel/ARM64/select.ll delete mode 100644 test/Analysis/CostModel/ARM64/store.ll create mode 100644 test/Analysis/CostModel/X86/vdiv-cost.ll create mode 100644 test/Analysis/CostModel/X86/vselect-cost.ll create mode 100644 test/Analysis/Delinearization/gcd_multiply_expr.ll create mode 100644 test/Analysis/Delinearization/iv_times_constant_in_subscript.ll create mode 100644 test/Analysis/Delinearization/multidim_two_accesses_different_delinearization.ll create mode 100644 test/Analysis/Delinearization/undef.ll delete mode 100644 test/Assembler/2009-04-25-AliasGEP.ll create mode 100644 test/Assembler/alias-addrspace.ll create mode 100644 test/Assembler/alias-redefinition.ll create mode 100644 test/Assembler/alias-to-alias.ll create mode 100644 test/Assembler/alias-to-alias2.ll create mode 100644 test/Assembler/alias-type.ll create mode 100644 test/Assembler/internal-hidden-alias.ll create mode 100644 test/Assembler/internal-hidden-function.ll create mode 100644 test/Assembler/internal-hidden-variable.ll create mode 100644 test/Assembler/internal-protected-alias.ll create mode 100644 test/Assembler/internal-protected-function.ll create mode 100644 test/Assembler/internal-protected-variable.ll create mode 100644 test/Assembler/private-hidden-alias.ll create mode 100644 test/Assembler/private-hidden-function.ll create mode 100644 test/Assembler/private-hidden-variable.ll create mode 100644 test/Assembler/private-protected-alias.ll create mode 100644 test/Assembler/private-protected-function.ll create mode 100644 test/Assembler/private-protected-variable.ll create mode 100644 test/Bitcode/deprecated-linker_private-linker_private_weak.ll create mode 100644 test/Bitcode/local-linkage-default-visibility.3.4.ll create mode 100644 
test/Bitcode/local-linkage-default-visibility.3.4.ll.bc create mode 100644 test/Bitcode/old-aliases.ll create mode 100644 test/Bitcode/old-aliases.ll.bc create mode 100644 test/Bitcode/tailcall.ll create mode 100644 test/Bitcode/upgrade-global-ctors.ll create mode 100644 test/Bitcode/upgrade-global-ctors.ll.bc create mode 100644 test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll create mode 100644 test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll create mode 100644 test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll create mode 100644 test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll create mode 100644 test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll create mode 100644 test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll create mode 100644 test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll create mode 100644 test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll create mode 100644 test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll create mode 100644 test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll create mode 100644 test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll create mode 100644 test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll create mode 100644 test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll create mode 100644 test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll create mode 100644 test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll create mode 100644 test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll create mode 100644 test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll create mode 100644 test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll create mode 100644 test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll create mode 100644 test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll create mode 100644 test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll create mode 100644 test/CodeGen/AArch64/arm64-aapcs.ll create mode 100644 test/CodeGen/AArch64/arm64-abi-varargs.ll create mode 100644 test/CodeGen/AArch64/arm64-abi.ll create mode 100644 test/CodeGen/AArch64/arm64-abi_align.ll create mode 100644 test/CodeGen/AArch64/arm64-addp.ll create mode 100644 test/CodeGen/AArch64/arm64-addr-mode-folding.ll create mode 100644 test/CodeGen/AArch64/arm64-addr-type-promotion.ll create mode 100644 test/CodeGen/AArch64/arm64-addrmode.ll create mode 100644 test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll create mode 100644 test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll create mode 100644 test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll create mode 100644 test/CodeGen/AArch64/arm64-ands-bad-peephole.ll create mode 100644 test/CodeGen/AArch64/arm64-anyregcc-crash.ll create mode 100644 test/CodeGen/AArch64/arm64-anyregcc.ll create mode 100644 test/CodeGen/AArch64/arm64-arith-saturating.ll create mode 100644 test/CodeGen/AArch64/arm64-arith.ll create mode 100644 test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll create mode 100644 test/CodeGen/AArch64/arm64-atomic-128.ll create mode 100644 test/CodeGen/AArch64/arm64-atomic.ll create mode 100644 test/CodeGen/AArch64/arm64-basic-pic.ll create mode 100644 test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll create mode 100644 test/CodeGen/AArch64/arm64-big-endian-eh.ll create mode 100644 test/CodeGen/AArch64/arm64-big-endian-varargs.ll create mode 100644 test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll create mode 100644 test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll create mode 100644 test/CodeGen/AArch64/arm64-big-imm-offsets.ll create mode 100644 
test/CodeGen/AArch64/arm64-big-stack.ll create mode 100644 test/CodeGen/AArch64/arm64-bitfield-extract.ll create mode 100644 test/CodeGen/AArch64/arm64-blockaddress.ll create mode 100644 test/CodeGen/AArch64/arm64-build-vector.ll create mode 100644 test/CodeGen/AArch64/arm64-call-tailcalls.ll create mode 100644 test/CodeGen/AArch64/arm64-cast-opt.ll create mode 100644 test/CodeGen/AArch64/arm64-ccmp-heuristics.ll create mode 100644 test/CodeGen/AArch64/arm64-ccmp.ll create mode 100644 test/CodeGen/AArch64/arm64-clrsb.ll create mode 100644 test/CodeGen/AArch64/arm64-coalesce-ext.ll create mode 100644 test/CodeGen/AArch64/arm64-code-model-large-abs.ll create mode 100644 test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll create mode 100644 test/CodeGen/AArch64/arm64-collect-loh-str.ll create mode 100644 test/CodeGen/AArch64/arm64-collect-loh.ll create mode 100644 test/CodeGen/AArch64/arm64-complex-copy-noneon.ll create mode 100644 test/CodeGen/AArch64/arm64-complex-ret.ll create mode 100644 test/CodeGen/AArch64/arm64-const-addr.ll create mode 100644 test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll create mode 100644 test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll create mode 100644 test/CodeGen/AArch64/arm64-copy-tuple.ll create mode 100644 test/CodeGen/AArch64/arm64-crc32.ll create mode 100644 test/CodeGen/AArch64/arm64-crypto.ll create mode 100644 test/CodeGen/AArch64/arm64-cse.ll create mode 100644 test/CodeGen/AArch64/arm64-csel.ll create mode 100644 test/CodeGen/AArch64/arm64-cvt.ll create mode 100644 test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll create mode 100644 test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll create mode 100644 test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll create mode 100644 test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll create mode 100644 test/CodeGen/AArch64/arm64-dead-def-frame-index.ll create mode 100644 test/CodeGen/AArch64/arm64-dead-register-def-bug.ll create mode 100644 test/CodeGen/AArch64/arm64-dup.ll create mode 100644 test/CodeGen/AArch64/arm64-early-ifcvt.ll create mode 100644 test/CodeGen/AArch64/arm64-elf-calls.ll create mode 100644 test/CodeGen/AArch64/arm64-elf-constpool.ll create mode 100644 test/CodeGen/AArch64/arm64-elf-globals.ll create mode 100644 test/CodeGen/AArch64/arm64-ext.ll create mode 100644 test/CodeGen/AArch64/arm64-extend-int-to-fp.ll create mode 100644 test/CodeGen/AArch64/arm64-extend.ll create mode 100644 test/CodeGen/AArch64/arm64-extern-weak.ll create mode 100644 test/CodeGen/AArch64/arm64-extload-knownzero.ll create mode 100644 test/CodeGen/AArch64/arm64-extract.ll create mode 100644 test/CodeGen/AArch64/arm64-extract_subvector.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel-alloca.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel-br.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel-call.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel-conversion.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel-gv.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel-icmp.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel-materialize.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel-rem.ll create 
mode 100644 test/CodeGen/AArch64/arm64-fast-isel-ret.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel-select.ll create mode 100644 test/CodeGen/AArch64/arm64-fast-isel.ll create mode 100644 test/CodeGen/AArch64/arm64-fastcc-tailcall.ll create mode 100644 test/CodeGen/AArch64/arm64-fastisel-gep-promote-before-add.ll create mode 100644 test/CodeGen/AArch64/arm64-fcmp-opt.ll create mode 100644 test/CodeGen/AArch64/arm64-fcopysign.ll create mode 100644 test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll create mode 100644 test/CodeGen/AArch64/arm64-fmadd.ll create mode 100644 test/CodeGen/AArch64/arm64-fmax.ll create mode 100644 test/CodeGen/AArch64/arm64-fminv.ll create mode 100644 test/CodeGen/AArch64/arm64-fmuladd.ll create mode 100644 test/CodeGen/AArch64/arm64-fold-address.ll create mode 100644 test/CodeGen/AArch64/arm64-fold-lsl.ll create mode 100644 test/CodeGen/AArch64/arm64-fp-contract-zero.ll create mode 100644 test/CodeGen/AArch64/arm64-fp-imm.ll create mode 100644 test/CodeGen/AArch64/arm64-fp.ll create mode 100644 test/CodeGen/AArch64/arm64-fp128-folding.ll create mode 100644 test/CodeGen/AArch64/arm64-fp128.ll create mode 100644 test/CodeGen/AArch64/arm64-frame-index.ll create mode 100644 test/CodeGen/AArch64/arm64-frameaddr.ll create mode 100644 test/CodeGen/AArch64/arm64-global-address.ll create mode 100644 test/CodeGen/AArch64/arm64-hello.ll create mode 100644 test/CodeGen/AArch64/arm64-i16-subreg-extract.ll create mode 100644 test/CodeGen/AArch64/arm64-icmp-opt.ll create mode 100644 test/CodeGen/AArch64/arm64-illegal-float-ops.ll create mode 100644 test/CodeGen/AArch64/arm64-indexed-memory.ll create mode 100644 test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll create mode 100644 test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll create mode 100644 test/CodeGen/AArch64/arm64-inline-asm-error-I.ll create mode 100644 test/CodeGen/AArch64/arm64-inline-asm-error-J.ll create mode 100644 test/CodeGen/AArch64/arm64-inline-asm-error-K.ll create mode 100644 test/CodeGen/AArch64/arm64-inline-asm-error-L.ll create mode 100644 test/CodeGen/AArch64/arm64-inline-asm-error-M.ll create mode 100644 test/CodeGen/AArch64/arm64-inline-asm-error-N.ll create mode 100644 test/CodeGen/AArch64/arm64-inline-asm-zero-reg-error.ll create mode 100644 test/CodeGen/AArch64/arm64-inline-asm.ll create mode 100644 test/CodeGen/AArch64/arm64-join-reserved.ll create mode 100644 test/CodeGen/AArch64/arm64-jumptable.ll create mode 100644 test/CodeGen/AArch64/arm64-large-frame.ll create mode 100644 test/CodeGen/AArch64/arm64-ld1.ll create mode 100644 test/CodeGen/AArch64/arm64-ldp.ll create mode 100644 test/CodeGen/AArch64/arm64-ldur.ll create mode 100644 test/CodeGen/AArch64/arm64-ldxr-stxr.ll create mode 100644 test/CodeGen/AArch64/arm64-leaf.ll create mode 100644 test/CodeGen/AArch64/arm64-long-shift.ll create mode 100644 test/CodeGen/AArch64/arm64-memcpy-inline.ll create mode 100644 test/CodeGen/AArch64/arm64-memset-inline.ll create mode 100644 test/CodeGen/AArch64/arm64-memset-to-bzero.ll create mode 100644 test/CodeGen/AArch64/arm64-misched-basic-A53.ll create mode 100644 test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll create mode 100644 test/CodeGen/AArch64/arm64-movi.ll create mode 100644 test/CodeGen/AArch64/arm64-mul.ll create mode 100644 test/CodeGen/AArch64/arm64-named-reg-alloc.ll create mode 100644 test/CodeGen/AArch64/arm64-named-reg-notareg.ll create mode 100644 test/CodeGen/AArch64/arm64-neg.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-2velem-high.ll create mode 
100644 test/CodeGen/AArch64/arm64-neon-2velem.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-3vdiff.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-aba-abd.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-across.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-add-pairwise.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-add-sub.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-compare-instructions.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-copy.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-mul-div.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-select_cc.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-simd-shift.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-simd-vget.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll create mode 100644 test/CodeGen/AArch64/arm64-neon-vector-list-spill.ll create mode 100644 test/CodeGen/AArch64/arm64-patchpoint.ll create mode 100644 test/CodeGen/AArch64/arm64-pic-local-symbol.ll create mode 100644 test/CodeGen/AArch64/arm64-platform-reg.ll create mode 100644 test/CodeGen/AArch64/arm64-popcnt.ll create mode 100644 test/CodeGen/AArch64/arm64-prefetch.ll create mode 100644 test/CodeGen/AArch64/arm64-promote-const.ll create mode 100644 test/CodeGen/AArch64/arm64-redzone.ll create mode 100644 test/CodeGen/AArch64/arm64-reg-copy-noneon.ll create mode 100644 test/CodeGen/AArch64/arm64-register-offset-addressing.ll create mode 100644 test/CodeGen/AArch64/arm64-register-pairing.ll create mode 100644 test/CodeGen/AArch64/arm64-regress-f128csel-flags.ll create mode 100644 test/CodeGen/AArch64/arm64-regress-interphase-shift.ll create mode 100644 test/CodeGen/AArch64/arm64-return-vector.ll create mode 100644 test/CodeGen/AArch64/arm64-returnaddr.ll create mode 100644 test/CodeGen/AArch64/arm64-rev.ll create mode 100644 test/CodeGen/AArch64/arm64-rounding.ll create mode 100644 test/CodeGen/AArch64/arm64-scaled_iv.ll create mode 100644 test/CodeGen/AArch64/arm64-scvt.ll create mode 100644 test/CodeGen/AArch64/arm64-shifted-sext.ll create mode 100644 test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll create mode 100644 test/CodeGen/AArch64/arm64-simplest-elf.ll create mode 100644 test/CodeGen/AArch64/arm64-sincos.ll create mode 100644 test/CodeGen/AArch64/arm64-sitofp-combine-chains.ll create mode 100644 test/CodeGen/AArch64/arm64-sli-sri-opt.ll create mode 100644 test/CodeGen/AArch64/arm64-smaxv.ll create mode 100644 test/CodeGen/AArch64/arm64-sminv.ll create mode 100644 test/CodeGen/AArch64/arm64-spill-lr.ll create mode 100644 test/CodeGen/AArch64/arm64-spill.ll create mode 100644 test/CodeGen/AArch64/arm64-st1.ll create mode 100644 test/CodeGen/AArch64/arm64-stack-no-frame.ll create mode 100644 test/CodeGen/AArch64/arm64-stackmap.ll create mode 100644 test/CodeGen/AArch64/arm64-stackpointer.ll create mode 100644 test/CodeGen/AArch64/arm64-stacksave.ll create mode 100644 test/CodeGen/AArch64/arm64-stp.ll create mode 100644 test/CodeGen/AArch64/arm64-strict-align.ll create mode 100644 test/CodeGen/AArch64/arm64-stur.ll create mode 100644 test/CodeGen/AArch64/arm64-subsections.ll create mode 100644 test/CodeGen/AArch64/arm64-subvector-extend.ll create mode 100644 test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll create mode 100644 test/CodeGen/AArch64/arm64-tbl.ll create 
mode 100644 test/CodeGen/AArch64/arm64-this-return.ll create mode 100644 test/CodeGen/AArch64/arm64-tls-darwin.ll create mode 100644 test/CodeGen/AArch64/arm64-tls-dynamic-together.ll create mode 100644 test/CodeGen/AArch64/arm64-tls-dynamics.ll create mode 100644 test/CodeGen/AArch64/arm64-tls-execs.ll create mode 100644 test/CodeGen/AArch64/arm64-trap.ll create mode 100644 test/CodeGen/AArch64/arm64-trn.ll create mode 100644 test/CodeGen/AArch64/arm64-trunc-store.ll create mode 100644 test/CodeGen/AArch64/arm64-umaxv.ll create mode 100644 test/CodeGen/AArch64/arm64-uminv.ll create mode 100644 test/CodeGen/AArch64/arm64-umov.ll create mode 100644 test/CodeGen/AArch64/arm64-unaligned_ldst.ll create mode 100644 test/CodeGen/AArch64/arm64-uzp.ll create mode 100644 test/CodeGen/AArch64/arm64-vaargs.ll create mode 100644 test/CodeGen/AArch64/arm64-vabs.ll create mode 100644 test/CodeGen/AArch64/arm64-vadd.ll create mode 100644 test/CodeGen/AArch64/arm64-vaddlv.ll create mode 100644 test/CodeGen/AArch64/arm64-vaddv.ll create mode 100644 test/CodeGen/AArch64/arm64-variadic-aapcs.ll create mode 100644 test/CodeGen/AArch64/arm64-vbitwise.ll create mode 100644 test/CodeGen/AArch64/arm64-vclz.ll create mode 100644 test/CodeGen/AArch64/arm64-vcmp.ll create mode 100644 test/CodeGen/AArch64/arm64-vcnt.ll create mode 100644 test/CodeGen/AArch64/arm64-vcombine.ll create mode 100644 test/CodeGen/AArch64/arm64-vcvt.ll create mode 100644 test/CodeGen/AArch64/arm64-vcvt_f.ll create mode 100644 test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll create mode 100644 test/CodeGen/AArch64/arm64-vcvt_n.ll create mode 100644 test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll create mode 100644 test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll create mode 100644 test/CodeGen/AArch64/arm64-vecCmpBr.ll create mode 100644 test/CodeGen/AArch64/arm64-vecFold.ll create mode 100644 test/CodeGen/AArch64/arm64-vector-ext.ll create mode 100644 test/CodeGen/AArch64/arm64-vector-imm.ll create mode 100644 test/CodeGen/AArch64/arm64-vector-insertion.ll create mode 100644 test/CodeGen/AArch64/arm64-vector-ldst.ll create mode 100644 test/CodeGen/AArch64/arm64-vext.ll create mode 100644 test/CodeGen/AArch64/arm64-vext_reverse.ll create mode 100644 test/CodeGen/AArch64/arm64-vfloatintrinsics.ll create mode 100644 test/CodeGen/AArch64/arm64-vhadd.ll create mode 100644 test/CodeGen/AArch64/arm64-vhsub.ll create mode 100644 test/CodeGen/AArch64/arm64-virtual_base.ll create mode 100644 test/CodeGen/AArch64/arm64-vmax.ll create mode 100644 test/CodeGen/AArch64/arm64-vminmaxnm.ll create mode 100644 test/CodeGen/AArch64/arm64-vmovn.ll create mode 100644 test/CodeGen/AArch64/arm64-vmul.ll create mode 100644 test/CodeGen/AArch64/arm64-volatile.ll create mode 100644 test/CodeGen/AArch64/arm64-vpopcnt.ll create mode 100644 test/CodeGen/AArch64/arm64-vqadd.ll create mode 100644 test/CodeGen/AArch64/arm64-vqsub.ll create mode 100644 test/CodeGen/AArch64/arm64-vselect.ll create mode 100644 test/CodeGen/AArch64/arm64-vsetcc_fp.ll create mode 100644 test/CodeGen/AArch64/arm64-vshift.ll create mode 100644 test/CodeGen/AArch64/arm64-vshr.ll create mode 100644 test/CodeGen/AArch64/arm64-vshuffle.ll create mode 100644 test/CodeGen/AArch64/arm64-vsqrt.ll create mode 100644 test/CodeGen/AArch64/arm64-vsra.ll create mode 100644 test/CodeGen/AArch64/arm64-vsub.ll create mode 100644 test/CodeGen/AArch64/arm64-weak-reference.ll create mode 100644 test/CodeGen/AArch64/arm64-xaluo.ll create mode 100644 test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll create mode 100644 
test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll create mode 100644 test/CodeGen/AArch64/arm64-zext.ll create mode 100644 test/CodeGen/AArch64/arm64-zextload-unscaled.ll create mode 100644 test/CodeGen/AArch64/arm64-zip.ll create mode 100644 test/CodeGen/AArch64/asm-large-immediate.ll delete mode 100644 test/CodeGen/AArch64/concatvector-bugs.ll create mode 100644 test/CodeGen/AArch64/eliminate-trunc.ll delete mode 100644 test/CodeGen/AArch64/fp128.ll create mode 100644 test/CodeGen/AArch64/free-zext.ll create mode 100644 test/CodeGen/AArch64/i1-contents.ll delete mode 100644 test/CodeGen/AArch64/i128-shift.ll delete mode 100644 test/CodeGen/AArch64/inline-asm-constraints.ll delete mode 100644 test/CodeGen/AArch64/inline-asm-modifiers.ll delete mode 100644 test/CodeGen/AArch64/large-frame.ll create mode 100644 test/CodeGen/AArch64/ldst-opt.ll delete mode 100644 test/CodeGen/AArch64/literal_pools.ll create mode 100644 test/CodeGen/AArch64/literal_pools_float.ll delete mode 100644 test/CodeGen/AArch64/misched-basic-A53.ll delete mode 100644 test/CodeGen/AArch64/neon-2velem-high.ll delete mode 100644 test/CodeGen/AArch64/neon-2velem.ll delete mode 100644 test/CodeGen/AArch64/neon-3vdiff.ll delete mode 100644 test/CodeGen/AArch64/neon-aba-abd.ll delete mode 100644 test/CodeGen/AArch64/neon-across.ll delete mode 100644 test/CodeGen/AArch64/neon-add-pairwise.ll delete mode 100644 test/CodeGen/AArch64/neon-add-sub.ll delete mode 100644 test/CodeGen/AArch64/neon-bsl.ll delete mode 100644 test/CodeGen/AArch64/neon-copy.ll delete mode 100644 test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll delete mode 100644 test/CodeGen/AArch64/neon-crypto.ll delete mode 100644 test/CodeGen/AArch64/neon-facge-facgt.ll delete mode 100644 test/CodeGen/AArch64/neon-frsqrt-frecp.ll delete mode 100644 test/CodeGen/AArch64/neon-halving-add-sub.ll create mode 100644 test/CodeGen/AArch64/neon-idiv.ll delete mode 100644 test/CodeGen/AArch64/neon-load-store-v1i32.ll delete mode 100644 test/CodeGen/AArch64/neon-max-min-pairwise.ll delete mode 100644 test/CodeGen/AArch64/neon-max-min.ll delete mode 100644 test/CodeGen/AArch64/neon-misc-scalar.ll delete mode 100644 test/CodeGen/AArch64/neon-misc.ll delete mode 100644 test/CodeGen/AArch64/neon-mul-div.ll delete mode 100644 test/CodeGen/AArch64/neon-rounding-halving-add.ll delete mode 100644 test/CodeGen/AArch64/neon-rounding-shift.ll delete mode 100644 test/CodeGen/AArch64/neon-saturating-add-sub.ll delete mode 100644 test/CodeGen/AArch64/neon-saturating-rounding-shift.ll delete mode 100644 test/CodeGen/AArch64/neon-saturating-shift.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-abs.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-add-sub.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-compare.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-cvt.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-ext.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-extract-narrow.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-fabd.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-fcvt.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-fp-compare.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-mul.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-neg.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-recip.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-rounding-shift.ll delete mode 100644 
test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-saturating-shift.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-shift-imm.ll delete mode 100644 test/CodeGen/AArch64/neon-scalar-shift.ll delete mode 100644 test/CodeGen/AArch64/neon-select_cc.ll delete mode 100644 test/CodeGen/AArch64/neon-shift.ll delete mode 100644 test/CodeGen/AArch64/neon-shl-ashr-lshr.ll delete mode 100644 test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll delete mode 100644 test/CodeGen/AArch64/neon-simd-ldst-one.ll delete mode 100644 test/CodeGen/AArch64/neon-simd-ldst.ll delete mode 100644 test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll delete mode 100644 test/CodeGen/AArch64/neon-simd-post-ldst-one.ll delete mode 100644 test/CodeGen/AArch64/neon-simd-shift.ll delete mode 100644 test/CodeGen/AArch64/neon-simd-tbl.ll delete mode 100644 test/CodeGen/AArch64/neon-simd-vget.ll delete mode 100644 test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll delete mode 100644 test/CodeGen/AArch64/neon-v1i1-setcc.ll delete mode 100644 test/CodeGen/AArch64/neon-vector-list-spill.ll create mode 100644 test/CodeGen/AArch64/nzcv-save.ll delete mode 100644 test/CodeGen/AArch64/regress-wzr-allocatable.ll delete mode 100644 test/CodeGen/AArch64/sext_inreg.ll delete mode 100644 test/CodeGen/AArch64/tls-dynamic-together.ll delete mode 100644 test/CodeGen/AArch64/tls-dynamics.ll delete mode 100644 test/CodeGen/AArch64/tls-execs.ll delete mode 100644 test/CodeGen/AArch64/variadic.ll create mode 100644 test/CodeGen/ARM/2014-05-14-DwarfEHCrash.ll create mode 100644 test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll create mode 100644 test/CodeGen/ARM/Windows/chkstk.ll create mode 100644 test/CodeGen/ARM/Windows/frame-register.ll create mode 100644 test/CodeGen/ARM/Windows/integer-floating-point-conversion.ll create mode 100644 test/CodeGen/ARM/Windows/memset.ll create mode 100644 test/CodeGen/ARM/Windows/mov32t-bundling.ll create mode 100644 test/CodeGen/ARM/Windows/movw-movt-relocations.ll create mode 100644 test/CodeGen/ARM/Windows/pic.ll create mode 100644 test/CodeGen/ARM/Windows/read-only-data.ll create mode 100644 test/CodeGen/ARM/aapcs-hfa-code.ll create mode 100644 test/CodeGen/ARM/aapcs-hfa.ll create mode 100644 test/CodeGen/ARM/big-endian-eh-unwind.ll create mode 100644 test/CodeGen/ARM/big-endian-neon-bitconv.ll create mode 100644 test/CodeGen/ARM/big-endian-vector-callee.ll create mode 100644 test/CodeGen/ARM/big-endian-vector-caller.ll create mode 100644 test/CodeGen/ARM/bswap16.ll create mode 100644 test/CodeGen/ARM/dwarf-eh.ll create mode 100644 test/CodeGen/ARM/ehabi-handlerdata-nounwind.ll create mode 100644 test/CodeGen/ARM/ehabi-handlerdata.ll create mode 100644 test/CodeGen/ARM/frame-register.ll create mode 100644 test/CodeGen/ARM/func-argpassing-endian.ll create mode 100644 test/CodeGen/ARM/hfa-in-contiguous-registers.ll create mode 100644 test/CodeGen/ARM/hints.ll create mode 100644 test/CodeGen/ARM/indirect-hidden.ll create mode 100644 test/CodeGen/ARM/intrinsics-overflow.ll create mode 100644 test/CodeGen/ARM/named-reg-alloc.ll create mode 100644 test/CodeGen/ARM/named-reg-notareg.ll create mode 100644 test/CodeGen/ARM/stackpointer.ll create mode 100644 test/CodeGen/ARM/undefined.ll create mode 100644 test/CodeGen/ARM/vfp-libcalls.ll delete mode 100644 test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll delete mode 100644 test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll delete mode 
100644 test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll delete mode 100644 test/CodeGen/ARM64/2011-04-21-CPSRBug.ll delete mode 100644 test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll delete mode 100644 test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll delete mode 100644 test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll delete mode 100644 test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll delete mode 100644 test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll delete mode 100644 test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll delete mode 100644 test/CodeGen/ARM64/2012-06-06-FPToUI.ll delete mode 100644 test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll delete mode 100644 test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll delete mode 100644 test/CodeGen/ARM64/2013-01-23-frem-crash.ll delete mode 100644 test/CodeGen/ARM64/2013-01-23-sext-crash.ll delete mode 100644 test/CodeGen/ARM64/2013-02-12-shufv8i8.ll delete mode 100644 test/CodeGen/ARM64/AdvSIMD-Scalar.ll delete mode 100644 test/CodeGen/ARM64/aapcs.ll delete mode 100644 test/CodeGen/ARM64/abi-varargs.ll delete mode 100644 test/CodeGen/ARM64/abi.ll delete mode 100644 test/CodeGen/ARM64/abi_align.ll delete mode 100644 test/CodeGen/ARM64/addp.ll delete mode 100644 test/CodeGen/ARM64/addr-mode-folding.ll delete mode 100644 test/CodeGen/ARM64/addr-type-promotion.ll delete mode 100644 test/CodeGen/ARM64/addrmode.ll delete mode 100644 test/CodeGen/ARM64/alloc-no-stack-realign.ll delete mode 100644 test/CodeGen/ARM64/alloca-frame-pointer-offset.ll delete mode 100644 test/CodeGen/ARM64/andCmpBrToTBZ.ll delete mode 100644 test/CodeGen/ARM64/anyregcc-crash.ll delete mode 100644 test/CodeGen/ARM64/anyregcc.ll delete mode 100644 test/CodeGen/ARM64/arith-saturating.ll delete mode 100644 test/CodeGen/ARM64/arith.ll delete mode 100644 test/CodeGen/ARM64/atomic-128.ll delete mode 100644 test/CodeGen/ARM64/atomic.ll delete mode 100644 test/CodeGen/ARM64/basic-pic.ll delete mode 100644 test/CodeGen/ARM64/big-imm-offsets.ll delete mode 100644 test/CodeGen/ARM64/big-stack.ll delete mode 100644 test/CodeGen/ARM64/bitfield-extract.ll delete mode 100644 test/CodeGen/ARM64/blockaddress.ll delete mode 100644 test/CodeGen/ARM64/build-vector.ll delete mode 100644 test/CodeGen/ARM64/call-tailcalls.ll delete mode 100644 test/CodeGen/ARM64/cast-opt.ll delete mode 100644 test/CodeGen/ARM64/ccmp-heuristics.ll delete mode 100644 test/CodeGen/ARM64/ccmp.ll delete mode 100644 test/CodeGen/ARM64/coalesce-ext.ll delete mode 100644 test/CodeGen/ARM64/code-model-large-abs.ll delete mode 100644 test/CodeGen/ARM64/collect-loh-garbage-crash.ll delete mode 100644 test/CodeGen/ARM64/collect-loh-str.ll delete mode 100644 test/CodeGen/ARM64/collect-loh.ll delete mode 100644 test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S delete mode 100644 test/CodeGen/ARM64/complex-ret.ll delete mode 100644 test/CodeGen/ARM64/convert-v2f64-v2i32.ll delete mode 100644 test/CodeGen/ARM64/convert-v2i32-v2f64.ll delete mode 100644 test/CodeGen/ARM64/copy-tuple.ll delete mode 100644 test/CodeGen/ARM64/crc32.ll delete mode 100644 test/CodeGen/ARM64/crypto.ll delete mode 100644 test/CodeGen/ARM64/cse.ll delete mode 100644 test/CodeGen/ARM64/csel.ll delete mode 100644 test/CodeGen/ARM64/cvt.ll delete mode 100644 test/CodeGen/ARM64/dagcombiner-convergence.ll delete mode 100644 test/CodeGen/ARM64/dagcombiner-load-slicing.ll delete mode 100644 test/CodeGen/ARM64/dup.ll delete mode 100644 test/CodeGen/ARM64/early-ifcvt.ll delete mode 100644 test/CodeGen/ARM64/elf-calls.ll delete mode 100644 
test/CodeGen/ARM64/elf-constpool.ll delete mode 100644 test/CodeGen/ARM64/elf-globals.ll delete mode 100644 test/CodeGen/ARM64/ext.ll delete mode 100644 test/CodeGen/ARM64/extend-int-to-fp.ll delete mode 100644 test/CodeGen/ARM64/extend.ll delete mode 100644 test/CodeGen/ARM64/extern-weak.ll delete mode 100644 test/CodeGen/ARM64/extload-knownzero.ll delete mode 100644 test/CodeGen/ARM64/extract.ll delete mode 100644 test/CodeGen/ARM64/extract_subvector.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-addr-offset.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-alloca.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-br.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-call.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-conversion.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-fcmp.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-gv.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-icmp.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-indirectbr.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-intrinsic.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-materialize.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-noconvert.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-rem.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-ret.ll delete mode 100644 test/CodeGen/ARM64/fast-isel-select.ll delete mode 100644 test/CodeGen/ARM64/fast-isel.ll delete mode 100644 test/CodeGen/ARM64/fastcc-tailcall.ll delete mode 100644 test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll delete mode 100644 test/CodeGen/ARM64/fcmp-opt.ll delete mode 100644 test/CodeGen/ARM64/fcopysign.ll delete mode 100644 test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll delete mode 100644 test/CodeGen/ARM64/fmadd.ll delete mode 100644 test/CodeGen/ARM64/fmax.ll delete mode 100644 test/CodeGen/ARM64/fminv.ll delete mode 100644 test/CodeGen/ARM64/fmuladd.ll delete mode 100644 test/CodeGen/ARM64/fold-address.ll delete mode 100644 test/CodeGen/ARM64/fold-lsl.ll delete mode 100644 test/CodeGen/ARM64/fp-imm.ll delete mode 100644 test/CodeGen/ARM64/fp.ll delete mode 100644 test/CodeGen/ARM64/fp128-folding.ll delete mode 100644 test/CodeGen/ARM64/fp128.ll delete mode 100644 test/CodeGen/ARM64/frame-index.ll delete mode 100644 test/CodeGen/ARM64/frameaddr.ll delete mode 100644 test/CodeGen/ARM64/global-address.ll delete mode 100644 test/CodeGen/ARM64/hello.ll delete mode 100644 test/CodeGen/ARM64/i16-subreg-extract.ll delete mode 100644 test/CodeGen/ARM64/icmp-opt.ll delete mode 100644 test/CodeGen/ARM64/illegal-float-ops.ll delete mode 100644 test/CodeGen/ARM64/indexed-memory.ll delete mode 100644 test/CodeGen/ARM64/inline-asm-error-I.ll delete mode 100644 test/CodeGen/ARM64/inline-asm-error-J.ll delete mode 100644 test/CodeGen/ARM64/inline-asm-error-K.ll delete mode 100644 test/CodeGen/ARM64/inline-asm-error-L.ll delete mode 100644 test/CodeGen/ARM64/inline-asm-error-M.ll delete mode 100644 test/CodeGen/ARM64/inline-asm-error-N.ll delete mode 100644 test/CodeGen/ARM64/inline-asm-zero-reg-error.ll delete mode 100644 test/CodeGen/ARM64/inline-asm.ll delete mode 100644 test/CodeGen/ARM64/join-reserved.ll delete mode 100644 test/CodeGen/ARM64/jumptable.ll delete mode 100644 test/CodeGen/ARM64/ld1.ll delete mode 100644 test/CodeGen/ARM64/ldp.ll delete mode 100644 test/CodeGen/ARM64/ldur.ll delete mode 100644 test/CodeGen/ARM64/ldxr-stxr.ll delete mode 100644 test/CodeGen/ARM64/leaf-compact-unwind.ll delete mode 100644 test/CodeGen/ARM64/leaf.ll delete mode 100644 test/CodeGen/ARM64/lit.local.cfg delete mode 
100644 test/CodeGen/ARM64/long-shift.ll delete mode 100644 test/CodeGen/ARM64/memcpy-inline.ll delete mode 100644 test/CodeGen/ARM64/memset-inline.ll delete mode 100644 test/CodeGen/ARM64/memset-to-bzero.ll delete mode 100644 test/CodeGen/ARM64/movi.ll delete mode 100644 test/CodeGen/ARM64/mul.ll delete mode 100644 test/CodeGen/ARM64/neg.ll delete mode 100644 test/CodeGen/ARM64/neon-compare-instructions.ll delete mode 100644 test/CodeGen/ARM64/patchpoint.ll delete mode 100644 test/CodeGen/ARM64/platform-reg.ll delete mode 100644 test/CodeGen/ARM64/popcnt.ll delete mode 100644 test/CodeGen/ARM64/prefetch.ll delete mode 100644 test/CodeGen/ARM64/promote-const.ll delete mode 100644 test/CodeGen/ARM64/redzone.ll delete mode 100644 test/CodeGen/ARM64/register-offset-addressing.ll delete mode 100644 test/CodeGen/ARM64/register-pairing.ll delete mode 100644 test/CodeGen/ARM64/regress-f128csel-flags.ll delete mode 100644 test/CodeGen/ARM64/regress-interphase-shift.ll delete mode 100644 test/CodeGen/ARM64/return-vector.ll delete mode 100644 test/CodeGen/ARM64/returnaddr.ll delete mode 100644 test/CodeGen/ARM64/rev.ll delete mode 100644 test/CodeGen/ARM64/rounding.ll delete mode 100644 test/CodeGen/ARM64/scaled_iv.ll delete mode 100644 test/CodeGen/ARM64/scvt.ll delete mode 100644 test/CodeGen/ARM64/shifted-sext.ll delete mode 100644 test/CodeGen/ARM64/simd-scalar-to-vector.ll delete mode 100644 test/CodeGen/ARM64/simplest-elf.ll delete mode 100644 test/CodeGen/ARM64/sincos.ll delete mode 100644 test/CodeGen/ARM64/sitofp-combine-chains.ll delete mode 100644 test/CodeGen/ARM64/sli-sri-opt.ll delete mode 100644 test/CodeGen/ARM64/smaxv.ll delete mode 100644 test/CodeGen/ARM64/sminv.ll delete mode 100644 test/CodeGen/ARM64/spill-lr.ll delete mode 100644 test/CodeGen/ARM64/spill.ll delete mode 100644 test/CodeGen/ARM64/st1.ll delete mode 100644 test/CodeGen/ARM64/stack-no-frame.ll delete mode 100644 test/CodeGen/ARM64/stackmap.ll delete mode 100644 test/CodeGen/ARM64/stacksave.ll delete mode 100644 test/CodeGen/ARM64/stp.ll delete mode 100644 test/CodeGen/ARM64/strict-align.ll delete mode 100644 test/CodeGen/ARM64/stur.ll delete mode 100644 test/CodeGen/ARM64/subvector-extend.ll delete mode 100644 test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll delete mode 100644 test/CodeGen/ARM64/tbl.ll delete mode 100644 test/CodeGen/ARM64/this-return.ll delete mode 100644 test/CodeGen/ARM64/tls-darwin.ll delete mode 100644 test/CodeGen/ARM64/tls-dynamic-together.ll delete mode 100644 test/CodeGen/ARM64/tls-dynamics.ll delete mode 100644 test/CodeGen/ARM64/tls-execs.ll delete mode 100644 test/CodeGen/ARM64/trap.ll delete mode 100644 test/CodeGen/ARM64/trn.ll delete mode 100644 test/CodeGen/ARM64/trunc-store.ll delete mode 100644 test/CodeGen/ARM64/umaxv.ll delete mode 100644 test/CodeGen/ARM64/uminv.ll delete mode 100644 test/CodeGen/ARM64/umov.ll delete mode 100644 test/CodeGen/ARM64/unaligned_ldst.ll delete mode 100644 test/CodeGen/ARM64/uzp.ll delete mode 100644 test/CodeGen/ARM64/vaargs.ll delete mode 100644 test/CodeGen/ARM64/vabs.ll delete mode 100644 test/CodeGen/ARM64/vadd.ll delete mode 100644 test/CodeGen/ARM64/vaddlv.ll delete mode 100644 test/CodeGen/ARM64/vaddv.ll delete mode 100644 test/CodeGen/ARM64/variadic-aapcs.ll delete mode 100644 test/CodeGen/ARM64/vbitwise.ll delete mode 100644 test/CodeGen/ARM64/vclz.ll delete mode 100644 test/CodeGen/ARM64/vcmp.ll delete mode 100644 test/CodeGen/ARM64/vcnt.ll delete mode 100644 test/CodeGen/ARM64/vcombine.ll delete mode 100644 test/CodeGen/ARM64/vcvt.ll delete 
mode 100644 test/CodeGen/ARM64/vcvt_f.ll delete mode 100644 test/CodeGen/ARM64/vcvt_f32_su32.ll delete mode 100644 test/CodeGen/ARM64/vcvt_n.ll delete mode 100644 test/CodeGen/ARM64/vcvt_su32_f32.ll delete mode 100644 test/CodeGen/ARM64/vcvtxd_f32_f64.ll delete mode 100644 test/CodeGen/ARM64/vecCmpBr.ll delete mode 100644 test/CodeGen/ARM64/vecFold.ll delete mode 100644 test/CodeGen/ARM64/vector-ext.ll delete mode 100644 test/CodeGen/ARM64/vector-imm.ll delete mode 100644 test/CodeGen/ARM64/vector-ldst.ll delete mode 100644 test/CodeGen/ARM64/vext.ll delete mode 100644 test/CodeGen/ARM64/vfloatintrinsics.ll delete mode 100644 test/CodeGen/ARM64/vhadd.ll delete mode 100644 test/CodeGen/ARM64/vhsub.ll delete mode 100644 test/CodeGen/ARM64/virtual_base.ll delete mode 100644 test/CodeGen/ARM64/vmax.ll delete mode 100644 test/CodeGen/ARM64/vminmaxnm.ll delete mode 100644 test/CodeGen/ARM64/vmovn.ll delete mode 100644 test/CodeGen/ARM64/vmul.ll delete mode 100644 test/CodeGen/ARM64/volatile.ll delete mode 100644 test/CodeGen/ARM64/vqadd.ll delete mode 100644 test/CodeGen/ARM64/vqsub.ll delete mode 100644 test/CodeGen/ARM64/vselect.ll delete mode 100644 test/CodeGen/ARM64/vsetcc_fp.ll delete mode 100644 test/CodeGen/ARM64/vshift.ll delete mode 100644 test/CodeGen/ARM64/vshr.ll delete mode 100644 test/CodeGen/ARM64/vshuffle.ll delete mode 100644 test/CodeGen/ARM64/vsqrt.ll delete mode 100644 test/CodeGen/ARM64/vsra.ll delete mode 100644 test/CodeGen/ARM64/vsub.ll delete mode 100644 test/CodeGen/ARM64/weak-reference.ll delete mode 100644 test/CodeGen/ARM64/xaluo.ll delete mode 100644 test/CodeGen/ARM64/zero-cycle-regmov.ll delete mode 100644 test/CodeGen/ARM64/zero-cycle-zeroing.ll delete mode 100644 test/CodeGen/ARM64/zext.ll delete mode 100644 test/CodeGen/ARM64/zextload-unscaled.ll delete mode 100644 test/CodeGen/ARM64/zip.ll create mode 100644 test/CodeGen/Mips/Fast-ISel/nullvoid.ll create mode 100644 test/CodeGen/Mips/Fast-ISel/simplestore.ll create mode 100644 test/CodeGen/Mips/Fast-ISel/simplestorei.ll create mode 100644 test/CodeGen/Mips/cconv/arguments-float.ll create mode 100644 test/CodeGen/Mips/cconv/arguments-fp128.ll create mode 100644 test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll create mode 100644 test/CodeGen/Mips/cconv/arguments-hard-float.ll create mode 100644 test/CodeGen/Mips/cconv/arguments-hard-fp128.ll create mode 100644 test/CodeGen/Mips/cconv/arguments.ll create mode 100644 test/CodeGen/Mips/cconv/callee-saved-float.ll create mode 100644 test/CodeGen/Mips/cconv/callee-saved.ll create mode 100644 test/CodeGen/Mips/cconv/memory-layout.ll create mode 100644 test/CodeGen/Mips/cconv/reserved-space.ll create mode 100644 test/CodeGen/Mips/cconv/return-float.ll create mode 100644 test/CodeGen/Mips/cconv/return-hard-float.ll create mode 100644 test/CodeGen/Mips/cconv/return-hard-fp128.ll create mode 100644 test/CodeGen/Mips/cconv/return.ll create mode 100644 test/CodeGen/Mips/cconv/stack-alignment.ll delete mode 100644 test/CodeGen/Mips/elf_st_other.ll create mode 100644 test/CodeGen/Mips/micromips-directives.ll delete mode 100644 test/CodeGen/Mips/micromips-long-branch.ll create mode 100644 test/CodeGen/Mips/mips32r6/compatibility.ll delete mode 100644 test/CodeGen/Mips/mips64load-store-left-right.ll create mode 100644 test/CodeGen/Mips/mips64r6/compatibility.ll create mode 100644 test/CodeGen/Mips/start-asm-file.ll create mode 100644 test/CodeGen/NVPTX/access-non-generic.ll create mode 100644 test/CodeGen/NVPTX/addrspacecast-gvar.ll create mode 100644 
test/CodeGen/NVPTX/surf-read.ll create mode 100644 test/CodeGen/NVPTX/surf-write.ll create mode 100644 test/CodeGen/NVPTX/tex-read.ll create mode 100644 test/CodeGen/PowerPC/alias.ll create mode 100644 test/CodeGen/PowerPC/cc.ll create mode 100644 test/CodeGen/PowerPC/ctrloop-sh.ll create mode 100644 test/CodeGen/PowerPC/indexed-load.ll create mode 100644 test/CodeGen/PowerPC/named-reg-alloc-r0.ll create mode 100644 test/CodeGen/PowerPC/named-reg-alloc-r1-64.ll create mode 100644 test/CodeGen/PowerPC/named-reg-alloc-r1.ll create mode 100644 test/CodeGen/PowerPC/named-reg-alloc-r13-64.ll create mode 100644 test/CodeGen/PowerPC/named-reg-alloc-r13.ll create mode 100644 test/CodeGen/PowerPC/named-reg-alloc-r2-64.ll create mode 100644 test/CodeGen/PowerPC/named-reg-alloc-r2.ll create mode 100644 test/CodeGen/PowerPC/rlwimi-dyn-and.ll create mode 100644 test/CodeGen/PowerPC/splat-bug.ll create mode 100644 test/CodeGen/R600/call.ll create mode 100644 test/CodeGen/R600/extract_vector_elt_i16.ll create mode 100644 test/CodeGen/R600/fp_to_uint.f64.ll create mode 100644 test/CodeGen/R600/gv-const-addrspace-fail.ll create mode 100644 test/CodeGen/R600/llvm.AMDGPU.imad24.ll create mode 100644 test/CodeGen/R600/llvm.AMDGPU.imul24.ll create mode 100644 test/CodeGen/R600/llvm.AMDGPU.umad24.ll create mode 100644 test/CodeGen/R600/llvm.AMDGPU.umul24.ll create mode 100644 test/CodeGen/R600/llvm.rint.f64.ll create mode 100644 test/CodeGen/R600/load-i1.ll create mode 100644 test/CodeGen/R600/selectcc.ll create mode 100644 test/CodeGen/R600/sgpr-control-flow.ll create mode 100644 test/CodeGen/R600/simplify-demanded-bits-build-pair.ll create mode 100644 test/CodeGen/R600/uaddo.ll create mode 100644 test/CodeGen/R600/udivrem64.ll create mode 100644 test/CodeGen/R600/uint_to_fp.f64.ll create mode 100644 test/CodeGen/R600/valu-i1.ll create mode 100644 test/CodeGen/SPARC/sret-secondary.ll create mode 100644 test/CodeGen/Thumb/thumb-ldm.ll create mode 100644 test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll create mode 100644 test/CodeGen/X86/avoid_complex_am.ll create mode 100644 test/CodeGen/X86/avx.ll create mode 100644 test/CodeGen/X86/avx1-logical-load-folding.ll create mode 100644 test/CodeGen/X86/avx2-blend.ll delete mode 100644 test/CodeGen/X86/cdecl-method-return.ll create mode 100644 test/CodeGen/X86/cfi.ll create mode 100644 test/CodeGen/X86/codegen-prepare-crash.ll create mode 100644 test/CodeGen/X86/combine-avx-intrinsics.ll create mode 100644 test/CodeGen/X86/combine-avx2-intrinsics.ll create mode 100644 test/CodeGen/X86/combine-sse2-intrinsics.ll create mode 100644 test/CodeGen/X86/combine-sse41-intrinsics.ll create mode 100644 test/CodeGen/X86/constant-hoisting-shift-immediate.ll create mode 100644 test/CodeGen/X86/expand-opaque-const.ll create mode 100644 test/CodeGen/X86/fma-do-not-commute.ll create mode 100644 test/CodeGen/X86/indirect-hidden.ll create mode 100644 test/CodeGen/X86/lower-bitcast.ll create mode 100644 test/CodeGen/X86/lower-vec-shift.ll create mode 100644 test/CodeGen/X86/lzcnt-tzcnt.ll create mode 100644 test/CodeGen/X86/mod128.ll create mode 100644 test/CodeGen/X86/musttail-indirect.ll create mode 100644 test/CodeGen/X86/musttail-thiscall.ll create mode 100644 test/CodeGen/X86/musttail.ll create mode 100644 test/CodeGen/X86/named-reg-alloc.ll create mode 100644 test/CodeGen/X86/named-reg-notareg.ll delete mode 100644 test/CodeGen/X86/no-cfi.ll create mode 100644 test/CodeGen/X86/remat-invalid-liveness.ll delete mode 100644 test/CodeGen/X86/rotate3.ll create mode 100644 
test/CodeGen/X86/stackpointer.ll create mode 100644 test/CodeGen/X86/vec_shuffle-41.ll create mode 100644 test/CodeGen/X86/vector-idiv.ll create mode 100644 test/CodeGen/X86/x86-64-sret-return-2.ll delete mode 100644 test/DebugInfo/AArch64/cfi-frame.ll create mode 100644 test/DebugInfo/AArch64/struct_by_value.ll delete mode 100644 test/DebugInfo/AArch64/variable-loc.ll delete mode 100644 test/DebugInfo/ARM64/lit.local.cfg delete mode 100644 test/DebugInfo/ARM64/struct_by_value.ll create mode 100755 test/DebugInfo/Inputs/llvm-symbolizer-dwo-test create mode 100644 test/DebugInfo/Inputs/llvm-symbolizer-dwo-test.cc create mode 100644 test/DebugInfo/Mips/delay-slot.ll create mode 100644 test/DebugInfo/Mips/lit.local.cfg create mode 100644 test/DebugInfo/X86/DW_AT_linkage_name.ll create mode 100644 test/DebugInfo/X86/array.ll create mode 100644 test/DebugInfo/X86/array2.ll delete mode 100644 test/DebugInfo/X86/dbg-large-unsigned-const.ll create mode 100644 test/DebugInfo/X86/debug-dead-local-var.ll create mode 100644 test/DebugInfo/X86/debug-ranges-offset.ll create mode 100644 test/DebugInfo/X86/formal_parameter.ll create mode 100644 test/DebugInfo/X86/inline-member-function.ll create mode 100644 test/DebugInfo/X86/inline-seldag-test.ll create mode 100644 test/DebugInfo/X86/type_units_with_addresses.ll create mode 100644 test/DebugInfo/constant-pointers.ll create mode 100644 test/DebugInfo/cross-cu-inlining.ll create mode 100644 test/DebugInfo/cross-cu-linkonce.ll delete mode 100644 test/DebugInfo/dwarfdump-inlining.test create mode 100644 test/DebugInfo/dwarfdump-ranges.test delete mode 100644 test/DebugInfo/dwarfdump-test.test create mode 100644 test/DebugInfo/inline-scopes.ll create mode 100644 test/DebugInfo/llvm-symbolizer-zlib.test create mode 100644 test/DebugInfo/namespace_function_definition.ll create mode 100644 test/DebugInfo/namespace_inline_function_definition.ll create mode 100644 test/DebugInfo/restrict.ll create mode 100644 test/DebugInfo/sugared-constants.ll create mode 100644 test/DebugInfo/typedef.ll delete mode 100644 test/ExecutionEngine/RuntimeDyld/arm_secdiff_reloc.test create mode 100644 test/ExecutionEngine/RuntimeDyld/macho_relocations.test create mode 100644 test/Feature/alias2.ll create mode 100644 test/Instrumentation/AddressSanitizer/X86/asm_attr.ll create mode 100644 test/Instrumentation/AddressSanitizer/X86/asm_swap_intel.s create mode 100644 test/Instrumentation/AddressSanitizer/coverage-dbg.ll create mode 100644 test/Instrumentation/AddressSanitizer/instrumentation-with-call-threshold.ll create mode 100644 test/Instrumentation/MemorySanitizer/do-not-emit-module-limits.ll create mode 100644 test/Instrumentation/MemorySanitizer/instrumentation-with-call-threshold.ll create mode 100644 test/LTO/attrs.ll create mode 100644 test/Linker/Inputs/alias.ll create mode 100644 test/Linker/Inputs/cycle.ll create mode 100644 test/Linker/Inputs/old_global_ctors.3.4.bc create mode 100644 test/Linker/alias.ll create mode 100644 test/Linker/cycle.ll create mode 100644 test/Linker/debug-info-version-a.ll create mode 100644 test/Linker/debug-info-version-b.ll create mode 100644 test/Linker/global_ctors.ll create mode 100644 test/MC/AArch64/arm64-adr.s create mode 100644 test/MC/AArch64/arm64-advsimd.s create mode 100644 test/MC/AArch64/arm64-aliases.s create mode 100644 test/MC/AArch64/arm64-arithmetic-encoding.s create mode 100644 test/MC/AArch64/arm64-arm64-fixup.s create mode 100644 test/MC/AArch64/arm64-basic-a64-instructions.s create mode 100644 
test/MC/AArch64/arm64-be-datalayout.s create mode 100644 test/MC/AArch64/arm64-bitfield-encoding.s create mode 100644 test/MC/AArch64/arm64-branch-encoding.s create mode 100644 test/MC/AArch64/arm64-condbr-without-dots.s create mode 100644 test/MC/AArch64/arm64-crypto.s create mode 100644 test/MC/AArch64/arm64-diagno-predicate.s create mode 100644 test/MC/AArch64/arm64-diags.s create mode 100644 test/MC/AArch64/arm64-directive_loh.s create mode 100644 test/MC/AArch64/arm64-elf-reloc-condbr.s create mode 100644 test/MC/AArch64/arm64-elf-relocs.s create mode 100644 test/MC/AArch64/arm64-fp-encoding.s create mode 100644 test/MC/AArch64/arm64-large-relocs.s create mode 100644 test/MC/AArch64/arm64-leaf-compact-unwind.s create mode 100644 test/MC/AArch64/arm64-logical-encoding.s create mode 100644 test/MC/AArch64/arm64-mapping-across-sections.s create mode 100644 test/MC/AArch64/arm64-mapping-within-section.s create mode 100644 test/MC/AArch64/arm64-memory.s create mode 100644 test/MC/AArch64/arm64-nv-cond.s create mode 100644 test/MC/AArch64/arm64-optional-hash.s create mode 100644 test/MC/AArch64/arm64-separator.s create mode 100644 test/MC/AArch64/arm64-simd-ldst.s create mode 100644 test/MC/AArch64/arm64-small-data-fixups.s create mode 100644 test/MC/AArch64/arm64-spsel-sysreg.s create mode 100644 test/MC/AArch64/arm64-system-encoding.s create mode 100644 test/MC/AArch64/arm64-target-specific-sysreg.s create mode 100644 test/MC/AArch64/arm64-tls-modifiers-darwin.s create mode 100644 test/MC/AArch64/arm64-tls-relocs.s create mode 100644 test/MC/AArch64/arm64-v128_lo-diagnostics.s create mode 100644 test/MC/AArch64/arm64-variable-exprs.s create mode 100644 test/MC/AArch64/arm64-vector-lists.s create mode 100644 test/MC/AArch64/arm64-verbose-vector-case.s delete mode 100644 test/MC/AArch64/elf-reloc-addend.s delete mode 100644 test/MC/AArch64/elf-reloc-condbr.s create mode 100644 test/MC/ARM/Windows/mov32t-range.s create mode 100644 test/MC/ARM/big-endian-arm-fixup.s create mode 100644 test/MC/ARM/big-endian-thumb-fixup.s create mode 100644 test/MC/ARM/big-endian-thumb2-fixup.s create mode 100644 test/MC/ARM/coff-debugging-secrel.ll create mode 100644 test/MC/ARM/coff-file.s create mode 100644 test/MC/ARM/coff-function-type-info.ll create mode 100644 test/MC/ARM/coff-relocations.s delete mode 100644 test/MC/ARM/eh-directive-save-diagnoatics.s create mode 100644 test/MC/ARM/eh-directive-save-diagnostics.s create mode 100644 test/MC/ARM/ldrd-strd-gnu-arm-bad-imm.s create mode 100644 test/MC/ARM/ldrd-strd-gnu-arm.s create mode 100644 test/MC/ARM/ldrd-strd-gnu-thumb-bad-regs.s create mode 100644 test/MC/ARM/ldrd-strd-gnu-thumb.s create mode 100644 test/MC/ARM/neon-vld-vst-align.s create mode 100644 test/MC/ARM/thumb2-strd.s create mode 100644 test/MC/ARM/thumb2be-b.w-encoding.s create mode 100644 test/MC/ARM/thumb2be-beq.w-encoding.s create mode 100644 test/MC/ARM/thumb2be-movt-encoding.s create mode 100644 test/MC/ARM/thumb2be-movw-encoding.s create mode 100644 test/MC/ARM/udf-arm-diagnostics.s create mode 100644 test/MC/ARM/udf-arm.s create mode 100644 test/MC/ARM/udf-thumb-2-diagnostics.s create mode 100644 test/MC/ARM/udf-thumb-2.s create mode 100644 test/MC/ARM/udf-thumb-diagnostics.s create mode 100644 test/MC/ARM/udf-thumb.s create mode 100644 test/MC/ARM/vmov-vmvn-byte-replicate.s create mode 100644 test/MC/ARM/vmov-vmvn-illegal-cases.s create mode 100644 test/MC/ARM/vorr-vbic-illegal-cases.s delete mode 100644 test/MC/ARM64/advsimd.s delete mode 100644 test/MC/ARM64/aliases.s delete mode 
100644 test/MC/ARM64/arithmetic-encoding.s delete mode 100644 test/MC/ARM64/arm64-fixup.s delete mode 100644 test/MC/ARM64/basic-a64-instructions.s delete mode 100644 test/MC/ARM64/bitfield-encoding.s delete mode 100644 test/MC/ARM64/branch-encoding.s delete mode 100644 test/MC/ARM64/crypto.s delete mode 100644 test/MC/ARM64/diags.s delete mode 100644 test/MC/ARM64/directive_loh.s delete mode 100644 test/MC/ARM64/elf-relocs.s delete mode 100644 test/MC/ARM64/fp-encoding.s delete mode 100644 test/MC/ARM64/large-relocs.s delete mode 100644 test/MC/ARM64/lit.local.cfg delete mode 100644 test/MC/ARM64/logical-encoding.s delete mode 100644 test/MC/ARM64/mapping-across-sections.s delete mode 100644 test/MC/ARM64/mapping-within-section.s delete mode 100644 test/MC/ARM64/memory.s delete mode 100644 test/MC/ARM64/separator.s delete mode 100644 test/MC/ARM64/simd-ldst.s delete mode 100644 test/MC/ARM64/small-data-fixups.s delete mode 100644 test/MC/ARM64/system-encoding.s delete mode 100644 test/MC/ARM64/tls-modifiers-darwin.s delete mode 100644 test/MC/ARM64/tls-relocs.s delete mode 100644 test/MC/ARM64/variable-exprs.s create mode 100644 test/MC/AsmParser/cfi-invalid-startproc.s create mode 100644 test/MC/AsmParser/invalid-input-assertion.s create mode 100644 test/MC/AsmParser/macros-darwin-vararg.s create mode 100644 test/MC/AsmParser/vararg-default-value.s create mode 100644 test/MC/AsmParser/vararg.s create mode 100644 test/MC/COFF/directive-section-characteristics.ll create mode 100644 test/MC/COFF/file.s create mode 100644 test/MC/COFF/initialised-data.ll create mode 100644 test/MC/COFF/invalid-def.s create mode 100644 test/MC/COFF/invalid-endef.s create mode 100644 test/MC/COFF/invalid-scl-range.s create mode 100644 test/MC/COFF/invalid-scl.s create mode 100644 test/MC/COFF/invalid-type-range.s create mode 100644 test/MC/COFF/invalid-type.s create mode 100644 test/MC/COFF/offset.s create mode 100644 test/MC/Disassembler/AArch64/arm64-advsimd.txt create mode 100644 test/MC/Disassembler/AArch64/arm64-arithmetic.txt create mode 100644 test/MC/Disassembler/AArch64/arm64-basic-a64-undefined.txt create mode 100644 test/MC/Disassembler/AArch64/arm64-bitfield.txt create mode 100644 test/MC/Disassembler/AArch64/arm64-branch.txt create mode 100644 test/MC/Disassembler/AArch64/arm64-canonical-form.txt create mode 100644 test/MC/Disassembler/AArch64/arm64-crc32.txt create mode 100644 test/MC/Disassembler/AArch64/arm64-crypto.txt create mode 100644 test/MC/Disassembler/AArch64/arm64-invalid-logical.txt create mode 100644 test/MC/Disassembler/AArch64/arm64-logical.txt create mode 100644 test/MC/Disassembler/AArch64/arm64-memory.txt create mode 100644 test/MC/Disassembler/AArch64/arm64-non-apple-fmov.txt create mode 100644 test/MC/Disassembler/AArch64/arm64-scalar-fp.txt create mode 100644 test/MC/Disassembler/AArch64/arm64-system.txt delete mode 100644 test/MC/Disassembler/ARM64/advsimd.txt delete mode 100644 test/MC/Disassembler/ARM64/arithmetic.txt delete mode 100644 test/MC/Disassembler/ARM64/bitfield.txt delete mode 100644 test/MC/Disassembler/ARM64/branch.txt delete mode 100644 test/MC/Disassembler/ARM64/crc32.txt delete mode 100644 test/MC/Disassembler/ARM64/crypto.txt delete mode 100644 test/MC/Disassembler/ARM64/invalid-logical.txt delete mode 100644 test/MC/Disassembler/ARM64/lit.local.cfg delete mode 100644 test/MC/Disassembler/ARM64/logical.txt delete mode 100644 test/MC/Disassembler/ARM64/memory.txt delete mode 100644 test/MC/Disassembler/ARM64/scalar-fp.txt delete mode 100644 
test/MC/Disassembler/ARM64/system.txt create mode 100644 test/MC/Disassembler/Mips/mips32r6.txt create mode 100644 test/MC/Disassembler/Mips/mips64r6.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_2r.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_2r_msa64.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_2rf.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_3r.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_3rf.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_bit.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_ctrlregs.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_dlsa.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_elm.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_elm_insert.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_elm_insert_msa64.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_elm_insve.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_elm_msa64.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_i10.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_i5.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_i8.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_lsa.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_mi10.txt create mode 100644 test/MC/Disassembler/Mips/msa/test_vec.txt create mode 100644 test/MC/ELF/strtab-suffix-opt.s create mode 100644 test/MC/ELF/subtraction-error.s delete mode 100644 test/MC/ELF/symref.s create mode 100644 test/MC/ELF/symver.s create mode 100644 test/MC/MachO/AArch64/darwin-ARM64-local-label-diff.s create mode 100644 test/MC/MachO/AArch64/darwin-ARM64-reloc.s create mode 100644 test/MC/MachO/AArch64/lit.local.cfg delete mode 100644 test/MC/MachO/ARM64/darwin-ARM64-local-label-diff.s delete mode 100644 test/MC/MachO/ARM64/darwin-ARM64-reloc.s delete mode 100644 test/MC/MachO/ARM64/lit.local.cfg create mode 100644 test/MC/MachO/bad-darwin-x86_64-reloc-expr.s create mode 100644 test/MC/Mips/cpload-bad.s create mode 100644 test/MC/Mips/cpload.s create mode 100644 test/MC/Mips/elf_eflags_nan2008.s create mode 100644 test/MC/Mips/elf_eflags_nanlegacy.s create mode 100644 test/MC/Mips/llvm-mc-fixup-endianness.s create mode 100644 test/MC/Mips/mips1/invalid-mips2-wrong-error.s create mode 100644 test/MC/Mips/mips1/invalid-mips2.s create mode 100644 test/MC/Mips/mips1/invalid-mips3-wrong-error.s create mode 100644 test/MC/Mips/mips1/invalid-mips3.s create mode 100644 test/MC/Mips/mips1/invalid-mips4-wrong-error.s create mode 100644 test/MC/Mips/mips1/invalid-mips4.s create mode 100644 test/MC/Mips/mips1/invalid-mips5-wrong-error.s create mode 100644 test/MC/Mips/mips1/invalid-mips5.s create mode 100644 test/MC/Mips/mips2/invalid-mips3-wrong-error.s create mode 100644 test/MC/Mips/mips2/invalid-mips3.s create mode 100644 test/MC/Mips/mips2/invalid-mips32.s create mode 100644 test/MC/Mips/mips2/invalid-mips32r2-xfail.s create mode 100644 test/MC/Mips/mips2/invalid-mips32r2.s create mode 100644 test/MC/Mips/mips2/invalid-mips4-wrong-error.s create mode 100644 test/MC/Mips/mips2/invalid-mips4.s create mode 100644 test/MC/Mips/mips2/invalid-mips5-wrong-error.s create mode 100644 test/MC/Mips/mips2/invalid-mips5.s delete mode 100644 test/MC/Mips/mips2/valid-xfail.s create mode 100644 test/MC/Mips/mips3/invalid-mips4.s create mode 100644 test/MC/Mips/mips3/invalid-mips5-wrong-error.s create mode 100644 test/MC/Mips/mips3/invalid-mips5.s delete mode 100644 test/MC/Mips/mips3/valid-xfail.s create mode 
100644 test/MC/Mips/mips32/invalid-mips64.s create mode 100644 test/MC/Mips/mips32r2/invalid-mips64r2.s create mode 100644 test/MC/Mips/mips32r6/invalid-mips1-wrong-error.s create mode 100644 test/MC/Mips/mips32r6/invalid-mips1.s create mode 100644 test/MC/Mips/mips32r6/invalid-mips2-wrong-error.s create mode 100644 test/MC/Mips/mips32r6/invalid-mips2.s create mode 100644 test/MC/Mips/mips32r6/invalid-mips32-wrong-error.s create mode 100644 test/MC/Mips/mips32r6/relocations.s create mode 100644 test/MC/Mips/mips32r6/valid-xfail.s create mode 100644 test/MC/Mips/mips32r6/valid.s create mode 100644 test/MC/Mips/mips4/invalid-mips5-wrong-error.s create mode 100644 test/MC/Mips/mips4/invalid-mips5.s delete mode 100644 test/MC/Mips/mips4/invalid-mips64-xfail.s create mode 100644 test/MC/Mips/mips5/invalid-mips64.s create mode 100644 test/MC/Mips/mips5/invalid-mips64r2-xfail.s create mode 100644 test/MC/Mips/mips5/invalid-mips64r2.s create mode 100644 test/MC/Mips/mips64r6/invalid-mips1-wrong-error.s create mode 100644 test/MC/Mips/mips64r6/invalid-mips1.s create mode 100644 test/MC/Mips/mips64r6/invalid-mips2.s create mode 100644 test/MC/Mips/mips64r6/invalid-mips3-wrong-error.s create mode 100644 test/MC/Mips/mips64r6/invalid-mips3.s create mode 100644 test/MC/Mips/mips64r6/invalid-mips5-wrong-error.s create mode 100644 test/MC/Mips/mips64r6/relocations.s create mode 100644 test/MC/Mips/mips64r6/valid-xfail.s create mode 100644 test/MC/Mips/mips64r6/valid.s create mode 100644 test/Object/Inputs/COFF/weak-external.yaml create mode 100644 test/Object/Inputs/macho-text-data-bss.macho-x86_64 create mode 100644 test/Object/Inputs/macho-universal-archive.x86_64.i386 create mode 100755 test/Object/Inputs/relocation-dynamic.elf-i386 create mode 100644 test/Object/Inputs/relocation-relocatable.elf-i386 create mode 100644 test/Object/X86/yaml2obj-elf-x86-rel.yaml create mode 100644 test/Object/ar-error.test create mode 100644 test/Object/obj2yaml-coff-weak-external.test create mode 100644 test/Object/size-trivial-macho.test create mode 100644 test/Object/yaml2obj-elf-rel.yaml create mode 100644 test/Object/yaml2obj-elf-section-invalid-size.yaml create mode 100644 test/Other/optimization-remarks-inline.ll create mode 100644 test/TableGen/listconcat.td create mode 100644 test/Transforms/AddDiscriminators/no-discriminators.ll create mode 100644 test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v7.ll create mode 100644 test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v8.ll create mode 100644 test/Transforms/AtomicExpandLoadLinked/ARM/lit.local.cfg create mode 100644 test/Transforms/CodeGenPrepare/X86/sink-addrspacecast.ll create mode 100644 test/Transforms/ConstantHoisting/AArch64/const-addr.ll create mode 100644 test/Transforms/ConstantHoisting/AArch64/large-immediate.ll create mode 100644 test/Transforms/ConstantHoisting/AArch64/lit.local.cfg create mode 100644 test/Transforms/ConstantHoisting/PowerPC/const-base-addr.ll create mode 100644 test/Transforms/ConstantHoisting/PowerPC/lit.local.cfg create mode 100644 test/Transforms/ConstantHoisting/PowerPC/masks.ll create mode 100644 test/Transforms/ConstantHoisting/X86/cast-inst.ll create mode 100644 test/Transforms/ConstantHoisting/X86/large-immediate.ll create mode 100644 test/Transforms/GVN/load-pre-nonlocal.ll create mode 100644 test/Transforms/GlobalDCE/global_ctors.ll create mode 100644 test/Transforms/GlobalDCE/global_ctors_integration.ll create mode 100644 test/Transforms/GlobalMerge/AArch64/arm64.ll create mode 100644 
test/Transforms/GlobalMerge/AArch64/lit.local.cfg delete mode 100644 test/Transforms/GlobalMerge/ARM64/arm64.ll delete mode 100644 test/Transforms/GlobalMerge/ARM64/lit.local.cfg create mode 100644 test/Transforms/IndVarSimplify/pr18223.ll delete mode 100644 test/Transforms/Inline/2010-05-31-ByvalTailcall.ll create mode 100644 test/Transforms/Inline/byval-tail-call.ll create mode 100644 test/Transforms/Inline/byval_lifetime.ll create mode 100644 test/Transforms/Inline/inline-vla.ll create mode 100644 test/Transforms/Inline/optimization-remarks.ll create mode 100644 test/Transforms/Inline/switch.ll create mode 100644 test/Transforms/InstCombine/OverlappingInsertvalues.ll create mode 100644 test/Transforms/InstCombine/blend_x86.ll create mode 100644 test/Transforms/InstCombine/overflow-mul.ll create mode 100644 test/Transforms/InstCombine/pr19420.ll create mode 100644 test/Transforms/InstSimplify/dead-code-removal.ll create mode 100644 test/Transforms/Internalize/local-visibility.ll create mode 100644 test/Transforms/LoopStrengthReduce/AArch64/lit.local.cfg create mode 100644 test/Transforms/LoopStrengthReduce/AArch64/lsr-memcpy.ll create mode 100644 test/Transforms/LoopStrengthReduce/AArch64/lsr-memset.ll create mode 100644 test/Transforms/LoopStrengthReduce/AArch64/req-regs.ll delete mode 100644 test/Transforms/LoopStrengthReduce/ARM64/lit.local.cfg delete mode 100644 test/Transforms/LoopStrengthReduce/ARM64/lsr-memcpy.ll delete mode 100644 test/Transforms/LoopStrengthReduce/ARM64/lsr-memset.ll create mode 100644 test/Transforms/LoopUnroll/loop-remarks.ll create mode 100644 test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll create mode 100644 test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll create mode 100644 test/Transforms/LoopVectorize/AArch64/gather-cost.ll create mode 100644 test/Transforms/LoopVectorize/AArch64/lit.local.cfg delete mode 100644 test/Transforms/LoopVectorize/ARM64/gather-cost.ll delete mode 100644 test/Transforms/LoopVectorize/ARM64/lit.local.cfg create mode 100644 test/Transforms/LoopVectorize/X86/vect.omp.force.ll create mode 100644 test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll create mode 100644 test/Transforms/LoopVectorize/X86/vectorization-remarks.ll create mode 100644 test/Transforms/LoopVectorize/vect.omp.persistence.ll create mode 100644 test/Transforms/LoopVectorize/vect.stats.ll create mode 100644 test/Transforms/MergeFunc/mergefunc-struct-return.ll create mode 100644 test/Transforms/SLPVectorizer/AArch64/lit.local.cfg create mode 100644 test/Transforms/SLPVectorizer/AArch64/mismatched-intrinsics.ll delete mode 100644 test/Transforms/SLPVectorizer/ARM64/lit.local.cfg delete mode 100644 test/Transforms/SLPVectorizer/ARM64/mismatched-intrinsics.ll create mode 100644 test/Transforms/SLPVectorizer/X86/align.ll create mode 100644 test/Transforms/SLPVectorizer/X86/call.ll create mode 100644 test/Transforms/SLPVectorizer/X86/consecutive-access.ll create mode 100644 test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll create mode 100644 test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll create mode 100644 test/Transforms/SLPVectorizer/X86/value-bug.ll create mode 100644 test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lit.local.cfg create mode 100644 test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll create mode 100644 test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll create mode 100644 test/Transforms/SimplifyCFG/extract-cost.ll delete mode 100644 test/Verifier/aliasing-chain.ll delete mode 
100644 test/Verifier/bitcast-alias-address-space.ll create mode 100644 test/Verifier/global-ctors.ll create mode 100644 test/Verifier/inalloca3.ll create mode 100644 test/Verifier/musttail-invalid.ll create mode 100644 test/Verifier/musttail-valid.ll create mode 100644 test/Verifier/sret.ll create mode 100644 test/tools/llvm-cov/Inputs/range_based_for.gcda create mode 100644 test/tools/llvm-cov/Inputs/range_based_for.gcno create mode 100644 test/tools/llvm-cov/Inputs/test_long_file_names.output create mode 100644 test/tools/llvm-cov/Inputs/test_long_paths.output create mode 100644 test/tools/llvm-cov/Inputs/test_missing.cpp.gcov create mode 100644 test/tools/llvm-cov/Inputs/test_missing.h.gcov create mode 100644 test/tools/llvm-cov/Inputs/test_missing.output create mode 100644 test/tools/llvm-cov/Inputs/test_no_output.output create mode 100644 test/tools/llvm-cov/range_based_for.cpp create mode 100644 test/tools/llvm-objdump/Inputs/file-aux-record.yaml create mode 100755 test/tools/llvm-objdump/Inputs/file.obj.coff-arm create mode 100644 test/tools/llvm-objdump/coff-file.test create mode 100644 test/tools/llvm-objdump/coff-non-null-terminated-file.test create mode 100644 test/tools/llvm-profdata/Inputs/no-counts.profdata create mode 100644 test/tools/llvm-profdata/raw-two-profiles.test create mode 100755 test/tools/llvm-readobj/Inputs/dynamic-table-exe.x86 create mode 100644 test/tools/llvm-readobj/Inputs/file-aux-record.yaml create mode 100644 test/tools/llvm-readobj/Inputs/file-multiple-aux-records.yaml create mode 100644 test/tools/llvm-readobj/coff-file-sections-reading.test create mode 100644 test/tools/llvm-readobj/coff-non-null-terminated-file.test create mode 100644 tools/llvm-readobj/Win64EHDumper.cpp create mode 100644 tools/llvm-readobj/Win64EHDumper.h create mode 100644 tools/obj2yaml/Error.cpp create mode 100644 tools/obj2yaml/Error.h create mode 100644 tools/obj2yaml/elf2yaml.cpp create mode 100644 tools/opt/PassRegistry.def create mode 100644 unittests/Analysis/LazyCallGraphTest.cpp create mode 100644 unittests/Analysis/MixedTBAATest.cpp create mode 100644 unittests/IR/UserTest.cpp create mode 100644 unittests/Object/StringTableBuilderTest.cpp create mode 100644 unittests/Support/BranchProbabilityTest.cpp create mode 100644 unittests/Support/IteratorTest.cpp create mode 100644 utils/PerfectShuffle/CMakeLists.txt create mode 100644 utils/TableGen/module.modulemap diff --git a/.arcconfig b/.arcconfig index 4711195..0632311 100644 --- a/.arcconfig +++ b/.arcconfig @@ -1,4 +1,4 @@ { "project_id" : "llvm", - "conduit_uri" : "http://llvm-reviews.chandlerc.com/" + "conduit_uri" : "http://reviews.llvm.org/" } diff --git a/.clang-format b/.clang-format index 9b3aa8b..5bead5f 100644 --- a/.clang-format +++ b/.clang-format @@ -1 +1,2 @@ BasedOnStyle: LLVM + diff --git a/.gitignore b/.gitignore index 2462830..dd0f148 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,8 @@ # vim swap files .*.swp .sw? +#OS X specific files. +.DS_store #==============================================================================# # Explicit files to ignore (only matches one). @@ -35,7 +37,6 @@ compile_commands.json #==============================================================================# # External projects that are tracked independently. projects/* -!projects/sample !projects/CMakeLists.txt !projects/Makefile # Clang, which is tracked independently. 
diff --git a/Android.mk b/Android.mk index 9f6e8a5..3c46a22 100644 --- a/Android.mk +++ b/Android.mk @@ -26,6 +26,7 @@ subdirs := \ lib/MC/MCParser \ lib/Object \ lib/Option \ + lib/ProfileData \ lib/Support \ lib/TableGen \ lib/Target \ diff --git a/CMakeLists.txt b/CMakeLists.txt index 624f755..0d6eead 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,7 +128,6 @@ set(LLVM_LIBDIR_SUFFIX "" CACHE STRING "Define suffix of library directory name set(LLVM_ALL_TARGETS AArch64 - ARM64 ARM CppBackend Hexagon @@ -144,7 +143,7 @@ set(LLVM_ALL_TARGETS ) # List of targets with JIT support: -set(LLVM_TARGETS_WITH_JIT X86 PowerPC AArch64 ARM64 ARM Mips SystemZ) +set(LLVM_TARGETS_WITH_JIT X86 PowerPC AArch64 ARM Mips SystemZ) set(LLVM_TARGETS_TO_BUILD "all" CACHE STRING "Semicolon-separated list of targets to build, or \"all\".") @@ -199,7 +198,7 @@ option(LLVM_ENABLE_PIC "Build Position-Independent Code" ON) # MSVC has a gazillion warnings with this. if( MSVC ) option(LLVM_ENABLE_WARNINGS "Enable compiler warnings." OFF) -else( MSVC ) +else() option(LLVM_ENABLE_WARNINGS "Enable compiler warnings." ON) endif() @@ -287,7 +286,8 @@ option(LLVM_INCLUDE_TESTS "Generate build targets for the LLVM unit tests." ON) option (LLVM_BUILD_DOCS "Build the llvm documentation." OFF) option (LLVM_INCLUDE_DOCS "Generate build targets for llvm documentation." ON) -option (LLVM_ENABLE_DOXYGEN "Use doxygen to generate llvm documentation." OFF) +option (LLVM_ENABLE_DOXYGEN "Use doxygen to generate llvm API documentation." OFF) +option (LLVM_ENABLE_SPHINX "Use Sphinx to generate llvm documentation." OFF) option (LLVM_BUILD_EXTERNAL_COMPILER_RT "Build compiler-rt as an external project." OFF) @@ -497,6 +497,7 @@ add_subdirectory(lib) if( LLVM_INCLUDE_UTILS ) add_subdirectory(utils/FileCheck) add_subdirectory(utils/FileUpdate) + add_subdirectory(utils/PerfectShuffle) add_subdirectory(utils/count) add_subdirectory(utils/not) add_subdirectory(utils/llvm-lit) diff --git a/CREDITS.TXT b/CREDITS.TXT index 750a1c2..311a661 100644 --- a/CREDITS.TXT +++ b/CREDITS.TXT @@ -132,6 +132,7 @@ W: http://www-src.lip6.fr/homepages/Nicolas.Geoffray/ D: PPC backend fixes for Linux N: Louis Gerbarg +E: lgg@apple.com D: Portions of the PowerPC backend N: Saem Ghani diff --git a/Makefile.rules b/Makefile.rules index cb4abaf..9417971 100644 --- a/Makefile.rules +++ b/Makefile.rules @@ -1063,8 +1063,9 @@ ifeq ($(HOST_OS), $(filter $(HOST_OS), Cygwin MingW)) LLVMLibsOptions += -Wl,--enable-auto-import,--enable-runtime-pseudo-reloc \ -L $(SharedLibDir) endif -LLVMLibsOptions += -lLLVM-$(LLVMVersion) -LLVMLibsPaths += $(SharedLibDir)/$(SharedPrefix)LLVM-$(LLVMVersion)$(SHLIBEXT) +LLVM_SO_NAME = LLVM-$(LLVM_VERSION_MAJOR).$(LLVM_VERSION_MINOR)$(LLVM_VERSION_SUFFIX) +LLVMLibsOptions += -l$(LLVM_SO_NAME) +LLVMLibsPaths += $(SharedLibDir)/$(SharedPrefix)$(LLVM_SO_NAME)$(SHLIBEXT) else ifndef NO_LLVM_CONFIG @@ -1180,7 +1181,12 @@ LibName.O := $(LibDir)/$(LIBRARYNAME).o #--------------------------------------------------------- ifdef SHARED_LIBRARY -all-local:: $(LibName.SO) +all-local:: $(AliasName.SO) + +$(AliasName.SO): $(LibName.SO) +ifdef SHARED_ALIAS + $(Verb) $(AliasTool) $(BaseLibName.SO) $(AliasName.SO) +endif ifdef EXPORTED_SYMBOL_FILE $(LibName.SO): $(NativeExportsFile) @@ -1232,12 +1238,15 @@ $(DestSharedLib): $(LibName.SO) $(DestSharedLibDir) $(Verb) $(INSTALL) $(LibName.SO) $(DestSharedLib) ifdef SHARED_ALIAS $(Echo) Creating alias from $(DestSharedLib) to $(DestSharedAlias) - $(Verb) $(AliasTool) $(DestSharedLib) $(DestSharedAlias) + 
$(Verb) $(AliasTool) $(BaseLibName.SO) $(DestSharedAlias) endif uninstall-local:: $(Echo) Uninstalling $(BuildMode) Shared Library $(DestSharedLib) -$(Verb) $(RM) -f $(DestSharedLib) +ifdef SHARED_ALIAS + -$(Verb) $(RM) -f $(DestSharedAlias) +endif endif endif @@ -1713,8 +1722,14 @@ $(ObjDir)/%GenDFAPacketizer.inc.tmp : %.td $(ObjDir)/.dir $(LLVM_TBLGEN) $(Echo) "Building $(.td.expanded. This is useful for debugging. +$(TARGET:%=%.td.expanded): \ +%.td.expanded : %.td $(LLVM_TBLGEN) + $(Echo) "Building a fully expanded version of $( unit */ +CAMLprim value llvm_initialize_all(value Unit) { + LLVMInitializeAllTargetInfos(); + LLVMInitializeAllTargets(); + LLVMInitializeAllTargetMCs(); + LLVMInitializeAllAsmPrinters(); + LLVMInitializeAllAsmParsers(); + return Val_unit; +} diff --git a/bindings/ocaml/all_backends/llvm_all_backends.ml b/bindings/ocaml/all_backends/llvm_all_backends.ml new file mode 100644 index 0000000..f4f4725 --- /dev/null +++ b/bindings/ocaml/all_backends/llvm_all_backends.ml @@ -0,0 +1,10 @@ +(*===-- llvm_all_backends.ml - LLVM OCaml Interface -----------*- OCaml -*-===* + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + * + *===----------------------------------------------------------------------===*) + +external initialize : unit -> unit = "llvm_initialize_all" diff --git a/bindings/ocaml/all_backends/llvm_all_backends.mli b/bindings/ocaml/all_backends/llvm_all_backends.mli new file mode 100644 index 0000000..1808544 --- /dev/null +++ b/bindings/ocaml/all_backends/llvm_all_backends.mli @@ -0,0 +1,11 @@ +(*===-- llvm_all_backends.mli - LLVM OCaml Interface ----------*- OCaml -*-===* + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. 
+ * + *===----------------------------------------------------------------------===*) + +(** Initialize all the backends targets *) +val initialize : unit -> unit diff --git a/bindings/ocaml/llvm/META.llvm.in b/bindings/ocaml/llvm/META.llvm.in index c241ea5..edb84e0 100644 --- a/bindings/ocaml/llvm/META.llvm.in +++ b/bindings/ocaml/llvm/META.llvm.in @@ -93,3 +93,11 @@ package "linker" ( archive(byte) = "llvm_linker.cma" archive(native) = "llvm_linker.cmxa" ) + +package "all_backends" ( + requires = "llvm" + version = "@PACKAGE_VERSION@" + description = "All backends for LLVM" + archive(byte) = "llvm_all_backends.cma" + archive(native) = "llvm_all_backends.cmxa" +) diff --git a/bindings/python/llvm/object.py b/bindings/python/llvm/object.py index 473aa3a..4e912ed 100644 --- a/bindings/python/llvm/object.py +++ b/bindings/python/llvm/object.py @@ -78,7 +78,10 @@ Here are some examples on how to perform iteration: """ from ctypes import c_char_p +from ctypes import c_char +from ctypes import POINTER from ctypes import c_uint64 +from ctypes import string_at from .common import CachedProperty from .common import LLVMObject @@ -211,7 +214,12 @@ class Section(LLVMObject): if self.expired: raise Exception('Section instance has expired.') - return lib.LLVMGetSectionContents(self) + siz = self.size + + r = lib.LLVMGetSectionContents(self) + if r: + return string_at(r, siz) + return None @CachedProperty def address(self): @@ -311,14 +319,6 @@ class Symbol(LLVMObject): return lib.LLVMGetSymbolAddress(self) @CachedProperty - def file_offset(self): - """The offset of this symbol in the file, in long bytes.""" - if self.expired: - raise Exception('Symbol instance has expired.') - - return lib.LLVMGetSymbolFileOffset(self) - - @CachedProperty def size(self): """The size of the symbol, in long bytes.""" if self.expired: @@ -345,7 +345,6 @@ class Symbol(LLVMObject): """Cache all cacheable properties.""" getattr(self, 'name') getattr(self, 'address') - getattr(self, 'file_offset') getattr(self, 'size') def expire(self): @@ -471,7 +470,8 @@ def register_library(library): library.LLVMGetSectionSize.restype = c_uint64 library.LLVMGetSectionContents.argtypes = [c_object_p] - library.LLVMGetSectionContents.restype = c_char_p + # Can't use c_char_p here as it isn't a NUL-terminated string. 
+ library.LLVMGetSectionContents.restype = POINTER(c_char) library.LLVMGetSectionAddress.argtypes = [c_object_p] library.LLVMGetSectionAddress.restype = c_uint64 @@ -495,9 +495,6 @@ def register_library(library): library.LLVMGetSymbolAddress.argtypes = [Symbol] library.LLVMGetSymbolAddress.restype = c_uint64 - library.LLVMGetSymbolFileOffset.argtypes = [Symbol] - library.LLVMGetSymbolFileOffset.restype = c_uint64 - library.LLVMGetSymbolSize.argtypes = [Symbol] library.LLVMGetSymbolSize.restype = c_uint64 diff --git a/bindings/python/llvm/tests/test_object.py b/bindings/python/llvm/tests/test_object.py index 7ff981b..3f92d81 100644 --- a/bindings/python/llvm/tests/test_object.py +++ b/bindings/python/llvm/tests/test_object.py @@ -23,6 +23,7 @@ class TestObjectFile(TestBase): assert isinstance(section.size, long) assert isinstance(section.contents, str) assert isinstance(section.address, long) + assert len(section.contents) == section.size self.assertGreater(count, 0) @@ -39,7 +40,6 @@ class TestObjectFile(TestBase): assert isinstance(symbol.name, str) assert isinstance(symbol.address, long) assert isinstance(symbol.size, long) - assert isinstance(symbol.file_offset, long) self.assertGreater(count, 0) diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake index f007b37..1325e79 100755 --- a/cmake/config-ix.cmake +++ b/cmake/config-ix.cmake @@ -17,6 +17,11 @@ if( UNIX AND NOT BEOS ) # Used by check_symbol_exists: set(CMAKE_REQUIRED_LIBRARIES m) endif() +# x86_64 FreeBSD 9.2 requires libcxxrt to be specified explicitly. +if( CMAKE_SYSTEM MATCHES "FreeBSD-9.2-RELEASE" AND + CMAKE_SIZEOF_VOID_P EQUAL 8 ) + list(APPEND CMAKE_REQUIRED_LIBRARIES "cxxrt") +endif() # Helper macros and functions macro(add_cxx_include result files) @@ -367,7 +372,7 @@ elseif (LLVM_NATIVE_ARCH MATCHES "powerpc") elseif (LLVM_NATIVE_ARCH MATCHES "aarch64") set(LLVM_NATIVE_ARCH AArch64) elseif (LLVM_NATIVE_ARCH MATCHES "arm64") - set(LLVM_NATIVE_ARCH ARM64) + set(LLVM_NATIVE_ARCH AArch64) elseif (LLVM_NATIVE_ARCH MATCHES "arm") set(LLVM_NATIVE_ARCH ARM) elseif (LLVM_NATIVE_ARCH MATCHES "mips") @@ -481,7 +486,7 @@ set(LLVM_PREFIX ${CMAKE_INSTALL_PREFIX}) if (LLVM_ENABLE_DOXYGEN) message(STATUS "Doxygen enabled.") - find_package(Doxygen) + find_package(Doxygen REQUIRED) if (DOXYGEN_FOUND) # If we find doxygen and we want to enable doxygen by default create a @@ -500,3 +505,13 @@ if (LLVM_ENABLE_DOXYGEN) else() message(STATUS "Doxygen disabled.") endif() + +if (LLVM_ENABLE_SPHINX) + message(STATUS "Sphinx enabled.") + find_package(Sphinx REQUIRED) + if (LLVM_BUILD_DOCS) + add_custom_target(sphinx ALL) + endif() +else() + message(STATUS "Sphinx disabled.") +endif() diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index 41902d2..69ffa5b 100644 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -276,6 +276,11 @@ function(llvm_add_library name) endif() if(ARG_SHARED) + if(WIN32) + set_target_properties(${name} PROPERTIES + PREFIX "" + ) + endif() if (MSVC) set_target_properties(${name} PROPERTIES @@ -627,11 +632,12 @@ function(add_lit_target target comment) if (NOT CMAKE_CFG_INTDIR STREQUAL ".") list(APPEND LIT_ARGS --param build_mode=${CMAKE_CFG_INTDIR}) endif () - set(LIT_COMMAND - ${PYTHON_EXECUTABLE} - ${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py - ${LIT_ARGS} - ) + if (LLVM_MAIN_SRC_DIR) + set (LIT_COMMAND ${PYTHON_EXECUTABLE} ${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py) + else() + find_program(LIT_COMMAND llvm-lit) + endif () + list(APPEND LIT_COMMAND ${LIT_ARGS}) foreach(param 
${ARG_PARAMS}) list(APPEND LIT_COMMAND --param ${param}) endforeach() diff --git a/cmake/modules/AddSphinxTarget.cmake b/cmake/modules/AddSphinxTarget.cmake new file mode 100644 index 0000000..fc28a49 --- /dev/null +++ b/cmake/modules/AddSphinxTarget.cmake @@ -0,0 +1,56 @@ +# Handy function for creating the different Sphinx targets. +# +# ``builder`` should be one of the supported builders used by +# the sphinx-build command. +# +# ``project`` should be the project name +function (add_sphinx_target builder project) + set(SPHINX_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/${builder}") + set(SPHINX_DOC_TREE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees") + set(SPHINX_TARGET_NAME docs-${project}-${builder}) + add_custom_target(${SPHINX_TARGET_NAME} + COMMAND ${SPHINX_EXECUTABLE} + -b ${builder} + -d "${SPHINX_DOC_TREE_DIR}" + -q # Quiet: no output other than errors and warnings. + -W # Warnings are errors. + "${CMAKE_CURRENT_SOURCE_DIR}" # Source + "${SPHINX_BUILD_DIR}" # Output + COMMENT + "Generating ${builder} Sphinx documentation for ${project}") + + # When "clean" target is run, remove the Sphinx build directory + set_property(DIRECTORY APPEND PROPERTY + ADDITIONAL_MAKE_CLEAN_FILES + "${SPHINX_BUILD_DIR}") + + # We need to remove ${SPHINX_DOC_TREE_DIR} when make clean is run + # but we should only add this path once + get_property(_CURRENT_MAKE_CLEAN_FILES + DIRECTORY PROPERTY ADDITIONAL_MAKE_CLEAN_FILES) + list(FIND _CURRENT_MAKE_CLEAN_FILES "${SPHINX_DOC_TREE_DIR}" _INDEX) + if (_INDEX EQUAL -1) + set_property(DIRECTORY APPEND PROPERTY + ADDITIONAL_MAKE_CLEAN_FILES + "${SPHINX_DOC_TREE_DIR}") + endif() + + if (LLVM_BUILD_DOCS) + add_dependencies(sphinx ${SPHINX_TARGET_NAME}) + + # Handle installation + if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) + if (builder STREQUAL man) + # FIXME: We might not ship all the tools that these man pages describe + install(DIRECTORY "${SPHINX_BUILD_DIR}/" # Slash indicates contents of + DESTINATION share/man/man1) + + elseif (builder STREQUAL html) + install(DIRECTORY "${SPHINX_BUILD_DIR}" + DESTINATION "share/doc/${project}") + else() + message(WARNING Installation of ${builder} not supported) + endif() + endif() + endif() +endfunction() diff --git a/cmake/modules/FindSphinx.cmake b/cmake/modules/FindSphinx.cmake new file mode 100644 index 0000000..a2adcae --- /dev/null +++ b/cmake/modules/FindSphinx.cmake @@ -0,0 +1,25 @@ +# CMake find_package() Module for Sphinx documentation generator +# http://sphinx-doc.org/ +# +# Example usage: +# +# find_package(Sphinx) +# +# If successful the following variables will be defined +# SPHINX_FOUND +# SPHINX_EXECUTABLE + +find_program(SPHINX_EXECUTABLE + NAMES sphinx-build sphinx-build2 + DOC "Path to sphinx-build executable") + +# Handle REQUIRED and QUIET arguments +# this will also set SPHINX_FOUND to true if SPHINX_EXECUTABLE exists +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Sphinx + "Failed to locate sphinx-build executable" + SPHINX_EXECUTABLE) + +# Provide options for controlling different types of output +option(SPHINX_OUTPUT_HTML "Output standalone HTML files" ON) +option(SPHINX_OUTPUT_MAN "Output man pages" ON) diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index d5afc62..447ba52 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -56,13 +56,16 @@ if( LLVM_ENABLE_ASSERTIONS ) if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" ) add_definitions( -UNDEBUG ) # Also remove /D NDEBUG to avoid MSVC warnings 
about conflicting defines. - set(REGEXP_NDEBUG "(^| )[/-]D *NDEBUG($| )") - string (REGEX REPLACE "${REGEXP_NDEBUG}" " " - CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") - string (REGEX REPLACE "${REGEXP_NDEBUG}" " " - CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") - string (REGEX REPLACE "${REGEXP_NDEBUG}" " " - CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}") + foreach (flags_var_to_scrub + CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_CXX_FLAGS_MINSIZEREL + CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS_MINSIZEREL) + string (REGEX REPLACE "(^| )[/-]D *NDEBUG($| )" " " + "${flags_var_to_scrub}" "${${flags_var_to_scrub}}") + endforeach() endif() else() if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "RELEASE" ) @@ -295,6 +298,17 @@ elseif( LLVM_COMPILER_IS_GCC_COMPATIBLE ) append_if(USE_NO_MAYBE_UNINITIALIZED "-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS) check_cxx_compiler_flag("-Werror -Wnon-virtual-dtor" CXX_SUPPORTS_NON_VIRTUAL_DTOR_FLAG) append_if(CXX_SUPPORTS_NON_VIRTUAL_DTOR_FLAG "-Wnon-virtual-dtor" CMAKE_CXX_FLAGS) + + # Check if -Wcomment is OK with an // comment ending with '\' if the next + # line is also a // comment. + set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} -Werror -Wcomment) + CHECK_C_SOURCE_COMPILES("// \\\\\\n//\\nint main() {return 0;}" + C_WCOMMENT_ALLOWS_LINE_WRAP) + set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) + if (NOT C_WCOMMENT_ALLOWS_LINE_WRAP) + append("-Wno-comment" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + endif() endif (LLVM_ENABLE_WARNINGS) append_if(LLVM_ENABLE_WERROR "-Werror" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) if (LLVM_ENABLE_CXX1Y) diff --git a/cmake/modules/LLVMConfig.cmake.in b/cmake/modules/LLVMConfig.cmake.in index 2f8d037..780001a 100644 --- a/cmake/modules/LLVMConfig.cmake.in +++ b/cmake/modules/LLVMConfig.cmake.in @@ -4,6 +4,7 @@ set(LLVM_VERSION_MAJOR @LLVM_VERSION_MAJOR@) set(LLVM_VERSION_MINOR @LLVM_VERSION_MINOR@) +set(LLVM_VERSION_PATCH @LLVM_VERSION_PATCH@) set(LLVM_PACKAGE_VERSION @PACKAGE_VERSION@) set(LLVM_COMMON_DEPENDS @LLVM_COMMON_DEPENDS@) diff --git a/cmake/modules/LLVMConfigVersion.cmake.in b/cmake/modules/LLVMConfigVersion.cmake.in index add5aa9..e9ac4ed 100644 --- a/cmake/modules/LLVMConfigVersion.cmake.in +++ b/cmake/modules/LLVMConfigVersion.cmake.in @@ -1 +1,13 @@ -set(PACKAGE_VERSION "@PACKAGE_VERSION@") \ No newline at end of file +set(PACKAGE_VERSION "@PACKAGE_VERSION@") + +# LLVM is API-compatible only with matching major.minor versions +# and patch versions not less than that requested. 
+if("@LLVM_VERSION_MAJOR@.@LLVM_VERSION_MINOR@" VERSION_EQUAL + "${PACKAGE_FIND_VERSION_MAJOR}.${PACKAGE_FIND_VERSION_MINOR}" + AND NOT "@LLVM_VERSION_PATCH@" VERSION_LESS "${PACKAGE_FIND_VERSION_PATCH}") + set(PACKAGE_VERSION_COMPATIBLE 1) + if("@LLVM_VERSION_PATCH@" VERSION_EQUAL + "${PACKAGE_FIND_VERSION_PATCH}") + set(PACKAGE_VERSION_EXACT 1) + endif() +endif() diff --git a/cmake/modules/Makefile b/cmake/modules/Makefile index 8f20ddf..265c1f8 100644 --- a/cmake/modules/Makefile +++ b/cmake/modules/Makefile @@ -24,6 +24,7 @@ $(PROJ_OBJ_DIR)/LLVMConfig.cmake: LLVMConfig.cmake.in $(LLVMBuildCMakeFrag) -e 's/@LLVM_CONFIG_CODE@/set(LLVM_INSTALL_PREFIX "'"$(subst /,\/,$(PROJ_prefix))"'")/' \ -e 's/@LLVM_VERSION_MAJOR@/'"$(LLVM_VERSION_MAJOR)"'/' \ -e 's/@LLVM_VERSION_MINOR@/'"$(LLVM_VERSION_MINOR)"'/' \ + -e 's/@LLVM_VERSION_PATCH@/'"$(LLVM_VERSION_PATCH)"'/' \ -e 's/@PACKAGE_VERSION@/'"$(LLVMVersion)"'/' \ -e 's/@LLVM_COMMON_DEPENDS@//' \ -e 's/@LLVM_AVAILABLE_LIBS@/'"$(subst -l,,$(LLVMConfigLibs))"'/' \ @@ -51,6 +52,9 @@ $(PROJ_OBJ_DIR)/LLVMConfigVersion.cmake: LLVMConfigVersion.cmake.in $(Echo) 'Generating LLVM CMake package version file' $(Verb) cat $< | sed \ -e 's/@PACKAGE_VERSION@/'"$(LLVMVersion)"'/' \ + -e 's/@LLVM_VERSION_MAJOR@/'"$(LLVM_VERSION_MAJOR)"'/' \ + -e 's/@LLVM_VERSION_MINOR@/'"$(LLVM_VERSION_MINOR)"'/' \ + -e 's/@LLVM_VERSION_PATCH@/'"$(LLVM_VERSION_PATCH)"'/' \ > $@ $(PROJ_OBJ_DIR)/LLVMExports.cmake: $(LLVMBuildCMakeExportsFrag) diff --git a/configure b/configure index 778aa18..e1959df 100755 --- a/configure +++ b/configure @@ -4151,7 +4151,7 @@ else amd64-* | x86_64-*) llvm_cv_target_arch="x86_64" ;; sparc*-*) llvm_cv_target_arch="Sparc" ;; powerpc*-*) llvm_cv_target_arch="PowerPC" ;; - arm64*-*) llvm_cv_target_arch="ARM64" ;; + arm64*-*) llvm_cv_target_arch="AArch64" ;; arm*-*) llvm_cv_target_arch="ARM" ;; aarch64*-*) llvm_cv_target_arch="AArch64" ;; mips-* | mips64-*) llvm_cv_target_arch="Mips" ;; @@ -4188,7 +4188,7 @@ case $host in amd64-* | x86_64-*) host_arch="x86_64" ;; sparc*-*) host_arch="Sparc" ;; powerpc*-*) host_arch="PowerPC" ;; - arm64*-*) host_arch="ARM64" ;; + arm64*-*) host_arch="AArch64" ;; arm*-*) host_arch="ARM" ;; aarch64*-*) host_arch="AArch64" ;; mips-* | mips64-*) host_arch="Mips" ;; @@ -5103,8 +5103,6 @@ else ;; ARM) TARGET_HAS_JIT=1 ;; - AArch64) TARGET_HAS_JIT=0 - ;; Mips) TARGET_HAS_JIT=1 ;; XCore) TARGET_HAS_JIT=0 @@ -5122,7 +5120,7 @@ else esac fi -TARGETS_WITH_JIT="AArch64 ARM ARM64 Mips PowerPC SystemZ X86" +TARGETS_WITH_JIT="ARM AArch64 Mips PowerPC SystemZ X86" TARGETS_WITH_JIT=$TARGETS_WITH_JIT @@ -5359,7 +5357,7 @@ _ACEOF fi -ALL_TARGETS="X86 Sparc PowerPC AArch64 ARM ARM64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600" +ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600" ALL_TARGETS=$ALL_TARGETS @@ -5383,7 +5381,7 @@ case "$enableval" in sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;; powerpc) TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;; aarch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; - arm64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;; + arm64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; arm) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; mipsel) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; diff --git a/docs/ARM-BE-bitcastfail.png b/docs/ARM-BE-bitcastfail.png new file mode 100644 index 0000000..0c83f0b Binary files /dev/null and b/docs/ARM-BE-bitcastfail.png differ diff --git 
a/docs/ARM-BE-bitcastsuccess.png b/docs/ARM-BE-bitcastsuccess.png new file mode 100644 index 0000000..8f3414d Binary files /dev/null and b/docs/ARM-BE-bitcastsuccess.png differ diff --git a/docs/ARM-BE-ld1.png b/docs/ARM-BE-ld1.png new file mode 100644 index 0000000..f58c99d Binary files /dev/null and b/docs/ARM-BE-ld1.png differ diff --git a/docs/ARM-BE-ldr.png b/docs/ARM-BE-ldr.png new file mode 100644 index 0000000..73db07e Binary files /dev/null and b/docs/ARM-BE-ldr.png differ diff --git a/docs/AliasAnalysis.rst b/docs/AliasAnalysis.rst index 712d57d..1cbaee7 100644 --- a/docs/AliasAnalysis.rst +++ b/docs/AliasAnalysis.rst @@ -51,7 +51,7 @@ starting address and size, and function calls are represented as the actual get mod/ref information for arbitrary instructions. All ``AliasAnalysis`` interfaces require that in queries involving multiple -values, values which are not `constants `_ are all +values, values which are not :ref:`constants <constants>` are all defined within the same function. Representation of Pointers @@ -111,7 +111,7 @@ returns MustAlias, PartialAlias, MayAlias, or NoAlias as appropriate. Like all ``AliasAnalysis`` interfaces, the ``alias`` method requires that either the two pointer values be defined within the same function, or at least one of -the values is a `constant `_. +the values is a :ref:`constant <constants>`. .. _Must, May, or No: @@ -126,7 +126,7 @@ used for reading memory. Another is when the memory is freed and reallocated between accesses through one pointer and accesses through the other --- in this case, there is a dependence, but it's mediated by the free and reallocation. -As an exception to this is with the `noalias `_ keyword; +As an exception to this is with the :ref:`noalias <noalias>` keyword; the "irrelevant" dependencies are ignored. The ``MayAlias`` response is used whenever the two pointers might refer to the @@ -246,6 +246,20 @@ analysis run method (``run`` for a ``Pass``, ``runOnFunction`` for a return false; } +Required methods to override +---------------------------- + +You must override the ``getAdjustedAnalysisPointer`` method on all subclasses +of ``AliasAnalysis``. An example implementation of this method would look like: + +.. code-block:: c++ + + void *getAdjustedAnalysisPointer(const void* ID) override { + if (ID == &AliasAnalysis::ID) + return (AliasAnalysis*)this; + return this; + } + Interfaces which may be specified --------------------------------- diff --git a/docs/BigEndianNEON.rst b/docs/BigEndianNEON.rst new file mode 100644 index 0000000..242eb0e --- /dev/null +++ b/docs/BigEndianNEON.rst @@ -0,0 +1,205 @@ +============================================== +Using ARM NEON instructions in big endian mode +============================================== + +.. contents:: + :local: + +Introduction +============ + +Generating code for big endian ARM processors is for the most part straightforward. NEON loads and stores however have some interesting properties that make code generation decisions less obvious in big endian mode. + +The aim of this document is to explain the problem with NEON loads and stores, and the solution that has been implemented in LLVM. + +In this document the term "vector" refers to what the ARM ABI calls a "short vector", which is a sequence of items that can fit in a NEON register. This sequence can be 64 or 128 bits in length, and can consist of 8, 16, 32 or 64 bit items. This document refers to A64 instructions throughout, but is almost entirely applicable to the A32/ARMv7 instruction sets also.
The ABI format for passing vectors in A32 is slightly different to A64. Apart from that, the same concepts apply. + +Example: C-level intrinsics -> assembly +--------------------------------------- + +It may be helpful first to illustrate how C-level ARM NEON intrinsics are lowered to instructions. + +This trivial C function takes a vector of four ints and sets the zero'th lane to the value "42":: + + #include <arm_neon.h> + int32x4_t f(int32x4_t p) { + return vsetq_lane_s32(42, p, 0); + } + +arm_neon.h intrinsics generate "generic" IR where possible (that is, normal IR instructions not ``llvm.arm.neon.*`` intrinsic calls). The above generates:: + + define <4 x i32> @f(<4 x i32> %p) { + %vset_lane = insertelement <4 x i32> %p, i32 42, i32 0 + ret <4 x i32> %vset_lane + } + +Which then becomes the following trivial assembly:: + + f: // @f + movz w8, #0x2a + ins v0.s[0], w8 + ret + +Problem +======= + +The main problem is how vectors are represented in memory and in registers. + +First, a recap. The "endianness" of an item affects its representation in memory only. In a register, a number is just a sequence of bits - 64 bits in the case of AArch64 general purpose registers. Memory, however, is a sequence of addressable units of 8 bits in size. Any number greater than 8 bits must therefore be split up into 8-bit chunks, and endianness describes the order in which these chunks are laid out in memory. + +A "little endian" layout has the least significant byte first (lowest in memory address). A "big endian" layout has the *most* significant byte first. This means that when loading an item from big endian memory, the lowest 8-bits in memory must go in the most significant 8-bits, and so forth. + +``LDR`` and ``LD1`` +=================== + +.. figure:: ARM-BE-ldr.png + :align: right + + Big endian vector load using ``LDR``. + + +A vector is a consecutive sequence of items that are operated on simultaneously. To load a 64-bit vector, 64 bits need to be read from memory. In little endian mode, we can do this by just performing a 64-bit load - ``LDR q0, [foo]``. However if we try this in big endian mode, because of the byte swapping the lane indices end up being swapped! The zero'th item as laid out in memory becomes the n'th lane in the vector. + +.. figure:: ARM-BE-ld1.png + :align: right + + Big endian vector load using ``LD1``. Note that the lanes retain the correct ordering. + + +Because of this, the instruction ``LD1`` performs a vector load but performs byte swapping not on the entire 64 bits, but on the individual items within the vector. This means that the register content is the same as it would have been on a little endian system. + +It may seem that ``LD1`` should suffice to perform vector loads on a big endian machine. However there are pros and cons to the two approaches that make it less than simple to decide which register format to pick. + +There are two options: + + 1. The content of a vector register is the same *as if* it had been loaded with an ``LDR`` instruction. + 2. The content of a vector register is the same *as if* it had been loaded with an ``LD1`` instruction. + +Because ``LD1 == LDR + REV`` and similarly ``LDR == LD1 + REV`` (on a big endian system), we can simulate either type of load with the other type of load plus a ``REV`` instruction. So we're not deciding which instructions to use, but which format to use (which will then influence which instruction is best to use). + +.. The 'clearer' container is required to make the following section header come after the floated + images above. +..
container:: clearer + + Note that throughout this section we only mention loads. Stores have exactly the same problems as their associated loads, so have been skipped for brevity. + + +Considerations +============== + +LLVM IR Lane ordering +--------------------- + +LLVM IR has first class vector types. In LLVM IR, the zero'th element of a vector resides at the lowest memory address. The optimizer relies on this property in certain areas, for example when concatenating vectors together. The intention is for arrays and vectors to have identical memory layouts - ``[4 x i8]`` and ``<4 x i8>`` should be represented the same in memory. Without this property there would be many special cases that the optimizer would have to cleverly handle. + +Use of ``LDR`` would break this lane ordering property. This doesn't preclude the use of ``LDR``, but we would have to do one of two things: + + 1. Insert a ``REV`` instruction to reverse the lane order after every ``LDR``. + 2. Disable all optimizations that rely on lane layout, and for every access to an individual lane (``insertelement``/``extractelement``/``shufflevector``) reverse the lane index. + +AAPCS +----- + +The ARM procedure call standard (AAPCS) defines the ABI for passing vectors between functions in registers. It states: + + When a short vector is transferred between registers and memory it is treated as an opaque object. That is a short vector is stored in memory as if it were stored with a single ``STR`` of the entire register; a short vector is loaded from memory using the corresponding ``LDR`` instruction. On a little-endian system this means that element 0 will always contain the lowest addressed element of a short vector; on a big-endian system element 0 will contain the highest-addressed element of a short vector. + + -- Procedure Call Standard for the ARM 64-bit Architecture (AArch64), 4.1.2 Short Vectors + +The use of ``LDR`` and ``STR`` as the ABI defines has at least one advantage over ``LD1`` and ``ST1``. ``LDR`` and ``STR`` are oblivious to the size of the individual lanes of a vector. ``LD1`` and ``ST1`` are not - the lane size is encoded within them. This is important across an ABI boundary, because it would become necessary to know the lane width the callee expects. Consider the following code: + +.. code-block:: c + + + void callee(uint32x2_t v) { + ... + } + + + extern void callee(uint32x2_t); + void caller() { + callee(...); + } + +If ``callee`` changed its signature to ``uint16x4_t``, which is equivalent in register content, and we passed the vector in ``LD1`` format, we'd break this code until ``caller`` was updated and recompiled. + +There is an argument that if the signatures of the two functions are different then the behaviour should be undefined. But there may be functions that are agnostic to the lane layout of the vector, and treating the vector as an opaque value (just loading it and storing it) would be impossible without a common format across ABI boundaries. + +So to preserve ABI compatibility, we need to use the ``LDR`` lane layout across function calls. + +Alignment +--------- + +In strict alignment mode, ``LDR qX`` requires its address to be 128-bit aligned, whereas ``LD1`` only requires it to be as aligned as the lane size. If we canonicalised on using ``LDR``, we'd still need to use ``LD1`` in some places to avoid alignment faults (the result of the ``LD1`` would then need to be reversed with ``REV``). + +Most operating systems however do not run with alignment faults enabled, so this is often not an issue.
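To make the practical difference between the two candidate formats concrete before summarising, here is a small standalone C++ model (an illustrative sketch only; it is not LLVM code and makes no claim about the backend's implementation). It builds the big endian memory image of a ``<2 x i32>`` vector holding ``{1, 2}`` and compares the lane numbering produced by an ``LDR``-style load with that produced by an ``LD1``-style load.

.. code-block:: c++

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Big endian memory image of <2 x i32> {1, 2}; lane 0 at the lowest address.
      const uint8_t mem[8] = {0, 0, 0, 1, 0, 0, 0, 2};

      // LDR-style load on a big endian core: the lowest-addressed byte becomes
      // the most significant byte of the whole 64-bit register.
      uint64_t ldr = 0;
      for (int i = 0; i < 8; ++i)
        ldr = (ldr << 8) | mem[i];

      // LD1-style load (32-bit elements): each element is byte-swapped
      // individually, and element 0 lands in lane 0 (the low 32 bits).
      uint32_t ld1[2];
      for (int e = 0; e < 2; ++e) {
        uint32_t v = 0;
        for (int i = 0; i < 4; ++i)
          v = (v << 8) | mem[4 * e + i];
        ld1[e] = v;
      }

      std::printf("LDR: lane0=%u lane1=%u\n",
                  (unsigned)(ldr & 0xffffffffu), (unsigned)(ldr >> 32));
      std::printf("LD1: lane0=%u lane1=%u\n", ld1[0], ld1[1]);
      // Prints: LDR: lane0=2 lane1=1   (lane indices reversed)
      //         LD1: lane0=1 lane1=2   (lane indices preserved)
      return 0;
    }

As the output shows, the ``LDR`` layout matches the AAPCS's opaque-object view of the vector, while the ``LD1`` layout matches the IR's lane ordering, which is exactly the tension summarised below.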
+ +Summary +------- + +The following table summarises the instructions that are required to be emitted for each property mentioned above for each of the two solutions. + ++-------------------------------+-------------------------------+---------------------+ +| | ``LDR`` layout | ``LD1`` layout | ++===============================+===============================+=====================+ +| Lane ordering | ``LDR + REV`` | ``LD1`` | ++-------------------------------+-------------------------------+---------------------+ +| AAPCS | ``LDR`` | ``LD1 + REV`` | ++-------------------------------+-------------------------------+---------------------+ +| Alignment for strict mode | ``LDR`` / ``LD1 + REV`` | ``LD1`` | ++-------------------------------+-------------------------------+---------------------+ + +Neither approach is perfect, and choosing one boils down to choosing the lesser of two evils. Fixing the lane ordering issue, it was decided, would require changing target-agnostic compiler passes and would result in a strange IR in which lane indices were reversed. It was decided that this was worse than the changes that would have to be made to support ``LD1``, so ``LD1`` was chosen as the canonical vector load instruction (and by inference, ``ST1`` for vector stores). + +Implementation +============== + +There are 3 parts to the implementation: + + 1. Predicate ``LDR`` and ``STR`` instructions so that they are never allowed to be selected to generate vector loads and stores. The exception is one-lane vectors [1]_ - these by definition cannot have lane ordering problems so are fine to use ``LDR``/``STR``. + + 2. Create code generation patterns for bitconverts that create ``REV`` instructions. + + 3. Make sure appropriate bitconverts are created so that vector values get passed over call boundaries as 1-element vectors (which is the same as if they were loaded with ``LDR``). + +Bitconverts +----------- + +.. image:: ARM-BE-bitcastfail.png + :align: right + +The main problem with the ``LD1`` solution is dealing with bitconverts (or bitcasts, or reinterpret casts). These are pseudo instructions that only change the compiler's interpretation of data, not the underlying data itself. A requirement is that if data is loaded and then saved again (called a "round trip"), the memory contents should be the same after the store as before the load. If a vector is loaded and is then bitconverted to a different vector type before storing, the round trip will currently be broken. + +Take for example this code sequence:: + + %0 = load <4 x i32> %x + %1 = bitcast <4 x i32> %0 to <2 x i64> + store <2 x i64> %1, <2 x i64>* %y + +This would produce a code sequence such as that in the figure on the right. The mismatched ``LD1`` and ``ST1`` cause the stored data to differ from the loaded data. + +.. container:: clearer + + When we see a bitcast from type ``X`` to type ``Y``, what we need to do is to change the in-register representation of the data to be *as if* it had just been loaded by a ``LD1`` of type ``Y``. + +.. image:: ARM-BE-bitcastsuccess.png + :align: right + +Conceptually this is simple - we can insert a ``REV`` undoing the ``LD1`` of type ``X`` (converting the in-register representation to the same as if it had been loaded by ``LDR``) and then insert another ``REV`` to change the representation to be as if it had been loaded by an ``LD1`` of type ``Y``.
+ +For the previous example, this would be:: + + LD1 v0.4s, [x] + + REV64 v0.4s, v0.4s // There is no REV128 instruction, so it must be synthesized + EXT v0.16b, v0.16b, v0.16b, #8 // with a REV64 then an EXT to swap the two 64-bit elements. + + REV64 v0.2d, v0.2d + EXT v0.16b, v0.16b, v0.16b, #8 + + ST1 v0.2d, [y] + +It turns out that these ``REV`` pairs can, in almost all cases, be squashed together into a single ``REV``. For the example above, a ``REV128 4s`` + ``REV128 2d`` is actually a ``REV64 4s``, as shown in the figure on the right. + +.. [1] One lane vectors may seem useless as a concept but they serve to distinguish between values held in general purpose registers and values held in NEON/VFP registers. For example, an ``i64`` would live in an ``x`` register, but ``<1 x i64>`` would live in a ``d`` register. + diff --git a/docs/BitCodeFormat.rst b/docs/BitCodeFormat.rst index 86436ff..fce1e37 100644 --- a/docs/BitCodeFormat.rst +++ b/docs/BitCodeFormat.rst @@ -747,8 +747,6 @@ function. The operand fields are: * ``arm_apcscc``: code 66 * ``arm_aapcscc``: code 67 * ``arm_aapcs_vfpcc``: code 68 - * ``x86_thiscallcc``: code 70 - * ``x86_cdeclmethodcc``: code 80 * isproto*: Non-zero if this entry represents a declaration rather than a definition diff --git a/docs/BlockFrequencyTerminology.rst b/docs/BlockFrequencyTerminology.rst new file mode 100644 index 0000000..41f89f8 --- /dev/null +++ b/docs/BlockFrequencyTerminology.rst @@ -0,0 +1,130 @@ +================================ +LLVM Block Frequency Terminology +================================ + +.. contents:: + :local: + +Introduction +============ + +Block Frequency is a metric for estimating the relative frequency of different +basic blocks. This document describes the terminology that the +``BlockFrequencyInfo`` and ``MachineBlockFrequencyInfo`` analysis passes use. + +Branch Probability +================== + +Blocks with multiple successors have probabilities associated with each +outgoing edge. These are called branch probabilities. For a given block, the +sum of its outgoing branch probabilities should be 1.0. + +Branch Weight +============= + +Rather than storing fractions on each edge, we store an integer weight. +Weights are relative to the other edges of a given predecessor block. The +branch probability associated with a given edge is its own weight divided by +the sum of the weights on the predecessor's outgoing edges. + +For example, consider this IR: + +.. code-block:: llvm + + define void @foo() { + ; ... + A: + br i1 %cond, label %B, label %C, !prof !0 + ; ... + } + !0 = metadata !{metadata !"branch_weights", i32 7, i32 8} + +and this simple graph representation:: + + A -> B (edge-weight: 7) + A -> C (edge-weight: 8) + +The probability of branching from block A to block B is 7/15, and the +probability of branching from block A to block C is 8/15. + +See :doc:`BranchWeightMetadata` for details about the branch weight IR +representation.
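The weight-to-probability rule is easy to state in code as well. The following is a hedged sketch (standalone C++, not the analysis passes themselves) that reproduces the 7/15 and 8/15 probabilities from the example above:

.. code-block:: c++

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Sketch only: derive branch probabilities from integer branch weights,
    // using the rule above (probability = weight / sum of sibling weights).
    int main() {
      // The weights from the !prof example: A -> B is 7, A -> C is 8.
      const std::vector<uint32_t> weights = {7, 8};

      uint64_t sum = 0;
      for (uint32_t w : weights)
        sum += w;

      for (std::size_t i = 0; i < weights.size(); ++i)
        std::printf("edge %zu: %u / %llu = %f\n", i, weights[i],
                    (unsigned long long)sum, (double)weights[i] / (double)sum);
      // Prints 7 / 15 = 0.466667 and 8 / 15 = 0.533333, matching the text.
      return 0;
    }

Note that the weights only carry meaning relative to their siblings: scaling every weight on a block's outgoing edges by the same constant leaves the probabilities unchanged.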
+ +Block Frequency +=============== + +Block frequency is a relative metric that represents the number of times a +block executes. The ratio of a block frequency to the entry block frequency is +the expected number of times the block will execute per entry to the function. + +Block frequency is the main output of the ``BlockFrequencyInfo`` and +``MachineBlockFrequencyInfo`` analysis passes. + +Implementation: a series of DAGs +================================ + +The implementation of the block frequency calculation analyses each loop, +bottom-up, ignoring backedges; i.e., as a DAG. After each loop is processed, +it's packaged up to act as a pseudo-node in its parent loop's (or the +function's) DAG analysis. + +Block Mass +========== + +For each DAG, the entry node is assigned a mass of ``UINT64_MAX`` and mass is +distributed to successors according to branch weights. Block Mass uses a +fixed-point representation where ``UINT64_MAX`` represents ``1.0`` and ``0`` +represents a number just above ``0.0``. + +After mass is fully distributed, in any cut of the DAG that separates the exit +nodes from the entry node, the sum of the block masses of the nodes succeeded +by a cut edge should equal ``UINT64_MAX``. In other words, mass is conserved +as it "falls" through the DAG. + +If a function's basic block graph is a DAG, then block masses are valid block +frequencies. This works poorly in practice though, since downstream users rely +on adding block frequencies together without hitting the maximum. + +Loop Scale +========== + +Loop scale is a metric that indicates how many times a loop iterates per entry. +As mass is distributed through the loop's DAG, the (otherwise ignored) backedge +mass is collected. This backedge mass is used to compute the exit frequency, +and thus the loop scale. + +Implementation: Getting from mass and scale to frequency +======================================================== + +After analysing the complete series of DAGs, each block has a mass (local to +its containing loop, if any), and each loop pseudo-node has a loop scale and +its own mass (from its parent's DAG). + +We can get an initial frequency assignment (with entry frequency of 1.0) by +multiplying these masses and loop scales together. A given block's frequency +is the product of its mass, the mass of containing loops' pseudo nodes, and the +containing loops' loop scales. + +Since downstream users need integers (not floating point), this initial +frequency assignment is shifted as necessary into the range of ``uint64_t``. + +Block Bias +========== + +Block bias is a proposed *absolute* metric to indicate a bias toward or away +from a given block during a function's execution. The idea is that bias can be +used in isolation to indicate whether a block is relatively hot or cold, or to +compare two blocks to indicate whether one is hotter or colder than the other. + +The proposed calculation involves calculating a *reference* block frequency, +where: + +* every branch weight is assumed to be 1 (i.e., every branch probability + distribution is even) and + +* loop scales are ignored. + +This reference frequency represents what the block frequency would be in an +unbiased graph. + +The bias is the ratio of the block frequency to this reference block frequency.
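Returning to the fixed-point mass arithmetic described under Block Mass, the following sketch distributes an entry mass of ``UINT64_MAX`` according to the branch weights 7 and 8 used earlier, keeping the total exactly conserved. This is illustrative C++ rather than the passes' actual code; the helper name and the remainder-handling choice in particular are ours.

.. code-block:: c++

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Sketch only: split a block's mass among its successors in proportion to
    // the branch weights, in the fixed-point scheme where UINT64_MAX stands
    // for 1.0. Assumes at least one successor with a nonzero total weight.
    std::vector<uint64_t> distributeMass(uint64_t mass,
                                         const std::vector<uint32_t> &weights) {
      uint64_t total = 0;
      for (uint32_t w : weights)
        total += w;

      std::vector<uint64_t> out;
      uint64_t given = 0;
      for (std::size_t i = 0; i < weights.size(); ++i) {
        // Proportional share, floor(mass / total) * weight; the truncation
        // error is folded into the last share so mass is exactly conserved.
        uint64_t part = (i + 1 == weights.size()) ? mass - given
                                                  : mass / total * weights[i];
        out.push_back(part);
        given += part;
      }
      return out;
    }

    int main() {
      // Entry mass 1.0 (UINT64_MAX), weights 7 and 8 as in the earlier example.
      std::vector<uint64_t> m = distributeMass(UINT64_MAX, {7, 8});
      std::printf("to B: %llu\nto C: %llu\n",
                  (unsigned long long)m[0], (unsigned long long)m[1]);
      // The two masses sum back to UINT64_MAX, so mass is conserved.
      return 0;
    }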
diff --git a/docs/BranchWeightMetadata.rst b/docs/BranchWeightMetadata.rst index 71ecd34..aff7923 100644 --- a/docs/BranchWeightMetadata.rst +++ b/docs/BranchWeightMetadata.rst @@ -8,10 +8,11 @@ LLVM Branch Weight Metadata Introduction ============ -Branch Weight Metadata represents branch weights as its likeliness to be -taken. Metadata is assigned to the ``TerminatorInst`` as a ``MDNode`` of the -``MD_prof`` kind. The first operator is always a ``MDString`` node with the -string "branch_weights". Number of operators depends on the terminator type. +Branch Weight Metadata represents branch weights as its likeliness to be taken +(see :doc:`BlockFrequencyTerminology`). Metadata is assigned to the +``TerminatorInst`` as a ``MDNode`` of the ``MD_prof`` kind. The first operator +is always a ``MDString`` node with the string "branch_weights". Number of +operators depends on the terminator type. Branch weights might be fetched from the profiling file, or generated based on the `__builtin_expect`_ instruction. diff --git a/docs/Bugpoint.rst b/docs/Bugpoint.rst index 1a5fc8c..8fa64bc 100644 --- a/docs/Bugpoint.rst +++ b/docs/Bugpoint.rst @@ -17,7 +17,7 @@ optimization (or combination of optimizations) that causes the crash, and reduce the file down to a small example which triggers the crash. For detailed case scenarios, such as debugging ``opt``, or one of the LLVM code -generators, see `How To Submit a Bug Report document `_. +generators, see :doc:`HowToSubmitABug`. Design Philosophy ================= diff --git a/docs/CMake.rst b/docs/CMake.rst index cbca1db..fed283d 100644 --- a/docs/CMake.rst +++ b/docs/CMake.rst @@ -87,7 +87,7 @@ names are case-sensitive. Example: .. code-block:: console - $ cmake -G "Visual Studio 10" path/to/llvm/source/root + $ cmake -G "Visual Studio 11" path/to/llvm/source/root For a given development platform there can be more than one adequate generator. If you use Visual Studio "NMake Makefiles" is a generator you can use @@ -211,8 +211,8 @@ LLVM-specific variables **LLVM_ENABLE_THREADS**:BOOL Build with threads support, if available. Defaults to ON. -**LLVM_ENABLE_CXX11**:BOOL - Build in C++11 mode, if available. Defaults to OFF. +**LLVM_ENABLE_CXX1Y**:BOOL + Build in C++1y mode, if available. Defaults to OFF. **LLVM_ENABLE_ASSERTIONS**:BOOL Enables code assertions. Defaults to OFF if and only if ``CMAKE_BUILD_TYPE`` @@ -283,6 +283,12 @@ LLVM-specific variables are ``Address``, ``Memory`` and ``MemoryWithOrigins``. Defaults to empty string. +**LLVM_BUILD_DOCS**:BOOL + Enables all enabled documentation targets (i.e. Doxygen and Sphinx targets) to + be built as part of the normal build. If the ``install`` target is run then + this also enables all built documentation targets to be installed. Defaults to + OFF. + **LLVM_ENABLE_DOXYGEN**:BOOL Enables the generation of browsable HTML documentation using doxygen. Defaults to OFF. @@ -306,14 +312,13 @@ LLVM-specific variables **LLVM_DOXYGEN_QHP_NAMESPACE**:STRING Namespace under which the intermediate Qt Help Project file lives. See `Qt - Help Project `_ + Help Project`_ for more information. Defaults to "org.llvm". This option is only useful in combination with ``-DLLVM_ENABLE_DOXYGEN_QT_HELP=ON``; otherwise this has no effect. **LLVM_DOXYGEN_QHP_CUST_FILTER_NAME**:STRING - See `Qt Help Project - `_ for + See `Qt Help Project`_ for more information. Defaults to the CMake variable ``${PACKAGE_STRING}`` which is a combination of the package name and version string. This filter can then be used in Qt Creator to select only documentation from LLVM when browsing @@ -321,12 +326,35 @@ LLVM-specific variables useful in combination with ``-DLLVM_ENABLE_DOXYGEN_QT_HELP=ON``; otherwise this has no effect. +.. _Qt Help Project: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-filters + **LLVM_DOXYGEN_QHELPGENERATOR_PATH**:STRING The path to the ``qhelpgenerator`` executable. Defaults to whatever CMake's ``find_program()`` can find. This option is only useful in combination with ``-DLLVM_ENABLE_DOXYGEN_QT_HELP=ON``; otherwise this has no effect. +**LLVM_ENABLE_SPHINX**:BOOL + If enabled CMake will search for the ``sphinx-build`` executable and will make + the ``SPHINX_OUTPUT_HTML`` and ``SPHINX_OUTPUT_MAN`` CMake options available. + Defaults to OFF.
+ +**SPHINX_EXECUTABLE**:STRING + The path to the ``sphinx-build`` executable detected by CMake. + +**SPHINX_OUTPUT_HTML**:BOOL + If enabled (and ``LLVM_ENABLE_SPHINX`` is enabled) then the targets for + building the documentation as html are added (but not built by default unless + ``LLVM_BUILD_DOCS`` is enabled). There is a target for each project in the + source tree that uses sphinx (e.g. ``docs-llvm-html``, ``docs-clang-html`` + and ``docs-lld-html``). Defaults to ON. + +**SPHINX_OUTPUT_MAN**:BOOL + If enabled (and ``LLVM_ENABLE_SPHINX`` is enabled) the targets for building + the man pages are added (but not built by default unless ``LLVM_BUILD_DOCS`` + is enabled). Currently the only target added is ``docs-llvm-man``. Defaults + to ON. + Executing the test suite ======================== diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index f0aa9c2..d310a0a 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -89,3 +89,18 @@ if (LLVM_ENABLE_DOXYGEN) endif() endif() endif() + +if (LLVM_ENABLE_SPHINX) + if (SPHINX_FOUND) + include(AddSphinxTarget) + if (${SPHINX_OUTPUT_HTML}) + add_sphinx_target(html llvm) + endif() + + + if (${SPHINX_OUTPUT_MAN}) + add_sphinx_target(man llvm) + endif() + + endif() +endif() diff --git a/docs/CodeGenerator.rst b/docs/CodeGenerator.rst index d7d98bc..cc09946 100644 --- a/docs/CodeGenerator.rst +++ b/docs/CodeGenerator.rst @@ -70,7 +70,7 @@ different pieces of this will be useful to you. In any case, you should be familiar with the `target description`_ and `machine code representation`_ classes. If you want to add a backend for a new target, you will need to `implement the target description`_ classes for your new target and understand -the `LLVM code representation `_. If you are interested in +the :doc:`LLVM code representation <LangRef>`. If you are interested in implementing a new `code generation algorithm`_, it should only depend on the target-description and machine code representation classes, ensuring that it is portable. @@ -172,7 +172,7 @@ architecture. These target descriptions often have a large amount of common information (e.g., an ``add`` instruction is almost identical to a ``sub`` instruction). In order to allow the maximum amount of commonality to be factored out, the LLVM code generator uses the -:doc:`TableGen ` tool to describe big chunks of the +:doc:`TableGen/index` tool to describe big chunks of the target machine, which allows the use of domain-specific and target-specific abstractions to reduce the amount of repetition. @@ -277,7 +277,7 @@ an associated register class. When the register allocator runs, it replaces virtual registers with a physical register in the set. The target-specific implementations of these classes is auto-generated from a -`TableGen `_ description of the register file. +:doc:`TableGen/index` description of the register file. .. _TargetInstrInfo: @@ -1993,7 +1993,7 @@ Tail Calls This box indicates whether the target supports guaranteed tail calls. These are calls marked "`tail `_" and use the fastcc calling -convention. Please see the `tail call section more more details`_. +convention. Please see the `tail call section`_ for more details. .. _feat_segstacks: @@ -2011,7 +2011,7 @@ Basic support exists on the X86 backend. Currently vararg doesn't work and the object files are not marked the way the gold linker expects, but simple Go programs can be built by dragonegg. -..
_tail call section: Tail call optimization ---------------------- @@ -2145,10 +2145,6 @@ The following target-specific calling conventions are known to backend: others via stack. Callee is responsible for stack cleaning. This convention is used by MSVC by default for methods in its ABI (CC ID = 70). -* **X86_CDeclMethod** --- Identical to the standard x86_32 C calling convention, - except that an sret paramter, if present, is placed on the stack after the - second parameter, which must an integer or pointer. (CC ID = 80). - .. _X86 addressing mode: Representing X86 addressing modes in MachineInstrs diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst index 2ebdfbc..edbef3a 100644 --- a/docs/CodingStandards.rst +++ b/docs/CodingStandards.rst @@ -76,10 +76,7 @@ implemented in the LLVM namespace following the expected standard interface. There are some exceptions such as the standard I/O streams library which are avoided. Also, there is much more detailed information on these subjects in the -`Programmer's Manual`_. - -.. _Programmer's Manual: - http://llvm.org/docs/ProgrammersManual.html +:doc:`ProgrammersManual`. Supported C++11 Language and Library Features --------------------------------------------- @@ -111,6 +108,9 @@ unlikely to be supported by our host compilers. * Lambdas: N2927_ * But *not* ``std::function``, until Clang implements `MSVC-compatible RTTI`_. + In many cases, you may be able to use ``llvm::function_ref`` instead, and it + is a superior choice in those cases. + * And *not* lambdas with default arguments. * ``decltype``: N2343_ * Nested closing right angle brackets: N1757_ @@ -119,6 +119,11 @@ unlikely to be supported by our host compilers. * Strongly-typed and forward declarable enums: N2347_, N2764_ * Local and unnamed types as template arguments: N2657_ * Range-based for-loop: N2930_ + + * But ``{}`` are required around inner ``do {} while()`` loops. As a result, + ``{}`` are required around function-like macros inside range-based for + loops. + * ``override`` and ``final``: N2928_, N3206_, N3272_ * Atomic operations and the C++11 memory model: N2429_ @@ -605,7 +610,7 @@ is never used for a class. Because of this, we turn them off globally in the code. That said, LLVM does make extensive use of a hand-rolled form of RTTI that uses -templates like `isa<>, cast<>, and dyn_cast<> `_. +templates like :ref:`isa\<>, cast\<>, and dyn_cast\<> <isa>`. This form of RTTI is opt-in and can be :doc:`added to any class <HowToSetUpLLVMStyleRTTI>`. It is also substantially more efficient than ``dynamic_cast<>``. @@ -1281,9 +1286,9 @@ method will never be implemented. This enables other checks like ``-Wunused-private-field`` to run correctly on classes that contain these methods. -To maintain compatibility with C++03, ``LLVM_DELETED_FUNCTION`` should be used -which will expand to ``= delete`` if the compiler supports it. These methods -should still be declared private. Example of the uncopyable pattern: +For compatibility with MSVC, ``LLVM_DELETED_FUNCTION`` should be used which +will expand to ``= delete`` on compilers that support it. These methods should +still be declared private. Example of the uncopyable pattern: ..
code-block:: c++ diff --git a/docs/CommandGuide/index.rst b/docs/CommandGuide/index.rst index ab4788a..ed18cd0 100644 --- a/docs/CommandGuide/index.rst +++ b/docs/CommandGuide/index.rst @@ -28,6 +28,7 @@ Basic Commands llvm-profdata llvm-stress llvm-symbolizer + llvm-dwarfdump Debugging Tools ~~~~~~~~~~~~~~~ diff --git a/docs/CommandGuide/llvm-cov.rst b/docs/CommandGuide/llvm-cov.rst index 524f240..e0b2fe9 100644 --- a/docs/CommandGuide/llvm-cov.rst +++ b/docs/CommandGuide/llvm-cov.rst @@ -4,32 +4,120 @@ llvm-cov - emit coverage information SYNOPSIS -------- -:program:`llvm-cov` [-gcno=filename] [-gcda=filename] [dump] +:program:`llvm-cov` [options] SOURCEFILE DESCRIPTION ----------- -The experimental :program:`llvm-cov` tool reads in description file generated -by compiler and coverage data file generated by instrumented program. This -program assumes that the description and data file uses same format as gcov -files. +The :program:`llvm-cov` tool reads code coverage data files and displays the +coverage information for a specified source file. It is compatible with the +``gcov`` tool from version 4.2 of ``GCC`` and may also be compatible with +some later versions of ``gcov``. + +To use llvm-cov, you must first build an instrumented version of your +application that collects coverage data as it runs. Compile with the +``-fprofile-arcs`` and ``-ftest-coverage`` options to add the +instrumentation. (Alternatively, you can use the ``--coverage`` option, which +includes both of those other options.) You should compile with debugging +information (``-g``) and without optimization (``-O0``); otherwise, the +coverage data cannot be accurately mapped back to the source code. + +At the time you compile the instrumented code, a ``.gcno`` data file will be +generated for each object file. These ``.gcno`` files contain half of the +coverage data. The other half of the data comes from ``.gcda`` files that are +generated when you run the instrumented program, with a separate ``.gcda`` +file for each object file. Each time you run the program, the execution counts +are summed into any existing ``.gcda`` files, so be sure to remove any old +files if you do not want their contents to be included. + +By default, the ``.gcda`` files are written into the same directory as the +object files, but you can override that by setting the ``GCOV_PREFIX`` and +``GCOV_PREFIX_STRIP`` environment variables. The ``GCOV_PREFIX_STRIP`` +variable specifies a number of directory components to be removed from the +start of the absolute path to the object file directory. After stripping those +directories, the prefix from the ``GCOV_PREFIX`` variable is added. These +environment variables allow you to run the instrumented program on a machine +where the original object file directories are not accessible, but you will +then need to copy the ``.gcda`` files back to the object file directories +where llvm-cov expects to find them. + +Once you have generated the coverage data files, run llvm-cov for each main +source file where you want to examine the coverage results. This should be run +from the same directory where you previously ran the compiler. The results for +the specified source file are written to a file named by appending a ``.gcov`` +suffix. A separate output file is also created for each file included by the +main source file, also with a ``.gcov`` suffix added. + +The basic content of an llvm-cov output file is a copy of the source file with +an execution count and line number prepended to every line. 
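+For example, a fragment of annotated ``.gcov`` output might look like the
+following (an illustrative sketch with hypothetical source lines, not the
+output of a real run):
+
+.. code-block:: none
+
+        1:   10:int main() {
+        1:   11:  int x = compute();
+        -:   12:  // a comment-only line
+    #####:   13:  never_called();
+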
+The execution count is shown as ``-`` if a line does not contain any
+executable code. If a line contains code but that code was never executed,
+the count is displayed as ``#####``.
+
OPTIONS
-------

-.. option:: -gcno=filename
+.. option:: -a, --all-blocks
+
+  Display all basic blocks. If there are multiple blocks for a single line of
+  source code, this option causes llvm-cov to show the count for each block
+  instead of just one count for the entire line.
+
+.. option:: -b, --branch-probabilities
+
+  Display conditional branch probabilities and a summary of branch information.
+
+.. option:: -c, --branch-counts
+
+  Display branch counts instead of probabilities (requires -b).
+
+.. option:: -f, --function-summaries
+
+  Show a summary of coverage for each function instead of just one summary for
+  an entire source file.
+
+.. option:: --help
+
+  Display available options (--help-hidden for more).
+
+.. option:: -l, --long-file-names
+
+  For coverage output of files included from the main source file, add the
+  main file name followed by ``##`` as a prefix to the output file names. This
+  can be combined with the --preserve-paths option to use complete paths for
+  both the main file and the included file.
+
+.. option:: -n, --no-output
+
+  Do not output any ``.gcov`` files. Summary information is still
+  displayed.
+
+.. option:: -o=<DIR|FILE>, --object-directory=<DIR>, --object-file=<FILE>
+
+  Find objects in DIR or based on FILE's path. If you specify a particular
+  object file, the coverage data files are expected to have the same base name
+  with ``.gcno`` and ``.gcda`` extensions. If you specify a directory, the
+  files are expected in that directory with the same base name as the source
+  file.
+
+.. option:: -p, --preserve-paths

-  This option selects input description file generated by compiler while
-  instrumenting program.
+  Preserve path components when naming the coverage output files. In addition
+  to the source file name, include the directories from the path to that
+  file. The directories are separated by ``#`` characters, with ``.`` directories
+  removed and ``..`` directories replaced by ``^`` characters. When used with
+  the --long-file-names option, this applies to both the main file name and the
+  included file name.

-.. option:: -gcda=filename
+.. option:: -u, --unconditional-branches

-  This option selects coverage data file generated by instrumented compiler.
+  Include unconditional branches in the output for the --branch-probabilities
+  option.

-.. option:: -dump
+.. option:: -version

-  This options enables output dump that is suitable for a developer to help
-  debug :program:`llvm-cov` itself.
+  Display the version of llvm-cov.

EXIT STATUS
-----------

diff --git a/docs/CommandGuide/llvm-dwarfdump.rst b/docs/CommandGuide/llvm-dwarfdump.rst
new file mode 100644
index 0000000..afaa0be
--- /dev/null
+++ b/docs/CommandGuide/llvm-dwarfdump.rst
@@ -0,0 +1,30 @@
+llvm-dwarfdump - print contents of DWARF sections
+=================================================
+
+SYNOPSIS
+--------
+
+:program:`llvm-dwarfdump` [*options*] [*filenames...*]
+
+DESCRIPTION
+-----------
+
+:program:`llvm-dwarfdump` parses DWARF sections in object files
+and prints their contents in human-readable form.
+
+OPTIONS
+-------
+
+.. option:: -debug-dump=section
+
+  Specify the DWARF section to dump.
+  For example, use ``abbrev`` to dump the contents of the ``.debug_abbrev``
+  section, ``loc.dwo`` to dump the contents of ``.debug_loc.dwo``, etc.
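+  For instance, ``llvm-dwarfdump -debug-dump=info foo.o`` (``foo.o`` being a
+  hypothetical object file) would print only the contents of ``.debug_info``.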
+ See ``llvm-dwarfdump --help`` for the complete list of supported sections. + Use ``all`` to dump all DWARF sections. It is the default. + +EXIT STATUS +----------- + +:program:`llvm-dwarfdump` returns 0. Other exit codes imply internal +program error. diff --git a/docs/CommandGuide/llvm-symbolizer.rst b/docs/CommandGuide/llvm-symbolizer.rst index dfbdb3a..ce2d9c0 100644 --- a/docs/CommandGuide/llvm-symbolizer.rst +++ b/docs/CommandGuide/llvm-symbolizer.rst @@ -61,11 +61,14 @@ OPTIONS ------- .. option:: -obj + Path to object file to be symbolized. -.. option:: -functions +.. option:: -functions=[none|short|linkage] - Print function names as well as source file/line locations. Defaults to true. + Specify the way function names are printed (omit function name, + print short function name, or print full linkage name, respectively). + Defaults to ``linkage``. .. option:: -use-symbol-table diff --git a/docs/CommandGuide/tblgen.rst b/docs/CommandGuide/tblgen.rst index 1c46828..a42b04d 100644 --- a/docs/CommandGuide/tblgen.rst +++ b/docs/CommandGuide/tblgen.rst @@ -15,7 +15,8 @@ users of LLVM will not need to use this program. It is only for assisting with writing an LLVM target backend. The input and output of :program:`tblgen` is beyond the scope of this short -introduction. Please see :doc:`../TableGenFundamentals`. +introduction; please see the :doc:`introduction to TableGen +<../TableGen/index>`. The *filename* argument specifies the name of a Target Description (``.td``) file to read as input. diff --git a/docs/CompilerWriterInfo.rst b/docs/CompilerWriterInfo.rst index 240271a..606b5f5 100644 --- a/docs/CompilerWriterInfo.rst +++ b/docs/CompilerWriterInfo.rst @@ -113,7 +113,7 @@ XCore ----- * `The XMOS XS1 Architecture (ISA) `_ -* `Tools Developement Guide (includes ABI) `_ +* `Tools Development Guide (includes ABI) `_ Other relevant lists -------------------- diff --git a/docs/DeveloperPolicy.rst b/docs/DeveloperPolicy.rst index b9ac576..74a8979 100644 --- a/docs/DeveloperPolicy.rst +++ b/docs/DeveloperPolicy.rst @@ -336,7 +336,7 @@ Making a Major Change --------------------- When a developer begins a major new project with the aim of contributing it back -to LLVM, s/he should inform the community with an email to the `llvmdev +to LLVM, they should inform the community with an email to the `llvmdev `_ email list, to the extent possible. The reason for this is to: diff --git a/docs/Extensions.rst b/docs/Extensions.rst index 7d8c521..a49485c 100644 --- a/docs/Extensions.rst +++ b/docs/Extensions.rst @@ -159,3 +159,34 @@ different COMDATs: .globl Symbol2 Symbol2: .long 1 + +Target Specific Behaviour +========================= + +Windows on ARM +-------------- + +Stack Probe Emission +^^^^^^^^^^^^^^^^^^^^ + +The reference implementation (Microsoft Visual Studio 2012) emits stack probes +in the following fashion: + +.. code-block:: gas + + movw r4, #constant + bl __chkstk + sub.w sp, sp, r4 + +However, this has the limitation of 32 MiB (±16MiB). In order to accommodate +larger binaries, LLVM supports the use of ``-mcode-model=large`` to allow a 4GiB +range via a slight deviation. It will generate an indirect jump as follows: + +.. code-block:: gas + + movw r4, #constant + movw r12, :lower16:__chkstk + movt r12, :upper16:__chkstk + blx r12 + sub.w sp, sp, r4 + diff --git a/docs/GettingStartedVS.rst b/docs/GettingStartedVS.rst index 628bfdc..aa980d2 100644 --- a/docs/GettingStartedVS.rst +++ b/docs/GettingStartedVS.rst @@ -34,7 +34,7 @@ Most of the tools build and work. 
``bugpoint`` does build, but does not work. Additional information about the LLVM directory structure and tool chain -can be found on the main `Getting Started `_ page. +can be found on the main :doc:`GettingStarted` page. Requirements @@ -97,7 +97,7 @@ Here's the short story for getting up and running quickly with LLVM: using LLVM. Another important option is ``LLVM_TARGETS_TO_BUILD``, which controls the LLVM target architectures that are included on the build. - * See the `LLVM CMake guide `_ for detailed information about + * See the :doc:`LLVM CMake guide ` for detailed information about how to configure the LLVM build. 6. Start Visual Studio @@ -215,8 +215,8 @@ An Example Using the LLVM Tool Chain Common Problems =============== If you are having problems building or using LLVM, or if you have any other -general questions about LLVM, please consult the `Frequently Asked Questions -`_ page. +general questions about LLVM, please consult the :doc:`Frequently Asked Questions +` page. Links diff --git a/docs/LLVMBuild.rst b/docs/LLVMBuild.rst index c0c96d3..58f6f4d 100644 --- a/docs/LLVMBuild.rst +++ b/docs/LLVMBuild.rst @@ -86,8 +86,8 @@ LLVM primarily uses the following types of components: libraries that they build on top of. - *Build Tools* - Build tools are applications which are designed to be run as part of the build process (typically to generate other source files). - Currently, LLVM uses one main build tool called :doc:`TableGen - ` to generate a variety of source files. + Currently, LLVM uses one main build tool called :doc:`TableGen/index` + to generate a variety of source files. - *Tools* - Command line applications which are built using the LLVM component libraries. Most LLVM tools are small and are primarily frontends to the library interfaces. diff --git a/docs/LangRef.rst b/docs/LangRef.rst index 91692ad..fa40363 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -440,7 +440,10 @@ styles: defining module will bind to the local symbol. That is, the symbol cannot be overridden by another module. -.. _namedtypes: +A symbol with ``internal`` or ``private`` linkage must have ``default`` +visibility. + +.. _dllstorageclass: DLL Storage Classes ------------------- @@ -461,6 +464,8 @@ DLL storage class: exists for defining a dll interface, the compiler, assembler and linker know it is externally referenced and must refrain from deleting the symbol. +.. _namedtypes: + Structure Types --------------- @@ -802,6 +807,9 @@ Currently, only the following parameter attributes are defined: not to trap and to be properly aligned. This may only be applied to the first parameter. This is not a valid attribute for return values. + +.. _noalias: + ``noalias`` This indicates that pointer values :ref:`based ` on the argument or return value do not alias pointer values which are @@ -811,8 +819,8 @@ Currently, only the following parameter attributes are defined: "irrelevant" to the ``noalias`` keyword for the arguments and return value used in that call. The caller shares the responsibility with the callee for ensuring that these requirements are met. For further - details, please see the discussion of the NoAlias response in `alias - analysis `_. + details, please see the discussion of the NoAlias response in :ref:`alias + analysis `. Note that this definition of ``noalias`` is intentionally similar to the definition of ``restrict`` in C99 for function arguments, @@ -841,6 +849,13 @@ Currently, only the following parameter attributes are defined: operands for the :ref:`bitcast instruction `. 
This is not
+ a valid attribute for return values and can only be applied to one parameter.
+``nonnull``
+    This indicates that the parameter or return pointer is not null. This
+    attribute may only be applied to pointer typed parameters. This is not
+    checked or enforced by LLVM; the caller must ensure that the pointer
+    passed in is non-null, or the callee must ensure that the returned pointer
+    is non-null.
+
.. _gc:

Garbage Collector Names

@@ -1986,6 +2001,8 @@ notion of a forward declared structure.

| ``opaque`` | An opaque type. |
+--------------+-------------------+

+.. _constants:
+
Constants
=========

@@ -2770,15 +2787,29 @@ for optimizations are prefixed with ``llvm.mem``.

'``llvm.mem.parallel_loop_access``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-For a loop to be parallel, in addition to using
-the ``llvm.loop`` metadata to mark the loop latch branch instruction,
-also all of the memory accessing instructions in the loop body need to be
-marked with the ``llvm.mem.parallel_loop_access`` metadata. If there
-is at least one memory accessing instruction not marked with the metadata,
-the loop must be considered a sequential loop. This causes parallel loops to be
-converted to sequential loops due to optimization passes that are unaware of
-the parallel semantics and that insert new memory instructions to the loop
-body.
+The ``llvm.mem.parallel_loop_access`` metadata refers to a loop identifier,
+or metadata containing a list of loop identifiers for nested loops.
+The metadata is attached to memory accessing instructions and denotes that
+no loop-carried memory dependence exists between it and other instructions
+denoted with the same loop identifier.
+
+Precisely, given two instructions ``m1`` and ``m2`` that both have the
+``llvm.mem.parallel_loop_access`` metadata, with ``L1`` and ``L2`` being the
+set of loops associated with that metadata, respectively, then there is no
+loop-carried dependence between ``m1`` and ``m2`` for loops ``L1`` or
+``L2``.
+
+As a special case, if all memory accessing instructions in a loop have
+``llvm.mem.parallel_loop_access`` metadata that refers to that loop, then the
+loop has no loop-carried memory dependences and is considered to be a parallel
+loop.
+
+Note that if not all memory access instructions have such metadata referring to
+the loop, then the loop is not considered trivially parallel. Additional
+memory dependence analysis is required to make that determination. As a
+fail-safe mechanism, this causes loops that were originally parallel to be
+considered sequential (if optimization passes that are unaware of the parallel
+semantics insert new memory instructions into the loop body).

Example of a loop that is considered parallel due to its correct use of both
``llvm.loop`` and ``llvm.mem.parallel_loop_access``

@@ -3144,14 +3175,18 @@ The '``llvm.global_ctors``' Global Variable

.. code-block:: llvm

-    %0 = type { i32, void ()* }
-    @llvm.global_ctors = appending global [1 x %0] [%0 { i32 65535, void ()* @ctor }]
+    %0 = type { i32, void ()*, i8* }
+    @llvm.global_ctors = appending global [1 x %0] [%0 { i32 65535, void ()* @ctor, i8* @data }]

The ``@llvm.global_ctors`` array contains a list of constructor
-functions and associated priorities. The functions referenced by this
-array will be called in ascending order of priority (i.e. lowest first)
-when the module is loaded. The order of functions with the same priority
-is not defined.
+functions, priorities, and an optional associated global or function.
+The functions referenced by this array will be called in ascending order +of priority (i.e. lowest first) when the module is loaded. The order of +functions with the same priority is not defined. + +If the third field is present, non-null, and points to a global variable +or function, the initializer function will only run if the associated +data from the current module is not discarded. .. _llvmglobaldtors: @@ -3160,14 +3195,18 @@ The '``llvm.global_dtors``' Global Variable .. code-block:: llvm - %0 = type { i32, void ()* } - @llvm.global_dtors = appending global [1 x %0] [%0 { i32 65535, void ()* @dtor }] + %0 = type { i32, void ()*, i8* } + @llvm.global_dtors = appending global [1 x %0] [%0 { i32 65535, void ()* @dtor, i8* @data }] + +The ``@llvm.global_dtors`` array contains a list of destructor +functions, priorities, and an optional associated global or function. +The functions referenced by this array will be called in descending +order of priority (i.e. highest first) when the module is unloaded. The +order of functions with the same priority is not defined. -The ``@llvm.global_dtors`` array contains a list of destructor functions -and associated priorities. The functions referenced by this array will -be called in descending order of priority (i.e. highest first) when the -module is loaded. The order of functions with the same priority is not -defined. +If the third field is present, non-null, and points to a global variable +or function, the destructor function will only run if the associated +data from the current module is not discarded. Instruction Reference ===================== @@ -4465,7 +4504,7 @@ Syntax: :: - = extractelement > , i32 ; yields + = extractelement > , ; yields Overview: """"""""" @@ -4479,7 +4518,7 @@ Arguments: The first operand of an '``extractelement``' instruction is a value of :ref:`vector ` type. The second operand is an index indicating the position from which to extract the element. The index may be a -variable. +variable of any integer type. Semantics: """""""""" @@ -4505,7 +4544,7 @@ Syntax: :: - = insertelement > , , i32 ; yields > + = insertelement > , , ; yields > Overview: """"""""" @@ -4520,7 +4559,7 @@ The first operand of an '``insertelement``' instruction is a value of :ref:`vector ` type. The second operand is a scalar value whose type must equal the element type of the first operand. The third operand is an index indicating the position at which to insert the value. The -index may be a variable. +index may be a variable of any integer type. Semantics: """""""""" @@ -6156,7 +6195,7 @@ Syntax: :: - = [tail] call [cconv] [ret attrs] [*] () [fn attrs] + = [tail | musttail] call [cconv] [ret attrs] [*] () [fn attrs] Overview: """"""""" @@ -6168,17 +6207,34 @@ Arguments: This instruction requires several arguments: -#. The optional "tail" marker indicates that the callee function does - not access any allocas or varargs in the caller. Note that calls may - be marked "tail" even if they do not occur before a - :ref:`ret ` instruction. If the "tail" marker is present, the - function call is eligible for tail call optimization, but `might not - in fact be optimized into a jump `_. - The code generator may optimize calls marked "tail" with either 1) - automatic `sibling call - optimization `_ when the caller and - callee have matching signatures, or 2) forced tail call optimization - when the following extra requirements are met: +#. 
The optional ``tail`` and ``musttail`` markers indicate that the optimizers
+   should perform tail call optimization. The ``tail`` marker is a hint that
+   `can be ignored `_. The ``musttail`` marker
+   means that the call must be tail call optimized in order for the program to
+   be correct. The ``musttail`` marker provides these guarantees:
+
+   #. The call will not cause unbounded stack growth if it is part of a
+      recursive cycle in the call graph.
+   #. Arguments with the :ref:`inalloca ` attribute are
+      forwarded in place.
+
+   Both markers imply that the callee does not access allocas or varargs from
+   the caller. Calls marked ``musttail`` must obey the following additional
+   rules:
+
+   - The call must immediately precede a :ref:`ret ` instruction,
+     or a pointer bitcast followed by a ret instruction.
+   - The ret instruction must return the (possibly bitcasted) value
+     produced by the call or void.
+   - The caller and callee prototypes must match. Pointer types of
+     parameters or return types may differ in pointee type, but not
+     in address space.
+   - The calling conventions of the caller and callee must match.
+   - All ABI-impacting function attributes, such as sret, byval, inreg,
+     returned, and inalloca, must match.
+
+   Tail call optimization for calls marked ``tail`` is guaranteed to occur if
+   the following conditions are met:

   - Caller and callee both have the calling convention ``fastcc``.
   - The call is in tail position (ret immediately follows call and ret

@@ -6782,6 +6838,51 @@ Note that calling this intrinsic does not prevent function inlining or
other aggressive transformations, so the value returned may not be that
of the obvious source-language caller.

+.. _int_read_register:
+.. _int_write_register:
+
+'``llvm.read_register``' and '``llvm.write_register``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.read_register.i32(metadata)
+      declare i64 @llvm.read_register.i64(metadata)
+      declare void @llvm.write_register.i32(metadata, i32 @value)
+      declare void @llvm.write_register.i64(metadata, i64 @value)
+      !0 = metadata !{metadata !"sp\00"}
+
+Overview:
+"""""""""
+
+The '``llvm.read_register``' and '``llvm.write_register``' intrinsics
+provide access to the named register. The register must be valid on
+the architecture being compiled to. The type needs to be compatible
+with the register being read.
+
+Semantics:
+""""""""""
+
+The '``llvm.read_register``' intrinsic returns the current value of the
+register, where possible. The '``llvm.write_register``' intrinsic sets
+the current value of the register, where possible.
+
+This is useful to implement named register global variables that need
+to always be mapped to a specific register, as is common practice in
+bare-metal programs, including OS kernels.
+
+The compiler doesn't check for register availability, or for other uses of
+the register in surrounding code, including inline assembly. Because of that,
+allocatable registers are not supported.
+
+Warning: so far this only works with the stack pointer on selected
+architectures (ARM, AArch64, PowerPC and x86_64). A significant amount of
+work is needed to support other registers, and even more so allocatable
+registers.
+
.. _int_stacksave:

'``llvm.stacksave``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -6964,11 +7065,11 @@ Semantics:

On platforms with coherent instruction and data caches (e.g. x86), this
intrinsic is a nop. On platforms with non-coherent instruction and data
-cache (e.g.
ARM, MIPS), the intrinsic is lowered either to appropiate +cache (e.g. ARM, MIPS), the intrinsic is lowered either to appropriate instructions or a system call, if cache flushing requires special privileges. -The default behavior is to emit a call to ``__clear_cache'' from the run +The default behavior is to emit a call to ``__clear_cache`` from the run time library. This instrinsic does *not* empty the instruction pipeline. Modifications diff --git a/docs/Passes.rst b/docs/Passes.rst index a288933..b51829d 100644 --- a/docs/Passes.rst +++ b/docs/Passes.rst @@ -302,15 +302,6 @@ standard error in a human-readable form. This pass, only available in ``opt``, printsthe SCCs of each function CFG to standard error in a human-readable fom. -``-print-dbginfo``: Print debug info in human readable form ------------------------------------------------------------ - -Pass that prints instructions, and associated debug info: - -#. source/line/col information -#. original variable name -#. original type name - ``-print-dom-info``: Dominator Info Printer ------------------------------------------- @@ -549,6 +540,8 @@ instructions that are obviously dead. A trivial dead store elimination that only considers basic-block local redundant stores. +.. _passes-functionattrs: + ``-functionattrs``: Deduce function attributes ---------------------------------------------- @@ -657,7 +650,7 @@ program, and is used for a wide variety of program transformations. ------------------------------------------------ Combine instructions to form fewer, simple instructions. This pass does not -modify the CFG This pass is where algebraic simplification happens. +modify the CFG. This pass is where algebraic simplification happens. This pass combines things like: @@ -690,6 +683,13 @@ program: shifts. #. … etc. +This pass can also simplify calls to specific well-known function calls (e.g. +runtime library functions). For example, a call ``exit(3)`` that occurs within +the ``main()`` function can be transformed into simply ``return 3``. Whether or +not library calls are simplified is controlled by the +:ref:`-functionattrs ` pass and LLVM's knowledge of +library calls on different targets. + ``-internalize``: Internalize Global Symbols -------------------------------------------- @@ -1020,14 +1020,6 @@ as: Note that this pass has a habit of making definitions be dead. It is a good idea to run a :ref:`DCE ` pass sometime after running this pass. -``-simplify-libcalls``: Simplify well-known library calls ---------------------------------------------------------- - -Applies a variety of small optimizations for calls to specific well-known -function calls (e.g. runtime library functions). For example, a call -``exit(3)`` that occurs within the ``main()`` function can be transformed into -simply ``return 3``. - .. _passes-simplifycfg: ``-simplifycfg``: Simplify the CFG diff --git a/docs/Phabricator.rst b/docs/Phabricator.rst index 581c9e5..18b2817 100644 --- a/docs/Phabricator.rst +++ b/docs/Phabricator.rst @@ -99,7 +99,7 @@ line: Differential Revision: where ```` is the URL for the code review, starting with -``http://llvm-reviews.chandlerc.com/``. +``http://reviews.llvm.org/``. Note that Arcanist will add this automatically. @@ -110,10 +110,9 @@ review, and add a link from the review to the commit. Status ------ -Currently, we're testing Phabricator for use with Clang/LLVM. Please let us -know whether you like it and what could be improved! +Please let us know whether you like it and what could be improved! -.. 
_LLVM's Phabricator: http://llvm-reviews.chandlerc.com -.. _Code Repository Browser: http://llvm-reviews.chandlerc.com/diffusion/ +.. _LLVM's Phabricator: http://reviews.llvm.org +.. _Code Repository Browser: http://reviews.llvm.org/diffusion/ .. _Arcanist Quick Start: http://www.phabricator.com/docs/phabricator/article/Arcanist_Quick_Start.html .. _Arcanist User Guide: http://www.phabricator.com/docs/phabricator/article/Arcanist_User_Guide.html diff --git a/docs/ProgrammersManual.rst b/docs/ProgrammersManual.rst index 9f388cc..7e46ac4 100644 --- a/docs/ProgrammersManual.rst +++ b/docs/ProgrammersManual.rst @@ -263,6 +263,78 @@ almost never be stored or mentioned directly. They are intended solely for use when defining a function which should be able to efficiently accept concatenated strings. +.. _function_apis: + +Passing functions and other callable objects +-------------------------------------------- + +Sometimes you may want a function to be passed a callback object. In order to +support lambda expressions and other function objects, you should not use the +traditional C approach of taking a function pointer and an opaque cookie: + +.. code-block:: c++ + + void takeCallback(bool (*Callback)(Function *, void *), void *Cookie); + +Instead, use one of the following approaches: + +Function template +^^^^^^^^^^^^^^^^^ + +If you don't mind putting the definition of your function into a header file, +make it a function template that is templated on the callable type. + +.. code-block:: c++ + + template + void takeCallback(Callable Callback) { + Callback(1, 2, 3); + } + +The ``function_ref`` class template +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``function_ref`` +(`doxygen `__) class +template represents a reference to a callable object, templated over the type +of the callable. This is a good choice for passing a callback to a function, +if you don't need to hold onto the callback after the function returns. + +``function_ref`` can be implicitly constructed from +any callable object that can be called with arguments of type ``Param1``, +``Param2``, ..., and returns a value that can be converted to type ``Ret``. +For example: + +.. code-block:: c++ + + void visitBasicBlocks(Function *F, function_ref Callback) { + for (BasicBlock &BB : *F) + if (Callback(&BB)) + return; + } + +can be called using: + +.. code-block:: c++ + + visitBasicBlocks(F, [&](BasicBlock *BB) { + if (process(BB)) + return isEmpty(BB); + return false; + }); + +Note that a ``function_ref`` object contains pointers to external memory, so +it is not generally safe to store an instance of the class (unless you know +that the external storage will not be freed). +``function_ref`` is small enough that it should always be passed by value. + +``std::function`` +^^^^^^^^^^^^^^^^^ + +You cannot use ``std::function`` within LLVM code, because it is not supported +by all our target toolchains. + + .. _DEBUG: The ``DEBUG()`` macro and ``-debug`` option @@ -1559,14 +1631,14 @@ Iterating over the ``Instruction`` in a ``Function`` If you're finding that you commonly iterate over a ``Function``'s ``BasicBlock``\ s and then that ``BasicBlock``'s ``Instruction``\ s, ``InstIterator`` should be used instead. You'll need to include -``llvm/Support/InstIterator.h`` (`doxygen -`__) and then instantiate +``llvm/IR/InstIterator.h`` (`doxygen +`__) and then instantiate ``InstIterator``\ s explicitly in your code. Here's a small example that shows how to dump all instructions in a function to the standard error stream: .. 
code-block:: c++

-  #include "llvm/Support/InstIterator.h"
+  #include "llvm/IR/InstIterator.h"

  // F is a pointer to a Function instance
  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)

@@ -1738,16 +1810,12 @@ chain of ``F``:

  Function *F = ...;

-  for (Value::use_iterator i = F->use_begin(), e = F->use_end(); i != e; ++i)
-    if (Instruction *Inst = dyn_cast<Instruction>(*i)) {
+  for (User *U : F->users())
+    if (Instruction *Inst = dyn_cast<Instruction>(U)) {
      errs() << "F is used in instruction:\n";
      errs() << *Inst << "\n";
    }

-Note that dereferencing a ``Value::use_iterator`` is not a very cheap operation.
-Instead of performing ``*i`` above several times, consider doing it only once in
-the loop body and reusing its result.
-
Alternatively, it's common to have an instance of the ``User`` Class
(`doxygen `__) and need to know what ``Value``\ s are used by it. The list of
all ``Value``\ s used by a ``User`` is

@@ -1759,8 +1827,8 @@ instruction uses (that is, the operands of the particular ``Instruction``):

  Instruction *pi = ...;

-  for (User::op_iterator i = pi->op_begin(), e = pi->op_end(); i != e; ++i) {
-    Value *v = *i;
+  for (Use &U : pi->operands()) {
+    Value *v = U.get();
    // ...
  }

diff --git a/docs/README.txt b/docs/README.txt
index 22cf930..3d63429 100644
--- a/docs/README.txt
+++ b/docs/README.txt
@@ -40,3 +40,12 @@ The correspondence between .rst files and man pages is
These .rst files are also included during HTML generation so they are also
viewable online (as noted above) at e.g.
`http://llvm.org/docs/CommandGuide/Foo.html`.
+
+Checking links
+==============
+
+The reachability of external links in the documentation can be checked by
+running:
+
+    cd docs/
+    make -f Makefile.sphinx linkcheck

diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index 723e7cf..8dc1681 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -52,6 +52,9 @@ Non-comprehensive list of changes in this release
* llvm-ar now handles IR files like regular object files. In particular, a
  regular symbol table is created for symbols defined in IR files.

+* LLVM now always uses cfi directives for producing most stack
+  unwinding information.
+
.. NOTE
  For small 1-3 sentence descriptions, just add an entry at the end of
  this list. If your description won't fit comfortably in one bullet

diff --git a/docs/SegmentedStacks.rst b/docs/SegmentedStacks.rst
index e44ce423..c0bf32b 100644
--- a/docs/SegmentedStacks.rst
+++ b/docs/SegmentedStacks.rst
@@ -13,9 +13,8 @@ monolithic chunk (of some worst case size) at thread initialization. This is
done by allocating stack blocks (henceforth called *stacklets*) and linking
them into a doubly linked list. The function prologue is responsible for
checking if the current stacklet has enough space for the function to execute;
and if not,
-call into the libgcc runtime to allocate more stack space. When using ``llc``,
-segmented stacks can be enabled by adding ``-segmented-stacks`` to the command
-line.
+call into the libgcc runtime to allocate more stack space. Segmented stacks are
+enabled with the ``"split-stack"`` attribute on LLVM functions.

The runtime functionality is `already there in libgcc `_.

diff --git a/docs/TableGen/LangIntro.rst b/docs/TableGen/LangIntro.rst
index f139f35..3e74dff 100644
--- a/docs/TableGen/LangIntro.rst
+++ b/docs/TableGen/LangIntro.rst
@@ -160,8 +160,16 @@ supported include:
  remaining elements in the list may be arbitrary other values, including nested
  ```dag``' values.
-``!strconcat(a, b)``
+``!listconcat(a, b, ...)``
+  A list value that is the result of concatenating the 'a' and 'b' lists.
+  The lists must have the same element type.
+  More than two arguments are accepted with the result being the concatenation
+  of all the lists given.
+
+``!strconcat(a, b, ...)``
  A string value that is the result of concatenating the 'a' and 'b' strings.
+  More than two arguments are accepted with the result being the concatenation
+  of all the strings given.

``str1#str2``
  "#" (paste) is a shorthand for !strconcat. It may concatenate things that

diff --git a/docs/TableGen/LangRef.rst b/docs/TableGen/LangRef.rst
index e3db3aa..9b074be 100644
--- a/docs/TableGen/LangRef.rst
+++ b/docs/TableGen/LangRef.rst
@@ -2,8 +2,6 @@
TableGen Language Reference
===========================

-.. sectionauthor:: Sean Silva
-
.. contents::
  :local:

@@ -18,7 +16,7 @@ This document is meant to be a normative spec about the TableGen language
in and of itself (i.e. how to understand a given construct in terms of how
it affects the final set of records represented by the TableGen file). If you
are unsure if this document is really what you are looking for, please
-read :doc:`/TableGenFundamentals` first.
+read the :doc:`introduction to TableGen ` first.

Notation
========

@@ -95,7 +93,7 @@ wide variety of meanings:

  BangOperator: one of
  :!eq !if !head !tail !con
  :!add !shl !sra !srl
-  :!cast !empty !subst !foreach !strconcat
+  :!cast !empty !subst !foreach !listconcat !strconcat

Syntax
======

diff --git a/docs/WritingAnLLVMBackend.rst b/docs/WritingAnLLVMBackend.rst
index 429f52a..fb7c16f 100644
--- a/docs/WritingAnLLVMBackend.rst
+++ b/docs/WritingAnLLVMBackend.rst
@@ -51,7 +51,7 @@ These essential documents must be read before reading this document:
  Formation, SSA-based Optimization, Register Allocation, Prolog/Epilog Code
  Insertion, Late Machine Code Optimizations, and Code Emission.

-* :doc:`TableGenFundamentals` --- a document that describes the TableGen
+* :doc:`TableGen/index` --- a document that describes the TableGen
  (``tblgen``) application that manages domain-specific information to support
  LLVM code generation. TableGen processes input from a target description
  file (``.td`` suffix) and generates C++ code that can be used for code

diff --git a/docs/YamlIO.rst b/docs/YamlIO.rst
index b1917b6..76dd021 100644
--- a/docs/YamlIO.rst
+++ b/docs/YamlIO.rst
@@ -399,6 +399,42 @@ the above schema, a same valid YAML document is:

  name: Tom
  flags: [ pointy, flat ]

+Sometimes a "flags" field might contain an enumeration part
+defined by a bit-mask.
+
+.. code-block:: c++
+
+  enum {
+    flagsFeatureA = 1,
+    flagsFeatureB = 2,
+    flagsFeatureC = 4,
+
+    flagsCPUMask = 24,
+
+    flagsCPU1 = 8,
+    flagsCPU2 = 16
+  };
+
+To support reading and writing such fields, you need to use the maskedBitSet()
+method and provide the bit values, their names and the enumeration mask.
+
+.. code-block:: c++
+
+  template <>
+  struct ScalarBitSetTraits<MyFlags> {
+    static void bitset(IO &io, MyFlags &value) {
+      io.bitSetCase(value, "featureA", flagsFeatureA);
+      io.bitSetCase(value, "featureB", flagsFeatureB);
+      io.bitSetCase(value, "featureC", flagsFeatureC);
+      io.maskedBitSetCase(value, "CPU1", flagsCPU1, flagsCPUMask);
+      io.maskedBitSetCase(value, "CPU2", flagsCPU2, flagsCPUMask);
+    }
+  };
+
+YAML I/O (when writing) will apply the enumeration mask to the flags field,
+and compare the result and values from the bitset.
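+For example, with the enumerators above, a hypothetical value that has both
+``flagsFeatureA`` and ``flagsCPU2`` set would be written as:
+
+.. code-block:: yaml
+
+  flags: [ featureA, CPU2 ]
+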
+As in the case of a regular bitset, each value that matches will cause the
+corresponding string to be added to the flow sequence.

Custom Scalar
-------------

@@ -426,8 +462,10 @@ looks like:

    static StringRef input(StringRef scalar, T &value) {
      // do custom parsing here. Return the empty string on success,
      // or an error message on failure.
-      return StringRef();
+      return StringRef();
    }
+    // Determine if this scalar needs quotes.
+    static bool mustQuote(StringRef) { return true; }
  };

diff --git a/docs/index.rst b/docs/index.rst
index 726a392..1d4fbd9 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -215,6 +215,7 @@ For API clients and LLVM developers.

  AliasAnalysis
  BitCodeFormat
+  BlockFrequencyTerminology
  BranchWeightMetadata
  Bugpoint
  CodeGenerator

@@ -236,6 +237,7 @@ For API clients and LLVM developers.
  NVPTXUsage
  StackMaps
  InAlloca
+  BigEndianNEON

:doc:`WritingAnLLVMPass`
  Information on how to write LLVM transformations and analyses.

@@ -248,7 +250,7 @@ For API clients and LLVM developers.
  working on retargetting LLVM to a new architecture, designing a new codegen
  pass, or enhancing existing components.

-:doc:`TableGen Fundamentals `
+:doc:`TableGen `
  Describes the TableGen tool, which is used heavily by the LLVM code
  generator.

@@ -298,6 +300,10 @@ For API clients and LLVM developers.
:doc:`BranchWeightMetadata`
  Provides information about Branch Prediction Information.

+:doc:`BlockFrequencyTerminology`
+  Provides information about terminology used in the ``BlockFrequencyInfo``
+  analysis pass.
+
:doc:`SegmentedStacks`
  This document describes segmented stacks and how they are used in LLVM.

@@ -314,6 +320,11 @@ For API clients and LLVM developers.
  LLVM support for mapping instruction addresses to the location of values
  and allowing code to be patched.

+:doc:`BigEndianNEON`
+  LLVM's support for generating NEON instructions on big endian ARM targets is
+  somewhat nonintuitive. This document explains the implementation and rationale.
+ + Development Process Documentation ================================= diff --git a/examples/BrainF/BrainFDriver.cpp b/examples/BrainF/BrainFDriver.cpp index d726464..e2de6bc 100644 --- a/examples/BrainF/BrainFDriver.cpp +++ b/examples/BrainF/BrainFDriver.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Verifier.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/raw_ostream.h" diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h index 50c5e3a..f37e3f8 100644 --- a/include/llvm-c/Core.h +++ b/include/llvm-c/Core.h @@ -124,6 +124,12 @@ typedef struct LLVMOpaquePassRegistry *LLVMPassRegistryRef; * @see llvm::Use */ typedef struct LLVMOpaqueUse *LLVMUseRef; + +/** + * @see llvm::DiagnosticInfo + */ +typedef struct LLVMOpaqueDiagnosticInfo *LLVMDiagnosticInfoRef; + typedef enum { LLVMZExtAttribute = 1<<0, LLVMSExtAttribute = 1<<1, @@ -159,7 +165,8 @@ typedef enum { LLVMStackProtectStrongAttribute = 1ULL<<33, LLVMCold = 1ULL << 34, LLVMOptimizeNone = 1ULL << 35, - LLVMInAllocaAttribute = 1ULL << 36 + LLVMInAllocaAttribute = 1ULL << 36, + LLVMNonNullAttribute = 1ULL << 37 */ } LLVMAttribute; @@ -400,6 +407,13 @@ typedef enum { the old one */ } LLVMAtomicRMWBinOp; +typedef enum { + LLVMDSError, + LLVMDSWarning, + LLVMDSRemark, + LLVMDSNote +} LLVMDiagnosticSeverity; + /** * @} */ @@ -453,6 +467,9 @@ void LLVMEnablePrettyStackTrace(void); * @{ */ +typedef void (*LLVMDiagnosticHandler)(LLVMDiagnosticInfoRef, void *); +typedef void (*LLVMYieldCallback)(LLVMContextRef, void *); + /** * Create a new context. * @@ -467,6 +484,21 @@ LLVMContextRef LLVMContextCreate(void); LLVMContextRef LLVMGetGlobalContext(void); /** + * Set the diagnostic handler for this context. + */ +void LLVMContextSetDiagnosticHandler(LLVMContextRef C, + LLVMDiagnosticHandler Handler, + void *DiagnosticContext); + +/** + * Set the yield callback function for this context. + * + * @see LLVMContext::setYieldCallback() + */ +void LLVMContextSetYieldCallback(LLVMContextRef C, LLVMYieldCallback Callback, + void *OpaqueHandle); + +/** * Destroy a context instance. * * This should be called for every call to LLVMContextCreate() or memory @@ -474,6 +506,21 @@ LLVMContextRef LLVMGetGlobalContext(void); */ void LLVMContextDispose(LLVMContextRef C); +/** + * Return a string representation of the DiagnosticInfo. Use + * LLVMDisposeMessage to free the string. + * + * @see DiagnosticInfo::print() + */ +char *LLVMGetDiagInfoDescription(LLVMDiagnosticInfoRef DI); + +/** + * Return an enum LLVMDiagnosticSeverity. 
+ * + * @see DiagnosticInfo::getSeverity() + */ +LLVMDiagnosticSeverity LLVMGetDiagInfoSeverity(LLVMDiagnosticInfoRef DI); + unsigned LLVMGetMDKindIDInContext(LLVMContextRef C, const char* Name, unsigned SLen); unsigned LLVMGetMDKindID(const char* Name, unsigned SLen); @@ -1121,9 +1168,10 @@ LLVMTypeRef LLVMX86MMXType(void); macro(ConstantStruct) \ macro(ConstantVector) \ macro(GlobalValue) \ - macro(Function) \ macro(GlobalAlias) \ - macro(GlobalVariable) \ + macro(GlobalObject) \ + macro(Function) \ + macro(GlobalVariable) \ macro(UndefValue) \ macro(Instruction) \ macro(BinaryOperator) \ diff --git a/include/llvm-c/Object.h b/include/llvm-c/Object.h index c271552..447fcea 100644 --- a/include/llvm-c/Object.h +++ b/include/llvm-c/Object.h @@ -78,7 +78,6 @@ void LLVMMoveToNextRelocation(LLVMRelocationIteratorRef RI); // SymbolRef accessors const char *LLVMGetSymbolName(LLVMSymbolIteratorRef SI); uint64_t LLVMGetSymbolAddress(LLVMSymbolIteratorRef SI); -uint64_t LLVMGetSymbolFileOffset(LLVMSymbolIteratorRef SI); uint64_t LLVMGetSymbolSize(LLVMSymbolIteratorRef SI); // RelocationRef accessors diff --git a/include/llvm-c/Transforms/PassManagerBuilder.h b/include/llvm-c/Transforms/PassManagerBuilder.h index 545f8aa..3d7a9d6 100644 --- a/include/llvm-c/Transforms/PassManagerBuilder.h +++ b/include/llvm-c/Transforms/PassManagerBuilder.h @@ -19,7 +19,6 @@ typedef struct LLVMOpaquePassManagerBuilder *LLVMPassManagerBuilderRef; #ifdef __cplusplus -#include "llvm/Transforms/IPO/PassManagerBuilder.h" extern "C" { #endif diff --git a/include/llvm-c/lto.h b/include/llvm-c/lto.h index 049c4d7..51079896 100644 --- a/include/llvm-c/lto.h +++ b/include/llvm-c/lto.h @@ -40,7 +40,7 @@ typedef bool lto_bool_t; * @{ */ -#define LTO_API_VERSION 10 +#define LTO_API_VERSION 11 /** * \since prior to LTO_API_VERSION=3 @@ -79,14 +79,15 @@ typedef enum { typedef enum { LTO_CODEGEN_PIC_MODEL_STATIC = 0, LTO_CODEGEN_PIC_MODEL_DYNAMIC = 1, - LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC = 2 + LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC = 2, + LTO_CODEGEN_PIC_MODEL_DEFAULT = 3 } lto_codegen_model; /** opaque reference to a loaded object module */ -typedef struct LTOModule* lto_module_t; +typedef struct LLVMOpaqueLTOModule *lto_module_t; /** opaque reference to a code generator */ -typedef struct LTOCodeGenerator* lto_code_gen_t; +typedef struct LLVMOpaqueLTOCodeGenerator *lto_code_gen_t; #ifdef __cplusplus extern "C" { @@ -374,6 +375,14 @@ lto_codegen_set_pic_model(lto_code_gen_t cg, lto_codegen_model); extern void lto_codegen_set_cpu(lto_code_gen_t cg, const char *cpu); +/** + * Sets attributes for the cpu to generate code for. + * + * \since LTO_API_VERSION=11 + */ +extern void +lto_codegen_set_attr(lto_code_gen_t cg, const char *attr); + /** * Sets the location of the assembler tool to run. If not set, libLTO @@ -394,8 +403,9 @@ lto_codegen_set_assembler_args(lto_code_gen_t cg, const char **args, int nargs); /** - * Tells LTO optimization passes that this symbol must be preserved - * because it is referenced by native code or a command line option. + * Adds to a list of all global symbols that must exist in the final generated + * code. If a function is not listed there, it might be inlined into every usage + * and optimized away. * * \since prior to LTO_API_VERSION=3 */ diff --git a/include/llvm-c/module.modulemap b/include/llvm-c/module.modulemap new file mode 100644 index 0000000..2bcdbc1 --- /dev/null +++ b/include/llvm-c/module.modulemap @@ -0,0 +1,5 @@ +module LLVM_C { + requires cplusplus + umbrella "." 
+ module * { export * } +} diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h index acfefe9..50f1463 100644 --- a/include/llvm/ADT/APFloat.h +++ b/include/llvm/ADT/APFloat.h @@ -236,19 +236,19 @@ public: APInt fill(64, type); return getQNaN(Sem, Negative, &fill); } else { - return getQNaN(Sem, Negative, 0); + return getQNaN(Sem, Negative, nullptr); } } /// Factory for QNaN values. static APFloat getQNaN(const fltSemantics &Sem, bool Negative = false, - const APInt *payload = 0) { + const APInt *payload = nullptr) { return makeNaN(Sem, false, Negative, payload); } /// Factory for SNaN values. static APFloat getSNaN(const fltSemantics &Sem, bool Negative = false, - const APInt *payload = 0) { + const APInt *payload = nullptr) { return makeNaN(Sem, true, Negative, payload); } @@ -500,7 +500,8 @@ private: void makeLargest(bool Neg = false); void makeSmallest(bool Neg = false); - void makeNaN(bool SNaN = false, bool Neg = false, const APInt *fill = 0); + void makeNaN(bool SNaN = false, bool Neg = false, + const APInt *fill = nullptr); static APFloat makeNaN(const fltSemantics &Sem, bool SNaN, bool Negative, const APInt *fill); void makeInf(bool Neg = false); diff --git a/include/llvm/ADT/ArrayRef.h b/include/llvm/ADT/ArrayRef.h index fcf280d..1b64fee 100644 --- a/include/llvm/ADT/ArrayRef.h +++ b/include/llvm/ADT/ArrayRef.h @@ -12,7 +12,6 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Support/Allocator.h" #include namespace llvm { @@ -49,10 +48,10 @@ namespace llvm { /// @{ /// Construct an empty ArrayRef. - /*implicit*/ ArrayRef() : Data(0), Length(0) {} + /*implicit*/ ArrayRef() : Data(nullptr), Length(0) {} /// Construct an empty ArrayRef from None. - /*implicit*/ ArrayRef(NoneType) : Data(0), Length(0) {} + /*implicit*/ ArrayRef(NoneType) : Data(nullptr), Length(0) {} /// Construct an ArrayRef from a single element. /*implicit*/ ArrayRef(const T &OneElt) @@ -121,9 +120,9 @@ namespace llvm { return Data[Length-1]; } - // copy - Allocate copy in BumpPtrAllocator and return ArrayRef to it. - ArrayRef copy(BumpPtrAllocator &Allocator) { - T *Buff = Allocator.Allocate(Length); + // copy - Allocate copy in Allocator and return ArrayRef to it. + template ArrayRef copy(Allocator &A) { + T *Buff = A.template Allocate(Length); std::copy(begin(), end(), Buff); return ArrayRef(Buff, Length); } @@ -132,10 +131,7 @@ namespace llvm { bool equals(ArrayRef RHS) const { if (Length != RHS.Length) return false; - for (size_type i = 0; i != Length; i++) - if (Data[i] != RHS.Data[i]) - return false; - return true; + return std::equal(begin(), end(), RHS.begin()); } /// slice(n) - Chop off the first N elements of the array. @@ -221,7 +217,7 @@ namespace llvm { /// Construct an MutableArrayRef from a C array. template - /*implicit*/ MutableArrayRef(T (&Arr)[N]) + /*implicit*/ LLVM_CONSTEXPR MutableArrayRef(T (&Arr)[N]) : ArrayRef(Arr) {} T *data() const { return const_cast(ArrayRef::data()); } diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h index b531820..da2b3ad 100644 --- a/include/llvm/ADT/BitVector.h +++ b/include/llvm/ADT/BitVector.h @@ -72,7 +72,7 @@ public: /// BitVector default ctor - Creates an empty bitvector. BitVector() : Size(0), Capacity(0) { - Bits = 0; + Bits = nullptr; } /// BitVector ctor - Creates a bitvector of specified number of bits. All @@ -88,7 +88,7 @@ public: /// BitVector copy ctor. 
BitVector(const BitVector &RHS) : Size(RHS.size()) { if (Size == 0) { - Bits = 0; + Bits = nullptr; Capacity = 0; return; } @@ -100,7 +100,7 @@ public: BitVector(BitVector &&RHS) : Bits(RHS.Bits), Size(RHS.Size), Capacity(RHS.Capacity) { - RHS.Bits = 0; + RHS.Bits = nullptr; } ~BitVector() { @@ -467,7 +467,7 @@ public: Size = RHS.Size; Capacity = RHS.Capacity; - RHS.Bits = 0; + RHS.Bits = nullptr; return *this; } diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h index 037989f..8269132 100644 --- a/include/llvm/ADT/DenseMap.h +++ b/include/llvm/ADT/DenseMap.h @@ -461,12 +461,12 @@ private: const unsigned NumBuckets = getNumBuckets(); if (NumBuckets == 0) { - FoundBucket = 0; + FoundBucket = nullptr; return false; } // FoundTombstone - Keep track of whether we find a tombstone while probing. - const BucketT *FoundTombstone = 0; + const BucketT *FoundTombstone = nullptr; const KeyT EmptyKey = getEmptyKey(); const KeyT TombstoneKey = getTombstoneKey(); assert(!KeyInfoT::isEqual(Val, EmptyKey) && @@ -665,7 +665,7 @@ private: bool allocateBuckets(unsigned Num) { NumBuckets = Num; if (NumBuckets == 0) { - Buckets = 0; + Buckets = nullptr; return false; } @@ -985,7 +985,7 @@ public: private: pointer Ptr, End; public: - DenseMapIterator() : Ptr(0), End(0) {} + DenseMapIterator() : Ptr(nullptr), End(nullptr) {} DenseMapIterator(pointer Pos, pointer E, bool NoAdvance = false) : Ptr(Pos), End(E) { diff --git a/include/llvm/ADT/DepthFirstIterator.h b/include/llvm/ADT/DepthFirstIterator.h index 6445442..dfba43f 100644 --- a/include/llvm/ADT/DepthFirstIterator.h +++ b/include/llvm/ADT/DepthFirstIterator.h @@ -33,6 +33,7 @@ #ifndef LLVM_ADT_DEPTHFIRSTITERATOR_H #define LLVM_ADT_DEPTHFIRSTITERATOR_H +#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallPtrSet.h" @@ -207,6 +208,12 @@ df_iterator df_end(const T& G) { return df_iterator::end(G); } +// Provide an accessor method to use them in range-based patterns. +template +iterator_range> depth_first(const T& G) { + return iterator_range>(df_begin(G), df_end(G)); +} + // Provide global definitions of external depth first iterators... template ::NodeType*> > struct df_ext_iterator : public df_iterator { @@ -244,6 +251,12 @@ idf_iterator idf_end(const T& G){ return idf_iterator::end(Inverse(G)); } +// Provide an accessor method to use them in range-based patterns. +template +iterator_range> inverse_depth_first(const T& G) { + return iterator_range>(idf_begin(G), idf_end(G)); +} + // Provide global definitions of external inverse depth first iterators... template ::NodeType*> > struct idf_ext_iterator : public idf_iterator { diff --git a/include/llvm/ADT/EquivalenceClasses.h b/include/llvm/ADT/EquivalenceClasses.h index 2256ee7..e0396c7 100644 --- a/include/llvm/ADT/EquivalenceClasses.h +++ b/include/llvm/ADT/EquivalenceClasses.h @@ -86,14 +86,14 @@ class EquivalenceClasses { } void setNext(const ECValue *NewNext) const { - assert(getNext() == 0 && "Already has a next pointer!"); + assert(getNext() == nullptr && "Already has a next pointer!"); Next = (const ECValue*)((intptr_t)NewNext | (intptr_t)isLeader()); } public: ECValue(const ECValue &RHS) : Leader(this), Next((ECValue*)(intptr_t)1), Data(RHS.Data) { // Only support copying of singleton nodes. 
- assert(RHS.isLeader() && RHS.getNext() == 0 && "Not a singleton!"); + assert(RHS.isLeader() && RHS.getNext() == nullptr && "Not a singleton!"); } bool operator<(const ECValue &UFN) const { return Data < UFN.Data; } @@ -147,10 +147,10 @@ public: class member_iterator; member_iterator member_begin(iterator I) const { // Only leaders provide anything to iterate over. - return member_iterator(I->isLeader() ? &*I : 0); + return member_iterator(I->isLeader() ? &*I : nullptr); } member_iterator member_end() const { - return member_iterator(0); + return member_iterator(nullptr); } /// findValue - Return an iterator to the specified value. If it does not @@ -251,13 +251,13 @@ public: explicit member_iterator(const ECValue *N) : Node(N) {} reference operator*() const { - assert(Node != 0 && "Dereferencing end()!"); + assert(Node != nullptr && "Dereferencing end()!"); return Node->getData(); } reference operator->() const { return operator*(); } member_iterator &operator++() { - assert(Node != 0 && "++'d off the end of the list!"); + assert(Node != nullptr && "++'d off the end of the list!"); Node = Node->getNext(); return *this; } diff --git a/include/llvm/ADT/FoldingSet.h b/include/llvm/ADT/FoldingSet.h index 188010d..9b7ee85 100644 --- a/include/llvm/ADT/FoldingSet.h +++ b/include/llvm/ADT/FoldingSet.h @@ -137,7 +137,7 @@ public: public: - Node() : NextInFoldingSetBucket(0) {} + Node() : NextInFoldingSetBucket(nullptr) {} // Accessors void *getNextInBucket() const { return NextInFoldingSetBucket; } @@ -269,7 +269,7 @@ class FoldingSetNodeIDRef { const unsigned *Data; size_t Size; public: - FoldingSetNodeIDRef() : Data(0), Size(0) {} + FoldingSetNodeIDRef() : Data(nullptr), Size(0) {} FoldingSetNodeIDRef(const unsigned *D, size_t S) : Data(D), Size(S) {} /// ComputeHash - Compute a strong hash value for this FoldingSetNodeIDRef, diff --git a/include/llvm/ADT/Hashing.h b/include/llvm/ADT/Hashing.h index 4bffd8e..b11e3c1 100644 --- a/include/llvm/ADT/Hashing.h +++ b/include/llvm/ADT/Hashing.h @@ -45,7 +45,6 @@ #ifndef LLVM_ADT_HASHING_H #define LLVM_ADT_HASHING_H -#include "llvm/ADT/STLExtras.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Host.h" #include "llvm/Support/SwapByteOrder.h" @@ -266,7 +265,6 @@ inline uint64_t hash_short(const char *s, size_t length, uint64_t seed) { /// keeps 56 bytes of arbitrary state. struct hash_state { uint64_t h0, h1, h2, h3, h4, h5, h6; - uint64_t seed; /// \brief Create a new hash_state structure and initialize it based on the /// seed and the first 64-byte chunk. 
@@ -274,7 +272,7 @@ struct hash_state { static hash_state create(const char *s, uint64_t seed) { hash_state state = { 0, seed, hash_16_bytes(seed, k1), rotate(seed ^ k1, 49), - seed * k1, shift_mix(seed), 0, seed }; + seed * k1, shift_mix(seed), 0 }; state.h6 = hash_16_bytes(state.h4, state.h5); state.mix(s); return state; @@ -412,7 +410,7 @@ template hash_code hash_combine_range_impl(InputIteratorT first, InputIteratorT last) { const size_t seed = get_execution_seed(); char buffer[64], *buffer_ptr = buffer; - char *const buffer_end = buffer_ptr + array_lengthof(buffer); + char *const buffer_end = std::end(buffer); while (first != last && store_and_advance(buffer_ptr, buffer_end, get_hashable_data(*first))) ++first; diff --git a/include/llvm/ADT/ImmutableIntervalMap.h b/include/llvm/ADT/ImmutableIntervalMap.h deleted file mode 100644 index 6793c6b..0000000 --- a/include/llvm/ADT/ImmutableIntervalMap.h +++ /dev/null @@ -1,248 +0,0 @@ -//===--- ImmutableIntervalMap.h - Immutable (functional) map ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the ImmutableIntervalMap class. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ADT_IMMUTABLEINTERVALMAP_H -#define LLVM_ADT_IMMUTABLEINTERVALMAP_H - -#include "llvm/ADT/ImmutableMap.h" - -namespace llvm { - -class Interval { -private: - int64_t Start; - int64_t End; - -public: - Interval(int64_t S, int64_t E) : Start(S), End(E) {} - - int64_t getStart() const { return Start; } - int64_t getEnd() const { return End; } -}; - -template -struct ImutIntervalInfo { - typedef const std::pair value_type; - typedef const value_type &value_type_ref; - typedef const Interval key_type; - typedef const Interval &key_type_ref; - typedef const T data_type; - typedef const T &data_type_ref; - - static key_type_ref KeyOfValue(value_type_ref V) { - return V.first; - } - - static data_type_ref DataOfValue(value_type_ref V) { - return V.second; - } - - static bool isEqual(key_type_ref L, key_type_ref R) { - return L.getStart() == R.getStart() && L.getEnd() == R.getEnd(); - } - - static bool isDataEqual(data_type_ref L, data_type_ref R) { - return ImutContainerInfo::isEqual(L,R); - } - - static bool isLess(key_type_ref L, key_type_ref R) { - // Assume L and R does not overlap. 
- if (L.getStart() < R.getStart()) { - assert(L.getEnd() < R.getStart()); - return true; - } else if (L.getStart() == R.getStart()) { - assert(L.getEnd() == R.getEnd()); - return false; - } else { - assert(L.getStart() > R.getEnd()); - return false; - } - } - - static bool isContainedIn(key_type_ref K, key_type_ref L) { - if (K.getStart() >= L.getStart() && K.getEnd() <= L.getEnd()) - return true; - else - return false; - } - - static void Profile(FoldingSetNodeID &ID, value_type_ref V) { - ID.AddInteger(V.first.getStart()); - ID.AddInteger(V.first.getEnd()); - ImutProfileInfo::Profile(ID, V.second); - } -}; - -template -class ImutIntervalAVLFactory : public ImutAVLFactory { - typedef ImutAVLTree TreeTy; - typedef typename ImutInfo::value_type value_type; - typedef typename ImutInfo::value_type_ref value_type_ref; - typedef typename ImutInfo::key_type key_type; - typedef typename ImutInfo::key_type_ref key_type_ref; - typedef typename ImutInfo::data_type data_type; - typedef typename ImutInfo::data_type_ref data_type_ref; - -public: - ImutIntervalAVLFactory(BumpPtrAllocator &Alloc) - : ImutAVLFactory(Alloc) {} - - TreeTy *Add(TreeTy *T, value_type_ref V) { - T = add_internal(V,T); - this->MarkImmutable(T); - return T; - } - - TreeTy *Find(TreeTy *T, key_type_ref K) { - if (!T) - return NULL; - - key_type_ref CurrentKey = ImutInfo::KeyOfValue(this->getValue(T)); - - if (ImutInfo::isContainedIn(K, CurrentKey)) - return T; - else if (ImutInfo::isLess(K, CurrentKey)) - return Find(this->getLeft(T), K); - else - return Find(this->getRight(T), K); - } - -private: - TreeTy *add_internal(value_type_ref V, TreeTy *T) { - key_type_ref K = ImutInfo::KeyOfValue(V); - T = removeAllOverlaps(T, K); - if (this->isEmpty(T)) - return this->CreateNode(NULL, V, NULL); - - assert(!T->isMutable()); - - key_type_ref KCurrent = ImutInfo::KeyOfValue(this->Value(T)); - - if (ImutInfo::isLess(K, KCurrent)) - return this->Balance(add_internal(V, this->Left(T)), this->Value(T), - this->Right(T)); - else - return this->Balance(this->Left(T), this->Value(T), - add_internal(V, this->Right(T))); - } - - // Remove all overlaps from T. - TreeTy *removeAllOverlaps(TreeTy *T, key_type_ref K) { - bool Changed; - do { - Changed = false; - T = removeOverlap(T, K, Changed); - this->markImmutable(T); - } while (Changed); - - return T; - } - - // Remove one overlap from T. - TreeTy *removeOverlap(TreeTy *T, key_type_ref K, bool &Changed) { - if (!T) - return NULL; - Interval CurrentK = ImutInfo::KeyOfValue(this->Value(T)); - - // If current key does not overlap the inserted key. - if (CurrentK.getStart() > K.getEnd()) - return this->Balance(removeOverlap(this->Left(T), K, Changed), - this->Value(T), this->Right(T)); - else if (CurrentK.getEnd() < K.getStart()) - return this->Balance(this->Left(T), this->Value(T), - removeOverlap(this->Right(T), K, Changed)); - - // Current key overlaps with the inserted key. - // Remove the current key. - Changed = true; - data_type_ref OldData = ImutInfo::DataOfValue(this->Value(T)); - T = this->Remove_internal(CurrentK, T); - // Add back the unoverlapped part of the current key. 
- if (CurrentK.getStart() < K.getStart()) { - if (CurrentK.getEnd() <= K.getEnd()) { - Interval NewK(CurrentK.getStart(), K.getStart()-1); - return add_internal(std::make_pair(NewK, OldData), T); - } else { - Interval NewK1(CurrentK.getStart(), K.getStart()-1); - T = add_internal(std::make_pair(NewK1, OldData), T); - - Interval NewK2(K.getEnd()+1, CurrentK.getEnd()); - return add_internal(std::make_pair(NewK2, OldData), T); - } - } else { - if (CurrentK.getEnd() > K.getEnd()) { - Interval NewK(K.getEnd()+1, CurrentK.getEnd()); - return add_internal(std::make_pair(NewK, OldData), T); - } else - return T; - } - } -}; - -/// ImmutableIntervalMap maps an interval [start, end] to a value. The intervals -/// in the map are guaranteed to be disjoint. -template -class ImmutableIntervalMap - : public ImmutableMap > { - - typedef typename ImutIntervalInfo::value_type value_type; - typedef typename ImutIntervalInfo::value_type_ref value_type_ref; - typedef typename ImutIntervalInfo::key_type key_type; - typedef typename ImutIntervalInfo::key_type_ref key_type_ref; - typedef typename ImutIntervalInfo::data_type data_type; - typedef typename ImutIntervalInfo::data_type_ref data_type_ref; - typedef ImutAVLTree > TreeTy; - -public: - explicit ImmutableIntervalMap(TreeTy *R) - : ImmutableMap >(R) {} - - class Factory { - ImutIntervalAVLFactory > F; - - public: - Factory(BumpPtrAllocator& Alloc) : F(Alloc) {} - - ImmutableIntervalMap getEmptyMap() { - return ImmutableIntervalMap(F.getEmptyTree()); - } - - ImmutableIntervalMap add(ImmutableIntervalMap Old, - key_type_ref K, data_type_ref D) { - TreeTy *T = F.add(Old.Root, std::pair(K, D)); - return ImmutableIntervalMap(F.getCanonicalTree(T)); - } - - ImmutableIntervalMap remove(ImmutableIntervalMap Old, key_type_ref K) { - TreeTy *T = F.remove(Old.Root, K); - return ImmutableIntervalMap(F.getCanonicalTree(T)); - } - - data_type *lookup(ImmutableIntervalMap M, key_type_ref K) { - TreeTy *T = F.Find(M.getRoot(), K); - if (T) - return &T->getValue().second; - else - return 0; - } - }; - -private: - // For ImmutableIntervalMap, the lookup operation has to be done by the - // factory. - data_type* lookup(key_type_ref K) const; -}; - -} // end namespace llvm - -#endif diff --git a/include/llvm/ADT/ImmutableMap.h b/include/llvm/ADT/ImmutableMap.h index 8f8fb98..11f281b 100644 --- a/include/llvm/ADT/ImmutableMap.h +++ b/include/llvm/ADT/ImmutableMap.h @@ -241,14 +241,14 @@ public: if (T) return &T->getValue().second; } - return 0; + return nullptr; } /// getMaxElement - Returns the pair in the ImmutableMap for /// which key is the highest in the ordering of keys in the map. This /// method returns NULL if the map is empty. value_type* getMaxElement() const { - return Root ? &(Root->getMaxElement()->getValue()) : 0; + return Root ? &(Root->getMaxElement()->getValue()) : nullptr; } //===--------------------------------------------------===// diff --git a/include/llvm/ADT/ImmutableSet.h b/include/llvm/ADT/ImmutableSet.h index ad34969..5a3d8ad 100644 --- a/include/llvm/ADT/ImmutableSet.h +++ b/include/llvm/ADT/ImmutableSet.h @@ -81,7 +81,7 @@ public: else T = T->getRight(); } - return NULL; + return nullptr; } /// getMaxElement - Find the subtree associated with the highest ranged @@ -242,9 +242,9 @@ private: /// ImutAVLFactory. 
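[Note: with ImmutableIntervalMap deleted, plain ImmutableMap remains the functional-map entry point. A minimal sketch of its Factory-based usage under the nullptr-returning lookup above; names and values are illustrative.]

#include "llvm/ADT/ImmutableMap.h"

void demo() {
  llvm::ImmutableMap<int, int>::Factory F;
  llvm::ImmutableMap<int, int> M = F.add(F.getEmptyMap(), /*Key*/ 1, /*Data*/ 2);
  // lookup() now signals a miss with nullptr rather than 0.
  if (const int *D = M.lookup(1))
    (void)*D; // 2
}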
ImutAVLTree(Factory *f, ImutAVLTree* l, ImutAVLTree* r, value_type_ref v, unsigned height) - : factory(f), left(l), right(r), prev(0), next(0), height(height), - IsMutable(true), IsDigestCached(false), IsCanonicalized(0), - value(v), digest(0), refCount(0) + : factory(f), left(l), right(r), prev(nullptr), next(nullptr), + height(height), IsMutable(true), IsDigestCached(false), + IsCanonicalized(0), value(v), digest(0), refCount(0) { if (left) left->retain(); if (right) right->retain(); @@ -411,7 +411,7 @@ public: return T; } - TreeTy* getEmptyTree() const { return NULL; } + TreeTy* getEmptyTree() const { return nullptr; } protected: @@ -607,7 +607,7 @@ protected: public: TreeTy *getCanonicalTree(TreeTy *TNew) { if (!TNew) - return 0; + return nullptr; if (TNew->IsCanonicalized) return TNew; @@ -619,7 +619,7 @@ public: do { if (!entry) break; - for (TreeTy *T = entry ; T != 0; T = T->next) { + for (TreeTy *T = entry ; T != nullptr; T = T->next) { // Compare the Contents('T') with Contents('TNew') typename TreeTy::iterator TI = T->begin(), TE = T->end(); if (!compareTreeWithSection(TNew, TI, TE)) @@ -696,12 +696,7 @@ public: } inline bool operator==(const _Self& x) const { - if (stack.size() != x.stack.size()) - return false; - for (unsigned i = 0 ; i < stack.size(); i++) - if (stack[i] != x.stack[i]) - return false; - return true; + return stack == x.stack; } inline bool operator!=(const _Self& x) const { return !operator==(x); } diff --git a/include/llvm/ADT/IntervalMap.h b/include/llvm/ADT/IntervalMap.h index 1ca3288..46549ee 100644 --- a/include/llvm/ADT/IntervalMap.h +++ b/include/llvm/ADT/IntervalMap.h @@ -1177,7 +1177,7 @@ branchRoot(unsigned Position) { if (Nodes == 1) size[0] = rootSize; else - NewOffset = distribute(Nodes, rootSize, Leaf::Capacity, NULL, size, + NewOffset = distribute(Nodes, rootSize, Leaf::Capacity, nullptr, size, Position, true); // Allocate new nodes. @@ -1218,7 +1218,7 @@ splitRoot(unsigned Position) { if (Nodes == 1) Size[0] = rootSize; else - NewOffset = distribute(Nodes, rootSize, Leaf::Capacity, NULL, Size, + NewOffset = distribute(Nodes, rootSize, Leaf::Capacity, nullptr, Size, Position, true); // Allocate new nodes. @@ -1346,7 +1346,7 @@ protected: public: /// const_iterator - Create an iterator that isn't pointing anywhere. - const_iterator() : map(0) {} + const_iterator() : map(nullptr) {} /// setMap - Change the map iterated over. This call must be followed by a /// call to goToBegin(), goToEnd(), or find() diff --git a/include/llvm/ADT/IntrusiveRefCntPtr.h b/include/llvm/ADT/IntrusiveRefCntPtr.h index 729e37f..cd1946c 100644 --- a/include/llvm/ADT/IntrusiveRefCntPtr.h +++ b/include/llvm/ADT/IntrusiveRefCntPtr.h @@ -139,7 +139,7 @@ public: public: typedef T element_type; - explicit IntrusiveRefCntPtr() : Obj(0) {} + explicit IntrusiveRefCntPtr() : Obj(nullptr) {} IntrusiveRefCntPtr(T* obj) : Obj(obj) { retain(); @@ -150,7 +150,7 @@ public: } IntrusiveRefCntPtr(IntrusiveRefCntPtr&& S) : Obj(S.Obj) { - S.Obj = 0; + S.Obj = nullptr; } template @@ -179,7 +179,7 @@ public: typedef T* (IntrusiveRefCntPtr::*unspecified_bool_type) () const; operator unspecified_bool_type() const { - return Obj == 0 ? 0 : &IntrusiveRefCntPtr::getPtr; + return Obj ? 
&IntrusiveRefCntPtr::getPtr : nullptr; } void swap(IntrusiveRefCntPtr& other) { @@ -190,7 +190,7 @@ public: void reset() { release(); - Obj = 0; + Obj = nullptr; } void resetWithoutRelease() { diff --git a/include/llvm/ADT/OwningPtr.h b/include/llvm/ADT/OwningPtr.h index 034bcfd..5e83358 100644 --- a/include/llvm/ADT/OwningPtr.h +++ b/include/llvm/ADT/OwningPtr.h @@ -69,7 +69,7 @@ public: /// not delete the pointer before returning it. T *take() { T *Tmp = Ptr; - Ptr = 0; + Ptr = nullptr; return Tmp; } @@ -84,9 +84,9 @@ public: T *operator->() const { return Ptr; } T *get() const { return Ptr; } - LLVM_EXPLICIT operator bool() const { return Ptr != 0; } - bool operator!() const { return Ptr == 0; } - bool isValid() const { return Ptr != 0; } + LLVM_EXPLICIT operator bool() const { return Ptr != nullptr; } + bool operator!() const { return Ptr == nullptr; } + bool isValid() const { return Ptr != nullptr; } void swap(OwningPtr &RHS) { T *Tmp = RHS.Ptr; @@ -146,7 +146,7 @@ public: T *get() const { return Ptr; } LLVM_EXPLICIT operator bool() const { return Ptr != 0; } - bool operator!() const { return Ptr == 0; } + bool operator!() const { return Ptr == nullptr; } void swap(OwningArrayPtr &RHS) { T *Tmp = RHS.Ptr; diff --git a/include/llvm/ADT/PointerUnion.h b/include/llvm/ADT/PointerUnion.h index 8cbe8d1..a6dddd2 100644 --- a/include/llvm/ADT/PointerUnion.h +++ b/include/llvm/ADT/PointerUnion.h @@ -15,6 +15,7 @@ #ifndef LLVM_ADT_POINTERUNION_H #define LLVM_ADT_POINTERUNION_H +#include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/Support/Compiler.h" @@ -153,6 +154,12 @@ namespace llvm { "Can't get the address because PointerLikeTypeTraits changes the ptr"); return (PT1 *)Val.getAddrOfPointer(); } + + /// \brief Assignment from nullptr which just clears the union. + const PointerUnion &operator=(std::nullptr_t) { + Val.initWithPointer(nullptr); + return *this; + } /// Assignment operators - Allow assigning into this union from either /// pointer type, setting the discriminator to remember what it came from. @@ -297,6 +304,12 @@ namespace llvm { if (is()) return get(); return T(); } + + /// \brief Assignment from nullptr which just clears the union. + const PointerUnion3 &operator=(std::nullptr_t) { + Val = nullptr; + return *this; + } /// Assignment operators - Allow assigning into this union from either /// pointer type, setting the discriminator to remember what it came from. @@ -406,6 +419,12 @@ namespace llvm { if (is()) return get(); return T(); } + + /// \brief Assignment from nullptr which just clears the union. + const PointerUnion4 &operator=(std::nullptr_t) { + Val = nullptr; + return *this; + } /// Assignment operators - Allow assigning into this union from either /// pointer type, setting the discriminator to remember what it came from. @@ -455,6 +474,33 @@ namespace llvm { ::NumLowBitsAvailable }; }; + + // Teach DenseMap how to use PointerUnions as keys. + template + struct DenseMapInfo > { + typedef PointerUnion Pair; + typedef DenseMapInfo FirstInfo; + typedef DenseMapInfo SecondInfo; + + static inline Pair getEmptyKey() { + return Pair(FirstInfo::getEmptyKey()); + } + static inline Pair getTombstoneKey() { + return Pair(FirstInfo::getTombstoneKey()); + } + static unsigned getHashValue(const Pair &PairVal) { + intptr_t key = (intptr_t)PairVal.getOpaqueValue(); + return DenseMapInfo::getHashValue(key); + } + static bool isEqual(const Pair &LHS, const Pair &RHS) { + return LHS.template is() == RHS.template is() && + (LHS.template is() ? 
+ FirstInfo::isEqual(LHS.template get(), + RHS.template get()) : + SecondInfo::isEqual(LHS.template get(), + RHS.template get())); + } + }; } #endif diff --git a/include/llvm/ADT/PostOrderIterator.h b/include/llvm/ADT/PostOrderIterator.h index 59fa3f3..dd8cc74 100644 --- a/include/llvm/ADT/PostOrderIterator.h +++ b/include/llvm/ADT/PostOrderIterator.h @@ -111,7 +111,7 @@ class po_iterator : public std::iteratorinsertEdge((NodeType*)0, BB); + this->insertEdge((NodeType*)nullptr, BB); VisitStack.push_back(std::make_pair(BB, GT::child_begin(BB))); traverseChild(); } @@ -119,7 +119,7 @@ class po_iterator : public std::iterator(S) { - if (this->insertEdge((NodeType*)0, BB)) { + if (this->insertEdge((NodeType*)nullptr, BB)) { VisitStack.push_back(std::make_pair(BB, GT::child_begin(BB))); traverseChild(); } diff --git a/include/llvm/ADT/SCCIterator.h b/include/llvm/ADT/SCCIterator.h index 58ac149..bc74416 100644 --- a/include/llvm/ADT/SCCIterator.h +++ b/include/llvm/ADT/SCCIterator.h @@ -25,6 +25,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/iterator.h" #include namespace llvm { @@ -35,19 +36,17 @@ namespace llvm { /// This is implemented using Tarjan's DFS algorithm using an internal stack to /// build up a vector of nodes in a particular SCC. Note that it is a forward /// iterator and thus you cannot backtrack or re-visit nodes. -template > +template > class scc_iterator - : public std::iterator, ptrdiff_t> { + : public iterator_facade_base< + scc_iterator, std::forward_iterator_tag, + const std::vector, ptrdiff_t> { typedef typename GT::NodeType NodeType; typedef typename GT::ChildIteratorType ChildItTy; typedef std::vector SccTy; - typedef std::iterator, ptrdiff_t> super; - typedef typename super::reference reference; - typedef typename super::pointer pointer; + typedef typename scc_iterator::reference reference; - // Element of VisitStack during DFS. + /// Element of VisitStack during DFS. struct StackElement { NodeType *Node; ///< The current node pointer. ChildItTy NextChild; ///< The next child, modified inplace during DFS. @@ -63,135 +62,63 @@ class scc_iterator } }; - // The visit counters used to detect when a complete SCC is on the stack. - // visitNum is the global counter. - // nodeVisitNumbers are per-node visit numbers, also used as DFS flags. + /// The visit counters used to detect when a complete SCC is on the stack. + /// visitNum is the global counter. + /// + /// nodeVisitNumbers are per-node visit numbers, also used as DFS flags. unsigned visitNum; DenseMap nodeVisitNumbers; - // Stack holding nodes of the SCC. + /// Stack holding nodes of the SCC. std::vector SCCNodeStack; - // The current SCC, retrieved using operator*(). + /// The current SCC, retrieved using operator*(). SccTy CurrentSCC; - - // DFS stack, Used to maintain the ordering. The top contains the current - // node, the next child to visit, and the minimum uplink value of all child + /// DFS stack, Used to maintain the ordering. The top contains the current + /// node, the next child to visit, and the minimum uplink value of all child std::vector VisitStack; - // A single "visit" within the non-recursive DFS traversal. - void DFSVisitOne(NodeType *N) { - ++visitNum; - nodeVisitNumbers[N] = visitNum; - SCCNodeStack.push_back(N); - VisitStack.push_back(StackElement(N, GT::child_begin(N), visitNum)); -#if 0 // Enable if needed when debugging. 
- dbgs() << "TarjanSCC: Node " << N << - " : visitNum = " << visitNum << "\n"; -#endif - } + /// A single "visit" within the non-recursive DFS traversal. + void DFSVisitOne(NodeType *N); - // The stack-based DFS traversal; defined below. - void DFSVisitChildren() { - assert(!VisitStack.empty()); - while (VisitStack.back().NextChild != - GT::child_end(VisitStack.back().Node)) { - // TOS has at least one more child so continue DFS - NodeType *childN = *VisitStack.back().NextChild++; - typename DenseMap::iterator Visited = - nodeVisitNumbers.find(childN); - if (Visited == nodeVisitNumbers.end()) { - // this node has never been seen. - DFSVisitOne(childN); - continue; - } - - unsigned childNum = Visited->second; - if (VisitStack.back().MinVisited > childNum) - VisitStack.back().MinVisited = childNum; - } - } + /// The stack-based DFS traversal; defined below. + void DFSVisitChildren(); - // Compute the next SCC using the DFS traversal. - void GetNextSCC() { - CurrentSCC.clear(); // Prepare to compute the next SCC - while (!VisitStack.empty()) { - DFSVisitChildren(); + /// Compute the next SCC using the DFS traversal. + void GetNextSCC(); - // Pop the leaf on top of the VisitStack. - NodeType *visitingN = VisitStack.back().Node; - unsigned minVisitNum = VisitStack.back().MinVisited; - assert(VisitStack.back().NextChild == GT::child_end(visitingN)); - VisitStack.pop_back(); - - // Propagate MinVisitNum to parent so we can detect the SCC starting node. - if (!VisitStack.empty() && VisitStack.back().MinVisited > minVisitNum) - VisitStack.back().MinVisited = minVisitNum; - -#if 0 // Enable if needed when debugging. - dbgs() << "TarjanSCC: Popped node " << visitingN << - " : minVisitNum = " << minVisitNum << "; Node visit num = " << - nodeVisitNumbers[visitingN] << "\n"; -#endif - - if (minVisitNum != nodeVisitNumbers[visitingN]) - continue; - - // A full SCC is on the SCCNodeStack! It includes all nodes below - // visitingN on the stack. Copy those nodes to CurrentSCC, - // reset their minVisit values, and return (this suspends - // the DFS traversal till the next ++). - do { - CurrentSCC.push_back(SCCNodeStack.back()); - SCCNodeStack.pop_back(); - nodeVisitNumbers[CurrentSCC.back()] = ~0U; - } while (CurrentSCC.back() != visitingN); - return; - } - } - - inline scc_iterator(NodeType *entryN) : visitNum(0) { + scc_iterator(NodeType *entryN) : visitNum(0) { DFSVisitOne(entryN); GetNextSCC(); } - // End is when the DFS stack is empty. - inline scc_iterator() {} + /// End is when the DFS stack is empty. + scc_iterator() {} public: - static inline scc_iterator begin(const GraphT &G) { + static scc_iterator begin(const GraphT &G) { return scc_iterator(GT::getEntryNode(G)); } - static inline scc_iterator end(const GraphT &) { return scc_iterator(); } + static scc_iterator end(const GraphT &) { return scc_iterator(); } /// \brief Direct loop termination test which is more efficient than /// comparison with \c end(). 
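[Note: a sketch of how the reworked iterator is driven, using the scc_begin/isAtEnd interface shown in this hunk; it assumes the usual GraphTraits specialization for Function CFGs, with header paths as of this era of the tree.]

#include "llvm/ADT/SCCIterator.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"

void visitSCCs(llvm::Function &F) {
  for (auto I = llvm::scc_begin(&F); !I.isAtEnd(); ++I) {
    // *I is the current SCC, a vector of basic blocks.
    const std::vector<llvm::BasicBlock *> &SCC = *I;
    (void)SCC.size();
  }
}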
- inline bool isAtEnd() const { + bool isAtEnd() const { assert(!CurrentSCC.empty() || VisitStack.empty()); return CurrentSCC.empty(); } - inline bool operator==(const scc_iterator &x) const { + bool operator==(const scc_iterator &x) const { return VisitStack == x.VisitStack && CurrentSCC == x.CurrentSCC; } - inline bool operator!=(const scc_iterator &x) const { return !operator==(x); } - inline scc_iterator &operator++() { + scc_iterator &operator++() { GetNextSCC(); return *this; } - inline scc_iterator operator++(int) { - scc_iterator tmp = *this; - ++*this; - return tmp; - } - inline const SccTy &operator*() const { - assert(!CurrentSCC.empty() && "Dereferencing END SCC iterator!"); - return CurrentSCC; - } - inline SccTy &operator*() { + reference operator*() const { assert(!CurrentSCC.empty() && "Dereferencing END SCC iterator!"); return CurrentSCC; } @@ -200,7 +127,88 @@ public: /// /// If the SCC has more than one node, this is trivially true. If not, it may /// still contain a loop if the node has an edge back to itself. - bool hasLoop() const { + bool hasLoop() const; + + /// This informs the \c scc_iterator that the specified \c Old node + /// has been deleted, and \c New is to be used in its place. + void ReplaceNode(NodeType *Old, NodeType *New) { + assert(nodeVisitNumbers.count(Old) && "Old not in scc_iterator?"); + nodeVisitNumbers[New] = nodeVisitNumbers[Old]; + nodeVisitNumbers.erase(Old); + } +}; + +template +void scc_iterator::DFSVisitOne(NodeType *N) { + ++visitNum; + nodeVisitNumbers[N] = visitNum; + SCCNodeStack.push_back(N); + VisitStack.push_back(StackElement(N, GT::child_begin(N), visitNum)); +#if 0 // Enable if needed when debugging. + dbgs() << "TarjanSCC: Node " << N << + " : visitNum = " << visitNum << "\n"; +#endif +} + +template +void scc_iterator::DFSVisitChildren() { + assert(!VisitStack.empty()); + while (VisitStack.back().NextChild != GT::child_end(VisitStack.back().Node)) { + // TOS has at least one more child so continue DFS + NodeType *childN = *VisitStack.back().NextChild++; + typename DenseMap::iterator Visited = + nodeVisitNumbers.find(childN); + if (Visited == nodeVisitNumbers.end()) { + // this node has never been seen. + DFSVisitOne(childN); + continue; + } + + unsigned childNum = Visited->second; + if (VisitStack.back().MinVisited > childNum) + VisitStack.back().MinVisited = childNum; + } +} + +template void scc_iterator::GetNextSCC() { + CurrentSCC.clear(); // Prepare to compute the next SCC + while (!VisitStack.empty()) { + DFSVisitChildren(); + + // Pop the leaf on top of the VisitStack. + NodeType *visitingN = VisitStack.back().Node; + unsigned minVisitNum = VisitStack.back().MinVisited; + assert(VisitStack.back().NextChild == GT::child_end(visitingN)); + VisitStack.pop_back(); + + // Propagate MinVisitNum to parent so we can detect the SCC starting node. + if (!VisitStack.empty() && VisitStack.back().MinVisited > minVisitNum) + VisitStack.back().MinVisited = minVisitNum; + +#if 0 // Enable if needed when debugging. + dbgs() << "TarjanSCC: Popped node " << visitingN << + " : minVisitNum = " << minVisitNum << "; Node visit num = " << + nodeVisitNumbers[visitingN] << "\n"; +#endif + + if (minVisitNum != nodeVisitNumbers[visitingN]) + continue; + + // A full SCC is on the SCCNodeStack! It includes all nodes below + // visitingN on the stack. Copy those nodes to CurrentSCC, + // reset their minVisit values, and return (this suspends + // the DFS traversal till the next ++). 
+ do { + CurrentSCC.push_back(SCCNodeStack.back()); + SCCNodeStack.pop_back(); + nodeVisitNumbers[CurrentSCC.back()] = ~0U; + } while (CurrentSCC.back() != visitingN); + return; + } +} + +template +bool scc_iterator::hasLoop() const { assert(!CurrentSCC.empty() && "Dereferencing END SCC iterator!"); if (CurrentSCC.size() > 1) return true; @@ -212,15 +220,6 @@ public: return false; } - /// This informs the \c scc_iterator that the specified \c Old node - /// has been deleted, and \c New is to be used in its place. - void ReplaceNode(NodeType *Old, NodeType *New) { - assert(nodeVisitNumbers.count(Old) && "Old not in scc_iterator?"); - nodeVisitNumbers[New] = nodeVisitNumbers[Old]; - nodeVisitNumbers.erase(Old); - } -}; - /// \brief Construct the begin iterator for a deduced graph type T. template scc_iterator scc_begin(const T &G) { return scc_iterator::begin(G); diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h index ab6884f..1cef393 100644 --- a/include/llvm/ADT/STLExtras.h +++ b/include/llvm/ADT/STLExtras.h @@ -55,6 +55,131 @@ struct greater_ptr : public std::binary_function { } }; +/// An efficient, type-erasing, non-owning reference to a callable. This is +/// intended for use as the type of a function parameter that is not used +/// after the function in question returns. +/// +/// This class does not own the callable, so it is not in general safe to store +/// a function_ref. +template class function_ref; + +#if LLVM_HAS_VARIADIC_TEMPLATES + +template +class function_ref { + Ret (*callback)(intptr_t callable, Params ...params); + intptr_t callable; + + template + static Ret callback_fn(intptr_t callable, Params ...params) { + return (*reinterpret_cast(callable))( + std::forward(params)...); + } + +public: + template + function_ref(Callable &&callable) + : callback(callback_fn::type>), + callable(reinterpret_cast(&callable)) {} + Ret operator()(Params ...params) const { + return callback(callable, std::forward(params)...); + } +}; + +#else + +template +class function_ref { + Ret (*callback)(intptr_t callable); + intptr_t callable; + + template + static Ret callback_fn(intptr_t callable) { + return (*reinterpret_cast(callable))(); + } + +public: + template + function_ref(Callable &&callable) + : callback(callback_fn::type>), + callable(reinterpret_cast(&callable)) {} + Ret operator()() const { return callback(callable); } +}; + +template +class function_ref { + Ret (*callback)(intptr_t callable, Param1 param1); + intptr_t callable; + + template + static Ret callback_fn(intptr_t callable, Param1 param1) { + return (*reinterpret_cast(callable))( + std::forward(param1)); + } + +public: + template + function_ref(Callable &&callable) + : callback(callback_fn::type>), + callable(reinterpret_cast(&callable)) {} + Ret operator()(Param1 param1) { + return callback(callable, std::forward(param1)); + } +}; + +template +class function_ref { + Ret (*callback)(intptr_t callable, Param1 param1, Param2 param2); + intptr_t callable; + + template + static Ret callback_fn(intptr_t callable, Param1 param1, Param2 param2) { + return (*reinterpret_cast(callable))( + std::forward(param1), + std::forward(param2)); + } + +public: + template + function_ref(Callable &&callable) + : callback(callback_fn::type>), + callable(reinterpret_cast(&callable)) {} + Ret operator()(Param1 param1, Param2 param2) { + return callback(callable, + std::forward(param1), + std::forward(param2)); + } +}; + +template +class function_ref { + Ret (*callback)(intptr_t callable, Param1 param1, Param2 param2, Param3 
param3); + intptr_t callable; + + template + static Ret callback_fn(intptr_t callable, Param1 param1, Param2 param2, + Param3 param3) { + return (*reinterpret_cast(callable))( + std::forward(param1), + std::forward(param2), + std::forward(param3)); + } + +public: + template + function_ref(Callable &&callable) + : callback(callback_fn::type>), + callable(reinterpret_cast(&callable)) {} + Ret operator()(Param1 param1, Param2 param2, Param3 param3) { + return callback(callable, + std::forward(param1), + std::forward(param2), + std::forward(param3)); + } +}; + +#endif + // deleter - Very very very simple method that is used to invoke operator // delete on something. It is used like this: // @@ -165,27 +290,20 @@ struct less_second { // Extra additions for arrays //===----------------------------------------------------------------------===// -/// Find where an array ends (for ending iterators) -/// This returns a pointer to the byte immediately -/// after the end of an array. -template -inline T *array_endof(T (&x)[N]) { - return x+N; -} - /// Find the length of an array. -template -inline size_t array_lengthof(T (&)[N]) { +template +LLVM_CONSTEXPR inline size_t array_lengthof(T (&)[N]) { return N; } -/// array_pod_sort_comparator - This is helper function for array_pod_sort, -/// which just uses operator< on T. +/// Adapt std::less for array_pod_sort. template inline int array_pod_sort_comparator(const void *P1, const void *P2) { - if (*reinterpret_cast(P1) < *reinterpret_cast(P2)) + if (std::less()(*reinterpret_cast(P1), + *reinterpret_cast(P2))) return -1; - if (*reinterpret_cast(P2) < *reinterpret_cast(P1)) + if (std::less()(*reinterpret_cast(P2), + *reinterpret_cast(P1))) return 1; return 0; } @@ -208,7 +326,7 @@ inline int (*get_array_pod_sort_comparator(const T &)) /// possible. /// /// This function assumes that you have simple POD-like types that can be -/// compared with operator< and can be moved with memcpy. If this isn't true, +/// compared with std::less and can be moved with memcpy. If this isn't true, /// you should use std::sort. /// /// NOTE: If qsort_r were portable, we could allow a custom comparator and @@ -412,6 +530,13 @@ make_unique(size_t n) { #endif +template +struct pair_hash { + size_t operator()(const std::pair &P) const { + return std::hash()(P.first) * 31 + std::hash()(P.second); + } +}; + } // End llvm namespace #endif diff --git a/include/llvm/ADT/ScopedHashTable.h b/include/llvm/ADT/ScopedHashTable.h index efddd9f..3cc7738 100644 --- a/include/llvm/ADT/ScopedHashTable.h +++ b/include/llvm/ADT/ScopedHashTable.h @@ -159,18 +159,16 @@ private: void operator=(const ScopedHashTable&); // NOT YET IMPLEMENTED friend class ScopedHashTableScope; public: - ScopedHashTable() : CurScope(0) {} + ScopedHashTable() : CurScope(nullptr) {} ScopedHashTable(AllocatorTy A) : CurScope(0), Allocator(A) {} ~ScopedHashTable() { - assert(CurScope == 0 && TopLevelMap.empty() && "Scope imbalance!"); + assert(!CurScope && TopLevelMap.empty() && "Scope imbalance!"); } /// Access to the allocator. 
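[Note: a sketch of the function_ref added to STLExtras.h above: a non-owning, type-erased reference to a callable, valid only for the duration of the call. The helper and its names are illustrative.]

#include "llvm/ADT/STLExtras.h"

static int applyTwice(llvm::function_ref<int(int)> F, int X) {
  return F(F(X)); // F merely borrows the lambda below; it must not be stored.
}

int demo() {
  int Bias = 3;
  return applyTwice([Bias](int X) { return X + Bias; }, 4); // yields 10
}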
- typedef typename ReferenceAdder::result AllocatorRefTy; - typedef typename ReferenceAdder::result AllocatorCRefTy; - AllocatorRefTy getAllocator() { return Allocator; } - AllocatorCRefTy getAllocator() const { return Allocator; } + AllocatorTy &getAllocator() { return Allocator; } + const AllocatorTy &getAllocator() const { return Allocator; } bool count(const K &Key) const { return TopLevelMap.count(Key); @@ -222,7 +220,7 @@ ScopedHashTableScope:: ScopedHashTableScope(ScopedHashTable &ht) : HT(ht) { PrevScope = HT.CurScope; HT.CurScope = this; - LastValInScope = 0; + LastValInScope = nullptr; } template @@ -233,7 +231,7 @@ ScopedHashTableScope::~ScopedHashTableScope() { // Pop and delete all values corresponding to this scope. while (ScopedHashTableVal *ThisEntry = LastValInScope) { // Pop this value out of the TopLevelMap. - if (ThisEntry->getNextForKey() == 0) { + if (!ThisEntry->getNextForKey()) { assert(HT.TopLevelMap[ThisEntry->getKey()] == ThisEntry && "Scope imbalance!"); HT.TopLevelMap.erase(ThisEntry->getKey()); diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h index 0a4140e..dcf0354 100644 --- a/include/llvm/ADT/SmallVector.h +++ b/include/llvm/ADT/SmallVector.h @@ -220,28 +220,20 @@ protected: /// Guarantees space for at least one more element, or MinSize more /// elements if specified. void grow(size_t MinSize = 0); - + public: void push_back(const T &Elt) { - if (this->EndX < this->CapacityX) { - Retry: - ::new ((void*) this->end()) T(Elt); - this->setEnd(this->end()+1); - return; - } - this->grow(); - goto Retry; + if (LLVM_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + ::new ((void*) this->end()) T(Elt); + this->setEnd(this->end()+1); } void push_back(T &&Elt) { - if (this->EndX < this->CapacityX) { - Retry: - ::new ((void*) this->end()) T(::std::move(Elt)); - this->setEnd(this->end()+1); - return; - } - this->grow(); - goto Retry; + if (LLVM_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + ::new ((void*) this->end()) T(::std::move(Elt)); + this->setEnd(this->end()+1); } void pop_back() { @@ -255,7 +247,7 @@ template void SmallVectorTemplateBase::grow(size_t MinSize) { size_t CurCapacity = this->capacity(); size_t CurSize = this->size(); - // Always grow, even from zero. + // Always grow, even from zero. size_t NewCapacity = size_t(NextPowerOf2(CurCapacity+2)); if (NewCapacity < MinSize) NewCapacity = MinSize; @@ -335,16 +327,12 @@ protected: } public: void push_back(const T &Elt) { - if (this->EndX < this->CapacityX) { - Retry: - memcpy(this->end(), &Elt, sizeof(T)); - this->setEnd(this->end()+1); - return; - } - this->grow(); - goto Retry; + if (LLVM_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + memcpy(this->end(), &Elt, sizeof(T)); + this->setEnd(this->end()+1); } - + void pop_back() { this->setEnd(this->end()-1); } @@ -493,26 +481,25 @@ public: assert(I >= this->begin() && "Insertion iterator is out of bounds."); assert(I <= this->end() && "Inserting past the end of the vector."); - if (this->EndX < this->CapacityX) { - Retry: - ::new ((void*) this->end()) T(::std::move(this->back())); - this->setEnd(this->end()+1); - // Push everything else over. - this->move_backward(I, this->end()-1, this->end()); + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } - // If we just moved the element we're inserting, be sure to update - // the reference. 
- T *EltPtr = &Elt; - if (I <= EltPtr && EltPtr < this->EndX) - ++EltPtr; + ::new ((void*) this->end()) T(::std::move(this->back())); + this->setEnd(this->end()+1); + // Push everything else over. + this->move_backward(I, this->end()-1, this->end()); - *I = ::std::move(*EltPtr); - return I; - } - size_t EltNo = I-this->begin(); - this->grow(); - I = this->begin()+EltNo; - goto Retry; + // If we just moved the element we're inserting, be sure to update + // the reference. + T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = ::std::move(*EltPtr); + return I; } iterator insert(iterator I, const T &Elt) { @@ -524,26 +511,24 @@ public: assert(I >= this->begin() && "Insertion iterator is out of bounds."); assert(I <= this->end() && "Inserting past the end of the vector."); - if (this->EndX < this->CapacityX) { - Retry: - ::new ((void*) this->end()) T(this->back()); - this->setEnd(this->end()+1); - // Push everything else over. - this->move_backward(I, this->end()-1, this->end()); - - // If we just moved the element we're inserting, be sure to update - // the reference. - const T *EltPtr = &Elt; - if (I <= EltPtr && EltPtr < this->EndX) - ++EltPtr; - - *I = *EltPtr; - return I; + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; } - size_t EltNo = I-this->begin(); - this->grow(); - I = this->begin()+EltNo; - goto Retry; + ::new ((void*) this->end()) T(this->back()); + this->setEnd(this->end()+1); + // Push everything else over. + this->move_backward(I, this->end()-1, this->end()); + + // If we just moved the element we're inserting, be sure to update + // the reference. + const T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = *EltPtr; + return I; } iterator insert(iterator I, size_type NumToInsert, const T &Elt) { @@ -820,7 +805,7 @@ SmallVectorImpl &SmallVectorImpl::operator=(SmallVectorImpl &&RHS) { this->grow(RHSSize); } else if (CurSize) { // Otherwise, use assignment for the already-constructed elements. - this->move(RHS.begin(), RHS.end(), this->begin()); + this->move(RHS.begin(), RHS.begin()+CurSize, this->begin()); } // Move-construct the new elements in place. diff --git a/include/llvm/ADT/SparseMultiSet.h b/include/llvm/ADT/SparseMultiSet.h index 797a898..d2b2f8d 100644 --- a/include/llvm/ADT/SparseMultiSet.h +++ b/include/llvm/ADT/SparseMultiSet.h @@ -187,7 +187,7 @@ public: typedef const ValueT *const_pointer; SparseMultiSet() - : Sparse(0), Universe(0), FreelistIdx(SMSNode::INVALID), NumFree(0) { } + : Sparse(nullptr), Universe(0), FreelistIdx(SMSNode::INVALID), NumFree(0) {} ~SparseMultiSet() { free(Sparse); } diff --git a/include/llvm/ADT/SparseSet.h b/include/llvm/ADT/SparseSet.h index b46ccc9..899f2e4 100644 --- a/include/llvm/ADT/SparseSet.h +++ b/include/llvm/ADT/SparseSet.h @@ -142,7 +142,7 @@ public: typedef ValueT *pointer; typedef const ValueT *const_pointer; - SparseSet() : Sparse(0), Universe(0) {} + SparseSet() : Sparse(nullptr), Universe(0) {} ~SparseSet() { free(Sparse); } /// setUniverse - Set the universe size which determines the largest key the diff --git a/include/llvm/ADT/Statistic.h b/include/llvm/ADT/Statistic.h index 26aac7b..d98abc3 100644 --- a/include/llvm/ADT/Statistic.h +++ b/include/llvm/ADT/Statistic.h @@ -46,7 +46,7 @@ public: /// construct - This should only be called for non-global statistics. 
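[Note: for context, Statistic counters are normally declared through the STATISTIC macro rather than by calling construct() directly. A minimal sketch; DEBUG_TYPE and the counter name are illustrative.]

#include "llvm/ADT/Statistic.h"
#define DEBUG_TYPE "demo"

STATISTIC(NumWidgets, "Number of widgets processed");

void touchWidget() { ++NumWidgets; } // counted when -stats is enabled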
void construct(const char *name, const char *desc) { Name = name; Desc = desc; - Value = 0; Initialized = 0; + Value = 0; Initialized = false; } // Allow use of this class as the value itself. diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h index a0b3fe7..a152f4d 100644 --- a/include/llvm/ADT/StringExtras.h +++ b/include/llvm/ADT/StringExtras.h @@ -141,7 +141,7 @@ void SplitString(StringRef Source, // better: http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx // X*33+c -> X*33^c static inline unsigned HashString(StringRef Str, unsigned Result = 0) { - for (unsigned i = 0, e = Str.size(); i != e; ++i) + for (StringRef::size_type i = 0, e = Str.size(); i != e; ++i) Result = Result * 33 + (unsigned char)Str[i]; return Result; } diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h index 4e74cf6..ecac5dd 100644 --- a/include/llvm/ADT/StringMap.h +++ b/include/llvm/ADT/StringMap.h @@ -17,6 +17,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Allocator.h" #include +#include namespace llvm { template @@ -48,13 +49,20 @@ protected: unsigned NumTombstones; unsigned ItemSize; protected: - explicit StringMapImpl(unsigned itemSize) : ItemSize(itemSize) { - // Initialize the map with zero buckets to allocation. - TheTable = 0; - NumBuckets = 0; - NumItems = 0; - NumTombstones = 0; + explicit StringMapImpl(unsigned itemSize) + : TheTable(nullptr), + // Initialize the map with zero buckets to allocation. + NumBuckets(0), NumItems(0), NumTombstones(0), ItemSize(itemSize) {} + StringMapImpl(StringMapImpl &&RHS) + : TheTable(RHS.TheTable), NumBuckets(RHS.NumBuckets), + NumItems(RHS.NumItems), NumTombstones(RHS.NumTombstones), + ItemSize(RHS.ItemSize) { + RHS.TheTable = nullptr; + RHS.NumBuckets = 0; + RHS.NumItems = 0; + RHS.NumTombstones = 0; } + StringMapImpl(unsigned InitSize, unsigned ItemSize); void RehashTable(); @@ -109,8 +117,8 @@ public: explicit StringMapEntry(unsigned strLen) : StringMapEntryBase(strLen), second() {} - StringMapEntry(unsigned strLen, const ValueTy &V) - : StringMapEntryBase(strLen), second(V) {} + StringMapEntry(unsigned strLen, ValueTy V) + : StringMapEntryBase(strLen), second(std::move(V)) {} StringRef getKey() const { return StringRef(getKeyData(), getKeyLength()); @@ -146,7 +154,7 @@ public: static_cast(Allocator.Allocate(AllocSize,Alignment)); // Default construct the value. - new (NewItem) StringMapEntry(KeyLength, InitVal); + new (NewItem) StringMapEntry(KeyLength, std::move(InitVal)); // Copy the string information. char *StrBuffer = const_cast(NewItem->getKeyData()); @@ -166,7 +174,7 @@ public: static StringMapEntry *Create(const char *KeyStart, const char *KeyEnd, InitType InitVal) { MallocAllocator A; - return Create(KeyStart, KeyEnd, A, InitVal); + return Create(KeyStart, KeyEnd, A, std::move(InitVal)); } static StringMapEntry *Create(const char *KeyStart, const char *KeyEnd) { @@ -198,8 +206,10 @@ public: template void Destroy(AllocatorTy &Allocator) { // Free memory referenced by the item. + unsigned AllocSize = + static_cast(sizeof(StringMapEntry)) + getKeyLength() + 1; this->~StringMapEntry(); - Allocator.Deallocate(this); + Allocator.Deallocate(static_cast(this), AllocSize); } /// Destroy this object, releasing memory back to the malloc allocator. 
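[Note: the StringMapImpl move constructor above, together with the StringMap move operations later in this file's hunks, lets maps be returned by value. A sketch, assuming the usual operator[] interface.]

#include "llvm/ADT/StringMap.h"

llvm::StringMap<int> makeMap() {
  llvm::StringMap<int> M;
  M["one"] = 1; // inserts via GetOrCreateValue, now move-aware
  return M;     // moved out; the local is left empty
}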
@@ -231,23 +241,19 @@ public: : StringMapImpl(InitialSize, static_cast(sizeof(MapEntryTy))), Allocator(A) {} - StringMap(const StringMap &RHS) - : StringMapImpl(static_cast(sizeof(MapEntryTy))) { - assert(RHS.empty() && - "Copy ctor from non-empty stringmap not implemented yet!"); - (void)RHS; - } - void operator=(const StringMap &RHS) { - assert(RHS.empty() && - "assignment from non-empty stringmap not implemented yet!"); - (void)RHS; - clear(); + StringMap(StringMap &&RHS) + : StringMapImpl(std::move(RHS)), Allocator(std::move(RHS.Allocator)) {} + + StringMap &operator=(StringMap RHS) { + StringMapImpl::swap(RHS); + std::swap(Allocator, RHS.Allocator); + return *this; } - typedef typename ReferenceAdder::result AllocatorRefTy; - typedef typename ReferenceAdder::result AllocatorCRefTy; - AllocatorRefTy getAllocator() { return Allocator; } - AllocatorCRefTy getAllocator() const { return Allocator; } + // FIXME: Implement copy operations if/when they're needed. + + AllocatorTy &getAllocator() { return Allocator; } + const AllocatorTy &getAllocator() const { return Allocator; } typedef const char* key_type; typedef ValueTy mapped_type; @@ -330,7 +336,7 @@ public: if (Bucket && Bucket != getTombstoneVal()) { static_cast(Bucket)->Destroy(Allocator); } - Bucket = 0; + Bucket = nullptr; } NumItems = 0; @@ -348,7 +354,7 @@ public: return *static_cast(Bucket); MapEntryTy *NewItem = - MapEntryTy::Create(Key.begin(), Key.end(), Allocator, Val); + MapEntryTy::Create(Key.begin(), Key.end(), Allocator, std::move(Val)); if (Bucket == getTombstoneVal()) --NumTombstones; @@ -410,7 +416,7 @@ protected: public: typedef StringMapEntry value_type; - StringMapConstIterator() : Ptr(0) { } + StringMapConstIterator() : Ptr(nullptr) { } explicit StringMapConstIterator(StringMapEntryBase **Bucket, bool NoAdvance = false) @@ -443,7 +449,7 @@ public: private: void AdvancePastEmptyBuckets() { - while (*Ptr == 0 || *Ptr == StringMapImpl::getTombstoneVal()) + while (*Ptr == nullptr || *Ptr == StringMapImpl::getTombstoneVal()) ++Ptr; } }; diff --git a/include/llvm/ADT/StringRef.h b/include/llvm/ADT/StringRef.h index 0514d7b..1f413e8 100644 --- a/include/llvm/ADT/StringRef.h +++ b/include/llvm/ADT/StringRef.h @@ -10,7 +10,6 @@ #ifndef LLVM_ADT_STRINGREF_H #define LLVM_ADT_STRINGREF_H -#include "llvm/Support/Allocator.h" #include #include #include @@ -70,7 +69,7 @@ namespace llvm { /// @{ /// Construct an empty string ref. - /*implicit*/ StringRef() : Data(0), Length(0) {} + /*implicit*/ StringRef() : Data(nullptr), Length(0) {} /// Construct a string ref from a cstring. /*implicit*/ StringRef(const char *Str) @@ -124,9 +123,9 @@ namespace llvm { return Data[Length-1]; } - // copy - Allocate copy in BumpPtrAllocator and return StringRef to it. - StringRef copy(BumpPtrAllocator &Allocator) { - char *S = Allocator.Allocate(Length); + // copy - Allocate copy in Allocator and return StringRef to it. + template StringRef copy(Allocator &A) { + char *S = A.template Allocate(Length); std::copy(begin(), end(), S); return StringRef(S, Length); } @@ -186,7 +185,7 @@ namespace llvm { /// str - Get the contents as an std::string. 
std::string str() const { - if (Data == 0) return std::string(); + if (!Data) return std::string(); return std::string(Data, Length); } diff --git a/include/llvm/ADT/StringSwitch.h b/include/llvm/ADT/StringSwitch.h index 7fd6e27..0393a0c 100644 --- a/include/llvm/ADT/StringSwitch.h +++ b/include/llvm/ADT/StringSwitch.h @@ -49,7 +49,7 @@ class StringSwitch { public: explicit StringSwitch(StringRef S) - : Str(S), Result(0) { } + : Str(S), Result(nullptr) { } template StringSwitch& Case(const char (&S)[N], const T& Value) { diff --git a/include/llvm/ADT/TinyPtrVector.h b/include/llvm/ADT/TinyPtrVector.h index 1df8d66..5669b2a 100644 --- a/include/llvm/ADT/TinyPtrVector.h +++ b/include/llvm/ADT/TinyPtrVector.h @@ -69,7 +69,7 @@ public: } TinyPtrVector(TinyPtrVector &&RHS) : Val(RHS.Val) { - RHS.Val = (EltTy)0; + RHS.Val = (EltTy)nullptr; } TinyPtrVector &operator=(TinyPtrVector &&RHS) { if (this == &RHS) @@ -92,7 +92,7 @@ public: } Val = RHS.Val; - RHS.Val = (EltTy)0; + RHS.Val = (EltTy)nullptr; return *this; } @@ -174,7 +174,7 @@ public: } void push_back(EltTy NewVal) { - assert(NewVal != 0 && "Can't add a null value"); + assert(NewVal && "Can't add a null value"); // If we have nothing, add something. if (Val.isNull()) { @@ -195,7 +195,7 @@ public: void pop_back() { // If we have a single value, convert to empty. if (Val.template is()) - Val = (EltTy)0; + Val = (EltTy)nullptr; else if (VecTy *Vec = Val.template get()) Vec->pop_back(); } @@ -203,7 +203,7 @@ public: void clear() { // If we have a single value, convert to empty. if (Val.template is()) { - Val = (EltTy)0; + Val = (EltTy)nullptr; } else if (VecTy *Vec = Val.template dyn_cast()) { // If we have a vector form, just clear it. Vec->clear(); @@ -218,7 +218,7 @@ public: // If we have a single value, convert to empty. if (Val.template is()) { if (I == begin()) - Val = (EltTy)0; + Val = (EltTy)nullptr; } else if (VecTy *Vec = Val.template dyn_cast()) { // multiple items in a vector; just do the erase, there is no // benefit to collapsing back to a pointer @@ -234,7 +234,7 @@ public: if (Val.template is()) { if (S == begin() && S != E) - Val = (EltTy)0; + Val = (EltTy)nullptr; } else if (VecTy *Vec = Val.template dyn_cast()) { return Vec->erase(S, E); } diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h index 185003d..95f3380 100644 --- a/include/llvm/ADT/Triple.h +++ b/include/llvm/ADT/Triple.h @@ -48,7 +48,8 @@ public: arm, // ARM (little endian): arm, armv.*, xscale armeb, // ARM (big endian): armeb - arm64, // ARM: arm64 + arm64, // ARM64 (little endian): arm64 + arm64_be, // ARM64 (big endian): arm64_be aarch64, // AArch64 (little endian): aarch64 aarch64_be, // AArch64 (big endian): aarch64_be hexagon, // Hexagon: hexagon @@ -335,6 +336,10 @@ public: return isMacOSX() || isiOS(); } + bool isOSFreeBSD() const { + return getOS() == Triple::FreeBSD; + } + bool isWindowsMSVCEnvironment() const { return getOS() == Triple::Win32 && (getEnvironment() == Triple::UnknownEnvironment || @@ -362,7 +367,7 @@ public: /// \brief Is this a "Windows" OS targeting a "MSVCRT.dll" environment. bool isOSMSVCRT() const { - return getOS() == Triple::Win32 || getOS() == Triple::MinGW32; + return isWindowsMSVCEnvironment() || isWindowsGNUEnvironment(); } /// \brief Tests whether the OS is Windows. 
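[Note: one behavioral point from the Triple.h hunk above: isOSMSVCRT() is now derived from the Windows environment kinds instead of comparing raw OS enums. A sketch of a caller; the function name is illustrative.]

#include "llvm/ADT/Triple.h"

bool usesMSVCRT(llvm::StringRef TripleStr) {
  // True for *-windows-msvc and *-windows-gnu style triples after this
  // change; previously this compared getOS() against Win32/MinGW32.
  return llvm::Triple(TripleStr).isOSMSVCRT();
}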
diff --git a/include/llvm/ADT/Twine.h b/include/llvm/ADT/Twine.h index e16c6b4..a54fd74 100644 --- a/include/llvm/ADT/Twine.h +++ b/include/llvm/ADT/Twine.h @@ -374,7 +374,7 @@ namespace llvm { static Twine utohexstr(const uint64_t &Val) { Child LHS, RHS; LHS.uHex = &Val; - RHS.twine = 0; + RHS.twine = nullptr; return Twine(LHS, UHexKind, RHS, EmptyKind); } diff --git a/include/llvm/ADT/edit_distance.h b/include/llvm/ADT/edit_distance.h index f77ef13..9ee1edc 100644 --- a/include/llvm/ADT/edit_distance.h +++ b/include/llvm/ADT/edit_distance.h @@ -17,8 +17,8 @@ #define LLVM_ADT_EDIT_DISTANCE_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/OwningPtr.h" #include +#include namespace llvm { @@ -57,7 +57,7 @@ unsigned ComputeEditDistance(ArrayRef FromArray, ArrayRef ToArray, const unsigned SmallBufferSize = 64; unsigned SmallBuffer[SmallBufferSize]; - llvm::OwningArrayPtr Allocated; + std::unique_ptr Allocated; unsigned *Previous = SmallBuffer; if (2*(n + 1) > SmallBufferSize) { Previous = new unsigned [2*(n+1)]; diff --git a/include/llvm/ADT/ilist.h b/include/llvm/ADT/ilist.h index 6aeaa91..bc14845 100644 --- a/include/llvm/ADT/ilist.h +++ b/include/llvm/ADT/ilist.h @@ -83,7 +83,7 @@ struct ilist_sentinel_traits { /// provideInitialHead - when constructing an ilist, provide a starting /// value for its Head /// @return null node to indicate that it needs to be allocated later - static NodeTy *provideInitialHead() { return 0; } + static NodeTy *provideInitialHead() { return nullptr; } /// ensureHead - make sure that Head is either already /// initialized or assigned a fresh sentinel @@ -92,7 +92,7 @@ struct ilist_sentinel_traits { if (!Head) { Head = ilist_traits::createSentinel(); ilist_traits::noteHead(Head, Head); - ilist_traits::setNext(Head, 0); + ilist_traits::setNext(Head, nullptr); return Head; } return ilist_traits::getPrev(Head); @@ -175,7 +175,7 @@ public: ilist_iterator(pointer NP) : NodePtr(NP) {} ilist_iterator(reference NR) : NodePtr(&NR) {} - ilist_iterator() : NodePtr(0) {} + ilist_iterator() : NodePtr(nullptr) {} // This is templated so that we can allow constructing a const iterator from // a nonconst iterator... @@ -383,7 +383,7 @@ public: // Miscellaneous inspection routines. size_type max_size() const { return size_type(-1); } bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const { - return Head == 0 || Head == getTail(); + return !Head || Head == getTail(); } // Front and back accessor functions... @@ -451,8 +451,8 @@ public: // an ilist (and potentially deleted) with iterators still pointing at it. // When those iterators are incremented or decremented, they will assert on // the null next/prev pointer instead of "usually working". - this->setNext(Node, 0); - this->setPrev(Node, 0); + this->setNext(Node, nullptr); + this->setPrev(Node, nullptr); return Node; } @@ -494,9 +494,9 @@ private: // Note: we have to be careful about the case when we move the first node // in the list. This node is the list sentinel node and we can't move it. NodeTy *ThisSentinel = getTail(); - setTail(0); + setTail(nullptr); NodeTy *L2Sentinel = L2.getTail(); - L2.setTail(0); + L2.setTail(nullptr); // Remove [first, last) from its old position. NodeTy *First = &*first, *Prev = this->getPrev(First); @@ -537,7 +537,7 @@ public: // size_type LLVM_ATTRIBUTE_UNUSED_RESULT size() const { - if (Head == 0) return 0; // Don't require construction of sentinel if empty. + if (!Head) return 0; // Don't require construction of sentinel if empty. 
return std::distance(begin(), end()); } diff --git a/include/llvm/ADT/ilist_node.h b/include/llvm/ADT/ilist_node.h index 0361244..85aa7a4 100644 --- a/include/llvm/ADT/ilist_node.h +++ b/include/llvm/ADT/ilist_node.h @@ -30,7 +30,7 @@ protected: NodeTy *getPrev() { return Prev; } const NodeTy *getPrev() const { return Prev; } void setPrev(NodeTy *P) { Prev = P; } - ilist_half_node() : Prev(0) {} + ilist_half_node() : Prev(nullptr) {} }; template @@ -48,7 +48,7 @@ class ilist_node : private ilist_half_node { const NodeTy *getNext() const { return Next; } void setNext(NodeTy *N) { Next = N; } protected: - ilist_node() : Next(0) {} + ilist_node() : Next(nullptr) {} public: /// @name Adjacent Node Accessors @@ -60,7 +60,7 @@ public: // Check for sentinel. if (!Prev->getNext()) - return 0; + return nullptr; return Prev; } @@ -71,7 +71,7 @@ public: // Check for sentinel. if (!Prev->getNext()) - return 0; + return nullptr; return Prev; } @@ -82,7 +82,7 @@ public: // Check for sentinel. if (!Next->getNext()) - return 0; + return nullptr; return Next; } @@ -93,7 +93,7 @@ public: // Check for sentinel. if (!Next->getNext()) - return 0; + return nullptr; return Next; } diff --git a/include/llvm/ADT/iterator.h b/include/llvm/ADT/iterator.h new file mode 100644 index 0000000..56041db --- /dev/null +++ b/include/llvm/ADT/iterator.h @@ -0,0 +1,244 @@ +//===- iterator.h - Utilities for using and defining iterators --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ADT_ITERATOR_H +#define LLVM_ADT_ITERATOR_H + +#include +#include + +namespace llvm { + +/// \brief CRTP base class which implements the entire standard iterator facade +/// in terms of a minimal subset of the interface. +/// +/// Use this when it is reasonable to implement most of the iterator +/// functionality in terms of a core subset. If you need special behavior or +/// there are performance implications for this, you may want to override the +/// relevant members instead. +/// +/// Note, one abstraction that this does *not* provide is implementing +/// subtraction in terms of addition by negating the difference. Negation isn't +/// always information preserving, and I can see very reasonable iterator +/// designs where this doesn't work well. It doesn't really force much added +/// boilerplate anyways. +/// +/// Another abstraction that this doesn't provide is implementing increment in +/// terms of addition of one. These aren't equivalent for all iterator +/// categories, and respecting that adds a lot of complexity for little gain. 
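[Note: to illustrate the facade just described: a toy forward iterator needs only ==, *, and += written by hand, and inherits pre/post ++, !=, and -> from the base. Wholly illustrative, not part of the patch.]

#include "llvm/ADT/iterator.h"
#include <cstddef>
#include <iterator>

class counting_iterator
    : public llvm::iterator_facade_base<counting_iterator,
                                        std::forward_iterator_tag, const int> {
  int V;

public:
  explicit counting_iterator(int V) : V(V) {}
  bool operator==(const counting_iterator &RHS) const { return V == RHS.V; }
  const int &operator*() const { return V; }
  counting_iterator &operator+=(std::ptrdiff_t N) {
    V += static_cast<int>(N); // the facade implements ++ in terms of +=
    return *this;
  }
};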
+template +class iterator_facade_base + : public std::iterator { +protected: + enum { + IsRandomAccess = + std::is_base_of::value, + IsBidirectional = + std::is_base_of::value, + }; + +public: + DerivedT operator+(DifferenceTypeT n) const { + static_assert( + IsRandomAccess, + "The '+' operator is only defined for random access iterators."); + DerivedT tmp = *static_cast(this); + tmp += n; + return tmp; + } + friend DerivedT operator+(DifferenceTypeT n, const DerivedT &i) { + static_assert( + IsRandomAccess, + "The '+' operator is only defined for random access iterators."); + return i + n; + } + DerivedT operator-(DifferenceTypeT n) const { + static_assert( + IsRandomAccess, + "The '-' operator is only defined for random access iterators."); + DerivedT tmp = *static_cast(this); + tmp -= n; + return tmp; + } + + DerivedT &operator++() { + return static_cast(this)->operator+=(1); + } + DerivedT operator++(int) { + DerivedT tmp = *static_cast(this); + ++*static_cast(this); + return tmp; + } + DerivedT &operator--() { + static_assert( + IsBidirectional, + "The decrement operator is only defined for bidirectional iterators."); + return static_cast(this)->operator-=(1); + } + DerivedT operator--(int) { + static_assert( + IsBidirectional, + "The decrement operator is only defined for bidirectional iterators."); + DerivedT tmp = *static_cast(this); + --*static_cast(this); + return tmp; + } + + bool operator!=(const DerivedT &RHS) const { + return !static_cast(this)->operator==(RHS); + } + + bool operator>(const DerivedT &RHS) const { + static_assert( + IsRandomAccess, + "Relational operators are only defined for random access iterators."); + return !static_cast(this)->operator<(RHS) && + !static_cast(this)->operator==(RHS); + } + bool operator<=(const DerivedT &RHS) const { + static_assert( + IsRandomAccess, + "Relational operators are only defined for random access iterators."); + return !static_cast(this)->operator>(RHS); + } + bool operator>=(const DerivedT &RHS) const { + static_assert( + IsRandomAccess, + "Relational operators are only defined for random access iterators."); + return !static_cast(this)->operator<(RHS); + } + + PointerT operator->() const { + return &static_cast(this)->operator*(); + } + ReferenceT operator[](DifferenceTypeT n) const { + static_assert(IsRandomAccess, + "Subscripting is only defined for random access iterators."); + return *static_cast(this)->operator+(n); + } +}; + +/// \brief CRTP base class for adapting an iterator to a different type. +/// +/// This class can be used through CRTP to adapt one iterator into another. +/// Typically this is done through providing in the derived class a custom \c +/// operator* implementation. Other methods can be overridden as well. +template < + typename DerivedT, typename WrappedIteratorT, + typename IteratorCategoryT = + typename std::iterator_traits::iterator_category, + typename T = typename std::iterator_traits::value_type, + typename DifferenceTypeT = + typename std::iterator_traits::difference_type, + typename PointerT = T *, typename ReferenceT = T &, + // Don't provide these, they are mostly to act as aliases below. 
+ typename WrappedTraitsT = std::iterator_traits> +class iterator_adaptor_base + : public iterator_facade_base { + typedef typename iterator_adaptor_base::iterator_facade_base BaseT; + +protected: + WrappedIteratorT I; + + iterator_adaptor_base() {} + + template + explicit iterator_adaptor_base( + U &&u, + typename std::enable_if< + !std::is_base_of::type>::type, + DerivedT>::value, + int>::type = 0) + : I(std::forward(u)) {} + +public: + typedef DifferenceTypeT difference_type; + + DerivedT &operator+=(difference_type n) { + static_assert( + BaseT::IsRandomAccess, + "The '+=' operator is only defined for random access iterators."); + I += n; + return *static_cast(this); + } + DerivedT &operator-=(difference_type n) { + static_assert( + BaseT::IsRandomAccess, + "The '-=' operator is only defined for random access iterators."); + I -= n; + return *static_cast(this); + } + using BaseT::operator-; + difference_type operator-(const DerivedT &RHS) const { + static_assert( + BaseT::IsRandomAccess, + "The '-' operator is only defined for random access iterators."); + return I - RHS.I; + } + + // We have to explicitly provide ++ and -- rather than letting the facade + // forward to += because WrappedIteratorT might not support +=. + using BaseT::operator++; + DerivedT &operator++() { + ++I; + return *static_cast(this); + } + using BaseT::operator--; + DerivedT &operator--() { + static_assert( + BaseT::IsBidirectional, + "The decrement operator is only defined for bidirectional iterators."); + --I; + return *static_cast(this); + } + + bool operator==(const DerivedT &RHS) const { return I == RHS.I; } + bool operator<(const DerivedT &RHS) const { + static_assert( + BaseT::IsRandomAccess, + "Relational operators are only defined for random access iterators."); + return I < RHS.I; + } + + ReferenceT operator*() const { return *I; } +}; + +/// \brief An iterator type that allows iterating over the pointees via some +/// other iterator. +/// +/// The typical usage of this is to expose a type that iterates over Ts, but +/// which is implemented with some iterator over T*s: +/// +/// \code +/// typedef pointee_iterator::iterator> iterator; +/// \endcode +template ())>::type> +struct pointee_iterator + : iterator_adaptor_base< + pointee_iterator, WrappedIteratorT, + typename std::iterator_traits::iterator_category, + T> { + pointee_iterator() {} + template + pointee_iterator(U &&u) + : pointee_iterator::iterator_adaptor_base(std::forward(u)) {} + + T &operator*() const { return **this->I; } +}; + +} + +#endif diff --git a/include/llvm/ADT/iterator_range.h b/include/llvm/ADT/iterator_range.h index 4f2f321..dd17d6c 100644 --- a/include/llvm/ADT/iterator_range.h +++ b/include/llvm/ADT/iterator_range.h @@ -40,6 +40,14 @@ public: IteratorT begin() const { return begin_iterator; } IteratorT end() const { return end_iterator; } }; + +/// \brief Convenience function for iterating over sub-ranges. +/// +/// This provides a bit of syntactic sugar to make using sub-ranges +/// in for loops a bit easier. Analogous to std::make_pair(). 
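[Note: tying the two new headers together, per the \code sample above: iterate over the pointees of a pointer vector, packaged as a range with the make_range helper defined just below. Function and typedef names are illustrative.]

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"

int sum(llvm::SmallVectorImpl<int *> &Ptrs) {
  typedef llvm::pointee_iterator<llvm::SmallVectorImpl<int *>::iterator> it;
  int S = 0;
  for (int &I : llvm::make_range(it(Ptrs.begin()), it(Ptrs.end())))
    S += I; // each element is dereferenced through the stored pointer
  return S;
}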
diff --git a/include/llvm/ADT/iterator_range.h b/include/llvm/ADT/iterator_range.h
index 4f2f321..dd17d6c 100644
--- a/include/llvm/ADT/iterator_range.h
+++ b/include/llvm/ADT/iterator_range.h
@@ -40,6 +40,14 @@ public:
   IteratorT begin() const { return begin_iterator; }
   IteratorT end() const { return end_iterator; }
 };
+
+/// \brief Convenience function for iterating over sub-ranges.
+///
+/// This provides a bit of syntactic sugar to make using sub-ranges
+/// in for loops a bit easier. Analogous to std::make_pair().
+template <class T> iterator_range<T> make_range(T x, T y) {
+  return iterator_range<T>(std::move(x), std::move(y));
+}
 }
 
 #endif
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index a06a562..8852866 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -75,7 +75,7 @@ protected:
 public:
   static char ID; // Class identification, replacement for typeinfo
-  AliasAnalysis() : DL(0), TLI(0), AA(0) {}
+  AliasAnalysis() : DL(nullptr), TLI(nullptr), AA(nullptr) {}
   virtual ~AliasAnalysis();  // We want to be subclassed
 
   /// UnknownSize - This is a special value which can be used with the
@@ -116,8 +116,8 @@ public:
     /// the location, or null if there is no known unique tag.
     const MDNode *TBAATag;
 
-    explicit Location(const Value *P = 0, uint64_t S = UnknownSize,
-                      const MDNode *N = 0)
+    explicit Location(const Value *P = nullptr, uint64_t S = UnknownSize,
+                      const MDNode *N = nullptr)
       : Ptr(P), Size(S), TBAATag(N) {}
 
     Location getWithNewPtr(const Value *NewPtr) const {
@@ -134,7 +134,7 @@ public:
 
     Location getWithoutTBAATag() const {
       Location Copy(*this);
-      Copy.TBAATag = 0;
+      Copy.TBAATag = nullptr;
       return Copy;
     }
   };
@@ -560,12 +560,12 @@ struct DenseMapInfo<AliasAnalysis::Location> {
   static inline AliasAnalysis::Location getEmptyKey() {
     return AliasAnalysis::Location(DenseMapInfo<const Value *>::getEmptyKey(),
-                                   0, 0);
+                                   0, nullptr);
   }
   static inline AliasAnalysis::Location getTombstoneKey() {
     return
       AliasAnalysis::Location(DenseMapInfo<const Value *>::getTombstoneKey(),
-                              0, 0);
+                              0, nullptr);
   }
   static unsigned getHashValue(const AliasAnalysis::Location &Val) {
     return DenseMapInfo<const Value *>::getHashValue(Val.Ptr) ^
diff --git a/include/llvm/Analysis/AliasSetTracker.h b/include/llvm/Analysis/AliasSetTracker.h
index 72e75ec..6117d91 100644
--- a/include/llvm/Analysis/AliasSetTracker.h
+++ b/include/llvm/Analysis/AliasSetTracker.h
@@ -43,13 +43,13 @@ class AliasSet : public ilist_node<AliasSet> {
     const MDNode *TBAAInfo;
   public:
     PointerRec(Value *V)
-      : Val(V), PrevInList(0), NextInList(0), AS(0), Size(0),
+      : Val(V), PrevInList(nullptr), NextInList(nullptr), AS(nullptr), Size(0),
         TBAAInfo(DenseMapInfo<const MDNode *>::getEmptyKey()) {}
 
     Value *getValue() const { return Val; }
 
     PointerRec *getNext() const { return NextInList; }
-    bool hasAliasSet() const { return AS != 0; }
+    bool hasAliasSet() const { return AS != nullptr; }
 
     PointerRec** setPrevInList(PointerRec **PIL) {
       PrevInList = PIL;
@@ -75,7 +75,7 @@ class AliasSet : public ilist_node<AliasSet> {
       // If we have missing or conflicting TBAAInfo, return null.
if (TBAAInfo == DenseMapInfo::getEmptyKey() || TBAAInfo == DenseMapInfo::getTombstoneKey()) - return 0; + return nullptr; return TBAAInfo; } @@ -91,7 +91,7 @@ class AliasSet : public ilist_node { } void setAliasSet(AliasSet *as) { - assert(AS == 0 && "Already have an alias set!"); + assert(!AS && "Already have an alias set!"); AS = as; } @@ -100,7 +100,7 @@ class AliasSet : public ilist_node { *PrevInList = NextInList; if (AS->PtrListEnd == &NextInList) { AS->PtrListEnd = PrevInList; - assert(*AS->PtrListEnd == 0 && "List not terminated right!"); + assert(*AS->PtrListEnd == nullptr && "List not terminated right!"); } delete this; } @@ -174,7 +174,7 @@ public: class iterator; iterator begin() const { return iterator(PtrList); } iterator end() const { return iterator(); } - bool empty() const { return PtrList == 0; } + bool empty() const { return PtrList == nullptr; } void print(raw_ostream &OS) const; void dump() const; @@ -184,7 +184,7 @@ public: PointerRec, ptrdiff_t> { PointerRec *CurNode; public: - explicit iterator(PointerRec *CN = 0) : CurNode(CN) {} + explicit iterator(PointerRec *CN = nullptr) : CurNode(CN) {} bool operator==(const iterator& x) const { return CurNode == x.CurNode; @@ -220,8 +220,9 @@ private: // Can only be created by AliasSetTracker. Also, ilist creates one // to serve as a sentinel. friend struct ilist_sentinel_traits; - AliasSet() : PtrList(0), PtrListEnd(&PtrList), Forward(0), RefCount(0), - AccessTy(NoModRef), AliasTy(MustAlias), Volatile(false) { + AliasSet() + : PtrList(nullptr), PtrListEnd(&PtrList), Forward(nullptr), RefCount(0), + AccessTy(NoModRef), AliasTy(MustAlias), Volatile(false) { } AliasSet(const AliasSet &AS) LLVM_DELETED_FUNCTION; @@ -285,7 +286,7 @@ class AliasSetTracker { void deleted() override; void allUsesReplacedWith(Value *) override; public: - ASTCallbackVH(Value *V, AliasSetTracker *AST = 0); + ASTCallbackVH(Value *V, AliasSetTracker *AST = nullptr); ASTCallbackVH &operator=(Value *V); }; /// ASTCallbackVHDenseMapInfo - Traits to tell DenseMap that tell us how to @@ -354,7 +355,7 @@ public: /// pointer didn't alias anything). AliasSet &getAliasSetForPointer(Value *P, uint64_t Size, const MDNode *TBAAInfo, - bool *New = 0); + bool *New = nullptr); /// getAliasSetForPointerIfExists - Return the alias set containing the /// location specified if one exists, otherwise return null. @@ -408,7 +409,7 @@ private: // entry for the pointer if it doesn't already exist. AliasSet::PointerRec &getEntryFor(Value *V) { AliasSet::PointerRec *&Entry = PointerMap[ASTCallbackVH(V, this)]; - if (Entry == 0) + if (!Entry) Entry = new AliasSet::PointerRec(V); return *Entry; } diff --git a/include/llvm/Analysis/BlockFrequencyImpl.h b/include/llvm/Analysis/BlockFrequencyImpl.h deleted file mode 100644 index 5488847..0000000 --- a/include/llvm/Analysis/BlockFrequencyImpl.h +++ /dev/null @@ -1,379 +0,0 @@ -//===-- BlockFrequencyImpl.h - Block Frequency Implementation --*- C++ -*--===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Shared implementation of BlockFrequency for IR and Machine Instructions. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ANALYSIS_BLOCKFREQUENCYIMPL_H -#define LLVM_ANALYSIS_BLOCKFREQUENCYIMPL_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/Support/BlockFrequency.h" -#include "llvm/Support/BranchProbability.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include -#include - -namespace llvm { - - -class BlockFrequencyInfo; -class MachineBlockFrequencyInfo; - -/// BlockFrequencyImpl implements block frequency algorithm for IR and -/// Machine Instructions. Algorithm starts with value ENTRY_FREQ -/// for the entry block and then propagates frequencies using branch weights -/// from (Machine)BranchProbabilityInfo. LoopInfo is not required because -/// algorithm can find "backedges" by itself. -template -class BlockFrequencyImpl { - - DenseMap Freqs; - - BlockProbInfoT *BPI; - - FunctionT *Fn; - - typedef GraphTraits< Inverse > GT; - - static const uint64_t EntryFreq = 1 << 14; - - std::string getBlockName(BasicBlock *BB) const { - return BB->getName().str(); - } - - std::string getBlockName(MachineBasicBlock *MBB) const { - std::string str; - raw_string_ostream ss(str); - ss << "BB#" << MBB->getNumber(); - - if (const BasicBlock *BB = MBB->getBasicBlock()) - ss << " derived from LLVM BB " << BB->getName(); - - return ss.str(); - } - - void setBlockFreq(BlockT *BB, BlockFrequency Freq) { - Freqs[BB] = Freq; - DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") = "; - printBlockFreq(dbgs(), Freq) << "\n"); - } - - /// getEdgeFreq - Return edge frequency based on SRC frequency and Src -> Dst - /// edge probability. - BlockFrequency getEdgeFreq(BlockT *Src, BlockT *Dst) const { - BranchProbability Prob = BPI->getEdgeProbability(Src, Dst); - return getBlockFreq(Src) * Prob; - } - - /// incBlockFreq - Increase BB block frequency by FREQ. - /// - void incBlockFreq(BlockT *BB, BlockFrequency Freq) { - Freqs[BB] += Freq; - DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") += "; - printBlockFreq(dbgs(), Freq) << " --> "; - printBlockFreq(dbgs(), Freqs[BB]) << "\n"); - } - - // All blocks in postorder. - std::vector POT; - - // Map Block -> Position in reverse-postorder list. - DenseMap RPO; - - // For each loop header, record the per-iteration probability of exiting the - // loop. This is the reciprocal of the expected number of loop iterations. - typedef DenseMap LoopExitProbMap; - LoopExitProbMap LoopExitProb; - - // (reverse-)postorder traversal iterators. - typedef typename std::vector::iterator pot_iterator; - typedef typename std::vector::reverse_iterator rpot_iterator; - - pot_iterator pot_begin() { return POT.begin(); } - pot_iterator pot_end() { return POT.end(); } - - rpot_iterator rpot_begin() { return POT.rbegin(); } - rpot_iterator rpot_end() { return POT.rend(); } - - rpot_iterator rpot_at(BlockT *BB) { - rpot_iterator I = rpot_begin(); - unsigned idx = RPO.lookup(BB); - assert(idx); - std::advance(I, idx - 1); - - assert(*I == BB); - return I; - } - - /// isBackedge - Return if edge Src -> Dst is a reachable backedge. 
- /// - bool isBackedge(BlockT *Src, BlockT *Dst) const { - unsigned a = RPO.lookup(Src); - if (!a) - return false; - unsigned b = RPO.lookup(Dst); - assert(b && "Destination block should be reachable"); - return a >= b; - } - - /// getSingleBlockPred - return single BB block predecessor or NULL if - /// BB has none or more predecessors. - BlockT *getSingleBlockPred(BlockT *BB) { - typename GT::ChildIteratorType - PI = GraphTraits< Inverse >::child_begin(BB), - PE = GraphTraits< Inverse >::child_end(BB); - - if (PI == PE) - return 0; - - BlockT *Pred = *PI; - - ++PI; - if (PI != PE) - return 0; - - return Pred; - } - - void doBlock(BlockT *BB, BlockT *LoopHead, - SmallPtrSet &BlocksInLoop) { - - DEBUG(dbgs() << "doBlock(" << getBlockName(BB) << ")\n"); - setBlockFreq(BB, 0); - - if (BB == LoopHead) { - setBlockFreq(BB, EntryFreq); - return; - } - - if (BlockT *Pred = getSingleBlockPred(BB)) { - if (BlocksInLoop.count(Pred)) - setBlockFreq(BB, getEdgeFreq(Pred, BB)); - // TODO: else? irreducible, ignore it for now. - return; - } - - bool isInLoop = false; - bool isLoopHead = false; - - for (typename GT::ChildIteratorType - PI = GraphTraits< Inverse >::child_begin(BB), - PE = GraphTraits< Inverse >::child_end(BB); - PI != PE; ++PI) { - BlockT *Pred = *PI; - - if (isBackedge(Pred, BB)) { - isLoopHead = true; - } else if (BlocksInLoop.count(Pred)) { - incBlockFreq(BB, getEdgeFreq(Pred, BB)); - isInLoop = true; - } - // TODO: else? irreducible. - } - - if (!isInLoop) - return; - - if (!isLoopHead) - return; - - // This block is a loop header, so boost its frequency by the expected - // number of loop iterations. The loop blocks will be revisited so they all - // get this boost. - typename LoopExitProbMap::const_iterator I = LoopExitProb.find(BB); - assert(I != LoopExitProb.end() && "Loop header missing from table"); - Freqs[BB] /= I->second; - DEBUG(dbgs() << "Loop header scaled to "; - printBlockFreq(dbgs(), Freqs[BB]) << ".\n"); - } - - /// doLoop - Propagate block frequency down through the loop. - void doLoop(BlockT *Head, BlockT *Tail) { - DEBUG(dbgs() << "doLoop(" << getBlockName(Head) << ", " - << getBlockName(Tail) << ")\n"); - - SmallPtrSet BlocksInLoop; - - for (rpot_iterator I = rpot_at(Head), E = rpot_at(Tail); ; ++I) { - BlockT *BB = *I; - doBlock(BB, Head, BlocksInLoop); - - BlocksInLoop.insert(BB); - if (I == E) - break; - } - - // Compute loop's cyclic probability using backedges probabilities. - BlockFrequency BackFreq; - for (typename GT::ChildIteratorType - PI = GraphTraits< Inverse >::child_begin(Head), - PE = GraphTraits< Inverse >::child_end(Head); - PI != PE; ++PI) { - BlockT *Pred = *PI; - assert(Pred); - if (isBackedge(Pred, Head)) - BackFreq += getEdgeFreq(Pred, Head); - } - - // The cyclic probability is freq(BackEdges) / freq(Head), where freq(Head) - // only counts edges entering the loop, not the loop backedges. - // The probability of leaving the loop on each iteration is: - // - // ExitProb = 1 - CyclicProb - // - // The Expected number of loop iterations is: - // - // Iterations = 1 / ExitProb - // - uint64_t D = std::max(getBlockFreq(Head).getFrequency(), UINT64_C(1)); - uint64_t N = std::max(BackFreq.getFrequency(), UINT64_C(1)); - if (N < D) - N = D - N; - else - // We'd expect N < D, but rounding and saturation means that can't be - // guaranteed. - N = 1; - - // Now ExitProb = N / D, make sure it fits in an i32/i32 fraction. 
- assert(N <= D); - if (D > UINT32_MAX) { - unsigned Shift = 32 - countLeadingZeros(D); - D >>= Shift; - N >>= Shift; - if (N == 0) - N = 1; - } - BranchProbability LEP = BranchProbability(N, D); - LoopExitProb.insert(std::make_pair(Head, LEP)); - DEBUG(dbgs() << "LoopExitProb[" << getBlockName(Head) << "] = " << LEP - << " from 1 - "; - printBlockFreq(dbgs(), BackFreq) << " / "; - printBlockFreq(dbgs(), getBlockFreq(Head)) << ".\n"); - } - - friend class BlockFrequencyInfo; - friend class MachineBlockFrequencyInfo; - - BlockFrequencyImpl() { } - - void doFunction(FunctionT *fn, BlockProbInfoT *bpi) { - Fn = fn; - BPI = bpi; - - // Clear everything. - RPO.clear(); - POT.clear(); - LoopExitProb.clear(); - Freqs.clear(); - - BlockT *EntryBlock = fn->begin(); - - std::copy(po_begin(EntryBlock), po_end(EntryBlock), std::back_inserter(POT)); - - unsigned RPOidx = 0; - for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) { - BlockT *BB = *I; - RPO[BB] = ++RPOidx; - DEBUG(dbgs() << "RPO[" << getBlockName(BB) << "] = " << RPO[BB] << "\n"); - } - - // Travel over all blocks in postorder. - for (pot_iterator I = pot_begin(), E = pot_end(); I != E; ++I) { - BlockT *BB = *I; - BlockT *LastTail = 0; - DEBUG(dbgs() << "POT: " << getBlockName(BB) << "\n"); - - for (typename GT::ChildIteratorType - PI = GraphTraits< Inverse >::child_begin(BB), - PE = GraphTraits< Inverse >::child_end(BB); - PI != PE; ++PI) { - - BlockT *Pred = *PI; - if (isBackedge(Pred, BB) && (!LastTail || RPO[Pred] > RPO[LastTail])) - LastTail = Pred; - } - - if (LastTail) - doLoop(BB, LastTail); - } - - // At the end assume the whole function as a loop, and travel over it once - // again. - doLoop(*(rpot_begin()), *(pot_begin())); - } - -public: - - uint64_t getEntryFreq() { return EntryFreq; } - - /// getBlockFreq - Return block frequency. Return 0 if we don't have it. - BlockFrequency getBlockFreq(const BlockT *BB) const { - typename DenseMap::const_iterator - I = Freqs.find(BB); - if (I != Freqs.end()) - return I->second; - return 0; - } - - void print(raw_ostream &OS) const { - OS << "\n\n---- Block Freqs ----\n"; - for (typename FunctionT::iterator I = Fn->begin(), E = Fn->end(); I != E;) { - BlockT *BB = I++; - OS << " " << getBlockName(BB) << " = "; - printBlockFreq(OS, getBlockFreq(BB)) << "\n"; - - for (typename GraphTraits::ChildIteratorType - SI = GraphTraits::child_begin(BB), - SE = GraphTraits::child_end(BB); SI != SE; ++SI) { - BlockT *Succ = *SI; - OS << " " << getBlockName(BB) << " -> " << getBlockName(Succ) - << " = "; printBlockFreq(OS, getEdgeFreq(BB, Succ)) << "\n"; - } - } - } - - void dump() const { - print(dbgs()); - } - - // Utility method that looks up the block frequency associated with BB and - // prints it to OS. - raw_ostream &printBlockFreq(raw_ostream &OS, - const BlockT *BB) { - return printBlockFreq(OS, getBlockFreq(BB)); - } - - raw_ostream &printBlockFreq(raw_ostream &OS, - const BlockFrequency &Freq) const { - // Convert fixed-point number to decimal. 
- uint64_t Frequency = Freq.getFrequency(); - OS << Frequency / EntryFreq << "."; - uint64_t Rem = Frequency % EntryFreq; - uint64_t Eps = 1; - do { - Rem *= 10; - Eps *= 10; - OS << Rem / EntryFreq; - Rem = Rem % EntryFreq; - } while (Rem >= Eps/2); - return OS; - } - -}; - -} - -#endif diff --git a/include/llvm/Analysis/BlockFrequencyInfo.h b/include/llvm/Analysis/BlockFrequencyInfo.h index 2f701d9..3289a28 100644 --- a/include/llvm/Analysis/BlockFrequencyInfo.h +++ b/include/llvm/Analysis/BlockFrequencyInfo.h @@ -1,4 +1,4 @@ -//===------- BlockFrequencyInfo.h - Block Frequency Analysis --*- C++ -*---===// +//===- BlockFrequencyInfo.h - Block Frequency Analysis ----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -21,14 +21,12 @@ namespace llvm { class BranchProbabilityInfo; -template -class BlockFrequencyImpl; +template class BlockFrequencyInfoImpl; -/// BlockFrequencyInfo pass uses BlockFrequencyImpl implementation to estimate -/// IR basic block frequencies. +/// BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to +/// estimate IR basic block frequencies. class BlockFrequencyInfo : public FunctionPass { - typedef BlockFrequencyImpl - ImplType; + typedef BlockFrequencyInfoImpl ImplType; std::unique_ptr BFI; public: diff --git a/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/include/llvm/Analysis/BlockFrequencyInfoImpl.h new file mode 100644 index 0000000..bd72d3e --- /dev/null +++ b/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -0,0 +1,1859 @@ +//==- BlockFrequencyInfoImpl.h - Block Frequency Implementation -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Shared implementation of BlockFrequency for IR and Machine Instructions. +// See the documentation below for BlockFrequencyInfoImpl for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H +#define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/Support/BlockFrequency.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include + +#define DEBUG_TYPE "block-freq" + +//===----------------------------------------------------------------------===// +// +// UnsignedFloat definition. +// +// TODO: Make this private to BlockFrequencyInfoImpl or delete. 
+// +//===----------------------------------------------------------------------===// +namespace llvm { + +class UnsignedFloatBase { +public: + static const int32_t MaxExponent = 16383; + static const int32_t MinExponent = -16382; + static const int DefaultPrecision = 10; + + static void dump(uint64_t D, int16_t E, int Width); + static raw_ostream &print(raw_ostream &OS, uint64_t D, int16_t E, int Width, + unsigned Precision); + static std::string toString(uint64_t D, int16_t E, int Width, + unsigned Precision); + static int countLeadingZeros32(uint32_t N) { return countLeadingZeros(N); } + static int countLeadingZeros64(uint64_t N) { return countLeadingZeros(N); } + static uint64_t getHalf(uint64_t N) { return (N >> 1) + (N & 1); } + + static std::pair splitSigned(int64_t N) { + if (N >= 0) + return std::make_pair(N, false); + uint64_t Unsigned = N == INT64_MIN ? UINT64_C(1) << 63 : uint64_t(-N); + return std::make_pair(Unsigned, true); + } + static int64_t joinSigned(uint64_t U, bool IsNeg) { + if (U > uint64_t(INT64_MAX)) + return IsNeg ? INT64_MIN : INT64_MAX; + return IsNeg ? -int64_t(U) : int64_t(U); + } + + static int32_t extractLg(const std::pair &Lg) { + return Lg.first; + } + static int32_t extractLgFloor(const std::pair &Lg) { + return Lg.first - (Lg.second > 0); + } + static int32_t extractLgCeiling(const std::pair &Lg) { + return Lg.first + (Lg.second < 0); + } + + static std::pair divide64(uint64_t L, uint64_t R); + static std::pair multiply64(uint64_t L, uint64_t R); + + static int compare(uint64_t L, uint64_t R, int Shift) { + assert(Shift >= 0); + assert(Shift < 64); + + uint64_t L_adjusted = L >> Shift; + if (L_adjusted < R) + return -1; + if (L_adjusted > R) + return 1; + + return L > L_adjusted << Shift ? 1 : 0; + } +}; + +/// \brief Simple representation of an unsigned floating point. +/// +/// UnsignedFloat is a unsigned floating point number. It uses simple +/// saturation arithmetic, and every operation is well-defined for every value. +/// +/// The number is split into a signed exponent and unsigned digits. The number +/// represented is \c getDigits()*2^getExponent(). In this way, the digits are +/// much like the mantissa in the x87 long double, but there is no canonical +/// form, so the same number can be represented by many bit representations +/// (it's always in "denormal" mode). +/// +/// UnsignedFloat is templated on the underlying integer type for digits, which +/// is expected to be one of uint64_t, uint32_t, uint16_t or uint8_t. +/// +/// Unlike builtin floating point types, UnsignedFloat is portable. +/// +/// Unlike APFloat, UnsignedFloat does not model architecture floating point +/// behaviour (this should make it a little faster), and implements most +/// operators (this makes it usable). +/// +/// UnsignedFloat is totally ordered. However, there is no canonical form, so +/// there are multiple representations of most scalars. E.g.: +/// +/// UnsignedFloat(8u, 0) == UnsignedFloat(4u, 1) +/// UnsignedFloat(4u, 1) == UnsignedFloat(2u, 2) +/// UnsignedFloat(2u, 2) == UnsignedFloat(1u, 3) +/// +/// UnsignedFloat implements most arithmetic operations. Precision is kept +/// where possible. Uses simple saturation arithmetic, so that operations +/// saturate to 0.0 or getLargest() rather than under or overflowing. It has +/// some extra arithmetic for unit inversion. 0.0/0.0 is defined to be 0.0. +/// Any other division by 0.0 is defined to be getLargest(). 
+/// +/// As a convenience for modifying the exponent, left and right shifting are +/// both implemented, and both interpret negative shifts as positive shifts in +/// the opposite direction. +/// +/// Exponents are limited to the range accepted by x87 long double. This makes +/// it trivial to add functionality to convert to APFloat (this is already +/// relied on for the implementation of printing). +/// +/// The current plan is to gut this and make the necessary parts of it (even +/// more) private to BlockFrequencyInfo. +template class UnsignedFloat : UnsignedFloatBase { +public: + static_assert(!std::numeric_limits::is_signed, + "only unsigned floats supported"); + + typedef DigitsT DigitsType; + +private: + typedef std::numeric_limits DigitsLimits; + + static const int Width = sizeof(DigitsType) * 8; + static_assert(Width <= 64, "invalid integer width for digits"); + +private: + DigitsType Digits; + int16_t Exponent; + +public: + UnsignedFloat() : Digits(0), Exponent(0) {} + + UnsignedFloat(DigitsType Digits, int16_t Exponent) + : Digits(Digits), Exponent(Exponent) {} + +private: + UnsignedFloat(const std::pair &X) + : Digits(X.first), Exponent(X.second) {} + +public: + static UnsignedFloat getZero() { return UnsignedFloat(0, 0); } + static UnsignedFloat getOne() { return UnsignedFloat(1, 0); } + static UnsignedFloat getLargest() { + return UnsignedFloat(DigitsLimits::max(), MaxExponent); + } + static UnsignedFloat getFloat(uint64_t N) { return adjustToWidth(N, 0); } + static UnsignedFloat getInverseFloat(uint64_t N) { + return getFloat(N).invert(); + } + static UnsignedFloat getFraction(DigitsType N, DigitsType D) { + return getQuotient(N, D); + } + + int16_t getExponent() const { return Exponent; } + DigitsType getDigits() const { return Digits; } + + /// \brief Convert to the given integer type. + /// + /// Convert to \c IntT using simple saturating arithmetic, truncating if + /// necessary. + template IntT toInt() const; + + bool isZero() const { return !Digits; } + bool isLargest() const { return *this == getLargest(); } + bool isOne() const { + if (Exponent > 0 || Exponent <= -Width) + return false; + return Digits == DigitsType(1) << -Exponent; + } + + /// \brief The log base 2, rounded. + /// + /// Get the lg of the scalar. lg 0 is defined to be INT32_MIN. + int32_t lg() const { return extractLg(lgImpl()); } + + /// \brief The log base 2, rounded towards INT32_MIN. + /// + /// Get the lg floor. lg 0 is defined to be INT32_MIN. + int32_t lgFloor() const { return extractLgFloor(lgImpl()); } + + /// \brief The log base 2, rounded towards INT32_MAX. + /// + /// Get the lg ceiling. lg 0 is defined to be INT32_MIN. + int32_t lgCeiling() const { return extractLgCeiling(lgImpl()); } + + bool operator==(const UnsignedFloat &X) const { return compare(X) == 0; } + bool operator<(const UnsignedFloat &X) const { return compare(X) < 0; } + bool operator!=(const UnsignedFloat &X) const { return compare(X) != 0; } + bool operator>(const UnsignedFloat &X) const { return compare(X) > 0; } + bool operator<=(const UnsignedFloat &X) const { return compare(X) <= 0; } + bool operator>=(const UnsignedFloat &X) const { return compare(X) >= 0; } + + bool operator!() const { return isZero(); } + + /// \brief Convert to a decimal representation in a string. + /// + /// Convert to a string. Uses scientific notation for very large/small + /// numbers. Scientific notation is used roughly for numbers outside of the + /// range 2^-64 through 2^64. 
+ /// + /// \c Precision indicates the number of decimal digits of precision to use; + /// 0 requests the maximum available. + /// + /// As a special case to make debugging easier, if the number is small enough + /// to convert without scientific notation and has more than \c Precision + /// digits before the decimal place, it's printed accurately to the first + /// digit past zero. E.g., assuming 10 digits of precision: + /// + /// 98765432198.7654... => 98765432198.8 + /// 8765432198.7654... => 8765432198.8 + /// 765432198.7654... => 765432198.8 + /// 65432198.7654... => 65432198.77 + /// 5432198.7654... => 5432198.765 + std::string toString(unsigned Precision = DefaultPrecision) { + return UnsignedFloatBase::toString(Digits, Exponent, Width, Precision); + } + + /// \brief Print a decimal representation. + /// + /// Print a string. See toString for documentation. + raw_ostream &print(raw_ostream &OS, + unsigned Precision = DefaultPrecision) const { + return UnsignedFloatBase::print(OS, Digits, Exponent, Width, Precision); + } + void dump() const { return UnsignedFloatBase::dump(Digits, Exponent, Width); } + + UnsignedFloat &operator+=(const UnsignedFloat &X); + UnsignedFloat &operator-=(const UnsignedFloat &X); + UnsignedFloat &operator*=(const UnsignedFloat &X); + UnsignedFloat &operator/=(const UnsignedFloat &X); + UnsignedFloat &operator<<=(int16_t Shift) { shiftLeft(Shift); return *this; } + UnsignedFloat &operator>>=(int16_t Shift) { shiftRight(Shift); return *this; } + +private: + void shiftLeft(int32_t Shift); + void shiftRight(int32_t Shift); + + /// \brief Adjust two floats to have matching exponents. + /// + /// Adjust \c this and \c X to have matching exponents. Returns the new \c X + /// by value. Does nothing if \a isZero() for either. + /// + /// The value that compares smaller will lose precision, and possibly become + /// \a isZero(). + UnsignedFloat matchExponents(UnsignedFloat X); + + /// \brief Increase exponent to match another float. + /// + /// Increases \c this to have an exponent matching \c X. May decrease the + /// exponent of \c X in the process, and \c this may possibly become \a + /// isZero(). + void increaseExponentToMatch(UnsignedFloat &X, int32_t ExponentDiff); + +public: + /// \brief Scale a large number accurately. + /// + /// Scale N (multiply it by this). Uses full precision multiplication, even + /// if Width is smaller than 64, so information is not lost. + uint64_t scale(uint64_t N) const; + uint64_t scaleByInverse(uint64_t N) const { + // TODO: implement directly, rather than relying on inverse. Inverse is + // expensive. + return inverse().scale(N); + } + int64_t scale(int64_t N) const { + std::pair Unsigned = splitSigned(N); + return joinSigned(scale(Unsigned.first), Unsigned.second); + } + int64_t scaleByInverse(int64_t N) const { + std::pair Unsigned = splitSigned(N); + return joinSigned(scaleByInverse(Unsigned.first), Unsigned.second); + } + + int compare(const UnsignedFloat &X) const; + int compareTo(uint64_t N) const { + UnsignedFloat Float = getFloat(N); + int Compare = compare(Float); + if (Width == 64 || Compare != 0) + return Compare; + + // Check for precision loss. We know *this == RoundTrip. + uint64_t RoundTrip = Float.template toInt(); + return N == RoundTrip ? 0 : RoundTrip < N ? -1 : 1; + } + int compareTo(int64_t N) const { return N < 0 ? 
1 : compareTo(uint64_t(N)); } + + UnsignedFloat &invert() { return *this = UnsignedFloat::getFloat(1) / *this; } + UnsignedFloat inverse() const { return UnsignedFloat(*this).invert(); } + +private: + static UnsignedFloat getProduct(DigitsType L, DigitsType R); + static UnsignedFloat getQuotient(DigitsType Dividend, DigitsType Divisor); + + std::pair lgImpl() const; + static int countLeadingZerosWidth(DigitsType Digits) { + if (Width == 64) + return countLeadingZeros64(Digits); + if (Width == 32) + return countLeadingZeros32(Digits); + return countLeadingZeros32(Digits) + Width - 32; + } + + static UnsignedFloat adjustToWidth(uint64_t N, int32_t S) { + assert(S >= MinExponent); + assert(S <= MaxExponent); + if (Width == 64 || N <= DigitsLimits::max()) + return UnsignedFloat(N, S); + + // Shift right. + int Shift = 64 - Width - countLeadingZeros64(N); + DigitsType Shifted = N >> Shift; + + // Round. + assert(S + Shift <= MaxExponent); + return getRounded(UnsignedFloat(Shifted, S + Shift), + N & UINT64_C(1) << (Shift - 1)); + } + + static UnsignedFloat getRounded(UnsignedFloat P, bool Round) { + if (!Round) + return P; + if (P.Digits == DigitsLimits::max()) + // Careful of overflow in the exponent. + return UnsignedFloat(1, P.Exponent) <<= Width; + return UnsignedFloat(P.Digits + 1, P.Exponent); + } +}; + +#define UNSIGNED_FLOAT_BOP(op, base) \ + template \ + UnsignedFloat operator op(const UnsignedFloat &L, \ + const UnsignedFloat &R) { \ + return UnsignedFloat(L) base R; \ + } +UNSIGNED_FLOAT_BOP(+, += ) +UNSIGNED_FLOAT_BOP(-, -= ) +UNSIGNED_FLOAT_BOP(*, *= ) +UNSIGNED_FLOAT_BOP(/, /= ) +UNSIGNED_FLOAT_BOP(<<, <<= ) +UNSIGNED_FLOAT_BOP(>>, >>= ) +#undef UNSIGNED_FLOAT_BOP + +template +raw_ostream &operator<<(raw_ostream &OS, const UnsignedFloat &X) { + return X.print(OS, 10); +} + +#define UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, T1, T2) \ + template \ + bool operator op(const UnsignedFloat &L, T1 R) { \ + return L.compareTo(T2(R)) op 0; \ + } \ + template \ + bool operator op(T1 L, const UnsignedFloat &R) { \ + return 0 op R.compareTo(T2(L)); \ + } +#define UNSIGNED_FLOAT_COMPARE_TO(op) \ + UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, uint64_t, uint64_t) \ + UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, uint32_t, uint64_t) \ + UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, int64_t, int64_t) \ + UNSIGNED_FLOAT_COMPARE_TO_TYPE(op, int32_t, int64_t) +UNSIGNED_FLOAT_COMPARE_TO(< ) +UNSIGNED_FLOAT_COMPARE_TO(> ) +UNSIGNED_FLOAT_COMPARE_TO(== ) +UNSIGNED_FLOAT_COMPARE_TO(!= ) +UNSIGNED_FLOAT_COMPARE_TO(<= ) +UNSIGNED_FLOAT_COMPARE_TO(>= ) +#undef UNSIGNED_FLOAT_COMPARE_TO +#undef UNSIGNED_FLOAT_COMPARE_TO_TYPE + +template +uint64_t UnsignedFloat::scale(uint64_t N) const { + if (Width == 64 || N <= DigitsLimits::max()) + return (getFloat(N) * *this).template toInt(); + + // Defer to the 64-bit version. + return UnsignedFloat(Digits, Exponent).scale(N); +} + +template +UnsignedFloat UnsignedFloat::getProduct(DigitsType L, + DigitsType R) { + // Check for zero. + if (!L || !R) + return getZero(); + + // Check for numbers that we can compute with 64-bit math. + if (Width <= 32 || (L <= UINT32_MAX && R <= UINT32_MAX)) + return adjustToWidth(uint64_t(L) * uint64_t(R), 0); + + // Do the full thing. + return UnsignedFloat(multiply64(L, R)); +} +template +UnsignedFloat UnsignedFloat::getQuotient(DigitsType Dividend, + DigitsType Divisor) { + // Check for zero. 
+ if (!Dividend) + return getZero(); + if (!Divisor) + return getLargest(); + + if (Width == 64) + return UnsignedFloat(divide64(Dividend, Divisor)); + + // We can compute this with 64-bit math. + int Shift = countLeadingZeros64(Dividend); + uint64_t Shifted = uint64_t(Dividend) << Shift; + uint64_t Quotient = Shifted / Divisor; + + // If Quotient needs to be shifted, then adjustToWidth will round. + if (Quotient > DigitsLimits::max()) + return adjustToWidth(Quotient, -Shift); + + // Round based on the value of the next bit. + return getRounded(UnsignedFloat(Quotient, -Shift), + Shifted % Divisor >= getHalf(Divisor)); +} + +template +template +IntT UnsignedFloat::toInt() const { + typedef std::numeric_limits Limits; + if (*this < 1) + return 0; + if (*this >= Limits::max()) + return Limits::max(); + + IntT N = Digits; + if (Exponent > 0) { + assert(size_t(Exponent) < sizeof(IntT) * 8); + return N << Exponent; + } + if (Exponent < 0) { + assert(size_t(-Exponent) < sizeof(IntT) * 8); + return N >> -Exponent; + } + return N; +} + +template +std::pair UnsignedFloat::lgImpl() const { + if (isZero()) + return std::make_pair(INT32_MIN, 0); + + // Get the floor of the lg of Digits. + int32_t LocalFloor = Width - countLeadingZerosWidth(Digits) - 1; + + // Get the floor of the lg of this. + int32_t Floor = Exponent + LocalFloor; + if (Digits == UINT64_C(1) << LocalFloor) + return std::make_pair(Floor, 0); + + // Round based on the next digit. + assert(LocalFloor >= 1); + bool Round = Digits & UINT64_C(1) << (LocalFloor - 1); + return std::make_pair(Floor + Round, Round ? 1 : -1); +} + +template +UnsignedFloat UnsignedFloat::matchExponents(UnsignedFloat X) { + if (isZero() || X.isZero() || Exponent == X.Exponent) + return X; + + int32_t Diff = int32_t(X.Exponent) - int32_t(Exponent); + if (Diff > 0) + increaseExponentToMatch(X, Diff); + else + X.increaseExponentToMatch(*this, -Diff); + return X; +} +template +void UnsignedFloat::increaseExponentToMatch(UnsignedFloat &X, + int32_t ExponentDiff) { + assert(ExponentDiff > 0); + if (ExponentDiff >= 2 * Width) { + *this = getZero(); + return; + } + + // Use up any leading zeros on X, and then shift this. + int32_t ShiftX = std::min(countLeadingZerosWidth(X.Digits), ExponentDiff); + assert(ShiftX < Width); + + int32_t ShiftThis = ExponentDiff - ShiftX; + if (ShiftThis >= Width) { + *this = getZero(); + return; + } + + X.Digits <<= ShiftX; + X.Exponent -= ShiftX; + Digits >>= ShiftThis; + Exponent += ShiftThis; + return; +} + +template +UnsignedFloat &UnsignedFloat:: +operator+=(const UnsignedFloat &X) { + if (isLargest() || X.isZero()) + return *this; + if (isZero() || X.isLargest()) + return *this = X; + + // Normalize exponents. + UnsignedFloat Scaled = matchExponents(X); + + // Check for zero again. + if (isZero()) + return *this = Scaled; + if (Scaled.isZero()) + return *this; + + // Compute sum. + DigitsType Sum = Digits + Scaled.Digits; + bool DidOverflow = Sum < Digits; + Digits = Sum; + if (!DidOverflow) + return *this; + + if (Exponent == MaxExponent) + return *this = getLargest(); + + ++Exponent; + Digits = UINT64_C(1) << (Width - 1) | Digits >> 1; + + return *this; +} +template +UnsignedFloat &UnsignedFloat:: +operator-=(const UnsignedFloat &X) { + if (X.isZero()) + return *this; + if (*this <= X) + return *this = getZero(); + + // Normalize exponents. + UnsignedFloat Scaled = matchExponents(X); + assert(Digits >= Scaled.Digits); + + // Compute difference. 
+ if (!Scaled.isZero()) { + Digits -= Scaled.Digits; + return *this; + } + + // Check if X just barely lost its last bit. E.g., for 32-bit: + // + // 1*2^32 - 1*2^0 == 0xffffffff != 1*2^32 + if (*this == UnsignedFloat(1, X.lgFloor() + Width)) { + Digits = DigitsType(0) - 1; + --Exponent; + } + return *this; +} +template +UnsignedFloat &UnsignedFloat:: +operator*=(const UnsignedFloat &X) { + if (isZero()) + return *this; + if (X.isZero()) + return *this = X; + + // Save the exponents. + int32_t Exponents = int32_t(Exponent) + int32_t(X.Exponent); + + // Get the raw product. + *this = getProduct(Digits, X.Digits); + + // Combine with exponents. + return *this <<= Exponents; +} +template +UnsignedFloat &UnsignedFloat:: +operator/=(const UnsignedFloat &X) { + if (isZero()) + return *this; + if (X.isZero()) + return *this = getLargest(); + + // Save the exponents. + int32_t Exponents = int32_t(Exponent) - int32_t(X.Exponent); + + // Get the raw quotient. + *this = getQuotient(Digits, X.Digits); + + // Combine with exponents. + return *this <<= Exponents; +} +template +void UnsignedFloat::shiftLeft(int32_t Shift) { + if (!Shift || isZero()) + return; + assert(Shift != INT32_MIN); + if (Shift < 0) { + shiftRight(-Shift); + return; + } + + // Shift as much as we can in the exponent. + int32_t ExponentShift = std::min(Shift, MaxExponent - Exponent); + Exponent += ExponentShift; + if (ExponentShift == Shift) + return; + + // Check this late, since it's rare. + if (isLargest()) + return; + + // Shift the digits themselves. + Shift -= ExponentShift; + if (Shift > countLeadingZerosWidth(Digits)) { + // Saturate. + *this = getLargest(); + return; + } + + Digits <<= Shift; + return; +} + +template +void UnsignedFloat::shiftRight(int32_t Shift) { + if (!Shift || isZero()) + return; + assert(Shift != INT32_MIN); + if (Shift < 0) { + shiftLeft(-Shift); + return; + } + + // Shift as much as we can in the exponent. + int32_t ExponentShift = std::min(Shift, Exponent - MinExponent); + Exponent -= ExponentShift; + if (ExponentShift == Shift) + return; + + // Shift the digits themselves. + Shift -= ExponentShift; + if (Shift >= Width) { + // Saturate. + *this = getZero(); + return; + } + + Digits >>= Shift; + return; +} + +template +int UnsignedFloat::compare(const UnsignedFloat &X) const { + // Check for zero. + if (isZero()) + return X.isZero() ? 0 : -1; + if (X.isZero()) + return 1; + + // Check for the scale. Use lgFloor to be sure that the exponent difference + // is always lower than 64. + int32_t lgL = lgFloor(), lgR = X.lgFloor(); + if (lgL != lgR) + return lgL < lgR ? -1 : 1; + + // Compare digits. + if (Exponent < X.Exponent) + return UnsignedFloatBase::compare(Digits, X.Digits, X.Exponent - Exponent); + + return -UnsignedFloatBase::compare(X.Digits, Digits, Exponent - X.Exponent); +} + +template struct isPodLike> { + static const bool value = true; +}; +} + +//===----------------------------------------------------------------------===// +// +// BlockMass definition. +// +// TODO: Make this private to BlockFrequencyInfoImpl or delete. +// +//===----------------------------------------------------------------------===// +namespace llvm { + +/// \brief Mass of a block. +/// +/// This class implements a sort of fixed-point fraction always between 0.0 and +/// 1.0. getMass() == UINT64_MAX indicates a value of 1.0. +/// +/// Masses can be added and subtracted. Simple saturation arithmetic is used, +/// so arithmetic operations never overflow or underflow. +/// +/// Masses can be multiplied. 
+/// Multiplication treats full mass as 1.0 and uses an inexpensive
+/// floating-point algorithm that's off-by-one (almost, but not quite,
+/// maximum precision).
+///
+/// Masses can be scaled by \a BranchProbability at maximum precision.
+class BlockMass {
+  uint64_t Mass;
+
+public:
+  BlockMass() : Mass(0) {}
+  explicit BlockMass(uint64_t Mass) : Mass(Mass) {}
+
+  static BlockMass getEmpty() { return BlockMass(); }
+  static BlockMass getFull() { return BlockMass(UINT64_MAX); }
+
+  uint64_t getMass() const { return Mass; }
+
+  bool isFull() const { return Mass == UINT64_MAX; }
+  bool isEmpty() const { return !Mass; }
+
+  bool operator!() const { return isEmpty(); }
+
+  /// \brief Add another mass.
+  ///
+  /// Adds another mass, saturating at \a isFull() rather than overflowing.
+  BlockMass &operator+=(const BlockMass &X) {
+    uint64_t Sum = Mass + X.Mass;
+    Mass = Sum < Mass ? UINT64_MAX : Sum;
+    return *this;
+  }
+
+  /// \brief Subtract another mass.
+  ///
+  /// Subtracts another mass, saturating at \a isEmpty() rather than
+  /// underflowing.
+  BlockMass &operator-=(const BlockMass &X) {
+    uint64_t Diff = Mass - X.Mass;
+    Mass = Diff > Mass ? 0 : Diff;
+    return *this;
+  }
+
+  BlockMass &operator*=(const BranchProbability &P) {
+    Mass = P.scale(Mass);
+    return *this;
+  }
+
+  bool operator==(const BlockMass &X) const { return Mass == X.Mass; }
+  bool operator!=(const BlockMass &X) const { return Mass != X.Mass; }
+  bool operator<=(const BlockMass &X) const { return Mass <= X.Mass; }
+  bool operator>=(const BlockMass &X) const { return Mass >= X.Mass; }
+  bool operator<(const BlockMass &X) const { return Mass < X.Mass; }
+  bool operator>(const BlockMass &X) const { return Mass > X.Mass; }
+
+  /// \brief Convert to floating point.
+  ///
+  /// Convert to a float. \a isFull() gives 1.0, while \a isEmpty() gives
+  /// slightly above 0.0.
+  UnsignedFloat<uint64_t> toFloat() const;
+
+  void dump() const;
+  raw_ostream &print(raw_ostream &OS) const;
+};
+
+inline BlockMass operator+(const BlockMass &L, const BlockMass &R) {
+  return BlockMass(L) += R;
+}
+inline BlockMass operator-(const BlockMass &L, const BlockMass &R) {
+  return BlockMass(L) -= R;
+}
+inline BlockMass operator*(const BlockMass &L, const BranchProbability &R) {
+  return BlockMass(L) *= R;
+}
+inline BlockMass operator*(const BranchProbability &L, const BlockMass &R) {
+  return BlockMass(R) *= L;
+}
+
+inline raw_ostream &operator<<(raw_ostream &OS, const BlockMass &X) {
+  return X.print(OS);
+}
+
+template <> struct isPodLike<BlockMass> {
+  static const bool value = true;
+};
+}
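To make the saturation rules concrete, a minimal sketch follows; the values are hypothetical, and it assumes using namespace llvm and <cassert>:

void blockMassSaturationExample() {
  BlockMass Half(UINT64_MAX / 2);
  BlockMass Sum = BlockMass::getFull() + Half;  // Clamps at full mass.
  BlockMass Diff = Half - BlockMass::getFull(); // Clamps at empty mass.
  assert(Sum.isFull() && Diff.isEmpty());
}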
+
+//===----------------------------------------------------------------------===//
+//
+// BlockFrequencyInfoImpl definition.
+//
+//===----------------------------------------------------------------------===//
+namespace llvm {
+
+class BasicBlock;
+class BranchProbabilityInfo;
+class Function;
+class Loop;
+class LoopInfo;
+class MachineBasicBlock;
+class MachineBranchProbabilityInfo;
+class MachineFunction;
+class MachineLoop;
+class MachineLoopInfo;
+
+namespace bfi_detail {
+struct IrreducibleGraph;
+
+// This is part of a workaround for a GCC 4.7 crash on lambdas.
+template <class BT> struct BlockEdgesAdder;
+}
+
+/// \brief Base class for BlockFrequencyInfoImpl
+///
+/// BlockFrequencyInfoImplBase has supporting data structures and some
+/// algorithms for BlockFrequencyInfoImpl. Only algorithms that depend on
+/// the block type (or that call such algorithms) are skipped here.
+///
+/// Nevertheless, the majority of the overall algorithm documentation lives
+/// with BlockFrequencyInfoImpl. See there for details.
+class BlockFrequencyInfoImplBase {
+public:
+  typedef UnsignedFloat<uint64_t> Float;
+
+  /// \brief Representative of a block.
+  ///
+  /// This is a simple wrapper around an index into the reverse-post-order
+  /// traversal of the blocks.
+  ///
+  /// Unlike a block pointer, its order has meaning (location in the
+  /// topological sort) and its class is the same regardless of block type.
+  struct BlockNode {
+    typedef uint32_t IndexType;
+    IndexType Index;
+
+    bool operator==(const BlockNode &X) const { return Index == X.Index; }
+    bool operator!=(const BlockNode &X) const { return Index != X.Index; }
+    bool operator<=(const BlockNode &X) const { return Index <= X.Index; }
+    bool operator>=(const BlockNode &X) const { return Index >= X.Index; }
+    bool operator<(const BlockNode &X) const { return Index < X.Index; }
+    bool operator>(const BlockNode &X) const { return Index > X.Index; }
+
+    BlockNode() : Index(UINT32_MAX) {}
+    BlockNode(IndexType Index) : Index(Index) {}
+
+    bool isValid() const { return Index <= getMaxIndex(); }
+    static size_t getMaxIndex() { return UINT32_MAX - 1; }
+  };
+
+  /// \brief Stats about a block itself.
+  struct FrequencyData {
+    Float Floating;
+    uint64_t Integer;
+  };
+
+  /// \brief Data about a loop.
+  ///
+  /// Contains the data necessary to represent a loop as a pseudo-node once
+  /// it's packaged.
+  struct LoopData {
+    typedef SmallVector<std::pair<BlockNode, BlockMass>, 4> ExitMap;
+    typedef SmallVector<BlockNode, 4> NodeList;
+    LoopData *Parent;       ///< The parent loop.
+    bool IsPackaged;        ///< Whether this has been packaged.
+    uint32_t NumHeaders;    ///< Number of headers.
+    ExitMap Exits;          ///< Successor edges (and weights).
+    NodeList Nodes;         ///< Header and the members of the loop.
+    BlockMass BackedgeMass; ///< Mass returned to loop header.
+    BlockMass Mass;
+    Float Scale;
+
+    LoopData(LoopData *Parent, const BlockNode &Header)
+        : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header) {}
+    template <class It1, class It2>
+    LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther,
+             It2 LastOther)
+        : Parent(Parent), IsPackaged(false), Nodes(FirstHeader, LastHeader) {
+      NumHeaders = Nodes.size();
+      Nodes.insert(Nodes.end(), FirstOther, LastOther);
+    }
+    bool isHeader(const BlockNode &Node) const {
+      if (isIrreducible())
+        return std::binary_search(Nodes.begin(), Nodes.begin() + NumHeaders,
+                                  Node);
+      return Node == Nodes[0];
+    }
+    BlockNode getHeader() const { return Nodes[0]; }
+    bool isIrreducible() const { return NumHeaders > 1; }
+
+    NodeList::const_iterator members_begin() const {
+      return Nodes.begin() + NumHeaders;
+    }
+    NodeList::const_iterator members_end() const { return Nodes.end(); }
+    iterator_range<NodeList::const_iterator> members() const {
+      return make_range(members_begin(), members_end());
+    }
+  };
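To make the header/member split concrete: for a hypothetical irreducible SCC entered through blocks 2 and 5 with body blocks 7 and 9, Nodes would be [2, 5, 7, 9] with NumHeaders equal to 2; isHeader() binary-searches the sorted two-element prefix, and members() skips it:

// TheLoop is a hypothetical BlockFrequencyInfoImplBase::LoopData laid out
// as above; this visits nodes 7 and 9 but not the headers.
for (const BlockFrequencyInfoImplBase::BlockNode &N : TheLoop.members())
  dbgs() << N.Index << "\n";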
+
+  /// \brief Index of loop information.
+  struct WorkingData {
+    BlockNode Node; ///< This node.
+    LoopData *Loop; ///< The loop this block is inside.
+    BlockMass Mass; ///< Mass distribution from the entry block.
+
+    WorkingData(const BlockNode &Node) : Node(Node), Loop(nullptr) {}
+
+    bool isLoopHeader() const { return Loop && Loop->isHeader(Node); }
+    bool isDoubleLoopHeader() const {
+      return isLoopHeader() && Loop->Parent && Loop->Parent->isIrreducible() &&
+             Loop->Parent->isHeader(Node);
+    }
+
+    LoopData *getContainingLoop() const {
+      if (!isLoopHeader())
+        return Loop;
+      if (!isDoubleLoopHeader())
+        return Loop->Parent;
+      return Loop->Parent->Parent;
+    }
+
+    /// \brief Resolve a node to its representative.
+    ///
+    /// Get the node currently representing Node, which could be a containing
+    /// loop.
+    ///
+    /// This function should only be called when distributing mass. As long as
+    /// there are no irreducible edges to Node, then it will have complexity
+    /// O(1) in this context.
+    ///
+    /// In general, the complexity is O(L), where L is the number of loop
+    /// headers Node has been packaged into. Since this method is called in
+    /// the context of distributing mass, L will be the number of loop headers
+    /// an early exit edge jumps out of.
+    BlockNode getResolvedNode() const {
+      auto L = getPackagedLoop();
+      return L ? L->getHeader() : Node;
+    }
+    LoopData *getPackagedLoop() const {
+      if (!Loop || !Loop->IsPackaged)
+        return nullptr;
+      auto L = Loop;
+      while (L->Parent && L->Parent->IsPackaged)
+        L = L->Parent;
+      return L;
+    }
+
+    /// \brief Get the appropriate mass for a node.
+    ///
+    /// Get appropriate mass for Node. If Node is a loop-header (whose loop
+    /// has been packaged), returns the mass of its pseudo-node. If it's a
+    /// node inside a packaged loop, it returns the loop's mass.
+    BlockMass &getMass() {
+      if (!isAPackage())
+        return Mass;
+      if (!isADoublePackage())
+        return Loop->Mass;
+      return Loop->Parent->Mass;
+    }
+
+    /// \brief Has ContainingLoop been packaged up?
+    bool isPackaged() const { return getResolvedNode() != Node; }
+    /// \brief Has Loop been packaged up?
+    bool isAPackage() const { return isLoopHeader() && Loop->IsPackaged; }
+    /// \brief Has Loop been packaged up twice?
+    bool isADoublePackage() const {
+      return isDoubleLoopHeader() && Loop->Parent->IsPackaged;
+    }
+  };
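A brief sketch of how mass distribution consults this structure; WD is a hypothetical WorkingData entry whose loop has already been packaged:

// Mass aimed at WD.Node is credited to the pseudo-node of the outermost
// packaged loop containing it, and getMass() targets that pseudo-node too.
BlockFrequencyInfoImplBase::BlockNode Target = WD.getResolvedNode();
BlockMass &M = WD.getMass();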
+
+  /// \brief Unscaled probability weight.
+  ///
+  /// Probability weight for an edge in the graph (including the
+  /// successor/target node).
+  ///
+  /// All edges in the original function are 32-bit. However, exit edges from
+  /// loop packages are taken from 64-bit exit masses, so we need 64-bits of
+  /// space in general.
+  ///
+  /// In addition to the raw weight amount, Weight stores the type of the
+  /// edge in the current context (i.e., the context of the loop being
+  /// processed). Is this a local edge within the loop, an exit from the
+  /// loop, or a backedge to the loop header?
+  struct Weight {
+    enum DistType { Local, Exit, Backedge };
+    DistType Type;
+    BlockNode TargetNode;
+    uint64_t Amount;
+    Weight() : Type(Local), Amount(0) {}
+  };
+
+  /// \brief Distribution of unscaled probability weight.
+  ///
+  /// Distribution of unscaled probability weight to a set of successors.
+  ///
+  /// This class collates the successor edge weights for later processing.
+  ///
+  /// \a DidOverflow indicates whether \a Total did overflow while adding to
+  /// the distribution. It should never overflow twice.
+  struct Distribution {
+    typedef SmallVector<Weight, 4> WeightList;
+    WeightList Weights; ///< Individual successor weights.
+    uint64_t Total;     ///< Sum of all weights.
+    bool DidOverflow;   ///< Whether \a Total did overflow.
+
+    Distribution() : Total(0), DidOverflow(false) {}
+    void addLocal(const BlockNode &Node, uint64_t Amount) {
+      add(Node, Amount, Weight::Local);
+    }
+    void addExit(const BlockNode &Node, uint64_t Amount) {
+      add(Node, Amount, Weight::Exit);
+    }
+    void addBackedge(const BlockNode &Node, uint64_t Amount) {
+      add(Node, Amount, Weight::Backedge);
+    }
+
+    /// \brief Normalize the distribution.
+    ///
+    /// Combines multiple edges to the same \a Weight::TargetNode and scales
+    /// down so that \a Total fits into 32-bits.
+    ///
+    /// This is linear in the size of \a Weights. For the vast majority of
+    /// cases, adjacent edge weights are combined by sorting WeightList and
+    /// combining adjacent weights. However, for very large edge lists an
+    /// auxiliary hash table is used.
+    void normalize();
+
+  private:
+    void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type);
+  };
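A sketch of assembling one block's distribution; the node indices and weights are hypothetical, and the snippet assumes it runs in a context where the nested types above are visible:

Distribution Dist;
Dist.addLocal(BlockNode(3), 10);   // Edge to a block within the loop.
Dist.addExit(BlockNode(8), 2);     // Edge leaving the loop.
Dist.addBackedge(BlockNode(1), 4); // Edge back to the loop header.
Dist.normalize(); // Coalesce repeated targets; scale Total into 32 bits.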
+
+  /// \brief Data about each block. This is used downstream.
+  std::vector<FrequencyData> Freqs;
+
+  /// \brief Loop data: see initializeLoops().
+  std::vector<WorkingData> Working;
+
+  /// \brief Indexed information about loops.
+  std::list<LoopData> Loops;
+
+  /// \brief Add all edges out of a packaged loop to the distribution.
+  ///
+  /// Adds all edges from LocalLoopHead to Dist. Calls addToDist() to add
+  /// each successor edge.
+  ///
+  /// \return \c true unless there's an irreducible backedge.
+  bool addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop,
+                               Distribution &Dist);
+
+  /// \brief Add an edge to the distribution.
+  ///
+  /// Adds an edge to Succ to Dist. If \c LoopHead.isValid(), then whether
+  /// the edge is local/exit/backedge is in the context of LoopHead.
+  /// Otherwise, every edge should be a local edge (since all the loops are
+  /// packaged up).
+  ///
+  /// \return \c true unless aborted due to an irreducible backedge.
+  bool addToDist(Distribution &Dist, const LoopData *OuterLoop,
+                 const BlockNode &Pred, const BlockNode &Succ,
+                 uint64_t Weight);
+
+  LoopData &getLoopPackage(const BlockNode &Head) {
+    assert(Head.Index < Working.size());
+    assert(Working[Head.Index].isLoopHeader());
+    return *Working[Head.Index].Loop;
+  }
+
+  /// \brief Analyze irreducible SCCs.
+  ///
+  /// Separate irreducible SCCs from \c G, which is an explicit graph of \c
+  /// OuterLoop (or the top-level function, if \c OuterLoop is \c nullptr).
+  /// Insert them into \a Loops before \c Insert.
+  ///
+  /// \return the \c LoopData nodes representing the irreducible SCCs.
+  iterator_range<std::list<LoopData>::iterator>
+  analyzeIrreducible(const bfi_detail::IrreducibleGraph &G,
+                     LoopData *OuterLoop,
+                     std::list<LoopData>::iterator Insert);
+
+  /// \brief Update a loop after packaging irreducible SCCs inside of it.
+  ///
+  /// Update \c OuterLoop. Before finding irreducible control flow, it was
+  /// partway through \a computeMassInLoop(), so \a LoopData::Exits and \a
+  /// LoopData::BackedgeMass need to be reset. Also, nodes that were packaged
+  /// up need to be removed from \a OuterLoop::Nodes.
+  void updateLoopWithIrreducible(LoopData &OuterLoop);
+
+  /// \brief Distribute mass according to a distribution.
+  ///
+  /// Distributes the mass in Source according to Dist. If LoopHead.isValid(),
+  /// backedges and exits are stored in its entry in Loops.
+  ///
+  /// Mass is distributed in parallel from two copies of the source mass.
+  void distributeMass(const BlockNode &Source, LoopData *OuterLoop,
+                      Distribution &Dist);
+
+  /// \brief Compute the loop scale for a loop.
+  void computeLoopScale(LoopData &Loop);
+
+  /// \brief Package up a loop.
+  void packageLoop(LoopData &Loop);
+
+  /// \brief Unwrap loops.
+  void unwrapLoops();
+
+  /// \brief Finalize frequency metrics.
+  ///
+  /// Calculates final frequencies and cleans up no-longer-needed data
+  /// structures.
+  void finalizeMetrics();
+
+  /// \brief Clear all memory.
+  void clear();
+
+  virtual std::string getBlockName(const BlockNode &Node) const;
+  std::string getLoopName(const LoopData &Loop) const;
+
+  virtual raw_ostream &print(raw_ostream &OS) const { return OS; }
+  void dump() const { print(dbgs()); }
+
+  Float getFloatingBlockFreq(const BlockNode &Node) const;
+
+  BlockFrequency getBlockFreq(const BlockNode &Node) const;
+
+  raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const;
+  raw_ostream &printBlockFreq(raw_ostream &OS,
+                              const BlockFrequency &Freq) const;
+
+  uint64_t getEntryFreq() const {
+    assert(!Freqs.empty());
+    return Freqs[0].Integer;
+  }
+  /// \brief Virtual destructor.
+  ///
+  /// Need a virtual destructor to mask the compiler warning about
+  /// getBlockName().
+  virtual ~BlockFrequencyInfoImplBase() {}
+};
+
+namespace bfi_detail {
+template <class BlockT> struct TypeMap {};
+template <> struct TypeMap<BasicBlock> {
+  typedef BasicBlock BlockT;
+  typedef Function FunctionT;
+  typedef BranchProbabilityInfo BranchProbabilityInfoT;
+  typedef Loop LoopT;
+  typedef LoopInfo LoopInfoT;
+};
+template <> struct TypeMap<MachineBasicBlock> {
+  typedef MachineBasicBlock BlockT;
+  typedef MachineFunction FunctionT;
+  typedef MachineBranchProbabilityInfo BranchProbabilityInfoT;
+  typedef MachineLoop LoopT;
+  typedef MachineLoopInfo LoopInfoT;
+};
+
+/// \brief Get the name of a MachineBasicBlock.
+///
+/// Get the name of a MachineBasicBlock. It's templated so that including
+/// from CodeGen is unnecessary (that would be a layering issue).
+///
+/// This is used mainly for debug output. The name is similar to
+/// MachineBasicBlock::getFullName(), but skips the name of the function.
+template <class BlockT> std::string getBlockName(const BlockT *BB) {
+  assert(BB && "Unexpected nullptr");
+  auto MachineName = "BB" + Twine(BB->getNumber());
+  if (BB->getBasicBlock())
+    return (MachineName + "[" + BB->getName() + "]").str();
+  return MachineName.str();
+}
+/// \brief Get the name of a BasicBlock.
+template <> inline std::string getBlockName(const BasicBlock *BB) {
+  assert(BB && "Unexpected nullptr");
+  return BB->getName().str();
+}
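A small sketch of the debug-name helpers in use; MBB and BB are hypothetical MachineBasicBlock* and BasicBlock* values:

std::string MachineName = bfi_detail::getBlockName(MBB); // e.g. "BB3[entry]"
std::string IRName = bfi_detail::getBlockName(BB);       // e.g. "entry"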
+
+/// \brief Graph of irreducible control flow.
+///
+/// This graph is used for determining the SCCs in a loop (or top-level
+/// function) that has irreducible control flow.
+///
+/// During the block frequency algorithm, the local graphs are defined in a
+/// light-weight way, deferring to the \a BasicBlock or \a MachineBasicBlock
+/// graphs for most edges, but getting others from \a LoopData::ExitMap. The
+/// latter only has successor information.
+///
+/// \a IrreducibleGraph makes this graph explicit. It's in a form that can
+/// use \a GraphTraits (so that \a analyzeIrreducible() can use \a
+/// scc_iterator), and it explicitly lists predecessors and successors. The
+/// initialization that relies on \c MachineBasicBlock is defined in the
+/// header.
+struct IrreducibleGraph {
+  typedef BlockFrequencyInfoImplBase BFIBase;
+
+  BFIBase &BFI;
+
+  typedef BFIBase::BlockNode BlockNode;
+  struct IrrNode {
+    BlockNode Node;
+    unsigned NumIn;
+    std::deque<const IrrNode *> Edges;
+    IrrNode(const BlockNode &Node) : Node(Node), NumIn(0) {}
+
+    typedef std::deque<const IrrNode *>::const_iterator iterator;
+    iterator pred_begin() const { return Edges.begin(); }
+    iterator succ_begin() const { return Edges.begin() + NumIn; }
+    iterator pred_end() const { return succ_begin(); }
+    iterator succ_end() const { return Edges.end(); }
+  };
+  BlockNode Start;
+  const IrrNode *StartIrr;
+  std::vector<IrrNode> Nodes;
+  SmallDenseMap<uint32_t, IrrNode *, 4> Lookup;
+
+  /// \brief Construct an explicit graph containing irreducible control flow.
+  ///
+  /// Construct an explicit graph of the control flow in \c OuterLoop (or the
+  /// top-level function, if \c OuterLoop is \c nullptr). Uses \c
+  /// addBlockEdges to add block successors that have not been packaged into
+  /// loops.
+  ///
+  /// \a BlockFrequencyInfoImpl::computeIrreducibleMass() is the only
+  /// expected user of this.
+  template <class BlockEdgesAdder>
+  IrreducibleGraph(BFIBase &BFI, const BFIBase::LoopData *OuterLoop,
+                   BlockEdgesAdder addBlockEdges)
+      : BFI(BFI), StartIrr(nullptr) {
+    initialize(OuterLoop, addBlockEdges);
+  }
+
+  template <class BlockEdgesAdder>
+  void initialize(const BFIBase::LoopData *OuterLoop,
+                  BlockEdgesAdder addBlockEdges);
+  void addNodesInLoop(const BFIBase::LoopData &OuterLoop);
+  void addNodesInFunction();
+  void addNode(const BlockNode &Node) {
+    Nodes.emplace_back(Node);
+    BFI.Working[Node.Index].getMass() = BlockMass::getEmpty();
+  }
+  void indexNodes();
+  template <class BlockEdgesAdder>
+  void addEdges(const BlockNode &Node, const BFIBase::LoopData *OuterLoop,
+                BlockEdgesAdder addBlockEdges);
+  void addEdge(IrrNode &Irr, const BlockNode &Succ,
+               const BFIBase::LoopData *OuterLoop);
+};
+template <class BlockEdgesAdder>
+void IrreducibleGraph::initialize(const BFIBase::LoopData *OuterLoop,
+                                  BlockEdgesAdder addBlockEdges) {
+  if (OuterLoop) {
+    addNodesInLoop(*OuterLoop);
+    for (auto N : OuterLoop->Nodes)
+      addEdges(N, OuterLoop, addBlockEdges);
+  } else {
+    addNodesInFunction();
+    for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index)
+      addEdges(Index, OuterLoop, addBlockEdges);
+  }
+  StartIrr = Lookup[Start.Index];
+}
+template <class BlockEdgesAdder>
+void IrreducibleGraph::addEdges(const BlockNode &Node,
+                                const BFIBase::LoopData *OuterLoop,
+                                BlockEdgesAdder addBlockEdges) {
+  auto L = Lookup.find(Node.Index);
+  if (L == Lookup.end())
+    return;
+  IrrNode &Irr = *L->second;
+  const auto &Working = BFI.Working[Node.Index];
+
+  if (Working.isAPackage())
+    for (const auto &I : Working.Loop->Exits)
+      addEdge(Irr, I.first, OuterLoop);
+  else
+    addBlockEdges(*this, Irr, OuterLoop);
+}
+}
+
+/// \brief Shared implementation for block frequency analysis.
+///
+/// This is a shared implementation of BlockFrequencyInfo and
+/// MachineBlockFrequencyInfo, and calculates the relative frequencies of
+/// blocks.
+///
+/// LoopInfo defines a loop as a "non-trivial" SCC dominated by a single
+/// block, which is called the header. A given loop, L, can have sub-loops,
+/// which are loops within the subgraph of L that exclude its header. (A
+/// "trivial" SCC consists of a single block that does not have a self-edge.)
+///
+/// In addition to loops, this algorithm has limited support for irreducible
+/// SCCs, which are SCCs with multiple entry blocks. Irreducible SCCs are
+/// discovered on the fly, and modelled as loops with multiple headers.
+///
+/// The headers of an irreducible sub-SCC consist of its entry blocks and all
+/// nodes that are targets of a backedge within it (excluding backedges within
+/// true sub-loops). Block frequency calculations act as if a block is
+/// inserted that intercepts all the edges to the headers. All backedges and
+/// entries point to this block. Its successors are the headers, which split
+/// the frequency evenly.
+///
+/// This algorithm leverages BlockMass and UnsignedFloat to maintain precision,
+/// separates mass distribution from loop scaling, and dithers to eliminate
+/// probability mass loss.
+///
+/// The implementation is split between BlockFrequencyInfoImpl, which knows the
+/// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and
+/// BlockFrequencyInfoImplBase, which doesn't. The base class uses \a
+/// BlockNode, a wrapper around a uint32_t. BlockNode is numbered from 0 in
+/// reverse post-order. This gives two advantages: it's easy to compare the
+/// relative ordering of two nodes, and maps keyed on BlockT can be represented
+/// by vectors.
+///
+/// This algorithm is O(V+E), unless there is irreducible control flow, in
+/// which case it's O(V*E) in the worst case.
+///
+/// These are the main stages:
+///
+///  0. Reverse post-order traversal (\a initializeRPOT()).
+///
+///     Run a single post-order traversal and save it (in reverse) in RPOT.
+///     All other stages make use of this ordering. Save a lookup from BlockT
+///     to BlockNode (the index into RPOT) in Nodes.
+///
+///  1. Loop initialization (\a initializeLoops()).
+///
+///     Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of
+///     the algorithm. In particular, store the immediate members of each loop
+///     in reverse post-order.
+///
+///  2. Calculate mass and scale in loops (\a computeMassInLoops()).
+///
+///     For each loop (bottom-up), distribute mass through the DAG resulting
+///     from ignoring backedges and treating sub-loops as a single pseudo-node.
+///     Track the backedge mass distributed to the loop header, and use it to
+///     calculate the loop scale (number of loop iterations). Immediate
+///     members that represent sub-loops will already have been visited and
+///     packaged into a pseudo-node.
+///
+///     Distributing mass in a loop is a reverse-post-order traversal through
+///     the loop. Start by assigning full mass to the Loop header. For each
+///     node in the loop:
+///
+///         - Fetch and categorize the weight distribution for its successors.
+///           If this is a packaged-subloop, the weight distribution is stored
+///           in \a LoopData::Exits. Otherwise, fetch it from
+///           BranchProbabilityInfo.
+///
+///         - Each successor is categorized as \a Weight::Local, a local edge
+///           within the current loop, \a Weight::Backedge, a backedge to the
+///           loop header, or \a Weight::Exit, any successor outside the loop.
+///           The weight, the successor, and its category are stored in \a
+///           Distribution. There can be multiple edges to each successor.
+///
+///         - If there's a backedge to a non-header, there's an irreducible SCC.
+///           The usual flow is temporarily aborted. \a
+///           computeIrreducibleMass() finds the irreducible SCCs within the
+///           loop, packages them up, and restarts the flow.
+///
+///         - Normalize the distribution: scale weights down so that their sum
+///           is 32-bits, and coalesce multiple edges to the same node.
+///
+///         - Distribute the mass accordingly, dithering to minimize mass loss,
+///           as described in \a distributeMass().
+///
+///     Finally, calculate the loop scale from the accumulated backedge mass.
+///
+///  3. Distribute mass in the function (\a computeMassInFunction()).
+///
+///     Finally, distribute mass through the DAG resulting from packaging all
+///     loops in the function. This uses the same algorithm as distributing
+///     mass in a loop, except that there are no exit or backedge edges.
+///
+///  4. Unpackage loops (\a unwrapLoops()).
+///
+///     Initialize each block's frequency to a floating point representation of
+///     its mass.
+///
+///     Visit loops top-down, scaling the frequencies of each loop's immediate
+///     members by the loop's pseudo-node's frequency.
+///
+///  5. Convert frequencies to a 64-bit range (\a finalizeMetrics()).
+///
+///     Using the min and max frequencies as a guide, translate floating point
+///     frequencies to an appropriate range in uint64_t.
+///
+/// It has some known flaws.
+///
+///   - Loop scale is limited to 4096 per loop (2^12) to avoid exhausting
+///     BlockFrequency's 64-bit integer precision.
+///
+///   - The model of irreducible control flow is a rough approximation.
+///
+///     Modelling irreducible control flow exactly involves setting up and
+///     solving a group of infinite geometric series. Such precision is
+///     unlikely to be worthwhile, since most of our algorithms give up on
+///     irreducible control flow anyway.
+///
+///     Nevertheless, we might find that we need to get closer. Here's a sort
+///     of TODO list for the model with diminishing returns, to be completed as
+///     necessary.
+///
+///       - The headers for the \a LoopData representing an irreducible SCC
+///         include non-entry blocks. When these extra blocks exist, they
+///         indicate a self-contained irreducible sub-SCC. We could treat them
+///         as sub-loops, rather than arbitrarily shoving the problematic
+///         blocks into the headers of the main irreducible SCC.
+///
+///       - Backedge frequencies are assumed to be evenly split between the
+///         headers of a given irreducible SCC. Instead, we could track the
+///         backedge mass separately for each header, and adjust their relative
+///         frequencies.
+///
+///       - Entry frequencies are assumed to be evenly split between the
+///         headers of a given irreducible SCC, which is the only option if we
+///         need to compute mass in the SCC before its parent loop. Instead,
+///         we could partially compute mass in the parent loop, and stop when
+///         we get to the SCC. Here, we have the correct ratio of entry
+///         masses, which we can use to adjust their relative frequencies.
+///         Compute mass in the SCC, and then continue propagation in the
+///         parent.
+///
+///       - We can propagate mass iteratively through the SCC, for some fixed
+///         number of iterations. Each iteration starts by assigning the entry
+///         blocks their backedge mass from the prior iteration. The final
+///         mass for each block (and each exit, and the total backedge mass
+///         used for computing loop scale) is the sum of all iterations.
+///         (Running this until fixed point would "solve" the geometric
+///         series by simulation.)
+template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
+  typedef typename bfi_detail::TypeMap<BT>::BlockT BlockT;
+  typedef typename bfi_detail::TypeMap<BT>::FunctionT FunctionT;
+  typedef typename bfi_detail::TypeMap<BT>::BranchProbabilityInfoT
+  BranchProbabilityInfoT;
+  typedef typename bfi_detail::TypeMap<BT>::LoopT LoopT;
+  typedef typename bfi_detail::TypeMap<BT>::LoopInfoT LoopInfoT;
+
+  // This is part of a workaround for a GCC 4.7 crash on lambdas.
+  friend struct bfi_detail::BlockEdgesAdder<BT>;
+
+  typedef GraphTraits<const BlockT *> Successor;
+  typedef GraphTraits<Inverse<const BlockT *>> Predecessor;
+
+  const BranchProbabilityInfoT *BPI;
+  const LoopInfoT *LI;
+  const FunctionT *F;
+
+  // All blocks in reverse postorder.
+  std::vector<const BlockT *> RPOT;
+  DenseMap<const BlockT *, BlockNode> Nodes;
+
+  typedef typename std::vector<const BlockT *>::const_iterator rpot_iterator;
+
+  rpot_iterator rpot_begin() const { return RPOT.begin(); }
+  rpot_iterator rpot_end() const { return RPOT.end(); }
+
+  size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); }
+
+  BlockNode getNode(const rpot_iterator &I) const {
+    return BlockNode(getIndex(I));
+  }
+  BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB); }
+
+  const BlockT *getBlock(const BlockNode &Node) const {
+    assert(Node.Index < RPOT.size());
+    return RPOT[Node.Index];
+  }
+
+  /// \brief Run (and save) a post-order traversal.
+  ///
+  /// Saves a reverse post-order traversal of all the nodes in \a F.
+  void initializeRPOT();
+
+  /// \brief Initialize loop data.
+  ///
+  /// Build up \a Loops using \a LoopInfo. \a LoopInfo gives us a mapping from
+  /// each block to the deepest loop it's in, but we need the inverse. For each
+  /// loop, we store in reverse post-order its "immediate" members, defined as
+  /// the header, the headers of immediate sub-loops, and all other blocks in
+  /// the loop that are not in sub-loops.
+  void initializeLoops();
+
+  /// \brief Propagate to a block's successors.
+  ///
+  /// In the context of distributing mass through \c OuterLoop, divide the mass
+  /// currently assigned to \c Node between its successors.
+  ///
+  /// \return \c true unless there's an irreducible backedge.
+  bool propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node);
+
+  /// \brief Compute mass in a particular loop.
+  ///
+  /// Assign mass to \c Loop's header, and then for each block in \c Loop in
+  /// reverse post-order, distribute mass to its successors. Only visits nodes
+  /// that have not been packaged into sub-loops.
+  ///
+  /// \pre \a computeMassInLoop() has been called for each subloop of \c Loop.
+  /// \return \c true unless there's an irreducible backedge.
+  bool computeMassInLoop(LoopData &Loop);
+
+  /// \brief Try to compute mass in the top-level function.
+  ///
+  /// Assign mass to the entry block, and then for each block in reverse
+  /// post-order, distribute mass to its successors. Skips nodes that have
+  /// been packaged into loops.
+  ///
+  /// \pre \a computeMassInLoops() has been called.
+  /// \return \c true unless there's an irreducible backedge.
+  bool tryToComputeMassInFunction();
+
+  /// \brief Compute mass in (and package up) irreducible SCCs.
+  ///
+  /// Find the irreducible SCCs in \c OuterLoop, add them to \a Loops (in front
+  /// of \c Insert), and call \a computeMassInLoop() on each of them.
+  ///
+  /// If \c OuterLoop is \c nullptr, it refers to the top-level function.
+  ///
+  /// \pre \a computeMassInLoop() has been called for each subloop of \c
+  /// OuterLoop.
+  /// \pre \c Insert points at the last loop successfully processed by \a
+  /// computeMassInLoop().
+  /// \pre \c OuterLoop has irreducible SCCs.
+  void computeIrreducibleMass(LoopData *OuterLoop,
+                              std::list<LoopData>::iterator Insert);
+
+  /// \brief Compute mass in all loops.
+  ///
+  /// For each loop bottom-up, call \a computeMassInLoop().
+  ///
+  /// \a computeMassInLoop() aborts (and returns \c false) on loops that
+  /// contain irreducible sub-SCCs. Use \a computeIrreducibleMass() and then
+  /// re-enter \a computeMassInLoop().
+  ///
+  /// \post \a computeMassInLoop() has returned \c true for every loop.
+  void computeMassInLoops();
+
+  /// \brief Compute mass in the top-level function.
+  ///
+  /// Uses \a tryToComputeMassInFunction() and \a computeIrreducibleMass() to
+  /// compute mass in the top-level function.
+  ///
+  /// \post \a tryToComputeMassInFunction() has returned \c true.
+  void computeMassInFunction();
+
+  std::string getBlockName(const BlockNode &Node) const override {
+    return bfi_detail::getBlockName(getBlock(Node));
+  }
+
+public:
+  const FunctionT *getFunction() const { return F; }
+
+  void doFunction(const FunctionT *F, const BranchProbabilityInfoT *BPI,
+                  const LoopInfoT *LI);
+  BlockFrequencyInfoImpl() : BPI(nullptr), LI(nullptr), F(nullptr) {}
+
+  using BlockFrequencyInfoImplBase::getEntryFreq;
+  BlockFrequency getBlockFreq(const BlockT *BB) const {
+    return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB));
+  }
+  Float getFloatingBlockFreq(const BlockT *BB) const {
+    return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB));
+  }
+
+  /// \brief Print the frequencies for the current function.
+  ///
+  /// Prints the frequencies for the blocks in the current function.
+  ///
+  /// Blocks are printed in the natural iteration order of the function, rather
+  /// than reverse post-order. This provides two advantages: writing -analyze
+  /// tests is easier (since blocks come out in source order), and even
+  /// unreachable blocks are printed.
+  ///
+  /// \a BlockFrequencyInfoImplBase::print() only knows reverse post-order, so
+  /// we need to override it here.
+  raw_ostream &print(raw_ostream &OS) const override;
+  using BlockFrequencyInfoImplBase::dump;
+
+  using BlockFrequencyInfoImplBase::printBlockFreq;
+  raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const {
+    return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB));
+  }
+};
+
+template <class BT>
+void BlockFrequencyInfoImpl<BT>::doFunction(const FunctionT *F,
+                                            const BranchProbabilityInfoT *BPI,
+                                            const LoopInfoT *LI) {
+  // Save the parameters.
+  this->BPI = BPI;
+  this->LI = LI;
+  this->F = F;
+
+  // Clean up left-over data structures.
+  BlockFrequencyInfoImplBase::clear();
+  RPOT.clear();
+  Nodes.clear();
+
+  // Initialize.
+  DEBUG(dbgs() << "\nblock-frequency: " << F->getName() << "\n================="
+               << std::string(F->getName().size(), '=') << "\n");
+  initializeRPOT();
+  initializeLoops();
+
+  // Visit loops in post-order to find the local mass distribution, and then
+  // do the full function.
+  computeMassInLoops();
+  computeMassInFunction();
+  unwrapLoops();
+  finalizeMetrics();
+}
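The entry point above is all a client needs: construct the implementation, call doFunction() with the function and its analyses, then query or print. A minimal illustrative driver follows; the printFrequencies helper is hypothetical (not part of this patch) and assumes the caller already holds valid BranchProbabilityInfo and LoopInfo results, roughly mirroring what the BlockFrequencyInfo wrapper pass does with this class:

    static void printFrequencies(const Function &F,
                                 const BranchProbabilityInfo &BPI,
                                 const LoopInfo &LI, raw_ostream &OS) {
      BlockFrequencyInfoImpl<BasicBlock> BFI;
      BFI.doFunction(&F, &BPI, &LI); // runs stages 0-5 described above
      OS << "entry freq: " << BFI.getEntryFreq() << "\n";
      BFI.print(OS);                 // natural iteration order, see print()
    }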
+
+template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() {
+  const BlockT *Entry = F->begin();
+  RPOT.reserve(F->size());
+  std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT));
+  std::reverse(RPOT.begin(), RPOT.end());
+
+  assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() &&
+         "More nodes in function than Block Frequency Info supports");
+
+  DEBUG(dbgs() << "reverse-post-order-traversal\n");
+  for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
+    BlockNode Node = getNode(I);
+    DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node)
+                 << "\n");
+    Nodes[*I] = Node;
+  }
+
+  Working.reserve(RPOT.size());
+  for (size_t Index = 0; Index < RPOT.size(); ++Index)
+    Working.emplace_back(Index);
+  Freqs.resize(RPOT.size());
+}
+
+template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() {
+  DEBUG(dbgs() << "loop-detection\n");
+  if (LI->empty())
+    return;
+
+  // Visit loops top down and assign them an index.
+  std::deque<std::pair<const LoopT *, LoopData *>> Q;
+  for (const LoopT *L : *LI)
+    Q.emplace_back(L, nullptr);
+  while (!Q.empty()) {
+    const LoopT *Loop = Q.front().first;
+    LoopData *Parent = Q.front().second;
+    Q.pop_front();
+
+    BlockNode Header = getNode(Loop->getHeader());
+    assert(Header.isValid());
+
+    Loops.emplace_back(Parent, Header);
+    Working[Header.Index].Loop = &Loops.back();
+    DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");
+
+    for (const LoopT *L : *Loop)
+      Q.emplace_back(L, &Loops.back());
+  }
+
+  // Visit nodes in reverse post-order and add them to their deepest containing
+  // loop.
+  for (size_t Index = 0; Index < RPOT.size(); ++Index) {
+    // Loop headers have already been mostly mapped.
+    if (Working[Index].isLoopHeader()) {
+      LoopData *ContainingLoop = Working[Index].getContainingLoop();
+      if (ContainingLoop)
+        ContainingLoop->Nodes.push_back(Index);
+      continue;
+    }
+
+    const LoopT *Loop = LI->getLoopFor(RPOT[Index]);
+    if (!Loop)
+      continue;
+
+    // Add this node to its containing loop's member list.
+    BlockNode Header = getNode(Loop->getHeader());
+    assert(Header.isValid());
+    const auto &HeaderData = Working[Header.Index];
+    assert(HeaderData.isLoopHeader());
+
+    Working[Index].Loop = HeaderData.Loop;
+    HeaderData.Loop->Nodes.push_back(Index);
+    DEBUG(dbgs() << " - loop = " << getBlockName(Header)
+                 << ": member = " << getBlockName(Index) << "\n");
+  }
+}
+
+template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInLoops() {
+  // Visit loops with the deepest first, and the top-level loops last.
+  for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) {
+    if (computeMassInLoop(*L))
+      continue;
+    auto Next = std::next(L);
+    computeIrreducibleMass(&*L, L.base());
+    L = std::prev(Next);
+    if (computeMassInLoop(*L))
+      continue;
+    llvm_unreachable("unhandled irreducible control flow");
+  }
+}
+
+template <class BT>
+bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
+  // Compute mass in loop.
+ DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n"); + + if (Loop.isIrreducible()) { + BlockMass Remaining = BlockMass::getFull(); + for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { + auto &Mass = Working[Loop.Nodes[H].Index].getMass(); + Mass = Remaining * BranchProbability(1, Loop.NumHeaders - H); + Remaining -= Mass; + } + for (const BlockNode &M : Loop.Nodes) + if (!propagateMassToSuccessors(&Loop, M)) + llvm_unreachable("unhandled irreducible control flow"); + } else { + Working[Loop.getHeader().Index].getMass() = BlockMass::getFull(); + if (!propagateMassToSuccessors(&Loop, Loop.getHeader())) + llvm_unreachable("irreducible control flow to loop header!?"); + for (const BlockNode &M : Loop.members()) + if (!propagateMassToSuccessors(&Loop, M)) + // Irreducible backedge. + return false; + } + + computeLoopScale(Loop); + packageLoop(Loop); + return true; +} + +template +bool BlockFrequencyInfoImpl::tryToComputeMassInFunction() { + // Compute mass in function. + DEBUG(dbgs() << "compute-mass-in-function\n"); + assert(!Working.empty() && "no blocks in function"); + assert(!Working[0].isLoopHeader() && "entry block is a loop header"); + + Working[0].getMass() = BlockMass::getFull(); + for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) { + // Check for nodes that have been packaged. + BlockNode Node = getNode(I); + if (Working[Node.Index].isPackaged()) + continue; + + if (!propagateMassToSuccessors(nullptr, Node)) + return false; + } + return true; +} + +template void BlockFrequencyInfoImpl::computeMassInFunction() { + if (tryToComputeMassInFunction()) + return; + computeIrreducibleMass(nullptr, Loops.begin()); + if (tryToComputeMassInFunction()) + return; + llvm_unreachable("unhandled irreducible control flow"); +} + +/// \note This should be a lambda, but that crashes GCC 4.7. +namespace bfi_detail { +template struct BlockEdgesAdder { + typedef BT BlockT; + typedef BlockFrequencyInfoImplBase::LoopData LoopData; + typedef GraphTraits Successor; + + const BlockFrequencyInfoImpl &BFI; + explicit BlockEdgesAdder(const BlockFrequencyInfoImpl &BFI) + : BFI(BFI) {} + void operator()(IrreducibleGraph &G, IrreducibleGraph::IrrNode &Irr, + const LoopData *OuterLoop) { + const BlockT *BB = BFI.RPOT[Irr.Node.Index]; + for (auto I = Successor::child_begin(BB), E = Successor::child_end(BB); + I != E; ++I) + G.addEdge(Irr, BFI.getNode(*I), OuterLoop); + } +}; +} +template +void BlockFrequencyInfoImpl::computeIrreducibleMass( + LoopData *OuterLoop, std::list::iterator Insert) { + DEBUG(dbgs() << "analyze-irreducible-in-"; + if (OuterLoop) dbgs() << "loop: " << getLoopName(*OuterLoop) << "\n"; + else dbgs() << "function\n"); + + using namespace bfi_detail; + // Ideally, addBlockEdges() would be declared here as a lambda, but that + // crashes GCC 4.7. + BlockEdgesAdder addBlockEdges(*this); + IrreducibleGraph G(*this, OuterLoop, addBlockEdges); + + for (auto &L : analyzeIrreducible(G, OuterLoop, Insert)) + computeMassInLoop(L); + + if (!OuterLoop) + return; + updateLoopWithIrreducible(*OuterLoop); +} + +template +bool +BlockFrequencyInfoImpl::propagateMassToSuccessors(LoopData *OuterLoop, + const BlockNode &Node) { + DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n"); + // Calculate probability for successors. + Distribution Dist; + if (auto *Loop = Working[Node.Index].getPackagedLoop()) { + assert(Loop != OuterLoop && "Cannot propagate mass in a packaged loop"); + if (!addLoopSuccessorsToDist(OuterLoop, *Loop, Dist)) + // Irreducible backedge. 
+
+template <class BT>
+bool
+BlockFrequencyInfoImpl<BT>::propagateMassToSuccessors(LoopData *OuterLoop,
+                                                      const BlockNode &Node) {
+  DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
+  // Calculate probability for successors.
+  Distribution Dist;
+  if (auto *Loop = Working[Node.Index].getPackagedLoop()) {
+    assert(Loop != OuterLoop && "Cannot propagate mass in a packaged loop");
+    if (!addLoopSuccessorsToDist(OuterLoop, *Loop, Dist))
+      // Irreducible backedge.
+      return false;
+  } else {
+    const BlockT *BB = getBlock(Node);
+    for (auto SI = Successor::child_begin(BB), SE = Successor::child_end(BB);
+         SI != SE; ++SI)
+      // Do not dereference SI, or getEdgeWeight() is linear in the number of
+      // successors.
+      if (!addToDist(Dist, OuterLoop, Node, getNode(*SI),
+                     BPI->getEdgeWeight(BB, SI)))
+        // Irreducible backedge.
+        return false;
+  }
+
+  // Distribute mass to successors, saving exit and backedge data in the
+  // loop header.
+  distributeMass(Node, OuterLoop, Dist);
+  return true;
+}
+
+template <class BT>
+raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const {
+  if (!F)
+    return OS;
+  OS << "block-frequency-info: " << F->getName() << "\n";
+  for (const BlockT &BB : *F)
+    OS << " - " << bfi_detail::getBlockName(&BB)
+       << ": float = " << getFloatingBlockFreq(&BB)
+       << ", int = " << getBlockFreq(&BB).getFrequency() << "\n";
+
+  // Add an extra newline for readability.
+  OS << "\n";
+  return OS;
+}
+}
+
+#undef DEBUG_TYPE
+
+#endif
diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h
index 4a6a280..4414c84 100644
--- a/include/llvm/Analysis/BranchProbabilityInfo.h
+++ b/include/llvm/Analysis/BranchProbabilityInfo.h
@@ -47,7 +47,7 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override;
   bool runOnFunction(Function &F) override;
-  void print(raw_ostream &OS, const Module *M = 0) const override;
+  void print(raw_ostream &OS, const Module *M = nullptr) const override;
 
   /// \brief Get an edge's probability, relative to other out-edges of the Src.
   ///
diff --git a/include/llvm/Analysis/CFG.h b/include/llvm/Analysis/CFG.h
index 02e3b45..7f92eda 100644
--- a/include/llvm/Analysis/CFG.h
+++ b/include/llvm/Analysis/CFG.h
@@ -65,8 +65,8 @@ bool isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
 /// on branchy code but not loops, and LI is most useful on code with loops but
 /// does not help on branchy code outside loops.
 bool isPotentiallyReachable(const Instruction *From, const Instruction *To,
-                            const DominatorTree *DT = 0,
-                            const LoopInfo *LI = 0);
+                            const DominatorTree *DT = nullptr,
+                            const LoopInfo *LI = nullptr);
 
 /// \brief Determine whether block 'To' is reachable from 'From', returning
 /// true if uncertain.
@@ -75,8 +75,8 @@ bool isPotentiallyReachable(const Instruction *From, const Instruction *To,
 /// Returns false only if we can prove that once 'From' has been reached then
 /// 'To' can not be executed. Conservatively returns true.
 bool isPotentiallyReachable(const BasicBlock *From, const BasicBlock *To,
-                            const DominatorTree *DT = 0,
-                            const LoopInfo *LI = 0);
+                            const DominatorTree *DT = nullptr,
+                            const LoopInfo *LI = nullptr);
 
 } // End llvm namespace
 
diff --git a/include/llvm/Analysis/CGSCCPassManager.h b/include/llvm/Analysis/CGSCCPassManager.h
new file mode 100644
index 0000000..09101ae
--- /dev/null
+++ b/include/llvm/Analysis/CGSCCPassManager.h
@@ -0,0 +1,591 @@
+//===- CGSCCPassManager.h - Call graph pass management ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This header provides classes for managing passes over SCCs of the call
+/// graph. These passes form an important component of LLVM's interprocedural
+/// optimizations.
+/// Because they operate on the SCCs of the call graph, and they traverse the
+/// graph in post order, they can effectively do pair-wise interprocedural
+/// optimizations for all call edges in the program. At each call site edge,
+/// the callee has already been optimized as much as is possible. This in turn
+/// allows very accurate analysis of it for IPO.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_CGSCC_PASS_MANAGER_H
+#define LLVM_ANALYSIS_CGSCC_PASS_MANAGER_H
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+
+namespace llvm {
+
+class CGSCCAnalysisManager;
+
+class CGSCCPassManager {
+public:
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  CGSCCPassManager() {}
+  CGSCCPassManager(CGSCCPassManager &&Arg) : Passes(std::move(Arg.Passes)) {}
+  CGSCCPassManager &operator=(CGSCCPassManager &&RHS) {
+    Passes = std::move(RHS.Passes);
+    return *this;
+  }
+
+  /// \brief Run all of the CGSCC passes in this pass manager over a SCC.
+  PreservedAnalyses run(LazyCallGraph::SCC *C,
+                        CGSCCAnalysisManager *AM = nullptr);
+
+  template <typename CGSCCPassT> void addPass(CGSCCPassT Pass) {
+    Passes.emplace_back(new CGSCCPassModel<CGSCCPassT>(std::move(Pass)));
+  }
+
+  static StringRef name() { return "CGSCCPassManager"; }
+
+private:
+  // Pull in the concept type and model template specialized for SCCs.
+  typedef detail::PassConcept<LazyCallGraph::SCC *, CGSCCAnalysisManager>
+      CGSCCPassConcept;
+  template <typename PassT>
+  struct CGSCCPassModel
+      : detail::PassModel<LazyCallGraph::SCC *, CGSCCAnalysisManager, PassT> {
+    CGSCCPassModel(PassT Pass)
+        : detail::PassModel<LazyCallGraph::SCC *, CGSCCAnalysisManager, PassT>(
+              std::move(Pass)) {}
+  };
+
+  CGSCCPassManager(const CGSCCPassManager &) LLVM_DELETED_FUNCTION;
+  CGSCCPassManager &operator=(const CGSCCPassManager &) LLVM_DELETED_FUNCTION;
+
+  std::vector<std::unique_ptr<CGSCCPassConcept>> Passes;
+};
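To satisfy the pass concept above, a pass only needs a matching run() and a name(). A minimal, hypothetical no-op CGSCC pass (not part of this patch) looks like this:

    // Hypothetical example: the smallest pass CGSCCPassManager can run.
    struct NoOpCGSCCPass {
      PreservedAnalyses run(LazyCallGraph::SCC *C, CGSCCAnalysisManager *AM) {
        (void)C;
        (void)AM;
        return PreservedAnalyses::all();
      }
      static StringRef name() { return "NoOpCGSCCPass"; }
    };

A CGSCCPassManager would then take it by value: CGPM.addPass(NoOpCGSCCPass());.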
+
+/// \brief A CGSCC analysis manager to coordinate and cache analyses run over
+/// a module.
+class CGSCCAnalysisManager : public detail::AnalysisManagerBase<
+                                 CGSCCAnalysisManager, LazyCallGraph::SCC *> {
+  friend class detail::AnalysisManagerBase<CGSCCAnalysisManager,
+                                           LazyCallGraph::SCC *>;
+  typedef detail::AnalysisManagerBase<CGSCCAnalysisManager,
+                                      LazyCallGraph::SCC *> BaseT;
+  typedef BaseT::ResultConceptT ResultConceptT;
+  typedef BaseT::PassConceptT PassConceptT;
+
+public:
+  // Most public APIs are inherited from the CRTP base class.
+
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  CGSCCAnalysisManager() {}
+  CGSCCAnalysisManager(CGSCCAnalysisManager &&Arg)
+      : BaseT(std::move(static_cast<BaseT &>(Arg))),
+        CGSCCAnalysisResults(std::move(Arg.CGSCCAnalysisResults)) {}
+  CGSCCAnalysisManager &operator=(CGSCCAnalysisManager &&RHS) {
+    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
+    CGSCCAnalysisResults = std::move(RHS.CGSCCAnalysisResults);
+    return *this;
+  }
+
+  /// \brief Returns true if the analysis manager has an empty results cache.
+  bool empty() const;
+
+  /// \brief Clear the CGSCC analysis result cache.
+  ///
+  /// This routine allows cleaning up when the set of SCCs itself has
+  /// potentially changed, and thus we can't even look up a result and
+  /// invalidate it directly. Notably, this does *not* call invalidate
+  /// functions as there is nothing to be done for them.
+  void clear();
+
+private:
+  CGSCCAnalysisManager(const CGSCCAnalysisManager &) LLVM_DELETED_FUNCTION;
+  CGSCCAnalysisManager &
+  operator=(const CGSCCAnalysisManager &) LLVM_DELETED_FUNCTION;
+
+  /// \brief Get a CGSCC analysis result, running the pass if necessary.
+  ResultConceptT &getResultImpl(void *PassID, LazyCallGraph::SCC *C);
+
+  /// \brief Get a cached CGSCC analysis result or return null.
+  ResultConceptT *getCachedResultImpl(void *PassID,
+                                      LazyCallGraph::SCC *C) const;
+
+  /// \brief Invalidate a CGSCC analysis result.
+  void invalidateImpl(void *PassID, LazyCallGraph::SCC *C);
+
+  /// \brief Invalidate the results for an SCC.
+  void invalidateImpl(LazyCallGraph::SCC *C, const PreservedAnalyses &PA);
+
+  /// \brief List of CGSCC analysis pass IDs and associated concept pointers.
+  ///
+  /// Requires iterators to be valid across appending new entries and arbitrary
+  /// erases. Provides both the pass ID and concept pointer such that it is
+  /// half of a bijection and provides storage for the actual result concept.
+  typedef std::list<std::pair<
+      void *, std::unique_ptr<detail::AnalysisResultConcept<
+                  LazyCallGraph::SCC *>>>> CGSCCAnalysisResultListT;
+
+  /// \brief Map type from SCC pointer to our custom list type.
+  typedef DenseMap<LazyCallGraph::SCC *, CGSCCAnalysisResultListT>
+      CGSCCAnalysisResultListMapT;
+
+  /// \brief Map from SCC to a list of CGSCC analysis results.
+  ///
+  /// Provides linear time removal of all analysis results for an SCC and
+  /// the ultimate storage for a particular cached analysis result.
+  CGSCCAnalysisResultListMapT CGSCCAnalysisResultLists;
+
+  /// \brief Map type from a pair of analysis ID and SCC pointer to an
+  /// iterator into a particular result list.
+  typedef DenseMap<std::pair<void *, LazyCallGraph::SCC *>,
+                   CGSCCAnalysisResultListT::iterator> CGSCCAnalysisResultMapT;
+
+  /// \brief Map from an analysis ID and SCC to a particular cached
+  /// analysis result.
+  CGSCCAnalysisResultMapT CGSCCAnalysisResults;
+};
+
+/// \brief A module analysis which acts as a proxy for a CGSCC analysis
+/// manager.
+///
+/// This primarily proxies invalidation information from the module analysis
+/// manager and module pass manager to a CGSCC analysis manager. You should
+/// never use a CGSCC analysis manager from within (transitively) a module
+/// pass manager unless your parent module pass has received a proxy result
+/// object for it.
+class CGSCCAnalysisManagerModuleProxy {
+public:
+  class Result {
+  public:
+    explicit Result(CGSCCAnalysisManager &CGAM) : CGAM(&CGAM) {}
+    // We have to explicitly define all the special member functions because
+    // MSVC refuses to generate them.
+    Result(const Result &Arg) : CGAM(Arg.CGAM) {}
+    Result(Result &&Arg) : CGAM(std::move(Arg.CGAM)) {}
+    Result &operator=(Result RHS) {
+      std::swap(CGAM, RHS.CGAM);
+      return *this;
+    }
+    ~Result();
+
+    /// \brief Accessor for the \c CGSCCAnalysisManager.
+    CGSCCAnalysisManager &getManager() { return *CGAM; }
+
+    /// \brief Handler for invalidation of the module.
+    ///
+    /// If this analysis itself is preserved, then we assume that the call
+    /// graph of the module hasn't changed and thus we don't need to invalidate
+    /// *all* cached data associated with a \c SCC* in the \c
+    /// CGSCCAnalysisManager.
+    ///
+    /// Regardless of whether this analysis is marked as preserved, all of the
+    /// analyses in the \c CGSCCAnalysisManager are potentially invalidated
+    /// based on the set of preserved analyses.
+    bool invalidate(Module *M, const PreservedAnalyses &PA);
+
+  private:
+    CGSCCAnalysisManager *CGAM;
+  };
+
+  static void *ID() { return (void *)&PassID; }
+
+  explicit CGSCCAnalysisManagerModuleProxy(CGSCCAnalysisManager &CGAM)
+      : CGAM(&CGAM) {}
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  CGSCCAnalysisManagerModuleProxy(
+      const CGSCCAnalysisManagerModuleProxy &Arg)
+      : CGAM(Arg.CGAM) {}
+  CGSCCAnalysisManagerModuleProxy(CGSCCAnalysisManagerModuleProxy &&Arg)
+      : CGAM(std::move(Arg.CGAM)) {}
+  CGSCCAnalysisManagerModuleProxy &
+  operator=(CGSCCAnalysisManagerModuleProxy RHS) {
+    std::swap(CGAM, RHS.CGAM);
+    return *this;
+  }
+
+  /// \brief Run the analysis pass and create our proxy result object.
+  ///
+  /// This doesn't do any interesting work, it is primarily used to insert our
+  /// proxy result object into the module analysis cache so that we can proxy
+  /// invalidation to the CGSCC analysis manager.
+  ///
+  /// In debug builds, it will also assert that the analysis manager is empty
+  /// as no queries should arrive at the CGSCC analysis manager prior to
+  /// this analysis being requested.
+  Result run(Module *M);
+
+private:
+  static char PassID;
+
+  CGSCCAnalysisManager *CGAM;
+};
+
+/// \brief A CGSCC analysis which acts as a proxy for a module analysis
+/// manager.
+///
+/// This primarily provides an accessor to a parent module analysis manager to
+/// CGSCC passes. Only the const interface of the module analysis manager is
+/// provided to indicate that once inside of a CGSCC analysis pass you
+/// cannot request a module analysis to actually run. Instead, the user must
+/// rely on the \c getCachedResult API.
+///
+/// This proxy *doesn't* manage the invalidation in any way. That is handled by
+/// the recursive return path of each layer of the pass manager and the
+/// returned PreservedAnalysis set.
+class ModuleAnalysisManagerCGSCCProxy {
+public:
+  /// \brief Result proxy object for \c ModuleAnalysisManagerCGSCCProxy.
+  class Result {
+  public:
+    explicit Result(const ModuleAnalysisManager &MAM) : MAM(&MAM) {}
+    // We have to explicitly define all the special member functions because
+    // MSVC refuses to generate them.
+    Result(const Result &Arg) : MAM(Arg.MAM) {}
+    Result(Result &&Arg) : MAM(std::move(Arg.MAM)) {}
+    Result &operator=(Result RHS) {
+      std::swap(MAM, RHS.MAM);
+      return *this;
+    }
+
+    const ModuleAnalysisManager &getManager() const { return *MAM; }
+
+    /// \brief Handle invalidation by ignoring it, this pass is immutable.
+    bool invalidate(LazyCallGraph::SCC *) { return false; }
+
+  private:
+    const ModuleAnalysisManager *MAM;
+  };
+
+  static void *ID() { return (void *)&PassID; }
+
+  ModuleAnalysisManagerCGSCCProxy(const ModuleAnalysisManager &MAM)
+      : MAM(&MAM) {}
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  ModuleAnalysisManagerCGSCCProxy(
+      const ModuleAnalysisManagerCGSCCProxy &Arg)
+      : MAM(Arg.MAM) {}
+  ModuleAnalysisManagerCGSCCProxy(ModuleAnalysisManagerCGSCCProxy &&Arg)
+      : MAM(std::move(Arg.MAM)) {}
+  ModuleAnalysisManagerCGSCCProxy &
+  operator=(ModuleAnalysisManagerCGSCCProxy RHS) {
+    std::swap(MAM, RHS.MAM);
+    return *this;
+  }
+
+  /// \brief Run the analysis pass and create our proxy result object.
+  /// Nothing to see here, it just forwards the \c MAM reference into the
+  /// result.
+  Result run(LazyCallGraph::SCC *) { return Result(*MAM); }
+
+private:
+  static char PassID;
+
+  const ModuleAnalysisManager *MAM;
+};
+
+/// \brief The core module pass which does a post-order walk of the SCCs and
+/// runs a CGSCC pass over each one.
+///
+/// Designed to allow composition of a CGSCCPass(Manager) and
+/// a ModulePassManager. Note that this pass must be run with a module analysis
+/// manager as it uses the LazyCallGraph analysis.
+/// It will also run the
+/// \c CGSCCAnalysisManagerModuleProxy analysis prior to running the CGSCC
+/// pass over the module to enable a \c FunctionAnalysisManager to be used
+/// within this run safely.
+template <typename CGSCCPassT> class ModuleToPostOrderCGSCCPassAdaptor {
+public:
+  explicit ModuleToPostOrderCGSCCPassAdaptor(CGSCCPassT Pass)
+      : Pass(std::move(Pass)) {}
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  ModuleToPostOrderCGSCCPassAdaptor(
+      const ModuleToPostOrderCGSCCPassAdaptor &Arg)
+      : Pass(Arg.Pass) {}
+  ModuleToPostOrderCGSCCPassAdaptor(ModuleToPostOrderCGSCCPassAdaptor &&Arg)
+      : Pass(std::move(Arg.Pass)) {}
+  friend void swap(ModuleToPostOrderCGSCCPassAdaptor &LHS,
+                   ModuleToPostOrderCGSCCPassAdaptor &RHS) {
+    using std::swap;
+    swap(LHS.Pass, RHS.Pass);
+  }
+  ModuleToPostOrderCGSCCPassAdaptor &
+  operator=(ModuleToPostOrderCGSCCPassAdaptor RHS) {
+    swap(*this, RHS);
+    return *this;
+  }
+
+  /// \brief Runs the CGSCC pass across every SCC in the module.
+  PreservedAnalyses run(Module *M, ModuleAnalysisManager *AM) {
+    assert(AM && "We need analyses to compute the call graph!");
+
+    // Setup the CGSCC analysis manager from its proxy.
+    CGSCCAnalysisManager &CGAM =
+        AM->getResult<CGSCCAnalysisManagerModuleProxy>(M).getManager();
+
+    // Get the call graph for this module.
+    LazyCallGraph &CG = AM->getResult<LazyCallGraphAnalysis>(M);
+
+    PreservedAnalyses PA = PreservedAnalyses::all();
+    for (LazyCallGraph::SCC &C : CG.postorder_sccs()) {
+      PreservedAnalyses PassPA = Pass.run(&C, &CGAM);
+
+      // We know that the CGSCC pass couldn't have invalidated any other
+      // SCC's analyses (that's the contract of a CGSCC pass), so
+      // directly handle the CGSCC analysis manager's invalidation here.
+      // FIXME: This isn't quite correct. We need to handle the case where the
+      // pass updated the CG, particularly some child of the current SCC, and
+      // invalidate its analyses.
+      CGAM.invalidate(&C, PassPA);
+
+      // Then intersect the preserved set so that invalidation of module
+      // analyses will eventually occur when the module pass completes.
+      PA.intersect(std::move(PassPA));
+    }
+
+    // By definition we preserve the proxy. This precludes *any* invalidation
+    // of CGSCC analyses by the proxy, but that's OK because we've taken
+    // care to invalidate analyses in the CGSCC analysis manager
+    // incrementally above.
+    PA.preserve<CGSCCAnalysisManagerModuleProxy>();
+    return PA;
+  }
+
+  static StringRef name() { return "ModuleToPostOrderCGSCCPassAdaptor"; }
+
+private:
+  CGSCCPassT Pass;
+};
+
+/// \brief A function to deduce a CGSCC pass type and wrap it in the
+/// templated adaptor.
+template <typename CGSCCPassT>
+ModuleToPostOrderCGSCCPassAdaptor<CGSCCPassT>
+createModuleToPostOrderCGSCCPassAdaptor(CGSCCPassT Pass) {
+  return std::move(
+      ModuleToPostOrderCGSCCPassAdaptor<CGSCCPassT>(std::move(Pass)));
+}
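Composing the layers, a module pipeline can drive a CGSCC pipeline, which can in turn drive function passes via the CGSCCToFunctionPassAdaptor defined later in this header. A hedged sketch, reusing the hypothetical NoOpCGSCCPass from above and assuming some valid function pass type MyFunctionPass (neither name comes from this patch):

    CGSCCPassManager CGPM;
    CGPM.addPass(NoOpCGSCCPass());
    CGPM.addPass(createCGSCCToFunctionPassAdaptor(MyFunctionPass()));
    ModulePassManager MPM;
    MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));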
+
+/// \brief A CGSCC analysis which acts as a proxy for a function analysis
+/// manager.
+///
+/// This primarily proxies invalidation information from the CGSCC analysis
+/// manager and CGSCC pass manager to a function analysis manager. You should
+/// never use a function analysis manager from within (transitively) a CGSCC
+/// pass manager unless your parent CGSCC pass has received a proxy result
+/// object for it.
+class FunctionAnalysisManagerCGSCCProxy {
+public:
+  class Result {
+  public:
+    explicit Result(FunctionAnalysisManager &FAM) : FAM(&FAM) {}
+    // We have to explicitly define all the special member functions because
+    // MSVC refuses to generate them.
+    Result(const Result &Arg) : FAM(Arg.FAM) {}
+    Result(Result &&Arg) : FAM(std::move(Arg.FAM)) {}
+    Result &operator=(Result RHS) {
+      std::swap(FAM, RHS.FAM);
+      return *this;
+    }
+    ~Result();
+
+    /// \brief Accessor for the \c FunctionAnalysisManager.
+    FunctionAnalysisManager &getManager() { return *FAM; }
+
+    /// \brief Handler for invalidation of the SCC.
+    ///
+    /// If this analysis itself is preserved, then we assume that the set of \c
+    /// Function objects in the \c SCC hasn't changed and thus we don't need
+    /// to invalidate *all* cached data associated with a \c Function* in the \c
+    /// FunctionAnalysisManager.
+    ///
+    /// Regardless of whether this analysis is marked as preserved, all of the
+    /// analyses in the \c FunctionAnalysisManager are potentially invalidated
+    /// based on the set of preserved analyses.
+    bool invalidate(LazyCallGraph::SCC *C, const PreservedAnalyses &PA);
+
+  private:
+    FunctionAnalysisManager *FAM;
+  };
+
+  static void *ID() { return (void *)&PassID; }
+
+  explicit FunctionAnalysisManagerCGSCCProxy(FunctionAnalysisManager &FAM)
+      : FAM(&FAM) {}
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  FunctionAnalysisManagerCGSCCProxy(
+      const FunctionAnalysisManagerCGSCCProxy &Arg)
+      : FAM(Arg.FAM) {}
+  FunctionAnalysisManagerCGSCCProxy(FunctionAnalysisManagerCGSCCProxy &&Arg)
+      : FAM(std::move(Arg.FAM)) {}
+  FunctionAnalysisManagerCGSCCProxy &
+  operator=(FunctionAnalysisManagerCGSCCProxy RHS) {
+    std::swap(FAM, RHS.FAM);
+    return *this;
+  }
+
+  /// \brief Run the analysis pass and create our proxy result object.
+  ///
+  /// This doesn't do any interesting work, it is primarily used to insert our
+  /// proxy result object into the CGSCC analysis cache so that we can proxy
+  /// invalidation to the function analysis manager.
+  ///
+  /// In debug builds, it will also assert that the analysis manager is empty
+  /// as no queries should arrive at the function analysis manager prior to
+  /// this analysis being requested.
+  Result run(LazyCallGraph::SCC *C);
+
+private:
+  static char PassID;
+
+  FunctionAnalysisManager *FAM;
+};
+
+/// \brief A function analysis which acts as a proxy for a CGSCC analysis
+/// manager.
+///
+/// This primarily provides an accessor to a parent CGSCC analysis manager to
+/// function passes. Only the const interface of the CGSCC analysis manager is
+/// provided to indicate that once inside of a function analysis pass you
+/// cannot request a CGSCC analysis to actually run. Instead, the user must
+/// rely on the \c getCachedResult API.
+///
+/// This proxy *doesn't* manage the invalidation in any way. That is handled by
+/// the recursive return path of each layer of the pass manager and the
+/// returned PreservedAnalysis set.
+class CGSCCAnalysisManagerFunctionProxy {
+public:
+  /// \brief Result proxy object for \c CGSCCAnalysisManagerFunctionProxy.
+  class Result {
+  public:
+    explicit Result(const CGSCCAnalysisManager &CGAM) : CGAM(&CGAM) {}
+    // We have to explicitly define all the special member functions because
+    // MSVC refuses to generate them.
+    Result(const Result &Arg) : CGAM(Arg.CGAM) {}
+    Result(Result &&Arg) : CGAM(std::move(Arg.CGAM)) {}
+    Result &operator=(Result RHS) {
+      std::swap(CGAM, RHS.CGAM);
+      return *this;
+    }
+
+    const CGSCCAnalysisManager &getManager() const { return *CGAM; }
+
+    /// \brief Handle invalidation by ignoring it, this pass is immutable.
+    bool invalidate(Function *) { return false; }
+
+  private:
+    const CGSCCAnalysisManager *CGAM;
+  };
+
+  static void *ID() { return (void *)&PassID; }
+
+  CGSCCAnalysisManagerFunctionProxy(const CGSCCAnalysisManager &CGAM)
+      : CGAM(&CGAM) {}
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  CGSCCAnalysisManagerFunctionProxy(
+      const CGSCCAnalysisManagerFunctionProxy &Arg)
+      : CGAM(Arg.CGAM) {}
+  CGSCCAnalysisManagerFunctionProxy(CGSCCAnalysisManagerFunctionProxy &&Arg)
+      : CGAM(std::move(Arg.CGAM)) {}
+  CGSCCAnalysisManagerFunctionProxy &
+  operator=(CGSCCAnalysisManagerFunctionProxy RHS) {
+    std::swap(CGAM, RHS.CGAM);
+    return *this;
+  }
+
+  /// \brief Run the analysis pass and create our proxy result object.
+  /// Nothing to see here, it just forwards the \c CGAM reference into the
+  /// result.
+  Result run(Function *) { return Result(*CGAM); }
+
+private:
+  static char PassID;
+
+  const CGSCCAnalysisManager *CGAM;
+};
+
+/// \brief Adaptor that maps from a SCC to its functions.
+///
+/// Designed to allow composition of a FunctionPass(Manager) and
+/// a CGSCCPassManager. Note that if this pass is constructed with a pointer
+/// to a \c CGSCCAnalysisManager it will run the
+/// \c FunctionAnalysisManagerCGSCCProxy analysis prior to running the function
+/// pass over the SCC to enable a \c FunctionAnalysisManager to be used
+/// within this run safely.
+template <typename FunctionPassT> class CGSCCToFunctionPassAdaptor {
+public:
+  explicit CGSCCToFunctionPassAdaptor(FunctionPassT Pass)
+      : Pass(std::move(Pass)) {}
+  // We have to explicitly define all the special member functions because MSVC
+  // refuses to generate them.
+  CGSCCToFunctionPassAdaptor(const CGSCCToFunctionPassAdaptor &Arg)
+      : Pass(Arg.Pass) {}
+  CGSCCToFunctionPassAdaptor(CGSCCToFunctionPassAdaptor &&Arg)
+      : Pass(std::move(Arg.Pass)) {}
+  friend void swap(CGSCCToFunctionPassAdaptor &LHS,
+                   CGSCCToFunctionPassAdaptor &RHS) {
+    using std::swap;
+    swap(LHS.Pass, RHS.Pass);
+  }
+  CGSCCToFunctionPassAdaptor &operator=(CGSCCToFunctionPassAdaptor RHS) {
+    swap(*this, RHS);
+    return *this;
+  }
+
+  /// \brief Runs the function pass across every function in the SCC.
+  PreservedAnalyses run(LazyCallGraph::SCC *C, CGSCCAnalysisManager *AM) {
+    FunctionAnalysisManager *FAM = nullptr;
+    if (AM)
+      // Setup the function analysis manager from its proxy.
+      FAM = &AM->getResult<FunctionAnalysisManagerCGSCCProxy>(C).getManager();
+
+    PreservedAnalyses PA = PreservedAnalyses::all();
+    for (LazyCallGraph::Node *N : *C) {
+      PreservedAnalyses PassPA = Pass.run(&N->getFunction(), FAM);
+
+      // We know that the function pass couldn't have invalidated any other
+      // function's analyses (that's the contract of a function pass), so
+      // directly handle the function analysis manager's invalidation here.
+      if (FAM)
+        FAM->invalidate(&N->getFunction(), PassPA);
+
+      // Then intersect the preserved set so that invalidation of module
+      // analyses will eventually occur when the module pass completes.
+      PA.intersect(std::move(PassPA));
+    }
+
+    // By definition we preserve the proxy. This precludes *any* invalidation
+    // of function analyses by the proxy, but that's OK because we've taken
+    // care to invalidate analyses in the function analysis manager
+    // incrementally above.
+    // FIXME: We need to update the call graph here to account for any deleted
+    // edges!
+    PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
+    return PA;
+  }
+
+  static StringRef name() { return "CGSCCToFunctionPassAdaptor"; }
+
+private:
+  FunctionPassT Pass;
+};
+
+/// \brief A function to deduce a function pass type and wrap it in the
+/// templated adaptor.
+template <typename FunctionPassT>
+CGSCCToFunctionPassAdaptor<FunctionPassT>
+createCGSCCToFunctionPassAdaptor(FunctionPassT Pass) {
+  return std::move(CGSCCToFunctionPassAdaptor<FunctionPassT>(std::move(Pass)));
+}
+
+}
+
+#endif
diff --git a/include/llvm/Analysis/ConstantFolding.h b/include/llvm/Analysis/ConstantFolding.h
index 0018a56..09d45ca 100644
--- a/include/llvm/Analysis/ConstantFolding.h
+++ b/include/llvm/Analysis/ConstantFolding.h
@@ -36,15 +36,16 @@ namespace llvm {
 /// Note that this fails if not all of the operands are constant. Otherwise,
 /// this function can only fail when attempting to fold instructions like loads
 /// and stores, which have no constant expression form.
-Constant *ConstantFoldInstruction(Instruction *I, const DataLayout *TD = 0,
-                                  const TargetLibraryInfo *TLI = 0);
+Constant *ConstantFoldInstruction(Instruction *I,
+                                  const DataLayout *TD = nullptr,
+                                  const TargetLibraryInfo *TLI = nullptr);
 
 /// ConstantFoldConstantExpression - Attempt to fold the constant expression
 /// using the specified DataLayout. If successful, the constant result is
 /// returned, if not, null is returned.
 Constant *ConstantFoldConstantExpression(const ConstantExpr *CE,
-                                         const DataLayout *TD = 0,
-                                         const TargetLibraryInfo *TLI = 0);
+                                         const DataLayout *TD = nullptr,
+                                         const TargetLibraryInfo *TLI = nullptr);
 
 /// ConstantFoldInstOperands - Attempt to constant fold an instruction with the
 /// specified operands. If successful, the constant result is returned, if not,
@@ -54,8 +55,8 @@ Constant *ConstantFoldConstantExpression(const ConstantExpr *CE,
 ///
 Constant *ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
                                    ArrayRef<Constant *> Ops,
-                                   const DataLayout *TD = 0,
-                                   const TargetLibraryInfo *TLI = 0);
+                                   const DataLayout *TD = nullptr,
+                                   const TargetLibraryInfo *TLI = nullptr);
 
 /// ConstantFoldCompareInstOperands - Attempt to constant fold a compare
 /// instruction (icmp/fcmp) with the specified operands. If it fails, it
@@ -63,8 +64,8 @@ Constant *ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
 ///
 Constant *ConstantFoldCompareInstOperands(unsigned Predicate, Constant *LHS,
                                           Constant *RHS,
-                                          const DataLayout *TD = 0,
-                                          const TargetLibraryInfo *TLI = 0);
+                                          const DataLayout *TD = nullptr,
+                                          const TargetLibraryInfo *TLI = nullptr);
 
 /// ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue
 /// instruction with the specified operands and indices. The constant result is
@@ -75,7 +76,8 @@ Constant *ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val,
 /// ConstantFoldLoadFromConstPtr - Return the value that a load from C would
 /// produce if it is constant and determinable. If this is not determinable,
 /// return null.
-Constant *ConstantFoldLoadFromConstPtr(Constant *C, const DataLayout *TD = 0);
+Constant *ConstantFoldLoadFromConstPtr(Constant *C,
+                                       const DataLayout *TD = nullptr);
 
 /// ConstantFoldLoadThroughGEPConstantExpr - Given a constant and a
 /// getelementptr constantexpr, return the constant value being addressed by the
@@ -96,7 +98,7 @@ bool canConstantFoldCallTo(const Function *F);
 /// ConstantFoldCall - Attempt to constant fold a call to the specified function
 /// with the specified arguments, returning null if unsuccessful.
 Constant *ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
-                           const TargetLibraryInfo *TLI = 0);
+                           const TargetLibraryInfo *TLI = nullptr);
 }
 
 #endif
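These entry points share the optional-analysis pattern this change preserves: everything after the operands defaults to nullptr, so callers pass only the analyses they actually have. An illustrative helper (hypothetical, not from the patch):

    // Fold I if possible; TD and TLI may each be null.
    static bool tryFold(Instruction *I, const DataLayout *TD,
                        const TargetLibraryInfo *TLI) {
      if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) {
        I->replaceAllUsesWith(C);
        return true;
      }
      return false;
    }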
diff --git a/include/llvm/Analysis/DOTGraphTraitsPass.h b/include/llvm/Analysis/DOTGraphTraitsPass.h
index ff3392a..53c832c 100644
--- a/include/llvm/Analysis/DOTGraphTraitsPass.h
+++ b/include/llvm/Analysis/DOTGraphTraitsPass.h
@@ -16,6 +16,7 @@
 
 #include "llvm/Analysis/CFGPrinter.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/FileSystem.h"
 
 namespace llvm {
 
diff --git a/include/llvm/Analysis/DependenceAnalysis.h b/include/llvm/Analysis/DependenceAnalysis.h
index a142828..279755e 100644
--- a/include/llvm/Analysis/DependenceAnalysis.h
+++ b/include/llvm/Analysis/DependenceAnalysis.h
@@ -73,8 +73,8 @@ namespace llvm {
                Instruction *Destination) :
       Src(Source),
       Dst(Destination),
-      NextPredecessor(NULL),
-      NextSuccessor(NULL) {}
+      NextPredecessor(nullptr),
+      NextSuccessor(nullptr) {}
     virtual ~Dependence() {}
 
     /// Dependence::DVEntry - Each level in the distance/direction vector
@@ -96,7 +96,7 @@ namespace llvm {
       bool Splitable : 1; // Splitting the loop will break dependence.
       const SCEV *Distance; // NULL implies no distance available.
       DVEntry() : Direction(ALL), Scalar(true), PeelFirst(false),
-                  PeelLast(false), Splitable(false), Distance(NULL) { }
+                  PeelLast(false), Splitable(false), Distance(nullptr) { }
     };
 
     /// getSrc - Returns the source instruction for this dependence.
@@ -154,7 +154,7 @@ namespace llvm {
 
     /// getDistance - Returns the distance (or NULL) associated with a
     /// particular level.
-    virtual const SCEV *getDistance(unsigned Level) const { return NULL; }
+    virtual const SCEV *getDistance(unsigned Level) const { return nullptr; }
 
     /// isPeelFirst - Returns true if peeling the first iteration from
     /// this loop will break this dependence.
@@ -910,7 +910,8 @@ namespace llvm {
                     const Constraint &CurConstraint) const;
 
     bool tryDelinearize(const SCEV *SrcSCEV, const SCEV *DstSCEV,
-                        SmallVectorImpl<Subscript> &Pair) const;
+                        SmallVectorImpl<Subscript> &Pair,
+                        const SCEV *ElementSize) const;
 
   public:
     static char ID; // Class identification, replacement for typeinfo
@@ -921,7 +922,7 @@ namespace llvm {
     bool runOnFunction(Function &F) override;
     void releaseMemory() override;
     void getAnalysisUsage(AnalysisUsage &) const override;
-    void print(raw_ostream &, const Module * = 0) const override;
+    void print(raw_ostream &, const Module * = nullptr) const override;
   }; // class DependenceAnalysis
 
   /// createDependenceAnalysisPass - This creates an instance of the
diff --git a/include/llvm/Analysis/DominanceFrontier.h b/include/llvm/Analysis/DominanceFrontier.h
index 4dcea2d..0fbaa13 100644
--- a/include/llvm/Analysis/DominanceFrontier.h
+++ b/include/llvm/Analysis/DominanceFrontier.h
@@ -142,7 +142,7 @@ public:
 
   /// print - Convert to human readable form
   ///
-  void print(raw_ostream &OS, const Module* = 0) const override;
+  void print(raw_ostream &OS, const Module* = nullptr) const override;
 
   /// dump - Dump the dominance frontier to dbgs().
   void dump() const;
diff --git a/include/llvm/Analysis/IVUsers.h b/include/llvm/Analysis/IVUsers.h
index c6bb494..6038872 100644
--- a/include/llvm/Analysis/IVUsers.h
+++ b/include/llvm/Analysis/IVUsers.h
@@ -169,7 +169,7 @@ public:
     return Processed.count(Inst);
   }
 
-  void print(raw_ostream &OS, const Module* = 0) const override;
+  void print(raw_ostream &OS, const Module* = nullptr) const override;
 
   /// dump - This method is used for debugging.
   void dump() const;
diff --git a/include/llvm/Analysis/InstructionSimplify.h b/include/llvm/Analysis/InstructionSimplify.h
index 775d0df..2367c0b 100644
--- a/include/llvm/Analysis/InstructionSimplify.h
+++ b/include/llvm/Analysis/InstructionSimplify.h
@@ -48,160 +48,166 @@ namespace llvm {
   /// SimplifyAddInst - Given operands for an Add, see if we can
   /// fold the result. If not, this returns null.
   Value *SimplifyAddInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW,
-                         const DataLayout *TD = 0,
-                         const TargetLibraryInfo *TLI = 0,
-                         const DominatorTree *DT = 0);
+                         const DataLayout *TD = nullptr,
+                         const TargetLibraryInfo *TLI = nullptr,
+                         const DominatorTree *DT = nullptr);
 
   /// SimplifySubInst - Given operands for a Sub, see if we can
   /// fold the result. If not, this returns null.
   Value *SimplifySubInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW,
-                         const DataLayout *TD = 0,
-                         const TargetLibraryInfo *TLI = 0,
-                         const DominatorTree *DT = 0);
+                         const DataLayout *TD = nullptr,
+                         const TargetLibraryInfo *TLI = nullptr,
+                         const DominatorTree *DT = nullptr);
 
   /// Given operands for an FAdd, see if we can fold the result. If not, this
   /// returns null.
   Value *SimplifyFAddInst(Value *LHS, Value *RHS, FastMathFlags FMF,
-                          const DataLayout *TD = 0,
-                          const TargetLibraryInfo *TLI = 0,
-                          const DominatorTree *DT = 0);
+                          const DataLayout *TD = nullptr,
+                          const TargetLibraryInfo *TLI = nullptr,
+                          const DominatorTree *DT = nullptr);
 
   /// Given operands for an FSub, see if we can fold the result. If not, this
   /// returns null.
   Value *SimplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF,
-                          const DataLayout *TD = 0,
-                          const TargetLibraryInfo *TLI = 0,
-                          const DominatorTree *DT = 0);
+                          const DataLayout *TD = nullptr,
+                          const TargetLibraryInfo *TLI = nullptr,
+                          const DominatorTree *DT = nullptr);
 
   /// Given operands for an FMul, see if we can fold the result. If not, this
   /// returns null.
   Value *SimplifyFMulInst(Value *LHS, Value *RHS, FastMathFlags FMF,
-                          const DataLayout *TD = 0,
-                          const TargetLibraryInfo *TLI = 0,
-                          const DominatorTree *DT = 0);
+                          const DataLayout *TD = nullptr,
+                          const TargetLibraryInfo *TLI = nullptr,
+                          const DominatorTree *DT = nullptr);
 
   /// SimplifyMulInst - Given operands for a Mul, see if we can
   /// fold the result. If not, this returns null.
-  Value *SimplifyMulInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
-                         const TargetLibraryInfo *TLI = 0,
-                         const DominatorTree *DT = 0);
+  Value *SimplifyMulInst(Value *LHS, Value *RHS, const DataLayout *TD = nullptr,
+                         const TargetLibraryInfo *TLI = nullptr,
+                         const DominatorTree *DT = nullptr);
 
   /// SimplifySDivInst - Given operands for an SDiv, see if we can
   /// fold the result. If not, this returns null.
-  Value *SimplifySDivInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
-                          const TargetLibraryInfo *TLI = 0,
-                          const DominatorTree *DT = 0);
+  Value *SimplifySDivInst(Value *LHS, Value *RHS,
+                          const DataLayout *TD = nullptr,
+                          const TargetLibraryInfo *TLI = nullptr,
+                          const DominatorTree *DT = nullptr);
 
   /// SimplifyUDivInst - Given operands for a UDiv, see if we can
   /// fold the result. If not, this returns null.
-  Value *SimplifyUDivInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
-                          const TargetLibraryInfo *TLI = 0,
-                          const DominatorTree *DT = 0);
+  Value *SimplifyUDivInst(Value *LHS, Value *RHS,
+                          const DataLayout *TD = nullptr,
+                          const TargetLibraryInfo *TLI = nullptr,
+                          const DominatorTree *DT = nullptr);
 
   /// SimplifyFDivInst - Given operands for an FDiv, see if we can
   /// fold the result. If not, this returns null.
-  Value *SimplifyFDivInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
-                          const TargetLibraryInfo *TLI = 0,
-                          const DominatorTree *DT = 0);
+  Value *SimplifyFDivInst(Value *LHS, Value *RHS,
+                          const DataLayout *TD = nullptr,
+                          const TargetLibraryInfo *TLI = nullptr,
+                          const DominatorTree *DT = nullptr);
 
   /// SimplifySRemInst - Given operands for an SRem, see if we can
   /// fold the result. If not, this returns null.
-  Value *SimplifySRemInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
-                          const TargetLibraryInfo *TLI = 0,
-                          const DominatorTree *DT = 0);
+  Value *SimplifySRemInst(Value *LHS, Value *RHS,
+                          const DataLayout *TD = nullptr,
+                          const TargetLibraryInfo *TLI = nullptr,
+                          const DominatorTree *DT = nullptr);
 
   /// SimplifyURemInst - Given operands for a URem, see if we can
   /// fold the result. If not, this returns null.
-  Value *SimplifyURemInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
-                          const TargetLibraryInfo *TLI = 0,
-                          const DominatorTree *DT = 0);
+  Value *SimplifyURemInst(Value *LHS, Value *RHS,
+                          const DataLayout *TD = nullptr,
+                          const TargetLibraryInfo *TLI = nullptr,
+                          const DominatorTree *DT = nullptr);
 
   /// SimplifyFRemInst - Given operands for an FRem, see if we can
   /// fold the result. If not, this returns null.
-  Value *SimplifyFRemInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
-                          const TargetLibraryInfo *TLI = 0,
-                          const DominatorTree *DT = 0);
+  Value *SimplifyFRemInst(Value *LHS, Value *RHS,
+                          const DataLayout *TD = nullptr,
+                          const TargetLibraryInfo *TLI = nullptr,
+                          const DominatorTree *DT = nullptr);
 
   /// SimplifyShlInst - Given operands for a Shl, see if we can
   /// fold the result. If not, this returns null.
   Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
-                         const DataLayout *TD = 0,
-                         const TargetLibraryInfo *TLI = 0,
-                         const DominatorTree *DT = 0);
+                         const DataLayout *TD = nullptr,
+                         const TargetLibraryInfo *TLI = nullptr,
+                         const DominatorTree *DT = nullptr);
 
   /// SimplifyLShrInst - Given operands for an LShr, see if we can
   /// fold the result. If not, this returns null.
   Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
-                          const DataLayout *TD = 0,
-                          const TargetLibraryInfo *TLI = 0,
-                          const DominatorTree *DT = 0);
+                          const DataLayout *TD = nullptr,
+                          const TargetLibraryInfo *TLI = nullptr,
+                          const DominatorTree *DT = nullptr);
 
   /// SimplifyAShrInst - Given operands for an AShr, see if we can
   /// fold the result. If not, this returns null.
   Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
-                          const DataLayout *TD = 0,
-                          const TargetLibraryInfo *TLI = 0,
-                          const DominatorTree *DT = 0);
+                          const DataLayout *TD = nullptr,
+                          const TargetLibraryInfo *TLI = nullptr,
+                          const DominatorTree *DT = nullptr);
 
   /// SimplifyAndInst - Given operands for an And, see if we can
   /// fold the result. If not, this returns null.
-  Value *SimplifyAndInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
-                         const TargetLibraryInfo *TLI = 0,
-                         const DominatorTree *DT = 0);
+  Value *SimplifyAndInst(Value *LHS, Value *RHS, const DataLayout *TD = nullptr,
+                         const TargetLibraryInfo *TLI = nullptr,
+                         const DominatorTree *DT = nullptr);
 
   /// SimplifyOrInst - Given operands for an Or, see if we can
   /// fold the result. If not, this returns null.
- Value *SimplifyOrInst(Value *LHS, Value *RHS, const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + Value *SimplifyOrInst(Value *LHS, Value *RHS, const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); /// SimplifyXorInst - Given operands for a Xor, see if we can /// fold the result. If not, this returns null. - Value *SimplifyXorInst(Value *LHS, Value *RHS, const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + Value *SimplifyXorInst(Value *LHS, Value *RHS, const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); /// SimplifyICmpInst - Given operands for an ICmpInst, see if we can /// fold the result. If not, this returns null. Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, - const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); /// SimplifyFCmpInst - Given operands for an FCmpInst, see if we can /// fold the result. If not, this returns null. Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, - const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); /// SimplifySelectInst - Given operands for a SelectInst, see if we can fold /// the result. If not, this returns null. Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, - const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); /// SimplifyGEPInst - Given operands for an GetElementPtrInst, see if we can /// fold the result. If not, this returns null. - Value *SimplifyGEPInst(ArrayRef Ops, const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + Value *SimplifyGEPInst(ArrayRef Ops, const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); /// SimplifyInsertValueInst - Given operands for an InsertValueInst, see if we /// can fold the result. If not, this returns null. Value *SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, - const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); /// SimplifyTruncInst - Given operands for an TruncInst, see if we can fold /// the result. If not, this returns null. - Value *SimplifyTruncInst(Value *Op, Type *Ty, const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + Value *SimplifyTruncInst(Value *Op, Type *Ty, const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); //=== Helper functions for higher up the class hierarchy. @@ -209,40 +215,40 @@ namespace llvm { /// SimplifyCmpInst - Given operands for a CmpInst, see if we can /// fold the result. If not, this returns null. 
Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, - const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); /// SimplifyBinOp - Given operands for a BinaryOperator, see if we can /// fold the result. If not, this returns null. Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, - const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); /// \brief Given a function and iterators over arguments, see if we can fold /// the result. /// /// If this call could not be simplified returns null. Value *SimplifyCall(Value *V, User::op_iterator ArgBegin, - User::op_iterator ArgEnd, const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + User::op_iterator ArgEnd, const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); /// \brief Given a function and set of arguments, see if we can fold the /// result. /// /// If this call could not be simplified returns null. Value *SimplifyCall(Value *V, ArrayRef Args, - const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); /// SimplifyInstruction - See if we can compute a simplified version of this /// instruction. If not, this returns null. - Value *SimplifyInstruction(Instruction *I, const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + Value *SimplifyInstruction(Instruction *I, const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); /// \brief Replace all uses of 'I' with 'SimpleV' and simplify the uses @@ -254,9 +260,9 @@ namespace llvm { /// /// The function returns true if any simplifications were performed. bool replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV, - const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); /// \brief Recursively attempt to simplify an instruction. /// @@ -265,9 +271,9 @@ namespace llvm { /// of the users impacted. It returns true if any simplifications were /// performed. 
bool recursivelySimplifyInstruction(Instruction *I, - const DataLayout *TD = 0, - const TargetLibraryInfo *TLI = 0, - const DominatorTree *DT = 0); + const DataLayout *TD = nullptr, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr); } // end namespace llvm #endif diff --git a/include/llvm/Analysis/IntervalPartition.h b/include/llvm/Analysis/IntervalPartition.h index 05248bd..274be2b 100644 --- a/include/llvm/Analysis/IntervalPartition.h +++ b/include/llvm/Analysis/IntervalPartition.h @@ -48,7 +48,7 @@ class IntervalPartition : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid - IntervalPartition() : FunctionPass(ID), RootInterval(0) { + IntervalPartition() : FunctionPass(ID), RootInterval(nullptr) { initializeIntervalPartitionPass(*PassRegistry::getPassRegistry()); } @@ -62,7 +62,7 @@ public: IntervalPartition(IntervalPartition &I, bool); // print - Show contents in human readable format... - void print(raw_ostream &O, const Module* = 0) const override; + void print(raw_ostream &O, const Module* = nullptr) const override; // getRootInterval() - Return the root interval that contains the starting // block of the function. @@ -77,7 +77,7 @@ public: // getBlockInterval - Return the interval that a basic block exists in. inline Interval *getBlockInterval(BasicBlock *BB) { IntervalMapTy::iterator I = IntervalMap.find(BB); - return I != IntervalMap.end() ? I->second : 0; + return I != IntervalMap.end() ? I->second : nullptr; } // getAnalysisUsage - Implement the Pass API diff --git a/include/llvm/Analysis/LazyCallGraph.h b/include/llvm/Analysis/LazyCallGraph.h index 74b0c8e..70a4df5 100644 --- a/include/llvm/Analysis/LazyCallGraph.h +++ b/include/llvm/Analysis/LazyCallGraph.h @@ -38,8 +38,11 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" @@ -100,6 +103,7 @@ class raw_ostream; class LazyCallGraph { public: class Node; + class SCC; typedef SmallVector, 4> NodeVectorT; typedef SmallVectorImpl> NodeVectorImplT; @@ -109,67 +113,271 @@ public: /// be scanned for "calls" or uses of functions and its child information /// will be constructed. All of these results are accumulated and cached in /// the graph. - class iterator : public std::iterator { + class iterator + : public iterator_adaptor_base { friend class LazyCallGraph; friend class LazyCallGraph::Node; - typedef std::iterator BaseT; - /// \brief Nonce type to select the constructor for the end iterator. - struct IsAtEndT {}; - - LazyCallGraph &G; - NodeVectorImplT::iterator NI; + LazyCallGraph *G; + NodeVectorImplT::iterator E; - // Build the begin iterator for a node. - explicit iterator(LazyCallGraph &G, NodeVectorImplT &Nodes) - : G(G), NI(Nodes.begin()) {} - - // Build the end iterator for a node. This is selected purely by overload. - iterator(LazyCallGraph &G, NodeVectorImplT &Nodes, IsAtEndT /*Nonce*/) - : G(G), NI(Nodes.end()) {} + // Build the iterator for a specific position in a node list. 
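An aside on the InstructionSimplify entry points updated above: all three analysis parameters default to null and only affect precision. A minimal usage sketch, assuming an LLVM 3.5-era tree; the driver function itself is hypothetical, not part of this patch:

    #include "llvm/Analysis/InstructionSimplify.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instruction.h"

    using namespace llvm;

    // Hypothetical driver: fold away whatever SimplifyInstruction can prove.
    static bool simplifyAll(Function &F, const DataLayout *TD,
                            const TargetLibraryInfo *TLI,
                            const DominatorTree *DT) {
      bool Changed = false;
      for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB)
        for (BasicBlock::iterator I = BB->begin(); I != BB->end();) {
          Instruction *Inst = I++; // advance before possibly erasing
          // A null return means "no simplification found".
          if (Value *V = SimplifyInstruction(Inst, TD, TLI, DT)) {
            Inst->replaceAllUsesWith(V);
            Inst->eraseFromParent();
            Changed = true;
          }
        }
      return Changed;
    }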
+ iterator(LazyCallGraph &G, NodeVectorImplT::iterator NI, + NodeVectorImplT::iterator E) + : iterator_adaptor_base(NI), G(&G), E(E) { + while (I != E && I->isNull()) + ++I; + } public: - iterator(const iterator &Arg) : G(Arg.G), NI(Arg.NI) {} - iterator(iterator &&Arg) : G(Arg.G), NI(std::move(Arg.NI)) {} - iterator &operator=(iterator Arg) { - std::swap(Arg, *this); + iterator() {} + + using iterator_adaptor_base::operator++; + iterator &operator++() { + do { + ++I; + } while (I != E && I->isNull()); return *this; } - bool operator==(const iterator &Arg) { return NI == Arg.NI; } - bool operator!=(const iterator &Arg) { return !operator==(Arg); } - reference operator*() const { - if (NI->is()) - return NI->get(); + if (I->is()) + return *I->get(); - Function *F = NI->get(); - Node *ChildN = G.get(*F); - *NI = ChildN; + Function *F = I->get(); + Node &ChildN = G->get(*F); + *I = &ChildN; return ChildN; } - pointer operator->() const { return operator*(); } + }; - iterator &operator++() { - ++NI; - return *this; + /// \brief A node in the call graph. + /// + /// This represents a single node. Its primary roles are to cache the list of + /// callees, de-duplicate and provide fast testing of whether a function is + /// a callee, and facilitate iteration of child nodes in the graph. + class Node { + friend class LazyCallGraph; + friend class LazyCallGraph::SCC; + + LazyCallGraph *G; + Function &F; + + // We provide for the DFS numbering and Tarjan walk lowlink numbers to be + // stored directly within the node. + int DFSNumber; + int LowLink; + + mutable NodeVectorT Callees; + DenseMap CalleeIndexMap; + + /// \brief Basic constructor implements the scanning of F into Callees and + /// CalleeIndexMap. + Node(LazyCallGraph &G, Function &F); + + /// \brief Internal helper to insert a callee. + void insertEdgeInternal(Function &Callee); + + /// \brief Internal helper to insert a callee. + void insertEdgeInternal(Node &CalleeN); + + /// \brief Internal helper to remove a callee from this node. + void removeEdgeInternal(Function &Callee); + + public: + typedef LazyCallGraph::iterator iterator; + + Function &getFunction() const { + return F; + }; + + iterator begin() const { + return iterator(*G, Callees.begin(), Callees.end()); } - iterator operator++(int) { - iterator prev = *this; - ++*this; - return prev; + iterator end() const { return iterator(*G, Callees.end(), Callees.end()); } + + /// Equality is defined as address equality. + bool operator==(const Node &N) const { return this == &N; } + bool operator!=(const Node &N) const { return !operator==(N); } + }; + + /// \brief An SCC of the call graph. + /// + /// This represents a Strongly Connected Component of the call graph as + /// a collection of call graph nodes. While the order of nodes in the SCC is + /// stable, it is in no particular order.
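An aside before the SCC class: with the iterator rework above, a node's callees are walked by reference and materialized lazily on first dereference. A sketch of the pattern; the visitor function is hypothetical:

    #include "llvm/Analysis/LazyCallGraph.h"

    using namespace llvm;

    // Hypothetical walk over one node's callees. Dereferencing the iterator
    // converts a raw Function* entry into a Node& the first time it is seen.
    static void visitCallees(LazyCallGraph::Node &N) {
      for (LazyCallGraph::iterator I = N.begin(), E = N.end(); I != E; ++I) {
        LazyCallGraph::Node &Callee = *I;
        (void)Callee.getFunction();
      }
    }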
+ class SCC { + friend class LazyCallGraph; + friend class LazyCallGraph::Node; + + LazyCallGraph *G; + SmallPtrSet ParentSCCs; + SmallVector Nodes; + + SCC(LazyCallGraph &G) : G(&G) {} + + void insert(Node &N); + + void + internalDFS(SmallVectorImpl> &DFSStack, + SmallVectorImpl &PendingSCCStack, Node *N, + SmallVectorImpl &ResultSCCs); + + public: + typedef SmallVectorImpl::const_iterator iterator; + typedef pointee_iterator::const_iterator> parent_iterator; + + iterator begin() const { return Nodes.begin(); } + iterator end() const { return Nodes.end(); } + + parent_iterator parent_begin() const { return ParentSCCs.begin(); } + parent_iterator parent_end() const { return ParentSCCs.end(); } + + iterator_range parents() const { + return iterator_range(parent_begin(), parent_end()); } - iterator &operator--() { - --NI; - return *this; + /// \brief Test if this SCC is a parent of \a C. + bool isParentOf(const SCC &C) const { return C.isChildOf(*this); } + + /// \brief Test if this SCC is an ancestor of \a C. + bool isAncestorOf(const SCC &C) const { return C.isDescendantOf(*this); } + + /// \brief Test if this SCC is a child of \a C. + bool isChildOf(const SCC &C) const { + return ParentSCCs.count(const_cast(&C)); } - iterator operator--(int) { - iterator next = *this; - --*this; - return next; + + /// \brief Test if this SCC is a descendant of \a C. + bool isDescendantOf(const SCC &C) const; + + ///@{ + /// \name Mutation API + /// + /// These methods provide the core API for updating the call graph in the + /// presence of (potentially still in-flight) DFS-found SCCs. + /// + /// Note that these methods sometimes have complex runtimes, so be careful + /// how you call them. + + /// \brief Insert an edge from one node in this SCC to another in this SCC. + /// + /// By the definition of an SCC, this does not change the nature or make-up + /// of any SCCs. + void insertIntraSCCEdge(Node &CallerN, Node &CalleeN); + + /// \brief Insert an edge whose tail is in this SCC and head is in some + /// child SCC. + /// + /// There must be an existing path from the caller to the callee. This + /// operation is inexpensive and does not change the set of SCCs in the + /// graph. + void insertOutgoingEdge(Node &CallerN, Node &CalleeN); + + /// \brief Insert an edge whose tail is in a descendant SCC and head is in + /// this SCC. + /// + /// There must be an existing path from the callee to the caller in this + /// case. NB! This has the potential to be a very expensive function. It + /// inherently forms a cycle in the prior SCC DAG and we have to merge SCCs + /// to resolve that cycle. But finding all of the SCCs which participate in + /// the cycle can in the worst case require traversing every SCC in the + /// graph. Every attempt is made to avoid that, but passes must still + /// exercise caution calling this routine repeatedly. + /// + /// FIXME: We could possibly optimize this quite a bit for cases where the + /// caller and callee are very nearby in the graph. See comments in the + /// implementation for details, but that use case might impact users. + SmallVector insertIncomingEdge(Node &CallerN, Node &CalleeN); + + /// \brief Remove an edge whose source is in this SCC and target is *not*. + /// + /// This removes an inter-SCC edge. All inter-SCC edges originating from + /// this SCC have been fully explored by any in-flight DFS SCC formation, + /// so this is always safe to call once you have the source SCC.
+ /// + /// This operation does not change the set of SCCs or the members of the + /// SCCs and so is very inexpensive. It may change the connectivity graph + /// of the SCCs though, so be careful calling this while iterating over + /// them. + void removeInterSCCEdge(Node &CallerN, Node &CalleeN); + + /// \brief Remove an edge which is entirely within this SCC. + /// + /// Both the \a Caller and the \a Callee must be within this SCC. Removing + /// such an edge may break cycles that form this SCC and thus this + /// operation may change the SCC graph significantly. In particular, this + /// operation will re-form new SCCs based on the remaining connectivity of + /// the graph. The following invariants are guaranteed to hold after + /// calling this method: + /// + /// 1) This SCC is still an SCC in the graph. + /// 2) This SCC will be the parent of any new SCCs. Thus, this SCC is + /// preserved as the root of any new SCC directed graph formed. + /// 3) No SCC other than this SCC has its member set changed (this is + /// inherent in the definition of removing such an edge). + /// 4) All of the parent links of the SCC graph will be updated to reflect + /// the new SCC structure. + /// 5) All SCCs formed out of this SCC, excluding this SCC, will be + /// returned in a vector. + /// 6) The order of the SCCs in the vector will be a valid postorder + /// traversal of the new SCCs. + /// + /// These invariants are very important to ensure that we can build + /// optimization pipelines on top of the CGSCC pass manager which + /// intelligently update the SCC graph without invalidating other parts of + /// the SCC graph. + /// + /// The runtime complexity of this method is, in the worst case, O(V+E) + /// where V is the number of nodes in this SCC and E is the number of edges + /// leaving the nodes in this SCC. Note that E includes both edges within + /// this SCC and edges from this SCC to child SCCs. Some effort has been + /// made to minimize the overhead of common cases such as self-edges and + /// edge removals which result in a spanning tree with no more cycles. + SmallVector removeIntraSCCEdge(Node &CallerN, Node &CalleeN); + + ///@} + }; + + /// \brief A post-order depth-first SCC iterator over the call graph. + /// + /// This iterator triggers the Tarjan DFS-based formation of the SCC DAG for + /// the call graph, walking it lazily in depth-first post-order. That is, it + /// always visits SCCs for a callee prior to visiting the SCC for a caller + /// (when they are in different SCCs). + class postorder_scc_iterator + : public iterator_facade_base { + friend class LazyCallGraph; + friend class LazyCallGraph::Node; + + /// \brief Nonce type to select the constructor for the end iterator. + struct IsAtEndT {}; + + LazyCallGraph *G; + SCC *C; + + // Build the begin iterator for a node. + postorder_scc_iterator(LazyCallGraph &G) : G(&G) { + C = G.getNextSCCInPostOrder(); + } + + // Build the end iterator for a node. This is selected purely by overload. + postorder_scc_iterator(LazyCallGraph &G, IsAtEndT /*Nonce*/) + : G(&G), C(nullptr) {} + + public: + bool operator==(const postorder_scc_iterator &Arg) const { + return G == Arg.G && C == Arg.C; + } + + reference operator*() const { return *C; } + + using iterator_facade_base::operator++; + postorder_scc_iterator &operator++() { + C = G->getNextSCCInPostOrder(); + return *this; } }; @@ -180,44 +388,75 @@ public: /// requested during traversal. LazyCallGraph(Module &M); - /// \brief Copy constructor.
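An aside on consuming the mutation API above: removeIntraSCCEdge hands back the freshly split SCCs in a valid postorder (invariants 5 and 6). A sketch, assuming the template arguments stripped from the patch text make the return type a small vector of SCC pointers; the requeue helper is hypothetical:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/LazyCallGraph.h"

    using namespace llvm;

    // Hypothetical helper: delete an intra-SCC edge and requeue the SCCs that
    // split off. C itself remains a valid SCC and is not in the result.
    static void removeAndRequeue(LazyCallGraph::SCC &C,
                                 LazyCallGraph::Node &Caller,
                                 LazyCallGraph::Node &Callee,
                                 SmallVectorImpl<LazyCallGraph::SCC *> &Queue) {
      auto NewSCCs = C.removeIntraSCCEdge(Caller, Callee);
      for (LazyCallGraph::SCC *NewC : NewSCCs)
        Queue.push_back(NewC); // already in valid postorder
    }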
- /// - /// This does a deep copy of the graph. It does no verification that the - /// graph remains valid for the module. It is also relatively expensive. - LazyCallGraph(const LazyCallGraph &G); - - /// \brief Move constructor. - /// - /// This is a deep move. It leaves G in an undefined but destroyable state. - /// Any other operation on G is likely to fail. LazyCallGraph(LazyCallGraph &&G); + LazyCallGraph &operator=(LazyCallGraph &&RHS); + + iterator begin() { + return iterator(*this, EntryNodes.begin(), EntryNodes.end()); + } + iterator end() { return iterator(*this, EntryNodes.end(), EntryNodes.end()); } - /// \brief Copy and move assignment. - LazyCallGraph &operator=(LazyCallGraph RHS) { - std::swap(*this, RHS); - return *this; + postorder_scc_iterator postorder_scc_begin() { + return postorder_scc_iterator(*this); + } + postorder_scc_iterator postorder_scc_end() { + return postorder_scc_iterator(*this, postorder_scc_iterator::IsAtEndT()); } - iterator begin() { return iterator(*this, EntryNodes); } - iterator end() { return iterator(*this, EntryNodes, iterator::IsAtEndT()); } + iterator_range postorder_sccs() { + return iterator_range(postorder_scc_begin(), + postorder_scc_end()); + } /// \brief Lookup a function in the graph which has already been scanned and /// added. Node *lookup(const Function &F) const { return NodeMap.lookup(&F); } + /// \brief Lookup a function's SCC in the graph. + /// + /// \returns null if the function hasn't been assigned an SCC via the SCC + /// iterator walk. + SCC *lookupSCC(Node &N) const { return SCCMap.lookup(&N); } + /// \brief Get a graph node for a given function, scanning it to populate the /// graph data as necessary. - Node *get(Function &F) { + Node &get(Function &F) { Node *&N = NodeMap[&F]; if (N) - return N; + return *N; return insertInto(F, N); } -private: - Module &M; + ///@{ + /// \name Pre-SCC Mutation API + /// + /// These methods are only valid to call prior to forming any SCCs for this + /// call graph. They can be used to update the core node-graph during + /// a node-based inorder traversal that precedes any SCC-based traversal. + /// + /// Once you begin manipulating a call graph's SCCs, you must perform all + /// mutation of the graph via the SCC methods. + + /// \brief Update the call graph after inserting a new edge. + void insertEdge(Node &Caller, Function &Callee); + + /// \brief Update the call graph after inserting a new edge. + void insertEdge(Function &Caller, Function &Callee) { + return insertEdge(get(Caller), Callee); + } + + /// \brief Update the call graph after deleting an edge. + void removeEdge(Node &Caller, Function &Callee); + + /// \brief Update the call graph after deleting an edge. + void removeEdge(Function &Caller, Function &Callee) { + return removeEdge(get(Caller), Callee); + } + + ///@} +private: /// \brief Allocator that holds all the call graph nodes. SpecificBumpPtrAllocator BPA; @@ -230,56 +469,46 @@ private: /// escape at the module scope. NodeVectorT EntryNodes; - /// \brief Set of the entry nodes to the graph. - SmallPtrSet EntryNodeSet; - - /// \brief Helper to insert a new function, with an already looked-up entry in - /// the NodeMap. - Node *insertInto(Function &F, Node *&MappedN); + /// \brief Map of the entry nodes in the graph to their indices in + /// \c EntryNodes. + DenseMap EntryIndexMap; - /// \brief Helper to copy a node from another graph into this one. - Node *copyInto(const Node &OtherN); + /// \brief Allocator that holds all the call graph SCCs. 
+ SpecificBumpPtrAllocator SCCBPA; - /// \brief Helper to move a node from another graph into this one. - Node *moveInto(Node &&OtherN); -}; + /// \brief Maps Function -> SCC for fast lookup. + DenseMap SCCMap; -/// \brief A node in the call graph. -/// -/// This represents a single node. It's primary roles are to cache the list of -/// callees, de-duplicate and provide fast testing of whether a function is -/// a callee, and facilitate iteration of child nodes in the graph. -class LazyCallGraph::Node { - friend class LazyCallGraph; + /// \brief The leaf SCCs of the graph. + /// + /// These are all of the SCCs which have no children. + SmallVector LeafSCCs; - LazyCallGraph &G; - Function &F; - mutable NodeVectorT Callees; - SmallPtrSet CalleeSet; + /// \brief Stack of nodes in the DFS walk. + SmallVector, 4> DFSStack; - /// \brief Basic constructor implements the scanning of F into Callees and - /// CalleeSet. - Node(LazyCallGraph &G, Function &F); + /// \brief Set of entry nodes not-yet-processed into SCCs. + SmallVector SCCEntryNodes; - /// \brief Constructor used when copying a node from one graph to another. - Node(LazyCallGraph &G, const Node &OtherN); + /// \brief Stack of nodes the DFS has walked but not yet put into a SCC. + SmallVector PendingSCCStack; - /// \brief Constructor used when moving a node from one graph to another. - Node(LazyCallGraph &G, Node &&OtherN); + /// \brief Counter for the next DFS number to assign. + int NextDFSNumber; -public: - typedef LazyCallGraph::iterator iterator; + /// \brief Helper to insert a new function, with an already looked-up entry in + /// the NodeMap. + Node &insertInto(Function &F, Node *&MappedN); - Function &getFunction() const { - return F; - }; + /// \brief Helper to update pointers back to the graph object during moves. + void updateGraphPtrs(); - iterator begin() const { return iterator(G, Callees); } - iterator end() const { return iterator(G, Callees, iterator::IsAtEndT()); } + /// \brief Helper to form a new SCC out of the top of a DFSStack-like + /// structure. + SCC *formSCC(Node *RootN, SmallVectorImpl &NodeStack); - /// Equality is defined as address equality. - bool operator==(const Node &N) const { return this == &N; } - bool operator!=(const Node &N) const { return !operator==(N); } + /// \brief Retrieve the next node in the post-order SCC walk of the call graph. + SCC *getNextSCCInPostOrder(); }; // Provide GraphTraits specializations for call graphs. diff --git a/include/llvm/Analysis/LazyValueInfo.h b/include/llvm/Analysis/LazyValueInfo.h index a4cb806..2fe7386 100644 --- a/include/llvm/Analysis/LazyValueInfo.h +++ b/include/llvm/Analysis/LazyValueInfo.h @@ -33,10 +33,10 @@ class LazyValueInfo : public FunctionPass { void operator=(const LazyValueInfo&) LLVM_DELETED_FUNCTION; public: static char ID; - LazyValueInfo() : FunctionPass(ID), PImpl(0) { + LazyValueInfo() : FunctionPass(ID), PImpl(nullptr) { initializeLazyValueInfoPass(*PassRegistry::getPassRegistry()); } - ~LazyValueInfo() { assert(PImpl == 0 && "releaseMemory not called"); } + ~LazyValueInfo() { assert(!PImpl && "releaseMemory not called"); } /// Tristate - This is used to return true/false/dunno results. 
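To close out the LazyCallGraph changes: a sketch of the new postorder SCC walk and SCC lookup, again assuming the stripped template arguments (an SCC stores Node pointers):

    #include "llvm/Analysis/LazyCallGraph.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    static void walkSCCs(Module &M) {
      LazyCallGraph CG(M); // construction is cheap; scanning happens lazily
      // Callee SCCs are always visited before their callers' SCCs.
      for (LazyCallGraph::SCC &C : CG.postorder_sccs())
        for (LazyCallGraph::Node *N : C)
          (void)CG.lookupSCC(*N); // non-null once the walk has assigned it
    }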
enum Tristate { diff --git a/include/llvm/Analysis/LibCallAliasAnalysis.h b/include/llvm/Analysis/LibCallAliasAnalysis.h index 481015e..4c03c92 100644 --- a/include/llvm/Analysis/LibCallAliasAnalysis.h +++ b/include/llvm/Analysis/LibCallAliasAnalysis.h @@ -27,7 +27,7 @@ namespace llvm { LibCallInfo *LCI; - explicit LibCallAliasAnalysis(LibCallInfo *LC = 0) + explicit LibCallAliasAnalysis(LibCallInfo *LC = nullptr) : FunctionPass(ID), LCI(LC) { initializeLibCallAliasAnalysisPass(*PassRegistry::getPassRegistry()); } diff --git a/include/llvm/Analysis/LibCallSemantics.h b/include/llvm/Analysis/LibCallSemantics.h index 0f0bc23..8bd747f 100644 --- a/include/llvm/Analysis/LibCallSemantics.h +++ b/include/llvm/Analysis/LibCallSemantics.h @@ -130,7 +130,7 @@ namespace llvm { mutable const LibCallLocationInfo *Locations; mutable unsigned NumLocations; public: - LibCallInfo() : Impl(0), Locations(0), NumLocations(0) {} + LibCallInfo() : Impl(nullptr), Locations(nullptr), NumLocations(0) {} virtual ~LibCallInfo(); //===------------------------------------------------------------------===// diff --git a/include/llvm/Analysis/Loads.h b/include/llvm/Analysis/Loads.h index ebcb762..25c5928 100644 --- a/include/llvm/Analysis/Loads.h +++ b/include/llvm/Analysis/Loads.h @@ -27,7 +27,8 @@ class MDNode; /// specified pointer, we do a quick local scan of the basic block containing /// ScanFrom, to determine if the address is already accessed. bool isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom, - unsigned Align, const DataLayout *TD = 0); + unsigned Align, + const DataLayout *TD = nullptr); /// FindAvailableLoadedValue - Scan the ScanBB block backwards (starting at /// the instruction before ScanFrom) checking to see if we have the value at @@ -49,8 +50,8 @@ bool isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom, Value *FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, unsigned MaxInstsToScan = 6, - AliasAnalysis *AA = 0, - MDNode **TBAATag = 0); + AliasAnalysis *AA = nullptr, + MDNode **TBAATag = nullptr); } diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h index aeeea3c..bef03e9 100644 --- a/include/llvm/Analysis/LoopInfo.h +++ b/include/llvm/Analysis/LoopInfo.h @@ -79,7 +79,7 @@ class LoopBase { operator=(const LoopBase &) LLVM_DELETED_FUNCTION; public: /// Loop ctor - This creates an empty loop. - LoopBase() : ParentLoop(0) {} + LoopBase() : ParentLoop(nullptr) {} ~LoopBase() { for (size_t i = 0, e = SubLoops.size(); i != e; ++i) delete SubLoops[i]; @@ -106,7 +106,7 @@ public: /// bool contains(const LoopT *L) const { if (L == this) return true; - if (L == 0) return false; + if (!L) return false; return contains(L->getParentLoop()); } @@ -265,7 +265,7 @@ public: /// updates the loop depth of the new child. 
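An aside on the Loads.h helpers above: a sketch of the usual call pattern when trying to forward a load; the enclosing transform is hypothetical, and the default scan window is the 6 instructions named in the signature:

    #include "llvm/Analysis/Loads.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // Hypothetical: reuse a value already stored to or loaded from the same
    // address just above this load; returns null if nothing is found in range.
    static Value *tryForwardLoad(LoadInst *LI) {
      BasicBlock::iterator ScanFrom = LI; // scanning proceeds backwards
      return FindAvailableLoadedValue(LI->getPointerOperand(), LI->getParent(),
                                      ScanFrom);
    }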
/// void addChildLoop(LoopT *NewChild) { - assert(NewChild->ParentLoop == 0 && "NewChild already has a parent!"); + assert(!NewChild->ParentLoop && "NewChild already has a parent!"); NewChild->ParentLoop = static_cast(this); SubLoops.push_back(NewChild); } @@ -278,7 +278,7 @@ public: LoopT *Child = *I; assert(Child->ParentLoop == this && "Child is not a child of this loop!"); SubLoops.erase(SubLoops.begin()+(I-begin())); - Child->ParentLoop = 0; + Child->ParentLoop = nullptr; return Child; } @@ -333,7 +333,7 @@ public: protected: friend class LoopInfoBase; - explicit LoopBase(BlockT *BB) : ParentLoop(0) { + explicit LoopBase(BlockT *BB) : ParentLoop(nullptr) { Blocks.push_back(BB); DenseBlockSet.insert(BB); } @@ -372,7 +372,7 @@ public: /// If null, the terminator of the loop preheader is used. /// bool makeLoopInvariant(Value *V, bool &Changed, - Instruction *InsertPt = 0) const; + Instruction *InsertPt = nullptr) const; /// makeLoopInvariant - If the given instruction is inside of the /// loop and it can be hoisted, do so to make it trivially loop-invariant. @@ -384,7 +384,7 @@ public: /// If null, the terminator of the loop preheader is used. /// bool makeLoopInvariant(Instruction *I, bool &Changed, - Instruction *InsertPt = 0) const; + Instruction *InsertPt = nullptr) const; /// getCanonicalInductionVariable - Check to see if the loop has a canonical /// induction variable: an integer recurrence that starts at 0 and increments @@ -453,6 +453,31 @@ public: void dump() const; + /// \brief Return the debug location of the start of this loop. + /// This looks for a BB terminating instruction with a known debug + /// location by looking at the preheader and header blocks. If it + /// cannot find a terminating instruction with location information, + /// it returns an unknown location. + DebugLoc getStartLoc() const { + DebugLoc StartLoc; + BasicBlock *HeadBB; + + // Try the pre-header first. + if ((HeadBB = getLoopPreheader()) != nullptr) { + StartLoc = HeadBB->getTerminator()->getDebugLoc(); + if (!StartLoc.isUnknown()) + return StartLoc; + } + + // If we have no pre-header or there are no instructions with debug + // info in it, try the header. + HeadBB = getHeader(); + if (HeadBB) + StartLoc = HeadBB->getTerminator()->getDebugLoc(); + + return StartLoc; + } + private: friend class LoopInfoBase; explicit Loop(BasicBlock *BB) : LoopBase(BB) {} @@ -531,7 +556,7 @@ public: LoopT *removeLoop(iterator I) { assert(I != end() && "Cannot remove end iterator!"); LoopT *L = *I; - assert(L->getParentLoop() == 0 && "Not a top-level loop!"); + assert(!L->getParentLoop() && "Not a top-level loop!"); TopLevelLoops.erase(TopLevelLoops.begin() + (I-begin())); return L; } @@ -555,14 +580,14 @@ public: std::find(TopLevelLoops.begin(), TopLevelLoops.end(), OldLoop); assert(I != TopLevelLoops.end() && "Old loop not at top level!"); *I = NewLoop; - assert(NewLoop->ParentLoop == 0 && OldLoop->ParentLoop == 0 && + assert(!NewLoop->ParentLoop && !OldLoop->ParentLoop && "Loops already embedded into a subloop!"); } /// addTopLevelLoop - This adds the specified loop to the collection of /// top-level loops. 
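An aside on the new Loop::getStartLoc() above: it gives loop-level diagnostics a source position to anchor on, falling back from preheader to header as documented. A sketch; the reporting function is hypothetical:

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    static void reportLoopLocation(const Loop *L) {
      DebugLoc DL = L->getStartLoc();
      if (DL.isUnknown())
        errs() << "loop: no debug location\n";
      else
        errs() << "loop starts near line " << DL.getLine() << "\n";
    }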
void addTopLevelLoop(LoopT *New) { - assert(New->getParentLoop() == 0 && "Loop already in subloop!"); + assert(!New->getParentLoop() && "Loop already in subloop!"); TopLevelLoops.push_back(New); } @@ -583,7 +608,7 @@ public: static bool isNotAlreadyContainedIn(const LoopT *SubLoop, const LoopT *ParentLoop) { - if (SubLoop == 0) return true; + if (!SubLoop) return true; if (SubLoop == ParentLoop) return false; return isNotAlreadyContainedIn(SubLoop->getParentLoop(), ParentLoop); } @@ -660,7 +685,7 @@ public: void releaseMemory() override { LI.releaseMemory(); } - void print(raw_ostream &O, const Module* M = 0) const override; + void print(raw_ostream &O, const Module* M = nullptr) const override; void getAnalysisUsage(AnalysisUsage &AU) const override; diff --git a/include/llvm/Analysis/LoopInfoImpl.h b/include/llvm/Analysis/LoopInfoImpl.h index dd2dc28..948be0f 100644 --- a/include/llvm/Analysis/LoopInfoImpl.h +++ b/include/llvm/Analysis/LoopInfoImpl.h @@ -53,7 +53,7 @@ BlockT *LoopBase::getExitingBlock() const { getExitingBlocks(ExitingBlocks); if (ExitingBlocks.size() == 1) return ExitingBlocks[0]; - return 0; + return nullptr; } /// getExitBlocks - Return all of the successor blocks of this loop. These @@ -80,7 +80,7 @@ BlockT *LoopBase::getExitBlock() const { getExitBlocks(ExitBlocks); if (ExitBlocks.size() == 1) return ExitBlocks[0]; - return 0; + return nullptr; } /// getExitEdges - Return all pairs of (_inside_block_,_outside_block_). @@ -108,14 +108,14 @@ template BlockT *LoopBase::getLoopPreheader() const { // Keep track of nodes outside the loop branching to the header... BlockT *Out = getLoopPredecessor(); - if (!Out) return 0; + if (!Out) return nullptr; // Make sure there is only one exit out of the preheader. typedef GraphTraits BlockTraits; typename BlockTraits::ChildIteratorType SI = BlockTraits::child_begin(Out); ++SI; if (SI != BlockTraits::child_end(Out)) - return 0; // Multiple exits from the block, must not be a preheader. + return nullptr; // Multiple exits from the block, must not be a preheader. // The predecessor has exactly one successor, so it is a preheader. return Out; @@ -129,7 +129,7 @@ BlockT *LoopBase::getLoopPreheader() const { template BlockT *LoopBase::getLoopPredecessor() const { // Keep track of nodes outside the loop branching to the header... - BlockT *Out = 0; + BlockT *Out = nullptr; // Loop over the predecessors of the header node... BlockT *Header = getHeader(); @@ -140,7 +140,7 @@ BlockT *LoopBase::getLoopPredecessor() const { typename InvBlockTraits::NodeType *N = *PI; if (!contains(N)) { // If the block is not in the loop... 
if (Out && Out != N) - return 0; // Multiple predecessors outside the loop + return nullptr; // Multiple predecessors outside the loop Out = N; } } @@ -160,11 +160,11 @@ BlockT *LoopBase::getLoopLatch() const { InvBlockTraits::child_begin(Header); typename InvBlockTraits::ChildIteratorType PE = InvBlockTraits::child_end(Header); - BlockT *Latch = 0; + BlockT *Latch = nullptr; for (; PI != PE; ++PI) { typename InvBlockTraits::NodeType *N = *PI; if (contains(N)) { - if (Latch) return 0; + if (Latch) return nullptr; Latch = N; } } @@ -188,7 +188,7 @@ addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase &LIB) { assert((Blocks.empty() || LIB[getHeader()] == this) && "Incorrect LI specified for this loop!"); assert(NewBB && "Cannot add a null basic block to the loop!"); - assert(LIB[NewBB] == 0 && "BasicBlock already in the loop!"); + assert(!LIB[NewBB] && "BasicBlock already in the loop!"); LoopT *L = static_cast(this); @@ -210,12 +210,12 @@ template void LoopBase:: replaceChildLoopWith(LoopT *OldChild, LoopT *NewChild) { assert(OldChild->ParentLoop == this && "This loop is already broken!"); - assert(NewChild->ParentLoop == 0 && "NewChild already has a parent!"); + assert(!NewChild->ParentLoop && "NewChild already has a parent!"); typename std::vector::iterator I = std::find(SubLoops.begin(), SubLoops.end(), OldChild); assert(I != SubLoops.end() && "OldChild not in loop!"); *I = NewChild; - OldChild->ParentLoop = 0; + OldChild->ParentLoop = nullptr; NewChild->ParentLoop = static_cast(this); } @@ -270,11 +270,10 @@ void LoopBase::verifyLoop() const { // though it is permitted if the predecessor is not itself actually // reachable. BlockT *EntryBB = BB->getParent()->begin(); - for (df_iterator NI = df_begin(EntryBB), - NE = df_end(EntryBB); NI != NE; ++NI) - for (unsigned i = 0, e = OutsideLoopPreds.size(); i != e; ++i) - assert(*NI != OutsideLoopPreds[i] && - "Loop has multiple entry points!"); + for (BlockT *CB : depth_first(EntryBB)) + for (unsigned i = 0, e = OutsideLoopPreds.size(); i != e; ++i) + assert(CB != OutsideLoopPreds[i] && + "Loop has multiple entry points!"); } assert(HasInsideLoopPreds && "Loop block has no in-loop predecessors!"); assert(HasInsideLoopSuccs && "Loop block has no in-loop successors!"); diff --git a/include/llvm/Analysis/MemoryBuiltins.h b/include/llvm/Analysis/MemoryBuiltins.h index ff4bc22..d414680 100644 --- a/include/llvm/Analysis/MemoryBuiltins.h +++ b/include/llvm/Analysis/MemoryBuiltins.h @@ -233,7 +233,7 @@ class ObjectSizeOffsetEvaluator bool RoundToAlign; SizeOffsetEvalType unknown() { - return std::make_pair((Value*)0, (Value*)0); + return std::make_pair(nullptr, nullptr); } SizeOffsetEvalType compute_(Value *V); diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h index 123d435..1c4441b 100644 --- a/include/llvm/Analysis/MemoryDependenceAnalysis.h +++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h @@ -97,7 +97,7 @@ namespace llvm { PairTy Value; explicit MemDepResult(PairTy V) : Value(V) {} public: - MemDepResult() : Value(0, Invalid) {} + MemDepResult() : Value(nullptr, Invalid) {} /// get methods: These are static ctor methods for creating various /// MemDepResult kinds. @@ -155,7 +155,7 @@ namespace llvm { /// getInst() - If this is a normal dependency, return the instruction that /// is depended on. Otherwise, return null. 
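An aside on the verifyLoop change above: the explicit df_begin/df_end pair is replaced by the depth_first() range adaptor, which works for any type with GraphTraits. The same pattern in isolation, as a sketch:

    #include "llvm/ADT/DepthFirstIterator.h"
    #include "llvm/IR/CFG.h" // GraphTraits over BasicBlock*
    #include "llvm/IR/Function.h"

    using namespace llvm;

    static unsigned countReachableBlocks(Function &F) {
      unsigned Count = 0;
      for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
        (void)BB;
        ++Count;
      }
      return Count;
    }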
Instruction *getInst() const { - if (Value.getInt() == Other) return NULL; + if (Value.getInt() == Other) return nullptr; return Value.getPointer(); } @@ -285,7 +285,8 @@ namespace llvm { /// pointer. May be null if there are no tags or conflicting tags. const MDNode *TBAATag; - NonLocalPointerInfo() : Size(AliasAnalysis::UnknownSize), TBAATag(0) {} + NonLocalPointerInfo() + : Size(AliasAnalysis::UnknownSize), TBAATag(nullptr) {} }; /// CachedNonLocalPointerInfo - This map stores the cached results of doing @@ -401,7 +402,7 @@ namespace llvm { bool isLoad, BasicBlock::iterator ScanIt, BasicBlock *BB, - Instruction *QueryInst = 0); + Instruction *QueryInst = nullptr); /// getLoadLoadClobberFullWidthSize - This is a little bit of analysis that diff --git a/include/llvm/Analysis/PHITransAddr.h b/include/llvm/Analysis/PHITransAddr.h index 6d70edd..69f5907 100644 --- a/include/llvm/Analysis/PHITransAddr.h +++ b/include/llvm/Analysis/PHITransAddr.h @@ -45,7 +45,8 @@ class PHITransAddr { /// InstInputs - The inputs for our symbolic address. SmallVector InstInputs; public: - PHITransAddr(Value *addr, const DataLayout *DL) : Addr(addr), DL(DL), TLI(0) { + PHITransAddr(Value *addr, const DataLayout *DL) + : Addr(addr), DL(DL), TLI(nullptr) { // If the address is an instruction, the whole thing is considered an input. if (Instruction *I = dyn_cast(Addr)) InstInputs.push_back(I); diff --git a/include/llvm/Analysis/PtrUseVisitor.h b/include/llvm/Analysis/PtrUseVisitor.h index 572d5d7..6e61fc3 100644 --- a/include/llvm/Analysis/PtrUseVisitor.h +++ b/include/llvm/Analysis/PtrUseVisitor.h @@ -48,13 +48,13 @@ public: /// analysis and whether the visit completed or aborted early. class PtrInfo { public: - PtrInfo() : AbortedInfo(0, false), EscapedInfo(0, false) {} + PtrInfo() : AbortedInfo(nullptr, false), EscapedInfo(nullptr, false) {} /// \brief Reset the pointer info, clearing all state. void reset() { - AbortedInfo.setPointer(0); + AbortedInfo.setPointer(nullptr); AbortedInfo.setInt(false); - EscapedInfo.setPointer(0); + EscapedInfo.setPointer(nullptr); EscapedInfo.setInt(false); } @@ -76,14 +76,14 @@ public: /// \brief Mark the visit as aborted. Intended for use in a void return. /// \param I The instruction which caused the visit to abort, if available. - void setAborted(Instruction *I = 0) { + void setAborted(Instruction *I = nullptr) { AbortedInfo.setInt(true); AbortedInfo.setPointer(I); } /// \brief Mark the pointer as escaped. Intended for use in a void return. /// \param I The instruction which escapes the pointer, if available. - void setEscaped(Instruction *I = 0) { + void setEscaped(Instruction *I = nullptr) { EscapedInfo.setInt(true); EscapedInfo.setPointer(I); } @@ -92,7 +92,7 @@ public: /// for use in a void return. /// \param I The instruction which both escapes the pointer and aborts the /// visit, if available. - void setEscapedAndAborted(Instruction *I = 0) { + void setEscapedAndAborted(Instruction *I = nullptr) { setEscaped(I); setAborted(I); } diff --git a/include/llvm/Analysis/RegionInfo.h b/include/llvm/Analysis/RegionInfo.h index 4d55408..82a788d 100644 --- a/include/llvm/Analysis/RegionInfo.h +++ b/include/llvm/Analysis/RegionInfo.h @@ -33,6 +33,7 @@ #include "llvm/Analysis/PostDominators.h" #include "llvm/Support/Allocator.h" #include +#include namespace llvm { @@ -213,7 +214,7 @@ class Region : public RegionNode { // (The entry BasicBlock is part of RegionNode) BasicBlock *exit; - typedef std::vector RegionSet; + typedef std::vector> RegionSet; // The subregions of this region. 
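An aside on MemDepResult's contract above (getInst() is null for non-local or unknown results): a sketch of a typical query. The getDependency() call is the standard entry point on MemoryDependenceAnalysis, which a real pass would obtain via getAnalysis; it is not part of the hunk shown here.

    #include "llvm/Analysis/MemoryDependenceAnalysis.h"

    using namespace llvm;

    static Instruction *localDepOrNull(MemoryDependenceAnalysis &MDA,
                                       Instruction *QueryInst) {
      MemDepResult Res = MDA.getDependency(QueryInst);
      if (Res.isDef() || Res.isClobber())
        return Res.getInst(); // the defining or clobbering instruction
      return nullptr;         // non-local or unknown
    }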
RegionSet children; @@ -246,7 +247,7 @@ public: /// @param Parent The surrounding region or NULL if this is a top level /// region. Region(BasicBlock *Entry, BasicBlock *Exit, RegionInfo* RI, - DominatorTree *DT, Region *Parent = 0); + DominatorTree *DT, Region *Parent = nullptr); /// Delete the Region and all its subregions. ~Region(); @@ -311,7 +312,7 @@ public: /// @brief Check if a Region is the TopLevel region. /// /// The toplevel region represents the whole function. - bool isTopLevelRegion() const { return exit == NULL; } + bool isTopLevelRegion() const { return exit == nullptr; } /// @brief Return a new (non-canonical) region, that is obtained by joining /// this region with its predecessors. @@ -515,7 +516,7 @@ public: } // Construct the end iterator. - block_iterator_wrapper() : super(df_end((BasicBlock *)0)) {} + block_iterator_wrapper() : super(df_end((BasicBlock *)nullptr)) {} /*implicit*/ block_iterator_wrapper(super I) : super(I) {} diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index 06489d8..0570826 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -210,7 +210,7 @@ namespace llvm { void deleted() override; void allUsesReplacedWith(Value *New) override; public: - SCEVCallbackVH(Value *V, ScalarEvolution *SE = 0); + SCEVCallbackVH(Value *V, ScalarEvolution *SE = nullptr); }; friend class SCEVCallbackVH; @@ -291,7 +291,7 @@ namespace llvm { const SCEV *ExactNotTaken; PointerIntPair NextExit; - ExitNotTakenInfo() : ExitingBlock(0), ExactNotTaken(0) {} + ExitNotTakenInfo() : ExitingBlock(nullptr), ExactNotTaken(nullptr) {} /// isCompleteList - Return true if all loop exits are computable. bool isCompleteList() const { @@ -321,7 +321,7 @@ namespace llvm { const SCEV *Max; public: - BackedgeTakenInfo() : Max(0) {} + BackedgeTakenInfo() : Max(nullptr) {} /// Initialize BackedgeTakenInfo from a list of exact exit counts. BackedgeTakenInfo( @@ -894,10 +894,19 @@ namespace llvm { /// indirect operand. bool hasOperand(const SCEV *S, const SCEV *Op) const; + /// Return the size of an element read or written by Inst. + const SCEV *getElementSize(Instruction *Inst); + + /// Compute the array dimensions Sizes from the set of Terms extracted from + /// the memory access function of this SCEVAddRecExpr. + void findArrayDimensions(SmallVectorImpl &Terms, + SmallVectorImpl &Sizes, + const SCEV *ElementSize) const; + bool runOnFunction(Function &F) override; void releaseMemory() override; void getAnalysisUsage(AnalysisUsage &AU) const override; - void print(raw_ostream &OS, const Module* = 0) const override; + void print(raw_ostream &OS, const Module* = nullptr) const override; void verifyAnalysis() const override; private: diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h index 9162735..b9bef97 100644 --- a/include/llvm/Analysis/ScalarEvolutionExpander.h +++ b/include/llvm/Analysis/ScalarEvolutionExpander.h @@ -92,7 +92,7 @@ namespace llvm { public: /// SCEVExpander - Construct a SCEVExpander in "canonical" mode. explicit SCEVExpander(ScalarEvolution &se, const char *name) - : SE(se), IVName(name), IVIncInsertLoop(0), IVIncInsertPos(0), + : SE(se), IVName(name), IVIncInsertLoop(nullptr), IVIncInsertPos(nullptr), CanonicalMode(true), LSRMode(false), Builder(se.getContext(), TargetFolder(se.DL)) { #ifndef NDEBUG @@ -131,7 +131,7 @@ namespace llvm { /// representative. Return the number of phis eliminated. 
unsigned replaceCongruentIVs(Loop *L, const DominatorTree *DT, SmallVectorImpl &DeadInsts, - const TargetTransformInfo *TTI = NULL); + const TargetTransformInfo *TTI = nullptr); /// expandCodeFor - Insert code to directly compute the specified SCEV /// expression into the program. The inserted code is inserted into the @@ -219,7 +219,7 @@ namespace llvm { /// expression into the program. The inserted code is inserted into the /// SCEVExpander's current insertion point. If a type is specified, the /// result will be expanded to have that type, with a cast if necessary. - Value *expandCodeFor(const SCEV *SH, Type *Ty = 0); + Value *expandCodeFor(const SCEV *SH, Type *Ty = nullptr); /// getRelevantLoop - Determine the most "relevant" loop for the given SCEV. const Loop *getRelevantLoop(const SCEV *); diff --git a/include/llvm/Analysis/ScalarEvolutionExpressions.h b/include/llvm/Analysis/ScalarEvolutionExpressions.h index ed8c133..01b034f 100644 --- a/include/llvm/Analysis/ScalarEvolutionExpressions.h +++ b/include/llvm/Analysis/ScalarEvolutionExpressions.h @@ -14,6 +14,7 @@ #ifndef LLVM_ANALYSIS_SCALAREVOLUTIONEXPRESSIONS_H #define LLVM_ANALYSIS_SCALAREVOLUTIONEXPRESSIONS_H +#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Support/ErrorHandling.h" @@ -151,8 +152,12 @@ namespace llvm { } typedef const SCEV *const *op_iterator; + typedef iterator_range op_range; op_iterator op_begin() const { return Operands; } op_iterator op_end() const { return Operands + NumOperands; } + op_range operands() const { + return make_range(op_begin(), op_end()); + } Type *getType() const { return getOperand(0)->getType(); } @@ -352,12 +357,83 @@ namespace llvm { return S->getSCEVType() == scAddRecExpr; } - /// Splits the SCEV into two vectors of SCEVs representing the subscripts - /// and sizes of an array access. Returns the remainder of the - /// delinearization that is the offset start of the array. - const SCEV *delinearize(ScalarEvolution &SE, - SmallVectorImpl &Subscripts, - SmallVectorImpl &Sizes) const; + /// Collect parametric terms occurring in step expressions. + void collectParametricTerms(ScalarEvolution &SE, + SmallVectorImpl &Terms) const; + + /// Return in Subscripts the access functions for each dimension in Sizes. + void computeAccessFunctions(ScalarEvolution &SE, + SmallVectorImpl &Subscripts, + SmallVectorImpl &Sizes) const; + + /// Split this SCEVAddRecExpr into two vectors of SCEVs representing the + /// subscripts and sizes of an array access. + /// + /// The delinearization is a 3 step process: the first two steps compute the + /// sizes of each subscript and the third step computes the access functions + /// for the delinearized array: + /// + /// 1. Find the terms in the step functions + /// 2. Compute the array size + /// 3. Compute the access function: divide the SCEV by the array size + /// starting with the innermost dimensions found in step 2. The Quotient + /// is the SCEV to be divided in the next step of the recursion. The + /// Remainder is the subscript of the innermost dimension. Loop over all + /// array dimensions computed in step 2. + /// + /// To compute a uniform array size for several memory accesses to the same + /// object, one can collect in step 1 all the step terms for all the memory + /// accesses, and compute in step 2 a unique array shape. This guarantees + /// that the array shape will be the same across all memory accesses. 
+ /// + /// FIXME: We could derive the result of steps 1 and 2 from a description of + /// the array shape given in metadata. + /// + /// Example: + /// + /// A[][n][m] + /// + /// for i + /// for j + /// for k + /// A[j+k][2i][5i] = + /// + /// The initial SCEV: + /// + /// A[{{{0,+,2*m+5}_i, +, n*m}_j, +, n*m}_k] + /// + /// 1. Find the different terms in the step functions: + /// -> [2*m, 5, n*m, n*m] + /// + /// 2. Compute the array size: sort and unique them + /// -> [n*m, 2*m, 5] + /// find the GCD of all the terms = 1 + /// divide by the GCD and erase constant terms + /// -> [n*m, 2*m] + /// GCD = m + /// divide by GCD -> [n, 2] + /// remove constant terms + /// -> [n] + /// size of the array is A[unknown][n][m] + /// + /// 3. Compute the access function + /// a. Divide {{{0,+,2*m+5}_i, +, n*m}_j, +, n*m}_k by the innermost size m + /// Quotient: {{{0,+,2}_i, +, n}_j, +, n}_k + /// Remainder: {{{0,+,5}_i, +, 0}_j, +, 0}_k + /// The remainder is the subscript of the innermost array dimension: [5i]. + /// + /// b. Divide Quotient: {{{0,+,2}_i, +, n}_j, +, n}_k by next outer size n + /// Quotient: {{{0,+,0}_i, +, 1}_j, +, 1}_k + /// Remainder: {{{0,+,2}_i, +, 0}_j, +, 0}_k + /// The Remainder is the subscript of the next array dimension: [2i]. + /// + /// The subscript of the outermost dimension is the Quotient: [j+k]. + /// + /// Overall, we have: A[][n][m], and the access function: A[j+k][2i][5i]. + void delinearize(ScalarEvolution &SE, + SmallVectorImpl &Subscripts, + SmallVectorImpl &Sizes, + const SCEV *ElementSize) const; }; //===--------------------------------------------------------------------===// diff --git a/include/llvm/Analysis/SparsePropagation.h b/include/llvm/Analysis/SparsePropagation.h index 76c8ccf..65ff2f6 100644 --- a/include/llvm/Analysis/SparsePropagation.h +++ b/include/llvm/Analysis/SparsePropagation.h @@ -82,7 +82,7 @@ public: /// constant value, return it. Otherwise return null. The returned value /// must be in the same LLVM type as Val. virtual Constant *GetConstant(LatticeVal LV, Value *Val, SparseSolver &SS) { - return 0; + return nullptr; } /// ComputeArgument - Given a formal argument value, compute and return a diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 2ac6ffa..79fe1dc 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -105,7 +105,7 @@ public: /// The returned cost is defined in terms of \c TargetCostConstants, see its /// comments for a detailed explanation of the cost values. virtual unsigned getOperationCost(unsigned Opcode, Type *Ty, - Type *OpTy = 0) const; + Type *OpTy = nullptr) const; /// \brief Estimate the cost of a GEP operation when lowered. /// @@ -356,7 +356,7 @@ public: /// The index and subtype parameters are used by the subvector insertion and /// extraction shuffle kinds. virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index = 0, - Type *SubTp = 0) const; + Type *SubTp = nullptr) const; /// \return The expected cost of cast instructions, such as bitcast, trunc, /// zext, etc. @@ -369,7 +369,7 @@ public: /// \returns The expected cost of compare and select instructions. virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy = 0) const; + Type *CondTy = nullptr) const; /// \return The expected cost of vector Insert and Extract. /// Use -1 to indicate that there is no information on the index value. 
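An aside tying the three documented delinearization steps to the new API: collectParametricTerms is step 1, findArrayDimensions is step 2, computeAccessFunctions is step 3, and delinearize runs all three. A sketch, assuming the stripped template arguments are const SCEV pointers and that AR and AccessInst describe the same memory access:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/ScalarEvolution.h"
    #include "llvm/Analysis/ScalarEvolutionExpressions.h"

    using namespace llvm;

    static void delinearizeAccess(ScalarEvolution &SE, const SCEVAddRecExpr *AR,
                                  Instruction *AccessInst) {
      const SCEV *ElementSize = SE.getElementSize(AccessInst);
      SmallVector<const SCEV *, 4> Terms, Sizes, Subscripts;
      AR->collectParametricTerms(SE, Terms);             // step 1
      SE.findArrayDimensions(Terms, Sizes, ElementSize); // step 2
      AR->computeAccessFunctions(SE, Subscripts, Sizes); // step 3
      // Equivalent single call:
      // AR->delinearize(SE, Subscripts, Sizes, ElementSize);
    }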
diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index 0392f98..ce78967 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -27,24 +27,22 @@ namespace llvm { class MDNode; class TargetLibraryInfo; - /// ComputeMaskedBits - Determine which of the bits specified in Mask are - /// known to be either zero or one and return them in the KnownZero/KnownOne - /// bit sets. This code only analyzes bits in Mask, in order to short-circuit - /// processing. + /// Determine which bits of V are known to be either zero or one and return + /// them in the KnownZero/KnownOne bit sets. /// /// This function is defined on values with integer type, values with pointer /// type (but only if TD is non-null), and vectors of integers. In the case - /// where V is a vector, the mask, known zero, and known one values are the + /// where V is a vector, the known zero and known one values are the /// same width as the vector element, and the bit is set only if it is true /// for all of the elements in the vector. - void ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, - const DataLayout *TD = 0, unsigned Depth = 0); - void computeMaskedBitsLoad(const MDNode &Ranges, APInt &KnownZero); + void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, + const DataLayout *TD = nullptr, unsigned Depth = 0); + void computeKnownBitsLoad(const MDNode &Ranges, APInt &KnownZero); /// ComputeSignBit - Determine whether the sign bit is known to be zero or - /// one. Convenience wrapper around ComputeMaskedBits. + /// one. Convenience wrapper around computeKnownBits. void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne, - const DataLayout *TD = 0, unsigned Depth = 0); + const DataLayout *TD = nullptr, unsigned Depth = 0); /// isKnownToBeAPowerOfTwo - Return true if the given value is known to have /// exactly one bit set when defined. For vectors return true if every @@ -57,7 +55,8 @@ namespace llvm { /// when defined. For vectors return true if every element is known to be /// non-zero when defined. Supports values with integer or pointer type and /// vectors of integers. - bool isKnownNonZero(Value *V, const DataLayout *TD = 0, unsigned Depth = 0); + bool isKnownNonZero(Value *V, const DataLayout *TD = nullptr, + unsigned Depth = 0); /// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use /// this predicate to simplify operations downstream. Mask is known to be @@ -69,7 +68,7 @@ namespace llvm { /// same width as the vector element, and the bit is set only if it is true /// for all of the elements in the vector. bool MaskedValueIsZero(Value *V, const APInt &Mask, - const DataLayout *TD = 0, unsigned Depth = 0); + const DataLayout *TD = nullptr, unsigned Depth = 0); /// ComputeNumSignBits - Return the number of times the sign bit of the @@ -80,7 +79,7 @@ namespace llvm { /// /// 'Op' must have a scalar integer type. /// - unsigned ComputeNumSignBits(Value *Op, const DataLayout *TD = 0, + unsigned ComputeNumSignBits(Value *Op, const DataLayout *TD = nullptr, unsigned Depth = 0); /// ComputeMultiple - This function computes the integer multiple of Base that @@ -112,7 +111,7 @@ namespace llvm { /// insertvalues when a part of a nested struct is extracted. 
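An aside on the rename above: ComputeMaskedBits is now computeKnownBits, and the stale "Mask" wording in the old comment goes with it. A sketch of a caller; the query itself is hypothetical:

    #include "llvm/ADT/APInt.h"
    #include "llvm/Analysis/ValueTracking.h"

    using namespace llvm;

    // Hypothetical query: is bit 0 of V provably zero, i.e. is V even?
    static bool isKnownEven(Value *V, unsigned BitWidth, const DataLayout *TD) {
      APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
      computeKnownBits(V, KnownZero, KnownOne, TD);
      return KnownZero[0]; // proven zero for every possible value of V
    }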
Value *FindInsertedValue(Value *V, ArrayRef idx_range, - Instruction *InsertBefore = 0); + Instruction *InsertBefore = nullptr); /// GetPointerBaseWithConstantOffset - Analyze the specified pointer to see if /// it can be expressed as a base pointer plus a constant offset. Return the @@ -143,10 +142,10 @@ namespace llvm { /// being addressed. Note that the returned value has pointer type if the /// specified value does. If the MaxLookup value is non-zero, it limits the /// number of instructions to be stripped off. - Value *GetUnderlyingObject(Value *V, const DataLayout *TD = 0, + Value *GetUnderlyingObject(Value *V, const DataLayout *TD = nullptr, unsigned MaxLookup = 6); static inline const Value * - GetUnderlyingObject(const Value *V, const DataLayout *TD = 0, + GetUnderlyingObject(const Value *V, const DataLayout *TD = nullptr, unsigned MaxLookup = 6) { return GetUnderlyingObject(const_cast(V), TD, MaxLookup); } @@ -156,7 +155,7 @@ namespace llvm { /// multiple objects. void GetUnderlyingObjects(Value *V, SmallVectorImpl &Objects, - const DataLayout *TD = 0, + const DataLayout *TD = nullptr, unsigned MaxLookup = 6); /// onlyUsedByLifetimeMarkers - Return true if the only users of this pointer @@ -182,12 +181,12 @@ namespace llvm { /// However, this method can return true for instructions that read memory; /// for such instructions, moving them may change the resulting value. bool isSafeToSpeculativelyExecute(const Value *V, - const DataLayout *TD = 0); + const DataLayout *TD = nullptr); /// isKnownNonNull - Return true if this pointer couldn't possibly be null by /// its definition. This returns true for allocas, non-extern-weak globals /// and byval arguments. - bool isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI = 0); + bool isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI = nullptr); } // end namespace llvm diff --git a/include/llvm/Bitcode/BitstreamReader.h b/include/llvm/Bitcode/BitstreamReader.h index fcbf426..6f478b7 100644 --- a/include/llvm/Bitcode/BitstreamReader.h +++ b/include/llvm/Bitcode/BitstreamReader.h @@ -111,7 +111,7 @@ public: i != e; ++i) if (BlockInfoRecords[i].BlockID == BlockID) return &BlockInfoRecords[i]; - return 0; + return nullptr; } BlockInfo &getOrCreateBlockInfo(unsigned BlockID) { @@ -200,9 +200,9 @@ class BitstreamCursor { public: - BitstreamCursor() : BitStream(0), NextChar(0) { - } - BitstreamCursor(const BitstreamCursor &RHS) : BitStream(0), NextChar(0) { + BitstreamCursor() : BitStream(nullptr), NextChar(0) {} + BitstreamCursor(const BitstreamCursor &RHS) + : BitStream(nullptr), NextChar(0) { operator=(RHS); } @@ -490,7 +490,7 @@ public: /// EnterSubBlock - Having read the ENTER_SUBBLOCK abbrevid, enter /// the block, and return true if the block has an error. 
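An aside on GetUnderlyingObject above: it strips casts and GEPs for up to MaxLookup steps (default 6), so deep chains may stop early. A sketch of the common "is this based on an alloca" query:

    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    static bool isBasedOnAlloca(Value *V, const DataLayout *TD) {
      // A null TD only reduces precision; it never makes the answer wrong.
      return isa<AllocaInst>(GetUnderlyingObject(V, TD));
    }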
- bool EnterSubBlock(unsigned BlockID, unsigned *NumWordsP = 0); + bool EnterSubBlock(unsigned BlockID, unsigned *NumWordsP = nullptr); bool ReadBlockEnd() { if (BlockScope.empty()) return true; @@ -541,7 +541,7 @@ public: void skipRecord(unsigned AbbrevID); unsigned readRecord(unsigned AbbrevID, SmallVectorImpl<uint64_t> &Vals, - StringRef *Blob = 0); + StringRef *Blob = nullptr); //===--------------------------------------------------------------------===// // Abbrev Processing diff --git a/include/llvm/Bitcode/BitstreamWriter.h b/include/llvm/Bitcode/BitstreamWriter.h index ef88a88..dcfebd9 100644 --- a/include/llvm/Bitcode/BitstreamWriter.h +++ b/include/llvm/Bitcode/BitstreamWriter.h @@ -204,7 +204,7 @@ public: i != e; ++i) if (BlockInfoRecords[i].BlockID == BlockID) return &BlockInfoRecords[i]; - return 0; + return nullptr; } void EnterSubblock(unsigned BlockID, unsigned CodeLen) { @@ -347,7 +347,7 @@ private: EmitAbbreviatedField(EltEnc, (unsigned char)BlobData[i]); // Know that blob data is consumed for assertion below. - BlobData = 0; + BlobData = nullptr; } else { // Emit a vbr6 to indicate the number of elements present. EmitVBR(static_cast<uint32_t>(Vals.size()-RecordIdx), 6); @@ -378,7 +378,7 @@ private: WriteByte((unsigned char)BlobData[i]); // Know that blob data is consumed for assertion below. - BlobData = 0; + BlobData = nullptr; } else { for (unsigned e = Vals.size(); RecordIdx != e; ++RecordIdx) { assert(isUInt<8>(Vals[RecordIdx]) && @@ -397,7 +397,7 @@ private: } } assert(RecordIdx == Vals.size() && "Not all record operands emitted!"); - assert(BlobData == 0 && + assert(BlobData == nullptr && "Blob data specified for record that doesn't use it!"); } diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h index 7e6831b..04c08ab 100644 --- a/include/llvm/Bitcode/LLVMBitCodes.h +++ b/include/llvm/Bitcode/LLVMBitCodes.h @@ -311,7 +311,7 @@ namespace bitc { // 32 is unused. FUNC_CODE_DEBUG_LOC_AGAIN = 33, // DEBUG_LOC_AGAIN - FUNC_CODE_INST_CALL = 34, // CALL: [attr, fnty, fnid, args...] + FUNC_CODE_INST_CALL = 34, // CALL: [attr, cc, fnty, fnid, args...] FUNC_CODE_DEBUG_LOC = 35, // DEBUG_LOC: [Line,Col,ScopeVal, IAVal] FUNC_CODE_INST_FENCE = 36, // FENCE: [ordering, synchscope] @@ -371,7 +371,8 @@ namespace bitc { ATTR_KIND_BUILTIN = 35, ATTR_KIND_COLD = 36, ATTR_KIND_OPTIMIZE_NONE = 37, - ATTR_KIND_IN_ALLOCA = 38 + ATTR_KIND_IN_ALLOCA = 38, + ATTR_KIND_NON_NULL = 39 }; } // End bitc namespace diff --git a/include/llvm/Bitcode/ReaderWriter.h b/include/llvm/Bitcode/ReaderWriter.h index 0918e92..4c194a6 100644 --- a/include/llvm/Bitcode/ReaderWriter.h +++ b/include/llvm/Bitcode/ReaderWriter.h @@ -39,7 +39,7 @@ namespace llvm { Module *getStreamedBitcodeModule(const std::string &name, DataStreamer *streamer, LLVMContext &Context, - std::string *ErrMsg = 0); + std::string *ErrMsg = nullptr); /// getBitcodeTargetTriple - Read the header of the specified bitcode /// buffer and extract just the triple information. If successful, @@ -48,7 +48,7 @@ namespace llvm { /// if ErrMsg is non-null. std::string getBitcodeTargetTriple(MemoryBuffer *Buffer, LLVMContext &Context, - std::string *ErrMsg = 0); + std::string *ErrMsg = nullptr); /// Read the specified bitcode file, returning the module. /// This method *never* takes ownership of Buffer.
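The nullptr defaults above slot into the usual cursor-driven read loop. A minimal sketch of pulling out a single record (the helper readOneRecord is hypothetical, and Cursor and BlockID are assumed to come from surrounding reader code; real callers loop and handle errors):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Bitcode/BitstreamReader.h"

// Sketch: enter a sub-block and read one record, relying on the new
// defaults (NumWordsP and Blob both omitted, i.e. nullptr).
static void readOneRecord(llvm::BitstreamCursor &Cursor, unsigned BlockID) {
  if (Cursor.EnterSubBlock(BlockID)) // true signals a malformed block
    return;
  llvm::BitstreamEntry Entry = Cursor.advance();
  if (Entry.Kind != llvm::BitstreamEntry::Record)
    return;
  llvm::SmallVector<uint64_t, 8> Vals;
  unsigned Code = Cursor.readRecord(Entry.ID, Vals);
  (void)Code; // the record code; Vals now holds the operands
}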
diff --git a/include/llvm/CMakeLists.txt b/include/llvm/CMakeLists.txt index 0f5c63d..ca4fd13 100644 --- a/include/llvm/CMakeLists.txt +++ b/include/llvm/CMakeLists.txt @@ -12,3 +12,9 @@ if( MSVC_IDE OR XCODE ) set_target_properties(llvm_headers_do_not_build PROPERTIES FOLDER "Misc" EXCLUDE_FROM_DEFAULT_BUILD ON) endif() + +# If we're doing an out-of-tree build, copy a module map for generated +# header files into the build area. +if (NOT "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") + configure_file(module.modulemap.build module.modulemap COPYONLY) +endif (NOT "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") diff --git a/include/llvm/CodeGen/Analysis.h b/include/llvm/CodeGen/Analysis.h index 5f2bbd6..c3aefd4 100644 --- a/include/llvm/CodeGen/Analysis.h +++ b/include/llvm/CodeGen/Analysis.h @@ -54,7 +54,7 @@ inline unsigned ComputeLinearIndex(Type *Ty, /// void ComputeValueVTs(const TargetLowering &TLI, Type *Ty, SmallVectorImpl<EVT> &ValueVTs, - SmallVectorImpl<uint64_t> *Offsets = 0, + SmallVectorImpl<uint64_t> *Offsets = nullptr, uint64_t StartingOffset = 0); /// ExtractTypeInfo - Returns the type info, possibly bitcast, encoded in V. diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h index d96d810..b53fb42 100644 --- a/include/llvm/CodeGen/AsmPrinter.h +++ b/include/llvm/CodeGen/AsmPrinter.h @@ -23,501 +23,490 @@ #include "llvm/Support/ErrorHandling.h" namespace llvm { - class AsmPrinterHandler; - class BlockAddress; - class ByteStreamer; - class GCStrategy; - class Constant; - class ConstantArray; - class GCMetadataPrinter; - class GlobalValue; - class GlobalVariable; - class MachineBasicBlock; - class MachineFunction; - class MachineInstr; - class MachineLocation; - class MachineLoopInfo; - class MachineLoop; - class MachineConstantPoolValue; - class MachineJumpTableInfo; - class MachineModuleInfo; - class MCAsmInfo; - class MCCFIInstruction; - class MCContext; - class MCInst; - class MCInstrInfo; - class MCSection; - class MCStreamer; - class MCSubtargetInfo; - class MCSymbol; - class MDNode; - class DwarfDebug; - class DwarfException; - class Mangler; - class TargetLoweringObjectFile; - class DataLayout; - class TargetMachine; - - /// AsmPrinter - This class is intended to be used as a driving class for all - /// asm writers. - class AsmPrinter : public MachineFunctionPass { - public: - /// Target machine description. - /// - TargetMachine &TM; - - /// Target Asm Printer information. - /// - const MCAsmInfo *MAI; - - const MCInstrInfo *MII; - /// OutContext - This is the context for the output file that we are - /// streaming. This owns all of the global MC-related objects for the - /// generated translation unit. - MCContext &OutContext; - - /// OutStreamer - This is the MCStreamer object for the file we are - /// generating. This contains the transient state for the current - /// translation unit that we are generating (such as the current section - /// etc). - MCStreamer &OutStreamer; - - /// The current machine function. - const MachineFunction *MF; - - /// MMI - This is a pointer to the current MachineModuleInfo. - MachineModuleInfo *MMI; - - /// Name-mangler for global names. - /// - Mangler *Mang; - - /// The symbol for the current function. This is recalculated at the - /// beginning of each call to runOnMachineFunction(). - /// - MCSymbol *CurrentFnSym; - - /// The symbol used to represent the start of the current function for the - /// purpose of calculating its size (e.g. using the .size directive). By - /// default, this is equal to CurrentFnSym.
- MCSymbol *CurrentFnSymForSize; - - private: - // GCMetadataPrinters - The garbage collection metadata printer table. - void *GCMetadataPrinters; // Really a DenseMap. - - /// VerboseAsm - Emit comments in assembly output if this is true. - /// - bool VerboseAsm; - static char ID; - - /// If VerboseAsm is set, a pointer to the loop info for this - /// function. - MachineLoopInfo *LI; - - struct HandlerInfo { - AsmPrinterHandler *Handler; - const char *TimerName, *TimerGroupName; - HandlerInfo(AsmPrinterHandler *Handler, const char *TimerName, - const char *TimerGroupName) - : Handler(Handler), TimerName(TimerName), - TimerGroupName(TimerGroupName) {} - }; - /// Handlers - a vector of all debug/EH info emitters we should use. - /// This vector maintains ownership of the emitters. - SmallVector Handlers; - - /// DD - If the target supports dwarf debug info, this pointer is non-null. - DwarfDebug *DD; - - protected: - explicit AsmPrinter(TargetMachine &TM, MCStreamer &Streamer); - - public: - virtual ~AsmPrinter(); - - DwarfDebug *getDwarfDebug() { return DD; } - - /// isVerbose - Return true if assembly output should contain comments. - /// - bool isVerbose() const { return VerboseAsm; } - - /// getFunctionNumber - Return a unique ID for the current function. - /// - unsigned getFunctionNumber() const; - - /// getObjFileLowering - Return information about object file lowering. - const TargetLoweringObjectFile &getObjFileLowering() const; - - /// getDataLayout - Return information about data layout. - const DataLayout &getDataLayout() const; - - /// getSubtargetInfo - Return information about subtarget. - const MCSubtargetInfo &getSubtargetInfo() const; - - void EmitToStreamer(MCStreamer &S, const MCInst &Inst); - - /// getTargetTriple - Return the target triple string. - StringRef getTargetTriple() const; - - /// getCurrentSection() - Return the current section we are emitting to. - const MCSection *getCurrentSection() const; - - void getNameWithPrefix(SmallVectorImpl &Name, - const GlobalValue *GV) const; - - MCSymbol *getSymbol(const GlobalValue *GV) const; - - //===------------------------------------------------------------------===// - // MachineFunctionPass Implementation. - //===------------------------------------------------------------------===// - - /// getAnalysisUsage - Record analysis usage. - /// - void getAnalysisUsage(AnalysisUsage &AU) const override; - - /// doInitialization - Set up the AsmPrinter when we are working on a new - /// module. If your pass overrides this, it must make sure to explicitly - /// call this implementation. - bool doInitialization(Module &M) override; +class AsmPrinterHandler; +class BlockAddress; +class ByteStreamer; +class GCStrategy; +class Constant; +class ConstantArray; +class GCMetadataPrinter; +class GlobalValue; +class GlobalVariable; +class MachineBasicBlock; +class MachineFunction; +class MachineInstr; +class MachineLocation; +class MachineLoopInfo; +class MachineLoop; +class MachineConstantPoolValue; +class MachineJumpTableInfo; +class MachineModuleInfo; +class MCAsmInfo; +class MCCFIInstruction; +class MCContext; +class MCInst; +class MCInstrInfo; +class MCSection; +class MCStreamer; +class MCSubtargetInfo; +class MCSymbol; +class MDNode; +class DwarfDebug; +class DwarfException; +class Mangler; +class TargetLoweringObjectFile; +class DataLayout; +class TargetMachine; + +/// This class is intended to be used as a driving class for all asm writers. 
+class AsmPrinter : public MachineFunctionPass { +public: + /// Target machine description. + /// + TargetMachine &TM; + + /// Target Asm Printer information. + /// + const MCAsmInfo *MAI; + + const MCInstrInfo *MII; + /// This is the context for the output file that we are streaming. This owns + /// all of the global MC-related objects for the generated translation unit. + MCContext &OutContext; + + /// This is the MCStreamer object for the file we are generating. This + /// contains the transient state for the current translation unit that we are + /// generating (such as the current section etc). + MCStreamer &OutStreamer; + + /// The current machine function. + const MachineFunction *MF; + + /// This is a pointer to the current MachineModuleInfo. + MachineModuleInfo *MMI; + + /// Name-mangler for global names. + /// + Mangler *Mang; + + /// The symbol for the current function. This is recalculated at the beginning + /// of each call to runOnMachineFunction(). + /// + MCSymbol *CurrentFnSym; + + /// The symbol used to represent the start of the current function for the + /// purpose of calculating its size (e.g. using the .size directive). By + /// default, this is equal to CurrentFnSym. + MCSymbol *CurrentFnSymForSize; + +private: + // The garbage collection metadata printer table. + void *GCMetadataPrinters; // Really a DenseMap<GCStrategy*,GCMetadataPrinter*>. + + /// Emit comments in assembly output if this is true. + /// + bool VerboseAsm; + static char ID; + + /// If VerboseAsm is set, a pointer to the loop info for this function. + MachineLoopInfo *LI; + + struct HandlerInfo { + AsmPrinterHandler *Handler; + const char *TimerName, *TimerGroupName; + HandlerInfo(AsmPrinterHandler *Handler, const char *TimerName, + const char *TimerGroupName) + : Handler(Handler), TimerName(TimerName), + TimerGroupName(TimerGroupName) {} + }; + /// A vector of all debug/EH info emitters we should use. This vector + /// maintains ownership of the emitters. + SmallVector<HandlerInfo, 1> Handlers; - /// doFinalization - Shut down the asmprinter. If you override this in your - /// pass, you must make sure to call it explicitly. - bool doFinalization(Module &M) override; - - /// runOnMachineFunction - Emit the specified function out to the - /// OutStreamer. - bool runOnMachineFunction(MachineFunction &MF) override { - SetupMachineFunction(MF); - EmitFunctionHeader(); - EmitFunctionBody(); - return false; - } + /// If the target supports dwarf debug info, this pointer is non-null. + DwarfDebug *DD; - //===------------------------------------------------------------------===// - // Coarse grained IR lowering routines. - //===------------------------------------------------------------------===// - - /// SetupMachineFunction - This should be called when a new MachineFunction - /// is being processed from runOnMachineFunction. - void SetupMachineFunction(MachineFunction &MF); +protected: + explicit AsmPrinter(TargetMachine &TM, MCStreamer &Streamer); - /// EmitFunctionHeader - This method emits the header for the current - /// function. - void EmitFunctionHeader(); +public: + virtual ~AsmPrinter(); - /// EmitFunctionBody - This method emits the body and trailer for a - /// function. - void EmitFunctionBody(); + DwarfDebug *getDwarfDebug() { return DD; } - void emitCFIInstruction(const MachineInstr &MI); - - enum CFIMoveType { - CFI_M_None, - CFI_M_EH, - CFI_M_Debug - }; - CFIMoveType needsCFIMoves(); + /// Return true if assembly output should contain comments.
+ /// + bool isVerbose() const { return VerboseAsm; } - bool needsSEHMoves(); + /// Return a unique ID for the current function. + /// + unsigned getFunctionNumber() const; - /// EmitConstantPool - Print to the current output stream assembly - /// representations of the constants in the constant pool MCP. This is - /// used to print out constants which have been "spilled to memory" by - /// the code generator. - /// - virtual void EmitConstantPool(); + /// Return information about object file lowering. + const TargetLoweringObjectFile &getObjFileLowering() const; - /// EmitJumpTableInfo - Print assembly representations of the jump tables - /// used by the current function to the current output stream. - /// - void EmitJumpTableInfo(); + /// Return information about data layout. + const DataLayout &getDataLayout() const; - /// EmitGlobalVariable - Emit the specified global variable to the .s file. - virtual void EmitGlobalVariable(const GlobalVariable *GV); + /// Return information about subtarget. + const MCSubtargetInfo &getSubtargetInfo() const; - /// EmitSpecialLLVMGlobal - Check to see if the specified global is a - /// special global used by LLVM. If so, emit it and return true, otherwise - /// do nothing and return false. - bool EmitSpecialLLVMGlobal(const GlobalVariable *GV); + void EmitToStreamer(MCStreamer &S, const MCInst &Inst); - /// EmitAlignment - Emit an alignment directive to the specified power of - /// two boundary. For example, if you pass in 3 here, you will get an 8 - /// byte alignment. If a global value is specified, and if that global has - /// an explicit alignment requested, it will override the alignment request - /// if required for correctness. - /// - void EmitAlignment(unsigned NumBits, const GlobalValue *GV = 0) const; - - /// EmitBasicBlockStart - This method prints the label for the specified - /// MachineBasicBlock, an alignment (if present) and a comment describing - /// it if appropriate. - void EmitBasicBlockStart(const MachineBasicBlock *MBB) const; - - /// \brief Print a general LLVM constant to the .s file. - void EmitGlobalConstant(const Constant *CV); - - - //===------------------------------------------------------------------===// - // Overridable Hooks - //===------------------------------------------------------------------===// - - // Targets can, or in the case of EmitInstruction, must implement these to - // customize output. - - /// EmitStartOfAsmFile - This virtual method can be overridden by targets - /// that want to emit something at the start of their file. - virtual void EmitStartOfAsmFile(Module &) {} - - /// EmitEndOfAsmFile - This virtual method can be overridden by targets that - /// want to emit something at the end of their file. - virtual void EmitEndOfAsmFile(Module &) {} - - /// EmitFunctionBodyStart - Targets can override this to emit stuff before - /// the first basic block in the function. - virtual void EmitFunctionBodyStart() {} - - /// EmitFunctionBodyEnd - Targets can override this to emit stuff after - /// the last basic block in the function. - virtual void EmitFunctionBodyEnd() {} - - /// EmitInstruction - Targets should implement this to emit instructions. - virtual void EmitInstruction(const MachineInstr *) { - llvm_unreachable("EmitInstruction not implemented"); - } - - /// GetCPISymbol - Return the symbol for the specified constant pool entry. 
- virtual MCSymbol *GetCPISymbol(unsigned CPID) const; - - virtual void EmitFunctionEntryLabel(); - - virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV); - - /// EmitXXStructor - Targets can override this to change how global - /// constants that are part of a C++ static/global constructor list are - /// emitted. - virtual void EmitXXStructor(const Constant *CV) { - EmitGlobalConstant(CV); - } - - /// isBlockOnlyReachableByFallthough - Return true if the basic block has - /// exactly one predecessor and the control transfer mechanism between - /// the predecessor and this block is a fall-through. - virtual bool - isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const; - - /// emitImplicitDef - Targets can override this to customize the output of - /// IMPLICIT_DEF instructions in verbose mode. - virtual void emitImplicitDef(const MachineInstr *MI) const; - - //===------------------------------------------------------------------===// - // Symbol Lowering Routines. - //===------------------------------------------------------------------===// - public: - - /// GetTempSymbol - Return the MCSymbol corresponding to the assembler - /// temporary label with the specified stem and unique ID. - MCSymbol *GetTempSymbol(Twine Name, unsigned ID) const; - - /// GetTempSymbol - Return an assembler temporary label with the specified - /// stem. - MCSymbol *GetTempSymbol(Twine Name) const; - - /// Return the MCSymbol for a private symbol with global value name as its - /// base, with the specified suffix. - MCSymbol *getSymbolWithGlobalValueBase(const GlobalValue *GV, - StringRef Suffix) const; - - /// GetExternalSymbolSymbol - Return the MCSymbol for the specified - /// ExternalSymbol. - MCSymbol *GetExternalSymbolSymbol(StringRef Sym) const; - - /// GetJTISymbol - Return the symbol for the specified jump table entry. - MCSymbol *GetJTISymbol(unsigned JTID, bool isLinkerPrivate = false) const; - - /// GetJTSetSymbol - Return the symbol for the specified jump table .set - /// FIXME: privatize to AsmPrinter. - MCSymbol *GetJTSetSymbol(unsigned UID, unsigned MBBID) const; - - /// GetBlockAddressSymbol - Return the MCSymbol used to satisfy BlockAddress - /// uses of the specified basic block. - MCSymbol *GetBlockAddressSymbol(const BlockAddress *BA) const; - MCSymbol *GetBlockAddressSymbol(const BasicBlock *BB) const; - - //===------------------------------------------------------------------===// - // Emission Helper Routines. - //===------------------------------------------------------------------===// - public: - /// printOffset - This is just convenient handler for printing offsets. - void printOffset(int64_t Offset, raw_ostream &OS) const; - - /// EmitInt8 - Emit a byte directive and value. - /// - void EmitInt8(int Value) const; - - /// EmitInt16 - Emit a short directive and value. - /// - void EmitInt16(int Value) const; - - /// EmitInt32 - Emit a long directive and value. - /// - void EmitInt32(int Value) const; - - /// EmitLabelDifference - Emit something like ".long Hi-Lo" where the size - /// in bytes of the directive is specified by Size and Hi/Lo specify the - /// labels. This implicitly uses .set if it is available. - void EmitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo, - unsigned Size) const; - - /// EmitLabelOffsetDifference - Emit something like ".long Hi+Offset-Lo" - /// where the size in bytes of the directive is specified by Size and Hi/Lo - /// specify the labels. This implicitly uses .set if it is available. 
- void EmitLabelOffsetDifference(const MCSymbol *Hi, uint64_t Offset, - const MCSymbol *Lo, unsigned Size) const; - - /// EmitLabelPlusOffset - Emit something like ".long Label+Offset" - /// where the size in bytes of the directive is specified by Size and Label - /// specifies the label. This implicitly uses .set if it is available. - void EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset, - unsigned Size, - bool IsSectionRelative = false) const; - - /// EmitLabelReference - Emit something like ".long Label" - /// where the size in bytes of the directive is specified by Size and Label - /// specifies the label. - void EmitLabelReference(const MCSymbol *Label, unsigned Size, - bool IsSectionRelative = false) const { - EmitLabelPlusOffset(Label, 0, Size, IsSectionRelative); - } - - //===------------------------------------------------------------------===// - // Dwarf Emission Helper Routines - //===------------------------------------------------------------------===// - - /// EmitSLEB128 - emit the specified signed leb128 value. - void EmitSLEB128(int64_t Value, const char *Desc = 0) const; - - /// EmitULEB128 - emit the specified unsigned leb128 value. - void EmitULEB128(uint64_t Value, const char *Desc = 0, - unsigned PadTo = 0) const; - - /// EmitCFAByte - Emit a .byte 42 directive for a DW_CFA_xxx value. - void EmitCFAByte(unsigned Val) const; - - /// EmitEncodingByte - Emit a .byte 42 directive that corresponds to an - /// encoding. If verbose assembly output is enabled, we output comments - /// describing the encoding. Desc is a string saying what the encoding is - /// specifying (e.g. "LSDA"). - void EmitEncodingByte(unsigned Val, const char *Desc = 0) const; - - /// GetSizeOfEncodedValue - Return the size of the encoding in bytes. - unsigned GetSizeOfEncodedValue(unsigned Encoding) const; - - /// EmitReference - Emit reference to a ttype global with a specified encoding. - void EmitTTypeReference(const GlobalValue *GV, unsigned Encoding) const; - - /// EmitSectionOffset - Emit the 4-byte offset of Label from the start of - /// its section. This can be done with a special directive if the target - /// supports it (e.g. cygwin) or by emitting it as an offset from a label at - /// the start of the section. - /// - /// SectionLabel is a temporary label emitted at the start of the section - /// that Label lives in. - void EmitSectionOffset(const MCSymbol *Label, - const MCSymbol *SectionLabel) const; - - /// getISAEncoding - Get the value for DW_AT_APPLE_isa. Zero if no isa - /// encoding specified. - virtual unsigned getISAEncoding() { return 0; } - - /// EmitDwarfRegOp - Emit dwarf register operation. - virtual void EmitDwarfRegOp(ByteStreamer &BS, const MachineLocation &MLoc, - bool Indirect) const; - - //===------------------------------------------------------------------===// - // Dwarf Lowering Routines - //===------------------------------------------------------------------===// - - /// \brief Emit frame instruction to describe the layout of the frame. - void emitCFIInstruction(const MCCFIInstruction &Inst) const; - - //===------------------------------------------------------------------===// - // Inline Asm Support - //===------------------------------------------------------------------===// - public: - // These are hooks that targets can override to implement inline asm - // support. These should probably be moved out of AsmPrinter someday. 
- - /// PrintSpecial - Print information related to the specified machine instr - /// that is independent of the operand, and may be independent of the instr - /// itself. This can be useful for portably encoding the comment character - /// or other bits of target-specific knowledge into the asmstrings. The - /// syntax used is ${:comment}. Targets can override this to add support - /// for their own strange codes. - virtual void PrintSpecial(const MachineInstr *MI, raw_ostream &OS, - const char *Code) const; - - /// PrintAsmOperand - Print the specified operand of MI, an INLINEASM - /// instruction, using the specified assembler variant. Targets should - /// override this to format as appropriate. This method can return true if - /// the operand is erroneous. - virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS); - - /// PrintAsmMemoryOperand - Print the specified operand of MI, an INLINEASM - /// instruction, using the specified assembler variant as an address. - /// Targets should override this to format as appropriate. This method can - /// return true if the operand is erroneous. - virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, - const char *ExtraCode, raw_ostream &OS); - - /// Let the target do anything it needs to do after emitting inlineasm. - /// This callback can be used restore the original mode in case the - /// inlineasm contains directives to switch modes. - /// \p StartInfo - the original subtarget info before inline asm - /// \p EndInfo - the final subtarget info after parsing the inline asm, - /// or NULL if the value is unknown. - virtual void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo, - const MCSubtargetInfo *EndInfo) const; - - private: - /// Private state for PrintSpecial() - // Assign a unique ID to this machine instruction. - mutable const MachineInstr *LastMI; - mutable unsigned LastFn; - mutable unsigned Counter; - mutable unsigned SetCounter; - - /// EmitInlineAsm - Emit a blob of inline asm to the output streamer. - void EmitInlineAsm(StringRef Str, const MDNode *LocMDNode = 0, - InlineAsm::AsmDialect AsmDialect = - InlineAsm::AD_ATT) const; - - /// EmitInlineAsm - This method formats and emits the specified machine - /// instruction that is an inline asm. - void EmitInlineAsm(const MachineInstr *MI) const; - - //===------------------------------------------------------------------===// - // Internal Implementation Details - //===------------------------------------------------------------------===// - - /// EmitVisibility - This emits visibility information about symbol, if - /// this is suported by the target. - void EmitVisibility(MCSymbol *Sym, unsigned Visibility, - bool IsDefinition = true) const; - - void EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const; - - void EmitJumpTableEntry(const MachineJumpTableInfo *MJTI, - const MachineBasicBlock *MBB, unsigned uid) const; - void EmitLLVMUsedList(const ConstantArray *InitList); - /// Emit llvm.ident metadata in an '.ident' directive. - void EmitModuleIdents(Module &M); - void EmitXXStructorList(const Constant *List, bool isCtor); - GCMetadataPrinter *GetOrCreateGCPrinter(GCStrategy *C); - }; + /// Return the target triple string. + StringRef getTargetTriple() const; + + /// Return the current section we are emitting to. 
+ const MCSection *getCurrentSection() const; + + void getNameWithPrefix(SmallVectorImpl<char> &Name, + const GlobalValue *GV) const; + + MCSymbol *getSymbol(const GlobalValue *GV) const; + + //===------------------------------------------------------------------===// + // MachineFunctionPass Implementation. + //===------------------------------------------------------------------===// + + /// Record analysis usage. + /// + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Set up the AsmPrinter when we are working on a new module. If your pass + /// overrides this, it must make sure to explicitly call this implementation. + bool doInitialization(Module &M) override; + + /// Shut down the AsmPrinter. If you override this in your pass, you must make + /// sure to call it explicitly. + bool doFinalization(Module &M) override; + + /// Emit the specified function out to the OutStreamer. + bool runOnMachineFunction(MachineFunction &MF) override { + SetupMachineFunction(MF); + EmitFunctionHeader(); + EmitFunctionBody(); + return false; + } + + //===------------------------------------------------------------------===// + // Coarse grained IR lowering routines. + //===------------------------------------------------------------------===// + + /// This should be called when a new MachineFunction is being processed from + /// runOnMachineFunction. + void SetupMachineFunction(MachineFunction &MF); + + /// This method emits the header for the current function. + void EmitFunctionHeader(); + + /// This method emits the body and trailer for a function. + void EmitFunctionBody(); + + void emitCFIInstruction(const MachineInstr &MI); + + enum CFIMoveType { CFI_M_None, CFI_M_EH, CFI_M_Debug }; + CFIMoveType needsCFIMoves(); + + bool needsSEHMoves(); + + /// Print to the current output stream assembly representations of the + /// constants in the constant pool MCP. This is used to print out constants + /// which have been "spilled to memory" by the code generator. + /// + virtual void EmitConstantPool(); + + /// Print assembly representations of the jump tables used by the current + /// function to the current output stream. + /// + void EmitJumpTableInfo(); + + /// Emit the specified global variable to the .s file. + virtual void EmitGlobalVariable(const GlobalVariable *GV); + + /// Check to see if the specified global is a special global used by LLVM. If + /// so, emit it and return true, otherwise do nothing and return false. + bool EmitSpecialLLVMGlobal(const GlobalVariable *GV); + + /// Emit an alignment directive to the specified power of two boundary. For + /// example, if you pass in 3 here, you will get an 8 byte alignment. If a + /// global value is specified, and if that global has an explicit alignment + /// requested, it will override the alignment request if required for + /// correctness. + /// + void EmitAlignment(unsigned NumBits, const GlobalObject *GO = nullptr) const; + + /// This method prints the label for the specified MachineBasicBlock, an + /// alignment (if present) and a comment describing it if appropriate. + void EmitBasicBlockStart(const MachineBasicBlock &MBB) const; + + /// \brief Print a general LLVM constant to the .s file. + void EmitGlobalConstant(const Constant *CV); + + //===------------------------------------------------------------------===// + // Overridable Hooks + //===------------------------------------------------------------------===// + + // Targets can, or in the case of EmitInstruction, must implement these to + // customize output.
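Taken together, these hooks reduce a backend's printer to one mandatory override. A minimal sketch of a target subclass (MyTargetAsmPrinter and its lowerToMCInst helper are hypothetical names, not part of this patch; EmitToStreamer is the helper this patch adds):

#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/MC/MCInst.h"

// Sketch: the smallest useful AsmPrinter. EmitInstruction is the one hook a
// target must implement; everything else has a usable default.
class MyTargetAsmPrinter : public llvm::AsmPrinter {
public:
  MyTargetAsmPrinter(llvm::TargetMachine &TM, llvm::MCStreamer &Streamer)
      : AsmPrinter(TM, Streamer) {}

  void EmitInstruction(const llvm::MachineInstr *MI) override {
    llvm::MCInst TmpInst;
    lowerToMCInst(MI, TmpInst);           // hypothetical target lowering
    EmitToStreamer(OutStreamer, TmpInst); // helper introduced by this patch
  }

private:
  void lowerToMCInst(const llvm::MachineInstr *MI, llvm::MCInst &Out);
};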
+ + /// This virtual method can be overridden by targets that want to emit + /// something at the start of their file. + virtual void EmitStartOfAsmFile(Module &) {} + + /// This virtual method can be overridden by targets that want to emit + /// something at the end of their file. + virtual void EmitEndOfAsmFile(Module &) {} + + /// Targets can override this to emit stuff before the first basic block in + /// the function. + virtual void EmitFunctionBodyStart() {} + + /// Targets can override this to emit stuff after the last basic block in the + /// function. + virtual void EmitFunctionBodyEnd() {} + + /// Targets should implement this to emit instructions. + virtual void EmitInstruction(const MachineInstr *) { + llvm_unreachable("EmitInstruction not implemented"); + } + + /// Return the symbol for the specified constant pool entry. + virtual MCSymbol *GetCPISymbol(unsigned CPID) const; + + virtual void EmitFunctionEntryLabel(); + + virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV); + + /// Targets can override this to change how global constants that are part of + /// a C++ static/global constructor list are emitted. + virtual void EmitXXStructor(const Constant *CV) { EmitGlobalConstant(CV); } + + /// Return true if the basic block has exactly one predecessor and the control + /// transfer mechanism between the predecessor and this block is a + /// fall-through. + virtual bool + isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const; + + /// Targets can override this to customize the output of IMPLICIT_DEF + /// instructions in verbose mode. + virtual void emitImplicitDef(const MachineInstr *MI) const; + + //===------------------------------------------------------------------===// + // Symbol Lowering Routines. + //===------------------------------------------------------------------===// +public: + /// Return the MCSymbol corresponding to the assembler temporary label with + /// the specified stem and unique ID. + MCSymbol *GetTempSymbol(Twine Name, unsigned ID) const; + + /// Return an assembler temporary label with the specified stem. + MCSymbol *GetTempSymbol(Twine Name) const; + + /// Return the MCSymbol for a private symbol with global value name as its + /// base, with the specified suffix. + MCSymbol *getSymbolWithGlobalValueBase(const GlobalValue *GV, + StringRef Suffix) const; + + /// Return the MCSymbol for the specified ExternalSymbol. + MCSymbol *GetExternalSymbolSymbol(StringRef Sym) const; + + /// Return the symbol for the specified jump table entry. + MCSymbol *GetJTISymbol(unsigned JTID, bool isLinkerPrivate = false) const; + + /// Return the symbol for the specified jump table .set + /// FIXME: privatize to AsmPrinter. + MCSymbol *GetJTSetSymbol(unsigned UID, unsigned MBBID) const; + + /// Return the MCSymbol used to satisfy BlockAddress uses of the specified + /// basic block. + MCSymbol *GetBlockAddressSymbol(const BlockAddress *BA) const; + MCSymbol *GetBlockAddressSymbol(const BasicBlock *BB) const; + + //===------------------------------------------------------------------===// + // Emission Helper Routines. + //===------------------------------------------------------------------===// +public: + /// This is just convenient handler for printing offsets. + void printOffset(int64_t Offset, raw_ostream &OS) const; + + /// Emit a byte directive and value. + /// + void EmitInt8(int Value) const; + + /// Emit a short directive and value. + /// + void EmitInt16(int Value) const; + + /// Emit a long directive and value. 
+ /// + void EmitInt32(int Value) const; + + /// Emit something like ".long Hi-Lo" where the size in bytes of the directive + /// is specified by Size and Hi/Lo specify the labels. This implicitly uses + /// .set if it is available. + void EmitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo, + unsigned Size) const; + + /// Emit something like ".long Hi+Offset-Lo" where the size in bytes of the + /// directive is specified by Size and Hi/Lo specify the labels. This + /// implicitly uses .set if it is available. + void EmitLabelOffsetDifference(const MCSymbol *Hi, uint64_t Offset, + const MCSymbol *Lo, unsigned Size) const; + + /// Emit something like ".long Label+Offset" where the size in bytes of the + /// directive is specified by Size and Label specifies the label. This + /// implicitly uses .set if it is available. + void EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset, + unsigned Size, bool IsSectionRelative = false) const; + + /// Emit something like ".long Label" where the size in bytes of the directive + /// is specified by Size and Label specifies the label. + void EmitLabelReference(const MCSymbol *Label, unsigned Size, + bool IsSectionRelative = false) const { + EmitLabelPlusOffset(Label, 0, Size, IsSectionRelative); + } + + //===------------------------------------------------------------------===// + // Dwarf Emission Helper Routines + //===------------------------------------------------------------------===// + + /// Emit the specified signed leb128 value. + void EmitSLEB128(int64_t Value, const char *Desc = nullptr) const; + + /// Emit the specified unsigned leb128 value. + void EmitULEB128(uint64_t Value, const char *Desc = nullptr, + unsigned PadTo = 0) const; + + /// Emit a .byte 42 directive for a DW_CFA_xxx value. + void EmitCFAByte(unsigned Val) const; + + /// Emit a .byte 42 directive that corresponds to an encoding. If verbose + /// assembly output is enabled, we output comments describing the encoding. + /// Desc is a string saying what the encoding is specifying (e.g. "LSDA"). + void EmitEncodingByte(unsigned Val, const char *Desc = nullptr) const; + + /// Return the size of the encoding in bytes. + unsigned GetSizeOfEncodedValue(unsigned Encoding) const; + + /// Emit reference to a ttype global with a specified encoding. + void EmitTTypeReference(const GlobalValue *GV, unsigned Encoding) const; + + /// Emit the 4-byte offset of Label from the start of its section. This can + /// be done with a special directive if the target supports it (e.g. cygwin) + /// or by emitting it as an offset from a label at the start of the section. + /// + /// SectionLabel is a temporary label emitted at the start of the section + /// that Label lives in. + void EmitSectionOffset(const MCSymbol *Label, + const MCSymbol *SectionLabel) const; + + /// Get the value for DW_AT_APPLE_isa. Zero if no isa encoding specified. + virtual unsigned getISAEncoding() { return 0; } + + /// \brief Emit a partial DWARF register operation. + /// \param MLoc the register + /// \param PieceSize size and + /// \param PieceOffset offset of the piece in bits, if this is one + /// piece of an aggregate value. + /// + /// If size and offset is zero an operation for the entire + /// register is emitted: Some targets do not provide a DWARF + /// register number for every register. If this is the case, this + /// function will attempt to emit a DWARF register by emitting a + /// piece of a super-register or by piecing together multiple + /// subregisters that alias the register. 
+ void EmitDwarfRegOpPiece(ByteStreamer &BS, const MachineLocation &MLoc, + unsigned PieceSize = 0, + unsigned PieceOffset = 0) const; + + /// Emit dwarf register operation. + /// \param Indirect whether this is a register-indirect address + virtual void EmitDwarfRegOp(ByteStreamer &BS, const MachineLocation &MLoc, + bool Indirect) const; + + //===------------------------------------------------------------------===// + // Dwarf Lowering Routines + //===------------------------------------------------------------------===// + + /// \brief Emit frame instruction to describe the layout of the frame. + void emitCFIInstruction(const MCCFIInstruction &Inst) const; + + //===------------------------------------------------------------------===// + // Inline Asm Support + //===------------------------------------------------------------------===// +public: + // These are hooks that targets can override to implement inline asm + // support. These should probably be moved out of AsmPrinter someday. + + /// Print information related to the specified machine instr that is + /// independent of the operand, and may be independent of the instr itself. + /// This can be useful for portably encoding the comment character or other + /// bits of target-specific knowledge into the asmstrings. The syntax used is + /// ${:comment}. Targets can override this to add support for their own + /// strange codes. + virtual void PrintSpecial(const MachineInstr *MI, raw_ostream &OS, + const char *Code) const; + + /// Print the specified operand of MI, an INLINEASM instruction, using the + /// specified assembler variant. Targets should override this to format as + /// appropriate. This method can return true if the operand is erroneous. + virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS); + + /// Print the specified operand of MI, an INLINEASM instruction, using the + /// specified assembler variant as an address. Targets should override this to + /// format as appropriate. This method can return true if the operand is + /// erroneous. + virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS); + + /// Let the target do anything it needs to do after emitting inlineasm. + /// This callback can be used to restore the original mode in case the + /// inlineasm contains directives to switch modes. + /// \p StartInfo - the original subtarget info before inline asm + /// \p EndInfo - the final subtarget info after parsing the inline asm, + /// or NULL if the value is unknown. + virtual void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo, + const MCSubtargetInfo *EndInfo) const; + +private: + /// Private state for PrintSpecial() + // Assign a unique ID to this machine instruction. + mutable const MachineInstr *LastMI; + mutable unsigned LastFn; + mutable unsigned Counter; + mutable unsigned SetCounter; + + /// Emit a blob of inline asm to the output streamer. + void + EmitInlineAsm(StringRef Str, const MDNode *LocMDNode = nullptr, + InlineAsm::AsmDialect AsmDialect = InlineAsm::AD_ATT) const; + + /// This method formats and emits the specified machine instruction that is an + /// inline asm.
+ void EmitInlineAsm(const MachineInstr *MI) const; + + //===------------------------------------------------------------------===// + // Internal Implementation Details + //===------------------------------------------------------------------===// + + /// This emits visibility information about a symbol, if this is supported by + /// the target. + void EmitVisibility(MCSymbol *Sym, unsigned Visibility, + bool IsDefinition = true) const; + + void EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const; + + void EmitJumpTableEntry(const MachineJumpTableInfo *MJTI, + const MachineBasicBlock *MBB, unsigned uid) const; + void EmitLLVMUsedList(const ConstantArray *InitList); + /// Emit llvm.ident metadata in an '.ident' directive. + void EmitModuleIdents(Module &M); + void EmitXXStructorList(const Constant *List, bool isCtor); + GCMetadataPrinter *GetOrCreateGCPrinter(GCStrategy &C); +}; } #endif diff --git a/include/llvm/CodeGen/CallingConvLower.h b/include/llvm/CodeGen/CallingConvLower.h index 50bbb0d..04af4bd 100644 --- a/include/llvm/CodeGen/CallingConvLower.h +++ b/include/llvm/CodeGen/CallingConvLower.h @@ -112,6 +112,23 @@ public: return Ret; } + // There is no need to differentiate between a pending CCValAssign and other + // kinds, as they are stored in a different list. + static CCValAssign getPending(unsigned ValNo, MVT ValVT, MVT LocVT, + LocInfo HTP) { + return getReg(ValNo, ValVT, 0, LocVT, HTP); + } + + void convertToReg(unsigned RegNo) { + Loc = RegNo; + isMem = false; + } + + void convertToMem(unsigned Offset) { + Loc = Offset; + isMem = true; + } + unsigned getValNo() const { return ValNo; } MVT getValVT() const { return ValVT; } @@ -164,6 +181,7 @@ private: unsigned StackOffset; SmallVector<uint32_t, 16> UsedRegs; + SmallVector<CCValAssign, 4> PendingLocs; // ByValInfo and SmallVector<ByValInfo, 4> ByValRegs: // @@ -279,7 +297,7 @@ public: /// getFirstUnallocated - Return the first unallocated register in the set, or /// NumRegs if they are all allocated. - unsigned getFirstUnallocated(const uint16_t *Regs, unsigned NumRegs) const { + unsigned getFirstUnallocated(const MCPhysReg *Regs, unsigned NumRegs) const { for (unsigned i = 0; i != NumRegs; ++i) if (!isAllocated(Regs[i])) return i; @@ -306,7 +324,7 @@ public: /// AllocateReg - Attempt to allocate one of the specified registers. If none /// are available, return zero. Otherwise, return the first one available, /// marking it and any aliases as allocated. - unsigned AllocateReg(const uint16_t *Regs, unsigned NumRegs) { + unsigned AllocateReg(const MCPhysReg *Regs, unsigned NumRegs) { unsigned FirstUnalloc = getFirstUnallocated(Regs, NumRegs); if (FirstUnalloc == NumRegs) return 0; // Didn't find the reg. @@ -317,8 +335,33 @@ public: return Reg; } + /// AllocateRegBlock - Attempt to allocate a block of RegsRequired consecutive + /// registers. If this is not possible, return zero. Otherwise, return the first + /// register of the block that was allocated, marking the entire block as allocated.
+ unsigned AllocateRegBlock(const uint16_t *Regs, unsigned NumRegs, unsigned RegsRequired) { + for (unsigned StartIdx = 0; StartIdx <= NumRegs - RegsRequired; ++StartIdx) { + bool BlockAvailable = true; + // Check for already-allocated regs in this block + for (unsigned BlockIdx = 0; BlockIdx < RegsRequired; ++BlockIdx) { + if (isAllocated(Regs[StartIdx + BlockIdx])) { + BlockAvailable = false; + break; + } + } + if (BlockAvailable) { + // Mark the entire block as allocated + for (unsigned BlockIdx = 0; BlockIdx < RegsRequired; ++BlockIdx) { + MarkAllocated(Regs[StartIdx + BlockIdx]); + } + return Regs[StartIdx]; + } + } + // No block was available + return 0; + } + /// Version of AllocateReg with list of registers to be shadowed. - unsigned AllocateReg(const uint16_t *Regs, const uint16_t *ShadowRegs, + unsigned AllocateReg(const MCPhysReg *Regs, const MCPhysReg *ShadowRegs, unsigned NumRegs) { unsigned FirstUnalloc = getFirstUnallocated(Regs, NumRegs); if (FirstUnalloc == NumRegs) @@ -351,7 +394,7 @@ public: /// Version of AllocateStack with list of extra registers to be shadowed. /// Note that, unlike AllocateReg, this shadows ALL of the shadow registers. unsigned AllocateStack(unsigned Size, unsigned Align, - const uint16_t *ShadowRegs, unsigned NumShadowRegs) { + const MCPhysReg *ShadowRegs, unsigned NumShadowRegs) { for (unsigned i = 0; i < NumShadowRegs; ++i) MarkAllocated(ShadowRegs[i]); return AllocateStack(Size, Align); @@ -411,6 +454,11 @@ public: ParmContext getCallOrPrologue() const { return CallOrPrologue; } + // Get list of pending assignments + SmallVectorImpl<CCValAssign> &getPendingLocs() { + return PendingLocs; + } + private: /// MarkAllocated - Mark a register and all of its aliases as allocated. void MarkAllocated(unsigned Reg); diff --git a/include/llvm/CodeGen/CommandFlags.h b/include/llvm/CodeGen/CommandFlags.h index 02a4bb5..2956ad8 100644 --- a/include/llvm/CodeGen/CommandFlags.h +++ b/include/llvm/CodeGen/CommandFlags.h @@ -16,6 +16,7 @@ #ifndef LLVM_CODEGEN_COMMANDFLAGS_H #define LLVM_CODEGEN_COMMANDFLAGS_H +#include "llvm/MC/MCTargetOptionsCommandFlags.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetMachine.h" @@ -69,11 +70,6 @@ CMModel("code-model", "Large code model"), clEnumValEnd)); -cl::opt<bool> -RelaxAll("mc-relax-all", cl::desc("When used with filetype=obj, " "relax all fixups in the emitted object file")); - cl::opt<TargetMachine::CodeGenFileType> FileType("filetype", cl::init(TargetMachine::CGFT_AssemblyFile), cl::desc("Choose a file type (not all types are supported by all targets):"), @@ -86,12 +82,6 @@ FileType("filetype", cl::init(TargetMachine::CGFT_AssemblyFile), "Emit nothing, for performance testing"), clEnumValEnd)); -cl::opt<bool> DisableCFI("disable-cfi", cl::Hidden, - cl::desc("Do not use .cfi_* directives")); - -cl::opt<bool> EnableDwarfDirectory("enable-dwarf-directory", cl::Hidden, - cl::desc("Use .file directives with an explicit directory.")); - cl::opt<bool> DisableRedZone("disable-red-zone", cl::desc("Do not emit code that uses the red zone."), @@ -190,11 +180,6 @@ EnablePIE("enable-pie", cl::init(false)); cl::opt<bool> -SegmentedStacks("segmented-stacks", - cl::desc("Use segmented stacks if possible."), - cl::init(false)); - -cl::opt<bool> UseInitArray("use-init-array", cl::desc("Use .init_array instead of .ctors."), cl::init(false)); @@ -208,6 +193,15 @@ cl::opt<std::string> StartAfter("start-after", cl::value_desc("pass-name"), cl::init("")); +cl::opt<bool> DataSections("data-sections", + cl::desc("Emit data into separate sections"), + cl::init(false)); + +cl::opt<bool> +FunctionSections("function-sections", + cl::desc("Emit functions into separate sections"), + cl::init(false)); + // Common utility function tightly tied to the options listed here. Initializes // a TargetOptions object with CodeGen flags and returns it. static inline TargetOptions InitTargetOptionsFromCodeGenFlags() { @@ -229,8 +223,12 @@ static inline TargetOptions InitTargetOptionsFromCodeGenFlags() { Options.StackAlignmentOverride = OverrideStackAlignment; Options.TrapFuncName = TrapFuncName; Options.PositionIndependentExecutable = EnablePIE; - Options.EnableSegmentedStacks = SegmentedStacks; Options.UseInitArray = UseInitArray; + Options.DataSections = DataSections; + Options.FunctionSections = FunctionSections; + + Options.MCOptions = InitMCTargetOptionsFromFlags(); + return Options; } diff --git a/include/llvm/CodeGen/EdgeBundles.h b/include/llvm/CodeGen/EdgeBundles.h index 2899fe1..c31fad2 100644 --- a/include/llvm/CodeGen/EdgeBundles.h +++ b/include/llvm/CodeGen/EdgeBundles.h @@ -59,11 +59,6 @@ private: void getAnalysisUsage(AnalysisUsage&) const override; }; -/// Specialize WriteGraph, the standard implementation won't work. -raw_ostream &WriteGraph(raw_ostream &O, const EdgeBundles &G, - bool ShortNames = false, - const Twine &Title = ""); - } // end namespace llvm #endif diff --git a/include/llvm/CodeGen/FastISel.h b/include/llvm/CodeGen/FastISel.h index aeffbd4..bfeede2 100644 --- a/include/llvm/CodeGen/FastISel.h +++ b/include/llvm/CodeGen/FastISel.h @@ -343,6 +343,12 @@ protected: unsigned createResultReg(const TargetRegisterClass *RC); + /// Try to constrain Op so that it is usable by argument OpNum of the provided + /// MCInstrDesc. If this fails, create a new virtual register in the correct + /// class and COPY the value there. + unsigned constrainOperandRegClass(const MCInstrDesc &II, unsigned Op, + unsigned OpNum); + /// Emit a constant in a register using target-specific logic, such as /// constant pool loads. virtual unsigned TargetMaterializeConstant(const Constant* C) { diff --git a/include/llvm/CodeGen/FunctionLoweringInfo.h b/include/llvm/CodeGen/FunctionLoweringInfo.h index 06e7aaa..9636b51 100644 --- a/include/llvm/CodeGen/FunctionLoweringInfo.h +++ b/include/llvm/CodeGen/FunctionLoweringInfo.h @@ -153,11 +153,11 @@ public: /// register is a PHI destination and the PHI's LiveOutInfo is not valid. const LiveOutInfo *GetLiveOutRegInfo(unsigned Reg) { if (!LiveOutRegInfo.inBounds(Reg)) - return NULL; + return nullptr; const LiveOutInfo *LOI = &LiveOutRegInfo[Reg]; if (!LOI->IsValid) - return NULL; + return nullptr; return LOI; } diff --git a/include/llvm/CodeGen/GCMetadata.h b/include/llvm/CodeGen/GCMetadata.h index ea94542..ddcc823 100644 --- a/include/llvm/CodeGen/GCMetadata.h +++ b/include/llvm/CodeGen/GCMetadata.h @@ -38,6 +38,8 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" +#include <memory> + namespace llvm { class AsmPrinter; class GCStrategy; @@ -163,7 +165,7 @@ namespace llvm { /// class GCModuleInfo : public ImmutablePass { typedef StringMap<GCStrategy*> strategy_map_type; - typedef std::vector<GCFunctionInfo*> list_type; + typedef std::vector<std::unique_ptr<GCFunctionInfo>> list_type; typedef DenseMap<const Function*,GCFunctionInfo*> finfo_map_type; strategy_map_type StrategyMap; @@ -178,7 +180,6 @@ namespace llvm { static char ID; GCModuleInfo(); - ~GCModuleInfo(); /// clear - Resets the pass. Any pass, which uses GCModuleInfo, should /// call it in doFinalization().
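The switch to std::unique_ptr ownership above changes what iteration hands back. A short sketch of the new borrowing discipline (the function visitFunctionInfos is hypothetical; Strategy is an assumed GCStrategy instance whose begin()/end() walk the unique_ptr list per the typedef change):

#include "llvm/CodeGen/GCMetadata.h"
#include "llvm/CodeGen/GCStrategy.h"

// Sketch: the strategy owns its GCFunctionInfo entries now; callers only
// borrow references and must not delete them or outlive the owner.
void visitFunctionInfos(llvm::GCStrategy &Strategy) {
  for (auto I = Strategy.begin(), E = Strategy.end(); I != E; ++I) {
    llvm::GCFunctionInfo &FI = **I; // *I is a std::unique_ptr<GCFunctionInfo>
    (void)FI.getFunction();         // query; ownership stays with Strategy
  }
}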
diff --git a/include/llvm/CodeGen/GCStrategy.h b/include/llvm/CodeGen/GCStrategy.h index dfc26d7..81e1f85 100644 --- a/include/llvm/CodeGen/GCStrategy.h +++ b/include/llvm/CodeGen/GCStrategy.h @@ -54,7 +54,7 @@ namespace llvm { /// be abstractly described. class GCStrategy { public: - typedef std::vector<GCFunctionInfo*> list_type; + typedef std::vector<std::unique_ptr<GCFunctionInfo>> list_type; typedef list_type::iterator iterator; private: @@ -77,7 +77,7 @@ namespace llvm { public: GCStrategy(); - virtual ~GCStrategy(); + virtual ~GCStrategy() {} /// getName - The name of the GC strategy, for debugging. diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index 89b0908..49891b2 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -72,6 +72,11 @@ namespace ISD { /// the parent's frame or return address, and so on. FRAMEADDR, RETURNADDR, + /// READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on + /// the DAG, which implements the named register global variables extension. + READ_REGISTER, + WRITE_REGISTER, + /// FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to /// first (possible) on-stack argument. This is needed for correct stack /// adjustment during unwind. diff --git a/include/llvm/CodeGen/JITCodeEmitter.h b/include/llvm/CodeGen/JITCodeEmitter.h index bb0df2e..dc2a027 100644 --- a/include/llvm/CodeGen/JITCodeEmitter.h +++ b/include/llvm/CodeGen/JITCodeEmitter.h @@ -260,7 +260,7 @@ public: // Check for buffer overflow. if (Size >= (uintptr_t)(BufferEnd-CurBufferPtr)) { CurBufferPtr = BufferEnd; - Result = 0; + Result = nullptr; } else { // Allocate the space. Result = CurBufferPtr; @@ -334,7 +334,9 @@ public: /// getLabelLocations - Return the label locations map of the label IDs to /// their address. - virtual DenseMap<MCSymbol*, uintptr_t> *getLabelLocations() { return 0; } + virtual DenseMap<MCSymbol*, uintptr_t> *getLabelLocations() { + return nullptr; + } }; } // End llvm namespace diff --git a/include/llvm/CodeGen/LatencyPriorityQueue.h b/include/llvm/CodeGen/LatencyPriorityQueue.h index d566da8..cf601ae 100644 --- a/include/llvm/CodeGen/LatencyPriorityQueue.h +++ b/include/llvm/CodeGen/LatencyPriorityQueue.h @@ -62,7 +62,7 @@ namespace llvm { } void releaseState() override { - SUnits = 0; + SUnits = nullptr; } unsigned getLatency(unsigned NodeNum) const { diff --git a/include/llvm/CodeGen/LexicalScopes.h b/include/llvm/CodeGen/LexicalScopes.h index e0593f8..31d6872 100644 --- a/include/llvm/CodeGen/LexicalScopes.h +++ b/include/llvm/CodeGen/LexicalScopes.h @@ -21,16 +21,17 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/ValueHandle.h" #include <utility> +#include <unordered_map> namespace llvm { class MachineInstr; class MachineBasicBlock; class MachineFunction; -class LexicalScope; //===----------------------------------------------------------------------===// /// InsnRange - This is used to track range of instructions with identical /// markers. typedef std::pair<const MachineInstr *, const MachineInstr *> InsnRange; //===----------------------------------------------------------------------===// /// LexicalScope - This class is used to track scope information.
+/// +class LexicalScope { + +public: + LexicalScope(LexicalScope *P, const MDNode *D, const MDNode *I, bool A) + : Parent(P), Desc(D), InlinedAtLocation(I), AbstractScope(A), + LastInsn(nullptr), FirstInsn(nullptr), DFSIn(0), DFSOut(0) { + if (Parent) + Parent->addChild(this); + } + + // Accessors. + LexicalScope *getParent() const { return Parent; } + const MDNode *getDesc() const { return Desc; } + const MDNode *getInlinedAt() const { return InlinedAtLocation; } + const MDNode *getScopeNode() const { return Desc; } + bool isAbstractScope() const { return AbstractScope; } + SmallVectorImpl<LexicalScope *> &getChildren() { return Children; } + SmallVectorImpl<InsnRange> &getRanges() { return Ranges; } + + /// addChild - Add a child scope. + void addChild(LexicalScope *S) { Children.push_back(S); } + + /// openInsnRange - This scope covers instruction range starting from MI. + void openInsnRange(const MachineInstr *MI) { + if (!FirstInsn) + FirstInsn = MI; + + if (Parent) + Parent->openInsnRange(MI); + } + + /// extendInsnRange - Extend the current instruction range covered by + /// this scope. + void extendInsnRange(const MachineInstr *MI) { + assert(FirstInsn && "MI Range is not open!"); + LastInsn = MI; + if (Parent) + Parent->extendInsnRange(MI); + } + + /// closeInsnRange - Create a range based on FirstInsn and LastInsn collected + /// until now. This is used when a new scope is encountered while walking + /// machine instructions. + void closeInsnRange(LexicalScope *NewScope = nullptr) { + assert(LastInsn && "Last insn missing!"); + Ranges.push_back(InsnRange(FirstInsn, LastInsn)); + FirstInsn = nullptr; + LastInsn = nullptr; + // If Parent dominates NewScope then do not close Parent's instruction + // range. + if (Parent && (!NewScope || !Parent->dominates(NewScope))) + Parent->closeInsnRange(NewScope); + } + + /// dominates - Return true if current scope dominates given lexical scope. + bool dominates(const LexicalScope *S) const { + if (S == this) + return true; + if (DFSIn < S->getDFSIn() && DFSOut > S->getDFSOut()) + return true; + return false; + } + + // Depth First Search support to walk and manipulate LexicalScope hierarchy. + unsigned getDFSOut() const { return DFSOut; } + void setDFSOut(unsigned O) { DFSOut = O; } + unsigned getDFSIn() const { return DFSIn; } + void setDFSIn(unsigned I) { DFSIn = I; } + + /// dump - print lexical scope. + void dump(unsigned Indent = 0) const; + +private: + LexicalScope *Parent; // Parent to this scope. + AssertingVH<const MDNode> Desc; // Debug info descriptor. + AssertingVH<const MDNode> InlinedAtLocation; // Location at which this + // scope is inlined. + bool AbstractScope; // Abstract Scope + SmallVector<LexicalScope *, 4> Children; // Scopes defined in scope. + // Contents not owned. + SmallVector<InsnRange, 4> Ranges; + + const MachineInstr *LastInsn; // Last instruction of this scope. + const MachineInstr *FirstInsn; // First instruction of this scope. + unsigned DFSIn, DFSOut; // In & Out Depth used to determine + // scope nesting. +}; + +//===----------------------------------------------------------------------===// /// LexicalScopes - This class provides interface to collect and use lexical /// scoping information from machine instruction. /// class LexicalScopes { public: - LexicalScopes() : MF(NULL), CurrentFnLexicalScope(NULL) {} - ~LexicalScopes(); + LexicalScopes() : MF(nullptr), CurrentFnLexicalScope(nullptr) {} /// initialize - Scan machine function and construct lexical scope nest, resets /// the instance if necessary.
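Since the range-tracking members are now defined inline and publicly visible here, a compact sketch of how they compose (the helper coverRange and the single-scope setting are assumed for illustration; LexicalScopes::initialize drives the real equivalent while walking a MachineFunction):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/LexicalScopes.h"

// Sketch: cover a contiguous run of instructions with one scope. Opening
// and extending propagate to parent scopes; closing records the InsnRange.
void coverRange(llvm::LexicalScope &S,
                llvm::ArrayRef<const llvm::MachineInstr *> MIs) {
  if (MIs.empty())
    return;
  S.openInsnRange(MIs.front());
  for (const llvm::MachineInstr *MI : MIs)
    S.extendInsnRange(MI);
  S.closeInsnRange(); // pushes InsnRange(first, last) onto getRanges()
}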
+//===----------------------------------------------------------------------===//
 /// LexicalScopes - This class provides interface to collect and use lexical
 /// scoping information from machine instruction.
 ///
 class LexicalScopes {
 public:
-  LexicalScopes() : MF(NULL), CurrentFnLexicalScope(NULL) {}
-  ~LexicalScopes();
+  LexicalScopes() : MF(nullptr), CurrentFnLexicalScope(nullptr) {}

   /// initialize - Scan machine function and construct lexical scope nest;
   /// resets the instance if necessary.
@@ -55,7 +146,7 @@ public:
   void reset();

   /// empty - Return true if there is no lexical scope information available.
-  bool empty() { return CurrentFnLexicalScope == NULL; }
+  bool empty() { return CurrentFnLexicalScope == nullptr; }

   /// isCurrentFunctionScope - Return true if given lexical scope represents
   /// current function.
@@ -87,20 +178,20 @@ public:
     return AbstractScopesList;
   }

-  /// findAbstractScope - Find an abstract scope or return NULL.
+  /// findAbstractScope - Find an abstract scope or return null.
   LexicalScope *findAbstractScope(const MDNode *N) {
-    return AbstractScopeMap.lookup(N);
+    auto I = AbstractScopeMap.find(N);
+    return I != AbstractScopeMap.end() ? &I->second : nullptr;
   }

   /// findInlinedScope - Find an inlined scope for the given DebugLoc or return
   /// NULL.
-  LexicalScope *findInlinedScope(DebugLoc DL) {
-    return InlinedLexicalScopeMap.lookup(DL);
-  }
+  LexicalScope *findInlinedScope(DebugLoc DL);

-  /// findLexicalScope - Find regular lexical scope or return NULL.
+  /// findLexicalScope - Find regular lexical scope or return null.
   LexicalScope *findLexicalScope(const MDNode *N) {
-    return LexicalScopeMap.lookup(N);
+    auto I = LexicalScopeMap.find(N);
+    return I != LexicalScopeMap.end() ? &I->second : nullptr;
   }

   /// dump - Print data structures to dbgs().
@@ -132,17 +223,19 @@ private:

 private:
   const MachineFunction *MF;

-  /// LexicalScopeMap - Tracks the scopes in the current function. Owns the
-  /// contained LexicalScope*s.
-  DenseMap<const MDNode *, LexicalScope *> LexicalScopeMap;
+  /// LexicalScopeMap - Tracks the scopes in the current function.
+  // Use an unordered_map to ensure value pointer validity over insertion.
+  std::unordered_map<const MDNode *, LexicalScope> LexicalScopeMap;

   /// InlinedLexicalScopeMap - Tracks inlined function scopes in current
   /// function.
-  DenseMap<DebugLoc, LexicalScope *> InlinedLexicalScopeMap;
+  std::unordered_map<std::pair<const MDNode *, const MDNode *>, LexicalScope,
+                     pair_hash<const MDNode *, const MDNode *>>
+      InlinedLexicalScopeMap;

   /// AbstractScopeMap - These scopes are not included in LexicalScopeMap.
-  /// AbstractScopes owns its LexicalScope*s.
-  DenseMap<const MDNode *, LexicalScope *> AbstractScopeMap;
+  // Use an unordered_map to ensure value pointer validity over insertion.
+  std::unordered_map<const MDNode *, LexicalScope> AbstractScopeMap;

   /// AbstractScopesList - Tracks abstract scopes constructed while processing
   /// a function.
@@ -153,97 +246,6 @@ private:
   LexicalScope *CurrentFnLexicalScope;
 };

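The "value pointer validity over insertion" comments are the reason these maps
switched from DenseMap to std::unordered_map while also storing LexicalScope by
value: the standard guarantees that pointers and references to unordered_map
elements survive later insertions (rehashing relinks nodes rather than moving
them), whereas DenseMap relocates its entries when it grows. A self-contained
sketch of the guarantee being relied on:

    #include <unordered_map>

    int main() {
      std::unordered_map<int, int> M;
      int *P = &M[0];               // pointer into the map's storage
      for (int i = 1; i < 1000; ++i)
        M[i] = i;                   // may rehash; *P stays valid
      return *P;                    // well-defined: still the element for key 0
    }
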
-//===----------------------------------------------------------------------===//
-/// LexicalScope - This class is used to track scope information.
-///
-class LexicalScope {
-
-public:
-  LexicalScope(LexicalScope *P, const MDNode *D, const MDNode *I, bool A)
-      : Parent(P), Desc(D), InlinedAtLocation(I), AbstractScope(A), LastInsn(0),
-        FirstInsn(0), DFSIn(0), DFSOut(0) {
-    if (Parent)
-      Parent->addChild(this);
-  }
-
-  // Accessors.
-  LexicalScope *getParent() const { return Parent; }
-  const MDNode *getDesc() const { return Desc; }
-  const MDNode *getInlinedAt() const { return InlinedAtLocation; }
-  const MDNode *getScopeNode() const { return Desc; }
-  bool isAbstractScope() const { return AbstractScope; }
-  SmallVectorImpl<LexicalScope *> &getChildren() { return Children; }
-  SmallVectorImpl<InsnRange> &getRanges() { return Ranges; }
-
-  /// addChild - Add a child scope.
-  void addChild(LexicalScope *S) { Children.push_back(S); }
-
-  /// openInsnRange - This scope covers instruction range starting from MI.
-  void openInsnRange(const MachineInstr *MI) {
-    if (!FirstInsn)
-      FirstInsn = MI;
-
-    if (Parent)
-      Parent->openInsnRange(MI);
-  }
-
-  /// extendInsnRange - Extend the current instruction range covered by
-  /// this scope.
-  void extendInsnRange(const MachineInstr *MI) {
-    assert(FirstInsn && "MI Range is not open!");
-    LastInsn = MI;
-    if (Parent)
-      Parent->extendInsnRange(MI);
-  }
-
-  /// closeInsnRange - Create a range based on FirstInsn and LastInsn collected
-  /// until now. This is used when a new scope is encountered while walking
-  /// machine instructions.
-  void closeInsnRange(LexicalScope *NewScope = NULL) {
-    assert(LastInsn && "Last insn missing!");
-    Ranges.push_back(InsnRange(FirstInsn, LastInsn));
-    FirstInsn = NULL;
-    LastInsn = NULL;
-    // If Parent dominates NewScope then do not close Parent's instruction
-    // range.
-    if (Parent && (!NewScope || !Parent->dominates(NewScope)))
-      Parent->closeInsnRange(NewScope);
-  }
-
-  /// dominates - Return true if current scope dominates given lexical scope.
-  bool dominates(const LexicalScope *S) const {
-    if (S == this)
-      return true;
-    if (DFSIn < S->getDFSIn() && DFSOut > S->getDFSOut())
-      return true;
-    return false;
-  }
-
-  // Depth First Search support to walk and manipulate LexicalScope hierarchy.
-  unsigned getDFSOut() const { return DFSOut; }
-  void setDFSOut(unsigned O) { DFSOut = O; }
-  unsigned getDFSIn() const { return DFSIn; }
-  void setDFSIn(unsigned I) { DFSIn = I; }
-
-  /// dump - print lexical scope.
-  void dump(unsigned Indent = 0) const;
-
-private:
-  LexicalScope *Parent;                        // Parent to this scope.
-  AssertingVH<const MDNode> Desc;              // Debug info descriptor.
-  AssertingVH<const MDNode> InlinedAtLocation; // Location at which this
-                                               // scope is inlined.
-  bool AbstractScope;                          // Abstract scope.
-  SmallVector<LexicalScope *, 4> Children;     // Scopes defined in scope.
-                                               // Contents not owned.
-  SmallVector<InsnRange, 4> Ranges;
-
-  const MachineInstr *LastInsn;  // Last instruction of this scope.
-  const MachineInstr *FirstInsn; // First instruction of this scope.
-  unsigned DFSIn, DFSOut;        // In & Out depth, used to determine
-                                 // scope nesting.
-};
-
 } // end llvm namespace

 #endif

diff --git a/include/llvm/CodeGen/LinkAllCodegenComponents.h b/include/llvm/CodeGen/LinkAllCodegenComponents.h
index 916c0f2..372c294 100644
--- a/include/llvm/CodeGen/LinkAllCodegenComponents.h
+++ b/include/llvm/CodeGen/LinkAllCodegenComponents.h
@@ -40,12 +40,15 @@ namespace {
       llvm::linkErlangGC();
       llvm::linkShadowStackGC();

-      (void) llvm::createBURRListDAGScheduler(NULL, llvm::CodeGenOpt::Default);
-      (void) llvm::createSourceListDAGScheduler(NULL,llvm::CodeGenOpt::Default);
-      (void) llvm::createHybridListDAGScheduler(NULL,llvm::CodeGenOpt::Default);
-      (void) llvm::createFastDAGScheduler(NULL, llvm::CodeGenOpt::Default);
-      (void) llvm::createDefaultScheduler(NULL, llvm::CodeGenOpt::Default);
-      (void) llvm::createVLIWDAGScheduler(NULL, llvm::CodeGenOpt::Default);
+      (void) llvm::createBURRListDAGScheduler(nullptr,
+                                              llvm::CodeGenOpt::Default);
+      (void) llvm::createSourceListDAGScheduler(nullptr,
+                                                llvm::CodeGenOpt::Default);
+      (void) llvm::createHybridListDAGScheduler(nullptr,
+                                                llvm::CodeGenOpt::Default);
+      (void) llvm::createFastDAGScheduler(nullptr, llvm::CodeGenOpt::Default);
+      (void) llvm::createDefaultScheduler(nullptr, llvm::CodeGenOpt::Default);
+      (void) llvm::createVLIWDAGScheduler(nullptr, llvm::CodeGenOpt::Default);
     }
   } ForceCodegenLinking; // Force link by creating a global definition.

diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h
index 41d126a..6629e60 100644
--- a/include/llvm/CodeGen/LiveInterval.h
+++ b/include/llvm/CodeGen/LiveInterval.h
@@ -116,13 +116,13 @@ namespace llvm {
     /// Return the value leaving the instruction, if any.
This can be a /// live-through value, or a live def. A dead def returns NULL. VNInfo *valueOut() const { - return isDeadDef() ? 0 : LateVal; + return isDeadDef() ? nullptr : LateVal; } /// Return the value defined by this instruction, if any. This includes /// dead defs, it is the value created by the instruction's def operands. VNInfo *valueDefined() const { - return EarlyVal == LateVal ? 0 : LateVal; + return EarlyVal == LateVal ? nullptr : LateVal; } /// Return the end point of the last live range segment to interact with @@ -154,7 +154,7 @@ namespace llvm { SlotIndex end; // End point of the interval (exclusive) VNInfo *valno; // identifier for the value contained in this segment. - Segment() : valno(0) {} + Segment() : valno(nullptr) {} Segment(SlotIndex S, SlotIndex E, VNInfo *V) : start(S), end(E), valno(V) { @@ -336,20 +336,20 @@ namespace llvm { /// is none. const Segment *getSegmentContaining(SlotIndex Idx) const { const_iterator I = FindSegmentContaining(Idx); - return I == end() ? 0 : &*I; + return I == end() ? nullptr : &*I; } /// Return the live segment that contains the specified index, or null if /// there is none. Segment *getSegmentContaining(SlotIndex Idx) { iterator I = FindSegmentContaining(Idx); - return I == end() ? 0 : &*I; + return I == end() ? nullptr : &*I; } /// getVNInfoAt - Return the VNInfo that is live at Idx, or NULL. VNInfo *getVNInfoAt(SlotIndex Idx) const { const_iterator I = FindSegmentContaining(Idx); - return I == end() ? 0 : I->valno; + return I == end() ? nullptr : I->valno; } /// getVNInfoBefore - Return the VNInfo that is live up to but not @@ -357,7 +357,7 @@ namespace llvm { /// used by an instruction at this SlotIndex position. VNInfo *getVNInfoBefore(SlotIndex Idx) const { const_iterator I = FindSegmentContaining(Idx.getPrevSlot()); - return I == end() ? 0 : I->valno; + return I == end() ? nullptr : I->valno; } /// Return an iterator to the segment that contains the specified index, or @@ -443,13 +443,13 @@ namespace llvm { const_iterator I = find(Idx.getBaseIndex()); const_iterator E = end(); if (I == E) - return LiveQueryResult(0, 0, SlotIndex(), false); + return LiveQueryResult(nullptr, nullptr, SlotIndex(), false); // Is this an instruction live-in segment? // If Idx is the start index of a basic block, include live-in segments // that start at Idx.getBaseIndex(). - VNInfo *EarlyVal = 0; - VNInfo *LateVal = 0; + VNInfo *EarlyVal = nullptr; + VNInfo *LateVal = nullptr; SlotIndex EndPoint; bool Kill = false; if (I->start <= Idx.getBaseIndex()) { @@ -466,7 +466,7 @@ namespace llvm { // predecessor. // Such a value is not live-in. if (EarlyVal->def == Idx.getBaseIndex()) - EarlyVal = 0; + EarlyVal = nullptr; } // I now points to the segment that may be live-through, or defined by // this instr. Ignore segments starting after the current instr. @@ -597,7 +597,7 @@ namespace llvm { public: /// Create a LiveRangeUpdater for adding segments to LR. /// LR will temporarily be in an invalid state until flush() is called. - LiveRangeUpdater(LiveRange *lr = 0) : LR(lr) {} + LiveRangeUpdater(LiveRange *lr = nullptr) : LR(lr) {} ~LiveRangeUpdater() { flush(); } diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h index 5492593..ddd623c 100644 --- a/include/llvm/CodeGen/LiveIntervalAnalysis.h +++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h @@ -137,7 +137,7 @@ namespace llvm { // Interval removal. 
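A hedged sketch of how a client uses the segment/value queries above (the
helper and its arguments are illustrative, not code from this patch; LIS
stands for a LiveIntervals analysis):

    // Does LI carry a value into MI, and is MI the last use of it?
    bool readsLastUse(const LiveInterval &LI, const LiveIntervals &LIS,
                      const MachineInstr *MI) {
      SlotIndex Idx = LIS.getInstructionIndex(MI);
      LiveQueryResult LRQ = LI.Query(Idx);
      return LRQ.valueIn() && LRQ.isKill();
    }
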
void removeInterval(unsigned Reg) { delete VirtRegIntervals[Reg]; - VirtRegIntervals[Reg] = 0; + VirtRegIntervals[Reg] = nullptr; } /// Given a register and an instruction, adds a live segment from that @@ -153,7 +153,7 @@ namespace llvm { /// Return true if the interval may have been separated into multiple /// connected components. bool shrinkToUses(LiveInterval *li, - SmallVectorImpl *dead = 0); + SmallVectorImpl *dead = nullptr); /// extendToIndices - Extend the live range of LI to reach all points in /// Indices. The points in the Indices array must be jointly dominated by @@ -262,7 +262,7 @@ namespace llvm { bool runOnMachineFunction(MachineFunction&) override; /// print - Implement the dump method. - void print(raw_ostream &O, const Module* = 0) const override; + void print(raw_ostream &O, const Module* = nullptr) const override; /// intervalIsInOneMBB - If LI is confined to a single basic block, return /// a pointer to that block. If LI is live in to or out of any block, diff --git a/include/llvm/CodeGen/LiveIntervalUnion.h b/include/llvm/CodeGen/LiveIntervalUnion.h index 95933d1..2f40509 100644 --- a/include/llvm/CodeGen/LiveIntervalUnion.h +++ b/include/llvm/CodeGen/LiveIntervalUnion.h @@ -122,8 +122,8 @@ public: {} void clear() { - LiveUnion = NULL; - VirtReg = NULL; + LiveUnion = nullptr; + VirtReg = nullptr; InterferingVRegs.clear(); CheckedFirstInterference = false; SeenAllInterferences = false; @@ -182,7 +182,7 @@ public: unsigned Size; LiveIntervalUnion *LIUs; public: - Array() : Size(0), LIUs(0) {} + Array() : Size(0), LIUs(nullptr) {} ~Array() { clear(); } // Initialize the array to have Size entries. diff --git a/include/llvm/CodeGen/LivePhysRegs.h b/include/llvm/CodeGen/LivePhysRegs.h index c93eaf5..847092b 100644 --- a/include/llvm/CodeGen/LivePhysRegs.h +++ b/include/llvm/CodeGen/LivePhysRegs.h @@ -48,7 +48,7 @@ class LivePhysRegs { LivePhysRegs &operator=(const LivePhysRegs&) LLVM_DELETED_FUNCTION; public: /// \brief Constructs a new empty LivePhysRegs set. - LivePhysRegs() : TRI(0), LiveRegs() {} + LivePhysRegs() : TRI(nullptr), LiveRegs() {} /// \brief Constructs and initialize an empty LivePhysRegs set. LivePhysRegs(const TargetRegisterInfo *TRI) : TRI(TRI) { diff --git a/include/llvm/CodeGen/LiveRangeEdit.h b/include/llvm/CodeGen/LiveRangeEdit.h index 4ce39e3..5767cab 100644 --- a/include/llvm/CodeGen/LiveRangeEdit.h +++ b/include/llvm/CodeGen/LiveRangeEdit.h @@ -116,7 +116,7 @@ public: MachineFunction &MF, LiveIntervals &lis, VirtRegMap *vrm, - Delegate *delegate = 0) + Delegate *delegate = nullptr) : Parent(parent), NewRegs(newRegs), MRI(MF.getRegInfo()), LIS(lis), VRM(vrm), TII(*MF.getTarget().getInstrInfo()), @@ -174,7 +174,7 @@ public: struct Remat { VNInfo *ParentVNI; // parent_'s value at the remat location. MachineInstr *OrigMI; // Instruction defining ParentVNI. 
- explicit Remat(VNInfo *ParentVNI) : ParentVNI(ParentVNI), OrigMI(0) {} + explicit Remat(VNInfo *ParentVNI) : ParentVNI(ParentVNI), OrigMI(nullptr) {} }; /// canRematerializeAt - Determine if ParentVNI can be rematerialized at diff --git a/include/llvm/CodeGen/LiveRegMatrix.h b/include/llvm/CodeGen/LiveRegMatrix.h index 28b819b..878b4d9 100644 --- a/include/llvm/CodeGen/LiveRegMatrix.h +++ b/include/llvm/CodeGen/LiveRegMatrix.h @@ -25,7 +25,6 @@ #define LLVM_CODEGEN_LIVEREGMATRIX_H #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/CodeGen/LiveIntervalUnion.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -51,7 +50,7 @@ class LiveRegMatrix : public MachineFunctionPass { LiveIntervalUnion::Array Matrix; // Cached queries per register unit. - OwningArrayPtr Queries; + std::unique_ptr Queries; // Cached register mask interference info. unsigned RegMaskTag; diff --git a/include/llvm/CodeGen/LiveStackAnalysis.h b/include/llvm/CodeGen/LiveStackAnalysis.h index ac32a9c..df68398 100644 --- a/include/llvm/CodeGen/LiveStackAnalysis.h +++ b/include/llvm/CodeGen/LiveStackAnalysis.h @@ -92,7 +92,7 @@ namespace llvm { bool runOnMachineFunction(MachineFunction&) override; /// print - Implement the dump method. - void print(raw_ostream &O, const Module* = 0) const override; + void print(raw_ostream &O, const Module* = nullptr) const override; }; } diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index 5e86e75..90bdeee4 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -160,7 +160,7 @@ public: template bundle_iterator(const bundle_iterator &I) : MII(I.getInstrIterator()) {} - bundle_iterator() : MII(0) {} + bundle_iterator() : MII(nullptr) {} Ty &operator*() const { return *MII; } Ty *operator->() const { return &operator*(); } @@ -219,10 +219,15 @@ public: unsigned size() const { return (unsigned)Insts.size(); } bool empty() const { return Insts.empty(); } - MachineInstr& front() { return Insts.front(); } - MachineInstr& back() { return Insts.back(); } - const MachineInstr& front() const { return Insts.front(); } - const MachineInstr& back() const { return Insts.back(); } + MachineInstr &instr_front() { return Insts.front(); } + MachineInstr &instr_back() { return Insts.back(); } + const MachineInstr &instr_front() const { return Insts.front(); } + const MachineInstr &instr_back() const { return Insts.back(); } + + MachineInstr &front() { return Insts.front(); } + MachineInstr &back() { return *--end(); } + const MachineInstr &front() const { return Insts.front(); } + const MachineInstr &back() const { return *--end(); } instr_iterator instr_begin() { return Insts.begin(); } const_instr_iterator instr_begin() const { return Insts.begin(); } @@ -242,6 +247,12 @@ public: reverse_iterator rend () { return instr_rend(); } const_reverse_iterator rend () const { return instr_rend(); } + inline iterator_range terminators() { + return iterator_range(getFirstTerminator(), end()); + } + inline iterator_range terminators() const { + return iterator_range(getFirstTerminator(), end()); + } // Machine-CFG iterators typedef std::vector::iterator pred_iterator; @@ -256,7 +267,6 @@ public: succ_reverse_iterator; typedef std::vector::const_reverse_iterator const_succ_reverse_iterator; - pred_iterator pred_begin() { return Predecessors.begin(); } const_pred_iterator pred_begin() const { return Predecessors.begin(); } pred_iterator pred_end() { return Predecessors.end(); } @@ -290,6 +300,19 
@@ public: } bool succ_empty() const { return Successors.empty(); } + inline iterator_range predecessors() { + return iterator_range(pred_begin(), pred_end()); + } + inline iterator_range predecessors() const { + return iterator_range(pred_begin(), pred_end()); + } + inline iterator_range successors() { + return iterator_range(succ_begin(), succ_end()); + } + inline iterator_range successors() const { + return iterator_range(succ_begin(), succ_end()); + } + // LiveIn management methods. /// addLiveIn - Add the specified register as a live in. Note that it @@ -609,7 +632,7 @@ public: // Debugging methods. void dump() const; - void print(raw_ostream &OS, SlotIndexes* = 0) const; + void print(raw_ostream &OS, SlotIndexes* = nullptr) const; // Printing method used by LoopInfo. void printAsOperand(raw_ostream &OS, bool PrintType = true); diff --git a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h index f3ef87c..1aef689 100644 --- a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h +++ b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h @@ -1,4 +1,4 @@ -//====-- MachineBlockFrequencyInfo.h - MBB Frequency Analysis -*- C++ -*--====// +//===- MachineBlockFrequencyInfo.h - MBB Frequency Analysis -*- C++ -*-----===// // // The LLVM Compiler Infrastructure // @@ -22,14 +22,12 @@ namespace llvm { class MachineBasicBlock; class MachineBranchProbabilityInfo; -template -class BlockFrequencyImpl; +template class BlockFrequencyInfoImpl; -/// MachineBlockFrequencyInfo pass uses BlockFrequencyImpl implementation to estimate -/// machine basic block frequencies. +/// MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation +/// to estimate machine basic block frequencies. class MachineBlockFrequencyInfo : public MachineFunctionPass { - typedef BlockFrequencyImpl ImplType; + typedef BlockFrequencyInfoImpl ImplType; std::unique_ptr MBFI; public: diff --git a/include/llvm/CodeGen/MachineCodeEmitter.h b/include/llvm/CodeGen/MachineCodeEmitter.h index f729ced..81b0ba1 100644 --- a/include/llvm/CodeGen/MachineCodeEmitter.h +++ b/include/llvm/CodeGen/MachineCodeEmitter.h @@ -262,7 +262,7 @@ public: // Check for buffer overflow. if (Size >= (uintptr_t)(BufferEnd-CurBufferPtr)) { CurBufferPtr = BufferEnd; - Result = 0; + Result = nullptr; } else { // Allocate the space. Result = CurBufferPtr; diff --git a/include/llvm/CodeGen/MachineCodeInfo.h b/include/llvm/CodeGen/MachineCodeInfo.h index ba9dfab..820bc87 100644 --- a/include/llvm/CodeGen/MachineCodeInfo.h +++ b/include/llvm/CodeGen/MachineCodeInfo.h @@ -27,7 +27,7 @@ private: void *Address; // The address of the function in memory public: - MachineCodeInfo() : Size(0), Address(0) {} + MachineCodeInfo() : Size(0), Address(nullptr) {} void setSize(size_t s) { Size = s; diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h index 1dedd74..bd0ea11 100644 --- a/include/llvm/CodeGen/MachineFrameInfo.h +++ b/include/llvm/CodeGen/MachineFrameInfo.h @@ -519,7 +519,7 @@ public: /// a nonnegative identifier to represent it. 
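The terminators(), predecessors(), and successors() helpers added to
MachineBasicBlock above wrap the existing begin()/end() iterator pairs in
iterator_range, enabling range-based for loops. A hypothetical use (not from
the patch):

    // Count successors that are laid out immediately after MBB.
    unsigned countLayoutSuccs(MachineBasicBlock &MBB) {
      unsigned N = 0;
      for (MachineBasicBlock *Succ : MBB.successors())
        if (Succ->getNumber() == MBB.getNumber() + 1)
          ++N;
      return N;
    }
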
/// int CreateStackObject(uint64_t Size, unsigned Alignment, bool isSS, - const AllocaInst *Alloca = 0); + const AllocaInst *Alloca = nullptr); /// CreateSpillStackObject - Create a new statically sized stack object that /// represents a spill slot, returning a nonnegative identifier to represent diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h index 652d63d..f4c2542 100644 --- a/include/llvm/CodeGen/MachineFunction.h +++ b/include/llvm/CodeGen/MachineFunction.h @@ -259,6 +259,9 @@ public: return MBBNumbering[N]; } + /// Should we be emitting segmented stack stuff for the function + bool shouldSplitStack(); + /// getNumBlockIDs - Return the number of MBB ID's allocated. /// unsigned getNumBlockIDs() const { return (unsigned)MBBNumbering.size(); } @@ -268,12 +271,12 @@ public: /// dense, and match the ordering of the blocks within the function. If a /// specific MachineBasicBlock is specified, only that block and those after /// it are renumbered. - void RenumberBlocks(MachineBasicBlock *MBBFrom = 0); + void RenumberBlocks(MachineBasicBlock *MBBFrom = nullptr); /// print - Print out the MachineFunction in a format suitable for debugging /// to the specified stream. /// - void print(raw_ostream &OS, SlotIndexes* = 0) const; + void print(raw_ostream &OS, SlotIndexes* = nullptr) const; /// viewCFG - This function is meant for use from the debugger. You can just /// say 'call F->viewCFG()' and a ghostview window should pop up from the @@ -296,7 +299,7 @@ public: /// verify - Run the current MachineFunction through the machine code /// verifier, useful for debugger use. - void verify(Pass *p = NULL, const char *Banner = NULL) const; + void verify(Pass *p = nullptr, const char *Banner = nullptr) const; // Provide accessors for the MachineBasicBlock list... typedef BasicBlockListType::iterator iterator; @@ -364,7 +367,7 @@ public: /// implementation. void removeFromMBBNumbering(unsigned N) { assert(N < MBBNumbering.size() && "Illegal basic block #"); - MBBNumbering[N] = 0; + MBBNumbering[N] = nullptr; } /// CreateMachineInstr - Allocate a new MachineInstr. Use this instead @@ -389,7 +392,7 @@ public: /// CreateMachineBasicBlock - Allocate a new MachineBasicBlock. Use this /// instead of `new MachineBasicBlock'. /// - MachineBasicBlock *CreateMachineBasicBlock(const BasicBlock *bb = 0); + MachineBasicBlock *CreateMachineBasicBlock(const BasicBlock *bb = nullptr); /// DeleteMachineBasicBlock - Delete the given MachineBasicBlock. /// @@ -401,8 +404,8 @@ public: MachineMemOperand *getMachineMemOperand(MachinePointerInfo PtrInfo, unsigned f, uint64_t s, unsigned base_alignment, - const MDNode *TBAAInfo = 0, - const MDNode *Ranges = 0); + const MDNode *TBAAInfo = nullptr, + const MDNode *Ranges = nullptr); /// getMachineMemOperand - Allocate a new MachineMemOperand by copying /// an existing one, adjusting by an offset and using the given size. 
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h index f5dc75e..b0d3e02 100644 --- a/include/llvm/CodeGen/MachineInstr.h +++ b/include/llvm/CodeGen/MachineInstr.h @@ -24,6 +24,7 @@ #include "llvm/ADT/ilist_node.h" #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/InlineAsm.h" #include "llvm/MC/MCInstrDesc.h" @@ -243,6 +244,14 @@ public: /// DebugLoc getDebugLoc() const { return debugLoc; } + /// getDebugVariable() - Return the debug variable referenced by + /// this DBG_VALUE instruction. + DIVariable getDebugVariable() const { + assert(isDebugValue() && "not a DBG_VALUE"); + const MDNode *Var = getOperand(getNumOperands() - 1).getMetadata(); + return DIVariable(Var); + } + /// emitError - Emit an error referring to the source location of this /// instruction. This should only be used for inline assembly that is somehow /// impossible to compile. Other errors should have been handled much @@ -287,22 +296,54 @@ public: const_mop_iterator operands_begin() const { return Operands; } const_mop_iterator operands_end() const { return Operands + NumOperands; } - inline iterator_range operands() { + iterator_range operands() { return iterator_range(operands_begin(), operands_end()); } - inline iterator_range operands() const { + iterator_range operands() const { return iterator_range(operands_begin(), operands_end()); } + iterator_range explicit_operands() { + return iterator_range( + operands_begin(), operands_begin() + getNumExplicitOperands()); + } + iterator_range explicit_operands() const { + return iterator_range( + operands_begin(), operands_begin() + getNumExplicitOperands()); + } + iterator_range implicit_operands() { + return iterator_range(explicit_operands().end(), + operands_end()); + } + iterator_range implicit_operands() const { + return iterator_range(explicit_operands().end(), + operands_end()); + } + iterator_range defs() { + return iterator_range( + operands_begin(), operands_begin() + getDesc().getNumDefs()); + } + iterator_range defs() const { + return iterator_range( + operands_begin(), operands_begin() + getDesc().getNumDefs()); + } + iterator_range uses() { + return iterator_range( + operands_begin() + getDesc().getNumDefs(), operands_end()); + } + iterator_range uses() const { + return iterator_range( + operands_begin() + getDesc().getNumDefs(), operands_end()); + } /// Access to memory operands of the instruction mmo_iterator memoperands_begin() const { return MemRefs; } mmo_iterator memoperands_end() const { return MemRefs + NumMemRefs; } bool memoperands_empty() const { return NumMemRefs == 0; } - inline iterator_range memoperands() { + iterator_range memoperands() { return iterator_range(memoperands_begin(), memoperands_end()); } - inline iterator_range memoperands() const { + iterator_range memoperands() const { return iterator_range(memoperands_begin(), memoperands_end()); } @@ -735,7 +776,8 @@ public: /// is a read of a super-register. /// This does not count partial redefines of virtual registers as reads: /// %reg1024:6 = OP. - bool readsRegister(unsigned Reg, const TargetRegisterInfo *TRI = NULL) const { + bool readsRegister(unsigned Reg, + const TargetRegisterInfo *TRI = nullptr) const { return findRegisterUseOperandIdx(Reg, false, TRI) != -1; } @@ -751,12 +793,13 @@ public: /// partial defines. /// If Ops is not null, all operand indices for Reg are added. 
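The explicit_operands(), implicit_operands(), defs(), and uses() ranges added
to MachineInstr above slice the operand array at boundaries that are static
per opcode: defs() covers the first MCInstrDesc::getNumDefs() operands and
uses() everything after them. A hypothetical helper built on these ranges
(not from the patch):

    // Count virtual-register reads of MI via the uses() range.
    unsigned countVRegUses(const MachineInstr &MI) {
      unsigned N = 0;
      for (const MachineOperand &MO : MI.uses())
        if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
          ++N;
      return N;
    }
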
std::pair readsWritesVirtualRegister(unsigned Reg, - SmallVectorImpl *Ops = 0) const; + SmallVectorImpl *Ops = nullptr) const; /// killsRegister - Return true if the MachineInstr kills the specified /// register. If TargetRegisterInfo is passed, then it also checks if there is /// a kill of a super-register. - bool killsRegister(unsigned Reg, const TargetRegisterInfo *TRI = NULL) const { + bool killsRegister(unsigned Reg, + const TargetRegisterInfo *TRI = nullptr) const { return findRegisterUseOperandIdx(Reg, true, TRI) != -1; } @@ -764,7 +807,8 @@ public: /// specified register. If TargetRegisterInfo is passed, then it also checks /// if there is a def of a super-register. /// NOTE: It's ignoring subreg indices on virtual registers. - bool definesRegister(unsigned Reg, const TargetRegisterInfo *TRI=NULL) const { + bool definesRegister(unsigned Reg, + const TargetRegisterInfo *TRI = nullptr) const { return findRegisterDefOperandIdx(Reg, false, false, TRI) != -1; } @@ -779,7 +823,7 @@ public: /// instruction. If TargetRegisterInfo is passed, then it also checks /// if there is a dead def of a super-register. bool registerDefIsDead(unsigned Reg, - const TargetRegisterInfo *TRI = NULL) const { + const TargetRegisterInfo *TRI = nullptr) const { return findRegisterDefOperandIdx(Reg, true, false, TRI) != -1; } @@ -787,14 +831,14 @@ public: /// the specific register or -1 if it is not found. It further tightens /// the search criteria to a use that kills the register if isKill is true. int findRegisterUseOperandIdx(unsigned Reg, bool isKill = false, - const TargetRegisterInfo *TRI = NULL) const; + const TargetRegisterInfo *TRI = nullptr) const; /// findRegisterUseOperand - Wrapper for findRegisterUseOperandIdx, it returns /// a pointer to the MachineOperand rather than an index. MachineOperand *findRegisterUseOperand(unsigned Reg, bool isKill = false, - const TargetRegisterInfo *TRI = NULL) { + const TargetRegisterInfo *TRI = nullptr) { int Idx = findRegisterUseOperandIdx(Reg, isKill, TRI); - return (Idx == -1) ? NULL : &getOperand(Idx); + return (Idx == -1) ? nullptr : &getOperand(Idx); } /// findRegisterDefOperandIdx() - Returns the operand index that is a def of @@ -805,14 +849,14 @@ public: /// This may also return a register mask operand when Overlap is true. int findRegisterDefOperandIdx(unsigned Reg, bool isDead = false, bool Overlap = false, - const TargetRegisterInfo *TRI = NULL) const; + const TargetRegisterInfo *TRI = nullptr) const; /// findRegisterDefOperand - Wrapper for findRegisterDefOperandIdx, it returns /// a pointer to the MachineOperand rather than an index. MachineOperand *findRegisterDefOperand(unsigned Reg, bool isDead = false, - const TargetRegisterInfo *TRI = NULL) { + const TargetRegisterInfo *TRI = nullptr) { int Idx = findRegisterDefOperandIdx(Reg, isDead, false, TRI); - return (Idx == -1) ? NULL : &getOperand(Idx); + return (Idx == -1) ? nullptr : &getOperand(Idx); } /// findFirstPredOperandIdx() - Find the index of the first operand in the @@ -830,7 +874,7 @@ public: /// The flag operand is an immediate that can be decoded with methods like /// InlineAsm::hasRegClassConstraint(). /// - int findInlineAsmFlagIdx(unsigned OpIdx, unsigned *GroupNo = 0) const; + int findInlineAsmFlagIdx(unsigned OpIdx, unsigned *GroupNo = nullptr) const; /// getRegClassConstraint - Compute the static register class constraint for /// operand OpIdx. 
For normal instructions, this is derived from the @@ -892,7 +936,8 @@ public: /// check if the register def is tied to a source operand, due to either /// two-address elimination or inline assembly constraints. Returns the /// first tied use operand index by reference if UseOpIdx is not null. - bool isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx = 0) const { + bool isRegTiedToUseOperand(unsigned DefOpIdx, + unsigned *UseOpIdx = nullptr) const { const MachineOperand &MO = getOperand(DefOpIdx); if (!MO.isReg() || !MO.isDef() || !MO.isTied()) return false; @@ -904,7 +949,8 @@ public: /// isRegTiedToDefOperand - Return true if the use operand of the specified /// index is tied to an def operand. It also returns the def operand index by /// reference if DefOpIdx is not null. - bool isRegTiedToDefOperand(unsigned UseOpIdx, unsigned *DefOpIdx = 0) const { + bool isRegTiedToDefOperand(unsigned UseOpIdx, + unsigned *DefOpIdx = nullptr) const { const MachineOperand &MO = getOperand(UseOpIdx); if (!MO.isReg() || !MO.isUse() || !MO.isTied()) return false; @@ -943,7 +989,8 @@ public: /// addRegisterDefined - We have determined MI defines a register. Make sure /// there is an operand defining Reg. - void addRegisterDefined(unsigned Reg, const TargetRegisterInfo *RegInfo = 0); + void addRegisterDefined(unsigned Reg, + const TargetRegisterInfo *RegInfo = nullptr); /// setPhysRegsDeadExcept - Mark every physreg used by this instruction as /// dead except those in the UsedRegs list. @@ -997,7 +1044,7 @@ public: // // Debugging support // - void print(raw_ostream &OS, const TargetMachine *TM = 0, + void print(raw_ostream &OS, const TargetMachine *TM = nullptr, bool SkipOpers = false) const; void dump() const; @@ -1098,7 +1145,7 @@ private: /// useful for CSE, etc. struct MachineInstrExpressionTrait : DenseMapInfo { static inline MachineInstr *getEmptyKey() { - return 0; + return nullptr; } static inline MachineInstr *getTombstoneKey() { diff --git a/include/llvm/CodeGen/MachineInstrBuilder.h b/include/llvm/CodeGen/MachineInstrBuilder.h index d7eb706..21a482c 100644 --- a/include/llvm/CodeGen/MachineInstrBuilder.h +++ b/include/llvm/CodeGen/MachineInstrBuilder.h @@ -46,7 +46,7 @@ class MachineInstrBuilder { MachineFunction *MF; MachineInstr *MI; public: - MachineInstrBuilder() : MF(0), MI(0) {} + MachineInstrBuilder() : MF(nullptr), MI(nullptr) {} /// Create a MachineInstrBuilder for manipulating an existing instruction. /// F must be the machine function that was used to allocate I. diff --git a/include/llvm/CodeGen/MachineInstrBundle.h b/include/llvm/CodeGen/MachineInstrBundle.h index 9519edb..1220224 100644 --- a/include/llvm/CodeGen/MachineInstrBundle.h +++ b/include/llvm/CodeGen/MachineInstrBundle.h @@ -196,7 +196,7 @@ public: /// each operand referring to Reg. /// @returns A filled-in RegInfo struct. VirtRegInfo analyzeVirtReg(unsigned Reg, - SmallVectorImpl > *Ops = 0); + SmallVectorImpl > *Ops = nullptr); /// analyzePhysReg - Analyze how the current instruction or bundle uses a /// physical register. 
This function should not be called after operator++(), diff --git a/include/llvm/CodeGen/MachineMemOperand.h b/include/llvm/CodeGen/MachineMemOperand.h index f01b8eb..2532c16 100644 --- a/include/llvm/CodeGen/MachineMemOperand.h +++ b/include/llvm/CodeGen/MachineMemOperand.h @@ -16,11 +16,13 @@ #ifndef LLVM_CODEGEN_MACHINEMEMOPERAND_H #define LLVM_CODEGEN_MACHINEMEMOPERAND_H +#include "llvm/ADT/PointerUnion.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/IR/Value.h" // PointerLikeTypeTraits #include "llvm/Support/DataTypes.h" namespace llvm { -class Value; class FoldingSetNodeID; class MDNode; class raw_ostream; @@ -33,17 +35,23 @@ struct MachinePointerInfo { /// V - This is the IR pointer value for the access, or it is null if unknown. /// If this is null, then the access is to a pointer in the default address /// space. - const Value *V; + PointerUnion V; /// Offset - This is an offset from the base Value*. int64_t Offset; - explicit MachinePointerInfo(const Value *v = 0, int64_t offset = 0) + explicit MachinePointerInfo(const Value *v = nullptr, int64_t offset = 0) + : V(v), Offset(offset) {} + + explicit MachinePointerInfo(const PseudoSourceValue *v, + int64_t offset = 0) : V(v), Offset(offset) {} MachinePointerInfo getWithOffset(int64_t O) const { - if (V == 0) return MachinePointerInfo(0, 0); - return MachinePointerInfo(V, Offset+O); + if (V.isNull()) return MachinePointerInfo(); + if (V.is()) + return MachinePointerInfo(V.get(), Offset+O); + return MachinePointerInfo(V.get(), Offset+O); } /// getAddrSpace - Return the LLVM IR address space number that this pointer @@ -109,8 +117,8 @@ public: /// MachineMemOperand - Construct an MachineMemOperand object with the /// specified PtrInfo, flags, size, and base alignment. MachineMemOperand(MachinePointerInfo PtrInfo, unsigned flags, uint64_t s, - unsigned base_alignment, const MDNode *TBAAInfo = 0, - const MDNode *Ranges = 0); + unsigned base_alignment, const MDNode *TBAAInfo = nullptr, + const MDNode *Ranges = nullptr); const MachinePointerInfo &getPointerInfo() const { return PtrInfo; } @@ -121,7 +129,13 @@ public: /// other PseudoSourceValue member functions which return objects which stand /// for frame/stack pointer relative references and other special references /// which are not representable in the high-level IR. - const Value *getValue() const { return PtrInfo.V; } + const Value *getValue() const { return PtrInfo.V.dyn_cast(); } + + const PseudoSourceValue *getPseudoValue() const { + return PtrInfo.V.dyn_cast(); + } + + const void *getOpaqueValue() const { return PtrInfo.V.getOpaqueValue(); } /// getFlags - Return the raw flags of the source value, \see MemOperandFlags. unsigned int getFlags() const { return Flags & ((1 << MOMaxBits) - 1); } @@ -177,6 +191,7 @@ public: /// should only be used when an object is being relocated and all references /// to it are being updated. void setValue(const Value *NewSV) { PtrInfo.V = NewSV; } + void setValue(const PseudoSourceValue *NewSV) { PtrInfo.V = NewSV; } void setOffset(int64_t NewOffset) { PtrInfo.Offset = NewOffset; } /// Profile - Gather unique data for the object. 
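With MachinePointerInfo::V now a PointerUnion, a memory operand is rooted
either at an IR Value or at a PseudoSourceValue, never both, and the two
accessors above return null for the side that is not populated. A hedged
sketch of the resulting idiom (not from the patch):

    // Classify what a MachineMemOperand refers to.
    const char *classifyMemOperand(const MachineMemOperand &MMO) {
      if (MMO.getValue())          // IR side of the union
        return "ir-pointer";
      if (MMO.getPseudoValue())    // spill slot, constant pool, ...
        return "pseudo-source-value";
      return "unknown";            // neither side: default address space
    }
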
diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h index 28f4544..6d8d056 100644 --- a/include/llvm/CodeGen/MachineModuleInfo.h +++ b/include/llvm/CodeGen/MachineModuleInfo.h @@ -71,7 +71,7 @@ struct LandingPadInfo { std::vector TypeIds; // List of type ids (filters negative) explicit LandingPadInfo(MachineBasicBlock *MBB) - : LandingPadBlock(MBB), LandingPadLabel(0), Personality(0) {} + : LandingPadBlock(MBB), LandingPadLabel(nullptr), Personality(nullptr) {} }; //===----------------------------------------------------------------------===// @@ -201,7 +201,7 @@ public: /// template Ty &getObjFileInfo() { - if (ObjFileMMI == 0) + if (ObjFileMMI == nullptr) ObjFileMMI = new Ty(*this); return *static_cast(ObjFileMMI); } @@ -334,7 +334,7 @@ public: /// TidyLandingPads - Remap landing pad labels and remove any deleted landing /// pads. - void TidyLandingPads(DenseMap *LPMap = 0); + void TidyLandingPads(DenseMap *LPMap = nullptr); /// getLandingPads - Return a reference to the landing pad info for the /// current function. diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h index 57bdb4c..22969bc8 100644 --- a/include/llvm/CodeGen/MachineOperand.h +++ b/include/llvm/CodeGen/MachineOperand.h @@ -42,7 +42,7 @@ class MCSymbol; /// class MachineOperand { public: - enum MachineOperandType { + enum MachineOperandType : unsigned char { MO_Register, ///< Register operand. MO_Immediate, ///< Immediate operand MO_CImmediate, ///< Immediate >64bit operand @@ -65,7 +65,7 @@ public: private: /// OpKind - Specify what kind of operand this is. This discriminates the /// union. - unsigned char OpKind; // MachineOperandType + MachineOperandType OpKind; /// Subregister number for MO_Register. A value of 0 indicates the /// MO_Register has no subReg. @@ -181,7 +181,7 @@ private: } Contents; explicit MachineOperand(MachineOperandType K) - : OpKind(K), SubReg_TargetFlags(0), ParentMI(0) {} + : OpKind(K), SubReg_TargetFlags(0), ParentMI(nullptr) {} public: /// getType - Returns the MachineOperandType for this operand. /// @@ -215,9 +215,9 @@ public: /// /// Never call clearParent() on an operand in a MachineInstr. /// - void clearParent() { ParentMI = 0; } + void clearParent() { ParentMI = nullptr; } - void print(raw_ostream &os, const TargetMachine *TM = 0) const; + void print(raw_ostream &os, const TargetMachine *TM = nullptr) const; //===--------------------------------------------------------------------===// // Accessors that tell you what kind of MachineOperand you're looking at. @@ -227,7 +227,7 @@ public: bool isReg() const { return OpKind == MO_Register; } /// isImm - Tests if this is a MO_Immediate operand. bool isImm() const { return OpKind == MO_Immediate; } - /// isCImm - Test if t his is a MO_CImmediate operand. + /// isCImm - Test if this is a MO_CImmediate operand. bool isCImm() const { return OpKind == MO_CImmediate; } /// isFPImm - Tests if this is a MO_FPImmediate operand. bool isFPImm() const { return OpKind == MO_FPImmediate; } @@ -593,8 +593,8 @@ public: Op.TiedTo = 0; Op.IsDebug = isDebug; Op.SmallContents.RegNo = Reg; - Op.Contents.Reg.Prev = 0; - Op.Contents.Reg.Next = 0; + Op.Contents.Reg.Prev = nullptr; + Op.Contents.Reg.Next = nullptr; Op.setSubReg(SubReg); return Op; } @@ -711,12 +711,12 @@ private: /// part of a machine instruction. 
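Declaring MachineOperandType with a fixed underlying type, as in the
MachineOperand.h hunk above, lets OpKind be stored as the enum itself instead
of a raw unsigned char without changing the packed layout, so debuggers and
-Wswitch see the real type. A self-contained illustration of the layout point
(names are illustrative):

    enum Kind : unsigned char { A, B, C };  // one byte, like the old field

    struct Packed {
      Kind K;             // typed discriminant, still a single byte
      unsigned char Sub;  // neighbouring byte packs exactly as before
    };
    static_assert(sizeof(Packed) == 2, "typed enum does not grow the layout");
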
bool isOnRegUseList() const { assert(isReg() && "Can only add reg operand to use lists"); - return Contents.Reg.Prev != 0; + return Contents.Reg.Prev != nullptr; } }; inline raw_ostream &operator<<(raw_ostream &OS, const MachineOperand& MO) { - MO.print(OS, 0); + MO.print(OS, nullptr); return OS; } diff --git a/include/llvm/CodeGen/MachinePassRegistry.h b/include/llvm/CodeGen/MachinePassRegistry.h index cd212ab..c962e68 100644 --- a/include/llvm/CodeGen/MachinePassRegistry.h +++ b/include/llvm/CodeGen/MachinePassRegistry.h @@ -59,7 +59,7 @@ private: public: MachinePassRegistryNode(const char *N, const char *D, MachinePassCtor C) - : Next(NULL) + : Next(nullptr) , Name(N) , Description(D) , Ctor(C) @@ -123,7 +123,7 @@ class RegisterPassParser : public MachinePassRegistryListener, public cl::parser { public: RegisterPassParser() {} - ~RegisterPassParser() { RegistryClass::setListener(NULL); } + ~RegisterPassParser() { RegistryClass::setListener(nullptr); } void initialize(cl::Option &O) { cl::parser::initialize(O); diff --git a/include/llvm/CodeGen/MachinePostDominators.h b/include/llvm/CodeGen/MachinePostDominators.h index a6f9f3d..beb2c4f 100644 --- a/include/llvm/CodeGen/MachinePostDominators.h +++ b/include/llvm/CodeGen/MachinePostDominators.h @@ -79,7 +79,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; - void print(llvm::raw_ostream &OS, const Module *M = 0) const override; + void print(llvm::raw_ostream &OS, const Module *M = nullptr) const override; }; } //end of namespace llvm diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h index 2285130..51139f7 100644 --- a/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/include/llvm/CodeGen/MachineRegisterInfo.h @@ -135,7 +135,7 @@ public: // notifications, we will need to change to using a list. 
assert(TheDelegate == delegate && "Only the current delegate can perform reset!"); - TheDelegate = 0; + TheDelegate = nullptr; } void setDelegate(Delegate *delegate) { @@ -223,7 +223,7 @@ public: reg_iterator reg_begin(unsigned RegNo) const { return reg_iterator(getRegUseDefListHead(RegNo)); } - static reg_iterator reg_end() { return reg_iterator(0); } + static reg_iterator reg_end() { return reg_iterator(nullptr); } inline iterator_range reg_operands(unsigned Reg) const { return iterator_range(reg_begin(Reg), reg_end()); @@ -236,7 +236,9 @@ public: reg_instr_iterator reg_instr_begin(unsigned RegNo) const { return reg_instr_iterator(getRegUseDefListHead(RegNo)); } - static reg_instr_iterator reg_instr_end() { return reg_instr_iterator(0); } + static reg_instr_iterator reg_instr_end() { + return reg_instr_iterator(nullptr); + } inline iterator_range reg_instructions(unsigned Reg) const { @@ -251,7 +253,9 @@ public: reg_bundle_iterator reg_bundle_begin(unsigned RegNo) const { return reg_bundle_iterator(getRegUseDefListHead(RegNo)); } - static reg_bundle_iterator reg_bundle_end() { return reg_bundle_iterator(0); } + static reg_bundle_iterator reg_bundle_end() { + return reg_bundle_iterator(nullptr); + } inline iterator_range reg_bundles(unsigned Reg) const { return iterator_range(reg_bundle_begin(Reg), @@ -269,7 +273,9 @@ public: reg_nodbg_iterator reg_nodbg_begin(unsigned RegNo) const { return reg_nodbg_iterator(getRegUseDefListHead(RegNo)); } - static reg_nodbg_iterator reg_nodbg_end() { return reg_nodbg_iterator(0); } + static reg_nodbg_iterator reg_nodbg_end() { + return reg_nodbg_iterator(nullptr); + } inline iterator_range reg_nodbg_operands(unsigned Reg) const { @@ -286,7 +292,7 @@ public: return reg_instr_nodbg_iterator(getRegUseDefListHead(RegNo)); } static reg_instr_nodbg_iterator reg_instr_nodbg_end() { - return reg_instr_nodbg_iterator(0); + return reg_instr_nodbg_iterator(nullptr); } inline iterator_range @@ -304,7 +310,7 @@ public: return reg_bundle_nodbg_iterator(getRegUseDefListHead(RegNo)); } static reg_bundle_nodbg_iterator reg_bundle_nodbg_end() { - return reg_bundle_nodbg_iterator(0); + return reg_bundle_nodbg_iterator(nullptr); } inline iterator_range @@ -325,7 +331,7 @@ public: def_iterator def_begin(unsigned RegNo) const { return def_iterator(getRegUseDefListHead(RegNo)); } - static def_iterator def_end() { return def_iterator(0); } + static def_iterator def_end() { return def_iterator(nullptr); } inline iterator_range def_operands(unsigned Reg) const { return iterator_range(def_begin(Reg), def_end()); @@ -338,7 +344,9 @@ public: def_instr_iterator def_instr_begin(unsigned RegNo) const { return def_instr_iterator(getRegUseDefListHead(RegNo)); } - static def_instr_iterator def_instr_end() { return def_instr_iterator(0); } + static def_instr_iterator def_instr_end() { + return def_instr_iterator(nullptr); + } inline iterator_range def_instructions(unsigned Reg) const { @@ -353,7 +361,9 @@ public: def_bundle_iterator def_bundle_begin(unsigned RegNo) const { return def_bundle_iterator(getRegUseDefListHead(RegNo)); } - static def_bundle_iterator def_bundle_end() { return def_bundle_iterator(0); } + static def_bundle_iterator def_bundle_end() { + return def_bundle_iterator(nullptr); + } inline iterator_range def_bundles(unsigned Reg) const { return iterator_range(def_bundle_begin(Reg), @@ -379,7 +389,7 @@ public: use_iterator use_begin(unsigned RegNo) const { return use_iterator(getRegUseDefListHead(RegNo)); } - static use_iterator use_end() { return use_iterator(0); } + 
static use_iterator use_end() { return use_iterator(nullptr); } inline iterator_range use_operands(unsigned Reg) const { return iterator_range(use_begin(Reg), use_end()); @@ -392,7 +402,9 @@ public: use_instr_iterator use_instr_begin(unsigned RegNo) const { return use_instr_iterator(getRegUseDefListHead(RegNo)); } - static use_instr_iterator use_instr_end() { return use_instr_iterator(0); } + static use_instr_iterator use_instr_end() { + return use_instr_iterator(nullptr); + } inline iterator_range use_instructions(unsigned Reg) const { @@ -407,7 +419,9 @@ public: use_bundle_iterator use_bundle_begin(unsigned RegNo) const { return use_bundle_iterator(getRegUseDefListHead(RegNo)); } - static use_bundle_iterator use_bundle_end() { return use_bundle_iterator(0); } + static use_bundle_iterator use_bundle_end() { + return use_bundle_iterator(nullptr); + } inline iterator_range use_bundles(unsigned Reg) const { return iterator_range(use_bundle_begin(Reg), @@ -434,7 +448,9 @@ public: use_nodbg_iterator use_nodbg_begin(unsigned RegNo) const { return use_nodbg_iterator(getRegUseDefListHead(RegNo)); } - static use_nodbg_iterator use_nodbg_end() { return use_nodbg_iterator(0); } + static use_nodbg_iterator use_nodbg_end() { + return use_nodbg_iterator(nullptr); + } inline iterator_range use_nodbg_operands(unsigned Reg) const { @@ -451,7 +467,7 @@ public: return use_instr_nodbg_iterator(getRegUseDefListHead(RegNo)); } static use_instr_nodbg_iterator use_instr_nodbg_end() { - return use_instr_nodbg_iterator(0); + return use_instr_nodbg_iterator(nullptr); } inline iterator_range @@ -469,7 +485,7 @@ public: return use_bundle_nodbg_iterator(getRegUseDefListHead(RegNo)); } static use_bundle_nodbg_iterator use_bundle_nodbg_end() { - return use_bundle_nodbg_iterator(0); + return use_bundle_nodbg_iterator(nullptr); } inline iterator_range @@ -779,7 +795,7 @@ public: if (!ReturnUses) { if (Op) { if (Op->isUse()) - Op = 0; + Op = nullptr; else assert(!Op->isDebug() && "Can't have debug defs"); } @@ -797,7 +813,7 @@ public: MachineInstr, ptrdiff_t>::pointer pointer; defusechain_iterator(const defusechain_iterator &I) : Op(I.Op) {} - defusechain_iterator() : Op(0) {} + defusechain_iterator() : Op(nullptr) {} bool operator==(const defusechain_iterator &x) const { return Op == x.Op; @@ -807,7 +823,7 @@ public: } /// atEnd - return true if this iterator is equal to reg_end() on the value. - bool atEnd() const { return Op == 0; } + bool atEnd() const { return Op == nullptr; } // Iterator traversal: forward iteration only defusechain_iterator &operator++() { // Preincrement @@ -882,7 +898,7 @@ public: if (!ReturnUses) { if (Op) { if (Op->isUse()) - Op = 0; + Op = nullptr; else assert(!Op->isDebug() && "Can't have debug defs"); } @@ -900,7 +916,7 @@ public: MachineInstr, ptrdiff_t>::pointer pointer; defusechain_instr_iterator(const defusechain_instr_iterator &I) : Op(I.Op){} - defusechain_instr_iterator() : Op(0) {} + defusechain_instr_iterator() : Op(nullptr) {} bool operator==(const defusechain_instr_iterator &x) const { return Op == x.Op; @@ -910,7 +926,7 @@ public: } /// atEnd - return true if this iterator is equal to reg_end() on the value. 
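MachineRegisterInfo gets the same iterator_range treatment here: each
begin/end pair over a register's use/def chains, including the _instr,
_bundle, and _nodbg variants, gains a corresponding range accessor. A
hypothetical helper (not from the patch):

    // Count non-debug uses of Reg with the new range accessor.
    unsigned countNonDebugUses(const MachineRegisterInfo &MRI, unsigned Reg) {
      unsigned N = 0;
      for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
        (void)MO;  // only counting
        ++N;
      }
      return N;
    }
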
- bool atEnd() const { return Op == 0; } + bool atEnd() const { return Op == nullptr; } // Iterator traversal: forward iteration only defusechain_instr_iterator &operator++() { // Preincrement @@ -957,7 +973,7 @@ class PSetIterator { const int *PSet; unsigned Weight; public: - PSetIterator(): PSet(0), Weight(0) {} + PSetIterator(): PSet(nullptr), Weight(0) {} PSetIterator(unsigned RegUnit, const MachineRegisterInfo *MRI) { const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); if (TargetRegisterInfo::isVirtualRegister(RegUnit)) { @@ -970,7 +986,7 @@ public: Weight = TRI->getRegUnitWeight(RegUnit); } if (*PSet == -1) - PSet = 0; + PSet = nullptr; } bool isValid() const { return PSet; } @@ -982,7 +998,7 @@ public: assert(isValid() && "Invalid PSetIterator."); ++PSet; if (*PSet == -1) - PSet = 0; + PSet = nullptr; } }; diff --git a/include/llvm/CodeGen/MachineSSAUpdater.h b/include/llvm/CodeGen/MachineSSAUpdater.h index 8fc367e..486a26e 100644 --- a/include/llvm/CodeGen/MachineSSAUpdater.h +++ b/include/llvm/CodeGen/MachineSSAUpdater.h @@ -57,7 +57,7 @@ public: /// MachineSSAUpdater constructor. If InsertedPHIs is specified, it will be /// filled in with all PHI Nodes created by rewriting. explicit MachineSSAUpdater(MachineFunction &MF, - SmallVectorImpl *InsertedPHIs = 0); + SmallVectorImpl *InsertedPHIs = nullptr); ~MachineSSAUpdater(); /// Initialize - Reset this object to get ready for a new set of SSA diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h index c54300c..acd37e1 100644 --- a/include/llvm/CodeGen/MachineScheduler.h +++ b/include/llvm/CodeGen/MachineScheduler.h @@ -81,6 +81,8 @@ #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include + namespace llvm { extern cl::opt ForceTopDown; @@ -221,14 +223,14 @@ public: class ScheduleDAGMI : public ScheduleDAGInstrs { protected: AliasAnalysis *AA; - MachineSchedStrategy *SchedImpl; + std::unique_ptr SchedImpl; /// Topo - A topological ordering for SUnits which permits fast IsReachable /// and similar queries. ScheduleDAGTopologicalSort Topo; /// Ordered list of DAG postprocessing steps. - std::vector Mutations; + std::vector> Mutations; /// The top of the unscheduled zone. MachineBasicBlock::iterator CurrentTop; @@ -246,17 +248,19 @@ protected: unsigned NumInstrsScheduled; #endif public: - ScheduleDAGMI(MachineSchedContext *C, MachineSchedStrategy *S, bool IsPostRA): - ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, IsPostRA, - /*RemoveKillFlags=*/IsPostRA, C->LIS), - AA(C->AA), SchedImpl(S), Topo(SUnits, &ExitSU), CurrentTop(), - CurrentBottom(), NextClusterPred(NULL), NextClusterSucc(NULL) { + ScheduleDAGMI(MachineSchedContext *C, std::unique_ptr S, + bool IsPostRA) + : ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, IsPostRA, + /*RemoveKillFlags=*/IsPostRA, C->LIS), + AA(C->AA), SchedImpl(std::move(S)), Topo(SUnits, &ExitSU), CurrentTop(), + CurrentBottom(), NextClusterPred(nullptr), NextClusterSucc(nullptr) { #ifndef NDEBUG NumInstrsScheduled = 0; #endif } - virtual ~ScheduleDAGMI(); + // Provide a vtable anchor + ~ScheduleDAGMI() override; /// Return true if this DAG supports VReg liveness and RegPressure. virtual bool hasVRegLiveness() const { return false; } @@ -266,8 +270,8 @@ public: /// building and before MachineSchedStrategy initialization. /// /// ScheduleDAGMI takes ownership of the Mutation object. 
- void addMutation(ScheduleDAGMutation *Mutation) { - Mutations.push_back(Mutation); + void addMutation(std::unique_ptr Mutation) { + Mutations.push_back(std::move(Mutation)); } /// \brief True if an edge can be added from PredSU to SuccSU without creating @@ -375,11 +379,12 @@ protected: RegPressureTracker BotRPTracker; public: - ScheduleDAGMILive(MachineSchedContext *C, MachineSchedStrategy *S): - ScheduleDAGMI(C, S, /*IsPostRA=*/false), RegClassInfo(C->RegClassInfo), - DFSResult(0), ShouldTrackPressure(false), RPTracker(RegPressure), - TopRPTracker(TopPressure), BotRPTracker(BotPressure) - {} + ScheduleDAGMILive(MachineSchedContext *C, + std::unique_ptr S) + : ScheduleDAGMI(C, std::move(S), /*IsPostRA=*/false), + RegClassInfo(C->RegClassInfo), DFSResult(nullptr), + ShouldTrackPressure(false), RPTracker(RegPressure), + TopRPTracker(TopPressure), BotRPTracker(BotPressure) {} virtual ~ScheduleDAGMILive(); @@ -628,9 +633,9 @@ public: /// Pending queues extend the ready queues with the same ID and the /// PendingFlag set. SchedBoundary(unsigned ID, const Twine &Name): - DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"), + DAG(nullptr), SchedModel(nullptr), Rem(nullptr), Available(ID, Name+".A"), Pending(ID << LogMaxQID, Name+".P"), - HazardRec(0) { + HazardRec(nullptr) { reset(); } diff --git a/include/llvm/CodeGen/MachineTraceMetrics.h b/include/llvm/CodeGen/MachineTraceMetrics.h index dc0bc1d..323b694 100644 --- a/include/llvm/CodeGen/MachineTraceMetrics.h +++ b/include/llvm/CodeGen/MachineTraceMetrics.h @@ -154,7 +154,7 @@ public: unsigned InstrHeight; TraceBlockInfo() : - Pred(0), Succ(0), + Pred(nullptr), Succ(nullptr), InstrDepth(~0u), InstrHeight(~0u), HasValidInstrDepths(false), HasValidInstrHeights(false) {} diff --git a/include/llvm/CodeGen/MachineValueType.h b/include/llvm/CodeGen/MachineValueType.h index 84053ca..ad215ec 100644 --- a/include/llvm/CodeGen/MachineValueType.h +++ b/include/llvm/CodeGen/MachineValueType.h @@ -16,6 +16,7 @@ #define LLVM_CODEGEN_MACHINEVALUETYPE_H #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" namespace llvm { diff --git a/include/llvm/CodeGen/PBQP/CostAllocator.h b/include/llvm/CodeGen/PBQP/CostAllocator.h index 1646334..ff62c09 100644 --- a/include/llvm/CodeGen/PBQP/CostAllocator.h +++ b/include/llvm/CodeGen/PBQP/CostAllocator.h @@ -54,7 +54,7 @@ public: entry->incRef(); } PoolRef& operator=(const PoolRef &r) { - assert(entry != 0 && "entry should not be null."); + assert(entry != nullptr && "entry should not be null."); PoolEntry *temp = r.entry; temp->incRef(); entry->decRef(); diff --git a/include/llvm/CodeGen/PBQP/Graph.h b/include/llvm/CodeGen/PBQP/Graph.h index 07c3337..a55f0ea 100644 --- a/include/llvm/CodeGen/PBQP/Graph.h +++ b/include/llvm/CodeGen/PBQP/Graph.h @@ -29,12 +29,12 @@ namespace PBQP { typedef unsigned NodeId; typedef unsigned EdgeId; - /// \brief Returns a value representing an invalid (non-existant) node. + /// \brief Returns a value representing an invalid (non-existent) node. static NodeId invalidNodeId() { return std::numeric_limits::max(); } - /// \brief Returns a value representing an invalid (non-existant) edge. + /// \brief Returns a value representing an invalid (non-existent) edge. static EdgeId invalidEdgeId() { return std::numeric_limits::max(); } @@ -336,7 +336,7 @@ namespace PBQP { /// each node in the graph, and handleAddEdge for each edge, to give the /// solver an opportunity to set up any requried metadata. 
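The ScheduleDAGMI changes earlier in this hunk turn the documented ownership
convention into types: the strategy and each mutation are now held as
std::unique_ptr, so addMutation() takes ownership explicitly. A hedged sketch
of the resulting call-site idiom (MyDAGMutation is an illustrative
ScheduleDAGMutation subclass, not from the patch; llvm::make_unique is the
C++11 helper from STLExtras.h):

    ScheduleDAGInstrs *createMyScheduler(MachineSchedContext *C) {
      auto *DAG =
          new ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C));
      DAG->addMutation(llvm::make_unique<MyDAGMutation>());  // DAG now owns it
      return DAG;
    }
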
void setSolver(SolverT &S) { - assert(Solver == nullptr && "Solver already set. Call unsetSolver()."); + assert(!Solver && "Solver already set. Call unsetSolver()."); Solver = &S; for (auto NId : nodeIds()) Solver->handleAddNode(NId); @@ -346,7 +346,7 @@ namespace PBQP { /// \brief Release from solver instance. void unsetSolver() { - assert(Solver != nullptr && "Solver not set."); + assert(Solver && "Solver not set."); Solver = nullptr; } diff --git a/include/llvm/CodeGen/PBQP/RegAllocSolver.h b/include/llvm/CodeGen/PBQP/RegAllocSolver.h index 79ff6b4..977c348 100644 --- a/include/llvm/CodeGen/PBQP/RegAllocSolver.h +++ b/include/llvm/CodeGen/PBQP/RegAllocSolver.h @@ -86,7 +86,7 @@ namespace PBQP { ConservativelyAllocatable, NotProvablyAllocatable } ReductionState; - NodeMetadata() : RS(Unprocessed), DeniedOpts(0), OptUnsafeEdges(0) {} + NodeMetadata() : RS(Unprocessed), DeniedOpts(0), OptUnsafeEdges(nullptr){} ~NodeMetadata() { delete[] OptUnsafeEdges; } void setup(const Vector& Costs) { @@ -346,7 +346,7 @@ namespace PBQP { typedef Graph Graph; - Solution solve(Graph& G) { + inline Solution solve(Graph& G) { if (G.empty()) return Solution(); RegAllocSolverImpl RegAllocSolver(G); diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h index 5d68f86..35210f1 100644 --- a/include/llvm/CodeGen/Passes.h +++ b/include/llvm/CodeGen/Passes.h @@ -59,7 +59,7 @@ class IdentifyingPassPtr { }; bool IsInstance; public: - IdentifyingPassPtr() : P(0), IsInstance(false) {} + IdentifyingPassPtr() : P(nullptr), IsInstance(false) {} IdentifyingPassPtr(AnalysisID IDPtr) : ID(IDPtr), IsInstance(false) {} IdentifyingPassPtr(Pass *InstancePtr) : P(InstancePtr), IsInstance(true) {} @@ -133,10 +133,6 @@ public: return *static_cast(TM); } - const TargetLowering *getTargetLowering() const { - return TM->getTargetLowering(); - } - // void setInitialized() { Initialized = true; } @@ -151,7 +147,7 @@ public: void setStartStopPasses(AnalysisID Start, AnalysisID Stop) { StartAfter = Start; StopAfter = Stop; - Started = (StartAfter == 0); + Started = (StartAfter == nullptr); } void setDisableVerify(bool Disable) { setOpt(DisableVerify, Disable); } @@ -218,14 +214,14 @@ public: /// Return NULL to select the default (generic) machine scheduler. virtual ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const { - return 0; + return nullptr; } /// Similar to createMachineScheduler but used when postRA machine scheduling /// is enabled. virtual ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const { - return 0; + return nullptr; } protected: @@ -349,6 +345,8 @@ protected: /// List of target independent CodeGen pass IDs. namespace llvm { + FunctionPass *createAtomicExpandLoadLinkedPass(const TargetMachine *TM); + /// \brief Create a basic TargetTransformInfo analysis pass. /// /// This pass implements the target transform info analysis using the target @@ -372,7 +370,10 @@ namespace llvm { /// createCodeGenPreparePass - Transform the code to expose more pattern /// matching during instruction selection. - FunctionPass *createCodeGenPreparePass(const TargetMachine *TM = 0); + FunctionPass *createCodeGenPreparePass(const TargetMachine *TM = nullptr); + + /// AtomicExpandLoadLinkedID -- FIXME + extern char &AtomicExpandLoadLinkedID; /// MachineLoopInfo - This pass is a loop analysis pass. extern char &MachineLoopInfoID; @@ -547,7 +548,7 @@ namespace llvm { /// createMachineVerifierPass - This pass verifies cenerated machine code /// instructions for correctness. 
/// - FunctionPass *createMachineVerifierPass(const char *Banner = 0); + FunctionPass *createMachineVerifierPass(const char *Banner = nullptr); /// createDwarfEHPass - This pass mulches exception handling code into a form /// adapted to code generation. Required if using dwarf exception handling. diff --git a/include/llvm/CodeGen/PseudoSourceValue.h b/include/llvm/CodeGen/PseudoSourceValue.h index 0af8915..cc3e25a 100644 --- a/include/llvm/CodeGen/PseudoSourceValue.h +++ b/include/llvm/CodeGen/PseudoSourceValue.h @@ -18,21 +18,32 @@ namespace llvm { class MachineFrameInfo; + class MachineMemOperand; class raw_ostream; + raw_ostream &operator<<(raw_ostream &OS, const MachineMemOperand &MMO); + /// PseudoSourceValue - Special value supplied for machine level alias /// analysis. It indicates that a memory access references the functions /// stack frame (e.g., a spill slot), below the stack frame (e.g., argument /// space), or constant pool. - class PseudoSourceValue : public Value { + class PseudoSourceValue { private: + friend raw_ostream &llvm::operator<<(raw_ostream &OS, + const MachineMemOperand &MMO); + /// printCustom - Implement printing for PseudoSourceValue. This is called /// from Value::print or Value's operator<<. /// - void printCustom(raw_ostream &O) const override; + virtual void printCustom(raw_ostream &O) const; public: - explicit PseudoSourceValue(enum ValueTy Subclass = PseudoSourceValueVal); + /// isFixed - Whether this is a FixedStackPseudoSourceValue. + bool isFixed; + + explicit PseudoSourceValue(bool isFixed = false); + + virtual ~PseudoSourceValue(); /// isConstant - Test whether the memory pointed to by this /// PseudoSourceValue has a constant value. @@ -47,14 +58,6 @@ namespace llvm { /// PseudoSourceValue can ever alias an LLVM IR Value. virtual bool mayAlias(const MachineFrameInfo *) const; - /// classof - Methods for support type inquiry through isa, cast, and - /// dyn_cast: - /// - static inline bool classof(const Value *V) { - return V->getValueID() == PseudoSourceValueVal || - V->getValueID() == FixedStackPseudoSourceValueVal; - } - /// A pseudo source value referencing a fixed stack frame entry, /// e.g., a spill slot. 
static const PseudoSourceValue *getFixedStack(int FI); @@ -84,13 +87,13 @@ namespace llvm { const int FI; public: explicit FixedStackPseudoSourceValue(int fi) : - PseudoSourceValue(FixedStackPseudoSourceValueVal), FI(fi) {} + PseudoSourceValue(true), FI(fi) {} /// classof - Methods for support type inquiry through isa, cast, and /// dyn_cast: /// - static inline bool classof(const Value *V) { - return V->getValueID() == FixedStackPseudoSourceValueVal; + static inline bool classof(const PseudoSourceValue *V) { + return V->isFixed == true; } bool isConstant(const MachineFrameInfo *MFI) const override; diff --git a/include/llvm/CodeGen/RegAllocPBQP.h b/include/llvm/CodeGen/RegAllocPBQP.h index efd7c61..6343bb7 100644 --- a/include/llvm/CodeGen/RegAllocPBQP.h +++ b/include/llvm/CodeGen/RegAllocPBQP.h @@ -159,7 +159,7 @@ namespace llvm { FunctionPass * createPBQPRegisterAllocator(std::unique_ptr<PBQPBuilder> &builder, - char *customPassID = 0); + char *customPassID = nullptr); } #endif /* LLVM_CODEGEN_REGALLOCPBQP_H */ diff --git a/include/llvm/CodeGen/RegisterClassInfo.h b/include/llvm/CodeGen/RegisterClassInfo.h index 9ec12bd..d784dfb 100644 --- a/include/llvm/CodeGen/RegisterClassInfo.h +++ b/include/llvm/CodeGen/RegisterClassInfo.h @@ -19,7 +19,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/Target/TargetRegisterInfo.h" namespace llvm { @@ -31,7 +30,7 @@ class RegisterClassInfo { bool ProperSubClass; uint8_t MinCost; uint16_t LastCostChange; - OwningArrayPtr<MCPhysReg> Order; + std::unique_ptr<MCPhysReg[]> Order; RCInfo() : Tag(0), NumRegs(0), ProperSubClass(false), MinCost(0), @@ -43,7 +42,7 @@ class RegisterClassInfo { }; // Brief cached information for each register class. - OwningArrayPtr<RCInfo> RegClass; + std::unique_ptr<RCInfo[]> RegClass; // Tag changes whenever cached information needs to be recomputed. An RCInfo // entry is valid when its tag matches. @@ -54,7 +53,7 @@ class RegisterClassInfo { // Callee saved registers of last MF. Assumed to be valid until the next // runOnFunction() call. - const uint16_t *CalleeSaved; + const MCPhysReg *CalleeSaved; // Map register number to CalleeSaved index + 1; SmallVector<uint8_t, 4> CSRNum; @@ -62,7 +61,7 @@ class RegisterClassInfo { // Reserved registers in the current MF. BitVector Reserved; - OwningArrayPtr<unsigned> PSetLimits; + std::unique_ptr<unsigned[]> PSetLimits; // Compute all information about RC.
void compute(const TargetRegisterClass *RC) const; diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h index a801d1d..c11a6ac 100644 --- a/include/llvm/CodeGen/RegisterPressure.h +++ b/include/llvm/CodeGen/RegisterPressure.h @@ -158,7 +158,7 @@ class PressureDiffs { unsigned Size; unsigned Max; public: - PressureDiffs(): PDiffArray(0), Size(0), Max(0) {} + PressureDiffs(): PDiffArray(nullptr), Size(0), Max(0) {} ~PressureDiffs() { free(PDiffArray); } void clear() { Size = 0; } @@ -285,12 +285,12 @@ class RegPressureTracker { public: RegPressureTracker(IntervalPressure &rp) : - MF(0), TRI(0), RCI(0), LIS(0), MBB(0), P(rp), RequireIntervals(true), - TrackUntiedDefs(false) {} + MF(nullptr), TRI(nullptr), RCI(nullptr), LIS(nullptr), MBB(nullptr), P(rp), + RequireIntervals(true), TrackUntiedDefs(false) {} RegPressureTracker(RegionPressure &rp) : - MF(0), TRI(0), RCI(0), LIS(0), MBB(0), P(rp), RequireIntervals(false), - TrackUntiedDefs(false) {} + MF(nullptr), TRI(nullptr), RCI(nullptr), LIS(nullptr), MBB(nullptr), P(rp), + RequireIntervals(false), TrackUntiedDefs(false) {} void reset(); @@ -318,7 +318,8 @@ public: SlotIndex getCurrSlot() const; /// Recede across the previous instruction. - bool recede(SmallVectorImpl<unsigned> *LiveUses = 0, PressureDiff *PDiff = 0); + bool recede(SmallVectorImpl<unsigned> *LiveUses = nullptr, + PressureDiff *PDiff = nullptr); /// Advance across the current instruction. bool advance(); @@ -393,7 +394,7 @@ public: MaxPressureLimit); assert(isBottomClosed() && "Uninitialized pressure tracker"); - return getMaxUpwardPressureDelta(MI, 0, Delta, CriticalPSets, + return getMaxUpwardPressureDelta(MI, nullptr, Delta, CriticalPSets, MaxPressureLimit); } diff --git a/include/llvm/CodeGen/RegisterScavenging.h b/include/llvm/CodeGen/RegisterScavenging.h index 28ebe53..335dd7f 100644 --- a/include/llvm/CodeGen/RegisterScavenging.h +++ b/include/llvm/CodeGen/RegisterScavenging.h @@ -42,7 +42,7 @@ class RegScavenger { /// Information on scavenged registers (held in a spill slot). struct ScavengedInfo { - ScavengedInfo(int FI = -1) : FrameIndex(FI), Reg(0), Restore(NULL) {} + ScavengedInfo(int FI = -1) : FrameIndex(FI), Reg(0), Restore(nullptr) {} /// A spill slot used for scavenging a register post register allocation. int FrameIndex; @@ -73,7 +73,7 @@ class RegScavenger { public: RegScavenger() - : MBB(NULL), NumPhysRegs(0), Tracking(false) {} + : MBB(nullptr), NumPhysRegs(0), Tracking(false) {} /// enterBasicBlock - Start tracking liveness from the begin of the specific /// basic block. @@ -104,7 +104,7 @@ public: /// skipTo - Move the internal MBB iterator but do not update register states. void skipTo(MachineBasicBlock::iterator I) { - if (I == MachineBasicBlock::iterator(NULL)) + if (I == MachineBasicBlock::iterator(nullptr)) Tracking = false; MBBI = I; } diff --git a/include/llvm/CodeGen/ResourcePriorityQueue.h b/include/llvm/CodeGen/ResourcePriorityQueue.h index 7ae9111..114fe7c 100644 --- a/include/llvm/CodeGen/ResourcePriorityQueue.h +++ b/include/llvm/CodeGen/ResourcePriorityQueue.h @@ -92,7 +92,7 @@ namespace llvm { void updateNode(const SUnit *SU) override {} void releaseState() override { - SUnits = 0; + SUnits = nullptr; } unsigned getLatency(unsigned NodeNum) const { diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h index 4886e5c..5a65d59 100644 --- a/include/llvm/CodeGen/ScheduleDAG.h +++ b/include/llvm/CodeGen/ScheduleDAG.h @@ -95,7 +95,7 @@ namespace llvm { /// SDep - Construct a null SDep.
This is only for use by container /// classes which require default constructors. SUnits may not /// have null SDep edges. - SDep() : Dep(0, Data) {} + SDep() : Dep(nullptr, Data) {} /// SDep - Construct an SDep with the specified values. SDep(SUnit *S, Kind kind, unsigned Reg) @@ -317,46 +317,49 @@ namespace llvm { /// SUnit - Construct an SUnit for pre-regalloc scheduling to represent /// an SDNode and any nodes flagged to it. SUnit(SDNode *node, unsigned nodenum) - : Node(node), Instr(0), OrigNode(0), SchedClass(0), NodeNum(nodenum), - NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), - NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0), NumRegDefsLeft(0), - Latency(0), isVRegCycle(false), isCall(false), isCallOp(false), - isTwoAddress(false), isCommutable(false), hasPhysRegUses(false), - hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), - isAvailable(false), isScheduled(false), isScheduleHigh(false), - isScheduleLow(false), isCloned(false), isUnbuffered(false), - hasReservedResource(false), SchedulingPref(Sched::None), - isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), - TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} + : Node(node), Instr(nullptr), OrigNode(nullptr), SchedClass(nullptr), + NodeNum(nodenum), NodeQueueId(0), NumPreds(0), NumSuccs(0), + NumPredsLeft(0), NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0), + NumRegDefsLeft(0), Latency(0), isVRegCycle(false), isCall(false), + isCallOp(false), isTwoAddress(false), isCommutable(false), + hasPhysRegUses(false), hasPhysRegDefs(false), hasPhysRegClobbers(false), + isPending(false), isAvailable(false), isScheduled(false), + isScheduleHigh(false), isScheduleLow(false), isCloned(false), + isUnbuffered(false), hasReservedResource(false), + SchedulingPref(Sched::None), isDepthCurrent(false), + isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), + BotReadyCycle(0), CopyDstRC(nullptr), CopySrcRC(nullptr) {} /// SUnit - Construct an SUnit for post-regalloc scheduling to represent /// a MachineInstr. 
SUnit(MachineInstr *instr, unsigned nodenum) - : Node(0), Instr(instr), OrigNode(0), SchedClass(0), NodeNum(nodenum), - NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), - NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0), NumRegDefsLeft(0), - Latency(0), isVRegCycle(false), isCall(false), isCallOp(false), - isTwoAddress(false), isCommutable(false), hasPhysRegUses(false), - hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), - isAvailable(false), isScheduled(false), isScheduleHigh(false), - isScheduleLow(false), isCloned(false), isUnbuffered(false), - hasReservedResource(false), SchedulingPref(Sched::None), - isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), - TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} + : Node(nullptr), Instr(instr), OrigNode(nullptr), SchedClass(nullptr), + NodeNum(nodenum), NodeQueueId(0), NumPreds(0), NumSuccs(0), + NumPredsLeft(0), NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0), + NumRegDefsLeft(0), Latency(0), isVRegCycle(false), isCall(false), + isCallOp(false), isTwoAddress(false), isCommutable(false), + hasPhysRegUses(false), hasPhysRegDefs(false), hasPhysRegClobbers(false), + isPending(false), isAvailable(false), isScheduled(false), + isScheduleHigh(false), isScheduleLow(false), isCloned(false), + isUnbuffered(false), hasReservedResource(false), + SchedulingPref(Sched::None), isDepthCurrent(false), + isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), + BotReadyCycle(0), CopyDstRC(nullptr), CopySrcRC(nullptr) {} /// SUnit - Construct a placeholder SUnit. SUnit() - : Node(0), Instr(0), OrigNode(0), SchedClass(0), NodeNum(BoundaryID), - NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), - NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0), NumRegDefsLeft(0), - Latency(0), isVRegCycle(false), isCall(false), isCallOp(false), - isTwoAddress(false), isCommutable(false), hasPhysRegUses(false), - hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), - isAvailable(false), isScheduled(false), isScheduleHigh(false), - isScheduleLow(false), isCloned(false), isUnbuffered(false), - hasReservedResource(false), SchedulingPref(Sched::None), - isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), - TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} + : Node(nullptr), Instr(nullptr), OrigNode(nullptr), SchedClass(nullptr), + NodeNum(BoundaryID), NodeQueueId(0), NumPreds(0), NumSuccs(0), + NumPredsLeft(0), NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0), + NumRegDefsLeft(0), Latency(0), isVRegCycle(false), isCall(false), + isCallOp(false), isTwoAddress(false), isCommutable(false), + hasPhysRegUses(false), hasPhysRegDefs(false), hasPhysRegClobbers(false), + isPending(false), isAvailable(false), isScheduled(false), + isScheduleHigh(false), isScheduleLow(false), isCloned(false), + isUnbuffered(false), hasReservedResource(false), + SchedulingPref(Sched::None), isDepthCurrent(false), + isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), + BotReadyCycle(0), CopyDstRC(nullptr), CopySrcRC(nullptr) {} /// \brief Boundary nodes are placeholders for the boundary of the /// scheduling region. 
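Note: the three SUnit constructor hunks above are purely mechanical, replacing every 0 and NULL pointer initializer with nullptr. The benefit is type safety rather than style: 0 and NULL also participate in integer overload resolution, while nullptr can only bind to pointer types. A minimal standalone sketch of the failure mode this avoids (not part of this patch; dispatch is a hypothetical function):

    #include <cstdio>

    void dispatch(int)   { std::puts("int overload"); }
    void dispatch(char*) { std::puts("pointer overload"); }

    int main() {
      dispatch(0);       // 0 is an int first: silently selects dispatch(int)
      // dispatch(NULL); // ambiguous or the int overload, depending on how NULL is defined
      dispatch(nullptr); // unambiguous: selects dispatch(char*)
      return 0;
    }

The same reasoning applies to the assert() and initializer-list changes throughout this patch: nullptr states the intent in the type system instead of relying on the literal 0.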
diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h index 72bbe8b..e6754a2 100644 --- a/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -158,7 +158,7 @@ namespace llvm { const MachineDominatorTree &mdt, bool IsPostRAFlag, bool RemoveKillFlags = false, - LiveIntervals *LIS = 0); + LiveIntervals *LIS = nullptr); virtual ~ScheduleDAGInstrs() {} @@ -206,8 +206,9 @@ namespace llvm { /// buildSchedGraph - Build SUnits from the MachineBasicBlock that we are /// input. - void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker = 0, - PressureDiffs *PDiffs = 0); + void buildSchedGraph(AliasAnalysis *AA, + RegPressureTracker *RPTracker = nullptr, + PressureDiffs *PDiffs = nullptr); /// addSchedBarrierDeps - Add dependencies from instructions in the current /// list of instructions being scheduled to scheduling barrier. We want to @@ -259,10 +260,10 @@ namespace llvm { /// newSUnit - Creates a new SUnit and return a ptr to it. inline SUnit *ScheduleDAGInstrs::newSUnit(MachineInstr *MI) { #ifndef NDEBUG - const SUnit *Addr = SUnits.empty() ? 0 : &SUnits[0]; + const SUnit *Addr = SUnits.empty() ? nullptr : &SUnits[0]; #endif SUnits.push_back(SUnit(MI, (unsigned)SUnits.size())); - assert((Addr == 0 || Addr == &SUnits[0]) && + assert((Addr == nullptr || Addr == &SUnits[0]) && "SUnits std::vector reallocated on the fly!"); SUnits.back().OrigNode = &SUnits.back(); return &SUnits.back(); @@ -272,7 +273,7 @@ namespace llvm { inline SUnit *ScheduleDAGInstrs::getSUnit(MachineInstr *MI) const { DenseMap<MachineInstr*, SUnit*>::const_iterator I = MISUnitMap.find(MI); if (I == MISUnitMap.end()) - return 0; + return nullptr; return I->second; } } // namespace llvm diff --git a/include/llvm/CodeGen/ScoreboardHazardRecognizer.h b/include/llvm/CodeGen/ScoreboardHazardRecognizer.h index fbbbb0c..ab14c2d 100644 --- a/include/llvm/CodeGen/ScoreboardHazardRecognizer.h +++ b/include/llvm/CodeGen/ScoreboardHazardRecognizer.h @@ -47,7 +47,7 @@ class ScoreboardHazardRecognizer : public ScheduleHazardRecognizer { // Indices into the Scoreboard that represent the current cycle. size_t Head; public: - Scoreboard():Data(NULL), Depth(0), Head(0) { } + Scoreboard():Data(nullptr), Depth(0), Head(0) { } ~Scoreboard() { delete[] Data; } @@ -62,7 +62,7 @@ class ScoreboardHazardRecognizer : public ScheduleHazardRecognizer { } void reset(size_t d = 1) { - if (Data == NULL) { + if (!Data) { Depth = d; Data = new unsigned[Depth]; } diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index a30656a..d9c38c0 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -392,7 +392,7 @@ public: SDVTList getVTList(EVT VT1, EVT VT2); SDVTList getVTList(EVT VT1, EVT VT2, EVT VT3); SDVTList getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4); - SDVTList getVTList(const EVT *VTs, unsigned NumVTs); + SDVTList getVTList(ArrayRef<EVT> VTs); //===--------------------------------------------------------------------===// // Node creation methods. @@ -496,7 +496,8 @@ public: SDValue Glue) { SDVTList VTs = getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, getRegister(Reg, N.getValueType()), N, Glue }; - return getNode(ISD::CopyToReg, dl, VTs, Ops, Glue.getNode() ? 4 : 3); + return getNode(ISD::CopyToReg, dl, VTs, + ArrayRef<SDValue>(Ops, Glue.getNode() ?
4 : 3)); } // Similar to last getCopyToReg() except parameter Reg is a SDValue @@ -504,13 +505,14 @@ public: SDValue Glue) { SDVTList VTs = getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Reg, N, Glue }; - return getNode(ISD::CopyToReg, dl, VTs, Ops, Glue.getNode() ? 4 : 3); + return getNode(ISD::CopyToReg, dl, VTs, + ArrayRef<SDValue>(Ops, Glue.getNode() ? 4 : 3)); } SDValue getCopyFromReg(SDValue Chain, SDLoc dl, unsigned Reg, EVT VT) { SDVTList VTs = getVTList(VT, MVT::Other); SDValue Ops[] = { Chain, getRegister(Reg, VT) }; - return getNode(ISD::CopyFromReg, dl, VTs, Ops, 2); + return getNode(ISD::CopyFromReg, dl, VTs, Ops); } // This version of the getCopyFromReg method takes an extra operand, which @@ -520,7 +522,8 @@ public: SDValue Glue) { SDVTList VTs = getVTList(VT, MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, getRegister(Reg, VT), Glue }; - return getNode(ISD::CopyFromReg, dl, VTs, Ops, Glue.getNode() ? 3 : 2); + return getNode(ISD::CopyFromReg, dl, VTs, + ArrayRef<SDValue>(Ops, Glue.getNode() ? 3 : 2)); } SDValue getCondCode(ISD::CondCode Cond); @@ -554,16 +557,24 @@ public: /// value assuming it was the smaller SrcTy value. SDValue getZeroExtendInReg(SDValue Op, SDLoc DL, EVT SrcTy); + /// getBoolExtOrTrunc - Convert Op, which must be of integer type, to the + /// integer type VT, by using an extension appropriate for the target's + /// BooleanContent or truncating it. + SDValue getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT); + /// getNOT - Create a bitwise NOT operation as (XOR Val, -1). SDValue getNOT(SDLoc DL, SDValue Val, EVT VT); + /// \brief Create a logical NOT operation as (XOR Val, BooleanOne). + SDValue getLogicalNOT(SDLoc DL, SDValue Val, EVT VT); + /// getCALLSEQ_START - Return a new CALLSEQ_START node, which always must have /// a glue result (to ensure it's not CSE'd). CALLSEQ_START does not have a /// useful SDLoc. SDValue getCALLSEQ_START(SDValue Chain, SDValue Op, SDLoc DL) { SDVTList VTs = getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Op }; - return getNode(ISD::CALLSEQ_START, DL, VTs, Ops, 2); + return getNode(ISD::CALLSEQ_START, DL, VTs, Ops); } /// getCALLSEQ_END - Return a new CALLSEQ_END node, which always must have a @@ -576,9 +587,9 @@ public: Ops.push_back(Chain); Ops.push_back(Op1); Ops.push_back(Op2); - Ops.push_back(InGlue); - return getNode(ISD::CALLSEQ_END, DL, NodeTys, &Ops[0], - (unsigned)Ops.size() - (InGlue.getNode() == 0 ? 1 : 0)); + if (InGlue.getNode()) + Ops.push_back(InGlue); + return getNode(ISD::CALLSEQ_END, DL, NodeTys, Ops); } /// getUNDEF - Return an UNDEF node. UNDEF does not have a useful SDLoc.
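Note: the recurring SelectionDAG change in the hunks above and below replaces (const SDValue *Ops, unsigned NumOps) parameter pairs with a single ArrayRef<SDValue>. Call sites can then pass a local C array directly (the length is deduced), a vector, or an explicit (pointer, length) slice as in the Glue.getNode() cases. A self-contained sketch of the call-site effect, using a hypothetical ArrayRefLike stand-in for llvm::ArrayRef (the real class lives in llvm/ADT/ArrayRef.h):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Minimal stand-in for llvm::ArrayRef<T>, just enough to show the idiom.
    template <typename T> class ArrayRefLike {
      const T *Data;
      size_t Length;
    public:
      template <size_t N>
      ArrayRefLike(const T (&Arr)[N]) : Data(Arr), Length(N) {}     // C array: size deduced
      ArrayRefLike(const std::vector<T> &V) : Data(V.data()), Length(V.size()) {}
      ArrayRefLike(const T *D, size_t L) : Data(D), Length(L) {}    // explicit slice
      const T *begin() const { return Data; }
      const T *end() const { return Data + Length; }
    };

    int sum(ArrayRefLike<int> Ops) {
      int S = 0;
      for (int V : Ops) S += V;
      return S;
    }

    int main() {
      int Raw[] = {1, 2, 3, 4};
      std::vector<int> Vec = {5, 6};
      std::printf("%d %d %d\n", sum(Raw), sum(Vec), sum({Raw, 3})); // 10 11 6
      return 0;
    }

The slice constructor is why the getCopyToReg/getCopyFromReg bodies can still pass ArrayRef<SDValue>(Ops, Glue.getNode() ? 4 : 3) when the trailing glue operand is optional.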
@@ -604,17 +615,14 @@ public: SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3, SDValue N4, SDValue N5); + SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, ArrayRef<SDUse> Ops); SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, - const SDUse *Ops, unsigned NumOps); - SDValue getNode(unsigned Opcode, SDLoc DL, EVT VT, - const SDValue *Ops, unsigned NumOps); + ArrayRef<SDValue> Ops); SDValue getNode(unsigned Opcode, SDLoc DL, ArrayRef<EVT> ResultTys, - const SDValue *Ops, unsigned NumOps); - SDValue getNode(unsigned Opcode, SDLoc DL, const EVT *VTs, unsigned NumVTs, - const SDValue *Ops, unsigned NumOps); + ArrayRef<SDValue> Ops); SDValue getNode(unsigned Opcode, SDLoc DL, SDVTList VTs, - const SDValue *Ops, unsigned NumOps); + ArrayRef<SDValue> Ops); SDValue getNode(unsigned Opcode, SDLoc DL, SDVTList VTs); SDValue getNode(unsigned Opcode, SDLoc DL, SDVTList VTs, SDValue N); SDValue getNode(unsigned Opcode, SDLoc DL, SDVTList VTs, @@ -705,7 +713,7 @@ public: /// getAtomic - Gets a node for an atomic op, produces result (if relevant) /// and chain and takes 2 operands. SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDValue Chain, - SDValue Ptr, SDValue Val, const Value* PtrVal, + SDValue Ptr, SDValue Val, const Value *PtrVal, unsigned Alignment, AtomicOrdering Ordering, SynchronizationScope SynchScope); SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDValue Chain, @@ -716,11 +724,6 @@ public: /// getAtomic - Gets a node for an atomic op, produces result and chain and /// takes 1 operand. SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, EVT VT, - SDValue Chain, SDValue Ptr, const Value* PtrVal, - unsigned Alignment, - AtomicOrdering Ordering, - SynchronizationScope SynchScope); - SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope); @@ -728,37 +731,30 @@ public: /// getAtomic - Gets a node for an atomic op, produces result and chain and /// takes N operands. SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTList, - SDValue *Ops, unsigned NumOps, MachineMemOperand *MMO, + ArrayRef<SDValue> Ops, MachineMemOperand *MMO, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope); SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTList, - SDValue *Ops, unsigned NumOps, MachineMemOperand *MMO, + ArrayRef<SDValue> Ops, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope); /// getMemIntrinsicNode - Creates a MemIntrinsicNode that may produce a /// result and takes a list of operands. Opcode may be INTRINSIC_VOID, /// INTRINSIC_W_CHAIN, or a target-specific opcode with a value not /// less than FIRST_TARGET_MEMORY_OPCODE. - SDValue getMemIntrinsicNode(unsigned Opcode, SDLoc dl, - const EVT *VTs, unsigned NumVTs, - const SDValue *Ops, unsigned NumOps, - EVT MemVT, MachinePointerInfo PtrInfo, - unsigned Align = 0, bool Vol = false, - bool ReadMem = true, bool WriteMem = true); - SDValue getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, - const SDValue *Ops, unsigned NumOps, + ArrayRef<SDValue> Ops, EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align = 0, bool Vol = false, bool ReadMem = true, bool WriteMem = true); SDValue getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, - const SDValue *Ops, unsigned NumOps, + ArrayRef<SDValue> Ops, EVT MemVT, MachineMemOperand *MMO); /// getMergeValues - Create a MERGE_VALUES node from the given operands.
- SDValue getMergeValues(const SDValue *Ops, unsigned NumOps, SDLoc dl); + SDValue getMergeValues(ArrayRef<SDValue> Ops, SDLoc dl); /// getLoad - Loads are not normal binary operators: their result type is not /// determined by their operands, and they produce a value AND a token chain. @@ -766,14 +762,15 @@ public: SDValue getLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, bool isVolatile, bool isNonTemporal, bool isInvariant, unsigned Alignment, - const MDNode *TBAAInfo = 0, const MDNode *Ranges = 0); + const MDNode *TBAAInfo = nullptr, + const MDNode *Ranges = nullptr); SDValue getLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO); SDValue getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, bool isVolatile, bool isNonTemporal, unsigned Alignment, - const MDNode *TBAAInfo = 0); + const MDNode *TBAAInfo = nullptr); SDValue getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT, SDValue Chain, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO); @@ -784,8 +781,8 @@ public: SDValue Chain, SDValue Ptr, SDValue Offset, MachinePointerInfo PtrInfo, EVT MemVT, bool isVolatile, bool isNonTemporal, bool isInvariant, - unsigned Alignment, const MDNode *TBAAInfo = 0, - const MDNode *Ranges = 0); + unsigned Alignment, const MDNode *TBAAInfo = nullptr, + const MDNode *Ranges = nullptr); SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue Offset, @@ -796,14 +793,14 @@ public: SDValue getStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, bool isVolatile, bool isNonTemporal, unsigned Alignment, - const MDNode *TBAAInfo = 0); + const MDNode *TBAAInfo = nullptr); SDValue getStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachineMemOperand *MMO); SDValue getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT TVT, bool isNonTemporal, bool isVolatile, unsigned Alignment, - const MDNode *TBAAInfo = 0); + const MDNode *TBAAInfo = nullptr); SDValue getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, EVT TVT, MachineMemOperand *MMO); SDValue getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base, @@ -837,8 +834,7 @@ public: SDValue Op3, SDValue Op4); SDNode *UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4, SDValue Op5); - SDNode *UpdateNodeOperands(SDNode *N, - const SDValue *Ops, unsigned NumOps); + SDNode *UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops); /// SelectNodeTo - These are used for target selectors to *mutate* the /// specified node to have the specified return type, Target opcode, and @@ -851,15 +847,14 @@ public: SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT, SDValue Op1, SDValue Op2, SDValue Op3); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT, - const SDValue *Ops, unsigned NumOps); + ArrayRef<SDValue> Ops); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, EVT VT2); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, - EVT VT2, const SDValue *Ops, unsigned NumOps); + EVT VT2, ArrayRef<SDValue> Ops); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, - EVT VT2, EVT VT3, const SDValue *Ops, unsigned NumOps); + EVT VT2, EVT VT3, ArrayRef<SDValue> Ops); SDNode *SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, - EVT VT2, EVT VT3, EVT VT4, const SDValue *Ops, - unsigned NumOps); + EVT VT2, EVT VT3, EVT VT4, ArrayRef<SDValue> Ops); SDNode *SelectNodeTo(SDNode *N, unsigned
TargetOpc, EVT VT1, EVT VT2, SDValue Op1); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, @@ -869,12 +864,12 @@ public: SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, EVT VT1, EVT VT2, EVT VT3, SDValue Op1, SDValue Op2, SDValue Op3); SDNode *SelectNodeTo(SDNode *N, unsigned TargetOpc, SDVTList VTs, - const SDValue *Ops, unsigned NumOps); + ArrayRef<SDValue> Ops); /// MorphNodeTo - This *mutates* the specified node to have the specified /// return type, opcode, and operands. SDNode *MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, - const SDValue *Ops, unsigned NumOps); + ArrayRef<SDValue> Ops); /// getMachineNode - These are used for target selectors to create a new node /// with specified return type(s), MachineInstr opcode, and operands. @@ -927,17 +922,19 @@ public: /// getNodeIfExists - Get the specified node if it's already available, or /// else return NULL. - SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTs, - const SDValue *Ops, unsigned NumOps); + SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTs, ArrayRef<SDValue> Ops); /// getDbgValue - Creates a SDDbgValue node. /// - SDDbgValue *getDbgValue(MDNode *MDPtr, SDNode *N, unsigned R, uint64_t Off, - DebugLoc DL, unsigned O); - SDDbgValue *getDbgValue(MDNode *MDPtr, const Value *C, uint64_t Off, - DebugLoc DL, unsigned O); - SDDbgValue *getDbgValue(MDNode *MDPtr, unsigned FI, uint64_t Off, + SDDbgValue *getDbgValue(MDNode *MDPtr, SDNode *N, unsigned R, + bool IsIndirect, uint64_t Off, DebugLoc DL, unsigned O); + /// Constant. + SDDbgValue *getConstantDbgValue(MDNode *MDPtr, const Value *C, uint64_t Off, + DebugLoc DL, unsigned O); + /// Frame index. + SDDbgValue *getFrameIndexDbgValue(MDNode *MDPtr, unsigned FI, uint64_t Off, + DebugLoc DL, unsigned O); /// RemoveDeadNode - Remove the specified node from the system. If any of its /// operands then becomes dead, remove them as well. Inform UpdateListener @@ -1082,13 +1079,12 @@ public: bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth = 0) const; - /// ComputeMaskedBits - Determine which of the bits specified in Mask are - /// known to be either zero or one and return them in the KnownZero/KnownOne - /// bitsets. This code only analyzes bits in Mask, in order to short-circuit - /// processing. Targets can implement the computeMaskedBitsForTargetNode - /// method in the TargetLowering class to allow target nodes to be understood. - void ComputeMaskedBits(SDValue Op, APInt &KnownZero, APInt &KnownOne, - unsigned Depth = 0) const; + /// Determine which bits of Op are known to be either zero or one and return + /// them in the KnownZero/KnownOne bitsets. Targets can implement the + /// computeKnownBitsForTargetNode method in the TargetLowering class to allow + /// target nodes to be understood. + void computeKnownBits(SDValue Op, APInt &KnownZero, APInt &KnownOne, + unsigned Depth = 0) const; /// ComputeNumSignBits - Return the number of times the sign bit of the /// register is replicated into the other bits. We know that at least 1 bit @@ -1160,21 +1156,27 @@ public: return SplitVector(N->getOperand(OpNo), SDLoc(N)); } + /// ExtractVectorElements - Append the extracted elements from Start to Count + /// out of the vector Op in Args. If Count is 0, all of the elements will be + /// extracted.
+ void ExtractVectorElements(SDValue Op, SmallVectorImpl<SDValue> &Args, + unsigned Start = 0, unsigned Count = 0); + + unsigned getEVTAlignment(EVT MemoryVT) const; + private: bool RemoveNodeFromCSEMaps(SDNode *N); void AddModifiedNodeToCSEMaps(SDNode *N); SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op, void *&InsertPos); SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op1, SDValue Op2, void *&InsertPos); - SDNode *FindModifiedNodeSlot(SDNode *N, const SDValue *Ops, unsigned NumOps, + SDNode *FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops, void *&InsertPos); SDNode *UpdadeSDLocOnMergedSDNode(SDNode *N, SDLoc loc); void DeleteNodeNotInCSEMaps(SDNode *N); void DeallocateNode(SDNode *N); - unsigned getEVTAlignment(EVT MemoryVT) const; - void allnodes_clear(); /// VTList - List of non-single value types. diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h index b92b6ec..520be40 100644 --- a/include/llvm/CodeGen/SelectionDAGISel.h +++ b/include/llvm/CodeGen/SelectionDAGISel.h @@ -242,13 +242,15 @@ private: // Calls to these functions are generated by tblgen. SDNode *Select_INLINEASM(SDNode *N); + SDNode *Select_READ_REGISTER(SDNode *N); + SDNode *Select_WRITE_REGISTER(SDNode *N); SDNode *Select_UNDEF(SDNode *N); void CannotYetSelect(SDNode *N); private: void DoInstructionSelection(); SDNode *MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTs, - const SDValue *Ops, unsigned NumOps, unsigned EmitNodeInfo); + ArrayRef<SDValue> Ops, unsigned EmitNodeInfo); void PrepareEHLandingPad(); diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index fd915b0..4f0ddb7 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -19,6 +19,7 @@ #ifndef LLVM_CODEGEN_SELECTIONDAGNODES_H #define LLVM_CODEGEN_SELECTIONDAGNODES_H +#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/STLExtras.h" @@ -99,7 +100,7 @@ class SDValue { SDNode *Node; // The node defining the value we are using. unsigned ResNo; // Which return value of the node we are using. public: - SDValue() : Node(0), ResNo(0) {} + SDValue() : Node(nullptr), ResNo(0) {} SDValue(SDNode *node, unsigned resno) : Node(node), ResNo(resno) {} /// get the index which selects a specific result in the SDNode @@ -233,7 +234,7 @@ class SDUse { void operator=(const SDUse &U) LLVM_DELETED_FUNCTION; public: - SDUse() : Val(), User(NULL), Prev(NULL), Next(NULL) {} + SDUse() : Val(), User(nullptr), Prev(nullptr), Next(nullptr) {} /// Normally SDUse will just implicitly convert to an SDValue that it holds. operator const SDValue&() const { return Val; } @@ -407,7 +408,7 @@ public: /// use_empty - Return true if there are no uses of this node. /// - bool use_empty() const { return UseList == NULL; } + bool use_empty() const { return UseList == nullptr; } /// hasOneUse - Return true if there is exactly one use of this node. /// @@ -457,7 +458,7 @@ public: SDUse, ptrdiff_t>::pointer pointer; use_iterator(const use_iterator &I) : Op(I.Op) {} - use_iterator() : Op(0) {} + use_iterator() : Op(nullptr) {} bool operator==(const use_iterator &x) const { return Op == x.Op; @@ -467,7 +468,7 @@ public: } /// atEnd - return true if this iterator is at the end of uses list. - bool atEnd() const { return Op == 0; } + bool atEnd() const { return Op == nullptr; } // Iterator traversal: forward iteration only.
use_iterator &operator++() { // Preincrement @@ -505,8 +506,14 @@ public: return use_iterator(UseList); } - static use_iterator use_end() { return use_iterator(0); } + static use_iterator use_end() { return use_iterator(nullptr); } + inline iterator_range<use_iterator> uses() { + return iterator_range<use_iterator>(use_begin(), use_end()); + } + inline iterator_range<use_iterator> uses() const { + return iterator_range<use_iterator>(use_begin(), use_end()); + } /// hasNUsesOfValue - Return true if there are exactly NUSES uses of the /// indicated value. This method ignores uses of other values defined by this @@ -579,7 +586,7 @@ public: if (getNumOperands() != 0 && getOperand(getNumOperands()-1).getValueType() == MVT::Glue) return getOperand(getNumOperands()-1).getNode(); - return 0; + return nullptr; } // If this is a pseudo op, like copyfromreg, look to see if there is a @@ -604,7 +611,7 @@ public: for (use_iterator UI = use_begin(), UE = use_end(); UI != UE; ++UI) if (UI.getUse().get().getValueType() == MVT::Glue) return *UI; - return 0; + return nullptr; } /// getNumValues - Return the number of values defined/returned by this @@ -637,12 +644,12 @@ public: /// getOperationName - Return the opcode of this operation for printing. /// - std::string getOperationName(const SelectionDAG *G = 0) const; + std::string getOperationName(const SelectionDAG *G = nullptr) const; static const char* getIndexedModeName(ISD::MemIndexedMode AM); void print_types(raw_ostream &OS, const SelectionDAG *G) const; void print_details(raw_ostream &OS, const SelectionDAG *G) const; - void print(raw_ostream &OS, const SelectionDAG *G = 0) const; - void printr(raw_ostream &OS, const SelectionDAG *G = 0) const; + void print(raw_ostream &OS, const SelectionDAG *G = nullptr) const; + void printr(raw_ostream &OS, const SelectionDAG *G = nullptr) const; /// printrFull - Print a SelectionDAG node and all children down to /// the leaves. The given SelectionDAG allows target-specific nodes @@ -650,7 +657,7 @@ public: /// print the whole DAG, including children that appear multiple /// times. /// - void printrFull(raw_ostream &O, const SelectionDAG *G = 0) const; + void printrFull(raw_ostream &O, const SelectionDAG *G = nullptr) const; /// printrWithDepth - Print a SelectionDAG node and children up to /// depth "depth." The given SelectionDAG allows target-specific @@ -658,7 +665,7 @@ public: /// will print children that appear multiple times wherever they are /// used. /// - void printrWithDepth(raw_ostream &O, const SelectionDAG *G = 0, + void printrWithDepth(raw_ostream &O, const SelectionDAG *G = nullptr, unsigned depth = 100) const; @@ -683,14 +690,15 @@ public: /// Unlike dumpr, this will print the whole DAG, including children /// that appear multiple times. /// - void dumprFull(const SelectionDAG *G = 0) const; + void dumprFull(const SelectionDAG *G = nullptr) const; /// dumprWithDepth - printrWithDepth to dbgs(). The given /// SelectionDAG allows target-specific nodes to be printed in /// human-readable form. Unlike dumpr, this will print children /// that appear multiple times wherever they are used. /// - void dumprWithDepth(const SelectionDAG *G = 0, unsigned depth = 100) const; + void dumprWithDepth(const SelectionDAG *G = nullptr, + unsigned depth = 100) const; /// Profile - Gather unique data for the node.
/// @@ -707,14 +715,14 @@ protected: } SDNode(unsigned Opc, unsigned Order, const DebugLoc dl, SDVTList VTs, - const SDValue *Ops, unsigned NumOps) + ArrayRef<SDValue> Ops) : NodeType(Opc), OperandsNeedDelete(true), HasDebugValue(false), SubclassData(0), NodeId(-1), - OperandList(NumOps ? new SDUse[NumOps] : 0), - ValueList(VTs.VTs), UseList(NULL), - NumOperands(NumOps), NumValues(VTs.NumVTs), + OperandList(Ops.size() ? new SDUse[Ops.size()] : nullptr), + ValueList(VTs.VTs), UseList(nullptr), + NumOperands(Ops.size()), NumValues(VTs.NumVTs), debugLoc(dl), IROrder(Order) { - for (unsigned i = 0; i != NumOps; ++i) { + for (unsigned i = 0; i != Ops.size(); ++i) { OperandList[i].setUser(this); OperandList[i].setInitial(Ops[i]); } @@ -725,9 +733,9 @@ protected: /// set later with InitOperands. SDNode(unsigned Opc, unsigned Order, const DebugLoc dl, SDVTList VTs) : NodeType(Opc), OperandsNeedDelete(false), HasDebugValue(false), - SubclassData(0), NodeId(-1), OperandList(0), - ValueList(VTs.VTs), UseList(NULL), NumOperands(0), NumValues(VTs.NumVTs), - debugLoc(dl), IROrder(Order) {} + SubclassData(0), NodeId(-1), OperandList(nullptr), ValueList(VTs.VTs), + UseList(nullptr), NumOperands(0), NumValues(VTs.NumVTs), debugLoc(dl), + IROrder(Order) {} /// InitOperands - Initialize the operands list of this with 1 operand. void InitOperands(SDUse *Ops, const SDValue &Op0) { @@ -812,7 +820,7 @@ private: int IROrder; public: - SDLoc() : Ptr(NULL), IROrder(0) {} + SDLoc() : Ptr(nullptr), IROrder(0) {} SDLoc(const SDNode *N) : Ptr(N), IROrder(-1) { assert(N && "null SDNode"); } @@ -823,14 +831,14 @@ public: assert(Order >= 0 && "bad IROrder"); } unsigned getIROrder() { - if (IROrder >= 0 || Ptr == NULL) { + if (IROrder >= 0 || Ptr == nullptr) { return (unsigned)IROrder; } const SDNode *N = (const SDNode*)(Ptr); return N->getIROrder(); } DebugLoc getDebugLoc() { - if (Ptr == NULL) { + if (!Ptr) { return DebugLoc(); } if (IROrder >= 0) { @@ -990,8 +998,7 @@ public: EVT MemoryVT, MachineMemOperand *MMO); MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, - const SDValue *Ops, - unsigned NumOps, EVT MemoryVT, MachineMemOperand *MMO); + ArrayRef<SDValue> Ops, EVT MemoryVT, MachineMemOperand *MMO); bool readMem() const { return MMO->isLoad(); } bool writeMem() const { return MMO->isStore(); } @@ -1024,8 +1031,7 @@ public: return SynchronizationScope((SubclassData >> 12) & 1); } - /// Returns the SrcValue and offset that describes the location of the access - const Value *getSrcValue() const { return MMO->getValue(); } + // Returns the offset from the location of the access. int64_t getSrcValueOffset() const { return MMO->getOffset(); } /// Returns the TBAAInfo that describes the dereference.
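Note: together with the iterator_range plumbing added a couple of hunks up (SDNode::uses() and the new llvm/ADT/iterator_range.h include), node users can now be walked with a range-based for instead of an explicit use_begin()/use_end() pair. A minimal standalone sketch of the idiom, with a hypothetical Node type and a simplified stand-in for iterator_range:

    #include <cstdio>
    #include <vector>

    // Simplified version of llvm::iterator_range: a begin/end pair that
    // makes any iterator pair usable in a range-based for loop.
    template <typename IteratorT> class iterator_range {
      IteratorT BeginIt, EndIt;
    public:
      iterator_range(IteratorT B, IteratorT E) : BeginIt(B), EndIt(E) {}
      IteratorT begin() const { return BeginIt; }
      IteratorT end() const { return EndIt; }
    };

    struct Node {
      std::vector<int> Users; // stand-in for the SDUse list
      typedef std::vector<int>::const_iterator use_iterator;
      use_iterator use_begin() const { return Users.begin(); }
      use_iterator use_end() const { return Users.end(); }
      iterator_range<use_iterator> uses() const {
        return iterator_range<use_iterator>(use_begin(), use_end());
      }
    };

    int main() {
      Node N{{1, 2, 3}};
      // Old style: explicit iterator pair.
      for (Node::use_iterator I = N.use_begin(), E = N.use_end(); I != E; ++I)
        std::printf("%d ", *I);
      // New style enabled by uses(): range-based for.
      for (int U : N.uses())
        std::printf("%d ", U);
      std::puts("");
      return 0;
    }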
@@ -1153,7 +1159,7 @@ public: InitOperands(Ops, Chain, Ptr); } AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL, EVT MemVT, - SDValue* AllOps, SDUse *DynOps, unsigned NumOps, + const SDValue* AllOps, SDUse *DynOps, unsigned NumOps, MachineMemOperand *MMO, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) @@ -1208,9 +1214,9 @@ public: class MemIntrinsicSDNode : public MemSDNode { public: MemIntrinsicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, - const SDValue *Ops, unsigned NumOps, - EVT MemoryVT, MachineMemOperand *MMO) + ArrayRef<SDValue> Ops, EVT MemoryVT, + MachineMemOperand *MMO) : MemSDNode(Opc, Order, dl, VTs, Ops, MemoryVT, MMO) { } // Methods to support isa and dyn_cast @@ -1680,11 +1686,10 @@ class CvtRndSatSDNode : public SDNode { ISD::CvtCode CvtCode; friend class SelectionDAG; explicit CvtRndSatSDNode(EVT VT, unsigned Order, DebugLoc dl, - const SDValue *Ops, unsigned NumOps, - ISD::CvtCode Code) - : SDNode(ISD::CONVERT_RNDSAT, Order, dl, getSDVTList(VT), Ops, NumOps), + ArrayRef<SDValue> Ops, ISD::CvtCode Code) + : SDNode(ISD::CONVERT_RNDSAT, Order, dl, getSDVTList(VT), Ops), CvtCode(Code) { - assert(NumOps == 5 && "wrong number of operations"); + assert(Ops.size() == 5 && "wrong number of operations"); } public: ISD::CvtCode getCvtCode() const { return CvtCode; } @@ -1827,7 +1832,7 @@ public: private: friend class SelectionDAG; MachineSDNode(unsigned Opc, unsigned Order, const DebugLoc DL, SDVTList VTs) - : SDNode(Opc, Order, DL, VTs), MemRefs(0), MemRefsEnd(0) {} + : SDNode(Opc, Order, DL, VTs), MemRefs(nullptr), MemRefsEnd(nullptr) {} /// LocalOperands - Operands for this instruction, if they fit here. If /// they don't, this field is unused. diff --git a/include/llvm/CodeGen/SlotIndexes.h b/include/llvm/CodeGen/SlotIndexes.h index 1cc34d5..00bb22b 100644 --- a/include/llvm/CodeGen/SlotIndexes.h +++ b/include/llvm/CodeGen/SlotIndexes.h @@ -147,11 +147,11 @@ namespace llvm { }; /// Construct an invalid index. - SlotIndex() : lie(0, 0) {} + SlotIndex() : lie(nullptr, 0) {} // Construct a new slot index from the given one, and set the slot. SlotIndex(const SlotIndex &li, Slot s) : lie(li.listEntry(), unsigned(s)) { - assert(lie.getPointer() != 0 && + assert(lie.getPointer() != nullptr && "Attempt to construct index with 0 pointer."); } @@ -421,7 +421,7 @@ namespace llvm { /// Returns the instruction for the given index, or null if the given /// index has no instruction associated with it. MachineInstr* getInstructionFromIndex(SlotIndex index) const { - return index.isValid() ? index.listEntry()->getInstr() : 0; + return index.isValid() ? index.listEntry()->getInstr() : nullptr; } /// Returns the next non-null index, if one exists. @@ -551,14 +551,14 @@ namespace llvm { // Check that we don't cross the boundary into this block. if (itr->first < end) - return 0; + return nullptr; itr = std::prev(itr); if (itr->first <= start) return itr->second; - return 0; + return nullptr; } /// Insert the given machine instruction into the mapping. Returns the @@ -574,7 +574,7 @@ namespace llvm { // affected by debug information. assert(!mi->isDebugValue() && "Cannot number DBG_VALUE instructions."); - assert(mi->getParent() != 0 && "Instr must be added to function."); + assert(mi->getParent() != nullptr && "Instr must be added to function."); // Get the entries where mi should be inserted.
IndexList::iterator prevItr, nextItr; @@ -615,7 +615,7 @@ namespace llvm { IndexListEntry *miEntry(mi2iItr->second.listEntry()); assert(miEntry->getInstr() == mi && "Instruction indexes broken."); // FIXME: Eventually we want to actually delete these indexes. - miEntry->setInstr(0); + miEntry->setInstr(nullptr); mi2iMap.erase(mi2iItr); } } @@ -640,15 +640,15 @@ namespace llvm { MachineFunction::iterator nextMBB = std::next(MachineFunction::iterator(mbb)); - IndexListEntry *startEntry = 0; - IndexListEntry *endEntry = 0; + IndexListEntry *startEntry = nullptr; + IndexListEntry *endEntry = nullptr; IndexList::iterator newItr; if (nextMBB == mbb->getParent()->end()) { startEntry = &indexList.back(); - endEntry = createEntry(0, 0); + endEntry = createEntry(nullptr, 0); newItr = indexList.insertAfter(startEntry, endEntry); } else { - startEntry = createEntry(0, 0); + startEntry = createEntry(nullptr, 0); endEntry = getMBBStartIdx(nextMBB).listEntry(); newItr = indexList.insert(endEntry, startEntry); } diff --git a/include/llvm/CodeGen/StackMaps.h b/include/llvm/CodeGen/StackMaps.h index a62ab6e..5eddbb6 100644 --- a/include/llvm/CodeGen/StackMaps.h +++ b/include/llvm/CodeGen/StackMaps.h @@ -21,6 +21,7 @@ namespace llvm { class AsmPrinter; class MCExpr; +class MCStreamer; /// \brief MI-level patchpoint operands. /// @@ -115,7 +116,7 @@ public: // OpParser. typedef enum { DirectMemRefOp, IndirectMemRefOp, ConstantOp } OpType; - StackMaps(AsmPrinter &AP) : AP(AP) {} + StackMaps(AsmPrinter &AP); /// \brief Generate a stackmap record for a stackmap instruction. /// @@ -131,8 +132,11 @@ public: void serializeToStackMapSection(); private: + static const char *WSMP; + typedef SmallVector<Location, 8> LocationVec; typedef SmallVector<LiveOutReg, 8> LiveOutVec; + typedef MapVector<uint64_t, uint64_t> ConstantPool; typedef MapVector<const MCSymbol *, uint64_t> FnStackSizeMap; struct CallsiteInfo { @@ -140,7 +144,7 @@ private: uint64_t ID; LocationVec Locations; LiveOutVec LiveOuts; - CallsiteInfo() : CSOffsetExpr(0), ID(0) {} + CallsiteInfo() : CSOffsetExpr(nullptr), ID(0) {} CallsiteInfo(const MCExpr *CSOffsetExpr, uint64_t ID, LocationVec &Locations, LiveOutVec &LiveOuts) : CSOffsetExpr(CSOffsetExpr), ID(ID), Locations(Locations), @@ -149,26 +153,6 @@ private: typedef std::vector<CallsiteInfo> CallsiteInfoList; - struct ConstantPool { - private: - typedef std::map<int64_t, size_t> ConstantsMap; - std::vector<int64_t> ConstantsList; - ConstantsMap ConstantIndexes; - - public: - size_t getNumConstants() const { return ConstantsList.size(); } - int64_t getConstant(size_t Idx) const { return ConstantsList[Idx]; } - size_t getConstantIndex(int64_t ConstVal) { - size_t NextIdx = ConstantsList.size(); - ConstantsMap::const_iterator I = - ConstantIndexes.insert(ConstantIndexes.end(), - std::make_pair(ConstVal, NextIdx)); - if (I->second == NextIdx) - ConstantsList.push_back(ConstVal); - return I->second; - } - }; - AsmPrinter &AP; CallsiteInfoList CSInfos; ConstantPool ConstPool; @@ -196,6 +180,18 @@ private: MachineInstr::const_mop_iterator MOI, MachineInstr::const_mop_iterator MOE, bool recordResult = false); + + /// \brief Emit the stackmap header. + void emitStackmapHeader(MCStreamer &OS); + + /// \brief Emit the function frame record for each function. + void emitFunctionFrameRecords(MCStreamer &OS); + + /// \brief Emit the constant pool. + void emitConstantPoolEntries(MCStreamer &OS); + + /// \brief Emit the callsite info for each stackmap/patchpoint intrinsic call.
+ void emitCallsiteEntries(MCStreamer &OS, const TargetRegisterInfo *TRI); }; } diff --git a/include/llvm/CodeGen/StackProtector.h b/include/llvm/CodeGen/StackProtector.h index 0b8b8c0..8cef85c 100644 --- a/include/llvm/CodeGen/StackProtector.h +++ b/include/llvm/CodeGen/StackProtector.h @@ -105,11 +105,12 @@ private: public: static char ID; // Pass identification, replacement for typeid. - StackProtector() : FunctionPass(ID), TM(0), TLI(0), SSPBufferSize(0) { + StackProtector() + : FunctionPass(ID), TM(nullptr), TLI(nullptr), SSPBufferSize(0) { initializeStackProtectorPass(*PassRegistry::getPassRegistry()); } StackProtector(const TargetMachine *TM) - : FunctionPass(ID), TM(TM), TLI(0), Trip(TM->getTargetTriple()), + : FunctionPass(ID), TM(TM), TLI(nullptr), Trip(TM->getTargetTriple()), SSPBufferSize(8) { initializeStackProtectorPass(*PassRegistry::getPassRegistry()); } diff --git a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h index 16fed32..9f1cbaa 100644 --- a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h +++ b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h @@ -67,10 +67,12 @@ public: MachineModuleInfo *MMI) const override; void InitializeELF(bool UseInitArray_); - const MCSection * - getStaticCtorSection(unsigned Priority = 65535) const override; - const MCSection * - getStaticDtorSection(unsigned Priority = 65535) const override; + const MCSection *getStaticCtorSection(unsigned Priority, + const MCSymbol *KeySym, + const MCSection *KeySec) const override; + const MCSection *getStaticDtorSection(unsigned Priority, + const MCSymbol *KeySym, + const MCSection *KeySec) const override; }; @@ -140,6 +142,13 @@ public: void emitModuleFlags(MCStreamer &Streamer, ArrayRef<Module::ModuleFlagEntry> ModuleFlags, Mangler &Mang, const TargetMachine &TM) const override; + + const MCSection *getStaticCtorSection(unsigned Priority, + const MCSymbol *KeySym, + const MCSection *KeySec) const override; + const MCSection *getStaticDtorSection(unsigned Priority, + const MCSymbol *KeySym, + const MCSection *KeySec) const override; }; } // end namespace llvm diff --git a/include/llvm/CodeGen/TargetSchedule.h b/include/llvm/CodeGen/TargetSchedule.h index 4e178d0..690b70f 100644 --- a/include/llvm/CodeGen/TargetSchedule.h +++ b/include/llvm/CodeGen/TargetSchedule.h @@ -41,7 +41,7 @@ class TargetSchedModel { unsigned MicroOpFactor; // Multiply to normalize microops to resource units. unsigned ResourceLCM; // Resource units per cycle. Latency normalization factor. public: - TargetSchedModel(): STI(0), TII(0) {} + TargetSchedModel(): STI(nullptr), TII(nullptr) {} /// \brief Initialize the machine model for instruction scheduling. /// @@ -75,7 +75,7 @@ public: const InstrItineraryData *getInstrItineraries() const { if (hasInstrItineraries()) return &InstrItins; - return 0; + return nullptr; } /// \brief Identify the processor corresponding to the current subtarget. @@ -86,7 +86,7 @@ public: /// \brief Return the number of issue slots required for this MI. unsigned getNumMicroOps(const MachineInstr *MI, - const MCSchedClassDesc *SC = 0) const; + const MCSchedClassDesc *SC = nullptr) const; /// \brief Get the number of kinds of resources for this target.
unsigned getNumProcResourceKinds() const { diff --git a/include/llvm/CodeGen/ValueTypes.h b/include/llvm/CodeGen/ValueTypes.h index 8cf26fa..4e93940 100644 --- a/include/llvm/CodeGen/ValueTypes.h +++ b/include/llvm/CodeGen/ValueTypes.h @@ -35,9 +35,9 @@ namespace llvm { public: EVT() : V((MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE)), - LLVMTy(0) {} - EVT(MVT::SimpleValueType SVT) : V(SVT), LLVMTy(0) { } - EVT(MVT S) : V(S), LLVMTy(0) {} + LLVMTy(nullptr) {} + EVT(MVT::SimpleValueType SVT) : V(SVT), LLVMTy(nullptr) { } + EVT(MVT S) : V(S), LLVMTy(nullptr) {} bool operator==(EVT VT) const { return !(*this != VT); diff --git a/include/llvm/CodeGen/VirtRegMap.h b/include/llvm/CodeGen/VirtRegMap.h index 89b5a9f..eceb875 100644 --- a/include/llvm/CodeGen/VirtRegMap.h +++ b/include/llvm/CodeGen/VirtRegMap.h @@ -177,7 +177,7 @@ namespace llvm { /// the specified stack slot void assignVirt2StackSlot(unsigned virtReg, int frameIndex); - void print(raw_ostream &OS, const Module* M = 0) const override; + void print(raw_ostream &OS, const Module* M = nullptr) const override; void dump() const; }; diff --git a/include/llvm/DebugInfo/DIContext.h b/include/llvm/DebugInfo/DIContext.h index 69a4f8d..c1aba01 100644 --- a/include/llvm/DebugInfo/DIContext.h +++ b/include/llvm/DebugInfo/DIContext.h @@ -16,42 +16,31 @@ #define LLVM_DEBUGINFO_DICONTEXT_H #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Object/RelocVisitor.h" #include "llvm/Support/Casting.h" #include "llvm/Support/DataTypes.h" +#include <string> + namespace llvm { class raw_ostream; /// DILineInfo - a format-neutral container for source line information. -class DILineInfo { - SmallString<16> FileName; - SmallString<16> FunctionName; +struct DILineInfo { + std::string FileName; + std::string FunctionName; uint32_t Line; uint32_t Column; -public: + DILineInfo() - : FileName(""), FunctionName(""), - Line(0), Column(0) {} - DILineInfo(StringRef fileName, StringRef functionName, uint32_t line, - uint32_t column) - : FileName(fileName), FunctionName(functionName), Line(line), - Column(column) {} - - const char *getFileName() { return FileName.c_str(); } - const char *getFunctionName() { return FunctionName.c_str(); } - uint32_t getLine() const { return Line; } - uint32_t getColumn() const { return Column; } + : FileName(""), FunctionName(""), Line(0), Column(0) {} bool operator==(const DILineInfo &RHS) const { return Line == RHS.Line && Column == RHS.Column && - FileName.equals(RHS.FileName) && - FunctionName.equals(RHS.FunctionName); + FileName == RHS.FileName && FunctionName == RHS.FunctionName; } bool operator!=(const DILineInfo &RHS) const { return !(*this == RHS); @@ -79,19 +68,16 @@ class DIInliningInfo { /// DILineInfoSpecifier - controls which fields of DILineInfo container /// should be filled with data. -class DILineInfoSpecifier { - const uint32_t Flags; // Or'ed flags that set the info we want to fetch. -public: - enum Specification { - FileLineInfo = 1 << 0, - AbsoluteFilePath = 1 << 1, - FunctionName = 1 << 2 - }; - // Use file/line info by default.
- DILineInfoSpecifier(uint32_t flags = FileLineInfo) : Flags(flags) {} - bool needs(Specification spec) const { - return (Flags & spec) > 0; - } +struct DILineInfoSpecifier { + enum class FileLineInfoKind { None, Default, AbsoluteFilePath }; + enum class FunctionNameKind { None, ShortName, LinkageName }; + + FileLineInfoKind FLIKind; + FunctionNameKind FNKind; + + DILineInfoSpecifier(FileLineInfoKind FLIKind = FileLineInfoKind::Default, + FunctionNameKind FNKind = FunctionNameKind::None) + : FLIKind(FLIKind), FNKind(FNKind) {} }; /// Selects which debug sections get dumped. diff --git a/include/llvm/DebugInfo/DWARFFormValue.h b/include/llvm/DebugInfo/DWARFFormValue.h index 533d259..d517a72 100644 --- a/include/llvm/DebugInfo/DWARFFormValue.h +++ b/include/llvm/DebugInfo/DWARFFormValue.h @@ -36,7 +36,7 @@ public: private: struct ValueType { - ValueType() : data(NULL) { + ValueType() : data(nullptr) { uval = 0; } @@ -60,7 +60,7 @@ public: bool extractValue(DataExtractor data, uint32_t *offset_ptr, const DWARFUnit *u); bool isInlinedCStr() const { - return Value.data != NULL && Value.data == (const uint8_t*)Value.cstr; + return Value.data != nullptr && Value.data == (const uint8_t*)Value.cstr; } /// getAsFoo functions below return the extracted value as Foo if only diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h index 4dca870..7518c1e 100644 --- a/include/llvm/ExecutionEngine/ExecutionEngine.h +++ b/include/llvm/ExecutionEngine/ExecutionEngine.h @@ -123,6 +123,9 @@ class ExecutionEngine { /// using dlsym). bool SymbolSearchingDisabled; + /// Whether the JIT should verify IR modules during compilation. + bool VerifyModules; + friend class EngineBuilder; // To allow access to JITCtor and InterpCtor. protected: @@ -181,7 +184,7 @@ public: /// freeMachineCodeForFunction works. static ExecutionEngine *create(Module *M, bool ForceInterpreter = false, - std::string *ErrorStr = 0, + std::string *ErrorStr = nullptr, CodeGenOpt::Level OptLevel = CodeGenOpt::Default, bool GVsWithCode = true); @@ -193,8 +196,8 @@ public: /// Clients should make sure to initialize targets prior to calling this /// function. static ExecutionEngine *createJIT(Module *M, - std::string *ErrorStr = 0, - JITMemoryManager *JMM = 0, + std::string *ErrorStr = nullptr, + JITMemoryManager *JMM = nullptr, CodeGenOpt::Level OptLevel = CodeGenOpt::Default, bool GVsWithCode = true, @@ -219,10 +222,7 @@ public: /// needed by another object. /// /// MCJIT will take ownership of the ObjectFile. - virtual void addObjectFile(object::ObjectFile *O) { - llvm_unreachable( - "ExecutionEngine subclass doesn't implement addObjectFile."); - } + virtual void addObjectFile(std::unique_ptr<object::ObjectFile> O); /// addArchive - Add an Archive to the execution engine. /// @@ -411,7 +411,7 @@ public: } // The JIT overrides a version that actually does this. - virtual void runJITOnFunction(Function *, MachineCodeInfo * = 0) { } + virtual void runJITOnFunction(Function *, MachineCodeInfo * = nullptr) { } /// getGlobalValueAtAddress - Return the LLVM global value object that starts /// at the specified address. @@ -478,7 +478,7 @@ public: } /// Return the target machine (if available).
- virtual TargetMachine *getTargetMachine() { return NULL; } + virtual TargetMachine *getTargetMachine() { return nullptr; } /// DisableLazyCompilation - When lazy compilation is off (the default), the /// JIT will eagerly compile every function reachable from the argument to @@ -525,6 +525,17 @@ public: return SymbolSearchingDisabled; } + /// Enable/Disable IR module verification. + /// + /// Note: Module verification is enabled by default in Debug builds, and + /// disabled by default in Release. Use this method to override the default. + void setVerifyModules(bool Verify) { + VerifyModules = Verify; + } + bool getVerifyModules() const { + return VerifyModules; + } + /// InstallLazyFunctionCreator - If an unknown function is needed, the /// specified function pointer is invoked to create it. If it returns null, /// the JIT will abort. @@ -572,19 +583,28 @@ private: std::string MCPU; SmallVector<std::string, 4> MAttrs; bool UseMCJIT; + bool VerifyModules; /// InitEngine - Does the common initialization of default options. void InitEngine() { WhichEngine = EngineKind::Either; - ErrorStr = NULL; + ErrorStr = nullptr; OptLevel = CodeGenOpt::Default; - MCJMM = NULL; - JMM = NULL; + MCJMM = nullptr; + JMM = nullptr; Options = TargetOptions(); AllocateGVsWithCode = false; RelocModel = Reloc::Default; CMModel = CodeModel::JITDefault; UseMCJIT = false; + + // IR module verification is enabled by default in debug builds, and disabled + // by default in release builds. +#ifndef NDEBUG + VerifyModules = true; +#else + VerifyModules = false; +#endif } public: @@ -610,7 +630,7 @@ public: /// the setJITMemoryManager() option. EngineBuilder &setMCJITMemoryManager(RTDyldMemoryManager *mcjmm) { MCJMM = mcjmm; - JMM = NULL; + JMM = nullptr; return *this; } @@ -622,7 +642,7 @@ public: /// memory manager. This option defaults to NULL. This option overrides /// setMCJITMemoryManager() as well. EngineBuilder &setJITMemoryManager(JITMemoryManager *jmm) { - MCJMM = NULL; + MCJMM = nullptr; JMM = jmm; return *this; } @@ -694,6 +714,13 @@ public: return *this; } + /// setVerifyModules - Set whether the JIT implementation should verify + /// IR modules during compilation. + EngineBuilder &setVerifyModules(bool Verify) { + VerifyModules = Verify; + return *this; + } + /// setMAttrs - Set cpu-specific attributes.
template EngineBuilder &setMAttrs(const StringSequence &mattrs) { diff --git a/include/llvm/ExecutionEngine/JITEventListener.h b/include/llvm/ExecutionEngine/JITEventListener.h index 8daf2bd..99fe36c 100644 --- a/include/llvm/ExecutionEngine/JITEventListener.h +++ b/include/llvm/ExecutionEngine/JITEventListener.h @@ -98,11 +98,11 @@ public: static JITEventListener *createIntelJITEventListener( IntelJITEventsWrapper* AlternativeImpl); #else - static JITEventListener *createIntelJITEventListener() { return 0; } + static JITEventListener *createIntelJITEventListener() { return nullptr; } static JITEventListener *createIntelJITEventListener( IntelJITEventsWrapper* AlternativeImpl) { - return 0; + return nullptr; } #endif // USE_INTEL_JITEVENTS @@ -115,11 +115,11 @@ public: OProfileWrapper* AlternativeImpl); #else - static JITEventListener *createOProfileJITEventListener() { return 0; } + static JITEventListener *createOProfileJITEventListener() { return nullptr; } static JITEventListener *createOProfileJITEventListener( OProfileWrapper* AlternativeImpl) { - return 0; + return nullptr; } #endif // USE_OPROFILE diff --git a/include/llvm/ExecutionEngine/ObjectImage.h b/include/llvm/ExecutionEngine/ObjectImage.h index 1a13647..1fcedd8 100644 --- a/include/llvm/ExecutionEngine/ObjectImage.h +++ b/include/llvm/ExecutionEngine/ObjectImage.h @@ -36,9 +36,17 @@ public: virtual object::symbol_iterator begin_symbols() const = 0; virtual object::symbol_iterator end_symbols() const = 0; + iterator_range symbols() const { + return iterator_range(begin_symbols(), + end_symbols()); + } virtual object::section_iterator begin_sections() const = 0; virtual object::section_iterator end_sections() const = 0; + iterator_range sections() const { + return iterator_range(begin_sections(), + end_sections()); + } virtual /* Triple::ArchType */ unsigned getArch() const = 0; diff --git a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h index 70dd1cb..b1d6810 100644 --- a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h +++ b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h @@ -114,7 +114,7 @@ public: /// operations needed to reliably use the memory are also performed. /// /// Returns true if an error occurred, false otherwise. - virtual bool finalizeMemory(std::string *ErrMsg = 0) = 0; + virtual bool finalizeMemory(std::string *ErrMsg = nullptr) = 0; }; // Create wrappers for C Binding types (see CBindingWrapping.h). diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h index 8d7b81b..30c0d49 100644 --- a/include/llvm/ExecutionEngine/RuntimeDyld.h +++ b/include/llvm/ExecutionEngine/RuntimeDyld.h @@ -55,7 +55,7 @@ public: /// Ownership of the input object is transferred to the ObjectImage /// instance returned from this function if successful. In the case of load /// failure, the input object will be deleted. - ObjectImage *loadObject(object::ObjectFile *InputObject); + ObjectImage *loadObject(std::unique_ptr InputObject); /// Get the address of our local copy of the symbol. This may or may not /// be the address used for relocation (clients can copy the data around diff --git a/include/llvm/ExecutionEngine/SectionMemoryManager.h b/include/llvm/ExecutionEngine/SectionMemoryManager.h index f68028b..f24bb4d 100644 --- a/include/llvm/ExecutionEngine/SectionMemoryManager.h +++ b/include/llvm/ExecutionEngine/SectionMemoryManager.h @@ -72,7 +72,7 @@ public: /// operations needed to reliably use the memory are also performed. 
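The symbols() and sections() ranges added to ObjectImage above let clients replace explicit begin/end iterator pairs with range-based for loops; a sketch:

    #include "llvm/ExecutionEngine/ObjectImage.h"
    using namespace llvm;

    unsigned countSymbols(const ObjectImage &Img) {
      unsigned N = 0;
      // Previously spelled begin_symbols()/end_symbols().
      for (const object::SymbolRef &Sym : Img.symbols()) {
        (void)Sym;
        ++N;
      }
      return N;
    }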
/// /// \returns true if an error occurred, false otherwise. - bool finalizeMemory(std::string *ErrMsg = 0) override; + bool finalizeMemory(std::string *ErrMsg = nullptr) override; /// \brief Invalidate instruction cache for code sections. /// diff --git a/include/llvm/IR/Argument.h b/include/llvm/IR/Argument.h index 7c1ebf6..3a63e1a 100644 --- a/include/llvm/IR/Argument.h +++ b/include/llvm/IR/Argument.h @@ -44,7 +44,7 @@ public: /// /// If \p F is specified, the argument is inserted at the end of the argument /// list for \p F. - explicit Argument(Type *Ty, const Twine &Name = "", Function *F = 0); + explicit Argument(Type *Ty, const Twine &Name = "", Function *F = nullptr); inline const Function *getParent() const { return Parent; } inline Function *getParent() { return Parent; } @@ -55,6 +55,10 @@ public: /// For example in "void foo(int a, float b)" a is 0 and b is 1. unsigned getArgNo() const; + /// \brief Return true if this argument has the nonnull attribute on it in + /// its containing function. + bool hasNonNullAttr() const; + /// \brief Return true if this argument has the byval attribute on it in its /// containing function. bool hasByValAttr() const; diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index 9eccf40..86f9cc8 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -86,6 +86,7 @@ public: NoInline, ///< inline=never NonLazyBind, ///< Function is called early and/or ///< often, so lazy binding isn't worthwhile + NonNull, ///< Pointer is known to be not null NoRedZone, ///< Disable redzone NoReturn, ///< Mark the function as not returning NoUnwind, ///< Function doesn't unwind stack @@ -116,7 +117,7 @@ private: AttributeImpl *pImpl; Attribute(AttributeImpl *A) : pImpl(A) {} public: - Attribute() : pImpl(0) {} + Attribute() : pImpl(nullptr) {} //===--------------------------------------------------------------------===// // Attribute Construction @@ -232,7 +233,7 @@ private: explicit AttributeSet(AttributeSetImpl *LI) : pImpl(LI) {} public: - AttributeSet() : pImpl(0) {} + AttributeSet() : pImpl(nullptr) {} //===--------------------------------------------------------------------===// // AttributeSet Construction and Mutation @@ -242,7 +243,7 @@ public: static AttributeSet get(LLVMContext &C, ArrayRef Attrs); static AttributeSet get(LLVMContext &C, unsigned Index, ArrayRef Kind); - static AttributeSet get(LLVMContext &C, unsigned Index, AttrBuilder &B); + static AttributeSet get(LLVMContext &C, unsigned Index, const AttrBuilder &B); /// \brief Add an attribute to the attribute set at the given index. Since /// attribute sets are immutable, this returns a new set. 
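The new NonNull attribute and Argument::hasNonNullAttr() above give passes a direct query for pointer arguments known to be non-null; a sketch (firstArgKnownNonNull is an illustrative name):

    #include "llvm/IR/Argument.h"
    #include "llvm/IR/Function.h"
    using namespace llvm;

    bool firstArgKnownNonNull(const Function &F) {
      if (F.arg_empty())
        return false;
      // True only if the first argument carries the `nonnull` attribute.
      return F.arg_begin()->hasNonNullAttr();
    }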
@@ -469,6 +470,8 @@ public: typedef std::pair td_type; typedef std::map::iterator td_iterator; typedef std::map::const_iterator td_const_iterator; + typedef llvm::iterator_range td_range; + typedef llvm::iterator_range td_const_range; td_iterator td_begin() { return TargetDepAttrs.begin(); } td_iterator td_end() { return TargetDepAttrs.end(); } @@ -476,6 +479,11 @@ public: td_const_iterator td_begin() const { return TargetDepAttrs.begin(); } td_const_iterator td_end() const { return TargetDepAttrs.end(); } + td_range td_attrs() { return td_range(td_begin(), td_end()); } + td_const_range td_attrs() const { + return td_const_range(td_begin(), td_end()); + } + bool td_empty() const { return TargetDepAttrs.empty(); } bool operator==(const AttrBuilder &B); diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h index 1adc254..a19489a 100644 --- a/include/llvm/IR/BasicBlock.h +++ b/include/llvm/IR/BasicBlock.h @@ -90,7 +90,8 @@ private: /// inserted at either the end of the function (if InsertBefore is null), or /// before the specified basic block. explicit BasicBlock(LLVMContext &C, const Twine &Name = "", - Function *Parent = 0, BasicBlock *InsertBefore = 0); + Function *Parent = nullptr, + BasicBlock *InsertBefore = nullptr); public: /// \brief Get the context in which this basic block lives. LLVMContext &getContext() const; @@ -107,7 +108,8 @@ public: /// inserted at either the end of the function (if InsertBefore is 0), or /// before the specified basic block. static BasicBlock *Create(LLVMContext &Context, const Twine &Name = "", - Function *Parent = 0,BasicBlock *InsertBefore = 0) { + Function *Parent = nullptr, + BasicBlock *InsertBefore = nullptr) { return new BasicBlock(Context, Name, Parent, InsertBefore); } ~BasicBlock(); @@ -172,14 +174,15 @@ public: void moveAfter(BasicBlock *MovePos); - /// \brief Return this block if it has a single predecessor block. Otherwise - /// return a null pointer. + /// \brief Return the predecessor of this block if it has a single predecessor + /// block. Otherwise return a null pointer. BasicBlock *getSinglePredecessor(); const BasicBlock *getSinglePredecessor() const { return const_cast(this)->getSinglePredecessor(); } - /// \brief Return this block if it has a unique predecessor block. Otherwise return a null pointer. + /// \brief Return the predecessor of this block if it has a unique predecessor + /// block. Otherwise return a null pointer. /// /// Note that unique predecessor doesn't mean single edge, there can be /// multiple edges from the unique predecessor to this block (for example a diff --git a/include/llvm/IR/CallSite.h b/include/llvm/IR/CallSite.h index ec46103..deea415 100644 --- a/include/llvm/IR/CallSite.h +++ b/include/llvm/IR/CallSite.h @@ -47,7 +47,7 @@ class CallSiteBase { protected: PointerIntPair I; public: - CallSiteBase() : I(0, false) {} + CallSiteBase() : I(nullptr, false) {} CallSiteBase(CallTy *CI) : I(CI, true) { assert(CI); } CallSiteBase(InvokeTy *II) : I(II, false) { assert(II); } CallSiteBase(ValTy *II) { *this = get(II); } @@ -160,6 +160,17 @@ public: /// FunTy *getCaller() const { return (*this)->getParent()->getParent(); } + /// \brief Tests if this call site must be tail call optimized. Only a + /// CallInst can be tail call optimized. + bool isMustTailCall() const { + return isCall() && cast(getInstruction())->isMustTailCall(); + } + + /// \brief Tests if this call site is marked as a tail call. 
+ bool isTailCall() const { + return isCall() && cast(getInstruction())->isTailCall(); + } + #define CALLSITE_DELEGATE_GETTER(METHOD) \ InstrTy *II = getInstruction(); \ return isCall() \ diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h index af44e8a..1eaf4f7 100644 --- a/include/llvm/IR/CallingConv.h +++ b/include/llvm/IR/CallingConv.h @@ -137,13 +137,7 @@ namespace CallingConv { /// convention differs from the more common \c X86_64_SysV convention /// in a number of ways, most notably in that XMM registers used to pass /// arguments are shadowed by GPRs, and vice versa. - X86_64_Win64 = 79, - - /// \brief The calling convention used for __cdecl methods on win32. - /// Differs from the C calling convention only in that the order of the - /// first parameter and the sret parameter are swapped. - X86_CDeclMethod = 80 - + X86_64_Win64 = 79 }; } // End CallingConv namespace diff --git a/include/llvm/IR/ConstantRange.h b/include/llvm/IR/ConstantRange.h index 86988de..342422c 100644 --- a/include/llvm/IR/ConstantRange.h +++ b/include/llvm/IR/ConstantRange.h @@ -114,12 +114,12 @@ public: const APInt *getSingleElement() const { if (Upper == Lower + 1) return &Lower; - return 0; + return nullptr; } /// isSingleElement - Return true if this set contains exactly one member. /// - bool isSingleElement() const { return getSingleElement() != 0; } + bool isSingleElement() const { return getSingleElement() != nullptr; } /// getSetSize - Return the number of elements in this set. /// diff --git a/include/llvm/IR/Constants.h b/include/llvm/IR/Constants.h index ed7a70f..0e72f04 100644 --- a/include/llvm/IR/Constants.h +++ b/include/llvm/IR/Constants.h @@ -299,7 +299,7 @@ class ConstantAggregateZero : public Constant { ConstantAggregateZero(const ConstantAggregateZero &) LLVM_DELETED_FUNCTION; protected: explicit ConstantAggregateZero(Type *ty) - : Constant(ty, ConstantAggregateZeroVal, 0, 0) {} + : Constant(ty, ConstantAggregateZeroVal, nullptr, 0) {} protected: // allocate space for exactly zero operands void *operator new(size_t s) { @@ -486,7 +486,7 @@ class ConstantPointerNull : public Constant { protected: explicit ConstantPointerNull(PointerType *T) : Constant(T, - Value::ConstantPointerNullVal, 0, 0) {} + Value::ConstantPointerNullVal, nullptr, 0) {} protected: // allocate space for exactly zero operands @@ -536,7 +536,7 @@ class ConstantDataSequential : public Constant { ConstantDataSequential(const ConstantDataSequential &) LLVM_DELETED_FUNCTION; protected: explicit ConstantDataSequential(Type *ty, ValueTy VT, const char *Data) - : Constant(ty, VT, 0, 0), DataElements(Data), Next(0) {} + : Constant(ty, VT, nullptr, 0), DataElements(Data), Next(nullptr) {} ~ConstantDataSequential() { delete Next; } static Constant *getImpl(StringRef Bytes, Type *Ty); @@ -1136,7 +1136,7 @@ class UndefValue : public Constant { void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION; UndefValue(const UndefValue &) LLVM_DELETED_FUNCTION; protected: - explicit UndefValue(Type *T) : Constant(T, UndefValueVal, 0, 0) {} + explicit UndefValue(Type *T) : Constant(T, UndefValueVal, nullptr, 0) {} protected: // allocate space for exactly zero operands void *operator new(size_t s) { diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h index 7d87a69..8b05bbb 100644 --- a/include/llvm/IR/DIBuilder.h +++ b/include/llvm/IR/DIBuilder.h @@ -78,7 +78,7 @@ namespace llvm { DITemplateValueParameter createTemplateValueParameter(unsigned Tag, DIDescriptor Scope, StringRef Name, DIType Ty, 
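The isMustTailCall()/isTailCall() helpers added to CallSite above return true only for a CallInst; an InvokeInst is never a tail call. A sketch of the typical query (the function name is illustrative):

    #include "llvm/IR/CallSite.h"
    #include "llvm/IR/Instruction.h"
    using namespace llvm;

    bool mustBeTailCallOptimized(Instruction *I) {
      CallSite CS(I); // null call site if I is neither a call nor an invoke
      return CS.getInstruction() && CS.isMustTailCall();
    }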
Value *Val, - MDNode *File = 0, unsigned LineNo = 0, + MDNode *File = nullptr, unsigned LineNo = 0, unsigned ColumnNo = 0); DIBuilder(const DIBuilder &) LLVM_DELETED_FUNCTION; @@ -293,7 +293,7 @@ namespace llvm { uint64_t OffsetInBits, unsigned Flags, DIType DerivedFrom, DIArray Elements, DIType VTableHolder = DIType(), - MDNode *TemplateParms = 0, + MDNode *TemplateParms = nullptr, StringRef UniqueIdentifier = StringRef()); /// createStructType - Create debugging information entry for a struct. @@ -342,7 +342,7 @@ namespace llvm { /// @param ColumnNo Column Number. DITemplateTypeParameter createTemplateTypeParameter(DIDescriptor Scope, StringRef Name, DIType Ty, - MDNode *File = 0, unsigned LineNo = 0, + MDNode *File = nullptr, unsigned LineNo = 0, unsigned ColumnNo = 0); /// createTemplateValueParameter - Create debugging information for template @@ -356,7 +356,7 @@ namespace llvm { /// @param ColumnNo Column Number. DITemplateValueParameter createTemplateValueParameter(DIDescriptor Scope, StringRef Name, - DIType Ty, Value *Val, MDNode *File = 0, + DIType Ty, Value *Val, MDNode *File = nullptr, unsigned LineNo = 0, unsigned ColumnNo = 0); /// \brief Create debugging information for a template template parameter. @@ -369,8 +369,9 @@ namespace llvm { /// @param ColumnNo Column Number. DITemplateValueParameter createTemplateTemplateParameter(DIDescriptor Scope, StringRef Name, - DIType Ty, StringRef Val, MDNode *File = 0, - unsigned LineNo = 0, unsigned ColumnNo = 0); + DIType Ty, StringRef Val, + MDNode *File = nullptr, unsigned LineNo = 0, + unsigned ColumnNo = 0); /// \brief Create debugging information for a template parameter pack. /// @param Scope Scope in which this type is defined. @@ -382,7 +383,7 @@ namespace llvm { /// @param ColumnNo Column Number. DITemplateValueParameter createTemplateParameterPack(DIDescriptor Scope, StringRef Name, - DIType Ty, DIArray Val, MDNode *File = 0, + DIType Ty, DIArray Val, MDNode *File = nullptr, unsigned LineNo = 0, unsigned ColumnNo = 0); /// createArrayType - Create debugging information entry for an array. @@ -433,7 +434,7 @@ namespace llvm { /// flag set. DIType createObjectPointerType(DIType Ty); - /// createForwardDecl - Create a temporary forward-declared type. + /// \brief Create a permanent forward-declared type. DICompositeType createForwardDecl(unsigned Tag, StringRef Name, DIDescriptor Scope, DIFile F, unsigned Line, unsigned RuntimeLang = 0, @@ -441,6 +442,12 @@ namespace llvm { uint64_t AlignInBits = 0, StringRef UniqueIdentifier = StringRef()); + /// \brief Create a temporary forward-declared type. + DICompositeType createReplaceableForwardDecl( + unsigned Tag, StringRef Name, DIDescriptor Scope, DIFile F, + unsigned Line, unsigned RuntimeLang = 0, uint64_t SizeInBits = 0, + uint64_t AlignInBits = 0, StringRef UniqueIdentifier = StringRef()); + /// retainType - Retain DIType in a module even if it is not referenced /// through debug info anchors. 
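With the split above, createForwardDecl now yields a permanent forward declaration, while callers that intend to replace the node with a full definition later use the new createReplaceableForwardDecl. A sketch (the struct name and arguments are illustrative):

    #include "llvm/IR/DIBuilder.h"
    #include "llvm/Support/Dwarf.h"
    using namespace llvm;

    DICompositeType declareOpaqueStruct(DIBuilder &DIB, DIDescriptor Scope,
                                        DIFile File, unsigned Line) {
      // Temporary node that can be replaced once the definition is known.
      return DIB.createReplaceableForwardDecl(dwarf::DW_TAG_structure_type,
                                              "MyStruct", Scope, File, Line);
    }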
void retainType(DIType T); @@ -498,7 +505,7 @@ namespace llvm { createStaticVariable(DIDescriptor Context, StringRef Name, StringRef LinkageName, DIFile File, unsigned LineNo, DITypeRef Ty, bool isLocalToUnit, llvm::Value *Val, - MDNode *Decl = NULL); + MDNode *Decl = nullptr); /// createLocalVariable - Create a new descriptor for the specified @@ -564,9 +571,9 @@ namespace llvm { unsigned ScopeLine, unsigned Flags = 0, bool isOptimized = false, - Function *Fn = 0, - MDNode *TParam = 0, - MDNode *Decl = 0); + Function *Fn = nullptr, + MDNode *TParam = nullptr, + MDNode *Decl = nullptr); /// FIXME: this is added for dragonegg. Once we update dragonegg /// to call resolve function, this will be removed. @@ -578,9 +585,9 @@ namespace llvm { unsigned ScopeLine, unsigned Flags = 0, bool isOptimized = false, - Function *Fn = 0, - MDNode *TParam = 0, - MDNode *Decl = 0); + Function *Fn = nullptr, + MDNode *TParam = nullptr, + MDNode *Decl = nullptr); /// createMethod - Create a new descriptor for the specified C++ method. /// See comments in DISubprogram for descriptions of these fields. @@ -610,8 +617,8 @@ namespace llvm { DIType VTableHolder = DIType(), unsigned Flags = 0, bool isOptimized = false, - Function *Fn = 0, - MDNode *TParam = 0); + Function *Fn = nullptr, + MDNode *TParam = nullptr); /// createNameSpace - This creates new descriptor for a namespace /// with the specified parent scope. @@ -647,24 +654,27 @@ namespace llvm { /// @param NS The namespace being imported here /// @param Line Line number DIImportedEntity createImportedModule(DIScope Context, DINameSpace NS, - unsigned Line, - StringRef Name = StringRef()); + unsigned Line); /// \brief Create a descriptor for an imported module. /// @param Context The scope this module is imported into /// @param NS An aliased namespace /// @param Line Line number DIImportedEntity createImportedModule(DIScope Context, DIImportedEntity NS, - unsigned Line, StringRef Name); + unsigned Line); /// \brief Create a descriptor for an imported function. /// @param Context The scope this module is imported into /// @param Decl The declaration (or definition) of a function, type, or /// variable /// @param Line Line number + DIImportedEntity createImportedDeclaration(DIScope Context, DIScope Decl, + unsigned Line, + StringRef Name = StringRef()); DIImportedEntity createImportedDeclaration(DIScope Context, - DIScope Decl, - unsigned Line); + DIImportedEntity NS, + unsigned Line, + StringRef Name = StringRef()); /// insertDeclare - Insert a new llvm.dbg.declare intrinsic call. /// @param Storage llvm::Value of the variable diff --git a/include/llvm/IR/DataLayout.h b/include/llvm/IR/DataLayout.h index 59dca63..3079dec 100644 --- a/include/llvm/IR/DataLayout.h +++ b/include/llvm/IR/DataLayout.h @@ -27,6 +27,9 @@ #include "llvm/Pass.h" #include "llvm/Support/DataTypes.h" +// this needs to be outside of the namespace, to avoid conflict with llvm-c decl +typedef struct LLVMOpaqueTargetData *LLVMTargetDataRef; + namespace llvm { class Value; @@ -174,14 +177,14 @@ private: public: /// Constructs a DataLayout from a specification string. See reset(). - explicit DataLayout(StringRef LayoutDescription) : LayoutMap(0) { + explicit DataLayout(StringRef LayoutDescription) : LayoutMap(nullptr) { reset(LayoutDescription); } /// Initialize target data from properties stored in the module. 
explicit DataLayout(const Module *M); - DataLayout(const DataLayout &DL) : LayoutMap(0) { *this = DL; } + DataLayout(const DataLayout &DL) : LayoutMap(nullptr) { *this = DL; } DataLayout &operator=(const DataLayout &DL) { clear(); @@ -408,7 +411,7 @@ public: /// none are set. Type *getLargestLegalIntType(LLVMContext &C) const { unsigned LargestSize = getLargestLegalIntTypeSize(); - return (LargestSize == 0) ? 0 : Type::getIntNTy(C, LargestSize); + return (LargestSize == 0) ? nullptr : Type::getIntNTy(C, LargestSize); } /// getLargestLegalIntType - Return the size of largest legal integer type @@ -445,6 +448,14 @@ public: } }; +inline DataLayout *unwrap(LLVMTargetDataRef P) { + return reinterpret_cast<DataLayout *>(P); +} + +inline LLVMTargetDataRef wrap(const DataLayout *P) { + return reinterpret_cast<LLVMTargetDataRef>(const_cast<DataLayout *>(P)); +} + class DataLayoutPass : public ImmutablePass { DataLayout DL; diff --git a/include/llvm/IR/DebugInfo.h b/include/llvm/IR/DebugInfo.h index f7244b8..65e0a06 100644 --- a/include/llvm/IR/DebugInfo.h +++ b/include/llvm/IR/DebugInfo.h @@ -104,7 +104,7 @@ protected: void replaceFunctionField(unsigned Elt, Function *F); public: - explicit DIDescriptor(const MDNode *N = 0) : DbgNode(N) {} + explicit DIDescriptor(const MDNode *N = nullptr) : DbgNode(N) {} bool Verify() const; @@ -116,7 +116,7 @@ public: // FIXME: This operator bool isn't actually protecting anything at the // moment due to the conversion operator above making DIDescriptor nodes // implicitly convertible to bool. - LLVM_EXPLICIT operator bool() const { return DbgNode != 0; } + LLVM_EXPLICIT operator bool() const { return DbgNode != nullptr; } bool operator==(DIDescriptor Other) const { return DbgNode == Other.DbgNode; } bool operator!=(DIDescriptor Other) const { return !operator==(Other); } @@ -159,7 +159,7 @@ class DISubrange : public DIDescriptor { void printInternal(raw_ostream &OS) const; public: - explicit DISubrange(const MDNode *N = 0) : DIDescriptor(N) {} + explicit DISubrange(const MDNode *N = nullptr) : DIDescriptor(N) {} int64_t getLo() const { return getInt64Field(1); } int64_t getCount() const { return getInt64Field(2); } @@ -169,7 +169,7 @@ public: /// DIArray - This descriptor holds an array of descriptors. class DIArray : public DIDescriptor { public: - explicit DIArray(const MDNode *N = 0) : DIDescriptor(N) {} + explicit DIArray(const MDNode *N = nullptr) : DIDescriptor(N) {} unsigned getNumElements() const; DIDescriptor getElement(unsigned Idx) const { @@ -185,7 +185,7 @@ class DIEnumerator : public DIDescriptor { void printInternal(raw_ostream &OS) const; public: - explicit DIEnumerator(const MDNode *N = 0) : DIDescriptor(N) {} + explicit DIEnumerator(const MDNode *N = nullptr) : DIDescriptor(N) {} StringRef getName() const { return getStringField(1); } int64_t getEnumValue() const { return getInt64Field(2); } @@ -210,7 +210,7 @@ protected: void printInternal(raw_ostream &OS) const; public: - explicit DIScope(const MDNode *N = 0) : DIDescriptor(N) {} + explicit DIScope(const MDNode *N = nullptr) : DIDescriptor(N) {} /// Gets the parent scope for this scope node or returns a /// default constructed scope.
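The wrap/unwrap pair above bridges the C API's LLVMTargetDataRef to the C++ DataLayout; a sketch of a C-binding implementation built on it (the function name is illustrative):

    #include "llvm/IR/DataLayout.h"
    using namespace llvm;

    unsigned pointerSizeViaCRef(LLVMTargetDataRef TD) {
      const DataLayout *DL = unwrap(TD); // just a reinterpret_cast underneath
      return DL->getPointerSize();
    }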
@@ -292,7 +292,7 @@ protected: void printInternal(raw_ostream &OS) const; public: - explicit DIType(const MDNode *N = 0) : DIScope(N) {} + explicit DIType(const MDNode *N = nullptr) : DIScope(N) {} operator DITypeRef () const { assert(isType() && "constructing DITypeRef from an MDNode that is not a type"); @@ -339,14 +339,14 @@ public: /// replaceAllUsesWith - Replace all uses of debug info referenced by /// this descriptor. - void replaceAllUsesWith(DIDescriptor &D); + void replaceAllUsesWith(LLVMContext &VMContext, DIDescriptor D); void replaceAllUsesWith(MDNode *D); }; /// DIBasicType - A basic type, like 'int' or 'float'. class DIBasicType : public DIType { public: - explicit DIBasicType(const MDNode *N = 0) : DIType(N) {} + explicit DIBasicType(const MDNode *N = nullptr) : DIType(N) {} unsigned getEncoding() const { return getUnsignedField(9); } @@ -362,7 +362,7 @@ class DIDerivedType : public DIType { void printInternal(raw_ostream &OS) const; public: - explicit DIDerivedType(const MDNode *N = 0) : DIType(N) {} + explicit DIDerivedType(const MDNode *N = nullptr) : DIType(N) {} DITypeRef getTypeDerivedFrom() const { return getFieldAs(9); } @@ -395,7 +395,7 @@ class DICompositeType : public DIDerivedType { void printInternal(raw_ostream &OS) const; public: - explicit DICompositeType(const MDNode *N = 0) : DIDerivedType(N) {} + explicit DICompositeType(const MDNode *N = nullptr) : DIDerivedType(N) {} DIArray getTypeArray() const { return getFieldAs(10); } void setTypeArray(DIArray Elements, DIArray TParams = DIArray()); @@ -414,7 +414,7 @@ class DIFile : public DIScope { friend class DIDescriptor; public: - explicit DIFile(const MDNode *N = 0) : DIScope(N) {} + explicit DIFile(const MDNode *N = nullptr) : DIScope(N) {} MDNode *getFileNode() const; bool Verify() const; }; @@ -425,9 +425,11 @@ class DICompileUnit : public DIScope { void printInternal(raw_ostream &OS) const; public: - explicit DICompileUnit(const MDNode *N = 0) : DIScope(N) {} + explicit DICompileUnit(const MDNode *N = nullptr) : DIScope(N) {} - unsigned getLanguage() const { return getUnsignedField(2); } + dwarf::SourceLanguage getLanguage() const { + return static_cast(getUnsignedField(2)); + } StringRef getProducer() const { return getStringField(3); } bool isOptimized() const { return getUnsignedField(4) != 0; } @@ -453,7 +455,7 @@ class DISubprogram : public DIScope { void printInternal(raw_ostream &OS) const; public: - explicit DISubprogram(const MDNode *N = 0) : DIScope(N) {} + explicit DISubprogram(const MDNode *N = nullptr) : DIScope(N) {} DIScopeRef getContext() const { return getFieldAs(2); } StringRef getName() const { return getStringField(3); } @@ -532,7 +534,7 @@ public: /// DILexicalBlock - This is a wrapper for a lexical block. class DILexicalBlock : public DIScope { public: - explicit DILexicalBlock(const MDNode *N = 0) : DIScope(N) {} + explicit DILexicalBlock(const MDNode *N = nullptr) : DIScope(N) {} DIScope getContext() const { return getFieldAs(2); } unsigned getLineNumber() const { return getUnsignedField(3); } unsigned getColumnNumber() const { return getUnsignedField(4); } @@ -544,7 +546,7 @@ public: /// a filename change. 
class DILexicalBlockFile : public DIScope { public: - explicit DILexicalBlockFile(const MDNode *N = 0) : DIScope(N) {} + explicit DILexicalBlockFile(const MDNode *N = nullptr) : DIScope(N) {} DIScope getContext() const { if (getScope().isSubprogram()) return getScope(); @@ -562,7 +564,7 @@ class DINameSpace : public DIScope { void printInternal(raw_ostream &OS) const; public: - explicit DINameSpace(const MDNode *N = 0) : DIScope(N) {} + explicit DINameSpace(const MDNode *N = nullptr) : DIScope(N) {} DIScope getContext() const { return getFieldAs(2); } StringRef getName() const { return getStringField(3); } unsigned getLineNumber() const { return getUnsignedField(4); } @@ -572,14 +574,16 @@ public: /// DIUnspecifiedParameter - This is a wrapper for unspecified parameters. class DIUnspecifiedParameter : public DIDescriptor { public: - explicit DIUnspecifiedParameter(const MDNode *N = 0) : DIDescriptor(N) {} + explicit DIUnspecifiedParameter(const MDNode *N = nullptr) + : DIDescriptor(N) {} bool Verify() const; }; /// DITemplateTypeParameter - This is a wrapper for template type parameter. class DITemplateTypeParameter : public DIDescriptor { public: - explicit DITemplateTypeParameter(const MDNode *N = 0) : DIDescriptor(N) {} + explicit DITemplateTypeParameter(const MDNode *N = nullptr) + : DIDescriptor(N) {} DIScopeRef getContext() const { return getFieldAs(1); } StringRef getName() const { return getStringField(2); } @@ -596,7 +600,8 @@ public: /// DITemplateValueParameter - This is a wrapper for template value parameter. class DITemplateValueParameter : public DIDescriptor { public: - explicit DITemplateValueParameter(const MDNode *N = 0) : DIDescriptor(N) {} + explicit DITemplateValueParameter(const MDNode *N = nullptr) + : DIDescriptor(N) {} DIScopeRef getContext() const { return getFieldAs(1); } StringRef getName() const { return getStringField(2); } @@ -617,7 +622,7 @@ class DIGlobalVariable : public DIDescriptor { void printInternal(raw_ostream &OS) const; public: - explicit DIGlobalVariable(const MDNode *N = 0) : DIDescriptor(N) {} + explicit DIGlobalVariable(const MDNode *N = nullptr) : DIDescriptor(N) {} DIScope getContext() const { return getFieldAs(2); } StringRef getName() const { return getStringField(3); } @@ -650,7 +655,7 @@ class DIVariable : public DIDescriptor { void printInternal(raw_ostream &OS) const; public: - explicit DIVariable(const MDNode *N = 0) : DIDescriptor(N) {} + explicit DIVariable(const MDNode *N = nullptr) : DIDescriptor(N) {} DIScope getContext() const { return getFieldAs(1); } StringRef getName() const { return getStringField(2); } diff --git a/include/llvm/IR/DebugLoc.h b/include/llvm/IR/DebugLoc.h index 50b5d54..6d769d4 100644 --- a/include/llvm/IR/DebugLoc.h +++ b/include/llvm/IR/DebugLoc.h @@ -21,6 +21,7 @@ namespace llvm { template struct DenseMapInfo; class MDNode; class LLVMContext; + class raw_ostream; /// DebugLoc - Debug location id. This is carried by Instruction, SDNode, /// and MachineInstr to compactly encode file/line/scope information for an @@ -58,7 +59,7 @@ namespace llvm { /// get - Get a new DebugLoc that corresponds to the specified line/col /// scope/inline location. static DebugLoc get(unsigned Line, unsigned Col, - MDNode *Scope, MDNode *InlinedAt = 0); + MDNode *Scope, MDNode *InlinedAt = nullptr); /// getFromDILocation - Translate the DILocation quad into a DebugLoc. 
static DebugLoc getFromDILocation(MDNode *N); @@ -106,6 +107,8 @@ namespace llvm { bool operator!=(const DebugLoc &DL) const { return !(*this == DL); } void dump(const LLVMContext &Ctx) const; + /// \brief prints source location /path/to/file.exe:line:col @[inlined at] + void print(const LLVMContext &Ctx, raw_ostream &OS) const; }; template <> diff --git a/include/llvm/IR/DerivedTypes.h b/include/llvm/IR/DerivedTypes.h index 71d9973..ff15087 100644 --- a/include/llvm/IR/DerivedTypes.h +++ b/include/llvm/IR/DerivedTypes.h @@ -188,7 +188,7 @@ class StructType : public CompositeType { StructType(const StructType &) LLVM_DELETED_FUNCTION; const StructType &operator=(const StructType &) LLVM_DELETED_FUNCTION; StructType(LLVMContext &C) - : CompositeType(C, StructTyID), SymbolTableEntry(0) {} + : CompositeType(C, StructTyID), SymbolTableEntry(nullptr) {} enum { /// This is the contents of the SubClassData field. SCDB_HasBody = 1, @@ -249,10 +249,10 @@ public: bool isOpaque() const { return (getSubclassData() & SCDB_HasBody) == 0; } /// isSized - Return true if this is a sized type. - bool isSized(SmallPtrSet *Visited = 0) const; + bool isSized(SmallPtrSet *Visited = nullptr) const; /// hasName - Return true if this is a named struct that has a non-empty name. - bool hasName() const { return SymbolTableEntry != 0; } + bool hasName() const { return SymbolTableEntry != nullptr; } /// getName - Return the name for this struct type if it has an identity. /// This may return an empty string for an unnamed struct type. Do not call diff --git a/include/llvm/IR/DiagnosticInfo.h b/include/llvm/IR/DiagnosticInfo.h index 49eb1b0..e78a42b 100644 --- a/include/llvm/IR/DiagnosticInfo.h +++ b/include/llvm/IR/DiagnosticInfo.h @@ -15,7 +15,9 @@ #ifndef LLVM_SUPPORT_DIAGNOSTICINFO_H #define LLVM_SUPPORT_DIAGNOSTICINFO_H +#include "llvm-c/Core.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/Support/Casting.h" namespace llvm { @@ -24,8 +26,10 @@ namespace llvm { class DiagnosticPrinter; class Function; class Instruction; +class LLVMContextImpl; class Twine; class Value; +class DebugLoc; /// \brief Defines the different supported severity of a diagnostic. enum DiagnosticSeverity { @@ -44,6 +48,9 @@ enum DiagnosticKind { DK_StackSize, DK_DebugMetadataVersion, DK_SampleProfile, + DK_OptimizationRemark, + DK_OptimizationRemarkMissed, + DK_OptimizationRemarkAnalysis, DK_FirstPluginKind }; @@ -105,7 +112,7 @@ public: DiagnosticInfoInlineAsm(const Twine &MsgStr, DiagnosticSeverity Severity = DS_Error) : DiagnosticInfo(DK_InlineAsm, Severity), LocCookie(0), MsgStr(MsgStr), - Instr(NULL) {} + Instr(nullptr) {} /// \p LocCookie if non-zero gives the line number for this report. /// \p MsgStr gives the message. @@ -114,7 +121,7 @@ public: DiagnosticInfoInlineAsm(unsigned LocCookie, const Twine &MsgStr, DiagnosticSeverity Severity = DS_Error) : DiagnosticInfo(DK_InlineAsm, Severity), LocCookie(LocCookie), - MsgStr(MsgStr), Instr(NULL) {} + MsgStr(MsgStr), Instr(nullptr) {} /// \p Instr gives the original instruction that triggered the diagnostic. /// \p MsgStr gives the message. @@ -208,7 +215,7 @@ public: LineNum(0), Msg(Msg) {} DiagnosticInfoSampleProfile(const Twine &Msg, DiagnosticSeverity Severity = DS_Error) - : DiagnosticInfo(DK_SampleProfile, Severity), FileName(NULL), + : DiagnosticInfo(DK_SampleProfile, Severity), FileName(nullptr), LineNum(0), Msg(Msg) {} /// \see DiagnosticInfo::print. @@ -227,7 +234,7 @@ private: /// Name of the input file associated with this diagnostic. 
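The DebugLoc::print() declared above emits the "path:line:col" form directly to a stream; a sketch (dumpLoc is an illustrative name):

    #include "llvm/IR/DebugLoc.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void dumpLoc(const LLVMContext &Ctx, const DebugLoc &DL) {
      DL.print(Ctx, errs()); // "/path/to/file:line:col", plus any inlined-at chain
      errs() << "\n";
    }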
const char *FileName; - /// Line number where the diagnostic occured. If 0, no line number will + /// Line number where the diagnostic occurred. If 0, no line number will /// be emitted in the message. unsigned LineNum; @@ -235,6 +242,183 @@ private: const Twine &Msg; }; +/// Common features for diagnostics dealing with optimization remarks. +class DiagnosticInfoOptimizationRemarkBase : public DiagnosticInfo { +public: + /// \p PassName is the name of the pass emitting this diagnostic. + /// \p Fn is the function where the diagnostic is being emitted. \p DLoc is + /// the location information to use in the diagnostic. If line table + /// information is available, the diagnostic will include the source code + /// location. \p Msg is the message to show. Note that this class does not + /// copy this message, so this reference must be valid for the whole lifetime + /// of the diagnostic. + DiagnosticInfoOptimizationRemarkBase(enum DiagnosticKind Kind, + const char *PassName, const Function &Fn, + const DebugLoc &DLoc, const Twine &Msg) + : DiagnosticInfo(Kind, DS_Remark), PassName(PassName), Fn(Fn), DLoc(DLoc), + Msg(Msg) {} + + /// \see DiagnosticInfo::print. + void print(DiagnosticPrinter &DP) const override; + + /// Hand rolled RTTI. + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == DK_OptimizationRemark; + } + + /// Return true if this optimization remark is enabled by one of + /// the LLVM command line flags (-pass-remarks, -pass-remarks-missed, + /// or -pass-remarks-analysis). Note that this only handles the LLVM + /// flags. We cannot access Clang flags from here (they are handled + /// in BackendConsumer::OptimizationRemarkHandler). + virtual bool isEnabled() const = 0; + + /// Return true if location information is available for this diagnostic. + bool isLocationAvailable() const; + + /// Return a string with the location information for this diagnostic + /// in the format "file:line:col". If location information is not available, + /// it returns ":0:0". + const std::string getLocationStr() const; + + /// Return location information for this diagnostic in three parts: + /// the source file name, line number and column. + void getLocation(StringRef *Filename, unsigned *Line, unsigned *Column) const; + + const char *getPassName() const { return PassName; } + const Function &getFunction() const { return Fn; } + const DebugLoc &getDebugLoc() const { return DLoc; } + const Twine &getMsg() const { return Msg; } + +private: + /// Name of the pass that triggers this report. If this matches the + /// regular expression given in -Rpass=regexp, then the remark will + /// be emitted. + const char *PassName; + + /// Function where this diagnostic is triggered. + const Function &Fn; + + /// Debug location where this diagnostic is triggered. + DebugLoc DLoc; + + /// Message to report. + const Twine &Msg; +}; + +/// Diagnostic information for applied optimization remarks. +class DiagnosticInfoOptimizationRemark + : public DiagnosticInfoOptimizationRemarkBase { +public: + /// \p PassName is the name of the pass emitting this diagnostic. If + /// this name matches the regular expression given in -Rpass=, then the + /// diagnostic will be emitted. \p Fn is the function where the diagnostic + /// is being emitted. \p DLoc is the location information to use in the + /// diagnostic. If line table information is available, the diagnostic + /// will include the source code location. \p Msg is the message to show.
+ /// Note that this class does not copy this message, so this reference + /// must be valid for the whole lifetime of the diagnostic. + DiagnosticInfoOptimizationRemark(const char *PassName, const Function &Fn, + const DebugLoc &DLoc, const Twine &Msg) + : DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemark, PassName, + Fn, DLoc, Msg) {} + + /// Hand rolled RTTI + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == DK_OptimizationRemark; + } + + /// \see DiagnosticInfoOptimizationRemarkBase::isEnabled. + virtual bool isEnabled() const override; +}; + +/// Diagnostic information for missed-optimization remarks. +class DiagnosticInfoOptimizationRemarkMissed + : public DiagnosticInfoOptimizationRemarkBase { +public: + /// \p PassName is the name of the pass emitting this diagnostic. If + /// this name matches the regular expression given in -Rpass-missed=, then the + /// diagnostic will be emitted. \p Fn is the function where the diagnostic + /// is being emitted. \p DLoc is the location information to use in the + /// diagnostic. If line table information is available, the diagnostic + /// will include the source code location. \p Msg is the message to show. + /// Note that this class does not copy this message, so this reference + /// must be valid for the whole lifetime of the diagnostic. + DiagnosticInfoOptimizationRemarkMissed(const char *PassName, + const Function &Fn, + const DebugLoc &DLoc, const Twine &Msg) + : DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemarkMissed, + PassName, Fn, DLoc, Msg) {} + + /// Hand rolled RTTI + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == DK_OptimizationRemarkMissed; + } + + /// \see DiagnosticInfoOptimizationRemarkBase::isEnabled. + virtual bool isEnabled() const override; +}; + +/// Diagnostic information for optimization analysis remarks. +class DiagnosticInfoOptimizationRemarkAnalysis + : public DiagnosticInfoOptimizationRemarkBase { +public: + /// \p PassName is the name of the pass emitting this diagnostic. If + /// this name matches the regular expression given in -Rpass-analysis=, then + /// the diagnostic will be emitted. \p Fn is the function where the diagnostic + /// is being emitted. \p DLoc is the location information to use in the + /// diagnostic. If line table information is available, the diagnostic will + /// include the source code location. \p Msg is the message to show. Note that + /// this class does not copy this message, so this reference must be valid for + /// the whole lifetime of the diagnostic. + DiagnosticInfoOptimizationRemarkAnalysis(const char *PassName, + const Function &Fn, + const DebugLoc &DLoc, + const Twine &Msg) + : DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemarkAnalysis, + PassName, Fn, DLoc, Msg) {} + + /// Hand rolled RTTI + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == DK_OptimizationRemarkAnalysis; + } + + /// \see DiagnosticInfoOptimizationRemarkBase::isEnabled. + virtual bool isEnabled() const override; +}; + +// Create wrappers for C Binding types (see CBindingWrapping.h). +DEFINE_SIMPLE_CONVERSION_FUNCTIONS(DiagnosticInfo, LLVMDiagnosticInfoRef) + +/// Emit an optimization-applied message. \p PassName is the name of the pass +/// emitting the message. If -Rpass= is given and \p PassName matches the +/// regular expression in -Rpass, then the remark will be emitted. \p Fn is +/// the function triggering the remark, \p DLoc is the debug location where +/// the diagnostic is generated.
\p Msg is the message string to use. +void emitOptimizationRemark(LLVMContext &Ctx, const char *PassName, + const Function &Fn, const DebugLoc &DLoc, + const Twine &Msg); + +/// Emit an optimization-missed message. \p PassName is the name of the +/// pass emitting the message. If -Rpass-missed= is given and \p PassName +/// matches the regular expression in -Rpass, then the remark will be +/// emitted. \p Fn is the function triggering the remark, \p DLoc is the +/// debug location where the diagnostic is generated. \p Msg is the +/// message string to use. +void emitOptimizationRemarkMissed(LLVMContext &Ctx, const char *PassName, + const Function &Fn, const DebugLoc &DLoc, + const Twine &Msg); + +/// Emit an optimization analysis remark message. \p PassName is the name of +/// the pass emitting the message. If -Rpass-analysis= is given and \p +/// PassName matches the regular expression in -Rpass, then the remark will be +/// emitted. \p Fn is the function triggering the remark, \p DLoc is the debug +/// location where the diagnostic is generated. \p Msg is the message string +/// to use. +void emitOptimizationRemarkAnalysis(LLVMContext &Ctx, const char *PassName, + const Function &Fn, const DebugLoc &DLoc, + const Twine &Msg); + } // End namespace llvm #endif diff --git a/include/llvm/IR/Dominators.h b/include/llvm/IR/Dominators.h index 86bbe39..3648202 100644 --- a/include/llvm/IR/Dominators.h +++ b/include/llvm/IR/Dominators.h @@ -182,7 +182,7 @@ public: void releaseMemory() override { DT.releaseMemory(); } - void print(raw_ostream &OS, const Module *M = 0) const override; + void print(raw_ostream &OS, const Module *M = nullptr) const override; }; } // End llvm namespace diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h index cb43bba..22444bd 100644 --- a/include/llvm/IR/Function.h +++ b/include/llvm/IR/Function.h @@ -23,7 +23,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallingConv.h" -#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalObject.h" #include "llvm/Support/Compiler.h" namespace llvm { @@ -68,8 +68,7 @@ private: mutable ilist_half_node Sentinel; }; -class Function : public GlobalValue, - public ilist_node { +class Function : public GlobalObject, public ilist_node { public: typedef iplist ArgumentListType; typedef iplist BasicBlockListType; @@ -123,11 +122,11 @@ private: /// the module. /// Function(FunctionType *Ty, LinkageTypes Linkage, - const Twine &N = "", Module *M = 0); + const Twine &N = "", Module *M = nullptr); public: static Function *Create(FunctionType *Ty, LinkageTypes Linkage, - const Twine &N = "", Module *M = 0) { + const Twine &N = "", Module *M = nullptr) { return new(0) Function(Ty, Linkage, N, M); } @@ -298,7 +297,8 @@ public: /// @brief Determine if the function returns a structure through first /// pointer argument. bool hasStructRetAttr() const { - return AttributeSets.hasAttribute(1, Attribute::StructRet); + return AttributeSets.hasAttribute(1, Attribute::StructRet) || + AttributeSets.hasAttribute(2, Attribute::StructRet); } /// @brief Determine if the parameter does not alias other parameters. @@ -483,7 +483,7 @@ public: /// other than direct calls or invokes to it, or blockaddress expressions. /// Optionally passes back an offending user for diagnostic purposes. 
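The three emitOptimizationRemark* entry points above are what passes call to report through this machinery; a sketch of reporting a successful unroll (the pass name, helper name, and message are illustrative):

    #include "llvm/ADT/Twine.h"
    #include "llvm/IR/DiagnosticInfo.h"
    #include "llvm/IR/Function.h"
    using namespace llvm;

    void reportUnrolled(Function &F, const DebugLoc &DLoc, unsigned Count) {
      // Shown when -pass-remarks (or clang's -Rpass=) matches "my-pass".
      emitOptimizationRemark(F.getContext(), "my-pass", F, DLoc,
                             Twine("unrolled loop ") + Twine(Count) + " times");
    }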
/// - bool hasAddressTaken(const User** = 0) const; + bool hasAddressTaken(const User** = nullptr) const; /// isDefTriviallyDead - Return true if it is trivially safe to remove /// this function definition from the module (because it isn't externally @@ -505,12 +505,12 @@ private: inline ValueSymbolTable * ilist_traits::getSymTab(Function *F) { - return F ? &F->getValueSymbolTable() : 0; + return F ? &F->getValueSymbolTable() : nullptr; } inline ValueSymbolTable * ilist_traits::getSymTab(Function *F) { - return F ? &F->getValueSymbolTable() : 0; + return F ? &F->getValueSymbolTable() : nullptr; } } // End llvm namespace diff --git a/include/llvm/IR/GVMaterializer.h b/include/llvm/IR/GVMaterializer.h index 6717bc8..dbe52bc 100644 --- a/include/llvm/IR/GVMaterializer.h +++ b/include/llvm/IR/GVMaterializer.h @@ -33,26 +33,26 @@ protected: public: virtual ~GVMaterializer(); - /// isMaterializable - True if GV can be materialized from whatever backing - /// store this GVMaterializer uses and has not been materialized yet. + /// True if GV can be materialized from whatever backing store this + /// GVMaterializer uses and has not been materialized yet. virtual bool isMaterializable(const GlobalValue *GV) const = 0; - /// isDematerializable - True if GV has been materialized and can be - /// dematerialized back to whatever backing store this GVMaterializer uses. + /// True if GV has been materialized and can be dematerialized back to + /// whatever backing store this GVMaterializer uses. virtual bool isDematerializable(const GlobalValue *GV) const = 0; - /// Materialize - make sure the given GlobalValue is fully read. + /// Make sure the given GlobalValue is fully read. /// virtual error_code Materialize(GlobalValue *GV) = 0; - /// Dematerialize - If the given GlobalValue is read in, and if the - /// GVMaterializer supports it, release the memory for the GV, and set it up - /// to be materialized lazily. If the Materializer doesn't support this - /// capability, this method is a noop. + /// If the given GlobalValue is read in, and if the GVMaterializer supports + /// it, release the memory for the GV, and set it up to be materialized + /// lazily. If the Materializer doesn't support this capability, this method + /// is a noop. /// virtual void Dematerialize(GlobalValue *) {} - /// MaterializeModule - make sure the entire Module has been completely read. + /// Make sure the entire Module has been completely read. 
/// virtual error_code MaterializeModule(Module *M) = 0; }; diff --git a/include/llvm/IR/GetElementPtrTypeIterator.h b/include/llvm/IR/GetElementPtrTypeIterator.h index f2722d6..dcf8e64 100644 --- a/include/llvm/IR/GetElementPtrTypeIterator.h +++ b/include/llvm/IR/GetElementPtrTypeIterator.h @@ -38,7 +38,7 @@ namespace llvm { } static generic_gep_type_iterator end(ItTy It) { generic_gep_type_iterator I; - I.CurTy = 0; + I.CurTy = nullptr; I.OpIt = It; return I; } @@ -69,7 +69,7 @@ namespace llvm { if (CompositeType *CT = dyn_cast<CompositeType>(CurTy)) { CurTy = CT->getTypeAtIndex(getOperand()); } else { - CurTy = 0; + CurTy = nullptr; } ++OpIt; return *this; diff --git a/include/llvm/IR/GlobalAlias.h b/include/llvm/IR/GlobalAlias.h index 2ca481a..d9f0b4a 100644 --- a/include/llvm/IR/GlobalAlias.h +++ b/include/llvm/IR/GlobalAlias.h @@ -33,15 +33,37 @@ class GlobalAlias : public GlobalValue, public ilist_node<GlobalAlias> { void setParent(Module *parent); + GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, + const Twine &Name, GlobalObject *Aliasee, Module *Parent); + public: // allocate space for exactly one operand void *operator new(size_t s) { return User::operator new(s, 1); } - /// GlobalAlias ctor - If a parent module is specified, the alias is - /// automatically inserted into the end of the specified module's alias list. - GlobalAlias(Type *Ty, LinkageTypes Linkage, const Twine &Name = "", - Constant* Aliasee = 0, Module *Parent = 0); + + /// If a parent module is specified, the alias is automatically inserted into + /// the end of the specified module's alias list. + static GlobalAlias *create(Type *Ty, unsigned AddressSpace, + LinkageTypes Linkage, const Twine &Name, + GlobalObject *Aliasee, Module *Parent); + + // Without the Aliasee. + static GlobalAlias *create(Type *Ty, unsigned AddressSpace, + LinkageTypes Linkage, const Twine &Name, + Module *Parent); + + // The module is taken from the Aliasee. + static GlobalAlias *create(Type *Ty, unsigned AddressSpace, + LinkageTypes Linkage, const Twine &Name, + GlobalObject *Aliasee); + + // Type, Parent and AddressSpace taken from the Aliasee. + static GlobalAlias *create(LinkageTypes Linkage, const Twine &Name, + GlobalObject *Aliasee); + + // Linkage, Type, Parent and AddressSpace taken from the Aliasee. + static GlobalAlias *create(const Twine &Name, GlobalObject *Aliasee); /// Provide fast operand accessors DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant); @@ -57,20 +79,13 @@ public: void eraseFromParent() override; /// set/getAliasee - These methods retrieve and set alias target. - void setAliasee(Constant *GV); - const Constant *getAliasee() const { - return getOperand(0); - } - Constant *getAliasee() { - return getOperand(0); + void setAliasee(GlobalObject *GO); + const GlobalObject *getAliasee() const { + return const_cast<GlobalAlias *>(this)->getAliasee(); } - /// This method tries to ultimately resolve the alias by going through the - /// aliasing chain and trying to find the very last global. Returns NULL if a - /// cycle was found.
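The GlobalAlias::create overloads above replace the old public constructor; the most compact one infers linkage, type, parent and address space from the aliasee. A sketch (the helper and alias names are illustrative):

    #include "llvm/IR/GlobalAlias.h"
    using namespace llvm;

    GlobalAlias *addAlias(GlobalObject *Target) {
      // Linkage, type, parent and address space all come from Target.
      return GlobalAlias::create("target_alias", Target);
    }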
- GlobalValue *getAliasedGlobal(); - const GlobalValue *getAliasedGlobal() const { - return const_cast(this)->getAliasedGlobal(); + GlobalObject *getAliasee() { + return cast_or_null(getOperand(0)); } static bool isValidLinkage(LinkageTypes L) { diff --git a/include/llvm/IR/GlobalObject.h b/include/llvm/IR/GlobalObject.h new file mode 100644 index 0000000..3bc8b85 --- /dev/null +++ b/include/llvm/IR/GlobalObject.h @@ -0,0 +1,58 @@ +//===-- llvm/GlobalObject.h - Class to represent a global object *- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This represents an independent object. That is, a function or a global +// variable, but not an alias. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_GLOBALOBJECT_H +#define LLVM_IR_GLOBALOBJECT_H + +#include "llvm/IR/Constant.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalValue.h" + +namespace llvm { + +class Module; + +class GlobalObject : public GlobalValue { + GlobalObject(const GlobalObject &) LLVM_DELETED_FUNCTION; + +protected: + GlobalObject(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps, + LinkageTypes Linkage, const Twine &Name) + : GlobalValue(Ty, VTy, Ops, NumOps, Linkage, Name) { + setGlobalValueSubClassData(0); + } + + std::string Section; // Section to emit this into, empty means default +public: + unsigned getAlignment() const { + return (1u << getGlobalValueSubClassData()) >> 1; + } + void setAlignment(unsigned Align); + + bool hasSection() const { return !getSection().empty(); } + const std::string &getSection() const { return Section; } + void setSection(StringRef S); + + void copyAttributesFrom(const GlobalValue *Src) override; + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static inline bool classof(const Value *V) { + return V->getValueID() == Value::FunctionVal || + V->getValueID() == Value::GlobalVariableVal; + } +}; + +} // End llvm namespace + +#endif diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h index 59c320d..10df372 100644 --- a/include/llvm/IR/GlobalValue.h +++ b/include/llvm/IR/GlobalValue.h @@ -59,11 +59,11 @@ public: }; protected: - GlobalValue(Type *ty, ValueTy vty, Use *Ops, unsigned NumOps, - LinkageTypes linkage, const Twine &Name) - : Constant(ty, vty, Ops, NumOps), Linkage(linkage), - Visibility(DefaultVisibility), Alignment(0), UnnamedAddr(0), - DllStorageClass(DefaultStorageClass), Parent(0) { + GlobalValue(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps, + LinkageTypes Linkage, const Twine &Name) + : Constant(Ty, VTy, Ops, NumOps), Linkage(Linkage), + Visibility(DefaultVisibility), UnnamedAddr(0), + DllStorageClass(DefaultStorageClass), Parent(nullptr) { setName(Name); } @@ -71,20 +71,29 @@ protected: // Linkage and Visibility from turning into negative values. LinkageTypes Linkage : 5; // The linkage of this global unsigned Visibility : 2; // The visibility style of this global - unsigned Alignment : 16; // Alignment of this symbol, must be power of two unsigned UnnamedAddr : 1; // This value's address is not significant unsigned DllStorageClass : 2; // DLL storage class + +private: + // Give subclasses access to what otherwise would be wasted padding. + // (22 + 2 + 1 + 2 + 5) == 32. 
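GlobalObject above keeps the alignment in the freed-up GlobalValue subclass-data bits as log2(alignment) + 1, which getAlignment() decodes with (1u << stored) >> 1. A standalone sketch of that round-trip arithmetic:

    // Stored 0 means "no alignment"; otherwise stored = log2(Align) + 1.
    unsigned decodeAlignment(unsigned Stored) {
      return (1u << Stored) >> 1; // 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 4, 4 -> 8, ...
    }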
+ unsigned SubClassData : 22; +protected: + unsigned getGlobalValueSubClassData() const { + return SubClassData; + } + void setGlobalValueSubClassData(unsigned V) { + assert(V < (1 << 22) && "It will not fit"); + SubClassData = V; + } + Module *Parent; // The containing module. - std::string Section; // Section to emit this into, empty mean default public: ~GlobalValue() { removeDeadConstantUsers(); // remove any dead constants using this. } - unsigned getAlignment() const { - return (1u << Alignment) >> 1; - } - void setAlignment(unsigned Align); + unsigned getAlignment() const; bool hasUnnamedAddr() const { return UnnamedAddr; } void setUnnamedAddr(bool Val) { UnnamedAddr = Val; } @@ -95,7 +104,11 @@ public: bool hasProtectedVisibility() const { return Visibility == ProtectedVisibility; } - void setVisibility(VisibilityTypes V) { Visibility = V; } + void setVisibility(VisibilityTypes V) { + assert((!hasLocalLinkage() || V == DefaultVisibility) && + "local linkage requires default visibility"); + Visibility = V; + } DLLStorageClassTypes getDLLStorageClass() const { return DLLStorageClassTypes(DllStorageClass); @@ -108,22 +121,10 @@ public: } void setDLLStorageClass(DLLStorageClassTypes C) { DllStorageClass = C; } - bool hasSection() const { return !Section.empty(); } - const std::string &getSection() const { return Section; } - void setSection(StringRef S) { - assert((getValueID() != Value::GlobalAliasVal || S.empty()) && - "GlobalAlias should not have a section!"); - Section = S; - } - - /// If the usage is empty (except transitively dead constants), then this - /// global value can be safely deleted since the destructor will - /// delete the dead constants as well. - /// @brief Determine if the usage of this global value is empty except - /// for transitively dead constants. - bool use_empty_except_constants(); + bool hasSection() const { return !getSection().empty(); } + const std::string &getSection() const; - /// getType - Global values are always pointers. + /// Global values are always pointers. inline PointerType *getType() const { return cast(User::getType()); } @@ -144,8 +145,14 @@ public: static bool isLinkOnceLinkage(LinkageTypes Linkage) { return Linkage == LinkOnceAnyLinkage || Linkage == LinkOnceODRLinkage; } + static bool isWeakAnyLinkage(LinkageTypes Linkage) { + return Linkage == WeakAnyLinkage; + } + static bool isWeakODRLinkage(LinkageTypes Linkage) { + return Linkage == WeakODRLinkage; + } static bool isWeakLinkage(LinkageTypes Linkage) { - return Linkage == WeakAnyLinkage || Linkage == WeakODRLinkage; + return isWeakAnyLinkage(Linkage) || isWeakODRLinkage(Linkage); } static bool isAppendingLinkage(LinkageTypes Linkage) { return Linkage == AppendingLinkage; @@ -166,24 +173,24 @@ public: return Linkage == CommonLinkage; } - /// isDiscardableIfUnused - Whether the definition of this global may be - /// discarded if it is not used in its compilation unit. + /// Whether the definition of this global may be discarded if it is not used + /// in its compilation unit. static bool isDiscardableIfUnused(LinkageTypes Linkage) { return isLinkOnceLinkage(Linkage) || isLocalLinkage(Linkage); } - /// mayBeOverridden - Whether the definition of this global may be replaced - /// by something non-equivalent at link time. For example, if a function has - /// weak linkage then the code defining it may be replaced by different code. + /// Whether the definition of this global may be replaced by something + /// non-equivalent at link time. 
For example, if a function has weak linkage + /// then the code defining it may be replaced by different code. static bool mayBeOverridden(LinkageTypes Linkage) { return Linkage == WeakAnyLinkage || Linkage == LinkOnceAnyLinkage || Linkage == CommonLinkage || Linkage == ExternalWeakLinkage; } - /// isWeakForLinker - Whether the definition of this global may be replaced at - /// link time. NB: Using this method outside of the code generators is almost - /// always a mistake: when working at the IR level use mayBeOverridden instead - /// as it knows about ODR semantics. + /// Whether the definition of this global may be replaced at link time. NB: + /// Using this method outside of the code generators is almost always a + /// mistake: when working at the IR level use mayBeOverridden instead as it + /// knows about ODR semantics. static bool isWeakForLinker(LinkageTypes Linkage) { return Linkage == AvailableExternallyLinkage || Linkage == WeakAnyLinkage || Linkage == WeakODRLinkage || Linkage == LinkOnceAnyLinkage || @@ -201,6 +208,12 @@ public: bool hasWeakLinkage() const { return isWeakLinkage(Linkage); } + bool hasWeakAnyLinkage() const { + return isWeakAnyLinkage(Linkage); + } + bool hasWeakODRLinkage() const { + return isWeakODRLinkage(Linkage); + } bool hasAppendingLinkage() const { return isAppendingLinkage(Linkage); } bool hasInternalLinkage() const { return isInternalLinkage(Linkage); } bool hasPrivateLinkage() const { return isPrivateLinkage(Linkage); } @@ -208,7 +221,11 @@ public: bool hasExternalWeakLinkage() const { return isExternalWeakLinkage(Linkage); } bool hasCommonLinkage() const { return isCommonLinkage(Linkage); } - void setLinkage(LinkageTypes LT) { Linkage = LT; } + void setLinkage(LinkageTypes LT) { + if (isLocalLinkage(LT)) + Visibility = DefaultVisibility; + Linkage = LT; + } LinkageTypes getLinkage() const { return Linkage; } bool isDiscardableIfUnused() const { @@ -219,13 +236,13 @@ public: bool isWeakForLinker() const { return isWeakForLinker(Linkage); } - /// copyAttributesFrom - copy all additional attributes (those not needed to - /// create a GlobalValue) from the GlobalValue Src to this one. + /// Copy all additional attributes (those not needed to create a GlobalValue) + /// from the GlobalValue Src to this one. virtual void copyAttributesFrom(const GlobalValue *Src); - /// getRealLinkageName - If special LLVM prefix that is used to inform the asm - /// printer to not emit usual symbol prefix before the symbol name is used - /// then return linkage name after skipping this special LLVM prefix. + /// If the name has the special LLVM prefix that tells the asm printer not to + /// emit the usual symbol prefix before the symbol name, return the linkage + /// name with this special LLVM prefix skipped. static StringRef getRealLinkageName(StringRef Name) { if (!Name.empty() && Name[0] == '\1') return Name.substr(1); @@ -238,24 +255,24 @@ public: /// BitcodeReader to load the Module. /// @{ - /// isMaterializable - If this function's Module is being lazily streamed in - /// functions from disk or some other source, this method can be used to check - /// to see if the function has been read in yet or not. + /// If this function's Module is being lazily streamed in functions from disk + /// or some other source, this method can be used to check to see if the + /// function has been read in yet or not.
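setLinkage and setVisibility above now jointly enforce that locally linked values keep default visibility; a sketch of the resulting usage pattern (internalize is an illustrative name):

    #include "llvm/IR/GlobalValue.h"
    using namespace llvm;

    void internalize(GlobalValue &GV) {
      GV.setLinkage(GlobalValue::InternalLinkage); // also resets to DefaultVisibility
      // GV.setVisibility(GlobalValue::HiddenVisibility); // would now assert
    }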
   bool isMaterializable() const;
 
-  /// isDematerializable - Returns true if this function was loaded from a
-  /// GVMaterializer that's still attached to its Module and that knows how to
-  /// dematerialize the function.
+  /// Returns true if this function was loaded from a GVMaterializer that's
+  /// still attached to its Module and that knows how to dematerialize the
+  /// function.
   bool isDematerializable() const;
 
-  /// Materialize - make sure this GlobalValue is fully read.  If the module is
-  /// corrupt, this returns true and fills in the optional string with
-  /// information about the problem.  If successful, this returns false.
-  bool Materialize(std::string *ErrInfo = 0);
+  /// Make sure this GlobalValue is fully read.  If the module is corrupt, this
+  /// returns true and fills in the optional string with information about the
+  /// problem.  If successful, this returns false.
+  bool Materialize(std::string *ErrInfo = nullptr);
 
-  /// Dematerialize - If this GlobalValue is read in, and if the GVMaterializer
-  /// supports it, release the memory for the function, and set it up to be
-  /// materialized lazily.  If !isDematerializable(), this method is a noop.
+  /// If this GlobalValue is read in, and if the GVMaterializer supports it,
+  /// release the memory for the function, and set it up to be materialized
+  /// lazily.  If !isDematerializable(), this method is a noop.
   void Dematerialize();
 
   /// @}
@@ -263,20 +280,18 @@ public:
   /// Override from Constant class.
   void destroyConstant() override;
 
-  /// isDeclaration - Return true if the primary definition of this global
-  /// value is outside of the current translation unit.
+  /// Return true if the primary definition of this global value is outside of
+  /// the current translation unit.
   bool isDeclaration() const;
 
-  /// removeFromParent - This method unlinks 'this' from the containing module,
-  /// but does not delete it.
+  /// This method unlinks 'this' from the containing module, but does not delete
+  /// it.
   virtual void removeFromParent() = 0;
 
-  /// eraseFromParent - This method unlinks 'this' from the containing module
-  /// and deletes it.
+  /// This method unlinks 'this' from the containing module and deletes it.
   virtual void eraseFromParent() = 0;
 
-  /// getParent - Get the module that this global value is contained inside
-  /// of...
+  /// Get the module that this global value is contained inside of...
   inline Module *getParent() { return Parent; }
   inline const Module *getParent() const { return Parent; }
 
diff --git a/include/llvm/IR/GlobalVariable.h b/include/llvm/IR/GlobalVariable.h
index a82740f..8cd4332 100644
--- a/include/llvm/IR/GlobalVariable.h
+++ b/include/llvm/IR/GlobalVariable.h
@@ -22,7 +22,7 @@
 
 #include "llvm/ADT/Twine.h"
 #include "llvm/ADT/ilist_node.h"
-#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalObject.h"
 #include "llvm/IR/OperandTraits.h"
 
 namespace llvm {
@@ -32,7 +32,7 @@ class Constant;
 template <typename ValueSubClass, typename ItemParentClass>
 class SymbolTableListTraits;
 
-class GlobalVariable : public GlobalValue, public ilist_node<GlobalVariable> {
+class GlobalVariable : public GlobalObject, public ilist_node<GlobalVariable> {
   friend class SymbolTableListTraits<GlobalVariable, Module>;
   void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
   void operator=(const GlobalVariable &) LLVM_DELETED_FUNCTION;
@@ -66,14 +66,14 @@ public:
   /// GlobalVariable ctor - If a parent module is specified, the global is
   /// automatically inserted into the end of the specified modules global list.
   GlobalVariable(Type *Ty, bool isConstant, LinkageTypes Linkage,
-                 Constant *Initializer = 0, const Twine &Name = "",
+                 Constant *Initializer = nullptr, const Twine &Name = "",
                  ThreadLocalMode = NotThreadLocal, unsigned AddressSpace = 0,
                  bool isExternallyInitialized = false);
   /// GlobalVariable ctor - This creates a global and inserts it before the
   /// specified other global.
   GlobalVariable(Module &M, Type *Ty, bool isConstant, LinkageTypes Linkage,
                  Constant *Initializer,
-                 const Twine &Name = "", GlobalVariable *InsertBefore = 0,
+                 const Twine &Name = "", GlobalVariable *InsertBefore = nullptr,
                  ThreadLocalMode = NotThreadLocal, unsigned AddressSpace = 0,
                  bool isExternallyInitialized = false);
 
diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h
index 79ee7b7..580d333 100644
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h
@@ -58,7 +58,7 @@ protected:
   FastMathFlags FMF;
 public:
 
-  IRBuilderBase(LLVMContext &context, MDNode *FPMathTag = 0)
+  IRBuilderBase(LLVMContext &context, MDNode *FPMathTag = nullptr)
     : Context(context), DefaultFPMathTag(FPMathTag), FMF() {
     ClearInsertionPoint();
   }
@@ -70,8 +70,8 @@ public:
   /// \brief Clear the insertion point: created instructions will not be
   /// inserted into a block.
   void ClearInsertionPoint() {
-    BB = 0;
-    InsertPt = 0;
+    BB = nullptr;
+    InsertPt = nullptr;
   }
 
   BasicBlock *GetInsertBlock() const { return BB; }
@@ -140,14 +140,14 @@ public:
   public:
     /// \brief Creates a new insertion point which doesn't point to anything.
-    InsertPoint() : Block(0) {}
+    InsertPoint() : Block(nullptr) {}
 
     /// \brief Creates a new insertion point at the given location.
     InsertPoint(BasicBlock *InsertBlock, BasicBlock::iterator InsertPoint)
       : Block(InsertBlock), Point(InsertPoint) {}
 
     /// \brief Returns true if this insert point is set.
-    bool isSet() const { return (Block != 0); }
+    bool isSet() const { return (Block != nullptr); }
 
     llvm::BasicBlock *getBlock() const { return Block; }
     llvm::BasicBlock::iterator getPoint() const { return Point; }
@@ -362,27 +362,27 @@ public:
   /// If the pointer isn't an i8*, it will be converted.  If a TBAA tag is
   /// specified, it will be added to the instruction.
   CallInst *CreateMemSet(Value *Ptr, Value *Val, uint64_t Size, unsigned Align,
-                         bool isVolatile = false, MDNode *TBAATag = 0) {
+                         bool isVolatile = false, MDNode *TBAATag = nullptr) {
     return CreateMemSet(Ptr, Val, getInt64(Size), Align, isVolatile, TBAATag);
   }
 
   CallInst *CreateMemSet(Value *Ptr, Value *Val, Value *Size, unsigned Align,
-                         bool isVolatile = false, MDNode *TBAATag = 0);
+                         bool isVolatile = false, MDNode *TBAATag = nullptr);
 
   /// \brief Create and insert a memcpy between the specified pointers.
   ///
   /// If the pointers aren't i8*, they will be converted.  If a TBAA tag is
   /// specified, it will be added to the instruction.
   CallInst *CreateMemCpy(Value *Dst, Value *Src, uint64_t Size, unsigned Align,
-                         bool isVolatile = false, MDNode *TBAATag = 0,
-                         MDNode *TBAAStructTag = 0) {
+                         bool isVolatile = false, MDNode *TBAATag = nullptr,
+                         MDNode *TBAAStructTag = nullptr) {
     return CreateMemCpy(Dst, Src, getInt64(Size), Align, isVolatile, TBAATag,
                         TBAAStructTag);
   }
 
   CallInst *CreateMemCpy(Value *Dst, Value *Src, Value *Size, unsigned Align,
-                         bool isVolatile = false, MDNode *TBAATag = 0,
-                         MDNode *TBAAStructTag = 0);
+                         bool isVolatile = false, MDNode *TBAATag = nullptr,
+                         MDNode *TBAAStructTag = nullptr);
 
   /// \brief Create and insert a memmove between the specified
   /// pointers.
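For illustration, a caller of the overloads above might look like the
following minimal sketch; only the default argument spelling changes in this
patch, so the call sites are unaffected. The function emitFill, the module
setup, and the operand values are assumptions for the example, not part of
the patch:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Function.h"

    using namespace llvm;

    // Sketch: zero-fill a buffer and copy it. The TBAA parameters are simply
    // left at their new nullptr defaults, matching the declarations above.
    void emitFill(Function &F, Value *Dst, Value *Src, uint64_t Size) {
      IRBuilder<> B(&F.getEntryBlock());
      B.CreateMemSet(Dst, B.getInt8(0), Size, /*Align=*/1);
      B.CreateMemCpy(Dst, Src, Size, /*Align=*/1);
    }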
@@ -390,22 +390,22 @@ public:
   /// If the pointers aren't i8*, they will be converted.  If a TBAA tag is
   /// specified, it will be added to the instruction.
   CallInst *CreateMemMove(Value *Dst, Value *Src, uint64_t Size, unsigned Align,
-                          bool isVolatile = false, MDNode *TBAATag = 0) {
+                          bool isVolatile = false, MDNode *TBAATag = nullptr) {
     return CreateMemMove(Dst, Src, getInt64(Size), Align, isVolatile, TBAATag);
   }
 
   CallInst *CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
-                          bool isVolatile = false, MDNode *TBAATag = 0);
+                          bool isVolatile = false, MDNode *TBAATag = nullptr);
 
   /// \brief Create a lifetime.start intrinsic.
   ///
   /// If the pointer isn't i8* it will be converted.
-  CallInst *CreateLifetimeStart(Value *Ptr, ConstantInt *Size = 0);
+  CallInst *CreateLifetimeStart(Value *Ptr, ConstantInt *Size = nullptr);
 
   /// \brief Create a lifetime.end intrinsic.
   ///
   /// If the pointer isn't i8* it will be converted.
-  CallInst *CreateLifetimeEnd(Value *Ptr, ConstantInt *Size = 0);
+  CallInst *CreateLifetimeEnd(Value *Ptr, ConstantInt *Size = nullptr);
 
 private:
   Value *getCastedInt8PtrValue(Value *Ptr);
@@ -433,43 +433,44 @@ class IRBuilder : public IRBuilderBase, public Inserter {
   T Folder;
 public:
   IRBuilder(LLVMContext &C, const T &F, const Inserter &I = Inserter(),
-            MDNode *FPMathTag = 0)
+            MDNode *FPMathTag = nullptr)
     : IRBuilderBase(C, FPMathTag), Inserter(I), Folder(F) {
   }
 
-  explicit IRBuilder(LLVMContext &C, MDNode *FPMathTag = 0)
+  explicit IRBuilder(LLVMContext &C, MDNode *FPMathTag = nullptr)
     : IRBuilderBase(C, FPMathTag), Folder() {
   }
 
-  explicit IRBuilder(BasicBlock *TheBB, const T &F, MDNode *FPMathTag = 0)
+  explicit IRBuilder(BasicBlock *TheBB, const T &F, MDNode *FPMathTag = nullptr)
     : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder(F) {
     SetInsertPoint(TheBB);
   }
 
-  explicit IRBuilder(BasicBlock *TheBB, MDNode *FPMathTag = 0)
+  explicit IRBuilder(BasicBlock *TheBB, MDNode *FPMathTag = nullptr)
     : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder() {
     SetInsertPoint(TheBB);
   }
 
-  explicit IRBuilder(Instruction *IP, MDNode *FPMathTag = 0)
+  explicit IRBuilder(Instruction *IP, MDNode *FPMathTag = nullptr)
     : IRBuilderBase(IP->getContext(), FPMathTag), Folder() {
     SetInsertPoint(IP);
     SetCurrentDebugLocation(IP->getDebugLoc());
   }
 
-  explicit IRBuilder(Use &U, MDNode *FPMathTag = 0)
+  explicit IRBuilder(Use &U, MDNode *FPMathTag = nullptr)
     : IRBuilderBase(U->getContext(), FPMathTag), Folder() {
     SetInsertPoint(U);
     SetCurrentDebugLocation(cast<Instruction>(U.getUser())->getDebugLoc());
   }
 
   IRBuilder(BasicBlock *TheBB, BasicBlock::iterator IP, const T& F,
-            MDNode *FPMathTag = 0)
+            MDNode *FPMathTag = nullptr)
     : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder(F) {
     SetInsertPoint(TheBB, IP);
   }
 
-  IRBuilder(BasicBlock *TheBB, BasicBlock::iterator IP, MDNode *FPMathTag = 0)
+  IRBuilder(BasicBlock *TheBB, BasicBlock::iterator IP,
+            MDNode *FPMathTag = nullptr)
     : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder() {
     SetInsertPoint(TheBB, IP);
   }
@@ -541,7 +542,7 @@ public:
   /// \brief Create a conditional 'br Cond, TrueDest, FalseDest'
   /// instruction.
   BranchInst *CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False,
-                           MDNode *BranchWeights = 0) {
+                           MDNode *BranchWeights = nullptr) {
     return Insert(addBranchWeights(BranchInst::Create(True, False, Cond),
                                    BranchWeights));
   }
@@ -550,7 +551,7 @@ public:
   /// and with a hint for the number of cases that will be added (for efficient
   /// allocation).
   SwitchInst *CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases = 10,
-                           MDNode *BranchWeights = 0) {
+                           MDNode *BranchWeights = nullptr) {
     return Insert(addBranchWeights(SwitchInst::Create(V, Dest, NumCases),
                                    BranchWeights));
   }
@@ -638,7 +639,7 @@ public:
     return CreateAdd(LHS, RHS, Name, true, false);
   }
   Value *CreateFAdd(Value *LHS, Value *RHS, const Twine &Name = "",
-                    MDNode *FPMathTag = 0) {
+                    MDNode *FPMathTag = nullptr) {
     if (Constant *LC = dyn_cast<Constant>(LHS))
       if (Constant *RC = dyn_cast<Constant>(RHS))
         return Insert(Folder.CreateFAdd(LC, RC), Name);
@@ -660,7 +661,7 @@ public:
     return CreateSub(LHS, RHS, Name, true, false);
   }
   Value *CreateFSub(Value *LHS, Value *RHS, const Twine &Name = "",
-                    MDNode *FPMathTag = 0) {
+                    MDNode *FPMathTag = nullptr) {
     if (Constant *LC = dyn_cast<Constant>(LHS))
       if (Constant *RC = dyn_cast<Constant>(RHS))
         return Insert(Folder.CreateFSub(LC, RC), Name);
@@ -682,7 +683,7 @@ public:
     return CreateMul(LHS, RHS, Name, true, false);
   }
   Value *CreateFMul(Value *LHS, Value *RHS, const Twine &Name = "",
-                    MDNode *FPMathTag = 0) {
+                    MDNode *FPMathTag = nullptr) {
     if (Constant *LC = dyn_cast<Constant>(LHS))
       if (Constant *RC = dyn_cast<Constant>(RHS))
         return Insert(Folder.CreateFMul(LC, RC), Name);
@@ -714,7 +715,7 @@ public:
     return CreateSDiv(LHS, RHS, Name, true);
   }
   Value *CreateFDiv(Value *LHS, Value *RHS, const Twine &Name = "",
-                    MDNode *FPMathTag = 0) {
+                    MDNode *FPMathTag = nullptr) {
     if (Constant *LC = dyn_cast<Constant>(LHS))
       if (Constant *RC = dyn_cast<Constant>(RHS))
         return Insert(Folder.CreateFDiv(LC, RC), Name);
@@ -734,7 +735,7 @@ public:
     return Insert(BinaryOperator::CreateSRem(LHS, RHS), Name);
   }
   Value *CreateFRem(Value *LHS, Value *RHS, const Twine &Name = "",
-                    MDNode *FPMathTag = 0) {
+                    MDNode *FPMathTag = nullptr) {
     if (Constant *LC = dyn_cast<Constant>(LHS))
       if (Constant *RC = dyn_cast<Constant>(RHS))
         return Insert(Folder.CreateFRem(LC, RC), Name);
@@ -844,7 +845,7 @@ public:
 
   Value *CreateBinOp(Instruction::BinaryOps Opc,
                      Value *LHS, Value *RHS, const Twine &Name = "",
-                     MDNode *FPMathTag = 0) {
+                     MDNode *FPMathTag = nullptr) {
     if (Constant *LC = dyn_cast<Constant>(LHS))
       if (Constant *RC = dyn_cast<Constant>(RHS))
         return Insert(Folder.CreateBinOp(Opc, LC, RC), Name);
@@ -869,7 +870,8 @@ public:
   Value *CreateNUWNeg(Value *V, const Twine &Name = "") {
     return CreateNeg(V, Name, true, false);
   }
-  Value *CreateFNeg(Value *V, const Twine &Name = "", MDNode *FPMathTag = 0) {
+  Value *CreateFNeg(Value *V, const Twine &Name = "",
+                    MDNode *FPMathTag = nullptr) {
     if (Constant *VC = dyn_cast<Constant>(V))
       return Insert(Folder.CreateFNeg(VC), Name);
     return Insert(AddFPMathAttributes(BinaryOperator::CreateFNeg(V),
@@ -885,7 +887,7 @@ public:
   // Instruction creation methods: Memory Instructions
   //===--------------------------------------------------------------------===//
 
-  AllocaInst *CreateAlloca(Type *Ty, Value *ArraySize = 0,
+  AllocaInst *CreateAlloca(Type *Ty, Value *ArraySize = nullptr,
                            const Twine &Name = "") {
     return Insert(new AllocaInst(Ty, ArraySize), Name);
  }
@@ -898,7 +900,7 @@ public:
     return Insert(new LoadInst(Ptr), Name);
   }
   LoadInst *CreateLoad(Value *Ptr, bool isVolatile, const Twine &Name = "") {
-    return Insert(new LoadInst(Ptr, 0, isVolatile), Name);
+    return Insert(new LoadInst(Ptr, nullptr, isVolatile), Name);
   }
   StoreInst *CreateStore(Value *Val, Value *Ptr, bool isVolatile = false) {
     return Insert(new StoreInst(Val, Ptr, isVolatile));
diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h
index e1a5130..a27859e 100644
--- a/include/llvm/IR/InstrTypes.h
+++ b/include/llvm/IR/InstrTypes.h
@@ -36,7 +36,7 @@ class TerminatorInst : public Instruction {
 protected:
   TerminatorInst(Type *Ty, Instruction::TermOps iType,
                  Use *Ops, unsigned NumOps,
-                 Instruction *InsertBefore = 0)
+                 Instruction *InsertBefore = nullptr)
     : Instruction(Ty, iType, Ops, NumOps, InsertBefore) {}
 
   TerminatorInst(Type *Ty, Instruction::TermOps iType,
@@ -91,7 +91,7 @@ class UnaryInstruction : public Instruction {
 protected:
   UnaryInstruction(Type *Ty, unsigned iType, Value *V,
-                   Instruction *IB = 0)
+                   Instruction *IB = nullptr)
     : Instruction(Ty, iType, &Op<0>(), 1, IB) {
     Op<0>() = V;
   }
@@ -160,7 +160,7 @@ public:
   ///
   static BinaryOperator *Create(BinaryOps Op, Value *S1, Value *S2,
                                 const Twine &Name = Twine(),
-                                Instruction *InsertBefore = 0);
+                                Instruction *InsertBefore = nullptr);
 
   /// Create() - Construct a binary instruction, given the opcode and the two
   /// operands.  Also automatically insert this instruction to the end of the
@@ -285,23 +285,23 @@ public:
   /// instructions out of SUB and XOR instructions.
   ///
   static BinaryOperator *CreateNeg(Value *Op, const Twine &Name = "",
-                                   Instruction *InsertBefore = 0);
+                                   Instruction *InsertBefore = nullptr);
   static BinaryOperator *CreateNeg(Value *Op, const Twine &Name,
                                    BasicBlock *InsertAtEnd);
   static BinaryOperator *CreateNSWNeg(Value *Op, const Twine &Name = "",
-                                      Instruction *InsertBefore = 0);
+                                      Instruction *InsertBefore = nullptr);
   static BinaryOperator *CreateNSWNeg(Value *Op, const Twine &Name,
                                       BasicBlock *InsertAtEnd);
   static BinaryOperator *CreateNUWNeg(Value *Op, const Twine &Name = "",
-                                      Instruction *InsertBefore = 0);
+                                      Instruction *InsertBefore = nullptr);
   static BinaryOperator *CreateNUWNeg(Value *Op, const Twine &Name,
                                       BasicBlock *InsertAtEnd);
   static BinaryOperator *CreateFNeg(Value *Op, const Twine &Name = "",
-                                    Instruction *InsertBefore = 0);
+                                    Instruction *InsertBefore = nullptr);
   static BinaryOperator *CreateFNeg(Value *Op, const Twine &Name,
                                     BasicBlock *InsertAtEnd);
   static BinaryOperator *CreateNot(Value *Op, const Twine &Name = "",
-                                   Instruction *InsertBefore = 0);
+                                   Instruction *InsertBefore = nullptr);
   static BinaryOperator *CreateNot(Value *Op, const Twine &Name,
                                    BasicBlock *InsertAtEnd);
 
@@ -389,7 +389,7 @@ class CastInst : public UnaryInstruction {
 protected:
   /// @brief Constructor with insert-before-instruction semantics for subclasses
   CastInst(Type *Ty, unsigned iType, Value *S,
-           const Twine &NameStr = "", Instruction *InsertBefore = 0)
+           const Twine &NameStr = "", Instruction *InsertBefore = nullptr)
     : UnaryInstruction(Ty, iType, S, InsertBefore) {
     setName(NameStr);
   }
@@ -411,7 +411,7 @@ public:
     Value *S,                     ///< The value to be casted (operand 0)
     Type *Ty,                     ///< The type to which cast should be made
     const Twine &Name = "",       ///< Name for the instruction
-    Instruction *InsertBefore = 0 ///< Place to insert the instruction
+    Instruction *InsertBefore = nullptr ///< Place to insert the instruction
   );
   /// Provides a way to construct any of the CastInst subclasses using an
   /// opcode instead of the subclass's constructor. The opcode must be in the
@@ -432,7 +432,7 @@ public:
     Value *S,                     ///< The value to be casted (operand 0)
     Type *Ty,                     ///< The type to which cast should be made
     const Twine &Name = "",       ///< Name for the instruction
-    Instruction *InsertBefore = 0 ///< Place to insert the instruction
+    Instruction *InsertBefore = nullptr ///< Place to insert the instruction
   );
 
   /// @brief Create a ZExt or BitCast cast instruction
@@ -448,7 +448,7 @@ public:
     Value *S,                     ///< The value to be casted (operand 0)
     Type *Ty,                     ///< The type to which cast should be made
     const Twine &Name = "",       ///< Name for the instruction
-    Instruction *InsertBefore = 0 ///< Place to insert the instruction
+    Instruction *InsertBefore = nullptr ///< Place to insert the instruction
   );
 
   /// @brief Create a SExt or BitCast cast instruction
@@ -472,7 +472,7 @@ public:
     Value *S,                     ///< The pointer value to be casted (operand 0)
     Type *Ty,                     ///< The type to which cast should be made
     const Twine &Name = "",       ///< Name for the instruction
-    Instruction *InsertBefore = 0 ///< Place to insert the instruction
+    Instruction *InsertBefore = nullptr ///< Place to insert the instruction
  );
 
   /// @brief Create a ZExt, BitCast, or Trunc for int -> int casts.
@@ -481,7 +481,7 @@ public:
     Value *S,                     ///< The value to be casted (operand 0)
     Type *Ty,                     ///< The type to which cast should be made
     bool isSigned,                ///< Whether to regard S as signed or not
     const Twine &Name = "",       ///< Name for the instruction
-    Instruction *InsertBefore = 0 ///< Place to insert the instruction
+    Instruction *InsertBefore = nullptr ///< Place to insert the instruction
   );
 
   /// @brief Create a ZExt, BitCast, or Trunc for int -> int casts.
@@ -498,7 +498,7 @@ public:
     Value *S,                     ///< The floating point value to be casted
     Type *Ty,                     ///< The floating point type to cast to
     const Twine &Name = "",       ///< Name for the instruction
-    Instruction *InsertBefore = 0 ///< Place to insert the instruction
+    Instruction *InsertBefore = nullptr ///< Place to insert the instruction
   );
 
   /// @brief Create an FPExt, BitCast, or FPTrunc for fp -> fp casts
@@ -514,7 +514,7 @@ public:
     Value *S,                     ///< The value to be casted (operand 0)
     Type *Ty,                     ///< The type to which cast should be made
     const Twine &Name = "",       ///< Name for the instruction
-    Instruction *InsertBefore = 0 ///< Place to insert the instruction
+    Instruction *InsertBefore = nullptr ///< Place to insert the instruction
   );
 
   /// @brief Create a Trunc or BitCast cast instruction
@@ -641,7 +641,7 @@ class CmpInst : public Instruction {
 protected:
   CmpInst(Type *ty, Instruction::OtherOps op, unsigned short pred,
           Value *LHS, Value *RHS, const Twine &Name = "",
-          Instruction *InsertBefore = 0);
+          Instruction *InsertBefore = nullptr);
 
   CmpInst(Type *ty, Instruction::OtherOps op, unsigned short pred,
           Value *LHS, Value *RHS, const Twine &Name,
@@ -701,7 +701,7 @@ public:
   static CmpInst *Create(OtherOps Op,
                          unsigned short predicate, Value *S1,
                          Value *S2, const Twine &Name = "",
-                         Instruction *InsertBefore = 0);
+                         Instruction *InsertBefore = nullptr);
 
   /// Construct a compare instruction, given the opcode, the predicate and the
   /// two operands.  Also automatically insert this instruction to the end of
diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h
index 928dc07..bac6a95 100644
--- a/include/llvm/IR/Instruction.h
+++ b/include/llvm/IR/Instruction.h
@@ -141,14 +141,14 @@ public:
   /// getMetadata - Get the metadata of given kind attached to this Instruction.
   /// If the metadata is not found then return null.
   MDNode *getMetadata(unsigned KindID) const {
-    if (!hasMetadata()) return 0;
+    if (!hasMetadata()) return nullptr;
     return getMetadataImpl(KindID);
   }
 
   /// getMetadata - Get the metadata of given kind attached to this Instruction.
   /// If the metadata is not found then return null.
   MDNode *getMetadata(StringRef Kind) const {
-    if (!hasMetadata()) return 0;
+    if (!hasMetadata()) return nullptr;
     return getMetadataImpl(Kind);
   }
 
@@ -461,7 +461,7 @@ protected:
   }
 
   Instruction(Type *Ty, unsigned iType, Use *Ops, unsigned NumOps,
-              Instruction *InsertBefore = 0);
+              Instruction *InsertBefore = nullptr);
   Instruction(Type *Ty, unsigned iType, Use *Ops, unsigned NumOps,
               BasicBlock *InsertAtEnd);
   virtual Instruction *clone_impl() const = 0;
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index 06d7287..7d338a6 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -60,16 +60,17 @@ class AllocaInst : public UnaryInstruction {
 protected:
   AllocaInst *clone_impl() const override;
 public:
-  explicit AllocaInst(Type *Ty, Value *ArraySize = 0,
-                      const Twine &Name = "", Instruction *InsertBefore = 0);
+  explicit AllocaInst(Type *Ty, Value *ArraySize = nullptr,
+                      const Twine &Name = "",
+                      Instruction *InsertBefore = nullptr);
   AllocaInst(Type *Ty, Value *ArraySize,
             const Twine &Name, BasicBlock *InsertAtEnd);
 
-  AllocaInst(Type *Ty, const Twine &Name, Instruction *InsertBefore = 0);
+  AllocaInst(Type *Ty, const Twine &Name, Instruction *InsertBefore = nullptr);
   AllocaInst(Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd);
 
   AllocaInst(Type *Ty, Value *ArraySize, unsigned Align,
-             const Twine &Name = "", Instruction *InsertBefore = 0);
+             const Twine &Name = "", Instruction *InsertBefore = nullptr);
   AllocaInst(Type *Ty, Value *ArraySize, unsigned Align,
              const Twine &Name, BasicBlock *InsertAtEnd);
 
@@ -156,17 +157,17 @@ public:
   LoadInst(Value *Ptr, const Twine &NameStr, Instruction *InsertBefore);
   LoadInst(Value *Ptr, const Twine &NameStr, BasicBlock *InsertAtEnd);
   LoadInst(Value *Ptr, const Twine &NameStr, bool isVolatile = false,
-           Instruction *InsertBefore = 0);
+           Instruction *InsertBefore = nullptr);
   LoadInst(Value *Ptr, const Twine &NameStr, bool isVolatile,
            BasicBlock *InsertAtEnd);
   LoadInst(Value *Ptr, const Twine &NameStr, bool isVolatile,
-           unsigned Align, Instruction *InsertBefore = 0);
+           unsigned Align, Instruction *InsertBefore = nullptr);
   LoadInst(Value *Ptr, const Twine &NameStr, bool isVolatile,
            unsigned Align, BasicBlock *InsertAtEnd);
   LoadInst(Value *Ptr, const Twine &NameStr, bool isVolatile,
            unsigned Align, AtomicOrdering Order,
            SynchronizationScope SynchScope = CrossThread,
-           Instruction *InsertBefore = 0);
+           Instruction *InsertBefore = nullptr);
   LoadInst(Value *Ptr, const Twine &NameStr, bool isVolatile,
            unsigned Align, AtomicOrdering Order,
            SynchronizationScope SynchScope,
@@ -174,8 +175,9 @@ public:
 
   LoadInst(Value *Ptr, const char *NameStr, Instruction *InsertBefore);
   LoadInst(Value *Ptr, const char *NameStr, BasicBlock *InsertAtEnd);
-  explicit LoadInst(Value *Ptr, const char *NameStr = 0,
-                    bool isVolatile = false, Instruction *InsertBefore = 0);
+  explicit LoadInst(Value *Ptr, const char *NameStr = nullptr,
+                    bool isVolatile = false,
+                    Instruction *InsertBefore = nullptr);
   LoadInst(Value *Ptr, const char *NameStr, bool isVolatile,
            BasicBlock *InsertAtEnd);
 
@@ -280,16 +282,16 @@ public:
   StoreInst(Value *Val, Value *Ptr, Instruction *InsertBefore);
   StoreInst(Value *Val, Value *Ptr, BasicBlock *InsertAtEnd);
   StoreInst(Value *Val, Value *Ptr, bool isVolatile = false,
-            Instruction *InsertBefore = 0);
+            Instruction *InsertBefore = nullptr);
   StoreInst(Value *Val, Value *Ptr, bool isVolatile, BasicBlock *InsertAtEnd);
   StoreInst(Value *Val, Value *Ptr, bool isVolatile,
-            unsigned Align, Instruction *InsertBefore = 0);
+            unsigned Align, Instruction *InsertBefore = nullptr);
   StoreInst(Value *Val, Value *Ptr, bool isVolatile,
             unsigned Align, BasicBlock *InsertAtEnd);
   StoreInst(Value *Val, Value *Ptr, bool isVolatile,
             unsigned Align, AtomicOrdering Order,
             SynchronizationScope SynchScope = CrossThread,
-            Instruction *InsertBefore = 0);
+            Instruction *InsertBefore = nullptr);
   StoreInst(Value *Val, Value *Ptr, bool isVolatile,
             unsigned Align, AtomicOrdering Order,
             SynchronizationScope SynchScope,
@@ -409,7 +411,7 @@ public:
   // SequentiallyConsistent.
   FenceInst(LLVMContext &C, AtomicOrdering Ordering,
             SynchronizationScope SynchScope = CrossThread,
-            Instruction *InsertBefore = 0);
+            Instruction *InsertBefore = nullptr);
   FenceInst(LLVMContext &C, AtomicOrdering Ordering,
             SynchronizationScope SynchScope,
             BasicBlock *InsertAtEnd);
@@ -477,7 +479,7 @@ public:
                     AtomicOrdering SuccessOrdering,
                     AtomicOrdering FailureOrdering,
                     SynchronizationScope SynchScope,
-                    Instruction *InsertBefore = 0);
+                    Instruction *InsertBefore = nullptr);
   AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
                     AtomicOrdering SuccessOrdering,
                     AtomicOrdering FailureOrdering,
@@ -651,7 +653,7 @@ public:
   }
   AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
                 AtomicOrdering Ordering, SynchronizationScope SynchScope,
-                Instruction *InsertBefore = 0);
+                Instruction *InsertBefore = nullptr);
   AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
                 AtomicOrdering Ordering, SynchronizationScope SynchScope,
                 BasicBlock *InsertAtEnd);
@@ -779,7 +781,7 @@ protected:
 public:
   static GetElementPtrInst *Create(Value *Ptr, ArrayRef<Value *> IdxList,
                                    const Twine &NameStr = "",
-                                   Instruction *InsertBefore = 0) {
+                                   Instruction *InsertBefore = nullptr) {
     unsigned Values = 1 + unsigned(IdxList.size());
     return new(Values)
       GetElementPtrInst(Ptr, IdxList, Values, NameStr, InsertBefore);
@@ -797,7 +799,7 @@ public:
   static GetElementPtrInst *CreateInBounds(Value *Ptr,
                                            ArrayRef<Value *> IdxList,
                                            const Twine &NameStr = "",
-                                           Instruction *InsertBefore = 0) {
+                                           Instruction *InsertBefore = nullptr){
     GetElementPtrInst *GEP = Create(Ptr, IdxList, NameStr, InsertBefore);
     GEP->setIsInBounds(true);
     return GEP;
@@ -1237,7 +1239,7 @@ public:
   static CallInst *Create(Value *Func,
                           ArrayRef<Value *> Args,
                           const Twine &NameStr = "",
-                          Instruction *InsertBefore = 0) {
+                          Instruction *InsertBefore = nullptr) {
     return new(unsigned(Args.size() + 1))
       CallInst(Func, Args, NameStr, InsertBefore);
   }
@@ -1248,7 +1250,7 @@ public:
       CallInst(Func, Args, NameStr, InsertAtEnd);
   }
   static CallInst *Create(Value *F, const Twine &NameStr = "",
-                          Instruction *InsertBefore = 0) {
+                          Instruction *InsertBefore = nullptr) {
     return new(1) CallInst(F, NameStr, InsertBefore);
   }
   static CallInst *Create(Value *F, const Twine &NameStr,
@@ -1263,13 +1265,13 @@ public:
   /// 3. Bitcast the result of the malloc call to the specified type.
   static Instruction *CreateMalloc(Instruction *InsertBefore,
                                    Type *IntPtrTy, Type *AllocTy,
-                                   Value *AllocSize, Value *ArraySize = 0,
-                                   Function* MallocF = 0,
+                                   Value *AllocSize, Value *ArraySize = nullptr,
+                                   Function* MallocF = nullptr,
                                    const Twine &Name = "");
   static Instruction *CreateMalloc(BasicBlock *InsertAtEnd,
                                    Type *IntPtrTy, Type *AllocTy,
-                                   Value *AllocSize, Value *ArraySize = 0,
-                                   Function* MallocF = 0,
+                                   Value *AllocSize, Value *ArraySize = nullptr,
+                                   Function* MallocF = nullptr,
                                    const Twine &Name = "");
   /// CreateFree - Generate the IR for a call to the builtin free function.
   static Instruction* CreateFree(Value* Source, Instruction *InsertBefore);
@@ -1277,10 +1279,24 @@ public:
 
   ~CallInst();
 
-  bool isTailCall() const { return getSubclassDataFromInstruction() & 1; }
+  // Note that 'musttail' implies 'tail'.
+  enum TailCallKind { TCK_None = 0, TCK_Tail = 1, TCK_MustTail = 2 };
+  TailCallKind getTailCallKind() const {
+    return TailCallKind(getSubclassDataFromInstruction() & 3);
+  }
+  bool isTailCall() const {
+    return (getSubclassDataFromInstruction() & 3) != TCK_None;
+  }
+  bool isMustTailCall() const {
+    return (getSubclassDataFromInstruction() & 3) == TCK_MustTail;
+  }
   void setTailCall(bool isTC = true) {
-    setInstructionSubclassData((getSubclassDataFromInstruction() & ~1) |
-                               unsigned(isTC));
+    setInstructionSubclassData((getSubclassDataFromInstruction() & ~3) |
+                               unsigned(isTC ? TCK_Tail : TCK_None));
+  }
+  void setTailCallKind(TailCallKind TCK) {
+    setInstructionSubclassData((getSubclassDataFromInstruction() & ~3) |
+                               unsigned(TCK));
   }
 
   /// Provide fast operand accessors
@@ -1314,11 +1330,11 @@ public:
   /// getCallingConv/setCallingConv - Get or set the calling convention of this
   /// function call.
   CallingConv::ID getCallingConv() const {
-    return static_cast<CallingConv::ID>(getSubclassDataFromInstruction() >> 1);
+    return static_cast<CallingConv::ID>(getSubclassDataFromInstruction() >> 2);
   }
   void setCallingConv(CallingConv::ID CC) {
-    setInstructionSubclassData((getSubclassDataFromInstruction() & 1) |
-                               (static_cast<unsigned>(CC) << 1));
+    setInstructionSubclassData((getSubclassDataFromInstruction() & 3) |
+                               (static_cast<unsigned>(CC) << 2));
  }
 
   /// getAttributes - Return the parameter attributes for this call.
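The hunk above widens the tail-call flag from one bit to a two-bit kind, so
the calling convention moves up by one bit and 'musttail' becomes
representable. A minimal sketch of how a front end might use the new API
(the helper function and its arguments are illustrative assumptions, not
part of this patch):

    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // Sketch: create a call and mark it 'musttail'. Since TCK_MustTail
    // implies 'tail', isTailCall() also returns true afterwards.
    CallInst *emitMustTailCall(Value *Callee, ArrayRef<Value *> Args,
                               Instruction *InsertBefore) {
      CallInst *CI = CallInst::Create(Callee, Args, "", InsertBefore);
      CI->setTailCallKind(CallInst::TCK_MustTail);
      assert(CI->isTailCall() && CI->isMustTailCall());
      return CI;
    }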
@@ -1520,7 +1536,7 @@ protected:
 public:
   static SelectInst *Create(Value *C, Value *S1, Value *S2,
                             const Twine &NameStr = "",
-                            Instruction *InsertBefore = 0) {
+                            Instruction *InsertBefore = nullptr) {
     return new(3) SelectInst(C, S1, S2, NameStr, InsertBefore);
   }
   static SelectInst *Create(Value *C, Value *S1, Value *S2,
@@ -1575,7 +1591,7 @@ protected:
 public:
   VAArgInst(Value *List, Type *Ty, const Twine &NameStr = "",
-            Instruction *InsertBefore = 0)
+            Instruction *InsertBefore = nullptr)
     : UnaryInstruction(Ty, VAArg, List, InsertBefore) {
     setName(NameStr);
   }
@@ -1607,7 +1623,7 @@ public:
 ///
 class ExtractElementInst : public Instruction {
   ExtractElementInst(Value *Vec, Value *Idx, const Twine &NameStr = "",
-                     Instruction *InsertBefore = 0);
+                     Instruction *InsertBefore = nullptr);
   ExtractElementInst(Value *Vec, Value *Idx, const Twine &NameStr,
                      BasicBlock *InsertAtEnd);
 protected:
@@ -1616,7 +1632,7 @@ protected:
 public:
   static ExtractElementInst *Create(Value *Vec, Value *Idx,
                                     const Twine &NameStr = "",
-                                    Instruction *InsertBefore = 0) {
+                                    Instruction *InsertBefore = nullptr) {
     return new(2) ExtractElementInst(Vec, Idx, NameStr, InsertBefore);
   }
   static ExtractElementInst *Create(Value *Vec, Value *Idx,
@@ -1668,7 +1684,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ExtractElementInst, Value)
 class InsertElementInst : public Instruction {
   InsertElementInst(Value *Vec, Value *NewElt, Value *Idx,
                     const Twine &NameStr = "",
-                    Instruction *InsertBefore = 0);
+                    Instruction *InsertBefore = nullptr);
   InsertElementInst(Value *Vec, Value *NewElt, Value *Idx,
                     const Twine &NameStr, BasicBlock *InsertAtEnd);
 protected:
@@ -1677,7 +1693,7 @@ protected:
 public:
   static InsertElementInst *Create(Value *Vec, Value *NewElt, Value *Idx,
                                    const Twine &NameStr = "",
-                                   Instruction *InsertBefore = 0) {
+                                   Instruction *InsertBefore = nullptr) {
     return new(3) InsertElementInst(Vec, NewElt, Idx, NameStr, InsertBefore);
   }
   static InsertElementInst *Create(Value *Vec, Value *NewElt, Value *Idx,
@@ -1734,7 +1750,7 @@ public:
   }
   ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
                     const Twine &NameStr = "",
-                    Instruction *InsertBefor = 0);
+                    Instruction *InsertBefor = nullptr);
   ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
                     const Twine &NameStr, BasicBlock *InsertAtEnd);
 
@@ -1832,7 +1848,7 @@ public:
 
   static ExtractValueInst *Create(Value *Agg, ArrayRef<unsigned> Idxs,
                                   const Twine &NameStr = "",
-                                  Instruction *InsertBefore = 0) {
+                                  Instruction *InsertBefore = nullptr) {
     return new ExtractValueInst(Agg, Idxs, NameStr, InsertBefore);
   }
 
@@ -1933,7 +1949,7 @@ class InsertValueInst : public Instruction {
   /// and two index insertvalue instructions are so common.
   InsertValueInst(Value *Agg, Value *Val,
                   unsigned Idx, const Twine &NameStr = "",
-                  Instruction *InsertBefore = 0);
+                  Instruction *InsertBefore = nullptr);
   InsertValueInst(Value *Agg, Value *Val,
                   unsigned Idx, const Twine &NameStr,
                   BasicBlock *InsertAtEnd);
 protected:
@@ -1947,7 +1963,7 @@ public:
   static InsertValueInst *Create(Value *Agg, Value *Val,
                                  ArrayRef<unsigned> Idxs,
                                  const Twine &NameStr = "",
-                                 Instruction *InsertBefore = 0) {
+                                 Instruction *InsertBefore = nullptr) {
     return new InsertValueInst(Agg, Val, Idxs, NameStr, InsertBefore);
   }
   static InsertValueInst *Create(Value *Agg, Value *Val,
@@ -2052,8 +2068,9 @@ class PHINode : public Instruction {
     return User::operator new(s, 0);
   }
   explicit PHINode(Type *Ty, unsigned NumReservedValues,
-                   const Twine &NameStr = "", Instruction *InsertBefore = 0)
-    : Instruction(Ty, Instruction::PHI, 0, 0, InsertBefore),
+                   const Twine &NameStr = "",
+                   Instruction *InsertBefore = nullptr)
+    : Instruction(Ty, Instruction::PHI, nullptr, 0, InsertBefore),
       ReservedSpace(NumReservedValues) {
     setName(NameStr);
     OperandList = allocHungoffUses(ReservedSpace);
@@ -2061,7 +2078,7 @@ class PHINode : public Instruction {
 
   PHINode(Type *Ty, unsigned NumReservedValues, const Twine &NameStr,
           BasicBlock *InsertAtEnd)
-    : Instruction(Ty, Instruction::PHI, 0, 0, InsertAtEnd),
+    : Instruction(Ty, Instruction::PHI, nullptr, 0, InsertAtEnd),
       ReservedSpace(NumReservedValues) {
     setName(NameStr);
     OperandList = allocHungoffUses(ReservedSpace);
@@ -2078,7 +2095,7 @@ public:
   /// edges that this phi node will have (use 0 if you really have no idea).
   static PHINode *Create(Type *Ty, unsigned NumReservedValues,
                          const Twine &NameStr = "",
-                         Instruction *InsertBefore = 0) {
+                         Instruction *InsertBefore = nullptr) {
     return new PHINode(Ty, NumReservedValues, NameStr, InsertBefore);
   }
   static PHINode *Create(Type *Ty, unsigned NumReservedValues,
@@ -2270,7 +2287,7 @@ public:
   static LandingPadInst *Create(Type *RetTy, Value *PersonalityFn,
                                 unsigned NumReservedClauses,
                                 const Twine &NameStr = "",
-                                Instruction *InsertBefore = 0);
+                                Instruction *InsertBefore = nullptr);
   static LandingPadInst *Create(Type *RetTy, Value *PersonalityFn,
                                 unsigned NumReservedClauses,
                                 const Twine &NameStr, BasicBlock *InsertAtEnd);
@@ -2356,15 +2373,15 @@ private:
   //
   // NOTE: If the Value* passed is of type void then the constructor behaves as
   // if it was passed NULL.
-  explicit ReturnInst(LLVMContext &C, Value *retVal = 0,
-                      Instruction *InsertBefore = 0);
+  explicit ReturnInst(LLVMContext &C, Value *retVal = nullptr,
+                      Instruction *InsertBefore = nullptr);
   ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd);
   explicit ReturnInst(LLVMContext &C, BasicBlock *InsertAtEnd);
 protected:
   ReturnInst *clone_impl() const override;
 public:
-  static ReturnInst* Create(LLVMContext &C, Value *retVal = 0,
-                            Instruction *InsertBefore = 0) {
+  static ReturnInst* Create(LLVMContext &C, Value *retVal = nullptr,
+                            Instruction *InsertBefore = nullptr) {
     return new(!!retVal) ReturnInst(C, retVal, InsertBefore);
   }
   static ReturnInst* Create(LLVMContext &C, Value *retVal,
@@ -2381,7 +2398,7 @@ public:
 
   /// Convenience accessor. Returns null if there is no return value.
   Value *getReturnValue() const {
-    return getNumOperands() != 0 ? getOperand(0) : 0;
+    return getNumOperands() != 0 ? getOperand(0) : nullptr;
   }
 
   unsigned getNumSuccessors() const { return 0; }
@@ -2426,20 +2443,21 @@ class BranchInst : public TerminatorInst {
   // BranchInst(BB* T, BB *F, Value *C, Inst *I) - 'br C, T, F', insert before I
   // BranchInst(BB* B, BB *I) - 'br B'        insert at end
   // BranchInst(BB* T, BB *F, Value *C, BB *I) - 'br C, T, F', insert at end
-  explicit BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore = 0);
+  explicit BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore = nullptr);
   BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
-             Instruction *InsertBefore = 0);
+             Instruction *InsertBefore = nullptr);
   BranchInst(BasicBlock *IfTrue, BasicBlock *InsertAtEnd);
   BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
              BasicBlock *InsertAtEnd);
 protected:
   BranchInst *clone_impl() const override;
 public:
-  static BranchInst *Create(BasicBlock *IfTrue, Instruction *InsertBefore = 0) {
+  static BranchInst *Create(BasicBlock *IfTrue,
+                            Instruction *InsertBefore = nullptr) {
     return new(1) BranchInst(IfTrue, InsertBefore);
   }
   static BranchInst *Create(BasicBlock *IfTrue, BasicBlock *IfFalse,
-                            Value *Cond, Instruction *InsertBefore = 0) {
+                            Value *Cond, Instruction *InsertBefore = nullptr) {
     return new(3) BranchInst(IfTrue, IfFalse, Cond, InsertBefore);
   }
   static BranchInst *Create(BasicBlock *IfTrue, BasicBlock *InsertAtEnd) {
@@ -2658,7 +2676,8 @@ public:
   };
 
   static SwitchInst *Create(Value *Value, BasicBlock *Default,
-                            unsigned NumCases, Instruction *InsertBefore = 0) {
+                            unsigned NumCases,
+                            Instruction *InsertBefore = nullptr) {
     return new SwitchInst(Value, Default, NumCases, InsertBefore);
   }
   static SwitchInst *Create(Value *Value, BasicBlock *Default,
@@ -2742,12 +2761,12 @@ public:
   /// findCaseDest - Finds the unique case value for a given successor. Returns
   /// null if the successor is not found, not unique, or is the default case.
   ConstantInt *findCaseDest(BasicBlock *BB) {
-    if (BB == getDefaultDest()) return NULL;
+    if (BB == getDefaultDest()) return nullptr;
 
-    ConstantInt *CI = NULL;
+    ConstantInt *CI = nullptr;
     for (CaseIt i = case_begin(), e = case_end(); i != e; ++i) {
       if (i.getCaseSuccessor() == BB) {
-        if (CI) return NULL;   // Multiple cases lead to BB.
+        if (CI) return nullptr;   // Multiple cases lead to BB.
         else CI = i.getCaseValue();
       }
     }
@@ -2834,7 +2853,7 @@ protected:
   IndirectBrInst *clone_impl() const override;
 public:
   static IndirectBrInst *Create(Value *Address, unsigned NumDests,
-                                Instruction *InsertBefore = 0) {
+                                Instruction *InsertBefore = nullptr) {
     return new IndirectBrInst(Address, NumDests, InsertBefore);
   }
   static IndirectBrInst *Create(Value *Address, unsigned NumDests,
@@ -2928,7 +2947,7 @@ public:
   static InvokeInst *Create(Value *Func,
                             BasicBlock *IfNormal, BasicBlock *IfException,
                             ArrayRef<Value *> Args, const Twine &NameStr = "",
-                            Instruction *InsertBefore = 0) {
+                            Instruction *InsertBefore = nullptr) {
     unsigned Values = unsigned(Args.size()) + 3;
     return new(Values) InvokeInst(Func, IfNormal, IfException, Args,
                                   Values, NameStr, InsertBefore);
@@ -3175,12 +3194,12 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InvokeInst, Value)
 
 class ResumeInst : public TerminatorInst {
   ResumeInst(const ResumeInst &RI);
 
-  explicit ResumeInst(Value *Exn, Instruction *InsertBefore=0);
+  explicit ResumeInst(Value *Exn, Instruction *InsertBefore=nullptr);
   ResumeInst(Value *Exn, BasicBlock *InsertAtEnd);
 protected:
   ResumeInst *clone_impl() const override;
 public:
-  static ResumeInst *Create(Value *Exn, Instruction *InsertBefore = 0) {
+  static ResumeInst *Create(Value *Exn, Instruction *InsertBefore = nullptr) {
     return new(1) ResumeInst(Exn, InsertBefore);
   }
   static ResumeInst *Create(Value *Exn, BasicBlock *InsertAtEnd) {
@@ -3234,7 +3253,7 @@ public:
   void *operator new(size_t s) {
     return User::operator new(s, 0);
   }
-  explicit UnreachableInst(LLVMContext &C, Instruction *InsertBefore = 0);
+  explicit UnreachableInst(LLVMContext &C, Instruction *InsertBefore = nullptr);
   explicit UnreachableInst(LLVMContext &C, BasicBlock *InsertAtEnd);
 
   unsigned getNumSuccessors() const { return 0; }
@@ -3265,16 +3284,16 @@ protected:
 public:
   /// \brief Constructor with insert-before-instruction semantics
   TruncInst(
-    Value *S,                     ///< The value to be truncated
-    Type *Ty,                     ///< The (smaller) type to truncate to
-    const Twine &NameStr = "",    ///< A name for the new instruction
-    Instruction *InsertBefore = 0 ///< Where to insert the new instruction
+    Value *S,                           ///< The value to be truncated
+    Type *Ty,                           ///< The (smaller) type to truncate to
+    const Twine &NameStr = "",          ///< A name for the new instruction
+    Instruction *InsertBefore = nullptr ///< Where to insert the new instruction
   );
 
   /// \brief Constructor with insert-at-end-of-block semantics
   TruncInst(
     Value *S,                     ///< The value to be truncated
-    Type *Ty,                     ///< The (smaller) type to truncate to
+    Type *Ty,                           ///< The (smaller) type to truncate to
     const Twine &NameStr,         ///< A name for the new instruction
     BasicBlock *InsertAtEnd       ///< The block to insert the instruction into
   );
@@ -3301,16 +3320,16 @@ protected:
 public:
   /// \brief Constructor with insert-before-instruction semantics
   ZExtInst(
-    Value *S,                     ///< The value to be zero extended
-    Type *Ty,                     ///< The type to zero extend to
-    const Twine &NameStr = "",    ///< A name for the new instruction
-    Instruction *InsertBefore = 0 ///< Where to insert the new instruction
+    Value *S,                           ///< The value to be zero extended
+    Type *Ty,                           ///< The type to zero extend to
+    const Twine &NameStr = "",          ///< A name for the new instruction
+    Instruction *InsertBefore = nullptr ///< Where to insert the new instruction
   );
 
   /// \brief Constructor with insert-at-end semantics.
   ZExtInst(
     Value *S,                     ///< The value to be zero extended
-    Type *Ty,                     ///< The type to zero extend to
+    Type *Ty,                           ///< The type to zero extend to
     const Twine &NameStr,         ///< A name for the new instruction
     BasicBlock *InsertAtEnd       ///< The block to insert the instruction into
   );
@@ -3337,16 +3356,16 @@ protected:
 public:
   /// \brief Constructor with insert-before-instruction semantics
   SExtInst(
-    Value *S,                     ///< The value to be sign extended
-    Type *Ty,                     ///< The type to sign extend to
-    const Twine &NameStr = "",    ///< A name for the new instruction
-    Instruction *InsertBefore = 0 ///< Where to insert the new instruction
+    Value *S,                           ///< The value to be sign extended
+    Type *Ty,                           ///< The type to sign extend to
+    const Twine &NameStr = "",          ///< A name for the new instruction
+    Instruction *InsertBefore = nullptr ///< Where to insert the new instruction
   );
 
   /// \brief Constructor with insert-at-end-of-block semantics
   SExtInst(
     Value *S,                     ///< The value to be sign extended
-    Type *Ty,                     ///< The type to sign extend to
+    Type *Ty,                           ///< The type to sign extend to
     const Twine &NameStr,         ///< A name for the new instruction
     BasicBlock *InsertAtEnd       ///< The block to insert the instruction into
   );
@@ -3373,16 +3392,16 @@ protected:
 public:
   /// \brief Constructor with insert-before-instruction semantics
   FPTruncInst(
-    Value *S,                     ///< The value to be truncated
-    Type *Ty,                     ///< The type to truncate to
-    const Twine &NameStr = "",    ///< A name for the new instruction
-    Instruction *InsertBefore = 0 ///< Where to insert the new instruction
+    Value *S,                           ///< The value to be truncated
+    Type *Ty,                           ///< The type to truncate to
+    const Twine &NameStr = "",          ///< A name for the new instruction
+    Instruction *InsertBefore = nullptr ///< Where to insert the new instruction
   );
 
   /// \brief Constructor with insert-before-instruction semantics
   FPTruncInst(
     Value *S,                     ///< The value to be truncated
-    Type *Ty,                     ///< The type to truncate to
+    Type *Ty,                           ///< The type to truncate to
     const Twine &NameStr,         ///< A name for the new instruction
     BasicBlock *InsertAtEnd       ///< The block to insert the instruction into
   );
@@ -3409,16 +3428,16 @@ protected:
 public:
   /// \brief Constructor with insert-before-instruction semantics
   FPExtInst(
-    Value *S,                     ///< The value to be extended
-    Type *Ty,                     ///< The type to extend to
-    const Twine &NameStr = "",    ///< A name for the new instruction
-    Instruction *InsertBefore = 0 ///< Where to insert the new instruction
+    Value *S,                           ///< The value to be extended
+    Type *Ty,                           ///< The type to extend to
+    const Twine &NameStr = "",          ///< A name for the new instruction
+    Instruction *InsertBefore = nullptr ///< Where to insert the new instruction
   );
 
   /// \brief Constructor with insert-at-end-of-block semantics
   FPExtInst(
     Value *S,                     ///< The value to be extended
-    Type *Ty,                     ///< The type to extend to
+    Type *Ty,                           ///< The type to extend to
     const Twine &NameStr,         ///< A name for the new instruction
     BasicBlock *InsertAtEnd       ///< The block to insert the instruction into
   );
@@ -3445,16 +3464,16 @@ protected:
 public:
   /// \brief Constructor with insert-before-instruction semantics
   UIToFPInst(
-    Value *S,                     ///< The value to be converted
-    Type *Ty,                     ///< The type to convert to
-    const Twine &NameStr = "",    ///< A name for the new instruction
-    Instruction *InsertBefore = 0 ///< Where to insert the new instruction
+    Value *S,                           ///< The value to be converted
+    Type *Ty,                           ///< The type to convert to
+    const Twine &NameStr = "",          ///< A name for the new instruction
+    Instruction *InsertBefore = nullptr ///< Where to insert the new instruction
   );
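All of the cast constructors in this stretch follow the same convention: the
nullptr default for InsertBefore creates the instruction unlinked, while the
InsertAtEnd form links it immediately. A minimal sketch under those
assumptions (the helper placeCasts and its arguments are illustrative, not
part of the patch):

    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // Sketch: with the default nullptr the cast dangles until inserted
    // explicitly; passing a block appends it to that block right away.
    void placeCasts(Value *V, Type *DstTy, BasicBlock *BB) {
      Instruction *Conv = new UIToFPInst(V, DstTy, "conv"); // unlinked
      Conv->insertBefore(BB->getTerminator());              // linked manually
      new UIToFPInst(V, DstTy, "conv2", BB);                // linked at end
    }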
   /// \brief Constructor with insert-at-end-of-block semantics
   UIToFPInst(
     Value *S,                     ///< The value to be converted
-    Type *Ty,                     ///< The type to convert to
+    Type *Ty,                           ///< The type to convert to
     const Twine &NameStr,         ///< A name for the new instruction
     BasicBlock *InsertAtEnd       ///< The block to insert the instruction into
   );
@@ -3481,16 +3500,16 @@ protected:
 public:
   /// \brief Constructor with insert-before-instruction semantics
   SIToFPInst(
-    Value *S,                     ///< The value to be converted
-    Type *Ty,                     ///< The type to convert to
-    const Twine &NameStr = "",    ///< A name for the new instruction
-    Instruction *InsertBefore = 0 ///< Where to insert the new instruction
+    Value *S,                           ///< The value to be converted
+    Type *Ty,                           ///< The type to convert to
+    const Twine &NameStr = "",          ///< A name for the new instruction
+    Instruction *InsertBefore = nullptr ///< Where to insert the new instruction
   );
 
   /// \brief Constructor with insert-at-end-of-block semantics
   SIToFPInst(
     Value *S,                     ///< The value to be converted
-    Type *Ty,                     ///< The type to convert to
+    Type *Ty,                           ///< The type to convert to
     const Twine &NameStr,         ///< A name for the new instruction
     BasicBlock *InsertAtEnd       ///< The block to insert the instruction into
   );
@@ -3517,16 +3536,16 @@ protected:
 public:
   /// \brief Constructor with insert-before-instruction semantics
   FPToUIInst(
-    Value *S,                     ///< The value to be converted
-    Type *Ty,                     ///< The type to convert to
-    const Twine &NameStr = "",    ///< A name for the new instruction
-    Instruction *InsertBefore = 0 ///< Where to insert the new instruction
+    Value *S,                           ///< The value to be converted
+    Type *Ty,                           ///< The type to convert to
+    const Twine &NameStr = "",          ///< A name for the new instruction
+    Instruction *InsertBefore = nullptr ///< Where to insert the new instruction
   );
 
   /// \brief Constructor with insert-at-end-of-block semantics
   FPToUIInst(
     Value *S,                     ///< The value to be converted
-    Type *Ty,                     ///< The type to convert to
+    Type *Ty,                           ///< The type to convert to
     const Twine &NameStr,         ///< A name for the new instruction
     BasicBlock *InsertAtEnd       ///< Where to insert the new instruction
   );
@@ -3553,16 +3572,16 @@ protected:
 public:
   /// \brief Constructor with insert-before-instruction semantics
   FPToSIInst(
-    Value *S,                     ///< The value to be converted
-    Type *Ty,                     ///< The type to convert to
-    const Twine &NameStr = "",    ///< A name for the new instruction
-    Instruction *InsertBefore = 0 ///< Where to insert the new instruction
+    Value *S,                           ///< The value to be converted
+    Type *Ty,                           ///< The type to convert to
+    const Twine &NameStr = "",          ///< A name for the new instruction
+    Instruction *InsertBefore = nullptr ///< Where to insert the new instruction
   );
 
   /// \brief Constructor with insert-at-end-of-block semantics
   FPToSIInst(
     Value *S,                     ///< The value to be converted
-    Type *Ty,                     ///< The type to convert to
+    Type *Ty,                           ///< The type to convert to
     const Twine &NameStr,         ///< A name for the new instruction
     BasicBlock *InsertAtEnd       ///< The block to insert the instruction into
   );
@@ -3585,16 +3604,16 @@ class IntToPtrInst : public CastInst {
 public:
   /// \brief Constructor with insert-before-instruction semantics
   IntToPtrInst(
-    Value *S,                     ///< The value to be converted
-    Type *Ty,                     ///< The type to convert to
-    const Twine &NameStr = "",    ///< A name for the new instruction
-    Instruction *InsertBefore = 0 ///< Where to insert the new instruction
+    Value *S,                           ///< The value to be converted
+    Type *Ty,                           ///< The type to convert to
+    const Twine &NameStr = "",          ///< A name for the new instruction
+    Instruction *InsertBefore = nullptr ///< Where to insert the new instruction
   );
 
   /// \brief Constructor with insert-at-end-of-block semantics
   IntToPtrInst(
     Value *S,                     ///< The value to be converted
-    Type *Ty,                     ///< The type to convert to
+    Type *Ty,                           ///< The type to convert to
     const Twine &NameStr,         ///< A name for the new instruction
     BasicBlock *InsertAtEnd       ///< The block to insert the instruction into
   );
@@ -3629,16 +3648,16 @@ protected:
 public:
   /// \brief Constructor with insert-before-instruction semantics
   PtrToIntInst(
-    Value *S,                     ///< The value to be converted
-    Type *Ty,                     ///< The type to convert to
-    const Twine &NameStr = "",    ///< A name for the new instruction
-    Instruction *InsertBefore = 0 ///< Where to insert the new instruction
+    Value *S,                           ///< The value to be converted
+    Type *Ty,                           ///< The type to convert to
+    const Twine &NameStr = "",          ///< A name for the new instruction
+    Instruction *InsertBefore = nullptr ///< Where to insert the new instruction
   );
 
   /// \brief Constructor with insert-at-end-of-block semantics
   PtrToIntInst(
     Value *S,                     ///< The value to be converted
-    Type *Ty,                     ///< The type to convert to
+    Type *Ty,                           ///< The type to convert to
     const Twine &NameStr,         ///< A name for the new instruction
     BasicBlock *InsertAtEnd       ///< The block to insert the instruction into
   );
@@ -3677,16 +3696,16 @@ protected:
 public:
   /// \brief Constructor with insert-before-instruction semantics
   BitCastInst(
-    Value *S,                     ///< The value to be casted
-    Type *Ty,                     ///< The type to casted to
-    const Twine &NameStr = "",    ///< A name for the new instruction
-    Instruction *InsertBefore = 0 ///< Where to insert the new instruction
+    Value *S,                           ///< The value to be casted
+    Type *Ty,                           ///< The type to casted to
+    const Twine &NameStr = "",          ///< A name for the new instruction
+    Instruction *InsertBefore = nullptr ///< Where to insert the new instruction
   );
 
   /// \brief Constructor with insert-at-end-of-block semantics
   BitCastInst(
     Value *S,                     ///< The value to be casted
-    Type *Ty,                     ///< The type to casted to
+    Type *Ty,                           ///< The type to casted to
     const Twine &NameStr,         ///< A name for the new instruction
     BasicBlock *InsertAtEnd       ///< The block to insert the instruction into
   );
@@ -3714,10 +3733,10 @@ protected:
 public:
   /// \brief Constructor with insert-before-instruction semantics
   AddrSpaceCastInst(
-    Value *S,                     ///< The value to be casted
-    Type *Ty,                     ///< The type to casted to
-    const Twine &NameStr = "",    ///< A name for the new instruction
-    Instruction *InsertBefore = 0 ///< Where to insert the new instruction
+    Value *S,                           ///< The value to be casted
+    Type *Ty,                           ///< The type to casted to
+    const Twine &NameStr = "",          ///< A name for the new instruction
+    Instruction *InsertBefore = nullptr ///< Where to insert the new instruction
   );
 
   /// \brief Constructor with insert-at-end-of-block semantics
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index 6a48f17..edd1621 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -250,6 +250,10 @@ def int_gcwrite : Intrinsic<[],
 //
 def int_returnaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_frameaddress  : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_read_register  : Intrinsic<[llvm_anyint_ty], [llvm_metadata_ty],
+                                   [IntrNoMem], "llvm.read_register">;
+def int_write_register : Intrinsic<[], [llvm_metadata_ty, llvm_anyint_ty],
+                                   [], "llvm.write_register">;
 
 // Note: we treat stacksave/stackrestore as writemem because we don't otherwise
 // model their dependencies on allocas.
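The two new intrinsics carry the register name as metadata rather than as an
ordinary operand. A hedged sketch of emitting a read through the C++ API (the
helper name and the choice of register are assumptions; in this release
MDNode is still a Value, so it can be passed directly as a call argument):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // Sketch: read a named register via llvm.read_register.i64. The register
    // name travels as an MDNode argument, not as a normal operand.
    Value *emitReadReg(Module &M, IRBuilder<> &B, StringRef Reg) {
      LLVMContext &Ctx = M.getContext();
      MDNode *RegName = MDNode::get(Ctx, MDString::get(Ctx, Reg));
      Function *ReadReg = Intrinsic::getDeclaration(
          &M, Intrinsic::read_register, B.getInt64Ty());
      return B.CreateCall(ReadReg, RegName, "regval");
    }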
@@ -529,7 +533,6 @@ def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty],
 include "llvm/IR/IntrinsicsPowerPC.td"
 include "llvm/IR/IntrinsicsX86.td"
 include "llvm/IR/IntrinsicsARM.td"
-include "llvm/IR/IntrinsicsARM64.td"
 include "llvm/IR/IntrinsicsAArch64.td"
 include "llvm/IR/IntrinsicsXCore.td"
 include "llvm/IR/IntrinsicsHexagon.td"
diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td
index 61c0e5d..23757aa 100644
--- a/include/llvm/IR/IntrinsicsAArch64.td
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@@ -1,4 +1,4 @@
-//===- IntrinsicsAArch64.td - Defines AArch64 intrinsics -----------*- tablegen -*-===//
+//===- IntrinsicsAARCH64.td - Defines AARCH64 intrinsics ---*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,401 +7,630 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines all of the AArch64-specific intrinsics.
+// This file defines all of the AARCH64-specific intrinsics.
 //
 //===----------------------------------------------------------------------===//
 
+let TargetPrefix = "aarch64" in {
+
+def int_aarch64_ldxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>;
+def int_aarch64_ldaxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>;
+def int_aarch64_stxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>;
+def int_aarch64_stlxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>;
+
+def int_aarch64_ldxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>;
+def int_aarch64_ldaxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>;
+def int_aarch64_stxp : Intrinsic<[llvm_i32_ty],
+                                 [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty]>;
+def int_aarch64_stlxp : Intrinsic<[llvm_i32_ty],
+                                  [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty]>;
+
+def int_aarch64_clrex : Intrinsic<[]>;
+
+def int_aarch64_sdiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
+                                LLVMMatchType<0>], [IntrNoMem]>;
+def int_aarch64_udiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
+                                LLVMMatchType<0>], [IntrNoMem]>;
+}
+
 //===----------------------------------------------------------------------===//
 // Advanced SIMD (NEON)
 
 let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
+  class AdvSIMD_2Scalar_Float_Intrinsic
+    : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
+                [IntrNoMem]>;
+
+  class AdvSIMD_FPToIntRounding_Intrinsic
+    : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>;
+
+  class AdvSIMD_1IntArg_Intrinsic
+    : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+  class AdvSIMD_1FloatArg_Intrinsic
+    : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+  class AdvSIMD_1VectorArg_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+  class AdvSIMD_1VectorArg_Expand_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+  class AdvSIMD_1VectorArg_Long_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [LLVMTruncatedType<0>], [IntrNoMem]>;
+  class AdvSIMD_1IntArg_Narrow_Intrinsic
+    : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem]>;
+  class AdvSIMD_1VectorArg_Narrow_Intrinsic
+    : Intrinsic<[llvm_anyint_ty], [LLVMExtendedType<0>], [IntrNoMem]>;
+  class AdvSIMD_1VectorArg_Int_Across_Intrinsic
+    : Intrinsic<[llvm_anyint_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+  class AdvSIMD_1VectorArg_Float_Across_Intrinsic
+    : Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+
+  class AdvSIMD_2IntArg_Intrinsic
+    : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
+                [IntrNoMem]>;
+  class AdvSIMD_2FloatArg_Intrinsic
+    : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
+                [IntrNoMem]>;
+  class AdvSIMD_2VectorArg_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
+                [IntrNoMem]>;
+  class AdvSIMD_2VectorArg_Compare_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>],
+                [IntrNoMem]>;
+  class AdvSIMD_2Arg_FloatCompare_Intrinsic
+    : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>],
+                [IntrNoMem]>;
+  class AdvSIMD_2VectorArg_Long_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMTruncatedType<0>, LLVMTruncatedType<0>],
+                [IntrNoMem]>;
+  class AdvSIMD_2VectorArg_Wide_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>, LLVMTruncatedType<0>],
+                [IntrNoMem]>;
+  class AdvSIMD_2VectorArg_Narrow_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMExtendedType<0>, LLVMExtendedType<0>],
+                [IntrNoMem]>;
+  class AdvSIMD_2Arg_Scalar_Narrow_Intrinsic
+    : Intrinsic<[llvm_anyint_ty],
+                [LLVMExtendedType<0>, llvm_i32_ty],
+                [IntrNoMem]>;
+  class AdvSIMD_2VectorArg_Scalar_Expand_BySize_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [llvm_anyvector_ty],
+                [IntrNoMem]>;
+  class AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMTruncatedType<0>],
+                [IntrNoMem]>;
+  class AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMTruncatedType<0>, llvm_i32_ty],
+                [IntrNoMem]>;
+  class AdvSIMD_2VectorArg_Tied_Narrow_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty],
+                [IntrNoMem]>;
+
+  class AdvSIMD_3VectorArg_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+                [IntrNoMem]>;
+  class AdvSIMD_3VectorArg_Scalar_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
+                [IntrNoMem]>;
+  class AdvSIMD_3VectorArg_Tied_Narrow_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty,
+                 LLVMMatchType<1>], [IntrNoMem]>;
+  class AdvSIMD_3VectorArg_Scalar_Tied_Narrow_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, llvm_i32_ty],
+                [IntrNoMem]>;
+  class AdvSIMD_CvtFxToFP_Intrinsic
+    : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty],
+                [IntrNoMem]>;
+  class AdvSIMD_CvtFPToFx_Intrinsic
+    : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty],
+                [IntrNoMem]>;
+}
 
-// Vector Absolute Compare (Floating Point)
-def int_aarch64_neon_vacgeq :
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-def int_aarch64_neon_vacgtq :
-  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
-
-// Vector saturating accumulate
-def int_aarch64_neon_suqadd : Neon_2Arg_Intrinsic;
-def int_aarch64_neon_usqadd : Neon_2Arg_Intrinsic;
-
-// Vector Bitwise reverse
-def int_aarch64_neon_rbit : Neon_1Arg_Intrinsic;
-
-// Vector extract and narrow
-def int_aarch64_neon_xtn :
-  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
-
-// Vector floating-point convert
-def int_aarch64_neon_frintn : Neon_1Arg_Intrinsic;
-def int_aarch64_neon_fsqrt : Neon_1Arg_Intrinsic;
-def int_aarch64_neon_vcvtxn :
-  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
-def int_aarch64_neon_vcvtzs :
-  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
-def int_aarch64_neon_vcvtzu :
-  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
-
-// Vector maxNum (Floating Point)
-def int_aarch64_neon_vmaxnm : Neon_2Arg_Intrinsic;
-
-// Vector minNum (Floating Point)
-def int_aarch64_neon_vminnm : Neon_2Arg_Intrinsic;
-
-// Vector Pairwise maxNum (Floating Point)
-def int_aarch64_neon_vpmaxnm : Neon_2Arg_Intrinsic;
-
-// Vector Pairwise minNum (Floating Point)
-def int_aarch64_neon_vpminnm : Neon_2Arg_Intrinsic;
-
-// Vector Multiply Extended and Scalar Multiply Extended (Floating Point)
-def int_aarch64_neon_vmulx :
-  Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>;
-
-class Neon_N2V_Intrinsic
-  : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_i32_ty],
-              [IntrNoMem]>;
-class Neon_N3V_Intrinsic
-  : Intrinsic<[llvm_anyvector_ty],
-              [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
-              [IntrNoMem]>;
-class Neon_N2V_Narrow_Intrinsic
-  : Intrinsic<[llvm_anyvector_ty],
-              [LLVMExtendedType<0>, llvm_i32_ty],
-              [IntrNoMem]>;
-
-// Vector rounding shift right by immediate (Signed)
-def int_aarch64_neon_vsrshr : Neon_N2V_Intrinsic;
-def int_aarch64_neon_vurshr : Neon_N2V_Intrinsic;
-def int_aarch64_neon_vsqshlu : Neon_N2V_Intrinsic;
-
-def int_aarch64_neon_vsri : Neon_N3V_Intrinsic;
-def int_aarch64_neon_vsli : Neon_N3V_Intrinsic;
-
-def int_aarch64_neon_vsqshrun : Neon_N2V_Narrow_Intrinsic;
-def int_aarch64_neon_vrshrn : Neon_N2V_Narrow_Intrinsic;
-def int_aarch64_neon_vsqrshrun : Neon_N2V_Narrow_Intrinsic;
-def int_aarch64_neon_vsqshrn : Neon_N2V_Narrow_Intrinsic;
-def int_aarch64_neon_vuqshrn : Neon_N2V_Narrow_Intrinsic;
-def int_aarch64_neon_vsqrshrn : Neon_N2V_Narrow_Intrinsic;
-def int_aarch64_neon_vuqrshrn : Neon_N2V_Narrow_Intrinsic;
-
-// Vector across
-class Neon_Across_Intrinsic
-  : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
-
-def int_aarch64_neon_saddlv : Neon_Across_Intrinsic;
-def int_aarch64_neon_uaddlv : Neon_Across_Intrinsic;
-def int_aarch64_neon_smaxv  : Neon_Across_Intrinsic;
-def int_aarch64_neon_umaxv  : Neon_Across_Intrinsic;
-def int_aarch64_neon_sminv  : Neon_Across_Intrinsic;
-def int_aarch64_neon_uminv  : Neon_Across_Intrinsic;
-def int_aarch64_neon_vaddv  : Neon_Across_Intrinsic;
-def int_aarch64_neon_vmaxv :
-  Intrinsic<[llvm_float_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-def int_aarch64_neon_vminv :
-  Intrinsic<[llvm_float_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-def int_aarch64_neon_vmaxnmv :
-  Intrinsic<[llvm_float_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-def int_aarch64_neon_vminnmv :
-  Intrinsic<[llvm_float_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-
-// Vector Table Lookup.
-def int_aarch64_neon_vtbl1 :
-  Intrinsic<[llvm_anyvector_ty],
-            [llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>;
-
-def int_aarch64_neon_vtbl2 :
-  Intrinsic<[llvm_anyvector_ty],
-            [llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>],
-            [IntrNoMem]>;
-
-def int_aarch64_neon_vtbl3 :
-  Intrinsic<[llvm_anyvector_ty],
-            [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty,
-             LLVMMatchType<0>], [IntrNoMem]>;
-
-def int_aarch64_neon_vtbl4 :
-  Intrinsic<[llvm_anyvector_ty],
-            [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty,
-             llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>;
-
-// Vector Table Extension.
-// Some elements of the destination vector may not be updated, so the original
-// value of that vector is passed as the first argument. The next 1-4
-// arguments after that are the table.
-def int_aarch64_neon_vtbx1 :
-  Intrinsic<[llvm_anyvector_ty],
-            [LLVMMatchType<0>, llvm_v16i8_ty, LLVMMatchType<0>],
-            [IntrNoMem]>;
-
-def int_aarch64_neon_vtbx2 :
-  Intrinsic<[llvm_anyvector_ty],
-            [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty,
-             LLVMMatchType<0>], [IntrNoMem]>;
-
-def int_aarch64_neon_vtbx3 :
-  Intrinsic<[llvm_anyvector_ty],
-            [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty,
-             llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>;
-
-def int_aarch64_neon_vtbx4 :
-  Intrinsic<[llvm_anyvector_ty],
-            [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty,
-             llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>],
-            [IntrNoMem]>;
-
-// Vector Load/store
-def int_aarch64_neon_vld1x2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
-                                        [llvm_ptr_ty, llvm_i32_ty],
-                                        [IntrReadArgMem]>;
-def int_aarch64_neon_vld1x3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
-                                         LLVMMatchType<0>],
-                                        [llvm_ptr_ty, llvm_i32_ty],
-                                        [IntrReadArgMem]>;
-def int_aarch64_neon_vld1x4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
-                                         LLVMMatchType<0>, LLVMMatchType<0>],
-                                        [llvm_ptr_ty, llvm_i32_ty],
-                                        [IntrReadArgMem]>;
-
-def int_aarch64_neon_vst1x2 : Intrinsic<[],
-                                        [llvm_ptr_ty, llvm_anyvector_ty,
-                                         LLVMMatchType<0>, llvm_i32_ty],
-                                        [IntrReadWriteArgMem]>;
-def int_aarch64_neon_vst1x3 : Intrinsic<[],
-                                        [llvm_ptr_ty, llvm_anyvector_ty,
-                                         LLVMMatchType<0>, LLVMMatchType<0>,
-                                         llvm_i32_ty], [IntrReadWriteArgMem]>;
-def int_aarch64_neon_vst1x4 : Intrinsic<[],
-                                        [llvm_ptr_ty, llvm_anyvector_ty,
-                                         LLVMMatchType<0>, LLVMMatchType<0>,
-                                         LLVMMatchType<0>, llvm_i32_ty],
-                                        [IntrReadWriteArgMem]>;
-
-// Scalar Add
-def int_aarch64_neon_vaddds :
-  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
-def int_aarch64_neon_vadddu :
-  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
-
-
-// Scalar Sub
-def int_aarch64_neon_vsubds :
-  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
-def int_aarch64_neon_vsubdu :
-  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
-
-
-// Scalar Shift
-// Scalar Shift Left
-def int_aarch64_neon_vshlds :
-  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
-def int_aarch64_neon_vshldu :
-  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
-
-// Scalar Saturating Shift Left
-def int_aarch64_neon_vqshls : Neon_2Arg_Intrinsic;
-def
int_aarch64_neon_vqshlu : Neon_2Arg_Intrinsic; - -// Scalar Shift Rouding Left -def int_aarch64_neon_vrshlds : - Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>; -def int_aarch64_neon_vrshldu : - Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>; - -// Scalar Saturating Rounding Shift Left -def int_aarch64_neon_vqrshls : Neon_2Arg_Intrinsic; -def int_aarch64_neon_vqrshlu : Neon_2Arg_Intrinsic; - -// Scalar Reduce Pairwise Add. -def int_aarch64_neon_vpadd : - Intrinsic<[llvm_v1i64_ty], [llvm_v2i64_ty],[IntrNoMem]>; -def int_aarch64_neon_vpfadd : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; - -// Scalar Reduce Pairwise Floating Point Max/Min. -def int_aarch64_neon_vpmax : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; -def int_aarch64_neon_vpmin : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; - -// Scalar Reduce Pairwise Floating Point Maxnm/Minnm. -def int_aarch64_neon_vpfmaxnm : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; -def int_aarch64_neon_vpfminnm : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; - -// Scalar Signed Integer Convert To Floating-point -def int_aarch64_neon_vcvtint2fps : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; - -// Scalar Unsigned Integer Convert To Floating-point -def int_aarch64_neon_vcvtint2fpu : - Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; - -// Scalar Floating-point Convert -def int_aarch64_neon_fcvtxn : - Intrinsic<[llvm_float_ty], [llvm_double_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtns : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtnu : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtps : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtpu : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtms : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtmu : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtas : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtau : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtzs : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; -def int_aarch64_neon_fcvtzu : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty], [IntrNoMem]>; - -// Scalar Floating-point Reciprocal Estimate. -def int_aarch64_neon_vrecpe : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - -// Scalar Floating-point Reciprocal Exponent -def int_aarch64_neon_vrecpx : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - -// Scalar Floating-point Reciprocal Square Root Estimate -def int_aarch64_neon_vrsqrte : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - -// Scalar Floating-point Reciprocal Step -def int_aarch64_neon_vrecps : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - -// Scalar Floating-point Reciprocal Square Root Step -def int_aarch64_neon_vrsqrts : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - -// Compare with vector operands. 
-class Neon_Cmp_Intrinsic : - Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_anyvector_ty], - [IntrNoMem]>; - -// Floating-point compare with scalar operands. -class Neon_Float_Cmp_Intrinsic : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty, llvm_anyfloat_ty], - [IntrNoMem]>; - -// Scalar Compare Equal -def int_aarch64_neon_vceq : Neon_Cmp_Intrinsic; -def int_aarch64_neon_fceq : Neon_Float_Cmp_Intrinsic; - -// Scalar Compare Greater-Than or Equal -def int_aarch64_neon_vcge : Neon_Cmp_Intrinsic; -def int_aarch64_neon_vchs : Neon_Cmp_Intrinsic; -def int_aarch64_neon_fcge : Neon_Float_Cmp_Intrinsic; -def int_aarch64_neon_fchs : Neon_Float_Cmp_Intrinsic; - -// Scalar Compare Less-Than or Equal -def int_aarch64_neon_vclez : Neon_Cmp_Intrinsic; -def int_aarch64_neon_fclez : Neon_Float_Cmp_Intrinsic; - -// Scalar Compare Less-Than -def int_aarch64_neon_vcltz : Neon_Cmp_Intrinsic; -def int_aarch64_neon_fcltz : Neon_Float_Cmp_Intrinsic; - -// Scalar Compare Greater-Than -def int_aarch64_neon_vcgt : Neon_Cmp_Intrinsic; -def int_aarch64_neon_vchi : Neon_Cmp_Intrinsic; -def int_aarch64_neon_fcgt : Neon_Float_Cmp_Intrinsic; -def int_aarch64_neon_fchi : Neon_Float_Cmp_Intrinsic; - -// Scalar Compare Bitwise Test Bits -def int_aarch64_neon_vtstd : Neon_Cmp_Intrinsic; - -// Scalar Floating-point Absolute Compare Greater Than Or Equal -def int_aarch64_neon_vcage : Neon_Cmp_Intrinsic; -def int_aarch64_neon_fcage : Neon_Float_Cmp_Intrinsic; - -// Scalar Floating-point Absolute Compare Greater Than -def int_aarch64_neon_vcagt : Neon_Cmp_Intrinsic; -def int_aarch64_neon_fcagt : Neon_Float_Cmp_Intrinsic; - -// Scalar Signed Saturating Accumulated of Unsigned Value -def int_aarch64_neon_vuqadd : Neon_2Arg_Intrinsic; - -// Scalar Unsigned Saturating Accumulated of Signed Value -def int_aarch64_neon_vsqadd : Neon_2Arg_Intrinsic; - -// Scalar Absolute Value -def int_aarch64_neon_vabs : - Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty], [IntrNoMem]>; - -// Scalar Absolute Difference -def int_aarch64_neon_vabd : - Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - -// Scalar Negate Value -def int_aarch64_neon_vneg : - Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty], [IntrNoMem]>; - -// Signed Saturating Doubling Multiply-Add Long -def int_aarch64_neon_vqdmlal : Neon_3Arg_Long_Intrinsic; - -// Signed Saturating Doubling Multiply-Subtract Long -def int_aarch64_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic; - -def int_aarch64_neon_vmull_p64 : - Intrinsic<[llvm_v16i8_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>; +// Arithmetic ops -class Neon_2Arg_ShiftImm_Intrinsic - : Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_i32_ty], [IntrNoMem]>; +let Properties = [IntrNoMem] in { + // Vector Add Across Lanes + def int_aarch64_neon_saddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_uaddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_faddv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + + // Vector Long Add Across Lanes + def int_aarch64_neon_saddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_uaddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + + // Vector Halving Add + def int_aarch64_neon_shadd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uhadd : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Rounding Halving Add + def int_aarch64_neon_srhadd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_urhadd : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Saturating Add + def int_aarch64_neon_sqadd : AdvSIMD_2IntArg_Intrinsic; + 
def int_aarch64_neon_suqadd : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_usqadd : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqadd : AdvSIMD_2IntArg_Intrinsic; + + // Vector Add High-Half + // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that + // header is no longer supported. + def int_aarch64_neon_addhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Rounding Add High-Half + def int_aarch64_neon_raddhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Saturating Doubling Multiply High + def int_aarch64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic; + + // Vector Saturating Rounding Doubling Multiply High + def int_aarch64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic; + + // Vector Polynominal Multiply + def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Long Multiply + def int_aarch64_neon_smull : AdvSIMD_2VectorArg_Long_Intrinsic; + def int_aarch64_neon_umull : AdvSIMD_2VectorArg_Long_Intrinsic; + def int_aarch64_neon_pmull : AdvSIMD_2VectorArg_Long_Intrinsic; + + // 64-bit polynomial multiply really returns an i128, which is not legal. Fake + // it with a v16i8. + def int_aarch64_neon_pmull64 : + Intrinsic<[llvm_v16i8_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; + + // Vector Extending Multiply + def int_aarch64_neon_fmulx : AdvSIMD_2FloatArg_Intrinsic { + let Properties = [IntrNoMem, Commutative]; + } + + // Vector Saturating Doubling Long Multiply + def int_aarch64_neon_sqdmull : AdvSIMD_2VectorArg_Long_Intrinsic; + def int_aarch64_neon_sqdmulls_scalar + : Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + + // Vector Halving Subtract + def int_aarch64_neon_shsub : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uhsub : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Saturating Subtract + def int_aarch64_neon_sqsub : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqsub : AdvSIMD_2IntArg_Intrinsic; + + // Vector Subtract High-Half + // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that + // header is no longer supported. 
+ def int_aarch64_neon_subhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Rounding Subtract High-Half + def int_aarch64_neon_rsubhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Compare Absolute Greater-than-or-equal + def int_aarch64_neon_facge : AdvSIMD_2Arg_FloatCompare_Intrinsic; + + // Vector Compare Absolute Greater-than + def int_aarch64_neon_facgt : AdvSIMD_2Arg_FloatCompare_Intrinsic; + + // Vector Absolute Difference + def int_aarch64_neon_sabd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uabd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fabd : AdvSIMD_2VectorArg_Intrinsic; + + // Scalar Absolute Difference + def int_aarch64_sisd_fabd : AdvSIMD_2Scalar_Float_Intrinsic; + + // Vector Max + def int_aarch64_neon_smax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Max Across Lanes + def int_aarch64_neon_smaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_umaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_fmaxv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + def int_aarch64_neon_fmaxnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + + // Vector Min + def int_aarch64_neon_smin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_umin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Min/Max Number + def int_aarch64_neon_fminnm : AdvSIMD_2FloatArg_Intrinsic; + def int_aarch64_neon_fmaxnm : AdvSIMD_2FloatArg_Intrinsic; + + // Vector Min Across Lanes + def int_aarch64_neon_sminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_uminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_fminv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + def int_aarch64_neon_fminnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + + // Pairwise Add + def int_aarch64_neon_addp : AdvSIMD_2VectorArg_Intrinsic; + + // Long Pairwise Add + // FIXME: In theory, we shouldn't need intrinsics for saddlp or + // uaddlp, but tblgen's type inference currently can't handle the + // pattern fragments this ends up generating. 
+ def int_aarch64_neon_saddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; + def int_aarch64_neon_uaddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; -class Neon_3Arg_ShiftImm_Intrinsic - : Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty, llvm_i32_ty], - [IntrNoMem]>; + // Folding Maximum + def int_aarch64_neon_smaxp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_umaxp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmaxp : AdvSIMD_2VectorArg_Intrinsic; + + // Folding Minimum + def int_aarch64_neon_sminp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uminp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fminp : AdvSIMD_2VectorArg_Intrinsic; + + // Reciprocal Estimate/Step + def int_aarch64_neon_frecps : AdvSIMD_2FloatArg_Intrinsic; + def int_aarch64_neon_frsqrts : AdvSIMD_2FloatArg_Intrinsic; + + // Reciprocal Exponent + def int_aarch64_neon_frecpx : AdvSIMD_1FloatArg_Intrinsic; + + // Vector Saturating Shift Left + def int_aarch64_neon_sqshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqshl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Rounding Shift Left + def int_aarch64_neon_srshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_urshl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Saturating Rounding Shift Left + def int_aarch64_neon_sqrshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqrshl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Signed->Unsigned Shift Left by Constant + def int_aarch64_neon_sqshlu : AdvSIMD_2IntArg_Intrinsic; + + // Vector Signed->Unsigned Narrowing Saturating Shift Right by Constant + def int_aarch64_neon_sqshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Signed->Unsigned Rounding Narrowing Saturating Shift Right by Const + def int_aarch64_neon_sqrshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Narrowing Shift Right by Constant + def int_aarch64_neon_sqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + def int_aarch64_neon_uqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Rounding Narrowing Shift Right by Constant + def int_aarch64_neon_rshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Rounding Narrowing Saturating Shift Right by Constant + def int_aarch64_neon_sqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + def int_aarch64_neon_uqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Shift Left + def int_aarch64_neon_sshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_ushl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Widening Shift Left by Constant + def int_aarch64_neon_shll : AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic; + def int_aarch64_neon_sshll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; + def int_aarch64_neon_ushll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; + + // Vector Shift Right by Constant and Insert + def int_aarch64_neon_vsri : AdvSIMD_3VectorArg_Scalar_Intrinsic; + + // Vector Shift Left by Constant and Insert + def int_aarch64_neon_vsli : AdvSIMD_3VectorArg_Scalar_Intrinsic; + + // Vector Saturating Narrow + def int_aarch64_neon_scalar_sqxtn: AdvSIMD_1IntArg_Narrow_Intrinsic; + def int_aarch64_neon_scalar_uqxtn : AdvSIMD_1IntArg_Narrow_Intrinsic; + def int_aarch64_neon_sqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; + def int_aarch64_neon_uqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; + + // Vector Saturating Extract and Unsigned Narrow + def int_aarch64_neon_scalar_sqxtun : AdvSIMD_1IntArg_Narrow_Intrinsic; + def int_aarch64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic; + + // Vector Absolute Value + def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic; 
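Like the rest of this group, the narrowing shifts above are overloaded only on their result type: the double-width source is implied by LLVMExtendedType<0>, and the shift amount is a plain i32 that must be an immediate. A minimal sketch of one instantiation, assuming a v4i32-to-v4i16 narrowing (illustrative, not part of the patch):

    declare <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32>, i32)

    define <4 x i16> @narrow_by_3(<4 x i32> %v) {
      ; saturating shift right by 3, then narrow each lane to 16 bits
      %r = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %v, i32 3)
      ret <4 x i16> %r
    }
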
+ + // Vector Saturating Absolute Value + def int_aarch64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic; + + // Vector Saturating Negation + def int_aarch64_neon_sqneg : AdvSIMD_1IntArg_Intrinsic; + + // Vector Count Leading Sign Bits + def int_aarch64_neon_cls : AdvSIMD_1VectorArg_Intrinsic; + + // Vector Reciprocal Estimate + def int_aarch64_neon_urecpe : AdvSIMD_1VectorArg_Intrinsic; + def int_aarch64_neon_frecpe : AdvSIMD_1FloatArg_Intrinsic; + + // Vector Square Root Estimate + def int_aarch64_neon_ursqrte : AdvSIMD_1VectorArg_Intrinsic; + def int_aarch64_neon_frsqrte : AdvSIMD_1FloatArg_Intrinsic; + + // Vector Bitwise Reverse + def int_aarch64_neon_rbit : AdvSIMD_1VectorArg_Intrinsic; + + // Vector Conversions Between Half-Precision and Single-Precision. + def int_aarch64_neon_vcvtfp2hf + : Intrinsic<[llvm_v4i16_ty], [llvm_v4f32_ty], [IntrNoMem]>; + def int_aarch64_neon_vcvthf2fp + : Intrinsic<[llvm_v4f32_ty], [llvm_v4i16_ty], [IntrNoMem]>; -// Scalar Shift Right (Immediate) -def int_aarch64_neon_vshrds_n : Neon_2Arg_ShiftImm_Intrinsic; -def int_aarch64_neon_vshrdu_n : Neon_2Arg_ShiftImm_Intrinsic; + // Vector Conversions Between Floating-point and Fixed-point. + def int_aarch64_neon_vcvtfp2fxs : AdvSIMD_CvtFPToFx_Intrinsic; + def int_aarch64_neon_vcvtfp2fxu : AdvSIMD_CvtFPToFx_Intrinsic; + def int_aarch64_neon_vcvtfxs2fp : AdvSIMD_CvtFxToFP_Intrinsic; + def int_aarch64_neon_vcvtfxu2fp : AdvSIMD_CvtFxToFP_Intrinsic; -// Scalar Shift Right and Accumulate (Immediate) -def int_aarch64_neon_vsrads_n : Neon_3Arg_ShiftImm_Intrinsic; -def int_aarch64_neon_vsradu_n : Neon_3Arg_ShiftImm_Intrinsic; + // Vector FP->Int Conversions + def int_aarch64_neon_fcvtas : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtau : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtms : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtmu : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtns : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtnu : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtps : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtpu : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtzs : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtzu : AdvSIMD_FPToIntRounding_Intrinsic; -// Scalar Rounding Shift Right and Accumulate (Immediate) -def int_aarch64_neon_vrsrads_n : Neon_3Arg_ShiftImm_Intrinsic; -def int_aarch64_neon_vrsradu_n : Neon_3Arg_ShiftImm_Intrinsic; + // Vector FP Rounding: only ties to even is unrepresented by a normal + // intrinsic. + def int_aarch64_neon_frintn : AdvSIMD_1FloatArg_Intrinsic; -// Scalar Shift Left (Immediate) -def int_aarch64_neon_vshld_n : Neon_2Arg_ShiftImm_Intrinsic; + // Scalar FP->Int conversions -// Scalar Saturating Shift Left (Immediate) -def int_aarch64_neon_vqshls_n : Neon_N2V_Intrinsic; -def int_aarch64_neon_vqshlu_n : Neon_N2V_Intrinsic; + // Vector FP Inexact Narrowing + def int_aarch64_neon_fcvtxn : AdvSIMD_1VectorArg_Expand_Intrinsic; + + // Scalar FP Inexact Narrowing + def int_aarch64_sisd_fcvtxn : Intrinsic<[llvm_float_ty], [llvm_double_ty], + [IntrNoMem]>; +} + +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
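The fixed-point conversions in the block that just closed (vcvtfp2fxs and friends) take the number of fractional bits as a trailing i32 immediate and are overloaded on both the integer and the floating-point vector type, so both appear in the mangled name. A hypothetical call, for illustration only:

    declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32)

    define <2 x i32> @to_fixed(<2 x float> %v) {
      ; signed fixed-point result with 9 fractional bits
      %r = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %v, i32 9)
      ret <2 x i32> %r
    }
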
+  class AdvSIMD_2Vector2Index_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [llvm_anyvector_ty, llvm_i64_ty, LLVMMatchType<0>, llvm_i64_ty],
+                [IntrNoMem]>;
+}
-// Scalar Signed Saturating Shift Left Unsigned (Immediate)
-def int_aarch64_neon_vqshlus_n : Neon_N2V_Intrinsic;
+// Vector element to element moves
+def int_aarch64_neon_vcopy_lane: AdvSIMD_2Vector2Index_Intrinsic;
+
+let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
+  class AdvSIMD_1Vec_Load_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadArgMem]>;
+  class AdvSIMD_1Vec_Store_Lane_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadWriteArgMem, NoCapture<2>]>;
+
+  class AdvSIMD_2Vec_Load_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                [LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadArgMem]>;
+  class AdvSIMD_2Vec_Load_Lane_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                [LLVMMatchType<0>, LLVMMatchType<0>,
+                 llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadArgMem]>;
+  class AdvSIMD_2Vec_Store_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadWriteArgMem, NoCapture<2>]>;
+  class AdvSIMD_2Vec_Store_Lane_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadWriteArgMem, NoCapture<3>]>;
+
+  class AdvSIMD_3Vec_Load_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
+                [LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadArgMem]>;
+  class AdvSIMD_3Vec_Load_Lane_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
+                [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+                 llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadArgMem]>;
+  class AdvSIMD_3Vec_Store_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     LLVMMatchType<0>, LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadWriteArgMem, NoCapture<3>]>;
+  class AdvSIMD_3Vec_Store_Lane_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty,
+                     LLVMMatchType<0>, LLVMMatchType<0>,
+                     llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadWriteArgMem, NoCapture<4>]>;
+
+  class AdvSIMD_4Vec_Load_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>],
+                [LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadArgMem]>;
+  class AdvSIMD_4Vec_Load_Lane_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>],
+                [LLVMMatchType<0>, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>,
+                 llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadArgMem]>;
+  class AdvSIMD_4Vec_Store_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     LLVMMatchType<0>, LLVMMatchType<0>,
+                     LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadWriteArgMem, NoCapture<4>]>;
+  class AdvSIMD_4Vec_Store_Lane_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     LLVMMatchType<0>, LLVMMatchType<0>,
+                     llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadWriteArgMem, NoCapture<5>]>;
+}
-// Scalar Signed Fixed-point Convert To Floating-Point (Immediate)
-def int_aarch64_neon_vcvtfxs2fp_n :
-  Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty, llvm_i32_ty], [IntrNoMem]>;
+// Memory ops
-// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate)
-def int_aarch64_neon_vcvtfxu2fp_n :
-  Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_aarch64_neon_ld1x2 : AdvSIMD_2Vec_Load_Intrinsic;
+def int_aarch64_neon_ld1x3 : AdvSIMD_3Vec_Load_Intrinsic;
+def int_aarch64_neon_ld1x4 : AdvSIMD_4Vec_Load_Intrinsic;
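The ld1x2/ld1x3/ld1x4 definitions above return all of the loaded registers as a single aggregate, so a caller unpacks them with extractvalue. A sketch of one use under the usual v4i32/p0v4i32 mangling (the function name is illustrative, not part of the patch):

    declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0v4i32(<4 x i32>*)

    define <4 x i32> @sum_pair(<4 x i32>* %p) {
      ; one call loads two consecutive <4 x i32> vectors
      %pair = call { <4 x i32>, <4 x i32> }
                   @llvm.aarch64.neon.ld1x2.v4i32.p0v4i32(<4 x i32>* %p)
      %a = extractvalue { <4 x i32>, <4 x i32> } %pair, 0
      %b = extractvalue { <4 x i32>, <4 x i32> } %pair, 1
      %s = add <4 x i32> %a, %b
      ret <4 x i32> %s
    }

-// Scalar Floating-point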
Convert To Signed Fixed-point (Immediate) -def int_aarch64_neon_vcvtfp2fxs_n : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>; +def int_aarch64_neon_st1x2 : AdvSIMD_2Vec_Store_Intrinsic; +def int_aarch64_neon_st1x3 : AdvSIMD_3Vec_Store_Intrinsic; +def int_aarch64_neon_st1x4 : AdvSIMD_4Vec_Store_Intrinsic; -// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate) -def int_aarch64_neon_vcvtfp2fxu_n : - Intrinsic<[llvm_anyvector_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>; +def int_aarch64_neon_ld2 : AdvSIMD_2Vec_Load_Intrinsic; +def int_aarch64_neon_ld3 : AdvSIMD_3Vec_Load_Intrinsic; +def int_aarch64_neon_ld4 : AdvSIMD_4Vec_Load_Intrinsic; +def int_aarch64_neon_ld2lane : AdvSIMD_2Vec_Load_Lane_Intrinsic; +def int_aarch64_neon_ld3lane : AdvSIMD_3Vec_Load_Lane_Intrinsic; +def int_aarch64_neon_ld4lane : AdvSIMD_4Vec_Load_Lane_Intrinsic; + +def int_aarch64_neon_ld2r : AdvSIMD_2Vec_Load_Intrinsic; +def int_aarch64_neon_ld3r : AdvSIMD_3Vec_Load_Intrinsic; +def int_aarch64_neon_ld4r : AdvSIMD_4Vec_Load_Intrinsic; + +def int_aarch64_neon_st2 : AdvSIMD_2Vec_Store_Intrinsic; +def int_aarch64_neon_st3 : AdvSIMD_3Vec_Store_Intrinsic; +def int_aarch64_neon_st4 : AdvSIMD_4Vec_Store_Intrinsic; + +def int_aarch64_neon_st2lane : AdvSIMD_2Vec_Store_Lane_Intrinsic; +def int_aarch64_neon_st3lane : AdvSIMD_3Vec_Store_Lane_Intrinsic; +def int_aarch64_neon_st4lane : AdvSIMD_4Vec_Store_Lane_Intrinsic; + +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". + class AdvSIMD_Tbl1_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbl2_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_Tbl3_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, + LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbl4_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, + LLVMMatchType<0>], + [IntrNoMem]>; + + class AdvSIMD_Tbx1_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbx2_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, + LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbx3_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, + llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbx4_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, + llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; +} +def int_aarch64_neon_tbl1 : AdvSIMD_Tbl1_Intrinsic; +def int_aarch64_neon_tbl2 : AdvSIMD_Tbl2_Intrinsic; +def int_aarch64_neon_tbl3 : AdvSIMD_Tbl3_Intrinsic; +def int_aarch64_neon_tbl4 : AdvSIMD_Tbl4_Intrinsic; + +def int_aarch64_neon_tbx1 : AdvSIMD_Tbx1_Intrinsic; +def int_aarch64_neon_tbx2 : AdvSIMD_Tbx2_Intrinsic; +def int_aarch64_neon_tbx3 : AdvSIMD_Tbx3_Intrinsic; +def int_aarch64_neon_tbx4 : AdvSIMD_Tbx4_Intrinsic; + +let TargetPrefix = "aarch64" in { + class Crypto_AES_DataKey_Intrinsic + : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; + + class Crypto_AES_Data_Intrinsic + : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; + + // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule + // (v4i32). 
+ class Crypto_SHA_5Hash4Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + + // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule + // (v4i32). + class Crypto_SHA_1Hash_Intrinsic + : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + + // SHA intrinsic taking 8 words of the schedule + class Crypto_SHA_8Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; + + // SHA intrinsic taking 12 words of the schedule + class Crypto_SHA_12Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + + // SHA intrinsic taking 8 words of the hash and 4 of the schedule. + class Crypto_SHA_8Hash4Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; +} + +// AES +def int_aarch64_crypto_aese : Crypto_AES_DataKey_Intrinsic; +def int_aarch64_crypto_aesd : Crypto_AES_DataKey_Intrinsic; +def int_aarch64_crypto_aesmc : Crypto_AES_Data_Intrinsic; +def int_aarch64_crypto_aesimc : Crypto_AES_Data_Intrinsic; + +// SHA1 +def int_aarch64_crypto_sha1c : Crypto_SHA_5Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha1p : Crypto_SHA_5Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha1m : Crypto_SHA_5Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha1h : Crypto_SHA_1Hash_Intrinsic; + +def int_aarch64_crypto_sha1su0 : Crypto_SHA_12Schedule_Intrinsic; +def int_aarch64_crypto_sha1su1 : Crypto_SHA_8Schedule_Intrinsic; + +// SHA256 +def int_aarch64_crypto_sha256h : Crypto_SHA_8Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha256h2 : Crypto_SHA_8Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha256su0 : Crypto_SHA_8Schedule_Intrinsic; +def int_aarch64_crypto_sha256su1 : Crypto_SHA_12Schedule_Intrinsic; + +//===----------------------------------------------------------------------===// +// CRC32 + +let TargetPrefix = "aarch64" in { + +def int_aarch64_crc32b : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32cb : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32h : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32ch : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32w : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32x : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], + [IntrNoMem]>; +def int_aarch64_crc32cx : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], + [IntrNoMem]>; } diff --git a/include/llvm/IR/IntrinsicsARM.td b/include/llvm/IR/IntrinsicsARM.td index 482f98e..d19d7b8 100644 --- a/include/llvm/IR/IntrinsicsARM.td +++ b/include/llvm/IR/IntrinsicsARM.td @@ -122,7 +122,13 @@ def int_arm_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], //===----------------------------------------------------------------------===// // HINT -def int_arm_sevl : Intrinsic<[], []>; + +def int_arm_hint : Intrinsic<[], [llvm_i32_ty]>; + +//===----------------------------------------------------------------------===// +// UND (reserved undefined sequence) + +def int_arm_undefined : Intrinsic<[], [llvm_i32_ty]>; //===----------------------------------------------------------------------===// // Advanced SIMD (NEON) diff --git a/include/llvm/IR/IntrinsicsARM64.td 
b/include/llvm/IR/IntrinsicsARM64.td deleted file mode 100644 index d7f307e..0000000 --- a/include/llvm/IR/IntrinsicsARM64.td +++ /dev/null @@ -1,628 +0,0 @@ -//===- IntrinsicsARM64.td - Defines ARM64 intrinsics -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines all of the ARM64-specific intrinsics. -// -//===----------------------------------------------------------------------===// - -let TargetPrefix = "arm64" in { - -def int_arm64_ldxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; -def int_arm64_stxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>; -def int_arm64_clrex : Intrinsic<[]>; - -def int_arm64_ldxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>; -def int_arm64_stxp : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_i64_ty, - llvm_ptr_ty]>; - -def int_arm64_sdiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, - LLVMMatchType<0>], [IntrNoMem]>; -def int_arm64_udiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, - LLVMMatchType<0>], [IntrNoMem]>; -} - -//===----------------------------------------------------------------------===// -// Advanced SIMD (NEON) - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". - class AdvSIMD_2Scalar_Float_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - - class AdvSIMD_FPToIntRounding_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; - - class AdvSIMD_1IntArg_Intrinsic - : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_1FloatArg_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Expand_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Long_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [LLVMTruncatedType<0>], [IntrNoMem]>; - class AdvSIMD_1IntArg_Narrow_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Narrow_Intrinsic - : Intrinsic<[llvm_anyint_ty], [LLVMExtendedType<0>], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Int_Across_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyvector_ty], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Float_Across_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; - - class AdvSIMD_2IntArg_Intrinsic - : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_2FloatArg_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Compare_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>], - [IntrNoMem]>; - class AdvSIMD_2Arg_FloatCompare_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Long_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMTruncatedType<0>, LLVMTruncatedType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Wide_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, 
LLVMTruncatedType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMExtendedType<0>, LLVMExtendedType<0>], - [IntrNoMem]>; - class AdvSIMD_2Arg_Scalar_Narrow_Intrinsic - : Intrinsic<[llvm_anyint_ty], - [LLVMExtendedType<0>, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Scalar_Expand_BySize_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMTruncatedType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMTruncatedType<0>, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Tied_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty], - [IntrNoMem]>; - - class AdvSIMD_3VectorArg_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_3VectorArg_Scalar_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_3VectorArg_Tied_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, - LLVMMatchType<1>], [IntrNoMem]>; - class AdvSIMD_3VectorArg_Scalar_Tied_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_CvtFxToFP_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_CvtFPToFx_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], - [IntrNoMem]>; -} - -// Arithmetic ops - -let Properties = [IntrNoMem] in { - // Vector Add Across Lanes - def int_arm64_neon_saddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_uaddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_faddv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - - // Vector Long Add Across Lanes - def int_arm64_neon_saddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_uaddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - - // Vector Halving Add - def int_arm64_neon_shadd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uhadd : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Rounding Halving Add - def int_arm64_neon_srhadd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_urhadd : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Saturating Add - def int_arm64_neon_sqadd : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_suqadd : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_usqadd : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqadd : AdvSIMD_2IntArg_Intrinsic; - - // Vector Add High-Half - // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that - // header is no longer supported. 
- def int_arm64_neon_addhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Rounding Add High-Half - def int_arm64_neon_raddhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Saturating Doubling Multiply High - def int_arm64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic; - - // Vector Saturating Rounding Doubling Multiply High - def int_arm64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic; - - // Vector Polynominal Multiply - def int_arm64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Long Multiply - def int_arm64_neon_smull : AdvSIMD_2VectorArg_Long_Intrinsic; - def int_arm64_neon_umull : AdvSIMD_2VectorArg_Long_Intrinsic; - def int_arm64_neon_pmull : AdvSIMD_2VectorArg_Long_Intrinsic; - - // 64-bit polynomial multiply really returns an i128, which is not legal. Fake - // it with a v16i8. - def int_arm64_neon_pmull64 : - Intrinsic<[llvm_v16i8_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; - - // Vector Extending Multiply - def int_arm64_neon_fmulx : AdvSIMD_2FloatArg_Intrinsic; - - // Vector Saturating Doubling Long Multiply - def int_arm64_neon_sqdmull : AdvSIMD_2VectorArg_Long_Intrinsic; - def int_arm64_neon_sqdmulls_scalar - : Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - - // Vector Halving Subtract - def int_arm64_neon_shsub : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uhsub : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Saturating Subtract - def int_arm64_neon_sqsub : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqsub : AdvSIMD_2IntArg_Intrinsic; - - // Vector Subtract High-Half - // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that - // header is no longer supported. - def int_arm64_neon_subhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Rounding Subtract High-Half - def int_arm64_neon_rsubhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Compare Absolute Greater-than-or-equal - def int_arm64_neon_facge : AdvSIMD_2Arg_FloatCompare_Intrinsic; - - // Vector Compare Absolute Greater-than - def int_arm64_neon_facgt : AdvSIMD_2Arg_FloatCompare_Intrinsic; - - // Vector Absolute Difference - def int_arm64_neon_sabd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uabd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fabd : AdvSIMD_2VectorArg_Intrinsic; - - // Scalar Absolute Difference - def int_arm64_sisd_fabd : AdvSIMD_2Scalar_Float_Intrinsic; - - // Vector Max - def int_arm64_neon_smax : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_umax : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Max Across Lanes - def int_arm64_neon_smaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_umaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_fmaxv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - def int_arm64_neon_fmaxnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - - // Vector Min - def int_arm64_neon_smin : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_umin : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Min/Max Number - def int_arm64_neon_fminnm : AdvSIMD_2FloatArg_Intrinsic; - def int_arm64_neon_fmaxnm : AdvSIMD_2FloatArg_Intrinsic; - - // Vector Min Across Lanes - def int_arm64_neon_sminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_uminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_fminv : 
AdvSIMD_1VectorArg_Float_Across_Intrinsic; - def int_arm64_neon_fminnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - - // Pairwise Add - def int_arm64_neon_addp : AdvSIMD_2VectorArg_Intrinsic; - - // Long Pairwise Add - // FIXME: In theory, we shouldn't need intrinsics for saddlp or - // uaddlp, but tblgen's type inference currently can't handle the - // pattern fragments this ends up generating. - def int_arm64_neon_saddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; - def int_arm64_neon_uaddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; - - // Folding Maximum - def int_arm64_neon_smaxp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_umaxp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmaxp : AdvSIMD_2VectorArg_Intrinsic; - - // Folding Minimum - def int_arm64_neon_sminp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uminp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fminp : AdvSIMD_2VectorArg_Intrinsic; - - // Reciprocal Estimate/Step - def int_arm64_neon_frecps : AdvSIMD_2FloatArg_Intrinsic; - def int_arm64_neon_frsqrts : AdvSIMD_2FloatArg_Intrinsic; - - // Reciprocal Exponent - def int_arm64_neon_frecpx : AdvSIMD_1FloatArg_Intrinsic; - - // Vector Saturating Shift Left - def int_arm64_neon_sqshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqshl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Rounding Shift Left - def int_arm64_neon_srshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_urshl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Saturating Rounding Shift Left - def int_arm64_neon_sqrshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqrshl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Signed->Unsigned Shift Left by Constant - def int_arm64_neon_sqshlu : AdvSIMD_2IntArg_Intrinsic; - - // Vector Signed->Unsigned Narrowing Saturating Shift Right by Constant - def int_arm64_neon_sqshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Signed->Unsigned Rounding Narrowing Saturating Shift Right by Const - def int_arm64_neon_sqrshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Narrowing Shift Right by Constant - def int_arm64_neon_sqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - def int_arm64_neon_uqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Rounding Narrowing Shift Right by Constant - def int_arm64_neon_rshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Rounding Narrowing Saturating Shift Right by Constant - def int_arm64_neon_sqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - def int_arm64_neon_uqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Shift Left - def int_arm64_neon_sshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_ushl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Widening Shift Left by Constant - def int_arm64_neon_shll : AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic; - def int_arm64_neon_sshll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; - def int_arm64_neon_ushll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; - - // Vector Shift Right by Constant and Insert - def int_arm64_neon_vsri : AdvSIMD_3VectorArg_Scalar_Intrinsic; - - // Vector Shift Left by Constant and Insert - def int_arm64_neon_vsli : AdvSIMD_3VectorArg_Scalar_Intrinsic; - - // Vector Saturating Narrow - def int_arm64_neon_scalar_sqxtn: AdvSIMD_1IntArg_Narrow_Intrinsic; - def int_arm64_neon_scalar_uqxtn : AdvSIMD_1IntArg_Narrow_Intrinsic; - def int_arm64_neon_sqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; - def int_arm64_neon_uqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; - - // Vector Saturating Extract and Unsigned Narrow - def 
int_arm64_neon_scalar_sqxtun : AdvSIMD_1IntArg_Narrow_Intrinsic; - def int_arm64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic; - - // Vector Absolute Value - def int_arm64_neon_abs : AdvSIMD_1IntArg_Intrinsic; - - // Vector Saturating Absolute Value - def int_arm64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic; - - // Vector Saturating Negation - def int_arm64_neon_sqneg : AdvSIMD_1IntArg_Intrinsic; - - // Vector Count Leading Sign Bits - def int_arm64_neon_cls : AdvSIMD_1VectorArg_Intrinsic; - - // Vector Reciprocal Estimate - def int_arm64_neon_urecpe : AdvSIMD_1VectorArg_Intrinsic; - def int_arm64_neon_frecpe : AdvSIMD_1FloatArg_Intrinsic; - - // Vector Square Root Estimate - def int_arm64_neon_ursqrte : AdvSIMD_1VectorArg_Intrinsic; - def int_arm64_neon_frsqrte : AdvSIMD_1FloatArg_Intrinsic; - - // Vector Bitwise Reverse - def int_arm64_neon_rbit : AdvSIMD_1VectorArg_Intrinsic; - - // Vector Conversions Between Half-Precision and Single-Precision. - def int_arm64_neon_vcvtfp2hf - : Intrinsic<[llvm_v4i16_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_arm64_neon_vcvthf2fp - : Intrinsic<[llvm_v4f32_ty], [llvm_v4i16_ty], [IntrNoMem]>; - - // Vector Conversions Between Floating-point and Fixed-point. - def int_arm64_neon_vcvtfp2fxs : AdvSIMD_CvtFPToFx_Intrinsic; - def int_arm64_neon_vcvtfp2fxu : AdvSIMD_CvtFPToFx_Intrinsic; - def int_arm64_neon_vcvtfxs2fp : AdvSIMD_CvtFxToFP_Intrinsic; - def int_arm64_neon_vcvtfxu2fp : AdvSIMD_CvtFxToFP_Intrinsic; - - // Vector FP->Int Conversions - def int_arm64_neon_fcvtas : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtau : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtms : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtmu : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtns : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtnu : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtps : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtpu : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtzs : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtzu : AdvSIMD_FPToIntRounding_Intrinsic; - - // Vector FP Rounding: only ties to even is unrepresented by a normal - // intrinsic. - def int_arm64_neon_frintn : AdvSIMD_1FloatArg_Intrinsic; - - // Scalar FP->Int conversions - - // Vector FP Inexact Narrowing - def int_arm64_neon_fcvtxn : AdvSIMD_1VectorArg_Expand_Intrinsic; - - // Scalar FP Inexact Narrowing - def int_arm64_sisd_fcvtxn : Intrinsic<[llvm_float_ty], [llvm_double_ty], - [IntrNoMem]>; -} - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". - class AdvSIMD_2Vector2Index_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty, llvm_i64_ty, LLVMMatchType<0>, llvm_i64_ty], - [IntrNoMem]>; -} - -// Vector element to element moves -def int_arm64_neon_vcopy_lane: AdvSIMD_2Vector2Index_Intrinsic; - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". 
-  class AdvSIMD_1Vec_Load_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty], [LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadArgMem]>;
-  class AdvSIMD_1Vec_Store_Lane_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadWriteArgMem, NoCapture<2>]>;
-
-  class AdvSIMD_2Vec_Load_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
-                [LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadArgMem]>;
-  class AdvSIMD_2Vec_Load_Lane_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
-                [LLVMMatchType<0>, LLVMMatchType<0>,
-                 llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadArgMem]>;
-  class AdvSIMD_2Vec_Store_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadWriteArgMem, NoCapture<2>]>;
-  class AdvSIMD_2Vec_Store_Lane_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadWriteArgMem, NoCapture<3>]>;
-
-  class AdvSIMD_3Vec_Load_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
-                [LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadArgMem]>;
-  class AdvSIMD_3Vec_Load_Lane_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
-                [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
-                 llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadArgMem]>;
-  class AdvSIMD_3Vec_Store_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     LLVMMatchType<0>, LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadWriteArgMem, NoCapture<3>]>;
-  class AdvSIMD_3Vec_Store_Lane_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty,
-                     LLVMMatchType<0>, LLVMMatchType<0>,
-                     llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadWriteArgMem, NoCapture<4>]>;
-
-  class AdvSIMD_4Vec_Load_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
-                 LLVMMatchType<0>, LLVMMatchType<0>],
-                [LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadArgMem]>;
-  class AdvSIMD_4Vec_Load_Lane_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
-                 LLVMMatchType<0>, LLVMMatchType<0>],
-                [LLVMMatchType<0>, LLVMMatchType<0>,
-                 LLVMMatchType<0>, LLVMMatchType<0>,
-                 llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadArgMem]>;
-  class AdvSIMD_4Vec_Store_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     LLVMMatchType<0>, LLVMMatchType<0>,
-                     LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadWriteArgMem, NoCapture<4>]>;
-  class AdvSIMD_4Vec_Store_Lane_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     LLVMMatchType<0>, LLVMMatchType<0>,
-                     llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadWriteArgMem, NoCapture<5>]>;
-}
-
-// Memory ops
-
-def int_arm64_neon_ld1x2 : AdvSIMD_2Vec_Load_Intrinsic;
-def int_arm64_neon_ld1x3 : AdvSIMD_3Vec_Load_Intrinsic;
-def int_arm64_neon_ld1x4 : AdvSIMD_4Vec_Load_Intrinsic;
-
-def int_arm64_neon_st1x2 : AdvSIMD_2Vec_Store_Intrinsic;
-def int_arm64_neon_st1x3 : AdvSIMD_3Vec_Store_Intrinsic;
-def int_arm64_neon_st1x4 : AdvSIMD_4Vec_Store_Intrinsic;
-
-def int_arm64_neon_ld2 : AdvSIMD_2Vec_Load_Intrinsic;
-def int_arm64_neon_ld3 : AdvSIMD_3Vec_Load_Intrinsic;
-def int_arm64_neon_ld4 : AdvSIMD_4Vec_Load_Intrinsic;
-
-def int_arm64_neon_ld2lane : AdvSIMD_2Vec_Load_Lane_Intrinsic;
-def int_arm64_neon_ld3lane : AdvSIMD_3Vec_Load_Lane_Intrinsic;
-def int_arm64_neon_ld4lane : AdvSIMD_4Vec_Load_Lane_Intrinsic;
-
-def int_arm64_neon_ld2r : AdvSIMD_2Vec_Load_Intrinsic;
-def int_arm64_neon_ld3r : AdvSIMD_3Vec_Load_Intrinsic;
-def int_arm64_neon_ld4r : AdvSIMD_4Vec_Load_Intrinsic;
-
-def int_arm64_neon_st2 : AdvSIMD_2Vec_Store_Intrinsic;
-def int_arm64_neon_st3 : AdvSIMD_3Vec_Store_Intrinsic;
-def int_arm64_neon_st4 :
AdvSIMD_4Vec_Store_Intrinsic; - -def int_arm64_neon_st2lane : AdvSIMD_2Vec_Store_Lane_Intrinsic; -def int_arm64_neon_st3lane : AdvSIMD_3Vec_Store_Lane_Intrinsic; -def int_arm64_neon_st4lane : AdvSIMD_4Vec_Store_Lane_Intrinsic; - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". - class AdvSIMD_Tbl1_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbl2_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_Tbl3_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, - LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbl4_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, - LLVMMatchType<0>], - [IntrNoMem]>; - - class AdvSIMD_Tbx1_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbx2_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbx3_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbx4_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; -} -def int_arm64_neon_tbl1 : AdvSIMD_Tbl1_Intrinsic; -def int_arm64_neon_tbl2 : AdvSIMD_Tbl2_Intrinsic; -def int_arm64_neon_tbl3 : AdvSIMD_Tbl3_Intrinsic; -def int_arm64_neon_tbl4 : AdvSIMD_Tbl4_Intrinsic; - -def int_arm64_neon_tbx1 : AdvSIMD_Tbx1_Intrinsic; -def int_arm64_neon_tbx2 : AdvSIMD_Tbx2_Intrinsic; -def int_arm64_neon_tbx3 : AdvSIMD_Tbx3_Intrinsic; -def int_arm64_neon_tbx4 : AdvSIMD_Tbx4_Intrinsic; - -let TargetPrefix = "arm64" in { - class Crypto_AES_DataKey_Intrinsic - : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - - class Crypto_AES_Data_Intrinsic - : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; - - // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule - // (v4i32). - class Crypto_SHA_5Hash4Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty], - [IntrNoMem]>; - - // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule - // (v4i32). - class Crypto_SHA_1Hash_Intrinsic - : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - - // SHA intrinsic taking 8 words of the schedule - class Crypto_SHA_8Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - - // SHA intrinsic taking 12 words of the schedule - class Crypto_SHA_12Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], - [IntrNoMem]>; - - // SHA intrinsic taking 8 words of the hash and 4 of the schedule. 
- class Crypto_SHA_8Hash4Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], - [IntrNoMem]>; -} - -// AES -def int_arm64_crypto_aese : Crypto_AES_DataKey_Intrinsic; -def int_arm64_crypto_aesd : Crypto_AES_DataKey_Intrinsic; -def int_arm64_crypto_aesmc : Crypto_AES_Data_Intrinsic; -def int_arm64_crypto_aesimc : Crypto_AES_Data_Intrinsic; - -// SHA1 -def int_arm64_crypto_sha1c : Crypto_SHA_5Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha1p : Crypto_SHA_5Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha1m : Crypto_SHA_5Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha1h : Crypto_SHA_1Hash_Intrinsic; - -def int_arm64_crypto_sha1su0 : Crypto_SHA_12Schedule_Intrinsic; -def int_arm64_crypto_sha1su1 : Crypto_SHA_8Schedule_Intrinsic; - -// SHA256 -def int_arm64_crypto_sha256h : Crypto_SHA_8Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha256h2 : Crypto_SHA_8Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha256su0 : Crypto_SHA_8Schedule_Intrinsic; -def int_arm64_crypto_sha256su1 : Crypto_SHA_12Schedule_Intrinsic; - -//===----------------------------------------------------------------------===// -// CRC32 - -let TargetPrefix = "arm64" in { - -def int_arm64_crc32b : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32cb : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32h : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32ch : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32w : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32x : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], - [IntrNoMem]>; -def int_arm64_crc32cx : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], - [IntrNoMem]>; -} diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td index 7f72ce8..26dc70a 100644 --- a/include/llvm/IR/IntrinsicsNVVM.td +++ b/include/llvm/IR/IntrinsicsNVVM.td @@ -875,6 +875,14 @@ def int_nvvm_move_ptr : Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>], "llvm.nvvm.move.ptr">; +// For getting the handle from a texture or surface variable +def int_nvvm_texsurf_handle + : Intrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_anyi64ptr_ty], + [IntrNoMem], "llvm.nvvm.texsurf.handle">; +def int_nvvm_texsurf_handle_internal + : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty], + [IntrNoMem], "llvm.nvvm.texsurf.handle.internal">; + /// Error / Warn def int_nvvm_compiler_error : Intrinsic<[], [llvm_anyptr_ty], [], "llvm.nvvm.compiler.error">; @@ -882,6 +890,918 @@ def int_nvvm_compiler_warn : Intrinsic<[], [llvm_anyptr_ty], [], "llvm.nvvm.compiler.warn">; +// Texture Fetch +def int_nvvm_tex_1d_v4f32_i32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.tex.1d.v4f32.i32">; +def int_nvvm_tex_1d_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty], [], + "llvm.nvvm.tex.1d.v4f32.f32">; +def int_nvvm_tex_1d_level_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.1d.level.v4f32.f32">; +def int_nvvm_tex_1d_grad_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, 
llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.1d.grad.v4f32.f32">; +def int_nvvm_tex_1d_v4i32_i32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.tex.1d.v4i32.i32">; +def int_nvvm_tex_1d_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty], [], + "llvm.nvvm.tex.1d.v4i32.f32">; +def int_nvvm_tex_1d_level_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.1d.level.v4i32.f32">; +def int_nvvm_tex_1d_grad_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.1d.grad.v4i32.f32">; + +def int_nvvm_tex_1d_array_v4f32_i32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.tex.1d.array.v4f32.i32">; +def int_nvvm_tex_1d_array_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty], [], + "llvm.nvvm.tex.1d.array.v4f32.f32">; +def int_nvvm_tex_1d_array_level_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.1d.array.level.v4f32.f32">; +def int_nvvm_tex_1d_array_grad_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.1d.array.grad.v4f32.f32">; +def int_nvvm_tex_1d_array_v4i32_i32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.tex.1d.array.v4i32.i32">; +def int_nvvm_tex_1d_array_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty], [], + "llvm.nvvm.tex.1d.array.v4i32.f32">; +def int_nvvm_tex_1d_array_level_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.1d.array.level.v4i32.f32">; +def int_nvvm_tex_1d_array_grad_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.1d.array.grad.v4i32.f32">; + +def int_nvvm_tex_2d_v4f32_i32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.tex.2d.v4f32.i32">; +def int_nvvm_tex_2d_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.2d.v4f32.f32">; +def int_nvvm_tex_2d_level_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.2d.level.v4f32.f32">; +def int_nvvm_tex_2d_grad_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, 
llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.2d.grad.v4f32.f32">; +def int_nvvm_tex_2d_v4i32_i32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.tex.2d.v4i32.i32">; +def int_nvvm_tex_2d_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.2d.v4i32.f32">; +def int_nvvm_tex_2d_level_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.2d.level.v4i32.f32">; +def int_nvvm_tex_2d_grad_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.2d.grad.v4i32.f32">; + +def int_nvvm_tex_2d_array_v4f32_i32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty], [], + "llvm.nvvm.tex.2d.array.v4f32.i32">; +def int_nvvm_tex_2d_array_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.2d.array.v4f32.f32">; +def int_nvvm_tex_2d_array_level_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.2d.array.level.v4f32.f32">; +def int_nvvm_tex_2d_array_grad_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.2d.array.grad.v4f32.f32">; +def int_nvvm_tex_2d_array_v4i32_i32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty], [], + "llvm.nvvm.tex.2d.array.v4i32.i32">; +def int_nvvm_tex_2d_array_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.2d.array.v4i32.f32">; +def int_nvvm_tex_2d_array_level_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.2d.array.level.v4i32.f32">; +def int_nvvm_tex_2d_array_grad_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.2d.array.grad.v4i32.f32">; + +def int_nvvm_tex_3d_v4f32_i32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [], "llvm.nvvm.tex.3d.v4f32.i32">; +def int_nvvm_tex_3d_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.3d.v4f32.f32">; +def int_nvvm_tex_3d_level_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, 
llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.3d.level.v4f32.f32">; +def int_nvvm_tex_3d_grad_v4f32_f32 + : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.3d.grad.v4f32.f32">; +def int_nvvm_tex_3d_v4i32_i32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [], "llvm.nvvm.tex.3d.v4i32.i32">; +def int_nvvm_tex_3d_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty], [], + "llvm.nvvm.tex.3d.v4i32.f32">; +def int_nvvm_tex_3d_level_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.3d.level.v4i32.f32">; +def int_nvvm_tex_3d_grad_v4i32_f32 + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, + llvm_float_ty, llvm_float_ty, llvm_float_ty], [], + "llvm.nvvm.tex.3d.grad.v4i32.f32">; + +// Surface Load +def int_nvvm_suld_1d_i8_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i8.trap">; +def int_nvvm_suld_1d_i16_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i16.trap">; +def int_nvvm_suld_1d_i32_trap + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.i32.trap">; +def int_nvvm_suld_1d_v2i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i8.trap">; +def int_nvvm_suld_1d_v2i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i16.trap">; +def int_nvvm_suld_1d_v2i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v2i32.trap">; +def int_nvvm_suld_1d_v4i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v4i8.trap">; +def int_nvvm_suld_1d_v4i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v4i16.trap">; +def int_nvvm_suld_1d_v4i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.v4i32.trap">; + +def int_nvvm_suld_1d_array_i8_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i8.trap">; +def int_nvvm_suld_1d_array_i16_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i16.trap">; +def int_nvvm_suld_1d_array_i32_trap + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.i32.trap">; +def int_nvvm_suld_1d_array_v2i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v2i8.trap">; +def int_nvvm_suld_1d_array_v2i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + 
"llvm.nvvm.suld.1d.array.v2i16.trap">; +def int_nvvm_suld_1d_array_v2i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v2i32.trap">; +def int_nvvm_suld_1d_array_v4i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v4i8.trap">; +def int_nvvm_suld_1d_array_v4i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v4i16.trap">; +def int_nvvm_suld_1d_array_v4i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.1d.array.v4i32.trap">; + +def int_nvvm_suld_2d_i8_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.i8.trap">; +def int_nvvm_suld_2d_i16_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.i16.trap">; +def int_nvvm_suld_2d_i32_trap + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.i32.trap">; +def int_nvvm_suld_2d_v2i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v2i8.trap">; +def int_nvvm_suld_2d_v2i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v2i16.trap">; +def int_nvvm_suld_2d_v2i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v2i32.trap">; +def int_nvvm_suld_2d_v4i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v4i8.trap">; +def int_nvvm_suld_2d_v4i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v4i16.trap">; +def int_nvvm_suld_2d_v4i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.v4i32.trap">; + +def int_nvvm_suld_2d_array_i8_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.i8.trap">; +def int_nvvm_suld_2d_array_i16_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.i16.trap">; +def int_nvvm_suld_2d_array_i32_trap + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.i32.trap">; +def int_nvvm_suld_2d_array_v2i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v2i8.trap">; +def int_nvvm_suld_2d_array_v2i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v2i16.trap">; +def int_nvvm_suld_2d_array_v2i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v2i32.trap">; +def int_nvvm_suld_2d_array_v4i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v4i8.trap">; +def int_nvvm_suld_2d_array_v4i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, 
llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v4i16.trap">; +def int_nvvm_suld_2d_array_v4i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.2d.array.v4i32.trap">; + +def int_nvvm_suld_3d_i8_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.i8.trap">; +def int_nvvm_suld_3d_i16_trap + : Intrinsic<[llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.i16.trap">; +def int_nvvm_suld_3d_i32_trap + : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.i32.trap">; +def int_nvvm_suld_3d_v2i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v2i8.trap">; +def int_nvvm_suld_3d_v2i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v2i16.trap">; +def int_nvvm_suld_3d_v2i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v2i32.trap">; +def int_nvvm_suld_3d_v4i8_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v4i8.trap">; +def int_nvvm_suld_3d_v4i16_trap + : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v4i16.trap">; +def int_nvvm_suld_3d_v4i32_trap + : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.suld.3d.v4i32.trap">; + +//===- Texture Query ------------------------------------------------------===// + +def int_nvvm_txq_channel_order + : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.txq.channel.order">, + GCCBuiltin<"__nvvm_txq_channel_order">; +def int_nvvm_txq_channel_data_type + : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.txq.channel.data.type">, + GCCBuiltin<"__nvvm_txq_channel_data_type">; +def int_nvvm_txq_width + : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.txq.width">, + GCCBuiltin<"__nvvm_txq_width">; +def int_nvvm_txq_height + : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.txq.height">, + GCCBuiltin<"__nvvm_txq_height">; +def int_nvvm_txq_depth + : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.txq.depth">, + GCCBuiltin<"__nvvm_txq_depth">; +def int_nvvm_txq_array_size + : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.txq.array.size">, + GCCBuiltin<"__nvvm_txq_array_size">; +def int_nvvm_txq_num_samples + : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.txq.num.samples">, + GCCBuiltin<"__nvvm_txq_num_samples">; +def int_nvvm_txq_num_mipmap_levels + : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.txq.num.mipmap.levels">, + GCCBuiltin<"__nvvm_txq_num_mipmap_levels">; + +//===- Surface Query ------------------------------------------------------===// + +def int_nvvm_suq_channel_order + : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.suq.channel.order">, + GCCBuiltin<"__nvvm_suq_channel_order">; +def int_nvvm_suq_channel_data_type + : Intrinsic<[llvm_i32_ty], 
[llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.suq.channel.data.type">, + GCCBuiltin<"__nvvm_suq_channel_data_type">; +def int_nvvm_suq_width + : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.suq.width">, + GCCBuiltin<"__nvvm_suq_width">; +def int_nvvm_suq_height + : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.suq.height">, + GCCBuiltin<"__nvvm_suq_height">; +def int_nvvm_suq_depth + : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.suq.depth">, + GCCBuiltin<"__nvvm_suq_depth">; +def int_nvvm_suq_array_size + : Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.suq.array.size">, + GCCBuiltin<"__nvvm_suq_array_size">; + + +//===- Handle Query -------------------------------------------------------===// + +def int_nvvm_istypep_sampler + : Intrinsic<[llvm_i1_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.istypep.sampler">, + GCCBuiltin<"__nvvm_istypep_sampler">; +def int_nvvm_istypep_surface + : Intrinsic<[llvm_i1_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.istypep.surface">, + GCCBuiltin<"__nvvm_istypep_surface">; +def int_nvvm_istypep_texture + : Intrinsic<[llvm_i1_ty], [llvm_i64_ty], [IntrNoMem], + "llvm.nvvm.istypep.texture">, + GCCBuiltin<"__nvvm_istypep_texture">; + + + +//===- Surface Stores -----------------------------------------------------===// + +// Unformatted + +def int_nvvm_sust_b_1d_i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.i8.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_i8_trap">; +def int_nvvm_sust_b_1d_i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.i16.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_i16_trap">; +def int_nvvm_sust_b_1d_i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.i32.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_i32_trap">; +def int_nvvm_sust_b_1d_v2i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.v2i8.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_v2i8_trap">; +def int_nvvm_sust_b_1d_v2i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.v2i16.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_v2i16_trap">; +def int_nvvm_sust_b_1d_v2i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.v2i32.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_v2i32_trap">; +def int_nvvm_sust_b_1d_v4i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.v4i8.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_v4i8_trap">; +def int_nvvm_sust_b_1d_v4i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.v4i16.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_v4i16_trap">; +def int_nvvm_sust_b_1d_v4i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.v4i32.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_v4i32_trap">; + + +def int_nvvm_sust_b_1d_array_i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.i8.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_array_i8_trap">; +def int_nvvm_sust_b_1d_array_i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.i16.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_array_i16_trap">; +def 
int_nvvm_sust_b_1d_array_i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.array.i32.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_array_i32_trap">; +def int_nvvm_sust_b_1d_array_v2i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.v2i8.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v2i8_trap">; +def int_nvvm_sust_b_1d_array_v2i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.v2i16.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v2i16_trap">; +def int_nvvm_sust_b_1d_array_v2i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.array.v2i32.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v2i32_trap">; +def int_nvvm_sust_b_1d_array_v4i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.v4i8.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v4i8_trap">; +def int_nvvm_sust_b_1d_array_v4i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.1d.array.v4i16.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v4i16_trap">; +def int_nvvm_sust_b_1d_array_v4i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.1d.array.v4i32.trap">, + GCCBuiltin<"__nvvm_sust_b_1d_array_v4i32_trap">; + + +def int_nvvm_sust_b_2d_i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.i8.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_i8_trap">; +def int_nvvm_sust_b_2d_i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.i16.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_i16_trap">; +def int_nvvm_sust_b_2d_i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.i32.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_i32_trap">; +def int_nvvm_sust_b_2d_v2i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.v2i8.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_v2i8_trap">; +def int_nvvm_sust_b_2d_v2i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.v2i16.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_v2i16_trap">; +def int_nvvm_sust_b_2d_v2i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.v2i32.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_v2i32_trap">; +def int_nvvm_sust_b_2d_v4i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.v4i8.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_v4i8_trap">; +def int_nvvm_sust_b_2d_v4i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.v4i16.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_v4i16_trap">; +def int_nvvm_sust_b_2d_v4i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.v4i32.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_v4i32_trap">; + + +def int_nvvm_sust_b_2d_array_i8_trap + 
: Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.i8.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_array_i8_trap">; +def int_nvvm_sust_b_2d_array_i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.i16.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_array_i16_trap">; +def int_nvvm_sust_b_2d_array_i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.array.i32.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_array_i32_trap">; +def int_nvvm_sust_b_2d_array_v2i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.v2i8.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v2i8_trap">; +def int_nvvm_sust_b_2d_array_v2i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.v2i16.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v2i16_trap">; +def int_nvvm_sust_b_2d_array_v2i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.array.v2i32.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v2i32_trap">; +def int_nvvm_sust_b_2d_array_v4i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.v4i8.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v4i8_trap">; +def int_nvvm_sust_b_2d_array_v4i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.2d.array.v4i16.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v4i16_trap">; +def int_nvvm_sust_b_2d_array_v4i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.2d.array.v4i32.trap">, + GCCBuiltin<"__nvvm_sust_b_2d_array_v4i32_trap">; + + +def int_nvvm_sust_b_3d_i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.i8.trap">, + GCCBuiltin<"__nvvm_sust_b_3d_i8_trap">; +def int_nvvm_sust_b_3d_i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.i16.trap">, + GCCBuiltin<"__nvvm_sust_b_3d_i16_trap">; +def int_nvvm_sust_b_3d_i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.3d.i32.trap">, + GCCBuiltin<"__nvvm_sust_b_3d_i32_trap">; +def int_nvvm_sust_b_3d_v2i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.v2i8.trap">, + GCCBuiltin<"__nvvm_sust_b_3d_v2i8_trap">; +def int_nvvm_sust_b_3d_v2i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.v2i16.trap">, + GCCBuiltin<"__nvvm_sust_b_3d_v2i16_trap">; +def int_nvvm_sust_b_3d_v2i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.3d.v2i32.trap">, + GCCBuiltin<"__nvvm_sust_b_3d_v2i32_trap">; +def int_nvvm_sust_b_3d_v4i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], 
+ "llvm.nvvm.sust.b.3d.v4i8.trap">, + GCCBuiltin<"__nvvm_sust_b_3d_v4i8_trap">; +def int_nvvm_sust_b_3d_v4i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.b.3d.v4i16.trap">, + GCCBuiltin<"__nvvm_sust_b_3d_v4i16_trap">; +def int_nvvm_sust_b_3d_v4i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.b.3d.v4i32.trap">, + GCCBuiltin<"__nvvm_sust_b_3d_v4i32_trap">; + +// Formatted + +def int_nvvm_sust_p_1d_i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.1d.i8.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_i8_trap">; +def int_nvvm_sust_p_1d_i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.1d.i16.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_i16_trap">; +def int_nvvm_sust_p_1d_i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.p.1d.i32.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_i32_trap">; +def int_nvvm_sust_p_1d_v2i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.1d.v2i8.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_v2i8_trap">; +def int_nvvm_sust_p_1d_v2i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.1d.v2i16.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_v2i16_trap">; +def int_nvvm_sust_p_1d_v2i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.p.1d.v2i32.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_v2i32_trap">; +def int_nvvm_sust_p_1d_v4i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.1d.v4i8.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_v4i8_trap">; +def int_nvvm_sust_p_1d_v4i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.1d.v4i16.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_v4i16_trap">; +def int_nvvm_sust_p_1d_v4i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.p.1d.v4i32.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_v4i32_trap">; + + +def int_nvvm_sust_p_1d_array_i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.1d.array.i8.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_array_i8_trap">; +def int_nvvm_sust_p_1d_array_i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.1d.array.i16.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_array_i16_trap">; +def int_nvvm_sust_p_1d_array_i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.p.1d.array.i32.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_array_i32_trap">; +def int_nvvm_sust_p_1d_array_v2i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.1d.array.v2i8.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_array_v2i8_trap">; +def int_nvvm_sust_p_1d_array_v2i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.1d.array.v2i16.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_array_v2i16_trap">; +def int_nvvm_sust_p_1d_array_v2i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + 
"llvm.nvvm.sust.p.1d.array.v2i32.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_array_v2i32_trap">; +def int_nvvm_sust_p_1d_array_v4i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.1d.array.v4i8.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_array_v4i8_trap">; +def int_nvvm_sust_p_1d_array_v4i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.1d.array.v4i16.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_array_v4i16_trap">; +def int_nvvm_sust_p_1d_array_v4i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.p.1d.array.v4i32.trap">, + GCCBuiltin<"__nvvm_sust_p_1d_array_v4i32_trap">; + + +def int_nvvm_sust_p_2d_i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.2d.i8.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_i8_trap">; +def int_nvvm_sust_p_2d_i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.2d.i16.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_i16_trap">; +def int_nvvm_sust_p_2d_i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.p.2d.i32.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_i32_trap">; +def int_nvvm_sust_p_2d_v2i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.2d.v2i8.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_v2i8_trap">; +def int_nvvm_sust_p_2d_v2i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.2d.v2i16.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_v2i16_trap">; +def int_nvvm_sust_p_2d_v2i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.p.2d.v2i32.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_v2i32_trap">; +def int_nvvm_sust_p_2d_v4i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.2d.v4i8.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_v4i8_trap">; +def int_nvvm_sust_p_2d_v4i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.2d.v4i16.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_v4i16_trap">; +def int_nvvm_sust_p_2d_v4i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.p.2d.v4i32.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_v4i32_trap">; + + +def int_nvvm_sust_p_2d_array_i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.2d.array.i8.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_array_i8_trap">; +def int_nvvm_sust_p_2d_array_i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.2d.array.i16.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_array_i16_trap">; +def int_nvvm_sust_p_2d_array_i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.p.2d.array.i32.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_array_i32_trap">; +def int_nvvm_sust_p_2d_array_v2i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + 
"llvm.nvvm.sust.p.2d.array.v2i8.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_array_v2i8_trap">; +def int_nvvm_sust_p_2d_array_v2i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.2d.array.v2i16.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_array_v2i16_trap">; +def int_nvvm_sust_p_2d_array_v2i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.p.2d.array.v2i32.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_array_v2i32_trap">; +def int_nvvm_sust_p_2d_array_v4i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.2d.array.v4i8.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_array_v4i8_trap">; +def int_nvvm_sust_p_2d_array_v4i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.2d.array.v4i16.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_array_v4i16_trap">; +def int_nvvm_sust_p_2d_array_v4i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.p.2d.array.v4i32.trap">, + GCCBuiltin<"__nvvm_sust_p_2d_array_v4i32_trap">; + + +def int_nvvm_sust_p_3d_i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.3d.i8.trap">, + GCCBuiltin<"__nvvm_sust_p_3d_i8_trap">; +def int_nvvm_sust_p_3d_i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.3d.i16.trap">, + GCCBuiltin<"__nvvm_sust_p_3d_i16_trap">; +def int_nvvm_sust_p_3d_i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.p.3d.i32.trap">, + GCCBuiltin<"__nvvm_sust_p_3d_i32_trap">; +def int_nvvm_sust_p_3d_v2i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.3d.v2i8.trap">, + GCCBuiltin<"__nvvm_sust_p_3d_v2i8_trap">; +def int_nvvm_sust_p_3d_v2i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.3d.v2i16.trap">, + GCCBuiltin<"__nvvm_sust_p_3d_v2i16_trap">; +def int_nvvm_sust_p_3d_v2i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.p.3d.v2i32.trap">, + GCCBuiltin<"__nvvm_sust_p_3d_v2i32_trap">; +def int_nvvm_sust_p_3d_v4i8_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.3d.v4i8.trap">, + GCCBuiltin<"__nvvm_sust_p_3d_v4i8_trap">; +def int_nvvm_sust_p_3d_v4i16_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [], + "llvm.nvvm.sust.p.3d.v4i16.trap">, + GCCBuiltin<"__nvvm_sust_p_3d_v4i16_trap">; +def int_nvvm_sust_p_3d_v4i32_trap + : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [], + "llvm.nvvm.sust.p.3d.v4i32.trap">, + GCCBuiltin<"__nvvm_sust_p_3d_v4i32_trap">; + + + // Old PTX back-end intrinsics retained here for backwards-compatibility multiclass PTXReadSpecialRegisterIntrinsic_v4i32 { diff --git a/include/llvm/IR/IntrinsicsX86.td 
b/include/llvm/IR/IntrinsicsX86.td index 8f64b5d..36d93fe 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -18,6 +18,15 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". } //===----------------------------------------------------------------------===// +// Read Time Stamp Counter. +let TargetPrefix = "x86" in { + def int_x86_rdtsc : GCCBuiltin<"__builtin_ia32_rdtsc">, + Intrinsic<[llvm_i64_ty], [], []>; + def int_x86_rdtscp : GCCBuiltin<"__builtin_ia32_rdtscp">, + Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], [IntrReadWriteArgMem]>; +} + +//===----------------------------------------------------------------------===// // 3DNow! let TargetPrefix = "x86" in { @@ -1120,6 +1129,27 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". GCCBuiltin<"__builtin_ia32_vperm2f128_si256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; + + def int_x86_avx512_mask_vpermt_d_512: + GCCBuiltin<"__builtin_ia32_vpermt2vard512_mask">, + Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, + llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>; + + def int_x86_avx512_mask_vpermt_q_512: + GCCBuiltin<"__builtin_ia32_vpermt2varq512_mask">, + Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, + llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>; + + def int_x86_avx512_mask_vpermt_ps_512: + GCCBuiltin<"__builtin_ia32_vpermt2varps512_mask">, + Intrinsic<[llvm_v16f32_ty], [llvm_v16i32_ty, + llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; + + def int_x86_avx512_mask_vpermt_pd_512: + GCCBuiltin<"__builtin_ia32_vpermt2varpd512_mask">, + Intrinsic<[llvm_v8f64_ty], [llvm_v8i64_ty, + llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>; + } // Vector blend @@ -2999,141 +3029,104 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
// Gather and Scatter ops let TargetPrefix = "x86" in { - def int_x86_avx512_gather_dpd_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherdpd512">, - Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i8_ty, - llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty], + def int_x86_avx512_gather_dpd_512 : GCCBuiltin<"__builtin_ia32_gathersiv8df">, + Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty, + llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadArgMem]>; - def int_x86_avx512_gather_dps_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherdps512">, - Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i16_ty, - llvm_v16i32_ty, llvm_ptr_ty, llvm_i32_ty], + def int_x86_avx512_gather_dps_512 : GCCBuiltin<"__builtin_ia32_gathersiv16sf">, + Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_ptr_ty, + llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadArgMem]>; - def int_x86_avx512_gather_qpd_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherqpd512">, - Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i8_ty, - llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty], + def int_x86_avx512_gather_qpd_512 : GCCBuiltin<"__builtin_ia32_gatherdiv8df">, + Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty, + llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadArgMem]>; - def int_x86_avx512_gather_qps_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherqps512">, - Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i8_ty, - llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty], + def int_x86_avx512_gather_qps_512 : GCCBuiltin<"__builtin_ia32_gatherdiv16sf">, + Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_ptr_ty, + llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadArgMem]>; - def int_x86_avx512_gather_dpd_512 : GCCBuiltin<"__builtin_ia32_gatherdpd512">, - Intrinsic<[llvm_v8f64_ty], [llvm_v8i32_ty, llvm_ptr_ty, - llvm_i32_ty], - [IntrReadArgMem]>; - def int_x86_avx512_gather_dps_512 : GCCBuiltin<"__builtin_ia32_gatherdps512">, - Intrinsic<[llvm_v16f32_ty], [llvm_v16i32_ty, llvm_ptr_ty, - llvm_i32_ty], - [IntrReadArgMem]>; - def int_x86_avx512_gather_qpd_512 : GCCBuiltin<"__builtin_ia32_gatherqpd512">, - Intrinsic<[llvm_v8f64_ty], [llvm_v8i64_ty, llvm_ptr_ty, - llvm_i32_ty], - [IntrReadArgMem]>; - def int_x86_avx512_gather_qps_512 : GCCBuiltin<"__builtin_ia32_gatherqps512">, - Intrinsic<[llvm_v8f32_ty], [llvm_v8i64_ty, llvm_ptr_ty, - llvm_i32_ty], - [IntrReadArgMem]>; - - def int_x86_avx512_gather_dpq_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherdpq512">, - Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i8_ty, - llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty], - [IntrReadArgMem]>; - def int_x86_avx512_gather_dpi_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherdpi512">, - Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_i16_ty, - llvm_v16i32_ty, llvm_ptr_ty, llvm_i32_ty], - [IntrReadArgMem]>; - def int_x86_avx512_gather_qpq_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherqpq512">, - Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i8_ty, - llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty], - [IntrReadArgMem]>; - def int_x86_avx512_gather_qpi_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherqpi512">, - Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_i8_ty, - llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty], - [IntrReadArgMem]>; - def int_x86_avx512_gather_dpq_512 : GCCBuiltin<"__builtin_ia32_gatherdpq512">, - Intrinsic<[llvm_v8i64_ty], [llvm_v8i32_ty, llvm_ptr_ty, - llvm_i32_ty], + def int_x86_avx512_gather_dpq_512 : GCCBuiltin<"__builtin_ia32_gathersiv8di">, + Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty, + llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], 
[IntrReadArgMem]>; - def int_x86_avx512_gather_dpi_512 : GCCBuiltin<"__builtin_ia32_gatherdpi512">, - Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_ptr_ty, - llvm_i32_ty], + def int_x86_avx512_gather_dpi_512 : GCCBuiltin<"__builtin_ia32_gathersiv16si">, + Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_ptr_ty, + llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadArgMem]>; - def int_x86_avx512_gather_qpq_512 : GCCBuiltin<"__builtin_ia32_gatherqpq512">, + def int_x86_avx512_gather_qpq_512 : GCCBuiltin<"__builtin_ia32_gatherdiv8di">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty, - llvm_i32_ty], + llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadArgMem]>; - def int_x86_avx512_gather_qpi_512 : GCCBuiltin<"__builtin_ia32_gatherqpi512">, - Intrinsic<[llvm_v8i32_ty], [llvm_v8i64_ty, llvm_ptr_ty, - llvm_i32_ty], + def int_x86_avx512_gather_qpi_512 : GCCBuiltin<"__builtin_ia32_gatherdiv16si">, + Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_ptr_ty, + llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadArgMem]>; + // scatter - def int_x86_avx512_scatter_dpd_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterdpd512">, + def int_x86_avx512_scatter_dpd_512 : GCCBuiltin<"__builtin_ia32_scattersiv8df">, Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - def int_x86_avx512_scatter_dps_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterdps512">, + def int_x86_avx512_scatter_dps_512 : GCCBuiltin<"__builtin_ia32_scattersiv16sf">, Intrinsic<[], [llvm_ptr_ty, llvm_i16_ty, llvm_v16i32_ty, llvm_v16f32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - def int_x86_avx512_scatter_qpd_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterqpd512">, + def int_x86_avx512_scatter_qpd_512 : GCCBuiltin<"__builtin_ia32_scatterdiv8df">, Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i64_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - def int_x86_avx512_scatter_qps_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterqps512">, + def int_x86_avx512_scatter_qps_512 : GCCBuiltin<"__builtin_ia32_scatterdiv16sf">, Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i64_ty, llvm_v8f32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - def int_x86_avx512_scatter_dpd_512 : GCCBuiltin<"__builtin_ia32_scatterdpd512">, - Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_v8f64_ty, - llvm_i32_ty], - [IntrReadWriteArgMem]>; - def int_x86_avx512_scatter_dps_512 : GCCBuiltin<"__builtin_ia32_scatterdps512">, - Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_v16f32_ty, - llvm_i32_ty], - [IntrReadWriteArgMem]>; - def int_x86_avx512_scatter_qpd_512 : GCCBuiltin<"__builtin_ia32_scatterqpd512">, - Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8f64_ty, - llvm_i32_ty], - [IntrReadWriteArgMem]>; - def int_x86_avx512_scatter_qps_512 : GCCBuiltin<"__builtin_ia32_scatterqps512">, - Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8f32_ty, - llvm_i32_ty], - [IntrReadWriteArgMem]>; - def int_x86_avx512_scatter_dpq_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterdpq512">, - Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, - llvm_v8i64_ty, llvm_i32_ty], + def int_x86_avx512_scatter_dpq_512 : GCCBuiltin<"__builtin_ia32_scattersiv8di">, + Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, + llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - def int_x86_avx512_scatter_dpi_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterdpi512">, + def int_x86_avx512_scatter_dpi_512 : GCCBuiltin<"__builtin_ia32_scattersiv16si">, Intrinsic<[], [llvm_ptr_ty, llvm_i16_ty, llvm_v16i32_ty, 
llvm_v16i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; - def int_x86_avx512_scatter_qpq_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterqpq512">, - Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, - llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty], + def int_x86_avx512_scatter_qpq_512 : GCCBuiltin<"__builtin_ia32_scatterdiv8di">, + Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i64_ty, llvm_v8i64_ty, + llvm_i32_ty], [IntrReadWriteArgMem]>; - def int_x86_avx512_scatter_qpi_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterqpi512">, - Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, - llvm_v8i64_ty, llvm_v8i32_ty, llvm_i32_ty], + def int_x86_avx512_scatter_qpi_512 : GCCBuiltin<"__builtin_ia32_scatterdiv16si">, + Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i64_ty, llvm_v8i32_ty, + llvm_i32_ty], [IntrReadWriteArgMem]>; - def int_x86_avx512_scatter_dpq_512 : GCCBuiltin<"__builtin_ia32_scatterdpq512">, - Intrinsic<[], [llvm_ptr_ty, - llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty], - []>; - def int_x86_avx512_scatter_dpi_512 : GCCBuiltin<"__builtin_ia32_scatterdpi512">, - Intrinsic<[], [llvm_ptr_ty, - llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty], - []>; - def int_x86_avx512_scatter_qpq_512 : GCCBuiltin<"__builtin_ia32_scatterqpq512">, - Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8i64_ty, - llvm_i32_ty], - []>; - def int_x86_avx512_scatter_qpi_512 : GCCBuiltin<"__builtin_ia32_scatterqpi512">, - Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8i32_ty, - llvm_i32_ty], - []>; + // gather prefetch + def int_x86_avx512_gatherpf_dpd_512 : GCCBuiltin<"__builtin_ia32_gatherpfdpd">, + Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_gatherpf_dps_512 : GCCBuiltin<"__builtin_ia32_gatherpfdps">, + Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_gatherpf_qpd_512 : GCCBuiltin<"__builtin_ia32_gatherpfqpd">, + Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_gatherpf_qps_512 : GCCBuiltin<"__builtin_ia32_gatherpfqps">, + Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + + // scatter prefetch + def int_x86_avx512_scatterpf_dpd_512 : GCCBuiltin<"__builtin_ia32_scatterpfdpd">, + Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_scatterpf_dps_512 : GCCBuiltin<"__builtin_ia32_scatterpfdps">, + Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_scatterpf_qpd_512 : GCCBuiltin<"__builtin_ia32_scatterpfqpd">, + Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + def int_x86_avx512_scatterpf_qps_512 : GCCBuiltin<"__builtin_ia32_scatterpfqps">, + Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; } // AVX-512 conflict detection diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h index ae4859a..4d940d5 100644 --- a/include/llvm/IR/LLVMContext.h +++ b/include/llvm/IR/LLVMContext.h @@ -29,6 +29,8 @@ class Module; class SMDiagnostic; class DiagnosticInfo; template <typename T> class SmallVectorImpl; +class Function; +class DebugLoc; /// This is an important class for using LLVM in a threaded context. 
It /// (opaquely) owns and manages the core "global" data of LLVM's core @@ -70,6 +72,10 @@ public: /// \see LLVMContext::diagnose. typedef void (*DiagnosticHandlerTy)(const DiagnosticInfo &DI, void *Context); + /// Defines the type of a yield callback. + /// \see LLVMContext::setYieldCallback. + typedef void (*YieldCallbackTy)(LLVMContext *Context, void *OpaqueHandle); + /// setInlineAsmDiagnosticHandler - This method sets a handler that is invoked /// when problems with inline asm are detected by the backend. The first /// argument is a function pointer and the second is a context pointer that @@ -78,7 +84,7 @@ public: /// LLVMContext doesn't take ownership or interpret either of these /// pointers. void setInlineAsmDiagnosticHandler(InlineAsmDiagHandlerTy DiagHandler, - void *DiagContext = 0); + void *DiagContext = nullptr); /// getInlineAsmDiagnosticHandler - Return the diagnostic handler set by /// setInlineAsmDiagnosticHandler. @@ -96,7 +102,7 @@ public: /// LLVMContext doesn't take ownership or interpret either of these /// pointers. void setDiagnosticHandler(DiagnosticHandlerTy DiagHandler, - void *DiagContext = 0); + void *DiagContext = nullptr); /// getDiagnosticHandler - Return the diagnostic handler set by /// setDiagnosticHandler. @@ -116,6 +122,32 @@ public: /// for RS_Error, "warning: " for RS_Warning, and "note: " for RS_Note. void diagnose(const DiagnosticInfo &DI); + /// \brief Registers a yield callback with the given context. + /// + /// The yield callback function may be called by LLVM to transfer control back + /// to the client that invoked the LLVM compilation. This can be used to yield + /// control of the thread, or perform periodic work needed by the client. + /// There is no guaranteed frequency at which callbacks must occur; in fact, + /// the client is not guaranteed to ever receive this callback. It is at the + /// sole discretion of LLVM to do so and only if it can guarantee that + /// suspending the thread won't block any forward progress in other LLVM + /// contexts in the same process. + /// + /// At a suspend point, the state of the current LLVM context is intentionally + /// undefined. No assumptions about it can or should be made. Only LLVM + /// context API calls that explicitly state that they can be used during a + /// yield callback are allowed to be used. Any other API calls into the + /// context are not supported until the yield callback function returns + /// control to LLVM. Other LLVM contexts are unaffected by this restriction. + void setYieldCallback(YieldCallbackTy Callback, void *OpaqueHandle); + + /// \brief Calls the yield callback (if applicable). + /// + /// This transfers control of the current thread back to the client, which may + /// suspend the current thread. Only call this method when LLVM doesn't hold + /// any global mutex or cannot block the execution in another LLVM context. + void yield(); + /// emitError - Emit an error message to the currently installed error handler /// with optional location information. This function returns, so code should /// be prepared to drop the erroneous construct on the floor and "not crash". 
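For illustration only (not part of the patch): a client embedding LLVM could hook the new yield callback roughly as follows. The ClientState type and the work done inside the callback are invented for the example; the only real API used is the setYieldCallback/yield pair this hunk adds to LLVMContext.

    #include "llvm/IR/LLVMContext.h"

    // Hypothetical client bookkeeping passed through the opaque handle.
    struct ClientState {
      unsigned YieldCount = 0;
    };

    // Invoked at LLVM's discretion; per the doc comment above, it must not
    // touch the context except through APIs documented as yield-safe.
    static void onYield(llvm::LLVMContext *Ctx, void *OpaqueHandle) {
      auto *State = static_cast<ClientState *>(OpaqueHandle);
      ++State->YieldCount;
      // e.g. pump a UI event loop or a cooperative scheduler here.
    }

    void setup(llvm::LLVMContext &Context, ClientState &State) {
      Context.setYieldCallback(onYield, &State);
      // A compilation run on this context may now call onYield() zero or
      // more times; there is no guaranteed frequency.
    }
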
diff --git a/include/llvm/IR/LegacyPassManagers.h b/include/llvm/IR/LegacyPassManagers.h index 5c9dccd..f6065a4 100644 --- a/include/llvm/IR/LegacyPassManagers.h +++ b/include/llvm/IR/LegacyPassManagers.h @@ -120,11 +120,11 @@ class PassManagerPrettyStackEntry : public PrettyStackTraceEntry { Module *M; public: explicit PassManagerPrettyStackEntry(Pass *p) - : P(p), V(0), M(0) {} // When P is releaseMemory'd. + : P(p), V(nullptr), M(nullptr) {} // When P is releaseMemory'd. PassManagerPrettyStackEntry(Pass *p, Value &v) - : P(p), V(&v), M(0) {} // When P is run on V + : P(p), V(&v), M(nullptr) {} // When P is run on V PassManagerPrettyStackEntry(Pass *p, Module &m) - : P(p), V(0), M(&m) {} // When P is run on M + : P(p), V(nullptr), M(&m) {} // When P is run on M /// print - Emit information about this stack frame to OS. void print(raw_ostream &OS) const override; @@ -263,7 +263,7 @@ private: class PMDataManager { public: - explicit PMDataManager() : TPM(NULL), Depth(0) { + explicit PMDataManager() : TPM(nullptr), Depth(0) { initializeAnalysisInfo(); } @@ -303,7 +303,7 @@ public: void initializeAnalysisInfo() { AvailableAnalysis.clear(); for (unsigned i = 0; i < PMT_Last; ++i) - InheritedAnalysis[i] = NULL; + InheritedAnalysis[i] = nullptr; } // Return true if P preserves high level analysis used by other @@ -441,7 +441,7 @@ public: Pass *getAsPass() override { return this; } /// Pass Manager itself does not invalidate any analysis info. - void getAnalysisUsage(AnalysisUsage &Info) const override{ + void getAnalysisUsage(AnalysisUsage &Info) const override { Info.setPreservesAll(); } diff --git a/include/llvm/IR/LegacyPassNameParser.h b/include/llvm/IR/LegacyPassNameParser.h index 1f6bbbc..b72fc4c 100644 --- a/include/llvm/IR/LegacyPassNameParser.h +++ b/include/llvm/IR/LegacyPassNameParser.h @@ -43,7 +43,7 @@ class PassNameParser : public PassRegistrationListener, public cl::parser<const PassInfo*> { cl::Option *Opt; public: - PassNameParser() : Opt(0) {} + PassNameParser() : Opt(nullptr) {} virtual ~PassNameParser(); void initialize(cl::Option &O) { @@ -62,8 +62,8 @@ public: inline bool ignorablePass(const PassInfo *P) const { // Ignore non-selectable and non-constructible passes! Ignore // non-optimizations.
- return P->getPassArgument() == 0 || *P->getPassArgument() == 0 || - P->getNormalCtor() == 0 || ignorablePassImpl(P); + return P->getPassArgument() == nullptr || *P->getPassArgument() == 0 || + P->getNormalCtor() == nullptr || ignorablePassImpl(P); } // Implement the PassRegistrationListener callbacks used to populate our map @@ -73,7 +73,7 @@ public: if (findOption(P->getPassArgument()) != getNumOptions()) { errs() << "Two passes with the same argument (-" << P->getPassArgument() << ") attempted to be registered!\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } addLiteralOption(P->getPassArgument(), P, P->getPassName()); } diff --git a/include/llvm/IR/MDBuilder.h b/include/llvm/IR/MDBuilder.h index c07b2bd..37d263b 100644 --- a/include/llvm/IR/MDBuilder.h +++ b/include/llvm/IR/MDBuilder.h @@ -15,14 +15,17 @@ #ifndef LLVM_IR_MDBUILDER_H #define LLVM_IR_MDBUILDER_H -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Metadata.h" +#include "llvm/Support/DataTypes.h" +#include <utility> namespace llvm { class APInt; +template <typename T> class ArrayRef; class LLVMContext; +class MDNode; +class MDString; +class StringRef; class MDBuilder { LLVMContext &Context; @@ -31,9 +34,7 @@ public: MDBuilder(LLVMContext &context) : Context(context) {} /// \brief Return the given string as metadata. - MDString *createString(StringRef Str) { - return MDString::get(Context, Str); - } + MDString *createString(StringRef Str); //===------------------------------------------------------------------===// // FPMath metadata. @@ -42,55 +43,24 @@ public: /// \brief Return metadata with the given settings. The special value 0.0 /// for the Accuracy parameter indicates the default (maximal precision) /// setting. - MDNode *createFPMath(float Accuracy) { - if (Accuracy == 0.0) - return 0; - assert(Accuracy > 0.0 && "Invalid fpmath accuracy!"); - Value *Op = ConstantFP::get(Type::getFloatTy(Context), Accuracy); - return MDNode::get(Context, Op); - } + MDNode *createFPMath(float Accuracy); //===------------------------------------------------------------------===// // Prof metadata. //===------------------------------------------------------------------===// /// \brief Return metadata containing two branch weights. - MDNode *createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight) { - uint32_t Weights[] = { TrueWeight, FalseWeight }; - return createBranchWeights(Weights); - } + MDNode *createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight); /// \brief Return metadata containing a number of branch weights. - MDNode *createBranchWeights(ArrayRef<uint32_t> Weights) { - assert(Weights.size() >= 2 && "Need at least two branch weights!"); - - SmallVector<Value *, 4> Vals(Weights.size()+1); - Vals[0] = createString("branch_weights"); - - Type *Int32Ty = Type::getInt32Ty(Context); - for (unsigned i = 0, e = Weights.size(); i != e; ++i) - Vals[i+1] = ConstantInt::get(Int32Ty, Weights[i]); - - return MDNode::get(Context, Vals); - } + MDNode *createBranchWeights(ArrayRef<uint32_t> Weights); //===------------------------------------------------------------------===// // Range metadata. //===------------------------------------------------------------------===// /// \brief Return metadata describing the range [Lo, Hi). - MDNode *createRange(const APInt &Lo, const APInt &Hi) { - assert(Lo.getBitWidth() == Hi.getBitWidth() && "Mismatched bitwidths!"); - // If the range is everything then it is useless. - if (Hi == Lo) - return 0; - - // Return the range [Lo, Hi).
- Type *Ty = IntegerType::get(Context, Lo.getBitWidth()); - Value *Range[2] = { ConstantInt::get(Ty, Lo), ConstantInt::get(Ty, Hi) }; - return MDNode::get(Context, Range); - } - + MDNode *createRange(const APInt &Lo, const APInt &Hi); //===------------------------------------------------------------------===// // TBAA metadata. @@ -99,41 +69,17 @@ public: /// \brief Return metadata appropriate for a TBAA root node. Each returned /// node is distinct from all other metadata and will never be identified /// (uniqued) with anything else. - MDNode *createAnonymousTBAARoot() { - // To ensure uniqueness the root node is self-referential. - MDNode *Dummy = MDNode::getTemporary(Context, ArrayRef<Value*>()); - MDNode *Root = MDNode::get(Context, Dummy); - // At this point we have - // !0 = metadata !{} <- dummy - // !1 = metadata !{metadata !0} <- root - // Replace the dummy operand with the root node itself and delete the dummy. - Root->replaceOperandWith(0, Root); - MDNode::deleteTemporary(Dummy); - // We now have - // !1 = metadata !{metadata !1} <- self-referential root - return Root; - } + MDNode *createAnonymousTBAARoot(); /// \brief Return metadata appropriate for a TBAA root node with the given /// name. This may be identified (uniqued) with other roots with the same /// name. - MDNode *createTBAARoot(StringRef Name) { - return MDNode::get(Context, createString(Name)); - } + MDNode *createTBAARoot(StringRef Name); /// \brief Return metadata for a non-root TBAA node with the given name, /// parent in the TBAA tree, and value for 'pointsToConstantMemory'. MDNode *createTBAANode(StringRef Name, MDNode *Parent, - bool isConstant = false) { - if (isConstant) { - Constant *Flags = ConstantInt::get(Type::getInt64Ty(Context), 1); - Value *Ops[3] = { createString(Name), Parent, Flags }; - return MDNode::get(Context, Ops); - } else { - Value *Ops[2] = { createString(Name), Parent }; - return MDNode::get(Context, Ops); - } - } + bool isConstant = false); struct TBAAStructField { uint64_t Offset; @@ -145,49 +91,23 @@ public: /// \brief Return metadata for a tbaa.struct node with the given /// struct field descriptions. - MDNode *createTBAAStructNode(ArrayRef<TBAAStructField> Fields) { - SmallVector<Value *, 4> Vals(Fields.size() * 3); - Type *Int64 = IntegerType::get(Context, 64); - for (unsigned i = 0, e = Fields.size(); i != e; ++i) { - Vals[i * 3 + 0] = ConstantInt::get(Int64, Fields[i].Offset); - Vals[i * 3 + 1] = ConstantInt::get(Int64, Fields[i].Size); - Vals[i * 3 + 2] = Fields[i].TBAA; - } - return MDNode::get(Context, Vals); - } + MDNode *createTBAAStructNode(ArrayRef<TBAAStructField> Fields); /// \brief Return metadata for a TBAA struct node in the type DAG /// with the given name, a list of pairs (offset, field type in the type DAG). - MDNode *createTBAAStructTypeNode(StringRef Name, - ArrayRef<std::pair<MDNode*, uint64_t> > Fields) { - SmallVector<Value *, 4> Ops(Fields.size() * 2 + 1); - Type *Int64 = IntegerType::get(Context, 64); - Ops[0] = createString(Name); - for (unsigned i = 0, e = Fields.size(); i != e; ++i) { - Ops[i * 2 + 1] = Fields[i].first; - Ops[i * 2 + 2] = ConstantInt::get(Int64, Fields[i].second); - } - return MDNode::get(Context, Ops); - } + MDNode * + createTBAAStructTypeNode(StringRef Name, + ArrayRef<std::pair<MDNode *, uint64_t>> Fields); /// \brief Return metadata for a TBAA scalar type node with the /// given name, an offset and a parent in the TBAA type DAG.
MDNode *createTBAAScalarTypeNode(StringRef Name, MDNode *Parent, - uint64_t Offset = 0) { - ConstantInt *Off = ConstantInt::get(Type::getInt64Ty(Context), Offset); - Value *Ops[3] = { createString(Name), Parent, Off }; - return MDNode::get(Context, Ops); - } + uint64_t Offset = 0); /// \brief Return metadata for a TBAA tag node with the given /// base type, access type and offset relative to the base type. MDNode *createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType, - uint64_t Offset) { - Type *Int64 = IntegerType::get(Context, 64); - Value *Ops[3] = { BaseType, AccessType, ConstantInt::get(Int64, Offset) }; - return MDNode::get(Context, Ops); - } - + uint64_t Offset); }; } // end namespace llvm diff --git a/include/llvm/IR/Metadata.h b/include/llvm/IR/Metadata.h index d054fbb..7a0ca88 100644 --- a/include/llvm/IR/Metadata.h +++ b/include/llvm/IR/Metadata.h @@ -218,7 +218,7 @@ class NamedMDNode : public ilist_node<NamedMDNode> { friend class NamedMDNode; public: - op_iterator_impl() : Node(0), Idx(0) { } + op_iterator_impl() : Node(nullptr), Idx(0) { } bool operator==(const op_iterator_impl &o) const { return Idx == o.Idx; } bool operator!=(const op_iterator_impl &o) const { return Idx != o.Idx; } @@ -272,7 +272,7 @@ public: StringRef getName() const; /// print - Implement operator<< on NamedMDNode. - void print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW = 0) const; + void print(raw_ostream &ROS) const; /// dump() - Allow printing of NamedMDNodes from the debugger. void dump() const; diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h index f0d4002..0c309e8 100644 --- a/include/llvm/IR/Module.h +++ b/include/llvm/IR/Module.h @@ -285,33 +285,29 @@ public: /// @name Generic Value Accessors /// @{ - /// getNamedValue - Return the global value in the module with - /// the specified name, of arbitrary type. This method returns null - /// if a global with the specified name is not found. + /// Return the global value in the module with the specified name, of + /// arbitrary type. This method returns null if a global with the specified + /// name is not found. GlobalValue *getNamedValue(StringRef Name) const; - /// getMDKindID - Return a unique non-zero ID for the specified metadata kind. - /// This ID is uniqued across modules in the current LLVMContext. + /// Return a unique non-zero ID for the specified metadata kind. This ID is + /// uniqued across modules in the current LLVMContext. unsigned getMDKindID(StringRef Name) const; - /// getMDKindNames - Populate client supplied SmallVector with the name for - /// custom metadata IDs registered in this LLVMContext. + /// Populate client supplied SmallVector with the name for custom metadata IDs + /// registered in this LLVMContext. void getMDKindNames(SmallVectorImpl<StringRef> &Result) const; - - typedef DenseMap<StringRef, StructType*, DenseMapInfo<StringRef> > - NumeredTypesMapTy; - - /// getTypeByName - Return the type with the specified name, or null if there - /// is none by that name. + /// Return the type with the specified name, or null if there is none by that + /// name. StructType *getTypeByName(StringRef Name) const; /// @} /// @name Function Accessors /// @{ - /// getOrInsertFunction - Look up the specified function in the module symbol - /// table. Four possibilities: + /// Look up the specified function in the module symbol table. Four + /// possibilities: /// 1. If it does not exist, add a prototype for the function and return it. /// 2. If it exists, and has a local linkage, the existing function is /// renamed and a new one is inserted.
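Since the MDBuilder entry points keep their signatures and only the bodies move out of line, existing callers are unaffected. A minimal usage sketch, assuming a branch and a load to annotate (the annotate function and its arguments are illustrative, not part of the patch):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
using namespace llvm;

void annotate(LLVMContext &Ctx, BranchInst *Br, LoadInst *Load) {
  MDBuilder MDB(Ctx);
  // !prof metadata: the true edge is taken ~9x as often as the false edge.
  Br->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(90, 10));
  // !range metadata: the loaded value lies in the half-open interval [0, 256).
  APInt Lo(32, 0), Hi(32, 256);
  Load->setMetadata(LLVMContext::MD_range, MDB.createRange(Lo, Hi));
}

Note that MDBuilder.h no longer pulls in Constants.h and DerivedTypes.h, so clients that relied on those transitive includes now need to include them directly.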
@@ -324,33 +320,32 @@ public: Constant *getOrInsertFunction(StringRef Name, FunctionType *T); - /// getOrInsertFunction - Look up the specified function in the module symbol - /// table. If it does not exist, add a prototype for the function and return - /// it. This function guarantees to return a constant of pointer to the - /// specified function type or a ConstantExpr BitCast of that type if the - /// named function has a different type. This version of the method takes a - /// null terminated list of function arguments, which makes it easier for - /// clients to use. + /// Look up the specified function in the module symbol table. If it does not + /// exist, add a prototype for the function and return it. This function + /// guarantees to return a constant of pointer to the specified function type + /// or a ConstantExpr BitCast of that type if the named function has a + /// different type. This version of the method takes a null terminated list of + /// function arguments, which makes it easier for clients to use. Constant *getOrInsertFunction(StringRef Name, AttributeSet AttributeList, Type *RetTy, ...) END_WITH_NULL; - /// getOrInsertFunction - Same as above, but without the attributes. + /// Same as above, but without the attributes. Constant *getOrInsertFunction(StringRef Name, Type *RetTy, ...) END_WITH_NULL; - /// getFunction - Look up the specified function in the module symbol table. - /// If it does not exist, return null. + /// Look up the specified function in the module symbol table. If it does not + /// exist, return null. Function *getFunction(StringRef Name) const; /// @} /// @name Global Variable Accessors /// @{ - /// getGlobalVariable - Look up the specified global variable in the module - /// symbol table. If it does not exist, return null. If AllowInternal is set - /// to true, this function will return types that have InternalLinkage. By - /// default, these types are not returned. + /// Look up the specified global variable in the module symbol table. If it + /// does not exist, return null. If AllowInternal is set to true, this + /// function will return types that have InternalLinkage. By default, these + /// types are not returned. const GlobalVariable *getGlobalVariable(StringRef Name, bool AllowInternal = false) const { return const_cast<Module *>(this)->getGlobalVariable(Name, AllowInternal); @@ -358,9 +353,9 @@ public: GlobalVariable *getGlobalVariable(StringRef Name, bool AllowInternal = false); - /// getNamedGlobal - Return the global variable in the module with the - /// specified name, of arbitrary type. This method returns null if a global - /// with the specified name is not found. + /// Return the global variable in the module with the specified name, of + /// arbitrary type. This method returns null if a global with the specified + /// name is not found. GlobalVariable *getNamedGlobal(StringRef Name) { return getGlobalVariable(Name, true); } @@ -368,8 +363,7 @@ public: return const_cast<Module *>(this)->getNamedGlobal(Name); } - /// getOrInsertGlobal - Look up the specified global in the module symbol - /// table. + /// Look up the specified global in the module symbol table. /// 1. If it does not exist, add a declaration of the global and return it. /// 2. Else, the global exists but has the wrong type: return the function /// with a constantexpr cast to the right type. @@ -381,53 +375,49 @@ public: /// @name Global Alias Accessors /// @{ - /// getNamedAlias - Return the global alias in the module with the - /// specified name, of arbitrary type.
This method returns null if a global - /// with the specified name is not found. + /// Return the global alias in the module with the specified name, of + /// arbitrary type. This method returns null if a global with the specified + /// name is not found. GlobalAlias *getNamedAlias(StringRef Name) const; /// @} /// @name Named Metadata Accessors /// @{ - /// getNamedMetadata - Return the first NamedMDNode in the module with the - /// specified name. This method returns null if a NamedMDNode with the - /// specified name is not found. + /// Return the first NamedMDNode in the module with the specified name. This + /// method returns null if a NamedMDNode with the specified name is not found. NamedMDNode *getNamedMetadata(const Twine &Name) const; - /// getOrInsertNamedMetadata - Return the named MDNode in the module - /// with the specified name. This method returns a new NamedMDNode if a - /// NamedMDNode with the specified name is not found. + /// Return the named MDNode in the module with the specified name. This method + /// returns a new NamedMDNode if a NamedMDNode with the specified name is not + /// found. NamedMDNode *getOrInsertNamedMetadata(StringRef Name); - /// eraseNamedMetadata - Remove the given NamedMDNode from this module - /// and delete it. + /// Remove the given NamedMDNode from this module and delete it. void eraseNamedMetadata(NamedMDNode *NMD); /// @} /// @name Module Flags Accessors /// @{ - /// getModuleFlagsMetadata - Returns the module flags in the provided vector. + /// Returns the module flags in the provided vector. void getModuleFlagsMetadata(SmallVectorImpl<ModuleFlagEntry> &Flags) const; /// Return the corresponding value if Key appears in module flags, otherwise /// return null. Value *getModuleFlag(StringRef Key) const; - /// getModuleFlagsMetadata - Returns the NamedMDNode in the module that - /// represents module-level flags. This method returns null if there are no - /// module-level flags. + /// Returns the NamedMDNode in the module that represents module-level flags. + /// This method returns null if there are no module-level flags. NamedMDNode *getModuleFlagsMetadata() const; - /// getOrInsertModuleFlagsMetadata - Returns the NamedMDNode in the module - /// that represents module-level flags. If module-level flags aren't found, - /// it creates the named metadata that contains them. + /// Returns the NamedMDNode in the module that represents module-level flags. + /// If module-level flags aren't found, it creates the named metadata that + /// contains them. NamedMDNode *getOrInsertModuleFlagsMetadata(); - /// addModuleFlag - Add a module-level flag to the module-level flags - /// metadata. It will create the module-level flags named metadata if it - /// doesn't already exist. + /// Add a module-level flag to the module-level flags metadata. It will create + /// the module-level flags named metadata if it doesn't already exist. void addModuleFlag(ModFlagBehavior Behavior, StringRef Key, Value *Val); void addModuleFlag(ModFlagBehavior Behavior, StringRef Key, uint32_t Val); void addModuleFlag(MDNode *Node); @@ -436,31 +426,31 @@ public: /// @name Materialization /// @{ - /// setMaterializer - Sets the GVMaterializer to GVM. This module must not - /// yet have a Materializer. To reset the materializer for a module that - /// already has one, call MaterializeAllPermanently first. Destroying this - /// module will destroy its materializer without materializing any more - /// GlobalValues.
Without destroying the Module, there is no way to detach or - /// destroy a materializer without materializing all the GVs it controls, to - /// avoid leaving orphan unmaterialized GVs. + /// Sets the GVMaterializer to GVM. This module must not yet have a + /// Materializer. To reset the materializer for a module that already has one, + /// call MaterializeAllPermanently first. Destroying this module will destroy + /// its materializer without materializing any more GlobalValues. Without + /// destroying the Module, there is no way to detach or destroy a materializer + /// without materializing all the GVs it controls, to avoid leaving orphan + /// unmaterialized GVs. void setMaterializer(GVMaterializer *GVM); - /// getMaterializer - Retrieves the GVMaterializer, if any, for this Module. + /// Retrieves the GVMaterializer, if any, for this Module. GVMaterializer *getMaterializer() const { return Materializer.get(); } - /// isMaterializable - True if the definition of GV has yet to be materialized - /// from the GVMaterializer. + /// True if the definition of GV has yet to be materialized from the + /// GVMaterializer. bool isMaterializable(const GlobalValue *GV) const; - /// isDematerializable - Returns true if this GV was loaded from this Module's - /// GVMaterializer and the GVMaterializer knows how to dematerialize the GV. + /// Returns true if this GV was loaded from this Module's GVMaterializer and + /// the GVMaterializer knows how to dematerialize the GV. bool isDematerializable(const GlobalValue *GV) const; - /// Materialize - Make sure the GlobalValue is fully read. If the module is - /// corrupt, this returns true and fills in the optional string with - /// information about the problem. If successful, this returns false. - bool Materialize(GlobalValue *GV, std::string *ErrInfo = 0); - /// Dematerialize - If the GlobalValue is read in, and if the GVMaterializer - /// supports it, release the memory for the function, and set it up to be - /// materialized lazily. If !isDematerializable(), this method is a noop. + /// Make sure the GlobalValue is fully read. If the module is corrupt, this + /// returns true and fills in the optional string with information about the + /// problem. If successful, this returns false. + bool Materialize(GlobalValue *GV, std::string *ErrInfo = nullptr); + /// If the GlobalValue is read in, and if the GVMaterializer supports it, + /// release the memory for the function, and set it up to be materialized + /// lazily. If !isDematerializable(), this method is a noop. void Dematerialize(GlobalValue *GV); /// Make sure all GlobalValues in this Module are fully read. @@ -598,12 +588,20 @@ public: /// is delete'd for real. Note that no operations are valid on an object /// that has "dropped all references", except operator delete. void dropAllReferences(); + +/// @} +/// @name Utility functions for querying Debug information. +/// @{ + + /// \brief Returns the Dwarf Version by checking module flags. + unsigned getDwarfVersion() const; + /// @} }; /// A raw_ostream inserter for modules.
inline raw_ostream &operator<<(raw_ostream &O, const Module &M) { - M.print(O, 0); + M.print(O, nullptr); return O; } diff --git a/include/llvm/IR/PassManager.h b/include/llvm/IR/PassManager.h index c6c530c..cc2a80b 100644 --- a/include/llvm/IR/PassManager.h +++ b/include/llvm/IR/PassManager.h @@ -193,7 +193,7 @@ class PassRunAcceptsAnalysisManager { template static BigType f(...); public: - enum { Value = sizeof(f(0)) == sizeof(SmallType) }; + enum { Value = sizeof(f(nullptr)) == sizeof(SmallType) }; }; /// \brief A template wrapper used to implement the polymorphic API. @@ -293,7 +293,7 @@ template class ResultHasInvalidateMethod { template static BigType f(...); public: - enum { Value = sizeof(f(0)) == sizeof(SmallType) }; + enum { Value = sizeof(f(nullptr)) == sizeof(SmallType) }; }; /// \brief Wrapper to model the analysis result concept. @@ -480,7 +480,7 @@ public: /// /// This method should only be called for a single module as there is the /// expectation that the lifetime of a pass is bounded to that of a module. - PreservedAnalyses run(Module *M, ModuleAnalysisManager *AM = 0); + PreservedAnalyses run(Module *M, ModuleAnalysisManager *AM = nullptr); template void addPass(ModulePassT Pass) { Passes.emplace_back(new ModulePassModel(std::move(Pass))); @@ -524,7 +524,7 @@ public: Passes.emplace_back(new FunctionPassModel(std::move(Pass))); } - PreservedAnalyses run(Function *F, FunctionAnalysisManager *AM = 0); + PreservedAnalyses run(Function *F, FunctionAnalysisManager *AM = nullptr); static StringRef name() { return "FunctionPassManager"; } @@ -616,7 +616,7 @@ public: ResultConceptT *ResultConcept = derived_this()->getCachedResultImpl(PassT::ID(), IR); if (!ResultConcept) - return 0; + return nullptr; typedef detail::AnalysisResultModel ResultModelT; @@ -987,7 +987,7 @@ public: /// \brief Runs the function pass across every function in the module. PreservedAnalyses run(Module *M, ModuleAnalysisManager *AM) { - FunctionAnalysisManager *FAM = 0; + FunctionAnalysisManager *FAM = nullptr; if (AM) // Setup the function analysis manager from its proxy. FAM = &AM->getResult(M).getManager(); diff --git a/include/llvm/IR/PredIteratorCache.h b/include/llvm/IR/PredIteratorCache.h index bf18dfe..02bc583 100644 --- a/include/llvm/IR/PredIteratorCache.h +++ b/include/llvm/IR/PredIteratorCache.h @@ -44,7 +44,7 @@ namespace llvm { if (Entry) return Entry; SmallVector PredCache(pred_begin(BB), pred_end(BB)); - PredCache.push_back(0); // null terminator. + PredCache.push_back(nullptr); // null terminator. BlockToPredCountMap[BB] = PredCache.size()-1; diff --git a/include/llvm/IR/SymbolTableListTraits.h b/include/llvm/IR/SymbolTableListTraits.h index 561ce01..0a5149c 100644 --- a/include/llvm/IR/SymbolTableListTraits.h +++ b/include/llvm/IR/SymbolTableListTraits.h @@ -46,19 +46,19 @@ public: /// getListOwner - Return the object that owns this list. If this is a list /// of instructions, it returns the BasicBlock that owns them. 
ItemParentClass *getListOwner() { - size_t Offset(size_t(&((ItemParentClass*)0->*ItemParentClass:: - getSublistAccess(static_cast<ValueSubClass*>(0))))); + size_t Offset(size_t(&((ItemParentClass*)nullptr->*ItemParentClass:: + getSublistAccess(static_cast<ValueSubClass*>(nullptr))))); iplist<ValueSubClass>* Anchor(static_cast<iplist<ValueSubClass>*>(this)); return reinterpret_cast<ItemParentClass*>(reinterpret_cast<char*>(Anchor)- Offset); } static iplist<ValueSubClass> &getList(ItemParentClass *Par) { - return Par->*(Par->getSublistAccess((ValueSubClass*)0)); + return Par->*(Par->getSublistAccess((ValueSubClass*)nullptr)); } static ValueSymbolTable *getSymTab(ItemParentClass *Par) { - return Par ? toPtr(Par->getValueSymbolTable()) : 0; + return Par ? toPtr(Par->getValueSymbolTable()) : nullptr; } void addNodeToList(ValueSubClass *V); diff --git a/include/llvm/IR/Type.h b/include/llvm/IR/Type.h index 742a0d3..7955587 100644 --- a/include/llvm/IR/Type.h +++ b/include/llvm/IR/Type.h @@ -88,7 +88,7 @@ protected: friend class LLVMContextImpl; explicit Type(LLVMContext &C, TypeID tid) : Context(C), IDAndSubclassData(0), - NumContainedTys(0), ContainedTys(0) { + NumContainedTys(0), ContainedTys(nullptr) { setTypeID(tid); } ~Type() {} @@ -265,7 +265,7 @@ public: /// get the actual size for a particular target, it is reasonable to use the /// DataLayout subsystem to do this. /// - bool isSized(SmallPtrSet<const Type*, 4> *Visited = 0) const { + bool isSized(SmallPtrSet<const Type*, 4> *Visited = nullptr) const { // If it's a primitive, it is always sized. if (getTypeID() == IntegerTyID || isFloatingPointTy() || getTypeID() == PointerTyID || @@ -419,7 +419,7 @@ private: /// isSizedDerivedType - Derived types like structures and arrays are sized /// iff all of the members of the type are sized as well. Since asking for /// their size is relatively uncommon, move this operation out of line. - bool isSizedDerivedType(SmallPtrSet<const Type*, 4> *Visited = 0) const; + bool isSizedDerivedType(SmallPtrSet<const Type*, 4> *Visited = nullptr) const; }; // Printing of types. diff --git a/include/llvm/IR/Use.h b/include/llvm/IR/Use.h index 340572a..033cd3e 100644 --- a/include/llvm/IR/Use.h +++ b/include/llvm/IR/Use.h @@ -60,7 +60,7 @@ public: /// implicit. The implicit pointer is found via a waymarking algorithm /// described in the programmer's manual: /// -/// http://www.llvm.org/docs/ProgrammersManual.html#UserLayout +/// http://www.llvm.org/docs/ProgrammersManual.html#the-waymarking-algorithm /// /// This is essentially the single most memory intensive object in LLVM because /// of the number of uses in the system. At the same time, the constant time @@ -88,7 +88,7 @@ private: enum PrevPtrTag { zeroDigitTag, oneDigitTag, stopTag, fullStopTag }; /// Constructor - Use(PrevPtrTag tag) : Val(0) { Prev.setInt(tag); } + Use(PrevPtrTag tag) : Val(nullptr) { Prev.setInt(tag); } public: operator Value *() const { return Val; } diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h index 061bc91..bc7696b 100644 --- a/include/llvm/IR/User.h +++ b/include/llvm/IR/User.h @@ -19,6 +19,7 @@ #ifndef LLVM_IR_USER_H #define LLVM_IR_USER_H +#include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/IR/Value.h" #include "llvm/Support/ErrorHandling.h" @@ -55,7 +56,7 @@ protected: Use *allocHungoffUses(unsigned) const; void dropHungoffUses() { Use::zap(OperandList, OperandList + NumOperands, true); - OperandList = 0; + OperandList = nullptr; // Reset NumOperands so User::operator delete() does the right thing.
NumOperands = 0; } @@ -129,33 +130,13 @@ public: /// Convenience iterator for directly iterating over the Values in the /// OperandList - class value_op_iterator : public std::iterator<std::forward_iterator_tag, Value*> { - op_iterator OI; - public: - explicit value_op_iterator(Use *U) : OI(U) {} - - bool operator==(const value_op_iterator &x) const { - return OI == x.OI; - } - bool operator!=(const value_op_iterator &x) const { - return !operator==(x); - } - - /// Iterator traversal: forward iteration only - value_op_iterator &operator++() { // Preincrement - ++OI; - return *this; - } - value_op_iterator operator++(int) { // Postincrement - value_op_iterator tmp = *this; ++*this; return tmp; - } - - /// Retrieve a pointer to the current Value. - Value *operator*() const { - return *OI; - } + struct value_op_iterator + : iterator_adaptor_base<value_op_iterator, op_iterator, std::random_access_iterator_tag, Value *, ptrdiff_t, Value *, Value *> { + explicit value_op_iterator(Use *U = nullptr) : iterator_adaptor_base(U) {} + Value *operator*() const { return *I; } Value *operator->() const { return operator*(); } }; @@ -179,7 +160,7 @@ public: // void dropAllReferences() { for (Use &U : operands()) - U.set(0); + U.set(nullptr); } /// replaceUsesOfWith - Replaces all references to the "From" definition with diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h index d5b9f11..0158683 100644 --- a/include/llvm/IR/Value.h +++ b/include/llvm/IR/Value.h @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file declares the Value class. +// This file declares the Value class. // //===----------------------------------------------------------------------===// @@ -31,6 +31,7 @@ class Constant; class DataLayout; class Function; class GlobalAlias; +class GlobalObject; class GlobalValue; class GlobalVariable; class InlineAsm; @@ -52,7 +53,7 @@ typedef StringMapEntry<Value*> ValueName; // Value Class //===----------------------------------------------------------------------===// -/// This is a very important LLVM class. It is the base class of all values +/// This is a very important LLVM class. It is the base class of all values /// computed by a program that may be used as operands to other values. Value is /// the super class of other important classes such as Instruction and Function. /// All Values have a Type. Type is not a subclass of Value. Some values can @@ -182,10 +183,6 @@ private: Value(const Value &) LLVM_DELETED_FUNCTION; protected: - /// printCustom - Value subclasses can override this to implement custom - /// printing behavior. - virtual void printCustom(raw_ostream &O) const; - Value(Type *Ty, unsigned scid); public: virtual ~Value(); @@ -196,14 +193,15 @@ public: /// print - Implement operator<< on Value. /// - void print(raw_ostream &O, AssemblyAnnotationWriter *AAW = 0) const; + void print(raw_ostream &O) const; /// \brief Print the name of this Value out to the specified raw_ostream. /// This is useful when you just want to print 'int %reg126', not the /// instruction that generated it. If you specify a Module for context, then /// even constants get pretty-printed; for example, the type of a null /// pointer is printed symbolically. - void printAsOperand(raw_ostream &O, bool PrintType = true, const Module *M = 0) const; + void printAsOperand(raw_ostream &O, bool PrintType = true, + const Module *M = nullptr) const; /// All values are typed, get the type of this value. /// @@ -213,10 +211,10 @@ public: LLVMContext &getContext() const; // All values can potentially be named.
- bool hasName() const { return Name != 0 && SubclassID != MDStringVal; } + bool hasName() const { return Name != nullptr && SubclassID != MDStringVal; } ValueName *getValueName() const { return Name; } void setValueName(ValueName *VN) { Name = VN; } - + /// getName() - Return a constant reference to the value's name. This is cheap /// and guaranteed to return the same reference as long as the value is not /// modified. @@ -228,9 +226,9 @@ public: /// \param Name The new name; or "" if the value's name should be removed. void setName(const Twine &Name); - + /// takeName - transfer the name from V to this value, setting V's name to - /// empty. It is an error to call V->takeName(V). + /// empty. It is an error to call V->takeName(V). void takeName(Value *V); /// replaceAllUsesWith - Go through the uses list for this definition and make @@ -242,7 +240,7 @@ public: //---------------------------------------------------------------------- // Methods for handling the chain of uses of this Value. // - bool use_empty() const { return UseList == 0; } + bool use_empty() const { return UseList == nullptr; } typedef use_iterator_impl<Use> use_iterator; typedef use_iterator_impl<const Use> const_use_iterator; @@ -303,7 +301,7 @@ public: void addUse(Use &U) { U.addToList(&UseList); } /// An enumeration for keeping track of the concrete subclass of Value that - /// is actually instantiated. Values of this enumeration are kept in the + /// is actually instantiated. Values of this enumeration are kept in the /// Value classes SubclassID field. They are used for concrete type /// identification. enum ValueTy { @@ -327,9 +325,6 @@ public: MDNodeVal, // This is an instance of MDNode MDStringVal, // This is an instance of MDString InlineAsmVal, // This is an instance of InlineAsm - PseudoSourceValueVal, // This is an instance of PseudoSourceValue - FixedStackPseudoSourceValueVal, // This is an instance of - // FixedStackPseudoSourceValue InstructionVal, // This is an instance of Instruction // Enum values starting at InstructionVal are used for Instructions; // don't add new values here! @@ -436,7 +431,7 @@ public: /// isDereferenceablePointer - Test if this value is always a pointer to /// allocated and suitably aligned memory for a simple load or store. bool isDereferenceablePointer() const; - + /// DoPHITranslation - If this value is a PHI node with CurBB as its parent, /// return the value in the PHI node corresponding to PredBB. If not, return /// ourself. This is useful if you want to know the value something has in a @@ -447,11 +442,11 @@ public: const BasicBlock *PredBB) const{ return const_cast<Value*>(this)->DoPHITranslation(CurBB, PredBB); } - + /// MaximumAlignment - This is the greatest alignment value supported by /// load, store, and alloca instructions, and global values. static const unsigned MaximumAlignment = 1u << 29; - + /// mutateType - Mutate the type of this Value to be of the specified type. /// Note that this is an extremely dangerous operation which can create /// completely invalid IR very easily.
It is strongly recommended that you @@ -460,7 +455,7 @@ public: void mutateType(Type *Ty) { VTy = Ty; } - + protected: unsigned short getSubclassDataFromValue() const { return SubclassData; } void setValueSubclassData(unsigned short D) { SubclassData = D; } @@ -470,7 +465,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const Value &V) { V.print(OS); return OS; } - + void Use::set(Value *V) { if (Val) removeFromList(); Val = V; @@ -494,55 +489,60 @@ template <> struct isa_impl<Argument, Value> { } }; -template <> struct isa_impl<InlineAsm, Value> { +template <> struct isa_impl<InlineAsm, Value> { static inline bool doit(const Value &Val) { return Val.getValueID() == Value::InlineAsmVal; } }; -template <> struct isa_impl<Instruction, Value> { +template <> struct isa_impl<Instruction, Value> { static inline bool doit(const Value &Val) { return Val.getValueID() >= Value::InstructionVal; } }; -template <> struct isa_impl<BasicBlock, Value> { +template <> struct isa_impl<BasicBlock, Value> { static inline bool doit(const Value &Val) { return Val.getValueID() == Value::BasicBlockVal; } }; -template <> struct isa_impl<Function, Value> { +template <> struct isa_impl<Function, Value> { static inline bool doit(const Value &Val) { return Val.getValueID() == Value::FunctionVal; } }; -template <> struct isa_impl<GlobalVariable, Value> { +template <> struct isa_impl<GlobalVariable, Value> { static inline bool doit(const Value &Val) { return Val.getValueID() == Value::GlobalVariableVal; } }; -template <> struct isa_impl<GlobalAlias, Value> { +template <> struct isa_impl<GlobalAlias, Value> { static inline bool doit(const Value &Val) { return Val.getValueID() == Value::GlobalAliasVal; } }; -template <> struct isa_impl<GlobalValue, Value> { +template <> struct isa_impl<GlobalObject, Value> { + static inline bool doit(const Value &Val) { + return isa<GlobalVariable>(Val) || isa<Function>(Val); + } +}; + +template <> struct isa_impl<GlobalValue, Value> { static inline bool doit(const Value &Val) { - return isa<GlobalVariable>(Val) || isa<GlobalAlias>(Val) || - isa<Function>(Val); + return isa<GlobalObject>(Val) || isa<GlobalAlias>(Val); } }; -template <> struct isa_impl<MDNode, Value> { +template <> struct isa_impl<MDNode, Value> { static inline bool doit(const Value &Val) { return Val.getValueID() == Value::MDNodeVal; } }; - + // Value* is only 4-byte aligned. template<> class PointerLikeTypeTraits<Value*> { @@ -559,7 +559,7 @@ public: DEFINE_ISA_CONVERSION_FUNCTIONS(Value, LLVMValueRef) /* Specialized opaque value conversions. - */ + */ inline Value **unwrap(LLVMValueRef *Vals) { return reinterpret_cast<Value**>(Vals); } diff --git a/include/llvm/IR/ValueHandle.h b/include/llvm/IR/ValueHandle.h index 9b5e11a..aa29b2e 100644 --- a/include/llvm/IR/ValueHandle.h +++ b/include/llvm/IR/ValueHandle.h @@ -64,14 +64,14 @@ private: ValueHandleBase(const ValueHandleBase&) LLVM_DELETED_FUNCTION; public: explicit ValueHandleBase(HandleBaseKind Kind) - : PrevPair(0, Kind), Next(0), VP(0, 0) {} + : PrevPair(nullptr, Kind), Next(nullptr), VP(nullptr, 0) {} ValueHandleBase(HandleBaseKind Kind, Value *V) - : PrevPair(0, Kind), Next(0), VP(V, 0) { + : PrevPair(nullptr, Kind), Next(nullptr), VP(V, 0) { if (isValid(VP.getPointer())) AddToUseList(); } ValueHandleBase(HandleBaseKind Kind, const ValueHandleBase &RHS) - : PrevPair(0, Kind), Next(0), VP(RHS.VP) { + : PrevPair(nullptr, Kind), Next(nullptr), VP(RHS.VP) { if (isValid(VP.getPointer())) AddToExistingUseList(RHS.getPrevPtr()); } @@ -214,7 +214,7 @@ public: AssertingVH(ValueTy *P) : ValueHandleBase(Assert, GetAsValue(P)) {} AssertingVH(const AssertingVH &RHS) : ValueHandleBase(Assert, RHS) {} #else - AssertingVH() : ThePtr(0) {} + AssertingVH() : ThePtr(nullptr) {} AssertingVH(ValueTy *P) : ThePtr(P) {} #endif @@ -366,7 +366,7 @@ public: /// /// All implementations must remove the reference from this object to the /// Value that's being destroyed.
- virtual void deleted() { setValPtr(NULL); } + virtual void deleted() { setValPtr(nullptr); } /// Called when this->getValPtr()->replaceAllUsesWith(new_value) is called, /// _before_ any of the uses have actually been replaced. If WeakVH were diff --git a/include/llvm/IR/ValueMap.h b/include/llvm/IR/ValueMap.h index 42da529..1503aed 100644 --- a/include/llvm/IR/ValueMap.h +++ b/include/llvm/IR/ValueMap.h @@ -67,7 +67,7 @@ struct ValueMapConfig { /// and onDelete) and not inside other ValueMap methods. NULL means that no /// mutex is necessary. template<typename ExtraDataT> - static sys::Mutex *getMutex(const ExtraDataT &/*Data*/) { return NULL; } + static sys::Mutex *getMutex(const ExtraDataT &/*Data*/) { return nullptr; } }; /// See the file comment. @@ -253,10 +253,10 @@ struct DenseMapInfo<ValueMapCallbackVH<KeyT, ValueT, Config> > { typedef DenseMapInfo<KeyT> PointerInfo; static inline VH getEmptyKey() { - return VH(PointerInfo::getEmptyKey(), NULL); + return VH(PointerInfo::getEmptyKey(), nullptr); } static inline VH getTombstoneKey() { - return VH(PointerInfo::getTombstoneKey(), NULL); + return VH(PointerInfo::getTombstoneKey(), nullptr); } static unsigned getHashValue(const VH &Val) { return PointerInfo::getHashValue(Val.Unwrap()); diff --git a/include/llvm/IR/Verifier.h b/include/llvm/IR/Verifier.h index 9a2f402..0272e20 100644 --- a/include/llvm/IR/Verifier.h +++ b/include/llvm/IR/Verifier.h @@ -28,6 +28,7 @@ namespace llvm { class Function; class FunctionPass; +class ModulePass; class Module; class PreservedAnalyses; class raw_ostream; @@ -38,14 +39,14 @@ class raw_ostream; /// If there are no errors, the function returns false. If an error is found, /// a message describing the error is written to OS (if non-null) and true is /// returned. -bool verifyFunction(const Function &F, raw_ostream *OS = 0); +bool verifyFunction(const Function &F, raw_ostream *OS = nullptr); /// \brief Check a module for errors. /// /// If there are no errors, the function returns false. If an error is found, /// a message describing the error is written to OS (if non-null) and true is /// returned. -bool verifyModule(const Module &M, raw_ostream *OS = 0); +bool verifyModule(const Module &M, raw_ostream *OS = nullptr); /// \brief Create a verifier pass. /// @@ -58,6 +59,18 @@ bool verifyModule(const Module &M, raw_ostream *OS = 0); /// Note that this creates a pass suitable for the legacy pass manager. It has nothing to do with \c VerifierPass. FunctionPass *createVerifierPass(bool FatalErrors = true); +/// \brief Create a debug-info verifier pass. +/// +/// Check a module for validity of debug info. This is essentially a pass +/// wrapped around the debug-info parts of \a verifyModule(). When the pass +/// detects a verification error, the error is always printed to stderr, and +/// by default it is fatal. You can override that by passing \c false to \p +/// FatalErrors. +/// +/// Note that this creates a pass suitable for the legacy pass manager. It has +/// nothing to do with \c VerifierPass.
+ModulePass *createDebugInfoVerifierPass(bool FatalErrors = true); + class VerifierPass { bool FatalErrors; diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 9b9f234..8e53615 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -71,6 +71,7 @@ void initializeAliasDebuggerPass(PassRegistry&); void initializeAliasSetPrinterPass(PassRegistry&); void initializeAlwaysInlinerPass(PassRegistry&); void initializeArgPromotionPass(PassRegistry&); +void initializeAtomicExpandLoadLinkedPass(PassRegistry&); void initializeSampleProfileLoaderPass(PassRegistry&); void initializeBarrierNoopPass(PassRegistry&); void initializeBasicAliasAnalysisPass(PassRegistry&); @@ -103,6 +104,7 @@ void initializeDAHPass(PassRegistry&); void initializeDCEPass(PassRegistry&); void initializeDSEPass(PassRegistry&); void initializeDebugIRPass(PassRegistry&); +void initializeDebugInfoVerifierLegacyPassPass(PassRegistry &); void initializeDeadInstEliminationPass(PassRegistry&); void initializeDeadMachineInstructionElimPass(PassRegistry&); void initializeDelinearizationPass(PassRegistry &); @@ -236,6 +238,7 @@ void initializeSimpleInlinerPass(PassRegistry&); void initializeRegisterCoalescerPass(PassRegistry&); void initializeSingleLoopExtractorPass(PassRegistry&); void initializeSinkingPass(PassRegistry&); +void initializeSeparateConstOffsetFromGEPPass(PassRegistry &); void initializeSlotIndexesPass(PassRegistry&); void initializeSpillPlacementPass(PassRegistry&); void initializeStackProtectorPass(PassRegistry&); diff --git a/include/llvm/LTO/LTOCodeGenerator.h b/include/llvm/LTO/LTOCodeGenerator.h index 5433991..b19b232 100644 --- a/include/llvm/LTO/LTOCodeGenerator.h +++ b/include/llvm/LTO/LTOCodeGenerator.h @@ -53,11 +53,9 @@ namespace llvm { class TargetLibraryInfo; class TargetMachine; class raw_ostream; -} //===----------------------------------------------------------------------===// -/// LTOCodeGenerator - C++ class which implements the opaque lto_code_gen_t -/// type. +/// C++ class which implements the opaque lto_code_gen_t type. /// struct LTOCodeGenerator { static const char *getVersionString(); @@ -68,11 +66,12 @@ struct LTOCodeGenerator { // Merge given module, return true on success. 
bool addModule(struct LTOModule*, std::string &errMsg); - void setTargetOptions(llvm::TargetOptions options); + void setTargetOptions(TargetOptions options); void setDebugInfo(lto_debug_model); void setCodePICModel(lto_codegen_model); void setCpu(const char *mCpu) { MCpu = mCpu; } + void setAttr(const char *mAttr) { MAttr = mAttr; } void addMustPreserveSymbol(const char *sym) { MustPreserveSymbols[sym] = 1; } @@ -120,40 +119,37 @@ struct LTOCodeGenerator { private: void initializeLTOPasses(); - bool generateObjectFile(llvm::raw_ostream &out, - bool disableOpt, - bool disableInline, - bool disableGVNLoadPRE, - std::string &errMsg); + bool generateObjectFile(raw_ostream &out, bool disableOpt, bool disableInline, + bool disableGVNLoadPRE, std::string &errMsg); void applyScopeRestrictions(); - void applyRestriction(llvm::GlobalValue &GV, - const llvm::ArrayRef<llvm::StringRef> &Libcalls, - std::vector<const char*> &MustPreserveList, - llvm::SmallPtrSet<llvm::GlobalValue*, 8> &AsmUsed, - llvm::Mangler &Mangler); + void applyRestriction(GlobalValue &GV, const ArrayRef<StringRef> &Libcalls, + std::vector<const char *> &MustPreserveList, + SmallPtrSet<GlobalValue *, 8> &AsmUsed, + Mangler &Mangler); bool determineTarget(std::string &errMsg); - static void DiagnosticHandler(const llvm::DiagnosticInfo &DI, void *Context); + static void DiagnosticHandler(const DiagnosticInfo &DI, void *Context); - void DiagnosticHandler2(const llvm::DiagnosticInfo &DI); + void DiagnosticHandler2(const DiagnosticInfo &DI); - typedef llvm::StringMap<uint8_t> StringSet; + typedef StringMap<uint8_t> StringSet; - llvm::LLVMContext &Context; - llvm::Linker Linker; - llvm::TargetMachine *TargetMach; + LLVMContext &Context; + Linker IRLinker; + TargetMachine *TargetMach; bool EmitDwarfDebugInfo; bool ScopeRestrictionsDone; lto_codegen_model CodeModel; StringSet MustPreserveSymbols; StringSet AsmUndefinedRefs; - llvm::MemoryBuffer *NativeObjectFile; + MemoryBuffer *NativeObjectFile; std::vector<char *> CodegenOptions; std::string MCpu; + std::string MAttr; std::string NativeObjectPath; - llvm::TargetOptions Options; + TargetOptions Options; lto_diagnostic_handler_t DiagHandler; void *DiagContext; }; - +} #endif // LTO_CODE_GENERATOR_H diff --git a/include/llvm/LTO/LTOModule.h b/include/llvm/LTO/LTOModule.h index 1e4fa1b..f1b1480 100644 --- a/include/llvm/LTO/LTOModule.h +++ b/include/llvm/LTO/LTOModule.h @@ -31,25 +31,24 @@ namespace llvm { class MemoryBuffer; class TargetOptions; class Value; -} //===----------------------------------------------------------------------===// -/// LTOModule - C++ class which implements the opaque lto_module_t type. +/// C++ class which implements the opaque lto_module_t type. /// struct LTOModule { private: - typedef llvm::StringMap<uint8_t> StringSet; + typedef StringMap<uint8_t> StringSet; struct NameAndAttributes { const char *name; uint32_t attributes; bool isFunction; - const llvm::GlobalValue *symbol; + const GlobalValue *symbol; }; - std::unique_ptr<llvm::Module> _module; - std::unique_ptr<llvm::TargetMachine> _target; - llvm::MCObjectFileInfo ObjFileInfo; + std::unique_ptr<Module> _module; + std::unique_ptr<TargetMachine> _target; + MCObjectFileInfo ObjFileInfo; StringSet _linkeropt_strings; std::vector<const char *> _deplibs; std::vector<const char *> _linkeropts; @@ -57,174 +56,161 @@ private: // _defines and _undefines only needed to disambiguate tentative definitions StringSet _defines; - llvm::StringMap<NameAndAttributes> _undefines; + StringMap<NameAndAttributes> _undefines; std::vector<const char *> _asm_undefines; - llvm::MCContext _context; + MCContext _context; // Use mangler to add GlobalPrefix to names to match linker names.
- llvm::Mangler _mangler; + Mangler _mangler; + + LTOModule(Module *m, TargetMachine *t); - LTOModule(llvm::Module *m, llvm::TargetMachine *t); public: - /// isBitcodeFile - Returns 'true' if the file or memory contents is LLVM - /// bitcode. + /// Returns 'true' if the file or memory contents is LLVM bitcode. static bool isBitcodeFile(const void *mem, size_t length); static bool isBitcodeFile(const char *path); - /// isBitcodeFileForTarget - Returns 'true' if the file or memory contents - /// is LLVM bitcode for the specified triple. + /// Returns 'true' if the file or memory contents is LLVM bitcode for the + /// specified triple. static bool isBitcodeFileForTarget(const void *mem, size_t length, const char *triplePrefix); static bool isBitcodeFileForTarget(const char *path, const char *triplePrefix); - /// makeLTOModule - Create an LTOModule. N.B. These methods take ownership - /// of the buffer. The caller must have initialized the Targets, the - /// TargetMCs, the AsmPrinters, and the AsmParsers by calling: + /// Create an LTOModule. N.B. These methods take ownership of the buffer. The + /// caller must have initialized the Targets, the TargetMCs, the AsmPrinters, + /// and the AsmParsers by calling: /// /// InitializeAllTargets(); /// InitializeAllTargetMCs(); /// InitializeAllAsmPrinters(); /// InitializeAllAsmParsers(); - static LTOModule *makeLTOModule(const char* path, - llvm::TargetOptions options, + static LTOModule *makeLTOModule(const char *path, TargetOptions options, std::string &errMsg); - static LTOModule *makeLTOModule(int fd, const char *path, - size_t size, llvm::TargetOptions options, + static LTOModule *makeLTOModule(int fd, const char *path, size_t size, + TargetOptions options, std::string &errMsg); + static LTOModule *makeLTOModule(int fd, const char *path, size_t map_size, + off_t offset, TargetOptions options, std::string &errMsg); - static LTOModule *makeLTOModule(int fd, const char *path, - size_t map_size, - off_t offset, llvm::TargetOptions options, - std::string& errMsg); static LTOModule *makeLTOModule(const void *mem, size_t length, - llvm::TargetOptions options, - std::string &errMsg, - llvm::StringRef path = ""); + TargetOptions options, std::string &errMsg, + StringRef path = ""); - /// getTargetTriple - Return the Module's target triple. + /// Return the Module's target triple. const char *getTargetTriple() { return _module->getTargetTriple().c_str(); } - /// setTargetTriple - Set the Module's target triple. + /// Set the Module's target triple. void setTargetTriple(const char *triple) { _module->setTargetTriple(triple); } - /// getSymbolCount - Get the number of symbols + /// Get the number of symbols uint32_t getSymbolCount() { return _symbols.size(); } - /// getSymbolAttributes - Get the attributes for a symbol at the specified - /// index. + /// Get the attributes for a symbol at the specified index. lto_symbol_attributes getSymbolAttributes(uint32_t index) { if (index < _symbols.size()) return lto_symbol_attributes(_symbols[index].attributes); return lto_symbol_attributes(0); } - /// getSymbolName - Get the name of the symbol at the specified index. + /// Get the name of the symbol at the specified index. 
const char *getSymbolName(uint32_t index) { if (index < _symbols.size()) return _symbols[index].name; - return NULL; + return nullptr; } - /// getDependentLibraryCount - Get the number of dependent libraries + /// Get the number of dependent libraries uint32_t getDependentLibraryCount() { return _deplibs.size(); } - /// getDependentLibrary - Get the dependent library at the specified index. + /// Get the dependent library at the specified index. const char *getDependentLibrary(uint32_t index) { if (index < _deplibs.size()) return _deplibs[index]; - return NULL; + return nullptr; } - /// getLinkerOptCount - Get the number of linker options + /// Get the number of linker options uint32_t getLinkerOptCount() { return _linkeropts.size(); } - /// getLinkerOpt - Get the linker option at the specified index. + /// Get the linker option at the specified index. const char *getLinkerOpt(uint32_t index) { if (index < _linkeropts.size()) return _linkeropts[index]; - return NULL; + return nullptr; } - /// getLLVVMModule - Return the Module. - llvm::Module *getLLVVMModule() { return _module.get(); } + /// Return the Module. + Module *getLLVVMModule() { return _module.get(); } - /// getAsmUndefinedRefs - const std::vector<const char *> &getAsmUndefinedRefs() { return _asm_undefines; } private: - /// parseMetadata - Parse metadata from the module + /// Parse metadata from the module // FIXME: it only parses "Linker Options" metadata at the moment void parseMetadata(); - /// parseSymbols - Parse the symbols from the module and model-level ASM and - /// add them to either the defined or undefined lists. + /// Parse the symbols from the module and module-level ASM and add them to + /// either the defined or undefined lists. bool parseSymbols(std::string &errMsg); - /// addPotentialUndefinedSymbol - Add a symbol which isn't defined just yet - /// to a list to be resolved later. + /// Add a symbol which isn't defined just yet to a list to be resolved later. void addPotentialUndefinedSymbol(const GlobalValue *dcl, bool isFunc); - /// addDefinedSymbol - Add a defined symbol to the list. - void addDefinedSymbol(const llvm::GlobalValue *def, bool isFunction); + /// Add a defined symbol to the list. + void addDefinedSymbol(const GlobalValue *def, bool isFunction); - /// addDefinedFunctionSymbol - Add a function symbol as defined to the list. - void addDefinedFunctionSymbol(const llvm::Function *f); + /// Add a function symbol as defined to the list. + void addDefinedFunctionSymbol(const Function *f); - /// addDefinedDataSymbol - Add a data symbol as defined to the list. - void addDefinedDataSymbol(const llvm::GlobalValue *v); + /// Add a data symbol as defined to the list. + void addDefinedDataSymbol(const GlobalValue *v); - /// addAsmGlobalSymbols - Add global symbols from module-level ASM to the - /// defined or undefined lists. + /// Add global symbols from module-level ASM to the defined or undefined + /// lists. bool addAsmGlobalSymbols(std::string &errMsg); - /// addAsmGlobalSymbol - Add a global symbol from module-level ASM to the - /// defined list. + /// Add a global symbol from module-level ASM to the defined list. void addAsmGlobalSymbol(const char *, lto_symbol_attributes scope); - /// addAsmGlobalSymbolUndef - Add a global symbol from module-level ASM to - /// the undefined list. + /// Add a global symbol from module-level ASM to the undefined list.
void addAsmGlobalSymbolUndef(const char *); - /// addObjCClass - Parse i386/ppc ObjC class data structure. - void addObjCClass(const llvm::GlobalVariable *clgv); + /// Parse i386/ppc ObjC class data structure. + void addObjCClass(const GlobalVariable *clgv); - /// addObjCCategory - Parse i386/ppc ObjC category data structure. - void addObjCCategory(const llvm::GlobalVariable *clgv); + /// Parse i386/ppc ObjC category data structure. + void addObjCCategory(const GlobalVariable *clgv); - /// addObjCClassRef - Parse i386/ppc ObjC class list data structure. - void addObjCClassRef(const llvm::GlobalVariable *clgv); + /// Parse i386/ppc ObjC class list data structure. + void addObjCClassRef(const GlobalVariable *clgv); - /// objcClassNameFromExpression - Get string that the data pointer points - /// to. - bool objcClassNameFromExpression(const llvm::Constant* c, std::string &name); + /// Get string that the data pointer points to. + bool objcClassNameFromExpression(const Constant *c, std::string &name); - /// isTargetMatch - Returns 'true' if the memory buffer is for the specified - /// target triple. - static bool isTargetMatch(llvm::MemoryBuffer *memBuffer, - const char *triplePrefix); + /// Returns 'true' if the memory buffer is for the specified target triple. + static bool isTargetMatch(MemoryBuffer *memBuffer, const char *triplePrefix); - /// makeLTOModule - Create an LTOModule (private version). N.B. This - /// method takes ownership of the buffer. - static LTOModule *makeLTOModule(llvm::MemoryBuffer *buffer, - llvm::TargetOptions options, + /// Create an LTOModule (private version). N.B. This method takes ownership of + /// the buffer. + static LTOModule *makeLTOModule(MemoryBuffer *buffer, TargetOptions options, std::string &errMsg); /// Create a MemoryBuffer from a memory range with an optional name. 
- static llvm::MemoryBuffer *makeBuffer(const void *mem, size_t length, - llvm::StringRef name = ""); + static MemoryBuffer *makeBuffer(const void *mem, size_t length, + StringRef name = ""); }; - +} #endif // LTO_MODULE_H diff --git a/include/llvm/LineEditor/LineEditor.h b/include/llvm/LineEditor/LineEditor.h index 42839ed..1a9a691 100644 --- a/include/llvm/LineEditor/LineEditor.h +++ b/include/llvm/LineEditor/LineEditor.h @@ -11,9 +11,9 @@ #define LLVM_LINEEDITOR_LINEEDITOR_H #include "llvm/ADT/Optional.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/StringRef.h" -#include +#include +#include #include #include diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index 1603250..2616ebd 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -53,7 +53,7 @@ namespace { (void) llvm::createAliasDebugger(); (void) llvm::createArgumentPromotionPass(); (void) llvm::createBasicAliasAnalysisPass(); - (void) llvm::createLibCallAliasAnalysisPass(0); + (void) llvm::createLibCallAliasAnalysisPass(nullptr); (void) llvm::createScalarEvolutionAliasAnalysisPass(); (void) llvm::createTypeBasedAliasAnalysisPass(); (void) llvm::createBoundsCheckingPass(); @@ -140,9 +140,9 @@ namespace { (void) llvm::createMetaRenamerPass(); (void) llvm::createFunctionAttrsPass(); (void) llvm::createMergeFunctionsPass(); - (void) llvm::createPrintModulePass(*(llvm::raw_ostream*)0); - (void) llvm::createPrintFunctionPass(*(llvm::raw_ostream*)0); - (void) llvm::createPrintBasicBlockPass(*(llvm::raw_ostream*)0); + (void) llvm::createPrintModulePass(*(llvm::raw_ostream*)nullptr); + (void) llvm::createPrintFunctionPass(*(llvm::raw_ostream*)nullptr); + (void) llvm::createPrintBasicBlockPass(*(llvm::raw_ostream*)nullptr); (void) llvm::createModuleDebugInfoPrinterPass(); (void) llvm::createPartialInliningPass(); (void) llvm::createLintPass(); @@ -156,15 +156,16 @@ namespace { (void) llvm::createBBVectorizePass(); (void) llvm::createPartiallyInlineLibCallsPass(); (void) llvm::createScalarizerPass(); + (void) llvm::createSeparateConstOffsetFromGEPPass(); (void)new llvm::IntervalPartition(); (void)new llvm::FindUsedTypes(); (void)new llvm::ScalarEvolution(); - ((llvm::Function*)0)->viewCFGOnly(); + ((llvm::Function*)nullptr)->viewCFGOnly(); llvm::RGPassManager RGM; - ((llvm::RegionPass*)0)->runOnRegion((llvm::Region*)0, RGM); - llvm::AliasSetTracker X(*(llvm::AliasAnalysis*)0); - X.add((llvm::Value*)0, 0, 0); // for -print-alias-sets + ((llvm::RegionPass*)nullptr)->runOnRegion((llvm::Region*)nullptr, RGM); + llvm::AliasSetTracker X(*(llvm::AliasAnalysis*)nullptr); + X.add((llvm::Value*)nullptr, 0, nullptr); // for -print-alias-sets } } ForcePassLinking; // Force link by creating a global definition. } diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h index 037a24f..f7d3be2 100644 --- a/include/llvm/MC/MCAsmInfo.h +++ b/include/llvm/MC/MCAsmInfo.h @@ -365,7 +365,7 @@ namespace llvm { /// specify a section to switch to if the translation unit doesn't have any /// trampolines that require an executable stack. 
virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const{ - return 0; + return nullptr; } virtual const MCExpr * diff --git a/include/llvm/MC/MCAsmLayout.h b/include/llvm/MC/MCAsmLayout.h index 3058b7b..f048e34 100644 --- a/include/llvm/MC/MCAsmLayout.h +++ b/include/llvm/MC/MCAsmLayout.h @@ -17,6 +17,7 @@ namespace llvm { class MCAssembler; class MCFragment; class MCSectionData; +class MCSymbol; class MCSymbolData; /// Encapsulates the layout of an assembly file at a particular point in time. @@ -102,8 +103,15 @@ public: /// \brief Get the offset of the given symbol, as computed in the current /// layout. + /// \result True on success. + bool getSymbolOffset(const MCSymbolData *SD, uint64_t &Val) const; + + /// \brief Variant that reports a fatal error if the offset is not computable. uint64_t getSymbolOffset(const MCSymbolData *SD) const; + /// \brief If this symbol is equivalent to A + Constant, return A. + const MCSymbol *getBaseSymbol(const MCSymbol &Symbol) const; + /// @} }; diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h index 34b760c..be13b36 100644 --- a/include/llvm/MC/MCAssembler.h +++ b/include/llvm/MC/MCAssembler.h @@ -52,7 +52,6 @@ public: enum FragmentType { FT_Align, FT_Data, - FT_Compressed, FT_CompactEncodedInst, FT_Fill, FT_Relaxable, @@ -87,7 +86,7 @@ private: /// @} protected: - MCFragment(FragmentType _Kind, MCSectionData *_Parent = 0); + MCFragment(FragmentType _Kind, MCSectionData *_Parent = nullptr); public: // Only for sentinel. @@ -138,7 +137,7 @@ class MCEncodedFragment : public MCFragment { uint8_t BundlePadding; public: - MCEncodedFragment(MCFragment::FragmentType FType, MCSectionData *SD = 0) + MCEncodedFragment(MCFragment::FragmentType FType, MCSectionData *SD = nullptr) : MCFragment(FType, SD), BundlePadding(0) { } @@ -162,7 +161,6 @@ public: return false; case MCFragment::FT_Relaxable: case MCFragment::FT_CompactEncodedInst: - case MCFragment::FT_Compressed: case MCFragment::FT_Data: return true; } @@ -177,7 +175,7 @@ class MCEncodedFragmentWithFixups : public MCEncodedFragment { public: MCEncodedFragmentWithFixups(MCFragment::FragmentType FType, - MCSectionData *SD = 0) + MCSectionData *SD = nullptr) : MCEncodedFragment(FType, SD) { } @@ -197,8 +195,7 @@ public: static bool classof(const MCFragment *F) { MCFragment::FragmentType Kind = F->getKind(); - return Kind == MCFragment::FT_Relaxable || Kind == MCFragment::FT_Data || - Kind == MCFragment::FT_Compressed; + return Kind == MCFragment::FT_Relaxable || Kind == MCFragment::FT_Data; } }; @@ -217,13 +214,8 @@ class MCDataFragment : public MCEncodedFragmentWithFixups { /// Fixups - The list of fixups in this fragment. 
SmallVector<MCFixup, 4> Fixups; -protected: - MCDataFragment(MCFragment::FragmentType FType, MCSectionData *SD = 0) - : MCEncodedFragmentWithFixups(FType, SD), HasInstructions(false), - AlignToBundleEnd(false) {} - public: - MCDataFragment(MCSectionData *SD = 0) + MCDataFragment(MCSectionData *SD = nullptr) : MCEncodedFragmentWithFixups(FT_Data, SD), HasInstructions(false), AlignToBundleEnd(false) { @@ -255,21 +247,10 @@ public: const_fixup_iterator fixup_end() const override {return Fixups.end();} static bool classof(const MCFragment *F) { - return F->getKind() == MCFragment::FT_Data || - F->getKind() == MCFragment::FT_Compressed; + return F->getKind() == MCFragment::FT_Data; } }; -class MCCompressedFragment: public MCDataFragment { - mutable SmallVector<char, 32> CompressedContents; -public: - MCCompressedFragment(MCSectionData *SD = nullptr) - : MCDataFragment(FT_Compressed, SD) {} - const SmallVectorImpl<char> &getCompressedContents() const; - using MCDataFragment::getContents; - SmallVectorImpl<char> &getContents() override; -}; - /// This is a compact (memory-size-wise) fragment for holding an encoded /// instruction (non-relaxable) that has no fixups registered. When applicable, /// it can be used instead of MCDataFragment and lead to lower memory @@ -283,7 +264,7 @@ class MCCompactEncodedInstFragment : public MCEncodedFragment { SmallVector<char, 4> Contents; public: - MCCompactEncodedInstFragment(MCSectionData *SD = 0) + MCCompactEncodedInstFragment(MCSectionData *SD = nullptr) : MCEncodedFragment(FT_CompactEncodedInst, SD), AlignToBundleEnd(false) { } @@ -326,7 +307,7 @@ class MCRelaxableFragment : public MCEncodedFragmentWithFixups { public: MCRelaxableFragment(const MCInst &_Inst, const MCSubtargetInfo &_STI, - MCSectionData *SD = 0) + MCSectionData *SD = nullptr) : MCEncodedFragmentWithFixups(FT_Relaxable, SD), Inst(_Inst), STI(_STI) { } @@ -382,7 +363,7 @@ class MCAlignFragment : public MCFragment { public: MCAlignFragment(unsigned _Alignment, int64_t _Value, unsigned _ValueSize, - unsigned _MaxBytesToEmit, MCSectionData *SD = 0) + unsigned _MaxBytesToEmit, MCSectionData *SD = nullptr) : MCFragment(FT_Align, SD), Alignment(_Alignment), Value(_Value),ValueSize(_ValueSize), MaxBytesToEmit(_MaxBytesToEmit), EmitNops(false) {} @@ -423,7 +404,7 @@ class MCFillFragment : public MCFragment { public: MCFillFragment(int64_t _Value, unsigned _ValueSize, uint64_t _Size, - MCSectionData *SD = 0) + MCSectionData *SD = nullptr) : MCFragment(FT_Fill, SD), Value(_Value), ValueSize(_ValueSize), Size(_Size) { assert((!ValueSize || (Size % ValueSize) == 0) && @@ -456,7 +437,8 @@ class MCOrgFragment : public MCFragment { int8_t Value; public: - MCOrgFragment(const MCExpr &_Offset, int8_t _Value, MCSectionData *SD = 0) + MCOrgFragment(const MCExpr &_Offset, int8_t _Value, + MCSectionData *SD = nullptr) : MCFragment(FT_Org, SD), Offset(&_Offset), Value(_Value) {} @@ -485,7 +467,8 @@ class MCLEBFragment : public MCFragment { SmallString<8> Contents; public: - MCLEBFragment(const MCExpr &Value_, bool IsSigned_, MCSectionData *SD = 0) + MCLEBFragment(const MCExpr &Value_, bool IsSigned_, + MCSectionData *SD = nullptr) : MCFragment(FT_LEB, SD), Value(&Value_), IsSigned(IsSigned_) { Contents.push_back(0); } @@ -521,7 +504,7 @@ class MCDwarfLineAddrFragment : public MCFragment { public: MCDwarfLineAddrFragment(int64_t _LineDelta, const MCExpr &_AddrDelta, - MCSectionData *SD = 0) + MCSectionData *SD = nullptr) : MCFragment(FT_Dwarf, SD), LineDelta(_LineDelta), AddrDelta(&_AddrDelta) { Contents.push_back(0); } @@ -552,7 +535,8 @@ class
MCDwarfCallFrameFragment : public MCFragment { SmallString<8> Contents; public: - MCDwarfCallFrameFragment(const MCExpr &_AddrDelta, MCSectionData *SD = 0) + MCDwarfCallFrameFragment(const MCExpr &_AddrDelta, + MCSectionData *SD = nullptr) : MCFragment(FT_DwarfFrame, SD), AddrDelta(&_AddrDelta) { Contents.push_back(0); } @@ -633,7 +617,7 @@ private: public: // Only for use as sentinel. MCSectionData(); - MCSectionData(const MCSection &Section, MCAssembler *A = 0); + MCSectionData(const MCSection &Section, MCAssembler *A = nullptr); const MCSection &getSection() const { return *Section; } @@ -743,7 +727,7 @@ public: // Only for use as sentinel. MCSymbolData(); MCSymbolData(const MCSymbol &_Symbol, MCFragment *_Fragment, uint64_t _Offset, - MCAssembler *A = 0); + MCAssembler *A = nullptr); /// @name Accessors /// @{ @@ -850,6 +834,9 @@ public: typedef SymbolDataListType::const_iterator const_symbol_iterator; typedef SymbolDataListType::iterator symbol_iterator; + typedef iterator_range<symbol_iterator> symbol_range; + typedef iterator_range<const_symbol_iterator> const_symbol_range; + typedef std::vector<std::string> FileNameVectorType; typedef FileNameVectorType::const_iterator const_file_name_iterator; @@ -915,7 +902,7 @@ private: // here. Maybe when the relocation stuff moves to target specific, // this can go with it? The streamer would need some target specific // refactoring too. - SmallPtrSet<const MCSymbol*, 64> ThumbFuncs; + mutable SmallPtrSet<const MCSymbol*, 64> ThumbFuncs; /// \brief The bundle alignment size currently set in the assembler. /// @@ -1008,9 +995,7 @@ public: const MCAsmLayout &Layout) const; /// Check whether a given symbol has been flagged with .thumb_func. - bool isThumbFunc(const MCSymbol *Func) const { - return ThumbFuncs.count(Func); - } + bool isThumbFunc(const MCSymbol *Func) const; /// Flag a function symbol as the target of a .thumb_func directive.
void setIsThumbFunc(const MCSymbol *Func) { ThumbFuncs.insert(Func); } @@ -1115,6 +1100,9 @@ public: symbol_iterator symbol_end() { return Symbols.end(); } const_symbol_iterator symbol_end() const { return Symbols.end(); } + symbol_range symbols() { return make_range(symbol_begin(), symbol_end()); } + const_symbol_range symbols() const { return make_range(symbol_begin(), symbol_end()); } + size_t symbol_size() const { return Symbols.size(); } /// @} @@ -1203,7 +1191,7 @@ public: } MCSectionData &getOrCreateSectionData(const MCSection &Section, - bool *Created = 0) { + bool *Created = nullptr) { MCSectionData *&Entry = SectionMap[&Section]; if (Created) *Created = !Entry; @@ -1214,22 +1202,27 @@ public: } bool hasSymbolData(const MCSymbol &Symbol) const { - return SymbolMap.lookup(&Symbol) != 0; + return SymbolMap.lookup(&Symbol) != nullptr; + } + + MCSymbolData &getSymbolData(const MCSymbol &Symbol) { + return const_cast<MCSymbolData &>( + static_cast<const MCAssembler &>(*this).getSymbolData(Symbol)); } - MCSymbolData &getSymbolData(const MCSymbol &Symbol) const { + const MCSymbolData &getSymbolData(const MCSymbol &Symbol) const { MCSymbolData *Entry = SymbolMap.lookup(&Symbol); assert(Entry && "Missing symbol data!"); return *Entry; } MCSymbolData &getOrCreateSymbolData(const MCSymbol &Symbol, - bool *Created = 0) { + bool *Created = nullptr) { MCSymbolData *&Entry = SymbolMap[&Symbol]; if (Created) *Created = !Entry; if (!Entry) - Entry = new MCSymbolData(Symbol, 0, 0, this); + Entry = new MCSymbolData(Symbol, nullptr, 0, this); return *Entry; } diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h index 9091ed9..7557e76 100644 --- a/include/llvm/MC/MCContext.h +++ b/include/llvm/MC/MCContext.h @@ -137,7 +137,7 @@ namespace llvm { /// The information gathered from labels that will have dwarf label /// entries when generating dwarf assembly source files. - std::vector<const MCGenDwarfLabelEntry *> MCGenDwarfLabelEntries; + std::vector<MCGenDwarfLabelEntry> MCGenDwarfLabelEntries; /// The string to embed in the debug information for the compile unit, if /// non-empty. @@ -147,6 +147,9 @@ namespace llvm { /// non-empty. StringRef DwarfDebugProducer; + /// The maximum version of dwarf that we should emit. + uint16_t DwarfVersion; + /// Honor temporary labels, this is useful for debugging semantic /// differences between temporary and non-temporary labels (primarily on /// Darwin). @@ -155,7 +158,11 @@ namespace llvm { /// The Compile Unit ID that we are currently processing.
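The symbols() accessors added to MCAssembler above wrap the existing symbol_begin()/symbol_end() pairs with make_range, so object writers can use range-based for loops. A minimal usage sketch under that assumption (the countExternalSymbols helper is hypothetical):

#include "llvm/MC/MCAssembler.h"

static unsigned countExternalSymbols(llvm::MCAssembler &Asm) {
  unsigned N = 0;
  // Iterates the same symbol-data list as symbol_begin()/symbol_end().
  for (llvm::MCSymbolData &SD : Asm.symbols())
    if (SD.isExternal())
      ++N;
  return N;
}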
unsigned DwarfCompileUnitID; - void *MachOUniquingMap, *ELFUniquingMap, *COFFUniquingMap; + typedef std::pair<std::string, std::string> SectionGroupPair; + + StringMap<const MCSectionMachO *> MachOUniquingMap; + std::map<SectionGroupPair, const MCSectionELF *> ELFUniquingMap; + std::map<SectionGroupPair, const MCSectionCOFF *> COFFUniquingMap; /// Do automatic reset in destructor bool AutoReset; @@ -167,8 +174,8 @@ namespace llvm { public: explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI, - const MCObjectFileInfo *MOFI, const SourceMgr *Mgr = 0, - bool DoAutoReset = true); + const MCObjectFileInfo *MOFI, + const SourceMgr *Mgr = nullptr, bool DoAutoReset = true); ~MCContext(); const SourceMgr *getSourceManager() const { return SrcMgr; } @@ -259,6 +266,8 @@ namespace llvm { unsigned Flags, SectionKind Kind, unsigned EntrySize, StringRef Group); + void renameELFSection(const MCSectionELF *Section, StringRef Name); + const MCSectionELF *CreateELFGroupSection(); const MCSectionCOFF *getCOFFSection(StringRef Section, @@ -266,7 +275,7 @@ namespace llvm { SectionKind Kind, StringRef COMDATSymName, int Selection, - const MCSectionCOFF *Assoc = 0); + const MCSectionCOFF *Assoc = nullptr); const MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics, @@ -304,14 +313,6 @@ namespace llvm { bool isValidDwarfFileNumber(unsigned FileNumber, unsigned CUID = 0); - bool hasDwarfFiles() const { - // Traverse MCDwarfFilesCUMap and check whether each entry is empty. - for (const auto &FileTable : MCDwarfLineTablesCUMap) - if (!FileTable.second.getMCDwarfFiles().empty()) - return true; - return false; - } - const std::map<unsigned, MCDwarfLineTable> &getMCDwarfLineTables() const { return MCDwarfLineTablesCUMap; } @@ -385,11 +386,10 @@ namespace llvm { void setGenDwarfSectionEndSym(MCSymbol *Sym) { GenDwarfSectionEndSym = Sym; } - const std::vector<const MCGenDwarfLabelEntry *> - &getMCGenDwarfLabelEntries() const { + const std::vector<MCGenDwarfLabelEntry> &getMCGenDwarfLabelEntries() const { return MCGenDwarfLabelEntries; } - void addMCGenDwarfLabelEntry(const MCGenDwarfLabelEntry *E) { + void addMCGenDwarfLabelEntry(const MCGenDwarfLabelEntry &E) { MCGenDwarfLabelEntries.push_back(E); } @@ -399,6 +399,9 @@ namespace llvm { void setDwarfDebugProducer(StringRef S) { DwarfDebugProducer = S; } StringRef getDwarfDebugProducer() { return DwarfDebugProducer; } + void setDwarfVersion(uint16_t v) { DwarfVersion = v; } + uint16_t getDwarfVersion() const { return DwarfVersion; } + /// @} char *getSecureLogFile() { return SecureLogFile; } @@ -420,7 +423,7 @@ namespace llvm { // Unrecoverable error has occurred. Display the best diagnostic we can // and bail via exit(1). For now, most MC backend errors are unrecoverable. // FIXME: We should really do something about that. - LLVM_ATTRIBUTE_NORETURN void FatalError(SMLoc L, const Twine &Msg); + LLVM_ATTRIBUTE_NORETURN void FatalError(SMLoc L, const Twine &Msg) const; }; } // end namespace llvm diff --git a/include/llvm/MC/MCDisassembler.h b/include/llvm/MC/MCDisassembler.h index d545fc7..9d441bb 100644 --- a/include/llvm/MC/MCDisassembler.h +++ b/include/llvm/MC/MCDisassembler.h @@ -10,7 +10,6 @@ #define LLVM_MC_MCDISASSEMBLER_H #include "llvm-c/Disassembler.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/MC/MCRelocationInfo.h" #include "llvm/MC/MCSymbolizer.h" #include "llvm/Support/DataTypes.h" @@ -56,9 +55,8 @@ public: }; /// Constructor - Performs initial setup for the disassembler.
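The MCContext change above replaces untyped void* uniquing maps with typed maps keyed on a (section name, group name) string pair, which makes the uniquing rule explicit: requests with the same name and group must yield the same section object. A standalone sketch of that scheme, with hypothetical Section/getOrCreate names:

#include <map>
#include <string>
#include <utility>

typedef std::pair<std::string, std::string> SectionGroupPair;

struct Section {};

static Section *getOrCreate(std::map<SectionGroupPair, Section *> &Uniquing,
                            const std::string &Name, const std::string &Group) {
  Section *&Entry = Uniquing[SectionGroupPair(Name, Group)];
  if (!Entry)
    Entry = new Section(); // first request creates; later ones reuse
  return Entry;
}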
- MCDisassembler(const MCSubtargetInfo &STI) - : GetOpInfo(0), SymbolLookUp(0), DisInfo(0), Ctx(0), STI(STI), - Symbolizer(), CommentStream(0) {} + MCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) + : Ctx(Ctx), STI(STI), Symbolizer(), CommentStream(nullptr) {} virtual ~MCDisassembler(); @@ -85,18 +83,7 @@ public: raw_ostream &vStream, raw_ostream &cStream) const = 0; private: - // - // Hooks for symbolic disassembly via the public 'C' interface. - // - // The function to get the symbolic information for operands. - LLVMOpInfoCallback GetOpInfo; - // The function to lookup a symbol name. - LLVMSymbolLookupCallback SymbolLookUp; - // The pointer to the block of symbolic information for above call back. - void *DisInfo; - // The assembly context for creating symbols and MCExprs in place of - // immediate operands when there is symbolic information. - MCContext *Ctx; + MCContext &Ctx; protected: // Subtarget information, for instruction decoding predicates if required. @@ -116,19 +103,7 @@ public: /// This takes ownership of \p Symzer, and deletes the previously set one. void setSymbolizer(std::unique_ptr<MCSymbolizer> Symzer); - /// Sets up an external symbolizer that uses the C API callbacks. - void setupForSymbolicDisassembly(LLVMOpInfoCallback GetOpInfo, - LLVMSymbolLookupCallback SymbolLookUp, - void *DisInfo, - MCContext *Ctx, - std::unique_ptr<MCRelocationInfo> &RelInfo); - - LLVMOpInfoCallback getLLVMOpInfoCallback() const { return GetOpInfo; } - LLVMSymbolLookupCallback getLLVMSymbolLookupCallback() const { - return SymbolLookUp; - } - void *getDisInfoBlock() const { return DisInfo; } - MCContext *getMCContext() const { return Ctx; } + MCContext& getContext() const { return Ctx; } const MCSubtargetInfo& getSubtargetInfo() const { return STI; } diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h index 6e77c6c..6df8a19 100644 --- a/include/llvm/MC/MCDwarf.h +++ b/include/llvm/MC/MCDwarf.h @@ -30,6 +30,7 @@ namespace llvm { class MCAsmBackend; class MCContext; +class MCObjectStreamer; class MCSection; class MCStreamer; class MCSymbol; @@ -147,7 +148,7 @@ public: // This is called when an instruction is assembled into the specified // section and if there is information from the last .loc directive that // has yet to have a line entry made for it is made. - static void Make(MCStreamer *MCOS, const MCSection *Section); + static void Make(MCObjectStreamer *MCOS, const MCSection *Section); }; /// MCLineSection - Instances of this class represent the line information @@ -210,10 +211,10 @@ class MCDwarfLineTable { public: // This emits the Dwarf file and the line tables for all Compile Units. - static void Emit(MCStreamer *MCOS); + static void Emit(MCObjectStreamer *MCOS); // This emits the Dwarf file and the line tables for a given Compile Unit.
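With the C-API symbolizer hooks removed above, a target disassembler now receives its MCContext at construction and reaches it through getContext(). A sketch of what a backend subclass constructor looks like under the new signature (MyDisassembler is hypothetical; getInstruction is omitted, so the class stays abstract):

#include "llvm/MC/MCDisassembler.h"

namespace {
class MyDisassembler : public llvm::MCDisassembler {
public:
  MyDisassembler(const llvm::MCSubtargetInfo &STI, llvm::MCContext &Ctx)
      : MCDisassembler(STI, Ctx) {} // Ctx is stored as a reference member
};
} // namespace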
- void EmitCU(MCStreamer *MCOS) const; + void EmitCU(MCObjectStreamer *MCOS) const; unsigned getFile(StringRef &Directory, StringRef &FileName, unsigned FileNumber = 0); @@ -464,9 +465,9 @@ public: struct MCDwarfFrameInfo { MCDwarfFrameInfo() - : Begin(0), End(0), Personality(0), Lsda(0), Function(0), Instructions(), - PersonalityEncoding(), LsdaEncoding(0), CompactUnwindEncoding(0), - IsSignalFrame(false), IsSimple(false) {} + : Begin(nullptr), End(nullptr), Personality(nullptr), Lsda(nullptr), + Function(nullptr), Instructions(), PersonalityEncoding(), LsdaEncoding(0), + CompactUnwindEncoding(0), IsSignalFrame(false), IsSimple(false) {} MCSymbol *Begin; MCSymbol *End; const MCSymbol *Personality; @@ -485,9 +486,8 @@ public: // // This emits the frame info section. // - static void Emit(MCStreamer &streamer, MCAsmBackend *MAB, - bool usingCFI, bool isEH); - static void EmitAdvanceLoc(MCStreamer &Streamer, uint64_t AddrDelta); + static void Emit(MCObjectStreamer &streamer, MCAsmBackend *MAB, bool isEH); + static void EmitAdvanceLoc(MCObjectStreamer &Streamer, uint64_t AddrDelta); static void EncodeAdvanceLoc(MCContext &Context, uint64_t AddrDelta, raw_ostream &OS); }; diff --git a/include/llvm/MC/MCELFStreamer.h b/include/llvm/MC/MCELFStreamer.h index ebd5d57..be39128 100644 --- a/include/llvm/MC/MCELFStreamer.h +++ b/include/llvm/MC/MCELFStreamer.h @@ -61,18 +61,17 @@ public: void EmitCOFFSymbolType(int Type) override; void EndCOFFSymbolDef() override; - MCSymbolData &getOrCreateSymbolData(const MCSymbol *Symbol) override; - void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) override; void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) override; - void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, + void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = nullptr, uint64_t Size = 0, unsigned ByteAlignment = 0) override; void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment = 0) override; - void EmitValueImpl(const MCExpr *Value, unsigned Size) override; + void EmitValueImpl(const MCExpr *Value, unsigned Size, + const SMLoc &Loc = SMLoc()) override; void EmitFileDirective(StringRef Filename) override; diff --git a/include/llvm/MC/MCELFSymbolFlags.h b/include/llvm/MC/MCELFSymbolFlags.h index 5b82a58..2f1f561 100644 --- a/include/llvm/MC/MCELFSymbolFlags.h +++ b/include/llvm/MC/MCELFSymbolFlags.h @@ -24,9 +24,7 @@ namespace llvm { ELF_STT_Shift = 0, // Shift value for STT_* flags. ELF_STB_Shift = 4, // Shift value for STB_* flags. ELF_STV_Shift = 8, // Shift value for STV_* flags. - ELF_STO_Shift = 10, // Shift value for STO_* flags. - ELF_Other_Shift = 16 // Shift value for llvm local flags, - // not part of the final object file + ELF_STO_Shift = 10 // Shift value for STO_* flags. 
}; enum ELFSymbolFlags { @@ -49,9 +47,7 @@ namespace llvm { ELF_STV_Default = (ELF::STV_DEFAULT << ELF_STV_Shift), ELF_STV_Internal = (ELF::STV_INTERNAL << ELF_STV_Shift), ELF_STV_Hidden = (ELF::STV_HIDDEN << ELF_STV_Shift), - ELF_STV_Protected = (ELF::STV_PROTECTED << ELF_STV_Shift), - - ELF_Other_ThumbFunc = (1 << ELF_Other_Shift) + ELF_STV_Protected = (ELF::STV_PROTECTED << ELF_STV_Shift) }; } // end namespace llvm diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h index 0033a54..ca5cecb 100644 --- a/include/llvm/MC/MCExpr.h +++ b/include/llvm/MC/MCExpr.h @@ -53,8 +53,9 @@ protected: bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, const MCAsmLayout *Layout, - const SectionAddrMap *Addrs, - bool InSet) const; + const SectionAddrMap *Addrs, bool InSet, + bool ForceVarExpansion) const; + public: /// @name Accessors /// @{ @@ -93,6 +94,14 @@ public: /// @result - True on success. bool EvaluateAsRelocatable(MCValue &Res, const MCAsmLayout *Layout) const; + /// \brief Try to evaluate the expression to the form (a - b + constant) where + /// neither a nor b are variables. + /// + /// This is a more aggressive variant of EvaluateAsRelocatable. The intended + /// use is for when relocations are not available, like the symbol value in + /// the symbol table. + bool EvaluateAsValue(MCValue &Res, const MCAsmLayout *Layout) const; + /// FindAssociatedSection - Find the "associated section" for this expression, /// which is currently defined as the absolute section for constants, or /// otherwise the section associated with the first defined symbol in the @@ -253,6 +262,8 @@ public: VK_Mips_GOT_LO16, VK_Mips_CALL_HI16, VK_Mips_CALL_LO16, + VK_Mips_PCREL_HI16, + VK_Mips_PCREL_LO16, VK_COFF_IMGREL32 // symbol@imgrel (image-relative) }; diff --git a/include/llvm/MC/MCExternalSymbolizer.h b/include/llvm/MC/MCExternalSymbolizer.h index cab9152..2c7d237 100644 --- a/include/llvm/MC/MCExternalSymbolizer.h +++ b/include/llvm/MC/MCExternalSymbolizer.h @@ -26,7 +26,7 @@ namespace llvm { /// /// See llvm-c/Disassembler.h. class MCExternalSymbolizer : public MCSymbolizer { - +protected: /// \name Hooks for symbolic disassembly via the public 'C' interface. /// @{ /// The function to get the symbolic information for operands. diff --git a/include/llvm/MC/MCFixup.h b/include/llvm/MC/MCFixup.h index e6d675f..98a1419 100644 --- a/include/llvm/MC/MCFixup.h +++ b/include/llvm/MC/MCFixup.h @@ -88,8 +88,6 @@ public: MCFixupKind getKind() const { return MCFixupKind(Kind); } - MCSymbolRefExpr::VariantKind getAccessVariant() const; - uint32_t getOffset() const { return Offset; } void setOffset(uint32_t Value) { Offset = Value; } diff --git a/include/llvm/MC/MCFunction.h b/include/llvm/MC/MCFunction.h index 22c9192..bfa470b 100644 --- a/include/llvm/MC/MCFunction.h +++ b/include/llvm/MC/MCFunction.h @@ -17,6 +17,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInst.h" +#include <memory> #include <string> #include <vector> @@ -88,13 +89,12 @@ class MCFunction { std::string Name; MCModule *ParentModule; - typedef std::vector<MCBasicBlock *> BasicBlockListTy; + typedef std::vector<std::unique_ptr<MCBasicBlock>> BasicBlockListTy; BasicBlockListTy Blocks; // MCModule owns the function. friend class MCModule; MCFunction(StringRef Name, MCModule *Parent); - ~MCFunction(); public: /// \brief Create an MCBasicBlock backed by Insts and add it to this function.
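The BasicBlockListTy change above is the common owning-container pattern: the vector holds std::unique_ptr for ownership while accessors hand out raw pointers as non-owning views, hence the .get() calls in the next hunk. A self-contained illustration with a hypothetical Block type:

#include <memory>
#include <vector>

struct Block { int Id; };

class BlockList {
  std::vector<std::unique_ptr<Block>> Blocks; // owns the blocks
public:
  Block *add(int Id) {
    Blocks.push_back(std::unique_ptr<Block>(new Block{Id}));
    return Blocks.back().get(); // non-owning view for callers
  }
  Block *front() { return Blocks.front().get(); }
  Block *back() { return Blocks.back().get(); }
};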
@@ -126,10 +126,10 @@ public: const_iterator end() const { return Blocks.end(); } iterator end() { return Blocks.end(); } - const MCBasicBlock* front() const { return Blocks.front(); } - MCBasicBlock* front() { return Blocks.front(); } - const MCBasicBlock* back() const { return Blocks.back(); } - MCBasicBlock* back() { return Blocks.back(); } + const MCBasicBlock* front() const { return Blocks.front().get(); } + MCBasicBlock* front() { return Blocks.front().get(); } + const MCBasicBlock* back() const { return Blocks.back().get(); } + MCBasicBlock* back() { return Blocks.back().get(); } /// \brief Find the basic block, if any, that starts at \p StartAddr. const MCBasicBlock *find(uint64_t StartAddr) const; diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h index 4766815..6918280 100644 --- a/include/llvm/MC/MCInst.h +++ b/include/llvm/MC/MCInst.h @@ -184,18 +184,18 @@ public: /// \brief Dump the MCInst as prettily as possible using the additional MC /// structures, if given. Operators are separated by the \p Separator /// string. - void dump_pretty(raw_ostream &OS, const MCAsmInfo *MAI = 0, - const MCInstPrinter *Printer = 0, + void dump_pretty(raw_ostream &OS, const MCAsmInfo *MAI = nullptr, + const MCInstPrinter *Printer = nullptr, StringRef Separator = " ") const; }; inline raw_ostream& operator<<(raw_ostream &OS, const MCOperand &MO) { - MO.print(OS, 0); + MO.print(OS, nullptr); return OS; } inline raw_ostream& operator<<(raw_ostream &OS, const MCInst &MI) { - MI.print(OS, 0); + MI.print(OS, nullptr); return OS; } diff --git a/include/llvm/MC/MCInstPrinter.h b/include/llvm/MC/MCInstPrinter.h index b4258be..7f55b29 100644 --- a/include/llvm/MC/MCInstPrinter.h +++ b/include/llvm/MC/MCInstPrinter.h @@ -57,8 +57,9 @@ protected: public: MCInstPrinter(const MCAsmInfo &mai, const MCInstrInfo &mii, const MCRegisterInfo &mri) - : CommentStream(0), MAI(mai), MII(mii), MRI(mri), AvailableFeatures(0), - UseMarkup(0), PrintImmHex(0), PrintHexStyle(HexStyle::C) {} + : CommentStream(nullptr), MAI(mai), MII(mii), MRI(mri), + AvailableFeatures(0), UseMarkup(0), PrintImmHex(0), + PrintHexStyle(HexStyle::C) {} virtual ~MCInstPrinter(); diff --git a/include/llvm/MC/MCInstrDesc.h b/include/llvm/MC/MCInstrDesc.h index 214b593..5896de7 100644 --- a/include/llvm/MC/MCInstrDesc.h +++ b/include/llvm/MC/MCInstrDesc.h @@ -504,7 +504,7 @@ public: /// \brief Return the number of implicit uses this instruction has. unsigned getNumImplicitUses() const { - if (ImplicitUses == 0) return 0; + if (!ImplicitUses) return 0; unsigned i = 0; for (; ImplicitUses[i]; ++i) /*empty*/; return i; @@ -526,7 +526,7 @@ public: /// \brief Return the number of implicit defs this instruct has. unsigned getNumImplicitDefs() const { - if (ImplicitDefs == 0) return 0; + if (!ImplicitDefs) return 0; unsigned i = 0; for (; ImplicitDefs[i]; ++i) /*empty*/; return i; @@ -544,7 +544,7 @@ public: /// \brief Return true if this instruction implicitly /// defines the specified physical register. bool hasImplicitDefOfPhysReg(unsigned Reg, - const MCRegisterInfo *MRI = 0) const { + const MCRegisterInfo *MRI = nullptr) const { if (const uint16_t *ImpDefs = ImplicitDefs) for (; *ImpDefs; ++ImpDefs) if (*ImpDefs == Reg || (MRI && MRI->isSubRegister(Reg, *ImpDefs))) diff --git a/include/llvm/MC/MCInstrItineraries.h b/include/llvm/MC/MCInstrItineraries.h index c4f9e1c..5104345 100644 --- a/include/llvm/MC/MCInstrItineraries.h +++ b/include/llvm/MC/MCInstrItineraries.h @@ -119,8 +119,8 @@ public: /// Ctors. 
/// InstrItineraryData() : SchedModel(&MCSchedModel::DefaultSchedModel), - Stages(0), OperandCycles(0), - Forwardings(0), Itineraries(0) {} + Stages(nullptr), OperandCycles(nullptr), + Forwardings(nullptr), Itineraries(nullptr) {} InstrItineraryData(const MCSchedModel *SM, const InstrStage *S, const unsigned *OS, const unsigned *F) @@ -129,7 +129,7 @@ public: /// isEmpty - Returns true if there are no itineraries. /// - bool isEmpty() const { return Itineraries == 0; } + bool isEmpty() const { return Itineraries == nullptr; } /// isEndMarker - Returns true if the index is for the end marker /// itinerary. diff --git a/include/llvm/MC/MCModule.h b/include/llvm/MC/MCModule.h index 63635c7..aa389cb 100644 --- a/include/llvm/MC/MCModule.h +++ b/include/llvm/MC/MCModule.h @@ -18,6 +18,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/DataTypes.h" +#include <memory> #include <vector> namespace llvm { @@ -73,7 +74,7 @@ class MCModule { /// \name Function tracking /// @{ - typedef std::vector<MCFunction *> FunctionListTy; + typedef std::vector<std::unique_ptr<MCFunction>> FunctionListTy; FunctionListTy Functions; /// @} @@ -87,7 +88,7 @@ class MCModule { friend class MCObjectDisassembler; public: - MCModule() : Entrypoint(0) { } + MCModule(); ~MCModule(); /// \name Create a new MCAtom covering the specified offset range. diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h index 1c5c19e..1a56040 100644 --- a/include/llvm/MC/MCObjectFileInfo.h +++ b/include/llvm/MC/MCObjectFileInfo.h @@ -44,11 +44,10 @@ protected: /// section. bool SupportsCompactUnwindWithoutEHFrame; - /// PersonalityEncoding, LSDAEncoding, FDEEncoding, TTypeEncoding - Some - /// encoding values for EH. + /// PersonalityEncoding, LSDAEncoding, TTypeEncoding - Some encoding values + /// for EH. unsigned PersonalityEncoding; unsigned LSDAEncoding; - unsigned FDEEncoding; unsigned FDECFIEncoding; unsigned TTypeEncoding; @@ -217,9 +216,7 @@ public: unsigned getPersonalityEncoding() const { return PersonalityEncoding; } unsigned getLSDAEncoding() const { return LSDAEncoding; } - unsigned getFDEEncoding(bool CFI) const { - return CFI ? FDECFIEncoding : FDEEncoding; - } + unsigned getFDEEncoding() const { return FDECFIEncoding; } unsigned getTTypeEncoding() const { return TTypeEncoding; } unsigned getCompactUnwindDwarfEHFrameOnly() const { diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h index a42b7a05..e41a8ba 100644 --- a/include/llvm/MC/MCObjectStreamer.h +++ b/include/llvm/MC/MCObjectStreamer.h @@ -35,6 +35,8 @@ class MCObjectStreamer : public MCStreamer { MCAssembler *Assembler; MCSectionData *CurSectionData; MCSectionData::iterator CurInsertionPoint; + bool EmitEHFrame; + bool EmitDebugFrame; virtual void EmitInstToData(const MCInst &Inst, const MCSubtargetInfo&) = 0; void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override; @@ -54,6 +56,12 @@ public: /// Object streamers require the integrated assembler.
bool isIntegratedAssemblerRequired() const override { return true; } + MCSymbolData &getOrCreateSymbolData(const MCSymbol *Symbol) { + return getAssembler().getOrCreateSymbolData(*Symbol); + } + void EmitFrames(MCAsmBackend *MAB); + void EmitCFISections(bool EH, bool Debug) override; + protected: MCSectionData *getCurrentSectionData() const { return CurSectionData; @@ -81,7 +89,8 @@ public: void EmitLabel(MCSymbol *Symbol) override; void EmitDebugLabel(MCSymbol *Symbol) override; void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override; - void EmitValueImpl(const MCExpr *Value, unsigned Size) override; + void EmitValueImpl(const MCExpr *Value, unsigned Size, + const SMLoc &Loc = SMLoc()) override; void EmitULEB128Value(const MCExpr *Value) override; void EmitSLEB128Value(const MCExpr *Value) override; void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override; @@ -109,9 +118,9 @@ public: StringRef FileName) override; void EmitDwarfAdvanceLineAddr(int64_t LineDelta, const MCSymbol *LastLabel, const MCSymbol *Label, - unsigned PointerSize) override; + unsigned PointerSize); void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, - const MCSymbol *Label) override; + const MCSymbol *Label); void EmitGPRel32Value(const MCExpr *Value) override; void EmitGPRel64Value(const MCExpr *Value) override; void EmitFill(uint64_t NumBytes, uint8_t FillValue) override; diff --git a/include/llvm/MC/MCParser/AsmLexer.h b/include/llvm/MC/MCParser/AsmLexer.h index f36011c..59b5c09 100644 --- a/include/llvm/MC/MCParser/AsmLexer.h +++ b/include/llvm/MC/MCParser/AsmLexer.h @@ -42,7 +42,7 @@ public: AsmLexer(const MCAsmInfo &MAI); ~AsmLexer(); - void setBuffer(const MemoryBuffer *buf, const char *ptr = NULL); + void setBuffer(const MemoryBuffer *buf, const char *ptr = nullptr); StringRef LexUntilEndOfStatement() override; StringRef LexUntilEndOfLine(); diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h index 0389caa..f751786 100644 --- a/include/llvm/MC/MCParser/MCAsmParser.h +++ b/include/llvm/MC/MCParser/MCAsmParser.h @@ -39,7 +39,7 @@ public: unsigned Length, Size, Type; void clear() { - OpDecl = 0; + OpDecl = nullptr; IsVarDecl = false; Length = 1; Size = 0; diff --git a/include/llvm/MC/MCParser/MCParsedAsmOperand.h b/include/llvm/MC/MCParser/MCParsedAsmOperand.h index 818fbbd..e8740aa 100644 --- a/include/llvm/MC/MCParser/MCParsedAsmOperand.h +++ b/include/llvm/MC/MCParser/MCParsedAsmOperand.h @@ -38,7 +38,7 @@ public: unsigned getMCOperandNum() { return MCOperandNum; } virtual StringRef getSymName() { return StringRef(); } - virtual void *getOpDecl() { return 0; } + virtual void *getOpDecl() { return nullptr; } /// isToken - Is this a token operand? virtual bool isToken() const = 0; diff --git a/include/llvm/MC/MCRegisterInfo.h b/include/llvm/MC/MCRegisterInfo.h index 3fa89c1..766f631 100644 --- a/include/llvm/MC/MCRegisterInfo.h +++ b/include/llvm/MC/MCRegisterInfo.h @@ -159,7 +159,7 @@ private: const MCRegisterClass *Classes; // Pointer to the regclass array unsigned NumClasses; // Number of entries in the array unsigned NumRegUnits; // Number of regunits. - const uint16_t (*RegUnitRoots)[2]; // Pointer to regunit root table. + const MCPhysReg (*RegUnitRoots)[2]; // Pointer to regunit root table. const MCPhysReg *DiffLists; // Pointer to the difflists array const char *RegStrings; // Pointer to the string table. 
const uint16_t *SubRegIndices; // Pointer to the subreg lookup @@ -191,7 +191,7 @@ public: protected: /// Create an invalid iterator. Call init() to point to something useful. - DiffListIterator() : Val(0), List(0) {} + DiffListIterator() : Val(0), List(nullptr) {} /// init - Point the iterator to InitVal, decoding subsequent values from /// DiffList. The iterator will initially point to InitVal, sub-classes are @@ -223,7 +223,7 @@ public: void operator++() { // The end of the list is encoded as a 0 differential. if (!advance()) - List = 0; + List = nullptr; } }; @@ -239,7 +239,7 @@ public: void InitMCRegisterInfo(const MCRegisterDesc *D, unsigned NR, unsigned RA, unsigned PC, const MCRegisterClass *C, unsigned NC, - const uint16_t (*RURoots)[2], + const MCPhysReg (*RURoots)[2], unsigned NRU, const MCPhysReg *DL, const char *Strings, diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h index d1ab411..862a0fd 100644 --- a/include/llvm/MC/MCSchedule.h +++ b/include/llvm/MC/MCSchedule.h @@ -159,6 +159,14 @@ public: unsigned MicroOpBufferSize; static const unsigned DefaultMicroOpBufferSize = 0; + // LoopMicroOpBufferSize is the number of micro-ops that the processor may + // buffer for optimized loop execution. More generally, this represents the + // optimal number of micro-ops in a loop body. A loop may be partially + // unrolled to bring the count of micro-ops in the loop body closer to this + // number. + unsigned LoopMicroOpBufferSize; + static const unsigned DefaultLoopMicroOpBufferSize = 0; + // LoadLatency is the expected latency of load instructions. // // If MinLatency >= 0, this may be overriden for individual load opcodes by @@ -198,23 +206,24 @@ public: // MCSchedModel instead of using a generated itinerary. MCSchedModel(): IssueWidth(DefaultIssueWidth), MicroOpBufferSize(DefaultMicroOpBufferSize), + LoopMicroOpBufferSize(DefaultLoopMicroOpBufferSize), LoadLatency(DefaultLoadLatency), HighLatency(DefaultHighLatency), MispredictPenalty(DefaultMispredictPenalty), - CompleteModel(true), - ProcID(0), ProcResourceTable(0), SchedClassTable(0), - NumProcResourceKinds(0), NumSchedClasses(0), - InstrItineraries(0) { + CompleteModel(true), ProcID(0), ProcResourceTable(nullptr), + SchedClassTable(nullptr), NumProcResourceKinds(0), + NumSchedClasses(0), InstrItineraries(nullptr) { (void)NumProcResourceKinds; (void)NumSchedClasses; } // Table-gen driven ctor. 
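Before the table-generated constructor below, a note on the LoopMicroOpBufferSize field introduced above: the comment suggests partially unrolling until the loop body approaches the buffer size. A rough sketch of one way an unroller could consume the field (an assumed heuristic, not necessarily the one LLVM's passes use):

static unsigned pickUnrollFactor(unsigned LoopMicroOpBufferSize,
                                 unsigned BodyMicroOps, unsigned MaxFactor) {
  if (LoopMicroOpBufferSize == 0 || BodyMicroOps == 0)
    return 1; // no model available: leave the loop alone
  // Largest factor whose unrolled body still fits in the buffer.
  unsigned Factor = LoopMicroOpBufferSize / BodyMicroOps;
  if (Factor == 0)
    Factor = 1;
  return Factor < MaxFactor ? Factor : MaxFactor;
}

For example, with LoopMicroOpBufferSize = 32 and an 8-micro-op body, this sketch unrolls by a factor of 4.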
- MCSchedModel(unsigned iw, int mbs, unsigned ll, unsigned hl, + MCSchedModel(unsigned iw, int mbs, int lmbs, unsigned ll, unsigned hl, unsigned mp, bool cm, unsigned pi, const MCProcResourceDesc *pr, const MCSchedClassDesc *sc, unsigned npr, unsigned nsc, const InstrItinerary *ii): - IssueWidth(iw), MicroOpBufferSize(mbs), LoadLatency(ll), HighLatency(hl), + IssueWidth(iw), MicroOpBufferSize(mbs), LoopMicroOpBufferSize(lmbs), + LoadLatency(ll), HighLatency(hl), MispredictPenalty(mp), CompleteModel(cm), ProcID(pi), ProcResourceTable(pr), SchedClassTable(sc), NumProcResourceKinds(npr), NumSchedClasses(nsc), InstrItineraries(ii) {} diff --git a/include/llvm/MC/MCSectionCOFF.h b/include/llvm/MC/MCSectionCOFF.h index aa02d9a..a428f9e 100644 --- a/include/llvm/MC/MCSectionCOFF.h +++ b/include/llvm/MC/MCSectionCOFF.h @@ -58,7 +58,7 @@ class MCSymbol; assert ((Characteristics & 0x00F00000) == 0 && "alignment must not be set upon section creation"); assert ((Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) == - (Assoc != 0) && + (Assoc != nullptr) && "associative COMDAT section must have an associated section"); } ~MCSectionCOFF(); @@ -79,7 +79,8 @@ class MCSymbol; int getSelection() const { return Selection; } const MCSectionCOFF *getAssocSection() const { return Assoc; } - void setSelection(int Selection, const MCSectionCOFF *Assoc = 0) const; + void setSelection(int Selection, + const MCSectionCOFF *Assoc = nullptr) const; void PrintSwitchToSection(const MCAsmInfo &MAI, raw_ostream &OS, const MCExpr *Subsection) const override; diff --git a/include/llvm/MC/MCSectionELF.h b/include/llvm/MC/MCSectionELF.h index 89c02cc..5ec23f1 100644 --- a/include/llvm/MC/MCSectionELF.h +++ b/include/llvm/MC/MCSectionELF.h @@ -14,7 +14,7 @@ #ifndef LLVM_MC_MCSECTIONELF_H #define LLVM_MC_MCSECTIONELF_H -#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Debug.h" @@ -53,6 +53,9 @@ private: : MCSection(SV_ELF, K), SectionName(Section), Type(type), Flags(flags), EntrySize(entrySize), Group(group) {} ~MCSectionELF(); + + void setSectionName(StringRef Name) { SectionName = Name; } + public: /// ShouldOmitSectionDirective - Decides whether a '.section' directive diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h index 8ee60c1..2a8367a 100644 --- a/include/llvm/MC/MCStreamer.h +++ b/include/llvm/MC/MCStreamer.h @@ -121,6 +121,8 @@ public: virtual void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE); + virtual void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value); + void finish() override; /// Callback used to implement the ldr= pseudo. @@ -152,9 +154,6 @@ class MCStreamer { MCStreamer(const MCStreamer &) LLVM_DELETED_FUNCTION; MCStreamer &operator=(const MCStreamer &) LLVM_DELETED_FUNCTION; - bool EmitEHFrame; - bool EmitDebugFrame; - std::vector<MCDwarfFrameInfo> FrameInfos; MCDwarfFrameInfo *getCurrentFrameInfo(); MCSymbol *EmitCFICommon(); @@ -187,7 +186,6 @@ protected: virtual void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame); void RecordProcEnd(MCDwarfFrameInfo &Frame); virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &CurFrame); - void EmitFrames(MCAsmBackend *MAB, bool usingCFI); MCWin64EHUnwindInfo *getCurrentW64UnwindInfo() { return CurrentW64UnwindInfo; @@ -332,7 +330,8 @@ public: /// @p Section. This is required to update CurSection. /// /// This corresponds to assembler directives like .section, .text, etc.
- void SwitchSection(const MCSection *Section, const MCExpr *Subsection = 0) { + void SwitchSection(const MCSection *Section, + const MCExpr *Subsection = nullptr) { assert(Section && "Cannot switch to a null section!"); MCSectionSubPair curSection = SectionStack.back().first; SectionStack.back().second = curSection; @@ -346,7 +345,7 @@ public: /// emitted to @p Section. This is required to update CurSection. This /// version does not call ChangeSection. void SwitchSectionNoChange(const MCSection *Section, - const MCExpr *Subsection = 0) { + const MCExpr *Subsection = nullptr) { assert(Section && "Cannot switch to a null section!"); MCSectionSubPair curSection = SectionStack.back().first; SectionStack.back().second = curSection; @@ -397,9 +396,6 @@ public: /// a Thumb mode function (ARM target only). virtual void EmitThumbFunc(MCSymbol *Func) = 0; - /// getOrCreateSymbolData - Get symbol data for given symbol. - virtual MCSymbolData &getOrCreateSymbolData(const MCSymbol *Symbol); - /// EmitAssignment - Emit an assignment of @p Value to @p Symbol. /// /// This corresponds to an assembler statement such as: @@ -495,8 +491,9 @@ public: /// @param Size - The size of the zerofill symbol. /// @param ByteAlignment - The alignment of the zerofill symbol if /// non-zero. This must be a power of 2 on some targets. - virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, - uint64_t Size = 0, unsigned ByteAlignment = 0) = 0; + virtual void EmitZerofill(const MCSection *Section, + MCSymbol *Symbol = nullptr, uint64_t Size = 0, + unsigned ByteAlignment = 0) = 0; /// EmitTBSSSymbol - Emit a thread local bss (.tbss) symbol. /// @@ -527,9 +524,12 @@ public: /// @param Value - The value to emit. /// @param Size - The size of the integer (in bytes) to emit. This must /// match a native machine width. - virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) = 0; + /// @param Loc - The location of the expression for error reporting. + virtual void EmitValueImpl(const MCExpr *Value, unsigned Size, + const SMLoc &Loc = SMLoc()) = 0; - void EmitValue(const MCExpr *Value, unsigned Size); + void EmitValue(const MCExpr *Value, unsigned Size, + const SMLoc &Loc = SMLoc()); /// EmitIntValue - Special case of EmitValue that avoids the client having /// to pass in a MCExpr for constant integers. @@ -650,14 +650,6 @@ public: unsigned Isa, unsigned Discriminator, StringRef FileName); - virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta, - const MCSymbol *LastLabel, - const MCSymbol *Label, - unsigned PointerSize) = 0; - - virtual void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, - const MCSymbol *Label) {} - virtual MCSymbol *getDwarfLineTableSymbol(unsigned CUID); void EmitDwarfSetLineAddr(int64_t LineDelta, const MCSymbol *Label, @@ -754,10 +746,9 @@ MCStreamer *createNullStreamer(MCContext &Ctx); /// \param ShowInst - Whether to show the MCInst representation inline with /// the assembly. MCStreamer *createAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useCFI, - bool useDwarfDirectory, MCInstPrinter *InstPrint, - MCCodeEmitter *CE, MCAsmBackend *TAB, - bool ShowInst); + bool isVerboseAsm, bool useDwarfDirectory, + MCInstPrinter *InstPrint, MCCodeEmitter *CE, + MCAsmBackend *TAB, bool ShowInst); /// createMachOStreamer - Create a machine code streamer which will generate /// Mach-O format object files. 
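Threading an SMLoc through EmitValueImpl, as the hunks above show, lets streamers point diagnostics at the offending expression instead of failing without a location. A sketch of a caller that takes advantage of this (emitAbsoluteOrDiagnose is hypothetical; FatalError is the MCContext method made const earlier in this patch):

#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"

static void emitAbsoluteOrDiagnose(llvm::MCStreamer &S, const llvm::MCExpr *V,
                                   unsigned Size, llvm::SMLoc Loc) {
  int64_t Res;
  if (!V->EvaluateAsAbsolute(Res))
    S.getContext().FatalError(Loc, "expected an absolute expression");
  S.EmitIntValue(Res, Size); // the diagnostic above points at Loc
}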
@@ -768,14 +759,6 @@ MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB, bool RelaxAll = false, bool LabelSections = false); -/// createWinCOFFStreamer - Create a machine code streamer which will -/// generate Microsoft COFF format object files. -/// -/// Takes ownership of \p TAB and \p CE. -MCStreamer *createWinCOFFStreamer(MCContext &Ctx, MCAsmBackend &TAB, - MCCodeEmitter &CE, raw_ostream &OS, - bool RelaxAll = false); - /// createELFStreamer - Create a machine code streamer which will generate /// ELF format object files. MCStreamer *createELFStreamer(MCContext &Ctx, MCAsmBackend &TAB, diff --git a/include/llvm/MC/MCSubtargetInfo.h b/include/llvm/MC/MCSubtargetInfo.h index 01e8236..088c5e7 100644 --- a/include/llvm/MC/MCSubtargetInfo.h +++ b/include/llvm/MC/MCSubtargetInfo.h @@ -28,8 +28,8 @@ class StringRef; /// class MCSubtargetInfo { std::string TargetTriple; // Target triple - const SubtargetFeatureKV *ProcFeatures; // Processor feature list - const SubtargetFeatureKV *ProcDesc; // Processor descriptions + ArrayRef<SubtargetFeatureKV> ProcFeatures; // Processor feature list + ArrayRef<SubtargetFeatureKV> ProcDesc; // Processor descriptions // Scheduler machine model const SubtargetInfoKV *ProcSchedModels; @@ -41,21 +41,18 @@ class MCSubtargetInfo { const InstrStage *Stages; // Instruction itinerary stages const unsigned *OperandCycles; // Itinerary operand cycles const unsigned *ForwardingPaths; // Forwarding paths - unsigned NumFeatures; // Number of processor features - unsigned NumProcs; // Number of processors uint64_t FeatureBits; // Feature bits for current CPU + FS public: void InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS, - const SubtargetFeatureKV *PF, - const SubtargetFeatureKV *PD, + ArrayRef<SubtargetFeatureKV> PF, + ArrayRef<SubtargetFeatureKV> PD, const SubtargetInfoKV *ProcSched, const MCWriteProcResEntry *WPR, const MCWriteLatencyEntry *WL, const MCReadAdvanceEntry *RA, const InstrStage *IS, - const unsigned *OC, const unsigned *FP, - unsigned NF, unsigned NP); + const unsigned *OC, const unsigned *FP); /// getTargetTriple - Return the target triple string. StringRef getTargetTriple() const { diff --git a/include/llvm/MC/MCSymbol.h b/include/llvm/MC/MCSymbol.h index ea14da1..0b3c3ce 100644 --- a/include/llvm/MC/MCSymbol.h +++ b/include/llvm/MC/MCSymbol.h @@ -60,7 +60,7 @@ namespace llvm { friend class MCExpr; friend class MCContext; MCSymbol(StringRef name, bool isTemporary) - : Name(name), Section(0), Value(0), + : Name(name), Section(nullptr), Value(nullptr), IsTemporary(isTemporary), IsUsed(false) {} MCSymbol(const MCSymbol&) LLVM_DELETED_FUNCTION; @@ -87,7 +87,7 @@ namespace llvm { /// /// Defined symbols are either absolute or in some section. bool isDefined() const { - return Section != 0; + return Section != nullptr; } /// isInSection - Check if this symbol is defined in some section (i.e., it @@ -118,7 +118,7 @@ namespace llvm { /// setUndefined - Mark the symbol as undefined. void setUndefined() { - Section = 0; + Section = nullptr; } /// setAbsolute - Mark the symbol as absolute. @@ -130,7 +130,7 @@ namespace llvm { /// isVariable - Check if this is a variable symbol. bool isVariable() const { - return Value != 0; + return Value != nullptr; } /// getVariableValue() - Get the value for variable symbols.
diff --git a/include/llvm/MC/MCTargetAsmParser.h b/include/llvm/MC/MCTargetAsmParser.h index 0073136..18ef6c2 100644 --- a/include/llvm/MC/MCTargetAsmParser.h +++ b/include/llvm/MC/MCTargetAsmParser.h @@ -12,14 +12,15 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCParser/MCAsmParserExtension.h" +#include "llvm/MC/MCTargetOptions.h" namespace llvm { -class MCStreamer; -class StringRef; -class SMLoc; class AsmToken; -class MCParsedAsmOperand; class MCInst; +class MCParsedAsmOperand; +class MCStreamer; +class SMLoc; +class StringRef; template <typename T> class SmallVectorImpl; enum AsmRewriteKind { @@ -63,7 +64,7 @@ struct ParseInstructionInfo { SmallVectorImpl<AsmRewrite> *AsmRewrites; - ParseInstructionInfo() : AsmRewrites(0) {} + ParseInstructionInfo() : AsmRewrites(nullptr) {} ParseInstructionInfo(SmallVectorImpl<AsmRewrite> *rewrites) : AsmRewrites(rewrites) {} @@ -97,6 +98,9 @@ protected: // Can only create subclasses. /// ms-style inline assembly. MCAsmParserSemaCallback *SemaCallback; + /// Set of options which affects instrumentation of inline assembly. + MCTargetOptions MCOptions; + public: virtual ~MCTargetAsmParser(); @@ -179,7 +183,7 @@ public: virtual const MCExpr *applyModifierToExpr(const MCExpr *E, MCSymbolRefExpr::VariantKind, MCContext &Ctx) { - return 0; + return nullptr; } virtual void onLabelParsed(MCSymbol *Symbol) { }; diff --git a/include/llvm/MC/MCTargetOptions.h b/include/llvm/MC/MCTargetOptions.h new file mode 100644 index 0000000..80cc8be --- /dev/null +++ b/include/llvm/MC/MCTargetOptions.h @@ -0,0 +1,54 @@ +//===- MCTargetOptions.h - MC Target Options -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_MCTARGETOPTIONS_H +#define LLVM_MC_MCTARGETOPTIONS_H + +namespace llvm { + +class MCTargetOptions { +public: + enum AsmInstrumentation { + AsmInstrumentationNone, + AsmInstrumentationAddress + }; + + /// Enables AddressSanitizer instrumentation at machine level. + bool SanitizeAddress : 1; + + bool MCRelaxAll : 1; + bool MCNoExecStack : 1; + bool MCSaveTempLabels : 1; + bool MCUseDwarfDirectory : 1; + bool ShowMCEncoding : 1; + bool ShowMCInst : 1; + bool AsmVerbose : 1; + MCTargetOptions(); +}; + +inline bool operator==(const MCTargetOptions &LHS, const MCTargetOptions &RHS) { +#define ARE_EQUAL(X) LHS.X == RHS.X + return (ARE_EQUAL(SanitizeAddress) && + ARE_EQUAL(MCRelaxAll) && + ARE_EQUAL(MCNoExecStack) && + ARE_EQUAL(MCSaveTempLabels) && + ARE_EQUAL(MCUseDwarfDirectory) && + ARE_EQUAL(ShowMCEncoding) && + ARE_EQUAL(ShowMCInst) && + ARE_EQUAL(AsmVerbose)); +#undef ARE_EQUAL +} + +inline bool operator!=(const MCTargetOptions &LHS, const MCTargetOptions &RHS) { + return !(LHS == RHS); +} + +} // end namespace llvm + +#endif diff --git a/include/llvm/MC/MCTargetOptionsCommandFlags.h b/include/llvm/MC/MCTargetOptionsCommandFlags.h new file mode 100644 index 0000000..17a117a --- /dev/null +++ b/include/llvm/MC/MCTargetOptionsCommandFlags.h @@ -0,0 +1,44 @@ +//===-- MCTargetOptionsCommandFlags.h --------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// This file contains machine code-specific flags that are shared between +// different command line tools. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_MCTARGETOPTIONSCOMMANDFLAGS_H +#define LLVM_MC_MCTARGETOPTIONSCOMMANDFLAGS_H + +#include "llvm/Support/CommandLine.h" +#include "llvm/MC/MCTargetOptions.h" +using namespace llvm; + +cl::opt<MCTargetOptions::AsmInstrumentation> AsmInstrumentation( + "asm-instrumentation", cl::desc("Instrumentation of inline assembly and " + "assembly source files"), + cl::init(MCTargetOptions::AsmInstrumentationNone), + cl::values(clEnumValN(MCTargetOptions::AsmInstrumentationNone, "none", + "no instrumentation at all"), + clEnumValN(MCTargetOptions::AsmInstrumentationAddress, "address", + "instrument instructions with memory arguments"), + clEnumValEnd)); + +cl::opt<bool> RelaxAll("mc-relax-all", + cl::desc("When used with filetype=obj, " + "relax all fixups in the emitted object file")); + +static inline MCTargetOptions InitMCTargetOptionsFromFlags() { + MCTargetOptions Options; + Options.SanitizeAddress = + (AsmInstrumentation == MCTargetOptions::AsmInstrumentationAddress); + Options.MCRelaxAll = RelaxAll; + return Options; +} + +#endif diff --git a/include/llvm/MC/MCValue.h b/include/llvm/MC/MCValue.h index f4ea511..dd86979 100644 --- a/include/llvm/MC/MCValue.h +++ b/include/llvm/MC/MCValue.h @@ -14,14 +14,13 @@ #ifndef LLVM_MC_MCVALUE_H #define LLVM_MC_MCVALUE_H +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/DataTypes.h" #include namespace llvm { class MCAsmInfo; -class MCSymbol; -class MCSymbolRefExpr; class raw_ostream; /// MCValue - This represents an "assembler immediate". In its most @@ -61,7 +60,10 @@ public: /// dump - Print the value to stderr.
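Since the MCTargetOptionsCommandFlags.h header above defines its cl::opts at global scope (note the using namespace llvm), a command-line tool only needs to include it and convert the parsed flags. A hedged usage sketch:

#include "llvm/MC/MCTargetOptionsCommandFlags.h"
#include "llvm/Support/CommandLine.h"

int main(int argc, char **argv) {
  llvm::cl::ParseCommandLineOptions(argc, argv);
  MCTargetOptions Options = InitMCTargetOptionsFromFlags();
  // Options.SanitizeAddress and Options.MCRelaxAll now reflect the
  // -asm-instrumentation and -mc-relax-all flags and can be handed to
  // the target's MC layer.
  return 0;
}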
void dump() const; - static MCValue get(const MCSymbolRefExpr *SymA, const MCSymbolRefExpr *SymB=0, + MCSymbolRefExpr::VariantKind getAccessVariant() const; + + static MCValue get(const MCSymbolRefExpr *SymA, + const MCSymbolRefExpr *SymB = nullptr, int64_t Val = 0, uint32_t RefKind = 0) { MCValue R; assert((!SymB || SymA) && "Invalid relocatable MCValue!"); @@ -75,8 +77,8 @@ public: static MCValue get(int64_t Val) { MCValue R; R.Cst = Val; - R.SymA = 0; - R.SymB = 0; + R.SymA = nullptr; + R.SymB = nullptr; R.RefKind = 0; return R; } diff --git a/include/llvm/MC/MCWin64EH.h b/include/llvm/MC/MCWin64EH.h index eb4665a..d21e762 100644 --- a/include/llvm/MC/MCWin64EH.h +++ b/include/llvm/MC/MCWin64EH.h @@ -61,11 +61,11 @@ namespace llvm { }; struct MCWin64EHUnwindInfo { - MCWin64EHUnwindInfo() : Begin(0), End(0), ExceptionHandler(0), - Function(0), PrologEnd(0), Symbol(0), - HandlesUnwind(false), HandlesExceptions(false), - LastFrameInst(-1), ChainedParent(0), - Instructions() {} + MCWin64EHUnwindInfo() + : Begin(nullptr), End(nullptr),ExceptionHandler(nullptr), + Function(nullptr), PrologEnd(nullptr), Symbol(nullptr), + HandlesUnwind(false), HandlesExceptions(false), LastFrameInst(-1), + ChainedParent(nullptr), Instructions() {} MCSymbol *Begin; MCSymbol *End; const MCSymbol *ExceptionHandler; diff --git a/include/llvm/MC/MCWinCOFFObjectWriter.h b/include/llvm/MC/MCWinCOFFObjectWriter.h index 213481c..dad7bb5 100644 --- a/include/llvm/MC/MCWinCOFFObjectWriter.h +++ b/include/llvm/MC/MCWinCOFFObjectWriter.h @@ -30,6 +30,7 @@ namespace llvm { virtual unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsCrossSection) const = 0; + virtual bool recordRelocation(const MCFixup &) const { return true; } }; /// \brief Construct a new Win COFF writer instance. diff --git a/include/llvm/MC/MCWinCOFFStreamer.h b/include/llvm/MC/MCWinCOFFStreamer.h new file mode 100644 index 0000000..34e39bb --- /dev/null +++ b/include/llvm/MC/MCWinCOFFStreamer.h @@ -0,0 +1,75 @@ +//===- MCWinCOFFStreamer.h - COFF Object File Interface ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_MCWINCOFFSTREAMER_H +#define LLVM_MC_MCWINCOFFSTREAMER_H + +#include "llvm/MC/MCDirectives.h" +#include "llvm/MC/MCObjectStreamer.h" + +namespace llvm { +class MCAsmBackend; +class MCContext; +class MCCodeEmitter; +class MCExpr; +class MCInst; +class MCSection; +class MCSubtargetInfo; +class MCSymbol; +class StringRef; +class raw_ostream; + +class MCWinCOFFStreamer : public MCObjectStreamer { +public: + MCWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, MCCodeEmitter &CE, + raw_ostream &OS); + + /// \name MCStreamer interface + /// \{ + + void InitSections() override; + void EmitLabel(MCSymbol *Symbol) override; + void EmitDebugLabel(MCSymbol *Symbol) override; + void EmitAssemblerFlag(MCAssemblerFlag Flag) override; + void EmitThumbFunc(MCSymbol *Func) override; + bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; + void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override; + void BeginCOFFSymbolDef(MCSymbol const *Symbol) override; + void EmitCOFFSymbolStorageClass(int StorageClass) override; + void EmitCOFFSymbolType(int Type) override; + void EndCOFFSymbolDef() override; + void EmitCOFFSectionIndex(MCSymbol const *Symbol) override; + void EmitCOFFSecRel32(MCSymbol const *Symbol) override; + void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) override; + void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) override; + void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) override; + void EmitZerofill(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) override; + void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) override; + void EmitFileDirective(StringRef Filename) override; + void EmitIdent(StringRef IdentString) override; + void EmitWin64EHHandlerData() override; + void FinishImpl() override; + + /// \} + +protected: + const MCSymbol *CurSymbol; + void EmitInstToData(const MCInst &Inst, const MCSubtargetInfo &STI) override; + +private: + LLVM_ATTRIBUTE_NORETURN void FatalError(const Twine &Msg) const; +}; +} + +#endif + diff --git a/include/llvm/MC/SubtargetFeature.h b/include/llvm/MC/SubtargetFeature.h index d0735cc..c5d62a6 100644 --- a/include/llvm/MC/SubtargetFeature.h +++ b/include/llvm/MC/SubtargetFeature.h @@ -18,9 +18,9 @@ #ifndef LLVM_MC_SUBTARGETFEATURE_H #define LLVM_MC_SUBTARGETFEATURE_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Triple.h" #include "llvm/Support/DataTypes.h" -#include namespace llvm { class raw_ostream; @@ -78,20 +78,17 @@ public: std::string getString() const; /// Adding Features. - void AddFeature(const StringRef String, bool IsEnabled = true); + void AddFeature(const StringRef String); /// ToggleFeature - Toggle a feature and returns the newly updated feature /// bits. uint64_t ToggleFeature(uint64_t Bits, const StringRef String, - const SubtargetFeatureKV *FeatureTable, - size_t FeatureTableSize); + ArrayRef<SubtargetFeatureKV> FeatureTable); /// Get feature bits of a CPU. uint64_t getFeatureBits(const StringRef CPU, - const SubtargetFeatureKV *CPUTable, - size_t CPUTableSize, - const SubtargetFeatureKV *FeatureTable, - size_t FeatureTableSize); + ArrayRef<SubtargetFeatureKV> CPUTable, + ArrayRef<SubtargetFeatureKV> FeatureTable); /// Print feature string.
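The SubtargetFeature and MCSubtargetInfo hunks in this patch replace pointer-plus-size parameter pairs with ArrayRef, which bundles the length with the pointer and accepts C arrays and vectors directly. A standalone sketch with a hypothetical KV table:

#include "llvm/ADT/ArrayRef.h"
#include <cstring>

struct KV { const char *Key; unsigned long long Value; };

static const KV *find(llvm::ArrayRef<KV> Table, const char *Key) {
  for (const KV &Entry : Table) // no separate TableSize argument needed
    if (std::strcmp(Entry.Key, Key) == 0)
      return &Entry;
  return nullptr;
}

static const KV Table[] = {{"a", 1}, {"b", 2}};
static const KV *Lookup = find(Table, "b"); // the array converts implicitly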
void print(raw_ostream &OS) const; diff --git a/include/llvm/Object/Archive.h b/include/llvm/Object/Archive.h index 4fae76f..652b659 100644 --- a/include/llvm/Object/Archive.h +++ b/include/llvm/Object/Archive.h @@ -89,21 +89,17 @@ public: return StringRef(Data.data() + StartOfFile, getSize()); } - error_code getMemoryBuffer(OwningPtr<MemoryBuffer> &Result, - bool FullPath = false) const; error_code getMemoryBuffer(std::unique_ptr<MemoryBuffer> &Result, bool FullPath = false) const; - error_code getAsBinary(OwningPtr<Binary> &Result, - LLVMContext *Context = 0) const; error_code getAsBinary(std::unique_ptr<Binary> &Result, - LLVMContext *Context = 0) const; + LLVMContext *Context = nullptr) const; }; class child_iterator { Child child; public: - child_iterator() : child(Child(0, 0)) {} + child_iterator() : child(Child(nullptr, nullptr)) {} child_iterator(const Child &c) : child(c) {} const Child* operator->() const { return &child; diff --git a/include/llvm/Object/Binary.h b/include/llvm/Object/Binary.h index b10e40a..8ac84e7 100644 --- a/include/llvm/Object/Binary.h +++ b/include/llvm/Object/Binary.h @@ -128,7 +128,8 @@ public: /// @param Source The data to create the Binary from. Ownership is transferred /// to the Binary if successful. If an error is returned, /// Source is destroyed by createBinary before returning. -ErrorOr<Binary *> createBinary(MemoryBuffer *Source, LLVMContext *Context = 0); +ErrorOr<Binary *> createBinary(MemoryBuffer *Source, + LLVMContext *Context = nullptr); ErrorOr<Binary *> createBinary(StringRef Path); } diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h index 6e05c2d..bd9c677 100644 --- a/include/llvm/Object/COFF.h +++ b/include/llvm/Object/COFF.h @@ -287,6 +287,10 @@ struct coff_aux_weak_external { char Unused[10]; }; +struct coff_aux_file { + char FileName[18]; +}; + struct coff_aux_section_definition { support::ulittle32_t Length; support::ulittle16_t NumberOfRelocations; @@ -387,7 +391,6 @@ protected: bool &Result) const override; relocation_iterator section_rel_begin(DataRefImpl Sec) const override; relocation_iterator section_rel_end(DataRefImpl Sec) const override; - bool section_rel_empty(DataRefImpl Sec) const override; void moveRelocationNext(DataRefImpl &Rel) const override; error_code getRelocationAddress(DataRefImpl Rel, @@ -461,7 +464,7 @@ public: // The iterator for the import directory table. class ImportDirectoryEntryRef { public: - ImportDirectoryEntryRef() : OwningObject(0) {} + ImportDirectoryEntryRef() : OwningObject(nullptr) {} ImportDirectoryEntryRef(const import_directory_table_entry *Table, uint32_t I, const COFFObjectFile *Owner) : ImportTable(Table), Index(I), OwningObject(Owner) {} @@ -485,7 +488,7 @@ private: // The iterator for the export directory table entry.
class ExportDirectoryEntryRef { public: - ExportDirectoryEntryRef() : OwningObject(0) {} + ExportDirectoryEntryRef() : OwningObject(nullptr) {} ExportDirectoryEntryRef(const export_directory_table_entry *Table, uint32_t I, const COFFObjectFile *Owner) : ExportTable(Table), Index(I), OwningObject(Owner) {} diff --git a/include/llvm/Object/COFFYAML.h b/include/llvm/Object/COFFYAML.h index b5f9ccc..3f48e07 100644 --- a/include/llvm/Object/COFFYAML.h +++ b/include/llvm/Object/COFFYAML.h @@ -121,8 +121,13 @@ struct ScalarEnumerationTraits { }; template <> -struct ScalarEnumerationTraits { - static void enumeration(IO &IO, COFF::RelocationTypeX86 &Value); +struct ScalarEnumerationTraits { + static void enumeration(IO &IO, COFF::RelocationTypeI386 &Value); +}; + +template <> +struct ScalarEnumerationTraits { + static void enumeration(IO &IO, COFF::RelocationTypeAMD64 &Value); }; template <> diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h index 824e06e..ee97d4e 100644 --- a/include/llvm/Object/ELF.h +++ b/include/llvm/Object/ELF.h @@ -60,12 +60,12 @@ public: public: typedef ptrdiff_t difference_type; typedef EntT value_type; - typedef std::random_access_iterator_tag iterator_category; + typedef std::forward_iterator_tag iterator_category; typedef value_type &reference; typedef value_type *pointer; /// \brief Default construct iterator. - ELFEntityIterator() : EntitySize(0), Current(0) {} + ELFEntityIterator() : EntitySize(0), Current(nullptr) {} ELFEntityIterator(uintX_t EntSize, const char *Start) : EntitySize(EntSize), Current(Start) {} @@ -136,6 +136,7 @@ public: typedef ELFEntityIterator Elf_Rela_Iter; typedef ELFEntityIterator Elf_Rel_Iter; typedef ELFEntityIterator Elf_Shdr_Iter; + typedef iterator_range Elf_Shdr_Range; /// \brief Archive files are 2 byte aligned, so we need this for /// PointerIntPair to work. @@ -249,7 +250,7 @@ private: /// \brief Represents a region described by entries in the .dynamic table. struct DynRegionInfo { - DynRegionInfo() : Addr(0), Size(0), EntSize(0) {} + DynRegionInfo() : Addr(nullptr), Size(0), EntSize(0) {} /// \brief Address in current address space. const void *Addr; /// \brief Size in bytes of the region. @@ -273,19 +274,19 @@ private: public: // If the integer is 0, this is an Elf_Verdef*. // If the integer is 1, this is an Elf_Vernaux*. - VersionMapEntry() : PointerIntPair(NULL, 0) { } + VersionMapEntry() : PointerIntPair(nullptr, 0) { } VersionMapEntry(const Elf_Verdef *verdef) : PointerIntPair(verdef, 0) { } VersionMapEntry(const Elf_Vernaux *vernaux) : PointerIntPair(vernaux, 1) { } - bool isNull() const { return getPointer() == NULL; } + bool isNull() const { return getPointer() == nullptr; } bool isVerdef() const { return !isNull() && getInt() == 0; } bool isVernaux() const { return !isNull() && getInt() == 1; } const Elf_Verdef *getVerdef() const { - return isVerdef() ? (const Elf_Verdef*)getPointer() : NULL; + return isVerdef() ? (const Elf_Verdef*)getPointer() : nullptr; } const Elf_Vernaux *getVernaux() const { - return isVernaux() ? (const Elf_Vernaux*)getPointer() : NULL; + return isVernaux() ? 
(const Elf_Vernaux*)getPointer() : nullptr; } }; mutable SmallVector VersionMap; @@ -317,6 +318,11 @@ public: ELFFile(MemoryBuffer *Object, error_code &ec); + bool isMipsELF64() const { + return Header->e_machine == ELF::EM_MIPS && + Header->getFileClass() == ELF::ELFCLASS64; + } + bool isMips64EL() const { return Header->e_machine == ELF::EM_MIPS && Header->getFileClass() == ELF::ELFCLASS64 && @@ -325,6 +331,9 @@ public: Elf_Shdr_Iter begin_sections() const; Elf_Shdr_Iter end_sections() const; + Elf_Shdr_Range sections() const { + return make_range(begin_sections(), end_sections()); + } Elf_Sym_Iter begin_symbols() const; Elf_Sym_Iter end_symbols() const; @@ -338,7 +347,7 @@ public: if (DynSymRegion.Addr) return Elf_Sym_Iter(DynSymRegion.EntSize, (const char *)DynSymRegion.Addr, true); - return Elf_Sym_Iter(0, 0, true); + return Elf_Sym_Iter(0, nullptr, true); } Elf_Sym_Iter end_dynamic_symbols() const { @@ -346,7 +355,7 @@ public: return Elf_Sym_Iter(DynSymRegion.EntSize, (const char *)DynSymRegion.Addr + DynSymRegion.Size, true); - return Elf_Sym_Iter(0, 0, true); + return Elf_Sym_Iter(0, nullptr, true); } Elf_Rela_Iter begin_rela(const Elf_Shdr *sec) const { @@ -478,7 +487,7 @@ void ELFFile::LoadVersionNeeds(const Elf_Shdr *sec) const { template void ELFFile::LoadVersionMap() const { // If there is no dynamic symtab or version table, there is nothing to do. - if (DynSymRegion.Addr == NULL || dot_gnu_version_sec == NULL) + if (!DynSymRegion.Addr || !dot_gnu_version_sec) return; // Has the VersionMap already been loaded? @@ -510,7 +519,7 @@ ELFFile::getSection(const Elf_Sym *symb) const { if (symb->st_shndx == ELF::SHN_XINDEX) return getSection(ExtendedSymbolTable.lookup(symb)); if (symb->st_shndx >= ELF::SHN_LORESERVE) - return 0; + return nullptr; return getSection(symb->st_shndx); } @@ -537,10 +546,16 @@ StringRef ELFFile::getRelocationTypeName(uint32_t Type) const { template void ELFFile::getRelocationTypeName(uint32_t Type, SmallVectorImpl &Result) const { - if (!isMips64EL()) { + if (!isMipsELF64()) { StringRef Name = getRelocationTypeName(Type); Result.append(Name.begin(), Name.end()); } else { + // The Mips N64 ABI allows up to three operations to be specified per + // relocation record. Unfortunately there's no easy way to test for the + // presence of N64 ELFs as they have no special flag that identifies them + // as being N64. We can safely assume at the moment that all Mips + // ELFCLASS64 ELFs are N64. New Mips64 ABIs should provide enough + // information to disambiguate between old vs new ABIs. 
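
The hunk continues below with the byte unpacking for this packed N64 r_type; shown here standalone as a small sketch, with the field layout exactly as the comment above describes and R_MIPS_NONE filling unused slots.

#include <cstdint>

static void decodeMipsN64RelType(uint32_t Type, uint8_t &Type1,
                                 uint8_t &Type2, uint8_t &Type3) {
  Type1 = (Type >> 0) & 0xFF;  // First (innermost) relocation operation.
  Type2 = (Type >> 8) & 0xFF;  // Second operation, if any.
  Type3 = (Type >> 16) & 0xFF; // Third operation, if any.
}
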
uint8_t Type1 = (Type >> 0) & 0xFF; uint8_t Type2 = (Type >> 8) & 0xFF; uint8_t Type3 = (Type >> 16) & 0xFF; @@ -565,7 +580,7 @@ std::pair::Elf_Shdr *, const typename ELFFile::Elf_Sym *> ELFFile::getRelocationSymbol(const Elf_Shdr *Sec, const RelT *Rel) const { if (!Sec->sh_link) - return std::make_pair((const Elf_Shdr *)0, (const Elf_Sym *)0); + return std::make_pair(nullptr, nullptr); const Elf_Shdr *SymTable = getSection(Sec->sh_link); return std::make_pair( SymTable, getEntry(SymTable, Rel->getSymbol(isMips64EL()))); @@ -604,15 +619,15 @@ typename ELFFile::uintX_t ELFFile::getStringTableIndex() const { template ELFFile::ELFFile(MemoryBuffer *Object, error_code &ec) : Buf(Object), - SectionHeaderTable(0), - dot_shstrtab_sec(0), - dot_strtab_sec(0), - dot_symtab_sec(0), - SymbolTableSectionHeaderIndex(0), - dot_gnu_version_sec(0), - dot_gnu_version_r_sec(0), - dot_gnu_version_d_sec(0), - dt_soname(0) { + SectionHeaderTable(nullptr), + dot_shstrtab_sec(nullptr), + dot_strtab_sec(nullptr), + dot_symtab_sec(nullptr), + SymbolTableSectionHeaderIndex(nullptr), + dot_gnu_version_sec(nullptr), + dot_gnu_version_r_sec(nullptr), + dot_gnu_version_d_sec(nullptr), + dt_soname(nullptr) { const uint64_t FileSize = Buf->getBufferSize(); if (sizeof(Elf_Ehdr) > FileSize) @@ -641,30 +656,29 @@ ELFFile::ELFFile(MemoryBuffer *Object, error_code &ec) // Scan sections for special sections. - for (Elf_Shdr_Iter SecI = begin_sections(), SecE = end_sections(); - SecI != SecE; ++SecI) { - switch (SecI->sh_type) { + for (const Elf_Shdr &Sec : sections()) { + switch (Sec.sh_type) { case ELF::SHT_SYMTAB_SHNDX: if (SymbolTableSectionHeaderIndex) // FIXME: Proper error handling. report_fatal_error("More than one .symtab_shndx!"); - SymbolTableSectionHeaderIndex = &*SecI; + SymbolTableSectionHeaderIndex = &Sec; break; case ELF::SHT_SYMTAB: if (dot_symtab_sec) // FIXME: Proper error handling. report_fatal_error("More than one .symtab!"); - dot_symtab_sec = &*SecI; - dot_strtab_sec = getSection(SecI->sh_link); + dot_symtab_sec = &Sec; + dot_strtab_sec = getSection(Sec.sh_link); break; case ELF::SHT_DYNSYM: { if (DynSymRegion.Addr) // FIXME: Proper error handling. report_fatal_error("More than one .dynsym!"); - DynSymRegion.Addr = base() + SecI->sh_offset; - DynSymRegion.Size = SecI->sh_size; - DynSymRegion.EntSize = SecI->sh_entsize; - const Elf_Shdr *DynStr = getSection(SecI->sh_link); + DynSymRegion.Addr = base() + Sec.sh_offset; + DynSymRegion.Size = Sec.sh_size; + DynSymRegion.EntSize = Sec.sh_entsize; + const Elf_Shdr *DynStr = getSection(Sec.sh_link); DynStrRegion.Addr = base() + DynStr->sh_offset; DynStrRegion.Size = DynStr->sh_size; DynStrRegion.EntSize = DynStr->sh_entsize; @@ -674,27 +688,27 @@ ELFFile::ELFFile(MemoryBuffer *Object, error_code &ec) if (DynamicRegion.Addr) // FIXME: Proper error handling. report_fatal_error("More than one .dynamic!"); - DynamicRegion.Addr = base() + SecI->sh_offset; - DynamicRegion.Size = SecI->sh_size; - DynamicRegion.EntSize = SecI->sh_entsize; + DynamicRegion.Addr = base() + Sec.sh_offset; + DynamicRegion.Size = Sec.sh_size; + DynamicRegion.EntSize = Sec.sh_entsize; break; case ELF::SHT_GNU_versym: - if (dot_gnu_version_sec != NULL) + if (dot_gnu_version_sec != nullptr) // FIXME: Proper error handling. report_fatal_error("More than one .gnu.version section!"); - dot_gnu_version_sec = &*SecI; + dot_gnu_version_sec = &Sec; break; case ELF::SHT_GNU_verdef: - if (dot_gnu_version_d_sec != NULL) + if (dot_gnu_version_d_sec != nullptr) // FIXME: Proper error handling. 
report_fatal_error("More than one .gnu.version_d section!"); - dot_gnu_version_d_sec = &*SecI; + dot_gnu_version_d_sec = &Sec; break; case ELF::SHT_GNU_verneed: - if (dot_gnu_version_r_sec != NULL) + if (dot_gnu_version_r_sec != nullptr) // FIXME: Proper error handling. report_fatal_error("More than one .gnu.version_r section!"); - dot_gnu_version_r_sec = &*SecI; + dot_gnu_version_r_sec = &Sec; break; } } @@ -761,7 +775,7 @@ typename ELFFile::Elf_Shdr_Iter ELFFile::end_sections() const { template typename ELFFile::Elf_Sym_Iter ELFFile::begin_symbols() const { if (!dot_symtab_sec) - return Elf_Sym_Iter(0, 0, false); + return Elf_Sym_Iter(0, nullptr, false); return Elf_Sym_Iter(dot_symtab_sec->sh_entsize, (const char *)base() + dot_symtab_sec->sh_offset, false); } @@ -769,7 +783,7 @@ typename ELFFile::Elf_Sym_Iter ELFFile::begin_symbols() const { template typename ELFFile::Elf_Sym_Iter ELFFile::end_symbols() const { if (!dot_symtab_sec) - return Elf_Sym_Iter(0, 0, false); + return Elf_Sym_Iter(0, nullptr, false); return Elf_Sym_Iter(dot_symtab_sec->sh_entsize, (const char *)base() + dot_symtab_sec->sh_offset + dot_symtab_sec->sh_size, @@ -782,14 +796,14 @@ ELFFile::begin_dynamic_table() const { if (DynamicRegion.Addr) return Elf_Dyn_Iter(DynamicRegion.EntSize, (const char *)DynamicRegion.Addr); - return Elf_Dyn_Iter(0, 0); + return Elf_Dyn_Iter(0, nullptr); } template typename ELFFile::Elf_Dyn_Iter ELFFile::end_dynamic_table(bool NULLEnd) const { if (!DynamicRegion.Addr) - return Elf_Dyn_Iter(0, 0); + return Elf_Dyn_Iter(0, nullptr); Elf_Dyn_Iter Ret(DynamicRegion.EntSize, (const char *)DynamicRegion.Addr + DynamicRegion.Size); @@ -842,7 +856,7 @@ template const typename ELFFile::Elf_Shdr * ELFFile::getSection(uint32_t index) const { if (index == 0) - return 0; + return nullptr; if (!SectionHeaderTable || index >= getNumSections()) // FIXME: Proper error handling. report_fatal_error("Invalid section index!"); @@ -871,7 +885,7 @@ const char *ELFFile::getString(const Elf_Shdr *section, template const char *ELFFile::getDynamicString(uintX_t Offset) const { if (!DynStrRegion.Addr || Offset >= DynStrRegion.Size) - return 0; + return nullptr; return (const char *)DynStrRegion.Addr + Offset; } @@ -913,7 +927,7 @@ ErrorOr ELFFile::getSymbolVersion(const Elf_Shdr *section, const Elf_Sym *symb, bool &IsDefault) const { // Handle non-dynamic symbols. - if (section != DynSymRegion.Addr && section != 0) { + if (section != DynSymRegion.Addr && section != nullptr) { // Non-dynamic symbols can have versions in their names // A name of the form 'foo@V1' indicates version 'V1', non-default. // A name of the form 'foo@@V2' indicates version 'V2', default version. @@ -937,7 +951,7 @@ ErrorOr ELFFile::getSymbolVersion(const Elf_Shdr *section, } // This is a dynamic symbol. Look in the GNU symbol version table. - if (dot_gnu_version_sec == NULL) { + if (!dot_gnu_version_sec) { // No version table. 
IsDefault = false; return StringRef(""); diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h index 2958067..302caba 100644 --- a/include/llvm/Object/ELFObjectFile.h +++ b/include/llvm/Object/ELFObjectFile.h @@ -89,7 +89,6 @@ protected: bool &Result) const override; relocation_iterator section_rel_begin(DataRefImpl Sec) const override; relocation_iterator section_rel_end(DataRefImpl Sec) const override; - bool section_rel_empty(DataRefImpl Sec) const override; section_iterator getRelocatedSection(DataRefImpl Sec) const override; void moveRelocationNext(DataRefImpl &Rel) const override; @@ -256,8 +255,7 @@ error_code ELFObjectFile::getSymbolAddress(DataRefImpl Symb, Result = ESym->st_value; // Clear the ARM/Thumb indicator flag. - if (EF.getHeader()->e_machine == ELF::EM_ARM && - ESym->getType() == ELF::STT_FUNC) + if (Header->e_machine == ELF::EM_ARM && ESym->getType() == ELF::STT_FUNC) Result &= ~1; if (Header->e_type == ELF::ET_REL) @@ -497,12 +495,6 @@ ELFObjectFile::section_rel_end(DataRefImpl Sec) const { } template -bool ELFObjectFile::section_rel_empty(DataRefImpl Sec) const { - const Elf_Shdr *S = reinterpret_cast(Sec.p); - return S->sh_size == 0; -} - -template section_iterator ELFObjectFile::getRelocatedSection(DataRefImpl Sec) const { if (EF.getHeader()->e_type != ELF::ET_REL) @@ -563,10 +555,17 @@ ELFObjectFile::getRelocationSymbol(DataRefImpl Rel) const { template error_code ELFObjectFile::getRelocationAddress(DataRefImpl Rel, uint64_t &Result) const { - assert((EF.getHeader()->e_type == ELF::ET_EXEC || - EF.getHeader()->e_type == ELF::ET_DYN) && - "Only executable and shared objects files have relocation addresses"); - Result = getROffset(Rel); + uint64_t ROffset = getROffset(Rel); + const Elf_Ehdr *Header = EF.getHeader(); + + if (Header->e_type == ELF::ET_REL) { + const Elf_Shdr *RelocationSec = getRelSection(Rel); + const Elf_Shdr *RelocatedSec = EF.getSection(RelocationSec->sh_info); + Result = ROffset + RelocatedSec->sh_addr; + } else { + Result = ROffset; + } + return object_error::success; } diff --git a/include/llvm/Object/ELFYAML.h b/include/llvm/Object/ELFYAML.h index 1eba660..524e55b 100644 --- a/include/llvm/Object/ELFYAML.h +++ b/include/llvm/Object/ELFYAML.h @@ -40,6 +40,7 @@ LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_ELFOSABI) // Just use 64, since it can hold 32-bit values too. LLVM_YAML_STRONG_TYPEDEF(uint64_t, ELF_EF) LLVM_YAML_STRONG_TYPEDEF(uint32_t, ELF_SHT) +LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_REL) // Just use 64, since it can hold 32-bit values too. 
LLVM_YAML_STRONG_TYPEDEF(uint64_t, ELF_SHF) LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_STT) @@ -68,17 +69,42 @@ struct LocalGlobalWeakSymbols { std::vector Weak; }; struct Section { + enum class SectionKind { RawContent, Relocation }; + SectionKind Kind; StringRef Name; ELF_SHT Type; ELF_SHF Flags; llvm::yaml::Hex64 Address; - object::yaml::BinaryRef Content; StringRef Link; + StringRef Info; llvm::yaml::Hex64 AddressAlign; + Section(SectionKind Kind) : Kind(Kind) {} + virtual ~Section(); +}; +struct RawContentSection : Section { + object::yaml::BinaryRef Content; + llvm::yaml::Hex64 Size; + RawContentSection() : Section(SectionKind::RawContent) {} + static bool classof(const Section *S) { + return S->Kind == SectionKind::RawContent; + } +}; +struct Relocation { + llvm::yaml::Hex64 Offset; + int64_t Addend; + ELF_REL Type; + StringRef Symbol; +}; +struct RelocationSection : Section { + std::vector Relocations; + RelocationSection() : Section(SectionKind::Relocation) {} + static bool classof(const Section *S) { + return S->Kind == SectionKind::Relocation; + } }; struct Object { FileHeader Header; - std::vector
Sections; + std::vector> Sections; // Although in reality the symbols reside in a section, it is a lot // cleaner and nicer if we read them from the YAML as a separate // top-level key, which automatically ensures that invariants like there @@ -89,8 +115,9 @@ struct Object { } // end namespace ELFYAML } // end namespace llvm -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::Section) +LLVM_YAML_IS_SEQUENCE_VECTOR(std::unique_ptr) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::Symbol) +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::Relocation) namespace llvm { namespace yaml { @@ -141,6 +168,11 @@ struct ScalarEnumerationTraits { }; template <> +struct ScalarEnumerationTraits { + static void enumeration(IO &IO, ELFYAML::ELF_REL &Value); +}; + +template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::FileHeader &FileHdr); }; @@ -155,9 +187,14 @@ struct MappingTraits { static void mapping(IO &IO, ELFYAML::LocalGlobalWeakSymbols &Symbols); }; +template <> struct MappingTraits { + static void mapping(IO &IO, ELFYAML::Relocation &Rel); +}; + template <> -struct MappingTraits { - static void mapping(IO &IO, ELFYAML::Section &Section); +struct MappingTraits> { + static void mapping(IO &IO, std::unique_ptr &Section); + static StringRef validate(IO &io, std::unique_ptr &Section); }; template <> diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h index f242611..710ad7e 100644 --- a/include/llvm/Object/MachO.h +++ b/include/llvm/Object/MachO.h @@ -31,7 +31,7 @@ class DiceRef { const ObjectFile *OwningObject; public: - DiceRef() : OwningObject(NULL) { } + DiceRef() : OwningObject(nullptr) { } DiceRef(DataRefImpl DiceP, const ObjectFile *Owner); @@ -88,7 +88,6 @@ public: bool &Result) const override; relocation_iterator section_rel_begin(DataRefImpl Sec) const override; relocation_iterator section_rel_end(DataRefImpl Sec) const override; - bool section_rel_empty(DataRefImpl Sec) const override; void moveRelocationNext(DataRefImpl &Rel) const override; error_code getRelocationAddress(DataRefImpl Rel, @@ -112,6 +111,9 @@ public: basic_symbol_iterator symbol_begin_impl() const override; basic_symbol_iterator symbol_end_impl() const override; + // MachO specific. 
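
The ELFYAML hunks above turn Section into a small class hierarchy keyed by SectionKind, with classof hooks. A sketch of how a consumer might dispatch over it with LLVM-style RTTI; the vector element types stripped from this copy (Relocation, unique_ptr of Section) are assumptions.

#include "llvm/Object/ELFYAML.h"
#include "llvm/Support/Casting.h"

static void visitSection(llvm::ELFYAML::Section &Sec) {
  using namespace llvm;
  if (auto *Raw = dyn_cast<ELFYAML::RawContentSection>(&Sec)) {
    (void)Raw->Content; // Raw section bytes.
  } else if (auto *Rel = dyn_cast<ELFYAML::RelocationSection>(&Sec)) {
    for (const ELFYAML::Relocation &R : Rel->Relocations)
      (void)R.Symbol; // Offset, Type and Addend are also available.
  }
}
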
+ basic_symbol_iterator getSymbolByIndex(unsigned Index) const; + section_iterator section_begin() const override; section_iterator section_end() const override; diff --git a/include/llvm/Object/MachOUniversal.h b/include/llvm/Object/MachOUniversal.h index 9b1afd2..d27c824 100644 --- a/include/llvm/Object/MachOUniversal.h +++ b/include/llvm/Object/MachOUniversal.h @@ -17,6 +17,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/Object/Binary.h" +#include "llvm/Object/Archive.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/MachO.h" @@ -41,7 +42,7 @@ public: ObjectForArch(const MachOUniversalBinary *Parent, uint32_t Index); void clear() { - Parent = 0; + Parent = nullptr; Index = 0; } @@ -53,6 +54,8 @@ public: uint32_t getCPUType() const { return Header.cputype; } error_code getAsObjectFile(std::unique_ptr &Result) const; + + error_code getAsArchive(std::unique_ptr &Result) const; }; class object_iterator { @@ -83,7 +86,7 @@ public: return ObjectForArch(this, 0); } object_iterator end_objects() const { - return ObjectForArch(0, 0); + return ObjectForArch(nullptr, 0); } uint32_t getNumberOfObjects() const { return NumberOfObjects; } diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h index 8298b63..10209b9 100644 --- a/include/llvm/Object/ObjectFile.h +++ b/include/llvm/Object/ObjectFile.h @@ -38,7 +38,7 @@ class RelocationRef { const ObjectFile *OwningObject; public: - RelocationRef() : OwningObject(NULL) { } + RelocationRef() : OwningObject(nullptr) { } RelocationRef(DataRefImpl RelocationP, const ObjectFile *Owner); @@ -82,7 +82,7 @@ class SectionRef { const ObjectFile *OwningObject; public: - SectionRef() : OwningObject(NULL) { } + SectionRef() : OwningObject(nullptr) { } SectionRef(DataRefImpl SectionP, const ObjectFile *Owner); @@ -113,11 +113,10 @@ public: relocation_iterator relocation_begin() const; relocation_iterator relocation_end() const; - typedef iterator_range relocation_iterator_range; - relocation_iterator_range relocations() const { - return relocation_iterator_range(relocation_begin(), relocation_end()); + iterator_range relocations() const { + return iterator_range(relocation_begin(), + relocation_end()); } - bool relocation_empty() const; section_iterator getRelocatedSection() const; DataRefImpl getRawDataRefImpl() const; @@ -146,7 +145,6 @@ public: /// Returns the symbol virtual address (i.e. address at which it will be /// mapped). error_code getAddress(uint64_t &Result) const; - error_code getFileOffset(uint64_t &Result) const; /// @brief Get the alignment of this symbol as the actual value (not log 2). error_code getAlignment(uint32_t &Result) const; error_code getSize(uint64_t &Result) const; @@ -185,7 +183,7 @@ class LibraryRef { const ObjectFile *OwningObject; public: - LibraryRef() : OwningObject(NULL) { } + LibraryRef() : OwningObject(nullptr) { } LibraryRef(DataRefImpl LibraryP, const ObjectFile *Owner); @@ -256,7 +254,6 @@ protected: bool &Result) const = 0; virtual relocation_iterator section_rel_begin(DataRefImpl Sec) const = 0; virtual relocation_iterator section_rel_end(DataRefImpl Sec) const = 0; - virtual bool section_rel_empty(DataRefImpl Sec) const = 0; virtual section_iterator getRelocatedSection(DataRefImpl Sec) const; // Same as above for RelocationRef. 
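
A hedged sketch of the new MachOUniversal surface above: each slice of a fat binary can now be opened as an archive as well as an object file. Iterator details beyond what the hunk shows are assumptions.

#include "llvm/Object/MachOUniversal.h"
#include <memory>

static void walkSlices(const llvm::object::MachOUniversalBinary &UB) {
  using namespace llvm::object;
  for (auto I = UB.begin_objects(), E = UB.end_objects(); I != E; ++I) {
    std::unique_ptr<ObjectFile> Obj;
    std::unique_ptr<Archive> Ar;
    if (!I->getAsObjectFile(Obj)) {
      // Thin Mach-O slice.
    } else if (!I->getAsArchive(Ar)) {
      // Static library slice, via the new getAsArchive hook.
    }
  }
}
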
@@ -350,42 +347,6 @@ inline error_code SymbolRef::getAddress(uint64_t &Result) const { return getObject()->getSymbolAddress(getRawDataRefImpl(), Result); } -inline error_code SymbolRef::getFileOffset(uint64_t &Result) const { - uint64_t Address; - if (error_code EC = getAddress(Address)) - return EC; - if (Address == UnknownAddressOrSize) { - Result = UnknownAddressOrSize; - return object_error::success; - } - - const ObjectFile *Obj = getObject(); - section_iterator SecI(Obj->section_begin()); - if (error_code EC = getSection(SecI)) - return EC; - - if (SecI == Obj->section_end()) { - Result = UnknownAddressOrSize; - return object_error::success; - } - - uint64_t SectionAddress; - if (error_code EC = SecI->getAddress(SectionAddress)) - return EC; - - uint64_t OffsetInSection = Address - SectionAddress; - - StringRef SecContents; - if (error_code EC = SecI->getContents(SecContents)) - return EC; - - // FIXME: this is a hack. - uint64_t SectionOffset = (uint64_t)SecContents.data() - (uint64_t)Obj->base(); - - Result = SectionOffset + OffsetInSection; - return object_error::success; -} - inline error_code SymbolRef::getAlignment(uint32_t &Result) const { return getObject()->getSymbolAlignment(getRawDataRefImpl(), Result); } @@ -491,10 +452,6 @@ inline relocation_iterator SectionRef::relocation_end() const { return OwningObject->section_rel_end(SectionPimpl); } -inline bool SectionRef::relocation_empty() const { - return OwningObject->section_rel_empty(SectionPimpl); -} - inline section_iterator SectionRef::getRelocatedSection() const { return OwningObject->getRelocatedSection(SectionPimpl); } diff --git a/include/llvm/Object/StringTableBuilder.h b/include/llvm/Object/StringTableBuilder.h new file mode 100644 index 0000000..c61e216 --- /dev/null +++ b/include/llvm/Object/StringTableBuilder.h @@ -0,0 +1,59 @@ +//===-- StringTableBuilder.h - String table building utility ------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OBJECT_STRINGTABLE_BUILDER_H +#define LLVM_OBJECT_STRINGTABLE_BUILDER_H + +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringMap.h" +#include + +namespace llvm { + +/// \brief Utility for building string tables with deduplicated suffixes. +class StringTableBuilder { + SmallString<256> StringTable; + StringMap StringIndexMap; + +public: + /// \brief Add a string to the builder. Returns a StringRef to the internal + /// copy of s. Can only be used before the table is finalized. + StringRef add(StringRef s) { + assert(!isFinalized()); + return StringIndexMap.GetOrCreateValue(s, 0).getKey(); + } + + /// \brief Analyze the strings and build the final table. No more strings can + /// be added after this point. + void finalize(); + + /// \brief Retrieve the string table data. Can only be used after the table + /// is finalized. + StringRef data() { + assert(isFinalized()); + return StringTable; + } + + /// \brief Get the offset of a string in the string table. Can only be used + /// after the table is finalized.
+ size_t getOffset(StringRef s) { + assert(isFinalized()); + assert(StringIndexMap.count(s) && "String is not in table!"); + return StringIndexMap[s]; + } + +private: + bool isFinalized() { + return !StringTable.empty(); + } +}; + +} // end llvm namespace + +#endif diff --git a/include/llvm/Object/SymbolicFile.h b/include/llvm/Object/SymbolicFile.h index bead2c3..28400e1 100644 --- a/include/llvm/Object/SymbolicFile.h +++ b/include/llvm/Object/SymbolicFile.h @@ -90,7 +90,7 @@ public: // (e.g. section symbols) }; - BasicSymbolRef() : OwningObject(NULL) { } + BasicSymbolRef() : OwningObject(nullptr) { } BasicSymbolRef(DataRefImpl SymbolP, const SymbolicFile *Owner); bool operator==(const BasicSymbolRef &Other) const; @@ -147,7 +147,8 @@ public: LLVMContext *Context); static ErrorOr createSymbolicFile(MemoryBuffer *Object) { - return createSymbolicFile(Object, true, sys::fs::file_magic::unknown, 0); + return createSymbolicFile(Object, true, sys::fs::file_magic::unknown, + nullptr); } static ErrorOr createSymbolicFile(StringRef ObjectPath); diff --git a/include/llvm/Object/YAML.h b/include/llvm/Object/YAML.h index 89fe504..1792e8b 100644 --- a/include/llvm/Object/YAML.h +++ b/include/llvm/Object/YAML.h @@ -108,6 +108,7 @@ template <> struct ScalarTraits { static void output(const object::yaml::BinaryRef &, void *, llvm::raw_ostream &); static StringRef input(StringRef, void *, object::yaml::BinaryRef &); + static bool mustQuote(StringRef S) { return needsQuotes(S); } }; } diff --git a/include/llvm/Option/Arg.h b/include/llvm/Option/Arg.h index 6b8ed3f..dcaa540 100644 --- a/include/llvm/Option/Arg.h +++ b/include/llvm/Option/Arg.h @@ -27,10 +27,7 @@ class ArgList; /// \brief A concrete instance of a particular driver option. /// /// The Arg class encodes just enough information to be able to -/// derive the argument values efficiently. In addition, Arg -/// instances have an intrusive double linked list which is used by -/// ArgList to provide efficient iteration over all instances of a -/// particular option. +/// derive the argument values efficiently. class Arg { Arg(const Arg &) LLVM_DELETED_FUNCTION; void operator=(const Arg &) LLVM_DELETED_FUNCTION; @@ -63,14 +60,14 @@ private: public: Arg(const Option Opt, StringRef Spelling, unsigned Index, - const Arg *BaseArg = 0); + const Arg *BaseArg = nullptr); Arg(const Option Opt, StringRef Spelling, unsigned Index, - const char *Value0, const Arg *BaseArg = 0); + const char *Value0, const Arg *BaseArg = nullptr); Arg(const Option Opt, StringRef Spelling, unsigned Index, - const char *Value0, const char *Value1, const Arg *BaseArg = 0); + const char *Value0, const char *Value1, const Arg *BaseArg = nullptr); ~Arg(); - const Option getOption() const { return Opt; } + const Option &getOption() const { return Opt; } StringRef getSpelling() const { return Spelling; } unsigned getIndex() const { return Index; } diff --git a/include/llvm/Option/ArgList.h b/include/llvm/Option/ArgList.h index 98ba6ec..ab40a1a 100644 --- a/include/llvm/Option/ArgList.h +++ b/include/llvm/Option/ArgList.h @@ -15,6 +15,7 @@ #include "llvm/Option/OptSpecifier.h" #include "llvm/Option/Option.h" #include +#include #include #include @@ -105,10 +106,14 @@ private: arglist_type Args; protected: - ArgList(); + // Default ctor provided explicitly as it is not provided implicitly due to + // the presence of the (deleted) copy ctor above. 
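
A short usage sketch for the StringTableBuilder introduced a few hunks above; the add/finalize/data/getOffset ordering is the whole contract. "domain" is chosen deliberately so its "main" suffix can be deduplicated.

#include "llvm/Object/StringTableBuilder.h"

static void buildExample() {
  llvm::StringTableBuilder Builder;
  Builder.add("main");
  Builder.add("domain");  // Ends in "main"; finalize() may share the tail.
  Builder.finalize();     // After this point, add() would assert.
  llvm::StringRef Table = Builder.data();
  size_t Off = Builder.getOffset("domain");
  (void)Table; (void)Off;
}
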
+ ArgList() { } + + // Virtual to provide a vtable anchor and because -Wnon-virtual-dtor warns, not + // because this type is ever actually destroyed polymorphically. + virtual ~ArgList(); public: - virtual ~ArgList(); /// @name Arg Access /// @{ @@ -160,16 +165,16 @@ public: /// /// \p Claim Whether the argument should be claimed, if it exists. bool hasArgNoClaim(OptSpecifier Id) const { - return getLastArgNoClaim(Id) != 0; + return getLastArgNoClaim(Id) != nullptr; } bool hasArg(OptSpecifier Id) const { - return getLastArg(Id) != 0; + return getLastArg(Id) != nullptr; } bool hasArg(OptSpecifier Id0, OptSpecifier Id1) const { - return getLastArg(Id0, Id1) != 0; + return getLastArg(Id0, Id1) != nullptr; } bool hasArg(OptSpecifier Id0, OptSpecifier Id1, OptSpecifier Id2) const { - return getLastArg(Id0, Id1, Id2) != 0; + return getLastArg(Id0, Id1, Id2) != nullptr; } /// getLastArg - Return the last argument matching \p Id, or null. @@ -334,7 +339,7 @@ class DerivedArgList : public ArgList { const InputArgList &BaseArgs; /// The list of arguments we synthesized. - mutable arglist_type SynthesizedArgs; + mutable SmallVector, 16> SynthesizedArgs; public: /// Construct a new derived arg list from \p BaseArgs. @@ -358,9 +363,7 @@ public: /// AddSynthesizedArg - Add an argument to the list of synthesized arguments /// (to be freed). - void AddSynthesizedArg(Arg *A) { - SynthesizedArgs.push_back(A); - } + void AddSynthesizedArg(Arg *A); const char *MakeArgString(StringRef Str) const override; diff --git a/include/llvm/Option/OptSpecifier.h b/include/llvm/Option/OptSpecifier.h index 02bc6b1..b7caa6e 100644 --- a/include/llvm/Option/OptSpecifier.h +++ b/include/llvm/Option/OptSpecifier.h @@ -10,6 +10,8 @@ #ifndef LLVM_OPTION_OPTSPECIFIER_H #define LLVM_OPTION_OPTSPECIFIER_H +#include "llvm/Support/Compiler.h" + namespace llvm { namespace opt { class Option; diff --git a/include/llvm/Option/Option.h b/include/llvm/Option/Option.h index 03d4774..b2cfacb 100644 --- a/include/llvm/Option/Option.h +++ b/include/llvm/Option/Option.h @@ -73,7 +73,7 @@ public: ~Option(); bool isValid() const { - return Info != 0; + return Info != nullptr; } unsigned getID() const { diff --git a/include/llvm/Pass.h b/include/llvm/Pass.h index ff700cf..c2b9f95 100644 --- a/include/llvm/Pass.h +++ b/include/llvm/Pass.h @@ -87,7 +87,8 @@ class Pass { Pass(const Pass &) LLVM_DELETED_FUNCTION; public: - explicit Pass(PassKind K, char &pid) : Resolver(0), PassID(&pid), Kind(K) { } + explicit Pass(PassKind K, char &pid) + : Resolver(nullptr), PassID(&pid), Kind(K) { } virtual ~Pass(); diff --git a/include/llvm/PassAnalysisSupport.h b/include/llvm/PassAnalysisSupport.h index a581802..9164305 100644 --- a/include/llvm/PassAnalysisSupport.h +++ b/include/llvm/PassAnalysisSupport.h @@ -129,7 +129,7 @@ public: // Find pass that is implementing PI.
Pass *findImplPass(AnalysisID PI) { - Pass *ResultPass = 0; + Pass *ResultPass = nullptr; for (unsigned i = 0; i < AnalysisImpls.size() ; ++i) { if (AnalysisImpls[i].first == PI) { ResultPass = AnalysisImpls[i].second; @@ -182,7 +182,7 @@ AnalysisType *Pass::getAnalysisIfAvailable() const { const void *PI = &AnalysisType::ID; Pass *ResultPass = Resolver->getAnalysisIfAvailable(PI, true); - if (ResultPass == 0) return 0; + if (!ResultPass) return nullptr; // Because the AnalysisType may not be a subclass of pass (for // AnalysisGroups), we use getAdjustedAnalysisPointer here to potentially diff --git a/include/llvm/PassRegistry.h b/include/llvm/PassRegistry.h index 756b1b8..7f2a014 100644 --- a/include/llvm/PassRegistry.h +++ b/include/llvm/PassRegistry.h @@ -37,7 +37,7 @@ class PassRegistry { void *getImpl() const; public: - PassRegistry() : pImpl(0) { } + PassRegistry() : pImpl(nullptr) { } ~PassRegistry(); /// getPassRegistry - Access the global registry object, which is diff --git a/include/llvm/PassSupport.h b/include/llvm/PassSupport.h index baee77f..8efb45f 100644 --- a/include/llvm/PassSupport.h +++ b/include/llvm/PassSupport.h @@ -59,7 +59,7 @@ public: /// through RegisterPass. PassInfo(const char *name, const char *arg, const void *pi, NormalCtor_t normal, bool isCFGOnly, bool is_analysis, - TargetMachineCtor_t machine = NULL) + TargetMachineCtor_t machine = nullptr) : PassName(name), PassArgument(arg), PassID(pi), IsCFGOnlyPass(isCFGOnly), IsAnalysis(is_analysis), IsAnalysisGroup(false), NormalCtor(normal), @@ -70,8 +70,8 @@ public: PassInfo(const char *name, const void *pi) : PassName(name), PassArgument(""), PassID(pi), IsCFGOnlyPass(false), - IsAnalysis(false), IsAnalysisGroup(true), NormalCtor(0), - TargetMachineCtor(0) {} + IsAnalysis(false), IsAnalysisGroup(true), NormalCtor(nullptr), + TargetMachineCtor(nullptr) {} /// getPassName - Return the friendly name for the pass, never returns null /// @@ -256,7 +256,7 @@ class RegisterAGBase : public PassInfo { public: RegisterAGBase(const char *Name, const void *InterfaceID, - const void *PassID = 0, + const void *PassID = nullptr, bool isDefault = false); }; diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h index d8f3ca6..8457678 100644 --- a/include/llvm/ProfileData/InstrProf.h +++ b/include/llvm/ProfileData/InstrProf.h @@ -29,6 +29,7 @@ struct instrprof_error { bad_magic, bad_header, unsupported_version, + unsupported_hash_type, too_large, truncated, malformed, diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h index 2c070b9..3e18c76 100644 --- a/include/llvm/ProfileData/InstrProfReader.h +++ b/include/llvm/ProfileData/InstrProfReader.h @@ -16,10 +16,12 @@ #define LLVM_PROFILEDATA_INSTRPROF_READER_H_ #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Endian.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/OnDiskHashTable.h" #include @@ -29,6 +31,9 @@ class InstrProfReader; /// Profiling information for a single function. 
struct InstrProfRecord { + InstrProfRecord() {} + InstrProfRecord(StringRef Name, uint64_t Hash, ArrayRef Counts) + : Name(Name), Hash(Hash), Counts(Counts) {} StringRef Name; uint64_t Hash; ArrayRef Counts; @@ -160,6 +165,7 @@ private: const ProfileData *DataEnd; const uint64_t *CountersStart; const char *NamesStart; + const char *ProfileEnd; RawInstrProfReader(const TextInstrProfReader &) LLVM_DELETED_FUNCTION; RawInstrProfReader &operator=(const TextInstrProfReader &) @@ -173,6 +179,7 @@ public: error_code readNextRecord(InstrProfRecord &Record) override; private: + error_code readNextHeader(const char *CurrentPos); error_code readHeader(const RawHeader &Header); template IntT swap(IntT Int) const { @@ -191,6 +198,104 @@ private: typedef RawInstrProfReader RawInstrProfReader32; typedef RawInstrProfReader RawInstrProfReader64; +namespace IndexedInstrProf { +enum class HashT : uint32_t; +} + +/// Trait for lookups into the on-disk hash table for the binary instrprof +/// format. +class InstrProfLookupTrait { + std::vector CountBuffer; + IndexedInstrProf::HashT HashType; +public: + InstrProfLookupTrait(IndexedInstrProf::HashT HashType) : HashType(HashType) {} + + typedef InstrProfRecord data_type; + typedef StringRef internal_key_type; + typedef StringRef external_key_type; + typedef uint64_t hash_value_type; + typedef uint64_t offset_type; + + static bool EqualKey(StringRef A, StringRef B) { return A == B; } + static StringRef GetInternalKey(StringRef K) { return K; } + + hash_value_type ComputeHash(StringRef K); + + static std::pair + ReadKeyDataLength(const unsigned char *&D) { + using namespace support; + offset_type KeyLen = endian::readNext(D); + offset_type DataLen = endian::readNext(D); + return std::make_pair(KeyLen, DataLen); + } + + StringRef ReadKey(const unsigned char *D, offset_type N) { + return StringRef((const char *)D, N); + } + + InstrProfRecord ReadData(StringRef K, const unsigned char *D, offset_type N) { + if (N < 2 * sizeof(uint64_t) || N % sizeof(uint64_t)) { + // The data is corrupt, don't try to read it. + CountBuffer.clear(); + return InstrProfRecord("", 0, CountBuffer); + } + + using namespace support; + + // The first stored value is the hash. + uint64_t Hash = endian::readNext(D); + // Each counter follows. + unsigned NumCounters = N / sizeof(uint64_t) - 1; + CountBuffer.clear(); + CountBuffer.reserve(NumCounters - 1); + for (unsigned I = 0; I < NumCounters; ++I) + CountBuffer.push_back(endian::readNext(D)); + + return InstrProfRecord(K, Hash, CountBuffer); + } +}; +typedef OnDiskIterableChainedHashTable + InstrProfReaderIndex; + +/// Reader for the indexed binary instrprof format. +class IndexedInstrProfReader : public InstrProfReader { +private: + /// The profile data file contents. + std::unique_ptr DataBuffer; + /// The index into the profile data. + std::unique_ptr Index; + /// Iterator over the profile data. + InstrProfReaderIndex::data_iterator RecordIterator; + /// The maximal execution count among all functions. + uint64_t MaxFunctionCount; + + IndexedInstrProfReader(const IndexedInstrProfReader &) LLVM_DELETED_FUNCTION; + IndexedInstrProfReader &operator=(const IndexedInstrProfReader &) + LLVM_DELETED_FUNCTION; +public: + IndexedInstrProfReader(std::unique_ptr DataBuffer) + : DataBuffer(std::move(DataBuffer)), Index(nullptr), + RecordIterator(InstrProfReaderIndex::data_iterator()) {} + + /// Return true if the given buffer is in an indexed instrprof format. + static bool hasFormat(const MemoryBuffer &DataBuffer); + + /// Read the file header.
+ error_code readHeader() override; + /// Read a single record. + error_code readNextRecord(InstrProfRecord &Record) override; + + /// Fill Counts with the profile data for the given function name. + error_code getFunctionCounts(StringRef FuncName, uint64_t &FuncHash, + std::vector &Counts); + /// Return the maximum of all known function counts. + uint64_t getMaximumFunctionCount() { return MaxFunctionCount; } + + /// Factory method to create an indexed reader. + static error_code create(std::string Path, + std::unique_ptr &Result); +}; + } // end namespace llvm #endif // LLVM_PROFILEDATA_INSTRPROF_READER_H_ diff --git a/include/llvm/ProfileData/InstrProfWriter.h b/include/llvm/ProfileData/InstrProfWriter.h index f818fa0..fa37bf1 100644 --- a/include/llvm/ProfileData/InstrProfWriter.h +++ b/include/llvm/ProfileData/InstrProfWriter.h @@ -41,7 +41,7 @@ public: error_code addFunctionCounts(StringRef FunctionName, uint64_t FunctionHash, ArrayRef Counters); /// Ensure that all data is written to disk. - void write(raw_ostream &OS); + void write(raw_fd_ostream &OS); }; } // end namespace llvm diff --git a/include/llvm/Support/ARMBuildAttributes.h b/include/llvm/Support/ARMBuildAttributes.h index 69732fc..1631200 100644 --- a/include/llvm/Support/ARMBuildAttributes.h +++ b/include/llvm/Support/ARMBuildAttributes.h @@ -146,6 +146,19 @@ enum { AllowNeon2 = 2, // SIMDv2 was permitted (Half-precision FP, MAC operations) AllowNeonARMv8 = 3, // ARM v8-A SIMD was permitted + // Tag_ABI_PCS_RW_data, (=15), uleb128 + AddressRWPCRel = 1, // Address RW static data PC-relative + AddressRWSBRel = 2, // Address RW static data SB-relative + AddressRWNone = 3, // No RW static data permitted + + // Tag_ABI_PCS_RO_data, (=14), uleb128 + AddressROPCRel = 1, // Address RO static data PC-relative + AddressRONone = 2, // No RO static data permitted + + // Tag_ABI_PCS_GOT_use, (=17), uleb128 + AddressDirect = 1, // Address imported data directly + AddressGOT = 2, // Address imported data indirectly (via GOT) + // Tag_ABI_FP_denormal, (=20), uleb128 PreserveFPSign = 2, // sign when flushed-to-zero is preserved diff --git a/include/llvm/Support/Allocator.h b/include/llvm/Support/Allocator.h index 0641322..7a7e4c0 100644 --- a/include/llvm/Support/Allocator.h +++ b/include/llvm/Support/Allocator.h @@ -6,14 +6,22 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file defines the MallocAllocator and BumpPtrAllocator interfaces. -// +/// \file +/// +/// This file defines the MallocAllocator and BumpPtrAllocator interfaces. Both +/// of these conform to an LLVM "Allocator" concept which consists of an +/// Allocate method accepting a size and alignment, and a Deallocate accepting +/// a pointer and size. Further, the LLVM "Allocator" concept has overloads of +/// Allocate and Deallocate for setting size and alignment based on the final +/// type. These overloads are typically provided by a base class template \c +/// AllocatorBase. 
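
A toy model of the CRTP "Allocator" concept just described: derive from AllocatorBase, provide the two core methods, and pull in the typed overloads. This mirrors the MallocAllocator in the hunks below and is purely illustrative.

#include "llvm/Support/Allocator.h"
#include <cstdlib>

class MyMallocAllocator : public llvm::AllocatorBase<MyMallocAllocator> {
public:
  void *Allocate(size_t Size, size_t /*Alignment*/) {
    return std::malloc(Size); // Core overload the concept requires.
  }
  using llvm::AllocatorBase<MyMallocAllocator>::Allocate; // Typed helpers.

  void Deallocate(const void *Ptr, size_t /*Size*/) {
    std::free(const_cast<void *>(Ptr));
  }
  using llvm::AllocatorBase<MyMallocAllocator>::Deallocate;
};
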
+/// //===----------------------------------------------------------------------===// #ifndef LLVM_SUPPORT_ALLOCATOR_H #define LLVM_SUPPORT_ALLOCATOR_H +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/AlignOf.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/MathExtras.h" @@ -24,90 +32,86 @@ #include namespace llvm { -template struct ReferenceAdder { - typedef T &result; -}; -template struct ReferenceAdder { - typedef T result; -}; -class MallocAllocator { +/// \brief CRTP base class providing obvious overloads for the core \c +/// Allocate() methods of LLVM-style allocators. +/// +/// This base class both documents the full public interface exposed by all +/// LLVM-style allocators, and redirects all of the overloads to a single core +/// set of methods which the derived class must define. +template class AllocatorBase { public: - MallocAllocator() {} - ~MallocAllocator() {} - - void Reset() {} - - void *Allocate(size_t Size, size_t /*Alignment*/) { return malloc(Size); } - - template T *Allocate() { - return static_cast(malloc(sizeof(T))); + /// \brief Allocate \a Size bytes of \a Alignment aligned memory. This method + /// must be implemented by \c DerivedT. + void *Allocate(size_t Size, size_t Alignment) { +#ifdef __clang__ + static_assert(static_cast( + &AllocatorBase::Allocate) != + static_cast( + &DerivedT::Allocate), + "Class derives from AllocatorBase without implementing the " + "core Allocate(size_t, size_t) overload!"); +#endif + return static_cast(this)->Allocate(Size, Alignment); } - template T *Allocate(size_t Num) { - return static_cast(malloc(sizeof(T) * Num)); + /// \brief Deallocate \a Ptr to \a Size bytes of memory allocated by this + /// allocator. + void Deallocate(const void *Ptr, size_t Size) { +#ifdef __clang__ + static_assert(static_cast( + &AllocatorBase::Deallocate) != + static_cast( + &DerivedT::Deallocate), + "Class derives from AllocatorBase without implementing the " + "core Deallocate(void *) overload!"); +#endif + return static_cast(this)->Deallocate(Ptr, Size); } - void Deallocate(const void *Ptr) { free(const_cast(Ptr)); } + // The rest of these methods are helpers that redirect to one of the above + // core methods. - void PrintStats() const {} -}; + /// \brief Allocate space for a sequence of objects without constructing them. + template T *Allocate(size_t Num = 1) { + return static_cast(Allocate(Num * sizeof(T), AlignOf::Alignment)); + } -/// MemSlab - This structure lives at the beginning of every slab allocated by -/// the bump allocator. -class MemSlab { -public: - size_t Size; - MemSlab *NextPtr; + /// \brief Deallocate space for a sequence of objects without constructing them. + template + typename std::enable_if< + !std::is_same::type, void>::value, void>::type + Deallocate(T *Ptr, size_t Num = 1) { + Deallocate(static_cast(Ptr), Num * sizeof(T)); + } }; -/// SlabAllocator - This class can be used to parameterize the underlying -/// allocation strategy for the bump allocator. In particular, this is used -/// by the JIT to allocate contiguous swathes of executable memory. The -/// interface uses MemSlab's instead of void *'s so that the allocator -/// doesn't have to remember the size of the pointer it allocated. 
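
Looking back at the IndexedInstrProfReader added in the InstrProfReader.h hunks earlier, a hedged end-to-end sketch of the factory and lookup calls; the profile path and function name are placeholders, and the stripped template arguments (unique_ptr of IndexedInstrProfReader, vector of uint64_t) are assumptions.

#include "llvm/ProfileData/InstrProfReader.h"
#include <memory>
#include <vector>

static bool countsFor(llvm::StringRef Func, std::vector<uint64_t> &Counts) {
  std::unique_ptr<llvm::IndexedInstrProfReader> Reader;
  if (llvm::IndexedInstrProfReader::create("default.profdata", Reader))
    return false; // A nonzero error_code signals failure.
  uint64_t Hash;
  return !Reader->getFunctionCounts(Func, Hash, Counts);
}
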
-class SlabAllocator { +class MallocAllocator : public AllocatorBase { public: - virtual ~SlabAllocator(); - virtual MemSlab *Allocate(size_t Size) = 0; - virtual void Deallocate(MemSlab *Slab) = 0; -}; + void Reset() {} -/// MallocSlabAllocator - The default slab allocator for the bump allocator -/// is an adapter class for MallocAllocator that just forwards the method -/// calls and translates the arguments. -class MallocSlabAllocator : public SlabAllocator { - /// Allocator - The underlying allocator that we forward to. - /// - MallocAllocator Allocator; + void *Allocate(size_t Size, size_t /*Alignment*/) { return malloc(Size); } -public: - MallocSlabAllocator() : Allocator() {} - virtual ~MallocSlabAllocator(); - MemSlab *Allocate(size_t Size) override; - void Deallocate(MemSlab *Slab) override; -}; + // Pull in base class overloads. + using AllocatorBase::Allocate; -/// \brief Non-templated base class for the \c BumpPtrAllocatorImpl template. -class BumpPtrAllocatorBase { -public: - void Deallocate(const void * /*Ptr*/) {} - void PrintStats() const; + void Deallocate(const void *Ptr, size_t /*Size*/) { + free(const_cast(Ptr)); + } - /// \brief Returns the total physical memory allocated by this allocator. - size_t getTotalMemory() const; + // Pull in base class overloads. + using AllocatorBase::Deallocate; -protected: - /// \brief The slab that we are currently allocating into. - MemSlab *CurSlab; + void PrintStats() const {} +}; - /// \brief How many bytes we've allocated. - /// - /// Used so that we can compute how much space was wasted. - size_t BytesAllocated; +namespace detail { - BumpPtrAllocatorBase() : CurSlab(0), BytesAllocated(0) {} -}; +// We call out to an external function to actually print the message as the +// printing code uses Allocator.h in its implementation. +void printBumpPtrAllocatorStats(unsigned NumSlabs, size_t BytesAllocated, + size_t TotalMemory); +} // End namespace detail. /// \brief Allocate memory in an ever growing pool, as if by bump-pointer. /// @@ -119,11 +123,15 @@ protected: /// /// Note that this also has a threshold for forcing allocations above a certain /// size into their own slab. -template -class BumpPtrAllocatorImpl : public BumpPtrAllocatorBase { - BumpPtrAllocatorImpl(const BumpPtrAllocatorImpl &) LLVM_DELETED_FUNCTION; - void operator=(const BumpPtrAllocatorImpl &) LLVM_DELETED_FUNCTION; - +/// +/// The BumpPtrAllocatorImpl template defaults to using a MallocAllocator +/// object, which wraps malloc, to allocate memory, but it can be changed to +/// use a custom allocator. +template +class BumpPtrAllocatorImpl + : public AllocatorBase< + BumpPtrAllocatorImpl> { public: static_assert(SizeThreshold <= SlabSize, "The SizeThreshold must be at most the SlabSize to ensure " @@ -131,26 +139,69 @@ public: "allocation."); BumpPtrAllocatorImpl() - : Allocator(DefaultSlabAllocator), NumSlabs(0) {} - BumpPtrAllocatorImpl(SlabAllocator &Allocator) - : Allocator(Allocator), NumSlabs(0) {} - ~BumpPtrAllocatorImpl() { DeallocateSlabs(CurSlab); } + : CurPtr(nullptr), End(nullptr), BytesAllocated(0), Allocator() {} + template + BumpPtrAllocatorImpl(T &&Allocator) + : CurPtr(nullptr), End(nullptr), BytesAllocated(0), + Allocator(std::forward(Allocator)) {} + + // Manually implement a move constructor as we must clear the old allocator's + // slabs as a matter of correctness.
+ BumpPtrAllocatorImpl(BumpPtrAllocatorImpl &&Old) + : CurPtr(Old.CurPtr), End(Old.End), Slabs(std::move(Old.Slabs)), + CustomSizedSlabs(std::move(Old.CustomSizedSlabs)), + BytesAllocated(Old.BytesAllocated), + Allocator(std::move(Old.Allocator)) { + Old.CurPtr = Old.End = nullptr; + Old.BytesAllocated = 0; + Old.Slabs.clear(); + Old.CustomSizedSlabs.clear(); + } + + ~BumpPtrAllocatorImpl() { + DeallocateSlabs(Slabs.begin(), Slabs.end()); + DeallocateCustomSizedSlabs(); + } + + BumpPtrAllocatorImpl &operator=(BumpPtrAllocatorImpl &&RHS) { + DeallocateSlabs(Slabs.begin(), Slabs.end()); + DeallocateCustomSizedSlabs(); + + CurPtr = RHS.CurPtr; + End = RHS.End; + BytesAllocated = RHS.BytesAllocated; + Slabs = std::move(RHS.Slabs); + CustomSizedSlabs = std::move(RHS.CustomSizedSlabs); + Allocator = std::move(RHS.Allocator); + + RHS.CurPtr = RHS.End = nullptr; + RHS.BytesAllocated = 0; + RHS.Slabs.clear(); + RHS.CustomSizedSlabs.clear(); + return *this; + } /// \brief Deallocate all but the current slab and reset the current pointer /// to the beginning of it, freeing all memory allocated so far. void Reset() { - if (!CurSlab) + if (Slabs.empty()) return; - DeallocateSlabs(CurSlab->NextPtr); - CurSlab->NextPtr = 0; - CurPtr = (char *)(CurSlab + 1); - End = ((char *)CurSlab) + CurSlab->Size; + + // Reset the state. BytesAllocated = 0; + CurPtr = (char *)Slabs.front(); + End = CurPtr + SlabSize; + + // Deallocate all but the first slab, and all custom sized slabs. + DeallocateSlabs(std::next(Slabs.begin()), Slabs.end()); + Slabs.erase(std::next(Slabs.begin()), Slabs.end()); + DeallocateCustomSizedSlabs(); + CustomSizedSlabs.clear(); } /// \brief Allocate space at the specified alignment. void *Allocate(size_t Size, size_t Alignment) { - if (!CurSlab) // Start a new slab if we haven't allocated one already. + if (!CurPtr) // Start a new slab if we haven't allocated one already. StartNewSlab(); // Keep track of how many bytes we've allocated. @@ -174,18 +225,13 @@ public: } // If Size is really big, allocate a separate slab for it. - size_t PaddedSize = Size + sizeof(MemSlab) + Alignment - 1; + size_t PaddedSize = Size + Alignment - 1; if (PaddedSize > SizeThreshold) { - ++NumSlabs; - MemSlab *NewSlab = Allocator.Allocate(PaddedSize); - - // Put the new slab after the current slab, since we are not allocating - // into it. - NewSlab->NextPtr = CurSlab->NextPtr; - CurSlab->NextPtr = NewSlab; + void *NewSlab = Allocator.Allocate(PaddedSize, 0); + CustomSizedSlabs.push_back(std::make_pair(NewSlab, PaddedSize)); - Ptr = alignPtr((char *)(NewSlab + 1), Alignment); - assert((uintptr_t)Ptr + Size <= (uintptr_t)NewSlab + NewSlab->Size); + Ptr = alignPtr((char *)NewSlab, Alignment); + assert((uintptr_t)Ptr + Size <= (uintptr_t)NewSlab + PaddedSize); __msan_allocated_memory(Ptr, Size); return Ptr; } @@ -199,36 +245,31 @@ public: return Ptr; } - /// \brief Allocate space for one object without constructing it. - template T *Allocate() { - return static_cast(Allocate(sizeof(T), AlignOf::Alignment)); - } + // Pull in base class overloads. + using AllocatorBase::Allocate; - /// \brief Allocate space for an array of objects without constructing them. - template T *Allocate(size_t Num) { - return static_cast(Allocate(Num * sizeof(T), AlignOf::Alignment)); - } + void Deallocate(const void * /*Ptr*/, size_t /*Size*/) {} - /// \brief Allocate space for an array of objects with the specified alignment - /// and without constructing them.
- template T *Allocate(size_t Num, size_t Alignment) { - // Round EltSize up to the specified alignment. - size_t EltSize = (sizeof(T) + Alignment - 1) & (-Alignment); - return static_cast(Allocate(Num * EltSize, Alignment)); - } + // Pull in base class overloads. + using AllocatorBase::Deallocate; - size_t GetNumSlabs() const { return NumSlabs; } + size_t GetNumSlabs() const { return Slabs.size() + CustomSizedSlabs.size(); } -private: - /// \brief The default allocator used if one is not provided. - MallocSlabAllocator DefaultSlabAllocator; + size_t getTotalMemory() const { + size_t TotalMemory = 0; + for (auto I = Slabs.begin(), E = Slabs.end(); I != E; ++I) + TotalMemory += computeSlabSize(std::distance(Slabs.begin(), I)); + for (auto &PtrAndSize : CustomSizedSlabs) + TotalMemory += PtrAndSize.second; + return TotalMemory; + } - /// \brief The underlying allocator we use to get slabs of memory. - /// - /// This defaults to MallocSlabAllocator, which wraps malloc, but it could be - /// changed to use a custom allocator. - SlabAllocator &Allocator; + void PrintStats() const { + detail::printBumpPtrAllocatorStats(Slabs.size(), BytesAllocated, + getTotalMemory()); + } +private: /// \brief The current pointer into the current slab. /// /// This points to the next free byte in the slab. @@ -237,46 +278,67 @@ private: /// \brief The end of the current slab. char *End; - /// \brief How many slabs we've allocated. + /// \brief The slabs allocated so far. + SmallVector Slabs; + + /// \brief Custom-sized slabs allocated for too-large allocation requests. + SmallVector, 0> CustomSizedSlabs; + + /// \brief How many bytes we've allocated. /// - /// Used to scale the size of each slab and reduce the number of allocations - /// for extremely heavy memory use scenarios. - size_t NumSlabs; + /// Used so that we can compute how much space was wasted. + size_t BytesAllocated; - /// \brief Allocate a new slab and move the bump pointers over into the new - /// slab, modifying CurPtr and End. - void StartNewSlab() { - ++NumSlabs; + /// \brief The allocator instance we use to get slabs of memory. + AllocatorT Allocator; + + static size_t computeSlabSize(unsigned SlabIdx) { // Scale the actual allocated slab size based on the number of slabs // allocated. Every 128 slabs allocated, we double the allocated size to // reduce allocation frequency, but saturate at multiplying the slab size by // 2^30. - // FIXME: Currently, this count includes special slabs for objects above the - // size threshold. That will be fixed in a subsequent commit to make the - // growth even more predictable. - size_t AllocatedSlabSize = - SlabSize * ((size_t)1 << std::min(30, NumSlabs / 128)); - - MemSlab *NewSlab = Allocator.Allocate(AllocatedSlabSize); - NewSlab->NextPtr = CurSlab; - CurSlab = NewSlab; - CurPtr = (char *)(CurSlab + 1); - End = ((char *)CurSlab) + CurSlab->Size; + return SlabSize * ((size_t)1 << std::min(30, SlabIdx / 128)); } - /// \brief Deallocate all memory slabs after and including this one. - void DeallocateSlabs(MemSlab *Slab) { - while (Slab) { - MemSlab *NextSlab = Slab->NextPtr; + /// \brief Allocate a new slab and move the bump pointers over into the new + /// slab, modifying CurPtr and End. + void StartNewSlab() { + size_t AllocatedSlabSize = computeSlabSize(Slabs.size()); + + void *NewSlab = Allocator.Allocate(AllocatedSlabSize, 0); + Slabs.push_back(NewSlab); + CurPtr = (char *)(NewSlab); + End = ((char *)NewSlab) + AllocatedSlabSize; + } + + /// \brief Deallocate a sequence of slabs. 
+ void DeallocateSlabs(SmallVectorImpl::iterator I, + SmallVectorImpl::iterator E) { + for (; I != E; ++I) { + size_t AllocatedSlabSize = + computeSlabSize(std::distance(Slabs.begin(), I)); #ifndef NDEBUG // Poison the memory so stale pointers crash sooner. Note we must // preserve the Size and NextPtr fields at the beginning. - sys::Memory::setRangeWritable(Slab + 1, Slab->Size - sizeof(MemSlab)); - memset(Slab + 1, 0xCD, Slab->Size - sizeof(MemSlab)); + sys::Memory::setRangeWritable(*I, AllocatedSlabSize); + memset(*I, 0xCD, AllocatedSlabSize); #endif - Allocator.Deallocate(Slab); - Slab = NextSlab; - --NumSlabs; + Allocator.Deallocate(*I, AllocatedSlabSize); + } + } + + /// \brief Deallocate all memory for custom sized slabs. + void DeallocateCustomSizedSlabs() { + for (auto &PtrAndSize : CustomSizedSlabs) { + void *Ptr = PtrAndSize.first; + size_t Size = PtrAndSize.second; +#ifndef NDEBUG + // Poison the memory so stale pointers crash sooner. Note we must + // preserve the Size and NextPtr fields at the beginning. + sys::Memory::setRangeWritable(Ptr, Size); + memset(Ptr, 0xCD, Size); +#endif + Allocator.Deallocate(Ptr, Size); } } @@ -297,25 +359,42 @@ template class SpecificBumpPtrAllocator { public: SpecificBumpPtrAllocator() : Allocator() {} - SpecificBumpPtrAllocator(SlabAllocator &allocator) : Allocator(allocator) {} - + SpecificBumpPtrAllocator(SpecificBumpPtrAllocator &&Old) + : Allocator(std::move(Old.Allocator)) {} ~SpecificBumpPtrAllocator() { DestroyAll(); } + SpecificBumpPtrAllocator &operator=(SpecificBumpPtrAllocator &&RHS) { + Allocator = std::move(RHS.Allocator); + return *this; + } + /// Call the destructor of each allocated object and deallocate all but the /// current slab and reset the current pointer to the beginning of it, freeing /// all memory allocated so far. void DestroyAll() { - MemSlab *Slab = Allocator.CurSlab; - while (Slab) { - char *End = Slab == Allocator.CurSlab ? Allocator.CurPtr - : (char *)Slab + Slab->Size; - for (char *Ptr = (char *)(Slab + 1); Ptr < End; Ptr += sizeof(T)) { - Ptr = alignPtr(Ptr, alignOf()); - if (Ptr + sizeof(T) <= End) - reinterpret_cast(Ptr)->~T(); - } - Slab = Slab->NextPtr; + auto DestroyElements = [](char *Begin, char *End) { + assert(Begin == alignPtr(Begin, alignOf())); + for (char *Ptr = Begin; Ptr + sizeof(T) <= End; Ptr += sizeof(T)) + reinterpret_cast(Ptr)->~T(); + }; + + for (auto I = Allocator.Slabs.begin(), E = Allocator.Slabs.end(); I != E; + ++I) { + size_t AllocatedSlabSize = BumpPtrAllocator::computeSlabSize( + std::distance(Allocator.Slabs.begin(), I)); + char *Begin = alignPtr((char *)*I, alignOf()); + char *End = *I == Allocator.Slabs.back() ? 
Allocator.CurPtr + : (char *)*I + AllocatedSlabSize; + + DestroyElements(Begin, End); + } + + for (auto &PtrAndSize : Allocator.CustomSizedSlabs) { + void *Ptr = PtrAndSize.first; + size_t Size = PtrAndSize.second; + DestroyElements(alignPtr((char *)Ptr, alignOf()), (char *)Ptr + Size); } + Allocator.Reset(); } @@ -325,10 +404,10 @@ public: } // end namespace llvm -template -void * -operator new(size_t Size, - llvm::BumpPtrAllocatorImpl &Allocator) { +template +void *operator new(size_t Size, + llvm::BumpPtrAllocatorImpl &Allocator) { struct S { char c; union { @@ -342,8 +421,9 @@ operator new(size_t Size, Size, std::min((size_t)llvm::NextPowerOf2(Size), offsetof(S, x))); } -template -void operator delete(void *, - llvm::BumpPtrAllocatorImpl &) {} +template +void operator delete( + void *, llvm::BumpPtrAllocatorImpl &) { +} #endif // LLVM_SUPPORT_ALLOCATOR_H diff --git a/include/llvm/Support/ArrayRecycler.h b/include/llvm/Support/ArrayRecycler.h index e974332..36f644a 100644 --- a/include/llvm/Support/ArrayRecycler.h +++ b/include/llvm/Support/ArrayRecycler.h @@ -44,10 +44,10 @@ class ArrayRecycler { // Return NULL if no entries are available. T *pop(unsigned Idx) { if (Idx >= Bucket.size()) - return 0; + return nullptr; FreeList *Entry = Bucket[Idx]; if (!Entry) - return 0; + return nullptr; Bucket[Idx] = Entry->Next; return reinterpret_cast(Entry); } diff --git a/include/llvm/Support/BlockFrequency.h b/include/llvm/Support/BlockFrequency.h index dae520b..4304a25 100644 --- a/include/llvm/Support/BlockFrequency.h +++ b/include/llvm/Support/BlockFrequency.h @@ -23,14 +23,8 @@ class BranchProbability; // This class represents Block Frequency as a 64-bit value. class BlockFrequency { - uint64_t Frequency; - /// \brief Scale the given BlockFrequency by N/D. Return the remainder from - /// the division by D. Upon overflow, the routine will saturate and - /// additionally will return the remainder set to D. - uint32_t scale(uint32_t N, uint32_t D); - public: BlockFrequency(uint64_t Freq = 0) : Frequency(Freq) { } @@ -58,10 +52,6 @@ public: /// \brief Shift block frequency to the right by count digits saturating to 1. BlockFrequency &operator>>=(const unsigned count); - /// \brief Scale the given BlockFrequency by N/D. Return the remainder from - /// the division by D. Upon overflow, the routine will saturate. - uint32_t scale(const BranchProbability &Prob); - bool operator<(const BlockFrequency &RHS) const { return Frequency < RHS.Frequency; } diff --git a/include/llvm/Support/BranchProbability.h b/include/llvm/Support/BranchProbability.h index eedf692..9aab6ac 100644 --- a/include/llvm/Support/BranchProbability.h +++ b/include/llvm/Support/BranchProbability.h @@ -46,10 +46,26 @@ public: return BranchProbability(D - N, D); } - void print(raw_ostream &OS) const; + raw_ostream &print(raw_ostream &OS) const; void dump() const; + /// \brief Scale a large integer. + /// + /// Scales \c Num. Guarantees full precision. Returns the floor of the + /// result. + /// + /// \return \c Num times \c this. + uint64_t scale(uint64_t Num) const; + + /// \brief Scale a large integer by the inverse. + /// + /// Scales \c Num by the inverse of \c this. Guarantees full precision. + /// Returns the floor of the result. + /// + /// \return \c Num divided by \c this. 
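+ ///
+ /// A worked sketch of scale() and scaleByInverse() together (the values are
+ /// illustrative only):
+ ///
+ /// \code
+ /// BranchProbability P(1, 3); // represents 1/3
+ /// uint64_t Taken = P.scale(900); // floor(900 * 1/3) == 300
+ /// uint64_t Total = P.scaleByInverse(300); // floor(300 / (1/3)) == 900
+ /// \endcode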
+ uint64_t scaleByInverse(uint64_t Num) const; + bool operator==(BranchProbability RHS) const { return (uint64_t)N * RHS.D == (uint64_t)D * RHS.N; } @@ -59,18 +75,14 @@ public: bool operator<(BranchProbability RHS) const { return (uint64_t)N * RHS.D < (uint64_t)D * RHS.N; } - bool operator>(BranchProbability RHS) const { - return RHS < *this; - } - bool operator<=(BranchProbability RHS) const { - return (uint64_t)N * RHS.D <= (uint64_t)D * RHS.N; - } - bool operator>=(BranchProbability RHS) const { - return RHS <= *this; - } + bool operator>(BranchProbability RHS) const { return RHS < *this; } + bool operator<=(BranchProbability RHS) const { return !(RHS < *this); } + bool operator>=(BranchProbability RHS) const { return !(*this < RHS); } }; -raw_ostream &operator<<(raw_ostream &OS, const BranchProbability &Prob); +inline raw_ostream &operator<<(raw_ostream &OS, const BranchProbability &Prob) { + return Prob.print(OS); +} } diff --git a/include/llvm/Support/COFF.h b/include/llvm/Support/COFF.h index dca7fc6..f0e5c7d 100644 --- a/include/llvm/Support/COFF.h +++ b/include/llvm/Support/COFF.h @@ -275,7 +275,7 @@ namespace COFF { uint16_t Type; }; - enum RelocationTypeX86 { + enum RelocationTypeI386 { IMAGE_REL_I386_ABSOLUTE = 0x0000, IMAGE_REL_I386_DIR16 = 0x0001, IMAGE_REL_I386_REL16 = 0x0002, @@ -286,8 +286,10 @@ namespace COFF { IMAGE_REL_I386_SECREL = 0x000B, IMAGE_REL_I386_TOKEN = 0x000C, IMAGE_REL_I386_SECREL7 = 0x000D, - IMAGE_REL_I386_REL32 = 0x0014, + IMAGE_REL_I386_REL32 = 0x0014 + }; + enum RelocationTypeAMD64 { IMAGE_REL_AMD64_ABSOLUTE = 0x0000, IMAGE_REL_AMD64_ADDR64 = 0x0001, IMAGE_REL_AMD64_ADDR32 = 0x0002, diff --git a/include/llvm/Support/Casting.h b/include/llvm/Support/Casting.h index 689f590..beed31a 100644 --- a/include/llvm/Support/Casting.h +++ b/include/llvm/Support/Casting.h @@ -245,7 +245,7 @@ inline typename cast_retty::ret_type cast(Y *Val) { template LLVM_ATTRIBUTE_UNUSED_RESULT inline typename cast_retty::ret_type cast_or_null(Y *Val) { - if (Val == 0) return 0; + if (!Val) return nullptr; assert(isa(Val) && "cast_or_null() argument of incompatible type!"); return cast(Val); } @@ -263,19 +263,19 @@ template LLVM_ATTRIBUTE_UNUSED_RESULT inline typename std::enable_if< !is_simple_type::value, typename cast_retty::ret_type>::type dyn_cast(const Y &Val) { - return isa(Val) ? cast(Val) : 0; + return isa(Val) ? cast(Val) : nullptr; } template LLVM_ATTRIBUTE_UNUSED_RESULT inline typename cast_retty::ret_type dyn_cast(Y &Val) { - return isa(Val) ? cast(Val) : 0; + return isa(Val) ? cast(Val) : nullptr; } template LLVM_ATTRIBUTE_UNUSED_RESULT inline typename cast_retty::ret_type dyn_cast(Y *Val) { - return isa(Val) ? cast(Val) : 0; + return isa(Val) ? cast(Val) : nullptr; } // dyn_cast_or_null - Functionally identical to dyn_cast, except that a null @@ -284,7 +284,7 @@ dyn_cast(Y *Val) { template LLVM_ATTRIBUTE_UNUSED_RESULT inline typename cast_retty::ret_type dyn_cast_or_null(Y *Val) { - return (Val && isa(Val)) ? cast(Val) : 0; + return (Val && isa(Val)) ? cast(Val) : nullptr; } } // End llvm namespace diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h index e49a97e..5cb5501 100644 --- a/include/llvm/Support/CommandLine.h +++ b/include/llvm/Support/CommandLine.h @@ -41,14 +41,14 @@ namespace cl { // ParseCommandLineOptions - Command line option processing entry point. 
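// A typical call from main() is sketched below (the overview string is
// illustrative):
//
//   cl::ParseCommandLineOptions(argc, argv, "my tool\n");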
// void ParseCommandLineOptions(int argc, const char * const *argv, - const char *Overview = 0); + const char *Overview = nullptr); //===----------------------------------------------------------------------===// // ParseEnvironmentOptions - Environment variable option processing alternate // entry point. // void ParseEnvironmentOptions(const char *progName, const char *envvar, - const char *Overview = 0); + const char *Overview = nullptr); ///===---------------------------------------------------------------------===// /// SetVersionPrinter - Override the default (LLVM specific) version printer @@ -146,7 +146,7 @@ private: const char *const Description; void registerCategory(); public: - OptionCategory(const char *const Name, const char *const Description = 0) + OptionCategory(const char *const Name, const char *const Description = nullptr) : Name(Name), Description(Description) { registerCategory(); } const char *getName() const { return Name; } const char *getDescription() const { return Description; } @@ -238,7 +238,7 @@ protected: enum OptionHidden Hidden) : NumOccurrences(0), Occurrences(OccurrencesFlag), Value(0), HiddenFlag(Hidden), Formatting(NormalFormatting), Misc(0), - Position(0), AdditionalVals(0), NextRegistered(0), + Position(0), AdditionalVals(0), NextRegistered(nullptr), ArgStr(""), HelpStr(""), ValueStr(""), Category(&GeneralCategory) { } @@ -763,7 +763,7 @@ public: } // getValueName - Do not print = at all. - const char *getValueName() const override { return 0; } + const char *getValueName() const override { return nullptr; } void printOptionDiff(const Option &O, bool V, OptVal Default, size_t GlobalWidth) const; @@ -787,7 +787,7 @@ public: } // getValueName - Do not print = at all. - const char *getValueName() const override { return 0; } + const char *getValueName() const override { return nullptr; } void printOptionDiff(const Option &O, boolOrDefault V, OptVal Default, size_t GlobalWidth) const; @@ -1063,12 +1063,12 @@ class opt_storage { OptionValue Default; void check_location() const { - assert(Location != 0 && "cl::location(...) not specified for a command " + assert(Location && "cl::location(...) not specified for a command " "line option with external storage, " "or cl::init specified before cl::location()!!"); } public: - opt_storage() : Location(0) {} + opt_storage() : Location(nullptr) {} bool setLocation(Option &O, DataType &L) { if (Location) @@ -1469,7 +1469,7 @@ class bits_storage { } public: - bits_storage() : Location(0) {} + bits_storage() : Location(nullptr) {} bool setLocation(Option &O, unsigned &L) { if (Location) @@ -1664,7 +1664,7 @@ class alias : public Option { void done() { if (!hasArgStr()) error("cl::alias must have argument name specified!"); - if (AliasFor == 0) + if (!AliasFor) error("cl::alias must have an cl::aliasopt(option) specified!"); addArgument(); } @@ -1677,27 +1677,28 @@ public: // One option... template - explicit alias(const M0t &M0) : Option(Optional, Hidden), AliasFor(0) { + explicit alias(const M0t &M0) : Option(Optional, Hidden), AliasFor(nullptr) { apply(M0, this); done(); } // Two options... template - alias(const M0t &M0, const M1t &M1) : Option(Optional, Hidden), AliasFor(0) { + alias(const M0t &M0, const M1t &M1) + : Option(Optional, Hidden), AliasFor(nullptr) { apply(M0, this); apply(M1, this); done(); } // Three options... 
template alias(const M0t &M0, const M1t &M1, const M2t &M2) - : Option(Optional, Hidden), AliasFor(0) { + : Option(Optional, Hidden), AliasFor(nullptr) { apply(M0, this); apply(M1, this); apply(M2, this); done(); } // Four options... template alias(const M0t &M0, const M1t &M1, const M2t &M2, const M3t &M3) - : Option(Optional, Hidden), AliasFor(0) { + : Option(Optional, Hidden), AliasFor(nullptr) { apply(M0, this); apply(M1, this); apply(M2, this); apply(M3, this); done(); } diff --git a/include/llvm/Support/Compression.h b/include/llvm/Support/Compression.h index 80eff5c..8152b60 100644 --- a/include/llvm/Support/Compression.h +++ b/include/llvm/Support/Compression.h @@ -16,10 +16,10 @@ #include "llvm/Support/DataTypes.h" #include +#include "llvm/ADT/SmallVector.h" namespace llvm { -class MemoryBuffer; class StringRef; namespace zlib { @@ -42,12 +42,11 @@ enum Status { bool isAvailable(); -Status compress(StringRef InputBuffer, - std::unique_ptr &CompressedBuffer, +Status compress(StringRef InputBuffer, SmallVectorImpl &CompressedBuffer, CompressionLevel Level = DefaultCompression); Status uncompress(StringRef InputBuffer, - std::unique_ptr &UncompressedBuffer, + SmallVectorImpl &UncompressedBuffer, size_t UncompressedSize); uint32_t crc32(StringRef Buffer); diff --git a/include/llvm/Support/CrashRecoveryContext.h b/include/llvm/Support/CrashRecoveryContext.h index 4500efe..c132373 100644 --- a/include/llvm/Support/CrashRecoveryContext.h +++ b/include/llvm/Support/CrashRecoveryContext.h @@ -12,11 +12,13 @@ #include +#include "llvm/ADT/STLExtras.h" + namespace llvm { class StringRef; class CrashRecoveryContextCleanup; - + /// \brief Crash recovery helper object. /// /// This class implements support for running operations in a safe context so @@ -46,21 +48,10 @@ class CrashRecoveryContext { void *Impl; CrashRecoveryContextCleanup *head; - /// An adaptor to convert an arbitrary functor into a void(void*), void* pair. - template struct FunctorAdaptor { - T Fn; - static void invoke(void *Data) { - return static_cast*>(Data)->Fn(); - } - typedef void Callback(void*); - Callback *fn() { return &invoke; } - void *arg() { return this; } - }; - public: - CrashRecoveryContext() : Impl(0), head(0) {} + CrashRecoveryContext() : Impl(nullptr), head(nullptr) {} ~CrashRecoveryContext(); - + void registerCleanup(CrashRecoveryContextCleanup *cleanup); void unregisterCleanup(CrashRecoveryContextCleanup *cleanup); @@ -86,11 +77,9 @@ public: /// make as little assumptions as possible about the program state when /// RunSafely has returned false. Clients can use getBacktrace() to retrieve /// the backtrace of the crash on failures. - bool RunSafely(void (*Fn)(void*), void *UserData); - template - bool RunSafely(Functor Fn) { - FunctorAdaptor Adaptor = { Fn }; - return RunSafely(Adaptor.fn(), Adaptor.arg()); + bool RunSafely(function_ref Fn); + bool RunSafely(void (*Fn)(void*), void *UserData) { + return RunSafely([&]() { Fn(UserData); }); } /// \brief Execute the provide callback function (with the given arguments) in @@ -98,12 +87,10 @@ public: /// requested stack size). /// /// See RunSafely() and llvm_execute_on_thread(). 
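///
/// A usage sketch (ParseRiskyBuffer is a hypothetical callee, not part of
/// this interface):
///
/// \code
///   CrashRecoveryContext CRC;
///   if (!CRC.RunSafelyOnThread([&] { ParseRiskyBuffer(Buffer); }))
///     errs() << "parse crashed; state has been recovered\n";
/// \endcode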
+ bool RunSafelyOnThread(function_ref, unsigned RequestedStackSize = 0); bool RunSafelyOnThread(void (*Fn)(void*), void *UserData, - unsigned RequestedStackSize = 0); - template - bool RunSafelyOnThread(Functor Fn, unsigned RequestedStackSize = 0) { - FunctorAdaptor Adaptor = { Fn }; - return RunSafelyOnThread(Adaptor.fn(), Adaptor.arg(), RequestedStackSize); + unsigned RequestedStackSize = 0) { + return RunSafelyOnThread([&]() { Fn(UserData); }, RequestedStackSize); } /// \brief Explicitly trigger a crash recovery in the current process, and diff --git a/include/llvm/Support/Debug.h b/include/llvm/Support/Debug.h index 2702408..e93e6ca 100644 --- a/include/llvm/Support/Debug.h +++ b/include/llvm/Support/Debug.h @@ -13,10 +13,12 @@ // // In particular, just wrap your code with the DEBUG() macro, and it will be // enabled automatically if you specify '-debug' on the command-line. -// Alternatively, you can also use the SET_DEBUG_TYPE("foo") macro to specify -// that your debug code belongs to class "foo". Then, on the command line, you -// can specify '-debug-only=foo' to enable JUST the debug information for the -// foo class. +// Alternatively, you can also define the DEBUG_TYPE macro to "foo" to specify +// that your debug code belongs to class "foo". Be careful that you only do +// this after including Debug.h and not around any #include of headers. Headers +// should define and undef the macro around the code that needs to use the +// DEBUG() macro. Then, on the command line, you can specify '-debug-only=foo' +// to enable JUST the debug information for the foo class. // // When compiling without assertions, the -debug-* options and all code in // DEBUG() statements disappears, so it does not affect the runtime of the code. @@ -30,12 +32,6 @@ namespace llvm { -/// DEBUG_TYPE macro - Files can specify a DEBUG_TYPE as a string, which causes -/// all of their DEBUG statements to be activatable with -debug-only=thatstring. -#ifndef DEBUG_TYPE -#define DEBUG_TYPE "" -#endif - #ifndef NDEBUG /// DebugFlag - This boolean is set to true if the '-debug' command line option /// is specified. This should probably not be referenced directly, instead, use diff --git a/include/llvm/Support/DynamicLibrary.h b/include/llvm/Support/DynamicLibrary.h index 1e2d16c..de47be6 100644 --- a/include/llvm/Support/DynamicLibrary.h +++ b/include/llvm/Support/DynamicLibrary.h @@ -65,7 +65,7 @@ namespace sys { /// It is safe to call this function multiple times for the same library. /// @brief Open a dynamic library permanently. static DynamicLibrary getPermanentLibrary(const char *filename, - std::string *errMsg = 0); + std::string *errMsg = nullptr); /// This function permanently loads the dynamic library at the given path. /// Use this instead of getPermanentLibrary() when you won't need to get /// /// It is safe to call this function multiple times for the same library.
static bool LoadLibraryPermanently(const char *Filename, - std::string *ErrMsg = 0) { + std::string *ErrMsg = nullptr) { return !getPermanentLibrary(Filename, ErrMsg).isValid(); } diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h index 7b10ebd..0b3e55b 100644 --- a/include/llvm/Support/ELF.h +++ b/include/llvm/Support/ELF.h @@ -807,6 +807,7 @@ enum : unsigned { EF_MIPS_CPIC = 0x00000004, // Call object with Position independent code EF_MIPS_ABI2 = 0x00000020, EF_MIPS_32BITMODE = 0x00000100, + EF_MIPS_NAN2008 = 0x00000400, // Uses IEEE 754-2008 NaN encoding EF_MIPS_ABI_O32 = 0x00001000, // This file follows the first MIPS 32 bit ABI //ARCH_ASE @@ -823,11 +824,12 @@ enum : unsigned { EF_MIPS_ARCH_64 = 0x60000000, // MIPS64 instruction set per linux not elf.h EF_MIPS_ARCH_32R2 = 0x70000000, // mips32r2 EF_MIPS_ARCH_64R2 = 0x80000000, // mips64r2 + EF_MIPS_ARCH_32R6 = 0x90000000, // mips32r6 + EF_MIPS_ARCH_64R6 = 0xa0000000, // mips64r6 EF_MIPS_ARCH = 0xf0000000 // Mask for applying EF_MIPS_ARCH_ variant }; // ELF Relocation types for Mips -// . enum { R_MIPS_NONE = 0, R_MIPS_16 = 1, @@ -880,6 +882,12 @@ enum { R_MIPS_TLS_TPREL_HI16 = 49, R_MIPS_TLS_TPREL_LO16 = 50, R_MIPS_GLOB_DAT = 51, + R_MIPS_PC21_S2 = 60, + R_MIPS_PC26_S2 = 61, + R_MIPS_PC18_S3 = 62, + R_MIPS_PC19_S2 = 63, + R_MIPS_PCHI16 = 64, + R_MIPS_PCLO16 = 65, R_MIPS16_GOT16 = 102, R_MIPS16_HI16 = 104, R_MIPS16_LO16 = 105, @@ -906,7 +914,11 @@ enum { // Special values for the st_other field in the symbol table entry for MIPS. enum { - STO_MIPS_MICROMIPS = 0x80 // MIPS Specific ISA for MicroMips + STO_MIPS_OPTIONAL = 0x04, // Symbol whose definition is optional + STO_MIPS_PLT = 0x08, // PLT entry related dynamic table record + STO_MIPS_PIC = 0x20, // PIC func in an object mixes PIC/non-PIC + STO_MIPS_MICROMIPS = 0x80, // MIPS Specific ISA for MicroMips + STO_MIPS_MIPS16 = 0xf0 // MIPS Specific ISA for Mips16 }; // Hexagon Specific e_flags @@ -1661,6 +1673,7 @@ enum { DT_LOPROC = 0x70000000, // Start of processor specific tags. DT_HIPROC = 0x7FFFFFFF, // End of processor specific tags. + DT_GNU_HASH = 0x6FFFFEF5, // Reference to the GNU hash table. DT_RELACOUNT = 0x6FFFFFF9, // ELF32_Rela count. DT_RELCOUNT = 0x6FFFFFFA, // ELF32_Rel count. diff --git a/include/llvm/Support/ErrorHandling.h b/include/llvm/Support/ErrorHandling.h index b948d97..ac3a4d8 100644 --- a/include/llvm/Support/ErrorHandling.h +++ b/include/llvm/Support/ErrorHandling.h @@ -47,7 +47,7 @@ namespace llvm { /// \param user_data - An argument which will be passed to the install error /// handler. void install_fatal_error_handler(fatal_error_handler_t handler, - void *user_data = 0); + void *user_data = nullptr); /// Restores default error handling behaviour. /// This must not be called between llvm_start_multithreaded() and @@ -59,7 +59,7 @@ namespace llvm { /// remove_fatal_error_handler in its destructor. struct ScopedFatalErrorHandler { explicit ScopedFatalErrorHandler(fatal_error_handler_t handler, - void *user_data = 0) { + void *user_data = nullptr) { install_fatal_error_handler(handler, user_data); } @@ -86,9 +86,9 @@ namespace llvm { /// This function calls abort(), and prints the optional message to stderr. /// Use the llvm_unreachable macro (that adds location info), instead of /// calling this function directly.
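///
/// For example (sketch), after a switch whose cases all return:
/// \code
///   llvm_unreachable("unhandled enumerator");
/// \endcode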
- LLVM_ATTRIBUTE_NORETURN void llvm_unreachable_internal(const char *msg=0, - const char *file=0, - unsigned line=0); + LLVM_ATTRIBUTE_NORETURN void + llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, + unsigned line=0); } /// Marks that the current location is not supposed to be reachable. diff --git a/include/llvm/Support/FileOutputBuffer.h b/include/llvm/Support/FileOutputBuffer.h index 1884a24..a8a48fa 100644 --- a/include/llvm/Support/FileOutputBuffer.h +++ b/include/llvm/Support/FileOutputBuffer.h @@ -14,7 +14,6 @@ #ifndef LLVM_SUPPORT_FILEOUTPUTBUFFER_H #define LLVM_SUPPORT_FILEOUTPUTBUFFER_H -#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/DataTypes.h" @@ -41,9 +40,6 @@ public: /// buffer of the specified size. When committed, the buffer will be written /// to the file at the specified path. static error_code create(StringRef FilePath, size_t Size, - OwningPtr &Result, - unsigned Flags = 0); - static error_code create(StringRef FilePath, size_t Size, std::unique_ptr &Result, unsigned Flags = 0); diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h index b511a8e..806a3e3 100644 --- a/include/llvm/Support/FileSystem.h +++ b/include/llvm/Support/FileSystem.h @@ -165,15 +165,30 @@ class file_status file_type Type; perms Perms; public: - file_status() : Type(file_type::status_error) {} - file_status(file_type Type) : Type(Type) {} - #if defined(LLVM_ON_UNIX) + file_status() : fs_st_dev(0), fs_st_ino(0), fs_st_mtime(0), + fs_st_uid(0), fs_st_gid(0), fs_st_size(0), + Type(file_type::status_error), Perms(perms_not_known) {} + + file_status(file_type Type) : fs_st_dev(0), fs_st_ino(0), fs_st_mtime(0), + fs_st_uid(0), fs_st_gid(0), fs_st_size(0), Type(Type), + Perms(perms_not_known) {} + file_status(file_type Type, perms Perms, dev_t Dev, ino_t Ino, time_t MTime, uid_t UID, gid_t GID, off_t Size) : fs_st_dev(Dev), fs_st_ino(Ino), fs_st_mtime(MTime), fs_st_uid(UID), fs_st_gid(GID), fs_st_size(Size), Type(Type), Perms(Perms) {} #elif defined(LLVM_ON_WIN32) + file_status() : LastWriteTimeHigh(0), LastWriteTimeLow(0), + VolumeSerialNumber(0), FileSizeHigh(0), FileSizeLow(0), + FileIndexHigh(0), FileIndexLow(0), Type(file_type::status_error), + Perms(perms_not_known) {} + + file_status(file_type Type) : LastWriteTimeHigh(0), LastWriteTimeLow(0), + VolumeSerialNumber(0), FileSizeHigh(0), FileSizeLow(0), + FileIndexHigh(0), FileIndexLow(0), Type(Type), + Perms(perms_not_known) {} + file_status(file_type Type, uint32_t LastWriteTimeHigh, uint32_t LastWriteTimeLow, uint32_t VolumeSerialNumber, uint32_t FileSizeHigh, uint32_t FileSizeLow, @@ -562,7 +577,7 @@ error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, error_code createUniqueDirectory(const Twine &Prefix, SmallVectorImpl &ResultPath); -enum OpenFlags { +enum OpenFlags : unsigned { F_None = 0, /// F_Excl - When opening a file, this flag makes raw_fd_ostream @@ -814,7 +829,7 @@ public: } /// Construct end iterator. - directory_iterator() : State(0) {} + directory_iterator() : State(nullptr) {} // No operator++ because we need error_code. 
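// A typical traversal is therefore written like this sketch (Dir and
// visit() are hypothetical):
//
//   error_code EC;
//   for (directory_iterator I(Dir, EC), E; I != E && !EC; I.increment(EC))
//     visit(I->path());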
directory_iterator &increment(error_code &ec) { @@ -828,9 +843,9 @@ public: bool operator==(const directory_iterator &RHS) const { if (State == RHS.State) return true; - if (RHS.State == 0) + if (!RHS.State) return State->CurrentEntry == directory_entry(); - if (State == 0) + if (!State) return RHS.State->CurrentEntry == directory_entry(); return State->CurrentEntry == RHS.State->CurrentEntry; } diff --git a/include/llvm/Support/FileUtilities.h b/include/llvm/Support/FileUtilities.h index 873b8df..3f2f176 100644 --- a/include/llvm/Support/FileUtilities.h +++ b/include/llvm/Support/FileUtilities.h @@ -30,7 +30,7 @@ namespace llvm { int DiffFilesWithTolerance(StringRef FileA, StringRef FileB, double AbsTol, double RelTol, - std::string *Error = 0); + std::string *Error = nullptr); /// FileRemover - This class is a simple object meant to be stack allocated. diff --git a/include/llvm/Support/FormattedStream.h b/include/llvm/Support/FormattedStream.h index 78c4809..8137daa 100644 --- a/include/llvm/Support/FormattedStream.h +++ b/include/llvm/Support/FormattedStream.h @@ -85,12 +85,12 @@ public: /// underneath it. /// formatted_raw_ostream(raw_ostream &Stream, bool Delete = false) - : raw_ostream(), TheStream(0), DeleteStream(false), Position(0, 0) { + : raw_ostream(), TheStream(nullptr), DeleteStream(false), Position(0, 0) { setStream(Stream, Delete); } explicit formatted_raw_ostream() - : raw_ostream(), TheStream(0), DeleteStream(false), Position(0, 0) { - Scanned = 0; + : raw_ostream(), TheStream(nullptr), DeleteStream(false), Position(0, 0) { + Scanned = nullptr; } ~formatted_raw_ostream() { @@ -114,7 +114,7 @@ public: SetUnbuffered(); TheStream->SetUnbuffered(); - Scanned = 0; + Scanned = nullptr; } /// PadToColumn - Align the output to some column number. If the current diff --git a/include/llvm/Support/GCOV.h b/include/llvm/Support/GCOV.h index 902f2db..0cb6cfd 100644 --- a/include/llvm/Support/GCOV.h +++ b/include/llvm/Support/GCOV.h @@ -37,9 +37,9 @@ namespace GCOV { /// GCOVOptions - A struct for passing gcov options between functions. struct GCOVOptions { - GCOVOptions(bool A, bool B, bool C, bool F, bool P, bool U) + GCOVOptions(bool A, bool B, bool C, bool F, bool P, bool U, bool L, bool N) : AllBlocks(A), BranchInfo(B), BranchCount(C), FuncCoverage(F), - PreservePaths(P), UncondBranch(U) {} + PreservePaths(P), UncondBranch(U), LongFileNames(L), NoOutput(N) {} bool AllBlocks; bool BranchInfo; @@ -47,6 +47,8 @@ struct GCOVOptions { bool FuncCoverage; bool PreservePaths; bool UncondBranch; + bool LongFileNames; + bool NoOutput; }; /// GCOVBuffer - A wrapper around MemoryBuffer to provide GCOV specific @@ -232,7 +234,6 @@ class GCOVFile { public: GCOVFile() : GCNOInitialized(false), Checksum(0), Functions(), RunCount(0), ProgramCount(0) {} - ~GCOVFile(); bool readGCNO(GCOVBuffer &Buffer); bool readGCDA(GCOVBuffer &Buffer); uint32_t getChecksum() const { return Checksum; } @@ -242,27 +243,27 @@ private: bool GCNOInitialized; GCOV::GCOVVersion Version; uint32_t Checksum; - SmallVector Functions; + SmallVector, 16> Functions; uint32_t RunCount; uint32_t ProgramCount; }; /// GCOVEdge - Collects edge information. struct GCOVEdge { - GCOVEdge(GCOVBlock *S, GCOVBlock *D): Src(S), Dst(D), Count(0) {} + GCOVEdge(GCOVBlock &S, GCOVBlock &D) : Src(S), Dst(D), Count(0) {} - GCOVBlock *Src; - GCOVBlock *Dst; + GCOVBlock &Src; + GCOVBlock &Dst; uint64_t Count; }; /// GCOVFunction - Collects function information. 
class GCOVFunction { public: - typedef SmallVectorImpl::const_iterator BlockIterator; + typedef SmallVectorImpl>::const_iterator + BlockIterator; GCOVFunction(GCOVFile &P) : Parent(P), Ident(0), LineNumber(0) {} - ~GCOVFunction(); bool readGCNO(GCOVBuffer &Buffer, GCOV::GCOVVersion Version); bool readGCDA(GCOVBuffer &Buffer, GCOV::GCOVVersion Version); StringRef getName() const { return Name; } @@ -283,8 +284,8 @@ private: uint32_t LineNumber; StringRef Name; StringRef Filename; - SmallVector Blocks; - SmallVector Edges; + SmallVector, 16> Blocks; + SmallVector, 16> Edges; }; /// GCOVBlock - Collects block information. @@ -298,7 +299,7 @@ class GCOVBlock { struct SortDstEdgesFunctor { bool operator()(const GCOVEdge *E1, const GCOVEdge *E2) { - return E1->Dst->Number < E2->Dst->Number; + return E1->Dst.Number < E2->Dst.Number; } }; public: @@ -314,13 +315,13 @@ public: uint64_t getCount() const { return Counter; } void addSrcEdge(GCOVEdge *Edge) { - assert(Edge->Dst == this); // up to caller to ensure edge is valid + assert(&Edge->Dst == this); // up to caller to ensure edge is valid SrcEdges.push_back(Edge); } void addDstEdge(GCOVEdge *Edge) { - assert(Edge->Src == this); // up to caller to ensure edge is valid + assert(&Edge->Src == this); // up to caller to ensure edge is valid // Check if adding this edge causes list to become unsorted. - if (DstEdges.size() && DstEdges.back()->Dst->Number > Edge->Dst->Number) + if (DstEdges.size() && DstEdges.back()->Dst.Number > Edge->Dst.Number) DstEdgesAreSorted = false; DstEdges.push_back(Edge); } @@ -355,8 +356,10 @@ class FileInfo { typedef DenseMap BlockLines; struct LineData { + LineData() : LastLine(0) {} BlockLines Blocks; FunctionLines Functions; + uint32_t LastLine; }; struct GCOVCoverage { @@ -378,23 +381,30 @@ public: Options(Options), LineInfo(), RunCount(0), ProgramCount(0) {} void addBlockLine(StringRef Filename, uint32_t Line, const GCOVBlock *Block) { + if (Line > LineInfo[Filename].LastLine) + LineInfo[Filename].LastLine = Line; LineInfo[Filename].Blocks[Line-1].push_back(Block); } void addFunctionLine(StringRef Filename, uint32_t Line, const GCOVFunction *Function) { + if (Line > LineInfo[Filename].LastLine) + LineInfo[Filename].LastLine = Line; LineInfo[Filename].Functions[Line-1].push_back(Function); } void setRunCount(uint32_t Runs) { RunCount = Runs; } void setProgramCount(uint32_t Programs) { ProgramCount = Programs; } - void print(StringRef GCNOFile, StringRef GCDAFile); + void print(StringRef MainFilename, StringRef GCNOFile, StringRef GCDAFile); + private: - void printFunctionSummary(raw_fd_ostream &OS, + std::string getCoveragePath(StringRef Filename, StringRef MainFilename); + std::unique_ptr openCoveragePath(StringRef CoveragePath); + void printFunctionSummary(raw_ostream &OS, const FunctionVector &Funcs) const; - void printBlockInfo(raw_fd_ostream &OS, const GCOVBlock &Block, + void printBlockInfo(raw_ostream &OS, const GCOVBlock &Block, uint32_t LineIndex, uint32_t &BlockNo) const; - void printBranchInfo(raw_fd_ostream &OS, const GCOVBlock &Block, + void printBranchInfo(raw_ostream &OS, const GCOVBlock &Block, GCOVCoverage &Coverage, uint32_t &EdgeNo); - void printUncondBranchInfo(raw_fd_ostream &OS, uint32_t &EdgeNo, + void printUncondBranchInfo(raw_ostream &OS, uint32_t &EdgeNo, uint64_t Count) const; void printCoverage(const GCOVCoverage &Coverage) const; diff --git a/include/llvm/Support/GenericDomTree.h b/include/llvm/Support/GenericDomTree.h index 6878844..e344220 100644 --- 
a/include/llvm/Support/GenericDomTree.h +++ b/include/llvm/Support/GenericDomTree.h @@ -186,9 +186,9 @@ class DominatorTreeBase : public DominatorBase { assert(isReachableFromEntry(A)); const DomTreeNodeBase *IDom; - while ((IDom = B->getIDom()) != 0 && IDom != A && IDom != B) + while ((IDom = B->getIDom()) != nullptr && IDom != A && IDom != B) B = IDom; // Walk up the tree - return IDom != 0; + return IDom != nullptr; } protected: @@ -205,7 +205,7 @@ protected: unsigned Semi; NodeT *Label; - InfoRec() : DFSNum(0), Parent(0), Semi(0), Label(0) {} + InfoRec() : DFSNum(0), Parent(0), Semi(0), Label(nullptr) {} }; DenseMap IDoms; @@ -224,7 +224,7 @@ protected: IDoms.clear(); this->Roots.clear(); Vertex.clear(); - RootNode = 0; + RootNode = nullptr; } // NewBB is split and now it has one successor. Update dominator tree to @@ -260,7 +260,7 @@ protected: // Find NewBB's immediate dominator and create new dominator tree node for // NewBB. - NodeT *NewBBIDom = 0; + NodeT *NewBBIDom = nullptr; unsigned i = 0; for (i = 0; i < PredBlocks.size(); ++i) if (DT.isReachableFromEntry(PredBlocks[i])) { @@ -344,7 +344,7 @@ public: void getDescendants(NodeT *R, SmallVectorImpl &Result) const { Result.clear(); const DomTreeNodeBase *RN = getNode(R); - if (RN == NULL) + if (!RN) return; // If R is unreachable, it will not be present in the DOM tree. SmallVector *, 8> WL; WL.push_back(RN); @@ -361,7 +361,7 @@ public: /// bool properlyDominates(const DomTreeNodeBase *A, const DomTreeNodeBase *B) const { - if (A == 0 || B == 0) + if (!A || !B) return false; if (A == B) return false; @@ -453,6 +453,21 @@ public: DomTreeNodeBase *NodeA = getNode(A); DomTreeNodeBase *NodeB = getNode(B); + // If we have DFS info, then we can avoid all allocations by just querying + // it from each IDom. Note that because we call 'dominates' twice above, we + // expect to call through this code at most 16 times in a row without + // building valid DFS information. This is important as below is a *very* + // slow tree walk. + if (DFSInfoValid) { + DomTreeNodeBase *IDomA = NodeA->getIDom(); + while (IDomA) { + if (NodeB->DominatedBy(IDomA)) + return IDomA->getBlock(); + IDomA = IDomA->getIDom(); + } + return nullptr; + } + // Collect NodeA dominators set. SmallPtrSet*, 16> NodeADoms; NodeADoms.insert(NodeA); @@ -471,7 +486,7 @@ public: IDomB = IDomB->getIDom(); } - return NULL; + return nullptr; } const NodeT *findNearestCommonDominator(const NodeT *A, const NodeT *B) { @@ -489,7 +504,7 @@ public: /// creates a new node as a child of DomBB dominator node,linking it into /// the children list of the immediate dominator. DomTreeNodeBase *addNewBlock(NodeT *BB, NodeT *DomBB) { - assert(getNode(BB) == 0 && "Block already in dominator tree!"); + assert(getNode(BB) == nullptr && "Block already in dominator tree!"); DomTreeNodeBase *IDomNode = getNode(DomBB); assert(IDomNode && "Not immediate dominator specified for block!"); DFSInfoValid = false; @@ -636,7 +651,7 @@ protected: // immediate dominator. 
NodeT *IDom = getIDom(BB); - assert(IDom || this->DomTreeNodes[NULL]); + assert(IDom || this->DomTreeNodes[nullptr]); DomTreeNodeBase *IDomNode = getNodeForBlock(IDom); // Add a new tree node for this NodeT, and link it as a child of @@ -659,14 +674,14 @@ public: void recalculate(FT& F) { typedef GraphTraits TraitsTy; reset(); - this->Vertex.push_back(0); + this->Vertex.push_back(nullptr); if (!this->IsPostDominators) { // Initialize root NodeT *entry = TraitsTy::getEntryNode(&F); this->Roots.push_back(entry); - this->IDoms[entry] = 0; - this->DomTreeNodes[entry] = 0; + this->IDoms[entry] = nullptr; + this->DomTreeNodes[entry] = nullptr; Calculate(*this, F); } else { @@ -677,8 +692,8 @@ public: addRoot(I); // Prepopulate maps so that we don't get iterator invalidation issues later. - this->IDoms[I] = 0; - this->DomTreeNodes[I] = 0; + this->IDoms[I] = nullptr; + this->DomTreeNodes[I] = nullptr; } Calculate >(*this, F); diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h index f6bb8f4..bcba5e0 100644 --- a/include/llvm/Support/GenericDomTreeConstruction.h +++ b/include/llvm/Support/GenericDomTreeConstruction.h @@ -156,11 +156,11 @@ void Calculate(DominatorTreeBase::NodeType>& DT, bool MultipleRoots = (DT.Roots.size() > 1); if (MultipleRoots) { typename DominatorTreeBase::InfoRec &BBInfo = - DT.Info[NULL]; + DT.Info[nullptr]; BBInfo.DFSNum = BBInfo.Semi = ++N; - BBInfo.Label = NULL; + BBInfo.Label = nullptr; - DT.Vertex.push_back(NULL); // Vertex[n] = V; + DT.Vertex.push_back(nullptr); // Vertex[n] = V; } // Step #1: Number blocks in depth-first order and initialize variables used @@ -249,10 +249,10 @@ void Calculate(DominatorTreeBase::NodeType>& DT, // one exit block, or it may be the virtual exit (denoted by (BasicBlock *)0) // which postdominates all real exits if there are multiple exit blocks, or // an infinite loop. - typename GraphT::NodeType* Root = !MultipleRoots ? DT.Roots[0] : 0; + typename GraphT::NodeType* Root = !MultipleRoots ? DT.Roots[0] : nullptr; DT.DomTreeNodes[Root] = DT.RootNode = - new DomTreeNodeBase(Root, 0); + new DomTreeNodeBase(Root, nullptr); // Loop over all of the reachable blocks in the function... for (unsigned i = 2; i <= N; ++i) { @@ -263,7 +263,7 @@ void Calculate(DominatorTreeBase::NodeType>& DT, typename GraphT::NodeType* ImmDom = DT.getIDom(W); - assert(ImmDom || DT.DomTreeNodes[NULL]); + assert(ImmDom || DT.DomTreeNodes[nullptr]); // Get or calculate the node for the immediate dominator DomTreeNodeBase *IDomNode = diff --git a/include/llvm/Support/GraphWriter.h b/include/llvm/Support/GraphWriter.h index 62547dd..539673a 100644 --- a/include/llvm/Support/GraphWriter.h +++ b/include/llvm/Support/GraphWriter.h @@ -259,8 +259,8 @@ public: /// emitSimpleNode - Outputs a simple (non-record) node void emitSimpleNode(const void *ID, const std::string &Attr, - const std::string &Label, unsigned NumEdgeSources = 0, - const std::vector *EdgeSourceLabels = 0) { + const std::string &Label, unsigned NumEdgeSources = 0, + const std::vector *EdgeSourceLabels = nullptr) { O << "\tNode" << ID << "[ "; if (!Attr.empty()) O << Attr << ","; @@ -325,7 +325,10 @@ template std::string WriteGraph(const GraphType &G, const Twine &Name, bool ShortNames = false, const Twine &Title = "") { int FD; - std::string Filename = createGraphFilename(Name, FD); + // Windows can't always handle long paths, so limit the length of the name. 
+ std::string N = Name.str(); + N = N.substr(0, std::min(N.size(), 140)); + std::string Filename = createGraphFilename(N, FD); raw_fd_ostream O(FD, /*shouldClose=*/ true); if (FD == -1) { diff --git a/include/llvm/Support/LEB128.h b/include/llvm/Support/LEB128.h index 9ef5fe6..ea76c9b 100644 --- a/include/llvm/Support/LEB128.h +++ b/include/llvm/Support/LEB128.h @@ -77,7 +77,7 @@ inline unsigned encodeULEB128(uint64_t Value, uint8_t *p, /// Utility function to decode a ULEB128 value. -inline uint64_t decodeULEB128(const uint8_t *p, unsigned *n = 0) { +inline uint64_t decodeULEB128(const uint8_t *p, unsigned *n = nullptr) { const uint8_t *orig_p = p; uint64_t Value = 0; unsigned Shift = 0; diff --git a/include/llvm/Support/LineIterator.h b/include/llvm/Support/LineIterator.h index 7077656..2a58262 100644 --- a/include/llvm/Support/LineIterator.h +++ b/include/llvm/Support/LineIterator.h @@ -11,6 +11,7 @@ #define LLVM_SUPPORT_LINEITERATOR_H__ #include "llvm/ADT/StringRef.h" +#include "llvm/Support/DataTypes.h" #include namespace llvm { @@ -28,7 +29,7 @@ class MemoryBuffer; /// /// Note that this iterator requires the buffer to be nul terminated. class line_iterator - : public std::iterator { + : public std::iterator { const MemoryBuffer *Buffer; char CommentMarker; @@ -37,7 +38,7 @@ class line_iterator public: /// \brief Default construct an "end" iterator. - line_iterator() : Buffer(0) {} + line_iterator() : Buffer(nullptr) {} /// \brief Construct a new iterator around some memory buffer. explicit line_iterator(const MemoryBuffer &Buffer, char CommentMarker = '\0'); diff --git a/include/llvm/Support/LockFileManager.h b/include/llvm/Support/LockFileManager.h index 9df8675..523a781 100644 --- a/include/llvm/Support/LockFileManager.h +++ b/include/llvm/Support/LockFileManager.h @@ -40,6 +40,16 @@ public: LFS_Error }; + /// \brief Describes the result of waiting for the owner to release the lock. + enum WaitForUnlockResult { + /// \brief The lock was released successfully. + Res_Success, + /// \brief Owner died while holding the lock. + Res_OwnerDied, + /// \brief Reached timeout while waiting for the owner to release the lock. + Res_Timeout + }; + private: SmallString<128> FileName; SmallString<128> LockFileName; @@ -67,7 +77,7 @@ public: operator LockFileState() const { return getState(); } /// \brief For a shared lock, wait until the owner releases the lock. - void waitForUnlock(); + WaitForUnlockResult waitForUnlock(); }; } // end namespace llvm diff --git a/include/llvm/Support/MachO.h b/include/llvm/Support/MachO.h index ef06a41..2a0fc7b 100644 --- a/include/llvm/Support/MachO.h +++ b/include/llvm/Support/MachO.h @@ -153,27 +153,59 @@ namespace llvm { enum SectionType : uint32_t { // Constant masks for the "flags[7:0]" field in llvm::MachO::section and // llvm::MachO::section_64 (mask "flags" with SECTION_TYPE) + + /// S_REGULAR - Regular section. S_REGULAR = 0x00u, + /// S_ZEROFILL - Zero fill on demand section. S_ZEROFILL = 0x01u, + /// S_CSTRING_LITERALS - Section with literal C strings. S_CSTRING_LITERALS = 0x02u, + /// S_4BYTE_LITERALS - Section with 4 byte literals. S_4BYTE_LITERALS = 0x03u, + /// S_8BYTE_LITERALS - Section with 8 byte literals. S_8BYTE_LITERALS = 0x04u, + /// S_LITERAL_POINTERS - Section with pointers to literals. S_LITERAL_POINTERS = 0x05u, + /// S_NON_LAZY_SYMBOL_POINTERS - Section with non-lazy symbol pointers. S_NON_LAZY_SYMBOL_POINTERS = 0x06u, + /// S_LAZY_SYMBOL_POINTERS - Section with lazy symbol pointers. 
S_LAZY_SYMBOL_POINTERS = 0x07u, + /// S_SYMBOL_STUBS - Section with symbol stubs, byte size of stub in + /// the Reserved2 field. S_SYMBOL_STUBS = 0x08u, + /// S_MOD_INIT_FUNC_POINTERS - Section with only function pointers for + /// initialization. S_MOD_INIT_FUNC_POINTERS = 0x09u, + /// S_MOD_TERM_FUNC_POINTERS - Section with only function pointers for + /// termination. S_MOD_TERM_FUNC_POINTERS = 0x0au, + /// S_COALESCED - Section contains symbols that are to be coalesced. S_COALESCED = 0x0bu, + /// S_GB_ZEROFILL - Zero fill on demand section (that can be larger than 4 + /// gigabytes). S_GB_ZEROFILL = 0x0cu, + /// S_INTERPOSING - Section with only pairs of function pointers for + /// interposing. S_INTERPOSING = 0x0du, + /// S_16BYTE_LITERALS - Section with only 16 byte literals. S_16BYTE_LITERALS = 0x0eu, + /// S_DTRACE_DOF - Section contains DTrace Object Format. S_DTRACE_DOF = 0x0fu, + /// S_LAZY_DYLIB_SYMBOL_POINTERS - Section with lazy symbol pointers to + /// lazy loaded dylibs. S_LAZY_DYLIB_SYMBOL_POINTERS = 0x10u, + /// S_THREAD_LOCAL_REGULAR - Thread local data section. S_THREAD_LOCAL_REGULAR = 0x11u, + /// S_THREAD_LOCAL_ZEROFILL - Thread local zerofill section. S_THREAD_LOCAL_ZEROFILL = 0x12u, + /// S_THREAD_LOCAL_VARIABLES - Section with thread local variable + /// structure data. S_THREAD_LOCAL_VARIABLES = 0x13u, + /// S_THREAD_LOCAL_VARIABLE_POINTERS - Section with pointers to thread + /// local structures. S_THREAD_LOCAL_VARIABLE_POINTERS = 0x14u, + /// S_THREAD_LOCAL_INIT_FUNCTION_POINTERS - Section with thread local + /// variable initialization pointers to functions. S_THREAD_LOCAL_INIT_FUNCTION_POINTERS = 0x15u, LAST_KNOWN_SECTION_TYPE = S_THREAD_LOCAL_INIT_FUNCTION_POINTERS @@ -182,18 +214,34 @@ namespace llvm { enum : uint32_t { // Constant masks for the "flags[31:24]" field in llvm::MachO::section and // llvm::MachO::section_64 (mask "flags" with SECTION_ATTRIBUTES_USR) + + /// S_ATTR_PURE_INSTRUCTIONS - Section contains only true machine + /// instructions. S_ATTR_PURE_INSTRUCTIONS = 0x80000000u, + /// S_ATTR_NO_TOC - Section contains coalesced symbols that are not to be + /// in a ranlib table of contents. S_ATTR_NO_TOC = 0x40000000u, + /// S_ATTR_STRIP_STATIC_SYMS - Ok to strip static symbols in this section + /// in files with the MH_DYLDLINK flag. S_ATTR_STRIP_STATIC_SYMS = 0x20000000u, + /// S_ATTR_NO_DEAD_STRIP - No dead stripping. S_ATTR_NO_DEAD_STRIP = 0x10000000u, + /// S_ATTR_LIVE_SUPPORT - Blocks are live if they reference live blocks. S_ATTR_LIVE_SUPPORT = 0x08000000u, + /// S_ATTR_SELF_MODIFYING_CODE - Used with i386 code stubs written on by + /// dyld. S_ATTR_SELF_MODIFYING_CODE = 0x04000000u, + /// S_ATTR_DEBUG - A debug section. S_ATTR_DEBUG = 0x02000000u, // Constant masks for the "flags[23:8]" field in llvm::MachO::section and // llvm::MachO::section_64 (mask "flags" with SECTION_ATTRIBUTES_SYS) + + /// S_ATTR_SOME_INSTRUCTIONS - Section contains some machine instructions. S_ATTR_SOME_INSTRUCTIONS = 0x00000400u, + /// S_ATTR_EXT_RELOC - Section has external relocation entries. S_ATTR_EXT_RELOC = 0x00000200u, + /// S_ATTR_LOC_RELOC - Section has local relocation entries.
S_ATTR_LOC_RELOC = 0x00000100u, // Constant masks for the value of an indirect symbol in an indirect diff --git a/include/llvm/Support/ManagedStatic.h b/include/llvm/Support/ManagedStatic.h index 5587618..1bb8cea 100644 --- a/include/llvm/Support/ManagedStatic.h +++ b/include/llvm/Support/ManagedStatic.h @@ -47,7 +47,7 @@ protected: void RegisterManagedStatic(void *(*creator)(), void (*deleter)(void*)) const; public: /// isConstructed - Return true if this object has been created. - bool isConstructed() const { return Ptr != 0; } + bool isConstructed() const { return Ptr != nullptr; } void destroy() const; }; diff --git a/include/llvm/Support/Memory.h b/include/llvm/Support/Memory.h index 8251fcd..0996adb 100644 --- a/include/llvm/Support/Memory.h +++ b/include/llvm/Support/Memory.h @@ -28,7 +28,7 @@ namespace sys { /// @brief Memory block abstraction. class MemoryBlock { public: - MemoryBlock() : Address(0), Size(0) { } + MemoryBlock() : Address(nullptr), Size(0) { } MemoryBlock(void *addr, size_t size) : Address(addr), Size(size) { } void *base() const { return Address; } size_t size() const { return Size; } @@ -120,7 +120,7 @@ namespace sys { /// @brief Allocate Read/Write/Execute memory. static MemoryBlock AllocateRWX(size_t NumBytes, const MemoryBlock *NearBlock, - std::string *ErrMsg = 0); + std::string *ErrMsg = nullptr); /// This method releases a block of Read/Write/Execute memory that was /// allocated with the AllocateRWX method. It should not be used to @@ -129,7 +129,7 @@ namespace sys { /// On success, this returns false, otherwise it returns true and fills /// in *ErrMsg. /// @brief Release Read/Write/Execute memory. - static bool ReleaseRWX(MemoryBlock &block, std::string *ErrMsg = 0); + static bool ReleaseRWX(MemoryBlock &block, std::string *ErrMsg = nullptr); /// InvalidateInstructionCache - Before the JIT can run a block of code @@ -140,12 +140,12 @@ namespace sys { /// setExecutable - Before the JIT can run a block of code, it has to be /// given read and executable privilege. Return true if it is already r-x /// or the system is able to change its privilege. - static bool setExecutable(MemoryBlock &M, std::string *ErrMsg = 0); + static bool setExecutable(MemoryBlock &M, std::string *ErrMsg = nullptr); /// setWritable - When adding to a block of code, the JIT may need /// to mark a block of code as RW since the protections are on page /// boundaries, and the JIT internal allocations are not page aligned. - static bool setWritable(MemoryBlock &M, std::string *ErrMsg = 0); + static bool setWritable(MemoryBlock &M, std::string *ErrMsg = nullptr); /// setRangeExecutable - Mark the page containing a range of addresses /// as executable. diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h index 578c7e8..5810c47 100644 --- a/include/llvm/Support/MemoryBuffer.h +++ b/include/llvm/Support/MemoryBuffer.h @@ -24,7 +24,6 @@ namespace llvm { class error_code; -template class OwningPtr; /// MemoryBuffer - This interface provides simple read-only access to a block /// of memory, and provides simple methods for reading files and standard input @@ -67,34 +66,39 @@ public: /// MemoryBuffer if successful, otherwise returning null. If FileSize is /// specified, this means that the client knows that the file exists and that /// it has the specified size.
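///
/// A minimal call sequence, as a sketch:
///
/// \code
///   std::unique_ptr<MemoryBuffer> Buf;
///   if (error_code EC = MemoryBuffer::getFile("input.ll", Buf))
///     errs() << "cannot open input.ll: " << EC.message() << "\n";
/// \endcode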
- static error_code getFile(Twine Filename, OwningPtr &Result, - int64_t FileSize = -1, - bool RequiresNullTerminator = true); + /// + /// \param IsVolatileSize Set to true to indicate that the file size may be + /// changing, e.g. when libclang tries to parse while the user is + /// editing/updating the file. static error_code getFile(Twine Filename, std::unique_ptr &Result, int64_t FileSize = -1, - bool RequiresNullTerminator = true); + bool RequiresNullTerminator = true, + bool IsVolatileSize = false); /// Given an already-open file descriptor, map some slice of it into a /// MemoryBuffer. The slice is specified by an \p Offset and \p MapSize. /// Since this is in the middle of a file, the buffer is not null terminated. - static error_code getOpenFileSlice(int FD, const char *Filename, - OwningPtr &Result, - uint64_t MapSize, int64_t Offset); + /// + /// \param IsVolatileSize Set to true to indicate that the file size may be + /// changing, e.g. when libclang tries to parse while the user is + /// editing/updating the file. static error_code getOpenFileSlice(int FD, const char *Filename, std::unique_ptr &Result, - uint64_t MapSize, int64_t Offset); + uint64_t MapSize, int64_t Offset, + bool IsVolatileSize = false); /// Given an already-open file descriptor, read the file and return a /// MemoryBuffer. - static error_code getOpenFile(int FD, const char *Filename, - OwningPtr &Result, - uint64_t FileSize, - bool RequiresNullTerminator = true); + /// + /// \param IsVolatileSize Set to true to indicate that the file size may be + /// changing, e.g. when libclang tries to parse while the user is + /// editing/updating the file. static error_code getOpenFile(int FD, const char *Filename, std::unique_ptr &Result, uint64_t FileSize, - bool RequiresNullTerminator = true); + bool RequiresNullTerminator = true, + bool IsVolatileSize = false); /// getMemBuffer - Open the specified memory range as a MemoryBuffer. Note /// that InputData must be null terminated if RequiresNullTerminator is true. @@ -123,7 +127,6 @@ public: /// getSTDIN - Read all of stdin into a file buffer, and return it. /// If an error occurs, this returns null and sets ec. - static error_code getSTDIN(OwningPtr &Result); static error_code getSTDIN(std::unique_ptr &Result); @@ -131,9 +134,6 @@ public: /// if the Filename is "-". If an error occurs, this returns null and sets /// ec. static error_code getFileOrSTDIN(StringRef Filename, - OwningPtr &Result, - int64_t FileSize = -1); - static error_code getFileOrSTDIN(StringRef Filename, std::unique_ptr &Result, int64_t FileSize = -1); diff --git a/include/llvm/Support/OnDiskHashTable.h b/include/llvm/Support/OnDiskHashTable.h new file mode 100644 index 0000000..f6d43a4 --- /dev/null +++ b/include/llvm/Support/OnDiskHashTable.h @@ -0,0 +1,571 @@ +//===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief Defines facilities for reading and writing on-disk hash tables. 
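+///
+/// A typical round trip, as a sketch (ExampleInfo and ExampleLookupInfo are
+/// the user-supplied trait classes documented below; Out is a raw_ostream
+/// and Base points at the start of the emitted bytes):
+///
+/// \code
+/// OnDiskChainedHashTableGenerator<ExampleInfo> Generator;
+/// Generator.insert(Key, Data);
+/// uint32_t TableOffset = Generator.Emit(Out); // Out must not be at offset 0.
+///
+/// OnDiskChainedHashTable<ExampleLookupInfo> *Table =
+///     OnDiskChainedHashTable<ExampleLookupInfo>::Create(Base + TableOffset,
+///                                                       Base);
+/// auto It = Table->find(Key); // Caller owns and deletes Table.
+/// \endcode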
+/// +//===----------------------------------------------------------------------===// +#ifndef LLVM_SUPPORT_ON_DISK_HASH_TABLE_H +#define LLVM_SUPPORT_ON_DISK_HASH_TABLE_H + +#include "llvm/Support/Allocator.h" +#include "llvm/Support/AlignOf.h" +#include "llvm/Support/DataTypes.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +namespace llvm { + +/// \brief Generates an on disk hash table. +/// +/// This needs an \c Info that handles storing values into the hash table's +/// payload and computes the hash for a given key. This should provide the +/// following interface: +/// +/// \code +/// class ExampleInfo { +/// public: +/// typedef ExampleKey key_type; // Must be copy constructible +/// typedef ExampleKey &key_type_ref; +/// typedef ExampleData data_type; // Must be copy constructible +/// typedef ExampleData &data_type_ref; +/// typedef uint32_t hash_value_type; // The type the hash function returns. +/// typedef uint32_t offset_type; // The type for offsets into the table. +/// +/// /// Calculate the hash for Key +/// static hash_value_type ComputeHash(key_type_ref Key); +/// /// Return the lengths, in bytes, of the given Key/Data pair. +/// static std::pair +/// EmitKeyDataLength(raw_ostream &Out, key_type_ref Key, data_type_ref Data); +/// /// Write Key to Out. KeyLen is the length from EmitKeyDataLength. +/// static void EmitKey(raw_ostream &Out, key_type_ref Key, +/// offset_type KeyLen); +/// /// Write Data to Out. DataLen is the length from EmitKeyDataLength. +/// static void EmitData(raw_ostream &Out, key_type_ref Key, +/// data_type_ref Data, offset_type DataLen); +/// }; +/// \endcode +template class OnDiskChainedHashTableGenerator { + /// \brief A single item in the hash table. + class Item { + public: + typename Info::key_type Key; + typename Info::data_type Data; + Item *Next; + const typename Info::hash_value_type Hash; + + Item(typename Info::key_type_ref Key, typename Info::data_type_ref Data, + Info &InfoObj) + : Key(Key), Data(Data), Next(nullptr), Hash(InfoObj.ComputeHash(Key)) {} + }; + + typedef typename Info::offset_type offset_type; + offset_type NumBuckets; + offset_type NumEntries; + llvm::SpecificBumpPtrAllocator BA; + + /// \brief A linked list of values in a particular hash bucket. + class Bucket { + public: + offset_type Off; + Item *Head; + unsigned Length; + + Bucket() {} + }; + + Bucket *Buckets; + +private: + /// \brief Insert an item into the appropriate hash bucket. + void insert(Bucket *Buckets, size_t Size, Item *E) { + Bucket &B = Buckets[E->Hash & (Size - 1)]; + E->Next = B.Head; + ++B.Length; + B.Head = E; + } + + /// \brief Resize the hash table, moving the old entries into the new buckets. + void resize(size_t NewSize) { + Bucket *NewBuckets = (Bucket *)std::calloc(NewSize, sizeof(Bucket)); + // Populate NewBuckets with the old entries. + for (size_t I = 0; I < NumBuckets; ++I) + for (Item *E = Buckets[I].Head; E;) { + Item *N = E->Next; + E->Next = nullptr; + insert(NewBuckets, NewSize, E); + E = N; + } + + free(Buckets); + NumBuckets = NewSize; + Buckets = NewBuckets; + } + +public: + /// \brief Insert an entry into the table. + void insert(typename Info::key_type_ref Key, + typename Info::data_type_ref Data) { + Info InfoObj; + insert(Key, Data, InfoObj); + } + + /// \brief Insert an entry into the table. + /// + /// Uses the provided Info instead of a stack allocated one. 
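+ ///
+ /// The table is grown once the load factor reaches 3/4: when
+ /// 4 * NumEntries >= 3 * NumBuckets, the bucket count is doubled.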
+ void insert(typename Info::key_type_ref Key, + typename Info::data_type_ref Data, Info &InfoObj) { + + ++NumEntries; + if (4 * NumEntries >= 3 * NumBuckets) + resize(NumBuckets * 2); + insert(Buckets, NumBuckets, new (BA.Allocate()) Item(Key, Data, InfoObj)); + } + + /// \brief Emit the table to Out, which must not be at offset 0. + offset_type Emit(raw_ostream &Out) { + Info InfoObj; + return Emit(Out, InfoObj); + } + + /// \brief Emit the table to Out, which must not be at offset 0. + /// + /// Uses the provided Info instead of a stack allocated one. + offset_type Emit(raw_ostream &Out, Info &InfoObj) { + using namespace llvm::support; + endian::Writer LE(Out); + + // Emit the payload of the table. + for (offset_type I = 0; I < NumBuckets; ++I) { + Bucket &B = Buckets[I]; + if (!B.Head) + continue; + + // Store the offset for the data of this bucket. + B.Off = Out.tell(); + assert(B.Off && "Cannot write a bucket at offset 0. Please add padding."); + + // Write out the number of items in the bucket. + LE.write(B.Length); + assert(B.Length != 0 && "Bucket has a head but zero length?"); + + // Write out the entries in the bucket. + for (Item *I = B.Head; I; I = I->Next) { + LE.write(I->Hash); + const std::pair &Len = + InfoObj.EmitKeyDataLength(Out, I->Key, I->Data); + InfoObj.EmitKey(Out, I->Key, Len.first); + InfoObj.EmitData(Out, I->Key, I->Data, Len.second); + } + } + + // Pad with zeros so that we can start the hashtable at an aligned address. + offset_type TableOff = Out.tell(); + uint64_t N = llvm::OffsetToAlignment(TableOff, alignOf()); + TableOff += N; + while (N--) + LE.write(0); + + // Emit the hashtable itself. + LE.write(NumBuckets); + LE.write(NumEntries); + for (offset_type I = 0; I < NumBuckets; ++I) + LE.write(Buckets[I].Off); + + return TableOff; + } + + OnDiskChainedHashTableGenerator() { + NumEntries = 0; + NumBuckets = 64; + // Note that we do not need to run the constructors of the individual + // Bucket objects since 'calloc' returns bytes that are all 0. + Buckets = (Bucket *)std::calloc(NumBuckets, sizeof(Bucket)); + } + + ~OnDiskChainedHashTableGenerator() { std::free(Buckets); } +}; + +/// \brief Provides lookup on an on disk hash table. +/// +/// This needs an \c Info that handles reading values from the hash table's +/// payload and computes the hash for a given key. This should provide the +/// following interface: +/// +/// \code +/// class ExampleLookupInfo { +/// public: +/// typedef ExampleData data_type; +/// typedef ExampleInternalKey internal_key_type; // The stored key type. +/// typedef ExampleKey external_key_type; // The type to pass to find(). +/// typedef uint32_t hash_value_type; // The type the hash function returns. +/// typedef uint32_t offset_type; // The type for offsets into the table. +/// +/// /// Compare two keys for equality. +/// static bool EqualKey(internal_key_type &Key1, internal_key_type &Key2); +/// /// Calculate the hash for the given key. +/// static hash_value_type ComputeHash(internal_key_type &IKey); +/// /// Translate from the semantic type of a key in the hash table to the +/// /// type that is actually stored and used for hashing and comparisons. +/// /// The internal and external types are often the same, in which case this +/// /// can simply return the passed in value. +/// static const internal_key_type &GetInternalKey(external_key_type &EKey); +/// /// Read the key and data length from Buffer, leaving it pointing at the +/// /// following byte. 
+/// static std::pair +/// ReadKeyDataLength(const unsigned char *&Buffer); +/// /// Read the key from Buffer, given the KeyLen as reported from +/// /// ReadKeyDataLength. +/// const internal_key_type &ReadKey(const unsigned char *Buffer, +/// offset_type KeyLen); +/// /// Read the data for Key from Buffer, given the DataLen as reported from +/// /// ReadKeyDataLength. +/// data_type ReadData(StringRef Key, const unsigned char *Buffer, +/// offset_type DataLen); +/// }; +/// \endcode +template class OnDiskChainedHashTable { + const typename Info::offset_type NumBuckets; + const typename Info::offset_type NumEntries; + const unsigned char *const Buckets; + const unsigned char *const Base; + Info InfoObj; + +public: + typedef typename Info::internal_key_type internal_key_type; + typedef typename Info::external_key_type external_key_type; + typedef typename Info::data_type data_type; + typedef typename Info::hash_value_type hash_value_type; + typedef typename Info::offset_type offset_type; + + OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries, + const unsigned char *Buckets, + const unsigned char *Base, + const Info &InfoObj = Info()) + : NumBuckets(NumBuckets), NumEntries(NumEntries), Buckets(Buckets), + Base(Base), InfoObj(InfoObj) { + assert((reinterpret_cast(Buckets) & 0x3) == 0 && + "'buckets' must have a 4-byte alignment"); + } + + offset_type getNumBuckets() const { return NumBuckets; } + offset_type getNumEntries() const { return NumEntries; } + const unsigned char *getBase() const { return Base; } + const unsigned char *getBuckets() const { return Buckets; } + + bool isEmpty() const { return NumEntries == 0; } + + class iterator { + internal_key_type Key; + const unsigned char *const Data; + const offset_type Len; + Info *InfoObj; + + public: + iterator() : Data(nullptr), Len(0) {} + iterator(const internal_key_type K, const unsigned char *D, offset_type L, + Info *InfoObj) + : Key(K), Data(D), Len(L), InfoObj(InfoObj) {} + + data_type operator*() const { return InfoObj->ReadData(Key, Data, Len); } + bool operator==(const iterator &X) const { return X.Data == Data; } + bool operator!=(const iterator &X) const { return X.Data != Data; } + }; + + /// \brief Look up the stored data for a particular key. + iterator find(const external_key_type &EKey, Info *InfoPtr = 0) { + if (!InfoPtr) + InfoPtr = &InfoObj; + + using namespace llvm::support; + const internal_key_type &IKey = InfoObj.GetInternalKey(EKey); + hash_value_type KeyHash = InfoObj.ComputeHash(IKey); + + // Each bucket is just an offset into the hash table file. + offset_type Idx = KeyHash & (NumBuckets - 1); + const unsigned char *Bucket = Buckets + sizeof(offset_type) * Idx; + + offset_type Offset = endian::readNext(Bucket); + if (Offset == 0) + return iterator(); // Empty bucket. + const unsigned char *Items = Base + Offset; + + // 'Items' starts with a 16-bit unsigned integer representing the + // number of items in this bucket. + unsigned Len = endian::readNext(Items); + + for (unsigned i = 0; i < Len; ++i) { + // Read the hash. + hash_value_type ItemHash = + endian::readNext(Items); + + // Determine the length of the key and the data. + const std::pair &L = + Info::ReadKeyDataLength(Items); + offset_type ItemLen = L.first + L.second; + + // Compare the hashes. If they are not the same, skip the entry entirely. + if (ItemHash != KeyHash) { + Items += ItemLen; + continue; + } + + // Read the key. 
+      const internal_key_type &X =
+          InfoPtr->ReadKey((const unsigned char *const)Items, L.first);
+
+      // If the key doesn't match just skip reading the value.
+      if (!InfoPtr->EqualKey(X, IKey)) {
+        Items += ItemLen;
+        continue;
+      }
+
+      // The key matches!
+      return iterator(X, Items + L.first, L.second, InfoPtr);
+    }
+
+    return iterator();
+  }
+
+  iterator end() const { return iterator(); }
+
+  Info &getInfoObj() { return InfoObj; }
+
+  /// \brief Create the hash table.
+  ///
+  /// \param Buckets is the beginning of the hash table itself, which follows
+  /// the payload of entire structure. This is the value returned by
+  /// OnDiskHashTableGenerator::Emit.
+  ///
+  /// \param Base is the point from which all offsets into the structure are
+  /// based. This is offset 0 in the stream that was used when Emitting the
+  /// table.
+  static OnDiskChainedHashTable *Create(const unsigned char *Buckets,
+                                        const unsigned char *const Base,
+                                        const Info &InfoObj = Info()) {
+    using namespace llvm::support;
+    assert(Buckets > Base);
+    assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
+           "buckets should be 4-byte aligned.");
+
+    offset_type NumBuckets =
+        endian::readNext<offset_type, little, unaligned>(Buckets);
+    offset_type NumEntries =
+        endian::readNext<offset_type, little, unaligned>(Buckets);
+    return new OnDiskChainedHashTable<Info>(NumBuckets, NumEntries, Buckets,
+                                            Base, InfoObj);
+  }
+};
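[Editorial sketch, under the same hypothetical MyInfo as above: given the
offset returned by Emit() and a pointer Start at offset 0 of the emitted
bytes, a reader might do the following. SomeKey is an assumed external key.]

    typedef llvm::OnDiskChainedHashTable<MyInfo> Table;
    std::unique_ptr<Table> T(Table::Create(Start + BucketOffset, Start));
    Table::iterator It = T->find(SomeKey);
    if (It != T->end()) {
      MyInfo::data_type D = *It; // Deserialized via MyInfo::ReadData.
    }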
+
+/// \brief Provides lookup and iteration over an on disk hash table.
+///
+/// \copydetails llvm::OnDiskChainedHashTable
+template <typename Info>
+class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> {
+  const unsigned char *Payload;
+
+public:
+  typedef OnDiskChainedHashTable<Info> base_type;
+  typedef typename base_type::internal_key_type internal_key_type;
+  typedef typename base_type::external_key_type external_key_type;
+  typedef typename base_type::data_type data_type;
+  typedef typename base_type::hash_value_type hash_value_type;
+  typedef typename base_type::offset_type offset_type;
+
+  OnDiskIterableChainedHashTable(offset_type NumBuckets, offset_type NumEntries,
+                                 const unsigned char *Buckets,
+                                 const unsigned char *Payload,
+                                 const unsigned char *Base,
+                                 const Info &InfoObj = Info())
+      : base_type(NumBuckets, NumEntries, Buckets, Base, InfoObj),
+        Payload(Payload) {}
+
+  /// \brief Iterates over all of the keys in the table.
+  class key_iterator {
+    const unsigned char *Ptr;
+    offset_type NumItemsInBucketLeft;
+    offset_type NumEntriesLeft;
+    Info *InfoObj;
+
+  public:
+    typedef external_key_type value_type;
+
+    key_iterator(const unsigned char *const Ptr, offset_type NumEntries,
+                 Info *InfoObj)
+        : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
+          InfoObj(InfoObj) {}
+    key_iterator()
+        : Ptr(nullptr), NumItemsInBucketLeft(0), NumEntriesLeft(0),
+          InfoObj(0) {}
+
+    friend bool operator==(const key_iterator &X, const key_iterator &Y) {
+      return X.NumEntriesLeft == Y.NumEntriesLeft;
+    }
+    friend bool operator!=(const key_iterator &X, const key_iterator &Y) {
+      return X.NumEntriesLeft != Y.NumEntriesLeft;
+    }
+
+    key_iterator &operator++() { // Preincrement
+      using namespace llvm::support;
+      if (!NumItemsInBucketLeft) {
+        // 'Items' starts with a 16-bit unsigned integer representing the
+        // number of items in this bucket.
+        NumItemsInBucketLeft =
+            endian::readNext<uint16_t, little, unaligned>(Ptr);
+      }
+      Ptr += sizeof(hash_value_type); // Skip the hash.
+      // Determine the length of the key and the data.
+      const std::pair<offset_type, offset_type> &L =
+          Info::ReadKeyDataLength(Ptr);
+      Ptr += L.first + L.second;
+      assert(NumItemsInBucketLeft);
+      --NumItemsInBucketLeft;
+      assert(NumEntriesLeft);
+      --NumEntriesLeft;
+      return *this;
+    }
+    key_iterator operator++(int) { // Postincrement
+      key_iterator tmp = *this; ++*this; return tmp;
+    }
+
+    value_type operator*() const {
+      const unsigned char *LocalPtr = Ptr;
+      if (!NumItemsInBucketLeft)
+        LocalPtr += 2; // number of items in bucket
+      LocalPtr += sizeof(hash_value_type); // Skip the hash.
+
+      // Determine the length of the key and the data.
+      const std::pair<offset_type, offset_type> &L =
+          Info::ReadKeyDataLength(LocalPtr);
+
+      // Read the key.
+      const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first);
+      return InfoObj->GetExternalKey(Key);
+    }
+  };
+
+  key_iterator key_begin() {
+    return key_iterator(Payload, this->getNumEntries(), &this->getInfoObj());
+  }
+  key_iterator key_end() { return key_iterator(); }
+
+  iterator_range<key_iterator> keys() {
+    return make_range(key_begin(), key_end());
+  }
+
+  /// \brief Iterates over all the entries in the table, returning the data.
+  class data_iterator {
+    const unsigned char *Ptr;
+    offset_type NumItemsInBucketLeft;
+    offset_type NumEntriesLeft;
+    Info *InfoObj;
+
+  public:
+    typedef data_type value_type;
+
+    data_iterator(const unsigned char *const Ptr, offset_type NumEntries,
+                  Info *InfoObj)
+        : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
+          InfoObj(InfoObj) {}
+    data_iterator()
+        : Ptr(nullptr), NumItemsInBucketLeft(0), NumEntriesLeft(0),
+          InfoObj(nullptr) {}
+
+    bool operator==(const data_iterator &X) const {
+      return X.NumEntriesLeft == NumEntriesLeft;
+    }
+    bool operator!=(const data_iterator &X) const {
+      return X.NumEntriesLeft != NumEntriesLeft;
+    }
+
+    data_iterator &operator++() { // Preincrement
+      using namespace llvm::support;
+      if (!NumItemsInBucketLeft) {
+        // 'Items' starts with a 16-bit unsigned integer representing the
+        // number of items in this bucket.
+        NumItemsInBucketLeft =
+            endian::readNext<uint16_t, little, unaligned>(Ptr);
+      }
+      Ptr += sizeof(hash_value_type); // Skip the hash.
+      // Determine the length of the key and the data.
+      const std::pair<offset_type, offset_type> &L =
+          Info::ReadKeyDataLength(Ptr);
+      Ptr += L.first + L.second;
+      assert(NumItemsInBucketLeft);
+      --NumItemsInBucketLeft;
+      assert(NumEntriesLeft);
+      --NumEntriesLeft;
+      return *this;
+    }
+    data_iterator operator++(int) { // Postincrement
+      data_iterator tmp = *this; ++*this; return tmp;
+    }
+
+    value_type operator*() const {
+      const unsigned char *LocalPtr = Ptr;
+      if (!NumItemsInBucketLeft)
+        LocalPtr += 2; // number of items in bucket
+      LocalPtr += sizeof(hash_value_type); // Skip the hash.
+
+      // Determine the length of the key and the data.
+      const std::pair<offset_type, offset_type> &L =
+          Info::ReadKeyDataLength(LocalPtr);
+
+      // Read the key.
+      const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first);
+      return InfoObj->ReadData(Key, LocalPtr + L.first, L.second);
+    }
+  };
+
+  data_iterator data_begin() {
+    return data_iterator(Payload, this->getNumEntries(), &this->getInfoObj());
+  }
+  data_iterator data_end() { return data_iterator(); }
+
+  iterator_range<data_iterator> data() {
+    return make_range(data_begin(), data_end());
+  }
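[Editorial sketch of whole-table traversal with the iterable variant; note
that keys() appears to additionally require the Info type to supply a
GetExternalKey translation, which the plain lookup interface above does not
list. IterTable and the consume* callbacks are hypothetical.]

    // IterTable obtained from OnDiskIterableChainedHashTable<MyInfo>::Create.
    for (MyInfo::external_key_type K : IterTable->keys())
      consumeKey(K);  // Visit every key without deserializing the data.
    for (MyInfo::data_type D : IterTable->data())
      consumeData(D); // Visit every entry's deserialized data.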
+
+  /// \brief Create the hash table.
+  ///
+  /// \param Buckets is the beginning of the hash table itself, which follows
+  /// the payload of entire structure. This is the value returned by
+  /// OnDiskHashTableGenerator::Emit.
+  ///
+  /// \param Payload is the beginning of the data contained in the table. This
+  /// is Base plus any padding or header data that was stored, ie, the offset
+  /// that the stream was at when calling Emit.
+  ///
+  /// \param Base is the point from which all offsets into the structure are
+  /// based. This is offset 0 in the stream that was used when Emitting the
+  /// table.
+  static OnDiskIterableChainedHashTable *
+  Create(const unsigned char *Buckets, const unsigned char *const Payload,
+         const unsigned char *const Base, const Info &InfoObj = Info()) {
+    using namespace llvm::support;
+    assert(Buckets > Base);
+    assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
+           "buckets should be 4-byte aligned.");
+
+    offset_type NumBuckets =
+        endian::readNext<offset_type, little, unaligned>(Buckets);
+    offset_type NumEntries =
+        endian::readNext<offset_type, little, unaligned>(Buckets);
+    return new OnDiskIterableChainedHashTable<Info>(
+        NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj);
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_ON_DISK_HASH_TABLE_H
diff --git a/include/llvm/Support/Path.h b/include/llvm/Support/Path.h
index ba18529..cf821f0 100644
--- a/include/llvm/Support/Path.h
+++ b/include/llvm/Support/Path.h
@@ -295,6 +295,11 @@ const StringRef extension(StringRef path);
 /// @result true if \a value is a path separator character on the host OS
 bool is_separator(char value);
 
+/// @brief Return the preferred separator for this platform.
+///
+/// @result StringRef of the preferred separator, null-terminated.
+const StringRef get_separator();
+
 /// @brief Get the typical temporary directory for the system, e.g.,
 /// "/var/tmp" or "C:/TEMP"
 ///
diff --git a/include/llvm/Support/Program.h b/include/llvm/Support/Program.h
index a1067a6..9160b7d 100644
--- a/include/llvm/Support/Program.h
+++ b/include/llvm/Support/Program.h
@@ -87,11 +87,11 @@ struct ProcessInfo {
       const char **args, ///< A vector of strings that are passed to the
       ///< program.  The first element should be the name of the program.
       ///< The list *must* be terminated by a null char* entry.
-      const char **env = 0, ///< An optional vector of strings to use for
+      const char **env = nullptr, ///< An optional vector of strings to use for
       ///< the program's environment. If not provided, the current program's
       ///< environment will be used.
-      const StringRef **redirects = 0, ///< An optional array of pointers to
-      ///< paths. If the array is null, no redirection is done. The array
+      const StringRef **redirects = nullptr, ///< An optional array of pointers
+      ///< to paths. If the array is null, no redirection is done. The array
       ///< should have a size of at least three. The inferior process's
       ///< stdin(0), stdout(1), and stderr(2) will be redirected to the
       ///< corresponding paths.
@@ -107,11 +107,11 @@ struct ProcessInfo {
       ///< of memory can be allocated by process. If memory usage will be
      ///< higher limit, the child is killed and this call returns. If zero
       ///< - no memory limit.
-      std::string *ErrMsg = 0, ///< If non-zero, provides a pointer to a string
-      ///< instance in which error messages will be returned. If the string
-      ///< is non-empty upon return an error occurred while invoking the
+      std::string *ErrMsg = nullptr, ///< If non-zero, provides a pointer to a
+      ///< string instance in which error messages will be returned. If the
+      ///< string is non-empty upon return an error occurred while invoking the
       ///< program.
-      bool *ExecutionFailed = 0);
+      bool *ExecutionFailed = nullptr);
 
   /// Similar to ExecuteAndWait, but returns immediately.
   /// @returns The \see ProcessInfo of the newly launced process.
@@ -119,9 +119,9 @@ struct ProcessInfo {
   /// Wait until the process finished execution or win32 CloseHandle() API on
   /// ProcessInfo.ProcessHandle to avoid memory leaks.
   ProcessInfo
-  ExecuteNoWait(StringRef Program, const char **args, const char **env = 0,
-                const StringRef **redirects = 0, unsigned memoryLimit = 0,
-                std::string *ErrMsg = 0, bool *ExecutionFailed = 0);
+  ExecuteNoWait(StringRef Program, const char **args, const char **env = nullptr,
+                const StringRef **redirects = nullptr, unsigned memoryLimit = 0,
+                std::string *ErrMsg = nullptr, bool *ExecutionFailed = nullptr);
 
   /// Return true if the given arguments fit within system-specific
   /// argument length limits.
@@ -142,9 +142,9 @@ struct ProcessInfo {
       ///< will perform a non-blocking wait on the child process.
       bool WaitUntilTerminates, ///< If true, ignores \p SecondsToWait and waits
       ///< until child has terminated.
-      std::string *ErrMsg = 0 ///< If non-zero, provides a pointer to a string
-      ///< instance in which error messages will be returned. If the string
-      ///< is non-empty upon return an error occurred while invoking the
+      std::string *ErrMsg = nullptr ///< If non-zero, provides a pointer to a
+      ///< string instance in which error messages will be returned. If the
+      ///< string is non-empty upon return an error occurred while invoking the
       ///< program.
       );
 }
diff --git a/include/llvm/Support/Regex.h b/include/llvm/Support/Regex.h
index 2eea369..bf533ca 100644
--- a/include/llvm/Support/Regex.h
+++ b/include/llvm/Support/Regex.h
@@ -55,7 +55,7 @@ namespace llvm {
     Regex(Regex &&regex) {
       preg = regex.preg;
       error = regex.error;
-      regex.preg = NULL;
+      regex.preg = nullptr;
     }
 
     ~Regex();
@@ -75,7 +75,7 @@ namespace llvm {
     /// the first group is always the entire pattern.
     ///
     /// This returns true on a successful match.
-    bool match(StringRef String, SmallVectorImpl<StringRef> *Matches = 0);
+    bool match(StringRef String, SmallVectorImpl<StringRef> *Matches = nullptr);
 
     /// sub - Return the result of replacing the first match of the regex in
     /// \p String with the \p Repl string. Backreferences like "\0" in the
@@ -87,7 +87,8 @@ namespace llvm {
     /// \param Error If non-null, any errors in the substitution (invalid
     /// backreferences, trailing backslashes) will be recorded as a non-empty
     /// string.
-    std::string sub(StringRef Repl, StringRef String, std::string *Error = 0);
+    std::string sub(StringRef Repl, StringRef String,
+                    std::string *Error = nullptr);
 
     /// \brief If this function returns true, ^Str$ is an extended regular
     /// expression that matches Str and only Str.
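[Editorial sketch of the Regex calls these new nullptr defaults apply to; the
pattern and input are invented for the example.]

    llvm::Regex RE("^([a-z]+)=([0-9]+)$");
    llvm::SmallVector<llvm::StringRef, 3> Matches;
    if (RE.match("answer=42", &Matches)) {
      // Matches[0] is the full match; Matches[1] and Matches[2] are groups.
    }
    std::string Name = RE.sub("\\1", "answer=42"); // Error arg defaulted.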
diff --git a/include/llvm/Support/Registry.h b/include/llvm/Support/Registry.h
index 073becd..b0c2e89 100644
--- a/include/llvm/Support/Registry.h
+++ b/include/llvm/Support/Registry.h
@@ -14,24 +14,27 @@
 #ifndef LLVM_SUPPORT_REGISTRY_H
 #define LLVM_SUPPORT_REGISTRY_H
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Compiler.h"
 
+#include <memory>
+
 namespace llvm {
   /// A simple registry entry which provides only a name, description, and
   /// no-argument constructor.
   template <typename T>
   class SimpleRegistryEntry {
     const char *Name, *Desc;
-    T *(*Ctor)();
+    std::unique_ptr<T> (*Ctor)();
 
   public:
-    SimpleRegistryEntry(const char *N, const char *D, T *(*C)())
+    SimpleRegistryEntry(const char *N, const char *D, std::unique_ptr<T> (*C)())
       : Name(N), Desc(D), Ctor(C) {}
 
     const char *getName() const { return Name; }
     const char *getDesc() const { return Desc; }
-    T *instantiate() const { return Ctor(); }
+    std::unique_ptr<T> instantiate() const { return Ctor(); }
   };
 
 
@@ -88,7 +91,7 @@ namespace llvm {
       const entry& Val;
 
     public:
-      node(const entry& V) : Next(0), Val(V) {
+      node(const entry& V) : Next(nullptr), Val(V) {
         if (Tail)
           Tail->Next = this;
         else
@@ -116,7 +119,7 @@ namespace llvm {
     };
 
     static iterator begin() { return iterator(Head); }
-    static iterator end()   { return iterator(0); }
+    static iterator end()   { return iterator(nullptr); }
 
 
     /// Abstract base class for registry listeners, which are informed when new
@@ -195,7 +198,7 @@ namespace llvm {
       entry Entry;
      node Node;
 
-      static T *CtorFn() { return new V(); }
+      static std::unique_ptr<T> CtorFn() { return make_unique<V>(); }
 
     public:
       Add(const char *Name, const char *Desc)
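[Editorial sketch of what this change means for Registry clients; ParserBase
and MyParser are hypothetical. Registration is unchanged, but instantiate()
now hands back an owning pointer.]

    // Registration, typically at file scope in the plugin:
    static llvm::Registry<ParserBase>::Add<MyParser>
        X("my-parser", "An example parser");

    // Instantiation: the caller now owns the object via unique_ptr.
    typedef llvm::Registry<ParserBase> ParserRegistry;
    for (ParserRegistry::iterator I = ParserRegistry::begin(),
                                  E = ParserRegistry::end();
         I != E; ++I) {
      std::unique_ptr<ParserBase> P = I->instantiate();
    }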
diff --git a/include/llvm/Support/SMLoc.h b/include/llvm/Support/SMLoc.h
index 0906471..d5b4c57 100644
--- a/include/llvm/Support/SMLoc.h
+++ b/include/llvm/Support/SMLoc.h
@@ -23,9 +23,9 @@ namespace llvm {
 class SMLoc {
   const char *Ptr;
 public:
-  SMLoc() : Ptr(0) {}
+  SMLoc() : Ptr(nullptr) {}
 
-  bool isValid() const { return Ptr != 0; }
+  bool isValid() const { return Ptr != nullptr; }
 
   bool operator==(const SMLoc &RHS) const { return RHS.Ptr == Ptr; }
   bool operator!=(const SMLoc &RHS) const { return RHS.Ptr != Ptr; }
diff --git a/include/llvm/Support/SaveAndRestore.h b/include/llvm/Support/SaveAndRestore.h
index 6330bec..ef154ac 100644
--- a/include/llvm/Support/SaveAndRestore.h
+++ b/include/llvm/Support/SaveAndRestore.h
@@ -6,10 +6,11 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This file provides utility classes that uses RAII to save and restore
-// values.
-//
+///
+/// \file
+/// This file provides utility classes that use RAII to save and restore
+/// values.
+///
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_SUPPORT_SAVEANDRESTORE_H
@@ -17,31 +18,32 @@
 
 namespace llvm {
 
-// SaveAndRestore - A utility class that uses RAII to save and restore
-// the value of a variable.
-template<typename T>
-struct SaveAndRestore {
-  SaveAndRestore(T& x) : X(x), old_value(x) {}
-  SaveAndRestore(T& x, const T &new_value) : X(x), old_value(x) {
-    X = new_value;
+/// A utility class that uses RAII to save and restore the value of a variable.
+template <typename T> struct SaveAndRestore {
+  SaveAndRestore(T &X) : X(X), OldValue(X) {}
+  SaveAndRestore(T &X, const T &NewValue) : X(X), OldValue(X) {
+    X = NewValue;
   }
-  ~SaveAndRestore() { X = old_value; }
-  T get() { return old_value; }
+  ~SaveAndRestore() { X = OldValue; }
+  T get() { return OldValue; }
+
 private:
-  T& X;
-  T old_value;
+  T &X;
+  T OldValue;
 };
 
-// SaveOr - Similar to SaveAndRestore.  Operates only on bools; the old
-// value of a variable is saved, and during the dstor the old value is
-// or'ed with the new value.
+/// Similar to \c SaveAndRestore.  Operates only on bools; the old value of a
+/// variable is saved, and during the dstor the old value is or'ed with the new
+/// value.
 struct SaveOr {
-  SaveOr(bool& x) : X(x), old_value(x) { x = false; }
-  ~SaveOr() { X |= old_value; }
+  SaveOr(bool &X) : X(X), OldValue(X) { X = false; }
+  ~SaveOr() { X |= OldValue; }
+
 private:
-  bool& X;
-  const bool old_value;
+  bool &X;
+  const bool OldValue;
 };
 
-}
+} // namespace llvm
+
 #endif
diff --git a/include/llvm/Support/Signals.h b/include/llvm/Support/Signals.h
index 58ed175..6cbc1f6 100644
--- a/include/llvm/Support/Signals.h
+++ b/include/llvm/Support/Signals.h
@@ -28,7 +28,7 @@ namespace sys {
   /// This function registers signal handlers to ensure that if a signal gets
   /// delivered that the named file is removed.
   /// @brief Remove a file if a fatal signal occurs.
-  bool RemoveFileOnSignal(StringRef Filename, std::string* ErrMsg = 0);
+  bool RemoveFileOnSignal(StringRef Filename, std::string* ErrMsg = nullptr);
 
   /// This function removes a file from the list of files to be removed on
   /// signal delivery.
diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index dd48974..39f896d 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h
@@ -71,7 +71,8 @@ private:
   SourceMgr(const SourceMgr&) LLVM_DELETED_FUNCTION;
   void operator=(const SourceMgr&) LLVM_DELETED_FUNCTION;
 public:
-  SourceMgr() : LineNoCache(0), DiagHandler(0), DiagContext(0) {}
+  SourceMgr()
+      : LineNoCache(nullptr), DiagHandler(nullptr), DiagContext(nullptr) {}
   ~SourceMgr();
 
   void setIncludeDirs(const std::vector<std::string> &Dirs) {
@@ -80,7 +81,7 @@ public:
   /// setDiagHandler - Specify a diagnostic handler to be invoked every time
   /// PrintMessage is called. Ctx is passed into the handler when it is invoked.
-  void setDiagHandler(DiagHandlerTy DH, void *Ctx = 0) {
+  void setDiagHandler(DiagHandlerTy DH, void *Ctx = nullptr) {
     DiagHandler = DH;
     DiagContext = Ctx;
   }
@@ -222,10 +223,10 @@ class SMDiagnostic {
 public:
   // Null diagnostic.
   SMDiagnostic()
-    : SM(0), LineNo(0), ColumnNo(0), Kind(SourceMgr::DK_Error) {}
+    : SM(nullptr), LineNo(0), ColumnNo(0), Kind(SourceMgr::DK_Error) {}
   // Diagnostic with no location (e.g. file not found, command line arg error).
   SMDiagnostic(StringRef filename, SourceMgr::DiagKind Knd, StringRef Msg)
-    : SM(0), Filename(filename), LineNo(-1), ColumnNo(-1), Kind(Knd),
+    : SM(nullptr), Filename(filename), LineNo(-1), ColumnNo(-1), Kind(Knd),
       Message(Msg) {}
 
   // Diagnostic with a location.
diff --git a/include/llvm/Support/StreamableMemoryObject.h b/include/llvm/Support/StreamableMemoryObject.h
index 0259630..9c9e55c 100644
--- a/include/llvm/Support/StreamableMemoryObject.h
+++ b/include/llvm/Support/StreamableMemoryObject.h
@@ -116,7 +116,7 @@ public:
     // the memory doesn't go away/get reallocated, but it's
     // not currently necessary. Users that need the pointer don't stream.
     assert(0 && "getPointer in streaming memory objects not allowed");
-    return NULL;
+    return nullptr;
   }
   bool isValidAddress(uint64_t address) const override;
   bool isObjectEnd(uint64_t address) const override;
diff --git a/include/llvm/Support/StringPool.h b/include/llvm/Support/StringPool.h
index 71adbc5..7e1394c 100644
--- a/include/llvm/Support/StringPool.h
+++ b/include/llvm/Support/StringPool.h
@@ -48,7 +48,7 @@ namespace llvm {
       unsigned Refcount; ///< Number of referencing PooledStringPtrs.
public: - PooledString() : Pool(0), Refcount(0) { } + PooledString() : Pool(nullptr), Refcount(0) { } }; friend class PooledStringPtr; @@ -81,7 +81,7 @@ namespace llvm { entry_t *S; public: - PooledStringPtr() : S(0) {} + PooledStringPtr() : S(nullptr) {} explicit PooledStringPtr(entry_t *E) : S(E) { if (S) ++S->getValue().Refcount; @@ -107,7 +107,7 @@ namespace llvm { S->getValue().Pool->InternTable.remove(S); S->Destroy(); } - S = 0; + S = nullptr; } ~PooledStringPtr() { clear(); } @@ -128,7 +128,7 @@ namespace llvm { } inline const char *operator*() const { return begin(); } - inline operator bool() const { return S != 0; } + inline operator bool() const { return S != nullptr; } inline bool operator==(const PooledStringPtr &That) { return S == That.S; } inline bool operator!=(const PooledStringPtr &That) { return S != That.S; } diff --git a/include/llvm/Support/TargetRegistry.h b/include/llvm/Support/TargetRegistry.h index 8e7478c..fcdc604 100644 --- a/include/llvm/Support/TargetRegistry.h +++ b/include/llvm/Support/TargetRegistry.h @@ -45,14 +45,14 @@ namespace llvm { class MCSymbolizer; class MCRelocationInfo; class MCTargetAsmParser; + class MCTargetOptions; class TargetMachine; class TargetOptions; class raw_ostream; class formatted_raw_ostream; MCStreamer *createAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useCFI, - bool useDwarfDirectory, + bool isVerboseAsm, bool useDwarfDirectory, MCInstPrinter *InstPrint, MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst); @@ -104,11 +104,14 @@ namespace llvm { const MCRegisterInfo &MRI, StringRef TT, StringRef CPU); - typedef MCTargetAsmParser *(*MCAsmParserCtorTy)(MCSubtargetInfo &STI, - MCAsmParser &P, - const MCInstrInfo &MII); + typedef MCTargetAsmParser *(*MCAsmParserCtorTy)( + MCSubtargetInfo &STI, + MCAsmParser &P, + const MCInstrInfo &MII, + const MCTargetOptions &Options); typedef MCDisassembler *(*MCDisassemblerCtorTy)(const Target &T, - const MCSubtargetInfo &STI); + const MCSubtargetInfo &STI, + MCContext &Ctx); typedef MCInstPrinter *(*MCInstPrinterCtorTy)(const Target &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, @@ -131,7 +134,6 @@ namespace llvm { typedef MCStreamer *(*AsmStreamerCtorTy)(MCContext &Ctx, formatted_raw_ostream &OS, bool isVerboseAsm, - bool useCFI, bool useDwarfDirectory, MCInstPrinter *InstPrint, MCCodeEmitter *CE, @@ -233,8 +235,8 @@ namespace llvm { public: Target() - : AsmStreamerCtorFn(0), MCRelocationInfoCtorFn(0), - MCSymbolizerCtorFn(0) {} + : AsmStreamerCtorFn(nullptr), MCRelocationInfoCtorFn(nullptr), + MCSymbolizerCtorFn(nullptr) {} /// @name Target Information /// @{ @@ -256,10 +258,10 @@ namespace llvm { bool hasJIT() const { return HasJIT; } /// hasTargetMachine - Check if this target supports code generation. - bool hasTargetMachine() const { return TargetMachineCtorFn != 0; } + bool hasTargetMachine() const { return TargetMachineCtorFn != nullptr; } /// hasMCAsmBackend - Check if this target supports .o generation. 
- bool hasMCAsmBackend() const { return MCAsmBackendCtorFn != 0; } + bool hasMCAsmBackend() const { return MCAsmBackendCtorFn != nullptr; } /// @} /// @name Feature Constructors @@ -275,7 +277,7 @@ namespace llvm { MCAsmInfo *createMCAsmInfo(const MCRegisterInfo &MRI, StringRef Triple) const { if (!MCAsmInfoCtorFn) - return 0; + return nullptr; return MCAsmInfoCtorFn(MRI, Triple); } @@ -285,7 +287,7 @@ namespace llvm { CodeModel::Model CM, CodeGenOpt::Level OL) const { if (!MCCodeGenInfoCtorFn) - return 0; + return nullptr; return MCCodeGenInfoCtorFn(Triple, RM, CM, OL); } @@ -293,7 +295,7 @@ namespace llvm { /// MCInstrInfo *createMCInstrInfo() const { if (!MCInstrInfoCtorFn) - return 0; + return nullptr; return MCInstrInfoCtorFn(); } @@ -301,7 +303,7 @@ namespace llvm { /// MCInstrAnalysis *createMCInstrAnalysis(const MCInstrInfo *Info) const { if (!MCInstrAnalysisCtorFn) - return 0; + return nullptr; return MCInstrAnalysisCtorFn(Info); } @@ -309,7 +311,7 @@ namespace llvm { /// MCRegisterInfo *createMCRegInfo(StringRef Triple) const { if (!MCRegInfoCtorFn) - return 0; + return nullptr; return MCRegInfoCtorFn(Triple); } @@ -325,7 +327,7 @@ namespace llvm { MCSubtargetInfo *createMCSubtargetInfo(StringRef Triple, StringRef CPU, StringRef Features) const { if (!MCSubtargetInfoCtorFn) - return 0; + return nullptr; return MCSubtargetInfoCtorFn(Triple, CPU, Features); } @@ -342,7 +344,7 @@ namespace llvm { CodeModel::Model CM = CodeModel::Default, CodeGenOpt::Level OL = CodeGenOpt::Default) const { if (!TargetMachineCtorFn) - return 0; + return nullptr; return TargetMachineCtorFn(*this, Triple, CPU, Features, Options, RM, CM, OL); } @@ -353,7 +355,7 @@ namespace llvm { MCAsmBackend *createMCAsmBackend(const MCRegisterInfo &MRI, StringRef Triple, StringRef CPU) const { if (!MCAsmBackendCtorFn) - return 0; + return nullptr; return MCAsmBackendCtorFn(*this, MRI, Triple, CPU); } @@ -361,26 +363,29 @@ namespace llvm { /// /// \param Parser The target independent parser implementation to use for /// parsing and lexing. - MCTargetAsmParser *createMCAsmParser(MCSubtargetInfo &STI, - MCAsmParser &Parser, - const MCInstrInfo &MII) const { + MCTargetAsmParser *createMCAsmParser( + MCSubtargetInfo &STI, + MCAsmParser &Parser, + const MCInstrInfo &MII, + const MCTargetOptions &Options) const { if (!MCAsmParserCtorFn) - return 0; - return MCAsmParserCtorFn(STI, Parser, MII); + return nullptr; + return MCAsmParserCtorFn(STI, Parser, MII, Options); } /// createAsmPrinter - Create a target specific assembly printer pass. This /// takes ownership of the MCStreamer object. 
AsmPrinter *createAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) const{ if (!AsmPrinterCtorFn) - return 0; + return nullptr; return AsmPrinterCtorFn(TM, Streamer); } - MCDisassembler *createMCDisassembler(const MCSubtargetInfo &STI) const { + MCDisassembler *createMCDisassembler(const MCSubtargetInfo &STI, + MCContext &Ctx) const { if (!MCDisassemblerCtorFn) - return 0; - return MCDisassemblerCtorFn(*this, STI); + return nullptr; + return MCDisassemblerCtorFn(*this, STI, Ctx); } MCInstPrinter *createMCInstPrinter(unsigned SyntaxVariant, @@ -389,7 +394,7 @@ namespace llvm { const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) const { if (!MCInstPrinterCtorFn) - return 0; + return nullptr; return MCInstPrinterCtorFn(*this, SyntaxVariant, MAI, MII, MRI, STI); } @@ -400,7 +405,7 @@ namespace llvm { const MCSubtargetInfo &STI, MCContext &Ctx) const { if (!MCCodeEmitterCtorFn) - return 0; + return nullptr; return MCCodeEmitterCtorFn(II, MRI, STI, Ctx); } @@ -421,7 +426,7 @@ namespace llvm { bool RelaxAll, bool NoExecStack) const { if (!MCObjectStreamerCtorFn) - return 0; + return nullptr; return MCObjectStreamerCtorFn(*this, TT, Ctx, TAB, _OS, _Emitter, STI, RelaxAll, NoExecStack); } @@ -430,19 +435,16 @@ namespace llvm { MCStreamer *createAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, bool isVerboseAsm, - bool useCFI, bool useDwarfDirectory, MCInstPrinter *InstPrint, MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst) const { if (AsmStreamerCtorFn) - return AsmStreamerCtorFn(Ctx, OS, isVerboseAsm, useCFI, - useDwarfDirectory, InstPrint, CE, TAB, - ShowInst); - return llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, - useDwarfDirectory, InstPrint, CE, TAB, - ShowInst); + return AsmStreamerCtorFn(Ctx, OS, isVerboseAsm, useDwarfDirectory, + InstPrint, CE, TAB, ShowInst); + return llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useDwarfDirectory, + InstPrint, CE, TAB, ShowInst); } /// createMCRelocationInfo - Create a target specific MCRelocationInfo. @@ -486,7 +488,7 @@ namespace llvm { explicit iterator(Target *T) : Current(T) {} friend struct TargetRegistry; public: - iterator() : Current(0) {} + iterator() : Current(nullptr) {} bool operator==(const iterator &x) const { return Current == x.Current; @@ -1097,8 +1099,9 @@ namespace llvm { private: static MCTargetAsmParser *Allocator(MCSubtargetInfo &STI, MCAsmParser &P, - const MCInstrInfo &MII) { - return new MCAsmParserImpl(STI, P, MII); + const MCInstrInfo &MII, + const MCTargetOptions &Options) { + return new MCAsmParserImpl(STI, P, MII, Options); } }; diff --git a/include/llvm/Support/Timer.h b/include/llvm/Support/Timer.h index d009d7f..45c1828 100644 --- a/include/llvm/Support/Timer.h +++ b/include/llvm/Support/Timer.h @@ -85,24 +85,24 @@ class Timer { Timer **Prev, *Next; // Doubly linked list of timers in the group. public: - explicit Timer(StringRef N) : TG(0) { init(N); } - Timer(StringRef N, TimerGroup &tg) : TG(0) { init(N, tg); } - Timer(const Timer &RHS) : TG(0) { - assert(RHS.TG == 0 && "Can only copy uninitialized timers"); + explicit Timer(StringRef N) : TG(nullptr) { init(N); } + Timer(StringRef N, TimerGroup &tg) : TG(nullptr) { init(N, tg); } + Timer(const Timer &RHS) : TG(nullptr) { + assert(!RHS.TG && "Can only copy uninitialized timers"); } const Timer &operator=(const Timer &T) { - assert(TG == 0 && T.TG == 0 && "Can only assign uninit timers"); + assert(!TG && !T.TG && "Can only assign uninit timers"); return *this; } ~Timer(); // Create an uninitialized timer, client must use 'init'. 
-  explicit Timer() : TG(0) {}
+  explicit Timer() : TG(nullptr) {}
   void init(StringRef N);
   void init(StringRef N, TimerGroup &tg);
 
   const std::string &getName() const { return Name; }
 
-  bool isInitialized() const { return TG != 0; }
+  bool isInitialized() const { return TG != nullptr; }
 
   /// startTimer - Start the timer running.  Time between calls to
   /// startTimer/stopTimer is counted by the Timer class.  Note that these calls
diff --git a/include/llvm/Support/Unicode.h b/include/llvm/Support/Unicode.h
index e6a52c4..f668a5b 100644
--- a/include/llvm/Support/Unicode.h
+++ b/include/llvm/Support/Unicode.h
@@ -12,6 +12,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_SUPPORT_UNICODE_H
+#define LLVM_SUPPORT_UNICODE_H
+
 #include "llvm/ADT/StringRef.h"
 
 namespace llvm {
@@ -60,3 +63,5 @@ int columnWidthUTF8(StringRef Text);
 } // namespace unicode
 } // namespace sys
 } // namespace llvm
+
+#endif
diff --git a/include/llvm/Support/UnicodeCharRanges.h b/include/llvm/Support/UnicodeCharRanges.h
index 734d323..79137bf 100644
--- a/include/llvm/Support/UnicodeCharRanges.h
+++ b/include/llvm/Support/UnicodeCharRanges.h
@@ -21,6 +21,8 @@
 namespace llvm {
 namespace sys {
 
+#define DEBUG_TYPE "unicode"
+
 /// \brief Represents a closed range of Unicode code points [Lower, Upper].
 struct UnicodeCharRange {
   uint32_t Lower;
@@ -88,6 +90,8 @@ private:
   const CharRanges Ranges;
 };
 
+#undef DEBUG_TYPE // "unicode"
+
 } // namespace sys
 } // namespace llvm
diff --git a/include/llvm/Support/YAMLParser.h b/include/llvm/Support/YAMLParser.h
index 5194b52..c39874c 100644
--- a/include/llvm/Support/YAMLParser.h
+++ b/include/llvm/Support/YAMLParser.h
@@ -60,26 +60,26 @@ class Node;
 class Scanner;
 struct Token;
 
-/// @brief Dump all the tokens in this stream to OS.
-/// @returns true if there was an error, false otherwise.
+/// \brief Dump all the tokens in this stream to OS.
+/// \returns true if there was an error, false otherwise.
 bool dumpTokens(StringRef Input, raw_ostream &);
 
-/// @brief Scans all tokens in input without outputting anything. This is used
+/// \brief Scans all tokens in input without outputting anything. This is used
 ///        for benchmarking the tokenizer.
-/// @returns true if there was an error, false otherwise.
+/// \returns true if there was an error, false otherwise.
 bool scanTokens(StringRef Input);
 
-/// @brief Escape \a Input for a double quoted scalar.
+/// \brief Escape \a Input for a double quoted scalar.
 std::string escape(StringRef Input);
 
-/// @brief This class represents a YAML stream potentially containing multiple
+/// \brief This class represents a YAML stream potentially containing multiple
 /// documents.
 class Stream {
 public:
-  /// @brief This keeps a reference to the string referenced by \p Input.
+  /// \brief This keeps a reference to the string referenced by \p Input.
   Stream(StringRef Input, SourceMgr &);
 
-  /// @brief This takes ownership of \p InputBuffer.
+  /// \brief This takes ownership of \p InputBuffer.
   Stream(MemoryBuffer *InputBuffer, SourceMgr &);
   ~Stream();
 
@@ -101,9 +101,10 @@ private:
   friend class Document;
 };
 
-/// @brief Abstract base class for all Nodes.
+/// \brief Abstract base class for all Nodes.
 class Node {
   virtual void anchor();
+
 public:
   enum NodeKind {
     NK_Null,
@@ -117,7 +118,7 @@ public:
   Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
        StringRef Tag);
 
-  /// @brief Get the value of the anchor attached to this node. If it does not
+  /// \brief Get the value of the anchor attached to this node. If it does not
   /// have one, getAnchor().size() will be 0.
   StringRef getAnchor() const { return Anchor; }
 
@@ -144,14 +145,13 @@ public:
 
   unsigned int getType() const { return TypeID; }
 
-  void *operator new ( size_t Size
-                     , BumpPtrAllocator &Alloc
-                     , size_t Alignment = 16) throw() {
+  void *operator new(size_t Size, BumpPtrAllocator &Alloc,
+                     size_t Alignment = 16) throw() {
     return Alloc.Allocate(Size, Alignment);
   }
 
-  void operator delete(void *Ptr, BumpPtrAllocator &Alloc, size_t) throw() {
-    Alloc.Deallocate(Ptr);
+  void operator delete(void *Ptr, BumpPtrAllocator &Alloc, size_t Size) throw() {
+    Alloc.Deallocate(Ptr, Size);
   }
 
 protected:
@@ -169,28 +169,28 @@ private:
   StringRef Tag;
 };
 
-/// @brief A null value.
+/// \brief A null value.
 ///
 /// Example:
 ///   !!null null
 class NullNode : public Node {
   void anchor() override;
+
 public:
   NullNode(std::unique_ptr<Document> &D)
       : Node(NK_Null, D, StringRef(), StringRef()) {}
 
-  static inline bool classof(const Node *N) {
-    return N->getType() == NK_Null;
-  }
+  static inline bool classof(const Node *N) { return N->getType() == NK_Null; }
 };
 
-/// @brief A scalar node is an opaque datum that can be presented as a
+/// \brief A scalar node is an opaque datum that can be presented as a
 /// series of zero or more Unicode scalar values.
 ///
 /// Example:
 ///   Adena
 class ScalarNode : public Node {
   void anchor() override;
+
 public:
   ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
             StringRef Val)
@@ -205,9 +205,9 @@ public:
   //        utf8).
   StringRef getRawValue() const { return Value; }
 
-  /// @brief Gets the value of this node as a StringRef.
+  /// \brief Gets the value of this node as a StringRef.
   ///
-  /// @param Storage is used to store the content of the returned StringRef iff
+  /// \param Storage is used to store the content of the returned StringRef iff
   /// it requires any modification from how it appeared in the source.
   /// This happens with escaped characters and multi-line literals.
   StringRef getValue(SmallVectorImpl<char> &Storage) const;
@@ -219,12 +219,12 @@ public:
 private:
   StringRef Value;
 
-  StringRef unescapeDoubleQuoted( StringRef UnquotedValue
-                                , StringRef::size_type Start
-                                , SmallVectorImpl<char> &Storage) const;
+  StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
+                                 StringRef::size_type Start,
+                                 SmallVectorImpl<char> &Storage) const;
 };
 
-/// @brief A key and value pair. While not technically a Node under the YAML
+/// \brief A key and value pair. While not technically a Node under the YAML
 /// representation graph, it is easier to treat them this way.
 ///
 /// TODO: Consider making this not a child of Node.
@@ -233,22 +233,24 @@ private:
 ///   Section: .text
 class KeyValueNode : public Node {
   void anchor() override;
+
 public:
   KeyValueNode(std::unique_ptr<Document> &D)
-    : Node(NK_KeyValue, D, StringRef(), StringRef()), Key(0), Value(0) {}
+      : Node(NK_KeyValue, D, StringRef(), StringRef()), Key(nullptr),
+        Value(nullptr) {}
 
-  /// @brief Parse and return the key.
+  /// \brief Parse and return the key.
   ///
   /// This may be called multiple times.
   ///
-  /// @returns The key, or nullptr if failed() == true.
+  /// \returns The key, or nullptr if failed() == true.
   Node *getKey();
 
-  /// @brief Parse and return the value.
+  /// \brief Parse and return the value.
   ///
   /// This may be called multiple times.
   ///
-  /// @returns The value, or nullptr if failed() == true.
+  /// \returns The value, or nullptr if failed() == true.
   Node *getValue();
 
   void skip() override {
@@ -265,47 +267,47 @@ private:
   Node *Value;
 };
 
-/// @brief This is an iterator abstraction over YAML collections shared by both
+/// \brief This is an iterator abstraction over YAML collections shared by both
 /// sequences and maps.
 ///
 /// BaseT must have a ValueT* member named CurrentEntry and a member function
 /// increment() which must set CurrentEntry to 0 to create an end iterator.
 template <class BaseT, class ValueT>
 class basic_collection_iterator
-  : public std::iterator<std::forward_iterator_tag, ValueT> {
+    : public std::iterator<std::forward_iterator_tag, ValueT> {
 public:
-  basic_collection_iterator() : Base(0) {}
+  basic_collection_iterator() : Base(nullptr) {}
   basic_collection_iterator(BaseT *B) : Base(B) {}
 
-  ValueT *operator ->() const {
+  ValueT *operator->() const {
     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
     return Base->CurrentEntry;
   }
 
-  ValueT &operator *() const {
+  ValueT &operator*() const {
     assert(Base && Base->CurrentEntry &&
            "Attempted to dereference end iterator!");
     return *Base->CurrentEntry;
   }
 
-  operator ValueT*() const {
+  operator ValueT *() const {
     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
     return Base->CurrentEntry;
   }
 
-  bool operator !=(const basic_collection_iterator &Other) const {
-    if(Base != Other.Base)
+  bool operator!=(const basic_collection_iterator &Other) const {
+    if (Base != Other.Base)
       return true;
-    return (Base && Other.Base) && Base->CurrentEntry
-                                   != Other.Base->CurrentEntry;
+    return (Base && Other.Base) &&
+           Base->CurrentEntry != Other.Base->CurrentEntry;
   }
 
   basic_collection_iterator &operator++() {
     assert(Base && "Attempted to advance iterator past end!");
     Base->increment();
     // Create an end iterator.
-    if (Base->CurrentEntry == 0)
-      Base = 0;
+    if (!Base->CurrentEntry)
+      Base = nullptr;
     return *this;
   }
 
@@ -323,17 +325,16 @@ typename CollectionType::iterator begin(CollectionType &C) {
   return ret;
 }
 
-template <class CollectionType>
-void skip(CollectionType &C) {
+template <class CollectionType> void skip(CollectionType &C) {
   // TODO: support skipping from the middle of a parsed collection ;/
   assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
   if (C.IsAtBeginning)
-    for (typename CollectionType::iterator i = begin(C), e = C.end();
-         i != e; ++i)
+    for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
+         ++i)
       i->skip();
 }
 
-/// @brief Represents a YAML map created from either a block map for a flow map.
+/// \brief Represents a YAML map created from either a block map for a flow map.
 ///
 /// This parses the YAML stream as increment() is called.
///
@@ -342,6 +343,7 @@ void skip(CollectionType &C) {
 ///   Scope: Global
 class MappingNode : public Node {
   void anchor() override;
+
 public:
   enum MappingType {
     MT_Block,
@@ -352,22 +354,18 @@ public:
   MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
               MappingType MT)
       : Node(NK_Mapping, D, Anchor, Tag), Type(MT), IsAtBeginning(true),
-        IsAtEnd(false), CurrentEntry(0) {}
+        IsAtEnd(false), CurrentEntry(nullptr) {}
 
   friend class basic_collection_iterator<MappingNode, KeyValueNode>;
   typedef basic_collection_iterator<MappingNode, KeyValueNode> iterator;
   template <class T> friend typename T::iterator yaml::begin(T &);
   template <class T> friend void yaml::skip(T &);
 
-  iterator begin() {
-    return yaml::begin(*this);
-  }
+  iterator begin() { return yaml::begin(*this); }
 
   iterator end() { return iterator(); }
 
-  void skip() override {
-    yaml::skip(*this);
-  }
+  void skip() override { yaml::skip(*this); }
 
   static inline bool classof(const Node *N) {
     return N->getType() == NK_Mapping;
@@ -382,7 +380,7 @@ private:
   void increment();
 };
 
-/// @brief Represents a YAML sequence created from either a block sequence for a
+/// \brief Represents a YAML sequence created from either a block sequence for a
 /// flow sequence.
 ///
 /// This parses the YAML stream as increment() is called.
@@ -392,6 +390,7 @@ private:
 ///   - World
 class SequenceNode : public Node {
   void anchor() override;
+
 public:
   enum SequenceType {
     ST_Block,
@@ -411,7 +410,7 @@ public:
       : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST), IsAtBeginning(true),
         IsAtEnd(false),
         WasPreviousTokenFlowEntry(true), // Start with an imaginary ','.
-        CurrentEntry(0) {}
+        CurrentEntry(nullptr) {}
 
   friend class basic_collection_iterator<SequenceNode, Node>;
   typedef basic_collection_iterator<SequenceNode, Node> iterator;
@@ -420,15 +419,11 @@ public:
 
   void increment();
 
-  iterator begin() {
-    return yaml::begin(*this);
-  }
+  iterator begin() { return yaml::begin(*this); }
 
   iterator end() { return iterator(); }
 
-  void skip() override {
-    yaml::skip(*this);
-  }
+  void skip() override { yaml::skip(*this); }
 
   static inline bool classof(const Node *N) {
     return N->getType() == NK_Sequence;
@@ -442,12 +437,13 @@ private:
   Node *CurrentEntry;
 };
 
-/// @brief Represents an alias to a Node with an anchor.
+/// \brief Represents an alias to a Node with an anchor.
 ///
 /// Example:
 ///   *AnchorName
 class AliasNode : public Node {
   void anchor() override;
+
 public:
   AliasNode(std::unique_ptr<Document> &D, StringRef Val)
       : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
@@ -455,50 +451,46 @@ public:
   StringRef getName() const { return Name; }
   Node *getTarget();
 
-  static inline bool classof(const Node *N) {
-    return N->getType() == NK_Alias;
-  }
+  static inline bool classof(const Node *N) { return N->getType() == NK_Alias; }
 
 private:
   StringRef Name;
 };
 
-/// @brief A YAML Stream is a sequence of Documents. A document contains a root
+/// \brief A YAML Stream is a sequence of Documents. A document contains a root
 /// node.
 class Document {
 public:
-  /// @brief Root for parsing a node. Returns a single node.
+  /// \brief Root for parsing a node. Returns a single node.
   Node *parseBlockNode();
 
   Document(Stream &ParentStream);
 
-  /// @brief Finish parsing the current document and return true if there are
+  /// \brief Finish parsing the current document and return true if there are
   /// more. Return false otherwise.
   bool skip();
 
-  /// @brief Parse and return the root level node.
+  /// \brief Parse and return the root level node.
   Node *getRoot() {
     if (Root)
       return Root;
     return Root = parseBlockNode();
   }
 
-  const std::map<StringRef, StringRef> &getTagMap() const {
-    return TagMap;
-  }
+  const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
 
 private:
   friend class Node;
   friend class document_iterator;
 
-  /// @brief Stream to read tokens from.
+  /// \brief Stream to read tokens from.
   Stream &stream;
 
-  /// @brief Used to allocate nodes to. All are destroyed without calling their
+  /// \brief Used to allocate nodes to. All are destroyed without calling their
   /// destructor when the document is destroyed.
   BumpPtrAllocator NodeAllocator;
 
-  /// @brief The root node. Used to support skipping a partially parsed
+  /// \brief The root node. Used to support skipping a partially parsed
   /// document.
   Node *Root;
 
@@ -510,7 +502,7 @@ private:
   void setError(const Twine &Message, Token &Location) const;
   bool failed() const;
 
-  /// @brief Parse %BLAH directives and return true if any were encountered.
+  /// \brief Parse %BLAH directives and return true if any were encountered.
   bool parseDirectives();
 
   /// \brief Parse %YAML
@@ -519,30 +511,28 @@ private:
   /// \brief Parse %TAG
   void parseTAGDirective();
 
-  /// @brief Consume the next token and error if it is not \a TK.
+  /// \brief Consume the next token and error if it is not \a TK.
   bool expectToken(int TK);
 };
 
-/// @brief Iterator abstraction for Documents over a Stream.
+/// \brief Iterator abstraction for Documents over a Stream.
 class document_iterator {
 public:
-  document_iterator() : Doc(0) {}
+  document_iterator() : Doc(nullptr) {}
  document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
 
-  bool operator ==(const document_iterator &Other) {
+  bool operator==(const document_iterator &Other) {
     if (isAtEnd() || Other.isAtEnd())
       return isAtEnd() && Other.isAtEnd();
 
     return Doc == Other.Doc;
   }
-  bool operator !=(const document_iterator &Other) {
-    return !(*this == Other);
-  }
+  bool operator!=(const document_iterator &Other) { return !(*this == Other); }
 
-  document_iterator operator ++() {
-    assert(Doc != 0 && "incrementing iterator past the end.");
+  document_iterator operator++() {
+    assert(Doc && "incrementing iterator past the end.");
    if (!(*Doc)->skip()) {
-      Doc->reset(0);
+      Doc->reset(nullptr);
     } else {
       Stream &S = (*Doc)->stream;
       Doc->reset(new Document(S));
@@ -550,21 +540,18 @@ public:
     return *this;
   }
 
-  Document &operator *() {
-    return *Doc->get();
-  }
+  Document &operator*() { return *Doc->get(); }
 
   std::unique_ptr<Document> &operator->() { return *Doc; }
 
 private:
-  bool isAtEnd() const {
-    return !Doc || !*Doc;
-  }
+  bool isAtEnd() const { return !Doc || !*Doc; }
 
   std::unique_ptr<Document> *Doc;
 };
 
-}
-}
+} // End namespace yaml.
+
+} // End namespace llvm.
 
 #endif
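[Editorial sketch of minimal consumption of this parser API; the input string
is invented for the example.]

    llvm::SourceMgr SM;
    llvm::yaml::Stream YS("name: value", SM);
    for (llvm::yaml::document_iterator DI = YS.begin(), DE = YS.end();
         DI != DE; ++DI) {
      llvm::yaml::Node *Root = DI->getRoot();
      if (auto *Map = llvm::dyn_cast<llvm::yaml::MappingNode>(Root))
        for (llvm::yaml::KeyValueNode &KV : *Map)
          (void)KV.getKey(); // getValue() would parse the value node.
    }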
diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index ea217c3..4ee05ed 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h
@@ -20,6 +20,7 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/Regex.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/YAMLParser.h"
 #include "llvm/Support/raw_ostream.h"
@@ -32,7 +33,7 @@ namespace yaml {
 /// This class should be specialized by any type that needs to be converted
 /// to/from a YAML mapping.  For example:
 ///
-///     struct ScalarBitSetTraits<MyStruct> {
+///     struct MappingTraits<MyStruct> {
 ///       static void mapping(IO &io, MyStruct &s) {
 ///         io.mapRequired("name", s.name);
 ///         io.mapRequired("size", s.size);
@@ -98,6 +99,7 @@ struct ScalarBitSetTraits {
 ///      // return empty string on success, or error string
 ///      return StringRef();
 ///    }
+///    static bool mustQuote(StringRef) { return true; }
 ///  };
 template<typename T>
 struct ScalarTraits {
@@ -109,6 +111,9 @@ struct ScalarTraits {
   // Function to convert a string to a value.  Returns the empty
   // StringRef on success or an error string if string is malformed:
   //static StringRef input(StringRef scalar, void *ctxt, T &value);
+  //
+  // Function to determine if the value should be quoted.
+  //static bool mustQuote(StringRef);
 };
 
 
@@ -171,7 +176,8 @@ struct has_ScalarEnumerationTraits
   static double test(...);
 
 public:
-  static bool const value = (sizeof(test<ScalarEnumerationTraits<T> >(0)) == 1);
+  static bool const value =
+      (sizeof(test<ScalarEnumerationTraits<T> >(nullptr)) == 1);
 };
 
 
@@ -188,7 +194,7 @@ struct has_ScalarBitSetTraits
   static double test(...);
 
 public:
-  static bool const value = (sizeof(test<ScalarBitSetTraits<T> >(0)) == 1);
+  static bool const value = (sizeof(test<ScalarBitSetTraits<T> >(nullptr)) == 1);
 };
 
 
@@ -198,16 +204,19 @@ struct has_ScalarTraits
 {
   typedef StringRef (*Signature_input)(StringRef, void*, T&);
   typedef void (*Signature_output)(const T&, void*, llvm::raw_ostream&);
+  typedef bool (*Signature_mustQuote)(StringRef);
 
   template <typename U>
-  static char test(SameType<Signature_input, &U::input>*,
-                   SameType<Signature_output, &U::output>*);
+  static char test(SameType<Signature_input, &U::input> *,
+                   SameType<Signature_output, &U::output> *,
+                   SameType<Signature_mustQuote, &U::mustQuote> *);
 
   template <typename U>
   static double test(...);
 
 public:
-  static bool const value = (sizeof(test<ScalarTraits<T> >(0,0)) == 1);
+  static bool const value =
+      (sizeof(test<ScalarTraits<T>>(nullptr, nullptr, nullptr)) == 1);
 };
 
 
@@ -224,7 +233,7 @@ struct has_MappingTraits
   static double test(...);
 
 public:
-  static bool const value = (sizeof(test<MappingTraits<T> >(0)) == 1);
+  static bool const value = (sizeof(test<MappingTraits<T> >(nullptr)) == 1);
 };
 
 // Test if MappingTraits<T>::validate() is defined on type T.
@@ -240,7 +249,7 @@ struct has_MappingValidateTraits
   static double test(...);
 
 public:
-  static bool const value = (sizeof(test<MappingTraits<T> >(0)) == 1);
+  static bool const value = (sizeof(test<MappingTraits<T> >(nullptr)) == 1);
 };
 
 
@@ -258,7 +267,7 @@ struct has_SequenceMethodTraits
   static double test(...);
 
 public:
-  static bool const value = (sizeof(test<SequenceTraits<T> >(0)) == 1);
+  static bool const value = (sizeof(test<SequenceTraits<T> >(nullptr)) == 1);
 };
 
 
@@ -288,7 +297,7 @@ struct has_FlowTraits
   static char (&f(...))[2];
 
 public:
-  static bool const value = sizeof(f<Derived>(0)) == 2;
+  static bool const value = sizeof(f<Derived>(nullptr)) == 2;
 };
 
 
@@ -312,10 +321,84 @@ struct has_DocumentListTraits
   static double test(...);
 
 public:
-  static bool const value = (sizeof(test<DocumentListTraits<T> >(0)) == 1);
+  static bool const value = (sizeof(test<DocumentListTraits<T> >(nullptr))==1);
 };
 
+inline bool isNumber(StringRef S) {
+  static const char OctalChars[] = "01234567";
+  if (S.startswith("0") &&
+      S.drop_front().find_first_not_of(OctalChars) == StringRef::npos)
+    return true;
+
+  if (S.startswith("0o") &&
+      S.drop_front(2).find_first_not_of(OctalChars) == StringRef::npos)
+    return true;
+
+  static const char HexChars[] = "0123456789abcdefABCDEF";
+  if (S.startswith("0x") &&
+      S.drop_front(2).find_first_not_of(HexChars) == StringRef::npos)
+    return true;
+
+  static const char DecChars[] = "0123456789";
+  if (S.find_first_not_of(DecChars) == StringRef::npos)
+    return true;
+
+  if (S.equals(".inf") || S.equals(".Inf") || S.equals(".INF"))
+    return true;
+
+  Regex FloatMatcher("^(\\.[0-9]+|[0-9]+(\\.[0-9]*)?)([eE][-+]?[0-9]+)?$");
+  if (FloatMatcher.match(S))
+    return true;
+
+  return false;
+}
+
+inline bool isNumeric(StringRef S) {
+  if ((S.front() == '-' || S.front() == '+') && isNumber(S.drop_front()))
+    return true;
+
+  if (isNumber(S))
+    return true;
+
+  if (S.equals(".nan") || S.equals(".NaN") || S.equals(".NAN"))
+    return true;
+
+  return false;
+}
+
+inline bool isNull(StringRef S) {
+  return S.equals("null") || S.equals("Null") || S.equals("NULL") ||
+         S.equals("~");
+}
+inline bool isBool(StringRef S) {
+  return S.equals("true") || S.equals("True") || S.equals("TRUE") ||
+         S.equals("false") || S.equals("False") || S.equals("FALSE");
+}
+
+inline bool needsQuotes(StringRef S) {
+  if (S.empty())
+    return true;
+  if (isspace(S.front()) || isspace(S.back()))
+    return true;
+  if (S.front() == ',')
+    return true;
+
+  static const char ScalarSafeChars[] =
+      "abcdefghijklmnopqrstuvwxyz"
+      "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-/^., \t";
+  if (S.find_first_not_of(ScalarSafeChars) != StringRef::npos)
+    return true;
+
+  if (isNull(S))
+    return true;
+  if (isBool(S))
+    return true;
+  if (isNumeric(S))
+    return true;
+
+  return false;
+}
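[Editorial note: to make the new quoting predicate concrete, behavior inferred
directly from the helpers above.]

    // needsQuotes("hello") == false  (all scalar-safe characters)
    // needsQuotes("true")  == true   (would read back as a YAML bool)
    // needsQuotes("0x1F")  == true   (would read back as a number)
    // needsQuotes("~")     == true   (would read back as null)
    // needsQuotes(" pad")  == true   (leading/trailing whitespace)
    // needsQuotes("")      == true   (empty scalar)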
 
 template <typename T>
@@ -340,7 +423,7 @@ struct unvalidatedMappingTraits : public std::integral_constant<bool,
+  template <typename T>
+  void maskedBitSetCase(T &Val, const char *Str, T ConstVal, T Mask) {
+    if (bitSetMatch(Str, outputting() && (Val & Mask) == ConstVal))
+      Val = Val | ConstVal;
+  }
+
+  template <typename T>
+  void maskedBitSetCase(T &Val, const char *Str, uint32_t ConstVal,
+                        uint32_t Mask) {
+    if (bitSetMatch(Str, outputting() && (Val & Mask) == ConstVal))
+      Val = Val | ConstVal;
+  }
+
   void *getContext();
   void setContext(void *);
 
@@ -520,11 +616,11 @@ yamlize(IO &io, T &Val, bool) {
     llvm::raw_string_ostream Buffer(Storage);
     ScalarTraits<T>::output(Val, io.getContext(), Buffer);
     StringRef Str = Buffer.str();
-    io.scalarString(Str);
+    io.scalarString(Str, ScalarTraits<T>::mustQuote(Str));
   }
   else {
     StringRef Str;
-    io.scalarString(Str);
+    io.scalarString(Str, ScalarTraits<T>::mustQuote(Str));
     StringRef Result = ScalarTraits<T>::input(Str, io.getContext(), Val);
     if ( !Result.empty() ) {
       io.setError(llvm::Twine(Result));
@@ -601,78 +697,91 @@ template<>
 struct ScalarTraits<bool> {
   static void output(const bool &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, bool &);
+  static bool mustQuote(StringRef) { return false; }
 };
 
 template<>
 struct ScalarTraits<StringRef> {
   static void output(const StringRef &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, StringRef &);
+  static bool mustQuote(StringRef S) { return needsQuotes(S); }
 };
 
 template<>
 struct ScalarTraits<std::string> {
   static void output(const std::string &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, std::string &);
+  static bool mustQuote(StringRef S) { return needsQuotes(S); }
 };
 
 template<>
 struct ScalarTraits<uint8_t> {
   static void output(const uint8_t &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, uint8_t &);
+  static bool mustQuote(StringRef) { return false; }
 };
 
 template<>
 struct ScalarTraits<uint16_t> {
   static void output(const uint16_t &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, uint16_t &);
+  static bool mustQuote(StringRef) { return false; }
 };
 
 template<>
 struct ScalarTraits<uint32_t> {
   static void output(const uint32_t &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, uint32_t &);
+  static bool mustQuote(StringRef) { return false; }
 };
 
 template<>
 struct ScalarTraits<uint64_t> {
   static void output(const uint64_t &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, uint64_t &);
+  static bool mustQuote(StringRef) { return false; }
 };
 
 template<>
 struct ScalarTraits<int8_t> {
   static void output(const int8_t &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, int8_t &);
+  static bool mustQuote(StringRef) { return false; }
 };
 
 template<>
 struct ScalarTraits<int16_t> {
   static void output(const int16_t &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, int16_t &);
+  static bool mustQuote(StringRef) { return false; }
 };
 
 template<>
 struct ScalarTraits<int32_t> {
   static void output(const int32_t &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, int32_t &);
+  static bool mustQuote(StringRef) { return false; }
 };
 
 template<>
 struct ScalarTraits<int64_t> {
   static void output(const int64_t &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, int64_t &);
+  static bool mustQuote(StringRef) { return false; }
 };
 
 template<>
 struct ScalarTraits<float> {
   static void output(const float &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, float &);
+  static bool mustQuote(StringRef) { return false; }
 };
 
 template<>
 struct ScalarTraits<double> {
   static void output(const double &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, double &);
+  static bool mustQuote(StringRef) { return false; }
 };
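[Editorial sketch: under the extended contract, a user-supplied scalar trait
must now also provide mustQuote. MyType and its Str member are hypothetical.]

    namespace llvm {
    namespace yaml {
    template <> struct ScalarTraits<MyType> {
      static void output(const MyType &Val, void *, raw_ostream &OS) {
        OS << Val.Str; // Assumed serialization for the sketch.
      }
      static StringRef input(StringRef Scalar, void *, MyType &Val) {
        Val.Str = Scalar.str();
        return StringRef(); // Empty string signals success.
      }
      static bool mustQuote(StringRef S) { return needsQuotes(S); }
    };
    } // namespace yaml
    } // namespace llvm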
 
 
@@ -682,7 +791,7 @@ struct ScalarTraits<double> {
 template <typename TNorm, typename TFinal>
 struct MappingNormalization {
   MappingNormalization(IO &i_o, TFinal &Obj)
-      : io(i_o), BufPtr(NULL), Result(Obj) {
+      : io(i_o), BufPtr(nullptr), Result(Obj) {
     if ( io.outputting() ) {
       BufPtr = new (&Buffer) TNorm(io, Obj);
     }
@@ -765,9 +874,9 @@ public:
   // user-data. The DiagHandler can be specified to provide
   // alternative error reporting.
   Input(StringRef InputContent,
-        void *Ctxt = NULL,
-        SourceMgr::DiagHandlerTy DiagHandler = NULL,
-        void *DiagHandlerCtxt = NULL);
+        void *Ctxt = nullptr,
+        SourceMgr::DiagHandlerTy DiagHandler = nullptr,
+        void *DiagHandlerCtxt = nullptr);
   ~Input();
 
   // Check if there was an syntax or semantic error during parsing.
@@ -794,7 +903,7 @@ private:
   bool beginBitSetScalar(bool &) override;
   bool bitSetMatch(const char *, bool ) override;
   void endBitSetScalar() override;
-  void scalarString(StringRef &) override;
+  void scalarString(StringRef &, bool) override;
   void setError(const Twine &message) override;
   bool canElideEmptySequence() override;
 
@@ -896,7 +1005,7 @@ private:
 ///
 class Output : public IO {
 public:
-  Output(llvm::raw_ostream &, void *Ctxt=NULL);
+  Output(llvm::raw_ostream &, void *Ctxt=nullptr);
   virtual ~Output();
 
   bool outputting() override;
@@ -919,7 +1028,7 @@ public:
   bool beginBitSetScalar(bool &) override;
   bool bitSetMatch(const char *, bool ) override;
   void endBitSetScalar() override;
-  void scalarString(StringRef &) override;
+  void scalarString(StringRef &, bool) override;
   void setError(const Twine &message) override;
   bool canElideEmptySequence() override;
 public:
@@ -990,24 +1099,28 @@ template<>
 struct ScalarTraits<Hex8> {
   static void output(const Hex8 &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, Hex8 &);
+  static bool mustQuote(StringRef) { return false; }
 };
 
 template<>
 struct ScalarTraits<Hex16> {
   static void output(const Hex16 &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, Hex16 &);
+  static bool mustQuote(StringRef) { return false; }
 };
 
 template<>
 struct ScalarTraits<Hex32> {
   static void output(const Hex32 &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, Hex32 &);
+  static bool mustQuote(StringRef) { return false; }
 };
 
 template<>
 struct ScalarTraits<Hex64> {
   static void output(const Hex64 &, void*, llvm::raw_ostream &);
   static StringRef input(StringRef, void*, Hex64 &);
+  static bool mustQuote(StringRef) { return false; }
 };
 
diff --git a/include/llvm/Support/circular_raw_ostream.h b/include/llvm/Support/circular_raw_ostream.h
index 3114199..ee7b89f 100644
--- a/include/llvm/Support/circular_raw_ostream.h
+++ b/include/llvm/Support/circular_raw_ostream.h
@@ -109,10 +109,10 @@ namespace llvm
     circular_raw_ostream(raw_ostream &Stream, const char *Header,
                          size_t BuffSize = 0, bool Owns = REFERENCE_ONLY)
         : raw_ostream(/*unbuffered*/true),
-          TheStream(0),
+          TheStream(nullptr),
           OwnsStream(Owns),
           BufferSize(BuffSize),
-          BufferArray(0),
+          BufferArray(nullptr),
           Filled(false),
           Banner(Header) {
       if (BufferSize != 0)
@@ -122,9 +122,9 @@ namespace llvm
     }
     explicit circular_raw_ostream()
         : raw_ostream(/*unbuffered*/true),
-          TheStream(0),
+          TheStream(nullptr),
           OwnsStream(REFERENCE_ONLY),
-          BufferArray(0),
+          BufferArray(nullptr),
           Filled(false),
           Banner("") {
       Cur = BufferArray;
diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h
index 0240035..34fbe08 100644
--- a/include/llvm/Support/raw_ostream.h
+++ b/include/llvm/Support/raw_ostream.h
@@ -17,13 +17,18 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataTypes.h"
-#include "llvm/Support/FileSystem.h"
 
 namespace llvm {
   class format_object_base;
   template <typename T>
   class SmallVectorImpl;
 
+  namespace sys {
+    namespace fs {
+      enum OpenFlags : unsigned;
+    }
+  }
+
 /// raw_ostream - This class implements an extremely fast bulk output stream
 /// that can *only* output to a stream.
It does not support seeking, reopening, /// rewinding, line buffered disciplines etc. It is a simple buffer that outputs @@ -76,7 +81,7 @@ public: explicit raw_ostream(bool unbuffered=false) : BufferMode(unbuffered ? Unbuffered : InternalBuffer) { // Start out ready to flush. - OutBufStart = OutBufEnd = OutBufCur = 0; + OutBufStart = OutBufEnd = OutBufCur = nullptr; } virtual ~raw_ostream(); @@ -102,7 +107,7 @@ public: size_t GetBufferSize() const { // If we're supposed to be buffered but haven't actually gotten around // to allocating the buffer yet, return the value that would be used. - if (BufferMode != Unbuffered && OutBufStart == 0) + if (BufferMode != Unbuffered && OutBufStart == nullptr) return preferred_buffer_size(); // Otherwise just return the size of the allocated buffer. @@ -115,7 +120,7 @@ public: /// set to unbuffered. void SetUnbuffered() { flush(); - SetBufferAndMode(0, 0, Unbuffered); + SetBufferAndMode(nullptr, 0, Unbuffered); } size_t GetNumBytesInBuffer() const { @@ -157,7 +162,7 @@ public: size_t Size = Str.size(); // Make sure we can use the fast path. - if (OutBufCur+Size > OutBufEnd) + if (Size > (size_t)(OutBufEnd - OutBufCur)) return write(Str.data(), Size); memcpy(OutBufCur, Str.data(), Size); diff --git a/include/llvm/Support/system_error.h b/include/llvm/Support/system_error.h index 4ca4b06..aa5e9f7 100644 --- a/include/llvm/Support/system_error.h +++ b/include/llvm/Support/system_error.h @@ -706,7 +706,7 @@ public: static void unspecified_bool_true() {} operator unspecified_bool_type() const { // true if error - return _val_ == 0 ? 0 : unspecified_bool_true; + return _val_ == 0 ? nullptr : unspecified_bool_true; } }; @@ -771,7 +771,7 @@ public: static void unspecified_bool_true() {} operator unspecified_bool_type() const { // true if error - return _val_ == 0 ? 0 : unspecified_bool_true; + return _val_ == 0 ? 
nullptr : unspecified_bool_true; } }; diff --git a/include/llvm/TableGen/Error.h b/include/llvm/TableGen/Error.h index 17ac418..3df658d 100644 --- a/include/llvm/TableGen/Error.h +++ b/include/llvm/TableGen/Error.h @@ -34,7 +34,6 @@ LLVM_ATTRIBUTE_NORETURN void PrintFatalError(ArrayRef ErrorLoc, extern SourceMgr SrcMgr; extern unsigned ErrorsPrinted; - } // end namespace "llvm" #endif diff --git a/include/llvm/TableGen/Main.h b/include/llvm/TableGen/Main.h index 6b51e20..866b986 100644 --- a/include/llvm/TableGen/Main.h +++ b/include/llvm/TableGen/Main.h @@ -23,7 +23,6 @@ class raw_ostream; typedef bool TableGenMainFn(raw_ostream &OS, RecordKeeper &Records); int TableGenMain(char *argv0, TableGenMainFn *MainFn); - } #endif diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h index 2bed006..36464d7 100644 --- a/include/llvm/TableGen/Record.h +++ b/include/llvm/TableGen/Record.h @@ -87,7 +87,7 @@ private: public: RecTyKind getRecTyKind() const { return Kind; } - RecTy(RecTyKind K) : Kind(K), ListTy(0) {} + RecTy(RecTyKind K) : Kind(K), ListTy(nullptr) {} virtual ~RecTy() {} virtual std::string getAsString() const = 0; @@ -102,12 +102,12 @@ public: ListRecTy *getListTy(); public: // These methods should only be called from subclasses of Init - virtual Init *convertValue( UnsetInit *UI) { return 0; } - virtual Init *convertValue( BitInit *BI) { return 0; } - virtual Init *convertValue( BitsInit *BI) { return 0; } - virtual Init *convertValue( IntInit *II) { return 0; } - virtual Init *convertValue(StringInit *SI) { return 0; } - virtual Init *convertValue( ListInit *LI) { return 0; } + virtual Init *convertValue( UnsetInit *UI) { return nullptr; } + virtual Init *convertValue( BitInit *BI) { return nullptr; } + virtual Init *convertValue( BitsInit *BI) { return nullptr; } + virtual Init *convertValue( IntInit *II) { return nullptr; } + virtual Init *convertValue(StringInit *SI) { return nullptr; } + virtual Init *convertValue( ListInit *LI) { return nullptr; } virtual Init *convertValue( UnOpInit *UI) { return convertValue((TypedInit*)UI); } @@ -117,10 +117,10 @@ public: // These methods should only be called from subclasses of Init virtual Init *convertValue( TernOpInit *UI) { return convertValue((TypedInit*)UI); } - virtual Init *convertValue(VarBitInit *VB) { return 0; } - virtual Init *convertValue( DefInit *DI) { return 0; } - virtual Init *convertValue( DagInit *DI) { return 0; } - virtual Init *convertValue( TypedInit *TI) { return 0; } + virtual Init *convertValue(VarBitInit *VB) { return nullptr; } + virtual Init *convertValue( DefInit *DI) { return nullptr; } + virtual Init *convertValue( DagInit *DI) { return nullptr; } + virtual Init *convertValue( TypedInit *TI) { return nullptr; } virtual Init *convertValue( VarInit *VI) { return convertValue((TypedInit*)VI); } @@ -137,12 +137,12 @@ inline raw_ostream &operator<<(raw_ostream &OS, const RecTy &Ty) { return OS; } - /// BitRecTy - 'bit' - Represent a single bit /// class BitRecTy : public RecTy { static BitRecTy Shared; BitRecTy() : RecTy(BitRecTyKind) {} + public: static bool classof(const RecTy *RT) { return RT->getRecTyKind() == BitRecTyKind; @@ -154,11 +154,11 @@ public: Init *convertValue( BitInit *BI) override { return (Init*)BI; } Init *convertValue( BitsInit *BI) override; Init *convertValue( IntInit *II) override; - Init *convertValue(StringInit *SI) override { return 0; } - Init *convertValue( ListInit *LI) override { return 0; } + Init *convertValue(StringInit *SI) override { return nullptr; } + 
Init *convertValue( ListInit *LI) override { return nullptr; } Init *convertValue(VarBitInit *VB) override { return (Init*)VB; } - Init *convertValue( DefInit *DI) override { return 0; } - Init *convertValue( DagInit *DI) override { return 0; } + Init *convertValue( DefInit *DI) override { return nullptr; } + Init *convertValue( DagInit *DI) override { return nullptr; } Init *convertValue( UnOpInit *UI) override { return RecTy::convertValue(UI);} Init *convertValue( BinOpInit *UI) override { return RecTy::convertValue(UI);} Init *convertValue( TernOpInit *UI) override {return RecTy::convertValue(UI);} @@ -174,12 +174,12 @@ public: bool baseClassOf(const RecTy*) const override; }; - /// BitsRecTy - 'bits' - Represent a fixed number of bits /// class BitsRecTy : public RecTy { unsigned Size; explicit BitsRecTy(unsigned Sz) : RecTy(BitsRecTyKind), Size(Sz) {} + public: static bool classof(const RecTy *RT) { return RT->getRecTyKind() == BitsRecTyKind; @@ -193,32 +193,32 @@ public: Init *convertValue( BitInit *UI) override; Init *convertValue( BitsInit *BI) override; Init *convertValue( IntInit *II) override; - Init *convertValue(StringInit *SI) override { return 0; } - Init *convertValue( ListInit *LI) override { return 0; } - Init *convertValue(VarBitInit *VB) override { return 0; } - Init *convertValue( DefInit *DI) override { return 0; } - Init *convertValue( DagInit *DI) override { return 0; } - Init *convertValue( UnOpInit *UI) override { return RecTy::convertValue(UI);} + Init *convertValue(StringInit *SI) override { return nullptr; } + Init *convertValue( ListInit *LI) override { return nullptr; } + Init *convertValue(VarBitInit *VB) override { return nullptr; } + Init *convertValue( DefInit *DI) override { return nullptr; } + Init *convertValue( DagInit *DI) override { return nullptr; } + Init *convertValue( UnOpInit *UI) override { return RecTy::convertValue(UI);} Init *convertValue( BinOpInit *UI) override { return RecTy::convertValue(UI);} - Init *convertValue( TernOpInit *UI) override {return RecTy::convertValue(UI);} + Init *convertValue(TernOpInit *UI) override { return RecTy::convertValue(UI);} Init *convertValue( TypedInit *TI) override; - Init *convertValue( VarInit *VI) override{ return RecTy::convertValue(VI);} - Init *convertValue( FieldInit *FI) override{ return RecTy::convertValue(FI);} + Init *convertValue( VarInit *VI) override { return RecTy::convertValue(VI);} + Init *convertValue( FieldInit *FI) override { return RecTy::convertValue(FI);} std::string getAsString() const override; - bool typeIsConvertibleTo(const RecTy *RHS) const override{ + bool typeIsConvertibleTo(const RecTy *RHS) const override { return RHS->baseClassOf(this); } bool baseClassOf(const RecTy*) const override; }; - /// IntRecTy - 'int' - Represent an integer value of no particular size /// class IntRecTy : public RecTy { static IntRecTy Shared; IntRecTy() : RecTy(IntRecTyKind) {} + public: static bool classof(const RecTy *RT) { return RT->getRecTyKind() == IntRecTyKind; @@ -230,11 +230,11 @@ public: Init *convertValue( BitInit *BI) override; Init *convertValue( BitsInit *BI) override; Init *convertValue( IntInit *II) override { return (Init*)II; } - Init *convertValue(StringInit *SI) override { return 0; } - Init *convertValue( ListInit *LI) override { return 0; } - Init *convertValue(VarBitInit *VB) override { return 0; } - Init *convertValue( DefInit *DI) override { return 0; } - Init *convertValue( DagInit *DI) override { return 0; } + Init *convertValue(StringInit *SI) override { return 
nullptr; } + Init *convertValue( ListInit *LI) override { return nullptr; } + Init *convertValue(VarBitInit *VB) override { return nullptr; } + Init *convertValue( DefInit *DI) override { return nullptr; } + Init *convertValue( DagInit *DI) override { return nullptr; } Init *convertValue( UnOpInit *UI) override { return RecTy::convertValue(UI);} Init *convertValue( BinOpInit *UI) override { return RecTy::convertValue(UI);} Init *convertValue( TernOpInit *UI) override {return RecTy::convertValue(UI);} @@ -256,6 +256,7 @@ public: class StringRecTy : public RecTy { static StringRecTy Shared; StringRecTy() : RecTy(StringRecTyKind) {} + public: static bool classof(const RecTy *RT) { return RT->getRecTyKind() == StringRecTyKind; @@ -264,18 +265,18 @@ public: static StringRecTy *get() { return &Shared; } Init *convertValue( UnsetInit *UI) override { return (Init*)UI; } - Init *convertValue( BitInit *BI) override { return 0; } - Init *convertValue( BitsInit *BI) override { return 0; } - Init *convertValue( IntInit *II) override { return 0; } + Init *convertValue( BitInit *BI) override { return nullptr; } + Init *convertValue( BitsInit *BI) override { return nullptr; } + Init *convertValue( IntInit *II) override { return nullptr; } Init *convertValue(StringInit *SI) override { return (Init*)SI; } - Init *convertValue( ListInit *LI) override { return 0; } + Init *convertValue( ListInit *LI) override { return nullptr; } Init *convertValue( UnOpInit *BO) override; Init *convertValue( BinOpInit *BO) override; Init *convertValue( TernOpInit *BO) override {return RecTy::convertValue(BO);} - Init *convertValue(VarBitInit *VB) override { return 0; } - Init *convertValue( DefInit *DI) override { return 0; } - Init *convertValue( DagInit *DI) override { return 0; } + Init *convertValue(VarBitInit *VB) override { return nullptr; } + Init *convertValue( DefInit *DI) override { return nullptr; } + Init *convertValue( DagInit *DI) override { return nullptr; } Init *convertValue( TypedInit *TI) override; Init *convertValue( VarInit *VI) override { return RecTy::convertValue(VI);} Init *convertValue( FieldInit *FI) override { return RecTy::convertValue(FI);} @@ -294,6 +295,7 @@ class ListRecTy : public RecTy { RecTy *Ty; explicit ListRecTy(RecTy *T) : RecTy(ListRecTyKind), Ty(T) {} friend ListRecTy *RecTy::getListTy(); + public: static bool classof(const RecTy *RT) { return RT->getRecTyKind() == ListRecTyKind; @@ -303,24 +305,24 @@ public: RecTy *getElementType() const { return Ty; } Init *convertValue( UnsetInit *UI) override { return (Init*)UI; } - Init *convertValue( BitInit *BI) override { return 0; } - Init *convertValue( BitsInit *BI) override { return 0; } - Init *convertValue( IntInit *II) override { return 0; } - Init *convertValue(StringInit *SI) override { return 0; } + Init *convertValue( BitInit *BI) override { return nullptr; } + Init *convertValue( BitsInit *BI) override { return nullptr; } + Init *convertValue( IntInit *II) override { return nullptr; } + Init *convertValue(StringInit *SI) override { return nullptr; } Init *convertValue( ListInit *LI) override; - Init *convertValue(VarBitInit *VB) override { return 0; } - Init *convertValue( DefInit *DI) override { return 0; } - Init *convertValue( DagInit *DI) override { return 0; } - Init *convertValue( UnOpInit *UI) override { return RecTy::convertValue(UI);} + Init *convertValue(VarBitInit *VB) override { return nullptr; } + Init *convertValue( DefInit *DI) override { return nullptr; } + Init *convertValue( DagInit *DI) override { return 
nullptr; } + Init *convertValue( UnOpInit *UI) override { return RecTy::convertValue(UI);} Init *convertValue( BinOpInit *UI) override { return RecTy::convertValue(UI);} - Init *convertValue( TernOpInit *UI) override{ return RecTy::convertValue(UI);} + Init *convertValue(TernOpInit *UI) override { return RecTy::convertValue(UI);} Init *convertValue( TypedInit *TI) override; Init *convertValue( VarInit *VI) override { return RecTy::convertValue(VI);} Init *convertValue( FieldInit *FI) override { return RecTy::convertValue(FI);} std::string getAsString() const override; - bool typeIsConvertibleTo(const RecTy *RHS) const override{ + bool typeIsConvertibleTo(const RecTy *RHS) const override { return RHS->baseClassOf(this); } @@ -332,6 +334,7 @@ public: class DagRecTy : public RecTy { static DagRecTy Shared; DagRecTy() : RecTy(DagRecTyKind) {} + public: static bool classof(const RecTy *RT) { return RT->getRecTyKind() == DagRecTyKind; @@ -340,13 +343,13 @@ public: static DagRecTy *get() { return &Shared; } Init *convertValue( UnsetInit *UI) override { return (Init*)UI; } - Init *convertValue( BitInit *BI) override { return 0; } - Init *convertValue( BitsInit *BI) override { return 0; } - Init *convertValue( IntInit *II) override { return 0; } - Init *convertValue(StringInit *SI) override { return 0; } - Init *convertValue( ListInit *LI) override { return 0; } - Init *convertValue(VarBitInit *VB) override { return 0; } - Init *convertValue( DefInit *DI) override { return 0; } + Init *convertValue( BitInit *BI) override { return nullptr; } + Init *convertValue( BitsInit *BI) override { return nullptr; } + Init *convertValue( IntInit *II) override { return nullptr; } + Init *convertValue(StringInit *SI) override { return nullptr; } + Init *convertValue( ListInit *LI) override { return nullptr; } + Init *convertValue(VarBitInit *VB) override { return nullptr; } + Init *convertValue( DefInit *DI) override { return nullptr; } Init *convertValue( UnOpInit *BO) override; Init *convertValue( BinOpInit *BO) override; Init *convertValue( TernOpInit *BO) override {return RecTy::convertValue(BO);} @@ -357,12 +360,11 @@ public: std::string getAsString() const override { return "dag"; } - bool typeIsConvertibleTo(const RecTy *RHS) const override{ + bool typeIsConvertibleTo(const RecTy *RHS) const override { return RHS->baseClassOf(this); } }; - /// RecordRecTy - '[classname]' - Represent an instance of a class, such as: /// (R32 X = EAX). 
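Before moving into RecordRecTy's members, a hedged sketch of the convertValue contract these RecTy classes keep restating: a conversion either yields an Init of the receiver's type or, now spelled explicitly, nullptr:

    // Hypothetical TableGen-internal usage.
    BitInit *B = BitInit::get(true);
    Init *AsInt = IntRecTy::get()->convertValue(B);  // an IntInit of value 1
    Init *Bad = IntRecTy::get()->convertValue(StringInit::get("x"));
    // Bad == nullptr: 'int' cannot absorb a string initializer.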
/// @@ -370,6 +372,7 @@ class RecordRecTy : public RecTy { Record *Rec; explicit RecordRecTy(Record *R) : RecTy(RecordRecTyKind), Rec(R) {} friend class Record; + public: static bool classof(const RecTy *RT) { return RT->getRecTyKind() == RecordRecTyKind; @@ -380,17 +383,17 @@ public: Record *getRecord() const { return Rec; } Init *convertValue( UnsetInit *UI) override { return (Init*)UI; } - Init *convertValue( BitInit *BI) override { return 0; } - Init *convertValue( BitsInit *BI) override { return 0; } - Init *convertValue( IntInit *II) override { return 0; } - Init *convertValue(StringInit *SI) override { return 0; } - Init *convertValue( ListInit *LI) override { return 0; } - Init *convertValue(VarBitInit *VB) override { return 0; } + Init *convertValue( BitInit *BI) override { return nullptr; } + Init *convertValue( BitsInit *BI) override { return nullptr; } + Init *convertValue( IntInit *II) override { return nullptr; } + Init *convertValue(StringInit *SI) override { return nullptr; } + Init *convertValue( ListInit *LI) override { return nullptr; } + Init *convertValue(VarBitInit *VB) override { return nullptr; } Init *convertValue( UnOpInit *UI) override { return RecTy::convertValue(UI);} Init *convertValue( BinOpInit *UI) override { return RecTy::convertValue(UI);} Init *convertValue( TernOpInit *UI) override {return RecTy::convertValue(UI);} Init *convertValue( DefInit *DI) override; - Init *convertValue( DagInit *DI) override { return 0; } + Init *convertValue( DagInit *DI) override { return nullptr; } Init *convertValue( TypedInit *VI) override; Init *convertValue( VarInit *VI) override { return RecTy::convertValue(VI);} Init *convertValue( FieldInit *FI) override { return RecTy::convertValue(FI);} @@ -496,7 +499,7 @@ public: /// virtual Init * convertInitializerBitRange(const std::vector &Bits) const { - return 0; + return nullptr; } /// convertInitListSlice - This method is used to implement the list slice @@ -506,14 +509,16 @@ public: /// virtual Init * convertInitListSlice(const std::vector &Elements) const { - return 0; + return nullptr; } /// getFieldType - This method is used to implement the FieldInit class. /// Implementors of this method should return the type of the named field if /// they are of record type. /// - virtual RecTy *getFieldType(const std::string &FieldName) const { return 0; } + virtual RecTy *getFieldType(const std::string &FieldName) const { + return nullptr; + } /// getFieldInit - This method complements getFieldType to return the /// initializer for the specified field. If getFieldType returns non-null @@ -521,7 +526,7 @@ public: /// virtual Init *getFieldInit(Record &R, const RecordVal *RV, const std::string &FieldName) const { - return 0; + return nullptr; } /// resolveReferences - This method is used by classes that refer to other @@ -587,7 +592,6 @@ public: unsigned Elt) const = 0; }; - /// UnsetInit - ? - Represents an uninitialized value /// class UnsetInit : public Init { @@ -614,7 +618,6 @@ public: std::string getAsString() const override { return "?"; } }; - /// BitInit - true/false - Represent a concrete initializer for a bit. /// class BitInit : public Init { @@ -693,7 +696,6 @@ public: } }; - /// IntInit - 7 - Represent an initialization by a literal integer value. /// class IntInit : public TypedInit { @@ -734,7 +736,6 @@ public: } }; - /// StringInit - "foo" - Represent an initialization by a string value. 
/// class StringInit : public TypedInit { @@ -779,6 +780,7 @@ public: /// class ListInit : public TypedInit, public FoldingSetNode { std::vector Values; + public: typedef std::vector::const_iterator const_iterator; @@ -841,7 +843,6 @@ public: } }; - /// OpInit - Base class for operators /// class OpInit : public TypedInit { @@ -876,12 +877,12 @@ public: Init *getBit(unsigned Bit) const override; }; - /// UnOpInit - !op (X) - Transform an init. /// class UnOpInit : public OpInit { public: enum UnaryOp { CAST, HEAD, TAIL, EMPTY }; + private: UnaryOp Opc; Init *LHS; @@ -927,7 +928,8 @@ public: /// class BinOpInit : public OpInit { public: - enum BinaryOp { ADD, SHL, SRA, SRL, STRCONCAT, CONCAT, EQ }; + enum BinaryOp { ADD, SHL, SRA, SRL, LISTCONCAT, STRCONCAT, CONCAT, EQ }; + private: BinaryOp Opc; Init *LHS, *RHS; @@ -980,6 +982,7 @@ public: class TernOpInit : public OpInit { public: enum TernaryOp { SUBST, FOREACH, IF }; + private: TernaryOp Opc; Init *LHS, *MHS, *RHS; @@ -1036,7 +1039,6 @@ public: std::string getAsString() const override; }; - /// VarInit - 'Opcode' - Represent a reference to an entire variable object. /// class VarInit : public TypedInit { @@ -1086,7 +1088,6 @@ public: std::string getAsString() const override { return getName(); } }; - /// VarBitInit - Opcode{0} - Represent access to one bit of a variable or field. /// class VarBitInit : public Init { @@ -1212,7 +1213,6 @@ public: } }; - /// FieldInit - X.Y - Represent a reference to a subfield of a variable /// class FieldInit : public TypedInit { @@ -1339,6 +1339,7 @@ class RecordVal { RecTy *Ty; unsigned Prefix; Init *Value; + public: RecordVal(Init *N, RecTy *T, unsigned P); RecordVal(const std::string &N, RecTy *T, unsigned P); @@ -1356,9 +1357,9 @@ public: bool setValue(Init *V) { if (V) { Value = V->convertInitializerTo(Ty); - return Value == 0; + return Value == nullptr; } - Value = 0; + Value = nullptr; return false; } @@ -1395,18 +1396,17 @@ class Record { void checkName(); public: - // Constructs a record. 
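One subtlety in the RecordVal::setValue hunk above deserves a note: the function returns true on failure, i.e. when convertInitializerTo cannot produce a value of the field's type. A hedged sketch of that inverted convention:

    // Hypothetical usage; the field name "size" is a placeholder.
    RecordVal RV("size", IntRecTy::get(), 0);
    bool Failed = RV.setValue(IntInit::get(8)); // int -> int always converts
    assert(!Failed && "setValue returns true only on conversion failure");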
explicit Record(const std::string &N, ArrayRef locs, RecordKeeper &records, bool Anonymous = false) : ID(LastID++), Name(StringInit::get(N)), Locs(locs.begin(), locs.end()), - TrackedRecords(records), TheInit(0), IsAnonymous(Anonymous) { + TrackedRecords(records), TheInit(nullptr), IsAnonymous(Anonymous) { init(); } explicit Record(Init *N, ArrayRef locs, RecordKeeper &records, bool Anonymous = false) : ID(LastID++), Name(N), Locs(locs.begin(), locs.end()), - TrackedRecords(records), TheInit(0), IsAnonymous(Anonymous) { + TrackedRecords(records), TheInit(nullptr), IsAnonymous(Anonymous) { init(); } @@ -1420,10 +1420,8 @@ public: ~Record() {} - static unsigned getNewUID() { return LastID++; } - unsigned getID() const { return ID; } const std::string &getName() const; @@ -1461,7 +1459,7 @@ public: const RecordVal *getValue(const Init *Name) const { for (unsigned i = 0, e = Values.size(); i != e; ++i) if (Values[i].getNameInit() == Name) return &Values[i]; - return 0; + return nullptr; } const RecordVal *getValue(StringRef Name) const { return getValue(StringInit::get(Name)); @@ -1469,7 +1467,7 @@ public: RecordVal *getValue(const Init *Name) { for (unsigned i = 0, e = Values.size(); i != e; ++i) if (Values[i].getNameInit() == Name) return &Values[i]; - return 0; + return nullptr; } RecordVal *getValue(StringRef Name) { return getValue(StringInit::get(Name)); @@ -1484,7 +1482,7 @@ public: } void addValue(const RecordVal &RV) { - assert(getValue(RV.getNameInit()) == 0 && "Value already added!"); + assert(getValue(RV.getNameInit()) == nullptr && "Value already added!"); Values.push_back(RV); if (Values.size() > 1) // Keep NAME at the end of the list. It makes record dumps a @@ -1531,7 +1529,7 @@ public: /// resolveReferences - If there are any field references that refer to fields /// that have been filled in, we can propagate the values now. /// - void resolveReferences() { resolveReferencesTo(0); } + void resolveReferences() { resolveReferencesTo(nullptr); } /// resolveReferencesTo - If anything in this record refers to RV, replace the /// reference to RV with the RHS of RV. If RV is null, we resolve all @@ -1660,11 +1658,11 @@ public: Record *getClass(const std::string &Name) const { std::map::const_iterator I = Classes.find(Name); - return I == Classes.end() ? 0 : I->second; + return I == Classes.end() ? nullptr : I->second; } Record *getDef(const std::string &Name) const { std::map::const_iterator I = Defs.find(Name); - return I == Defs.end() ? 0 : I->second; + return I == Defs.end() ? nullptr : I->second; } void addClass(Record *R) { bool Ins = Classes.insert(std::make_pair(R->getName(), R)).second; diff --git a/include/llvm/TableGen/StringMatcher.h b/include/llvm/TableGen/StringMatcher.h index 99cbcad..b438779 100644 --- a/include/llvm/TableGen/StringMatcher.h +++ b/include/llvm/TableGen/StringMatcher.h @@ -21,29 +21,29 @@ namespace llvm { class raw_ostream; - + /// StringMatcher - Given a list of strings and code to execute when they match, /// output a simple switch tree to classify the input string. -/// +/// /// If a match is found, the code in Vals[i].second is executed; control must /// not exit this code fragment. If nothing matches, execution falls through. 
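As a usage sketch for the StringMatcher class declared just below (the mnemonic strings and emitted code fragments are placeholders):

    #include "llvm/TableGen/StringMatcher.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Emit a switch tree that classifies the variable 'Mnemonic'.
    std::vector<StringMatcher::StringPair> Matches;
    Matches.push_back(StringMatcher::StringPair("add", "return 1;"));
    Matches.push_back(StringMatcher::StringPair("sub", "return 2;"));
    StringMatcher("Mnemonic", Matches, outs()).Emit();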
 ///
 class StringMatcher {
 public:
   typedef std::pair<std::string, std::string> StringPair;
+
 private:
   StringRef StrVariableName;
   const std::vector<StringPair> &Matches;
   raw_ostream &OS;
-
+
 public:
   StringMatcher(StringRef strVariableName,
                 const std::vector<StringPair> &matches, raw_ostream &os)
     : StrVariableName(strVariableName), Matches(matches), OS(os) {}
-
+
   void Emit(unsigned Indent = 0) const;
-
-
+
 private:
   bool EmitStringMatcherForChar(const std::vector<const StringPair*> &Matches,
                                 unsigned CharNo, unsigned IndentCount) const;
diff --git a/include/llvm/TableGen/StringToOffsetTable.h b/include/llvm/TableGen/StringToOffsetTable.h
index d94d3a2..c924bd8 100644
--- a/include/llvm/TableGen/StringToOffsetTable.h
+++ b/include/llvm/TableGen/StringToOffsetTable.h
@@ -25,8 +25,8 @@ namespace llvm {
 class StringToOffsetTable {
   StringMap<unsigned> StringOffset;
   std::string AggregateString;
+
 public:
-
   unsigned GetOrAddStringOffset(StringRef Str, bool appendZero = true) {
     StringMapEntry<unsigned> &Entry = StringOffset.GetOrCreateValue(Str, -1U);
     if (Entry.getValue() == -1U) {
@@ -36,10 +36,10 @@ public:
       if (appendZero)
         AggregateString += '\0';
     }
-
+
     return Entry.getValue();
   }
-
+
   void EmitString(raw_ostream &O) {
     // Escape the string.
     SmallString<256> Str;
@@ -55,11 +55,11 @@ public:
       }
       O << AggregateString[i];
       ++CharsPrinted;
-
+
       // Print escape sequences all together.
       if (AggregateString[i] != '\\')
         continue;
-
+
       assert(i+1 < AggregateString.size() && "Incomplete escape sequence!");
       if (isdigit(AggregateString[i+1])) {
         assert(isdigit(AggregateString[i+2]) &&
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index facb89a..7d1f19c 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -950,10 +950,15 @@ class MnemonicAlias {
 /// InstAlias - This defines an alternate assembly syntax that is allowed to
 /// match an instruction that has a different (more canonical) assembly
 /// representation.
-class InstAlias<string Asm, dag Result, bit Emit = 0b1> {
+class InstAlias<string Asm, dag Result, int Emit = 1> {
   string AsmString = Asm;   // The .s format to match the instruction with.
   dag ResultInst = Result;  // The MCInst to generate.
-  bit EmitAlias = Emit;     // Emit the alias instead of what's aliased.
+
+  // This determines the order in which the InstPrinter detects aliases when
+  // printing. A larger value makes the alias more likely to be
+  // emitted. The Instruction's own definition is notionally 0.5, so 0
+  // disables printing and 1 enables it if there are no conflicting aliases.
+  int EmitPriority = Emit;
   // Predicates - Predicates that must be true for this to match.
list Predicates = []; diff --git a/include/llvm/Target/TargetCallingConv.h b/include/llvm/Target/TargetCallingConv.h index a660403..a0f2674 100644 --- a/include/llvm/Target/TargetCallingConv.h +++ b/include/llvm/Target/TargetCallingConv.h @@ -47,8 +47,12 @@ namespace ISD { static const uint64_t InAllocaOffs = 12; static const uint64_t OrigAlign = 0x1FULL<<27; static const uint64_t OrigAlignOffs = 27; - static const uint64_t ByValSize = 0xffffffffULL<<32; ///< Struct size + static const uint64_t ByValSize = 0x3fffffffULL<<32; ///< Struct size static const uint64_t ByValSizeOffs = 32; + static const uint64_t InConsecutiveRegsLast = 0x1ULL<<62; ///< Struct size + static const uint64_t InConsecutiveRegsLastOffs = 62; + static const uint64_t InConsecutiveRegs = 0x1ULL<<63; ///< Struct size + static const uint64_t InConsecutiveRegsOffs = 63; static const uint64_t One = 1ULL; ///< 1 of this type, for shifts @@ -80,6 +84,12 @@ namespace ISD { bool isReturned() const { return Flags & Returned; } void setReturned() { Flags |= One << ReturnedOffs; } + bool isInConsecutiveRegs() const { return Flags & InConsecutiveRegs; } + void setInConsecutiveRegs() { Flags |= One << InConsecutiveRegsOffs; } + + bool isInConsecutiveRegsLast() const { return Flags & InConsecutiveRegsLast; } + void setInConsecutiveRegsLast() { Flags |= One << InConsecutiveRegsLastOffs; } + unsigned getByValAlign() const { return (unsigned) ((One << ((Flags & ByValAlign) >> ByValAlignOffs)) / 2); diff --git a/include/llvm/Target/TargetCallingConv.td b/include/llvm/Target/TargetCallingConv.td index 9d1dc38..8f31e08 100644 --- a/include/llvm/Target/TargetCallingConv.td +++ b/include/llvm/Target/TargetCallingConv.td @@ -42,6 +42,11 @@ class CCIf : CCPredicateAction { class CCIfByVal : CCIf<"ArgFlags.isByVal()", A> { } +/// CCIfConsecutiveRegs - If the current argument has InConsecutiveRegs +/// parameter attribute, apply Action A. +class CCIfConsecutiveRegs : CCIf<"ArgFlags.isInConsecutiveRegs()", A> { +} + /// CCIfCC - Match if the current calling convention is 'CC'. class CCIfCC : CCIf {} diff --git a/include/llvm/Target/TargetFrameLowering.h b/include/llvm/Target/TargetFrameLowering.h index a60147f..7c42e23 100644 --- a/include/llvm/Target/TargetFrameLowering.h +++ b/include/llvm/Target/TargetFrameLowering.h @@ -105,7 +105,7 @@ public: virtual const SpillSlot * getCalleeSavedSpillSlots(unsigned &NumEntries) const { NumEntries = 0; - return 0; + return nullptr; } /// targetHandlesStackFrameRounding - Returns true if the target is @@ -190,7 +190,7 @@ public: /// before PrologEpilogInserter scans the physical registers used to determine /// what callee saved registers should be spilled. This method is optional. virtual void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = NULL) const { + RegScavenger *RS = nullptr) const { } @@ -200,7 +200,7 @@ public: /// replaced with direct constants. This method is optional. /// virtual void processFunctionBeforeFrameFinalized(MachineFunction &MF, - RegScavenger *RS = NULL) const { + RegScavenger *RS = nullptr) const { } /// eliminateCallFramePseudoInstr - This method is called during prolog/epilog diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h index d4e14f6..165b35f 100644 --- a/include/llvm/Target/TargetInstrInfo.h +++ b/include/llvm/Target/TargetInstrInfo.h @@ -66,7 +66,7 @@ public: /// rematerializable, meaning it has no side effects and requires no operands /// that aren't always available. 
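Returning to the InConsecutiveRegs flag pair added to ArgFlagsTy above, a hedged sketch of how a backend's calling-convention code might consume it (Outs and the register bookkeeping are hypothetical):

    // Walk the pieces of a split aggregate that must occupy adjacent
    // registers; the 'Last' flag marks the aggregate's final piece.
    for (const ISD::OutputArg &Out : Outs) {
      if (!Out.Flags.isInConsecutiveRegs())
        continue;
      // ...assign the next contiguous register to this piece...
      if (Out.Flags.isInConsecutiveRegsLast()) {
        // Aggregate complete; normal register allocation resumes here.
      }
    }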
bool isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA = 0) const { + AliasAnalysis *AA = nullptr) const { return MI->getOpcode() == TargetOpcode::IMPLICIT_DEF || (MI->getDesc().isRematerializable() && (isReallyTriviallyReMaterializable(MI, AA) || @@ -230,7 +230,7 @@ public: virtual MachineInstr * convertToThreeAddress(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const { - return 0; + return nullptr; } /// commuteInstruction - If a target has any instructions that are @@ -257,7 +257,7 @@ public: /// aggressive checks. virtual bool produceSameValue(const MachineInstr *MI0, const MachineInstr *MI1, - const MachineRegisterInfo *MRI = 0) const; + const MachineRegisterInfo *MRI = nullptr) const; /// AnalyzeBranch - Analyze the branching code at the end of MBB, returning /// true if it cannot be understood (e.g. it's a switch dispatch or isn't @@ -555,7 +555,7 @@ protected: MachineInstr* MI, const SmallVectorImpl &Ops, int FrameIndex) const { - return 0; + return nullptr; } /// foldMemoryOperandImpl - Target-dependent implementation for @@ -565,7 +565,7 @@ protected: MachineInstr* MI, const SmallVectorImpl &Ops, MachineInstr* LoadMI) const { - return 0; + return nullptr; } public: @@ -597,7 +597,7 @@ public: /// value. virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex = 0) const { + unsigned *LoadRegIndex = nullptr) const { return 0; } @@ -780,7 +780,7 @@ public: const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, MachineInstr *&DefMI) const { - return 0; + return nullptr; } /// FoldImmediate - 'Reg' is known to be defined by a move immediate @@ -838,7 +838,7 @@ public: /// PredCost. virtual unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr *MI, - unsigned *PredCost = 0) const; + unsigned *PredCost = nullptr) const; virtual unsigned getPredicationCost(const MachineInstr *MI) const; @@ -1003,7 +1003,7 @@ public: /// Create machine specific model for scheduling. virtual DFAPacketizer* CreateTargetScheduleState(const TargetMachine*, const ScheduleDAG*) const { - return NULL; + return nullptr; } private: diff --git a/include/llvm/Target/TargetIntrinsicInfo.h b/include/llvm/Target/TargetIntrinsicInfo.h index ce21349..6de264e 100644 --- a/include/llvm/Target/TargetIntrinsicInfo.h +++ b/include/llvm/Target/TargetIntrinsicInfo.h @@ -40,7 +40,7 @@ public: /// intrinsic, Tys should point to an array of numTys pointers to Type, /// and must provide exactly one type for each overloaded type in the /// intrinsic. - virtual std::string getName(unsigned IID, Type **Tys = 0, + virtual std::string getName(unsigned IID, Type **Tys = nullptr, unsigned numTys = 0) const = 0; /// Look up target intrinsic by name. Return intrinsic ID or 0 for unknown @@ -56,7 +56,7 @@ public: /// Create or insert an LLVM Function declaration for an intrinsic, /// and return it. The Tys and numTys are for intrinsics with overloaded /// types. See above for more information. 
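A hedged sketch of driving this interface together with the getDeclaration overload shown just below; the intrinsic name, the TII reference, and the Module M are hypothetical, and lookupName's exact signature is assumed from the comment above:

    // Map a name to a target intrinsic ID; 0 means "not a target intrinsic".
    unsigned IID = TII.lookupName("llvm.mytarget.foo", /*Len=*/17);
    if (IID != 0)
      Function *F = TII.getDeclaration(&M, IID); // Tys/numTys default to nullptr/0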
- virtual Function *getDeclaration(Module *M, unsigned ID, Type **Tys = 0, + virtual Function *getDeclaration(Module *M, unsigned ID, Type **Tys = nullptr, unsigned numTys = 0) const = 0; }; diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 2f6445f..60a4079 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -31,6 +31,8 @@ #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Target/TargetCallingConv.h" #include "llvm/Target/TargetMachine.h" #include @@ -180,6 +182,9 @@ public: return HasMultipleConditionRegisters; } + /// Return true if the target has BitExtract instructions. + bool hasExtractBitsInsn() const { return HasExtractBitsInsn; } + /// Return true if a vector of the given type should be split /// (TypeSplitVector) instead of promoted (TypePromoteInteger) during type /// legalization. @@ -322,7 +327,7 @@ public: bool isTypeLegal(EVT VT) const { assert(!VT.isSimple() || (unsigned)VT.getSimpleVT().SimpleTy < array_lengthof(RegClassForVT)); - return VT.isSimple() && RegClassForVT[VT.getSimpleVT().SimpleTy] != 0; + return VT.isSimple() && RegClassForVT[VT.getSimpleVT().SimpleTy] != nullptr; } class ValueTypeActionImpl { @@ -332,7 +337,7 @@ public: public: ValueTypeActionImpl() { - std::fill(ValueTypeActions, array_endof(ValueTypeActions), 0); + std::fill(std::begin(ValueTypeActions), std::end(ValueTypeActions), 0); } LegalizeTypeAction getTypeAction(MVT VT) const { @@ -754,7 +759,7 @@ public: /// alignment error (trap) on the target machine. virtual bool allowsUnalignedMemoryAccesses(EVT, unsigned AddrSpace = 0, - bool * /*Fast*/ = 0) const { + bool * /*Fast*/ = nullptr) const { return false; } @@ -896,6 +901,35 @@ public: /// @} //===--------------------------------------------------------------------===// + /// \name Helpers for load-linked/store-conditional atomic expansion. + /// @{ + + /// Perform a load-linked operation on Addr, returning a "Value *" with the + /// corresponding pointee type. This may entail some non-trivial operations to + /// truncate or reconstruct types that will be illegal in the backend. See + /// ARMISelLowering for an example implementation. + virtual Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const { + llvm_unreachable("Load linked unimplemented on this target"); + } + + /// Perform a store-conditional operation to Addr. Return the status of the + /// store. This should be 0 if the store succeeded, non-zero otherwise. + virtual Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, + Value *Addr, AtomicOrdering Ord) const { + llvm_unreachable("Store conditional unimplemented on this target"); + } + + /// Return true if the given (atomic) instruction should be expanded by the + /// IR-level AtomicExpandLoadLinked pass into a loop involving + /// load-linked/store-conditional pairs. Atomic stores will be expanded in the + /// same way as "atomic xchg" operations which ignore their output if needed. + virtual bool shouldExpandAtomicInIR(Instruction *Inst) const { + return false; + } + + + //===--------------------------------------------------------------------===// // TargetLowering Configuration Methods - These methods should be invoked by // the derived class constructor to configure this object for the target. 
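The load-linked/store-conditional hooks above only fire when a matching shouldExpandAtomicInIR policy says so. A hedged sketch, loosely modeled on what a 32-bit target with 32-bit LL/SC would want (the width test is illustrative, not the ARM implementation):

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Expand only atomics wider than the native LL/SC width into
    // load-linked/store-conditional loops (via AtomicExpandLoadLinked).
    static bool shouldExpandAtomicInIRImpl(Instruction *I) {
      if (auto *AI = dyn_cast<AtomicRMWInst>(I))
        return AI->getValOperand()->getType()->getPrimitiveSizeInBits() == 64;
      if (auto *SI = dyn_cast<StoreInst>(I))
        return SI->isAtomic() &&
               SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 64;
      return false;
    }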
// @@ -975,6 +1009,14 @@ protected: HasMultipleConditionRegisters = hasManyRegs; } + /// Tells the code generator that the target has BitExtract instructions. + /// The code generator will aggressively sink "shift"s into the blocks of + /// their users if the users will generate "and" instructions which can be + /// combined with "shift" to BitExtract instructions. + void setHasExtractBitsInsn(bool hasExtractInsn = true) { + HasExtractBitsInsn = hasExtractInsn; + } + /// Tells the code generator not to expand sequence of operations into a /// separate sequences that increases the amount of flow control. void setJumpIsExpensive(bool isExpensive = true) { @@ -1178,7 +1220,7 @@ public: int64_t BaseOffs; bool HasBaseReg; int64_t Scale; - AddrMode() : BaseGV(0), BaseOffs(0), HasBaseReg(false), Scale(0) {} + AddrMode() : BaseGV(nullptr), BaseOffs(0), HasBaseReg(false), Scale(0) {} }; /// Return true if the addressing mode represented by AM is legal for this @@ -1394,6 +1436,12 @@ private: /// the blocks of their users. bool HasMultipleConditionRegisters; + /// Tells the code generator that the target has BitExtract instructions. + /// The code generator will aggressively sink "shift"s into the blocks of + /// their users if the users will generate "and" instructions which can be + /// combined with "shift" to BitExtract instructions. + bool HasExtractBitsInsn; + /// Tells the code generator not to expand integer divides by constants into a /// sequence of muls, adds, and shifts. This is a hack until a real cost /// model is in place. If we ever optimize for size, this will be set to true @@ -1895,15 +1943,16 @@ public: /// Determine which of the bits specified in Mask are known to be either zero /// or one and return them in the KnownZero/KnownOne bitsets. - virtual void computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth = 0) const; + virtual void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const; /// This method can be implemented by targets that want to expose additional /// information about sign bits to the DAG Combiner. virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op, + const SelectionDAG &DAG, unsigned Depth = 0) const; struct DAGCombinerInfo { @@ -1968,6 +2017,15 @@ public: /// virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + /// Return true if it is profitable to move a following shift through this + // node, adjusting any immediate operands as necessary to preserve semantics. + // This transformation may not be desirable if it disrupts a particularly + // auspicious target-specific tree (e.g. bitfield extraction in AArch64). + // By default, it returns true. + virtual bool isDesirableToCommuteWithShift(const SDNode *N /*Op*/) const { + return true; + } + /// Return true if the target has native support for the specified value type /// and it is 'desirable' to use the type for the given node type. e.g. 
On x86 /// i16 is legal, but undesirable since i16 instruction encodings are longer @@ -2053,7 +2111,7 @@ public: unsigned NumFixedArgs; CallingConv::ID CallConv; SDValue Callee; - ArgListTy &Args; + ArgListTy *Args; SelectionDAG &DAG; SDLoc DL; ImmutableCallSite *CS; @@ -2061,33 +2119,96 @@ public: SmallVector OutVals; SmallVector Ins; + CallLoweringInfo(SelectionDAG &DAG) + : RetTy(nullptr), RetSExt(false), RetZExt(false), IsVarArg(false), + IsInReg(false), DoesNotReturn(false), IsReturnValueUsed(true), + IsTailCall(false), NumFixedArgs(-1), CallConv(CallingConv::C), + Args(nullptr), DAG(DAG), CS(nullptr) {} + + CallLoweringInfo &setDebugLoc(SDLoc dl) { + DL = dl; + return *this; + } + + CallLoweringInfo &setChain(SDValue InChain) { + Chain = InChain; + return *this; + } + + CallLoweringInfo &setCallee(CallingConv::ID CC, Type *ResultType, + SDValue Target, ArgListTy *ArgsList, + unsigned FixedArgs = -1) { + RetTy = ResultType; + Callee = Target; + CallConv = CC; + NumFixedArgs = + (FixedArgs == static_cast(-1) ? Args->size() : FixedArgs); + Args = ArgsList; + return *this; + } + + CallLoweringInfo &setCallee(Type *ResultType, FunctionType *FTy, + SDValue Target, ArgListTy *ArgsList, + ImmutableCallSite &Call) { + RetTy = ResultType; + + IsInReg = Call.paramHasAttr(0, Attribute::InReg); + DoesNotReturn = Call.doesNotReturn(); + IsVarArg = FTy->isVarArg(); + IsReturnValueUsed = !Call.getInstruction()->use_empty(); + RetSExt = Call.paramHasAttr(0, Attribute::SExt); + RetZExt = Call.paramHasAttr(0, Attribute::ZExt); + + Callee = Target; + + CallConv = Call.getCallingConv(); + NumFixedArgs = FTy->getNumParams(); + Args = ArgsList; + + CS = &Call; - /// Constructs a call lowering context based on the ImmutableCallSite \p cs. - CallLoweringInfo(SDValue chain, Type *retTy, - FunctionType *FTy, bool isTailCall, SDValue callee, - ArgListTy &args, SelectionDAG &dag, SDLoc dl, - ImmutableCallSite &cs) - : Chain(chain), RetTy(retTy), RetSExt(cs.paramHasAttr(0, Attribute::SExt)), - RetZExt(cs.paramHasAttr(0, Attribute::ZExt)), IsVarArg(FTy->isVarArg()), - IsInReg(cs.paramHasAttr(0, Attribute::InReg)), - DoesNotReturn(cs.doesNotReturn()), - IsReturnValueUsed(!cs.getInstruction()->use_empty()), - IsTailCall(isTailCall), NumFixedArgs(FTy->getNumParams()), - CallConv(cs.getCallingConv()), Callee(callee), Args(args), DAG(dag), - DL(dl), CS(&cs) {} - - /// Constructs a call lowering context based on the provided call - /// information. 
- CallLoweringInfo(SDValue chain, Type *retTy, bool retSExt, bool retZExt, - bool isVarArg, bool isInReg, unsigned numFixedArgs, - CallingConv::ID callConv, bool isTailCall, - bool doesNotReturn, bool isReturnValueUsed, SDValue callee, - ArgListTy &args, SelectionDAG &dag, SDLoc dl) - : Chain(chain), RetTy(retTy), RetSExt(retSExt), RetZExt(retZExt), - IsVarArg(isVarArg), IsInReg(isInReg), DoesNotReturn(doesNotReturn), - IsReturnValueUsed(isReturnValueUsed), IsTailCall(isTailCall), - NumFixedArgs(numFixedArgs), CallConv(callConv), Callee(callee), - Args(args), DAG(dag), DL(dl), CS(NULL) {} + return *this; + } + + CallLoweringInfo &setInRegister(bool Value = true) { + IsInReg = Value; + return *this; + } + + CallLoweringInfo &setNoReturn(bool Value = true) { + DoesNotReturn = Value; + return *this; + } + + CallLoweringInfo &setVarArg(bool Value = true) { + IsVarArg = Value; + return *this; + } + + CallLoweringInfo &setTailCall(bool Value = true) { + IsTailCall = Value; + return *this; + } + + CallLoweringInfo &setDiscardResult(bool Value = true) { + IsReturnValueUsed = !Value; + return *this; + } + + CallLoweringInfo &setSExtResult(bool Value = true) { + RetSExt = Value; + return *this; + } + + CallLoweringInfo &setZExtResult(bool Value = true) { + RetZExt = Value; + return *this; + } + + ArgListTy &getArgs() { + assert(Args && "Arguments must be set before accessing them"); + return *Args; + } }; /// This function lowers an abstract call to a function into an actual call. @@ -2156,6 +2277,13 @@ public: return "__clear_cache"; } + /// Return the register ID of the name passed in. Used by named register + /// global variables extension. There is no target-independent behaviour + /// so the default action is to bail. + virtual unsigned getRegisterByName(const char* RegName, EVT VT) const { + report_fatal_error("Named registers not implemented for this target"); + } + /// Return the type that should be used to zero or sign extend a /// zeroext/signext integer argument or return value. FIXME: Most C calling /// convention requires the return type to be promoted, but this is not true @@ -2168,10 +2296,19 @@ public: return VT.bitsLT(MinVT) ? MinVT : VT; } + /// For some targets, an LLVM struct type must be broken down into multiple + /// simple types, but the calling convention specifies that the entire struct + /// must be passed in a block of consecutive registers. + virtual bool + functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, + bool isVarArg) const { + return false; + } + /// Returns a 0 terminated array of registers that can be safely used as /// scratch registers. - virtual const uint16_t *getScratchRegisters(CallingConv::ID CC) const { - return NULL; + virtual const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const { + return nullptr; } /// This callback is used to prepare for a volatile or atomic load. @@ -2232,7 +2369,7 @@ public: /// target does not support "fast" ISel. virtual FastISel *createFastISel(FunctionLoweringInfo &, const TargetLibraryInfo *) const { - return 0; + return nullptr; } @@ -2306,7 +2443,7 @@ public: AsmOperandInfo(const InlineAsm::ConstraintInfo &info) : InlineAsm::ConstraintInfo(info), ConstraintType(TargetLowering::C_Unknown), - CallOperandVal(0), ConstraintVT(MVT::Other) { + CallOperandVal(nullptr), ConstraintVT(MVT::Other) { } }; @@ -2334,7 +2471,7 @@ public: /// Op, otherwise an empty SDValue can be passed. 
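The removed positional constructors above are replaced by the fluent setters; a sketch of how a lowering path would now build the structure (Chain, Callee, RetTy, dl, and the argument list come from the surrounding lowering code and are assumed here):

    TargetLowering::ArgListTy Args;
    // ...populate Args from the call's operands...
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(dl)
       .setChain(Chain)
       .setCallee(CallingConv::C, RetTy, Callee, &Args)
       .setTailCall(false)
       .setSExtResult(true);
    // Fields left untouched keep the defaults established by
    // CallLoweringInfo(SelectionDAG &); getArgs() asserts if no argument
    // list was ever attached.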
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, - SelectionDAG *DAG = 0) const; + SelectionDAG *DAG = nullptr) const; /// Given a constraint, return the type of constraint it is for this target. virtual ConstraintType getConstraintType(const std::string &Constraint) const; @@ -2368,10 +2505,30 @@ public: // SDValue BuildExactSDIV(SDValue Op1, SDValue Op2, SDLoc dl, SelectionDAG &DAG) const; - SDValue BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, - std::vector *Created) const; - SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, - std::vector *Created) const; + SDValue BuildSDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, + bool IsAfterLegalization, + std::vector *Created) const; + SDValue BuildUDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, + bool IsAfterLegalization, + std::vector *Created) const; + + //===--------------------------------------------------------------------===// + // Legalization utility functions + // + + /// Expand a MUL into two nodes. One that computes the high bits of + /// the result and one that computes the low bits. + /// \param HiLoVT The value type to use for the Lo and Hi nodes. + /// \param LL Low bits of the LHS of the MUL. You can use this parameter + /// if you want to control how low bits are extracted from the LHS. + /// \param LH High bits of the LHS of the MUL. See LL for meaning. + /// \param RL Low bits of the RHS of the MUL. See LL for meaning + /// \param RH High bits of the RHS of the MUL. See LL for meaning. + /// \returns true if the node has been expanded. false if it has not + bool expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT, + SelectionDAG &DAG, SDValue LL = SDValue(), + SDValue LH = SDValue(), SDValue RL = SDValue(), + SDValue RH = SDValue()) const; //===--------------------------------------------------------------------===// // Instruction Emitting Hooks diff --git a/include/llvm/Target/TargetLoweringObjectFile.h b/include/llvm/Target/TargetLoweringObjectFile.h index cdb7ea6..374a163 100644 --- a/include/llvm/Target/TargetLoweringObjectFile.h +++ b/include/llvm/Target/TargetLoweringObjectFile.h @@ -44,7 +44,7 @@ class TargetLoweringObjectFile : public MCObjectFileInfo { public: MCContext &getContext() const { return *Ctx; } - TargetLoweringObjectFile() : MCObjectFileInfo(), Ctx(0), DL(0) {} + TargetLoweringObjectFile() : MCObjectFileInfo(), Ctx(nullptr), DL(nullptr) {} virtual ~TargetLoweringObjectFile(); @@ -104,7 +104,7 @@ public: virtual const MCSection *getSpecialCasedSectionGlobals(const GlobalValue *GV, SectionKind Kind, Mangler &Mang) const { - return 0; + return nullptr; } /// Return an MCExpr to use for a reference to the specified global variable @@ -130,14 +130,15 @@ public: getTTypeReference(const MCSymbolRefExpr *Sym, unsigned Encoding, MCStreamer &Streamer) const; - virtual const MCSection * - getStaticCtorSection(unsigned Priority = 65535) const { - (void)Priority; + virtual const MCSection *getStaticCtorSection(unsigned Priority, + const MCSymbol *KeySym, + const MCSection *KeySec) const { return StaticCtorSection; } - virtual const MCSection * - getStaticDtorSection(unsigned Priority = 65535) const { - (void)Priority; + + virtual const MCSection *getStaticDtorSection(unsigned Priority, + const MCSymbol *KeySym, + const MCSection *KeySec) const { return StaticDtorSection; } @@ -148,7 +149,7 @@ public: virtual const MCExpr * getExecutableRelativeSymbol(const ConstantExpr *CE, Mangler &Mang, const TargetMachine &TM) const { - 
return 0; + return nullptr; } /// \brief True if the section is atomized using the symbols in it. diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index ce3f866..b263c57 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -84,11 +84,6 @@ protected: // Can only create subclasses. /// const MCAsmInfo *AsmInfo; - unsigned MCRelaxAll : 1; - unsigned MCNoExecStack : 1; - unsigned MCSaveTempLabels : 1; - unsigned MCUseCFI : 1; - unsigned MCUseDwarfDirectory : 1; unsigned RequireStructuredCFG : 1; public: @@ -102,7 +97,9 @@ public: /// getSubtargetImpl - virtual method implemented by subclasses that returns /// a reference to that target's TargetSubtargetInfo-derived member variable. - virtual const TargetSubtargetInfo *getSubtargetImpl() const { return 0; } + virtual const TargetSubtargetInfo *getSubtargetImpl() const { + return nullptr; + } mutable TargetOptions Options; @@ -118,11 +115,15 @@ public: // // N.B. These objects may change during compilation. It's not safe to cache // them between functions. - virtual const TargetInstrInfo *getInstrInfo() const { return 0; } - virtual const TargetFrameLowering *getFrameLowering() const { return 0; } - virtual const TargetLowering *getTargetLowering() const { return 0; } - virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const{ return 0; } - virtual const DataLayout *getDataLayout() const { return 0; } + virtual const TargetInstrInfo *getInstrInfo() const { return nullptr; } + virtual const TargetFrameLowering *getFrameLowering() const { + return nullptr; + } + virtual const TargetLowering *getTargetLowering() const { return nullptr; } + virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const { + return nullptr; + } + virtual const DataLayout *getDataLayout() const { return nullptr; } /// getMCAsmInfo - Return target specific asm information. /// @@ -139,64 +140,28 @@ public: /// not, return null. This is kept separate from RegInfo until RegInfo has /// details of graph coloring register allocation removed from it. /// - virtual const TargetRegisterInfo *getRegisterInfo() const { return 0; } + virtual const TargetRegisterInfo *getRegisterInfo() const { return nullptr; } /// getIntrinsicInfo - If intrinsic information is available, return it. If /// not, return null. /// - virtual const TargetIntrinsicInfo *getIntrinsicInfo() const { return 0; } + virtual const TargetIntrinsicInfo *getIntrinsicInfo() const { return nullptr;} /// getJITInfo - If this target supports a JIT, return information for it, /// otherwise return null. /// - virtual TargetJITInfo *getJITInfo() { return 0; } + virtual TargetJITInfo *getJITInfo() { return nullptr; } /// getInstrItineraryData - Returns instruction itinerary data for the target /// or specific subtarget. /// virtual const InstrItineraryData *getInstrItineraryData() const { - return 0; + return nullptr; } bool requiresStructuredCFG() const { return RequireStructuredCFG; } void setRequiresStructuredCFG(bool Value) { RequireStructuredCFG = Value; } - /// hasMCRelaxAll - Check whether all machine code instructions should be - /// relaxed. - bool hasMCRelaxAll() const { return MCRelaxAll; } - - /// setMCRelaxAll - Set whether all machine code instructions should be - /// relaxed. - void setMCRelaxAll(bool Value) { MCRelaxAll = Value; } - - /// hasMCSaveTempLabels - Check whether temporary labels will be preserved - /// (i.e., not treated as temporary). 
-  bool hasMCSaveTempLabels() const { return MCSaveTempLabels; }
-
-  /// setMCSaveTempLabels - Set whether temporary labels will be preserved
-  /// (i.e., not treated as temporary).
-  void setMCSaveTempLabels(bool Value) { MCSaveTempLabels = Value; }
-
-  /// hasMCNoExecStack - Check whether an executable stack is not needed.
-  bool hasMCNoExecStack() const { return MCNoExecStack; }
-
-  /// setMCNoExecStack - Set whether an executable stack is not needed.
-  void setMCNoExecStack(bool Value) { MCNoExecStack = Value; }
-
-  /// hasMCUseCFI - Check whether we should use dwarf's .cfi_* directives.
-  bool hasMCUseCFI() const { return MCUseCFI; }
-
-  /// setMCUseCFI - Set whether we should use dwarf's .cfi_* directives.
-  void setMCUseCFI(bool Value) { MCUseCFI = Value; }
-
-  /// hasMCUseDwarfDirectory - Check whether we should use .file directives with
-  /// explicit directories.
-  bool hasMCUseDwarfDirectory() const { return MCUseDwarfDirectory; }
-
-  /// setMCUseDwarfDirectory - Set whether we should use .file directives
-  /// with explicit directories.
-  void setMCUseDwarfDirectory(bool Value) { MCUseDwarfDirectory = Value; }
-
   /// getRelocationModel - Returns the code generation relocation model. The
   /// choices are static, PIC, and dynamic-no-pic, and target default.
   Reloc::Model getRelocationModel() const;
@@ -222,26 +187,26 @@ public:
   /// getAsmVerbosityDefault - Returns the default value of asm verbosity.
   ///
-  static bool getAsmVerbosityDefault();
+  bool getAsmVerbosityDefault() const;
   /// setAsmVerbosityDefault - Set the default value of asm verbosity. Default
   /// is false.
-  static void setAsmVerbosityDefault(bool);
+  void setAsmVerbosityDefault(bool);
   /// getDataSections - Return true if data objects should be emitted into their
   /// own section, corresponds to -fdata-sections.
-  static bool getDataSections();
+  bool getDataSections() const;
   /// getFunctionSections - Return true if functions should be emitted into
   /// their own section, corresponding to -ffunction-sections.
-  static bool getFunctionSections();
+  bool getFunctionSections() const;
   /// setDataSections - Set if the data are emitted into separate sections.
-  static void setDataSections(bool);
+  void setDataSections(bool);
   /// setFunctionSections - Set if the functions are emitted into separate
   /// sections.
-  static void setFunctionSections(bool);
+  void setFunctionSections(bool);
   /// \brief Register analysis passes for this target with a pass manager.
   virtual void addAnalysisPasses(PassManagerBase &) {}
@@ -263,8 +228,8 @@ public:
                                  formatted_raw_ostream &,
                                  CodeGenFileType,
                                  bool /*DisableVerify*/ = true,
-                                 AnalysisID /*StartAfter*/ = 0,
-                                 AnalysisID /*StopAfter*/ = 0) {
+                                 AnalysisID /*StartAfter*/ = nullptr,
+                                 AnalysisID /*StopAfter*/ = nullptr) {
     return true;
   }
@@ -323,8 +288,8 @@ public:
   /// generation.
   bool addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out,
                            CodeGenFileType FileType, bool DisableVerify = true,
-                           AnalysisID StartAfter = 0,
-                           AnalysisID StopAfter = 0) override;
+                           AnalysisID StartAfter = nullptr,
+                           AnalysisID StopAfter = nullptr) override;
   /// addPassesToEmitMachineCode - Add passes to the specified pass manager to
   /// get machine code emitted.
This uses a JITCodeEmitter object to handle diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h index 1f87343..636eaf5 100644 --- a/include/llvm/Target/TargetOptions.h +++ b/include/llvm/Target/TargetOptions.h @@ -15,6 +15,7 @@ #ifndef LLVM_TARGET_TARGETOPTIONS_H #define LLVM_TARGET_TARGETOPTIONS_H +#include "llvm/MC/MCTargetOptions.h" #include namespace llvm { @@ -49,9 +50,10 @@ namespace llvm { JITEmitDebugInfoToDisk(false), GuaranteedTailCallOpt(false), DisableTailCalls(false), StackAlignmentOverride(0), EnableFastISel(false), PositionIndependentExecutable(false), - EnableSegmentedStacks(false), UseInitArray(false), - DisableIntegratedAS(false), CompressDebugSections(false), - TrapFuncName(""), FloatABIType(FloatABI::Default), + UseInitArray(false), DisableIntegratedAS(false), + CompressDebugSections(false), FunctionSections(false), + DataSections(false), TrapUnreachable(false), TrapFuncName(""), + FloatABIType(FloatABI::Default), AllowFPOpFusion(FPOpFusion::Standard) {} /// PrintMachineCode - This flag is enabled when the -print-machineinstrs @@ -152,8 +154,6 @@ namespace llvm { /// if the relocation model is anything other than PIC. unsigned PositionIndependentExecutable : 1; - unsigned EnableSegmentedStacks : 1; - /// UseInitArray - Use .init_array instead of .ctors for static /// constructors. unsigned UseInitArray : 1; @@ -164,6 +164,15 @@ namespace llvm { /// Compress DWARF debug sections. unsigned CompressDebugSections : 1; + /// Emit functions into separate sections. + unsigned FunctionSections : 1; + + /// Emit data into separate sections. + unsigned DataSections : 1; + + /// Emit target-specific trap instruction for 'unreachable' IR instructions. + unsigned TrapUnreachable : 1; + /// getTrapFunctionName - If this returns a non-empty string, this means /// isel should lower Intrinsic::trap to a call to the specified function /// name instead of an ISD::TRAP node. @@ -195,6 +204,9 @@ namespace llvm { /// via the llvm.fma.* intrinsic) will always be honored, regardless of /// the value of this option. FPOpFusion::FPOpFusionMode AllowFPOpFusion; + + /// Machine level options. + MCTargetOptions MCOptions; }; // Comparison operators: @@ -217,11 +229,12 @@ inline bool operator==(const TargetOptions &LHS, ARE_EQUAL(StackAlignmentOverride) && ARE_EQUAL(EnableFastISel) && ARE_EQUAL(PositionIndependentExecutable) && - ARE_EQUAL(EnableSegmentedStacks) && ARE_EQUAL(UseInitArray) && + ARE_EQUAL(TrapUnreachable) && ARE_EQUAL(TrapFuncName) && ARE_EQUAL(FloatABIType) && - ARE_EQUAL(AllowFPOpFusion); + ARE_EQUAL(AllowFPOpFusion) && + ARE_EQUAL(MCOptions); #undef ARE_EQUAL } diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h index b0c21c1..a162297 100644 --- a/include/llvm/Target/TargetRegisterInfo.h +++ b/include/llvm/Target/TargetRegisterInfo.h @@ -174,7 +174,7 @@ public: /// isASubClass - return true if this TargetRegisterClass is a subset /// class of at least one other TargetRegisterClass. bool isASubClass() const { - return SuperClasses[0] != 0; + return SuperClasses[0] != nullptr; } /// getRawAllocationOrder - Returns the preferred order for allocating @@ -317,7 +317,7 @@ public: /// indicating if a register is allocatable or not. If a register class is /// specified, returns the subset for the class. 
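With the static TargetMachine setters gone (see the previous hunks), per-module section and trap behaviour now travels in the TargetOptions fields added above; a minimal sketch:

    #include "llvm/Target/TargetOptions.h"

    llvm::TargetOptions Opts;
    Opts.FunctionSections = 1; // -ffunction-sections
    Opts.DataSections = 1;     // -fdata-sections
    Opts.TrapUnreachable = 1;  // lower 'unreachable' to a trap instruction
    // Opts is then handed to the TargetMachine at construction time and
    // queried through the now non-static getFunctionSections()/getDataSections().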
BitVector getAllocatableSet(const MachineFunction &MF, - const TargetRegisterClass *RC = NULL) const; + const TargetRegisterClass *RC = nullptr) const; /// getCostPerUse - Return the additional cost of using this register instead /// of other registers in its class. @@ -420,8 +420,8 @@ public: /// order of desired callee-save stack frame offset. The first register is /// closest to the incoming stack pointer if stack grows down, and vice versa. /// - virtual const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF = 0) - const = 0; + virtual const MCPhysReg* + getCalleeSavedRegs(const MachineFunction *MF = nullptr) const = 0; /// getCallPreservedMask - Return a mask of call-preserved registers for the /// given calling convention on the current sub-target. The mask should @@ -443,7 +443,7 @@ public: /// virtual const uint32_t *getCallPreservedMask(CallingConv::ID) const { // The default mask clobbers everything. All targets should override. - return 0; + return nullptr; } /// getReservedRegs - Returns a bitset indexed by physical register number @@ -651,7 +651,7 @@ public: ArrayRef Order, SmallVectorImpl &Hints, const MachineFunction &MF, - const VirtRegMap *VRM = 0) const; + const VirtRegMap *VRM = nullptr) const; /// avoidWriteAfterWrite - Return true if the register allocator should avoid /// writing a register from RC in two consecutive instructions. @@ -805,7 +805,7 @@ public: /// instruction. FIOperandNum is the FI operand number. virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS = NULL) const = 0; + RegScavenger *RS = nullptr) const = 0; //===--------------------------------------------------------------------===// /// Debug information queries. @@ -874,7 +874,7 @@ public: Mask += RCMaskWords; SubReg = *Idx++; if (!SubReg) - Idx = 0; + Idx = nullptr; } }; @@ -902,7 +902,7 @@ class PrintReg { unsigned Reg; unsigned SubIdx; public: - explicit PrintReg(unsigned reg, const TargetRegisterInfo *tri = 0, + explicit PrintReg(unsigned reg, const TargetRegisterInfo *tri = nullptr, unsigned subidx = 0) : TRI(tri), Reg(reg), SubIdx(subidx) {} void print(raw_ostream&) const; diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td index b4d0c44..e6eeb88 100644 --- a/include/llvm/Target/TargetSchedule.td +++ b/include/llvm/Target/TargetSchedule.td @@ -79,6 +79,8 @@ class SchedMachineModel { int MinLatency = -1; // Determines which instructions are allowed in a group. // (-1) inorder (0) ooo, (1): inorder +var latencies. int MicroOpBufferSize = -1; // Max micro-ops that can be buffered. + int LoopMicroOpBufferSize = -1; // Max micro-ops that can be buffered for + // optimized loop dispatch/execution. int LoadLatency = -1; // Cycles for loads to access the cache. int HighLatency = -1; // Approximation of cycles for "high latency" ops. int MispredictPenalty = -1; // Extra cycles for a mispredicted branch. diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h index 1b2e06a..c0c342b 100644 --- a/include/llvm/Target/TargetSubtargetInfo.h +++ b/include/llvm/Target/TargetSubtargetInfo.h @@ -76,6 +76,11 @@ public: MachineInstr *end, unsigned NumRegionInstrs) const {} + // \brief Perform target specific adjustments to the latency of a schedule + // dependency. 
+ virtual void adjustSchedDependency(SUnit *def, SUnit *use, + SDep& dep) const { } + // enablePostRAScheduler - If the target can benefit from post-regalloc // scheduling and the specified optimization level meets the requirement // return true to enable post-register-allocation scheduling. In @@ -84,15 +89,14 @@ public: virtual bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, AntiDepBreakMode& Mode, RegClassVector& CriticalPathRCs) const; - // adjustSchedDependency - Perform target specific adjustments to - // the latency of a schedule dependency. - virtual void adjustSchedDependency(SUnit *def, SUnit *use, - SDep& dep) const { } /// \brief Enable use of alias analysis during code generation (during MI /// scheduling, DAGCombine, etc.). virtual bool useAA() const; + /// \brief Enable the use of the early if conversion pass. + virtual bool enableEarlyIfConversion() const { return false; } + /// \brief Reset the features for the subtarget. virtual void resetSubtargetFeatures(const MachineFunction *MF) { } }; diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h index 334fb1c..ce1a7d6 100644 --- a/include/llvm/Transforms/IPO.h +++ b/include/llvm/Transforms/IPO.h @@ -58,21 +58,18 @@ ModulePass *createStripDeadDebugInfoPass(); /// ModulePass *createConstantMergePass(); - //===----------------------------------------------------------------------===// /// createGlobalOptimizerPass - This function returns a new pass that optimizes /// non-address taken internal globals. /// ModulePass *createGlobalOptimizerPass(); - //===----------------------------------------------------------------------===// /// createGlobalDCEPass - This transform is designed to eliminate unreachable /// internal globals (functions or global variables) /// ModulePass *createGlobalDCEPass(); - //===----------------------------------------------------------------------===// /// createGVExtractionPass - If deleteFn is true, this pass deletes /// the specified global values. Otherwise, it deletes as much of the module as diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h index 42b6b27..023de08 100644 --- a/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -55,7 +55,6 @@ using legacy::FunctionPassManager; /// ... class PassManagerBuilder { public: - /// Extensions are passed the builder itself (so they can see how it is /// configured) as well as the pass manager to add stuff to. typedef void (*ExtensionFn)(const PassManagerBuilder &Builder, @@ -86,7 +85,12 @@ public: /// EP_EnabledOnOptLevel0 - This extension point allows adding passes that /// should not be disabled by O0 optimization level. The passes will be /// inserted after the inlining pass. - EP_EnabledOnOptLevel0 + EP_EnabledOnOptLevel0, + + /// EP_Peephole - This extension point allows adding passes that perform + /// peephole optimizations similar to the instruction combiner. These passes + /// will be inserted after each instance of the instruction combiner pass. + EP_Peephole, }; /// The Optimization Level - Specify the basic optimization level. @@ -106,6 +110,7 @@ public: /// added to the per-module passes. 
Pass *Inliner; + bool DisableTailCalls; bool DisableUnitAtATime; bool DisableUnrollLoops; bool BBVectorize; @@ -129,8 +134,8 @@ public: private: void addExtensionsToPM(ExtensionPointTy ETy, PassManagerBase &PM) const; void addInitialAliasAnalysisPasses(PassManagerBase &PM) const; -public: +public: /// populateFunctionPassManager - This fills in the function pass manager, /// which is expected to be run on each function immediately as it is /// generated. The idea is to reduce the size of the IR in memory. diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h index b527546..61d5c26 100644 --- a/include/llvm/Transforms/Instrumentation.h +++ b/include/llvm/Transforms/Instrumentation.h @@ -79,8 +79,8 @@ FunctionPass *createThreadSanitizerPass(StringRef BlacklistFile = StringRef()); // Insert DataFlowSanitizer (dynamic data flow analysis) instrumentation ModulePass *createDataFlowSanitizerPass(StringRef ABIListFile = StringRef(), - void *(*getArgTLS)() = 0, - void *(*getRetValTLS)() = 0); + void *(*getArgTLS)() = nullptr, + void *(*getRetValTLS)() = nullptr); #if defined(__GNUC__) && defined(__linux__) && !defined(ANDROID) inline ModulePass *createDataFlowSanitizerPassForJIT(StringRef ABIListFile = diff --git a/include/llvm/Transforms/ObjCARC.h b/include/llvm/Transforms/ObjCARC.h index b3c19c0..1897adc 100644 --- a/include/llvm/Transforms/ObjCARC.h +++ b/include/llvm/Transforms/ObjCARC.h @@ -46,4 +46,3 @@ Pass *createObjCARCOptPass(); } // End llvm namespace #endif - diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index 7267222..cf1d655 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -122,7 +122,7 @@ Pass *createLICMPass(); // Pass *createLoopStrengthReducePass(); -Pass *createGlobalMergePass(const TargetMachine *TM = 0); +Pass *createGlobalMergePass(const TargetMachine *TM = nullptr); //===----------------------------------------------------------------------===// // @@ -155,14 +155,14 @@ Pass *createLoopRerollPass(); // // LoopRotate - This pass is a simple loop rotating pass. // -Pass *createLoopRotatePass(); +Pass *createLoopRotatePass(int MaxHeaderSize = -1); //===----------------------------------------------------------------------===// // // LoopIdiom - This pass recognizes and replaces idioms in loops. // Pass *createLoopIdiomPass(); - + //===----------------------------------------------------------------------===// // // PromoteMemoryToRegister - This pass is used to promote memory references to @@ -201,7 +201,7 @@ FunctionPass *createReassociatePass(); // preds always go to some succ. // FunctionPass *createJumpThreadingPass(); - + //===----------------------------------------------------------------------===// // // CFGSimplification - Merge basic blocks, eliminate unreachable blocks, @@ -284,10 +284,10 @@ extern char &LCSSAID; // tree. // FunctionPass *createEarlyCSEPass(); - + //===----------------------------------------------------------------------===// // -// GVN - This pass performs global value numbering and redundant load +// GVN - This pass performs global value numbering and redundant load // elimination cotemporaneously. // FunctionPass *createGVNPass(bool NoLoads = false); @@ -305,7 +305,7 @@ FunctionPass *createMemCpyOptPass(); // can prove are dead. // Pass *createLoopDeletionPass(); - + //===----------------------------------------------------------------------===// // // ConstantHoisting - This pass prepares a function for expensive constants. 
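The EP_Peephole extension point added earlier in this patch is consumed through PassManagerBuilder::addExtension. Below is a minimal sketch of client code, under the assumption of a hypothetical custom pass: MyPeepholePass and createMyPeepholePass are invented for illustration and are not part of this patch.

    #include "llvm/Pass.h"
    #include "llvm/PassManager.h"
    #include "llvm/Transforms/IPO/PassManagerBuilder.h"

    namespace {
    // Hypothetical factory for a client-defined peephole FunctionPass.
    llvm::FunctionPass *createMyPeepholePass();

    // Callback invoked by the builder; EP_Peephole callbacks run after each
    // instance of the instruction combiner.
    void addMyPeephole(const llvm::PassManagerBuilder &Builder,
                       llvm::legacy::PassManagerBase &PM) {
      PM.add(createMyPeepholePass());
    }
    } // end anonymous namespace

    void configurePasses(llvm::PassManagerBuilder &PMB) {
      PMB.addExtension(llvm::PassManagerBuilder::EP_Peephole, addMyPeephole);
    }

With this registration, the builder schedules the callback's passes every time it adds an instruction-combiner run, which is the behavior the EP_Peephole comment above describes.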
@@ -318,7 +318,7 @@ FunctionPass *createConstantHoistingPass(); // FunctionPass *createInstructionNamerPass(); extern char &InstructionNamerID; - + //===----------------------------------------------------------------------===// // // Sink - Code Sinking @@ -344,14 +344,12 @@ Pass *createCorrelatedValuePropagationPass(); FunctionPass *createInstructionSimplifierPass(); extern char &InstructionSimplifierID; - //===----------------------------------------------------------------------===// // // LowerExpectIntrinsics - Removes llvm.expect intrinsics and creates // "block_weights" metadata. FunctionPass *createLowerExpectIntrinsicPass(); - //===----------------------------------------------------------------------===// // // PartiallyInlineLibCalls - Tries to inline the fast path of library @@ -377,6 +375,12 @@ FunctionPass *createScalarizerPass(); // AddDiscriminators - Add DWARF path discriminators to the IR. FunctionPass *createAddDiscriminatorsPass(); +//===----------------------------------------------------------------------===// +// +// SeparateConstOffsetFromGEP - Split GEPs for better CSE +// +FunctionPass *createSeparateConstOffsetFromGEPPass(); + } // End llvm namespace #endif diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h index 4d5e305..7309f69 100644 --- a/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -34,23 +34,22 @@ class TerminatorInst; /// predecessors. void DeleteDeadBlock(BasicBlock *BB); - /// FoldSingleEntryPHINodes - We know that BB has one predecessor. If there are /// any single-entry PHI nodes in it, fold them away. This handles the case /// when all entries to the PHI nodes in a block are guaranteed equal, such as /// when the block has exactly one predecessor. -void FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P = 0); +void FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P = nullptr); /// DeleteDeadPHIs - Examine each PHI in the given block and delete it if it /// is dead. Also recursively delete any operands that become dead as /// a result. This includes tracing the def-use list from the PHI to see if /// it is ultimately unused or if it reaches an unused cycle. Return true /// if any PHIs were deleted. -bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI = 0); +bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI = nullptr); /// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor, /// if possible. The return value indicates success or failure. -bool MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P = 0); +bool MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P = nullptr); // ReplaceInstWithValue - Replace all uses of an instruction (specified by BI) // with a value, then remove and delete the original instruction. @@ -89,12 +88,13 @@ void ReplaceInstWithInst(Instruction *From, Instruction *To); /// to. 
/// BasicBlock *SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, - Pass *P = 0, bool MergeIdenticalEdges = false, + Pass *P = nullptr, + bool MergeIdenticalEdges = false, bool DontDeleteUselessPHIs = false, bool SplitLandingPads = false); inline BasicBlock *SplitCriticalEdge(BasicBlock *BB, succ_iterator SI, - Pass *P = 0) { + Pass *P = nullptr) { return SplitCriticalEdge(BB->getTerminator(), SI.getSuccessorIndex(), P); } @@ -103,7 +103,8 @@ inline BasicBlock *SplitCriticalEdge(BasicBlock *BB, succ_iterator SI, /// This updates all of the same analyses as the other SplitCriticalEdge /// function. If P is specified, it updates the analyses /// described above. -inline bool SplitCriticalEdge(BasicBlock *Succ, pred_iterator PI, Pass *P = 0) { +inline bool SplitCriticalEdge(BasicBlock *Succ, pred_iterator PI, + Pass *P = nullptr) { bool MadeChange = false; TerminatorInst *TI = (*PI)->getTerminator(); for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) @@ -117,7 +118,7 @@ inline bool SplitCriticalEdge(BasicBlock *Succ, pred_iterator PI, Pass *P = 0) { /// an edge between the two blocks. If P is specified, it updates the analyses /// described above. inline BasicBlock *SplitCriticalEdge(BasicBlock *Src, BasicBlock *Dst, - Pass *P = 0, + Pass *P = nullptr, bool MergeIdenticalEdges = false, bool DontDeleteUselessPHIs = false) { TerminatorInst *TI = Src->getTerminator(); @@ -155,7 +156,7 @@ BasicBlock *SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P); /// is an exit of a loop with other exits). /// BasicBlock *SplitBlockPredecessors(BasicBlock *BB, ArrayRef Preds, - const char *Suffix, Pass *P = 0); + const char *Suffix, Pass *P = nullptr); /// SplitLandingPadPredecessors - This method transforms the landing pad, /// OrigBB, by introducing two new basic blocks into the function. One of those @@ -203,8 +204,7 @@ ReturnInst *FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, /// Returns the NewBasicBlock's terminator. TerminatorInst *SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore, bool Unreachable, - MDNode *BranchWeights = 0); - + MDNode *BranchWeights = nullptr); /// SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, /// but also creates the ElseBlock. @@ -223,7 +223,7 @@ TerminatorInst *SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore, void SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore, TerminatorInst **ThenTerm, TerminatorInst **ElseTerm, - MDNode *BranchWeights = 0); + MDNode *BranchWeights = nullptr); /// /// GetIfCondition - Check whether BB is the merge point of a if-region. diff --git a/include/llvm/Transforms/Utils/BuildLibCalls.h b/include/llvm/Transforms/Utils/BuildLibCalls.h index 0f39ada..1e407fb 100644 --- a/include/llvm/Transforms/Utils/BuildLibCalls.h +++ b/include/llvm/Transforms/Utils/BuildLibCalls.h @@ -21,7 +21,7 @@ namespace llvm { class Value; class DataLayout; class TargetLibraryInfo; - + /// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*. 
Value *CastToCStr(Value *V, IRBuilder<> &B); @@ -124,6 +124,7 @@ namespace llvm { virtual void replaceCall(Value *With) = 0; virtual bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp, bool isString) const = 0; + public: virtual ~SimplifyFortifiedLibCalls(); bool fold(CallInst *CI, const DataLayout *TD, const TargetLibraryInfo *TLI); diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h index 96c6508..bdf50dd 100644 --- a/include/llvm/Transforms/Utils/Cloning.h +++ b/include/llvm/Transforms/Utils/Cloning.h @@ -55,17 +55,16 @@ struct ClonedCodeInfo { /// ContainsCalls - This is set to true if the cloned code contains a normal /// call instruction. bool ContainsCalls; - + /// ContainsDynamicAllocas - This is set to true if the cloned code contains /// a 'dynamic' alloca. Dynamic allocas are allocas that are either not in /// the entry block or they are in the entry block but are not a constant /// size. bool ContainsDynamicAllocas; - + ClonedCodeInfo() : ContainsCalls(false), ContainsDynamicAllocas(false) {} }; - /// CloneBasicBlock - Return a copy of the specified basic block, but without /// embedding the block into a particular function. The block returned is an /// exact copy of the specified basic block, without any remapping having been @@ -96,8 +95,8 @@ struct ClonedCodeInfo { /// BasicBlock *CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, - const Twine &NameSuffix = "", Function *F = 0, - ClonedCodeInfo *CodeInfo = 0); + const Twine &NameSuffix = "", Function *F = nullptr, + ClonedCodeInfo *CodeInfo = nullptr); /// CloneFunction - Return a copy of the specified function, but without /// embedding the function into another module. Also, any references specified @@ -114,7 +113,7 @@ BasicBlock *CloneBasicBlock(const BasicBlock *BB, Function *CloneFunction(const Function *F, ValueToValueMapTy &VMap, bool ModuleLevelChanges, - ClonedCodeInfo *CodeInfo = 0); + ClonedCodeInfo *CodeInfo = nullptr); /// Clone OldFunc into NewFunc, transforming the old arguments into references /// to VMap values. Note that if NewFunc already has basic blocks, the ones @@ -129,10 +128,10 @@ void CloneFunctionInto(Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap, bool ModuleLevelChanges, SmallVectorImpl &Returns, - const char *NameSuffix = "", - ClonedCodeInfo *CodeInfo = 0, - ValueMapTypeRemapper *TypeMapper = 0, - ValueMaterializer *Materializer = 0); + const char *NameSuffix = "", + ClonedCodeInfo *CodeInfo = nullptr, + ValueMapTypeRemapper *TypeMapper = nullptr, + ValueMaterializer *Materializer = nullptr); /// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto, /// except that it does some simple constant prop and DCE on the fly. The @@ -149,19 +148,18 @@ void CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap, bool ModuleLevelChanges, SmallVectorImpl &Returns, - const char *NameSuffix = "", - ClonedCodeInfo *CodeInfo = 0, - const DataLayout *DL = 0, - Instruction *TheCall = 0); + const char *NameSuffix = "", + ClonedCodeInfo *CodeInfo = nullptr, + const DataLayout *DL = nullptr, + Instruction *TheCall = nullptr); - /// InlineFunctionInfo - This class captures the data input to the -/// InlineFunction call, and records the auxiliary results produced by it. +/// InlineFunction call, and records the auxiliary results produced by it. 
class InlineFunctionInfo { public: - explicit InlineFunctionInfo(CallGraph *cg = 0, const DataLayout *DL = 0) + explicit InlineFunctionInfo(CallGraph *cg = nullptr, const DataLayout *DL = nullptr) : CG(cg), DL(DL) {} - + /// CG - If non-null, InlineFunction will update the callgraph to reflect the /// changes it makes. CallGraph *CG; @@ -174,13 +172,13 @@ public: /// InlinedCalls - InlineFunction fills this in with callsites that were /// inlined from the callee. This is only filled in if CG is non-null. SmallVector InlinedCalls; - + void reset() { StaticAllocas.clear(); InlinedCalls.clear(); } }; - + /// InlineFunction - This function inlines the called function into the basic /// block of the caller. This returns false if it is not possible to inline /// this call. The program is still in a well defined state if this occurs diff --git a/include/llvm/Transforms/Utils/CmpInstAnalysis.h b/include/llvm/Transforms/Utils/CmpInstAnalysis.h index 22469e0..73c15e4 100644 --- a/include/llvm/Transforms/Utils/CmpInstAnalysis.h +++ b/include/llvm/Transforms/Utils/CmpInstAnalysis.h @@ -63,4 +63,3 @@ namespace llvm { } // end namespace llvm #endif - diff --git a/include/llvm/Transforms/Utils/CodeExtractor.h b/include/llvm/Transforms/Utils/CodeExtractor.h index 1122678..6b41e82 100644 --- a/include/llvm/Transforms/Utils/CodeExtractor.h +++ b/include/llvm/Transforms/Utils/CodeExtractor.h @@ -66,7 +66,7 @@ namespace llvm { /// dominates the rest, prepare a code extractor object for pulling this /// sequence out into its new function. When a DominatorTree is also given, /// extra checking and transformations are enabled. - CodeExtractor(ArrayRef BBs, DominatorTree *DT = 0, + CodeExtractor(ArrayRef BBs, DominatorTree *DT = nullptr, bool AggregateArgs = false); /// \brief Create a code extractor for a loop body. @@ -120,7 +120,6 @@ namespace llvm { BasicBlock *newHeader, ValueSet &inputs, ValueSet &outputs); - }; } diff --git a/include/llvm/Transforms/Utils/CtorUtils.h b/include/llvm/Transforms/Utils/CtorUtils.h new file mode 100644 index 0000000..81e7b95 --- /dev/null +++ b/include/llvm/Transforms/Utils/CtorUtils.h @@ -0,0 +1,32 @@ +//===- CtorUtils.h - Helpers for working with global_ctors ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines functions that are used to process llvm.global_ctors. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_CTOR_UTILS_H +#define LLVM_TRANSFORMS_UTILS_CTOR_UTILS_H + +#include "llvm/ADT/STLExtras.h" + +namespace llvm { + +class GlobalVariable; +class Function; +class Module; + +/// Call "ShouldRemove" for every entry in M's global_ctor list and remove the +/// entries for which it returns true. Return true if anything changed. +bool optimizeGlobalCtorsList(Module &M, + function_ref ShouldRemove); + +} // End llvm namespace + +#endif diff --git a/include/llvm/Transforms/Utils/IntegerDivision.h b/include/llvm/Transforms/Utils/IntegerDivision.h index 55e8b66..0ec3321 100644 --- a/include/llvm/Transforms/Utils/IntegerDivision.h +++ b/include/llvm/Transforms/Utils/IntegerDivision.h @@ -55,16 +55,16 @@ namespace llvm { /// @brief Replace Rem with generated code. 
 bool expandRemainderUpTo64Bits(BinaryOperator *Rem);

-  /// Generate code to divide two integers, replacing Div with the generated
+  /// Generate code to divide two integers, replacing Div with the generated
   /// code. Uses ExpandDivision with a 32bit Div which makes it useful for
   /// targets with little or no support for less than 32 bit arithmetic.
-  ///
+  ///
   /// @brief Replace Div with generated code.
   bool expandDivisionUpTo32Bits(BinaryOperator *Div);

-  /// Generate code to divide two integers, replacing Div with the generated
+  /// Generate code to divide two integers, replacing Div with the generated
   /// code. Uses ExpandDivision with a 64bit Div.
-  ///
+  ///
   /// @brief Replace Div with generated code.
   bool expandDivisionUpTo64Bits(BinaryOperator *Div);

diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index c68fd06..6f64269 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -55,7 +55,7 @@ template <typename T> class SmallVectorImpl;
 /// conditions and indirectbr addresses this might make dead if
 /// DeleteDeadConditions is true.
 bool ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions = false,
-                            const TargetLibraryInfo *TLI = 0);
+                            const TargetLibraryInfo *TLI = nullptr);

 //===----------------------------------------------------------------------===//
 //  Local dead code elimination.
 //
@@ -64,30 +64,31 @@ bool ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions = false,
 /// isInstructionTriviallyDead - Return true if the result produced by the
 /// instruction is not used, and the instruction has no side effects.
 ///
-bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=0);
+bool isInstructionTriviallyDead(Instruction *I,
+                                const TargetLibraryInfo *TLI = nullptr);

 /// RecursivelyDeleteTriviallyDeadInstructions - If the specified value is a
 /// trivially dead instruction, delete it. If that makes any of its operands
 /// trivially dead, delete them too, recursively. Return true if any
 /// instructions were deleted.
 bool RecursivelyDeleteTriviallyDeadInstructions(Value *V,
-                                        const TargetLibraryInfo *TLI=0);
+                                        const TargetLibraryInfo *TLI = nullptr);

 /// RecursivelyDeleteDeadPHINode - If the specified value is an effectively
 /// dead PHI node, due to being a def-use chain of single-use nodes that
 /// either forms a cycle or is terminated by a trivially dead instruction,
 /// delete it. If that makes any of its operands trivially dead, delete them
 /// too, recursively. Return true if a change was made.
-bool RecursivelyDeleteDeadPHINode(PHINode *PN, const TargetLibraryInfo *TLI=0);
-
+bool RecursivelyDeleteDeadPHINode(PHINode *PN,
+                                  const TargetLibraryInfo *TLI = nullptr);

 /// SimplifyInstructionsInBlock - Scan the specified basic block and try to
 /// simplify any instructions in it and recursively delete dead instructions.
 ///
 /// This returns true if it changed the code; note that it can delete
 /// instructions in other blocks as well as in this block.
-bool SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD = 0,
-                                 const TargetLibraryInfo *TLI = 0);
+bool SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD = nullptr,
+                                 const TargetLibraryInfo *TLI = nullptr);

 //===----------------------------------------------------------------------===//
 //  Control Flow Graph Restructuring.
 //
@@ -105,16 +106,14 @@ bool SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD = 0,
 /// ..
 /// and delete the predecessor corresponding to the '1', this will attempt to
 /// recursively fold the 'and' to 0.
 void RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
-                                  DataLayout *TD = 0);
-
+                                  DataLayout *TD = nullptr);

 /// MergeBasicBlockIntoOnlyPred - BB is a block with one predecessor and its
 /// predecessor is known to have one successor (BB!). Eliminate the edge
 /// between them, moving the instructions in the predecessor into BB. This
 /// deletes the predecessor block.
 ///
-void MergeBasicBlockIntoOnlyPred(BasicBlock *BB, Pass *P = 0);
-
+void MergeBasicBlockIntoOnlyPred(BasicBlock *BB, Pass *P = nullptr);

 /// TryToSimplifyUncondBranchFromEmptyBlock - BB is known to contain an
 /// unconditional branch, and contains no instructions other than PHI nodes,
@@ -137,13 +136,13 @@ bool EliminateDuplicatePHINodes(BasicBlock *BB);
 /// the basic block that was pointed to.
 ///
 bool SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
-                 const DataLayout *TD = 0);
+                 const DataLayout *TD = nullptr);

 /// FlattenCFG - This function is used to flatten a CFG. For
 /// example, it uses parallel-and and parallel-or mode to collapse
 /// if-conditions and merge if-regions with identical statements.
 ///
-bool FlattenCFG(BasicBlock *BB, AliasAnalysis *AA = 0);
+bool FlattenCFG(BasicBlock *BB, AliasAnalysis *AA = nullptr);

 /// FoldBranchToCommonDest - If this basic block is ONLY a setcc and a branch,
 /// and if a predecessor branches to us and one of our successors, fold the
@@ -159,22 +158,23 @@ bool FoldBranchToCommonDest(BranchInst *BI);
 ///
 AllocaInst *DemoteRegToStack(Instruction &X, bool VolatileLoads = false,
-                             Instruction *AllocaPoint = 0);
+                             Instruction *AllocaPoint = nullptr);

 /// DemotePHIToStack - This function takes a virtual register computed by a phi
 /// node and replaces it with a slot in the stack frame, allocated via alloca.
 /// The phi node is deleted and it returns the pointer to the alloca inserted.
-AllocaInst *DemotePHIToStack(PHINode *P, Instruction *AllocaPoint = 0);
+AllocaInst *DemotePHIToStack(PHINode *P, Instruction *AllocaPoint = nullptr);

 /// getOrEnforceKnownAlignment - If the specified pointer has an alignment that
 /// we can determine, return it, otherwise return 0. If PrefAlign is specified,
 /// and it is more than the alignment of the ultimate object, see if we can
 /// increase the alignment of the ultimate object, making this check succeed.
 unsigned getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
-                                    const DataLayout *TD = 0);
+                                    const DataLayout *TD = nullptr);

 /// getKnownAlignment - Try to infer an alignment for the specified pointer.
-static inline unsigned getKnownAlignment(Value *V, const DataLayout *TD = 0) {
+static inline unsigned getKnownAlignment(Value *V,
+                                         const DataLayout *TD = nullptr) {
   return getOrEnforceKnownAlignment(V, 0, TD);
 }

diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h
index 64e18ca1..ee26d83 100644
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -32,7 +32,7 @@ BasicBlock *InsertPreheaderForLoop(Loop *L, Pass *P);
 /// will optionally update \c AliasAnalysis and \c ScalarEvolution analyses if
 /// passed into it.
 bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,
-                  AliasAnalysis *AA = 0, ScalarEvolution *SE = 0);
+                  AliasAnalysis *AA = nullptr, ScalarEvolution *SE = nullptr);

 /// \brief Put loop into LCSSA form.
/// @@ -45,7 +45,7 @@ bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP, /// If ScalarEvolution is passed in, it will be preserved. /// /// Returns true if any modifications are made to the loop. -bool formLCSSA(Loop &L, DominatorTree &DT, ScalarEvolution *SE = 0); +bool formLCSSA(Loop &L, DominatorTree &DT, ScalarEvolution *SE = nullptr); /// \brief Put a loop nest into LCSSA form. /// @@ -56,8 +56,8 @@ bool formLCSSA(Loop &L, DominatorTree &DT, ScalarEvolution *SE = 0); /// If ScalarEvolution is passed in, it will be preserved. /// /// Returns true if any modifications are made to the loop. -bool formLCSSARecursively(Loop &L, DominatorTree &DT, ScalarEvolution *SE = 0); - +bool formLCSSARecursively(Loop &L, DominatorTree &DT, + ScalarEvolution *SE = nullptr); } #endif diff --git a/include/llvm/Transforms/Utils/PromoteMemToReg.h b/include/llvm/Transforms/Utils/PromoteMemToReg.h index 22f46e5..c83fedb 100644 --- a/include/llvm/Transforms/Utils/PromoteMemToReg.h +++ b/include/llvm/Transforms/Utils/PromoteMemToReg.h @@ -41,7 +41,7 @@ bool isAllocaPromotable(const AllocaInst *AI); /// If AST is specified, the specified tracker is updated to reflect changes /// made to the IR. void PromoteMemToReg(ArrayRef Allocas, DominatorTree &DT, - AliasSetTracker *AST = 0); + AliasSetTracker *AST = nullptr); } // End llvm namespace diff --git a/include/llvm/Transforms/Utils/SSAUpdater.h b/include/llvm/Transforms/Utils/SSAUpdater.h index 0c0e5de..7874a5f 100644 --- a/include/llvm/Transforms/Utils/SSAUpdater.h +++ b/include/llvm/Transforms/Utils/SSAUpdater.h @@ -56,7 +56,7 @@ private: public: /// If InsertedPHIs is specified, it will be filled /// in with all PHI Nodes created by rewriting. - explicit SSAUpdater(SmallVectorImpl *InsertedPHIs = 0); + explicit SSAUpdater(SmallVectorImpl *InsertedPHIs = nullptr); ~SSAUpdater(); /// \brief Reset this object to get ready for a new set of SSA updates with @@ -133,31 +133,31 @@ private: class LoadAndStorePromoter { protected: SSAUpdater &SSA; + public: LoadAndStorePromoter(const SmallVectorImpl &Insts, SSAUpdater &S, StringRef Name = StringRef()); virtual ~LoadAndStorePromoter() {} - + /// \brief This does the promotion. /// /// Insts is a list of loads and stores to promote, and Name is the basename /// for the PHIs to insert. After this is complete, the loads and stores are /// removed from the code. void run(const SmallVectorImpl &Insts) const; - - + /// \brief Return true if the specified instruction is in the Inst list. /// /// The Insts list is the one passed into the constructor. Clients should /// implement this with a more efficient version if possible. virtual bool isInstInList(Instruction *I, const SmallVectorImpl &Insts) const; - + /// \brief This hook is invoked after all the stores are found and inserted as /// available values. virtual void doExtraRewritesBeforeFinalDeletion() const { } - + /// \brief Clients can choose to implement this to get notified right before /// a load is RAUW'd another value. virtual void replaceLoadWithValue(LoadInst *LI, Value *V) const { diff --git a/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/include/llvm/Transforms/Utils/SSAUpdaterImpl.h index 0f3da16..ed0841c 100644 --- a/include/llvm/Transforms/Utils/SSAUpdaterImpl.h +++ b/include/llvm/Transforms/Utils/SSAUpdaterImpl.h @@ -23,6 +23,8 @@ namespace llvm { +#define DEBUG_TYPE "ssaupdater" + class CastInst; class PHINode; template class SSAUpdaterTraits; @@ -52,8 +54,8 @@ private: PhiT *PHITag; // Marker for existing PHIs that match. 
BBInfo(BlkT *ThisBB, ValT V) - : BB(ThisBB), AvailableVal(V), DefBB(V ? this : 0), BlkNum(0), IDom(0), - NumPreds(0), Preds(0), PHITag(0) { } + : BB(ThisBB), AvailableVal(V), DefBB(V ? this : nullptr), BlkNum(0), + IDom(nullptr), NumPreds(0), Preds(nullptr), PHITag(nullptr) {} }; typedef DenseMap AvailableValsTy; @@ -115,7 +117,7 @@ public: Traits::FindPredecessorBlocks(Info->BB, &Preds); Info->NumPreds = Preds.size(); if (Info->NumPreds == 0) - Info->Preds = 0; + Info->Preds = nullptr; else Info->Preds = static_cast (Allocator.Allocate(Info->NumPreds * sizeof(BBInfo*), @@ -148,7 +150,7 @@ public: // Now that we know what blocks are backwards-reachable from the starting // block, do a forward depth-first traversal to assign postorder numbers // to those blocks. - BBInfo *PseudoEntry = new (Allocator) BBInfo(0, 0); + BBInfo *PseudoEntry = new (Allocator) BBInfo(nullptr, 0); unsigned BlkNum = 1; // Initialize the worklist with the roots from the backward traversal. @@ -231,7 +233,7 @@ public: for (typename BlockListTy::reverse_iterator I = BlockList->rbegin(), E = BlockList->rend(); I != E; ++I) { BBInfo *Info = *I; - BBInfo *NewIDom = 0; + BBInfo *NewIDom = nullptr; // Iterate through the block's predecessors. for (unsigned p = 0; p != Info->NumPreds; ++p) { @@ -386,7 +388,7 @@ public: // Match failed: clear all the PHITag values. for (typename BlockListTy::iterator I = BlockList->begin(), E = BlockList->end(); I != E; ++I) - (*I)->PHITag = 0; + (*I)->PHITag = nullptr; } } @@ -451,6 +453,8 @@ public: } }; +#undef DEBUG_TYPE // "ssaupdater" + } // End llvm namespace #endif diff --git a/include/llvm/Transforms/Utils/SimplifyIndVar.h b/include/llvm/Transforms/Utils/SimplifyIndVar.h index dedeca3..dcb1d67 100644 --- a/include/llvm/Transforms/Utils/SimplifyIndVar.h +++ b/include/llvm/Transforms/Utils/SimplifyIndVar.h @@ -37,8 +37,9 @@ protected: bool ShouldSplitOverflowIntrinsics; virtual void anchor(); + public: - IVVisitor(): DT(NULL), ShouldSplitOverflowIntrinsics(false) {} + IVVisitor(): DT(nullptr), ShouldSplitOverflowIntrinsics(false) {} virtual ~IVVisitor() {} const DominatorTree *getDomTree() const { return DT; } @@ -57,7 +58,7 @@ public: /// simplifyUsersOfIV - Simplify instructions that use this induction variable /// by using ScalarEvolution to analyze the IV's recurrence. bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, LPPassManager *LPM, - SmallVectorImpl &Dead, IVVisitor *V = NULL); + SmallVectorImpl &Dead, IVVisitor *V = nullptr); /// SimplifyLoopIVs - Simplify users of induction variables within this /// loop. This does not actually change or add IVs. diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h index 6bb81be..a2a5f9a 100644 --- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -30,6 +30,7 @@ namespace llvm { /// Impl - A pointer to the actual implementation of the library call /// simplifier. 
LibCallSimplifierImpl *Impl; + public: LibCallSimplifier(const DataLayout *TD, const TargetLibraryInfo *TLI, bool UnsafeFPShrink); diff --git a/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h b/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h index 79a70cf..7ac2572 100644 --- a/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h +++ b/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h @@ -24,10 +24,11 @@ namespace llvm { struct UnifyFunctionExitNodes : public FunctionPass { BasicBlock *ReturnBlock, *UnwindBlock, *UnreachableBlock; + public: static char ID; // Pass identification, replacement for typeid UnifyFunctionExitNodes() : FunctionPass(ID), - ReturnBlock(0), UnwindBlock(0) { + ReturnBlock(nullptr), UnwindBlock(nullptr) { initializeUnifyFunctionExitNodesPass(*PassRegistry::getPassRegistry()); } diff --git a/include/llvm/Transforms/Utils/UnrollLoop.h b/include/llvm/Transforms/Utils/UnrollLoop.h index 0bbd572..aaadd7d 100644 --- a/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/include/llvm/Transforms/Utils/UnrollLoop.h @@ -29,7 +29,6 @@ bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool AllowRuntime, bool UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, LPPassManager* LPM); - } #endif diff --git a/include/llvm/Transforms/Utils/ValueMapper.h b/include/llvm/Transforms/Utils/ValueMapper.h index e96610e..5774763 100644 --- a/include/llvm/Transforms/Utils/ValueMapper.h +++ b/include/llvm/Transforms/Utils/ValueMapper.h @@ -28,7 +28,7 @@ namespace llvm { virtual void anchor(); // Out of line method. public: virtual ~ValueMapTypeRemapper() {} - + /// remapType - The client should implement this method if they want to /// remap types while mapping values. virtual Type *remapType(Type *SrcTy) = 0; @@ -46,53 +46,52 @@ namespace llvm { /// lazily. virtual Value *materializeValueFor(Value *V) = 0; }; - + /// RemapFlags - These are flags that the value mapping APIs allow. enum RemapFlags { RF_None = 0, - + /// RF_NoModuleLevelChanges - If this flag is set, the remapper knows that /// only local values within a function (such as an instruction or argument) /// are mapped, not global values like functions and global metadata. RF_NoModuleLevelChanges = 1, - + /// RF_IgnoreMissingEntries - If this flag is set, the remapper ignores /// entries that are not in the value map. If it is unset, it aborts if an /// operand is asked to be remapped which doesn't exist in the mapping. RF_IgnoreMissingEntries = 2 }; - + static inline RemapFlags operator|(RemapFlags LHS, RemapFlags RHS) { return RemapFlags(unsigned(LHS)|unsigned(RHS)); } - + Value *MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags = RF_None, - ValueMapTypeRemapper *TypeMapper = 0, - ValueMaterializer *Materializer = 0); + ValueMapTypeRemapper *TypeMapper = nullptr, + ValueMaterializer *Materializer = nullptr); void RemapInstruction(Instruction *I, ValueToValueMapTy &VM, RemapFlags Flags = RF_None, - ValueMapTypeRemapper *TypeMapper = 0, - ValueMaterializer *Materializer = 0); - + ValueMapTypeRemapper *TypeMapper = nullptr, + ValueMaterializer *Materializer = nullptr); + /// MapValue - provide versions that preserve type safety for MDNode and /// Constants. 
inline MDNode *MapValue(const MDNode *V, ValueToValueMapTy &VM, RemapFlags Flags = RF_None, - ValueMapTypeRemapper *TypeMapper = 0, - ValueMaterializer *Materializer = 0) { + ValueMapTypeRemapper *TypeMapper = nullptr, + ValueMaterializer *Materializer = nullptr) { return cast(MapValue((const Value*)V, VM, Flags, TypeMapper, Materializer)); } inline Constant *MapValue(const Constant *V, ValueToValueMapTy &VM, RemapFlags Flags = RF_None, - ValueMapTypeRemapper *TypeMapper = 0, - ValueMaterializer *Materializer = 0) { + ValueMapTypeRemapper *TypeMapper = nullptr, + ValueMaterializer *Materializer = nullptr) { return cast(MapValue((const Value*)V, VM, Flags, TypeMapper, Materializer)); } - } // End llvm namespace diff --git a/include/llvm/Transforms/Utils/VectorUtils.h b/include/llvm/Transforms/Utils/VectorUtils.h new file mode 100644 index 0000000..e1d6c56 --- /dev/null +++ b/include/llvm/Transforms/Utils/VectorUtils.h @@ -0,0 +1,180 @@ +//===- llvm/Transforms/Utils/VectorUtils.h - Vector utilities -*- C++ -*-=====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines some vectorizer utilities. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_VECTORUTILS_H +#define LLVM_TRANSFORMS_UTILS_VECTORUTILS_H + +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Target/TargetLibraryInfo.h" + +namespace llvm { + +/// \brief Identify if the intrinsic is trivially vectorizable. +/// +/// This method returns true if the intrinsic's argument types are all +/// scalars for the scalar form of the intrinsic and all vectors for +/// the vector form of the intrinsic. +static inline bool isTriviallyVectorizable(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::sqrt: + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::log: + case Intrinsic::log10: + case Intrinsic::log2: + case Intrinsic::fabs: + case Intrinsic::copysign: + case Intrinsic::floor: + case Intrinsic::ceil: + case Intrinsic::trunc: + case Intrinsic::rint: + case Intrinsic::nearbyint: + case Intrinsic::round: + case Intrinsic::bswap: + case Intrinsic::ctpop: + case Intrinsic::pow: + case Intrinsic::fma: + case Intrinsic::fmuladd: + return true; + default: + return false; + } +} + +static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I, + Intrinsic::ID ValidIntrinsicID) { + if (I.getNumArgOperands() != 1 || + !I.getArgOperand(0)->getType()->isFloatingPointTy() || + I.getType() != I.getArgOperand(0)->getType() || + !I.onlyReadsMemory()) + return Intrinsic::not_intrinsic; + + return ValidIntrinsicID; +} + +static Intrinsic::ID checkBinaryFloatSignature(const CallInst &I, + Intrinsic::ID ValidIntrinsicID) { + if (I.getNumArgOperands() != 2 || + !I.getArgOperand(0)->getType()->isFloatingPointTy() || + !I.getArgOperand(1)->getType()->isFloatingPointTy() || + I.getType() != I.getArgOperand(0)->getType() || + I.getType() != I.getArgOperand(1)->getType() || + !I.onlyReadsMemory()) + return Intrinsic::not_intrinsic; + + return ValidIntrinsicID; +} + +static Intrinsic::ID +getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { + // If we have an intrinsic call, check if it is trivially vectorizable. 
+ if (IntrinsicInst *II = dyn_cast(CI)) { + Intrinsic::ID ID = II->getIntrinsicID(); + if (isTriviallyVectorizable(ID) || ID == Intrinsic::lifetime_start || + ID == Intrinsic::lifetime_end) + return ID; + else + return Intrinsic::not_intrinsic; + } + + if (!TLI) + return Intrinsic::not_intrinsic; + + LibFunc::Func Func; + Function *F = CI->getCalledFunction(); + // We're going to make assumptions on the semantics of the functions, check + // that the target knows that it's available in this environment and it does + // not have local linkage. + if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func)) + return Intrinsic::not_intrinsic; + + // Otherwise check if we have a call to a function that can be turned into a + // vector intrinsic. + switch (Func) { + default: + break; + case LibFunc::sin: + case LibFunc::sinf: + case LibFunc::sinl: + return checkUnaryFloatSignature(*CI, Intrinsic::sin); + case LibFunc::cos: + case LibFunc::cosf: + case LibFunc::cosl: + return checkUnaryFloatSignature(*CI, Intrinsic::cos); + case LibFunc::exp: + case LibFunc::expf: + case LibFunc::expl: + return checkUnaryFloatSignature(*CI, Intrinsic::exp); + case LibFunc::exp2: + case LibFunc::exp2f: + case LibFunc::exp2l: + return checkUnaryFloatSignature(*CI, Intrinsic::exp2); + case LibFunc::log: + case LibFunc::logf: + case LibFunc::logl: + return checkUnaryFloatSignature(*CI, Intrinsic::log); + case LibFunc::log10: + case LibFunc::log10f: + case LibFunc::log10l: + return checkUnaryFloatSignature(*CI, Intrinsic::log10); + case LibFunc::log2: + case LibFunc::log2f: + case LibFunc::log2l: + return checkUnaryFloatSignature(*CI, Intrinsic::log2); + case LibFunc::fabs: + case LibFunc::fabsf: + case LibFunc::fabsl: + return checkUnaryFloatSignature(*CI, Intrinsic::fabs); + case LibFunc::copysign: + case LibFunc::copysignf: + case LibFunc::copysignl: + return checkBinaryFloatSignature(*CI, Intrinsic::copysign); + case LibFunc::floor: + case LibFunc::floorf: + case LibFunc::floorl: + return checkUnaryFloatSignature(*CI, Intrinsic::floor); + case LibFunc::ceil: + case LibFunc::ceilf: + case LibFunc::ceill: + return checkUnaryFloatSignature(*CI, Intrinsic::ceil); + case LibFunc::trunc: + case LibFunc::truncf: + case LibFunc::truncl: + return checkUnaryFloatSignature(*CI, Intrinsic::trunc); + case LibFunc::rint: + case LibFunc::rintf: + case LibFunc::rintl: + return checkUnaryFloatSignature(*CI, Intrinsic::rint); + case LibFunc::nearbyint: + case LibFunc::nearbyintf: + case LibFunc::nearbyintl: + return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint); + case LibFunc::round: + case LibFunc::roundf: + case LibFunc::roundl: + return checkUnaryFloatSignature(*CI, Intrinsic::round); + case LibFunc::pow: + case LibFunc::powf: + case LibFunc::powl: + return checkBinaryFloatSignature(*CI, Intrinsic::pow); + } + + return Intrinsic::not_intrinsic; +} + +} // llvm namespace + +#endif diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h index e93b39a..aec3993 100644 --- a/include/llvm/Transforms/Vectorize.h +++ b/include/llvm/Transforms/Vectorize.h @@ -47,6 +47,9 @@ struct VectorizeConfig { /// @brief Vectorize floating-point math intrinsics. bool VectorizeMath; + /// @brief Vectorize bit intrinsics. + bool VectorizeBitManipulations; + /// @brief Vectorize the fused-multiply-add intrinsic. 
bool VectorizeFMA; diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap new file mode 100644 index 0000000..1790a72 --- /dev/null +++ b/include/llvm/module.modulemap @@ -0,0 +1,177 @@ +module LLVM_Analysis { + requires cplusplus + umbrella "Analysis" + module * { export * } + exclude header "Analysis/BlockFrequencyInfoImpl.h" +} + +module LLVM_AsmParser { requires cplusplus umbrella "AsmParser" module * { export * } } + +// A module covering CodeGen/ and Target/. These are intertwined +// and codependent, and thus notionally form a single module. +module LLVM_Backend { + requires cplusplus + + module CodeGen { + umbrella "CodeGen" + module * { export * } + + // FIXME: Why is this excluded? + exclude header "CodeGen/MachineValueType.h" + + // Exclude these; they're intended to be included into only a single + // translation unit (or none) and aren't part of this module. + exclude header "CodeGen/CommandFlags.h" + exclude header "CodeGen/LinkAllAsmWriterComponents.h" + exclude header "CodeGen/LinkAllCodegenComponents.h" + } + + module Target { + umbrella "Target" + module * { export * } + } + + // FIXME: Where should this go? + module Analysis_BlockFrequencyInfoImpl { + header "Analysis/BlockFrequencyInfoImpl.h" + export * + } +} + +module LLVM_Bitcode { requires cplusplus umbrella "Bitcode" module * { export * } } +module LLVM_Config { requires cplusplus umbrella "Config" module * { export * } } +module LLVM_DebugInfo { requires cplusplus umbrella "DebugInfo" module * { export * } } +module LLVM_ExecutionEngine { + requires cplusplus + + umbrella "ExecutionEngine" + module * { export * } + + // Exclude this; it's an optional component of the ExecutionEngine. + exclude header "ExecutionEngine/OProfileWrapper.h" + + // Exclude these; they're intended to be included into only a single + // translation unit (or none) and aren't part of this module. + exclude header "ExecutionEngine/JIT.h" + exclude header "ExecutionEngine/MCJIT.h" + exclude header "ExecutionEngine/Interpreter.h" +} + +module LLVM_IR { + requires cplusplus + + // FIXME: Is this the right place for these? + module Pass { header "Pass.h" export * } + module PassSupport { header "PassSupport.h" export * } + module PassAnalysisSupport { header "PassAnalysisSupport.h" export * } + module PassRegistry { header "PassRegistry.h" export * } + module InitializePasses { header "InitializePasses.h" export * } + + umbrella "IR" + module * { export * } + + // We cannot have llvm/PassManager.h and llvm/IR/PassManager.h in the same TU, + // so we can't include llvm/IR/PassManager.h in the IR module. + exclude header "IR/PassManager.h" + exclude header "IR/LegacyPassManager.h" + + // Exclude this; it's intended for (repeated) textual inclusion. + exclude header "IR/Instruction.def" +} + +module LLVM_LegacyPassManager { + requires cplusplus + module CompatInterface { header "PassManager.h" export * } + module Implementation { header "IR/LegacyPassManager.h" export * } +} + +module LLVM_IR_PassManager { + requires cplusplus + // FIXME PR19358: This doesn't work! conflict LLVM_LegacyPassManager, "cannot use legacy pass manager and new pass manager in same file" + header "IR/PassManager.h" + export * +} + +module LLVM_IRReader { requires cplusplus umbrella "IRReader" module * { export * } } +module LLVM_LineEditor { requires cplusplus umbrella "LineEditor" module * { export * } } +module LLVM_LTO { requires cplusplus umbrella "LTO" module * { export * } } + +module LLVM_MC { + requires cplusplus + + // FIXME: Mislayered? 
+ module Support_TargetRegistry { + header "Support/TargetRegistry.h" + export * + } + + umbrella "MC" + module * { export * } + + // Exclude this; it's fundamentally non-modular. + exclude header "MC/MCTargetOptionsCommandFlags.h" +} + +module LLVM_Object { requires cplusplus umbrella "Object" module * { export * } } +module LLVM_Option { requires cplusplus umbrella "Option" module * { export * } } +module LLVM_TableGen { requires cplusplus umbrella "TableGen" module * { export * } } + +module LLVM_Transforms { + requires cplusplus + umbrella "Transforms" + module * { export * } + + // FIXME: Excluded because it does bad things with the legacy pass manager. + exclude header "Transforms/IPO/PassManagerBuilder.h" +} + +// A module covering ADT/ and Support/. These are intertwined and +// codependent, and notionally form a single module. +module LLVM_Utils { + module ADT { + requires cplusplus + + umbrella "ADT" + module * { export * } + } + + module Support { + requires cplusplus + + umbrella "Support" + module * { export * } + + // Exclude this; it's only included on Solaris. + exclude header "Support/Solaris.h" + + // Exclude this; it's only included on AIX and fundamentally non-modular. + exclude header "Support/AIXDataTypesFix.h" + + // Exclude this; it's fundamentally non-modular. + exclude header "Support/Debug.h" + + // Exclude this; it's fundamentally non-modular. + exclude header "Support/PluginLoader.h" + + // Exclude this; it's a weirdly-factored part of llvm-gcov and conflicts + // with the Analysis module (which also defines an llvm::GCOVOptions). + exclude header "Support/GCOV.h" + + // FIXME: Mislayered? + exclude header "Support/TargetRegistry.h" + } +} + +module LLVM_CodeGen_MachineValueType { + requires cplusplus + header "CodeGen/MachineValueType.h" + export * +} + +// This is used for a $src == $build compilation. Otherwise we use +// LLVM_Support_DataTypes_Build, defined in a module map that is +// copied into the build area. +module LLVM_Support_DataTypes_Src { + header "llvm/Support/DataTypes.h" + export * +} diff --git a/include/llvm/module.modulemap.build b/include/llvm/module.modulemap.build new file mode 100644 index 0000000..7150fe9 --- /dev/null +++ b/include/llvm/module.modulemap.build @@ -0,0 +1,5 @@ +// This is copied into the build area for a $src != $build compilation. +module LLVM_Support_DataTypes { + header "Support/DataTypes.h" + export * +} diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp index 9583bbe..57237e5 100644 --- a/lib/Analysis/AliasAnalysis.cpp +++ b/lib/Analysis/AliasAnalysis.cpp @@ -473,7 +473,7 @@ AliasAnalysis::~AliasAnalysis() {} /// void AliasAnalysis::InitializeAliasAnalysis(Pass *P) { DataLayoutPass *DLP = P->getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? 
&DLP->getDataLayout() : nullptr; TLI = P->getAnalysisIfAvailable(); AA = &P->getAnalysis(); } diff --git a/lib/Analysis/AliasAnalysisCounter.cpp b/lib/Analysis/AliasAnalysisCounter.cpp index 2e3bc55..b860914 100644 --- a/lib/Analysis/AliasAnalysisCounter.cpp +++ b/lib/Analysis/AliasAnalysisCounter.cpp @@ -126,7 +126,7 @@ AliasAnalysis::AliasResult AliasAnalysisCounter::alias(const Location &LocA, const Location &LocB) { AliasResult R = getAnalysis().alias(LocA, LocB); - const char *AliasString = 0; + const char *AliasString = nullptr; switch (R) { case NoAlias: No++; AliasString = "No alias"; break; case MayAlias: May++; AliasString = "May alias"; break; @@ -152,7 +152,7 @@ AliasAnalysisCounter::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { ModRefResult R = getAnalysis().getModRefInfo(CS, Loc); - const char *MRString = 0; + const char *MRString = nullptr; switch (R) { case NoModRef: NoMR++; MRString = "NoModRef"; break; case Ref: JustRef++; MRString = "JustRef"; break; diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp index ab1005e..a45fe23 100644 --- a/lib/Analysis/AliasSetTracker.cpp +++ b/lib/Analysis/AliasSetTracker.cpp @@ -72,16 +72,16 @@ void AliasSet::mergeSetIn(AliasSet &AS, AliasSetTracker &AST) { AS.PtrList->setPrevInList(PtrListEnd); PtrListEnd = AS.PtrListEnd; - AS.PtrList = 0; + AS.PtrList = nullptr; AS.PtrListEnd = &AS.PtrList; - assert(*AS.PtrListEnd == 0 && "End of list is not null?"); + assert(*AS.PtrListEnd == nullptr && "End of list is not null?"); } } void AliasSetTracker::removeAliasSet(AliasSet *AS) { if (AliasSet *Fwd = AS->Forward) { Fwd->dropRef(*this); - AS->Forward = 0; + AS->Forward = nullptr; } AliasSets.erase(AS); } @@ -115,10 +115,10 @@ void AliasSet::addPointer(AliasSetTracker &AST, PointerRec &Entry, Entry.updateSizeAndTBAAInfo(Size, TBAAInfo); // Add it to the end of the list... - assert(*PtrListEnd == 0 && "End of list is not null?"); + assert(*PtrListEnd == nullptr && "End of list is not null?"); *PtrListEnd = &Entry; PtrListEnd = Entry.setPrevInList(PtrListEnd); - assert(*PtrListEnd == 0 && "End of list is not null?"); + assert(*PtrListEnd == nullptr && "End of list is not null?"); addRef(); // Entry points to alias set. } @@ -217,11 +217,11 @@ void AliasSetTracker::clear() { AliasSet *AliasSetTracker::findAliasSetForPointer(const Value *Ptr, uint64_t Size, const MDNode *TBAAInfo) { - AliasSet *FoundSet = 0; + AliasSet *FoundSet = nullptr; for (iterator I = begin(), E = end(); I != E; ++I) { if (I->Forward || !I->aliasesPointer(Ptr, Size, TBAAInfo, AA)) continue; - if (FoundSet == 0) { // If this is the first alias set ptr can go into. + if (!FoundSet) { // If this is the first alias set ptr can go into. FoundSet = I; // Remember it. } else { // Otherwise, we must merge the sets. FoundSet->mergeSetIn(*I, *this); // Merge in contents. @@ -245,12 +245,12 @@ bool AliasSetTracker::containsPointer(Value *Ptr, uint64_t Size, AliasSet *AliasSetTracker::findAliasSetForUnknownInst(Instruction *Inst) { - AliasSet *FoundSet = 0; + AliasSet *FoundSet = nullptr; for (iterator I = begin(), E = end(); I != E; ++I) { if (I->Forward || !I->aliasesUnknownInst(Inst, AA)) continue; - if (FoundSet == 0) // If this is the first alias set ptr can go into. + if (!FoundSet) // If this is the first alias set ptr can go into. FoundSet = I; // Remember it. else if (!I->Forward) // Otherwise, we must merge the sets. FoundSet->mergeSetIn(*I, *this); // Merge in contents. 
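Aside from the analysis changes, the dominant edit in these lib/Analysis hunks is the mechanical replacement of literal 0 and NULL with C++11 nullptr. A self-contained illustration, not taken from the patch, of the overload ambiguity this avoids:

    #include <cstddef>

    void take(int);    // (1)
    void take(char *); // (2)

    void demo() {
      take(0);       // calls (1): 0 is an int literal, even where a pointer was meant
      take(nullptr); // calls (2) unambiguously: nullptr_t converts only to pointer types
      // take(NULL); // calls (1) or is ambiguous, depending on how NULL is defined
    }

nullptr also reads unambiguously in conditionals, which is why comparisons such as "FoundSet == 0" become the plain "!FoundSet" in the hunks above.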
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp index c960123..01c1c7e 100644 --- a/lib/Analysis/Analysis.cpp +++ b/lib/Analysis/Analysis.cpp @@ -73,7 +73,7 @@ void LLVMInitializeAnalysis(LLVMPassRegistryRef R) { LLVMBool LLVMVerifyModule(LLVMModuleRef M, LLVMVerifierFailureAction Action, char **OutMessages) { - raw_ostream *DebugOS = Action != LLVMReturnStatusAction ? &errs() : 0; + raw_ostream *DebugOS = Action != LLVMReturnStatusAction ? &errs() : nullptr; std::string Messages; raw_string_ostream MsgsOS(Messages); @@ -94,7 +94,8 @@ LLVMBool LLVMVerifyModule(LLVMModuleRef M, LLVMVerifierFailureAction Action, LLVMBool LLVMVerifyFunction(LLVMValueRef Fn, LLVMVerifierFailureAction Action) { LLVMBool Result = verifyFunction( - *unwrap(Fn), Action != LLVMReturnStatusAction ? &errs() : 0); + *unwrap(Fn), Action != LLVMReturnStatusAction ? &errs() + : nullptr); if (Action == LLVMAbortProcessAction && Result) report_fatal_error("Broken function found, compilation aborted!"); diff --git a/lib/Analysis/Android.mk b/lib/Analysis/Android.mk index 76eee74..a8fef77 100644 --- a/lib/Analysis/Android.mk +++ b/lib/Analysis/Android.mk @@ -9,6 +9,7 @@ analysis_SRC_FILES := \ Analysis.cpp \ BasicAliasAnalysis.cpp \ BlockFrequencyInfo.cpp \ + BlockFrequencyInfoImpl.cpp \ BranchProbabilityInfo.cpp \ CFG.cpp \ CFGPrinter.cpp \ diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index e267374..fe90b84 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -298,7 +298,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, do { // See if this is a bitcast or GEP. const Operator *Op = dyn_cast(V); - if (Op == 0) { + if (!Op) { // The only non-operator case we can handle are GlobalAliases. if (const GlobalAlias *GA = dyn_cast(V)) { if (!GA->mayBeOverridden()) { @@ -315,7 +315,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, } const GEPOperator *GEPOp = dyn_cast(Op); - if (GEPOp == 0) { + if (!GEPOp) { // If it's not a GEP, hand it off to SimplifyInstruction to see if it // can come up with something. This matches what GetUnderlyingObject does. if (const Instruction *I = dyn_cast(V)) @@ -336,7 +336,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs, // If we are lacking DataLayout information, we can't compute the offets of // elements computed by GEPs. However, we can handle bitcast equivalent // GEPs. - if (DL == 0) { + if (!DL) { if (!GEPOp->hasAllZeroIndices()) return V; V = GEPOp->getOperand(0); @@ -433,7 +433,7 @@ static const Function *getParent(const Value *V) { if (const Argument *arg = dyn_cast(V)) return arg->getParent(); - return NULL; + return nullptr; } static bool notDifferentParent(const Value *O1, const Value *O2) { @@ -753,7 +753,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS, // Finally, handle specific knowledge of intrinsics. 
const IntrinsicInst *II = dyn_cast(CS.getInstruction()); - if (II != 0) + if (II != nullptr) switch (II->getIntrinsicID()) { default: break; case Intrinsic::memcpy: @@ -868,21 +868,6 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS, return ModRefResult(AliasAnalysis::getModRefInfo(CS, Loc) & Min); } -static bool areVarIndicesEqual(SmallVectorImpl &Indices1, - SmallVectorImpl &Indices2) { - unsigned Size1 = Indices1.size(); - unsigned Size2 = Indices2.size(); - - if (Size1 != Size2) - return false; - - for (unsigned I = 0; I != Size1; ++I) - if (Indices1[I] != Indices2[I]) - return false; - - return true; -} - /// aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP instruction /// against another pointer. We know that V1 is a GEP, but we don't know /// anything about V2. UnderlyingV1 is GetUnderlyingObject(GEP1, DL), @@ -904,8 +889,8 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, // derived pointer. if (const GEPOperator *GEP2 = dyn_cast(V2)) { // Do the base pointers alias? - AliasResult BaseAlias = aliasCheck(UnderlyingV1, UnknownSize, 0, - UnderlyingV2, UnknownSize, 0); + AliasResult BaseAlias = aliasCheck(UnderlyingV1, UnknownSize, nullptr, + UnderlyingV2, UnknownSize, nullptr); // Check for geps of non-aliasing underlying pointers where the offsets are // identical. @@ -929,8 +914,8 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, // DecomposeGEPExpression and GetUnderlyingObject should return the // same result except when DecomposeGEPExpression has no DataLayout. if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) { - assert(DL == 0 && - "DecomposeGEPExpression and GetUnderlyingObject disagree!"); + assert(!DL && + "DecomposeGEPExpression and GetUnderlyingObject disagree!"); return MayAlias; } // If the max search depth is reached the result is undefined @@ -939,7 +924,7 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, // Same offsets. if (GEP1BaseOffset == GEP2BaseOffset && - areVarIndicesEqual(GEP1VariableIndices, GEP2VariableIndices)) + GEP1VariableIndices == GEP2VariableIndices) return NoAlias; GEP1VariableIndices.clear(); } @@ -966,7 +951,7 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, // DecomposeGEPExpression and GetUnderlyingObject should return the // same result except when DecomposeGEPExpression has no DataLayout. if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) { - assert(DL == 0 && + assert(!DL && "DecomposeGEPExpression and GetUnderlyingObject disagree!"); return MayAlias; } @@ -988,7 +973,7 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, if (V1Size == UnknownSize && V2Size == UnknownSize) return MayAlias; - AliasResult R = aliasCheck(UnderlyingV1, UnknownSize, 0, + AliasResult R = aliasCheck(UnderlyingV1, UnknownSize, nullptr, V2, V2Size, V2TBAAInfo); if (R != MustAlias) // If V2 may alias GEP base pointer, conservatively returns MayAlias. @@ -1005,7 +990,7 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, // DecomposeGEPExpression and GetUnderlyingObject should return the // same result except when DecomposeGEPExpression has no DataLayout. if (GEP1BasePtr != UnderlyingV1) { - assert(DL == 0 && + assert(!DL && "DecomposeGEPExpression and GetUnderlyingObject disagree!"); return MayAlias; } @@ -1371,7 +1356,7 @@ bool BasicAliasAnalysis::isValueEqualInPotentialCycles(const Value *V, // Use dominance or loop info if available. 
DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable(); - DominatorTree *DT = DTWP ? &DTWP->getDomTree() : 0; + DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; LoopInfo *LI = getAnalysisIfAvailable(); // Make sure that the visited phis cannot reach the Value. This ensures that diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp index 63049a5..8ed8e3e 100644 --- a/lib/Analysis/BlockFrequencyInfo.cpp +++ b/lib/Analysis/BlockFrequencyInfo.cpp @@ -1,4 +1,4 @@ -//=======-------- BlockFrequencyInfo.cpp - Block Frequency Analysis -------===// +//===- BlockFrequencyInfo.cpp - Block Frequency Analysis ------------------===// // // The LLVM Compiler Infrastructure // @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/BlockFrequencyImpl.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/Passes.h" @@ -24,6 +24,8 @@ using namespace llvm; +#define DEBUG_TYPE "block-freq" + #ifndef NDEBUG enum GVDAGType { GVDT_None, @@ -106,6 +108,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis", true, true) @@ -120,14 +123,16 @@ BlockFrequencyInfo::~BlockFrequencyInfo() {} void BlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); + AU.addRequired(); AU.setPreservesAll(); } bool BlockFrequencyInfo::runOnFunction(Function &F) { BranchProbabilityInfo &BPI = getAnalysis(); + LoopInfo &LI = getAnalysis(); if (!BFI) BFI.reset(new ImplType); - BFI->doFunction(&F, &BPI); + BFI->doFunction(&F, &BPI, &LI); #ifndef NDEBUG if (ViewBlockFreqPropagationDAG != GVDT_None) view(); @@ -158,7 +163,7 @@ void BlockFrequencyInfo::view() const { } const Function *BlockFrequencyInfo::getFunction() const { - return BFI ? BFI->Fn : nullptr; + return BFI ? BFI->getFunction() : nullptr; } raw_ostream &BlockFrequencyInfo:: diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp new file mode 100644 index 0000000..87d93a4 --- /dev/null +++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -0,0 +1,995 @@ +//===- BlockFrequencyImplInfo.cpp - Block Frequency Info Implementation ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Loops should be simplified before this analysis. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; +using namespace llvm::bfi_detail; + +#define DEBUG_TYPE "block-freq" + +//===----------------------------------------------------------------------===// +// +// UnsignedFloat implementation. 
+// +//===----------------------------------------------------------------------===// +#ifndef _MSC_VER +const int32_t UnsignedFloatBase::MaxExponent; +const int32_t UnsignedFloatBase::MinExponent; +#endif + +static void appendDigit(std::string &Str, unsigned D) { + assert(D < 10); + Str += '0' + D % 10; +} + +static void appendNumber(std::string &Str, uint64_t N) { + while (N) { + appendDigit(Str, N % 10); + N /= 10; + } +} + +static bool doesRoundUp(char Digit) { + switch (Digit) { + case '5': + case '6': + case '7': + case '8': + case '9': + return true; + default: + return false; + } +} + +static std::string toStringAPFloat(uint64_t D, int E, unsigned Precision) { + assert(E >= UnsignedFloatBase::MinExponent); + assert(E <= UnsignedFloatBase::MaxExponent); + + // Find a new E, but don't let it increase past MaxExponent. + int LeadingZeros = UnsignedFloatBase::countLeadingZeros64(D); + int NewE = std::min(UnsignedFloatBase::MaxExponent, E + 63 - LeadingZeros); + int Shift = 63 - (NewE - E); + assert(Shift <= LeadingZeros); + assert(Shift == LeadingZeros || NewE == UnsignedFloatBase::MaxExponent); + D <<= Shift; + E = NewE; + + // Check for a denormal. + unsigned AdjustedE = E + 16383; + if (!(D >> 63)) { + assert(E == UnsignedFloatBase::MaxExponent); + AdjustedE = 0; + } + + // Build the float and print it. + uint64_t RawBits[2] = {D, AdjustedE}; + APFloat Float(APFloat::x87DoubleExtended, APInt(80, RawBits)); + SmallVector Chars; + Float.toString(Chars, Precision, 0); + return std::string(Chars.begin(), Chars.end()); +} + +static std::string stripTrailingZeros(const std::string &Float) { + size_t NonZero = Float.find_last_not_of('0'); + assert(NonZero != std::string::npos && "no . in floating point string"); + + if (Float[NonZero] == '.') + ++NonZero; + + return Float.substr(0, NonZero + 1); +} + +std::string UnsignedFloatBase::toString(uint64_t D, int16_t E, int Width, + unsigned Precision) { + if (!D) + return "0.0"; + + // Canonicalize exponent and digits. + uint64_t Above0 = 0; + uint64_t Below0 = 0; + uint64_t Extra = 0; + int ExtraShift = 0; + if (E == 0) { + Above0 = D; + } else if (E > 0) { + if (int Shift = std::min(int16_t(countLeadingZeros64(D)), E)) { + D <<= Shift; + E -= Shift; + + if (!E) + Above0 = D; + } + } else if (E > -64) { + Above0 = D >> -E; + Below0 = D << (64 + E); + } else if (E > -120) { + Below0 = D >> (-E - 64); + Extra = D << (128 + E); + ExtraShift = -64 - E; + } + + // Fall back on APFloat for very small and very large numbers. + if (!Above0 && !Below0) + return toStringAPFloat(D, E, Precision); + + // Append the digits before the decimal. + std::string Str; + size_t DigitsOut = 0; + if (Above0) { + appendNumber(Str, Above0); + DigitsOut = Str.size(); + } else + appendDigit(Str, 0); + std::reverse(Str.begin(), Str.end()); + + // Return early if there's nothing after the decimal. + if (!Below0) + return Str + ".0"; + + // Append the decimal and beyond. + Str += '.'; + uint64_t Error = UINT64_C(1) << (64 - Width); + + // We need to shift Below0 to the right to make space for calculating + // digits. Save the precision we're losing in Extra. 
+ Extra = (Below0 & 0xf) << 56 | (Extra >> 8); + Below0 >>= 4; + size_t SinceDot = 0; + size_t AfterDot = Str.size(); + do { + if (ExtraShift) { + --ExtraShift; + Error *= 5; + } else + Error *= 10; + + Below0 *= 10; + Extra *= 10; + Below0 += (Extra >> 60); + Extra = Extra & (UINT64_MAX >> 4); + appendDigit(Str, Below0 >> 60); + Below0 = Below0 & (UINT64_MAX >> 4); + if (DigitsOut || Str.back() != '0') + ++DigitsOut; + ++SinceDot; + } while (Error && (Below0 << 4 | Extra >> 60) >= Error / 2 && + (!Precision || DigitsOut <= Precision || SinceDot < 2)); + + // Return early for maximum precision. + if (!Precision || DigitsOut <= Precision) + return stripTrailingZeros(Str); + + // Find where to truncate. + size_t Truncate = + std::max(Str.size() - (DigitsOut - Precision), AfterDot + 1); + + // Check if there's anything to truncate. + if (Truncate >= Str.size()) + return stripTrailingZeros(Str); + + bool Carry = doesRoundUp(Str[Truncate]); + if (!Carry) + return stripTrailingZeros(Str.substr(0, Truncate)); + + // Round with the first truncated digit. + for (std::string::reverse_iterator I(Str.begin() + Truncate), E = Str.rend(); + I != E; ++I) { + if (*I == '.') + continue; + if (*I == '9') { + *I = '0'; + continue; + } + + ++*I; + Carry = false; + break; + } + + // Add "1" in front if we still need to carry. + return stripTrailingZeros(std::string(Carry, '1') + Str.substr(0, Truncate)); +} + +raw_ostream &UnsignedFloatBase::print(raw_ostream &OS, uint64_t D, int16_t E, + int Width, unsigned Precision) { + return OS << toString(D, E, Width, Precision); +} + +void UnsignedFloatBase::dump(uint64_t D, int16_t E, int Width) { + print(dbgs(), D, E, Width, 0) << "[" << Width << ":" << D << "*2^" << E + << "]"; +} + +static std::pair +getRoundedFloat(uint64_t N, bool ShouldRound, int64_t Shift) { + if (ShouldRound) + if (!++N) + // Rounding caused an overflow. + return std::make_pair(UINT64_C(1), Shift + 64); + return std::make_pair(N, Shift); +} + +std::pair UnsignedFloatBase::divide64(uint64_t Dividend, + uint64_t Divisor) { + // Input should be sanitized. + assert(Divisor); + assert(Dividend); + + // Minimize size of divisor. + int16_t Shift = 0; + if (int Zeros = countTrailingZeros(Divisor)) { + Shift -= Zeros; + Divisor >>= Zeros; + } + + // Check for powers of two. + if (Divisor == 1) + return std::make_pair(Dividend, Shift); + + // Maximize size of dividend. + if (int Zeros = countLeadingZeros64(Dividend)) { + Shift -= Zeros; + Dividend <<= Zeros; + } + + // Start with the result of a divide. + uint64_t Quotient = Dividend / Divisor; + Dividend %= Divisor; + + // Continue building the quotient with long division. + // + // TODO: continue with largers digits. + while (!(Quotient >> 63) && Dividend) { + // Shift Dividend, and check for overflow. + bool IsOverflow = Dividend >> 63; + Dividend <<= 1; + --Shift; + + // Divide. + bool DoesDivide = IsOverflow || Divisor <= Dividend; + Quotient = (Quotient << 1) | uint64_t(DoesDivide); + Dividend -= DoesDivide ? Divisor : 0; + } + + // Round. + if (Dividend >= getHalf(Divisor)) + if (!++Quotient) + // Rounding caused an overflow in Quotient. + return std::make_pair(UINT64_C(1), Shift + 64); + + return getRoundedFloat(Quotient, Dividend >= getHalf(Divisor), Shift); +} + +std::pair UnsignedFloatBase::multiply64(uint64_t L, + uint64_t R) { + // Separate into two 32-bit digits (U.L). + uint64_t UL = L >> 32, LL = L & UINT32_MAX, UR = R >> 32, LR = R & UINT32_MAX; + + // Compute cross products. 
+ uint64_t P1 = UL * UR, P2 = UL * LR, P3 = LL * UR, P4 = LL * LR; + + // Sum into two 64-bit digits. + uint64_t Upper = P1, Lower = P4; + auto addWithCarry = [&](uint64_t N) { + uint64_t NewLower = Lower + (N << 32); + Upper += (N >> 32) + (NewLower < Lower); + Lower = NewLower; + }; + addWithCarry(P2); + addWithCarry(P3); + + // Check whether the upper digit is empty. + if (!Upper) + return std::make_pair(Lower, 0); + + // Shift as little as possible to maximize precision. + unsigned LeadingZeros = countLeadingZeros64(Upper); + int16_t Shift = 64 - LeadingZeros; + if (LeadingZeros) + Upper = Upper << LeadingZeros | Lower >> Shift; + bool ShouldRound = Shift && (Lower & UINT64_C(1) << (Shift - 1)); + return getRoundedFloat(Upper, ShouldRound, Shift); +} + +//===----------------------------------------------------------------------===// +// +// BlockMass implementation. +// +//===----------------------------------------------------------------------===// +UnsignedFloat BlockMass::toFloat() const { + if (isFull()) + return UnsignedFloat(1, 0); + return UnsignedFloat(getMass() + 1, -64); +} + +void BlockMass::dump() const { print(dbgs()); } + +static char getHexDigit(int N) { + assert(N < 16); + if (N < 10) + return '0' + N; + return 'a' + N - 10; +} +raw_ostream &BlockMass::print(raw_ostream &OS) const { + for (int Digits = 0; Digits < 16; ++Digits) + OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf); + return OS; +} + +//===----------------------------------------------------------------------===// +// +// BlockFrequencyInfoImpl implementation. +// +//===----------------------------------------------------------------------===// +namespace { + +typedef BlockFrequencyInfoImplBase::BlockNode BlockNode; +typedef BlockFrequencyInfoImplBase::Distribution Distribution; +typedef BlockFrequencyInfoImplBase::Distribution::WeightList WeightList; +typedef BlockFrequencyInfoImplBase::Float Float; +typedef BlockFrequencyInfoImplBase::LoopData LoopData; +typedef BlockFrequencyInfoImplBase::Weight Weight; +typedef BlockFrequencyInfoImplBase::FrequencyData FrequencyData; + +/// \brief Dithering mass distributer. +/// +/// This class splits up a single mass into portions by weight, dithering to +/// spread out error. No mass is lost. The dithering precision depends on the +/// precision of the product of \a BlockMass and \a BranchProbability. +/// +/// The distribution algorithm follows. +/// +/// 1. Initialize by saving the sum of the weights in \a RemWeight and the +/// mass to distribute in \a RemMass. +/// +/// 2. For each portion: +/// +/// 1. Construct a branch probability, P, as the portion's weight divided +/// by the current value of \a RemWeight. +/// 2. Calculate the portion's mass as \a RemMass times P. +/// 3. Update \a RemWeight and \a RemMass at each portion by subtracting +/// the current portion's weight and mass. +struct DitheringDistributer { + uint32_t RemWeight; + BlockMass RemMass; + + DitheringDistributer(Distribution &Dist, const BlockMass &Mass); + + BlockMass takeMass(uint32_t Weight); +}; +} + +DitheringDistributer::DitheringDistributer(Distribution &Dist, + const BlockMass &Mass) { + Dist.normalize(); + RemWeight = Dist.Total; + RemMass = Mass; +} + +BlockMass DitheringDistributer::takeMass(uint32_t Weight) { + assert(Weight && "invalid weight"); + assert(Weight <= RemWeight); + BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight); + + // Decrement totals (dither). 
+ RemWeight -= Weight; + RemMass -= Mass; + return Mass; +} + +void Distribution::add(const BlockNode &Node, uint64_t Amount, + Weight::DistType Type) { + assert(Amount && "invalid weight of 0"); + uint64_t NewTotal = Total + Amount; + + // Check for overflow. It should be impossible to overflow twice. + bool IsOverflow = NewTotal < Total; + assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow"); + DidOverflow |= IsOverflow; + + // Update the total. + Total = NewTotal; + + // Save the weight. + Weight W; + W.TargetNode = Node; + W.Amount = Amount; + W.Type = Type; + Weights.push_back(W); +} + +static void combineWeight(Weight &W, const Weight &OtherW) { + assert(OtherW.TargetNode.isValid()); + if (!W.Amount) { + W = OtherW; + return; + } + assert(W.Type == OtherW.Type); + assert(W.TargetNode == OtherW.TargetNode); + assert(W.Amount < W.Amount + OtherW.Amount && "Unexpected overflow"); + W.Amount += OtherW.Amount; +} +static void combineWeightsBySorting(WeightList &Weights) { + // Sort so edges to the same node are adjacent. + std::sort(Weights.begin(), Weights.end(), + [](const Weight &L, + const Weight &R) { return L.TargetNode < R.TargetNode; }); + + // Combine adjacent edges. + WeightList::iterator O = Weights.begin(); + for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E; + ++O, (I = L)) { + *O = *I; + + // Find the adjacent weights to the same node. + for (++L; L != E && I->TargetNode == L->TargetNode; ++L) + combineWeight(*O, *L); + } + + // Erase extra entries. + Weights.erase(O, Weights.end()); + return; +} +static void combineWeightsByHashing(WeightList &Weights) { + // Collect weights into a DenseMap. + typedef DenseMap HashTable; + HashTable Combined(NextPowerOf2(2 * Weights.size())); + for (const Weight &W : Weights) + combineWeight(Combined[W.TargetNode.Index], W); + + // Check whether anything changed. + if (Weights.size() == Combined.size()) + return; + + // Fill in the new weights. + Weights.clear(); + Weights.reserve(Combined.size()); + for (const auto &I : Combined) + Weights.push_back(I.second); +} +static void combineWeights(WeightList &Weights) { + // Use a hash table for many successors to keep this linear. + if (Weights.size() > 128) { + combineWeightsByHashing(Weights); + return; + } + + combineWeightsBySorting(Weights); +} +static uint64_t shiftRightAndRound(uint64_t N, int Shift) { + assert(Shift >= 0); + assert(Shift < 64); + if (!Shift) + return N; + return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1)); +} +void Distribution::normalize() { + // Early exit for termination nodes. + if (Weights.empty()) + return; + + // Only bother if there are multiple successors. + if (Weights.size() > 1) + combineWeights(Weights); + + // Early exit when combined into a single successor. + if (Weights.size() == 1) { + Total = 1; + Weights.front().Amount = 1; + return; + } + + // Determine how much to shift right so that the total fits into 32-bits. + // + // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1 + // for each weight can cause a 32-bit overflow. + int Shift = 0; + if (DidOverflow) + Shift = 33; + else if (Total > UINT32_MAX) + Shift = 33 - countLeadingZeros(Total); + + // Early exit if nothing needs to be scaled. + if (!Shift) + return; + + // Recompute the total through accumulation (rather than shifting it) so that + // it's accurate after shifting. + Total = 0; + + // Sum the weights to each node and shift right if necessary. + for (Weight &W : Weights) { + // Scale down below UINT32_MAX. 
Since Shift is larger than necessary, we + // can round here without concern about overflow. + assert(W.TargetNode.isValid()); + W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift)); + assert(W.Amount <= UINT32_MAX); + + // Update the total. + Total += W.Amount; + } + assert(Total <= UINT32_MAX); +} + +void BlockFrequencyInfoImplBase::clear() { + // Swap with a default-constructed std::vector, since std::vector<>::clear() + // does not actually clear heap storage. + std::vector().swap(Freqs); + std::vector().swap(Working); + Loops.clear(); +} + +/// \brief Clear all memory not needed downstream. +/// +/// Releases all memory not used downstream. In particular, saves Freqs. +static void cleanup(BlockFrequencyInfoImplBase &BFI) { + std::vector SavedFreqs(std::move(BFI.Freqs)); + BFI.clear(); + BFI.Freqs = std::move(SavedFreqs); +} + +bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, + const LoopData *OuterLoop, + const BlockNode &Pred, + const BlockNode &Succ, + uint64_t Weight) { + if (!Weight) + Weight = 1; + + auto isLoopHeader = [&OuterLoop](const BlockNode &Node) { + return OuterLoop && OuterLoop->isHeader(Node); + }; + + BlockNode Resolved = Working[Succ.Index].getResolvedNode(); + +#ifndef NDEBUG + auto debugSuccessor = [&](const char *Type) { + dbgs() << " =>" + << " [" << Type << "] weight = " << Weight; + if (!isLoopHeader(Resolved)) + dbgs() << ", succ = " << getBlockName(Succ); + if (Resolved != Succ) + dbgs() << ", resolved = " << getBlockName(Resolved); + dbgs() << "\n"; + }; + (void)debugSuccessor; +#endif + + if (isLoopHeader(Resolved)) { + DEBUG(debugSuccessor("backedge")); + Dist.addBackedge(OuterLoop->getHeader(), Weight); + return true; + } + + if (Working[Resolved.Index].getContainingLoop() != OuterLoop) { + DEBUG(debugSuccessor(" exit ")); + Dist.addExit(Resolved, Weight); + return true; + } + + if (Resolved < Pred) { + if (!isLoopHeader(Pred)) { + // If OuterLoop is an irreducible loop, we can't actually handle this. + assert((!OuterLoop || !OuterLoop->isIrreducible()) && + "unhandled irreducible control flow"); + + // Irreducible backedge. Abort. + DEBUG(debugSuccessor("abort!!!")); + return false; + } + + // If "Pred" is a loop header, then this isn't really a backedge; rather, + // OuterLoop must be irreducible. These false backedges can come only from + // secondary loop headers. + assert(OuterLoop && OuterLoop->isIrreducible() && !isLoopHeader(Resolved) && + "unhandled irreducible control flow"); + } + + DEBUG(debugSuccessor(" local ")); + Dist.addLocal(Resolved, Weight); + return true; +} + +bool BlockFrequencyInfoImplBase::addLoopSuccessorsToDist( + const LoopData *OuterLoop, LoopData &Loop, Distribution &Dist) { + // Copy the exit map into Dist. + for (const auto &I : Loop.Exits) + if (!addToDist(Dist, OuterLoop, Loop.getHeader(), I.first, + I.second.getMass())) + // Irreducible backedge. + return false; + + return true; +} + +/// \brief Get the maximum allowed loop scale. +/// +/// Gives the maximum number of estimated iterations allowed for a loop. Very +/// large numbers cause problems downstream (even within 64-bits). +static Float getMaxLoopScale() { return Float(1, 12); } + +/// \brief Compute the loop scale for a loop. +void BlockFrequencyInfoImplBase::computeLoopScale(LoopData &Loop) { + // Compute loop scale. 
+ DEBUG(dbgs() << "compute-loop-scale: " << getLoopName(Loop) << "\n"); + + // LoopScale == 1 / ExitMass + // ExitMass == HeadMass - BackedgeMass + BlockMass ExitMass = BlockMass::getFull() - Loop.BackedgeMass; + + // Block scale stores the inverse of the scale. + Loop.Scale = ExitMass.toFloat().inverse(); + + DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull() + << " - " << Loop.BackedgeMass << ")\n" + << " - scale = " << Loop.Scale << "\n"); + + if (Loop.Scale > getMaxLoopScale()) { + Loop.Scale = getMaxLoopScale(); + DEBUG(dbgs() << " - reduced-to-max-scale: " << getMaxLoopScale() << "\n"); + } +} + +/// \brief Package up a loop. +void BlockFrequencyInfoImplBase::packageLoop(LoopData &Loop) { + DEBUG(dbgs() << "packaging-loop: " << getLoopName(Loop) << "\n"); + + // Clear the subloop exits to prevent quadratic memory usage. + for (const BlockNode &M : Loop.Nodes) { + if (auto *Loop = Working[M.Index].getPackagedLoop()) + Loop->Exits.clear(); + DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n"); + } + Loop.IsPackaged = true; +} + +void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, + LoopData *OuterLoop, + Distribution &Dist) { + BlockMass Mass = Working[Source.Index].getMass(); + DEBUG(dbgs() << " => mass: " << Mass << "\n"); + + // Distribute mass to successors as laid out in Dist. + DitheringDistributer D(Dist, Mass); + +#ifndef NDEBUG + auto debugAssign = [&](const BlockNode &T, const BlockMass &M, + const char *Desc) { + dbgs() << " => assign " << M << " (" << D.RemMass << ")"; + if (Desc) + dbgs() << " [" << Desc << "]"; + if (T.isValid()) + dbgs() << " to " << getBlockName(T); + dbgs() << "\n"; + }; + (void)debugAssign; +#endif + + for (const Weight &W : Dist.Weights) { + // Check for a local edge (non-backedge and non-exit). + BlockMass Taken = D.takeMass(W.Amount); + if (W.Type == Weight::Local) { + Working[W.TargetNode.Index].getMass() += Taken; + DEBUG(debugAssign(W.TargetNode, Taken, nullptr)); + continue; + } + + // Backedges and exits only make sense if we're processing a loop. + assert(OuterLoop && "backedge or exit outside of loop"); + + // Check for a backedge. + if (W.Type == Weight::Backedge) { + OuterLoop->BackedgeMass += Taken; + DEBUG(debugAssign(BlockNode(), Taken, "back")); + continue; + } + + // This must be an exit. + assert(W.Type == Weight::Exit); + OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken)); + DEBUG(debugAssign(W.TargetNode, Taken, "exit")); + } +} + +static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI, + const Float &Min, const Float &Max) { + // Scale the Factor to a size that creates integers. Ideally, integers would + // be scaled so that Max == UINT64_MAX so that they can be best + // differentiated. However, the register allocator currently deals poorly + // with large numbers. Instead, push Min up a little from 1 to give some + // room to differentiate small, unequal numbers. + // + // TODO: fix issues downstream so that ScalingFactor can be Float(1,64)/Max. + Float ScalingFactor = Min.inverse(); + if ((Max / Min).lg() < 60) + ScalingFactor <<= 3; + + // Translate the floats to integers. 
+ DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max + << ", factor = " << ScalingFactor << "\n"); + for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) { + Float Scaled = BFI.Freqs[Index].Floating * ScalingFactor; + BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt()); + DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = " + << BFI.Freqs[Index].Floating << ", scaled = " << Scaled + << ", int = " << BFI.Freqs[Index].Integer << "\n"); + } +} + +/// \brief Unwrap a loop package. +/// +/// Visits all the members of a loop, adjusting their BlockData according to +/// the loop's pseudo-node. +static void unwrapLoop(BlockFrequencyInfoImplBase &BFI, LoopData &Loop) { + DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getLoopName(Loop) + << ": mass = " << Loop.Mass << ", scale = " << Loop.Scale + << "\n"); + Loop.Scale *= Loop.Mass.toFloat(); + Loop.IsPackaged = false; + DEBUG(dbgs() << " => combined-scale = " << Loop.Scale << "\n"); + + // Propagate the head scale through the loop. Since members are visited in + // RPO, the head scale will be updated by the loop scale first, and then the + // final head scale will be used for updated the rest of the members. + for (const BlockNode &N : Loop.Nodes) { + const auto &Working = BFI.Working[N.Index]; + Float &F = Working.isAPackage() ? Working.getPackagedLoop()->Scale + : BFI.Freqs[N.Index].Floating; + Float New = Loop.Scale * F; + DEBUG(dbgs() << " - " << BFI.getBlockName(N) << ": " << F << " => " << New + << "\n"); + F = New; + } +} + +void BlockFrequencyInfoImplBase::unwrapLoops() { + // Set initial frequencies from loop-local masses. + for (size_t Index = 0; Index < Working.size(); ++Index) + Freqs[Index].Floating = Working[Index].Mass.toFloat(); + + for (LoopData &Loop : Loops) + unwrapLoop(*this, Loop); +} + +void BlockFrequencyInfoImplBase::finalizeMetrics() { + // Unwrap loop packages in reverse post-order, tracking min and max + // frequencies. + auto Min = Float::getLargest(); + auto Max = Float::getZero(); + for (size_t Index = 0; Index < Working.size(); ++Index) { + // Update min/max scale. + Min = std::min(Min, Freqs[Index].Floating); + Max = std::max(Max, Freqs[Index].Floating); + } + + // Convert to integers. + convertFloatingToInteger(*this, Min, Max); + + // Clean up data structures. + cleanup(*this); + + // Print out the final stats. + DEBUG(dump()); +} + +BlockFrequency +BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const { + if (!Node.isValid()) + return 0; + return Freqs[Node.Index].Integer; +} +Float +BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { + if (!Node.isValid()) + return Float::getZero(); + return Freqs[Node.Index].Floating; +} + +std::string +BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const { + return std::string(); +} +std::string +BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const { + return getBlockName(Loop.getHeader()) + (Loop.isIrreducible() ? 
"**" : "*"); +} + +raw_ostream & +BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, + const BlockNode &Node) const { + return OS << getFloatingBlockFreq(Node); +} + +raw_ostream & +BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS, + const BlockFrequency &Freq) const { + Float Block(Freq.getFrequency(), 0); + Float Entry(getEntryFreq(), 0); + + return OS << Block / Entry; +} + +void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) { + Start = OuterLoop.getHeader(); + Nodes.reserve(OuterLoop.Nodes.size()); + for (auto N : OuterLoop.Nodes) + addNode(N); + indexNodes(); +} +void IrreducibleGraph::addNodesInFunction() { + Start = 0; + for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index) + if (!BFI.Working[Index].isPackaged()) + addNode(Index); + indexNodes(); +} +void IrreducibleGraph::indexNodes() { + for (auto &I : Nodes) + Lookup[I.Node.Index] = &I; +} +void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ, + const BFIBase::LoopData *OuterLoop) { + if (OuterLoop && OuterLoop->isHeader(Succ)) + return; + auto L = Lookup.find(Succ.Index); + if (L == Lookup.end()) + return; + IrrNode &SuccIrr = *L->second; + Irr.Edges.push_back(&SuccIrr); + SuccIrr.Edges.push_front(&Irr); + ++SuccIrr.NumIn; +} + +namespace llvm { +template <> struct GraphTraits { + typedef bfi_detail::IrreducibleGraph GraphT; + + typedef const GraphT::IrrNode NodeType; + typedef GraphT::IrrNode::iterator ChildIteratorType; + + static const NodeType *getEntryNode(const GraphT &G) { + return G.StartIrr; + } + static ChildIteratorType child_begin(NodeType *N) { return N->succ_begin(); } + static ChildIteratorType child_end(NodeType *N) { return N->succ_end(); } +}; +} + +/// \brief Find extra irreducible headers. +/// +/// Find entry blocks and other blocks with backedges, which exist when \c G +/// contains irreducible sub-SCCs. +static void findIrreducibleHeaders( + const BlockFrequencyInfoImplBase &BFI, + const IrreducibleGraph &G, + const std::vector &SCC, + LoopData::NodeList &Headers, LoopData::NodeList &Others) { + // Map from nodes in the SCC to whether it's an entry block. + SmallDenseMap InSCC; + + // InSCC also acts the set of nodes in the graph. Seed it. + for (const auto *I : SCC) + InSCC[I] = false; + + for (auto I = InSCC.begin(), E = InSCC.end(); I != E; ++I) { + auto &Irr = *I->first; + for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { + if (InSCC.count(P)) + continue; + + // This is an entry block. + I->second = true; + Headers.push_back(Irr.Node); + DEBUG(dbgs() << " => entry = " << BFI.getBlockName(Irr.Node) << "\n"); + break; + } + } + assert(Headers.size() >= 2 && "Should be irreducible"); + if (Headers.size() == InSCC.size()) { + // Every block is a header. + std::sort(Headers.begin(), Headers.end()); + return; + } + + // Look for extra headers from irreducible sub-SCCs. + for (const auto &I : InSCC) { + // Entry blocks are already headers. + if (I.second) + continue; + + auto &Irr = *I.first; + for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { + // Skip forward edges. + if (P->Node < Irr.Node) + continue; + + // Skip predecessors from entry blocks. These can have inverted + // ordering. + if (InSCC.lookup(P)) + continue; + + // Store the extra header. + Headers.push_back(Irr.Node); + DEBUG(dbgs() << " => extra = " << BFI.getBlockName(Irr.Node) << "\n"); + break; + } + if (Headers.back() == Irr.Node) + // Added this as a header. + continue; + + // This is not a header. 
+ Others.push_back(Irr.Node); + DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n"); + } + std::sort(Headers.begin(), Headers.end()); + std::sort(Others.begin(), Others.end()); +} + +static void createIrreducibleLoop( + BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, + LoopData *OuterLoop, std::list::iterator Insert, + const std::vector &SCC) { + // Translate the SCC into RPO. + DEBUG(dbgs() << " - found-scc\n"); + + LoopData::NodeList Headers; + LoopData::NodeList Others; + findIrreducibleHeaders(BFI, G, SCC, Headers, Others); + + auto Loop = BFI.Loops.emplace(Insert, OuterLoop, Headers.begin(), + Headers.end(), Others.begin(), Others.end()); + + // Update loop hierarchy. + for (const auto &N : Loop->Nodes) + if (BFI.Working[N.Index].isLoopHeader()) + BFI.Working[N.Index].Loop->Parent = &*Loop; + else + BFI.Working[N.Index].Loop = &*Loop; +} + +iterator_range::iterator> +BlockFrequencyInfoImplBase::analyzeIrreducible( + const IrreducibleGraph &G, LoopData *OuterLoop, + std::list::iterator Insert) { + assert((OuterLoop == nullptr) == (Insert == Loops.begin())); + auto Prev = OuterLoop ? std::prev(Insert) : Loops.end(); + + for (auto I = scc_begin(G); !I.isAtEnd(); ++I) { + if (I->size() < 2) + continue; + + // Translate the SCC into RPO. + createIrreducibleLoop(*this, G, OuterLoop, Insert, *I); + } + + if (OuterLoop) + return make_range(std::next(Prev), Insert); + return make_range(Loops.begin(), Insert); +} + +void +BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) { + OuterLoop.Exits.clear(); + OuterLoop.BackedgeMass = BlockMass::getEmpty(); + auto O = OuterLoop.Nodes.begin() + 1; + for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I) + if (!Working[I->Index].isPackaged()) + *O++ = *I; + OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end()); +} diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp index b901c54..bbd8750 100644 --- a/lib/Analysis/BranchProbabilityInfo.cpp +++ b/lib/Analysis/BranchProbabilityInfo.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "branch-prob" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/LoopInfo.h" @@ -25,6 +24,8 @@ using namespace llvm; +#define DEBUG_TYPE "branch-prob" + INITIALIZE_PASS_BEGIN(BranchProbabilityInfo, "branch-prob", "Branch Probability Analysis", false, true) INITIALIZE_PASS_DEPENDENCY(LoopInfo) @@ -322,6 +323,9 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB) { InEdges.push_back(I.getSuccessorIndex()); } + if (BackEdges.empty() && ExitingEdges.empty()) + return false; + if (uint32_t numBackEdges = BackEdges.size()) { uint32_t backWeight = LBH_TAKEN_WEIGHT / numBackEdges; if (backWeight < NORMAL_WEIGHT) @@ -557,7 +561,7 @@ isEdgeHot(const BasicBlock *Src, const BasicBlock *Dst) const { BasicBlock *BranchProbabilityInfo::getHotSucc(BasicBlock *BB) const { uint32_t Sum = 0; uint32_t MaxWeight = 0; - BasicBlock *MaxSucc = 0; + BasicBlock *MaxSucc = nullptr; for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) { BasicBlock *Succ = *I; @@ -577,7 +581,7 @@ BasicBlock *BranchProbabilityInfo::getHotSucc(BasicBlock *BB) const { if (BranchProbability(MaxWeight, Sum) > BranchProbability(4, 5)) return MaxSucc; - return 0; + return nullptr; } /// Get the raw edge weight for the edge. 
If can't find it, return @@ -594,11 +598,9 @@ getEdgeWeight(const BasicBlock *Src, unsigned IndexInSuccessors) const { return DEFAULT_WEIGHT; } -uint32_t -BranchProbabilityInfo:: -getEdgeWeight(const BasicBlock *Src, succ_const_iterator Dst) const { - size_t index = std::distance(succ_begin(Src), Dst); - return getEdgeWeight(Src, index); +uint32_t BranchProbabilityInfo::getEdgeWeight(const BasicBlock *Src, + succ_const_iterator Dst) const { + return getEdgeWeight(Src, Dst.getSuccessorIndex()); } /// Get the raw edge weight calculated for the block pair. This returns the sum diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp index 6963760..8ef5302 100644 --- a/lib/Analysis/CFG.cpp +++ b/lib/Analysis/CFG.cpp @@ -123,7 +123,7 @@ static bool loopContainsBoth(const LoopInfo *LI, const BasicBlock *BB1, const BasicBlock *BB2) { const Loop *L1 = getOutermostLoop(LI, BB1); const Loop *L2 = getOutermostLoop(LI, BB2); - return L1 != NULL && L1 == L2; + return L1 != nullptr && L1 == L2; } static bool isPotentiallyReachableInner(SmallVectorImpl &Worklist, @@ -133,7 +133,7 @@ static bool isPotentiallyReachableInner(SmallVectorImpl &Worklist, // When the stop block is unreachable, it's dominated from everywhere, // regardless of whether there's a path between the two blocks. if (DT && !DT->isReachableFromEntry(StopBB)) - DT = 0; + DT = nullptr; // Limit the number of blocks we visit. The goal is to avoid run-away compile // times on large CFGs without hampering sensible code. Arbitrarily chosen. @@ -156,7 +156,7 @@ static bool isPotentiallyReachableInner(SmallVectorImpl &Worklist, return true; } - if (const Loop *Outer = LI ? getOutermostLoop(LI, BB) : 0) { + if (const Loop *Outer = LI ? getOutermostLoop(LI, BB) : nullptr) { // All blocks in a single loop are reachable from all other blocks. From // any of these blocks, we can skip directly to the exits of the loop, // ignoring any other blocks inside the loop body. @@ -200,7 +200,7 @@ bool llvm::isPotentiallyReachable(const Instruction *A, const Instruction *B, // If the block is in a loop then we can reach any instruction in the block // from any other instruction in the block by going around a backedge. - if (LI && LI->getLoopFor(BB) != 0) + if (LI && LI->getLoopFor(BB) != nullptr) return true; // Linear scan, start at 'A', see whether we hit 'B' or the end first. 
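The CFG.cpp hunks above all sit inside a bounded, loop-aware worklist search: the walk gives up conservatively once it has visited too many blocks, and treats all blocks of a loop as mutually reachable. A rough self-contained sketch of that worklist shape (simplified; the Block type and the limit of 32 are illustrative assumptions, not LLVM's actual types or tuning):

#include <set>
#include <vector>

struct Block { std::vector<Block *> Succs; };

// Returns true if To may be reachable from From; answers true conservatively
// once more than Limit blocks have been visited.
static bool mayBeReachable(const Block *From, const Block *To, unsigned Limit = 32) {
  std::vector<const Block *> Worklist{From};
  std::set<const Block *> Visited;
  while (!Worklist.empty()) {
    const Block *BB = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(BB).second)
      continue; // Already explored this block.
    if (BB == To)
      return true;
    if (Visited.size() > Limit)
      return true; // CFG too large to scan: be conservative.
    Worklist.insert(Worklist.end(), BB->Succs.begin(), BB->Succs.end());
  }
  return false; // Exhausted every path without reaching To.
}

int main() {
  Block A, B, C;
  A.Succs = {&B};
  B.Succs = {&C};
  return mayBeReachable(&A, &C) ? 0 : 1;
}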
diff --git a/lib/Analysis/CFGPrinter.cpp b/lib/Analysis/CFGPrinter.cpp index 537d6d1..c2c19d6 100644 --- a/lib/Analysis/CFGPrinter.cpp +++ b/lib/Analysis/CFGPrinter.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/CFGPrinter.h" #include "llvm/Pass.h" +#include "llvm/Support/FileSystem.h" using namespace llvm; namespace { @@ -33,7 +34,7 @@ namespace { return false; } - void print(raw_ostream &OS, const Module* = 0) const override {} + void print(raw_ostream &OS, const Module* = nullptr) const override {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); @@ -56,7 +57,7 @@ namespace { return false; } - void print(raw_ostream &OS, const Module* = 0) const override {} + void print(raw_ostream &OS, const Module* = nullptr) const override {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); @@ -90,7 +91,7 @@ namespace { return false; } - void print(raw_ostream &OS, const Module* = 0) const override {} + void print(raw_ostream &OS, const Module* = nullptr) const override {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); @@ -123,7 +124,7 @@ namespace { errs() << "\n"; return false; } - void print(raw_ostream &OS, const Module* = 0) const override {} + void print(raw_ostream &OS, const Module* = nullptr) const override {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); @@ -147,8 +148,8 @@ void Function::viewCFG() const { /// viewCFGOnly - This function is meant for use from the debugger. It works /// just like viewCFG, but it does not include the contents of basic blocks -/// into the nodes, just the label. If you are only interested in the CFG t -/// his can make the graph smaller. +/// into the nodes, just the label. If you are only interested in the CFG +/// this can make the graph smaller. /// void Function::viewCFGOnly() const { ViewGraph(this, "cfg" + getName(), true); diff --git a/lib/Analysis/CGSCCPassManager.cpp b/lib/Analysis/CGSCCPassManager.cpp new file mode 100644 index 0000000..5d1d8a9 --- /dev/null +++ b/lib/Analysis/CGSCCPassManager.cpp @@ -0,0 +1,167 @@ +//===- CGSCCPassManager.cpp - Managing & running CGSCC passes -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +static cl::opt +DebugPM("debug-cgscc-pass-manager", cl::Hidden, + cl::desc("Print CGSCC pass management debugging information")); + +PreservedAnalyses CGSCCPassManager::run(LazyCallGraph::SCC *C, + CGSCCAnalysisManager *AM) { + PreservedAnalyses PA = PreservedAnalyses::all(); + + if (DebugPM) + dbgs() << "Starting CGSCC pass manager run.\n"; + + for (unsigned Idx = 0, Size = Passes.size(); Idx != Size; ++Idx) { + if (DebugPM) + dbgs() << "Running CGSCC pass: " << Passes[Idx]->name() << "\n"; + + PreservedAnalyses PassPA = Passes[Idx]->run(C, AM); + if (AM) + AM->invalidate(C, PassPA); + PA.intersect(std::move(PassPA)); + } + + if (DebugPM) + dbgs() << "Finished CGSCC pass manager run.\n"; + + return PA; +} + +bool CGSCCAnalysisManager::empty() const { + assert(CGSCCAnalysisResults.empty() == CGSCCAnalysisResultLists.empty() && + "The storage and index of analysis results disagree on how many there " + "are!"); + return CGSCCAnalysisResults.empty(); +} + +void CGSCCAnalysisManager::clear() { + CGSCCAnalysisResults.clear(); + CGSCCAnalysisResultLists.clear(); +} + +CGSCCAnalysisManager::ResultConceptT & +CGSCCAnalysisManager::getResultImpl(void *PassID, LazyCallGraph::SCC *C) { + CGSCCAnalysisResultMapT::iterator RI; + bool Inserted; + std::tie(RI, Inserted) = CGSCCAnalysisResults.insert(std::make_pair( + std::make_pair(PassID, C), CGSCCAnalysisResultListT::iterator())); + + // If we don't have a cached result for this function, look up the pass and + // run it to produce a result, which we then add to the cache. + if (Inserted) { + CGSCCAnalysisResultListT &ResultList = CGSCCAnalysisResultLists[C]; + ResultList.emplace_back(PassID, lookupPass(PassID).run(C, this)); + RI->second = std::prev(ResultList.end()); + } + + return *RI->second->second; +} + +CGSCCAnalysisManager::ResultConceptT * +CGSCCAnalysisManager::getCachedResultImpl(void *PassID, + LazyCallGraph::SCC *C) const { + CGSCCAnalysisResultMapT::const_iterator RI = + CGSCCAnalysisResults.find(std::make_pair(PassID, C)); + return RI == CGSCCAnalysisResults.end() ? nullptr : &*RI->second->second; +} + +void CGSCCAnalysisManager::invalidateImpl(void *PassID, LazyCallGraph::SCC *C) { + CGSCCAnalysisResultMapT::iterator RI = + CGSCCAnalysisResults.find(std::make_pair(PassID, C)); + if (RI == CGSCCAnalysisResults.end()) + return; + + CGSCCAnalysisResultLists[C].erase(RI->second); +} + +void CGSCCAnalysisManager::invalidateImpl(LazyCallGraph::SCC *C, + const PreservedAnalyses &PA) { + // Clear all the invalidated results associated specifically with this + // function. 
+ SmallVector InvalidatedPassIDs; + CGSCCAnalysisResultListT &ResultsList = CGSCCAnalysisResultLists[C]; + for (CGSCCAnalysisResultListT::iterator I = ResultsList.begin(), + E = ResultsList.end(); + I != E;) + if (I->second->invalidate(C, PA)) { + InvalidatedPassIDs.push_back(I->first); + I = ResultsList.erase(I); + } else { + ++I; + } + while (!InvalidatedPassIDs.empty()) + CGSCCAnalysisResults.erase( + std::make_pair(InvalidatedPassIDs.pop_back_val(), C)); + CGSCCAnalysisResultLists.erase(C); +} + +char CGSCCAnalysisManagerModuleProxy::PassID; + +CGSCCAnalysisManagerModuleProxy::Result +CGSCCAnalysisManagerModuleProxy::run(Module *M) { + assert(CGAM->empty() && "CGSCC analyses ran prior to the module proxy!"); + return Result(*CGAM); +} + +CGSCCAnalysisManagerModuleProxy::Result::~Result() { + // Clear out the analysis manager if we're being destroyed -- it means we + // didn't even see an invalidate call when we got invalidated. + CGAM->clear(); +} + +bool CGSCCAnalysisManagerModuleProxy::Result::invalidate( + Module *M, const PreservedAnalyses &PA) { + // If this proxy isn't marked as preserved, then we can't even invalidate + // individual CGSCC analyses, there may be an invalid set of SCC objects in + // the cache making it impossible to incrementally preserve them. + // Just clear the entire manager. + if (!PA.preserved(ID())) + CGAM->clear(); + + // Return false to indicate that this result is still a valid proxy. + return false; +} + +char ModuleAnalysisManagerCGSCCProxy::PassID; + +char FunctionAnalysisManagerCGSCCProxy::PassID; + +FunctionAnalysisManagerCGSCCProxy::Result +FunctionAnalysisManagerCGSCCProxy::run(LazyCallGraph::SCC *C) { + assert(FAM->empty() && "Function analyses ran prior to the CGSCC proxy!"); + return Result(*FAM); +} + +FunctionAnalysisManagerCGSCCProxy::Result::~Result() { + // Clear out the analysis manager if we're being destroyed -- it means we + // didn't even see an invalidate call when we got invalidated. + FAM->clear(); +} + +bool FunctionAnalysisManagerCGSCCProxy::Result::invalidate( + LazyCallGraph::SCC *C, const PreservedAnalyses &PA) { + // If this proxy isn't marked as preserved, then we can't even invalidate + // individual function analyses, there may be an invalid set of Function + // objects in the cache making it impossible to incrementally preserve them. + // Just clear the entire manager. + if (!PA.preserved(ID())) + FAM->clear(); + + // Return false to indicate that this result is still a valid proxy. + return false; +} + +char CGSCCAnalysisManagerFunctionProxy::PassID; diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index c6d4573..b546789 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -7,9 +7,11 @@ add_llvm_library(LLVMAnalysis Analysis.cpp BasicAliasAnalysis.cpp BlockFrequencyInfo.cpp + BlockFrequencyInfoImpl.cpp BranchProbabilityInfo.cpp CFG.cpp CFGPrinter.cpp + CGSCCPassManager.cpp CaptureTracking.cpp CostModel.cpp CodeMetrics.cpp diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 782acfa..0ac1cb5 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -56,7 +56,7 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, // Handle a vector->integer cast. 
if (IntegerType *IT = dyn_cast(DestTy)) { VectorType *VTy = dyn_cast(C->getType()); - if (VTy == 0) + if (!VTy) return ConstantExpr::getBitCast(C, DestTy); unsigned NumSrcElts = VTy->getNumElements(); @@ -73,7 +73,7 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, } ConstantDataVector *CDV = dyn_cast(C); - if (CDV == 0) + if (!CDV) return ConstantExpr::getBitCast(C, DestTy); // Now that we know that the input value is a vector of integers, just shift @@ -93,7 +93,7 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, // The code below only handles casts to vectors currently. VectorType *DestVTy = dyn_cast(DestTy); - if (DestVTy == 0) + if (!DestVTy) return ConstantExpr::getBitCast(C, DestTy); // If this is a scalar -> vector cast, convert the input into a <1 x scalar> @@ -411,32 +411,32 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C, TD.getTypeAllocSizeInBits(LoadTy), AS); } else - return 0; + return nullptr; C = FoldBitCast(C, MapTy, TD); if (Constant *Res = FoldReinterpretLoadFromConstPtr(C, TD)) return FoldBitCast(Res, LoadTy, TD); - return 0; + return nullptr; } unsigned BytesLoaded = (IntType->getBitWidth() + 7) / 8; if (BytesLoaded > 32 || BytesLoaded == 0) - return 0; + return nullptr; GlobalValue *GVal; APInt Offset; if (!IsConstantOffsetFromGlobal(C, GVal, Offset, TD)) - return 0; + return nullptr; GlobalVariable *GV = dyn_cast(GVal); if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer() || !GV->getInitializer()->getType()->isSized()) - return 0; + return nullptr; // If we're loading off the beginning of the global, some bytes may be valid, // but we don't try to handle this. if (Offset.isNegative()) - return 0; + return nullptr; // If we're not accessing anything in this constant, the result is undefined. if (Offset.getZExtValue() >= @@ -446,7 +446,7 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C, unsigned char RawBytes[32] = {0}; if (!ReadDataFromGlobal(GV->getInitializer(), Offset.getZExtValue(), RawBytes, BytesLoaded, TD)) - return 0; + return nullptr; APInt ResultVal = APInt(IntType->getBitWidth(), 0); if (TD.isLittleEndian()) { @@ -466,6 +466,52 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C, return ConstantInt::get(IntType->getContext(), ResultVal); } +static Constant *ConstantFoldLoadThroughBitcast(ConstantExpr *CE, + const DataLayout *DL) { + if (!DL) + return nullptr; + auto *DestPtrTy = dyn_cast(CE->getType()); + if (!DestPtrTy) + return nullptr; + Type *DestTy = DestPtrTy->getElementType(); + + Constant *C = ConstantFoldLoadFromConstPtr(CE->getOperand(0), DL); + if (!C) + return nullptr; + + do { + Type *SrcTy = C->getType(); + + // If the type sizes are the same and a cast is legal, just directly + // cast the constant. + if (DL->getTypeSizeInBits(DestTy) == DL->getTypeSizeInBits(SrcTy)) { + Instruction::CastOps Cast = Instruction::BitCast; + // If we are going from a pointer to int or vice versa, we spell the cast + // differently. + if (SrcTy->isIntegerTy() && DestTy->isPointerTy()) + Cast = Instruction::IntToPtr; + else if (SrcTy->isPointerTy() && DestTy->isIntegerTy()) + Cast = Instruction::PtrToInt; + + if (CastInst::castIsValid(Cast, C, DestTy)) + return ConstantExpr::getCast(Cast, C, DestTy); + } + + // If this isn't an aggregate type, there is nothing we can do to drill down + // and find a bitcastable constant. 
+    if (!SrcTy->isAggregateType())
+      return nullptr;
+
+    // We're simulating a load through a pointer that was bitcast to point to
+    // a different type, so we can try to walk down through the initial
+    // elements of an aggregate to see if some part of the aggregate is
+    // castable to implement the "load" semantic model.
+    C = C->getAggregateElement(0u);
+  } while (C);
+
+  return nullptr;
+}
+
 /// ConstantFoldLoadFromConstPtr - Return the value that a load from C would
 /// produce if it is constant and determinable. If this is not determinable,
 /// return null.
@@ -479,7 +525,7 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
   // If the loaded value isn't a constant expr, we can't handle it.
   ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
   if (!CE)
-    return 0;
+    return nullptr;
 
   if (CE->getOpcode() == Instruction::GetElementPtr) {
     if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0))) {
@@ -491,6 +537,10 @@
     }
   }
 
+  if (CE->getOpcode() == Instruction::BitCast)
+    if (Constant *LoadedC = ConstantFoldLoadThroughBitcast(CE, TD))
+      return LoadedC;
+
   // Instead of loading constant c string, use corresponding integer value
   // directly if string length is small enough.
   StringRef Str;
@@ -542,16 +592,16 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
   // Try hard to fold loads from bitcasted strange and non-type-safe things.
   if (TD)
     return FoldReinterpretLoadFromConstPtr(CE, *TD);
-  return 0;
+  return nullptr;
 }
 
 static Constant *ConstantFoldLoadInst(const LoadInst *LI, const DataLayout *TD){
-  if (LI->isVolatile()) return 0;
+  if (LI->isVolatile()) return nullptr;
 
   if (Constant *C = dyn_cast<Constant>(LI->getOperand(0)))
     return ConstantFoldLoadFromConstPtr(C, TD);
 
-  return 0;
+  return nullptr;
 }
 
 /// SymbolicallyEvaluateBinop - One of Op0/Op1 is a constant expression.
@@ -571,8 +621,8 @@ static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
     unsigned BitWidth = DL->getTypeSizeInBits(Op0->getType()->getScalarType());
     APInt KnownZero0(BitWidth, 0), KnownOne0(BitWidth, 0);
     APInt KnownZero1(BitWidth, 0), KnownOne1(BitWidth, 0);
-    ComputeMaskedBits(Op0, KnownZero0, KnownOne0, DL);
-    ComputeMaskedBits(Op1, KnownZero1, KnownOne1, DL);
+    computeKnownBits(Op0, KnownZero0, KnownOne0, DL);
+    computeKnownBits(Op1, KnownZero1, KnownOne1, DL);
     if ((KnownOne1 | KnownZero0).isAllOnesValue()) {
       // All the bits of Op0 that the 'and' could be masking are already zero.
return Op0; @@ -608,7 +658,7 @@ static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0, } } - return 0; + return nullptr; } /// CastGEPIndices - If array indices are not pointer-sized integers, @@ -618,7 +668,7 @@ static Constant *CastGEPIndices(ArrayRef Ops, Type *ResultTy, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TD) - return 0; + return nullptr; Type *IntPtrTy = TD->getIntPtrType(ResultTy); @@ -641,7 +691,7 @@ static Constant *CastGEPIndices(ArrayRef Ops, } if (!Any) - return 0; + return nullptr; Constant *C = ConstantExpr::getGetElementPtr(Ops[0], NewIdxs); if (ConstantExpr *CE = dyn_cast(C)) { @@ -676,7 +726,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef Ops, Constant *Ptr = Ops[0]; if (!TD || !Ptr->getType()->getPointerElementType()->isSized() || !Ptr->getType()->isPointerTy()) - return 0; + return nullptr; Type *IntPtrTy = TD->getIntPtrType(Ptr->getType()); Type *ResultElementTy = ResultTy->getPointerElementType(); @@ -690,7 +740,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef Ops, // "inttoptr (sub (ptrtoint Ptr), V)" if (Ops.size() == 2 && ResultElementTy->isIntegerTy(8)) { ConstantExpr *CE = dyn_cast(Ops[1]); - assert((CE == 0 || CE->getType() == IntPtrTy) && + assert((!CE || CE->getType() == IntPtrTy) && "CastGEPIndices didn't canonicalize index types!"); if (CE && CE->getOpcode() == Instruction::Sub && CE->getOperand(0)->isNullValue()) { @@ -702,7 +752,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef Ops, return Res; } } - return 0; + return nullptr; } unsigned BitWidth = TD->getTypeSizeInBits(IntPtrTy); @@ -765,7 +815,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef Ops, // Only handle pointers to sized types, not pointers to functions. if (!ATy->getElementType()->isSized()) - return 0; + return nullptr; } // Determine which element of the array the offset points into. @@ -810,7 +860,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef Ops, // type, then the offset is pointing into the middle of an indivisible // member, so we can't simplify it. if (Offset != 0) - return 0; + return nullptr; // Create a GEP. Constant *C = ConstantExpr::getGetElementPtr(Ptr, NewIdxs); @@ -841,7 +891,7 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, const TargetLibraryInfo *TLI) { // Handle PHI nodes quickly here... if (PHINode *PN = dyn_cast(I)) { - Constant *CommonValue = 0; + Constant *CommonValue = nullptr; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *Incoming = PN->getIncomingValue(i); @@ -854,14 +904,14 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, // If the incoming value is not a constant, then give up. Constant *C = dyn_cast(Incoming); if (!C) - return 0; + return nullptr; // Fold the PHI's operands. if (ConstantExpr *NewC = dyn_cast(C)) C = ConstantFoldConstantExpression(NewC, TD, TLI); // If the incoming value is a different constant to // the one we saw previously, then give up. if (CommonValue && C != CommonValue) - return 0; + return nullptr; CommonValue = C; } @@ -876,7 +926,7 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) { Constant *Op = dyn_cast(*i); if (!Op) - return 0; // All operands not constant! + return nullptr; // All operands not constant! // Fold the Instruction's operands. 
if (ConstantExpr *NewCE = dyn_cast(Op)) @@ -966,14 +1016,14 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy, } switch (Opcode) { - default: return 0; + default: return nullptr; case Instruction::ICmp: case Instruction::FCmp: llvm_unreachable("Invalid for compares"); case Instruction::Call: if (Function *F = dyn_cast(Ops.back())) if (canConstantFoldCallTo(F)) return ConstantFoldCall(F, Ops.slice(0, Ops.size() - 1), TLI); - return 0; + return nullptr; case Instruction::PtrToInt: // If the input is a inttoptr, eliminate the pair. This requires knowing // the width of a pointer, so it can't be done in ConstantExpr::getCast. @@ -1142,14 +1192,14 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate, Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C, ConstantExpr *CE) { if (!CE->getOperand(1)->isNullValue()) - return 0; // Do not allow stepping over the value! + return nullptr; // Do not allow stepping over the value! // Loop over all of the operands, tracking down which value we are // addressing. for (unsigned i = 2, e = CE->getNumOperands(); i != e; ++i) { C = C->getAggregateElement(CE->getOperand(i)); - if (C == 0) - return 0; + if (!C) + return nullptr; } return C; } @@ -1164,8 +1214,8 @@ Constant *llvm::ConstantFoldLoadThroughGEPIndices(Constant *C, // addressing. for (unsigned i = 0, e = Indices.size(); i != e; ++i) { C = C->getAggregateElement(Indices[i]); - if (C == 0) - return 0; + if (!C) + return nullptr; } return C; } @@ -1270,7 +1320,7 @@ static Constant *ConstantFoldFP(double (*NativeFP)(double), double V, V = NativeFP(V); if (sys::llvm_fenv_testexcept()) { sys::llvm_fenv_clearexcept(); - return 0; + return nullptr; } return GetConstantFoldFPValue(V, Ty); @@ -1282,7 +1332,7 @@ static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), V = NativeFP(V, W); if (sys::llvm_fenv_testexcept()) { sys::llvm_fenv_clearexcept(); - return 0; + return nullptr; } return GetConstantFoldFPValue(V, Ty); @@ -1311,7 +1361,7 @@ static Constant *ConstantFoldConvertToInt(const APFloat &Val, /*isSigned=*/true, mode, &isExact); if (status != APFloat::opOK && status != APFloat::opInexact) - return 0; + return nullptr; return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true); } @@ -1345,7 +1395,7 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, } if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy()) - return 0; + return nullptr; if (IntrinsicID == Intrinsic::round) { APFloat V = Op->getValueAPF(); @@ -1357,7 +1407,7 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, /// likely to be aborted with an exception anyway, and some host libms /// have known errors raising exceptions. if (Op->getValueAPF().isNaN() || Op->getValueAPF().isInfinity()) - return 0; + return nullptr; /// Currently APFloat versions of these functions do not exist, so we use /// the host native double versions. 
Float versions are not called @@ -1396,7 +1446,7 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, } if (!TLI) - return 0; + return nullptr; switch (Name[0]) { case 'a': @@ -1467,7 +1517,7 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, default: break; } - return 0; + return nullptr; } if (ConstantInt *Op = dyn_cast(Operands[0])) { @@ -1491,7 +1541,7 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, return ConstantFP::get(Ty->getContext(), Val); } default: - return 0; + return nullptr; } } @@ -1523,21 +1573,21 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, if (isa(Operands[0])) { if (IntrinsicID == Intrinsic::bswap) return Operands[0]; - return 0; + return nullptr; } - return 0; + return nullptr; } if (Operands.size() == 2) { if (ConstantFP *Op1 = dyn_cast(Operands[0])) { if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy()) - return 0; + return nullptr; double Op1V = getValueAsDouble(Op1); if (ConstantFP *Op2 = dyn_cast(Operands[1])) { if (Op2->getType() != Op1->getType()) - return 0; + return nullptr; double Op2V = getValueAsDouble(Op2); if (IntrinsicID == Intrinsic::pow) { @@ -1550,7 +1600,7 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, return ConstantFP::get(Ty->getContext(), V1); } if (!TLI) - return 0; + return nullptr; if (Name == "pow" && TLI->has(LibFunc::pow)) return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty); if (Name == "fmod" && TLI->has(LibFunc::fmod)) @@ -1571,7 +1621,7 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, APFloat((double)std::pow((double)Op1V, (int)Op2C->getZExtValue()))); } - return 0; + return nullptr; } if (ConstantInt *Op1 = dyn_cast(Operands[0])) { @@ -1624,13 +1674,13 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, } } - return 0; + return nullptr; } - return 0; + return nullptr; } if (Operands.size() != 3) - return 0; + return nullptr; if (const ConstantFP *Op1 = dyn_cast(Operands[0])) { if (const ConstantFP *Op2 = dyn_cast(Operands[1])) { @@ -1646,14 +1696,14 @@ static Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, if (s != APFloat::opInvalidOp) return ConstantFP::get(Ty->getContext(), V); - return 0; + return nullptr; } } } } } - return 0; + return nullptr; } static Constant *ConstantFoldVectorCall(StringRef Name, unsigned IntrinsicID, @@ -1690,7 +1740,7 @@ Constant * llvm::ConstantFoldCall(Function *F, ArrayRef Operands, const TargetLibraryInfo *TLI) { if (!F->hasName()) - return 0; + return nullptr; StringRef Name = F->getName(); Type *Ty = F->getReturnType(); diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp index b49211d..780b1aa 100644 --- a/lib/Analysis/CostModel.cpp +++ b/lib/Analysis/CostModel.cpp @@ -17,8 +17,6 @@ // //===----------------------------------------------------------------------===// -#define CM_NAME "cost-model" -#define DEBUG_TYPE CM_NAME #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -32,6 +30,9 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define CM_NAME "cost-model" +#define DEBUG_TYPE CM_NAME + static cl::opt EnableReduxCost("costmodel-reduxcost", cl::init(false), cl::Hidden, cl::desc("Recognize reduction patterns.")); @@ -41,7 +42,7 @@ namespace { public: static char ID; // Class identification, replacement for typeinfo - CostModelAnalysis() : FunctionPass(ID), 
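The ConstantFoldFP/ConstantFoldBinaryFP hunks above keep the same folding protocol around the host libm: clear the floating-point exception state, evaluate natively, and refuse to fold if an exception was raised that the compiled program could otherwise observe at run time. A self-contained sketch of that protocol using standard <cfenv>; LLVM's sys::llvm_fenv_* helpers additionally ignore FE_INEXACT and consult errno, so this is a labeled simplification:

    #include <cfenv>
    #include <cmath>
    #include <cstdio>

    // Try to fold NativeFP(V) at compile time; fail if the host raised an
    // observable FP exception while computing it.
    static bool foldHostFP(double (*NativeFP)(double), double V, double &Out) {
      std::feclearexcept(FE_ALL_EXCEPT);
      Out = NativeFP(V);
      if (std::fetestexcept(FE_INVALID | FE_DIVBYZERO)) {
        std::feclearexcept(FE_ALL_EXCEPT);
        return false; // leave the call to run time
      }
      return true;
    }

    int main() {
      double R;
      // Folds on common libms.
      std::printf("%d\n", foldHostFP([](double X) { return std::log(X); }, 2.0, R));
      // log(-1.0) typically raises FE_INVALID, so the fold is refused.
      std::printf("%d\n", foldHostFP([](double X) { return std::log(X); }, -1.0, R));
      return 0;
    }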
F(0), TTI(0) { + CostModelAnalysis() : FunctionPass(ID), F(nullptr), TTI(nullptr) { initializeCostModelAnalysisPass( *PassRegistry::getPassRegistry()); } @@ -101,24 +102,13 @@ static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) { // Check for a splat of a constant or for a non uniform vector of constants. if (isa(V) || isa(V)) { OpInfo = TargetTransformInfo::OK_NonUniformConstantValue; - if (cast(V)->getSplatValue() != NULL) + if (cast(V)->getSplatValue() != nullptr) OpInfo = TargetTransformInfo::OK_UniformConstantValue; } return OpInfo; } -static bool matchMask(SmallVectorImpl &M1, SmallVectorImpl &M2) { - if (M1.size() != M2.size()) - return false; - - for (unsigned i = 0, e = M1.size(); i != e; ++i) - if (M1[i] != M2[i]) - return false; - - return true; -} - static bool matchPairwiseShuffleMask(ShuffleVectorInst *SI, bool IsLeft, unsigned Level) { // We don't need a shuffle if we just want to have element 0 in position 0 of @@ -136,7 +126,7 @@ static bool matchPairwiseShuffleMask(ShuffleVectorInst *SI, bool IsLeft, Mask[i] = val; SmallVector ActualMask = SI->getShuffleMask(); - if (!matchMask(Mask, ActualMask)) + if (Mask != ActualMask) return false; return true; @@ -150,7 +140,7 @@ static bool matchPairwiseReductionAtLevel(const BinaryOperator *BinOp, // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, // <4 x i32> // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 - if (BinOp == 0) + if (BinOp == nullptr) return false; assert(BinOp->getType()->isVectorTy() && "Expecting a vector type"); @@ -171,9 +161,9 @@ static bool matchPairwiseReductionAtLevel(const BinaryOperator *BinOp, return false; // Shuffle inputs must match. - Value *NextLevelOpL = LS ? LS->getOperand(0) : 0; - Value *NextLevelOpR = RS ? RS->getOperand(0) : 0; - Value *NextLevelOp = 0; + Value *NextLevelOpL = LS ? LS->getOperand(0) : nullptr; + Value *NextLevelOpR = RS ? RS->getOperand(0) : nullptr; + Value *NextLevelOp = nullptr; if (NextLevelOpR && NextLevelOpL) { // If we have two shuffles their operands must match. if (NextLevelOpL != NextLevelOpR) @@ -198,7 +188,7 @@ static bool matchPairwiseReductionAtLevel(const BinaryOperator *BinOp, // Check that the next levels binary operation exists and matches with the // current one. - BinaryOperator *NextLevelBinOp = 0; + BinaryOperator *NextLevelBinOp = nullptr; if (Level + 1 != NumLevels) { if (!(NextLevelBinOp = dyn_cast(NextLevelOp))) return false; @@ -277,7 +267,7 @@ getShuffleAndOtherOprd(BinaryOperator *B) { Value *L = B->getOperand(0); Value *R = B->getOperand(1); - ShuffleVectorInst *S = 0; + ShuffleVectorInst *S = nullptr; if ((S = dyn_cast(L))) return std::make_pair(R, S); @@ -337,7 +327,7 @@ static bool matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot, std::tie(NextRdxOp, Shuffle) = getShuffleAndOtherOprd(BinOp); // Check the current reduction operation and the shuffle use the same value. 
- if (Shuffle == 0) + if (Shuffle == nullptr) return false; if (Shuffle->getOperand(0) != NextRdxOp) return false; @@ -349,7 +339,7 @@ static bool matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot, std::fill(&ShuffleMask[MaskStart], ShuffleMask.end(), -1); SmallVector Mask = Shuffle->getShuffleMask(); - if (!matchMask(ShuffleMask, Mask)) + if (ShuffleMask != Mask) return false; RdxOp = NextRdxOp; @@ -478,7 +468,7 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { if (NumVecElems == Mask.size() && isReverseVectorMask(Mask)) return TTI->getShuffleCost(TargetTransformInfo::SK_Reverse, VecTypOp0, 0, - 0); + nullptr); return -1; } case Instruction::Call: diff --git a/lib/Analysis/Delinearization.cpp b/lib/Analysis/Delinearization.cpp index fd4a2f0..9334ceb 100644 --- a/lib/Analysis/Delinearization.cpp +++ b/lib/Analysis/Delinearization.cpp @@ -14,8 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DL_NAME "delinearize" -#define DEBUG_TYPE DL_NAME #include "llvm/IR/Constants.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/Passes.h" @@ -34,6 +32,9 @@ using namespace llvm; +#define DL_NAME "delinearize" +#define DEBUG_TYPE DL_NAME + namespace { class Delinearization : public FunctionPass { @@ -51,7 +52,7 @@ public: } bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override; - void print(raw_ostream &O, const Module *M = 0) const override; + void print(raw_ostream &O, const Module *M = nullptr) const override; }; } // end anonymous namespace @@ -76,7 +77,7 @@ static Value *getPointerOperand(Instruction &Inst) { return Store->getPointerOperand(); else if (GetElementPtrInst *Gep = dyn_cast(&Inst)) return Gep->getPointerOperand(); - return NULL; + return nullptr; } void Delinearization::print(raw_ostream &O, const Module *) const { @@ -92,25 +93,38 @@ void Delinearization::print(raw_ostream &O, const Module *) const { const BasicBlock *BB = Inst->getParent(); // Delinearize the memory access as analyzed in all the surrounding loops. // Do not analyze memory accesses outside loops. - for (Loop *L = LI->getLoopFor(BB); L != NULL; L = L->getParentLoop()) { + for (Loop *L = LI->getLoopFor(BB); L != nullptr; L = L->getParentLoop()) { const SCEV *AccessFn = SE->getSCEVAtScope(getPointerOperand(*Inst), L); + + const SCEVUnknown *BasePointer = + dyn_cast(SE->getPointerBase(AccessFn)); + // Do not delinearize if we cannot find the base pointer. + if (!BasePointer) + break; + AccessFn = SE->getMinusSCEV(AccessFn, BasePointer); const SCEVAddRecExpr *AR = dyn_cast(AccessFn); // Do not try to delinearize memory accesses that are not AddRecs. 
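The two CostModel hunks above delete the hand-rolled matchMask helper and compare the shuffle masks directly: SmallVector already provides an element-wise operator== with exactly the semantics the helper reimplemented. A minimal sketch, with std::vector standing in so it builds without LLVM headers (its operator== has the same size-then-elements behavior):

    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> Expected = {0, 2, 4, 6};
      std::vector<int> Actual   = {0, 2, 4, 6};
      std::vector<int> Other    = {0, 2, 4, 7};

      // Equal size and equal elements in order: the masks match.
      assert(Expected == Actual);
      // Any differing element (or a size mismatch) compares unequal,
      // which is the check matchMask used to spell out by hand.
      assert(Expected != Other);
      return 0;
    }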
if (!AR) break; + + O << "\n"; + O << "Inst:" << *Inst << "\n"; + O << "In Loop with Header: " << L->getHeader()->getName() << "\n"; O << "AddRec: " << *AR << "\n"; SmallVector Subscripts, Sizes; - const SCEV *Res = AR->delinearize(*SE, Subscripts, Sizes); - int Size = Subscripts.size(); - if (Res == AR || Size == 0) { + AR->delinearize(*SE, Subscripts, Sizes, SE->getElementSize(Inst)); + if (Subscripts.size() == 0 || Sizes.size() == 0 || + Subscripts.size() != Sizes.size()) { O << "failed to delinearize\n"; continue; } - O << "Base offset: " << *Res << "\n"; + + O << "Base offset: " << *BasePointer << "\n"; O << "ArrayDecl[UnknownSize]"; + int Size = Subscripts.size(); for (int i = 0; i < Size - 1; i++) O << "[" << *Sizes[i] << "]"; O << " with elements of " << *Sizes[Size - 1] << " bytes.\n"; diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp index ff98611..d0784f1 100644 --- a/lib/Analysis/DependenceAnalysis.cpp +++ b/lib/Analysis/DependenceAnalysis.cpp @@ -51,8 +51,6 @@ // // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "da" - #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -69,6 +67,8 @@ using namespace llvm; +#define DEBUG_TYPE "da" + //===----------------------------------------------------------------------===// // statistics @@ -234,7 +234,7 @@ FullDependence::FullDependence(Instruction *Source, Levels(CommonLevels), LoopIndependent(PossiblyLoopIndependent) { Consistent = true; - DV = CommonLevels ? new DVEntry[CommonLevels] : NULL; + DV = CommonLevels ? new DVEntry[CommonLevels] : nullptr; } // The rest are simple getters that hide the implementation. @@ -658,7 +658,7 @@ Value *getPointerOperand(Instruction *I) { if (StoreInst *SI = dyn_cast(I)) return SI->getPointerOperand(); llvm_unreachable("Value is not load or store instruction"); - return 0; + return nullptr; } @@ -932,7 +932,7 @@ const SCEV *DependenceAnalysis::collectUpperBound(const Loop *L, const SCEV *UB = SE->getBackedgeTakenCount(L); return SE->getNoopOrZeroExtend(UB, T); } - return NULL; + return nullptr; } @@ -943,7 +943,7 @@ const SCEVConstant *DependenceAnalysis::collectConstantUpperBound(const Loop *L, ) const { if (const SCEV *UB = collectUpperBound(L, T)) return dyn_cast(UB); - return NULL; + return nullptr; } @@ -2194,7 +2194,7 @@ const SCEVConstant *getConstantPart(const SCEVMulExpr *Product) { if (const SCEVConstant *Constant = dyn_cast(Product->getOperand(Op))) return Constant; } - return NULL; + return nullptr; } @@ -2646,8 +2646,8 @@ void DependenceAnalysis::findBoundsALL(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::ALL] = NULL; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::ALL] = NULL; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::ALL] = nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::ALL] = nullptr; // Default value = +infinity. if (Bound[K].Iterations) { Bound[K].Lower[Dependence::DVEntry::ALL] = SE->getMulExpr(SE->getMinusSCEV(A[K].NegPart, B[K].PosPart), @@ -2687,8 +2687,8 @@ void DependenceAnalysis::findBoundsEQ(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::EQ] = NULL; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::EQ] = NULL; // Default value = +infinity. 
+ Bound[K].Lower[Dependence::DVEntry::EQ] = nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::EQ] = nullptr; // Default value = +infinity. if (Bound[K].Iterations) { const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff); const SCEV *NegativePart = getNegativePart(Delta); @@ -2729,8 +2729,8 @@ void DependenceAnalysis::findBoundsLT(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::LT] = NULL; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::LT] = NULL; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::LT] = nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::LT] = nullptr; // Default value = +infinity. if (Bound[K].Iterations) { const SCEV *Iter_1 = SE->getMinusSCEV(Bound[K].Iterations, @@ -2776,8 +2776,8 @@ void DependenceAnalysis::findBoundsGT(CoefficientInfo *A, CoefficientInfo *B, BoundInfo *Bound, unsigned K) const { - Bound[K].Lower[Dependence::DVEntry::GT] = NULL; // Default value = -infinity. - Bound[K].Upper[Dependence::DVEntry::GT] = NULL; // Default value = +infinity. + Bound[K].Lower[Dependence::DVEntry::GT] = nullptr; // Default value = -infinity. + Bound[K].Upper[Dependence::DVEntry::GT] = nullptr; // Default value = +infinity. if (Bound[K].Iterations) { const SCEV *Iter_1 = SE->getMinusSCEV(Bound[K].Iterations, @@ -2829,7 +2829,7 @@ DependenceAnalysis::collectCoeffInfo(const SCEV *Subscript, CI[K].Coeff = Zero; CI[K].PosPart = Zero; CI[K].NegPart = Zero; - CI[K].Iterations = NULL; + CI[K].Iterations = nullptr; } while (const SCEVAddRecExpr *AddRec = dyn_cast(Subscript)) { const Loop *L = AddRec->getLoop(); @@ -2872,7 +2872,7 @@ const SCEV *DependenceAnalysis::getLowerBound(BoundInfo *Bound) const { if (Bound[K].Lower[Bound[K].Direction]) Sum = SE->getAddExpr(Sum, Bound[K].Lower[Bound[K].Direction]); else - Sum = NULL; + Sum = nullptr; } return Sum; } @@ -2888,7 +2888,7 @@ const SCEV *DependenceAnalysis::getUpperBound(BoundInfo *Bound) const { if (Bound[K].Upper[Bound[K].Direction]) Sum = SE->getAddExpr(Sum, Bound[K].Upper[Bound[K].Direction]); else - Sum = NULL; + Sum = nullptr; } return Sum; } @@ -3148,12 +3148,12 @@ void DependenceAnalysis::updateDirection(Dependence::DVEntry &Level, } else if (CurConstraint.isLine()) { Level.Scalar = false; - Level.Distance = NULL; + Level.Distance = nullptr; // direction should be accurate } else if (CurConstraint.isPoint()) { Level.Scalar = false; - Level.Distance = NULL; + Level.Distance = nullptr; unsigned NewDirection = Dependence::DVEntry::NONE; if (!isKnownPredicate(CmpInst::ICMP_NE, CurConstraint.getY(), @@ -3180,59 +3180,55 @@ void DependenceAnalysis::updateDirection(Dependence::DVEntry &Level, /// source and destination array references are recurrences on a nested loop, /// this function flattens the nested recurrences into separate recurrences /// for each loop level. 
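The comment above describes what the rewritten tryDelinearize below recovers: a flattened, single-subscript access function is split back into one recurrence per loop level. A toy illustration with plain integers rather than SCEVs, for the two-dimensional case where the linearized subscript is i*N + j:

    #include <cassert>
    #include <cstdint>

    struct Subscripts { int64_t I, J; };

    // Inverse of "Flat = I*N + J" for 0 <= J < N; discovering N itself is
    // what findArrayDimensions does from the collected parametric terms.
    static Subscripts delinearize2D(int64_t Flat, int64_t N) {
      return {Flat / N, Flat % N};
    }

    int main() {
      const int64_t N = 100;      // inner dimension of A[.][N]
      int64_t Flat = 42 * N + 7;  // linearized subscript of A[42][7]
      Subscripts S = delinearize2D(Flat, N);
      assert(S.I == 42 && S.J == 7);
      return 0;
    }

Recovering {i, j} turns one hard MIV subscript into two SIV subscripts, which is the cheaper test the pass runs afterwards.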
-bool -DependenceAnalysis::tryDelinearize(const SCEV *SrcSCEV, const SCEV *DstSCEV, - SmallVectorImpl &Pair) const { +bool DependenceAnalysis::tryDelinearize(const SCEV *SrcSCEV, + const SCEV *DstSCEV, + SmallVectorImpl &Pair, + const SCEV *ElementSize) const { + const SCEVUnknown *SrcBase = + dyn_cast(SE->getPointerBase(SrcSCEV)); + const SCEVUnknown *DstBase = + dyn_cast(SE->getPointerBase(DstSCEV)); + + if (!SrcBase || !DstBase || SrcBase != DstBase) + return false; + + SrcSCEV = SE->getMinusSCEV(SrcSCEV, SrcBase); + DstSCEV = SE->getMinusSCEV(DstSCEV, DstBase); + const SCEVAddRecExpr *SrcAR = dyn_cast(SrcSCEV); const SCEVAddRecExpr *DstAR = dyn_cast(DstSCEV); if (!SrcAR || !DstAR || !SrcAR->isAffine() || !DstAR->isAffine()) return false; - SmallVector SrcSubscripts, DstSubscripts, SrcSizes, DstSizes; - const SCEV *RemainderS = SrcAR->delinearize(*SE, SrcSubscripts, SrcSizes); - const SCEV *RemainderD = DstAR->delinearize(*SE, DstSubscripts, DstSizes); + // First step: collect parametric terms in both array references. + SmallVector Terms; + SrcAR->collectParametricTerms(*SE, Terms); + DstAR->collectParametricTerms(*SE, Terms); - int size = SrcSubscripts.size(); - // Fail when there is only a subscript: that's a linearized access function. - if (size < 2) - return false; - - int dstSize = DstSubscripts.size(); - // Fail when the number of subscripts in Src and Dst differ. - if (size != dstSize) - return false; + // Second step: find subscript sizes. + SmallVector Sizes; + SE->findArrayDimensions(Terms, Sizes, ElementSize); - // Fail when the size of any of the subscripts in Src and Dst differs: the - // dependence analysis assumes that elements in the same array have same size. - // SCEV delinearization does not have a context based on which it would decide - // globally the size of subscripts that would best fit all the array accesses. - for (int i = 0; i < size; ++i) - if (SrcSizes[i] != DstSizes[i]) - return false; + // Third step: compute the access functions for each subscript. + SmallVector SrcSubscripts, DstSubscripts; + SrcAR->computeAccessFunctions(*SE, SrcSubscripts, Sizes); + DstAR->computeAccessFunctions(*SE, DstSubscripts, Sizes); - // When the difference in remainders is different than a constant it might be - // that the base address of the arrays is not the same. - const SCEV *DiffRemainders = SE->getMinusSCEV(RemainderS, RemainderD); - if (!isa(DiffRemainders)) + // Fail when there is only a subscript: that's a linearized access function. + if (SrcSubscripts.size() < 2 || DstSubscripts.size() < 2 || + SrcSubscripts.size() != DstSubscripts.size()) return false; - // Normalize the last dimension: integrate the size of the "scalar dimension" - // and the remainder of the delinearization. 
- DstSubscripts[size-1] = SE->getMulExpr(DstSubscripts[size-1], - DstSizes[size-1]); - SrcSubscripts[size-1] = SE->getMulExpr(SrcSubscripts[size-1], - SrcSizes[size-1]); - SrcSubscripts[size-1] = SE->getAddExpr(SrcSubscripts[size-1], RemainderS); - DstSubscripts[size-1] = SE->getAddExpr(DstSubscripts[size-1], RemainderD); + int size = SrcSubscripts.size(); -#ifndef NDEBUG - DEBUG(errs() << "\nSrcSubscripts: "); - for (int i = 0; i < size; i++) - DEBUG(errs() << *SrcSubscripts[i]); - DEBUG(errs() << "\nDstSubscripts: "); - for (int i = 0; i < size; i++) - DEBUG(errs() << *DstSubscripts[i]); -#endif + DEBUG({ + dbgs() << "\nSrcSubscripts: "; + for (int i = 0; i < size; i++) + dbgs() << *SrcSubscripts[i]; + dbgs() << "\nDstSubscripts: "; + for (int i = 0; i < size; i++) + dbgs() << *DstSubscripts[i]; + }); // The delinearization transforms a single-subscript MIV dependence test into // a multi-subscript SIV dependence test that is easier to compute. So we @@ -3290,7 +3286,7 @@ Dependence *DependenceAnalysis::depends(Instruction *Src, if ((!Src->mayReadFromMemory() && !Src->mayWriteToMemory()) || (!Dst->mayReadFromMemory() && !Dst->mayWriteToMemory())) // if both instructions don't reference memory, there's no dependence - return NULL; + return nullptr; if (!isLoadOrStore(Src) || !isLoadOrStore(Dst)) { // can only analyze simple loads and stores, i.e., no calls, invokes, etc. @@ -3310,7 +3306,7 @@ Dependence *DependenceAnalysis::depends(Instruction *Src, case AliasAnalysis::NoAlias: // If the objects noalias, they are distinct, accesses are independent. DEBUG(dbgs() << "no alias\n"); - return NULL; + return nullptr; case AliasAnalysis::MustAlias: break; // The underlying objects alias; test accesses for dependence. } @@ -3363,7 +3359,7 @@ Dependence *DependenceAnalysis::depends(Instruction *Src, } if (Delinearize && Pairs == 1 && CommonLevels > 1 && - tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair)) { + tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair, SE->getElementSize(Src))) { DEBUG(dbgs() << " delinerized GEP\n"); Pairs = Pair.size(); } @@ -3505,26 +3501,26 @@ Dependence *DependenceAnalysis::depends(Instruction *Src, case Subscript::ZIV: DEBUG(dbgs() << ", ZIV\n"); if (testZIV(Pair[SI].Src, Pair[SI].Dst, Result)) - return NULL; + return nullptr; break; case Subscript::SIV: { DEBUG(dbgs() << ", SIV\n"); unsigned Level; - const SCEV *SplitIter = NULL; + const SCEV *SplitIter = nullptr; if (testSIV(Pair[SI].Src, Pair[SI].Dst, Level, Result, NewConstraint, SplitIter)) - return NULL; + return nullptr; break; } case Subscript::RDIV: DEBUG(dbgs() << ", RDIV\n"); if (testRDIV(Pair[SI].Src, Pair[SI].Dst, Result)) - return NULL; + return nullptr; break; case Subscript::MIV: DEBUG(dbgs() << ", MIV\n"); if (testMIV(Pair[SI].Src, Pair[SI].Dst, Pair[SI].Loops, Result)) - return NULL; + return nullptr; break; default: llvm_unreachable("subscript has unexpected classification"); @@ -3558,16 +3554,16 @@ Dependence *DependenceAnalysis::depends(Instruction *Src, DEBUG(dbgs() << "testing subscript " << SJ << ", SIV\n"); // SJ is an SIV subscript that's part of the current coupled group unsigned Level; - const SCEV *SplitIter = NULL; + const SCEV *SplitIter = nullptr; DEBUG(dbgs() << "SIV\n"); if (testSIV(Pair[SJ].Src, Pair[SJ].Dst, Level, Result, NewConstraint, SplitIter)) - return NULL; + return nullptr; ConstrainedLevels.set(Level); if (intersectConstraints(&Constraints[Level], &NewConstraint)) { if (Constraints[Level].isEmpty()) { ++DeltaIndependence; - return NULL; + return nullptr; } Changed = true; } @@ 
-3593,7 +3589,7 @@ Dependence *DependenceAnalysis::depends(Instruction *Src, case Subscript::ZIV: DEBUG(dbgs() << "ZIV\n"); if (testZIV(Pair[SJ].Src, Pair[SJ].Dst, Result)) - return NULL; + return nullptr; Mivs.reset(SJ); break; case Subscript::SIV: @@ -3616,7 +3612,7 @@ Dependence *DependenceAnalysis::depends(Instruction *Src, if (Pair[SJ].Classification == Subscript::RDIV) { DEBUG(dbgs() << "RDIV test\n"); if (testRDIV(Pair[SJ].Src, Pair[SJ].Dst, Result)) - return NULL; + return nullptr; // I don't yet understand how to propagate RDIV results Mivs.reset(SJ); } @@ -3629,7 +3625,7 @@ Dependence *DependenceAnalysis::depends(Instruction *Src, if (Pair[SJ].Classification == Subscript::MIV) { DEBUG(dbgs() << "MIV test\n"); if (testMIV(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Result)) - return NULL; + return nullptr; } else llvm_unreachable("expected only MIV subscripts at this point"); @@ -3641,7 +3637,7 @@ Dependence *DependenceAnalysis::depends(Instruction *Src, SJ >= 0; SJ = ConstrainedLevels.find_next(SJ)) { updateDirection(Result.DV[SJ - 1], Constraints[SJ]); if (Result.DV[SJ - 1].Direction == Dependence::DVEntry::NONE) - return NULL; + return nullptr; } } } @@ -3676,11 +3672,11 @@ Dependence *DependenceAnalysis::depends(Instruction *Src, } } if (AllEqual) - return NULL; + return nullptr; } FullDependence *Final = new FullDependence(Result); - Result.DV = NULL; + Result.DV = nullptr; return Final; } @@ -3787,7 +3783,7 @@ const SCEV *DependenceAnalysis::getSplitIteration(const Dependence *Dep, } if (Delinearize && Pairs == 1 && CommonLevels > 1 && - tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair)) { + tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair, SE->getElementSize(Src))) { DEBUG(dbgs() << " delinerized GEP\n"); Pairs = Pair.size(); } @@ -3853,11 +3849,11 @@ const SCEV *DependenceAnalysis::getSplitIteration(const Dependence *Dep, switch (Pair[SI].Classification) { case Subscript::SIV: { unsigned Level; - const SCEV *SplitIter = NULL; + const SCEV *SplitIter = nullptr; (void) testSIV(Pair[SI].Src, Pair[SI].Dst, Level, Result, NewConstraint, SplitIter); if (Level == SplitLevel) { - assert(SplitIter != NULL); + assert(SplitIter != nullptr); return SplitIter; } break; @@ -3892,7 +3888,7 @@ const SCEV *DependenceAnalysis::getSplitIteration(const Dependence *Dep, for (int SJ = Sivs.find_first(); SJ >= 0; SJ = Sivs.find_next(SJ)) { // SJ is an SIV subscript that's part of the current coupled group unsigned Level; - const SCEV *SplitIter = NULL; + const SCEV *SplitIter = nullptr; (void) testSIV(Pair[SJ].Src, Pair[SJ].Dst, Level, Result, NewConstraint, SplitIter); if (Level == SplitLevel && SplitIter) @@ -3933,5 +3929,5 @@ const SCEV *DependenceAnalysis::getSplitIteration(const Dependence *Dep, } } llvm_unreachable("somehow reached end of routine"); - return NULL; + return nullptr; } diff --git a/lib/Analysis/DominanceFrontier.cpp b/lib/Analysis/DominanceFrontier.cpp index f0787f1..74594f8 100644 --- a/lib/Analysis/DominanceFrontier.cpp +++ b/lib/Analysis/DominanceFrontier.cpp @@ -40,12 +40,12 @@ const DominanceFrontier::DomSetType & DominanceFrontier::calculate(const DominatorTree &DT, const DomTreeNode *Node) { BasicBlock *BB = Node->getBlock(); - DomSetType *Result = NULL; + DomSetType *Result = nullptr; std::vector workList; SmallPtrSet visited; - workList.push_back(DFCalculateWorkObject(BB, NULL, Node, NULL)); + workList.push_back(DFCalculateWorkObject(BB, nullptr, Node, nullptr)); do { DFCalculateWorkObject *currentW = &workList.back(); assert (currentW && "Missing work object."); diff 
--git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp index f43675b..caec253 100644 --- a/lib/Analysis/IPA/CallGraph.cpp +++ b/lib/Analysis/IPA/CallGraph.cpp @@ -21,14 +21,14 @@ using namespace llvm; // CallGraph::CallGraph(Module &M) - : M(M), Root(0), ExternalCallingNode(getOrInsertFunction(0)), - CallsExternalNode(new CallGraphNode(0)) { + : M(M), Root(nullptr), ExternalCallingNode(getOrInsertFunction(nullptr)), + CallsExternalNode(new CallGraphNode(nullptr)) { // Add every function to the call graph. for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) addToCallGraph(I); // If we didn't find a main function, use the external call graph node - if (Root == 0) + if (!Root) Root = ExternalCallingNode; } @@ -210,7 +210,7 @@ void CallGraphNode::removeOneAbstractEdgeTo(CallGraphNode *Callee) { for (CalledFunctionsVector::iterator I = CalledFunctions.begin(); ; ++I) { assert(I != CalledFunctions.end() && "Cannot find callee to remove!"); CallRecord &CR = *I; - if (CR.second == Callee && CR.first == 0) { + if (CR.second == Callee && CR.first == nullptr) { Callee->DropRef(); *I = CalledFunctions.back(); CalledFunctions.pop_back(); @@ -267,7 +267,7 @@ INITIALIZE_PASS(CallGraphWrapperPass, "basiccg", "CallGraph Construction", char CallGraphWrapperPass::ID = 0; -void CallGraphWrapperPass::releaseMemory() { G.reset(0); } +void CallGraphWrapperPass::releaseMemory() { G.reset(nullptr); } void CallGraphWrapperPass::print(raw_ostream &OS, const Module *) const { if (!G) { @@ -280,7 +280,7 @@ void CallGraphWrapperPass::print(raw_ostream &OS, const Module *) const { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void CallGraphWrapperPass::dump() const { print(dbgs(), 0); } +void CallGraphWrapperPass::dump() const { print(dbgs(), nullptr); } #endif // Enuse that users of CallGraph.h also link with this file diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp index aafc085..bfab744 100644 --- a/lib/Analysis/IPA/CallGraphSCCPass.cpp +++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp @@ -15,7 +15,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "cgscc-passmgr" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/Statistic.h" @@ -23,12 +22,15 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LegacyPassManagers.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "cgscc-passmgr" + static cl::opt MaxIterations("max-cg-scc-iterations", cl::ReallyHidden, cl::init(4)); @@ -112,7 +114,7 @@ bool CGPassManager::RunPassOnSCC(Pass *P, CallGraphSCC &CurSCC, bool Changed = false; PMDataManager *PM = P->getAsPMDataManager(); - if (PM == 0) { + if (!PM) { CallGraphSCCPass *CGSP = (CallGraphSCCPass*)P; if (!CallGraphUpToDate) { DevirtualizedCall |= RefreshCallGraph(CurSCC, CG, false); @@ -144,8 +146,11 @@ bool CGPassManager::RunPassOnSCC(Pass *P, CallGraphSCC &CurSCC, I != E; ++I) { if (Function *F = (*I)->getFunction()) { dumpPassInfo(P, EXECUTION_MSG, ON_FUNCTION_MSG, F->getName()); - TimeRegion PassTimer(getPassTimer(FPP)); - Changed |= FPP->runOnFunction(*F); + { + TimeRegion PassTimer(getPassTimer(FPP)); + Changed |= FPP->runOnFunction(*F); + } + F->getContext().yield(); } } @@ -190,7 +195,7 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC, 
SCCIdx != E; ++SCCIdx, ++FunctionNo) { CallGraphNode *CGN = *SCCIdx; Function *F = CGN->getFunction(); - if (F == 0 || F->isDeclaration()) continue; + if (!F || F->isDeclaration()) continue; // Walk the function body looking for call sites. Sync up the call sites in // CGN with those actually in the function. @@ -203,7 +208,7 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC, for (CallGraphNode::iterator I = CGN->begin(), E = CGN->end(); I != E; ) { // If this call site is null, then the function pass deleted the call // entirely and the WeakVH nulled it out. - if (I->first == 0 || + if (!I->first || // If we've already seen this call site, then the FunctionPass RAUW'd // one call with another, which resulted in two "uses" in the edge // list of the same call. @@ -217,7 +222,7 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC, "CallGraphSCCPass did not update the CallGraph correctly!"); // If this was an indirect call site, count it. - if (I->second->getFunction() == 0) + if (!I->second->getFunction()) ++NumIndirectRemoved; else ++NumDirectRemoved; @@ -273,7 +278,7 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC, // site could be turned direct), don't reject it in checking mode, and // don't tweak it to be more precise. if (CheckingMode && CS.getCalledFunction() && - ExistingNode->getFunction() == 0) + ExistingNode->getFunction() == nullptr) continue; assert(!CheckingMode && @@ -286,7 +291,7 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC, CalleeNode = CG.getOrInsertFunction(Callee); // Keep track of whether we turned an indirect call into a direct // one. - if (ExistingNode->getFunction() == 0) { + if (!ExistingNode->getFunction()) { DevirtualizedCall = true; DEBUG(dbgs() << " CGSCCPASSMGR: Devirtualized call to '" << Callee->getName() << "'\n"); @@ -434,8 +439,8 @@ bool CGPassManager::runOnModule(Module &M) { while (!CGI.isAtEnd()) { // Copy the current SCC and increment past it so that the pass can hack // on the SCC if it wants to without invalidating our iterator. 
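In the RunPassOnSCC hunk above, the braces added around the TimeRegion make its destructor stop the pass timer before F->getContext().yield() runs, so time spent in a client's yield callback is not billed to the function pass. A minimal sketch of that RAII scoping pattern (std::chrono stand-ins, not LLVM's Timer):

    #include <chrono>
    #include <cstdio>

    struct ScopedTimer {
      std::chrono::steady_clock::time_point Start = std::chrono::steady_clock::now();
      ~ScopedTimer() { // fires at the closing brace of the inner block
        auto NS = std::chrono::duration_cast<std::chrono::nanoseconds>(
                      std::chrono::steady_clock::now() - Start).count();
        std::printf("pass took %lld ns\n", static_cast<long long>(NS));
      }
    };

    static void runPass() { /* the work being timed */ }
    static void yieldToClient() { /* possibly slow client callback */ }

    int main() {
      {
        ScopedTimer T;
        runPass();
      } // timer stopped here...
      yieldToClient(); // ...so this is not attributed to the pass
      return 0;
    }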
- std::vector &NodeVec = *CGI; - CurSCC.initialize(&NodeVec[0], &NodeVec[0]+NodeVec.size()); + const std::vector &NodeVec = *CGI; + CurSCC.initialize(NodeVec.data(), NodeVec.data() + NodeVec.size()); ++CGI; // At the top level, we run all the passes in this pass manager on the diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp index f4097e4..607c068 100644 --- a/lib/Analysis/IPA/GlobalsModRef.cpp +++ b/lib/Analysis/IPA/GlobalsModRef.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "globalsmodref-aa" #include "llvm/Analysis/Passes.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/Statistic.h" @@ -33,6 +32,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "globalsmodref-aa" + STATISTIC(NumNonAddrTakenGlobalVars, "Number of global vars without address taken"); STATISTIC(NumNonAddrTakenFunctions,"Number of functions without address taken"); @@ -177,14 +178,14 @@ namespace { FunctionInfo.find(F); if (I != FunctionInfo.end()) return &I->second; - return 0; + return nullptr; } void AnalyzeGlobals(Module &M); void AnalyzeCallGraph(CallGraph &CG, Module &M); bool AnalyzeUsesOfPointer(Value *V, std::vector &Readers, std::vector &Writers, - GlobalValue *OkayStoreDest = 0); + GlobalValue *OkayStoreDest = nullptr); bool AnalyzeIndirectGlobalMemory(GlobalValue *GV); }; } @@ -358,7 +359,7 @@ void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) { // We do a bottom-up SCC traversal of the call graph. In other words, we // visit all callees before callers (leaf-first). for (scc_iterator I = scc_begin(&CG); !I.isAtEnd(); ++I) { - std::vector &SCC = *I; + const std::vector &SCC = *I; assert(!SCC.empty() && "SCC with no functions?"); if (!SCC[0]->getFunction()) { @@ -410,10 +411,8 @@ void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) { FunctionEffect |= CalleeFR->FunctionEffect; // Incorporate callee's effects on globals into our info. - for (std::map::iterator GI = - CalleeFR->GlobalInfo.begin(), E = CalleeFR->GlobalInfo.end(); - GI != E; ++GI) - FR.GlobalInfo[GI->first] |= GI->second; + for (const auto &G : CalleeFR->GlobalInfo) + FR.GlobalInfo[G.first] |= G.second; FR.MayReadAnyGlobal |= CalleeFR->MayReadAnyGlobal; } else { // Can't say anything about it. However, if it is inside our SCC, @@ -492,8 +491,8 @@ GlobalsModRef::alias(const Location &LocA, if (GV1 || GV2) { // If the global's address is taken, pretend we don't know it's a pointer to // the global. - if (GV1 && !NonAddressTakenGlobals.count(GV1)) GV1 = 0; - if (GV2 && !NonAddressTakenGlobals.count(GV2)) GV2 = 0; + if (GV1 && !NonAddressTakenGlobals.count(GV1)) GV1 = nullptr; + if (GV2 && !NonAddressTakenGlobals.count(GV2)) GV2 = nullptr; // If the two pointers are derived from two different non-addr-taken // globals, or if one is and the other isn't, we know these can't alias. @@ -507,7 +506,7 @@ GlobalsModRef::alias(const Location &LocA, // These pointers may be based on the memory owned by an indirect global. If // so, we may be able to handle this. First check to see if the base pointer // is a direct load from an indirect global. 
- GV1 = GV2 = 0; + GV1 = GV2 = nullptr; if (const LoadInst *LI = dyn_cast(UV1)) if (GlobalVariable *GV = dyn_cast(LI->getOperand(0))) if (IndirectGlobals.count(GV)) diff --git a/lib/Analysis/IPA/InlineCost.cpp b/lib/Analysis/IPA/InlineCost.cpp index 8dafc1c..66f3f8e 100644 --- a/lib/Analysis/IPA/InlineCost.cpp +++ b/lib/Analysis/IPA/InlineCost.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "inline-cost" #include "llvm/Analysis/InlineCost.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -34,6 +33,8 @@ using namespace llvm; +#define DEBUG_TYPE "inline-cost" + STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed"); namespace { @@ -97,9 +98,6 @@ class CallAnalyzer : public InstVisitor { void disableSROA(Value *V); void accumulateSROACost(DenseMap::iterator CostIt, int InstructionCost); - bool handleSROACandidate(bool IsSROAValid, - DenseMap::iterator CostIt, - int InstructionCost); bool isGEPOffsetConstant(GetElementPtrInst &GEP); bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset); bool simplifyCallSite(Function *F, CallSite CS); @@ -225,21 +223,6 @@ void CallAnalyzer::accumulateSROACost(DenseMap::iterator CostIt, SROACostSavings += InstructionCost; } -/// \brief Helper for the common pattern of handling a SROA candidate. -/// Either accumulates the cost savings if the SROA remains valid, or disables -/// SROA for the candidate. -bool CallAnalyzer::handleSROACandidate(bool IsSROAValid, - DenseMap::iterator CostIt, - int InstructionCost) { - if (IsSROAValid) { - accumulateSROACost(CostIt, InstructionCost); - return true; - } - - disableSROA(CostIt); - return false; -} - /// \brief Check whether a GEP's indices are all constant. /// /// Respects any simplified values known during the analysis of this callsite. @@ -287,8 +270,17 @@ bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) { } bool CallAnalyzer::visitAlloca(AllocaInst &I) { - // FIXME: Check whether inlining will turn a dynamic alloca into a static + // Check whether inlining will turn a dynamic alloca into a static // alloca, and handle that case. + if (I.isArrayAllocation()) { + if (Constant *Size = SimplifiedValues.lookup(I.getArraySize())) { + ConstantInt *AllocSize = dyn_cast(Size); + assert(AllocSize && "Allocation size not a constant int?"); + Type *Ty = I.getAllocatedType(); + AllocatedSize += Ty->getPrimitiveSizeInBits() * AllocSize->getZExtValue(); + return Base::visitAlloca(I); + } + } // Accumulate the allocated size. if (I.isStaticAlloca()) { @@ -816,9 +808,29 @@ bool CallAnalyzer::visitBranchInst(BranchInst &BI) { bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { // We model unconditional switches as free, see the comments on handling // branches. - return isa(SI.getCondition()) || - dyn_cast_or_null( - SimplifiedValues.lookup(SI.getCondition())); + if (isa(SI.getCondition())) + return true; + if (Value *V = SimplifiedValues.lookup(SI.getCondition())) + if (isa(V)) + return true; + + // Otherwise, we need to accumulate a cost proportional to the number of + // distinct successor blocks. This fan-out in the CFG cannot be represented + // for free even if we can represent the core switch as a jumptable that + // takes a single instruction. + // + // NB: We convert large switches which are just used to initialize large phi + // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent + // inlining those. 
It will prevent inlining in cases where the optimization + // does not (yet) fire. + SmallPtrSet SuccessorBlocks; + SuccessorBlocks.insert(SI.getDefaultDest()); + for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I) + SuccessorBlocks.insert(I.getCaseSuccessor()); + // Add cost corresponding to the number of distinct destinations. The first + // we model as free because of fallthrough. + Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost; + return false; } bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) { @@ -934,7 +946,7 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) { /// no constant offsets applied. ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { if (!DL || !V->getType()->isPointerTy()) - return 0; + return nullptr; unsigned IntPtrWidth = DL->getPointerSizeInBits(); APInt Offset = APInt::getNullValue(IntPtrWidth); @@ -946,7 +958,7 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { do { if (GEPOperator *GEP = dyn_cast(V)) { if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset)) - return 0; + return nullptr; V = GEP->getPointerOperand(); } else if (Operator::getOpcode(V) == Instruction::BitCast) { V = cast(V)->getOperand(0); @@ -1247,7 +1259,7 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee, // Calls to functions with always-inline attributes should be inlined // whenever possible. - if (Callee->hasFnAttribute(Attribute::AlwaysInline)) { + if (CS.hasFnAttr(Attribute::AlwaysInline)) { if (isInlineViable(*Callee)) return llvm::InlineCost::getAlways(); return llvm::InlineCost::getNever(); diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp index 5317a47..c819bd3 100644 --- a/lib/Analysis/IVUsers.cpp +++ b/lib/Analysis/IVUsers.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "iv-users" #include "llvm/Analysis/IVUsers.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/LoopPass.h" @@ -29,6 +28,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "iv-users" + char IVUsers::ID = 0; INITIALIZE_PASS_BEGIN(IVUsers, "iv-users", "Induction Variable Users", false, true) @@ -84,7 +85,7 @@ static bool isInteresting(const SCEV *S, const Instruction *I, const Loop *L, static bool isSimplifiedLoopNest(BasicBlock *BB, const DominatorTree *DT, const LoopInfo *LI, SmallPtrSet &SimpleLoopNests) { - Loop *NearestLoop = 0; + Loop *NearestLoop = nullptr; for (DomTreeNode *Rung = DT->getNode(BB); Rung; Rung = Rung->getIDom()) { BasicBlock *DomBB = Rung->getBlock(); @@ -253,7 +254,7 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) { DT = &getAnalysis().getDomTree(); SE = &getAnalysis(); DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; // Find all uses of induction variables in this loop, and categorize // them by stride. 
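The new visitSwitchInst logic above charges an unfoldable switch for its fan-out instead of treating it as free: one InstrCost per distinct successor block beyond the first, since the first target is modeled as fallthrough. A standalone sketch of that accounting (names are illustrative; the value 5 is a stand-in for InlineConstants::InstrCost):

    #include <cstdio>
    #include <unordered_set>
    #include <vector>

    static const int InstrCost = 5; // stand-in for InlineConstants::InstrCost

    // BlockIDs: the default destination first, then one entry per case.
    static int switchCost(const std::vector<int> &BlockIDs) {
      std::unordered_set<int> Distinct(BlockIDs.begin(), BlockIDs.end());
      return (static_cast<int>(Distinct.size()) - 1) * InstrCost;
    }

    int main() {
      // Four cases that all branch to the same two blocks: fan-out of two,
      // so only one extra edge is charged.
      std::printf("%d\n", switchCost({0, 1, 1, 0, 1})); // prints 5
      // Default plus four distinct case targets: four edges charged.
      std::printf("%d\n", switchCost({0, 1, 2, 3, 4})); // prints 20
      return 0;
    }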
Start by finding all of the PHI nodes in the header for @@ -329,16 +330,16 @@ static const SCEVAddRecExpr *findAddRecForLoop(const SCEV *S, const Loop *L) { I != E; ++I) if (const SCEVAddRecExpr *AR = findAddRecForLoop(*I, L)) return AR; - return 0; + return nullptr; } - return 0; + return nullptr; } const SCEV *IVUsers::getStride(const IVStrideUse &IU, const Loop *L) const { if (const SCEVAddRecExpr *AR = findAddRecForLoop(getExpr(IU), L)) return AR->getStepRecurrence(*SE); - return 0; + return nullptr; } void IVStrideUse::transformToPostInc(const Loop *L) { diff --git a/lib/Analysis/InstCount.cpp b/lib/Analysis/InstCount.cpp index 3d05556..de2b9c0 100644 --- a/lib/Analysis/InstCount.cpp +++ b/lib/Analysis/InstCount.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "instcount" #include "llvm/Analysis/Passes.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Function.h" @@ -22,6 +21,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "instcount" + STATISTIC(TotalInsts , "Number of instructions (of all types)"); STATISTIC(TotalBlocks, "Number of basic blocks"); STATISTIC(TotalFuncs , "Number of non-external functions"); @@ -47,7 +48,7 @@ namespace { void visitInstruction(Instruction &I) { errs() << "Instruction Count does not know about " << I; - llvm_unreachable(0); + llvm_unreachable(nullptr); } public: static char ID; // Pass identification, replacement for typeid diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index d8d8a09..3684fda 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -17,7 +17,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "instsimplify" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" @@ -35,6 +34,8 @@ using namespace llvm; using namespace llvm::PatternMatch; +#define DEBUG_TYPE "instsimplify" + enum { RecursionLimit = 3 }; STATISTIC(NumExpand, "Number of expansions"); @@ -131,7 +132,7 @@ static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS, Instruction::BinaryOps OpcodeToExpand = (Instruction::BinaryOps)OpcToExpand; // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) - return 0; + return nullptr; // Check whether the expression has the form "(A op' B) op C". if (BinaryOperator *Op0 = dyn_cast(LHS)) @@ -179,7 +180,7 @@ static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS, } } - return 0; + return nullptr; } /// FactorizeBinOp - Simplify "LHS Opcode RHS" by factorizing out a common term @@ -192,14 +193,14 @@ static Value *FactorizeBinOp(unsigned Opcode, Value *LHS, Value *RHS, Instruction::BinaryOps OpcodeToExtract = (Instruction::BinaryOps)OpcToExtract; // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) - return 0; + return nullptr; BinaryOperator *Op0 = dyn_cast(LHS); BinaryOperator *Op1 = dyn_cast(RHS); if (!Op0 || Op0->getOpcode() != OpcodeToExtract || !Op1 || Op1->getOpcode() != OpcodeToExtract) - return 0; + return nullptr; // The expression has the form "(A op' B) op (C op' D)". 
Value *A = Op0->getOperand(0), *B = Op0->getOperand(1); @@ -251,7 +252,7 @@ static Value *FactorizeBinOp(unsigned Opcode, Value *LHS, Value *RHS, } } - return 0; + return nullptr; } /// SimplifyAssociativeBinOp - Generic simplifications for associative binary @@ -263,7 +264,7 @@ static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS, // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) - return 0; + return nullptr; BinaryOperator *Op0 = dyn_cast(LHS); BinaryOperator *Op1 = dyn_cast(RHS); @@ -308,7 +309,7 @@ static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS, // The remaining transforms require commutativity as well as associativity. if (!Instruction::isCommutative(Opcode)) - return 0; + return nullptr; // Transform: "(A op B) op C" ==> "(C op A) op B" if it simplifies completely. if (Op0 && Op0->getOpcode() == Opcode) { @@ -348,7 +349,7 @@ static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS, } } - return 0; + return nullptr; } /// ThreadBinOpOverSelect - In the case of a binary operation with a select @@ -359,7 +360,7 @@ static Value *ThreadBinOpOverSelect(unsigned Opcode, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) - return 0; + return nullptr; SelectInst *SI; if (isa(LHS)) { @@ -420,7 +421,7 @@ static Value *ThreadBinOpOverSelect(unsigned Opcode, Value *LHS, Value *RHS, } } - return 0; + return nullptr; } /// ThreadCmpOverSelect - In the case of a comparison with a select instruction, @@ -432,7 +433,7 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) - return 0; + return nullptr; // Make sure the select is on the LHS. if (!isa(LHS)) { @@ -456,7 +457,7 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, // It didn't simplify. However if "cmp TV, RHS" is equal to the select // condition then we can replace it with 'true'. Otherwise give up. if (!isSameCompare(Cond, Pred, TV, RHS)) - return 0; + return nullptr; TCmp = getTrue(Cond->getType()); } @@ -470,7 +471,7 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, // It didn't simplify. However if "cmp FV, RHS" is equal to the select // condition then we can replace it with 'false'. Otherwise give up. if (!isSameCompare(Cond, Pred, FV, RHS)) - return 0; + return nullptr; FCmp = getFalse(Cond->getType()); } @@ -482,7 +483,7 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, // The remaining cases only make sense if the select condition has the same // type as the result of the comparison, so bail out if this is not so. if (Cond->getType()->isVectorTy() != RHS->getType()->isVectorTy()) - return 0; + return nullptr; // If the false value simplified to false, then the result of the compare // is equal to "Cond && TCmp". This also catches the case when the false // value simplified to false and the true value to true, returning "Cond". 
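Nearly every helper in the InstructionSimplify hunks above opens with the same guard, "if (!MaxRecurse--) return nullptr;", bounding how deep simplification may chase operands (the file's RecursionLimit is 3). A minimal sketch of the idiom, with an int return of -1 standing in for the nullptr failure value:

    #include <cstdio>

    // Illustrative stand-in for a SimplifyXxx helper: succeeds only if the
    // expression bottoms out within the remaining recursion budget.
    static int simplifyDepth(int Depth, unsigned MaxRecurse) {
      // Recursion is always used, so bail out at once on a spent budget.
      if (!MaxRecurse--)
        return -1;
      if (Depth == 0)
        return 0; // trivially simplified
      int Sub = simplifyDepth(Depth - 1, MaxRecurse);
      return Sub < 0 ? -1 : Sub + 1;
    }

    int main() {
      const unsigned RecursionLimit = 3; // the same budget the file uses
      std::printf("%d\n", simplifyDepth(2, RecursionLimit)); // succeeds: 2
      std::printf("%d\n", simplifyDepth(9, RecursionLimit)); // gives up: -1
      return 0;
    }

Passing the decremented budget down keeps the worst case bounded no matter how the helpers call one another.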
@@ -502,7 +503,7 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, Q, MaxRecurse)) return V; - return 0; + return nullptr; } /// ThreadBinOpOverPHI - In the case of a binary operation with an operand that @@ -513,24 +514,24 @@ static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) - return 0; + return nullptr; PHINode *PI; if (isa(LHS)) { PI = cast(LHS); // Bail out if RHS and the phi may be mutually interdependent due to a loop. if (!ValueDominatesPHI(RHS, PI, Q.DT)) - return 0; + return nullptr; } else { assert(isa(RHS) && "No PHI instruction operand!"); PI = cast(RHS); // Bail out if LHS and the phi may be mutually interdependent due to a loop. if (!ValueDominatesPHI(LHS, PI, Q.DT)) - return 0; + return nullptr; } // Evaluate the BinOp on the incoming phi values. - Value *CommonValue = 0; + Value *CommonValue = nullptr; for (unsigned i = 0, e = PI->getNumIncomingValues(); i != e; ++i) { Value *Incoming = PI->getIncomingValue(i); // If the incoming value is the phi node itself, it can safely be skipped. @@ -541,7 +542,7 @@ static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS, // If the operation failed to simplify, or simplified to a different value // to previously, then give up. if (!V || (CommonValue && V != CommonValue)) - return 0; + return nullptr; CommonValue = V; } @@ -556,7 +557,7 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Query &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) - return 0; + return nullptr; // Make sure the phi is on the LHS. if (!isa(LHS)) { @@ -568,10 +569,10 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, // Bail out if RHS and the phi may be mutually interdependent due to a loop. if (!ValueDominatesPHI(RHS, PI, Q.DT)) - return 0; + return nullptr; // Evaluate the BinOp on the incoming phi values. - Value *CommonValue = 0; + Value *CommonValue = nullptr; for (unsigned i = 0, e = PI->getNumIncomingValues(); i != e; ++i) { Value *Incoming = PI->getIncomingValue(i); // If the incoming value is the phi node itself, it can safely be skipped. @@ -580,7 +581,7 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, // If the operation failed to simplify, or simplified to a different value // to previously, then give up. if (!V || (CommonValue && V != CommonValue)) - return 0; + return nullptr; CommonValue = V; } @@ -613,7 +614,7 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, // X + (Y - X) -> Y // (Y - X) + X -> Y // Eg: X + -X -> 0 - Value *Y = 0; + Value *Y = nullptr; if (match(Op1, m_Sub(m_Value(Y), m_Specific(Op0))) || match(Op0, m_Sub(m_Value(Y), m_Specific(Op1)))) return Y; @@ -647,7 +648,7 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, // "A+B" and "A+C" thus gains nothing, but costs compile time. Similarly // for threading over phi nodes. - return 0; + return nullptr; } Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, @@ -720,7 +721,7 @@ static Constant *computePointerDifference(const DataLayout *DL, // If LHS and RHS are not related via constant offsets to the same base // value, there is nothing we can do here. 
if (LHS != RHS) - return 0; + return nullptr; // Otherwise, the difference of LHS - RHS can be computed as: // LHS - RHS @@ -755,14 +756,14 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, // (X*2) - X -> X // (X<<1) - X -> X - Value *X = 0; + Value *X = nullptr; if (match(Op0, m_Mul(m_Specific(Op1), m_ConstantInt<2>())) || match(Op0, m_Shl(m_Specific(Op1), m_One()))) return Op1; // (X + Y) - Z -> X + (Y - Z) or Y + (X - Z) if everything simplifies. // For example, (X + Y) - Y -> X; (Y + X) - Y -> X - Value *Y = 0, *Z = Op1; + Value *Y = nullptr, *Z = Op1; if (MaxRecurse && match(Op0, m_Add(m_Value(X), m_Value(Y)))) { // (X + Y) - Z // See if "V === Y - Z" simplifies. if (Value *V = SimplifyBinOp(Instruction::Sub, Y, Z, Q, MaxRecurse-1)) @@ -853,7 +854,7 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, // "A-B" and "A-C" thus gains nothing, but costs compile time. Similarly // for threading over phi nodes. - return 0; + return nullptr; } Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, @@ -890,7 +891,7 @@ static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, // fadd [nnan ninf] X, (fsub [nnan ninf] 0, X) ==> 0 // where nnan and ninf have to occur at least once somewhere in this // expression - Value *SubOp = 0; + Value *SubOp = nullptr; if (match(Op1, m_FSub(m_AnyZero(), m_Specific(Op0)))) SubOp = Op1; else if (match(Op0, m_FSub(m_AnyZero(), m_Specific(Op1)))) @@ -902,7 +903,7 @@ static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, return Constant::getNullValue(Op0->getType()); } - return 0; + return nullptr; } /// Given operands for an FSub, see if we can fold the result. If not, this @@ -939,7 +940,7 @@ static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, if (FMF.noNaNs() && FMF.noInfs() && Op0 == Op1) return Constant::getNullValue(Op0->getType()); - return 0; + return nullptr; } /// Given the operands for an FMul, see if we can fold the result @@ -966,7 +967,7 @@ static Value *SimplifyFMulInst(Value *Op0, Value *Op1, if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZero())) return Op1; - return 0; + return nullptr; } /// SimplifyMulInst - Given operands for a Mul, see if we can @@ -997,7 +998,7 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q, return Op0; // (X / Y) * Y -> X if the division is exact. - Value *X = 0; + Value *X = nullptr; if (match(Op0, m_Exact(m_IDiv(m_Value(X), m_Specific(Op1)))) || // (X / Y) * Y match(Op1, m_Exact(m_IDiv(m_Value(X), m_Specific(Op0))))) // Y * (X / Y) return X; @@ -1031,7 +1032,7 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q, MaxRecurse)) return V; - return 0; + return nullptr; } Value *llvm::SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, @@ -1098,7 +1099,7 @@ static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, return ConstantInt::get(Op0->getType(), 1); // (X * Y) / Y -> X if the multiplication does not overflow. 
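The comment closing the hunk above names the fold the code below implements: (X * Y) / Y simplifies to X only when the multiply is known not to wrap (no-unsigned-wrap for udiv, no-signed-wrap for sdiv); a wrapped product makes the quotient differ from X. An exhaustive 8-bit check of both halves of that claim:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned X = 0; X < 256; ++X)
        for (unsigned Y = 1; Y < 256; ++Y) { // Y == 0 would be UB in the IR too
          unsigned Wide = X * Y;                        // true product
          uint8_t Narrow = static_cast<uint8_t>(Wide);  // what an i8 mul yields
          if (Wide <= 0xFF)
            assert(Narrow / Y == X); // no wrap: the fold is sound
          // When Wide > 0xFF the i8 product is truncated and Narrow / Y is
          // generally not X, which is why hasNoUnsignedWrap() is required.
        }
      return 0;
    }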
- Value *X = 0, *Y = 0; + Value *X = nullptr, *Y = nullptr; if (match(Op0, m_Mul(m_Value(X), m_Value(Y))) && (X == Op1 || Y == Op1)) { if (Y != Op1) std::swap(X, Y); // Ensure expression is (X * Y) / Y, Y = Op1 OverflowingBinaryOperator *Mul = cast(Op0); @@ -1129,7 +1130,7 @@ static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; - return 0; + return nullptr; } /// SimplifySDivInst - Given operands for an SDiv, see if we can @@ -1139,7 +1140,7 @@ static Value *SimplifySDivInst(Value *Op0, Value *Op1, const Query &Q, if (Value *V = SimplifyDiv(Instruction::SDiv, Op0, Op1, Q, MaxRecurse)) return V; - return 0; + return nullptr; } Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const DataLayout *DL, @@ -1155,7 +1156,7 @@ static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const Query &Q, if (Value *V = SimplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse)) return V; - return 0; + return nullptr; } Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const DataLayout *DL, @@ -1174,7 +1175,7 @@ static Value *SimplifyFDivInst(Value *Op0, Value *Op1, const Query &Q, if (match(Op1, m_Undef())) return Op1; - return 0; + return nullptr; } Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, const DataLayout *DL, @@ -1234,7 +1235,7 @@ static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; - return 0; + return nullptr; } /// SimplifySRemInst - Given operands for an SRem, see if we can @@ -1244,7 +1245,7 @@ static Value *SimplifySRemInst(Value *Op0, Value *Op1, const Query &Q, if (Value *V = SimplifyRem(Instruction::SRem, Op0, Op1, Q, MaxRecurse)) return V; - return 0; + return nullptr; } Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const DataLayout *DL, @@ -1260,7 +1261,7 @@ static Value *SimplifyURemInst(Value *Op0, Value *Op1, const Query &Q, if (Value *V = SimplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse)) return V; - return 0; + return nullptr; } Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const DataLayout *DL, @@ -1279,7 +1280,7 @@ static Value *SimplifyFRemInst(Value *Op0, Value *Op1, const Query &, if (match(Op1, m_Undef())) return Op1; - return 0; + return nullptr; } Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, const DataLayout *DL, @@ -1350,7 +1351,7 @@ static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1, if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; - return 0; + return nullptr; } /// SimplifyShlInst - Given operands for an Shl, see if we can @@ -1368,7 +1369,7 @@ static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, Value *X; if (match(Op0, m_Exact(m_Shr(m_Value(X), m_Specific(Op1))))) return X; - return 0; + return nullptr; } Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, @@ -1399,7 +1400,7 @@ static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, cast(Op0)->hasNoUnsignedWrap()) return X; - return 0; + return nullptr; } Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, @@ -1435,7 +1436,7 @@ static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, cast(Op0)->hasNoSignedWrap()) return X; - return 0; + return nullptr; } Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, @@ -1483,7 +1484,7 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q, return 
Constant::getNullValue(Op0->getType()); // (A | ?) & A = A - Value *A = 0, *B = 0; + Value *A = nullptr, *B = nullptr; if (match(Op0, m_Or(m_Value(A), m_Value(B))) && (A == Op1 || B == Op1)) return Op1; @@ -1536,7 +1537,7 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q, MaxRecurse)) return V; - return 0; + return nullptr; } Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const DataLayout *DL, @@ -1582,7 +1583,7 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q, return Constant::getAllOnesValue(Op0->getType()); // (A & ?) | A = A - Value *A = 0, *B = 0; + Value *A = nullptr, *B = nullptr; if (match(Op0, m_And(m_Value(A), m_Value(B))) && (A == Op1 || B == Op1)) return Op1; @@ -1630,7 +1631,7 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q, if (Value *V = ThreadBinOpOverPHI(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; - return 0; + return nullptr; } Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const DataLayout *DL, @@ -1690,7 +1691,7 @@ static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q, // "A^B" and "A^C" thus gains nothing, but costs compile time. Similarly // for threading over phi nodes. - return 0; + return nullptr; } Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const DataLayout *DL, @@ -1710,17 +1711,17 @@ static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, Value *LHS, Value *RHS) { SelectInst *SI = dyn_cast<SelectInst>(V); if (!SI) - return 0; + return nullptr; CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition()); if (!Cmp) - return 0; + return nullptr; Value *CmpLHS = Cmp->getOperand(0), *CmpRHS = Cmp->getOperand(1); if (Pred == Cmp->getPredicate() && LHS == CmpLHS && RHS == CmpRHS) return Cmp; if (Pred == CmpInst::getSwappedPredicate(Cmp->getPredicate()) && LHS == CmpRHS && RHS == CmpLHS) return Cmp; - return 0; + return nullptr; } // A significant optimization not implemented here is assuming that alloca @@ -1768,7 +1769,7 @@ static Constant *computePointerICmp(const DataLayout *DL, // We can only fold certain predicates on pointer comparisons. switch (Pred) { default: - return 0; + return nullptr; // Equality comparisons are easy to fold. case CmpInst::ICMP_EQ: @@ -1874,7 +1875,7 @@ static Constant *computePointerICmp(const DataLayout *DL, } // Otherwise, fail. - return 0; + return nullptr; } /// SimplifyICmpInst - Given operands for an ICmpInst, see if we can @@ -2000,7 +2001,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // Many binary operators with constant RHS have easy to compute constant // range. Use them to check whether the comparison is a tautology. - uint32_t Width = CI->getBitWidth(); + unsigned Width = CI->getBitWidth(); APInt Lower = APInt(Width, 0); APInt Upper = APInt(Width, 0); ConstantInt *CI2; @@ -2019,6 +2020,10 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, APInt NegOne = APInt::getAllOnesValue(Width); if (!CI2->isZero()) Upper = NegOne.udiv(CI2->getValue()) + 1; + } else if (match(LHS, m_SDiv(m_ConstantInt(CI2), m_Value()))) { + // 'sdiv CI2, x' produces [-|CI2|, |CI2|]. + Upper = CI2->getValue().abs() + 1; + Lower = (-Upper) + 1; } else if (match(LHS, m_SDiv(m_Value(), m_ConstantInt(CI2)))) { // 'sdiv x, CI2' produces [INT_MIN / CI2, INT_MAX / CI2].
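SimplifyAndInst and SimplifyOrInst lean on the Boolean absorption laws called out in the comments ("(A | ?) & A = A" and "(A & ?) | A = A"). A standalone brute-force check over all 8-bit values, just to make the identity concrete:

#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      assert(((A | B) & A) == A); // (A | ?) & A = A
      assert(((A & B) | A) == A); // (A & ?) | A = A
    }
  return 0;
}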
APInt IntMin = APInt::getSignedMinValue(Width); @@ -2033,6 +2038,13 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, APInt NegOne = APInt::getAllOnesValue(Width); if (CI2->getValue().ult(Width)) Upper = NegOne.lshr(CI2->getValue()) + 1; + } else if (match(LHS, m_LShr(m_ConstantInt(CI2), m_Value()))) { + // 'lshr CI2, x' produces [CI2 >> (Width-1), CI2]. + unsigned ShiftAmount = Width - 1; + if (!CI2->isZero() && cast(LHS)->isExact()) + ShiftAmount = CI2->getValue().countTrailingZeros(); + Lower = CI2->getValue().lshr(ShiftAmount); + Upper = CI2->getValue() + 1; } else if (match(LHS, m_AShr(m_Value(), m_ConstantInt(CI2)))) { // 'ashr x, CI2' produces [INT_MIN >> CI2, INT_MAX >> CI2]. APInt IntMin = APInt::getSignedMinValue(Width); @@ -2041,6 +2053,19 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, Lower = IntMin.ashr(CI2->getValue()); Upper = IntMax.ashr(CI2->getValue()) + 1; } + } else if (match(LHS, m_AShr(m_ConstantInt(CI2), m_Value()))) { + unsigned ShiftAmount = Width - 1; + if (!CI2->isZero() && cast(LHS)->isExact()) + ShiftAmount = CI2->getValue().countTrailingZeros(); + if (CI2->isNegative()) { + // 'ashr CI2, x' produces [CI2, CI2 >> (Width-1)] + Lower = CI2->getValue(); + Upper = CI2->getValue().ashr(ShiftAmount) + 1; + } else { + // 'ashr CI2, x' produces [CI2 >> (Width-1), CI2] + Lower = CI2->getValue().ashr(ShiftAmount); + Upper = CI2->getValue() + 1; + } } else if (match(LHS, m_Or(m_Value(), m_ConstantInt(CI2)))) { // 'or x, CI2' produces [CI2, UINT_MAX]. Lower = CI2->getValue(); @@ -2221,7 +2246,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, BinaryOperator *RBO = dyn_cast(RHS); if (MaxRecurse && (LBO || RBO)) { // Analyze the case when either LHS or RHS is an add instruction. - Value *A = 0, *B = 0, *C = 0, *D = 0; + Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; // LHS = A + B (or A and B are null); RHS = C + D (or C and D are null). 
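The new clauses above derive constant ranges for shifts and divides with a constant left-hand side; for example, 'lshr CI2, x' is claimed to stay within [CI2 >> (Width - 1), CI2]. A standalone brute-force verification of that claim at width 8 (the exact-shift refinement via countTrailingZeros only tightens the lower bound further):

#include <cassert>

int main() {
  for (unsigned C = 0; C < 256; ++C) {
    unsigned Lo = C >> 7, Hi = C; // [C >> (Width - 1), C] for Width == 8
    for (unsigned X = 0; X < 8; ++X) {
      unsigned V = C >> X; // lshr C, x
      assert(Lo <= V && V <= Hi);
    }
  }
  return 0;
}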
bool NoLHSWrapProblem = false, NoRHSWrapProblem = false; if (LBO && LBO->getOpcode() == Instruction::Add) { @@ -2279,6 +2304,28 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, } } + // 0 - (zext X) pred C + if (!CmpInst::isUnsigned(Pred) && match(LHS, m_Neg(m_ZExt(m_Value())))) { + if (ConstantInt *RHSC = dyn_cast(RHS)) { + if (RHSC->getValue().isStrictlyPositive()) { + if (Pred == ICmpInst::ICMP_SLT) + return ConstantInt::getTrue(RHSC->getContext()); + if (Pred == ICmpInst::ICMP_SGE) + return ConstantInt::getFalse(RHSC->getContext()); + if (Pred == ICmpInst::ICMP_EQ) + return ConstantInt::getFalse(RHSC->getContext()); + if (Pred == ICmpInst::ICMP_NE) + return ConstantInt::getTrue(RHSC->getContext()); + } + if (RHSC->getValue().isNonNegative()) { + if (Pred == ICmpInst::ICMP_SLE) + return ConstantInt::getTrue(RHSC->getContext()); + if (Pred == ICmpInst::ICMP_SGT) + return ConstantInt::getFalse(RHSC->getContext()); + } + } + } + // icmp pred (urem X, Y), Y if (LBO && match(LBO, m_URem(m_Value(), m_Specific(RHS)))) { bool KnownNonNegative, KnownNegative; @@ -2605,7 +2652,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse)) return V; - return 0; + return nullptr; } Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, @@ -2702,7 +2749,7 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse)) return V; - return 0; + return nullptr; } Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, @@ -2741,7 +2788,7 @@ static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal, if (isa(FalseVal)) // select C, X, undef -> X return TrueVal; - return 0; + return nullptr; } Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, @@ -2786,7 +2833,7 @@ static Value *SimplifyGEPInst(ArrayRef Ops, const Query &Q, unsigned) { // Check to see if this is constant foldable. for (unsigned i = 0, e = Ops.size(); i != e; ++i) if (!isa(Ops[i])) - return 0; + return nullptr; return ConstantExpr::getGetElementPtr(cast(Ops[0]), Ops.slice(1)); } @@ -2823,7 +2870,7 @@ static Value *SimplifyInsertValueInst(Value *Agg, Value *Val, return Agg; } - return 0; + return nullptr; } Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val, @@ -2839,7 +2886,7 @@ Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val, static Value *SimplifyPHINode(PHINode *PN, const Query &Q) { // If all of the PHI's incoming values are the same then replace the PHI node // with the common value. - Value *CommonValue = 0; + Value *CommonValue = nullptr; bool HasUndefInput = false; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *Incoming = PN->getIncomingValue(i); @@ -2851,7 +2898,7 @@ static Value *SimplifyPHINode(PHINode *PN, const Query &Q) { continue; } if (CommonValue && Incoming != CommonValue) - return 0; // Not the same, bail out. + return nullptr; // Not the same, bail out. CommonValue = Incoming; } @@ -2864,7 +2911,7 @@ static Value *SimplifyPHINode(PHINode *PN, const Query &Q) { // instruction, we cannot return X as the result of the PHI node unless it // dominates the PHI block. if (HasUndefInput) - return ValueDominatesPHI(CommonValue, PN, Q.DT) ? CommonValue : 0; + return ValueDominatesPHI(CommonValue, PN, Q.DT) ? 
CommonValue : nullptr; return CommonValue; } @@ -2873,7 +2920,7 @@ static Value *SimplifyTruncInst(Value *Op, Type *Ty, const Query &Q, unsigned) { if (Constant *C = dyn_cast(Op)) return ConstantFoldInstOperands(Instruction::Trunc, Ty, C, Q.DL, Q.TLI); - return 0; + return nullptr; } Value *llvm::SimplifyTruncInst(Value *Op, Type *Ty, const DataLayout *DL, @@ -2945,7 +2992,7 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, if (Value *V = ThreadBinOpOverPHI(Opcode, LHS, RHS, Q, MaxRecurse)) return V; - return 0; + return nullptr; } } @@ -2992,7 +3039,7 @@ static Value *SimplifyIntrinsic(Intrinsic::ID IID, IterTy ArgBegin, IterTy ArgEn const Query &Q, unsigned MaxRecurse) { // Perform idempotent optimizations if (!IsIdempotent(IID)) - return 0; + return nullptr; // Unary Ops if (std::distance(ArgBegin, ArgEnd) == 1) @@ -3000,7 +3047,7 @@ static Value *SimplifyIntrinsic(Intrinsic::ID IID, IterTy ArgBegin, IterTy ArgEn if (II->getIntrinsicID() == IID) return II; - return 0; + return nullptr; } template @@ -3017,7 +3064,7 @@ static Value *SimplifyCall(Value *V, IterTy ArgBegin, IterTy ArgEnd, Function *F = dyn_cast(V); if (!F) - return 0; + return nullptr; if (unsigned IID = F->getIntrinsicID()) if (Value *Ret = @@ -3025,14 +3072,14 @@ static Value *SimplifyCall(Value *V, IterTy ArgBegin, IterTy ArgEnd, return Ret; if (!canConstantFoldCallTo(F)) - return 0; + return nullptr; SmallVector ConstantArgs; ConstantArgs.reserve(ArgEnd - ArgBegin); for (IterTy I = ArgBegin, E = ArgEnd; I != E; ++I) { Constant *C = dyn_cast(*I); if (!C) - return 0; + return nullptr; ConstantArgs.push_back(C); } @@ -3247,7 +3294,7 @@ bool llvm::recursivelySimplifyInstruction(Instruction *I, const DataLayout *DL, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return replaceAndRecursivelySimplifyImpl(I, 0, DL, TLI, DT); + return replaceAndRecursivelySimplifyImpl(I, nullptr, DL, TLI, DT); } bool llvm::replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV, diff --git a/lib/Analysis/IntervalPartition.cpp b/lib/Analysis/IntervalPartition.cpp index 2e259b1..a0583e8 100644 --- a/lib/Analysis/IntervalPartition.cpp +++ b/lib/Analysis/IntervalPartition.cpp @@ -29,7 +29,7 @@ void IntervalPartition::releaseMemory() { delete Intervals[i]; IntervalMap.clear(); Intervals.clear(); - RootInterval = 0; + RootInterval = nullptr; } void IntervalPartition::print(raw_ostream &O, const Module*) const { diff --git a/lib/Analysis/LazyCallGraph.cpp b/lib/Analysis/LazyCallGraph.cpp index ea213f2..e073616 100644 --- a/lib/Analysis/LazyCallGraph.cpp +++ b/lib/Analysis/LazyCallGraph.cpp @@ -8,19 +8,22 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/LazyCallGraph.h" -#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "lcg" + static void findCallees( SmallVectorImpl &Worklist, SmallPtrSetImpl &Visited, SmallVectorImpl> &Callees, - SmallPtrSetImpl &CalleeSet) { + DenseMap &CalleeIndexMap) { while (!Worklist.empty()) { Constant *C = Worklist.pop_back_val(); @@ -35,8 +38,12 @@ static void findCallees( // alias. Then a test of the address of the weak function against the new // strong definition's address would be an effective way to determine the // safety of optimizing a direct call edge. 
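The "0 - (zext X) pred C" fold added above reasons that zero minus a zero-extended value always lies in [-(2^w - 1), 0] in the wider type, so several signed comparisons against a strictly positive constant have a fixed answer. A standalone brute-force check for w = 8 extended to 16 bits:

#include <cassert>
#include <cstdint>

int main() {
  for (int X = 0; X < 256; ++X) {
    int16_t Neg = int16_t(0 - X); // models 0 - (zext i8 X to i16); no wrap
    assert(Neg <= 0);             // so 'slt C' is true and 'sgt C' is false
    assert(Neg >= -255);          // and Neg can never equal a positive C,
                                  // so 'eq C' is false and 'ne C' is true
  }
  return 0;
}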
- if (!F->isDeclaration() && CalleeSet.insert(F)) + if (!F->isDeclaration() && + CalleeIndexMap.insert(std::make_pair(F, Callees.size())).second) { + DEBUG(dbgs() << " Added callable function: " << F->getName() + << "\n"); Callees.push_back(F); + } continue; } @@ -46,7 +53,11 @@ static void findCallees( } } -LazyCallGraph::Node::Node(LazyCallGraph &G, Function &F) : G(G), F(F) { +LazyCallGraph::Node::Node(LazyCallGraph &G, Function &F) + : G(&G), F(F), DFSNumber(0), LowLink(0) { + DEBUG(dbgs() << " Adding functions called by '" << F.getName() + << "' to the graph.\n"); + SmallVector Worklist; SmallPtrSet Visited; // Find all the potential callees in this function. First walk the @@ -61,36 +72,41 @@ LazyCallGraph::Node::Node(LazyCallGraph &G, Function &F) : G(G), F(F) { // We've collected all the constant (and thus potentially function or // function containing) operands to all of the instructions in the function. // Process them (recursively) collecting every function found. - findCallees(Worklist, Visited, Callees, CalleeSet); + findCallees(Worklist, Visited, Callees, CalleeIndexMap); } -LazyCallGraph::Node::Node(LazyCallGraph &G, const Node &OtherN) - : G(G), F(OtherN.F), CalleeSet(OtherN.CalleeSet) { - // Loop over the other node's callees, adding the Function*s to our list - // directly, and recursing to add the Node*s. - Callees.reserve(OtherN.Callees.size()); - for (const auto &OtherCallee : OtherN.Callees) - if (Function *Callee = OtherCallee.dyn_cast()) - Callees.push_back(Callee); - else - Callees.push_back(G.copyInto(*OtherCallee.get())); +void LazyCallGraph::Node::insertEdgeInternal(Function &Callee) { + if (Node *N = G->lookup(Callee)) + return insertEdgeInternal(*N); + + CalleeIndexMap.insert(std::make_pair(&Callee, Callees.size())); + Callees.push_back(&Callee); } -LazyCallGraph::Node::Node(LazyCallGraph &G, Node &&OtherN) - : G(G), F(OtherN.F), Callees(std::move(OtherN.Callees)), - CalleeSet(std::move(OtherN.CalleeSet)) { - // Loop over our Callees. They've been moved from another node, but we need - // to move the Node*s to live under our bump ptr allocator. - for (auto &Callee : Callees) - if (Node *ChildN = Callee.dyn_cast()) - Callee = G.moveInto(std::move(*ChildN)); +void LazyCallGraph::Node::insertEdgeInternal(Node &CalleeN) { + CalleeIndexMap.insert(std::make_pair(&CalleeN.getFunction(), Callees.size())); + Callees.push_back(&CalleeN); } -LazyCallGraph::LazyCallGraph(Module &M) : M(M) { +void LazyCallGraph::Node::removeEdgeInternal(Function &Callee) { + auto IndexMapI = CalleeIndexMap.find(&Callee); + assert(IndexMapI != CalleeIndexMap.end() && + "Callee not in the callee set for this caller?"); + + Callees[IndexMapI->second] = nullptr; + CalleeIndexMap.erase(IndexMapI); +} + +LazyCallGraph::LazyCallGraph(Module &M) : NextDFSNumber(0) { + DEBUG(dbgs() << "Building CG for module: " << M.getModuleIdentifier() + << "\n"); for (Function &F : M) if (!F.isDeclaration() && !F.hasLocalLinkage()) - if (EntryNodeSet.insert(&F)) + if (EntryIndexMap.insert(std::make_pair(&F, EntryNodes.size())).second) { + DEBUG(dbgs() << " Adding '" << F.getName() + << "' to entry set of the graph.\n"); EntryNodes.push_back(&F); + } // Now add entry nodes for functions reachable via initializers to globals. 
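The change from CalleeSet to CalleeIndexMap pairs the edge vector with a map from each callee to its index, so duplicate detection stays O(1), edges keep their first-seen order, and removal can tombstone the vector slot in place (Callees[Index] = nullptr) rather than shifting. A standalone sketch of the same container pattern using standard containers; the names are illustrative:

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct EdgeList {
  std::vector<const std::string *> Edges; // ordered, may contain tombstones
  std::unordered_map<const std::string *, size_t> IndexMap;

  bool insert(const std::string *F) {
    // Mirrors CalleeIndexMap.insert(...).second gating the push_back.
    auto Result = IndexMap.insert({F, Edges.size()});
    if (Result.second)
      Edges.push_back(F);
    return Result.second;
  }

  void remove(const std::string *F) {
    auto It = IndexMap.find(F);
    if (It == IndexMap.end())
      return;
    Edges[It->second] = nullptr; // tombstone, like Callees[I] = nullptr
    IndexMap.erase(It);
  }
};

int main() {
  std::string A = "a", B = "b";
  EdgeList L;
  L.insert(&A);
  L.insert(&B);
  L.insert(&A); // duplicate: rejected by the index map
  L.remove(&A);
  for (const std::string *E : L.Edges)
    std::cout << (E ? *E : std::string("<removed>")) << "\n";
  return 0;
}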
SmallVector Worklist; @@ -100,51 +116,568 @@ LazyCallGraph::LazyCallGraph(Module &M) : M(M) { if (Visited.insert(GV.getInitializer())) Worklist.push_back(GV.getInitializer()); - findCallees(Worklist, Visited, EntryNodes, EntryNodeSet); -} + DEBUG(dbgs() << " Adding functions referenced by global initializers to the " + "entry set.\n"); + findCallees(Worklist, Visited, EntryNodes, EntryIndexMap); -LazyCallGraph::LazyCallGraph(const LazyCallGraph &G) - : M(G.M), EntryNodeSet(G.EntryNodeSet) { - EntryNodes.reserve(G.EntryNodes.size()); - for (const auto &EntryNode : G.EntryNodes) - if (Function *Callee = EntryNode.dyn_cast()) - EntryNodes.push_back(Callee); + for (auto &Entry : EntryNodes) { + assert(!Entry.isNull() && + "We can't have removed edges before we finish the constructor!"); + if (Function *F = Entry.dyn_cast()) + SCCEntryNodes.push_back(F); else - EntryNodes.push_back(copyInto(*EntryNode.get())); + SCCEntryNodes.push_back(&Entry.get()->getFunction()); + } } -// FIXME: This would be crazy simpler if BumpPtrAllocator were movable without -// invalidating any of the allocated memory. We should make that be the case at -// some point and delete this. LazyCallGraph::LazyCallGraph(LazyCallGraph &&G) - : M(G.M), EntryNodes(std::move(G.EntryNodes)), - EntryNodeSet(std::move(G.EntryNodeSet)) { - // Loop over our EntryNodes. They've been moved from another graph, so we - // need to move the Node*s to live under our bump ptr allocator. We can just - // do this in-place. - for (auto &Entry : EntryNodes) - if (Node *EntryN = Entry.dyn_cast()) - Entry = moveInto(std::move(*EntryN)); + : BPA(std::move(G.BPA)), NodeMap(std::move(G.NodeMap)), + EntryNodes(std::move(G.EntryNodes)), + EntryIndexMap(std::move(G.EntryIndexMap)), SCCBPA(std::move(G.SCCBPA)), + SCCMap(std::move(G.SCCMap)), LeafSCCs(std::move(G.LeafSCCs)), + DFSStack(std::move(G.DFSStack)), + SCCEntryNodes(std::move(G.SCCEntryNodes)), + NextDFSNumber(G.NextDFSNumber) { + updateGraphPtrs(); +} + +LazyCallGraph &LazyCallGraph::operator=(LazyCallGraph &&G) { + BPA = std::move(G.BPA); + NodeMap = std::move(G.NodeMap); + EntryNodes = std::move(G.EntryNodes); + EntryIndexMap = std::move(G.EntryIndexMap); + SCCBPA = std::move(G.SCCBPA); + SCCMap = std::move(G.SCCMap); + LeafSCCs = std::move(G.LeafSCCs); + DFSStack = std::move(G.DFSStack); + SCCEntryNodes = std::move(G.SCCEntryNodes); + NextDFSNumber = G.NextDFSNumber; + updateGraphPtrs(); + return *this; } -LazyCallGraph::Node *LazyCallGraph::insertInto(Function &F, Node *&MappedN) { - return new (MappedN = BPA.Allocate()) Node(*this, F); +void LazyCallGraph::SCC::insert(Node &N) { + N.DFSNumber = N.LowLink = -1; + Nodes.push_back(&N); + G->SCCMap[&N] = this; } -LazyCallGraph::Node *LazyCallGraph::copyInto(const Node &OtherN) { - Node *&N = NodeMap[&OtherN.F]; - if (N) - return N; +bool LazyCallGraph::SCC::isDescendantOf(const SCC &C) const { + // Walk up the parents of this SCC and verify that we eventually find C. 
+ SmallVector AncestorWorklist; + AncestorWorklist.push_back(this); + do { + const SCC *AncestorC = AncestorWorklist.pop_back_val(); + if (AncestorC->isChildOf(C)) + return true; + for (const SCC *ParentC : AncestorC->ParentSCCs) + AncestorWorklist.push_back(ParentC); + } while (!AncestorWorklist.empty()); - return new (N = BPA.Allocate()) Node(*this, OtherN); + return false; } -LazyCallGraph::Node *LazyCallGraph::moveInto(Node &&OtherN) { - Node *&N = NodeMap[&OtherN.F]; - if (N) - return N; +void LazyCallGraph::SCC::insertIntraSCCEdge(Node &CallerN, Node &CalleeN) { + // First insert it into the caller. + CallerN.insertEdgeInternal(CalleeN); + + assert(G->SCCMap.lookup(&CallerN) == this && "Caller must be in this SCC."); + assert(G->SCCMap.lookup(&CalleeN) == this && "Callee must be in this SCC."); - return new (N = BPA.Allocate()) Node(*this, std::move(OtherN)); + // Nothing changes about this SCC or any other. +} + +void LazyCallGraph::SCC::insertOutgoingEdge(Node &CallerN, Node &CalleeN) { + // First insert it into the caller. + CallerN.insertEdgeInternal(CalleeN); + + assert(G->SCCMap.lookup(&CallerN) == this && "Caller must be in this SCC."); + + SCC &CalleeC = *G->SCCMap.lookup(&CalleeN); + assert(&CalleeC != this && "Callee must not be in this SCC."); + assert(CalleeC.isDescendantOf(*this) && + "Callee must be a descendant of the Caller."); + + // The only change required is to add this SCC to the parent set of the callee. + CalleeC.ParentSCCs.insert(this); +} + +SmallVector +LazyCallGraph::SCC::insertIncomingEdge(Node &CallerN, Node &CalleeN) { + // First insert it into the caller. + CallerN.insertEdgeInternal(CalleeN); + + assert(G->SCCMap.lookup(&CalleeN) == this && "Callee must be in this SCC."); + + SCC &CallerC = *G->SCCMap.lookup(&CallerN); + assert(&CallerC != this && "Caller must not be in this SCC."); + assert(CallerC.isDescendantOf(*this) && + "Caller must be a descendant of the Callee."); + + // The algorithm we use for merging SCCs based on the cycle introduced here + // is to walk the SCC inverted DAG formed by the parent SCC sets. The inverse + // graph has the same cycle properties as the actual DAG of the SCCs, and + // when forming SCCs lazily by a DFS, the bottom of the graph won't exist in + // many cases which should prune the search space. + // + // FIXME: We can get this pruning behavior even after the incremental SCC + // formation by leaving behind (conservative) DFS numberings in the nodes, + // and pruning the search with them. These would need to be cleverly updated + // during the removal of intra-SCC edges, but could be preserved + // conservatively. + + // The set of SCCs that are connected to the caller, and thus will + // participate in the merged connected component. + SmallPtrSet ConnectedSCCs; + ConnectedSCCs.insert(this); + ConnectedSCCs.insert(&CallerC); + + // We build up a DFS stack of the parents chains. + SmallVector, 8> DFSSCCs; + SmallPtrSet VisitedSCCs; + int ConnectedDepth = -1; + SCC *C = this; + parent_iterator I = parent_begin(), E = parent_end(); + for (;;) { + while (I != E) { + SCC &ParentSCC = *I++; + + // If we have already processed this parent SCC, skip it, and remember + // whether it was connected so we don't have to check the rest of the + // stack. This also handles when we reach a child of the 'this' SCC (the + // callee) which terminates the search. 
+ if (ConnectedSCCs.count(&ParentSCC)) { + ConnectedDepth = std::max(ConnectedDepth, DFSSCCs.size()); + continue; + } + if (VisitedSCCs.count(&ParentSCC)) + continue; + + // We fully explore the depth-first space, adding nodes to the connected + // set only as we pop them off, so "recurse" by rotating to the parent. + DFSSCCs.push_back(std::make_pair(C, I)); + C = &ParentSCC; + I = ParentSCC.parent_begin(); + E = ParentSCC.parent_end(); + } + + // If we've found a connection anywhere below this point on the stack (and + // thus up the parent graph from the caller), the current node needs to be + // added to the connected set now that we've processed all of its parents. + if ((int)DFSSCCs.size() == ConnectedDepth) { + --ConnectedDepth; // We're finished with this connection. + ConnectedSCCs.insert(C); + } else { + // Otherwise remember that its parents don't ever connect. + assert(ConnectedDepth < (int)DFSSCCs.size() && + "Cannot have a connected depth greater than the DFS depth!"); + VisitedSCCs.insert(C); + } + + if (DFSSCCs.empty()) + break; // We've walked all the parents of the caller transitively. + + // Pop off the prior node and position to unwind the depth first recursion. + std::tie(C, I) = DFSSCCs.pop_back_val(); + E = C->parent_end(); + } + + // Now that we have identified all of the SCCs which need to be merged into + // a connected set with the inserted edge, merge all of them into this SCC. + // FIXME: This operation currently creates ordering stability problems + // because we don't use stably ordered containers for the parent SCCs or the + // connected SCCs. + unsigned NewNodeBeginIdx = Nodes.size(); + for (SCC *C : ConnectedSCCs) { + if (C == this) + continue; + for (SCC *ParentC : C->ParentSCCs) + if (!ConnectedSCCs.count(ParentC)) + ParentSCCs.insert(ParentC); + C->ParentSCCs.clear(); + + for (Node *N : *C) { + for (Node &ChildN : *N) { + SCC &ChildC = *G->SCCMap.lookup(&ChildN); + if (&ChildC != C) + ChildC.ParentSCCs.erase(C); + } + G->SCCMap[N] = this; + Nodes.push_back(N); + } + C->Nodes.clear(); + } + for (auto I = Nodes.begin() + NewNodeBeginIdx, E = Nodes.end(); I != E; ++I) + for (Node &ChildN : **I) { + SCC &ChildC = *G->SCCMap.lookup(&ChildN); + if (&ChildC != this) + ChildC.ParentSCCs.insert(this); + } + + // We return the list of SCCs which were merged so that callers can + // invalidate any data they have associated with those SCCs. Note that these + // SCCs are no longer in an interesting state (they are totally empty) but + // the pointers will remain stable for the life of the graph itself. + return SmallVector(ConnectedSCCs.begin(), ConnectedSCCs.end()); +} + +void LazyCallGraph::SCC::removeInterSCCEdge(Node &CallerN, Node &CalleeN) { + // First remove it from the node. 
+ CallerN.removeEdgeInternal(CalleeN.getFunction()); + + assert(G->SCCMap.lookup(&CallerN) == this && + "The caller must be a member of this SCC."); + + SCC &CalleeC = *G->SCCMap.lookup(&CalleeN); + assert(&CalleeC != this && + "This API only supports the removal of inter-SCC edges."); + + assert(std::find(G->LeafSCCs.begin(), G->LeafSCCs.end(), this) == + G->LeafSCCs.end() && + "Cannot have a leaf SCC caller with a different SCC callee."); + + bool HasOtherCallToCalleeC = false; + bool HasOtherCallOutsideSCC = false; + for (Node *N : *this) { + for (Node &OtherCalleeN : *N) { + SCC &OtherCalleeC = *G->SCCMap.lookup(&OtherCalleeN); + if (&OtherCalleeC == &CalleeC) { + HasOtherCallToCalleeC = true; + break; + } + if (&OtherCalleeC != this) + HasOtherCallOutsideSCC = true; + } + if (HasOtherCallToCalleeC) + break; + } + // Because the SCCs form a DAG, deleting such an edge cannot change the set + // of SCCs in the graph. However, it may cut an edge of the SCC DAG, making + // the caller no longer a parent of the callee. Walk the other call edges + // in the caller to tell. + if (!HasOtherCallToCalleeC) { + bool Removed = CalleeC.ParentSCCs.erase(this); + (void)Removed; + assert(Removed && + "Did not find the caller SCC in the callee SCC's parent list!"); + + // It may orphan an SCC if it is the last edge reaching it, but that does + // not violate any invariants of the graph. + if (CalleeC.ParentSCCs.empty()) + DEBUG(dbgs() << "LCG: Update removing " << CallerN.getFunction().getName() + << " -> " << CalleeN.getFunction().getName() + << " edge orphaned the callee's SCC!\n"); + } + + // It may make the Caller SCC a leaf SCC. + if (!HasOtherCallOutsideSCC) + G->LeafSCCs.push_back(this); +} + +void LazyCallGraph::SCC::internalDFS( + SmallVectorImpl<std::pair<Node *, Node::iterator>> &DFSStack, + SmallVectorImpl<Node *> &PendingSCCStack, Node *N, + SmallVectorImpl<SCC *> &ResultSCCs) { + Node::iterator I = N->begin(); + N->LowLink = N->DFSNumber = 1; + int NextDFSNumber = 2; + for (;;) { + assert(N->DFSNumber != 0 && "We should always assign a DFS number " + "before processing a node."); + + // We simulate recursion by popping out of the nested loop and continuing. + Node::iterator E = N->end(); + while (I != E) { + Node &ChildN = *I; + if (SCC *ChildSCC = G->SCCMap.lookup(&ChildN)) { + // Check if we have reached a node in the new (known connected) set of + // this SCC. If so, the entire stack is necessarily in that set and we + // can re-start. + if (ChildSCC == this) { + insert(*N); + while (!PendingSCCStack.empty()) + insert(*PendingSCCStack.pop_back_val()); + while (!DFSStack.empty()) + insert(*DFSStack.pop_back_val().first); + return; + } + + // If this child isn't currently in this SCC, no need to process it. + // However, we do need to remove this SCC from its SCC's parent set. + ChildSCC->ParentSCCs.erase(this); + ++I; + continue; + } + + if (ChildN.DFSNumber == 0) { + // Mark that we should start at this child when next this node is the + // top of the stack. We don't start at the next child to ensure this + // child's lowlink is reflected. + DFSStack.push_back(std::make_pair(N, I)); + + // Continue, resetting to the child node. + ChildN.LowLink = ChildN.DFSNumber = NextDFSNumber++; + N = &ChildN; + I = ChildN.begin(); + E = ChildN.end(); + continue; + } + + // Track the lowest link of the children, if any are still in the stack. + // Any child not on the stack will have a LowLink of -1.
+ assert(ChildN.LowLink != 0 && + "Low-link must not be zero with a non-zero DFS number."); + if (ChildN.LowLink >= 0 && ChildN.LowLink < N->LowLink) + N->LowLink = ChildN.LowLink; + ++I; + } + + if (N->LowLink == N->DFSNumber) { + ResultSCCs.push_back(G->formSCC(N, PendingSCCStack)); + if (DFSStack.empty()) + return; + } else { + // At this point we know that N cannot ever be an SCC root. Its low-link + // is not its dfs-number, and we've processed all of its children. It is + // just sitting here waiting until some node further down the stack gets + // low-link == dfs-number and pops it off as well. Move it to the pending + // stack which is pulled into the next SCC to be formed. + PendingSCCStack.push_back(N); + + assert(!DFSStack.empty() && "We shouldn't have an empty stack!"); + } + + N = DFSStack.back().first; + I = DFSStack.back().second; + DFSStack.pop_back(); + } +} + +SmallVector +LazyCallGraph::SCC::removeIntraSCCEdge(Node &CallerN, + Node &CalleeN) { + // First remove it from the node. + CallerN.removeEdgeInternal(CalleeN.getFunction()); + + // We return a list of the resulting *new* SCCs in postorder. + SmallVector ResultSCCs; + + // Direct recursion doesn't impact the SCC graph at all. + if (&CallerN == &CalleeN) + return ResultSCCs; + + // The worklist is every node in the original SCC. + SmallVector Worklist; + Worklist.swap(Nodes); + for (Node *N : Worklist) { + // The nodes formerly in this SCC are no longer in any SCC. + N->DFSNumber = 0; + N->LowLink = 0; + G->SCCMap.erase(N); + } + assert(Worklist.size() > 1 && "We have to have at least two nodes to have an " + "edge between them that is within the SCC."); + + // The callee can already reach every node in this SCC (by definition). It is + // the only node we know will stay inside this SCC. Everything which + // transitively reaches Callee will also remain in the SCC. To model this we + // incrementally add any chain of nodes which reaches something in the new + // node set to the new node set. This short circuits one side of the Tarjan's + // walk. + insert(CalleeN); + + // We're going to do a full mini-Tarjan's walk using a local stack here. + SmallVector, 4> DFSStack; + SmallVector PendingSCCStack; + do { + Node *N = Worklist.pop_back_val(); + if (N->DFSNumber == 0) + internalDFS(DFSStack, PendingSCCStack, N, ResultSCCs); + + assert(DFSStack.empty() && "Didn't flush the entire DFS stack!"); + assert(PendingSCCStack.empty() && "Didn't flush all pending SCC nodes!"); + } while (!Worklist.empty()); + + // Now we need to reconnect the current SCC to the graph. + bool IsLeafSCC = true; + for (Node *N : Nodes) { + for (Node &ChildN : *N) { + SCC &ChildSCC = *G->SCCMap.lookup(&ChildN); + if (&ChildSCC == this) + continue; + ChildSCC.ParentSCCs.insert(this); + IsLeafSCC = false; + } + } +#ifndef NDEBUG + if (!ResultSCCs.empty()) + assert(!IsLeafSCC && "This SCC cannot be a leaf as we have split out new " + "SCCs by removing this edge."); + if (!std::any_of(G->LeafSCCs.begin(), G->LeafSCCs.end(), + [&](SCC *C) { return C == this; })) + assert(!IsLeafSCC && "This SCC cannot be a leaf as it already had child " + "SCCs before we removed this edge."); +#endif + // If this SCC stopped being a leaf through this edge removal, remove it from + // the leaf SCC list. + if (!IsLeafSCC && !ResultSCCs.empty()) + G->LeafSCCs.erase(std::remove(G->LeafSCCs.begin(), G->LeafSCCs.end(), this), + G->LeafSCCs.end()); + + // Return the new list of SCCs. 
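internalDFS above and getNextSCCInPostOrder below are iterative renditions of Tarjan's SCC algorithm: every node gets a DFSNumber on first visit, LowLink tracks the smallest DFS number reachable while staying on the stack, and a node whose LowLink equals its own DFSNumber roots a new SCC. A compact recursive sketch of the same bookkeeping on a plain adjacency list (standalone, not the LLVM types); note the SCCs pop out in post order, callees before callers, which is the order postorder_sccs exposes:

#include <algorithm>
#include <iostream>
#include <vector>

struct Tarjan {
  const std::vector<std::vector<int>> &G;
  std::vector<int> DFSNumber, LowLink, Stack;
  std::vector<bool> OnStack;
  std::vector<std::vector<int>> SCCs;
  int Next = 1;

  explicit Tarjan(const std::vector<std::vector<int>> &G)
      : G(G), DFSNumber(G.size(), 0), LowLink(G.size(), 0),
        OnStack(G.size(), false) {}

  void visit(int N) {
    DFSNumber[N] = LowLink[N] = Next++;
    Stack.push_back(N);
    OnStack[N] = true;
    for (int C : G[N]) {
      if (DFSNumber[C] == 0)
        visit(C); // tree edge: recurse (the real code uses an explicit stack)
      if (OnStack[C]) // off-stack children already belong to a finished SCC
        LowLink[N] = std::min(LowLink[N], LowLink[C]);
    }
    if (LowLink[N] == DFSNumber[N]) { // N roots an SCC: pop down to N
      SCCs.emplace_back();
      int M;
      do {
        M = Stack.back();
        Stack.pop_back();
        OnStack[M] = false;
        SCCs.back().push_back(M);
      } while (M != N);
    }
  }
};

int main() {
  // 0 -> 1 -> 2 -> 0 is one SCC; 3, reachable from 2, is its own.
  std::vector<std::vector<int>> G = {{1}, {2}, {0, 3}, {}};
  Tarjan T(G);
  for (int N = 0; N < (int)G.size(); ++N)
    if (T.DFSNumber[N] == 0)
      T.visit(N);
  for (const auto &C : T.SCCs) { // prints {3} first, then {2 1 0}
    for (int N : C)
      std::cout << N << ' ';
    std::cout << '\n';
  }
  return 0;
}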
+ return ResultSCCs; +} + +void LazyCallGraph::insertEdge(Node &CallerN, Function &Callee) { + assert(SCCMap.empty() && DFSStack.empty() && + "This method cannot be called after SCCs have been formed!"); + + return CallerN.insertEdgeInternal(Callee); +} + +void LazyCallGraph::removeEdge(Node &CallerN, Function &Callee) { + assert(SCCMap.empty() && DFSStack.empty() && + "This method cannot be called after SCCs have been formed!"); + + return CallerN.removeEdgeInternal(Callee); +} + +LazyCallGraph::Node &LazyCallGraph::insertInto(Function &F, Node *&MappedN) { + return *new (MappedN = BPA.Allocate()) Node(*this, F); +} + +void LazyCallGraph::updateGraphPtrs() { + // Process all nodes updating the graph pointers. + { + SmallVector Worklist; + for (auto &Entry : EntryNodes) + if (Node *EntryN = Entry.dyn_cast<Node *>()) + Worklist.push_back(EntryN); + + while (!Worklist.empty()) { + Node *N = Worklist.pop_back_val(); + N->G = this; + for (auto &Callee : N->Callees) + if (!Callee.isNull()) + if (Node *CalleeN = Callee.dyn_cast<Node *>()) + Worklist.push_back(CalleeN); + } + } + + // Process all SCCs updating the graph pointers. + { + SmallVector Worklist(LeafSCCs.begin(), LeafSCCs.end()); + + while (!Worklist.empty()) { + SCC *C = Worklist.pop_back_val(); + C->G = this; + Worklist.insert(Worklist.end(), C->ParentSCCs.begin(), + C->ParentSCCs.end()); + } + } +} + +LazyCallGraph::SCC *LazyCallGraph::formSCC(Node *RootN, + SmallVectorImpl<Node *> &NodeStack) { + // The tail of the stack is the new SCC. Allocate the SCC and pop the stack + // into it. + SCC *NewSCC = new (SCCBPA.Allocate()) SCC(*this); + + while (!NodeStack.empty() && NodeStack.back()->DFSNumber > RootN->DFSNumber) { + assert(NodeStack.back()->LowLink >= RootN->LowLink && + "We cannot have a low link in an SCC lower than its root on the " + "stack!"); + NewSCC->insert(*NodeStack.pop_back_val()); + } + NewSCC->insert(*RootN); + + // A final pass over all edges in the SCC (this remains linear as we only + // do this once when we build the SCC) to connect it to the parent sets of + // its children. + bool IsLeafSCC = true; + for (Node *SCCN : NewSCC->Nodes) + for (Node &SCCChildN : *SCCN) { + SCC &ChildSCC = *SCCMap.lookup(&SCCChildN); + if (&ChildSCC == NewSCC) + continue; + ChildSCC.ParentSCCs.insert(NewSCC); + IsLeafSCC = false; + } + + // For the SCCs where we find no child SCCs, add them to the leaf list. + if (IsLeafSCC) + LeafSCCs.push_back(NewSCC); + + return NewSCC; +} + +LazyCallGraph::SCC *LazyCallGraph::getNextSCCInPostOrder() { + Node *N; + Node::iterator I; + if (!DFSStack.empty()) { + N = DFSStack.back().first; + I = DFSStack.back().second; + DFSStack.pop_back(); + } else { + // If we've handled all candidate entry nodes to the SCC forest, we're done. + do { + if (SCCEntryNodes.empty()) + return nullptr; + + N = &get(*SCCEntryNodes.pop_back_val()); + } while (N->DFSNumber != 0); + I = N->begin(); + N->LowLink = N->DFSNumber = 1; + NextDFSNumber = 2; + } + + for (;;) { + assert(N->DFSNumber != 0 && "We should always assign a DFS number " + "before placing a node onto the stack."); + + Node::iterator E = N->end(); + while (I != E) { + Node &ChildN = *I; + if (ChildN.DFSNumber == 0) { + // Mark that we should start at this child when next this node is the + // top of the stack. We don't start at the next child to ensure this + // child's lowlink is reflected. + DFSStack.push_back(std::make_pair(N, N->begin())); + + // Recurse onto this node via a tail call.
+ assert(!SCCMap.count(&ChildN) && + "Found a node with 0 DFS number but already in an SCC!"); + ChildN.LowLink = ChildN.DFSNumber = NextDFSNumber++; + N = &ChildN; + I = ChildN.begin(); + E = ChildN.end(); + continue; + } + + // Track the lowest link of the children, if any are still in the stack. + assert(ChildN.LowLink != 0 && + "Low-link must not be zero with a non-zero DFS number."); + if (ChildN.LowLink >= 0 && ChildN.LowLink < N->LowLink) + N->LowLink = ChildN.LowLink; + ++I; + } + + if (N->LowLink == N->DFSNumber) + // Form the new SCC out of the top of the DFS stack. + return formSCC(N, PendingSCCStack); + + // At this point we know that N cannot ever be an SCC root. Its low-link + // is not its dfs-number, and we've processed all of its children. It is + // just sitting here waiting until some node further down the stack gets + // low-link == dfs-number and pops it off as well. Move it to the pending + // stack which is pulled into the next SCC to be formed. + PendingSCCStack.push_back(N); + + assert(!DFSStack.empty() && "We never found a viable root!"); + N = DFSStack.back().first; + I = DFSStack.back().second; + DFSStack.pop_back(); + } } char LazyCallGraphAnalysis::PassID; @@ -154,9 +687,9 @@ LazyCallGraphPrinterPass::LazyCallGraphPrinterPass(raw_ostream &OS) : OS(OS) {} static void printNodes(raw_ostream &OS, LazyCallGraph::Node &N, SmallPtrSetImpl &Printed) { // Recurse depth first through the nodes. - for (LazyCallGraph::Node *ChildN : N) - if (Printed.insert(ChildN)) - printNodes(OS, *ChildN, Printed); + for (LazyCallGraph::Node &ChildN : N) + if (Printed.insert(&ChildN)) + printNodes(OS, ChildN, Printed); OS << " Call edges in function: " << N.getFunction().getName() << "\n"; for (LazyCallGraph::iterator I = N.begin(), E = N.end(); I != E; ++I) @@ -165,6 +698,16 @@ static void printNodes(raw_ostream &OS, LazyCallGraph::Node &N, OS << "\n"; } +static void printSCC(raw_ostream &OS, LazyCallGraph::SCC &SCC) { + ptrdiff_t SCCSize = std::distance(SCC.begin(), SCC.end()); + OS << " SCC with " << SCCSize << " functions:\n"; + + for (LazyCallGraph::Node *N : SCC) + OS << " " << N->getFunction().getName() << "\n"; + + OS << "\n"; +} + PreservedAnalyses LazyCallGraphPrinterPass::run(Module *M, ModuleAnalysisManager *AM) { LazyCallGraph &G = AM->getResult(M); @@ -173,9 +716,13 @@ PreservedAnalyses LazyCallGraphPrinterPass::run(Module *M, << "\n\n"; SmallPtrSet Printed; - for (LazyCallGraph::Node *N : G) - if (Printed.insert(N)) - printNodes(OS, *N, Printed); + for (LazyCallGraph::Node &N : G) + if (Printed.insert(&N)) + printNodes(OS, N, Printed); + + for (LazyCallGraph::SCC &SCC : G.postorder_sccs()) + printSCC(OS, SCC); return PreservedAnalyses::all(); + } diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp index 3d6c583..9f919f7 100644 --- a/lib/Analysis/LazyValueInfo.cpp +++ b/lib/Analysis/LazyValueInfo.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "lazy-value-info" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" @@ -34,6 +33,8 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "lazy-value-info" + char LazyValueInfo::ID = 0; INITIALIZE_PASS_BEGIN(LazyValueInfo, "lazy-value-info", "Lazy Value Information Analysis", false, true) @@ -82,7 +83,7 @@ class LVILatticeVal { ConstantRange Range; public: - LVILatticeVal() : Tag(undefined), Val(0), Range(1, true) {} + LVILatticeVal() : Tag(undefined), 
Val(nullptr), Range(1, true) {} static LVILatticeVal get(Constant *C) { LVILatticeVal Res; @@ -516,7 +517,7 @@ bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) { BBLV.markOverdefined(); Instruction *BBI = dyn_cast(Val); - if (BBI == 0 || BBI->getParent() != BB) { + if (!BBI || BBI->getParent() != BB) { return ODCacheUpdater.markResult(solveBlockValueNonLocal(BBLV, Val, BB)); } @@ -595,7 +596,7 @@ bool LazyValueInfoCache::solveBlockValueNonLocal(LVILatticeVal &BBLV, Value *UnderlyingVal = GetUnderlyingObject(Val); // If 'GetUnderlyingObject' didn't converge, skip it. It won't converge // inside InstructionDereferencesPointer either. - if (UnderlyingVal == GetUnderlyingObject(UnderlyingVal, NULL, 1)) { + if (UnderlyingVal == GetUnderlyingObject(UnderlyingVal, nullptr, 1)) { for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) { if (InstructionDereferencesPointer(BI, UnderlyingVal)) { @@ -813,7 +814,7 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, // Recognize the range checking idiom that InstCombine produces. // (X-C1) u< C2 --> [C1, C1+C2) - ConstantInt *NegOffset = 0; + ConstantInt *NegOffset = nullptr; if (ICI->getPredicate() == ICmpInst::ICMP_ULT) match(ICI->getOperand(0), m_Add(m_Specific(Val), m_ConstantInt(NegOffset))); @@ -1014,7 +1015,7 @@ bool LazyValueInfo::runOnFunction(Function &F) { getCache(PImpl).clear(); DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis(); // Fully lazy. @@ -1030,7 +1031,7 @@ void LazyValueInfo::releaseMemory() { // If the cache was allocated, free it. if (PImpl) { delete &getCache(PImpl); - PImpl = 0; + PImpl = nullptr; } } @@ -1044,7 +1045,7 @@ Constant *LazyValueInfo::getConstant(Value *V, BasicBlock *BB) { if (const APInt *SingleVal = CR.getSingleElement()) return ConstantInt::get(V->getContext(), *SingleVal); } - return 0; + return nullptr; } /// getConstantOnEdge - Determine whether the specified value is known to be a @@ -1060,7 +1061,7 @@ Constant *LazyValueInfo::getConstantOnEdge(Value *V, BasicBlock *FromBB, if (const APInt *SingleVal = CR.getSingleElement()) return ConstantInt::get(V->getContext(), *SingleVal); } - return 0; + return nullptr; } /// getPredicateOnEdge - Determine whether the specified value comparison @@ -1072,7 +1073,7 @@ LazyValueInfo::getPredicateOnEdge(unsigned Pred, Value *V, Constant *C, LVILatticeVal Result = getCache(PImpl).getValueOnEdge(V, FromBB, ToBB); // If we know the value is a constant, evaluate the conditional. - Constant *Res = 0; + Constant *Res = nullptr; if (Result.isConstant()) { Res = ConstantFoldCompareInstOperands(Pred, Result.getConstant(), C, DL, TLI); diff --git a/lib/Analysis/LibCallAliasAnalysis.cpp b/lib/Analysis/LibCallAliasAnalysis.cpp index fefa516..016f8c5 100644 --- a/lib/Analysis/LibCallAliasAnalysis.cpp +++ b/lib/Analysis/LibCallAliasAnalysis.cpp @@ -54,7 +54,7 @@ LibCallAliasAnalysis::AnalyzeLibCallDetails(const LibCallFunctionInfo *FI, // if we have detailed info and if 'P' is any of the locations we know // about. 
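getEdgeValueLocal recognizes the range-check idiom InstCombine emits: "(X - C1) u< C2" encodes the two-sided test C1 <= X < C1 + C2 in a single unsigned compare. A standalone brute-force check at 8 bits that the two forms agree (assuming the upper bound H - L is computed without wrapping):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t L = 20, H = 180; // check L <= X < H
  for (unsigned X = 0; X < 256; ++X) {
    bool TwoCompares = X >= L && X < H;
    bool Idiom = uint8_t(X - L) < uint8_t(H - L); // wrapping subtract
    assert(TwoCompares == Idiom);
  }
  return 0;
}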
const LibCallFunctionInfo::LocationMRInfo *Details = FI->LocationDetails; - if (Details == 0) + if (Details == nullptr) return MRInfo; // If the details array is of the 'DoesNot' kind, we only know something if diff --git a/lib/Analysis/LibCallSemantics.cpp b/lib/Analysis/LibCallSemantics.cpp index 0592ccb..7d4e254 100644 --- a/lib/Analysis/LibCallSemantics.cpp +++ b/lib/Analysis/LibCallSemantics.cpp @@ -46,11 +46,11 @@ LibCallInfo::getFunctionInfo(const Function *F) const { /// If this is the first time we are querying for this info, lazily construct /// the StringMap to index it. - if (Map == 0) { + if (!Map) { Impl = Map = new StringMap(); const LibCallFunctionInfo *Array = getFunctionInfoArray(); - if (Array == 0) return 0; + if (!Array) return nullptr; // We now have the array of entries. Populate the StringMap. for (unsigned i = 0; Array[i].Name; ++i) diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp index b2182b1..b14f329 100644 --- a/lib/Analysis/Lint.cpp +++ b/lib/Analysis/Lint.cpp @@ -137,8 +137,8 @@ namespace { // that failed. This provides a nice place to put a breakpoint if you want // to see why something is not correct. void CheckFailed(const Twine &Message, - const Value *V1 = 0, const Value *V2 = 0, - const Value *V3 = 0, const Value *V4 = 0) { + const Value *V1 = nullptr, const Value *V2 = nullptr, + const Value *V3 = nullptr, const Value *V4 = nullptr) { MessagesStr << Message.str() << "\n"; WriteValue(V1); WriteValue(V2); @@ -177,7 +177,7 @@ bool Lint::runOnFunction(Function &F) { AA = &getAnalysis(); DT = &getAnalysis().getDomTree(); DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis(); visit(F); dbgs() << MessagesStr.str(); @@ -199,7 +199,7 @@ void Lint::visitCallSite(CallSite CS) { Value *Callee = CS.getCalledValue(); visitMemoryReference(I, Callee, AliasAnalysis::UnknownSize, - 0, 0, MemRef::Callee); + 0, nullptr, MemRef::Callee); if (Function *F = dyn_cast(findValue(Callee, /*OffsetOk=*/false))) { Assert1(CS.getCallingConv() == F->getCallingConv(), @@ -275,10 +275,10 @@ void Lint::visitCallSite(CallSite CS) { MemCpyInst *MCI = cast(&I); // TODO: If the size is known, use it. visitMemoryReference(I, MCI->getDest(), AliasAnalysis::UnknownSize, - MCI->getAlignment(), 0, + MCI->getAlignment(), nullptr, MemRef::Write); visitMemoryReference(I, MCI->getSource(), AliasAnalysis::UnknownSize, - MCI->getAlignment(), 0, + MCI->getAlignment(), nullptr, MemRef::Read); // Check that the memcpy arguments don't overlap. The AliasAnalysis API @@ -299,10 +299,10 @@ void Lint::visitCallSite(CallSite CS) { MemMoveInst *MMI = cast(&I); // TODO: If the size is known, use it. visitMemoryReference(I, MMI->getDest(), AliasAnalysis::UnknownSize, - MMI->getAlignment(), 0, + MMI->getAlignment(), nullptr, MemRef::Write); visitMemoryReference(I, MMI->getSource(), AliasAnalysis::UnknownSize, - MMI->getAlignment(), 0, + MMI->getAlignment(), nullptr, MemRef::Read); break; } @@ -310,7 +310,7 @@ void Lint::visitCallSite(CallSite CS) { MemSetInst *MSI = cast(&I); // TODO: If the size is known, use it. 
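Hunks like "0, 0, MemRef::Callee" becoming "0, nullptr, MemRef::Callee" show the point of this cleanup: the first 0 is a genuine integer argument and stays, while only the pointer argument becomes nullptr. A minimal standalone sketch of the distinction the compiler sees; the literal 0 deduces as int, nullptr as a pointer:

#include <iostream>

static void report(int) { std::cout << "int overload\n"; }
static void report(const char *) { std::cout << "pointer overload\n"; }

int main() {
  report(0);       // literal 0 is an int: picks the int overload
  report(nullptr); // std::nullptr_t: picks the pointer overload, which is
                   // what a caller passing "no pointer" actually intends
  return 0;
}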
visitMemoryReference(I, MSI->getDest(), AliasAnalysis::UnknownSize, - MSI->getAlignment(), 0, + MSI->getAlignment(), nullptr, MemRef::Write); break; } @@ -321,17 +321,17 @@ void Lint::visitCallSite(CallSite CS) { &I); visitMemoryReference(I, CS.getArgument(0), AliasAnalysis::UnknownSize, - 0, 0, MemRef::Read | MemRef::Write); + 0, nullptr, MemRef::Read | MemRef::Write); break; case Intrinsic::vacopy: visitMemoryReference(I, CS.getArgument(0), AliasAnalysis::UnknownSize, - 0, 0, MemRef::Write); + 0, nullptr, MemRef::Write); visitMemoryReference(I, CS.getArgument(1), AliasAnalysis::UnknownSize, - 0, 0, MemRef::Read); + 0, nullptr, MemRef::Read); break; case Intrinsic::vaend: visitMemoryReference(I, CS.getArgument(0), AliasAnalysis::UnknownSize, - 0, 0, MemRef::Read | MemRef::Write); + 0, nullptr, MemRef::Read | MemRef::Write); break; case Intrinsic::stackrestore: @@ -339,7 +339,7 @@ void Lint::visitCallSite(CallSite CS) { // stack pointer, which the compiler may read from or write to // at any time, so check it for both readability and writeability. visitMemoryReference(I, CS.getArgument(0), AliasAnalysis::UnknownSize, - 0, 0, MemRef::Read | MemRef::Write); + 0, nullptr, MemRef::Read | MemRef::Write); break; } } @@ -513,7 +513,7 @@ static bool isZero(Value *V, const DataLayout *DL) { if (!VecTy) { unsigned BitWidth = V->getType()->getIntegerBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - ComputeMaskedBits(V, KnownZero, KnownOne, DL); + computeKnownBits(V, KnownZero, KnownOne, DL); return KnownZero.isAllOnesValue(); } @@ -534,7 +534,7 @@ static bool isZero(Value *V, const DataLayout *DL) { return true; APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - ComputeMaskedBits(Elem, KnownZero, KnownOne, DL); + computeKnownBits(Elem, KnownZero, KnownOne, DL); if (KnownZero.isAllOnesValue()) return true; } @@ -572,13 +572,13 @@ void Lint::visitAllocaInst(AllocaInst &I) { } void Lint::visitVAArgInst(VAArgInst &I) { - visitMemoryReference(I, I.getOperand(0), AliasAnalysis::UnknownSize, 0, 0, - MemRef::Read | MemRef::Write); + visitMemoryReference(I, I.getOperand(0), AliasAnalysis::UnknownSize, 0, + nullptr, MemRef::Read | MemRef::Write); } void Lint::visitIndirectBrInst(IndirectBrInst &I) { - visitMemoryReference(I, I.getAddress(), AliasAnalysis::UnknownSize, 0, 0, - MemRef::Branchee); + visitMemoryReference(I, I.getAddress(), AliasAnalysis::UnknownSize, 0, + nullptr, MemRef::Branchee); Assert1(I.getNumDestinations() != 0, "Undefined behavior: indirectbr with no destinations", &I); diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp index 0902a39..005d309 100644 --- a/lib/Analysis/Loads.cpp +++ b/lib/Analysis/Loads.cpp @@ -62,7 +62,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom, if (ByteOffset < 0) // out of bounds return false; - Type *BaseType = 0; + Type *BaseType = nullptr; unsigned BaseAlign = 0; if (const AllocaInst *AI = dyn_cast<AllocaInst>(Base)) { // An alloca is safe to load from as long as it is suitably aligned. @@ -161,7 +161,7 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB, ScanFrom++; // Don't scan huge blocks. - if (MaxInstsToScan-- == 0) return 0; + if (MaxInstsToScan-- == 0) return nullptr; --ScanFrom; // If this is a load of Ptr, the loaded value is available. @@ -198,7 +198,7 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB, // Otherwise the store may or may not alias the pointer; bail out.
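Lint's isZero now calls computeKnownBits (the post-rename spelling of ComputeMaskedBits) and treats a value as provably zero exactly when KnownZero is all ones. A miniature standalone model of the known-bits idea, tracking known-zero bits through the simplest transfer function, an AND with a constant mask:

#include <cstdint>
#include <iostream>

struct KnownBits8 {
  uint8_t Zero = 0; // bit set => that bit of the value is known to be 0
};

// Transfer function for "V & Mask": bits cleared by the mask become known
// zero, and previously known-zero bits stay known zero.
static KnownBits8 andWithConstant(KnownBits8 In, uint8_t Mask) {
  In.Zero |= static_cast<uint8_t>(~Mask);
  return In;
}

int main() {
  KnownBits8 K;                 // nothing known about the input value
  K = andWithConstant(K, 0x00); // x & 0: every bit is now known zero
  std::cout << (K.Zero == 0xFF) << "\n"; // 1: the analogue of
                                         // KnownZero.isAllOnesValue()
  return 0;
}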
++ScanFrom; - return 0; + return nullptr; } // If this is some other instruction that may clobber Ptr, bail out. @@ -211,11 +211,11 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB, // May modify the pointer, bail out. ++ScanFrom; - return 0; + return nullptr; } } // Got to the start of the block, we didn't find it, but are done for this // block. - return 0; + return nullptr; } diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index b38672e..46c0eaa 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -141,21 +141,21 @@ bool Loop::makeLoopInvariant(Instruction *I, bool &Changed, PHINode *Loop::getCanonicalInductionVariable() const { BasicBlock *H = getHeader(); - BasicBlock *Incoming = 0, *Backedge = 0; + BasicBlock *Incoming = nullptr, *Backedge = nullptr; pred_iterator PI = pred_begin(H); assert(PI != pred_end(H) && "Loop must have at least one backedge!"); Backedge = *PI++; - if (PI == pred_end(H)) return 0; // dead loop + if (PI == pred_end(H)) return nullptr; // dead loop Incoming = *PI++; - if (PI != pred_end(H)) return 0; // multiple backedges? + if (PI != pred_end(H)) return nullptr; // multiple backedges? if (contains(Incoming)) { if (contains(Backedge)) - return 0; + return nullptr; std::swap(Incoming, Backedge); } else if (!contains(Backedge)) - return 0; + return nullptr; // Loop over all of the PHI nodes, looking for a canonical indvar. for (BasicBlock::iterator I = H->begin(); isa(I); ++I) { @@ -171,7 +171,7 @@ PHINode *Loop::getCanonicalInductionVariable() const { if (CI->equalsInt(1)) return PN; } - return 0; + return nullptr; } /// isLCSSAForm - Return true if the Loop is in LCSSA form @@ -232,7 +232,7 @@ bool Loop::isSafeToClone() const { } MDNode *Loop::getLoopID() const { - MDNode *LoopID = 0; + MDNode *LoopID = nullptr; if (isLoopSimplifyForm()) { LoopID = getLoopLatch()->getTerminator()->getMetadata(LoopMDName); } else { @@ -241,7 +241,7 @@ MDNode *Loop::getLoopID() const { BasicBlock *H = getHeader(); for (block_iterator I = block_begin(), IE = block_end(); I != IE; ++I) { TerminatorInst *TI = (*I)->getTerminator(); - MDNode *MD = 0; + MDNode *MD = nullptr; // Check if this terminator branches to the loop header. for (unsigned i = 0, ie = TI->getNumSuccessors(); i != ie; ++i) { @@ -251,17 +251,17 @@ MDNode *Loop::getLoopID() const { } } if (!MD) - return 0; + return nullptr; if (!LoopID) LoopID = MD; else if (MD != LoopID) - return 0; + return nullptr; } } if (!LoopID || LoopID->getNumOperands() == 0 || LoopID->getOperand(0) != LoopID) - return 0; + return nullptr; return LoopID; } @@ -402,7 +402,7 @@ BasicBlock *Loop::getUniqueExitBlock() const { getUniqueExitBlocks(UniqueExitBlocks); if (UniqueExitBlocks.size() == 1) return UniqueExitBlocks[0]; - return 0; + return nullptr; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -548,7 +548,7 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) { // is considered uninitialized. Loop *NearLoop = BBLoop; - Loop *Subloop = 0; + Loop *Subloop = nullptr; if (NearLoop != Unloop && Unloop->contains(NearLoop)) { Subloop = NearLoop; // Find the subloop ancestor that is directly contained within Unloop. @@ -564,7 +564,7 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) { succ_iterator I = succ_begin(BB), E = succ_end(BB); if (I == E) { assert(!Subloop && "subloop blocks must have a successor"); - NearLoop = 0; // unloop blocks may now exit the function. + NearLoop = nullptr; // unloop blocks may now exit the function. 
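FindAvailableLoadedValue walks backwards from the load under a MaxInstsToScan budget and, after this change, returns nullptr both when the budget runs out and when it reaches a potentially clobbering store. A standalone sketch of that bounded backward scan over toy types (the real code issues alias queries rather than comparing exact addresses):

#include <cstddef>
#include <iostream>
#include <optional>
#include <vector>

struct ToyInst {
  enum { Store, Other } Kind;
  int Addr = 0, Value = 0;
};

static std::optional<int> findAvailableValue(const std::vector<ToyInst> &Block,
                                             std::size_t LoadIdx, int Addr,
                                             unsigned Budget) {
  for (std::size_t I = LoadIdx; I-- > 0;) {
    if (Budget-- == 0)
      return std::nullopt; // scanned too far: give up, like "return nullptr"
    if (Block[I].Kind == ToyInst::Store && Block[I].Addr == Addr)
      return Block[I].Value; // the stored value is available to the load
  }
  return std::nullopt; // hit the block start without finding the value
}

int main() {
  std::vector<ToyInst> B = {{ToyInst::Store, 1, 42}, {ToyInst::Other},
                            {ToyInst::Other}};
  std::cout << findAvailableValue(B, 3, 1, 6).value_or(-1) << "\n"; // 42
  std::cout << findAvailableValue(B, 3, 1, 2).value_or(-1) << "\n"; // -1
  return 0;
}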
} for (; I != E; ++I) { if (*I == BB) @@ -637,7 +637,7 @@ void LoopInfo::updateUnloop(Loop *Unloop) { // Blocks no longer have a parent but are still referenced by Unloop until // the Unloop object is deleted. - LI.changeLoopFor(*I, 0); + LI.changeLoopFor(*I, nullptr); } // Remove the loop from the top-level LoopInfo object. diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp index 38e753f..8df18e7 100644 --- a/lib/Analysis/LoopPass.cpp +++ b/lib/Analysis/LoopPass.cpp @@ -15,10 +15,13 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Timer.h" using namespace llvm; +#define DEBUG_TYPE "loop-pass-manager" + namespace { /// PrintLoopPass - Print a Function corresponding to a Loop. @@ -61,8 +64,8 @@ LPPassManager::LPPassManager() : FunctionPass(ID), PMDataManager() { skipThisLoop = false; redoThisLoop = false; - LI = NULL; - CurrentLoop = NULL; + LI = nullptr; + CurrentLoop = nullptr; } /// Delete loop from the loop queue and loop hierarchy (LoopInfo). @@ -251,6 +254,8 @@ bool LPPassManager::runOnFunction(Function &F) { // Then call the regular verifyAnalysis functions. verifyPreservedAnalysis(P); + + F.getContext().yield(); } removeNotPreservedAnalysis(P); diff --git a/lib/Analysis/MemDepPrinter.cpp b/lib/Analysis/MemDepPrinter.cpp index bc1dc69..10da3d5 100644 --- a/lib/Analysis/MemDepPrinter.cpp +++ b/lib/Analysis/MemDepPrinter.cpp @@ -46,7 +46,7 @@ namespace { bool runOnFunction(Function &F) override; - void print(raw_ostream &OS, const Module * = 0) const override; + void print(raw_ostream &OS, const Module * = nullptr) const override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredTransitive(); @@ -56,7 +56,7 @@ namespace { void releaseMemory() override { Deps.clear(); - F = 0; + F = nullptr; } private: @@ -106,7 +106,7 @@ bool MemDepPrinter::runOnFunction(Function &F) { MemDepResult Res = MDA.getDependency(Inst); if (!Res.isNonLocal()) { Deps[Inst].insert(std::make_pair(getInstTypePair(Res), - static_cast(0))); + static_cast(nullptr))); } else if (CallSite CS = cast(Inst)) { const MemoryDependenceAnalysis::NonLocalDepInfo &NLDI = MDA.getNonLocalCallDependency(CS); @@ -122,8 +122,8 @@ bool MemDepPrinter::runOnFunction(Function &F) { if (LoadInst *LI = dyn_cast(Inst)) { if (!LI->isUnordered()) { // FIXME: Handle atomic/volatile loads. - Deps[Inst].insert(std::make_pair(getInstTypePair(0, Unknown), - static_cast(0))); + Deps[Inst].insert(std::make_pair(getInstTypePair(nullptr, Unknown), + static_cast(nullptr))); continue; } AliasAnalysis::Location Loc = AA.getLocation(LI); @@ -131,8 +131,8 @@ bool MemDepPrinter::runOnFunction(Function &F) { } else if (StoreInst *SI = dyn_cast(Inst)) { if (!SI->isUnordered()) { // FIXME: Handle atomic/volatile stores. 
- Deps[Inst].insert(std::make_pair(getInstTypePair(0, Unknown), - static_cast(0))); + Deps[Inst].insert(std::make_pair(getInstTypePair(nullptr, Unknown), + static_cast(nullptr))); continue; } AliasAnalysis::Location Loc = AA.getLocation(SI); diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp index 1dba323..64d339f 100644 --- a/lib/Analysis/MemoryBuiltins.cpp +++ b/lib/Analysis/MemoryBuiltins.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "memory-builtins" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" @@ -30,6 +29,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "memory-builtins" + enum AllocType { OpNewLike = 1<<0, // allocates; never returns null MallocLike = 1<<1 | OpNewLike, // allocates; may return null @@ -76,14 +77,14 @@ static Function *getCalledFunction(const Value *V, bool LookThroughBitCast) { CallSite CS(const_cast(V)); if (!CS.getInstruction()) - return 0; + return nullptr; if (CS.isNoBuiltin()) - return 0; + return nullptr; Function *Callee = CS.getCalledFunction(); if (!Callee || !Callee->isDeclaration()) - return 0; + return nullptr; return Callee; } @@ -94,17 +95,17 @@ static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy, bool LookThroughBitCast = false) { // Skip intrinsics if (isa(V)) - return 0; + return nullptr; Function *Callee = getCalledFunction(V, LookThroughBitCast); if (!Callee) - return 0; + return nullptr; // Make sure that the function is available. StringRef FnName = Callee->getName(); LibFunc::Func TLIFn; if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn)) - return 0; + return nullptr; unsigned i = 0; bool found = false; @@ -115,11 +116,11 @@ static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy, } } if (!found) - return 0; + return nullptr; const AllocFnsTy *FnData = &AllocationFnData[i]; if ((FnData->AllocTy & AllocTy) != FnData->AllocTy) - return 0; + return nullptr; // Check function prototype. int FstParam = FnData->FstParam; @@ -135,7 +136,7 @@ static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy, FTy->getParamType(SndParam)->isIntegerTy(32) || FTy->getParamType(SndParam)->isIntegerTy(64))) return FnData; - return 0; + return nullptr; } static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) { @@ -202,19 +203,19 @@ bool llvm::isOperatorNewLikeFn(const Value *V, const TargetLibraryInfo *TLI, /// ignore InvokeInst here. const CallInst *llvm::extractMallocCall(const Value *I, const TargetLibraryInfo *TLI) { - return isMallocLikeFn(I, TLI) ? dyn_cast(I) : 0; + return isMallocLikeFn(I, TLI) ? dyn_cast(I) : nullptr; } static Value *computeArraySize(const CallInst *CI, const DataLayout *DL, const TargetLibraryInfo *TLI, bool LookThroughSExt = false) { if (!CI) - return 0; + return nullptr; // The size of the malloc's result type must be known to determine array size. Type *T = getMallocAllocatedType(CI, TLI); if (!T || !T->isSized() || !DL) - return 0; + return nullptr; unsigned ElementSize = DL->getTypeAllocSize(T); if (StructType *ST = dyn_cast(T)) @@ -223,12 +224,12 @@ static Value *computeArraySize(const CallInst *CI, const DataLayout *DL, // If malloc call's arg can be determined to be a multiple of ElementSize, // return the multiple. Otherwise, return NULL. 
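computeArraySize only reports an array size when the allocation size is a provable multiple of the element size, and nothing otherwise. A constant-only standalone sketch of that divisibility check (the real code works on IR values and can optionally look through sext, per its LookThroughSExt parameter):

#include <cstdint>
#include <iostream>
#include <optional>

static std::optional<uint64_t> arraySize(uint64_t AllocBytes,
                                         uint64_t ElementSize) {
  if (ElementSize == 0 || AllocBytes % ElementSize != 0)
    return std::nullopt; // not a whole number of elements: report nothing
  return AllocBytes / ElementSize;
}

int main() {
  std::cout << arraySize(48, 8).value_or(0) << "\n"; // 6 elements
  std::cout << arraySize(50, 8).value_or(0) << "\n"; // 0: no clean multiple
  return 0;
}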
   Value *MallocArg = CI->getArgOperand(0);
-  Value *Multiple = 0;
+  Value *Multiple = nullptr;
   if (ComputeMultiple(MallocArg, ElementSize, Multiple, LookThroughSExt))
     return Multiple;
 
-  return 0;
+  return nullptr;
 }
 
 /// isArrayMalloc - Returns the corresponding CallInst if the instruction
@@ -245,7 +246,7 @@ const CallInst *llvm::isArrayMalloc(const Value *I,
     return CI;
 
   // CI is a non-array malloc or we can't figure out that it is an array malloc.
-  return 0;
+  return nullptr;
 }
 
 /// getMallocType - Returns the PointerType resulting from the malloc call.
@@ -257,7 +258,7 @@ PointerType *llvm::getMallocType(const CallInst *CI,
                                  const TargetLibraryInfo *TLI) {
   assert(isMallocLikeFn(CI, TLI) && "getMallocType and not malloc call");
 
-  PointerType *MallocType = 0;
+  PointerType *MallocType = nullptr;
   unsigned NumOfBitCastUses = 0;
 
   // Determine if CallInst has a bitcast use.
@@ -277,7 +278,7 @@ PointerType *llvm::getMallocType(const CallInst *CI,
     return cast<PointerType>(CI->getType());
 
   // Type could not be determined.
-  return 0;
+  return nullptr;
 }
 
 /// getMallocAllocatedType - Returns the Type allocated by malloc call.
@@ -288,7 +289,7 @@ PointerType *llvm::getMallocType(const CallInst *CI,
 Type *llvm::getMallocAllocatedType(const CallInst *CI,
                                    const TargetLibraryInfo *TLI) {
   PointerType *PT = getMallocType(CI, TLI);
-  return PT ? PT->getElementType() : 0;
+  return PT ? PT->getElementType() : nullptr;
 }
 
 /// getMallocArraySize - Returns the array size of a malloc call.  If the
@@ -308,7 +309,7 @@ Value *llvm::getMallocArraySize(CallInst *CI, const DataLayout *DL,
 /// is a calloc call.
 const CallInst *llvm::extractCallocCall(const Value *I,
                                         const TargetLibraryInfo *TLI) {
-  return isCallocLikeFn(I, TLI) ? cast<CallInst>(I) : 0;
+  return isCallocLikeFn(I, TLI) ? cast<CallInst>(I) : nullptr;
 }
 
 
@@ -316,15 +317,15 @@ const CallInst *llvm::extractCallocCall(const Value *I,
 const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
   const CallInst *CI = dyn_cast<CallInst>(I);
   if (!CI || isa<IntrinsicInst>(CI))
-    return 0;
+    return nullptr;
   Function *Callee = CI->getCalledFunction();
-  if (Callee == 0 || !Callee->isDeclaration())
-    return 0;
+  if (Callee == nullptr || !Callee->isDeclaration())
+    return nullptr;
 
   StringRef FnName = Callee->getName();
   LibFunc::Func TLIFn;
   if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn))
-    return 0;
+    return nullptr;
 
   unsigned ExpectedNumParams;
   if (TLIFn == LibFunc::free ||
@@ -335,18 +336,18 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
            TLIFn == LibFunc::ZdaPvRKSt9nothrow_t)   // delete[](void*, nothrow)
     ExpectedNumParams = 2;
   else
-    return 0;
+    return nullptr;
 
   // Check free prototype.
   // FIXME: workaround for PR5130, this will be obsolete when a nobuiltin
   // attribute will exist.
   FunctionType *FTy = Callee->getFunctionType();
   if (!FTy->getReturnType()->isVoidTy())
-    return 0;
+    return nullptr;
   if (FTy->getNumParams() != ExpectedNumParams)
-    return 0;
+    return nullptr;
   if (FTy->getParamType(0) != Type::getInt8PtrTy(Callee->getContext()))
-    return 0;
+    return nullptr;
 
   return CI;
 }
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index 015ded1..9eaf109 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -14,7 +14,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "memdep"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
@@ -33,6 +32,8 @@
 #include "llvm/Support/Debug.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "memdep"
+
 STATISTIC(NumCacheNonLocal, "Number of fully cached non-local responses");
 STATISTIC(NumCacheDirtyNonLocal, "Number of dirty cached non-local responses");
 STATISTIC(NumUncacheNonLocal, "Number of uncached non-local responses");
@@ -88,10 +89,10 @@ void MemoryDependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
 bool MemoryDependenceAnalysis::runOnFunction(Function &) {
   AA = &getAnalysis<AliasAnalysis>();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  DL = DLP ? &DLP->getDataLayout() : 0;
+  DL = DLP ? &DLP->getDataLayout() : nullptr;
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-  DT = DTWP ? &DTWP->getDomTree() : 0;
+  DT = DTWP ? &DTWP->getDomTree() : nullptr;
   if (!PredCache)
     PredCache.reset(new PredIteratorCache());
   return false;
@@ -261,10 +262,10 @@ isLoadLoadClobberIfExtendedToFullWidth(const AliasAnalysis::Location &MemLoc,
                                        const LoadInst *LI,
                                        const DataLayout *DL) {
   // If we have no target data, we can't do this.
-  if (DL == 0) return false;
+  if (!DL) return false;
 
   // If we haven't already computed the base/offset of MemLoc, do so now.
-  if (MemLocBase == 0)
+  if (!MemLocBase)
     MemLocBase = GetPointerBaseWithConstantOffset(MemLoc.Ptr, MemLocOffs, DL);
 
   unsigned Size = MemoryDependenceAnalysis::
@@ -362,13 +363,13 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
                          BasicBlock::iterator ScanIt, BasicBlock *BB,
                          Instruction *QueryInst) {
 
-  const Value *MemLocBase = 0;
+  const Value *MemLocBase = nullptr;
   int64_t MemLocOffset = 0;
   unsigned Limit = BlockScanLimit;
   bool isInvariantLoad = false;
   if (isLoad && QueryInst) {
     LoadInst *LI = dyn_cast<LoadInst>(QueryInst);
-    if (LI && LI->getMetadata(LLVMContext::MD_invariant_load) != 0)
+    if (LI && LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr)
       isInvariantLoad = true;
   }
 
@@ -696,7 +697,7 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
     if (Entry != Cache.begin() && std::prev(Entry)->getBB() == DirtyBB)
       --Entry;
 
-    NonLocalDepEntry *ExistingResult = 0;
+    NonLocalDepEntry *ExistingResult = nullptr;
     if (Entry != Cache.begin()+NumSortedEntries &&
         Entry->getBB() == DirtyBB) {
       // If we already have an entry, and if it isn't already dirty, the block
@@ -807,7 +808,7 @@ GetNonLocalInfoForBlock(const AliasAnalysis::Location &Loc,
   if (Entry != Cache->begin() && (Entry-1)->getBB() == BB)
     --Entry;
 
-  NonLocalDepEntry *ExistingResult = 0;
+  NonLocalDepEntry *ExistingResult = nullptr;
   if (Entry != Cache->begin()+NumSortedEntries && Entry->getBB() == BB)
     ExistingResult = &*Entry;
 
@@ -960,7 +961,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
   if (CacheInfo->TBAATag != Loc.TBAATag) {
     if (CacheInfo->TBAATag) {
       CacheInfo->Pair = BBSkipFirstBlockPair();
-      CacheInfo->TBAATag = 0;
+      CacheInfo->TBAATag = nullptr;
       for (NonLocalDepInfo::iterator DI = CacheInfo->NonLocalDeps.begin(),
            DE = CacheInfo->NonLocalDeps.end(); DI != DE; ++DI)
         if (Instruction *Inst = DI->getResult().getInst())
@@ -1116,7 +1117,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
       SortNonLocalDepInfoCache(*Cache, NumSortedEntries);
       NumSortedEntries = Cache->size();
     }
-    Cache = 0;
+    Cache = nullptr;
 
     PredList.clear();
     for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
@@ -1126,7 +1127,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
       // Get the PHI translated pointer in this predecessor.  This can fail if
       // not translatable, in which case the getAddr() returns null.
       PHITransAddr &PredPointer = PredList.back().second;
-      PredPointer.PHITranslateValue(BB, Pred, 0);
+      PredPointer.PHITranslateValue(BB, Pred, nullptr);
 
       Value *PredPtrVal = PredPointer.getAddr();
 
@@ -1175,7 +1176,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
       // predecessor, then we have to assume that the pointer is clobbered in
       // that predecessor.  We can still do PRE of the load, which would insert
      // a computation of the pointer in this predecessor.
-      if (PredPtrVal == 0)
+      if (!PredPtrVal)
         CanTranslate = false;
 
       // FIXME: it is entirely possible that PHI translating will end up with
@@ -1224,7 +1225,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
     // for the given block.  It assumes that we haven't modified any of
     // our datastructures while processing the current block.
 
-    if (Cache == 0) {
+    if (!Cache) {
       // Refresh the CacheInfo/Cache pointer if it got invalidated.
       CacheInfo = &NonLocalPointerDeps[CacheKey];
       Cache = &CacheInfo->NonLocalDeps;
@@ -1279,7 +1280,7 @@ RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair P) {
   for (unsigned i = 0, e = PInfo.size(); i != e; ++i) {
     Instruction *Target = PInfo[i].getResult().getInst();
-    if (Target == 0) continue;  // Ignore non-local dep results.
+    if (!Target) continue;  // Ignore non-local dep results.
     assert(Target->getParent() == PInfo[i].getBB());
 
     // Eliminating the dirty entry from 'Cache', so update the reverse info.
diff --git a/lib/Analysis/NoAliasAnalysis.cpp b/lib/Analysis/NoAliasAnalysis.cpp
index 0c119d6..4e11e50 100644
--- a/lib/Analysis/NoAliasAnalysis.cpp
+++ b/lib/Analysis/NoAliasAnalysis.cpp
@@ -36,7 +36,7 @@ namespace {
       // Note: NoAA does not call InitializeAliasAnalysis because it's
       // special and does not support chaining.
       DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-      DL = DLP ? &DLP->getDataLayout() : 0;
+      DL = DLP ? &DLP->getDataLayout() : nullptr;
     }
 
     AliasResult alias(const Location &LocA, const Location &LocB) override {
diff --git a/lib/Analysis/PHITransAddr.cpp b/lib/Analysis/PHITransAddr.cpp
index ad3685a..bfe8642 100644
--- a/lib/Analysis/PHITransAddr.cpp
+++ b/lib/Analysis/PHITransAddr.cpp
@@ -43,7 +43,7 @@ static bool CanPHITrans(Instruction *Inst) {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void PHITransAddr::dump() const {
-  if (Addr == 0) {
+  if (!Addr) {
     dbgs() << "PHITransAddr: null\n";
     return;
   }
@@ -58,7 +58,7 @@ static bool VerifySubExpr(Value *Expr,
                           SmallVectorImpl<Instruction*> &InstInputs) {
   // If this is a non-instruction value, there is nothing to do.
   Instruction *I = dyn_cast<Instruction>(Expr);
-  if (I == 0) return true;
+  if (!I) return true;
 
   // If it's an instruction, it is either in Tmp or its operands recursively
   // are.
@@ -90,7 +90,7 @@ static bool VerifySubExpr(Value *Expr,
 /// structure is valid, it returns true.  If invalid, it prints errors and
 /// returns false.
 bool PHITransAddr::Verify() const {
-  if (Addr == 0) return true;
+  if (!Addr) return true;
 
   SmallVector<Instruction*, 8> Tmp(InstInputs.begin(), InstInputs.end());
 
@@ -116,14 +116,14 @@ bool PHITransAddr::IsPotentiallyPHITranslatable() const {
   // If the input value is not an instruction, or if it is not defined in CurBB,
   // then we don't need to phi translate it.
   Instruction *Inst = dyn_cast<Instruction>(Addr);
-  return Inst == 0 || CanPHITrans(Inst);
+  return !Inst || CanPHITrans(Inst);
 }
 
 
 static void RemoveInstInputs(Value *V,
                              SmallVectorImpl<Instruction*> &InstInputs) {
   Instruction *I = dyn_cast<Instruction>(V);
-  if (I == 0) return;
+  if (!I) return;
 
   // If the instruction is in the InstInputs list, remove it.
   SmallVectorImpl<Instruction*>::iterator Entry =
@@ -147,7 +147,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
                                          const DominatorTree *DT) {
   // If this is a non-instruction value, it can't require PHI translation.
   Instruction *Inst = dyn_cast<Instruction>(V);
-  if (Inst == 0) return V;
+  if (!Inst) return V;
 
   // Determine whether 'Inst' is an input to our PHI translatable expression.
   bool isInput = std::count(InstInputs.begin(), InstInputs.end(), Inst);
@@ -173,7 +173,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
   // If this is a non-phi value, and it is analyzable, we can incorporate it
   // into the expression by making all instruction operands be inputs.
   if (!CanPHITrans(Inst))
-    return 0;
+    return nullptr;
 
   // All instruction operands are now inputs (and of course, they may also be
   // defined in this block, so they may need to be phi translated themselves.
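Aside: PHITranslateValue (updated in the hunks below) returns true on *failure*, which is easy to misread. A minimal caller sketch, hypothetical and not part of this patch, using only the interface exercised above:

    #include "llvm/Analysis/PHITransAddr.h"
    using namespace llvm;

    // Hypothetical helper: compute what address 'Ptr' (as used in BB) is
    // called in predecessor 'Pred', without inserting new instructions.
    static Value *addrInPred(Value *Ptr, BasicBlock *BB, BasicBlock *Pred,
                             const DataLayout *DL) {
      PHITransAddr Address(Ptr, DL);
      if (!Address.IsPotentiallyPHITranslatable())
        return nullptr;
      // Note the convention: PHITranslateValue returns true when translation
      // fails; on success, getAddr() is the value as seen in Pred.
      if (Address.PHITranslateValue(BB, Pred, /*DT=*/nullptr))
        return nullptr;
      return Address.getAddr();
    }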
@@ -187,9 +187,9 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
   // operands need to be phi translated, and if so, reconstruct it.
 
   if (CastInst *Cast = dyn_cast<CastInst>(Inst)) {
-    if (!isSafeToSpeculativelyExecute(Cast)) return 0;
+    if (!isSafeToSpeculativelyExecute(Cast)) return nullptr;
     Value *PHIIn = PHITranslateSubExpr(Cast->getOperand(0), CurBB, PredBB, DT);
-    if (PHIIn == 0) return 0;
+    if (!PHIIn) return nullptr;
     if (PHIIn == Cast->getOperand(0))
       return Cast;
 
@@ -209,7 +209,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
             (!DT || DT->dominates(CastI->getParent(), PredBB)))
           return CastI;
     }
-    return 0;
+    return nullptr;
   }
 
   // Handle getelementptr with at least one PHI translatable operand.
@@ -218,7 +218,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
     bool AnyChanged = false;
     for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) {
       Value *GEPOp = PHITranslateSubExpr(GEP->getOperand(i), CurBB, PredBB, DT);
-      if (GEPOp == 0) return 0;
+      if (!GEPOp) return nullptr;
 
       AnyChanged |= GEPOp != GEP->getOperand(i);
       GEPOps.push_back(GEPOp);
@@ -253,7 +253,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
           return GEPI;
       }
     }
-    return 0;
+    return nullptr;
   }
 
   // Handle add with a constant RHS.
@@ -265,7 +265,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
     bool isNUW = cast<BinaryOperator>(Inst)->hasNoUnsignedWrap();
 
     Value *LHS = PHITranslateSubExpr(Inst->getOperand(0), CurBB, PredBB, DT);
-    if (LHS == 0) return 0;
+    if (!LHS) return nullptr;
 
     // If the PHI translated LHS is an add of a constant, fold the immediates.
     if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(LHS))
@@ -304,11 +304,11 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
         return BO;
     }
 
-    return 0;
+    return nullptr;
   }
 
   // Otherwise, we failed.
-  return 0;
+  return nullptr;
 }
 
 
@@ -326,10 +326,10 @@ bool PHITransAddr::PHITranslateValue(BasicBlock *CurBB, BasicBlock *PredBB,
     // Make sure the value is live in the predecessor.
     if (Instruction *Inst = dyn_cast_or_null<Instruction>(Addr))
       if (!DT->dominates(Inst->getParent(), PredBB))
-        Addr = 0;
+        Addr = nullptr;
   }
 
-  return Addr == 0;
+  return Addr == nullptr;
 }
 
 /// PHITranslateWithInsertion - PHI translate this value into the specified
@@ -354,7 +354,7 @@ PHITranslateWithInsertion(BasicBlock *CurBB, BasicBlock *PredBB,
   // If not, destroy any intermediate instructions inserted.
   while (NewInsts.size() != NISize)
     NewInsts.pop_back_val()->eraseFromParent();
-  return 0;
+  return nullptr;
 }
 
 
@@ -379,10 +379,10 @@ InsertPHITranslatedSubExpr(Value *InVal, BasicBlock *CurBB,
 
   // Handle cast of PHI translatable value.
   if (CastInst *Cast = dyn_cast<CastInst>(Inst)) {
-    if (!isSafeToSpeculativelyExecute(Cast)) return 0;
+    if (!isSafeToSpeculativelyExecute(Cast)) return nullptr;
     Value *OpVal = InsertPHITranslatedSubExpr(Cast->getOperand(0),
                                               CurBB, PredBB, DT, NewInsts);
-    if (OpVal == 0) return 0;
+    if (!OpVal) return nullptr;
 
     // Otherwise insert a cast at the end of PredBB.
     CastInst *New = CastInst::Create(Cast->getOpcode(),
@@ -400,7 +400,7 @@ InsertPHITranslatedSubExpr(Value *InVal, BasicBlock *CurBB,
     for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) {
       Value *OpVal = InsertPHITranslatedSubExpr(GEP->getOperand(i),
                                                 CurBB, PredBB, DT, NewInsts);
-      if (OpVal == 0) return 0;
+      if (!OpVal) return nullptr;
       GEPOps.push_back(OpVal);
     }
 
@@ -436,5 +436,5 @@ InsertPHITranslatedSubExpr(Value *InVal, BasicBlock *CurBB,
   }
 #endif
 
-  return 0;
+  return nullptr;
 }
diff --git a/lib/Analysis/PostDominators.cpp b/lib/Analysis/PostDominators.cpp
index f23833a..6d92909 100644
--- a/lib/Analysis/PostDominators.cpp
+++ b/lib/Analysis/PostDominators.cpp
@@ -11,8 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "postdomtree"
-
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SetOperations.h"
@@ -22,6 +20,8 @@
 #include "llvm/Support/GenericDomTreeConstruction.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "postdomtree"
+
 //===----------------------------------------------------------------------===//
 //  PostDominatorTree Implementation
 //===----------------------------------------------------------------------===//
diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp
index f4da598..7f88ae1 100644
--- a/lib/Analysis/RegionInfo.cpp
+++ b/lib/Analysis/RegionInfo.cpp
@@ -9,7 +9,6 @@
 // Detects single entry single exit regions in the control flow graph.
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "region"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Statistic.h"
@@ -19,10 +18,13 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <algorithm>
+#include <memory>
 #include <set>
 using namespace llvm;
 
+#define DEBUG_TYPE "region"
+
 // Always verify if expensive checking is enabled.
 #ifdef XDEBUG
 static bool VerifyRegionInfo = true;
@@ -62,9 +64,6 @@ Region::~Region() {
   // Only clean the cache for this Region. Caches of child Regions will be
   // cleaned when the child Regions are deleted.
   BBNodeMap.clear();
-
-  for (iterator I = begin(), E = end(); I != E; ++I)
-    delete *I;
 }
 
 void Region::replaceEntry(BasicBlock *BB) {
@@ -88,7 +87,7 @@ void Region::replaceEntryRecursive(BasicBlock *NewEntry) {
     R->replaceEntry(NewEntry);
     for (Region::const_iterator RI = R->begin(), RE = R->end(); RI != RE; ++RI)
       if ((*RI)->getEntry() == OldEntry)
-        RegionQueue.push_back(*RI);
+        RegionQueue.push_back(RI->get());
   }
 }
 
@@ -104,7 +103,7 @@ void Region::replaceExitRecursive(BasicBlock *NewExit) {
     R->replaceExit(NewExit);
     for (Region::const_iterator RI = R->begin(), RE = R->end(); RI != RE; ++RI)
       if ((*RI)->getExit() == OldExit)
-        RegionQueue.push_back(*RI);
+        RegionQueue.push_back(RI->get());
   }
 }
 
@@ -128,8 +127,8 @@ bool Region::contains(const Loop *L) const {
   // BBs that are not part of any loop are element of the Loop
   // described by the NULL pointer. This loop is not part of any region,
   // except if the region describes the whole function.
-  if (L == 0)
-    return getExit() == 0;
+  if (!L)
+    return getExit() == nullptr;
 
   if (!contains(L->getHeader()))
     return false;
@@ -147,7 +146,7 @@ bool Region::contains(const Loop *L) const {
 
 Loop *Region::outermostLoopInRegion(Loop *L) const {
   if (!contains(L))
-    return 0;
+    return nullptr;
 
   while (L && contains(L->getParentLoop())) {
     L = L->getParentLoop();
@@ -165,14 +164,14 @@ Loop *Region::outermostLoopInRegion(LoopInfo *LI, BasicBlock* BB) const {
 BasicBlock *Region::getEnteringBlock() const {
   BasicBlock *entry = getEntry();
   BasicBlock *Pred;
-  BasicBlock *enteringBlock = 0;
+  BasicBlock *enteringBlock = nullptr;
 
   for (pred_iterator PI = pred_begin(entry), PE = pred_end(entry); PI != PE;
        ++PI) {
     Pred = *PI;
     if (DT->getNode(Pred) && !contains(Pred)) {
       if (enteringBlock)
-        return 0;
+        return nullptr;
 
       enteringBlock = Pred;
     }
@@ -184,17 +183,17 @@ BasicBlock *Region::getEnteringBlock() const {
 BasicBlock *Region::getExitingBlock() const {
   BasicBlock *exit = getExit();
   BasicBlock *Pred;
-  BasicBlock *exitingBlock = 0;
+  BasicBlock *exitingBlock = nullptr;
 
   if (!exit)
-    return 0;
+    return nullptr;
 
   for (pred_iterator PI = pred_begin(exit), PE = pred_end(exit); PI != PE;
       ++PI) {
     Pred = *PI;
     if (contains(Pred)) {
       if (exitingBlock)
-        return 0;
+        return nullptr;
 
       exitingBlock = Pred;
     }
@@ -295,7 +294,7 @@ Region* Region::getSubRegionNode(BasicBlock *BB) const {
   Region *R = RI->getRegionFor(BB);
 
   if (!R || R == this)
-    return 0;
+    return nullptr;
 
   // If we pass the BB out of this region, that means our code is broken.
   assert(contains(R) && "BB not in current region!");
@@ -304,7 +303,7 @@ Region* Region::getSubRegionNode(BasicBlock *BB) const {
     R = R->getParent();
 
   if (R->getEntry() != BB)
-    return 0;
+    return nullptr;
 
   return R;
 }
@@ -333,18 +332,20 @@ RegionNode* Region::getNode(BasicBlock *BB) const {
 void Region::transferChildrenTo(Region *To) {
   for (iterator I = begin(), E = end(); I != E; ++I) {
     (*I)->parent = To;
-    To->children.push_back(*I);
+    To->children.push_back(std::move(*I));
   }
   children.clear();
 }
 
 void Region::addSubRegion(Region *SubRegion, bool moveChildren) {
-  assert(SubRegion->parent == 0 && "SubRegion already has a parent!");
-  assert(std::find(begin(), end(), SubRegion) == children.end()
-         && "Subregion already exists!");
+  assert(!SubRegion->parent && "SubRegion already has a parent!");
+  assert(std::find_if(begin(), end(), [&](const std::unique_ptr<Region> &R) {
+           return R.get() == SubRegion;
+         }) == children.end() &&
+         "Subregion already exists!");
 
   SubRegion->parent = this;
-  children.push_back(SubRegion);
+  children.push_back(std::unique_ptr<Region>(SubRegion));
 
   if (!moveChildren)
     return;
@@ -360,23 +361,27 @@ void Region::addSubRegion(Region *SubRegion, bool moveChildren) {
       RI->setRegionFor(BB, SubRegion);
   }
 
-  std::vector<Region*> Keep;
+  std::vector<std::unique_ptr<Region>> Keep;
   for (iterator I = begin(), E = end(); I != E; ++I)
-    if (SubRegion->contains(*I) && *I != SubRegion) {
-      SubRegion->children.push_back(*I);
+    if (SubRegion->contains(I->get()) && I->get() != SubRegion) {
       (*I)->parent = SubRegion;
+      SubRegion->children.push_back(std::move(*I));
     } else
-      Keep.push_back(*I);
+      Keep.push_back(std::move(*I));
 
   children.clear();
-  children.insert(children.begin(), Keep.begin(), Keep.end());
+  children.insert(children.begin(),
+                  std::move_iterator(Keep.begin()),
+                  std::move_iterator(Keep.end()));
 }
 
 
 Region *Region::removeSubRegion(Region *Child) {
   assert(Child->parent == this && "Child is not a child of this region!");
-  Child->parent = 0;
-  RegionSet::iterator I = std::find(children.begin(), children.end(), Child);
+  Child->parent = nullptr;
+  RegionSet::iterator I = std::find_if(
+      children.begin(), children.end(),
+      [&](const std::unique_ptr<Region> &R) { return R.get() == Child; });
   assert(I != children.end() && "Region does not exit. Unable to remove.");
   children.erase(children.begin()+(I-begin()));
   return Child;
@@ -385,7 +390,7 @@ Region *Region::removeSubRegion(Region *Child) {
 unsigned Region::getDepth() const {
   unsigned Depth = 0;
 
-  for (Region *R = parent; R != 0; R = R->parent)
+  for (Region *R = parent; R != nullptr; R = R->parent)
     ++Depth;
 
   return Depth;
@@ -395,12 +400,12 @@ Region *Region::getExpandedRegion() const {
   unsigned NumSuccessors = exit->getTerminator()->getNumSuccessors();
 
   if (NumSuccessors == 0)
-    return NULL;
+    return nullptr;
 
   for (pred_iterator PI = pred_begin(getExit()), PE = pred_end(getExit());
        PI != PE; ++PI)
     if (!DT->dominates(getEntry(), *PI))
-      return NULL;
+      return nullptr;
 
   Region *R = RI->getRegionFor(exit);
 
@@ -408,7 +413,7 @@ Region *Region::getExpandedRegion() const {
     if (exit->getTerminator()->getNumSuccessors() == 1)
       return new Region(getEntry(), *succ_begin(exit), RI, DT);
     else
-      return NULL;
+      return nullptr;
   }
 
   while (R->getParent() && R->getParent()->getEntry() == exit)
@@ -418,7 +423,7 @@ Region *Region::getExpandedRegion() const {
     for (pred_iterator PI = pred_begin(getExit()), PE = pred_end(getExit());
          PI != PE; ++PI)
       if (!DT->dominates(R->getExit(), *PI))
-        return NULL;
+        return nullptr;
 
   return new Region(getEntry(), R->getExit(), RI, DT);
 }
@@ -577,7 +582,7 @@ Region *RegionInfo::createRegion(BasicBlock *entry, BasicBlock *exit) {
   assert(entry && exit && "entry and exit must not be null!");
 
   if (isTrivialRegion(entry, exit))
-    return 0;
+    return nullptr;
 
   Region *region = new Region(entry, exit, this, DT);
   BBtoRegion.insert(std::make_pair(entry, region));
@@ -600,7 +605,7 @@ void RegionInfo::findRegionsWithEntry(BasicBlock *entry, BBtoBBMap *ShortCut) {
   if (!N)
     return;
 
-  Region *lastRegion= 0;
+  Region *lastRegion= nullptr;
   BasicBlock *lastExit = entry;
 
   // As only a BasicBlock that postdominates entry can finish a region, walk the
@@ -680,12 +685,12 @@ void RegionInfo::releaseMemory() {
   BBtoRegion.clear();
   if (TopLevelRegion)
     delete TopLevelRegion;
-  TopLevelRegion = 0;
+  TopLevelRegion = nullptr;
 }
 
 RegionInfo::RegionInfo() : FunctionPass(ID) {
   initializeRegionInfoPass(*PassRegistry::getPassRegistry());
-  TopLevelRegion = 0;
+  TopLevelRegion = nullptr;
 }
 
 RegionInfo::~RegionInfo() {
@@ -710,7 +715,7 @@ bool RegionInfo::runOnFunction(Function &F) {
   PDT = &getAnalysis<PostDominatorTree>();
   DF = &getAnalysis<DominanceFrontier>();
 
-  TopLevelRegion = new Region(&F.getEntryBlock(), 0, this, DT, 0);
+  TopLevelRegion = new Region(&F.getEntryBlock(), nullptr, this, DT, nullptr);
   updateStatistics(TopLevelRegion);
 
   Calculate(F);
@@ -744,7 +749,7 @@ void RegionInfo::verifyAnalysis() const {
 Region *RegionInfo::getRegionFor(BasicBlock *BB) const {
   BBtoRegionMap::const_iterator I= BBtoRegion.find(BB);
-  return I != BBtoRegion.end() ? I->second : 0;
+  return I != BBtoRegion.end() ? I->second : nullptr;
 }
 
 void RegionInfo::setRegionFor(BasicBlock *BB, Region *R) {
@@ -756,7 +761,7 @@ Region *RegionInfo::operator[](BasicBlock *BB) const {
 }
 
 BasicBlock *RegionInfo::getMaxRegionExit(BasicBlock *BB) const {
-  BasicBlock *Exit = NULL;
+  BasicBlock *Exit = nullptr;
 
   while (true) {
     // Get largest region that starts at BB.
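Aside: the children container in Region is now a vector of std::unique_ptr<Region>, which is why the lookups above moved from std::find to std::find_if over the raw pointee. A self-contained sketch of the same pattern on a toy Node type (illustrative only, not LLVM code):

    #include <algorithm>
    #include <memory>
    #include <vector>

    struct Node { Node *parent = nullptr; };

    // Remove 'Child' from an owning container and hand it back to the caller.
    Node *removeChild(std::vector<std::unique_ptr<Node>> &Children, Node *Child) {
      auto I = std::find_if(Children.begin(), Children.end(),
                            [&](const std::unique_ptr<Node> &N) {
                              return N.get() == Child;
                            });
      if (I == Children.end())
        return nullptr;
      I->release();        // give up ownership so erase() won't destroy Child
      Children.erase(I);
      Child->parent = nullptr;
      return Child;
    }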
diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp
index 12d7ca3..3c7798f 100644
--- a/lib/Analysis/RegionPass.cpp
+++ b/lib/Analysis/RegionPass.cpp
@@ -17,10 +17,11 @@
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Support/Timer.h"
 
-#define DEBUG_TYPE "regionpassmgr"
 #include "llvm/Support/Debug.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "regionpassmgr"
+
 //===----------------------------------------------------------------------===//
 // RGPassManager
 //
@@ -31,15 +32,15 @@ RGPassManager::RGPassManager()
   : FunctionPass(ID), PMDataManager() {
   skipThisRegion = false;
   redoThisRegion = false;
-  RI = NULL;
-  CurrentRegion = NULL;
+  RI = nullptr;
+  CurrentRegion = nullptr;
 }
 
 // Recurse through all subregions and all regions  into RQ.
-static void addRegionIntoQueue(Region *R, std::deque<Region *> &RQ) {
-  RQ.push_back(R);
-  for (Region::iterator I = R->begin(), E = R->end(); I != E; ++I)
-    addRegionIntoQueue(*I, RQ);
+static void addRegionIntoQueue(Region &R, std::deque<Region *> &RQ) {
+  RQ.push_back(&R);
+  for (const auto &E : R)
+    addRegionIntoQueue(*E, RQ);
 }
 
 /// Pass Manager itself does not invalidate any analysis info.
@@ -57,7 +58,7 @@ bool RGPassManager::runOnFunction(Function &F) {
   // Collect inherited analysis from Module level pass manager.
   populateInheritedAnalysis(TPM->activeStack);
 
-  addRegionIntoQueue(RI->getTopLevelRegion(), RQ);
+  addRegionIntoQueue(*RI->getTopLevelRegion(), RQ);
 
   if (RQ.empty()) // No regions, skip calling finalizers
     return false;
@@ -185,7 +186,6 @@ private:
 
 public:
   static char ID;
-  PrintRegionPass() : RegionPass(ID), Out(dbgs()) {}
   PrintRegionPass(const std::string &B, raw_ostream &o)
     : RegionPass(ID), Banner(B), Out(o) {}
 
diff --git a/lib/Analysis/RegionPrinter.cpp b/lib/Analysis/RegionPrinter.cpp
index 6467f47..893210a 100644
--- a/lib/Analysis/RegionPrinter.cpp
+++ b/lib/Analysis/RegionPrinter.cpp
@@ -98,31 +98,31 @@ struct DOTGraphTraits : public DOTGraphTraits {
 
   // Print the cluster of the subregions. This groups the single basic blocks
   // and adds a different background color for each group.
-  static void printRegionCluster(const Region *R, GraphWriter<RegionInfo*> &GW,
+  static void printRegionCluster(const Region &R, GraphWriter<RegionInfo*> &GW,
                                  unsigned depth = 0) {
     raw_ostream &O = GW.getOStream();
-    O.indent(2 * depth) << "subgraph cluster_" << static_cast<const void*>(R)
+    O.indent(2 * depth) << "subgraph cluster_" << static_cast<const void*>(&R)
       << " {\n";
     O.indent(2 * (depth + 1)) << "label = \"\";\n";
 
-    if (!onlySimpleRegions || R->isSimple()) {
+    if (!onlySimpleRegions || R.isSimple()) {
       O.indent(2 * (depth + 1)) << "style = filled;\n";
       O.indent(2 * (depth + 1)) << "color = "
-        << ((R->getDepth() * 2 % 12) + 1) << "\n";
+        << ((R.getDepth() * 2 % 12) + 1) << "\n";
 
     } else {
       O.indent(2 * (depth + 1)) << "style = solid;\n";
       O.indent(2 * (depth + 1)) << "color = "
-        << ((R->getDepth() * 2 % 12) + 2) << "\n";
+        << ((R.getDepth() * 2 % 12) + 2) << "\n";
     }
 
-    for (Region::const_iterator RI = R->begin(), RE = R->end(); RI != RE; ++RI)
-      printRegionCluster(*RI, GW, depth + 1);
+    for (Region::const_iterator RI = R.begin(), RE = R.end(); RI != RE; ++RI)
+      printRegionCluster(**RI, GW, depth + 1);
 
-    RegionInfo *RI = R->getRegionInfo();
+    RegionInfo *RI = R.getRegionInfo();
 
-    for (const auto &BB : R->blocks())
-      if (RI->getRegionFor(BB) == R)
+    for (const auto &BB : R.blocks())
+      if (RI->getRegionFor(BB) == &R)
         O.indent(2 * (depth + 1)) << "Node"
           << static_cast<const void*>(RI->getTopLevelRegion()->getBBNode(BB))
           << ";\n";
@@ -134,7 +134,7 @@ struct DOTGraphTraits : public DOTGraphTraits {
                                      GraphWriter<RegionInfo*> &GW) {
     raw_ostream &O = GW.getOStream();
     O << "\tcolorscheme = \"paired12\"\n";
-    printRegionCluster(RI->getTopLevelRegion(), GW, 4);
+    printRegionCluster(*RI->getTopLevelRegion(), GW, 4);
   }
 };
 } //end namespace llvm
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 08de621..42a7aa2 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -58,7 +58,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "scalar-evolution"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -89,6 +88,8 @@
 #include <algorithm>
 using namespace llvm;
 
+#define DEBUG_TYPE "scalar-evolution"
+
 STATISTIC(NumArrayLenItCounts,
           "Number of trip counts computed with array length");
 STATISTIC(NumTripCountsComputed,
@@ -182,7 +183,7 @@ void SCEV::print(raw_ostream &OS) const {
   case scUMaxExpr:
   case scSMaxExpr: {
     const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(this);
-    const char *OpStr = 0;
+    const char *OpStr = nullptr;
     switch (NAry->getSCEVType()) {
     case scAddExpr: OpStr = " + "; break;
    case scMulExpr: OpStr = " * "; break;
@@ -312,7 +313,7 @@ const SCEV *ScalarEvolution::getConstant(ConstantInt *V) {
   FoldingSetNodeID ID;
   ID.AddInteger(scConstant);
   ID.AddPointer(V);
-  void *IP = 0;
+  void *IP = nullptr;
   if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
   SCEV *S = new (SCEVAllocator) SCEVConstant(ID.Intern(SCEVAllocator), V);
   UniqueSCEVs.InsertNode(S, IP);
@@ -365,7 +366,7 @@ void SCEVUnknown::deleted() {
   SE->UniqueSCEVs.RemoveNode(this);
 
   // Release the value.
-  setValPtr(0);
+  setValPtr(nullptr);
 }
 
 void SCEVUnknown::allUsesReplacedWith(Value *New) {
@@ -829,7 +830,7 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op,
   ID.AddInteger(scTruncate);
   ID.AddPointer(Op);
   ID.AddPointer(Ty);
-  void *IP = 0;
+  void *IP = nullptr;
   if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
 
   // Fold if the operand is constant.
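Aside: every `void *IP = 0;` hunk in this file is the same FoldingSet uniquing idiom: probe with FindNodeOrInsertPos, and on a miss insert at the probed position. A standalone illustration with a toy node type (hypothetical, not from the patch):

    #include "llvm/ADT/FoldingSet.h"
    using namespace llvm;

    struct PairNode : FoldingSetNode {
      int A, B;
      PairNode(int A, int B) : A(A), B(B) {}
      void Profile(FoldingSetNodeID &ID) const {
        ID.AddInteger(A);
        ID.AddInteger(B);
      }
    };

    static FoldingSet<PairNode> UniquePairs;

    PairNode *getPair(int A, int B) {
      FoldingSetNodeID ID;
      ID.AddInteger(A);
      ID.AddInteger(B);
      void *IP = nullptr;              // insertion hint, filled in on a miss
      if (PairNode *N = UniquePairs.FindNodeOrInsertPos(ID, IP))
        return N;                      // hit: return the existing, uniqued node
      PairNode *N = new PairNode(A, B);
      UniquePairs.InsertNode(N, IP);   // miss: insert at the probed position
      return N;
    }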
@@ -919,7 +920,7 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
   ID.AddInteger(scZeroExtend);
   ID.AddPointer(Op);
   ID.AddPointer(Ty);
-  void *IP = 0;
+  void *IP = nullptr;
   if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
 
   // zext(trunc(x)) --> zext(x) or x or trunc(x)
@@ -1072,7 +1073,7 @@ static const SCEV *getOverflowLimitForStep(const SCEV *Step,
     return SE->getConstant(APInt::getSignedMaxValue(BitWidth) -
                            SE->getSignedRange(Step).getSignedMin());
   }
-  return 0;
+  return nullptr;
 }
 
 // The recurrence AR has been shown to have no signed wrap. Typically, if we can
@@ -1091,19 +1092,18 @@ static const SCEV *getPreStartForSignExtend(const SCEVAddRecExpr *AR,
   // Check for a simple looking step prior to loop entry.
   const SCEVAddExpr *SA = dyn_cast<SCEVAddExpr>(Start);
   if (!SA)
-    return 0;
+    return nullptr;
 
   // Create an AddExpr for "PreStart" after subtracting Step. Full SCEV
   // subtraction is expensive. For this purpose, perform a quick and dirty
   // difference, by checking for Step in the operand list.
   SmallVector<const SCEV *, 4> DiffOps;
-  for (SCEVAddExpr::op_iterator I = SA->op_begin(), E = SA->op_end();
-       I != E; ++I) {
-    if (*I != Step)
-      DiffOps.push_back(*I);
-  }
+  for (const SCEV *Op : SA->operands())
+    if (Op != Step)
+      DiffOps.push_back(Op);
+
   if (DiffOps.size() == SA->getNumOperands())
-    return 0;
+    return nullptr;
 
   // This is a postinc AR. Check for overflow on the preinc recurrence using the
   // same three conditions that getSignExtendedExpr checks.
@@ -1139,7 +1139,7 @@ static const SCEV *getPreStartForSignExtend(const SCEVAddRecExpr *AR,
       SE->isLoopEntryGuardedByCond(L, Pred, PreStart, OverflowLimit)) {
     return PreStart;
   }
-  return 0;
+  return nullptr;
 }
 
 // Get the normalized sign-extended expression for this AddRec's Start.
@@ -1181,7 +1181,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
   ID.AddInteger(scSignExtend);
   ID.AddPointer(Op);
   ID.AddPointer(Ty);
-  void *IP = 0;
+  void *IP = nullptr;
   if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
 
   // If the input value is provably positive, build a zext instead.
@@ -1201,6 +1201,23 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
       return getTruncateOrSignExtend(X, Ty);
   }
 
+  // sext(C1 + (C2 * x)) --> C1 + sext(C2 * x) if C1 < C2
+  if (auto SA = dyn_cast<SCEVAddExpr>(Op)) {
+    if (SA->getNumOperands() == 2) {
+      auto SC1 = dyn_cast<SCEVConstant>(SA->getOperand(0));
+      auto SMul = dyn_cast<SCEVMulExpr>(SA->getOperand(1));
+      if (SMul && SC1) {
+        if (auto SC2 = dyn_cast<SCEVConstant>(SMul->getOperand(0))) {
+          const APInt &C1 = SC1->getValue()->getValue();
+          const APInt &C2 = SC2->getValue()->getValue();
+          if (C1.isStrictlyPositive() && C2.isStrictlyPositive() &&
+              C2.ugt(C1) && C2.isPowerOf2())
+            return getAddExpr(getSignExtendExpr(SC1, Ty),
+                              getSignExtendExpr(SMul, Ty));
+        }
+      }
+    }
+  }
   // If the input value is a chrec scev, and we can prove that the value
   // did not overflow the old, smaller, value, we can sign extend all of the
   // operands (often constants).  This allows analysis of something like
@@ -1292,6 +1309,22 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
                              L, AR->getNoWrapFlags());
       }
     }
+    // If Start and Step are constants, check if we can apply this
+    // transformation:
+    // sext{C1,+,C2} --> C1 + sext{0,+,C2} if C1 < C2
+    auto SC1 = dyn_cast<SCEVConstant>(Start);
+    auto SC2 = dyn_cast<SCEVConstant>(Step);
+    if (SC1 && SC2) {
+      const APInt &C1 = SC1->getValue()->getValue();
+      const APInt &C2 = SC2->getValue()->getValue();
+      if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) &&
+          C2.isPowerOf2()) {
+        Start = getSignExtendExpr(Start, Ty);
+        const SCEV *NewAR = getAddRecExpr(getConstant(AR->getType(), 0), Step,
+                                          L, AR->getNoWrapFlags());
+        return getAddExpr(Start, getSignExtendExpr(NewAR, Ty));
+      }
+    }
   }
 
   // The cast wasn't folded; create an explicit cast node.
@@ -1340,9 +1373,8 @@ const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op,
   // Force the cast to be folded into the operands of an addrec.
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Op)) {
     SmallVector<const SCEV *, 4> Ops;
-    for (SCEVAddRecExpr::op_iterator I = AR->op_begin(), E = AR->op_end();
-         I != E; ++I)
-      Ops.push_back(getAnyExtendExpr(*I, Ty));
+    for (const SCEV *Op : AR->operands())
+      Ops.push_back(getAnyExtendExpr(Op, Ty));
     return getAddRecExpr(Ops, AR->getLoop(), SCEV::FlagNW);
   }
 
@@ -1811,7 +1843,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
   ID.AddInteger(scAddExpr);
   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
     ID.AddPointer(Ops[i]);
-  void *IP = 0;
+  void *IP = nullptr;
   SCEVAddExpr *S =
     static_cast<SCEVAddExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
   if (!S) {
@@ -2105,7 +2137,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
   ID.AddInteger(scMulExpr);
   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
     ID.AddPointer(Ops[i]);
-  void *IP = 0;
+  void *IP = nullptr;
   SCEVMulExpr *S =
    static_cast<SCEVMulExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
   if (!S) {
@@ -2230,7 +2262,7 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
   ID.AddInteger(scUDivExpr);
   ID.AddPointer(LHS);
   ID.AddPointer(RHS);
-  void *IP = 0;
+  void *IP = nullptr;
   if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
   SCEV *S = new (SCEVAllocator) SCEVUDivExpr(ID.Intern(SCEVAllocator),
                                              LHS, RHS);
@@ -2425,7 +2457,7 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
   for (unsigned i = 0, e = Operands.size(); i != e; ++i)
     ID.AddPointer(Operands[i]);
   ID.AddPointer(L);
-  void *IP = 0;
+  void *IP = nullptr;
   SCEVAddRecExpr *S =
     static_cast<SCEVAddRecExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
   if (!S) {
@@ -2533,7 +2565,7 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
   ID.AddInteger(scSMaxExpr);
   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
     ID.AddPointer(Ops[i]);
-  void *IP = 0;
+  void *IP = nullptr;
   if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
   const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
   std::uninitialized_copy(Ops.begin(), Ops.end(), O);
@@ -2637,7 +2669,7 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
   ID.AddInteger(scUMaxExpr);
   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
     ID.AddPointer(Ops[i]);
-  void *IP = 0;
+  void *IP = nullptr;
   if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
   const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
   std::uninitialized_copy(Ops.begin(), Ops.end(), O);
@@ -2704,7 +2736,7 @@ const SCEV *ScalarEvolution::getUnknown(Value *V) {
   FoldingSetNodeID ID;
   ID.AddInteger(scUnknown);
   ID.AddPointer(V);
-  void *IP = 0;
+  void *IP = nullptr;
   if (SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) {
     assert(cast<SCEVUnknown>(S)->getValue() == V &&
            "Stale SCEVUnknown in uniquing map!");
@@ -3010,7 +3042,7 @@ const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) {
     return getPointerBase(Cast->getOperand());
   }
   else if (const SCEVNAryExpr *NAry = dyn_cast<SCEVNAryExpr>(V)) {
-    const SCEV *PtrOp = 0;
+    const SCEV *PtrOp = nullptr;
     for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end();
          I != E; ++I) {
       if ((*I)->getType()->isPointerTy()) {
@@ -3090,20 +3122,20 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
       // The loop may have multiple entrances or multiple exits; we can analyze
       // this phi as an addrec if it has a unique entry value and a unique
      // backedge value.
-      Value *BEValueV = 0, *StartValueV = 0;
+      Value *BEValueV = nullptr, *StartValueV = nullptr;
       for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
         Value *V = PN->getIncomingValue(i);
         if (L->contains(PN->getIncomingBlock(i))) {
           if (!BEValueV) {
             BEValueV = V;
           } else if (BEValueV != V) {
-            BEValueV = 0;
+            BEValueV = nullptr;
             break;
           }
         } else if (!StartValueV) {
           StartValueV = V;
         } else if (StartValueV != V) {
-          StartValueV = 0;
+          StartValueV = nullptr;
           break;
         }
       }
@@ -3363,7 +3395,7 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
     // For a SCEVUnknown, ask ValueTracking.
     unsigned BitWidth = getTypeSizeInBits(U->getType());
     APInt Zeros(BitWidth, 0), Ones(BitWidth, 0);
-    ComputeMaskedBits(U->getValue(), Zeros, Ones);
+    computeKnownBits(U->getValue(), Zeros, Ones);
     return Zeros.countTrailingOnes();
   }
 
@@ -3502,7 +3534,7 @@ ScalarEvolution::getUnsignedRange(const SCEV *S) {
   if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
     // For a SCEVUnknown, ask ValueTracking.
     APInt Zeros(BitWidth, 0), Ones(BitWidth, 0);
-    ComputeMaskedBits(U->getValue(), Zeros, Ones, DL);
+    computeKnownBits(U->getValue(), Zeros, Ones, DL);
     if (Ones == ~Zeros + 1)
       return setUnsignedRange(U, ConservativeResult);
     return setUnsignedRange(U,
@@ -3755,13 +3787,13 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
 
         // Instcombine's ShrinkDemandedConstant may strip bits out of
         // constants, obscuring what would otherwise be a low-bits mask.
-        // Use ComputeMaskedBits to compute what ShrinkDemandedConstant
+        // Use computeKnownBits to compute what ShrinkDemandedConstant
         // knew about to reconstruct a low-bits mask value.
         unsigned LZ = A.countLeadingZeros();
         unsigned TZ = A.countTrailingZeros();
         unsigned BitWidth = A.getBitWidth();
         APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-        ComputeMaskedBits(U->getOperand(0), KnownZero, KnownOne, DL);
+        computeKnownBits(U->getOperand(0), KnownZero, KnownOne, DL);
 
         APInt EffectiveMask =
             APInt::getLowBitsSet(BitWidth, BitWidth - LZ - TZ).shl(TZ);
@@ -4316,9 +4348,9 @@ ScalarEvolution::BackedgeTakenInfo::getExact(ScalarEvolution *SE) const {
   if (!ExitNotTaken.ExitingBlock) return SE->getCouldNotCompute();
   assert(ExitNotTaken.ExactNotTaken && "uninitialized not-taken info");
 
-  const SCEV *BECount = 0;
+  const SCEV *BECount = nullptr;
   for (const ExitNotTakenInfo *ENT = &ExitNotTaken;
-       ENT != 0; ENT = ENT->getNextExit()) {
+       ENT != nullptr; ENT = ENT->getNextExit()) {
 
     assert(ENT->ExactNotTaken != SE->getCouldNotCompute() && "bad exit SCEV");
 
@@ -4336,7 +4368,7 @@ const SCEV *
 ScalarEvolution::BackedgeTakenInfo::getExact(BasicBlock *ExitingBlock,
                                              ScalarEvolution *SE) const {
   for (const ExitNotTakenInfo *ENT = &ExitNotTaken;
-       ENT != 0; ENT = ENT->getNextExit()) {
+       ENT != nullptr; ENT = ENT->getNextExit()) {
 
     if (ENT->ExitingBlock == ExitingBlock)
       return ENT->ExactNotTaken;
@@ -4359,7 +4391,7 @@ bool ScalarEvolution::BackedgeTakenInfo::hasOperand(const SCEV *S,
     return false;
 
   for (const ExitNotTakenInfo *ENT = &ExitNotTaken;
-       ENT != 0; ENT = ENT->getNextExit()) {
+       ENT != nullptr; ENT = ENT->getNextExit()) {
 
     if (ENT->ExactNotTaken != SE->getCouldNotCompute()
         && SE->hasOperand(ENT->ExactNotTaken, S)) {
@@ -4398,8 +4430,8 @@ ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo(
 
 /// clear - Invalidate this result and free the ExitNotTakenInfo array.
 void ScalarEvolution::BackedgeTakenInfo::clear() {
-  ExitNotTaken.ExitingBlock = 0;
-  ExitNotTaken.ExactNotTaken = 0;
+  ExitNotTaken.ExitingBlock = nullptr;
+  ExitNotTaken.ExactNotTaken = nullptr;
   delete[] ExitNotTaken.getNextExit();
 }
 
@@ -4410,38 +4442,63 @@ ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) {
   SmallVector<BasicBlock *, 8> ExitingBlocks;
   L->getExitingBlocks(ExitingBlocks);
 
-  // Examine all exits and pick the most conservative values.
-  const SCEV *MaxBECount = getCouldNotCompute();
+  SmallVector<std::pair<BasicBlock *, const SCEV *>, 4> ExitCounts;
   bool CouldComputeBECount = true;
   BasicBlock *Latch = L->getLoopLatch(); // may be NULL.
-  const SCEV *LatchMaxCount = 0;
-  SmallVector<std::pair<BasicBlock *, const SCEV *>, 4> ExitCounts;
+  const SCEV *MustExitMaxBECount = nullptr;
+  const SCEV *MayExitMaxBECount = nullptr;
+
+  // Compute the ExitLimit for each loop exit. Use this to populate ExitCounts
+  // and compute maxBECount.
   for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
-    ExitLimit EL = ComputeExitLimit(L, ExitingBlocks[i]);
+    BasicBlock *ExitBB = ExitingBlocks[i];
+    ExitLimit EL = ComputeExitLimit(L, ExitBB);
+
+    // 1. For each exit that can be computed, add an entry to ExitCounts.
+    // CouldComputeBECount is true only if all exits can be computed.
     if (EL.Exact == getCouldNotCompute())
       // We couldn't compute an exact value for this exit, so
       // we won't be able to compute an exact value for the loop.
       CouldComputeBECount = false;
     else
-      ExitCounts.push_back(std::make_pair(ExitingBlocks[i], EL.Exact));
-
-    if (MaxBECount == getCouldNotCompute())
-      MaxBECount = EL.Max;
-    else if (EL.Max != getCouldNotCompute()) {
-      // We cannot take the "min" MaxBECount, because non-unit stride loops may
-      // skip some loop tests. Taking the max over the exits is sufficiently
-      // conservative.  TODO: We could do better taking into consideration
-      // non-latch exits that dominate the latch.
-      if (EL.MustExit && ExitingBlocks[i] == Latch)
-        LatchMaxCount = EL.Max;
-      else
-        MaxBECount = getUMaxFromMismatchedTypes(MaxBECount, EL.Max);
+      ExitCounts.push_back(std::make_pair(ExitBB, EL.Exact));
+
+    // 2. Derive the loop's MaxBECount from each exit's max number of
+    // non-exiting iterations. Partition the loop exits into two kinds:
+    // LoopMustExits and LoopMayExits.
+    //
+    // A LoopMustExit meets two requirements:
+    //
+    // (a) Its ExitLimit.MustExit flag must be set which indicates that the exit
+    // test condition cannot be skipped (the tested variable has unit stride or
+    // the test is less-than or greater-than, rather than a strict inequality).
+    //
+    // (b) It must dominate the loop latch, hence must be tested on every loop
+    // iteration.
+    //
+    // If any computable LoopMustExit is found, then MaxBECount is the minimum
+    // EL.Max of computable LoopMustExits. Otherwise, MaxBECount is
+    // conservatively the maximum EL.Max, where CouldNotCompute is considered
+    // greater than any computable EL.Max.
+    if (EL.MustExit && EL.Max != getCouldNotCompute() && Latch &&
+        DT->dominates(ExitBB, Latch)) {
+      if (!MustExitMaxBECount)
+        MustExitMaxBECount = EL.Max;
+      else {
+        MustExitMaxBECount =
+          getUMinFromMismatchedTypes(MustExitMaxBECount, EL.Max);
+      }
+    } else if (MayExitMaxBECount != getCouldNotCompute()) {
+      if (!MayExitMaxBECount || EL.Max == getCouldNotCompute())
+        MayExitMaxBECount = EL.Max;
+      else {
+        MayExitMaxBECount =
+          getUMaxFromMismatchedTypes(MayExitMaxBECount, EL.Max);
+      }
     }
   }
-  // Be more precise in the easy case of a loop latch that must exit.
-  if (LatchMaxCount) {
-    MaxBECount = getUMinFromMismatchedTypes(MaxBECount, LatchMaxCount);
-  }
+  const SCEV *MaxBECount = MustExitMaxBECount ? MustExitMaxBECount :
+    (MayExitMaxBECount ? MayExitMaxBECount : getCouldNotCompute());
   return BackedgeTakenInfo(ExitCounts, CouldComputeBECount, MaxBECount);
 }
 
@@ -4454,7 +4511,7 @@ ScalarEvolution::ComputeExitLimit(const Loop *L, BasicBlock *ExitingBlock) {
   // exit at this block and remember the exit block and whether all other targets
   // lead to the loop header.
   bool MustExecuteLoopHeader = true;
-  BasicBlock *Exit = 0;
+  BasicBlock *Exit = nullptr;
   for (succ_iterator SI = succ_begin(ExitingBlock), SE = succ_end(ExitingBlock);
        SI != SE; ++SI)
     if (!L->contains(*SI)) {
@@ -4800,7 +4857,7 @@ ScalarEvolution::ComputeLoadConstantCompareExitLimit(
     return getCouldNotCompute();
 
   // Okay, we allow one non-constant index into the GEP instruction.
-  Value *VarIdx = 0;
+  Value *VarIdx = nullptr;
   std::vector<Constant*> Indexes;
   unsigned VarIdxNum = 0;
   for (unsigned i = 2, e = GEP->getNumOperands(); i != e; ++i)
@@ -4810,7 +4867,7 @@ ScalarEvolution::ComputeLoadConstantCompareExitLimit(
       if (VarIdx) return getCouldNotCompute();  // Multiple non-constant idx's.
       VarIdx = GEP->getOperand(i);
       VarIdxNum = i-2;
-      Indexes.push_back(0);
+      Indexes.push_back(nullptr);
     }
 
   // Loop-invariant loads may be a byproduct of loop optimization. Skip them.
@@ -4841,7 +4898,7 @@ ScalarEvolution::ComputeLoadConstantCompareExitLimit(
     Constant *Result = ConstantFoldLoadThroughGEPIndices(GV->getInitializer(),
                                                          Indexes);
-    if (Result == 0) break;  // Cannot compute!
+    if (!Result) break;  // Cannot compute!
 
     // Evaluate the condition for this iteration.
     Result = ConstantExpr::getICmp(predicate, Result, RHS);
@@ -4902,14 +4959,14 @@ getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L,
 
   // Otherwise, we can evaluate this instruction if all of its operands are
   // constant or derived from a PHI node themselves.
-  PHINode *PHI = 0;
+  PHINode *PHI = nullptr;
   for (Instruction::op_iterator OpI = UseInst->op_begin(),
          OpE = UseInst->op_end(); OpI != OpE; ++OpI) {
     if (isa<Constant>(*OpI)) continue;
 
     Instruction *OpInst = dyn_cast<Instruction>(*OpI);
-    if (!OpInst || !canConstantEvolve(OpInst, L)) return 0;
+    if (!OpInst || !canConstantEvolve(OpInst, L)) return nullptr;
 
     PHINode *P = dyn_cast<PHINode>(OpInst);
     if (!P)
@@ -4923,8 +4980,10 @@ getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L,
       P = getConstantEvolvingPHIOperands(OpInst, L, PHIMap);
       PHIMap[OpInst] = P;
     }
-    if (P == 0) return 0;        // Not evolving from PHI
-    if (PHI && PHI != P) return 0;  // Evolving from multiple different PHIs.
+    if (!P)
+      return nullptr;  // Not evolving from PHI
+    if (PHI && PHI != P)
+      return nullptr;  // Evolving from multiple different PHIs.
     PHI = P;
   }
   // This is a expression evolving from a constant PHI!
@@ -4938,7 +4997,7 @@ getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L,
 /// constraints, return null.
 static PHINode *getConstantEvolvingPHI(Value *V, const Loop *L) {
   Instruction *I = dyn_cast<Instruction>(V);
-  if (I == 0 || !canConstantEvolve(I, L)) return 0;
+  if (!I || !canConstantEvolve(I, L)) return nullptr;
 
   if (PHINode *PN = dyn_cast<PHINode>(I)) {
     return PN;
@@ -4960,18 +5019,18 @@ static Constant *EvaluateExpression(Value *V, const Loop *L,
   // Convenient constant check, but redundant for recursive calls.
   if (Constant *C = dyn_cast<Constant>(V)) return C;
   Instruction *I = dyn_cast<Instruction>(V);
-  if (!I) return 0;
+  if (!I) return nullptr;
 
   if (Constant *C = Vals.lookup(I)) return C;
 
   // An instruction inside the loop depends on a value outside the loop that we
   // weren't given a mapping for, or a value such as a call inside the loop.
-  if (!canConstantEvolve(I, L)) return 0;
+  if (!canConstantEvolve(I, L)) return nullptr;
 
   // An unmapped PHI can be due to a branch or another loop inside this loop,
   // or due to this not being the initial iteration through a loop where we
   // couldn't compute the evolution of this particular PHI last time.
-  if (isa<PHINode>(I)) return 0;
+  if (isa<PHINode>(I)) return nullptr;
 
   std::vector<Constant*> Operands(I->getNumOperands());
 
@@ -4979,12 +5038,12 @@ static Constant *EvaluateExpression(Value *V, const Loop *L,
     Instruction *Operand = dyn_cast<Instruction>(I->getOperand(i));
     if (!Operand) {
       Operands[i] = dyn_cast<Constant>(I->getOperand(i));
-      if (!Operands[i]) return 0;
+      if (!Operands[i]) return nullptr;
       continue;
     }
     Constant *C = EvaluateExpression(Operand, L, Vals, DL, TLI);
     Vals[Operand] = C;
-    if (!C) return 0;
+    if (!C) return nullptr;
     Operands[i] = C;
   }
 
@@ -5013,7 +5072,7 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN,
     return I->second;
 
   if (BEs.ugt(MaxBruteForceIterations))
-    return ConstantEvolutionLoopExitValue[PN] = 0;  // Not going to evaluate it.
+    return ConstantEvolutionLoopExitValue[PN] = nullptr;  // Not going to evaluate it.
 
   Constant *&RetVal = ConstantEvolutionLoopExitValue[PN];
 
@@ -5025,22 +5084,22 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN,
   // entry must be a constant (coming in from outside of the loop), and the
   // second must be derived from the same PHI.
   bool SecondIsBackedge = L->contains(PN->getIncomingBlock(1));
-  PHINode *PHI = 0;
+  PHINode *PHI = nullptr;
   for (BasicBlock::iterator I = Header->begin();
        (PHI = dyn_cast<PHINode>(I)); ++I) {
     Constant *StartCST =
       dyn_cast<Constant>(PHI->getIncomingValue(!SecondIsBackedge));
-    if (StartCST == 0) continue;
+    if (!StartCST) continue;
     CurrentIterVals[PHI] = StartCST;
   }
   if (!CurrentIterVals.count(PN))
-    return RetVal = 0;
+    return RetVal = nullptr;
 
   Value *BEValue = PN->getIncomingValue(SecondIsBackedge);
 
   // Execute the loop symbolically to determine the exit value.
   if (BEs.getActiveBits() >= 32)
-    return RetVal = 0; // More than 2^32-1 iterations?? Not doing it!
+    return RetVal = nullptr; // More than 2^32-1 iterations?? Not doing it!
 
   unsigned NumIterations = BEs.getZExtValue(); // must be in range
   unsigned IterationNum = 0;
@@ -5053,8 +5112,8 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN,
     DenseMap<Instruction *, Constant *> NextIterVals;
     Constant *NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, DL,
                                            TLI);
-    if (NextPHI == 0)
-      return 0;        // Couldn't evaluate!
+    if (!NextPHI)
+      return nullptr;        // Couldn't evaluate!
     NextIterVals[PN] = NextPHI;
 
     bool StoppedEvolving = NextPHI == CurrentIterVals[PN];
@@ -5101,7 +5160,7 @@ const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L,
                                                           Value *Cond,
                                                           bool ExitWhen) {
   PHINode *PN = getConstantEvolvingPHI(Cond, L);
-  if (PN == 0) return getCouldNotCompute();
+  if (!PN) return getCouldNotCompute();
 
   // If the loop is canonicalized, the PHI will have exactly two entries.
   // That's the only form we support here.
@@ -5114,12 +5173,12 @@ const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L,
   // One entry must be a constant (coming in from outside of the loop), and the
   // second must be derived from the same PHI.
   bool SecondIsBackedge = L->contains(PN->getIncomingBlock(1));
-  PHINode *PHI = 0;
+  PHINode *PHI = nullptr;
   for (BasicBlock::iterator I = Header->begin();
       (PHI = dyn_cast<PHINode>(I)); ++I) {
     Constant *StartCST =
       dyn_cast<Constant>(PHI->getIncomingValue(!SecondIsBackedge));
-    if (StartCST == 0) continue;
+    if (!StartCST) continue;
     CurrentIterVals[PHI] = StartCST;
   }
   if (!CurrentIterVals.count(PN))
@@ -5189,7 +5248,7 @@ const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
     if (Values[u].first == L)
       return Values[u].second ? Values[u].second : V;
   }
-  Values.push_back(std::make_pair(L, static_cast<const SCEV *>(0)));
+  Values.push_back(std::make_pair(L, static_cast<const SCEV *>(nullptr)));
   // Otherwise compute it.
   const SCEV *C = computeSCEVAtScope(V, L);
   SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values2 =
      ValuesAtScopes[V];
@@ -5243,7 +5302,7 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) {
       }
       for (unsigned i = 1, e = SA->getNumOperands(); i != e; ++i) {
         Constant *C2 = BuildConstantFromSCEV(SA->getOperand(i));
-        if (!C2) return 0;
+        if (!C2) return nullptr;
 
         // First pointer!
         if (!C->getType()->isPointerTy() && C2->getType()->isPointerTy()) {
@@ -5258,7 +5317,7 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) {
         // Don't bother trying to sum two pointers. We probably can't
         // statically compute a load that results from it anyway.
         if (C2->getType()->isPointerTy())
-          return 0;
+          return nullptr;
 
         if (PointerType *PTy = dyn_cast<PointerType>(C->getType())) {
           if (PTy->getElementType()->isStructTy())
@@ -5276,10 +5335,10 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) {
     const SCEVMulExpr *SM = cast<SCEVMulExpr>(V);
     if (Constant *C = BuildConstantFromSCEV(SM->getOperand(0))) {
       // Don't bother with pointers at all.
-      if (C->getType()->isPointerTy()) return 0;
+      if (C->getType()->isPointerTy()) return nullptr;
       for (unsigned i = 1, e = SM->getNumOperands(); i != e; ++i) {
         Constant *C2 = BuildConstantFromSCEV(SM->getOperand(i));
-        if (!C2 || C2->getType()->isPointerTy()) return 0;
+        if (!C2 || C2->getType()->isPointerTy()) return nullptr;
         C = ConstantExpr::getMul(C, C2);
       }
       return C;
@@ -5298,7 +5357,7 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) {
   case scUMaxExpr:
     break; // TODO: smax, umax.
   }
-  return 0;
+  return nullptr;
 }
 
 const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
@@ -5365,7 +5424,7 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
 
         // Check to see if getSCEVAtScope actually made an improvement.
         if (MadeImprovement) {
-          Constant *C = 0;
+          Constant *C = nullptr;
           if (const CmpInst *CI = dyn_cast<CmpInst>(I))
             C = ConstantFoldCompareInstOperands(CI->getPredicate(),
                                                 Operands[0], Operands[1], DL,
@@ -5697,7 +5756,7 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool IsSubExpr) {
   // to 0, it must be counting down to equal 0. Consequently, N = Start / -Step.
   // We have not yet seen any such cases.
   const SCEVConstant *StepC = dyn_cast<SCEVConstant>(Step);
-  if (StepC == 0 || StepC->getValue()->equalsInt(0))
+  if (!StepC || StepC->getValue()->equalsInt(0))
     return getCouldNotCompute();
 
   // For positive steps (counting up until unsigned overflow):
@@ -6136,18 +6195,30 @@ bool ScalarEvolution::isKnownPredicate(ICmpInst::Predicate Pred,
 
   // If LHS or RHS is an addrec, check to see if the condition is true in
   // every iteration of the loop.
-  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS))
-    if (isLoopEntryGuardedByCond(
-          AR->getLoop(), Pred, AR->getStart(), RHS) &&
-        isLoopBackedgeGuardedByCond(
-          AR->getLoop(), Pred, AR->getPostIncExpr(*this), RHS))
-      return true;
-  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(RHS))
-    if (isLoopEntryGuardedByCond(
-          AR->getLoop(), Pred, LHS, AR->getStart()) &&
-        isLoopBackedgeGuardedByCond(
-          AR->getLoop(), Pred, LHS, AR->getPostIncExpr(*this)))
-      return true;
+  // If LHS and RHS are both addrec, both conditions must be true in
+  // every iteration of the loop.
+  const SCEVAddRecExpr *LAR = dyn_cast<SCEVAddRecExpr>(LHS);
+  const SCEVAddRecExpr *RAR = dyn_cast<SCEVAddRecExpr>(RHS);
+  bool LeftGuarded = false;
+  bool RightGuarded = false;
+  if (LAR) {
+    const Loop *L = LAR->getLoop();
+    if (isLoopEntryGuardedByCond(L, Pred, LAR->getStart(), RHS) &&
+        isLoopBackedgeGuardedByCond(L, Pred, LAR->getPostIncExpr(*this), RHS)) {
+      if (!RAR) return true;
+      LeftGuarded = true;
+    }
+  }
+  if (RAR) {
+    const Loop *L = RAR->getLoop();
+    if (isLoopEntryGuardedByCond(L, Pred, LHS, RAR->getStart()) &&
+        isLoopBackedgeGuardedByCond(L, Pred, LHS, RAR->getPostIncExpr(*this))) {
+      if (!LAR) return true;
+      RightGuarded = true;
+    }
+  }
+  if (LeftGuarded && RightGuarded)
+    return true;
 
   // Otherwise see what can be done with known constant ranges.
   return isKnownPredicateWithRanges(Pred, LHS, RHS);
@@ -6814,6 +6885,105 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
   return SE.getCouldNotCompute();
 }
 
+namespace {
+struct FindUndefs {
+  bool Found;
+  FindUndefs() : Found(false) {}
+
+  bool follow(const SCEV *S) {
+    if (const SCEVUnknown *C = dyn_cast<SCEVUnknown>(S)) {
+      if (isa<UndefValue>(C->getValue()))
+        Found = true;
+    } else if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
+      if (isa<UndefValue>(C->getValue()))
+        Found = true;
+    }
+
+    // Keep looking if we haven't found it yet.
+    return !Found;
+  }
+  bool isDone() const {
+    // Stop recursion if we have found an undef.
+    return Found;
+  }
+};
+}
+
+// Return true when S contains at least an undef value.
+static inline bool
+containsUndefs(const SCEV *S) {
+  FindUndefs F;
+  SCEVTraversal<FindUndefs> ST(F);
+  ST.visitAll(S);
+
+  return F.Found;
+}
+
+namespace {
+// Collect all steps of SCEV expressions.
+struct SCEVCollectStrides {
+  ScalarEvolution &SE;
+  SmallVectorImpl<const SCEV *> &Strides;
+
+  SCEVCollectStrides(ScalarEvolution &SE, SmallVectorImpl<const SCEV *> &S)
+      : SE(SE), Strides(S) {}
+
+  bool follow(const SCEV *S) {
+    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
+      Strides.push_back(AR->getStepRecurrence(SE));
+    return true;
+  }
+  bool isDone() const { return false; }
+};
+
+// Collect all SCEVUnknown and SCEVMulExpr expressions.
+struct SCEVCollectTerms {
+  SmallVectorImpl<const SCEV *> &Terms;
+
+  SCEVCollectTerms(SmallVectorImpl<const SCEV *> &T)
+      : Terms(T) {}
+
+  bool follow(const SCEV *S) {
+    if (isa<SCEVUnknown>(S) || isa<SCEVMulExpr>(S)) {
+      if (!containsUndefs(S))
+        Terms.push_back(S);
+
+      // Stop recursion: once we collected a term, do not walk its operands.
+      return false;
+    }
+
+    // Keep looking.
+    return true;
+  }
+  bool isDone() const { return false; }
+};
+}
+
+/// Find parametric terms in this SCEVAddRecExpr.
+void SCEVAddRecExpr::collectParametricTerms(
+    ScalarEvolution &SE, SmallVectorImpl<const SCEV *> &Terms) const {
+  SmallVector<const SCEV *, 4> Strides;
+  SCEVCollectStrides StrideCollector(SE, Strides);
+  visitAll(this, StrideCollector);
+
+  DEBUG({
+    dbgs() << "Strides:\n";
+    for (const SCEV *S : Strides)
+      dbgs() << *S << "\n";
+  });
+
+  for (const SCEV *S : Strides) {
+    SCEVCollectTerms TermCollector(Terms);
+    visitAll(S, TermCollector);
+  }
+
+  DEBUG({
+    dbgs() << "Terms:\n";
+    for (const SCEV *T : Terms)
+      dbgs() << *T << "\n";
+  });
+}
+
 static const APInt srem(const SCEVConstant *C1, const SCEVConstant *C2) {
   APInt A = C1->getValue()->getValue();
   APInt B = C2->getValue()->getValue();
@@ -6843,351 +7013,479 @@ static const APInt sdiv(const SCEVConstant *C1, const SCEVConstant *C2) {
 }
 
 namespace {
-struct SCEVGCD : public SCEVVisitor<SCEVGCD, const SCEV *> {
-public:
-  // Pattern match Step into Start. When Step is a multiply expression, find
-  // the largest subexpression of Step that appears in Start. When Start is an
-  // add expression, try to match Step in the subexpressions of Start, non
-  // matching subexpressions are returned under Remainder.
-  static const SCEV *findGCD(ScalarEvolution &SE, const SCEV *Start,
-                             const SCEV *Step, const SCEV **Remainder) {
-    assert(Remainder && "Remainder should not be NULL");
-    SCEVGCD R(SE, Step, SE.getConstant(Step->getType(), 0));
-    const SCEV *Res = R.visit(Start);
-    *Remainder = R.Remainder;
-    return Res;
-  }
+struct FindSCEVSize {
+  int Size;
+  FindSCEVSize() : Size(0) {}
 
-  SCEVGCD(ScalarEvolution &S, const SCEV *G, const SCEV *R)
-    : SE(S), GCD(G), Remainder(R) {
-    Zero = SE.getConstant(GCD->getType(), 0);
-    One = SE.getConstant(GCD->getType(), 1);
+  bool follow(const SCEV *S) {
+    ++Size;
+    // Keep looking at all operands of S.
+    return true;
   }
+  bool isDone() const {
+    return false;
+  }
+};
+}
 
-  const SCEV *visitConstant(const SCEVConstant *Constant) {
-    if (GCD == Constant || Constant == Zero)
-      return GCD;
+// Returns the size of the SCEV S.
+static inline int sizeOfSCEV(const SCEV *S) { + FindSCEVSize F; + SCEVTraversal ST(F); + ST.visitAll(S); + return F.Size; +} - if (const SCEVConstant *CGCD = dyn_cast(GCD)) { - const SCEV *Res = SE.getConstant(gcd(Constant, CGCD)); - if (Res != One) - return Res; +namespace { - Remainder = SE.getConstant(srem(Constant, CGCD)); - Constant = cast(SE.getMinusSCEV(Constant, Remainder)); - Res = SE.getConstant(gcd(Constant, CGCD)); - return Res; +struct SCEVDivision : public SCEVVisitor { +public: + // Computes the Quotient and Remainder of the division of Numerator by + // Denominator. + static void divide(ScalarEvolution &SE, const SCEV *Numerator, + const SCEV *Denominator, const SCEV **Quotient, + const SCEV **Remainder) { + assert(Numerator && Denominator && "Uninitialized SCEV"); + + SCEVDivision D(SE, Numerator, Denominator); + + // Check for the trivial case here to avoid having to check for it in the + // rest of the code. + if (Numerator == Denominator) { + *Quotient = D.One; + *Remainder = D.Zero; + return; } - // When GCD is not a constant, it could be that the GCD is an Add, Mul, - // AddRec, etc., in which case we want to find out how many times the - // Constant divides the GCD: we then return that as the new GCD. - const SCEV *Rem = Zero; - const SCEV *Res = findGCD(SE, GCD, Constant, &Rem); + if (Numerator->isZero()) { + *Quotient = D.Zero; + *Remainder = D.Zero; + return; + } - if (Res == One || Rem != Zero) { - Remainder = Constant; - return One; + // Split the Denominator when it is a product. + if (const SCEVMulExpr *T = dyn_cast(Denominator)) { + const SCEV *Q, *R; + *Quotient = Numerator; + for (const SCEV *Op : T->operands()) { + divide(SE, *Quotient, Op, &Q, &R); + *Quotient = Q; + + // Bail out when the Numerator is not divisible by one of the terms of + // the Denominator. + if (!R->isZero()) { + *Quotient = D.Zero; + *Remainder = Numerator; + return; + } + } + *Remainder = D.Zero; + return; } - assert(isa(Res) && "Res should be a constant"); - Remainder = SE.getConstant(srem(Constant, cast(Res))); - return Res; + D.visit(Numerator); + *Quotient = D.Quotient; + *Remainder = D.Remainder; + } + + SCEVDivision(ScalarEvolution &S, const SCEV *Numerator, const SCEV *Denominator) + : SE(S), Denominator(Denominator) { + Zero = SE.getConstant(Denominator->getType(), 0); + One = SE.getConstant(Denominator->getType(), 1); + + // By default, we don't know how to divide Expr by Denominator. + // Providing the default here simplifies the rest of the code. + Quotient = Zero; + Remainder = Numerator; + } + + // Except in the trivial case described above, we do not know how to divide + // Expr by Denominator for the following functions with empty implementation. 
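divide() above handles a product denominator by dividing through each factor in turn and failing the whole division, quotient zero and remainder equal to the numerator, as soon as any factor leaves a remainder. The same rule on plain integers:

#include <iostream>
#include <vector>

static void divide(long Numerator, const std::vector<long> &Factors,
                   long *Quotient, long *Remainder) {
  long Q = Numerator;
  for (long F : Factors) {
    if (Q % F != 0) {       // not evenly divisible by this factor:
      *Quotient = 0;        // fail the whole division.
      *Remainder = Numerator;
      return;
    }
    Q /= F;
  }
  *Quotient = Q;
  *Remainder = 0;
}

int main() {
  long Q, R;
  divide(24, {2, 3}, &Q, &R); // 24 / (2*3): Q = 4, R = 0
  std::cout << Q << " " << R << "\n";
  divide(25, {2, 3}, &Q, &R); // 25 is not divisible by 2: Q = 0, R = 25
  std::cout << Q << " " << R << "\n";
}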
+ void visitTruncateExpr(const SCEVTruncateExpr *Numerator) {} + void visitZeroExtendExpr(const SCEVZeroExtendExpr *Numerator) {} + void visitSignExtendExpr(const SCEVSignExtendExpr *Numerator) {} + void visitUDivExpr(const SCEVUDivExpr *Numerator) {} + void visitSMaxExpr(const SCEVSMaxExpr *Numerator) {} + void visitUMaxExpr(const SCEVUMaxExpr *Numerator) {} + void visitUnknown(const SCEVUnknown *Numerator) {} + void visitCouldNotCompute(const SCEVCouldNotCompute *Numerator) {} + + void visitConstant(const SCEVConstant *Numerator) { + if (const SCEVConstant *D = dyn_cast(Denominator)) { + Quotient = SE.getConstant(sdiv(Numerator, D)); + Remainder = SE.getConstant(srem(Numerator, D)); + return; + } } - const SCEV *visitTruncateExpr(const SCEVTruncateExpr *Expr) { - if (GCD != Expr) - Remainder = Expr; - return GCD; + void visitAddRecExpr(const SCEVAddRecExpr *Numerator) { + const SCEV *StartQ, *StartR, *StepQ, *StepR; + assert(Numerator->isAffine() && "Numerator should be affine"); + divide(SE, Numerator->getStart(), Denominator, &StartQ, &StartR); + divide(SE, Numerator->getStepRecurrence(SE), Denominator, &StepQ, &StepR); + Quotient = SE.getAddRecExpr(StartQ, StepQ, Numerator->getLoop(), + Numerator->getNoWrapFlags()); + Remainder = SE.getAddRecExpr(StartR, StepR, Numerator->getLoop(), + Numerator->getNoWrapFlags()); } - const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) { - if (GCD != Expr) - Remainder = Expr; - return GCD; - } + void visitAddExpr(const SCEVAddExpr *Numerator) { + SmallVector Qs, Rs; + Type *Ty = Denominator->getType(); - const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) { - if (GCD != Expr) - Remainder = Expr; - return GCD; - } + for (const SCEV *Op : Numerator->operands()) { + const SCEV *Q, *R; + divide(SE, Op, Denominator, &Q, &R); - const SCEV *visitAddExpr(const SCEVAddExpr *Expr) { - if (GCD == Expr) - return GCD; + // Bail out if types do not match. + if (Ty != Q->getType() || Ty != R->getType()) { + Quotient = Zero; + Remainder = Numerator; + return; + } - for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) { - const SCEV *Rem = Zero; - const SCEV *Res = findGCD(SE, Expr->getOperand(e - 1 - i), GCD, &Rem); + Qs.push_back(Q); + Rs.push_back(R); + } - // FIXME: There may be ambiguous situations: for instance, - // GCD(-4 + (3 * %m), 2 * %m) where 2 divides -4 and %m divides (3 * %m). - // The order in which the AddExpr is traversed computes a different GCD - // and Remainder. - if (Res != One) - GCD = Res; - if (Rem != Zero) - Remainder = SE.getAddExpr(Remainder, Rem); + if (Qs.size() == 1) { + Quotient = Qs[0]; + Remainder = Rs[0]; + return; } - return GCD; + Quotient = SE.getAddExpr(Qs); + Remainder = SE.getAddExpr(Rs); } - const SCEV *visitMulExpr(const SCEVMulExpr *Expr) { - if (GCD == Expr) - return GCD; + void visitMulExpr(const SCEVMulExpr *Numerator) { + SmallVector Qs; + Type *Ty = Denominator->getType(); - for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) { - if (Expr->getOperand(i) == GCD) - return GCD; - } + bool FoundDenominatorTerm = false; + for (const SCEV *Op : Numerator->operands()) { + // Bail out if types do not match. + if (Ty != Op->getType()) { + Quotient = Zero; + Remainder = Numerator; + return; + } + + if (FoundDenominatorTerm) { + Qs.push_back(Op); + continue; + } - // If we have not returned yet, it means that GCD is not part of Expr. 
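visitAddRecExpr and visitAddExpr both divide termwise: an affine numerator {Start,+,Step} divided by d becomes {Start/d,+,Step/d}, with the two remainders kept in a recurrence of their own. A toy version on concrete integers, with an Affine struct standing in for the add-recurrence:

#include <iostream>

struct Affine { long Start, Step; };   // models {Start,+,Step}<loop>

static void divideAffine(Affine N, long D, Affine *Q, Affine *R) {
  *Q = {N.Start / D, N.Step / D};      // quotients, taken termwise
  *R = {N.Start % D, N.Step % D};      // remainders, kept apart
}

int main() {
  Affine Q, R;
  divideAffine({8, 12}, 4, &Q, &R);    // {8,+,12} / 4
  std::cout << "{" << Q.Start << ",+," << Q.Step << "} rem {"
            << R.Start << ",+," << R.Step << "}\n"; // {2,+,3} rem {0,+,0}
}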
- const SCEV *PartialGCD = One; - for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) { - const SCEV *Rem = Zero; - const SCEV *Res = findGCD(SE, Expr->getOperand(i), GCD, &Rem); - if (Rem != Zero) - // GCD does not divide Expr->getOperand(i). + // Check whether Denominator divides one of the product operands. + const SCEV *Q, *R; + divide(SE, Op, Denominator, &Q, &R); + if (!R->isZero()) { + Qs.push_back(Op); continue; + } - if (Res == GCD) - return GCD; - PartialGCD = SE.getMulExpr(PartialGCD, Res); - if (PartialGCD == GCD) - return GCD; - } - - if (PartialGCD != One) - return PartialGCD; - - Remainder = Expr; - const SCEVMulExpr *Mul = dyn_cast(GCD); - if (!Mul) - return PartialGCD; - - // When the GCD is a multiply expression, try to decompose it: - // this occurs when Step does not divide the Start expression - // as in: {(-4 + (3 * %m)),+,(2 * %m)} - for (int i = 0, e = Mul->getNumOperands(); i < e; ++i) { - const SCEV *Rem = Zero; - const SCEV *Res = findGCD(SE, Expr, Mul->getOperand(i), &Rem); - if (Rem == Zero) { - Remainder = Rem; - return Res; + // Bail out if types do not match. + if (Ty != Q->getType()) { + Quotient = Zero; + Remainder = Numerator; + return; } + + FoundDenominatorTerm = true; + Qs.push_back(Q); } - return PartialGCD; - } + if (FoundDenominatorTerm) { + Remainder = Zero; + if (Qs.size() == 1) + Quotient = Qs[0]; + else + Quotient = SE.getMulExpr(Qs); + return; + } - const SCEV *visitUDivExpr(const SCEVUDivExpr *Expr) { - if (GCD != Expr) - Remainder = Expr; - return GCD; + if (!isa(Denominator)) { + Quotient = Zero; + Remainder = Numerator; + return; + } + + // The Remainder is obtained by replacing Denominator by 0 in Numerator. + ValueToValueMap RewriteMap; + RewriteMap[cast(Denominator)->getValue()] = + cast(Zero)->getValue(); + Remainder = SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true); + + // Quotient is (Numerator - Remainder) divided by Denominator. + const SCEV *Q, *R; + const SCEV *Diff = SE.getMinusSCEV(Numerator, Remainder); + if (sizeOfSCEV(Diff) > sizeOfSCEV(Numerator)) { + // This SCEV does not seem to simplify: fail the division here. + Quotient = Zero; + Remainder = Numerator; + return; + } + divide(SE, Diff, Denominator, &Q, &R); + assert(R == Zero && + "(Numerator - Remainder) should evenly divide Denominator"); + Quotient = Q; } - const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { - if (GCD == Expr) - return GCD; +private: + ScalarEvolution &SE; + const SCEV *Denominator, *Quotient, *Remainder, *Zero, *One; +}; +} - if (!Expr->isAffine()) { - Remainder = Expr; - return GCD; - } +static bool findArrayDimensionsRec(ScalarEvolution &SE, + SmallVectorImpl &Terms, + SmallVectorImpl &Sizes) { + int Last = Terms.size() - 1; + const SCEV *Step = Terms[Last]; - const SCEV *Rem = Zero; - const SCEV *Res = findGCD(SE, Expr->getOperand(0), GCD, &Rem); - if (Rem != Zero) - Remainder = SE.getAddExpr(Remainder, Rem); + // End of recursion. + if (Last == 0) { + if (const SCEVMulExpr *M = dyn_cast(Step)) { + SmallVector Qs; + for (const SCEV *Op : M->operands()) + if (!isa(Op)) + Qs.push_back(Op); - Rem = Zero; - Res = findGCD(SE, Expr->getOperand(1), Res, &Rem); - if (Rem != Zero) { - Remainder = Expr; - return GCD; + Step = SE.getMulExpr(Qs); } - return Res; + Sizes.push_back(Step); + return true; } - const SCEV *visitSMaxExpr(const SCEVSMaxExpr *Expr) { - if (GCD != Expr) - Remainder = Expr; - return GCD; - } + for (const SCEV *&Term : Terms) { + // Normalize the terms before the next call to findArrayDimensionsRec. 
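The SCEVUnknown fallback at the end of visitMulExpr uses a substitution trick: the remainder of dividing by a parameter p is the numerator with p replaced by 0, and the quotient is (numerator - remainder) / p. The same identity on a toy polynomial, treated as an opaque function of p just as SCEVParameterRewriter treats the expression:

#include <iostream>

int main() {
  // Numerator N(p) = 3*p*p + 5*p + 7.
  auto N = [](long p) { return 3 * p * p + 5 * p + 7; };
  long p = 10;
  long Remainder = N(0);                  // substitute p := 0, gives 7
  long Quotient = (N(p) - Remainder) / p; // (357 - 7) / 10 = 35 = 3*p + 5
  std::cout << "Q = " << Quotient << ", R = " << Remainder << "\n";
}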
+ const SCEV *Q, *R; + SCEVDivision::divide(SE, Term, Step, &Q, &R); - const SCEV *visitUMaxExpr(const SCEVUMaxExpr *Expr) { - if (GCD != Expr) - Remainder = Expr; - return GCD; - } + // Bail out when GCD does not evenly divide one of the terms. + if (!R->isZero()) + return false; - const SCEV *visitUnknown(const SCEVUnknown *Expr) { - if (GCD != Expr) - Remainder = Expr; - return GCD; + Term = Q; } - const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) { - return One; - } + // Remove all SCEVConstants. + Terms.erase(std::remove_if(Terms.begin(), Terms.end(), [](const SCEV *E) { + return isa(E); + }), + Terms.end()); -private: - ScalarEvolution &SE; - const SCEV *GCD, *Remainder, *Zero, *One; -}; + if (Terms.size() > 0) + if (!findArrayDimensionsRec(SE, Terms, Sizes)) + return false; -struct SCEVDivision : public SCEVVisitor { -public: - // Remove from Start all multiples of Step. - static const SCEV *divide(ScalarEvolution &SE, const SCEV *Start, - const SCEV *Step) { - SCEVDivision D(SE, Step); - const SCEV *Rem = D.Zero; - (void)Rem; - // The division is guaranteed to succeed: Step should divide Start with no - // remainder. - assert(Step == SCEVGCD::findGCD(SE, Start, Step, &Rem) && Rem == D.Zero && - "Step should divide Start with no remainder."); - return D.visit(Start); - } + Sizes.push_back(Step); + return true; +} + +namespace { +struct FindParameter { + bool FoundParameter; + FindParameter() : FoundParameter(false) {} - SCEVDivision(ScalarEvolution &S, const SCEV *G) : SE(S), GCD(G) { - Zero = SE.getConstant(GCD->getType(), 0); - One = SE.getConstant(GCD->getType(), 1); + bool follow(const SCEV *S) { + if (isa(S)) { + FoundParameter = true; + // Stop recursion: we found a parameter. + return false; + } + // Keep looking. + return true; } + bool isDone() const { + // Stop recursion if we have found a parameter. + return FoundParameter; + } +}; +} - const SCEV *visitConstant(const SCEVConstant *Constant) { - if (GCD == Constant) - return One; +// Returns true when S contains at least a SCEVUnknown parameter. +static inline bool +containsParameters(const SCEV *S) { + FindParameter F; + SCEVTraversal ST(F); + ST.visitAll(S); - if (const SCEVConstant *CGCD = dyn_cast(GCD)) - return SE.getConstant(sdiv(Constant, CGCD)); - return Constant; - } + return F.FoundParameter; +} - const SCEV *visitTruncateExpr(const SCEVTruncateExpr *Expr) { - if (GCD == Expr) - return One; - return Expr; - } +// Returns true when one of the SCEVs of Terms contains a SCEVUnknown parameter. +static inline bool +containsParameters(SmallVectorImpl &Terms) { + for (const SCEV *T : Terms) + if (containsParameters(T)) + return true; + return false; +} - const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) { - if (GCD == Expr) - return One; - return Expr; - } +// Return the number of product terms in S. 
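findArrayDimensionsRec above recovers array dimensions by dividing every term by the last (smallest) one, recording it as a dimension, and recursing on the quotients. A toy integer version, assuming terms are plain products and omitting the constant-factor stripping of the base case (names are illustrative):

#include <iostream>
#include <vector>

// Terms must be ordered with the most-factors term first, as
// findArrayDimensions arranges before calling the recursion.
static bool findDims(std::vector<long> &Terms, std::vector<long> &Sizes) {
  long Step = Terms.back();
  if (Terms.size() == 1) {      // end of recursion
    Sizes.push_back(Step);
    return true;
  }
  Terms.pop_back();
  for (long &T : Terms) {
    if (T % Step != 0)
      return false;             // bail when Step does not divide a term
    T /= Step;
  }
  if (!findDims(Terms, Sizes))
    return false;
  Sizes.push_back(Step);
  return true;
}

int main() {
  // Strides of A[i][j][k] on an n x 7 x 5 array (element size divided
  // out): 35 for i, 5 for j, 1 for k. The constant stride is dropped and
  // n, the outermost dimension, is never needed.
  std::vector<long> Terms = {35, 5};
  std::vector<long> Sizes;
  if (findDims(Terms, Sizes))
    for (long S : Sizes)
      std::cout << S << " ";    // prints: 7 5
  std::cout << "\n";
}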
+static inline int numberOfTerms(const SCEV *S) { + if (const SCEVMulExpr *Expr = dyn_cast(S)) + return Expr->getNumOperands(); + return 1; +} - const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) { - if (GCD == Expr) - return One; - return Expr; - } +static const SCEV *removeConstantFactors(ScalarEvolution &SE, const SCEV *T) { + if (isa(T)) + return nullptr; - const SCEV *visitAddExpr(const SCEVAddExpr *Expr) { - if (GCD == Expr) - return One; + if (isa(T)) + return T; - SmallVector Operands; - for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) - Operands.push_back(divide(SE, Expr->getOperand(i), GCD)); + if (const SCEVMulExpr *M = dyn_cast(T)) { + SmallVector Factors; + for (const SCEV *Op : M->operands()) + if (!isa(Op)) + Factors.push_back(Op); - if (Operands.size() == 1) - return Operands[0]; - return SE.getAddExpr(Operands); + return SE.getMulExpr(Factors); } - const SCEV *visitMulExpr(const SCEVMulExpr *Expr) { - if (GCD == Expr) - return One; + return T; +} - bool FoundGCDTerm = false; - for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) - if (Expr->getOperand(i) == GCD) - FoundGCDTerm = true; +/// Return the size of an element read or written by Inst. +const SCEV *ScalarEvolution::getElementSize(Instruction *Inst) { + Type *Ty; + if (StoreInst *Store = dyn_cast(Inst)) + Ty = Store->getValueOperand()->getType(); + else if (LoadInst *Load = dyn_cast(Inst)) + Ty = Load->getPointerOperand()->getType(); + else + return nullptr; - SmallVector Operands; - if (FoundGCDTerm) { - FoundGCDTerm = false; - for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) { - if (FoundGCDTerm) - Operands.push_back(Expr->getOperand(i)); - else if (Expr->getOperand(i) == GCD) - FoundGCDTerm = true; - else - Operands.push_back(Expr->getOperand(i)); - } - } else { - const SCEV *PartialGCD = One; - for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) { - if (PartialGCD == GCD) { - Operands.push_back(Expr->getOperand(i)); - continue; - } + Type *ETy = getEffectiveSCEVType(PointerType::getUnqual(Ty)); + return getSizeOfExpr(ETy, Ty); +} - const SCEV *Rem = Zero; - const SCEV *Res = SCEVGCD::findGCD(SE, Expr->getOperand(i), GCD, &Rem); - if (Rem == Zero) { - PartialGCD = SE.getMulExpr(PartialGCD, Res); - Operands.push_back(divide(SE, Expr->getOperand(i), GCD)); - } else { - Operands.push_back(Expr->getOperand(i)); - } - } - } +/// Second step of delinearization: compute the array dimensions Sizes from the +/// set of Terms extracted from the memory access function of this SCEVAddRec. +void ScalarEvolution::findArrayDimensions(SmallVectorImpl &Terms, + SmallVectorImpl &Sizes, + const SCEV *ElementSize) const { - if (Operands.size() == 1) - return Operands[0]; - return SE.getMulExpr(Operands); - } + if (Terms.size() < 1) + return; - const SCEV *visitUDivExpr(const SCEVUDivExpr *Expr) { - if (GCD == Expr) - return One; - return Expr; - } + // Early return when Terms do not contain parameters: we do not delinearize + // non parametric SCEVs. + if (!containsParameters(Terms)) + return; - const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { - if (GCD == Expr) - return One; + DEBUG({ + dbgs() << "Terms:\n"; + for (const SCEV *T : Terms) + dbgs() << *T << "\n"; + }); - assert(Expr->isAffine() && "Expr should be affine"); + // Remove duplicates. 
+ std::sort(Terms.begin(), Terms.end()); + Terms.erase(std::unique(Terms.begin(), Terms.end()), Terms.end()); - const SCEV *Start = divide(SE, Expr->getStart(), GCD); - const SCEV *Step = divide(SE, Expr->getStepRecurrence(SE), GCD); + // Put larger terms first. + std::sort(Terms.begin(), Terms.end(), [](const SCEV *LHS, const SCEV *RHS) { + return numberOfTerms(LHS) > numberOfTerms(RHS); + }); - return SE.getAddRecExpr(Start, Step, Expr->getLoop(), - Expr->getNoWrapFlags()); - } + ScalarEvolution &SE = *const_cast(this); - const SCEV *visitSMaxExpr(const SCEVSMaxExpr *Expr) { - if (GCD == Expr) - return One; - return Expr; + // Divide all terms by the element size. + for (const SCEV *&Term : Terms) { + const SCEV *Q, *R; + SCEVDivision::divide(SE, Term, ElementSize, &Q, &R); + Term = Q; } - const SCEV *visitUMaxExpr(const SCEVUMaxExpr *Expr) { - if (GCD == Expr) - return One; - return Expr; - } + SmallVector NewTerms; - const SCEV *visitUnknown(const SCEVUnknown *Expr) { - if (GCD == Expr) - return One; - return Expr; + // Remove constant factors. + for (const SCEV *T : Terms) + if (const SCEV *NewT = removeConstantFactors(SE, T)) + NewTerms.push_back(NewT); + + DEBUG({ + dbgs() << "Terms after sorting:\n"; + for (const SCEV *T : NewTerms) + dbgs() << *T << "\n"; + }); + + if (NewTerms.empty() || + !findArrayDimensionsRec(SE, NewTerms, Sizes)) { + Sizes.clear(); + return; } - const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) { - return Expr; + // The last element to be pushed into Sizes is the size of an element. + Sizes.push_back(ElementSize); + + DEBUG({ + dbgs() << "Sizes:\n"; + for (const SCEV *S : Sizes) + dbgs() << *S << "\n"; + }); +} + +/// Third step of delinearization: compute the access functions for the +/// Subscripts based on the dimensions in Sizes. +void SCEVAddRecExpr::computeAccessFunctions( + ScalarEvolution &SE, SmallVectorImpl &Subscripts, + SmallVectorImpl &Sizes) const { + + // Early exit in case this SCEV is not an affine multivariate function. + if (Sizes.empty() || !this->isAffine()) + return; + + const SCEV *Res = this; + int Last = Sizes.size() - 1; + for (int i = Last; i >= 0; i--) { + const SCEV *Q, *R; + SCEVDivision::divide(SE, Res, Sizes[i], &Q, &R); + + DEBUG({ + dbgs() << "Res: " << *Res << "\n"; + dbgs() << "Sizes[i]: " << *Sizes[i] << "\n"; + dbgs() << "Res divided by Sizes[i]:\n"; + dbgs() << "Quotient: " << *Q << "\n"; + dbgs() << "Remainder: " << *R << "\n"; + }); + + Res = Q; + + // Do not record the last subscript corresponding to the size of elements in + // the array. + if (i == Last) { + + // Bail out if the remainder is too complex. + if (isa(R)) { + Subscripts.clear(); + Sizes.clear(); + return; + } + + continue; + } + + // Record the access function for the current subscript. + Subscripts.push_back(R); } -private: - ScalarEvolution &SE; - const SCEV *GCD, *Zero, *One; -}; + // Also push in last position the remainder of the last division: it will be + // the access function of the innermost dimension. + Subscripts.push_back(Res); + + std::reverse(Subscripts.begin(), Subscripts.end()); + + DEBUG({ + dbgs() << "Subscripts:\n"; + for (const SCEV *S : Subscripts) + dbgs() << *S << "\n"; + }); } /// Splits the SCEV into two vectors of SCEVs representing the subscripts and @@ -7239,84 +7537,40 @@ private: /// asking for the SCEV of the memory access with respect to all enclosing /// loops, calling SCEV->delinearize on that and printing the results. 
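computeAccessFunctions above peels one subscript off per division, innermost dimension first, and reverses at the end. With constant sizes this collapses to repeated div/mod; the element-size division and the bail-out on a complex remainder are omitted in this sketch:

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  // Linear offset of A[2][3][4] in an n x 7 x 9 array:
  // ((2*7)+3)*9 + 4 = 157.
  long Res = 157;
  std::vector<long> Sizes = {7, 9};       // inner dimension sizes
  std::vector<long> Subscripts;
  for (int i = (int)Sizes.size() - 1; i >= 0; i--) {
    Subscripts.push_back(Res % Sizes[i]); // remainder = this subscript
    Res /= Sizes[i];                      // quotient feeds the next round
  }
  Subscripts.push_back(Res);              // outermost subscript
  std::reverse(Subscripts.begin(), Subscripts.end());
  for (long S : Subscripts)
    std::cout << S << " ";                // prints: 2 3 4
  std::cout << "\n";
}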
-const SCEV * -SCEVAddRecExpr::delinearize(ScalarEvolution &SE, - SmallVectorImpl &Subscripts, - SmallVectorImpl &Sizes) const { - // Early exit in case this SCEV is not an affine multivariate function. - if (!this->isAffine()) - return this; - - const SCEV *Start = this->getStart(); - const SCEV *Step = this->getStepRecurrence(SE); - - // Build the SCEV representation of the canonical induction variable in the - // loop of this SCEV. - const SCEV *Zero = SE.getConstant(this->getType(), 0); - const SCEV *One = SE.getConstant(this->getType(), 1); - const SCEV *IV = - SE.getAddRecExpr(Zero, One, this->getLoop(), this->getNoWrapFlags()); - - DEBUG(dbgs() << "(delinearize: " << *this << "\n"); - - // When the stride of this SCEV is 1, do not compute the GCD: the size of this - // subscript is 1, and this same SCEV for the access function. - const SCEV *Remainder = Zero; - const SCEV *GCD = One; - - // Find the GCD and Remainder of the Start and Step coefficients of this SCEV. - if (Step != One && !Step->isAllOnesValue()) - GCD = SCEVGCD::findGCD(SE, Start, Step, &Remainder); - - DEBUG(dbgs() << "GCD: " << *GCD << "\n"); - DEBUG(dbgs() << "Remainder: " << *Remainder << "\n"); - - const SCEV *Quotient = Start; - if (GCD != One && !GCD->isAllOnesValue()) - // As findGCD computed Remainder, GCD divides "Start - Remainder." The - // Quotient is then this SCEV without Remainder, scaled down by the GCD. The - // Quotient is what will be used in the next subscript delinearization. - Quotient = SCEVDivision::divide(SE, SE.getMinusSCEV(Start, Remainder), GCD); - - DEBUG(dbgs() << "Quotient: " << *Quotient << "\n"); - - const SCEV *Rem = Quotient; - if (const SCEVAddRecExpr *AR = dyn_cast(Quotient)) - // Recursively call delinearize on the Quotient until there are no more - // multiples that can be recognized. - Rem = AR->delinearize(SE, Subscripts, Sizes); - - // Scale up the canonical induction variable IV by whatever remains from the - // Step after division by the GCD: the GCD is the size of all the sub-array. - if (Step != One && !Step->isAllOnesValue() && GCD != One && - !GCD->isAllOnesValue() && Step != GCD) { - Step = SCEVDivision::divide(SE, Step, GCD); - IV = SE.getMulExpr(IV, Step); - } - // The access function in the current subscript is computed as the canonical - // induction variable IV (potentially scaled up by the step) and offset by - // Rem, the offset of delinearization in the sub-array. - const SCEV *Index = SE.getAddExpr(IV, Rem); - - // Record the access function and the size of the current subscript. - Subscripts.push_back(Index); - Sizes.push_back(GCD); +void SCEVAddRecExpr::delinearize(ScalarEvolution &SE, + SmallVectorImpl &Subscripts, + SmallVectorImpl &Sizes, + const SCEV *ElementSize) const { + // First step: collect parametric terms. + SmallVector Terms; + collectParametricTerms(SE, Terms); -#ifndef NDEBUG - int Size = Sizes.size(); - DEBUG(dbgs() << "succeeded to delinearize " << *this << "\n"); - DEBUG(dbgs() << "ArrayDecl[UnknownSize]"); - for (int i = 0; i < Size - 1; i++) - DEBUG(dbgs() << "[" << *Sizes[i] << "]"); - DEBUG(dbgs() << " with elements of " << *Sizes[Size - 1] << " bytes.\n"); - - DEBUG(dbgs() << "ArrayRef"); - for (int i = 0; i < Size; i++) - DEBUG(dbgs() << "[" << *Subscripts[i] << "]"); - DEBUG(dbgs() << "\n)\n"); -#endif + if (Terms.empty()) + return; + + // Second step: find subscript sizes. + SE.findArrayDimensions(Terms, Sizes, ElementSize); + + if (Sizes.empty()) + return; + + // Third step: compute the access functions for each subscript. 
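A compile-only usage sketch of the new interface, assuming the LLVM 3.5-era headers: delinearize no longer returns a SCEV but fills Subscripts and Sizes, and takes the element size, typically obtained from ScalarEvolution::getElementSize, as an extra argument.

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
using namespace llvm;

// Hypothetical caller; AR is the access function of MemoryInst.
static void delinearizeAccess(ScalarEvolution &SE, const SCEVAddRecExpr *AR,
                              Instruction *MemoryInst) {
  SmallVector<const SCEV *, 4> Subscripts, Sizes;
  AR->delinearize(SE, Subscripts, Sizes, SE.getElementSize(MemoryInst));
  if (Subscripts.empty())
    return; // delinearization failed; Sizes has been cleared as well
  // On success, Subscripts[i] is the access function of dimension i and
  // Sizes holds the array dimensions followed by the element size.
}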
+ computeAccessFunctions(SE, Subscripts, Sizes); + + if (Subscripts.empty()) + return; + + DEBUG({ + dbgs() << "succeeded to delinearize " << *this << "\n"; + dbgs() << "ArrayDecl[UnknownSize]"; + for (const SCEV *S : Sizes) + dbgs() << "[" << *S << "]"; - return Remainder; + dbgs() << "\nArrayRef"; + for (const SCEV *S : Subscripts) + dbgs() << "[" << *S << "]"; + dbgs() << "\n"; + }); } //===----------------------------------------------------------------------===// @@ -7368,7 +7622,8 @@ ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se) //===----------------------------------------------------------------------===// ScalarEvolution::ScalarEvolution() - : FunctionPass(ID), ValuesAtScopes(64), LoopDispositions(64), BlockDispositions(64), FirstUnknown(0) { + : FunctionPass(ID), ValuesAtScopes(64), LoopDispositions(64), + BlockDispositions(64), FirstUnknown(nullptr) { initializeScalarEvolutionPass(*PassRegistry::getPassRegistry()); } @@ -7376,7 +7631,7 @@ bool ScalarEvolution::runOnFunction(Function &F) { this->F = &F; LI = &getAnalysis(); DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis(); DT = &getAnalysis().getDomTree(); return false; @@ -7387,7 +7642,7 @@ void ScalarEvolution::releaseMemory() { // destructors, so that they release their references to their values. for (SCEVUnknown *U = FirstUnknown; U; U = U->Next) U->~SCEVUnknown(); - FirstUnknown = 0; + FirstUnknown = nullptr; ValueExprMap.clear(); diff --git a/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp b/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp index 7be6aca..6933f74 100644 --- a/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp +++ b/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp @@ -34,7 +34,7 @@ namespace { public: static char ID; // Class identification, replacement for typeinfo - ScalarEvolutionAliasAnalysis() : FunctionPass(ID), SE(0) { + ScalarEvolutionAliasAnalysis() : FunctionPass(ID), SE(nullptr) { initializeScalarEvolutionAliasAnalysisPass( *PassRegistry::getPassRegistry()); } @@ -102,7 +102,7 @@ ScalarEvolutionAliasAnalysis::GetBaseValue(const SCEV *S) { return U->getValue(); } // No Identified object found. - return 0; + return nullptr; } AliasAnalysis::AliasResult @@ -162,10 +162,10 @@ ScalarEvolutionAliasAnalysis::alias(const Location &LocA, if ((AO && AO != LocA.Ptr) || (BO && BO != LocB.Ptr)) if (alias(Location(AO ? AO : LocA.Ptr, AO ? +UnknownSize : LocA.Size, - AO ? 0 : LocA.TBAATag), + AO ? nullptr : LocA.TBAATag), Location(BO ? BO : LocB.Ptr, BO ? +UnknownSize : LocB.Size, - BO ? 0 : LocB.TBAATag)) == NoAlias) + BO ? nullptr : LocB.TBAATag)) == NoAlias) return NoAlias; // Forward the query to the next analysis. diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index fb3d595..b507043 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -44,7 +44,7 @@ Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty, // not allowed to move it. BasicBlock::iterator BIP = Builder.GetInsertPoint(); - Instruction *Ret = NULL; + Instruction *Ret = nullptr; // Check to see if there is already a cast! for (User *U : V->users()) @@ -627,21 +627,21 @@ static const Loop *PickMostRelevantLoop(const Loop *A, const Loop *B, const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) { // Test whether we've already computed the most relevant loop for this SCEV. 
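Most of the mechanical churn in this patch is the 0-to-nullptr sweep visible above, and it is not purely cosmetic. A small standalone example of the overload hazard that a literal 0 carries and nullptr does not:

#include <iostream>

static void set(int)   { std::cout << "int overload\n"; }
static void set(int *) { std::cout << "pointer overload\n"; }

int main() {
  set(0);       // prints "int overload": 0 is an int first
  set(nullptr); // prints "pointer overload": an unambiguous null pointer
}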
std::pair::iterator, bool> Pair = - RelevantLoops.insert(std::make_pair(S, static_cast(0))); + RelevantLoops.insert(std::make_pair(S, nullptr)); if (!Pair.second) return Pair.first->second; if (isa(S)) // A constant has no relevant loops. - return 0; + return nullptr; if (const SCEVUnknown *U = dyn_cast(S)) { if (const Instruction *I = dyn_cast(U->getValue())) return Pair.first->second = SE.LI->getLoopFor(I->getParent()); // A non-instruction has no relevant loops. - return 0; + return nullptr; } if (const SCEVNAryExpr *N = dyn_cast(S)) { - const Loop *L = 0; + const Loop *L = nullptr; if (const SCEVAddRecExpr *AR = dyn_cast(S)) L = AR->getLoop(); for (SCEVNAryExpr::op_iterator I = N->op_begin(), E = N->op_end(); @@ -716,7 +716,7 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) { // Emit instructions to add all the operands. Hoist as much as possible // out of loops, and form meaningful getelementptrs where possible. - Value *Sum = 0; + Value *Sum = nullptr; for (SmallVectorImpl >::iterator I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E; ) { const Loop *CurLoop = I->first; @@ -784,7 +784,7 @@ Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) { // Emit instructions to mul all the operands. Hoist as much as possible // out of loops. - Value *Prod = 0; + Value *Prod = nullptr; for (SmallVectorImpl >::iterator I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E; ) { const SCEV *Op = I->second; @@ -892,18 +892,18 @@ Instruction *SCEVExpander::getIVIncOperand(Instruction *IncV, Instruction *InsertPos, bool allowScale) { if (IncV == InsertPos) - return NULL; + return nullptr; switch (IncV->getOpcode()) { default: - return NULL; + return nullptr; // Check for a simple Add/Sub or GEP of a loop invariant step. case Instruction::Add: case Instruction::Sub: { Instruction *OInst = dyn_cast(IncV->getOperand(1)); if (!OInst || SE.DT->dominates(OInst, InsertPos)) return dyn_cast(IncV->getOperand(0)); - return NULL; + return nullptr; } case Instruction::BitCast: return dyn_cast(IncV->getOperand(0)); @@ -914,7 +914,7 @@ Instruction *SCEVExpander::getIVIncOperand(Instruction *IncV, continue; if (Instruction *OInst = dyn_cast(*I)) { if (!SE.DT->dominates(OInst, InsertPos)) - return NULL; + return nullptr; } if (allowScale) { // allow any kind of GEP as long as it can be hoisted. @@ -925,11 +925,11 @@ Instruction *SCEVExpander::getIVIncOperand(Instruction *IncV, // have 2 operands. i1* is used by the expander to represent an // address-size element. if (IncV->getNumOperands() != 2) - return NULL; + return nullptr; unsigned AS = cast(IncV->getType())->getAddressSpace(); if (IncV->getType() != Type::getInt1PtrTy(SE.getContext(), AS) && IncV->getType() != Type::getInt8PtrTy(SE.getContext(), AS)) - return NULL; + return nullptr; break; } return dyn_cast(IncV->getOperand(0)); @@ -1077,9 +1077,9 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, // Reuse a previously-inserted PHI, if present. BasicBlock *LatchBlock = L->getLoopLatch(); if (LatchBlock) { - PHINode *AddRecPhiMatch = 0; - Instruction *IncV = 0; - TruncTy = 0; + PHINode *AddRecPhiMatch = nullptr; + Instruction *IncV = nullptr; + TruncTy = nullptr; InvertStep = false; // Only try partially matching scevs that need truncation and/or @@ -1120,7 +1120,7 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, // Stop if we have found an exact match SCEV. 
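getRelevantLoop above uses the insert-then-test memoization idiom: insert a placeholder value first, and if insert() reports the key was already present, return the cached entry without recomputing. The idiom in isolation, on a std::map:

#include <iostream>
#include <map>
#include <string>

static int ExpensiveCalls = 0;

static int compute(const std::string &Key,
                   std::map<std::string, int> &Cache) {
  auto Pair = Cache.insert(std::make_pair(Key, 0)); // 0 is a placeholder
  if (!Pair.second)
    return Pair.first->second;            // already cached: reuse it
  ++ExpensiveCalls;
  return Pair.first->second = (int)Key.size(); // compute and memoize
}

int main() {
  std::map<std::string, int> Cache;
  compute("loop", Cache);
  compute("loop", Cache);
  std::cout << ExpensiveCalls << "\n"; // prints 1: second call hit the cache
}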
if (IsMatchingSCEV) { IncV = TempIncV; - TruncTy = 0; + TruncTy = nullptr; InvertStep = false; AddRecPhiMatch = PN; break; @@ -1243,13 +1243,13 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { PostIncLoopSet Loops; Loops.insert(L); Normalized = - cast(TransformForPostIncUse(Normalize, S, 0, 0, - Loops, SE, *SE.DT)); + cast(TransformForPostIncUse(Normalize, S, nullptr, + nullptr, Loops, SE, *SE.DT)); } // Strip off any non-loop-dominating component from the addrec start. const SCEV *Start = Normalized->getStart(); - const SCEV *PostLoopOffset = 0; + const SCEV *PostLoopOffset = nullptr; if (!SE.properlyDominates(Start, L->getHeader())) { PostLoopOffset = Start; Start = SE.getConstant(Normalized->getType(), 0); @@ -1261,7 +1261,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { // Strip off any non-loop-dominating component from the addrec step. const SCEV *Step = Normalized->getStepRecurrence(SE); - const SCEV *PostLoopScale = 0; + const SCEV *PostLoopScale = nullptr; if (!SE.dominates(Step, L->getHeader())) { PostLoopScale = Step; Step = SE.getConstant(Normalized->getType(), 1); @@ -1276,7 +1276,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { Type *ExpandTy = PostLoopScale ? IntTy : STy; // In some cases, we decide to reuse an existing phi node but need to truncate // it and/or invert the step. - Type *TruncTy = 0; + Type *TruncTy = nullptr; bool InvertStep = false; PHINode *PN = getAddRecExprPHILiterally(Normalized, L, ExpandTy, IntTy, TruncTy, InvertStep); @@ -1372,7 +1372,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { const Loop *L = S->getLoop(); // First check for an existing canonical IV in a suitable type. - PHINode *CanonicalIV = 0; + PHINode *CanonicalIV = nullptr; if (PHINode *PN = L->getCanonicalInductionVariable()) if (SE.getTypeSizeInBits(PN->getType()) >= SE.getTypeSizeInBits(Ty)) CanonicalIV = PN; @@ -1393,7 +1393,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { while (isa(NewInsertPt) || isa(NewInsertPt) || isa(NewInsertPt)) ++NewInsertPt; - V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), 0, + V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), nullptr, NewInsertPt); return V; } @@ -1666,7 +1666,8 @@ SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L, // Emit code for it. BuilderType::InsertPointGuard Guard(Builder); - PHINode *V = cast(expandCodeFor(H, 0, L->getHeader()->begin())); + PHINode *V = cast(expandCodeFor(H, nullptr, + L->getHeader()->begin())); return V; } diff --git a/lib/Analysis/ScalarEvolutionNormalization.cpp b/lib/Analysis/ScalarEvolutionNormalization.cpp index 1e4c0bd..e9db295 100644 --- a/lib/Analysis/ScalarEvolutionNormalization.cpp +++ b/lib/Analysis/ScalarEvolutionNormalization.cpp @@ -113,7 +113,7 @@ TransformImpl(const SCEV *S, Instruction *User, Value *OperandValToReplace) { // Transform each operand. for (SCEVNAryExpr::op_iterator I = AR->op_begin(), E = AR->op_end(); I != E; ++I) { - Operands.push_back(TransformSubExpr(*I, LUser, 0)); + Operands.push_back(TransformSubExpr(*I, LUser, nullptr)); } // Conservatively use AnyWrap until/unless we need FlagNW. 
const SCEV *Result = SE.getAddRecExpr(Operands, L, SCEV::FlagAnyWrap); diff --git a/lib/Analysis/SparsePropagation.cpp b/lib/Analysis/SparsePropagation.cpp index 87a4fa4..edd82f5 100644 --- a/lib/Analysis/SparsePropagation.cpp +++ b/lib/Analysis/SparsePropagation.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sparseprop" #include "llvm/Analysis/SparsePropagation.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -21,6 +20,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "sparseprop" + //===----------------------------------------------------------------------===// // AbstractLatticeFunction Implementation //===----------------------------------------------------------------------===// @@ -147,7 +148,7 @@ void SparseSolver::getFeasibleSuccessors(TerminatorInst &TI, return; Constant *C = LatticeFunc->GetConstant(BCValue, BI->getCondition(), *this); - if (C == 0 || !isa(C)) { + if (!C || !isa(C)) { // Non-constant values can go either way. Succs[0] = Succs[1] = true; return; @@ -189,7 +190,7 @@ void SparseSolver::getFeasibleSuccessors(TerminatorInst &TI, return; Constant *C = LatticeFunc->GetConstant(SCValue, SI.getCondition(), *this); - if (C == 0 || !isa(C)) { + if (!C || !isa(C)) { // All destinations are executable! Succs.assign(TI.getNumSuccessors(), true); return; diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 04d09f1..cdb0b79 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "tti" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" @@ -19,6 +18,8 @@ using namespace llvm; +#define DEBUG_TYPE "tti" + // Setup the analysis group to manage the TargetTransformInfo passes. INITIALIZE_ANALYSIS_GROUP(TargetTransformInfo, "Target Information", NoTTI) char TargetTransformInfo::ID = 0; @@ -234,7 +235,7 @@ namespace { struct NoTTI final : ImmutablePass, TargetTransformInfo { const DataLayout *DL; - NoTTI() : ImmutablePass(ID), DL(0) { + NoTTI() : ImmutablePass(ID), DL(nullptr) { initializeNoTTIPass(*PassRegistry::getPassRegistry()); } @@ -242,9 +243,9 @@ struct NoTTI final : ImmutablePass, TargetTransformInfo { // Note that this subclass is special, and must *not* call initializeTTI as // it does not chain. TopTTI = this; - PrevTTI = 0; + PrevTTI = nullptr; DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; } virtual void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -443,7 +444,7 @@ struct NoTTI final : ImmutablePass, TargetTransformInfo { // Otherwise delegate to the fully generic implementations. return getOperationCost(Operator::getOpcode(U), U->getType(), U->getNumOperands() == 1 ? 
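Both files above move #define DEBUG_TYPE from before the includes to after them. Around this release LLVM's headers started making use of DEBUG_TYPE themselves, so a definition that precedes the includes can affect how header code compiles; the safe pattern these hunks adopt (abridged here from the SparsePropagation hunk) is:

#include "llvm/Analysis/SparsePropagation.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Defined only after every include, so the macro cannot leak into, or
// collide with, anything the headers themselves compile.
#define DEBUG_TYPE "sparseprop"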
- U->getOperand(0)->getType() : 0); + U->getOperand(0)->getType() : nullptr); } bool hasBranchDivergence() const override { return false; } @@ -567,7 +568,7 @@ struct NoTTI final : ImmutablePass, TargetTransformInfo { } unsigned getShuffleCost(ShuffleKind Kind, Type *Ty, - int Index = 0, Type *SubTp = 0) const override { + int Index = 0, Type *SubTp = nullptr) const override { return 1; } @@ -581,7 +582,7 @@ struct NoTTI final : ImmutablePass, TargetTransformInfo { } unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy = 0) const override { + Type *CondTy = nullptr) const override { return 1; } diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp index 05daf18..f36f6f8 100644 --- a/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -144,7 +144,7 @@ namespace { const MDNode *Node; public: - TBAANode() : Node(0) {} + TBAANode() : Node(nullptr) {} explicit TBAANode(const MDNode *N) : Node(N) {} /// getNode - Get the MDNode for this TBAANode. @@ -182,7 +182,6 @@ namespace { const MDNode *Node; public: - TBAAStructTagNode() : Node(0) {} explicit TBAAStructTagNode(const MDNode *N) : Node(N) {} /// Get the MDNode for this TBAAStructTagNode. @@ -218,7 +217,7 @@ namespace { const MDNode *Node; public: - TBAAStructTypeNode() : Node(0) {} + TBAAStructTypeNode() : Node(nullptr) {} explicit TBAAStructTypeNode(const MDNode *N) : Node(N) {} /// Get the MDNode for this TBAAStructTypeNode. @@ -340,7 +339,8 @@ static bool isStructPathTBAA(const MDNode *MD) { bool TypeBasedAliasAnalysis::Aliases(const MDNode *A, const MDNode *B) const { - if (isStructPathTBAA(A)) + // Make sure that both MDNodes are struct-path aware. + if (isStructPathTBAA(A) && isStructPathTBAA(B)) return PathAliases(A, B); // Keep track of the root node for A and B. @@ -386,6 +386,10 @@ TypeBasedAliasAnalysis::Aliases(const MDNode *A, bool TypeBasedAliasAnalysis::PathAliases(const MDNode *A, const MDNode *B) const { + // Verify that both input nodes are struct-path aware. + assert(isStructPathTBAA(A) && "MDNode A is not struct-path aware."); + assert(isStructPathTBAA(B) && "MDNode B is not struct-path aware."); + // Keep track of the root node for A and B. TBAAStructTypeNode RootA, RootB; TBAAStructTagNode TagA(A), TagB(B); @@ -555,38 +559,40 @@ bool MDNode::isTBAAVtableAccess() const { MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { if (!A || !B) - return NULL; + return nullptr; if (A == B) return A; // For struct-path aware TBAA, we use the access type of the tag. - bool StructPath = isStructPathTBAA(A); + bool StructPath = isStructPathTBAA(A) && isStructPathTBAA(B); if (StructPath) { A = cast_or_null(A->getOperand(1)); - if (!A) return 0; + if (!A) return nullptr; B = cast_or_null(B->getOperand(1)); - if (!B) return 0; + if (!B) return nullptr; } SmallVector PathA; MDNode *T = A; while (T) { PathA.push_back(T); - T = T->getNumOperands() >= 2 ? cast_or_null(T->getOperand(1)) : 0; + T = T->getNumOperands() >= 2 ? cast_or_null(T->getOperand(1)) + : nullptr; } SmallVector PathB; T = B; while (T) { PathB.push_back(T); - T = T->getNumOperands() >= 2 ? cast_or_null(T->getOperand(1)) : 0; + T = T->getNumOperands() >= 2 ? 
cast_or_null(T->getOperand(1)) + : nullptr; } int IA = PathA.size() - 1; int IB = PathB.size() - 1; - MDNode *Ret = 0; + MDNode *Ret = nullptr; while (IA >= 0 && IB >=0) { if (PathA[IA] == PathB[IB]) Ret = PathA[IA]; @@ -599,7 +605,7 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) { return Ret; if (!Ret) - return 0; + return nullptr; // We need to convert from a type node to a tag node. Type *Int64 = IntegerType::get(A->getContext(), 64); Value *Ops[3] = { Ret, Ret, ConstantInt::get(Int64, 0) }; diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index 72617a0..4f48753 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -44,10 +45,10 @@ static unsigned getBitWidth(Type *Ty, const DataLayout *TD) { return TD ? TD->getPointerTypeSizeInBits(Ty) : 0; } -static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW, - APInt &KnownZero, APInt &KnownOne, - APInt &KnownZero2, APInt &KnownOne2, - const DataLayout *TD, unsigned Depth) { +static void computeKnownBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW, + APInt &KnownZero, APInt &KnownOne, + APInt &KnownZero2, APInt &KnownOne2, + const DataLayout *TD, unsigned Depth) { if (!Add) { if (ConstantInt *CLHS = dyn_cast(Op0)) { // We know that the top bits of C-X are clear if X contains less bits @@ -58,7 +59,7 @@ static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW, unsigned NLZ = (CLHS->getValue()+1).countLeadingZeros(); // NLZ can't be BitWidth with no sign bit APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1); - llvm::ComputeMaskedBits(Op1, KnownZero2, KnownOne2, TD, Depth+1); + llvm::computeKnownBits(Op1, KnownZero2, KnownOne2, TD, Depth+1); // If all of the MaskV bits are known to be zero, then we know the // output top bits are zero, because we now know that the output is @@ -79,13 +80,10 @@ static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW, // result. For an add, this works with either operand. For a subtract, // this only works if the known zeros are in the right operand. 
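The IA/IB loop in getMostGenericTBAA walks the two root-to-node paths from their root ends and remembers the last node on which they still agree, the deepest common ancestor. An LLVM-free sketch with strings in place of MDNodes:

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Paths from each node up to its root, node first, as built above.
  std::vector<std::string> PathA = {"int", "scalar", "root"};
  std::vector<std::string> PathB = {"float", "scalar", "root"};

  std::string Common;                 // empty means "no common node"
  int IA = (int)PathA.size() - 1, IB = (int)PathB.size() - 1;
  while (IA >= 0 && IB >= 0) {
    if (PathA[IA] == PathB[IB])
      Common = PathA[IA];             // still on the shared suffix
    else
      break;                          // paths diverge here
    --IA;
    --IB;
  }
  std::cout << Common << "\n";        // prints: scalar
}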
APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); - llvm::ComputeMaskedBits(Op0, LHSKnownZero, LHSKnownOne, TD, Depth+1); - assert((LHSKnownZero & LHSKnownOne) == 0 && - "Bits known to be one AND zero?"); + llvm::computeKnownBits(Op0, LHSKnownZero, LHSKnownOne, TD, Depth+1); unsigned LHSKnownZeroOut = LHSKnownZero.countTrailingOnes(); - llvm::ComputeMaskedBits(Op1, KnownZero2, KnownOne2, TD, Depth+1); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + llvm::computeKnownBits(Op1, KnownZero2, KnownOne2, TD, Depth+1); unsigned RHSKnownZeroOut = KnownZero2.countTrailingOnes(); // Determine which operand has more trailing zeros, and use that @@ -130,15 +128,13 @@ static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW, } } -static void ComputeMaskedBitsMul(Value *Op0, Value *Op1, bool NSW, - APInt &KnownZero, APInt &KnownOne, - APInt &KnownZero2, APInt &KnownOne2, - const DataLayout *TD, unsigned Depth) { +static void computeKnownBitsMul(Value *Op0, Value *Op1, bool NSW, + APInt &KnownZero, APInt &KnownOne, + APInt &KnownZero2, APInt &KnownOne2, + const DataLayout *TD, unsigned Depth) { unsigned BitWidth = KnownZero.getBitWidth(); - ComputeMaskedBits(Op1, KnownZero, KnownOne, TD, Depth+1); - ComputeMaskedBits(Op0, KnownZero2, KnownOne2, TD, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(Op1, KnownZero, KnownOne, TD, Depth+1); + computeKnownBits(Op0, KnownZero2, KnownOne2, TD, Depth+1); bool isKnownNegative = false; bool isKnownNonNegative = false; @@ -192,7 +188,7 @@ static void ComputeMaskedBitsMul(Value *Op0, Value *Op1, bool NSW, KnownOne.setBit(BitWidth - 1); } -void llvm::computeMaskedBitsLoad(const MDNode &Ranges, APInt &KnownZero) { +void llvm::computeKnownBitsLoad(const MDNode &Ranges, APInt &KnownZero) { unsigned BitWidth = KnownZero.getBitWidth(); unsigned NumRanges = Ranges.getNumOperands() / 2; assert(NumRanges >= 1); @@ -211,8 +207,9 @@ void llvm::computeMaskedBitsLoad(const MDNode &Ranges, APInt &KnownZero) { KnownZero = APInt::getHighBitsSet(BitWidth, MinLeadingZeros); } -/// ComputeMaskedBits - Determine which of the bits are known to be either zero -/// or one and return them in the KnownZero/KnownOne bit sets. + +/// Determine which bits of V are known to be either zero or one and return +/// them in the KnownZero/KnownOne bit sets. /// /// NOTE: we cannot consider 'undef' to be "IsZero" here. The problem is that /// we cannot optimize based on the assumption that it is zero without changing @@ -226,8 +223,8 @@ void llvm::computeMaskedBitsLoad(const MDNode &Ranges, APInt &KnownZero) { /// where V is a vector, known zero, and known one values are the /// same width as the vector element, and the bit is set only if it is true /// for all of the elements in the vector. 
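computeKnownBitsAddSub reasons about trailing zeros: no carry can enter bit positions where both operands are known zero, so those low bits of the sum are known zero too. The real code is sharper, borrowing extra bits from whichever operand has more trailing zeros; the coarsest form of the argument:

#include <algorithm>
#include <cstdint>
#include <iostream>

// Number of low bits known to be zero, i.e. countTrailingOnes of the
// KnownZero mask.
static unsigned knownTrailingZeros(uint32_t KnownZero) {
  unsigned N = 0;
  while (KnownZero & 1) {
    KnownZero >>= 1;
    ++N;
  }
  return N;
}

int main() {
  uint32_t KnownZeroLHS = 0x7; // low 3 bits of the LHS known zero
  uint32_t KnownZeroRHS = 0x3; // low 2 bits of the RHS known zero
  unsigned K = std::min(knownTrailingZeros(KnownZeroLHS),
                        knownTrailingZeros(KnownZeroRHS));
  std::cout << "sum: at least " << K << " trailing zero bits\n"; // 2
}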
-void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, - const DataLayout *TD, unsigned Depth) { +void llvm::computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, + const DataLayout *TD, unsigned Depth) { assert(V && "No Value?"); assert(Depth <= MaxDepth && "Limit Search Depth"); unsigned BitWidth = KnownZero.getBitWidth(); @@ -241,7 +238,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, V->getType()->getScalarSizeInBits() == BitWidth) && KnownZero.getBitWidth() == BitWidth && KnownOne.getBitWidth() == BitWidth && - "V, Mask, KnownOne and KnownZero should have same BitWidth"); + "V, KnownOne and KnownZero should have same BitWidth"); if (ConstantInt *CI = dyn_cast(V)) { // We know all of the bits for a constant! @@ -303,7 +300,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, if (GA->mayBeOverridden()) { KnownZero.clearAllBits(); KnownOne.clearAllBits(); } else { - ComputeMaskedBits(GA->getAliasee(), KnownZero, KnownOne, TD, Depth+1); + computeKnownBits(GA->getAliasee(), KnownZero, KnownOne, TD, Depth+1); } return; } @@ -341,49 +338,43 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, default: break; case Instruction::Load: if (MDNode *MD = cast(I)->getMetadata(LLVMContext::MD_range)) - computeMaskedBitsLoad(*MD, KnownZero); - return; + computeKnownBitsLoad(*MD, KnownZero); + break; case Instruction::And: { // If either the LHS or the RHS are Zero, the result is zero. - ComputeMaskedBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1); - ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1); + computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); // Output known-1 bits are only known if set in both the LHS & RHS. KnownOne &= KnownOne2; // Output known-0 are known to be clear if zero in either the LHS | RHS. KnownZero |= KnownZero2; - return; + break; } case Instruction::Or: { - ComputeMaskedBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1); - ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1); + computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); // Output known-0 bits are only known if clear in both the LHS & RHS. KnownZero &= KnownZero2; // Output known-1 are known to be set if set in either the LHS | RHS. KnownOne |= KnownOne2; - return; + break; } case Instruction::Xor: { - ComputeMaskedBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1); - ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1); + computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); // Output known-0 bits are known if clear or set in both the LHS & RHS. APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2); // Output known-1 are known to be set if set in only one of the LHS, RHS. 
KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2); KnownZero = KnownZeroOut; - return; + break; } case Instruction::Mul: { bool NSW = cast(I)->hasNoSignedWrap(); - ComputeMaskedBitsMul(I->getOperand(0), I->getOperand(1), NSW, + computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, KnownZero, KnownOne, KnownZero2, KnownOne2, TD, Depth); break; } @@ -391,42 +382,40 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, // For the purposes of computing leading zeros we can conservatively // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. - ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); + computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); unsigned LeadZ = KnownZero2.countLeadingOnes(); KnownOne2.clearAllBits(); KnownZero2.clearAllBits(); - ComputeMaskedBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1); + computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1); unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros(); if (RHSUnknownLeadingOnes != BitWidth) LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ); - return; + break; } case Instruction::Select: - ComputeMaskedBits(I->getOperand(2), KnownZero, KnownOne, TD, Depth+1); - ComputeMaskedBits(I->getOperand(1), KnownZero2, KnownOne2, TD, + computeKnownBits(I->getOperand(2), KnownZero, KnownOne, TD, Depth+1); + computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); // Only known if known in both the LHS and RHS. KnownOne &= KnownOne2; KnownZero &= KnownZero2; - return; + break; case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::SIToFP: case Instruction::UIToFP: - return; // Can't work with floating point. + break; // Can't work with floating point. case Instruction::PtrToInt: case Instruction::IntToPtr: // We can't handle these if we don't know the pointer size. - if (!TD) return; + if (!TD) break; // FALL THROUGH and handle them the same as zext/trunc. case Instruction::ZExt: case Instruction::Trunc: { @@ -439,19 +428,19 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, SrcBitWidth = TD->getTypeSizeInBits(SrcTy->getScalarType()); } else { SrcBitWidth = SrcTy->getScalarSizeInBits(); - if (!SrcBitWidth) return; + if (!SrcBitWidth) break; } assert(SrcBitWidth && "SrcBitWidth can't be zero"); KnownZero = KnownZero.zextOrTrunc(SrcBitWidth); KnownOne = KnownOne.zextOrTrunc(SrcBitWidth); - ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); + computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); KnownZero = KnownZero.zextOrTrunc(BitWidth); KnownOne = KnownOne.zextOrTrunc(BitWidth); // Any top bits are known to be zero. 
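The And, Or and Xor cases above are pure transfer functions on (KnownZero, KnownOne) pairs. A self-checking, LLVM-free model using 32-bit masks, ending with the same invariant the patch now asserts once at the end of computeKnownBits:

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Zero has a bit set where the value is known 0, One where it is known 1;
// bits clear in both masks are unknown.
struct Known { uint32_t Zero, One; };

Known knownAnd(Known A, Known B) {
  return {A.Zero | B.Zero,   // a 0 on either side forces a 0
          A.One & B.One};    // 1 only if 1 on both sides
}
Known knownOr(Known A, Known B) {
  return {A.Zero & B.Zero, A.One | B.One};
}
Known knownXor(Known A, Known B) {
  return {(A.Zero & B.Zero) | (A.One & B.One),   // equal known bits -> 0
          (A.Zero & B.One) | (A.One & B.Zero)};  // opposite known bits -> 1
}

int main() {
  Known A = {0x0F, 0x30};  // bits 0-3 known 0, bits 4-5 known 1
  Known B = {0x03, 0x0C};  // bits 0-1 known 0, bits 2-3 known 1
  for (Known R : {knownAnd(A, B), knownOr(A, B), knownXor(A, B)})
    assert((R.Zero & R.One) == 0 && "Bits known to be one AND zero?");
  assert(knownAnd(A, B).Zero == 0x0F && knownAnd(A, B).One == 0x00);
  assert(knownXor(A, B).Zero == 0x03 && knownXor(A, B).One == 0x0C);
  return 0;
}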
if (BitWidth > SrcBitWidth) KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth); - return; + break; } case Instruction::BitCast: { Type *SrcTy = I->getOperand(0)->getType(); @@ -459,8 +448,8 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, // TODO: For now, not handling conversions like: // (bitcast i64 %x to <2 x i32>) !I->getType()->isVectorTy()) { - ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); - return; + computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); + break; } break; } @@ -470,8 +459,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, KnownZero = KnownZero.trunc(SrcBitWidth); KnownOne = KnownOne.trunc(SrcBitWidth); - ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); @@ -481,18 +469,17 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth); else if (KnownOne[SrcBitWidth-1]) // Input sign bit known set KnownOne |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth); - return; + break; } case Instruction::Shl: // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0 if (ConstantInt *SA = dyn_cast(I->getOperand(1))) { uint64_t ShiftAmt = SA->getLimitedValue(BitWidth); - ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); KnownZero <<= ShiftAmt; KnownOne <<= ShiftAmt; KnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt); // low bits known 0 - return; + break; } break; case Instruction::LShr: @@ -502,13 +489,12 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, uint64_t ShiftAmt = SA->getLimitedValue(BitWidth); // Unsigned shift right. - ComputeMaskedBits(I->getOperand(0), KnownZero,KnownOne, TD, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(I->getOperand(0), KnownZero,KnownOne, TD, Depth+1); KnownZero = APIntOps::lshr(KnownZero, ShiftAmt); KnownOne = APIntOps::lshr(KnownOne, ShiftAmt); // high bits known zero. KnownZero |= APInt::getHighBitsSet(BitWidth, ShiftAmt); - return; + break; } break; case Instruction::AShr: @@ -518,8 +504,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1); // Signed shift right. - ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); KnownZero = APIntOps::lshr(KnownZero, ShiftAmt); KnownOne = APIntOps::lshr(KnownOne, ShiftAmt); @@ -528,19 +513,19 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, KnownZero |= HighBits; else if (KnownOne[BitWidth-ShiftAmt-1]) // New bits are known one. 
KnownOne |= HighBits; - return; + break; } break; case Instruction::Sub: { bool NSW = cast(I)->hasNoSignedWrap(); - ComputeMaskedBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW, + computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW, KnownZero, KnownOne, KnownZero2, KnownOne2, TD, Depth); break; } case Instruction::Add: { bool NSW = cast(I)->hasNoSignedWrap(); - ComputeMaskedBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW, + computeKnownBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW, KnownZero, KnownOne, KnownZero2, KnownOne2, TD, Depth); break; @@ -550,7 +535,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, APInt RA = Rem->getValue().abs(); if (RA.isPowerOf2()) { APInt LowBits = RA - 1; - ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); + computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); // The low bits of the first operand are unchanged by the srem. KnownZero = KnownZero2 & LowBits; @@ -574,8 +559,8 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, // remainder is zero. if (KnownZero.isNonNegative()) { APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, TD, - Depth+1); + computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, TD, + Depth+1); // If it's known zero, our sign bit is also zero. if (LHSKnownZero.isNegative()) KnownZero.setBit(BitWidth - 1); @@ -587,9 +572,8 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, APInt RA = Rem->getValue(); if (RA.isPowerOf2()) { APInt LowBits = (RA - 1); - ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, - Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, + Depth+1); KnownZero |= ~LowBits; KnownOne &= LowBits; break; @@ -598,8 +582,8 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, // Since the result is less than or equal to either operand, any leading // zero bits in either operand must also exist in the result. - ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); - ComputeMaskedBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1); + computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); + computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1); unsigned Leaders = std::max(KnownZero.countLeadingOnes(), KnownZero2.countLeadingOnes()); @@ -622,8 +606,8 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, // Analyze all of the subscripts of this getelementptr instruction // to determine if we can prove known low zero bits. APInt LocalKnownZero(BitWidth, 0), LocalKnownOne(BitWidth, 0); - ComputeMaskedBits(I->getOperand(0), LocalKnownZero, LocalKnownOne, TD, - Depth+1); + computeKnownBits(I->getOperand(0), LocalKnownZero, LocalKnownOne, TD, + Depth+1); unsigned TrailZ = LocalKnownZero.countTrailingOnes(); gep_type_iterator GTI = gep_type_begin(I); @@ -631,8 +615,10 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, Value *Index = I->getOperand(i); if (StructType *STy = dyn_cast(*GTI)) { // Handle struct member offset arithmetic. 
- if (!TD) - return; + if (!TD) { + TrailZ = 0; + break; + } // Handle case when index is vector zeroinitializer Constant *CIndex = cast(Index); @@ -650,11 +636,14 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, } else { // Handle array index arithmetic. Type *IndexedTy = GTI.getIndexedType(); - if (!IndexedTy->isSized()) return; + if (!IndexedTy->isSized()) { + TrailZ = 0; + break; + } unsigned GEPOpiBits = Index->getType()->getScalarSizeInBits(); uint64_t TypeSize = TD ? TD->getTypeAllocSize(IndexedTy) : 1; LocalKnownZero = LocalKnownOne = APInt(GEPOpiBits, 0); - ComputeMaskedBits(Index, LocalKnownZero, LocalKnownOne, TD, Depth+1); + computeKnownBits(Index, LocalKnownZero, LocalKnownOne, TD, Depth+1); TrailZ = std::min(TrailZ, unsigned(countTrailingZeros(TypeSize) + LocalKnownZero.countTrailingOnes())); @@ -696,11 +685,11 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, break; // Ok, we have a PHI of the form L op= R. Check for low // zero bits. - ComputeMaskedBits(R, KnownZero2, KnownOne2, TD, Depth+1); + computeKnownBits(R, KnownZero2, KnownOne2, TD, Depth+1); // We need to take the minimum number of known bits APInt KnownZero3(KnownZero), KnownOne3(KnownOne); - ComputeMaskedBits(L, KnownZero3, KnownOne3, TD, Depth+1); + computeKnownBits(L, KnownZero3, KnownOne3, TD, Depth+1); KnownZero = APInt::getLowBitsSet(BitWidth, std::min(KnownZero2.countTrailingOnes(), @@ -712,7 +701,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, // Unreachable blocks may have zero-operand PHI nodes. if (P->getNumIncomingValues() == 0) - return; + break; // Otherwise take the unions of the known bit sets of the operands, // taking conservative care to avoid excessive recursion. @@ -731,8 +720,8 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, KnownOne2 = APInt(BitWidth, 0); // Recurse, but cap the recursion to one level, because we don't // want to waste time spinning around in loops. 
- ComputeMaskedBits(P->getIncomingValue(i), KnownZero2, KnownOne2, TD, - MaxDepth-1); + computeKnownBits(P->getIncomingValue(i), KnownZero2, KnownOne2, TD, + MaxDepth-1); KnownZero &= KnownZero2; KnownOne &= KnownOne2; // If all bits have been ruled out, there's no need to check @@ -776,30 +765,32 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, default: break; case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: - ComputeMaskedBitsAddSub(true, II->getArgOperand(0), - II->getArgOperand(1), false, KnownZero, - KnownOne, KnownZero2, KnownOne2, TD, Depth); + computeKnownBitsAddSub(true, II->getArgOperand(0), + II->getArgOperand(1), false, KnownZero, + KnownOne, KnownZero2, KnownOne2, TD, Depth); break; case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: - ComputeMaskedBitsAddSub(false, II->getArgOperand(0), - II->getArgOperand(1), false, KnownZero, - KnownOne, KnownZero2, KnownOne2, TD, Depth); + computeKnownBitsAddSub(false, II->getArgOperand(0), + II->getArgOperand(1), false, KnownZero, + KnownOne, KnownZero2, KnownOne2, TD, Depth); break; case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: - ComputeMaskedBitsMul(II->getArgOperand(0), II->getArgOperand(1), - false, KnownZero, KnownOne, - KnownZero2, KnownOne2, TD, Depth); + computeKnownBitsMul(II->getArgOperand(0), II->getArgOperand(1), + false, KnownZero, KnownOne, + KnownZero2, KnownOne2, TD, Depth); break; } } } } + + assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); } /// ComputeSignBit - Determine whether the sign bit is known to be zero or -/// one. Convenience wrapper around ComputeMaskedBits. +/// one. Convenience wrapper around computeKnownBits. void llvm::ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne, const DataLayout *TD, unsigned Depth) { unsigned BitWidth = getBitWidth(V->getType(), TD); @@ -810,7 +801,7 @@ void llvm::ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne, } APInt ZeroBits(BitWidth, 0); APInt OneBits(BitWidth, 0); - ComputeMaskedBits(V, ZeroBits, OneBits, TD, Depth); + computeKnownBits(V, ZeroBits, OneBits, TD, Depth); KnownOne = OneBits[BitWidth - 1]; KnownZero = ZeroBits[BitWidth - 1]; } @@ -842,7 +833,7 @@ bool llvm::isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth) { if (Depth++ == MaxDepth) return false; - Value *X = 0, *Y = 0; + Value *X = nullptr, *Y = nullptr; // A shift of a power of two is a power of two or zero. if (OrZero && (match(V, m_Shl(m_Value(X), m_Value())) || match(V, m_Shr(m_Value(X), m_Value())))) @@ -882,10 +873,10 @@ bool llvm::isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth) { unsigned BitWidth = V->getType()->getScalarSizeInBits(); APInt LHSZeroBits(BitWidth, 0), LHSOneBits(BitWidth, 0); - ComputeMaskedBits(X, LHSZeroBits, LHSOneBits, 0, Depth); + computeKnownBits(X, LHSZeroBits, LHSOneBits, nullptr, Depth); APInt RHSZeroBits(BitWidth, 0), RHSOneBits(BitWidth, 0); - ComputeMaskedBits(Y, RHSZeroBits, RHSOneBits, 0, Depth); + computeKnownBits(Y, RHSZeroBits, RHSOneBits, nullptr, Depth); // If i8 V is a power of two or zero: // ZeroBits: 1 1 1 0 1 1 1 1 // ~ZeroBits: 0 0 0 1 0 0 0 0 @@ -1005,7 +996,7 @@ bool llvm::isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth) { unsigned BitWidth = getBitWidth(V->getType()->getScalarType(), TD); // X | Y != 0 if X != 0 or Y != 0. 
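The comment above is the first of several patterns isKnownNonZero matches. One concrete case (illustrative, not from the patch): an or with a known-one low bit is provably nonzero, which is exactly the KnownOne[0] check that follows:

    define i1 @nonzero(i32 %x) {
      %v = or i32 %x, 1        ; low bit known one, so isKnownNonZero(%v) holds
      %c = icmp ne i32 %v, 0   ; a pass using this analysis may fold %c to true
      ret i1 %c
    }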
- Value *X = 0, *Y = 0; + Value *X = nullptr, *Y = nullptr; if (match(V, m_Or(m_Value(X), m_Value(Y)))) return isKnownNonZero(X, TD, Depth) || isKnownNonZero(Y, TD, Depth); @@ -1023,7 +1014,7 @@ bool llvm::isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth) { APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(X, KnownZero, KnownOne, TD, Depth); + computeKnownBits(X, KnownZero, KnownOne, TD, Depth); if (KnownOne[0]) return true; } @@ -1065,12 +1056,12 @@ bool llvm::isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth) { APInt Mask = APInt::getSignedMaxValue(BitWidth); // The sign bit of X is set. If some other bit is set then X is not equal // to INT_MIN. - ComputeMaskedBits(X, KnownZero, KnownOne, TD, Depth); + computeKnownBits(X, KnownZero, KnownOne, TD, Depth); if ((KnownOne & Mask) != 0) return true; // The sign bit of Y is set. If some other bit is set then Y is not equal // to INT_MIN. - ComputeMaskedBits(Y, KnownZero, KnownOne, TD, Depth); + computeKnownBits(Y, KnownZero, KnownOne, TD, Depth); if ((KnownOne & Mask) != 0) return true; } @@ -1100,7 +1091,7 @@ bool llvm::isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth) { if (!BitWidth) return false; APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth); + computeKnownBits(V, KnownZero, KnownOne, TD, Depth); return KnownOne != 0; } @@ -1116,8 +1107,7 @@ bool llvm::isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth) { bool llvm::MaskedValueIsZero(Value *V, const APInt &Mask, const DataLayout *TD, unsigned Depth) { APInt KnownZero(Mask.getBitWidth(), 0), KnownOne(Mask.getBitWidth(), 0); - ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(V, KnownZero, KnownOne, TD, Depth); return (KnownZero & Mask) == Mask; } @@ -1142,7 +1132,7 @@ unsigned llvm::ComputeNumSignBits(Value *V, const DataLayout *TD, unsigned Tmp, Tmp2; unsigned FirstAnswer = 1; - // Note that ConstantInt is handled by the general ComputeMaskedBits case + // Note that ConstantInt is handled by the general computeKnownBits case // below. if (Depth == 6) @@ -1187,7 +1177,7 @@ unsigned llvm::ComputeNumSignBits(Value *V, const DataLayout *TD, FirstAnswer = std::min(Tmp, Tmp2); // We computed what we know about the sign bits as our first // answer. Now proceed to the generic code that uses - // ComputeMaskedBits, and pick whichever answer is better. + // computeKnownBits, and pick whichever answer is better. } break; @@ -1207,7 +1197,7 @@ unsigned llvm::ComputeNumSignBits(Value *V, const DataLayout *TD, if (ConstantInt *CRHS = dyn_cast(U->getOperand(1))) if (CRHS->isAllOnesValue()) { APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0); - ComputeMaskedBits(U->getOperand(0), KnownZero, KnownOne, TD, Depth+1); + computeKnownBits(U->getOperand(0), KnownZero, KnownOne, TD, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. @@ -1232,7 +1222,7 @@ unsigned llvm::ComputeNumSignBits(Value *V, const DataLayout *TD, if (ConstantInt *CLHS = dyn_cast(U->getOperand(0))) if (CLHS->isNullValue()) { APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0); - ComputeMaskedBits(U->getOperand(1), KnownZero, KnownOne, TD, Depth+1); + computeKnownBits(U->getOperand(1), KnownZero, KnownOne, TD, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. 
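For context on the ComputeNumSignBits hunks above, a small example of the quantity being computed (illustrative only):

    define i32 @signbits(i8 %v) {
      %e = sext i8 %v to i32   ; the top 25 bits of %e are copies of the sign
                               ; bit, so ComputeNumSignBits(%e) returns 25
      ret i32 %e
    }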
if ((KnownZero | APInt(TyBits, 1)).isAllOnesValue()) @@ -1278,7 +1268,7 @@ unsigned llvm::ComputeNumSignBits(Value *V, const DataLayout *TD, // use this information. APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0); APInt Mask; - ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth); + computeKnownBits(V, KnownZero, KnownOne, TD, Depth); if (KnownZero.isNegative()) { // sign bit is 0 Mask = KnownZero; @@ -1364,7 +1354,7 @@ bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple, Op1 = ConstantInt::get(V->getContext(), API); } - Value *Mul0 = NULL; + Value *Mul0 = nullptr; if (ComputeMultiple(Op0, Base, Mul0, LookThroughSExt, Depth+1)) { if (Constant *Op1C = dyn_cast(Op1)) if (Constant *MulC = dyn_cast(Mul0)) { @@ -1388,7 +1378,7 @@ bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple, } } - Value *Mul1 = NULL; + Value *Mul1 = nullptr; if (ComputeMultiple(Op1, Base, Mul1, LookThroughSExt, Depth+1)) { if (Constant *Op0C = dyn_cast(Op0)) if (Constant *MulC = dyn_cast(Mul1)) { @@ -1432,7 +1422,7 @@ bool llvm::CannotBeNegativeZero(const Value *V, unsigned Depth) { return 1; // Limit search depth. const Operator *I = dyn_cast(V); - if (I == 0) return false; + if (!I) return false; // Check if the nsz fast-math flag is set if (const FPMathOperator *FPO = dyn_cast(I)) @@ -1513,7 +1503,7 @@ Value *llvm::isBytewiseValue(Value *V) { // If the top/bottom halves aren't the same, reject it. if (Val != Val2) - return 0; + return nullptr; } return ConstantInt::get(V->getContext(), Val); } @@ -1525,11 +1515,11 @@ Value *llvm::isBytewiseValue(Value *V) { Value *Elt = CA->getElementAsConstant(0); Value *Val = isBytewiseValue(Elt); if (!Val) - return 0; + return nullptr; for (unsigned I = 1, E = CA->getNumElements(); I != E; ++I) if (CA->getElementAsConstant(I) != Elt) - return 0; + return nullptr; return Val; } @@ -1540,7 +1530,7 @@ Value *llvm::isBytewiseValue(Value *V) { // %c = or i16 %a, %b // but until there is an example that actually needs this, it doesn't seem // worth worrying about. - return 0; + return nullptr; } @@ -1590,7 +1580,7 @@ static Value *BuildSubAggregate(Value *From, Value* To, Type *IndexedType, Value *V = FindInsertedValue(From, Idxs); if (!V) - return NULL; + return nullptr; // Insert the value in the new (sub) aggregrate return llvm::InsertValueInst::Create(To, V, makeArrayRef(Idxs).slice(IdxSkip), @@ -1641,7 +1631,7 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef idx_range, if (Constant *C = dyn_cast(V)) { C = C->getAggregateElement(idx_range[0]); - if (C == 0) return 0; + if (!C) return nullptr; return FindInsertedValue(C, idx_range.slice(1), InsertBefore); } @@ -1654,7 +1644,7 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef idx_range, if (req_idx == idx_range.end()) { // We can't handle this without inserting insertvalues if (!InsertBefore) - return 0; + return nullptr; // The requested index identifies a part of a nested aggregate. Handle // this specially. For example, @@ -1708,7 +1698,7 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef idx_range, } // Otherwise, we don't know (such as, extracting from a function return value // or load instruction) - return 0; + return nullptr; } /// GetPointerBaseWithConstantOffset - Analyze the specified pointer to see if @@ -1769,13 +1759,13 @@ bool llvm::getConstantStringInfo(const Value *V, StringRef &Str, // Make sure the index-ee is a pointer to array of i8. 
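getConstantStringInfo, patched above, walks a GEP into a constant [N x i8] array whose first index is zero. A typical input it accepts (illustrative, not part of the patch):

    @.str = private constant [6 x i8] c"hello\00"

    define i8* @str() {
      ; a pointer into [6 x i8] with a leading zero index: the analysis
      ; recovers the string "hello" from this GEP
      %p = getelementptr [6 x i8]* @.str, i32 0, i32 0
      ret i8* %p
    }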
PointerType *PT = cast(GEP->getOperand(0)->getType()); ArrayType *AT = dyn_cast(PT->getElementType()); - if (AT == 0 || !AT->getElementType()->isIntegerTy(8)) + if (!AT || !AT->getElementType()->isIntegerTy(8)) return false; // Check to make sure that the first operand of the GEP is an integer and // has value 0 so that we are sure we're indexing into the initializer. const ConstantInt *FirstIdx = dyn_cast(GEP->getOperand(1)); - if (FirstIdx == 0 || !FirstIdx->isZero()) + if (!FirstIdx || !FirstIdx->isZero()) return false; // If the second index isn't a ConstantInt, then this is a variable index @@ -1807,7 +1797,7 @@ bool llvm::getConstantStringInfo(const Value *V, StringRef &Str, // Must be a Constant Array const ConstantDataArray *Array = dyn_cast(GV->getInitializer()); - if (Array == 0 || !Array->isString()) + if (!Array || !Array->isString()) return false; // Get the number of elements in the array @@ -1913,7 +1903,7 @@ llvm::GetUnderlyingObject(Value *V, const DataLayout *TD, unsigned MaxLookup) { // See if InstructionSimplify knows any relevant tricks. if (Instruction *I = dyn_cast(V)) // TODO: Acquire a DominatorTree and use it. - if (Value *Simplified = SimplifyInstruction(I, TD, 0)) { + if (Value *Simplified = SimplifyInstruction(I, TD, nullptr)) { V = Simplified; continue; } @@ -2001,7 +1991,7 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, return false; APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(Op, KnownZero, KnownOne, TD); + computeKnownBits(Op, KnownZero, KnownOne, TD); return !!KnownZero; } case Instruction::Load: { @@ -2076,14 +2066,18 @@ bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) { // Alloca never returns null, malloc might. if (isa(V)) return true; - // A byval or inalloca argument is never null. + // A byval, inalloca, or nonnull argument is never null. if (const Argument *A = dyn_cast(V)) - return A->hasByValOrInAllocaAttr(); + return A->hasByValOrInAllocaAttr() || A->hasNonNullAttr(); // Global values are not null unless extern weak. if (const GlobalValue *GV = dyn_cast(V)) return !GV->hasExternalWeakLinkage(); + if (ImmutableCallSite CS = V) + if (CS.paramHasAttr(0, Attribute::NonNull)) + return true; + // operator new never returns null. if (isOperatorNewLikeFn(V, TLI, /*LookThroughBitCast=*/true)) return true; diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 1a5eec3..44a3412 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -34,6 +34,10 @@ bool LLLexer::Error(LocTy ErrorLoc, const Twine &Msg) const { return true; } +void LLLexer::Warning(LocTy WarningLoc, const Twine &Msg) const { + SM.PrintMessage(WarningLoc, SourceMgr::DK_Warning, Msg); +} + //===----------------------------------------------------------------------===// // Helper functions. //===----------------------------------------------------------------------===// @@ -146,7 +150,7 @@ static bool isLabelChar(char C) { static const char *isLabelTail(const char *CurPtr) { while (1) { if (CurPtr[0] == ':') return CurPtr+1; - if (!isLabelChar(CurPtr[0])) return 0; + if (!isLabelChar(CurPtr[0])) return nullptr; ++CurPtr; } } @@ -431,8 +435,8 @@ lltok::Kind LLLexer::LexHash() { /// HexIntConstant [us]0x[0-9A-Fa-f]+ lltok::Kind LLLexer::LexIdentifier() { const char *StartChar = CurPtr; - const char *IntEnd = CurPtr[-1] == 'i' ? 0 : StartChar; - const char *KeywordEnd = 0; + const char *IntEnd = CurPtr[-1] == 'i' ? 
nullptr : StartChar; + const char *KeywordEnd = nullptr; for (; isLabelChar(*CurPtr); ++CurPtr) { // If we decide this is an integer, remember the end of the sequence. @@ -451,7 +455,7 @@ lltok::Kind LLLexer::LexIdentifier() { // Otherwise, this wasn't a label. If this was valid as an integer type, // return it. - if (IntEnd == 0) IntEnd = CurPtr; + if (!IntEnd) IntEnd = CurPtr; if (IntEnd != StartChar) { CurPtr = IntEnd; uint64_t NumBits = atoull(StartChar, CurPtr); @@ -465,7 +469,7 @@ lltok::Kind LLLexer::LexIdentifier() { } // Otherwise, this was a letter sequence. See which keyword this is. - if (KeywordEnd == 0) KeywordEnd = CurPtr; + if (!KeywordEnd) KeywordEnd = CurPtr; CurPtr = KeywordEnd; --StartChar; unsigned Len = CurPtr-StartChar; @@ -481,6 +485,8 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(private); KEYWORD(internal); + KEYWORD(linker_private); // NOTE: deprecated, for parser compatibility + KEYWORD(linker_private_weak); // NOTE: deprecated, for parser compatibility KEYWORD(available_externally); KEYWORD(linkonce); KEYWORD(linkonce_odr); @@ -506,6 +512,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(null); KEYWORD(to); KEYWORD(tail); + KEYWORD(musttail); KEYWORD(target); KEYWORD(triple); KEYWORD(unwind); @@ -548,7 +555,6 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(x86_stdcallcc); KEYWORD(x86_fastcallcc); KEYWORD(x86_thiscallcc); - KEYWORD(x86_cdeclmethodcc); KEYWORD(arm_apcscc); KEYWORD(arm_aapcscc); KEYWORD(arm_aapcs_vfpcc); @@ -587,6 +593,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(noimplicitfloat); KEYWORD(noinline); KEYWORD(nonlazybind); + KEYWORD(nonnull); KEYWORD(noredzone); KEYWORD(noreturn); KEYWORD(nounwind); diff --git a/lib/AsmParser/LLLexer.h b/lib/AsmParser/LLLexer.h index 85703c7..ad11d49 100644 --- a/lib/AsmParser/LLLexer.h +++ b/lib/AsmParser/LLLexer.h @@ -63,6 +63,10 @@ namespace llvm { bool Error(LocTy L, const Twine &Msg) const; bool Error(const Twine &Msg) const { return Error(getLoc(), Msg); } + + void Warning(LocTy WarningLoc, const Twine &Msg) const; + void Warning(const Twine &Msg) const { return Warning(getLoc(), Msg); } + std::string getFilename() const; private: diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 37151e6..3282e8a 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -57,7 +57,8 @@ bool LLParser::ValidateEndOfModule() { for (unsigned i = 0, e = MDList.size(); i != e; ++i) { unsigned SlotNo = MDList[i].MDSlot; - if (SlotNo >= NumberedMetadata.size() || NumberedMetadata[SlotNo] == 0) + if (SlotNo >= NumberedMetadata.size() || + NumberedMetadata[SlotNo] == nullptr) return Error(MDList[i].Loc, "use of undefined metadata '!" + Twine(SlotNo) + "'"); Inst->setMetadata(MDList[i].MDKind, NumberedMetadata[SlotNo]); @@ -132,20 +133,20 @@ bool LLParser::ValidateEndOfModule() { // references after the function was defined. Resolve those now. while (!ForwardRefBlockAddresses.empty()) { // Okay, we are referencing an already-parsed function, resolve them now. - Function *TheFn = 0; + Function *TheFn = nullptr; const ValID &Fn = ForwardRefBlockAddresses.begin()->first; if (Fn.Kind == ValID::t_GlobalName) TheFn = M->getFunction(Fn.StrVal); else if (Fn.UIntVal < NumberedVals.size()) TheFn = dyn_cast(NumberedVals[Fn.UIntVal]); - if (TheFn == 0) + if (!TheFn) return Error(Fn.Loc, "unknown function referenced by blockaddress"); // Resolve all these references. 
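The lexer hunks above add the musttail keyword and the nonnull attribute keyword, and re-accept the deprecated linker_private spellings. A snippet exercising the two new tokens (illustrative only):

    declare i8* @id(i8* nonnull)

    define i8* @wrap(i8* nonnull %p) {
      %r = musttail call i8* @id(i8* %p)   ; musttail must immediately precede the ret
      ret i8* %r
    }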
if (ResolveForwardRefBlockAddresses(TheFn, ForwardRefBlockAddresses.begin()->second, - 0)) + nullptr)) return true; ForwardRefBlockAddresses.erase(ForwardRefBlockAddresses.begin()); @@ -206,7 +207,7 @@ bool LLParser::ResolveForwardRefBlockAddresses(Function *TheFn, TheFn->getValueSymbolTable().lookup(Refs[i].first.StrVal)); } - if (Res == 0) + if (!Res) return Error(Refs[i].first.Loc, "referenced value is not a basic block"); @@ -247,6 +248,8 @@ bool LLParser::ParseTopLevelEntities() { // ('constant'|'global') ... case lltok::kw_private: // OptionalLinkage case lltok::kw_internal: // OptionalLinkage + case lltok::kw_linker_private: // Obsolete OptionalLinkage + case lltok::kw_linker_private_weak: // Obsolete OptionalLinkage case lltok::kw_weak: // OptionalLinkage case lltok::kw_weak_odr: // OptionalLinkage case lltok::kw_linkonce: // OptionalLinkage @@ -362,7 +365,7 @@ bool LLParser::ParseUnnamedType() { if (TypeID >= NumberedTypes.size()) NumberedTypes.resize(TypeID+1); - Type *Result = 0; + Type *Result = nullptr; if (ParseStructDefinition(TypeLoc, "", NumberedTypes[TypeID], Result)) return true; @@ -389,7 +392,7 @@ bool LLParser::ParseNamedType() { ParseToken(lltok::kw_type, "expected 'type' after name")) return true; - Type *Result = 0; + Type *Result = nullptr; if (ParseStructDefinition(NameLoc, Name, NamedTypes[Name], Result)) return true; @@ -521,10 +524,10 @@ bool LLParser::ParseMDNodeID(MDNode *&Result, unsigned &SlotNo) { if (ParseUInt32(SlotNo)) return true; // Check existing MDNode. - if (SlotNo < NumberedMetadata.size() && NumberedMetadata[SlotNo] != 0) + if (SlotNo < NumberedMetadata.size() && NumberedMetadata[SlotNo] != nullptr) Result = NumberedMetadata[SlotNo]; else - Result = 0; + Result = nullptr; return false; } @@ -565,7 +568,7 @@ bool LLParser::ParseNamedMetadata() { if (ParseToken(lltok::exclaim, "Expected '!' here")) return true; - MDNode *N = 0; + MDNode *N = nullptr; if (ParseMDNodeID(N)) return true; NMD->addOperand(N); } while (EatIfPresent(lltok::comma)); @@ -584,14 +587,14 @@ bool LLParser::ParseStandaloneMetadata() { unsigned MetadataID = 0; LocTy TyLoc; - Type *Ty = 0; + Type *Ty = nullptr; SmallVector Elts; if (ParseUInt32(MetadataID) || ParseToken(lltok::equal, "expected '=' here") || ParseType(Ty, TyLoc) || ParseToken(lltok::exclaim, "Expected '!' here") || ParseToken(lltok::lbrace, "Expected '{' here") || - ParseMDNodeVector(Elts, NULL) || + ParseMDNodeVector(Elts, nullptr) || ParseToken(lltok::rbrace, "expected end of metadata node")) return true; @@ -611,7 +614,7 @@ bool LLParser::ParseStandaloneMetadata() { if (MetadataID >= NumberedMetadata.size()) NumberedMetadata.resize(MetadataID+1); - if (NumberedMetadata[MetadataID] != 0) + if (NumberedMetadata[MetadataID] != nullptr) return TokError("Metadata id is already used"); NumberedMetadata[MetadataID] = Init; } @@ -619,13 +622,19 @@ bool LLParser::ParseStandaloneMetadata() { return false; } +static bool isValidVisibilityForLinkage(unsigned V, unsigned L) { + return !GlobalValue::isLocalLinkage((GlobalValue::LinkageTypes)L) || + (GlobalValue::VisibilityTypes)V == GlobalValue::DefaultVisibility; +} + /// ParseAlias: /// ::= GlobalVar '=' OptionalVisibility OptionalDLLStorageClass 'alias' /// OptionalLinkage Aliasee +/// ::= GlobalVar '=' OptionalVisibility OptionalDLLStorageClass 'alias' +/// OptionalLinkage OptionalAddrSpace Type, Aliasee +/// /// Aliasee /// ::= TypeAndValue -/// ::= 'bitcast' '(' TypeAndValue 'to' Type ')' -/// ::= 'getelementptr' 'inbounds'? '(' ... 
')' /// /// Everything through DLL storage class has already been parsed. /// @@ -643,27 +652,53 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, if(!GlobalAlias::isValidLinkage(Linkage)) return Error(LinkageLoc, "invalid linkage type for alias"); - Constant *Aliasee; - LocTy AliaseeLoc = Lex.getLoc(); - if (Lex.getKind() != lltok::kw_bitcast && - Lex.getKind() != lltok::kw_getelementptr) { - if (ParseGlobalTypeAndValue(Aliasee)) return true; + if (!isValidVisibilityForLinkage(Visibility, L)) + return Error(LinkageLoc, + "symbol with local linkage must have default visibility"); + + bool HasAddrSpace = Lex.getKind() == lltok::kw_addrspace; + unsigned AddrSpace; + LocTy AddrSpaceLoc = Lex.getLoc(); + if (ParseOptionalAddrSpace(AddrSpace)) + return true; + + LocTy TyLoc = Lex.getLoc(); + Type *Ty = nullptr; + if (ParseType(Ty)) + return true; + + bool DifferentType = EatIfPresent(lltok::comma); + if (HasAddrSpace && !DifferentType) + return Error(AddrSpaceLoc, "A type is required if addrspace is given"); + + Type *AliaseeType = nullptr; + if (DifferentType) { + if (ParseType(AliaseeType)) + return true; } else { - // The bitcast dest type is not present, it is implied by the dest type. - ValID ID; - if (ParseValID(ID)) return true; - if (ID.Kind != ValID::t_Constant) - return Error(AliaseeLoc, "invalid aliasee"); - Aliasee = ID.ConstantVal; + AliaseeType = Ty; + auto *PTy = dyn_cast(Ty); + if (!PTy) + return Error(TyLoc, "An alias must have pointer type"); + Ty = PTy->getElementType(); + AddrSpace = PTy->getAddressSpace(); } - if (!Aliasee->getType()->isPointerTy()) - return Error(AliaseeLoc, "alias must have pointer type"); + LocTy AliaseeLoc = Lex.getLoc(); + Constant *C; + if (ParseGlobalValue(AliaseeType, C)) + return true; + + auto *Aliasee = dyn_cast(C); + if (!Aliasee) + return Error(AliaseeLoc, "Alias must point to function or variable"); + + assert(Aliasee->getType()->isPointerTy()); // Okay, create the alias but do not insert it into the module yet. - GlobalAlias* GA = new GlobalAlias(Aliasee->getType(), - (GlobalValue::LinkageTypes)Linkage, Name, - Aliasee); + std::unique_ptr GA( + GlobalAlias::create(Ty, AddrSpace, (GlobalValue::LinkageTypes)Linkage, + Name, Aliasee, /*Parent*/ nullptr)); GA->setVisibility((GlobalValue::VisibilityTypes)Visibility); GA->setDLLStorageClass((GlobalValue::DLLStorageClassTypes)DLLStorageClass); @@ -685,15 +720,23 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, // If they agree, just RAUW the old value with the alias and remove the // forward ref info. - Val->replaceAllUsesWith(GA); + for (auto *User : Val->users()) { + if (auto *GA = dyn_cast(User)) + return Error(NameLoc, "Alias is pointed by alias " + GA->getName()); + } + + Val->replaceAllUsesWith(GA.get()); Val->eraseFromParent(); ForwardRefVals.erase(I); } // Insert into the module, we know its name won't collide now. 
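Under the transitional grammar this hunk implements, both alias spellings below should parse. This is a sketch based only on the productions above; the exact surface syntax was still in flux at this point in the 3.5 cycle:

    @g = global i32 0
    @a = alias internal i32* @g        ; short form: alias type taken from the aliasee
    @b = alias internal i32, i32* @g   ; explicit form: alias type, then typed aliasee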
- M->getAliasList().push_back(GA); + M->getAliasList().push_back(GA.get()); assert(GA->getName() == Name && "Should not be a name conflict!"); + // The module owns this now + GA.release(); + return false; } @@ -711,6 +754,10 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, unsigned Linkage, bool HasLinkage, unsigned Visibility, unsigned DLLStorageClass) { + if (!isValidVisibilityForLinkage(Visibility, Linkage)) + return Error(NameLoc, + "symbol with local linkage must have default visibility"); + unsigned AddrSpace; bool IsConstant, UnnamedAddr, IsExternallyInitialized; GlobalVariable::ThreadLocalMode TLM; @@ -718,7 +765,7 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, LocTy IsExternallyInitializedLoc; LocTy TyLoc; - Type *Ty = 0; + Type *Ty = nullptr; if (ParseOptionalThreadLocal(TLM) || ParseOptionalAddrSpace(AddrSpace) || ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr, @@ -732,7 +779,7 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, // If the linkage is specified and is external, then no initializer is // present. - Constant *Init = 0; + Constant *Init = nullptr; if (!HasLinkage || (Linkage != GlobalValue::ExternalWeakLinkage && Linkage != GlobalValue::ExternalLinkage)) { if (ParseGlobalValue(Ty, Init)) @@ -742,7 +789,7 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, if (Ty->isFunctionTy() || Ty->isLabelTy()) return Error(TyLoc, "invalid type for global variable"); - GlobalVariable *GV = 0; + GlobalVariable *GV = nullptr; // See if the global was forward referenced, if so, use the global. if (!Name.empty()) { @@ -760,9 +807,9 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, } } - if (GV == 0) { - GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, 0, - Name, 0, GlobalVariable::NotThreadLocal, + if (!GV) { + GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, nullptr, + Name, nullptr, GlobalVariable::NotThreadLocal, AddrSpace); } else { if (GV->getType()->getElementType() != Ty) @@ -956,6 +1003,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, case lltok::kw_nest: case lltok::kw_noalias: case lltok::kw_nocapture: + case lltok::kw_nonnull: case lltok::kw_returned: case lltok::kw_sret: HaveError |= @@ -978,9 +1026,9 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, GlobalValue *LLParser::GetGlobalVal(const std::string &Name, Type *Ty, LocTy Loc) { PointerType *PTy = dyn_cast(Ty); - if (PTy == 0) { + if (!PTy) { Error(Loc, "global variable reference must have pointer type"); - return 0; + return nullptr; } // Look this name up in the normal function symbol table. @@ -989,7 +1037,7 @@ GlobalValue *LLParser::GetGlobalVal(const std::string &Name, Type *Ty, // If this is a forward reference for the value, see if we already created a // forward ref record. - if (Val == 0) { + if (!Val) { std::map >::iterator I = ForwardRefVals.find(Name); if (I != ForwardRefVals.end()) @@ -1001,7 +1049,7 @@ GlobalValue *LLParser::GetGlobalVal(const std::string &Name, Type *Ty, if (Val->getType() == Ty) return Val; Error(Loc, "'@" + Name + "' defined with type '" + getTypeString(Val->getType()) + "'"); - return 0; + return nullptr; } // Otherwise, create a new forward reference for this value and remember it. 
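The new isValidVisibilityForLinkage check enforced here (and in ParseAlias and ParseFunctionHeader) rejects non-default visibility on local symbols. Illustrative inputs, not from the patch:

    @ok  = private global i32 0          ; accepted: default visibility
    @bad = private hidden global i32 0   ; now rejected: "symbol with local
                                         ; linkage must have default visibility"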
@@ -1010,8 +1058,8 @@ GlobalValue *LLParser::GetGlobalVal(const std::string &Name, Type *Ty, FwdVal = Function::Create(FT, GlobalValue::ExternalWeakLinkage, Name, M); else FwdVal = new GlobalVariable(*M, PTy->getElementType(), false, - GlobalValue::ExternalWeakLinkage, 0, Name, - 0, GlobalVariable::NotThreadLocal, + GlobalValue::ExternalWeakLinkage, nullptr, Name, + nullptr, GlobalVariable::NotThreadLocal, PTy->getAddressSpace()); ForwardRefVals[Name] = std::make_pair(FwdVal, Loc); @@ -1020,16 +1068,16 @@ GlobalValue *LLParser::GetGlobalVal(const std::string &Name, Type *Ty, GlobalValue *LLParser::GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc) { PointerType *PTy = dyn_cast(Ty); - if (PTy == 0) { + if (!PTy) { Error(Loc, "global variable reference must have pointer type"); - return 0; + return nullptr; } - GlobalValue *Val = ID < NumberedVals.size() ? NumberedVals[ID] : 0; + GlobalValue *Val = ID < NumberedVals.size() ? NumberedVals[ID] : nullptr; // If this is a forward reference for the value, see if we already created a // forward ref record. - if (Val == 0) { + if (!Val) { std::map >::iterator I = ForwardRefValIDs.find(ID); if (I != ForwardRefValIDs.end()) @@ -1041,7 +1089,7 @@ GlobalValue *LLParser::GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc) { if (Val->getType() == Ty) return Val; Error(Loc, "'@" + Twine(ID) + "' defined with type '" + getTypeString(Val->getType()) + "'"); - return 0; + return nullptr; } // Otherwise, create a new forward reference for this value and remember it. @@ -1050,7 +1098,7 @@ GlobalValue *LLParser::GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc) { FwdVal = Function::Create(FT, GlobalValue::ExternalWeakLinkage, "", M); else FwdVal = new GlobalVariable(*M, PTy->getElementType(), false, - GlobalValue::ExternalWeakLinkage, 0, ""); + GlobalValue::ExternalWeakLinkage, nullptr, ""); ForwardRefValIDs[ID] = std::make_pair(FwdVal, Loc); return FwdVal; @@ -1170,6 +1218,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { case lltok::kw_nest: B.addAttribute(Attribute::Nest); break; case lltok::kw_noalias: B.addAttribute(Attribute::NoAlias); break; case lltok::kw_nocapture: B.addAttribute(Attribute::NoCapture); break; + case lltok::kw_nonnull: B.addAttribute(Attribute::NonNull); break; case lltok::kw_readnone: B.addAttribute(Attribute::ReadNone); break; case lltok::kw_readonly: B.addAttribute(Attribute::ReadOnly); break; case lltok::kw_returned: B.addAttribute(Attribute::Returned); break; @@ -1222,6 +1271,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { return HaveError; case lltok::kw_inreg: B.addAttribute(Attribute::InReg); break; case lltok::kw_noalias: B.addAttribute(Attribute::NoAlias); break; + case lltok::kw_nonnull: B.addAttribute(Attribute::NonNull); break; case lltok::kw_signext: B.addAttribute(Attribute::SExt); break; case lltok::kw_zeroext: B.addAttribute(Attribute::ZExt); break; @@ -1286,6 +1336,10 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { /// ::= 'common' /// ::= 'extern_weak' /// ::= 'external' +/// +/// Deprecated Values: +/// ::= 'linker_private' +/// ::= 'linker_private_weak' bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage) { HasLinkage = false; switch (Lex.getKind()) { @@ -1303,6 +1357,15 @@ bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage) { case lltok::kw_common: Res = GlobalValue::CommonLinkage; break; case lltok::kw_extern_weak: Res = GlobalValue::ExternalWeakLinkage; break; case lltok::kw_external: Res = GlobalValue::ExternalLinkage; break; + + case 
lltok::kw_linker_private: + case lltok::kw_linker_private_weak: + Lex.Warning("'" + Lex.getStrVal() + "' is deprecated, treating as" + " PrivateLinkage"); + Lex.Lex(); + // treat linker_private and linker_private_weak as PrivateLinkage + Res = GlobalValue::PrivateLinkage; + return false; } Lex.Lex(); HasLinkage = true; @@ -1350,7 +1413,6 @@ bool LLParser::ParseOptionalDLLStorageClass(unsigned &Res) { /// ::= 'x86_stdcallcc' /// ::= 'x86_fastcallcc' /// ::= 'x86_thiscallcc' -/// ::= 'x86_cdeclmethodcc' /// ::= 'arm_apcscc' /// ::= 'arm_aapcscc' /// ::= 'arm_aapcs_vfpcc' @@ -1376,7 +1438,6 @@ bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) { case lltok::kw_x86_stdcallcc: CC = CallingConv::X86_StdCall; break; case lltok::kw_x86_fastcallcc: CC = CallingConv::X86_FastCall; break; case lltok::kw_x86_thiscallcc: CC = CallingConv::X86_ThisCall; break; - case lltok::kw_x86_cdeclmethodcc:CC = CallingConv::X86_CDeclMethod; break; case lltok::kw_arm_apcscc: CC = CallingConv::ARM_APCS; break; case lltok::kw_arm_aapcscc: CC = CallingConv::ARM_AAPCS; break; case lltok::kw_arm_aapcs_vfpcc:CC = CallingConv::ARM_AAPCS_VFP; break; @@ -1623,7 +1684,7 @@ bool LLParser::ParseType(Type *&Result, bool AllowVoid) { // If the type hasn't been defined yet, create a forward definition and // remember where that forward def'n was seen (in case it never is defined). - if (Entry.first == 0) { + if (!Entry.first) { Entry.first = StructType::create(Context, Lex.getStrVal()); Entry.second = Lex.getLoc(); } @@ -1640,7 +1701,7 @@ bool LLParser::ParseType(Type *&Result, bool AllowVoid) { // If the type hasn't been defined yet, create a forward definition and // remember where that forward def'n was seen (in case it never is defined). - if (Entry.first == 0) { + if (!Entry.first) { Entry.first = StructType::create(Context); Entry.second = Lex.getLoc(); } @@ -1716,7 +1777,7 @@ bool LLParser::ParseParameterList(SmallVectorImpl &ArgList, // Parse the argument. LocTy ArgLoc; - Type *ArgTy = 0; + Type *ArgTy = nullptr; AttrBuilder ArgAttrs; Value *V; if (ParseType(ArgTy, ArgLoc)) @@ -1758,7 +1819,7 @@ bool LLParser::ParseArgumentList(SmallVectorImpl &ArgList, Lex.Lex(); } else { LocTy TypeLoc = Lex.getLoc(); - Type *ArgTy = 0; + Type *ArgTy = nullptr; AttrBuilder Attrs; std::string Name; @@ -1870,7 +1931,7 @@ bool LLParser::ParseStructDefinition(SMLoc TypeLoc, StringRef Name, Entry.second = SMLoc(); // If this type number has never been uttered, create it. - if (Entry.first == 0) + if (!Entry.first) Entry.first = StructType::create(Context, Name); ResultTy = Entry.first; return false; @@ -1886,7 +1947,7 @@ bool LLParser::ParseStructDefinition(SMLoc TypeLoc, StringRef Name, if (Entry.first) return Error(TypeLoc, "forward references to non-struct type"); - ResultTy = 0; + ResultTy = nullptr; if (isPacked) return ParseArrayVectorType(ResultTy, true); return ParseType(ResultTy); @@ -1896,7 +1957,7 @@ bool LLParser::ParseStructDefinition(SMLoc TypeLoc, StringRef Name, Entry.second = SMLoc(); // If this type number has never been uttered, create it. 
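The deprecated-linkage path above keeps old IR parseable while warning. Feeding it (illustrative only):

    @v = linker_private global i32 0   ; warns "'linker_private' is deprecated,
                                       ; treating as PrivateLinkage" and parses
                                       ; as if written: @v = private global i32 0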
- if (Entry.first == 0) + if (!Entry.first) Entry.first = StructType::create(Context, Name); StructType *STy = cast(Entry.first); @@ -1927,7 +1988,7 @@ bool LLParser::ParseStructBody(SmallVectorImpl &Body) { return false; LocTy EltTyLoc = Lex.getLoc(); - Type *Ty = 0; + Type *Ty = nullptr; if (ParseType(Ty)) return true; Body.push_back(Ty); @@ -1965,7 +2026,7 @@ bool LLParser::ParseArrayVectorType(Type *&Result, bool isVector) { return true; LocTy TypeLoc = Lex.getLoc(); - Type *EltTy = 0; + Type *EltTy = nullptr; if (ParseType(EltTy)) return true; if (ParseToken(isVector ? lltok::greater : lltok::rsquare, @@ -2011,7 +2072,7 @@ LLParser::PerFunctionState::~PerFunctionState() { I->second.first->replaceAllUsesWith( UndefValue::get(I->second.first->getType())); delete I->second.first; - I->second.first = 0; + I->second.first = nullptr; } for (std::map >::iterator @@ -2020,7 +2081,7 @@ LLParser::PerFunctionState::~PerFunctionState() { I->second.first->replaceAllUsesWith( UndefValue::get(I->second.first->getType())); delete I->second.first; - I->second.first = 0; + I->second.first = nullptr; } } @@ -2069,7 +2130,7 @@ Value *LLParser::PerFunctionState::GetVal(const std::string &Name, // If this is a forward reference for the value, see if we already created a // forward ref record. - if (Val == 0) { + if (!Val) { std::map >::iterator I = ForwardRefVals.find(Name); if (I != ForwardRefVals.end()) @@ -2084,13 +2145,13 @@ Value *LLParser::PerFunctionState::GetVal(const std::string &Name, else P.Error(Loc, "'%" + Name + "' defined with type '" + getTypeString(Val->getType()) + "'"); - return 0; + return nullptr; } // Don't make placeholders with invalid type. if (!Ty->isFirstClassType() && !Ty->isLabelTy()) { P.Error(Loc, "invalid use of a non-first-class type"); - return 0; + return nullptr; } // Otherwise, create a new forward reference for this value and remember it. @@ -2107,11 +2168,11 @@ Value *LLParser::PerFunctionState::GetVal(const std::string &Name, Value *LLParser::PerFunctionState::GetVal(unsigned ID, Type *Ty, LocTy Loc) { // Look this name up in the normal function symbol table. - Value *Val = ID < NumberedVals.size() ? NumberedVals[ID] : 0; + Value *Val = ID < NumberedVals.size() ? NumberedVals[ID] : nullptr; // If this is a forward reference for the value, see if we already created a // forward ref record. - if (Val == 0) { + if (!Val) { std::map >::iterator I = ForwardRefValIDs.find(ID); if (I != ForwardRefValIDs.end()) @@ -2126,12 +2187,12 @@ Value *LLParser::PerFunctionState::GetVal(unsigned ID, Type *Ty, else P.Error(Loc, "'%" + Twine(ID) + "' defined with type '" + getTypeString(Val->getType()) + "'"); - return 0; + return nullptr; } if (!Ty->isFirstClassType() && !Ty->isLabelTy()) { P.Error(Loc, "invalid use of a non-first-class type"); - return 0; + return nullptr; } // Otherwise, create a new forward reference for this value and remember it. @@ -2227,7 +2288,7 @@ BasicBlock *LLParser::PerFunctionState::DefineBB(const std::string &Name, BB = GetBB(NumberedVals.size(), Loc); else BB = GetBB(Name, Loc); - if (BB == 0) return 0; // Already diagnosed error. + if (!BB) return nullptr; // Already diagnosed error. // Move the block to the end of the function. Forward ref'd blocks are // inserted wherever they happen to be referenced. @@ -2435,7 +2496,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { // Make a global variable as a placeholder for this reference. 
GlobalVariable *FwdRef = new GlobalVariable(*M, Type::getInt8Ty(Context), false, GlobalValue::InternalLinkage, - 0, ""); + nullptr, ""); ForwardRefBlockAddresses[Fn].push_back(std::make_pair(Label, FwdRef)); ID.ConstantVal = FwdRef; ID.Kind = ValID::t_Constant; @@ -2456,7 +2517,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { case lltok::kw_inttoptr: case lltok::kw_ptrtoint: { unsigned Opc = Lex.getUIntVal(); - Type *DestTy = 0; + Type *DestTy = nullptr; Constant *SrcVal; Lex.Lex(); if (ParseToken(lltok::lparen, "expected '(' after constantexpr cast") || @@ -2720,18 +2781,18 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { /// ParseGlobalValue - Parse a global value with the specified type. bool LLParser::ParseGlobalValue(Type *Ty, Constant *&C) { - C = 0; + C = nullptr; ValID ID; - Value *V = NULL; + Value *V = nullptr; bool Parsed = ParseValID(ID) || - ConvertValIDToValue(Ty, ID, V, NULL); + ConvertValIDToValue(Ty, ID, V, nullptr); if (V && !(C = dyn_cast(V))) return Error(ID.Loc, "global values must be constants"); return Parsed; } bool LLParser::ParseGlobalTypeAndValue(Constant *&V) { - Type *Ty = 0; + Type *Ty = nullptr; return ParseType(Ty) || ParseGlobalValue(Ty, V); } @@ -2815,15 +2876,15 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, case ValID::t_LocalID: if (!PFS) return Error(ID.Loc, "invalid use of function-local name"); V = PFS->GetVal(ID.UIntVal, Ty, ID.Loc); - return (V == 0); + return V == nullptr; case ValID::t_LocalName: if (!PFS) return Error(ID.Loc, "invalid use of function-local name"); V = PFS->GetVal(ID.StrVal, Ty, ID.Loc); - return (V == 0); + return V == nullptr; case ValID::t_InlineAsm: { PointerType *PTy = dyn_cast(Ty); FunctionType *FTy = - PTy ? dyn_cast(PTy->getElementType()) : 0; + PTy ? 
dyn_cast(PTy->getElementType()) : nullptr; if (!FTy || !InlineAsm::Verify(FTy, ID.StrVal2)) return Error(ID.Loc, "invalid type for inline asm constraint string"); V = InlineAsm::get(FTy, ID.StrVal, ID.StrVal2, ID.UIntVal&1, @@ -2842,10 +2903,10 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, return false; case ValID::t_GlobalName: V = GetGlobalVal(ID.StrVal, Ty, ID.Loc); - return V == 0; + return V == nullptr; case ValID::t_GlobalID: V = GetGlobalVal(ID.UIntVal, Ty, ID.Loc); - return V == 0; + return V == nullptr; case ValID::t_APSInt: if (!Ty->isIntegerTy()) return Error(ID.Loc, "integer constant must have integer type"); @@ -2928,14 +2989,14 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V, } bool LLParser::ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS) { - V = 0; + V = nullptr; ValID ID; return ParseValID(ID, PFS) || ConvertValIDToValue(Ty, ID, V, PFS); } bool LLParser::ParseTypeAndValue(Value *&V, PerFunctionState *PFS) { - Type *Ty = 0; + Type *Ty = nullptr; return ParseType(Ty) || ParseValue(Ty, V, PFS); } @@ -2965,7 +3026,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { unsigned DLLStorageClass; AttrBuilder RetAttrs; CallingConv::ID CC; - Type *RetType = 0; + Type *RetType = nullptr; LocTy RetTypeLoc = Lex.getLoc(); if (ParseOptionalLinkage(Linkage) || ParseOptionalVisibility(Visibility) || @@ -2998,6 +3059,10 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { return Error(LinkageLoc, "invalid function linkage type"); } + if (!isValidVisibilityForLinkage(Visibility, Linkage)) + return Error(LinkageLoc, + "symbol with local linkage must have default visibility"); + if (!FunctionType::isValidReturnType(RetType)) return Error(RetTypeLoc, "invalid function return type"); @@ -3031,7 +3096,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { std::string GC; bool UnnamedAddr; LocTy UnnamedAddrLoc; - Constant *Prefix = 0; + Constant *Prefix = nullptr; if (ParseArgumentList(ArgList, isVarArg) || ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr, @@ -3088,7 +3153,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { FunctionType::get(RetType, ParamTypeList, isVarArg); PointerType *PFT = PointerType::getUnqual(FT); - Fn = 0; + Fn = nullptr; if (!FunctionName.empty()) { // If this was a definition of a forward reference, remove the definition // from the forward reference table and fill in the forward ref. @@ -3126,7 +3191,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { } } - if (Fn == 0) + if (!Fn) Fn = Function::Create(FT, GlobalValue::ExternalLinkage, FunctionName, M); else // Move the forward-reference to the correct spot in the module. M->getFunctionList().splice(M->end(), M->getFunctionList(), Fn); @@ -3203,7 +3268,7 @@ bool LLParser::ParseBasicBlock(PerFunctionState &PFS) { } BasicBlock *BB = PFS.DefineBB(Name, NameLoc); - if (BB == 0) return true; + if (!BB) return true; std::string NameStr; @@ -3351,8 +3416,10 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB, case lltok::kw_shufflevector: return ParseShuffleVector(Inst, PFS); case lltok::kw_phi: return ParsePHI(Inst, PFS); case lltok::kw_landingpad: return ParseLandingPad(Inst, PFS); - case lltok::kw_call: return ParseCall(Inst, PFS, false); - case lltok::kw_tail: return ParseCall(Inst, PFS, true); + // Call. 
+ case lltok::kw_call: return ParseCall(Inst, PFS, CallInst::TCK_None); + case lltok::kw_tail: return ParseCall(Inst, PFS, CallInst::TCK_Tail); + case lltok::kw_musttail: return ParseCall(Inst, PFS, CallInst::TCK_MustTail); // Memory. case lltok::kw_alloca: return ParseAlloc(Inst, PFS); case lltok::kw_load: return ParseLoad(Inst, PFS); @@ -3417,7 +3484,7 @@ bool LLParser::ParseCmpPredicate(unsigned &P, unsigned Opc) { bool LLParser::ParseRet(Instruction *&Inst, BasicBlock *BB, PerFunctionState &PFS) { SMLoc TypeLoc = Lex.getLoc(); - Type *Ty = 0; + Type *Ty = nullptr; if (ParseType(Ty, true /*void allowed*/)) return true; Type *ResType = PFS.getFunction().getReturnType(); @@ -3567,7 +3634,7 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { std::vector FwdRefAttrGrps; LocTy NoBuiltinLoc; CallingConv::ID CC; - Type *RetType = 0; + Type *RetType = nullptr; LocTy RetTypeLoc; ValID CalleeID; SmallVector ArgList; @@ -3589,8 +3656,8 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { // If RetType is a non-function pointer type, then this is the short syntax // for the call, which means that RetType is just the return type. Infer the // rest of the function argument types from the arguments that are present. - PointerType *PFTy = 0; - FunctionType *Ty = 0; + PointerType *PFTy = nullptr; + FunctionType *Ty = nullptr; if (!(PFTy = dyn_cast(RetType)) || !(Ty = dyn_cast(PFTy->getElementType()))) { // Pull out the types of all of the arguments... @@ -3623,7 +3690,7 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { FunctionType::param_iterator I = Ty->param_begin(); FunctionType::param_iterator E = Ty->param_end(); for (unsigned i = 0, e = ArgList.size(); i != e; ++i) { - Type *ExpectedTy = 0; + Type *ExpectedTy = nullptr; if (I != E) { ExpectedTy = *I++; } else if (!Ty->isVarArg()) { @@ -3764,7 +3831,7 @@ bool LLParser::ParseCast(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc) { LocTy Loc; Value *Op; - Type *DestTy = 0; + Type *DestTy = nullptr; if (ParseTypeAndValue(Op, Loc, PFS) || ParseToken(lltok::kw_to, "expected 'to' after cast value") || ParseType(DestTy)) @@ -3803,7 +3870,7 @@ bool LLParser::ParseSelect(Instruction *&Inst, PerFunctionState &PFS) { /// ::= 'va_arg' TypeAndValue ',' Type bool LLParser::ParseVA_Arg(Instruction *&Inst, PerFunctionState &PFS) { Value *Op; - Type *EltTy = 0; + Type *EltTy = nullptr; LocTy TypeLoc; if (ParseTypeAndValue(Op, PFS) || ParseToken(lltok::comma, "expected ',' after vaarg operand") || @@ -3875,7 +3942,7 @@ bool LLParser::ParseShuffleVector(Instruction *&Inst, PerFunctionState &PFS) { /// ParsePHI /// ::= 'phi' Type '[' Value ',' Value ']' (',' '[' Value ',' Value ']')* int LLParser::ParsePHI(Instruction *&Inst, PerFunctionState &PFS) { - Type *Ty = 0; LocTy TypeLoc; + Type *Ty = nullptr; LocTy TypeLoc; Value *Op0, *Op1; if (ParseType(Ty, TypeLoc) || @@ -3924,7 +3991,7 @@ int LLParser::ParsePHI(Instruction *&Inst, PerFunctionState &PFS) { /// ::= 'filter' /// ::= 'filter' TypeAndValue ( ',' TypeAndValue )* bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) { - Type *Ty = 0; LocTy TyLoc; + Type *Ty = nullptr; LocTy TyLoc; Value *PersFn; LocTy PersFnLoc; if (ParseType(Ty, TyLoc) || @@ -3968,21 +4035,26 @@ bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) { } /// ParseCall -/// ::= 'tail'? 
'call' OptionalCallingConv OptionalAttrs Type Value +/// ::= 'call' OptionalCallingConv OptionalAttrs Type Value +/// ParameterList OptionalAttrs +/// ::= 'tail' 'call' OptionalCallingConv OptionalAttrs Type Value +/// ParameterList OptionalAttrs +/// ::= 'musttail' 'call' OptionalCallingConv OptionalAttrs Type Value /// ParameterList OptionalAttrs bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, - bool isTail) { + CallInst::TailCallKind TCK) { AttrBuilder RetAttrs, FnAttrs; std::vector FwdRefAttrGrps; LocTy BuiltinLoc; CallingConv::ID CC; - Type *RetType = 0; + Type *RetType = nullptr; LocTy RetTypeLoc; ValID CalleeID; SmallVector ArgList; LocTy CallLoc = Lex.getLoc(); - if ((isTail && ParseToken(lltok::kw_call, "expected 'tail call'")) || + if ((TCK != CallInst::TCK_None && + ParseToken(lltok::kw_call, "expected 'tail call'")) || ParseOptionalCallingConv(CC) || ParseOptionalReturnAttrs(RetAttrs) || ParseType(RetType, RetTypeLoc, true /*void allowed*/) || @@ -3995,8 +4067,8 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, // If RetType is a non-function pointer type, then this is the short syntax // for the call, which means that RetType is just the return type. Infer the // rest of the function argument types from the arguments that are present. - PointerType *PFTy = 0; - FunctionType *Ty = 0; + PointerType *PFTy = nullptr; + FunctionType *Ty = nullptr; if (!(PFTy = dyn_cast(RetType)) || !(Ty = dyn_cast(PFTy->getElementType()))) { // Pull out the types of all of the arguments... @@ -4029,7 +4101,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, FunctionType::param_iterator I = Ty->param_begin(); FunctionType::param_iterator E = Ty->param_end(); for (unsigned i = 0, e = ArgList.size(); i != e; ++i) { - Type *ExpectedTy = 0; + Type *ExpectedTy = nullptr; if (I != E) { ExpectedTy = *I++; } else if (!Ty->isVarArg()) { @@ -4058,7 +4130,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, AttributeSet PAL = AttributeSet::get(Context, Attrs); CallInst *CI = CallInst::Create(Callee, Args); - CI->setTailCall(isTail); + CI->setTailCallKind(TCK); CI->setCallingConv(CC); CI->setAttributes(PAL); ForwardRefAttrGroups[CI] = FwdRefAttrGrps; @@ -4073,10 +4145,10 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, /// ParseAlloc /// ::= 'alloca' 'inalloca'? Type (',' TypeAndValue)? (',' 'align' i32)? int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) { - Value *Size = 0; + Value *Size = nullptr; LocTy SizeLoc; unsigned Alignment = 0; - Type *Ty = 0; + Type *Ty = nullptr; bool IsInAlloca = EatIfPresent(lltok::kw_inalloca); @@ -4315,8 +4387,8 @@ int LLParser::ParseFence(Instruction *&Inst, PerFunctionState &PFS) { /// ParseGetElementPtr /// ::= 'getelementptr' 'inbounds'? TypeAndValue (',' TypeAndValue)* int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) { - Value *Ptr = 0; - Value *Val = 0; + Value *Ptr = nullptr; + Value *Val = nullptr; LocTy Loc, EltLoc; bool InBounds = EatIfPresent(lltok::kw_inbounds); @@ -4418,11 +4490,11 @@ bool LLParser::ParseMDNodeVector(SmallVectorImpl &Elts, do { // Null is a special case since it is typeless. 
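ParseCall now receives a CallInst::TailCallKind instead of a bool, matching the three spellings ParseInstruction routes to it. The mapping, sketched with an assumed callee @f (illustrative only):

    declare i32 @f()

    define i32 @caller() {
      %a = call i32 @f()            ; TCK_None
      %b = tail call i32 @f()       ; TCK_Tail
      %c = musttail call i32 @f()   ; TCK_MustTail, must immediately precede the ret
      ret i32 %c
    }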
if (EatIfPresent(lltok::kw_null)) { - Elts.push_back(0); + Elts.push_back(nullptr); continue; } - Value *V = 0; + Value *V = nullptr; if (ParseTypeAndValue(V, PFS)) return true; Elts.push_back(V); } while (EatIfPresent(lltok::comma)); diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h index 294a1e1..e2bf462 100644 --- a/lib/AsmParser/LLParser.h +++ b/lib/AsmParser/LLParser.h @@ -176,7 +176,8 @@ namespace llvm { return FMF; } - bool ParseOptionalToken(lltok::Kind T, bool &Present, LocTy *Loc = 0) { + bool ParseOptionalToken(lltok::Kind T, bool &Present, + LocTy *Loc = nullptr) { if (Lex.getKind() != T) { Present = false; } else { @@ -348,7 +349,7 @@ namespace llvm { PerFunctionState &PFS); // Constant Parsing. - bool ParseValID(ValID &ID, PerFunctionState *PFS = NULL); + bool ParseValID(ValID &ID, PerFunctionState *PFS = nullptr); bool ParseGlobalValue(Type *Ty, Constant *&V); bool ParseGlobalTypeAndValue(Constant *&V); bool ParseGlobalValueVector(SmallVectorImpl &Elts); @@ -371,6 +372,8 @@ namespace llvm { bool ParseFunctionBody(Function &Fn); bool ParseBasicBlock(PerFunctionState &PFS); + enum TailCallType { TCT_None, TCT_Tail, TCT_MustTail }; + // Instruction Parsing. Each instruction parsing routine can return with a // normal result, an error result, or return having eaten an extra comma. enum InstResult { InstNormal = 0, InstError = 1, InstExtraComma = 2 }; @@ -397,7 +400,8 @@ namespace llvm { bool ParseShuffleVector(Instruction *&I, PerFunctionState &PFS); int ParsePHI(Instruction *&I, PerFunctionState &PFS); bool ParseLandingPad(Instruction *&I, PerFunctionState &PFS); - bool ParseCall(Instruction *&I, PerFunctionState &PFS, bool isTail); + bool ParseCall(Instruction *&I, PerFunctionState &PFS, + CallInst::TailCallKind IsTail); int ParseAlloc(Instruction *&I, PerFunctionState &PFS); int ParseLoad(Instruction *&I, PerFunctionState &PFS); int ParseStore(Instruction *&I, PerFunctionState &PFS); diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index 532e896..b6b7d82 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -39,6 +39,8 @@ namespace lltok { kw_private, kw_internal, + kw_linker_private, // NOTE: deprecated, for parser compatibility + kw_linker_private_weak, // NOTE: deprecated, for parser compatibility kw_linkonce, kw_linkonce_odr, kw_weak, kw_weak_odr, kw_appending, kw_dllimport, kw_dllexport, kw_common, kw_available_externally, @@ -52,6 +54,7 @@ namespace lltok { kw_undef, kw_null, kw_to, kw_tail, + kw_musttail, kw_target, kw_triple, kw_unwind, @@ -85,7 +88,7 @@ namespace lltok { kw_cc, kw_ccc, kw_fastcc, kw_coldcc, kw_intel_ocl_bicc, - kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc, kw_x86_cdeclmethodcc, + kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc, kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc, kw_msp430_intrcc, kw_ptx_kernel, kw_ptx_device, @@ -114,6 +117,7 @@ namespace lltok { kw_noimplicitfloat, kw_noinline, kw_nonlazybind, + kw_nonnull, kw_noredzone, kw_noreturn, kw_nounwind, diff --git a/lib/AsmParser/Parser.cpp b/lib/AsmParser/Parser.cpp index a1da5e1..2606bc2 100644 --- a/lib/AsmParser/Parser.cpp +++ b/lib/AsmParser/Parser.cpp @@ -30,12 +30,12 @@ Module *llvm::ParseAssembly(MemoryBuffer *F, // If we are parsing into an existing module, do it. if (M) - return LLParser(F, SM, Err, M).Run() ? 0 : M; + return LLParser(F, SM, Err, M).Run() ? nullptr : M; // Otherwise create a new module. 
std::unique_ptr M2(new Module(F->getBufferIdentifier(), Context)); if (LLParser(F, SM, Err, M2.get()).Run()) - return 0; + return nullptr; return M2.release(); } @@ -45,10 +45,10 @@ Module *llvm::ParseAssemblyFile(const std::string &Filename, SMDiagnostic &Err, if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) { Err = SMDiagnostic(Filename, SourceMgr::DK_Error, "Could not open input file: " + ec.message()); - return 0; + return nullptr; } - return ParseAssembly(File.release(), 0, Err, Context); + return ParseAssembly(File.release(), nullptr, Err, Context); } Module *llvm::ParseAssemblyString(const char *AsmString, Module *M, diff --git a/lib/AsmParser/module.modulemap b/lib/AsmParser/module.modulemap new file mode 100644 index 0000000..cc300060 --- /dev/null +++ b/lib/AsmParser/module.modulemap @@ -0,0 +1 @@ +module AsmParser { requires cplusplus umbrella "." module * { export * } } diff --git a/lib/Bitcode/Reader/BitReader.cpp b/lib/Bitcode/Reader/BitReader.cpp index 3e360a8..716299f 100644 --- a/lib/Bitcode/Reader/BitReader.cpp +++ b/lib/Bitcode/Reader/BitReader.cpp @@ -35,7 +35,7 @@ LLVMBool LLVMParseBitcodeInContext(LLVMContextRef ContextRef, if (error_code EC = ModuleOrErr.getError()) { if (OutMessage) *OutMessage = strdup(EC.message().c_str()); - *OutModule = wrap((Module*)0); + *OutModule = wrap((Module*)nullptr); return 1; } @@ -55,7 +55,7 @@ LLVMBool LLVMGetBitcodeModuleInContext(LLVMContextRef ContextRef, getLazyBitcodeModule(unwrap(MemBuf), *unwrap(ContextRef)); if (error_code EC = ModuleOrErr.getError()) { - *OutM = wrap((Module *)NULL); + *OutM = wrap((Module *)nullptr); if (OutMessage) *OutMessage = strdup(EC.message().c_str()); return 1; diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index f712d9d..4170f98 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -41,7 +41,7 @@ void BitcodeReader::materializeForwardReferencedFunctions() { void BitcodeReader::FreeState() { if (BufferOwned) delete Buffer; - Buffer = 0; + Buffer = nullptr; std::vector().swap(TypeList); ValueList.clear(); MDValueList.clear(); @@ -258,7 +258,7 @@ void BitcodeReaderValueList::AssignValue(Value *V, unsigned Idx) { resize(Idx+1); WeakVH &OldV = ValuePtrs[Idx]; - if (OldV == 0) { + if (!OldV) { OldV = V; return; } @@ -298,12 +298,12 @@ Value *BitcodeReaderValueList::getValueFwdRef(unsigned Idx, Type *Ty) { resize(Idx + 1); if (Value *V = ValuePtrs[Idx]) { - assert((Ty == 0 || Ty == V->getType()) && "Type mismatch in value table!"); + assert((!Ty || Ty == V->getType()) && "Type mismatch in value table!"); return V; } // No type specified, must be invalid reference. - if (Ty == 0) return 0; + if (!Ty) return nullptr; // Create and return a placeholder, which will later be RAUW'd. Value *V = new Argument(Ty); @@ -403,7 +403,7 @@ void BitcodeReaderMDValueList::AssignValue(Value *V, unsigned Idx) { resize(Idx+1); WeakVH &OldV = MDValuePtrs[Idx]; - if (OldV == 0) { + if (!OldV) { OldV = V; return; } @@ -435,7 +435,7 @@ Value *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) { Type *BitcodeReader::getTypeByID(unsigned ID) { // The type table size is always specified correctly. 
if (ID >= TypeList.size()) - return 0; + return nullptr; if (Type *Ty = TypeList[ID]) return Ty; @@ -569,6 +569,8 @@ static Attribute::AttrKind GetAttrFromCode(uint64_t Code) { return Attribute::NoInline; case bitc::ATTR_KIND_NON_LAZY_BIND: return Attribute::NonLazyBind; + case bitc::ATTR_KIND_NON_NULL: + return Attribute::NonNull; case bitc::ATTR_KIND_NO_RED_ZONE: return Attribute::NoRedZone; case bitc::ATTR_KIND_NO_RETURN: @@ -737,7 +739,7 @@ error_code BitcodeReader::ParseTypeTableBody() { // Read a record. Record.clear(); - Type *ResultTy = 0; + Type *ResultTy = nullptr; switch (Stream.readRecord(Entry.ID, Record)) { default: return Error(InvalidValue); @@ -792,7 +794,7 @@ error_code BitcodeReader::ParseTypeTableBody() { if (Record.size() == 2) AddressSpace = Record[1]; ResultTy = getTypeByID(Record[0]); - if (ResultTy == 0) + if (!ResultTy) return Error(InvalidType); ResultTy = PointerType::get(ResultTy, AddressSpace); break; @@ -811,7 +813,7 @@ error_code BitcodeReader::ParseTypeTableBody() { } ResultTy = getTypeByID(Record[2]); - if (ResultTy == 0 || ArgTys.size() < Record.size()-3) + if (!ResultTy || ArgTys.size() < Record.size()-3) return Error(InvalidType); ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]); @@ -830,7 +832,7 @@ error_code BitcodeReader::ParseTypeTableBody() { } ResultTy = getTypeByID(Record[1]); - if (ResultTy == 0 || ArgTys.size() < Record.size()-2) + if (!ResultTy || ArgTys.size() < Record.size()-2) return Error(InvalidType); ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]); @@ -867,7 +869,7 @@ error_code BitcodeReader::ParseTypeTableBody() { StructType *Res = cast_or_null(TypeList[NumRecords]); if (Res) { Res->setName(TypeName); - TypeList[NumRecords] = 0; + TypeList[NumRecords] = nullptr; } else // Otherwise, create a new struct. Res = StructType::create(Context, TypeName); TypeName.clear(); @@ -896,7 +898,7 @@ error_code BitcodeReader::ParseTypeTableBody() { StructType *Res = cast_or_null(TypeList[NumRecords]); if (Res) { Res->setName(TypeName); - TypeList[NumRecords] = 0; + TypeList[NumRecords] = nullptr; } else // Otherwise, create a new struct with no body. 
Res = StructType::create(Context, TypeName); TypeName.clear(); @@ -924,7 +926,7 @@ error_code BitcodeReader::ParseTypeTableBody() { if (NumRecords >= TypeList.size()) return Error(InvalidTYPETable); assert(ResultTy && "Didn't read a type?"); - assert(TypeList[NumRecords] == 0 && "Already read type?"); + assert(!TypeList[NumRecords] && "Already read type?"); TypeList[NumRecords++] = ResultTy; } } @@ -972,7 +974,7 @@ error_code BitcodeReader::ParseValueSymbolTable() { if (ConvertToString(Record, 1, ValueName)) return Error(InvalidRecord); BasicBlock *BB = getBasicBlock(Record[0]); - if (BB == 0) + if (!BB) return Error(InvalidRecord); BB->setName(StringRef(ValueName.data(), ValueName.size())); @@ -1028,7 +1030,7 @@ error_code BitcodeReader::ParseMetadata() { NamedMDNode *NMD = TheModule->getOrInsertNamedMetadata(Name); for (unsigned i = 0; i != Size; ++i) { MDNode *MD = dyn_cast_or_null(MDValueList.getValueFwdRef(Record[i])); - if (MD == 0) + if (!MD) return Error(InvalidRecord); NMD->addOperand(MD); } @@ -1052,7 +1054,7 @@ error_code BitcodeReader::ParseMetadata() { else if (!Ty->isVoidTy()) Elts.push_back(ValueList.getValueFwdRef(Record[i+1], Ty)); else - Elts.push_back(NULL); + Elts.push_back(nullptr); } Value *V = MDNode::getWhenValsUnresolved(Context, Elts, IsFunctionLocal); IsFunctionLocal = false; @@ -1092,6 +1094,28 @@ uint64_t BitcodeReader::decodeSignRotatedValue(uint64_t V) { return 1ULL << 63; } +// FIXME: Delete this in LLVM 4.0 and just assert that the aliasee is a +// GlobalObject. +static GlobalObject & +getGlobalObjectInExpr(const DenseMap &Map, + Constant &C) { + auto *GO = dyn_cast(&C); + if (GO) + return *GO; + + auto *GA = dyn_cast(&C); + if (GA) + return getGlobalObjectInExpr(Map, *Map.find(GA)->second); + + auto &CE = cast(C); + assert(CE.getOpcode() == Instruction::BitCast || + CE.getOpcode() == Instruction::GetElementPtr || + CE.getOpcode() == Instruction::AddrSpaceCast); + if (CE.getOpcode() == Instruction::GetElementPtr) + assert(cast(CE).hasAllZeroIndices()); + return getGlobalObjectInExpr(Map, *CE.getOperand(0)); +} + /// ResolveGlobalAndAliasInits - Resolve all of the initializers for global /// values and aliases that we can. error_code BitcodeReader::ResolveGlobalAndAliasInits() { @@ -1117,19 +1141,30 @@ error_code BitcodeReader::ResolveGlobalAndAliasInits() { GlobalInitWorklist.pop_back(); } + // FIXME: Delete this in LLVM 4.0 + // Older versions of llvm could write an alias pointing to another. We cannot + // construct those aliases, so we first collect an alias to aliasee expression + // and then compute the actual aliasee. + DenseMap AliasInit; + while (!AliasInitWorklist.empty()) { unsigned ValID = AliasInitWorklist.back().second; if (ValID >= ValueList.size()) { AliasInits.push_back(AliasInitWorklist.back()); } else { if (Constant *C = dyn_cast_or_null(ValueList[ValID])) - AliasInitWorklist.back().first->setAliasee(C); + AliasInit.insert(std::make_pair(AliasInitWorklist.back().first, C)); else return Error(ExpectedConstant); } AliasInitWorklist.pop_back(); } + for (auto &Pair : AliasInit) { + auto &GO = getGlobalObjectInExpr(AliasInit, *Pair.second); + Pair.first->setAliasee(&GO); + } + while (!FunctionPrefixWorklist.empty()) { unsigned ValID = FunctionPrefixWorklist.back().second; if (ValID >= ValueList.size()) { @@ -1185,7 +1220,7 @@ error_code BitcodeReader::ParseConstants() { // Read a record. 
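
The new getGlobalObjectInExpr above exists because older bitcode may define an alias whose aliasee is itself an alias, which can no longer be constructed directly; the reader therefore records every alias/aliasee pair first and only then chases each chain. The same two-phase idea in miniature, with a plain struct standing in for GlobalAlias and Constant:

    #include <map>

    struct Def { const Def *AliasOf = nullptr; }; // sketch stand-in

    // Phase two: follow the recorded alias -> aliasee map down to a
    // definition that is not itself an alias.
    const Def &resolve(const std::map<const Def *, const Def *> &AliasInit,
                       const Def &D) {
      auto It = AliasInit.find(&D);
      return It == AliasInit.end() ? D : resolve(AliasInit, *It->second);
    }

Collecting the whole map before resolving matters: a chain member may appear in the worklist after the alias that points at it, so resolving eagerly could read a half-initialized aliasee.
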
Record.clear(); - Value *V = 0; + Value *V = nullptr; unsigned BitCode = Stream.readRecord(Entry.ID, Record); switch (BitCode) { default: // Default behavior: unknown constant @@ -1418,34 +1453,52 @@ error_code BitcodeReader::ParseConstants() { ValueList.getConstantFwdRef(Record[2],CurTy)); break; } - case bitc::CST_CODE_CE_EXTRACTELT: { // CE_EXTRACTELT: [opty, opval, opval] + case bitc::CST_CODE_CE_EXTRACTELT + : { // CE_EXTRACTELT: [opty, opval, opty, opval] if (Record.size() < 3) return Error(InvalidRecord); VectorType *OpTy = dyn_cast_or_null(getTypeByID(Record[0])); - if (OpTy == 0) + if (!OpTy) return Error(InvalidRecord); Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy); - Constant *Op1 = ValueList.getConstantFwdRef(Record[2], - Type::getInt32Ty(Context)); + Constant *Op1 = nullptr; + if (Record.size() == 4) { + Type *IdxTy = getTypeByID(Record[2]); + if (!IdxTy) + return Error(InvalidRecord); + Op1 = ValueList.getConstantFwdRef(Record[3], IdxTy); + } else // TODO: Remove with llvm 4.0 + Op1 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context)); + if (!Op1) + return Error(InvalidRecord); V = ConstantExpr::getExtractElement(Op0, Op1); break; } - case bitc::CST_CODE_CE_INSERTELT: { // CE_INSERTELT: [opval, opval, opval] + case bitc::CST_CODE_CE_INSERTELT + : { // CE_INSERTELT: [opval, opval, opty, opval] VectorType *OpTy = dyn_cast(CurTy); - if (Record.size() < 3 || OpTy == 0) + if (Record.size() < 3 || !OpTy) return Error(InvalidRecord); Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy); Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy->getElementType()); - Constant *Op2 = ValueList.getConstantFwdRef(Record[2], - Type::getInt32Ty(Context)); + Constant *Op2 = nullptr; + if (Record.size() == 4) { + Type *IdxTy = getTypeByID(Record[2]); + if (!IdxTy) + return Error(InvalidRecord); + Op2 = ValueList.getConstantFwdRef(Record[3], IdxTy); + } else // TODO: Remove with llvm 4.0 + Op2 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context)); + if (!Op2) + return Error(InvalidRecord); V = ConstantExpr::getInsertElement(Op0, Op1, Op2); break; } case bitc::CST_CODE_CE_SHUFFLEVEC: { // CE_SHUFFLEVEC: [opval, opval, opval] VectorType *OpTy = dyn_cast(CurTy); - if (Record.size() < 3 || OpTy == 0) + if (Record.size() < 3 || !OpTy) return Error(InvalidRecord); Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy); Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy); @@ -1459,7 +1512,7 @@ error_code BitcodeReader::ParseConstants() { VectorType *RTy = dyn_cast(CurTy); VectorType *OpTy = dyn_cast_or_null(getTypeByID(Record[0])); - if (Record.size() < 4 || RTy == 0 || OpTy == 0) + if (Record.size() < 4 || !RTy || !OpTy) return Error(InvalidRecord); Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy); Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy); @@ -1473,7 +1526,7 @@ error_code BitcodeReader::ParseConstants() { if (Record.size() < 4) return Error(InvalidRecord); Type *OpTy = getTypeByID(Record[0]); - if (OpTy == 0) + if (!OpTy) return Error(InvalidRecord); Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy); Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy); @@ -1538,11 +1591,11 @@ error_code BitcodeReader::ParseConstants() { if (Record.size() < 3) return Error(InvalidRecord); Type *FnTy = getTypeByID(Record[0]); - if (FnTy == 0) + if (!FnTy) return Error(InvalidRecord); Function *Fn = dyn_cast_or_null(ValueList.getConstantFwdRef(Record[1],FnTy)); - if (Fn == 0) + if (!Fn) return 
Error(InvalidRecord); // If the function is already parsed we can insert the block address right @@ -1561,7 +1614,7 @@ error_code BitcodeReader::ParseConstants() { GlobalVariable *FwdRef = new GlobalVariable(*Fn->getParent(), Type::getInt8Ty(Context), false, GlobalValue::InternalLinkage, - 0, ""); + nullptr, ""); BlockAddrFwdRefs[Fn].push_back(std::make_pair(Record[2], FwdRef)); V = FwdRef; } @@ -1649,8 +1702,11 @@ error_code BitcodeReader::GlobalCleanup() { // Look for global variables which need to be renamed. for (Module::global_iterator GI = TheModule->global_begin(), GE = TheModule->global_end(); - GI != GE; ++GI) - UpgradeGlobalVariable(GI); + GI != GE;) { + GlobalVariable *GV = GI++; + UpgradeGlobalVariable(GV); + } + // Force deallocation of memory for these vectors to favor the client that // want lazy deserialization. std::vector >().swap(GlobalInits); @@ -1838,7 +1894,9 @@ error_code BitcodeReader::ParseModule(bool Resume) { Section = SectionTable[Record[5]-1]; } GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility; - if (Record.size() > 6) + // Local linkage must have default visibility. + if (Record.size() > 6 && !GlobalValue::isLocalLinkage(Linkage)) + // FIXME: Change to an error if non-default in 4.0. Visibility = GetDecodedVisibility(Record[6]); GlobalVariable::ThreadLocalMode TLM = GlobalVariable::NotThreadLocal; @@ -1854,7 +1912,7 @@ error_code BitcodeReader::ParseModule(bool Resume) { ExternallyInitialized = Record[9]; GlobalVariable *NewGV = - new GlobalVariable(*TheModule, Ty, isConstant, Linkage, 0, "", 0, + new GlobalVariable(*TheModule, Ty, isConstant, Linkage, nullptr, "", nullptr, TLM, AddressSpace, ExternallyInitialized); NewGV->setAlignment(Alignment); if (!Section.empty()) @@ -1904,7 +1962,10 @@ error_code BitcodeReader::ParseModule(bool Resume) { return Error(InvalidID); Func->setSection(SectionTable[Record[6]-1]); } - Func->setVisibility(GetDecodedVisibility(Record[7])); + // Local linkage must have default visibility. + if (!Func->hasLocalLinkage()) + // FIXME: Change to an error if non-default in 4.0. + Func->setVisibility(GetDecodedVisibility(Record[7])); if (Record.size() > 8 && Record[8]) { if (Record[8]-1 > GCTable.size()) return Error(InvalidID); @@ -1940,13 +2001,17 @@ error_code BitcodeReader::ParseModule(bool Resume) { Type *Ty = getTypeByID(Record[0]); if (!Ty) return Error(InvalidRecord); - if (!Ty->isPointerTy()) + auto *PTy = dyn_cast(Ty); + if (!PTy) return Error(InvalidTypeForValue); - GlobalAlias *NewGA = new GlobalAlias(Ty, GetDecodedLinkage(Record[2]), - "", 0, TheModule); + auto *NewGA = + GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), + GetDecodedLinkage(Record[2]), "", TheModule); // Old bitcode files didn't have visibility field. - if (Record.size() > 3) + // Local linkage must have default visibility. + if (Record.size() > 3 && !NewGA->hasLocalLinkage()) + // FIXME: Change to an error if non-default in 4.0. 
NewGA->setVisibility(GetDecodedVisibility(Record[3])); if (Record.size() > 4) NewGA->setDLLStorageClass(GetDecodedDLLStorageClass(Record[4])); @@ -1969,7 +2034,7 @@ error_code BitcodeReader::ParseModule(bool Resume) { } error_code BitcodeReader::ParseBitcodeInto(Module *M) { - TheModule = 0; + TheModule = nullptr; if (error_code EC = InitStream()) return EC; @@ -2173,7 +2238,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { ValueList.push_back(I); unsigned NextValueNo = ValueList.size(); - BasicBlock *CurBB = 0; + BasicBlock *CurBB = nullptr; unsigned CurBBNo = 0; DebugLoc LastLoc; @@ -2222,7 +2287,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { // Read a record. Record.clear(); - Instruction *I = 0; + Instruction *I = nullptr; unsigned BitCode = Stream.readRecord(Entry.ID, Record); switch (BitCode) { default: // Default behavior: reject @@ -2240,7 +2305,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { case bitc::FUNC_CODE_DEBUG_LOC_AGAIN: // DEBUG_LOC_AGAIN // This record indicates that the last instruction is at the same // location as the previous instruction with a location. - I = 0; + I = nullptr; // Get the last instruction emitted. if (CurBB && !CurBB->empty()) @@ -2249,31 +2314,31 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { !FunctionBBs[CurBBNo-1]->empty()) I = &FunctionBBs[CurBBNo-1]->back(); - if (I == 0) + if (!I) return Error(InvalidRecord); I->setDebugLoc(LastLoc); - I = 0; + I = nullptr; continue; case bitc::FUNC_CODE_DEBUG_LOC: { // DEBUG_LOC: [line, col, scope, ia] - I = 0; // Get the last instruction emitted. + I = nullptr; // Get the last instruction emitted. if (CurBB && !CurBB->empty()) I = &CurBB->back(); else if (CurBBNo && FunctionBBs[CurBBNo-1] && !FunctionBBs[CurBBNo-1]->empty()) I = &FunctionBBs[CurBBNo-1]->back(); - if (I == 0 || Record.size() < 4) + if (!I || Record.size() < 4) return Error(InvalidRecord); unsigned Line = Record[0], Col = Record[1]; unsigned ScopeID = Record[2], IAID = Record[3]; - MDNode *Scope = 0, *IA = 0; + MDNode *Scope = nullptr, *IA = nullptr; if (ScopeID) Scope = cast(MDValueList.getValueFwdRef(ScopeID-1)); if (IAID) IA = cast(MDValueList.getValueFwdRef(IAID-1)); LastLoc = DebugLoc::get(Line, Col, Scope, IA); I->setDebugLoc(LastLoc); - I = 0; + I = nullptr; continue; } @@ -2333,9 +2398,9 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { Type *ResTy = getTypeByID(Record[OpNum]); int Opc = GetDecodedCastOpcode(Record[OpNum+1]); - if (Opc == -1 || ResTy == 0) + if (Opc == -1 || !ResTy) return Error(InvalidRecord); - Instruction *Temp = 0; + Instruction *Temp = nullptr; if ((I = UpgradeBitCastInst(Opc, Op, ResTy, Temp))) { if (Temp) { InstructionList.push_back(Temp); @@ -2460,7 +2525,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { unsigned OpNum = 0; Value *Vec, *Idx; if (getValueTypePair(Record, OpNum, NextValueNo, Vec) || - popValue(Record, OpNum, NextValueNo, Type::getInt32Ty(Context), Idx)) + getValueTypePair(Record, OpNum, NextValueNo, Idx)) return Error(InvalidRecord); I = ExtractElementInst::Create(Vec, Idx); InstructionList.push_back(I); @@ -2473,7 +2538,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { if (getValueTypePair(Record, OpNum, NextValueNo, Vec) || popValue(Record, OpNum, NextValueNo, cast(Vec->getType())->getElementType(), Elt) || - popValue(Record, OpNum, NextValueNo, Type::getInt32Ty(Context), Idx)) + getValueTypePair(Record, OpNum, NextValueNo, Idx)) return Error(InvalidRecord); I = InsertElementInst::Create(Vec, 
Elt, Idx); InstructionList.push_back(I); @@ -2526,7 +2591,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { } unsigned OpNum = 0; - Value *Op = NULL; + Value *Op = nullptr; if (getValueTypePair(Record, OpNum, NextValueNo, Op)) return Error(InvalidRecord); if (OpNum != Record.size()) @@ -2540,7 +2605,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { if (Record.size() != 1 && Record.size() != 3) return Error(InvalidRecord); BasicBlock *TrueDest = getBasicBlock(Record[0]); - if (TrueDest == 0) + if (!TrueDest) return Error(InvalidRecord); if (Record.size() == 1) { @@ -2551,7 +2616,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { BasicBlock *FalseDest = getBasicBlock(Record[1]); Value *Cond = getValue(Record, 2, NextValueNo, Type::getInt1Ty(Context)); - if (FalseDest == 0 || Cond == 0) + if (!FalseDest || !Cond) return Error(InvalidRecord); I = BranchInst::Create(TrueDest, FalseDest, Cond); InstructionList.push_back(I); @@ -2571,7 +2636,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { Value *Cond = getValue(Record, 2, NextValueNo, OpTy); BasicBlock *Default = getBasicBlock(Record[3]); - if (OpTy == 0 || Cond == 0 || Default == 0) + if (!OpTy || !Cond || !Default) return Error(InvalidRecord); unsigned NumCases = Record[4]; @@ -2628,7 +2693,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { Type *OpTy = getTypeByID(Record[0]); Value *Cond = getValue(Record, 1, NextValueNo, OpTy); BasicBlock *Default = getBasicBlock(Record[2]); - if (OpTy == 0 || Cond == 0 || Default == 0) + if (!OpTy || !Cond || !Default) return Error(InvalidRecord); unsigned NumCases = (Record.size()-3)/2; SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases); @@ -2637,7 +2702,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { ConstantInt *CaseVal = dyn_cast_or_null(getFnValueByID(Record[3+i*2], OpTy)); BasicBlock *DestBB = getBasicBlock(Record[1+3+i*2]); - if (CaseVal == 0 || DestBB == 0) { + if (!CaseVal || !DestBB) { delete SI; return Error(InvalidRecord); } @@ -2651,7 +2716,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { return Error(InvalidRecord); Type *OpTy = getTypeByID(Record[0]); Value *Address = getValue(Record, 1, NextValueNo, OpTy); - if (OpTy == 0 || Address == 0) + if (!OpTy || !Address) return Error(InvalidRecord); unsigned NumDests = Record.size()-2; IndirectBrInst *IBI = IndirectBrInst::Create(Address, NumDests); @@ -2683,11 +2748,11 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { return Error(InvalidRecord); PointerType *CalleeTy = dyn_cast(Callee->getType()); - FunctionType *FTy = !CalleeTy ? 0 : + FunctionType *FTy = !CalleeTy ? nullptr : dyn_cast(CalleeTy->getElementType()); // Check that the right number of fixed parameters are here. 
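
The CE_EXTRACTELT and CE_INSERTELT constant hunks further above, and the matching ExtractElementInst/InsertElementInst reader changes just above, stop hard-coding i32 for vector indices: new records carry the index's own type, while the old three-field form is still accepted with an implied i32. A sketch of that record-size dispatch, assuming the record is a plain vector of field IDs:

    #include <cstdint>
    #include <vector>

    struct DecodedExtract { uint64_t VecTyID, VecValID, IdxTyID, IdxValID; };

    bool decodeExtractElt(const std::vector<uint64_t> &Record,
                          uint64_t ImpliedInt32TyID, DecodedExtract &Out) {
      if (Record.size() == 4) { // new form: [opty, opval, idxty, idxval]
        Out = {Record[0], Record[1], Record[2], Record[3]};
        return true;
      }
      if (Record.size() == 3) { // legacy form: index type is the implied i32
        Out = {Record[0], Record[1], ImpliedInt32TyID, Record[2]};
        return true;
      }
      return false; // malformed record
    }
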
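
Separately, the GlobalCleanup hunk further above switches from calling UpgradeGlobalVariable(GI) in a plain loop to advancing the iterator first, since the upgrade may replace or remove the global it is handed. The same erase-safe idiom in isolation, with std::list standing in for the module's global list:

    #include <functional>
    #include <list>

    // Upgrade may erase the element it is given; the loop stays valid
    // because the iterator was advanced before the call.
    void upgradeAll(std::list<int> &Globals,
                    const std::function<void(std::list<int>::iterator)> &Upgrade) {
      for (auto I = Globals.begin(), E = Globals.end(); I != E;) {
        auto Cur = I++;
        Upgrade(Cur);
      }
    }
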
- if (FTy == 0 || NormalBB == 0 || UnwindBB == 0 || + if (!FTy || !NormalBB || !UnwindBB || Record.size() < OpNum+FTy->getNumParams()) return Error(InvalidRecord); @@ -2695,7 +2760,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) { Ops.push_back(getValue(Record, OpNum, NextValueNo, FTy->getParamType(i))); - if (Ops.back() == 0) + if (!Ops.back()) return Error(InvalidRecord); } @@ -2721,7 +2786,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { } case bitc::FUNC_CODE_INST_RESUME: { // RESUME: [opval] unsigned Idx = 0; - Value *Val = 0; + Value *Val = nullptr; if (getValueTypePair(Record, Idx, NextValueNo, Val)) return Error(InvalidRecord); I = ResumeInst::Create(Val); @@ -2768,7 +2833,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { Type *Ty = getTypeByID(Record[Idx++]); if (!Ty) return Error(InvalidRecord); - Value *PersFn = 0; + Value *PersFn = nullptr; if (getValueTypePair(Record, Idx, NextValueNo, PersFn)) return Error(InvalidRecord); @@ -2961,7 +3026,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { return Error(InvalidRecord); PointerType *OpTy = dyn_cast(Callee->getType()); - FunctionType *FTy = 0; + FunctionType *FTy = nullptr; if (OpTy) FTy = dyn_cast(OpTy->getElementType()); if (!FTy || Record.size() < FTy->getNumParams()+OpNum) return Error(InvalidRecord); @@ -2974,7 +3039,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { else Args.push_back(getValue(Record, OpNum, NextValueNo, FTy->getParamType(i))); - if (Args.back() == 0) + if (!Args.back()) return Error(InvalidRecord); } @@ -2994,8 +3059,13 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { I = CallInst::Create(Callee, Args); InstructionList.push_back(I); cast(I)->setCallingConv( - static_cast(CCInfo>>1)); - cast(I)->setTailCall(CCInfo & 1); + static_cast((~(1U << 14) & CCInfo) >> 1)); + CallInst::TailCallKind TCK = CallInst::TCK_None; + if (CCInfo & 1) + TCK = CallInst::TCK_Tail; + if (CCInfo & (1 << 14)) + TCK = CallInst::TCK_MustTail; + cast(I)->setTailCallKind(TCK); cast(I)->setAttributes(PAL); break; } @@ -3015,7 +3085,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { // Add instruction to end of current BB. If there is no current BB, reject // this file. - if (CurBB == 0) { + if (!CurBB) { delete I; return Error(InvalidInstructionWithNoBB); } @@ -3024,7 +3094,7 @@ error_code BitcodeReader::ParseFunctionBody(Function *F) { // If this was a terminator instruction, move to the next block. if (isa(I)) { ++CurBBNo; - CurBB = CurBBNo < FunctionBBs.size() ? FunctionBBs[CurBBNo] : 0; + CurBB = CurBBNo < FunctionBBs.size() ? FunctionBBs[CurBBNo] : nullptr; } // Non-void values get registered in the value table for future use. @@ -3036,10 +3106,10 @@ OutOfRecordLoop: // Check the function list for unresolved values. if (Argument *A = dyn_cast(ValueList.back())) { - if (A->getParent() == 0) { + if (!A->getParent()) { // We found at least one unresolved value. Nuke them all to avoid leaks. for (unsigned i = ModuleValueListSize, e = ValueList.size(); i != e; ++i){ - if ((A = dyn_cast_or_null(ValueList[i])) && A->getParent() == 0) { + if ((A = dyn_cast_or_null(ValueList[i])) && !A->getParent()) { A->replaceAllUsesWith(UndefValue::get(A->getType())); delete A; } @@ -3348,7 +3418,7 @@ Module *llvm::getStreamedBitcodeModule(const std::string &name, if (ErrMsg) *ErrMsg = EC.message(); delete M; // Also deletes R. 
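
The FUNC_CODE_INST_CALL hunk above widens the CCInfo field: bit 0 still means tail, bit 14 now means musttail, and the calling convention occupies the bits in between, which is why the convention is recovered by masking bit 14 out before shifting. A sketch of the decoder with plain integers in place of CallInst:

    #include <cstdint>

    enum class TailKind { None, Tail, MustTail };
    struct CallFlags { unsigned CallingConv; TailKind Kind; };

    CallFlags decodeCCInfo(uint64_t CCInfo) {
      CallFlags F;
      F.CallingConv = static_cast<unsigned>((~(1ULL << 14) & CCInfo) >> 1);
      F.Kind = TailKind::None;
      if (CCInfo & 1)            F.Kind = TailKind::Tail;
      if (CCInfo & (1ULL << 14)) F.Kind = TailKind::MustTail;
      return F;
    }

A reader without this change would fold bit 14 into the shifted calling convention, which suggests the encoding is only safe to emit once consumers have been updated.
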
- return 0; + return nullptr; } R->setBufferOwned(false); // no buffer to delete return M; diff --git a/lib/Bitcode/Reader/BitcodeReader.h b/lib/Bitcode/Reader/BitcodeReader.h index 15be31f..593d8f9 100644 --- a/lib/Bitcode/Reader/BitcodeReader.h +++ b/lib/Bitcode/Reader/BitcodeReader.h @@ -224,13 +224,13 @@ public: } explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C) - : Context(C), TheModule(0), Buffer(buffer), BufferOwned(false), - LazyStreamer(0), NextUnreadBit(0), SeenValueSymbolTable(false), + : Context(C), TheModule(nullptr), Buffer(buffer), BufferOwned(false), + LazyStreamer(nullptr), NextUnreadBit(0), SeenValueSymbolTable(false), ValueList(C), MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false) { } explicit BitcodeReader(DataStreamer *streamer, LLVMContext &C) - : Context(C), TheModule(0), Buffer(0), BufferOwned(false), + : Context(C), TheModule(nullptr), Buffer(nullptr), BufferOwned(false), LazyStreamer(streamer), NextUnreadBit(0), SeenValueSymbolTable(false), ValueList(C), MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false) { @@ -271,7 +271,7 @@ private: return ValueList.getValueFwdRef(ID, Ty); } BasicBlock *getBasicBlock(unsigned ID) const { - if (ID >= FunctionBBs.size()) return 0; // Invalid ID + if (ID >= FunctionBBs.size()) return nullptr; // Invalid ID return FunctionBBs[ID]; } AttributeSet getAttributes(unsigned i) const { @@ -293,15 +293,15 @@ private: if (ValNo < InstNum) { // If this is not a forward reference, just return the value we already // have. - ResVal = getFnValueByID(ValNo, 0); - return ResVal == 0; + ResVal = getFnValueByID(ValNo, nullptr); + return ResVal == nullptr; } else if (Slot == Record.size()) { return true; } unsigned TypeNo = (unsigned)Record[Slot++]; ResVal = getFnValueByID(ValNo, getTypeByID(TypeNo)); - return ResVal == 0; + return ResVal == nullptr; } /// popValue - Read a value out of the specified record from slot 'Slot'. @@ -320,14 +320,14 @@ private: bool getValue(SmallVectorImpl &Record, unsigned Slot, unsigned InstNum, Type *Ty, Value *&ResVal) { ResVal = getValue(Record, Slot, InstNum, Ty); - return ResVal == 0; + return ResVal == nullptr; } /// getValue -- Version of getValue that returns ResVal directly, /// or 0 if there is an error. Value *getValue(SmallVectorImpl &Record, unsigned Slot, unsigned InstNum, Type *Ty) { - if (Slot == Record.size()) return 0; + if (Slot == Record.size()) return nullptr; unsigned ValNo = (unsigned)Record[Slot]; // Adjust the ValNo, if it was encoded relative to the InstNum. if (UseRelativeIDs) @@ -338,7 +338,7 @@ private: /// getValueSigned -- Like getValue, but decodes signed VBRs. Value *getValueSigned(SmallVectorImpl &Record, unsigned Slot, unsigned InstNum, Type *Ty) { - if (Slot == Record.size()) return 0; + if (Slot == Record.size()) return nullptr; unsigned ValNo = (unsigned)decodeSignRotatedValue(Record[Slot]); // Adjust the ValNo, if it was encoded relative to the InstNum. if (UseRelativeIDs) diff --git a/lib/Bitcode/Reader/BitstreamReader.cpp b/lib/Bitcode/Reader/BitstreamReader.cpp index 1fd9abd..f31e1fa 100644 --- a/lib/Bitcode/Reader/BitstreamReader.cpp +++ b/lib/Bitcode/Reader/BitstreamReader.cpp @@ -315,7 +315,7 @@ bool BitstreamCursor::ReadBlockInfoBlock() { if (EnterSubBlock(bitc::BLOCKINFO_BLOCK_ID)) return true; SmallVector Record; - BitstreamReader::BlockInfo *CurBlockInfo = 0; + BitstreamReader::BlockInfo *CurBlockInfo = nullptr; // Read all the records for this module. 
while (1) { diff --git a/lib/Bitcode/Writer/BitWriter.cpp b/lib/Bitcode/Writer/BitWriter.cpp index 0275f96..3747122 100644 --- a/lib/Bitcode/Writer/BitWriter.cpp +++ b/lib/Bitcode/Writer/BitWriter.cpp @@ -10,6 +10,7 @@ #include "llvm-c/BitWriter.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/IR/Module.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 5d1dac1..cc73b84 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -197,6 +197,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_NO_INLINE; case Attribute::NonLazyBind: return bitc::ATTR_KIND_NON_LAZY_BIND; + case Attribute::NonNull: + return bitc::ATTR_KIND_NON_NULL; case Attribute::NoRedZone: return bitc::ATTR_KIND_NO_RED_ZONE; case Attribute::NoReturn: @@ -474,8 +476,8 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) { Stream.ExitBlock(); } -static unsigned getEncodedLinkage(const GlobalValue *GV) { - switch (GV->getLinkage()) { +static unsigned getEncodedLinkage(const GlobalValue &GV) { + switch (GV.getLinkage()) { case GlobalValue::ExternalLinkage: return 0; case GlobalValue::WeakAnyLinkage: return 1; case GlobalValue::AppendingLinkage: return 2; @@ -491,8 +493,8 @@ static unsigned getEncodedLinkage(const GlobalValue *GV) { llvm_unreachable("Invalid linkage"); } -static unsigned getEncodedVisibility(const GlobalValue *GV) { - switch (GV->getVisibility()) { +static unsigned getEncodedVisibility(const GlobalValue &GV) { + switch (GV.getVisibility()) { case GlobalValue::DefaultVisibility: return 0; case GlobalValue::HiddenVisibility: return 1; case GlobalValue::ProtectedVisibility: return 2; @@ -500,8 +502,8 @@ static unsigned getEncodedVisibility(const GlobalValue *GV) { llvm_unreachable("Invalid visibility"); } -static unsigned getEncodedDLLStorageClass(const GlobalValue *GV) { - switch (GV->getDLLStorageClass()) { +static unsigned getEncodedDLLStorageClass(const GlobalValue &GV) { + switch (GV.getDLLStorageClass()) { case GlobalValue::DefaultStorageClass: return 0; case GlobalValue::DLLImportStorageClass: return 1; case GlobalValue::DLLExportStorageClass: return 2; @@ -509,8 +511,8 @@ static unsigned getEncodedDLLStorageClass(const GlobalValue *GV) { llvm_unreachable("Invalid DLL storage class"); } -static unsigned getEncodedThreadLocalMode(const GlobalVariable *GV) { - switch (GV->getThreadLocalMode()) { +static unsigned getEncodedThreadLocalMode(const GlobalVariable &GV) { + switch (GV.getThreadLocalMode()) { case GlobalVariable::NotThreadLocal: return 0; case GlobalVariable::GeneralDynamicTLSModel: return 1; case GlobalVariable::LocalDynamicTLSModel: return 2; @@ -541,36 +543,35 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, std::map GCMap; unsigned MaxAlignment = 0; unsigned MaxGlobalType = 0; - for (Module::const_global_iterator GV = M->global_begin(),E = M->global_end(); - GV != E; ++GV) { - MaxAlignment = std::max(MaxAlignment, GV->getAlignment()); - MaxGlobalType = std::max(MaxGlobalType, VE.getTypeID(GV->getType())); - if (GV->hasSection()) { + for (const GlobalValue &GV : M->globals()) { + MaxAlignment = std::max(MaxAlignment, GV.getAlignment()); + MaxGlobalType = std::max(MaxGlobalType, VE.getTypeID(GV.getType())); + if (GV.hasSection()) { // Give section names unique ID's. 
- unsigned &Entry = SectionMap[GV->getSection()]; + unsigned &Entry = SectionMap[GV.getSection()]; if (!Entry) { - WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, GV->getSection(), + WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, GV.getSection(), 0/*TODO*/, Stream); Entry = SectionMap.size(); } } } - for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) { - MaxAlignment = std::max(MaxAlignment, F->getAlignment()); - if (F->hasSection()) { + for (const Function &F : *M) { + MaxAlignment = std::max(MaxAlignment, F.getAlignment()); + if (F.hasSection()) { // Give section names unique ID's. - unsigned &Entry = SectionMap[F->getSection()]; + unsigned &Entry = SectionMap[F.getSection()]; if (!Entry) { - WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, F->getSection(), + WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, F.getSection(), 0/*TODO*/, Stream); Entry = SectionMap.size(); } } - if (F->hasGC()) { + if (F.hasGC()) { // Same for GC names. - unsigned &Entry = GCMap[F->getGC()]; + unsigned &Entry = GCMap[F.getGC()]; if (!Entry) { - WriteStringRecord(bitc::MODULE_CODE_GCNAME, F->getGC(), + WriteStringRecord(bitc::MODULE_CODE_GCNAME, F.getGC(), 0/*TODO*/, Stream); Entry = GCMap.size(); } @@ -606,28 +607,27 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, // Emit the global variable information. SmallVector Vals; - for (Module::const_global_iterator GV = M->global_begin(),E = M->global_end(); - GV != E; ++GV) { + for (const GlobalVariable &GV : M->globals()) { unsigned AbbrevToUse = 0; // GLOBALVAR: [type, isconst, initid, // linkage, alignment, section, visibility, threadlocal, // unnamed_addr, externally_initialized, dllstorageclass] - Vals.push_back(VE.getTypeID(GV->getType())); - Vals.push_back(GV->isConstant()); - Vals.push_back(GV->isDeclaration() ? 0 : - (VE.getValueID(GV->getInitializer()) + 1)); + Vals.push_back(VE.getTypeID(GV.getType())); + Vals.push_back(GV.isConstant()); + Vals.push_back(GV.isDeclaration() ? 0 : + (VE.getValueID(GV.getInitializer()) + 1)); Vals.push_back(getEncodedLinkage(GV)); - Vals.push_back(Log2_32(GV->getAlignment())+1); - Vals.push_back(GV->hasSection() ? SectionMap[GV->getSection()] : 0); - if (GV->isThreadLocal() || - GV->getVisibility() != GlobalValue::DefaultVisibility || - GV->hasUnnamedAddr() || GV->isExternallyInitialized() || - GV->getDLLStorageClass() != GlobalValue::DefaultStorageClass) { + Vals.push_back(Log2_32(GV.getAlignment())+1); + Vals.push_back(GV.hasSection() ? SectionMap[GV.getSection()] : 0); + if (GV.isThreadLocal() || + GV.getVisibility() != GlobalValue::DefaultVisibility || + GV.hasUnnamedAddr() || GV.isExternallyInitialized() || + GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass) { Vals.push_back(getEncodedVisibility(GV)); Vals.push_back(getEncodedThreadLocalMode(GV)); - Vals.push_back(GV->hasUnnamedAddr()); - Vals.push_back(GV->isExternallyInitialized()); + Vals.push_back(GV.hasUnnamedAddr()); + Vals.push_back(GV.isExternallyInitialized()); Vals.push_back(getEncodedDLLStorageClass(GV)); } else { AbbrevToUse = SimpleGVarAbbrev; @@ -638,20 +638,20 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, } // Emit the function proto information. 
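
getEncodedLinkage, getEncodedVisibility, and the related helpers further above now take the GlobalValue by const reference rather than pointer, since a null argument was never meaningful there. The explicit switch each of them keeps is the usual stable-numbering pattern for serialization: the on-disk record values must stay fixed even if the in-memory enum is reordered. A sketch with an illustrative linkage enum (not LLVM's):

    enum class Linkage { External, WeakAny, Appending, Internal };

    unsigned encodeLinkage(Linkage L) {
      switch (L) {                        // numbers are part of the file format,
      case Linkage::External:  return 0;  // so never derive them from the enum
      case Linkage::WeakAny:   return 1;
      case Linkage::Appending: return 2;
      case Linkage::Internal:  return 3;
      }
      return ~0u; // unreachable for valid input
    }
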
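
The section-name and GC-name loops above intern each string into a 1-based ID, using the map entry's default-constructed zero as the "not yet emitted" flag; the string record is written only on first sight. The idiom in isolation, with std::map standing in for the writer's map:

    #include <map>
    #include <string>

    unsigned internName(std::map<std::string, unsigned> &Table,
                        const std::string &Name) {
      unsigned &Entry = Table[Name]; // default-constructs to 0 on first use
      if (!Entry)                    // the real code emits the name record here
        Entry = static_cast<unsigned>(Table.size()); // 1-based: size() counts Name
      return Entry;
    }
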
- for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) { + for (const Function &F : *M) { // FUNCTION: [type, callingconv, isproto, linkage, paramattrs, alignment, // section, visibility, gc, unnamed_addr, prefix] - Vals.push_back(VE.getTypeID(F->getType())); - Vals.push_back(F->getCallingConv()); - Vals.push_back(F->isDeclaration()); + Vals.push_back(VE.getTypeID(F.getType())); + Vals.push_back(F.getCallingConv()); + Vals.push_back(F.isDeclaration()); Vals.push_back(getEncodedLinkage(F)); - Vals.push_back(VE.getAttributeID(F->getAttributes())); - Vals.push_back(Log2_32(F->getAlignment())+1); - Vals.push_back(F->hasSection() ? SectionMap[F->getSection()] : 0); + Vals.push_back(VE.getAttributeID(F.getAttributes())); + Vals.push_back(Log2_32(F.getAlignment())+1); + Vals.push_back(F.hasSection() ? SectionMap[F.getSection()] : 0); Vals.push_back(getEncodedVisibility(F)); - Vals.push_back(F->hasGC() ? GCMap[F->getGC()] : 0); - Vals.push_back(F->hasUnnamedAddr()); - Vals.push_back(F->hasPrefixData() ? (VE.getValueID(F->getPrefixData()) + 1) + Vals.push_back(F.hasGC() ? GCMap[F.getGC()] : 0); + Vals.push_back(F.hasUnnamedAddr()); + Vals.push_back(F.hasPrefixData() ? (VE.getValueID(F.getPrefixData()) + 1) : 0); Vals.push_back(getEncodedDLLStorageClass(F)); @@ -661,14 +661,13 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, } // Emit the alias information. - for (Module::const_alias_iterator AI = M->alias_begin(), E = M->alias_end(); - AI != E; ++AI) { + for (const GlobalAlias &A : M->aliases()) { // ALIAS: [alias type, aliasee val#, linkage, visibility] - Vals.push_back(VE.getTypeID(AI->getType())); - Vals.push_back(VE.getValueID(AI->getAliasee())); - Vals.push_back(getEncodedLinkage(AI)); - Vals.push_back(getEncodedVisibility(AI)); - Vals.push_back(getEncodedDLLStorageClass(AI)); + Vals.push_back(VE.getTypeID(A.getType())); + Vals.push_back(VE.getValueID(A.getAliasee())); + Vals.push_back(getEncodedLinkage(A)); + Vals.push_back(getEncodedVisibility(A)); + Vals.push_back(getEncodedDLLStorageClass(A)); unsigned AbbrevToUse = 0; Stream.EmitRecord(bitc::MODULE_CODE_ALIAS, Vals, AbbrevToUse); Vals.clear(); @@ -917,7 +916,7 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal, SmallVector Record; const ValueEnumerator::ValueList &Vals = VE.getValues(); - Type *LastTy = 0; + Type *LastTy = nullptr; for (unsigned i = FirstVal; i != LastVal; ++i) { const Value *V = Vals[i].first; // If we need to switch types, do so now. 
@@ -1087,12 +1086,14 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal, Code = bitc::CST_CODE_CE_EXTRACTELT; Record.push_back(VE.getTypeID(C->getOperand(0)->getType())); Record.push_back(VE.getValueID(C->getOperand(0))); + Record.push_back(VE.getTypeID(C->getOperand(1)->getType())); Record.push_back(VE.getValueID(C->getOperand(1))); break; case Instruction::InsertElement: Code = bitc::CST_CODE_CE_INSERTELT; Record.push_back(VE.getValueID(C->getOperand(0))); Record.push_back(VE.getValueID(C->getOperand(1))); + Record.push_back(VE.getTypeID(C->getOperand(2)->getType())); Record.push_back(VE.getValueID(C->getOperand(2))); break; case Instruction::ShuffleVector: @@ -1253,13 +1254,13 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, case Instruction::ExtractElement: Code = bitc::FUNC_CODE_INST_EXTRACTELT; PushValueAndType(I.getOperand(0), InstID, Vals, VE); - pushValue(I.getOperand(1), InstID, Vals, VE); + PushValueAndType(I.getOperand(1), InstID, Vals, VE); break; case Instruction::InsertElement: Code = bitc::FUNC_CODE_INST_INSERTELT; PushValueAndType(I.getOperand(0), InstID, Vals, VE); pushValue(I.getOperand(1), InstID, Vals, VE); - pushValue(I.getOperand(2), InstID, Vals, VE); + PushValueAndType(I.getOperand(2), InstID, Vals, VE); break; case Instruction::ShuffleVector: Code = bitc::FUNC_CODE_INST_SHUFFLEVEC; @@ -1469,7 +1470,8 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, Code = bitc::FUNC_CODE_INST_CALL; Vals.push_back(VE.getAttributeID(CI.getAttributes())); - Vals.push_back((CI.getCallingConv() << 1) | unsigned(CI.isTailCall())); + Vals.push_back((CI.getCallingConv() << 1) | unsigned(CI.isTailCall()) | + unsigned(CI.isMustTailCall()) << 14); PushValueAndType(CI.getCalledValue(), InstID, Vals, VE); // Callee // Emit value #'s for the fixed parameters. diff --git a/lib/Bitcode/module.modulemap b/lib/Bitcode/module.modulemap new file mode 100644 index 0000000..7df1a0a --- /dev/null +++ b/lib/Bitcode/module.modulemap @@ -0,0 +1 @@ +module Bitcode { requires cplusplus umbrella "." module * { export * } } diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 25c438c..0f38c64 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "post-RA-sched" #include "AggressiveAntiDepBreaker.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -29,6 +28,8 @@ #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "post-RA-sched" + // If DebugDiv > 0 then only break antidep with (ID % DebugDiv) == DebugMod static cl::opt DebugDiv("agg-antidep-debugdiv", @@ -121,7 +122,7 @@ AggressiveAntiDepBreaker(MachineFunction& MFi, TII(MF.getTarget().getInstrInfo()), TRI(MF.getTarget().getRegisterInfo()), RegClassInfo(RCI), - State(NULL) { + State(nullptr) { /* Collect a bitset of all registers that are only broken if they are on the critical path. 
*/ for (unsigned i = 0, e = CriticalPathRCs.size(); i < e; ++i) { @@ -144,7 +145,7 @@ AggressiveAntiDepBreaker::~AggressiveAntiDepBreaker() { } void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { - assert(State == NULL); + assert(!State); State = new AggressiveAntiDepState(TRI->getNumRegs(), BB); bool IsReturnBlock = (!BB->empty() && BB->back().isReturn()); @@ -169,7 +170,7 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { // callee-saved register that is not saved in the prolog. const MachineFrameInfo *MFI = MF.getFrameInfo(); BitVector Pristine = MFI->getPristineRegs(BB); - for (const uint16_t *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { + for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { unsigned Reg = *I; if (!IsReturnBlock && !Pristine.test(Reg)) continue; for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { @@ -183,7 +184,7 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { void AggressiveAntiDepBreaker::FinishBlock() { delete State; - State = NULL; + State = nullptr; } void AggressiveAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count, @@ -230,13 +231,13 @@ bool AggressiveAntiDepBreaker::IsImplicitDefUse(MachineInstr *MI, if (Reg == 0) return false; - MachineOperand *Op = NULL; + MachineOperand *Op = nullptr; if (MO.isDef()) Op = MI->findRegisterUseOperand(Reg, true); else Op = MI->findRegisterDefOperand(Reg); - return((Op != NULL) && Op->isImplicit()); + return(Op && Op->isImplicit()); } void AggressiveAntiDepBreaker::GetPassthruRegs(MachineInstr *MI, @@ -273,10 +274,10 @@ static void AntiDepEdges(const SUnit *SU, std::vector& Edges) { /// CriticalPathStep - Return the next SUnit after SU on the bottom-up /// critical path. static const SUnit *CriticalPathStep(const SUnit *SU) { - const SDep *Next = 0; + const SDep *Next = nullptr; unsigned NextDepth = 0; // Find the predecessor edge with the greatest depth. - if (SU != 0) { + if (SU) { for (SUnit::const_pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end(); P != PE; ++P) { const SUnit *PredSU = P->getSUnit(); @@ -292,7 +293,7 @@ static const SUnit *CriticalPathStep(const SUnit *SU) { } } - return (Next) ? Next->getSUnit() : 0; + return (Next) ? Next->getSUnit() : nullptr; } void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx, @@ -309,8 +310,8 @@ void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx, DefIndices[Reg] = ~0u; RegRefs.erase(Reg); State->LeaveGroup(Reg); - DEBUG(if (header != NULL) { - dbgs() << header << TRI->getName(Reg); header = NULL; }); + DEBUG(if (header) { + dbgs() << header << TRI->getName(Reg); header = nullptr; }); DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << tag); } // Repeat for subregisters. @@ -321,14 +322,14 @@ void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx, DefIndices[SubregReg] = ~0u; RegRefs.erase(SubregReg); State->LeaveGroup(SubregReg); - DEBUG(if (header != NULL) { - dbgs() << header << TRI->getName(Reg); header = NULL; }); + DEBUG(if (header) { + dbgs() << header << TRI->getName(Reg); header = nullptr; }); DEBUG(dbgs() << " " << TRI->getName(SubregReg) << "->g" << State->GetGroup(SubregReg) << tag); } } - DEBUG(if ((header == NULL) && (footer != NULL)) dbgs() << footer); + DEBUG(if (!header && footer) dbgs() << footer); } void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, @@ -382,7 +383,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI, } // Note register reference... 
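
The FUNC_CODE_INST_CALL writer hunk in BitcodeWriter.cpp above is the counterpart of the reader change earlier: bit 0 comes from isTailCall(), bit 14 from isMustTailCall(), and the calling convention is shifted left by one. A sketch of the encoder, again with plain values standing in for the CallInst queries:

    #include <cstdint>

    uint64_t encodeCCInfo(unsigned CallingConv, bool IsTail, bool IsMustTail) {
      return (static_cast<uint64_t>(CallingConv) << 1) |
             (IsTail ? 1u : 0u) |
             (static_cast<uint64_t>(IsMustTail) << 14);
    }
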
- const TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC = nullptr; if (i < MI->getDesc().getNumOperands()) RC = TII->getRegClass(MI->getDesc(), i, TRI, MF); AggressiveAntiDepState::RegisterReference RR = { &MO, RC }; @@ -466,7 +467,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI, } // Note register reference... - const TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC = nullptr; if (i < MI->getDesc().getNumOperands()) RC = TII->getRegClass(MI->getDesc(), i, TRI, MF); AggressiveAntiDepState::RegisterReference RR = { &MO, RC }; @@ -516,7 +517,7 @@ BitVector AggressiveAntiDepBreaker::GetRenameRegisters(unsigned Reg) { AggressiveAntiDepState::RegisterReference>::iterator Q = Range.first, QE = Range.second; Q != QE; ++Q) { const TargetRegisterClass *RC = Q->second.RC; - if (RC == NULL) continue; + if (!RC) continue; BitVector RCBV = TRI->getAllocatableSet(MF, RC); if (first) { @@ -734,8 +735,8 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( // Track progress along the critical path through the SUnit graph as // we walk the instructions. This is needed for regclasses that only // break critical-path anti-dependencies. - const SUnit *CriticalPathSU = 0; - MachineInstr *CriticalPathMI = 0; + const SUnit *CriticalPathSU = nullptr; + MachineInstr *CriticalPathMI = nullptr; if (CriticalPathSet.any()) { for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { const SUnit *SU = &SUnits[i]; @@ -788,10 +789,10 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( // If MI is not on the critical path, then we don't rename // registers in the CriticalPathSet. - BitVector *ExcludeRegs = NULL; + BitVector *ExcludeRegs = nullptr; if (MI == CriticalPathMI) { CriticalPathSU = CriticalPathStep(CriticalPathSU); - CriticalPathMI = (CriticalPathSU) ? CriticalPathSU->getInstr() : 0; + CriticalPathMI = (CriticalPathSU) ? CriticalPathSU->getInstr() : nullptr; } else if (CriticalPathSet.any()) { ExcludeRegs = &CriticalPathSet; } @@ -815,7 +816,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( // Don't break anti-dependencies on non-allocatable registers. 
DEBUG(dbgs() << " (non-allocatable)\n"); continue; - } else if ((ExcludeRegs != NULL) && ExcludeRegs->test(AntiDepReg)) { + } else if (ExcludeRegs && ExcludeRegs->test(AntiDepReg)) { // Don't break anti-dependencies for critical path registers // if not on the critical path DEBUG(dbgs() << " (not critical-path)\n"); @@ -829,9 +830,8 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies( } else { // No anti-dep breaking for implicit deps MachineOperand *AntiDepOp = MI->findRegisterDefOperand(AntiDepReg); - assert(AntiDepOp != NULL && - "Can't find index for defined register operand"); - if ((AntiDepOp == NULL) || AntiDepOp->isImplicit()) { + assert(AntiDepOp && "Can't find index for defined register operand"); + if (!AntiDepOp || AntiDepOp->isImplicit()) { DEBUG(dbgs() << " (implicit)\n"); continue; } diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.h b/lib/CodeGen/AggressiveAntiDepBreaker.h index 29b6a10..2ab9d89 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.h +++ b/lib/CodeGen/AggressiveAntiDepBreaker.h @@ -170,7 +170,8 @@ class RegisterClassInfo; void GetPassthruRegs(MachineInstr *MI, std::set& PassthruRegs); void HandleLastUse(unsigned Reg, unsigned KillIdx, const char *tag, - const char *header =NULL, const char *footer =NULL); + const char *header = nullptr, + const char *footer = nullptr); void PrescanInstruction(MachineInstr *MI, unsigned Count, std::set& PassthruRegs); diff --git a/lib/CodeGen/AllocationOrder.cpp b/lib/CodeGen/AllocationOrder.cpp index 3fa1f8f..dc9bcff 100644 --- a/lib/CodeGen/AllocationOrder.cpp +++ b/lib/CodeGen/AllocationOrder.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" #include "AllocationOrder.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -25,6 +24,8 @@ using namespace llvm; +#define DEBUG_TYPE "regalloc" + // Compare VirtRegMap::getRegAllocPref(). AllocationOrder::AllocationOrder(unsigned VirtReg, const VirtRegMap &VRM, diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp index 6ac5de2..6fc83a2 100644 --- a/lib/CodeGen/Analysis.cpp +++ b/lib/CodeGen/Analysis.cpp @@ -46,7 +46,7 @@ unsigned llvm::ComputeLinearIndex(Type *Ty, EI != EE; ++EI) { if (Indices && *Indices == unsigned(EI - EB)) return ComputeLinearIndex(*EI, Indices+1, IndicesEnd, CurIndex); - CurIndex = ComputeLinearIndex(*EI, 0, 0, CurIndex); + CurIndex = ComputeLinearIndex(*EI, nullptr, nullptr, CurIndex); } return CurIndex; } @@ -56,7 +56,7 @@ unsigned llvm::ComputeLinearIndex(Type *Ty, for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i) { if (Indices && *Indices == i) return ComputeLinearIndex(EltTy, Indices+1, IndicesEnd, CurIndex); - CurIndex = ComputeLinearIndex(EltTy, 0, 0, CurIndex); + CurIndex = ComputeLinearIndex(EltTy, nullptr, nullptr, CurIndex); } return CurIndex; } @@ -228,7 +228,7 @@ static const Value *getNoopInput(const Value *V, // through. 
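
Several files in this patch (AggressiveAntiDepBreaker.cpp and AllocationOrder.cpp above, AsmPrinter.cpp below) move the DEBUG_TYPE define from before the includes to after them. A plausible reading, not stated in the patch itself, is that defining the macro first lets it leak into headers that use the DEBUG machinery, so the convention becomes: headers first, then the file's own debug type. The resulting file layout, sketched:

    // All includes come first, compiled under their own DEBUG_TYPE, if any.
    #include <vector> // stand-in for the LLVM headers this file pulls in

    #define DEBUG_TYPE "regalloc" // only the file-local code below logs under this
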
const Instruction *I = dyn_cast(V); if (!I || I->getNumOperands() == 0) return V; - const Value *NoopInput = 0; + const Value *NoopInput = nullptr; Value *Op = I->getOperand(0); if (isa(I)) { diff --git a/lib/CodeGen/Android.mk b/lib/CodeGen/Android.mk index 26f04d0..7feb42c 100644 --- a/lib/CodeGen/Android.mk +++ b/lib/CodeGen/Android.mk @@ -4,6 +4,7 @@ codegen_SRC_FILES := \ AggressiveAntiDepBreaker.cpp \ AllocationOrder.cpp \ Analysis.cpp \ + AtomicExpandLoadLinkedPass.cpp \ BasicTargetTransformInfo.cpp \ BranchFolding.cpp \ CalcSpillWeights.cpp \ diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp index 403feb4..1cb0159 100644 --- a/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -57,10 +57,10 @@ void ARMException::endModule() { /// beginFunction - Gather pre-function exception information. Assumes it's /// being emitted immediately after the function entry point. void ARMException::beginFunction(const MachineFunction *MF) { - getTargetStreamer().emitFnStart(); - if (Asm->MF->getFunction()->needsUnwindTableEntry()) - Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin", - Asm->getFunctionNumber())); + if (Asm->MAI->getExceptionHandlingType() == ExceptionHandling::ARM) + getTargetStreamer().emitFnStart(); + Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin", + Asm->getFunctionNumber())); // See if we need call frame info. AsmPrinter::CFIMoveType MoveType = Asm->needsCFIMoves(); assert(MoveType != AsmPrinter::CFI_M_EH && @@ -77,16 +77,16 @@ void ARMException::endFunction(const MachineFunction *) { if (shouldEmitCFI) Asm->OutStreamer.EmitCFIEndProc(); + // Map all labels and get rid of any dead landing pads. + MMI->TidyLandingPads(); + ARMTargetStreamer &ATS = getTargetStreamer(); - if (!Asm->MF->getFunction()->needsUnwindTableEntry()) + if (!Asm->MF->getFunction()->needsUnwindTableEntry() && + MMI->getLandingPads().empty()) ATS.emitCantUnwind(); else { Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end", Asm->getFunctionNumber())); - - // Map all labels and get rid of any dead landing pads. - MMI->TidyLandingPads(); - if (!MMI->getLandingPads().empty()) { // Emit references to personality. if (const Function * Personality = @@ -104,7 +104,8 @@ void ARMException::endFunction(const MachineFunction *) { } } - ATS.emitFnEnd(); + if (Asm->MAI->getExceptionHandlingType() == ExceptionHandling::ARM) + ATS.emitFnEnd(); } void ARMException::EmitTypeInfos(unsigned TTypeEncoding) { @@ -144,7 +145,7 @@ void ARMException::EmitTypeInfos(unsigned TTypeEncoding) { Asm->OutStreamer.AddComment("FilterInfo " + Twine(Entry)); } - Asm->EmitTTypeReference((TypeID == 0 ? 0 : TypeInfos[TypeID - 1]), + Asm->EmitTTypeReference((TypeID == 0 ? nullptr : TypeInfos[TypeID - 1]), TTypeEncoding); } } diff --git a/lib/CodeGen/AsmPrinter/AddressPool.cpp b/lib/CodeGen/AsmPrinter/AddressPool.cpp new file mode 100644 index 0000000..8dab5e5 --- /dev/null +++ b/lib/CodeGen/AsmPrinter/AddressPool.cpp @@ -0,0 +1,45 @@ +//===-- llvm/CodeGen/AddressPool.cpp - Dwarf Debug Framework ---*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AddressPool.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +using namespace llvm; + +class MCExpr; + +unsigned AddressPool::getIndex(const MCSymbol *Sym, bool TLS) { + HasBeenUsed = true; + auto IterBool = + Pool.insert(std::make_pair(Sym, AddressPoolEntry(Pool.size(), TLS))); + return IterBool.first->second.Number; +} + +// Emit addresses into the section given. +void AddressPool::emit(AsmPrinter &Asm, const MCSection *AddrSection) { + if (Pool.empty()) + return; + + // Start the dwarf addr section. + Asm.OutStreamer.SwitchSection(AddrSection); + + // Order the address pool entries by ID + SmallVector Entries(Pool.size()); + + for (const auto &I : Pool) + Entries[I.second.Number] = + I.second.TLS + ? Asm.getObjFileLowering().getDebugThreadLocalSymbol(I.first) + : MCSymbolRefExpr::Create(I.first, Asm.OutContext); + + for (const MCExpr *Entry : Entries) + Asm.OutStreamer.EmitValue(Entry, Asm.getDataLayout().getPointerSize()); +} diff --git a/lib/CodeGen/AsmPrinter/AddressPool.h b/lib/CodeGen/AsmPrinter/AddressPool.h new file mode 100644 index 0000000..42757d7 --- /dev/null +++ b/lib/CodeGen/AsmPrinter/AddressPool.h @@ -0,0 +1,52 @@ +//===-- llvm/CodeGen/AddressPool.h - Dwarf Debug Framework -----*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef CODEGEN_ASMPRINTER_ADDRESSPOOL_H__ +#define CODEGEN_ASMPRINTER_ADDRESSPOOL_H__ + +#include "llvm/ADT/DenseMap.h" + +namespace llvm { +class MCSection; +class MCSymbol; +class AsmPrinter; +// Collection of addresses for this unit and assorted labels. +// A Symbol->unsigned mapping of addresses used by indirect +// references. +class AddressPool { + struct AddressPoolEntry { + unsigned Number; + bool TLS; + AddressPoolEntry(unsigned Number, bool TLS) : Number(Number), TLS(TLS) {} + }; + DenseMap Pool; + + /// Record whether the AddressPool has been queried for an address index since + /// the last "resetUsedFlag" call. Used to implement type unit fallback - a + /// type that references addresses cannot be placed in a type unit when using + /// fission. + bool HasBeenUsed; + +public: + AddressPool() : HasBeenUsed(false) {} + + /// \brief Returns the index into the address pool with the given + /// label/symbol. 
+ unsigned getIndex(const MCSymbol *Sym, bool TLS = false); + + void emit(AsmPrinter &Asm, const MCSection *AddrSection); + + bool isEmpty() { return Pool.empty(); } + + bool hasBeenUsed() const { return HasBeenUsed; } + + void resetUsedFlag() { HasBeenUsed = false; } +}; +} +#endif diff --git a/lib/CodeGen/AsmPrinter/Android.mk b/lib/CodeGen/AsmPrinter/Android.mk index a725fba..f56eb6e 100644 --- a/lib/CodeGen/AsmPrinter/Android.mk +++ b/lib/CodeGen/AsmPrinter/Android.mk @@ -8,17 +8,21 @@ codegen_asmprinter_SRC_FILES := \ include $(CLEAR_VARS) LOCAL_SRC_FILES := \ + AddressPool.cpp \ AsmPrinter.cpp \ AsmPrinterDwarf.cpp \ AsmPrinterInlineAsm.cpp \ ARMException.cpp \ + DbgValueHistoryCalculator.cpp \ DIE.cpp \ DIEHash.cpp \ DwarfAccelTable.cpp \ DwarfCFIException.cpp \ DwarfDebug.cpp \ DwarfException.cpp \ - DwarfUnit.cpp \ + DwarfFile.cpp \ + DwarfStringPool.cpp \ + DwarfUnit.cpp \ ErlangGCPrinter.cpp \ OcamlGCPrinter.cpp \ Win64Exception.cpp \ @@ -38,17 +42,21 @@ ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS)) include $(CLEAR_VARS) LOCAL_SRC_FILES := \ + AddressPool.cpp \ AsmPrinter.cpp \ AsmPrinterDwarf.cpp \ AsmPrinterInlineAsm.cpp \ ARMException.cpp \ + DbgValueHistoryCalculator.cpp \ DIE.cpp \ DIEHash.cpp \ DwarfAccelTable.cpp \ DwarfCFIException.cpp \ DwarfDebug.cpp \ DwarfException.cpp \ - DwarfUnit.cpp \ + DwarfFile.cpp \ + DwarfStringPool.cpp \ + DwarfUnit.cpp \ ErlangGCPrinter.cpp \ OcamlGCPrinter.cpp \ Win64Exception.cpp \ diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index c3afc8b..7de9c6d 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "llvm/CodeGen/AsmPrinter.h" #include "DwarfDebug.h" #include "DwarfException.h" @@ -53,6 +52,8 @@ #include "llvm/Transforms/Utils/GlobalStatus.h" using namespace llvm; +#define DEBUG_TYPE "asm-printer" + static const char *const DWARFGroupName = "DWARF Emission"; static const char *const DbgTimerName = "Debug Info Emission"; static const char *const EHTimerName = "DWARF Exception Writer"; @@ -62,9 +63,9 @@ STATISTIC(EmittedInsts, "Number of machine instrs printed"); char AsmPrinter::ID = 0; -typedef DenseMap gcp_map_type; +typedef DenseMap> gcp_map_type; static gcp_map_type &getGCMap(void *&P) { - if (P == 0) + if (!P) P = new gcp_map_type(); return *(gcp_map_type*)P; } @@ -101,23 +102,21 @@ AsmPrinter::AsmPrinter(TargetMachine &tm, MCStreamer &Streamer) TM(tm), MAI(tm.getMCAsmInfo()), MII(tm.getInstrInfo()), OutContext(Streamer.getContext()), OutStreamer(Streamer), - LastMI(0), LastFn(0), Counter(~0U), SetCounter(0) { - DD = 0; MMI = 0; LI = 0; MF = 0; - CurrentFnSym = CurrentFnSymForSize = 0; - GCMetadataPrinters = 0; + LastMI(nullptr), LastFn(0), Counter(~0U), SetCounter(0) { + DD = nullptr; MMI = nullptr; LI = nullptr; MF = nullptr; + CurrentFnSym = CurrentFnSymForSize = nullptr; + GCMetadataPrinters = nullptr; VerboseAsm = Streamer.isVerboseAsm(); } AsmPrinter::~AsmPrinter() { - assert(DD == 0 && Handlers.empty() && "Debug/EH info didn't get finalized"); + assert(!DD && Handlers.empty() && "Debug/EH info didn't get finalized"); - if (GCMetadataPrinters != 0) { + if (GCMetadataPrinters) { gcp_map_type &GCMap = getGCMap(GCMetadataPrinters); - for (gcp_map_type::iterator I = GCMap.begin(), E = GCMap.end(); I != E; ++I) - delete I->second; delete &GCMap; - GCMetadataPrinters = 0; + GCMetadataPrinters = nullptr; } 
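
AddressPool::getIndex in the new AddressPool.cpp above uses insert-if-absent numbering: it offers the symbol with the next free index, and if the symbol is already pooled, insert() leaves the old entry in place and hands it back. The trick in isolation, with std::map and std::string as stand-ins for the DenseMap of MCSymbols:

    #include <map>
    #include <string>

    unsigned getIndex(std::map<std::string, unsigned> &Pool,
                      const std::string &Sym) {
      auto IterBool =
          Pool.insert({Sym, static_cast<unsigned>(Pool.size())});
      return IterBool.first->second; // existing number if already present
    }

Pool.size() is evaluated before the insertion, so indices are dense and start at zero, which is what lets emit() scatter the pool into an index-ordered vector afterwards.
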
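
The gcp_map_type change above swaps raw GCMetadataPrinter pointers for std::unique_ptr values, which is why the hand-written delete loop disappears from ~AsmPrinter. A sketch of that ownership move, with std::map standing in for DenseMap and an empty printer type:

    #include <map>
    #include <memory>

    struct GCMetadataPrinter {};

    using gcp_map_type = std::map<void *, std::unique_ptr<GCMetadataPrinter>>;

    void destroyMap(gcp_map_type *&Map) {
      delete Map; // the unique_ptr values free every printer automatically
      Map = nullptr;
    }
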
delete &OutStreamer; @@ -209,7 +208,7 @@ bool AsmPrinter::doInitialization(Module &M) { GCModuleInfo *MI = getAnalysisIfAvailable(); assert(MI && "AsmPrinter didn't require GCModuleInfo?"); - for (GCModuleInfo::iterator I = MI->begin(), E = MI->end(); I != E; ++I) + for (auto &I : *MI) if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*I)) MP->beginAssembly(*this); @@ -233,7 +232,7 @@ bool AsmPrinter::doInitialization(Module &M) { } } - DwarfException *DE = 0; + DwarfException *DE = nullptr; switch (MAI->getExceptionHandlingType()) { case ExceptionHandling::None: break; @@ -370,10 +369,9 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // sections and expected to be contiguous (e.g. ObjC metadata). unsigned AlignLog = getGVAlignmentLog2(GV, *DL); - for (unsigned I = 0, E = Handlers.size(); I != E; ++I) { - const HandlerInfo &OI = Handlers[I]; - NamedRegionTimer T(OI.TimerName, OI.TimerGroupName, TimePassesIsEnabled); - OI.Handler->setSymbolSize(GVSym, Size); + for (const HandlerInfo &HI : Handlers) { + NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, TimePassesIsEnabled); + HI.Handler->setSymbolSize(GVSym, Size); } // Handle common and BSS local symbols (.lcomm). @@ -545,10 +543,9 @@ void AsmPrinter::EmitFunctionHeader() { } // Emit pre-function debug and/or EH information. - for (unsigned I = 0, E = Handlers.size(); I != E; ++I) { - const HandlerInfo &OI = Handlers[I]; - NamedRegionTimer T(OI.TimerName, OI.TimerGroupName, TimePassesIsEnabled); - OI.Handler->beginFunction(MF); + for (const HandlerInfo &HI : Handlers) { + NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, TimePassesIsEnabled); + HI.Handler->beginFunction(MF); } // Emit the prefix data. @@ -746,69 +743,65 @@ void AsmPrinter::EmitFunctionBody() { // Print out code for the function. bool HasAnyRealCode = false; - const MachineInstr *LastMI = 0; - for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); - I != E; ++I) { + const MachineInstr *LastMI = nullptr; + for (auto &MBB : *MF) { // Print a label for the basic block. - EmitBasicBlockStart(I); - for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end(); - II != IE; ++II) { - LastMI = II; + EmitBasicBlockStart(MBB); + for (auto &MI : MBB) { + LastMI = &MI; // Print the assembly for the instruction. 
- if (!II->isPosition() && !II->isImplicitDef() && !II->isKill() && - !II->isDebugValue()) { + if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() && + !MI.isDebugValue()) { HasAnyRealCode = true; ++EmittedInsts; } if (ShouldPrintDebugScopes) { - for (unsigned III = 0, EEE = Handlers.size(); III != EEE; ++III) { - const HandlerInfo &OI = Handlers[III]; - NamedRegionTimer T(OI.TimerName, OI.TimerGroupName, + for (const HandlerInfo &HI : Handlers) { + NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, TimePassesIsEnabled); - OI.Handler->beginInstruction(II); + HI.Handler->beginInstruction(&MI); } } if (isVerbose()) - emitComments(*II, OutStreamer.GetCommentOS()); + emitComments(MI, OutStreamer.GetCommentOS()); - switch (II->getOpcode()) { + switch (MI.getOpcode()) { case TargetOpcode::CFI_INSTRUCTION: - emitCFIInstruction(*II); + emitCFIInstruction(MI); break; case TargetOpcode::EH_LABEL: case TargetOpcode::GC_LABEL: - OutStreamer.EmitLabel(II->getOperand(0).getMCSymbol()); + OutStreamer.EmitLabel(MI.getOperand(0).getMCSymbol()); break; case TargetOpcode::INLINEASM: - EmitInlineAsm(II); + EmitInlineAsm(&MI); break; case TargetOpcode::DBG_VALUE: if (isVerbose()) { - if (!emitDebugValueComment(II, *this)) - EmitInstruction(II); + if (!emitDebugValueComment(&MI, *this)) + EmitInstruction(&MI); } break; case TargetOpcode::IMPLICIT_DEF: - if (isVerbose()) emitImplicitDef(II); + if (isVerbose()) emitImplicitDef(&MI); break; case TargetOpcode::KILL: - if (isVerbose()) emitKill(II, *this); + if (isVerbose()) emitKill(&MI, *this); break; default: - EmitInstruction(II); + EmitInstruction(&MI); break; } if (ShouldPrintDebugScopes) { - for (unsigned III = 0, EEE = Handlers.size(); III != EEE; ++III) { - const HandlerInfo &OI = Handlers[III]; - NamedRegionTimer T(OI.TimerName, OI.TimerGroupName, + for (const HandlerInfo &HI : Handlers) { + NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, TimePassesIsEnabled); - OI.Handler->endInstruction(); + HI.Handler->endInstruction(); } } } @@ -835,11 +828,10 @@ void AsmPrinter::EmitFunctionBody() { } const Function *F = MF->getFunction(); - for (Function::const_iterator i = F->begin(), e = F->end(); i != e; ++i) { - const BasicBlock *BB = i; - if (!BB->hasAddressTaken()) + for (const auto &BB : *F) { + if (!BB.hasAddressTaken()) continue; - MCSymbol *Sym = GetBlockAddressSymbol(BB); + MCSymbol *Sym = GetBlockAddressSymbol(&BB); if (Sym->isDefined()) continue; OutStreamer.AddComment("Address of block that was removed by CodeGen"); @@ -866,10 +858,9 @@ void AsmPrinter::EmitFunctionBody() { } // Emit post-function debug and/or EH information. - for (unsigned I = 0, E = Handlers.size(); I != E; ++I) { - const HandlerInfo &OI = Handlers[I]; - NamedRegionTimer T(OI.TimerName, OI.TimerGroupName, TimePassesIsEnabled); - OI.Handler->endFunction(MF); + for (const HandlerInfo &HI : Handlers) { + NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, TimePassesIsEnabled); + HI.Handler->endFunction(MF); } MMI->EndFunction(); @@ -881,13 +872,11 @@ void AsmPrinter::EmitFunctionBody() { bool AsmPrinter::doFinalization(Module &M) { // Emit global variables. 
- for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) - EmitGlobalVariable(I); + for (const auto &G : M.globals()) + EmitGlobalVariable(&G); // Emit visibility info for declarations - for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) { - const Function &F = *I; + for (const Function &F : M) { if (!F.isDeclaration()) continue; GlobalValue::VisibilityTypes V = F.getVisibility(); @@ -908,15 +897,14 @@ bool AsmPrinter::doFinalization(Module &M) { OutStreamer.Flush(); // Finalize debug and EH information. - for (unsigned I = 0, E = Handlers.size(); I != E; ++I) { - const HandlerInfo &OI = Handlers[I]; - NamedRegionTimer T(OI.TimerName, OI.TimerGroupName, + for (const HandlerInfo &HI : Handlers) { + NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, TimePassesIsEnabled); - OI.Handler->endModule(); - delete OI.Handler; + HI.Handler->endModule(); + delete HI.Handler; } Handlers.clear(); - DD = 0; + DD = nullptr; // If the target wants to know about weak references, print them all. if (MAI->getWeakRefDirective()) { @@ -926,36 +914,36 @@ bool AsmPrinter::doFinalization(Module &M) { // happen with the MC stuff eventually. // Print out module-level global variables here. - for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) { - if (!I->hasExternalWeakLinkage()) continue; - OutStreamer.EmitSymbolAttribute(getSymbol(I), MCSA_WeakReference); + for (const auto &G : M.globals()) { + if (!G.hasExternalWeakLinkage()) + continue; + OutStreamer.EmitSymbolAttribute(getSymbol(&G), MCSA_WeakReference); } - for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) { - if (!I->hasExternalWeakLinkage()) continue; - OutStreamer.EmitSymbolAttribute(getSymbol(I), MCSA_WeakReference); + for (const auto &F : M) { + if (!F.hasExternalWeakLinkage()) + continue; + OutStreamer.EmitSymbolAttribute(getSymbol(&F), MCSA_WeakReference); } } if (MAI->hasSetDirective()) { OutStreamer.AddBlankLine(); - for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end(); - I != E; ++I) { - MCSymbol *Name = getSymbol(I); + for (const auto &Alias : M.aliases()) { + MCSymbol *Name = getSymbol(&Alias); - const GlobalValue *GV = I->getAliasedGlobal(); + const GlobalValue *GV = Alias.getAliasee(); assert(!GV->isDeclaration()); MCSymbol *Target = getSymbol(GV); - if (I->hasExternalLinkage() || !MAI->getWeakRefDirective()) + if (Alias.hasExternalLinkage() || !MAI->getWeakRefDirective()) OutStreamer.EmitSymbolAttribute(Name, MCSA_Global); - else if (I->hasWeakLinkage() || I->hasLinkOnceLinkage()) + else if (Alias.hasWeakLinkage() || Alias.hasLinkOnceLinkage()) OutStreamer.EmitSymbolAttribute(Name, MCSA_WeakReference); else - assert(I->hasLocalLinkage() && "Invalid alias linkage"); + assert(Alias.hasLocalLinkage() && "Invalid alias linkage"); - EmitVisibility(Name, I->getVisibility()); + EmitVisibility(Name, Alias.getVisibility()); // Emit the directives as assignments aka .set: OutStreamer.EmitAssignment(Name, @@ -966,7 +954,7 @@ bool AsmPrinter::doFinalization(Module &M) { GCModuleInfo *MI = getAnalysisIfAvailable(); assert(MI && "AsmPrinter didn't require GCModuleInfo?"); for (GCModuleInfo::iterator I = MI->end(), E = MI->begin(); I != E; ) - if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*--I)) + if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(**--I)) MP->finishAssembly(*this); // Emit llvm.ident metadata in an '.ident' directive. 
@@ -983,8 +971,8 @@ bool AsmPrinter::doFinalization(Module &M) { // after everything else has gone out. EmitEndOfAsmFile(M); - delete Mang; Mang = 0; - MMI = 0; + delete Mang; Mang = nullptr; + MMI = nullptr; OutStreamer.Finish(); OutStreamer.reset(); @@ -1100,7 +1088,7 @@ void AsmPrinter::EmitConstantPool() { void AsmPrinter::EmitJumpTableInfo() { const DataLayout *DL = MF->getTarget().getDataLayout(); const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); - if (MJTI == 0) return; + if (!MJTI) return; if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_Inline) return; const std::vector &JT = MJTI->getJumpTables(); if (JT.empty()) return; @@ -1185,7 +1173,7 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned UID) const { assert(MBB && MBB->getNumber() >= 0 && "Invalid basic block"); - const MCExpr *Value = 0; + const MCExpr *Value = nullptr; switch (MJTI->getEntryKind()) { case MachineJumpTableInfo::EK_Inline: llvm_unreachable("Cannot emit EK_Inline jump table entry"); @@ -1308,6 +1296,15 @@ void AsmPrinter::EmitLLVMUsedList(const ConstantArray *InitList) { } } +namespace { +struct Structor { + Structor() : Priority(0), Func(nullptr), ComdatKey(nullptr) {} + int Priority; + llvm::Constant *Func; + llvm::GlobalValue *ComdatKey; +}; +} // end namespace + /// EmitXXStructorList - Emit the ctor or dtor list taking into account the init /// priority. void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) { @@ -1319,37 +1316,52 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) { const ConstantArray *InitList = dyn_cast(List); if (!InitList) return; // Not an array! StructType *ETy = dyn_cast(InitList->getType()->getElementType()); - if (!ETy || ETy->getNumElements() != 2) return; // Not an array of pairs! + // FIXME: Only allow the 3-field form in LLVM 4.0. + if (!ETy || ETy->getNumElements() < 2 || ETy->getNumElements() > 3) + return; // Not an array of two or three elements! if (!isa(ETy->getTypeAtIndex(0U)) || !isa(ETy->getTypeAtIndex(1U))) return; // Not (int, ptr). + if (ETy->getNumElements() == 3 && !isa(ETy->getTypeAtIndex(2U))) + return; // Not (int, ptr, ptr). // Gather the structors in a form that's convenient for sorting by priority. - typedef std::pair Structor; SmallVector Structors; - for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) { - ConstantStruct *CS = dyn_cast(InitList->getOperand(i)); + for (Value *O : InitList->operands()) { + ConstantStruct *CS = dyn_cast(O); if (!CS) continue; // Malformed. if (CS->getOperand(1)->isNullValue()) break; // Found a null terminator, skip the rest. ConstantInt *Priority = dyn_cast(CS->getOperand(0)); if (!Priority) continue; // Malformed. 
- Structors.push_back(std::make_pair(Priority->getLimitedValue(65535), - CS->getOperand(1))); + Structors.push_back(Structor()); + Structor &S = Structors.back(); + S.Priority = Priority->getLimitedValue(65535); + S.Func = CS->getOperand(1); + if (ETy->getNumElements() == 3 && !CS->getOperand(2)->isNullValue()) + S.ComdatKey = dyn_cast(CS->getOperand(2)->stripPointerCasts()); } // Emit the function pointers in the target-specific order const DataLayout *DL = TM.getDataLayout(); unsigned Align = Log2_32(DL->getPointerPrefAlignment()); - std::stable_sort(Structors.begin(), Structors.end(), less_first()); - for (unsigned i = 0, e = Structors.size(); i != e; ++i) { + std::stable_sort(Structors.begin(), Structors.end(), + [](const Structor &L, + const Structor &R) { return L.Priority < R.Priority; }); + for (Structor &S : Structors) { + const TargetLoweringObjectFile &Obj = getObjFileLowering(); + const MCSymbol *KeySym = nullptr; + const MCSection *KeySec = nullptr; + if (S.ComdatKey) { + KeySym = getSymbol(S.ComdatKey); + KeySec = getObjFileLowering().SectionForGlobal(S.ComdatKey, *Mang, TM); + } const MCSection *OutputSection = - (isCtor ? - getObjFileLowering().getStaticCtorSection(Structors[i].first) : - getObjFileLowering().getStaticDtorSection(Structors[i].first)); + (isCtor ? Obj.getStaticCtorSection(S.Priority, KeySym, KeySec) + : Obj.getStaticDtorSection(S.Priority, KeySym, KeySec)); OutStreamer.SwitchSection(OutputSection); if (OutStreamer.getCurrentSection() != OutStreamer.getPreviousSection()) EmitAlignment(Align); - EmitXXStructor(Structors[i].second); + EmitXXStructor(S.Func); } } @@ -1470,7 +1482,7 @@ void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset, // an explicit alignment requested, it will override the alignment request // if required for correctness. // -void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalValue *GV) const { +void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalObject *GV) const { if (GV) NumBits = getGVAlignmentLog2(GV, *TM.getDataLayout(), NumBits); if (NumBits == 0) return; // 1-byte aligned: no need to emit alignment. @@ -1503,7 +1515,7 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) { return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx); const ConstantExpr *CE = dyn_cast(CV); - if (CE == 0) { + if (!CE) { llvm_unreachable("Unknown constant value to lower!"); } @@ -1528,7 +1540,7 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) { raw_string_ostream OS(S); OS << "Unsupported expression in static initializer: "; CE->printAsOperand(OS, /*PrintType=*/false, - !AP.MF ? 0 : AP.MF->getFunction()->getParent()); + !AP.MF ? nullptr : AP.MF->getFunction()->getParent()); report_fatal_error(OS.str()); } case Instruction::GetElementPtr: { @@ -2055,7 +2067,7 @@ MCSymbol *AsmPrinter::GetExternalSymbolSymbol(StringRef Sym) const { /// PrintParentLoopComment - Print comments about parent loops of this one. 
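
// --- Illustrative sketch (editorial, not part of the patch) ----------------
// EmitXXStructorList above now accepts both the two-field { priority, func }
// and the new three-field { priority, func, comdat key } global_ctors entry,
// and stable-sorts by priority so equal-priority entries keep source order.
// The sorting step in isolation, with hypothetical types:

#include <algorithm>
#include <cstdio>
#include <vector>

struct ToyStructor {
  int Priority;
  const char *Func;      // function symbol to run
  const char *ComdatKey; // optional third field; null when absent
};

int main() {
  std::vector<ToyStructor> S = {{65535, "late", nullptr},
                                {101, "early", nullptr},
                                {65535, "later", nullptr}};
  // stable_sort keeps "late" ahead of "later" even though they tie on 65535.
  std::stable_sort(S.begin(), S.end(),
                   [](const ToyStructor &L, const ToyStructor &R) {
                     return L.Priority < R.Priority;
                   });
  for (const ToyStructor &X : S)
    std::printf("%d %s\n", X.Priority, X.Func);
}
// ----------------------------------------------------------------------------
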
static void PrintParentLoopComment(raw_ostream &OS, const MachineLoop *Loop, unsigned FunctionNumber) { - if (Loop == 0) return; + if (!Loop) return; PrintParentLoopComment(OS, Loop->getParentLoop(), FunctionNumber); OS.indent(Loop->getLoopDepth()*2) << "Parent Loop BB" << FunctionNumber << "_" @@ -2069,12 +2081,12 @@ static void PrintParentLoopComment(raw_ostream &OS, const MachineLoop *Loop, static void PrintChildLoopComment(raw_ostream &OS, const MachineLoop *Loop, unsigned FunctionNumber) { // Add child loop information - for (MachineLoop::iterator CL = Loop->begin(), E = Loop->end();CL != E; ++CL){ - OS.indent((*CL)->getLoopDepth()*2) + for (const MachineLoop *CL : *Loop) { + OS.indent(CL->getLoopDepth()*2) << "Child Loop BB" << FunctionNumber << "_" - << (*CL)->getHeader()->getNumber() << " Depth " << (*CL)->getLoopDepth() + << CL->getHeader()->getNumber() << " Depth " << CL->getLoopDepth() << '\n'; - PrintChildLoopComment(OS, *CL, FunctionNumber); + PrintChildLoopComment(OS, CL, FunctionNumber); } } @@ -2084,7 +2096,7 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB, const AsmPrinter &AP) { // Add loop depth information const MachineLoop *Loop = LI->getLoopFor(&MBB); - if (Loop == 0) return; + if (!Loop) return; MachineBasicBlock *Header = Loop->getHeader(); assert(Header && "No header for loop"); @@ -2120,42 +2132,41 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB, /// EmitBasicBlockStart - This method prints the label for the specified /// MachineBasicBlock, an alignment (if present) and a comment describing /// it if appropriate. -void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock *MBB) const { +void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { // Emit an alignment directive for this block, if needed. - if (unsigned Align = MBB->getAlignment()) + if (unsigned Align = MBB.getAlignment()) EmitAlignment(Align); // If the block has its address taken, emit any labels that were used to // reference the block. It is possible that there is more than one label // here, because multiple LLVM BB's may have been RAUW'd to this block after // the references were generated. - if (MBB->hasAddressTaken()) { - const BasicBlock *BB = MBB->getBasicBlock(); + if (MBB.hasAddressTaken()) { + const BasicBlock *BB = MBB.getBasicBlock(); if (isVerbose()) OutStreamer.AddComment("Block address taken"); - std::vector Syms = MMI->getAddrLabelSymbolToEmit(BB); - - for (unsigned i = 0, e = Syms.size(); i != e; ++i) - OutStreamer.EmitLabel(Syms[i]); + std::vector Symbols = MMI->getAddrLabelSymbolToEmit(BB); + for (auto *Sym : Symbols) + OutStreamer.EmitLabel(Sym); } // Print some verbose block comments. if (isVerbose()) { - if (const BasicBlock *BB = MBB->getBasicBlock()) + if (const BasicBlock *BB = MBB.getBasicBlock()) if (BB->hasName()) OutStreamer.AddComment("%" + BB->getName()); - emitBasicBlockLoopComments(*MBB, LI, *this); + emitBasicBlockLoopComments(MBB, LI, *this); } // Print the main label for the block. - if (MBB->pred_empty() || isBlockOnlyReachableByFallthrough(MBB)) { + if (MBB.pred_empty() || isBlockOnlyReachableByFallthrough(&MBB)) { if (isVerbose()) { // NOTE: Want this comment at start of line, don't emit with AddComment. 
- OutStreamer.emitRawComment(" BB#" + Twine(MBB->getNumber()) + ":", false); + OutStreamer.emitRawComment(" BB#" + Twine(MBB.getNumber()) + ":", false); } } else { - OutStreamer.EmitLabel(MBB->getSymbol()); + OutStreamer.EmitLabel(MBB.getSymbol()); } } @@ -2191,14 +2202,11 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { return false; // If there isn't exactly one predecessor, it can't be a fall through. - MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI; - ++PI2; - if (PI2 != MBB->pred_end()) + if (MBB->pred_size() > 1) return false; // The predecessor has to be immediately before this block. - MachineBasicBlock *Pred = *PI; - + MachineBasicBlock *Pred = *MBB->pred_begin(); if (!Pred->isLayoutSuccessor(MBB)) return false; @@ -2207,10 +2215,7 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { return true; // Check the terminators in the previous blocks - for (MachineBasicBlock::iterator II = Pred->getFirstTerminator(), - IE = Pred->end(); II != IE; ++II) { - MachineInstr &MI = *II; - + for (const auto &MI : Pred->terminators()) { // If it is not a simple branch, we are in a table somewhere. if (!MI.isBranch() || MI.isIndirectBranch()) return false; @@ -2231,25 +2236,25 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { -GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy *S) { - if (!S->usesMetadata()) - return 0; +GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) { + if (!S.usesMetadata()) + return nullptr; gcp_map_type &GCMap = getGCMap(GCMetadataPrinters); - gcp_map_type::iterator GCPI = GCMap.find(S); + gcp_map_type::iterator GCPI = GCMap.find(&S); if (GCPI != GCMap.end()) - return GCPI->second; + return GCPI->second.get(); - const char *Name = S->getName().c_str(); + const char *Name = S.getName().c_str(); for (GCMetadataPrinterRegistry::iterator I = GCMetadataPrinterRegistry::begin(), E = GCMetadataPrinterRegistry::end(); I != E; ++I) if (strcmp(Name, I->getName()) == 0) { - GCMetadataPrinter *GMP = I->instantiate(); - GMP->S = S; - GCMap.insert(std::make_pair(S, GMP)); - return GMP; + std::unique_ptr GMP = I->instantiate(); + GMP->S = &S; + auto IterBool = GCMap.insert(std::make_pair(&S, std::move(GMP))); + return IterBool.first->second.get(); } report_fatal_error("no GCMetadataPrinter registered for GC: " + Twine(Name)); diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp index b696069..02cd12b 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "ByteStreamer.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/ADT/SmallBitVector.h" @@ -30,6 +29,8 @@ #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "asm-printer" + //===----------------------------------------------------------------------===// // Dwarf Emission Helper Routines //===----------------------------------------------------------------------===// @@ -216,30 +217,48 @@ static void emitDwarfRegOpIndirect(ByteStreamer &Streamer, int Reg, int Offset, /// Emit a dwarf register operation for describing /// - a small value occupying only part of a register or /// - a small register representing only part of a value. 
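
// --- Illustrative sketch (editorial, not part of the patch) ----------------
// GetOrCreateGCPrinter above now stores printers as std::unique_ptr inside
// the map and returns raw non-owning pointers. The underlying get-or-create
// idiom, reduced to standard containers with hypothetical types:

#include <map>
#include <memory>
#include <string>

struct Printer {
  std::string Name;
};

static std::map<std::string, std::unique_ptr<Printer>> Registry;

// Returns a non-owning pointer; the Registry keeps ownership.
Printer *getOrCreate(const std::string &Key) {
  auto It = Registry.find(Key);
  if (It != Registry.end())
    return It->second.get();
  auto IterBool =
      Registry.emplace(Key, std::unique_ptr<Printer>(new Printer{Key}));
  return IterBool.first->second.get();
}

int main() {
  // The second lookup returns the cached instance created by the first.
  return getOrCreate("ocaml") == getOrCreate("ocaml") ? 0 : 1;
}
// ----------------------------------------------------------------------------
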
-static void emitDwarfOpPiece(ByteStreamer &Streamer, unsigned Size, - unsigned Offset) { - assert(Size > 0); - if (Offset > 0) { +static void emitDwarfOpPiece(ByteStreamer &Streamer, unsigned SizeInBits, + unsigned OffsetInBits) { + assert(SizeInBits > 0 && "zero-sized piece"); + unsigned SizeOfByte = 8; + if (OffsetInBits > 0 || SizeInBits % SizeOfByte) { Streamer.EmitInt8(dwarf::DW_OP_bit_piece, "DW_OP_bit_piece"); - Streamer.EmitULEB128(Size, Twine(Size)); - Streamer.EmitULEB128(Offset, Twine(Offset)); + Streamer.EmitULEB128(SizeInBits, Twine(SizeInBits)); + Streamer.EmitULEB128(OffsetInBits, Twine(OffsetInBits)); } else { Streamer.EmitInt8(dwarf::DW_OP_piece, "DW_OP_piece"); - unsigned ByteSize = Size / 8; // Assuming 8 bits per byte. + unsigned ByteSize = SizeInBits / SizeOfByte; Streamer.EmitULEB128(ByteSize, Twine(ByteSize)); } } -/// Some targets do not provide a DWARF register number for every -/// register. This function attempts to emit a dwarf register by -/// emitting a piece of a super-register or by piecing together -/// multiple subregisters that alias the register. -static void EmitDwarfRegOpPiece(ByteStreamer &Streamer, const AsmPrinter &AP, - const MachineLocation &MLoc) { - assert(!MLoc.isIndirect()); - const TargetRegisterInfo *TRI = AP.TM.getRegisterInfo(); +/// Emit a shift-right dwarf expression. +static void emitDwarfOpShr(ByteStreamer &Streamer, + unsigned ShiftBy) { + Streamer.EmitInt8(dwarf::DW_OP_constu, "DW_OP_constu"); + Streamer.EmitULEB128(ShiftBy); + Streamer.EmitInt8(dwarf::DW_OP_shr, "DW_OP_shr"); +} + +// Some targets do not provide a DWARF register number for every +// register. This function attempts to emit a DWARF register by +// emitting a piece of a super-register or by piecing together +// multiple subregisters that alias the register. +void AsmPrinter::EmitDwarfRegOpPiece(ByteStreamer &Streamer, + const MachineLocation &MLoc, + unsigned PieceSizeInBits, + unsigned PieceOffsetInBits) const { + assert(MLoc.isReg() && "MLoc must be a register"); + const TargetRegisterInfo *TRI = TM.getRegisterInfo(); int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false); + // If this is a valid register number, emit it. + if (Reg >= 0) { + emitDwarfRegOp(Streamer, Reg); + emitDwarfOpPiece(Streamer, PieceSizeInBits, PieceOffsetInBits); + return; + } + // Walk up the super-register chain until we find a valid number. // For example, EAX on x86_64 is a 32-bit piece of RAX with offset 0. for (MCSuperRegIterator SR(MLoc.getReg(), TRI); SR.isValid(); ++SR) { @@ -248,9 +267,19 @@ static void EmitDwarfRegOpPiece(ByteStreamer &Streamer, const AsmPrinter &AP, unsigned Idx = TRI->getSubRegIndex(*SR, MLoc.getReg()); unsigned Size = TRI->getSubRegIdxSize(Idx); unsigned Offset = TRI->getSubRegIdxOffset(Idx); - AP.OutStreamer.AddComment("super-register"); + OutStreamer.AddComment("super-register"); emitDwarfRegOp(Streamer, Reg); - emitDwarfOpPiece(Streamer, Size, Offset); + if (PieceOffsetInBits == Offset) { + emitDwarfOpPiece(Streamer, Size, Offset); + } else { + // If this is part of a variable in a sub-register at a + // non-zero offset, we need to manually shift the value into + // place, since the DW_OP_piece describes the part of the + // variable, not the position of the subregister. 
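
// --- Illustrative sketch (editorial, not part of the patch) ----------------
// emitDwarfOpPiece above chooses DW_OP_bit_piece whenever the piece has a
// bit-level offset or a size that is not a whole number of bytes, and the
// simpler byte-granular DW_OP_piece otherwise. The decision on its own:

#include <cstdio>

const unsigned DW_OP_piece = 0x93;     // DWARF: operand is a size in bytes
const unsigned DW_OP_bit_piece = 0x9d; // DWARF: size and offset in bits

void describePiece(unsigned SizeInBits, unsigned OffsetInBits) {
  const unsigned SizeOfByte = 8;
  if (OffsetInBits > 0 || SizeInBits % SizeOfByte)
    std::printf("DW_OP_bit_piece(0x%x) %u %u\n", DW_OP_bit_piece, SizeInBits,
                OffsetInBits);
  else
    std::printf("DW_OP_piece(0x%x) %u\n", DW_OP_piece, SizeInBits / SizeOfByte);
}

int main() {
  describePiece(32, 0); // e.g. EAX as the low 32 bits of RAX -> DW_OP_piece 4
  describePiece(16, 8); // 16 bits starting at bit 8 -> needs DW_OP_bit_piece
}
// ----------------------------------------------------------------------------
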
+ emitDwarfOpPiece(Streamer, Size, PieceOffsetInBits); + if (Offset) + emitDwarfOpShr(Streamer, Offset); + } return; } } @@ -260,7 +289,7 @@ static void EmitDwarfRegOpPiece(ByteStreamer &Streamer, const AsmPrinter &AP, // // Keep track of the current position so we can emit the more // efficient DW_OP_piece. - unsigned CurPos = 0; + unsigned CurPos = PieceOffsetInBits; // The size of the register in bits, assuming 8 bits per byte. unsigned RegSize = TRI->getMinimalPhysRegClass(MLoc.getReg())->getSize() * 8; // Keep track of the bits in the register we already emitted, so we @@ -281,7 +310,7 @@ static void EmitDwarfRegOpPiece(ByteStreamer &Streamer, const AsmPrinter &AP, // If this sub-register has a DWARF number and we haven't covered // its range, emit a DWARF piece for it. if (Reg >= 0 && Intersection.any()) { - AP.OutStreamer.AddComment("sub-register"); + OutStreamer.AddComment("sub-register"); emitDwarfRegOp(Streamer, Reg); emitDwarfOpPiece(Streamer, Size, Offset == CurPos ? 0 : Offset); CurPos = Offset + Size; @@ -291,7 +320,7 @@ static void EmitDwarfRegOpPiece(ByteStreamer &Streamer, const AsmPrinter &AP, } } - if (CurPos == 0) { + if (CurPos == PieceOffsetInBits) { // FIXME: We have no reasonable way of handling errors in here. Streamer.EmitInt8(dwarf::DW_OP_nop, "nop (could not find a dwarf register number)"); @@ -317,8 +346,7 @@ void AsmPrinter::EmitDwarfRegOp(ByteStreamer &Streamer, } // Attempt to find a valid super- or sub-register. - if (!Indirect && !MLoc.isIndirect()) - return EmitDwarfRegOpPiece(Streamer, *this, MLoc); + return EmitDwarfRegOpPiece(Streamer, MLoc); } if (MLoc.isIndirect()) diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 567b6e3..46ee0c8 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -11,11 +11,11 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -36,6 +36,8 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; +#define DEBUG_TYPE "asm-printer" + namespace { struct SrcMgrDiagInfo { const MDNode *LocInfo; @@ -88,7 +90,7 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode, if (!MCAI->useIntegratedAssembler() && !OutStreamer.isIntegratedAssemblerRequired()) { OutStreamer.EmitRawText(Str); - emitInlineAsmEnd(TM.getSubtarget(), 0); + emitInlineAsmEnd(TM.getSubtarget(), nullptr); return; } @@ -98,7 +100,7 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode, // If the current LLVMContext has an inline asm handler, set it in SourceMgr. LLVMContext &LLVMCtx = MMI->getModule()->getContext(); bool HasDiagHandler = false; - if (LLVMCtx.getInlineAsmDiagnosticHandler() != 0) { + if (LLVMCtx.getInlineAsmDiagnosticHandler() != nullptr) { // If the source manager has an issue, we arrange for srcMgrDiagHandler // to be invoked, getting DiagInfo passed into it. DiagInfo.LocInfo = LocMDNode; @@ -134,8 +136,11 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode, // emitInlineAsmEnd(). 
MCSubtargetInfo STIOrig = *STI; + MCTargetOptions MCOptions; + if (MF) + MCOptions = MF->getTarget().Options.MCOptions; std::unique_ptr TAP( - TM.getTarget().createMCAsmParser(*STI, *Parser, *MII)); + TM.getTarget().createMCAsmParser(*STI, *Parser, *MII, MCOptions)); if (!TAP) report_fatal_error("Inline asm not supported by this streamer because" " we don't have an asm parser for this target\n"); @@ -229,10 +234,10 @@ static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI, if (InlineAsm::isMemKind(OpFlags)) { Error = AP->PrintAsmMemoryOperand(MI, OpNo, InlineAsmVariant, - /*Modifier*/ 0, OS); + /*Modifier*/ nullptr, OS); } else { Error = AP->PrintAsmOperand(MI, OpNo, InlineAsmVariant, - /*Modifier*/ 0, OS); + /*Modifier*/ nullptr, OS); } } if (Error) { @@ -324,7 +329,7 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI, ++LastEmitted; const char *StrStart = LastEmitted; const char *StrEnd = strchr(StrStart, '}'); - if (StrEnd == 0) + if (!StrEnd) report_fatal_error("Unterminated ${:foo} operand in inline asm" " string: '" + Twine(AsmStr) + "'"); @@ -399,11 +404,11 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI, else { if (InlineAsm::isMemKind(OpFlags)) { Error = AP->PrintAsmMemoryOperand(MI, OpNo, InlineAsmVariant, - Modifier[0] ? Modifier : 0, + Modifier[0] ? Modifier : nullptr, OS); } else { Error = AP->PrintAsmOperand(MI, OpNo, InlineAsmVariant, - Modifier[0] ? Modifier : 0, OS); + Modifier[0] ? Modifier : nullptr, OS); } } } @@ -452,7 +457,7 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const { // Get the !srcloc metadata node if we have it, and decode the loc cookie from // it. unsigned LocCookie = 0; - const MDNode *LocMD = 0; + const MDNode *LocMD = nullptr; for (unsigned i = MI->getNumOperands(); i != 0; --i) { if (MI->getOperand(i-1).isMetadata() && (LocMD = MI->getOperand(i-1).getMetadata()) && diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt index b3eddac..b4ef185 100644 --- a/lib/CodeGen/AsmPrinter/CMakeLists.txt +++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt @@ -1,14 +1,18 @@ add_llvm_library(LLVMAsmPrinter + AddressPool.cpp ARMException.cpp AsmPrinter.cpp AsmPrinterDwarf.cpp AsmPrinterInlineAsm.cpp + DbgValueHistoryCalculator.cpp DIE.cpp DIEHash.cpp DwarfAccelTable.cpp DwarfCFIException.cpp DwarfDebug.cpp DwarfException.cpp + DwarfFile.cpp + DwarfStringPool.cpp DwarfUnit.cpp ErlangGCPrinter.cpp OcamlGCPrinter.cpp diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp index 26e8f2d..c3dcd9c 100644 --- a/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/lib/CodeGen/AsmPrinter/DIE.cpp @@ -104,15 +104,6 @@ void DIEAbbrev::print(raw_ostream &O) { void DIEAbbrev::dump() { print(dbgs()); } #endif -//===----------------------------------------------------------------------===// -// DIE Implementation -//===----------------------------------------------------------------------===// - -DIE::~DIE() { - for (unsigned i = 0, N = Children.size(); i < N; ++i) - delete Children[i]; -} - /// Climb up the parent chain to get the unit DIE to which this DIE /// belongs. 
 const DIE *DIE::getUnit() const {
@@ -131,7 +122,7 @@ const DIE *DIE::getUnitOrNull() const {
       return p;
     p = p->getParent();
   }
-  return NULL;
+  return nullptr;
 }
 
 DIEValue *DIE::findAttribute(dwarf::Attribute Attribute) const {
@@ -143,7 +134,7 @@ DIEValue *DIE::findAttribute(dwarf::Attribute Attribute) const {
   for (size_t i = 0; i < Values.size(); ++i)
     if (Abbrevs.getData()[i].getAttribute() == Attribute)
       return Values[i];
-  return NULL;
+  return nullptr;
 }
 
 #ifndef NDEBUG
@@ -385,12 +376,12 @@ void DIEEntry::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
 
   if (Form == dwarf::DW_FORM_ref_addr) {
     const DwarfDebug *DD = AP->getDwarfDebug();
-    unsigned Addr = Entry->getOffset();
+    unsigned Addr = Entry.getOffset();
     assert(!DD->useSplitDwarf() && "TODO: dwo files can't have relocations.");
     // For DW_FORM_ref_addr, output the offset from beginning of debug info
     // section. Entry->getOffset() returns the offset from start of the
     // compile unit.
-    DwarfCompileUnit *CU = DD->lookupUnit(Entry->getUnit());
+    DwarfCompileUnit *CU = DD->lookupUnit(Entry.getUnit());
     assert(CU && "CUDie should belong to a CU.");
     Addr += CU->getDebugInfoOffset();
     if (AP->MAI->doesDwarfUseRelocationsAcrossSections())
@@ -401,7 +392,7 @@ void DIEEntry::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
                        CU->getSectionSym(), DIEEntry::getRefAddrSize(AP));
   } else
-    AP->EmitInt32(Entry->getOffset());
+    AP->EmitInt32(Entry.getOffset());
 }
 
 unsigned DIEEntry::getRefAddrSize(AsmPrinter *AP) {
@@ -418,7 +409,7 @@ unsigned DIEEntry::getRefAddrSize(AsmPrinter *AP) {
 
 #ifndef NDEBUG
 void DIEEntry::print(raw_ostream &O) const {
-  O << format("Die: 0x%lx", (long)(intptr_t)Entry);
+  O << format("Die: 0x%lx", (long)(intptr_t)&Entry);
 }
 #endif
diff --git a/lib/CodeGen/AsmPrinter/DIE.h b/lib/CodeGen/AsmPrinter/DIE.h
index 7fefd4f..ef05f17 100644
--- a/lib/CodeGen/AsmPrinter/DIE.h
+++ b/lib/CodeGen/AsmPrinter/DIE.h
@@ -124,7 +124,13 @@ protected:
   /// Children DIEs.
   ///
-  std::vector<DIE *> Children;
+  // This can't be a vector because pointer validity is required for the
+  // Parent pointer and DIEEntry.
+  // It can't be a list because some clients need pointer validity before
+  // the object has been added to any child list
+  // (eg: DwarfUnit::constructVariableDIE). These aren't insurmountable, but may
+  // be more convoluted than beneficial.
+  std::vector<std::unique_ptr<DIE>> Children;
 
   DIE *Parent;
 
@@ -132,11 +138,15 @@ protected:
   ///
   SmallVector<DIEValue *, 12> Values;
 
+protected:
+  DIE()
+      : Offset(0), Size(0), Abbrev((dwarf::Tag)0, dwarf::DW_CHILDREN_no),
+        Parent(nullptr) {}
+
 public:
-  explicit DIE(unsigned Tag)
+  explicit DIE(dwarf::Tag Tag)
       : Offset(0), Size(0), Abbrev((dwarf::Tag)Tag, dwarf::DW_CHILDREN_no),
-        Parent(0) {}
-  ~DIE();
+        Parent(nullptr) {}
 
   // Accessors.
   DIEAbbrev &getAbbrev() { return Abbrev; }
@@ -145,7 +155,9 @@ public:
   dwarf::Tag getTag() const { return Abbrev.getTag(); }
   unsigned getOffset() const { return Offset; }
   unsigned getSize() const { return Size; }
-  const std::vector<DIE *> &getChildren() const { return Children; }
+  const std::vector<std::unique_ptr<DIE>> &getChildren() const {
+    return Children;
+  }
   const SmallVectorImpl<DIEValue *> &getValues() const { return Values; }
   DIE *getParent() const { return Parent; }
   /// Climb up the parent chain to get the compile or type unit DIE this DIE
@@ -166,11 +178,11 @@ public:
   /// addChild - Add a child to the DIE.
/// - void addChild(DIE *Child) { + void addChild(std::unique_ptr Child) { assert(!Child->getParent()); Abbrev.setChildrenFlag(dwarf::DW_CHILDREN_yes); - Children.push_back(Child); Child->Parent = this; + Children.push_back(std::move(Child)); } /// findAttribute - Find a value in the DIE with the attribute given, @@ -399,14 +411,13 @@ public: /// this class can also be used as a proxy for a debug information entry not /// yet defined (ie. types.) class DIEEntry : public DIEValue { - DIE *const Entry; + DIE &Entry; public: - explicit DIEEntry(DIE *E) : DIEValue(isEntry), Entry(E) { - assert(E && "Cannot construct a DIEEntry with a null DIE"); + explicit DIEEntry(DIE &E) : DIEValue(isEntry), Entry(E) { } - DIE *getEntry() const { return Entry; } + DIE &getEntry() const { return Entry; } /// EmitValue - Emit debug information entry offset. /// @@ -464,7 +475,7 @@ public: class DIELoc : public DIEValue, public DIE { mutable unsigned Size; // Size in bytes excluding size header. public: - DIELoc() : DIEValue(isLoc), DIE(0), Size(0) {} + DIELoc() : DIEValue(isLoc), Size(0) {} /// ComputeSize - Calculate the size of the location expression. /// @@ -507,7 +518,7 @@ public: class DIEBlock : public DIEValue, public DIE { mutable unsigned Size; // Size in bytes excluding size header. public: - DIEBlock() : DIEValue(isBlock), DIE(0), Size(0) {} + DIEBlock() : DIEValue(isBlock), Size(0) {} /// ComputeSize - Calculate the size of the location expression. /// diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp index 74beec1..c2fad59 100644 --- a/lib/CodeGen/AsmPrinter/DIEHash.cpp +++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp @@ -11,8 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "dwarfdebug" - #include "ByteStreamer.h" #include "DIEHash.h" #include "DIE.h" @@ -28,6 +26,8 @@ using namespace llvm; +#define DEBUG_TYPE "dwarfdebug" + /// \brief Grabs the string in whichever attribute is passed in and returns /// a reference to it. static StringRef getDIEStringAttr(const DIE &Die, uint16_t Attr) { @@ -309,7 +309,7 @@ void DIEHash::hashAttribute(AttrEntry Attr, dwarf::Tag Tag) { // ... An attribute that refers to another type entry T is processed as // follows: case DIEValue::isEntry: - hashDIEEntry(Attribute, Tag, *cast(Value)->getEntry()); + hashDIEEntry(Attribute, Tag, cast(Value)->getEntry()); break; case DIEValue::isInteger: { addULEB128('A'); @@ -463,20 +463,18 @@ void DIEHash::computeHash(const DIE &Die) { addAttributes(Die); // Then hash each of the children of the DIE. - for (std::vector::const_iterator I = Die.getChildren().begin(), - E = Die.getChildren().end(); - I != E; ++I) { + for (auto &C : Die.getChildren()) { // 7.27 Step 7 // If C is a nested type entry or a member function entry, ... - if (isType((*I)->getTag()) || (*I)->getTag() == dwarf::DW_TAG_subprogram) { - StringRef Name = getDIEStringAttr(**I, dwarf::DW_AT_name); + if (isType(C->getTag()) || C->getTag() == dwarf::DW_TAG_subprogram) { + StringRef Name = getDIEStringAttr(*C, dwarf::DW_AT_name); // ... and has a DW_AT_name attribute if (!Name.empty()) { - hashNestedType(**I, Name); + hashNestedType(*C, Name); continue; } } - computeHash(**I); + computeHash(*C); } // Following the last (or if there are no children), append a zero byte. 
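
// --- Illustrative sketch (editorial, not part of the patch) ----------------
// With addChild taking std::unique_ptr, a DIE now owns its children, which is
// why the patch can delete the hand-written ~DIE() above: destroying a node
// releases the whole subtree. The ownership pattern in miniature:

#include <cassert>
#include <memory>
#include <utility>
#include <vector>

class Node {
  Node *Parent = nullptr;                      // non-owning back-pointer
  std::vector<std::unique_ptr<Node>> Children; // owning edges

public:
  void addChild(std::unique_ptr<Node> Child) {
    assert(!Child->Parent && "node already has a parent");
    Child->Parent = this;
    Children.push_back(std::move(Child));
  }
  Node *getParent() const { return Parent; }
};

int main() {
  Node Root;
  Root.addChild(std::unique_ptr<Node>(new Node()));
  // Root's destructor frees the child; no manual delete loop is needed.
}
// ----------------------------------------------------------------------------
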
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.h b/lib/CodeGen/AsmPrinter/DIEHash.h
index 48f1601..175d660 100644
--- a/lib/CodeGen/AsmPrinter/DIEHash.h
+++ b/lib/CodeGen/AsmPrinter/DIEHash.h
@@ -89,7 +89,7 @@ class DIEHash {
   };
 
 public:
-  DIEHash(AsmPrinter *A = NULL) : AP(A) {}
+  DIEHash(AsmPrinter *A = nullptr) : AP(A) {}
 
   /// \brief Computes the ODR signature.
   uint64_t computeDIEODRSignature(const DIE &Die);
diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
new file mode 100644
index 0000000..6103254
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
@@ -0,0 +1,175 @@
+//===-- llvm/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DbgValueHistoryCalculator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <map>
+
+#define DEBUG_TYPE "dwarfdebug"
+
+namespace llvm {
+
+// \brief If @MI is a DBG_VALUE with debug value described by a
+// defined register, returns the number of this register.
+// Otherwise, returns 0.
+static unsigned isDescribedByReg(const MachineInstr &MI) {
+  assert(MI.isDebugValue());
+  assert(MI.getNumOperands() == 3);
+  // If the location of the variable is described using a register (directly
+  // or indirectly), this register is always the first operand.
+  return MI.getOperand(0).isReg() ? MI.getOperand(0).getReg() : 0;
+}
+
+void DbgValueHistoryMap::startInstrRange(const MDNode *Var,
+                                         const MachineInstr &MI) {
+  // Instruction range should start with a DBG_VALUE instruction for the
+  // variable.
+  assert(MI.isDebugValue() && MI.getDebugVariable() == Var);
+  auto &Ranges = VarInstrRanges[Var];
+  if (!Ranges.empty() && Ranges.back().second == nullptr &&
+      Ranges.back().first->isIdenticalTo(&MI)) {
+    DEBUG(dbgs() << "Coalescing identical DBG_VALUE entries:\n"
+                 << "\t" << Ranges.back().first << "\t" << MI << "\n");
+    return;
+  }
+  Ranges.push_back(std::make_pair(&MI, nullptr));
+}
+
+void DbgValueHistoryMap::endInstrRange(const MDNode *Var,
+                                       const MachineInstr &MI) {
+  auto &Ranges = VarInstrRanges[Var];
+  // Verify that the current instruction range is not yet closed.
+  assert(!Ranges.empty() && Ranges.back().second == nullptr);
+  // For now, instruction ranges are not allowed to cross basic block
+  // boundaries.
+  assert(Ranges.back().first->getParent() == MI.getParent());
+  Ranges.back().second = &MI;
+}
+
+unsigned DbgValueHistoryMap::getRegisterForVar(const MDNode *Var) const {
+  const auto &I = VarInstrRanges.find(Var);
+  if (I == VarInstrRanges.end())
+    return 0;
+  const auto &Ranges = I->second;
+  if (Ranges.empty() || Ranges.back().second != nullptr)
+    return 0;
+  return isDescribedByReg(*Ranges.back().first);
+}
+
+namespace {
+// Maps physreg numbers to the variables they describe.
+typedef std::map<unsigned, SmallVector<const MDNode *, 1>> RegDescribedVarsMap;
+}
+
+// \brief Claim that @Var is not described by @RegNo anymore.
+static void dropRegDescribedVar(RegDescribedVarsMap &RegVars, + unsigned RegNo, const MDNode *Var) { + const auto &I = RegVars.find(RegNo); + assert(RegNo != 0U && I != RegVars.end()); + auto &VarSet = I->second; + const auto &VarPos = std::find(VarSet.begin(), VarSet.end(), Var); + assert(VarPos != VarSet.end()); + VarSet.erase(VarPos); + // Don't keep empty sets in a map to keep it as small as possible. + if (VarSet.empty()) + RegVars.erase(I); +} + +// \brief Claim that @Var is now described by @RegNo. +static void addRegDescribedVar(RegDescribedVarsMap &RegVars, + unsigned RegNo, const MDNode *Var) { + assert(RegNo != 0U); + auto &VarSet = RegVars[RegNo]; + assert(std::find(VarSet.begin(), VarSet.end(), Var) == VarSet.end()); + VarSet.push_back(Var); +} + +// \brief Terminate the location range for variables described by register +// @RegNo by inserting @ClobberingInstr to their history. +static void clobberRegisterUses(RegDescribedVarsMap &RegVars, unsigned RegNo, + DbgValueHistoryMap &HistMap, + const MachineInstr &ClobberingInstr) { + const auto &I = RegVars.find(RegNo); + if (I == RegVars.end()) + return; + // Iterate over all variables described by this register and add this + // instruction to their history, clobbering it. + for (const auto &Var : I->second) + HistMap.endInstrRange(Var, ClobberingInstr); + RegVars.erase(I); +} + +// \brief Terminate location ranges for all variables, described by registers +// clobbered by @MI. +static void clobberRegisterUses(RegDescribedVarsMap &RegVars, + const MachineInstr &MI, + const TargetRegisterInfo *TRI, + DbgValueHistoryMap &HistMap) { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef() || !MO.getReg()) + continue; + for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); + ++AI) { + unsigned RegNo = *AI; + clobberRegisterUses(RegVars, RegNo, HistMap, MI); + } + } +} + +// \brief Terminate the location range for all register-described variables +// by inserting @ClobberingInstr to their history. +static void clobberAllRegistersUses(RegDescribedVarsMap &RegVars, + DbgValueHistoryMap &HistMap, + const MachineInstr &ClobberingInstr) { + for (const auto &I : RegVars) + for (const auto &Var : I.second) + HistMap.endInstrRange(Var, ClobberingInstr); + RegVars.clear(); +} + +void calculateDbgValueHistory(const MachineFunction *MF, + const TargetRegisterInfo *TRI, + DbgValueHistoryMap &Result) { + RegDescribedVarsMap RegVars; + + for (const auto &MBB : *MF) { + for (const auto &MI : MBB) { + if (!MI.isDebugValue()) { + // Not a DBG_VALUE instruction. It may clobber registers which describe + // some variables. + clobberRegisterUses(RegVars, MI, TRI, Result); + continue; + } + + assert(MI.getNumOperands() > 1 && "Invalid DBG_VALUE instruction!"); + const MDNode *Var = MI.getDebugVariable(); + + if (unsigned PrevReg = Result.getRegisterForVar(Var)) + dropRegDescribedVar(RegVars, PrevReg, Var); + + Result.startInstrRange(Var, MI); + + if (unsigned NewReg = isDescribedByReg(MI)) + addRegDescribedVar(RegVars, NewReg, Var); + } + + // Make sure locations for register-described variables are valid only + // until the end of the basic block (unless it's the last basic block, in + // which case let their liveness run off to the end of the function). 
+    if (!MBB.empty() && &MBB != &MF->back())
+      clobberAllRegistersUses(RegVars, Result, MBB.back());
+  }
+}
+
+}
diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h
new file mode 100644
index 0000000..b9177f0
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h
@@ -0,0 +1,54 @@
+//===-- llvm/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h ----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H_
+#define CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H_
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+class MachineFunction;
+class MachineInstr;
+class MDNode;
+class TargetRegisterInfo;
+
+// For each user variable, keep a list of instruction ranges where this variable
+// is accessible. The variables are listed in order of appearance.
+class DbgValueHistoryMap {
+  // Each instruction range starts with a DBG_VALUE instruction, specifying the
+  // location of a variable, which is assumed to be valid until the end of the
+  // range. If end is not specified, the location is valid until the start
+  // instruction of the next instruction range, or until the end of the
+  // function.
+  typedef std::pair<const MachineInstr *, const MachineInstr *> InstrRange;
+  typedef SmallVector<InstrRange, 4> InstrRanges;
+  typedef MapVector<const MDNode *, InstrRanges> InstrRangesMap;
+  InstrRangesMap VarInstrRanges;
+
+public:
+  void startInstrRange(const MDNode *Var, const MachineInstr &MI);
+  void endInstrRange(const MDNode *Var, const MachineInstr &MI);
+  // Returns the register currently describing @Var. If @Var is currently
+  // inaccessible or is not described by a register, returns 0.
+  unsigned getRegisterForVar(const MDNode *Var) const;
+
+  bool empty() const { return VarInstrRanges.empty(); }
+  void clear() { VarInstrRanges.clear(); }
+  InstrRangesMap::const_iterator begin() const { return VarInstrRanges.begin(); }
+  InstrRangesMap::const_iterator end() const { return VarInstrRanges.end(); }
+};
+
+void calculateDbgValueHistory(const MachineFunction *MF,
+                              const TargetRegisterInfo *TRI,
+                              DbgValueHistoryMap &Result);
+}
+
+#endif
diff --git a/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
index 470453f..3beb799 100644
--- a/lib/CodeGen/AsmPrinter/DebugLocEntry.h
+++ b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
@@ -23,75 +23,82 @@
 class DebugLocEntry {
   const MCSymbol *Begin;
   const MCSymbol *End;
 
-  // Type of entry that this represents.
-  enum EntryType { E_Location, E_Integer, E_ConstantFP, E_ConstantInt };
-  enum EntryType EntryKind;
+public:
+  /// A single location or constant.
+  struct Value {
+    Value(const MDNode *Var, int64_t i)
+        : Variable(Var), EntryKind(E_Integer) {
+      Constant.Int = i;
+    }
+    Value(const MDNode *Var, const ConstantFP *CFP)
+        : Variable(Var), EntryKind(E_ConstantFP) {
+      Constant.CFP = CFP;
+    }
+    Value(const MDNode *Var, const ConstantInt *CIP)
+        : Variable(Var), EntryKind(E_ConstantInt) {
+      Constant.CIP = CIP;
+    }
+    Value(const MDNode *Var, MachineLocation Loc)
+        : Variable(Var), EntryKind(E_Location), Loc(Loc) {
+    }
 
-  union {
-    int64_t Int;
-    const ConstantFP *CFP;
-    const ConstantInt *CIP;
-  } Constants;
+    // The variable to which this location entry corresponds.
+    const MDNode *Variable;
 
-  // The location in the machine frame.
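
// --- Illustrative sketch (editorial, not part of the patch) ----------------
// DbgValueHistoryMap above records, per variable, instruction ranges that a
// DBG_VALUE opens and that a clobbering definition (or the end of the basic
// block) closes; an open range is marked by a null end. The same bookkeeping
// over plain standard types, with instruction ids instead of MachineInstr*:

#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

// first = id of the instruction opening the range, second = id closing it
// (0 while the range is still open).
typedef std::pair<unsigned, unsigned> InstrRange;
static std::map<std::string, std::vector<InstrRange>> History;

void startRange(const std::string &Var, unsigned InstrId) {
  History[Var].push_back(InstrRange(InstrId, 0));
}

void endRange(const std::string &Var, unsigned InstrId) {
  std::vector<InstrRange> &Ranges = History[Var];
  if (!Ranges.empty() && Ranges.back().second == 0)
    Ranges.back().second = InstrId; // close the still-open range
}

int main() {
  startRange("x", 1); // DBG_VALUE describing x at instruction 1
  endRange("x", 4);   // the register holding x is clobbered at instruction 4
  std::printf("x is described in [%u, %u)\n", History["x"][0].first,
              History["x"][0].second);
}
// ----------------------------------------------------------------------------
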
- MachineLocation Loc; + // Type of entry that this represents. + enum EntryType { E_Location, E_Integer, E_ConstantFP, E_ConstantInt }; + enum EntryType EntryKind; - // The variable to which this location entry corresponds. - const MDNode *Variable; + // Either a constant, + union { + int64_t Int; + const ConstantFP *CFP; + const ConstantInt *CIP; + } Constant; - // The compile unit to which this location entry is referenced by. - const DwarfCompileUnit *Unit; + // Or a location in the machine frame. + MachineLocation Loc; - bool hasSameValueOrLocation(const DebugLocEntry &Next) { - if (EntryKind != Next.EntryKind) - return false; + bool operator==(const Value &other) const { + if (EntryKind != other.EntryKind) + return false; - bool EqualValues; - switch (EntryKind) { - case E_Location: - EqualValues = Loc == Next.Loc; - break; - case E_Integer: - EqualValues = Constants.Int == Next.Constants.Int; - break; - case E_ConstantFP: - EqualValues = Constants.CFP == Next.Constants.CFP; - break; - case E_ConstantInt: - EqualValues = Constants.CIP == Next.Constants.CIP; - break; + switch (EntryKind) { + case E_Location: + return Loc == other.Loc; + case E_Integer: + return Constant.Int == other.Constant.Int; + case E_ConstantFP: + return Constant.CFP == other.Constant.CFP; + case E_ConstantInt: + return Constant.CIP == other.Constant.CIP; + } + llvm_unreachable("unhandled EntryKind"); } - return EqualValues; - } + bool isLocation() const { return EntryKind == E_Location; } + bool isInt() const { return EntryKind == E_Integer; } + bool isConstantFP() const { return EntryKind == E_ConstantFP; } + bool isConstantInt() const { return EntryKind == E_ConstantInt; } + int64_t getInt() const { return Constant.Int; } + const ConstantFP *getConstantFP() const { return Constant.CFP; } + const ConstantInt *getConstantInt() const { return Constant.CIP; } + MachineLocation getLoc() const { return Loc; } + const MDNode *getVariable() const { return Variable; } + }; +private: + /// A list of locations/constants belonging to this entry. + SmallVector Values; + + /// The compile unit that this location entry is referenced by. + const DwarfCompileUnit *Unit; public: - DebugLocEntry() : Begin(0), End(0), Variable(0), Unit(0) { - Constants.Int = 0; - } - DebugLocEntry(const MCSymbol *B, const MCSymbol *E, MachineLocation &L, - const MDNode *V, const DwarfCompileUnit *U) - : Begin(B), End(E), Loc(L), Variable(V), Unit(U) { - Constants.Int = 0; - EntryKind = E_Location; - } - DebugLocEntry(const MCSymbol *B, const MCSymbol *E, int64_t i, - const DwarfCompileUnit *U) - : Begin(B), End(E), Variable(0), Unit(U) { - Constants.Int = i; - EntryKind = E_Integer; - } - DebugLocEntry(const MCSymbol *B, const MCSymbol *E, const ConstantFP *FPtr, - const DwarfCompileUnit *U) - : Begin(B), End(E), Variable(0), Unit(U) { - Constants.CFP = FPtr; - EntryKind = E_ConstantFP; - } - DebugLocEntry(const MCSymbol *B, const MCSymbol *E, const ConstantInt *IPtr, - const DwarfCompileUnit *U) - : Begin(B), End(E), Variable(0), Unit(U) { - Constants.CIP = IPtr; - EntryKind = E_ConstantInt; + DebugLocEntry() : Begin(nullptr), End(nullptr), Unit(nullptr) {} + DebugLocEntry(const MCSymbol *B, const MCSymbol *E, + Value Val, const DwarfCompileUnit *U) + : Begin(B), End(E), Unit(U) { + Values.push_back(std::move(Val)); } /// \brief Attempt to merge this DebugLocEntry with Next and return @@ -99,24 +106,17 @@ public: /// share the same Loc/Constant and if Next immediately follows this /// Entry. 
bool Merge(const DebugLocEntry &Next) { - if (End == Next.Begin && hasSameValueOrLocation(Next)) { + if ((End == Next.Begin && Values == Next.Values)) { End = Next.End; return true; } return false; } - bool isLocation() const { return EntryKind == E_Location; } - bool isInt() const { return EntryKind == E_Integer; } - bool isConstantFP() const { return EntryKind == E_ConstantFP; } - bool isConstantInt() const { return EntryKind == E_ConstantInt; } - int64_t getInt() const { return Constants.Int; } - const ConstantFP *getConstantFP() const { return Constants.CFP; } - const ConstantInt *getConstantInt() const { return Constants.CIP; } - const MDNode *getVariable() const { return Variable; } const MCSymbol *getBeginSym() const { return Begin; } const MCSymbol *getEndSym() const { return End; } const DwarfCompileUnit *getCU() const { return Unit; } - MachineLocation getLoc() const { return Loc; } + const ArrayRef getValues() const { return Values; } + void addValue(Value Val) { Values.push_back(Val); } }; } diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp index bcbb6c8..e9527c4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp @@ -29,14 +29,15 @@ DwarfAccelTable::DwarfAccelTable(ArrayRef atomList) : Header(8 + (atomList.size() * 4)), HeaderData(atomList), Entries(Allocator) {} -DwarfAccelTable::~DwarfAccelTable() {} - -void DwarfAccelTable::AddName(StringRef Name, const DIE *die, char Flags) { +void DwarfAccelTable::AddName(StringRef Name, MCSymbol *StrSym, const DIE *die, + char Flags) { assert(Data.empty() && "Already finalized!"); // If the string is in the list already then add this die to the list // otherwise add a new one. DataArray &DIEs = Entries[Name]; - DIEs.push_back(new (Allocator) HashDataContents(die, Flags)); + assert(!DIEs.StrSym || DIEs.StrSym == StrSym); + DIEs.StrSym = StrSym; + DIEs.Values.push_back(new (Allocator) HashDataContents(die, Flags)); } void DwarfAccelTable::ComputeBucketCount(void) { @@ -72,9 +73,10 @@ void DwarfAccelTable::FinalizeTable(AsmPrinter *Asm, StringRef Prefix) { EI != EE; ++EI) { // Unique the entries. - std::stable_sort(EI->second.begin(), EI->second.end(), compareDIEs); - EI->second.erase(std::unique(EI->second.begin(), EI->second.end()), - EI->second.end()); + std::stable_sort(EI->second.Values.begin(), EI->second.Values.end(), compareDIEs); + EI->second.Values.erase( + std::unique(EI->second.Values.begin(), EI->second.Values.end()), + EI->second.Values.end()); HashData *Entry = new (Allocator) HashData(EI->getKey(), EI->second); Data.push_back(Entry); @@ -181,21 +183,18 @@ void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfFile *D) { // Remember to emit the label for our offset. Asm->OutStreamer.EmitLabel((*HI)->Sym); Asm->OutStreamer.AddComment((*HI)->Str); - Asm->EmitSectionOffset(D->getStringPoolEntry((*HI)->Str), - D->getStringPoolSym()); + Asm->EmitSectionOffset((*HI)->Data.StrSym, + D->getStringPool().getSectionSymbol()); Asm->OutStreamer.AddComment("Num DIEs"); - Asm->EmitInt32((*HI)->Data.size()); - for (ArrayRef::const_iterator - DI = (*HI)->Data.begin(), - DE = (*HI)->Data.end(); - DI != DE; ++DI) { + Asm->EmitInt32((*HI)->Data.Values.size()); + for (HashDataContents *HD : (*HI)->Data.Values) { // Emit the DIE offset - Asm->EmitInt32((*DI)->Die->getOffset()); + Asm->EmitInt32(HD->Die->getOffset()); // If we have multiple Atoms emit that info too. // FIXME: A bit of a hack, we either emit only one atom or all info. 
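
// --- Illustrative sketch (editorial, not part of the patch) ----------------
// FinalizeTable above dedupes each name's DIE list with the classic
// stable_sort / unique / erase idiom before hashing. On a plain vector:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> DieOffsets = {40, 12, 40, 7, 12};
  std::stable_sort(DieOffsets.begin(), DieOffsets.end());
  // std::unique only moves duplicates to the tail; erase actually drops them.
  DieOffsets.erase(std::unique(DieOffsets.begin(), DieOffsets.end()),
                   DieOffsets.end());
  for (int O : DieOffsets)
    std::printf("%d\n", O); // prints 7, 12, 40
}
// ----------------------------------------------------------------------------
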
if (HeaderData.Atoms.size() > 1) { - Asm->EmitInt16((*DI)->Die->getTag()); - Asm->EmitInt8((*DI)->Flags); + Asm->EmitInt16(HD->Die->getTag()); + Asm->EmitInt8(HD->Flags); } } // Emit a 0 to terminate the data unless we have a hash collision. @@ -235,10 +234,8 @@ void DwarfAccelTable::print(raw_ostream &O) { EE = Entries.end(); EI != EE; ++EI) { O << "Name: " << EI->getKeyData() << "\n"; - for (DataArray::const_iterator DI = EI->second.begin(), - DE = EI->second.end(); - DI != DE; ++DI) - (*DI)->print(O); + for (HashDataContents *HD : EI->second.Values) + HD->print(O); } O << "Buckets and Hashes: \n"; diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h index 4a14497..a3cc95f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h +++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h @@ -18,6 +18,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringMap.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" @@ -125,7 +126,8 @@ public: uint16_t type; // enum AtomType uint16_t form; // DWARF DW_FORM_ defines - Atom(uint16_t type, uint16_t form) : type(type), form(form) {} + LLVM_CONSTEXPR Atom(uint16_t type, uint16_t form) + : type(type), form(form) {} #ifndef NDEBUG void print(raw_ostream &O) { O << "Type: " << dwarf::AtomTypeString(type) << "\n" @@ -177,12 +179,19 @@ public: }; private: + // String Data + struct DataArray { + MCSymbol *StrSym; + std::vector Values; + DataArray() : StrSym(nullptr) {} + }; + friend struct HashData; struct HashData { StringRef Str; uint32_t HashValue; MCSymbol *Sym; - ArrayRef Data; // offsets - HashData(StringRef S, ArrayRef Data) + DwarfAccelTable::DataArray &Data; // offsets + HashData(StringRef S, DwarfAccelTable::DataArray &Data) : Str(S), Data(Data) { HashValue = DwarfAccelTable::HashDJB(S); } @@ -196,10 +205,10 @@ private: else O << ""; O << "\n"; - for (size_t i = 0; i < Data.size(); i++) { - O << " Offset: " << Data[i]->Die->getOffset() << "\n"; - O << " Tag: " << dwarf::TagString(Data[i]->Die->getTag()) << "\n"; - O << " Flags: " << Data[i]->Flags << "\n"; + for (HashDataContents *C : Data.Values) { + O << " Offset: " << C->Die->getOffset() << "\n"; + O << " Tag: " << dwarf::TagString(C->Die->getTag()) << "\n"; + O << " Flags: " << C->Flags << "\n"; } } void dump() { print(dbgs()); } @@ -224,8 +233,6 @@ private: TableHeaderData HeaderData; std::vector Data; - // String Data - typedef std::vector DataArray; typedef StringMap StringEntries; StringEntries Entries; @@ -238,8 +245,8 @@ private: // Public Implementation public: DwarfAccelTable(ArrayRef); - ~DwarfAccelTable(); - void AddName(StringRef, const DIE *, char = 0); + void AddName(StringRef Name, MCSymbol *StrSym, const DIE *Die, + char Flags = 0); void FinalizeTable(AsmPrinter *, StringRef); void Emit(AsmPrinter *, MCSymbol *, DwarfFile *); #ifndef NDEBUG diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 11345eb..2a0615d 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -11,12 +11,10 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "dwarfdebug" #include "ByteStreamer.h" #include "DwarfDebug.h" #include "DIE.h" #include "DIEHash.h" -#include "DwarfAccelTable.h" #include "DwarfUnit.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" @@ -51,6 +49,8 @@ #include 
"llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "dwarfdebug" + static cl::opt DisableDebugInfoPrinting("disable-debug-info-print", cl::Hidden, cl::desc("Disable debug info printing")); @@ -107,8 +107,6 @@ static const char *const DbgTimerName = "DWARF Debug Writer"; //===----------------------------------------------------------------------===// -namespace llvm { - /// resolve - Look in the DwarfDebug map for the MDNode that /// corresponds to the reference. template T DbgVariable::resolve(DIRef Ref) const { @@ -120,7 +118,6 @@ bool DbgVariable::isBlockByrefVariable() const { return Var.isBlockByrefVariable(DD->getTypeIdentifierMap()); } - DIType DbgVariable::getType() const { DIType Ty = Var.getType().resolve(DD->getTypeIdentifierMap()); // FIXME: isBlockByrefVariable should be reformulated in terms of complex @@ -166,29 +163,32 @@ DIType DbgVariable::getType() const { return Ty; } -} // end llvm namespace - -/// Return Dwarf Version by checking module flags. -static unsigned getDwarfVersionFromModule(const Module *M) { - Value *Val = M->getModuleFlag("Dwarf Version"); - if (!Val) - return dwarf::DWARF_VERSION; - return cast(Val)->getZExtValue(); -} +static LLVM_CONSTEXPR DwarfAccelTable::Atom TypeAtoms[] = { + DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4), + DwarfAccelTable::Atom(dwarf::DW_ATOM_die_tag, dwarf::DW_FORM_data2), + DwarfAccelTable::Atom(dwarf::DW_ATOM_type_flags, dwarf::DW_FORM_data1)}; DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) - : Asm(A), MMI(Asm->MMI), FirstCU(0), PrevLabel(NULL), GlobalRangeCount(0), - InfoHolder(A, "info_string", DIEValueAllocator), + : Asm(A), MMI(Asm->MMI), FirstCU(nullptr), PrevLabel(nullptr), + GlobalRangeCount(0), InfoHolder(A, "info_string", DIEValueAllocator), UsedNonDefaultText(false), - SkeletonHolder(A, "skel_string", DIEValueAllocator) { - - DwarfInfoSectionSym = DwarfAbbrevSectionSym = DwarfStrSectionSym = 0; - DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = DwarfLineSectionSym = 0; - DwarfAddrSectionSym = 0; - DwarfAbbrevDWOSectionSym = DwarfStrDWOSectionSym = 0; - FunctionBeginSym = FunctionEndSym = 0; - CurFn = 0; - CurMI = 0; + SkeletonHolder(A, "skel_string", DIEValueAllocator), + AccelNames(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, + dwarf::DW_FORM_data4)), + AccelObjC(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, + dwarf::DW_FORM_data4)), + AccelNamespace(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, + dwarf::DW_FORM_data4)), + AccelTypes(TypeAtoms) { + + DwarfInfoSectionSym = DwarfAbbrevSectionSym = DwarfStrSectionSym = nullptr; + DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = nullptr; + DwarfLineSectionSym = nullptr; + DwarfAddrSectionSym = nullptr; + DwarfAbbrevDWOSectionSym = DwarfStrDWOSectionSym = nullptr; + FunctionBeginSym = FunctionEndSym = nullptr; + CurFn = nullptr; + CurMI = nullptr; // Turn on accelerator tables for Darwin by default, pubnames by // default for non-Darwin, and handle split dwarf. @@ -209,9 +209,8 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) else HasDwarfPubSections = DwarfPubSections == Enable; - DwarfVersion = DwarfVersionNumber - ? DwarfVersionNumber - : getDwarfVersionFromModule(MMI->getModule()); + DwarfVersion = DwarfVersionNumber ? 
DwarfVersionNumber + : MMI->getModule()->getDwarfVersion(); { NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); @@ -219,76 +218,22 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) } } +// Define out of line so we don't have to include DwarfUnit.h in DwarfDebug.h. +DwarfDebug::~DwarfDebug() { } + // Switch to the specified MCSection and emit an assembler // temporary label to it if SymbolStem is specified. static MCSymbol *emitSectionSym(AsmPrinter *Asm, const MCSection *Section, - const char *SymbolStem = 0) { + const char *SymbolStem = nullptr) { Asm->OutStreamer.SwitchSection(Section); if (!SymbolStem) - return 0; + return nullptr; MCSymbol *TmpSym = Asm->GetTempSymbol(SymbolStem); Asm->OutStreamer.EmitLabel(TmpSym); return TmpSym; } -DwarfFile::~DwarfFile() { - for (DwarfUnit *DU : CUs) - delete DU; -} - -MCSymbol *DwarfFile::getStringPoolSym() { - return Asm->GetTempSymbol(StringPref); -} - -MCSymbol *DwarfFile::getStringPoolEntry(StringRef Str) { - std::pair &Entry = - StringPool.GetOrCreateValue(Str).getValue(); - if (Entry.first) - return Entry.first; - - Entry.second = NextStringPoolNumber++; - return Entry.first = Asm->GetTempSymbol(StringPref, Entry.second); -} - -unsigned DwarfFile::getStringPoolIndex(StringRef Str) { - std::pair &Entry = - StringPool.GetOrCreateValue(Str).getValue(); - if (Entry.first) - return Entry.second; - - Entry.second = NextStringPoolNumber++; - Entry.first = Asm->GetTempSymbol(StringPref, Entry.second); - return Entry.second; -} - -unsigned DwarfFile::getAddrPoolIndex(const MCSymbol *Sym, bool TLS) { - std::pair P = AddressPool.insert( - std::make_pair(Sym, AddressPoolEntry(NextAddrPoolNumber, TLS))); - if (P.second) - ++NextAddrPoolNumber; - return P.first->second.Number; -} - -// Define a unique number for the abbreviation. -// -void DwarfFile::assignAbbrevNumber(DIEAbbrev &Abbrev) { - // Check the set for priors. - DIEAbbrev *InSet = AbbreviationsSet.GetOrInsertNode(&Abbrev); - - // If it's newly added. - if (InSet == &Abbrev) { - // Add to abbreviation list. - Abbreviations.push_back(&Abbrev); - - // Assign the vector position + 1 as its number. - Abbrev.setNumber(Abbreviations.size()); - } else { - // Assign existing abbreviation number. - Abbrev.setNumber(InSet->getNumber()); - } -} - static bool isObjCClass(StringRef Name) { return Name.startswith("+") || Name.startswith("-"); } @@ -328,26 +273,26 @@ static bool SectionSort(const MCSection *A, const MCSection *B) { // TODO: Determine whether or not we should add names for programs // that do not have a DW_AT_name or DW_AT_linkage_name field - this // is only slightly different than the lookup of non-standard ObjC names. -static void addSubprogramNames(DwarfUnit *TheU, DISubprogram SP, DIE *Die) { +void DwarfDebug::addSubprogramNames(DISubprogram SP, DIE &Die) { if (!SP.isDefinition()) return; - TheU->addAccelName(SP.getName(), Die); + addAccelName(SP.getName(), Die); // If the linkage name is different than the name, go ahead and output // that as well into the name table. if (SP.getLinkageName() != "" && SP.getName() != SP.getLinkageName()) - TheU->addAccelName(SP.getLinkageName(), Die); + addAccelName(SP.getLinkageName(), Die); // If this is an Objective-C selector name add it to the ObjC accelerator // too. 
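
// The accelerator tables wired up in the constructor above bucket every name
// through DwarfAccelTable::HashDJB, the classic Bernstein hash used by the
// Apple accelerator-table format. A standalone sketch of that hash
// (illustration only, not the patch's code):
#include <cstdint>

static uint32_t hashDJB(const char *S) {
  uint32_t H = 5381;
  for (; *S; ++S)
    H = H * 33 + static_cast<unsigned char>(*S); // i.e. (H << 5) + H + c
  return H;
}
// For example, hashDJB("main") == 0x7C9A7F6A.
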
if (isObjCClass(SP.getName())) { StringRef Class, Category; getObjCClassCategory(SP.getName(), Class, Category); - TheU->addAccelObjC(Class, Die); + addAccelObjC(Class, Die); if (Category != "") - TheU->addAccelObjC(Category, Die); + addAccelObjC(Category, Die); // Also add the base method name to the name table. - TheU->addAccelName(getObjCMethodName(SP.getName()), Die); + addAccelName(getObjCMethodName(SP.getName()), Die); } } @@ -367,58 +312,21 @@ bool DwarfDebug::isSubprogramContext(const MDNode *Context) { // Find DIE for the given subprogram and attach appropriate DW_AT_low_pc // and DW_AT_high_pc attributes. If there are global variables in this // scope then create and insert DIEs for these variables. -DIE *DwarfDebug::updateSubprogramScopeDIE(DwarfCompileUnit *SPCU, +DIE &DwarfDebug::updateSubprogramScopeDIE(DwarfCompileUnit &SPCU, DISubprogram SP) { - DIE *SPDie = SPCU->getDIE(SP); - - assert(SPDie && "Unable to find subprogram DIE!"); - - // If we're updating an abstract DIE, then we will be adding the children and - // object pointer later on. But what we don't want to do is process the - // concrete DIE twice. - if (DIE *AbsSPDIE = AbstractSPDies.lookup(SP)) { - // Pick up abstract subprogram DIE. - SPDie = - SPCU->createAndAddDIE(dwarf::DW_TAG_subprogram, *SPCU->getUnitDie()); - SPCU->addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin, AbsSPDIE); - } else { - DISubprogram SPDecl = SP.getFunctionDeclaration(); - if (!SPDecl.isSubprogram()) { - // There is not any need to generate specification DIE for a function - // defined at compile unit level. If a function is defined inside another - // function then gdb prefers the definition at top level and but does not - // expect specification DIE in parent function. So avoid creating - // specification DIE for a function defined inside a function. - DIScope SPContext = resolve(SP.getContext()); - if (SP.isDefinition() && !SPContext.isCompileUnit() && - !SPContext.isFile() && !isSubprogramContext(SPContext)) { - SPCU->addFlag(SPDie, dwarf::DW_AT_declaration); - - // Add arguments. - DICompositeType SPTy = SP.getType(); - DIArray Args = SPTy.getTypeArray(); - uint16_t SPTag = SPTy.getTag(); - if (SPTag == dwarf::DW_TAG_subroutine_type) - SPCU->constructSubprogramArguments(*SPDie, Args); - DIE *SPDeclDie = SPDie; - SPDie = SPCU->createAndAddDIE(dwarf::DW_TAG_subprogram, - *SPCU->getUnitDie()); - SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification, SPDeclDie); - } - } - } + DIE *SPDie = SPCU.getOrCreateSubprogramDIE(SP); - attachLowHighPC(SPCU, SPDie, FunctionBeginSym, FunctionEndSym); + attachLowHighPC(SPCU, *SPDie, FunctionBeginSym, FunctionEndSym); const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); MachineLocation Location(RI->getFrameRegister(*Asm->MF)); - SPCU->addAddress(SPDie, dwarf::DW_AT_frame_base, Location); + SPCU.addAddress(*SPDie, dwarf::DW_AT_frame_base, Location); // Add name to the name table, we do this here because we're guaranteed // to have concrete versions of our DW_TAG_subprogram nodes. 
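
// A minimal sketch of the selector splitting getObjCClassCategory and
// getObjCMethodName perform above, assuming well-formed selectors of the
// form "-[Class method]" or "+[Class(Category) method]"; the helper name is
// illustrative, not LLVM's.
#include <string>

static void splitObjCSelector(const std::string &In, std::string &Class,
                              std::string &Category, std::string &Method) {
  // "-[NSString stringWithCString:]" -> Class "NSString",
  //                                     Method "stringWithCString:"
  std::string::size_type Space = In.find(' ');
  std::string ClassPart = In.substr(2, Space - 2); // skip leading "+[" / "-["
  std::string::size_type Paren = ClassPart.find('(');
  if (Paren == std::string::npos) {
    Class = ClassPart;
    Category.clear();
  } else {
    Class = ClassPart.substr(0, Paren);
    Category = ClassPart.substr(Paren + 1, ClassPart.size() - Paren - 2);
  }
  Method = In.substr(Space + 1, In.size() - Space - 2); // drop trailing ']'
}
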
- addSubprogramNames(SPCU, SP, SPDie); + addSubprogramNames(SP, *SPDie); - return SPDie; + return *SPDie; } /// Check whether we should create a DIE for the given Scope, return true @@ -442,16 +350,16 @@ bool DwarfDebug::isLexicalScopeDIENull(LexicalScope *Scope) { return !End; } -static void addSectionLabel(AsmPrinter *Asm, DwarfUnit *U, DIE *D, +static void addSectionLabel(AsmPrinter &Asm, DwarfUnit &U, DIE &D, dwarf::Attribute A, const MCSymbol *L, const MCSymbol *Sec) { - if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) - U->addSectionLabel(D, A, L); + if (Asm.MAI->doesDwarfUseRelocationsAcrossSections()) + U.addSectionLabel(D, A, L); else - U->addSectionDelta(D, A, L, Sec); + U.addSectionDelta(D, A, L, Sec); } -void DwarfDebug::addScopeRangeList(DwarfCompileUnit *TheCU, DIE *ScopeDIE, +void DwarfDebug::addScopeRangeList(DwarfCompileUnit &TheCU, DIE &ScopeDIE, const SmallVectorImpl &Range) { // Emit offset in .debug_range as a relocatable label. emitDIE will handle // emitting it appropriately. @@ -460,10 +368,10 @@ void DwarfDebug::addScopeRangeList(DwarfCompileUnit *TheCU, DIE *ScopeDIE, // Under fission, ranges are specified by constant offsets relative to the // CU's DW_AT_GNU_ranges_base. if (useSplitDwarf()) - TheCU->addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, RangeSym, - DwarfDebugRangeSectionSym); + TheCU.addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, RangeSym, + DwarfDebugRangeSectionSym); else - addSectionLabel(Asm, TheCU, ScopeDIE, dwarf::DW_AT_ranges, RangeSym, + addSectionLabel(*Asm, TheCU, ScopeDIE, dwarf::DW_AT_ranges, RangeSym, DwarfDebugRangeSectionSym); RangeSpanList List(RangeSym); @@ -473,227 +381,256 @@ void DwarfDebug::addScopeRangeList(DwarfCompileUnit *TheCU, DIE *ScopeDIE, } // Add the range list to the set of ranges to be emitted. - TheCU->addRangeList(std::move(List)); + TheCU.addRangeList(std::move(List)); +} + +void DwarfDebug::attachRangesOrLowHighPC(DwarfCompileUnit &TheCU, DIE &Die, + const SmallVectorImpl &Ranges) { + assert(!Ranges.empty()); + if (Ranges.size() == 1) + attachLowHighPC(TheCU, Die, getLabelBeforeInsn(Ranges.front().first), + getLabelAfterInsn(Ranges.front().second)); + else + addScopeRangeList(TheCU, Die, Ranges); } // Construct new DW_TAG_lexical_block for this scope and attach // DW_AT_low_pc/DW_AT_high_pc labels. -DIE *DwarfDebug::constructLexicalScopeDIE(DwarfCompileUnit *TheCU, - LexicalScope *Scope) { +std::unique_ptr +DwarfDebug::constructLexicalScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope) { if (isLexicalScopeDIENull(Scope)) - return 0; + return nullptr; - DIE *ScopeDIE = new DIE(dwarf::DW_TAG_lexical_block); + auto ScopeDIE = make_unique(dwarf::DW_TAG_lexical_block); if (Scope->isAbstractScope()) return ScopeDIE; - const SmallVectorImpl &ScopeRanges = Scope->getRanges(); - - // If we have multiple ranges, emit them into the range section. - if (ScopeRanges.size() > 1) { - addScopeRangeList(TheCU, ScopeDIE, ScopeRanges); - return ScopeDIE; - } - - // Construct the address range for this DIE. 
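
// attachRangesOrLowHighPC, added above, encodes a simple policy: one
// contiguous range becomes DW_AT_low_pc/DW_AT_high_pc, anything more is
// routed to a .debug_ranges list. The same dispatch over assumed stand-in
// types:
#include <cassert>
#include <utility>
#include <vector>

struct Label {};
struct Unit {
  void addLowHighPC(const Label &, const Label &) { /* low/high pc pair */ }
  void addRangeList(const std::vector<std::pair<Label, Label>> &) {
    /* emit a range list, reference it via DW_AT_ranges */
  }
};

static void attachRanges(Unit &U,
                         const std::vector<std::pair<Label, Label>> &Ranges) {
  assert(!Ranges.empty() && "scope must cover some instructions");
  if (Ranges.size() == 1)
    U.addLowHighPC(Ranges.front().first, Ranges.front().second);
  else
    U.addRangeList(Ranges);
}
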
- SmallVectorImpl::const_iterator RI = ScopeRanges.begin(); - MCSymbol *Start = getLabelBeforeInsn(RI->first); - MCSymbol *End = getLabelAfterInsn(RI->second); - assert(End && "End label should not be null!"); - - assert(Start->isDefined() && "Invalid starting label for an inlined scope!"); - assert(End->isDefined() && "Invalid end label for an inlined scope!"); - - attachLowHighPC(TheCU, ScopeDIE, Start, End); + attachRangesOrLowHighPC(TheCU, *ScopeDIE, Scope->getRanges()); return ScopeDIE; } // This scope represents inlined body of a function. Construct DIE to // represent this concrete inlined copy of the function. -DIE *DwarfDebug::constructInlinedScopeDIE(DwarfCompileUnit *TheCU, - LexicalScope *Scope) { - const SmallVectorImpl &ScopeRanges = Scope->getRanges(); - assert(!ScopeRanges.empty() && - "LexicalScope does not have instruction markers!"); - - if (!Scope->getScopeNode()) - return NULL; +std::unique_ptr +DwarfDebug::constructInlinedScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope) { + assert(Scope->getScopeNode()); DIScope DS(Scope->getScopeNode()); DISubprogram InlinedSP = getDISubprogram(DS); - DIE *OriginDIE = TheCU->getDIE(InlinedSP); - if (!OriginDIE) { - DEBUG(dbgs() << "Unable to find original DIE for an inlined subprogram."); - return NULL; - } - - DIE *ScopeDIE = new DIE(dwarf::DW_TAG_inlined_subroutine); - TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin, OriginDIE); + // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram + // was inlined from another compile unit. + DIE *OriginDIE = AbstractSPDies[InlinedSP]; + assert(OriginDIE && "Unable to find original DIE for an inlined subprogram."); - // If we have multiple ranges, emit them into the range section. - if (ScopeRanges.size() > 1) - addScopeRangeList(TheCU, ScopeDIE, ScopeRanges); - else { - SmallVectorImpl::const_iterator RI = ScopeRanges.begin(); - MCSymbol *StartLabel = getLabelBeforeInsn(RI->first); - MCSymbol *EndLabel = getLabelAfterInsn(RI->second); + auto ScopeDIE = make_unique(dwarf::DW_TAG_inlined_subroutine); + TheCU.addDIEEntry(*ScopeDIE, dwarf::DW_AT_abstract_origin, *OriginDIE); - if (StartLabel == 0 || EndLabel == 0) - llvm_unreachable("Unexpected Start and End labels for an inlined scope!"); - - assert(StartLabel->isDefined() && - "Invalid starting label for an inlined scope!"); - assert(EndLabel->isDefined() && "Invalid end label for an inlined scope!"); - - attachLowHighPC(TheCU, ScopeDIE, StartLabel, EndLabel); - } + attachRangesOrLowHighPC(TheCU, *ScopeDIE, Scope->getRanges()); InlinedSubprogramDIEs.insert(OriginDIE); // Add the call site information to the DIE. DILocation DL(Scope->getInlinedAt()); - TheCU->addUInt( - ScopeDIE, dwarf::DW_AT_call_file, None, - TheCU->getOrCreateSourceID(DL.getFilename(), DL.getDirectory())); - TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_line, None, DL.getLineNumber()); + TheCU.addUInt(*ScopeDIE, dwarf::DW_AT_call_file, None, + TheCU.getOrCreateSourceID(DL.getFilename(), DL.getDirectory())); + TheCU.addUInt(*ScopeDIE, dwarf::DW_AT_call_line, None, DL.getLineNumber()); // Add name to the name table, we do this here because we're guaranteed // to have concrete versions of our DW_TAG_inlined_subprogram nodes. 
- addSubprogramNames(TheCU, InlinedSP, ScopeDIE); + addSubprogramNames(InlinedSP, *ScopeDIE); return ScopeDIE; } -DIE *DwarfDebug::createScopeChildrenDIE(DwarfCompileUnit *TheCU, - LexicalScope *Scope, - SmallVectorImpl &Children) { - DIE *ObjectPointer = NULL; +static std::unique_ptr constructVariableDIE(DwarfCompileUnit &TheCU, + DbgVariable &DV, + const LexicalScope &Scope, + DIE *&ObjectPointer) { + auto Var = TheCU.constructVariableDIE(DV, Scope.isAbstractScope()); + if (DV.isObjectPointer()) + ObjectPointer = Var.get(); + return Var; +} + +DIE *DwarfDebug::createScopeChildrenDIE( + DwarfCompileUnit &TheCU, LexicalScope *Scope, + SmallVectorImpl> &Children) { + DIE *ObjectPointer = nullptr; // Collect arguments for current function. if (LScopes.isCurrentFunctionScope(Scope)) { for (DbgVariable *ArgDV : CurrentFnArguments) if (ArgDV) - if (DIE *Arg = - TheCU->constructVariableDIE(*ArgDV, Scope->isAbstractScope())) { - Children.push_back(Arg); - if (ArgDV->isObjectPointer()) - ObjectPointer = Arg; - } + Children.push_back( + constructVariableDIE(TheCU, *ArgDV, *Scope, ObjectPointer)); // If this is a variadic function, add an unspecified parameter. DISubprogram SP(Scope->getScopeNode()); DIArray FnArgs = SP.getType().getTypeArray(); if (FnArgs.getElement(FnArgs.getNumElements() - 1) .isUnspecifiedParameter()) { - DIE *Ellipsis = new DIE(dwarf::DW_TAG_unspecified_parameters); - Children.push_back(Ellipsis); + Children.push_back( + make_unique(dwarf::DW_TAG_unspecified_parameters)); } } // Collect lexical scope children first. for (DbgVariable *DV : ScopeVariables.lookup(Scope)) - if (DIE *Variable = TheCU->constructVariableDIE(*DV, - Scope->isAbstractScope())) { - Children.push_back(Variable); - if (DV->isObjectPointer()) - ObjectPointer = Variable; - } + Children.push_back(constructVariableDIE(TheCU, *DV, *Scope, ObjectPointer)); + for (LexicalScope *LS : Scope->getChildren()) - if (DIE *Nested = constructScopeDIE(TheCU, LS)) - Children.push_back(Nested); + if (std::unique_ptr Nested = constructScopeDIE(TheCU, LS)) + Children.push_back(std::move(Nested)); return ObjectPointer; } +void DwarfDebug::createAndAddScopeChildren(DwarfCompileUnit &TheCU, + LexicalScope *Scope, DIE &ScopeDIE) { + // We create children when the scope DIE is not null. + SmallVector, 8> Children; + if (DIE *ObjectPointer = createScopeChildrenDIE(TheCU, Scope, Children)) + TheCU.addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer, *ObjectPointer); + + // Add children + for (auto &I : Children) + ScopeDIE.addChild(std::move(I)); +} + +void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope) { + assert(Scope && Scope->getScopeNode()); + assert(Scope->isAbstractScope()); + assert(!Scope->getInlinedAt()); + + DISubprogram SP(Scope->getScopeNode()); + + ProcessedSPNodes.insert(SP); + + DIE *&AbsDef = AbstractSPDies[SP]; + if (AbsDef) + return; + + // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram + // was inlined from another compile unit. + DwarfCompileUnit &SPCU = *SPMap[SP]; + DIE *ContextDIE; + + // Some of this is duplicated from DwarfUnit::getOrCreateSubprogramDIE, with + // the important distinction that the DIDescriptor is not associated with the + // DIE (since the DIDescriptor will be associated with the concrete DIE, if + // any). It could be refactored to some common utility function. 
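
// These hunks switch scope children to std::unique_ptr<DIE> built in a local
// vector and moved into the parent, so a scope DIE that turns out to be null
// never leaks half-built children. The ownership shape with toy types (the
// patch uses llvm::make_unique; plain new keeps this sketch self-contained):
#include <memory>
#include <utility>
#include <vector>

struct DIENode {
  std::vector<std::unique_ptr<DIENode>> Children;
  void addChild(std::unique_ptr<DIENode> Child) {
    Children.push_back(std::move(Child));
  }
};

static std::unique_ptr<DIENode> buildScope() {
  std::vector<std::unique_ptr<DIENode>> Children;
  Children.emplace_back(new DIENode()); // e.g. a variable DIE
  std::unique_ptr<DIENode> Scope(new DIENode());
  for (auto &C : Children)
    Scope->addChild(std::move(C)); // ownership transfers into the parent
  return Scope;
}
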
+ if (DISubprogram SPDecl = SP.getFunctionDeclaration()) { + ContextDIE = &SPCU.getUnitDie(); + SPCU.getOrCreateSubprogramDIE(SPDecl); + } else + ContextDIE = SPCU.getOrCreateContextDIE(resolve(SP.getContext())); + + // Passing null as the associated DIDescriptor because the abstract definition + // shouldn't be found by lookup. + AbsDef = &SPCU.createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, + DIDescriptor()); + SPCU.applySubprogramAttributes(SP, *AbsDef); + SPCU.addGlobalName(SP.getName(), *AbsDef, resolve(SP.getContext())); + + SPCU.addUInt(*AbsDef, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined); + createAndAddScopeChildren(SPCU, Scope, *AbsDef); +} + +DIE &DwarfDebug::constructSubprogramScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope) { + assert(Scope && Scope->getScopeNode()); + assert(!Scope->getInlinedAt()); + assert(!Scope->isAbstractScope()); + DISubprogram Sub(Scope->getScopeNode()); + + assert(Sub.isSubprogram()); + + ProcessedSPNodes.insert(Sub); + + DIE &ScopeDIE = updateSubprogramScopeDIE(TheCU, Sub); + + createAndAddScopeChildren(TheCU, Scope, ScopeDIE); + + return ScopeDIE; +} + // Construct a DIE for this scope. -DIE *DwarfDebug::constructScopeDIE(DwarfCompileUnit *TheCU, - LexicalScope *Scope) { +std::unique_ptr DwarfDebug::constructScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope) { if (!Scope || !Scope->getScopeNode()) - return NULL; + return nullptr; DIScope DS(Scope->getScopeNode()); - SmallVector Children; - DIE *ObjectPointer = NULL; - bool ChildrenCreated = false; + assert((Scope->getInlinedAt() || !DS.isSubprogram()) && + "Only handle inlined subprograms here, use " + "constructSubprogramScopeDIE for non-inlined " + "subprograms"); + + SmallVector, 8> Children; // We try to create the scope DIE first, then the children DIEs. This will // avoid creating un-used children then removing them later when we find out // the scope DIE is null. - DIE *ScopeDIE = NULL; - if (Scope->getInlinedAt()) + std::unique_ptr ScopeDIE; + if (Scope->getParent() && DS.isSubprogram()) { ScopeDIE = constructInlinedScopeDIE(TheCU, Scope); - else if (DS.isSubprogram()) { - ProcessedSPNodes.insert(DS); - if (Scope->isAbstractScope()) { - ScopeDIE = TheCU->getDIE(DS); - // Note down abstract DIE. - if (ScopeDIE) - AbstractSPDies.insert(std::make_pair(DS, ScopeDIE)); - } else - ScopeDIE = updateSubprogramScopeDIE(TheCU, DISubprogram(DS)); + if (!ScopeDIE) + return nullptr; + // We create children when the scope DIE is not null. + createScopeChildrenDIE(TheCU, Scope, Children); } else { // Early exit when we know the scope DIE is going to be null. if (isLexicalScopeDIENull(Scope)) - return NULL; + return nullptr; // We create children here when we know the scope DIE is not going to be // null and the children will be added to the scope DIE. - ObjectPointer = createScopeChildrenDIE(TheCU, Scope, Children); - ChildrenCreated = true; + createScopeChildrenDIE(TheCU, Scope, Children); // There is no need to emit empty lexical block DIE. 
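
// constructAbstractSubprogramScopeDIE above leans on the reference-into-map
// idiom: operator[] default-constructs the slot, and the held reference lets
// a single lookup both test and fill the cache. The same idiom, with std::map
// standing in for the DenseMap:
#include <map>
#include <string>

struct CacheEntry {};

static CacheEntry *getOrCreate(std::map<std::string, CacheEntry *> &Cache,
                               const std::string &Key) {
  CacheEntry *&Slot = Cache[Key]; // first access inserts a null slot
  if (!Slot)
    Slot = new CacheEntry();      // fill exactly once; later calls reuse it
  return Slot;
}
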
std::pair Range = - std::equal_range( - ScopesWithImportedEntities.begin(), - ScopesWithImportedEntities.end(), - std::pair(DS, (const MDNode *)0), - less_first()); + std::equal_range(ScopesWithImportedEntities.begin(), + ScopesWithImportedEntities.end(), + std::pair(DS, nullptr), + less_first()); if (Children.empty() && Range.first == Range.second) - return NULL; + return nullptr; ScopeDIE = constructLexicalScopeDIE(TheCU, Scope); assert(ScopeDIE && "Scope DIE should not be null."); for (ImportedEntityMap::const_iterator i = Range.first; i != Range.second; ++i) - constructImportedEntityDIE(TheCU, i->second, ScopeDIE); + constructImportedEntityDIE(TheCU, i->second, *ScopeDIE); } - if (!ScopeDIE) { - assert(Children.empty() && - "We create children only when the scope DIE is not null."); - return NULL; - } - if (!ChildrenCreated) - // We create children when the scope DIE is not null. - ObjectPointer = createScopeChildrenDIE(TheCU, Scope, Children); - // Add children - for (DIE *I : Children) - ScopeDIE->addChild(I); - - if (DS.isSubprogram() && ObjectPointer != NULL) - TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer, ObjectPointer); + for (auto &I : Children) + ScopeDIE->addChild(std::move(I)); return ScopeDIE; } -void DwarfDebug::addGnuPubAttributes(DwarfUnit *U, DIE *D) const { +void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const { if (!GenerateGnuPubSections) return; - U->addFlag(D, dwarf::DW_AT_GNU_pubnames); + U.addFlag(D, dwarf::DW_AT_GNU_pubnames); } // Create new DwarfCompileUnit for the given metadata node with tag // DW_TAG_compile_unit. -DwarfCompileUnit *DwarfDebug::constructDwarfCompileUnit(DICompileUnit DIUnit) { +DwarfCompileUnit &DwarfDebug::constructDwarfCompileUnit(DICompileUnit DIUnit) { StringRef FN = DIUnit.getFilename(); CompilationDir = DIUnit.getDirectory(); - DIE *Die = new DIE(dwarf::DW_TAG_compile_unit); - DwarfCompileUnit *NewCU = new DwarfCompileUnit( - InfoHolder.getUnits().size(), Die, DIUnit, Asm, this, &InfoHolder); - InfoHolder.addUnit(NewCU); + auto OwnedUnit = make_unique( + InfoHolder.getUnits().size(), DIUnit, Asm, this, &InfoHolder); + DwarfCompileUnit &NewCU = *OwnedUnit; + DIE &Die = NewCU.getUnitDie(); + InfoHolder.addUnit(std::move(OwnedUnit)); // LTO with assembly output shares a single line table amongst multiple CUs. // To avoid the compilation directory being ambiguous, let the line table @@ -701,116 +638,89 @@ DwarfCompileUnit *DwarfDebug::constructDwarfCompileUnit(DICompileUnit DIUnit) { // compilation directory. if (!Asm->OutStreamer.hasRawTextSupport() || SingleCU) Asm->OutStreamer.getContext().setMCLineTableCompilationDir( - NewCU->getUniqueID(), CompilationDir); + NewCU.getUniqueID(), CompilationDir); - NewCU->addString(Die, dwarf::DW_AT_producer, DIUnit.getProducer()); - NewCU->addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2, - DIUnit.getLanguage()); - NewCU->addString(Die, dwarf::DW_AT_name, FN); + NewCU.addString(Die, dwarf::DW_AT_producer, DIUnit.getProducer()); + NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2, + DIUnit.getLanguage()); + NewCU.addString(Die, dwarf::DW_AT_name, FN); if (!useSplitDwarf()) { - NewCU->initStmtList(DwarfLineSectionSym); + NewCU.initStmtList(DwarfLineSectionSym); // If we're using split dwarf the compilation dir is going to be in the // skeleton CU and so we don't need to duplicate it here. 
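
// The ScopesWithImportedEntities query above is the sorted-vector-of-pairs
// pattern: sort once by scope, then recover each scope's subrange with
// std::equal_range comparing first elements only. less_first is an LLVM
// STLExtras helper; an equivalent lambda stands in below.
#include <algorithm>
#include <utility>
#include <vector>

using ScopeEntity = std::pair<int, const char *>; // stand-in for MDNode pairs

static std::pair<std::vector<ScopeEntity>::const_iterator,
                 std::vector<ScopeEntity>::const_iterator>
entitiesForScope(const std::vector<ScopeEntity> &Sorted, int Scope) {
  auto LessFirst = [](const ScopeEntity &A, const ScopeEntity &B) {
    return A.first < B.first;
  };
  return std::equal_range(Sorted.begin(), Sorted.end(),
                          ScopeEntity(Scope, nullptr), LessFirst);
}
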
if (!CompilationDir.empty()) - NewCU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir); + NewCU.addString(Die, dwarf::DW_AT_comp_dir, CompilationDir); addGnuPubAttributes(NewCU, Die); } if (DIUnit.isOptimized()) - NewCU->addFlag(Die, dwarf::DW_AT_APPLE_optimized); + NewCU.addFlag(Die, dwarf::DW_AT_APPLE_optimized); StringRef Flags = DIUnit.getFlags(); if (!Flags.empty()) - NewCU->addString(Die, dwarf::DW_AT_APPLE_flags, Flags); + NewCU.addString(Die, dwarf::DW_AT_APPLE_flags, Flags); if (unsigned RVer = DIUnit.getRunTimeVersion()) - NewCU->addUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers, - dwarf::DW_FORM_data1, RVer); + NewCU.addUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers, + dwarf::DW_FORM_data1, RVer); if (!FirstCU) - FirstCU = NewCU; + FirstCU = &NewCU; if (useSplitDwarf()) { - NewCU->initSection(Asm->getObjFileLowering().getDwarfInfoDWOSection(), - DwarfInfoDWOSectionSym); - NewCU->setSkeleton(constructSkeletonCU(NewCU)); + NewCU.initSection(Asm->getObjFileLowering().getDwarfInfoDWOSection(), + DwarfInfoDWOSectionSym); + NewCU.setSkeleton(constructSkeletonCU(NewCU)); } else - NewCU->initSection(Asm->getObjFileLowering().getDwarfInfoSection(), - DwarfInfoSectionSym); + NewCU.initSection(Asm->getObjFileLowering().getDwarfInfoSection(), + DwarfInfoSectionSym); - CUMap.insert(std::make_pair(DIUnit, NewCU)); - CUDieMap.insert(std::make_pair(Die, NewCU)); + CUMap.insert(std::make_pair(DIUnit, &NewCU)); + CUDieMap.insert(std::make_pair(&Die, &NewCU)); return NewCU; } -// Construct subprogram DIE. -void DwarfDebug::constructSubprogramDIE(DwarfCompileUnit *TheCU, - const MDNode *N) { - // FIXME: We should only call this routine once, however, during LTO if a - // program is defined in multiple CUs we could end up calling it out of - // beginModule as we walk the CUs. - - DwarfCompileUnit *&CURef = SPMap[N]; - if (CURef) - return; - CURef = TheCU; - - DISubprogram SP(N); - if (!SP.isDefinition()) - // This is a method declaration which will be handled while constructing - // class type. - return; - - DIE *SubprogramDie = TheCU->getOrCreateSubprogramDIE(SP); - - // Expose as a global name. 
- TheCU->addGlobalName(SP.getName(), SubprogramDie, resolve(SP.getContext())); -} - -void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit *TheCU, +void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit &TheCU, const MDNode *N) { DIImportedEntity Module(N); assert(Module.Verify()); - if (DIE *D = TheCU->getOrCreateContextDIE(Module.getContext())) - constructImportedEntityDIE(TheCU, Module, D); + if (DIE *D = TheCU.getOrCreateContextDIE(Module.getContext())) + constructImportedEntityDIE(TheCU, Module, *D); } -void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit *TheCU, - const MDNode *N, DIE *Context) { +void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit &TheCU, + const MDNode *N, DIE &Context) { DIImportedEntity Module(N); assert(Module.Verify()); return constructImportedEntityDIE(TheCU, Module, Context); } -void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit *TheCU, +void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit &TheCU, const DIImportedEntity &Module, - DIE *Context) { + DIE &Context) { assert(Module.Verify() && "Use one of the MDNode * overloads to handle invalid metadata"); - assert(Context && "Should always have a context for an imported_module"); - DIE *IMDie = new DIE(Module.getTag()); - TheCU->insertDIE(Module, IMDie); + DIE &IMDie = TheCU.createAndAddDIE(Module.getTag(), Context, Module); DIE *EntityDie; DIDescriptor Entity = resolve(Module.getEntity()); if (Entity.isNameSpace()) - EntityDie = TheCU->getOrCreateNameSpace(DINameSpace(Entity)); + EntityDie = TheCU.getOrCreateNameSpace(DINameSpace(Entity)); else if (Entity.isSubprogram()) - EntityDie = TheCU->getOrCreateSubprogramDIE(DISubprogram(Entity)); + EntityDie = TheCU.getOrCreateSubprogramDIE(DISubprogram(Entity)); else if (Entity.isType()) - EntityDie = TheCU->getOrCreateTypeDIE(DIType(Entity)); + EntityDie = TheCU.getOrCreateTypeDIE(DIType(Entity)); else - EntityDie = TheCU->getDIE(Entity); - TheCU->addSourceLine(IMDie, Module.getLineNumber(), - Module.getContext().getFilename(), - Module.getContext().getDirectory()); - TheCU->addDIEEntry(IMDie, dwarf::DW_AT_import, EntityDie); + EntityDie = TheCU.getDIE(Entity); + TheCU.addSourceLine(IMDie, Module.getLineNumber(), + Module.getContext().getFilename(), + Module.getContext().getDirectory()); + TheCU.addDIEEntry(IMDie, dwarf::DW_AT_import, *EntityDie); StringRef Name = Module.getName(); if (!Name.empty()) - TheCU->addString(IMDie, dwarf::DW_AT_name, Name); - Context->addChild(IMDie); + TheCU.addString(IMDie, dwarf::DW_AT_name, Name); } // Emit all Dwarf sections that should come prior to the content. 
Create @@ -836,7 +746,7 @@ void DwarfDebug::beginModule() { for (MDNode *N : CU_Nodes->operands()) { DICompileUnit CUNode(N); - DwarfCompileUnit *CU = constructDwarfCompileUnit(CUNode); + DwarfCompileUnit &CU = constructDwarfCompileUnit(CUNode); DIArray ImportedEntities = CUNode.getImportedEntities(); for (unsigned i = 0, e = ImportedEntities.getNumElements(); i != e; ++i) ScopesWithImportedEntities.push_back(std::make_pair( @@ -846,20 +756,20 @@ void DwarfDebug::beginModule() { ScopesWithImportedEntities.end(), less_first()); DIArray GVs = CUNode.getGlobalVariables(); for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i) - CU->createGlobalVariableDIE(DIGlobalVariable(GVs.getElement(i))); + CU.createGlobalVariableDIE(DIGlobalVariable(GVs.getElement(i))); DIArray SPs = CUNode.getSubprograms(); for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) - constructSubprogramDIE(CU, SPs.getElement(i)); + SPMap.insert(std::make_pair(SPs.getElement(i), &CU)); DIArray EnumTypes = CUNode.getEnumTypes(); for (unsigned i = 0, e = EnumTypes.getNumElements(); i != e; ++i) - CU->getOrCreateTypeDIE(EnumTypes.getElement(i)); + CU.getOrCreateTypeDIE(EnumTypes.getElement(i)); DIArray RetainedTypes = CUNode.getRetainedTypes(); for (unsigned i = 0, e = RetainedTypes.getNumElements(); i != e; ++i) { DIType Ty(RetainedTypes.getElement(i)); // The retained types array by design contains pointers to // MDNodes rather than DIRefs. Unique them here. DIType UniqueTy(resolve(Ty.getRef())); - CU->getOrCreateTypeDIE(UniqueTy); + CU.getOrCreateTypeDIE(UniqueTy); } // Emit imported_modules last so that the relevant context is already // available. @@ -874,20 +784,41 @@ void DwarfDebug::beginModule() { SectionMap[Asm->getObjFileLowering().getTextSection()]; } -// Attach DW_AT_inline attribute with inlined subprogram DIEs. -void DwarfDebug::computeInlinedDIEs() { - // Attach DW_AT_inline attribute with inlined subprogram DIEs. - for (DIE *ISP : InlinedSubprogramDIEs) - FirstCU->addUInt(ISP, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined); +void DwarfDebug::finishSubprogramDefinitions() { + const Module *M = MMI->getModule(); - for (const auto &AI : AbstractSPDies) { - DIE *ISP = AI.second; - if (InlinedSubprogramDIEs.count(ISP)) - continue; - FirstCU->addUInt(ISP, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined); + NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); + for (MDNode *N : CU_Nodes->operands()) { + DICompileUnit TheCU(N); + // Construct subprogram DIE and add variables DIEs. + DwarfCompileUnit *SPCU = + static_cast(CUMap.lookup(TheCU)); + DIArray Subprograms = TheCU.getSubprograms(); + for (unsigned i = 0, e = Subprograms.getNumElements(); i != e; ++i) { + DISubprogram SP(Subprograms.getElement(i)); + // Perhaps the subprogram is in another CU (such as due to comdat + // folding, etc), in which case ignore it here. + if (SPMap[SP] != SPCU) + continue; + DIE *D = SPCU->getDIE(SP); + if (DIE *AbsSPDIE = AbstractSPDies.lookup(SP)) { + if (D) + // If this subprogram has an abstract definition, reference that + SPCU->addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE); + } else { + if (!D) + // Lazily construct the subprogram if we didn't see either concrete or + // inlined versions during codegen. + D = SPCU->getOrCreateSubprogramDIE(SP); + // And attach the attributes + SPCU->applySubprogramAttributes(SP, *D); + SPCU->addGlobalName(SP.getName(), *D, resolve(SP.getContext())); + } + } } } + // Collect info for variables that were optimized out. 
void DwarfDebug::collectDeadVariables() { const Module *M = MMI->getModule(); @@ -895,34 +826,32 @@ void DwarfDebug::collectDeadVariables() { if (NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu")) { for (MDNode *N : CU_Nodes->operands()) { DICompileUnit TheCU(N); + // Construct subprogram DIE and add variables DIEs. + DwarfCompileUnit *SPCU = + static_cast(CUMap.lookup(TheCU)); + assert(SPCU && "Unable to find Compile Unit!"); DIArray Subprograms = TheCU.getSubprograms(); for (unsigned i = 0, e = Subprograms.getNumElements(); i != e; ++i) { DISubprogram SP(Subprograms.getElement(i)); if (ProcessedSPNodes.count(SP) != 0) continue; - if (!SP.isSubprogram()) - continue; - if (!SP.isDefinition()) - continue; + assert(SP.isSubprogram() && + "CU's subprogram list contains a non-subprogram"); + assert(SP.isDefinition() && + "CU's subprogram list contains a subprogram declaration"); DIArray Variables = SP.getVariables(); if (Variables.getNumElements() == 0) continue; - // Construct subprogram DIE and add variables DIEs. - DwarfCompileUnit *SPCU = - static_cast(CUMap.lookup(TheCU)); - assert(SPCU && "Unable to find Compile Unit!"); - // FIXME: See the comment in constructSubprogramDIE about duplicate - // subprogram DIEs. - constructSubprogramDIE(SPCU, SP); - DIE *SPDIE = SPCU->getDIE(SP); + DIE *SPDIE = AbstractSPDies.lookup(SP); + if (!SPDIE) + SPDIE = SPCU->getDIE(SP); + assert(SPDIE); for (unsigned vi = 0, ve = Variables.getNumElements(); vi != ve; ++vi) { DIVariable DV(Variables.getElement(vi)); - if (!DV.isVariable()) - continue; - DbgVariable NewVar(DV, NULL, this); - if (DIE *VariableDIE = SPCU->constructVariableDIE(NewVar, false)) - SPDIE->addChild(VariableDIE); + assert(DV.isVariable()); + DbgVariable NewVar(DV, nullptr, this); + SPDIE->addChild(SPCU->constructVariableDIE(NewVar)); } } } @@ -930,28 +859,27 @@ void DwarfDebug::collectDeadVariables() { } void DwarfDebug::finalizeModuleInfo() { + finishSubprogramDefinitions(); + // Collect info for variables that were optimized out. collectDeadVariables(); - // Attach DW_AT_inline attribute with inlined subprogram DIEs. - computeInlinedDIEs(); - // Handle anything that needs to be done on a per-unit basis after // all other generation. - for (DwarfUnit *TheU : getUnits()) { + for (const auto &TheU : getUnits()) { // Emit DW_AT_containing_type attribute to connect types with their // vtable holding type. TheU->constructContainingTypeDIEs(); // Add CU specific attributes if we need to add any. - if (TheU->getUnitDie()->getTag() == dwarf::DW_TAG_compile_unit) { + if (TheU->getUnitDie().getTag() == dwarf::DW_TAG_compile_unit) { // If we're splitting the dwarf out now that we've got the entire // CU then add the dwo id to it. DwarfCompileUnit *SkCU = static_cast(TheU->getSkeleton()); if (useSplitDwarf()) { // Emit a unique identifier for this CU. - uint64_t ID = DIEHash(Asm).computeCUSignature(*TheU->getUnitDie()); + uint64_t ID = DIEHash(Asm).computeCUSignature(TheU->getUnitDie()); TheU->addUInt(TheU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, ID); SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id, @@ -959,12 +887,12 @@ void DwarfDebug::finalizeModuleInfo() { // We don't keep track of which addresses are used in which CU so this // is a bit pessimistic under LTO. 
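
// The AddrPool consulted just below hands out dense indices so split-DWARF
// units can reference addresses indirectly (an index into .debug_addr, per
// the GNU split-dwarf extensions). A sketch of that first-come index
// assignment; LLVM's AddressPool does much the same with a DenseMap plus an
// emit pass:
#include <map>
#include <utility>

static unsigned getAddrIndex(std::map<const void *, unsigned> &Pool,
                             const void *Sym) {
  // insert() is a no-op for known symbols, so each keeps its first index.
  return Pool.insert(std::make_pair(Sym, (unsigned)Pool.size())).first->second;
}
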
- if (!InfoHolder.getAddrPool()->empty()) - addSectionLabel(Asm, SkCU, SkCU->getUnitDie(), + if (!AddrPool.isEmpty()) + addSectionLabel(*Asm, *SkCU, SkCU->getUnitDie(), dwarf::DW_AT_GNU_addr_base, DwarfAddrSectionSym, DwarfAddrSectionSym); if (!TheU->getRangeLists().empty()) - addSectionLabel(Asm, SkCU, SkCU->getUnitDie(), + addSectionLabel(*Asm, *SkCU, SkCU->getUnitDie(), dwarf::DW_AT_GNU_ranges_base, DwarfDebugRangeSectionSym, DwarfDebugRangeSectionSym); } @@ -975,26 +903,27 @@ void DwarfDebug::finalizeModuleInfo() { // FIXME: We should use ranges allow reordering of code ala // .subsections_via_symbols in mach-o. This would mean turning on // ranges for all subprogram DIEs for mach-o. - DwarfCompileUnit *U = SkCU ? SkCU : static_cast(TheU); + DwarfCompileUnit &U = + SkCU ? *SkCU : static_cast(*TheU); unsigned NumRanges = TheU->getRanges().size(); if (NumRanges) { if (NumRanges > 1) { - addSectionLabel(Asm, U, U->getUnitDie(), dwarf::DW_AT_ranges, - Asm->GetTempSymbol("cu_ranges", U->getUniqueID()), + addSectionLabel(*Asm, U, U.getUnitDie(), dwarf::DW_AT_ranges, + Asm->GetTempSymbol("cu_ranges", U.getUniqueID()), DwarfDebugRangeSectionSym); // A DW_AT_low_pc attribute may also be specified in combination with // DW_AT_ranges to specify the default base address for use in // location lists (see Section 2.6.2) and range lists (see Section // 2.17.3). - U->addUInt(U->getUnitDie(), dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, - 0); + U.addUInt(U.getUnitDie(), dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, + 0); } else { RangeSpan &Range = TheU->getRanges().back(); - U->addLocalLabelAddress(U->getUnitDie(), dwarf::DW_AT_low_pc, - Range.getStart()); - U->addLabelDelta(U->getUnitDie(), dwarf::DW_AT_high_pc, - Range.getEnd(), Range.getStart()); + U.addLocalLabelAddress(U.getUnitDie(), dwarf::DW_AT_low_pc, + Range.getStart()); + U.addLabelDelta(U.getUnitDie(), dwarf::DW_AT_high_pc, Range.getEnd(), + Range.getStart()); } } } @@ -1018,7 +947,7 @@ void DwarfDebug::endSections() { // Some symbols (e.g. common/bss on mach-o) can have no section but still // appear in the output. This sucks as we rely on sections to build // arange spans. We can do it without, but it's icky. - SectionMap[NULL].push_back(SCU); + SectionMap[nullptr].push_back(SCU); } } @@ -1036,7 +965,7 @@ void DwarfDebug::endSections() { // Add terminating symbols for each section. for (unsigned ID = 0, E = Sections.size(); ID != E; ID++) { const MCSection *Section = Sections[ID]; - MCSymbol *Sym = NULL; + MCSymbol *Sym = nullptr; if (Section) { // We can't call MCSection::getLabelEndName, as it's only safe to do so @@ -1049,14 +978,14 @@ void DwarfDebug::endSections() { } // Insert a final terminator. - SectionMap[Section].push_back(SymbolCU(NULL, Sym)); + SectionMap[Section].push_back(SymbolCU(nullptr, Sym)); } } // Emit all Dwarf sections that should come after the content. void DwarfDebug::endModule() { - assert(CurFn == 0); - assert(CurMI == 0); + assert(CurFn == nullptr); + assert(CurMI == nullptr); if (!FirstCU) return; @@ -1089,7 +1018,7 @@ void DwarfDebug::endModule() { emitDebugAbbrevDWO(); emitDebugLineDWO(); // Emit DWO addresses. - InfoHolder.emitAddresses(Asm->getObjFileLowering().getDwarfAddrSection()); + AddrPool.emit(*Asm, Asm->getObjFileLowering().getDwarfAddrSection()); emitDebugLocDWO(); } else // Emit info into a debug loc section. @@ -1111,29 +1040,34 @@ void DwarfDebug::endModule() { // clean up. SPMap.clear(); + AbstractVariables.clear(); // Reset these for the next Module if we have one. 
- FirstCU = NULL; + FirstCU = nullptr; } // Find abstract variable, if any, associated with Var. DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &DV, DebugLoc ScopeLoc) { + return findAbstractVariable(DV, ScopeLoc.getScope(DV->getContext())); +} + +DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &DV, + const MDNode *ScopeNode) { LLVMContext &Ctx = DV->getContext(); // More then one inlined variable corresponds to one abstract variable. DIVariable Var = cleanseInlinedVariable(DV, Ctx); - DbgVariable *AbsDbgVariable = AbstractVariables.lookup(Var); - if (AbsDbgVariable) - return AbsDbgVariable; + auto I = AbstractVariables.find(Var); + if (I != AbstractVariables.end()) + return I->second.get(); - LexicalScope *Scope = LScopes.findAbstractScope(ScopeLoc.getScope(Ctx)); + LexicalScope *Scope = LScopes.findAbstractScope(ScopeNode); if (!Scope) - return NULL; + return nullptr; - AbsDbgVariable = new DbgVariable(Var, NULL, this); - addScopeVariable(Scope, AbsDbgVariable); - AbstractVariables[Var] = AbsDbgVariable; - return AbsDbgVariable; + auto AbsDbgVariable = make_unique(Var, nullptr, this); + addScopeVariable(Scope, AbsDbgVariable.get()); + return (AbstractVariables[Var] = std::move(AbsDbgVariable)).get(); } // If Var is a current function argument then add it to CurrentFnArguments list. @@ -1169,7 +1103,7 @@ void DwarfDebug::collectVariableInfoFromMMITable( LexicalScope *Scope = LScopes.findLexicalScope(VI.Loc); // If variable scope is not found then skip this variable. - if (Scope == 0) + if (!Scope) continue; DbgVariable *AbsDbgVariable = findAbstractVariable(DV, VI.Loc); @@ -1177,28 +1111,12 @@ void DwarfDebug::collectVariableInfoFromMMITable( RegVar->setFrameIndex(VI.Slot); if (!addCurrentFnArgument(RegVar, Scope)) addScopeVariable(Scope, RegVar); - if (AbsDbgVariable) - AbsDbgVariable->setFrameIndex(VI.Slot); } } -// Return true if debug value, encoded by DBG_VALUE instruction, is in a -// defined reg. -static bool isDbgValueInDefinedReg(const MachineInstr *MI) { - assert(MI->isDebugValue() && "Invalid DBG_VALUE machine instruction!"); - return MI->getNumOperands() == 3 && MI->getOperand(0).isReg() && - MI->getOperand(0).getReg() && - (MI->getOperand(1).isImm() || - (MI->getOperand(1).isReg() && MI->getOperand(1).getReg() == 0U)); -} - // Get .debug_loc entry for the instruction range starting at MI. 
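
// findAbstractVariable above now keeps AbstractVariables as a map of owning
// unique_ptrs and hands back raw observer pointers. The caching shape, with
// std::map standing in for the DenseMap of DIVariable keys:
#include <map>
#include <memory>

struct AbstractVar {};

static AbstractVar *
findOrCreate(std::map<int, std::unique_ptr<AbstractVar>> &Cache, int Key) {
  auto I = Cache.find(Key);
  if (I != Cache.end())
    return I->second.get();              // hit: the caller only observes
  std::unique_ptr<AbstractVar> Fresh(new AbstractVar());
  return (Cache[Key] = std::move(Fresh)).get(); // map owns, as in the patch
}
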
-static DebugLocEntry getDebugLocEntry(AsmPrinter *Asm, - const MCSymbol *FLabel, - const MCSymbol *SLabel, - const MachineInstr *MI, - DwarfCompileUnit *Unit) { - const MDNode *Var = MI->getOperand(MI->getNumOperands() - 1).getMetadata(); +static DebugLocEntry::Value getDebugLocValue(const MachineInstr *MI) { + const MDNode *Var = MI->getDebugVariable(); assert(MI->getNumOperands() == 3); if (MI->getOperand(0).isReg()) { @@ -1209,14 +1127,14 @@ static DebugLocEntry getDebugLocEntry(AsmPrinter *Asm, MLoc.set(MI->getOperand(0).getReg()); else MLoc.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); - return DebugLocEntry(FLabel, SLabel, MLoc, Var, Unit); + return DebugLocEntry::Value(Var, MLoc); } if (MI->getOperand(0).isImm()) - return DebugLocEntry(FLabel, SLabel, MI->getOperand(0).getImm(), Unit); + return DebugLocEntry::Value(Var, MI->getOperand(0).getImm()); if (MI->getOperand(0).isFPImm()) - return DebugLocEntry(FLabel, SLabel, MI->getOperand(0).getFPImm(), Unit); + return DebugLocEntry::Value(Var, MI->getOperand(0).getFPImm()); if (MI->getOperand(0).isCImm()) - return DebugLocEntry(FLabel, SLabel, MI->getOperand(0).getCImm(), Unit); + return DebugLocEntry::Value(Var, MI->getOperand(0).getCImm()); llvm_unreachable("Unexpected 3 operand DBG_VALUE instruction!"); } @@ -1224,35 +1142,38 @@ static DebugLocEntry getDebugLocEntry(AsmPrinter *Asm, // Find variables for each lexical scope. void DwarfDebug::collectVariableInfo(SmallPtrSet &Processed) { + LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); + DwarfCompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode()); // Grab the variable info that was squirreled away in the MMI side-table. collectVariableInfoFromMMITable(Processed); - for (const MDNode *Var : UserVariables) { - if (Processed.count(Var)) + for (const auto &I : DbgValues) { + DIVariable DV(I.first); + if (Processed.count(DV)) continue; - // History contains relevant DBG_VALUE instructions for Var and instructions - // clobbering it. - SmallVectorImpl &History = DbgValues[Var]; - if (History.empty()) + // Instruction ranges, specifying where DV is accessible. + const auto &Ranges = I.second; + if (Ranges.empty()) continue; - const MachineInstr *MInsn = History.front(); - DIVariable DV(Var); - LexicalScope *Scope = NULL; + LexicalScope *Scope = nullptr; if (DV.getTag() == dwarf::DW_TAG_arg_variable && DISubprogram(DV.getContext()).describes(CurFn->getFunction())) Scope = LScopes.getCurrentFunctionScope(); - else if (MDNode *IA = DV.getInlinedAt()) - Scope = LScopes.findInlinedScope(DebugLoc::getFromDILocation(IA)); - else - Scope = LScopes.findLexicalScope(cast(DV->getOperand(1))); + else if (MDNode *IA = DV.getInlinedAt()) { + DebugLoc DL = DebugLoc::getFromDILocation(IA); + Scope = LScopes.findInlinedScope(DebugLoc::get( + DL.getLine(), DL.getCol(), DV.getContext(), IA)); + } else + Scope = LScopes.findLexicalScope(DV.getContext()); // If variable scope is not found then skip this variable. if (!Scope) continue; Processed.insert(DV); + const MachineInstr *MInsn = Ranges.front().first; assert(MInsn->isDebugValue() && "History must begin with debug value"); DbgVariable *AbsVar = findAbstractVariable(DV, MInsn->getDebugLoc()); DbgVariable *RegVar = new DbgVariable(DV, AbsVar, this); @@ -1261,9 +1182,8 @@ DwarfDebug::collectVariableInfo(SmallPtrSet &Processed) { if (AbsVar) AbsVar->setMInsn(MInsn); - // Simplify ranges that are fully coalesced. 
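
// getDebugLocValue above dispatches on DBG_VALUE's first operand: a register
// (with operand 1 carrying an optional offset), or an integer/FP/large
// constant. A reduced decoder over assumed stand-in types; the real operand
// rules are subtler (e.g. the special meaning of a zero offset):
#include <cstdint>

enum class LocKind { Reg, RegPlusOffset, Constant };
struct Loc {
  LocKind Kind;
  unsigned Reg;
  int64_t Value; // offset or constant, depending on Kind
};

static Loc decodeDbgValue(bool Op0IsReg, unsigned Reg, int64_t OffsetOrImm) {
  if (Op0IsReg && OffsetOrImm != 0)
    return Loc{LocKind::RegPlusOffset, Reg, OffsetOrImm}; // indirect value
  if (Op0IsReg)
    return Loc{LocKind::Reg, Reg, 0};                     // value lives in Reg
  return Loc{LocKind::Constant, 0, OffsetOrImm};          // constant variable
}
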
- if (History.size() <= 1 || - (History.size() == 2 && MInsn->isIdenticalTo(History.back()))) { + // Check if the first DBG_VALUE is valid for the rest of the function. + if (Ranges.size() == 1 && Ranges.front().second == nullptr) { RegVar->setMInsn(MInsn); continue; } @@ -1276,58 +1196,48 @@ DwarfDebug::collectVariableInfo(SmallPtrSet &Processed) { LocList.Label = Asm->GetTempSymbol("debug_loc", DotDebugLocEntries.size() - 1); SmallVector &DebugLoc = LocList.List; - for (SmallVectorImpl::const_iterator - HI = History.begin(), - HE = History.end(); - HI != HE; ++HI) { - const MachineInstr *Begin = *HI; + for (auto I = Ranges.begin(), E = Ranges.end(); I != E; ++I) { + const MachineInstr *Begin = I->first; + const MachineInstr *End = I->second; assert(Begin->isDebugValue() && "Invalid History entry"); - // Check if DBG_VALUE is truncating a range. + // Check if a variable is unaccessible in this range. if (Begin->getNumOperands() > 1 && Begin->getOperand(0).isReg() && !Begin->getOperand(0).getReg()) continue; - // Compute the range for a register location. - const MCSymbol *FLabel = getLabelBeforeInsn(Begin); - const MCSymbol *SLabel = 0; - - if (HI + 1 == HE) - // If Begin is the last instruction in History then its value is valid - // until the end of the function. - SLabel = FunctionEndSym; - else { - const MachineInstr *End = HI[1]; - DEBUG(dbgs() << "DotDebugLoc Pair:\n" - << "\t" << *Begin << "\t" << *End << "\n"); - if (End->isDebugValue()) - SLabel = getLabelBeforeInsn(End); - else { - // End is a normal instruction clobbering the range. - SLabel = getLabelAfterInsn(End); - assert(SLabel && "Forgot label after clobber instruction"); - ++HI; - } - } + const MCSymbol *StartLabel = getLabelBeforeInsn(Begin); + assert(StartLabel && "Forgot label before DBG_VALUE starting a range!"); + + const MCSymbol *EndLabel; + if (End != nullptr) + EndLabel = getLabelAfterInsn(End); + else if (std::next(I) == Ranges.end()) + EndLabel = FunctionEndSym; + else + EndLabel = getLabelBeforeInsn(std::next(I)->first); + assert(EndLabel && "Forgot label after instruction ending a range!"); - // The value is valid until the next DBG_VALUE or clobber. - LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); - DwarfCompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode()); - DebugLocEntry Loc = getDebugLocEntry(Asm, FLabel, SLabel, Begin, TheCU); + DEBUG(dbgs() << "DotDebugLoc Pair:\n" + << "\t" << *Begin << "\t" << *End << "\n"); + DebugLocEntry Loc(StartLabel, EndLabel, getDebugLocValue(Begin), TheCU); if (DebugLoc.empty() || !DebugLoc.back().Merge(Loc)) DebugLoc.push_back(std::move(Loc)); } } // Collect info for variables that were optimized out. - LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); DIArray Variables = DISubprogram(FnScope->getScopeNode()).getVariables(); for (unsigned i = 0, e = Variables.getNumElements(); i != e; ++i) { DIVariable DV(Variables.getElement(i)); - if (!DV || !DV.isVariable() || !Processed.insert(DV)) + assert(DV.isVariable()); + if (!Processed.insert(DV)) continue; if (LexicalScope *Scope = LScopes.findLexicalScope(DV.getContext())) - addScopeVariable(Scope, new DbgVariable(DV, NULL, this)); + addScopeVariable( + Scope, + new DbgVariable(DV, findAbstractVariable(DV, Scope->getScopeNode()), + this)); } } @@ -1345,7 +1255,7 @@ MCSymbol *DwarfDebug::getLabelAfterInsn(const MachineInstr *MI) { // Process beginning of an instruction. 
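
// The StartLabel/EndLabel selection above implements one closing rule for
// location ranges: a null End means "live until the next range begins, or to
// the function end if this is the last one". The same rule over plain data:
#include <cstddef>
#include <vector>

struct LiveRange { int Begin; int End; }; // End == -1 encodes "open"

static void closeRanges(std::vector<LiveRange> &Rs, int FunctionEnd) {
  for (std::size_t I = 0; I != Rs.size(); ++I)
    if (Rs[I].End < 0)
      Rs[I].End = (I + 1 == Rs.size()) ? FunctionEnd : Rs[I + 1].Begin;
}
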
void DwarfDebug::beginInstruction(const MachineInstr *MI) { - assert(CurMI == 0); + assert(CurMI == nullptr); CurMI = MI; // Check if source location changes, but ignore DBG_VALUE locations. if (!MI->isDebugValue()) { @@ -1364,7 +1274,7 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { const MDNode *Scope = DL.getScope(Asm->MF->getFunction()->getContext()); recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags); } else - recordSourceLine(0, 0, 0, 0); + recordSourceLine(0, 0, nullptr, 0); } } @@ -1389,15 +1299,15 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { // Process end of an instruction. void DwarfDebug::endInstruction() { - assert(CurMI != 0); + assert(CurMI != nullptr); // Don't create a new label after DBG_VALUE instructions. // They don't generate code. if (!CurMI->isDebugValue()) - PrevLabel = 0; + PrevLabel = nullptr; DenseMap::iterator I = LabelsAfterInsn.find(CurMI); - CurMI = 0; + CurMI = nullptr; // No label needed. if (I == LabelsAfterInsn.end()) @@ -1441,6 +1351,17 @@ void DwarfDebug::identifyScopeMarkers() { } } +static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { + // First known non-DBG_VALUE and non-frame setup location marks + // the beginning of the function body. + for (const auto &MBB : *MF) + for (const auto &MI : MBB) + if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) && + !MI.getDebugLoc().isUnknown()) + return MI.getDebugLoc(); + return DebugLoc(); +} + // Gather pre-function debug information. Assumes being called immediately // after the function entry point has been emitted. void DwarfDebug::beginFunction(const MachineFunction *MF) { @@ -1456,7 +1377,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { if (LScopes.empty()) return; - assert(UserVariables.empty() && DbgValues.empty() && "Maps weren't cleaned"); + assert(DbgValues.empty() && "DbgValues map wasn't cleaned!"); // Make sure that each lexical scope will have a begin/end label. identifyScopeMarkers(); @@ -1478,144 +1399,26 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { // Assumes in correct section after the entry point. Asm->OutStreamer.EmitLabel(FunctionBeginSym); - const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo(); - // LiveUserVar - Map physreg numbers to the MDNode they contain. - std::vector LiveUserVar(TRI->getNumRegs()); - - for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E; - ++I) { - bool AtBlockEntry = true; - for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end(); - II != IE; ++II) { - const MachineInstr *MI = II; - - if (MI->isDebugValue()) { - assert(MI->getNumOperands() > 1 && "Invalid machine instruction!"); - - // Keep track of user variables. - const MDNode *Var = - MI->getOperand(MI->getNumOperands() - 1).getMetadata(); - - // Variable is in a register, we need to check for clobbers. - if (isDbgValueInDefinedReg(MI)) - LiveUserVar[MI->getOperand(0).getReg()] = Var; - - // Check the history of this variable. - SmallVectorImpl &History = DbgValues[Var]; - if (History.empty()) { - UserVariables.push_back(Var); - // The first mention of a function argument gets the FunctionBeginSym - // label, so arguments are visible when breaking at function entry. - DIVariable DV(Var); - if (DV.isVariable() && DV.getTag() == dwarf::DW_TAG_arg_variable && - getDISubprogram(DV.getContext()).describes(MF->getFunction())) - LabelsBeforeInsn[MI] = FunctionBeginSym; - } else { - // We have seen this variable before. Try to coalesce DBG_VALUEs. 
- const MachineInstr *Prev = History.back(); - if (Prev->isDebugValue()) { - // Coalesce identical entries at the end of History. - if (History.size() >= 2 && - Prev->isIdenticalTo(History[History.size() - 2])) { - DEBUG(dbgs() << "Coalescing identical DBG_VALUE entries:\n" - << "\t" << *Prev << "\t" - << *History[History.size() - 2] << "\n"); - History.pop_back(); - } - - // Terminate old register assignments that don't reach MI; - MachineFunction::const_iterator PrevMBB = Prev->getParent(); - if (PrevMBB != I && (!AtBlockEntry || std::next(PrevMBB) != I) && - isDbgValueInDefinedReg(Prev)) { - // Previous register assignment needs to terminate at the end of - // its basic block. - MachineBasicBlock::const_iterator LastMI = - PrevMBB->getLastNonDebugInstr(); - if (LastMI == PrevMBB->end()) { - // Drop DBG_VALUE for empty range. - DEBUG(dbgs() << "Dropping DBG_VALUE for empty range:\n" - << "\t" << *Prev << "\n"); - History.pop_back(); - } else if (std::next(PrevMBB) != PrevMBB->getParent()->end()) - // Terminate after LastMI. - History.push_back(LastMI); - } - } - } - History.push_back(MI); - } else { - // Not a DBG_VALUE instruction. - if (!MI->isPosition()) - AtBlockEntry = false; - - // First known non-DBG_VALUE and non-frame setup location marks - // the beginning of the function body. - if (!MI->getFlag(MachineInstr::FrameSetup) && - (PrologEndLoc.isUnknown() && !MI->getDebugLoc().isUnknown())) - PrologEndLoc = MI->getDebugLoc(); - - // Check if the instruction clobbers any registers with debug vars. - for (const MachineOperand &MO : MI->operands()) { - if (!MO.isReg() || !MO.isDef() || !MO.getReg()) - continue; - for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); - ++AI) { - unsigned Reg = *AI; - const MDNode *Var = LiveUserVar[Reg]; - if (!Var) - continue; - // Reg is now clobbered. - LiveUserVar[Reg] = 0; - - // Was MD last defined by a DBG_VALUE referring to Reg? - DbgValueHistoryMap::iterator HistI = DbgValues.find(Var); - if (HistI == DbgValues.end()) - continue; - SmallVectorImpl &History = HistI->second; - if (History.empty()) - continue; - const MachineInstr *Prev = History.back(); - // Sanity-check: Register assignments are terminated at the end of - // their block. - if (!Prev->isDebugValue() || Prev->getParent() != MI->getParent()) - continue; - // Is the variable still in Reg? - if (!isDbgValueInDefinedReg(Prev) || - Prev->getOperand(0).getReg() != Reg) - continue; - // Var is clobbered. Make sure the next instruction gets a label. - History.push_back(MI); - } - } - } - } - } + // Calculate history for local variables. + calculateDbgValueHistory(MF, Asm->TM.getRegisterInfo(), DbgValues); - for (auto &I : DbgValues) { - SmallVectorImpl &History = I.second; - if (History.empty()) + // Request labels for the full history. + for (const auto &I : DbgValues) { + const auto &Ranges = I.second; + if (Ranges.empty()) continue; - // Make sure the final register assignments are terminated. - const MachineInstr *Prev = History.back(); - if (Prev->isDebugValue() && isDbgValueInDefinedReg(Prev)) { - const MachineBasicBlock *PrevMBB = Prev->getParent(); - MachineBasicBlock::const_iterator LastMI = - PrevMBB->getLastNonDebugInstr(); - if (LastMI == PrevMBB->end()) - // Drop DBG_VALUE for empty range. - History.pop_back(); - else if (PrevMBB != &PrevMBB->getParent()->back()) { - // Terminate after LastMI. - History.push_back(LastMI); - } - } - // Request labels for the full history. 
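
// calculateDbgValueHistory, which replaces the inline scan deleted above,
// produces per-variable instruction ranges rather than a flat list of
// DBG_VALUEs and clobbers. Its result has roughly this shape (names are
// stand-ins for the LLVM types):
#include <map>
#include <utility>
#include <vector>

struct Instr {}; // stand-in for MachineInstr
// Each range runs [first, second); a null second means "to function end".
using InstrRange = std::pair<const Instr *, const Instr *>;
using DbgValueHistory = std::map<const void *, std::vector<InstrRange>>;
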
- for (const MachineInstr *MI : History) { - if (MI->isDebugValue()) - requestLabelBeforeInsn(MI); - else - requestLabelAfterInsn(MI); + // The first mention of a function argument gets the FunctionBeginSym + // label, so arguments are visible when breaking at function entry. + DIVariable DV(I.first); + if (DV.isVariable() && DV.getTag() == dwarf::DW_TAG_arg_variable && + getDISubprogram(DV.getContext()).describes(MF->getFunction())) + LabelsBeforeInsn[Ranges.front().first] = FunctionBeginSym; + + for (const auto &Range : Ranges) { + requestLabelBeforeInsn(Range.first); + if (Range.second) + requestLabelAfterInsn(Range.second); } } @@ -1623,6 +1426,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { PrevLabel = FunctionBeginSym; // Record beginning of function. + PrologEndLoc = findPrologueEndLoc(MF); if (!PrologEndLoc.isUnknown()) { DebugLoc FnStartDL = PrologEndLoc.getFnDebugLoc(MF->getFunction()->getContext()); @@ -1671,11 +1475,11 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { // Every beginFunction(MF) call should be followed by an endFunction(MF) call, // though the beginFunction may not be called at all. // We should handle both cases. - if (CurFn == 0) + if (!CurFn) CurFn = MF; else assert(CurFn == MF); - assert(CurFn != 0); + assert(CurFn != nullptr); if (!MMI->hasDebugInfo() || LScopes.empty()) { // If we don't have a lexical scope for this function then there will @@ -1683,7 +1487,7 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { // previously used section to nullptr. PrevSection = nullptr; PrevCU = nullptr; - CurFn = 0; + CurFn = nullptr; return; } @@ -1699,55 +1503,50 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { collectVariableInfo(ProcessedVars); LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); - DwarfCompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode()); - assert(TheCU && "Unable to find compile unit!"); + DwarfCompileUnit &TheCU = *SPMap.lookup(FnScope->getScopeNode()); // Construct abstract scopes. for (LexicalScope *AScope : LScopes.getAbstractScopesList()) { DISubprogram SP(AScope->getScopeNode()); - if (SP.isSubprogram()) { - // Collect info for variables that were optimized out. - DIArray Variables = SP.getVariables(); - for (unsigned i = 0, e = Variables.getNumElements(); i != e; ++i) { - DIVariable DV(Variables.getElement(i)); - if (!DV || !DV.isVariable() || !ProcessedVars.insert(DV)) - continue; - // Check that DbgVariable for DV wasn't created earlier, when - // findAbstractVariable() was called for inlined instance of DV. - LLVMContext &Ctx = DV->getContext(); - DIVariable CleanDV = cleanseInlinedVariable(DV, Ctx); - if (AbstractVariables.lookup(CleanDV)) - continue; - if (LexicalScope *Scope = LScopes.findAbstractScope(DV.getContext())) - addScopeVariable(Scope, new DbgVariable(DV, NULL, this)); - } + if (!SP.isSubprogram()) + continue; + // Collect info for variables that were optimized out. 
+ DIArray Variables = SP.getVariables(); + for (unsigned i = 0, e = Variables.getNumElements(); i != e; ++i) { + DIVariable DV(Variables.getElement(i)); + assert(DV && DV.isVariable()); + if (!ProcessedVars.insert(DV)) + continue; + findAbstractVariable(DV, DV.getContext()); } - if (ProcessedSPNodes.count(AScope->getScopeNode()) == 0) - constructScopeDIE(TheCU, AScope); + constructAbstractSubprogramScopeDIE(TheCU, AScope); } - DIE *CurFnDIE = constructScopeDIE(TheCU, FnScope); + DIE &CurFnDIE = constructSubprogramScopeDIE(TheCU, FnScope); if (!CurFn->getTarget().Options.DisableFramePointerElim(*CurFn)) - TheCU->addFlag(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr); + TheCU.addFlag(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr); // Add the range of this function to the list of ranges for the CU. RangeSpan Span(FunctionBeginSym, FunctionEndSym); - TheCU->addRange(std::move(Span)); + TheCU.addRange(std::move(Span)); PrevSection = Asm->getCurrentSection(); - PrevCU = TheCU; + PrevCU = &TheCU; // Clear debug info - for (auto &I : ScopeVariables) - DeleteContainerPointers(I.second); + // Ownership of DbgVariables is a bit subtle - ScopeVariables owns all the + // DbgVariables except those that are also in AbstractVariables (since they + // can be used cross-function) + for (const auto &I : ScopeVariables) + for (const auto *Var : I.second) + if (!AbstractVariables.count(Var->getVariable()) || Var->getAbstractVariable()) + delete Var; ScopeVariables.clear(); DeleteContainerPointers(CurrentFnArguments); - UserVariables.clear(); DbgValues.clear(); - AbstractVariables.clear(); LabelsBeforeInsn.clear(); LabelsAfterInsn.clear(); - PrevLabel = NULL; - CurFn = 0; + PrevLabel = nullptr; + CurFn = nullptr; } // Register a source line with debug info. Returns the unique label that was @@ -1758,36 +1557,16 @@ void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S, StringRef Dir; unsigned Src = 1; unsigned Discriminator = 0; - if (S) { - DIDescriptor Scope(S); - - if (Scope.isCompileUnit()) { - DICompileUnit CU(S); - Fn = CU.getFilename(); - Dir = CU.getDirectory(); - } else if (Scope.isFile()) { - DIFile F(S); - Fn = F.getFilename(); - Dir = F.getDirectory(); - } else if (Scope.isSubprogram()) { - DISubprogram SP(S); - Fn = SP.getFilename(); - Dir = SP.getDirectory(); - } else if (Scope.isLexicalBlockFile()) { - DILexicalBlockFile DBF(S); - Fn = DBF.getFilename(); - Dir = DBF.getDirectory(); - } else if (Scope.isLexicalBlock()) { - DILexicalBlock DB(S); - Fn = DB.getFilename(); - Dir = DB.getDirectory(); - Discriminator = DB.getDiscriminator(); - } else - llvm_unreachable("Unexpected scope info"); + if (DIScope Scope = DIScope(S)) { + assert(Scope.isScope()); + Fn = Scope.getFilename(); + Dir = Scope.getDirectory(); + if (Scope.isLexicalBlock()) + Discriminator = DILexicalBlock(S).getDiscriminator(); unsigned CUID = Asm->OutStreamer.getContext().getDwarfCompileUnitID(); - Src = static_cast(InfoHolder.getUnits()[CUID]) - ->getOrCreateSourceID(Fn, Dir); + Src = static_cast(*InfoHolder.getUnits()[CUID]) + .getOrCreateSourceID(Fn, Dir); } Asm->OutStreamer.EmitDwarfLocDirective(Src, Line, Col, Flags, 0, Discriminator, Fn); @@ -1797,68 +1576,6 @@ void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S, // Emit Methods //===----------------------------------------------------------------------===// -// Compute the size and offset of a DIE. The offset is relative to start of the -// CU. It returns the offset after laying out the DIE. 
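
// The DwarfFile::computeSizeAndOffset definition removed below (its body now
// lives with the rest of DwarfFile, outside this file) lays DIEs out with a
// pre-order recursion: place the DIE, add its payload, lay out children,
// then one terminator byte. Its skeleton, with attribute sizing abstracted:
#include <vector>

struct LayoutNode {
  unsigned Offset, Size;
  unsigned PayloadSize; // abbrev code + attribute values, precomputed
  std::vector<LayoutNode *> Children;
};

static unsigned layout(LayoutNode &N, unsigned Offset) {
  N.Offset = Offset;
  Offset += N.PayloadSize;
  if (!N.Children.empty()) {
    for (LayoutNode *C : N.Children)
      Offset = layout(*C, Offset);
    Offset += 1; // end-of-children marker
  }
  N.Size = Offset - N.Offset;
  return Offset;
}
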
-unsigned DwarfFile::computeSizeAndOffset(DIE *Die, unsigned Offset) { - // Record the abbreviation. - assignAbbrevNumber(Die->getAbbrev()); - - // Get the abbreviation for this DIE. - const DIEAbbrev &Abbrev = Die->getAbbrev(); - - // Set DIE offset - Die->setOffset(Offset); - - // Start the size with the size of abbreviation code. - Offset += getULEB128Size(Die->getAbbrevNumber()); - - const SmallVectorImpl &Values = Die->getValues(); - const SmallVectorImpl &AbbrevData = Abbrev.getData(); - - // Size the DIE attribute values. - for (unsigned i = 0, N = Values.size(); i < N; ++i) - // Size attribute value. - Offset += Values[i]->SizeOf(Asm, AbbrevData[i].getForm()); - - // Get the children. - const std::vector &Children = Die->getChildren(); - - // Size the DIE children if any. - if (!Children.empty()) { - assert(Abbrev.hasChildren() && "Children flag not set"); - - for (DIE *Child : Children) - Offset = computeSizeAndOffset(Child, Offset); - - // End of children marker. - Offset += sizeof(int8_t); - } - - Die->setSize(Offset - Die->getOffset()); - return Offset; -} - -// Compute the size and offset for each DIE. -void DwarfFile::computeSizeAndOffsets() { - // Offset from the first CU in the debug info section is 0 initially. - unsigned SecOffset = 0; - - // Iterate over each compile unit and set the size and offsets for each - // DIE within each compile unit. All offsets are CU relative. - for (DwarfUnit *TheU : CUs) { - TheU->setDebugInfoOffset(SecOffset); - - // CU-relative offset is reset to 0 here. - unsigned Offset = sizeof(int32_t) + // Length of Unit Info - TheU->getHeaderSize(); // Unit-specific headers - - // EndOffset here is CU-relative, after laying out - // all of the CU DIE. - unsigned EndOffset = computeSizeAndOffset(TheU->getUnitDie(), Offset); - SecOffset += EndOffset; - } -} - // Emit initial Dwarf sections with a label at the start of each one. void DwarfDebug::emitSectionLabels() { const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering(); @@ -1906,19 +1623,19 @@ void DwarfDebug::emitSectionLabels() { } // Recursively emits a debug information entry. -void DwarfDebug::emitDIE(DIE *Die) { +void DwarfDebug::emitDIE(DIE &Die) { // Get the abbreviation for this DIE. - const DIEAbbrev &Abbrev = Die->getAbbrev(); + const DIEAbbrev &Abbrev = Die.getAbbrev(); // Emit the code (index) for the abbreviation. if (Asm->isVerbose()) Asm->OutStreamer.AddComment("Abbrev [" + Twine(Abbrev.getNumber()) + - "] 0x" + Twine::utohexstr(Die->getOffset()) + - ":0x" + Twine::utohexstr(Die->getSize()) + " " + + "] 0x" + Twine::utohexstr(Die.getOffset()) + + ":0x" + Twine::utohexstr(Die.getSize()) + " " + dwarf::TagString(Abbrev.getTag())); Asm->EmitULEB128(Abbrev.getNumber()); - const SmallVectorImpl &Values = Die->getValues(); + const SmallVectorImpl &Values = Die.getValues(); const SmallVectorImpl &AbbrevData = Abbrev.getData(); // Emit the DIE attribute values. @@ -1940,38 +1657,14 @@ void DwarfDebug::emitDIE(DIE *Die) { // Emit the DIE children if any. if (Abbrev.hasChildren()) { - const std::vector &Children = Die->getChildren(); - - for (DIE *Child : Children) - emitDIE(Child); + for (auto &Child : Die.getChildren()) + emitDIE(*Child); Asm->OutStreamer.AddComment("End Of Children Mark"); Asm->EmitInt8(0); } } -// Emit the various dwarf units to the unit section USection with -// the abbreviations going into ASection. 
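// Sketch (not from this patch): the "Length of Unit" field that emitUnits
// writes counts the unit header plus the DIE tree but not the length word
// itself, while CU-relative DIE offsets start after both. Assuming 32-bit
// DWARF (a 4-byte length word):
#include <cstdint>

static uint32_t unitLengthField(uint32_t HeaderSize, uint32_t RootDieSize) {
  return HeaderSize + RootDieSize; // excludes the 4-byte length word
}

static uint32_t firstDieOffset(uint32_t HeaderSize) {
  return sizeof(uint32_t) + HeaderSize; // length word, then unit header
}
// The removed emitUnits (its DwarfFile replacement appears in DwarfFile.cpp
// below) follows.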
-void DwarfFile::emitUnits(DwarfDebug *DD, const MCSymbol *ASectionSym) { - for (DwarfUnit *TheU : CUs) { - DIE *Die = TheU->getUnitDie(); - const MCSection *USection = TheU->getSection(); - Asm->OutStreamer.SwitchSection(USection); - - // Emit the compile units header. - Asm->OutStreamer.EmitLabel(TheU->getLabelBegin()); - - // Emit size of content not including length itself - Asm->OutStreamer.AddComment("Length of Unit"); - Asm->EmitInt32(TheU->getHeaderSize() + Die->getSize()); - - TheU->emitHeader(ASectionSym); - - DD->emitDIE(Die); - Asm->OutStreamer.EmitLabel(TheU->getLabelEnd()); - } -} - // Emit the debug info section. void DwarfDebug::emitDebugInfo() { DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder; @@ -1986,26 +1679,6 @@ void DwarfDebug::emitAbbreviations() { Holder.emitAbbrevs(Asm->getObjFileLowering().getDwarfAbbrevSection()); } -void DwarfFile::emitAbbrevs(const MCSection *Section) { - // Check to see if it is worth the effort. - if (!Abbreviations.empty()) { - // Start the debug abbrev section. - Asm->OutStreamer.SwitchSection(Section); - - // For each abbrevation. - for (const DIEAbbrev *Abbrev : Abbreviations) { - // Emit the abbrevations code (base 1 index.) - Asm->EmitULEB128(Abbrev->getNumber(), "Abbreviation Code"); - - // Emit the abbreviations data. - Abbrev->Emit(Asm); - } - - // Mark end of abbreviations. - Asm->EmitULEB128(0, "EOM(3)"); - } -} - // Emit the last address of the section and the end of the line matrix. void DwarfDebug::emitEndOfLineMatrix(unsigned SectionEnd) { // Define last address of section. @@ -2032,97 +1705,52 @@ void DwarfDebug::emitEndOfLineMatrix(unsigned SectionEnd) { // Emit visible names into a hashed accelerator table section. void DwarfDebug::emitAccelNames() { - DwarfAccelTable AT( - DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)); - for (DwarfUnit *TheU : getUnits()) { - for (const auto &GI : TheU->getAccelNames()) { - StringRef Name = GI.getKey(); - for (const DIE *D : GI.second) - AT.AddName(Name, D); - } - } - - AT.FinalizeTable(Asm, "Names"); + AccelNames.FinalizeTable(Asm, "Names"); Asm->OutStreamer.SwitchSection( Asm->getObjFileLowering().getDwarfAccelNamesSection()); MCSymbol *SectionBegin = Asm->GetTempSymbol("names_begin"); Asm->OutStreamer.EmitLabel(SectionBegin); // Emit the full data. - AT.Emit(Asm, SectionBegin, &InfoHolder); + AccelNames.Emit(Asm, SectionBegin, &InfoHolder); } // Emit objective C classes and categories into a hashed accelerator table // section. void DwarfDebug::emitAccelObjC() { - DwarfAccelTable AT( - DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)); - for (DwarfUnit *TheU : getUnits()) { - for (const auto &GI : TheU->getAccelObjC()) { - StringRef Name = GI.getKey(); - for (const DIE *D : GI.second) - AT.AddName(Name, D); - } - } - - AT.FinalizeTable(Asm, "ObjC"); + AccelObjC.FinalizeTable(Asm, "ObjC"); Asm->OutStreamer.SwitchSection( Asm->getObjFileLowering().getDwarfAccelObjCSection()); MCSymbol *SectionBegin = Asm->GetTempSymbol("objc_begin"); Asm->OutStreamer.EmitLabel(SectionBegin); // Emit the full data. - AT.Emit(Asm, SectionBegin, &InfoHolder); + AccelObjC.Emit(Asm, SectionBegin, &InfoHolder); } // Emit namespace dies into a hashed accelerator table. 
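// Sketch (not from this patch): the refactor in these hunks replaces
// rebuilding a DwarfAccelTable from per-unit maps at emission time with
// member tables (AccelNames, AccelObjC, ...) filled as DIEs are created.
// The accumulation itself is just name -> list of DIE references; invented
// names and standard containers, not the DwarfAccelTable format:
#include <string>
#include <unordered_map>
#include <vector>

struct DieRefSketch { unsigned CuRelativeOffset; };

static std::unordered_map<std::string, std::vector<DieRefSketch>> AccelSketch;

static void addAccelSketch(const std::string &Name, DieRefSketch D) {
  AccelSketch[Name].push_back(D); // hashed and bucketed later, at finalize
}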
void DwarfDebug::emitAccelNamespaces() { - DwarfAccelTable AT( - DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)); - for (DwarfUnit *TheU : getUnits()) { - for (const auto &GI : TheU->getAccelNamespace()) { - StringRef Name = GI.getKey(); - for (const DIE *D : GI.second) - AT.AddName(Name, D); - } - } - - AT.FinalizeTable(Asm, "namespac"); + AccelNamespace.FinalizeTable(Asm, "namespac"); Asm->OutStreamer.SwitchSection( Asm->getObjFileLowering().getDwarfAccelNamespaceSection()); MCSymbol *SectionBegin = Asm->GetTempSymbol("namespac_begin"); Asm->OutStreamer.EmitLabel(SectionBegin); // Emit the full data. - AT.Emit(Asm, SectionBegin, &InfoHolder); + AccelNamespace.Emit(Asm, SectionBegin, &InfoHolder); } // Emit type dies into a hashed accelerator table. void DwarfDebug::emitAccelTypes() { - std::vector Atoms; - Atoms.push_back( - DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4)); - Atoms.push_back( - DwarfAccelTable::Atom(dwarf::DW_ATOM_die_tag, dwarf::DW_FORM_data2)); - Atoms.push_back( - DwarfAccelTable::Atom(dwarf::DW_ATOM_type_flags, dwarf::DW_FORM_data1)); - DwarfAccelTable AT(Atoms); - for (DwarfUnit *TheU : getUnits()) { - for (const auto &GI : TheU->getAccelTypes()) { - StringRef Name = GI.getKey(); - for (const auto &DI : GI.second) - AT.AddName(Name, DI.first, DI.second); - } - } - AT.FinalizeTable(Asm, "types"); + AccelTypes.FinalizeTable(Asm, "types"); Asm->OutStreamer.SwitchSection( Asm->getObjFileLowering().getDwarfAccelTypesSection()); MCSymbol *SectionBegin = Asm->GetTempSymbol("types_begin"); Asm->OutStreamer.EmitLabel(SectionBegin); // Emit the full data. - AT.Emit(Asm, SectionBegin, &InfoHolder); + AccelTypes.Emit(Asm, SectionBegin, &InfoHolder); } // Public name handling. @@ -2148,8 +1776,8 @@ static dwarf::PubIndexEntryDescriptor computeIndexValue(DwarfUnit *CU, // look for that now. DIEValue *SpecVal = Die->findAttribute(dwarf::DW_AT_specification); if (SpecVal) { - DIE *SpecDIE = cast(SpecVal)->getEntry(); - if (SpecDIE->findAttribute(dwarf::DW_AT_external)) + DIE &SpecDIE = cast(SpecVal)->getEntry(); + if (SpecDIE.findAttribute(dwarf::DW_AT_external)) Linkage = dwarf::GIEL_EXTERNAL; } else if (Die->findAttribute(dwarf::DW_AT_external)) Linkage = dwarf::GIEL_EXTERNAL; @@ -2261,69 +1889,6 @@ void DwarfDebug::emitDebugPubTypes(bool GnuStyle) { emitDebugPubSection(GnuStyle, PSec, "Types", &DwarfUnit::getGlobalTypes); } -// Emit strings into a string section. -void DwarfFile::emitStrings(const MCSection *StrSection, - const MCSection *OffsetSection = NULL, - const MCSymbol *StrSecSym = NULL) { - - if (StringPool.empty()) - return; - - // Start the dwarf str section. - Asm->OutStreamer.SwitchSection(StrSection); - - // Get all of the string pool entries and put them in an array by their ID so - // we can sort them. - SmallVector, 64 > Entries; - - for (const auto &I : StringPool) - Entries.push_back(std::make_pair(I.second.second, &I)); - - array_pod_sort(Entries.begin(), Entries.end()); - - for (const auto &Entry : Entries) { - // Emit a label for reference from debug information entries. - Asm->OutStreamer.EmitLabel(Entry.second->getValue().first); - - // Emit the string itself with a terminating null byte. - Asm->OutStreamer.EmitBytes(StringRef(Entry.second->getKeyData(), - Entry.second->getKeyLength() + 1)); - } - - // If we've got an offset section go ahead and emit that now as well. - if (OffsetSection) { - Asm->OutStreamer.SwitchSection(OffsetSection); - unsigned offset = 0; - unsigned size = 4; // FIXME: DWARF64 is 8. 
- for (const auto &Entry : Entries) { - Asm->OutStreamer.EmitIntValue(offset, size); - offset += Entry.second->getKeyLength() + 1; - } - } -} - -// Emit addresses into the section given. -void DwarfFile::emitAddresses(const MCSection *AddrSection) { - - if (AddressPool.empty()) - return; - - // Start the dwarf addr section. - Asm->OutStreamer.SwitchSection(AddrSection); - - // Order the address pool entries by ID - SmallVector Entries(AddressPool.size()); - - for (const auto &I : AddressPool) - Entries[I.second.Number] = - I.second.TLS - ? Asm->getObjFileLowering().getDebugThreadLocalSymbol(I.first) - : MCSymbolRefExpr::Create(I.first, Asm->OutContext); - - for (const MCExpr *Entry : Entries) - Asm->OutStreamer.EmitValue(Entry, Asm->getDataLayout().getPointerSize()); -} - // Emit visible names into a debug str section. void DwarfDebug::emitDebugStr() { DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder; @@ -2332,19 +1897,22 @@ void DwarfDebug::emitDebugStr() { void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer, const DebugLocEntry &Entry) { - DIVariable DV(Entry.getVariable()); - if (Entry.isInt()) { + assert(Entry.getValues().size() == 1 && + "multi-value entries are not supported yet."); + const DebugLocEntry::Value Value = Entry.getValues()[0]; + DIVariable DV(Value.getVariable()); + if (Value.isInt()) { DIBasicType BTy(resolve(DV.getType())); if (BTy.Verify() && (BTy.getEncoding() == dwarf::DW_ATE_signed || BTy.getEncoding() == dwarf::DW_ATE_signed_char)) { Streamer.EmitInt8(dwarf::DW_OP_consts, "DW_OP_consts"); - Streamer.EmitSLEB128(Entry.getInt()); + Streamer.EmitSLEB128(Value.getInt()); } else { Streamer.EmitInt8(dwarf::DW_OP_constu, "DW_OP_constu"); - Streamer.EmitULEB128(Entry.getInt()); + Streamer.EmitULEB128(Value.getInt()); } - } else if (Entry.isLocation()) { - MachineLocation Loc = Entry.getLoc(); + } else if (Value.isLocation()) { + MachineLocation Loc = Value.getLoc(); if (!DV.hasComplexAddress()) // Regular entry. Asm->EmitDwarfRegOp(Streamer, Loc, DV.isIndirect()); @@ -2443,7 +2011,7 @@ void DwarfDebug::emitDebugLocDWO() { // address we know we've emitted elsewhere (the start of the function? // The start of the CU or CU subrange that encloses this range?) Asm->EmitInt8(dwarf::DW_LLE_start_length_entry); - unsigned idx = InfoHolder.getAddrPoolIndex(Entry.getBeginSym()); + unsigned idx = AddrPool.getIndex(Entry.getBeginSym()); Asm->EmitULEB128(idx); Asm->EmitLabelDifference(Entry.getEndSym(), Entry.getBeginSym(), 4); @@ -2464,7 +2032,7 @@ void DwarfDebug::emitDebugARanges() { Asm->OutStreamer.SwitchSection( Asm->getObjFileLowering().getDwarfARangesSection()); - typedef DenseMap > SpansType; + typedef DenseMap> SpansType; SpansType Spans; @@ -2502,11 +2070,11 @@ void DwarfDebug::emitDebugARanges() { // If we have no section (e.g. common), just write out // individual spans for each symbol. - if (Section == NULL) { + if (!Section) { for (const SymbolCU &Cur : List) { ArangeSpan Span; Span.Start = Cur.Sym; - Span.End = NULL; + Span.End = nullptr; if (Cur.CU) Spans[Cur.CU].push_back(Span); } @@ -2613,9 +2181,6 @@ void DwarfDebug::emitDebugRanges() { for (const auto &I : CUMap) { DwarfCompileUnit *TheCU = I.second; - // Emit a symbol so we can find the beginning of our ranges. - Asm->OutStreamer.EmitLabel(TheCU->getLabelRange()); - // Iterate over the misc ranges for the compile units in the module. for (const RangeSpanList &List : TheCU->getRangeLists()) { // Emit our symbol so we can find the beginning of the range. 
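// Sketch (not from this patch): the next hunk emits range-list entries as
// differences from a base label when the CU has exactly one range, instead
// of absolute symbol values. A difference between two labels in the same
// section is an assembly-time constant, so it needs no relocation:
#include <cstdint>

struct RangeSketch { uint64_t Begin, End; };

// Entries become (Begin - Base, End - Base); the consumer adds Base back.
static RangeSketch baseRelative(RangeSketch R, uint64_t Base) {
  return {R.Begin - Base, R.End - Base};
}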
@@ -2626,8 +2191,15 @@ void DwarfDebug::emitDebugRanges() { const MCSymbol *End = Range.getEnd(); assert(Begin && "Range without a begin symbol?"); assert(End && "Range without an end symbol?"); - Asm->OutStreamer.EmitSymbolValue(Begin, Size); - Asm->OutStreamer.EmitSymbolValue(End, Size); + if (TheCU->getRanges().size() == 1) { + // Grab the begin symbol from the first range as our base. + const MCSymbol *Base = TheCU->getRanges()[0].getStart(); + Asm->EmitLabelDifference(Begin, Base, Size); + Asm->EmitLabelDifference(End, Base, Size); + } else { + Asm->OutStreamer.EmitSymbolValue(Begin, Size); + Asm->OutStreamer.EmitSymbolValue(End, Size); + } } // And terminate the list with two 0 values. @@ -2656,52 +2228,52 @@ void DwarfDebug::emitDebugRanges() { // DWARF5 Experimental Separate Dwarf emitters. -void DwarfDebug::initSkeletonUnit(const DwarfUnit *U, DIE *Die, - DwarfUnit *NewU) { +void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die, + std::unique_ptr NewU) { NewU->addLocalString(Die, dwarf::DW_AT_GNU_dwo_name, - U->getCUNode().getSplitDebugFilename()); + U.getCUNode().getSplitDebugFilename()); if (!CompilationDir.empty()) NewU->addLocalString(Die, dwarf::DW_AT_comp_dir, CompilationDir); - addGnuPubAttributes(NewU, Die); + addGnuPubAttributes(*NewU, Die); - SkeletonHolder.addUnit(NewU); + SkeletonHolder.addUnit(std::move(NewU)); } // This DIE has the following attributes: DW_AT_comp_dir, DW_AT_stmt_list, // DW_AT_low_pc, DW_AT_high_pc, DW_AT_ranges, DW_AT_dwo_name, DW_AT_dwo_id, // DW_AT_addr_base, DW_AT_ranges_base. -DwarfCompileUnit *DwarfDebug::constructSkeletonCU(const DwarfCompileUnit *CU) { +DwarfCompileUnit &DwarfDebug::constructSkeletonCU(const DwarfCompileUnit &CU) { - DIE *Die = new DIE(dwarf::DW_TAG_compile_unit); - DwarfCompileUnit *NewCU = new DwarfCompileUnit( - CU->getUniqueID(), Die, CU->getCUNode(), Asm, this, &SkeletonHolder); - NewCU->initSection(Asm->getObjFileLowering().getDwarfInfoSection(), - DwarfInfoSectionSym); + auto OwnedUnit = make_unique( + CU.getUniqueID(), CU.getCUNode(), Asm, this, &SkeletonHolder); + DwarfCompileUnit &NewCU = *OwnedUnit; + NewCU.initSection(Asm->getObjFileLowering().getDwarfInfoSection(), + DwarfInfoSectionSym); - NewCU->initStmtList(DwarfLineSectionSym); + NewCU.initStmtList(DwarfLineSectionSym); - initSkeletonUnit(CU, Die, NewCU); + initSkeletonUnit(CU, NewCU.getUnitDie(), std::move(OwnedUnit)); return NewCU; } // This DIE has the following attributes: DW_AT_comp_dir, DW_AT_dwo_name, // DW_AT_addr_base. 
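// Sketch (not from this patch): constructSkeletonCU above and
// constructSkeletonTU below share one ownership pattern introduced by this
// patch: build the unit with make_unique, keep a reference for the caller,
// and move the owning pointer into the holder. Minimal standalone form:
#include <memory>
#include <vector>

struct UnitSketch {};

static std::vector<std::unique_ptr<UnitSketch>> HolderSketch;

static UnitSketch &makeAndRegister() {
  auto Owned = std::make_unique<UnitSketch>();
  UnitSketch &Ref = *Owned; // stays valid; the object itself never moves
  HolderSketch.push_back(std::move(Owned));
  return Ref;
}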
-DwarfTypeUnit *DwarfDebug::constructSkeletonTU(DwarfTypeUnit *TU) {
+DwarfTypeUnit &DwarfDebug::constructSkeletonTU(DwarfTypeUnit &TU) {
   DwarfCompileUnit &CU = static_cast<DwarfCompileUnit &>(
-      *SkeletonHolder.getUnits()[TU->getCU().getUniqueID()]);
+      *SkeletonHolder.getUnits()[TU.getCU().getUniqueID()]);
 
-  DIE *Die = new DIE(dwarf::DW_TAG_type_unit);
-  DwarfTypeUnit *NewTU =
-      new DwarfTypeUnit(TU->getUniqueID(), Die, CU, Asm, this, &SkeletonHolder);
-  NewTU->setTypeSignature(TU->getTypeSignature());
-  NewTU->setType(NULL);
-  NewTU->initSection(
-      Asm->getObjFileLowering().getDwarfTypesSection(TU->getTypeSignature()));
+  auto OwnedUnit = make_unique<DwarfTypeUnit>(TU.getUniqueID(), CU, Asm, this,
+                                              &SkeletonHolder);
+  DwarfTypeUnit &NewTU = *OwnedUnit;
+  NewTU.setTypeSignature(TU.getTypeSignature());
+  NewTU.setType(nullptr);
+  NewTU.initSection(
+      Asm->getObjFileLowering().getDwarfTypesSection(TU.getTypeSignature()));
 
-  initSkeletonUnit(TU, Die, NewTU);
+  initSkeletonUnit(TU, NewTU.getUnitDie(), std::move(OwnedUnit));
   return NewTU;
 }
 
@@ -2711,7 +2283,7 @@ void DwarfDebug::emitDebugInfoDWO() {
   assert(useSplitDwarf() && "No split dwarf debug info?");
   // Don't pass an abbrev symbol, using a constant zero instead so as not to
   // emit relocations into the dwo file.
-  InfoHolder.emitUnits(this, /* AbbrevSymbol */nullptr);
+  InfoHolder.emitUnits(this, /* AbbrevSymbol */ nullptr);
 }
 
 // Emit the .debug_abbrev.dwo section for separated dwarf. This contains the
@@ -2748,14 +2320,25 @@ MCDwarfDwoLineTable *DwarfDebug::getDwoLineTable(const DwarfCompileUnit &CU) {
   return &SplitTypeUnitFileTable;
 }
 
+static uint64_t makeTypeSignature(StringRef Identifier) {
+  MD5 Hash;
+  Hash.update(Identifier);
+  // ... take the least significant 8 bytes and return those. Our MD5
+  // implementation always returns its results in little endian, swap bytes
+  // appropriately.
+  MD5::MD5Result Result;
+  Hash.final(Result);
+  return *reinterpret_cast<support::ulittle64_t *>(Result + 8);
+}
+
 void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
-                                      StringRef Identifier, DIE *RefDie,
+                                      StringRef Identifier, DIE &RefDie,
                                       DICompositeType CTy) {
-  // Flag the type unit reference as a declaration so that if it contains
-  // members (implicit special members, static data member definitions, member
-  // declarations for definitions in this CU, etc) consumers don't get confused
-  // and think this is a full definition.
-  CU.addFlag(RefDie, dwarf::DW_AT_declaration);
+  // Fast path if we're building some type units and one has already used the
+  // address pool: we know we're going to throw away all this work anyway, so
+  // don't bother building dependent types.
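// Sketch (not from this patch): makeTypeSignature above takes the low 8
// bytes of the MD5 digest, read as a little-endian 64-bit value (LLVM does
// the reinterpretation through support::ulittle64_t). A portable equivalent:
#include <cstdint>

static uint64_t low8AsLittleEndian(const uint8_t Digest[16]) {
  uint64_t Signature = 0;
  for (int I = 0; I < 8; ++I)
    Signature |= uint64_t(Digest[8 + I]) << (8 * I); // bytes 8..15, LE
  return Signature;
}
// The fast-path guard described by the comment just above follows.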
+ if (!TypeUnitsUnderConstruction.empty() && AddrPool.hasBeenUsed()) + return; const DwarfTypeUnit *&TU = DwarfTypeUnits[CTy]; if (TU) { @@ -2763,45 +2346,111 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, return; } - DIE *UnitDie = new DIE(dwarf::DW_TAG_type_unit); - DwarfTypeUnit *NewTU = - new DwarfTypeUnit(InfoHolder.getUnits().size(), UnitDie, CU, Asm, this, - &InfoHolder, getDwoLineTable(CU)); - TU = NewTU; - InfoHolder.addUnit(NewTU); + bool TopLevelType = TypeUnitsUnderConstruction.empty(); + AddrPool.resetUsedFlag(); - NewTU->addUInt(UnitDie, dwarf::DW_AT_language, dwarf::DW_FORM_data2, - CU.getLanguage()); + auto OwnedUnit = + make_unique(InfoHolder.getUnits().size(), CU, Asm, this, + &InfoHolder, getDwoLineTable(CU)); + DwarfTypeUnit &NewTU = *OwnedUnit; + DIE &UnitDie = NewTU.getUnitDie(); + TU = &NewTU; + TypeUnitsUnderConstruction.push_back( + std::make_pair(std::move(OwnedUnit), CTy)); - MD5 Hash; - Hash.update(Identifier); - // ... take the least significant 8 bytes and return those. Our MD5 - // implementation always returns its results in little endian, swap bytes - // appropriately. - MD5::MD5Result Result; - Hash.final(Result); - uint64_t Signature = *reinterpret_cast(Result + 8); - NewTU->setTypeSignature(Signature); - if (useSplitDwarf()) - NewTU->setSkeleton(constructSkeletonTU(NewTU)); - else - CU.applyStmtList(*UnitDie); + NewTU.addUInt(UnitDie, dwarf::DW_AT_language, dwarf::DW_FORM_data2, + CU.getLanguage()); - NewTU->setType(NewTU->createTypeDIE(CTy)); + uint64_t Signature = makeTypeSignature(Identifier); + NewTU.setTypeSignature(Signature); - NewTU->initSection( + if (!useSplitDwarf()) + CU.applyStmtList(UnitDie); + + // FIXME: Skip using COMDAT groups for type units in the .dwo file once tools + // such as DWP ( http://gcc.gnu.org/wiki/DebugFissionDWP ) can cope with it. + NewTU.initSection( useSplitDwarf() ? Asm->getObjFileLowering().getDwarfTypesDWOSection(Signature) : Asm->getObjFileLowering().getDwarfTypesSection(Signature)); - CU.addDIETypeSignature(RefDie, *NewTU); + NewTU.setType(NewTU.createTypeDIE(CTy)); + + if (TopLevelType) { + auto TypeUnitsToAdd = std::move(TypeUnitsUnderConstruction); + TypeUnitsUnderConstruction.clear(); + + // Types referencing entries in the address table cannot be placed in type + // units. + if (AddrPool.hasBeenUsed()) { + + // Remove all the types built while building this type. + // This is pessimistic as some of these types might not be dependent on + // the type that used an address. + for (const auto &TU : TypeUnitsToAdd) + DwarfTypeUnits.erase(TU.second); + + // Construct this type in the CU directly. + // This is inefficient because all the dependent types will be rebuilt + // from scratch, including building them in type units, discovering that + // they depend on addresses, throwing them out and rebuilding them. + CU.constructTypeDIE(RefDie, CTy); + return; + } + + // If the type wasn't dependent on fission addresses, finish adding the type + // and all its dependent types. 
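// Sketch (not from this patch): the TopLevelType block above is a
// commit-or-rollback transaction. Units are built into a pending list; if a
// poison condition was observed while building (here: the address pool was
// touched), the whole batch is discarded, otherwise it is committed:
#include <memory>
#include <utility>
#include <vector>

struct TypeUnitSketch {};

static std::vector<std::unique_ptr<TypeUnitSketch>> PendingSketch;
static std::vector<std::unique_ptr<TypeUnitSketch>> CommittedSketch;

static void commitOrRollback(bool PoisonSeen) {
  auto Batch = std::move(PendingSketch);
  PendingSketch.clear();
  if (PoisonSeen)
    return; // Batch's destructor frees the speculative units
  for (auto &U : Batch)
    CommittedSketch.push_back(std::move(U));
}
// The commit loop for the non-poisoned case follows.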
+ for (auto &TU : TypeUnitsToAdd) { + if (useSplitDwarf()) + TU.first->setSkeleton(constructSkeletonTU(*TU.first)); + InfoHolder.addUnit(std::move(TU.first)); + } + } + CU.addDIETypeSignature(RefDie, NewTU); } -void DwarfDebug::attachLowHighPC(DwarfCompileUnit *Unit, DIE *D, +void DwarfDebug::attachLowHighPC(DwarfCompileUnit &Unit, DIE &D, MCSymbol *Begin, MCSymbol *End) { - Unit->addLabelAddress(D, dwarf::DW_AT_low_pc, Begin); + assert(Begin && "Begin label should not be null!"); + assert(End && "End label should not be null!"); + assert(Begin->isDefined() && "Invalid starting label"); + assert(End->isDefined() && "Invalid end label"); + + Unit.addLabelAddress(D, dwarf::DW_AT_low_pc, Begin); if (DwarfVersion < 4) - Unit->addLabelAddress(D, dwarf::DW_AT_high_pc, End); + Unit.addLabelAddress(D, dwarf::DW_AT_high_pc, End); else - Unit->addLabelDelta(D, dwarf::DW_AT_high_pc, End, Begin); + Unit.addLabelDelta(D, dwarf::DW_AT_high_pc, End, Begin); +} + +// Accelerator table mutators - add each name along with its companion +// DIE to the proper table while ensuring that the name that we're going +// to reference is in the string table. We do this since the names we +// add may not only be identical to the names in the DIE. +void DwarfDebug::addAccelName(StringRef Name, const DIE &Die) { + if (!useDwarfAccelTables()) + return; + AccelNames.AddName(Name, InfoHolder.getStringPool().getSymbol(*Asm, Name), + &Die); +} + +void DwarfDebug::addAccelObjC(StringRef Name, const DIE &Die) { + if (!useDwarfAccelTables()) + return; + AccelObjC.AddName(Name, InfoHolder.getStringPool().getSymbol(*Asm, Name), + &Die); +} + +void DwarfDebug::addAccelNamespace(StringRef Name, const DIE &Die) { + if (!useDwarfAccelTables()) + return; + AccelNamespace.AddName(Name, InfoHolder.getStringPool().getSymbol(*Asm, Name), + &Die); +} + +void DwarfDebug::addAccelType(StringRef Name, const DIE &Die, char Flags) { + if (!useDwarfAccelTables()) + return; + AccelTypes.AddName(Name, InfoHolder.getStringPool().getSymbol(*Asm, Name), + &Die); } diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index da708f5..2f5abc8 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -14,10 +14,13 @@ #ifndef CODEGEN_ASMPRINTER_DWARFDEBUG_H__ #define CODEGEN_ASMPRINTER_DWARFDEBUG_H__ +#include "DwarfFile.h" #include "AsmPrinterHandler.h" #include "DIE.h" +#include "DbgValueHistoryCalculator.h" #include "DebugLocEntry.h" #include "DebugLocList.h" +#include "DwarfAccelTable.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -30,6 +33,8 @@ #include "llvm/MC/MCDwarf.h" #include "llvm/Support/Allocator.h" +#include + namespace llvm { class AsmPrinter; @@ -74,12 +79,12 @@ class DbgVariable { public: // AbsVar may be NULL. DbgVariable(DIVariable V, DbgVariable *AV, DwarfDebug *DD) - : Var(V), TheDIE(0), DotDebugLocOffset(~0U), AbsVar(AV), MInsn(0), - FrameIndex(~0), DD(DD) {} + : Var(V), TheDIE(nullptr), DotDebugLocOffset(~0U), AbsVar(AV), + MInsn(nullptr), FrameIndex(~0), DD(DD) {} // Accessors. 
DIVariable getVariable() const { return Var; } - void setDIE(DIE *D) { TheDIE = D; } + void setDIE(DIE &D) { TheDIE = &D; } DIE *getDIE() const { return TheDIE; } void setDotDebugLocOffset(unsigned O) { DotDebugLocOffset = O; } unsigned getDotDebugLocOffset() const { return DotDebugLocOffset; } @@ -90,7 +95,7 @@ public: int getFrameIndex() const { return FrameIndex; } void setFrameIndex(int FI) { FrameIndex = FI; } // Translate tag to proper Dwarf tag. - uint16_t getTag() const { + dwarf::Tag getTag() const { if (Var.getTag() == dwarf::DW_TAG_arg_variable) return dwarf::DW_TAG_formal_parameter; @@ -131,99 +136,6 @@ private: template T resolve(DIRef Ref) const; }; -/// \brief Collects and handles information specific to a particular -/// collection of units. This collection represents all of the units -/// that will be ultimately output into a single object file. -class DwarfFile { - // Target of Dwarf emission, used for sizing of abbreviations. - AsmPrinter *Asm; - - // Used to uniquely define abbreviations. - FoldingSet AbbreviationsSet; - - // A list of all the unique abbreviations in use. - std::vector Abbreviations; - - // A pointer to all units in the section. - SmallVector CUs; - - // Collection of strings for this unit and assorted symbols. - // A String->Symbol mapping of strings used by indirect - // references. - typedef StringMap, BumpPtrAllocator &> - StrPool; - StrPool StringPool; - unsigned NextStringPoolNumber; - std::string StringPref; - - struct AddressPoolEntry { - unsigned Number; - bool TLS; - AddressPoolEntry(unsigned Number, bool TLS) : Number(Number), TLS(TLS) {} - }; - // Collection of addresses for this unit and assorted labels. - // A Symbol->unsigned mapping of addresses used by indirect - // references. - typedef DenseMap AddrPool; - AddrPool AddressPool; - unsigned NextAddrPoolNumber; - -public: - DwarfFile(AsmPrinter *AP, const char *Pref, BumpPtrAllocator &DA) - : Asm(AP), StringPool(DA), NextStringPoolNumber(0), StringPref(Pref), - AddressPool(), NextAddrPoolNumber(0) {} - - ~DwarfFile(); - - const SmallVectorImpl &getUnits() { return CUs; } - - /// \brief Compute the size and offset of a DIE given an incoming Offset. - unsigned computeSizeAndOffset(DIE *Die, unsigned Offset); - - /// \brief Compute the size and offset of all the DIEs. - void computeSizeAndOffsets(); - - /// \brief Define a unique number for the abbreviation. - void assignAbbrevNumber(DIEAbbrev &Abbrev); - - /// \brief Add a unit to the list of CUs. - void addUnit(DwarfUnit *CU) { CUs.push_back(CU); } - - /// \brief Emit all of the units to the section listed with the given - /// abbreviation section. - void emitUnits(DwarfDebug *DD, const MCSymbol *ASectionSym); - - /// \brief Emit a set of abbreviations to the specific section. - void emitAbbrevs(const MCSection *); - - /// \brief Emit all of the strings to the section given. - void emitStrings(const MCSection *StrSection, const MCSection *OffsetSection, - const MCSymbol *StrSecSym); - - /// \brief Emit all of the addresses to the section given. - void emitAddresses(const MCSection *AddrSection); - - /// \brief Returns the entry into the start of the pool. - MCSymbol *getStringPoolSym(); - - /// \brief Returns an entry into the string pool with the given - /// string text. - MCSymbol *getStringPoolEntry(StringRef Str); - - /// \brief Returns the index into the string pool with the given - /// string text. - unsigned getStringPoolIndex(StringRef Str); - - /// \brief Returns the string pool. 
- StrPool *getStringPool() { return &StringPool; } - - /// \brief Returns the index into the address pool with the given - /// label/symbol. - unsigned getAddrPoolIndex(const MCSymbol *Sym, bool TLS = false); - - /// \brief Returns the address pool. - AddrPool *getAddrPool() { return &AddressPool; } -}; /// \brief Helper used to pair up a symbol and its DWARF compile unit. struct SymbolCU { @@ -287,7 +199,7 @@ class DwarfDebug : public AsmPrinterHandler { ScopeVariablesMap ScopeVariables; // Collection of abstract variables. - DenseMap AbstractVariables; + DenseMap> AbstractVariables; // Collection of DebugLocEntry. Stored in a linked list so that DIELocLists // can refer to them in spite of insertions into this list. @@ -307,15 +219,8 @@ class DwarfDebug : public AsmPrinterHandler { // Maps instruction with label emitted after instruction. DenseMap LabelsAfterInsn; - // Every user variable mentioned by a DBG_VALUE instruction in order of - // appearance. - SmallVector UserVariables; - - // For each user variable, keep a list of DBG_VALUE instructions in order. - // The list can also contain normal instructions that clobber the previous - // DBG_VALUE. - typedef DenseMap > - DbgValueHistoryMap; + // History of DBG_VALUE and clobber instructions for each user variable. + // Variables are listed in order of appearance. DbgValueHistoryMap DbgValues; // Previous instruction's location information. This is used to determine @@ -373,6 +278,8 @@ class DwarfDebug : public AsmPrinterHandler { // them. DenseMap DwarfTypeUnits; + SmallVector, DICompositeType>, 1> TypeUnitsUnderConstruction; + // Whether to emit the pubnames/pubtypes sections. bool HasDwarfPubSections; @@ -411,22 +318,30 @@ class DwarfDebug : public AsmPrinterHandler { // True iff there are multiple CUs in this module. bool SingleCU; + AddressPool AddrPool; + + DwarfAccelTable AccelNames; + DwarfAccelTable AccelObjC; + DwarfAccelTable AccelNamespace; + DwarfAccelTable AccelTypes; + MCDwarfDwoLineTable *getDwoLineTable(const DwarfCompileUnit &); void addScopeVariable(LexicalScope *LS, DbgVariable *Var); - const SmallVectorImpl &getUnits() { + const SmallVectorImpl> &getUnits() { return InfoHolder.getUnits(); } /// \brief Find abstract variable associated with Var. DbgVariable *findAbstractVariable(DIVariable &Var, DebugLoc Loc); + DbgVariable *findAbstractVariable(DIVariable &Var, const MDNode *Scope); /// \brief Find DIE for the given subprogram and attach appropriate /// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global /// variables in this scope then create and insert DIEs for these /// variables. - DIE *updateSubprogramScopeDIE(DwarfCompileUnit *SPCU, DISubprogram SP); + DIE &updateSubprogramScopeDIE(DwarfCompileUnit &SPCU, DISubprogram SP); /// \brief A helper function to check whether the DIE for a given Scope is /// going to be null. @@ -434,22 +349,33 @@ class DwarfDebug : public AsmPrinterHandler { /// \brief A helper function to construct a RangeSpanList for a given /// lexical scope. - void addScopeRangeList(DwarfCompileUnit *TheCU, DIE *ScopeDIE, + void addScopeRangeList(DwarfCompileUnit &TheCU, DIE &ScopeDIE, const SmallVectorImpl &Range); /// \brief Construct new DW_TAG_lexical_block for this scope and /// attach DW_AT_low_pc/DW_AT_high_pc labels. - DIE *constructLexicalScopeDIE(DwarfCompileUnit *TheCU, LexicalScope *Scope); + std::unique_ptr constructLexicalScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope); /// \brief This scope represents inlined body of a function. 
Construct /// DIE to represent this concrete inlined copy of the function. - DIE *constructInlinedScopeDIE(DwarfCompileUnit *TheCU, LexicalScope *Scope); + std::unique_ptr constructInlinedScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope); /// \brief Construct a DIE for this scope. - DIE *constructScopeDIE(DwarfCompileUnit *TheCU, LexicalScope *Scope); + std::unique_ptr constructScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope); + void createAndAddScopeChildren(DwarfCompileUnit &TheCU, LexicalScope *Scope, + DIE &ScopeDIE); + /// \brief Construct a DIE for this abstract scope. + void constructAbstractSubprogramScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope); + /// \brief Construct a DIE for this subprogram scope. + DIE &constructSubprogramScopeDIE(DwarfCompileUnit &TheCU, + LexicalScope *Scope); /// A helper function to create children of a Scope DIE. - DIE *createScopeChildrenDIE(DwarfCompileUnit *TheCU, LexicalScope *Scope, - SmallVectorImpl &Children); + DIE *createScopeChildrenDIE(DwarfCompileUnit &TheCU, LexicalScope *Scope, + SmallVectorImpl> &Children); /// \brief Emit initial Dwarf sections with a label at the start of each one. void emitSectionLabels(); @@ -460,12 +386,11 @@ class DwarfDebug : public AsmPrinterHandler { /// \brief Compute the size and offset of all the DIEs. void computeSizeAndOffsets(); - /// \brief Attach DW_AT_inline attribute with inlined subprogram DIEs. - void computeInlinedDIEs(); - /// \brief Collect info for variables that were optimized out. void collectDeadVariables(); + void finishSubprogramDefinitions(); + /// \brief Finish off debug information after all functions have been /// processed. void finalizeModuleInfo(); @@ -535,15 +460,16 @@ class DwarfDebug : public AsmPrinterHandler { /// DWARF 5 Experimental Split Dwarf Emitters /// \brief Initialize common features of skeleton units. - void initSkeletonUnit(const DwarfUnit *U, DIE *Die, DwarfUnit *NewU); + void initSkeletonUnit(const DwarfUnit &U, DIE &Die, + std::unique_ptr NewU); /// \brief Construct the split debug info compile unit for the debug info /// section. - DwarfCompileUnit *constructSkeletonCU(const DwarfCompileUnit *CU); + DwarfCompileUnit &constructSkeletonCU(const DwarfCompileUnit &CU); /// \brief Construct the split debug info compile unit for the debug info /// section. - DwarfTypeUnit *constructSkeletonTU(DwarfTypeUnit *TU); + DwarfTypeUnit &constructSkeletonTU(DwarfTypeUnit &TU); /// \brief Emit the debug info dwo section. void emitDebugInfoDWO(); @@ -559,25 +485,22 @@ class DwarfDebug : public AsmPrinterHandler { /// Flags to let the linker know we have emitted new style pubnames. Only /// emit it here if we don't have a skeleton CU for split dwarf. - void addGnuPubAttributes(DwarfUnit *U, DIE *D) const; + void addGnuPubAttributes(DwarfUnit &U, DIE &D) const; /// \brief Create new DwarfCompileUnit for the given metadata node with tag /// DW_TAG_compile_unit. - DwarfCompileUnit *constructDwarfCompileUnit(DICompileUnit DIUnit); - - /// \brief Construct subprogram DIE. - void constructSubprogramDIE(DwarfCompileUnit *TheCU, const MDNode *N); + DwarfCompileUnit &constructDwarfCompileUnit(DICompileUnit DIUnit); /// \brief Construct imported_module or imported_declaration DIE. - void constructImportedEntityDIE(DwarfCompileUnit *TheCU, const MDNode *N); + void constructImportedEntityDIE(DwarfCompileUnit &TheCU, const MDNode *N); /// \brief Construct import_module DIE. 
- void constructImportedEntityDIE(DwarfCompileUnit *TheCU, const MDNode *N, - DIE *Context); + void constructImportedEntityDIE(DwarfCompileUnit &TheCU, const MDNode *N, + DIE &Context); /// \brief Construct import_module DIE. - void constructImportedEntityDIE(DwarfCompileUnit *TheCU, - const DIImportedEntity &Module, DIE *Context); + void constructImportedEntityDIE(DwarfCompileUnit &TheCU, + const DIImportedEntity &Module, DIE &Context); /// \brief Register a source line with debug info. Returns the unique /// label that was emitted and which provides correspondence to the @@ -602,7 +525,7 @@ class DwarfDebug : public AsmPrinterHandler { /// \brief Ensure that a label will be emitted before MI. void requestLabelBeforeInsn(const MachineInstr *MI) { - LabelsBeforeInsn.insert(std::make_pair(MI, (MCSymbol *)0)); + LabelsBeforeInsn.insert(std::make_pair(MI, nullptr)); } /// \brief Return Label preceding the instruction. @@ -610,13 +533,15 @@ class DwarfDebug : public AsmPrinterHandler { /// \brief Ensure that a label will be emitted after MI. void requestLabelAfterInsn(const MachineInstr *MI) { - LabelsAfterInsn.insert(std::make_pair(MI, (MCSymbol *)0)); + LabelsAfterInsn.insert(std::make_pair(MI, nullptr)); } /// \brief Return Label immediately following the instruction. MCSymbol *getLabelAfterInsn(const MachineInstr *MI); - void attachLowHighPC(DwarfCompileUnit *Unit, DIE *D, MCSymbol *Begin, + void attachRangesOrLowHighPC(DwarfCompileUnit &Unit, DIE &D, + const SmallVectorImpl &Ranges); + void attachLowHighPC(DwarfCompileUnit &Unit, DIE &D, MCSymbol *Begin, MCSymbol *End); public: @@ -625,6 +550,8 @@ public: // DwarfDebug(AsmPrinter *A, Module *M); + ~DwarfDebug() override; + void insertDIE(const MDNode *TypeMD, DIE *Die) { MDTypeNodeToDieMap.insert(std::make_pair(TypeMD, Die)); } @@ -654,7 +581,7 @@ public: /// \brief Add a DIE to the set of types that we're going to pull into /// type units. void addDwarfTypeUnitType(DwarfCompileUnit &CU, StringRef Identifier, - DIE *Die, DICompositeType CTy); + DIE &Die, DICompositeType CTy); /// \brief Add a label so that arange data can be generated for it. void addArangeLabel(SymbolCU SCU) { ArangeLabels.push_back(SCU); } @@ -666,7 +593,7 @@ public: } /// \brief Recursively Emits a debug information entry. - void emitDIE(DIE *Die); + void emitDIE(DIE &Die); // Experimental DWARF5 features. @@ -720,6 +647,18 @@ public: /// isSubprogramContext - Return true if Context is either a subprogram /// or another context nested inside a subprogram. 
bool isSubprogramContext(const MDNode *Context); + + void addSubprogramNames(DISubprogram SP, DIE &Die); + + AddressPool &getAddressPool() { return AddrPool; } + + void addAccelName(StringRef Name, const DIE &Die); + + void addAccelObjC(StringRef Name, const DIE &Die); + + void addAccelNamespace(StringRef Name, const DIE &Die); + + void addAccelType(StringRef Name, const DIE &Die, char Flags); }; } // End of namespace llvm diff --git a/lib/CodeGen/AsmPrinter/DwarfException.cpp b/lib/CodeGen/AsmPrinter/DwarfException.cpp index 113a9e4..3a12c73 100644 --- a/lib/CodeGen/AsmPrinter/DwarfException.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfException.cpp @@ -103,7 +103,7 @@ ComputeActionsTable(const SmallVectorImpl &LandingPads, int FirstAction = 0; unsigned SizeActions = 0; - const LandingPadInfo *PrevLPI = 0; + const LandingPadInfo *PrevLPI = nullptr; for (SmallVectorImpl::const_iterator I = LandingPads.begin(), E = LandingPads.end(); I != E; ++I) { @@ -181,7 +181,7 @@ bool DwarfException::CallToNoUnwindFunction(const MachineInstr *MI) { if (!MO.isGlobal()) continue; const Function *F = dyn_cast(MO.getGlobal()); - if (F == 0) continue; + if (!F) continue; if (SawFunc) { // Be conservative. If we have more than one function operand for this @@ -214,7 +214,7 @@ ComputeCallSiteTable(SmallVectorImpl &CallSites, const SmallVectorImpl &LandingPads, const SmallVectorImpl &FirstActions) { // The end label of the previous invoke or nounwind try-range. - MCSymbol *LastLabel = 0; + MCSymbol *LastLabel = nullptr; // Whether there is a potentially throwing instruction (currently this means // an ordinary call) between the end of the previous try-range and now. @@ -224,18 +224,16 @@ ComputeCallSiteTable(SmallVectorImpl &CallSites, bool PreviousIsInvoke = false; // Visit all instructions in order of address. - for (MachineFunction::const_iterator I = Asm->MF->begin(), E = Asm->MF->end(); - I != E; ++I) { - for (MachineBasicBlock::const_iterator MI = I->begin(), E = I->end(); - MI != E; ++MI) { - if (!MI->isEHLabel()) { - if (MI->isCall()) - SawPotentiallyThrowing |= !CallToNoUnwindFunction(MI); + for (const auto &MBB : *Asm->MF) { + for (const auto &MI : MBB) { + if (!MI.isEHLabel()) { + if (MI.isCall()) + SawPotentiallyThrowing |= !CallToNoUnwindFunction(&MI); continue; } // End of the previous try-range? - MCSymbol *BeginLabel = MI->getOperand(0).getMCSymbol(); + MCSymbol *BeginLabel = MI.getOperand(0).getMCSymbol(); if (BeginLabel == LastLabel) SawPotentiallyThrowing = false; @@ -255,7 +253,7 @@ ComputeCallSiteTable(SmallVectorImpl &CallSites, // create a call-site entry with no landing pad for the region between the // try-ranges. if (SawPotentiallyThrowing && Asm->MAI->isExceptionHandlingDwarf()) { - CallSiteEntry Site = { LastLabel, BeginLabel, 0, 0 }; + CallSiteEntry Site = { LastLabel, BeginLabel, nullptr, 0 }; CallSites.push_back(Site); PreviousIsInvoke = false; } @@ -305,7 +303,7 @@ ComputeCallSiteTable(SmallVectorImpl &CallSites, // function may throw, create a call-site entry with no landing pad for the // region following the try-range. 
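// Sketch (not from this patch): the call-site table marks regions with no
// landing pad by entries whose pad is null, and a null begin/end label
// stands for the function's begin/end symbol (filled in later, as the
// EmitExceptionTable hunk above shows). The nullptr members in the entries
// below encode exactly that; field names mirror the initializer order:
struct CallSiteSketch {
  const void *BeginLabel;  // null -> eh_func_begin
  const void *EndLabel;    // null -> eh_func_end
  const void *LandingPad;  // null -> unwinding continues past this region
  unsigned Action;         // 0 -> cleanup only
};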
if (SawPotentiallyThrowing && Asm->MAI->isExceptionHandlingDwarf()) { - CallSiteEntry Site = { LastLabel, 0, 0, 0 }; + CallSiteEntry Site = { LastLabel, nullptr, nullptr, 0 }; CallSites.push_back(Site); } } @@ -571,10 +569,10 @@ void DwarfException::EmitExceptionTable() { Asm->GetTempSymbol("eh_func_begin", Asm->getFunctionNumber()); MCSymbol *BeginLabel = S.BeginLabel; - if (BeginLabel == 0) + if (!BeginLabel) BeginLabel = EHFuncBeginSym; MCSymbol *EndLabel = S.EndLabel; - if (EndLabel == 0) + if (!EndLabel) EndLabel = Asm->GetTempSymbol("eh_func_end", Asm->getFunctionNumber()); diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp new file mode 100644 index 0000000..737ee54 --- /dev/null +++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp @@ -0,0 +1,156 @@ +//===-- llvm/CodeGen/DwarfFile.cpp - Dwarf Debug Framework ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "DwarfFile.h" + +#include "DwarfDebug.h" +#include "DwarfUnit.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/LEB128.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +namespace llvm { +DwarfFile::DwarfFile(AsmPrinter *AP, StringRef Pref, BumpPtrAllocator &DA) + : Asm(AP), StrPool(DA, *Asm, Pref) {} + +DwarfFile::~DwarfFile() {} + +// Define a unique number for the abbreviation. +// +void DwarfFile::assignAbbrevNumber(DIEAbbrev &Abbrev) { + // Check the set for priors. + DIEAbbrev *InSet = AbbreviationsSet.GetOrInsertNode(&Abbrev); + + // If it's newly added. + if (InSet == &Abbrev) { + // Add to abbreviation list. + Abbreviations.push_back(&Abbrev); + + // Assign the vector position + 1 as its number. + Abbrev.setNumber(Abbreviations.size()); + } else { + // Assign existing abbreviation number. + Abbrev.setNumber(InSet->getNumber()); + } +} + +void DwarfFile::addUnit(std::unique_ptr U) { + CUs.push_back(std::move(U)); +} + +// Emit the various dwarf units to the unit section USection with +// the abbreviations going into ASection. +void DwarfFile::emitUnits(DwarfDebug *DD, const MCSymbol *ASectionSym) { + for (const auto &TheU : CUs) { + DIE &Die = TheU->getUnitDie(); + const MCSection *USection = TheU->getSection(); + Asm->OutStreamer.SwitchSection(USection); + + // Emit the compile units header. + Asm->OutStreamer.EmitLabel(TheU->getLabelBegin()); + + // Emit size of content not including length itself + Asm->OutStreamer.AddComment("Length of Unit"); + Asm->EmitInt32(TheU->getHeaderSize() + Die.getSize()); + + TheU->emitHeader(ASectionSym); + + DD->emitDIE(Die); + Asm->OutStreamer.EmitLabel(TheU->getLabelEnd()); + } +} +// Compute the size and offset for each DIE. +void DwarfFile::computeSizeAndOffsets() { + // Offset from the first CU in the debug info section is 0 initially. + unsigned SecOffset = 0; + + // Iterate over each compile unit and set the size and offsets for each + // DIE within each compile unit. All offsets are CU relative. + for (const auto &TheU : CUs) { + TheU->setDebugInfoOffset(SecOffset); + + // CU-relative offset is reset to 0 here. + unsigned Offset = sizeof(int32_t) + // Length of Unit Info + TheU->getHeaderSize(); // Unit-specific headers + + // EndOffset here is CU-relative, after laying out + // all of the CU DIE. 
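// Sketch (not from this patch): assignAbbrevNumber above hands out 1-based
// numbers, reusing the number of any structurally identical abbreviation
// (LLVM keys a FoldingSet on the abbreviation's profile). The same idea
// with a plain map over a serialized key:
#include <string>
#include <unordered_map>

static std::unordered_map<std::string, unsigned> AbbrevNumbersSketch;

static unsigned uniqueAbbrevNumber(const std::string &ProfileKey) {
  auto Inserted =
      AbbrevNumbersSketch.emplace(ProfileKey, AbbrevNumbersSketch.size() + 1);
  return Inserted.first->second; // existing or freshly assigned number
}
// The CU-relative EndOffset computation described just above follows.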
+    unsigned EndOffset = computeSizeAndOffset(TheU->getUnitDie(), Offset);
+    SecOffset += EndOffset;
+  }
+}
+// Compute the size and offset of a DIE. The offset is relative to start of the
+// CU. It returns the offset after laying out the DIE.
+unsigned DwarfFile::computeSizeAndOffset(DIE &Die, unsigned Offset) {
+  // Record the abbreviation.
+  assignAbbrevNumber(Die.getAbbrev());
+
+  // Get the abbreviation for this DIE.
+  const DIEAbbrev &Abbrev = Die.getAbbrev();
+
+  // Set DIE offset
+  Die.setOffset(Offset);
+
+  // Start the size with the size of abbreviation code.
+  Offset += getULEB128Size(Die.getAbbrevNumber());
+
+  const SmallVectorImpl<DIEValue *> &Values = Die.getValues();
+  const SmallVectorImpl<DIEAbbrevData> &AbbrevData = Abbrev.getData();
+
+  // Size the DIE attribute values.
+  for (unsigned i = 0, N = Values.size(); i < N; ++i)
+    // Size attribute value.
+    Offset += Values[i]->SizeOf(Asm, AbbrevData[i].getForm());
+
+  // Get the children.
+  const auto &Children = Die.getChildren();
+
+  // Size the DIE children if any.
+  if (!Children.empty()) {
+    assert(Abbrev.hasChildren() && "Children flag not set");
+
+    for (auto &Child : Children)
+      Offset = computeSizeAndOffset(*Child, Offset);
+
+    // End of children marker.
+    Offset += sizeof(int8_t);
+  }
+
+  Die.setSize(Offset - Die.getOffset());
+  return Offset;
+}
+void DwarfFile::emitAbbrevs(const MCSection *Section) {
+  // Check to see if it is worth the effort.
+  if (!Abbreviations.empty()) {
+    // Start the debug abbrev section.
+    Asm->OutStreamer.SwitchSection(Section);
+
+    // For each abbreviation.
+    for (const DIEAbbrev *Abbrev : Abbreviations) {
+      // Emit the abbreviation code (base 1 index.)
+      Asm->EmitULEB128(Abbrev->getNumber(), "Abbreviation Code");
+
+      // Emit the abbreviations data.
+      Abbrev->Emit(Asm);
+    }
+
+    // Mark end of abbreviations.
+    Asm->EmitULEB128(0, "EOM(3)");
+  }
+}
+
+// Emit strings into a string section.
+void DwarfFile::emitStrings(const MCSection *StrSection,
+                            const MCSection *OffsetSection,
+                            const MCSymbol *StrSecSym) {
+  StrPool.emit(*Asm, StrSection, OffsetSection, StrSecSym);
+}
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
new file mode 100644
index 0000000..3985eb2
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -0,0 +1,84 @@
+//===-- llvm/CodeGen/DwarfFile.h - Dwarf Debug Framework -------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DWARFFILE_H__
+#define CODEGEN_ASMPRINTER_DWARFFILE_H__
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/Allocator.h"
+#include "AddressPool.h"
+#include "DwarfStringPool.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace llvm {
+class AsmPrinter;
+class DwarfUnit;
+class DIEAbbrev;
+class MCSymbol;
+class DIE;
+class StringRef;
+class DwarfDebug;
+class MCSection;
+class DwarfFile {
+  // Target of Dwarf emission, used for sizing of abbreviations.
+  AsmPrinter *Asm;
+
+  // Used to uniquely define abbreviations.
+  FoldingSet<DIEAbbrev> AbbreviationsSet;
+
+  // A list of all the unique abbreviations in use.
+  std::vector<DIEAbbrev *> Abbreviations;
+
+  // A pointer to all units in the section.
+  SmallVector<std::unique_ptr<DwarfUnit>, 1> CUs;
+
+  DwarfStringPool StrPool;
+
+public:
+  DwarfFile(AsmPrinter *AP, StringRef Pref, BumpPtrAllocator &DA);
+
+  ~DwarfFile();
+
+  const SmallVectorImpl<std::unique_ptr<DwarfUnit>> &getUnits() { return CUs; }
+
+  /// \brief Compute the size and offset of a DIE given an incoming Offset.
+  unsigned computeSizeAndOffset(DIE &Die, unsigned Offset);
+
+  /// \brief Compute the size and offset of all the DIEs.
+  void computeSizeAndOffsets();
+
+  /// \brief Define a unique number for the abbreviation.
+  void assignAbbrevNumber(DIEAbbrev &Abbrev);
+
+  /// \brief Add a unit to the list of CUs.
+  void addUnit(std::unique_ptr<DwarfUnit> U);
+
+  /// \brief Emit all of the units to the section listed with the given
+  /// abbreviation section.
+  void emitUnits(DwarfDebug *DD, const MCSymbol *ASectionSym);
+
+  /// \brief Emit a set of abbreviations to the specific section.
+  void emitAbbrevs(const MCSection *);
+
+  /// \brief Emit all of the strings to the section given.
+  void emitStrings(const MCSection *StrSection,
+                   const MCSection *OffsetSection = nullptr,
+                   const MCSymbol *StrSecSym = nullptr);
+
+  /// \brief Returns the string pool.
+  DwarfStringPool &getStringPool() { return StrPool; }
+};
+}
+#endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
new file mode 100644
index 0000000..72cab60
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
@@ -0,0 +1,74 @@
+//===-- llvm/CodeGen/DwarfStringPool.cpp - Dwarf Debug Framework ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfStringPool.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+MCSymbol *DwarfStringPool::getSectionSymbol() { return SectionSymbol; }
+
+static std::pair<MCSymbol *, unsigned> &
+getEntry(AsmPrinter &Asm,
+         StringMap<std::pair<MCSymbol *, unsigned>, BumpPtrAllocator &> &Pool,
+         StringRef Prefix, StringRef Str) {
+  std::pair<MCSymbol *, unsigned> &Entry =
+      Pool.GetOrCreateValue(Str).getValue();
+  if (!Entry.first) {
+    Entry.second = Pool.size() - 1;
+    Entry.first = Asm.GetTempSymbol(Prefix, Entry.second);
+  }
+  return Entry;
+}
+
+MCSymbol *DwarfStringPool::getSymbol(AsmPrinter &Asm, StringRef Str) {
+  return getEntry(Asm, Pool, Prefix, Str).first;
+}
+
+unsigned DwarfStringPool::getIndex(AsmPrinter &Asm, StringRef Str) {
+  return getEntry(Asm, Pool, Prefix, Str).second;
+}
+
+void DwarfStringPool::emit(AsmPrinter &Asm, const MCSection *StrSection,
+                           const MCSection *OffsetSection,
+                           const MCSymbol *StrSecSym) {
+  if (Pool.empty())
+    return;
+
+  // Start the dwarf str section.
+  Asm.OutStreamer.SwitchSection(StrSection);
+
+  // Get all of the string pool entries and put them in an array by their ID so
+  // we can sort them.
+  SmallVector<const StringMapEntry<std::pair<MCSymbol *, unsigned>> *, 64>
+      Entries(Pool.size());
+
+  for (const auto &E : Pool)
+    Entries[E.getValue().second] = &E;
+
+  for (const auto &Entry : Entries) {
+    // Emit a label for reference from debug information entries.
+    Asm.OutStreamer.EmitLabel(Entry->getValue().first);
+
+    // Emit the string itself with a terminating null byte.
+    Asm.OutStreamer.EmitBytes(
+        StringRef(Entry->getKeyData(), Entry->getKeyLength() + 1));
+  }
+
+  // If we've got an offset section go ahead and emit that now as well.
+  if (OffsetSection) {
+    Asm.OutStreamer.SwitchSection(OffsetSection);
+    unsigned offset = 0;
+    unsigned size = 4; // FIXME: DWARF64 is 8.
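// Sketch (not from this patch): the offsets section mirrors the string
// order; each entry is the byte offset of its string, i.e. a running sum of
// the preceding lengths plus their null terminators:
#include <cstdint>
#include <string>
#include <vector>

static std::vector<uint32_t> stringOffsets(const std::vector<std::string> &S) {
  std::vector<uint32_t> Offsets;
  uint32_t Offset = 0;
  for (const std::string &Str : S) {
    Offsets.push_back(Offset);
    Offset += Str.size() + 1; // include the terminating null byte
  }
  return Offsets;
}
// The emission loop that performs this incrementally follows.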
+ for (const auto &Entry : Entries) { + Asm.OutStreamer.EmitIntValue(offset, size); + offset += Entry->getKeyLength() + 1; + } + } +} diff --git a/lib/CodeGen/AsmPrinter/DwarfStringPool.h b/lib/CodeGen/AsmPrinter/DwarfStringPool.h new file mode 100644 index 0000000..c1615fb --- /dev/null +++ b/lib/CodeGen/AsmPrinter/DwarfStringPool.h @@ -0,0 +1,55 @@ +//===-- llvm/CodeGen/DwarfStringPool.h - Dwarf Debug Framework -*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef CODEGEN_ASMPRINTER_STRINGPOOL_H__ +#define CODEGEN_ASMPRINTER_STRINGPOOL_H__ + +#include "llvm/ADT/StringMap.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/Support/Allocator.h" + +#include + +namespace llvm { + +class MCSymbol; +class MCSection; +class StringRef; + +// Collection of strings for this unit and assorted symbols. +// A String->Symbol mapping of strings used by indirect +// references. +class DwarfStringPool { + StringMap, BumpPtrAllocator &> Pool; + StringRef Prefix; + MCSymbol *SectionSymbol; + +public: + DwarfStringPool(BumpPtrAllocator &A, AsmPrinter &Asm, StringRef Prefix) + : Pool(A), Prefix(Prefix), SectionSymbol(Asm.GetTempSymbol(Prefix)) {} + + void emit(AsmPrinter &Asm, const MCSection *StrSection, + const MCSection *OffsetSection = nullptr, + const MCSymbol *StrSecSym = nullptr); + + /// \brief Returns the entry into the start of the pool. + MCSymbol *getSectionSymbol(); + + /// \brief Returns an entry into the string pool with the given + /// string text. + MCSymbol *getSymbol(AsmPrinter &Asm, StringRef Str); + + /// \brief Returns the index into the string pool with the given + /// string text. + unsigned getIndex(AsmPrinter &Asm, StringRef Str); + + bool empty() const { return Pool.empty(); } +}; +} +#endif diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 82e9bb0..a70c0f7 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -11,8 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "dwarfdebug" - #include "DwarfUnit.h" #include "DwarfAccelTable.h" #include "DwarfDebug.h" @@ -35,33 +33,38 @@ using namespace llvm; +#define DEBUG_TYPE "dwarfdebug" + static cl::opt GenerateDwarfTypeUnits("generate-type-units", cl::Hidden, cl::desc("Generate DWARF4 type units."), cl::init(false)); /// Unit - Unit constructor. 
-DwarfUnit::DwarfUnit(unsigned UID, DIE *D, DICompileUnit Node, AsmPrinter *A, - DwarfDebug *DW, DwarfFile *DWU) - : UniqueID(UID), CUNode(Node), UnitDie(D), DebugInfoOffset(0), Asm(A), - DD(DW), DU(DWU), IndexTyDie(0), Section(0), Skeleton(0) { +DwarfUnit::DwarfUnit(unsigned UID, dwarf::Tag UnitTag, DICompileUnit Node, + AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU) + : UniqueID(UID), CUNode(Node), UnitDie(UnitTag), DebugInfoOffset(0), Asm(A), + DD(DW), DU(DWU), IndexTyDie(nullptr), Section(nullptr), + Skeleton(nullptr) { + assert(UnitTag == dwarf::DW_TAG_compile_unit || + UnitTag == dwarf::DW_TAG_type_unit); DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1); } -DwarfCompileUnit::DwarfCompileUnit(unsigned UID, DIE *D, DICompileUnit Node, +DwarfCompileUnit::DwarfCompileUnit(unsigned UID, DICompileUnit Node, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU) - : DwarfUnit(UID, D, Node, A, DW, DWU) { - insertDIE(Node, D); + : DwarfUnit(UID, dwarf::DW_TAG_compile_unit, Node, A, DW, DWU) { + insertDIE(Node, &getUnitDie()); } -DwarfTypeUnit::DwarfTypeUnit(unsigned UID, DIE *D, DwarfCompileUnit &CU, - AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU, +DwarfTypeUnit::DwarfTypeUnit(unsigned UID, DwarfCompileUnit &CU, AsmPrinter *A, + DwarfDebug *DW, DwarfFile *DWU, MCDwarfDwoLineTable *SplitLineTable) - : DwarfUnit(UID, D, CU.getCUNode(), A, DW, DWU), CU(CU), - SplitLineTable(SplitLineTable) { + : DwarfUnit(UID, dwarf::DW_TAG_type_unit, CU.getCUNode(), A, DW, DWU), + CU(CU), SplitLineTable(SplitLineTable) { if (SplitLineTable) - addSectionOffset(UnitDie.get(), dwarf::DW_AT_stmt_list, 0); + addSectionOffset(UnitDie, dwarf::DW_AT_stmt_list, 0); } /// ~Unit - Destructor for compile unit. @@ -74,7 +77,7 @@ DwarfUnit::~DwarfUnit() { /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug /// information entry. -DIEEntry *DwarfUnit::createDIEEntry(DIE *Entry) { +DIEEntry *DwarfUnit::createDIEEntry(DIE &Entry) { DIEEntry *Value = new (DIEValueAllocator) DIEEntry(Entry); return Value; } @@ -159,39 +162,39 @@ void DwarfUnit::insertDIE(DIDescriptor Desc, DIE *D) { } /// addFlag - Add a flag that is true. -void DwarfUnit::addFlag(DIE *Die, dwarf::Attribute Attribute) { +void DwarfUnit::addFlag(DIE &Die, dwarf::Attribute Attribute) { if (DD->getDwarfVersion() >= 4) - Die->addValue(Attribute, dwarf::DW_FORM_flag_present, DIEIntegerOne); + Die.addValue(Attribute, dwarf::DW_FORM_flag_present, DIEIntegerOne); else - Die->addValue(Attribute, dwarf::DW_FORM_flag, DIEIntegerOne); + Die.addValue(Attribute, dwarf::DW_FORM_flag, DIEIntegerOne); } /// addUInt - Add an unsigned integer attribute data and value. /// -void DwarfUnit::addUInt(DIE *Die, dwarf::Attribute Attribute, +void DwarfUnit::addUInt(DIE &Die, dwarf::Attribute Attribute, Optional Form, uint64_t Integer) { if (!Form) Form = DIEInteger::BestForm(false, Integer); DIEValue *Value = Integer == 1 ? DIEIntegerOne : new (DIEValueAllocator) DIEInteger(Integer); - Die->addValue(Attribute, *Form, Value); + Die.addValue(Attribute, *Form, Value); } -void DwarfUnit::addUInt(DIE *Block, dwarf::Form Form, uint64_t Integer) { +void DwarfUnit::addUInt(DIE &Block, dwarf::Form Form, uint64_t Integer) { addUInt(Block, (dwarf::Attribute)0, Form, Integer); } /// addSInt - Add an signed integer attribute data and value. 
 ///
-void DwarfUnit::addSInt(DIE *Die, dwarf::Attribute Attribute,
+void DwarfUnit::addSInt(DIE &Die, dwarf::Attribute Attribute,
                         Optional<dwarf::Form> Form, int64_t Integer) {
   if (!Form)
     Form = DIEInteger::BestForm(true, Integer);
   DIEValue *Value = new (DIEValueAllocator) DIEInteger(Integer);
-  Die->addValue(Attribute, *Form, Value);
+  Die.addValue(Attribute, *Form, Value);
 }
 
-void DwarfUnit::addSInt(DIELoc *Die, Optional<dwarf::Form> Form,
+void DwarfUnit::addSInt(DIELoc &Die, Optional<dwarf::Form> Form,
                         int64_t Integer) {
   addSInt(Die, (dwarf::Attribute)0, Form, Integer);
 }
 
@@ -201,66 +204,66 @@ void DwarfUnit::addSInt(DIELoc *Die, Optional<dwarf::Form> Form,
 /// more predictable sizes. In the case of split dwarf we emit an index
 /// into another table which gets us the static offset into the string
 /// table.
-void DwarfUnit::addString(DIE *Die, dwarf::Attribute Attribute,
+void DwarfUnit::addString(DIE &Die, dwarf::Attribute Attribute,
                           StringRef String) {
   if (!DD->useSplitDwarf())
     return addLocalString(Die, Attribute, String);
 
-  unsigned idx = DU->getStringPoolIndex(String);
+  unsigned idx = DU->getStringPool().getIndex(*Asm, String);
   DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx);
   DIEValue *Str = new (DIEValueAllocator) DIEString(Value, String);
-  Die->addValue(Attribute, dwarf::DW_FORM_GNU_str_index, Str);
+  Die.addValue(Attribute, dwarf::DW_FORM_GNU_str_index, Str);
 }
 
 /// addLocalString - Add a string attribute data and value. This is guaranteed
 /// to be in the local string pool instead of indirected.
-void DwarfUnit::addLocalString(DIE *Die, dwarf::Attribute Attribute,
+void DwarfUnit::addLocalString(DIE &Die, dwarf::Attribute Attribute,
                                StringRef String) {
-  MCSymbol *Symb = DU->getStringPoolEntry(String);
+  MCSymbol *Symb = DU->getStringPool().getSymbol(*Asm, String);
   DIEValue *Value;
   if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
     Value = new (DIEValueAllocator) DIELabel(Symb);
   else {
-    MCSymbol *StringPool = DU->getStringPoolSym();
+    MCSymbol *StringPool = DU->getStringPool().getSectionSymbol();
    Value = new (DIEValueAllocator) DIEDelta(Symb, StringPool);
   }
   DIEValue *Str = new (DIEValueAllocator) DIEString(Value, String);
-  Die->addValue(Attribute, dwarf::DW_FORM_strp, Str);
+  Die.addValue(Attribute, dwarf::DW_FORM_strp, Str);
 }
 
 /// addExpr - Add a Dwarf expression attribute data and value.
 ///
-void DwarfUnit::addExpr(DIELoc *Die, dwarf::Form Form, const MCExpr *Expr) {
+void DwarfUnit::addExpr(DIELoc &Die, dwarf::Form Form, const MCExpr *Expr) {
   DIEValue *Value = new (DIEValueAllocator) DIEExpr(Expr);
-  Die->addValue((dwarf::Attribute)0, Form, Value);
+  Die.addValue((dwarf::Attribute)0, Form, Value);
 }
 
 /// addLocationList - Add a Dwarf loclistptr attribute data and value.
 ///
-void DwarfUnit::addLocationList(DIE *Die, dwarf::Attribute Attribute,
+void DwarfUnit::addLocationList(DIE &Die, dwarf::Attribute Attribute,
                                 unsigned Index) {
   DIEValue *Value = new (DIEValueAllocator) DIELocList(Index);
   dwarf::Form Form = DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
                                                 : dwarf::DW_FORM_data4;
-  Die->addValue(Attribute, Form, Value);
+  Die.addValue(Attribute, Form, Value);
 }
 
 /// addLabel - Add a Dwarf label attribute data and value.
 ///
-void DwarfUnit::addLabel(DIE *Die, dwarf::Attribute Attribute, dwarf::Form Form,
+void DwarfUnit::addLabel(DIE &Die, dwarf::Attribute Attribute, dwarf::Form Form,
                          const MCSymbol *Label) {
   DIEValue *Value = new (DIEValueAllocator) DIELabel(Label);
-  Die->addValue(Attribute, Form, Value);
+  Die.addValue(Attribute, Form, Value);
 }
 
-void DwarfUnit::addLabel(DIELoc *Die, dwarf::Form Form, const MCSymbol *Label) {
+void DwarfUnit::addLabel(DIELoc &Die, dwarf::Form Form, const MCSymbol *Label) {
   addLabel(Die, (dwarf::Attribute)0, Form, Label);
 }
 
 /// addSectionLabel - Add a Dwarf section label attribute data and value.
 ///
-void DwarfUnit::addSectionLabel(DIE *Die, dwarf::Attribute Attribute,
+void DwarfUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
                                 const MCSymbol *Label) {
   if (DD->getDwarfVersion() >= 4)
     addLabel(Die, Attribute, dwarf::DW_FORM_sec_offset, Label);
@@ -270,7 +273,7 @@ void DwarfUnit::addSectionLabel(DIE *Die, dwarf::Attribute Attribute,
 
 /// addSectionOffset - Add an offset into a section attribute data and value.
 ///
-void DwarfUnit::addSectionOffset(DIE *Die, dwarf::Attribute Attribute,
+void DwarfUnit::addSectionOffset(DIE &Die, dwarf::Attribute Attribute,
                                  uint64_t Integer) {
   if (DD->getDwarfVersion() >= 4)
     addUInt(Die, Attribute, dwarf::DW_FORM_sec_offset, Integer);
@@ -281,7 +284,7 @@ void DwarfUnit::addSectionOffset(DIE *Die, dwarf::Attribute Attribute,
 
 /// addLabelAddress - Add a dwarf label attribute data and value using
 /// DW_FORM_addr or DW_FORM_GNU_addr_index.
 ///
-void DwarfCompileUnit::addLabelAddress(DIE *Die, dwarf::Attribute Attribute,
+void DwarfCompileUnit::addLabelAddress(DIE &Die, dwarf::Attribute Attribute,
                                        const MCSymbol *Label) {
 
   if (!DD->useSplitDwarf())
@@ -290,24 +293,20 @@ void DwarfCompileUnit::addLabelAddress(DIE *Die, dwarf::Attribute Attribute,
   if (Label)
     DD->addArangeLabel(SymbolCU(this, Label));
 
-  unsigned idx = DU->getAddrPoolIndex(Label);
+  unsigned idx = DD->getAddressPool().getIndex(Label);
   DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx);
-  Die->addValue(Attribute, dwarf::DW_FORM_GNU_addr_index, Value);
+  Die.addValue(Attribute, dwarf::DW_FORM_GNU_addr_index, Value);
 }
 
-void DwarfCompileUnit::addLocalLabelAddress(DIE *Die,
+void DwarfCompileUnit::addLocalLabelAddress(DIE &Die,
                                             dwarf::Attribute Attribute,
                                             const MCSymbol *Label) {
   if (Label)
     DD->addArangeLabel(SymbolCU(this, Label));
 
-  if (Label) {
-    DIEValue *Value = new (DIEValueAllocator) DIELabel(Label);
-    Die->addValue(Attribute, dwarf::DW_FORM_addr, Value);
-  } else {
-    DIEValue *Value = new (DIEValueAllocator) DIEInteger(0);
-    Die->addValue(Attribute, dwarf::DW_FORM_addr, Value);
-  }
+  Die.addValue(Attribute, dwarf::DW_FORM_addr,
+               Label ? (DIEValue *)new (DIEValueAllocator) DIELabel(Label)
+                     : new (DIEValueAllocator) DIEInteger(0));
 }
 
 unsigned DwarfCompileUnit::getOrCreateSourceID(StringRef FileName,
                                                StringRef DirName) {
@@ -329,86 +328,94 @@ unsigned DwarfTypeUnit::getOrCreateSourceID(StringRef FileName, StringRef DirNam
 
 /// addOpAddress - Add a dwarf op address data and value using the
 /// form given and an op of either DW_FORM_addr or DW_FORM_GNU_addr_index.
 ///
-void DwarfUnit::addOpAddress(DIELoc *Die, const MCSymbol *Sym) {
+void DwarfUnit::addOpAddress(DIELoc &Die, const MCSymbol *Sym) {
   if (!DD->useSplitDwarf()) {
     addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
     addLabel(Die, dwarf::DW_FORM_udata, Sym);
   } else {
     addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_addr_index);
-    addUInt(Die, dwarf::DW_FORM_GNU_addr_index, DU->getAddrPoolIndex(Sym));
+    addUInt(Die, dwarf::DW_FORM_GNU_addr_index,
+            DD->getAddressPool().getIndex(Sym));
   }
 }
 
 /// addSectionDelta - Add a section label delta attribute data and value.
 ///
-void DwarfUnit::addSectionDelta(DIE *Die, dwarf::Attribute Attribute,
+void DwarfUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute,
                                 const MCSymbol *Hi, const MCSymbol *Lo) {
   DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo);
-  if (DD->getDwarfVersion() >= 4)
-    Die->addValue(Attribute, dwarf::DW_FORM_sec_offset, Value);
-  else
-    Die->addValue(Attribute, dwarf::DW_FORM_data4, Value);
+  Die.addValue(Attribute, DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
                                                      : dwarf::DW_FORM_data4,
+               Value);
 }
 
-void DwarfUnit::addLabelDelta(DIE *Die, dwarf::Attribute Attribute,
+void DwarfUnit::addLabelDelta(DIE &Die, dwarf::Attribute Attribute,
                               const MCSymbol *Hi, const MCSymbol *Lo) {
   DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo);
-  Die->addValue(Attribute, dwarf::DW_FORM_data4, Value);
+  Die.addValue(Attribute, dwarf::DW_FORM_data4, Value);
 }
 
 /// addDIEEntry - Add a DIE attribute data and value.
 ///
-void DwarfUnit::addDIEEntry(DIE *Die, dwarf::Attribute Attribute, DIE *Entry) {
+void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIE &Entry) {
   addDIEEntry(Die, Attribute, createDIEEntry(Entry));
 }
 
-void DwarfUnit::addDIETypeSignature(DIE *Die, const DwarfTypeUnit &Type) {
-  Die->addValue(dwarf::DW_AT_signature, dwarf::DW_FORM_ref_sig8,
-                new (DIEValueAllocator) DIETypeSignature(Type));
+void DwarfUnit::addDIETypeSignature(DIE &Die, const DwarfTypeUnit &Type) {
+  // Flag the type unit reference as a declaration so that if it contains
+  // members (implicit special members, static data member definitions, member
+  // declarations for definitions in this CU, etc) consumers don't get confused
+  // and think this is a full definition.
+  addFlag(Die, dwarf::DW_AT_declaration);
+
+  Die.addValue(dwarf::DW_AT_signature, dwarf::DW_FORM_ref_sig8,
+               new (DIEValueAllocator) DIETypeSignature(Type));
}
 
-void DwarfUnit::addDIEEntry(DIE *Die, dwarf::Attribute Attribute,
+void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute,
                             DIEEntry *Entry) {
-  const DIE *DieCU = Die->getUnitOrNull();
-  const DIE *EntryCU = Entry->getEntry()->getUnitOrNull();
+  const DIE *DieCU = Die.getUnitOrNull();
+  const DIE *EntryCU = Entry->getEntry().getUnitOrNull();
   if (!DieCU)
     // We assume that Die belongs to this CU, if it is not linked to any CU yet.
-    DieCU = getUnitDie();
+    DieCU = &getUnitDie();
   if (!EntryCU)
-    EntryCU = getUnitDie();
-  Die->addValue(Attribute, EntryCU == DieCU ? dwarf::DW_FORM_ref4
-                                            : dwarf::DW_FORM_ref_addr,
-                Entry);
+    EntryCU = &getUnitDie();
+  Die.addValue(Attribute,
               EntryCU == DieCU ? dwarf::DW_FORM_ref4 : dwarf::DW_FORM_ref_addr,
+               Entry);
 }
 
 /// Create a DIE with the given Tag, add the DIE to its parent, and
 /// call insertDIE if MD is not null.
-DIE *DwarfUnit::createAndAddDIE(unsigned Tag, DIE &Parent, DIDescriptor N) {
-  DIE *Die = new DIE(Tag);
-  Parent.addChild(Die);
+DIE &DwarfUnit::createAndAddDIE(unsigned Tag, DIE &Parent, DIDescriptor N) {
+  assert(Tag != dwarf::DW_TAG_auto_variable &&
+         Tag != dwarf::DW_TAG_arg_variable);
+  Parent.addChild(make_unique<DIE>((dwarf::Tag)Tag));
+  DIE &Die = *Parent.getChildren().back();
   if (N)
-    insertDIE(N, Die);
+    insertDIE(N, &Die);
   return Die;
 }
 
 /// addBlock - Add block data.
 ///
-void DwarfUnit::addBlock(DIE *Die, dwarf::Attribute Attribute, DIELoc *Loc) {
+void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Loc) {
   Loc->ComputeSize(Asm);
   DIELocs.push_back(Loc); // Memoize so we can call the destructor later on.
-  Die->addValue(Attribute, Loc->BestForm(DD->getDwarfVersion()), Loc);
+  Die.addValue(Attribute, Loc->BestForm(DD->getDwarfVersion()), Loc);
 }
 
-void DwarfUnit::addBlock(DIE *Die, dwarf::Attribute Attribute,
+void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute,
                          DIEBlock *Block) {
   Block->ComputeSize(Asm);
   DIEBlocks.push_back(Block); // Memoize so we can call the destructor later on.
-  Die->addValue(Attribute, Block->BestForm(), Block);
+  Die.addValue(Attribute, Block->BestForm(), Block);
 }
 
 /// addSourceLine - Add location information to specified debug information
 /// entry.
-void DwarfUnit::addSourceLine(DIE *Die, unsigned Line, StringRef File,
+void DwarfUnit::addSourceLine(DIE &Die, unsigned Line, StringRef File,
                               StringRef Directory) {
   if (Line == 0)
     return;
@@ -421,7 +428,7 @@ void DwarfUnit::addSourceLine(DIE *Die, unsigned Line, StringRef File,
 
 /// addSourceLine - Add location information to specified debug information
 /// entry.
-void DwarfUnit::addSourceLine(DIE *Die, DIVariable V) {
+void DwarfUnit::addSourceLine(DIE &Die, DIVariable V) {
   assert(V.isVariable());
 
   addSourceLine(Die, V.getLineNumber(), V.getContext().getFilename(),
@@ -430,7 +437,7 @@ void DwarfUnit::addSourceLine(DIE *Die, DIVariable V) {
 
 /// addSourceLine - Add location information to specified debug information
 /// entry.
-void DwarfUnit::addSourceLine(DIE *Die, DIGlobalVariable G) {
+void DwarfUnit::addSourceLine(DIE &Die, DIGlobalVariable G) {
   assert(G.isGlobalVariable());
 
   addSourceLine(Die, G.getLineNumber(), G.getFilename(), G.getDirectory());
@@ -438,7 +445,7 @@ void DwarfUnit::addSourceLine(DIE *Die, DIGlobalVariable G) {
 
 /// addSourceLine - Add location information to specified debug information
 /// entry.
-void DwarfUnit::addSourceLine(DIE *Die, DISubprogram SP) {
+void DwarfUnit::addSourceLine(DIE &Die, DISubprogram SP) {
   assert(SP.isSubprogram());
 
   addSourceLine(Die, SP.getLineNumber(), SP.getFilename(), SP.getDirectory());
@@ -446,7 +453,7 @@ void DwarfUnit::addSourceLine(DIE *Die, DISubprogram SP) {
 
 /// addSourceLine - Add location information to specified debug information
 /// entry.
-void DwarfUnit::addSourceLine(DIE *Die, DIType Ty) {
+void DwarfUnit::addSourceLine(DIE &Die, DIType Ty) {
   assert(Ty.isType());
 
   addSourceLine(Die, Ty.getLineNumber(), Ty.getFilename(), Ty.getDirectory());
@@ -454,7 +461,7 @@ void DwarfUnit::addSourceLine(DIE *Die, DIType Ty) {
 
 /// addSourceLine - Add location information to specified debug information
 /// entry.
-void DwarfUnit::addSourceLine(DIE *Die, DIObjCProperty Ty) {
+void DwarfUnit::addSourceLine(DIE &Die, DIObjCProperty Ty) {
   assert(Ty.isObjCProperty());
 
   DIFile File = Ty.getFile();
@@ -464,7 +471,7 @@ void DwarfUnit::addSourceLine(DIE *Die, DIObjCProperty Ty) {
 
 /// addSourceLine - Add location information to specified debug information
 /// entry.
-void DwarfUnit::addSourceLine(DIE *Die, DINameSpace NS) {
+void DwarfUnit::addSourceLine(DIE &Die, DINameSpace NS) {
   assert(NS.Verify());
 
   addSourceLine(Die, NS.getLineNumber(), NS.getFilename(), NS.getDirectory());
@@ -472,7 +479,7 @@ void DwarfUnit::addSourceLine(DIE *Die, DINameSpace NS) {
 
 /// addVariableAddress - Add DW_AT_location attribute for a
 /// DbgVariable based on provided MachineLocation.
-void DwarfUnit::addVariableAddress(const DbgVariable &DV, DIE *Die,
+void DwarfUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,
                                    MachineLocation Location) {
   if (DV.variableHasComplexAddress())
     addComplexAddress(DV, Die, dwarf::DW_AT_location, Location);
@@ -484,7 +491,7 @@ void DwarfUnit::addVariableAddress(const DbgVariable &DV, DIE *Die,
 }
 
 /// addRegisterOp - Add register operand.
-void DwarfUnit::addRegisterOp(DIELoc *TheDie, unsigned Reg) {
+void DwarfUnit::addRegisterOp(DIELoc &TheDie, unsigned Reg) {
   const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
   int DWReg = RI->getDwarfRegNum(Reg, false);
   bool isSubRegister = DWReg < 0;
@@ -529,7 +536,7 @@ void DwarfUnit::addRegisterOp(DIELoc *TheDie, unsigned Reg) {
 }
 
 /// addRegisterOffset - Add register offset.
-void DwarfUnit::addRegisterOffset(DIELoc *TheDie, unsigned Reg,
+void DwarfUnit::addRegisterOffset(DIELoc &TheDie, unsigned Reg,
                                   int64_t Offset) {
   const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
   unsigned DWReg = RI->getDwarfRegNum(Reg, false);
@@ -548,16 +555,16 @@ void DwarfUnit::addRegisterOffset(DIELoc *TheDie, unsigned Reg,
 
 /// addAddress - Add an address attribute to a die based on the location
 /// provided.
-void DwarfUnit::addAddress(DIE *Die, dwarf::Attribute Attribute,
+void DwarfUnit::addAddress(DIE &Die, dwarf::Attribute Attribute,
                            const MachineLocation &Location, bool Indirect) {
   DIELoc *Loc = new (DIEValueAllocator) DIELoc();
 
   if (Location.isReg() && !Indirect)
-    addRegisterOp(Loc, Location.getReg());
+    addRegisterOp(*Loc, Location.getReg());
   else {
-    addRegisterOffset(Loc, Location.getReg(), Location.getOffset());
+    addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
     if (Indirect && !Location.isReg()) {
-      addUInt(Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
     }
   }
@@ -570,7 +577,7 @@ void DwarfUnit::addAddress(DIE *Die, dwarf::Attribute Attribute,
 /// given the extra address information encoded in the DbgVariable, starting
 /// from the starting location.  Add the DWARF information to the die.
 ///
-void DwarfUnit::addComplexAddress(const DbgVariable &DV, DIE *Die,
+void DwarfUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,
                                   dwarf::Attribute Attribute,
                                   const MachineLocation &Location) {
   DIELoc *Loc = new (DIEValueAllocator) DIELoc();
@@ -580,21 +587,21 @@ void DwarfUnit::addComplexAddress(const DbgVariable &DV, DIE *Die,
     if (N >= 2 && DV.getAddrElement(0) == DIBuilder::OpPlus) {
       // If first address element is OpPlus then emit
       // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
-      addRegisterOffset(Loc, Location.getReg(), DV.getAddrElement(1));
+      addRegisterOffset(*Loc, Location.getReg(), DV.getAddrElement(1));
       i = 2;
     } else
-      addRegisterOp(Loc, Location.getReg());
+      addRegisterOp(*Loc, Location.getReg());
   } else
-    addRegisterOffset(Loc, Location.getReg(), Location.getOffset());
+    addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
 
   for (; i < N; ++i) {
     uint64_t Element = DV.getAddrElement(i);
     if (Element == DIBuilder::OpPlus) {
-      addUInt(Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-      addUInt(Loc, dwarf::DW_FORM_udata, DV.getAddrElement(++i));
+      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+      addUInt(*Loc, dwarf::DW_FORM_udata, DV.getAddrElement(++i));
     } else if (Element == DIBuilder::OpDeref) {
       if (!Location.isReg())
-        addUInt(Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+        addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
     } else
       llvm_unreachable("unknown DIBuilder Opcode");
   }
@@ -663,7 +670,7 @@ void DwarfUnit::addComplexAddress(const DbgVariable &DV, DIE *Die,
 /// starting location.  Add the DWARF information to the die.  For
 /// more information, read large comment just above here.
 ///
-void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE *Die,
+void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die,
                                      dwarf::Attribute Attribute,
                                      const MachineLocation &Location) {
   DIType Ty = DV.getType();
@@ -705,68 +712,78 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE *Die,
   DIELoc *Loc = new (DIEValueAllocator) DIELoc();
 
   if (Location.isReg())
-    addRegisterOp(Loc, Location.getReg());
+    addRegisterOp(*Loc, Location.getReg());
   else
-    addRegisterOffset(Loc, Location.getReg(), Location.getOffset());
+    addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
 
   // If we started with a pointer to the __Block_byref... struct, then
   // the first thing we need to do is dereference the pointer (DW_OP_deref).
   if (isPointer)
-    addUInt(Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
 
   // Next add the offset for the '__forwarding' field:
   // DW_OP_plus_uconst ForwardingFieldOffset.  Note there's no point in
   // adding the offset if it's 0.
   if (forwardingFieldOffset > 0) {
-    addUInt(Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-    addUInt(Loc, dwarf::DW_FORM_udata, forwardingFieldOffset);
+    addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+    addUInt(*Loc, dwarf::DW_FORM_udata, forwardingFieldOffset);
   }
 
   // Now dereference the __forwarding field to get to the real __Block_byref
   // struct:  DW_OP_deref.
-  addUInt(Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+  addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
 
   // Now that we've got the real __Block_byref... struct, add the offset
   // for the variable's field to get to the location of the actual variable:
   // DW_OP_plus_uconst varFieldOffset.  Again, don't add if it's 0.
   if (varFieldOffset > 0) {
-    addUInt(Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-    addUInt(Loc, dwarf::DW_FORM_udata, varFieldOffset);
+    addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+    addUInt(*Loc, dwarf::DW_FORM_udata, varFieldOffset);
   }
 
   // Now attach the location information to the DIE.
   addBlock(Die, Attribute, Loc);
 }
 
-/// isTypeSigned - Return true if the type is signed.
-static bool isTypeSigned(DwarfDebug *DD, DIType Ty, int *SizeInBits) {
-  if (Ty.isDerivedType())
-    return isTypeSigned(DD, DD->resolve(DIDerivedType(Ty).getTypeDerivedFrom()),
-                        SizeInBits);
-  if (Ty.isBasicType())
-    if (DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed ||
-        DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed_char) {
-      *SizeInBits = Ty.getSizeInBits();
-      return true;
-    }
-  return false;
-}
-
 /// Return true if type encoding is unsigned.
 static bool isUnsignedDIType(DwarfDebug *DD, DIType Ty) {
   DIDerivedType DTy(Ty);
-  if (DTy.isDerivedType())
-    return isUnsignedDIType(DD, DD->resolve(DTy.getTypeDerivedFrom()));
-
-  DIBasicType BTy(Ty);
-  if (BTy.isBasicType()) {
-    unsigned Encoding = BTy.getEncoding();
-    if (Encoding == dwarf::DW_ATE_unsigned ||
-        Encoding == dwarf::DW_ATE_unsigned_char ||
-        Encoding == dwarf::DW_ATE_boolean)
+  if (DTy.isDerivedType()) {
+    dwarf::Tag T = (dwarf::Tag)Ty.getTag();
+    // Encode pointer constants as unsigned bytes. This is used at least for
+    // null pointer constant emission.
+    // FIXME: reference and rvalue_reference /probably/ shouldn't be allowed
+    // here, but accept them for now due to a bug in SROA producing bogus
+    // dbg.values.
+    if (T == dwarf::DW_TAG_pointer_type ||
+        T == dwarf::DW_TAG_ptr_to_member_type ||
+        T == dwarf::DW_TAG_reference_type ||
+        T == dwarf::DW_TAG_rvalue_reference_type)
       return true;
+    assert(T == dwarf::DW_TAG_typedef || T == dwarf::DW_TAG_const_type ||
+           T == dwarf::DW_TAG_volatile_type ||
+           T == dwarf::DW_TAG_restrict_type ||
+           T == dwarf::DW_TAG_enumeration_type);
+    if (DITypeRef Deriv = DTy.getTypeDerivedFrom())
+      return isUnsignedDIType(DD, DD->resolve(Deriv));
+    // FIXME: Enums without a fixed underlying type have unknown signedness
+    // here, leading to incorrectly emitted constants.
+    assert(DTy.getTag() == dwarf::DW_TAG_enumeration_type);
+    return false;
  }
-  return false;
+
+  DIBasicType BTy(Ty);
+  assert(BTy.isBasicType());
+  unsigned Encoding = BTy.getEncoding();
+  assert((Encoding == dwarf::DW_ATE_unsigned ||
+          Encoding == dwarf::DW_ATE_unsigned_char ||
+          Encoding == dwarf::DW_ATE_signed ||
+          Encoding == dwarf::DW_ATE_signed_char ||
+          Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean) &&
+         "Unsupported encoding");
+  return (Encoding == dwarf::DW_ATE_unsigned ||
+          Encoding == dwarf::DW_ATE_unsigned_char ||
+          Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean);
 }
 
 /// If this type is derived from a base type then return base type size.
@@ -798,47 +815,8 @@ static uint64_t getBaseTypeSize(DwarfDebug *DD, DIDerivedType Ty) {
   return BaseType.getSizeInBits();
 }
 
-/// addConstantValue - Add constant value entry in variable DIE.
-void DwarfUnit::addConstantValue(DIE *Die, const MachineOperand &MO,
-                                 DIType Ty) {
-  // FIXME: This is a bit conservative/simple - it emits negative values at
-  // their maximum bit width which is a bit unfortunate (& doesn't prefer
-  // udata/sdata over dataN as suggested by the DWARF spec)
-  assert(MO.isImm() && "Invalid machine operand!");
-  int SizeInBits = -1;
-  bool SignedConstant = isTypeSigned(DD, Ty, &SizeInBits);
-  dwarf::Form Form;
-
-  // If we're a signed constant definitely use sdata.
-  if (SignedConstant) {
-    addSInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, MO.getImm());
-    return;
-  }
-
-  // Else use data for now unless it's larger than we can deal with.
-  switch (SizeInBits) {
-  case 8:
-    Form = dwarf::DW_FORM_data1;
-    break;
-  case 16:
-    Form = dwarf::DW_FORM_data2;
-    break;
-  case 32:
-    Form = dwarf::DW_FORM_data4;
-    break;
-  case 64:
-    Form = dwarf::DW_FORM_data8;
-    break;
-  default:
-    Form = dwarf::DW_FORM_udata;
-    addUInt(Die, dwarf::DW_AT_const_value, Form, MO.getImm());
-    return;
-  }
-  addUInt(Die, dwarf::DW_AT_const_value, Form, MO.getImm());
-}
-
 /// addConstantFPValue - Add constant value entry in variable DIE.
-void DwarfUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) {
+void DwarfUnit::addConstantFPValue(DIE &Die, const MachineOperand &MO) {
   assert(MO.isFPImm() && "Invalid machine operand!");
   DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
   APFloat FPImm = MO.getFPImm()->getValueAPF();
@@ -855,55 +833,47 @@ void DwarfUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) {
 
   // Output the constant to DWARF one byte at a time.
   for (; Start != Stop; Start += Incr)
-    addUInt(Block, dwarf::DW_FORM_data1, (unsigned char)0xFF & FltPtr[Start]);
+    addUInt(*Block, dwarf::DW_FORM_data1, (unsigned char)0xFF & FltPtr[Start]);
 
   addBlock(Die, dwarf::DW_AT_const_value, Block);
 }
 
 /// addConstantFPValue - Add constant value entry in variable DIE.
-void DwarfUnit::addConstantFPValue(DIE *Die, const ConstantFP *CFP) {
+void DwarfUnit::addConstantFPValue(DIE &Die, const ConstantFP *CFP) {
   // Pass this down to addConstantValue as an unsigned bag of bits.
   addConstantValue(Die, CFP->getValueAPF().bitcastToAPInt(), true);
 }
 
 /// addConstantValue - Add constant value entry in variable DIE.
-void DwarfUnit::addConstantValue(DIE *Die, const ConstantInt *CI,
-                                 bool Unsigned) {
-  addConstantValue(Die, CI->getValue(), Unsigned);
+void DwarfUnit::addConstantValue(DIE &Die, const ConstantInt *CI, DIType Ty) {
+  addConstantValue(Die, CI->getValue(), Ty);
+}
+
+/// addConstantValue - Add constant value entry in variable DIE.
+void DwarfUnit::addConstantValue(DIE &Die, const MachineOperand &MO,
+                                 DIType Ty) {
+  assert(MO.isImm() && "Invalid machine operand!");
+
+  addConstantValue(Die, isUnsignedDIType(DD, Ty), MO.getImm());
+}
+
+void DwarfUnit::addConstantValue(DIE &Die, bool Unsigned, uint64_t Val) {
+  // FIXME: This is a bit conservative/simple - it emits negative values always
+  // sign extended to 64 bits rather than minimizing the number of bytes.
+  addUInt(Die, dwarf::DW_AT_const_value,
+          Unsigned ? dwarf::DW_FORM_udata : dwarf::DW_FORM_sdata, Val);
+}
+
+void DwarfUnit::addConstantValue(DIE &Die, const APInt &Val, DIType Ty) {
+  addConstantValue(Die, Val, isUnsignedDIType(DD, Ty));
 }
 
 // addConstantValue - Add constant value entry in variable DIE.
-void DwarfUnit::addConstantValue(DIE *Die, const APInt &Val, bool Unsigned) {
+void DwarfUnit::addConstantValue(DIE &Die, const APInt &Val, bool Unsigned) {
   unsigned CIBitWidth = Val.getBitWidth();
   if (CIBitWidth <= 64) {
-    // If we're a signed constant definitely use sdata.
-    if (!Unsigned) {
-      addSInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata,
-              Val.getSExtValue());
-      return;
-    }
-
-    // Else use data for now unless it's larger than we can deal with.
-    dwarf::Form Form;
-    switch (CIBitWidth) {
-    case 8:
-      Form = dwarf::DW_FORM_data1;
-      break;
-    case 16:
-      Form = dwarf::DW_FORM_data2;
-      break;
-    case 32:
-      Form = dwarf::DW_FORM_data4;
-      break;
-    case 64:
-      Form = dwarf::DW_FORM_data8;
-      break;
-    default:
-      addUInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_udata,
-              Val.getZExtValue());
-      return;
-    }
-    addUInt(Die, dwarf::DW_AT_const_value, Form, Val.getZExtValue());
+    addConstantValue(Die, Unsigned,
+                     Unsigned ? Val.getZExtValue() : Val.getSExtValue());
     return;
   }
 
@@ -922,7 +892,7 @@ void DwarfUnit::addConstantValue(DIE *Die, const APInt &Val, bool Unsigned) {
       c = Ptr64[i / 8] >> (8 * (i & 7));
     else
      c = Ptr64[(NumBytes - 1 - i) / 8] >> (8 * ((NumBytes - 1 - i) & 7));
-    addUInt(Block, dwarf::DW_FORM_data1, c);
+    addUInt(*Block, dwarf::DW_FORM_data1, c);
   }
 
   addBlock(Die, dwarf::DW_AT_const_value, Block);
@@ -945,7 +915,7 @@ void DwarfUnit::addTemplateParams(DIE &Buffer, DIArray TParams) {
 /// getOrCreateContextDIE - Get context owner's DIE.
 DIE *DwarfUnit::getOrCreateContextDIE(DIScope Context) {
   if (!Context || Context.isFile())
-    return getUnitDie();
+    return &getUnitDie();
   if (Context.isType())
     return getOrCreateTypeDIE(DIType(Context));
   if (Context.isNameSpace())
@@ -959,66 +929,68 @@ DIE *DwarfUnit::createTypeDIE(DICompositeType Ty) {
   DIScope Context = resolve(Ty.getContext());
   DIE *ContextDIE = getOrCreateContextDIE(Context);
 
-  DIE *TyDIE = getDIE(Ty);
-  if (TyDIE)
+  if (DIE *TyDIE = getDIE(Ty))
     return TyDIE;
 
   // Create new type.
-  TyDIE = createAndAddDIE(Ty.getTag(), *ContextDIE, Ty);
+  DIE &TyDIE = createAndAddDIE(Ty.getTag(), *ContextDIE, Ty);
 
-  constructTypeDIE(*TyDIE, Ty);
+  constructTypeDIE(TyDIE, Ty);
 
   updateAcceleratorTables(Context, Ty, TyDIE);
-  return TyDIE;
+  return &TyDIE;
 }
 
 /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
 /// given DIType.
 DIE *DwarfUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
   if (!TyNode)
-    return NULL;
+    return nullptr;
 
   DIType Ty(TyNode);
   assert(Ty.isType());
   assert(Ty == resolve(Ty.getRef()) &&
         "type was not uniqued, possible ODR violation.");
 
+  // DW_TAG_restrict_type is not supported in DWARF2
+  if (Ty.getTag() == dwarf::DW_TAG_restrict_type && DD->getDwarfVersion() <= 2)
+    return getOrCreateTypeDIE(resolve(DIDerivedType(Ty).getTypeDerivedFrom()));
+
   // Construct the context before querying for the existence of the DIE in case
   // such construction creates the DIE.
   DIScope Context = resolve(Ty.getContext());
   DIE *ContextDIE = getOrCreateContextDIE(Context);
   assert(ContextDIE);
 
-  DIE *TyDIE = getDIE(Ty);
-  if (TyDIE)
+  if (DIE *TyDIE = getDIE(Ty))
    return TyDIE;
 
   // Create new type.
-  TyDIE = createAndAddDIE(Ty.getTag(), *ContextDIE, Ty);
+  DIE &TyDIE = createAndAddDIE(Ty.getTag(), *ContextDIE, Ty);
 
   updateAcceleratorTables(Context, Ty, TyDIE);
 
   if (Ty.isBasicType())
-    constructTypeDIE(*TyDIE, DIBasicType(Ty));
+    constructTypeDIE(TyDIE, DIBasicType(Ty));
   else if (Ty.isCompositeType()) {
     DICompositeType CTy(Ty);
     if (GenerateDwarfTypeUnits && !Ty.isForwardDecl())
      if (MDString *TypeId = CTy.getIdentifier()) {
        DD->addDwarfTypeUnitType(getCU(), TypeId->getString(), TyDIE, CTy);
        // Skip updating the accelerator tables since this is not the full type.
-        return TyDIE;
+        return &TyDIE;
      }
-    constructTypeDIE(*TyDIE, CTy);
+    constructTypeDIE(TyDIE, CTy);
   } else {
     assert(Ty.isDerivedType() && "Unknown kind of DIType");
-    constructTypeDIE(*TyDIE, DIDerivedType(Ty));
+    constructTypeDIE(TyDIE, DIDerivedType(Ty));
   }
 
-  return TyDIE;
+  return &TyDIE;
 }
 
 void DwarfUnit::updateAcceleratorTables(DIScope Context, DIType Ty,
-                                        const DIE *TyDIE) {
+                                        const DIE &TyDIE) {
   if (!Ty.getName().empty() && !Ty.isForwardDecl()) {
     bool IsImplementation = 0;
     if (Ty.isCompositeType()) {
@@ -1028,17 +1000,18 @@ void DwarfUnit::updateAcceleratorTables(DIScope Context, DIType Ty,
      IsImplementation = (CT.getRunTimeLang() == 0) || CT.isObjcClassComplete();
     }
     unsigned Flags = IsImplementation ? dwarf::DW_FLAG_type_implementation : 0;
-    addAccelType(Ty.getName(), std::make_pair(TyDIE, Flags));
+    DD->addAccelType(Ty.getName(), TyDIE, Flags);
 
     if ((!Context || Context.isCompileUnit() || Context.isFile() ||
         Context.isNameSpace()) &&
        getCUNode().getEmissionKind() != DIBuilder::LineTablesOnly)
-      GlobalTypes[getParentContextString(Context) + Ty.getName().str()] = TyDIE;
+      GlobalTypes[getParentContextString(Context) + Ty.getName().str()] =
+          &TyDIE;
   }
 }
 
 /// addType - Add a new type attribute to the specified entity.
-void DwarfUnit::addType(DIE *Entity, DIType Ty, dwarf::Attribute Attribute) {
+void DwarfUnit::addType(DIE &Entity, DIType Ty, dwarf::Attribute Attribute) {
   assert(Ty && "Trying to add a type that doesn't exist?");
 
   // Check for pre-existence.
@@ -1053,54 +1026,17 @@ void DwarfUnit::addType(DIE *Entity, DIType Ty, dwarf::Attribute Attribute) {
   DIE *Buffer = getOrCreateTypeDIE(Ty);
 
   // Set up proxy.
-  Entry = createDIEEntry(Buffer);
+  Entry = createDIEEntry(*Buffer);
   insertDIEEntry(Ty, Entry);
   addDIEEntry(Entity, Attribute, Entry);
 }
 
-// Accelerator table mutators - add each name along with its companion
-// DIE to the proper table while ensuring that the name that we're going
-// to reference is in the string table. We do this since the names we
-// add may not only be identical to the names in the DIE.
-void DwarfUnit::addAccelName(StringRef Name, const DIE *Die) {
-  if (!DD->useDwarfAccelTables())
-    return;
-  DU->getStringPoolEntry(Name);
-  std::vector<const DIE *> &DIEs = AccelNames[Name];
-  DIEs.push_back(Die);
-}
-
-void DwarfUnit::addAccelObjC(StringRef Name, const DIE *Die) {
-  if (!DD->useDwarfAccelTables())
-    return;
-  DU->getStringPoolEntry(Name);
-  std::vector<const DIE *> &DIEs = AccelObjC[Name];
-  DIEs.push_back(Die);
-}
-
-void DwarfUnit::addAccelNamespace(StringRef Name, const DIE *Die) {
-  if (!DD->useDwarfAccelTables())
-    return;
-  DU->getStringPoolEntry(Name);
-  std::vector<const DIE *> &DIEs = AccelNamespace[Name];
-  DIEs.push_back(Die);
-}
-
-void DwarfUnit::addAccelType(StringRef Name,
-                             std::pair<const DIE *, unsigned> Die) {
-  if (!DD->useDwarfAccelTables())
-    return;
-  DU->getStringPoolEntry(Name);
-  std::vector<std::pair<const DIE *, unsigned> > &DIEs = AccelTypes[Name];
-  DIEs.push_back(Die);
-}
-
 /// addGlobalName - Add a new global name to the compile unit.
-void DwarfUnit::addGlobalName(StringRef Name, DIE *Die, DIScope Context) {
+void DwarfUnit::addGlobalName(StringRef Name, DIE &Die, DIScope Context) {
   if (getCUNode().getEmissionKind() == DIBuilder::LineTablesOnly)
     return;
   std::string FullName = getParentContextString(Context) + Name.str();
-  GlobalNames[FullName] = Die;
+  GlobalNames[FullName] = &Die;
 }
 
 /// getParentContextString - Walks the metadata parent chain in a language
@@ -1149,17 +1085,17 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DIBasicType BTy) {
   StringRef Name = BTy.getName();
   // Add name if not anonymous or intermediate type.
   if (!Name.empty())
-    addString(&Buffer, dwarf::DW_AT_name, Name);
+    addString(Buffer, dwarf::DW_AT_name, Name);
 
   // An unspecified type only has a name attribute.
   if (BTy.getTag() == dwarf::DW_TAG_unspecified_type)
     return;
 
-  addUInt(&Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+  addUInt(Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
          BTy.getEncoding());
 
   uint64_t Size = BTy.getSizeInBits() >> 3;
-  addUInt(&Buffer, dwarf::DW_AT_byte_size, None, Size);
+  addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size);
 }
 
 /// constructTypeDIE - Construct derived type die from DIDerivedType.
@@ -1172,22 +1108,22 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
   // Map to main type, void will not have a type.
   DIType FromTy = resolve(DTy.getTypeDerivedFrom());
   if (FromTy)
-    addType(&Buffer, FromTy);
+    addType(Buffer, FromTy);
 
   // Add name if not anonymous or intermediate type.
   if (!Name.empty())
-    addString(&Buffer, dwarf::DW_AT_name, Name);
+    addString(Buffer, dwarf::DW_AT_name, Name);
 
   // Add size if non-zero (derived types might be zero-sized.)
   if (Size && Tag != dwarf::DW_TAG_pointer_type)
-    addUInt(&Buffer, dwarf::DW_AT_byte_size, None, Size);
+    addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size);
 
   if (Tag == dwarf::DW_TAG_ptr_to_member_type)
-    addDIEEntry(&Buffer, dwarf::DW_AT_containing_type,
-                getOrCreateTypeDIE(resolve(DTy.getClassType())));
+    addDIEEntry(Buffer, dwarf::DW_AT_containing_type,
+                *getOrCreateTypeDIE(resolve(DTy.getClassType())));
   // Add source line info if available and TyDesc is not a forward declaration.
   if (!DTy.isForwardDecl())
-    addSourceLine(&Buffer, DTy);
+    addSourceLine(Buffer, DTy);
 }
 
 /// constructSubprogramArguments - Construct function argument DIEs.
@@ -1198,7 +1134,7 @@ void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DIArray Args) {
      assert(i == N-1 && "Unspecified parameter must be the last argument");
      createAndAddDIE(dwarf::DW_TAG_unspecified_parameters, Buffer);
     } else {
-      DIE *Arg = createAndAddDIE(dwarf::DW_TAG_formal_parameter, Buffer);
+      DIE &Arg = createAndAddDIE(dwarf::DW_TAG_formal_parameter, Buffer);
      addType(Arg, DIType(Ty));
      if (DIType(Ty).isArtificial())
        addFlag(Arg, dwarf::DW_AT_artificial);
@@ -1226,7 +1162,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
     DIArray Elements = CTy.getTypeArray();
     DIType RTy(Elements.getElement(0));
     if (RTy)
-      addType(&Buffer, RTy);
+      addType(Buffer, RTy);
 
     bool isPrototyped = true;
     if (Elements.getNumElements() == 2 &&
@@ -1241,13 +1177,13 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
     if (isPrototyped &&
        (Language == dwarf::DW_LANG_C89 || Language == dwarf::DW_LANG_C99 ||
         Language == dwarf::DW_LANG_ObjC))
-      addFlag(&Buffer, dwarf::DW_AT_prototyped);
+      addFlag(Buffer, dwarf::DW_AT_prototyped);
 
     if (CTy.isLValueReference())
-      addFlag(&Buffer, dwarf::DW_AT_reference);
+      addFlag(Buffer, dwarf::DW_AT_reference);
 
     if (CTy.isRValueReference())
-      addFlag(&Buffer, dwarf::DW_AT_rvalue_reference);
+      addFlag(Buffer, dwarf::DW_AT_rvalue_reference);
   } break;
   case dwarf::DW_TAG_structure_type:
   case dwarf::DW_TAG_union_type:
@@ -1256,13 +1192,12 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
     DIArray Elements = CTy.getTypeArray();
     for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
      DIDescriptor Element = Elements.getElement(i);
-      DIE *ElemDie = NULL;
      if (Element.isSubprogram())
-        ElemDie = getOrCreateSubprogramDIE(DISubprogram(Element));
+        getOrCreateSubprogramDIE(DISubprogram(Element));
      else if (Element.isDerivedType()) {
        DIDerivedType DDTy(Element);
        if (DDTy.getTag() == dwarf::DW_TAG_friend) {
-          ElemDie = createAndAddDIE(dwarf::DW_TAG_friend, Buffer);
+          DIE &ElemDie = createAndAddDIE(dwarf::DW_TAG_friend, Buffer);
          addType(ElemDie, resolve(DDTy.getTypeDerivedFrom()),
                  dwarf::DW_AT_friend);
        } else if (DDTy.isStaticMember()) {
@@ -1272,7 +1207,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
        }
      } else if (Element.isObjCProperty()) {
        DIObjCProperty Property(Element);
-        ElemDie = createAndAddDIE(Property.getTag(), Buffer);
+        DIE &ElemDie = createAndAddDIE(Property.getTag(), Buffer);
        StringRef PropertyName = Property.getObjCPropertyName();
        addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName);
        if (Property.getType())
@@ -1311,15 +1246,15 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
     }
 
     if (CTy.isAppleBlockExtension())
-      addFlag(&Buffer, dwarf::DW_AT_APPLE_block);
+      addFlag(Buffer, dwarf::DW_AT_APPLE_block);
 
     DICompositeType ContainingType(resolve(CTy.getContainingType()));
     if (ContainingType)
-      addDIEEntry(&Buffer, dwarf::DW_AT_containing_type,
-                  getOrCreateTypeDIE(ContainingType));
+      addDIEEntry(Buffer, dwarf::DW_AT_containing_type,
+                  *getOrCreateTypeDIE(ContainingType));
 
     if (CTy.isObjcClassComplete())
-      addFlag(&Buffer, dwarf::DW_AT_APPLE_objc_complete_type);
+      addFlag(Buffer, dwarf::DW_AT_APPLE_objc_complete_type);
 
     // Add template parameters to a class, structure or union types.
     // FIXME: The support isn't in the metadata for this yet.
@@ -1335,7 +1270,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
 
   // Add name if not anonymous or intermediate type.
   if (!Name.empty())
-    addString(&Buffer, dwarf::DW_AT_name, Name);
+    addString(Buffer, dwarf::DW_AT_name, Name);
 
   if (Tag == dwarf::DW_TAG_enumeration_type ||
      Tag == dwarf::DW_TAG_class_type || Tag == dwarf::DW_TAG_structure_type ||
@@ -1343,23 +1278,23 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
     // Add size if non-zero (derived types might be zero-sized.)
     // TODO: Do we care about size for enum forward declarations?
     if (Size)
-      addUInt(&Buffer, dwarf::DW_AT_byte_size, None, Size);
+      addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size);
     else if (!CTy.isForwardDecl())
      // Add zero size if it is not a forward declaration.
-      addUInt(&Buffer, dwarf::DW_AT_byte_size, None, 0);
+      addUInt(Buffer, dwarf::DW_AT_byte_size, None, 0);
 
     // If we're a forward decl, say so.
     if (CTy.isForwardDecl())
-      addFlag(&Buffer, dwarf::DW_AT_declaration);
+      addFlag(Buffer, dwarf::DW_AT_declaration);
 
     // Add source line info if available.
     if (!CTy.isForwardDecl())
-      addSourceLine(&Buffer, CTy);
+      addSourceLine(Buffer, CTy);
 
     // No harm in adding the runtime language to the declaration.
     unsigned RLang = CTy.getRunTimeLang();
     if (RLang)
-      addUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class, dwarf::DW_FORM_data1,
+      addUInt(Buffer, dwarf::DW_AT_APPLE_runtime_class, dwarf::DW_FORM_data1,
              RLang);
   }
 }
@@ -1368,7 +1303,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
 /// DITemplateTypeParameter.
 void DwarfUnit::constructTemplateTypeParameterDIE(DIE &Buffer,
                                                   DITemplateTypeParameter TP) {
-  DIE *ParamDIE =
+  DIE &ParamDIE =
      createAndAddDIE(dwarf::DW_TAG_template_type_parameter, Buffer);
   // Add the type if it exists, it could be void and therefore no type.
   if (TP.getType())
@@ -1382,7 +1317,7 @@ void DwarfUnit::constructTemplateTypeParameterDIE(DIE &Buffer,
 void
 DwarfUnit::constructTemplateValueParameterDIE(DIE &Buffer,
                                               DITemplateValueParameter VP) {
-  DIE *ParamDIE = createAndAddDIE(VP.getTag(), Buffer);
+  DIE &ParamDIE = createAndAddDIE(VP.getTag(), Buffer);
 
   // Add the type if there is one, template template and template parameter
   // packs will not have a type.
@@ -1392,16 +1327,15 @@ DwarfUnit::constructTemplateValueParameterDIE(DIE &Buffer,
     addString(ParamDIE, dwarf::DW_AT_name, VP.getName());
   if (Value *Val = VP.getValue()) {
     if (ConstantInt *CI = dyn_cast<ConstantInt>(Val))
-      addConstantValue(ParamDIE, CI,
-                       isUnsignedDIType(DD, resolve(VP.getType())));
+      addConstantValue(ParamDIE, CI, resolve(VP.getType()));
     else if (GlobalValue *GV = dyn_cast<GlobalValue>(Val)) {
      // For declaration non-type template parameters (such as global values and
      // functions)
      DIELoc *Loc = new (DIEValueAllocator) DIELoc();
-      addOpAddress(Loc, Asm->getSymbol(GV));
+      addOpAddress(*Loc, Asm->getSymbol(GV));
      // Emit DW_OP_stack_value to use the address as the immediate value of the
      // parameter, rather than a pointer to it.
-      addUInt(Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
+      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
      addBlock(ParamDIE, dwarf::DW_AT_location, Loc);
     } else if (VP.getTag() == dwarf::DW_TAG_GNU_template_template_param) {
      assert(isa<MDString>(Val));
@@ -1410,7 +1344,7 @@ DwarfUnit::constructTemplateValueParameterDIE(DIE &Buffer,
     } else if (VP.getTag() == dwarf::DW_TAG_GNU_template_parameter_pack) {
      assert(isa<MDNode>(Val));
      DIArray A(cast<MDNode>(Val));
-      addTemplateParams(*ParamDIE, A);
+      addTemplateParams(ParamDIE, A);
     }
   }
 }
@@ -1421,19 +1355,18 @@ DIE *DwarfUnit::getOrCreateNameSpace(DINameSpace NS) {
   // such construction creates the DIE.
   DIE *ContextDIE = getOrCreateContextDIE(NS.getContext());
 
-  DIE *NDie = getDIE(NS);
-  if (NDie)
+  if (DIE *NDie = getDIE(NS))
     return NDie;
-  NDie = createAndAddDIE(dwarf::DW_TAG_namespace, *ContextDIE, NS);
+  DIE &NDie = createAndAddDIE(dwarf::DW_TAG_namespace, *ContextDIE, NS);
 
   if (!NS.getName().empty()) {
     addString(NDie, dwarf::DW_AT_name, NS.getName());
-    addAccelNamespace(NS.getName(), NDie);
+    DD->addAccelNamespace(NS.getName(), NDie);
     addGlobalName(NS.getName(), NDie, NS.getContext());
   } else
-    addAccelNamespace("(anonymous namespace)", NDie);
+    DD->addAccelNamespace("(anonymous namespace)", NDie);
   addSourceLine(NDie, NS);
-  return NDie;
+  return &NDie;
 }
 
 /// getOrCreateSubprogramDIE - Create new DIE using SP.
@@ -1441,47 +1374,58 @@ DIE *DwarfUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
   // Construct the context before querying for the existence of the DIE in case
   // such construction creates the DIE (as is the case for member function
   // declarations).
-  DIScope Context = resolve(SP.getContext());
-  DIE *ContextDIE = getOrCreateContextDIE(Context);
+  DIE *ContextDIE = getOrCreateContextDIE(resolve(SP.getContext()));
 
-  // Unique declarations based on the ODR, where applicable.
-  SP = DISubprogram(DD->resolve(SP.getRef()));
-  assert(SP.Verify());
-
-  DIE *SPDie = getDIE(SP);
-  if (SPDie)
+  if (DIE *SPDie = getDIE(SP))
     return SPDie;
 
-  DISubprogram SPDecl = SP.getFunctionDeclaration();
-  if (SPDecl.isSubprogram())
+  if (DISubprogram SPDecl = SP.getFunctionDeclaration()) {
     // Add subprogram definitions to the CU die directly.
-    ContextDIE = UnitDie.get();
+    ContextDIE = &getUnitDie();
+    // Build the decl now to ensure it precedes the definition.
+    getOrCreateSubprogramDIE(SPDecl);
+  }
 
   // DW_TAG_inlined_subroutine may refer to this DIE.
-  SPDie = createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, SP);
-
-  DIE *DeclDie = NULL;
-  if (SPDecl.isSubprogram())
-    DeclDie = getOrCreateSubprogramDIE(SPDecl);
+  DIE &SPDie = createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, SP);
 
-  // Add function template parameters.
-  addTemplateParams(*SPDie, SP.getTemplateParams());
+  // Abort here and fill this in later, depending on whether or not this
+  // subprogram turns out to have inlined instances or not.
+  if (SP.isDefinition())
+    return &SPDie;
 
-  // If this DIE is going to refer declaration info using AT_specification
-  // then there is no need to add other attributes.
-  if (DeclDie) {
-    // Refer function declaration directly.
-    addDIEEntry(SPDie, dwarf::DW_AT_specification, DeclDie);
+  applySubprogramAttributes(SP, SPDie);
+  return &SPDie;
+}
 
-    return SPDie;
+void DwarfUnit::applySubprogramAttributes(DISubprogram SP, DIE &SPDie) {
+  DIE *DeclDie = nullptr;
+  StringRef DeclLinkageName;
+  if (DISubprogram SPDecl = SP.getFunctionDeclaration()) {
+    DeclDie = getDIE(SPDecl);
+    assert(DeclDie);
+    DeclLinkageName = SPDecl.getLinkageName();
   }
 
-  // Add the linkage name if we have one.
+  // Add function template parameters.
+  addTemplateParams(SPDie, SP.getTemplateParams());
+
+  // Add the linkage name if we have one and it isn't in the Decl.
   StringRef LinkageName = SP.getLinkageName();
-  if (!LinkageName.empty())
+  assert(((LinkageName.empty() || DeclLinkageName.empty()) ||
+          LinkageName == DeclLinkageName) &&
+         "decl has a linkage name and it is different");
+  if (!LinkageName.empty() && DeclLinkageName.empty())
     addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
              GlobalValue::getRealLinkageName(LinkageName));
 
+  if (DeclDie) {
+    // Refer to the function declaration where all the other attributes will be
+    // found.
+    addDIEEntry(SPDie, dwarf::DW_AT_specification, *DeclDie);
+    return;
+  }
+
   // Constructors and operators for anonymous aggregates do not have names.
   if (!SP.getName().empty())
     addString(SPDie, dwarf::DW_AT_name, SP.getName());
@@ -1510,11 +1454,11 @@ DIE *DwarfUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
   if (VK) {
     addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1, VK);
     DIELoc *Block = getDIELoc();
-    addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
-    addUInt(Block, dwarf::DW_FORM_udata, SP.getVirtualIndex());
+    addUInt(*Block, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    addUInt(*Block, dwarf::DW_FORM_udata, SP.getVirtualIndex());
     addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, Block);
     ContainingTypeMap.insert(
-        std::make_pair(SPDie, resolve(SP.getContainingType())));
+        std::make_pair(&SPDie, resolve(SP.getContainingType())));
   }
 
   if (!SP.isDefinition()) {
@@ -1522,7 +1466,7 @@ DIE *DwarfUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
 
     // Add arguments. Do not add arguments for subprogram definition. They will
     // be handled while processing variables.
-    constructSubprogramArguments(*SPDie, Args);
+    constructSubprogramArguments(SPDie, Args);
   }
 
   if (SP.isArtificial())
@@ -1556,8 +1500,6 @@ DIE *DwarfUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
 
   if (SP.isExplicit())
     addFlag(SPDie, dwarf::DW_AT_explicit);
-
-  return SPDie;
 }
 
 // Return const expression if value is a GEP to access merged global
@@ -1567,22 +1509,22 @@ static const ConstantExpr *getMergedGlobalExpr(const Value *V) {
   const ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(V);
   if (!CE || CE->getNumOperands() != 3 ||
      CE->getOpcode() != Instruction::GetElementPtr)
-    return NULL;
+    return nullptr;
 
   // First operand points to a global struct.
   Value *Ptr = CE->getOperand(0);
   if (!isa<GlobalValue>(Ptr) ||
      !isa<StructType>(cast<PointerType>(Ptr->getType())->getElementType()))
-    return NULL;
+    return nullptr;
 
   // Second operand is zero.
   const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(CE->getOperand(1));
   if (!CI || !CI->isZero())
-    return NULL;
+    return nullptr;
 
   // Third operand is offset.
   if (!isa<ConstantInt>(CE->getOperand(2)))
-    return NULL;
+    return nullptr;
 
   return CE;
 }
@@ -1600,7 +1542,7 @@ void DwarfCompileUnit::createGlobalVariableDIE(DIGlobalVariable GV) {
 
   // If this is a static data member definition, some attributes belong
   // to the declaration DIE.
-  DIE *VariableDIE = NULL;
+  DIE *VariableDIE = nullptr;
   bool IsStaticMember = false;
   DIDerivedType SDMDecl = GV.getStaticDataMemberDeclaration();
   if (SDMDecl.Verify()) {
@@ -1618,24 +1560,24 @@ void DwarfCompileUnit::createGlobalVariableDIE(DIGlobalVariable GV) {
     DIE *ContextDIE = getOrCreateContextDIE(GVContext);
 
     // Add to map.
-    VariableDIE = createAndAddDIE(GV.getTag(), *ContextDIE, GV);
+    VariableDIE = &createAndAddDIE(GV.getTag(), *ContextDIE, GV);
 
     // Add name and type.
-    addString(VariableDIE, dwarf::DW_AT_name, GV.getDisplayName());
-    addType(VariableDIE, GTy);
+    addString(*VariableDIE, dwarf::DW_AT_name, GV.getDisplayName());
+    addType(*VariableDIE, GTy);
 
     // Add scoping info.
     if (!GV.isLocalToUnit())
-      addFlag(VariableDIE, dwarf::DW_AT_external);
+      addFlag(*VariableDIE, dwarf::DW_AT_external);
 
     // Add line number info.
-    addSourceLine(VariableDIE, GV);
+    addSourceLine(*VariableDIE, GV);
   }
 
   // Add location.
   bool addToAccelTable = false;
-  DIE *VariableSpecDIE = NULL;
-  bool isGlobalVariable = GV.getGlobal() != NULL;
+  DIE *VariableSpecDIE = nullptr;
+  bool isGlobalVariable = GV.getGlobal() != nullptr;
   if (isGlobalVariable) {
     addToAccelTable = true;
     DIELoc *Loc = new (DIEValueAllocator) DIELoc();
@@ -1648,36 +1590,36 @@ void DwarfCompileUnit::createGlobalVariableDIE(DIGlobalVariable GV) {
      // Based on GCC's support for TLS:
      if (!DD->useSplitDwarf()) {
        // 1) Start with a constNu of the appropriate pointer size
-        addUInt(Loc, dwarf::DW_FORM_data1,
+        addUInt(*Loc, dwarf::DW_FORM_data1,
                PointerSize == 4 ? dwarf::DW_OP_const4u : dwarf::DW_OP_const8u);
        // 2) containing the (relocated) offset of the TLS variable
        //    within the module's TLS block.
-        addExpr(Loc, dwarf::DW_FORM_udata,
+        addExpr(*Loc, dwarf::DW_FORM_udata,
                Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym));
      } else {
-        addUInt(Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
-        addUInt(Loc, dwarf::DW_FORM_udata,
-                DU->getAddrPoolIndex(Sym, /* TLS */ true));
+        addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
+        addUInt(*Loc, dwarf::DW_FORM_udata,
+                DD->getAddressPool().getIndex(Sym, /* TLS */ true));
      }
      // 3) followed by a custom OP to make the debugger do a TLS lookup.
-      addUInt(Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_push_tls_address);
+      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_push_tls_address);
     } else {
      DD->addArangeLabel(SymbolCU(this, Sym));
-      addOpAddress(Loc, Sym);
+      addOpAddress(*Loc, Sym);
     }
     // Do not create specification DIE if context is either compile unit
     // or a subprogram.
     if (GVContext && GV.isDefinition() && !GVContext.isCompileUnit() &&
       !GVContext.isFile() && !DD->isSubprogramContext(GVContext)) {
      // Create specification DIE.
-      VariableSpecDIE = createAndAddDIE(dwarf::DW_TAG_variable, *UnitDie);
-      addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification, VariableDIE);
-      addBlock(VariableSpecDIE, dwarf::DW_AT_location, Loc);
+      VariableSpecDIE = &createAndAddDIE(dwarf::DW_TAG_variable, UnitDie);
+      addDIEEntry(*VariableSpecDIE, dwarf::DW_AT_specification, *VariableDIE);
+      addBlock(*VariableSpecDIE, dwarf::DW_AT_location, Loc);
      // A static member's declaration is already flagged as such.
      if (!SDMDecl.Verify())
-        addFlag(VariableDIE, dwarf::DW_AT_declaration);
+        addFlag(*VariableDIE, dwarf::DW_AT_declaration);
     } else {
-      addBlock(VariableDIE, dwarf::DW_AT_location, Loc);
+      addBlock(*VariableDIE, dwarf::DW_AT_location, Loc);
     }
     // Add the linkage name.
     StringRef LinkageName = GV.getLinkageName();
@@ -1685,8 +1627,8 @@ void DwarfCompileUnit::createGlobalVariableDIE(DIGlobalVariable GV) {
      // From DWARF4: DIEs to which DW_AT_linkage_name may apply include:
      // TAG_common_block, TAG_constant, TAG_entry_point, TAG_subprogram and
      // TAG_variable.
-      addString(IsStaticMember && VariableSpecDIE ? VariableSpecDIE
-                                                  : VariableDIE,
+      addString(IsStaticMember && VariableSpecDIE ? *VariableSpecDIE
+                                                  : *VariableDIE,
               DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name
                                          : dwarf::DW_AT_MIPS_linkage_name,
               GlobalValue::getRealLinkageName(LinkageName));
@@ -1696,7 +1638,7 @@ void DwarfCompileUnit::createGlobalVariableDIE(DIGlobalVariable GV) {
     // emitting AT_const_value multiple times, we only add AT_const_value when
     // it is not a static member.
     if (!IsStaticMember)
-      addConstantValue(VariableDIE, CI, isUnsignedDIType(DD, GTy));
+      addConstantValue(*VariableDIE, CI, GTy);
   } else if (const ConstantExpr *CE = getMergedGlobalExpr(GV->getOperand(11))) {
     addToAccelTable = true;
     // GV is a merged global.
@@ -1704,34 +1646,35 @@
     Value *Ptr = CE->getOperand(0);
     MCSymbol *Sym = Asm->getSymbol(cast<GlobalValue>(Ptr));
     DD->addArangeLabel(SymbolCU(this, Sym));
-    addOpAddress(Loc, Sym);
-    addUInt(Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    addOpAddress(*Loc, Sym);
+    addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
     SmallVector<Value *, 3> Idx(CE->op_begin() + 1, CE->op_end());
-    addUInt(Loc, dwarf::DW_FORM_udata,
+    addUInt(*Loc, dwarf::DW_FORM_udata,
            Asm->getDataLayout().getIndexedOffset(Ptr->getType(), Idx));
-    addUInt(Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
-    addBlock(VariableDIE, dwarf::DW_AT_location, Loc);
+    addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+    addBlock(*VariableDIE, dwarf::DW_AT_location, Loc);
   }
 
   if (addToAccelTable) {
-    DIE *AddrDIE = VariableSpecDIE ? VariableSpecDIE : VariableDIE;
-    addAccelName(GV.getName(), AddrDIE);
+    DIE &AddrDIE = VariableSpecDIE ? *VariableSpecDIE : *VariableDIE;
+    DD->addAccelName(GV.getName(), AddrDIE);
 
     // If the linkage name is different than the name, go ahead and output
     // that as well into the name table.
     if (GV.getLinkageName() != "" && GV.getName() != GV.getLinkageName())
-      addAccelName(GV.getLinkageName(), AddrDIE);
+      DD->addAccelName(GV.getLinkageName(), AddrDIE);
   }
 
   if (!GV.isLocalToUnit())
-    addGlobalName(GV.getName(), VariableSpecDIE ? VariableSpecDIE : VariableDIE,
+    addGlobalName(GV.getName(),
+                  VariableSpecDIE ? *VariableSpecDIE : *VariableDIE,
                  GV.getContext());
 }
 
 /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
 void DwarfUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy) {
-  DIE *DW_Subrange = createAndAddDIE(dwarf::DW_TAG_subrange_type, Buffer);
-  addDIEEntry(DW_Subrange, dwarf::DW_AT_type, IndexTy);
+  DIE &DW_Subrange = createAndAddDIE(dwarf::DW_TAG_subrange_type, Buffer);
+  addDIEEntry(DW_Subrange, dwarf::DW_AT_type, *IndexTy);
 
   // The LowerBound value defines the lower bounds which is typically zero for
   // C/C++. The Count value is the number of elements.  Values are 64 bit. If
@@ -1756,10 +1699,10 @@ void DwarfUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy) {
 
 /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
 void DwarfUnit::constructArrayTypeDIE(DIE &Buffer, DICompositeType CTy) {
   if (CTy.isVector())
-    addFlag(&Buffer, dwarf::DW_AT_GNU_vector);
+    addFlag(Buffer, dwarf::DW_AT_GNU_vector);
 
   // Emit the element type.
-  addType(&Buffer, resolve(CTy.getTypeDerivedFrom()));
+  addType(Buffer, resolve(CTy.getTypeDerivedFrom()));
 
   // Get an anonymous type for index type.
   // FIXME: This type should be passed down from the front end
@@ -1767,10 +1710,10 @@ void DwarfUnit::constructArrayTypeDIE(DIE &Buffer, DICompositeType CTy) {
   DIE *IdxTy = getIndexTyDie();
   if (!IdxTy) {
     // Construct an integer type to use for indexes.
-    IdxTy = createAndAddDIE(dwarf::DW_TAG_base_type, *UnitDie);
-    addString(IdxTy, dwarf::DW_AT_name, "sizetype");
-    addUInt(IdxTy, dwarf::DW_AT_byte_size, None, sizeof(int64_t));
-    addUInt(IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+    IdxTy = &createAndAddDIE(dwarf::DW_TAG_base_type, UnitDie);
+    addString(*IdxTy, dwarf::DW_AT_name, "sizetype");
+    addUInt(*IdxTy, dwarf::DW_AT_byte_size, None, sizeof(int64_t));
+    addUInt(*IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
            dwarf::DW_ATE_unsigned);
     setIndexTyDie(IdxTy);
   }
@@ -1792,7 +1735,7 @@ void DwarfUnit::constructEnumTypeDIE(DIE &Buffer, DICompositeType CTy) {
   for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
     DIEnumerator Enum(Elements.getElement(i));
     if (Enum.isEnumerator()) {
-      DIE *Enumerator = createAndAddDIE(dwarf::DW_TAG_enumerator, Buffer);
+      DIE &Enumerator = createAndAddDIE(dwarf::DW_TAG_enumerator, Buffer);
      StringRef Name = Enum.getName();
      addString(Enumerator, dwarf::DW_AT_name, Name);
      int64_t Value = Enum.getEnumValue();
@@ -1802,8 +1745,8 @@ void DwarfUnit::constructEnumTypeDIE(DIE &Buffer, DICompositeType CTy) {
   }
   DIType DTy = resolve(CTy.getTypeDerivedFrom());
   if (DTy) {
-    addType(&Buffer, DTy);
-    addFlag(&Buffer, dwarf::DW_AT_enum_class);
+    addType(Buffer, DTy);
+    addFlag(Buffer, dwarf::DW_AT_enum_class);
   }
 }
 
@@ -1813,48 +1756,51 @@ void DwarfUnit::constructContainingTypeDIEs() {
   for (DenseMap<DIE *, const MDNode *>::iterator CI = ContainingTypeMap.begin(),
                                                  CE = ContainingTypeMap.end();
       CI != CE; ++CI) {
-    DIE *SPDie = CI->first;
+    DIE &SPDie = *CI->first;
     DIDescriptor D(CI->second);
     if (!D)
      continue;
     DIE *NDie = getDIE(D);
     if (!NDie)
      continue;
-    addDIEEntry(SPDie, dwarf::DW_AT_containing_type, NDie);
+    addDIEEntry(SPDie, dwarf::DW_AT_containing_type, *NDie);
   }
 }
 
 /// constructVariableDIE - Construct a DIE for the given DbgVariable.
-DIE *DwarfUnit::constructVariableDIE(DbgVariable &DV, bool isScopeAbstract) {
+std::unique_ptr<DIE> DwarfUnit::constructVariableDIE(DbgVariable &DV,
                                                     bool Abstract) {
+  auto D = constructVariableDIEImpl(DV, Abstract);
+  DV.setDIE(*D);
+  return D;
+}
+
+std::unique_ptr<DIE> DwarfUnit::constructVariableDIEImpl(const DbgVariable &DV,
                                                         bool Abstract) {
   StringRef Name = DV.getName();
 
   // Define variable debug information entry.
-  DIE *VariableDie = new DIE(DV.getTag());
+  auto VariableDie = make_unique<DIE>(DV.getTag());
   DbgVariable *AbsVar = DV.getAbstractVariable();
-  DIE *AbsDIE = AbsVar ? AbsVar->getDIE() : NULL;
-  if (AbsDIE)
-    addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin, AbsDIE);
+  if (AbsVar && AbsVar->getDIE())
+    addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin, *AbsVar->getDIE());
   else {
     if (!Name.empty())
-      addString(VariableDie, dwarf::DW_AT_name, Name);
-    addSourceLine(VariableDie, DV.getVariable());
-    addType(VariableDie, DV.getType());
+      addString(*VariableDie, dwarf::DW_AT_name, Name);
+    addSourceLine(*VariableDie, DV.getVariable());
+    addType(*VariableDie, DV.getType());
+    if (DV.isArtificial())
+      addFlag(*VariableDie, dwarf::DW_AT_artificial);
   }
 
-  if (DV.isArtificial())
-    addFlag(VariableDie, dwarf::DW_AT_artificial);
-
-  if (isScopeAbstract) {
-    DV.setDIE(VariableDie);
+  if (Abstract)
     return VariableDie;
-  }
 
   // Add variable address.
unsigned Offset = DV.getDotDebugLocOffset(); if (Offset != ~0U) { - addLocationList(VariableDie, dwarf::DW_AT_location, Offset); - DV.setDIE(VariableDie); + addLocationList(*VariableDie, dwarf::DW_AT_location, Offset); return VariableDie; } @@ -1867,38 +1813,36 @@ DIE *DwarfUnit::constructVariableDIE(DbgVariable &DV, bool isScopeAbstract) { if (DVInsn->getOperand(1).isImm()) { MachineLocation Location(RegOp.getReg(), DVInsn->getOperand(1).getImm()); - addVariableAddress(DV, VariableDie, Location); + addVariableAddress(DV, *VariableDie, Location); } else if (RegOp.getReg()) - addVariableAddress(DV, VariableDie, MachineLocation(RegOp.getReg())); + addVariableAddress(DV, *VariableDie, MachineLocation(RegOp.getReg())); } else if (DVInsn->getOperand(0).isImm()) - addConstantValue(VariableDie, DVInsn->getOperand(0), DV.getType()); + addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType()); else if (DVInsn->getOperand(0).isFPImm()) - addConstantFPValue(VariableDie, DVInsn->getOperand(0)); + addConstantFPValue(*VariableDie, DVInsn->getOperand(0)); else if (DVInsn->getOperand(0).isCImm()) - addConstantValue(VariableDie, DVInsn->getOperand(0).getCImm(), - isUnsignedDIType(DD, DV.getType())); + addConstantValue(*VariableDie, DVInsn->getOperand(0).getCImm(), + DV.getType()); - DV.setDIE(VariableDie); return VariableDie; - } else { - // .. else use frame index. - int FI = DV.getFrameIndex(); - if (FI != ~0) { - unsigned FrameReg = 0; - const TargetFrameLowering *TFI = Asm->TM.getFrameLowering(); - int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg); - MachineLocation Location(FrameReg, Offset); - addVariableAddress(DV, VariableDie, Location); - } } - DV.setDIE(VariableDie); + // .. else use frame index. + int FI = DV.getFrameIndex(); + if (FI != ~0) { + unsigned FrameReg = 0; + const TargetFrameLowering *TFI = Asm->TM.getFrameLowering(); + int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg); + MachineLocation Location(FrameReg, Offset); + addVariableAddress(DV, *VariableDie, Location); + } + return VariableDie; } /// constructMemberDIE - Construct member DIE from DIDerivedType. 
void DwarfUnit::constructMemberDIE(DIE &Buffer, DIDerivedType DT) { - DIE *MemberDie = createAndAddDIE(DT.getTag(), Buffer); + DIE &MemberDie = createAndAddDIE(DT.getTag(), Buffer); StringRef Name = DT.getName(); if (!Name.empty()) addString(MemberDie, dwarf::DW_AT_name, Name); @@ -1914,13 +1858,13 @@ void DwarfUnit::constructMemberDIE(DIE &Buffer, DIDerivedType DT) { // BaseAddr = ObAddr + *((*ObAddr) - Offset) DIELoc *VBaseLocationDie = new (DIEValueAllocator) DIELoc(); - addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_dup); - addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); - addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_constu); - addUInt(VBaseLocationDie, dwarf::DW_FORM_udata, DT.getOffsetInBits()); - addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_minus); - addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); - addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus); + addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_dup); + addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); + addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_constu); + addUInt(*VBaseLocationDie, dwarf::DW_FORM_udata, DT.getOffsetInBits()); + addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_minus); + addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref); + addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus); addBlock(MemberDie, dwarf::DW_AT_data_member_location, VBaseLocationDie); } else { @@ -1953,8 +1897,8 @@ void DwarfUnit::constructMemberDIE(DIE &Buffer, DIDerivedType DT) { if (DD->getDwarfVersion() <= 2) { DIELoc *MemLocationDie = new (DIEValueAllocator) DIELoc(); - addUInt(MemLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); - addUInt(MemLocationDie, dwarf::DW_FORM_udata, OffsetInBytes); + addUInt(*MemLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst); + addUInt(*MemLocationDie, dwarf::DW_FORM_udata, OffsetInBytes); addBlock(MemberDie, dwarf::DW_AT_data_member_location, MemLocationDie); } else addUInt(MemberDie, dwarf::DW_AT_data_member_location, None, @@ -1978,8 +1922,8 @@ void DwarfUnit::constructMemberDIE(DIE &Buffer, DIDerivedType DT) { // Objective-C properties. if (MDNode *PNode = DT.getObjCProperty()) if (DIEEntry *PropertyDie = getDIEEntry(PNode)) - MemberDie->addValue(dwarf::DW_AT_APPLE_property, dwarf::DW_FORM_ref4, - PropertyDie); + MemberDie.addValue(dwarf::DW_AT_APPLE_property, dwarf::DW_FORM_ref4, + PropertyDie); if (DT.isArtificial()) addFlag(MemberDie, dwarf::DW_AT_artificial); @@ -1988,7 +1932,7 @@ void DwarfUnit::constructMemberDIE(DIE &Buffer, DIDerivedType DT) { /// getOrCreateStaticMemberDIE - Create new DIE for C++ static member. DIE *DwarfUnit::getOrCreateStaticMemberDIE(DIDerivedType DT) { if (!DT.Verify()) - return NULL; + return nullptr; // Construct the context before querying for the existence of the DIE in case // such construction creates the DIE. 
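// (Worked reading of the virtual-base location expression built above,
// treating the DWARF ops as a stack machine that starts with the object
// address pushed:
//   DW_OP_dup             [obj, obj]
//   DW_OP_deref           [obj, *obj]
//   DW_OP_constu offset   [obj, *obj, offset]
//   DW_OP_minus           [obj, *obj - offset]
//   DW_OP_deref           [obj, *(*obj - offset)]
//   DW_OP_plus            [obj + *(*obj - offset)]
// which is exactly BaseAddr = ObAddr + *((*ObAddr) - Offset), as the comment
// in that hunk states.)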
@@ -1996,11 +1940,10 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(DIDerivedType DT) { assert(dwarf::isType(ContextDIE->getTag()) && "Static member should belong to a type."); - DIE *StaticMemberDIE = getDIE(DT); - if (StaticMemberDIE) + if (DIE *StaticMemberDIE = getDIE(DT)) return StaticMemberDIE; - StaticMemberDIE = createAndAddDIE(DT.getTag(), *ContextDIE, DT); + DIE &StaticMemberDIE = createAndAddDIE(DT.getTag(), *ContextDIE, DT); DIType Ty = resolve(DT.getTypeDerivedFrom()); @@ -2023,11 +1966,11 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(DIDerivedType DT) { dwarf::DW_ACCESS_public); if (const ConstantInt *CI = dyn_cast_or_null(DT.getConstant())) - addConstantValue(StaticMemberDIE, CI, isUnsignedDIType(DD, Ty)); + addConstantValue(StaticMemberDIE, CI, Ty); if (const ConstantFP *CFP = dyn_cast_or_null(DT.getConstant())) addConstantFPValue(StaticMemberDIE, CFP); - return StaticMemberDIE; + return &StaticMemberDIE; } void DwarfUnit::emitHeader(const MCSymbol *ASectionSym) const { @@ -2072,7 +2015,7 @@ void DwarfCompileUnit::initStmtList(MCSymbol *DwarfLineSectionSym) { MCSymbol *LineTableStartSym = Asm->OutStreamer.getDwarfLineTableSymbol(getUniqueID()); - stmtListIndex = UnitDie->getValues().size(); + stmtListIndex = UnitDie.getValues().size(); // DW_AT_stmt_list is a offset of line number information for this // compile unit in debug_line section. For split dwarf this is @@ -2080,16 +2023,16 @@ void DwarfCompileUnit::initStmtList(MCSymbol *DwarfLineSectionSym) { // The line table entries are not always emitted in assembly, so it // is not okay to use line_table_start here. if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) - addSectionLabel(UnitDie.get(), dwarf::DW_AT_stmt_list, LineTableStartSym); + addSectionLabel(UnitDie, dwarf::DW_AT_stmt_list, LineTableStartSym); else - addSectionDelta(UnitDie.get(), dwarf::DW_AT_stmt_list, LineTableStartSym, + addSectionDelta(UnitDie, dwarf::DW_AT_stmt_list, LineTableStartSym, DwarfLineSectionSym); } void DwarfCompileUnit::applyStmtList(DIE &D) { D.addValue(dwarf::DW_AT_stmt_list, - UnitDie->getAbbrev().getData()[stmtListIndex].getForm(), - UnitDie->getValues()[stmtListIndex]); + UnitDie.getAbbrev().getData()[stmtListIndex].getForm(), + UnitDie.getValues()[stmtListIndex]); } void DwarfTypeUnit::emitHeader(const MCSymbol *ASectionSym) const { @@ -2114,5 +2057,4 @@ void DwarfTypeUnit::initSection(const MCSection *Section) { Asm->GetTempSymbol(Section->getLabelBeginName(), getUniqueID()); this->LabelEnd = Asm->GetTempSymbol(Section->getLabelEndName(), getUniqueID()); - this->LabelRange = Asm->GetTempSymbol("gnu_ranges", getUniqueID()); } diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h index ef713f7..acb7528 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -73,7 +73,7 @@ protected: DICompileUnit CUNode; /// Unit debug information entry. - const std::unique_ptr UnitDie; + DIE UnitDie; /// Offset of the UnitDie from beginning of debug info section. unsigned DebugInfoOffset; @@ -102,18 +102,6 @@ protected: /// GlobalTypes - A map of globally visible types for this unit. StringMap GlobalTypes; - /// AccelNames - A map of names for the name accelerator table. - StringMap > AccelNames; - - /// AccelObjC - A map of objc spec for the objc accelerator table. - StringMap > AccelObjC; - - /// AccelNamespace - A map of names for the namespace accelerator table. - StringMap > AccelNamespace; - - /// AccelTypes - A map of names for the type accelerator table. 
- StringMap > > AccelTypes; - /// DIEBlocks - A list of all the DIEBlocks in use. std::vector DIEBlocks; @@ -150,20 +138,17 @@ protected: /// The end of the unit within its section. MCSymbol *LabelEnd; - /// The label for the start of the range sets for the elements of this unit. - MCSymbol *LabelRange; - /// Skeleton unit associated with this unit. DwarfUnit *Skeleton; - DwarfUnit(unsigned UID, DIE *D, DICompileUnit CU, AsmPrinter *A, + DwarfUnit(unsigned UID, dwarf::Tag, DICompileUnit CU, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU); public: virtual ~DwarfUnit(); /// Set the skeleton unit associated with this unit. - void setSkeleton(DwarfUnit *Skel) { Skeleton = Skel; } + void setSkeleton(DwarfUnit &Skel) { Skeleton = &Skel; } /// Get the skeleton unit associated with this unit. DwarfUnit *getSkeleton() const { return Skeleton; } @@ -179,7 +164,6 @@ public: Asm->GetTempSymbol(Section->getLabelBeginName(), getUniqueID()); this->LabelEnd = Asm->GetTempSymbol(Section->getLabelEndName(), getUniqueID()); - this->LabelRange = Asm->GetTempSymbol("gnu_ranges", getUniqueID()); } const MCSection *getSection() const { @@ -218,38 +202,19 @@ public: return LabelEnd; } - MCSymbol *getLabelRange() const { - assert(Section); - return LabelRange; - } - // Accessors. unsigned getUniqueID() const { return UniqueID; } uint16_t getLanguage() const { return CUNode.getLanguage(); } DICompileUnit getCUNode() const { return CUNode; } - DIE *getUnitDie() const { return UnitDie.get(); } + DIE &getUnitDie() { return UnitDie; } const StringMap &getGlobalNames() const { return GlobalNames; } const StringMap &getGlobalTypes() const { return GlobalTypes; } - const StringMap > &getAccelNames() const { - return AccelNames; - } - const StringMap > &getAccelObjC() const { - return AccelObjC; - } - const StringMap > &getAccelNamespace() const { - return AccelNamespace; - } - const StringMap > > & - getAccelTypes() const { - return AccelTypes; - } - unsigned getDebugInfoOffset() const { return DebugInfoOffset; } void setDebugInfoOffset(unsigned DbgInfoOff) { DebugInfoOffset = DbgInfoOff; } /// hasContent - Return true if this compile unit has something to write out. - bool hasContent() const { return !UnitDie->getChildren().empty(); } + bool hasContent() const { return !UnitDie.getChildren().empty(); } /// addRange - Add an address range to the list of ranges for this unit. void addRange(RangeSpan Range); @@ -273,19 +238,10 @@ public: /// addGlobalName - Add a new global entity to the compile unit. /// - void addGlobalName(StringRef Name, DIE *Die, DIScope Context); - - /// addAccelName - Add a new name to the name accelerator table. - void addAccelName(StringRef Name, const DIE *Die); - - /// addAccelObjC - Add a new name to the ObjC accelerator table. - void addAccelObjC(StringRef Name, const DIE *Die); + void addGlobalName(StringRef Name, DIE &Die, DIScope Context); /// addAccelNamespace - Add a new name to the namespace accelerator table. - void addAccelNamespace(StringRef Name, const DIE *Die); - - /// addAccelType - Add a new type to the type accelerator table. - void addAccelType(StringRef Name, std::pair Die); + void addAccelNamespace(StringRef Name, const DIE &Die); /// getDIE - Returns the debug information entry map slot for the /// specified debug variable. We delegate the request to DwarfDebug @@ -303,118 +259,116 @@ public: /// kept in DwarfDebug. void insertDIE(DIDescriptor Desc, DIE *D); - /// addDie - Adds or interns the DIE to the compile unit. 
- /// - void addDie(DIE *Buffer) { UnitDie->addChild(Buffer); } - /// addFlag - Add a flag that is true to the DIE. - void addFlag(DIE *Die, dwarf::Attribute Attribute); + void addFlag(DIE &Die, dwarf::Attribute Attribute); /// addUInt - Add an unsigned integer attribute data and value. - void addUInt(DIE *Die, dwarf::Attribute Attribute, Optional Form, + void addUInt(DIE &Die, dwarf::Attribute Attribute, Optional Form, uint64_t Integer); - void addUInt(DIE *Block, dwarf::Form Form, uint64_t Integer); + void addUInt(DIE &Block, dwarf::Form Form, uint64_t Integer); /// addSInt - Add an signed integer attribute data and value. - void addSInt(DIE *Die, dwarf::Attribute Attribute, Optional Form, + void addSInt(DIE &Die, dwarf::Attribute Attribute, Optional Form, int64_t Integer); - void addSInt(DIELoc *Die, Optional Form, int64_t Integer); + void addSInt(DIELoc &Die, Optional Form, int64_t Integer); /// addString - Add a string attribute data and value. - void addString(DIE *Die, dwarf::Attribute Attribute, const StringRef Str); + void addString(DIE &Die, dwarf::Attribute Attribute, const StringRef Str); /// addLocalString - Add a string attribute data and value. - void addLocalString(DIE *Die, dwarf::Attribute Attribute, + void addLocalString(DIE &Die, dwarf::Attribute Attribute, const StringRef Str); /// addExpr - Add a Dwarf expression attribute data and value. - void addExpr(DIELoc *Die, dwarf::Form Form, const MCExpr *Expr); + void addExpr(DIELoc &Die, dwarf::Form Form, const MCExpr *Expr); /// addLabel - Add a Dwarf label attribute data and value. - void addLabel(DIE *Die, dwarf::Attribute Attribute, dwarf::Form Form, + void addLabel(DIE &Die, dwarf::Attribute Attribute, dwarf::Form Form, const MCSymbol *Label); - void addLabel(DIELoc *Die, dwarf::Form Form, const MCSymbol *Label); + void addLabel(DIELoc &Die, dwarf::Form Form, const MCSymbol *Label); /// addLocationList - Add a Dwarf loclistptr attribute data and value. - void addLocationList(DIE *Die, dwarf::Attribute Attribute, unsigned Index); + void addLocationList(DIE &Die, dwarf::Attribute Attribute, unsigned Index); /// addSectionLabel - Add a Dwarf section label attribute data and value. /// - void addSectionLabel(DIE *Die, dwarf::Attribute Attribute, + void addSectionLabel(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Label); /// addSectionOffset - Add an offset into a section attribute data and value. /// - void addSectionOffset(DIE *Die, dwarf::Attribute Attribute, uint64_t Integer); + void addSectionOffset(DIE &Die, dwarf::Attribute Attribute, uint64_t Integer); /// addOpAddress - Add a dwarf op address data and value using the /// form given and an op of either DW_FORM_addr or DW_FORM_GNU_addr_index. - void addOpAddress(DIELoc *Die, const MCSymbol *Label); + void addOpAddress(DIELoc &Die, const MCSymbol *Label); /// addSectionDelta - Add a label delta attribute data and value. - void addSectionDelta(DIE *Die, dwarf::Attribute Attribute, const MCSymbol *Hi, + void addSectionDelta(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Hi, const MCSymbol *Lo); /// addLabelDelta - Add a label delta attribute data and value. - void addLabelDelta(DIE *Die, dwarf::Attribute Attribute, const MCSymbol *Hi, + void addLabelDelta(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Hi, const MCSymbol *Lo); /// addDIEEntry - Add a DIE attribute data and value. 
- void addDIEEntry(DIE *Die, dwarf::Attribute Attribute, DIE *Entry); + void addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIE &Entry); /// addDIEEntry - Add a DIE attribute data and value. - void addDIEEntry(DIE *Die, dwarf::Attribute Attribute, DIEEntry *Entry); + void addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIEEntry *Entry); - void addDIETypeSignature(DIE *Die, const DwarfTypeUnit &Type); + void addDIETypeSignature(DIE &Die, const DwarfTypeUnit &Type); /// addBlock - Add block data. - void addBlock(DIE *Die, dwarf::Attribute Attribute, DIELoc *Block); + void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Block); /// addBlock - Add block data. - void addBlock(DIE *Die, dwarf::Attribute Attribute, DIEBlock *Block); + void addBlock(DIE &Die, dwarf::Attribute Attribute, DIEBlock *Block); /// addSourceLine - Add location information to specified debug information /// entry. - void addSourceLine(DIE *Die, unsigned Line, StringRef File, + void addSourceLine(DIE &Die, unsigned Line, StringRef File, StringRef Directory); - void addSourceLine(DIE *Die, DIVariable V); - void addSourceLine(DIE *Die, DIGlobalVariable G); - void addSourceLine(DIE *Die, DISubprogram SP); - void addSourceLine(DIE *Die, DIType Ty); - void addSourceLine(DIE *Die, DINameSpace NS); - void addSourceLine(DIE *Die, DIObjCProperty Ty); + void addSourceLine(DIE &Die, DIVariable V); + void addSourceLine(DIE &Die, DIGlobalVariable G); + void addSourceLine(DIE &Die, DISubprogram SP); + void addSourceLine(DIE &Die, DIType Ty); + void addSourceLine(DIE &Die, DINameSpace NS); + void addSourceLine(DIE &Die, DIObjCProperty Ty); /// addAddress - Add an address attribute to a die based on the location /// provided. - void addAddress(DIE *Die, dwarf::Attribute Attribute, + void addAddress(DIE &Die, dwarf::Attribute Attribute, const MachineLocation &Location, bool Indirect = false); /// addConstantValue - Add constant value entry in variable DIE. - void addConstantValue(DIE *Die, const MachineOperand &MO, DIType Ty); - void addConstantValue(DIE *Die, const ConstantInt *CI, bool Unsigned); - void addConstantValue(DIE *Die, const APInt &Val, bool Unsigned); + void addConstantValue(DIE &Die, const MachineOperand &MO, DIType Ty); + void addConstantValue(DIE &Die, const ConstantInt *CI, DIType Ty); + void addConstantValue(DIE &Die, const APInt &Val, DIType Ty); + void addConstantValue(DIE &Die, const APInt &Val, bool Unsigned); + void addConstantValue(DIE &Die, bool Unsigned, uint64_t Val); /// addConstantFPValue - Add constant value entry in variable DIE. - void addConstantFPValue(DIE *Die, const MachineOperand &MO); - void addConstantFPValue(DIE *Die, const ConstantFP *CFP); + void addConstantFPValue(DIE &Die, const MachineOperand &MO); + void addConstantFPValue(DIE &Die, const ConstantFP *CFP); /// addTemplateParams - Add template parameters in buffer. void addTemplateParams(DIE &Buffer, DIArray TParams); /// addRegisterOp - Add register operand. - void addRegisterOp(DIELoc *TheDie, unsigned Reg); + void addRegisterOp(DIELoc &TheDie, unsigned Reg); /// addRegisterOffset - Add register offset. - void addRegisterOffset(DIELoc *TheDie, unsigned Reg, int64_t Offset); + void addRegisterOffset(DIELoc &TheDie, unsigned Reg, int64_t Offset); /// addComplexAddress - Start with the address based on the location provided, /// and generate the DWARF information necessary to find the actual variable /// (navigating the extra location information encoded in the type) based on /// the starting location. 
Add the DWARF information to the die. - void addComplexAddress(const DbgVariable &DV, DIE *Die, + void addComplexAddress(const DbgVariable &DV, DIE &Die, dwarf::Attribute Attribute, const MachineLocation &Location); @@ -424,19 +378,19 @@ public: /// actual Block variable (navigating the Block struct) based on the /// starting location. Add the DWARF information to the die. Obsolete, /// please use addComplexAddress instead. - void addBlockByrefAddress(const DbgVariable &DV, DIE *Die, + void addBlockByrefAddress(const DbgVariable &DV, DIE &Die, dwarf::Attribute Attribute, const MachineLocation &Location); /// addVariableAddress - Add DW_AT_location attribute for a /// DbgVariable based on provided MachineLocation. - void addVariableAddress(const DbgVariable &DV, DIE *Die, + void addVariableAddress(const DbgVariable &DV, DIE &Die, MachineLocation Location); /// addType - Add a new type attribute to the specified entity. This takes /// and attribute parameter because DW_AT_friend attributes are also /// type references. - void addType(DIE *Entity, DIType Ty, + void addType(DIE &Entity, DIType Ty, dwarf::Attribute Attribute = dwarf::DW_AT_type); /// getOrCreateNameSpace - Create a DIE for DINameSpace. @@ -445,6 +399,8 @@ public: /// getOrCreateSubprogramDIE - Create new DIE using SP. DIE *getOrCreateSubprogramDIE(DISubprogram SP); + void applySubprogramAttributes(DISubprogram SP, DIE &SPDie); + /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the /// given DIType. DIE *getOrCreateTypeDIE(const MDNode *N); @@ -460,14 +416,15 @@ public: void constructContainingTypeDIEs(); /// constructVariableDIE - Construct a DIE for the given DbgVariable. - DIE *constructVariableDIE(DbgVariable &DV, bool isScopeAbstract); + std::unique_ptr constructVariableDIE(DbgVariable &DV, + bool Abstract = false); /// constructSubprogramArguments - Construct function argument DIEs. void constructSubprogramArguments(DIE &Buffer, DIArray Args); /// Create a DIE with the given Tag, add the DIE to its parent, and /// call insertDIE if MD is not null. - DIE *createAndAddDIE(unsigned Tag, DIE &Parent, + DIE &createAndAddDIE(unsigned Tag, DIE &Parent, DIDescriptor N = DIDescriptor()); /// Compute the size of a header for this unit, not including the initial @@ -483,6 +440,9 @@ public: virtual DwarfCompileUnit &getCU() = 0; + /// constructTypeDIE - Construct type DIE from DICompositeType. + void constructTypeDIE(DIE &Buffer, DICompositeType CTy); + protected: /// getOrCreateStaticMemberDIE - Create new static data member DIE. DIE *getOrCreateStaticMemberDIE(DIDerivedType DT); @@ -492,15 +452,17 @@ protected: virtual unsigned getOrCreateSourceID(StringRef File, StringRef Directory) = 0; private: + /// \brief Construct a DIE for the given DbgVariable without initializing the + /// DbgVariable's DIE reference. + std::unique_ptr constructVariableDIEImpl(const DbgVariable &DV, + bool Abstract); + /// constructTypeDIE - Construct basic type die from DIBasicType. void constructTypeDIE(DIE &Buffer, DIBasicType BTy); /// constructTypeDIE - Construct derived type die from DIDerivedType. void constructTypeDIE(DIE &Buffer, DIDerivedType DTy); - /// constructTypeDIE - Construct type DIE from DICompositeType. - void constructTypeDIE(DIE &Buffer, DICompositeType CTy); - /// constructSubrangeDIE - Construct subrange DIE from DISubrange. void constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy); @@ -547,7 +509,7 @@ private: /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug /// information entry. 
- DIEEntry *createDIEEntry(DIE *Entry); + DIEEntry *createDIEEntry(DIE &Entry); /// resolve - Look in the DwarfDebug map for the MDNode that /// corresponds to the reference. @@ -557,7 +519,7 @@ private: /// If this is a named finished type then include it in the list of types for /// the accelerator tables. - void updateAcceleratorTables(DIScope Context, DIType Ty, const DIE *TyDIE); + void updateAcceleratorTables(DIScope Context, DIType Ty, const DIE &TyDIE); }; class DwarfCompileUnit : public DwarfUnit { @@ -566,7 +528,7 @@ class DwarfCompileUnit : public DwarfUnit { unsigned stmtListIndex; public: - DwarfCompileUnit(unsigned UID, DIE *D, DICompileUnit Node, AsmPrinter *A, + DwarfCompileUnit(unsigned UID, DICompileUnit Node, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU); void initStmtList(MCSymbol *DwarfLineSectionSym); @@ -579,12 +541,12 @@ public: /// addLabelAddress - Add a dwarf label attribute data and value using /// either DW_FORM_addr or DW_FORM_GNU_addr_index. - void addLabelAddress(DIE *Die, dwarf::Attribute Attribute, + void addLabelAddress(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Label); /// addLocalLabelAddress - Add a dwarf label attribute data and value using /// DW_FORM_addr only. - void addLocalLabelAddress(DIE *Die, dwarf::Attribute Attribute, + void addLocalLabelAddress(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Label); DwarfCompileUnit &getCU() override { return *this; } @@ -600,7 +562,7 @@ private: MCDwarfDwoLineTable *SplitLineTable; public: - DwarfTypeUnit(unsigned UID, DIE *D, DwarfCompileUnit &CU, AsmPrinter *A, + DwarfTypeUnit(unsigned UID, DwarfCompileUnit &CU, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU, MCDwarfDwoLineTable *SplitLineTable = nullptr); diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp index 50b2ca8..2212941 100644 --- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp +++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp @@ -29,7 +29,7 @@ StringRef WinCodeViewLineTables::getFullFilepath(const MDNode *S) { StringRef Dir = Scope.getDirectory(), Filename = Scope.getFilename(); char *&Result = DirAndFilenameToFilepathMap[std::make_pair(Dir, Filename)]; - if (Result != 0) + if (Result) return Result; // Clang emits directory and relative filename info into the IR, but CodeView @@ -102,7 +102,7 @@ void WinCodeViewLineTables::maybeRecordLocation(DebugLoc DL, } WinCodeViewLineTables::WinCodeViewLineTables(AsmPrinter *AP) - : Asm(0), CurFn(0) { + : Asm(nullptr), CurFn(nullptr) { MachineModuleInfo *MMI = AP->MMI; // If module doesn't have named metadata anchors or COFF debug section @@ -171,7 +171,7 @@ void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) { EmitLabelDiff(Asm->OutStreamer, Fn, FI.End); // PC-to-linenumber lookup table: - MCSymbol *FileSegmentEnd = 0; + MCSymbol *FileSegmentEnd = nullptr; for (size_t J = 0, F = FI.Instrs.size(); J != F; ++J) { MCSymbol *Instr = FI.Instrs[J]; assert(InstrInfo.count(Instr)); @@ -216,7 +216,7 @@ void WinCodeViewLineTables::endModule() { if (FnDebugInfo.empty()) return; - assert(Asm != 0); + assert(Asm != nullptr); Asm->OutStreamer.SwitchSection( Asm->getObjFileLowering().getCOFFDebugSymbolsSection()); Asm->EmitInt32(COFF::DEBUG_SECTION_MAGIC); @@ -277,20 +277,19 @@ void WinCodeViewLineTables::beginFunction(const MachineFunction *MF) { // for the first instruction of the function, not the last of the prolog? 
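  // (In other words: the scan below looks for the first instruction that is
  // neither a DBG_VALUE nor part of the frame-setup sequence; its DebugLoc is
  // taken as the end of the prologue, which is where debuggers expect to
  // place function breakpoints.)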
DebugLoc PrologEndLoc; bool EmptyPrologue = true; - for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); - I != E && PrologEndLoc.isUnknown(); ++I) { - for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end(); - II != IE; ++II) { - const MachineInstr *MI = II; - if (MI->isDebugValue()) + for (const auto &MBB : *MF) { + if (!PrologEndLoc.isUnknown()) + break; + for (const auto &MI : MBB) { + if (MI.isDebugValue()) continue; // First known non-DBG_VALUE and non-frame setup location marks // the beginning of the function body. // FIXME: do we need the first subcondition? - if (!MI->getFlag(MachineInstr::FrameSetup) && - (!MI->getDebugLoc().isUnknown())) { - PrologEndLoc = MI->getDebugLoc(); + if (!MI.getFlag(MachineInstr::FrameSetup) && + (!MI.getDebugLoc().isUnknown())) { + PrologEndLoc = MI.getDebugLoc(); break; } EmptyPrologue = false; @@ -321,7 +320,7 @@ void WinCodeViewLineTables::endFunction(const MachineFunction *MF) { Asm->OutStreamer.EmitLabel(FunctionEndSym); CurFn->End = FunctionEndSym; } - CurFn = 0; + CurFn = nullptr; } void WinCodeViewLineTables::beginInstruction(const MachineInstr *MI) { diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h index a7a6205..0734d97 100644 --- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h +++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h @@ -38,7 +38,7 @@ class WinCodeViewLineTables : public AsmPrinterHandler { struct FunctionInfo { SmallVector Instrs; MCSymbol *End; - FunctionInfo() : End(0) {} + FunctionInfo() : End(nullptr) {} } *CurFn; typedef DenseMap FnDebugInfoTy; @@ -104,7 +104,7 @@ class WinCodeViewLineTables : public AsmPrinterHandler { void maybeRecordLocation(DebugLoc DL, const MachineFunction *MF); void clear() { - assert(CurFn == 0); + assert(CurFn == nullptr); FileNameRegistry.clear(); InstrInfo.clear(); } diff --git a/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp b/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp new file mode 100644 index 0000000..d995333 --- /dev/null +++ b/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp @@ -0,0 +1,337 @@ +//===-- AtomicExpandLoadLinkedPass.cpp - Expand atomic instructions -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass (at IR level) to replace atomic instructions with +// appropriate (intrinsic-based) ldrex/strex loops. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" +using namespace llvm; + +#define DEBUG_TYPE "arm-atomic-expand" + +namespace { + class AtomicExpandLoadLinked : public FunctionPass { + const TargetLowering *TLI; + public: + static char ID; // Pass identification, replacement for typeid + explicit AtomicExpandLoadLinked(const TargetMachine *TM = nullptr) + : FunctionPass(ID), TLI(TM ? 
TM->getTargetLowering() : nullptr) { + initializeAtomicExpandLoadLinkedPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + bool expandAtomicInsts(Function &F); + + bool expandAtomicLoad(LoadInst *LI); + bool expandAtomicStore(StoreInst *LI); + bool expandAtomicRMW(AtomicRMWInst *AI); + bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); + + AtomicOrdering insertLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord); + void insertTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord); + }; +} + +char AtomicExpandLoadLinked::ID = 0; +char &llvm::AtomicExpandLoadLinkedID = AtomicExpandLoadLinked::ID; + +static void *initializeAtomicExpandLoadLinkedPassOnce(PassRegistry &Registry) { + PassInfo *PI = new PassInfo( + "Expand Atomic calls in terms of load-linked & store-conditional", + "atomic-ll-sc", &AtomicExpandLoadLinked::ID, + PassInfo::NormalCtor_t(callDefaultCtor), false, + false, PassInfo::TargetMachineCtor_t( + callTargetMachineCtor)); + Registry.registerPass(*PI, true); + return PI; +} + +void llvm::initializeAtomicExpandLoadLinkedPass(PassRegistry &Registry) { + CALL_ONCE_INITIALIZATION(initializeAtomicExpandLoadLinkedPassOnce) +} + + +FunctionPass *llvm::createAtomicExpandLoadLinkedPass(const TargetMachine *TM) { + return new AtomicExpandLoadLinked(TM); +} + +bool AtomicExpandLoadLinked::runOnFunction(Function &F) { + if (!TLI) + return false; + + SmallVector AtomicInsts; + + // Changing control-flow while iterating through it is a bad idea, so gather a + // list of all atomic instructions before we start. + for (BasicBlock &BB : F) + for (Instruction &Inst : BB) { + if (isa(&Inst) || isa(&Inst) || + (isa(&Inst) && cast(&Inst)->isAtomic()) || + (isa(&Inst) && cast(&Inst)->isAtomic())) + AtomicInsts.push_back(&Inst); + } + + bool MadeChange = false; + for (Instruction *Inst : AtomicInsts) { + if (!TLI->shouldExpandAtomicInIR(Inst)) + continue; + + if (AtomicRMWInst *AI = dyn_cast(Inst)) + MadeChange |= expandAtomicRMW(AI); + else if (AtomicCmpXchgInst *CI = dyn_cast(Inst)) + MadeChange |= expandAtomicCmpXchg(CI); + else if (LoadInst *LI = dyn_cast(Inst)) + MadeChange |= expandAtomicLoad(LI); + else if (StoreInst *SI = dyn_cast(Inst)) + MadeChange |= expandAtomicStore(SI); + else + llvm_unreachable("Unknown atomic instruction"); + } + + return MadeChange; +} + +bool AtomicExpandLoadLinked::expandAtomicLoad(LoadInst *LI) { + // Load instructions don't actually need a leading fence, even in the + // SequentiallyConsistent case. + AtomicOrdering MemOpOrder = + TLI->getInsertFencesForAtomic() ? Monotonic : LI->getOrdering(); + + // The only 64-bit load guaranteed to be single-copy atomic by the ARM ARM is + // an ldrexd (A3.5.3). + IRBuilder<> Builder(LI); + Value *Val = + TLI->emitLoadLinked(Builder, LI->getPointerOperand(), MemOpOrder); + + insertTrailingFence(Builder, LI->getOrdering()); + + LI->replaceAllUsesWith(Val); + LI->eraseFromParent(); + + return true; +} + +bool AtomicExpandLoadLinked::expandAtomicStore(StoreInst *SI) { + // The only atomic 64-bit store on ARM is an strexd that succeeds, which means + // we need a loop and the entire instruction is essentially an "atomicrmw + // xchg" that ignores the value loaded. + IRBuilder<> Builder(SI); + AtomicRMWInst *AI = + Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), + SI->getValueOperand(), SI->getOrdering()); + SI->eraseFromParent(); + + // Now we have an appropriate swap instruction, lower it as usual. 
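+  // For illustration only (not emitted verbatim): given
+  //   store atomic i32 %v, i32* %p seq_cst, align 4
+  // the code above rewrites it to the equivalent
+  //   atomicrmw xchg i32* %p, i32 %v seq_cst
+  // which expandAtomicRMW below then lowers to a load-linked /
+  // store-conditional loop.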
+  return expandAtomicRMW(AI);
+}
+
+bool AtomicExpandLoadLinked::expandAtomicRMW(AtomicRMWInst *AI) {
+  AtomicOrdering Order = AI->getOrdering();
+  Value *Addr = AI->getPointerOperand();
+  BasicBlock *BB = AI->getParent();
+  Function *F = BB->getParent();
+  LLVMContext &Ctx = F->getContext();
+
+  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
+  //
+  // The standard expansion we produce is:
+  //     [...]
+  //     fence?
+  // atomicrmw.start:
+  //     %loaded = @load.linked(%addr)
+  //     %new = some_op iN %loaded, %incr
+  //     %stored = @store_conditional(%new, %addr)
+  //     %try_again = icmp ne i32 %stored, 0
+  //     br i1 %try_again, label %atomicrmw.start, label %atomicrmw.end
+  // atomicrmw.end:
+  //     fence?
+  //     [...]
+  BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end");
+  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
+
+  // This grabs the DebugLoc from AI.
+  IRBuilder<> Builder(AI);
+
+  // The split call above "helpfully" added a branch at the end of BB (to the
+  // wrong place), but we might want a fence too. It's easiest to just remove
+  // the branch entirely.
+  std::prev(BB->end())->eraseFromParent();
+  Builder.SetInsertPoint(BB);
+  AtomicOrdering MemOpOrder = insertLeadingFence(Builder, Order);
+  Builder.CreateBr(LoopBB);
+
+  // Start the main loop block now that we've taken care of the preliminaries.
+  Builder.SetInsertPoint(LoopBB);
+  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
+
+  Value *NewVal;
+  switch (AI->getOperation()) {
+  case AtomicRMWInst::Xchg:
+    NewVal = AI->getValOperand();
+    break;
+  case AtomicRMWInst::Add:
+    NewVal = Builder.CreateAdd(Loaded, AI->getValOperand(), "new");
+    break;
+  case AtomicRMWInst::Sub:
+    NewVal = Builder.CreateSub(Loaded, AI->getValOperand(), "new");
+    break;
+  case AtomicRMWInst::And:
+    NewVal = Builder.CreateAnd(Loaded, AI->getValOperand(), "new");
+    break;
+  case AtomicRMWInst::Nand:
+    NewVal = Builder.CreateAnd(Loaded, Builder.CreateNot(AI->getValOperand()),
+                               "new");
+    break;
+  case AtomicRMWInst::Or:
+    NewVal = Builder.CreateOr(Loaded, AI->getValOperand(), "new");
+    break;
+  case AtomicRMWInst::Xor:
+    NewVal = Builder.CreateXor(Loaded, AI->getValOperand(), "new");
+    break;
+  case AtomicRMWInst::Max:
+    NewVal = Builder.CreateICmpSGT(Loaded, AI->getValOperand());
+    NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
+    break;
+  case AtomicRMWInst::Min:
+    NewVal = Builder.CreateICmpSLE(Loaded, AI->getValOperand());
+    NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
+    break;
+  case AtomicRMWInst::UMax:
+    NewVal = Builder.CreateICmpUGT(Loaded, AI->getValOperand());
+    NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
+    break;
+  case AtomicRMWInst::UMin:
+    NewVal = Builder.CreateICmpULE(Loaded, AI->getValOperand());
+    NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
+    break;
+  default:
+    llvm_unreachable("Unknown atomic op");
+  }
+
+  Value *StoreSuccess =
+      TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
+  Value *TryAgain = Builder.CreateICmpNE(
+      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
+  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
+
+  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+  insertTrailingFence(Builder, Order);
+
+  AI->replaceAllUsesWith(Loaded);
+  AI->eraseFromParent();
+
+  return true;
+}
+
+bool AtomicExpandLoadLinked::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
+  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
+  AtomicOrdering FailureOrder = CI->getFailureOrdering();
+  Value *Addr = CI->getPointerOperand();
+  BasicBlock *BB = CI->getParent();
+  Function *F = BB->getParent();
+  LLVMContext &Ctx = F->getContext();
+
+  // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
+  //
+  // The full expansion we produce is:
+  //     [...]
+  //     fence?
+  // cmpxchg.start:
+  //     %loaded = @load.linked(%addr)
+  //     %should_store = icmp eq %loaded, %desired
+  //     br i1 %should_store, label %cmpxchg.trystore,
+  //                          label %cmpxchg.end/%cmpxchg.barrier
+  // cmpxchg.trystore:
+  //     %stored = @store_conditional(%new, %addr)
+  //     %try_again = icmp ne i32 %stored, 0
+  //     br i1 %try_again, label %cmpxchg.start, label %cmpxchg.barrier
+  // cmpxchg.barrier:
+  //     fence?
+  //     br label %cmpxchg.end
+  // cmpxchg.end:
+  //     [...]
+  BasicBlock *ExitBB = BB->splitBasicBlock(CI, "cmpxchg.end");
+  auto BarrierBB = BasicBlock::Create(Ctx, "cmpxchg.barrier", F, ExitBB);
+  auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, BarrierBB);
+  auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB);
+
+  // This grabs the DebugLoc from CI.
+  IRBuilder<> Builder(CI);
+
+  // The split call above "helpfully" added a branch at the end of BB (to the
+  // wrong place), but we might want a fence too. It's easiest to just remove
+  // the branch entirely.
+  std::prev(BB->end())->eraseFromParent();
+  Builder.SetInsertPoint(BB);
+  AtomicOrdering MemOpOrder = insertLeadingFence(Builder, SuccessOrder);
+  Builder.CreateBr(LoopBB);
+
+  // Start the main loop block now that we've taken care of the preliminaries.
+  Builder.SetInsertPoint(LoopBB);
+  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
+  Value *ShouldStore =
+      Builder.CreateICmpEQ(Loaded, CI->getCompareOperand(), "should_store");
+
+  // If the cmpxchg doesn't actually need any ordering when it fails, we can
+  // jump straight past that fence instruction (if it exists).
+  BasicBlock *FailureBB = FailureOrder == Monotonic ? ExitBB : BarrierBB;
+  Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB);
+
+  Builder.SetInsertPoint(TryStoreBB);
+  Value *StoreSuccess = TLI->emitStoreConditional(
+      Builder, CI->getNewValOperand(), Addr, MemOpOrder);
+  Value *TryAgain = Builder.CreateICmpNE(
+      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
+  Builder.CreateCondBr(TryAgain, LoopBB, BarrierBB);
+
+  // Finally, make sure later instructions don't get reordered with a fence if
+  // necessary.
+  Builder.SetInsertPoint(BarrierBB);
+  insertTrailingFence(Builder, SuccessOrder);
+  Builder.CreateBr(ExitBB);
+
+  CI->replaceAllUsesWith(Loaded);
+  CI->eraseFromParent();
+
+  return true;
+}
+
+AtomicOrdering AtomicExpandLoadLinked::insertLeadingFence(IRBuilder<> &Builder,
+                                                          AtomicOrdering Ord) {
+  if (!TLI->getInsertFencesForAtomic())
+    return Ord;
+
+  if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
+    Builder.CreateFence(Release);
+
+  // The exclusive operations don't need any barrier if we're adding separate
+  // fences.
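+  // For example, a SequentiallyConsistent operation on a target that inserts
+  // fences gets a leading "fence release" here and a trailing "fence seq_cst"
+  // from insertTrailingFence below, while the load-linked/store-conditional
+  // loop itself runs with Monotonic ordering.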
+ return Monotonic; +} + +void AtomicExpandLoadLinked::insertTrailingFence(IRBuilder<> &Builder, + AtomicOrdering Ord) { + if (!TLI->getInsertFencesForAtomic()) + return; + + if (Ord == Acquire || Ord == AcquireRelease) + Builder.CreateFence(Acquire); + else if (Ord == SequentiallyConsistent) + Builder.CreateFence(SequentiallyConsistent); +} diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp index c6654ec2..7f31b1a 100644 --- a/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -15,13 +15,21 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "basictti" #include "llvm/CodeGen/Passes.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include using namespace llvm; +static cl::opt +PartialUnrollingThreshold("partial-unrolling-threshold", cl::init(0), + cl::desc("Threshold for partial unrolling"), cl::Hidden); + +#define DEBUG_TYPE "basictti" + namespace { class BasicTTI final : public ImmutablePass, public TargetTransformInfo { @@ -34,7 +42,7 @@ class BasicTTI final : public ImmutablePass, public TargetTransformInfo { const TargetLoweringBase *getTLI() const { return TM->getTargetLowering(); } public: - BasicTTI() : ImmutablePass(ID), TM(0) { + BasicTTI() : ImmutablePass(ID), TM(nullptr) { llvm_unreachable("This pass cannot be directly constructed"); } @@ -186,7 +194,61 @@ bool BasicTTI::haveFastSqrt(Type *Ty) const { return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT); } -void BasicTTI::getUnrollingPreferences(Loop *, UnrollingPreferences &) const { } +void BasicTTI::getUnrollingPreferences(Loop *L, + UnrollingPreferences &UP) const { + // This unrolling functionality is target independent, but to provide some + // motivation for its intended use, for x86: + + // According to the Intel 64 and IA-32 Architectures Optimization Reference + // Manual, Intel Core models and later have a loop stream detector + // (and associated uop queue) that can benefit from partial unrolling. + // The relevant requirements are: + // - The loop must have no more than 4 (8 for Nehalem and later) branches + // taken, and none of them may be calls. + // - The loop can have no more than 18 (28 for Nehalem and later) uops. + + // According to the Software Optimization Guide for AMD Family 15h Processors, + // models 30h-4fh (Steamroller and later) have a loop predictor and loop + // buffer which can benefit from partial unrolling. + // The relevant requirements are: + // - The loop must have fewer than 16 branches + // - The loop must have less than 40 uops in all executed loop branches + + // The number of taken branches in a loop is hard to estimate here, and + // benchmarking has revealed that it is better not to be conservative when + // estimating the branch count. As a result, we'll ignore the branch limits + // until someone finds a case where it matters in practice. + + unsigned MaxOps; + const TargetSubtargetInfo *ST = &TM->getSubtarget(); + if (PartialUnrollingThreshold.getNumOccurrences() > 0) + MaxOps = PartialUnrollingThreshold; + else if (ST->getSchedModel()->LoopMicroOpBufferSize > 0) + MaxOps = ST->getSchedModel()->LoopMicroOpBufferSize; + else + return; + + // Scan the loop: don't unroll loops with calls. 
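+  // (E.g. a loop whose body calls an external function such as fprintf will
+  // not fit in the loop buffer anyway; fprintf is just an illustrative
+  // callee. Calls for which TopTTI->isLoweredToCall returns false — say, an
+  // intrinsic that lowers to plain instructions — remain eligible, as
+  // checked below.)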
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); + I != E; ++I) { + BasicBlock *BB = *I; + + for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J) + if (isa(J) || isa(J)) { + ImmutableCallSite CS(J); + if (const Function *F = CS.getCalledFunction()) { + if (!TopTTI->isLoweredToCall(F)) + continue; + } + + return; + } + } + + // Enable runtime and partial unrolling up to the specified size. + UP.Partial = UP.Runtime = true; + UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps; +} //===----------------------------------------------------------------------===// // @@ -424,12 +486,14 @@ unsigned BasicTTI::getMemoryOpCost(unsigned Opcode, Type *Src, // This is a vector load that legalizes to a larger type than the vector // itself. Unless the corresponding extending load or truncating store is // legal, then this will scalarize. - TargetLowering::LegalizeAction LA; - MVT MemVT = getTLI()->getSimpleValueType(Src, true); - if (Opcode == Instruction::Store) - LA = getTLI()->getTruncStoreAction(LT.second, MemVT); - else - LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, MemVT); + TargetLowering::LegalizeAction LA = TargetLowering::Expand; + EVT MemVT = getTLI()->getValueType(Src, true); + if (MemVT.isSimple() && MemVT != MVT::Other) { + if (Opcode == Instruction::Store) + LA = getTLI()->getTruncStoreAction(LT.second, MemVT.getSimpleVT()); + else + LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, MemVT.getSimpleVT()); + } if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { // This is a vector load/store for some illegal type that is scalarized. @@ -484,7 +548,7 @@ unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, case Intrinsic::round: ISD = ISD::FROUND; break; case Intrinsic::pow: ISD = ISD::FPOW; break; case Intrinsic::fma: ISD = ISD::FMA; break; - case Intrinsic::fmuladd: ISD = ISD::FMA; break; // FIXME: mul + add? + case Intrinsic::fmuladd: ISD = ISD::FMA; break; case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: return 0; @@ -509,6 +573,12 @@ unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, return LT.first * 2; } + // If we can't lower fmuladd into an FMA estimate the cost as a floating + // point mul followed by an add. + if (IID == Intrinsic::fmuladd) + return TopTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) + + TopTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy); + // Else, assume that we need to scalarize this intrinsic. For math builtins // this will emit a costly libcall, adding call overhead and spills. Make it // very expensive. diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index b39777e..f623a48 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -16,7 +16,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "branchfolding" #include "BranchFolding.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" @@ -38,6 +37,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "branchfolding" + STATISTIC(NumDeadBlocks, "Number of dead blocks removed"); STATISTIC(NumBranchOpts, "Number of branches optimized"); STATISTIC(NumTailMerge , "Number of block tails merged"); @@ -189,7 +190,7 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, TII = tii; TRI = tri; MMI = mmi; - RS = NULL; + RS = nullptr; // Use a RegScavenger to help update liveness when required. 
MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -201,7 +202,7 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, // Fix CFG. The later algorithms expect it to be right. bool MadeChange = false; for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; I++) { - MachineBasicBlock *MBB = I, *TBB = 0, *FBB = 0; + MachineBasicBlock *MBB = I, *TBB = nullptr, *FBB = nullptr; SmallVector Cond; if (!TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, true)) MadeChange |= MBB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty()); @@ -220,7 +221,7 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, // See if any jump tables have become dead as the code generator // did its thing. MachineJumpTableInfo *JTI = MF.getJumpTableInfo(); - if (JTI == 0) { + if (!JTI) { delete RS; return MadeChange; } @@ -416,7 +417,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, MachineBasicBlock::iterator BBI1, const BasicBlock *BB) { if (!TII->isLegalToSplitMBBAt(CurMBB, BBI1)) - return 0; + return nullptr; MachineFunction &MF = *CurMBB.getParent(); @@ -466,7 +467,7 @@ static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB, const TargetInstrInfo *TII) { MachineFunction *MF = CurMBB->getParent(); MachineFunction::iterator I = std::next(MachineFunction::iterator(CurMBB)); - MachineBasicBlock *TBB = 0, *FBB = 0; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector Cond; DebugLoc dl; // FIXME: this is nowhere if (I != MF->end() && @@ -475,12 +476,12 @@ static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB, if (TBB == NextBB && !Cond.empty() && !FBB) { if (!TII->ReverseBranchCondition(Cond)) { TII->RemoveBranch(*CurMBB); - TII->InsertBranch(*CurMBB, SuccBB, NULL, Cond, dl); + TII->InsertBranch(*CurMBB, SuccBB, nullptr, Cond, dl); return; } } } - TII->InsertBranch(*CurMBB, SuccBB, NULL, + TII->InsertBranch(*CurMBB, SuccBB, nullptr, SmallVector(), dl); } @@ -849,7 +850,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { // See if we can do any tail merging on those. if (MergePotentials.size() >= 2) - MadeChange |= TryTailMergeBlocks(NULL, NULL); + MadeChange |= TryTailMergeBlocks(nullptr, nullptr); // Look at blocks (IBB) with multiple predecessors (PBB). // We change each predecessor to a canonical form, by @@ -896,7 +897,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { if (PBB->getLandingPadSuccessor()) continue; - MachineBasicBlock *TBB = 0, *FBB = 0; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector Cond; if (!TII->AnalyzeBranch(*PBB, TBB, FBB, Cond, true)) { // Failing case: IBB is the target of a cbr, and we cannot reverse the @@ -915,10 +916,10 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { // a bit in the edge so we didn't have to do all this. if (IBB->isLandingPad()) { MachineFunction::iterator IP = PBB; IP++; - MachineBasicBlock *PredNextBB = NULL; + MachineBasicBlock *PredNextBB = nullptr; if (IP != MF.end()) PredNextBB = IP; - if (TBB == NULL) { + if (!TBB) { if (IBB != PredNextBB) // fallthrough continue; } else if (FBB) { @@ -939,7 +940,8 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { TII->RemoveBranch(*PBB); if (!Cond.empty()) // reinsert conditional branch only, for now - TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, 0, NewCond, dl); + TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, nullptr, + NewCond, dl); } MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P)); @@ -1099,7 +1101,7 @@ ReoptimizeBlock: // one. 
MachineBasicBlock &PrevBB = *std::prev(MachineFunction::iterator(MBB)); - MachineBasicBlock *PriorTBB = 0, *PriorFBB = 0; + MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr; SmallVector PriorCond; bool PriorUnAnalyzable = TII->AnalyzeBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, true); @@ -1116,7 +1118,7 @@ ReoptimizeBlock: TII->RemoveBranch(PrevBB); PriorCond.clear(); if (PriorTBB != MBB) - TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond, dl); + TII->InsertBranch(PrevBB, PriorTBB, nullptr, PriorCond, dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1160,7 +1162,7 @@ ReoptimizeBlock: // If the previous branch *only* branches to *this* block (conditional or // not) remove the branch. - if (PriorTBB == MBB && PriorFBB == 0) { + if (PriorTBB == MBB && !PriorFBB) { TII->RemoveBranch(PrevBB); MadeChange = true; ++NumBranchOpts; @@ -1172,7 +1174,7 @@ ReoptimizeBlock: if (PriorFBB == MBB) { DebugLoc dl = getBranchDebugLoc(PrevBB); TII->RemoveBranch(PrevBB); - TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond, dl); + TII->InsertBranch(PrevBB, PriorTBB, nullptr, PriorCond, dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1186,7 +1188,7 @@ ReoptimizeBlock: if (!TII->ReverseBranchCondition(NewPriorCond)) { DebugLoc dl = getBranchDebugLoc(PrevBB); TII->RemoveBranch(PrevBB); - TII->InsertBranch(PrevBB, PriorFBB, 0, NewPriorCond, dl); + TII->InsertBranch(PrevBB, PriorFBB, nullptr, NewPriorCond, dl); MadeChange = true; ++NumBranchOpts; goto ReoptimizeBlock; @@ -1201,7 +1203,7 @@ ReoptimizeBlock: // We consider it more likely that execution will stay in the function (e.g. // due to loops) than it is to exit it. This asserts in loops etc, moving // the assert condition out of the loop body. - if (MBB->succ_empty() && !PriorCond.empty() && PriorFBB == 0 && + if (MBB->succ_empty() && !PriorCond.empty() && !PriorFBB && MachineFunction::iterator(PriorTBB) == FallThrough && !MBB->canFallThrough()) { bool DoTransform = true; @@ -1224,7 +1226,7 @@ ReoptimizeBlock: DebugLoc dl = getBranchDebugLoc(PrevBB); TII->RemoveBranch(PrevBB); - TII->InsertBranch(PrevBB, MBB, 0, NewPriorCond, dl); + TII->InsertBranch(PrevBB, MBB, nullptr, NewPriorCond, dl); // Move this block to the end of the function. MBB->moveAfter(--MF.end()); @@ -1237,7 +1239,7 @@ ReoptimizeBlock: } // Analyze the branch in the current block. - MachineBasicBlock *CurTBB = 0, *CurFBB = 0; + MachineBasicBlock *CurTBB = nullptr, *CurFBB = nullptr; SmallVector CurCond; bool CurUnAnalyzable= TII->AnalyzeBranch(*MBB, CurTBB, CurFBB, CurCond, true); if (!CurUnAnalyzable) { @@ -1263,7 +1265,7 @@ ReoptimizeBlock: // If this branch is the only thing in its block, see if we can forward // other blocks across it. - if (CurTBB && CurCond.empty() && CurFBB == 0 && + if (CurTBB && CurCond.empty() && !CurFBB && IsBranchOnlyBlock(MBB) && CurTBB != MBB && !MBB->hasAddressTaken()) { DebugLoc dl = getBranchDebugLoc(*MBB); @@ -1301,12 +1303,12 @@ ReoptimizeBlock: // explicit branch to us to make updates simpler. 
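  // (Concretely: when PrevBB merely falls through into MBB, the code below
  // records MBB as PrevBB's taken or fall-through target and inserts an
  // explicit branch, so that later moving or merging MBB only needs a local
  // terminator update in PrevBB.)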
if (!PredHasNoFallThrough && PrevBB.isSuccessor(MBB) && PriorTBB != MBB && PriorFBB != MBB) { - if (PriorTBB == 0) { - assert(PriorCond.empty() && PriorFBB == 0 && + if (!PriorTBB) { + assert(PriorCond.empty() && !PriorFBB && "Bad branch analysis"); PriorTBB = MBB; } else { - assert(PriorFBB == 0 && "Machine CFG out of date!"); + assert(!PriorFBB && "Machine CFG out of date!"); PriorFBB = MBB; } DebugLoc pdl = getBranchDebugLoc(PrevBB); @@ -1330,7 +1332,7 @@ ReoptimizeBlock: // If this change resulted in PMBB ending in a conditional // branch where both conditions go to the same destination, // change this to an unconditional branch (and fix the CFG). - MachineBasicBlock *NewCurTBB = 0, *NewCurFBB = 0; + MachineBasicBlock *NewCurTBB = nullptr, *NewCurFBB = nullptr; SmallVector NewCurCond; bool NewCurUnAnalyzable = TII->AnalyzeBranch(*PMBB, NewCurTBB, NewCurFBB, NewCurCond, true); @@ -1338,10 +1340,10 @@ ReoptimizeBlock: DebugLoc pdl = getBranchDebugLoc(*PMBB); TII->RemoveBranch(*PMBB); NewCurCond.clear(); - TII->InsertBranch(*PMBB, NewCurTBB, 0, NewCurCond, pdl); + TII->InsertBranch(*PMBB, NewCurTBB, nullptr, NewCurCond, pdl); MadeChange = true; ++NumBranchOpts; - PMBB->CorrectExtraCFGEdges(NewCurTBB, 0, false); + PMBB->CorrectExtraCFGEdges(NewCurTBB, nullptr, false); } } } @@ -1358,7 +1360,7 @@ ReoptimizeBlock: } // Add the branch back if the block is more than just an uncond branch. - TII->InsertBranch(*MBB, CurTBB, 0, CurCond, dl); + TII->InsertBranch(*MBB, CurTBB, nullptr, CurCond, dl); } } @@ -1379,7 +1381,7 @@ ReoptimizeBlock: // Analyze the branch at the end of the pred. MachineBasicBlock *PredBB = *PI; MachineFunction::iterator PredFallthrough = PredBB; ++PredFallthrough; - MachineBasicBlock *PredTBB = 0, *PredFBB = 0; + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; SmallVector PredCond; if (PredBB != MBB && !PredBB->canFallThrough() && !TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true) @@ -1399,7 +1401,7 @@ ReoptimizeBlock: MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); CurCond.clear(); - TII->InsertBranch(*MBB, NextBB, 0, CurCond, DebugLoc()); + TII->InsertBranch(*MBB, NextBB, nullptr, CurCond, DebugLoc()); } MBB->moveAfter(PredBB); MadeChange = true; @@ -1432,7 +1434,7 @@ ReoptimizeBlock: // Okay, there is no really great place to put this block. If, however, // the block before this one would be a fall-through if this block were // removed, move this block to the end of the function. - MachineBasicBlock *PrevTBB = 0, *PrevFBB = 0; + MachineBasicBlock *PrevTBB = nullptr, *PrevFBB = nullptr; SmallVector PrevCond; if (FallThrough != MF.end() && !TII->AnalyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) && @@ -1473,7 +1475,7 @@ static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB, if (SuccBB != TrueBB) return SuccBB; } - return NULL; + return nullptr; } /// findHoistingInsertPosAndDeps - Find the location to move common instructions @@ -1547,7 +1549,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, // Also avoid moving code above predicated instruction since it's hard to // reason about register liveness with predicated instruction. 
bool DontMoveAcrossStore = true; - if (!PI->isSafeToMove(TII, 0, DontMoveAcrossStore) || + if (!PI->isSafeToMove(TII, nullptr, DontMoveAcrossStore) || TII->isPredicated(PI)) return MBB->end(); @@ -1581,7 +1583,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, /// sequence at the start of the function, move the instructions before MBB /// terminator if it's legal. bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { - MachineBasicBlock *TBB = 0, *FBB = 0; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector Cond; if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, true) || !TBB || Cond.empty()) return false; @@ -1686,7 +1688,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { break; bool DontMoveAcrossStore = true; - if (!TIB->isSafeToMove(TII, 0, DontMoveAcrossStore)) + if (!TIB->isSafeToMove(TII, nullptr, DontMoveAcrossStore)) break; // Remove kills from LocalDefsSet, these registers had short live ranges. diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 8943cb1..0b492a9 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_library(LLVMCodeGen AggressiveAntiDepBreaker.cpp AllocationOrder.cpp Analysis.cpp + AtomicExpandLoadLinkedPass.cpp BasicTargetTransformInfo.cpp BranchFolding.cpp CalcSpillWeights.cpp diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp index 4833731..bc033f9 100644 --- a/lib/CodeGen/CalcSpillWeights.cpp +++ b/lib/CodeGen/CalcSpillWeights.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "calcspillweights" - #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" @@ -22,6 +20,8 @@ #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "calcspillweights" + void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS, MachineFunction &MF, const MachineLoopInfo &MLI, @@ -96,8 +96,8 @@ void VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) { MachineRegisterInfo &mri = MF.getRegInfo(); const TargetRegisterInfo &tri = *MF.getTarget().getRegisterInfo(); - MachineBasicBlock *mbb = 0; - MachineLoop *loop = 0; + MachineBasicBlock *mbb = nullptr; + MachineLoop *loop = nullptr; bool isExiting = false; float totalWeight = 0; SmallPtrSet visited; @@ -149,7 +149,11 @@ VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) { unsigned hint = copyHint(mi, li.reg, tri, mri); if (!hint) continue; - float hweight = Hint[hint] += weight; + // Force hweight onto the stack so that x86 doesn't add hidden precision, + // making the comparison incorrectly pass (i.e., 1 > 1 == true??). + // + // FIXME: we probably shouldn't use floats at all. 
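+  // A minimal sketch of the failure mode (values illustrative):
+  //   float a = Hint[h] + w;   // x87 may keep the sum in an 80-bit register
+  //   if (a > best) ...        // compares the 80-bit temporary with a float
+  // The comparison can succeed even though both sides round to the same
+  // 32-bit value; 'volatile' forces the store-and-reload that rounds first.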
+ volatile float hweight = Hint[hint] += weight; if (TargetRegisterInfo::isPhysicalRegister(hint)) { if (hweight > bestPhys && mri.isAllocatable(hint)) bestPhys = hweight, hintPhys = hint; diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp index fcfc9dc..add861a 100644 --- a/lib/CodeGen/CallingConvLower.cpp +++ b/lib/CodeGen/CallingConvLower.cpp @@ -76,7 +76,7 @@ CCState::AnalyzeFormalArguments(const SmallVectorImpl &Ins, dbgs() << "Formal argument #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << '\n'; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } } } @@ -108,7 +108,7 @@ void CCState::AnalyzeReturn(const SmallVectorImpl &Outs, dbgs() << "Return operand #" << i << " has unhandled type " << EVT(VT).getEVTString() << '\n'; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } } } @@ -126,7 +126,7 @@ void CCState::AnalyzeCallOperands(const SmallVectorImpl &Outs, dbgs() << "Call operand #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << '\n'; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } } } @@ -145,7 +145,7 @@ void CCState::AnalyzeCallOperands(SmallVectorImpl &ArgVTs, dbgs() << "Call operand #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << '\n'; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } } } @@ -162,7 +162,7 @@ void CCState::AnalyzeCallResult(const SmallVectorImpl &Ins, dbgs() << "Call result #" << i << " has unhandled type " << EVT(VT).getEVTString() << '\n'; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } } } @@ -175,6 +175,6 @@ void CCState::AnalyzeCallResult(MVT VT, CCAssignFn Fn) { dbgs() << "Call result has unhandled type " << EVT(VT).getEVTString() << '\n'; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } } diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index 17402f0..b3beac3 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -20,6 +20,7 @@ using namespace llvm; /// initializeCodeGen - Initialize all passes linked into the CodeGen library. 
void llvm::initializeCodeGen(PassRegistry &Registry) { + initializeAtomicExpandLoadLinkedPass(Registry); initializeBasicTTIPass(Registry); initializeBranchFolderPassPass(Registry); initializeCodeGenPreparePass(Registry); diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index e82a306..6aa60c6 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "codegenprepare" #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" @@ -39,6 +38,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" @@ -46,6 +46,8 @@ using namespace llvm; using namespace llvm::PatternMatch; +#define DEBUG_TYPE "codegenprepare" + STATISTIC(NumBlocksElim, "Number of blocks eliminated"); STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated"); STATISTIC(NumGEPsElim, "Number of GEPs converted to casts"); @@ -70,6 +72,10 @@ static cl::opt DisableSelectToBranch( "disable-cgp-select2branch", cl::Hidden, cl::init(false), cl::desc("Disable select to branch conversion.")); +static cl::opt AddrSinkUsingGEPs( + "addr-sink-using-gep", cl::Hidden, cl::init(false), + cl::desc("Address sinking in CGP using GEPs.")); + static cl::opt EnableAndCmpSinking( "enable-andcmp-sinking", cl::Hidden, cl::init(true), cl::desc("Enable sinking and/cmp into branches.")); @@ -111,8 +117,8 @@ typedef DenseMap InstrToOrigTy; public: static char ID; // Pass identification, replacement for typeid - explicit CodeGenPrepare(const TargetMachine *TM = 0) - : FunctionPass(ID), TM(TM), TLI(0) { + explicit CodeGenPrepare(const TargetMachine *TM = nullptr) + : FunctionPass(ID), TM(TM), TLI(nullptr) { initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -177,7 +183,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { TLInfo = &getAnalysis(); DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable(); - DT = DTWP ? &DTWP->getDomTree() : 0; + DT = DTWP ? &DTWP->getDomTree() : nullptr; OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); @@ -623,6 +629,187 @@ static bool OptimizeCmpExpression(CmpInst *CI) { return MadeChange; } +/// isExtractBitsCandidateUse - Check if the candidates could +/// be combined with a shift instruction, which includes: +/// 1. Truncate instruction +/// 2. And instruction where the imm is a mask of the low bits: +/// imm & (imm+1) == 0 +static bool isExtractBitsCandidateUse(Instruction *User) { + if (!isa(User)) { + if (User->getOpcode() != Instruction::And || + !isa(User->getOperand(1))) + return false; + + const APInt &Cimm = cast(User->getOperand(1))->getValue(); + + if ((Cimm & (Cimm + 1)).getBoolValue()) + return false; + } + return true; + } + +/// SinkShiftAndTruncate - sink both shift and truncate instructions +/// to the BB of the truncate's use.
+static bool +SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI, + DenseMap &InsertedShifts, + const TargetLowering &TLI) { + BasicBlock *UserBB = User->getParent(); + DenseMap InsertedTruncs; + TruncInst *TruncI = dyn_cast(User); + bool MadeChange = false; + + for (Value::user_iterator TruncUI = TruncI->user_begin(), + TruncE = TruncI->user_end(); + TruncUI != TruncE;) { + + Use &TruncTheUse = TruncUI.getUse(); + Instruction *TruncUser = cast(*TruncUI); + // Preincrement use iterator so we don't invalidate it. + + ++TruncUI; + + int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode()); + if (!ISDOpcode) + continue; + + // If the use is actually a legal node, there will not be an implicit + // truncate. + if (TLI.isOperationLegalOrCustom(ISDOpcode, + EVT::getEVT(TruncUser->getType()))) + continue; + + // Don't bother for PHI nodes. + if (isa(TruncUser)) + continue; + + BasicBlock *TruncUserBB = TruncUser->getParent(); + + if (UserBB == TruncUserBB) + continue; + + BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB]; + CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB]; + + if (!InsertedShift && !InsertedTrunc) { + BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt(); + // Sink the shift + if (ShiftI->getOpcode() == Instruction::AShr) + InsertedShift = + BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt); + else + InsertedShift = + BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt); + + // Sink the trunc + BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt(); + TruncInsertPt++; + + InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift, + TruncI->getType(), "", TruncInsertPt); + + MadeChange = true; + + TruncTheUse = InsertedTrunc; + } + } + return MadeChange; +} + +/// OptimizeExtractBits - sink the shift *right* instruction into user blocks if +/// the uses could potentially be combined with this shift instruction and +/// generate a BitExtract instruction. It will only be applied if the architecture +/// supports BitExtract instructions. Here is an example: +/// BB1: +/// %x.extract.shift = lshr i64 %arg1, 32 +/// BB2: +/// %x.extract.trunc = trunc i64 %x.extract.shift to i16 +/// ==> +/// +/// BB2: +/// %x.extract.shift.1 = lshr i64 %arg1, 32 +/// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16 +/// +/// CodeGen will recognize the pattern in BB2 and generate a BitExtract +/// instruction. +/// Return true if any changes are made. +static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, + const TargetLowering &TLI) { + BasicBlock *DefBB = ShiftI->getParent(); + + /// Only insert instructions in each block once. + DenseMap InsertedShifts; + + bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(ShiftI->getType())); + + bool MadeChange = false; + for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end(); + UI != E;) { + Use &TheUse = UI.getUse(); + Instruction *User = cast(*UI); + // Preincrement use iterator so we don't invalidate it. + ++UI; + + // Don't bother for PHI nodes. + if (isa(User)) + continue; + + if (!isExtractBitsCandidateUse(User)) + continue; + + BasicBlock *UserBB = User->getParent(); + + if (UserBB == DefBB) { + // If the shift and truncate instructions are in the same BB, the use of + // the truncate (TruncUse) may still introduce another truncate if not + // legal. In this case, we would like to sink both shift and truncate + // instructions to the BB of TruncUse.
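// Illustrative sketch (not from the patch): what the mask test and the
// sinking buy. (Cimm & (Cimm + 1)) == 0 accepts exactly the low-bit masks
// 2^N - 1, and once the lshr sits next to its trunc/and use, targets whose
// hasExtractBitsInsn() hook is true can select the pair as one bit-field
// extract (UBFX on AArch64, for example). Assumed standalone helpers:
#include <cstdint>

static bool isLowBitMask(uint64_t Imm) {
  // Imm == 2^N - 1 makes Imm + 1 a power of two (or 0 on wrap for ~0),
  // so the AND clears every bit: 0x0, 0x1, 0xFF pass; 0xF0, 0x101 fail.
  return (Imm & (Imm + 1)) == 0;
}

static uint16_t hi_half_low_word(uint64_t X) {
  // Bits [32, 47] of X: with shift and truncate adjacent, ISel typically
  // emits a single UBFX-style instruction for this whole expression.
  return static_cast<uint16_t>(X >> 32);
}
// The function's own IR walkthrough of the same-BB case continues below.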
+ // for example: + // BB1: + // i64 shift.result = lshr i64 opnd, imm + // trunc.result = trunc shift.result to i16 + // + // BB2: + // ----> We will have an implicit truncate here if the architecture does + // not have an i16 compare. + // cmp i16 trunc.result, opnd2 + // + if (isa(User) && shiftIsLegal + // If the type of the truncate is legal, no truncate will be + // introduced in other basic blocks. + && (!TLI.isTypeLegal(TLI.getValueType(User->getType())))) + MadeChange = + SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI); + + continue; + } + // If we have already inserted a shift into this block, use it. + BinaryOperator *&InsertedShift = InsertedShifts[UserBB]; + + if (!InsertedShift) { + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); + + if (ShiftI->getOpcode() == Instruction::AShr) + InsertedShift = + BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI, "", InsertPt); + else + InsertedShift = + BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI, "", InsertPt); + + MadeChange = true; + } + + // Replace a use of the shift with a use of the new shift. + TheUse = InsertedShift; + } + + // If we removed all uses, nuke the shift. + if (ShiftI->use_empty()) + ShiftI->eraseFromParent(); + + return MadeChange; +} + namespace { class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls { protected: @@ -671,8 +858,9 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // happens. WeakVH IterHandle(CurInstIterator); - replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getDataLayout() : 0, - TLInfo, ModifiedDT ? 0 : DT); + replaceAndRecursivelySimplify(CI, RetVal, + TLI ? TLI->getDataLayout() : nullptr, + TLInfo, ModifiedDT ? nullptr : DT); // If the iterator instruction was recursively deleted, start over at the // start of the block. @@ -693,10 +881,10 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { } // From here on out we're working with named functions. - if (CI->getCalledFunction() == 0) return false; + if (!CI->getCalledFunction()) return false; // We'll need DataLayout from here on out. - const DataLayout *TD = TLI ? TLI->getDataLayout() : 0; + const DataLayout *TD = TLI ? TLI->getDataLayout() : nullptr; if (!TD) return false; // Lower all default uses of _chk calls. This is very similar @@ -746,8 +934,8 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { if (!RI) return false; - PHINode *PN = 0; - BitCastInst *BCI = 0; + PHINode *PN = nullptr; + BitCastInst *BCI = nullptr; Value *V = RI->getReturnValue(); if (V) { BCI = dyn_cast(V); @@ -862,7 +1050,7 @@ namespace { struct ExtAddrMode : public TargetLowering::AddrMode { Value *BaseReg; Value *ScaledReg; - ExtAddrMode() : BaseReg(0), ScaledReg(0) {} + ExtAddrMode() : BaseReg(nullptr), ScaledReg(nullptr) {} void print(raw_ostream &OS) const; void dump() const; @@ -1189,10 +1377,10 @@ class TypePromotionTransaction { public: /// \brief Remove all references of \p Inst and optionally replace all its /// uses with New. - /// \pre If !Inst->use_empty(), then New != NULL - InstructionRemover(Instruction *Inst, Value *New = NULL) + /// \pre If !Inst->use_empty(), then New != nullptr + InstructionRemover(Instruction *Inst, Value *New = nullptr) : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst), - Replacer(NULL) { + Replacer(nullptr) { if (New) Replacer = new UsesReplacer(Inst, New); DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n"); @@ -1232,7 +1420,7 @@ public: /// Same as Instruction::setOperand.
void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal); /// Same as Instruction::eraseFromParent. - void eraseInstruction(Instruction *Inst, Value *NewVal = NULL); + void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr); /// Same as Value::replaceAllUsesWith. void replaceAllUsesWith(Instruction *Inst, Value *New); /// Same as Value::mutateType. @@ -1245,84 +1433,75 @@ public: void moveBefore(Instruction *Inst, Instruction *Before); /// @} - ~TypePromotionTransaction(); - private: /// The ordered list of actions made so far. - SmallVector Actions; - typedef SmallVectorImpl::iterator CommitPt; + SmallVector, 16> Actions; + typedef SmallVectorImpl>::iterator CommitPt; }; void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx, Value *NewVal) { Actions.push_back( - new TypePromotionTransaction::OperandSetter(Inst, Idx, NewVal)); + make_unique(Inst, Idx, NewVal)); } void TypePromotionTransaction::eraseInstruction(Instruction *Inst, Value *NewVal) { Actions.push_back( - new TypePromotionTransaction::InstructionRemover(Inst, NewVal)); + make_unique(Inst, NewVal)); } void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst, Value *New) { - Actions.push_back(new TypePromotionTransaction::UsesReplacer(Inst, New)); + Actions.push_back(make_unique(Inst, New)); } void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) { - Actions.push_back(new TypePromotionTransaction::TypeMutator(Inst, NewTy)); + Actions.push_back(make_unique(Inst, NewTy)); } Instruction *TypePromotionTransaction::createTrunc(Instruction *Opnd, Type *Ty) { - TruncBuilder *TB = new TruncBuilder(Opnd, Ty); - Actions.push_back(TB); - return TB->getBuiltInstruction(); + std::unique_ptr Ptr(new TruncBuilder(Opnd, Ty)); + Instruction *I = Ptr->getBuiltInstruction(); + Actions.push_back(std::move(Ptr)); + return I; } Instruction *TypePromotionTransaction::createSExt(Instruction *Inst, Value *Opnd, Type *Ty) { - SExtBuilder *SB = new SExtBuilder(Inst, Opnd, Ty); - Actions.push_back(SB); - return SB->getBuiltInstruction(); + std::unique_ptr Ptr(new SExtBuilder(Inst, Opnd, Ty)); + Instruction *I = Ptr->getBuiltInstruction(); + Actions.push_back(std::move(Ptr)); + return I; } void TypePromotionTransaction::moveBefore(Instruction *Inst, Instruction *Before) { Actions.push_back( - new TypePromotionTransaction::InstructionMoveBefore(Inst, Before)); + make_unique(Inst, Before)); } TypePromotionTransaction::ConstRestorationPt TypePromotionTransaction::getRestorationPoint() const { - return Actions.rbegin() != Actions.rend() ? *Actions.rbegin() : NULL; + return !Actions.empty() ? Actions.back().get() : nullptr; } void TypePromotionTransaction::commit() { for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt; - ++It) { + ++It) (*It)->commit(); - delete *It; - } Actions.clear(); } void TypePromotionTransaction::rollback( TypePromotionTransaction::ConstRestorationPt Point) { - while (!Actions.empty() && Point != (*Actions.rbegin())) { - TypePromotionAction *Curr = Actions.pop_back_val(); + while (!Actions.empty() && Point != Actions.back().get()) { + std::unique_ptr Curr = Actions.pop_back_val(); Curr->undo(); - delete Curr; } } -TypePromotionTransaction::~TypePromotionTransaction() { - for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt; ++It) - delete *It; - Actions.clear(); -} - /// \brief A helper class for matching addressing modes. /// /// This encapsulates the logic for matching the target-legal addressing modes. 
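// Illustrative sketch (not from the patch): the ownership pattern the
// TypePromotionTransaction hunks just above adopt. Once the container owns
// the actions through unique_ptr, the explicit destructor and the manual
// deletes in commit()/rollback() all disappear. Minimal C++11, with every
// name invented:
#include <memory>
#include <vector>

struct Action {
  virtual ~Action() {}
  virtual void commit() = 0;
  virtual void undo() = 0;
};

class Transaction {
  std::vector<std::unique_ptr<Action>> Actions; // sole owner of the actions

public:
  void record(std::unique_ptr<Action> A) { Actions.push_back(std::move(A)); }

  const Action *restorePoint() const { // mirrors getRestorationPoint()
    return Actions.empty() ? nullptr : Actions.back().get();
  }

  void commit() {
    for (auto &A : Actions)
      A->commit();
    Actions.clear(); // unique_ptr destroys each action; no delete loop
  }

  void rollback(const Action *Point) { // Point == nullptr undoes everything
    while (!Actions.empty() && Actions.back().get() != Point) {
      Actions.back()->undo();
      Actions.pop_back(); // the action object is destroyed here
    }
  }
};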
@@ -1390,7 +1569,7 @@ private: bool MatchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth); bool MatchAddr(Value *V, unsigned Depth); bool MatchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth, - bool *MovedAway = NULL); + bool *MovedAway = nullptr); bool IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter); @@ -1435,7 +1614,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now // to see if ScaleReg is actually X+C. If so, we can turn this into adding // X*Scale + C*Scale to addr mode. - ConstantInt *CI = 0; Value *AddLHS = 0; + ConstantInt *CI = nullptr; Value *AddLHS = nullptr; if (isa(ScaleReg) && // not a constant expr. match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) { TestAddrMode.ScaledReg = AddLHS; @@ -1461,6 +1640,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, static bool MightBeFoldableInst(Instruction *I) { switch (I->getOpcode()) { case Instruction::BitCast: + case Instruction::AddrSpaceCast: // Don't touch identity bitcasts. if (I->getType() == I->getOperand(0)->getType()) return false; @@ -1612,13 +1792,13 @@ TypePromotionHelper::Action TypePromotionHelper::getAction( // get through. // If it is, check we can get through. if (!SExtOpnd || !canGetThrough(SExtOpnd, SExtTy, PromotedInsts)) - return NULL; + return nullptr; // Do not promote if the operand has been added by codegenprepare. // Otherwise, it means we are undoing an optimization that is likely to be // redone, thus causing a potential infinite loop. if (isa(SExtOpnd) && InsertedTruncs.count(SExtOpnd)) - return NULL; + return nullptr; // SExt or Trunc instructions. // Return the related handler. @@ -1629,7 +1809,7 @@ TypePromotionHelper::Action TypePromotionHelper::getAction( // Abort early if we will have to insert non-free instructions. if (!SExtOpnd->hasOneUse() && !TLI.isTruncateFree(SExtTy, SExtOpnd->getType())) - return NULL; + return nullptr; return promoteOperandForOther; } @@ -1740,7 +1920,7 @@ TypePromotionHelper::promoteOperandForOther(Instruction *SExt, TPT.moveBefore(SExtForOpnd, SExtOpnd); TPT.setOperand(SExtOpnd, OpIdx, SExtForOpnd); // If more sexts are required, new instructions will have to be created. - SExtForOpnd = NULL; + SExtForOpnd = nullptr; } if (SExtForOpnd == SExt) { DEBUG(dbgs() << "Sign extension is useless now\n"); @@ -1815,6 +1995,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, return MatchAddr(AddrInst->getOperand(0), Depth); return false; case Instruction::BitCast: + case Instruction::AddrSpaceCast: // BitCast is always a noop, and we can handle it as long as it is // int->int or pointer->pointer (we don't want int<->fp or something). if ((AddrInst->getOperand(0)->getType()->isPointerTy() || @@ -2022,11 +2203,11 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { AddrMode.BaseOffs -= CI->getSExtValue(); } else if (GlobalValue *GV = dyn_cast(Addr)) { // If this is a global variable, try to fold it into the addressing mode.
- if (AddrMode.BaseGV == 0) { + if (!AddrMode.BaseGV) { AddrMode.BaseGV = GV; if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) return true; - AddrMode.BaseGV = 0; + AddrMode.BaseGV = nullptr; } } else if (Instruction *I = dyn_cast(Addr)) { ExtAddrMode BackupAddrMode = AddrMode; @@ -2071,7 +2252,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) return true; AddrMode.HasBaseReg = false; - AddrMode.BaseReg = 0; + AddrMode.BaseReg = nullptr; } // If the base register is already taken, see if we can do [r+r]. @@ -2081,7 +2262,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) return true; AddrMode.Scale = 0; - AddrMode.ScaledReg = 0; + AddrMode.ScaledReg = nullptr; } // Couldn't match. TPT.rollback(LastKnownGood); @@ -2166,7 +2347,7 @@ static bool FindAllMemoryUses(Instruction *I, bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2) { // If Val is either of the known-live values, we know it is live! - if (Val == 0 || Val == KnownLive1 || Val == KnownLive2) + if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2) return true; // All values other than instructions and arguments (e.g. constants) are live. @@ -2225,13 +2406,13 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, // If the BaseReg or ScaledReg was referenced by the previous addrmode, their // lifetime wasn't extended by adding this instruction. if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) - BaseReg = 0; + BaseReg = nullptr; if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg)) - ScaledReg = 0; + ScaledReg = nullptr; // If folding this instruction (and its subexprs) didn't extend any live // ranges, we're ok with it. - if (BaseReg == 0 && ScaledReg == 0) + if (!BaseReg && !ScaledReg) return true; // If all uses of this instruction are ultimately load/store/inlineasm's, @@ -2320,7 +2501,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Use a worklist to iteratively look through PHI nodes, and ensure that // the addressing modes obtained from the non-PHI roots of the graph // are equivalent. - Value *Consensus = 0; + Value *Consensus = nullptr; unsigned NumUsesConsensus = 0; bool IsNumUsesConsensusValid = false; SmallVector AddrModeInsts; @@ -2334,7 +2515,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Break use-def graph loops. if (!Visited.insert(V)) { - Consensus = 0; + Consensus = nullptr; break; } @@ -2380,7 +2561,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, continue; } - Consensus = 0; + Consensus = nullptr; break; } @@ -2420,14 +2601,135 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Value *&SunkAddr = SunkAddrs[Addr]; if (SunkAddr) { DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for " - << *MemoryInst); + << *MemoryInst << "\n"); if (SunkAddr->getType() != Addr->getType()) SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); + } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() && + TM && TM->getSubtarget().useAA())) { + // By default, we use the GEP-based method when AA is used later. This + // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
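// Illustrative sketch (not from the patch): the two sinking strategies for
// an address of shape Base + Idx*8 + 16, written with the same IRBuilder
// calls the code below uses; the function names, scale, and offset are
// invented. The integer path round-trips the pointer through
// ptrtoint/inttoptr, which hides its provenance from alias analysis:
Value *sinkIntBased(IRBuilder<> &B, Value *Base, Value *Idx, Type *IntPtrTy,
                    Type *ResultTy) {
  Value *R = B.CreatePtrToInt(Base, IntPtrTy, "sunkaddr");
  R = B.CreateAdd(R, B.CreateMul(Idx, ConstantInt::get(IntPtrTy, 8)),
                  "sunkaddr");
  R = B.CreateAdd(R, ConstantInt::get(IntPtrTy, 16), "sunkaddr");
  return B.CreateIntToPtr(R, ResultTy, "sunkaddr");
}
// The GEP path (AddrSinkUsingGEPs, or the default when the subtarget uses
// AA) keeps the address a pointer expression end to end, so alias analysis
// can still see through to Base:
Value *sinkGEPBased(IRBuilder<> &B, Value *Base, Value *Idx, Type *IntPtrTy,
                    Type *I8PtrTy, Type *ResultTy) {
  Value *P = B.CreateBitCast(Base, I8PtrTy);
  Value *Off = B.CreateMul(Idx, ConstantInt::get(IntPtrTy, 8), "sunkaddr");
  Off = B.CreateAdd(Off, ConstantInt::get(IntPtrTy, 16), "sunkaddr");
  P = B.CreateGEP(P, Off, "sunkaddr");
  return B.CreateBitCast(P, ResultTy);
}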
+ DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " + << *MemoryInst << "\n"); + Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType()); + Value *ResultPtr = nullptr, *ResultIndex = nullptr; + + // First, find the pointer. + if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) { + ResultPtr = AddrMode.BaseReg; + AddrMode.BaseReg = nullptr; + } + + if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) { + // We can't add more than one pointer together, nor can we scale a + // pointer (both of which seem meaningless). + if (ResultPtr || AddrMode.Scale != 1) + return false; + + ResultPtr = AddrMode.ScaledReg; + AddrMode.Scale = 0; + } + + if (AddrMode.BaseGV) { + if (ResultPtr) + return false; + + ResultPtr = AddrMode.BaseGV; + } + + // If the real base value actually came from an inttoptr, then the matcher + // will look through it and provide only the integer value. In that case, + // use it here. + if (!ResultPtr && AddrMode.BaseReg) { + ResultPtr = + Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(), "sunkaddr"); + AddrMode.BaseReg = nullptr; + } else if (!ResultPtr && AddrMode.Scale == 1) { + ResultPtr = + Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(), "sunkaddr"); + AddrMode.Scale = 0; + } + + if (!ResultPtr && + !AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) { + SunkAddr = Constant::getNullValue(Addr->getType()); + } else if (!ResultPtr) { + return false; + } else { + Type *I8PtrTy = + Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace()); + + // Start with the base register. Do this first so that subsequent address + // matching finds it last, which will prevent it from trying to match it + // as the scaled value in case it happens to be a mul. That would be + // problematic if we've sunk a different mul for the scale, because then + // we'd end up sinking both muls. + if (AddrMode.BaseReg) { + Value *V = AddrMode.BaseReg; + if (V->getType() != IntPtrTy) + V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); + + ResultIndex = V; + } + + // Add the scale value. + if (AddrMode.Scale) { + Value *V = AddrMode.ScaledReg; + if (V->getType() == IntPtrTy) { + // done. + } else if (cast(IntPtrTy)->getBitWidth() < + cast(V->getType())->getBitWidth()) { + V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); + } else { + // It is only safe to sign extend the BaseReg if we know that the math + // required to create it did not overflow before we extend it. Since + // the original IR value was tossed in favor of a constant back when + // the AddrMode was created we need to bail out gracefully if widths + // do not match instead of extending it. + Instruction *I = dyn_cast_or_null(ResultIndex); + if (I && (ResultIndex != AddrMode.BaseReg)) + I->eraseFromParent(); + return false; + } + + if (AddrMode.Scale != 1) + V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), + "sunkaddr"); + if (ResultIndex) + ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr"); + else + ResultIndex = V; + } + + // Add in the Base Offset if present. + if (AddrMode.BaseOffs) { + Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); + if (ResultIndex) { + // We need to add this separately from the scale above to help with + // SDAG consecutive load/store merging. 
+ if (ResultPtr->getType() != I8PtrTy) + ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); + ResultPtr = Builder.CreateGEP(ResultPtr, ResultIndex, "sunkaddr"); + } + + ResultIndex = V; + } + + if (!ResultIndex) { + SunkAddr = ResultPtr; + } else { + if (ResultPtr->getType() != I8PtrTy) + ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); + SunkAddr = Builder.CreateGEP(ResultPtr, ResultIndex, "sunkaddr"); + } + + if (SunkAddr->getType() != Addr->getType()) + SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); + } } else { DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " - << *MemoryInst); + << *MemoryInst << "\n"); Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType()); - Value *Result = 0; + Value *Result = nullptr; // Start with the base register. Do this first so that subsequent address // matching finds it last, which will prevent it from trying to match it @@ -2459,8 +2761,9 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // the original IR value was tossed in favor of a constant back when // the AddrMode was created we need to bail out gracefully if widths // do not match instead of extending it. - if (Result != AddrMode.BaseReg) - cast(Result)->eraseFromParent(); + Instruction *I = dyn_cast_or_null(Result); + if (I && (Result != AddrMode.BaseReg)) + I->eraseFromParent(); return false; } if (AddrMode.Scale != 1) @@ -2490,7 +2793,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Result = V; } - if (Result == 0) + if (!Result) SunkAddr = Constant::getNullValue(Addr->getType()); else SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr"); @@ -2815,7 +3118,7 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { // It is possible for very late stage optimizations (such as SimplifyCFG) // to introduce PHI nodes too late to be cleaned up. If we detect such a // trivial PHI, go ahead and zap it here. - if (Value *V = SimplifyInstruction(P, TLI ? TLI->getDataLayout() : 0, + if (Value *V = SimplifyInstruction(P, TLI ? TLI->getDataLayout() : nullptr, TLInfo, DT)) { P->replaceAllUsesWith(V); P->eraseFromParent(); @@ -2870,6 +3173,17 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { return false; } + BinaryOperator *BinOp = dyn_cast(I); + + if (BinOp && (BinOp->getOpcode() == Instruction::AShr || + BinOp->getOpcode() == Instruction::LShr)) { + ConstantInt *CI = dyn_cast(BinOp->getOperand(1)); + if (TLI && CI && TLI->hasExtractBitsInsn()) + return OptimizeExtractBits(BinOp, CI, *TLI); + + return false; + } + if (GetElementPtrInst *GEPI = dyn_cast(I)) { if (GEPI->hasAllZeroIndices()) { /// The GEP operand must be a pointer, so must its result -> BitCast @@ -2918,11 +3232,16 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { bool CodeGenPrepare::PlaceDbgValues(Function &F) { bool MadeChange = false; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { - Instruction *PrevNonDbgInst = NULL; + Instruction *PrevNonDbgInst = nullptr; for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { Instruction *Insn = BI; ++BI; DbgValueInst *DVI = dyn_cast(Insn); - if (!DVI) { + // Leave dbg.values that refer to an alloca alone. These + // intrinsics describe the address of a variable (= the alloca) + // being taken. They should not be moved next to the alloca + // (and to the beginning of the scope), but rather stay close to + // where said address is used.
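// Illustrative sketch (not from the patch): the guard on the next line
// with the template argument the plain-text extraction dropped restored,
// pulled out as a hypothetical helper:
static bool describesAllocaAddress(DbgValueInst *DVI) {
  // An alloca operand means the intrinsic records where the variable
  // lives, not what it currently contains, so the dbg.value should stay
  // near the uses of that address instead of moving to its definition.
  return DVI->getValue() && isa<AllocaInst>(DVI->getValue());
}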
+ if (!DVI || (DVI->getValue() && isa(DVI->getValue()))) { PrevNonDbgInst = Insn; continue; } diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp index 463eb86..822636f 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "post-RA-sched" #include "CriticalAntiDepBreaker.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -26,6 +25,8 @@ using namespace llvm; +#define DEBUG_TYPE "post-RA-sched" + CriticalAntiDepBreaker:: CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo &RCI) : AntiDepBreaker(), MF(MFi), @@ -33,7 +34,7 @@ CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo &RCI) : TII(MF.getTarget().getInstrInfo()), TRI(MF.getTarget().getRegisterInfo()), RegClassInfo(RCI), - Classes(TRI->getNumRegs(), static_cast(0)), + Classes(TRI->getNumRegs(), nullptr), KillIndices(TRI->getNumRegs(), 0), DefIndices(TRI->getNumRegs(), 0), KeepRegs(TRI->getNumRegs(), false) {} @@ -45,7 +46,7 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { const unsigned BBSize = BB->size(); for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i) { // Clear out the register class data. - Classes[i] = static_cast(0); + Classes[i] = nullptr; // Initialize the indices to indicate that no registers are live. KillIndices[i] = ~0u; @@ -75,7 +76,7 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { // callee-saved register that is not saved in the prolog. const MachineFrameInfo *MFI = MF.getFrameInfo(); BitVector Pristine = MFI->getPristineRegs(BB); - for (const uint16_t *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { + for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) { if (!IsReturnBlock && !Pristine.test(*I)) continue; for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { unsigned Reg = *AI; @@ -124,7 +125,7 @@ void CriticalAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count, /// CriticalPathStep - Return the next SUnit after SU on the bottom-up /// critical path. static const SDep *CriticalPathStep(const SUnit *SU) { - const SDep *Next = 0; + const SDep *Next = nullptr; unsigned NextDepth = 0; // Find the predecessor edge with the greatest depth. for (SUnit::const_pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end(); @@ -171,7 +172,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (Reg == 0) continue; - const TargetRegisterClass *NewRC = 0; + const TargetRegisterClass *NewRC = nullptr; if (i < MI->getDesc().getNumOperands()) NewRC = TII->getRegClass(MI->getDesc(), i, TRI, MF); @@ -227,7 +228,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, DefIndices[i] = Count; KillIndices[i] = ~0u; KeepRegs.reset(i); - Classes[i] = 0; + Classes[i] = nullptr; RegRefs.erase(i); } @@ -244,7 +245,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, (DefIndices[Reg] == ~0u)) && "Kill and Def maps aren't consistent for Reg!"); KeepRegs.reset(Reg); - Classes[Reg] = 0; + Classes[Reg] = nullptr; RegRefs.erase(Reg); // Repeat, for all subregs. 
for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { @@ -252,7 +253,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, DefIndices[SubregReg] = Count; KillIndices[SubregReg] = ~0u; KeepRegs.reset(SubregReg); - Classes[SubregReg] = 0; + Classes[SubregReg] = nullptr; RegRefs.erase(SubregReg); } // Conservatively mark super-registers as unusable. @@ -267,7 +268,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, if (Reg == 0) continue; if (!MO.isUse()) continue; - const TargetRegisterClass *NewRC = 0; + const TargetRegisterClass *NewRC = nullptr; if (i < MI->getDesc().getNumOperands()) NewRC = TII->getRegClass(MI->getDesc(), i, TRI, MF); @@ -419,7 +420,7 @@ BreakAntiDependencies(const std::vector& SUnits, DenseMap MISUnitMap; // Find the node at the bottom of the critical path. - const SUnit *Max = 0; + const SUnit *Max = nullptr; for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { const SUnit *SU = &SUnits[i]; MISUnitMap[SU->getInstr()] = SU; @@ -551,8 +552,8 @@ BreakAntiDependencies(const std::vector& SUnits, CriticalPathMI = CriticalPathSU->getInstr(); } else { // We've reached the end of the critical path. - CriticalPathSU = 0; - CriticalPathMI = 0; + CriticalPathSU = nullptr; + CriticalPathMI = nullptr; } } @@ -589,8 +590,9 @@ BreakAntiDependencies(const std::vector& SUnits, // Determine AntiDepReg's register class, if it is live and is // consistently used within a single class. - const TargetRegisterClass *RC = AntiDepReg != 0 ? Classes[AntiDepReg] : 0; - assert((AntiDepReg == 0 || RC != NULL) && + const TargetRegisterClass *RC = AntiDepReg != 0 ? Classes[AntiDepReg] + : nullptr; + assert((AntiDepReg == 0 || RC != nullptr) && "Register should be live if it's causing an anti-dependence!"); if (RC == reinterpret_cast(-1)) AntiDepReg = 0; @@ -638,7 +640,7 @@ BreakAntiDependencies(const std::vector& SUnits, (DefIndices[NewReg] == ~0u)) && "Kill and Def maps aren't consistent for NewReg!"); - Classes[AntiDepReg] = 0; + Classes[AntiDepReg] = nullptr; DefIndices[AntiDepReg] = KillIndices[AntiDepReg]; KillIndices[AntiDepReg] = ~0u; assert(((KillIndices[AntiDepReg] == ~0u) != diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp index 5b40ae1..bc6e9dc 100644 --- a/lib/CodeGen/DFAPacketizer.cpp +++ b/lib/CodeGen/DFAPacketizer.cpp @@ -121,7 +121,7 @@ DefaultVLIWScheduler::DefaultVLIWScheduler( void DefaultVLIWScheduler::schedule() { // Build the scheduling graph. 
- buildSchedGraph(0); + buildSchedGraph(nullptr); } // VLIWPacketizerList Ctor @@ -129,7 +129,7 @@ VLIWPacketizerList::VLIWPacketizerList( MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT, bool IsPostRA) : TM(MF.getTarget()), MF(MF) { TII = TM.getInstrInfo(); - ResourceTracker = TII->CreateTargetScheduleState(&TM, 0); + ResourceTracker = TII->CreateTargetScheduleState(&TM, nullptr); VLIWScheduler = new DefaultVLIWScheduler(MF, MLI, MDT, IsPostRA); } diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp index aa03e77..2b144d8 100644 --- a/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "codegen-dce" #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -23,6 +22,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "codegen-dce" + STATISTIC(NumDeletes, "Number of dead instructions deleted"); namespace { @@ -59,7 +60,7 @@ bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const { // Don't delete instructions with side effects. bool SawStore = false; - if (!MI->isSafeToMove(TII, 0, SawStore) && !MI->isPHI()) + if (!MI->isSafeToMove(TII, nullptr, SawStore) && !MI->isPHI()) return false; // Examine each operand. diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp index d543baf..a195586 100644 --- a/lib/CodeGen/DwarfEHPrepare.cpp +++ b/lib/CodeGen/DwarfEHPrepare.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "dwarfehprepare" #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/CallSite.h" @@ -28,6 +27,8 @@ #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; +#define DEBUG_TYPE "dwarfehprepare" + STATISTIC(NumResumesLowered, "Number of resume calls lowered"); namespace { @@ -43,7 +44,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid. DwarfEHPrepare(const TargetMachine *TM) - : FunctionPass(ID), TM(TM), RewindFunction(0) { + : FunctionPass(ID), TM(TM), RewindFunction(nullptr) { initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry()); } @@ -68,10 +69,10 @@ FunctionPass *llvm::createDwarfEHPass(const TargetMachine *TM) { /// instructions, including the 'resume' instruction. Value *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) { Value *V = RI->getOperand(0); - Value *ExnObj = 0; + Value *ExnObj = nullptr; InsertValueInst *SelIVI = dyn_cast(V); - LoadInst *SelLoad = 0; - InsertValueInst *ExcIVI = 0; + LoadInst *SelLoad = nullptr; + InsertValueInst *ExcIVI = nullptr; bool EraseIVIs = false; if (SelIVI) { diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp index f8887ef..c470632 100644 --- a/lib/CodeGen/EarlyIfConversion.cpp +++ b/lib/CodeGen/EarlyIfConversion.cpp @@ -16,7 +16,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "early-ifcvt" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" @@ -40,6 +39,8 @@ using namespace llvm; +#define DEBUG_TYPE "early-ifcvt" + // Absolute maximum number of instructions allowed per speculated block. // This bypasses all other heuristics, so it should be set fairly high. 
static cl::opt @@ -219,7 +220,7 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) { // We never speculate stores, so an AA pointer isn't necessary. bool DontMoveAcrossStore = true; - if (!I->isSafeToMove(TII, 0, DontMoveAcrossStore)) { + if (!I->isSafeToMove(TII, nullptr, DontMoveAcrossStore)) { DEBUG(dbgs() << "Can't speculate: " << *I); return false; } @@ -338,7 +339,7 @@ bool SSAIfConv::findInsertionPoint() { /// bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) { Head = MBB; - TBB = FBB = Tail = 0; + TBB = FBB = Tail = nullptr; if (Head->succ_size() != 2) return false; @@ -463,7 +464,7 @@ void SSAIfConv::replacePHIInstrs() { TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg, PI.FReg); DEBUG(dbgs() << " --> " << *std::prev(FirstTerm)); PI.PHI->eraseFromParent(); - PI.PHI = 0; + PI.PHI = nullptr; } } @@ -564,7 +565,7 @@ void SSAIfConv::convertIf(SmallVectorImpl &RemovedBlocks) { // We need a branch to Tail, let code placement work it out later. DEBUG(dbgs() << "Converting to unconditional branch.\n"); SmallVector EmptyCond; - TII->InsertBranch(*Head, Tail, 0, EmptyCond, HeadDL); + TII->InsertBranch(*Head, Tail, nullptr, EmptyCond, HeadDL); Head->addSuccessor(Tail); } DEBUG(dbgs() << *Head); @@ -775,6 +776,12 @@ bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) { bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n" << "********** Function: " << MF.getName() << '\n'); + // Only run if-conversion if the target wants it. + if (!MF.getTarget() + .getSubtarget() + .enableEarlyIfConversion()) + return false; + TII = MF.getTarget().getInstrInfo(); TRI = MF.getTarget().getRegisterInfo(); SchedModel = @@ -783,7 +790,7 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) { DomTree = &getAnalysis(); Loops = getAnalysisIfAvailable(); Traces = &getAnalysis(); - MinInstr = 0; + MinInstr = nullptr; bool Changed = false; IfConv.runOnMachineFunction(MF); diff --git a/lib/CodeGen/EdgeBundles.cpp b/lib/CodeGen/EdgeBundles.cpp index 3bb0465..aea7c31 100644 --- a/lib/CodeGen/EdgeBundles.cpp +++ b/lib/CodeGen/EdgeBundles.cpp @@ -41,9 +41,7 @@ bool EdgeBundles::runOnMachineFunction(MachineFunction &mf) { EC.clear(); EC.grow(2 * MF->getNumBlockIDs()); - for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E; - ++I) { - const MachineBasicBlock &MBB = *I; + for (const auto &MBB : *MF) { unsigned OutE = 2 * MBB.getNumber() + 1; // Join the outgoing bundle with the incoming bundles of all successors. for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(), @@ -69,29 +67,31 @@ bool EdgeBundles::runOnMachineFunction(MachineFunction &mf) { return false; } -/// view - Visualize the annotated bipartite CFG with Graphviz. -void EdgeBundles::view() const { - ViewGraph(*this, "EdgeBundles"); -} - /// Specialize WriteGraph, the standard implementation won't work.
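// The usual reason a change takes the shape of the hunk below: before
// C++17 an explicit specialization of a function template must be declared
// inside the template's own namespace (a qualified definition at global
// scope is rejected), and it must precede its first use, which is why
// view() moves underneath it. Minimal illustration (sketch; names invented):
#include <iostream>

namespace lib {
template <class G> void write(std::ostream &OS, const G &Graph);
}

struct MyGraph {};

namespace lib {
// Spelling this at global scope as
//   template <> void lib::write<MyGraph>(std::ostream &, const MyGraph &)
// is ill-formed pre-C++17; inside the namespace it is guaranteed to work.
template <> void write<MyGraph>(std::ostream &OS, const MyGraph &) {
  OS << "digraph {}\n";
}
}

int main() {
  lib::write(std::cout, MyGraph()); // resolves to the specialization above
  return 0;
}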
-raw_ostream &llvm::WriteGraph(raw_ostream &O, const EdgeBundles &G, - bool ShortNames, - const Twine &Title) { +namespace llvm { +template<> +raw_ostream &WriteGraph<>(raw_ostream &O, const EdgeBundles &G, + bool ShortNames, + const Twine &Title) { const MachineFunction *MF = G.getMachineFunction(); O << "digraph {\n"; - for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); - I != E; ++I) { - unsigned BB = I->getNumber(); + for (const auto &MBB : *MF) { + unsigned BB = MBB.getNumber(); O << "\t\"BB#" << BB << "\" [ shape=box ]\n" << '\t' << G.getBundle(BB, false) << " -> \"BB#" << BB << "\"\n" << "\t\"BB#" << BB << "\" -> " << G.getBundle(BB, true) << '\n'; - for (MachineBasicBlock::const_succ_iterator SI = I->succ_begin(), - SE = I->succ_end(); SI != SE; ++SI) + for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(), + SE = MBB.succ_end(); SI != SE; ++SI) O << "\t\"BB#" << BB << "\" -> \"BB#" << (*SI)->getNumber() << "\" [ color=lightgray ]\n"; } O << "}\n"; return O; } +} + +/// view - Visualize the annotated bipartite CFG with Graphviz. +void EdgeBundles::view() const { + ViewGraph(*this, "EdgeBundles"); +} diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp index a08eb6b..cf55b68 100644 --- a/lib/CodeGen/ExecutionDepsFix.cpp +++ b/lib/CodeGen/ExecutionDepsFix.cpp @@ -20,7 +20,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "execution-fix" #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -33,6 +32,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "execution-fix" + /// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track /// of execution domains. /// @@ -100,7 +101,7 @@ struct DomainValue { // Clear this DomainValue and point to next which has all its data. void clear() { AvailableDomains = 0; - Next = 0; + Next = nullptr; Instrs.clear(); } }; @@ -275,7 +276,7 @@ void ExeDepsFix::kill(int rx) { return; release(LiveRegs[rx].Value); - LiveRegs[rx].Value = 0; + LiveRegs[rx].Value = nullptr; } /// Force register rx into domain. @@ -360,7 +361,7 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { // Default values are 'nothing happened a long time ago'. for (unsigned rx = 0; rx != NumRegs; ++rx) { - LiveRegs[rx].Value = 0; + LiveRegs[rx].Value = nullptr; LiveRegs[rx].Def = -(1 << 20); } @@ -404,7 +405,7 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { // We have a live DomainValue from more than one predecessor. if (LiveRegs[rx].Value->isCollapsed()) { - // We are already collapsed, but predecessor is not. Force him. + // We are already collapsed, but predecessor is not. Force it. unsigned Domain = LiveRegs[rx].Value->getFirstDomain(); if (!pdv->isCollapsed() && pdv->hasDomain(Domain)) collapse(pdv, Domain); @@ -440,7 +441,7 @@ void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) { release(LiveRegs[i].Value); delete[] LiveRegs; } - LiveRegs = 0; + LiveRegs = nullptr; } void ExeDepsFix::visitInstr(MachineInstr *MI) { @@ -664,7 +665,7 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { // doms are now sorted in order of appearance. Try to merge them all, giving // priority to the latest ones. 
- DomainValue *dv = 0; + DomainValue *dv = nullptr; while (!Regs.empty()) { if (!dv) { dv = Regs.pop_back_val().Value; @@ -714,7 +715,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { MF = &mf; TII = MF->getTarget().getInstrInfo(); TRI = MF->getTarget().getRegisterInfo(); - LiveRegs = 0; + LiveRegs = nullptr; assert(NumRegs == RC->getNumRegs() && "Bad regclass"); DEBUG(dbgs() << "********** FIX EXECUTION DEPENDENCIES: " diff --git a/lib/CodeGen/ExpandISelPseudos.cpp b/lib/CodeGen/ExpandISelPseudos.cpp index fb2e446..90b62b5 100644 --- a/lib/CodeGen/ExpandISelPseudos.cpp +++ b/lib/CodeGen/ExpandISelPseudos.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "expand-isel-pseudos" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -23,6 +22,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "expand-isel-pseudos" + namespace { class ExpandISelPseudos : public MachineFunctionPass { public: diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp index 1b0315a..8969bcc 100644 --- a/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "postrapseudos" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -25,6 +24,8 @@ #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "postrapseudos" + namespace { struct ExpandPostRA : public MachineFunctionPass { private: diff --git a/lib/CodeGen/GCMetadata.cpp b/lib/CodeGen/GCMetadata.cpp index 54b047b..c3e4f3e 100644 --- a/lib/CodeGen/GCMetadata.cpp +++ b/lib/CodeGen/GCMetadata.cpp @@ -61,10 +61,6 @@ GCModuleInfo::GCModuleInfo() initializeGCModuleInfoPass(*PassRegistry::getPassRegistry()); } -GCModuleInfo::~GCModuleInfo() { - clear(); -} - GCStrategy *GCModuleInfo::getOrCreateStrategy(const Module *M, const std::string &Name) { strategy_map_type::iterator NMI = StrategyMap.find(Name); @@ -74,17 +70,17 @@ GCStrategy *GCModuleInfo::getOrCreateStrategy(const Module *M, for (GCRegistry::iterator I = GCRegistry::begin(), E = GCRegistry::end(); I != E; ++I) { if (Name == I->getName()) { - GCStrategy *S = I->instantiate(); + std::unique_ptr S = I->instantiate(); S->M = M; S->Name = Name; - StrategyMap.GetOrCreateValue(Name).setValue(S); - StrategyList.push_back(S); - return S; + StrategyMap.GetOrCreateValue(Name).setValue(S.get()); + StrategyList.push_back(std::move(S)); + return StrategyList.back().get(); } } dbgs() << "unsupported GC: " << Name << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } GCFunctionInfo &GCModuleInfo::getFunctionInfo(const Function &F) { @@ -104,9 +100,6 @@ GCFunctionInfo &GCModuleInfo::getFunctionInfo(const Function &F) { void GCModuleInfo::clear() { FInfoMap.clear(); StrategyMap.clear(); - - for (iterator I = begin(), E = end(); I != E; ++I) - delete *I; StrategyList.clear(); } diff --git a/lib/CodeGen/GCStrategy.cpp b/lib/CodeGen/GCStrategy.cpp index b31a0f2..1fdff6b 100644 --- a/lib/CodeGen/GCStrategy.cpp +++ b/lib/CodeGen/GCStrategy.cpp @@ -101,13 +101,6 @@ GCStrategy::GCStrategy() : UsesMetadata(false) {} -GCStrategy::~GCStrategy() { - for (iterator I = begin(), E = end(); I != E; ++I) - delete *I; - - Functions.clear(); -} - bool 
GCStrategy::initializeCustomLowering(Module &M) { return false; } bool GCStrategy::performCustomLowering(Function &F) { @@ -118,14 +111,13 @@ bool GCStrategy::performCustomLowering(Function &F) { bool GCStrategy::findCustomSafePoints(GCFunctionInfo& FI, MachineFunction &F) { dbgs() << "gc " << getName() << " must override findCustomSafePoints.\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } GCFunctionInfo *GCStrategy::insertFunctionInfo(const Function &F) { - GCFunctionInfo *FI = new GCFunctionInfo(F, *this); - Functions.push_back(FI); - return FI; + Functions.push_back(make_unique(F, *this)); + return Functions.back().get(); } // ----------------------------------------------------------------------------- diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 1a18b1a..1502d5f 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "ifcvt" #include "llvm/CodeGen/Passes.h" #include "BranchFolding.h" #include "llvm/ADT/STLExtras.h" @@ -37,6 +36,8 @@ using namespace llvm; +#define DEBUG_TYPE "ifcvt" + // Hidden options for help debugging. static cl::opt IfCvtFnStart("ifcvt-fn-start", cl::init(-1), cl::Hidden); static cl::opt IfCvtFnStop("ifcvt-fn-stop", cl::init(-1), cl::Hidden); @@ -127,7 +128,8 @@ namespace { IsAnalyzed(false), IsEnqueued(false), IsBrAnalyzable(false), HasFallThrough(false), IsUnpredicable(false), CannotBeCopied(false), ClobbersPred(false), NonPredSize(0), - ExtraCost(0), ExtraCost2(0), BB(0), TrueBB(0), FalseBB(0) {} + ExtraCost(0), ExtraCost2(0), BB(nullptr), TrueBB(nullptr), + FalseBB(nullptr) {} }; /// IfcvtToken - Record information about pending if-conversions to attempt: @@ -205,7 +207,7 @@ namespace { void PredicateBlock(BBInfo &BBI, MachineBasicBlock::iterator E, SmallVectorImpl &Cond, - SmallSet *LaterRedefs = 0); + SmallSet *LaterRedefs = nullptr); void CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI, SmallVectorImpl &Cond, bool IgnoreBr = false); @@ -230,7 +232,7 @@ namespace { // blockAlwaysFallThrough - Block ends without a terminator. bool blockAlwaysFallThrough(BBInfo &BBI) const { - return BBI.IsBrAnalyzable && BBI.TrueBB == NULL; + return BBI.IsBrAnalyzable && BBI.TrueBB == nullptr; } // IfcvtTokenCmp - Used to sort if-conversion candidates. @@ -438,7 +440,7 @@ static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB, if (SuccBB != TrueBB) return SuccBB; } - return NULL; + return nullptr; } /// ReverseBranchCondition - Reverse the condition of the end of the block @@ -460,7 +462,7 @@ static inline MachineBasicBlock *getNextBlock(MachineBasicBlock *BB) { MachineFunction::iterator I = BB; MachineFunction::iterator E = BB->getParent()->end(); if (++I == E) - return NULL; + return nullptr; return I; } @@ -551,7 +553,7 @@ bool IfConverter::ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI, FT = getNextBlock(FalseBBI.BB); if (TT != FT) return false; - if (TT == NULL && (TrueBBI.IsBrAnalyzable || FalseBBI.IsBrAnalyzable)) + if (!TT && (TrueBBI.IsBrAnalyzable || FalseBBI.IsBrAnalyzable)) return false; if (TrueBBI.BB->pred_size() > 1 || FalseBBI.BB->pred_size() > 1) return false; @@ -641,11 +643,11 @@ void IfConverter::ScanInstructions(BBInfo &BBI) { bool AlreadyPredicated = !BBI.Predicate.empty(); // First analyze the end of BB branches. 
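// Illustrative sketch (not from the patch): the AnalyzeBranch contract the
// scan below relies on. AnalyzeBranch returns false on success and fills
// TrueBB/FalseBB; a null FalseBB alongside a condition means the false
// edge falls through, and no branch at all leaves both blocks null. The
// retargeting idiom built on it (MBB, NewDest, and TII assumed, as
// elsewhere in these passes):
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
if (!TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, /*AllowModify=*/true)) {
  TII->RemoveBranch(*MBB);   // strip the old terminators
  Cond.clear();              // an empty condition means "unconditional"
  TII->InsertBranch(*MBB, NewDest, /*FBB=*/nullptr, Cond, DebugLoc());
  MBB->CorrectExtraCFGEdges(NewDest, nullptr, /*isCond=*/false);
}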
- BBI.TrueBB = BBI.FalseBB = NULL; + BBI.TrueBB = BBI.FalseBB = nullptr; BBI.BrCond.clear(); BBI.IsBrAnalyzable = !TII->AnalyzeBranch(*BBI.BB, BBI.TrueBB, BBI.FalseBB, BBI.BrCond); - BBI.HasFallThrough = BBI.IsBrAnalyzable && BBI.FalseBB == NULL; + BBI.HasFallThrough = BBI.IsBrAnalyzable && BBI.FalseBB == nullptr; if (BBI.BrCond.size()) { // No false branch. This BB must end with a conditional branch and a @@ -954,13 +956,13 @@ static void InsertUncondBranch(MachineBasicBlock *BB, MachineBasicBlock *ToBB, const TargetInstrInfo *TII) { DebugLoc dl; // FIXME: this is nowhere SmallVector NoCond; - TII->InsertBranch(*BB, ToBB, NULL, NoCond, dl); + TII->InsertBranch(*BB, ToBB, nullptr, NoCond, dl); } /// RemoveExtraEdges - Remove true / false edges if either / both are no longer /// successors. void IfConverter::RemoveExtraEdges(BBInfo &BBI) { - MachineBasicBlock *TBB = NULL, *FBB = NULL; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector Cond; if (!TII->AnalyzeBranch(*BBI.BB, TBB, FBB, Cond)) BBI.BB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty()); @@ -1179,7 +1181,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { DontKill.clear(); - bool HasEarlyExit = CvtBBI->FalseBB != NULL; + bool HasEarlyExit = CvtBBI->FalseBB != nullptr; uint64_t CvtNext = 0, CvtFalse = 0, BBNext = 0, BBCvt = 0, SumWeight = 0; uint32_t WeightScale = 0; if (HasEarlyExit) { @@ -1215,7 +1217,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { CvtBBI->BrCond.end()); if (TII->ReverseBranchCondition(RevCond)) llvm_unreachable("Unable to reverse branch condition!"); - TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, NULL, RevCond, dl); + TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, nullptr, RevCond, dl); BBI.BB->addSuccessor(CvtBBI->FalseBB); // Update the edge weight for both CvtBBI->FalseBB and NextBBI. // New_Weight(BBI.BB, NextBBI->BB) = @@ -1453,8 +1455,8 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind, PredicateBlock(*BBI2, DI2, *Cond2); // Merge the true block into the entry of the diamond. 
- MergeBlocks(BBI, *BBI1, TailBB == 0); - MergeBlocks(BBI, *BBI2, TailBB == 0); + MergeBlocks(BBI, *BBI1, TailBB == nullptr); + MergeBlocks(BBI, *BBI2, TailBB == nullptr); // If the if-converted block falls through or unconditionally branches into // the tail block, and the tail block does not have other predecessors, then @@ -1503,7 +1505,7 @@ static bool MaySpeculate(const MachineInstr *MI, SmallSet &LaterRedefs, const TargetInstrInfo *TII) { bool SawStore = true; - if (!MI->isSafeToMove(TII, 0, SawStore)) + if (!MI->isSafeToMove(TII, nullptr, SawStore)) return false; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { @@ -1527,7 +1529,7 @@ void IfConverter::PredicateBlock(BBInfo &BBI, SmallVectorImpl &Cond, SmallSet *LaterRedefs) { bool AnyUnpred = false; - bool MaySpec = LaterRedefs != 0; + bool MaySpec = LaterRedefs != nullptr; for (MachineBasicBlock::iterator I = BBI.BB->begin(); I != E; ++I) { if (I->isDebugValue() || TII->isPredicated(I)) continue; @@ -1545,7 +1547,7 @@ void IfConverter::PredicateBlock(BBInfo &BBI, #ifndef NDEBUG dbgs() << "Unable to predicate " << *I << "!\n"; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } // If the predicated instruction now redefines a register as the result of @@ -1590,7 +1592,7 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI, #ifndef NDEBUG dbgs() << "Unable to predicate " << *I << "!\n"; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } } @@ -1607,7 +1609,7 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI, std::vector Succs(FromBBI.BB->succ_begin(), FromBBI.BB->succ_end()); MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); - MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : NULL; + MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr; for (unsigned i = 0, e = Succs.size(); i != e; ++i) { MachineBasicBlock *Succ = Succs[i]; @@ -1643,7 +1645,7 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { std::vector Succs(FromBBI.BB->succ_begin(), FromBBI.BB->succ_end()); MachineBasicBlock *NBB = getNextBlock(FromBBI.BB); - MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : NULL; + MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr; for (unsigned i = 0, e = Succs.size(); i != e; ++i) { MachineBasicBlock *Succ = Succs[i]; diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp index 0f7ba8e..f3c8d3d 100644 --- a/lib/CodeGen/InlineSpiller.cpp +++ b/lib/CodeGen/InlineSpiller.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" #include "Spiller.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" @@ -39,6 +38,8 @@ using namespace llvm; +#define DEBUG_TYPE "regalloc" + STATISTIC(NumSpilledRanges, "Number of spilled live ranges"); STATISTIC(NumSnippets, "Number of spilled snippets"); STATISTIC(NumSpills, "Number of spills inserted"); @@ -121,7 +122,7 @@ public: SibValueInfo(unsigned Reg, VNInfo *VNI) : AllDefsAreReloads(true), DefByOrigPHI(false), KillsSource(false), - SpillReg(Reg), SpillVNI(VNI), SpillMBB(0), DefMI(0) {} + SpillReg(Reg), SpillVNI(VNI), SpillMBB(nullptr), DefMI(nullptr) {} // Returns true when a def has been found. 
bool hasDef() const { return DefByOrigPHI || DefMI; } @@ -167,7 +168,7 @@ private: bool isSibling(unsigned Reg); MachineInstr *traceSiblingValue(unsigned, VNInfo*, VNInfo*); - void propagateSiblingValue(SibValueMap::iterator, VNInfo *VNI = 0); + void propagateSiblingValue(SibValueMap::iterator, VNInfo *VNI = nullptr); void analyzeSiblingValues(); bool hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI); @@ -179,7 +180,7 @@ private: bool coalesceStackAccess(MachineInstr *MI, unsigned Reg); bool foldMemoryOperand(ArrayRef >, - MachineInstr *LoadMI = 0); + MachineInstr *LoadMI = nullptr); void insertReload(unsigned VReg, SlotIndex, MachineBasicBlock::iterator MI); void insertSpill(unsigned VReg, bool isKill, MachineBasicBlock::iterator MI); @@ -236,7 +237,7 @@ bool InlineSpiller::isSnippet(const LiveInterval &SnipLI) { if (SnipLI.getNumValNums() > 2 || !LIS.intervalIsInOneMBB(SnipLI)) return false; - MachineInstr *UseMI = 0; + MachineInstr *UseMI = nullptr; // Check that all uses satisfy our criteria. for (MachineRegisterInfo::reg_instr_nodbg_iterator @@ -367,7 +368,7 @@ void InlineSpiller::propagateSiblingValue(SibValueMap::iterator SVIIter, do { SVI = WorkList.pop_back_val(); TinyPtrVector *Deps = VNI ? &FirstDeps : &SVI->second.Deps; - VNI = 0; + VNI = nullptr; SibValueInfo &SV = SVI->second; if (!SV.SpillMBB) @@ -659,7 +660,7 @@ void InlineSpiller::analyzeSiblingValues() { VNInfo *VNI = *VI; if (VNI->isUnused()) continue; - MachineInstr *DefMI = 0; + MachineInstr *DefMI = nullptr; if (!VNI->isPHIDef()) { DefMI = LIS.getInstructionFromIndex(VNI->def); assert(DefMI && "No defining instruction"); @@ -1359,7 +1360,7 @@ void InlineSpiller::spill(LiveRangeEdit &edit) { // Share a stack slot among all descendants of Original. Original = VRM.getOriginal(edit.getReg()); StackSlot = VRM.getStackSlot(Original); - StackInt = 0; + StackInt = nullptr; DEBUG(dbgs() << "Inline spilling " << MRI.getRegClass(edit.getReg())->getName() diff --git a/lib/CodeGen/InterferenceCache.cpp b/lib/CodeGen/InterferenceCache.cpp index 61d065a..187e015 100644 --- a/lib/CodeGen/InterferenceCache.cpp +++ b/lib/CodeGen/InterferenceCache.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" #include "InterferenceCache.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/Support/ErrorHandling.h" @@ -19,6 +18,8 @@ using namespace llvm; +#define DEBUG_TYPE "regalloc" + // Static member used for null interference cursors. InterferenceCache::BlockInterference InterferenceCache::Cursor::NoInterference; diff --git a/lib/CodeGen/InterferenceCache.h b/lib/CodeGen/InterferenceCache.h index d3482d0..91a1da9 100644 --- a/lib/CodeGen/InterferenceCache.h +++ b/lib/CodeGen/InterferenceCache.h @@ -77,7 +77,8 @@ class InterferenceCache { /// Iterator pointing into the fixed RegUnit interference. 
LiveInterval::iterator FixedI; - RegUnitInfo(LiveIntervalUnion &LIU) : VirtTag(LIU.getTag()), Fixed(0) { + RegUnitInfo(LiveIntervalUnion &LIU) + : VirtTag(LIU.getTag()), Fixed(nullptr) { VirtI.setMap(LIU.getMap()); } }; @@ -93,7 +94,7 @@ class InterferenceCache { void update(unsigned MBBNum); public: - Entry() : PhysReg(0), Tag(0), RefCount(0), Indexes(0), LIS(0) {} + Entry() : PhysReg(0), Tag(0), RefCount(0), Indexes(nullptr), LIS(nullptr) {} void clear(MachineFunction *mf, SlotIndexes *indexes, LiveIntervals *lis) { assert(!hasRefs() && "Cannot clear cache entry with references"); @@ -148,8 +149,9 @@ class InterferenceCache { Entry *get(unsigned PhysReg); public: - InterferenceCache() : TRI(0), LIUArray(0), MF(0), PhysRegEntries(NULL), - PhysRegEntriesCount(0), RoundRobin(0) {} + InterferenceCache() + : TRI(nullptr), LIUArray(nullptr), MF(nullptr), PhysRegEntries(nullptr), + PhysRegEntriesCount(0), RoundRobin(0) {} ~InterferenceCache() { free(PhysRegEntries); @@ -172,7 +174,7 @@ public: static BlockInterference NoInterference; void setEntry(Entry *E) { - Current = 0; + Current = nullptr; // Update reference counts. Nothing happens when RefCount reaches 0, so // we don't have to check for E == CacheEntry etc. if (CacheEntry) @@ -184,10 +186,10 @@ public: public: /// Cursor - Create a dangling cursor. - Cursor() : CacheEntry(0), Current(0) {} - ~Cursor() { setEntry(0); } + Cursor() : CacheEntry(nullptr), Current(nullptr) {} + ~Cursor() { setEntry(nullptr); } - Cursor(const Cursor &O) : CacheEntry(0), Current(0) { + Cursor(const Cursor &O) : CacheEntry(nullptr), Current(nullptr) { setEntry(O.CacheEntry); } @@ -200,7 +202,7 @@ public: void setPhysReg(InterferenceCache &Cache, unsigned PhysReg) { // Release reference before getting a new one. That guarantees we can // actually have CacheEntries live cursors. 
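Cursor::setEntry above is a small hand-rolled reference-counting handle; as the comment notes, releasing the old entry before taking the new one is safe precisely because a RefCount of 0 triggers no cleanup. A generic sketch of the same pattern (names hypothetical):

    #include <cassert>

    struct Entry { unsigned RefCount = 0; };

    class Cursor {
      Entry *E = nullptr;
    public:
      void setEntry(Entry *NewE) {
        // Release first, as in the patch; safe because reaching 0 frees nothing.
        if (E) --E->RefCount;
        if (NewE) ++NewE->RefCount;
        E = NewE;
      }
      Cursor() = default;
      Cursor(const Cursor &O) { setEntry(O.E); }
      Cursor &operator=(const Cursor &O) { setEntry(O.E); return *this; }
      ~Cursor() { setEntry(nullptr); }
    };

    int main() {
      Entry E;
      Cursor A; A.setEntry(&E);
      Cursor B(A);             // the copy takes its own reference
      assert(E.RefCount == 2);
    }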
- setEntry(0); + setEntry(nullptr); if (PhysReg) setEntry(Cache.get(PhysReg)); } diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index 9977c6b..a8b8600 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -115,21 +115,21 @@ void IntrinsicLowering::AddPrototypes(Module &M) { Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), - DL.getIntPtrType(Context), (Type *)0); + DL.getIntPtrType(Context), nullptr); break; case Intrinsic::memmove: M.getOrInsertFunction("memmove", Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), - DL.getIntPtrType(Context), (Type *)0); + DL.getIntPtrType(Context), nullptr); break; case Intrinsic::memset: M.getOrInsertFunction("memset", Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), Type::getInt32Ty(M.getContext()), - DL.getIntPtrType(Context), (Type *)0); + DL.getIntPtrType(Context), nullptr); break; case Intrinsic::sqrt: EnsureFPIntrinsicsExist(M, I, "sqrtf", "sqrt", "sqrtl"); diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 9c2718b..a5ac057 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -43,24 +43,6 @@ static cl::opt EnableFastISelOption("fast-isel", cl::Hidden, cl::desc("Enable the \"fast\" instruction selector")); -static cl::opt ShowMCEncoding("show-mc-encoding", cl::Hidden, - cl::desc("Show encoding in .s output")); -static cl::opt ShowMCInst("show-mc-inst", cl::Hidden, - cl::desc("Show instruction structure in .s output")); - -static cl::opt -AsmVerbose("asm-verbose", cl::desc("Add comments to directives."), - cl::init(cl::BOU_UNSET)); - -static bool getVerboseAsm() { - switch (AsmVerbose) { - case cl::BOU_UNSET: return TargetMachine::getAsmVerbosityDefault(); - case cl::BOU_TRUE: return true; - case cl::BOU_FALSE: return false; - } - llvm_unreachable("Invalid verbose asm state"); -} - void LLVMTargetMachine::initAsmInfo() { MCAsmInfo *TmpAsmInfo = TheTarget.createMCAsmInfo(*getRegisterInfo(), TargetTriple); @@ -103,7 +85,8 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM, // Add internal analysis passes from the target machine. TM->addAnalysisPasses(PM); - // Targets may override createPassConfig to provide a target-specific sublass. + // Targets may override createPassConfig to provide a target-specific + // subclass. TargetPassConfig *PassConfig = TM->createPassConfig(PM); PassConfig->setStartStopPasses(StartAfter, StopAfter); @@ -138,7 +121,7 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM, // Ask the target for an isel. if (PassConfig->addInstSelector()) - return NULL; + return nullptr; PassConfig->addMachinePasses(); @@ -169,7 +152,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, return false; } - if (hasMCSaveTempLabels()) + if (Options.MCOptions.MCSaveTempLabels) Context->setAllowTemporaryLabels(false); const MCAsmInfo &MAI = *getMCAsmInfo(); @@ -185,19 +168,16 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, MII, MRI, STI); // Create a code emitter if asked to show the encoding. 
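In the IntrinsicLowering hunks the old code spelled the sentinel as (Type *)0 rather than plain 0 because getOrInsertFunction is variadic: an unadorned 0 travels as an int while the callee pops a pointer, which silently breaks on LP64 targets (4-byte int, 8-byte pointer). nullptr is a pointer-sized null, so the cast can go. Illustrative sketch of the underlying varargs rule:

    #include <cstdarg>
    #include <cstdio>

    // Prints C strings until a null-pointer sentinel is reached.
    static void printAll(const char *First, ...) {
      va_list AP;
      va_start(AP, First);
      for (const char *S = First; S; S = va_arg(AP, const char *))
        std::puts(S);
      va_end(AP);
    }

    int main() {
      printAll("a", "b", nullptr);   // ok: pointer-sized sentinel
      // printAll("a", "b", 0);      // broken: 0 travels as an int
    }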
- MCCodeEmitter *MCE = 0; - if (ShowMCEncoding) + MCCodeEmitter *MCE = nullptr; + if (Options.MCOptions.ShowMCEncoding) MCE = getTarget().createMCCodeEmitter(MII, MRI, STI, *Context); MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(), TargetCPU); - MCStreamer *S = getTarget().createAsmStreamer(*Context, Out, - getVerboseAsm(), - hasMCUseCFI(), - hasMCUseDwarfDirectory(), - InstPrinter, - MCE, MAB, - ShowMCInst); + MCStreamer *S = getTarget().createAsmStreamer( + *Context, Out, Options.MCOptions.AsmVerbose, + Options.MCOptions.MCUseDwarfDirectory, InstPrinter, MCE, MAB, + Options.MCOptions.ShowMCInst); AsmStreamer.reset(S); break; } @@ -208,12 +188,12 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, *Context); MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(), TargetCPU); - if (MCE == 0 || MAB == 0) + if (!MCE || !MAB) return true; AsmStreamer.reset(getTarget().createMCObjectStreamer( - getTargetTriple(), *Context, *MAB, Out, MCE, STI, hasMCRelaxAll(), - hasMCNoExecStack())); + getTargetTriple(), *Context, *MAB, Out, MCE, STI, + Options.MCOptions.MCRelaxAll, Options.MCOptions.MCNoExecStack)); break; } case CGFT_Null: @@ -225,7 +205,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, // Create the AsmPrinter, which takes ownership of AsmStreamer if successful. FunctionPass *Printer = getTarget().createAsmPrinter(*this, *AsmStreamer); - if (Printer == 0) + if (!Printer) return true; // If successful, createAsmPrinter took ownership of AsmStreamer. @@ -246,7 +226,8 @@ bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM, JITCodeEmitter &JCE, bool DisableVerify) { // Add common CodeGen passes. - MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify, 0, 0); + MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify, nullptr, + nullptr); if (!Context) return true; @@ -265,11 +246,11 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, raw_ostream &Out, bool DisableVerify) { // Add common CodeGen passes. - Ctx = addPassesToGenerateCode(this, PM, DisableVerify, 0, 0); + Ctx = addPassesToGenerateCode(this, PM, DisableVerify, nullptr, nullptr); if (!Ctx) return true; - if (hasMCSaveTempLabels()) + if (Options.MCOptions.MCSaveTempLabels) Ctx->setAllowTemporaryLabels(false); // Create the code emitter for the target if it exists. If not, .o file @@ -280,17 +261,17 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, STI, *Ctx); MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(), TargetCPU); - if (MCE == 0 || MAB == 0) + if (!MCE || !MAB) return true; std::unique_ptr AsmStreamer; AsmStreamer.reset(getTarget().createMCObjectStreamer( - getTargetTriple(), *Ctx, *MAB, Out, MCE, STI, hasMCRelaxAll(), - hasMCNoExecStack())); + getTargetTriple(), *Ctx, *MAB, Out, MCE, STI, + Options.MCOptions.MCRelaxAll, Options.MCOptions.MCNoExecStack)); // Create the AsmPrinter, which takes ownership of AsmStreamer if successful. FunctionPass *Printer = getTarget().createAsmPrinter(*this, *AsmStreamer); - if (Printer == 0) + if (!Printer) return true; // If successful, createAsmPrinter took ownership of AsmStreamer. 
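This file also drops the ShowMCEncoding / ShowMCInst / AsmVerbose cl::opt globals in favor of fields read from Options.MCOptions, so streamer configuration travels with the TargetMachine instead of living in file-static state. A rough sketch of the shape of that refactor (types and names abbreviated, not the actual LLVM structs):

    #include <cstdio>

    // Before: file-static flags consulted at streamer-creation time.
    // After: an options bag threaded through the constructor.
    struct MCTargetOpts {
      bool ShowEncoding = false;
      bool ShowInst = false;
      bool AsmVerbose = false;
    };

    class Machine {
      MCTargetOpts MCOptions;
    public:
      explicit Machine(MCTargetOpts O) : MCOptions(O) {}
      void createStreamer() const {
        if (MCOptions.ShowEncoding)
          std::puts("attaching code emitter"); // stand-in for createMCCodeEmitter
      }
    };

    int main() {
      MCTargetOpts O;
      O.ShowEncoding = true;
      Machine(O).createStreamer();
    }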
diff --git a/lib/CodeGen/LatencyPriorityQueue.cpp b/lib/CodeGen/LatencyPriorityQueue.cpp index e88d537..cdf505e 100644 --- a/lib/CodeGen/LatencyPriorityQueue.cpp +++ b/lib/CodeGen/LatencyPriorityQueue.cpp @@ -13,12 +13,13 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "scheduler" #include "llvm/CodeGen/LatencyPriorityQueue.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "scheduler" + bool latency_sort::operator()(const SUnit *LHS, const SUnit *RHS) const { // The isScheduleHigh flag allows nodes with wraparound dependencies that // cannot easily be modeled as edges with latencies to be scheduled as @@ -53,7 +54,7 @@ bool latency_sort::operator()(const SUnit *LHS, const SUnit *RHS) const { /// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor /// of SU, return it, otherwise return null. SUnit *LatencyPriorityQueue::getSingleUnscheduledPred(SUnit *SU) { - SUnit *OnlyAvailablePred = 0; + SUnit *OnlyAvailablePred = nullptr; for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { SUnit &Pred = *I->getSUnit(); @@ -61,7 +62,7 @@ SUnit *LatencyPriorityQueue::getSingleUnscheduledPred(SUnit *SU) { // We found an available, but not scheduled, predecessor. If it's the // only one we have found, keep track of it... otherwise give up. if (OnlyAvailablePred && OnlyAvailablePred != &Pred) - return 0; + return nullptr; OnlyAvailablePred = &Pred; } } @@ -105,7 +106,7 @@ void LatencyPriorityQueue::AdjustPriorityOfUnscheduledPreds(SUnit *SU) { if (SU->isAvailable) return; // All preds scheduled. SUnit *OnlyAvailablePred = getSingleUnscheduledPred(SU); - if (OnlyAvailablePred == 0 || !OnlyAvailablePred->isAvailable) return; + if (!OnlyAvailablePred || !OnlyAvailablePred->isAvailable) return; // Okay, we found a single predecessor that is available, but not scheduled. // Since it is available, it must be in the priority queue. First remove it. @@ -117,7 +118,7 @@ void LatencyPriorityQueue::AdjustPriorityOfUnscheduledPreds(SUnit *SU) { } SUnit *LatencyPriorityQueue::pop() { - if (empty()) return NULL; + if (empty()) return nullptr; std::vector::iterator Best = Queue.begin(); for (std::vector::iterator I = std::next(Queue.begin()), E = Queue.end(); I != E; ++I) diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp index c22ab11..d12c234 100644 --- a/lib/CodeGen/LexicalScopes.cpp +++ b/lib/CodeGen/LexicalScopes.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "lexicalscopes" #include "llvm/CodeGen/LexicalScopes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" @@ -25,15 +24,14 @@ #include "llvm/Support/FormattedStream.h" using namespace llvm; -/// ~LexicalScopes - final cleanup after ourselves. -LexicalScopes::~LexicalScopes() { reset(); } +#define DEBUG_TYPE "lexicalscopes" /// reset - Reset the instance so that it's prepared for another function. void LexicalScopes::reset() { - MF = NULL; - CurrentFnLexicalScope = NULL; - DeleteContainerSeconds(LexicalScopeMap); - DeleteContainerSeconds(AbstractScopeMap); + MF = nullptr; + CurrentFnLexicalScope = nullptr; + LexicalScopeMap.clear(); + AbstractScopeMap.clear(); InlinedLexicalScopeMap.clear(); AbstractScopesList.clear(); } @@ -58,30 +56,26 @@ void LexicalScopes::extractLexicalScopes( DenseMap &MI2ScopeMap) { // Scan each instruction and create scopes. 
First build working set of scopes. - for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E; - ++I) { - const MachineInstr *RangeBeginMI = NULL; - const MachineInstr *PrevMI = NULL; + for (const auto &MBB : *MF) { + const MachineInstr *RangeBeginMI = nullptr; + const MachineInstr *PrevMI = nullptr; DebugLoc PrevDL; - for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end(); - II != IE; ++II) { - const MachineInstr *MInsn = II; - + for (const auto &MInsn : MBB) { // Check if instruction has valid location information. - const DebugLoc MIDL = MInsn->getDebugLoc(); + const DebugLoc MIDL = MInsn.getDebugLoc(); if (MIDL.isUnknown()) { - PrevMI = MInsn; + PrevMI = &MInsn; continue; } // If scope has not changed then skip this instruction. if (MIDL == PrevDL) { - PrevMI = MInsn; + PrevMI = &MInsn; continue; } // Ignore DBG_VALUE. It does not contribute to any instruction in output. - if (MInsn->isDebugValue()) + if (MInsn.isDebugValue()) continue; if (RangeBeginMI) { @@ -94,10 +88,10 @@ void LexicalScopes::extractLexicalScopes( } // This is a beginning of a new instruction range. - RangeBeginMI = MInsn; + RangeBeginMI = &MInsn; // Reset previous markers. - PrevMI = MInsn; + PrevMI = &MInsn; PrevDL = MIDL; } @@ -110,14 +104,22 @@ void LexicalScopes::extractLexicalScopes( } } +LexicalScope *LexicalScopes::findInlinedScope(DebugLoc DL) { + MDNode *Scope = nullptr; + MDNode *IA = nullptr; + DL.getScopeAndInlinedAt(Scope, IA, MF->getFunction()->getContext()); + auto I = InlinedLexicalScopeMap.find(std::make_pair(Scope, IA)); + return I != InlinedLexicalScopeMap.end() ? &I->second : nullptr; +} + /// findLexicalScope - Find lexical scope, either regular or inlined, for the /// given DebugLoc. Return NULL if not found. LexicalScope *LexicalScopes::findLexicalScope(DebugLoc DL) { - MDNode *Scope = NULL; - MDNode *IA = NULL; + MDNode *Scope = nullptr; + MDNode *IA = nullptr; DL.getScopeAndInlinedAt(Scope, IA, MF->getFunction()->getContext()); if (!Scope) - return NULL; + return nullptr; // The scope that we were created with could have an extra file - which // isn't what we care about in this case. @@ -125,16 +127,18 @@ LexicalScope *LexicalScopes::findLexicalScope(DebugLoc DL) { if (D.isLexicalBlockFile()) Scope = DILexicalBlockFile(Scope).getScope(); - if (IA) - return InlinedLexicalScopeMap.lookup(DebugLoc::getFromDILocation(IA)); - return LexicalScopeMap.lookup(Scope); + if (IA) { + auto I = InlinedLexicalScopeMap.find(std::make_pair(Scope, IA)); + return I != InlinedLexicalScopeMap.end() ? &I->second : nullptr; + } + return findLexicalScope(Scope); } /// getOrCreateLexicalScope - Find lexical scope for the given DebugLoc. If /// not available then create new lexical scope. 
LexicalScope *LexicalScopes::getOrCreateLexicalScope(DebugLoc DL) { - MDNode *Scope = NULL; - MDNode *InlinedAt = NULL; + MDNode *Scope = nullptr; + MDNode *InlinedAt = nullptr; DL.getScopeAndInlinedAt(Scope, InlinedAt, MF->getFunction()->getContext()); if (InlinedAt) { @@ -155,35 +159,48 @@ LexicalScope *LexicalScopes::getOrCreateRegularScope(MDNode *Scope) { D = DIDescriptor(Scope); } - LexicalScope *WScope = LexicalScopeMap.lookup(Scope); - if (WScope) - return WScope; + auto I = LexicalScopeMap.find(Scope); + if (I != LexicalScopeMap.end()) + return &I->second; - LexicalScope *Parent = NULL; + LexicalScope *Parent = nullptr; if (D.isLexicalBlock()) Parent = getOrCreateLexicalScope(DebugLoc::getFromDILexicalBlock(Scope)); - WScope = new LexicalScope(Parent, DIDescriptor(Scope), NULL, false); - LexicalScopeMap.insert(std::make_pair(Scope, WScope)); + // FIXME: Use forward_as_tuple instead of make_tuple, once MSVC2012 + // compatibility is no longer required. + I = LexicalScopeMap.emplace(std::piecewise_construct, std::make_tuple(Scope), + std::make_tuple(Parent, DIDescriptor(Scope), + nullptr, false)).first; + if (!Parent && DIDescriptor(Scope).isSubprogram() && DISubprogram(Scope).describes(MF->getFunction())) - CurrentFnLexicalScope = WScope; + CurrentFnLexicalScope = &I->second; - return WScope; + return &I->second; } /// getOrCreateInlinedScope - Find or create an inlined lexical scope. -LexicalScope *LexicalScopes::getOrCreateInlinedScope(MDNode *Scope, +LexicalScope *LexicalScopes::getOrCreateInlinedScope(MDNode *ScopeNode, MDNode *InlinedAt) { - LexicalScope *InlinedScope = LexicalScopeMap.lookup(InlinedAt); - if (InlinedScope) - return InlinedScope; - - DebugLoc InlinedLoc = DebugLoc::getFromDILocation(InlinedAt); - InlinedScope = new LexicalScope(getOrCreateLexicalScope(InlinedLoc), - DIDescriptor(Scope), InlinedAt, false); - InlinedLexicalScopeMap[InlinedLoc] = InlinedScope; - LexicalScopeMap[InlinedAt] = InlinedScope; - return InlinedScope; + std::pair P(ScopeNode, InlinedAt); + auto I = InlinedLexicalScopeMap.find(P); + if (I != InlinedLexicalScopeMap.end()) + return &I->second; + + LexicalScope *Parent; + DILexicalBlock Scope(ScopeNode); + if (Scope.isSubprogram()) + Parent = getOrCreateLexicalScope(DebugLoc::getFromDILocation(InlinedAt)); + else + Parent = getOrCreateInlinedScope(Scope.getContext(), InlinedAt); + + // FIXME: Use forward_as_tuple instead of make_tuple, once MSVC2012 + // compatibility is no longer required. + I = InlinedLexicalScopeMap.emplace(std::piecewise_construct, + std::make_tuple(P), + std::make_tuple(Parent, Scope, InlinedAt, + false)).first; + return &I->second; } /// getOrCreateAbstractScope - Find or create an abstract lexical scope. 
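The map changes above are the substantive part of this file: LexicalScopeMap and friends now own LexicalScope objects by value, constructed in place, instead of owning heap pointers deleted via DeleteContainerSeconds in reset(). The piecewise_construct form of emplace is what makes this work for a non-copyable value type. Self-contained sketch with a stand-in type:

    #include <map>
    #include <tuple>

    struct Scope {
      Scope(Scope *Parent, int Id) : Parent(Parent), Id(Id) {}
      Scope(const Scope &) = delete; // non-copyable, like LexicalScope
      Scope *Parent;
      int Id;
    };

    int main() {
      std::map<int, Scope> Scopes;
      auto I = Scopes.emplace(std::piecewise_construct,
                              std::forward_as_tuple(42),           // key ctor args
                              std::forward_as_tuple(nullptr, 42))  // value ctor args
                     .first;
      Scope *S = &I->second; // node-based map: the address stays stable
      return S->Id == 42 ? 0 : 1;
    }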
@@ -193,21 +210,23 @@ LexicalScope *LexicalScopes::getOrCreateAbstractScope(const MDNode *N) { DIDescriptor Scope(N); if (Scope.isLexicalBlockFile()) Scope = DILexicalBlockFile(Scope).getScope(); - LexicalScope *AScope = AbstractScopeMap.lookup(N); - if (AScope) - return AScope; + auto I = AbstractScopeMap.find(Scope); + if (I != AbstractScopeMap.end()) + return &I->second; - LexicalScope *Parent = NULL; + LexicalScope *Parent = nullptr; if (Scope.isLexicalBlock()) { - DILexicalBlock DB(N); + DILexicalBlock DB(Scope); DIDescriptor ParentDesc = DB.getContext(); Parent = getOrCreateAbstractScope(ParentDesc); } - AScope = new LexicalScope(Parent, DIDescriptor(N), NULL, true); - AbstractScopeMap[N] = AScope; - if (DIDescriptor(N).isSubprogram()) - AbstractScopesList.push_back(AScope); - return AScope; + I = AbstractScopeMap.emplace(std::piecewise_construct, + std::forward_as_tuple(Scope), + std::forward_as_tuple(Parent, Scope, + nullptr, true)).first; + if (Scope.isSubprogram()) + AbstractScopesList.push_back(&I->second); + return &I->second; } /// constructScopeNest @@ -244,7 +263,7 @@ void LexicalScopes::assignInstructionRanges( SmallVectorImpl &MIRanges, DenseMap &MI2ScopeMap) { - LexicalScope *PrevLexicalScope = NULL; + LexicalScope *PrevLexicalScope = nullptr; for (SmallVectorImpl::const_iterator RI = MIRanges.begin(), RE = MIRanges.end(); RI != RE; ++RI) { @@ -273,9 +292,8 @@ void LexicalScopes::getMachineBasicBlocks( return; if (Scope == CurrentFnLexicalScope) { - for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E; - ++I) - MBBs.insert(I); + for (const auto &MBB : *MF) + MBBs.insert(&MBB); return; } diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index bef4156..388f58f 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -19,7 +19,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "livedebug" #include "LiveDebugVariables.h" #include "llvm/ADT/IntervalMap.h" #include "llvm/ADT/Statistic.h" @@ -41,8 +40,12 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include + using namespace llvm; +#define DEBUG_TYPE "livedebug" + static cl::opt EnableLDV("live-debug-variables", cl::init(true), cl::desc("Enable the live debug variables pass"), cl::Hidden); @@ -64,7 +67,7 @@ void LiveDebugVariables::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -LiveDebugVariables::LiveDebugVariables() : MachineFunctionPass(ID), pImpl(0) { +LiveDebugVariables::LiveDebugVariables() : MachineFunctionPass(ID), pImpl(nullptr) { initializeLiveDebugVariablesPass(*PassRegistry::getPassRegistry()); } @@ -139,7 +142,7 @@ public: UserValue(const MDNode *var, unsigned o, bool i, DebugLoc L, LocMap::Allocator &alloc) : variable(var), offset(o), IsIndirect(i), dl(L), leader(this), - next(0), locInts(alloc) + next(nullptr), locInts(alloc) {} /// getLeader - Get the leader of this value's equivalence class. @@ -154,8 +157,8 @@ public: UserValue *getNext() const { return next; } /// match - Does this UserValue match the parameters? - bool match(const MDNode *Var, unsigned Offset) const { - return Var == variable && Offset == offset; + bool match(const MDNode *Var, unsigned Offset, bool indirect) const { + return Var == variable && Offset == offset && indirect == IsIndirect; } /// merge - Merge equivalence classes. 
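UserValue's leader/next fields implement equivalence classes as an intrusive singly linked list, which merge() splices together; the new indirect parameter to match() just widens the class key. A toy version of the leader scheme, assuming classes are only ever merged, never split:

    struct Node {
      Node *leader;
      Node *next;
      Node() : leader(this), next(nullptr) {}

      Node *getLeader() {
        Node *L = this;
        while (L != L->leader)
          L = L->leader;      // chase the chain left by merges
        return leader = L;    // and shorten it for next time
      }
    };

    // Append B's member list to A's and repoint B at A's leader.
    static Node *merge(Node *A, Node *B) {
      A = A->getLeader();
      B = B->getLeader();
      if (A == B)
        return A;
      Node *Tail = A;
      while (Tail->next)
        Tail = Tail->next;
      Tail->next = B;
      B->leader = A;
      return A;
    }

    int main() {
      Node X, Y, Z;
      merge(&X, &Y);
      merge(&Y, &Z);
      return X.getLeader() == Z.getLeader() ? 0 : 1;
    }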
@@ -292,7 +295,7 @@ class LDVImpl { bool ModifiedMF; /// userValues - All allocated UserValue instances. - SmallVector userValues; + SmallVector, 8> userValues; /// Map virtual register to eq class leader. typedef DenseMap VRMap; @@ -332,7 +335,6 @@ public: /// clear - Release all memory. void clear() { - DeleteContainerPointers(userValues); userValues.clear(); virtRegToEqClass.clear(); userVarMap.clear(); @@ -425,12 +427,13 @@ UserValue *LDVImpl::getUserValue(const MDNode *Var, unsigned Offset, UserValue *UV = Leader->getLeader(); Leader = UV; for (; UV; UV = UV->getNext()) - if (UV->match(Var, Offset)) + if (UV->match(Var, Offset, IsIndirect)) return UV; } - UserValue *UV = new UserValue(Var, Offset, IsIndirect, DL, allocator); - userValues.push_back(UV); + userValues.push_back( + make_unique(Var, Offset, IsIndirect, DL, allocator)); + UserValue *UV = userValues.back().get(); Leader = UserValue::merge(Leader, UV); return UV; } @@ -444,7 +447,7 @@ void LDVImpl::mapVirtReg(unsigned VirtReg, UserValue *EC) { UserValue *LDVImpl::lookupVirtReg(unsigned VirtReg) { if (UserValue *UV = virtRegToEqClass.lookup(VirtReg)) return UV->getLeader(); - return 0; + return nullptr; } bool LDVImpl::handleDebugValue(MachineInstr *MI, SlotIndex Idx) { @@ -646,14 +649,14 @@ UserValue::computeIntervals(MachineRegisterInfo &MRI, const MachineOperand &Loc = locations[LocNo]; if (!Loc.isReg()) { - extendDef(Idx, LocNo, 0, 0, 0, LIS, MDT, UVS); + extendDef(Idx, LocNo, nullptr, nullptr, nullptr, LIS, MDT, UVS); continue; } // Register locations are constrained to where the register value is live. if (TargetRegisterInfo::isVirtualRegister(Loc.getReg())) { - LiveInterval *LI = 0; - const VNInfo *VNI = 0; + LiveInterval *LI = nullptr; + const VNInfo *VNI = nullptr; if (LIS.hasInterval(Loc.getReg())) { LI = &LIS.getInterval(Loc.getReg()); VNI = LI->getVNInfoAt(Idx); @@ -670,7 +673,7 @@ UserValue::computeIntervals(MachineRegisterInfo &MRI, LiveRange *LR = &LIS.getRegUnit(Unit); const VNInfo *VNI = LR->getVNInfoAt(Idx); // Don't track copies from physregs, it is too expensive. - extendDef(Idx, LocNo, LR, VNI, 0, LIS, MDT, UVS); + extendDef(Idx, LocNo, LR, VNI, nullptr, LIS, MDT, UVS); } // Finally, erase all the undefs. @@ -733,7 +736,7 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef NewRegs, LiveIntervals& LIS) { DEBUG({ dbgs() << "Splitting Loc" << OldLocNo << '\t'; - print(dbgs(), 0); + print(dbgs(), nullptr); }); bool DidChange = false; LocMap::iterator LocMapI; @@ -823,7 +826,7 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef NewRegs, } } - DEBUG({dbgs() << "Split result: \t"; print(dbgs(), 0);}); + DEBUG({dbgs() << "Split result: \t"; print(dbgs(), nullptr);}); return DidChange; } diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp index 3a7ac11..ce8ce96 100644 --- a/lib/CodeGen/LiveInterval.cpp +++ b/lib/CodeGen/LiveInterval.cpp @@ -331,13 +331,13 @@ LiveRange::iterator LiveRange::addSegmentFrom(Segment S, iterator From) { /// the value. If there is no live range before Kill, return NULL. 
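The userValues change just above (raw pointers plus DeleteContainerPointers replaced by a vector of unique_ptrs) is a mechanical ownership cleanup. Minimal sketch of the same move; std::make_unique is C++14, whereas the patch itself uses LLVM's own make_unique since C++14 was not yet required:

    #include <memory>
    #include <string>
    #include <vector>

    struct UserVal {
      explicit UserVal(std::string N) : Name(std::move(N)) {}
      std::string Name;
    };

    int main() {
      std::vector<std::unique_ptr<UserVal>> Pool;
      Pool.push_back(std::make_unique<UserVal>("x"));
      UserVal *Raw = Pool.back().get(); // non-owning observer, same as before
      bool Ok = Raw->Name == "x";
      Pool.clear();                     // frees everything; no delete loop
      return Ok ? 0 : 1;
    }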
VNInfo *LiveRange::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) { if (empty()) - return 0; + return nullptr; iterator I = std::upper_bound(begin(), end(), Kill.getPrevSlot()); if (I == begin()) - return 0; + return nullptr; --I; if (I->end <= StartIdx) - return 0; + return nullptr; if (I->end < Kill) extendSegmentEndTo(I, Kill); return I->valno; @@ -435,7 +435,7 @@ void LiveRange::join(LiveRange &Other, OutIt->valno = NewVNInfo[LHSValNoAssignments[OutIt->valno->id]]; for (iterator I = std::next(OutIt), E = end(); I != E; ++I) { VNInfo* nextValNo = NewVNInfo[LHSValNoAssignments[I->valno->id]]; - assert(nextValNo != 0 && "Huh?"); + assert(nextValNo && "Huh?"); // If this live range has the same value # as its immediate predecessor, // and if they are neighbors, remove one Segment. This happens when we @@ -638,7 +638,7 @@ void LiveRange::verify() const { assert(I->start.isValid()); assert(I->end.isValid()); assert(I->start < I->end); - assert(I->valno != 0); + assert(I->valno != nullptr); assert(I->valno->id < valnos.size()); assert(I->valno == valnos[I->valno->id]); if (std::next(I) != E) { @@ -857,7 +857,7 @@ unsigned ConnectedVNInfoEqClasses::Classify(const LiveInterval *LI) { EqClass.clear(); EqClass.grow(LI->getNumValNums()); - const VNInfo *used = 0, *unused = 0; + const VNInfo *used = nullptr, *unused = nullptr; // Determine connections. for (LiveInterval::const_vni_iterator I = LI->vni_begin(), E = LI->vni_end(); diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index fdc673f..3563f8e 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -15,7 +15,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "LiveRangeCalc.h" #include "llvm/ADT/DenseSet.h" @@ -42,6 +41,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "regalloc" + char LiveIntervals::ID = 0; char &llvm::LiveIntervalsID = LiveIntervals::ID; INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals", @@ -79,7 +80,7 @@ void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { } LiveIntervals::LiveIntervals() : MachineFunctionPass(ID), - DomTree(0), LRCalc(0) { + DomTree(nullptr), LRCalc(nullptr) { initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); } @@ -572,9 +573,9 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) { break; } if (CancelKill) - MI->clearRegisterKills(Reg, NULL); + MI->clearRegisterKills(Reg, nullptr); else - MI->addRegisterKilled(Reg, NULL); + MI->addRegisterKilled(Reg, nullptr); } } } @@ -590,17 +591,17 @@ LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const { SlotIndex Start = LI.beginIndex(); if (Start.isBlock()) - return NULL; + return nullptr; SlotIndex Stop = LI.endIndex(); if (Stop.isBlock()) - return NULL; + return nullptr; // getMBBFromIndex doesn't need to search the MBB table when both indexes // belong to proper instructions. MachineBasicBlock *MBB1 = Indexes->getMBBFromIndex(Start); MachineBasicBlock *MBB2 = Indexes->getMBBFromIndex(Stop); - return MBB1 == MBB2 ? MBB1 : NULL; + return MBB1 == MBB2 ? 
MBB1 : nullptr; } bool diff --git a/lib/CodeGen/LiveIntervalUnion.cpp b/lib/CodeGen/LiveIntervalUnion.cpp index d5a81a3..d81221b 100644 --- a/lib/CodeGen/LiveIntervalUnion.cpp +++ b/lib/CodeGen/LiveIntervalUnion.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" #include "llvm/CodeGen/LiveIntervalUnion.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/Support/Debug.h" @@ -23,6 +22,8 @@ using namespace llvm; +#define DEBUG_TYPE "regalloc" + // Merge a LiveInterval's segments. Guarantee no overlaps. void LiveIntervalUnion::unify(LiveInterval &VirtReg) { @@ -138,7 +139,7 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) { } LiveInterval::iterator VirtRegEnd = VirtReg->end(); - LiveInterval *RecentReg = 0; + LiveInterval *RecentReg = nullptr; while (LiveUnionI.valid()) { assert(VirtRegI != VirtRegEnd && "Reached end of VirtReg"); @@ -200,5 +201,5 @@ void LiveIntervalUnion::Array::clear() { LIUs[i].~LiveIntervalUnion(); free(LIUs); Size = 0; - LIUs = 0; + LIUs = nullptr; } diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp index ecd75b4..a558e14 100644 --- a/lib/CodeGen/LiveRangeCalc.cpp +++ b/lib/CodeGen/LiveRangeCalc.cpp @@ -11,13 +11,14 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" #include "LiveRangeCalc.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "regalloc" + void LiveRangeCalc::reset(const MachineFunction *mf, SlotIndexes *SI, MachineDominatorTree *MDT, @@ -121,7 +122,7 @@ void LiveRangeCalc::updateLiveIns() { // The value is live-through, update LiveOut as well. // Defer the Domtree lookup until it is needed. assert(Seen.test(MBB->getNumber())); - LiveOut[MBB] = LiveOutPair(I->Value, (MachineDomTreeNode *)0); + LiveOut[MBB] = LiveOutPair(I->Value, (MachineDomTreeNode *)nullptr); } Updater.setDest(&I->LR); Updater.add(Start, End, I->Value); @@ -174,7 +175,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB, // Remember if we have seen more than one value. bool UniqueVNI = true; - VNInfo *TheVNI = 0; + VNInfo *TheVNI = nullptr; // Using Seen as a visited set, perform a BFS for all reaching defs. for (unsigned i = 0; i != WorkList.size(); ++i) { @@ -251,7 +252,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB, End = Kill; else LiveOut[MF->getBlockNumbered(*I)] = - LiveOutPair(TheVNI, (MachineDomTreeNode *)0); + LiveOutPair(TheVNI, nullptr); Updater.add(Start, End, TheVNI); } return true; @@ -345,7 +346,7 @@ void LiveRangeCalc::updateSSA() { VNInfo *VNI = LR.getNextValue(Start, *Alloc); I->Value = VNI; // This block is done, we know the final value. - I->DomNode = 0; + I->DomNode = nullptr; // Add liveness since updateLiveIns now skips this node. 
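findReachingDefs above uses the classic index-based worklist BFS, with Seen doubling as the visited set: iterating by index (i != WorkList.size()) remains valid while the loop appends, where iterators would be invalidated. Generic sketch over a predecessor list:

    #include <vector>

    // Collect everything reachable from Start through Preds edges.
    static std::vector<unsigned>
    reachable(const std::vector<std::vector<unsigned>> &Preds, unsigned Start) {
      std::vector<bool> Seen(Preds.size(), false);
      std::vector<unsigned> WorkList{Start};
      Seen[Start] = true;
      for (unsigned i = 0; i != WorkList.size(); ++i) // size() re-read each pass
        for (unsigned P : Preds[WorkList[i]])
          if (!Seen[P]) {
            Seen[P] = true;
            WorkList.push_back(P); // safe: indices survive reallocation
          }
      return WorkList;
    }

    int main() {
      std::vector<std::vector<unsigned>> Preds = {{1}, {2}, {}};
      return reachable(Preds, 0).size() == 3 ? 0 : 1;
    }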
if (I->Kill.isValid()) diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h index a3a3fbb..67ab559 100644 --- a/lib/CodeGen/LiveRangeCalc.h +++ b/lib/CodeGen/LiveRangeCalc.h @@ -92,7 +92,7 @@ class LiveRangeCalc { VNInfo *Value; LiveInBlock(LiveRange &LR, MachineDomTreeNode *node, SlotIndex kill) - : LR(LR), DomNode(node), Kill(kill), Value(0) {} + : LR(LR), DomNode(node), Kill(kill), Value(nullptr) {} }; /// LiveIn - Work list of blocks where the live-in value has yet to be @@ -125,7 +125,8 @@ class LiveRangeCalc { void updateLiveIns(); public: - LiveRangeCalc() : MF(0), MRI(0), Indexes(0), DomTree(0), Alloc(0) {} + LiveRangeCalc() : MF(nullptr), MRI(nullptr), Indexes(nullptr), + DomTree(nullptr), Alloc(nullptr) {} //===--------------------------------------------------------------------===// // High-level interface. @@ -203,7 +204,7 @@ public: /// addLiveInBlock(). void setLiveOutValue(MachineBasicBlock *MBB, VNInfo *VNI) { Seen.set(MBB->getNumber()); - LiveOut[MBB] = LiveOutPair(VNI, (MachineDomTreeNode *)0); + LiveOut[MBB] = LiveOutPair(VNI, nullptr); } /// addLiveInBlock - Add a block with an unknown live-in value. This diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp index 891eaab..431241f 100644 --- a/lib/CodeGen/LiveRangeEdit.cpp +++ b/lib/CodeGen/LiveRangeEdit.cpp @@ -11,7 +11,6 @@ // is spilled or split. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/CalcSpillWeights.h" @@ -24,6 +23,8 @@ using namespace llvm; +#define DEBUG_TYPE "regalloc" + STATISTIC(NumDCEDeleted, "Number of instructions deleted by DCE"); STATISTIC(NumDCEFoldedLoads, "Number of single use loads folded after DCE"); STATISTIC(NumFracRanges, "Number of live ranges fractured by DCE"); @@ -164,7 +165,7 @@ void LiveRangeEdit::eraseVirtReg(unsigned Reg) { bool LiveRangeEdit::foldAsLoad(LiveInterval *LI, SmallVectorImpl &Dead) { - MachineInstr *DefMI = 0, *UseMI = 0; + MachineInstr *DefMI = nullptr, *UseMI = nullptr; // Check that there is a single def and a single use. for (MachineOperand &MO : MRI.reg_nodbg_operands(LI->reg)) { @@ -197,7 +198,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI, // We also need to make sure it is safe to move the load. // Assume there are stores between DefMI and UseMI. bool SawStore = true; - if (!DefMI->isSafeToMove(&TII, 0, SawStore)) + if (!DefMI->isSafeToMove(&TII, nullptr, SawStore)) return false; DEBUG(dbgs() << "Try to fold single def: " << *DefMI @@ -213,7 +214,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI, DEBUG(dbgs() << " folded: " << *FoldMI); LIS.ReplaceMachineInstrInMaps(UseMI, FoldMI); UseMI->eraseFromParent(); - DefMI->addRegisterDead(LI->reg, 0); + DefMI->addRegisterDead(LI->reg, nullptr); Dead.push_back(DefMI); ++NumDCEFoldedLoads; return true; @@ -236,7 +237,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) { // Use the same criteria as DeadMachineInstructionElim. 
bool SawStore = false; - if (!MI->isSafeToMove(&TII, 0, SawStore)) { + if (!MI->isSafeToMove(&TII, nullptr, SawStore)) { DEBUG(dbgs() << "Can't delete: " << Idx << '\t' << *MI); return; } diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp index 7f797be..de2ce22 100644 --- a/lib/CodeGen/LiveRegMatrix.cpp +++ b/lib/CodeGen/LiveRegMatrix.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" #include "llvm/CodeGen/LiveRegMatrix.h" #include "RegisterCoalescer.h" #include "llvm/ADT/Statistic.h" @@ -25,6 +24,8 @@ using namespace llvm; +#define DEBUG_TYPE "regalloc" + STATISTIC(NumAssigned , "Number of registers assigned"); STATISTIC(NumUnassigned , "Number of registers unassigned"); diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStackAnalysis.cpp index be11a8f..b3161a4 100644 --- a/lib/CodeGen/LiveStackAnalysis.cpp +++ b/lib/CodeGen/LiveStackAnalysis.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "livestacks" #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" @@ -24,6 +23,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "livestacks" + char LiveStacks::ID = 0; INITIALIZE_PASS_BEGIN(LiveStacks, "livestacks", "Live Stack Slot Analysis", false, false) diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp index ed55d7a..758b216 100644 --- a/lib/CodeGen/LiveVariables.cpp +++ b/lib/CodeGen/LiveVariables.cpp @@ -61,7 +61,7 @@ LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const { for (unsigned i = 0, e = Kills.size(); i != e; ++i) if (Kills[i]->getParent() == MBB) return Kills[i]; - return NULL; + return nullptr; } void LiveVariables::VarInfo::dump() const { @@ -193,7 +193,7 @@ MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg, SmallSet &PartDefRegs) { unsigned LastDefReg = 0; unsigned LastDefDist = 0; - MachineInstr *LastDef = NULL; + MachineInstr *LastDef = nullptr; for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { unsigned SubReg = *SubRegs; MachineInstr *Def = PhysRegDef[SubReg]; @@ -208,7 +208,7 @@ MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg, } if (!LastDef) - return 0; + return nullptr; PartDefRegs.insert(LastDefReg); for (unsigned i = 0, e = LastDef->getNumOperands(); i != e; ++i) { @@ -282,7 +282,7 @@ MachineInstr *LiveVariables::FindLastRefOrPartRef(unsigned Reg) { MachineInstr *LastDef = PhysRegDef[Reg]; MachineInstr *LastUse = PhysRegUse[Reg]; if (!LastDef && !LastUse) - return 0; + return nullptr; MachineInstr *LastRefOrPartRef = LastUse ? 
LastUse : LastDef; unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef]; @@ -333,7 +333,7 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) { // AX = AL // = AL // AX = - MachineInstr *LastPartDef = 0; + MachineInstr *LastPartDef = nullptr; unsigned LastPartDefDist = 0; SmallSet PartUses; for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { @@ -436,7 +436,7 @@ void LiveVariables::HandleRegMask(const MachineOperand &MO) { for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) if ((PhysRegDef[*SR] || PhysRegUse[*SR]) && MO.clobbersPhysReg(*SR)) Super = *SR; - HandlePhysRegKill(Super, 0); + HandlePhysRegKill(Super, nullptr); } } @@ -492,7 +492,7 @@ void LiveVariables::UpdatePhysRegDefs(MachineInstr *MI, SubRegs.isValid(); ++SubRegs) { unsigned SubReg = *SubRegs; PhysRegDef[SubReg] = MI; - PhysRegUse[SubReg] = NULL; + PhysRegUse[SubReg] = nullptr; } } } @@ -506,8 +506,8 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) { PhysRegDef = new MachineInstr*[NumRegs]; PhysRegUse = new MachineInstr*[NumRegs]; PHIVarInfo = new SmallVector[MF->getNumBlockIDs()]; - std::fill(PhysRegDef, PhysRegDef + NumRegs, (MachineInstr*)0); - std::fill(PhysRegUse, PhysRegUse + NumRegs, (MachineInstr*)0); + std::fill(PhysRegDef, PhysRegDef + NumRegs, nullptr); + std::fill(PhysRegUse, PhysRegUse + NumRegs, nullptr); PHIJoins.clear(); // FIXME: LiveIntervals will be updated to remove its dependence on @@ -536,7 +536,7 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) { EE = MBB->livein_end(); II != EE; ++II) { assert(TargetRegisterInfo::isPhysicalRegister(*II) && "Cannot have a live-in virtual register!"); - HandlePhysRegDef(*II, 0, Defs); + HandlePhysRegDef(*II, nullptr, Defs); } // Loop over all of the instructions, processing them. @@ -639,10 +639,10 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) { // available at the end of the basic block. for (unsigned i = 0; i != NumRegs; ++i) if ((PhysRegDef[i] || PhysRegUse[i]) && !LiveOuts.count(i)) - HandlePhysRegDef(i, 0, Defs); + HandlePhysRegDef(i, nullptr, Defs); - std::fill(PhysRegDef, PhysRegDef + NumRegs, (MachineInstr*)0); - std::fill(PhysRegUse, PhysRegUse + NumRegs, (MachineInstr*)0); + std::fill(PhysRegDef, PhysRegDef + NumRegs, nullptr); + std::fill(PhysRegUse, PhysRegUse + NumRegs, nullptr); } // Convert and transfer the dead / killed information we have gathered into @@ -701,14 +701,15 @@ void LiveVariables::removeVirtualRegistersKilled(MachineInstr *MI) { /// which is used in a PHI node. We map that to the BB the vreg is coming from. 
/// void LiveVariables::analyzePHINodes(const MachineFunction& Fn) { - for (MachineFunction::const_iterator I = Fn.begin(), E = Fn.end(); - I != E; ++I) - for (MachineBasicBlock::const_iterator BBI = I->begin(), BBE = I->end(); - BBI != BBE && BBI->isPHI(); ++BBI) - for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) - if (BBI->getOperand(i).readsReg()) - PHIVarInfo[BBI->getOperand(i + 1).getMBB()->getNumber()] - .push_back(BBI->getOperand(i).getReg()); + for (const auto &MBB : Fn) + for (const auto &BBI : MBB) { + if (!BBI.isPHI()) + break; + for (unsigned i = 1, e = BBI.getNumOperands(); i != e; i += 2) + if (BBI.getOperand(i).readsReg()) + PHIVarInfo[BBI.getOperand(i + 1).getMBB()->getNumber()] + .push_back(BBI.getOperand(i).getReg()); + } } bool LiveVariables::VarInfo::isLiveIn(const MachineBasicBlock &MBB, diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp index 122d467..36885e8 100644 --- a/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "localstackalloc" #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -40,6 +39,8 @@ using namespace llvm; +#define DEBUG_TYPE "localstackalloc" + STATISTIC(NumAllocations, "Number of frame indices allocated into local block"); STATISTIC(NumBaseRegisters, "Number of virtual frame base registers allocated"); STATISTIC(NumReplacements, "Number of frame indices references replaced"); diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 888c20e..0ec5c33 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -35,9 +35,11 @@ #include using namespace llvm; +#define DEBUG_TYPE "codegen" + MachineBasicBlock::MachineBasicBlock(MachineFunction &mf, const BasicBlock *bb) : BB(bb), Number(-1), xParent(&mf), Alignment(0), IsLandingPad(false), - AddressTaken(false), CachedMCSymbol(NULL) { + AddressTaken(false), CachedMCSymbol(nullptr) { Insts.Parent = this; } @@ -98,7 +100,7 @@ void ilist_traits::removeNodeFromList(MachineBasicBlock *N) { /// list, we update its parent pointer and add its operands from reg use/def /// lists if appropriate. void ilist_traits::addNodeToList(MachineInstr *N) { - assert(N->getParent() == 0 && "machine instruction already in a basic block"); + assert(!N->getParent() && "machine instruction already in a basic block"); N->setParent(Parent); // Add the instruction's register operands to their corresponding @@ -113,13 +115,13 @@ void ilist_traits::addNodeToList(MachineInstr *N) { /// list, we update its parent pointer and remove its operands from reg use/def /// lists if appropriate. void ilist_traits::removeNodeFromList(MachineInstr *N) { - assert(N->getParent() != 0 && "machine instruction not in a basic block"); + assert(N->getParent() && "machine instruction not in a basic block"); // Remove from the use/def lists. if (MachineFunction *MF = N->getParent()->getParent()) N->RemoveRegOperandsFromUseLists(MF->getRegInfo()); - N->setParent(0); + N->setParent(nullptr); LeakDetector::addGarbageObject(N); } @@ -229,11 +231,11 @@ MachineBasicBlock::getLastNonDebugInstr() const { const MachineBasicBlock *MachineBasicBlock::getLandingPadSuccessor() const { // A block with a landing pad successor only has one other successor. 
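analyzePHINodes above is one of several loops this patch rewrites from explicit const_iterator pairs into range-based for; note how the new code trades the BBI->isPHI() loop condition for an early break, relying on PHIs clustering at the block start. The same shape with stand-in types:

    #include <vector>

    struct Inst { bool IsPHI; };
    using Block = std::vector<Inst>;

    static unsigned countPHIs(const std::vector<Block> &Fn) {
      unsigned N = 0;
      for (const auto &MBB : Fn)     // was: for (I = Fn.begin(), E = Fn.end(); ...)
        for (const auto &BBI : MBB) {
          if (!BBI.IsPHI)
            break;                   // PHIs only appear at the top of a block
          ++N;
        }
      return N;
    }

    int main() {
      std::vector<Block> Fn = {{{true}, {true}, {false}}, {{false}}};
      return countPHIs(Fn) == 2 ? 0 : 1;
    }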
if (succ_size() > 2) - return 0; + return nullptr; for (const_succ_iterator I = succ_begin(), E = succ_end(); I != E; ++I) if ((*I)->isLandingPad()) return *I; - return 0; + return nullptr; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -392,7 +394,7 @@ void MachineBasicBlock::updateTerminator() { // A block with no successors has no concerns with fall-through edges. if (this->succ_empty()) return; - MachineBasicBlock *TBB = 0, *FBB = 0; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector Cond; DebugLoc dl; // FIXME: this is nowhere bool B = TII->AnalyzeBranch(*this, TBB, FBB, Cond); @@ -423,7 +425,7 @@ void MachineBasicBlock::updateTerminator() { // Finally update the unconditional successor to be reached via a branch // if it would not be reached by fallthrough. if (!isLayoutSuccessor(TBB)) - TII->InsertBranch(*this, TBB, 0, Cond, dl); + TII->InsertBranch(*this, TBB, nullptr, Cond, dl); } } else { if (FBB) { @@ -434,16 +436,16 @@ void MachineBasicBlock::updateTerminator() { if (TII->ReverseBranchCondition(Cond)) return; TII->RemoveBranch(*this); - TII->InsertBranch(*this, FBB, 0, Cond, dl); + TII->InsertBranch(*this, FBB, nullptr, Cond, dl); } else if (isLayoutSuccessor(FBB)) { TII->RemoveBranch(*this); - TII->InsertBranch(*this, TBB, 0, Cond, dl); + TII->InsertBranch(*this, TBB, nullptr, Cond, dl); } } else { // Walk through the successors and find the successor which is not // a landing pad and is not the conditional branch destination (in TBB) // as the fallthrough successor. - MachineBasicBlock *FallthroughBB = 0; + MachineBasicBlock *FallthroughBB = nullptr; for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) { if ((*SI)->isLandingPad() || *SI == TBB) continue; @@ -461,7 +463,7 @@ void MachineBasicBlock::updateTerminator() { // Finally update the unconditional successor to be reached via a branch // if it would not be reached by fallthrough. if (!isLayoutSuccessor(TBB)) - TII->InsertBranch(*this, TBB, 0, Cond, dl); + TII->InsertBranch(*this, TBB, nullptr, Cond, dl); return; } @@ -470,11 +472,11 @@ void MachineBasicBlock::updateTerminator() { if (TII->ReverseBranchCondition(Cond)) { // We can't reverse the condition, add an unconditional branch. Cond.clear(); - TII->InsertBranch(*this, FallthroughBB, 0, Cond, dl); + TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, dl); return; } TII->RemoveBranch(*this); - TII->InsertBranch(*this, FallthroughBB, 0, Cond, dl); + TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, dl); } else if (!isLayoutSuccessor(FallthroughBB)) { TII->RemoveBranch(*this); TII->InsertBranch(*this, TBB, FallthroughBB, Cond, dl); @@ -641,7 +643,7 @@ bool MachineBasicBlock::canFallThrough() { return false; // Analyze the branches, if any, at the end of the block. - MachineBasicBlock *TBB = 0, *FBB = 0; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector Cond; const TargetInstrInfo *TII = getParent()->getTarget().getInstrInfo(); if (TII->AnalyzeBranch(*this, TBB, FBB, Cond)) { @@ -654,7 +656,7 @@ bool MachineBasicBlock::canFallThrough() { } // If there is no branch, control always falls through. - if (TBB == 0) return true; + if (!TBB) return true; // If there is some explicit branch to the fallthrough block, it can obviously // reach, even though the branch should get folded to fall through implicitly. @@ -668,7 +670,7 @@ bool MachineBasicBlock::canFallThrough() { // Otherwise, if it is conditional and has no explicit false block, it falls // through. 
- return FBB == 0; + return FBB == nullptr; } MachineBasicBlock * @@ -676,7 +678,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { // Splitting the critical edge to a landing pad block is non-trivial. Don't do // it in this generic function. if (Succ->isLandingPad()) - return NULL; + return nullptr; MachineFunction *MF = getParent(); DebugLoc dl; // FIXME: this is nowhere @@ -684,15 +686,15 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { // Performance might be harmed on HW that implements branching using exec mask // where both sides of the branches are always executed. if (MF->getTarget().requiresStructuredCFG()) - return NULL; + return nullptr; // We may need to update this's terminator, but we can't do that if // AnalyzeBranch fails. If this uses a jump table, we won't touch it. const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); - MachineBasicBlock *TBB = 0, *FBB = 0; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; if (TII->AnalyzeBranch(*this, TBB, FBB, Cond)) - return NULL; + return nullptr; // Avoid bugpoint weirdness: A block may end with a conditional branch but // jumps to the same MBB in either case. We have duplicate CFG edges in that @@ -701,7 +703,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { if (TBB && TBB == FBB) { DEBUG(dbgs() << "Won't split critical edge after degenerate BB#" << getNumber() << '\n'); - return NULL; + return nullptr; } MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock(); @@ -793,7 +795,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { NMBB->addSuccessor(Succ); if (!NMBB->isLayoutSuccessor(Succ)) { Cond.clear(); - MF->getTarget().getInstrInfo()->InsertBranch(*NMBB, Succ, NULL, Cond, dl); + MF->getTarget().getInstrInfo()->InsertBranch(*NMBB, Succ, nullptr, Cond, dl); if (Indexes) { for (instr_iterator I = NMBB->instr_begin(), E = NMBB->instr_end(); @@ -1065,11 +1067,11 @@ bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA, MachineFunction::iterator FallThru = std::next(MachineFunction::iterator(this)); - if (DestA == 0 && DestB == 0) { + if (!DestA && !DestB) { // Block falls through to successor. DestA = FallThru; DestB = FallThru; - } else if (DestA != 0 && DestB == 0) { + } else if (DestA && !DestB) { if (isCond) // Block ends in conditional jump that falls through to successor.
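updateTerminator, canFallThrough, and SplitCriticalEdge all lean on the AnalyzeBranch convention: false on success, with the block's control flow reported through the TBB/FBB/Cond out-parameters. The way these call sites decode the four shapes (a reading of the code above, not a quote of the TargetInstrInfo documentation):

    // if (!TII->AnalyzeBranch(MBB, TBB, FBB, Cond)) { ... } then:
    //
    //   TBB == nullptr                -> block falls through
    //   TBB && Cond.empty()           -> unconditional branch to TBB
    //   TBB && !Cond.empty() && !FBB  -> conditional branch to TBB,
    //                                    else fall through
    //   TBB && !Cond.empty() && FBB   -> conditional branch to TBB,
    //                                    else branch to FBB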
DestB = FallThru; diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp index 13203d5..9151d99 100644 --- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -1,4 +1,4 @@ -//====------ MachineBlockFrequencyInfo.cpp - MBB Frequency Analysis ------====// +//===- MachineBlockFrequencyInfo.cpp - MBB Frequency Analysis -------------===// // // The LLVM Compiler Infrastructure // @@ -12,8 +12,10 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" -#include "llvm/Analysis/BlockFrequencyImpl.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" @@ -22,6 +24,8 @@ using namespace llvm; +#define DEBUG_TYPE "block-freq" + #ifndef NDEBUG enum GVDAGType { GVDT_None, @@ -112,6 +116,7 @@ struct DOTGraphTraits : INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq", "Machine Block Frequency Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq", "Machine Block Frequency Analysis", true, true) @@ -127,16 +132,18 @@ MachineBlockFrequencyInfo::~MachineBlockFrequencyInfo() {} void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); + AU.addRequired(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) { MachineBranchProbabilityInfo &MBPI = - getAnalysis(); + getAnalysis(); + MachineLoopInfo &MLI = getAnalysis(); if (!MBFI) MBFI.reset(new ImplType); - MBFI->doFunction(&F, &MBPI); + MBFI->doFunction(&F, &MBPI, &MLI); #ifndef NDEBUG if (ViewMachineBlockFreqPropagationDAG != GVDT_None) { view(); @@ -166,7 +173,7 @@ getBlockFreq(const MachineBasicBlock *MBB) const { } const MachineFunction *MachineBlockFrequencyInfo::getFunction() const { - return MBFI ? MBFI->Fn : nullptr; + return MBFI ? 
MBFI->getFunction() : nullptr; } raw_ostream & diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 771e7ce..74af1e2 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -25,7 +25,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "block-placement2" #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" @@ -46,6 +45,8 @@ #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "block-placement2" + STATISTIC(NumCondBranches, "Number of conditional branches"); STATISTIC(NumUncondBranches, "Number of unconditional branches"); STATISTIC(CondBranchTakenFreq, @@ -206,7 +207,7 @@ class MachineBlockPlacement : public MachineFunctionPass { void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, SmallVectorImpl<MachineBasicBlock *> &BlockWorkList, - const BlockFilterSet *BlockFilter = 0); + const BlockFilterSet *BlockFilter = nullptr); MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter); @@ -220,7 +221,7 @@ class MachineBlockPlacement : public MachineFunctionPass { const BlockFilterSet *BlockFilter); void buildChain(MachineBasicBlock *BB, BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &BlockWorkList, - const BlockFilterSet *BlockFilter = 0); + const BlockFilterSet *BlockFilter = nullptr); MachineBasicBlock *findBestLoopTop(MachineLoop &L, const BlockFilterSet &LoopBlockSet); MachineBasicBlock *findBestLoopExit(MachineFunction &F, @@ -334,7 +335,7 @@ MachineBasicBlock *MachineBlockPlacement::selectBestSuccessor( const BlockFilterSet *BlockFilter) { const BranchProbability HotProb(4, 5); // 80% - MachineBasicBlock *BestSucc = 0; + MachineBasicBlock *BestSucc = nullptr; // FIXME: Due to the performance of the probability and weight routines in // the MBPI analysis, we manually compute probabilities using the edge // weights. This is suboptimal as it means that the somewhat subtle @@ -432,7 +433,7 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( }), WorkList.end()); - MachineBasicBlock *BestBlock = 0; + MachineBasicBlock *BestBlock = nullptr; BlockFrequency BestFreq; for (SmallVectorImpl<MachineBasicBlock *>::iterator WBI = WorkList.begin(), WBE = WorkList.end(); @@ -479,7 +480,7 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( return *BlockToChain[I]->begin(); } } - return 0; + return nullptr; } void MachineBlockPlacement::buildChain( @@ -560,7 +561,7 @@ MachineBlockPlacement::findBestLoopTop(MachineLoop &L, << getBlockName(L.getHeader()) << "\n"); BlockFrequency BestPredFreq; - MachineBasicBlock *BestPred = 0; + MachineBasicBlock *BestPred = nullptr; for (MachineBasicBlock::pred_iterator PI = L.getHeader()->pred_begin(), PE = L.getHeader()->pred_end(); PI != PE; ++PI) { @@ -616,11 +617,11 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, // header and only rotate if safe. BlockChain &HeaderChain = *BlockToChain[L.getHeader()]; if (!LoopBlockSet.count(*HeaderChain.begin())) - return 0; + return nullptr; BlockFrequency BestExitEdgeFreq; unsigned BestExitLoopDepth = 0; - MachineBasicBlock *ExitingBB = 0; + MachineBasicBlock *ExitingBB = nullptr; // If there are exits to outer loops, loop rotation can severely limit // fallthrough opportunities unless it selects such an exit. Keep a set of // blocks where rotating to exit with that block will reach an outer loop.
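HotProb(4, 5) is the 80% hotness threshold named in the comment. Comparing such fractions needs no floating point, only cross-multiplication in a wider type; a sketch of the idea (the real BranchProbability class is more careful about scaling and overflow):

    #include <cstdint>

    struct Prob {
      uint32_t N, D; // numerator / denominator, with N <= D
    };

    // P >= Q  <=>  P.N * Q.D >= Q.N * P.D, computed in 64 bits.
    static bool atLeast(Prob P, Prob Q) {
      return uint64_t(P.N) * Q.D >= uint64_t(Q.N) * P.D;
    }

    int main() {
      Prob Hot{4, 5};    // 80%, as in HotProb(4, 5)
      Prob Edge{13, 16}; // 81.25%
      return atLeast(Edge, Hot) ? 0 : 1; // the edge qualifies as hot
    }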
@@ -709,14 +710,14 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, // Without a candidate exiting block or with only a single block in the // loop, just use the loop header to layout the loop. if (!ExitingBB || L.getNumBlocks() == 1) - return 0; + return nullptr; // Also, if we have exit blocks which lead to outer loops but didn't select // one of them as the exiting block we are rotating toward, disable loop // rotation altogether. if (!BlocksExitingToOuterLoop.empty() && !BlocksExitingToOuterLoop.count(ExitingBB)) - return 0; + return nullptr; DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB) << "\n"); return ExitingBB; @@ -795,7 +796,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, // If we selected just the header for the loop top, look for a potentially // profitable exit block in the event that rotating the loop can eliminate // branches by placing an exit edge at the bottom. - MachineBasicBlock *ExitingBB = 0; + MachineBasicBlock *ExitingBB = nullptr; if (LoopTop == L.getHeader()) ExitingBB = findBestLoopExit(F, L, LoopBlockSet); @@ -883,7 +884,7 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // the exact fallthrough behavior for. for (;;) { Cond.clear(); - MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch. + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. if (!TII->AnalyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough()) break; @@ -895,7 +896,7 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: " << getBlockName(BB) << " -> " << getBlockName(NextBB) << "\n"); - Chain->merge(NextBB, 0); + Chain->merge(NextBB, nullptr); FI = NextFI; BB = NextBB; } @@ -987,7 +988,7 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // than assert when the branch cannot be analyzed in order to remove this // boiler plate. Cond.clear(); - MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch. + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. if (!TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond)) { // The "PrevBB" is not yet updated to reflect current code layout, so, // o. it may fall-through to a block without explict "goto" instruction @@ -1004,10 +1005,10 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { PrevBB->updateTerminator(); needUpdateBr = false; Cond.clear(); - TBB = FBB = 0; + TBB = FBB = nullptr; if (TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond)) { // FIXME: This should never take place. - TBB = FBB = 0; + TBB = FBB = nullptr; } } @@ -1032,7 +1033,7 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // Fixup the last block. Cond.clear(); - MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch. + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. 
if (!TII->AnalyzeBranch(F.back(), TBB, FBB, Cond)) F.back().updateTerminator(); diff --git a/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/lib/CodeGen/MachineBranchProbabilityInfo.cpp index 1d6879b..6fbc2be 100644 --- a/lib/CodeGen/MachineBranchProbabilityInfo.cpp +++ b/lib/CodeGen/MachineBranchProbabilityInfo.cpp @@ -88,7 +88,7 @@ MachineBranchProbabilityInfo::isEdgeHot(const MachineBasicBlock *Src, MachineBasicBlock * MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const { uint32_t MaxWeight = 0; - MachineBasicBlock *MaxSucc = 0; + MachineBasicBlock *MaxSucc = nullptr; for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { uint32_t Weight = getEdgeWeight(MBB, I); @@ -101,7 +101,7 @@ MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const { if (getEdgeProbability(MBB, MaxSucc) >= BranchProbability(4, 5)) return MaxSucc; - return 0; + return nullptr; } BranchProbability MachineBranchProbabilityInfo::getEdgeProbability( diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp index 9c3bcc4..7da439c 100644 --- a/lib/CodeGen/MachineCSE.cpp +++ b/lib/CodeGen/MachineCSE.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "machine-cse" #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/ScopedHashTable.h" @@ -28,6 +27,8 @@ #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; +#define DEBUG_TYPE "machine-cse" + STATISTIC(NumCoalesces, "Number of copies coalesced"); STATISTIC(NumCSEs, "Number of common subexpression eliminated"); STATISTIC(NumPhysCSEs, diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp index 7e1970c..3119a35 100644 --- a/lib/CodeGen/MachineCopyPropagation.cpp +++ b/lib/CodeGen/MachineCopyPropagation.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "codegen-cp" #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" @@ -28,6 +27,8 @@ #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "codegen-cp" + STATISTIC(NumDeletes, "Number of dead copies deleted"); namespace { diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index 061efdb..eb3d71f 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -38,6 +38,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "codegen" + //===----------------------------------------------------------------------===// // MachineFunction implementation //===----------------------------------------------------------------------===// @@ -56,9 +58,9 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM, if (TM.getRegisterInfo()) RegInfo = new (Allocator) MachineRegisterInfo(TM); else - RegInfo = 0; + RegInfo = nullptr; - MFInfo = 0; + MFInfo = nullptr; FrameInfo = new (Allocator) MachineFrameInfo(TM,!F->hasFnAttribute("no-realign-stack")); @@ -77,7 +79,7 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM, TM.getTargetLowering()->getPrefFunctionAlignment()); FunctionNumber = FunctionNum; - JumpTableInfo = 0; + JumpTableInfo = nullptr; } MachineFunction::~MachineFunction() { @@ -123,6 +125,11 @@ getOrCreateJumpTableInfo(unsigned EntryKind) { return JumpTableInfo; } +/// Should we be emitting segmented stack stuff for the 
function +bool MachineFunction::shouldSplitStack() { + return getFunction()->hasFnAttribute("split-stack"); +} + /// RenumberBlocks - This discards all of the MachineBasicBlock numbers and /// recomputes them. This guarantees that the MBB numbers are sequential, /// dense, and match the ordering of the blocks within the function. If a @@ -131,7 +138,7 @@ getOrCreateJumpTableInfo(unsigned EntryKind) { void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) { if (empty()) { MBBNumbering.clear(); return; } MachineFunction::iterator MBBI, E = end(); - if (MBB == 0) + if (MBB == nullptr) MBBI = begin(); else MBBI = MBB; @@ -147,7 +154,7 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) { if (MBBI->getNumber() != -1) { assert(MBBNumbering[MBBI->getNumber()] == &*MBBI && "MBB number mismatch!"); - MBBNumbering[MBBI->getNumber()] = 0; + MBBNumbering[MBBI->getNumber()] = nullptr; } // If BlockNo is already taken, set that block's number to -1. @@ -231,11 +238,17 @@ MachineFunction::getMachineMemOperand(MachinePointerInfo PtrInfo, unsigned f, MachineMemOperand * MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO, int64_t Offset, uint64_t Size) { + if (MMO->getValue()) + return new (Allocator) + MachineMemOperand(MachinePointerInfo(MMO->getValue(), + MMO->getOffset()+Offset), + MMO->getFlags(), Size, + MMO->getBaseAlignment(), nullptr); return new (Allocator) - MachineMemOperand(MachinePointerInfo(MMO->getValue(), + MachineMemOperand(MachinePointerInfo(MMO->getPseudoValue(), MMO->getOffset()+Offset), MMO->getFlags(), Size, - MMO->getBaseAlignment(), 0); + MMO->getBaseAlignment(), nullptr); } MachineInstr::mmo_iterator @@ -352,9 +365,9 @@ void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const { OS << '\n'; } - for (const_iterator BB = begin(), E = end(); BB != E; ++BB) { + for (const auto &BB : *this) { OS << '\n'; - BB->print(OS, Indexes); + BB.print(OS, Indexes); } OS << "\n# End machine code for function " << getName() << ".\n\n"; @@ -564,7 +577,7 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset, Align, getFrameLowering()->getStackAlignment()); Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable, /*isSS*/ false, - /*Alloca*/ 0)); + /*Alloca*/ nullptr)); return -++NumFixedObjects; } @@ -583,7 +596,7 @@ MachineFrameInfo::getPristineRegs(const MachineBasicBlock *MBB) const { if (!isCalleeSavedInfoValid()) return BV; - for (const uint16_t *CSR = TRI->getCalleeSavedRegs(MF); CSR && *CSR; ++CSR) + for (const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF); CSR && *CSR; ++CSR) BV.set(*CSR); // The entry MBB always has all CSRs pristine. 
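The MachineFunction.cpp hunk above teaches getMachineMemOperand(MMO, Offset, Size) to preserve whichever underlying object the original operand carried, an IR Value or a PseudoSourceValue, while re-basing the offset and size. A hedged sketch of that clone-at-offset idea with hypothetical stand-in types (MemOperand and cloneAtOffset are not LLVM names):

    #include <cstdint>

    struct MemOperand {
      const void *IRValue;      // set for accesses to real IR objects
      const void *PseudoValue;  // set for stack/GOT/constant-pool accesses
      int64_t Offset;
      uint64_t Size;
    };

    MemOperand cloneAtOffset(const MemOperand &MMO, int64_t Off, uint64_t Sz) {
      MemOperand New = MMO;          // keep IRValue or PseudoValue as-is
      New.Offset = MMO.Offset + Off; // shift within the same underlying object
      New.Size = Sz;                 // usually a narrower slice
      return New;
    }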
diff --git a/lib/CodeGen/MachineFunctionAnalysis.cpp b/lib/CodeGen/MachineFunctionAnalysis.cpp
index 35591e1..46cd60a 100644
--- a/lib/CodeGen/MachineFunctionAnalysis.cpp
+++ b/lib/CodeGen/MachineFunctionAnalysis.cpp
@@ -20,7 +20,7 @@ using namespace llvm;
 char MachineFunctionAnalysis::ID = 0;
 
 MachineFunctionAnalysis::MachineFunctionAnalysis(const TargetMachine &tm) :
-  FunctionPass(ID), TM(tm), MF(0) {
+  FunctionPass(ID), TM(tm), MF(nullptr) {
   initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry());
 }
 
@@ -53,5 +53,5 @@ bool MachineFunctionAnalysis::runOnFunction(Function &F) {
 
 void MachineFunctionAnalysis::releaseMemory() {
   delete MF;
-  MF = 0;
+  MF = nullptr;
 }
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index d102794..5122165 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -128,7 +128,7 @@ void MachineOperand::ChangeToImmediate(int64_t ImmVal) {
 void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp,
                                       bool isKill, bool isDead, bool isUndef,
                                       bool isDebug) {
-  MachineRegisterInfo *RegInfo = 0;
+  MachineRegisterInfo *RegInfo = nullptr;
   if (MachineInstr *MI = getParent())
     if (MachineBasicBlock *MBB = MI->getParent())
       if (MachineFunction *MF = MBB->getParent())
@@ -152,7 +152,7 @@ void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp,
   IsEarlyClobber = false;
   IsDebug = isDebug;
   // Ensure isOnRegUseList() returns false.
-  Contents.Reg.Prev = 0;
+  Contents.Reg.Prev = nullptr;
   // Preserve the tie when the operand was already a register.
   if (!WasReg)
     TiedTo = 0;
@@ -265,7 +265,7 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
     if (const MachineBasicBlock *MBB = MI->getParent())
       if (const MachineFunction *MF = MBB->getParent())
         TM = &MF->getTarget();
-  const TargetRegisterInfo *TRI = TM ? TM->getRegisterInfo() : 0;
+  const TargetRegisterInfo *TRI = TM ? TM->getRegisterInfo() : nullptr;
 
   switch (getType()) {
   case MachineOperand::MO_Register:
@@ -399,8 +399,8 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
 /// getAddrSpace - Return the LLVM IR address space number that this pointer
 /// points into.
 unsigned MachinePointerInfo::getAddrSpace() const {
-  if (V == 0) return 0;
-  return cast<PointerType>(V->getType())->getAddressSpace();
+  if (V.isNull() || V.is<const PseudoSourceValue*>()) return 0;
+  return cast<PointerType>(V.get<const Value*>()->getType())->getAddressSpace();
 }
 
 /// getConstantPool - Return a MachinePointerInfo record that refers to the
@@ -434,7 +434,8 @@ MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, unsigned f,
   : PtrInfo(ptrinfo), Size(s),
     Flags((f & ((1 << MOMaxBits) - 1)) | ((Log2_32(a) + 1) << MOMaxBits)),
     TBAAInfo(TBAAInfo), Ranges(Ranges) {
-  assert((PtrInfo.V == 0 || isa<PointerType>(PtrInfo.V->getType())) &&
+  assert((PtrInfo.V.isNull() || PtrInfo.V.is<const PseudoSourceValue*>() ||
+          isa<PointerType>(PtrInfo.V.get<const Value*>()->getType())) &&
          "invalid pointer value");
   assert(getBaseAlignment() == a && "Alignment is not a power of 2!");
   assert((isLoad() || isStore()) && "Not a load/store!");
@@ -445,7 +446,7 @@ MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, unsigned f,
 void MachineMemOperand::Profile(FoldingSetNodeID &ID) const {
   ID.AddInteger(getOffset());
   ID.AddInteger(Size);
-  ID.AddPointer(getValue());
+  ID.AddPointer(getOpaqueValue());
   ID.AddInteger(Flags);
 }
 
@@ -486,10 +487,12 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineMemOperand &MMO) {
 
   // Print the address information.
   OS << "[";
-  if (!MMO.getValue())
-    OS << "<unknown>";
+  if (const Value *V = MMO.getValue())
+    V->printAsOperand(OS, /*PrintType=*/false);
+  else if (const PseudoSourceValue *PSV = MMO.getPseudoValue())
+    PSV->printCustom(OS);
   else
-    MMO.getValue()->printAsOperand(OS, /*PrintType=*/false);
+    OS << "<unknown>";
 
   unsigned AS = MMO.getAddrSpace();
   if (AS != 0)
@@ -545,9 +548,9 @@ void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) {
 /// the MCInstrDesc.
 MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid,
                            const DebugLoc dl, bool NoImp)
-  : MCID(&tid), Parent(0), Operands(0), NumOperands(0),
+  : MCID(&tid), Parent(nullptr), Operands(nullptr), NumOperands(0),
     Flags(0), AsmPrinterFlags(0),
-    NumMemRefs(0), MemRefs(0), debugLoc(dl) {
+    NumMemRefs(0), MemRefs(nullptr), debugLoc(dl) {
   // Reserve space for the expected number of operands.
   if (unsigned NumOps = MCID->getNumOperands() +
     MCID->getNumImplicitDefs() + MCID->getNumImplicitUses()) {
@@ -562,7 +565,7 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid,
 /// MachineInstr ctor - Copies MachineInstr arg exactly
 ///
 MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI)
-  : MCID(&MI.getDesc()), Parent(0), Operands(0), NumOperands(0),
+  : MCID(&MI.getDesc()), Parent(nullptr), Operands(nullptr), NumOperands(0),
     Flags(0), AsmPrinterFlags(0),
     NumMemRefs(MI.NumMemRefs), MemRefs(MI.MemRefs),
     debugLoc(MI.getDebugLoc()) {
@@ -583,7 +586,7 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI)
 MachineRegisterInfo *MachineInstr::getRegInfo() {
   if (MachineBasicBlock *MBB = getParent())
     return &MBB->getParent()->getRegInfo();
-  return 0;
+  return nullptr;
 }
 
 /// RemoveRegOperandsFromUseLists - Unlink all of the register operands in
@@ -702,7 +705,7 @@ void MachineInstr::addOperand(MachineFunction &MF, const MachineOperand &Op) {
   // When adding a register operand, tell MRI about it.
   if (NewMO->isReg()) {
     // Ensure isOnRegUseList() returns false, regardless of Op's status.
-    NewMO->Contents.Reg.Prev = 0;
+    NewMO->Contents.Reg.Prev = nullptr;
     // Ignore existing ties. This is not a property that can be copied.
     NewMO->TiedTo = 0;
     // Add the new operand to MRI, but only for instructions in an MBB.
@@ -974,7 +977,7 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx,
     return TII->getRegClass(getDesc(), OpIdx, TRI, MF);
 
   if (!getOperand(OpIdx).isReg())
-    return NULL;
+    return nullptr;
 
   // For tied uses on inline asm, get the constraint from the def.
   unsigned DefIdx;
@@ -984,7 +987,7 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx,
   // Inline asm stores register class constraints in the flag word.
   int FlagIdx = findInlineAsmFlagIdx(OpIdx);
   if (FlagIdx < 0)
-    return NULL;
+    return nullptr;
 
   unsigned Flag = getOperand(FlagIdx).getImm();
   unsigned RCID;
@@ -995,7 +998,7 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx,
   if (InlineAsm::getKind(Flag) == InlineAsm::Kind_Mem)
     return TRI->getPointerRegClass(MF);
 
-  return NULL;
+  return nullptr;
 }
 
 const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVReg(
@@ -1366,11 +1369,13 @@ bool MachineInstr::isInvariantLoad(AliasAnalysis *AA) const {
     if ((*I)->isStore()) return false;
     if ((*I)->isInvariant()) return true;
 
+    // A load from a constant PseudoSourceValue is invariant.
+    if (const PseudoSourceValue *PSV = (*I)->getPseudoValue())
+      if (PSV->isConstant(MFI))
+        continue;
+
     if (const Value *V = (*I)->getValue()) {
-      // A load from a constant PseudoSourceValue is invariant.
- if (const PseudoSourceValue *PSV = dyn_cast(V)) - if (PSV->isConstant(MFI)) - continue; // If we have an AliasAnalysis, ask it whether the memory is constant. if (AA && AA->pointsToConstantMemory( AliasAnalysis::Location(V, (*I)->getSize(), @@ -1448,32 +1453,14 @@ void MachineInstr::dump() const { static void printDebugLoc(DebugLoc DL, const MachineFunction *MF, raw_ostream &CommentOS) { const LLVMContext &Ctx = MF->getFunction()->getContext(); - if (!DL.isUnknown()) { // Print source line info. - DIScope Scope(DL.getScope(Ctx)); - assert((!Scope || Scope.isScope()) && - "Scope of a DebugLoc should be null or a DIScope."); - // Omit the directory, because it's likely to be long and uninteresting. - if (Scope) - CommentOS << Scope.getFilename(); - else - CommentOS << ""; - CommentOS << ':' << DL.getLine(); - if (DL.getCol() != 0) - CommentOS << ':' << DL.getCol(); - DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx)); - if (!InlinedAtDL.isUnknown()) { - CommentOS << " @[ "; - printDebugLoc(InlinedAtDL, MF, CommentOS); - CommentOS << " ]"; - } - } + DL.print(Ctx, CommentOS); } void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM, bool SkipOpers) const { // We can be a bit tidier if we know the TargetMachine and/or MachineFunction. - const MachineFunction *MF = 0; - const MachineRegisterInfo *MRI = 0; + const MachineFunction *MF = nullptr; + const MachineRegisterInfo *MRI = nullptr; if (const MachineBasicBlock *MBB = getParent()) { MF = MBB->getParent(); if (!TM && MF) @@ -1679,7 +1666,7 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM, OS << " line no:" << DV.getLineNumber(); if (MDNode *InlinedAt = DV.getInlinedAt()) { DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(InlinedAt); - if (!InlinedAtDL.isUnknown()) { + if (!InlinedAtDL.isUnknown() && MF) { OS << " inlined @[ "; printDebugLoc(InlinedAtDL, MF, OS); OS << " ]"; @@ -1756,7 +1743,7 @@ bool MachineInstr::addRegisterKilled(unsigned IncomingReg, void MachineInstr::clearRegisterKills(unsigned Reg, const TargetRegisterInfo *RegInfo) { if (!TargetRegisterInfo::isPhysicalRegister(Reg)) - RegInfo = 0; + RegInfo = nullptr; for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { MachineOperand &MO = getOperand(i); if (!MO.isReg() || !MO.isUse() || !MO.isKill()) @@ -1889,7 +1876,7 @@ MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) { void MachineInstr::emitError(StringRef Msg) const { // Find the source location cookie. 
   unsigned LocCookie = 0;
-  const MDNode *LocMD = 0;
+  const MDNode *LocMD = nullptr;
   for (unsigned i = getNumOperands(); i != 0; --i) {
     if (getOperand(i-1).isMetadata() &&
         (LocMD = getOperand(i-1).getMetadata()) &&
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index d3a1ee7..68d2efd 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -20,7 +20,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "machine-licm"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallSet.h"
@@ -42,6 +41,8 @@
 #include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "machine-licm"
+
 static cl::opt<bool>
 AvoidSpeculation("avoid-speculation",
                  cl::desc("MachineLICM should avoid speculation"),
@@ -358,7 +359,7 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) {
   SmallVector<MachineLoop*, 8> Worklist(MLI->begin(), MLI->end());
   while (!Worklist.empty()) {
     CurLoop = Worklist.pop_back_val();
-    CurPreheader = 0;
+    CurPreheader = nullptr;
     ExitBlocks.clear();
 
     // If this is done before regalloc, only visit outer-most preheader-sporting
@@ -390,10 +391,10 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) {
 static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
   for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
          oe = MI->memoperands_end(); o != oe; ++o) {
-    if (!(*o)->isStore() || !(*o)->getValue())
+    if (!(*o)->isStore() || !(*o)->getPseudoValue())
       continue;
     if (const FixedStackPseudoSourceValue *Value =
-        dyn_cast<FixedStackPseudoSourceValue>((*o)->getValue())) {
+        dyn_cast<FixedStackPseudoSourceValue>((*o)->getPseudoValue())) {
       if (Value->getFrameIndex() == FI)
         return true;
     }
@@ -700,7 +701,7 @@ void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
   WorkList.push_back(HeaderN);
   do {
     MachineDomTreeNode *Node = WorkList.pop_back_val();
-    assert(Node != 0 && "Null dominator tree node?");
+    assert(Node && "Null dominator tree node?");
     MachineBasicBlock *BB = Node->getBlock();
 
     // If the header of the loop containing this basic block is a landing pad,
@@ -804,7 +805,7 @@ void MachineLICM::InitRegPressure(MachineBasicBlock *BB) {
   // defs as well. This happens whenever the preheader is created by splitting
   // the critical edge from the loop predecessor to the loop header.
   if (BB->pred_size() == 1) {
-    MachineBasicBlock *TBB = 0, *FBB = 0;
+    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
    SmallVector<MachineOperand, 4> Cond;
     if (!TII->AnalyzeBranch(*BB, TBB, FBB, Cond, false) && Cond.empty())
       InitRegPressure(*BB->pred_begin());
@@ -882,10 +883,9 @@ static bool isLoadFromGOTOrConstantPool(MachineInstr &MI) {
   assert (MI.mayLoad() && "Expected MI that loads!");
   for (MachineInstr::mmo_iterator I = MI.memoperands_begin(),
          E = MI.memoperands_end(); I != E; ++I) {
-    if (const Value *V = (*I)->getValue()) {
-      if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V))
-        if (PSV == PSV->getGOT() || PSV == PSV->getConstantPool())
-          return true;
+    if (const PseudoSourceValue *PSV = (*I)->getPseudoValue()) {
+      if (PSV == PSV->getGOT() || PSV == PSV->getConstantPool())
+        return true;
     }
   }
   return false;
@@ -1241,13 +1241,13 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
 MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) {
   // Don't unfold simple loads.
   if (MI->canFoldAsLoad())
-    return 0;
+    return nullptr;
 
   // If not, we may be able to unfold a load and hoist that.
   // First test whether the instruction is loading from an amenable
   // memory location.
   if (!MI->isInvariantLoad(AA))
-    return 0;
+    return nullptr;
 
   // Next determine the register class for a temporary register.
   unsigned LoadRegIndex;
@@ -1256,9 +1256,9 @@ MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) {
                                     /*UnfoldLoad=*/true,
                                     /*UnfoldStore=*/false,
                                     &LoadRegIndex);
-  if (NewOpc == 0) return 0;
+  if (NewOpc == 0) return nullptr;
   const MCInstrDesc &MID = TII->get(NewOpc);
-  if (MID.getNumDefs() != 1) return 0;
+  if (MID.getNumDefs() != 1) return nullptr;
   MachineFunction &MF = *MI->getParent()->getParent();
   const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI, MF);
   // Ok, we're unfolding. Create a temporary register and do the unfold.
@@ -1284,7 +1284,7 @@ MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) {
   if (!IsLoopInvariantInst(*NewMIs[0]) || !IsProfitableToHoist(*NewMIs[0])) {
     NewMIs[0]->eraseFromParent();
     NewMIs[1]->eraseFromParent();
-    return 0;
+    return nullptr;
   }
 
   // Update register pressure for the unfolded instruction.
@@ -1316,10 +1316,10 @@ MachineLICM::LookForDuplicate(const MachineInstr *MI,
                               std::vector<const MachineInstr*> &PrevMIs) {
   for (unsigned i = 0, e = PrevMIs.size(); i != e; ++i) {
     const MachineInstr *PrevMI = PrevMIs[i];
-    if (TII->produceSameValue(MI, PrevMI, (PreRegAlloc ? MRI : 0)))
+    if (TII->produceSameValue(MI, PrevMI, (PreRegAlloc ? MRI : nullptr)))
       return PrevMI;
   }
-  return 0;
+  return nullptr;
 }
 
 bool MachineLICM::EliminateCSE(MachineInstr *MI,
@@ -1390,7 +1390,7 @@ bool MachineLICM::MayCSE(MachineInstr *MI) {
   if (CI == CSEMap.end() || MI->isImplicitDef())
     return false;
 
-  return LookForDuplicate(MI, CI->second) != 0;
+  return LookForDuplicate(MI, CI->second) != nullptr;
 }
 
 /// Hoist - When an instruction is found to use only loop invariant operands
@@ -1466,7 +1466,7 @@ MachineBasicBlock *MachineLICM::getCurPreheader() {
 
   // If we've tried to get a preheader and failed, don't try again.
   if (CurPreheader == reinterpret_cast<MachineBasicBlock *>(-1))
-    return 0;
+    return nullptr;
 
   if (!CurPreheader) {
     CurPreheader = CurLoop->getLoopPreheader();
@@ -1474,13 +1474,13 @@ MachineBasicBlock *MachineLICM::getCurPreheader() {
       MachineBasicBlock *Pred = CurLoop->getLoopPredecessor();
       if (!Pred) {
         CurPreheader = reinterpret_cast<MachineBasicBlock *>(-1);
-        return 0;
+        return nullptr;
       }
 
       CurPreheader = Pred->SplitCriticalEdge(CurLoop->getHeader(), this);
       if (!CurPreheader) {
         CurPreheader = reinterpret_cast<MachineBasicBlock *>(-1);
-        return 0;
+        return nullptr;
       }
     }
   }
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index 7181025..4976e35 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -36,8 +36,8 @@ namespace llvm {
 class MMIAddrLabelMapCallbackPtr : CallbackVH {
   MMIAddrLabelMap *Map;
 public:
-  MMIAddrLabelMapCallbackPtr() : Map(0) {}
-  MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V), Map(0) {}
+  MMIAddrLabelMapCallbackPtr() : Map(nullptr) {}
+  MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V), Map(nullptr) {}
 
   void setPtr(BasicBlock *BB) {
     ValueHandleBase::operator=(BB);
@@ -163,9 +163,9 @@ void MMIAddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) {
   AddrLabelSymEntry Entry = AddrLabelSymbols[BB];
   AddrLabelSymbols.erase(BB);
   assert(!Entry.Symbols.isNull() && "Didn't have a symbol, why a callback?");
-  BBCallbacks[Entry.Index] = 0;  // Clear the callback.
+  BBCallbacks[Entry.Index] = nullptr; // Clear the callback.
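getCurPreheader in the MachineLICM hunk above caches a possibly-failing lookup behind a sentinel pointer (reinterpret_cast of -1) so the expensive split-edge attempt happens at most once per loop. A self-contained sketch of the same pattern, assuming a hypothetical Block type and a caller-supplied compute callback (not LLVM code):

    #include <functional>

    struct Block;

    // Cache holds: nullptr (not computed yet), Failed (computed, no result),
    // or a real block. Compute() runs at most once per cache reset.
    Block *getCachedPreheader(Block *&Cache,
                              const std::function<Block *()> &Compute) {
      Block *const Failed = reinterpret_cast<Block *>(-1);
      if (Cache == Failed)
        return nullptr;          // already tried and failed; don't retry
      if (!Cache) {
        Cache = Compute();
        if (!Cache) {
          Cache = Failed;        // remember the failure
          return nullptr;
        }
      }
      return Cache;
    }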
- assert((BB->getParent() == 0 || BB->getParent() == Entry.Fn) && + assert((BB->getParent() == nullptr || BB->getParent() == Entry.Fn) && "Block/parent mismatch"); // Handle both the single and the multiple symbols cases. @@ -213,7 +213,7 @@ void MMIAddrLabelMap::UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New) { return; } - BBCallbacks[OldEntry.Index] = 0; // Update the callback. + BBCallbacks[OldEntry.Index] = nullptr; // Update the callback. // Otherwise, we need to add the old symbol to the new block's set. If it is // just a single entry, upgrade it to a symbol list. @@ -253,12 +253,12 @@ void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) { MachineModuleInfo::MachineModuleInfo(const MCAsmInfo &MAI, const MCRegisterInfo &MRI, const MCObjectFileInfo *MOFI) - : ImmutablePass(ID), Context(&MAI, &MRI, MOFI, 0, false) { + : ImmutablePass(ID), Context(&MAI, &MRI, MOFI, nullptr, false) { initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry()); } MachineModuleInfo::MachineModuleInfo() - : ImmutablePass(ID), Context(0, 0, 0) { + : ImmutablePass(ID), Context(nullptr, nullptr, nullptr) { llvm_unreachable("This MachineModuleInfo constructor should never be called, " "MMI should always be explicitly constructed by " "LLVMTargetMachine"); @@ -269,16 +269,16 @@ MachineModuleInfo::~MachineModuleInfo() { bool MachineModuleInfo::doInitialization(Module &M) { - ObjFileMMI = 0; + ObjFileMMI = nullptr; CompactUnwindEncoding = 0; CurCallSite = 0; CallsEHReturn = 0; CallsUnwindInit = 0; DbgInfoAvailable = UsesVAFloatArgument = false; // Always emit some info, by default "no personality" info. - Personalities.push_back(NULL); - AddrLabelSymbols = 0; - TheModule = 0; + Personalities.push_back(nullptr); + AddrLabelSymbols = nullptr; + TheModule = nullptr; return false; } @@ -288,12 +288,12 @@ bool MachineModuleInfo::doFinalization(Module &M) { Personalities.clear(); delete AddrLabelSymbols; - AddrLabelSymbols = 0; + AddrLabelSymbols = nullptr; Context.reset(); delete ObjFileMMI; - ObjFileMMI = 0; + ObjFileMMI = nullptr; return false; } @@ -341,7 +341,7 @@ void MachineModuleInfo::AnalyzeModule(const Module &M) { /// because the block may be accessed outside its containing function. MCSymbol *MachineModuleInfo::getAddrLabelSymbol(const BasicBlock *BB) { // Lazily create AddrLabelSymbols. - if (AddrLabelSymbols == 0) + if (!AddrLabelSymbols) AddrLabelSymbols = new MMIAddrLabelMap(Context); return AddrLabelSymbols->getAddrLabelSymbol(const_cast(BB)); } @@ -352,7 +352,7 @@ MCSymbol *MachineModuleInfo::getAddrLabelSymbol(const BasicBlock *BB) { std::vector MachineModuleInfo:: getAddrLabelSymbolToEmit(const BasicBlock *BB) { // Lazily create AddrLabelSymbols. - if (AddrLabelSymbols == 0) + if (!AddrLabelSymbols) AddrLabelSymbols = new MMIAddrLabelMap(Context); return AddrLabelSymbols->getAddrLabelSymbolToEmit(const_cast(BB)); } @@ -366,7 +366,7 @@ void MachineModuleInfo:: takeDeletedSymbolsForFunction(const Function *F, std::vector &Result) { // If no blocks have had their addresses taken, we're done. - if (AddrLabelSymbols == 0) return; + if (!AddrLabelSymbols) return; return AddrLabelSymbols-> takeDeletedSymbolsForFunction(const_cast(F), Result); } @@ -419,7 +419,7 @@ void MachineModuleInfo::addPersonality(MachineBasicBlock *LandingPad, // If this is the first personality we're adding go // ahead and add it at the beginning. 
- if (Personalities[0] == NULL) + if (!Personalities[0]) Personalities[0] = Personality; else Personalities.push_back(Personality); @@ -462,7 +462,7 @@ void MachineModuleInfo::TidyLandingPads(DenseMap *LPMap) { if (LandingPad.LandingPadLabel && !LandingPad.LandingPadLabel->isDefined() && (!LPMap || (*LPMap)[LandingPad.LandingPadLabel] == 0)) - LandingPad.LandingPadLabel = 0; + LandingPad.LandingPadLabel = nullptr; // Special case: we *should* emit LPs with null LP MBB. This indicates // "nounwind" case. @@ -550,13 +550,13 @@ try_next:; const Function *MachineModuleInfo::getPersonality() const { // FIXME: Until PR1414 will be fixed, we're using 1 personality function per // function - return !LandingPads.empty() ? LandingPads[0].Personality : NULL; + return !LandingPads.empty() ? LandingPads[0].Personality : nullptr; } /// getPersonalityIndex - Return unique index for current personality /// function. NULL/first personality function should always get zero index. unsigned MachineModuleInfo::getPersonalityIndex() const { - const Function* Personality = NULL; + const Function* Personality = nullptr; // Scan landing pads. If there is at least one non-NULL personality - use it. for (unsigned i = 0, e = LandingPads.size(); i != e; ++i) diff --git a/lib/CodeGen/MachinePassRegistry.cpp b/lib/CodeGen/MachinePassRegistry.cpp index cb204fd..3ee3e40 100644 --- a/lib/CodeGen/MachinePassRegistry.cpp +++ b/lib/CodeGen/MachinePassRegistry.cpp @@ -20,7 +20,7 @@ void MachinePassRegistryListener::anchor() { } /// setDefault - Set the default constructor by name. void MachinePassRegistry::setDefault(StringRef Name) { - MachinePassCtor Ctor = 0; + MachinePassCtor Ctor = nullptr; for(MachinePassRegistryNode *R = getList(); R; R = R->getNext()) { if (R->getName() == Name) { Ctor = R->getCtor(); diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index db3eec3..f560259 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -23,7 +23,7 @@ using namespace llvm; void MachineRegisterInfo::Delegate::anchor() {} MachineRegisterInfo::MachineRegisterInfo(const TargetMachine &TM) - : TM(TM), TheDelegate(0), IsSSA(true), TracksLiveness(true) { + : TM(TM), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true) { VRegInfo.reserve(256); RegAllocHints.reserve(256); UsedRegUnits.resize(getTargetRegisterInfo()->getNumRegUnits()); @@ -60,7 +60,7 @@ MachineRegisterInfo::constrainRegClass(unsigned Reg, if (!NewRC || NewRC == OldRC) return NewRC; if (NewRC->getNumRegs() < MinNumRegs) - return 0; + return nullptr; setRegClass(Reg, NewRC); return NewRC; } @@ -182,7 +182,7 @@ void MachineRegisterInfo::addRegOperandToUseList(MachineOperand *MO) { // Head is NULL for an empty list. if (!Head) { MO->Contents.Reg.Prev = MO; - MO->Contents.Reg.Next = 0; + MO->Contents.Reg.Next = nullptr; HeadRef = MO; return; } @@ -203,7 +203,7 @@ void MachineRegisterInfo::addRegOperandToUseList(MachineOperand *MO) { HeadRef = MO; } else { // Insert use at the end. - MO->Contents.Reg.Next = 0; + MO->Contents.Reg.Next = nullptr; Last->Contents.Reg.Next = MO; } } @@ -227,8 +227,8 @@ void MachineRegisterInfo::removeRegOperandFromUseList(MachineOperand *MO) { (Next ? Next : Head)->Contents.Reg.Prev = Prev; - MO->Contents.Reg.Prev = 0; - MO->Contents.Reg.Next = 0; + MO->Contents.Reg.Prev = nullptr; + MO->Contents.Reg.Next = nullptr; } /// Move NumOps operands from Src to Dst, updating use-def lists as needed. 
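The MachineRegisterInfo use-list code above (addRegOperandToUseList / removeRegOperandFromUseList) relies on an intrusive list in which the head's Prev pointer designates the last element, giving O(1) append without a separate tail pointer. A minimal sketch of that trick with a hypothetical Node type (not MachineOperand's actual layout):

    struct Node {
      Node *Prev = nullptr; // for the head: points at the last node
      Node *Next = nullptr;
    };

    void append(Node *&Head, Node *N) {
      if (!Head) {
        N->Prev = N;        // single-element list: head's Prev is itself
        N->Next = nullptr;
        Head = N;
        return;
      }
      Node *Last = Head->Prev;
      N->Prev = Last;
      N->Next = nullptr;
      Last->Next = N;
      Head->Prev = N;       // head's Prev now names the new last node
    }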
@@ -303,17 +303,17 @@ MachineInstr *MachineRegisterInfo::getVRegDef(unsigned Reg) const {
   def_instr_iterator I = def_instr_begin(Reg);
   assert((I.atEnd() || std::next(I) == def_instr_end()) &&
          "getVRegDef assumes a single definition or no definition");
-  return !I.atEnd() ? &*I : 0;
+  return !I.atEnd() ? &*I : nullptr;
 }
 
 /// getUniqueVRegDef - Return the unique machine instr that defines the
 /// specified virtual register or null if none is found.  If there are
 /// multiple definitions or no definition, return null.
 MachineInstr *MachineRegisterInfo::getUniqueVRegDef(unsigned Reg) const {
-  if (def_empty(Reg)) return 0;
+  if (def_empty(Reg)) return nullptr;
   def_instr_iterator I = def_instr_begin(Reg);
   if (std::next(I) != def_instr_end())
-    return 0;
+    return nullptr;
   return &*I;
 }
 
diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp
index 77496ad..d9173a2 100644
--- a/lib/CodeGen/MachineSSAUpdater.cpp
+++ b/lib/CodeGen/MachineSSAUpdater.cpp
@@ -29,6 +29,8 @@
 #include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "machine-ssaupdater"
+
 typedef DenseMap<MachineBasicBlock*, unsigned> AvailableValsTy;
 static AvailableValsTy &getAvailableVals(void *AV) {
   return *static_cast<AvailableValsTy*>(AV);
@@ -36,7 +38,7 @@ static AvailableValsTy &getAvailableVals(void *AV) {
 
 MachineSSAUpdater::MachineSSAUpdater(MachineFunction &MF,
                                      SmallVectorImpl<MachineInstr*> *NewPHI)
-  : AV(0), InsertedPHIs(NewPHI) {
+  : AV(nullptr), InsertedPHIs(NewPHI) {
   TII = MF.getTarget().getInstrInfo();
   MRI = &MF.getRegInfo();
 }
@@ -48,7 +50,7 @@ MachineSSAUpdater::~MachineSSAUpdater() {
 /// Initialize - Reset this object to get ready for a new set of SSA
 /// updates.  ProtoValue is the value used to name PHI nodes.
 void MachineSSAUpdater::Initialize(unsigned V) {
-  if (AV == 0)
+  if (!AV)
     AV = new AvailableValsTy();
   else
     getAvailableVals(AV).clear();
@@ -313,7 +315,7 @@ public:
   static MachineInstr *InstrIsPHI(MachineInstr *I) {
     if (I && I->isPHI())
       return I;
-    return 0;
+    return nullptr;
   }
 
   /// ValueIsPHI - Check if the instruction that defines the specified register
@@ -328,7 +330,7 @@ public:
     MachineInstr *PHI = ValueIsPHI(Val, Updater);
     if (PHI && PHI->getNumOperands() <= 1)
       return PHI;
-    return 0;
+    return nullptr;
   }
 
   /// GetPHIValue - For the specified PHI instruction, return the register
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index d90cd23..23847d6 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -12,8 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "misched"
-
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/ADT/PriorityQueue.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -35,6 +33,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "misched"
+
 namespace llvm {
 cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
                            cl::desc("Force top-down list scheduling"));
@@ -85,7 +85,7 @@ void ScheduleDAGMutation::anchor() {}
 //===----------------------------------------------------------------------===//
 
 MachineSchedContext::MachineSchedContext():
-    MF(0), MLI(0), MDT(0), PassConfig(0), AA(0), LIS(0) {
+    MF(nullptr), MLI(nullptr), MDT(nullptr), PassConfig(nullptr), AA(nullptr), LIS(nullptr) {
   RegClassInfo = new RegisterClassInfo();
 }
 
@@ -100,7 +100,7 @@ class MachineSchedulerBase : public MachineSchedContext,
 public:
   MachineSchedulerBase(char &ID): MachineFunctionPass(ID) {}
 
-  void print(raw_ostream &O, const Module* = 0) const override;
+  void print(raw_ostream &O, const Module* = 
nullptr) const override; protected: void scheduleRegions(ScheduleDAGInstrs &Scheduler); @@ -192,7 +192,7 @@ MachinePassRegistry MachineSchedRegistry::Registry; /// A dummy default scheduler factory indicates whether the scheduler /// is overridden on the command line. static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) { - return 0; + return nullptr; } /// MachineSchedOpt allows command line selection of the scheduler. @@ -487,9 +487,8 @@ void ReadyQueue::dump() { // virtual registers. // ===----------------------------------------------------------------------===/ +// Provide a vtable anchor. ScheduleDAGMI::~ScheduleDAGMI() { - DeleteContainerPointers(Mutations); - delete SchedImpl; } bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) { @@ -527,7 +526,7 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) { dbgs() << "*** Scheduling failed! ***\n"; SuccSU->dump(this); dbgs() << " has been released too many times!\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } #endif --SuccSU->NumPredsLeft; @@ -561,7 +560,7 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) { dbgs() << "*** Scheduling failed! ***\n"; PredSU->dump(this); dbgs() << " has been released too many times!\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } #endif --PredSU->NumSuccsLeft; @@ -723,8 +722,8 @@ findRootsAndBiasEdges(SmallVectorImpl &TopRoots, /// Identify DAG roots and setup scheduler queues. void ScheduleDAGMI::initQueues(ArrayRef TopRoots, ArrayRef BotRoots) { - NextClusterSucc = NULL; - NextClusterPred = NULL; + NextClusterSucc = nullptr; + NextClusterPred = nullptr; // Release all DAG roots for scheduling, not including EntrySU/ExitSU. // @@ -782,7 +781,7 @@ void ScheduleDAGMI::placeDebugValues() { RegionEnd = DbgValue; } DbgValues.clear(); - FirstDbgValue = NULL; + FirstDbgValue = nullptr; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1549,7 +1548,7 @@ void SchedBoundary::reset() { // invalid, placeholder HazardRecs. if (HazardRec && HazardRec->isEnabled()) { delete HazardRec; - HazardRec = 0; + HazardRec = nullptr; } Available.clear(); Pending.clear(); @@ -1679,7 +1678,7 @@ bool SchedBoundary::checkHazard(SUnit *SU) { // Find the unscheduled node in ReadySUs with the highest latency. 
unsigned SchedBoundary:: findMaxLatency(ArrayRef ReadySUs) { - SUnit *LateSU = 0; + SUnit *LateSU = nullptr; unsigned RemLatency = 0; for (ArrayRef::iterator I = ReadySUs.begin(), E = ReadySUs.end(); I != E; ++I) { @@ -2057,7 +2056,7 @@ SUnit *SchedBoundary::pickOnlyChoice() { } if (Available.size() == 1) return *Available.begin(); - return NULL; + return nullptr; } #ifndef NDEBUG @@ -2157,7 +2156,7 @@ public: SchedResourceDelta ResDelta; SchedCandidate(const CandPolicy &policy) - : Policy(policy), SU(NULL), Reason(NoCand), RepeatReasonSet(0) {} + : Policy(policy), SU(nullptr), Reason(NoCand), RepeatReasonSet(0) {} bool isValid() const { return SU; } @@ -2185,7 +2184,7 @@ protected: SchedRemainder Rem; protected: GenericSchedulerBase(const MachineSchedContext *C): - Context(C), SchedModel(0), TRI(0) {} + Context(C), SchedModel(nullptr), TRI(nullptr) {} void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone, SchedBoundary *OtherZone); @@ -2444,7 +2443,7 @@ class GenericScheduler : public GenericSchedulerBase { MachineSchedPolicy RegionPolicy; public: GenericScheduler(const MachineSchedContext *C): - GenericSchedulerBase(C), DAG(0), Top(SchedBoundary::TopQID, "TopQ"), + GenericSchedulerBase(C), DAG(nullptr), Top(SchedBoundary::TopQID, "TopQ"), Bot(SchedBoundary::BotQID, "BotQ") {} void initPolicy(MachineBasicBlock::iterator Begin, @@ -2910,7 +2909,7 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) { if (DAG->top() == DAG->bottom()) { assert(Top.Available.empty() && Top.Pending.empty() && Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); - return NULL; + return nullptr; } SUnit *SU; do { @@ -3002,17 +3001,17 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { /// Create the standard converging machine scheduler. This will be used as the /// default scheduler if the target does not set a default. static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C) { - ScheduleDAGMILive *DAG = new ScheduleDAGMILive(C, new GenericScheduler(C)); + ScheduleDAGMILive *DAG = new ScheduleDAGMILive(C, make_unique(C)); // Register DAG post-processors. // // FIXME: extend the mutation API to allow earlier mutations to instantiate // data and pass it to later mutations. Have a single mutation that gathers // the interesting nodes in one pass. - DAG->addMutation(new CopyConstrain(DAG->TII, DAG->TRI)); + DAG->addMutation(make_unique(DAG->TII, DAG->TRI)); if (EnableLoadCluster && DAG->TII->enableClusterLoads()) - DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(make_unique(DAG->TII, DAG->TRI)); if (EnableMacroFusion) - DAG->addMutation(new MacroFusion(DAG->TII)); + DAG->addMutation(make_unique(DAG->TII)); return DAG; } @@ -3164,7 +3163,7 @@ void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) { SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) { if (DAG->top() == DAG->bottom()) { assert(Top.Available.empty() && Top.Pending.empty() && "ReadyQ garbage"); - return NULL; + return nullptr; } SUnit *SU; do { @@ -3174,7 +3173,7 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) { SchedCandidate TopCand(NoPolicy); // Set the top-down policy based on the state of the current top zone and // the instructions outside the zone, including the bottom zone. 
- setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, NULL); + setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr); pickNodeFromQueue(TopCand); assert(TopCand.Reason != NoCand && "failed to find a candidate"); tracePick(TopCand, true); @@ -3198,7 +3197,7 @@ void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { /// Create a generic scheduler with no vreg liveness or DAG mutation passes. static ScheduleDAGInstrs *createGenericSchedPostRA(MachineSchedContext *C) { - return new ScheduleDAGMI(C, new PostGenericScheduler(C), /*IsPostRA=*/true); + return new ScheduleDAGMI(C, make_unique(C), /*IsPostRA=*/true); } //===----------------------------------------------------------------------===// @@ -3212,7 +3211,8 @@ struct ILPOrder { const BitVector *ScheduledTrees; bool MaximizeILP; - ILPOrder(bool MaxILP): DFSResult(0), ScheduledTrees(0), MaximizeILP(MaxILP) {} + ILPOrder(bool MaxILP) + : DFSResult(nullptr), ScheduledTrees(nullptr), MaximizeILP(MaxILP) {} /// \brief Apply a less-than relation on node priority. /// @@ -3246,7 +3246,7 @@ class ILPScheduler : public MachineSchedStrategy { std::vector ReadyQ; public: - ILPScheduler(bool MaximizeILP): DAG(0), Cmp(MaximizeILP) {} + ILPScheduler(bool MaximizeILP): DAG(nullptr), Cmp(MaximizeILP) {} void initialize(ScheduleDAGMI *dag) override { assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness"); @@ -3267,7 +3267,7 @@ public: /// Callback to select the highest priority node from the ready Q. SUnit *pickNode(bool &IsTopNode) override { - if (ReadyQ.empty()) return NULL; + if (ReadyQ.empty()) return nullptr; std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp); SUnit *SU = ReadyQ.back(); ReadyQ.pop_back(); @@ -3302,10 +3302,10 @@ public: } // namespace static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, new ILPScheduler(true)); + return new ScheduleDAGMILive(C, make_unique(true)); } static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, new ILPScheduler(false)); + return new ScheduleDAGMILive(C, make_unique(false)); } static MachineSchedRegistry ILPMaxRegistry( "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler); @@ -3347,7 +3347,7 @@ public: InstructionShuffler(bool alternate, bool topdown) : IsAlternating(alternate), IsTopDown(topdown) {} - virtual void initialize(ScheduleDAGMI*) { + void initialize(ScheduleDAGMI*) override { TopQ.clear(); BottomQ.clear(); } @@ -3355,11 +3355,11 @@ public: /// Implement MachineSchedStrategy interface. 
/// ----------------------------------------- - virtual SUnit *pickNode(bool &IsTopNode) { + SUnit *pickNode(bool &IsTopNode) override { SUnit *SU; if (IsTopDown) { do { - if (TopQ.empty()) return NULL; + if (TopQ.empty()) return nullptr; SU = TopQ.top(); TopQ.pop(); } while (SU->isScheduled); @@ -3367,7 +3367,7 @@ public: } else { do { - if (BottomQ.empty()) return NULL; + if (BottomQ.empty()) return nullptr; SU = BottomQ.top(); BottomQ.pop(); } while (SU->isScheduled); @@ -3378,12 +3378,12 @@ public: return SU; } - virtual void schedNode(SUnit *SU, bool IsTopNode) {} + void schedNode(SUnit *SU, bool IsTopNode) override {} - virtual void releaseTopNode(SUnit *SU) { + void releaseTopNode(SUnit *SU) override { TopQ.push(SU); } - virtual void releaseBottomNode(SUnit *SU) { + void releaseBottomNode(SUnit *SU) override { BottomQ.push(SU); } }; @@ -3394,7 +3394,7 @@ static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) { bool TopDown = !ForceBottomUp; assert((TopDown || !ForceTopDown) && "-misched-topdown incompatible with -misched-bottomup"); - return new ScheduleDAGMILive(C, new InstructionShuffler(Alternate, TopDown)); + return new ScheduleDAGMILive(C, make_unique(Alternate, TopDown)); } static MachineSchedRegistry ShufflerRegistry( "shuffle", "Shuffle machine instructions alternating directions", @@ -3450,7 +3450,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { raw_string_ostream SS(Str); const ScheduleDAGMI *DAG = static_cast(G); const SchedDFSResult *DFS = DAG->hasVRegLiveness() ? - static_cast(G)->getDFSResult() : 0; + static_cast(G)->getDFSResult() : nullptr; SS << "SU:" << SU->NodeNum; if (DFS) SS << " I:" << DFS->getNumInstrs(SU); @@ -3464,7 +3464,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { std::string Str("shape=Mrecord"); const ScheduleDAGMI *DAG = static_cast(G); const SchedDFSResult *DFS = DAG->hasVRegLiveness() ? - static_cast(G)->getDFSResult() : 0; + static_cast(G)->getDFSResult() : nullptr; if (DFS) { Str += ",style=filled,fillcolor=\"#"; Str += DOT::getColorString(DFS->getSubtreeID(N)); diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp index dbff1f6..f44e4d1 100644 --- a/lib/CodeGen/MachineSink.cpp +++ b/lib/CodeGen/MachineSink.cpp @@ -16,7 +16,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "machine-sink" #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" @@ -32,6 +31,8 @@ #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "machine-sink" + static cl::opt SplitEdges("machine-sink-split", cl::desc("Split critical edges during machine sinking"), @@ -332,16 +333,16 @@ MachineBasicBlock *MachineSinking::SplitCriticalEdge(MachineInstr *MI, MachineBasicBlock *ToBB, bool BreakPHIEdge) { if (!isWorthBreakingCriticalEdge(MI, FromBB, ToBB)) - return 0; + return nullptr; // Avoid breaking back edge. From == To means backedge for single BB loop. if (!SplitEdges || FromBB == ToBB) - return 0; + return nullptr; // Check for backedges of more "complex" loops. if (LI->getLoopFor(FromBB) == LI->getLoopFor(ToBB) && LI->isLoopHeader(ToBB)) - return 0; + return nullptr; // It's not always legal to break critical edges and sink the computation // to the edge. 
@@ -388,7 +389,7 @@ MachineBasicBlock *MachineSinking::SplitCriticalEdge(MachineInstr *MI, if (*PI == FromBB) continue; if (!DT->dominates(ToBB, *PI)) - return 0; + return nullptr; } } @@ -484,7 +485,7 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI, // SuccToSinkTo - This is the successor to sink this instruction to, once we // decide. - MachineBasicBlock *SuccToSinkTo = 0; + MachineBasicBlock *SuccToSinkTo = nullptr; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg()) continue; // Ignore non-register operands. @@ -498,10 +499,10 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI, // and we can freely move its uses. Alternatively, if it's allocatable, // it could get allocated to something with a def during allocation. if (!MRI->isConstantPhysReg(Reg, *MBB->getParent())) - return NULL; + return nullptr; } else if (!MO.isDead()) { // A def that isn't dead. We can't move it. - return NULL; + return nullptr; } } else { // Virtual register uses are always safe to sink. @@ -509,7 +510,7 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI, // If it's not safe to move defs of the register class, then abort. if (!TII->isSafeToMoveRegClassDefs(MRI->getRegClass(Reg))) - return NULL; + return nullptr; // FIXME: This picks a successor to sink into based on having one // successor that dominates all the uses. However, there are cases where @@ -532,7 +533,7 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI, bool LocalUse = false; if (!AllUsesDominatedByBlock(Reg, SuccToSinkTo, MBB, BreakPHIEdge, LocalUse)) - return NULL; + return nullptr; continue; } @@ -558,26 +559,26 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI, } if (LocalUse) // Def is used locally, it's never safe to move this def. - return NULL; + return nullptr; } // If we couldn't find a block to sink to, ignore this instruction. - if (SuccToSinkTo == 0) - return NULL; - else if (!isProfitableToSinkTo(Reg, MI, MBB, SuccToSinkTo)) - return NULL; + if (!SuccToSinkTo) + return nullptr; + if (!isProfitableToSinkTo(Reg, MI, MBB, SuccToSinkTo)) + return nullptr; } } // It is not possible to sink an instruction into its own block. This can // happen with loops. if (MBB == SuccToSinkTo) - return NULL; + return nullptr; // It's not safe to sink instructions to EH landing pad. Control flow into // landing pad is implicitly defined. if (SuccToSinkTo && SuccToSinkTo->isLandingPad()) - return NULL; + return nullptr; return SuccToSinkTo; } @@ -607,7 +608,7 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) { MachineBasicBlock *SuccToSinkTo = FindSuccToSinkTo(MI, ParentBlock, BreakPHIEdge); // If there are no outputs, it must have side-effects. 
- if (SuccToSinkTo == 0) + if (!SuccToSinkTo) return false; diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp index d07178e..1bbf0ad 100644 --- a/lib/CodeGen/MachineTraceMetrics.cpp +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "machine-trace-metrics" #include "llvm/CodeGen/MachineTraceMetrics.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SparseSet.h" @@ -26,6 +25,8 @@ using namespace llvm; +#define DEBUG_TYPE "machine-trace-metrics" + char MachineTraceMetrics::ID = 0; char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID; @@ -37,8 +38,9 @@ INITIALIZE_PASS_END(MachineTraceMetrics, "machine-trace-metrics", "Machine Trace Metrics", false, true) MachineTraceMetrics::MachineTraceMetrics() - : MachineFunctionPass(ID), MF(0), TII(0), TRI(0), MRI(0), Loops(0) { - std::fill(Ensembles, array_endof(Ensembles), (Ensemble*)0); + : MachineFunctionPass(ID), MF(nullptr), TII(nullptr), TRI(nullptr), + MRI(nullptr), Loops(nullptr) { + std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr); } void MachineTraceMetrics::getAnalysisUsage(AnalysisUsage &AU) const { @@ -64,11 +66,11 @@ bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) { } void MachineTraceMetrics::releaseMemory() { - MF = 0; + MF = nullptr; BlockInfo.clear(); for (unsigned i = 0; i != TS_NumStrategies; ++i) { delete Ensembles[i]; - Ensembles[i] = 0; + Ensembles[i] = nullptr; } } @@ -95,19 +97,17 @@ MachineTraceMetrics::getResources(const MachineBasicBlock *MBB) { unsigned PRKinds = SchedModel.getNumProcResourceKinds(); SmallVector PRCycles(PRKinds); - for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); - I != E; ++I) { - const MachineInstr *MI = I; - if (MI->isTransient()) + for (const auto &MI : *MBB) { + if (MI.isTransient()) continue; ++InstrCount; - if (MI->isCall()) + if (MI.isCall()) FBI->HasCalls = true; // Count processor resources used. if (!SchedModel.hasInstrSchedModel()) continue; - const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(MI); + const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(&MI); if (!SC->isValid()) continue; @@ -233,7 +233,7 @@ const MachineTraceMetrics::TraceBlockInfo* MachineTraceMetrics::Ensemble:: getDepthResources(const MachineBasicBlock *MBB) const { const TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()]; - return TBI->hasValidDepth() ? TBI : 0; + return TBI->hasValidDepth() ? TBI : nullptr; } // Check if height resources for MBB are valid and return the TBI. @@ -242,7 +242,7 @@ const MachineTraceMetrics::TraceBlockInfo* MachineTraceMetrics::Ensemble:: getHeightResources(const MachineBasicBlock *MBB) const { const TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()]; - return TBI->hasValidHeight() ? TBI : 0; + return TBI->hasValidHeight() ? TBI : nullptr; } /// Get an array of processor resource depths for MBB. Indexed by processor @@ -316,13 +316,13 @@ public: const MachineBasicBlock* MinInstrCountEnsemble::pickTracePred(const MachineBasicBlock *MBB) { if (MBB->pred_empty()) - return 0; + return nullptr; const MachineLoop *CurLoop = getLoopFor(MBB); // Don't leave loops, and never follow back-edges. 
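getResources() in the MachineTraceMetrics hunk above walks each block once, skipping transient instructions and recording the instruction count and whether the block contains calls. A simplified stand-alone sketch of that summary loop (Instr and BlockResources are invented stand-ins; the real code additionally tallies per-processor-resource cycles from the scheduling model):

    #include <vector>

    struct Instr { bool Transient; bool Call; };

    struct BlockResources { unsigned InstrCount = 0; bool HasCalls = false; };

    BlockResources summarize(const std::vector<Instr> &Block) {
      BlockResources R;
      for (const Instr &I : Block) {
        if (I.Transient)
          continue;             // e.g. debug values: consume no issue slots
        ++R.InstrCount;
        if (I.Call)
          R.HasCalls = true;
      }
      return R;
    }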
if (CurLoop && MBB == CurLoop->getHeader()) - return 0; + return nullptr; unsigned CurCount = MTM.getResources(MBB)->InstrCount; - const MachineBasicBlock *Best = 0; + const MachineBasicBlock *Best = nullptr; unsigned BestDepth = 0; for (MachineBasicBlock::const_pred_iterator I = MBB->pred_begin(), E = MBB->pred_end(); I != E; ++I) { @@ -344,9 +344,9 @@ MinInstrCountEnsemble::pickTracePred(const MachineBasicBlock *MBB) { const MachineBasicBlock* MinInstrCountEnsemble::pickTraceSucc(const MachineBasicBlock *MBB) { if (MBB->pred_empty()) - return 0; + return nullptr; const MachineLoop *CurLoop = getLoopFor(MBB); - const MachineBasicBlock *Best = 0; + const MachineBasicBlock *Best = nullptr; unsigned BestHeight = 0; for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { @@ -568,9 +568,8 @@ MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) { // invalidated, but their instructions will stay the same, so there is no // need to erase the Cycle entries. They will be overwritten when we // recompute. - for (MachineBasicBlock::const_iterator I = BadMBB->begin(), E = BadMBB->end(); - I != E; ++I) - Cycles.erase(I); + for (const auto &I : *BadMBB) + Cycles.erase(&I); } void MachineTraceMetrics::Ensemble::verify() const { @@ -690,7 +689,7 @@ struct LiveRegUnit { unsigned getSparseSetIndex() const { return RegUnit; } - LiveRegUnit(unsigned RU) : RegUnit(RU), Cycle(0), MI(0), Op(0) {} + LiveRegUnit(unsigned RU) : RegUnit(RU), Cycle(0), MI(nullptr), Op(0) {} }; } @@ -828,16 +827,13 @@ computeInstrDepths(const MachineBasicBlock *MBB) { if (TBI.HasValidInstrHeights) TBI.CriticalPath = computeCrossBlockCriticalPath(TBI); - for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); - I != E; ++I) { - const MachineInstr *UseMI = I; - + for (const auto &UseMI : *MBB) { // Collect all data dependencies. Deps.clear(); - if (UseMI->isPHI()) - getPHIDeps(UseMI, Deps, TBI.Pred, MTM.MRI); - else if (getDataDeps(UseMI, Deps, MTM.MRI)) - updatePhysDepsDownwards(UseMI, Deps, RegUnits, MTM.TRI); + if (UseMI.isPHI()) + getPHIDeps(&UseMI, Deps, TBI.Pred, MTM.MRI); + else if (getDataDeps(&UseMI, Deps, MTM.MRI)) + updatePhysDepsDownwards(&UseMI, Deps, RegUnits, MTM.TRI); // Filter and process dependencies, computing the earliest issue cycle. unsigned Cycle = 0; @@ -853,20 +849,20 @@ computeInstrDepths(const MachineBasicBlock *MBB) { // Add latency if DefMI is a real instruction. Transients get latency 0. if (!Dep.DefMI->isTransient()) DepCycle += MTM.SchedModel - .computeOperandLatency(Dep.DefMI, Dep.DefOp, UseMI, Dep.UseOp); + .computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI, Dep.UseOp); Cycle = std::max(Cycle, DepCycle); } // Remember the instruction depth. - InstrCycles &MICycles = Cycles[UseMI]; + InstrCycles &MICycles = Cycles[&UseMI]; MICycles.Depth = Cycle; if (!TBI.HasValidInstrHeights) { - DEBUG(dbgs() << Cycle << '\t' << *UseMI); + DEBUG(dbgs() << Cycle << '\t' << UseMI); continue; } // Update critical path length. 
TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Height); - DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << *UseMI); + DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << UseMI); } } } @@ -1055,16 +1051,16 @@ computeInstrHeights(const MachineBasicBlock *MBB) { Succ = Loop->getHeader(); if (Succ) { - for (MachineBasicBlock::const_iterator I = Succ->begin(), E = Succ->end(); - I != E && I->isPHI(); ++I) { - const MachineInstr *PHI = I; + for (const auto &PHI : *Succ) { + if (!PHI.isPHI()) + break; Deps.clear(); - getPHIDeps(PHI, Deps, MBB, MTM.MRI); + getPHIDeps(&PHI, Deps, MBB, MTM.MRI); if (!Deps.empty()) { // Loop header PHI heights are all 0. - unsigned Height = TBI.Succ ? Cycles.lookup(PHI).Height : 0; - DEBUG(dbgs() << "pred\t" << Height << '\t' << *PHI); - if (pushDepHeight(Deps.front(), PHI, Height, + unsigned Height = TBI.Succ ? Cycles.lookup(&PHI).Height : 0; + DEBUG(dbgs() << "pred\t" << Height << '\t' << PHI); + if (pushDepHeight(Deps.front(), &PHI, Height, Heights, MTM.SchedModel, MTM.TII)) addLiveIns(Deps.front().DefMI, Deps.front().DefOp, Stack); } diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 1bd75f7..8515b0f 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -33,7 +33,6 @@ #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/BasicBlock.h" @@ -42,6 +41,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" @@ -241,7 +241,7 @@ namespace { static char ID; // Pass ID, replacement for typeid const char *const Banner; - MachineVerifierPass(const char *b = 0) + MachineVerifierPass(const char *b = nullptr) : MachineFunctionPass(ID), Banner(b) { initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry()); } @@ -273,7 +273,7 @@ void MachineFunction::verify(Pass *p, const char *Banner) const { } bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { - raw_ostream *OutFile = 0; + raw_ostream *OutFile = nullptr; if (OutFileName) { std::string ErrorInfo; OutFile = new raw_fd_ostream(OutFileName, ErrorInfo, @@ -296,10 +296,10 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { TRI = TM->getRegisterInfo(); MRI = &MF.getRegInfo(); - LiveVars = NULL; - LiveInts = NULL; - LiveStks = NULL; - Indexes = NULL; + LiveVars = nullptr; + LiveInts = nullptr; + LiveStks = nullptr; + Indexes = nullptr; if (PASS) { LiveInts = PASS->getAnalysisIfAvailable(); // We don't want to verify LiveVariables if LiveIntervals is available. @@ -314,7 +314,7 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { MFI!=MFE; ++MFI) { visitMachineBasicBlockBefore(MFI); // Keep track of the current bundle header. - const MachineInstr *CurBundle = 0; + const MachineInstr *CurBundle = nullptr; // Do we expect the next instruction to be part of the same bundle? bool InBundle = false; @@ -469,18 +469,17 @@ void MachineVerifier::visitMachineFunctionBefore() { // Build a set of the basic blocks in the function. 
   FunctionBlocks.clear();
-  for (MachineFunction::const_iterator
-       I = MF->begin(), E = MF->end(); I != E; ++I) {
-    FunctionBlocks.insert(I);
-    BBInfo &MInfo = MBBInfoMap[I];
-
-    MInfo.Preds.insert(I->pred_begin(), I->pred_end());
-    if (MInfo.Preds.size() != I->pred_size())
-      report("MBB has duplicate entries in its predecessor list.", I);
-
-    MInfo.Succs.insert(I->succ_begin(), I->succ_end());
-    if (MInfo.Succs.size() != I->succ_size())
-      report("MBB has duplicate entries in its successor list.", I);
+  for (const auto &MBB : *MF) {
+    FunctionBlocks.insert(&MBB);
+    BBInfo &MInfo = MBBInfoMap[&MBB];
+
+    MInfo.Preds.insert(MBB.pred_begin(), MBB.pred_end());
+    if (MInfo.Preds.size() != MBB.pred_size())
+      report("MBB has duplicate entries in its predecessor list.", &MBB);
+
+    MInfo.Succs.insert(MBB.succ_begin(), MBB.succ_end());
+    if (MInfo.Succs.size() != MBB.succ_size())
+      report("MBB has duplicate entries in its successor list.", &MBB);
   }

   // Check that the register use lists are sane.
@@ -501,7 +500,7 @@ static bool matchPair(MachineBasicBlock::const_succ_iterator i,

 void
 MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
-  FirstTerminator = 0;
+  FirstTerminator = nullptr;

   if (MRI->isSSA()) {
     // If this block has allocatable physical registers live-in, check that
@@ -553,7 +552,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
     report("MBB has more than one landing pad successor", MBB);

   // Call AnalyzeBranch. If it succeeds, there are several more conditions to check.
-  MachineBasicBlock *TBB = 0, *FBB = 0;
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
   SmallVector<MachineOperand, 4> Cond;
   if (!TII->AnalyzeBranch(*const_cast<MachineBasicBlock *>(MBB),
                           TBB, FBB, Cond)) {
@@ -578,8 +577,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
       report("MBB exits via unconditional fall-through but its successor "
              "differs from its CFG successor!", MBB);
     }
-    if (!MBB->empty() && getBundleStart(&MBB->back())->isBarrier() &&
-        !TII->isPredicated(getBundleStart(&MBB->back()))) {
+    if (!MBB->empty() && MBB->back().isBarrier() &&
+        !TII->isPredicated(&MBB->back())) {
       report("MBB exits via unconditional fall-through but ends with a "
              "barrier instruction!", MBB);
     }
@@ -599,10 +598,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
       if (MBB->empty()) {
         report("MBB exits via unconditional branch but doesn't contain "
                "any instructions!", MBB);
-      } else if (!getBundleStart(&MBB->back())->isBarrier()) {
+      } else if (!MBB->back().isBarrier()) {
         report("MBB exits via unconditional branch but doesn't end with a "
                "barrier instruction!", MBB);
-      } else if (!getBundleStart(&MBB->back())->isTerminator()) {
+      } else if (!MBB->back().isTerminator()) {
         report("MBB exits via unconditional branch but the branch isn't a "
                "terminator instruction!", MBB);
       }
@@ -630,10 +629,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
       if (MBB->empty()) {
         report("MBB exits via conditional branch/fall-through but doesn't "
                "contain any instructions!", MBB);
-      } else if (getBundleStart(&MBB->back())->isBarrier()) {
+      } else if (MBB->back().isBarrier()) {
         report("MBB exits via conditional branch/fall-through but ends with a "
                "barrier instruction!", MBB);
-      } else if (!getBundleStart(&MBB->back())->isTerminator()) {
+      } else if (!MBB->back().isTerminator()) {
         report("MBB exits via conditional branch/fall-through but the branch "
                "isn't a terminator instruction!", MBB);
       }
@@ -658,10 +657,10 @@
MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { if (MBB->empty()) { report("MBB exits via conditional branch/branch but doesn't " "contain any instructions!", MBB); - } else if (!getBundleStart(&MBB->back())->isBarrier()) { + } else if (!MBB->back().isBarrier()) { report("MBB exits via conditional branch/branch but doesn't end with a " "barrier instruction!", MBB); - } else if (!getBundleStart(&MBB->back())->isTerminator()) { + } else if (!MBB->back().isTerminator()) { report("MBB exits via conditional branch/branch but the branch " "isn't a terminator instruction!", MBB); } @@ -1158,9 +1157,7 @@ void MachineVerifier::calcRegsPassed() { // First push live-out regs to successors' vregsPassed. Remember the MBBs that // have any vregsPassed. SmallPtrSet todo; - for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); - MFI != MFE; ++MFI) { - const MachineBasicBlock &MBB(*MFI); + for (const auto &MBB : *MF) { BBInfo &MInfo = MBBInfoMap[&MBB]; if (!MInfo.reachable) continue; @@ -1195,9 +1192,7 @@ void MachineVerifier::calcRegsPassed() { void MachineVerifier::calcRegsRequired() { // First push live-in regs to predecessors' vregsRequired. SmallPtrSet todo; - for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); - MFI != MFE; ++MFI) { - const MachineBasicBlock &MBB(*MFI); + for (const auto &MBB : *MF) { BBInfo &MInfo = MBBInfoMap[&MBB]; for (MachineBasicBlock::const_pred_iterator PrI = MBB.pred_begin(), PrE = MBB.pred_end(); PrI != PrE; ++PrI) { @@ -1228,27 +1223,28 @@ void MachineVerifier::calcRegsRequired() { // calcRegsPassed has been run so BBInfo::isLiveOut is valid. void MachineVerifier::checkPHIOps(const MachineBasicBlock *MBB) { SmallPtrSet seen; - for (MachineBasicBlock::const_iterator BBI = MBB->begin(), BBE = MBB->end(); - BBI != BBE && BBI->isPHI(); ++BBI) { + for (const auto &BBI : *MBB) { + if (!BBI.isPHI()) + break; seen.clear(); - for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) { - unsigned Reg = BBI->getOperand(i).getReg(); - const MachineBasicBlock *Pre = BBI->getOperand(i + 1).getMBB(); + for (unsigned i = 1, e = BBI.getNumOperands(); i != e; i += 2) { + unsigned Reg = BBI.getOperand(i).getReg(); + const MachineBasicBlock *Pre = BBI.getOperand(i + 1).getMBB(); if (!Pre->isSuccessor(MBB)) continue; seen.insert(Pre); BBInfo &PrInfo = MBBInfoMap[Pre]; if (PrInfo.reachable && !PrInfo.isLiveOut(Reg)) report("PHI operand is not live-out from predecessor", - &BBI->getOperand(i), i); + &BBI.getOperand(i), i); } // Did we see all predecessors? for (MachineBasicBlock::const_pred_iterator PrI = MBB->pred_begin(), PrE = MBB->pred_end(); PrI != PrE; ++PrI) { if (!seen.count(*PrI)) { - report("Missing PHI operand", BBI); + report("Missing PHI operand", &BBI); *OS << "BB#" << (*PrI)->getNumber() << " is a predecessor according to the CFG.\n"; } @@ -1259,29 +1255,27 @@ void MachineVerifier::checkPHIOps(const MachineBasicBlock *MBB) { void MachineVerifier::visitMachineFunctionAfter() { calcRegsPassed(); - for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); - MFI != MFE; ++MFI) { - BBInfo &MInfo = MBBInfoMap[MFI]; + for (const auto &MBB : *MF) { + BBInfo &MInfo = MBBInfoMap[&MBB]; // Skip unreachable MBBs. if (!MInfo.reachable) continue; - checkPHIOps(MFI); + checkPHIOps(&MBB); } // Now check liveness info if available calcRegsRequired(); // Check for killed virtual registers that should be live out. 
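// ---- [Editor's note: illustrative sketch, not part of the patch] ----
// The loop that follows flags any virtual register that is both killed
// inside a block and demanded live-out by a successor; in a consistent
// function the two sets are disjoint. A hedged standalone model (RegSetModel
// is assumed to behave like the verifier's RegSet of vreg numbers):
#include <set>

typedef std::set<unsigned> RegSetModel;

bool killedButNeededLiveOut(const RegSetModel &RegsKilled,
                            const RegSetModel &VRegsRequired) {
  for (unsigned Reg : VRegsRequired)
    if (RegsKilled.count(Reg))
      return true; // would trigger the report() below
  return false;
}
// ---- [end editor's note] ----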
- for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); - MFI != MFE; ++MFI) { - BBInfo &MInfo = MBBInfoMap[MFI]; + for (const auto &MBB : *MF) { + BBInfo &MInfo = MBBInfoMap[&MBB]; for (RegSet::iterator I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E; ++I) if (MInfo.regsKilled.count(*I)) { - report("Virtual register killed in block, but needed live out.", MFI); + report("Virtual register killed in block, but needed live out.", &MBB); *OS << "Virtual register " << PrintReg(*I) << " is used after the block.\n"; } @@ -1307,20 +1301,19 @@ void MachineVerifier::verifyLiveVariables() { for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { unsigned Reg = TargetRegisterInfo::index2VirtReg(i); LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg); - for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); - MFI != MFE; ++MFI) { - BBInfo &MInfo = MBBInfoMap[MFI]; + for (const auto &MBB : *MF) { + BBInfo &MInfo = MBBInfoMap[&MBB]; // Our vregsRequired should be identical to LiveVariables' AliveBlocks if (MInfo.vregsRequired.count(Reg)) { - if (!VI.AliveBlocks.test(MFI->getNumber())) { - report("LiveVariables: Block missing from AliveBlocks", MFI); + if (!VI.AliveBlocks.test(MBB.getNumber())) { + report("LiveVariables: Block missing from AliveBlocks", &MBB); *OS << "Virtual register " << PrintReg(Reg) << " must be live through the block.\n"; } } else { - if (VI.AliveBlocks.test(MFI->getNumber())) { - report("LiveVariables: Block should not be in AliveBlocks", MFI); + if (VI.AliveBlocks.test(MBB.getNumber())) { + report("LiveVariables: Block should not be in AliveBlocks", &MBB); *OS << "Virtual register " << PrintReg(Reg) << " is not needed live through the block.\n"; } @@ -1675,32 +1668,31 @@ void MachineVerifier::verifyStackFrame() { } // Update stack state by checking contents of MBB. - for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); - I != E; ++I) { - if (I->getOpcode() == FrameSetupOpcode) { + for (const auto &I : *MBB) { + if (I.getOpcode() == FrameSetupOpcode) { // The first operand of a FrameOpcode should be i32. - int Size = I->getOperand(0).getImm(); + int Size = I.getOperand(0).getImm(); assert(Size >= 0 && "Value should be non-negative in FrameSetup and FrameDestroy.\n"); if (BBState.ExitIsSetup) - report("FrameSetup is after another FrameSetup", I); + report("FrameSetup is after another FrameSetup", &I); BBState.ExitValue -= Size; BBState.ExitIsSetup = true; } - if (I->getOpcode() == FrameDestroyOpcode) { + if (I.getOpcode() == FrameDestroyOpcode) { // The first operand of a FrameOpcode should be i32. - int Size = I->getOperand(0).getImm(); + int Size = I.getOperand(0).getImm(); assert(Size >= 0 && "Value should be non-negative in FrameSetup and FrameDestroy.\n"); if (!BBState.ExitIsSetup) - report("FrameDestroy is not after a FrameSetup", I); + report("FrameDestroy is not after a FrameSetup", &I); int AbsSPAdj = BBState.ExitValue < 0 ? 
-BBState.ExitValue : BBState.ExitValue; if (BBState.ExitIsSetup && AbsSPAdj != Size) { - report("FrameDestroy is after FrameSetup ", I); + report("FrameDestroy is after FrameSetup ", &I); *OS << "FrameDestroy <" << Size << "> is after FrameSetup <" << AbsSPAdj << ">.\n"; } diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp index 56cb673..95a2934 100644 --- a/lib/CodeGen/OptimizePHIs.cpp +++ b/lib/CodeGen/OptimizePHIs.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "phi-opt" #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -23,6 +22,8 @@ #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; +#define DEBUG_TYPE "phi-opt" + STATISTIC(NumPHICycles, "Number of PHI cycles replaced"); STATISTIC(NumDeadPHICycles, "Number of dead PHI cycles"); diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index 0e9df58..c8d0819 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "phielim" #include "llvm/CodeGen/Passes.h" #include "PHIEliminationUtils.h" #include "llvm/ADT/STLExtras.h" @@ -35,6 +34,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "phielim" + static cl::opt DisableEdgeSplitting("disable-phi-elim-edge-splitting", cl::init(false), cl::Hidden, cl::desc("Disable critical edge splitting " @@ -377,7 +378,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, findPHICopyInsertPoint(&opBlock, &MBB, SrcReg); // Insert the copy. - MachineInstr *NewSrcInstr = 0; + MachineInstr *NewSrcInstr = nullptr; if (!reusedIncoming && IncomingReg) { if (SrcUndef) { // The source register is undefined, so there is no need for a real @@ -531,13 +532,14 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, /// used later to determine when the vreg is killed in the BB. /// void PHIElimination::analyzePHINodes(const MachineFunction& MF) { - for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); - I != E; ++I) - for (MachineBasicBlock::const_iterator BBI = I->begin(), BBE = I->end(); - BBI != BBE && BBI->isPHI(); ++BBI) - for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) - ++VRegPHIUseCount[BBVRegPair(BBI->getOperand(i+1).getMBB()->getNumber(), - BBI->getOperand(i).getReg())]; + for (const auto &MBB : MF) + for (const auto &BBI : MBB) { + if (!BBI.isPHI()) + break; + for (unsigned i = 1, e = BBI.getNumOperands(); i != e; i += 2) + ++VRegPHIUseCount[BBVRegPair(BBI.getOperand(i+1).getMBB()->getNumber(), + BBI.getOperand(i).getReg())]; + } } bool PHIElimination::SplitPHIEdges(MachineFunction &MF, @@ -546,7 +548,7 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF, if (MBB.empty() || !MBB.front().isPHI() || MBB.isLandingPad()) return false; // Quick exit for basic blocks without PHIs. - const MachineLoop *CurLoop = MLI ? MLI->getLoopFor(&MBB) : 0; + const MachineLoop *CurLoop = MLI ? MLI->getLoopFor(&MBB) : nullptr; bool IsLoopHeader = CurLoop && &MBB == CurLoop->getHeader(); bool Changed = false; @@ -563,7 +565,7 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF, // out-of-line blocks into the loop which is very bad for code placement. if (PreMBB == &MBB && !SplitAllCriticalEdges) continue; - const MachineLoop *PreLoop = MLI ? MLI->getLoopFor(PreMBB) : 0; + const MachineLoop *PreLoop = MLI ? 
MLI->getLoopFor(PreMBB) : nullptr; if (IsLoopHeader && PreLoop == CurLoop && !SplitAllCriticalEdges) continue; diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp index 080b20d..b3f7198 100644 --- a/lib/CodeGen/Passes.cpp +++ b/lib/CodeGen/Passes.cpp @@ -84,7 +84,7 @@ static cl::opt PrintGCInfo("print-gc", cl::Hidden, cl::desc("Dump garbage collector data")); static cl::opt VerifyMachineCode("verify-machineinstrs", cl::Hidden, cl::desc("Verify generated machine code"), - cl::init(getenv("LLVM_VERIFY_MACHINEINSTRS")!=NULL)); + cl::init(getenv("LLVM_VERIFY_MACHINEINSTRS")!=nullptr)); static cl::opt PrintMachineInstrs("print-machineinstrs", cl::ValueOptional, cl::desc("Print machine instrs"), @@ -126,7 +126,7 @@ static IdentifyingPassPtr applyOverride(IdentifyingPassPtr TargetID, case cl::BOU_TRUE: if (TargetID.isValid()) return TargetID; - if (StandardID == 0) + if (StandardID == nullptr) report_fatal_error("Target cannot enable pass"); return StandardID; case cl::BOU_FALSE: @@ -232,8 +232,8 @@ TargetPassConfig::~TargetPassConfig() { // Out of line constructor provides default values for pass options and // registers all common codegen passes. TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) - : ImmutablePass(ID), PM(&pm), StartAfter(0), StopAfter(0), - Started(true), Stopped(false), TM(tm), Impl(0), Initialized(false), + : ImmutablePass(ID), PM(&pm), StartAfter(nullptr), StopAfter(nullptr), + Started(true), Stopped(false), TM(tm), Impl(nullptr), Initialized(false), DisableVerify(false), EnableTailMerge(true) { @@ -274,7 +274,7 @@ TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) { } TargetPassConfig::TargetPassConfig() - : ImmutablePass(ID), PM(0) { + : ImmutablePass(ID), PM(nullptr) { llvm_unreachable("TargetPassConfig should not be constructed on-the-fly"); } @@ -332,7 +332,7 @@ AnalysisID TargetPassConfig::addPass(AnalysisID PassID) { IdentifyingPassPtr TargetID = getPassSubstitution(PassID); IdentifyingPassPtr FinalPtr = overridePass(PassID, TargetID); if (!FinalPtr.isValid()) - return 0; + return nullptr; Pass *P; if (FinalPtr.isInstance()) @@ -384,8 +384,10 @@ void TargetPassConfig::addIRPasses() { // Before running any passes, run the verifier to determine if the input // coming from the front-end and/or optimizer is valid. - if (!DisableVerify) + if (!DisableVerify) { addPass(createVerifierPass()); + addPass(createDebugInfoVerifierPass()); + } // Run loop strength reduction before anything else. if (getOptLevel() != CodeGenOpt::None && !DisableLSR) { @@ -443,6 +445,12 @@ void TargetPassConfig::addCodeGenPrepare() { void TargetPassConfig::addISelPrepare() { addPreISel(); + // Need to verify DebugInfo *before* creating the stack protector analysis. + // It's a function pass, and verifying between it and its users causes a + // crash. + if (!DisableVerify) + addPass(createDebugInfoVerifierPass()); + addPass(createStackProtectorPass(TM)); if (PrintISelInput) @@ -620,7 +628,7 @@ MachinePassRegistry RegisterRegAlloc::Registry; /// A dummy default pass factory indicates whether the register allocator is /// overridden on the command line. 
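// ---- [Editor's note: illustrative sketch, not part of the patch] ----
// useDefaultRegisterAllocator, changed just below, is a sentinel: its address
// is registered as the "default" factory, and a selected factory comparing
// equal to it means -regalloc= was not overridden, so the allocator is chosen
// from the -O level instead. The pattern in miniature (names hypothetical):
typedef void *(*PassFactoryModel)();

static void *defaultPassFactory() { return nullptr; } // sentinel factory

static bool isRegAllocOverridden(PassFactoryModel Selected) {
  return Selected != &defaultPassFactory;
}
// ---- [end editor's note] ----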
-static FunctionPass *useDefaultRegisterAllocator() { return 0; } +static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } static RegisterRegAlloc defaultRegAlloc("default", "pick register allocator based on -O option", diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp index e18d9635..eeee93a 100644 --- a/lib/CodeGen/PeepholeOptimizer.cpp +++ b/lib/CodeGen/PeepholeOptimizer.cpp @@ -66,7 +66,6 @@ // C = copy A <-- same-bank copy //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "peephole-opt" #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" @@ -81,6 +80,8 @@ #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "peephole-opt" + // Optimize Extensions static cl::opt Aggressive("aggressive-ext-opt", cl::Hidden, @@ -183,7 +184,7 @@ optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, // If UseSrcSubIdx is Set, SubIdx also applies to SrcReg, and only uses of // SrcReg:SubIdx should be replaced. bool UseSrcSubIdx = TM->getRegisterInfo()-> - getSubClassWithSubReg(MRI->getRegClass(SrcReg), SubIdx) != 0; + getSubClassWithSubReg(MRI->getRegClass(SrcReg), SubIdx) != nullptr; // The source has other uses. See if we can replace the other uses with use of // the result of the extension. @@ -358,7 +359,7 @@ static bool shareSameRegisterFile(const TargetRegisterInfo &TRI, unsigned SrcIdx, DefIdx; if (SrcSubReg && DefSubReg) return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg, - SrcIdx, DefIdx) != NULL; + SrcIdx, DefIdx) != nullptr; // At most one of the register is a sub register, make it Src to avoid // duplicating the test. if (!SrcSubReg) { @@ -368,9 +369,9 @@ static bool shareSameRegisterFile(const TargetRegisterInfo &TRI, // One of the register is a sub register, check if we can get a superclass. if (SrcSubReg) - return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != NULL; + return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr; // Plain copy. - return TRI.getCommonSubClass(DefRC, SrcRC) != NULL; + return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr; } /// \brief Get the index of the definition and source for \p Copy @@ -568,7 +569,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { TM = &MF.getTarget(); TII = TM->getInstrInfo(); MRI = &MF.getRegInfo(); - DT = Aggressive ? &getAnalysis() : 0; + DT = Aggressive ? &getAnalysis() : nullptr; bool Changed = false; @@ -643,7 +644,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { // Save FoldAsLoadDefReg because optimizeLoadInstr() resets it and // we need it for markUsesInDebugValueAsUndef(). 
unsigned FoldedReg = FoldAsLoadDefReg; - MachineInstr *DefMI = 0; + MachineInstr *DefMI = nullptr; MachineInstr *FoldMI = TII->optimizeLoadInstr(MI, MRI, FoldAsLoadDefReg, DefMI); diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index a13e51f..db3933e 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -18,7 +18,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "post-RA-sched" #include "llvm/CodeGen/Passes.h" #include "AggressiveAntiDepBreaker.h" #include "AntiDepBreaker.h" @@ -47,6 +46,8 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; +#define DEBUG_TYPE "post-RA-sched" + STATISTIC(NumNoops, "Number of noops inserted"); STATISTIC(NumStalls, "Number of pipeline stalls"); STATISTIC(NumFixedAnti, "Number of fixed anti-dependencies"); @@ -205,7 +206,7 @@ SchedulePostRATDList::SchedulePostRATDList( ((AntiDepMode == TargetSubtargetInfo::ANTIDEP_ALL) ? (AntiDepBreaker *)new AggressiveAntiDepBreaker(MF, RCI, CriticalPathRCs) : ((AntiDepMode == TargetSubtargetInfo::ANTIDEP_CRITICAL) ? - (AntiDepBreaker *)new CriticalAntiDepBreaker(MF, RCI) : NULL)); + (AntiDepBreaker *)new CriticalAntiDepBreaker(MF, RCI) : nullptr)); } SchedulePostRATDList::~SchedulePostRATDList() { @@ -355,7 +356,7 @@ void SchedulePostRATDList::startBlock(MachineBasicBlock *BB) { // Reset the hazard recognizer and anti-dep breaker. HazardRec->Reset(); - if (AntiDepBreak != NULL) + if (AntiDepBreak) AntiDepBreak->StartBlock(BB); } @@ -365,7 +366,7 @@ void SchedulePostRATDList::schedule() { // Build the scheduling graph. buildSchedGraph(AA); - if (AntiDepBreak != NULL) { + if (AntiDepBreak) { unsigned Broken = AntiDepBreak->BreakAntiDependencies(SUnits, RegionBegin, RegionEnd, EndIndex, DbgValues); @@ -397,14 +398,14 @@ void SchedulePostRATDList::schedule() { /// instruction, which will not be scheduled. /// void SchedulePostRATDList::Observe(MachineInstr *MI, unsigned Count) { - if (AntiDepBreak != NULL) + if (AntiDepBreak) AntiDepBreak->Observe(MI, Count, EndIndex); } /// FinishBlock - Clean up register live-range state. /// void SchedulePostRATDList::finishBlock() { - if (AntiDepBreak != NULL) + if (AntiDepBreak) AntiDepBreak->FinishBlock(); // Call the superclass. @@ -429,7 +430,7 @@ void SchedulePostRATDList::ReleaseSucc(SUnit *SU, SDep *SuccEdge) { dbgs() << "*** Scheduling failed! 
***\n"; SuccSU->dump(this); dbgs() << " has been released too many times!\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } #endif --SuccSU->NumPredsLeft; @@ -480,7 +481,7 @@ void SchedulePostRATDList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) { void SchedulePostRATDList::emitNoop(unsigned CurCycle) { DEBUG(dbgs() << "*** Emitting noop in cycle " << CurCycle << '\n'); HazardRec->EmitNoop(); - Sequence.push_back(0); // NULL here means noop + Sequence.push_back(nullptr); // NULL here means noop ++NumNoops; } @@ -532,7 +533,7 @@ void SchedulePostRATDList::ListScheduleTopDown() { DEBUG(dbgs() << "\n*** Examining Available\n"; AvailableQueue.dump(this)); - SUnit *FoundSUnit = 0, *NotPreferredSUnit = 0; + SUnit *FoundSUnit = nullptr, *NotPreferredSUnit = nullptr; bool HasNoopHazards = false; while (!AvailableQueue.empty()) { SUnit *CurSUnit = AvailableQueue.pop(); @@ -572,7 +573,7 @@ void SchedulePostRATDList::ListScheduleTopDown() { AvailableQueue.push(NotPreferredSUnit); } - NotPreferredSUnit = 0; + NotPreferredSUnit = nullptr; } // Add the nodes that aren't ready back onto the available list. @@ -662,5 +663,5 @@ void SchedulePostRATDList::EmitSchedule() { BB->splice(++OrigPrivMI, BB, DbgValue); } DbgValues.clear(); - FirstDbgValue = NULL; + FirstDbgValue = nullptr; } diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp index 360e8d7..3129927 100644 --- a/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/lib/CodeGen/ProcessImplicitDefs.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "processimplicitdefs" - #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -21,6 +19,8 @@ using namespace llvm; +#define DEBUG_TYPE "processimplicitdefs" + namespace { /// Process IMPLICIT_DEF instructions and make sure there is one implicit_def /// for each use. Add isUndef marker to implicit_def defs and their uses. diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index 136b1ed..c74a42f 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -16,7 +16,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "pei" #include "PrologEpilogInserter.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/STLExtras.h" @@ -46,6 +45,8 @@ using namespace llvm; +#define DEBUG_TYPE "pei" + char PEI::ID = 0; char &llvm::PrologEpilogCodeInserterID = PEI::ID; @@ -114,7 +115,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) { assert(!Fn.getRegInfo().getNumVirtRegs() && "Regalloc must assign all vregs"); - RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : NULL; + RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : nullptr; FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn); // Calculate the MaxCallFrameSize and AdjustsStack variables for the @@ -243,14 +244,14 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &F) { MachineFrameInfo *MFI = F.getFrameInfo(); // Get the callee saved register list... - const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&F); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F); // These are used to keep track the callee-save area. Initialize them. MinCSFrameIndex = INT_MAX; MaxCSFrameIndex = 0; // Early exit for targets which have no callee saved registers. 
- if (CSRegs == 0 || CSRegs[0] == 0) + if (!CSRegs || CSRegs[0] == 0) return; // In Naked functions we aren't going to save any registers. @@ -680,7 +681,7 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) { // we've been asked for it. This, when linked with a runtime with support // for segmented stacks (libgcc is one), will result in allocating stack // space in small chunks instead of one large contiguous block. - if (Fn.getTarget().Options.EnableSegmentedStacks) + if (Fn.shouldSplitStack()) TFI.adjustForSegmentedStacks(Fn); // Emit additional code that is required to explicitly handle the stack in @@ -805,7 +806,7 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, // use that target machine register info object to eliminate // it. TRI.eliminateFrameIndex(MI, SPAdj, i, - FrameIndexVirtualScavenging ? NULL : RS); + FrameIndexVirtualScavenging ? nullptr : RS); // Reset the iterator if we were at the beginning of the BB. if (AtBeginning) { @@ -813,7 +814,7 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, DoIncr = false; } - MI = 0; + MI = nullptr; break; } @@ -845,13 +846,14 @@ void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // We might end up here again with a NULL iterator if we scavenged a // register for which we inserted spill code for definition by what was // originally the first instruction in BB. - if (I == MachineBasicBlock::iterator(NULL)) + if (I == MachineBasicBlock::iterator(nullptr)) I = BB->begin(); MachineInstr *MI = I; MachineBasicBlock::iterator J = std::next(I); MachineBasicBlock::iterator P = - I == BB->begin() ? MachineBasicBlock::iterator(NULL) : std::prev(I); + I == BB->begin() ? MachineBasicBlock::iterator(nullptr) + : std::prev(I); // RS should process this instruction before we might scavenge at this // location. This is because we might be replacing a virtual register diff --git a/lib/CodeGen/PseudoSourceValue.cpp b/lib/CodeGen/PseudoSourceValue.cpp index 8564911..12b2c90 100644 --- a/lib/CodeGen/PseudoSourceValue.cpp +++ b/lib/CodeGen/PseudoSourceValue.cpp @@ -58,13 +58,9 @@ static const char *const PSVNames[] = { "ConstantPool" }; -// FIXME: THIS IS A HACK!!!! -// Eventually these should be uniqued on LLVMContext rather than in a managed -// static. For now, we can safely use the global context for the time being to -// squeak by. -PseudoSourceValue::PseudoSourceValue(enum ValueTy Subclass) : - Value(Type::getInt8PtrTy(getGlobalContext()), - Subclass) {} +PseudoSourceValue::PseudoSourceValue(bool isFixed) : isFixed(isFixed) {} + +PseudoSourceValue::~PseudoSourceValue() {} void PseudoSourceValue::printCustom(raw_ostream &O) const { O << PSVNames[this - PSVGlobals->PSVs]; diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp index 33584f8..894aee7 100644 --- a/lib/CodeGen/RegAllocBase.cpp +++ b/lib/CodeGen/RegAllocBase.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" #include "RegAllocBase.h" #include "Spiller.h" #include "llvm/ADT/Statistic.h" @@ -35,6 +34,8 @@ using namespace llvm; +#define DEBUG_TYPE "regalloc" + STATISTIC(NumNewQueued , "Number of new live ranges queued"); // Temporary verification option until we can put verification inside @@ -110,7 +111,7 @@ void RegAllocBase::allocatePhysRegs() { if (AvailablePhysReg == ~0u) { // selectOrSplit failed to find a register! // Probably caused by an inline asm. 
- MachineInstr *MI = 0; + MachineInstr *MI = nullptr; for (MachineRegisterInfo::reg_instr_iterator I = MRI->reg_instr_begin(VirtReg->reg), E = MRI->reg_instr_end(); I != E; ) { diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h index 68bd4b5..b333c36 100644 --- a/lib/CodeGen/RegAllocBase.h +++ b/lib/CodeGen/RegAllocBase.h @@ -65,7 +65,8 @@ protected: LiveRegMatrix *Matrix; RegisterClassInfo RegClassInfo; - RegAllocBase(): TRI(0), MRI(0), VRM(0), LIS(0), Matrix(0) {} + RegAllocBase() + : TRI(nullptr), MRI(nullptr), VRM(nullptr), LIS(nullptr), Matrix(nullptr) {} virtual ~RegAllocBase() {} diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp index b8c04fc..b722098 100644 --- a/lib/CodeGen/RegAllocBasic.cpp +++ b/lib/CodeGen/RegAllocBasic.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" #include "llvm/CodeGen/Passes.h" #include "AllocationOrder.h" #include "LiveDebugVariables.h" @@ -41,6 +40,8 @@ using namespace llvm; +#define DEBUG_TYPE "regalloc" + static RegisterRegAlloc basicRegAlloc("basic", "basic register allocator", createBasicRegisterAllocator); @@ -93,7 +94,7 @@ public: LiveInterval *dequeue() override { if (Queue.empty()) - return 0; + return nullptr; LiveInterval *LI = Queue.top(); Queue.pop(); return LI; @@ -156,7 +157,7 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { } void RABasic::releaseMemory() { - SpillerInstance.reset(0); + SpillerInstance.reset(nullptr); } diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index 8dc44f5..97b9f76 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/IndexedMap.h" @@ -38,6 +37,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "regalloc" + STATISTIC(NumStores, "Number of stores added"); STATISTIC(NumLoads , "Number of loads added"); STATISTIC(NumCopies, "Number of copies coalesced"); @@ -75,7 +76,7 @@ namespace { bool Dirty; // Register needs spill. explicit LiveReg(unsigned v) - : LastUse(0), VirtReg(v), PhysReg(0), LastOpNum(0), Dirty(false) {} + : LastUse(nullptr), VirtReg(v), PhysReg(0), LastOpNum(0), Dirty(false){} unsigned getSparseSetIndex() const { return TargetRegisterInfo::virtReg2Index(VirtReg); @@ -319,7 +320,7 @@ void RAFast::spillVirtReg(MachineBasicBlock::iterator MI, // now. 
       LRIDbgValues.clear();
       if (SpillKill)
-        LR.LastUse = 0; // Don't kill register again
+        LR.LastUse = nullptr; // Don't kill register again
     }
     killVirtReg(LRI);
 }
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 6a623b8..aa7c178 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "regalloc"
 #include "llvm/CodeGen/Passes.h"
 #include "AllocationOrder.h"
 #include "InterferenceCache.h"
@@ -37,7 +36,9 @@
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/PassAnalysisSupport.h"
+#include "llvm/Support/BranchProbability.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -47,6 +48,8 @@

 using namespace llvm;

+#define DEBUG_TYPE "regalloc"
+
 STATISTIC(NumGlobalSplits, "Number of split global live ranges");
 STATISTIC(NumLocalSplits,  "Number of split local live ranges");
 STATISTIC(NumEvicted,      "Number of interferences evicted");
@@ -71,6 +74,11 @@ static cl::opt<unsigned> LastChanceRecoloringMaxInterference(
              " interference at a time"),
     cl::init(8));

+static cl::opt<bool>
+ExhaustiveSearch("exhaustive-register-search", cl::NotHidden,
+                 cl::desc("Exhaustive search for registers bypassing the depth "
+                          "and interference cutoffs of last chance recoloring"));
+
 // FIXME: Find a good default for this flag and remove the flag.
 static cl::opt<unsigned>
 CSRFirstTimeCost("regalloc-csr-first-time-cost",
@@ -147,6 +155,22 @@ class RAGreedy : public MachineFunctionPass,
     RS_Done
   };

+  // Enum CutOffStage to keep track of whether the register allocation failed
+  // because of the cutoffs encountered in last chance recoloring.
+  // Note: This is used as a bitmask. New values should be the next power of 2.
+  enum CutOffStage {
+    // No cutoffs encountered
+    CO_None = 0,
+
+    // lcr-max-depth cutoff encountered
+    CO_Depth = 1,
+
+    // lcr-max-interf cutoff encountered
+    CO_Interf = 2
+  };
+
+  uint8_t CutOffInfo;
+
 #ifndef NDEBUG
   static const char *const StageName[];
 #endif
@@ -258,6 +282,9 @@ class RAGreedy : public MachineFunctionPass,
   /// NoCand which indicates the stack interval.
   SmallVector<unsigned, 32> BundleCand;

+  /// Callee-save register cost, calculated once per machine function.
+  BlockFrequency CSRCost;
+
 public:
   RAGreedy();

@@ -326,6 +353,7 @@ private:
   unsigned tryAssignCSRFirstTime(LiveInterval &VirtReg, AllocationOrder &Order,
                                  unsigned PhysReg, unsigned &CostPerUseLimit,
                                  SmallVectorImpl<unsigned> &NewVRegs);
+  void initializeCSRCost();
   unsigned tryBlockSplit(LiveInterval&, AllocationOrder&,
                          SmallVectorImpl<unsigned>&);
   unsigned tryInstructionSplit(LiveInterval&, AllocationOrder&,
@@ -447,7 +475,7 @@ void RAGreedy::LRE_DidCloneVirtReg(unsigned New, unsigned Old) {
 }

 void RAGreedy::releaseMemory() {
-  SpillerInstance.reset(0);
+  SpillerInstance.reset(nullptr);
   ExtraRegInfo.clear();
   GlobalCand.clear();
 }
@@ -514,7 +542,7 @@ LiveInterval *RAGreedy::dequeue() { return dequeue(Queue); }

 LiveInterval *RAGreedy::dequeue(PQueue &CurQueue) {
   if (CurQueue.empty())
-    return 0;
+    return nullptr;
   LiveInterval *LI = &LIS->getInterval(~CurQueue.top().second);
   CurQueue.pop();
   return LI;
@@ -1910,8 +1938,9 @@ RAGreedy::mayRecolorAllInterferences(unsigned PhysReg, LiveInterval &VirtReg,
     // If there is LastChanceRecoloringMaxInterference or more interferences,
     // chances are one would not be recolorable.
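// ---- [Editor's note: illustrative sketch, not part of the patch] ----
// CutOffInfo accumulates, as a bitmask of CutOffStage values, which of the
// two last-chance-recoloring cutoffs fired, so that selectOrSplit can emit a
// precise diagnostic once allocation has actually failed. In miniature:
#include <cstdint>

enum CutOffModel : uint8_t { None = 0, Depth = 1, Interf = 2 }; // powers of 2

const char *describeCutOff(uint8_t Info) {
  switch (Info & (Depth | Interf)) {
  case Depth:          return "max recoloring depth reached";
  case Interf:         return "max interference count reached";
  case Depth | Interf: return "both cutoffs reached";
  default:             return "no cutoff";
  }
}
// ---- [end editor's note] ----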
     if (Q.collectInterferingVRegs(LastChanceRecoloringMaxInterference) >=
-        LastChanceRecoloringMaxInterference) {
+        LastChanceRecoloringMaxInterference && !ExhaustiveSearch) {
       DEBUG(dbgs() << "Early abort: too many interferences.\n");
+      CutOffInfo |= CO_Interf;
       return false;
     }
     for (unsigned i = Q.interferingVRegs().size(); i; --i) {
@@ -1982,8 +2011,9 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
   // We may want to reconsider that if we end up with too large a search space
   // for targets with hundreds of registers.
   // Indeed, in that case we may want to cut the search space earlier.
-  if (Depth >= LastChanceRecoloringMaxDepth) {
+  if (Depth >= LastChanceRecoloringMaxDepth && !ExhaustiveSearch) {
     DEBUG(dbgs() << "Abort because max depth has been reached.\n");
+    CutOffInfo |= CO_Depth;
     return ~0u;
   }

@@ -2108,8 +2138,26 @@ bool RAGreedy::tryRecoloringCandidates(PQueue &RecoloringQueue,

 unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
                                  SmallVectorImpl<unsigned> &NewVRegs) {
+  CutOffInfo = CO_None;
+  LLVMContext &Ctx = MF->getFunction()->getContext();
   SmallVirtRegSet FixedRegisters;
-  return selectOrSplitImpl(VirtReg, NewVRegs, FixedRegisters);
+  unsigned Reg = selectOrSplitImpl(VirtReg, NewVRegs, FixedRegisters);
+  if (Reg == ~0U && (CutOffInfo != CO_None)) {
+    uint8_t CutOffEncountered = CutOffInfo & (CO_Depth | CO_Interf);
+    if (CutOffEncountered == CO_Depth)
+      Ctx.emitError("register allocation failed: maximum depth for recoloring "
+                    "reached. Use -fexhaustive-register-search to skip "
+                    "cutoffs");
+    else if (CutOffEncountered == CO_Interf)
+      Ctx.emitError("register allocation failed: maximum interference for "
+                    "recoloring reached. Use -fexhaustive-register-search "
+                    "to skip cutoffs");
+    else if (CutOffEncountered == (CO_Depth | CO_Interf))
+      Ctx.emitError("register allocation failed: maximum interference and "
+                    "depth for recoloring reached. Use "
+                    "-fexhaustive-register-search to skip cutoffs");
+  }
+  return Reg;
 }

 /// Using a CSR for the first time has a cost because it causes push|pop
@@ -2123,10 +2171,6 @@ unsigned RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg,
                                          unsigned PhysReg,
                                          unsigned &CostPerUseLimit,
                                          SmallVectorImpl<unsigned> &NewVRegs) {
-  // We use the larger one out of the command-line option and the value
-  // reported by TRI.
-  BlockFrequency CSRCost(std::max((unsigned)CSRFirstTimeCost,
-                                  TRI->getCSRFirstUseCost()));
   if (getStage(VirtReg) == RS_Spill && VirtReg.isSpillable()) {
     // We choose spill over using the CSR for the first time if the spill cost
     // is lower than CSRCost.
@@ -2144,9 +2188,9 @@ unsigned RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg,
   // the cost of splitting is lower than CSRCost.
   SA->analyze(&VirtReg);
   unsigned NumCands = 0;
-  unsigned BestCand =
-      calculateRegionSplitCost(VirtReg, Order, CSRCost, NumCands,
-                               true/*IgnoreCSR*/);
+  BlockFrequency BestCost = CSRCost; // Don't modify CSRCost.
+  unsigned BestCand = calculateRegionSplitCost(VirtReg, Order, BestCost,
+                                               NumCands, true /*IgnoreCSR*/);
   if (BestCand == NoCand)
     // Use the CSR if we can't find a region split below CSRCost.
     return PhysReg;
@@ -2158,6 +2202,31 @@ unsigned RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg,
   return PhysReg;
 }

+void RAGreedy::initializeCSRCost() {
+  // We use the larger one out of the command-line option and the value
+  // reported by TRI.
+  CSRCost = BlockFrequency(
+      std::max((unsigned)CSRFirstTimeCost, TRI->getCSRFirstUseCost()));
+  if (!CSRCost.getFrequency())
+    return;
+
+  // Raw cost is relative to Entry == 2^14; scale it appropriately.
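// ---- [Editor's note: illustrative sketch, not part of the patch] ----
// Both BranchProbability branches below compute the same ratio,
// CSRCost * ActualEntry / 2^14; they differ only in which operand order keeps
// the intermediate values inside BranchProbability's 32-bit range. Ignoring
// that overflow care, the arithmetic is simply:
#include <cstdint>

uint64_t scaleCSRCost(uint64_t RawCost, uint64_t ActualEntryFreq) {
  const uint64_t FixedEntryFreq = 1 << 14; // reference entry frequency
  if (!ActualEntryFreq)
    return 0; // unknown entry frequency: treat the CSR cost as zero
  return RawCost * ActualEntryFreq / FixedEntryFreq;
}
// ---- [end editor's note] ----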
+ uint64_t ActualEntry = MBFI->getEntryFreq(); + if (!ActualEntry) { + CSRCost = 0; + return; + } + uint64_t FixedEntry = 1 << 14; + if (ActualEntry < FixedEntry) + CSRCost *= BranchProbability(ActualEntry, FixedEntry); + else if (ActualEntry <= UINT32_MAX) + // Invert the fraction and divide. + CSRCost /= BranchProbability(FixedEntry, ActualEntry); + else + // Can't use BranchProbability in general, since it takes 32-bit numbers. + CSRCost = CSRCost.getFrequency() * (ActualEntry / FixedEntry); +} + unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, SmallVectorImpl &NewVRegs, SmallVirtRegSet &FixedRegisters, @@ -2175,8 +2244,7 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, // When NewVRegs is not empty, we may have made decisions such as evicting // a virtual register, go with the earlier decisions and use the physical // register. - if ((CSRFirstTimeCost || TRI->getCSRFirstUseCost()) && - CSRFirstUse && NewVRegs.empty()) { + if (CSRCost.getFrequency() && CSRFirstUse && NewVRegs.empty()) { unsigned CSRReg = tryAssignCSRFirstTime(VirtReg, Order, PhysReg, CostPerUseLimit, NewVRegs); if (CSRReg || !NewVRegs.empty()) @@ -2258,6 +2326,8 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { SpillPlacer = &getAnalysis(); DebugVars = &getAnalysis(); + initializeCSRCost(); + calculateSpillWeightsAndHints(*LIS, mf, *Loops, *MBFI); DEBUG(LIS->dump()); diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index 96dbd9a..b8d2325 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -29,8 +29,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" - #include "llvm/CodeGen/RegAllocPBQP.h" #include "RegisterCoalescer.h" #include "Spiller.h" @@ -48,6 +46,7 @@ #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" @@ -59,6 +58,8 @@ using namespace llvm; +#define DEBUG_TYPE "regalloc" + static RegisterRegAlloc registerPBQPRepAlloc("pbqp", "PBQP register allocator", createDefaultPBQPRegisterAllocator); @@ -87,7 +88,7 @@ public: static char ID; /// Construct a PBQP register allocator. - RegAllocPBQP(std::unique_ptr &b, char *cPassID=0) + RegAllocPBQP(std::unique_ptr &b, char *cPassID=nullptr) : MachineFunctionPass(ID), builder(b.release()), customPassID(cPassID) { initializeSlotIndexesPass(*PassRegistry::getPassRegistry()); initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); @@ -215,7 +216,7 @@ PBQPRAProblem *PBQPBuilder::build(MachineFunction *mf, const LiveIntervals *lis, // Compute an initial allowed set for the current vreg. typedef std::vector VRAllowed; VRAllowed vrAllowed; - ArrayRef rawOrder = trc->getRawAllocationOrder(*mf); + ArrayRef rawOrder = trc->getRawAllocationOrder(*mf); for (unsigned i = 0; i != rawOrder.size(); ++i) { unsigned preg = rawOrder[i]; if (mri->isReserved(preg)) @@ -320,17 +321,9 @@ PBQPRAProblem *PBQPBuilderWithCoalescing::build(MachineFunction *mf, // Scan the machine function and add a coalescing cost whenever CoalescerPair // gives the Ok. 
- for (MachineFunction::const_iterator mbbItr = mf->begin(), - mbbEnd = mf->end(); - mbbItr != mbbEnd; ++mbbItr) { - const MachineBasicBlock *mbb = &*mbbItr; - - for (MachineBasicBlock::const_iterator miItr = mbb->begin(), - miEnd = mbb->end(); - miItr != miEnd; ++miItr) { - const MachineInstr *mi = &*miItr; - - if (!cp.setRegisters(mi)) { + for (const auto &mbb : *mf) { + for (const auto &mi : mbb) { + if (!cp.setRegisters(&mi)) { continue; // Not coalescable. } @@ -345,7 +338,7 @@ PBQPRAProblem *PBQPBuilderWithCoalescing::build(MachineFunction *mf, // value plucked randomly out of the air. PBQP::PBQPNum cBenefit = - copyFactor * LiveIntervals::getSpillWeight(false, true, mbfi, mi); + copyFactor * LiveIntervals::getSpillWeight(false, true, mbfi, &mi); if (cp.isPhys()) { if (!mf->getRegInfo().isAllocatable(dst)) { diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp index aa84446..8b5445c 100644 --- a/lib/CodeGen/RegisterClassInfo.cpp +++ b/lib/CodeGen/RegisterClassInfo.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -25,12 +24,14 @@ using namespace llvm; +#define DEBUG_TYPE "regalloc" + static cl::opt StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"), cl::desc("Limit all regclasses to N registers")); -RegisterClassInfo::RegisterClassInfo() : Tag(0), MF(0), TRI(0), CalleeSaved(0) -{} +RegisterClassInfo::RegisterClassInfo() + : Tag(0), MF(nullptr), TRI(nullptr), CalleeSaved(nullptr) {} void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { bool Update = false; @@ -151,7 +152,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { /// nonoverlapping reserved registers. However, computing the allocation order /// for all register classes would be too expensive. unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const { - const TargetRegisterClass *RC = 0; + const TargetRegisterClass *RC = nullptr; unsigned NumRCUnits = 0; for (TargetRegisterInfo::regclass_iterator RI = TRI->regclass_begin(), RE = TRI->regclass_end(); RI != RE; ++RI) { diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 682c26c..5aaeb87 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" #include "RegisterCoalescer.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" @@ -42,6 +41,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "regalloc" + STATISTIC(numJoins , "Number of interval joins performed"); STATISTIC(numCrossRCs , "Number of cross class joins performed"); STATISTIC(numCommutes , "Number of instruction commuting performed"); @@ -195,7 +196,7 @@ namespace { bool runOnMachineFunction(MachineFunction&) override; /// print - Implement the dump method. 
-    void print(raw_ostream &O, const Module* = 0) const override;
+    void print(raw_ostream &O, const Module* = nullptr) const override;
   };
 } /// end anonymous namespace

@@ -240,9 +241,8 @@ static bool isSplitEdge(const MachineBasicBlock *MBB) {
   if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
     return false;

-  for (MachineBasicBlock::const_iterator MII = MBB->begin(), E = MBB->end();
-       MII != E; ++MII) {
-    if (!MII->isCopyLike() && !MII->isUnconditionalBranch())
+  for (const auto &MI : *MBB) {
+    if (!MI.isCopyLike() && !MI.isUnconditionalBranch())
       return false;
   }
   return true;
@@ -251,7 +251,7 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
   SrcReg = DstReg = 0;
   SrcIdx = DstIdx = 0;
-  NewRC = 0;
+  NewRC = nullptr;
   Flipped = CrossClass = false;

   unsigned Src, Dst, SrcSub, DstSub;
@@ -397,7 +397,8 @@ void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const {

 void RegisterCoalescer::eliminateDeadDefs() {
   SmallVector<unsigned, 8> NewRegs;
-  LiveRangeEdit(0, NewRegs, *MF, *LIS, 0, this).eliminateDeadDefs(DeadDefs);
+  LiveRangeEdit(nullptr, NewRegs, *MF, *LIS,
+                nullptr, this).eliminateDeadDefs(DeadDefs);
 }

 // Callback from eliminateDeadDefs().
@@ -844,6 +845,27 @@ bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
                                                 true /*IsDef*/,
                                                 true /*IsImp*/,
                                                 false /*IsKill*/));
+    // Record small dead def live-ranges for all the subregisters
+    // of the destination register.
+    // Otherwise, variables that live through may miss some
+    // interferences, thus creating an invalid allocation.
+    // E.g., i386 code:
+    // vreg1 = somedef ; vreg1 GR8
+    // vreg2 = remat ; vreg2 GR32
+    // CL = COPY vreg2.sub_8bit
+    // = somedef vreg1 ; vreg1 GR8
+    // =>
+    // vreg1 = somedef ; vreg1 GR8
+    // ECX = remat ; CL
+    // = somedef vreg1 ; vreg1 GR8
+    // vreg1 will see the interferences with CL but not with CH since
+    // no live-ranges would have been created for ECX.
+    // Fix that!
+    SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
+    for (MCRegUnitIterator Units(NewMI->getOperand(0).getReg(), TRI);
+         Units.isValid(); ++Units)
+      if (LiveRange *LR = LIS->getCachedRegUnit(*Units))
+        LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
   }

   if (NewMI->getOperand(0).getSubReg())
@@ -902,7 +924,7 @@ bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI,
   // No intervals are live-in to CopyMI - it is undef.
   if (CP.isFlipped())
     DstInt = SrcInt;
-  SrcInt = 0;
+  SrcInt = nullptr;

   VNInfo *DeadVNI = DstInt->getVNInfoAt(Idx.getRegSlot());
   assert(DeadVNI && "No value defined in DstInt");
@@ -931,7 +953,7 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
                                           unsigned DstReg,
                                           unsigned SubIdx) {
   bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
-  LiveInterval *DstInt = DstIsPhys ? 0 : &LIS->getInterval(DstReg);
+  LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg);

   SmallPtrSet<MachineInstr*, 8> Visited;
   for (MachineRegisterInfo::reg_instr_iterator
@@ -1355,7 +1377,7 @@ class JoinVals {
     bool PrunedComputed;

     Val() : Resolution(CR_Keep), WriteLanes(0), ValidLanes(0),
-            RedefVNI(0), OtherVNI(0), ErasableImplicitDef(false),
+            RedefVNI(nullptr), OtherVNI(nullptr), ErasableImplicitDef(false),
             Pruned(false), PrunedComputed(false) {}

     bool isAnalyzed() const { return WriteLanes != 0; }
@@ -1461,7 +1483,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
   }

   // Get the instruction defining this value, compute the lanes written.
- const MachineInstr *DefMI = 0; + const MachineInstr *DefMI = nullptr; if (VNI->isPHIDef()) { // Conservatively assume that all lanes in a PHI are valid. V.ValidLanes = V.WriteLanes = TRI->getSubRegIndexLaneMask(SubIdx); @@ -2085,14 +2107,14 @@ copyCoalesceWorkList(MutableArrayRef CurrList) { // Skip instruction pointers that have already been erased, for example by // dead code elimination. if (ErasedInstrs.erase(CurrList[i])) { - CurrList[i] = 0; + CurrList[i] = nullptr; continue; } bool Again = false; bool Success = joinCopy(CurrList[i], Again); Progress |= Success; if (Success || !Again) - CurrList[i] = 0; + CurrList[i] = nullptr; } return Progress; } @@ -2132,7 +2154,7 @@ RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) { CurrList(WorkList.begin() + PrevSize, WorkList.end()); if (copyCoalesceWorkList(CurrList)) WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(), - (MachineInstr*)0), WorkList.end()); + (MachineInstr*)nullptr), WorkList.end()); } void RegisterCoalescer::coalesceLocals() { diff --git a/lib/CodeGen/RegisterCoalescer.h b/lib/CodeGen/RegisterCoalescer.h index 47c3df1..e57ceab 100644 --- a/lib/CodeGen/RegisterCoalescer.h +++ b/lib/CodeGen/RegisterCoalescer.h @@ -61,14 +61,14 @@ namespace llvm { public: CoalescerPair(const TargetRegisterInfo &tri) : TRI(tri), DstReg(0), SrcReg(0), DstIdx(0), SrcIdx(0), - Partial(false), CrossClass(false), Flipped(false), NewRC(0) {} + Partial(false), CrossClass(false), Flipped(false), NewRC(nullptr) {} /// Create a CoalescerPair representing a virtreg-to-physreg copy. /// No need to call setRegisters(). CoalescerPair(unsigned VirtReg, unsigned PhysReg, const TargetRegisterInfo &tri) : TRI(tri), DstReg(PhysReg), SrcReg(VirtReg), DstIdx(0), SrcIdx(0), - Partial(false), CrossClass(false), Flipped(false), NewRC(0) {} + Partial(false), CrossClass(false), Flipped(false), NewRC(nullptr) {} /// setRegisters - set registers to match the copy instruction MI. Return /// false if MI is not a coalescable copy instruction. diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp index 97817da..b2909e0 100644 --- a/lib/CodeGen/RegisterPressure.cpp +++ b/lib/CodeGen/RegisterPressure.cpp @@ -154,8 +154,8 @@ const LiveRange *RegPressureTracker::getLiveRange(unsigned Reg) const { } void RegPressureTracker::reset() { - MBB = 0; - LIS = 0; + MBB = nullptr; + LIS = nullptr; CurrSetPressure.clear(); LiveThruPressure.clear(); diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index bfd26dc..72b6285 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "reg-scavenging" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -29,6 +28,8 @@ #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "reg-scavenging" + /// setUsed - Set the register and its sub-registers as being used. void RegScavenger::setUsed(unsigned Reg) { for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); @@ -47,7 +48,7 @@ void RegScavenger::initRegState() { for (SmallVectorImpl::iterator I = Scavenged.begin(), IE = Scavenged.end(); I != IE; ++I) { I->Reg = 0; - I->Restore = NULL; + I->Restore = nullptr; } // All registers started out unused. 
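// ---- [Editor's note: illustrative sketch, not part of the patch] ----
// In the coalescer hunks above, copyCoalesceWorkList marks handled or erased
// copies by nulling their slots, and copyCoalesceInMBB then compacts the
// work list with the erase-remove idiom. The idiom in miniature:
#include <algorithm>
#include <vector>

void compactWorkList(std::vector<int *> &WorkList) {
  // Nulling first keeps indices stable during the scan; one erase-remove
  // pass afterwards drops every nullptr sentinel at once.
  WorkList.erase(std::remove(WorkList.begin(), WorkList.end(),
                             static_cast<int *>(nullptr)),
                 WorkList.end());
}
// ---- [end editor's note] ----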
@@ -91,8 +92,8 @@ void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) { // Create callee-saved registers bitvector. CalleeSavedRegs.resize(NumPhysRegs); - const uint16_t *CSRegs = TRI->getCalleeSavedRegs(&MF); - if (CSRegs != NULL) + const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); + if (CSRegs != nullptr) for (unsigned i = 0; CSRegs[i]; ++i) CalleeSavedRegs.set(CSRegs[i]); } @@ -162,7 +163,7 @@ void RegScavenger::unprocess() { } if (MBBI == MBB->begin()) { - MBBI = MachineBasicBlock::iterator(NULL); + MBBI = MachineBasicBlock::iterator(nullptr); Tracking = false; } else --MBBI; @@ -187,7 +188,7 @@ void RegScavenger::forward() { continue; I->Reg = 0; - I->Restore = NULL; + I->Restore = nullptr; } if (MI->isDebugValue()) @@ -223,7 +224,7 @@ void RegScavenger::forward() { break; } if (!SubUsed) { - MBB->getParent()->verify(NULL, "In Register Scavenger"); + MBB->getParent()->verify(nullptr, "In Register Scavenger"); llvm_unreachable("Using an undefined register!"); } (void)SubUsed; diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp index d08eb65..6a2a080 100644 --- a/lib/CodeGen/ScheduleDAG.cpp +++ b/lib/CodeGen/ScheduleDAG.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "pre-RA-sched" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -25,6 +24,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "pre-RA-sched" + #ifndef NDEBUG static cl::opt StressSchedOpt( "stress-sched", cl::Hidden, cl::init(false), @@ -55,7 +56,7 @@ void ScheduleDAG::clearDAG() { /// getInstrDesc helper to handle SDNodes. const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const { - if (!Node || !Node->isMachineOpcode()) return NULL; + if (!Node || !Node->isMachineOpcode()) return nullptr; return &TII->get(Node->getMachineOpcode()); } diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index c8328ad..92a9a30 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "misched" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -41,18 +40,14 @@ using namespace llvm; +#define DEBUG_TYPE "misched" + static cl::opt EnableAASchedMI("enable-aa-sched-mi", cl::Hidden, cl::ZeroOrMore, cl::init(false), cl::desc("Enable use of AA during MI GAD construction")); -// FIXME: Enable the use of TBAA. There are two known issues preventing this: -// 1. Stack coloring does not update TBAA when merging allocas -// 2. CGP inserts ptrtoint/inttoptr pairs when sinking address computations. -// Because BasicAA does not handle inttoptr, we'll often miss basic type -// punning idioms that we need to catch so we don't miscompile real-world -// code. 
static cl::opt UseTBAA("use-tbaa-in-sched-mi", cl::Hidden, - cl::init(false), cl::desc("Enable use of TBAA during MI GAD construction")); + cl::init(true), cl::desc("Enable use of TBAA during MI GAD construction")); ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, const MachineLoopInfo &mli, @@ -62,7 +57,7 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, LiveIntervals *lis) : ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()), LIS(lis), IsPostRA(IsPostRAFlag), RemoveKillFlags(RemoveKillFlags), - CanHandleTerminators(false), FirstDbgValue(0) { + CanHandleTerminators(false), FirstDbgValue(nullptr) { assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals"); DbgValues.clear(); assert(!(IsPostRA && MRI.getNumVirtRegs()) && @@ -104,7 +99,7 @@ static const Value *getUnderlyingObjectFromInt(const Value *V) { /// and adds support for basic ptrtoint+arithmetic+inttoptr sequences. static void getUnderlyingObjects(const Value *V, SmallVectorImpl &Objects) { - SmallPtrSet Visited; + SmallPtrSet Visited; SmallVector Working(1, V); do { V = Working.pop_back_val(); @@ -130,7 +125,8 @@ static void getUnderlyingObjects(const Value *V, } while (!Working.empty()); } -typedef SmallVector, 4> +typedef PointerUnion ValueType; +typedef SmallVector, 4> UnderlyingObjectsVector; /// getUnderlyingObjectsForInstr - If this machine instr has memory reference @@ -140,25 +136,27 @@ static void getUnderlyingObjectsForInstr(const MachineInstr *MI, const MachineFrameInfo *MFI, UnderlyingObjectsVector &Objects) { if (!MI->hasOneMemOperand() || - !(*MI->memoperands_begin())->getValue() || + (!(*MI->memoperands_begin())->getValue() && + !(*MI->memoperands_begin())->getPseudoValue()) || (*MI->memoperands_begin())->isVolatile()) return; - const Value *V = (*MI->memoperands_begin())->getValue(); - if (!V) - return; - - if (const PseudoSourceValue *PSV = dyn_cast(V)) { + if (const PseudoSourceValue *PSV = + (*MI->memoperands_begin())->getPseudoValue()) { // For now, ignore PseudoSourceValues which may alias LLVM IR values // because the code that uses this function has no way to cope with // such aliases. if (!PSV->isAliased(MFI)) { bool MayAlias = PSV->mayAlias(MFI); - Objects.push_back(UnderlyingObjectsVector::value_type(V, MayAlias)); + Objects.push_back(UnderlyingObjectsVector::value_type(PSV, MayAlias)); } return; } + const Value *V = (*MI->memoperands_begin())->getValue(); + if (!V) + return; + SmallVector Objs; getUnderlyingObjects(V, Objs); @@ -166,8 +164,6 @@ static void getUnderlyingObjectsForInstr(const MachineInstr *MI, I != IE; ++I) { V = *I; - assert(!isa(V) && "Underlying value is a stack slot!"); - if (!isIdentifiedObject(V)) { Objects.clear(); return; @@ -183,7 +179,7 @@ void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) { void ScheduleDAGInstrs::finishBlock() { // Subclasses should no longer refer to the old block. - BB = 0; + BB = nullptr; } /// Initialize the DAG and common scheduler state for the current scheduling @@ -215,7 +211,7 @@ void ScheduleDAGInstrs::exitRegion() { /// are too high to be hidden by the branch or when the liveout registers /// used by instructions in the fallthrough block. void ScheduleDAGInstrs::addSchedBarrierDeps() { - MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : 0; + MachineInstr *ExitMI = RegionEnd != BB->end() ? 
&*RegionEnd : nullptr; ExitSU.setInstr(ExitMI); bool AllDepKnown = ExitMI && (ExitMI->isCall() || ExitMI->isBarrier()); @@ -272,7 +268,7 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) { // Adjust the dependence latency using operand def/use information, // then allow the target to perform its own adjustments. int UseOp = I->OpIdx; - MachineInstr *RegUse = 0; + MachineInstr *RegUse = nullptr; SDep Dep; if (UseOp < 0) Dep = SDep(SU, SDep::Artificial); @@ -483,6 +479,15 @@ static inline bool isUnsafeMemoryObject(MachineInstr *MI, if ((*MI->memoperands_begin())->isVolatile() || MI->hasUnmodeledSideEffects()) return true; + + if ((*MI->memoperands_begin())->getPseudoValue()) { + // Similarly to getUnderlyingObjectForInstr: + // For now, ignore PseudoSourceValues which may alias LLVM IR values + // because the code that uses this function has no way to cope with + // such aliases. + return true; + } + const Value *V = (*MI->memoperands_begin())->getValue(); if (!V) return true; @@ -491,19 +496,8 @@ static inline bool isUnsafeMemoryObject(MachineInstr *MI, getUnderlyingObjects(V, Objs); for (SmallVectorImpl::iterator I = Objs.begin(), IE = Objs.end(); I != IE; ++I) { - V = *I; - - if (const PseudoSourceValue *PSV = dyn_cast(V)) { - // Similarly to getUnderlyingObjectForInstr: - // For now, ignore PseudoSourceValues which may alias LLVM IR values - // because the code that uses this function has no way to cope with - // such aliases. - if (PSV->isAliased(MFI)) - return true; - } - // Does this pointer refer to a distinct and identifiable object? - if (!isIdentifiedObject(V)) + if (!isIdentifiedObject(*I)) return true; } @@ -541,6 +535,9 @@ static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI, MachineMemOperand *MMOa = *MIa->memoperands_begin(); MachineMemOperand *MMOb = *MIb->memoperands_begin(); + if (!MMOa->getValue() || !MMOb->getValue()) + return true; + // The following interface to AA is fashioned after DAGCombiner::isAlias // and operates with MachineMemOperand offset with some important // assumptions: @@ -566,9 +563,9 @@ static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI, AliasAnalysis::AliasResult AAResult = AA->alias( AliasAnalysis::Location(MMOa->getValue(), Overlapa, - UseTBAA ? MMOa->getTBAAInfo() : 0), + UseTBAA ? MMOa->getTBAAInfo() : nullptr), AliasAnalysis::Location(MMOb->getValue(), Overlapb, - UseTBAA ? MMOb->getTBAAInfo() : 0)); + UseTBAA ? MMOb->getTBAAInfo() : nullptr)); return (AAResult != AliasAnalysis::NoAlias); } @@ -703,10 +700,14 @@ void ScheduleDAGInstrs::initSUnits() { // Assign the Latency field of SU using target-provided information. SU->Latency = SchedModel.computeInstrLatency(SU->getInstr()); - // If this SUnit uses an unbuffered resource, mark it as such. - // These resources are used for in-order execution pipelines within an - // out-of-order core and are identified by BufferSize=1. BufferSize=0 is - // used for dispatch/issue groups and is not considered here. + // If this SUnit uses a reserved or unbuffered resource, mark it as such. + // + // Reserved resources block an instruction from issuing and stall the + // entire pipeline. These are identified by BufferSize=0. + // + // Unbuffered resources prevent execution of subsequent instructions that + // require the same resources. This is used for in-order execution pipelines + // within an out-of-order core. These are identified by BufferSize=1. 
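The rewritten comment distinguishes three resource cases by BufferSize; restated as a standalone classifier (enum and function names are ours, not LLVM's):

  // BufferSize == 0: reserved resource -- blocks issue and stalls the whole
  // pipeline. BufferSize == 1: unbuffered resource -- only stalls later
  // instructions needing the same unit (in-order pipes inside an
  // out-of-order core). Anything larger behaves as a normal buffer.
  enum class ResourceKind { Reserved, Unbuffered, Buffered };

  ResourceKind classify(unsigned BufferSize) {
    if (BufferSize == 0) return ResourceKind::Reserved;
    if (BufferSize == 1) return ResourceKind::Unbuffered;
    return ResourceKind::Buffered;
  }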
if (SchedModel.hasInstrSchedModel()) { const MCSchedClassDesc *SC = getSchedClass(SU); for (TargetSchedModel::ProcResIter @@ -736,7 +737,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, const TargetSubtargetInfo &ST = TM.getSubtarget(); bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI : ST.useAA(); - AliasAnalysis *AAForDep = UseAA ? AA : 0; + AliasAnalysis *AAForDep = UseAA ? AA : nullptr; MISUnitMap.clear(); ScheduleDAG::clearDAG(); @@ -751,20 +752,20 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, // to top. // Remember where a generic side-effecting instruction is as we procede. - SUnit *BarrierChain = 0, *AliasChain = 0; + SUnit *BarrierChain = nullptr, *AliasChain = nullptr; // Memory references to specific known memory locations are tracked // so that they can be given more precise dependencies. We track // separately the known memory locations that may alias and those // that are known not to alias - MapVector > AliasMemDefs, NonAliasMemDefs; - MapVector > AliasMemUses, NonAliasMemUses; + MapVector > AliasMemDefs, NonAliasMemDefs; + MapVector > AliasMemUses, NonAliasMemUses; std::set RejectMemNodes; // Remove any stale debug info; sometimes BuildSchedGraph is called again // without emitting the info from the previous call. DbgValues.clear(); - FirstDbgValue = NULL; + FirstDbgValue = nullptr; assert(Defs.empty() && Uses.empty() && "Only BuildGraph should update Defs/Uses"); @@ -781,13 +782,13 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, addSchedBarrierDeps(); // Walk the list of instructions, from bottom moving up. - MachineInstr *DbgMI = NULL; + MachineInstr *DbgMI = nullptr; for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin; MII != MIE; --MII) { MachineInstr *MI = std::prev(MII); if (MI && DbgMI) { DbgValues.push_back(std::make_pair(DbgMI, MI)); - DbgMI = NULL; + DbgMI = nullptr; } if (MI->isDebugValue()) { @@ -798,8 +799,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, assert(SU && "No SUnit mapped to this MI"); if (RPTracker) { - PressureDiff *PDiff = PDiffs ? &(*PDiffs)[SU->NodeNum] : 0; - RPTracker->recede(/*LiveUses=*/0, PDiff); + PressureDiff *PDiff = PDiffs ? &(*PDiffs)[SU->NodeNum] : nullptr; + RPTracker->recede(/*LiveUses=*/nullptr, PDiff); assert(RPTracker->getPos() == std::prev(MII) && "RPTracker can't find MI"); } @@ -854,13 +855,13 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, if (isGlobalMemoryObject(AA, MI)) { // Be conservative with these and add dependencies on all memory // references, even those that are known to not alias. 
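buildSchedGraph picks its alias analysis the same way throughout: an explicitly passed -enable-aa-sched-mi wins, otherwise the subtarget decides. The decision logic in isolation (FlagLike is a stand-in for cl::opt, not the real class):

  struct FlagLike {              // stand-in for cl::opt<bool>
    bool Value;
    unsigned Occurrences;
    unsigned getNumOccurrences() const { return Occurrences; }
  };

  // Only honor the flag if the user actually wrote it on the command line;
  // otherwise defer to the subtarget's default (ST.useAA() in the hunk).
  bool shouldUseAA(const FlagLike &EnableAASchedMI, bool SubtargetDefault) {
    return EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI.Value
                                                   : SubtargetDefault;
  }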
- for (MapVector >::iterator I = + for (MapVector >::iterator I = NonAliasMemDefs.begin(), E = NonAliasMemDefs.end(); I != E; ++I) { for (unsigned i = 0, e = I->second.size(); i != e; ++i) { I->second[i]->addPred(SDep(SU, SDep::Barrier)); } } - for (MapVector >::iterator I = + for (MapVector >::iterator I = NonAliasMemUses.begin(), E = NonAliasMemUses.end(); I != E; ++I) { for (unsigned i = 0, e = I->second.size(); i != e; ++i) { SDep Dep(SU, SDep::Barrier); @@ -894,12 +895,12 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k) addChainDependency(AAForDep, MFI, SU, PendingLoads[k], RejectMemNodes, TrueMemOrderLatency); - for (MapVector >::iterator I = + for (MapVector >::iterator I = AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I) { for (unsigned i = 0, e = I->second.size(); i != e; ++i) addChainDependency(AAForDep, MFI, SU, I->second[i], RejectMemNodes); } - for (MapVector >::iterator I = + for (MapVector >::iterator I = AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) { for (unsigned i = 0, e = I->second.size(); i != e; ++i) addChainDependency(AAForDep, MFI, SU, I->second[i], RejectMemNodes, @@ -922,7 +923,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, bool MayAlias = false; for (UnderlyingObjectsVector::iterator K = Objs.begin(), KE = Objs.end(); K != KE; ++K) { - const Value *V = K->getPointer(); + ValueType V = K->getPointer(); bool ThisMayAlias = K->getInt(); if (ThisMayAlias) MayAlias = true; @@ -930,9 +931,9 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, // A store to a specific PseudoSourceValue. Add precise dependencies. // Record the def in MemDefs, first adding a dep if there is // an existing def. - MapVector >::iterator I = + MapVector >::iterator I = ((ThisMayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V)); - MapVector >::iterator IE = + MapVector >::iterator IE = ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end()); if (I != IE) { for (unsigned i = 0, e = I->second.size(); i != e; ++i) @@ -955,9 +956,9 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, } } // Handle the uses in MemUses, if there are any. - MapVector >::iterator J = + MapVector >::iterator J = ((ThisMayAlias) ? AliasMemUses.find(V) : NonAliasMemUses.find(V)); - MapVector >::iterator JE = + MapVector >::iterator JE = ((ThisMayAlias) ? AliasMemUses.end() : NonAliasMemUses.end()); if (J != JE) { for (unsigned i = 0, e = J->second.size(); i != e; ++i) @@ -986,11 +987,6 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, // we have lost all RejectMemNodes below barrier. if (BarrierChain) BarrierChain->addPred(SDep(SU, SDep::Barrier)); - - if (!ExitSU.isPred(SU)) - // Push store's up a bit to avoid them getting in between cmp - // and branches. - ExitSU.addPred(SDep(SU, SDep::Artificial)); } else if (MI->mayLoad()) { bool MayAlias = true; if (MI->isInvariantLoad(AA)) { @@ -1002,7 +998,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, if (Objs.empty()) { // A load with no underlying object. Depend on all // potentially aliasing stores. 
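The maps above key chains of memory SUnits by underlying object; a global memory object conservatively gets a barrier edge against every tracked access, aliasing or not. A toy model of that bookkeeping (containers simplified from MapVector):

  #include <map>
  #include <string>
  #include <vector>

  struct MiniSU { std::vector<const MiniSU *> Preds; };

  // Conservative barrier: every previously recorded access becomes ordered
  // against the new side-effecting instruction SU (mirrors the
  // addPred(SDep(SU, SDep::Barrier)) calls in the hunk).
  void addBarrierEdges(MiniSU &SU,
                       std::map<std::string, std::vector<MiniSU *>> &MemDefs) {
    for (auto &Entry : MemDefs)
      for (MiniSU *Def : Entry.second)
        Def->Preds.push_back(&SU);
  }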
- for (MapVector >::iterator I = + for (MapVector >::iterator I = AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I) for (unsigned i = 0, e = I->second.size(); i != e; ++i) addChainDependency(AAForDep, MFI, SU, I->second[i], @@ -1016,16 +1012,16 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, for (UnderlyingObjectsVector::iterator J = Objs.begin(), JE = Objs.end(); J != JE; ++J) { - const Value *V = J->getPointer(); + ValueType V = J->getPointer(); bool ThisMayAlias = J->getInt(); if (ThisMayAlias) MayAlias = true; // A load from a specific PseudoSourceValue. Add precise dependencies. - MapVector >::iterator I = + MapVector >::iterator I = ((ThisMayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V)); - MapVector >::iterator IE = + MapVector >::iterator IE = ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end()); if (I != IE) for (unsigned i = 0, e = I->second.size(); i != e; ++i) @@ -1429,7 +1425,7 @@ public: const SDep *backtrack() { DFSStack.pop_back(); - return DFSStack.empty() ? 0 : std::prev(DFSStack.back().second); + return DFSStack.empty() ? nullptr : std::prev(DFSStack.back().second); } const SUnit *getCurr() const { return DFSStack.back().first; } diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp index 2cd84d6..004c685 100644 --- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp +++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE ::llvm::ScoreboardHazardRecognizer::DebugType #include "llvm/CodeGen/ScoreboardHazardRecognizer.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/MC/MCInstrItineraries.h" @@ -24,6 +23,8 @@ using namespace llvm; +#define DEBUG_TYPE ::llvm::ScoreboardHazardRecognizer::DebugType + #ifndef NDEBUG const char *ScoreboardHazardRecognizer::DebugType = ""; #endif @@ -126,7 +127,7 @@ ScoreboardHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { // free FU's in the scoreboard at the appropriate future cycles. const MCInstrDesc *MCID = DAG->getInstrDesc(SU); - if (MCID == NULL) { + if (!MCID) { // Don't check hazards for non-machineinstr Nodes. return NoHazard; } diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index cc0c5fa..2d2fd53 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16,7 +16,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "dagcombine" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -40,6 +39,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "dagcombine" + STATISTIC(NodesCombined , "Number of dag nodes combined"); STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created"); STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created"); @@ -56,14 +57,8 @@ namespace { CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden, cl::desc("Enable DAG combiner's use of IR alias analysis")); -// FIXME: Enable the use of TBAA. There are two known issues preventing this: -// 1. Stack coloring does not update TBAA when merging allocas -// 2. CGP inserts ptrtoint/inttoptr pairs when sinking address computations. -// Because BasicAA does not handle inttoptr, we'll often miss basic type -// punning idioms that we need to catch so we don't miscompile real-world -// code. 
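AddUsersToWorkList below is the first of several loops converted from explicit use_iterator begin/end pairs to range-based for over uses(); the same shape with a plain container:

  #include <vector>

  struct MiniNode {
    std::vector<MiniNode *> Users;
    const std::vector<MiniNode *> &uses() const { return Users; }
  };

  void addUsersToWorkList(const MiniNode *N, std::vector<MiniNode *> &WL) {
    for (MiniNode *Use : N->uses())   // was: use_iterator UI/UE pair
      WL.push_back(Use);
  }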
static cl::opt - UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(false), + UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true), cl::desc("Enable DAG combiner's use of TBAA")); #ifndef NDEBUG @@ -120,9 +115,8 @@ namespace { /// now. /// void AddUsersToWorkList(SDNode *N) { - for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); - UI != UE; ++UI) - AddToWorkList(*UI); + for (SDNode *Node : N->uses()) + AddToWorkList(Node); } /// visit - call the node-specific routine that knows how to fold each @@ -173,6 +167,7 @@ namespace { bool CombineToPreIndexedLoadStore(SDNode *N); bool CombineToPostIndexedLoadStore(SDNode *N); + SDValue SplitIndexingFromLoad(LoadSDNode *LD); bool SliceUpLoad(SDNode *N); void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad); @@ -324,26 +319,7 @@ namespace { /// isAlias - Return true if there is any possibility that the two addresses /// overlap. - bool isAlias(SDValue Ptr1, int64_t Size1, bool IsVolatile1, - const Value *SrcValue1, int SrcValueOffset1, - unsigned SrcValueAlign1, - const MDNode *TBAAInfo1, - SDValue Ptr2, int64_t Size2, bool IsVolatile2, - const Value *SrcValue2, int SrcValueOffset2, - unsigned SrcValueAlign2, - const MDNode *TBAAInfo2) const; - - /// isAlias - Return true if there is any possibility that the two addresses - /// overlap. - bool isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1); - - /// FindAliasInfo - Extracts the relevant alias information from the memory - /// node. Returns true if the operand was a load. - bool FindAliasInfo(SDNode *N, - SDValue &Ptr, int64_t &Size, bool &IsVolatile, - const Value *&SrcValue, int &SrcValueOffset, - unsigned &SrcValueAlignment, - const MDNode *&TBAAInfo) const; + bool isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const; /// FindBetterChain - Walk up chain skipping non-aliasing memory nodes, /// looking for a better chain (aliasing node.) @@ -660,7 +636,7 @@ static SDNode *isConstantBuildVectorOrConstantInt(SDValue N) { BuildVectorSDNode *BV = dyn_cast(N); if(BV && BV->isConstant()) return BV; - return NULL; + return nullptr; } // \brief Returns the SDNode if it is a constant splat BuildVector or constant @@ -669,8 +645,13 @@ static ConstantSDNode *isConstOrConstSplat(SDValue N) { if (ConstantSDNode *CN = dyn_cast(N)) return CN; - if (BuildVectorSDNode *BV = dyn_cast(N)) - return BV->getConstantSplatValue(); + if (BuildVectorSDNode *BV = dyn_cast(N)) { + ConstantSDNode *CN = BV->getConstantSplatValue(); + + // BuildVectors can truncate their operands. Ignore that case here. + if (CN && CN->getValueType(0) == N.getValueType().getScalarType()) + return CN; + } return nullptr; } @@ -781,10 +762,14 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { // If the operands of this node are only used by the node, they will now // be dead. Make sure to visit them first to delete dead nodes early. - for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands(); i != e; ++i) - if (TLO.Old.getNode()->getOperand(i).getNode()->hasOneUse()) - AddToWorkList(TLO.Old.getNode()->getOperand(i).getNode()); - + for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands(); i != e; ++i) { + SDNode *Op = TLO.Old.getNode()->getOperand(i).getNode(); + // For an operand generating multiple values, one of the values may + // become dead allowing further simplification (e.g. split index + // arithmetic from an indexed load). 
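The extra check added to isConstOrConstSplat matters because BUILD_VECTOR operands may be wider than the element type and get implicitly truncated; the guard in miniature (MiniConst is a stand-in for ConstantSDNode):

  struct MiniConst { unsigned Bits; };   // stand-in for ConstantSDNode

  // Only report a splat constant when its type already matches the vector's
  // scalar type, so no implicit BUILD_VECTOR truncation is being ignored.
  const MiniConst *splatIfExactType(const MiniConst *Splat,
                                    unsigned ScalarBits) {
    if (Splat && Splat->Bits == ScalarBits)
      return Splat;
    return nullptr;
  }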
+ if (Op->hasOneUse() || Op->getNumValues() > 1) + AddToWorkList(Op); + } DAG.DeleteNode(TLO.Old.getNode()); } } @@ -876,7 +861,7 @@ SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) { SDLoc dl(Op); bool Replace = false; SDValue NewOp = PromoteOperand(Op, PVT, Replace); - if (NewOp.getNode() == 0) + if (!NewOp.getNode()) return SDValue(); AddToWorkList(NewOp.getNode()); @@ -891,7 +876,7 @@ SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) { SDLoc dl(Op); bool Replace = false; SDValue NewOp = PromoteOperand(Op, PVT, Replace); - if (NewOp.getNode() == 0) + if (!NewOp.getNode()) return SDValue(); AddToWorkList(NewOp.getNode()); @@ -926,7 +911,7 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { bool Replace0 = false; SDValue N0 = Op.getOperand(0); SDValue NN0 = PromoteOperand(N0, PVT, Replace0); - if (NN0.getNode() == 0) + if (!NN0.getNode()) return SDValue(); bool Replace1 = false; @@ -936,7 +921,7 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { NN1 = NN0; else { NN1 = PromoteOperand(N1, PVT, Replace1); - if (NN1.getNode() == 0) + if (!NN1.getNode()) return SDValue(); } @@ -989,7 +974,7 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) { N0 = ZExtPromoteOperand(Op.getOperand(0), PVT); else N0 = PromoteOperand(N0, PVT, Replace); - if (N0.getNode() == 0) + if (!N0.getNode()) return SDValue(); AddToWorkList(N0.getNode()); @@ -1134,7 +1119,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) { SDValue RV = combine(N); - if (RV.getNode() == 0) + if (!RV.getNode()) continue; ++NodesCombined; @@ -1282,7 +1267,7 @@ SDValue DAGCombiner::combine(SDNode *N) { SDValue RV = visit(N); // If nothing happened, try a target-specific DAG combine. - if (RV.getNode() == 0) { + if (!RV.getNode()) { assert(N->getOpcode() != ISD::DELETED_NODE && "Node was deleted but visit returned NULL!"); @@ -1298,7 +1283,7 @@ SDValue DAGCombiner::combine(SDNode *N) { } // If nothing happened still, try promoting the operation. - if (RV.getNode() == 0) { + if (!RV.getNode()) { switch (N->getOpcode()) { default: break; case ISD::ADD: @@ -1328,8 +1313,7 @@ SDValue DAGCombiner::combine(SDNode *N) { // If N is a commutative binary node, try commuting it to enable more // sdisel CSE. - if (RV.getNode() == 0 && - SelectionDAG::isCommutativeBinOp(N->getOpcode()) && + if (!RV.getNode() && SelectionDAG::isCommutativeBinOp(N->getOpcode()) && N->getNumValues() == 1) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -1338,7 +1322,7 @@ SDValue DAGCombiner::combine(SDNode *N) { if (isa(N0) || !isa(N1)) { SDValue Ops[] = { N1, N0 }; SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), - Ops, 2); + Ops); if (CSENode) return SDValue(CSENode, 0); } @@ -1428,8 +1412,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { Result = DAG.getEntryNode(); } else { // New and improved token factor. - Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), - MVT::Other, &Ops[0], Ops.size()); + Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops); } // Don't add users to work list. 
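The recurring `&Ops[0], Ops.size()` argument pairs in the getNode calls collapse into single ArrayRef-style parameters. The shape of that migration with a minimal stand-in (llvm::ArrayRef itself also converts implicitly from SmallVector and std::vector):

  #include <cstddef>
  #include <vector>

  template <typename T> struct MiniArrayRef {   // stand-in for ArrayRef<T>
    const T *Data;
    size_t Size;
    MiniArrayRef(const std::vector<T> &V) : Data(V.data()), Size(V.size()) {}
  };

  // was: sumOps(const int *Ops, unsigned NumOps)
  int sumOps(MiniArrayRef<int> Ops) {
    int S = 0;
    for (size_t I = 0; I != Ops.Size; ++I)
      S += Ops.Data[I];
    return S;
  }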
@@ -1528,7 +1511,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) { N0.getOperand(1)); // reassociate add SDValue RADD = ReassociateOps(ISD::ADD, SDLoc(N), N0, N1); - if (RADD.getNode() != 0) + if (RADD.getNode()) return RADD; // fold ((0-A) + B) -> B-A if (N0.getOpcode() == ISD::SUB && isa(N0.getOperand(0)) && @@ -1581,10 +1564,10 @@ SDValue DAGCombiner::visitADD(SDNode *N) { if (VT.isInteger() && !VT.isVector()) { APInt LHSZero, LHSOne; APInt RHSZero, RHSOne; - DAG.ComputeMaskedBits(N0, LHSZero, LHSOne); + DAG.computeKnownBits(N0, LHSZero, LHSOne); if (LHSZero.getBoolValue()) { - DAG.ComputeMaskedBits(N1, RHSZero, RHSOne); + DAG.computeKnownBits(N1, RHSZero, RHSOne); // If all possibly-set bits on the LHS are clear on the RHS, return an OR. // If all possibly-set bits on the RHS are clear on the LHS, return an OR. @@ -1676,10 +1659,10 @@ SDValue DAGCombiner::visitADDC(SDNode *N) { // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits. APInt LHSZero, LHSOne; APInt RHSZero, RHSOne; - DAG.ComputeMaskedBits(N0, LHSZero, LHSOne); + DAG.computeKnownBits(N0, LHSZero, LHSOne); if (LHSZero.getBoolValue()) { - DAG.ComputeMaskedBits(N1, RHSZero, RHSOne); + DAG.computeKnownBits(N1, RHSZero, RHSOne); // If all possibly-set bits on the LHS are clear on the RHS, return an OR. // If all possibly-set bits on the RHS are clear on the LHS, return an OR. @@ -1728,7 +1711,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { SDValue N1 = N->getOperand(1); ConstantSDNode *N0C = dyn_cast(N0.getNode()); ConstantSDNode *N1C = dyn_cast(N1.getNode()); - ConstantSDNode *N1C1 = N1.getOpcode() != ISD::ADD ? 0 : + ConstantSDNode *N1C1 = N1.getOpcode() != ISD::ADD ? nullptr : dyn_cast(N1.getOperand(1).getNode()); EVT VT = N0.getValueType(); @@ -1881,10 +1864,10 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { N0IsConst = isConstantSplatVector(N0.getNode(), ConstValue0); N1IsConst = isConstantSplatVector(N1.getNode(), ConstValue1); } else { - N0IsConst = dyn_cast(N0) != 0; + N0IsConst = dyn_cast(N0) != nullptr; ConstValue0 = N0IsConst ? (dyn_cast(N0))->getAPIntValue() : APInt(); - N1IsConst = dyn_cast(N1) != 0; + N1IsConst = dyn_cast(N1) != nullptr; ConstValue1 = N1IsConst ? (dyn_cast(N1))->getAPIntValue() : APInt(); } @@ -1942,7 +1925,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one // use. { - SDValue Sh(0,0), Y(0,0); + SDValue Sh(nullptr,0), Y(nullptr,0); // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)). 
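The renamed computeKnownBits drives the same fold as before: an add whose operands provably share no set bits cannot carry, so it is an or. Verified as a standalone sketch:

  #include <cassert>
  #include <cstdint>

  // All possibly-set bits of one side are known clear on the other side
  // exactly when the two known-zero masks together cover every bit.
  bool addFoldsToOr(uint64_t LHSKnownZero, uint64_t RHSKnownZero) {
    return (LHSKnownZero | RHSKnownZero) == ~0ULL;
  }

  int main() {
    uint64_t A = 0x00FF, B = 0xFF00;           // disjoint bit ranges
    assert(addFoldsToOr(~A, ~B) && (A + B) == (A | B));
    return 0;
  }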
if (N0.getOpcode() == ISD::SHL && (isConstantSplatVector(N0.getOperand(1).getNode(), Val) || @@ -1975,7 +1958,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { // reassociate mul SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1); - if (RMUL.getNode() != 0) + if (RMUL.getNode()) return RMUL; return SDValue(); @@ -1984,8 +1967,8 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { SDValue DAGCombiner::visitSDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ConstantSDNode *N0C = dyn_cast(N0.getNode()); - ConstantSDNode *N1C = dyn_cast(N1.getNode()); + ConstantSDNode *N0C = isConstOrConstSplat(N0); + ConstantSDNode *N1C = isConstOrConstSplat(N1); EVT VT = N->getValueType(0); // fold vector ops @@ -2011,10 +1994,10 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { return DAG.getNode(ISD::UDIV, SDLoc(N), N1.getValueType(), N0, N1); } + // fold (sdiv X, pow2) -> simple ops after legalize - if (N1C && !N1C->isNullValue() && - (N1C->getAPIntValue().isPowerOf2() || - (-N1C->getAPIntValue()).isPowerOf2())) { + if (N1C && !N1C->isNullValue() && (N1C->getAPIntValue().isPowerOf2() || + (-N1C->getAPIntValue()).isPowerOf2())) { // If dividing by powers of two is cheap, then don't perform the following // fold. if (TLI.isPow2DivCheap()) @@ -2023,15 +2006,17 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { unsigned lg2 = N1C->getAPIntValue().countTrailingZeros(); // Splat the sign bit into the register - SDValue SGN = DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, - DAG.getConstant(VT.getSizeInBits()-1, - getShiftAmountTy(N0.getValueType()))); + SDValue SGN = + DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, + DAG.getConstant(VT.getScalarSizeInBits() - 1, + getShiftAmountTy(N0.getValueType()))); AddToWorkList(SGN.getNode()); // Add (N0 < 0) ? abs2 - 1 : 0; - SDValue SRL = DAG.getNode(ISD::SRL, SDLoc(N), VT, SGN, - DAG.getConstant(VT.getSizeInBits() - lg2, - getShiftAmountTy(SGN.getValueType()))); + SDValue SRL = + DAG.getNode(ISD::SRL, SDLoc(N), VT, SGN, + DAG.getConstant(VT.getScalarSizeInBits() - lg2, + getShiftAmountTy(SGN.getValueType()))); SDValue ADD = DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, SRL); AddToWorkList(SRL.getNode()); AddToWorkList(ADD.getNode()); // Divide by pow2 @@ -2044,13 +2029,12 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { return SRA; AddToWorkList(SRA.getNode()); - return DAG.getNode(ISD::SUB, SDLoc(N), VT, - DAG.getConstant(0, VT), SRA); + return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), SRA); } // if integer divide is expensive and we satisfy the requirements, emit an // alternate sequence. 
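The reformatted SDIV-by-power-of-two sequence is easier to follow as plain arithmetic: splat the sign, add divisor-1 only for negative dividends, then shift. A runnable 32-bit version (assumes arithmetic right shift on signed ints, as the generated SRA does):

  #include <cassert>
  #include <cstdint>

  int32_t sdivPow2(int32_t X, unsigned Lg2) {     // divide by +2^Lg2
    int32_t Sgn = X >> 31;                        // SRA: 0 or -1
    int32_t Bias = (int32_t)((uint32_t)Sgn >> (32 - Lg2)); // SRL: 2^Lg2-1 or 0
    return (X + Bias) >> Lg2;                     // ADD, then the final SRA
  }

  int main() {
    assert(sdivPow2(7, 1) == 3 && sdivPow2(-7, 1) == -3);
    assert(sdivPow2(5, 2) == 1 && sdivPow2(-8, 2) == -2);
    return 0;
  }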
- if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap()) { + if (N1C && !TLI.isIntDivCheap()) { SDValue Op = BuildSDIV(N); if (Op.getNode()) return Op; } @@ -2068,8 +2052,8 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { SDValue DAGCombiner::visitUDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ConstantSDNode *N0C = dyn_cast(N0.getNode()); - ConstantSDNode *N1C = dyn_cast(N1.getNode()); + ConstantSDNode *N0C = isConstOrConstSplat(N0); + ConstantSDNode *N1C = isConstOrConstSplat(N1); EVT VT = N->getValueType(0); // fold vector ops @@ -2102,7 +2086,7 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { } } // fold (udiv x, c) -> alternate - if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap()) { + if (N1C && !TLI.isIntDivCheap()) { SDValue Op = BuildUDIV(N); if (Op.getNode()) return Op; } @@ -2120,8 +2104,8 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { SDValue DAGCombiner::visitSREM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ConstantSDNode *N0C = dyn_cast(N0); - ConstantSDNode *N1C = dyn_cast(N1); + ConstantSDNode *N0C = isConstOrConstSplat(N0); + ConstantSDNode *N1C = isConstOrConstSplat(N1); EVT VT = N->getValueType(0); // fold (srem c1, c2) -> c1%c2 @@ -2162,8 +2146,8 @@ SDValue DAGCombiner::visitSREM(SDNode *N) { SDValue DAGCombiner::visitUREM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ConstantSDNode *N0C = dyn_cast(N0); - ConstantSDNode *N1C = dyn_cast(N1); + ConstantSDNode *N0C = isConstOrConstSplat(N0); + ConstantSDNode *N1C = isConstOrConstSplat(N1); EVT VT = N->getValueType(0); // fold (urem c1, c2) -> c1%c2 @@ -2298,7 +2282,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, (!LegalOperations || TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) { SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), - N->op_begin(), N->getNumOperands()); + ArrayRef(N->op_begin(), N->op_end())); return CombineTo(N, Res, Res); } @@ -2308,7 +2292,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, (!LegalOperations || TLI.isOperationLegal(HiOp, N->getValueType(1)))) { SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), - N->op_begin(), N->getNumOperands()); + ArrayRef(N->op_begin(), N->op_end())); return CombineTo(N, Res, Res); } @@ -2319,7 +2303,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, // If the two computed results can be simplified separately, separate them. if (LoExists) { SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), - N->op_begin(), N->getNumOperands()); + ArrayRef(N->op_begin(), N->op_end())); AddToWorkList(Lo.getNode()); SDValue LoOpt = combine(Lo.getNode()); if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() && @@ -2330,7 +2314,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, if (HiExists) { SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), - N->op_begin(), N->getNumOperands()); + ArrayRef(N->op_begin(), N->op_end())); AddToWorkList(Hi.getNode()); SDValue HiOpt = combine(Hi.getNode()); if (HiOpt.getNode() && HiOpt != Hi && @@ -2532,7 +2516,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { assert(N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() && "Inputs to shuffles are not the same type"); - + // Check that both shuffles use the same mask. The masks are known to be of // the same length because the result vector type is the same. 
// Check also that shuffles have only one use to avoid introducing extra @@ -2632,7 +2616,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { return DAG.getConstant(0, VT); // reassociate and SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1); - if (RAND.getNode() != 0) + if (RAND.getNode()) return RAND; // fold (and (or x, C), D) -> D if (C & D) == D if (N1C && N0.getOpcode() == ISD::OR) @@ -3165,7 +3149,7 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { if (!TLI.isOperationLegal(ISD::BSWAP, VT)) return SDValue(); - SmallVector Parts(4, (SDNode*)0); + SmallVector Parts(4, (SDNode*)nullptr); // Look for either // (or (or (and), (and)), (or (and), (and))) // (or (or (or (and), (and)), (and)), (and)) @@ -3270,11 +3254,11 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // two ways to fold this node into a shuffle. SmallVector Mask1; SmallVector Mask2; - + for (unsigned i = 0; i != NumElts && CanFold; ++i) { int M0 = SV0->getMaskElt(i); int M1 = SV1->getMaskElt(i); - + // Both shuffle indexes are undef. Propagate Undef. if (M0 < 0 && M1 < 0) { Mask1.push_back(M0); @@ -3288,7 +3272,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { CanFold = false; break; } - + Mask1.push_back(M0 < (int)NumElts ? M0 : M1 + NumElts); Mask2.push_back(M1 < (int)NumElts ? M1 : M0 + NumElts); } @@ -3329,15 +3313,15 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16) SDValue BSwap = MatchBSwapHWord(N, N0, N1); - if (BSwap.getNode() != 0) + if (BSwap.getNode()) return BSwap; BSwap = MatchBSwapHWordLow(N, N0, N1); - if (BSwap.getNode() != 0) + if (BSwap.getNode()) return BSwap; // reassociate or SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1); - if (ROR.getNode() != 0) + if (ROR.getNode()) return ROR; // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2) // iff (c1 & c2) == 0. @@ -3582,28 +3566,7 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, HasPos ? Pos : Neg).getNode(); } - // fold (or (shl (*ext x), (*ext y)), - // (srl (*ext x), (*ext (sub 32, y)))) -> - // (*ext (rotl x, y)) or (*ext (rotr x, (sub 32, y))) - // - // fold (or (shl (*ext x), (*ext (sub 32, y))), - // (srl (*ext x), (*ext y))) -> - // (*ext (rotr x, y)) or (*ext (rotl x, (sub 32, y))) - if (Shifted.getOpcode() == ISD::ZERO_EXTEND || - Shifted.getOpcode() == ISD::ANY_EXTEND) { - SDValue InnerShifted = Shifted.getOperand(0); - EVT InnerVT = InnerShifted.getValueType(); - bool HasPosInner = TLI.isOperationLegalOrCustom(PosOpcode, InnerVT); - if (HasPosInner || TLI.isOperationLegalOrCustom(NegOpcode, InnerVT)) { - if (matchRotateSub(InnerPos, InnerNeg, InnerVT.getSizeInBits())) { - SDValue V = DAG.getNode(HasPosInner ? PosOpcode : NegOpcode, DL, - InnerVT, InnerShifted, HasPosInner ? Pos : Neg); - return DAG.getNode(Shifted.getOpcode(), DL, VT, V).getNode(); - } - } - } - - return 0; + return nullptr; } // MatchRotate - Handle an 'or' of two operands. If this is one of the many @@ -3612,29 +3575,29 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) { // Must be a legal type. Expanded 'n promoted things won't work with rotates. EVT VT = LHS.getValueType(); - if (!TLI.isTypeLegal(VT)) return 0; + if (!TLI.isTypeLegal(VT)) return nullptr; // The target must have at least one rotate flavor. 
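Even with the extended-operand variant deleted, MatchRotate still looks for the classic shl/srl pair; what it recognizes, as runnable integer code:

  #include <cassert>
  #include <cstdint>

  // (or (shl x, c), (srl x, w-c)) is a rotate-left by c when the two
  // constant shift amounts sum to the bit width w.
  uint32_t rotlViaShifts(uint32_t X, unsigned C) {   // 0 < C < 32
    return (X << C) | (X >> (32 - C));
  }

  int main() {
    assert(rotlViaShifts(0x80000001u, 1) == 0x00000003u);
    assert(rotlViaShifts(0x12345678u, 8) == 0x34567812u);
    return 0;
  }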
bool HasROTL = TLI.isOperationLegalOrCustom(ISD::ROTL, VT); bool HasROTR = TLI.isOperationLegalOrCustom(ISD::ROTR, VT); - if (!HasROTL && !HasROTR) return 0; + if (!HasROTL && !HasROTR) return nullptr; // Match "(X shl/srl V1) & V2" where V2 may not be present. SDValue LHSShift; // The shift. SDValue LHSMask; // AND value if any. if (!MatchRotateHalf(LHS, LHSShift, LHSMask)) - return 0; // Not part of a rotate. + return nullptr; // Not part of a rotate. SDValue RHSShift; // The shift. SDValue RHSMask; // AND value if any. if (!MatchRotateHalf(RHS, RHSShift, RHSMask)) - return 0; // Not part of a rotate. + return nullptr; // Not part of a rotate. if (LHSShift.getOperand(0) != RHSShift.getOperand(0)) - return 0; // Not shifting the same value. + return nullptr; // Not shifting the same value. if (LHSShift.getOpcode() == RHSShift.getOpcode()) - return 0; // Shifts must disagree. + return nullptr; // Shifts must disagree. // Canonicalize shl to left side in a shl/srl pair. if (RHSShift.getOpcode() == ISD::SHL) { @@ -3656,7 +3619,7 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) { uint64_t LShVal = cast(LHSShiftAmt)->getZExtValue(); uint64_t RShVal = cast(RHSShiftAmt)->getZExtValue(); if ((LShVal + RShVal) != OpSizeInBits) - return 0; + return nullptr; SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt); @@ -3683,7 +3646,7 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) { // If there is a mask here, and we have a variable shift, we can't be sure // that we're masking out the right stuff. if (LHSMask.getNode() || RHSMask.getNode()) - return 0; + return nullptr; // If the shift amount is sign/zext/any-extended just peel it off. SDValue LExtOp0 = LHSShiftAmt; @@ -3710,7 +3673,7 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) { if (TryR) return TryR; - return 0; + return nullptr; } SDValue DAGCombiner::visitXOR(SDNode *N) { @@ -3752,7 +3715,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { return N0; // reassociate xor SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1); - if (RXOR.getNode() != 0) + if (RXOR.getNode()) return RXOR; // fold !(x cc y) -> (x !cc y) @@ -3909,6 +3872,9 @@ SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) { return SDValue(); } + if (!TLI.isDesirableToCommuteWithShift(LHS)) + return SDValue(); + // Fold the constants, shifting the binop RHS by the shift amount. SDValue NewRHS = DAG.getNode(N->getOpcode(), SDLoc(LHS->getOperand(1)), N->getValueType(0), @@ -4382,7 +4348,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (N1C && N0.getOpcode() == ISD::CTLZ && N1C->getAPIntValue() == Log2_32(OpSizeInBits)) { APInt KnownZero, KnownOne; - DAG.ComputeMaskedBits(N0.getOperand(0), KnownZero, KnownOne); + DAG.computeKnownBits(N0.getOperand(0), KnownZero, KnownOne); // If any of the input bits are KnownOne, then the input couldn't be all // zeros, thus the result of the srl will always be zero. @@ -4745,7 +4711,7 @@ SDValue DAGCombiner::visitSETCC(SDNode *N) { // tryToFoldExtendOfConstant - Try to fold a sext/zext/aext // dag node into a ConstantSDNode or a build_vector of constants. // This function is called by the DAGCombiner when visiting sext/zext/aext -// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND). +// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND). // Vector extends are not folded if operations are legal; this is to // avoid introducing illegal build_vector dag nodes. 
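The new isDesirableToCommuteWithShift hook in visitShiftByConstant gates a fold that is otherwise a pure identity; the identity itself, checked standalone:

  #include <cassert>
  #include <cstdint>

  int main() {
    // (binop x, c1) << c2  ==  binop (x << c2), (c1 << c2)  modulo 2^32,
    // for binops like add and or that distribute over the shift.
    uint32_t X = 0x1234, C1 = 7, C2 = 3;
    assert(((X + C1) << C2) == ((X << C2) + (C1 << C2)));
    assert(((X | C1) << C2) == ((X << C2) | (C1 << C2)));
    return 0;
  }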
static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, @@ -4771,8 +4737,8 @@ static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, if (!(VT.isVector() && (!LegalTypes || (!LegalOperations && TLI.isTypeLegal(SVT))) && ISD::isBuildVectorOfConstantSDNodes(N0.getNode()))) - return 0; - + return nullptr; + // We can fold this node into a build_vector. unsigned VTBits = SVT.getSizeInBits(); unsigned EVTBits = N0->getValueType(0).getScalarType().getSizeInBits(); @@ -4798,7 +4764,7 @@ static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, SVT)); } - return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], NumElts).getNode(); + return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Elts).getNode(); } // ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this: @@ -4882,8 +4848,7 @@ void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl &SetCCs, } Ops.push_back(SetCC->getOperand(2)); - CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), - &Ops[0], Ops.size())); + CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops)); } } @@ -4957,6 +4922,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // on vectors in one instruction. We only perform this transformation on // scalars. if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && + ISD::isUNINDEXEDLoad(N0.getNode()) && ((!LegalOperations && !cast(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()))) { bool DoXform = true; @@ -5009,7 +4975,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()) && (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { LoadSDNode *LN0 = cast(N0.getOperand(0)); - if (LN0->getExtensionType() != ISD::ZEXTLOAD) { + if (LN0->getExtensionType() != ISD::ZEXTLOAD && LN0->isUnindexed()) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) @@ -5108,13 +5074,13 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // isTruncateOf - If N is a truncate of some other value, return true, record // the value being truncated in Op and which of Op's bits are zero in KnownZero. // This function computes KnownZero to avoid a duplicated call to -// ComputeMaskedBits in the caller. +// computeKnownBits in the caller. static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, APInt &KnownZero) { APInt KnownOne; if (N->getOpcode() == ISD::TRUNCATE) { Op = N->getOperand(0); - DAG.ComputeMaskedBits(Op, KnownZero, KnownOne); + DAG.computeKnownBits(Op, KnownZero, KnownOne); return true; } @@ -5135,7 +5101,7 @@ static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, else return false; - DAG.ComputeMaskedBits(Op, KnownZero, KnownOne); + DAG.computeKnownBits(Op, KnownZero, KnownOne); if (!(KnownZero | APInt(Op.getValueSizeInBits(), 1)).isAllOnesValue()) return false; @@ -5250,6 +5216,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // on vectors in one instruction. We only perform this transformation on // scalars. 
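isTruncateOf hands its KnownZero result back so visitZERO_EXTEND can fold zext(trunc x) to x outright when the dropped bits were zero anyway; the idea in miniature (widths fixed at 32/16 for illustration):

  #include <cassert>
  #include <cstdint>

  // If every bit above the truncation point is known zero, truncating to
  // 16 bits and zero-extending back is the identity.
  uint32_t zextOfTrunc16(uint32_t X, uint32_t KnownZero) {
    if ((KnownZero & 0xFFFF0000u) == 0xFFFF0000u)
      return X;                         // fold both conversions away
    return (uint32_t)(uint16_t)X;       // otherwise keep trunc + zext
  }

  int main() {
    assert(zextOfTrunc16(0x0000BEEFu, 0xFFFF0000u) == 0x0000BEEFu);
    return 0;
  }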
if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && + ISD::isUNINDEXEDLoad(N0.getNode()) && ((!LegalOperations && !cast(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()))) { bool DoXform = true; @@ -5282,7 +5249,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()) && (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { LoadSDNode *LN0 = cast(N0.getOperand(0)); - if (LN0->getExtensionType() != ISD::SEXTLOAD) { + if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed()) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) @@ -5353,7 +5320,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { N0.getOperand(1), cast(N0.getOperand(2))->get()), DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, - &OneOps[0], OneOps.size())); + OneOps)); // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then @@ -5370,8 +5337,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { cast(N0.getOperand(2))->get()); return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT), - DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, - &OneOps[0], OneOps.size())); + DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, OneOps)); } // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc @@ -5478,6 +5444,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { // on vectors in one instruction. We only perform this transformation on // scalars. if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && + ISD::isUNINDEXEDLoad(N0.getNode()) && ((!LegalOperations && !cast(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) { bool DoXform = true; @@ -5507,20 +5474,26 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { !ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) { LoadSDNode *LN0 = cast(N0); + ISD::LoadExtType ExtType = LN0->getExtensionType(); EVT MemVT = LN0->getMemoryVT(); - SDValue ExtLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(N), - VT, LN0->getChain(), LN0->getBasePtr(), - MemVT, LN0->getMemOperand()); - CombineTo(N, ExtLoad); - CombineTo(N0.getNode(), - DAG.getNode(ISD::TRUNCATE, SDLoc(N0), - N0.getValueType(), ExtLoad), - ExtLoad.getValue(1)); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + if (!LegalOperations || TLI.isLoadExtLegal(ExtType, MemVT)) { + SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N), + VT, LN0->getChain(), LN0->getBasePtr(), + MemVT, LN0->getMemOperand()); + CombineTo(N, ExtLoad); + CombineTo(N0.getNode(), + DAG.getNode(ISD::TRUNCATE, SDLoc(N0), + N0.getValueType(), ExtLoad), + ExtLoad.getValue(1)); + return SDValue(N, 0); // Return N so it doesn't get rechecked! + } } if (N0.getOpcode() == ISD::SETCC) { - // aext(setcc) -> sext_in_reg(vsetcc) for vectors. + // For vectors: + // aext(setcc) -> vsetcc + // aext(setcc) -> truncate(vsetcc) + // aext(setcc) -> aext(vsetcc) // Only do this before legalize for now. 
if (VT.isVector() && !LegalOperations) { EVT N0VT = N0.getOperand(0).getValueType(); @@ -5535,19 +5508,14 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { cast(N0.getOperand(2))->get()); // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then - // truncate/sign extend + // truncate/any extend else { - EVT MatchingElementType = - EVT::getIntegerVT(*DAG.getContext(), - N0VT.getScalarType().getSizeInBits()); - EVT MatchingVectorType = - EVT::getVectorVT(*DAG.getContext(), MatchingElementType, - N0VT.getVectorNumElements()); + EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger(); SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); - return DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT); + return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT); } } @@ -5571,7 +5539,7 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) { default: break; case ISD::Constant: { const ConstantSDNode *CV = cast(V.getNode()); - assert(CV != 0 && "Const value should be ConstSDNode."); + assert(CV && "Const value should be ConstSDNode."); const APInt &CVal = CV->getAPIntValue(); APInt NewVal = CVal & Mask; if (NewVal != CVal) @@ -5872,7 +5840,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { if (EVTBits <= 16 && N0.getOpcode() == ISD::OR) { SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), N0.getOperand(1), false); - if (BSwap.getNode() != 0) + if (BSwap.getNode()) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1); } @@ -5897,7 +5865,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { Op.getValueType())); } - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, &Elts[0], NumElts); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Elts); } return SDValue(); @@ -5998,8 +5966,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset) Opnds.push_back(BuildVect.getOperand(i)); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, &Opnds[0], - Opnds.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Opnds); } } @@ -6074,8 +6041,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { AddToWorkList(NV.getNode()); Opnds.push_back(NV); } - return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, - &Opnds[0], Opnds.size()); + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds); } } @@ -6313,8 +6279,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { DstEltVT, Op)); AddToWorkList(Ops.back().getNode()); } - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, - &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, Ops); } // Otherwise, we're growing or shrinking the elements. 
To avoid having to @@ -6370,8 +6335,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { } EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size()); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, - &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, Ops); } // Finally, this must be the case where we are shrinking elements: each input @@ -6407,8 +6371,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { std::reverse(Ops.end()-NumOutputsPerInput, Ops.end()); } - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, - &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, Ops); } SDValue DAGCombiner::visitFADD(SDNode *N) { @@ -7006,7 +6969,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { { N0.getOperand(0), N0.getOperand(1), DAG.getConstantFP(-1.0, VT) , DAG.getConstantFP(0.0, VT), N0.getOperand(2) }; - return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, Ops, 5); + return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, Ops); } // fold (sint_to_fp (zext (setcc x, y, cc))) -> @@ -7019,7 +6982,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1), DAG.getConstantFP(1.0, VT) , DAG.getConstantFP(0.0, VT), N0.getOperand(0).getOperand(2) }; - return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, Ops, 5); + return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, Ops); } } @@ -7063,7 +7026,7 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { { N0.getOperand(0), N0.getOperand(1), DAG.getConstantFP(1.0, VT), DAG.getConstantFP(0.0, VT), N0.getOperand(2) }; - return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, Ops, 5); + return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, Ops); } } @@ -7223,11 +7186,16 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { // (fneg (fmul c, x)) -> (fmul -c, x) if (N0.getOpcode() == ISD::FMUL) { ConstantFPSDNode *CFP1 = dyn_cast(N0.getOperand(1)); - if (CFP1) - return DAG.getNode(ISD::FMUL, SDLoc(N), VT, - N0.getOperand(0), - DAG.getNode(ISD::FNEG, SDLoc(N), VT, - N0.getOperand(1))); + if (CFP1) { + APFloat CVal = CFP1->getValueAPF(); + CVal.changeSign(); + if (Level >= AfterLegalizeDAG && + (TLI.isFPImmLegal(CVal, N->getValueType(0)) || + TLI.isOperationLegal(ISD::ConstantFP, N->getValueType(0)))) + return DAG.getNode( + ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), + DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1))); + } } return SDValue(); @@ -7335,7 +7303,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) { ((N1.getOpcode() == ISD::TRUNCATE && N1.hasOneUse()) && (N1.getOperand(0).hasOneUse() && N1.getOperand(0).getOpcode() == ISD::SRL))) { - SDNode *Trunc = 0; + SDNode *Trunc = nullptr; if (N1.getOpcode() == ISD::TRUNCATE) { // Look pass the truncate. Trunc = N1.getNode(); @@ -7616,9 +7584,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { // a copy of the original base pointer. 
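The tightened fneg(fmul c, x) fold only fires late if the negated constant stays legal on the target; numerically the two forms agree, which a standalone check makes concrete (the boolean parameter stands in for the isFPImmLegal/isOperationLegal guard):

  #include <cassert>

  double fnegOfMul(double C, double X, bool NegConstIsLegal) {
    // After legalization, only push the negation into the constant when
    // the target could materialize -C.
    if (NegConstIsLegal)
      return (-C) * X;
    return -(C * X);       // otherwise keep the explicit fneg
  }

  int main() {
    assert(fnegOfMul(2.5, 4.0, true) == fnegOfMul(2.5, 4.0, false));
    return 0;
  }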
SmallVector OtherUses; if (isa(Offset)) - for (SDNode::use_iterator I = BasePtr.getNode()->use_begin(), - E = BasePtr.getNode()->use_end(); I != E; ++I) { - SDNode *Use = *I; + for (SDNode *Use : BasePtr.getNode()->uses()) { if (Use == Ptr.getNode()) continue; @@ -7660,9 +7626,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { SmallPtrSet Visited; SmallVector Worklist; - for (SDNode::use_iterator I = Ptr.getNode()->use_begin(), - E = Ptr.getNode()->use_end(); I != E; ++I) { - SDNode *Use = *I; + for (SDNode *Use : Ptr.getNode()->uses()) { if (Use == N) continue; if (N->hasPredecessorHelper(Use, Visited, Worklist)) @@ -7798,9 +7762,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { if (Ptr.getNode()->hasOneUse()) return false; - for (SDNode::use_iterator I = Ptr.getNode()->use_begin(), - E = Ptr.getNode()->use_end(); I != E; ++I) { - SDNode *Op = *I; + for (SDNode *Op : Ptr.getNode()->uses()) { if (Op == N || (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)) continue; @@ -7826,9 +7788,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { // Check for #1. bool TryNext = false; - for (SDNode::use_iterator II = BasePtr.getNode()->use_begin(), - EE = BasePtr.getNode()->use_end(); II != EE; ++II) { - SDNode *Use = *II; + for (SDNode *Use : BasePtr.getNode()->uses()) { if (Use == Ptr.getNode()) continue; @@ -7836,9 +7796,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { // transformation. if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){ bool RealUse = false; - for (SDNode::use_iterator III = Use->use_begin(), - EEE = Use->use_end(); III != EEE; ++III) { - SDNode *UseUse = *III; + for (SDNode *UseUse : Use->uses()) { if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI)) RealUse = true; } @@ -7891,6 +7849,17 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { return false; } +/// \brief Return the base-pointer arithmetic from an indexed \p LD. +SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) { + ISD::MemIndexedMode AM = LD->getAddressingMode(); + assert(AM != ISD::UNINDEXED); + SDValue BP = LD->getOperand(1); + SDValue Inc = LD->getOperand(2); + unsigned Opc = + (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB); + return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc); +} + SDValue DAGCombiner::visitLOAD(SDNode *N) { LoadSDNode *LD = cast(N); SDValue Chain = LD->getChain(); @@ -7927,8 +7896,16 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { } else { // Indexed loads. assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?"); - if (!N->hasAnyUseOfValue(0) && !N->hasAnyUseOfValue(1)) { + if (!N->hasAnyUseOfValue(0)) { SDValue Undef = DAG.getUNDEF(N->getValueType(0)); + SDValue Index; + if (N->hasAnyUseOfValue(1)) { + Index = SplitIndexingFromLoad(LD); + // Try to fold the base pointer arithmetic into subsequent loads and + // stores. 
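SplitIndexingFromLoad below is new: when only the pointer result of an indexed load is live, visitLOAD now recomputes that pointer as explicit base plus/minus increment instead of keeping the dead load. The recomputation in isolation (enum and types are stand-ins for ISD::MemIndexedMode and SDValue):

  #include <cstdint>

  enum class IndexMode { PreInc, PostInc, PreDec, PostDec };

  // Mirrors the added helper: pre/post-increment modes become an ADD of
  // base and increment, the decrement modes become a SUB.
  intptr_t splitIndexing(intptr_t Base, intptr_t Inc, IndexMode AM) {
    bool IsInc = (AM == IndexMode::PreInc || AM == IndexMode::PostInc);
    return IsInc ? Base + Inc : Base - Inc;
  }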
+ AddUsersToWorkList(N); + } else + Index = DAG.getUNDEF(N->getValueType(1)); DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG); dbgs() << "\nWith: "; @@ -7936,8 +7913,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { dbgs() << " and 2 other values\n"); WorkListRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef); - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), - DAG.getUNDEF(N->getValueType(1))); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain); removeFromWorkList(N); DAG.DeleteNode(N); @@ -8131,8 +8107,8 @@ struct LoadedSlice { // This is used to get some contextual information about legal types, etc. SelectionDAG *DAG; - LoadedSlice(SDNode *Inst = NULL, LoadSDNode *Origin = NULL, - unsigned Shift = 0, SelectionDAG *DAG = NULL) + LoadedSlice(SDNode *Inst = nullptr, LoadSDNode *Origin = nullptr, + unsigned Shift = 0, SelectionDAG *DAG = nullptr) : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {} LoadedSlice(const LoadedSlice &LS) @@ -8228,7 +8204,7 @@ struct LoadedSlice { /// \brief Get the offset in bytes of this slice in the original chunk of /// bits. - /// \pre DAG != NULL. + /// \pre DAG != nullptr. uint64_t getOffsetFromBase() const { assert(DAG && "Missing context."); bool IsBigEndian = @@ -8384,8 +8360,8 @@ static void adjustCostForPairing(SmallVectorImpl &LoadedSlices, const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo(); // First (resp. Second) is the first (resp. Second) potentially candidate // to be placed in a paired load. - const LoadedSlice *First = NULL; - const LoadedSlice *Second = NULL; + const LoadedSlice *First = nullptr; + const LoadedSlice *Second = nullptr; for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice, // Set the beginning of the pair. First = Second) { @@ -8407,7 +8383,7 @@ static void adjustCostForPairing(SmallVectorImpl &LoadedSlices, unsigned RequiredAlignment = 0; if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) { // move to the next pair, this type is hopeless. - Second = NULL; + Second = nullptr; continue; } // Check if we meet the alignment requirement. @@ -8421,7 +8397,7 @@ static void adjustCostForPairing(SmallVectorImpl &LoadedSlices, assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!"); --GlobalLSCost.Loads; // Move to the next pair. - Second = NULL; + Second = nullptr; } } @@ -8565,7 +8541,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) { } SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, - &ArgChains[0], ArgChains.size()); + ArgChains); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); return true; } @@ -8660,14 +8636,14 @@ ShrinkLoadReplaceStoreWithStore(const std::pair &MaskInfo, // that uses this. If not, this is not a replacement. APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(), ByteShift*8, (ByteShift+NumBytes)*8); - if (!DAG.MaskedValueIsZero(IVal, Mask)) return 0; + if (!DAG.MaskedValueIsZero(IVal, Mask)) return nullptr; // Check that it is legal on the target to do this. It is legal if the new // VT we're shrinking to (i8/i16/i32) is legal or we're still before type // legalization. MVT VT = MVT::getIntegerVT(NumBytes*8); if (!DC->isTypeLegal(VT)) - return 0; + return nullptr; // Okay, we can do this! Replace the 'St' store with a store of IVal that is // shifted by ByteShift and truncated down to NumBytes. 
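ShrinkLoadReplaceStoreWithStore's first test (now returning nullptr on failure) is a pure bit question: do any bits outside the bytes being rewritten change? As a standalone predicate over a 32-bit value (parameter names are ours):

  #include <cassert>
  #include <cstdint>

  // True when every modified bit lies inside the NumBytes-wide window
  // starting at byte ByteShift -- the MaskedValueIsZero condition.
  bool changesStayInWindow(uint32_t ChangedBits, unsigned ByteShift,
                           unsigned NumBytes) {
    uint32_t Window = (NumBytes >= 4) ? 0xFFFFFFFFu
                                      : (((1u << (NumBytes * 8)) - 1)
                                         << (ByteShift * 8));
    return (ChangedBits & ~Window) == 0;
  }

  int main() {
    assert(changesStayInWindow(0x0000FF00u, 1, 1));    // byte 1 only
    assert(!changesStayInWindow(0x00010000u, 0, 2));   // touches byte 2
    return 0;
  }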
@@ -9081,7 +9057,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { break; } else if (LoadSDNode *Ldn = dyn_cast(NextInChain)) { if (Ldn->isVolatile()) { - Index = NULL; + Index = nullptr; break; } @@ -9090,7 +9066,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { NextInChain = Ldn->getChain().getNode(); continue; } else { - Index = NULL; + Index = nullptr; break; } } @@ -9719,8 +9695,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { } // Return the new vector - return DAG.getNode(ISD::BUILD_VECTOR, dl, - VT, &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { @@ -9826,8 +9801,8 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { NewLoad = true; } - LoadSDNode *LN0 = NULL; - const ShuffleVectorSDNode *SVN = NULL; + LoadSDNode *LN0 = nullptr; + const ShuffleVectorSDNode *SVN = nullptr; if (ISD::isNormalLoad(InVec.getNode())) { LN0 = cast(InVec); } else if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR && @@ -10052,7 +10027,7 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) { if (!isTypeLegal(VecVT)) return SDValue(); // Make the new BUILD_VECTOR. - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], Ops.size()); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); // The new BUILD_VECTOR node has the potential to be further optimized. AddToWorkList(BV.getNode()); @@ -10120,8 +10095,7 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) { else Opnds.push_back(In.getOperand(0)); } - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, - &Opnds[0], Opnds.size()); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Opnds); AddToWorkList(BV.getNode()); return DAG.getNode(Opcode, dl, VT, BV); @@ -10162,7 +10136,7 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { // constant index, bail out. if (N->getOperand(i).getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(N->getOperand(i).getOperand(1))) { - VecIn1 = VecIn2 = SDValue(0, 0); + VecIn1 = VecIn2 = SDValue(nullptr, 0); break; } @@ -10171,18 +10145,18 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { if (ExtractedFromVec == VecIn1 || ExtractedFromVec == VecIn2) continue; - if (VecIn1.getNode() == 0) { + if (!VecIn1.getNode()) { VecIn1 = ExtractedFromVec; - } else if (VecIn2.getNode() == 0) { + } else if (!VecIn2.getNode()) { VecIn2 = ExtractedFromVec; } else { // Too many inputs. - VecIn1 = VecIn2 = SDValue(0, 0); + VecIn1 = VecIn2 = SDValue(nullptr, 0); break; } } - // If everything is good, we can make a shuffle operation. + // If everything is good, we can make a shuffle operation. if (VecIn1.getNode()) { SmallVector Mask; for (unsigned i = 0; i != NumInScalars; ++i) { @@ -10212,7 +10186,7 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { // Attempt to transform a single input vector to the correct type. if ((VT != VecIn1.getValueType())) { // We don't support shuffeling between TWO values of different types. 
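visitBUILD_VECTOR's two-input rule shows up in the VecIn1/VecIn2 bookkeeping below: if every element is an extract from at most two source vectors, emit a shuffle, otherwise bail. The mask construction in miniature (the (vector, lane) descriptors are our own stand-in):

  #include <utility>
  #include <vector>

  // Each element is (source vector: 0 or 1, lane). Build a shuffle mask in
  // concatenated-input numbering, or fail when a third source shows up.
  bool buildShuffleMask(const std::vector<std::pair<int, int>> &Elems,
                        int NumElts, std::vector<int> &Mask) {
    for (const auto &E : Elems) {
      if (E.first != 0 && E.first != 1)
        return false;                       // too many inputs
      Mask.push_back(E.first == 0 ? E.second : E.second + NumElts);
    }
    return true;
  }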
- if (VecIn2.getNode() != 0) + if (VecIn2.getNode()) return SDValue(); // We only support widening of vectors which are half the size of the @@ -10311,8 +10285,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { for (unsigned i = 0; i != BuildVecNumElts; ++i) Opnds.push_back(N1.getOperand(i)); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, &Opnds[0], - Opnds.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Opnds); } // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR @@ -10469,8 +10442,7 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { } } - return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops.data(), - Ops.size()); + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); } SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { @@ -10685,8 +10657,7 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { EVT EltVT = RVT.getVectorElementType(); SmallVector ZeroOps(RVT.getVectorNumElements(), DAG.getConstant(0, EltVT)); - SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), - RVT, &ZeroOps[0], ZeroOps.size()); + SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), RVT, ZeroOps); LHS = DAG.getNode(ISD::BITCAST, dl, RVT, LHS); SDValue Shuf = DAG.getVectorShuffle(RVT, dl, LHS, Zero, &Indices[0]); return DAG.getNode(ISD::BITCAST, dl, VT, Shuf); @@ -10755,8 +10726,7 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { } if (Ops.size() == LHS.getNumOperands()) - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), - LHS.getValueType(), &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), LHS.getValueType(), Ops); } return SDValue(); @@ -10791,8 +10761,7 @@ SDValue DAGCombiner::SimplifyVUnaryOp(SDNode *N) { if (Ops.size() != N0.getNumOperands()) return SDValue(); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), - N0.getValueType(), &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N0.getValueType(), Ops); } SDValue DAGCombiner::SimplifySelect(SDLoc DL, SDValue N0, @@ -10994,7 +10963,9 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, if (ConstantFPSDNode *FV = dyn_cast(N3)) { if (TLI.isTypeLegal(N2.getValueType()) && (TLI.getOperationAction(ISD::ConstantFP, N2.getValueType()) != - TargetLowering::Legal) && + TargetLowering::Legal && + !TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0)) && + !TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0))) && // If both constants have multiple uses, then we won't need to do an // extra load, they are likely around in registers for other users. (TV->hasOneUse() || FV->hasOneUse())) { @@ -11201,7 +11172,7 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, // select_cc setlt X, 1, -X, X -> // Y = sra (X, size(X)-1); xor (add (X, Y), Y) if (N1C) { - ConstantSDNode *SubC = NULL; + ConstantSDNode *SubC = nullptr; if (((N1C->isNullValue() && (CC == ISD::SETGT || CC == ISD::SETGE)) || (N1C->isAllOnesValue() && CC == ISD::SETGT)) && N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1)) @@ -11242,26 +11213,42 @@ SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, /// multiplying by a magic number. See: /// SDValue DAGCombiner::BuildSDIV(SDNode *N) { + ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); + if (!C) + return SDValue(); + + // Avoid division by zero. 
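The select_cc fold quoted above ("Y = sra (X, size(X)-1); xor (add (X, Y), Y)") is the classic branchless abs; a runnable check (assumes arithmetic right shift on signed ints):

  #include <cassert>
  #include <cstdint>

  int32_t absViaSra(int32_t X) {
    int32_t Y = X >> 31;        // all-ones if negative, else zero
    return (X + Y) ^ Y;         // two's-complement negate when Y == -1
  }

  int main() {
    assert(absViaSra(-5) == 5 && absViaSra(7) == 7 && absViaSra(0) == 0);
    return 0;
  }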
+ if (!C->getAPIntValue()) + return SDValue(); + std::vector Built; - SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, &Built); + SDValue S = + TLI.BuildSDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built); - for (std::vector::iterator ii = Built.begin(), ee = Built.end(); - ii != ee; ++ii) - AddToWorkList(*ii); + for (SDNode *N : Built) + AddToWorkList(N); return S; } -/// BuildUDIVSequence - Given an ISD::UDIV node expressing a divide by constant, +/// BuildUDIV - Given an ISD::UDIV node expressing a divide by constant, /// return a DAG expression to select that will generate the same value by /// multiplying by a magic number. See: /// SDValue DAGCombiner::BuildUDIV(SDNode *N) { + ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); + if (!C) + return SDValue(); + + // Avoid division by zero. + if (!C->getAPIntValue()) + return SDValue(); + std::vector Built; - SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, &Built); + SDValue S = + TLI.BuildUDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built); - for (std::vector::iterator ii = Built.begin(), ee = Built.end(); - ii != ee; ++ii) - AddToWorkList(*ii); + for (SDNode *N : Built) + AddToWorkList(N); return S; } @@ -11271,7 +11258,7 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) { static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset, const GlobalValue *&GV, const void *&CV) { // Assume it is a primitive operation. - Base = Ptr; Offset = 0; GV = 0; CV = 0; + Base = Ptr; Offset = 0; GV = nullptr; CV = nullptr; // If it's adding a simple constant then integrate the offset. if (Base.getOpcode() == ISD::ADD) { @@ -11305,31 +11292,27 @@ static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset, /// isAlias - Return true if there is any possibility that the two addresses /// overlap. -bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1, bool IsVolatile1, - const Value *SrcValue1, int SrcValueOffset1, - unsigned SrcValueAlign1, - const MDNode *TBAAInfo1, - SDValue Ptr2, int64_t Size2, bool IsVolatile2, - const Value *SrcValue2, int SrcValueOffset2, - unsigned SrcValueAlign2, - const MDNode *TBAAInfo2) const { +bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { // If they are the same then they must be aliases. - if (Ptr1 == Ptr2) return true; + if (Op0->getBasePtr() == Op1->getBasePtr()) return true; // If they are both volatile then they cannot be reordered. - if (IsVolatile1 && IsVolatile2) return true; + if (Op0->isVolatile() && Op1->isVolatile()) return true; // Gather base node and offset information. SDValue Base1, Base2; int64_t Offset1, Offset2; const GlobalValue *GV1, *GV2; const void *CV1, *CV2; - bool isFrameIndex1 = FindBaseOffset(Ptr1, Base1, Offset1, GV1, CV1); - bool isFrameIndex2 = FindBaseOffset(Ptr2, Base2, Offset2, GV2, CV2); + bool isFrameIndex1 = FindBaseOffset(Op0->getBasePtr(), + Base1, Offset1, GV1, CV1); + bool isFrameIndex2 = FindBaseOffset(Op1->getBasePtr(), + Base2, Offset2, GV2, CV2); // If they have the same base address then check to see if they overlap. if (Base1 == Base2 || (GV1 && (GV1 == GV2)) || (CV1 && (CV1 == CV2))) - return !((Offset1 + Size1) <= Offset2 || (Offset2 + Size2) <= Offset1); + return !((Offset1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= Offset2 || + (Offset2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= Offset1); // It is possible for different frame indices to alias each other, mostly // when tail call optimization reuses return address slots for arguments.
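// What TLI.BuildSDIV ultimately emits for the (now guaranteed constant,
// non-zero) divisor in the hunks above: a multiply by a precomputed magic
// number plus cheap fixups instead of an sdiv. A worked standalone example
// for division by 3; 0x55555556 is the classic Hacker's Delight magic
// constant, and the >> 32 assumes the usual arithmetic right shift.
#include <cstdint>

int32_t div3(int32_t N) {
  int64_t Prod = INT64_C(0x55555556) * N; // multiply by the magic number
  int32_t Q = (int32_t)(Prod >> 32);      // keep the high 32 bits
  Q += (uint32_t)N >> 31;                 // +1 for negative N rounds toward zero
  return Q;                               // == N / 3 for every int32_t N
}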
@@ -11339,7 +11322,8 @@ bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1, bool IsVolatile1, MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); Offset1 += MFI->getObjectOffset(cast(Base1)->getIndex()); Offset2 += MFI->getObjectOffset(cast(Base2)->getIndex()); - return !((Offset1 + Size1) <= Offset2 || (Offset2 + Size2) <= Offset1); + return !((Offset1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= Offset2 || + (Offset2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= Offset1); } // Otherwise, if we know what the bases are, and they aren't identical, then @@ -11351,15 +11335,18 @@ bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1, bool IsVolatile1, // compared to the size and offset of the access, we may be able to prove they // do not alias. This check is conservative for now to catch cases created by // splitting vector types. - if ((SrcValueAlign1 == SrcValueAlign2) && - (SrcValueOffset1 != SrcValueOffset2) && - (Size1 == Size2) && (SrcValueAlign1 > Size1)) { - int64_t OffAlign1 = SrcValueOffset1 % SrcValueAlign1; - int64_t OffAlign2 = SrcValueOffset2 % SrcValueAlign1; + if ((Op0->getOriginalAlignment() == Op1->getOriginalAlignment()) && + (Op0->getSrcValueOffset() != Op1->getSrcValueOffset()) && + (Op0->getMemoryVT().getSizeInBits() >> 3 == + Op1->getMemoryVT().getSizeInBits() >> 3) && + (Op0->getOriginalAlignment() > Op0->getMemoryVT().getSizeInBits()) >> 3) { + int64_t OffAlign1 = Op0->getSrcValueOffset() % Op0->getOriginalAlignment(); + int64_t OffAlign2 = Op1->getSrcValueOffset() % Op1->getOriginalAlignment(); // There is no overlap between these relatively aligned accesses of similar // size, return no alias. - if ((OffAlign1 + Size1) <= OffAlign2 || (OffAlign2 + Size2) <= OffAlign1) + if ((OffAlign1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= OffAlign2 || + (OffAlign2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= OffAlign1) return false; } @@ -11370,16 +11357,22 @@ bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1, bool IsVolatile1, CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) UseAA = false; #endif - if (UseAA && SrcValue1 && SrcValue2) { + if (UseAA && + Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) { // Use alias analysis information. - int64_t MinOffset = std::min(SrcValueOffset1, SrcValueOffset2); - int64_t Overlap1 = Size1 + SrcValueOffset1 - MinOffset; - int64_t Overlap2 = Size2 + SrcValueOffset2 - MinOffset; + int64_t MinOffset = std::min(Op0->getSrcValueOffset(), + Op1->getSrcValueOffset()); + int64_t Overlap1 = (Op0->getMemoryVT().getSizeInBits() >> 3) + + Op0->getSrcValueOffset() - MinOffset; + int64_t Overlap2 = (Op1->getMemoryVT().getSizeInBits() >> 3) + + Op1->getSrcValueOffset() - MinOffset; AliasAnalysis::AliasResult AAResult = - AA.alias(AliasAnalysis::Location(SrcValue1, Overlap1, - UseTBAA ? TBAAInfo1 : 0), - AliasAnalysis::Location(SrcValue2, Overlap2, - UseTBAA ? TBAAInfo2 : 0)); + AA.alias(AliasAnalysis::Location(Op0->getMemOperand()->getValue(), + Overlap1, + UseTBAA ? Op0->getTBAAInfo() : nullptr), + AliasAnalysis::Location(Op1->getMemOperand()->getValue(), + Overlap2, + UseTBAA ? 
Op1->getTBAAInfo() : nullptr)); if (AAResult == AliasAnalysis::NoAlias) return false; } @@ -11388,44 +11381,6 @@ bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1, bool IsVolatile1, return true; } -bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) { - SDValue Ptr0, Ptr1; - int64_t Size0, Size1; - bool IsVolatile0, IsVolatile1; - const Value *SrcValue0, *SrcValue1; - int SrcValueOffset0, SrcValueOffset1; - unsigned SrcValueAlign0, SrcValueAlign1; - const MDNode *SrcTBAAInfo0, *SrcTBAAInfo1; - FindAliasInfo(Op0, Ptr0, Size0, IsVolatile0, SrcValue0, SrcValueOffset0, - SrcValueAlign0, SrcTBAAInfo0); - FindAliasInfo(Op1, Ptr1, Size1, IsVolatile1, SrcValue1, SrcValueOffset1, - SrcValueAlign1, SrcTBAAInfo1); - return isAlias(Ptr0, Size0, IsVolatile0, SrcValue0, SrcValueOffset0, - SrcValueAlign0, SrcTBAAInfo0, - Ptr1, Size1, IsVolatile1, SrcValue1, SrcValueOffset1, - SrcValueAlign1, SrcTBAAInfo1); -} - -/// FindAliasInfo - Extracts the relevant alias information from the memory -/// node. Returns true if the operand was a nonvolatile load. -bool DAGCombiner::FindAliasInfo(SDNode *N, - SDValue &Ptr, int64_t &Size, bool &IsVolatile, - const Value *&SrcValue, - int &SrcValueOffset, - unsigned &SrcValueAlign, - const MDNode *&TBAAInfo) const { - LSBaseSDNode *LS = cast(N); - - Ptr = LS->getBasePtr(); - Size = LS->getMemoryVT().getSizeInBits() >> 3; - IsVolatile = LS->isVolatile(); - SrcValue = LS->getSrcValue(); - SrcValueOffset = LS->getSrcValueOffset(); - SrcValueAlign = LS->getOriginalAlignment(); - TBAAInfo = LS->getTBAAInfo(); - return isa(LS) && !IsVolatile; -} - /// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes, /// looking for aliasing nodes and adding them to the Aliases vector. void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, @@ -11434,15 +11389,7 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, SmallPtrSet Visited; // Visited node set. // Get alias information for node. - SDValue Ptr; - int64_t Size; - bool IsVolatile; - const Value *SrcValue; - int SrcValueOffset; - unsigned SrcValueAlign; - const MDNode *SrcTBAAInfo; - bool IsLoad = FindAliasInfo(N, Ptr, Size, IsVolatile, SrcValue, - SrcValueOffset, SrcValueAlign, SrcTBAAInfo); + bool IsLoad = isa(N) && !cast(N)->isVolatile(); // Starting off. Chains.push_back(OriginalChain); @@ -11481,24 +11428,12 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, case ISD::LOAD: case ISD::STORE: { // Get alias information for Chain. - SDValue OpPtr; - int64_t OpSize; - bool OpIsVolatile; - const Value *OpSrcValue; - int OpSrcValueOffset; - unsigned OpSrcValueAlign; - const MDNode *OpSrcTBAAInfo; - bool IsOpLoad = FindAliasInfo(Chain.getNode(), OpPtr, OpSize, - OpIsVolatile, OpSrcValue, OpSrcValueOffset, - OpSrcValueAlign, - OpSrcTBAAInfo); + bool IsOpLoad = isa(Chain.getNode()) && + !cast(Chain.getNode())->isVolatile(); // If chain is alias then stop here. if (!(IsLoad && IsOpLoad) && - isAlias(Ptr, Size, IsVolatile, SrcValue, SrcValueOffset, - SrcValueAlign, SrcTBAAInfo, - OpPtr, OpSize, OpIsVolatile, OpSrcValue, OpSrcValueOffset, - OpSrcValueAlign, OpSrcTBAAInfo)) { + isAlias(cast(N), cast(Chain.getNode()))) { Aliases.push_back(Chain); } else { // Look further up the chain. @@ -11604,8 +11539,7 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { return Aliases[0]; // Construct a custom tailored token factor. 
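// The overlap predicate used throughout the rewritten isAlias above: once the
// accesses share a base, the byte ranges [Off, Off + Size) are disjoint
// exactly when one ends at or before the other starts. Size here is
// getMemoryVT().getSizeInBits() >> 3, the access width in bytes.
#include <cstdint>

bool rangesOverlap(int64_t Off1, int64_t Size1, int64_t Off2, int64_t Size2) {
  return !((Off1 + Size1) <= Off2 || (Off2 + Size2) <= Off1);
}
// An 8-byte store at offset 0 overlaps a 4-byte load at offset 4,
// but not one at offset 8.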
- return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, - &Aliases[0], Aliases.size()); + return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases); } // SelectionDAG::Combine - This is the entry point for the file. diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index baba51e..99931c1 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -39,7 +39,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "isel" #include "llvm/CodeGen/FastISel.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/Statistic.h" @@ -64,6 +63,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "isel" + STATISTIC(NumFastIselSuccessIndependent, "Number of insts selected by " "target-independent selector"); STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by " @@ -79,7 +80,7 @@ void FastISel::startNewBlock() { // Instructions are appended to FuncInfo.MBB. If the basic block already // contains labels or copies, use the last instruction as the last local // value. - EmitStartPt = 0; + EmitStartPt = nullptr; if (!FuncInfo.MBB->empty()) EmitStartPt = &FuncInfo.MBB->back(); LastLocalValue = EmitStartPt; @@ -826,15 +827,21 @@ FastISel::SelectInstruction(const Instruction *I) { MachineBasicBlock::iterator SavedInsertPt = FuncInfo.InsertPt; - // As a special case, don't handle calls to builtin library functions that - // may be translated directly to target instructions. if (const CallInst *Call = dyn_cast(I)) { const Function *F = Call->getCalledFunction(); LibFunc::Func Func; + + // As a special case, don't handle calls to builtin library functions that + // may be translated directly to target instructions. if (F && !F->hasLocalLinkage() && F->hasName() && LibInfo->getLibFunc(F->getName(), Func) && LibInfo->hasOptimizedCodeGen(Func)) return false; + + // Don't handle Intrinsic::trap if a trap function is specified. + if (F && F->getIntrinsicID() == Intrinsic::trap && + !TM.Options.getTrapFunctionName().empty()) + return false; } // First, try doing target-independent selection. @@ -880,7 +887,7 @@ FastISel::FastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DbgLoc) { // fall-through case, which needs no instructions. } else { // The unconditional branch case. - TII.InsertBranch(*FuncInfo.MBB, MSucc, NULL, + TII.InsertBranch(*FuncInfo.MBB, MSucc, nullptr, SmallVector(), DbgLoc); } FuncInfo.MBB->addSuccessor(MSucc); @@ -1035,8 +1042,10 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) { } case Instruction::Unreachable: - // Nothing to emit. - return true; + if (TM.Options.TrapUnreachable) + return FastEmit_(MVT::Other, MVT::Other, ISD::TRAP) != 0; + else + return true; case Instruction::Alloca: // FunctionLowering has the static-sized case covered. @@ -1204,6 +1213,23 @@ unsigned FastISel::createResultReg(const TargetRegisterClass* RC) { return MRI.createVirtualRegister(RC); } +unsigned FastISel::constrainOperandRegClass(const MCInstrDesc &II, + unsigned Op, unsigned OpNum) { + if (TargetRegisterInfo::isVirtualRegister(Op)) { + const TargetRegisterClass *RegClass = + TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF); + if (!MRI.constrainRegClass(Op, RegClass)) { + // If it's not legal to COPY between the register classes, something + // has gone very wrong before we got here.
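// A standalone analogue of the constrainOperandRegClass() helper whose
// definition begins just above: try to narrow a virtual register's class to
// what the instruction description demands, and fall back to copying into a
// fresh register of the required class. Register classes are modeled as plain
// bitmasks here purely for the sketch; LLVM's TargetRegisterClass machinery
// is far richer.
#include <cstdint>
#include <cstdio>
#include <vector>

using RegClassMask = uint32_t;

struct VirtRegFile {
  std::vector<RegClassMask> Classes;
  int createReg(RegClassMask RC) {
    Classes.push_back(RC);
    return (int)Classes.size() - 1;
  }
  bool constrain(int Reg, RegClassMask RC) {
    RegClassMask Merged = Classes[Reg] & RC;
    if (!Merged)
      return false;                    // no common subclass
    Classes[Reg] = Merged;             // narrowed in place
    return true;
  }
};

int constrainOperand(VirtRegFile &VRF, int Op, RegClassMask Required) {
  if (VRF.constrain(Op, Required))
    return Op;                         // the operand's class could be narrowed
  int NewOp = VRF.createReg(Required); // otherwise emit: COPY NewOp <- Op
  std::printf("COPY r%d <- r%d\n", NewOp, Op);
  return NewOp;
}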
+ unsigned NewOp = createResultReg(RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), NewOp).addReg(Op); + return NewOp; + } + } + return Op; +} + unsigned FastISel::FastEmitInst_(unsigned MachineInstOpcode, const TargetRegisterClass* RC) { unsigned ResultReg = createResultReg(RC); @@ -1216,9 +1242,11 @@ unsigned FastISel::FastEmitInst_(unsigned MachineInstOpcode, unsigned FastISel::FastEmitInst_r(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill) { - unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); + unsigned ResultReg = createResultReg(RC); + Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); + if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill); @@ -1236,9 +1264,12 @@ unsigned FastISel::FastEmitInst_rr(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, unsigned Op1, bool Op1IsKill) { - unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); + unsigned ResultReg = createResultReg(RC); + Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); + Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1); + if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) @@ -1258,9 +1289,13 @@ unsigned FastISel::FastEmitInst_rrr(unsigned MachineInstOpcode, unsigned Op0, bool Op0IsKill, unsigned Op1, bool Op1IsKill, unsigned Op2, bool Op2IsKill) { - unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); + unsigned ResultReg = createResultReg(RC); + Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); + Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1); + Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2); + if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) @@ -1281,9 +1316,12 @@ unsigned FastISel::FastEmitInst_ri(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, uint64_t Imm) { - unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); + unsigned ResultReg = createResultReg(RC); + RC = TII.getRegClass(II, II.getNumDefs(), &TRI, *FuncInfo.MF); + MRI.constrainRegClass(Op0, RC); + if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) @@ -1302,9 +1340,11 @@ unsigned FastISel::FastEmitInst_rii(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, uint64_t Imm1, uint64_t Imm2) { - unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); + unsigned ResultReg = createResultReg(RC); + Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); + if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) @@ -1325,9 +1365,11 @@ unsigned FastISel::FastEmitInst_rf(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill, const ConstantFP *FPImm) { - unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); + unsigned ResultReg = createResultReg(RC); + Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); + if (II.getNumDefs() >= 1) 
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) @@ -1347,9 +1389,12 @@ unsigned FastISel::FastEmitInst_rri(unsigned MachineInstOpcode, unsigned Op0, bool Op0IsKill, unsigned Op1, bool Op1IsKill, uint64_t Imm) { - unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); + unsigned ResultReg = createResultReg(RC); + Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); + Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1); + if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) @@ -1371,9 +1416,12 @@ unsigned FastISel::FastEmitInst_rrii(unsigned MachineInstOpcode, unsigned Op0, bool Op0IsKill, unsigned Op1, bool Op1IsKill, uint64_t Imm1, uint64_t Imm2) { - unsigned ResultReg = createResultReg(RC); const MCInstrDesc &II = TII.get(MachineInstOpcode); + unsigned ResultReg = createResultReg(RC); + Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); + Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1); + if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) .addReg(Op0, Op0IsKill * RegState::Kill) diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 5f0006e..ae124e8 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "function-lowering-info" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/Analysis.h" @@ -40,6 +39,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "function-lowering-info" + /// isUsedOutsideOfDefiningBlock - Return true if this instruction is used by /// PHI nodes or outside of the basic block that defines it, or used by a /// switch or atomic instruction, which may expand to multiple basic blocks. @@ -283,11 +284,11 @@ unsigned FunctionLoweringInfo::CreateRegs(Type *Ty) { const FunctionLoweringInfo::LiveOutInfo * FunctionLoweringInfo::GetLiveOutRegInfo(unsigned Reg, unsigned BitWidth) { if (!LiveOutRegInfo.inBounds(Reg)) - return NULL; + return nullptr; LiveOutInfo *LOI = &LiveOutRegInfo[Reg]; if (!LOI->IsValid) - return NULL; + return nullptr; if (BitWidth > LOI->KnownZero.getBitWidth()) { LOI->NumSignBits = 1; diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 1c596b8..7c124b8 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "instr-emitter" #include "InstrEmitter.h" #include "SDNodeDbgValue.h" #include "llvm/ADT/Statistic.h" @@ -31,6 +30,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "instr-emitter" + /// MinRCSize - Smallest register class we allow when constraining virtual /// registers. If satisfying all register class constraints would require /// using a smaller register class, emit a COPY to a new virtual register @@ -99,7 +100,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, // If the node is only used by a CopyToReg and the dest reg is a vreg, use // the CopyToReg'd destination register instead of creating a new vreg. 
bool MatchReg = true; - const TargetRegisterClass *UseRC = NULL; + const TargetRegisterClass *UseRC = nullptr; MVT VT = Node->getSimpleValueType(ResNo); // Stick to the preferred register classes for legal types. @@ -107,9 +108,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, UseRC = TLI->getRegClassFor(VT); if (!IsClone && !IsCloned) - for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end(); - UI != E; ++UI) { - SDNode *User = *UI; + for (SDNode *User : Node->uses()) { bool Match = true; if (User->getOpcode() == ISD::CopyToReg && User->getOperand(2).getNode() == Node && @@ -131,7 +130,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, Match = false; if (User->isMachineOpcode()) { const MCInstrDesc &II = TII->get(User->getMachineOpcode()); - const TargetRegisterClass *RC = 0; + const TargetRegisterClass *RC = nullptr; if (i+II.getNumDefs() < II.getNumOperands()) { RC = TRI->getAllocatableClass( TII->getRegClass(II, i+II.getNumDefs(), TRI, *MF)); @@ -154,7 +153,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, break; } - const TargetRegisterClass *SrcRC = 0, *DstRC = 0; + const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr; SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT); // Figure out the register class to create for the destreg. @@ -242,9 +241,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, } if (!VRBase && !IsClone && !IsCloned) - for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end(); - UI != E; ++UI) { - SDNode *User = *UI; + for (SDNode *User : Node->uses()) { if (User->getOpcode() == ISD::CopyToReg && User->getOperand(2).getNode() == Node && User->getOperand(2).getResNo() == i) { @@ -329,7 +326,7 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB, // shrink VReg's register class within reason. For example, if VReg == GR32 // and II requires a GR32_NOSP, just constrain VReg to GR32_NOSP. if (II) { - const TargetRegisterClass *DstRC = 0; + const TargetRegisterClass *DstRC = nullptr; if (IIOpNum < II->getNumOperands()) DstRC = TRI->getAllocatableClass(TII->getRegClass(*II,IIOpNum,TRI,*MF)); if (DstRC && !MRI->constrainRegClass(VReg, DstRC, MinRCSize)) { @@ -470,9 +467,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, // If the node is only used by a CopyToReg and the dest reg is a vreg, use // the CopyToReg'd destination register instead of creating a new vreg. - for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end(); - UI != E; ++UI) { - SDNode *User = *UI; + for (SDNode *User : Node->uses()) { if (User->getOpcode() == ISD::CopyToReg && User->getOperand(2).getNode() == Node) { unsigned DestReg = cast(User->getOperand(1))->getReg(); @@ -561,10 +556,10 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, const ConstantSDNode *SD = cast(N0); MIB.addImm(SD->getZExtValue()); } else - AddOperand(MIB, N0, 0, 0, VRBaseMap, /*IsDebug=*/false, + AddOperand(MIB, N0, 0, nullptr, VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned); // Add the subregister being inserted - AddOperand(MIB, N1, 0, 0, VRBaseMap, /*IsDebug=*/false, + AddOperand(MIB, N1, 0, nullptr, VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned); MIB.addImm(SubIdx); MBB->insert(InsertPos, MIB); @@ -693,10 +688,13 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD, MIB.addReg(0U); } - if (Offset != 0) // Indirect addressing. + // Indirect addressing is indicated by an Imm as the second parameter.
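// The loop rewrite repeated through these InstrEmitter hunks: once a node
// exposes its users as a range, the use_iterator boilerplate collapses into a
// range-based for. Sketch with stand-in types; SDNode's real uses() returns
// an iterator_range over use_iterator rather than a plain vector.
#include <vector>

struct Node {
  std::vector<Node *> Users;
  const std::vector<Node *> &uses() const { return Users; }
};

int countUsers(const Node &N) {
  int Count = 0;
  for (const Node *User : N.uses())  // was: use_iterator UI = use_begin(), ...
    if (User)
      ++Count;
  return Count;
}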
+ if (SD->isIndirect()) MIB.addImm(Offset); - else + else { + assert(Offset == 0 && "direct value cannot have an offset"); MIB.addReg(0U, RegState::Debug); + } MIB.addMetadata(MDPtr); @@ -738,7 +736,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, const MCInstrDesc &II = TII->get(Opc); unsigned NumResults = CountResults(Node); unsigned NumDefs = II.getNumDefs(); - const uint16_t *ScratchRegs = NULL; + const MCPhysReg *ScratchRegs = nullptr; // Handle STACKMAP and PATCHPOINT specially and then use the generic code. if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) { @@ -756,7 +754,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, unsigned NumImpUses = 0; unsigned NodeOperands = countOperands(Node, II.getNumOperands() - NumDefs, NumImpUses); - bool HasPhysRegOuts = NumResults > NumDefs && II.getImplicitDefs()!=0; + bool HasPhysRegOuts = NumResults > NumDefs && II.getImplicitDefs()!=nullptr; #ifndef NDEBUG unsigned NumMIOperands = NodeOperands + NumResults; if (II.isVariadic()) @@ -982,7 +980,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned, // The addressing mode has been selected, just add all of the // operands to the machine instruction. for (unsigned j = 0; j != NumVals; ++j, ++i) - AddOperand(MIB, Node->getOperand(i), 0, 0, VRBaseMap, + AddOperand(MIB, Node->getOperand(i), 0, nullptr, VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned); // Manually set isTied bits. diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 20afb3d..a59e895 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -387,9 +387,7 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG, MinAlign(ST->getAlignment(), Offset), ST->getTBAAInfo())); // The order of the stores doesn't matter - say it with a TokenFactor. - SDValue Result = - DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0], - Stores.size()); + SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); DAGLegalize->ReplaceNode(SDValue(ST, 0), Result); return; } @@ -506,8 +504,7 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, false, false, 0)); // The order of the stores doesn't matter - say it with a TokenFactor. - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0], - Stores.size()); + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); // Finally, perform the original load only redirected to the stack slot. Load = DAG.getExtLoad(LD->getExtensionType(), dl, VT, TF, StackBase, @@ -705,7 +702,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { } } } - return SDValue(0, 0); + return SDValue(nullptr, 0); } void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { @@ -1268,6 +1265,13 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { if (Action == TargetLowering::Legal) Action = TargetLowering::Custom; break; + case ISD::READ_REGISTER: + case ISD::WRITE_REGISTER: + // Named registers are legal in the DAG, but blocked by register name + // selection if not implemented by the target (to choose the correct register). + // They'll be converted to Copy(To/From)Reg. + Action = TargetLowering::Legal; + break; case ISD::DEBUGTRAP: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); if (Action == TargetLowering::Expand) { @@ -1528,8 +1532,7 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) { SDValue StoreChain; if (!Stores.empty()) // Not all undef elements?
- StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &Stores[0], Stores.size()); + StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); else StoreChain = DAG.getEntryNode(); @@ -1649,8 +1652,8 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node, /// If the SETCC has been legalized using the inverse condcode, then LHS and /// RHS will be unchanged, CC will set to the inverted condcode, and NeedInvert /// will be set to true. The caller must invert the result of the SETCC with -/// SelectionDAG::getNOT() or take equivalent action to swap the effect of a -/// true/false result. +/// SelectionDAG::getLogicalNOT() or take equivalent action to swap the effect +/// of a true/false result. /// /// \returns true if the SetCC has been legalized, false if it hasn't. bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, @@ -2055,13 +2058,12 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, if (isTailCall) InChain = TCChain; - TargetLowering:: - CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false, - 0, TLI.getLibcallCallingConv(LC), isTailCall, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Callee, Args, DAG, SDLoc(Node)); - std::pair CallInfo = TLI.LowerCallTo(CLI); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) + .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0) + .setTailCall(isTailCall).setSExtResult(isSigned).setZExtResult(!isSigned); + std::pair CallInfo = TLI.LowerCallTo(CLI); if (!CallInfo.second.getNode()) // It's a tailcall, return the chain (which is the DAG root). @@ -2090,12 +2092,12 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, TLI.getPointerTy()); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); - TargetLowering:: - CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, - false, 0, TLI.getLibcallCallingConv(LC), - /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Callee, Args, DAG, dl); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) + .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0) + .setSExtResult(isSigned).setZExtResult(!isSigned); + std::pair CallInfo = TLI.LowerCallTo(CLI); return CallInfo.first; @@ -2124,11 +2126,12 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC, TLI.getPointerTy()); Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); - TargetLowering:: - CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false, - 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Callee, Args, DAG, SDLoc(Node)); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) + .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0) + .setSExtResult(isSigned).setZExtResult(!isSigned); + std::pair CallInfo = TLI.LowerCallTo(CLI); return CallInfo; @@ -2183,7 +2186,7 @@ static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned, case MVT::i128: LC= isSigned ? 
RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break; } - return TLI.getLibcallName(LC) != 0; + return TLI.getLibcallName(LC) != nullptr; } /// useDivRem - Only issue divrem libcall if both quotient and remainder are @@ -2261,11 +2264,11 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, TLI.getPointerTy()); SDLoc dl(Node); - TargetLowering:: - CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false, - 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Callee, Args, DAG, dl); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(InChain) + .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0) + .setSExtResult(isSigned).setZExtResult(!isSigned); + std::pair CallInfo = TLI.LowerCallTo(CLI); // Remainder is loaded back from the stack frame. @@ -2286,7 +2289,7 @@ static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) { case MVT::f128: LC = RTLIB::SINCOS_F128; break; case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break; } - return TLI.getLibcallName(LC) != 0; + return TLI.getLibcallName(LC) != nullptr; } /// canCombineSinCosLibcall - Return true if sincos libcall is available and @@ -2375,12 +2378,11 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, TLI.getPointerTy()); SDLoc dl(Node); - TargetLowering:: - CallLoweringInfo CLI(InChain, Type::getVoidTy(*DAG.getContext()), - false, false, false, false, - 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Callee, Args, DAG, dl); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(InChain) + .setCallee(TLI.getLibcallCallingConv(LC), + Type::getVoidTy(*DAG.getContext()), Callee, &Args, 0); + std::pair CallInfo = TLI.LowerCallTo(CLI); Results.push_back(DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, @@ -2990,15 +2992,13 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { // If the target didn't lower this, lower it to '__sync_synchronize()' call // FIXME: handle "fence singlethread" more efficiently. 
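// The pattern recurring through these libcall hunks: the long positional
// CallLoweringInfo constructor gives way to chained, named setters. A
// stripped-down analogue of the idiom (the real class carries many more
// fields and setters than this sketch):
struct CallInfo {
  int CallConv = 0;
  bool IsTailCall = false, RetSExt = false, RetZExt = false;
  CallInfo &setCallee(int CC) { CallConv = CC; return *this; }
  CallInfo &setTailCall(bool V = true) { IsTailCall = V; return *this; }
  CallInfo &setSExtResult(bool V = true) { RetSExt = V; return *this; }
  CallInfo &setZExtResult(bool V = true) { RetZExt = V; return *this; }
};

int describe(bool IsSigned) {
  CallInfo CLI;
  CLI.setCallee(1).setTailCall(false)
     .setSExtResult(IsSigned).setZExtResult(!IsSigned);
  return CLI.RetSExt ? 1 : 0;        // unset knobs keep their defaults
}
// Unlike the old constructor, adding a new option no longer disturbs every
// call site, and each argument is self-describing at the point of use.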
TargetLowering::ArgListTy Args; - TargetLowering:: - CallLoweringInfo CLI(Node->getOperand(0), - Type::getVoidTy(*DAG.getContext()), - false, false, false, false, 0, CallingConv::C, - /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - DAG.getExternalSymbol("__sync_synchronize", - TLI.getPointerTy()), - Args, DAG, dl); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Node->getOperand(0)) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("__sync_synchronize", TLI.getPointerTy()), + &Args, 0); + std::pair CallResult = TLI.LowerCallTo(CLI); Results.push_back(CallResult.second); @@ -3071,14 +3071,10 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { case ISD::TRAP: { // If this operation is not supported, lower it to 'abort()' call TargetLowering::ArgListTy Args; - TargetLowering:: - CallLoweringInfo CLI(Node->getOperand(0), - Type::getVoidTy(*DAG.getContext()), - false, false, false, false, 0, CallingConv::C, - /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - DAG.getExternalSymbol("abort", TLI.getPointerTy()), - Args, DAG, dl); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Node->getOperand(0)) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("abort", TLI.getPointerTy()), &Args, 0); std::pair CallResult = TLI.LowerCallTo(CLI); Results.push_back(CallResult.second); @@ -3304,7 +3300,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { TLI.getVectorIdxTy()))); } - Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size()); + Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); // We may have changed the BUILD_VECTOR type. Cast it back to the Node type. Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), Tmp1); Results.push_back(Tmp1); @@ -3625,6 +3621,23 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Node->getOperand(1))); break; } + + SDValue Lo, Hi; + EVT HalfType = VT.getHalfSizedIntegerVT(*DAG.getContext()); + if (TLI.isOperationLegalOrCustom(ISD::ZERO_EXTEND, VT) && + TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND, VT) && + TLI.isOperationLegalOrCustom(ISD::SHL, VT) && + TLI.isOperationLegalOrCustom(ISD::OR, VT) && + TLI.expandMUL(Node, Lo, Hi, HalfType, DAG)) { + Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Lo); + Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Hi); + SDValue Shift = DAG.getConstant(HalfType.getSizeInBits(), + TLI.getShiftAmountTy(HalfType)); + Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift); + Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi)); + break; + } + Tmp1 = ExpandIntLibCall(Node, false, RTLIB::MUL_I8, RTLIB::MUL_I16, RTLIB::MUL_I32, @@ -3698,8 +3711,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { BottomHalf = DAG.getNode(Ops[isSigned][1], dl, DAG.getVTList(VT, VT), LHS, RHS); TopHalf = BottomHalf.getValue(1); - } else if (TLI.isTypeLegal(EVT::getIntegerVT(*DAG.getContext(), - VT.getSizeInBits() * 2))) { + } else if (TLI.isTypeLegal(WideVT)) { LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS); RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS); Tmp1 = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS); @@ -3857,7 +3869,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { // If we expanded the SETCC by inverting the condition code, then wrap // the existing SETCC in a NOT to restore the intended condition. 
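// How the new expandMUL path above reassembles a full-width product from its
// halves: ZERO_EXTEND the low part (its bits must survive exactly),
// ANY_EXTEND the high part (its upper bits are shifted away anyway), SHL by
// half the width, then OR. Integer restatement for a 64-bit result:
#include <cstdint>

uint64_t combineHalves(uint32_t Lo, uint32_t Hi) {
  uint64_t WideLo = Lo;             // ZERO_EXTEND: low bits kept exact
  uint64_t WideHi = Hi;             // ANY_EXTEND: garbage above bit 31 is fine
  return WideLo | (WideHi << 32);   // SHL by HalfType's size, then OR
}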
if (NeedInvert) - Tmp1 = DAG.getNOT(dl, Tmp1, Tmp1->getValueType(0)); + Tmp1 = DAG.getLogicalNOT(dl, Tmp1, Tmp1->getValueType(0)); Results.push_back(Tmp1); break; @@ -3994,8 +4006,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { VT.getScalarType(), Ex, Sh)); } SDValue Result = - DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), - &Scalars[0], Scalars.size()); + DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Scalars); ReplaceNode(SDValue(Node, 0), Result); break; } diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index ecf4c5d..6b8fec6 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -24,6 +24,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "legalize-types" + /// GetFPLibCall - Return the right libcall for the given floating point type. static RTLIB::Libcall GetFPLibCall(EVT VT, RTLIB::Libcall Call_F32, @@ -674,7 +676,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) { // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. - if (NewRHS.getNode() == 0) { + if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, NewLHS.getValueType()); CCCode = ISD::SETNE; } @@ -720,7 +722,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) { // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. - if (NewRHS.getNode() == 0) { + if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, NewLHS.getValueType()); CCCode = ISD::SETNE; } @@ -742,7 +744,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) { TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N)); // If softenSetCCOperands returned a scalar, use it. - if (NewRHS.getNode() == 0) { + if (!NewRHS.getNode()) { assert(NewLHS.getValueType() == N->getValueType(0) && "Unexpected setcc expansion!"); return NewLHS; @@ -1340,7 +1342,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_BR_CC(SDNode *N) { // If ExpandSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. - if (NewRHS.getNode() == 0) { + if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, NewLHS.getValueType()); CCCode = ISD::SETNE; } @@ -1433,7 +1435,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) { // If ExpandSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. - if (NewRHS.getNode() == 0) { + if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, NewLHS.getValueType()); CCCode = ISD::SETNE; } @@ -1450,7 +1452,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_SETCC(SDNode *N) { FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N)); // If ExpandSetCCOperands returned a scalar, use it. 
- if (NewRHS.getNode() == 0) { + if (!NewRHS.getNode()) { assert(NewLHS.getValueType() == N->getValueType(0) && "Unexpected setcc expansion!"); return NewLHS; diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 18b2376..2483184 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -24,6 +24,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "legalize-types" + //===----------------------------------------------------------------------===// // Integer Result Promotion //===----------------------------------------------------------------------===// @@ -266,9 +268,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) { EVT NVT = Op.getValueType(); SDLoc dl(N); - unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits(); + unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(); return DAG.getNode(ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op), - DAG.getConstant(DiffBits, TLI.getPointerTy())); + DAG.getConstant(DiffBits, TLI.getShiftAmountTy(NVT))); } SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) { @@ -432,7 +434,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { EVT ValueVTs[] = { N->getValueType(0), NVT }; SDValue Ops[] = { N->getOperand(0), N->getOperand(1) }; SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N), - DAG.getVTList(ValueVTs, 2), Ops, 2); + DAG.getVTList(ValueVTs), Ops); // Modified the sum result - switch anything that used the old sum to use // the new one. @@ -931,7 +933,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) { for (unsigned i = 0; i < NumElts; ++i) NewOps.push_back(GetPromotedInteger(N->getOperand(i))); - return SDValue(DAG.UpdateNodeOperands(N, &NewOps[0], NumElts), 0); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_CONVERT_RNDSAT(SDNode *N) { @@ -1270,6 +1272,7 @@ std::pair DAGTypeLegalizer::ExpandAtomic(SDNode *Node) { /// and the shift amount is a constant 'Amt'. Expand the operation. void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, unsigned Amt, SDValue &Lo, SDValue &Hi) { + assert(Amt && "Expected zero shifts to be already optimized away."); SDLoc DL(N); // Expand the incoming operand to be shifted, so that we have its parts SDValue InL, InH; @@ -1296,9 +1299,9 @@ void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, unsigned Amt, // Emit this X << 1 as X+X. SDVTList VTList = DAG.getVTList(NVT, MVT::Glue); SDValue LoOps[2] = { InL, InL }; - Lo = DAG.getNode(ISD::ADDC, DL, VTList, LoOps, 2); + Lo = DAG.getNode(ISD::ADDC, DL, VTList, LoOps); SDValue HiOps[3] = { InH, InH, Lo.getValue(1) }; - Hi = DAG.getNode(ISD::ADDE, DL, VTList, HiOps, 3); + Hi = DAG.getNode(ISD::ADDE, DL, VTList, HiOps); } else { Lo = DAG.getNode(ISD::SHL, DL, NVT, InL, DAG.getConstant(Amt, ShTy)); Hi = DAG.getNode(ISD::OR, DL, NVT, @@ -1372,7 +1375,7 @@ ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) { APInt HighBitMask = APInt::getHighBitsSet(ShBits, ShBits - Log2_32(NVTBits)); APInt KnownZero, KnownOne; - DAG.ComputeMaskedBits(N->getOperand(1), KnownZero, KnownOne); + DAG.computeKnownBits(N->getOperand(1), KnownZero, KnownOne); // If we don't know anything about the high bits, exit. 
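// Why PromoteIntRes_BSWAP above shifts by the difference of the *scalar*
// sizes: byte-swapping an i16 that lives zero-extended in an i32 register
// parks the interesting bytes at the top, so an SRL by DiffBits brings them
// back down. Standalone check using the GCC/Clang byte-swap builtin:
#include <cstdint>

uint16_t bswap16ViaPromotion(uint16_t X) {
  uint32_t Promoted = X;                          // e.g. 0x0000AABB
  uint32_t Swapped = __builtin_bswap32(Promoted); // -> 0xBBAA0000
  return (uint16_t)(Swapped >> 16);               // DiffBits = 32 - 16
}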
if (((KnownZero|KnownOne) & HighBitMask) == 0) @@ -1547,20 +1550,20 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, if (hasCarry) { SDVTList VTList = DAG.getVTList(NVT, MVT::Glue); if (N->getOpcode() == ISD::ADD) { - Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps, 2); + Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); - Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps, 3); + Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps); } else { - Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps, 2); + Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); - Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps, 3); + Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps); } return; } if (N->getOpcode() == ISD::ADD) { - Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps, 2); - Hi = DAG.getNode(ISD::ADD, dl, NVT, HiOps, 2); + Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps); + Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2)); SDValue Cmp1 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0], ISD::SETULT); SDValue Carry1 = DAG.getSelect(dl, NVT, Cmp1, @@ -1572,8 +1575,8 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, DAG.getConstant(1, NVT), Carry1); Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry2); } else { - Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps, 2); - Hi = DAG.getNode(ISD::SUB, dl, NVT, HiOps, 2); + Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps); + Hi = DAG.getNode(ISD::SUB, dl, NVT, makeArrayRef(HiOps, 2)); SDValue Cmp = DAG.getSetCC(dl, getSetCCResultType(LoOps[0].getValueType()), LoOps[0], LoOps[1], ISD::SETULT); @@ -1596,13 +1599,13 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUBC(SDNode *N, SDValue HiOps[3] = { LHSH, RHSH }; if (N->getOpcode() == ISD::ADDC) { - Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps, 2); + Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); - Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps, 3); + Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps); } else { - Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps, 2); + Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); - Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps, 3); + Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps); } // Legalized the flag result - switch anything that used the old flag to @@ -1621,9 +1624,9 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N, SDValue LoOps[3] = { LHSL, RHSL, N->getOperand(2) }; SDValue HiOps[3] = { LHSH, RHSH }; - Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps, 3); + Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); - Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps, 3); + Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps); // Legalized the flag result - switch anything that used the old flag to // use the new one. 
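// The essence of the no-ADDC fallback in ExpandIntRes_ADDSUB above: an
// unsigned wraparound in the low half happened exactly when the low sum
// compares SETULT against one of its addends, and that carry feeds the high
// half. Simplified; the DAG version spells the carry out with SETCC/SELECT
// nodes rather than a ternary.
#include <cstdint>

void add128(uint64_t AL, uint64_t AH, uint64_t BL, uint64_t BH,
            uint64_t &Lo, uint64_t &Hi) {
  Lo = AL + BL;
  uint64_t Carry = Lo < AL ? 1 : 0;  // Lo u< LoOps[0] detects the wrap
  Hi = AH + BH + Carry;
}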
@@ -1712,9 +1715,13 @@ void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned NBitWidth = NVT.getSizeInBits(); - const APInt &Cst = cast(N)->getAPIntValue(); - Lo = DAG.getConstant(Cst.trunc(NBitWidth), NVT); - Hi = DAG.getConstant(Cst.lshr(NBitWidth).trunc(NBitWidth), NVT); + auto Constant = cast(N); + const APInt &Cst = Constant->getAPIntValue(); + bool IsTarget = Constant->isTargetOpcode(); + bool IsOpaque = Constant->isOpaque(); + Lo = DAG.getConstant(Cst.trunc(NBitWidth), NVT, IsTarget, IsOpaque); + Hi = DAG.getConstant(Cst.lshr(NBitWidth).trunc(NBitWidth), NVT, IsTarget, + IsOpaque); } void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N, @@ -1923,73 +1930,12 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N, EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDLoc dl(N); - bool HasMULHS = TLI.isOperationLegalOrCustom(ISD::MULHS, NVT); - bool HasMULHU = TLI.isOperationLegalOrCustom(ISD::MULHU, NVT); - bool HasSMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::SMUL_LOHI, NVT); - bool HasUMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::UMUL_LOHI, NVT); - if (HasMULHU || HasMULHS || HasUMUL_LOHI || HasSMUL_LOHI) { - SDValue LL, LH, RL, RH; - GetExpandedInteger(N->getOperand(0), LL, LH); - GetExpandedInteger(N->getOperand(1), RL, RH); - unsigned OuterBitSize = VT.getSizeInBits(); - unsigned InnerBitSize = NVT.getSizeInBits(); - unsigned LHSSB = DAG.ComputeNumSignBits(N->getOperand(0)); - unsigned RHSSB = DAG.ComputeNumSignBits(N->getOperand(1)); - - APInt HighMask = APInt::getHighBitsSet(OuterBitSize, InnerBitSize); - if (DAG.MaskedValueIsZero(N->getOperand(0), HighMask) && - DAG.MaskedValueIsZero(N->getOperand(1), HighMask)) { - // The inputs are both zero-extended. - if (HasUMUL_LOHI) { - // We can emit a umul_lohi. - Lo = DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(NVT, NVT), LL, RL); - Hi = SDValue(Lo.getNode(), 1); - return; - } - if (HasMULHU) { - // We can emit a mulhu+mul. - Lo = DAG.getNode(ISD::MUL, dl, NVT, LL, RL); - Hi = DAG.getNode(ISD::MULHU, dl, NVT, LL, RL); - return; - } - } - if (LHSSB > InnerBitSize && RHSSB > InnerBitSize) { - // The input values are both sign-extended. - if (HasSMUL_LOHI) { - // We can emit a smul_lohi. - Lo = DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(NVT, NVT), LL, RL); - Hi = SDValue(Lo.getNode(), 1); - return; - } - if (HasMULHS) { - // We can emit a mulhs+mul. - Lo = DAG.getNode(ISD::MUL, dl, NVT, LL, RL); - Hi = DAG.getNode(ISD::MULHS, dl, NVT, LL, RL); - return; - } - } - if (HasUMUL_LOHI) { - // Lo,Hi = umul LHS, RHS. - SDValue UMulLOHI = DAG.getNode(ISD::UMUL_LOHI, dl, - DAG.getVTList(NVT, NVT), LL, RL); - Lo = UMulLOHI; - Hi = UMulLOHI.getValue(1); - RH = DAG.getNode(ISD::MUL, dl, NVT, LL, RH); - LH = DAG.getNode(ISD::MUL, dl, NVT, LH, RL); - Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, RH); - Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, LH); - return; - } - if (HasMULHU) { - Lo = DAG.getNode(ISD::MUL, dl, NVT, LL, RL); - Hi = DAG.getNode(ISD::MULHU, dl, NVT, LL, RL); - RH = DAG.getNode(ISD::MUL, dl, NVT, LL, RH); - LH = DAG.getNode(ISD::MUL, dl, NVT, LH, RL); - Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, RH); - Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, LH); - return; - } - } + SDValue LL, LH, RL, RH; + GetExpandedInteger(N->getOperand(0), LL, LH); + GetExpandedInteger(N->getOperand(1), RL, RH); + + if (TLI.expandMUL(N, Lo, Hi, NVT, DAG, LL, LH, RL, RH)) + return; // If nothing else, we can make a libcall. 
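// The schoolbook expansion the deleted block used to emit inline with
// MUL/MULHU/ADD nodes, and which now lives behind TLI.expandMUL(): for a
// truncating double-width multiply only three of the four partial products
// matter (assumes the usual 32-bit unsigned int, so cross terms wrap mod 2^32).
#include <cstdint>

uint64_t mulFromHalves(uint32_t LL, uint32_t LH, uint32_t RL, uint32_t RH) {
  uint64_t Lo = (uint64_t)LL * RL;    // low x low: the only full-width product
  uint32_t Hi = (uint32_t)(Lo >> 32); // its high half carries into the result
  Hi += LL * RH;                      // cross terms land in the high half
  Hi += LH * RL;
  return ((uint64_t)Hi << 32) | (uint32_t)Lo; // LH*RH overflows 64 bits entirely
}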
RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; @@ -2120,7 +2066,7 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, ShiftOp = DAG.getZExtOrTrunc(ShiftOp, dl, ShiftTy); SDValue Ops[] = { LHSL, LHSH, ShiftOp }; - Lo = DAG.getNode(PartsOpc, dl, DAG.getVTList(VT, VT), Ops, 3); + Lo = DAG.getNode(PartsOpc, dl, DAG.getVTList(VT, VT), Ops); Hi = Lo.getValue(1); return; } @@ -2352,12 +2298,12 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, Args.push_back(Entry); SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT); - TargetLowering:: - CallLoweringInfo CLI(Chain, RetTy, true, false, false, false, - 0, TLI.getLibcallCallingConv(LC), - /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Func, Args, DAG, dl); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, &Args, 0) + .setSExtResult(); + std::pair CallInfo = TLI.LowerCallTo(CLI); SplitInteger(CallInfo.first, Lo, Hi); @@ -2576,7 +2522,8 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS, // NOTE: on targets without efficient SELECT of bools, we can always use // this identity: (B1 ? B2 : B3) --> (B1 & B2)|(!B1&B3) - TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, AfterLegalizeTypes, true, NULL); + TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, AfterLegalizeTypes, true, + nullptr); SDValue Tmp1, Tmp2; if (TLI.isTypeLegal(LHSLo.getValueType()) && TLI.isTypeLegal(RHSLo.getValueType())) @@ -2629,7 +2576,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_BR_CC(SDNode *N) { // If ExpandSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. - if (NewRHS.getNode() == 0) { + if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, NewLHS.getValueType()); CCCode = ISD::SETNE; } @@ -2647,7 +2594,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) { // If ExpandSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. - if (NewRHS.getNode() == 0) { + if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, NewLHS.getValueType()); CCCode = ISD::SETNE; } @@ -2664,7 +2611,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) { IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N)); // If ExpandSetCCOperands returned a scalar, use it. 
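// The identity quoted in the NOTE above for targets without an efficient
// SELECT of bools: a select of i1 values decomposes into pure logic, checkable
// over all eight input combinations.
bool selectViaLogic(bool B1, bool B2, bool B3) {
  return (B1 && B2) || (!B1 && B3);  // == (B1 ? B2 : B3)
}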
- if (NewRHS.getNode() == 0) { + if (!NewRHS.getNode()) { assert(NewLHS.getValueType() == N->getValueType(0) && "Unexpected setcc expansion!"); return NewLHS; @@ -2912,7 +2859,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) { Ops.push_back(Op); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); } @@ -2959,7 +2906,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) { Ops.push_back(Op); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) { @@ -3007,7 +2954,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) { } } - return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) { @@ -3063,6 +3010,5 @@ SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) { } } - return DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0), - &NewOps[0], NewOps.size()); - } + return DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0), NewOps); +} diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index e141883..3971fc3 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -22,6 +22,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "legalize-types" + static cl::opt EnableExpensiveChecks("enable-legalize-types-checking", cl::Hidden); @@ -159,7 +161,7 @@ void DAGTypeLegalizer::PerformExpensiveChecks() { if (Mapped & 128) dbgs() << " WidenedVectors"; dbgs() << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } } } @@ -433,7 +435,7 @@ NodeDone: if (Failed) { I->dump(&DAG); dbgs() << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } } #endif @@ -488,7 +490,7 @@ SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) { // Some operands changed - update the node. if (!NewOps.empty()) { - SDNode *M = DAG.UpdateNodeOperands(N, &NewOps[0], NewOps.size()); + SDNode *M = DAG.UpdateNodeOperands(N, NewOps); if (M != N) { // The node morphed into a different node. Normally for this to happen // the original node would have to be marked NewNode. However this can @@ -736,7 +738,7 @@ void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) { AnalyzeNewValue(Result); SDValue &OpEntry = PromotedIntegers[Op]; - assert(OpEntry.getNode() == 0 && "Node is already promoted!"); + assert(!OpEntry.getNode() && "Node is already promoted!"); OpEntry = Result; } @@ -747,7 +749,7 @@ void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) { AnalyzeNewValue(Result); SDValue &OpEntry = SoftenedFloats[Op]; - assert(OpEntry.getNode() == 0 && "Node is already converted to integer!"); + assert(!OpEntry.getNode() && "Node is already converted to integer!"); OpEntry = Result; } @@ -761,7 +763,7 @@ void DAGTypeLegalizer::SetScalarizedVector(SDValue Op, SDValue Result) { AnalyzeNewValue(Result); SDValue &OpEntry = ScalarizedVectors[Op]; - assert(OpEntry.getNode() == 0 && "Node is already scalarized!"); + assert(!OpEntry.getNode() && "Node is already scalarized!"); OpEntry = Result; } @@ -787,7 +789,7 @@ void DAGTypeLegalizer::SetExpandedInteger(SDValue Op, SDValue Lo, // Remember that this is the result of the node. 
std::pair &Entry = ExpandedIntegers[Op]; - assert(Entry.first.getNode() == 0 && "Node already expanded"); + assert(!Entry.first.getNode() && "Node already expanded"); Entry.first = Lo; Entry.second = Hi; } @@ -814,7 +816,7 @@ void DAGTypeLegalizer::SetExpandedFloat(SDValue Op, SDValue Lo, // Remember that this is the result of the node. std::pair &Entry = ExpandedFloats[Op]; - assert(Entry.first.getNode() == 0 && "Node already expanded"); + assert(!Entry.first.getNode() && "Node already expanded"); Entry.first = Lo; Entry.second = Hi; } @@ -843,7 +845,7 @@ void DAGTypeLegalizer::SetSplitVector(SDValue Op, SDValue Lo, // Remember that this is the result of the node. std::pair &Entry = SplitVectors[Op]; - assert(Entry.first.getNode() == 0 && "Node already split"); + assert(!Entry.first.getNode() && "Node already split"); Entry.first = Lo; Entry.second = Hi; } @@ -855,7 +857,7 @@ void DAGTypeLegalizer::SetWidenedVector(SDValue Op, SDValue Result) { AnalyzeNewValue(Result); SDValue &OpEntry = WidenedVectors[Op]; - assert(OpEntry.getNode() == 0 && "Node already widened!"); + assert(!OpEntry.getNode() && "Node already widened!"); OpEntry = Result; } @@ -1007,7 +1009,7 @@ SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N, unsigned NumOps = N->getNumOperands(); SDLoc dl(N); if (NumOps == 0) { - return TLI.makeLibCall(DAG, LC, N->getValueType(0), 0, 0, isSigned, + return TLI.makeLibCall(DAG, LC, N->getValueType(0), nullptr, 0, isSigned, dl).first; } else if (NumOps == 1) { SDValue Op = N->getOperand(0); @@ -1049,11 +1051,12 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, TLI.getPointerTy()); Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); - TargetLowering:: - CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false, - 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Callee, Args, DAG, SDLoc(Node)); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(SDLoc(Node)).setChain(InChain) + .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, &Args, 0) + .setSExtResult(isSigned).setZExtResult(!isSigned); + std::pair CallInfo = TLI.LowerCallTo(CLI); return CallInfo; diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 947ea10..e4bbc78 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -16,7 +16,6 @@ #ifndef SELECTIONDAG_LEGALIZETYPES_H #define SELECTIONDAG_LEGALIZETYPES_H -#define DEBUG_TYPE "legalize-types" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -540,6 +539,7 @@ private: SDValue ScalarizeVecOp_UnaryOp(SDNode *N); SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N); SDValue ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N); + SDValue ScalarizeVecOp_VSELECT(SDNode *N); SDValue ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo); SDValue ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index e9424f2..f40ed76 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -23,6 +23,8 @@ #include "llvm/IR/DataLayout.h" using namespace llvm; +#define DEBUG_TYPE "legalize-types" + //===----------------------------------------------------------------------===// // Generic Result Expansion. 
//===----------------------------------------------------------------------===// @@ -352,7 +354,8 @@ SDValue DAGTypeLegalizer::ExpandOp_BITCAST(SDNode *N) { SmallVector Ops; IntegerToVector(N->getOperand(0), NumElts, Ops, NVT.getVectorElementType()); - SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &Ops[0], NumElts); + SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, + makeArrayRef(Ops.data(), NumElts)); return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), Vec); } @@ -388,7 +391,7 @@ SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) { SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, EVT::getVectorVT(*DAG.getContext(), NewVT, NewElts.size()), - &NewElts[0], NewElts.size()); + NewElts); // Convert the new vector to the old vector type. return DAG.getNode(ISD::BITCAST, dl, VecVT, NewVec); @@ -447,7 +450,7 @@ SDValue DAGTypeLegalizer::ExpandOp_SCALAR_TO_VECTOR(SDNode *N) { SDValue UndefVal = DAG.getUNDEF(Ops[0].getValueType()); for (unsigned i = 1; i < NumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) { diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 551d054..898cd29 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -63,6 +63,8 @@ class VectorLegalizer { SDValue ExpandUINT_TO_FLOAT(SDValue Op); // Implement expansion for SIGN_EXTEND_INREG using SRL and SRA. SDValue ExpandSEXTINREG(SDValue Op); + // Expand bswap of vectors into a shuffle if legal. + SDValue ExpandBSWAP(SDValue Op); // Implement vselect in terms of XOR, AND, OR when blend is not supported // by the target. 
SDValue ExpandVSELECT(SDValue Op); @@ -152,8 +154,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) Ops.push_back(LegalizeOp(Node->getOperand(i))); - SDValue Result = - SDValue(DAG.UpdateNodeOperands(Op.getNode(), Ops.data(), Ops.size()), 0); + SDValue Result = SDValue(DAG.UpdateNodeOperands(Op.getNode(), Ops), 0); if (Op.getOpcode() == ISD::LOAD) { LoadSDNode *LD = cast(Op.getNode()); @@ -298,6 +299,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case TargetLowering::Expand: if (Node->getOpcode() == ISD::SIGN_EXTEND_INREG) Result = ExpandSEXTINREG(Op); + else if (Node->getOpcode() == ISD::BSWAP) + Result = ExpandBSWAP(Op); else if (Node->getOpcode() == ISD::VSELECT) Result = ExpandVSELECT(Op); else if (Node->getOpcode() == ISD::SELECT) @@ -343,7 +346,7 @@ SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) { Operands[j] = Op.getOperand(j); } - Op = DAG.getNode(Op.getOpcode(), dl, NVT, &Operands[0], Operands.size()); + Op = DAG.getNode(Op.getOpcode(), dl, NVT, Operands); return DAG.getNode(ISD::BITCAST, dl, VT, Op); } @@ -377,8 +380,7 @@ SDValue VectorLegalizer::PromoteVectorOpINT_TO_FP(SDValue Op) { Operands[j] = Op.getOperand(j); } - return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), &Operands[0], - Operands.size()); + return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Operands); } // For FP_TO_INT we promote the result type to a vector type with wider @@ -546,10 +548,9 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) { } } - SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &LoadChains[0], LoadChains.size()); + SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); SDValue Value = DAG.getNode(ISD::BUILD_VECTOR, dl, - Op.getNode()->getValueType(0), &Vals[0], Vals.size()); + Op.getNode()->getValueType(0), Vals); AddLegalizedOperand(Op.getValue(0), Value); AddLegalizedOperand(Op.getValue(1), NewChain); @@ -603,8 +604,7 @@ SDValue VectorLegalizer::ExpandStore(SDValue Op) { Stores.push_back(Store); } - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &Stores[0], Stores.size()); + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); AddLegalizedOperand(Op, TF); return TF; } @@ -648,7 +648,7 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) { // Broadcast the mask so that the entire vector is all-one or all zero. SmallVector Ops(NumElem, Mask); - Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskTy, &Ops[0], Ops.size()); + Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskTy, Ops); // Bitcast the operands to be the same type as the mask. // This is needed when we select between FP types because @@ -686,6 +686,29 @@ SDValue VectorLegalizer::ExpandSEXTINREG(SDValue Op) { return DAG.getNode(ISD::SRA, DL, VT, Op, ShiftSz); } +SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) { + EVT VT = Op.getValueType(); + + // Generate a byte wise shuffle mask for the BSWAP. + SmallVector ShuffleMask; + int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8; + for (int I = 0, E = VT.getVectorNumElements(); I != E; ++I) + for (int J = ScalarSizeInBytes - 1; J >= 0; --J) + ShuffleMask.push_back((I * ScalarSizeInBytes) + J); + + EVT ByteVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, ShuffleMask.size()); + + // Only emit a shuffle if the mask is legal. 
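// The mask built just above reverses the bytes within each scalar element
// while keeping the elements in place, which is exactly a per-lane BSWAP
// once the vector is reinterpreted as bytes; if the target cannot do that
// byte shuffle, the code below falls back to unrolling. A standalone sketch
// of the mask construction (plain C++, no DAG state; the v4i32 shape in the
// driver is an illustrative assumption):

#include <cstdio>
#include <vector>

static std::vector<int> bswapShuffleMask(int NumElts, int ScalarBytes) {
  std::vector<int> Mask;
  for (int I = 0; I != NumElts; ++I)
    for (int J = ScalarBytes - 1; J >= 0; --J)
      Mask.push_back(I * ScalarBytes + J); // last byte of the lane first
  return Mask;
}

int main() {
  // v4i32 -> 16 x i8 mask: 3 2 1 0  7 6 5 4  11 10 9 8  15 14 13 12
  for (int Idx : bswapShuffleMask(4, 4))
    std::printf("%d ", Idx);
  std::printf("\n");
  return 0;
}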
+ if (!TLI.isShuffleMaskLegal(ShuffleMask, ByteVT)) + return DAG.UnrollVectorOp(Op.getNode()); + + SDLoc DL(Op); + Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Op.getOperand(0)); + Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT), + ShuffleMask.data()); + return DAG.getNode(ISD::BITCAST, DL, VT, Op); +} + SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) { // Implement VSELECT in terms of XOR, AND, OR // on platforms which do not support blend natively. @@ -803,7 +826,7 @@ SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) { (EltVT.getSizeInBits()), EltVT), DAG.getConstant(0, EltVT)); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElems); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } } diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 940a9c9..368eba3 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -26,6 +26,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "legalize-types" + //===----------------------------------------------------------------------===// // Result Vector Scalarization: <1 x ty> -> ty. //===----------------------------------------------------------------------===// @@ -331,12 +333,24 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VSETCC(SDNode *N) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && "Operand types must be vectors"); - - SDValue LHS = GetScalarizedVector(N->getOperand(0)); - SDValue RHS = GetScalarizedVector(N->getOperand(1)); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + EVT OpVT = LHS.getValueType(); EVT NVT = N->getValueType(0).getVectorElementType(); SDLoc DL(N); + // The result needs scalarizing, but it's not a given that the source does. + if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) { + LHS = GetScalarizedVector(LHS); + RHS = GetScalarizedVector(RHS); + } else { + EVT VT = OpVT.getVectorElementType(); + LHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, LHS, + DAG.getConstant(0, TLI.getVectorIdxTy())); + RHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, RHS, + DAG.getConstant(0, TLI.getVectorIdxTy())); + } + // Turn it into a scalar SETCC. SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, N->getOperand(2)); @@ -358,7 +372,7 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) { dbgs() << "\n"); SDValue Res = SDValue(); - if (Res.getNode() == 0) { + if (!Res.getNode()) { switch (N->getOpcode()) { default: #ifndef NDEBUG @@ -382,6 +396,9 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) { case ISD::EXTRACT_VECTOR_ELT: Res = ScalarizeVecOp_EXTRACT_VECTOR_ELT(N); break; + case ISD::VSELECT: + Res = ScalarizeVecOp_VSELECT(N); + break; case ISD::STORE: Res = ScalarizeVecOp_STORE(cast(N), OpNo); break; @@ -420,13 +437,11 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) { assert(N->getValueType(0).getVectorNumElements() == 1 && "Unexected vector type!"); SDValue Elt = GetScalarizedVector(N->getOperand(0)); - SmallVector Ops(1); - Ops[0] = DAG.getNode(N->getOpcode(), SDLoc(N), - N->getValueType(0).getScalarType(), Elt); + SDValue Op = DAG.getNode(N->getOpcode(), SDLoc(N), + N->getValueType(0).getScalarType(), Elt); // Revectorize the result so the types line up with what the uses of this // expression expect. 
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), - &Ops[0], 1); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Op); } /// ScalarizeVecOp_CONCAT_VECTORS - The vectors to concatenate have length one - @@ -435,8 +450,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) { SmallVector Ops(N->getNumOperands()); for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) Ops[i] = GetScalarizedVector(N->getOperand(i)); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), - &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Ops); } /// ScalarizeVecOp_EXTRACT_VECTOR_ELT - If the input is a vector that needs to @@ -450,6 +464,18 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) { return Res; } + +/// ScalarizeVecOp_VSELECT - If the input condition is a vector that needs to be +/// scalarized, it must be <1 x i1>, so just convert to a normal ISD::SELECT +/// (still with vector output type since that was acceptable if we got here). +SDValue DAGTypeLegalizer::ScalarizeVecOp_VSELECT(SDNode *N) { + SDValue ScalarCond = GetScalarizedVector(N->getOperand(0)); + EVT VT = N->getValueType(0); + + return DAG.getNode(ISD::SELECT, SDLoc(N), VT, ScalarCond, N->getOperand(1), + N->getOperand(2)); +} + /// ScalarizeVecOp_STORE - If the value to store is a vector that needs to be /// scalarized, it must be <1 x ty>. Just store the element. SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){ @@ -696,10 +722,10 @@ void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); unsigned LoNumElts = LoVT.getVectorNumElements(); SmallVector LoOps(N->op_begin(), N->op_begin()+LoNumElts); - Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, &LoOps[0], LoOps.size()); + Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, LoOps); SmallVector HiOps(N->op_begin()+LoNumElts, N->op_end()); - Hi = DAG.getNode(ISD::BUILD_VECTOR, dl, HiVT, &HiOps[0], HiOps.size()); + Hi = DAG.getNode(ISD::BUILD_VECTOR, dl, HiVT, HiOps); } void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, @@ -717,10 +743,10 @@ void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); SmallVector LoOps(N->op_begin(), N->op_begin()+NumSubvectors); - Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, &LoOps[0], LoOps.size()); + Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, LoOps); SmallVector HiOps(N->op_begin()+NumSubvectors, N->op_end()); - Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HiVT, &HiOps[0], HiOps.size()); + Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HiVT, HiOps); } void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, @@ -1064,7 +1090,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, } // Construct the Lo/Hi output using a BUILD_VECTOR. - Output = DAG.getNode(ISD::BUILD_VECTOR,dl,NewVT, &SVOps[0], SVOps.size()); + Output = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, SVOps); } else if (InputUsed[0] == -1U) { // No input vectors were used! The result is undefined. 
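// The new ScalarizeVecOp_VSELECT above handles the case where only the
// condition needs scalarizing: a VSELECT whose mask is <1 x i1> is just an
// ordinary SELECT between the two still-vector-typed operands, roughly
//
//   vselect <1 x i1> %c, <1 x ty> %a, <1 x ty> %b
//     ==>  select i1 %c, <1 x ty> %a, <1 x ty> %b
//
// No further expansion is required because ISD::SELECT already permits
// vector result types; only the condition operand changes type.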
Output = DAG.getUNDEF(NewVT); @@ -1100,7 +1126,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) return false; - if (Res.getNode() == 0) { + if (!Res.getNode()) { switch (N->getOpcode()) { default: #ifndef NDEBUG @@ -1342,8 +1368,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) { } } - return DAG.getNode(ISD::BUILD_VECTOR, DL, N->getValueType(0), - &Elts[0], Elts.size()); + return DAG.getNode(ISD::BUILD_VECTOR, DL, N->getValueType(0), Elts); } SDValue DAGTypeLegalizer::SplitVecOp_TRUNCATE(SDNode *N) { @@ -1700,8 +1725,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { while (SubConcatEnd < OpsToConcat) SubConcatOps[SubConcatEnd++] = undefVec; ConcatOps[SubConcatIdx] = DAG.getNode(ISD::CONCAT_VECTORS, dl, - NextVT, &SubConcatOps[0], - OpsToConcat); + NextVT, SubConcatOps); ConcatEnd = SubConcatIdx + 1; } } @@ -1720,7 +1744,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) { for (unsigned j = ConcatEnd; j < NumOps; ++j) ConcatOps[j] = UndefVal; } - return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &ConcatOps[0], NumOps); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, + makeArrayRef(ConcatOps.data(), NumOps)); } SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { @@ -1762,8 +1787,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { SDValue UndefVal = DAG.getUNDEF(InVT); for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = UndefVal; - SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, - &Ops[0], NumConcat); + SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops); if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InVec); return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1)); @@ -1798,7 +1822,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { for (; i < WidenNumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, &Ops[0], WidenNumElts); + return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) { @@ -1922,11 +1946,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) { SDValue NewVec; if (InVT.isVector()) - NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, - NewInVT, &Ops[0], NewNumElts); + NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops); else - NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, - NewInVT, &Ops[0], NewNumElts); + NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, NewInVT, Ops); return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec); } } @@ -1951,7 +1973,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) { assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!"); NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT)); - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &NewOps[0], NewOps.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, NewOps); } SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { @@ -1974,7 +1996,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { Ops[i] = N->getOperand(i); for (unsigned i = NumOperands; i != NumConcat; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &Ops[0], NumConcat); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Ops); } } else { InputWidened = true; @@ -2020,7 +2042,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { SDValue UndefVal = DAG.getUNDEF(EltVT); for (; Idx < 
WidenNumElts; ++Idx) Ops[Idx] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts); + return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { @@ -2065,7 +2087,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = UndefVal; - InOp = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWidenVT, &Ops[0],NumConcat); + InOp = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWidenVT, Ops); return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp, SatOp, CvtCode); } @@ -2098,7 +2120,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { for (; i < WidenNumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts); + return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { @@ -2137,7 +2159,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { SDValue UndefVal = DAG.getUNDEF(EltVT); for (; i < WidenNumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts); + return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); } SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) { @@ -2165,8 +2187,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) { if (LdChain.size() == 1) NewChain = LdChain[0]; else - NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, - &LdChain[0], LdChain.size()); + NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, LdChain); // Modified the chain - switch anything that used the old chain to use // the new one. @@ -2372,7 +2393,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp, DAG.getConstant(i, TLI.getVectorIdxTy()))); - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) { @@ -2421,7 +2442,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) { Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, DAG.getConstant(j, TLI.getVectorIdxTy())); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) { @@ -2450,8 +2471,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) { if (StChain.size() == 1) return StChain[0]; else - return DAG.getNode(ISD::TokenFactor, SDLoc(ST), - MVT::Other,&StChain[0],StChain.size()); + return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain); } SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { @@ -2626,8 +2646,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl &LdChain, ConcatOps[0] = LdOp; for (unsigned i = 1; i != NumConcat; ++i) ConcatOps[i] = UndefVal; - return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &ConcatOps[0], - NumConcat); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, ConcatOps); } // Load vector by using multiple loads from largest vector to scalar @@ -2661,8 +2680,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl &LdChain, Loads.push_back(DAG.getUNDEF(L->getValueType(0))); size += L->getValueSizeInBits(0); } - L = DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0), - &Loads[0], Loads.size()); + L = 
DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0), Loads); } } else { L = DAG.getLoad(NewVT, dl, Chain, BasePtr, @@ -2706,7 +2724,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl &LdChain, if (NewLdTy != LdTy) { // Create a larger vector ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy, - &ConcatOps[Idx], End - Idx); + makeArrayRef(&ConcatOps[Idx], End - Idx)); Idx = End - 1; LdTy = NewLdTy; } @@ -2715,7 +2733,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl &LdChain, if (WidenWidth == LdTy.getSizeInBits()*(End - Idx)) return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, - &ConcatOps[Idx], End - Idx); + makeArrayRef(&ConcatOps[Idx], End - Idx)); // We need to fill the rest with undefs to build the vector unsigned NumOps = WidenWidth / LdTy.getSizeInBits(); @@ -2728,7 +2746,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl &LdChain, for (; i != NumOps; ++i) WidenOps[i] = UndefVal; } - return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &WidenOps[0],NumOps); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, WidenOps); } SDValue @@ -2779,7 +2797,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl &LdChain, for (; i != WidenNumElts; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops); } @@ -2925,7 +2943,7 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) { for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = UndefVal; - return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, &Ops[0], NumConcat); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops); } if (WidenNumElts < InNumElts && InNumElts % WidenNumElts) @@ -2944,5 +2962,5 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) { SDValue UndefVal = DAG.getUNDEF(EltVT); for ( ; Idx < WidenNumElts; ++Idx) Ops[Idx] = UndefVal; - return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &Ops[0], WidenNumElts); + return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); } diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 3b3424d..f92230c 100644 --- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -19,7 +19,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "scheduler" #include "llvm/CodeGen/ResourcePriorityQueue.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -31,6 +30,8 @@ using namespace llvm; +#define DEBUG_TYPE "scheduler" + static cl::opt DisableDFASched("disable-dfa-sched", cl::Hidden, cl::ZeroOrMore, cl::init(false), cl::desc("Disable use of DFA during scheduling")); @@ -49,7 +50,7 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) : TLI = IS->getTargetLowering(); const TargetMachine &tm = (*IS->MF).getTarget(); - ResourcesModel = tm.getInstrInfo()->CreateTargetScheduleState(&tm,NULL); + ResourcesModel = tm.getInstrInfo()->CreateTargetScheduleState(&tm,nullptr); // This hard requirement could be relaxed, but for now // do not let it procede. assert (ResourcesModel && "Unimplemented CreateTargetScheduleState."); @@ -214,7 +215,7 @@ bool resource_sort::operator()(const SUnit *LHS, const SUnit *RHS) const { /// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor /// of SU, return it, otherwise return null. 
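// The function below is a small instance of the "unique candidate or bail"
// scan: remember the first unscheduled predecessor seen and abandon the
// whole search as soon as a second distinct one appears. A standalone
// sketch of the shape (plain C++; ints stand in for SUnits):

#include <vector>

static const int *singleNonZero(const std::vector<int> &Xs) {
  const int *Only = nullptr;
  for (const int &X : Xs) {
    if (X == 0)
      continue; // filtered out, like already-scheduled predecessors
    if (Only && Only != &X)
      return nullptr; // a second candidate: give up early
    Only = &X;
  }
  return Only; // the unique candidate, or nullptr if none was found
}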
SUnit *ResourcePriorityQueue::getSingleUnscheduledPred(SUnit *SU) { - SUnit *OnlyAvailablePred = 0; + SUnit *OnlyAvailablePred = nullptr; for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { SUnit &Pred = *I->getSUnit(); @@ -222,7 +223,7 @@ SUnit *ResourcePriorityQueue::getSingleUnscheduledPred(SUnit *SU) { // We found an available, but not scheduled, predecessor. If it's the // only one we have found, keep track of it... otherwise give up. if (OnlyAvailablePred && OnlyAvailablePred != &Pred) - return 0; + return nullptr; OnlyAvailablePred = &Pred; } } @@ -581,7 +582,7 @@ void ResourcePriorityQueue::adjustPriorityOfUnscheduledPreds(SUnit *SU) { if (SU->isAvailable) return; // All preds scheduled. SUnit *OnlyAvailablePred = getSingleUnscheduledPred(SU); - if (OnlyAvailablePred == 0 || !OnlyAvailablePred->isAvailable) + if (!OnlyAvailablePred || !OnlyAvailablePred->isAvailable) return; // Okay, we found a single predecessor that is available, but not scheduled. @@ -598,7 +599,7 @@ void ResourcePriorityQueue::adjustPriorityOfUnscheduledPreds(SUnit *SU) { /// to be placed in scheduling sequence. SUnit *ResourcePriorityQueue::pop() { if (empty()) - return 0; + return nullptr; std::vector::iterator Best = Queue.begin(); if (!DisableDFASched) { diff --git a/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h index b62bd62..ee54292 100644 --- a/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h +++ b/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h @@ -45,14 +45,17 @@ private: unsigned FrameIx; // valid for stack objects } u; MDNode *mdPtr; + bool IsIndirect; uint64_t Offset; DebugLoc DL; unsigned Order; bool Invalid; public: // Constructor for non-constants. - SDDbgValue(MDNode *mdP, SDNode *N, unsigned R, uint64_t off, DebugLoc dl, - unsigned O) : mdPtr(mdP), Offset(off), DL(dl), Order(O), + SDDbgValue(MDNode *mdP, SDNode *N, unsigned R, + bool indir, uint64_t off, DebugLoc dl, + unsigned O) : mdPtr(mdP), IsIndirect(indir), + Offset(off), DL(dl), Order(O), Invalid(false) { kind = SDNODE; u.s.Node = N; @@ -62,14 +65,16 @@ public: // Constructor for constants. SDDbgValue(MDNode *mdP, const Value *C, uint64_t off, DebugLoc dl, unsigned O) : - mdPtr(mdP), Offset(off), DL(dl), Order(O), Invalid(false) { + mdPtr(mdP), IsIndirect(false), Offset(off), DL(dl), Order(O), + Invalid(false) { kind = CONST; u.Const = C; } // Constructor for frame indices. SDDbgValue(MDNode *mdP, unsigned FI, uint64_t off, DebugLoc dl, unsigned O) : - mdPtr(mdP), Offset(off), DL(dl), Order(O), Invalid(false) { + mdPtr(mdP), IsIndirect(false), Offset(off), DL(dl), Order(O), + Invalid(false) { kind = FRAMEIX; u.FrameIx = FI; } @@ -92,6 +97,9 @@ public: // Returns the FrameIx for a stack object unsigned getFrameIx() { assert (kind==FRAMEIX); return u.FrameIx; } + // Returns whether this is an indirect value. + bool isIndirect() { return IsIndirect; } + // Returns the offset. 
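// The SDDbgValue change above threads a new IsIndirect flag through the
// node-based constructor, while the constant and frame-index constructors
// pin it to false. The flag records whether the tracked location holds the
// variable's value itself or, roughly, the address of that value (the extra
// level of indirection some DBG_VALUE forms carry), and the new
// isIndirect() accessor exposes it so later emission stages can choose the
// right form.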
uint64_t getOffset() { return Offset; } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 0687392..4d8c2c7 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "pre-RA-sched" #include "llvm/CodeGen/SchedulerRegistry.h" #include "InstrEmitter.h" #include "ScheduleDAGSDNodes.h" @@ -28,6 +27,8 @@ #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "pre-RA-sched" + STATISTIC(NumUnfolds, "Number of nodes unfolded"); STATISTIC(NumDups, "Number of duplicated nodes"); STATISTIC(NumPRCopies, "Number of physical copies"); @@ -54,7 +55,7 @@ namespace { } SUnit *pop() { - if (empty()) return NULL; + if (empty()) return nullptr; SUnit *V = Queue.back(); Queue.pop_back(); return V; @@ -117,11 +118,11 @@ void ScheduleDAGFast::Schedule() { DEBUG(dbgs() << "********** List Scheduling **********\n"); NumLiveRegs = 0; - LiveRegDefs.resize(TRI->getNumRegs(), NULL); + LiveRegDefs.resize(TRI->getNumRegs(), nullptr); LiveRegCycles.resize(TRI->getNumRegs(), 0); // Build the scheduling graph. - BuildSchedGraph(NULL); + BuildSchedGraph(nullptr); DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) SUnits[su].dumpAll(this)); @@ -144,7 +145,7 @@ void ScheduleDAGFast::ReleasePred(SUnit *SU, SDep *PredEdge) { dbgs() << "*** Scheduling failed! ***\n"; PredSU->dump(this); dbgs() << " has been released too many times!\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } #endif --PredSU->NumSuccsLeft; @@ -198,7 +199,7 @@ void ScheduleDAGFast::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) { assert(LiveRegDefs[I->getReg()] == SU && "Physical register dependency violated?"); --NumLiveRegs; - LiveRegDefs[I->getReg()] = NULL; + LiveRegDefs[I->getReg()] = nullptr; LiveRegCycles[I->getReg()] = 0; } } @@ -211,18 +212,18 @@ void ScheduleDAGFast::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) { /// successors to the newly created node. 
SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) { if (SU->getNode()->getGluedNode()) - return NULL; + return nullptr; SDNode *N = SU->getNode(); if (!N) - return NULL; + return nullptr; SUnit *NewSU; bool TryUnfold = false; for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { EVT VT = N->getValueType(i); if (VT == MVT::Glue) - return NULL; + return nullptr; else if (VT == MVT::Other) TryUnfold = true; } @@ -230,13 +231,13 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) { const SDValue &Op = N->getOperand(i); EVT VT = Op.getNode()->getValueType(Op.getResNo()); if (VT == MVT::Glue) - return NULL; + return nullptr; } if (TryUnfold) { SmallVector NewNodes; if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes)) - return NULL; + return nullptr; DEBUG(dbgs() << "Unfolding SU # " << SU->NodeNum << "\n"); assert(NewNodes.size() == 2 && "Expected a load folding node!"); @@ -388,11 +389,11 @@ void ScheduleDAGFast::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg, const TargetRegisterClass *DestRC, const TargetRegisterClass *SrcRC, SmallVectorImpl &Copies) { - SUnit *CopyFromSU = newSUnit(static_cast(NULL)); + SUnit *CopyFromSU = newSUnit(static_cast(nullptr)); CopyFromSU->CopySrcRC = SrcRC; CopyFromSU->CopyDstRC = DestRC; - SUnit *CopyToSU = newSUnit(static_cast(NULL)); + SUnit *CopyToSU = newSUnit(static_cast(nullptr)); CopyToSU->CopySrcRC = DestRC; CopyToSU->CopyDstRC = SrcRC; @@ -583,7 +584,7 @@ void ScheduleDAGFast::ListScheduleBottomUp() { // and it is expensive. // If cross copy register class is null, then it's not possible to copy // the value at all. - SUnit *NewDef = 0; + SUnit *NewDef = nullptr; if (DestRC != RC) { NewDef = CopyAndMoveSuccessors(LRDef); if (!DestRC && !NewDef) @@ -661,7 +662,7 @@ private: void ScheduleDAGLinearize::ScheduleNode(SDNode *N) { if (N->getNodeId() != 0) - llvm_unreachable(0); + llvm_unreachable(nullptr); if (!N->isMachineOpcode() && (N->getOpcode() == ISD::EntryToken || isPassiveNode(N))) @@ -674,7 +675,7 @@ void ScheduleDAGLinearize::ScheduleNode(SDNode *N) { unsigned NumOps = N->getNumOperands(); if (unsigned NumLeft = NumOps) { - SDNode *GluedOpN = 0; + SDNode *GluedOpN = nullptr; do { const SDValue &Op = N->getOperand(NumLeft-1); SDNode *OpN = Op.getNode(); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index c283664..78ec4df 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -15,7 +15,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "pre-RA-sched" #include "llvm/CodeGen/SchedulerRegistry.h" #include "ScheduleDAGSDNodes.h" #include "llvm/ADT/STLExtras.h" @@ -36,6 +35,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "pre-RA-sched" + STATISTIC(NumBacktracks, "Number of times scheduler backtracked"); STATISTIC(NumUnfolds, "Number of nodes unfolded"); STATISTIC(NumDups, "Number of duplicated nodes"); @@ -163,7 +164,7 @@ public: CodeGenOpt::Level OptLevel) : ScheduleDAGSDNodes(mf), NeedLatency(needlatency), AvailableQueue(availqueue), CurCycle(0), - Topo(SUnits, NULL) { + Topo(SUnits, nullptr) { const TargetMachine &tm = mf.getTarget(); if (DisableSchedCycles || !NeedLatency) @@ -327,13 +328,13 @@ void ScheduleDAGRRList::Schedule() { NumLiveRegs = 0; // Allocate slots for each physical register, plus one for a special register // to track the virtual resource of a calling sequence. 
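// The NULL/0 -> nullptr sweep through these scheduler files (continuing
// below) is more than style: nullptr has its own type, std::nullptr_t, so
// it cannot silently decay into an integer during overload resolution or
// template deduction the way a literal 0 can. Standalone illustration:

#include <cstdio>

static void take(int) { std::puts("int overload"); }
static void take(const char *) { std::puts("pointer overload"); }

int main() {
  take(0);       // the literal 0 quietly picks the int overload
  take(nullptr); // nullptr unambiguously selects the pointer overload
  return 0;
}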
- LiveRegDefs.resize(TRI->getNumRegs() + 1, NULL); - LiveRegGens.resize(TRI->getNumRegs() + 1, NULL); + LiveRegDefs.resize(TRI->getNumRegs() + 1, nullptr); + LiveRegGens.resize(TRI->getNumRegs() + 1, nullptr); CallSeqEndForStart.clear(); assert(Interferences.empty() && LRegsMap.empty() && "stale Interferences"); // Build the scheduling graph. - BuildSchedGraph(NULL); + BuildSchedGraph(nullptr); DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) SUnits[su].dumpAll(this)); @@ -369,7 +370,7 @@ void ScheduleDAGRRList::ReleasePred(SUnit *SU, const SDep *PredEdge) { dbgs() << "*** Scheduling failed! ***\n"; PredSU->dump(this); dbgs() << " has been released too many times!\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } #endif --PredSU->NumSuccsLeft; @@ -461,7 +462,7 @@ FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest, // to get to the CALLSEQ_BEGIN, but we need to find the path with the // most nesting in order to ensure that we find the corresponding match. if (N->getOpcode() == ISD::TokenFactor) { - SDNode *Best = 0; + SDNode *Best = nullptr; unsigned BestMaxNest = MaxNest; for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { unsigned MyNestLevel = NestLevel; @@ -497,10 +498,10 @@ FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest, N = N->getOperand(i).getNode(); goto found_chain_operand; } - return 0; + return nullptr; found_chain_operand:; if (N->getOpcode() == ISD::EntryToken) - return 0; + return nullptr; } } @@ -742,8 +743,8 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] == SU) { assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); --NumLiveRegs; - LiveRegDefs[I->getReg()] = NULL; - LiveRegGens[I->getReg()] = NULL; + LiveRegDefs[I->getReg()] = nullptr; + LiveRegGens[I->getReg()] = nullptr; releaseInterferences(I->getReg()); } } @@ -757,8 +758,8 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) { assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); --NumLiveRegs; - LiveRegDefs[CallResource] = NULL; - LiveRegGens[CallResource] = NULL; + LiveRegDefs[CallResource] = nullptr; + LiveRegGens[CallResource] = nullptr; releaseInterferences(CallResource); } } @@ -813,8 +814,8 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { assert(LiveRegDefs[I->getReg()] == I->getSUnit() && "Physical register dependency violated?"); --NumLiveRegs; - LiveRegDefs[I->getReg()] = NULL; - LiveRegGens[I->getReg()] = NULL; + LiveRegDefs[I->getReg()] = nullptr; + LiveRegGens[I->getReg()] = nullptr; releaseInterferences(I->getReg()); } } @@ -841,8 +842,8 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) { assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!"); --NumLiveRegs; - LiveRegDefs[CallResource] = NULL; - LiveRegGens[CallResource] = NULL; + LiveRegDefs[CallResource] = nullptr; + LiveRegGens[CallResource] = nullptr; releaseInterferences(CallResource); } } @@ -855,7 +856,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { // This becomes the nearest def. Note that an earlier def may still be // pending if this is a two-address node. 
LiveRegDefs[I->getReg()] = SU; - if (LiveRegGens[I->getReg()] == NULL || + if (LiveRegGens[I->getReg()] == nullptr || I->getSUnit()->getHeight() < LiveRegGens[I->getReg()]->getHeight()) LiveRegGens[I->getReg()] = I->getSUnit(); } @@ -936,17 +937,17 @@ static bool isOperandOf(const SUnit *SU, SDNode *N) { SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { SDNode *N = SU->getNode(); if (!N) - return NULL; + return nullptr; if (SU->getNode()->getGluedNode()) - return NULL; + return nullptr; SUnit *NewSU; bool TryUnfold = false; for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { EVT VT = N->getValueType(i); if (VT == MVT::Glue) - return NULL; + return nullptr; else if (VT == MVT::Other) TryUnfold = true; } @@ -954,18 +955,18 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { const SDValue &Op = N->getOperand(i); EVT VT = Op.getNode()->getValueType(Op.getResNo()); if (VT == MVT::Glue) - return NULL; + return nullptr; } if (TryUnfold) { SmallVector NewNodes; if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes)) - return NULL; + return nullptr; // unfolding an x86 DEC64m operation results in store, dec, load which // can't be handled here so quit if (NewNodes.size() == 3) - return NULL; + return nullptr; DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n"); assert(NewNodes.size() == 2 && "Expected a load folding node!"); @@ -1136,11 +1137,11 @@ void ScheduleDAGRRList::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg, const TargetRegisterClass *DestRC, const TargetRegisterClass *SrcRC, SmallVectorImpl &Copies) { - SUnit *CopyFromSU = CreateNewSUnit(NULL); + SUnit *CopyFromSU = CreateNewSUnit(nullptr); CopyFromSU->CopySrcRC = SrcRC; CopyFromSU->CopyDstRC = DestRC; - SUnit *CopyToSU = CreateNewSUnit(NULL); + SUnit *CopyToSU = CreateNewSUnit(nullptr); CopyToSU->CopySrcRC = DestRC; CopyToSU->CopyDstRC = SrcRC; @@ -1244,7 +1245,7 @@ static const uint32_t *getNodeRegMask(const SDNode *N) { if (const RegisterMaskSDNode *Op = dyn_cast(N->getOperand(i).getNode())) return Op->getRegMask(); - return NULL; + return nullptr; } /// DelayForLiveRegsBottomUp - Returns true if it is necessary to delay @@ -1355,7 +1356,7 @@ void ScheduleDAGRRList::releaseInterferences(unsigned Reg) { /// (2) No Hazards: resources are available /// (3) No Interferences: may unschedule to break register interferences. SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() { - SUnit *CurSU = AvailableQueue->empty() ? 0 : AvailableQueue->pop(); + SUnit *CurSU = AvailableQueue->empty() ? nullptr : AvailableQueue->pop(); while (CurSU) { SmallVector LRegs; if (!DelayForLiveRegsBottomUp(CurSU, LRegs)) @@ -1389,7 +1390,7 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() { // Try unscheduling up to the point where it's safe to schedule // this node. - SUnit *BtSU = NULL; + SUnit *BtSU = nullptr; unsigned LiveCycle = UINT_MAX; for (unsigned j = 0, ee = LRegs.size(); j != ee; ++j) { unsigned Reg = LRegs[j]; @@ -1449,7 +1450,7 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() { // expensive. // If cross copy register class is null, then it's not possible to copy // the value at all. 
- SUnit *NewDef = 0; + SUnit *NewDef = nullptr; if (DestRC != RC) { NewDef = CopyAndMoveSuccessors(LRDef); if (!DestRC && !NewDef) @@ -1646,7 +1647,7 @@ public: const TargetLowering *tli) : SchedulingPriorityQueue(hasReadyFilter), CurQueueId(0), TracksRegPressure(tracksrp), SrcOrder(srcorder), - MF(mf), TII(tii), TRI(tri), TLI(tli), scheduleDAG(NULL) { + MF(mf), TII(tii), TRI(tri), TLI(tli), scheduleDAG(nullptr) { if (TracksRegPressure) { unsigned NumRC = TRI->getNumRegClasses(); RegLimit.resize(NumRC); @@ -1674,7 +1675,7 @@ public: void updateNode(const SUnit *SU) override; void releaseState() override { - SUnits = 0; + SUnits = nullptr; SethiUllmanNumbers.clear(); std::fill(RegPressure.begin(), RegPressure.end(), 0); } @@ -1775,7 +1776,7 @@ public: } SUnit *pop() override { - if (Queue.empty()) return NULL; + if (Queue.empty()) return nullptr; SUnit *V = popFromQueue(Queue, Picker, scheduleDAG); V->NodeQueueId = 0; @@ -1783,7 +1784,7 @@ public: } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void dump(ScheduleDAG *DAG) const { + void dump(ScheduleDAG *DAG) const override { // Emulate pop() without clobbering NodeQueueIds. std::vector DumpQueue = Queue; SF DumpPicker = Picker; @@ -2824,7 +2825,7 @@ void RegReductionPQBase::PrescheduleNodesWithMultipleUses() { continue; // Locate the single data predecessor. - SUnit *PredSU = 0; + SUnit *PredSU = nullptr; for (SUnit::const_pred_iterator II = SU->Preds.begin(), EE = SU->Preds.end(); II != EE; ++II) if (!II->isCtrl()) { @@ -2980,7 +2981,7 @@ llvm::createBURRListDAGScheduler(SelectionDAGISel *IS, const TargetRegisterInfo *TRI = TM.getRegisterInfo(); BURegReductionPriorityQueue *PQ = - new BURegReductionPriorityQueue(*IS->MF, false, false, TII, TRI, 0); + new BURegReductionPriorityQueue(*IS->MF, false, false, TII, TRI, nullptr); ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, PQ, OptLevel); PQ->setScheduleDAG(SD); return SD; @@ -2994,7 +2995,7 @@ llvm::createSourceListDAGScheduler(SelectionDAGISel *IS, const TargetRegisterInfo *TRI = TM.getRegisterInfo(); SrcRegReductionPriorityQueue *PQ = - new SrcRegReductionPriorityQueue(*IS->MF, false, true, TII, TRI, 0); + new SrcRegReductionPriorityQueue(*IS->MF, false, true, TII, TRI, nullptr); ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, PQ, OptLevel); PQ->setScheduleDAG(SD); return SD; diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 5639894..de910b7 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "pre-RA-sched" #include "ScheduleDAGSDNodes.h" #include "InstrEmitter.h" #include "SDNodeDbgValue.h" @@ -35,6 +34,8 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; +#define DEBUG_TYPE "pre-RA-sched" + STATISTIC(LoadsClustered, "Number of loads clustered together"); // This allows latency based scheduler to notice high latency instructions @@ -46,7 +47,7 @@ static cl::opt HighLatencyCycles( "instructions take for targets with no itinerary")); ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf) - : ScheduleDAG(mf), BB(0), DAG(0), + : ScheduleDAG(mf), BB(nullptr), DAG(nullptr), InstrItins(mf.getTarget().getInstrItineraryData()) {} /// Run - perform scheduling. 
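// A subtler fix above: dump() in the register-reduction priority queue now
// carries the C++11 `override` specifier, matching its sibling pop(). With
// override, any drift in the base-class virtual's signature becomes a
// compile error instead of an accidental, never-called shadow. Minimal
// illustration (Base is a stand-in for the real priority-queue base class):

struct Base {
  virtual void dump() const {}
  virtual ~Base() {}
};

struct Derived : Base {
  // Fails to compile if Base::dump ever changes shape underneath us.
  void dump() const override {}
};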
@@ -67,12 +68,12 @@ void ScheduleDAGSDNodes::Run(SelectionDAG *dag, MachineBasicBlock *bb) { /// SUnit *ScheduleDAGSDNodes::newSUnit(SDNode *N) { #ifndef NDEBUG - const SUnit *Addr = 0; + const SUnit *Addr = nullptr; if (!SUnits.empty()) Addr = &SUnits[0]; #endif SUnits.push_back(SUnit(N, (unsigned)SUnits.size())); - assert((Addr == 0 || Addr == &SUnits[0]) && + assert((Addr == nullptr || Addr == &SUnits[0]) && "SUnits std::vector reallocated on the fly!"); SUnits.back().OrigNode = &SUnits.back(); SUnit *SU = &SUnits.back(); @@ -142,8 +143,8 @@ static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG, if (ExtraOper.getNode()) Ops.push_back(ExtraOper); - SDVTList VTList = DAG->getVTList(&VTs[0], VTs.size()); - MachineSDNode::mmo_iterator Begin = 0, End = 0; + SDVTList VTList = DAG->getVTList(VTs); + MachineSDNode::mmo_iterator Begin = nullptr, End = nullptr; MachineSDNode *MN = dyn_cast(N); // Store memory references. @@ -152,7 +153,7 @@ static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG, End = MN->memoperands_end(); } - DAG->MorphNodeTo(N, N->getOpcode(), VTList, &Ops[0], Ops.size()); + DAG->MorphNodeTo(N, N->getOpcode(), VTList, Ops); // Reset the memory references if (MN) @@ -205,7 +206,7 @@ static void RemoveUnusedGlue(SDNode *N, SelectionDAG *DAG) { /// outputs to ensure they are scheduled together and in order. This /// optimization may benefit some targets by improving cache locality. void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) { - SDNode *Chain = 0; + SDNode *Chain = nullptr; unsigned NumOps = Node->getNumOperands(); if (Node->getOperand(NumOps-1).getValueType() == MVT::Other) Chain = Node->getOperand(NumOps-1).getNode(); @@ -219,8 +220,11 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) { DenseMap O2SMap; // Map from offset to SDNode. bool Cluster = false; SDNode *Base = Node; + // This algorithm requires a reasonably low use count before finding a match + // to avoid uselessly blowing up compile time in large blocks. + unsigned UseCount = 0; for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end(); - I != E; ++I) { + I != E && UseCount < 100; ++I, ++UseCount) { SDNode *User = *I; if (User == Node || !Visited.insert(User)) continue; @@ -237,6 +241,8 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) { if (Offset2 < Offset1) Base = User; Cluster = true; + // Reset UseCount to allow more matches. + UseCount = 0; } if (!Cluster) @@ -266,7 +272,7 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) { // Cluster loads by adding MVT::Glue outputs and inputs. This also // ensure they are scheduled in order of increasing addresses. SDNode *Lead = Loads[0]; - SDValue InGlue = SDValue(0, 0); + SDValue InGlue = SDValue(nullptr, 0); if (AddGlue(Lead, InGlue, true, DAG)) InGlue = SDValue(Lead, Lead->getNumValues() - 1); for (unsigned I = 1, E = Loads.size(); I != E; ++I) { @@ -567,7 +573,7 @@ void ScheduleDAGSDNodes::RegDefIter::Advance() { return; // Found a normal regdef. } Node = Node->getGluedNode(); - if (Node == NULL) { + if (!Node) { return; // No values left to visit. } InitNodeNumDefs(); @@ -740,7 +746,7 @@ ProcessSourceNode(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter, // BB->back().isPHI() test will not fire when we want it to. std::prev(Emitter.getInsertPos())->isPHI()) { // Did not insert any instruction. 
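// ClusterNeighboringLoads above gains a simple compile-time throttle: give
// up after scanning 100 chain uses without a match, but refund the whole
// budget whenever a clusterable load is found. That bounds the worst case
// on huge blocks while leaving productive scans effectively unlimited.
// Standalone sketch of the scan shape (the predicate is a stand-in):

#include <cstddef>
#include <vector>

static int countMatchesThrottled(const std::vector<int> &Uses,
                                 bool (*IsMatch)(int)) {
  int Matches = 0;
  unsigned Budget = 0;
  for (std::size_t I = 0; I != Uses.size() && Budget < 100; ++I, ++Budget) {
    if (IsMatch(Uses[I])) {
      ++Matches;
      Budget = 0; // reset on success, exactly as the patch resets UseCount
    }
  }
  return Matches;
}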
- Orders.push_back(std::make_pair(Order, (MachineInstr*)0)); + Orders.push_back(std::make_pair(Order, (MachineInstr*)nullptr)); return; } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 5e11dbb..39ebadf 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -139,7 +139,7 @@ namespace llvm { public: RegDefIter(const SUnit *SU, const ScheduleDAGSDNodes *SD); - bool IsValid() const { return Node != NULL; } + bool IsValid() const { return Node != nullptr; } MVT GetValue() const { assert(IsValid() && "bad iterator"); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp index fb86103..51c51d6 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp @@ -18,7 +18,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "pre-RA-sched" #include "llvm/CodeGen/SchedulerRegistry.h" #include "ScheduleDAGSDNodes.h" #include "llvm/ADT/Statistic.h" @@ -35,6 +34,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "pre-RA-sched" + STATISTIC(NumNoops , "Number of noops inserted"); STATISTIC(NumStalls, "Number of pipeline stalls"); @@ -120,7 +121,7 @@ void ScheduleDAGVLIW::releaseSucc(SUnit *SU, const SDep &D) { dbgs() << "*** Scheduling failed! ***\n"; SuccSU->dump(this); dbgs() << " has been released too many times!\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } #endif assert(!D.isWeak() && "unexpected artificial DAG edge"); @@ -204,12 +205,12 @@ void ScheduleDAGVLIW::listScheduleTopDown() { // don't advance the hazard recognizer. if (AvailableQueue->empty()) { // Reset DFA state. - AvailableQueue->scheduledNode(0); + AvailableQueue->scheduledNode(nullptr); ++CurCycle; continue; } - SUnit *FoundSUnit = 0; + SUnit *FoundSUnit = nullptr; bool HasNoopHazards = false; while (!AvailableQueue->empty()) { @@ -256,7 +257,7 @@ void ScheduleDAGVLIW::listScheduleTopDown() { // processors without pipeline interlocks and other cases. DEBUG(dbgs() << "*** Emitting noop\n"); HazardRec->EmitNoop(); - Sequence.push_back(0); // NULL here means noop + Sequence.push_back(nullptr); // NULL here means noop ++NumNoops; ++CurCycle; } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index d11ce80..b1b8035 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -364,29 +364,28 @@ static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) { /// AddNodeIDOperands - Various routines for adding operands to the NodeID data. /// static void AddNodeIDOperands(FoldingSetNodeID &ID, - const SDValue *Ops, unsigned NumOps) { - for (; NumOps; --NumOps, ++Ops) { - ID.AddPointer(Ops->getNode()); - ID.AddInteger(Ops->getResNo()); + ArrayRef Ops) { + for (auto& Op : Ops) { + ID.AddPointer(Op.getNode()); + ID.AddInteger(Op.getResNo()); } } /// AddNodeIDOperands - Various routines for adding operands to the NodeID data. 
/// static void AddNodeIDOperands(FoldingSetNodeID &ID, - const SDUse *Ops, unsigned NumOps) { - for (; NumOps; --NumOps, ++Ops) { - ID.AddPointer(Ops->getNode()); - ID.AddInteger(Ops->getResNo()); + ArrayRef Ops) { + for (auto& Op : Ops) { + ID.AddPointer(Op.getNode()); + ID.AddInteger(Op.getResNo()); } } -static void AddNodeIDNode(FoldingSetNodeID &ID, - unsigned short OpC, SDVTList VTList, - const SDValue *OpList, unsigned N) { +static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC, + SDVTList VTList, ArrayRef OpList) { AddNodeIDOpcode(ID, OpC); AddNodeIDValueTypes(ID, VTList); - AddNodeIDOperands(ID, OpList, N); + AddNodeIDOperands(ID, OpList); } /// AddNodeIDCustom - If this is an SDNode with special info, add this info to @@ -528,7 +527,7 @@ static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) { // Add the return value info. AddNodeIDValueTypes(ID, N->getVTList()); // Add the operand info. - AddNodeIDOperands(ID, N->op_begin(), N->getNumOperands()); + AddNodeIDOperands(ID, makeArrayRef(N->op_begin(), N->op_end())); // Handle SDNode leafs with special info. AddNodeIDCustom(ID, N); @@ -606,7 +605,7 @@ void SelectionDAG::RemoveDeadNodes(SmallVectorImpl &DeadNodes) { SDNode *N = DeadNodes.pop_back_val(); for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) - DUL->NodeDeleted(N, 0); + DUL->NodeDeleted(N, nullptr); // Take the node out of the appropriate CSE map. RemoveNodeFromCSEMaps(N); @@ -684,8 +683,8 @@ bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) { case ISD::CONDCODE: assert(CondCodeNodes[cast(N)->get()] && "Cond code doesn't exist!"); - Erased = CondCodeNodes[cast(N)->get()] != 0; - CondCodeNodes[cast(N)->get()] = 0; + Erased = CondCodeNodes[cast(N)->get()] != nullptr; + CondCodeNodes[cast(N)->get()] = nullptr; break; case ISD::ExternalSymbol: Erased = ExternalSymbols.erase(cast(N)->getSymbol()); @@ -702,8 +701,8 @@ bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) { if (VT.isExtended()) { Erased = ExtendedValueTypeNodes.erase(VT); } else { - Erased = ValueTypeNodes[VT.getSimpleVT().SimpleTy] != 0; - ValueTypeNodes[VT.getSimpleVT().SimpleTy] = 0; + Erased = ValueTypeNodes[VT.getSimpleVT().SimpleTy] != nullptr; + ValueTypeNodes[VT.getSimpleVT().SimpleTy] = nullptr; } break; } @@ -765,11 +764,11 @@ SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) { SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op, void *&InsertPos) { if (doNotCSE(N)) - return 0; + return nullptr; SDValue Ops[] = { Op }; FoldingSetNodeID ID; - AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, 1); + AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops); AddNodeIDCustom(ID, N); SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos); return Node; @@ -783,11 +782,11 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op1, SDValue Op2, void *&InsertPos) { if (doNotCSE(N)) - return 0; + return nullptr; SDValue Ops[] = { Op1, Op2 }; FoldingSetNodeID ID; - AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, 2); + AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops); AddNodeIDCustom(ID, N); SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos); return Node; @@ -798,14 +797,13 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, /// were replaced with those specified. If this node is never memoized, /// return null, otherwise return a pointer to the slot it would take. If a /// node already exists with these operands, the slot will be non-null. 
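// In the CSE code below, leaf nodes that previously passed a (0, 0)
// pointer/length pair for "no operands" now pass llvm::None, which converts
// to an empty ArrayRef carrying no pointer at all. Standalone sketch of
// that conversion:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include <cstddef>

static std::size_t emptyOperandList() {
  llvm::ArrayRef<int> Ops = llvm::None; // null data, zero length
  return Ops.size();                    // 0
}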
-SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, - const SDValue *Ops,unsigned NumOps, +SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef Ops, void *&InsertPos) { if (doNotCSE(N)) - return 0; + return nullptr; FoldingSetNodeID ID; - AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, NumOps); + AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops); AddNodeIDCustom(ID, N); SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos); return Node; @@ -901,10 +899,10 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const { // EntryNode could meaningfully have debug info if we can find it... SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) - : TM(tm), TSI(*tm.getSelectionDAGInfo()), TLI(0), OptLevel(OL), + : TM(tm), TSI(*tm.getSelectionDAGInfo()), TLI(nullptr), OptLevel(OL), EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)), Root(getEntryNode()), NewNodesMustHaveLegalTypes(false), - UpdateListeners(0) { + UpdateListeners(nullptr) { AllNodes.push_back(&EntryNode); DbgInfo = new SDDbgInfo(); } @@ -937,11 +935,11 @@ void SelectionDAG::clear() { ExternalSymbols.clear(); TargetExternalSymbols.clear(); std::fill(CondCodeNodes.begin(), CondCodeNodes.end(), - static_cast(0)); + static_cast(nullptr)); std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(), - static_cast(0)); + static_cast(nullptr)); - EntryNode.UseList = 0; + EntryNode.UseList = nullptr; AllNodes.push_back(&EntryNode); Root = getEntryNode(); DbgInfo->clear(); @@ -965,6 +963,14 @@ SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) { getNode(ISD::TRUNCATE, DL, VT, Op); } +SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, SDLoc SL, EVT VT) { + if (VT.bitsLE(Op.getValueType())) + return getNode(ISD::TRUNCATE, SL, VT, Op); + + TargetLowering::BooleanContent BType = TLI->getBooleanContents(VT.isVector()); + return getNode(TLI->getExtendForContent(BType), SL, VT, Op); +} + SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, SDLoc DL, EVT VT) { assert(!VT.isVector() && "getZeroExtendInReg should use the vector element type instead of " @@ -986,6 +992,22 @@ SDValue SelectionDAG::getNOT(SDLoc DL, SDValue Val, EVT VT) { return getNode(ISD::XOR, DL, VT, Val, NegOne); } +SDValue SelectionDAG::getLogicalNOT(SDLoc DL, SDValue Val, EVT VT) { + EVT EltVT = VT.getScalarType(); + SDValue TrueValue; + switch (TLI->getBooleanContents(VT.isVector())) { + case TargetLowering::ZeroOrOneBooleanContent: + case TargetLowering::UndefinedBooleanContent: + TrueValue = getConstant(1, VT); + break; + case TargetLowering::ZeroOrNegativeOneBooleanContent: + TrueValue = getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), + VT); + break; + } + return getNode(ISD::XOR, DL, VT, Val, TrueValue); +} + SDValue SelectionDAG::getConstant(uint64_t Val, EVT VT, bool isT, bool isO) { EVT EltVT = VT.getScalarType(); assert((EltVT.getSizeInBits() >= 64 || @@ -1063,7 +1085,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT, SDValue Result = getNode(ISD::BITCAST, SDLoc(), VT, getNode(ISD::BUILD_VECTOR, SDLoc(), ViaVecVT, - &Ops[0], Ops.size())); + Ops)); return Result; } @@ -1071,11 +1093,11 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT, "APInt size does not match type size!"); unsigned Opc = isT ? 
ISD::TargetConstant : ISD::Constant; FoldingSetNodeID ID; - AddNodeIDNode(ID, Opc, getVTList(EltVT), 0, 0); + AddNodeIDNode(ID, Opc, getVTList(EltVT), None); ID.AddPointer(Elt); ID.AddBoolean(isO); - void *IP = 0; - SDNode *N = NULL; + void *IP = nullptr; + SDNode *N = nullptr; if ((N = CSEMap.FindNodeOrInsertPos(ID, IP))) if (!VT.isVector()) return SDValue(N, 0); @@ -1090,7 +1112,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT, if (VT.isVector()) { SmallVector Ops; Ops.assign(VT.getVectorNumElements(), Result); - Result = getNode(ISD::BUILD_VECTOR, SDLoc(), VT, &Ops[0], Ops.size()); + Result = getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Ops); } return Result; } @@ -1114,10 +1136,10 @@ SDValue SelectionDAG::getConstantFP(const ConstantFP& V, EVT VT, bool isTarget){ // we don't have issues with SNANs. unsigned Opc = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP; FoldingSetNodeID ID; - AddNodeIDNode(ID, Opc, getVTList(EltVT), 0, 0); + AddNodeIDNode(ID, Opc, getVTList(EltVT), None); ID.AddPointer(&V); - void *IP = 0; - SDNode *N = NULL; + void *IP = nullptr; + SDNode *N = nullptr; if ((N = CSEMap.FindNodeOrInsertPos(ID, IP))) if (!VT.isVector()) return SDValue(N, 0); @@ -1133,7 +1155,7 @@ SDValue SelectionDAG::getConstantFP(const ConstantFP& V, EVT VT, bool isTarget){ SmallVector Ops; Ops.assign(VT.getVectorNumElements(), Result); // FIXME SDLoc info might be appropriate here - Result = getNode(ISD::BUILD_VECTOR, SDLoc(), VT, &Ops[0], Ops.size()); + Result = getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Ops); } return Result; } @@ -1172,7 +1194,7 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL, if (!GVar) { // If GV is an alias then use the aliasee for determining thread-localness. if (const GlobalAlias *GA = dyn_cast(GV)) - GVar = dyn_cast_or_null(GA->getAliasedGlobal()); + GVar = dyn_cast_or_null(GA->getAliasee()); } unsigned Opc; @@ -1182,12 +1204,12 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL, Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress; FoldingSetNodeID ID; - AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0); + AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddPointer(GV); ID.AddInteger(Offset); ID.AddInteger(TargetFlags); ID.AddInteger(GV->getType()->getAddressSpace()); - void *IP = 0; + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -1202,9 +1224,9 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL, SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) { unsigned Opc = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex; FoldingSetNodeID ID; - AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0); + AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(FI); - void *IP = 0; + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -1220,10 +1242,10 @@ SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget, "Cannot set target flags on target-independent jump tables"); unsigned Opc = isTarget ? 
ISD::TargetJumpTable : ISD::JumpTable; FoldingSetNodeID ID; - AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0); + AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(JTI); ID.AddInteger(TargetFlags); - void *IP = 0; + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -1245,12 +1267,12 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT, TM.getTargetLowering()->getDataLayout()->getPrefTypeAlignment(C->getType()); unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool; FoldingSetNodeID ID; - AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0); + AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(Alignment); ID.AddInteger(Offset); ID.AddPointer(C); ID.AddInteger(TargetFlags); - void *IP = 0; + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -1273,12 +1295,12 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT, TM.getTargetLowering()->getDataLayout()->getPrefTypeAlignment(C->getType()); unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool; FoldingSetNodeID ID; - AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0); + AddNodeIDNode(ID, Opc, getVTList(VT), None); ID.AddInteger(Alignment); ID.AddInteger(Offset); C->addSelectionDAGCSEId(ID); ID.AddInteger(TargetFlags); - void *IP = 0; + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -1292,11 +1314,11 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT, SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset, unsigned char TargetFlags) { FoldingSetNodeID ID; - AddNodeIDNode(ID, ISD::TargetIndex, getVTList(VT), 0, 0); + AddNodeIDNode(ID, ISD::TargetIndex, getVTList(VT), None); ID.AddInteger(Index); ID.AddInteger(Offset); ID.AddInteger(TargetFlags); - void *IP = 0; + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -1309,9 +1331,9 @@ SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset, SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) { FoldingSetNodeID ID; - AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), 0, 0); + AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), None); ID.AddPointer(MBB); - void *IP = 0; + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -1358,7 +1380,7 @@ SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) { if ((unsigned)Cond >= CondCodeNodes.size()) CondCodeNodes.resize(Cond+1); - if (CondCodeNodes[Cond] == 0) { + if (!CondCodeNodes[Cond]) { CondCodeSDNode *N = new (NodeAllocator) CondCodeSDNode(Cond); CondCodeNodes[Cond] = N; AllNodes.push_back(N); @@ -1441,13 +1463,18 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1, if (Identity && NElts) return N1; + // Shuffling a constant splat doesn't change the result. 
+  if (N2Undef && N1.getOpcode() == ISD::BUILD_VECTOR)
+    if (cast<BuildVectorSDNode>(N1)->getConstantSplatValue())
+      return N1;
+
   FoldingSetNodeID ID;
   SDValue Ops[2] = { N1, N2 };
-  AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops, 2);
+  AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops);
   for (unsigned i = 0; i != NElts; ++i)
     ID.AddInteger(MaskVec[i]);
 
-  void* IP = 0;
+  void* IP = nullptr;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
@@ -1478,14 +1505,14 @@ SDValue SelectionDAG::getConvertRndSat(EVT VT, SDLoc dl,
 
   FoldingSetNodeID ID;
   SDValue Ops[] = { Val, DTy, STy, Rnd, Sat };
-  AddNodeIDNode(ID, ISD::CONVERT_RNDSAT, getVTList(VT), &Ops[0], 5);
-  void* IP = 0;
+  AddNodeIDNode(ID, ISD::CONVERT_RNDSAT, getVTList(VT), Ops);
+  void* IP = nullptr;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
   CvtRndSatSDNode *N = new (NodeAllocator) CvtRndSatSDNode(VT, dl.getIROrder(),
                                                            dl.getDebugLoc(),
-                                                           Ops, 5, Code);
+                                                           Ops, Code);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
   return SDValue(N, 0);
@@ -1493,9 +1520,9 @@ SDValue SelectionDAG::getConvertRndSat(EVT VT, SDLoc dl,
 
 SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, ISD::Register, getVTList(VT), 0, 0);
+  AddNodeIDNode(ID, ISD::Register, getVTList(VT), None);
   ID.AddInteger(RegNo);
-  void *IP = 0;
+  void *IP = nullptr;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
@@ -1507,9 +1534,9 @@ SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
 
 SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) {
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, ISD::RegisterMask, getVTList(MVT::Untyped), 0, 0);
+  AddNodeIDNode(ID, ISD::RegisterMask, getVTList(MVT::Untyped), None);
   ID.AddPointer(RegMask);
-  void *IP = 0;
+  void *IP = nullptr;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
@@ -1522,9 +1549,9 @@ SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) {
 
 SDValue SelectionDAG::getEHLabel(SDLoc dl, SDValue Root, MCSymbol *Label) {
   FoldingSetNodeID ID;
   SDValue Ops[] = { Root };
-  AddNodeIDNode(ID, ISD::EH_LABEL, getVTList(MVT::Other), &Ops[0], 1);
+  AddNodeIDNode(ID, ISD::EH_LABEL, getVTList(MVT::Other), Ops);
   ID.AddPointer(Label);
-  void *IP = 0;
+  void *IP = nullptr;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
@@ -1543,11 +1570,11 @@ SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
 
   unsigned Opc = isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress;
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+  AddNodeIDNode(ID, Opc, getVTList(VT), None);
   ID.AddPointer(BA);
   ID.AddInteger(Offset);
   ID.AddInteger(TargetFlags);
-  void *IP = 0;
+  void *IP = nullptr;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
@@ -1563,10 +1590,10 @@ SDValue SelectionDAG::getSrcValue(const Value *V) {
          "SrcValue is not a pointer?");
 
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), 0, 0);
+  AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), None);
   ID.AddPointer(V);
 
-  void *IP = 0;
+  void *IP = nullptr;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
@@ -1579,10 +1606,10 @@ SDValue SelectionDAG::getSrcValue(const Value *V) {
 
 /// getMDNode - Return an MDNodeSDNode which holds an MDNode.
SDValue SelectionDAG::getMDNode(const MDNode *MD) { FoldingSetNodeID ID; - AddNodeIDNode(ID, ISD::MDNODE_SDNODE, getVTList(MVT::Other), 0, 0); + AddNodeIDNode(ID, ISD::MDNODE_SDNODE, getVTList(MVT::Other), None); ID.AddPointer(MD); - void *IP = 0; + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -1597,11 +1624,11 @@ SDValue SelectionDAG::getAddrSpaceCast(SDLoc dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS) { SDValue Ops[] = {Ptr}; FoldingSetNodeID ID; - AddNodeIDNode(ID, ISD::ADDRSPACECAST, getVTList(VT), &Ops[0], 1); + AddNodeIDNode(ID, ISD::ADDRSPACECAST, getVTList(VT), Ops); ID.AddInteger(SrcAS); ID.AddInteger(DestAS); - void *IP = 0; + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -1780,17 +1807,14 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const { bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth) const { APInt KnownZero, KnownOne; - ComputeMaskedBits(Op, KnownZero, KnownOne, Depth); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(Op, KnownZero, KnownOne, Depth); return (KnownZero & Mask) == Mask; } -/// ComputeMaskedBits - Determine which of the bits specified in Mask are -/// known to be either zero or one and return them in the KnownZero/KnownOne -/// bitsets. This code only analyzes bits in Mask, in order to short-circuit -/// processing. -void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, - APInt &KnownOne, unsigned Depth) const { +/// Determine which bits of Op are known to be either zero or one and return +/// them in the KnownZero/KnownOne bitsets. +void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero, + APInt &KnownOne, unsigned Depth) const { const TargetLowering *TLI = TM.getTargetLowering(); unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits(); @@ -1805,48 +1829,40 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, // We know all of the bits for a constant! KnownOne = cast(Op)->getAPIntValue(); KnownZero = ~KnownOne; - return; + break; case ISD::AND: // If either the LHS or the RHS are Zero, the result is zero. - ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); - ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); // Output known-1 bits are only known if set in both the LHS & RHS. KnownOne &= KnownOne2; // Output known-0 are known to be clear if zero in either the LHS | RHS. KnownZero |= KnownZero2; - return; + break; case ISD::OR: - ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); - ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); // Output known-0 bits are only known if clear in both the LHS & RHS. KnownZero &= KnownZero2; // Output known-1 are known to be set if set in either the LHS | RHS. 
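// Aside: the AND/OR cases above combine the operands' known bits with
// simple lattice rules -- for AND, a bit is known one only if it is known
// one on both sides, and known zero if it is known zero on either side;
// OR is the dual. A minimal self-contained sketch of those two transfer
// functions on 64-bit masks (this KnownBits struct is illustrative; the
// code in the patch carries two APInts instead):

#include <cassert>
#include <cstdint>

struct KnownBits {
  uint64_t Zero; // bits known to be 0
  uint64_t One;  // bits known to be 1
};

// Mirrors the ISD::AND case: KnownOne &= KnownOne2; KnownZero |= KnownZero2.
KnownBits knownAnd(KnownBits L, KnownBits R) {
  return {L.Zero | R.Zero, L.One & R.One};
}

// Mirrors the ISD::OR case: KnownZero &= KnownZero2; KnownOne |= KnownOne2.
KnownBits knownOr(KnownBits L, KnownBits R) {
  return {L.Zero & R.Zero, L.One | R.One};
}

int main() {
  KnownBits A{0x0F, 0xF0}; // low nibble known 0, high nibble known 1
  KnownBits B{0xF0, 0x0F}; // the reverse
  assert(knownAnd(A, B).Zero == 0xFF); // A & B is known all-zero
  assert(knownOr(A, B).One == 0xFF);   // A | B is known all-one
  return 0;
}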
KnownOne |= KnownOne2; - return; + break; case ISD::XOR: { - ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); - ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); // Output known-0 bits are known if clear or set in both the LHS & RHS. APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2); // Output known-1 are known to be set if set in only one of the LHS, RHS. KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2); KnownZero = KnownZeroOut; - return; + break; } case ISD::MUL: { - ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); - ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); // If low bits are zero in either operand, output low known-0 bits. // Also compute a conserative estimate for high known-0 bits. @@ -1863,46 +1879,42 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, LeadZ = std::min(LeadZ, BitWidth); KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) | APInt::getHighBitsSet(BitWidth, LeadZ); - return; + break; } case ISD::UDIV: { // For the purposes of computing leading zeros we can conservatively // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. - ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); unsigned LeadZ = KnownZero2.countLeadingOnes(); KnownOne2.clearAllBits(); KnownZero2.clearAllBits(); - ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); + computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros(); if (RHSUnknownLeadingOnes != BitWidth) LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ); - return; + break; } case ISD::SELECT: - ComputeMaskedBits(Op.getOperand(2), KnownZero, KnownOne, Depth+1); - ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(Op.getOperand(2), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); // Only known if known in both the LHS and RHS. KnownOne &= KnownOne2; KnownZero &= KnownZero2; - return; + break; case ISD::SELECT_CC: - ComputeMaskedBits(Op.getOperand(3), KnownZero, KnownOne, Depth+1); - ComputeMaskedBits(Op.getOperand(2), KnownZero2, KnownOne2, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(Op.getOperand(3), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(2), KnownZero2, KnownOne2, Depth+1); // Only known if known in both the LHS and RHS. 
KnownOne &= KnownOne2; KnownZero &= KnownZero2; - return; + break; case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: @@ -1910,14 +1922,14 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, case ISD::SMULO: case ISD::UMULO: if (Op.getResNo() != 1) - return; + break; // The boolean result conforms to getBooleanContents. Fall through. case ISD::SETCC: // If we know the result of a setcc has the top bits zero, use this info. if (TLI->getBooleanContents(Op.getValueType().isVector()) == TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1) KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); - return; + break; case ISD::SHL: // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0 if (ConstantSDNode *SA = dyn_cast(Op.getOperand(1))) { @@ -1925,16 +1937,15 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, // If the shift count is an invalid immediate, don't do anything. if (ShAmt >= BitWidth) - return; + break; - ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero <<= ShAmt; KnownOne <<= ShAmt; // low bits known zero. KnownZero |= APInt::getLowBitsSet(BitWidth, ShAmt); } - return; + break; case ISD::SRL: // (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0 if (ConstantSDNode *SA = dyn_cast(Op.getOperand(1))) { @@ -1942,31 +1953,29 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, // If the shift count is an invalid immediate, don't do anything. if (ShAmt >= BitWidth) - return; + break; - ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.lshr(ShAmt); KnownOne = KnownOne.lshr(ShAmt); APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt); KnownZero |= HighBits; // High bits known zero. } - return; + break; case ISD::SRA: if (ConstantSDNode *SA = dyn_cast(Op.getOperand(1))) { unsigned ShAmt = SA->getZExtValue(); // If the shift count is an invalid immediate, don't do anything. if (ShAmt >= BitWidth) - return; + break; // If any of the demanded bits are produced by the sign extension, we also // demand the input sign bit. APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt); - ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.lshr(ShAmt); KnownOne = KnownOne.lshr(ShAmt); @@ -1980,7 +1989,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, KnownOne |= HighBits; // New bits are known one. } } - return; + break; case ISD::SIGN_EXTEND_INREG: { EVT EVT = cast(Op.getOperand(1))->getVT(); unsigned EBits = EVT.getScalarType().getSizeInBits(); @@ -1998,10 +2007,9 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, if (NewBits.getBoolValue()) InputDemandedBits |= InSignBit; - ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownOne &= InputDemandedBits; KnownZero &= InputDemandedBits; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); // If the sign bit of the input is known set or clear, then we know the // top bits of the result. 
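// Aside: the SHL/SRL cases above reduce to shifting both known-bit masks
// by the constant amount and then marking the vacated positions as known
// zero. A reduced sketch of the ISD::SHL rule (KnownZero <<= ShAmt;
// KnownOne <<= ShAmt; low ShAmt bits become known zero), again on
// 64-bit masks rather than APInts:

#include <cassert>
#include <cstdint>

struct KnownBits {
  uint64_t Zero, One;
};

KnownBits knownShl(KnownBits In, unsigned ShAmt) {
  assert(ShAmt < 64 && "invalid immediates are skipped in the patch");
  KnownBits Out;
  Out.Zero = (In.Zero << ShAmt) | ((1ULL << ShAmt) - 1); // vacated bits are 0
  Out.One = In.One << ShAmt;
  return Out;
}

int main() {
  KnownBits X{0x1, 0x2};  // bit 0 known zero, bit 1 known one
  KnownBits Y = knownShl(X, 4);
  assert(Y.Zero == 0x1F); // the four vacated bits plus the old bit 0
  assert(Y.One == 0x20);  // the old bit 1, now at bit 5
  return 0;
}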
@@ -2015,7 +2023,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, KnownZero &= ~NewBits; KnownOne &= ~NewBits; } - return; + break; } case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: @@ -2025,7 +2033,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, unsigned LowBits = Log2_32(BitWidth)+1; KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); KnownOne.clearAllBits(); - return; + break; } case ISD::LOAD: { LoadSDNode *LD = cast(Op); @@ -2035,9 +2043,9 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, unsigned MemBits = VT.getScalarType().getSizeInBits(); KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); } else if (const MDNode *Ranges = LD->getRanges()) { - computeMaskedBitsLoad(*Ranges, KnownZero); + computeKnownBitsLoad(*Ranges, KnownZero); } - return; + break; } case ISD::ZERO_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); @@ -2045,11 +2053,11 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits); KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); - ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); KnownZero |= NewBits; - return; + break; } case ISD::SIGN_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); @@ -2058,13 +2066,11 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); - ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); // Note if the sign bit is known to be zero or one. 
bool SignBitKnownZero = KnownZero.isNegative(); bool SignBitKnownOne = KnownOne.isNegative(); - assert(!(SignBitKnownZero && SignBitKnownOne) && - "Sign bit can't be known to be both zero and one!"); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); @@ -2074,25 +2080,24 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, KnownZero |= NewBits; else if (SignBitKnownOne) KnownOne |= NewBits; - return; + break; } case ISD::ANY_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarType().getSizeInBits(); KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); - ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); - return; + break; } case ISD::TRUNCATE: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarType().getSizeInBits(); KnownZero = KnownZero.zext(InBits); KnownOne = KnownOne.zext(InBits); - ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.trunc(BitWidth); KnownOne = KnownOne.trunc(BitWidth); break; @@ -2100,15 +2105,15 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, case ISD::AssertZext: { EVT VT = cast(Op.getOperand(1))->getVT(); APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); - ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero |= (~InMask); KnownOne &= (~KnownZero); - return; + break; } case ISD::FGETSIGN: // All bits are zero except the low bit. KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1); - return; + break; case ISD::SUB: { if (ConstantSDNode *CLHS = dyn_cast(Op.getOperand(0))) { @@ -2119,7 +2124,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros(); // NLZ can't be BitWidth with no sign bit APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1); - ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); + computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); // If all of the MaskV bits are known to be zero, then we know the // output top bits are zero, because we now know that the output is @@ -2138,18 +2143,16 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, // Output known-0 bits are known if clear or set in both the low clear bits // common to both LHS & RHS. For example, 8+(X<<3) is known to have the // low 3 bits clear. 
- ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); unsigned KnownZeroOut = KnownZero2.countTrailingOnes(); - ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); KnownZeroOut = std::min(KnownZeroOut, KnownZero2.countTrailingOnes()); if (Op.getOpcode() == ISD::ADD) { KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroOut); - return; + break; } // With ADDE, a carry bit may be added in, so we can only use this @@ -2158,14 +2161,14 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, // are known zero. if (KnownZeroOut >= 2) // ADDE KnownZero |= APInt::getBitsSet(BitWidth, 1, KnownZeroOut); - return; + break; } case ISD::SREM: if (ConstantSDNode *Rem = dyn_cast(Op.getOperand(1))) { const APInt &RA = Rem->getAPIntValue().abs(); if (RA.isPowerOf2()) { APInt LowBits = RA - 1; - ComputeMaskedBits(Op.getOperand(0), KnownZero2,KnownOne2,Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero2,KnownOne2,Depth+1); // The low bits of the first operand are unchanged by the srem. KnownZero = KnownZero2 & LowBits; @@ -2183,36 +2186,35 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?"); } } - return; + break; case ISD::UREM: { if (ConstantSDNode *Rem = dyn_cast(Op.getOperand(1))) { const APInt &RA = Rem->getAPIntValue(); if (RA.isPowerOf2()) { APInt LowBits = (RA - 1); KnownZero |= ~LowBits; - ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne,Depth+1); - assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?"); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne,Depth+1); break; } } // Since the result is less than or equal to either operand, any leading // zero bits in either operand must also exist in the result. - ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); - ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); uint32_t Leaders = std::max(KnownZero.countLeadingOnes(), KnownZero2.countLeadingOnes()); KnownOne.clearAllBits(); KnownZero = APInt::getHighBitsSet(BitWidth, Leaders); - return; + break; } case ISD::FrameIndex: case ISD::TargetFrameIndex: if (unsigned Align = InferPtrAlignment(Op)) { // The low bits are known zero if the pointer is aligned. KnownZero = APInt::getLowBitsSet(BitWidth, Log2_32(Align)); - return; + break; } break; @@ -2224,9 +2226,11 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_VOID: // Allow the target to implement this method for its nodes. - TLI->computeMaskedBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth); - return; + TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth); + break; } + + assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); } /// ComputeNumSignBits - Return the number of times the sign bit of the @@ -2300,7 +2304,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ FirstAnswer = std::min(Tmp, Tmp2); // We computed what we know about the sign bits as our first // answer. 
Now proceed to the generic code that uses
-    // ComputeMaskedBits, and pick whichever answer is better.
+    // computeKnownBits, and pick whichever answer is better.
   }
   break;
@@ -2350,7 +2354,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
     if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
       if (CRHS->isAllOnesValue()) {
         APInt KnownZero, KnownOne;
-        ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+        computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
 
         // If the input is known to be 0 or 1, the output is 0/-1, which is all
         // sign bits set.
@@ -2375,7 +2379,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
     if (ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Op.getOperand(0)))
       if (CLHS->isNullValue()) {
         APInt KnownZero, KnownOne;
-        ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+        computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
         // If the input is known to be 0 or 1, the output is 0/-1, which is all
         // sign bits set.
         if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue())
@@ -2422,14 +2426,14 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
       Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
       Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
       Op.getOpcode() == ISD::INTRINSIC_VOID) {
-    unsigned NumBits = TLI->ComputeNumSignBitsForTargetNode(Op, Depth);
+    unsigned NumBits = TLI->ComputeNumSignBitsForTargetNode(Op, *this, Depth);
     if (NumBits > 1)
       FirstAnswer = std::max(FirstAnswer, NumBits);
   }
 
   // Finally, if we can prove that the top bits of the result are 0's or 1's,
   // use this information.
   APInt KnownZero, KnownOne;
-  ComputeMaskedBits(Op, KnownZero, KnownOne, Depth);
+  computeKnownBits(Op, KnownZero, KnownOne, Depth);
   APInt Mask;
 
   if (KnownZero.isNegative()) {        // sign bit is 0
@@ -2517,8 +2521,8 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
 ///
 SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT) {
   FoldingSetNodeID ID;
-  AddNodeIDNode(ID, Opcode, getVTList(VT), 0, 0);
-  void *IP = 0;
+  AddNodeIDNode(ID, Opcode, getVTList(VT), None);
+  void *IP = nullptr;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
@@ -2789,8 +2793,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
   if (VT != MVT::Glue) { // Don't CSE flag producing nodes
     FoldingSetNodeID ID;
     SDValue Ops[1] = { Operand };
-    AddNodeIDNode(ID, Opcode, VTs, Ops, 1);
-    void *IP = 0;
+    AddNodeIDNode(ID, Opcode, VTs, Ops);
+    void *IP = nullptr;
     if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
       return SDValue(E, 0);
 
@@ -2811,6 +2815,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
 
 SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, EVT VT,
                                              SDNode *Cst1, SDNode *Cst2) {
+  // If the opcode is a target-specific ISD node, there's nothing we can
+  // do here and the operand rules may not line up with the below, so
+  // bail early.
+  if (Opcode >= ISD::BUILTIN_OP_END)
+    return SDValue();
+
   SmallVector<std::pair<ConstantSDNode *, ConstantSDNode *>, 4> Inputs;
   SmallVector<SDValue, 4> Outputs;
   EVT SVT = VT.getScalarType();
@@ -2915,13 +2925,18 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, EVT VT,
     }
   }
 
+  assert((Scalar1 && Scalar2) || (VT.getVectorNumElements() == Outputs.size() &&
+                                  "Expected a scalar or vector!"));
+
   // Handle the scalar case first.
-  if (Scalar1 && Scalar2)
+  if (!VT.isVector())
     return Outputs.back();
 
-  // Otherwise build a big vector out of the scalar elements we generated.
- return getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Outputs.data(), - Outputs.size()); + // We may have a vector type but a scalar result. Create a splat. + Outputs.resize(VT.getVectorNumElements(), Outputs.back()); + + // Build a big vector out of the scalar elements we generated. + return getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Outputs); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, @@ -2951,7 +2966,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SmallVector Elts(N1.getNode()->op_begin(), N1.getNode()->op_end()); Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end()); - return getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], Elts.size()); + return getNode(ISD::BUILD_VECTOR, DL, VT, Elts); } break; case ISD::AND: @@ -3370,8 +3385,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, if (VT != MVT::Glue) { SDValue Ops[] = { N1, N2 }; FoldingSetNodeID ID; - AddNodeIDNode(ID, Opcode, VTs, Ops, 2); - void *IP = 0; + AddNodeIDNode(ID, Opcode, VTs, Ops); + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -3420,7 +3435,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, N1.getNode()->op_end()); Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end()); Elts.append(N3.getNode()->op_begin(), N3.getNode()->op_end()); - return getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], Elts.size()); + return getNode(ISD::BUILD_VECTOR, DL, VT, Elts); } break; case ISD::SETCC: { @@ -3477,8 +3492,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, if (VT != MVT::Glue) { SDValue Ops[] = { N1, N2, N3 }; FoldingSetNodeID ID; - AddNodeIDNode(ID, Opcode, VTs, Ops, 3); - void *IP = 0; + AddNodeIDNode(ID, Opcode, VTs, Ops); + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -3501,14 +3516,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3, SDValue N4) { SDValue Ops[] = { N1, N2, N3, N4 }; - return getNode(Opcode, DL, VT, Ops, 4); + return getNode(Opcode, DL, VT, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1, SDValue N2, SDValue N3, SDValue N4, SDValue N5) { SDValue Ops[] = { N1, N2, N3, N4, N5 }; - return getNode(Opcode, DL, VT, Ops, 5); + return getNode(Opcode, DL, VT, Ops); } /// getStackArgumentTokenFactor - Compute a TokenFactor to force all @@ -3530,8 +3545,7 @@ SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) { ArgChains.push_back(SDValue(L, 1)); // Build a tokenfactor for all the chains. 
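// Aside: the recurring change through these hunks replaces pointer+length
// argument pairs such as (&OutChains[0], OutChains.size()) with a single
// ArrayRef parameter, which binds implicitly from a C array or a
// SmallVector. A reduced stand-in (Span approximates llvm::ArrayRef, and
// sum is a placeholder callee) showing why the call sites shrink:

#include <cstddef>
#include <vector>

template <typename T> class Span {
  const T *Data;
  size_t Length;

public:
  template <size_t N>
  Span(const T (&Arr)[N]) : Data(Arr), Length(N) {}
  Span(const std::vector<T> &V) : Data(V.data()), Length(V.size()) {}
  const T *begin() const { return Data; }
  const T *end() const { return Data + Length; }
};

// Old style: sum(&V[0], V.size()). New style: one argument, and no way to
// pass a length that does not match the buffer.
int sum(Span<int> Vals) {
  int Total = 0;
  for (int V : Vals)
    Total += V;
  return Total;
}

int main() {
  int Ops[] = {1, 2, 3};
  std::vector<int> Chains = {4, 5};
  return sum(Ops) + sum(Chains) == 15 ? 0 : 1;
}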
- return getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, - &ArgChains[0], ArgChains.size()); + return getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); } /// getMemsetValue - Vectorized representation of the memset value @@ -3600,7 +3614,7 @@ static SDValue getMemsetStringVal(EVT VT, SDLoc dl, SelectionDAG &DAG, Type *Ty = VT.getTypeForEVT(*DAG.getContext()); if (TLI.shouldConvertConstantLoadToIntImm(Val, Ty)) return DAG.getConstant(Val, VT); - return SDValue(0, 0); + return SDValue(nullptr, 0); } /// getMemBasePlusOffset - Returns base and offset node for the @@ -3616,7 +3630,7 @@ static SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset, SDLoc dl, /// static bool isMemSrcFromString(SDValue Src, StringRef &Str) { unsigned SrcDelta = 0; - GlobalAddressSDNode *G = NULL; + GlobalAddressSDNode *G = nullptr; if (Src.getOpcode() == ISD::GlobalAddress) G = cast(Src); else if (Src.getOpcode() == ISD::ADD && @@ -3852,8 +3866,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl, Size -= VTSize; } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &OutChains[0], OutChains.size()); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl, @@ -3918,8 +3931,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl, LoadChains.push_back(Value.getValue(1)); SrcOff += VTSize; } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &LoadChains[0], LoadChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); OutChains.clear(); for (unsigned i = 0; i < NumMemOps; i++) { EVT VT = MemOps[i]; @@ -3933,8 +3945,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl, DstOff += VTSize; } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &OutChains[0], OutChains.size()); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } /// \brief Lower the call to 'memset' intrinsic function into a series of store @@ -4035,8 +4046,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl, Size -= VTSize; } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &OutChains[0], OutChains.size()); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst, @@ -4095,15 +4105,13 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst, Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); // FIXME: pass in SDLoc - TargetLowering:: - CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()), - false, false, false, false, 0, - TLI->getLibcallCallingConv(RTLIB::MEMCPY), - /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/false, - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY), - TLI->getPointerTy()), - Args, *this, dl); + TargetLowering::CallLoweringInfo CLI(*this); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY), + Type::getVoidTy(*getContext()), + getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY), + TLI->getPointerTy()), &Args, 0) + .setDiscardResult(); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; @@ -4153,15 +4161,13 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, SDLoc dl, SDValue Dst, Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); // FIXME: pass in SDLoc - TargetLowering:: - CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()), - false, false, 
false, false, 0, - TLI->getLibcallCallingConv(RTLIB::MEMMOVE), - /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/false, - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), - TLI->getPointerTy()), - Args, *this, dl); + TargetLowering::CallLoweringInfo CLI(*this); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE), + Type::getVoidTy(*getContext()), + getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE), + TLI->getPointerTy()), &Args, 0) + .setDiscardResult(); std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; @@ -4217,32 +4223,31 @@ SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst, Entry.Ty = IntPtrTy; Entry.isSExt = false; Args.push_back(Entry); + // FIXME: pass in SDLoc - TargetLowering:: - CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()), - false, false, false, false, 0, - TLI->getLibcallCallingConv(RTLIB::MEMSET), - /*isTailCall=*/false, - /*doesNotReturn*/false, /*isReturnValueUsed=*/false, - getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), - TLI->getPointerTy()), - Args, *this, dl); - std::pair CallResult = TLI->LowerCallTo(CLI); + TargetLowering::CallLoweringInfo CLI(*this); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET), + Type::getVoidTy(*getContext()), + getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET), + TLI->getPointerTy()), &Args, 0) + .setDiscardResult(); + std::pair CallResult = TLI->LowerCallTo(CLI); return CallResult.second; } SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, - SDVTList VTList, SDValue *Ops, unsigned NumOps, + SDVTList VTList, ArrayRef Ops, MachineMemOperand *MMO, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) { FoldingSetNodeID ID; ID.AddInteger(MemVT.getRawBits()); - AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps); + AddNodeIDNode(ID, Opcode, VTList, Ops); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); - void* IP = 0; + void* IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); @@ -4253,11 +4258,13 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, // the node is deallocated, but recovered when the allocator is released. // If the number of operands is less than 5 we use AtomicSDNode's internal // storage. - SDUse *DynOps = NumOps > 4 ? OperandAllocator.Allocate(NumOps) : 0; + unsigned NumOps = Ops.size(); + SDUse *DynOps = NumOps > 4 ? 
OperandAllocator.Allocate(NumOps) + : nullptr; SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, MemVT, - Ops, DynOps, NumOps, MMO, + Ops.data(), DynOps, NumOps, MMO, SuccessOrdering, FailureOrdering, SynchScope); CSEMap.InsertNode(N, IP); @@ -4266,11 +4273,11 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, } SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, - SDVTList VTList, SDValue *Ops, unsigned NumOps, + SDVTList VTList, ArrayRef Ops, MachineMemOperand *MMO, AtomicOrdering Ordering, SynchronizationScope SynchScope) { - return getAtomic(Opcode, dl, MemVT, VTList, Ops, NumOps, MMO, Ordering, + return getAtomic(Opcode, dl, MemVT, VTList, Ops, MMO, Ordering, Ordering, SynchScope); } @@ -4317,7 +4324,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs = getVTList(VT, MVT::Other); SDValue Ops[] = {Chain, Ptr, Cmp, Swp}; - return getAtomic(Opcode, dl, MemVT, VTs, Ops, 4, MMO, SuccessOrdering, + return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, SuccessOrdering, FailureOrdering, SynchScope); } @@ -4377,38 +4384,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other) : getVTList(VT, MVT::Other); SDValue Ops[] = {Chain, Ptr, Val}; - return getAtomic(Opcode, dl, MemVT, VTs, Ops, 3, MMO, Ordering, SynchScope); -} - -SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, - EVT VT, SDValue Chain, - SDValue Ptr, - const Value* PtrVal, - unsigned Alignment, - AtomicOrdering Ordering, - SynchronizationScope SynchScope) { - if (Alignment == 0) // Ensure that codegen never sees alignment 0 - Alignment = getEVTAlignment(MemVT); - - MachineFunction &MF = getMachineFunction(); - // An atomic store does not load. An atomic load does not store. - // (An atomicrmw obviously both loads and stores.) - // For now, atomics are considered to be volatile always, and they are - // chained as such. - // FIXME: Volatile isn't really correct; we should keep track of atomic - // orderings in the memoperand. - unsigned Flags = MachineMemOperand::MOVolatile; - if (Opcode != ISD::ATOMIC_STORE) - Flags |= MachineMemOperand::MOLoad; - if (Opcode != ISD::ATOMIC_LOAD) - Flags |= MachineMemOperand::MOStore; - - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags, - MemVT.getStoreSize(), Alignment); - - return getAtomic(Opcode, dl, MemVT, VT, Chain, Ptr, MMO, - Ordering, SynchScope); + return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, Ordering, SynchScope); } SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, @@ -4421,38 +4397,24 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs = getVTList(VT, MVT::Other); SDValue Ops[] = {Chain, Ptr}; - return getAtomic(Opcode, dl, MemVT, VTs, Ops, 2, MMO, Ordering, SynchScope); + return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, Ordering, SynchScope); } /// getMergeValues - Create a MERGE_VALUES node from the given operands. 
-SDValue SelectionDAG::getMergeValues(const SDValue *Ops, unsigned NumOps, - SDLoc dl) { - if (NumOps == 1) +SDValue SelectionDAG::getMergeValues(ArrayRef Ops, SDLoc dl) { + if (Ops.size() == 1) return Ops[0]; SmallVector VTs; - VTs.reserve(NumOps); - for (unsigned i = 0; i < NumOps; ++i) + VTs.reserve(Ops.size()); + for (unsigned i = 0; i < Ops.size(); ++i) VTs.push_back(Ops[i].getValueType()); - return getNode(ISD::MERGE_VALUES, dl, getVTList(&VTs[0], NumOps), - Ops, NumOps); -} - -SDValue -SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, - const EVT *VTs, unsigned NumVTs, - const SDValue *Ops, unsigned NumOps, - EVT MemVT, MachinePointerInfo PtrInfo, - unsigned Align, bool Vol, - bool ReadMem, bool WriteMem) { - return getMemIntrinsicNode(Opcode, dl, makeVTList(VTs, NumVTs), Ops, NumOps, - MemVT, PtrInfo, Align, Vol, - ReadMem, WriteMem); + return getNode(ISD::MERGE_VALUES, dl, getVTList(VTs), Ops); } SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, - const SDValue *Ops, unsigned NumOps, + ArrayRef Ops, EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align, bool Vol, bool ReadMem, bool WriteMem) { @@ -4470,13 +4432,13 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Align); - return getMemIntrinsicNode(Opcode, dl, VTList, Ops, NumOps, MemVT, MMO); + return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO); } SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, - const SDValue *Ops, unsigned NumOps, - EVT MemVT, MachineMemOperand *MMO) { + ArrayRef Ops, EVT MemVT, + MachineMemOperand *MMO) { assert((Opcode == ISD::INTRINSIC_VOID || Opcode == ISD::INTRINSIC_W_CHAIN || Opcode == ISD::PREFETCH || @@ -4490,9 +4452,9 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, MemIntrinsicSDNode *N; if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) { FoldingSetNodeID ID; - AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps); + AddNodeIDNode(ID, Opcode, VTList, Ops); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); - void *IP = 0; + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); @@ -4500,12 +4462,12 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList, N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, Ops, - NumOps, MemVT, MMO); + MemVT, MMO); CSEMap.InsertNode(N, IP); } else { N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, Ops, - NumOps, MemVT, MMO); + MemVT, MMO); } AllNodes.push_back(N); return SDValue(N, 0); @@ -4568,7 +4530,7 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, // If we don't have a PtrInfo, infer the trivial frame index case to simplify // clients. 
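// Aside: the memcpy/memmove/memset lowerings above trade the long
// positional CallLoweringInfo constructor for chained setters
// (setDebugLoc/setChain/setCallee/setDiscardResult), each of which
// returns *this. A minimal sketch of that builder shape (CallInfo and
// its fields are illustrative, not the LLVM class):

#include <string>
#include <utility>

struct CallInfo {
  std::string Callee;
  unsigned CallConv = 0;
  bool DiscardResult = false;

  CallInfo &setCallee(unsigned CC, std::string Name) {
    CallConv = CC;
    Callee = std::move(Name);
    return *this; // returning *this is what lets call sites chain
  }
  CallInfo &setDiscardResult() {
    DiscardResult = true;
    return *this;
  }
};

int main() {
  CallInfo CLI;
  CLI.setCallee(/*CC=*/0, "memcpy").setDiscardResult();
  return (CLI.DiscardResult && CLI.Callee == "memcpy") ? 0 : 1;
}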
- if (PtrInfo.V == 0) + if (PtrInfo.V.isNull()) PtrInfo = InferPointerInfo(Ptr, Offset); MachineFunction &MF = getMachineFunction(); @@ -4608,13 +4570,13 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, getVTList(VT, Ptr.getValueType(), MVT::Other) : getVTList(VT, MVT::Other); SDValue Ops[] = { Chain, Ptr, Offset }; FoldingSetNodeID ID; - AddNodeIDNode(ID, ISD::LOAD, VTs, Ops, 3); + AddNodeIDNode(ID, ISD::LOAD, VTs, Ops); ID.AddInteger(MemVT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(ExtType, AM, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); - void *IP = 0; + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); @@ -4695,7 +4657,7 @@ SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val, if (isNonTemporal) Flags |= MachineMemOperand::MONonTemporal; - if (PtrInfo.V == 0) + if (PtrInfo.V.isNull()) PtrInfo = InferPointerInfo(Ptr); MachineFunction &MF = getMachineFunction(); @@ -4716,12 +4678,12 @@ SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Undef = getUNDEF(Ptr.getValueType()); SDValue Ops[] = { Chain, Val, Ptr, Undef }; FoldingSetNodeID ID; - AddNodeIDNode(ID, ISD::STORE, VTs, Ops, 4); + AddNodeIDNode(ID, ISD::STORE, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); - void *IP = 0; + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); @@ -4750,7 +4712,7 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, if (isNonTemporal) Flags |= MachineMemOperand::MONonTemporal; - if (PtrInfo.V == 0) + if (PtrInfo.V.isNull()) PtrInfo = InferPointerInfo(Ptr); MachineFunction &MF = getMachineFunction(); @@ -4785,12 +4747,12 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Undef = getUNDEF(Ptr.getValueType()); SDValue Ops[] = { Chain, Val, Ptr, Undef }; FoldingSetNodeID ID; - AddNodeIDNode(ID, ISD::STORE, VTs, Ops, 4); + AddNodeIDNode(ID, ISD::STORE, VTs, Ops); ID.AddInteger(SVT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(true, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); - void *IP = 0; + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); @@ -4812,11 +4774,11 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base, SDVTList VTs = getVTList(Base.getValueType(), MVT::Other); SDValue Ops[] = { ST->getChain(), ST->getValue(), Base, Offset }; FoldingSetNodeID ID; - AddNodeIDNode(ID, ISD::STORE, VTs, Ops, 4); + AddNodeIDNode(ID, ISD::STORE, VTs, Ops); ID.AddInteger(ST->getMemoryVT().getRawBits()); ID.AddInteger(ST->getRawSubclassData()); ID.AddInteger(ST->getPointerInfo().getAddrSpace()); - void *IP = 0; + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -4835,14 +4797,14 @@ SDValue SelectionDAG::getVAArg(EVT VT, SDLoc dl, SDValue SV, unsigned Align) { SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, MVT::i32) }; - return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops, 4); + return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops); } SDValue 
SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, - const SDUse *Ops, unsigned NumOps) { - switch (NumOps) { + ArrayRef Ops) { + switch (Ops.size()) { case 0: return getNode(Opcode, DL, VT); - case 1: return getNode(Opcode, DL, VT, Ops[0]); + case 1: return getNode(Opcode, DL, VT, static_cast(Ops[0])); case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; @@ -4850,12 +4812,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, // Copy from an SDUse array into an SDValue array for use with // the regular getNode logic. - SmallVector NewOps(Ops, Ops + NumOps); - return getNode(Opcode, DL, VT, &NewOps[0], NumOps); + SmallVector NewOps(Ops.begin(), Ops.end()); + return getNode(Opcode, DL, VT, NewOps); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, - const SDValue *Ops, unsigned NumOps) { + ArrayRef Ops) { + unsigned NumOps = Ops.size(); switch (NumOps) { case 0: return getNode(Opcode, DL, VT); case 1: return getNode(Opcode, DL, VT, Ops[0]); @@ -4890,18 +4853,18 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, if (VT != MVT::Glue) { FoldingSetNodeID ID; - AddNodeIDNode(ID, Opcode, VTs, Ops, NumOps); - void *IP = 0; + AddNodeIDNode(ID, Opcode, VTs, Ops); + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), - VTs, Ops, NumOps); + VTs, Ops); CSEMap.InsertNode(N, IP); } else { N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), - VTs, Ops, NumOps); + VTs, Ops); } AllNodes.push_back(N); @@ -4912,24 +4875,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, - ArrayRef ResultTys, - const SDValue *Ops, unsigned NumOps) { - return getNode(Opcode, DL, getVTList(&ResultTys[0], ResultTys.size()), - Ops, NumOps); -} - -SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, - const EVT *VTs, unsigned NumVTs, - const SDValue *Ops, unsigned NumOps) { - if (NumVTs == 1) - return getNode(Opcode, DL, VTs[0], Ops, NumOps); - return getNode(Opcode, DL, makeVTList(VTs, NumVTs), Ops, NumOps); + ArrayRef ResultTys, ArrayRef Ops) { + return getNode(Opcode, DL, getVTList(ResultTys), Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, - const SDValue *Ops, unsigned NumOps) { + ArrayRef Ops) { if (VTList.NumVTs == 1) - return getNode(Opcode, DL, VTList.VTs[0], Ops, NumOps); + return getNode(Opcode, DL, VTList.VTs[0], Ops); #if 0 switch (Opcode) { @@ -4956,10 +4909,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, // Memoize the node unless it returns a flag. 
SDNode *N; + unsigned NumOps = Ops.size(); if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) { FoldingSetNodeID ID; - AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps); - void *IP = 0; + AddNodeIDNode(ID, Opcode, VTList, Ops); + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -4976,7 +4930,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, Ops[1], Ops[2]); } else { N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), - VTList, Ops, NumOps); + VTList, Ops); } CSEMap.InsertNode(N, IP); } else { @@ -4993,7 +4947,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, Ops[1], Ops[2]); } else { N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), - VTList, Ops, NumOps); + VTList, Ops); } } AllNodes.push_back(N); @@ -5004,39 +4958,39 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList) { - return getNode(Opcode, DL, VTList, 0, 0); + return getNode(Opcode, DL, VTList, ArrayRef()); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1) { SDValue Ops[] = { N1 }; - return getNode(Opcode, DL, VTList, Ops, 1); + return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1, SDValue N2) { SDValue Ops[] = { N1, N2 }; - return getNode(Opcode, DL, VTList, Ops, 2); + return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1, SDValue N2, SDValue N3) { SDValue Ops[] = { N1, N2, N3 }; - return getNode(Opcode, DL, VTList, Ops, 3); + return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1, SDValue N2, SDValue N3, SDValue N4) { SDValue Ops[] = { N1, N2, N3, N4 }; - return getNode(Opcode, DL, VTList, Ops, 4); + return getNode(Opcode, DL, VTList, Ops); } SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList, SDValue N1, SDValue N2, SDValue N3, SDValue N4, SDValue N5) { SDValue Ops[] = { N1, N2, N3, N4, N5 }; - return getNode(Opcode, DL, VTList, Ops, 5); + return getNode(Opcode, DL, VTList, Ops); } SDVTList SelectionDAG::getVTList(EVT VT) { @@ -5049,9 +5003,9 @@ SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) { ID.AddInteger(VT1.getRawBits()); ID.AddInteger(VT2.getRawBits()); - void *IP = 0; + void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); - if (Result == NULL) { + if (!Result) { EVT *Array = Allocator.Allocate(2); Array[0] = VT1; Array[1] = VT2; @@ -5068,9 +5022,9 @@ SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) { ID.AddInteger(VT2.getRawBits()); ID.AddInteger(VT3.getRawBits()); - void *IP = 0; + void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); - if (Result == NULL) { + if (!Result) { EVT *Array = Allocator.Allocate(3); Array[0] = VT1; Array[1] = VT2; @@ -5089,9 +5043,9 @@ SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) { ID.AddInteger(VT3.getRawBits()); ID.AddInteger(VT4.getRawBits()); - void *IP = 0; + void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); - if (Result == NULL) { + if (!Result) { EVT *Array = Allocator.Allocate(4); Array[0] = VT1; Array[1] = VT2; @@ -5103,18 +5057,19 @@ SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) { return Result->getSDVTList(); } 
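Each getVTList overload above follows the same memoization protocol: hash the inputs into a FoldingSetNodeID, probe VTListMap with FindNodeOrInsertPos, and allocate only on a miss, handing the probe's insertion position back to InsertNode so the new entry lands in the probed bucket. A reduced sketch of that probe-then-insert shape, using an ordered map in place of the intrusive FoldingSet (the key and payload types here are stand-ins, not the LLVM ones):

    #include <map>
    #include <vector>

    struct VTList {
      std::vector<int> VTs; // stand-in for the interned EVT array
    };

    std::map<std::vector<int>, VTList> VTListMap;

    const VTList &getVTList(const std::vector<int> &Key) {
      // Probe once; a hit returns the interned list (FindNodeOrInsertPos).
      auto It = VTListMap.lower_bound(Key);
      if (It != VTListMap.end() && It->first == Key)
        return It->second;
      // Miss: allocate exactly once, at the remembered position (InsertNode).
      It = VTListMap.emplace_hint(It, Key, VTList{Key});
      return It->second;
    }

    int main() {
      const VTList &A = getVTList({1, 2});
      const VTList &B = getVTList({1, 2});
      return &A == &B ? 0 : 1; // both calls see the same interned list
    }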
-SDVTList SelectionDAG::getVTList(const EVT *VTs, unsigned NumVTs) { +SDVTList SelectionDAG::getVTList(ArrayRef VTs) { + unsigned NumVTs = VTs.size(); FoldingSetNodeID ID; ID.AddInteger(NumVTs); for (unsigned index = 0; index < NumVTs; index++) { ID.AddInteger(VTs[index].getRawBits()); } - void *IP = 0; + void *IP = nullptr; SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP); - if (Result == NULL) { + if (!Result) { EVT *Array = Allocator.Allocate(NumVTs); - std::copy(VTs, VTs + NumVTs, Array); + std::copy(VTs.begin(), VTs.end(), Array); Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, NumVTs); VTListMap.InsertNode(Result, IP); } @@ -5135,14 +5090,14 @@ SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) { if (Op == N->getOperand(0)) return N; // See if the modified node already exists. - void *InsertPos = 0; + void *InsertPos = nullptr; if (SDNode *Existing = FindModifiedNodeSlot(N, Op, InsertPos)) return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) if (!RemoveNodeFromCSEMaps(N)) - InsertPos = 0; + InsertPos = nullptr; // Now we update the operands. N->OperandList[0].set(Op); @@ -5160,14 +5115,14 @@ SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) { return N; // No operands changed, just return the input node. // See if the modified node already exists. - void *InsertPos = 0; + void *InsertPos = nullptr; if (SDNode *Existing = FindModifiedNodeSlot(N, Op1, Op2, InsertPos)) return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) if (!RemoveNodeFromCSEMaps(N)) - InsertPos = 0; + InsertPos = nullptr; // Now we update the operands. if (N->OperandList[0] != Op1) @@ -5183,25 +5138,26 @@ SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) { SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3) { SDValue Ops[] = { Op1, Op2, Op3 }; - return UpdateNodeOperands(N, Ops, 3); + return UpdateNodeOperands(N, Ops); } SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4) { SDValue Ops[] = { Op1, Op2, Op3, Op4 }; - return UpdateNodeOperands(N, Ops, 4); + return UpdateNodeOperands(N, Ops); } SDNode *SelectionDAG:: UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3, SDValue Op4, SDValue Op5) { SDValue Ops[] = { Op1, Op2, Op3, Op4, Op5 }; - return UpdateNodeOperands(N, Ops, 5); + return UpdateNodeOperands(N, Ops); } SDNode *SelectionDAG:: -UpdateNodeOperands(SDNode *N, const SDValue *Ops, unsigned NumOps) { +UpdateNodeOperands(SDNode *N, ArrayRef Ops) { + unsigned NumOps = Ops.size(); assert(N->getNumOperands() == NumOps && "Update with wrong number of operands"); @@ -5218,14 +5174,14 @@ UpdateNodeOperands(SDNode *N, const SDValue *Ops, unsigned NumOps) { if (!AnyChange) return N; // See if the modified node already exists. - void *InsertPos = 0; - if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, NumOps, InsertPos)) + void *InsertPos = nullptr; + if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, InsertPos)) return Existing; // Nope it doesn't. Remove the node from its current place in the maps. if (InsertPos) if (!RemoveNodeFromCSEMaps(N)) - InsertPos = 0; + InsertPos = nullptr; // Now we update the operands. 
for (unsigned i = 0; i != NumOps; ++i) @@ -5254,14 +5210,14 @@ void SDNode::DropOperands() { SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT) { SDVTList VTs = getVTList(VT); - return SelectNodeTo(N, MachineOpc, VTs, 0, 0); + return SelectNodeTo(N, MachineOpc, VTs, None); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT, SDValue Op1) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1 }; - return SelectNodeTo(N, MachineOpc, VTs, Ops, 1); + return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, @@ -5269,7 +5225,7 @@ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, SDValue Op2) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1, Op2 }; - return SelectNodeTo(N, MachineOpc, VTs, Ops, 2); + return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, @@ -5277,41 +5233,39 @@ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, SDValue Op2, SDValue Op3) { SDVTList VTs = getVTList(VT); SDValue Ops[] = { Op1, Op2, Op3 }; - return SelectNodeTo(N, MachineOpc, VTs, Ops, 3); + return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, - EVT VT, const SDValue *Ops, - unsigned NumOps) { + EVT VT, ArrayRef Ops) { SDVTList VTs = getVTList(VT); - return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps); + return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, - EVT VT1, EVT VT2, const SDValue *Ops, - unsigned NumOps) { + EVT VT1, EVT VT2, ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2); - return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps); + return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2) { SDVTList VTs = getVTList(VT1, VT2); - return SelectNodeTo(N, MachineOpc, VTs, (SDValue *)0, 0); + return SelectNodeTo(N, MachineOpc, VTs, None); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, EVT VT3, - const SDValue *Ops, unsigned NumOps) { + ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2, VT3); - return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps); + return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT1, EVT VT2, EVT VT3, EVT VT4, - const SDValue *Ops, unsigned NumOps) { + ArrayRef Ops) { SDVTList VTs = getVTList(VT1, VT2, VT3, VT4); - return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps); + return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, @@ -5319,7 +5273,7 @@ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, SDValue Op1) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1 }; - return SelectNodeTo(N, MachineOpc, VTs, Ops, 1); + return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, @@ -5327,7 +5281,7 @@ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, SDValue Op1, SDValue Op2) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1, Op2 }; - return SelectNodeTo(N, MachineOpc, VTs, Ops, 2); + return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, @@ -5336,7 +5290,7 @@ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, SDValue Op3) { SDVTList VTs = getVTList(VT1, VT2); 
SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, @@ -5336,7 +5290,7 @@ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, SDValue Op3) { SDVTList VTs = getVTList(VT1, VT2); SDValue Ops[] = { Op1, Op2, Op3 }; - return SelectNodeTo(N, MachineOpc, VTs, Ops, 3); + return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, @@ -5345,13 +5299,12 @@ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, SDValue Op3) { SDVTList VTs = getVTList(VT1, VT2, VT3); SDValue Ops[] = { Op1, Op2, Op3 }; - return SelectNodeTo(N, MachineOpc, VTs, Ops, 3); + return SelectNodeTo(N, MachineOpc, VTs, Ops); } SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc, - SDVTList VTs, const SDValue *Ops, - unsigned NumOps) { - N = MorphNodeTo(N, ~MachineOpc, VTs, Ops, NumOps); + SDVTList VTs,ArrayRef<SDValue> Ops) { + N = MorphNodeTo(N, ~MachineOpc, VTs, Ops); // Reset the NodeID to -1. N->setNodeId(-1); return N; } @@ -5388,19 +5341,19 @@ SDNode *SelectionDAG::UpdadeSDLocOnMergedSDNode(SDNode *N, SDLoc OLoc) { /// the node's users. /// SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc, - SDVTList VTs, const SDValue *Ops, - unsigned NumOps) { + SDVTList VTs, ArrayRef<SDValue> Ops) { + unsigned NumOps = Ops.size(); // If an identical node already exists, use it. - void *IP = 0; + void *IP = nullptr; if (VTs.VTs[VTs.NumVTs-1] != MVT::Glue) { FoldingSetNodeID ID; - AddNodeIDNode(ID, Opc, VTs, Ops, NumOps); + AddNodeIDNode(ID, Opc, VTs, Ops); if (SDNode *ON = CSEMap.FindNodeOrInsertPos(ID, IP)) return UpdadeSDLocOnMergedSDNode(ON, SDLoc(N)); } if (!RemoveNodeFromCSEMaps(N)) - IP = 0; + IP = nullptr; // Start the morphing. N->NodeType = Opc;
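MorphNodeTo above keeps SelectionDAG's common-subexpression elimination intact while switching to ArrayRef. A sketch of that CSE lookup idiom, assuming it runs inside a SelectionDAG member of this same file so that CSEMap and the file-local AddNodeIDNode helper are visible:

    // Hash the identity (opcode, result types, operands) of the node we
    // are about to create or morph into.
    FoldingSetNodeID ID;
    AddNodeIDNode(ID, Opc, VTs, Ops);        // ArrayRef-based overload
    void *IP = nullptr;                      // insertion-point cookie
    if (SDNode *Existing = CSEMap.FindNodeOrInsertPos(ID, IP))
      return Existing;                       // reuse the equivalent node
    // ...otherwise build or morph the node, then register it with
    // CSEMap.InsertNode(N, IP) so later lookups find it.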
@@ -5420,7 +5373,7 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc, if (MachineSDNode *MN = dyn_cast<MachineSDNode>(N)) { // Initialize the memory references information. - MN->setMemRefs(0, 0); + MN->setMemRefs(nullptr, nullptr); // If NumOps is larger than the # of operands we can have in a // MachineSDNode, reallocate the operand list. if (NumOps > MN->NumOperands || !MN->OperandsNeedDelete) { @@ -5431,22 +5384,22 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc, // remainder of the current SelectionDAG iteration, so we can allocate // the operands directly out of a pool with no recycling metadata. MN->InitOperands(OperandAllocator.Allocate<SDUse>(NumOps), - Ops, NumOps); + Ops.data(), NumOps); else - MN->InitOperands(MN->LocalOperands, Ops, NumOps); + MN->InitOperands(MN->LocalOperands, Ops.data(), NumOps); MN->OperandsNeedDelete = false; } else - MN->InitOperands(MN->OperandList, Ops, NumOps); + MN->InitOperands(MN->OperandList, Ops.data(), NumOps); } else { // If NumOps is larger than the # of operands we currently have, reallocate // the operand list. if (NumOps > N->NumOperands) { if (N->OperandsNeedDelete) delete[] N->OperandList; - N->InitOperands(new SDUse[NumOps], Ops, NumOps); + N->InitOperands(new SDUse[NumOps], Ops.data(), NumOps); N->OperandsNeedDelete = true; } else - N->InitOperands(N->OperandList, Ops, NumOps); + N->InitOperands(N->OperandList, Ops.data(), NumOps); } // Delete any nodes that are still dead after adding the uses for the @@ -5585,7 +5538,7 @@ MachineSDNode * SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops) { - SDVTList VTs = getVTList(&ResultTys[0], ResultTys.size()); + SDVTList VTs = getVTList(ResultTys); return getMachineNode(Opcode, dl, VTs, Ops); } @@ -5594,14 +5547,14 @@ SelectionDAG::getMachineNode(unsigned Opcode, SDLoc DL, SDVTList VTs, ArrayRef<SDValue> OpsArray) { bool DoCSE = VTs.VTs[VTs.NumVTs-1] != MVT::Glue; MachineSDNode *N; - void *IP = 0; + void *IP = nullptr; const SDValue *Ops = OpsArray.data(); unsigned NumOps = OpsArray.size(); if (DoCSE) { FoldingSetNodeID ID; - AddNodeIDNode(ID, ~Opcode, VTs, Ops, NumOps); - IP = 0; + AddNodeIDNode(ID, ~Opcode, VTs, OpsArray); + IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { return cast<MachineSDNode>(UpdadeSDLocOnMergedSDNode(E, DL)); } @@ -5657,34 +5610,39 @@ SelectionDAG::getTargetInsertSubreg(int SRIdx, SDLoc DL, EVT VT, /// getNodeIfExists - Get the specified node if it's already available, or /// else return NULL. SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList, - const SDValue *Ops, unsigned NumOps) { + ArrayRef<SDValue> Ops) { if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) { FoldingSetNodeID ID; - AddNodeIDNode(ID, Opcode, VTList, Ops); - void *IP = 0; + AddNodeIDNode(ID, Opcode, VTList, Ops); + void *IP = nullptr; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return E; } - return NULL; + return nullptr; } /// getDbgValue - Creates a SDDbgValue node.
/// +/// SDNode SDDbgValue * -SelectionDAG::getDbgValue(MDNode *MDPtr, SDNode *N, unsigned R, uint64_t Off, +SelectionDAG::getDbgValue(MDNode *MDPtr, SDNode *N, unsigned R, + bool IsIndirect, uint64_t Off, DebugLoc DL, unsigned O) { - return new (Allocator) SDDbgValue(MDPtr, N, R, Off, DL, O); + return new (Allocator) SDDbgValue(MDPtr, N, R, IsIndirect, Off, DL, O); } +/// Constant SDDbgValue * -SelectionDAG::getDbgValue(MDNode *MDPtr, const Value *C, uint64_t Off, - DebugLoc DL, unsigned O) { +SelectionDAG::getConstantDbgValue(MDNode *MDPtr, const Value *C, + uint64_t Off, + DebugLoc DL, unsigned O) { return new (Allocator) SDDbgValue(MDPtr, C, Off, DL, O); } +/// FrameIndex SDDbgValue * -SelectionDAG::getDbgValue(MDNode *MDPtr, unsigned FI, uint64_t Off, - DebugLoc DL, unsigned O) { +SelectionDAG::getFrameIndexDbgValue(MDNode *MDPtr, unsigned FI, uint64_t Off, + DebugLoc DL, unsigned O) { return new (Allocator) SDDbgValue(MDPtr, FI, Off, DL, O); } @@ -6049,7 +6007,7 @@ unsigned SelectionDAG::AssignTopologicalOrder() { dbgs() << "Overran sorted position:\n"; S->dumprFull(); #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } } @@ -6090,6 +6048,7 @@ void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) { SDDbgValue *Dbg = *I; if (Dbg->getKind() == SDDbgValue::SDNODE) { SDDbgValue *Clone = getDbgValue(Dbg->getMDPtr(), ToNode, To.getResNo(), + Dbg->isIndirect(), Dbg->getOffset(), Dbg->getDebugLoc(), Dbg->getOrder()); ClonedDVs.push_back(Clone); @@ -6133,9 +6092,8 @@ MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, } MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, - const SDValue *Ops, unsigned NumOps, EVT memvt, - MachineMemOperand *mmo) - : SDNode(Opc, Order, dl, VTs, Ops, NumOps), + ArrayRef Ops, EVT memvt, MachineMemOperand *mmo) + : SDNode(Opc, Order, dl, VTs, Ops), MemoryVT(memvt), MMO(mmo) { SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant()); @@ -6354,12 +6312,10 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { switch (N->getOpcode()) { default: - Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, - &Operands[0], Operands.size())); + Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands)); break; case ISD::VSELECT: - Scalars.push_back(getNode(ISD::SELECT, dl, EltVT, - &Operands[0], Operands.size())); + Scalars.push_back(getNode(ISD::SELECT, dl, EltVT, Operands)); break; case ISD::SHL: case ISD::SRA: @@ -6384,8 +6340,7 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { Scalars.push_back(getUNDEF(EltVT)); return getNode(ISD::BUILD_VECTOR, dl, - EVT::getVectorVT(*getContext(), EltVT, ResNE), - &Scalars[0], Scalars.size()); + EVT::getVectorVT(*getContext(), EltVT, ResNE), Scalars); } @@ -6419,8 +6374,8 @@ bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, cast(Loc.getOperand(1))->getSExtValue() == Dist*Bytes) return true; - const GlobalValue *GV1 = NULL; - const GlobalValue *GV2 = NULL; + const GlobalValue *GV1 = nullptr; + const GlobalValue *GV2 = nullptr; int64_t Offset1 = 0; int64_t Offset2 = 0; const TargetLowering *TLI = TM.getTargetLowering(); @@ -6442,8 +6397,8 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) { unsigned PtrWidth = TLI->getPointerTypeSizeInBits(GV->getType()); APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0); - llvm::ComputeMaskedBits(const_cast(GV), KnownZero, KnownOne, - 
TLI->getDataLayout()); + llvm::computeKnownBits(const_cast(GV), KnownZero, KnownOne, + TLI->getDataLayout()); unsigned AlignBits = KnownZero.countTrailingOnes(); unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0; if (Align) @@ -6505,6 +6460,22 @@ SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, return std::make_pair(Lo, Hi); } +void SelectionDAG::ExtractVectorElements(SDValue Op, + SmallVectorImpl &Args, + unsigned Start, unsigned Count) { + EVT VT = Op.getValueType(); + if (Count == 0) + Count = VT.getVectorNumElements(); + + EVT EltVT = VT.getVectorElementType(); + EVT IdxTy = TLI->getVectorIdxTy(); + SDLoc SL(Op); + for (unsigned i = Start, e = Start + Count; i != e; ++i) { + Args.push_back(getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Op, getConstant(i, IdxTy))); + } +} + // getAddressSpace - Return the address space this GlobalAddress belongs to. unsigned GlobalAddressSDNode::getAddressSpace() const { return getGlobal()->getType()->getAddressSpace(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 4a6e5cf..070e929 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "isel" #include "SelectionDAGBuilder.h" #include "SDNodeDbgValue.h" #include "llvm/ADT/BitVector.h" @@ -62,6 +61,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "isel" + /// LimitFloatPrecision - Generate low-precision inline sequences for /// some float libcalls (6, 8 or 12 bits). static unsigned LimitFloatPrecision; @@ -276,9 +277,9 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, SDLoc DL, // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the // intermediate operands. - Val = DAG.getNode(IntermediateVT.isVector() ? - ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, DL, - ValueVT, &Ops[0], NumIntermediates); + Val = DAG.getNode(IntermediateVT.isVector() ? ISD::CONCAT_VECTORS + : ISD::BUILD_VECTOR, + DL, ValueVT, Ops); } // There is now one part, held in Val. Correct it to match ValueVT. @@ -495,7 +496,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc DL, e = PartVT.getVectorNumElements(); i != e; ++i) Ops.push_back(DAG.getUNDEF(ElementVT)); - Val = DAG.getNode(ISD::BUILD_VECTOR, DL, PartVT, &Ops[0], Ops.size()); + Val = DAG.getNode(ISD::BUILD_VECTOR, DL, PartVT, Ops); // FIXME: Use CONCAT for 2x -> 4x. @@ -638,7 +639,7 @@ namespace { SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo, SDLoc dl, SDValue &Chain, SDValue *Flag, - const Value *V = 0) const; + const Value *V = nullptr) const; /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the /// specified value into the registers specified by this object. 
This uses @@ -684,7 +685,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, Parts.resize(NumRegs); for (unsigned i = 0; i != NumRegs; ++i) { SDValue P; - if (Flag == 0) { + if (!Flag) { P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT); } else { P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag); @@ -752,9 +753,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, Parts.clear(); } - return DAG.getNode(ISD::MERGE_VALUES, dl, - DAG.getVTList(&ValueVTs[0], ValueVTs.size()), - &Values[0], ValueVTs.size()); + return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values); } /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the @@ -785,7 +784,7 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl, SmallVector Chains(NumRegs); for (unsigned i = 0; i != NumRegs; ++i) { SDValue Part; - if (Flag == 0) { + if (!Flag) { Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]); } else { Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Flag); @@ -808,7 +807,7 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl, // = op c3, ..., f2 Chain = Chains[NumRegs-1]; else - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], NumRegs); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } /// AddInlineAsmOperands - Add this value to the specified inlineasm node @@ -877,7 +876,7 @@ void SelectionDAGBuilder::clear() { UnusedArgNodeMap.clear(); PendingLoads.clear(); PendingExports.clear(); - CurInst = NULL; + CurInst = nullptr; HasTailCall = false; SDNodeOrder = LowestSDNodeOrder; } @@ -910,7 +909,7 @@ SDValue SelectionDAGBuilder::getRoot() { // Otherwise, we have to make a token factor node. SDValue Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, - &PendingLoads[0], PendingLoads.size()); + PendingLoads); PendingLoads.clear(); DAG.setRoot(Root); return Root; @@ -940,8 +939,7 @@ SDValue SelectionDAGBuilder::getControlRoot() { } Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, - &PendingExports[0], - PendingExports.size()); + PendingExports); PendingExports.clear(); DAG.setRoot(Root); return Root; @@ -961,7 +959,7 @@ void SelectionDAGBuilder::visit(const Instruction &I) { if (!isa(&I) && !HasTailCall) CopyToExportRegsIfNeeded(&I); - CurInst = NULL; + CurInst = nullptr; } void SelectionDAGBuilder::visitPHI(const PHINode &) { @@ -991,11 +989,14 @@ void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V, unsigned DbgSDNodeOrder = DDI.getSDNodeOrder(); MDNode *Variable = DI->getVariable(); uint64_t Offset = DI->getOffset(); + // A dbg.value for an alloca is always indirect. 
+ bool IsIndirect = isa(V) || Offset != 0; SDDbgValue *SDV; if (Val.getNode()) { - if (!EmitFuncArgumentDbgValue(V, Variable, Offset, Val)) { + if (!EmitFuncArgumentDbgValue(V, Variable, Offset, IsIndirect, Val)) { SDV = DAG.getDbgValue(Variable, Val.getNode(), - Val.getResNo(), Offset, dl, DbgSDNodeOrder); + Val.getResNo(), IsIndirect, + Offset, dl, DbgSDNodeOrder); DAG.AddDbgValue(SDV, Val.getNode(), false); } } else @@ -1020,7 +1021,7 @@ SDValue SelectionDAGBuilder::getValue(const Value *V) { RegsForValue RFV(*DAG.getContext(), *TM.getTargetLowering(), InReg, V->getType()); SDValue Chain = DAG.getEntryNode(); - N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, NULL, V); + N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V); resolveDanglingDebugInfo(V, N); return N; } @@ -1091,8 +1092,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { Constants.push_back(SDValue(Val, i)); } - return DAG.getMergeValues(&Constants[0], Constants.size(), - getCurSDLoc()); + return DAG.getMergeValues(Constants, getCurSDLoc()); } if (const ConstantDataSequential *CDS = @@ -1107,9 +1107,9 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { } if (isa(CDS->getType())) - return DAG.getMergeValues(&Ops[0], Ops.size(), getCurSDLoc()); + return DAG.getMergeValues(Ops, getCurSDLoc()); return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), - VT, &Ops[0], Ops.size()); + VT, Ops); } if (C->getType()->isStructTy() || C->getType()->isArrayTy()) { @@ -1132,8 +1132,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { Constants[i] = DAG.getConstant(0, EltVT); } - return DAG.getMergeValues(&Constants[0], NumElts, - getCurSDLoc()); + return DAG.getMergeValues(Constants, getCurSDLoc()); } if (const BlockAddress *BA = dyn_cast(C)) @@ -1161,8 +1160,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { } // Create a BUILD_VECTOR node. - return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), - VT, &Ops[0], Ops.size()); + return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), VT, Ops); } // If this is a static alloca, generate it as the frameindex instead of @@ -1179,7 +1177,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { unsigned InReg = FuncInfo.InitializeRegForValue(Inst); RegsForValue RFV(*DAG.getContext(), *TLI, InReg, Inst->getType()); SDValue Chain = DAG.getEntryNode(); - return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, NULL, V); + return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V); } llvm_unreachable("Can't get register for value!"); @@ -1223,7 +1221,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { } Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), - MVT::Other, &Chains[0], NumValues); + MVT::Other, Chains); } else if (I.getNumOperands() != 0) { SmallVector ValueVTs; ComputeValueVTs(*TLI, I.getOperand(0)->getType(), ValueVTs); @@ -1406,8 +1404,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, llvm_unreachable("Unknown compare instruction"); } - CaseBlock CB(Condition, BOp->getOperand(0), - BOp->getOperand(1), NULL, TBB, FBB, CurBB, TWeight, FWeight); + CaseBlock CB(Condition, BOp->getOperand(0), BOp->getOperand(1), nullptr, + TBB, FBB, CurBB, TWeight, FWeight); SwitchCases.push_back(CB); return; } @@ -1415,7 +1413,7 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond, // Create a CaseBlock record representing this branch. 
CaseBlock CB(ISD::SETEQ, Cond, ConstantInt::getTrue(*DAG.getContext()), - NULL, TBB, FBB, CurBB, TWeight, FWeight); + nullptr, TBB, FBB, CurBB, TWeight, FWeight); SwitchCases.push_back(CB); } @@ -1562,7 +1560,7 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) { MachineBasicBlock *Succ0MBB = FuncInfo.MBBMap[I.getSuccessor(0)]; // Figure out which block is immediately after the current one. - MachineBasicBlock *NextBlock = 0; + MachineBasicBlock *NextBlock = nullptr; MachineFunction::iterator BBI = BrMBB; if (++BBI != FuncInfo.MF->end()) NextBlock = BBI; @@ -1639,7 +1637,7 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) { // Create a CaseBlock record representing this branch. CaseBlock CB(ISD::SETEQ, CondVal, ConstantInt::getTrue(*DAG.getContext()), - NULL, Succ0MBB, Succ1MBB, BrMBB); + nullptr, Succ0MBB, Succ1MBB, BrMBB); // Use visitSwitchCase to actually insert the fast branch sequence for this // cond branch. @@ -1655,7 +1653,7 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB, SDLoc dl = getCurSDLoc(); // Build the setcc now. - if (CB.CmpMHS == NULL) { + if (!CB.CmpMHS) { // Fold "(X == true)" to X and "(X == false)" to !X to // handle common cases produced by branch lowering. if (CB.CmpRHS == ConstantInt::getTrue(*DAG.getContext()) && @@ -1696,7 +1694,7 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB, // Set NextBlock to be the MBB immediately after the current one, if any. // This is used to avoid emitting unnecessary branches to the next block. - MachineBasicBlock *NextBlock = 0; + MachineBasicBlock *NextBlock = nullptr; MachineFunction::iterator BBI = SwitchBB; if (++BBI != FuncInfo.MF->end()) NextBlock = BBI; @@ -1774,7 +1772,7 @@ void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT, // Set NextBlock to be the MBB immediately after the current one, if any. // This is used to avoid emitting unnecessary branches to the next block. - MachineBasicBlock *NextBlock = 0; + MachineBasicBlock *NextBlock = nullptr; MachineFunction::iterator BBI = SwitchBB; if (++BBI != FuncInfo.MF->end()) @@ -1857,8 +1855,8 @@ void SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) { const TargetLowering *TLI = TM.getTargetLowering(); SDValue Chain = TLI->makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, - MVT::isVoid, 0, 0, false, getCurSDLoc(), - false, false).second; + MVT::isVoid, nullptr, 0, false, + getCurSDLoc(), false, false).second; DAG.setRoot(Chain); } @@ -1905,7 +1903,7 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, // Set NextBlock to be the MBB immediately after the current one, if any. // This is used to avoid emitting unnecessary branches to the next block. - MachineBasicBlock *NextBlock = 0; + MachineBasicBlock *NextBlock = nullptr; MachineFunction::iterator BBI = SwitchBB; if (++BBI != FuncInfo.MF->end()) NextBlock = BBI; @@ -1979,7 +1977,7 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB, // Set NextBlock to be the MBB immediately after the current one, if any. // This is used to avoid emitting unnecessary branches to the next block. - MachineBasicBlock *NextBlock = 0; + MachineBasicBlock *NextBlock = nullptr; MachineFunction::iterator BBI = SwitchBB; if (++BBI != FuncInfo.MF->end()) NextBlock = BBI; @@ -2059,8 +2057,7 @@ void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) { // Merge into one. 
SDValue Res = DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), - DAG.getVTList(&ValueVTs[0], ValueVTs.size()), - &Ops[0], 2); + DAG.getVTList(ValueVTs), Ops); setValue(&LP, Res); } @@ -2081,7 +2078,7 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR, MachineFunction *CurMF = FuncInfo.MF; // Figure out which block is immediately after the current one. - MachineBasicBlock *NextBlock = 0; + MachineBasicBlock *NextBlock = nullptr; MachineFunction::iterator BBI = CR.CaseBB; if (++BBI != FuncInfo.MF->end()) @@ -2192,7 +2189,7 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR, if (I->High == I->Low) { // This is just small small case range :) containing exactly 1 case CC = ISD::SETEQ; - LHS = SV; RHS = I->High; MHS = NULL; + LHS = SV; RHS = I->High; MHS = nullptr; } else { CC = ISD::SETLE; LHS = I->Low; MHS = SV; RHS = I->High; @@ -2427,7 +2424,7 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR, CaseRange LHSR(CR.Range.first, Pivot); CaseRange RHSR(Pivot, CR.Range.second); const Constant *C = Pivot->Low; - MachineBasicBlock *FalseBB = 0, *TrueBB = 0; + MachineBasicBlock *FalseBB = nullptr, *TrueBB = nullptr; // We know that we branch to the LHS if the Value being switched on is // less than the Pivot value, C. We use this to optimize our binary @@ -2469,7 +2466,7 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR, // Create a CaseBlock record representing a conditional branch to // the LHS node if the value being switched on SV is less than C. // Otherwise, branch to LHS. - CaseBlock CB(ISD::SETLT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB); + CaseBlock CB(ISD::SETLT, SV, C, nullptr, TrueBB, FalseBB, CR.CaseBB); if (CR.CaseBB == SwitchBB) visitSwitchCase(CB, SwitchBB); @@ -2682,7 +2679,7 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { MachineBasicBlock *SwitchMBB = FuncInfo.MBB; // Figure out which block is immediately after the current one. - MachineBasicBlock *NextBlock = 0; + MachineBasicBlock *NextBlock = nullptr; MachineBasicBlock *Default = FuncInfo.MBBMap[SI.getDefaultDest()]; // If there is only the default destination, branch to it if it is not the @@ -2716,7 +2713,7 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { // Push the initial CaseRec onto the worklist CaseRecVector WorkList; - WorkList.push_back(CaseRec(SwitchMBB,0,0, + WorkList.push_back(CaseRec(SwitchMBB,nullptr,nullptr, CaseRange(Cases.begin(),Cases.end()))); while (!WorkList.empty()) { @@ -2765,6 +2762,11 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { getValue(I.getAddress()))); } +void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) { + if (DAG.getTarget().Options.TrapUnreachable) + DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot())); +} + void SelectionDAGBuilder::visitFSub(const User &I) { // -0.0 - X --> fneg Type *Ty = I.getType(); @@ -2887,8 +2889,7 @@ void SelectionDAGBuilder::visitSelect(const User &I) { FalseVal.getResNo() + i)); setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), - DAG.getVTList(&ValueVTs[0], NumValues), - &Values[0], NumValues)); + DAG.getVTList(ValueVTs), Values)); } void SelectionDAGBuilder::visitTrunc(const User &I) { @@ -3097,11 +3098,9 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { MOps2[0] = Src2; Src1 = Src1U ? DAG.getUNDEF(VT) : DAG.getNode(ISD::CONCAT_VECTORS, - getCurSDLoc(), VT, - &MOps1[0], NumConcat); + getCurSDLoc(), VT, MOps1); Src2 = Src2U ? 
DAG.getUNDEF(VT) : DAG.getNode(ISD::CONCAT_VECTORS, - getCurSDLoc(), VT, - &MOps2[0], NumConcat); + getCurSDLoc(), VT, MOps2); // Readjust mask for new input vector length. SmallVector MappedOps; @@ -3219,8 +3218,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { Ops.push_back(Res); } - setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), - VT, &Ops[0], Ops.size())); + setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), VT, Ops)); } void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) { @@ -3262,8 +3260,7 @@ void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) { SDValue(Agg.getNode(), Agg.getResNo() + i); setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), - DAG.getVTList(&AggValueVTs[0], NumAggValues), - &Values[0], NumAggValues)); + DAG.getVTList(AggValueVTs), Values)); } void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) { @@ -3297,8 +3294,7 @@ void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) { SDValue(Agg.getNode(), Agg.getResNo() + i); setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), - DAG.getVTList(&ValValueVTs[0], NumValValues), - &Values[0], NumValValues)); + DAG.getVTList(ValValueVTs), Values)); } void SelectionDAGBuilder::visitGetElementPtr(const User &I) { @@ -3420,8 +3416,7 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { SDValue Ops[] = { getRoot(), AllocSize, DAG.getIntPtrConstant(Align) }; SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other); - SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, getCurSDLoc(), - VTs, Ops, 3); + SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, getCurSDLoc(), VTs, Ops); setValue(&I, DSA); DAG.setRoot(DSA.getValue(1)); @@ -3438,8 +3433,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { Type *Ty = I.getType(); bool isVolatile = I.isVolatile(); - bool isNonTemporal = I.getMetadata("nontemporal") != 0; - bool isInvariant = I.getMetadata("invariant.load") != 0; + bool isNonTemporal = I.getMetadata("nontemporal") != nullptr; + bool isInvariant = I.getMetadata("invariant.load") != nullptr; unsigned Alignment = I.getAlignment(); const MDNode *TBAAInfo = I.getMetadata(LLVMContext::MD_tbaa); const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); @@ -3484,8 +3479,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { // (MaxParallelChains should always remain as failsafe). 
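The visitLoad and visitStore hunks just below switch from the &Chains[0]-plus-count form to makeArrayRef(Chains.data(), ChainI). A minimal sketch of that prefix-view idiom, assuming DAG, a debug location DL and the MaxParallelChains limit are in scope; the buffer is sized for the worst case but only the first ChainI slots are live:

    // Build the TokenFactor over just the populated prefix of Chains.
    SmallVector<SDValue, 8> Chains(MaxParallelChains);
    unsigned ChainI = 0;
    // ... store the chain of each emitted load into Chains[ChainI++] ...
    SDValue Root = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                               makeArrayRef(Chains.data(), ChainI));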
if (ChainI == MaxParallelChains) { assert(PendingLoads.empty() && "PendingLoads must be serialized first"); - SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), - MVT::Other, &Chains[0], ChainI); + SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, + makeArrayRef(Chains.data(), ChainI)); Root = Chain; ChainI = 0; } @@ -3502,8 +3497,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { } if (!ConstantMemory) { - SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), - MVT::Other, &Chains[0], ChainI); + SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, + makeArrayRef(Chains.data(), ChainI)); if (isVolatile) DAG.setRoot(Chain); else @@ -3511,8 +3506,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { } setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), - DAG.getVTList(&ValueVTs[0], NumValues), - &Values[0], NumValues)); + DAG.getVTList(ValueVTs), Values)); } void SelectionDAGBuilder::visitStore(const StoreInst &I) { @@ -3540,7 +3534,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { NumValues)); EVT PtrVT = Ptr.getValueType(); bool isVolatile = I.isVolatile(); - bool isNonTemporal = I.getMetadata("nontemporal") != 0; + bool isNonTemporal = I.getMetadata("nontemporal") != nullptr; unsigned Alignment = I.getAlignment(); const MDNode *TBAAInfo = I.getMetadata(LLVMContext::MD_tbaa); @@ -3548,8 +3542,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { for (unsigned i = 0; i != NumValues; ++i, ++ChainI) { // See visitLoad comments. if (ChainI == MaxParallelChains) { - SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), - MVT::Other, &Chains[0], ChainI); + SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, + makeArrayRef(Chains.data(), ChainI)); Root = Chain; ChainI = 0; } @@ -3562,8 +3556,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { Chains[ChainI] = St; } - SDValue StoreNode = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), - MVT::Other, &Chains[0], ChainI); + SDValue StoreNode = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, + makeArrayRef(Chains.data(), ChainI)); DAG.setRoot(StoreNode); } @@ -3588,7 +3582,7 @@ static SDValue InsertFenceForAtomic(SDValue Chain, AtomicOrdering Order, Ops[0] = Chain; Ops[1] = DAG.getConstant(Order, TLI.getPointerTy()); Ops[2] = DAG.getConstant(Scope, TLI.getPointerTy()); - return DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops, 3); + return DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops); } void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) { @@ -3680,7 +3674,7 @@ void SelectionDAGBuilder::visitFence(const FenceInst &I) { Ops[0] = getRoot(); Ops[1] = DAG.getConstant(I.getOrdering(), TLI->getPointerTy()); Ops[2] = DAG.getConstant(I.getSynchScope(), TLI->getPointerTy()); - DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops, 3)); + DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops)); } void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) { @@ -3696,13 +3690,21 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) { if (I.getAlignment() < VT.getSizeInBits() / 8) report_fatal_error("Cannot generate unaligned atomic load"); + MachineMemOperand *MMO = + DAG.getMachineFunction(). + getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()), + MachineMemOperand::MOVolatile | + MachineMemOperand::MOLoad, + VT.getStoreSize(), + I.getAlignment() ? 
I.getAlignment() : + DAG.getEVTAlignment(VT)); + InChain = TLI->prepareVolatileOrAtomicLoad(InChain, dl, DAG); SDValue L = - DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, InChain, - getValue(I.getPointerOperand()), - I.getPointerOperand(), I.getAlignment(), - TLI->getInsertFencesForAtomic() ? Monotonic : Order, - Scope); + DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, InChain, + getValue(I.getPointerOperand()), MMO, + TLI->getInsertFencesForAtomic() ? Monotonic : Order, + Scope); SDValue OutChain = L.getValue(1); @@ -3788,27 +3790,23 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, if (HasChain) ValueVTs.push_back(MVT::Other); - SDVTList VTs = DAG.getVTList(ValueVTs.data(), ValueVTs.size()); + SDVTList VTs = DAG.getVTList(ValueVTs); // Create the node. SDValue Result; if (IsTgtIntrinsic) { // This is target intrinsic that touches memory Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), - VTs, &Ops[0], Ops.size(), - Info.memVT, + VTs, Ops, Info.memVT, MachinePointerInfo(Info.ptrVal, Info.offset), Info.align, Info.vol, Info.readMem, Info.writeMem); } else if (!HasChain) { - Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), - VTs, &Ops[0], Ops.size()); + Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops); } else if (!I.getType()->isVoidTy()) { - Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), - VTs, &Ops[0], Ops.size()); + Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops); } else { - Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), - VTs, &Ops[0], Ops.size()); + Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops); } if (HasChain) { @@ -4530,7 +4528,7 @@ static unsigned getTruncatedArgReg(const SDValue &N) { /// At the end of instruction selection, they will be inserted to the entry BB. bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable, - int64_t Offset, + int64_t Offset, bool IsIndirect, const SDValue &N) { const Argument *Arg = dyn_cast(V); if (!Arg) @@ -4582,8 +4580,6 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable, if (!Op) return false; - // FIXME: This does not handle register-indirect values at offset 0. - bool IsIndirect = Offset != 0; if (Op->isReg()) FuncInfo.ArgDbgValues.push_back(BuildMI(MF, getCurDebugLoc(), TII->get(TargetOpcode::DBG_VALUE), @@ -4619,18 +4615,34 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { default: // By default, turn this into a target intrinsic node. 
visitTargetIntrinsic(I, Intrinsic); - return 0; - case Intrinsic::vastart: visitVAStart(I); return 0; - case Intrinsic::vaend: visitVAEnd(I); return 0; - case Intrinsic::vacopy: visitVACopy(I); return 0; + return nullptr; + case Intrinsic::vastart: visitVAStart(I); return nullptr; + case Intrinsic::vaend: visitVAEnd(I); return nullptr; + case Intrinsic::vacopy: visitVACopy(I); return nullptr; case Intrinsic::returnaddress: setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl, TLI->getPointerTy(), getValue(I.getArgOperand(0)))); - return 0; + return nullptr; case Intrinsic::frameaddress: setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl, TLI->getPointerTy(), getValue(I.getArgOperand(0)))); - return 0; + return nullptr; + case Intrinsic::read_register: { + Value *Reg = I.getArgOperand(0); + SDValue RegName = DAG.getMDNode(cast(Reg)); + EVT VT = TM.getTargetLowering()->getValueType(I.getType()); + setValue(&I, DAG.getNode(ISD::READ_REGISTER, sdl, VT, RegName)); + return nullptr; + } + case Intrinsic::write_register: { + Value *Reg = I.getArgOperand(0); + Value *RegValue = I.getArgOperand(1); + SDValue Chain = getValue(RegValue).getOperand(0); + SDValue RegName = DAG.getMDNode(cast(Reg)); + DAG.setRoot(DAG.getNode(ISD::WRITE_REGISTER, sdl, MVT::Other, Chain, + RegName, getValue(RegValue))); + return nullptr; + } case Intrinsic::setjmp: return &"_setjmp"[!TLI->usesUnderscoreSetJmp()]; case Intrinsic::longjmp: @@ -4653,7 +4665,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DAG.setRoot(DAG.getMemcpy(getRoot(), sdl, Op1, Op2, Op3, Align, isVol, false, MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1)))); - return 0; + return nullptr; } case Intrinsic::memset: { // Assert for address < 256 since we support only user defined address @@ -4670,7 +4682,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { bool isVol = cast(I.getArgOperand(4))->getZExtValue(); DAG.setRoot(DAG.getMemset(getRoot(), sdl, Op1, Op2, Op3, Align, isVol, MachinePointerInfo(I.getArgOperand(0)))); - return 0; + return nullptr; } case Intrinsic::memmove: { // Assert for address < 256 since we support only user defined address @@ -4690,7 +4702,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DAG.setRoot(DAG.getMemmove(getRoot(), sdl, Op1, Op2, Op3, Align, isVol, MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1)))); - return 0; + return nullptr; } case Intrinsic::dbg_declare: { const DbgDeclareInst &DI = cast(I); @@ -4701,14 +4713,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { "Variable in DbgDeclareInst should be either null or a DIVariable."); if (!Address || !DIVar) { DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); - return 0; + return nullptr; } // Check if address has undef value. if (isa(Address) || (Address->use_empty() && !isa(Address))) { DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); - return 0; + return nullptr; } SDValue &N = NodeMap[Address]; @@ -4730,29 +4742,29 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { FrameIndexSDNode *FINode = dyn_cast(N.getNode()); if (FINode) // Byval parameter. We have a frame index at this point. 
- SDV = DAG.getDbgValue(Variable, FINode->getIndex(), - 0, dl, SDNodeOrder); + SDV = DAG.getFrameIndexDbgValue(Variable, FINode->getIndex(), + 0, dl, SDNodeOrder); else { // Address is an argument, so try to emit its dbg value using // virtual register info from the FuncInfo.ValueMap. - EmitFuncArgumentDbgValue(Address, Variable, 0, N); - return 0; + EmitFuncArgumentDbgValue(Address, Variable, 0, false, N); + return nullptr; } } else if (AI) SDV = DAG.getDbgValue(Variable, N.getNode(), N.getResNo(), - 0, dl, SDNodeOrder); + true, 0, dl, SDNodeOrder); else { // Can't do anything with other non-AI cases yet. DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); DEBUG(dbgs() << "non-AllocaInst issue for Address: \n\t"); DEBUG(Address->dump()); - return 0; + return nullptr; } DAG.AddDbgValue(SDV, N.getNode(), isParameter); } else { // If Address is an argument then try to emit its dbg value using // virtual register info from the FuncInfo.ValueMap. - if (!EmitFuncArgumentDbgValue(Address, Variable, 0, N)) { + if (!EmitFuncArgumentDbgValue(Address, Variable, 0, false, N)) { // If variable is pinned by a alloca in dominating bb then // use StaticAllocaMap. if (const AllocaInst *AI = dyn_cast(Address)) { @@ -4760,17 +4772,17 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DenseMap::iterator SI = FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { - SDV = DAG.getDbgValue(Variable, SI->second, - 0, dl, SDNodeOrder); - DAG.AddDbgValue(SDV, 0, false); - return 0; + SDV = DAG.getFrameIndexDbgValue(Variable, SI->second, + 0, dl, SDNodeOrder); + DAG.AddDbgValue(SDV, nullptr, false); + return nullptr; } } } DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); } } - return 0; + return nullptr; } case Intrinsic::dbg_value: { const DbgValueInst &DI = cast(I); @@ -4778,18 +4790,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { assert((!DIVar || DIVar.isVariable()) && "Variable in DbgValueInst should be either null or a DIVariable."); if (!DIVar) - return 0; + return nullptr; MDNode *Variable = DI.getVariable(); uint64_t Offset = DI.getOffset(); const Value *V = DI.getValue(); if (!V) - return 0; + return nullptr; SDDbgValue *SDV; if (isa(V) || isa(V) || isa(V)) { - SDV = DAG.getDbgValue(Variable, V, Offset, dl, SDNodeOrder); - DAG.AddDbgValue(SDV, 0, false); + SDV = DAG.getConstantDbgValue(Variable, V, Offset, dl, SDNodeOrder); + DAG.AddDbgValue(SDV, nullptr, false); } else { // Do not use getValue() in here; we don't want to generate code at // this point if it hasn't been done yet. @@ -4798,9 +4810,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { // Check unused arguments map. N = UnusedArgNodeMap[V]; if (N.getNode()) { - if (!EmitFuncArgumentDbgValue(V, Variable, Offset, N)) { + // A dbg.value for an alloca is always indirect. 
+ bool IsIndirect = isa(V) || Offset != 0; + if (!EmitFuncArgumentDbgValue(V, Variable, Offset, IsIndirect, N)) { SDV = DAG.getDbgValue(Variable, N.getNode(), - N.getResNo(), Offset, dl, SDNodeOrder); + N.getResNo(), IsIndirect, + Offset, dl, SDNodeOrder); DAG.AddDbgValue(SDV, N.getNode(), false); } } else if (!V->use_empty() ) { @@ -4823,18 +4838,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { if (!AI) { DEBUG(dbgs() << "Dropping debug location info for:\n " << DI << "\n"); DEBUG(dbgs() << " Last seen at:\n " << *V << "\n"); - return 0; + return nullptr; } DenseMap::iterator SI = FuncInfo.StaticAllocaMap.find(AI); if (SI == FuncInfo.StaticAllocaMap.end()) - return 0; // VLAs. - int FI = SI->second; - - MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); - if (!DI.getDebugLoc().isUnknown() && MMI.hasDebugInfo()) - MMI.setVariableDbgInfo(Variable, FI, DI.getDebugLoc()); - return 0; + return nullptr; // VLAs. + return nullptr; } case Intrinsic::eh_typeid_for: { @@ -4843,7 +4853,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { unsigned TypeID = DAG.getMachineFunction().getMMI().getTypeIDFor(GV); Res = DAG.getConstant(TypeID, MVT::i32); setValue(&I, Res); - return 0; + return nullptr; } case Intrinsic::eh_return_i32: @@ -4854,10 +4864,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getControlRoot(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)))); - return 0; + return nullptr; case Intrinsic::eh_unwind_init: DAG.getMachineFunction().getMMI().setCallsUnwindInit(true); - return 0; + return nullptr; case Intrinsic::eh_dwarf_cfa: { SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getArgOperand(0)), sdl, TLI->getPointerTy()); @@ -4871,7 +4881,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { DAG.getConstant(0, TLI->getPointerTy())); setValue(&I, DAG.getNode(ISD::ADD, sdl, FA.getValueType(), FA, Offset)); - return 0; + return nullptr; } case Intrinsic::eh_sjlj_callsite: { MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); @@ -4880,7 +4890,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { assert(MMI.getCurrentCallSite() == 0 && "Overlapping call sites!"); MMI.setCurrentCallSite(CI->getZExtValue()); - return 0; + return nullptr; } case Intrinsic::eh_sjlj_functioncontext: { // Get and store the index of the function context. 
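The dbg_declare and dbg_value hunks above move from one overloaded getDbgValue to three purpose-named factories, and thread through the new IsIndirect flag (an alloca, or any nonzero offset, describes a memory location rather than a plain value). A sketch of the three factories with the signatures visible in this patch; V, Offset, N, C, FI, Variable, dl and SDNodeOrder are stand-in names assumed to be in scope as they are in SelectionDAGBuilder:

    // An alloca or a nonzero offset means the value lives in memory.
    bool IsIndirect = isa<AllocaInst>(V) || Offset != 0;
    // Value computed by a DAG node:
    SDDbgValue *ByNode = DAG.getDbgValue(Variable, N.getNode(), N.getResNo(),
                                         IsIndirect, Offset, dl, SDNodeOrder);
    // Constant (no node to attach to):
    SDDbgValue *ByConst = DAG.getConstantDbgValue(Variable, C, Offset,
                                                  dl, SDNodeOrder);
    // Value pinned to a stack slot (e.g. a byval parameter):
    SDDbgValue *ByFI = DAG.getFrameIndexDbgValue(Variable, FI, Offset,
                                                 dl, SDNodeOrder);
    DAG.AddDbgValue(ByNode, N.getNode(), /*isParameter=*/false);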
@@ -4889,23 +4899,22 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { cast(I.getArgOperand(0)->stripPointerCasts()); int FI = FuncInfo.StaticAllocaMap[FnCtx]; MFI->setFunctionContextIndex(FI); - return 0; + return nullptr; } case Intrinsic::eh_sjlj_setjmp: { SDValue Ops[2]; Ops[0] = getRoot(); Ops[1] = getValue(I.getArgOperand(0)); SDValue Op = DAG.getNode(ISD::EH_SJLJ_SETJMP, sdl, - DAG.getVTList(MVT::i32, MVT::Other), - Ops, 2); + DAG.getVTList(MVT::i32, MVT::Other), Ops); setValue(&I, Op.getValue(0)); DAG.setRoot(Op.getValue(1)); - return 0; + return nullptr; } case Intrinsic::eh_sjlj_longjmp: { DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_LONGJMP, sdl, MVT::Other, getRoot(), getValue(I.getArgOperand(0)))); - return 0; + return nullptr; } case Intrinsic::x86_mmx_pslli_w: @@ -4919,7 +4928,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue ShAmt = getValue(I.getArgOperand(1)); if (isa(ShAmt)) { visitTargetIntrinsic(I, Intrinsic); - return 0; + return nullptr; } unsigned NewIntrinsic = 0; EVT ShAmtVT = MVT::v2i32; @@ -4958,14 +4967,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue ShOps[2]; ShOps[0] = ShAmt; ShOps[1] = DAG.getConstant(0, MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, sdl, ShAmtVT, &ShOps[0], 2); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, sdl, ShAmtVT, ShOps); EVT DestVT = TLI->getValueType(I.getType()); ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt); Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT, DAG.getConstant(NewIntrinsic, MVT::i32), getValue(I.getArgOperand(0)), ShAmt); setValue(&I, Res); - return 0; + return nullptr; } case Intrinsic::x86_avx_vinsertf128_pd_256: case Intrinsic::x86_avx_vinsertf128_ps_256: @@ -4980,7 +4989,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(1)), DAG.getConstant(Idx, TLI->getVectorIdxTy())); setValue(&I, Res); - return 0; + return nullptr; } case Intrinsic::x86_avx_vextractf128_pd_256: case Intrinsic::x86_avx_vextractf128_ps_256: @@ -4993,7 +5002,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(0)), DAG.getConstant(Idx, TLI->getVectorIdxTy())); setValue(&I, Res); - return 0; + return nullptr; } case Intrinsic::convertff: case Intrinsic::convertfsi: @@ -5026,31 +5035,31 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(2)), Code); setValue(&I, Res); - return 0; + return nullptr; } case Intrinsic::powi: setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG)); - return 0; + return nullptr; case Intrinsic::log: setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, *TLI)); - return 0; + return nullptr; case Intrinsic::log2: setValue(&I, expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, *TLI)); - return 0; + return nullptr; case Intrinsic::log10: setValue(&I, expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, *TLI)); - return 0; + return nullptr; case Intrinsic::exp: setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, *TLI)); - return 0; + return nullptr; case Intrinsic::exp2: setValue(&I, expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, *TLI)); - return 0; + return nullptr; case Intrinsic::pow: setValue(&I, expandPow(sdl, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG, *TLI)); - return 0; + return nullptr; case Intrinsic::sqrt: case Intrinsic::fabs: case 
Intrinsic::sin: @@ -5079,21 +5088,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { setValue(&I, DAG.getNode(Opcode, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)))); - return 0; + return nullptr; } case Intrinsic::copysign: setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)))); - return 0; + return nullptr; case Intrinsic::fma: setValue(&I, DAG.getNode(ISD::FMA, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), getValue(I.getArgOperand(2)))); - return 0; + return nullptr; case Intrinsic::fmuladd: { EVT VT = TLI->getValueType(I.getType()); if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict && @@ -5114,42 +5123,41 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(2))); setValue(&I, Add); } - return 0; + return nullptr; } case Intrinsic::convert_to_fp16: setValue(&I, DAG.getNode(ISD::FP32_TO_FP16, sdl, MVT::i16, getValue(I.getArgOperand(0)))); - return 0; + return nullptr; case Intrinsic::convert_from_fp16: setValue(&I, DAG.getNode(ISD::FP16_TO_FP32, sdl, MVT::f32, getValue(I.getArgOperand(0)))); - return 0; + return nullptr; case Intrinsic::pcmarker: { SDValue Tmp = getValue(I.getArgOperand(0)); DAG.setRoot(DAG.getNode(ISD::PCMARKER, sdl, MVT::Other, getRoot(), Tmp)); - return 0; + return nullptr; } case Intrinsic::readcyclecounter: { SDValue Op = getRoot(); Res = DAG.getNode(ISD::READCYCLECOUNTER, sdl, - DAG.getVTList(MVT::i64, MVT::Other), - &Op, 1); + DAG.getVTList(MVT::i64, MVT::Other), Op); setValue(&I, Res); DAG.setRoot(Res.getValue(1)); - return 0; + return nullptr; } case Intrinsic::bswap: setValue(&I, DAG.getNode(ISD::BSWAP, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)))); - return 0; + return nullptr; case Intrinsic::cttz: { SDValue Arg = getValue(I.getArgOperand(0)); ConstantInt *CI = cast(I.getArgOperand(1)); EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTTZ : ISD::CTTZ_ZERO_UNDEF, sdl, Ty, Arg)); - return 0; + return nullptr; } case Intrinsic::ctlz: { SDValue Arg = getValue(I.getArgOperand(0)); @@ -5157,26 +5165,26 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTLZ : ISD::CTLZ_ZERO_UNDEF, sdl, Ty, Arg)); - return 0; + return nullptr; } case Intrinsic::ctpop: { SDValue Arg = getValue(I.getArgOperand(0)); EVT Ty = Arg.getValueType(); setValue(&I, DAG.getNode(ISD::CTPOP, sdl, Ty, Arg)); - return 0; + return nullptr; } case Intrinsic::stacksave: { SDValue Op = getRoot(); Res = DAG.getNode(ISD::STACKSAVE, sdl, - DAG.getVTList(TLI->getPointerTy(), MVT::Other), &Op, 1); + DAG.getVTList(TLI->getPointerTy(), MVT::Other), Op); setValue(&I, Res); DAG.setRoot(Res.getValue(1)); - return 0; + return nullptr; } case Intrinsic::stackrestore: { Res = getValue(I.getArgOperand(0)); DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res)); - return 0; + return nullptr; } case Intrinsic::stackprotector: { // Emit code into the DAG to store the stack guard onto the stack. 
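One convention worth making explicit amid the return-value churn above: visitIntrinsicCall returns nullptr once it has lowered the intrinsic in place, and returns a C string when the intrinsic should instead be lowered as a renamed library call. A sketch of the caller side, modeled on how visitCall consumes the result (RenameFn as in that function):

    // nullptr => already lowered; non-null => emit a call to this name.
    if (const char *Renamed = visitIntrinsicCall(I, Intrinsic))
      RenameFn = Renamed;   // e.g. setjmp can come back as "_setjmp"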
@@ -5198,7 +5206,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { true, false, 0); setValue(&I, Res); DAG.setRoot(Res); - return 0; + return nullptr; } case Intrinsic::objectsize: { // If we don't know by now, we're never going to know. @@ -5215,16 +5223,16 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { Res = DAG.getConstant(0, Ty); setValue(&I, Res); - return 0; + return nullptr; } case Intrinsic::annotation: case Intrinsic::ptr_annotation: // Drop the intrinsic, but forward the value setValue(&I, getValue(I.getOperand(0))); - return 0; + return nullptr; case Intrinsic::var_annotation: // Discard annotate attributes - return 0; + return nullptr; case Intrinsic::init_trampoline: { const Function *F = cast(I.getArgOperand(1)->stripPointerCasts()); @@ -5237,16 +5245,16 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { Ops[4] = DAG.getSrcValue(I.getArgOperand(0)); Ops[5] = DAG.getSrcValue(F); - Res = DAG.getNode(ISD::INIT_TRAMPOLINE, sdl, MVT::Other, Ops, 6); + Res = DAG.getNode(ISD::INIT_TRAMPOLINE, sdl, MVT::Other, Ops); DAG.setRoot(Res); - return 0; + return nullptr; } case Intrinsic::adjust_trampoline: { setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, sdl, TLI->getPointerTy(), getValue(I.getArgOperand(0)))); - return 0; + return nullptr; } case Intrinsic::gcroot: if (GFI) { @@ -5256,18 +5264,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { FrameIndexSDNode *FI = cast(getValue(Alloca).getNode()); GFI->addStackRoot(FI->getIndex(), TypeMap); } - return 0; + return nullptr; case Intrinsic::gcread: case Intrinsic::gcwrite: llvm_unreachable("GC failed to lower gcread/gcwrite intrinsics!"); case Intrinsic::flt_rounds: setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, sdl, MVT::i32)); - return 0; + return nullptr; case Intrinsic::expect: { // Just replace __builtin_expect(exp, c) with EXP. setValue(&I, getValue(I.getArgOperand(0))); - return 0; + return nullptr; } case Intrinsic::debugtrap: @@ -5277,20 +5285,19 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ? 
ISD::TRAP : ISD::DEBUGTRAP; DAG.setRoot(DAG.getNode(Op, sdl,MVT::Other, getRoot())); - return 0; + return nullptr; } TargetLowering::ArgListTy Args; - TargetLowering:: - CallLoweringInfo CLI(getRoot(), I.getType(), - false, false, false, false, 0, CallingConv::C, - /*isTailCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed=*/true, - DAG.getExternalSymbol(TrapFuncName.data(), - TLI->getPointerTy()), - Args, DAG, sdl); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(sdl).setChain(getRoot()) + .setCallee(CallingConv::C, I.getType(), + DAG.getExternalSymbol(TrapFuncName.data(), TLI->getPointerTy()), + &Args, 0); + std::pair Result = TLI->LowerCallTo(CLI); DAG.setRoot(Result.second); - return 0; + return nullptr; } case Intrinsic::uadd_with_overflow: @@ -5314,7 +5321,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1); setValue(&I, DAG.getNode(Op, sdl, VTs, Op1, Op2)); - return 0; + return nullptr; } case Intrinsic::prefetch: { SDValue Ops[5]; @@ -5325,22 +5332,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { Ops[3] = getValue(I.getArgOperand(2)); Ops[4] = getValue(I.getArgOperand(3)); DAG.setRoot(DAG.getMemIntrinsicNode(ISD::PREFETCH, sdl, - DAG.getVTList(MVT::Other), - &Ops[0], 5, + DAG.getVTList(MVT::Other), Ops, EVT::getIntegerVT(*Context, 8), MachinePointerInfo(I.getArgOperand(0)), 0, /* align */ false, /* volatile */ rw==0, /* read */ rw==1)); /* write */ - return 0; + return nullptr; } case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { bool IsStart = (Intrinsic == Intrinsic::lifetime_start); // Stack coloring is not enabled in O0, discard region information. if (TM.getOptLevel() == CodeGenOpt::None) - return 0; + return nullptr; SmallVector Allocas; GetUnderlyingObjects(I.getArgOperand(1), Allocas, DL); @@ -5360,18 +5366,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { Ops[1] = DAG.getFrameIndex(FI, TLI->getPointerTy(), true); unsigned Opcode = (IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END); - Res = DAG.getNode(Opcode, sdl, MVT::Other, Ops, 2); + Res = DAG.getNode(Opcode, sdl, MVT::Other, Ops); DAG.setRoot(Res); } - return 0; + return nullptr; } case Intrinsic::invariant_start: // Discard region information. setValue(&I, DAG.getUNDEF(TLI->getPointerTy())); - return 0; + return nullptr; case Intrinsic::invariant_end: // Discard region information. - return 0; + return nullptr; case Intrinsic::stackprotectorcheck: { // Do not actually emit anything for this basic block. Instead we initialize // the stack protector descriptor and export the guard variable so we can @@ -5382,21 +5388,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { // Flush our exports since we are going to process a terminator. 
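The trap-lowering hunk above, and the LowerCallTo and LowerCallOperands hunks below, all replace the old many-argument CallLoweringInfo constructor with chained setters. A sketch of the new builder idiom under the signatures shown in this patch, assuming DAG, sdl, RetTy, a Callee SDValue and a populated ArgListTy are in scope:

    TargetLowering::ArgListTy Args;   // filled in by the caller
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(sdl)
       .setChain(DAG.getRoot())
       .setCallee(CallingConv::C, RetTy, Callee, &Args, /*NumFixedArgs=*/0)
       .setTailCall(false);
    std::pair<SDValue, SDValue> Result = TLI->LowerCallTo(CLI);
    // Result.first is the returned value (if any), Result.second the chain.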
(void)getControlRoot(); - return 0; + return nullptr; } case Intrinsic::clear_cache: return TLI->getClearCacheBuiltinName(); case Intrinsic::donothing: // ignore - return 0; + return nullptr; case Intrinsic::experimental_stackmap: { visitStackmap(I); - return 0; + return nullptr; } case Intrinsic::experimental_patchpoint_void: case Intrinsic::experimental_patchpoint_i64: { visitPatchpoint(I); - return 0; + return nullptr; } } } @@ -5408,7 +5414,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, FunctionType *FTy = cast(PT->getElementType()); Type *RetTy = FTy->getReturnType(); MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); - MCSymbol *BeginLabel = 0; + MCSymbol *BeginLabel = nullptr; TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -5496,9 +5502,10 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, if (isTailCall && !isInTailCallPosition(CS, *TLI)) isTailCall = false; - TargetLowering:: - CallLoweringInfo CLI(getRoot(), RetTy, FTy, isTailCall, Callee, Args, DAG, - getCurSDLoc(), CS); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot()) + .setCallee(RetTy, FTy, Callee, &Args, CS).setTailCall(isTailCall); + std::pair Result = TLI->LowerCallTo(CLI); assert((isTailCall || Result.second.getNode()) && "Non-null chain expected with non-tail call!"); @@ -5537,13 +5544,12 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, } SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), - MVT::Other, &Chains[0], NumValues); + MVT::Other, Chains); PendingLoads.push_back(Chain); setValue(CS.getInstruction(), DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), - DAG.getVTList(&RetTys[0], RetTys.size()), - &Values[0], Values.size())); + DAG.getVTList(RetTys), Values)); } if (!Result.second.getNode()) { @@ -5683,7 +5689,7 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { switch (CSize->getZExtValue()) { default: LoadVT = MVT::Other; - LoadTy = 0; + LoadTy = nullptr; ActuallyDoIt = false; break; case 2: @@ -5910,7 +5916,7 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI(); ComputeUsesVAFloatArgument(I, &MMI); - const char *RenameFn = 0; + const char *RenameFn = nullptr; if (Function *F = I.getCalledFunction()) { if (F->isDeclaration()) { if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo()) { @@ -6085,7 +6091,7 @@ public: RegsForValue AssignedRegs; explicit SDISelAsmOperandInfo(const TargetLowering::AsmOperandInfo &info) - : TargetLowering::AsmOperandInfo(info), CallOperand(0,0) { + : TargetLowering::AsmOperandInfo(info), CallOperand(nullptr,0) { } /// getCallOperandValEVT - Return the EVT of the Value* that this operand @@ -6094,7 +6100,7 @@ public: EVT getCallOperandValEVT(LLVMContext &Context, const TargetLowering &TLI, const DataLayout *DL) const { - if (CallOperandVal == 0) return MVT::Other; + if (!CallOperandVal) return MVT::Other; if (isa(CallOperandVal)) return TLI.getPointerTy(); @@ -6415,7 +6421,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { } // There is no longer a Value* corresponding to this operand. - OpInfo.CallOperandVal = 0; + OpInfo.CallOperandVal = nullptr; // It is now an indirect operand. 
OpInfo.isIndirect = true; @@ -6704,8 +6710,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { if (Flag.getNode()) AsmNodeOperands.push_back(Flag); Chain = DAG.getNode(ISD::INLINEASM, getCurSDLoc(), - DAG.getVTList(MVT::Other, MVT::Glue), - &AsmNodeOperands[0], AsmNodeOperands.size()); + DAG.getVTList(MVT::Other, MVT::Glue), AsmNodeOperands); Flag = Chain.getValue(1); // If this asm returns a register value, copy the result from that register @@ -6768,8 +6773,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { } if (!OutChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, - &OutChains[0], OutChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, OutChains); DAG.setRoot(Chain); } @@ -6839,10 +6843,10 @@ SelectionDAGBuilder::LowerCallOperands(const CallInst &CI, unsigned ArgIdx, } Type *retTy = useVoidTy ? Type::getVoidTy(*DAG.getContext()) : CI.getType(); - TargetLowering::CallLoweringInfo CLI(getRoot(), retTy, /*retSExt*/ false, - /*retZExt*/ false, /*isVarArg*/ false, /*isInReg*/ false, NumArgs, - CI.getCallingConv(), /*isTailCall*/ false, /*doesNotReturn*/ false, - /*isReturnValueUsed*/ CI.use_empty(), Callee, Args, DAG, getCurSDLoc()); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot()) + .setCallee(CI.getCallingConv(), retTy, Callee, &Args, NumArgs) + .setDiscardResult(!CI.use_empty()); const TargetLowering *TLI = TM.getTargetLowering(); return TLI->LowerCallTo(CLI); @@ -7056,7 +7060,7 @@ void SelectionDAGBuilder::visitPatchpoint(const CallInst &CI) { // There is always a chain and a glue type at the end ValueVTs.push_back(MVT::Other); ValueVTs.push_back(MVT::Glue); - NodeTys = DAG.getVTList(ValueVTs.data(), ValueVTs.size()); + NodeTys = DAG.getVTList(ValueVTs); } else NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -7120,19 +7124,23 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // Handle all of the outgoing arguments. CLI.Outs.clear(); CLI.OutVals.clear(); - ArgListTy &Args = CLI.Args; + ArgListTy &Args = CLI.getArgs(); for (unsigned i = 0, e = Args.size(); i != e; ++i) { SmallVector ValueVTs; ComputeValueVTs(*this, Args[i].Ty, ValueVTs); - for (unsigned Value = 0, NumValues = ValueVTs.size(); - Value != NumValues; ++Value) { + Type *FinalType = Args[i].Ty; + if (Args[i].isByVal) + FinalType = cast(Args[i].Ty)->getElementType(); + bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( + FinalType, CLI.CallConv, CLI.IsVarArg); + for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; + ++Value) { EVT VT = ValueVTs[Value]; Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext()); SDValue Op = SDValue(Args[i].Node.getNode(), Args[i].Node.getResNo() + Value); ISD::ArgFlagsTy Flags; - unsigned OriginalAlignment = - getDataLayout()->getABITypeAlignment(ArgTy); + unsigned OriginalAlignment = getDataLayout()->getABITypeAlignment(ArgTy); if (Args[i].isZExt) Flags.setZExt(); @@ -7168,6 +7176,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { } if (Args[i].isNest) Flags.setNest(); + if (NeedsRegBlock) + Flags.setInConsecutiveRegs(); Flags.setOrigAlign(OriginalAlignment); MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT); @@ -7200,8 +7210,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { Flags.setReturned(); } - getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, - PartVT, CLI.CS ? 
CLI.CS->getInstruction() : 0, ExtendKind); + getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT, + CLI.CS ? CLI.CS->getInstruction() : nullptr, ExtendKind); for (unsigned j = 0; j != NumParts; ++j) { // if it isn't first piece, alignment must be 1 @@ -7213,6 +7223,10 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { else if (j != 0) MyFlags.Flags.setOrigAlign(1); + // Only mark the end at the last register of the last value. + if (NeedsRegBlock && Value == NumValues - 1 && j == NumParts - 1) + MyFlags.Flags.setInConsecutiveRegsLast(); + CLI.Outs.push_back(MyFlags); CLI.OutVals.push_back(Parts[j]); } @@ -7261,7 +7275,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT); ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg], - NumRegs, RegisterVT, VT, NULL, + NumRegs, RegisterVT, VT, nullptr, AssertOp)); CurReg += NumRegs; } @@ -7273,8 +7287,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { return std::make_pair(SDValue(), CLI.Chain); SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL, - CLI.DAG.getVTList(&RetTys[0], RetTys.size()), - &ReturnValues[0], ReturnValues.size()); + CLI.DAG.getVTList(RetTys), ReturnValues); return std::make_pair(Res, CLI.Chain); } @@ -7301,7 +7314,7 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) { const TargetLowering *TLI = TM.getTargetLowering(); RegsForValue RFV(V->getContext(), *TLI, Reg, V->getType()); SDValue Chain = DAG.getEntryNode(); - RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, 0, V); + RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, nullptr, V); PendingExports.push_back(Chain); } @@ -7354,13 +7367,17 @@ void SelectionDAGISel::LowerArguments(const Function &F) { ComputeValueVTs(*TLI, I->getType(), ValueVTs); bool isArgValueUsed = !I->use_empty(); unsigned PartBase = 0; + Type *FinalType = I->getType(); + if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal)) + FinalType = cast(FinalType)->getElementType(); + bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters( + FinalType, F.getCallingConv(), F.isVarArg()); for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; ++Value) { EVT VT = ValueVTs[Value]; Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); ISD::ArgFlagsTy Flags; - unsigned OriginalAlignment = - DL->getABITypeAlignment(ArgTy); + unsigned OriginalAlignment = DL->getABITypeAlignment(ArgTy); if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt)) Flags.setZExt(); @@ -7396,6 +7413,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) { } if (F.getAttributes().hasAttribute(Idx, Attribute::Nest)) Flags.setNest(); + if (NeedsRegBlock) + Flags.setInConsecutiveRegs(); Flags.setOrigAlign(OriginalAlignment); MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT); @@ -7408,6 +7427,11 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // if it isn't first piece, alignment must be 1 else if (i > 0) MyFlags.Flags.setOrigAlign(1); + + // Only mark the end at the last register of the last value. 
+ if (NeedsRegBlock && Value == NumValues - 1 && i == NumRegs - 1) + MyFlags.Flags.setInConsecutiveRegsLast(); + Ins.push_back(MyFlags); } PartBase += VT.getStoreSize(); @@ -7449,7 +7473,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT); ISD::NodeType AssertOp = ISD::DELETED_NODE; SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1, - RegVT, VT, NULL, AssertOp); + RegVT, VT, nullptr, AssertOp); MachineFunction& MF = SDB->DAG.getMachineFunction(); MachineRegisterInfo& RegInfo = MF.getRegInfo(); @@ -7496,7 +7520,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts, PartVT, VT, - NULL, AssertOp)); + nullptr, AssertOp)); } i += NumParts; @@ -7511,7 +7535,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { dyn_cast(ArgValues[0].getNode())) FuncInfo->setArgumentFrameIndex(I, FI->getIndex()); - SDValue Res = DAG.getMergeValues(&ArgValues[0], NumValues, + SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues), SDB->getCurSDLoc()); SDB->setValue(I, Res); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 66835bf..fb29691 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -96,7 +96,7 @@ class SelectionDAGBuilder { DebugLoc dl; unsigned SDNodeOrder; public: - DanglingDebugInfo() : DI(0), dl(DebugLoc()), SDNodeOrder(0) { } + DanglingDebugInfo() : DI(nullptr), dl(DebugLoc()), SDNodeOrder(0) { } DanglingDebugInfo(const DbgValueInst *di, DebugLoc DL, unsigned SDNO) : DI(di), dl(DL), SDNodeOrder(SDNO) { } const DbgValueInst* getDI() { return DI; } @@ -135,7 +135,7 @@ private: MachineBasicBlock* BB; uint32_t ExtraWeight; - Case() : Low(0), High(0), BB(0), ExtraWeight(0) { } + Case() : Low(nullptr), High(nullptr), BB(nullptr), ExtraWeight(0) { } Case(const Constant *low, const Constant *high, MachineBasicBlock *bb, uint32_t extraweight) : Low(low), High(high), BB(bb), ExtraWeight(extraweight) { } @@ -396,8 +396,8 @@ private: /// the same function, use the same failure basic block). class StackProtectorDescriptor { public: - StackProtectorDescriptor() : ParentMBB(0), SuccessMBB(0), FailureMBB(0), - Guard(0) { } + StackProtectorDescriptor() : ParentMBB(nullptr), SuccessMBB(nullptr), + FailureMBB(nullptr), Guard(nullptr) { } ~StackProtectorDescriptor() { } /// Returns true if all fields of the stack protector descriptor are @@ -432,8 +432,8 @@ private: /// parent mbb after we create the stack protector check (SuccessMBB). This /// BB is visited only on stack protector check success. void resetPerBBState() { - ParentMBB = 0; - SuccessMBB = 0; + ParentMBB = nullptr; + SuccessMBB = nullptr; } /// Reset state that only changes when we switch functions. @@ -446,8 +446,8 @@ private: /// 2.The guard variable since the guard variable we are checking against is /// always the same. void resetPerFunctionState() { - FailureMBB = 0; - Guard = 0; + FailureMBB = nullptr; + Guard = nullptr; } MachineBasicBlock *getParentMBB() { return ParentMBB; } @@ -482,7 +482,7 @@ private: /// block will be created. 
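The two hunks above (outgoing call arguments and incoming formal arguments) flag value pieces that the target wants placed in consecutive registers: every part is marked InConsecutiveRegs, and only the final part of the final value gets the closing InConsecutiveRegsLast marker. A self-contained sketch of that boundary bookkeeping, with a plain struct standing in for ISD::ArgFlagsTy:

    #include <cassert>
    #include <vector>

    // Stand-in for ISD::ArgFlagsTy; only the two flags of interest here.
    struct PartFlags {
      bool InConsecutiveRegs = false;
      bool InConsecutiveRegsLast = false;
    };

    // Flatten (value, part) pairs and mark block boundaries the way the
    // NeedsRegBlock logic above does. Assumes every value splits into the
    // same number of parts, which the real code does not require.
    std::vector<PartFlags> markBlock(unsigned NumValues, unsigned NumParts) {
      std::vector<PartFlags> Out;
      for (unsigned Value = 0; Value != NumValues; ++Value)
        for (unsigned Part = 0; Part != NumParts; ++Part) {
          PartFlags F;
          F.InConsecutiveRegs = true;  // every piece of the block is marked
          // Only the last register of the last value closes the block.
          F.InConsecutiveRegsLast =
              Value == NumValues - 1 && Part == NumParts - 1;
          Out.push_back(F);
        }
      return Out;
    }

    int main() {
      std::vector<PartFlags> Flags = markBlock(2, 2);
      assert(Flags.size() == 4);
      assert(!Flags[0].InConsecutiveRegsLast && Flags[3].InConsecutiveRegsLast);
      return 0;
    }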
MachineBasicBlock *AddSuccessorMBB(const BasicBlock *BB, MachineBasicBlock *ParentMBB, - MachineBasicBlock *SuccMBB = 0); + MachineBasicBlock *SuccMBB = nullptr); }; private: @@ -538,7 +538,7 @@ public: SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo, CodeGenOpt::Level ol) - : CurInst(NULL), SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()), + : CurInst(nullptr), SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()), DAG(dag), FuncInfo(funcinfo), OptLevel(ol), HasTailCall(false) { } @@ -600,13 +600,13 @@ public: void setValue(const Value *V, SDValue NewN) { SDValue &N = NodeMap[V]; - assert(N.getNode() == 0 && "Already set a value for this node!"); + assert(!N.getNode() && "Already set a value for this node!"); N = NewN; } void setUnusedArgValue(const Value *V, SDValue NewN) { SDValue &N = UnusedArgNodeMap[V]; - assert(N.getNode() == 0 && "Already set a value for this node!"); + assert(!N.getNode() && "Already set a value for this node!"); N = NewN; } @@ -624,7 +624,7 @@ public: void CopyToExportRegsIfNeeded(const Value *V); void ExportFromCurrentBlock(const Value *V); void LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool IsTailCall, - MachineBasicBlock *LandingPad = NULL); + MachineBasicBlock *LandingPad = nullptr); std::pair LowerCallOperands(const CallInst &CI, unsigned ArgIdx, @@ -642,7 +642,7 @@ private: void visitBr(const BranchInst &I); void visitSwitch(const SwitchInst &I); void visitIndirectBr(const IndirectBrInst &I); - void visitUnreachable(const UnreachableInst &I) { /* noop */ } + void visitUnreachable(const UnreachableInst &I); // Helpers for visitSwitch bool handleSmallSwitchRange(CaseRec& CR, @@ -785,7 +785,8 @@ private: /// corresponding DBG_VALUE machine instruction for it now. At the end of /// instruction selection, they will be inserted to the entry BB. bool EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable, - int64_t Offset, const SDValue &N); + int64_t Offset, bool IsIndirect, + const SDValue &N); }; } // end namespace llvm diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 535feba..d6b5255 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -93,6 +93,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::GLOBAL_OFFSET_TABLE: return "GLOBAL_OFFSET_TABLE"; case ISD::RETURNADDR: return "RETURNADDR"; case ISD::FRAMEADDR: return "FRAMEADDR"; + case ISD::READ_REGISTER: return "READ_REGISTER"; + case ISD::WRITE_REGISTER: return "WRITE_REGISTER"; case ISD::FRAME_TO_ARGS_OFFSET: return "FRAME_TO_ARGS_OFFSET"; case ISD::EH_RETURN: return "EH_RETURN"; case ISD::EH_SJLJ_SETJMP: return "EH_SJLJ_SETJMP"; @@ -330,7 +332,7 @@ const char *SDNode::getIndexedModeName(ISD::MemIndexedMode AM) { } } -void SDNode::dump() const { dump(0); } +void SDNode::dump() const { dump(nullptr); } void SDNode::dump(const SelectionDAG *G) const { print(dbgs(), G); dbgs() << '\n'; @@ -427,7 +429,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { OS << LBB->getName() << " "; OS << (const void*)BBDN->getBasicBlock() << ">"; } else if (const RegisterSDNode *R = dyn_cast(this)) { - OS << ' ' << PrintReg(R->getReg(), G ? G->getTarget().getRegisterInfo() :0); + OS << ' ' << PrintReg(R->getReg(), G ? 
                           G->getTarget().getRegisterInfo() : nullptr);
   } else if (const ExternalSymbolSDNode *ES =
                dyn_cast<ExternalSymbolSDNode>(this)) {
     OS << "'" << ES->getSymbol() << "'";
@@ -595,7 +597,7 @@ static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,
 void SDNode::dumpr() const {
   VisitedSDNodeSet once;
-  DumpNodesr(dbgs(), this, 0, 0, once);
+  DumpNodesr(dbgs(), this, 0, nullptr, once);
 }

 void SDNode::dumpr(const SelectionDAG *G) const {
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 5d0e2b9..472fc9c 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "isel"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "ScheduleDAGSDNodes.h"
 #include "SelectionDAGBuilder.h"
@@ -58,6 +57,8 @@
 #include <algorithm>
 using namespace llvm;

+#define DEBUG_TYPE "isel"
+
 STATISTIC(NumFastIselFailures, "Number of instructions fast isel failed on");
 STATISTIC(NumFastIselSuccess, "Number of instructions fast isel selected");
 STATISTIC(NumFastIselBlocks, "Number of blocks selected entirely by fast isel");
@@ -299,7 +300,7 @@ TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
          "'usesCustomInserter', it must implement "
          "TargetLowering::EmitInstrWithCustomInserter!";
 #endif
-  llvm_unreachable(0);
+  llvm_unreachable(nullptr);
 }

 void TargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
@@ -356,7 +357,7 @@ static void SplitCriticalSideEffectEdges(Function &Fn, Pass *SDISel) {
   // Loop for blocks with phi nodes.
   for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
     PHINode *PN = dyn_cast<PHINode>(BB->begin());
-    if (PN == 0) continue;
+    if (!PN) continue;

   ReprocessBlock:
     // For each block with a PHI node, check to see if any of the input values
@@ -366,7 +367,7 @@ static void SplitCriticalSideEffectEdges(Function &Fn, Pass *SDISel) {
     for (BasicBlock::iterator I = BB->begin(); (PN = dyn_cast<PHINode>(I)); ++I)
       for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
         ConstantExpr *CE = dyn_cast<ConstantExpr>(PN->getIncomingValue(i));
-        if (CE == 0 || !CE->canTrap()) continue;
+        if (!CE || !CE->canTrap()) continue;

         // The only case we have to worry about is when the edge is critical.
         // Since this block has a PHI Node, we assume it has multiple input
@@ -399,7 +400,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   RegInfo = &MF->getRegInfo();
   AA = &getAnalysis<AliasAnalysis>();
   LibInfo = &getAnalysis<TargetLibraryInfo>();
-  GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : 0;
+  GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;

   TargetSubtargetInfo &ST =
     const_cast<TargetSubtargetInfo&>(TM.getSubtarget<TargetSubtargetInfo>());
@@ -422,7 +423,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   if (UseMBPI && OptLevel != CodeGenOpt::None)
     FuncInfo->BPI = &getAnalysis<BranchProbabilityInfo>();
   else
-    FuncInfo->BPI = 0;
+    FuncInfo->BPI = nullptr;

   SDB->init(GFI, *AA, LibInfo);

@@ -482,7 +483,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
       // If this vreg is directly copied into an exported register then
       // that COPY instructions also need DBG_VALUE, if it is the only
       // user of LDI->second.
- MachineInstr *CopyUseMI = NULL; + MachineInstr *CopyUseMI = nullptr; for (MachineRegisterInfo::use_instr_iterator UI = RegInfo->use_instr_begin(LDI->second), E = RegInfo->use_instr_end(); UI != E; ) { @@ -492,7 +493,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { CopyUseMI = UseMI; continue; } // Otherwise this is another use or second copy use. - CopyUseMI = NULL; break; + CopyUseMI = nullptr; break; } if (CopyUseMI) { MachineInstr *NewMI = @@ -509,21 +510,17 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { // Determine if there are any calls in this machine function. MachineFrameInfo *MFI = MF->getFrameInfo(); - for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E; - ++I) { - + for (const auto &MBB : *MF) { if (MFI->hasCalls() && MF->hasInlineAsm()) break; - const MachineBasicBlock *MBB = I; - for (MachineBasicBlock::const_iterator II = MBB->begin(), IE = MBB->end(); - II != IE; ++II) { - const MCInstrDesc &MCID = TM.getInstrInfo()->get(II->getOpcode()); + for (const auto &MI : MBB) { + const MCInstrDesc &MCID = TM.getInstrInfo()->get(MI.getOpcode()); if ((MCID.isCall() && !MCID.isReturn()) || - II->isStackAligningInlineAsm()) { + MI.isStackAligningInlineAsm()) { MFI->setHasCalls(true); } - if (II->isInlineAsm()) { + if (MI.isInlineAsm()) { MF->setHasInlineAsm(true); } } @@ -624,7 +621,7 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() { continue; unsigned NumSignBits = CurDAG->ComputeNumSignBits(Src); - CurDAG->ComputeMaskedBits(Src, KnownZero, KnownOne); + CurDAG->computeKnownBits(Src, KnownZero, KnownOne); FuncInfo->AddLiveOutRegInfo(DestReg, NumSignBits, KnownZero, KnownOne); } while (!Worklist.empty()); } @@ -994,7 +991,7 @@ static void collectFailStats(const Instruction *I) { void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // Initialize the Fast-ISel state, if needed. - FastISel *FastIS = 0; + FastISel *FastIS = nullptr; if (TM.Options.EnableFastISel) FastIS = getTargetLowering()->createFastISel(*FuncInfo, LibInfo); @@ -1069,7 +1066,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { if (FuncInfo->InsertPt != FuncInfo->MBB->begin()) FastIS->setLastLocalValue(std::prev(FuncInfo->InsertPt)); else - FastIS->setLastLocalValue(0); + FastIS->setLastLocalValue(nullptr); } unsigned NumFastIselRemaining = std::distance(Begin, End); @@ -1607,7 +1604,7 @@ bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS, APInt NeededMask = DesiredMask & ~ActualMask; APInt KnownZero, KnownOne; - CurDAG->ComputeMaskedBits(LHS, KnownZero, KnownOne); + CurDAG->computeKnownBits(LHS, KnownZero, KnownOne); // If all the missing bits in the or are already known to be set, match! if ((NeededMask & KnownOne) == NeededMask) @@ -1676,7 +1673,7 @@ static SDNode *findGlueUse(SDNode *N) { if (Use.getResNo() == FlagResNo) return Use.getUser(); } - return NULL; + return nullptr; } /// findNonImmUse - Return true if "Use" is a non-immediate use of "Def". 
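The hasCalls/hasInlineAsm scan above is also converted from explicit iterator pairs to C++11 range-based for over the function's blocks and instructions. The same mechanical rewrite on a generic nested container (illustrative types, not MachineFunction):

    #include <vector>

    struct Instr { bool IsCall; };
    using Block = std::vector<Instr>;

    // Old style: nested explicit iterators, as in the removed lines.
    bool anyCallIterators(const std::vector<Block> &Fn) {
      for (std::vector<Block>::const_iterator B = Fn.begin(), BE = Fn.end();
           B != BE; ++B)
        for (Block::const_iterator I = B->begin(), IE = B->end(); I != IE; ++I)
          if (I->IsCall)
            return true;
      return false;
    }

    // New style: range-based for, as in the patch's loop over *MF.
    bool anyCallRanges(const std::vector<Block> &Fn) {
      for (const auto &B : Fn)
        for (const auto &I : B)
          if (I.IsCall)
            return true;
      return false;
    }

    int main() {
      std::vector<Block> Fn{{{false}, {true}}};
      return anyCallIterators(Fn) == anyCallRanges(Fn) ? 0 : 1;
    }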
@@ -1783,7 +1780,7 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root, EVT VT = Root->getValueType(Root->getNumValues()-1); while (VT == MVT::Glue) { SDNode *GU = findGlueUse(Root); - if (GU == NULL) + if (!GU) break; Root = GU; VT = Root->getValueType(Root->getNumValues()-1); @@ -1805,12 +1802,39 @@ SDNode *SelectionDAGISel::Select_INLINEASM(SDNode *N) { SelectInlineAsmMemoryOperands(Ops); EVT VTs[] = { MVT::Other, MVT::Glue }; - SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N), - VTs, &Ops[0], Ops.size()); + SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N), VTs, Ops); + New->setNodeId(-1); + return New.getNode(); +} + +SDNode +*SelectionDAGISel::Select_READ_REGISTER(SDNode *Op) { + SDLoc dl(Op); + MDNodeSDNode *MD = dyn_cast(Op->getOperand(0)); + const MDString *RegStr = dyn_cast(MD->getMD()->getOperand(0)); + unsigned Reg = getTargetLowering()->getRegisterByName( + RegStr->getString().data(), Op->getValueType(0)); + SDValue New = CurDAG->getCopyFromReg( + CurDAG->getEntryNode(), dl, Reg, Op->getValueType(0)); New->setNodeId(-1); return New.getNode(); } +SDNode +*SelectionDAGISel::Select_WRITE_REGISTER(SDNode *Op) { + SDLoc dl(Op); + MDNodeSDNode *MD = dyn_cast(Op->getOperand(1)); + const MDString *RegStr = dyn_cast(MD->getMD()->getOperand(0)); + unsigned Reg = getTargetLowering()->getRegisterByName( + RegStr->getString().data(), Op->getOperand(2).getValueType()); + SDValue New = CurDAG->getCopyToReg( + CurDAG->getEntryNode(), dl, Reg, Op->getOperand(2)); + New->setNodeId(-1); + return New.getNode(); +} + + + SDNode *SelectionDAGISel::Select_UNDEF(SDNode *N) { return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF,N->getValueType(0)); } @@ -1846,7 +1870,7 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain, // Now that all the normal results are replaced, we replace the chain and // glue results if present. if (!ChainNodesMatched.empty()) { - assert(InputChain.getNode() != 0 && + assert(InputChain.getNode() && "Matched input chains but didn't produce a chain"); // Loop over all of the nodes we matched that produced a chain result. // Replace all the chain results with the final chain we ended up with. @@ -1877,7 +1901,7 @@ UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain, // If the result produces glue, update any glue results in the matched // pattern with the glue result. - if (InputGlue.getNode() != 0) { + if (InputGlue.getNode()) { // Handle any interior nodes explicitly marked. for (unsigned i = 0, e = GlueResultNodesMatched.size(); i != e; ++i) { SDNode *FRN = GlueResultNodesMatched[i]; @@ -2080,13 +2104,13 @@ HandleMergeInputChains(SmallVectorImpl &ChainNodesMatched, if (InputChains.size() == 1) return InputChains[0]; return CurDAG->getNode(ISD::TokenFactor, SDLoc(ChainNodesMatched[0]), - MVT::Other, &InputChains[0], InputChains.size()); + MVT::Other, InputChains); } /// MorphNode - Handle morphing a node in place for the selector. SDNode *SelectionDAGISel:: MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, - const SDValue *Ops, unsigned NumOps, unsigned EmitNodeInfo) { + ArrayRef Ops, unsigned EmitNodeInfo) { // It is possible we're using MorphNodeTo to replace a node with no // normal results with one that has a normal result (or we could be // adding a chain) and the input could have glue and chains as well. @@ -2106,7 +2130,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, // Call the underlying SelectionDAG routine to do the transmogrification. 
Note // that this deletes operands of the old node that become dead. - SDNode *Res = CurDAG->MorphNodeTo(Node, ~TargetOpc, VTList, Ops, NumOps); + SDNode *Res = CurDAG->MorphNodeTo(Node, ~TargetOpc, VTList, Ops); // MorphNodeTo can operate in two ways: if an existing node with the // specified operands exists, it can just return it. Otherwise, it @@ -2230,7 +2254,7 @@ CheckInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex, Val = GetVBR(Val, MatcherTable, MatcherIndex); ConstantSDNode *C = dyn_cast(N); - return C != 0 && C->getSExtValue() == Val; + return C && C->getSExtValue() == Val; } LLVM_ATTRIBUTE_ALWAYS_INLINE static bool @@ -2251,7 +2275,7 @@ CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, if (N->getOpcode() != ISD::AND) return false; ConstantSDNode *C = dyn_cast(N->getOperand(1)); - return C != 0 && SDISel.CheckAndMask(N.getOperand(0), C, Val); + return C && SDISel.CheckAndMask(N.getOperand(0), C, Val); } LLVM_ATTRIBUTE_ALWAYS_INLINE static bool @@ -2264,7 +2288,7 @@ CheckOrImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, if (N->getOpcode() != ISD::OR) return false; ConstantSDNode *C = dyn_cast(N->getOperand(1)); - return C != 0 && SDISel.CheckOrMask(N.getOperand(0), C, Val); + return C && SDISel.CheckOrMask(N.getOperand(0), C, Val); } /// IsPredicateKnownToFail - If we know how and can do so without pushing a @@ -2396,13 +2420,15 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, case ISD::LIFETIME_START: case ISD::LIFETIME_END: NodeToMatch->setNodeId(-1); // Mark selected. - return 0; + return nullptr; case ISD::AssertSext: case ISD::AssertZext: CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, 0), NodeToMatch->getOperand(0)); - return 0; + return nullptr; case ISD::INLINEASM: return Select_INLINEASM(NodeToMatch); + case ISD::READ_REGISTER: return Select_READ_REGISTER(NodeToMatch); + case ISD::WRITE_REGISTER: return Select_WRITE_REGISTER(NodeToMatch); case ISD::UNDEF: return Select_UNDEF(NodeToMatch); } @@ -2548,7 +2574,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, } case OPC_RecordNode: { // Remember this node, it may end up being an operand in the pattern. 
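Many hunks in this patch, like MorphNode above, fold a (pointer, count) argument pair such as &Ops[0], Ops.size() into a single ArrayRef parameter. A stripped-down sketch of such a non-owning view; LLVM's real ArrayRef lives in llvm/ADT/ArrayRef.h and has a much richer API:

    #include <cstddef>
    #include <vector>

    // Non-owning view over a contiguous sequence; cheap to pass by value.
    template <typename T> class array_ref {
      const T *Data = nullptr;
      size_t Length = 0;
    public:
      array_ref() = default;
      array_ref(const T *data, size_t length) : Data(data), Length(length) {}
      array_ref(const std::vector<T> &v) : Data(v.data()), Length(v.size()) {}
      template <size_t N>
      array_ref(const T (&arr)[N]) : Data(arr), Length(N) {}
      const T *begin() const { return Data; }
      const T *end() const { return Data + Length; }
      size_t size() const { return Length; }
    };

    // One signature replaces the old (const int *Ops, unsigned NumOps) pair.
    int sum(array_ref<int> Ops) {
      int S = 0;
      for (int V : Ops) S += V;
      return S;
    }

    int main() {
      std::vector<int> V{1, 2, 3};
      int A[2] = {4, 5};
      return sum(V) + sum(A) == 15 ? 0 : 1;  // both convert implicitly
    }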
-      SDNode *Parent = 0;
+      SDNode *Parent = nullptr;
       if (NodeStack.size() > 1)
         Parent = NodeStack[NodeStack.size()-2].getNode();
       RecordedNodes.push_back(std::make_pair(N, Parent));
       continue;
     }
@@ -2755,7 +2781,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
       if (Val & 128)
         Val = GetVBR(Val, MatcherTable, MatcherIndex);
       RecordedNodes.push_back(std::pair<SDValue, SDNode*>(
-                              CurDAG->getTargetConstant(Val, VT), (SDNode*)0));
+                              CurDAG->getTargetConstant(Val, VT), nullptr));
       continue;
     }
     case OPC_EmitRegister: {
@@ -2763,7 +2789,7 @@
               (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
       unsigned RegNo = MatcherTable[MatcherIndex++];
       RecordedNodes.push_back(std::pair<SDValue, SDNode*>(
-                              CurDAG->getRegister(RegNo, VT), (SDNode*)0));
+                              CurDAG->getRegister(RegNo, VT), nullptr));
       continue;
     }
     case OPC_EmitRegister2: {
@@ -2775,7 +2801,7 @@
       unsigned RegNo = MatcherTable[MatcherIndex++];
       RegNo |= MatcherTable[MatcherIndex++] << 8;
       RecordedNodes.push_back(std::pair<SDValue, SDNode*>(
-                              CurDAG->getRegister(RegNo, VT), (SDNode*)0));
+                              CurDAG->getRegister(RegNo, VT), nullptr));
       continue;
     }

@@ -2800,7 +2826,7 @@
     case OPC_EmitMergeInputChains1_0:    // OPC_EmitMergeInputChains, 1, 0
     case OPC_EmitMergeInputChains1_1: {  // OPC_EmitMergeInputChains, 1, 1
       // These are space-optimized forms of OPC_EmitMergeInputChains.
-      assert(InputChain.getNode() == 0 &&
+      assert(!InputChain.getNode() &&
              "EmitMergeInputChains should be the first chain producing node");
       assert(ChainNodesMatched.empty() &&
              "Should only have one EmitMergeInputChains per match");
@@ -2821,13 +2847,13 @@
       // Merge the input chains if they are not intra-pattern references.
       InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG);

-      if (InputChain.getNode() == 0)
+      if (!InputChain.getNode())
         break;  // Failed to merge.

       continue;
     }

     case OPC_EmitMergeInputChains: {
-      assert(InputChain.getNode() == 0 &&
+      assert(!InputChain.getNode() &&
              "EmitMergeInputChains should be the first chain producing node");
       // This node gets a list of nodes we matched in the input that have
       // chains. We want to token factor all of the input chains to these nodes
@@ -2863,7 +2889,7 @@
       // Merge the input chains if they are not intra-pattern references.
       InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG);

-      if (InputChain.getNode() == 0)
+      if (!InputChain.getNode())
        break;  // Failed to merge.
continue; @@ -2874,7 +2900,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, assert(RecNo < RecordedNodes.size() && "Invalid EmitCopyToReg"); unsigned DestPhysReg = MatcherTable[MatcherIndex++]; - if (InputChain.getNode() == 0) + if (!InputChain.getNode()) InputChain = CurDAG->getEntryNode(); InputChain = CurDAG->getCopyToReg(InputChain, SDLoc(NodeToMatch), @@ -2890,7 +2916,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, unsigned RecNo = MatcherTable[MatcherIndex++]; assert(RecNo < RecordedNodes.size() && "Invalid EmitNodeXForm"); SDValue Res = RunSDNodeXForm(RecordedNodes[RecNo].first, XFormNo); - RecordedNodes.push_back(std::pair(Res, (SDNode*) 0)); + RecordedNodes.push_back(std::pair(Res, nullptr)); continue; } @@ -2922,7 +2948,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, else if (VTs.size() == 2) VTList = CurDAG->getVTList(VTs[0], VTs[1]); else - VTList = CurDAG->getVTList(VTs.data(), VTs.size()); + VTList = CurDAG->getVTList(VTs); // Get the operand list. unsigned NumOps = MatcherTable[MatcherIndex++]; @@ -2956,11 +2982,11 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, // If this has chain/glue inputs, add them. if (EmitNodeInfo & OPFL_Chain) Ops.push_back(InputChain); - if ((EmitNodeInfo & OPFL_GlueInput) && InputGlue.getNode() != 0) + if ((EmitNodeInfo & OPFL_GlueInput) && InputGlue.getNode() != nullptr) Ops.push_back(InputGlue); // Create the node. - SDNode *Res = 0; + SDNode *Res = nullptr; if (Opcode != OPC_MorphNodeTo) { // If this is a normal EmitNode command, just create the new node and // add the results to the RecordedNodes list. @@ -2971,17 +2997,16 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, for (unsigned i = 0, e = VTs.size(); i != e; ++i) { if (VTs[i] == MVT::Other || VTs[i] == MVT::Glue) break; RecordedNodes.push_back(std::pair(SDValue(Res, i), - (SDNode*) 0)); + nullptr)); } } else if (NodeToMatch->getOpcode() != ISD::DELETED_NODE) { - Res = MorphNode(NodeToMatch, TargetOpc, VTList, Ops.data(), Ops.size(), - EmitNodeInfo); + Res = MorphNode(NodeToMatch, TargetOpc, VTList, Ops, EmitNodeInfo); } else { // NodeToMatch was eliminated by CSE when the target changed the DAG. // We will visit the equivalent node later. DEBUG(dbgs() << "Node was eliminated by CSE\n"); - return 0; + return nullptr; } // If the node had chain/glue results, update our notion of the current @@ -3111,7 +3136,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, // FIXME: We just return here, which interacts correctly with SelectRoot // above. We should fix this to not return an SDNode* anymore. 
- return 0; + return nullptr; } } @@ -3123,7 +3148,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable, while (1) { if (MatchScopes.empty()) { CannotYetSelect(NodeToMatch); - return 0; + return nullptr; } // Restore the interpreter state back to the point where the scope was diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp index 1483fdd..4df5ede 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp @@ -27,6 +27,8 @@ #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "dag-printer" + namespace llvm { template<> struct DOTGraphTraits : public DefaultDOTGraphTraits { @@ -124,9 +126,9 @@ namespace llvm { static void addCustomGraphFeatures(SelectionDAG *G, GraphWriter &GW) { - GW.emitSimpleNode(0, "plaintext=circle", "GraphRoot"); + GW.emitSimpleNode(nullptr, "plaintext=circle", "GraphRoot"); if (G->getRoot().getNode()) - GW.emitEdge(0, -1, G->getRoot().getNode(), G->getRoot().getResNo(), + GW.emitEdge(nullptr, -1, G->getRoot().getNode(), G->getRoot().getResNo(), "color=blue,style=dashed"); } }; @@ -289,10 +291,10 @@ std::string ScheduleDAGSDNodes::getGraphNodeLabel(const SUnit *SU) const { void ScheduleDAGSDNodes::getCustomGraphFeatures(GraphWriter &GW) const { if (DAG) { // Draw a special "GraphRoot" node to indicate the root of the graph. - GW.emitSimpleNode(0, "plaintext=circle", "GraphRoot"); + GW.emitSimpleNode(nullptr, "plaintext=circle", "GraphRoot"); const SDNode *N = DAG->getRoot().getNode(); if (N && N->getNodeId() != -1) - GW.emitEdge(0, -1, &SUnits[N->getNodeId()], -1, + GW.emitEdge(nullptr, -1, &SUnits[N->getNodeId()], -1, "color=blue,style=dashed"); } } diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 5de0b03..b75d805 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -40,7 +40,7 @@ TargetLowering::TargetLowering(const TargetMachine &tm, : TargetLoweringBase(tm, tlof) {} const char *TargetLowering::getTargetNodeName(unsigned Opcode) const { - return NULL; + return nullptr; } /// Check whether a given call node is in tail position within its function. If @@ -103,12 +103,11 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy()); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); - TargetLowering:: - CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, - false, 0, getLibcallCallingConv(LC), - /*isTailCall=*/false, - doesNotReturn, isReturnValueUsed, Callee, Args, - DAG, dl); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) + .setCallee(getLibcallCallingConv(LC), RetTy, Callee, &Args, 0) + .setNoReturn(doesNotReturn).setDiscardResult(!isReturnValueUsed) + .setSExtResult(isSigned).setZExtResult(!isSigned); return LowerCallTo(CLI); } @@ -226,7 +225,7 @@ unsigned TargetLowering::getJumpTableEncoding() const { return MachineJumpTableInfo::EK_BlockAddress; // In PIC mode, if the target supports a GPRel32 directive, use it. - if (getTargetMachine().getMCAsmInfo()->getGPRel32Directive() != 0) + if (getTargetMachine().getMCAsmInfo()->getGPRel32Directive() != nullptr) return MachineJumpTableInfo::EK_GPRel32BlockAddress; // Otherwise, use a label difference. 
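The TargetLowering hunks that follow rename ComputeMaskedBits to computeKnownBits and add a BUILD_PAIR case to SimplifyDemandedBits. The bookkeeping behind both is a pair of masks per value, bits known to be zero and bits known to be one, and a pair node recombines its halves by shifting the high half's masks up. A sketch over plain 64-bit masks with 32-bit halves (LLVM's real interface is APInt-based):

    #include <cassert>
    #include <cstdint>

    // Two masks per value: bits known to be zero and bits known to be one.
    struct KnownBits {
      uint64_t Zero = 0;
      uint64_t One = 0;
    };

    // AND with a constant: every 0 bit in C forces a known-zero result bit;
    // known-one bits survive only where C has a 1.
    KnownBits knownAnd(KnownBits LHS, uint64_t C) {
      return {LHS.Zero | ~C, LHS.One & C};
    }

    // BUILD_PAIR of two 32-bit halves: the high half's masks shift up by 32,
    // mirroring the new BUILD_PAIR case in SimplifyDemandedBits.
    KnownBits knownPair(KnownBits Lo, KnownBits Hi) {
      return {(Lo.Zero & 0xffffffff) | ((Hi.Zero & 0xffffffff) << 32),
              (Lo.One & 0xffffffff) | ((Hi.One & 0xffffffff) << 32)};
    }

    int main() {
      KnownBits X;                        // nothing known yet
      KnownBits M = knownAnd(X, 0xff);    // bits 8..63 become known zero
      KnownBits P = knownPair(M, M);      // 64-bit value from two halves
      assert((P.Zero & 0xffffffff) == 0xffffff00);
      assert((P.Zero >> 32) == 0xffffff00);
      return 0;
    }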
@@ -386,7 +385,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (Depth != 0) { // If not at the root, Just compute the KnownZero/KnownOne bits to // simplify things downstream. - TLO.DAG.ComputeMaskedBits(Op, KnownZero, KnownOne, Depth); + TLO.DAG.computeKnownBits(Op, KnownZero, KnownOne, Depth); return false; } // If this is the root being simplified, allow it to have multiple uses, @@ -416,7 +415,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (ConstantSDNode *RHSC = dyn_cast(Op.getOperand(1))) { APInt LHSZero, LHSOne; // Do not increment Depth here; that can cause an infinite loop. - TLO.DAG.ComputeMaskedBits(Op.getOperand(0), LHSZero, LHSOne, Depth); + TLO.DAG.computeKnownBits(Op.getOperand(0), LHSZero, LHSOne, Depth); // If the LHS already has zeros where RHSC does, this and is dead. if ((LHSZero & NewMask) == (~RHSC->getAPIntValue() & NewMask)) return TLO.CombineTo(Op, Op.getOperand(0)); @@ -848,6 +847,31 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } break; } + case ISD::BUILD_PAIR: { + EVT HalfVT = Op.getOperand(0).getValueType(); + unsigned HalfBitWidth = HalfVT.getScalarSizeInBits(); + + APInt MaskLo = NewMask.getLoBits(HalfBitWidth).trunc(HalfBitWidth); + APInt MaskHi = NewMask.getHiBits(HalfBitWidth).trunc(HalfBitWidth); + + APInt KnownZeroLo, KnownOneLo; + APInt KnownZeroHi, KnownOneHi; + + if (SimplifyDemandedBits(Op.getOperand(0), MaskLo, KnownZeroLo, + KnownOneLo, TLO, Depth + 1)) + return true; + + if (SimplifyDemandedBits(Op.getOperand(1), MaskHi, KnownZeroHi, + KnownOneHi, TLO, Depth + 1)) + return true; + + KnownZero = KnownZeroLo.zext(BitWidth) | + KnownZeroHi.zext(BitWidth).shl(HalfBitWidth); + + KnownOne = KnownOneLo.zext(BitWidth) | + KnownOneHi.zext(BitWidth).shl(HalfBitWidth); + break; + } case ISD::ZERO_EXTEND: { unsigned OperandBitWidth = Op.getOperand(0).getValueType().getScalarType().getSizeInBits(); @@ -1040,8 +1064,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } // FALL THROUGH default: - // Just use ComputeMaskedBits to compute output bits. - TLO.DAG.ComputeMaskedBits(Op, KnownZero, KnownOne, Depth); + // Just use computeKnownBits to compute output bits. + TLO.DAG.computeKnownBits(Op, KnownZero, KnownOne, Depth); break; } @@ -1053,14 +1077,14 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return false; } -/// computeMaskedBitsForTargetNode - Determine which of the bits specified +/// computeKnownBitsForTargetNode - Determine which of the bits specified /// in Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. -void TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) const { +void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN || Op.getOpcode() == ISD::INTRINSIC_W_CHAIN || @@ -1074,6 +1098,7 @@ void TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, /// targets that want to expose additional information about sign bits to the /// DAG Combiner. 
unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, + const SelectionDAG &, unsigned Depth) const { assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN || @@ -1085,7 +1110,7 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, } /// ValueHasExactlyOneBitSet - Test if the given value is known to have exactly -/// one bit set. This differs from ComputeMaskedBits in that it doesn't need to +/// one bit set. This differs from computeKnownBits in that it doesn't need to /// determine which bit is set. /// static bool ValueHasExactlyOneBitSet(SDValue Val, const SelectionDAG &DAG) { @@ -1108,11 +1133,11 @@ static bool ValueHasExactlyOneBitSet(SDValue Val, const SelectionDAG &DAG) { // More could be done here, though the above checks are enough // to handle some common cases. - // Fall back to ComputeMaskedBits to catch other known cases. + // Fall back to computeKnownBits to catch other known cases. EVT OpVT = Val.getValueType(); unsigned BitWidth = OpVT.getScalarType().getSizeInBits(); APInt KnownZero, KnownOne; - DAG.ComputeMaskedBits(Val, KnownZero, KnownOne); + DAG.computeKnownBits(Val, KnownZero, KnownOne); return (KnownZero.countPopulation() == BitWidth - 1) && (KnownOne.countPopulation() == 1); } @@ -1381,10 +1406,14 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, EVT newVT = N0.getOperand(0).getValueType(); if (DCI.isBeforeLegalizeOps() || (isOperationLegal(ISD::SETCC, newVT) && - getCondCodeAction(Cond, newVT.getSimpleVT())==Legal)) - return DAG.getSetCC(dl, VT, N0.getOperand(0), - DAG.getConstant(C1.trunc(InSize), newVT), - Cond); + getCondCodeAction(Cond, newVT.getSimpleVT()) == Legal)) { + EVT NewSetCCVT = getSetCCResultType(*DAG.getContext(), newVT); + SDValue NewConst = DAG.getConstant(C1.trunc(InSize), newVT); + + SDValue NewSetCC = DAG.getSetCC(dl, NewSetCCVT, N0.getOperand(0), + NewConst, Cond); + return DAG.getBoolExtOrTrunc(NewSetCC, dl, VT); + } break; } default: @@ -2052,7 +2081,7 @@ const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const{ return "r"; if (ConstraintVT.isFloatingPoint()) return "f"; // works for many targets - return 0; + return nullptr; } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops @@ -2086,12 +2115,12 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op, if (Op.getOpcode() == ISD::ADD) { C = dyn_cast(Op.getOperand(1)); GA = dyn_cast(Op.getOperand(0)); - if (C == 0 || GA == 0) { + if (!C || !GA) { C = dyn_cast(Op.getOperand(0)); GA = dyn_cast(Op.getOperand(1)); } - if (C == 0 || GA == 0) - C = 0, GA = 0; + if (!C || !GA) + C = nullptr, GA = nullptr; } // If we find a valid operand, map to the TargetXXX version so that the @@ -2126,14 +2155,14 @@ std::pair TargetLowering:: getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const { if (Constraint.empty() || Constraint[0] != '{') - return std::make_pair(0u, static_cast(0)); + return std::make_pair(0u, static_cast(nullptr)); assert(*(Constraint.end()-1) == '}' && "Not a brace enclosed constraint?"); // Remove the braces from around the name. StringRef RegName(Constraint.data()+1, Constraint.size()-2); std::pair R = - std::make_pair(0u, static_cast(0)); + std::make_pair(0u, static_cast(nullptr)); // Figure out which register class contains this reg. 
const TargetRegisterInfo *RI = getTargetMachine().getRegisterInfo(); @@ -2428,7 +2457,7 @@ TargetLowering::ConstraintWeight Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. - if (CallOperandVal == NULL) + if (!CallOperandVal) return CW_Default; // Look at the constraint type. switch (*constraint) { @@ -2601,9 +2630,9 @@ SDValue TargetLowering::BuildExactSDIV(SDValue Op1, SDValue Op2, SDLoc dl, /// return a DAG expression to select that will generate the same value by /// multiplying by a magic number. See: /// -SDValue TargetLowering:: -BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, - std::vector *Created) const { +SDValue TargetLowering::BuildSDIV(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, bool IsAfterLegalization, + std::vector *Created) const { EVT VT = N->getValueType(0); SDLoc dl(N); @@ -2612,8 +2641,7 @@ BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, if (!isTypeLegal(VT)) return SDValue(); - APInt d = cast(N->getOperand(1))->getAPIntValue(); - APInt::ms magics = d.magic(); + APInt::ms magics = Divisor.magic(); // Multiply the numerator (operand 0) by the magic value // FIXME: We should support doing a MUL in a wider type @@ -2630,13 +2658,13 @@ BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, else return SDValue(); // No mulhs or equvialent // If d > 0 and m < 0, add the numerator - if (d.isStrictlyPositive() && magics.m.isNegative()) { + if (Divisor.isStrictlyPositive() && magics.m.isNegative()) { Q = DAG.getNode(ISD::ADD, dl, VT, Q, N->getOperand(0)); if (Created) Created->push_back(Q.getNode()); } // If d < 0 and m > 0, subtract the numerator. - if (d.isNegative() && magics.m.isStrictlyPositive()) { + if (Divisor.isNegative() && magics.m.isStrictlyPositive()) { Q = DAG.getNode(ISD::SUB, dl, VT, Q, N->getOperand(0)); if (Created) Created->push_back(Q.getNode()); @@ -2649,9 +2677,9 @@ BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, Created->push_back(Q.getNode()); } // Extract the sign bit and add it to the quotient - SDValue T = - DAG.getNode(ISD::SRL, dl, VT, Q, DAG.getConstant(VT.getSizeInBits()-1, - getShiftAmountTy(Q.getValueType()))); + SDValue T = DAG.getNode(ISD::SRL, dl, VT, Q, + DAG.getConstant(VT.getScalarSizeInBits() - 1, + getShiftAmountTy(Q.getValueType()))); if (Created) Created->push_back(T.getNode()); return DAG.getNode(ISD::ADD, dl, VT, Q, T); @@ -2661,9 +2689,9 @@ BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, /// return a DAG expression to select that will generate the same value by /// multiplying by a magic number. See: /// -SDValue TargetLowering:: -BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, - std::vector *Created) const { +SDValue TargetLowering::BuildUDIV(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, bool IsAfterLegalization, + std::vector *Created) const { EVT VT = N->getValueType(0); SDLoc dl(N); @@ -2674,22 +2702,21 @@ BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, // FIXME: We should use a narrower constant when the upper // bits are known to be zero. - const APInt &N1C = cast(N->getOperand(1))->getAPIntValue(); - APInt::mu magics = N1C.magicu(); + APInt::mu magics = Divisor.magicu(); SDValue Q = N->getOperand(0); // If the divisor is even, we can avoid using the expensive fixup by shifting // the divided value upfront. 
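BuildSDIV and BuildUDIV above now receive the divisor as an APInt instead of digging it out of operand 1, but the transformation is unchanged: division by a constant becomes a high multiply plus shifts. The effect can be checked in miniature for an unsigned 32-bit divide by 3, whose magic number is ceil(2^33/3) = 0xAAAAAAAB (a textbook constant; this sketch does not use LLVM's magicu() machinery):

    #include <cassert>
    #include <cstdint>

    // Unsigned divide-by-3 without a divide instruction: multiply by the
    // precomputed magic constant and keep the high bits of the product.
    uint32_t udiv3(uint32_t N) {
      return (uint64_t)N * 0xAAAAAAABu >> 33;
    }

    int main() {
      const uint32_t Tests[] = {0, 1, 2, 3, 100, 0xFFFFFFFFu};
      for (uint32_t N : Tests)
        assert(udiv3(N) == N / 3);   // exact for every 32-bit input
      return 0;
    }

The signed path in BuildSDIV adds the fixups visible above: conditionally adding or subtracting the numerator and folding the sign bit back into the quotient.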
- if (magics.a != 0 && !N1C[0]) { - unsigned Shift = N1C.countTrailingZeros(); + if (magics.a != 0 && !Divisor[0]) { + unsigned Shift = Divisor.countTrailingZeros(); Q = DAG.getNode(ISD::SRL, dl, VT, Q, DAG.getConstant(Shift, getShiftAmountTy(Q.getValueType()))); if (Created) Created->push_back(Q.getNode()); // Get magic number for the shifted divisor. - magics = N1C.lshr(Shift).magicu(Shift); + magics = Divisor.lshr(Shift).magicu(Shift); assert(magics.a == 0 && "Should use cheap fixup now"); } @@ -2708,7 +2735,7 @@ BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, Created->push_back(Q.getNode()); if (magics.a == 0) { - assert(magics.s < N1C.getBitWidth() && + assert(magics.s < Divisor.getBitWidth() && "We shouldn't generate an undefined shift!"); return DAG.getNode(ISD::SRL, dl, VT, Q, DAG.getConstant(magics.s, getShiftAmountTy(Q.getValueType()))); @@ -2738,3 +2765,110 @@ verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const { return false; } + +//===----------------------------------------------------------------------===// +// Legalization Utilities +//===----------------------------------------------------------------------===// + +bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT, + SelectionDAG &DAG, SDValue LL, SDValue LH, + SDValue RL, SDValue RH) const { + EVT VT = N->getValueType(0); + SDLoc dl(N); + + bool HasMULHS = isOperationLegalOrCustom(ISD::MULHS, HiLoVT); + bool HasMULHU = isOperationLegalOrCustom(ISD::MULHU, HiLoVT); + bool HasSMUL_LOHI = isOperationLegalOrCustom(ISD::SMUL_LOHI, HiLoVT); + bool HasUMUL_LOHI = isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT); + if (HasMULHU || HasMULHS || HasUMUL_LOHI || HasSMUL_LOHI) { + unsigned OuterBitSize = VT.getSizeInBits(); + unsigned InnerBitSize = HiLoVT.getSizeInBits(); + unsigned LHSSB = DAG.ComputeNumSignBits(N->getOperand(0)); + unsigned RHSSB = DAG.ComputeNumSignBits(N->getOperand(1)); + + // LL, LH, RL, and RH must be either all NULL or all set to a value. + assert((LL.getNode() && LH.getNode() && RL.getNode() && RH.getNode()) || + (!LL.getNode() && !LH.getNode() && !RL.getNode() && !RH.getNode())); + + if (!LL.getNode() && !RL.getNode() && + isOperationLegalOrCustom(ISD::TRUNCATE, HiLoVT)) { + LL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, N->getOperand(0)); + RL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, N->getOperand(1)); + } + + if (!LL.getNode()) + return false; + + APInt HighMask = APInt::getHighBitsSet(OuterBitSize, InnerBitSize); + if (DAG.MaskedValueIsZero(N->getOperand(0), HighMask) && + DAG.MaskedValueIsZero(N->getOperand(1), HighMask)) { + // The inputs are both zero-extended. + if (HasUMUL_LOHI) { + // We can emit a umul_lohi. + Lo = DAG.getNode(ISD::UMUL_LOHI, dl, + DAG.getVTList(HiLoVT, HiLoVT), LL, RL); + Hi = SDValue(Lo.getNode(), 1); + return true; + } + if (HasMULHU) { + // We can emit a mulhu+mul. + Lo = DAG.getNode(ISD::MUL, dl, HiLoVT, LL, RL); + Hi = DAG.getNode(ISD::MULHU, dl, HiLoVT, LL, RL); + return true; + } + } + if (LHSSB > InnerBitSize && RHSSB > InnerBitSize) { + // The input values are both sign-extended. + if (HasSMUL_LOHI) { + // We can emit a smul_lohi. + Lo = DAG.getNode(ISD::SMUL_LOHI, dl, + DAG.getVTList(HiLoVT, HiLoVT), LL, RL); + Hi = SDValue(Lo.getNode(), 1); + return true; + } + if (HasMULHS) { + // We can emit a mulhs+mul. 
+ Lo = DAG.getNode(ISD::MUL, dl, HiLoVT, LL, RL); + Hi = DAG.getNode(ISD::MULHS, dl, HiLoVT, LL, RL); + return true; + } + } + + if (!LH.getNode() && !RH.getNode() && + isOperationLegalOrCustom(ISD::SRL, VT) && + isOperationLegalOrCustom(ISD::TRUNCATE, HiLoVT)) { + unsigned ShiftAmt = VT.getSizeInBits() - HiLoVT.getSizeInBits(); + SDValue Shift = DAG.getConstant(ShiftAmt, getShiftAmountTy(VT)); + LH = DAG.getNode(ISD::SRL, dl, VT, N->getOperand(0), Shift); + LH = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, LH); + RH = DAG.getNode(ISD::SRL, dl, VT, N->getOperand(1), Shift); + RH = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, RH); + } + + if (!LH.getNode()) + return false; + + if (HasUMUL_LOHI) { + // Lo,Hi = umul LHS, RHS. + SDValue UMulLOHI = DAG.getNode(ISD::UMUL_LOHI, dl, + DAG.getVTList(HiLoVT, HiLoVT), LL, RL); + Lo = UMulLOHI; + Hi = UMulLOHI.getValue(1); + RH = DAG.getNode(ISD::MUL, dl, HiLoVT, LL, RH); + LH = DAG.getNode(ISD::MUL, dl, HiLoVT, LH, RL); + Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, RH); + Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, LH); + return true; + } + if (HasMULHU) { + Lo = DAG.getNode(ISD::MUL, dl, HiLoVT, LL, RL); + Hi = DAG.getNode(ISD::MULHU, dl, HiLoVT, LL, RL); + RH = DAG.getNode(ISD::MUL, dl, HiLoVT, LL, RH); + LH = DAG.getNode(ISD::MUL, dl, HiLoVT, LH, RL); + Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, RH); + Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, LH); + return true; + } + } + return false; +} diff --git a/lib/CodeGen/ShadowStackGC.cpp b/lib/CodeGen/ShadowStackGC.cpp index adb3ef9..f7c64da 100644 --- a/lib/CodeGen/ShadowStackGC.cpp +++ b/lib/CodeGen/ShadowStackGC.cpp @@ -25,7 +25,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "shadowstackgc" #include "llvm/CodeGen/GCs.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/GCStrategy.h" @@ -36,6 +35,8 @@ using namespace llvm; +#define DEBUG_TYPE "shadowstackgc" + namespace { class ShadowStackGC : public GCStrategy { @@ -101,7 +102,7 @@ namespace { IRBuilder<> *Next() { switch (State) { default: - return 0; + return nullptr; case 0: StateBB = F.begin(); @@ -137,7 +138,7 @@ namespace { Calls.push_back(CI); if (Calls.empty()) - return 0; + return nullptr; // Create a cleanup block. 
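The expandMUL utility added above builds a double-width multiply out of half-width mul/mulhu plus cross terms. The same decomposition in plain C++, computing the 128-bit product of two 64-bit operands from 32-bit halves; a sketch of the arithmetic only, not the DAG construction:

    #include <cassert>
    #include <cstdint>

    // 64x64 -> 128 multiply from 32-bit halves, mirroring expandMUL:
    // low product LL*RL, plus cross terms folded into the high half.
    void mul64x64(uint64_t A, uint64_t B, uint64_t &Hi, uint64_t &Lo) {
      uint64_t AL = A & 0xffffffff, AH = A >> 32;
      uint64_t BL = B & 0xffffffff, BH = B >> 32;

      uint64_t LL = AL * BL;                     // low 64 bits of AL*BL
      uint64_t Cross1 = AH * BL + (LL >> 32);    // cannot overflow
      uint64_t Cross2 = AL * BH + (Cross1 & 0xffffffff);

      Lo = (Cross2 << 32) | (LL & 0xffffffff);
      Hi = AH * BH + (Cross1 >> 32) + (Cross2 >> 32);
    }

    int main() {
      uint64_t Hi, Lo;
      mul64x64(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, Hi, Lo);
      // (2^64 - 1)^2 = 2^128 - 2^65 + 1
      assert(Lo == 1 && Hi == 0xFFFFFFFFFFFFFFFE);
      return 0;
    }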
LLVMContext &C = F.getContext(); @@ -194,7 +195,7 @@ namespace { void llvm::linkShadowStackGC() { } -ShadowStackGC::ShadowStackGC() : Head(0), StackEntryTy(0) { +ShadowStackGC::ShadowStackGC() : Head(nullptr), StackEntryTy(nullptr) { InitRoots = true; CustomRoots = true; } @@ -390,8 +391,8 @@ bool ShadowStackGC::performCustomLowering(Function &F) { BasicBlock::iterator IP = F.getEntryBlock().begin(); IRBuilder<> AtEntry(IP->getParent(), IP); - Instruction *StackEntry = AtEntry.CreateAlloca(ConcreteStackEntryTy, 0, - "gc_frame"); + Instruction *StackEntry = AtEntry.CreateAlloca(ConcreteStackEntryTy, nullptr, + "gc_frame"); while (isa(IP)) ++IP; AtEntry.SetInsertPoint(IP->getParent(), IP); diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index dc7ca2b..d2f3955 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sjljehprepare" #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" @@ -38,6 +37,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "sjljehprepare" + STATISTIC(NumInvokes, "Number of invokes replaced"); STATISTIC(NumSpilled, "Number of registers live across unwind edges"); @@ -100,10 +101,10 @@ bool SjLjEHPrepare::doInitialization(Module &M) { NULL); RegisterFn = M.getOrInsertFunction( "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()), - PointerType::getUnqual(FunctionContextTy), (Type *)0); + PointerType::getUnqual(FunctionContextTy), (Type *)nullptr); UnregisterFn = M.getOrInsertFunction( "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()), - PointerType::getUnqual(FunctionContextTy), (Type *)0); + PointerType::getUnqual(FunctionContextTy), (Type *)nullptr); FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress); StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave); StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore); @@ -111,7 +112,7 @@ bool SjLjEHPrepare::doInitialization(Module &M) { LSDAAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_lsda); CallSiteFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_callsite); FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext); - PersonalityFn = 0; + PersonalityFn = nullptr; return true; } @@ -192,7 +193,7 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F, const TargetLowering *TLI = TM->getTargetLowering(); unsigned Align = TLI->getDataLayout()->getPrefTypeAlignment(FunctionContextTy); - FuncCtx = new AllocaInst(FunctionContextTy, 0, Align, "fn_context", + FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context", EntryBB->begin()); // Fill in the function context structure. 
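The SjLjEHPrepare hunk above writes the getOrInsertFunction terminator as (Type *)nullptr rather than a bare literal because the callee is C-variadic: the ellipsis erases parameter types, so the sentinel must already be pointer-typed and pointer-sized when it is passed. A small illustration of the pitfall with a hypothetical variadic helper (not the LLVM API):

    #include <cstdarg>
    #include <cstdio>

    // Counts pointer arguments up to a null terminator, the way a variadic
    // "build me a function type" helper would walk its type list.
    int countPtrs(const char *First, ...) {
      int N = First ? 1 : 0;
      va_list AP;
      va_start(AP, First);
      while (const char *P = va_arg(AP, const char *))
        ++N;
      va_end(AP);
      return N;
    }

    int main() {
      // The cast guarantees a pointer-typed, pointer-sized sentinel; an
      // uncast 0 would be passed as int, which is undefined behavior here.
      int N = countPtrs("a", "b", (const char *)nullptr);
      std::printf("%d\n", N);  // prints 2
      return 0;
    }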
diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp index a6c6261..d46621d 100644 --- a/lib/CodeGen/SlotIndexes.cpp +++ b/lib/CodeGen/SlotIndexes.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "slotindexes" - #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunction.h" @@ -18,6 +16,8 @@ using namespace llvm; +#define DEBUG_TYPE "slotindexes" + char SlotIndexes::ID = 0; INITIALIZE_PASS(SlotIndexes, "slotindexes", "Slot index numbering", false, false) @@ -66,7 +66,7 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) { MBBRanges.resize(mf->getNumBlockIDs()); idx2MBBMap.reserve(mf->size()); - indexList.push_back(createEntry(0, index)); + indexList.push_back(createEntry(nullptr, index)); // Iterate over the function. for (MachineFunction::iterator mbbItr = mf->begin(), mbbEnd = mf->end(); @@ -91,7 +91,7 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) { } // We insert one blank instructions between basic blocks. - indexList.push_back(createEntry(0, index += SlotIndex::InstrDist)); + indexList.push_back(createEntry(nullptr, index += SlotIndex::InstrDist)); MBBRanges[mbb->getNumber()].first = blockStartIndex; MBBRanges[mbb->getNumber()].second = SlotIndex(&indexList.back(), @@ -182,7 +182,7 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB, "Decremented past the beginning of region to repair."); MachineInstr *SlotMI = ListI->getInstr(); - MachineInstr *MI = (MBBI != MBB->end() && !pastStart) ? MBBI : 0; + MachineInstr *MI = (MBBI != MBB->end() && !pastStart) ? MBBI : nullptr; bool MBBIAtBegin = MBBI == Begin && (!includeStart || pastStart); if (SlotMI == MI && !MBBIAtBegin) { @@ -219,7 +219,7 @@ void SlotIndexes::dump() const { itr != indexList.end(); ++itr) { dbgs() << itr->getIndex() << " "; - if (itr->getInstr() != 0) { + if (itr->getInstr()) { dbgs() << *itr->getInstr(); } else { dbgs() << "\n"; diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp index 5f73469..24e94d1 100644 --- a/lib/CodeGen/SpillPlacement.cpp +++ b/lib/CodeGen/SpillPlacement.cpp @@ -27,7 +27,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "spillplacement" #include "SpillPlacement.h" #include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/EdgeBundles.h" @@ -41,6 +40,8 @@ using namespace llvm; +#define DEBUG_TYPE "spillplacement" + char SpillPlacement::ID = 0; INITIALIZE_PASS_BEGIN(SpillPlacement, "spill-code-placement", "Spill Code Placement Analysis", true, true) @@ -59,9 +60,26 @@ void SpillPlacement::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } +namespace { +static BlockFrequency Threshold; +} + /// Decision threshold. A node gets the output value 0 if the weighted sum of /// its inputs falls in the open interval (-Threshold;Threshold). -static const BlockFrequency Threshold = 2; +static BlockFrequency getThreshold() { return Threshold; } + +/// \brief Set the threshold for a given entry frequency. +/// +/// Set the threshold relative to \c Entry. Since the threshold is used as a +/// bound on the open interval (-Threshold;Threshold), 1 is the minimum +/// threshold. +static void setThreshold(const BlockFrequency &Entry) { + // Apparently 2 is a good threshold when Entry==2^14, but we need to scale + // it. Divide by 2^13, rounding as appropriate. 
+ uint64_t Freq = Entry.getFrequency(); + uint64_t Scaled = (Freq >> 13) + bool(Freq & (1 << 12)); + Threshold = std::max(UINT64_C(1), Scaled); +} /// Node - Each edge bundle corresponds to a Hopfield node. /// @@ -110,7 +128,7 @@ struct SpillPlacement::Node { // the CFG. void clear() { BiasN = BiasP = Value = 0; - SumLinkWeights = Threshold; + SumLinkWeights = getThreshold(); Links.clear(); } @@ -168,9 +186,9 @@ struct SpillPlacement::Node { // 2. It helps tame rounding errors when the links nominally sum to 0. // bool Before = preferReg(); - if (SumN >= SumP + Threshold) + if (SumN >= SumP + getThreshold()) Value = -1; - else if (SumP >= SumN + Threshold) + else if (SumP >= SumN + getThreshold()) Value = 1; else Value = 0; @@ -189,6 +207,7 @@ bool SpillPlacement::runOnMachineFunction(MachineFunction &mf) { // Compute total ingoing and outgoing block frequencies for all bundles. BlockFrequencies.resize(mf.getNumBlockIDs()); MBFI = &getAnalysis(); + setThreshold(MBFI->getEntryFreq()); for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) { unsigned Num = I->getNumber(); BlockFrequencies[Num] = MBFI->getBlockFreq(I); @@ -200,7 +219,7 @@ bool SpillPlacement::runOnMachineFunction(MachineFunction &mf) { void SpillPlacement::releaseMemory() { delete[] nodes; - nodes = 0; + nodes = nullptr; } /// activate - mark node n as active if it wasn't already. @@ -375,6 +394,6 @@ SpillPlacement::finish() { ActiveNodes->reset(n); Perfect = false; } - ActiveNodes = 0; + ActiveNodes = nullptr; return Perfect; } diff --git a/lib/CodeGen/SpillPlacement.h b/lib/CodeGen/SpillPlacement.h index a88d7ac..43fc7f5 100644 --- a/lib/CodeGen/SpillPlacement.h +++ b/lib/CodeGen/SpillPlacement.h @@ -65,7 +65,7 @@ class SpillPlacement : public MachineFunctionPass { public: static char ID; // Pass identification, replacement for typeid. 
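setThreshold above rescales SpillPlacement's old constant threshold of 2, which was tuned for an entry frequency of 2^14, by dividing the entry frequency by 2^13 with round-to-nearest (bit 12 is the rounding bit) and clamping at 1. The arithmetic in isolation, as a free function for checking (the patch itself stores the result in a static BlockFrequency):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Scale entry frequency to a spill-placement threshold: divide by 2^13,
    // round to nearest, clamp to at least 1.
    uint64_t scaledThreshold(uint64_t Freq) {
      uint64_t Scaled = (Freq >> 13) + ((Freq & (1u << 12)) != 0);
      return std::max(UINT64_C(1), Scaled);
    }

    int main() {
      assert(scaledThreshold(1) == 1);                      // clamped minimum
      assert(scaledThreshold(1 << 14) == 2);                // the old tuning point
      assert(scaledThreshold((1 << 13) + (1 << 12)) == 2);  // 1.5 rounds up
      return 0;
    }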
- SpillPlacement() : MachineFunctionPass(ID), nodes(0) {} + SpillPlacement() : MachineFunctionPass(ID), nodes(nullptr) {} ~SpillPlacement() { releaseMemory(); } /// BorderConstraint - A basic block has separate constraints for entry and diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp index 094641c..0649448 100644 --- a/lib/CodeGen/Spiller.cpp +++ b/lib/CodeGen/Spiller.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "spiller" - #include "Spiller.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveRangeEdit.h" @@ -28,6 +26,8 @@ using namespace llvm; +#define DEBUG_TYPE "spiller" + namespace { enum SpillerName { trivial, inline_ }; } diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp index 16fe979..7d4f568 100644 --- a/lib/CodeGen/SplitKit.cpp +++ b/lib/CodeGen/SplitKit.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "regalloc" #include "SplitKit.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" @@ -29,6 +28,8 @@ using namespace llvm; +#define DEBUG_TYPE "regalloc" + STATISTIC(NumFinished, "Number of splits finished"); STATISTIC(NumSimple, "Number of splits that were simple"); STATISTIC(NumCopies, "Number of copies inserted for splitting"); @@ -47,14 +48,14 @@ SplitAnalysis::SplitAnalysis(const VirtRegMap &vrm, LIS(lis), Loops(mli), TII(*MF.getTarget().getInstrInfo()), - CurLI(0), + CurLI(nullptr), LastSplitPoint(MF.getNumBlockIDs()) {} void SplitAnalysis::clear() { UseSlots.clear(); UseBlocks.clear(); ThroughBlocks.clear(); - CurLI = 0; + CurLI = nullptr; DidRepairRange = false; } @@ -331,7 +332,7 @@ SplitEditor::SplitEditor(SplitAnalysis &sa, TII(*vrm.getMachineFunction().getTarget().getInstrInfo()), TRI(*vrm.getMachineFunction().getTarget().getRegisterInfo()), MBFI(mbfi), - Edit(0), + Edit(nullptr), OpenIdx(0), SpillMode(SM_Partition), RegAssign(Allocator) @@ -353,7 +354,7 @@ void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) { // We don't need an AliasAnalysis since we will only be performing // cheap-as-a-copy remats anyway. - Edit->anyRematerializable(0); + Edit->anyRematerializable(nullptr); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -423,7 +424,7 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) { LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx)); LI->addSegment(LiveInterval::Segment(Def, Def.getDeadSlot(), VNI)); // Mark as complex mapped, forced. - VFP = ValueForcePair(0, true); + VFP = ValueForcePair(nullptr, true); } VNInfo *SplitEditor::defFromParent(unsigned RegIdx, @@ -431,7 +432,7 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx, SlotIndex UseIdx, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { - MachineInstr *CopyMI = 0; + MachineInstr *CopyMI = nullptr; SlotIndex Def; LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx)); @@ -922,7 +923,7 @@ bool SplitEditor::transferValues() { else { // Live-through, and we don't know the value. LRC.addLiveInBlock(LR, MDT[MBB]); - LRC.setLiveOutValue(MBB, 0); + LRC.setLiveOutValue(MBB, nullptr); } } BlockStart = BlockEnd; diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h index f029c73..7048ee3 100644 --- a/lib/CodeGen/SplitKit.h +++ b/lib/CodeGen/SplitKit.h @@ -377,7 +377,7 @@ public: SlotIndex enterIntvAfter(SlotIndex Idx); /// enterIntvAtEnd - Enter the open interval at the end of MBB. 
-  /// Use the open interval from he inserted copy to the MBB end.
+  /// Use the open interval from the inserted copy to the MBB end.
   /// Return the beginning of the new live range.
   SlotIndex enterIntvAtEnd(MachineBasicBlock &MBB);
 
@@ -417,7 +417,7 @@ public:
   /// @param LRMap When not null, this vector will map each live range in Edit
   ///              back to the indices returned by openIntv.
   ///              There may be extra indices created by dead code elimination.
-  void finish(SmallVectorImpl<unsigned> *LRMap = 0);
+  void finish(SmallVectorImpl<unsigned> *LRMap = nullptr);
 
   /// dump - print the current interval maping to dbgs().
   void dump() const;
diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
index 7b1de85..370430c 100644
--- a/lib/CodeGen/StackColoring.cpp
+++ b/lib/CodeGen/StackColoring.cpp
@@ -21,7 +21,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "stackcoloring"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DepthFirstIterator.h"
@@ -58,6 +57,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "stackcoloring"
+
 static cl::opt<bool>
 DisableColoring("no-stack-coloring",
         cl::init(false), cl::Hidden,
@@ -193,12 +194,11 @@ void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const {
 }
 
 void StackColoring::dump() const {
-  for (df_iterator<MachineFunction*> FI = df_begin(MF), FE = df_end(MF);
-       FI != FE; ++FI) {
-    DEBUG(dbgs()<<"Inspecting block #"<<BasicBlocks.lookup(*FI)<<
-          " ["<<FI->getName()<<"]\n");
+  for (MachineBasicBlock *MBB : depth_first(MF)) {
+    DEBUG(dbgs() << "Inspecting block #" << BasicBlocks.lookup(MBB) << " ["
+                 << MBB->getName() << "]\n");
 
-    LivenessMap::const_iterator BI = BlockLiveness.find(*FI);
+    LivenessMap::const_iterator BI = BlockLiveness.find(MBB);
     assert(BI != BlockLiveness.end() && "Block not found");
     const BlockLifetimeInfo &BlockInfo = BI->second;
 
@@ -231,20 +231,19 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
   // NOTE: We use the a reverse-post-order iteration to ensure that we obtain a
   // deterministic numbering, and because we'll need a post-order iteration
   // later for solving the liveness dataflow problem.
-  for (df_iterator<MachineFunction*> FI = df_begin(MF), FE = df_end(MF);
-       FI != FE; ++FI) {
+  for (MachineBasicBlock *MBB : depth_first(MF)) {
 
     // Assign a serial number to this basic block.
-    BasicBlocks[*FI] = BasicBlockNumbering.size();
-    BasicBlockNumbering.push_back(*FI);
+    BasicBlocks[MBB] = BasicBlockNumbering.size();
+    BasicBlockNumbering.push_back(MBB);
 
     // Keep a reference to avoid repeated lookups.
-    BlockLifetimeInfo &BlockInfo = BlockLiveness[*FI];
+    BlockLifetimeInfo &BlockInfo = BlockLiveness[MBB];
 
     BlockInfo.Begin.resize(NumSlot);
     BlockInfo.End.resize(NumSlot);
 
-    for (MachineInstr &MI : **FI) {
+    for (MachineInstr &MI : *MBB) {
       if (MI.getOpcode() != TargetOpcode::LIFETIME_START &&
           MI.getOpcode() != TargetOpcode::LIFETIME_END)
         continue;
@@ -511,11 +510,6 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
 
       // Update the MachineMemOperand to use the new alloca.
       for (MachineMemOperand *MMO : I.memoperands()) {
-        const Value *V = MMO->getValue();
-
-        if (!V)
-          continue;
-
         // FIXME: In order to enable the use of TBAA when using AA in CodeGen,
         // we'll also need to update the TBAA nodes in MMOs with values
         // derived from the merged allocas. When doing this, we'll need to use
@@ -525,10 +519,10 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
         // We've replaced IR-level uses of the remapped allocas, so we only
         // need to replace direct uses here.
-        if (!isa<AllocaInst>(V))
+        const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(MMO->getValue());
+        if (!AI)
           continue;
 
-        const AllocaInst *AI= cast<AllocaInst>(V);
         if (!Allocas.count(AI))
           continue;
 
diff --git a/lib/CodeGen/StackMapLivenessAnalysis.cpp b/lib/CodeGen/StackMapLivenessAnalysis.cpp
index a374417..4dd87dd 100644
--- a/lib/CodeGen/StackMapLivenessAnalysis.cpp
+++ b/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -13,7 +13,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "stackmaps"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -26,6 +25,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "stackmaps"
+
 namespace llvm {
 cl::opt<bool> EnableStackMapLiveness("enable-stackmap-liveness",
   cl::Hidden, cl::desc("Enable StackMap Liveness Analysis Pass"));
@@ -99,7 +100,7 @@ bool StackMapLiveness::calculateLiveness() {
         HasStackMap = true;
         ++NumStackMaps;
       }
-      DEBUG(dbgs() << "   " << *I << "   " << LiveRegs);
+      DEBUG(dbgs() << "   " << LiveRegs << "   " << *I);
       LiveRegs.stepBackward(*I);
     }
     ++NumBBsVisited;
diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp
index a6522dc..1473fc1 100644
--- a/lib/CodeGen/StackMaps.cpp
+++ b/lib/CodeGen/StackMaps.cpp
@@ -7,8 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "stackmaps"
-
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -20,6 +18,7 @@
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
@@ -29,6 +28,13 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "stackmaps"
+
+static cl::opt<int> StackMapVersion("stackmap-version", cl::init(1),
+  cl::desc("Specify the stackmap encoding version (default = 1)"));
+
+const char *StackMaps::WSMP = "Stack Maps: ";
+
 PatchPointOpers::PatchPointOpers(const MachineInstr *MI)
   : MI(MI),
     HasDef(MI->getOperand(0).isReg() && MI->getOperand(0).isDef() &&
@@ -64,6 +70,11 @@ unsigned PatchPointOpers::getNextScratchIdx(unsigned StartIdx) const {
   return ScratchIdx;
 }
 
+StackMaps::StackMaps(AsmPrinter &AP) : AP(AP) {
+  if (StackMapVersion != 1)
+    llvm_unreachable("Unsupported stackmap version!");
+}
+
 MachineInstr::const_mop_iterator
 StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI,
                         MachineInstr::const_mop_iterator MOE,
@@ -209,7 +220,8 @@ void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID,
     if (I->LocType == Location::Constant &&
         ((I->Offset + (int64_t(1)<<31)) >> 32) != 0) {
       I->LocType = Location::ConstantIndex;
-      I->Offset = ConstPool.getConstantIndex(I->Offset);
+      auto Result = ConstPool.insert(std::make_pair(I->Offset, I->Offset));
+      I->Offset = Result.first - ConstPool.begin();
     }
   }
 
@@ -259,7 +271,7 @@ void StackMaps::recordPatchPoint(const MachineInstr &MI) {
 #endif
 }
 
-/// serializeToStackMapSection conceptually populates the following fields:
+/// Emit the stackmap header.
 ///
 /// Header {
 ///   uint8 : Stack Map Version (currently 1)
@@ -269,11 +281,54 @@
 /// uint32 : NumFunctions
 /// uint32 : NumConstants
 /// uint32 : NumRecords
+void StackMaps::emitStackmapHeader(MCStreamer &OS) {
+  // Header.
+  OS.EmitIntValue(StackMapVersion, 1); // Version.
+  OS.EmitIntValue(0, 1); // Reserved.
+  OS.EmitIntValue(0, 2); // Reserved.
+
+  // Num functions.
+  DEBUG(dbgs() << WSMP << "#functions = " << FnStackSize.size() << '\n');
+  OS.EmitIntValue(FnStackSize.size(), 4);
+  // Num constants.
+  DEBUG(dbgs() << WSMP << "#constants = " << ConstPool.size() << '\n');
+  OS.EmitIntValue(ConstPool.size(), 4);
+  // Num callsites.
+  DEBUG(dbgs() << WSMP << "#callsites = " << CSInfos.size() << '\n');
+  OS.EmitIntValue(CSInfos.size(), 4);
+}
+
+/// Emit the function frame record for each function.
+///
 /// StkSizeRecord[NumFunctions] {
 ///   uint64 : Function Address
 ///   uint64 : Stack Size
 /// }
+void StackMaps::emitFunctionFrameRecords(MCStreamer &OS) {
+  // Function Frame records.
+  DEBUG(dbgs() << WSMP << "functions:\n");
+  for (auto const &FR : FnStackSize) {
+    DEBUG(dbgs() << WSMP << "function addr: " << FR.first
+                 << " frame size: " << FR.second);
+    OS.EmitSymbolValue(FR.first, 8);
+    OS.EmitIntValue(FR.second, 8);
+  }
+}
+
+/// Emit the constant pool.
+///
 /// int64 : Constants[NumConstants]
+void StackMaps::emitConstantPoolEntries(MCStreamer &OS) {
+  // Constant pool entries.
+  DEBUG(dbgs() << WSMP << "constants:\n");
+  for (auto ConstEntry : ConstPool) {
+    DEBUG(dbgs() << WSMP << ConstEntry.second << '\n');
+    OS.EmitIntValue(ConstEntry.second, 8);
+  }
+}
+
+/// Emit the callsite info for each callsite.
+///
 /// StkMapRecord[NumRecords] {
 ///   uint64 : PatchPoint ID
 ///   uint32 : Instruction Offset
@@ -301,95 +356,43 @@ void StackMaps::recordPatchPoint(const MachineInstr &MI) {
 ///   0x3, Indirect, [Reg + Offset] (spilled value)
 ///   0x4, Constant, Offset (small constant)
 ///   0x5, ConstIndex, Constants[Offset] (large constant)
-///
-void StackMaps::serializeToStackMapSection() {
-  // Bail out if there's no stack map data.
-  if (CSInfos.empty())
-    return;
-
-  MCContext &OutContext = AP.OutStreamer.getContext();
-  const TargetRegisterInfo *TRI = AP.TM.getRegisterInfo();
-
-  // Create the section.
-  const MCSection *StackMapSection =
-    OutContext.getObjectFileInfo()->getStackMapSection();
-  AP.OutStreamer.SwitchSection(StackMapSection);
-
-  // Emit a dummy symbol to force section inclusion.
-  AP.OutStreamer.EmitLabel(
-    OutContext.GetOrCreateSymbol(Twine("__LLVM_StackMaps")));
-
-  // Serialize data.
-  const char *WSMP = "Stack Maps: ";
-  (void)WSMP;
-
-  DEBUG(dbgs() << "********** Stack Map Output **********\n");
-
-  // Header.
-  AP.OutStreamer.EmitIntValue(1, 1); // Version.
-  AP.OutStreamer.EmitIntValue(0, 1); // Reserved.
-  AP.OutStreamer.EmitIntValue(0, 2); // Reserved.
-
-  // Num functions.
-  DEBUG(dbgs() << WSMP << "#functions = " << FnStackSize.size() << '\n');
-  AP.OutStreamer.EmitIntValue(FnStackSize.size(), 4);
-  // Num constants.
-  DEBUG(dbgs() << WSMP << "#constants = " << ConstPool.getNumConstants()
-               << '\n');
-  AP.OutStreamer.EmitIntValue(ConstPool.getNumConstants(), 4);
-  // Num callsites.
-  DEBUG(dbgs() << WSMP << "#callsites = " << CSInfos.size() << '\n');
-  AP.OutStreamer.EmitIntValue(CSInfos.size(), 4);
-
-  // Function stack size entries.
-  for (FnStackSizeMap::iterator I = FnStackSize.begin(), E = FnStackSize.end();
-       I != E; ++I) {
-    AP.OutStreamer.EmitSymbolValue(I->first, 8);
-    AP.OutStreamer.EmitIntValue(I->second, 8);
-  }
-
-  // Constant pool entries.
-  for (unsigned i = 0; i < ConstPool.getNumConstants(); ++i)
-    AP.OutStreamer.EmitIntValue(ConstPool.getConstant(i), 8);
-
+void StackMaps::emitCallsiteEntries(MCStreamer &OS,
+                                    const TargetRegisterInfo *TRI) {
   // Callsite entries.
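Taken together, emitStackmapHeader(), emitFunctionFrameRecords() and emitConstantPoolEntries() produce the fixed-size prefix of the section described by the doc comments above. A hedged sketch of what a consumer of that version-1 prefix might look like; the struct and function names here are hypothetical and not LLVM API:

    #include <cstdint>
    #include <cstring>

    // Byte-for-byte image of the version-1 prefix emitted above:
    // 1-byte version, 3 reserved bytes, then three 4-byte counts.
    struct StackMapPrefix {
      uint8_t  Version;      // currently 1
      uint8_t  Reserved8;    // expected to be 0
      uint16_t Reserved16;   // expected to be 0
      uint32_t NumFunctions; // StkSizeRecord count
      uint32_t NumConstants; // int64 constant pool count
      uint32_t NumRecords;   // StkMapRecord (callsite) count
    };

    // Assumes the host byte order matches the byte order the section
    // was emitted with.
    static StackMapPrefix readPrefix(const uint8_t *SectionData) {
      StackMapPrefix P;
      std::memcpy(&P, SectionData, sizeof(P));
      return P;
    }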
-  for (CallsiteInfoList::const_iterator CSII = CSInfos.begin(),
-         CSIE = CSInfos.end(); CSII != CSIE; ++CSII) {
-    uint64_t CallsiteID = CSII->ID;
-    const LocationVec &CSLocs = CSII->Locations;
-    const LiveOutVec &LiveOuts = CSII->LiveOuts;
+  DEBUG(dbgs() << WSMP << "callsites:\n");
+  for (const auto &CSI : CSInfos) {
+    const LocationVec &CSLocs = CSI.Locations;
+    const LiveOutVec &LiveOuts = CSI.LiveOuts;
 
-    DEBUG(dbgs() << WSMP << "callsite " << CallsiteID << "\n");
+    DEBUG(dbgs() << WSMP << "callsite " << CSI.ID << "\n");
 
     // Verify stack map entry. It's better to communicate a problem to the
     // runtime than crash in case of in-process compilation. Currently, we do
     // simple overflow checks, but we may eventually communicate other
     // compilation errors this way.
     if (CSLocs.size() > UINT16_MAX || LiveOuts.size() > UINT16_MAX) {
-      AP.OutStreamer.EmitIntValue(UINT64_MAX, 8); // Invalid ID.
-      AP.OutStreamer.EmitValue(CSII->CSOffsetExpr, 4);
-      AP.OutStreamer.EmitIntValue(0, 2); // Reserved.
-      AP.OutStreamer.EmitIntValue(0, 2); // 0 locations.
-      AP.OutStreamer.EmitIntValue(0, 2); // padding.
-      AP.OutStreamer.EmitIntValue(0, 2); // 0 live-out registers.
-      AP.OutStreamer.EmitIntValue(0, 4); // padding.
+      OS.EmitIntValue(UINT64_MAX, 8); // Invalid ID.
+      OS.EmitValue(CSI.CSOffsetExpr, 4);
+      OS.EmitIntValue(0, 2); // Reserved.
+      OS.EmitIntValue(0, 2); // 0 locations.
+      OS.EmitIntValue(0, 2); // padding.
+      OS.EmitIntValue(0, 2); // 0 live-out registers.
+      OS.EmitIntValue(0, 4); // padding.
       continue;
     }
 
-    AP.OutStreamer.EmitIntValue(CallsiteID, 8);
-    AP.OutStreamer.EmitValue(CSII->CSOffsetExpr, 4);
+    OS.EmitIntValue(CSI.ID, 8);
+    OS.EmitValue(CSI.CSOffsetExpr, 4);
 
     // Reserved for flags.
-    AP.OutStreamer.EmitIntValue(0, 2);
+    OS.EmitIntValue(0, 2);
 
     DEBUG(dbgs() << WSMP << "  has " << CSLocs.size() << " locations\n");
 
-    AP.OutStreamer.EmitIntValue(CSLocs.size(), 2);
+    OS.EmitIntValue(CSLocs.size(), 2);
 
-    unsigned operIdx = 0;
-    for (LocationVec::const_iterator LocI = CSLocs.begin(), LocE = CSLocs.end();
-         LocI != LocE; ++LocI, ++operIdx) {
-      const Location &Loc = *LocI;
+    unsigned OperIdx = 0;
+    for (const auto &Loc : CSLocs) {
       unsigned RegNo = 0;
       int Offset = Loc.Offset;
       if(Loc.Reg) {
@@ -410,67 +413,97 @@ void StackMaps::serializeToStackMapSection() {
                "Missing location register");
       }
 
-      DEBUG(
-        dbgs() << WSMP << "  Loc " << operIdx << ": ";
-        switch (Loc.LocType) {
-        case Location::Unprocessed:
-          dbgs() << "<Unprocessed operand>";
-          break;
-        case Location::Register:
-          dbgs() << "Register " << TRI->getName(Loc.Reg);
-          break;
-        case Location::Direct:
-          dbgs() << "Direct " << TRI->getName(Loc.Reg);
-          if (Loc.Offset)
-            dbgs() << " + " << Loc.Offset;
-          break;
-        case Location::Indirect:
-          dbgs() << "Indirect " << TRI->getName(Loc.Reg)
-                 << " + " << Loc.Offset;
-          break;
-        case Location::Constant:
-          dbgs() << "Constant " << Loc.Offset;
-          break;
-        case Location::ConstantIndex:
-          dbgs() << "Constant Index " << Loc.Offset;
-          break;
-        }
-        dbgs() << " [encoding: .byte " << Loc.LocType
-               << ", .byte " << Loc.Size
-               << ", .short " << RegNo
-               << ", .int " << Offset << "]\n";
-      );
-
-      AP.OutStreamer.EmitIntValue(Loc.LocType, 1);
-      AP.OutStreamer.EmitIntValue(Loc.Size, 1);
-      AP.OutStreamer.EmitIntValue(RegNo, 2);
-      AP.OutStreamer.EmitIntValue(Offset, 4);
+      DEBUG(dbgs() << WSMP << "  Loc " << OperIdx << ": ";
+            switch (Loc.LocType) {
+            case Location::Unprocessed:
+              dbgs() << "<Unprocessed operand>";
+              break;
+            case Location::Register:
+              dbgs() << "Register " << TRI->getName(Loc.Reg);
+              break;
+            case Location::Direct:
+              dbgs() << "Direct " << TRI->getName(Loc.Reg);
+              if (Loc.Offset)
+                dbgs() << " + " << Loc.Offset;
+              break;
+            case Location::Indirect:
+              dbgs() << "Indirect " << TRI->getName(Loc.Reg)
+                     << " + " << Loc.Offset;
+              break;
+            case Location::Constant:
+              dbgs() << "Constant " << Loc.Offset;
+              break;
+            case Location::ConstantIndex:
+              dbgs() << "Constant Index " << Loc.Offset;
+              break;
+            }
+            dbgs() << " [encoding: .byte " << Loc.LocType
+                   << ", .byte " << Loc.Size
+                   << ", .short " << RegNo
+                   << ", .int " << Offset << "]\n";
+            );
+
+      OS.EmitIntValue(Loc.LocType, 1);
+      OS.EmitIntValue(Loc.Size, 1);
+      OS.EmitIntValue(RegNo, 2);
+      OS.EmitIntValue(Offset, 4);
+      OperIdx++;
     }
 
     DEBUG(dbgs() << WSMP << "  has " << LiveOuts.size()
-          << " live-out registers\n");
+                 << " live-out registers\n");
 
     // Num live-out registers and padding to align to 4 byte.
-    AP.OutStreamer.EmitIntValue(0, 2);
-    AP.OutStreamer.EmitIntValue(LiveOuts.size(), 2);
-
-    operIdx = 0;
-    for (LiveOutVec::const_iterator LI = LiveOuts.begin(), LE = LiveOuts.end();
-         LI != LE; ++LI, ++operIdx) {
-      DEBUG(dbgs() << WSMP << "  LO " << operIdx << ": "
-                   << TRI->getName(LI->Reg)
-                   << " [encoding: .short " << LI->RegNo
-                   << ", .byte 0, .byte " << LI->Size << "]\n");
-
-      AP.OutStreamer.EmitIntValue(LI->RegNo, 2);
-      AP.OutStreamer.EmitIntValue(0, 1);
-      AP.OutStreamer.EmitIntValue(LI->Size, 1);
+    OS.EmitIntValue(0, 2);
+    OS.EmitIntValue(LiveOuts.size(), 2);
+
+    OperIdx = 0;
+    for (const auto &LO : LiveOuts) {
+      DEBUG(dbgs() << WSMP << "  LO " << OperIdx << ": "
+                   << TRI->getName(LO.Reg)
+                   << " [encoding: .short " << LO.RegNo
+                   << ", .byte 0, .byte " << LO.Size << "]\n");
+      OS.EmitIntValue(LO.RegNo, 2);
+      OS.EmitIntValue(0, 1);
+      OS.EmitIntValue(LO.Size, 1);
     }
     // Emit alignment to 8 byte.
-    AP.OutStreamer.EmitValueToAlignment(8);
+    OS.EmitValueToAlignment(8);
   }
+}
+
+/// Serialize the stackmap data.
+void StackMaps::serializeToStackMapSection() {
+  (void) WSMP;
+  // Bail out if there's no stack map data.
+  assert((!CSInfos.empty() || (CSInfos.empty() && ConstPool.empty())) &&
+         "Expected empty constant pool too!");
+  assert((!CSInfos.empty() || (CSInfos.empty() && FnStackSize.empty())) &&
+         "Expected empty function record too!");
+  if (CSInfos.empty())
+    return;
 
-  AP.OutStreamer.AddBlankLine();
+  MCContext &OutContext = AP.OutStreamer.getContext();
+  MCStreamer &OS = AP.OutStreamer;
+  const TargetRegisterInfo *TRI = AP.TM.getRegisterInfo();
+
+  // Create the section.
+  const MCSection *StackMapSection =
+    OutContext.getObjectFileInfo()->getStackMapSection();
+  OS.SwitchSection(StackMapSection);
+
+  // Emit a dummy symbol to force section inclusion.
+  OS.EmitLabel(OutContext.GetOrCreateSymbol(Twine("__LLVM_StackMaps")));
+
+  // Serialize data.
+  DEBUG(dbgs() << "********** Stack Map Output **********\n");
+  emitStackmapHeader(OS);
+  emitFunctionFrameRecords(OS);
+  emitConstantPoolEntries(OS);
+  emitCallsiteEntries(OS, TRI);
+  OS.AddBlankLine();
+  // Clean up.
   CSInfos.clear();
+  ConstPool.clear();
 }
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index f3749e5..accfe7b 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -14,7 +14,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "stack-protector"
 #include "llvm/CodeGen/StackProtector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
@@ -37,6 +36,8 @@
 #include <cstdlib>
 using namespace llvm;
 
+#define DEBUG_TYPE "stack-protector"
+
 STATISTIC(NumFunProtected, "Number of functions protected");
 STATISTIC(NumAddrTaken, "Number of local variables that have their address"
                         " taken.");
@@ -83,18 +84,18 @@ bool StackProtector::runOnFunction(Function &Fn) {
   M = F->getParent();
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-  DT = DTWP ? &DTWP->getDomTree() : 0;
+  DT = DTWP ? &DTWP->getDomTree() : nullptr;
   TLI = TM->getTargetLowering();
 
-  if (!RequiresStackProtector())
-    return false;
-
   Attribute Attr = Fn.getAttributes().getAttribute(
       AttributeSet::FunctionIndex, "stack-protector-buffer-size");
   if (Attr.isStringAttribute() &&
       Attr.getValueAsString().getAsInteger(10, SSPBufferSize))
     return false; // Invalid integer string
 
+  if (!RequiresStackProtector())
+    return false;
+
   ++NumFunProtected;
   return InsertStackProtectors();
 }
@@ -319,7 +320,7 @@ static CallInst *FindPotentialTailCall(BasicBlock *BB, ReturnInst *RI,
     SearchCounter++;
   }
 
-  return 0;
+  return nullptr;
 }
 
 /// Insert code into the entry block that stores the __stack_chk_guard
@@ -354,7 +355,7 @@ static bool CreatePrologue(Function *F, Module *M, ReturnInst *RI,
   }
 
   IRBuilder<> B(&F->getEntryBlock().front());
-  AI = B.CreateAlloca(PtrTy, 0, "StackGuardSlot");
+  AI = B.CreateAlloca(PtrTy, nullptr, "StackGuardSlot");
   LoadInst *LI = B.CreateLoad(StackGuardVar, "StackGuard");
   B.CreateCall2(Intrinsic::getDeclaration(M, Intrinsic::stackprotector), LI,
                 AI);
@@ -372,8 +373,8 @@ bool StackProtector::InsertStackProtectors() {
   bool HasPrologue = false;
   bool SupportsSelectionDAGSP =
       EnableSelectionDAGSP && !TM->Options.EnableFastISel;
-  AllocaInst *AI = 0;       // Place on stack that stores the stack guard.
-  Value *StackGuardVar = 0; // The stack guard variable.
+  AllocaInst *AI = nullptr;       // Place on stack that stores the stack guard.
+  Value *StackGuardVar = nullptr; // The stack guard variable.
 
   for (Function::iterator I = F->begin(), E = F->end(); I != E;) {
     BasicBlock *BB = I++;
@@ -390,14 +391,14 @@ bool StackProtector::InsertStackProtectors() {
       if (SupportsSelectionDAGSP) {
         // Since we have a potential tail call, insert the special stack check
         // intrinsic.
-        Instruction *InsertionPt = 0;
+        Instruction *InsertionPt = nullptr;
         if (CallInst *CI = FindPotentialTailCall(BB, RI, TLI)) {
           InsertionPt = CI;
         } else {
           InsertionPt = RI;
           // At this point we know that BB has a return statement so it *DOES*
           // have a terminator.
-          assert(InsertionPt != 0 && "BB must have a terminator instruction at "
+          assert(InsertionPt != nullptr && "BB must have a terminator instruction at "
                  "this point.");
         }
 
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index 2717f4c..791168f 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "stackslotcoloring"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
@@ -33,6 +32,8 @@
 #include <vector>
 using namespace llvm;
 
+#define DEBUG_TYPE "stackslotcoloring"
+
 static cl::opt<bool>
 DisableSharing("no-stack-slot-sharing",
              cl::init(false), cl::Hidden,
@@ -161,13 +162,12 @@ void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) {
       for (MachineInstr::mmo_iterator MMOI = MI->memoperands_begin(),
            EE = MI->memoperands_end(); MMOI != EE; ++MMOI) {
         MachineMemOperand *MMO = *MMOI;
-        if (const Value *V = MMO->getValue()) {
-          if (const FixedStackPseudoSourceValue *FSV =
-              dyn_cast<FixedStackPseudoSourceValue>(V)) {
-            int FI = FSV->getFrameIndex();
-            if (FI >= 0)
-              SSRefs[FI].push_back(MMO);
-          }
+        if (const FixedStackPseudoSourceValue *FSV =
+            dyn_cast_or_null<FixedStackPseudoSourceValue>(
+                MMO->getPseudoValue())) {
+          int FI = FSV->getFrameIndex();
+          if (FI >= 0)
+            SSRefs[FI].push_back(MMO);
         }
       }
     }
@@ -310,7 +310,7 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
     if (NewFI == -1 || (NewFI == (int)SS))
       continue;
 
-    const Value *NewSV = PseudoSourceValue::getFixedStack(NewFI);
+    const PseudoSourceValue *NewSV = PseudoSourceValue::getFixedStack(NewFI);
     SmallVectorImpl<MachineMemOperand *> &RefMMOs = SSRefs[SS];
     for (unsigned i = 0, e = RefMMOs.size(); i != e; ++i)
       RefMMOs[i]->setValue(NewSV);
@@ -398,7 +398,7 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
     ++NumDead;
     changed = true;
 
-    if (NextMI->findRegisterUseOperandIdx(LoadReg, true, 0) != -1) {
+    if (NextMI->findRegisterUseOperandIdx(LoadReg, true, nullptr) != -1) {
       ++NumDead;
       toErase.push_back(I);
     }
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index 3b7a04c..723a629 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "tailduplication"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SetVector.h"
@@ -34,6 +33,8 @@
 #include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "tailduplication"
+
 STATISTIC(NumTails   , "Number of tails duplicated");
 STATISTIC(NumTailDups, "Number of tail duplicated blocks");
 STATISTIC(NumInstrDups, "Additional instructions due to tail duplication");
@@ -181,7 +182,7 @@ static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
         dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
         dbgs() << "  missing input from predecessor BB#"
                << PredBB->getNumber() << '\n';
-        llvm_unreachable(0);
+        llvm_unreachable(nullptr);
       }
     }
 
@@ -192,12 +193,12 @@ static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
           << ": " << *MI;
         dbgs() << "  extra input from predecessor BB#"
                << PHIBB->getNumber() << '\n';
-        llvm_unreachable(0);
+        llvm_unreachable(nullptr);
       }
       if (PHIBB->getNumber() < 0) {
         dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
         dbgs() << "  non-existing BB#" << PHIBB->getNumber() << '\n';
-        llvm_unreachable(0);
+        llvm_unreachable(nullptr);
       }
     }
     ++MI;
@@ -247,7 +248,7 @@ TailDuplicatePass::TailDuplicateAndUpdate(MachineBasicBlock *MBB,
       // If the original definition is still around, add it as an available
       // value.
       MachineInstr *DefMI = MRI->getVRegDef(VReg);
-      MachineBasicBlock *DefBB = 0;
+      MachineBasicBlock *DefBB = nullptr;
       if (DefMI) {
         DefBB = DefMI->getParent();
         SSAUpdate.AddAvailableValue(DefBB, VReg);
@@ -363,9 +364,7 @@ static unsigned getPHISrcRegOpIdx(MachineInstr *MI, MachineBasicBlock *SrcBB) {
 // block (which is why we need to copy the information).
 static void getRegsUsedByPHIs(const MachineBasicBlock &BB,
                               DenseSet<unsigned> *UsedByPhi) {
-  for(MachineBasicBlock::const_iterator I = BB.begin(), E = BB.end();
-      I != E; ++I) {
-    const MachineInstr &MI = *I;
+  for (const auto &MI : BB) {
     if (!MI.isPHI())
       break;
     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
@@ -656,7 +655,7 @@ TailDuplicatePass::canCompletelyDuplicateBB(MachineBasicBlock &BB) {
     if (PredBB->succ_size() > 1)
       return false;
 
-    MachineBasicBlock *PredTBB = NULL, *PredFBB = NULL;
+    MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
     SmallVector<MachineOperand, 4> PredCond;
     if (TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
       return false;
@@ -687,7 +686,7 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB,
     if (bothUsedInPHI(*PredBB, Succs))
       continue;
 
-    MachineBasicBlock *PredTBB = NULL, *PredFBB = NULL;
+    MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
     SmallVector<MachineOperand, 4> PredCond;
     if (TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
       continue;
@@ -718,14 +717,14 @@ TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB,
     // Make the branch unconditional if possible
     if (PredTBB == PredFBB) {
       PredCond.clear();
-      PredFBB = NULL;
+      PredFBB = nullptr;
     }
 
     // Avoid adding fall through branches.
     if (PredFBB == NextBB)
-      PredFBB = NULL;
-    if (PredTBB == NextBB && PredFBB == NULL)
-      PredTBB = NULL;
+      PredFBB = nullptr;
+    if (PredTBB == NextBB && PredFBB == nullptr)
+      PredTBB = nullptr;
 
     TII->RemoveBranch(*PredBB);
 
@@ -858,7 +857,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
   // block, which falls through unconditionally, move the contents of this
   // block into the prior block.
   MachineBasicBlock *PrevBB = std::prev(MachineFunction::iterator(TailBB));
-  MachineBasicBlock *PriorTBB = 0, *PriorFBB = 0;
+  MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr;
   SmallVector<MachineOperand, 4> PriorCond;
   // This has to check PrevBB->succ_size() because EH edges are ignored by
   // AnalyzeBranch.
diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp
index cae3ccd..c3f84c6 100644
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp
@@ -43,7 +43,7 @@ TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
                              const TargetRegisterInfo *TRI,
                              const MachineFunction &MF) const {
   if (OpNum >= MCID.getNumOperands())
-    return 0;
+    return nullptr;
 
   short RegClass = MCID.OpInfo[OpNum].RegClass;
   if (MCID.OpInfo[OpNum].isLookupPtrRegClass())
@@ -51,7 +51,7 @@ TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
 
   // Instructions like INSERT_SUBREG do not have fixed register classes.
   if (RegClass < 0)
-    return 0;
+    return nullptr;
 
   // Otherwise just look it up normally.
   return TRI->getRegClass(RegClass);
@@ -111,7 +111,7 @@ TargetInstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
   // If MBB isn't immediately before MBB, insert a branch to it.
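Most of the mechanical churn in this file and the surrounding ones is the 0/NULL-to-nullptr conversion. The motivation is overload resolution: nullptr has its own type, std::nullptr_t, so it can never be silently taken as an integer. A standalone illustration, unrelated to any specific function in the patch:

    // With a literal 0, the int overload wins; nullptr is unambiguously
    // a pointer. This is why the patch prefers nullptr in pointer contexts.
    void f(int);    // (1)
    void f(char *); // (2)

    void demo() {
      f(0);       // calls (1): 0 is an int first, even in pointer-like code
      f(nullptr); // calls (2): std::nullptr_t converts only to pointers
    }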
   if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(NewDest))
-    InsertBranch(*MBB, NewDest, 0, SmallVector<MachineOperand, 0>(),
+    InsertBranch(*MBB, NewDest, nullptr, SmallVector<MachineOperand, 0>(),
                  Tail->getDebugLoc());
   MBB->addSuccessor(NewDest);
 }
 
@@ -124,13 +124,11 @@ MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI,
   bool HasDef = MCID.getNumDefs();
   if (HasDef && !MI->getOperand(0).isReg())
     // No idea how to commute this instruction. Target should implement its own.
-    return 0;
+    return nullptr;
 
   unsigned Idx1, Idx2;
   if (!findCommutedOpIndices(MI, Idx1, Idx2)) {
-    std::string msg;
-    raw_string_ostream Msg(msg);
-    Msg << "Don't know how to commute: " << *MI;
-    report_fatal_error(Msg.str());
+    assert(MI->isCommutable() && "Precondition violation: MI must be commutable.");
+    return nullptr;
   }
 
   assert(MI->getOperand(Idx1).isReg() && MI->getOperand(Idx2).isReg() &&
@@ -250,13 +248,15 @@ bool TargetInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
          oe = MI->memoperands_end();
        o != oe;
        ++o) {
-    if ((*o)->isLoad() && (*o)->getValue())
+    if ((*o)->isLoad()) {
       if (const FixedStackPseudoSourceValue *Value =
-          dyn_cast<FixedStackPseudoSourceValue>((*o)->getValue())) {
+          dyn_cast_or_null<FixedStackPseudoSourceValue>(
+              (*o)->getPseudoValue())) {
         FrameIndex = Value->getFrameIndex();
         MMO = *o;
         return true;
       }
+    }
   }
   return false;
 }
@@ -268,13 +268,15 @@ bool TargetInstrInfo::hasStoreToStackSlot(const MachineInstr *MI,
          oe = MI->memoperands_end();
        o != oe;
        ++o) {
-    if ((*o)->isStore() && (*o)->getValue())
+    if ((*o)->isStore()) {
       if (const FixedStackPseudoSourceValue *Value =
-          dyn_cast<FixedStackPseudoSourceValue>((*o)->getValue())) {
+          dyn_cast_or_null<FixedStackPseudoSourceValue>(
+              (*o)->getPseudoValue())) {
         FrameIndex = Value->getFrameIndex();
         MMO = *o;
         return true;
       }
+    }
   }
   return false;
 }
@@ -340,14 +342,14 @@ static const TargetRegisterClass *canFoldCopy(const MachineInstr *MI,
                                               unsigned FoldIdx) {
   assert(MI->isCopy() && "MI must be a COPY instruction");
   if (MI->getNumOperands() != 2)
-    return 0;
+    return nullptr;
   assert(FoldIdx<2 && "FoldIdx refers no nonexistent operand");
 
   const MachineOperand &FoldOp = MI->getOperand(FoldIdx);
   const MachineOperand &LiveOp = MI->getOperand(1-FoldIdx);
 
   if (FoldOp.getSubReg() || LiveOp.getSubReg())
-    return 0;
+    return nullptr;
 
   unsigned FoldReg = FoldOp.getReg();
   unsigned LiveReg = LiveOp.getReg();
@@ -359,13 +361,13 @@ static const TargetRegisterClass *canFoldCopy(const MachineInstr *MI,
   const TargetRegisterClass *RC = MRI.getRegClass(FoldReg);
 
   if (TargetRegisterInfo::isPhysicalRegister(LiveOp.getReg()))
-    return RC->contains(LiveOp.getReg()) ? RC : 0;
+    return RC->contains(LiveOp.getReg()) ? RC : nullptr;
 
   if (RC->hasSubClassEq(MRI.getRegClass(LiveReg)))
     return RC;
 
   // FIXME: Allow folding when register classes are memory compatible.
-  return 0;
+  return nullptr;
 }
 
 bool TargetInstrInfo::
@@ -399,7 +401,7 @@ static MachineInstr* foldPatchpoint(MachineFunction &MF,
   for (SmallVectorImpl<unsigned>::const_iterator I = Ops.begin(), E = Ops.end();
        I != E; ++I) {
     if (*I < StartIdx)
-      return 0;
+      return nullptr;
   }
 
   MachineInstr *NewMI =
@@ -454,7 +456,7 @@ TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
   assert(MBB && "foldMemoryOperand needs an inserted instruction");
   MachineFunction &MF = *MBB->getParent();
 
-  MachineInstr *NewMI = 0;
+  MachineInstr *NewMI = nullptr;
 
   if (MI->getOpcode() == TargetOpcode::STACKMAP ||
       MI->getOpcode() == TargetOpcode::PATCHPOINT) {
@@ -488,11 +490,11 @@ TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
 
   // Straight COPY may fold as load/store.
   if (!MI->isCopy() || Ops.size() != 1)
-    return 0;
+    return nullptr;
 
   const TargetRegisterClass *RC = canFoldCopy(MI, Ops[0]);
   if (!RC)
-    return 0;
+    return nullptr;
 
   const MachineOperand &MO = MI->getOperand(1-Ops[0]);
   MachineBasicBlock::iterator Pos = MI;
@@ -521,7 +523,7 @@ TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
   MachineFunction &MF = *MBB.getParent();
 
   // Ask the target to do the actual folding.
-  MachineInstr *NewMI = 0;
+  MachineInstr *NewMI = nullptr;
   int FrameIndex = 0;
 
   if ((MI->getOpcode() == TargetOpcode::STACKMAP ||
@@ -534,7 +536,7 @@ TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
     NewMI = foldMemoryOperandImpl(MF, MI, Ops, LoadMI);
   }
 
-  if (!NewMI) return 0;
+  if (!NewMI) return nullptr;
 
   NewMI = MBB.insert(MI, NewMI);
 
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 870370b..2634d71 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -82,16 +82,16 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
   Names[RTLIB::UREM_I128] = "__umodti3";
 
   // These are generally not available.
-  Names[RTLIB::SDIVREM_I8] = 0;
-  Names[RTLIB::SDIVREM_I16] = 0;
-  Names[RTLIB::SDIVREM_I32] = 0;
-  Names[RTLIB::SDIVREM_I64] = 0;
-  Names[RTLIB::SDIVREM_I128] = 0;
-  Names[RTLIB::UDIVREM_I8] = 0;
-  Names[RTLIB::UDIVREM_I16] = 0;
-  Names[RTLIB::UDIVREM_I32] = 0;
-  Names[RTLIB::UDIVREM_I64] = 0;
-  Names[RTLIB::UDIVREM_I128] = 0;
+  Names[RTLIB::SDIVREM_I8] = nullptr;
+  Names[RTLIB::SDIVREM_I16] = nullptr;
+  Names[RTLIB::SDIVREM_I32] = nullptr;
+  Names[RTLIB::SDIVREM_I64] = nullptr;
+  Names[RTLIB::SDIVREM_I128] = nullptr;
+  Names[RTLIB::UDIVREM_I8] = nullptr;
+  Names[RTLIB::UDIVREM_I16] = nullptr;
+  Names[RTLIB::UDIVREM_I32] = nullptr;
+  Names[RTLIB::UDIVREM_I64] = nullptr;
+  Names[RTLIB::UDIVREM_I128] = nullptr;
 
   Names[RTLIB::NEG_I32] = "__negsi2";
   Names[RTLIB::NEG_I64] = "__negdi2";
@@ -392,18 +392,18 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
     Names[RTLIB::SINCOS_PPCF128] = "sincosl";
   } else {
     // These are generally not available.
-    Names[RTLIB::SINCOS_F32] = 0;
-    Names[RTLIB::SINCOS_F64] = 0;
-    Names[RTLIB::SINCOS_F80] = 0;
-    Names[RTLIB::SINCOS_F128] = 0;
-    Names[RTLIB::SINCOS_PPCF128] = 0;
+    Names[RTLIB::SINCOS_F32] = nullptr;
+    Names[RTLIB::SINCOS_F64] = nullptr;
+    Names[RTLIB::SINCOS_F80] = nullptr;
+    Names[RTLIB::SINCOS_F128] = nullptr;
+    Names[RTLIB::SINCOS_PPCF128] = nullptr;
   }
 
   if (Triple(TM.getTargetTriple()).getOS() != Triple::OpenBSD) {
     Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = "__stack_chk_fail";
   } else {
     // These are generally not available.
-    Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = 0;
+    Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = nullptr;
   }
 }
 
@@ -680,6 +680,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm,
   UseUnderscoreLongJmp = false;
   SelectIsExpensive = false;
   HasMultipleConditionRegisters = false;
+  HasExtractBitsInsn = false;
   IntDivIsCheap = false;
   Pow2DivIsCheap = false;
   JumpIsExpensive = false;
@@ -914,7 +915,6 @@ bool TargetLoweringBase::isLegalRC(const TargetRegisterClass *RC) const {
 MachineBasicBlock*
 TargetLoweringBase::emitPatchPoint(MachineInstr *MI,
                                    MachineBasicBlock *MBB) const {
-  const TargetMachine &TM = getTargetMachine();
   MachineFunction &MF = *MI->getParent()->getParent();
 
   // MI changes inside this loop as we grow operands.
@@ -1006,7 +1006,7 @@ void TargetLoweringBase::computeRegisterProperties() {
 
   // Find the largest integer register class.
   unsigned LargestIntReg = MVT::LAST_INTEGER_VALUETYPE;
-  for (; RegClassForVT[LargestIntReg] == 0; --LargestIntReg)
+  for (; RegClassForVT[LargestIntReg] == nullptr; --LargestIntReg)
     assert(LargestIntReg != MVT::i1 && "No integer registers defined!");
 
   // Every integer value type larger than this largest register takes twice as
@@ -1326,7 +1326,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
   case Mul:            return ISD::MUL;
   case FMul:           return ISD::FMUL;
   case UDiv:           return ISD::UDIV;
-  case SDiv:           return ISD::UDIV;
+  case SDiv:           return ISD::SDIV;
   case FDiv:           return ISD::FDIV;
   case URem:           return ISD::UREM;
   case SRem:           return ISD::SREM;
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index e41fbfc..dda2259 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -100,7 +100,7 @@ const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference(
     // Add information about the stub reference to ELFMMI so that the stub
     // gets emitted by the asmprinter.
     MachineModuleInfoImpl::StubValueTy &StubSym = ELFMMI.getGVStubEntry(SSym);
-    if (StubSym.getPointer() == 0) {
+    if (!StubSym.getPointer()) {
       MCSymbol *Sym = TM.getSymbol(GV, Mang);
       StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
     }
@@ -339,8 +339,8 @@ getSectionForConstant(SectionKind Kind) const {
   return DataRelROSection;
 }
 
-const MCSection *
-TargetLoweringObjectFileELF::getStaticCtorSection(unsigned Priority) const {
+const MCSection *TargetLoweringObjectFileELF::getStaticCtorSection(
+    unsigned Priority, const MCSymbol *KeySym, const MCSection *KeySec) const {
   // The default scheme is .ctor / .dtor, so we have to invert the priority
   // numbering.
   if (Priority == 65535)
@@ -359,8 +359,8 @@ TargetLoweringObjectFileELF::getStaticCtorSection(unsigned Priority) const {
   }
 }
 
-const MCSection *
-TargetLoweringObjectFileELF::getStaticDtorSection(unsigned Priority) const {
+const MCSection *TargetLoweringObjectFileELF::getStaticDtorSection(
+    unsigned Priority, const MCSymbol *KeySym, const MCSection *KeySec) const {
   // The default scheme is .ctor / .dtor, so we have to invert the priority
   // numbering.
   if (Priority == 65535)
@@ -418,7 +418,7 @@ emitModuleFlags(MCStreamer &Streamer,
                 Mangler &Mang, const TargetMachine &TM) const {
   unsigned VersionVal = 0;
   unsigned ImageInfoFlags = 0;
-  MDNode *LinkerOptions = 0;
+  MDNode *LinkerOptions = nullptr;
   StringRef SectionVal;
 
   for (ArrayRef<Module::ModuleFlagEntry>::iterator
@@ -659,7 +659,7 @@ const MCExpr *TargetLoweringObjectFileMachO::getTTypeGlobalReference(
     MachineModuleInfoImpl::StubValueTy &StubSym =
       GV->hasHiddenVisibility() ? MachOMMI.getHiddenGVStubEntry(SSym) :
                                   MachOMMI.getGVStubEntry(SSym);
-    if (StubSym.getPointer() == 0) {
+    if (!StubSym.getPointer()) {
       MCSymbol *Sym = TM.getSymbol(GV, Mang);
       StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
     }
@@ -685,7 +685,7 @@ MCSymbol *TargetLoweringObjectFileMachO::getCFIPersonalitySymbol(
 
   // Add information about the stub reference to MachOMMI so that the stub
   // gets emitted by the asmprinter.
   MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(SSym);
-  if (StubSym.getPointer() == 0) {
+  if (!StubSym.getPointer()) {
     MCSymbol *Sym = TM.getSymbol(GV, Mang);
     StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
   }
@@ -755,7 +755,7 @@ const MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal(
 
 static const char *getCOFFSectionNameForUniqueGlobal(SectionKind Kind) {
   if (Kind.isText())
     return ".text";
-  if (Kind.isBSS ())
+  if (Kind.isBSS())
     return ".bss";
   if (Kind.isThreadLocal())
     return ".tls$";
@@ -781,7 +781,7 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
   // Section names depend on the name of the symbol which is not feasible if the
   // symbol has private linkage.
   if ((GV->isWeakForLinker() || EmitUniquedSection) &&
-      !GV->hasPrivateLinkage()) {
+      !GV->hasPrivateLinkage() && !Kind.isCommon()) {
     const char *Name = getCOFFSectionNameForUniqueGlobal(Kind);
     unsigned Characteristics = getCOFFSectionFlags(Kind);
 
@@ -802,7 +802,10 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
   if (Kind.isReadOnly())
     return ReadOnlySection;
 
-  if (Kind.isBSS())
+  // Note: we claim that common symbols are put in BSSSection, but they are
+  // really emitted with the magic .comm directive, which creates a symbol table
+  // entry but not a section.
+  if (Kind.isBSS() || Kind.isCommon())
     return BSSSection;
 
   return DataSection;
@@ -820,7 +823,7 @@ void TargetLoweringObjectFileCOFF::
 emitModuleFlags(MCStreamer &Streamer,
                 ArrayRef<Module::ModuleFlagEntry> ModuleFlags,
                 Mangler &Mang, const TargetMachine &TM) const {
-  MDNode *LinkerOptions = 0;
+  MDNode *LinkerOptions = nullptr;
 
   // Look for the "Linker Options" flag, since it's the only one we support.
   for (ArrayRef<Module::ModuleFlagEntry>::iterator
@@ -862,3 +865,32 @@ emitModuleFlags(MCStreamer &Streamer,
     }
   }
 }
+
+static const MCSection *getAssociativeCOFFSection(MCContext &Ctx,
+                                                  const MCSection *Sec,
+                                                  const MCSymbol *KeySym,
+                                                  const MCSection *KeySec) {
+  // Return the normal section if we don't have to be associative.
+  if (!KeySym)
+    return Sec;
+
+  // Make an associative section with the same name and kind as the normal
+  // section.
+  const MCSectionCOFF *SecCOFF = cast<MCSectionCOFF>(Sec);
+  const MCSectionCOFF *KeySecCOFF = cast<MCSectionCOFF>(KeySec);
+  unsigned Characteristics =
+      SecCOFF->getCharacteristics() | COFF::IMAGE_SCN_LNK_COMDAT;
+  return Ctx.getCOFFSection(SecCOFF->getSectionName(), Characteristics,
+                            SecCOFF->getKind(), KeySym->getName(),
+                            COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, KeySecCOFF);
+}
+
+const MCSection *TargetLoweringObjectFileCOFF::getStaticCtorSection(
+    unsigned Priority, const MCSymbol *KeySym, const MCSection *KeySec) const {
+  return getAssociativeCOFFSection(getContext(), StaticCtorSection, KeySym, KeySec);
+}
+
+const MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection(
+    unsigned Priority, const MCSymbol *KeySym, const MCSection *KeySec) const {
+  return getAssociativeCOFFSection(getContext(), StaticDtorSection, KeySym, KeySec);
+}
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index 5a15243..a3a4fb3 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp
@@ -101,7 +101,7 @@ TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const {
       Idx += Offset + 1;
     }
   }
-  return NULL;
+  return nullptr;
 }
 
 /// getMinimalPhysRegClass - Returns the Register Class of a physical
@@ -113,7 +113,7 @@ TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, EVT VT) const {
 
   // Pick the most sub register class of the right type that contains
   // this physreg.
-  const TargetRegisterClass* BestRC = 0;
+  const TargetRegisterClass* BestRC = nullptr;
   for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I){
     const TargetRegisterClass* RC = *I;
     if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) &&
@@ -130,7 +130,7 @@ TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, EVT VT) const {
 static void getAllocatableSetForRC(const MachineFunction &MF,
                                    const TargetRegisterClass *RC, BitVector &R){
   assert(RC->isAllocatable() && "invalid for nonallocatable sets");
-  ArrayRef<uint16_t> Order = RC->getRawAllocationOrder(MF);
+  ArrayRef<MCPhysReg> Order = RC->getRawAllocationOrder(MF);
   for (unsigned i = 0; i != Order.size(); ++i)
     R.set(Order[i]);
 }
@@ -164,7 +164,7 @@ const TargetRegisterClass *firstCommonClass(const uint32_t *A,
   for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32)
     if (unsigned Common = *A++ & *B++)
       return TRI->getRegClass(I + countTrailingZeros(Common));
-  return 0;
+  return nullptr;
 }
 
 const TargetRegisterClass *
@@ -174,7 +174,7 @@ TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A,
   if (A == B)
     return A;
   if (!A || !B)
-    return 0;
+    return nullptr;
 
   // Register classes are ordered topologically, so the largest common
   // sub-class it the common sub-class with the smallest ID.
@@ -194,7 +194,7 @@ TargetRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
     // The bit mask contains all register classes that are projected into B
     // by Idx. Find a class that is also a sub-class of A.
     return firstCommonClass(RCI.getMask(), A->getSubClassMask(), this);
-  return 0;
+  return nullptr;
 }
 
 const TargetRegisterClass *TargetRegisterInfo::
@@ -215,7 +215,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA,
   // Arrange for RCA to be the larger register so the answer will be found in
   // the first iteration. This makes the search linear for the most common
   // case.
-  const TargetRegisterClass *BestRC = 0;
+  const TargetRegisterClass *BestRC = nullptr;
   unsigned *BestPreA = &PreA;
   unsigned *BestPreB = &PreB;
   if (RCA->getSize() < RCB->getSize()) {
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index d9e5aae..f42d47b 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -27,7 +27,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "twoaddrinstr"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
@@ -51,6 +50,8 @@
 #include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "twoaddrinstr"
+
 STATISTIC(NumTwoAddressInstrs, "Number of two-address instructions");
 STATISTIC(NumCommuted        , "Number of instructions commuted to coalesce");
 STATISTIC(NumAggrCommuted    , "Number of instructions aggressively commuted");
@@ -211,7 +212,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
   }
 
   // Find the instruction that kills SavedReg.
-  MachineInstr *KillMI = NULL;
+  MachineInstr *KillMI = nullptr;
   if (LIS) {
     LiveInterval &LI = LIS->getInterval(SavedReg);
     assert(LI.end() != LI.begin() &&
@@ -250,7 +251,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
   // FIXME: This can be sped up if there is an easy way to query whether an
   // instruction is before or after another instruction. Then we can use
   // MachineRegisterInfo def / use instead.
-  MachineOperand *KillMO = NULL;
+  MachineOperand *KillMO = nullptr;
   MachineBasicBlock::iterator KillPos = KillMI;
   ++KillPos;
 
@@ -454,10 +455,10 @@ MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB,
                                      unsigned &DstReg, bool &IsDstPhys) {
   if (!MRI->hasOneNonDBGUse(Reg))
     // None or more than one use.
-    return 0;
+    return nullptr;
   MachineInstr &UseMI = *MRI->use_instr_nodbg_begin(Reg);
   if (UseMI.getParent() != MBB)
-    return 0;
+    return nullptr;
   unsigned SrcReg;
   bool IsSrcPhys;
   if (isCopyToReg(UseMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys)) {
@@ -469,7 +470,7 @@ MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB,
     IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
     return &UseMI;
   }
-  return 0;
+  return nullptr;
 }
 
 /// getMappedReg - Return the physical register the specified virtual register
@@ -576,7 +577,7 @@ commuteInstruction(MachineBasicBlock::iterator &mi,
   DEBUG(dbgs() << "2addr: COMMUTING  : " << *MI);
   MachineInstr *NewMI = TII->commuteInstruction(MI);
 
-  if (NewMI == 0) {
+  if (NewMI == nullptr) {
     DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n");
     return false;
   }
@@ -755,7 +756,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
     // Must be created from unfolded load. Don't waste time trying this.
     return false;
 
-  MachineInstr *KillMI = 0;
+  MachineInstr *KillMI = nullptr;
   if (LIS) {
     LiveInterval &LI = LIS->getInterval(Reg);
     assert(LI.end() != LI.begin() &&
@@ -947,7 +948,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
     // Must be created from unfolded load. Don't waste time trying this.
     return false;
 
-  MachineInstr *KillMI = 0;
+  MachineInstr *KillMI = nullptr;
   if (LIS) {
     LiveInterval &LI = LIS->getInterval(Reg);
     assert(LI.end() != LI.begin() &&
@@ -1394,7 +1395,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
                  SubRegB) &&
              "tied subregister must be a truncation");
       // The superreg class will not be used to constrain the subreg class.
-      RC = 0;
+      RC = nullptr;
     } else {
       assert(TRI->getMatchingSuperReg(RegA, SubRegB, MRI->getRegClass(RegB))
              && "tied subregister must be a truncation");
@@ -1631,7 +1632,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
       TargetRegisterInfo::isPhysicalRegister(DstReg) ||
       !(MI->getNumOperands() & 1)) {
     DEBUG(dbgs() << "Illegal REG_SEQUENCE instruction:" << *MI);
-    llvm_unreachable(0);
+    llvm_unreachable(nullptr);
   }
 
   SmallVector<unsigned, 4> OrigRegs;
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index f892e94..704736f 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -16,7 +16,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "regalloc"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "LiveDebugVariables.h"
 #include "llvm/ADT/STLExtras.h"
@@ -40,6 +39,8 @@
 #include <algorithm>
 using namespace llvm;
 
+#define DEBUG_TYPE "regalloc"
+
 STATISTIC(NumSpillSlots, "Number of spill slots allocated");
 STATISTIC(NumIdCopies,   "Number of identity moves eliminated after rewriting");
 
diff --git a/lib/CodeGen/module.modulemap b/lib/CodeGen/module.modulemap
new file mode 100644
index 0000000..d4f68bc
--- /dev/null
+++ b/lib/CodeGen/module.modulemap
@@ -0,0 +1 @@
+module CodeGen { requires cplusplus umbrella "." module * { export * } }
diff --git a/lib/DebugInfo/DWARFCompileUnit.h b/lib/DebugInfo/DWARFCompileUnit.h
index d1853d8..2ed188e 100644
--- a/lib/DebugInfo/DWARFCompileUnit.h
+++ b/lib/DebugInfo/DWARFCompileUnit.h
@@ -16,10 +16,10 @@ namespace llvm {
 
 class DWARFCompileUnit : public DWARFUnit {
 public:
-  DWARFCompileUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
-                   StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
+  DWARFCompileUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef RS,
+                   StringRef SS, StringRef SOS, StringRef AOS,
                    const RelocAddrMap *M, bool LE)
-      : DWARFUnit(DA, IS, AS, RS, SS, SOS, AOS, M, LE) {}
+      : DWARFUnit(DA, IS, RS, SS, SOS, AOS, M, LE) {}
   void dump(raw_ostream &OS);
   // VTable anchor.
   ~DWARFCompileUnit() override;
diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp
index 60c5f6a..e52e8af 100644
--- a/lib/DebugInfo/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARFContext.cpp
@@ -8,6 +8,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "DWARFContext.h"
+#include "DWARFDebugArangeSet.h"
+
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Compression.h"
@@ -20,7 +22,11 @@ using namespace llvm;
 using namespace dwarf;
 using namespace object;
 
+#define DEBUG_TYPE "dwarf"
+
 typedef DWARFDebugLine::LineTable DWARFLineTable;
+typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind;
+typedef DILineInfoSpecifier::FunctionNameKind FunctionNameKind;
 
 static void dumpPubSection(raw_ostream &OS, StringRef Name, StringRef Data,
                            bool LittleEndian, bool GnuStyle) {
@@ -126,8 +132,9 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
         if (stmtOffset != -1U) {
           DataExtractor lineData(getLineSection().Data, isLittleEndian(),
                                  savedAddressByteSize);
-          DWARFDebugLine::DumpingState state(OS);
-          DWARFDebugLine::parseStatementTable(lineData, &getLineSection().Relocs, &stmtOffset, state);
+          DWARFDebugLine::LineTable LineTable;
+          LineTable.parse(lineData, &getLineSection().Relocs, &stmtOffset);
+          LineTable.dump(OS);
         }
       }
     }
@@ -137,9 +144,11 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
     unsigned stmtOffset = 0;
     DataExtractor lineData(getLineDWOSection().Data, isLittleEndian(),
                            savedAddressByteSize);
-    DWARFDebugLine::DumpingState state(OS);
-    while (DWARFDebugLine::parsePrologue(lineData, &stmtOffset, &state.Prologue))
-      state.finalize();
+    DWARFDebugLine::LineTable LineTable;
+    while (LineTable.Prologue.parse(lineData, &stmtOffset)) {
+      LineTable.dump(OS);
+      LineTable.clear();
+    }
   }
 
   if (DumpType == DIDT_All || DumpType == DIDT_Str) {
@@ -216,7 +225,7 @@ const DWARFDebugAbbrev *DWARFContext::getDebugAbbrev() {
   DataExtractor abbrData(getAbbrevSection(), isLittleEndian(), 0);
 
   Abbrev.reset(new DWARFDebugAbbrev());
-  Abbrev->parse(abbrData);
+  Abbrev->extract(abbrData);
   return Abbrev.get();
 }
 
@@ -226,7 +235,7 @@ const DWARFDebugAbbrev *DWARFContext::getDebugAbbrevDWO() {
 
   DataExtractor abbrData(getAbbrevDWOSection(), isLittleEndian(), 0);
   AbbrevDWO.reset(new DWARFDebugAbbrev());
-  AbbrevDWO->parse(abbrData);
+  AbbrevDWO->extract(abbrData);
   return AbbrevDWO.get();
 }
 
@@ -290,7 +299,7 @@ DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) {
     cu->getCompileUnitDIE()->getAttributeValueAsSectionOffset(
         cu, DW_AT_stmt_list, -1U);
   if (stmtOffset == -1U)
-    return 0; // No line table for this compile unit.
+    return nullptr; // No line table for this compile unit.
 
   // See if the line table is cached.
   if (const DWARFLineTable *lt = Line->getLineTable(stmtOffset))
@@ -310,8 +319,8 @@ void DWARFContext::parseCompileUnits() {
                      isLittleEndian(), 0);
   while (DIData.isValidOffset(offset)) {
     std::unique_ptr<DWARFCompileUnit> CU(new DWARFCompileUnit(
-        getDebugAbbrev(), getInfoSection().Data, getAbbrevSection(),
-        getRangeSection(), getStringSection(), StringRef(), getAddrSection(),
+        getDebugAbbrev(), getInfoSection().Data, getRangeSection(),
+        getStringSection(), StringRef(), getAddrSection(),
         &getInfoSection().Relocs, isLittleEndian()));
     if (!CU->extract(DIData, &offset)) {
       break;
@@ -329,10 +338,10 @@ void DWARFContext::parseTypeUnits() {
     const DataExtractor &DIData =
        DataExtractor(I.second.Data, isLittleEndian(), 0);
     while (DIData.isValidOffset(offset)) {
-      std::unique_ptr<DWARFTypeUnit> TU(new DWARFTypeUnit(
-          getDebugAbbrev(), I.second.Data, getAbbrevSection(),
-          getRangeSection(), getStringSection(), StringRef(), getAddrSection(),
-          &I.second.Relocs, isLittleEndian()));
+      std::unique_ptr<DWARFTypeUnit> TU(
+          new DWARFTypeUnit(getDebugAbbrev(), I.second.Data, getRangeSection(),
+                            getStringSection(), StringRef(), getAddrSection(),
+                            &I.second.Relocs, isLittleEndian()));
       if (!TU->extract(DIData, &offset))
         break;
       TUs.push_back(std::move(TU));
@@ -349,9 +358,8 @@ void DWARFContext::parseDWOCompileUnits() {
       DataExtractor(getInfoDWOSection().Data, isLittleEndian(), 0);
   while (DIData.isValidOffset(offset)) {
     std::unique_ptr<DWARFCompileUnit> DWOCU(new DWARFCompileUnit(
-        getDebugAbbrevDWO(), getInfoDWOSection().Data, getAbbrevDWOSection(),
-        getRangeDWOSection(), getStringDWOSection(),
-        getStringOffsetDWOSection(), getAddrSection(),
+        getDebugAbbrevDWO(), getInfoDWOSection().Data, getRangeDWOSection(),
+        getStringDWOSection(), getStringOffsetDWOSection(), getAddrSection(),
         &getInfoDWOSection().Relocs, isLittleEndian()));
     if (!DWOCU->extract(DIData, &offset)) {
       break;
@@ -370,10 +378,9 @@ void DWARFContext::parseDWOTypeUnits() {
         DataExtractor(I.second.Data, isLittleEndian(), 0);
     while (DIData.isValidOffset(offset)) {
       std::unique_ptr<DWARFTypeUnit> TU(new DWARFTypeUnit(
-          getDebugAbbrevDWO(), I.second.Data, getAbbrevDWOSection(),
-          getRangeDWOSection(), getStringDWOSection(),
-          getStringOffsetDWOSection(), getAddrSection(), &I.second.Relocs,
-          isLittleEndian()));
+          getDebugAbbrevDWO(), I.second.Data, getRangeDWOSection(),
+          getStringDWOSection(), getStringOffsetDWOSection(), getAddrSection(),
+          &I.second.Relocs, isLittleEndian()));
       if (!TU->extract(DIData, &offset))
         break;
       DWOTUs.push_back(std::move(TU));
@@ -408,7 +415,7 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t Offset) {
   if (CU != CUs.end()) {
     return CU->get();
   }
-  return 0;
+  return nullptr;
 }
 
 DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) {
@@ -420,15 +427,13 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) {
 
 static bool getFileNameForCompileUnit(DWARFCompileUnit *CU,
                                       const DWARFLineTable *LineTable,
-                                      uint64_t FileIndex,
-                                      bool NeedsAbsoluteFilePath,
+                                      uint64_t FileIndex, FileLineInfoKind Kind,
                                       std::string &FileName) {
-  if (CU == 0 ||
-      LineTable == 0 ||
-      !LineTable->getFileNameByIndex(FileIndex, NeedsAbsoluteFilePath,
-                                     FileName))
+  if (!CU || !LineTable || Kind == FileLineInfoKind::None ||
+      !LineTable->getFileNameByIndex(FileIndex, Kind, FileName))
     return false;
-  if (NeedsAbsoluteFilePath && sys::path::is_relative(FileName)) {
+  if (Kind == FileLineInfoKind::AbsoluteFilePath &&
+      sys::path::is_relative(FileName)) {
     // We may still need to append compilation directory of compile unit.
     SmallString<16> AbsolutePath;
     if (const char *CompilationDir = CU->getCompilationDir()) {
@@ -443,10 +448,9 @@ static bool getFileNameForCompileUnit(DWARFCompileUnit *CU,
 
 static bool getFileLineInfoForCompileUnit(DWARFCompileUnit *CU,
                                           const DWARFLineTable *LineTable,
                                           uint64_t Address,
-                                          bool NeedsAbsoluteFilePath,
-                                          std::string &FileName,
-                                          uint32_t &Line, uint32_t &Column) {
-  if (CU == 0 || LineTable == 0)
+                                          FileLineInfoKind Kind,
+                                          DILineInfo &Result) {
+  if (!CU || !LineTable)
     return false;
   // Get the index of row we're looking for in the line table.
   uint32_t RowIndex = LineTable->lookupAddress(Address);
@@ -454,80 +458,71 @@ static bool getFileLineInfoForCompileUnit(DWARFCompileUnit *CU,
     return false;
   // Take file number and line/column from the row.
   const DWARFDebugLine::Row &Row = LineTable->Rows[RowIndex];
-  if (!getFileNameForCompileUnit(CU, LineTable, Row.File,
-                                 NeedsAbsoluteFilePath, FileName))
+  if (!getFileNameForCompileUnit(CU, LineTable, Row.File, Kind,
+                                 Result.FileName))
    return false;
-  Line = Row.Line;
-  Column = Row.Column;
+  Result.Line = Row.Line;
+  Result.Column = Row.Column;
   return true;
 }
 
+static bool getFunctionNameForAddress(DWARFCompileUnit *CU, uint64_t Address,
+                                      FunctionNameKind Kind,
+                                      std::string &FunctionName) {
+  if (Kind == FunctionNameKind::None)
+    return false;
+  // The address may correspond to instruction in some inlined function,
+  // so we have to build the chain of inlined functions and take the
+  // name of the topmost function in it.
+  const DWARFDebugInfoEntryInlinedChain &InlinedChain =
+      CU->getInlinedChainForAddress(Address);
+  if (InlinedChain.DIEs.size() == 0)
+    return false;
+  const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain.DIEs[0];
+  if (const char *Name =
+          TopFunctionDIE.getSubroutineName(InlinedChain.U, Kind)) {
+    FunctionName = Name;
+    return true;
+  }
+  return false;
+}
+
 DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
-    DILineInfoSpecifier Specifier) {
+                                               DILineInfoSpecifier Spec) {
+  DILineInfo Result;
+
   DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
   if (!CU)
-    return DILineInfo();
-  std::string FileName = "<invalid>";
-  std::string FunctionName = "<invalid>";
-  uint32_t Line = 0;
-  uint32_t Column = 0;
-  if (Specifier.needs(DILineInfoSpecifier::FunctionName)) {
-    // The address may correspond to instruction in some inlined function,
-    // so we have to build the chain of inlined functions and take the
-    // name of the topmost function in it.
-    const DWARFDebugInfoEntryInlinedChain &InlinedChain =
-        CU->getInlinedChainForAddress(Address);
-    if (InlinedChain.DIEs.size() > 0) {
-      const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain.DIEs[0];
-      if (const char *Name = TopFunctionDIE.getSubroutineName(InlinedChain.U))
-        FunctionName = Name;
-    }
-  }
-  if (Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
+    return Result;
+  getFunctionNameForAddress(CU, Address, Spec.FNKind, Result.FunctionName);
+  if (Spec.FLIKind != FileLineInfoKind::None) {
     const DWARFLineTable *LineTable = getLineTableForCompileUnit(CU);
-    const bool NeedsAbsoluteFilePath =
-        Specifier.needs(DILineInfoSpecifier::AbsoluteFilePath);
-    getFileLineInfoForCompileUnit(CU, LineTable, Address,
-                                  NeedsAbsoluteFilePath,
-                                  FileName, Line, Column);
+    getFileLineInfoForCompileUnit(CU, LineTable, Address, Spec.FLIKind, Result);
   }
-  return DILineInfo(StringRef(FileName), StringRef(FunctionName),
-                    Line, Column);
+  return Result;
 }
 
-DILineInfoTable DWARFContext::getLineInfoForAddressRange(uint64_t Address,
-    uint64_t Size,
-    DILineInfoSpecifier Specifier) {
+DILineInfoTable
+DWARFContext::getLineInfoForAddressRange(uint64_t Address, uint64_t Size,
+                                         DILineInfoSpecifier Spec) {
   DILineInfoTable Lines;
   DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
   if (!CU)
     return Lines;
 
   std::string FunctionName = "<invalid>";
-  if (Specifier.needs(DILineInfoSpecifier::FunctionName)) {
-    // The address may correspond to instruction in some inlined function,
-    // so we have to build the chain of inlined functions and take the
-    // name of the topmost function in it.
-    const DWARFDebugInfoEntryInlinedChain &InlinedChain =
-        CU->getInlinedChainForAddress(Address);
-    if (InlinedChain.DIEs.size() > 0) {
-      const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain.DIEs[0];
-      if (const char *Name = TopFunctionDIE.getSubroutineName(InlinedChain.U))
-        FunctionName = Name;
-    }
-  }
+  getFunctionNameForAddress(CU, Address, Spec.FNKind, FunctionName);
 
   // If the Specifier says we don't need FileLineInfo, just
   // return the top-most function at the starting address.
-  if (!Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
-    Lines.push_back(
-        std::make_pair(Address, DILineInfo("<invalid>", FunctionName, 0, 0)));
+  if (Spec.FLIKind == FileLineInfoKind::None) {
+    DILineInfo Result;
+    Result.FunctionName = FunctionName;
+    Lines.push_back(std::make_pair(Address, Result));
     return Lines;
   }
 
   const DWARFLineTable *LineTable = getLineTableForCompileUnit(CU);
-  const bool NeedsAbsoluteFilePath =
-      Specifier.needs(DILineInfoSpecifier::AbsoluteFilePath);
 
   // Get the index of row we're looking for in the line table.
   std::vector<uint32_t> RowVector;
@@ -537,59 +532,67 @@ DILineInfoTable DWARFContext::getLineInfoForAddressRange(uint64_t Address,
 
   for (uint32_t RowIndex : RowVector) {
     // Take file number and line/column from the row.
const DWARFDebugLine::Row &Row = LineTable->Rows[RowIndex]; - std::string FileName = ""; - getFileNameForCompileUnit(CU, LineTable, Row.File, - NeedsAbsoluteFilePath, FileName); - Lines.push_back(std::make_pair( - Row.Address, DILineInfo(FileName, FunctionName, Row.Line, Row.Column))); + DILineInfo Result; + getFileNameForCompileUnit(CU, LineTable, Row.File, Spec.FLIKind, + Result.FileName); + Result.FunctionName = FunctionName; + Result.Line = Row.Line; + Result.Column = Row.Column; + Lines.push_back(std::make_pair(Row.Address, Result)); } return Lines; } -DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address, - DILineInfoSpecifier Specifier) { +DIInliningInfo +DWARFContext::getInliningInfoForAddress(uint64_t Address, + DILineInfoSpecifier Spec) { + DIInliningInfo InliningInfo; + DWARFCompileUnit *CU = getCompileUnitForAddress(Address); if (!CU) - return DIInliningInfo(); + return InliningInfo; + const DWARFLineTable *LineTable = nullptr; const DWARFDebugInfoEntryInlinedChain &InlinedChain = CU->getInlinedChainForAddress(Address); - if (InlinedChain.DIEs.size() == 0) - return DIInliningInfo(); + if (InlinedChain.DIEs.size() == 0) { + // If there is no DIE for address (e.g. it is in unavailable .dwo file), + // try to at least get file/line info from symbol table. + if (Spec.FLIKind != FileLineInfoKind::None) { + DILineInfo Frame; + LineTable = getLineTableForCompileUnit(CU); + if (getFileLineInfoForCompileUnit(CU, LineTable, Address, Spec.FLIKind, + Frame)) { + InliningInfo.addFrame(Frame); + } + } + return InliningInfo; + } - DIInliningInfo InliningInfo; uint32_t CallFile = 0, CallLine = 0, CallColumn = 0; - const DWARFLineTable *LineTable = 0; for (uint32_t i = 0, n = InlinedChain.DIEs.size(); i != n; i++) { const DWARFDebugInfoEntryMinimal &FunctionDIE = InlinedChain.DIEs[i]; - std::string FileName = ""; - std::string FunctionName = ""; - uint32_t Line = 0; - uint32_t Column = 0; + DILineInfo Frame; // Get function name if necessary. - if (Specifier.needs(DILineInfoSpecifier::FunctionName)) { - if (const char *Name = FunctionDIE.getSubroutineName(InlinedChain.U)) - FunctionName = Name; - } - if (Specifier.needs(DILineInfoSpecifier::FileLineInfo)) { - const bool NeedsAbsoluteFilePath = - Specifier.needs(DILineInfoSpecifier::AbsoluteFilePath); + if (const char *Name = + FunctionDIE.getSubroutineName(InlinedChain.U, Spec.FNKind)) + Frame.FunctionName = Name; + if (Spec.FLIKind != FileLineInfoKind::None) { if (i == 0) { // For the topmost frame, initialize the line table of this // compile unit and fetch file/line info from it. LineTable = getLineTableForCompileUnit(CU); // For the topmost routine, get file/line info from line table. - getFileLineInfoForCompileUnit(CU, LineTable, Address, - NeedsAbsoluteFilePath, - FileName, Line, Column); + getFileLineInfoForCompileUnit(CU, LineTable, Address, Spec.FLIKind, + Frame); } else { // Otherwise, use call file, call line and call column from // previous DIE in inlined chain. - getFileNameForCompileUnit(CU, LineTable, CallFile, - NeedsAbsoluteFilePath, FileName); - Line = CallLine; - Column = CallColumn; + getFileNameForCompileUnit(CU, LineTable, CallFile, Spec.FLIKind, + Frame.FileName); + Frame.Line = CallLine; + Frame.Column = CallColumn; } // Get call file/line/column of a current DIE. 
if (i + 1 < n) { @@ -597,8 +600,6 @@ DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address, CallColumn); } } - DILineInfo Frame(StringRef(FileName), StringRef(FunctionName), - Line, Column); InliningInfo.addFrame(Frame); } return InliningInfo; @@ -637,14 +638,15 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) if (!zlib::isAvailable() || !consumeCompressedDebugSectionHeader(data, OriginalSize)) continue; - std::unique_ptr UncompressedSection; - if (zlib::uncompress(data, UncompressedSection, OriginalSize) != - zlib::StatusOK) + UncompressedSections.resize(UncompressedSections.size() + 1); + if (zlib::uncompress(data, UncompressedSections.back(), OriginalSize) != + zlib::StatusOK) { + UncompressedSections.pop_back(); continue; + } // Make data point to uncompressed section contents and save its contents. name = name.substr(1); - data = UncompressedSection->getBuffer(); - UncompressedSections.push_back(std::move(UncompressedSection)); + data = UncompressedSections.back(); } StringRef *SectionData = @@ -669,7 +671,7 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) .Case("debug_str_offsets.dwo", &StringOffsetDWOSection) .Case("debug_addr", &AddrSection) // Any more debug info sections go here. - .Default(0); + .Default(nullptr); if (SectionData) { *SectionData = data; if (name == "debug_ranges") { @@ -700,7 +702,7 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) .Case("debug_loc", &LocSection.Relocs) .Case("debug_info.dwo", &InfoDWOSection.Relocs) .Case("debug_line", &LineSection.Relocs) - .Default(0); + .Default(nullptr); if (!Map) { // Find debug_types relocs by section rather than name as there are // multiple, comdat grouped, debug_types sections. diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h index ad6841a..6d1ae92 100644 --- a/lib/DebugInfo/DWARFContext.h +++ b/lib/DebugInfo/DWARFContext.h @@ -242,7 +242,7 @@ class DWARFContextInMemory : public DWARFContext { StringRef RangeDWOSection; StringRef AddrSection; - SmallVector, 4> UncompressedSections; + SmallVector, 4> UncompressedSections; public: DWARFContextInMemory(object::ObjectFile *); diff --git a/lib/DebugInfo/DWARFDebugAbbrev.cpp b/lib/DebugInfo/DWARFDebugAbbrev.cpp index fd5f5e9..8426bf9 100644 --- a/lib/DebugInfo/DWARFDebugAbbrev.cpp +++ b/lib/DebugInfo/DWARFDebugAbbrev.cpp @@ -12,24 +12,36 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -bool DWARFAbbreviationDeclarationSet::extract(DataExtractor data, - uint32_t* offset_ptr) { - const uint32_t beginOffset = *offset_ptr; - Offset = beginOffset; +DWARFAbbreviationDeclarationSet::DWARFAbbreviationDeclarationSet() { clear(); - DWARFAbbreviationDeclaration abbrevDeclaration; - uint32_t prevAbbrAode = 0; - while (abbrevDeclaration.extract(data, offset_ptr)) { - Decls.push_back(abbrevDeclaration); - if (IdxOffset == 0) { - IdxOffset = abbrevDeclaration.getCode(); +} + +void DWARFAbbreviationDeclarationSet::clear() { + Offset = 0; + FirstAbbrCode = 0; + Decls.clear(); +} + +bool DWARFAbbreviationDeclarationSet::extract(DataExtractor Data, + uint32_t *OffsetPtr) { + clear(); + const uint32_t BeginOffset = *OffsetPtr; + Offset = BeginOffset; + DWARFAbbreviationDeclaration AbbrDecl; + uint32_t PrevAbbrCode = 0; + while (AbbrDecl.extract(Data, OffsetPtr)) { + Decls.push_back(AbbrDecl); + if (FirstAbbrCode == 0) { + FirstAbbrCode = AbbrDecl.getCode(); } else { - if (prevAbbrAode + 1 != abbrevDeclaration.getCode()) - IdxOffset = UINT32_MAX;// Out of order indexes, we 
can't do O(1) lookups + if (PrevAbbrCode + 1 != AbbrDecl.getCode()) { + // Codes are not consecutive, can't do O(1) lookups. + FirstAbbrCode = UINT32_MAX; + } } - prevAbbrAode = abbrevDeclaration.getCode(); + PrevAbbrCode = AbbrDecl.getCode(); } - return beginOffset != *offset_ptr; + return BeginOffset != *OffsetPtr; } void DWARFAbbreviationDeclarationSet::dump(raw_ostream &OS) const { @@ -37,67 +49,67 @@ void DWARFAbbreviationDeclarationSet::dump(raw_ostream &OS) const { Decl.dump(OS); } -const DWARFAbbreviationDeclaration* -DWARFAbbreviationDeclarationSet::getAbbreviationDeclaration(uint32_t abbrCode) - const { - if (IdxOffset == UINT32_MAX) { +const DWARFAbbreviationDeclaration * +DWARFAbbreviationDeclarationSet::getAbbreviationDeclaration( + uint32_t AbbrCode) const { + if (FirstAbbrCode == UINT32_MAX) { for (const auto &Decl : Decls) { - if (Decl.getCode() == abbrCode) + if (Decl.getCode() == AbbrCode) return &Decl; } - } else { - uint32_t idx = abbrCode - IdxOffset; - if (idx < Decls.size()) - return &Decls[idx]; + return nullptr; } - return NULL; + if (AbbrCode < FirstAbbrCode || AbbrCode >= FirstAbbrCode + Decls.size()) + return nullptr; + return &Decls[AbbrCode - FirstAbbrCode]; } -DWARFDebugAbbrev::DWARFDebugAbbrev() : - AbbrevCollMap(), - PrevAbbrOffsetPos(AbbrevCollMap.end()) {} - +DWARFDebugAbbrev::DWARFDebugAbbrev() { + clear(); +} -void DWARFDebugAbbrev::parse(DataExtractor data) { - uint32_t offset = 0; +void DWARFDebugAbbrev::clear() { + AbbrDeclSets.clear(); + PrevAbbrOffsetPos = AbbrDeclSets.end(); +} - while (data.isValidOffset(offset)) { - uint32_t initial_cu_offset = offset; - DWARFAbbreviationDeclarationSet abbrevDeclSet; +void DWARFDebugAbbrev::extract(DataExtractor Data) { + clear(); - if (abbrevDeclSet.extract(data, &offset)) - AbbrevCollMap[initial_cu_offset] = abbrevDeclSet; - else + uint32_t Offset = 0; + DWARFAbbreviationDeclarationSet AbbrDecls; + while (Data.isValidOffset(Offset)) { + uint32_t CUAbbrOffset = Offset; + if (!AbbrDecls.extract(Data, &Offset)) break; + AbbrDeclSets[CUAbbrOffset] = AbbrDecls; } - PrevAbbrOffsetPos = AbbrevCollMap.end(); } void DWARFDebugAbbrev::dump(raw_ostream &OS) const { - if (AbbrevCollMap.empty()) { + if (AbbrDeclSets.empty()) { OS << "< EMPTY >\n"; return; } - for (const auto &I : AbbrevCollMap) { + for (const auto &I : AbbrDeclSets) { OS << format("Abbrev table for offset: 0x%8.8" PRIx64 "\n", I.first); I.second.dump(OS); } } const DWARFAbbreviationDeclarationSet* -DWARFDebugAbbrev::getAbbreviationDeclarationSet(uint64_t cu_abbr_offset) const { - DWARFAbbreviationDeclarationCollMapConstIter end = AbbrevCollMap.end(); - DWARFAbbreviationDeclarationCollMapConstIter pos; - if (PrevAbbrOffsetPos != end && - PrevAbbrOffsetPos->first == cu_abbr_offset) { +DWARFDebugAbbrev::getAbbreviationDeclarationSet(uint64_t CUAbbrOffset) const { + const auto End = AbbrDeclSets.end(); + if (PrevAbbrOffsetPos != End && PrevAbbrOffsetPos->first == CUAbbrOffset) { return &(PrevAbbrOffsetPos->second); - } else { - pos = AbbrevCollMap.find(cu_abbr_offset); - PrevAbbrOffsetPos = pos; } - if (pos != AbbrevCollMap.end()) - return &(pos->second); - return NULL; + const auto Pos = AbbrDeclSets.find(CUAbbrOffset); + if (Pos != End) { + PrevAbbrOffsetPos = Pos; + return &(Pos->second); + } + + return nullptr; } diff --git a/lib/DebugInfo/DWARFDebugAbbrev.h b/lib/DebugInfo/DWARFDebugAbbrev.h index c7c0436..3a9adba 100644 --- a/lib/DebugInfo/DWARFDebugAbbrev.h +++ b/lib/DebugInfo/DWARFDebugAbbrev.h @@ -17,55 +17,45 @@ namespace llvm { -typedef std::vector 
- DWARFAbbreviationDeclarationColl; -typedef DWARFAbbreviationDeclarationColl::iterator - DWARFAbbreviationDeclarationCollIter; -typedef DWARFAbbreviationDeclarationColl::const_iterator - DWARFAbbreviationDeclarationCollConstIter; - class DWARFAbbreviationDeclarationSet { uint32_t Offset; - uint32_t IdxOffset; + /// Code of the first abbreviation, if all abbreviations in the set have + /// consecutive codes. UINT32_MAX otherwise. + uint32_t FirstAbbrCode; std::vector Decls; - public: - DWARFAbbreviationDeclarationSet() - : Offset(0), IdxOffset(0) {} - DWARFAbbreviationDeclarationSet(uint32_t offset, uint32_t idxOffset) - : Offset(offset), IdxOffset(idxOffset) {} +public: + DWARFAbbreviationDeclarationSet(); - void clear() { - IdxOffset = 0; - Decls.clear(); - } uint32_t getOffset() const { return Offset; } void dump(raw_ostream &OS) const; - bool extract(DataExtractor data, uint32_t* offset_ptr); + bool extract(DataExtractor Data, uint32_t *OffsetPtr); const DWARFAbbreviationDeclaration * - getAbbreviationDeclaration(uint32_t abbrCode) const; + getAbbreviationDeclaration(uint32_t AbbrCode) const; + +private: + void clear(); }; class DWARFDebugAbbrev { -public: typedef std::map - DWARFAbbreviationDeclarationCollMap; - typedef DWARFAbbreviationDeclarationCollMap::iterator - DWARFAbbreviationDeclarationCollMapIter; - typedef DWARFAbbreviationDeclarationCollMap::const_iterator - DWARFAbbreviationDeclarationCollMapConstIter; + DWARFAbbreviationDeclarationSetMap; -private: - DWARFAbbreviationDeclarationCollMap AbbrevCollMap; - mutable DWARFAbbreviationDeclarationCollMapConstIter PrevAbbrOffsetPos; + DWARFAbbreviationDeclarationSetMap AbbrDeclSets; + mutable DWARFAbbreviationDeclarationSetMap::const_iterator PrevAbbrOffsetPos; public: DWARFDebugAbbrev(); + const DWARFAbbreviationDeclarationSet * - getAbbreviationDeclarationSet(uint64_t cu_abbr_offset) const; + getAbbreviationDeclarationSet(uint64_t CUAbbrOffset) const; + void dump(raw_ostream &OS) const; - void parse(DataExtractor data); + void extract(DataExtractor Data); + +private: + void clear(); }; } diff --git a/lib/DebugInfo/DWARFDebugArangeSet.h b/lib/DebugInfo/DWARFDebugArangeSet.h index c18b3c5..d6c2d8b 100644 --- a/lib/DebugInfo/DWARFDebugArangeSet.h +++ b/lib/DebugInfo/DWARFDebugArangeSet.h @@ -63,7 +63,6 @@ public: return desc_iterator_range(ArangeDescriptors.begin(), ArangeDescriptors.end()); } - uint32_t getNumDescriptors() const { return ArangeDescriptors.size(); } }; } diff --git a/lib/DebugInfo/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARFDebugAranges.cpp index dfab788..2524adc 100644 --- a/lib/DebugInfo/DWARFDebugAranges.cpp +++ b/lib/DebugInfo/DWARFDebugAranges.cpp @@ -10,6 +10,7 @@ #include "DWARFDebugAranges.h" #include "DWARFCompileUnit.h" #include "DWARFContext.h" +#include "DWARFDebugArangeSet.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include @@ -20,23 +21,11 @@ void DWARFDebugAranges::extract(DataExtractor DebugArangesData) { if (!DebugArangesData.isValidOffset(0)) return; uint32_t Offset = 0; - typedef std::vector RangeSetColl; - RangeSetColl Sets; DWARFDebugArangeSet Set; - uint32_t TotalRanges = 0; while (Set.extract(DebugArangesData, &Offset)) { - Sets.push_back(Set); - TotalRanges += Set.getNumDescriptors(); - } - if (TotalRanges == 0) - return; - - Aranges.reserve(TotalRanges); - for (const auto &I : Sets) { - uint32_t CUOffset = I.getCompileUnitDIEOffset(); - - for (const auto &Desc : I.descriptors()) { + uint32_t CUOffset = Set.getCompileUnitDIEOffset(); + for (const auto &Desc : 
Set.descriptors()) { uint64_t LowPC = Desc.Address; uint64_t HighPC = Desc.getEndAddress(); appendRange(CUOffset, LowPC, HighPC); @@ -58,13 +47,23 @@ void DWARFDebugAranges::generate(DWARFContext *CTX) { // manually build aranges for the rest of them. for (const auto &CU : CTX->compile_units()) { uint32_t CUOffset = CU->getOffset(); - if (ParsedCUOffsets.insert(CUOffset).second) - CU->buildAddressRangeTable(this, true, CUOffset); + if (ParsedCUOffsets.insert(CUOffset).second) { + DWARFAddressRangesVector CURanges; + CU->collectAddressRanges(CURanges); + for (const auto &R : CURanges) { + appendRange(CUOffset, R.first, R.second); + } + } } sortAndMinimize(); } +void DWARFDebugAranges::clear() { + Aranges.clear(); + ParsedCUOffsets.clear(); +} + void DWARFDebugAranges::appendRange(uint32_t CUOffset, uint64_t LowPC, uint64_t HighPC) { if (!Aranges.empty()) { @@ -101,11 +100,6 @@ void DWARFDebugAranges::sortAndMinimize() { ++minimal_size; } - // If the sizes are the same, then no consecutive aranges can be - // combined, we are done. - if (minimal_size == orig_arange_size) - return; - // Else, make a new RangeColl that _only_ contains what we need. RangeColl minimal_aranges; minimal_aranges.resize(minimal_size); diff --git a/lib/DebugInfo/DWARFDebugAranges.h b/lib/DebugInfo/DWARFDebugAranges.h index 35ad8e5..de96d7f 100644 --- a/lib/DebugInfo/DWARFDebugAranges.h +++ b/lib/DebugInfo/DWARFDebugAranges.h @@ -10,9 +10,9 @@ #ifndef LLVM_DEBUGINFO_DWARFDEBUGARANGES_H #define LLVM_DEBUGINFO_DWARFDEBUGARANGES_H -#include "DWARFDebugArangeSet.h" #include "llvm/ADT/DenseSet.h" -#include +#include "llvm/Support/DataExtractor.h" +#include namespace llvm { @@ -20,20 +20,15 @@ class DWARFContext; class DWARFDebugAranges { public: - void clear() { - Aranges.clear(); - ParsedCUOffsets.clear(); - } - void generate(DWARFContext *CTX); - - // Use appendRange multiple times and then call sortAndMinimize. - void appendRange(uint32_t CUOffset, uint64_t LowPC, uint64_t HighPC); - uint32_t findAddress(uint64_t Address) const; private: + void clear(); void extract(DataExtractor DebugArangesData); + + // Use appendRange multiple times and then call sortAndMinimize. + void appendRange(uint32_t CUOffset, uint64_t LowPC, uint64_t HighPC); void sortAndMinimize(); struct Range { diff --git a/lib/DebugInfo/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARFDebugFrame.cpp index 5bf7b07..a33548e 100644 --- a/lib/DebugInfo/DWARFDebugFrame.cpp +++ b/lib/DebugInfo/DWARFDebugFrame.cpp @@ -26,8 +26,8 @@ using namespace dwarf; class llvm::FrameEntry { public: enum FrameKind {FK_CIE, FK_FDE}; - FrameEntry(FrameKind K, DataExtractor D, uint64_t Offset, uint64_t Length) - : Kind(K), Data(D), Offset(Offset), Length(Length) {} + FrameEntry(FrameKind K, uint64_t Offset, uint64_t Length) + : Kind(K), Offset(Offset), Length(Length) {} virtual ~FrameEntry() { } @@ -35,11 +35,12 @@ public: FrameKind getKind() const { return Kind; } virtual uint64_t getOffset() const { return Offset; } - /// \brief Parse and store a sequence of CFI instructions from our data - /// stream, starting at *Offset and ending at EndOffset. If everything + /// \brief Parse and store a sequence of CFI instructions from Data, + /// starting at *Offset and ending at EndOffset. If everything /// goes well, *Offset should be equal to EndOffset when this method /// returns. Otherwise, an error occurred. 
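The contract described in this comment is checked at the call site by comparing the cursor against EndOffset after parsing. A self-contained sketch of that pattern, with illustrative names and a trivial stand-in parser:

    #include <cstdint>
    #include <stdexcept>

    // Stand-in parser: consumes instructions, advancing *Offset.
    void parseInstructions(const uint8_t *Data, uint32_t *Offset,
                           uint32_t EndOffset) {
      while (*Offset < EndOffset)
        ++*Offset;  // a real parser decodes one CFI instruction here
    }

    void parseEntry(const uint8_t *Data, uint32_t Start, uint32_t End) {
      uint32_t Offset = Start;
      parseInstructions(Data, &Offset, End);
      if (Offset != End)  // parser stopped short: malformed entry
        throw std::runtime_error("failed to parse CFI instructions");
    }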
- virtual void parseInstructions(uint32_t *Offset, uint32_t EndOffset); + virtual void parseInstructions(DataExtractor Data, uint32_t *Offset, + uint32_t EndOffset); /// \brief Dump the entry header to the given output stream. virtual void dumpHeader(raw_ostream &OS) const = 0; @@ -50,10 +51,6 @@ public: protected: const FrameKind Kind; - /// \brief The data stream holding the section from which the entry was - /// parsed. - DataExtractor Data; - /// \brief Offset of this entry in the section. uint64_t Offset; @@ -97,8 +94,8 @@ protected: const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0; const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f; - -void FrameEntry::parseInstructions(uint32_t *Offset, uint32_t EndOffset) { +void FrameEntry::parseInstructions(DataExtractor Data, uint32_t *Offset, + uint32_t EndOffset) { while (*Offset < EndOffset) { uint8_t Opcode = Data.getU8(Offset); // Some instructions have a primary opcode encoded in the top bits. @@ -201,13 +198,13 @@ class CIE : public FrameEntry { public: // CIEs (and FDEs) are simply container classes, so the only sensible way to // create them is by providing the full parsed contents in the constructor. - CIE(DataExtractor D, uint64_t Offset, uint64_t Length, uint8_t Version, + CIE(uint64_t Offset, uint64_t Length, uint8_t Version, SmallString<8> Augmentation, uint64_t CodeAlignmentFactor, int64_t DataAlignmentFactor, uint64_t ReturnAddressRegister) - : FrameEntry(FK_CIE, D, Offset, Length), Version(Version), - Augmentation(Augmentation), CodeAlignmentFactor(CodeAlignmentFactor), - DataAlignmentFactor(DataAlignmentFactor), - ReturnAddressRegister(ReturnAddressRegister) {} + : FrameEntry(FK_CIE, Offset, Length), Version(Version), + Augmentation(Augmentation), CodeAlignmentFactor(CodeAlignmentFactor), + DataAlignmentFactor(DataAlignmentFactor), + ReturnAddressRegister(ReturnAddressRegister) {} ~CIE() { } @@ -229,7 +226,7 @@ public: static bool classof(const FrameEntry *FE) { return FE->getKind() == FK_CIE; - } + } private: /// The following fields are defined in section 6.4.1 of the DWARF standard v3 @@ -247,11 +244,11 @@ public: // Each FDE has a CIE it's "linked to". Our FDE contains is constructed with // an offset to the CIE (provided by parsing the FDE header). The CIE itself // is obtained lazily once it's actually required. 
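The lazy CIE link mentioned above can be sketched as an offset that is resolved on first use. The offset-to-CIE map and lookup below are hypothetical, not something this patch adds:

    #include <cstdint>
    #include <map>

    struct CIE { /* parsed CIE fields */ };

    std::map<uint64_t, CIE> CIEsByOffset;  // filled while parsing the section

    struct FDE {
      uint64_t LinkedCIEOffset;
      CIE *LinkedCIE = nullptr;
      CIE *getLinkedCIE() {
        if (!LinkedCIE) {  // resolve the offset only when first needed
          auto I = CIEsByOffset.find(LinkedCIEOffset);
          if (I != CIEsByOffset.end())
            LinkedCIE = &I->second;
        }
        return LinkedCIE;
      }
    };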
- FDE(DataExtractor D, uint64_t Offset, uint64_t Length, - int64_t LinkedCIEOffset, uint64_t InitialLocation, uint64_t AddressRange) - : FrameEntry(FK_FDE, D, Offset, Length), LinkedCIEOffset(LinkedCIEOffset), - InitialLocation(InitialLocation), AddressRange(AddressRange), - LinkedCIE(NULL) {} + FDE(uint64_t Offset, uint64_t Length, int64_t LinkedCIEOffset, + uint64_t InitialLocation, uint64_t AddressRange) + : FrameEntry(FK_FDE, Offset, Length), LinkedCIEOffset(LinkedCIEOffset), + InitialLocation(InitialLocation), AddressRange(AddressRange), + LinkedCIE(nullptr) {} ~FDE() { } @@ -270,9 +267,9 @@ public: static bool classof(const FrameEntry *FE) { return FE->getKind() == FK_FDE; - } -private: + } +private: /// The following fields are defined in section 6.4.1 of the DWARF standard v3 uint64_t LinkedCIEOffset; uint64_t InitialLocation; @@ -285,14 +282,9 @@ private: DWARFDebugFrame::DWARFDebugFrame() { } - DWARFDebugFrame::~DWARFDebugFrame() { - for (const auto &Entry : Entries) { - delete Entry; - } } - static void LLVM_ATTRIBUTE_UNUSED dumpDataAux(DataExtractor Data, uint32_t Offset, int Length) { errs() << "DUMP: "; @@ -334,7 +326,6 @@ void DWARFDebugFrame::parse(DataExtractor Data) { Id = Data.getUnsigned(&Offset, IsDWARF64 ? 8 : 4); bool IsCIE = ((IsDWARF64 && Id == DW64_CIE_ID) || Id == DW_CIE_ID); - FrameEntry *Entry = 0; if (IsCIE) { // Note: this is specifically DWARFv3 CIE header structure. It was // changed in DWARFv4. We currently don't support reading DWARFv4 @@ -346,30 +337,25 @@ void DWARFDebugFrame::parse(DataExtractor Data) { int64_t DataAlignmentFactor = Data.getSLEB128(&Offset); uint64_t ReturnAddressRegister = Data.getULEB128(&Offset); - Entry = new CIE(Data, StartOffset, Length, Version, - StringRef(Augmentation), CodeAlignmentFactor, - DataAlignmentFactor, ReturnAddressRegister); + Entries.emplace_back(new CIE(StartOffset, Length, Version, + StringRef(Augmentation), CodeAlignmentFactor, + DataAlignmentFactor, ReturnAddressRegister)); } else { // FDE uint64_t CIEPointer = Id; uint64_t InitialLocation = Data.getAddress(&Offset); uint64_t AddressRange = Data.getAddress(&Offset); - Entry = new FDE(Data, StartOffset, Length, CIEPointer, - InitialLocation, AddressRange); + Entries.emplace_back(new FDE(StartOffset, Length, CIEPointer, + InitialLocation, AddressRange)); } - assert(Entry && "Expected Entry to be populated with CIE or FDE"); - Entry->parseInstructions(&Offset, EndStructureOffset); + Entries.back()->parseInstructions(Data, &Offset, EndStructureOffset); - if (Offset == EndStructureOffset) { - // Entry instrucitons parsed successfully. 
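The move from raw FrameEntry pointers to owning pointers, completed in the hunks below, follows the standard unique_ptr idiom: the container owns the entries, so the explicit delete loop in the destructor can go away. A self-contained sketch:

    #include <memory>
    #include <vector>

    struct Entry {
      virtual ~Entry() {}  // virtual dtor so unique_ptr deletes the right type
    };
    struct Cie : Entry {};
    struct Fde : Entry {};

    int main() {
      std::vector<std::unique_ptr<Entry>> Entries;
      Entries.emplace_back(new Cie());  // vector takes ownership
      Entries.emplace_back(new Fde());
      return 0;
    }  // Entries is destroyed here; every element is deleted automatically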
-      Entries.push_back(Entry);
-    } else {
+    if (Offset != EndStructureOffset) {
       std::string Str;
       raw_string_ostream OS(Str);
-      OS << format("Parsing entry instructions at %lx failed",
-                   Entry->getOffset());
+      OS << format("Parsing entry instructions at %lx failed", StartOffset);
       report_fatal_error(Str);
     }
   }
diff --git a/lib/DebugInfo/DWARFDebugFrame.h b/lib/DebugInfo/DWARFDebugFrame.h
index 7683849..bd4ef45 100644
--- a/lib/DebugInfo/DWARFDebugFrame.h
+++ b/lib/DebugInfo/DWARFDebugFrame.h
@@ -12,14 +12,13 @@
 #include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/raw_ostream.h"
+#include <memory>
 #include <vector>
-
 namespace llvm {

 class FrameEntry;
-
 /// \brief A parsed .debug_frame section
 ///
 class DWARFDebugFrame {
@@ -35,8 +34,7 @@ public:
   void parse(DataExtractor Data);

 private:
-  typedef std::vector<FrameEntry *> EntryVector;
-  EntryVector Entries;
+  std::vector<std::unique_ptr<FrameEntry>> Entries;
 };

diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
index bde25ec..b811ed7 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.cpp
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 using namespace dwarf;
+typedef DILineInfoSpecifier::FunctionNameKind FunctionNameKind;

 void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS, const DWARFUnit *u,
                                       unsigned recurseDepth,
@@ -99,11 +100,11 @@ bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFUnit *U,
   uint64_t AbbrCode = DebugInfoData.getULEB128(OffsetPtr);
   if (0 == AbbrCode) {
     // NULL debug tag entry.
-    AbbrevDecl = NULL;
+    AbbrevDecl = nullptr;
     return true;
   }
   AbbrevDecl = U->getAbbreviations()->getAbbreviationDeclaration(AbbrCode);
-  if (0 == AbbrevDecl) {
+  if (nullptr == AbbrevDecl) {
     // Restore the original offset.
     *OffsetPtr = Offset;
     return false;
@@ -226,54 +227,66 @@ bool DWARFDebugInfoEntryMinimal::getLowAndHighPC(const DWARFUnit *U,
   return (HighPC != -1ULL);
 }

-void DWARFDebugInfoEntryMinimal::buildAddressRangeTable(
-    const DWARFUnit *U, DWARFDebugAranges *DebugAranges,
-    uint32_t UOffsetInAranges) const {
-  if (AbbrevDecl) {
-    if (isSubprogramDIE()) {
-      uint64_t LowPC, HighPC;
-      if (getLowAndHighPC(U, LowPC, HighPC))
-        DebugAranges->appendRange(UOffsetInAranges, LowPC, HighPC);
-      // FIXME: try to append ranges from .debug_ranges section.
-    }
-
-    const DWARFDebugInfoEntryMinimal *Child = getFirstChild();
-    while (Child) {
-      Child->buildAddressRangeTable(U, DebugAranges, UOffsetInAranges);
-      Child = Child->getSibling();
-    }
-  }
-}
-
-bool DWARFDebugInfoEntryMinimal::addressRangeContainsAddress(
-    const DWARFUnit *U, const uint64_t Address) const {
+DWARFAddressRangesVector
+DWARFDebugInfoEntryMinimal::getAddressRanges(const DWARFUnit *U) const {
   if (isNULL())
-    return false;
+    return DWARFAddressRangesVector();
+  // Single range specified by low/high PC.
   uint64_t LowPC, HighPC;
-  if (getLowAndHighPC(U, LowPC, HighPC))
-    return (LowPC <= Address && Address <= HighPC);
-  // Try to get address ranges from .debug_ranges section.
+  if (getLowAndHighPC(U, LowPC, HighPC)) {
+    return DWARFAddressRangesVector(1, std::make_pair(LowPC, HighPC));
+  }
+  // Multiple ranges from .debug_ranges section.
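The new getAddressRanges normalizes both encodings, the single low/high PC pair and the DW_AT_ranges list, to absolute half-open (start, end) pairs, which makes containment checks uniform. A sketch matching the addressRangeContainsAddress loop that follows in this hunk:

    #include <cstdint>
    #include <utility>
    #include <vector>

    typedef std::vector<std::pair<uint64_t, uint64_t>> Ranges;

    bool containsAddress(const Ranges &Rs, uint64_t Addr) {
      for (const auto &R : Rs)
        if (R.first <= Addr && Addr < R.second)  // half-open interval
          return true;
      return false;
    }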
uint32_t RangesOffset = getAttributeValueAsSectionOffset(U, DW_AT_ranges, -1U); if (RangesOffset != -1U) { DWARFDebugRangeList RangeList; if (U->extractRangeList(RangesOffset, RangeList)) - return RangeList.containsAddress(U->getBaseAddress(), Address); + return RangeList.getAbsoluteRanges(U->getBaseAddress()); + } + return DWARFAddressRangesVector(); +} + +void DWARFDebugInfoEntryMinimal::collectChildrenAddressRanges( + const DWARFUnit *U, DWARFAddressRangesVector& Ranges) const { + if (isNULL()) + return; + if (isSubprogramDIE()) { + const auto &DIERanges = getAddressRanges(U); + Ranges.insert(Ranges.end(), DIERanges.begin(), DIERanges.end()); + } + + const DWARFDebugInfoEntryMinimal *Child = getFirstChild(); + while (Child) { + Child->collectChildrenAddressRanges(U, Ranges); + Child = Child->getSibling(); + } +} + +bool DWARFDebugInfoEntryMinimal::addressRangeContainsAddress( + const DWARFUnit *U, const uint64_t Address) const { + for (const auto& R : getAddressRanges(U)) { + if (R.first <= Address && Address < R.second) + return true; } return false; } const char * -DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFUnit *U) const { - if (!isSubroutineDIE()) - return 0; - // Try to get mangled name if possible. - if (const char *name = - getAttributeValueAsString(U, DW_AT_MIPS_linkage_name, 0)) - return name; - if (const char *name = getAttributeValueAsString(U, DW_AT_linkage_name, 0)) - return name; - if (const char *name = getAttributeValueAsString(U, DW_AT_name, 0)) +DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFUnit *U, + FunctionNameKind Kind) const { + if (!isSubroutineDIE() || Kind == FunctionNameKind::None) + return nullptr; + // Try to get mangled name only if it was asked for. + if (Kind == FunctionNameKind::LinkageName) { + if (const char *name = + getAttributeValueAsString(U, DW_AT_MIPS_linkage_name, nullptr)) + return name; + if (const char *name = + getAttributeValueAsString(U, DW_AT_linkage_name, nullptr)) + return name; + } + if (const char *name = getAttributeValueAsString(U, DW_AT_name, nullptr)) return name; // Try to get name from specification DIE. uint32_t spec_ref = @@ -281,7 +294,7 @@ DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFUnit *U) const { if (spec_ref != -1U) { DWARFDebugInfoEntryMinimal spec_die; if (spec_die.extractFast(U, &spec_ref)) { - if (const char *name = spec_die.getSubroutineName(U)) + if (const char *name = spec_die.getSubroutineName(U, Kind)) return name; } } @@ -291,11 +304,11 @@ DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFUnit *U) const { if (abs_origin_ref != -1U) { DWARFDebugInfoEntryMinimal abs_origin_die; if (abs_origin_die.extractFast(U, &abs_origin_ref)) { - if (const char *name = abs_origin_die.getSubroutineName(U)) + if (const char *name = abs_origin_die.getSubroutineName(U, Kind)) return name; } } - return 0; + return nullptr; } void DWARFDebugInfoEntryMinimal::getCallerFrame(const DWARFUnit *U, diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.h b/lib/DebugInfo/DWARFDebugInfoEntry.h index f30e531..916e1ed 100644 --- a/lib/DebugInfo/DWARFDebugInfoEntry.h +++ b/lib/DebugInfo/DWARFDebugInfoEntry.h @@ -11,7 +11,9 @@ #define LLVM_DEBUGINFO_DWARFDEBUGINFOENTRY_H #include "DWARFAbbreviationDeclaration.h" +#include "DWARFDebugRangeList.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/DebugInfo/DIContext.h" #include "llvm/Support/DataTypes.h" namespace llvm { @@ -28,17 +30,13 @@ class DWARFDebugInfoEntryMinimal { /// Offset within the .debug_info of the start of this entry. 
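The header changes that follow lean on the flat, depth-first DIE vector: a child is always the next element, and a sibling is a forward offset from "this". A stand-alone sketch of that layout (Node is a stand-in for DWARFDebugInfoEntryMinimal):

    #include <cstdint>

    struct Node {
      uint32_t SiblingIdx = 0;  // 0 means "no sibling"
      bool HasChildren = false;
      const Node *sibling() const {
        return SiblingIdx > 0 ? this + SiblingIdx : nullptr;
      }
      const Node *firstChild() const {
        return HasChildren ? this + 1 : nullptr;  // children follow directly
      }
    };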
uint32_t Offset; - /// How many to subtract from "this" to get the parent. - /// If zero this die has no parent. - uint32_t ParentIdx; - /// How many to add to "this" to get the sibling. uint32_t SiblingIdx; const DWARFAbbreviationDeclaration *AbbrevDecl; public: DWARFDebugInfoEntryMinimal() - : Offset(0), ParentIdx(0), SiblingIdx(0), AbbrevDecl(0) {} + : Offset(0), SiblingIdx(0), AbbrevDecl(nullptr) {} void dump(raw_ostream &OS, const DWARFUnit *u, unsigned recurseDepth, unsigned indent = 0) const; @@ -51,7 +49,7 @@ public: bool extractFast(const DWARFUnit *U, uint32_t *OffsetPtr); uint32_t getTag() const { return AbbrevDecl ? AbbrevDecl->getTag() : 0; } - bool isNULL() const { return AbbrevDecl == 0; } + bool isNULL() const { return AbbrevDecl == nullptr; } /// Returns true if DIE represents a subprogram (not inlined). bool isSubprogramDIE() const; @@ -63,45 +61,23 @@ public: bool hasChildren() const { return !isNULL() && AbbrevDecl->hasChildren(); } // We know we are kept in a vector of contiguous entries, so we know - // our parent will be some index behind "this". - DWARFDebugInfoEntryMinimal *getParent() { - return ParentIdx > 0 ? this - ParentIdx : 0; - } - const DWARFDebugInfoEntryMinimal *getParent() const { - return ParentIdx > 0 ? this - ParentIdx : 0; - } - // We know we are kept in a vector of contiguous entries, so we know // our sibling will be some index after "this". - DWARFDebugInfoEntryMinimal *getSibling() { - return SiblingIdx > 0 ? this + SiblingIdx : 0; - } const DWARFDebugInfoEntryMinimal *getSibling() const { - return SiblingIdx > 0 ? this + SiblingIdx : 0; + return SiblingIdx > 0 ? this + SiblingIdx : nullptr; } + // We know we are kept in a vector of contiguous entries, so we know // we don't need to store our child pointer, if we have a child it will // be the next entry in the list... - DWARFDebugInfoEntryMinimal *getFirstChild() { - return hasChildren() ? this + 1 : 0; - } const DWARFDebugInfoEntryMinimal *getFirstChild() const { - return hasChildren() ? this + 1 : 0; + return hasChildren() ? this + 1 : nullptr; } - void setParent(DWARFDebugInfoEntryMinimal *parent) { - if (parent) { - // We know we are kept in a vector of contiguous entries, so we know - // our parent will be some index behind "this". - ParentIdx = this - parent; - } else - ParentIdx = 0; - } - void setSibling(DWARFDebugInfoEntryMinimal *sibling) { - if (sibling) { + void setSibling(const DWARFDebugInfoEntryMinimal *Sibling) { + if (Sibling) { // We know we are kept in a vector of contiguous entries, so we know // our sibling will be some index after "this". - SiblingIdx = sibling - this; - sibling->setParent(getParent()); + SiblingIdx = Sibling - this; } else SiblingIdx = 0; } @@ -135,9 +111,10 @@ public: bool getLowAndHighPC(const DWARFUnit *U, uint64_t &LowPC, uint64_t &HighPC) const; - void buildAddressRangeTable(const DWARFUnit *U, - DWARFDebugAranges *DebugAranges, - uint32_t CUOffsetInAranges) const; + DWARFAddressRangesVector getAddressRanges(const DWARFUnit *U) const; + + void collectChildrenAddressRanges(const DWARFUnit *U, + DWARFAddressRangesVector &Ranges) const; bool addressRangeContainsAddress(const DWARFUnit *U, const uint64_t Address) const; @@ -146,7 +123,9 @@ public: /// returns its mangled name (or short name, if mangled is missing). /// This name may be fetched from specification or abstract origin /// for this subprogram. Returns null if no name is found. 
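The lookup order getSubroutineName implements above is: linkage name (only when requested), then DW_AT_name, then recursion through the specification and abstract-origin DIEs. A heavily simplified, self-contained sketch (Die and the attribute constants are stand-ins, not the real types):

    #include <map>
    #include <string>

    enum { DW_AT_name = 1, DW_AT_linkage_name = 2 };  // stand-in values

    struct Die {
      std::map<int, std::string> Attrs;
      const Die *Spec = nullptr, *Origin = nullptr;
    };

    const char *subroutineName(const Die &D, bool WantLinkage) {
      auto Get = [&](int A) -> const char * {
        auto I = D.Attrs.find(A);
        return I == D.Attrs.end() ? nullptr : I->second.c_str();
      };
      if (WantLinkage)
        if (const char *N = Get(DW_AT_linkage_name)) return N;
      if (const char *N = Get(DW_AT_name)) return N;
      if (D.Spec)   return subroutineName(*D.Spec, WantLinkage);
      if (D.Origin) return subroutineName(*D.Origin, WantLinkage);
      return nullptr;
    }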
- const char *getSubroutineName(const DWARFUnit *U) const; + const char * + getSubroutineName(const DWARFUnit *U, + DILineInfoSpecifier::FunctionNameKind Kind) const; /// Retrieves values of DW_AT_call_file, DW_AT_call_line and /// DW_AT_call_column from DIE (or zeroes if they are missing). @@ -166,7 +145,7 @@ public: /// (except the last DIE) in this chain is contained in address /// range for next DIE in the chain. struct DWARFDebugInfoEntryInlinedChain { - DWARFDebugInfoEntryInlinedChain() : U(0) {} + DWARFDebugInfoEntryInlinedChain() : U(nullptr) {} SmallVector DIEs; const DWARFUnit *U; }; diff --git a/lib/DebugInfo/DWARFDebugLine.cpp b/lib/DebugInfo/DWARFDebugLine.cpp index 43d9764..ce87635 100644 --- a/lib/DebugInfo/DWARFDebugLine.cpp +++ b/lib/DebugInfo/DWARFDebugLine.cpp @@ -15,6 +15,20 @@ #include using namespace llvm; using namespace dwarf; +typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind; + +DWARFDebugLine::Prologue::Prologue() { + clear(); +} + +void DWARFDebugLine::Prologue::clear() { + TotalLength = Version = PrologueLength = 0; + MinInstLength = MaxOpsPerInst = DefaultIsStmt = LineBase = LineRange = 0; + OpcodeBase = 0; + StandardOpcodeLengths.clear(); + IncludeDirectories.clear(); + FileNames.clear(); +} void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const { OS << "Line table prologue:\n" @@ -51,6 +65,67 @@ void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const { } } +bool DWARFDebugLine::Prologue::parse(DataExtractor debug_line_data, + uint32_t *offset_ptr) { + const uint32_t prologue_offset = *offset_ptr; + + clear(); + TotalLength = debug_line_data.getU32(offset_ptr); + Version = debug_line_data.getU16(offset_ptr); + if (Version < 2) + return false; + + PrologueLength = debug_line_data.getU32(offset_ptr); + const uint32_t end_prologue_offset = PrologueLength + *offset_ptr; + MinInstLength = debug_line_data.getU8(offset_ptr); + if (Version >= 4) + MaxOpsPerInst = debug_line_data.getU8(offset_ptr); + DefaultIsStmt = debug_line_data.getU8(offset_ptr); + LineBase = debug_line_data.getU8(offset_ptr); + LineRange = debug_line_data.getU8(offset_ptr); + OpcodeBase = debug_line_data.getU8(offset_ptr); + + StandardOpcodeLengths.reserve(OpcodeBase - 1); + for (uint32_t i = 1; i < OpcodeBase; ++i) { + uint8_t op_len = debug_line_data.getU8(offset_ptr); + StandardOpcodeLengths.push_back(op_len); + } + + while (*offset_ptr < end_prologue_offset) { + const char *s = debug_line_data.getCStr(offset_ptr); + if (s && s[0]) + IncludeDirectories.push_back(s); + else + break; + } + + while (*offset_ptr < end_prologue_offset) { + const char *name = debug_line_data.getCStr(offset_ptr); + if (name && name[0]) { + FileNameEntry fileEntry; + fileEntry.Name = name; + fileEntry.DirIdx = debug_line_data.getULEB128(offset_ptr); + fileEntry.ModTime = debug_line_data.getULEB128(offset_ptr); + fileEntry.Length = debug_line_data.getULEB128(offset_ptr); + FileNames.push_back(fileEntry); + } else { + break; + } + } + + if (*offset_ptr != end_prologue_offset) { + fprintf(stderr, "warning: parsing line table prologue at 0x%8.8x should" + " have ended at 0x%8.8x but it ended at 0x%8.8x\n", + prologue_offset, end_prologue_offset, *offset_ptr); + return false; + } + return true; +} + +DWARFDebugLine::Row::Row(bool default_is_stmt) { + reset(default_is_stmt); +} + void DWARFDebugLine::Row::postAppend() { BasicBlock = false; PrologueEnd = false; @@ -82,6 +157,22 @@ void DWARFDebugLine::Row::dump(raw_ostream &OS) const { << '\n'; } +DWARFDebugLine::Sequence::Sequence() { + reset(); +} + 
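Both table loops in Prologue::parse above read NUL-terminated strings until an empty string ends the table. The same pattern over a raw byte buffer, for illustration (this sketch assumes every string is properly terminated inside the buffer):

    #include <cstddef>
    #include <string>
    #include <vector>

    std::vector<std::string> readStringTable(const char *Buf, size_t Size,
                                             size_t &Off) {
      std::vector<std::string> Out;
      while (Off < Size && Buf[Off] != '\0') {
        std::string S(Buf + Off);  // copies up to the next NUL
        Off += S.size() + 1;       // skip the terminator too
        Out.push_back(S);
      }
      if (Off < Size)
        ++Off;  // consume the empty string that ends the table
      return Out;
    }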
+void DWARFDebugLine::Sequence::reset() { + LowPC = 0; + HighPC = 0; + FirstRowIndex = 0; + LastRowIndex = 0; + Empty = true; +} + +DWARFDebugLine::LineTable::LineTable() { + clear(); +} + void DWARFDebugLine::LineTable::dump(raw_ostream &OS) const { Prologue.dump(OS); OS << '\n'; @@ -96,50 +187,40 @@ void DWARFDebugLine::LineTable::dump(raw_ostream &OS) const { } } -DWARFDebugLine::State::~State() {} - -void DWARFDebugLine::State::appendRowToMatrix(uint32_t offset) { - if (Sequence::Empty) { - // Record the beginning of instruction sequence. - Sequence::Empty = false; - Sequence::LowPC = Address; - Sequence::FirstRowIndex = row; - } - ++row; // Increase the row number. - LineTable::appendRow(*this); - if (EndSequence) { - // Record the end of instruction sequence. - Sequence::HighPC = Address; - Sequence::LastRowIndex = row; - if (Sequence::isValid()) - LineTable::appendSequence(*this); - Sequence::reset(); - } - Row::postAppend(); +void DWARFDebugLine::LineTable::clear() { + Prologue.clear(); + Rows.clear(); + Sequences.clear(); } -void DWARFDebugLine::State::finalize() { - row = DoneParsingLineTable; - if (!Sequence::Empty) { - fprintf(stderr, "warning: last sequence in debug line table is not" - "terminated!\n"); - } - // Sort all sequences so that address lookup will work faster. - if (!Sequences.empty()) { - std::sort(Sequences.begin(), Sequences.end(), Sequence::orderByLowPC); - // Note: actually, instruction address ranges of sequences should not - // overlap (in shared objects and executables). If they do, the address - // lookup would still work, though, but result would be ambiguous. - // We don't report warning in this case. For example, - // sometimes .so compiled from multiple object files contains a few - // rudimentary sequences for address ranges [0x0, 0xsomething). - } +DWARFDebugLine::ParsingState::ParsingState(struct LineTable *LT) + : LineTable(LT), RowNumber(0) { + resetRowAndSequence(); } -DWARFDebugLine::DumpingState::~DumpingState() {} +void DWARFDebugLine::ParsingState::resetRowAndSequence() { + Row.reset(LineTable->Prologue.DefaultIsStmt); + Sequence.reset(); +} -void DWARFDebugLine::DumpingState::finalize() { - LineTable::dump(OS); +void DWARFDebugLine::ParsingState::appendRowToMatrix(uint32_t offset) { + if (Sequence.Empty) { + // Record the beginning of instruction sequence. + Sequence.Empty = false; + Sequence.LowPC = Row.Address; + Sequence.FirstRowIndex = RowNumber; + } + ++RowNumber; + LineTable->appendRow(Row); + if (Row.EndSequence) { + // Record the end of instruction sequence. + Sequence.HighPC = Row.Address; + Sequence.LastRowIndex = RowNumber; + if (Sequence.isValid()) + LineTable->appendSequence(Sequence); + Sequence.reset(); + } + Row.postAppend(); } const DWARFDebugLine::LineTable * @@ -147,7 +228,7 @@ DWARFDebugLine::getLineTable(uint32_t offset) const { LineTableConstIter pos = LineTableMap.find(offset); if (pos != LineTableMap.end()) return &pos->second; - return 0; + return nullptr; } const DWARFDebugLine::LineTable * @@ -155,91 +236,31 @@ DWARFDebugLine::getOrParseLineTable(DataExtractor debug_line_data, uint32_t offset) { std::pair pos = LineTableMap.insert(LineTableMapTy::value_type(offset, LineTable())); + LineTable *LT = &pos.first->second; if (pos.second) { - // Parse and cache the line table for at this offset. 
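The replacement getOrParseLineTable below uses map::insert to combine lookup and insertion, and only parses when the entry is actually new. The idiom in isolation, with a stand-in Table type:

    #include <cstdint>
    #include <map>
    #include <utility>

    struct Table {
      bool parse() { return true; }  // stand-in for LineTable::parse
    };

    std::map<uint32_t, Table> Cache;

    Table *getOrParse(uint32_t Offset) {
      auto Pos = Cache.insert(std::make_pair(Offset, Table()));
      Table *T = &Pos.first->second;
      if (Pos.second && !T->parse())  // inserted just now: parse exactly once
        return nullptr;
      return T;
    }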
- State state; - if (!parseStatementTable(debug_line_data, RelocMap, &offset, state)) - return 0; - pos.first->second = state; + if (!LT->parse(debug_line_data, RelocMap, &offset)) + return nullptr; } - return &pos.first->second; + return LT; } -bool -DWARFDebugLine::parsePrologue(DataExtractor debug_line_data, - uint32_t *offset_ptr, Prologue *prologue) { - const uint32_t prologue_offset = *offset_ptr; - - prologue->clear(); - prologue->TotalLength = debug_line_data.getU32(offset_ptr); - prologue->Version = debug_line_data.getU16(offset_ptr); - if (prologue->Version < 2) - return false; - - prologue->PrologueLength = debug_line_data.getU32(offset_ptr); - const uint32_t end_prologue_offset = prologue->PrologueLength + *offset_ptr; - prologue->MinInstLength = debug_line_data.getU8(offset_ptr); - if (prologue->Version >= 4) - prologue->MaxOpsPerInst = debug_line_data.getU8(offset_ptr); - prologue->DefaultIsStmt = debug_line_data.getU8(offset_ptr); - prologue->LineBase = debug_line_data.getU8(offset_ptr); - prologue->LineRange = debug_line_data.getU8(offset_ptr); - prologue->OpcodeBase = debug_line_data.getU8(offset_ptr); - - prologue->StandardOpcodeLengths.reserve(prologue->OpcodeBase-1); - for (uint32_t i = 1; i < prologue->OpcodeBase; ++i) { - uint8_t op_len = debug_line_data.getU8(offset_ptr); - prologue->StandardOpcodeLengths.push_back(op_len); - } - - while (*offset_ptr < end_prologue_offset) { - const char *s = debug_line_data.getCStr(offset_ptr); - if (s && s[0]) - prologue->IncludeDirectories.push_back(s); - else - break; - } - - while (*offset_ptr < end_prologue_offset) { - const char *name = debug_line_data.getCStr(offset_ptr); - if (name && name[0]) { - FileNameEntry fileEntry; - fileEntry.Name = name; - fileEntry.DirIdx = debug_line_data.getULEB128(offset_ptr); - fileEntry.ModTime = debug_line_data.getULEB128(offset_ptr); - fileEntry.Length = debug_line_data.getULEB128(offset_ptr); - prologue->FileNames.push_back(fileEntry); - } else { - break; - } - } - - if (*offset_ptr != end_prologue_offset) { - fprintf(stderr, "warning: parsing line table prologue at 0x%8.8x should" - " have ended at 0x%8.8x but it ended at 0x%8.8x\n", - prologue_offset, end_prologue_offset, *offset_ptr); - return false; - } - return true; -} - -bool DWARFDebugLine::parseStatementTable(DataExtractor debug_line_data, - const RelocAddrMap *RMap, - uint32_t *offset_ptr, State &state) { +bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data, + const RelocAddrMap *RMap, + uint32_t *offset_ptr) { const uint32_t debug_line_offset = *offset_ptr; - Prologue *prologue = &state.Prologue; + clear(); - if (!parsePrologue(debug_line_data, offset_ptr, prologue)) { + if (!Prologue.parse(debug_line_data, offset_ptr)) { // Restore our offset and return false to indicate failure! *offset_ptr = debug_line_offset; return false; } - const uint32_t end_offset = debug_line_offset + prologue->TotalLength + - sizeof(prologue->TotalLength); + const uint32_t end_offset = debug_line_offset + Prologue.TotalLength + + sizeof(Prologue.TotalLength); - state.reset(); + ParsingState State(this); while (*offset_ptr < end_offset) { uint8_t opcode = debug_line_data.getU8(offset_ptr); @@ -261,9 +282,9 @@ bool DWARFDebugLine::parseStatementTable(DataExtractor debug_line_data, // with a DW_LNE_end_sequence instruction which creates a row whose // address is that of the byte after the last target machine instruction // of the sequence. 
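The end_sequence handling described here pairs with the sequence bookkeeping in appendRowToMatrix: the first row opens a sequence at its address, and the end_sequence row records HighPC one past the last instruction. The bookkeeping in isolation, with simplified types:

    #include <cstdint>
    #include <vector>

    struct Seq { uint64_t LowPC = 0, HighPC = 0; bool Open = false; };

    void appendRow(std::vector<uint64_t> &RowAddrs, std::vector<Seq> &Seqs,
                   Seq &Cur, uint64_t Addr, bool EndSequence) {
      if (!Cur.Open) {    // first row of a new sequence
        Cur.Open = true;
        Cur.LowPC = Addr;
      }
      RowAddrs.push_back(Addr);
      if (EndSequence) {  // Addr is one past the last instruction
        Cur.HighPC = Addr;
        Seqs.push_back(Cur);
        Cur = Seq();
      }
    }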
- state.EndSequence = true; - state.appendRowToMatrix(*offset_ptr); - state.reset(); + State.Row.EndSequence = true; + State.appendRowToMatrix(*offset_ptr); + State.resetRowAndSequence(); break; case DW_LNE_set_address: @@ -278,9 +299,10 @@ bool DWARFDebugLine::parseStatementTable(DataExtractor debug_line_data, RelocAddrMap::const_iterator AI = RMap->find(*offset_ptr); if (AI != RMap->end()) { const std::pair &R = AI->second; - state.Address = debug_line_data.getAddress(offset_ptr) + R.second; + State.Row.Address = + debug_line_data.getAddress(offset_ptr) + R.second; } else - state.Address = debug_line_data.getAddress(offset_ptr); + State.Row.Address = debug_line_data.getAddress(offset_ptr); } break; @@ -311,12 +333,12 @@ bool DWARFDebugLine::parseStatementTable(DataExtractor debug_line_data, fileEntry.DirIdx = debug_line_data.getULEB128(offset_ptr); fileEntry.ModTime = debug_line_data.getULEB128(offset_ptr); fileEntry.Length = debug_line_data.getULEB128(offset_ptr); - prologue->FileNames.push_back(fileEntry); + Prologue.FileNames.push_back(fileEntry); } break; case DW_LNE_set_discriminator: - state.Discriminator = debug_line_data.getULEB128(offset_ptr); + State.Row.Discriminator = debug_line_data.getULEB128(offset_ptr); break; default: @@ -325,52 +347,52 @@ bool DWARFDebugLine::parseStatementTable(DataExtractor debug_line_data, (*offset_ptr) += arg_size; break; } - } else if (opcode < prologue->OpcodeBase) { + } else if (opcode < Prologue.OpcodeBase) { switch (opcode) { // Standard Opcodes case DW_LNS_copy: // Takes no arguments. Append a row to the matrix using the // current values of the state-machine registers. Then set // the basic_block register to false. - state.appendRowToMatrix(*offset_ptr); + State.appendRowToMatrix(*offset_ptr); break; case DW_LNS_advance_pc: // Takes a single unsigned LEB128 operand, multiplies it by the // min_inst_length field of the prologue, and adds the // result to the address register of the state machine. - state.Address += debug_line_data.getULEB128(offset_ptr) * - prologue->MinInstLength; + State.Row.Address += + debug_line_data.getULEB128(offset_ptr) * Prologue.MinInstLength; break; case DW_LNS_advance_line: // Takes a single signed LEB128 operand and adds that value to // the line register of the state machine. - state.Line += debug_line_data.getSLEB128(offset_ptr); + State.Row.Line += debug_line_data.getSLEB128(offset_ptr); break; case DW_LNS_set_file: // Takes a single unsigned LEB128 operand and stores it in the file // register of the state machine. - state.File = debug_line_data.getULEB128(offset_ptr); + State.Row.File = debug_line_data.getULEB128(offset_ptr); break; case DW_LNS_set_column: // Takes a single unsigned LEB128 operand and stores it in the // column register of the state machine. - state.Column = debug_line_data.getULEB128(offset_ptr); + State.Row.Column = debug_line_data.getULEB128(offset_ptr); break; case DW_LNS_negate_stmt: // Takes no arguments. Set the is_stmt register of the state // machine to the logical negation of its current value. - state.IsStmt = !state.IsStmt; + State.Row.IsStmt = !State.Row.IsStmt; break; case DW_LNS_set_basic_block: // Takes no arguments. 
Set the basic_block register of the
        // state machine to true
-        state.BasicBlock = true;
+        State.Row.BasicBlock = true;
         break;

       case DW_LNS_const_add_pc:
@@ -386,10 +408,10 @@ bool DWARFDebugLine::parseStatementTable(DataExtractor debug_line_data,
         // than twice that range will it need to use both DW_LNS_advance_pc
         // and a special opcode, requiring three or more bytes.
         {
-          uint8_t adjust_opcode = 255 - prologue->OpcodeBase;
-          uint64_t addr_offset = (adjust_opcode / prologue->LineRange) *
-                                 prologue->MinInstLength;
-          state.Address += addr_offset;
+          uint8_t adjust_opcode = 255 - Prologue.OpcodeBase;
+          uint64_t addr_offset =
+              (adjust_opcode / Prologue.LineRange) * Prologue.MinInstLength;
+          State.Row.Address += addr_offset;
         }
         break;

@@ -403,25 +425,25 @@ bool DWARFDebugLine::parseStatementTable(DataExtractor debug_line_data,
         // judge when the computation of a special opcode overflows and
         // requires the use of DW_LNS_advance_pc. Such assemblers, however,
         // can use DW_LNS_fixed_advance_pc instead, sacrificing compression.
-        state.Address += debug_line_data.getU16(offset_ptr);
+        State.Row.Address += debug_line_data.getU16(offset_ptr);
         break;

       case DW_LNS_set_prologue_end:
         // Takes no arguments. Set the prologue_end register of the
         // state machine to true
-        state.PrologueEnd = true;
+        State.Row.PrologueEnd = true;
         break;

       case DW_LNS_set_epilogue_begin:
         // Takes no arguments. Set the basic_block register of the
         // state machine to true
-        state.EpilogueBegin = true;
+        State.Row.EpilogueBegin = true;
         break;

       case DW_LNS_set_isa:
         // Takes a single unsigned LEB128 operand and stores it in the
         // column register of the state machine.
-        state.Isa = debug_line_data.getULEB128(offset_ptr);
+        State.Row.Isa = debug_line_data.getULEB128(offset_ptr);
         break;

       default:
@@ -429,9 +451,9 @@ bool DWARFDebugLine::parseStatementTable(DataExtractor debug_line_data,
         // of such opcodes because they are specified in the prologue
         // as a multiple of LEB128 operands for each opcode.
         {
-          assert(opcode - 1U < prologue->StandardOpcodeLengths.size());
-          uint8_t opcode_length = prologue->StandardOpcodeLengths[opcode - 1];
-          for (uint8_t i=0; i<opcode_length; ++i)
+          assert(opcode - 1U < Prologue.StandardOpcodeLengths.size());
+          uint8_t opcode_length = Prologue.StandardOpcodeLengths[opcode - 1];
+          for (uint8_t i = 0; i < opcode_length; ++i)
            debug_line_data.getULEB128(offset_ptr);
         }
         break;
       }
     } else {
       // Special Opcodes
-      uint8_t adjust_opcode = opcode - prologue->OpcodeBase;
-      uint64_t addr_offset = (adjust_opcode / prologue->LineRange) *
-                             prologue->MinInstLength;
-      int32_t line_offset = prologue->LineBase +
-                            (adjust_opcode % prologue->LineRange);
-      state.Line += line_offset;
-      state.Address += addr_offset;
-      state.appendRowToMatrix(*offset_ptr);
+      uint8_t adjust_opcode = opcode - Prologue.OpcodeBase;
+      uint64_t addr_offset =
+          (adjust_opcode / Prologue.LineRange) * Prologue.MinInstLength;
+      int32_t line_offset =
+          Prologue.LineBase + (adjust_opcode % Prologue.LineRange);
+      State.Row.Line += line_offset;
+      State.Row.Address += addr_offset;
+      State.appendRowToMatrix(*offset_ptr);
     }
   }

-  state.finalize();
+  if (!State.Sequence.Empty) {
+    fprintf(stderr, "warning: last sequence in debug line table is not "
+                    "terminated!\n");
+  }
+
+  // Sort all sequences so that address lookup will work faster.
+  if (!Sequences.empty()) {
+    std::sort(Sequences.begin(), Sequences.end(), Sequence::orderByLowPC);
+    // Note: actually, instruction address ranges of sequences should not
+    // overlap (in shared objects and executables). If they do, the address
+    // lookup would still work, though, but result would be ambiguous.
+    // We don't report warning in this case. For example,
+    // sometimes .so compiled from multiple object files contains a few
+    // rudimentary sequences for address ranges [0x0, 0xsomething).
+ } return end_offset; } -uint32_t -DWARFDebugLine::LineTable::lookupAddress(uint64_t address) const { +uint32_t DWARFDebugLine::LineTable::lookupAddress(uint64_t address) const { uint32_t unknown_index = UINT32_MAX; if (Sequences.empty()) return unknown_index; @@ -532,10 +567,8 @@ DWARFDebugLine::LineTable::lookupAddress(uint64_t address) const { return index; } -bool -DWARFDebugLine::LineTable::lookupAddressRange(uint64_t address, - uint64_t size, - std::vector& result) const { +bool DWARFDebugLine::LineTable::lookupAddressRange( + uint64_t address, uint64_t size, std::vector &result) const { if (Sequences.empty()) return false; uint64_t end_addr = address + size; @@ -611,13 +644,14 @@ DWARFDebugLine::LineTable::lookupAddressRange(uint64_t address, bool DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex, - bool NeedsAbsoluteFilePath, + FileLineInfoKind Kind, std::string &Result) const { - if (FileIndex == 0 || FileIndex > Prologue.FileNames.size()) + if (FileIndex == 0 || FileIndex > Prologue.FileNames.size() || + Kind == FileLineInfoKind::None) return false; const FileNameEntry &Entry = Prologue.FileNames[FileIndex - 1]; const char *FileName = Entry.Name; - if (!NeedsAbsoluteFilePath || + if (Kind != FileLineInfoKind::AbsoluteFilePath || sys::path::is_absolute(FileName)) { Result = FileName; return true; diff --git a/lib/DebugInfo/DWARFDebugLine.h b/lib/DebugInfo/DWARFDebugLine.h index a336f49..c7b7ec2 100644 --- a/lib/DebugInfo/DWARFDebugLine.h +++ b/lib/DebugInfo/DWARFDebugLine.h @@ -11,6 +11,7 @@ #define LLVM_DEBUGINFO_DWARFDEBUGLINE_H #include "DWARFRelocMap.h" +#include "llvm/DebugInfo/DIContext.h" #include "llvm/Support/DataExtractor.h" #include #include @@ -24,7 +25,7 @@ class DWARFDebugLine { public: DWARFDebugLine(const RelocAddrMap* LineInfoRelocMap) : RelocMap(LineInfoRelocMap) {} struct FileNameEntry { - FileNameEntry() : Name(0), DirIdx(0), ModTime(0), Length(0) {} + FileNameEntry() : Name(nullptr), DirIdx(0), ModTime(0), Length(0) {} const char *Name; uint64_t DirIdx; @@ -33,10 +34,7 @@ public: }; struct Prologue { - Prologue() - : TotalLength(0), Version(0), PrologueLength(0), MinInstLength(0), - MaxOpsPerInst(0), DefaultIsStmt(0), LineBase(0), LineRange(0), - OpcodeBase(0) {} + Prologue(); // The size in bytes of the statement information for this compilation unit // (not including the total_length field itself). @@ -77,19 +75,16 @@ public: int32_t getMaxLineIncrementForSpecialOpcode() const { return LineBase + (int8_t)LineRange - 1; } + + void clear(); void dump(raw_ostream &OS) const; - void clear() { - TotalLength = Version = PrologueLength = 0; - MinInstLength = LineBase = LineRange = OpcodeBase = 0; - StandardOpcodeLengths.clear(); - IncludeDirectories.clear(); - FileNames.clear(); - } + bool parse(DataExtractor debug_line_data, uint32_t *offset_ptr); }; // Standard .debug_line state machine structure. struct Row { - Row(bool default_is_stmt = false) { reset(default_is_stmt); } + explicit Row(bool default_is_stmt = false); + /// Called after a row is appended to the matrix. 
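Further up this hunk, getFileNameByIndex only prepends a directory when an absolute path was requested and the stored name is relative. The policy in isolation (the leading-'/' test below is a POSIX-only simplification; the real code uses llvm::sys::path):

    #include <string>

    std::string resolveFileName(const std::string &Name,
                                const std::string &IncludeDir,
                                bool WantAbsolute) {
      bool IsAbsolute = !Name.empty() && Name[0] == '/';  // POSIX-only test
      if (!WantAbsolute || IsAbsolute)
        return Name;
      return IncludeDir + "/" + Name;  // join with the table's directory
    }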
void postAppend(); void reset(bool default_is_stmt); @@ -151,14 +146,9 @@ public: unsigned LastRowIndex; bool Empty; - Sequence() { reset(); } - void reset() { - LowPC = 0; - HighPC = 0; - FirstRowIndex = 0; - LastRowIndex = 0; - Empty = true; - } + Sequence(); + void reset(); + static bool orderByLowPC(const Sequence& LHS, const Sequence& RHS) { return LHS.LowPC < RHS.LowPC; } @@ -171,31 +161,34 @@ public: }; struct LineTable { - void appendRow(const DWARFDebugLine::Row &state) { Rows.push_back(state); } - void appendSequence(const DWARFDebugLine::Sequence &sequence) { - Sequences.push_back(sequence); + LineTable(); + + void appendRow(const DWARFDebugLine::Row &R) { + Rows.push_back(R); } - void clear() { - Prologue.clear(); - Rows.clear(); - Sequences.clear(); + void appendSequence(const DWARFDebugLine::Sequence &S) { + Sequences.push_back(S); } // Returns the index of the row with file/line info for a given address, // or -1 if there is no such row. uint32_t lookupAddress(uint64_t address) const; - bool lookupAddressRange(uint64_t address, - uint64_t size, - std::vector& result) const; + bool lookupAddressRange(uint64_t address, uint64_t size, + std::vector &result) const; // Extracts filename by its index in filename table in prologue. // Returns true on success. bool getFileNameByIndex(uint64_t FileIndex, - bool NeedsAbsoluteFilePath, + DILineInfoSpecifier::FileLineInfoKind Kind, std::string &Result) const; void dump(raw_ostream &OS) const; + void clear(); + + /// Parse prologue and all rows. + bool parse(DataExtractor debug_line_data, const RelocAddrMap *RMap, + uint32_t *offset_ptr); struct Prologue Prologue; typedef std::vector RowVector; @@ -206,48 +199,26 @@ public: SequenceVector Sequences; }; - struct State : public Row, public Sequence, public LineTable { - // Special row codes. - enum { - StartParsingLineTable = 0, - DoneParsingLineTable = -1 - }; - - State() : row(StartParsingLineTable) {} - virtual ~State(); - - virtual void appendRowToMatrix(uint32_t offset); - virtual void finalize(); - virtual void reset() { - Row::reset(Prologue.DefaultIsStmt); - Sequence::reset(); - } - - // The row number that starts at zero for the prologue, and increases for - // each row added to the matrix. - unsigned row; - }; - - struct DumpingState : public State { - DumpingState(raw_ostream &OS) : OS(OS) {} - virtual ~DumpingState(); - void finalize() override; - private: - raw_ostream &OS; - }; - - static bool parsePrologue(DataExtractor debug_line_data, uint32_t *offset_ptr, - Prologue *prologue); - /// Parse a single line table (prologue and all rows). - static bool parseStatementTable(DataExtractor debug_line_data, - const RelocAddrMap *RMap, - uint32_t *offset_ptr, State &state); - const LineTable *getLineTable(uint32_t offset) const; const LineTable *getOrParseLineTable(DataExtractor debug_line_data, uint32_t offset); private: + struct ParsingState { + ParsingState(struct LineTable *LT); + + void resetRowAndSequence(); + void appendRowToMatrix(uint32_t offset); + + // Line table we're currently parsing. + struct LineTable *LineTable; + // The row number that starts at zero for the prologue, and increases for + // each row added to the matrix. 
+    unsigned RowNumber;
+    struct Row Row;
+    struct Sequence Sequence;
+  };
+
   typedef std::map<uint32_t, LineTable> LineTableMapTy;
   typedef LineTableMapTy::iterator LineTableIter;
   typedef LineTableMapTy::const_iterator LineTableConstIter;
diff --git a/lib/DebugInfo/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARFDebugRangeList.cpp
index aa2a2be..07b23b3 100644
--- a/lib/DebugInfo/DWARFDebugRangeList.cpp
+++ b/lib/DebugInfo/DWARFDebugRangeList.cpp
@@ -54,13 +54,16 @@ void DWARFDebugRangeList::dump(raw_ostream &OS) const {
   OS << format("%08x <End of list>\n", Offset);
 }

-bool DWARFDebugRangeList::containsAddress(uint64_t BaseAddress,
-                                          uint64_t Address) const {
+DWARFAddressRangesVector
+DWARFDebugRangeList::getAbsoluteRanges(uint64_t BaseAddress) const {
+  DWARFAddressRangesVector Res;
   for (const RangeListEntry &RLE : Entries) {
-    if (RLE.isBaseAddressSelectionEntry(AddressSize))
+    if (RLE.isBaseAddressSelectionEntry(AddressSize)) {
       BaseAddress = RLE.EndAddress;
-    else if (RLE.containsAddress(BaseAddress, Address))
-      return true;
+    } else {
+      Res.push_back(std::make_pair(BaseAddress + RLE.StartAddress,
+                                   BaseAddress + RLE.EndAddress));
+    }
   }
-  return false;
+  return Res;
 }
diff --git a/lib/DebugInfo/DWARFDebugRangeList.h b/lib/DebugInfo/DWARFDebugRangeList.h
index 4e34a91..587b550 100644
--- a/lib/DebugInfo/DWARFDebugRangeList.h
+++ b/lib/DebugInfo/DWARFDebugRangeList.h
@@ -17,6 +17,9 @@ namespace llvm {

 class raw_ostream;

+/// DWARFAddressRangesVector - represents a set of absolute address ranges.
+typedef std::vector<std::pair<uint64_t, uint64_t>> DWARFAddressRangesVector;
+
 class DWARFDebugRangeList {
 public:
   struct RangeListEntry {
@@ -50,10 +53,6 @@ public:
       else
         return StartAddress == -1ULL;
     }
-    bool containsAddress(uint64_t BaseAddress, uint64_t Address) const {
-      return (BaseAddress + StartAddress <= Address) &&
-             (Address < BaseAddress + EndAddress);
-    }
   };

 private:
@@ -67,10 +66,10 @@ public:
   void clear();
   void dump(raw_ostream &OS) const;
   bool extract(DataExtractor data, uint32_t *offset_ptr);
-  /// containsAddress - Returns true if range list contains the given
-  /// address. Has to be passed base address of the compile unit that
-  /// references this range list.
-  bool containsAddress(uint64_t BaseAddress, uint64_t Address) const;
+  /// getAbsoluteRanges - Returns absolute address ranges defined by this range
+  /// list. Has to be passed base address of the compile unit referencing this
+  /// range list.
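The rebasing this comment describes can be shown in isolation: a base-address-selection entry installs a new base, and every other entry is shifted by the current base into an absolute (start, end) pair (Entry below is a stand-in for RangeListEntry):

    #include <cstdint>
    #include <utility>
    #include <vector>

    struct Entry { uint64_t Start, End; bool IsBaseSelection; };
    typedef std::vector<std::pair<uint64_t, uint64_t>> Ranges;

    Ranges absoluteRanges(const std::vector<Entry> &Es, uint64_t Base) {
      Ranges Out;
      for (const Entry &E : Es) {
        if (E.IsBaseSelection)
          Base = E.End;  // the entry's "end" field carries the new base
        else
          Out.push_back(std::make_pair(Base + E.Start, Base + E.End));
      }
      return Out;
    }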
+  DWARFAddressRangesVector getAbsoluteRanges(uint64_t BaseAddress) const;
 };
 
 } // namespace llvm
diff --git a/lib/DebugInfo/DWARFFormValue.cpp b/lib/DebugInfo/DWARFFormValue.cpp
index da71fb3..8d0f966 100644
--- a/lib/DebugInfo/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARFFormValue.cpp
@@ -131,7 +131,7 @@ bool DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
                                   const DWARFUnit *cu) {
   bool indirect = false;
   bool is_block = false;
-  Value.data = NULL;
+  Value.data = nullptr;
   // Read the value for the form into value and follow and DW_FORM_indirect
   // instances we run into
   do {
@@ -241,7 +241,7 @@ bool DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
 
   if (is_block) {
     StringRef str = data.getData().substr(*offset_ptr, Value.uval);
-    Value.data = NULL;
+    Value.data = nullptr;
     if (!str.empty()) {
       Value.data = reinterpret_cast<const uint8_t *>(str.data());
       *offset_ptr += Value.uval;
@@ -488,7 +488,7 @@ Optional<const char *> DWARFFormValue::getAsCString(const DWARFUnit *U) const {
     return None;
   if (Form == DW_FORM_string)
     return Value.cstr;
-  if (U == 0)
+  if (!U)
     return None;
   uint32_t Offset = Value.uval;
   if (Form == DW_FORM_GNU_str_index) {
@@ -509,7 +509,7 @@ Optional<uint64_t> DWARFFormValue::getAsAddress(const DWARFUnit *U) const {
   if (Form == DW_FORM_GNU_addr_index) {
     uint32_t Index = Value.uval;
     uint64_t Result;
-    if (U == 0 || !U->getAddrOffsetSectionItem(Index, Result))
+    if (!U || !U->getAddrOffsetSectionItem(Index, Result))
       return None;
     return Result;
   }
@@ -525,7 +525,7 @@ Optional<uint64_t> DWARFFormValue::getAsReference(const DWARFUnit *U) const {
   case DW_FORM_ref4:
   case DW_FORM_ref8:
   case DW_FORM_ref_udata:
-    if (U == 0)
+    if (!U)
       return None;
     return Value.uval + U->getOffset();
   case DW_FORM_ref_addr:
diff --git a/lib/DebugInfo/DWARFTypeUnit.h b/lib/DebugInfo/DWARFTypeUnit.h
index 05e13ff..cf773b8 100644
--- a/lib/DebugInfo/DWARFTypeUnit.h
+++ b/lib/DebugInfo/DWARFTypeUnit.h
@@ -19,11 +19,13 @@ private:
   uint64_t TypeHash;
   uint32_t TypeOffset;
 public:
-  DWARFTypeUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
-                StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
+  DWARFTypeUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef RS,
+                StringRef SS, StringRef SOS, StringRef AOS,
                 const RelocAddrMap *M, bool LE)
-      : DWARFUnit(DA, IS, AS, RS, SS, SOS, AOS, M, LE) {}
-  uint32_t getSize() const override { return DWARFUnit::getSize() + 12; }
+      : DWARFUnit(DA, IS, RS, SS, SOS, AOS, M, LE) {}
+  uint32_t getHeaderSize() const override {
+    return DWARFUnit::getHeaderSize() + 12;
+  }
   void dump(raw_ostream &OS);
 protected:
   bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) override;
diff --git a/lib/DebugInfo/DWARFUnit.cpp b/lib/DebugInfo/DWARFUnit.cpp
index 316c208..f5f5072 100644
--- a/lib/DebugInfo/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARFUnit.cpp
@@ -17,12 +17,12 @@ using namespace llvm;
 using namespace dwarf;
 
-DWARFUnit::DWARFUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
-                     StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
+DWARFUnit::DWARFUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef RS,
+                     StringRef SS, StringRef SOS, StringRef AOS,
                      const RelocAddrMap *M, bool LE)
-    : Abbrev(DA), InfoSection(IS), AbbrevSection(AS), RangeSection(RS),
-      StringSection(SS), StringOffsetSection(SOS), AddrOffsetSection(AOS),
-      RelocMap(M), isLittleEndian(LE) {
+    : Abbrev(DA), InfoSection(IS), RangeSection(RS), StringSection(SS),
+      StringOffsetSection(SOS), AddrOffsetSection(AOS), RelocMap(M),
+      isLittleEndian(LE) {
   clear();
 }
 
@@ -54,18 +54,20 @@ bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,
 bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
   Length = debug_info.getU32(offset_ptr);
   Version = debug_info.getU16(offset_ptr);
-  uint64_t abbrOffset = debug_info.getU32(offset_ptr);
+  uint64_t AbbrOffset = debug_info.getU32(offset_ptr);
   AddrSize = debug_info.getU8(offset_ptr);
-  bool lengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1);
-  bool versionOK = DWARFContext::isSupportedVersion(Version);
-  bool abbrOffsetOK = AbbrevSection.size() > abbrOffset;
-  bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
+  bool LengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1);
+  bool VersionOK = DWARFContext::isSupportedVersion(Version);
+  bool AddrSizeOK = AddrSize == 4 || AddrSize == 8;
 
-  if (!lengthOK || !versionOK || !addrSizeOK || !abbrOffsetOK)
+  if (!LengthOK || !VersionOK || !AddrSizeOK)
+    return false;
+
+  Abbrevs = Abbrev->getAbbreviationDeclarationSet(AbbrOffset);
+  if (Abbrevs == nullptr)
     return false;
 
-  Abbrevs = Abbrev->getAbbreviationDeclarationSet(abbrOffset);
   return true;
 }
 
@@ -98,7 +100,7 @@ void DWARFUnit::clear() {
   Offset = 0;
   Length = 0;
   Version = 0;
-  Abbrevs = 0;
+  Abbrevs = nullptr;
   AddrSize = 0;
   BaseAddr = 0;
   RangeSectionBase = 0;
@@ -110,8 +112,8 @@ const char *DWARFUnit::getCompilationDir() {
   extractDIEsIfNeeded(true);
   if (DieArray.empty())
-    return 0;
-  return DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0);
+    return nullptr;
+  return DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, nullptr);
 }
 
 uint64_t DWARFUnit::getDWOId() {
@@ -124,38 +126,32 @@ uint64_t DWARFUnit::getDWOId() {
 }
 
 void DWARFUnit::setDIERelations() {
-  if (DieArray.empty())
+  if (DieArray.size() <= 1)
     return;
-  DWARFDebugInfoEntryMinimal *die_array_begin = &DieArray.front();
-  DWARFDebugInfoEntryMinimal *die_array_end = &DieArray.back();
-  DWARFDebugInfoEntryMinimal *curr_die;
-  // We purposely are skipping the last element in the array in the loop below
-  // so that we can always have a valid next item
-  for (curr_die = die_array_begin; curr_die < die_array_end; ++curr_die) {
-    // Since our loop doesn't include the last element, we can always
-    // safely access the next die in the array.
-    DWARFDebugInfoEntryMinimal *next_die = curr_die + 1;
-
-    const DWARFAbbreviationDeclaration *curr_die_abbrev =
-        curr_die->getAbbreviationDeclarationPtr();
-
-    if (curr_die_abbrev) {
-      // Normal DIE
-      if (curr_die_abbrev->hasChildren())
-        next_die->setParent(curr_die);
-      else
-        curr_die->setSibling(next_die);
+
+  std::vector<DWARFDebugInfoEntryMinimal *> ParentChain;
+  DWARFDebugInfoEntryMinimal *SiblingChain = nullptr;
+  for (auto &DIE : DieArray) {
+    if (SiblingChain) {
+      SiblingChain->setSibling(&DIE);
+    }
+    if (const DWARFAbbreviationDeclaration *AbbrDecl =
+            DIE.getAbbreviationDeclarationPtr()) {
+      // Normal DIE.
+      if (AbbrDecl->hasChildren()) {
+        ParentChain.push_back(&DIE);
+        SiblingChain = nullptr;
+      } else {
+        SiblingChain = &DIE;
+      }
     } else {
-      // NULL DIE that terminates a sibling chain
-      DWARFDebugInfoEntryMinimal *parent = curr_die->getParent();
-      if (parent)
-        parent->setSibling(next_die);
+      // NULL entry terminates the sibling chain.
+      SiblingChain = ParentChain.back();
+      ParentChain.pop_back();
    }
  }
-
-  // Since we skipped the last element, we need to fix it up!
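// Editor's note: the stack-based linking scheme the new setDIERelations()
// uses is worth seeing in isolation. A minimal standalone sketch (not part
// of the patch; Node and link() are hypothetical stand-ins for
// DWARFDebugInfoEntryMinimal and the loop above). Input is a well-formed
// flattened pre-order DIE array in which a null entry closes the current
// sibling chain.
#include <cassert>
#include <vector>

struct Node {
  bool HasChildren = false;
  bool IsNull = false; // a NULL DIE that terminates a sibling chain
  Node *Sibling = nullptr;
};

static void link(std::vector<Node> &Dies) {
  if (Dies.size() <= 1)
    return;
  std::vector<Node *> ParentChain;
  Node *SiblingChain = nullptr;
  for (Node &N : Dies) {
    if (SiblingChain)
      SiblingChain->Sibling = &N; // complete the deferred sibling link
    if (!N.IsNull) {
      if (N.HasChildren) { // children follow; no sibling link until they end
        ParentChain.push_back(&N);
        SiblingChain = nullptr;
      } else {
        SiblingChain = &N;
      }
    } else { // NULL entry: resume the parent's sibling chain
      SiblingChain = ParentChain.back();
      ParentChain.pop_back();
    }
  }
  assert(ParentChain.empty() && "malformed DIE stream");
}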
- if (die_array_begin < die_array_end) - curr_die->setParent(die_array_begin); + assert(SiblingChain == nullptr || SiblingChain == &DieArray[0]); + assert(ParentChain.empty()); } void DWARFUnit::extractDIEsToVector( @@ -166,13 +162,13 @@ void DWARFUnit::extractDIEsToVector( // Set the offset to that of the first DIE and calculate the start of the // next compilation unit header. - uint32_t Offset = getFirstDIEOffset(); + uint32_t DIEOffset = Offset + getHeaderSize(); uint32_t NextCUOffset = getNextUnitOffset(); DWARFDebugInfoEntryMinimal DIE; uint32_t Depth = 0; bool IsCUDie = true; - while (Offset < NextCUOffset && DIE.extractFast(this, &Offset)) { + while (DIEOffset < NextCUOffset && DIE.extractFast(this, &DIEOffset)) { if (IsCUDie) { if (AppendCUDie) Dies.push_back(DIE); @@ -187,9 +183,8 @@ void DWARFUnit::extractDIEsToVector( Dies.push_back(DIE); } - const DWARFAbbreviationDeclaration *AbbrDecl = - DIE.getAbbreviationDeclarationPtr(); - if (AbbrDecl) { + if (const DWARFAbbreviationDeclaration *AbbrDecl = + DIE.getAbbreviationDeclarationPtr()) { // Normal DIE if (AbbrDecl->hasChildren()) ++Depth; @@ -205,9 +200,9 @@ void DWARFUnit::extractDIEsToVector( // Give a little bit of info if we encounter corrupt DWARF (our offset // should always terminate at or before the start of the next compilation // unit header). - if (Offset > NextCUOffset) + if (DIEOffset > NextCUOffset) fprintf(stderr, "warning: DWARF compile unit extends beyond its " - "bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), Offset); + "bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), DIEOffset); } size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) { @@ -241,25 +236,25 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) { DWARFUnit::DWOHolder::DWOHolder(object::ObjectFile *DWOFile) : DWOFile(DWOFile), DWOContext(cast(DIContext::getDWARFContext(DWOFile))), - DWOU(0) { + DWOU(nullptr) { if (DWOContext->getNumDWOCompileUnits() > 0) DWOU = DWOContext->getDWOCompileUnitAtIndex(0); } bool DWARFUnit::parseDWO() { - if (DWO.get() != 0) + if (DWO.get()) return false; extractDIEsIfNeeded(true); if (DieArray.empty()) return false; const char *DWOFileName = - DieArray[0].getAttributeValueAsString(this, DW_AT_GNU_dwo_name, 0); - if (DWOFileName == 0) + DieArray[0].getAttributeValueAsString(this, DW_AT_GNU_dwo_name, nullptr); + if (!DWOFileName) return false; const char *CompilationDir = - DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0); + DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, nullptr); SmallString<16> AbsolutePath; - if (sys::path::is_relative(DWOFileName) && CompilationDir != 0) { + if (sys::path::is_relative(DWOFileName) && CompilationDir != nullptr) { sys::path::append(AbsolutePath, CompilationDir); } sys::path::append(AbsolutePath, DWOFileName); @@ -271,7 +266,7 @@ bool DWARFUnit::parseDWO() { DWO.reset(new DWOHolder(DWOFile.get())); DWARFUnit *DWOCU = DWO->getUnit(); // Verify that compile unit in .dwo file is valid. - if (DWOCU == 0 || DWOCU->getDWOId() != getDWOId()) { + if (!DWOCU || DWOCU->getDWOId() != getDWOId()) { DWO.reset(); return false; } @@ -298,33 +293,33 @@ void DWARFUnit::clearDIEs(bool KeepCUDie) { } } -void -DWARFUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges, - bool clear_dies_if_already_not_parsed, - uint32_t CUOffsetInAranges) { +void DWARFUnit::collectAddressRanges(DWARFAddressRangesVector &CURanges) { + // First, check if CU DIE describes address ranges for the unit. 
+ const auto &CUDIERanges = getCompileUnitDIE()->getAddressRanges(this); + if (!CUDIERanges.empty()) { + CURanges.insert(CURanges.end(), CUDIERanges.begin(), CUDIERanges.end()); + return; + } + // This function is usually called if there in no .debug_aranges section // in order to produce a compile unit level set of address ranges that // is accurate. If the DIEs weren't parsed, then we don't want all dies for // all compile units to stay loaded when they weren't needed. So we can end // up parsing the DWARF and then throwing them all away to keep memory usage // down. - const bool clear_dies = extractDIEsIfNeeded(false) > 1 && - clear_dies_if_already_not_parsed; - DieArray[0].buildAddressRangeTable(this, debug_aranges, CUOffsetInAranges); + const bool ClearDIEs = extractDIEsIfNeeded(false) > 1; + DieArray[0].collectChildrenAddressRanges(this, CURanges); + + // Collect address ranges from DIEs in .dwo if necessary. bool DWOCreated = parseDWO(); - if (DWO.get()) { - // If there is a .dwo file for this compile unit, then skeleton CU DIE - // doesn't have children, and we should instead build address range table - // from DIEs in the .debug_info.dwo section of .dwo file. - DWO->getUnit()->buildAddressRangeTable( - debug_aranges, clear_dies_if_already_not_parsed, CUOffsetInAranges); - } - if (DWOCreated && clear_dies_if_already_not_parsed) + if (DWO.get()) + DWO->getUnit()->collectAddressRanges(CURanges); + if (DWOCreated) DWO.reset(); // Keep memory down by clearing DIEs if this generate function // caused them to be parsed. - if (clear_dies) + if (ClearDIEs) clearDIEs(true); } @@ -337,14 +332,14 @@ DWARFUnit::getSubprogramForAddress(uint64_t Address) { return &DIE; } } - return 0; + return nullptr; } DWARFDebugInfoEntryInlinedChain DWARFUnit::getInlinedChainForAddress(uint64_t Address) { // First, find a subprogram that contains the given address (the root // of inlined chain). - const DWARFUnit *ChainCU = 0; + const DWARFUnit *ChainCU = nullptr; const DWARFDebugInfoEntryMinimal *SubprogramDIE = getSubprogramForAddress(Address); if (SubprogramDIE) { diff --git a/lib/DebugInfo/DWARFUnit.h b/lib/DebugInfo/DWARFUnit.h index 5b4cf09..471da36 100644 --- a/lib/DebugInfo/DWARFUnit.h +++ b/lib/DebugInfo/DWARFUnit.h @@ -29,7 +29,6 @@ class raw_ostream; class DWARFUnit { const DWARFDebugAbbrev *Abbrev; StringRef InfoSection; - StringRef AbbrevSection; StringRef RangeSection; uint32_t RangeSectionBase; StringRef StringSection; @@ -60,12 +59,13 @@ class DWARFUnit { protected: virtual bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr); + /// Size in bytes of the unit header. + virtual uint32_t getHeaderSize() const { return 11; } public: - - DWARFUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS, - StringRef RS, StringRef SS, StringRef SOS, StringRef AOS, - const RelocAddrMap *M, bool LE); + DWARFUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef RS, + StringRef SS, StringRef SOS, StringRef AOS, const RelocAddrMap *M, + bool LE); virtual ~DWARFUnit(); @@ -102,12 +102,7 @@ public: DWARFDebugRangeList &RangeList) const; void clear(); uint32_t getOffset() const { return Offset; } - /// Size in bytes of the compile unit header. - virtual uint32_t getSize() const { return 11; } - uint32_t getFirstDIEOffset() const { return Offset + getSize(); } uint32_t getNextUnitOffset() const { return Offset + Length + 4; } - /// Size in bytes of the .debug_info data associated with this compile unit. 
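// Editor's note: a short usage sketch for the range API reworked above --
// per-address containsAddress() queries give way to materializing absolute
// ranges once via DWARFDebugRangeList::getAbsoluteRanges() /
// DWARFUnit::collectAddressRanges(). rangesContain() below is a hypothetical
// helper, not LLVM API.
#include <cstdint>
#include <utility>
#include <vector>

typedef std::vector<std::pair<uint64_t, uint64_t>> DWARFAddressRangesVector;

static bool rangesContain(const DWARFAddressRangesVector &Ranges,
                          uint64_t Address) {
  for (const auto &R : Ranges)
    if (R.first <= Address && Address < R.second) // half-open [LowPC, HighPC)
      return true;
  return false;
}

// Assuming a DWARFUnit *U obtained from a DWARFContext:
//   DWARFAddressRangesVector CURanges;
//   U->collectAddressRanges(CURanges);
//   bool Hit = rangesContain(CURanges, SomeAddress);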
- size_t getDebugInfoSize() const { return Length + 4 - getSize(); } uint32_t getLength() const { return Length; } uint16_t getVersion() const { return Version; } const DWARFAbbreviationDeclarationSet *getAbbreviations() const { @@ -123,15 +118,13 @@ public: const DWARFDebugInfoEntryMinimal * getCompileUnitDIE(bool extract_cu_die_only = true) { extractDIEsIfNeeded(extract_cu_die_only); - return DieArray.empty() ? NULL : &DieArray[0]; + return DieArray.empty() ? nullptr : &DieArray[0]; } const char *getCompilationDir(); uint64_t getDWOId(); - void buildAddressRangeTable(DWARFDebugAranges *debug_aranges, - bool clear_dies_if_already_not_parsed, - uint32_t CUOffsetInAranges); + void collectAddressRanges(DWARFAddressRangesVector &CURanges); /// getInlinedChainForAddress - fetches inlined chain for a given address. /// Returns empty chain if there is no subprogram containing address. The @@ -139,6 +132,9 @@ public: DWARFDebugInfoEntryInlinedChain getInlinedChainForAddress(uint64_t Address); private: + /// Size in bytes of the .debug_info data associated with this compile unit. + size_t getDebugInfoSize() const { return Length + 4 - getHeaderSize(); } + /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it /// hasn't already been done. Returns the number of DIEs parsed at this call. size_t extractDIEsIfNeeded(bool CUDieOnly); diff --git a/lib/DebugInfo/module.modulemap b/lib/DebugInfo/module.modulemap new file mode 100644 index 0000000..1fe5ab1 --- /dev/null +++ b/lib/DebugInfo/module.modulemap @@ -0,0 +1 @@ +module DebugInfo { requires cplusplus umbrella "." module * { export * } } diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp index 4768e67..6766ef1 100644 --- a/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/lib/ExecutionEngine/ExecutionEngine.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "jit" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Statistic.h" @@ -25,6 +24,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/Object/ObjectFile.h" #include "llvm/Support/Debug.h" #include "llvm/Support/DynamicLibrary.h" #include "llvm/Support/ErrorHandling.h" @@ -37,6 +37,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "jit" + STATISTIC(NumInitBytes, "Number of bytes of global vars initialized"); STATISTIC(NumGlobals , "Number of global vars initialized"); @@ -50,22 +52,31 @@ ExecutionEngine *(*ExecutionEngine::JITCtor)( std::string *ErrorStr, JITMemoryManager *JMM, bool GVsWithCode, - TargetMachine *TM) = 0; + TargetMachine *TM) = nullptr; ExecutionEngine *(*ExecutionEngine::MCJITCtor)( Module *M, std::string *ErrorStr, RTDyldMemoryManager *MCJMM, bool GVsWithCode, - TargetMachine *TM) = 0; + TargetMachine *TM) = nullptr; ExecutionEngine *(*ExecutionEngine::InterpCtor)(Module *M, - std::string *ErrorStr) = 0; + std::string *ErrorStr) =nullptr; ExecutionEngine::ExecutionEngine(Module *M) : EEState(*this), - LazyFunctionCreator(0) { + LazyFunctionCreator(nullptr) { CompilingLazily = false; GVCompilationDisabled = false; SymbolSearchingDisabled = false; + + // IR module verification is enabled by default in debug builds, and disabled + // by default in release builds. 
+#ifndef NDEBUG + VerifyModules = true; +#else + VerifyModules = false; +#endif + Modules.push_back(M); assert(M && "Module is null?"); } @@ -111,6 +122,10 @@ char *ExecutionEngine::getMemoryForGV(const GlobalVariable *GV) { return GVMemoryBlock::Create(GV, *getDataLayout()); } +void ExecutionEngine::addObjectFile(std::unique_ptr O) { + llvm_unreachable("ExecutionEngine subclass doesn't implement addObjectFile."); +} + bool ExecutionEngine::removeModule(Module *M) { for(SmallVectorImpl::iterator I = Modules.begin(), E = Modules.end(); I != E; ++I) { @@ -129,7 +144,7 @@ Function *ExecutionEngine::FindFunctionNamed(const char *FnName) { if (Function *F = Modules[i]->getFunction(FnName)) return F; } - return 0; + return nullptr; } @@ -141,7 +156,7 @@ void *ExecutionEngineState::RemoveMapping(const MutexGuard &, // FIXME: This is silly, we shouldn't end up with a mapping -> 0 in the // GlobalAddressMap. if (I == GlobalAddressMap.end()) - OldVal = 0; + OldVal = nullptr; else { OldVal = I->second; GlobalAddressMap.erase(I); @@ -157,14 +172,14 @@ void ExecutionEngine::addGlobalMapping(const GlobalValue *GV, void *Addr) { DEBUG(dbgs() << "JIT: Map \'" << GV->getName() << "\' to [" << Addr << "]\n";); void *&CurVal = EEState.getGlobalAddressMap(locked)[GV]; - assert((CurVal == 0 || Addr == 0) && "GlobalMapping already established!"); + assert((!CurVal || !Addr) && "GlobalMapping already established!"); CurVal = Addr; // If we are using the reverse mapping, add it too. if (!EEState.getGlobalAddressReverseMap(locked).empty()) { AssertingVH &V = EEState.getGlobalAddressReverseMap(locked)[Addr]; - assert((V == 0 || GV == 0) && "GlobalMapping already established!"); + assert((!V || !GV) && "GlobalMapping already established!"); V = GV; } } @@ -193,7 +208,7 @@ void *ExecutionEngine::updateGlobalMapping(const GlobalValue *GV, void *Addr) { EEState.getGlobalAddressMap(locked); // Deleting from the mapping? - if (Addr == 0) + if (!Addr) return EEState.RemoveMapping(locked, GV); void *&CurVal = Map[GV]; @@ -207,7 +222,7 @@ void *ExecutionEngine::updateGlobalMapping(const GlobalValue *GV, void *Addr) { if (!EEState.getGlobalAddressReverseMap(locked).empty()) { AssertingVH &V = EEState.getGlobalAddressReverseMap(locked)[Addr]; - assert((V == 0 || GV == 0) && "GlobalMapping already established!"); + assert((!V || !GV) && "GlobalMapping already established!"); V = GV; } return OldVal; @@ -218,7 +233,7 @@ void *ExecutionEngine::getPointerToGlobalIfAvailable(const GlobalValue *GV) { ExecutionEngineState::GlobalAddressMapTy::iterator I = EEState.getGlobalAddressMap(locked).find(GV); - return I != EEState.getGlobalAddressMap(locked).end() ? I->second : 0; + return I != EEState.getGlobalAddressMap(locked).end() ? I->second : nullptr; } const GlobalValue *ExecutionEngine::getGlobalValueAtAddress(void *Addr) { @@ -235,7 +250,7 @@ const GlobalValue *ExecutionEngine::getGlobalValueAtAddress(void *Addr) { std::map >::iterator I = EEState.getGlobalAddressReverseMap(locked).find(Addr); - return I != EEState.getGlobalAddressReverseMap(locked).end() ? I->second : 0; + return I != EEState.getGlobalAddressReverseMap(locked).end() ? 
I->second : nullptr; } namespace { @@ -243,11 +258,11 @@ class ArgvArray { char *Array; std::vector Values; public: - ArgvArray() : Array(NULL) {} + ArgvArray() : Array(nullptr) {} ~ArgvArray() { clear(); } void clear() { delete[] Array; - Array = NULL; + Array = nullptr; for (size_t I = 0, E = Values.size(); I != E; ++I) { delete[] Values[I]; } @@ -283,7 +298,7 @@ void *ArgvArray::reset(LLVMContext &C, ExecutionEngine *EE, } // Null terminate it - EE->StoreValueToMemory(PTOGV(0), + EE->StoreValueToMemory(PTOGV(nullptr), (GenericValue*)(Array+InputArgv.size()*PtrSize), SBytePtr); return Array; @@ -303,11 +318,11 @@ void ExecutionEngine::runStaticConstructorsDestructors(Module *module, // Should be an array of '{ i32, void ()* }' structs. The first value is // the init priority, which we ignore. ConstantArray *InitList = dyn_cast(GV->getInitializer()); - if (InitList == 0) + if (!InitList) return; for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) { ConstantStruct *CS = dyn_cast(InitList->getOperand(i)); - if (CS == 0) continue; + if (!CS) continue; Constant *FP = CS->getOperand(1); if (FP->isNullValue()) @@ -418,10 +433,10 @@ ExecutionEngine *ExecutionEngine::createJIT(Module *M, bool GVsWithCode, Reloc::Model RM, CodeModel::Model CMM) { - if (ExecutionEngine::JITCtor == 0) { + if (!ExecutionEngine::JITCtor) { if (ErrorStr) *ErrorStr = "JIT has not been linked in."; - return 0; + return nullptr; } // Use the defaults for extra parameters. Users can use EngineBuilder to @@ -437,7 +452,7 @@ ExecutionEngine *ExecutionEngine::createJIT(Module *M, // TODO: permit custom TargetOptions here TargetMachine *TM = EB.selectTarget(); - if (!TM || (ErrorStr && ErrorStr->length() > 0)) return 0; + if (!TM || (ErrorStr && ErrorStr->length() > 0)) return nullptr; return ExecutionEngine::JITCtor(M, ErrorStr, JMM, GVsWithCode, TM); } @@ -447,8 +462,8 @@ ExecutionEngine *EngineBuilder::create(TargetMachine *TM) { // Make sure we can resolve symbols in the program as well. The zero arg // to the function tells DynamicLibrary to load the program, not a library. - if (sys::DynamicLibrary::LoadLibraryPermanently(0, ErrorStr)) - return 0; + if (sys::DynamicLibrary::LoadLibraryPermanently(nullptr, ErrorStr)) + return nullptr; assert(!(JMM && MCJMM)); @@ -461,7 +476,7 @@ ExecutionEngine *EngineBuilder::create(TargetMachine *TM) { else { if (ErrorStr) *ErrorStr = "Cannot create an interpreter with a memory manager."; - return 0; + return nullptr; } } @@ -470,7 +485,7 @@ ExecutionEngine *EngineBuilder::create(TargetMachine *TM) { *ErrorStr = "Cannot create a legacy JIT with a runtime dyld memory " "manager."; - return 0; + return nullptr; } // Unless the interpreter was explicitly selected or the JIT is not linked, @@ -483,16 +498,17 @@ ExecutionEngine *EngineBuilder::create(TargetMachine *TM) { << " a different -march switch.\n"; } - if (UseMCJIT && ExecutionEngine::MCJITCtor) { - ExecutionEngine *EE = - ExecutionEngine::MCJITCtor(M, ErrorStr, MCJMM ? MCJMM : JMM, - AllocateGVsWithCode, TheTM.release()); - if (EE) return EE; - } else if (ExecutionEngine::JITCtor) { - ExecutionEngine *EE = - ExecutionEngine::JITCtor(M, ErrorStr, JMM, - AllocateGVsWithCode, TheTM.release()); - if (EE) return EE; + ExecutionEngine *EE = nullptr; + if (UseMCJIT && ExecutionEngine::MCJITCtor) + EE = ExecutionEngine::MCJITCtor(M, ErrorStr, MCJMM ? 
MCJMM : JMM, + AllocateGVsWithCode, TheTM.release()); + else if (ExecutionEngine::JITCtor) + EE = ExecutionEngine::JITCtor(M, ErrorStr, JMM, + AllocateGVsWithCode, TheTM.release()); + + if (EE) { + EE->setVerifyModules(VerifyModules); + return EE; } } @@ -503,16 +519,16 @@ ExecutionEngine *EngineBuilder::create(TargetMachine *TM) { return ExecutionEngine::InterpCtor(M, ErrorStr); if (ErrorStr) *ErrorStr = "Interpreter has not been linked in."; - return 0; + return nullptr; } - if ((WhichEngine & EngineKind::JIT) && ExecutionEngine::JITCtor == 0 && - ExecutionEngine::MCJITCtor == 0) { + if ((WhichEngine & EngineKind::JIT) && !ExecutionEngine::JITCtor && + !ExecutionEngine::MCJITCtor) { if (ErrorStr) *ErrorStr = "JIT has not been linked in."; } - return 0; + return nullptr; } void *ExecutionEngine::getPointerToGlobal(const GlobalValue *GV) { @@ -848,7 +864,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { break; case Type::PointerTyID: if (isa(C)) - Result.PointerVal = 0; + Result.PointerVal = nullptr; else if (const Function *F = dyn_cast(C)) Result = PTOGV(getPointerToFunctionOrStub(const_cast(F))); else if (const GlobalVariable *GV = dyn_cast(C)) @@ -1193,20 +1209,18 @@ void ExecutionEngine::emitGlobals() { if (Modules.size() != 1) { for (unsigned m = 0, e = Modules.size(); m != e; ++m) { Module &M = *Modules[m]; - for (Module::const_global_iterator I = M.global_begin(), - E = M.global_end(); I != E; ++I) { - const GlobalValue *GV = I; - if (GV->hasLocalLinkage() || GV->isDeclaration() || - GV->hasAppendingLinkage() || !GV->hasName()) + for (const auto &GV : M.globals()) { + if (GV.hasLocalLinkage() || GV.isDeclaration() || + GV.hasAppendingLinkage() || !GV.hasName()) continue;// Ignore external globals and globals with internal linkage. const GlobalValue *&GVEntry = - LinkedGlobalsMap[std::make_pair(GV->getName(), GV->getType())]; + LinkedGlobalsMap[std::make_pair(GV.getName(), GV.getType())]; // If this is the first time we've seen this global, it is the canonical // version. if (!GVEntry) { - GVEntry = GV; + GVEntry = &GV; continue; } @@ -1216,8 +1230,8 @@ void ExecutionEngine::emitGlobals() { // Otherwise, we know it's linkonce/weak, replace it if this is a strong // symbol. FIXME is this right for common? - if (GV->hasExternalLinkage() || GVEntry->hasExternalWeakLinkage()) - GVEntry = GV; + if (GV.hasExternalLinkage() || GVEntry->hasExternalWeakLinkage()) + GVEntry = &GV; } } } @@ -1225,31 +1239,30 @@ void ExecutionEngine::emitGlobals() { std::vector NonCanonicalGlobals; for (unsigned m = 0, e = Modules.size(); m != e; ++m) { Module &M = *Modules[m]; - for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) { + for (const auto &GV : M.globals()) { // In the multi-module case, see what this global maps to. if (!LinkedGlobalsMap.empty()) { if (const GlobalValue *GVEntry = - LinkedGlobalsMap[std::make_pair(I->getName(), I->getType())]) { + LinkedGlobalsMap[std::make_pair(GV.getName(), GV.getType())]) { // If something else is the canonical global, ignore this one. - if (GVEntry != &*I) { - NonCanonicalGlobals.push_back(I); + if (GVEntry != &GV) { + NonCanonicalGlobals.push_back(&GV); continue; } } } - if (!I->isDeclaration()) { - addGlobalMapping(I, getMemoryForGV(I)); + if (!GV.isDeclaration()) { + addGlobalMapping(&GV, getMemoryForGV(&GV)); } else { // External variable reference. Try to use the dynamic loader to // get a pointer to it. 
if (void *SymAddr = - sys::DynamicLibrary::SearchForAddressOfSymbol(I->getName())) - addGlobalMapping(I, SymAddr); + sys::DynamicLibrary::SearchForAddressOfSymbol(GV.getName())) + addGlobalMapping(&GV, SymAddr); else { report_fatal_error("Could not resolve external global address: " - +I->getName()); + +GV.getName()); } } } @@ -1269,16 +1282,15 @@ void ExecutionEngine::emitGlobals() { // Now that all of the globals are set up in memory, loop through them all // and initialize their contents. - for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) { - if (!I->isDeclaration()) { + for (const auto &GV : M.globals()) { + if (!GV.isDeclaration()) { if (!LinkedGlobalsMap.empty()) { if (const GlobalValue *GVEntry = - LinkedGlobalsMap[std::make_pair(I->getName(), I->getType())]) - if (GVEntry != &*I) // Not the canonical variable. + LinkedGlobalsMap[std::make_pair(GV.getName(), GV.getType())]) + if (GVEntry != &GV) // Not the canonical variable. continue; } - EmitGlobalVariable(I); + EmitGlobalVariable(&GV); } } } @@ -1290,12 +1302,12 @@ void ExecutionEngine::emitGlobals() { void ExecutionEngine::EmitGlobalVariable(const GlobalVariable *GV) { void *GA = getPointerToGlobalIfAvailable(GV); - if (GA == 0) { + if (!GA) { // If it's not already specified, allocate memory for the global. GA = getMemoryForGV(GV); // If we failed to allocate memory for this global, return. - if (GA == 0) return; + if (!GA) return; addGlobalMapping(GV, GA); } diff --git a/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/lib/ExecutionEngine/ExecutionEngineBindings.cpp index db3dead..6ff1e7a 100644 --- a/lib/ExecutionEngine/ExecutionEngineBindings.cpp +++ b/lib/ExecutionEngine/ExecutionEngineBindings.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "jit" #include "llvm-c/ExecutionEngine.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/GenericValue.h" @@ -23,17 +22,11 @@ using namespace llvm; +#define DEBUG_TYPE "jit" + // Wrapping the C bindings types. 
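// Editor's note: for readers new to the C-bindings glue, the
// DEFINE_SIMPLE_CONVERSION_FUNCTIONS(GenericValue, LLVMGenericValueRef)
// invocation below generates a wrap/unwrap pair morally equivalent to the
// hand-written LLVMTargetDataRef helpers deleted just after it, i.e. roughly:
//
//   inline GenericValue *unwrap(LLVMGenericValueRef P) {
//     return reinterpret_cast<GenericValue *>(P);
//   }
//   inline LLVMGenericValueRef wrap(const GenericValue *P) {
//     return reinterpret_cast<LLVMGenericValueRef>(
//         const_cast<GenericValue *>(P));
//   }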
DEFINE_SIMPLE_CONVERSION_FUNCTIONS(GenericValue, LLVMGenericValueRef) -inline DataLayout *unwrap(LLVMTargetDataRef P) { - return reinterpret_cast(P); -} - -inline LLVMTargetDataRef wrap(const DataLayout *P) { - return reinterpret_cast(const_cast(P)); -} - inline TargetLibraryInfo *unwrap(LLVMTargetLibraryInfoRef P) { return reinterpret_cast(P); } @@ -410,7 +403,7 @@ uint8_t *SimpleBindingMemoryManager::allocateDataSection( } bool SimpleBindingMemoryManager::finalizeMemory(std::string *ErrMsg) { - char *errMsgCString = 0; + char *errMsgCString = nullptr; bool result = Functions.FinalizeMemory(Opaque, &errMsgCString); assert((result || !errMsgCString) && "Did not expect an error message if FinalizeMemory succeeded"); @@ -433,7 +426,7 @@ LLVMMCJITMemoryManagerRef LLVMCreateSimpleMCJITMemoryManager( if (!AllocateCodeSection || !AllocateDataSection || !FinalizeMemory || !Destroy) - return NULL; + return nullptr; SimpleBindingMMFunctions functions; functions.AllocateCodeSection = AllocateCodeSection; diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp index 2ca4e3e..9a65fa0 100644 --- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp +++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp @@ -15,7 +15,6 @@ #include "llvm/Config/config.h" #include "llvm/ExecutionEngine/JITEventListener.h" -#define DEBUG_TYPE "amplifier-jit-event-listener" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/Metadata.h" @@ -34,6 +33,8 @@ using namespace llvm; using namespace llvm::jitprofiling; +#define DEBUG_TYPE "amplifier-jit-event-listener" + namespace { class IntelJITEventListener : public JITEventListener { @@ -193,11 +194,10 @@ void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) { MethodAddressVector Functions; // Use symbol info to iterate functions in the object. 
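// Editor's note: the hunk below tracks an ObjectFile interface change in
// this rebase -- symbol_iterator now behaves as an ordinary forward
// iterator, so the error_code-threaded I.increment(ec) protocol is replaced
// by a plain ++I. In sketch form:
//
//   for (object::symbol_iterator I = Obj.begin_symbols(),
//                                E = Obj.end_symbols();
//        I != E; ++I) {
//     // inspect *I
//   }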
- error_code ec; for (object::symbol_iterator I = Obj.begin_symbols(), E = Obj.end_symbols(); - I != E && !ec; - I.increment(ec)) { + I != E; + ++I) { std::vector LineInfo; std::string SourceFileName; @@ -234,7 +234,7 @@ void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) { FunctionMessage.line_number_table = 0; } else { SourceFileName = Lines.front().second.getFileName(); - FunctionMessage.source_file_name = (char *)SourceFileName.c_str(); + FunctionMessage.source_file_name = const_cast(SourceFileName.c_str()); FunctionMessage.line_number_size = LineInfo.size(); FunctionMessage.line_number_table = &*LineInfo.begin(); } diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp index 8a80285..93bb2d1 100644 --- a/lib/ExecutionEngine/Interpreter/Execution.cpp +++ b/lib/ExecutionEngine/Interpreter/Execution.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "interpreter" #include "Interpreter.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/Statistic.h" @@ -28,6 +27,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "interpreter" + STATISTIC(NumDynamicInsts, "Number of dynamic instructions executed"); static cl::opt PrintVolatile("interpreter-print-volatile", cl::Hidden, @@ -57,7 +58,7 @@ static void executeFAddInst(GenericValue &Dest, GenericValue Src1, IMPLEMENT_BINARY_OPERATOR(+, Double); default: dbgs() << "Unhandled type for FAdd instruction: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } } @@ -68,7 +69,7 @@ static void executeFSubInst(GenericValue &Dest, GenericValue Src1, IMPLEMENT_BINARY_OPERATOR(-, Double); default: dbgs() << "Unhandled type for FSub instruction: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } } @@ -79,7 +80,7 @@ static void executeFMulInst(GenericValue &Dest, GenericValue Src1, IMPLEMENT_BINARY_OPERATOR(*, Double); default: dbgs() << "Unhandled type for FMul instruction: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } } @@ -90,7 +91,7 @@ static void executeFDivInst(GenericValue &Dest, GenericValue Src1, IMPLEMENT_BINARY_OPERATOR(/, Double); default: dbgs() << "Unhandled type for FDiv instruction: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } } @@ -105,7 +106,7 @@ static void executeFRemInst(GenericValue &Dest, GenericValue Src1, break; default: dbgs() << "Unhandled type for Rem instruction: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } } @@ -142,7 +143,7 @@ static GenericValue executeICMP_EQ(GenericValue Src1, GenericValue Src2, IMPLEMENT_POINTER_ICMP(==); default: dbgs() << "Unhandled type for ICMP_EQ predicate: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -156,7 +157,7 @@ static GenericValue executeICMP_NE(GenericValue Src1, GenericValue Src2, IMPLEMENT_POINTER_ICMP(!=); default: dbgs() << "Unhandled type for ICMP_NE predicate: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -170,7 +171,7 @@ static GenericValue executeICMP_ULT(GenericValue Src1, GenericValue Src2, IMPLEMENT_POINTER_ICMP(<); default: dbgs() << "Unhandled type for ICMP_ULT predicate: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -184,7 +185,7 @@ static GenericValue executeICMP_SLT(GenericValue Src1, GenericValue Src2, IMPLEMENT_POINTER_ICMP(<); default: dbgs() << "Unhandled type for ICMP_SLT 
predicate: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -198,7 +199,7 @@ static GenericValue executeICMP_UGT(GenericValue Src1, GenericValue Src2, IMPLEMENT_POINTER_ICMP(>); default: dbgs() << "Unhandled type for ICMP_UGT predicate: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -212,7 +213,7 @@ static GenericValue executeICMP_SGT(GenericValue Src1, GenericValue Src2, IMPLEMENT_POINTER_ICMP(>); default: dbgs() << "Unhandled type for ICMP_SGT predicate: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -226,7 +227,7 @@ static GenericValue executeICMP_ULE(GenericValue Src1, GenericValue Src2, IMPLEMENT_POINTER_ICMP(<=); default: dbgs() << "Unhandled type for ICMP_ULE predicate: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -240,7 +241,7 @@ static GenericValue executeICMP_SLE(GenericValue Src1, GenericValue Src2, IMPLEMENT_POINTER_ICMP(<=); default: dbgs() << "Unhandled type for ICMP_SLE predicate: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -254,7 +255,7 @@ static GenericValue executeICMP_UGE(GenericValue Src1, GenericValue Src2, IMPLEMENT_POINTER_ICMP(>=); default: dbgs() << "Unhandled type for ICMP_UGE predicate: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -268,7 +269,7 @@ static GenericValue executeICMP_SGE(GenericValue Src1, GenericValue Src2, IMPLEMENT_POINTER_ICMP(>=); default: dbgs() << "Unhandled type for ICMP_SGE predicate: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -293,7 +294,7 @@ void Interpreter::visitICmpInst(ICmpInst &I) { case ICmpInst::ICMP_SGE: R = executeICMP_SGE(Src1, Src2, Ty); break; default: dbgs() << "Don't know how to handle this ICmp predicate!\n-->" << I; - llvm_unreachable(0); + llvm_unreachable(nullptr); } SetValue(&I, R, SF); @@ -329,7 +330,7 @@ static GenericValue executeFCMP_OEQ(GenericValue Src1, GenericValue Src2, IMPLEMENT_VECTOR_FCMP(==); default: dbgs() << "Unhandled type for FCmp EQ instruction: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -385,7 +386,7 @@ static GenericValue executeFCMP_ONE(GenericValue Src1, GenericValue Src2, IMPLEMENT_VECTOR_FCMP(!=); default: dbgs() << "Unhandled type for FCmp NE instruction: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } // in vector case mask out NaN elements if (Ty->isVectorTy()) @@ -405,7 +406,7 @@ static GenericValue executeFCMP_OLE(GenericValue Src1, GenericValue Src2, IMPLEMENT_VECTOR_FCMP(<=); default: dbgs() << "Unhandled type for FCmp LE instruction: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -419,7 +420,7 @@ static GenericValue executeFCMP_OGE(GenericValue Src1, GenericValue Src2, IMPLEMENT_VECTOR_FCMP(>=); default: dbgs() << "Unhandled type for FCmp GE instruction: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -433,7 +434,7 @@ static GenericValue executeFCMP_OLT(GenericValue Src1, GenericValue Src2, IMPLEMENT_VECTOR_FCMP(<); default: dbgs() << "Unhandled type for FCmp LT instruction: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -447,7 +448,7 @@ static GenericValue executeFCMP_OGT(GenericValue Src1, GenericValue Src2, IMPLEMENT_VECTOR_FCMP(>); default: dbgs() << "Unhandled type for FCmp GT 
instruction: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } return Dest; } @@ -615,7 +616,7 @@ void Interpreter::visitFCmpInst(FCmpInst &I) { switch (I.getPredicate()) { default: dbgs() << "Don't know how to handle this FCmp predicate!\n-->" << I; - llvm_unreachable(0); + llvm_unreachable(nullptr); break; case FCmpInst::FCMP_FALSE: R = executeFCMP_BOOL(Src1, Src2, Ty, false); break; @@ -672,7 +673,7 @@ static GenericValue executeCmpInst(unsigned predicate, GenericValue Src1, case FCmpInst::FCMP_TRUE: return executeFCMP_BOOL(Src1, Src2, Ty, true); default: dbgs() << "Unhandled Cmp predicate\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } } @@ -726,7 +727,7 @@ void Interpreter::visitBinaryOperator(BinaryOperator &I) { switch(I.getOpcode()){ default: dbgs() << "Don't know how to handle this binary operator!\n-->" << I; - llvm_unreachable(0); + llvm_unreachable(nullptr); break; case Instruction::Add: INTEGER_VECTOR_OPERATION(+) break; case Instruction::Sub: INTEGER_VECTOR_OPERATION(-) break; @@ -754,7 +755,7 @@ void Interpreter::visitBinaryOperator(BinaryOperator &I) { fmod(Src1.AggregateVal[i].DoubleVal, Src2.AggregateVal[i].DoubleVal); else { dbgs() << "Unhandled type for Rem instruction: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } } break; @@ -763,7 +764,7 @@ void Interpreter::visitBinaryOperator(BinaryOperator &I) { switch (I.getOpcode()) { default: dbgs() << "Don't know how to handle this binary operator!\n-->" << I; - llvm_unreachable(0); + llvm_unreachable(nullptr); break; case Instruction::Add: R.IntVal = Src1.IntVal + Src2.IntVal; break; case Instruction::Sub: R.IntVal = Src1.IntVal - Src2.IntVal; break; @@ -896,7 +897,7 @@ void Interpreter::visitSwitchInst(SwitchInst &I) { GenericValue CondVal = getOperandValue(Cond, SF); // Check to see if any of the cases match... - BasicBlock *Dest = 0; + BasicBlock *Dest = nullptr; for (SwitchInst::CaseIt i = I.case_begin(), e = I.case_end(); i != e; ++i) { GenericValue CaseVal = getOperandValue(i.getCaseValue(), SF); if (executeICMP_EQ(CondVal, CaseVal, ElTy).IntVal != 0) { @@ -979,7 +980,7 @@ void Interpreter::visitAllocaInst(AllocaInst &I) { << uintptr_t(Memory) << '\n'); GenericValue Result = PTOGV(Memory); - assert(Result.PointerVal != 0 && "Null pointer returned by malloc!"); + assert(Result.PointerVal && "Null pointer returned by malloc!"); SetValue(&I, Result, SF); if (I.getOpcode() == Instruction::Alloca) @@ -1732,7 +1733,7 @@ void Interpreter::visitVAArgInst(VAArgInst &I) { IMPLEMENT_VAARG(Double); default: dbgs() << "Unhandled dest type for vaarg instruction: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); } // Set the Value of this Instruction. @@ -1756,7 +1757,7 @@ void Interpreter::visitExtractElementInst(ExtractElementInst &I) { default: dbgs() << "Unhandled destination type for extractelement instruction: " << *Ty << "\n"; - llvm_unreachable(0); + llvm_unreachable(nullptr); break; case Type::IntegerTyID: Dest.IntVal = Src1.AggregateVal[indx].IntVal; @@ -2073,7 +2074,7 @@ GenericValue Interpreter::getOperandValue(Value *V, ExecutionContext &SF) { // void Interpreter::callFunction(Function *F, const std::vector &ArgVals) { - assert((ECStack.empty() || ECStack.back().Caller.getInstruction() == 0 || + assert((ECStack.empty() || !ECStack.back().Caller.getInstruction() || ECStack.back().Caller.arg_size() == ArgVals.size()) && "Incorrect number of arguments passed into function call!"); // Make a new stack frame... and fill it in. 
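// Editor's note: nearly all interpreter hunks above are instances of the
// mechanical C++11 cleanup this rebase applies tree-wide (0/NULL -> nullptr,
// explicit null comparisons -> boolean tests). The pattern in isolation, as
// a self-contained sketch:
struct Widget { int X; };

static Widget *find(bool Have) {
  static Widget W = {42};
  // Before the cleanup: return Have ? &W : 0;  (or NULL)
  return Have ? &W : nullptr;
}

static int use() {
  Widget *P = find(false);
  // Before the cleanup: if (P == 0) / if (P != NULL)
  if (!P)
    return -1;
  return P->X;
}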
diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp index a03c7f5..671bbee 100644 --- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp +++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp @@ -98,13 +98,13 @@ static ExFunc lookupFunction(const Function *F) { sys::ScopedLock Writer(*FunctionsLock); ExFunc FnPtr = FuncNames[ExtName]; - if (FnPtr == 0) + if (!FnPtr) FnPtr = FuncNames["lle_X_" + F->getName().str()]; - if (FnPtr == 0) // Try calling a generic function... if it exists... + if (!FnPtr) // Try calling a generic function... if it exists... FnPtr = (ExFunc)(intptr_t) sys::DynamicLibrary::SearchForAddressOfSymbol("lle_X_" + F->getName().str()); - if (FnPtr != 0) + if (FnPtr) ExportedFunctions->insert(std::make_pair(F, FnPtr)); // Cache for later return FnPtr; } diff --git a/lib/ExecutionEngine/Interpreter/Interpreter.cpp b/lib/ExecutionEngine/Interpreter/Interpreter.cpp index 6d4f6f7..c589457 100644 --- a/lib/ExecutionEngine/Interpreter/Interpreter.cpp +++ b/lib/ExecutionEngine/Interpreter/Interpreter.cpp @@ -38,7 +38,7 @@ ExecutionEngine *Interpreter::create(Module *M, std::string* ErrStr) { if (ErrStr) *ErrStr = EC.message(); // We got an error, just return 0 - return 0; + return nullptr; } return new Interpreter(M); diff --git a/lib/ExecutionEngine/Interpreter/Interpreter.h b/lib/ExecutionEngine/Interpreter/Interpreter.h index 2e93cae..2145cde 100644 --- a/lib/ExecutionEngine/Interpreter/Interpreter.h +++ b/lib/ExecutionEngine/Interpreter/Interpreter.h @@ -108,7 +108,7 @@ public: /// create - Create an interpreter ExecutionEngine. This can never fail. /// - static ExecutionEngine *create(Module *M, std::string *ErrorStr = 0); + static ExecutionEngine *create(Module *M, std::string *ErrorStr = nullptr); /// run - Start execution with the specified function and arguments. /// @@ -118,7 +118,7 @@ public: void *getPointerToNamedFunction(const std::string &Name, bool AbortOnFailure = true) override { // FIXME: not implemented. - return 0; + return nullptr; } /// recompileAndRelinkFunction - For the interpreter, functions are always diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp index d3ad77b..f8b2827 100644 --- a/lib/ExecutionEngine/JIT/JIT.cpp +++ b/lib/ExecutionEngine/JIT/JIT.cpp @@ -79,7 +79,7 @@ ExecutionEngine *JIT::createJIT(Module *M, // Try to register the program as a source of symbols to resolve against. // // FIXME: Don't do this here. - sys::DynamicLibrary::LoadLibraryPermanently(0, NULL); + sys::DynamicLibrary::LoadLibraryPermanently(nullptr, nullptr); // If the target supports JIT code generation, create the JIT. if (TargetJITInfo *TJ = TM->getJITInfo()) { @@ -87,7 +87,7 @@ ExecutionEngine *JIT::createJIT(Module *M, } else { if (ErrorStr) *ErrorStr = "target does not support JIT code generation"; - return 0; + return nullptr; } } @@ -157,7 +157,7 @@ JIT::JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji, // Turn the machine code intermediate representation into bytes in memory that // may be executed. - if (TM.addPassesToEmitMachineCode(PM, *JCE)) { + if (TM.addPassesToEmitMachineCode(PM, *JCE, !getVerifyModules())) { report_fatal_error("Target does not support machine code emission!"); } @@ -190,7 +190,7 @@ void JIT::addModule(Module *M) { // Turn the machine code intermediate representation into bytes in memory // that may be executed. 
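// Editor's note: the new third argument threaded through the
// addPassesToEmitMachineCode() call sites below is, to this editor's
// reading, TargetMachine's existing DisableVerify parameter; passing
// !getVerifyModules() ties IR verification in the legacy JIT path to the
// ExecutionEngine::VerifyModules flag introduced earlier in this patch
// (enabled by default only in !NDEBUG builds).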
- if (TM.addPassesToEmitMachineCode(PM, *JCE)) { + if (TM.addPassesToEmitMachineCode(PM, *JCE, !getVerifyModules())) { report_fatal_error("Target does not support machine code emission!"); } @@ -210,7 +210,7 @@ bool JIT::removeModule(Module *M) { if (jitstate && jitstate->getModule() == M) { delete jitstate; - jitstate = 0; + jitstate = nullptr; } if (!jitstate && !Modules.empty()) { @@ -222,7 +222,7 @@ bool JIT::removeModule(Module *M) { // Turn the machine code intermediate representation into bytes in memory // that may be executed. - if (TM.addPassesToEmitMachineCode(PM, *JCE)) { + if (TM.addPassesToEmitMachineCode(PM, *JCE, !getVerifyModules())) { report_fatal_error("Target does not support machine code emission!"); } @@ -353,7 +353,7 @@ GenericValue JIT::runFunction(Function *F, // currently don't support varargs. SmallVector Args; for (unsigned i = 0, e = ArgValues.size(); i != e; ++i) { - Constant *C = 0; + Constant *C = nullptr; Type *ArgTy = FTy->getParamType(i); const GenericValue &AV = ArgValues[i]; switch (ArgTy->getTypeID()) { @@ -406,13 +406,13 @@ GenericValue JIT::runFunction(Function *F, } void JIT::RegisterJITEventListener(JITEventListener *L) { - if (L == NULL) + if (!L) return; MutexGuard locked(lock); EventListeners.push_back(L); } void JIT::UnregisterJITEventListener(JITEventListener *L) { - if (L == NULL) + if (!L) return; MutexGuard locked(lock); std::vector::reverse_iterator I= @@ -584,7 +584,7 @@ void *JIT::getPointerToNamedFunction(const std::string &Name, report_fatal_error("Program used external function '"+Name+ "' which could not be resolved!"); } - return 0; + return nullptr; } @@ -604,7 +604,7 @@ void *JIT::getOrEmitGlobalVariable(const GlobalVariable *GV) { return (void*)&__dso_handle; #endif Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(GV->getName()); - if (Ptr == 0) { + if (!Ptr) { report_fatal_error("Could not resolve external global address: " +GV->getName()); } @@ -629,10 +629,10 @@ void *JIT::recompileAndRelinkFunction(Function *F) { void *OldAddr = getPointerToGlobalIfAvailable(F); // If it's not already compiled there is no reason to patch it up. - if (OldAddr == 0) { return getPointerToFunction(F); } + if (!OldAddr) return getPointerToFunction(F); // Delete the old function mapping. 
- addGlobalMapping(F, 0); + addGlobalMapping(F, nullptr); // Recodegen the function runJITOnFunction(F); diff --git a/lib/ExecutionEngine/JIT/JIT.h b/lib/ExecutionEngine/JIT/JIT.h index b1b0768..d2bd508 100644 --- a/lib/ExecutionEngine/JIT/JIT.h +++ b/lib/ExecutionEngine/JIT/JIT.h @@ -189,7 +189,7 @@ public: TargetMachine *TM); // Run the JIT on F and return information about the generated code - void runJITOnFunction(Function *F, MachineCodeInfo *MCI = 0) override; + void runJITOnFunction(Function *F, MachineCodeInfo *MCI = nullptr) override; void RegisterJITEventListener(JITEventListener *L) override; void UnregisterJITEventListener(JITEventListener *L) override; diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp index 9d215ec..cd7a500 100644 --- a/lib/ExecutionEngine/JIT/JITEmitter.cpp +++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "jit" #include "JIT.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" @@ -52,6 +51,8 @@ #endif using namespace llvm; +#define DEBUG_TYPE "jit" + STATISTIC(NumBytes, "Number of bytes of machine code compiled"); STATISTIC(NumRelos, "Number of relocations applied"); STATISTIC(NumRetries, "Number of retries with more memory"); @@ -343,7 +344,8 @@ namespace { void *FunctionBody; // Beginning of the function's allocation. void *Code; // The address the function's code actually starts at. void *ExceptionTable; - EmittedCode() : FunctionBody(0), Code(0), ExceptionTable(0) {} + EmittedCode() : FunctionBody(nullptr), Code(nullptr), + ExceptionTable(nullptr) {} }; struct EmittedFunctionConfig : public ValueMapConfig { typedef JITEmitter *ExtraData; @@ -360,7 +362,7 @@ namespace { public: JITEmitter(JIT &jit, JITMemoryManager *JMM, TargetMachine &TM) - : SizeEstimate(0), Resolver(jit, *this), MMI(0), CurFn(0), + : SizeEstimate(0), Resolver(jit, *this), MMI(nullptr), CurFn(nullptr), EmittedFunctions(this), TheJIT(&jit) { MemMgr = JMM ? JMM : JITMemoryManager::CreateDefaultMemManager(); if (jit.getJITInfo().needsGOT()) { @@ -516,7 +518,7 @@ void *JITResolver::getLazyFunctionStub(Function *F) { // Call the lazy resolver function if we are JIT'ing lazily. Otherwise we // must resolve the symbol now. void *Actual = TheJIT->isCompilingLazily() - ? (void *)(intptr_t)LazyResolverFn : (void *)0; + ? (void *)(intptr_t)LazyResolverFn : (void *)nullptr; // If this is an external declaration, attempt to resolve the address now // to place in the stub. @@ -525,7 +527,7 @@ void *JITResolver::getLazyFunctionStub(Function *F) { // If we resolved the symbol to a null address (eg. a weak external) // don't emit a stub. Return a null pointer to the application. 
- if (!Actual) return 0; + if (!Actual) return nullptr; } TargetJITInfo::StubLayout SL = TheJIT->getJITInfo().getStubLayout(); @@ -592,8 +594,8 @@ void *JITResolver::getExternalFunctionStub(void *FnAddr) { if (Stub) return Stub; TargetJITInfo::StubLayout SL = TheJIT->getJITInfo().getStubLayout(); - JE.startGVStub(0, SL.Size, SL.Alignment); - Stub = TheJIT->getJITInfo().emitFunctionStub(0, FnAddr, JE); + JE.startGVStub(nullptr, SL.Size, SL.Alignment); + Stub = TheJIT->getJITInfo().emitFunctionStub(nullptr, FnAddr, JE); JE.finishGVStub(); DEBUG(dbgs() << "JIT: Stub emitted at [" << Stub @@ -619,8 +621,8 @@ void *JITResolver::JITCompilerFn(void *Stub) { JITResolver *JR = StubToResolverMap->getResolverFromStub(Stub); assert(JR && "Unable to find the corresponding JITResolver to the call site"); - Function* F = 0; - void* ActualPtr = 0; + Function* F = nullptr; + void* ActualPtr = nullptr; { // Only lock for getting the Function. The call getPointerToFunction made @@ -688,7 +690,7 @@ void *JITEmitter::getPointerToGlobal(GlobalValue *V, void *Reference, return TheJIT->getOrEmitGlobalVariable(GV); if (GlobalAlias *GA = dyn_cast(V)) - return TheJIT->getPointerToGlobal(GA->getAliasedGlobal()); + return TheJIT->getPointerToGlobal(GA->getAliasee()); // If we have already compiled the function, return a pointer to its body. Function *F = cast(V); @@ -735,7 +737,7 @@ void JITEmitter::processDebugLoc(DebugLoc DL, bool BeforePrintingInsn) { const LLVMContext &Context = EmissionDetails.MF->getFunction()->getContext(); - if (DL.getScope(Context) != 0 && PrevDL != DL) { + if (DL.getScope(Context) != nullptr && PrevDL != DL) { JITEvent_EmittedFunctionDetails::LineStart NextLine; NextLine.Address = getCurrentPCValue(); NextLine.Loc = DL; @@ -824,7 +826,7 @@ bool JITEmitter::finishFunction(MachineFunction &F) { // Resolve the relocations to concrete pointers. for (unsigned i = 0, e = Relocations.size(); i != e; ++i) { MachineRelocation &MR = Relocations[i]; - void *ResultPtr = 0; + void *ResultPtr = nullptr; if (!MR.letTargetResolve()) { if (MR.isExternalSymbol()) { ResultPtr = TheJIT->getPointerToNamedFunction(MR.getExternalSymbol(), @@ -870,7 +872,7 @@ bool JITEmitter::finishFunction(MachineFunction &F) { } } - CurFn = 0; + CurFn = nullptr; TheJIT->getJITInfo().relocate(BufferBegin, &Relocations[0], Relocations.size(), MemMgr->getGOTBase()); } @@ -899,7 +901,7 @@ bool JITEmitter::finishFunction(MachineFunction &F) { SizeEstimate = 0; } - BufferBegin = CurBufferPtr = 0; + BufferBegin = CurBufferPtr = nullptr; NumBytes += FnEnd-FnStart; // Invalidate the icache if necessary. @@ -1017,7 +1019,7 @@ void JITEmitter::emitConstantPool(MachineConstantPool *MCP) { ConstantPoolBase = allocateSpace(Size, Align); ConstantPool = MCP; - if (ConstantPoolBase == 0) return; // Buffer overflow. + if (!ConstantPoolBase) return; // Buffer overflow. DEBUG(dbgs() << "JIT: Emitted constant pool at [" << ConstantPoolBase << "] (size: " << Size << ", alignment: " << Align << ")\n"); @@ -1073,7 +1075,7 @@ void JITEmitter::emitJumpTableInfo(MachineJumpTableInfo *MJTI) { return; const std::vector &JT = MJTI->getJumpTables(); - if (JT.empty() || JumpTableBase == 0) return; + if (JT.empty() || !JumpTableBase) return; switch (MJTI->getEntryKind()) { @@ -1243,7 +1245,7 @@ void JIT::updateFunctionStub(Function *F) { void JIT::freeMachineCodeForFunction(Function *F) { // Delete translation for this from the ExecutionEngine, so it will get // retranslated next time it is used. 
- updateGlobalMapping(F, 0); + updateGlobalMapping(F, nullptr); // Free the actual memory for the function body and related stuff. static_cast(JCE)->deallocateMemForFunction(F); diff --git a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp index 0d1ea02..584b93f 100644 --- a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp +++ b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "jit" #include "llvm/ExecutionEngine/JITMemoryManager.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -40,6 +39,8 @@ using namespace llvm; +#define DEBUG_TYPE "jit" + STATISTIC(NumSlabs, "Number of slabs of memory allocated by the JIT"); JITMemoryManager::~JITMemoryManager() {} @@ -80,7 +81,7 @@ namespace { /// getFreeBlockBefore - If the block before this one is free, return it, /// otherwise return null. FreeRangeHeader *getFreeBlockBefore() const { - if (PrevAllocated) return 0; + if (PrevAllocated) return nullptr; intptr_t PrevSize = reinterpret_cast( const_cast(this))[-1]; return reinterpret_cast( @@ -174,7 +175,7 @@ FreeRangeHeader *MemoryRangeHeader::FreeBlock(FreeRangeHeader *FreeList) { // coalesce with it, update our notion of what the free list is. if (&FollowingFreeBlock == FreeList) { FreeList = FollowingFreeBlock.Next; - FreeListToReturn = 0; + FreeListToReturn = nullptr; assert(&FollowingFreeBlock != FreeList && "No tombstone block?"); } FollowingFreeBlock.RemoveFromFreeList(); @@ -269,13 +270,12 @@ namespace { class DefaultJITMemoryManager; - class JITSlabAllocator : public SlabAllocator { + class JITAllocator { DefaultJITMemoryManager &JMM; public: - JITSlabAllocator(DefaultJITMemoryManager &jmm) : JMM(jmm) { } - virtual ~JITSlabAllocator() { } - MemSlab *Allocate(size_t Size) override; - void Deallocate(MemSlab *Slab) override; + JITAllocator(DefaultJITMemoryManager &jmm) : JMM(jmm) { } + void *Allocate(size_t Size, size_t /*Alignment*/); + void Deallocate(void *Slab, size_t Size); }; /// DefaultJITMemoryManager - Manage memory for the JIT code generation. @@ -313,9 +313,10 @@ namespace { // Memory slabs allocated by the JIT. We refer to them as slabs so we don't // confuse them with the blocks of memory described above. std::vector CodeSlabs; - JITSlabAllocator BumpSlabAllocator; - BumpPtrAllocatorImpl StubAllocator; - BumpPtrAllocatorImpl DataAllocator; + BumpPtrAllocatorImpl StubAllocator; + BumpPtrAllocatorImpl DataAllocator; // Circular list of free blocks. FreeRangeHeader *FreeMemoryList; @@ -568,30 +569,24 @@ namespace { }; } -MemSlab *JITSlabAllocator::Allocate(size_t Size) { +void *JITAllocator::Allocate(size_t Size, size_t /*Alignment*/) { sys::MemoryBlock B = JMM.allocateNewSlab(Size); - MemSlab *Slab = (MemSlab*)B.base(); - Slab->Size = B.size(); - Slab->NextPtr = 0; - return Slab; + return B.base(); } -void JITSlabAllocator::Deallocate(MemSlab *Slab) { - sys::MemoryBlock B(Slab, Slab->Size); +void JITAllocator::Deallocate(void *Slab, size_t Size) { + sys::MemoryBlock B(Slab, Size); sys::Memory::ReleaseRWX(B); } DefaultJITMemoryManager::DefaultJITMemoryManager() - : + : #ifdef NDEBUG - PoisonMemory(false), + PoisonMemory(false), #else - PoisonMemory(true), + PoisonMemory(true), #endif - LastSlab(0, 0), - BumpSlabAllocator(*this), - StubAllocator(BumpSlabAllocator), - DataAllocator(BumpSlabAllocator) { + LastSlab(nullptr, 0), StubAllocator(*this), DataAllocator(*this) { // Allocate space for code. 
sys::MemoryBlock MemBlock = allocateNewSlab(DefaultCodeSlabSize); @@ -644,11 +639,11 @@ DefaultJITMemoryManager::DefaultJITMemoryManager() // Start out with the freelist pointing to Mem0. FreeMemoryList = Mem0; - GOTBase = NULL; + GOTBase = nullptr; } void DefaultJITMemoryManager::AllocateGOT() { - assert(GOTBase == 0 && "Cannot allocate the got multiple times"); + assert(!GOTBase && "Cannot allocate the got multiple times"); GOTBase = new uint8_t[sizeof(void*) * 8192]; HasGOT = true; } @@ -663,9 +658,9 @@ DefaultJITMemoryManager::~DefaultJITMemoryManager() { sys::MemoryBlock DefaultJITMemoryManager::allocateNewSlab(size_t size) { // Allocate a new block close to the last one. std::string ErrMsg; - sys::MemoryBlock *LastSlabPtr = LastSlab.base() ? &LastSlab : 0; + sys::MemoryBlock *LastSlabPtr = LastSlab.base() ? &LastSlab : nullptr; sys::MemoryBlock B = sys::Memory::AllocateRWX(size, LastSlabPtr, &ErrMsg); - if (B.base() == 0) { + if (!B.base()) { report_fatal_error("Allocation failed when allocating new memory in the" " JIT\n" + Twine(ErrMsg)); } @@ -726,7 +721,7 @@ bool DefaultJITMemoryManager::CheckInvariants(std::string &ErrorStr) { char *End = Start + I->size(); // Check each memory range. - for (MemoryRangeHeader *Hdr = (MemoryRangeHeader*)Start, *LastHdr = NULL; + for (MemoryRangeHeader *Hdr = (MemoryRangeHeader*)Start, *LastHdr = nullptr; Start <= (char*)Hdr && (char*)Hdr < End; Hdr = &Hdr->getBlockAfter()) { if (Hdr->ThisAllocated == 0) { @@ -895,7 +890,7 @@ void *DefaultJITMemoryManager::getPointerToNamedFunction(const std::string &Name report_fatal_error("Program used external function '"+Name+ "' which could not be resolved!"); } - return 0; + return nullptr; } diff --git a/lib/ExecutionEngine/MCJIT/LLVMBuild.txt b/lib/ExecutionEngine/MCJIT/LLVMBuild.txt index 90f4d2f..922cd0d 100644 --- a/lib/ExecutionEngine/MCJIT/LLVMBuild.txt +++ b/lib/ExecutionEngine/MCJIT/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = MCJIT parent = ExecutionEngine -required_libraries = Core ExecutionEngine RuntimeDyld Support Target +required_libraries = Core ExecutionEngine Object RuntimeDyld Support Target diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp index 49b6727..42cb4ea 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -50,7 +50,7 @@ ExecutionEngine *MCJIT::createJIT(Module *M, // Try to register the program as a source of symbols to resolve against. // // FIXME: Don't do this here. - sys::DynamicLibrary::LoadLibraryPermanently(0, NULL); + sys::DynamicLibrary::LoadLibraryPermanently(nullptr, nullptr); return new MCJIT(M, TM, MemMgr ? 
MemMgr : new SectionMemoryManager(), GVsWithCode); @@ -58,8 +58,8 @@ ExecutionEngine *MCJIT::createJIT(Module *M, MCJIT::MCJIT(Module *m, TargetMachine *tm, RTDyldMemoryManager *MM, bool AllocateGVsWithCode) - : ExecutionEngine(m), TM(tm), Ctx(0), MemMgr(this, MM), Dyld(&MemMgr), - ObjCache(0) { + : ExecutionEngine(m), TM(tm), Ctx(nullptr), MemMgr(this, MM), Dyld(&MemMgr), + ObjCache(nullptr) { OwnedModules.addModule(m); setDataLayout(TM->getDataLayout()); @@ -113,8 +113,8 @@ bool MCJIT::removeModule(Module *M) { -void MCJIT::addObjectFile(object::ObjectFile *Obj) { - ObjectImage *LoadedObject = Dyld.loadObject(Obj); +void MCJIT::addObjectFile(std::unique_ptr Obj) { + ObjectImage *LoadedObject = Dyld.loadObject(std::move(Obj)); if (!LoadedObject || Dyld.hasError()) report_fatal_error(Dyld.getErrorString()); @@ -150,7 +150,8 @@ ObjectBufferStream* MCJIT::emitObject(Module *M) { // Turn the machine code intermediate representation into bytes in memory // that may be executed. - if (TM->addPassesToEmitMC(PM, Ctx, CompiledObject->getOStream(), false)) { + if (TM->addPassesToEmitMC(PM, Ctx, CompiledObject->getOStream(), + !getVerifyModules())) { report_fatal_error("Target does not support MC emission!"); } @@ -185,9 +186,9 @@ void MCJIT::generateCodeForModule(Module *M) { std::unique_ptr ObjectToLoad; // Try to load the pre-compiled object from cache if possible - if (0 != ObjCache) { + if (ObjCache) { std::unique_ptr PreCompiledObject(ObjCache->getObject(M)); - if (0 != PreCompiledObject.get()) + if (PreCompiledObject.get()) ObjectToLoad.reset(new ObjectBuffer(PreCompiledObject.release())); } @@ -285,7 +286,7 @@ Module *MCJIT::findModuleForSymbol(const std::string &Name, } } // We didn't find the symbol in any of our modules. - return NULL; + return nullptr; } uint64_t MCJIT::getSymbolAddress(const std::string &Name, @@ -307,10 +308,10 @@ uint64_t MCJIT::getSymbolAddress(const std::string &Name, std::unique_ptr ChildBin; // FIXME: Support nested archives? if (!ChildIt->getAsBinary(ChildBin) && ChildBin->isObject()) { - object::ObjectFile *OF = reinterpret_cast( - ChildBin.release()); + std::unique_ptr OF( + static_cast(ChildBin.release())); // This causes the object file to be loaded. - addObjectFile(OF); + addObjectFile(std::move(OF)); // The address should be here now. Addr = getExistingSymbolAddress(Name); if (Addr) @@ -365,7 +366,7 @@ void *MCJIT::getPointerToFunction(Function *F) { generateCodeForModule(M); else if (!OwnedModules.hasModuleBeenLoaded(M)) // If this function doesn't belong to one of our modules, we're done. - return NULL; + return nullptr; // FIXME: Should the Dyld be retaining module information? Probably not. 
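generateCodeForModule above treats the cache as authoritative: it asks ObjCache->getObject(M) first and only runs codegen on a miss, with the emitted buffer expected to be handed back to the cache for next time. A self-contained sketch of that lookup-else-compile flow, with std::string and std::map standing in for MCJIT's ObjectBuffer and ObjectCache types (all names below are illustrative, not the LLVM API):

    #include <iostream>
    #include <map>
    #include <string>

    using Object = std::string; // stand-in for a compiled object buffer

    struct ToyObjectCache {
      std::map<std::string, Object> Cached; // keyed by module identifier
      const Object *getObject(const std::string &ModuleID) const {
        auto It = Cached.find(ModuleID);
        return It == Cached.end() ? nullptr : &It->second;
      }
      void notifyObjectCompiled(const std::string &ModuleID, Object Obj) {
        Cached[ModuleID] = std::move(Obj);
      }
    };

    Object compile(const std::string &ModuleID) { // stands in for emitObject()
      std::cout << "compiling " << ModuleID << "\n";
      return "objcode:" + ModuleID;
    }

    Object getOrCompile(ToyObjectCache &C, const std::string &ModuleID) {
      if (const Object *Hit = C.getObject(ModuleID)) // cache hit: skip codegen
        return *Hit;
      Object O = compile(ModuleID);
      C.notifyObjectCompiled(ModuleID, O); // populate the cache for next time
      return O;
    }

    int main() {
      ToyObjectCache C;
      getOrCompile(C, "m.ll"); // compiles
      getOrCompile(C, "m.ll"); // served from cache; prints nothing
    }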
// @@ -409,7 +410,7 @@ Function *MCJIT::FindFunctionNamedInModulePtrSet(const char *FnName, if (Function *F = (*I)->getFunction(FnName)) return F; } - return 0; + return nullptr; } Function *MCJIT::FindFunctionNamed(const char *FnName) { @@ -541,17 +542,17 @@ void *MCJIT::getPointerToNamedFunction(const std::string &Name, report_fatal_error("Program used external function '"+Name+ "' which could not be resolved!"); } - return 0; + return nullptr; } void MCJIT::RegisterJITEventListener(JITEventListener *L) { - if (L == NULL) + if (!L) return; MutexGuard locked(lock); EventListeners.push_back(L); } void MCJIT::UnregisterJITEventListener(JITEventListener *L) { - if (L == NULL) + if (!L) return; MutexGuard locked(lock); SmallVector::reverse_iterator I= diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h index 066eceb..100e9a2 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.h +++ b/lib/ExecutionEngine/MCJIT/MCJIT.h @@ -71,7 +71,7 @@ public: ClientMM->deregisterEHFrames(Addr, LoadAddr, Size); } - bool finalizeMemory(std::string *ErrMsg = 0) override { + bool finalizeMemory(std::string *ErrMsg = nullptr) override { return ClientMM->finalizeMemory(ErrMsg); } @@ -239,7 +239,7 @@ public: /// @name ExecutionEngine interface implementation /// @{ void addModule(Module *M) override; - void addObjectFile(object::ObjectFile *O) override; + void addObjectFile(std::unique_ptr O) override; void addArchive(object::Archive *O) override; bool removeModule(Module *M) override; diff --git a/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp b/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp index f1dd5a6..9ceaa90 100644 --- a/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp +++ b/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp @@ -79,7 +79,7 @@ uint8_t *SectionMemoryManager::allocateSection(MemoryGroup &MemGroup, ec); if (ec) { // FIXME: Add error propagation to the interface. - return NULL; + return nullptr; } // Save this address as the basis for our next request diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp index 87cef2e..fd37a13 100644 --- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp +++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp @@ -15,7 +15,6 @@ #include "llvm/Config/config.h" #include "llvm/ExecutionEngine/JITEventListener.h" -#define DEBUG_TYPE "oprofile-jit-event-listener" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Function.h" #include "llvm/CodeGen/MachineFunction.h" @@ -33,6 +32,8 @@ using namespace llvm; using namespace llvm::jitprofiling; +#define DEBUG_TYPE "oprofile-jit-event-listener" + namespace { class OProfileJITEventListener : public JITEventListener { @@ -170,11 +171,8 @@ void OProfileJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) { } // Use symbol info to iterate functions in the object. - error_code ec; - for (object::symbol_iterator I = Obj.begin_symbols(), - E = Obj.end_symbols(); - I != E && !ec; - I.increment(ec)) { + for (object::symbol_iterator I = Obj.begin_symbols(), E = Obj.end_symbols(); + I != E; ++I) { object::SymbolRef::Type SymType; if (I->getType(SymType)) continue; if (SymType == object::SymbolRef::ST_Function) { @@ -203,11 +201,8 @@ void OProfileJITEventListener::NotifyFreeingObject(const ObjectImage &Obj) { } // Use symbol info to iterate functions in the object. 
- error_code ec; - for (object::symbol_iterator I = Obj.begin_symbols(), - E = Obj.end_symbols(); - I != E && !ec; - I.increment(ec)) { + for (object::symbol_iterator I = Obj.begin_symbols(), E = Obj.end_symbols(); + I != E; ++I) { object::SymbolRef::Type SymType; if (I->getType(SymType)) continue; if (SymType == object::SymbolRef::ST_Function) { diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp index 6702e20..04edbd2 100644 --- a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp +++ b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "oprofile-wrapper" #include "llvm/ExecutionEngine/OProfileWrapper.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/Debug.h" @@ -29,6 +28,8 @@ #include #include +#define DEBUG_TYPE "oprofile-wrapper" + namespace { // Global mutex to ensure a single thread initializes oprofile agent. diff --git a/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp b/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp index 1d0e9b3..8546571 100644 --- a/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp @@ -45,7 +45,7 @@ extern "C" { // We put information about the JITed function in this global, which the // debugger reads. Make sure to specify the version statically, because the // debugger checks the version before we can set it during runtime. - struct jit_descriptor __jit_debug_descriptor = { 1, 0, 0, 0 }; + struct jit_descriptor __jit_debug_descriptor = { 1, 0, nullptr, nullptr }; // Debuggers put a breakpoint in this function. LLVM_ATTRIBUTE_NOINLINE void __jit_debug_register_code() { @@ -108,10 +108,10 @@ void NotifyDebugger(jit_code_entry* JITCodeEntry) { __jit_debug_descriptor.action_flag = JIT_REGISTER_FN; // Insert this entry at the head of the list.
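For reference on the descriptor initialized above, and on the head-insertion that continues just below: the GDB JIT interface is a small fixed ABI. The debugger plants a breakpoint on __jit_debug_register_code() and, whenever it fires, walks the doubly-linked list rooted at __jit_debug_descriptor.first_entry. A stripped-down standalone sketch of the handshake (struct layout per the documented GDB JIT interface; linking this into a process that already defines these symbols would collide, so it is illustration only):

    #include <cstdint>

    extern "C" {
      typedef enum { JIT_NOACTION = 0, JIT_REGISTER_FN, JIT_UNREGISTER_FN } jit_actions_t;

      struct jit_code_entry {
        jit_code_entry *next_entry;
        jit_code_entry *prev_entry;
        const char *symfile_addr; // in-memory object file carrying debug info
        uint64_t symfile_size;
      };

      struct jit_descriptor {
        uint32_t version;         // must be 1, set statically (as above)
        uint32_t action_flag;
        jit_code_entry *relevant_entry;
        jit_code_entry *first_entry;
      };

      jit_descriptor __jit_debug_descriptor = { 1, 0, nullptr, nullptr };

      // GDB breakpoints here; the body only needs to exist.
      __attribute__((noinline)) void __jit_debug_register_code() {
        asm volatile("" ::: "memory"); // keep the call from being optimized away
      }
    }

    // Head-insert a new entry and poke the debugger, as NotifyDebugger does.
    // Deregistration is symmetric: unlink, set JIT_UNREGISTER_FN, poke again.
    void registerWithGDB(jit_code_entry *E) {
      E->prev_entry = nullptr;
      E->next_entry = __jit_debug_descriptor.first_entry;
      if (E->next_entry)
        E->next_entry->prev_entry = E;
      __jit_debug_descriptor.first_entry = E;
      __jit_debug_descriptor.relevant_entry = E;
      __jit_debug_descriptor.action_flag = JIT_REGISTER_FN;
      __jit_debug_register_code();
    }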
- JITCodeEntry->prev_entry = NULL; + JITCodeEntry->prev_entry = nullptr; jit_code_entry* NextEntry = __jit_debug_descriptor.first_entry; JITCodeEntry->next_entry = NextEntry; - if (NextEntry != NULL) { + if (NextEntry) { NextEntry->prev_entry = JITCodeEntry; } __jit_debug_descriptor.first_entry = JITCodeEntry; @@ -142,11 +142,10 @@ void GDBJITRegistrar::registerObject(const ObjectBuffer &Object) { "Second attempt to perform debug registration."); jit_code_entry* JITCodeEntry = new jit_code_entry(); - if (JITCodeEntry == 0) { + if (!JITCodeEntry) { llvm::report_fatal_error( "Allocation failed when registering a JIT entry!\n"); - } - else { + } else { JITCodeEntry->symfile_addr = Buffer; JITCodeEntry->symfile_size = Size; @@ -198,7 +197,7 @@ void GDBJITRegistrar::deregisterObjectInternal( } delete JITCodeEntry; - JITCodeEntry = NULL; + JITCodeEntry = nullptr; } llvm::ManagedStatic TheRegistrar; diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h index 3693c69..4917b93 100644 --- a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h +++ b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h @@ -18,6 +18,8 @@ #include "llvm/ExecutionEngine/ObjectImage.h" #include "llvm/Object/ObjectFile.h" +#include + namespace llvm { namespace object { @@ -30,13 +32,13 @@ class ObjectImageCommon : public ObjectImage { void anchor() override; protected: - object::ObjectFile *ObjFile; + std::unique_ptr ObjFile; // This form of the constructor allows subclasses to use // format-specific subclasses of ObjectFile directly - ObjectImageCommon(ObjectBuffer *Input, object::ObjectFile *Obj) + ObjectImageCommon(ObjectBuffer *Input, std::unique_ptr Obj) : ObjectImage(Input), // saves Input as Buffer and takes ownership - ObjFile(Obj) + ObjFile(std::move(Obj)) { } @@ -44,12 +46,13 @@ public: ObjectImageCommon(ObjectBuffer* Input) : ObjectImage(Input) // saves Input as Buffer and takes ownership { - ObjFile = - object::ObjectFile::createObjectFile(Buffer->getMemBuffer()).get(); + // FIXME: error checking? createObjectFile returns an ErrorOr + // and should probably be checked for failure. 
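On the FIXME ending above: createObjectFile returns an ErrorOr in this tree, and calling .get() unconditionally (as the line that follows does) silently drops any failure. A sketch of the checked pattern the FIXME is asking for, using a minimal stand-in for ErrorOr rather than the real llvm/Support/ErrorOr.h (ObjectFile and createObjectFile below are likewise stand-ins):

    #include <iostream>
    #include <memory>
    #include <system_error>

    // Minimal stand-in for llvm::ErrorOr<T>: holds either a value or an error.
    template <typename T> class ErrorOr {
      std::error_code EC;
      T Val{};
    public:
      ErrorOr(T V) : Val(std::move(V)) {}
      ErrorOr(std::error_code EC) : EC(EC) {}
      std::error_code getError() const { return EC; }
      T &get() { return Val; }
    };

    struct ObjectFile { /* elided */ };

    ErrorOr<std::shared_ptr<ObjectFile>> createObjectFile(bool Ok) {
      if (!Ok)
        return std::make_error_code(std::errc::invalid_argument);
      return std::make_shared<ObjectFile>();
    }

    int main() {
      auto ObjOrErr = createObjectFile(false);
      if (std::error_code EC = ObjOrErr.getError()) { // check before .get()
        std::cerr << "createObjectFile failed: " << EC.message() << "\n";
        return 1;
      }
      std::shared_ptr<ObjectFile> Obj = ObjOrErr.get(); // safe on this path
    }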
+ ObjFile.reset(object::ObjectFile::createObjectFile(Buffer->getMemBuffer()).get()); } - ObjectImageCommon(object::ObjectFile* Input) - : ObjectImage(NULL), ObjFile(Input) {} - virtual ~ObjectImageCommon() { delete ObjFile; } + ObjectImageCommon(std::unique_ptr Input) + : ObjectImage(nullptr), ObjFile(std::move(Input)) {} + virtual ~ObjectImageCommon() { } object::symbol_iterator begin_symbols() const override { return ObjFile->symbol_begin(); } @@ -66,7 +69,7 @@ public: StringRef getData() const override { return ObjFile->getData(); } - object::ObjectFile* getObjectFile() const override { return ObjFile; } + object::ObjectFile* getObjectFile() const override { return ObjFile.get(); } // Subclasses can override these methods to update the image with loaded // addresses for sections and common symbols diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index 986d3a0..c1eb0fd 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "dyld" #include "llvm/ExecutionEngine/RuntimeDyld.h" #include "JITRegistrar.h" #include "ObjectImageCommon.h" @@ -25,6 +24,8 @@ using namespace llvm; using namespace llvm::object; +#define DEBUG_TYPE "dyld" + // Empty out-of-line virtual destructor as the key function. RuntimeDyldImpl::~RuntimeDyldImpl() {} @@ -72,12 +73,40 @@ void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress, llvm_unreachable("Attempting to remap address of unknown section!"); } +static error_code getOffset(const SymbolRef &Sym, uint64_t &Result) { + uint64_t Address; + if (error_code EC = Sym.getAddress(Address)) + return EC; + + if (Address == UnknownAddressOrSize) { + Result = UnknownAddressOrSize; + return object_error::success; + } + + const ObjectFile *Obj = Sym.getObject(); + section_iterator SecI(Obj->section_begin()); + if (error_code EC = Sym.getSection(SecI)) + return EC; + + if (SecI == Obj->section_end()) { + Result = UnknownAddressOrSize; + return object_error::success; + } + + uint64_t SectionAddress; + if (error_code EC = SecI->getAddress(SectionAddress)) + return EC; + + Result = Address - SectionAddress; + return object_error::success; +} + ObjectImage *RuntimeDyldImpl::loadObject(ObjectImage *InputObject) { MutexGuard locked(lock); std::unique_ptr Obj(InputObject); if (!Obj) - return NULL; + return nullptr; // Save information about our target Arch = (Triple::ArchType)Obj->getArch(); @@ -115,36 +144,33 @@ ObjectImage *RuntimeDyldImpl::loadObject(ObjectImage *InputObject) { bool IsCommon = Flags & SymbolRef::SF_Common; if (IsCommon) { // Add the common symbols to a list. We'll allocate them all below. 
- uint32_t Align; - Check(I->getAlignment(Align)); - uint64_t Size = 0; - Check(I->getSize(Size)); - CommonSize += Size + Align; - CommonSymbols[*I] = CommonSymbolInfo(Size, Align); + if (!GlobalSymbolTable.count(Name)) { + uint32_t Align; + Check(I->getAlignment(Align)); + uint64_t Size = 0; + Check(I->getSize(Size)); + CommonSize += Size + Align; + CommonSymbols[*I] = CommonSymbolInfo(Size, Align); + } } else { if (SymType == object::SymbolRef::ST_Function || SymType == object::SymbolRef::ST_Data || SymType == object::SymbolRef::ST_Unknown) { - uint64_t FileOffset; + uint64_t SectOffset; StringRef SectionData; bool IsCode; section_iterator SI = Obj->end_sections(); - Check(I->getFileOffset(FileOffset)); + Check(getOffset(*I, SectOffset)); Check(I->getSection(SI)); if (SI == Obj->end_sections()) continue; Check(SI->getContents(SectionData)); Check(SI->isText(IsCode)); - const uint8_t *SymPtr = - (const uint8_t *)Obj->getData().data() + (uintptr_t)FileOffset; - uintptr_t SectOffset = - (uintptr_t)(SymPtr - (const uint8_t *)SectionData.begin()); unsigned SectionID = findOrEmitSection(*Obj, *SI, IsCode, LocalSections); LocalSymbols[Name.data()] = SymbolLoc(SectionID, SectOffset); - DEBUG(dbgs() << "\tFileOffset: " << format("%p", (uintptr_t)FileOffset) - << " flags: " << Flags << " SID: " << SectionID - << " Offset: " << format("%p", SectOffset)); + DEBUG(dbgs() << "\tOffset: " << format("%p", (uintptr_t)SectOffset) + << " flags: " << Flags << " SID: " << SectionID); GlobalSymbolTable[Name] = SymbolLoc(SectionID, SectOffset); } } @@ -153,7 +179,7 @@ ObjectImage *RuntimeDyldImpl::loadObject(ObjectImage *InputObject) { // Allocate common symbols if (CommonSize != 0) - emitCommonSymbols(*Obj, CommonSymbols, CommonSize, LocalSymbols); + emitCommonSymbols(*Obj, CommonSymbols, CommonSize, GlobalSymbolTable); // Parse and process relocations DEBUG(dbgs() << "Parse relocations:\n"); @@ -163,7 +189,10 @@ ObjectImage *RuntimeDyldImpl::loadObject(ObjectImage *InputObject) { StubMap Stubs; section_iterator RelocatedSection = SI->getRelocatedSection(); - if (SI->relocation_empty() && !ProcessAllSections) + relocation_iterator I = SI->relocation_begin(); + relocation_iterator E = SI->relocation_end(); + + if (I == E && !ProcessAllSections) continue; bool IsCode = false; @@ -172,14 +201,13 @@ ObjectImage *RuntimeDyldImpl::loadObject(ObjectImage *InputObject) { findOrEmitSection(*Obj, *RelocatedSection, IsCode, LocalSections); DEBUG(dbgs() << "\tSectionID: " << SectionID << "\n"); - for (relocation_iterator I = SI->relocation_begin(), - E = SI->relocation_end(); I != E;) + for (; I != E;) I = processRelocationRef(SectionID, I, *Obj, LocalSections, LocalSymbols, Stubs); } // Give the subclasses a chance to tie-up any loose ends. - finalizeLoad(LocalSections); + finalizeLoad(*Obj, LocalSections); return Obj.release(); } @@ -400,7 +428,7 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj, uintptr_t Allocate; unsigned SectionID = Sections.size(); uint8_t *Addr; - const char *pData = 0; + const char *pData = nullptr; // Some sections, such as debug info, don't need to be loaded for execution. // Leave those where they are. @@ -441,7 +469,7 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj, // to handle later processing (and by 'handle' I mean don't do anything // with these sections). 
Allocate = 0; - Addr = 0; + Addr = nullptr; DEBUG(dbgs() << "emitSection SectionID: " << SectionID << " Name: " << Name << " obj addr: " << format("%p", data.data()) << " new addr: 0" << " DataSize: " << DataSize << " StubBufSize: " << StubBufSize @@ -490,7 +518,8 @@ void RuntimeDyldImpl::addRelocationForSymbol(const RelocationEntry &RE, } uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) { - if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be) { + if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be || + Arch == Triple::arm64 || Arch == Triple::arm64_be) { // This stub has to be able to access the full address space, // since symbol lookup won't necessarily find a handy, in-range, // PLT stub for functions which could be anywhere. @@ -560,6 +589,8 @@ uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) { *Addr = 0xFF; // jmp *(Addr+1) = 0x25; // rip // 32-bit PC-relative address of the GOT entry will be stored at Addr+2 + } else if (Arch == Triple::x86) { + *Addr = 0xE9; // 32-bit pc-relative jump. } return Addr; } @@ -586,7 +617,7 @@ void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs, for (unsigned i = 0, e = Relocs.size(); i != e; ++i) { const RelocationEntry &RE = Relocs[i]; // Ignore relocations for sections that were not loaded - if (Sections[RE.SectionID].Address == 0) + if (Sections[RE.SectionID].Address == nullptr) continue; resolveRelocation(RE, Value); } @@ -651,7 +682,7 @@ RuntimeDyld::RuntimeDyld(RTDyldMemoryManager *mm) { // though the public class spawns a new 'impl' instance for each load, // they share a single memory manager. This can become a problem when page // permissions are applied. - Dyld = 0; + Dyld = nullptr; MM = mm; ProcessAllSections = false; } @@ -672,21 +703,23 @@ createRuntimeDyldMachO(RTDyldMemoryManager *MM, bool ProcessAllSections) { return Dyld; } -ObjectImage *RuntimeDyld::loadObject(ObjectFile *InputObject) { +ObjectImage *RuntimeDyld::loadObject(std::unique_ptr InputObject) { std::unique_ptr InputImage; + ObjectFile &Obj = *InputObject; + if (InputObject->isELF()) { - InputImage.reset(RuntimeDyldELF::createObjectImageFromFile(InputObject)); + InputImage.reset(RuntimeDyldELF::createObjectImageFromFile(std::move(InputObject))); if (!Dyld) Dyld = createRuntimeDyldELF(MM, ProcessAllSections).release(); } else if (InputObject->isMachO()) { - InputImage.reset(RuntimeDyldMachO::createObjectImageFromFile(InputObject)); + InputImage.reset(RuntimeDyldMachO::createObjectImageFromFile(std::move(InputObject))); if (!Dyld) Dyld = createRuntimeDyldMachO(MM, ProcessAllSections).release(); } else report_fatal_error("Incompatible object format!"); - if (!Dyld->isCompatibleFile(InputObject)) + if (!Dyld->isCompatibleFile(&Obj)) report_fatal_error("Incompatible object format!"); Dyld->loadObject(InputImage.get()); @@ -740,7 +773,7 @@ ObjectImage *RuntimeDyld::loadObject(ObjectBuffer *InputBuffer) { void *RuntimeDyld::getSymbolAddress(StringRef Name) { if (!Dyld) - return NULL; + return nullptr; return Dyld->getSymbolAddress(Name); } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 3204b81..6ba24b9 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "dyld" #include "RuntimeDyldELF.h" #include "JITRegistrar.h" #include "ObjectImageCommon.h" @@ -29,6 +28,8 @@ 
using namespace llvm; using namespace llvm::object; +#define DEBUG_TYPE "dyld" + namespace { static inline error_code check(error_code Err) { @@ -50,7 +51,12 @@ template class DyldELFObject : public ELFObjectFile { typedef typename ELFDataTypeTypedefHelper::value_type addr_type; + std::unique_ptr UnderlyingFile; + public: + DyldELFObject(std::unique_ptr UnderlyingFile, + MemoryBuffer *Wrapper, error_code &ec); + DyldELFObject(MemoryBuffer *Wrapper, error_code &ec); void updateSectionAddress(const SectionRef &Sec, uint64_t Addr); @@ -67,13 +73,11 @@ public: }; template class ELFObjectImage : public ObjectImageCommon { -protected: - DyldELFObject *DyldObj; bool Registered; public: - ELFObjectImage(ObjectBuffer *Input, DyldELFObject *Obj) - : ObjectImageCommon(Input, Obj), DyldObj(Obj), Registered(false) {} + ELFObjectImage(ObjectBuffer *Input, std::unique_ptr> Obj) + : ObjectImageCommon(Input, std::move(Obj)), Registered(false) {} virtual ~ELFObjectImage() { if (Registered) @@ -83,11 +87,13 @@ public: // Subclasses can override these methods to update the image with loaded // addresses for sections and common symbols void updateSectionAddress(const SectionRef &Sec, uint64_t Addr) override { - DyldObj->updateSectionAddress(Sec, Addr); + static_cast*>(getObjectFile()) + ->updateSectionAddress(Sec, Addr); } void updateSymbolAddress(const SymbolRef &Sym, uint64_t Addr) override { - DyldObj->updateSymbolAddress(Sym, Addr); + static_cast*>(getObjectFile()) + ->updateSymbolAddress(Sym, Addr); } void registerWithDebugger() override { @@ -109,6 +115,14 @@ DyldELFObject::DyldELFObject(MemoryBuffer *Wrapper, error_code &ec) } template +DyldELFObject::DyldELFObject(std::unique_ptr UnderlyingFile, + MemoryBuffer *Wrapper, error_code &ec) + : ELFObjectFile(Wrapper, ec), + UnderlyingFile(std::move(UnderlyingFile)) { + this->isDyldELFObject = true; +} + +template void DyldELFObject::updateSectionAddress(const SectionRef &Sec, uint64_t Addr) { DataRefImpl ShdrRef = Sec.getRawDataRefImpl(); @@ -164,30 +178,36 @@ void RuntimeDyldELF::deregisterEHFrames() { } ObjectImage * -RuntimeDyldELF::createObjectImageFromFile(object::ObjectFile *ObjFile) { +RuntimeDyldELF::createObjectImageFromFile(std::unique_ptr ObjFile) { if (!ObjFile) - return NULL; + return nullptr; error_code ec; MemoryBuffer *Buffer = MemoryBuffer::getMemBuffer(ObjFile->getData(), "", false); if (ObjFile->getBytesInAddress() == 4 && ObjFile->isLittleEndian()) { - DyldELFObject> *Obj = - new DyldELFObject>(Buffer, ec); - return new ELFObjectImage>(NULL, Obj); + auto Obj = + llvm::make_unique>>( + std::move(ObjFile), Buffer, ec); + return new ELFObjectImage>( + nullptr, std::move(Obj)); } else if (ObjFile->getBytesInAddress() == 4 && !ObjFile->isLittleEndian()) { - DyldELFObject> *Obj = - new DyldELFObject>(Buffer, ec); - return new ELFObjectImage>(NULL, Obj); + auto Obj = + llvm::make_unique>>( + std::move(ObjFile), Buffer, ec); + return new ELFObjectImage>(nullptr, std::move(Obj)); } else if (ObjFile->getBytesInAddress() == 8 && !ObjFile->isLittleEndian()) { - DyldELFObject> *Obj = - new DyldELFObject>(Buffer, ec); - return new ELFObjectImage>(NULL, Obj); + auto Obj = llvm::make_unique>>( + std::move(ObjFile), Buffer, ec); + return new ELFObjectImage>(nullptr, + std::move(Obj)); } else if (ObjFile->getBytesInAddress() == 8 && ObjFile->isLittleEndian()) { - DyldELFObject> *Obj = - new DyldELFObject>(Buffer, ec); - return new ELFObjectImage>(NULL, Obj); + auto Obj = + llvm::make_unique>>( + std::move(ObjFile), Buffer, ec); + return new 
ELFObjectImage>( + nullptr, std::move(Obj)); } else llvm_unreachable("Unexpected ELF format"); } @@ -201,28 +221,29 @@ ObjectImage *RuntimeDyldELF::createObjectImage(ObjectBuffer *Buffer) { error_code ec; if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB) { - DyldELFObject> *Obj = - new DyldELFObject>( + auto Obj = + llvm::make_unique>>( Buffer->getMemBuffer(), ec); - return new ELFObjectImage>(Buffer, Obj); + return new ELFObjectImage>( + Buffer, std::move(Obj)); } else if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2MSB) { - DyldELFObject> *Obj = - new DyldELFObject>( + auto Obj = + llvm::make_unique>>( Buffer->getMemBuffer(), ec); - return new ELFObjectImage>(Buffer, Obj); + return new ELFObjectImage>(Buffer, + std::move(Obj)); } else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2MSB) { - DyldELFObject> *Obj = - new DyldELFObject>( - Buffer->getMemBuffer(), ec); - return new ELFObjectImage>(Buffer, Obj); + auto Obj = llvm::make_unique>>( + Buffer->getMemBuffer(), ec); + return new ELFObjectImage>(Buffer, std::move(Obj)); } else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2LSB) { - DyldELFObject> *Obj = - new DyldELFObject>( + auto Obj = + llvm::make_unique>>( Buffer->getMemBuffer(), ec); - return new ELFObjectImage>(Buffer, Obj); + return new ELFObjectImage>(Buffer, std::move(Obj)); } else llvm_unreachable("Unexpected ELF format"); } @@ -845,6 +866,8 @@ void RuntimeDyldELF::resolveRelocation(const SectionEntry &Section, break; case Triple::aarch64: case Triple::aarch64_be: + case Triple::arm64: + case Triple::arm64_be: resolveAArch64Relocation(Section, Offset, Value, Type, Addend); break; case Triple::arm: // Fall through. @@ -950,7 +973,8 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( DEBUG(dbgs() << "\t\tSectionID: " << SectionID << " Offset: " << Offset << "\n"); - if (Arch == Triple::aarch64 && + if ((Arch == Triple::aarch64 || Arch == Triple::aarch64_be || + Arch == Triple::arm64 || Arch == Triple::arm64_be) && (RelType == ELF::R_AARCH64_CALL26 || RelType == ELF::R_AARCH64_JUMP26)) { // This is an AArch64 branch relocation, need to use a stub function. DEBUG(dbgs() << "\t\tThis is an AArch64 branch relocation."); @@ -1151,7 +1175,7 @@ relocation_iterator RuntimeDyldELF::processRelocationRef( // Extra check to avoid relocation against empty symbols (usually // the R_PPC64_TOC). if (SymType != SymbolRef::ST_Unknown && TargetName.empty()) - Value.SymbolName = NULL; + Value.SymbolName = nullptr; if (Value.SymbolName) addRelocationForSymbol(RE, Value.SymbolName); @@ -1283,7 +1307,8 @@ void RuntimeDyldELF::updateGOTEntries(StringRef Name, uint64_t Addr) { for (it = GOTs.begin(); it != end; ++it) { GOTRelocations &GOTEntries = it->second; for (int i = 0, e = GOTEntries.size(); i != e; ++i) { - if (GOTEntries[i].SymbolName != 0 && GOTEntries[i].SymbolName == Name) { + if (GOTEntries[i].SymbolName != nullptr && + GOTEntries[i].SymbolName == Name) { GOTEntries[i].Offset = Addr; } } @@ -1297,6 +1322,9 @@ size_t RuntimeDyldELF::getGOTEntrySize() { switch (Arch) { case Triple::x86_64: case Triple::aarch64: + case Triple::aarch64_be: + case Triple::arm64: + case Triple::arm64_be: case Triple::ppc64: case Triple::ppc64le: case Triple::systemz: @@ -1331,7 +1359,7 @@ uint64_t RuntimeDyldELF::findGOTEntry(uint64_t LoadAddress, uint64_t Offset) { // Find the matching entry in our vector.
uint64_t SymbolOffset = 0; for (int i = 0, e = GOTEntries.size(); i != e; ++i) { - if (GOTEntries[i].SymbolName == 0) { + if (!GOTEntries[i].SymbolName) { if (getSectionLoadAddress(GOTEntries[i].SectionID) == LoadAddress && GOTEntries[i].Offset == Offset) { GOTIndex = i; @@ -1369,7 +1397,8 @@ uint64_t RuntimeDyldELF::findGOTEntry(uint64_t LoadAddress, uint64_t Offset) { return 0; } -void RuntimeDyldELF::finalizeLoad(ObjSectionToIDMap &SectionMap) { +void RuntimeDyldELF::finalizeLoad(ObjectImage &ObjImg, + ObjSectionToIDMap &SectionMap) { // If necessary, allocate the global offset table if (MemMgr) { // Allocate the GOT if necessary diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h index 27db5cd..a526073 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h @@ -59,7 +59,8 @@ class RuntimeDyldELF : public RuntimeDyldImpl { uint64_t Value, uint32_t Type, int64_t Addend); unsigned getMaxStubSize() override { - if (Arch == Triple::aarch64) + if (Arch == Triple::aarch64 || Arch == Triple::arm64 || + Arch == Triple::aarch64_be || Arch == Triple::arm64_be) return 20; // movz; movk; movk; movk; br if (Arch == Triple::arm || Arch == Triple::thumb) return 8; // 32-bit instruction and 32-bit address @@ -115,11 +116,12 @@ public: bool isCompatibleFile(const object::ObjectFile *Buffer) const override; void registerEHFrames() override; void deregisterEHFrames() override; - void finalizeLoad(ObjSectionToIDMap &SectionMap) override; + void finalizeLoad(ObjectImage &ObjImg, + ObjSectionToIDMap &SectionMap) override; virtual ~RuntimeDyldELF(); static ObjectImage *createObjectImage(ObjectBuffer *InputBuffer); - static ObjectImage *createObjectImageFromFile(object::ObjectFile *Obj); + static ObjectImage *createObjectImageFromFile(std::unique_ptr Obj); }; } // end namespace llvm diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index c153ee1..412cf20 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -90,9 +90,17 @@ public: /// used to make a relocation section relative instead of symbol relative. int64_t Addend; + struct SectionPair { + uint32_t SectionA; + uint32_t SectionB; + }; + /// SymOffset - Section offset of the relocation entry's symbol (used for GOT /// lookup). - uint64_t SymOffset; + union { + uint64_t SymOffset; + SectionPair Sections; + }; /// True if this is a PCRel relocation (MachO specific). 
bool IsPCRel; @@ -113,6 +121,16 @@ public: bool IsPCRel, unsigned Size) : SectionID(id), Offset(offset), RelType(type), Addend(addend), SymOffset(0), IsPCRel(IsPCRel), Size(Size) {} + + RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend, + unsigned SectionA, uint64_t SectionAOffset, unsigned SectionB, + uint64_t SectionBOffset, bool IsPCRel, unsigned Size) + : SectionID(id), Offset(offset), RelType(type), + Addend(SectionAOffset - SectionBOffset + addend), IsPCRel(IsPCRel), + Size(Size) { + Sections.SectionA = SectionA; + Sections.SectionB = SectionB; + } }; class RelocationValueRef { @@ -121,7 +139,8 @@ public: uint64_t Offset; int64_t Addend; const char *SymbolName; - RelocationValueRef() : SectionID(0), Offset(0), Addend(0), SymbolName(0) {} + RelocationValueRef() : SectionID(0), Offset(0), Addend(0), + SymbolName(nullptr) {} inline bool operator==(const RelocationValueRef &Other) const { return SectionID == Other.SectionID && Offset == Other.Offset && @@ -335,7 +354,7 @@ public: // Work in progress. SymbolTableMap::const_iterator pos = GlobalSymbolTable.find(Name); if (pos == GlobalSymbolTable.end()) - return 0; + return nullptr; SymbolLoc Loc = pos->second; return getSectionAddress(Loc.first) + Loc.second; } @@ -372,7 +391,7 @@ public: virtual void deregisterEHFrames(); - virtual void finalizeLoad(ObjSectionToIDMap &SectionMap) {} + virtual void finalizeLoad(ObjectImage &ObjImg, ObjSectionToIDMap &SectionMap) {} }; } // end namespace llvm diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp index 7eae9c2..2b425fb 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -11,17 +11,20 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "dyld" #include "RuntimeDyldMachO.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" using namespace llvm; using namespace llvm::object; +#define DEBUG_TYPE "dyld" + namespace llvm { static unsigned char *processFDE(unsigned char *P, intptr_t DeltaForText, intptr_t DeltaForEH) { + DEBUG(dbgs() << "Processing FDE: Delta for text: " << DeltaForText + << ", Delta for EH: " << DeltaForEH << "\n"); uint32_t Length = *((uint32_t *)P); P += 4; unsigned char *Ret = P + Length; @@ -66,7 +69,7 @@ void RuntimeDyldMachO::registerEHFrames() { continue; SectionEntry *Text = &Sections[SectionInfo.TextSID]; SectionEntry *EHFrame = &Sections[SectionInfo.EHFrameSID]; - SectionEntry *ExceptTab = NULL; + SectionEntry *ExceptTab = nullptr; if (SectionInfo.ExceptTabSID != RTDYLD_INVALID_SECTION_ID) ExceptTab = &Sections[SectionInfo.ExceptTabSID]; @@ -87,7 +90,8 @@ void RuntimeDyldMachO::registerEHFrames() { UnregisteredEHFrameSections.clear(); } -void RuntimeDyldMachO::finalizeLoad(ObjSectionToIDMap &SectionMap) { +void RuntimeDyldMachO::finalizeLoad(ObjectImage &ObjImg, + ObjSectionToIDMap &SectionMap) { unsigned EHFrameSID = RTDYLD_INVALID_SECTION_ID; unsigned TextSID = RTDYLD_INVALID_SECTION_ID; unsigned ExceptTabSID = RTDYLD_INVALID_SECTION_ID; @@ -102,6 +106,12 @@ void RuntimeDyldMachO::finalizeLoad(ObjSectionToIDMap &SectionMap) { TextSID = i->second; else if (Name == "__gcc_except_tab") ExceptTabSID = i->second; + else if (Name == "__jump_table") + populateJumpTable(cast(*ObjImg.getObjectFile()), + Section, i->second); + else if (Name == "__pointers") + populatePointersSection(cast(*ObjImg.getObjectFile()), + Section, i->second); } 
UnregisteredEHFrameSections.push_back( EHFrameRelatedSections(EHFrameSID, TextSID, ExceptTabSID)); @@ -129,91 +139,87 @@ void RuntimeDyldMachO::finalizeLoad(ObjSectionToIDMap &SectionMap) { // symbol in the target address space. void RuntimeDyldMachO::resolveRelocation(const RelocationEntry &RE, uint64_t Value) { - const SectionEntry &Section = Sections[RE.SectionID]; - return resolveRelocation(Section, RE.Offset, Value, RE.RelType, RE.Addend, - RE.IsPCRel, RE.Size); -} - -void RuntimeDyldMachO::resolveRelocation(const SectionEntry &Section, - uint64_t Offset, uint64_t Value, - uint32_t Type, int64_t Addend, - bool isPCRel, unsigned LogSize) { - uint8_t *LocalAddress = Section.Address + Offset; - uint64_t FinalAddress = Section.LoadAddress + Offset; - unsigned MachoType = Type; - unsigned Size = 1 << LogSize; - - DEBUG(dbgs() << "resolveRelocation LocalAddress: " - << format("%p", LocalAddress) - << " FinalAddress: " << format("%p", FinalAddress) - << " Value: " << format("%p", Value) << " Addend: " << Addend - << " isPCRel: " << isPCRel << " MachoType: " << MachoType - << " Size: " << Size << "\n"); + DEBUG ( + const SectionEntry &Section = Sections[RE.SectionID]; + uint8_t* LocalAddress = Section.Address + RE.Offset; + uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + + dbgs() << "resolveRelocation Section: " << RE.SectionID + << " LocalAddress: " << format("%p", LocalAddress) + << " FinalAddress: " << format("%p", FinalAddress) + << " Value: " << format("%p", Value) + << " Addend: " << RE.Addend + << " isPCRel: " << RE.IsPCRel + << " MachoType: " << RE.RelType + << " Size: " << (1 << RE.Size) << "\n"; + ); // This just dispatches to the proper target specific routine. switch (Arch) { default: llvm_unreachable("Unsupported CPU type!"); case Triple::x86_64: - resolveX86_64Relocation(LocalAddress, FinalAddress, (uintptr_t)Value, - isPCRel, MachoType, Size, Addend); + resolveX86_64Relocation(RE, Value); break; case Triple::x86: - resolveI386Relocation(LocalAddress, FinalAddress, (uintptr_t)Value, isPCRel, - MachoType, Size, Addend); + resolveI386Relocation(RE, Value); break; case Triple::arm: // Fall through. case Triple::thumb: - resolveARMRelocation(LocalAddress, FinalAddress, (uintptr_t)Value, isPCRel, - MachoType, Size, Addend); + resolveARMRelocation(RE, Value); break; + case Triple::aarch64: case Triple::arm64: - resolveARM64Relocation(LocalAddress, FinalAddress, (uintptr_t)Value, - isPCRel, MachoType, Size, Addend); + resolveAArch64Relocation(RE, Value); break; } } -bool RuntimeDyldMachO::resolveI386Relocation(uint8_t *LocalAddress, - uint64_t FinalAddress, - uint64_t Value, bool isPCRel, - unsigned Type, unsigned Size, - int64_t Addend) { - if (isPCRel) - Value -= FinalAddress + 4; // see resolveX86_64Relocation +bool RuntimeDyldMachO::resolveI386Relocation(const RelocationEntry &RE, + uint64_t Value) { + const SectionEntry &Section = Sections[RE.SectionID]; + uint8_t* LocalAddress = Section.Address + RE.Offset; - switch (Type) { - default: - llvm_unreachable("Invalid relocation type!"); - case MachO::GENERIC_RELOC_VANILLA: { - uint8_t *p = LocalAddress; - uint64_t ValueToWrite = Value + Addend; - for (unsigned i = 0; i < Size; ++i) { - *p++ = (uint8_t)(ValueToWrite & 0xff); - ValueToWrite >>= 8; - } - return false; + if (RE.IsPCRel) { + uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + Value -= FinalAddress + 4; // see MachOX86_64::resolveRelocation. 
} - case MachO::GENERIC_RELOC_SECTDIFF: - case MachO::GENERIC_RELOC_LOCAL_SECTDIFF: - case MachO::GENERIC_RELOC_PB_LA_PTR: - return Error("Relocation type not implemented yet!"); + + switch (RE.RelType) { + default: + llvm_unreachable("Invalid relocation type!"); + case MachO::GENERIC_RELOC_VANILLA: + return applyRelocationValue(LocalAddress, Value + RE.Addend, + 1 << RE.Size); + case MachO::GENERIC_RELOC_SECTDIFF: + case MachO::GENERIC_RELOC_LOCAL_SECTDIFF: { + uint64_t SectionABase = Sections[RE.Sections.SectionA].LoadAddress; + uint64_t SectionBBase = Sections[RE.Sections.SectionB].LoadAddress; + assert((Value == SectionABase || Value == SectionBBase) && + "Unexpected SECTDIFF relocation value."); + Value = SectionABase - SectionBBase + RE.Addend; + return applyRelocationValue(LocalAddress, Value, 1 << RE.Size); + } + case MachO::GENERIC_RELOC_PB_LA_PTR: + return Error("Relocation type not implemented yet!"); } } -bool RuntimeDyldMachO::resolveX86_64Relocation(uint8_t *LocalAddress, - uint64_t FinalAddress, - uint64_t Value, bool isPCRel, - unsigned Type, unsigned Size, - int64_t Addend) { +bool RuntimeDyldMachO::resolveX86_64Relocation(const RelocationEntry &RE, + uint64_t Value) { + const SectionEntry &Section = Sections[RE.SectionID]; + uint8_t* LocalAddress = Section.Address + RE.Offset; + // If the relocation is PC-relative, the value to be encoded is the // pointer difference. - if (isPCRel) + if (RE.IsPCRel) { // FIXME: It seems this value needs to be adjusted by 4 for an effective PC // address. Is that expected? Only for branches, perhaps? - Value -= FinalAddress + 4; + uint64_t FinalAddress = Section.LoadAddress + RE.Offset; + Value -= FinalAddress + 4; // see MachOX86_64::resolveRelocation. + } - switch (Type) { + switch (RE.RelType) { default: llvm_unreachable("Invalid relocation type!"); case MachO::X86_64_RELOC_SIGNED_1: @@ -221,17 +227,8 @@ bool RuntimeDyldMachO::resolveX86_64Relocation(uint8_t *LocalAddress, case MachO::X86_64_RELOC_SIGNED_4: case MachO::X86_64_RELOC_SIGNED: case MachO::X86_64_RELOC_UNSIGNED: - case MachO::X86_64_RELOC_BRANCH: { - Value += Addend; - // Mask in the target value a byte at a time (we don't have an alignment - // guarantee for the target address, so this is safest). - uint8_t *p = (uint8_t *)LocalAddress; - for (unsigned i = 0; i < Size; ++i) { - *p++ = (uint8_t)Value; - Value >>= 8; - } - return false; - } + case MachO::X86_64_RELOC_BRANCH: + return applyRelocationValue(LocalAddress, Value + RE.Addend, 1 << RE.Size); case MachO::X86_64_RELOC_GOT_LOAD: case MachO::X86_64_RELOC_GOT: case MachO::X86_64_RELOC_SUBTRACTOR: @@ -240,14 +237,15 @@ bool RuntimeDyldMachO::resolveX86_64Relocation(uint8_t *LocalAddress, } } -bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress, - uint64_t FinalAddress, - uint64_t Value, bool isPCRel, - unsigned Type, unsigned Size, - int64_t Addend) { +bool RuntimeDyldMachO::resolveARMRelocation(const RelocationEntry &RE, + uint64_t Value) { + const SectionEntry &Section = Sections[RE.SectionID]; + uint8_t* LocalAddress = Section.Address + RE.Offset; + // If the relocation is PC-relative, the value to be encoded is the // pointer difference. - if (isPCRel) { + if (RE.IsPCRel) { + uint64_t FinalAddress = Section.LoadAddress + RE.Offset; Value -= FinalAddress; // ARM PCRel relocations have an effective-PC offset of two instructions // (four bytes in Thumb mode, 8 bytes in ARM mode). 
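The GENERIC_RELOC_SECTDIFF / LOCAL_SECTDIFF cases above are the consumer side of a two-part computation: at parse time, processSECTDIFFRelocation (further below) stores Addend = SectionAOffset - SectionBOffset + addend in the new two-section RelocationEntry, and at resolve time the value written is SectionABase - SectionBBase + Addend, which works out to exactly the difference of the two symbols' final addresses. A worked, standalone check of that identity with made-up offsets and load addresses:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Parse-time facts: offsets within each section, plus the on-disk addend.
      uint64_t SectionAOffset = 0x40, SectionBOffset = 0x10;
      int64_t OnDiskAddend = 0;
      int64_t Addend = (int64_t)(SectionAOffset - SectionBOffset) + OnDiskAddend;

      // Resolve-time facts: where the two sections actually got loaded.
      uint64_t SectionABase = 0x200000, SectionBBase = 0x300000;
      int64_t Value = (int64_t)(SectionABase - SectionBBase) + Addend;

      // Equivalent to (final address of A) - (final address of B): the
      // section difference the relocation encodes, independent of where
      // either section landed.
      assert(Value == (int64_t)((SectionABase + SectionAOffset) -
                                (SectionBBase + SectionBOffset)));
      return 0;
    }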
@@ -255,19 +253,11 @@ bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress, Value -= 8; } - switch (Type) { + switch (RE.RelType) { default: llvm_unreachable("Invalid relocation type!"); - case MachO::ARM_RELOC_VANILLA: { - // Mask in the target value a byte at a time (we don't have an alignment - // guarantee for the target address, so this is safest). - uint8_t *p = (uint8_t *)LocalAddress; - for (unsigned i = 0; i < Size; ++i) { - *p++ = (uint8_t)Value; - Value >>= 8; - } - break; - } + case MachO::ARM_RELOC_VANILLA: + return applyRelocationValue(LocalAddress, Value, 1 << RE.Size); case MachO::ARM_RELOC_BR24: { // Mask the value into the target address. We know instructions are // 32-bit aligned, so we can do it all at once. @@ -275,13 +265,16 @@ bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress, // The low two bits of the value are not encoded. Value >>= 2; // Mask the value to 24 bits. - Value &= 0xffffff; + uint64_t FinalValue = Value & 0xffffff; + // Check for overflow. + if (Value != FinalValue) + return Error("ARM BR24 relocation out of range."); // FIXME: If the destination is a Thumb function (and the instruction // is a non-predicated BL instruction), we need to change it to a BLX // instruction instead. // Insert the value into the instruction. - *p = (*p & ~0xffffff) | Value; + *p = (*p & ~0xffffff) | FinalValue; break; } case MachO::ARM_THUMB_RELOC_BR22: @@ -297,29 +290,23 @@ bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress, return false; } -bool RuntimeDyldMachO::resolveARM64Relocation(uint8_t *LocalAddress, - uint64_t FinalAddress, - uint64_t Value, bool isPCRel, - unsigned Type, unsigned Size, - int64_t Addend) { +bool RuntimeDyldMachO::resolveAArch64Relocation(const RelocationEntry &RE, + uint64_t Value) { + const SectionEntry &Section = Sections[RE.SectionID]; + uint8_t* LocalAddress = Section.Address + RE.Offset; + // If the relocation is PC-relative, the value to be encoded is the // pointer difference. - if (isPCRel) + if (RE.IsPCRel) { + uint64_t FinalAddress = Section.LoadAddress + RE.Offset; Value -= FinalAddress; + } - switch (Type) { + switch (RE.RelType) { default: llvm_unreachable("Invalid relocation type!"); - case MachO::ARM64_RELOC_UNSIGNED: { - // Mask in the target value a byte at a time (we don't have an alignment - // guarantee for the target address, so this is safest). - uint8_t *p = (uint8_t *)LocalAddress; - for (unsigned i = 0; i < Size; ++i) { - *p++ = (uint8_t)Value; - Value >>= 8; - } - break; - } + case MachO::ARM64_RELOC_UNSIGNED: + return applyRelocationValue(LocalAddress, Value, 1 << RE.Size); case MachO::ARM64_RELOC_BRANCH26: { // Mask the value into the target address. We know instructions are // 32-bit aligned, so we can do it all at once. @@ -327,9 +314,12 @@ bool RuntimeDyldMachO::resolveARM64Relocation(uint8_t *LocalAddress, // The low two bits of the value are not encoded. Value >>= 2; // Mask the value to 26 bits. - Value &= 0x3ffffff; + uint64_t FinalValue = Value & 0x3ffffff; + // Check for overflow. + if (FinalValue != Value) + return Error("ARM64 BRANCH26 relocation out of range."); // Insert the value into the instruction. 
- *p = (*p & ~0x3ffffff) | Value; + *p = (*p & ~0x3ffffff) | FinalValue; break; } case MachO::ARM64_RELOC_SUBTRACTOR: @@ -346,6 +336,198 @@ bool RuntimeDyldMachO::resolveARM64Relocation(uint8_t *LocalAddress, return false; } +void RuntimeDyldMachO::populateJumpTable(MachOObjectFile &Obj, + const SectionRef &JTSection, + unsigned JTSectionID) { + assert(!Obj.is64Bit() && + "__jump_table section not supported in 64-bit MachO."); + + MachO::dysymtab_command DySymTabCmd = Obj.getDysymtabLoadCommand(); + MachO::section Sec32 = Obj.getSection(JTSection.getRawDataRefImpl()); + uint32_t JTSectionSize = Sec32.size; + unsigned FirstIndirectSymbol = Sec32.reserved1; + unsigned JTEntrySize = Sec32.reserved2; + unsigned NumJTEntries = JTSectionSize / JTEntrySize; + uint8_t* JTSectionAddr = getSectionAddress(JTSectionID); + unsigned JTEntryOffset = 0; + + assert((JTSectionSize % JTEntrySize) == 0 && + "Jump-table section does not contain a whole number of stubs?"); + + for (unsigned i = 0; i < NumJTEntries; ++i) { + unsigned SymbolIndex = + Obj.getIndirectSymbolTableEntry(DySymTabCmd, FirstIndirectSymbol + i); + symbol_iterator SI = Obj.getSymbolByIndex(SymbolIndex); + StringRef IndirectSymbolName; + SI->getName(IndirectSymbolName); + uint8_t* JTEntryAddr = JTSectionAddr + JTEntryOffset; + createStubFunction(JTEntryAddr); + RelocationEntry RE(JTSectionID, JTEntryOffset + 1, + MachO::GENERIC_RELOC_VANILLA, 0, true, 2); + addRelocationForSymbol(RE, IndirectSymbolName); + JTEntryOffset += JTEntrySize; + } +} + +void RuntimeDyldMachO::populatePointersSection(MachOObjectFile &Obj, + const SectionRef &PTSection, + unsigned PTSectionID) { + assert(!Obj.is64Bit() && + "__pointers section not supported in 64-bit MachO."); + + MachO::dysymtab_command DySymTabCmd = Obj.getDysymtabLoadCommand(); + MachO::section Sec32 = Obj.getSection(PTSection.getRawDataRefImpl()); + uint32_t PTSectionSize = Sec32.size; + unsigned FirstIndirectSymbol = Sec32.reserved1; + const unsigned PTEntrySize = 4; + unsigned NumPTEntries = PTSectionSize / PTEntrySize; + unsigned PTEntryOffset = 0; + + assert((PTSectionSize % PTEntrySize) == 0 && + "Pointers section does not contain a whole number of stubs?"); + + DEBUG(dbgs() << "Populating __pointers, Section ID " << PTSectionID + << ", " << NumPTEntries << " entries, " + << PTEntrySize << " bytes each:\n"); + + for (unsigned i = 0; i < NumPTEntries; ++i) { + unsigned SymbolIndex = + Obj.getIndirectSymbolTableEntry(DySymTabCmd, FirstIndirectSymbol + i); + symbol_iterator SI = Obj.getSymbolByIndex(SymbolIndex); + StringRef IndirectSymbolName; + SI->getName(IndirectSymbolName); + DEBUG(dbgs() << " " << IndirectSymbolName << ": index " << SymbolIndex + << ", PT offset: " << PTEntryOffset << "\n"); + RelocationEntry RE(PTSectionID, PTEntryOffset, + MachO::GENERIC_RELOC_VANILLA, 0, false, 2); + addRelocationForSymbol(RE, IndirectSymbolName); + PTEntryOffset += PTEntrySize; + } +} + + +section_iterator getSectionByAddress(const MachOObjectFile &Obj, + uint64_t Addr) { + section_iterator SI = Obj.section_begin(); + section_iterator SE = Obj.section_end(); + + for (; SI != SE; ++SI) { + uint64_t SAddr, SSize; + SI->getAddress(SAddr); + SI->getSize(SSize); + if ((Addr >= SAddr) && (Addr < SAddr + SSize)) + return SI; + } + + return SE; +} + +relocation_iterator RuntimeDyldMachO::processSECTDIFFRelocation( + unsigned SectionID, + relocation_iterator RelI, + ObjectImage &Obj, + ObjSectionToIDMap &ObjSectionToID) { + const MachOObjectFile *MachO = + static_cast(Obj.getObjectFile()); + 
MachO::any_relocation_info RE = + MachO->getRelocation(RelI->getRawDataRefImpl()); + + SectionEntry &Section = Sections[SectionID]; + uint32_t RelocType = MachO->getAnyRelocationType(RE); + bool IsPCRel = MachO->getAnyRelocationPCRel(RE); + unsigned Size = MachO->getAnyRelocationLength(RE); + uint64_t Offset; + RelI->getOffset(Offset); + uint8_t *LocalAddress = Section.Address + Offset; + unsigned NumBytes = 1 << Size; + int64_t Addend = 0; + memcpy(&Addend, LocalAddress, NumBytes); + + ++RelI; + MachO::any_relocation_info RE2 = + MachO->getRelocation(RelI->getRawDataRefImpl()); + + uint32_t AddrA = MachO->getScatteredRelocationValue(RE); + section_iterator SAI = getSectionByAddress(*MachO, AddrA); + assert(SAI != MachO->section_end() && "Can't find section for address A"); + uint64_t SectionABase; + SAI->getAddress(SectionABase); + uint64_t SectionAOffset = AddrA - SectionABase; + SectionRef SectionA = *SAI; + bool IsCode; + SectionA.isText(IsCode); + uint32_t SectionAID = findOrEmitSection(Obj, SectionA, IsCode, + ObjSectionToID); + + uint32_t AddrB = MachO->getScatteredRelocationValue(RE2); + section_iterator SBI = getSectionByAddress(*MachO, AddrB); + assert(SBI != MachO->section_end() && "Can't find section for address B"); + uint64_t SectionBBase; + SBI->getAddress(SectionBBase); + uint64_t SectionBOffset = AddrB - SectionBBase; + SectionRef SectionB = *SBI; + uint32_t SectionBID = findOrEmitSection(Obj, SectionB, IsCode, + ObjSectionToID); + + if (Addend != AddrA - AddrB) + Error("Unexpected SECTDIFF relocation addend."); + + DEBUG(dbgs() << "Found SECTDIFF: AddrA: " << AddrA << ", AddrB: " << AddrB + << ", Addend: " << Addend << ", SectionA ID: " + << SectionAID << ", SectionAOffset: " << SectionAOffset + << ", SectionB ID: " << SectionBID << ", SectionBOffset: " + << SectionBOffset << "\n"); + RelocationEntry R(SectionID, Offset, RelocType, 0, + SectionAID, SectionAOffset, SectionBID, SectionBOffset, + IsPCRel, Size); + + addRelocationForSection(R, SectionAID); + addRelocationForSection(R, SectionBID); + + return ++RelI; +} + +relocation_iterator RuntimeDyldMachO::processI386ScatteredVANILLA( + unsigned SectionID, + relocation_iterator RelI, + ObjectImage &Obj, + ObjSectionToIDMap &ObjSectionToID) { + const MachOObjectFile *MachO = + static_cast(Obj.getObjectFile()); + MachO::any_relocation_info RE = + MachO->getRelocation(RelI->getRawDataRefImpl()); + + SectionEntry &Section = Sections[SectionID]; + uint32_t RelocType = MachO->getAnyRelocationType(RE); + bool IsPCRel = MachO->getAnyRelocationPCRel(RE); + unsigned Size = MachO->getAnyRelocationLength(RE); + uint64_t Offset; + RelI->getOffset(Offset); + uint8_t *LocalAddress = Section.Address + Offset; + unsigned NumBytes = 1 << Size; + int64_t Addend = 0; + memcpy(&Addend, LocalAddress, NumBytes); + + unsigned SymbolBaseAddr = MachO->getScatteredRelocationValue(RE); + section_iterator TargetSI = getSectionByAddress(*MachO, SymbolBaseAddr); + assert(TargetSI != MachO->section_end() && "Can't find section for symbol"); + uint64_t SectionBaseAddr; + TargetSI->getAddress(SectionBaseAddr); + SectionRef TargetSection = *TargetSI; + bool IsCode; + TargetSection.isText(IsCode); + uint32_t TargetSectionID = findOrEmitSection(Obj, TargetSection, IsCode, + ObjSectionToID); + + Addend -= SectionBaseAddr; + RelocationEntry R(SectionID, Offset, RelocType, Addend, + IsPCRel, Size); + + addRelocationForSection(R, TargetSectionID); + + return ++RelI; +} + relocation_iterator RuntimeDyldMachO::processRelocationRef( unsigned SectionID, 
relocation_iterator RelI, ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, const SymbolTableMap &Symbols, @@ -358,18 +540,28 @@ relocation_iterator RuntimeDyldMachO::processRelocationRef( uint32_t RelType = MachO->getAnyRelocationType(RE); // FIXME: Properly handle scattered relocations. - // For now, optimistically skip these: they can often be ignored, as - // the static linker will already have applied the relocation, and it - // only needs to be reapplied if symbols move relative to one another. - // Note: This will fail horribly where the relocations *do* need to be - // applied, but that was already the case. - if (MachO->isRelocationScattered(RE)) - return ++RelI; + // Special case the couple of scattered relocations that we know how + // to handle: SECTDIFF relocations, and scattered VANILLA relocations + // on I386. + // For all other scattered relocations, just bail out and hope for the + // best, since the offsets computed by scattered relocations have often + // been optimistically filled in by the compiler. This will fail + // horribly where the relocations *do* need to be applied, but that was + // already the case. + if (MachO->isRelocationScattered(RE)) { + if (RelType == MachO::GENERIC_RELOC_SECTDIFF || + RelType == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) + return processSECTDIFFRelocation(SectionID, RelI, Obj, ObjSectionToID); + else if (Arch == Triple::x86 && RelType == MachO::GENERIC_RELOC_VANILLA) + return processI386ScatteredVANILLA(SectionID, RelI, Obj, ObjSectionToID); + else + return ++RelI; + } RelocationValueRef Value; SectionEntry &Section = Sections[SectionID]; - bool isExtern = MachO->getPlainRelocationExternal(RE); + bool IsExtern = MachO->getPlainRelocationExternal(RE); bool IsPCRel = MachO->getAnyRelocationPCRel(RE); unsigned Size = MachO->getAnyRelocationLength(RE); uint64_t Offset; @@ -379,7 +571,7 @@ relocation_iterator RuntimeDyldMachO::processRelocationRef( uint64_t Addend = 0; memcpy(&Addend, LocalAddress, NumBytes); - if (isExtern) { + if (IsExtern) { // Obtain the symbol name which is referenced in the relocation symbol_iterator Symbol = RelI->getSymbol(); StringRef TargetName; @@ -401,6 +593,17 @@ relocation_iterator RuntimeDyldMachO::processRelocationRef( Value.Addend = Addend; } } + + // Addends for external, PC-rel relocations on i386 point back to the zero + // offset. Calculate the final offset from the relocation target instead. + // This allows us to use the same logic for both external and internal + // relocations in resolveI386RelocationRef. + if (Arch == Triple::x86 && IsPCRel) { + uint64_t RelocAddr = 0; + RelI->getAddress(RelocAddr); + Value.Addend += RelocAddr + 4; + } + } else { SectionRef Sec = MachO->getRelocationSection(RE); bool IsCode = false; @@ -417,6 +620,10 @@ relocation_iterator RuntimeDyldMachO::processRelocationRef( RelType == MachO::X86_64_RELOC_GOT_LOAD)) { assert(IsPCRel); assert(Size == 2); + + // FIXME: Teach the generic code above not to prematurely conflate + // relocation addends and symbol offsets.
+ Value.Addend -= Addend; StubMap::const_iterator i = Stubs.find(Value); uint8_t *Addr; if (i != Stubs.end()) { @@ -424,41 +631,45 @@ relocation_iterator RuntimeDyldMachO::processRelocationRef( } else { Stubs[Value] = Section.StubOffset; uint8_t *GOTEntry = Section.Address + Section.StubOffset; - RelocationEntry RE(SectionID, Section.StubOffset, - MachO::X86_64_RELOC_UNSIGNED, 0, false, 3); + RelocationEntry GOTRE(SectionID, Section.StubOffset, + MachO::X86_64_RELOC_UNSIGNED, Value.Addend, false, + 3); if (Value.SymbolName) - addRelocationForSymbol(RE, Value.SymbolName); + addRelocationForSymbol(GOTRE, Value.SymbolName); else - addRelocationForSection(RE, Value.SectionID); + addRelocationForSection(GOTRE, Value.SectionID); Section.StubOffset += 8; Addr = GOTEntry; } - resolveRelocation(Section, Offset, (uint64_t)Addr, - MachO::X86_64_RELOC_UNSIGNED, Value.Addend, true, 2); + RelocationEntry TargetRE(SectionID, Offset, + MachO::X86_64_RELOC_UNSIGNED, Addend, true, + 2); + resolveRelocation(TargetRE, (uint64_t)Addr); } else if (Arch == Triple::arm && (RelType & 0xf) == MachO::ARM_RELOC_BR24) { // This is an ARM branch relocation, need to use a stub function. // Look up for existing stub. StubMap::const_iterator i = Stubs.find(Value); - if (i != Stubs.end()) - resolveRelocation(Section, Offset, (uint64_t)Section.Address + i->second, - RelType, 0, IsPCRel, Size); - else { + uint8_t *Addr; + if (i != Stubs.end()) { + Addr = Section.Address + i->second; + } else { // Create a new stub function. Stubs[Value] = Section.StubOffset; uint8_t *StubTargetAddr = createStubFunction(Section.Address + Section.StubOffset); - RelocationEntry RE(SectionID, StubTargetAddr - Section.Address, - MachO::GENERIC_RELOC_VANILLA, Value.Addend); + RelocationEntry StubRE(SectionID, StubTargetAddr - Section.Address, + MachO::GENERIC_RELOC_VANILLA, Value.Addend); if (Value.SymbolName) - addRelocationForSymbol(RE, Value.SymbolName); + addRelocationForSymbol(StubRE, Value.SymbolName); else - addRelocationForSection(RE, Value.SectionID); - resolveRelocation(Section, Offset, - (uint64_t)Section.Address + Section.StubOffset, RelType, - 0, IsPCRel, Size); + addRelocationForSection(StubRE, Value.SectionID); + Addr = Section.Address + Section.StubOffset; Section.StubOffset += getMaxStubSize(); } + RelocationEntry TargetRE(Value.SectionID, Offset, RelType, 0, IsPCRel, + Size); + resolveRelocation(TargetRE, (uint64_t)Addr); } else { RelocationEntry RE(SectionID, Offset, RelType, Value.Addend, IsPCRel, Size); if (Value.SymbolName) diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h index 1006176..060eb8c 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h @@ -25,22 +25,31 @@ using namespace llvm::object; namespace llvm { class RuntimeDyldMachO : public RuntimeDyldImpl { - bool resolveI386Relocation(uint8_t *LocalAddress, uint64_t FinalAddress, - uint64_t Value, bool isPCRel, unsigned Type, - unsigned Size, int64_t Addend); - bool resolveX86_64Relocation(uint8_t *LocalAddress, uint64_t FinalAddress, - uint64_t Value, bool isPCRel, unsigned Type, - unsigned Size, int64_t Addend); - bool resolveARMRelocation(uint8_t *LocalAddress, uint64_t FinalAddress, - uint64_t Value, bool isPCRel, unsigned Type, - unsigned Size, int64_t Addend); - bool resolveARM64Relocation(uint8_t *LocalAddress, uint64_t FinalAddress, - uint64_t Value, bool IsPCRel, unsigned Type, - unsigned Size, int64_t Addend); - - void 
resolveRelocation(const SectionEntry &Section, uint64_t Offset, - uint64_t Value, uint32_t Type, int64_t Addend, - bool isPCRel, unsigned Size); +private: + + /// Write the least significant 'Size' bytes in 'Value' out at the address + /// pointed to by Addr. + bool applyRelocationValue(uint8_t *Addr, uint64_t Value, unsigned Size) { + for (unsigned i = 0; i < Size; ++i) { + *Addr++ = (uint8_t)Value; + Value >>= 8; + } + + return false; + } + + bool resolveI386Relocation(const RelocationEntry &RE, uint64_t Value); + bool resolveX86_64Relocation(const RelocationEntry &RE, uint64_t Value); + bool resolveARMRelocation(const RelocationEntry &RE, uint64_t Value); + bool resolveAArch64Relocation(const RelocationEntry &RE, uint64_t Value); + + // Populate stubs in __jump_table section. + void populateJumpTable(MachOObjectFile &Obj, const SectionRef &JTSection, + unsigned JTSectionID); + + // Populate __pointers section. + void populatePointersSection(MachOObjectFile &Obj, const SectionRef &PTSection, + unsigned PTSectionID); unsigned getMaxStubSize() override { if (Arch == Triple::arm || Arch == Triple::thumb) @@ -53,6 +62,18 @@ class RuntimeDyldMachO : public RuntimeDyldImpl { unsigned getStubAlignment() override { return 1; } + relocation_iterator processSECTDIFFRelocation( + unsigned SectionID, + relocation_iterator RelI, + ObjectImage &ObjImg, + ObjSectionToIDMap &ObjSectionToID); + + relocation_iterator processI386ScatteredVANILLA( + unsigned SectionID, + relocation_iterator RelI, + ObjectImage &ObjImg, + ObjSectionToIDMap &ObjSectionToID); + struct EHFrameRelatedSections { EHFrameRelatedSections() : EHFrameSID(RTDYLD_INVALID_SECTION_ID), @@ -81,15 +102,16 @@ public: bool isCompatibleFormat(const ObjectBuffer *Buffer) const override; bool isCompatibleFile(const object::ObjectFile *Obj) const override; void registerEHFrames() override; - void finalizeLoad(ObjSectionToIDMap &SectionMap) override; + void finalizeLoad(ObjectImage &ObjImg, + ObjSectionToIDMap &SectionMap) override; static ObjectImage *createObjectImage(ObjectBuffer *InputBuffer) { return new ObjectImageCommon(InputBuffer); } static ObjectImage * - createObjectImageFromFile(object::ObjectFile *InputObject) { - return new ObjectImageCommon(InputObject); + createObjectImageFromFile(std::unique_ptr InputObject) { + return new ObjectImageCommon(std::move(InputObject)); } }; diff --git a/lib/ExecutionEngine/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp index 9b7d348..b10d51f 100644 --- a/lib/ExecutionEngine/TargetSelect.cpp +++ b/lib/ExecutionEngine/TargetSelect.cpp @@ -47,7 +47,7 @@ TargetMachine *EngineBuilder::selectTarget(const Triple &TargetTriple, TheTriple.setTriple(sys::getProcessTriple()); // Adjust the triple to match what the user requested. 
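The applyRelocationValue helper added to RuntimeDyldMachO.h above is small enough to exercise in isolation. The following is a standalone sketch (plain C++, no LLVM dependencies, invented function name) of the same byte-wise store: the least significant Size bytes of Value are written lowest byte first, i.e. in little-endian order.

#include <cassert>
#include <cstdint>

// Write the low 'Size' bytes of 'Value' at 'Addr', least significant first.
static void writeBytesLE(uint8_t *Addr, uint64_t Value, unsigned Size) {
  for (unsigned i = 0; i < Size; ++i) {
    *Addr++ = static_cast<uint8_t>(Value); // emit the current low byte
    Value >>= 8;                           // shift the next byte down
  }
}

int main() {
  uint8_t Buf[4] = {0, 0, 0, 0};
  writeBytesLE(Buf, 0x11223344, 4);
  assert(Buf[0] == 0x44 && Buf[1] == 0x33 && Buf[2] == 0x22 && Buf[3] == 0x11);
}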
- const Target *TheTarget = 0; + const Target *TheTarget = nullptr; if (!MArch.empty()) { for (TargetRegistry::iterator it = TargetRegistry::begin(), ie = TargetRegistry::end(); it != ie; ++it) { @@ -61,7 +61,7 @@ TargetMachine *EngineBuilder::selectTarget(const Triple &TargetTriple, if (ErrorStr) *ErrorStr = "No available targets are compatible with this -march, " "see -version for the available targets.\n"; - return 0; + return nullptr; } // Adjust the triple to match (if known), otherwise stick with the @@ -72,10 +72,10 @@ TargetMachine *EngineBuilder::selectTarget(const Triple &TargetTriple, } else { std::string Error; TheTarget = TargetRegistry::lookupTarget(TheTriple.getTriple(), Error); - if (TheTarget == 0) { + if (!TheTarget) { if (ErrorStr) *ErrorStr = Error; - return 0; + return nullptr; } } diff --git a/lib/IR/Android.mk b/lib/IR/Android.mk index 071bb04..dd95703 100644 --- a/lib/IR/Android.mk +++ b/lib/IR/Android.mk @@ -30,6 +30,7 @@ vmcore_SRC_FILES := \ LeakDetector.cpp \ LegacyPassManager.cpp \ Mangler.cpp \ + MDBuilder.cpp \ Metadata.cpp \ Module.cpp \ Pass.cpp \ diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index d4670e4..0fef0d0 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -51,19 +51,19 @@ AssemblyAnnotationWriter::~AssemblyAnnotationWriter() {} static const Module *getModuleFromVal(const Value *V) { if (const Argument *MA = dyn_cast(V)) - return MA->getParent() ? MA->getParent()->getParent() : 0; + return MA->getParent() ? MA->getParent()->getParent() : nullptr; if (const BasicBlock *BB = dyn_cast(V)) - return BB->getParent() ? BB->getParent()->getParent() : 0; + return BB->getParent() ? BB->getParent()->getParent() : nullptr; if (const Instruction *I = dyn_cast(V)) { - const Function *M = I->getParent() ? I->getParent()->getParent() : 0; - return M ? M->getParent() : 0; + const Function *M = I->getParent() ? I->getParent()->getParent() : nullptr; + return M ? M->getParent() : nullptr; } if (const GlobalValue *GV = dyn_cast(V)) return GV->getParent(); - return 0; + return nullptr; } static void PrintCallingConv(unsigned cc, raw_ostream &Out) { @@ -78,7 +78,6 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break; case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break; case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break; - case CallingConv::X86_CDeclMethod:Out << "x86_cdeclmethodcc"; break; case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break; case CallingConv::ARM_APCS: Out << "arm_apcscc"; break; case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break; @@ -421,10 +420,10 @@ static SlotTracker *createSlotTracker(const Value *V) { if (!MD->isFunctionLocal()) return new SlotTracker(MD->getFunction()); - return new SlotTracker((Function *)0); + return new SlotTracker((Function *)nullptr); } - return 0; + return nullptr; } #if 0 @@ -436,21 +435,21 @@ static SlotTracker *createSlotTracker(const Value *V) { // Module level constructor. Causes the contents of the Module (sans functions) // to be added to the slot table. SlotTracker::SlotTracker(const Module *M) - : TheModule(M), TheFunction(0), FunctionProcessed(false), + : TheModule(M), TheFunction(nullptr), FunctionProcessed(false), mNext(0), fNext(0), mdnNext(0), asNext(0) { } // Function level constructor. Causes the contents of the Module and the one // function provided to be added to the slot table. SlotTracker::SlotTracker(const Function *F) - : TheModule(F ? 
F->getParent() : 0), TheFunction(F), FunctionProcessed(false), - mNext(0), fNext(0), mdnNext(0), asNext(0) { + : TheModule(F ? F->getParent() : nullptr), TheFunction(F), + FunctionProcessed(false), mNext(0), fNext(0), mdnNext(0), asNext(0) { } inline void SlotTracker::initialize() { if (TheModule) { processModule(); - TheModule = 0; ///< Prevent re-processing next time we're called. + TheModule = nullptr; ///< Prevent re-processing next time we're called. } if (TheFunction && !FunctionProcessed) @@ -560,7 +559,7 @@ void SlotTracker::processFunction() { void SlotTracker::purgeFunction() { ST_DEBUG("begin purgeFunction!\n"); fMap.clear(); // Simply discard the function level map - TheFunction = 0; + TheFunction = nullptr; FunctionProcessed = false; ST_DEBUG("end purgeFunction!\n"); } @@ -1048,7 +1047,7 @@ static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node, Out << "!{"; for (unsigned mi = 0, me = Node->getNumOperands(); mi != me; ++mi) { const Value *V = Node->getOperand(mi); - if (V == 0) + if (!V) Out << "null"; else { TypePrinter->print(V->getType(), Out); @@ -1126,12 +1125,6 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V, return; } - if (V->getValueID() == Value::PseudoSourceValueVal || - V->getValueID() == Value::FixedStackPseudoSourceValueVal) { - V->print(Out); - return; - } - char Prefix = '%'; int Slot; // If we have a SlotTracker, use it. @@ -1160,7 +1153,7 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V, Slot = Machine->getLocalSlot(V); } delete Machine; - Machine = 0; + Machine = nullptr; } else { Slot = -1; } @@ -1194,7 +1187,7 @@ AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, const Module *M, AssemblyWriter::~AssemblyWriter() { } void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) { - if (Operand == 0) { + if (!Operand) { Out << ""; return; } @@ -1259,7 +1252,7 @@ void AssemblyWriter::writeAtomicCmpXchg(AtomicOrdering SuccessOrdering, void AssemblyWriter::writeParamOperand(const Value *Operand, AttributeSet Attrs, unsigned Idx) { - if (Operand == 0) { + if (!Operand) { Out << ""; return; } @@ -1500,10 +1493,16 @@ void AssemblyWriter::printAlias(const GlobalAlias *GA) { PrintLinkage(GA->getLinkage(), Out); + PointerType *Ty = GA->getType(); const Constant *Aliasee = GA->getAliasee(); + if (!Aliasee || Ty != Aliasee->getType()) { + if (unsigned AddressSpace = Ty->getAddressSpace()) + Out << "addrspace(" << AddressSpace << ") "; + TypePrinter.print(Ty->getElementType(), Out); + Out << ", "; + } - if (Aliasee == 0) { - TypePrinter.print(GA->getType(), Out); + if (!Aliasee) { Out << " <>"; } else { writeOperand(Aliasee, !isa(Aliasee)); @@ -1707,7 +1706,7 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) { Out << ""; } - if (BB->getParent() == 0) { + if (!BB->getParent()) { Out.PadToColumn(50); Out << "; Error: Block without parent!"; } else if (BB != &BB->getParent()->getEntryBlock()) { // Not the entry block? @@ -1774,8 +1773,12 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << '%' << SlotNum << " = "; } - if (isa(I) && cast(I).isTailCall()) - Out << "tail "; + if (const CallInst *CI = dyn_cast(&I)) { + if (CI->isMustTailCall()) + Out << "musttail "; + else if (CI->isTailCall()) + Out << "tail "; + } // Print out the opcode... Out << I.getOpcodeName(); @@ -1804,7 +1807,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { writeAtomicRMWOperation(Out, RMWI->getOperation()); // Print out the type of the operands... 
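The printInstruction change above introduces one small ordering rule worth spelling out: a musttail call prints the musttail marker, and only otherwise does the plain tail marker apply. A minimal sketch of that precedence, with invented names (in LLVM itself isTailCall() is also true for musttail calls):

#include <cassert>
#include <string>

// Pick the call marker the printer emits: 'musttail' wins over 'tail'.
static std::string callMarker(bool IsMustTail, bool IsTailCall) {
  if (IsMustTail) return "musttail ";
  if (IsTailCall) return "tail ";
  return "";
}

int main() {
  assert(callMarker(true, true) == "musttail "); // musttail takes precedence
  assert(callMarker(false, true) == "tail ");
}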
- const Value *Operand = I.getNumOperands() ? I.getOperand(0) : 0; + const Value *Operand = I.getNumOperands() ? I.getOperand(0) : nullptr; // Special case conditional branches to swizzle the condition out to the front if (isa(I) && cast(I).isConditional()) { @@ -2147,15 +2150,15 @@ void Module::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW) const { W.printModule(this); } -void NamedMDNode::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW) const { +void NamedMDNode::print(raw_ostream &ROS) const { SlotTracker SlotTable(getParent()); formatted_raw_ostream OS(ROS); - AssemblyWriter W(OS, SlotTable, getParent(), AAW); + AssemblyWriter W(OS, SlotTable, getParent(), nullptr); W.printNamedMDNode(this); } void Type::print(raw_ostream &OS) const { - if (this == 0) { + if (!this) { OS << ""; return; } @@ -2170,24 +2173,24 @@ void Type::print(raw_ostream &OS) const { } } -void Value::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW) const { - if (this == 0) { +void Value::print(raw_ostream &ROS) const { + if (!this) { ROS << "printing a value\n"; return; } formatted_raw_ostream OS(ROS); if (const Instruction *I = dyn_cast(this)) { - const Function *F = I->getParent() ? I->getParent()->getParent() : 0; + const Function *F = I->getParent() ? I->getParent()->getParent() : nullptr; SlotTracker SlotTable(F); - AssemblyWriter W(OS, SlotTable, getModuleFromVal(I), AAW); + AssemblyWriter W(OS, SlotTable, getModuleFromVal(I), nullptr); W.printInstruction(*I); } else if (const BasicBlock *BB = dyn_cast(this)) { SlotTracker SlotTable(BB->getParent()); - AssemblyWriter W(OS, SlotTable, getModuleFromVal(BB), AAW); + AssemblyWriter W(OS, SlotTable, getModuleFromVal(BB), nullptr); W.printBasicBlock(BB); } else if (const GlobalValue *GV = dyn_cast(this)) { SlotTracker SlotTable(GV->getParent()); - AssemblyWriter W(OS, SlotTable, GV->getParent(), AAW); + AssemblyWriter W(OS, SlotTable, GV->getParent(), nullptr); if (const GlobalVariable *V = dyn_cast(GV)) W.printGlobal(V); else if (const Function *F = dyn_cast(GV)) @@ -2197,20 +2200,18 @@ void Value::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW) const { } else if (const MDNode *N = dyn_cast(this)) { const Function *F = N->getFunction(); SlotTracker SlotTable(F); - AssemblyWriter W(OS, SlotTable, F ? F->getParent() : 0, AAW); + AssemblyWriter W(OS, SlotTable, F ? F->getParent() : nullptr, nullptr); W.printMDNodeBody(N); } else if (const Constant *C = dyn_cast(this)) { TypePrinting TypePrinter; TypePrinter.print(C->getType(), OS); OS << ' '; - WriteConstantInternal(OS, C, TypePrinter, 0, 0); + WriteConstantInternal(OS, C, TypePrinter, nullptr, nullptr); } else if (isa(this) || isa(this) || isa(this)) { this->printAsOperand(OS); } else { - // Otherwise we don't know what it is. Call the virtual function to - // allow a subclass to print itself. - printCustom(OS); + llvm_unreachable("Unknown value to print out!"); } } @@ -2220,7 +2221,7 @@ void Value::printAsOperand(raw_ostream &O, bool PrintType, const Module *M) cons if (!PrintType && ((!isa(this) && !isa(this)) || hasName() || isa(this))) { - WriteAsOperandInternal(O, this, 0, 0, M); + WriteAsOperandInternal(O, this, nullptr, nullptr, M); return; } @@ -2235,12 +2236,7 @@ void Value::printAsOperand(raw_ostream &O, bool PrintType, const Module *M) cons O << ' '; } - WriteAsOperandInternal(O, this, &TypePrinter, 0, M); -} - -// Value::printCustom - subclasses should override this to implement printing. 
-void Value::printCustom(raw_ostream &OS) const { - llvm_unreachable("Unknown value to print out!"); + WriteAsOperandInternal(O, this, &TypePrinter, nullptr, M); } // Value::dump - allow easy printing of Values from the debugger. @@ -2250,7 +2246,7 @@ void Value::dump() const { print(dbgs()); dbgs() << '\n'; } void Type::dump() const { print(dbgs()); } // Module::dump() - Allow printing of Modules from the debugger. -void Module::dump() const { print(dbgs(), 0); } +void Module::dump() const { print(dbgs(), nullptr); } // NamedMDNode::dump() - Allow printing of NamedMDNodes from the debugger. -void NamedMDNode::dump() const { print(dbgs(), 0); } +void NamedMDNode::dump() const { print(dbgs()); } diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index 9d9d948..a9074bb 100644 --- a/lib/IR/Attributes.cpp +++ b/lib/IR/Attributes.cpp @@ -16,6 +16,7 @@ #include "llvm/IR/Attributes.h" #include "AttributeImpl.h" #include "LLVMContextImpl.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/IR/Type.h" #include "llvm/Support/Atomic.h" @@ -192,6 +193,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const { return "noinline"; if (hasAttribute(Attribute::NonLazyBind)) return "nonlazybind"; + if (hasAttribute(Attribute::NonNull)) + return "nonnull"; if (hasAttribute(Attribute::NoRedZone)) return "noredzone"; if (hasAttribute(Attribute::NoReturn)) @@ -391,6 +394,7 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) { case Attribute::Builtin: return 1ULL << 41; case Attribute::OptimizeNone: return 1ULL << 42; case Attribute::InAlloca: return 1ULL << 43; + case Attribute::NonNull: return 1ULL << 44; } llvm_unreachable("Unsupported attribute type"); } @@ -402,7 +406,7 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) { AttributeSetNode *AttributeSetNode::get(LLVMContext &C, ArrayRef Attrs) { if (Attrs.empty()) - return 0; + return nullptr; // Otherwise, build a key to look up the existing attributes. LLVMContextImpl *pImpl = C.pImpl; @@ -595,7 +599,8 @@ AttributeSet AttributeSet::get(LLVMContext &C, return getImpl(C, Attrs); } -AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index, AttrBuilder &B) { +AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index, + const AttrBuilder &B) { if (!B.hasAttributes()) return AttributeSet(); @@ -617,9 +622,9 @@ AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index, AttrBuilder &B) { } // Add target-dependent (string) attributes. - for (AttrBuilder::td_iterator I = B.td_begin(), E = B.td_end(); - I != E; ++I) - Attrs.push_back(std::make_pair(Index, Attribute::get(C, I->first,I->second))); + for (const AttrBuilder::td_type &TDA : B.td_attrs()) + Attrs.push_back( + std::make_pair(Index, Attribute::get(C, TDA.first, TDA.second))); return get(C, Attrs); } @@ -836,7 +841,7 @@ bool AttributeSet::hasAttributes(unsigned Index) const { /// \brief Return true if the specified attribute is set for at least one /// parameter or for the return value. bool AttributeSet::hasAttrSomewhere(Attribute::AttrKind Attr) const { - if (pImpl == 0) return false; + if (!pImpl) return false; for (unsigned I = 0, E = pImpl->getNumAttributes(); I != E; ++I) for (AttributeSetImpl::iterator II = pImpl->begin(I), @@ -877,14 +882,14 @@ std::string AttributeSet::getAsString(unsigned Index, /// \brief The attributes for the specified index are returned. 
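The getAttrMask change above gives the new NonNull attribute its own bit, 1ULL << 44, in a 64-bit mask where each enum attribute owns exactly one bit. A sketch of that encoding; only NonNull's position is taken from the hunk, the rest is illustrative:

#include <cstdint>

enum class AttrKind : unsigned { NonNull = 44 }; // bit position from the hunk

constexpr uint64_t attrMask(AttrKind K) {
  return 1ULL << static_cast<unsigned>(K); // one dedicated bit per attribute
}

static_assert(attrMask(AttrKind::NonNull) == (1ULL << 44),
              "masks combine with plain bitwise and/or");

int main() {}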
AttributeSetNode *AttributeSet::getAttributes(unsigned Index) const { - if (!pImpl) return 0; + if (!pImpl) return nullptr; // Loop through to find the attribute node we want. for (unsigned I = 0, E = pImpl->getNumAttributes(); I != E; ++I) if (pImpl->getSlotIndex(I) == Index) return pImpl->getSlotNode(I); - return 0; + return nullptr; } AttributeSet::iterator AttributeSet::begin(unsigned Slot) const { @@ -1175,6 +1180,7 @@ AttributeSet AttributeFuncs::typeIncompatible(Type *Ty, uint64_t Index) { .addAttribute(Attribute::Nest) .addAttribute(Attribute::NoAlias) .addAttribute(Attribute::NoCapture) + .addAttribute(Attribute::NonNull) .addAttribute(Attribute::ReadNone) .addAttribute(Attribute::ReadOnly) .addAttribute(Attribute::StructRet) diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index b7429b3..e255113 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -115,7 +115,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name == "x86.avx.movnt.ps.256" || Name == "x86.sse42.crc32.64.8" || (Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) { - NewFn = 0; + NewFn = nullptr; return true; } // SSE4.1 ptest functions may have an old signature. @@ -158,7 +158,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { } bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn) { - NewFn = 0; + NewFn = nullptr; bool Upgraded = UpgradeIntrinsicFunction1(F, NewFn); // Upgrade intrinsic attributes. This does not change the function. @@ -170,7 +170,62 @@ bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn) { return Upgraded; } +static bool UpgradeGlobalStructors(GlobalVariable *GV) { + ArrayType *ATy = dyn_cast(GV->getType()->getElementType()); + StructType *OldTy = + ATy ? dyn_cast(ATy->getElementType()) : nullptr; + + // Only upgrade an array of a two field struct with the appropriate field + // types. + if (!OldTy || OldTy->getNumElements() != 2) + return false; + + // Get the upgraded 3 element type. + PointerType *VoidPtrTy = Type::getInt8Ty(GV->getContext())->getPointerTo(); + Type *Tys[3] = { + OldTy->getElementType(0), + OldTy->getElementType(1), + VoidPtrTy + }; + StructType *NewTy = + StructType::get(GV->getContext(), Tys, /*isPacked=*/false); + + // Build new constants with a null third field filled in. + Constant *OldInitC = GV->getInitializer(); + ConstantArray *OldInit = dyn_cast(OldInitC); + if (!OldInit && !isa(OldInitC)) + return false; + std::vector Initializers; + if (OldInit) { + for (Use &U : OldInit->operands()) { + ConstantStruct *Init = cast(&U); + Constant *NewInit = + ConstantStruct::get(NewTy, Init->getOperand(0), Init->getOperand(1), + Constant::getNullValue(VoidPtrTy), nullptr); + Initializers.push_back(NewInit); + } + } + assert(Initializers.size() == ATy->getNumElements()); + + // Replace the old GV with a new one. 
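UpgradeGlobalStructors above rewrites each {priority, function} entry of llvm.global_ctors and llvm.global_dtors into a three-field form whose new third member is a null i8*. A plain-C++ analogue of that per-entry rewrite; the struct names here are invented for illustration and stand in for the IR constants:

#include <cassert>
#include <vector>

struct OldEntry { int Priority; void (*Func)(); };             // two fields
struct NewEntry { int Priority; void (*Func)(); void *Data; }; // three fields

static std::vector<NewEntry> upgradeStructors(const std::vector<OldEntry> &Old) {
  std::vector<NewEntry> New;
  New.reserve(Old.size());
  for (const OldEntry &E : Old)
    New.push_back({E.Priority, E.Func, nullptr}); // third field filled with null
  return New;
}

int main() {
  std::vector<OldEntry> Ctors = {{65535, nullptr}};
  assert(upgradeStructors(Ctors)[0].Data == nullptr);
}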
+ ATy = ArrayType::get(NewTy, Initializers.size()); + Constant *NewInit = ConstantArray::get(ATy, Initializers); + GlobalVariable *NewGV = new GlobalVariable( + *GV->getParent(), ATy, GV->isConstant(), GV->getLinkage(), NewInit, "", + GV, GV->getThreadLocalMode(), GV->getType()->getAddressSpace(), + GV->isExternallyInitialized()); + NewGV->copyAttributesFrom(GV); + NewGV->takeName(GV); + assert(GV->use_empty() && "program cannot use initializer list"); + GV->eraseFromParent(); + return true; +} + bool llvm::UpgradeGlobalVariable(GlobalVariable *GV) { + if (GV->getName() == "llvm.global_ctors" || + GV->getName() == "llvm.global_dtors") + return UpgradeGlobalStructors(GV); + // Nothing to do yet. return false; } @@ -453,9 +508,9 @@ void llvm::UpgradeInstWithTBAATag(Instruction *I) { Instruction *llvm::UpgradeBitCastInst(unsigned Opc, Value *V, Type *DestTy, Instruction *&Temp) { if (Opc != Instruction::BitCast) - return 0; + return nullptr; - Temp = 0; + Temp = nullptr; Type *SrcTy = V->getType(); if (SrcTy->isPtrOrPtrVectorTy() && DestTy->isPtrOrPtrVectorTy() && SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace()) { @@ -469,12 +524,12 @@ Instruction *llvm::UpgradeBitCastInst(unsigned Opc, Value *V, Type *DestTy, return CastInst::Create(Instruction::IntToPtr, Temp, DestTy); } - return 0; + return nullptr; } Value *llvm::UpgradeBitCastExpr(unsigned Opc, Constant *C, Type *DestTy) { if (Opc != Instruction::BitCast) - return 0; + return nullptr; Type *SrcTy = C->getType(); if (SrcTy->isPtrOrPtrVectorTy() && DestTy->isPtrOrPtrVectorTy() && @@ -489,7 +544,7 @@ Value *llvm::UpgradeBitCastExpr(unsigned Opc, Constant *C, Type *DestTy) { DestTy); } - return 0; + return nullptr; } /// Check the debug info version number, if it is out-dated, drop the debug diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp index 3079f0a..ba07433 100644 --- a/lib/IR/BasicBlock.cpp +++ b/lib/IR/BasicBlock.cpp @@ -27,7 +27,7 @@ using namespace llvm; ValueSymbolTable *BasicBlock::getValueSymbolTable() { if (Function *F = getParent()) return &F->getValueSymbolTable(); - return 0; + return nullptr; } const DataLayout *BasicBlock::getDataLayout() const { @@ -45,7 +45,7 @@ template class llvm::SymbolTableListTraits; BasicBlock::BasicBlock(LLVMContext &C, const Twine &Name, Function *NewParent, BasicBlock *InsertBefore) - : Value(Type::getLabelTy(C), Value::BasicBlockVal), Parent(0) { + : Value(Type::getLabelTy(C), Value::BasicBlockVal), Parent(nullptr) { // Make sure that we get added to a function LeakDetector::addGarbageObject(this); @@ -81,7 +81,7 @@ BasicBlock::~BasicBlock() { } } - assert(getParent() == 0 && "BasicBlock still linked into the program!"); + assert(getParent() == nullptr && "BasicBlock still linked into the program!"); dropAllReferences(); InstList.clear(); } @@ -122,12 +122,12 @@ void BasicBlock::moveAfter(BasicBlock *MovePos) { TerminatorInst *BasicBlock::getTerminator() { - if (InstList.empty()) return 0; + if (InstList.empty()) return nullptr; return dyn_cast(&InstList.back()); } const TerminatorInst *BasicBlock::getTerminator() const { - if (InstList.empty()) return 0; + if (InstList.empty()) return nullptr; return dyn_cast(&InstList.back()); } @@ -186,10 +186,10 @@ void BasicBlock::dropAllReferences() { /// return the block, otherwise return a null pointer. BasicBlock *BasicBlock::getSinglePredecessor() { pred_iterator PI = pred_begin(this), E = pred_end(this); - if (PI == E) return 0; // No preds. + if (PI == E) return nullptr; // No preds. 
BasicBlock *ThePred = *PI; ++PI; - return (PI == E) ? ThePred : 0 /*multiple preds*/; + return (PI == E) ? ThePred : nullptr /*multiple preds*/; } /// getUniquePredecessor - If this basic block has a unique predecessor block, @@ -199,12 +199,12 @@ BasicBlock *BasicBlock::getSinglePredecessor() { /// a switch statement with multiple cases having the same destination). BasicBlock *BasicBlock::getUniquePredecessor() { pred_iterator PI = pred_begin(this), E = pred_end(this); - if (PI == E) return 0; // No preds. + if (PI == E) return nullptr; // No preds. BasicBlock *PredBB = *PI; ++PI; for (;PI != E; ++PI) { if (*PI != PredBB) - return 0; + return nullptr; // The same predecessor appears multiple times in the predecessor list. // This is OK. } @@ -277,7 +277,7 @@ void BasicBlock::removePredecessor(BasicBlock *Pred, PN->removeIncomingValue(Pred, false); // If all incoming values to the Phi are the same, we can replace the Phi // with that value. - Value* PNV = 0; + Value* PNV = nullptr; if (!DontDeleteUselessPHIs && (PNV = PN->hasConstantValue())) if (PNV != PN) { PN->replaceAllUsesWith(PNV); diff --git a/lib/IR/CMakeLists.txt b/lib/IR/CMakeLists.txt index 09117aa..b027ae5 100644 --- a/lib/IR/CMakeLists.txt +++ b/lib/IR/CMakeLists.txt @@ -7,12 +7,12 @@ add_llvm_library(LLVMCore ConstantRange.cpp Constants.cpp Core.cpp - DiagnosticInfo.cpp - DiagnosticPrinter.cpp DIBuilder.cpp DataLayout.cpp DebugInfo.cpp DebugLoc.cpp + DiagnosticInfo.cpp + DiagnosticPrinter.cpp Dominators.cpp Function.cpp GCOV.cpp @@ -28,6 +28,7 @@ add_llvm_library(LLVMCore LLVMContextImpl.cpp LeakDetector.cpp LegacyPassManager.cpp + MDBuilder.cpp Mangler.cpp Metadata.cpp Module.cpp diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp index 612aba0..706e66f 100644 --- a/lib/IR/ConstantFold.cpp +++ b/lib/IR/ConstantFold.cpp @@ -51,7 +51,7 @@ static Constant *BitCastConstantVector(Constant *CV, VectorType *DstTy) { // Analysis/ConstantFolding.cpp unsigned NumElts = DstTy->getNumElements(); if (NumElts != CV->getType()->getVectorNumElements()) - return 0; + return nullptr; Type *DstEltTy = DstTy->getElementType(); @@ -94,7 +94,7 @@ foldConstantCastPair( // Let CastInst::isEliminableCastPair do the heavy lifting. return CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy, DstTy, - 0, FakeIntPtrTy, 0); + nullptr, FakeIntPtrTy, nullptr); } static Constant *FoldBitCast(Constant *V, Type *DestTy) { @@ -139,7 +139,7 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) { if (VectorType *SrcTy = dyn_cast(V->getType())) { assert(DestPTy->getBitWidth() == SrcTy->getBitWidth() && "Not cast between same sized vectors!"); - SrcTy = NULL; + SrcTy = nullptr; // First, check for null. Undef is already handled. if (isa(V)) return Constant::getNullValue(DestTy); @@ -173,7 +173,7 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) { CI->getValue())); // Otherwise, can't fold this (vector?) - return 0; + return nullptr; } // Handle ConstantFP input: FP -> Integral. @@ -181,7 +181,7 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) { return ConstantInt::get(FP->getContext(), FP->getValueAPF().bitcastToAPInt()); - return 0; + return nullptr; } @@ -216,14 +216,14 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart, // In the input is a constant expr, we might be able to recursively simplify. // If not, we definitely can't do anything. 
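ExtractConstantBytes, whose cases follow below, pulls ByteSize bytes starting at byte ByteStart out of a constant and only handles shifts that are whole multiples of 8 bits, which it can treat as byte moves. A standalone arithmetic sketch of that extraction on a 64-bit value (little-endian byte numbering, byte 0 is the least significant):

#include <cassert>
#include <cstdint>

// Extract 'ByteSize' bytes starting at byte 'ByteStart'.
static uint64_t extractBytes(uint64_t C, unsigned ByteStart, unsigned ByteSize) {
  uint64_t Shifted = C >> (8 * ByteStart);                           // LShr case
  uint64_t Mask = ByteSize >= 8 ? ~0ULL : (1ULL << (8 * ByteSize)) - 1;
  return Shifted & Mask;
}

int main() {
  assert(extractBytes(0x1122334455667788ULL, 2, 2) == 0x5566);
}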
ConstantExpr *CE = dyn_cast(C); - if (CE == 0) return 0; - + if (!CE) return nullptr; + switch (CE->getOpcode()) { - default: return 0; + default: return nullptr; case Instruction::Or: { Constant *RHS = ExtractConstantBytes(CE->getOperand(1), ByteStart,ByteSize); - if (RHS == 0) - return 0; + if (!RHS) + return nullptr; // X | -1 -> -1. if (ConstantInt *RHSC = dyn_cast(RHS)) @@ -231,32 +231,32 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart, return RHSC; Constant *LHS = ExtractConstantBytes(CE->getOperand(0), ByteStart,ByteSize); - if (LHS == 0) - return 0; + if (!LHS) + return nullptr; return ConstantExpr::getOr(LHS, RHS); } case Instruction::And: { Constant *RHS = ExtractConstantBytes(CE->getOperand(1), ByteStart,ByteSize); - if (RHS == 0) - return 0; + if (!RHS) + return nullptr; // X & 0 -> 0. if (RHS->isNullValue()) return RHS; Constant *LHS = ExtractConstantBytes(CE->getOperand(0), ByteStart,ByteSize); - if (LHS == 0) - return 0; + if (!LHS) + return nullptr; return ConstantExpr::getAnd(LHS, RHS); } case Instruction::LShr: { ConstantInt *Amt = dyn_cast(CE->getOperand(1)); - if (Amt == 0) - return 0; + if (!Amt) + return nullptr; unsigned ShAmt = Amt->getZExtValue(); // Cannot analyze non-byte shifts. if ((ShAmt & 7) != 0) - return 0; + return nullptr; ShAmt >>= 3; // If the extract is known to be all zeros, return zero. @@ -268,17 +268,17 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart, return ExtractConstantBytes(CE->getOperand(0), ByteStart+ShAmt, ByteSize); // TODO: Handle the 'partially zero' case. - return 0; + return nullptr; } case Instruction::Shl: { ConstantInt *Amt = dyn_cast(CE->getOperand(1)); - if (Amt == 0) - return 0; + if (!Amt) + return nullptr; unsigned ShAmt = Amt->getZExtValue(); // Cannot analyze non-byte shifts. if ((ShAmt & 7) != 0) - return 0; + return nullptr; ShAmt >>= 3; // If the extract is known to be all zeros, return zero. @@ -290,7 +290,7 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart, return ExtractConstantBytes(CE->getOperand(0), ByteStart-ShAmt, ByteSize); // TODO: Handle the 'partially zero' case. - return 0; + return nullptr; } case Instruction::ZExt: { @@ -324,7 +324,7 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart, } // TODO: Handle the 'partially zero' case. - return 0; + return nullptr; } } } @@ -376,7 +376,7 @@ static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy, // If there's no interesting folding happening, bail so that we don't create // a constant that looks like it needs folding but really doesn't. if (!Folded) - return 0; + return nullptr; // Base case: Get a regular sizeof expression. Constant *C = ConstantExpr::getSizeOf(Ty); @@ -442,7 +442,7 @@ static Constant *getFoldedAlignOf(Type *Ty, Type *DestTy, // If there's no interesting folding happening, bail so that we don't create // a constant that looks like it needs folding but really doesn't. if (!Folded) - return 0; + return nullptr; // Base case: Get a regular alignof expression. Constant *C = ConstantExpr::getAlignOf(Ty); @@ -473,7 +473,7 @@ static Constant *getFoldedOffsetOf(Type *Ty, Constant *FieldNo, unsigned NumElems = STy->getNumElements(); // An empty struct has no members. if (NumElems == 0) - return 0; + return nullptr; // Check for a struct with all members having the same size. 
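The check that follows below handles the homogeneous-struct case: when every member has the same folded size, offsetof collapses to a single multiply. The arithmetic, stated directly with invented names:

#include <cstdint>

// offsetof for field FieldNo of a struct whose members all share one size.
constexpr uint64_t foldedOffsetOf(uint64_t FieldNo, uint64_t MemberSize) {
  return FieldNo * MemberSize;
}

static_assert(foldedOffsetOf(3, 8) == 24, "the fourth i64 member starts at byte 24");

int main() {}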
Constant *MemberSize = getFoldedSizeOf(STy->getElementType(0), DestTy, true); @@ -497,7 +497,7 @@ static Constant *getFoldedOffsetOf(Type *Ty, Constant *FieldNo, // If there's no interesting folding happening, bail so that we don't create // a constant that looks like it needs folding but really doesn't. if (!Folded) - return 0; + return nullptr; // Base case: Get a regular offsetof expression. Constant *C = ConstantExpr::getOffsetOf(Ty, FieldNo); @@ -582,7 +582,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, APFloat::rmNearestTiesToEven, &ignored); return ConstantFP::get(V->getContext(), Val); } - return 0; // Can't fold. + return nullptr; // Can't fold. case Instruction::FPToUI: case Instruction::FPToSI: if (ConstantFP *FPC = dyn_cast(V)) { @@ -595,11 +595,11 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, APInt Val(DestBitWidth, x); return ConstantInt::get(FPC->getContext(), Val); } - return 0; // Can't fold. + return nullptr; // Can't fold. case Instruction::IntToPtr: //always treated as unsigned if (V->isNullValue()) // Is it an integral null value? return ConstantPointerNull::get(cast(DestTy)); - return 0; // Other pointer types cannot be casted + return nullptr; // Other pointer types cannot be casted case Instruction::PtrToInt: // always treated as unsigned // Is it a null pointer value? if (V->isNullValue()) @@ -643,7 +643,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, } } // Other pointer types cannot be casted - return 0; + return nullptr; case Instruction::UIToFP: case Instruction::SIToFP: if (ConstantInt *CI = dyn_cast(V)) { @@ -655,21 +655,21 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, APFloat::rmNearestTiesToEven); return ConstantFP::get(V->getContext(), apf); } - return 0; + return nullptr; case Instruction::ZExt: if (ConstantInt *CI = dyn_cast(V)) { uint32_t BitWidth = cast(DestTy)->getBitWidth(); return ConstantInt::get(V->getContext(), CI->getValue().zext(BitWidth)); } - return 0; + return nullptr; case Instruction::SExt: if (ConstantInt *CI = dyn_cast(V)) { uint32_t BitWidth = cast(DestTy)->getBitWidth(); return ConstantInt::get(V->getContext(), CI->getValue().sext(BitWidth)); } - return 0; + return nullptr; case Instruction::Trunc: { uint32_t DestBitWidth = cast(DestTy)->getBitWidth(); if (ConstantInt *CI = dyn_cast(V)) { @@ -685,12 +685,12 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, if (Constant *Res = ExtractConstantBytes(V, 0, DestBitWidth / 8)) return Res; - return 0; + return nullptr; } case Instruction::BitCast: return FoldBitCast(V, DestTy); case Instruction::AddrSpaceCast: - return 0; + return nullptr; } } @@ -746,7 +746,7 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond, return ConstantExpr::getSelect(Cond, V1, FalseVal->getOperand(2)); } - return 0; + return nullptr; } Constant *llvm::ConstantFoldExtractElementInstruction(Constant *Val, @@ -766,14 +766,14 @@ Constant *llvm::ConstantFoldExtractElementInstruction(Constant *Val, return UndefValue::get(Val->getType()->getVectorElementType()); return Val->getAggregateElement(Index); } - return 0; + return nullptr; } Constant *llvm::ConstantFoldInsertElementInstruction(Constant *Val, Constant *Elt, Constant *Idx) { ConstantInt *CIdx = dyn_cast(Idx); - if (!CIdx) return 0; + if (!CIdx) return nullptr; const APInt &IdxVal = CIdx->getValue(); SmallVector Result; @@ -803,7 +803,7 @@ Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1, return 
UndefValue::get(VectorType::get(EltTy, MaskNumElts)); // Don't break the bitcode reader hack. - if (isa(Mask)) return 0; + if (isa(Mask)) return nullptr; unsigned SrcNumElts = V1->getType()->getVectorNumElements(); @@ -842,7 +842,7 @@ Constant *llvm::ConstantFoldExtractValueInstruction(Constant *Agg, if (Constant *C = Agg->getAggregateElement(Idxs[0])) return ConstantFoldExtractValueInstruction(C, Idxs.slice(1)); - return 0; + return nullptr; } Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg, @@ -863,8 +863,8 @@ Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg, SmallVector Result; for (unsigned i = 0; i != NumElts; ++i) { Constant *C = Agg->getAggregateElement(i); - if (C == 0) return 0; - + if (!C) return nullptr; + if (Idxs[0] == i) C = ConstantFoldInsertValueInstruction(C, Val, Idxs.slice(1)); @@ -1209,7 +1209,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, } // We don't know how to fold this. - return 0; + return nullptr; } /// isZeroSizedType - This type is zero sized if its an array or structure of @@ -1289,7 +1289,7 @@ static FCmpInst::Predicate evaluateFCmpRelation(Constant *V1, Constant *V2) { if (!isa(V1)) { if (!isa(V2)) { // We distilled this down to a simple case, use the standard constant // folder. - ConstantInt *R = 0; + ConstantInt *R = nullptr; R = dyn_cast( ConstantExpr::getFCmp(FCmpInst::FCMP_OEQ, V1, V2)); if (R && !R->isZero()) @@ -1355,7 +1355,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2, !isa(V2)) { // We distilled this down to a simple case, use the standard constant // folder. - ConstantInt *R = 0; + ConstantInt *R = nullptr; ICmpInst::Predicate pred = ICmpInst::ICMP_EQ; R = dyn_cast(ConstantExpr::getICmp(pred, V1, V2)); if (R && !R->isZero()) @@ -1885,7 +1885,7 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, return ConstantExpr::getICmp(pred, C2, C1); } } - return 0; + return nullptr; } /// isInBoundsIndices - Test whether the given sequence of *normalized* indices @@ -1951,7 +1951,7 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C, if (isa(C)) { PointerType *Ptr = cast(C->getType()); Type *Ty = GetElementPtrInst::getIndexedType(Ptr, Idxs); - assert(Ty != 0 && "Invalid indices for GEP!"); + assert(Ty && "Invalid indices for GEP!"); return UndefValue::get(PointerType::get(Ty, Ptr->getAddressSpace())); } @@ -1965,7 +1965,7 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C, if (isNull) { PointerType *Ptr = cast(C->getType()); Type *Ty = GetElementPtrInst::getIndexedType(Ptr, Idxs); - assert(Ty != 0 && "Invalid indices for GEP!"); + assert(Ty && "Invalid indices for GEP!"); return ConstantPointerNull::get(PointerType::get(Ty, Ptr->getAddressSpace())); } @@ -1977,7 +1977,7 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C, // getelementptr instructions into a single instruction.
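Combining two getelementptr constant expressions, as the code just below does, amounts to adding the trailing index of the inner GEP to the leading index of the outer one. A pointer-arithmetic sketch of why that is sound for a flat array:

#include <cassert>

// (Base + I) + J == Base + (I + J): folding two index steps into one.
static const int *combinedGEP(const int *Base, long I, long J) {
  return Base + (I + J);
}

int main() {
  int Arr[8] = {};
  assert(combinedGEP(Arr, 2, 3) == (Arr + 2) + 3);
}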
// if (CE->getOpcode() == Instruction::GetElementPtr) { - Type *LastTy = 0; + Type *LastTy = nullptr; for (gep_type_iterator I = gep_type_begin(CE), E = gep_type_end(CE); I != E; ++I) LastTy = *I; @@ -2072,7 +2072,7 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C, bool Unknown = false; SmallVector NewIdxs; Type *Ty = C->getType(); - Type *Prev = 0; + Type *Prev = nullptr; for (unsigned i = 0, e = Idxs.size(); i != e; Prev = Ty, Ty = cast(Ty)->getTypeAtIndex(Idxs[i]), ++i) { if (ConstantInt *CI = dyn_cast(Idxs[i])) { @@ -2130,7 +2130,7 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C, isa(C) && isInBoundsIndices(Idxs)) return ConstantExpr::getInBoundsGetElementPtr(C, Idxs); - return 0; + return nullptr; } Constant *llvm::ConstantFoldGetElementPtr(Constant *C, diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index 2a3a5fd..bb8d60b 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -182,13 +182,13 @@ Constant *Constant::getAllOnesValue(Type *Ty) { /// 'this' is a constant expr. Constant *Constant::getAggregateElement(unsigned Elt) const { if (const ConstantStruct *CS = dyn_cast(this)) - return Elt < CS->getNumOperands() ? CS->getOperand(Elt) : 0; + return Elt < CS->getNumOperands() ? CS->getOperand(Elt) : nullptr; if (const ConstantArray *CA = dyn_cast(this)) - return Elt < CA->getNumOperands() ? CA->getOperand(Elt) : 0; + return Elt < CA->getNumOperands() ? CA->getOperand(Elt) : nullptr; if (const ConstantVector *CV = dyn_cast(this)) - return Elt < CV->getNumOperands() ? CV->getOperand(Elt) : 0; + return Elt < CV->getNumOperands() ? CV->getOperand(Elt) : nullptr; if (const ConstantAggregateZero *CAZ =dyn_cast(this)) return CAZ->getElementValue(Elt); @@ -197,15 +197,16 @@ Constant *Constant::getAggregateElement(unsigned Elt) const { return UV->getElementValue(Elt); if (const ConstantDataSequential *CDS =dyn_cast(this)) - return Elt < CDS->getNumElements() ? CDS->getElementAsConstant(Elt) : 0; - return 0; + return Elt < CDS->getNumElements() ? 
CDS->getElementAsConstant(Elt) + : nullptr; + return nullptr; } Constant *Constant::getAggregateElement(Constant *Elt) const { assert(isa(Elt->getType()) && "Index must be an integer"); if (ConstantInt *CI = dyn_cast(Elt)) return getAggregateElement(CI->getZExtValue()); - return 0; + return nullptr; } @@ -309,7 +310,7 @@ bool Constant::isThreadDependent() const { bool Constant::isConstantUsed() const { for (const User *U : users()) { const Constant *UC = dyn_cast(U); - if (UC == 0 || isa(UC)) + if (!UC || isa(UC)) return true; if (UC->isConstantUsed()) @@ -397,7 +398,7 @@ void Constant::removeDeadConstantUsers() const { Value::const_user_iterator LastNonDeadUser = E; while (I != E) { const Constant *User = dyn_cast(*I); - if (User == 0) { + if (!User) { LastNonDeadUser = I; ++I; continue; @@ -431,7 +432,7 @@ void Constant::removeDeadConstantUsers() const { void ConstantInt::anchor() { } ConstantInt::ConstantInt(IntegerType *Ty, const APInt& V) - : Constant(Ty, ConstantIntVal, 0, 0), Val(V) { + : Constant(Ty, ConstantIntVal, nullptr, 0), Val(V) { assert(V.getBitWidth() == Ty->getBitWidth() && "Invalid constant for type"); } @@ -644,7 +645,7 @@ Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) { } ConstantFP::ConstantFP(Type *Ty, const APFloat& V) - : Constant(Ty, ConstantFPVal, 0, 0), Val(V) { + : Constant(Ty, ConstantFPVal, nullptr, 0), Val(V) { assert(&V.getSemantics() == TypeToFloatSemantics(Ty) && "FP type Mismatch"); } @@ -1235,7 +1236,7 @@ ConstantAggregateZero *ConstantAggregateZero::get(Type *Ty) { "Cannot create an aggregate zero of non-aggregate type!"); ConstantAggregateZero *&Entry = Ty->getContext().pImpl->CAZConstants[Ty]; - if (Entry == 0) + if (!Entry) Entry = new ConstantAggregateZero(Ty); return Entry; @@ -1283,7 +1284,7 @@ Constant *Constant::getSplatValue() const { return CV->getSplatValue(); if (const ConstantVector *CV = dyn_cast(this)) return CV->getSplatValue(); - return 0; + return nullptr; } /// getSplatValue - If this is a splat constant, where all of the @@ -1294,7 +1295,7 @@ Constant *ConstantVector::getSplatValue() const { // Then make sure all remaining elements point to the same value. 
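getSplatValue, whose loop continues below, reports a splat only when every element is the same operand as the first. A standalone sketch of that check over a plain vector of element pointers:

#include <cassert>
#include <vector>

// Return the splatted element, or nullptr when two elements differ.
static const int *splatValue(const std::vector<const int *> &Elts) {
  if (Elts.empty()) return nullptr;
  for (const int *E : Elts)
    if (E != Elts[0]) return nullptr; // mixed vector: no splat
  return Elts[0];
}

int main() {
  int A = 1, B = 2;
  assert(splatValue({&A, &A, &A}) == &A);
  assert(splatValue({&A, &B}) == nullptr);
}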
for (unsigned I = 1, E = getNumOperands(); I < E; ++I) if (getOperand(I) != Elt) - return 0; + return nullptr; return Elt; } @@ -1315,7 +1316,7 @@ const APInt &Constant::getUniqueInteger() const { ConstantPointerNull *ConstantPointerNull::get(PointerType *Ty) { ConstantPointerNull *&Entry = Ty->getContext().pImpl->CPNConstants[Ty]; - if (Entry == 0) + if (!Entry) Entry = new ConstantPointerNull(Ty); return Entry; @@ -1335,7 +1336,7 @@ void ConstantPointerNull::destroyConstant() { UndefValue *UndefValue::get(Type *Ty) { UndefValue *&Entry = Ty->getContext().pImpl->UVConstants[Ty]; - if (Entry == 0) + if (!Entry) Entry = new UndefValue(Ty); return Entry; @@ -1353,14 +1354,14 @@ void UndefValue::destroyConstant() { // BlockAddress *BlockAddress::get(BasicBlock *BB) { - assert(BB->getParent() != 0 && "Block must have a parent"); + assert(BB->getParent() && "Block must have a parent"); return get(BB->getParent(), BB); } BlockAddress *BlockAddress::get(Function *F, BasicBlock *BB) { BlockAddress *&BA = F->getContext().pImpl->BlockAddresses[std::make_pair(F, BB)]; - if (BA == 0) + if (!BA) BA = new BlockAddress(F, BB); assert(BA->getFunction() == F && "Basic block moved between functions"); @@ -1377,10 +1378,10 @@ BlockAddress::BlockAddress(Function *F, BasicBlock *BB) BlockAddress *BlockAddress::lookup(const BasicBlock *BB) { if (!BB->hasAddressTaken()) - return 0; + return nullptr; const Function *F = BB->getParent(); - assert(F != 0 && "Block must have a parent"); + assert(F && "Block must have a parent"); BlockAddress *BA = F->getContext().pImpl->BlockAddresses.lookup(std::make_pair(F, BB)); assert(BA && "Refcount and block address map disagree!"); @@ -1411,7 +1412,7 @@ void BlockAddress::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) { // and return early. BlockAddress *&NewBA = getContext().pImpl->BlockAddresses[std::make_pair(NewF, NewBB)]; - if (NewBA == 0) { + if (!NewBA) { getBasicBlock()->AdjustBlockAddressRefCount(-1); // Remove the old entry, this can't cause the map to rehash (just a @@ -1792,7 +1793,7 @@ Constant *ConstantExpr::getAlignOf(Type* Ty) { // Note that a non-inbounds gep is used, as null isn't within any object. Type *AligningTy = StructType::get(Type::getInt1Ty(Ty->getContext()), Ty, NULL); - Constant *NullPtr = Constant::getNullValue(AligningTy->getPointerTo()); + Constant *NullPtr = Constant::getNullValue(AligningTy->getPointerTo(0)); Constant *Zero = ConstantInt::get(Type::getInt64Ty(Ty->getContext()), 0); Constant *One = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1); Constant *Indices[2] = { Zero, One }; @@ -1936,8 +1937,8 @@ ConstantExpr::getFCmp(unsigned short pred, Constant *LHS, Constant *RHS) { Constant *ConstantExpr::getExtractElement(Constant *Val, Constant *Idx) { assert(Val->getType()->isVectorTy() && "Tried to create extractelement operation on non-vector type!"); - assert(Idx->getType()->isIntegerTy(32) && - "Extractelement index must be i32 type!"); + assert(Idx->getType()->isIntegerTy() && + "Extractelement index must be an integer type!"); if (Constant *FC = ConstantFoldExtractElementInstruction(Val, Idx)) return FC; // Fold a few common cases. 
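ConstantAggregateZero::get, ConstantPointerNull::get, UndefValue::get and BlockAddress::get above all share one idiom that the nullptr cleanup makes easier to see: take a reference to the map slot, and allocate only when that slot is still null, so each key is materialised exactly once. A minimal sketch of the idiom (the objects are never freed in this tiny example):

#include <cassert>
#include <map>

struct Uniqued { int Key; explicit Uniqued(int K) : Key(K) {} };

// One object per key: the map slot itself is the cache entry.
static Uniqued *getUniqued(std::map<int, Uniqued *> &Cache, int Key) {
  Uniqued *&Entry = Cache[Key]; // reference into the slot, default-null
  if (!Entry)
    Entry = new Uniqued(Key);   // first lookup creates the object
  return Entry;
}

int main() {
  std::map<int, Uniqued *> Cache;
  assert(getUniqued(Cache, 7) == getUniqued(Cache, 7)); // same instance
}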
@@ -1957,7 +1958,7 @@ Constant *ConstantExpr::getInsertElement(Constant *Val, Constant *Elt, "Tried to create insertelement operation on non-vector type!"); assert(Elt->getType() == Val->getType()->getVectorElementType() && "Insertelement types must match!"); - assert(Idx->getType()->isIntegerTy(32) && + assert(Idx->getType()->isIntegerTy() && "Insertelement index must be i32 type!"); if (Constant *FC = ConstantFoldInsertElementInstruction(Val, Elt, Idx)) @@ -2145,7 +2146,7 @@ Constant *ConstantExpr::getBinOpIdentity(unsigned Opcode, Type *Ty) { switch (Opcode) { default: // Doesn't have an identity. - return 0; + return nullptr; case Instruction::Add: case Instruction::Or: @@ -2168,7 +2169,7 @@ Constant *ConstantExpr::getBinOpAbsorber(unsigned Opcode, Type *Ty) { switch (Opcode) { default: // Doesn't have an absorber. - return 0; + return nullptr; case Instruction::Or: return Constant::getAllOnesValue(Ty); @@ -2285,7 +2286,7 @@ Constant *ConstantDataSequential::getImpl(StringRef Elements, Type *Ty) { // of i8, or a 1-element array of i32. They'll both end up in the same /// StringMap bucket, linked up by their Next pointers. Walk the list. ConstantDataSequential **Entry = &Slot.getValue(); - for (ConstantDataSequential *Node = *Entry; Node != 0; + for (ConstantDataSequential *Node = *Entry; Node; Entry = &Node->Next, Node = *Entry) if (Node->getType() == Ty) return Node; @@ -2312,7 +2313,7 @@ void ConstantDataSequential::destroyConstant() { ConstantDataSequential **Entry = &Slot->getValue(); // Remove the entry from the hash table. - if ((*Entry)->Next == 0) { + if (!(*Entry)->Next) { // If there is only one value in the bucket (common case) it must be this // entry, and removing the entry should remove the bucket completely. assert((*Entry) == this && "Hash mismatch in ConstantDataSequential"); @@ -2333,7 +2334,7 @@ void ConstantDataSequential::destroyConstant() { // If we were part of a list, make sure that we don't delete the list that is // still owned by the uniquing map. - Next = 0; + Next = nullptr; // Finally, actually delete it. destroyConstantImpl(); @@ -2561,7 +2562,7 @@ Constant *ConstantDataVector::getSplatValue() const { unsigned EltSize = getElementByteSize(); for (unsigned i = 1, e = getNumElements(); i != e; ++i) if (memcmp(Base, Base+i*EltSize, EltSize)) - return 0; + return nullptr; // If they're all the same, return the 0th one as a representative. return getElementAsConstant(0); @@ -2609,7 +2610,7 @@ void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To, AllSame &= Val == ToC; } - Constant *Replacement = 0; + Constant *Replacement = nullptr; if (AllSame && ToC->isNullValue()) { Replacement = ConstantAggregateZero::get(getType()); } else if (AllSame && isa(ToC)) { @@ -2695,7 +2696,7 @@ void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To, LLVMContextImpl *pImpl = getContext().pImpl; - Constant *Replacement = 0; + Constant *Replacement = nullptr; if (isAllZeros) { Replacement = ConstantAggregateZero::get(getType()); } else if (isAllUndef) { diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h index 59b9d4d..f06509f 100644 --- a/lib/IR/ConstantsContext.h +++ b/lib/IR/ConstantsContext.h @@ -24,6 +24,9 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include +#include + +#define DEBUG_TYPE "ir" namespace llvm { template @@ -584,7 +587,7 @@ public: /// necessary. 
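The ConstantDataSequential comments above describe the uniquing subtlety: two constants with identical raw bytes but different types land in the same StringMap bucket and are chained through their Next pointers, so lookup walks the chain for the node of the right type. A sketch of that walk, with the type reduced to an integer id:

#include <cassert>

struct CDSNode { int TypeId; CDSNode *Next; };

// Scan the per-bucket chain for the node with the requested type.
static CDSNode *findInBucket(CDSNode *Head, int TypeId) {
  for (CDSNode *N = Head; N; N = N->Next)
    if (N->TypeId == TypeId) return N;
  return nullptr; // caller would allocate and link a new node here
}

int main() {
  CDSNode B{2, nullptr}, A{1, &B};
  assert(findInBucket(&A, 2) == &B);
  assert(findInBucket(&A, 3) == nullptr);
}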
ConstantClass *getOrCreate(TypeClass *Ty, ValRefType V) { MapKey Lookup(Ty, V); - ConstantClass* Result = 0; + ConstantClass* Result = nullptr; typename MapTy::iterator I = Map.find(Lookup); // Is it in the map? @@ -720,7 +723,7 @@ public: /// necessary. ConstantClass *getOrCreate(TypeClass *Ty, Operands V) { LookupKey Lookup(Ty, V); - ConstantClass* Result = 0; + ConstantClass* Result = nullptr; typename MapTy::iterator I = Map.find_as(Lookup); // Is it in the map? diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp index f52f466..27ce503 100644 --- a/lib/IR/Core.cpp +++ b/lib/IR/Core.cpp @@ -17,6 +17,8 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" @@ -28,6 +30,7 @@ #include "llvm/PassManager.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Threading.h" @@ -39,6 +42,8 @@ using namespace llvm; +#define DEBUG_TYPE "ir" + void llvm::initializeCore(PassRegistry &Registry) { initializeDominatorTreeWrapperPassPass(Registry); initializePrintModulePassWrapperPass(Registry); @@ -76,6 +81,21 @@ LLVMContextRef LLVMGetGlobalContext() { return wrap(&getGlobalContext()); } +void LLVMContextSetDiagnosticHandler(LLVMContextRef C, + LLVMDiagnosticHandler Handler, + void *DiagnosticContext) { + unwrap(C)->setDiagnosticHandler( + LLVM_EXTENSION reinterpret_cast(Handler), + DiagnosticContext); +} + +void LLVMContextSetYieldCallback(LLVMContextRef C, LLVMYieldCallback Callback, + void *OpaqueHandle) { + auto YieldCallback = + LLVM_EXTENSION reinterpret_cast(Callback); + unwrap(C)->setYieldCallback(YieldCallback, OpaqueHandle); +} + void LLVMContextDispose(LLVMContextRef C) { delete unwrap(C); } @@ -89,6 +109,40 @@ unsigned LLVMGetMDKindID(const char* Name, unsigned SLen) { return LLVMGetMDKindIDInContext(LLVMGetGlobalContext(), Name, SLen); } +char *LLVMGetDiagInfoDescription(LLVMDiagnosticInfoRef DI) { + std::string MsgStorage; + raw_string_ostream Stream(MsgStorage); + DiagnosticPrinterRawOStream DP(Stream); + + unwrap(DI)->print(DP); + Stream.flush(); + + return LLVMCreateMessage(MsgStorage.c_str()); +} + +LLVMDiagnosticSeverity LLVMGetDiagInfoSeverity(LLVMDiagnosticInfoRef DI){ + LLVMDiagnosticSeverity severity; + + switch(unwrap(DI)->getSeverity()) { + default: + severity = LLVMDSError; + break; + case DS_Warning: + severity = LLVMDSWarning; + break; + case DS_Remark: + severity = LLVMDSRemark; + break; + case DS_Note: + severity = LLVMDSNote; + break; + } + + return severity; +} + + + /*===-- Operations on modules ---------------------------------------------===*/ @@ -136,7 +190,7 @@ LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename, return true; } - unwrap(M)->print(dest, NULL); + unwrap(M)->print(dest, nullptr); if (!error.empty()) { *ErrorMessage = strdup(error.c_str()); @@ -150,7 +204,7 @@ char *LLVMPrintModuleToString(LLVMModuleRef M) { std::string buf; raw_string_ostream os(buf); - unwrap(M)->print(os, NULL); + unwrap(M)->print(os, nullptr); os.flush(); return strdup(buf.c_str()); @@ -374,7 +428,7 @@ const char *LLVMGetStructName(LLVMTypeRef Ty) { StructType *Type = unwrap(Ty); if (!Type->hasName()) - return 0; + return nullptr; return Type->getName().data(); } @@ -496,7 +550,8 @@ LLVMValueRef 
LLVMGetMetadata(LLVMValueRef Inst, unsigned KindID) { } void LLVMSetMetadata(LLVMValueRef Inst, unsigned KindID, LLVMValueRef MD) { - unwrap(Inst)->setMetadata(KindID, MD? unwrap(MD) : NULL); + unwrap(Inst)->setMetadata(KindID, + MD ? unwrap(MD) : nullptr); } /*--.. Conversion functions ................................................--*/ @@ -513,7 +568,7 @@ LLVMUseRef LLVMGetFirstUse(LLVMValueRef Val) { Value *V = unwrap(Val); Value::use_iterator I = V->use_begin(); if (I == V->use_end()) - return 0; + return nullptr; return wrap(&*I); } @@ -521,7 +576,7 @@ LLVMUseRef LLVMGetNextUse(LLVMUseRef U) { Use *Next = unwrap(U)->getNext(); if (Next) return wrap(Next); - return 0; + return nullptr; } LLVMValueRef LLVMGetUser(LLVMUseRef U) { @@ -611,7 +666,7 @@ const char *LLVMGetMDString(LLVMValueRef V, unsigned* Len) { return S->getString().data(); } *Len = 0; - return 0; + return nullptr; } unsigned LLVMGetMDNodeNumOperands(LLVMValueRef V) @@ -650,7 +705,7 @@ void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char* name, NamedMDNode *N = unwrap(M)->getOrInsertNamedMetadata(name); if (!N) return; - MDNode *Op = Val ? unwrap(Val) : NULL; + MDNode *Op = Val ? unwrap(Val) : nullptr; if (Op) N->addOperand(Op); } @@ -1235,7 +1290,7 @@ const char *LLVMGetSection(LLVMValueRef Global) { } void LLVMSetSection(LLVMValueRef Global, const char *Section) { - unwrap(Global)->setSection(Section); + unwrap(Global)->setSection(Section); } LLVMVisibility LLVMGetVisibility(LLVMValueRef Global) { @@ -1285,7 +1340,7 @@ unsigned LLVMGetAlignment(LLVMValueRef V) { void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) { Value *P = unwrap(V); - if (GlobalValue *GV = dyn_cast(P)) + if (GlobalObject *GV = dyn_cast(P)) GV->setAlignment(Bytes); else if (AllocaInst *AI = dyn_cast(P)) AI->setAlignment(Bytes); @@ -1302,15 +1357,16 @@ void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) { LLVMValueRef LLVMAddGlobal(LLVMModuleRef M, LLVMTypeRef Ty, const char *Name) { return wrap(new GlobalVariable(*unwrap(M), unwrap(Ty), false, - GlobalValue::ExternalLinkage, 0, Name)); + GlobalValue::ExternalLinkage, nullptr, Name)); } LLVMValueRef LLVMAddGlobalInAddressSpace(LLVMModuleRef M, LLVMTypeRef Ty, const char *Name, unsigned AddressSpace) { return wrap(new GlobalVariable(*unwrap(M), unwrap(Ty), false, - GlobalValue::ExternalLinkage, 0, Name, 0, - GlobalVariable::NotThreadLocal, AddressSpace)); + GlobalValue::ExternalLinkage, nullptr, Name, + nullptr, GlobalVariable::NotThreadLocal, + AddressSpace)); } LLVMValueRef LLVMGetNamedGlobal(LLVMModuleRef M, const char *Name) { @@ -1321,7 +1377,7 @@ LLVMValueRef LLVMGetFirstGlobal(LLVMModuleRef M) { Module *Mod = unwrap(M); Module::global_iterator I = Mod->global_begin(); if (I == Mod->global_end()) - return 0; + return nullptr; return wrap(I); } @@ -1329,7 +1385,7 @@ LLVMValueRef LLVMGetLastGlobal(LLVMModuleRef M) { Module *Mod = unwrap(M); Module::global_iterator I = Mod->global_end(); if (I == Mod->global_begin()) - return 0; + return nullptr; return wrap(--I); } @@ -1337,7 +1393,7 @@ LLVMValueRef LLVMGetNextGlobal(LLVMValueRef GlobalVar) { GlobalVariable *GV = unwrap(GlobalVar); Module::global_iterator I = GV; if (++I == GV->getParent()->global_end()) - return 0; + return nullptr; return wrap(I); } @@ -1345,7 +1401,7 @@ LLVMValueRef LLVMGetPreviousGlobal(LLVMValueRef GlobalVar) { GlobalVariable *GV = unwrap(GlobalVar); Module::global_iterator I = GV; if (I == GV->getParent()->global_begin()) - return 0; + return nullptr; return wrap(--I); } @@ -1356,7 +1412,7 @@ void 
LLVMDeleteGlobal(LLVMValueRef GlobalVar) { LLVMValueRef LLVMGetInitializer(LLVMValueRef GlobalVar) { GlobalVariable* GV = unwrap(GlobalVar); if ( !GV->hasInitializer() ) - return 0; + return nullptr; return wrap(GV->getInitializer()); } @@ -1432,8 +1488,10 @@ void LLVMSetExternallyInitialized(LLVMValueRef GlobalVar, LLVMBool IsExtInit) { LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee, const char *Name) { - return wrap(new GlobalAlias(unwrap(Ty), GlobalValue::ExternalLinkage, Name, - unwrap(Aliasee), unwrap (M))); + auto *PTy = cast(unwrap(Ty)); + return wrap(GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), + GlobalValue::ExternalLinkage, Name, + unwrap(Aliasee), unwrap(M))); } /*--.. Operations on functions .............................................--*/ @@ -1452,7 +1510,7 @@ LLVMValueRef LLVMGetFirstFunction(LLVMModuleRef M) { Module *Mod = unwrap(M); Module::iterator I = Mod->begin(); if (I == Mod->end()) - return 0; + return nullptr; return wrap(I); } @@ -1460,7 +1518,7 @@ LLVMValueRef LLVMGetLastFunction(LLVMModuleRef M) { Module *Mod = unwrap(M); Module::iterator I = Mod->end(); if (I == Mod->begin()) - return 0; + return nullptr; return wrap(--I); } @@ -1468,7 +1526,7 @@ LLVMValueRef LLVMGetNextFunction(LLVMValueRef Fn) { Function *Func = unwrap(Fn); Module::iterator I = Func; if (++I == Func->getParent()->end()) - return 0; + return nullptr; return wrap(I); } @@ -1476,7 +1534,7 @@ LLVMValueRef LLVMGetPreviousFunction(LLVMValueRef Fn) { Function *Func = unwrap(Fn); Module::iterator I = Func; if (I == Func->getParent()->begin()) - return 0; + return nullptr; return wrap(--I); } @@ -1501,7 +1559,7 @@ void LLVMSetFunctionCallConv(LLVMValueRef Fn, unsigned CC) { const char *LLVMGetGC(LLVMValueRef Fn) { Function *F = unwrap(Fn); - return F->hasGC()? F->getGC() : 0; + return F->hasGC()? 
F->getGC() : nullptr; } void LLVMSetGC(LLVMValueRef Fn, const char *GC) { @@ -1582,7 +1640,7 @@ LLVMValueRef LLVMGetFirstParam(LLVMValueRef Fn) { Function *Func = unwrap(Fn); Function::arg_iterator I = Func->arg_begin(); if (I == Func->arg_end()) - return 0; + return nullptr; return wrap(I); } @@ -1590,7 +1648,7 @@ LLVMValueRef LLVMGetLastParam(LLVMValueRef Fn) { Function *Func = unwrap(Fn); Function::arg_iterator I = Func->arg_end(); if (I == Func->arg_begin()) - return 0; + return nullptr; return wrap(--I); } @@ -1598,7 +1656,7 @@ LLVMValueRef LLVMGetNextParam(LLVMValueRef Arg) { Argument *A = unwrap(Arg); Function::arg_iterator I = A; if (++I == A->getParent()->arg_end()) - return 0; + return nullptr; return wrap(I); } @@ -1606,7 +1664,7 @@ LLVMValueRef LLVMGetPreviousParam(LLVMValueRef Arg) { Argument *A = unwrap(Arg); Function::arg_iterator I = A; if (I == A->getParent()->arg_begin()) - return 0; + return nullptr; return wrap(--I); } @@ -1676,7 +1734,7 @@ LLVMBasicBlockRef LLVMGetFirstBasicBlock(LLVMValueRef Fn) { Function *Func = unwrap(Fn); Function::iterator I = Func->begin(); if (I == Func->end()) - return 0; + return nullptr; return wrap(I); } @@ -1684,7 +1742,7 @@ LLVMBasicBlockRef LLVMGetLastBasicBlock(LLVMValueRef Fn) { Function *Func = unwrap(Fn); Function::iterator I = Func->end(); if (I == Func->begin()) - return 0; + return nullptr; return wrap(--I); } @@ -1692,7 +1750,7 @@ LLVMBasicBlockRef LLVMGetNextBasicBlock(LLVMBasicBlockRef BB) { BasicBlock *Block = unwrap(BB); Function::iterator I = Block; if (++I == Block->getParent()->end()) - return 0; + return nullptr; return wrap(I); } @@ -1700,7 +1758,7 @@ LLVMBasicBlockRef LLVMGetPreviousBasicBlock(LLVMBasicBlockRef BB) { BasicBlock *Block = unwrap(BB); Function::iterator I = Block; if (I == Block->getParent()->begin()) - return 0; + return nullptr; return wrap(--I); } @@ -1752,7 +1810,7 @@ LLVMValueRef LLVMGetFirstInstruction(LLVMBasicBlockRef BB) { BasicBlock *Block = unwrap(BB); BasicBlock::iterator I = Block->begin(); if (I == Block->end()) - return 0; + return nullptr; return wrap(I); } @@ -1760,7 +1818,7 @@ LLVMValueRef LLVMGetLastInstruction(LLVMBasicBlockRef BB) { BasicBlock *Block = unwrap(BB); BasicBlock::iterator I = Block->end(); if (I == Block->begin()) - return 0; + return nullptr; return wrap(--I); } @@ -1768,7 +1826,7 @@ LLVMValueRef LLVMGetNextInstruction(LLVMValueRef Inst) { Instruction *Instr = unwrap(Inst); BasicBlock::iterator I = Instr; if (++I == Instr->getParent()->end()) - return 0; + return nullptr; return wrap(I); } @@ -1776,7 +1834,7 @@ LLVMValueRef LLVMGetPreviousInstruction(LLVMValueRef Inst) { Instruction *Instr = unwrap(Inst); BasicBlock::iterator I = Instr; if (I == Instr->getParent()->begin()) - return 0; + return nullptr; return wrap(--I); } @@ -1939,7 +1997,7 @@ void LLVMDisposeBuilder(LLVMBuilderRef Builder) { /*--.. Metadata builders ...................................................--*/ void LLVMSetCurrentDebugLocation(LLVMBuilderRef Builder, LLVMValueRef L) { - MDNode *Loc = L ? unwrap(L) : NULL; + MDNode *Loc = L ? 
unwrap<MDNode>(L) : nullptr; unwrap(Builder)->SetCurrentDebugLocation(DebugLoc::getFromDILocation(Loc)); } @@ -2195,7 +2253,7 @@ LLVMValueRef LLVMBuildMalloc(LLVMBuilderRef B, LLVMTypeRef Ty, AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy); Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(), ITy, unwrap(Ty), AllocSize, - 0, 0, ""); + nullptr, nullptr, ""); return wrap(unwrap(B)->Insert(Malloc, Twine(Name))); } @@ -2206,13 +2264,13 @@ LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty, AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy); Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(), ITy, unwrap(Ty), AllocSize, - unwrap(Val), 0, ""); + unwrap(Val), nullptr, ""); return wrap(unwrap(B)->Insert(Malloc, Twine(Name))); } LLVMValueRef LLVMBuildAlloca(LLVMBuilderRef B, LLVMTypeRef Ty, const char *Name) { - return wrap(unwrap(B)->CreateAlloca(unwrap(Ty), 0, Name)); + return wrap(unwrap(B)->CreateAlloca(unwrap(Ty), nullptr, Name)); } LLVMValueRef LLVMBuildArrayAlloca(LLVMBuilderRef B, LLVMTypeRef Ty, diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp index 1ea381a..92edacc 100644 --- a/lib/IR/DIBuilder.cpp +++ b/lib/IR/DIBuilder.cpp @@ -30,8 +30,9 @@ static Constant *GetTagConstant(LLVMContext &VMContext, unsigned Tag) { } DIBuilder::DIBuilder(Module &m) - : M(m), VMContext(M.getContext()), TempEnumTypes(0), TempRetainTypes(0), - TempSubprograms(0), TempGVs(0), DeclareFn(0), ValueFn(0) {} + : M(m), VMContext(M.getContext()), TempEnumTypes(nullptr), + TempRetainTypes(nullptr), TempSubprograms(nullptr), TempGVs(nullptr), + DeclareFn(nullptr), ValueFn(nullptr) {} /// finalize - Construct any deferred debug info descriptors. void DIBuilder::finalize() { @@ -80,7 +81,7 @@ void DIBuilder::finalize() { /// N.
static MDNode *getNonCompileUnitScope(MDNode *N) { if (DIDescriptor(N).isCompileUnit()) - return NULL; + return nullptr; return N; } @@ -103,7 +104,7 @@ DICompileUnit DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename, StringRef SplitName, DebugEmissionKind Kind) { - assert(((Lang <= dwarf::DW_LANG_Python && Lang >= dwarf::DW_LANG_C89) || + assert(((Lang <= dwarf::DW_LANG_OCaml && Lang >= dwarf::DW_LANG_C89) || (Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) && "Invalid Language tag"); assert(!Filename.empty() && @@ -146,13 +147,13 @@ DICompileUnit DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename, } static DIImportedEntity -createImportedModule(LLVMContext &C, DIScope Context, DIDescriptor NS, - unsigned Line, StringRef Name, - SmallVectorImpl<TrackingVH<MDNode> > &AllImportedModules) { +createImportedModule(LLVMContext &C, dwarf::Tag Tag, DIScope Context, + Value *NS, unsigned Line, StringRef Name, + SmallVectorImpl<TrackingVH<MDNode>> &AllImportedModules) { const MDNode *R; if (Name.empty()) { Value *Elts[] = { - GetTagConstant(C, dwarf::DW_TAG_imported_module), + GetTagConstant(C, Tag), Context, NS, ConstantInt::get(Type::getInt32Ty(C), Line), @@ -160,7 +161,7 @@ createImportedModule(LLVMContext &C, DIScope Context, DIDescriptor NS, R = MDNode::get(C, Elts); } else { Value *Elts[] = { - GetTagConstant(C, dwarf::DW_TAG_imported_module), + GetTagConstant(C, Tag), Context, NS, ConstantInt::get(Type::getInt32Ty(C), Line), @@ -175,33 +176,32 @@ createImportedModule(LLVMContext &C, DIScope Context, DIDescriptor NS, } DIImportedEntity DIBuilder::createImportedModule(DIScope Context, - DINameSpace NS, unsigned Line, - StringRef Name) { - return ::createImportedModule(VMContext, Context, NS, Line, Name, - AllImportedModules); + DINameSpace NS, + unsigned Line) { + return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module, + Context, NS, Line, StringRef(), AllImportedModules); } DIImportedEntity DIBuilder::createImportedModule(DIScope Context, DIImportedEntity NS, - unsigned Line, - StringRef Name) { - return ::createImportedModule(VMContext, Context, NS, Line, Name, - AllImportedModules); + unsigned Line) { + return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module, + Context, NS, Line, StringRef(), AllImportedModules); } DIImportedEntity DIBuilder::createImportedDeclaration(DIScope Context, DIScope Decl, - unsigned Line) { - Value *Elts[] = { - GetTagConstant(VMContext, dwarf::DW_TAG_imported_declaration), - Context, - Decl.getRef(), - ConstantInt::get(Type::getInt32Ty(VMContext), Line), - }; - DIImportedEntity M(MDNode::get(VMContext, Elts)); - assert(M.Verify() && "Imported module should be valid"); - AllImportedModules.push_back(TrackingVH<MDNode>(M)); - return M; + unsigned Line, StringRef Name) { + return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_declaration, + Context, Decl.getRef(), Line, Name, + AllImportedModules); +} + +DIImportedEntity DIBuilder::createImportedDeclaration(DIScope Context, + DIImportedEntity Imp, + unsigned Line, StringRef Name) { + return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_declaration, + Context, Imp, Line, Name, AllImportedModules); } /// createFile - Create a file descriptor to hold debugging information @@ -232,8 +232,8 @@ DIBasicType DIBuilder::createUnspecifiedType(StringRef Name) { // size, alignment, offset and flags are always empty here.
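// Usage sketch for the reworked imported-entity API above (hypothetical
// frontend code; DIB, CUScope, XNamespace, FDecl and LineNo are assumed to
// exist in the caller). Namespace aliases and using-declarations now funnel
// through the same static helper with an explicit DWARF tag, and the entity
// name travels through createImportedDeclaration rather than a separate
// overload:
//   DIImportedEntity Alias =
//       DIB.createImportedDeclaration(CUScope, XNamespace, LineNo, "Y");
//   DIImportedEntity Use =
//       DIB.createImportedDeclaration(CUScope, FDecl, LineNo, StringRef());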
Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_unspecified_type), - NULL, // Filename - NULL, // Unused + nullptr, // Filename + nullptr, // Unused MDString::get(VMContext, Name), ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size @@ -260,8 +260,8 @@ DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits, // offset and flags are always empty here. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_base_type), - NULL, // File/directory name - NULL, // Unused + nullptr, // File/directory name + nullptr, // Unused MDString::get(VMContext, Name), ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits), @@ -279,8 +279,8 @@ DIDerivedType DIBuilder::createQualifiedType(unsigned Tag, DIType FromTy) { // Qualified types are encoded in DIDerivedType format. Value *Elts[] = { GetTagConstant(VMContext, Tag), - NULL, // Filename - NULL, // Unused + nullptr, // Filename + nullptr, // Unused MDString::get(VMContext, StringRef()), // Empty name. ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size @@ -299,8 +299,8 @@ DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits, // Pointer types are encoded in DIDerivedType format. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_pointer_type), - NULL, // Filename - NULL, // Unused + nullptr, // Filename + nullptr, // Unused MDString::get(VMContext, Name), ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits), @@ -317,9 +317,9 @@ DIDerivedType DIBuilder::createMemberPointerType(DIType PointeeTy, // Pointer types are encoded in DIDerivedType format. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_ptr_to_member_type), - NULL, // Filename - NULL, // Unused - NULL, + nullptr, // Filename + nullptr, // Unused + nullptr, ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align @@ -338,9 +338,9 @@ DIDerivedType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) { // References are encoded in DIDerivedType format. Value *Elts[] = { GetTagConstant(VMContext, Tag), - NULL, // Filename - NULL, // TheCU, - NULL, // Name + nullptr, // Filename + nullptr, // TheCU, + nullptr, // Name ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align @@ -355,7 +355,6 @@ DIDerivedType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) { DIDerivedType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File, unsigned LineNo, DIDescriptor Context) { // typedefs are encoded in DIDerivedType format. 
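// For orientation: the Elts arrays built above and below follow the common
// DIDerivedType operand layout, roughly [Tag, File, Context, Name, Line,
// SizeInBits, AlignInBits, OffsetInBits, Flags, base TypeRef]; this rebase
// only swaps NULL for nullptr in the unused slots.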
- assert(Ty.isType() && "Invalid typedef type!"); Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_typedef), File.getFileNode(), @@ -378,9 +377,9 @@ DIDerivedType DIBuilder::createFriend(DIType Ty, DIType FriendTy) { assert(FriendTy.isType() && "Invalid friend type!"); Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_friend), - NULL, + nullptr, Ty.getRef(), - NULL, // Name + nullptr, // Name ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align @@ -400,9 +399,9 @@ DIDerivedType DIBuilder::createInheritance(DIType Ty, DIType BaseTy, // TAG_inheritance is encoded in DIDerivedType format. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_inheritance), - NULL, + nullptr, Ty.getRef(), - NULL, // Name + nullptr, // Name ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align @@ -631,7 +630,8 @@ DICompositeType DIBuilder::createClassType(DIDescriptor Context, StringRef Name, ConstantInt::get(Type::getInt32Ty(VMContext), 0), VTableHolder.getRef(), TemplateParams, - UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier) + UniqueIdentifier.empty() ? nullptr + : MDString::get(VMContext, UniqueIdentifier) }; DICompositeType R(MDNode::get(VMContext, Elts)); assert(R.isCompositeType() && @@ -667,8 +667,9 @@ DICompositeType DIBuilder::createStructType(DIDescriptor Context, Elements, ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang), VTableHolder.getRef(), - NULL, - UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier) + nullptr, + UniqueIdentifier.empty() ? nullptr + : MDString::get(VMContext, UniqueIdentifier) }; DICompositeType R(MDNode::get(VMContext, Elts)); assert(R.isCompositeType() && @@ -697,12 +698,13 @@ DICompositeType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name, ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits), ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset ConstantInt::get(Type::getInt32Ty(VMContext), Flags), - NULL, + nullptr, Elements, ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang), - NULL, - NULL, - UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier) + nullptr, + nullptr, + UniqueIdentifier.empty() ? 
nullptr + : MDString::get(VMContext, UniqueIdentifier) }; DICompositeType R(MDNode::get(VMContext, Elts)); if (!UniqueIdentifier.empty()) @@ -718,19 +720,19 @@ DICompositeType DIBuilder::createSubroutineType(DIFile File, Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_subroutine_type), Constant::getNullValue(Type::getInt32Ty(VMContext)), - NULL, + nullptr, MDString::get(VMContext, ""), ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset ConstantInt::get(Type::getInt32Ty(VMContext), Flags), // Flags - NULL, + nullptr, ParameterTypes, ConstantInt::get(Type::getInt32Ty(VMContext), 0), - NULL, - NULL, - NULL // Type Identifer + nullptr, + nullptr, + nullptr // Type Identifer }; return DICompositeType(MDNode::get(VMContext, Elts)); } @@ -755,9 +757,10 @@ DICompositeType DIBuilder::createEnumerationType( UnderlyingType.getRef(), Elements, ConstantInt::get(Type::getInt32Ty(VMContext), 0), - NULL, - NULL, - UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier) + nullptr, + nullptr, + UniqueIdentifier.empty() ? nullptr + : MDString::get(VMContext, UniqueIdentifier) }; DICompositeType CTy(MDNode::get(VMContext, Elts)); AllEnumTypes.push_back(CTy); @@ -772,8 +775,8 @@ DICompositeType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits, // TAG_array_type is encoded in DICompositeType format. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_array_type), - NULL, // Filename/Directory, - NULL, // Unused + nullptr, // Filename/Directory, + nullptr, // Unused MDString::get(VMContext, ""), ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line ConstantInt::get(Type::getInt64Ty(VMContext), Size), @@ -783,9 +786,9 @@ DICompositeType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits, Ty.getRef(), Subscripts, ConstantInt::get(Type::getInt32Ty(VMContext), 0), - NULL, - NULL, - NULL // Type Identifer + nullptr, + nullptr, + nullptr // Type Identifer }; return DICompositeType(MDNode::get(VMContext, Elts)); } @@ -796,8 +799,8 @@ DICompositeType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits, // A vector is an array type with the FlagVector flag applied. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_array_type), - NULL, // Filename/Directory, - NULL, // Unused + nullptr, // Filename/Directory, + nullptr, // Unused MDString::get(VMContext, ""), ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line ConstantInt::get(Type::getInt64Ty(VMContext), Size), @@ -807,9 +810,9 @@ DICompositeType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits, Ty.getRef(), Subscripts, ConstantInt::get(Type::getInt32Ty(VMContext), 0), - NULL, - NULL, - NULL // Type Identifer + nullptr, + nullptr, + nullptr // Type Identifer }; return DICompositeType(MDNode::get(VMContext, Elts)); } @@ -890,12 +893,47 @@ DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIDescriptor Scope, ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits), ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset ConstantInt::get(Type::getInt32Ty(VMContext), DIDescriptor::FlagFwdDecl), - NULL, + nullptr, + DIArray(), + ConstantInt::get(Type::getInt32Ty(VMContext), RuntimeLang), + nullptr, + nullptr, //TemplateParams + UniqueIdentifier.empty() ? 
nullptr + : MDString::get(VMContext, UniqueIdentifier) + }; + MDNode *Node = MDNode::get(VMContext, Elts); + DICompositeType RetTy(Node); + assert(RetTy.isCompositeType() && + "createForwardDecl result should be a DIType"); + if (!UniqueIdentifier.empty()) + retainType(RetTy); + return RetTy; +} + +/// createForwardDecl - Create a temporary forward-declared type that +/// can be RAUW'd if the full type is seen. +DICompositeType DIBuilder::createReplaceableForwardDecl( + unsigned Tag, StringRef Name, DIDescriptor Scope, DIFile F, unsigned Line, + unsigned RuntimeLang, uint64_t SizeInBits, uint64_t AlignInBits, + StringRef UniqueIdentifier) { + // Create a temporary MDNode. + Value *Elts[] = { + GetTagConstant(VMContext, Tag), + F.getFileNode(), + DIScope(getNonCompileUnitScope(Scope)).getRef(), + MDString::get(VMContext, Name), + ConstantInt::get(Type::getInt32Ty(VMContext), Line), + ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits), + ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits), + ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset + ConstantInt::get(Type::getInt32Ty(VMContext), DIDescriptor::FlagFwdDecl), + nullptr, DIArray(), ConstantInt::get(Type::getInt32Ty(VMContext), RuntimeLang), - NULL, - NULL, //TemplateParams - UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier) + nullptr, + nullptr, //TemplateParams + UniqueIdentifier.empty() ? nullptr + : MDString::get(VMContext, UniqueIdentifier) }; MDNode *Node = MDNode::getTemporary(VMContext, Elts); DICompositeType RetTy(Node); @@ -932,7 +970,7 @@ DIGlobalVariable DIBuilder::createGlobalVariable(StringRef Name, Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_variable), Constant::getNullValue(Type::getInt32Ty(VMContext)), - NULL, // TheCU, + nullptr, // TheCU, MDString::get(VMContext, Name), MDString::get(VMContext, Name), MDString::get(VMContext, LinkageName), @@ -1087,7 +1125,7 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context, StringRef Name, ConstantInt::get(Type::getInt1Ty(VMContext), isDefinition), ConstantInt::get(Type::getInt32Ty(VMContext), 0), ConstantInt::get(Type::getInt32Ty(VMContext), 0), - NULL, + nullptr, ConstantInt::get(Type::getInt32Ty(VMContext), Flags), ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized), Fn, @@ -1121,7 +1159,6 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context, StringRef Name, assert(getNonCompileUnitScope(Context) && "Methods should have both a Context and a context that isn't " "the compile unit."); - Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) }; Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_subprogram), F.getFileNode(), @@ -1141,7 +1178,7 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context, StringRef Name, Fn, TParam, Constant::getNullValue(Type::getInt32Ty(VMContext)), - MDNode::getTemporary(VMContext, TElts), + nullptr, // FIXME: Do we want to use different scope/lines? ConstantInt::get(Type::getInt32Ty(VMContext), LineNo) }; @@ -1189,6 +1226,13 @@ DILexicalBlockFile DIBuilder::createLexicalBlockFile(DIDescriptor Scope, DILexicalBlock DIBuilder::createLexicalBlock(DIDescriptor Scope, DIFile File, unsigned Line, unsigned Col, unsigned Discriminator) { + // FIXME: This isn't thread safe nor the right way to defeat MDNode uniquing. + // I believe the right way is to have a self-referential element in the node. 
+ // Also: why do we bother with line/column - they're not used and the + // documentation (SourceLevelDebugging.rst) claims the line/col are necessary + // for uniquing, yet then we have this other solution (because line/col were + // inadequate) anyway. Remove all 3 and replace them with a self-reference. + // Defeat MDNode uniquing for lexical blocks by using unique id. static unsigned int unique_id = 0; Value *Elts[] = { diff --git a/lib/IR/DataLayout.cpp b/lib/IR/DataLayout.cpp index 6c18387..dea05fb 100644 --- a/lib/IR/DataLayout.cpp +++ b/lib/IR/DataLayout.cpp @@ -178,7 +178,7 @@ static const LayoutAlignElem DefaultAlignments[] = { void DataLayout::reset(StringRef Desc) { clear(); - LayoutMap = 0; + LayoutMap = nullptr; LittleEndian = false; StackNaturalAlign = 0; ManglingMode = MM_None; @@ -344,7 +344,7 @@ void DataLayout::parseSpecifier(StringRef Desc) { } } -DataLayout::DataLayout(const Module *M) : LayoutMap(0) { +DataLayout::DataLayout(const Module *M) : LayoutMap(nullptr) { const DataLayout *Other = M->getDataLayout(); if (Other) *this = *Other; @@ -357,7 +357,7 @@ bool DataLayout::operator==(const DataLayout &Other) const { StackNaturalAlign == Other.StackNaturalAlign && ManglingMode == Other.ManglingMode && LegalIntWidths == Other.LegalIntWidths && - Alignments == Other.Alignments && Pointers == Pointers; + Alignments == Other.Alignments && Pointers == Other.Pointers; assert(Ret == (getStringRepresentation() == Other.getStringRepresentation())); return Ret; } @@ -488,7 +488,7 @@ void DataLayout::clear() { Alignments.clear(); Pointers.clear(); delete static_cast<StructLayoutMap *>(LayoutMap); - LayoutMap = 0; + LayoutMap = nullptr; } DataLayout::~DataLayout() { @@ -687,7 +687,7 @@ unsigned DataLayout::getABITypeAlignment(Type *Ty) const { /// getABIIntegerTypeAlignment - Return the minimum ABI-required alignment for /// an integer type of the specified bitwidth.
unsigned DataLayout::getABIIntegerTypeAlignment(unsigned BitWidth) const { - return getAlignmentInfo(INTEGER_ALIGN, BitWidth, true, 0); + return getAlignmentInfo(INTEGER_ALIGN, BitWidth, true, nullptr); } unsigned DataLayout::getPrefTypeAlignment(Type *Ty) const { @@ -708,7 +708,7 @@ IntegerType *DataLayout::getIntPtrType(LLVMContext &C, Type *DataLayout::getIntPtrType(Type *Ty) const { assert(Ty->isPtrOrPtrVectorTy() && "Expected a pointer or pointer vector type."); - unsigned NumBits = getTypeSizeInBits(Ty->getScalarType()); + unsigned NumBits = getPointerTypeSizeInBits(Ty); IntegerType *IntTy = IntegerType::get(Ty->getContext(), NumBits); if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) return VectorType::get(IntTy, VecTy->getNumElements()); @@ -719,7 +719,7 @@ Type *DataLayout::getSmallestLegalIntType(LLVMContext &C, unsigned Width) const for (unsigned LegalIntWidth : LegalIntWidths) if (Width <= LegalIntWidth) return Type::getIntNTy(C, LegalIntWidth); - return 0; + return nullptr; } unsigned DataLayout::getLargestLegalIntTypeSize() const { diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp index c9d68af..db9e56d 100644 --- a/lib/IR/DebugInfo.cpp +++ b/lib/IR/DebugInfo.cpp @@ -53,8 +53,8 @@ bool DIDescriptor::Verify() const { } static Value *getField(const MDNode *DbgNode, unsigned Elt) { - if (DbgNode == 0 || Elt >= DbgNode->getNumOperands()) - return 0; + if (!DbgNode || Elt >= DbgNode->getNumOperands()) + return nullptr; return DbgNode->getOperand(Elt); } @@ -73,7 +73,7 @@ StringRef DIDescriptor::getStringField(unsigned Elt) const { } uint64_t DIDescriptor::getUInt64Field(unsigned Elt) const { - if (DbgNode == 0) + if (!DbgNode) return 0; if (Elt < DbgNode->getNumOperands()) @@ -85,7 +85,7 @@ uint64_t DIDescriptor::getUInt64Field(unsigned Elt) const { } int64_t DIDescriptor::getInt64Field(unsigned Elt) const { - if (DbgNode == 0) + if (!DbgNode) return 0; if (Elt < DbgNode->getNumOperands()) @@ -102,34 +102,34 @@ DIDescriptor DIDescriptor::getDescriptorField(unsigned Elt) const { } GlobalVariable *DIDescriptor::getGlobalVariableField(unsigned Elt) const { - if (DbgNode == 0) - return 0; + if (!DbgNode) + return nullptr; if (Elt < DbgNode->getNumOperands()) return dyn_cast_or_null<GlobalVariable>(DbgNode->getOperand(Elt)); - return 0; + return nullptr; } Constant *DIDescriptor::getConstantField(unsigned Elt) const { - if (DbgNode == 0) - return 0; + if (!DbgNode) + return nullptr; if (Elt < DbgNode->getNumOperands()) return dyn_cast_or_null<Constant>(DbgNode->getOperand(Elt)); - return 0; + return nullptr; } Function *DIDescriptor::getFunctionField(unsigned Elt) const { - if (DbgNode == 0) - return 0; + if (!DbgNode) + return nullptr; if (Elt < DbgNode->getNumOperands()) return dyn_cast_or_null<Function>(DbgNode->getOperand(Elt)); - return 0; + return nullptr; } void DIDescriptor::replaceFunctionField(unsigned Elt, Function *F) { - if (DbgNode == 0) + if (!DbgNode) return; if (Elt < DbgNode->getNumOperands()) { @@ -335,7 +335,7 @@ unsigned DIArray::getNumElements() const { /// replaceAllUsesWith - Replace all uses of the MDNode used by this /// type with the one in the passed descriptor. -void DIType::replaceAllUsesWith(DIDescriptor &D) { +void DIType::replaceAllUsesWith(LLVMContext &VMContext, DIDescriptor D) { assert(DbgNode && "Trying to replace an unverified type!"); @@ -344,13 +344,19 @@ void DIType::replaceAllUsesWith(DIDescriptor &D) { // which, due to uniquing, has merged with the source. We shield clients from // this detail by allowing a value to be replaced with replaceAllUsesWith() // itself.
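// In outline, the replacement code that follows does this: if a client
// self-replaces a uniqued node, first rebuild an equivalent, non-uniqued
// copy of its operands with MDNode::get so that replaceAllUsesWith never
// sees source == destination; the old code simply skipped the replacement
// in that case.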
- if (DbgNode != D) { - MDNode *Node = const_cast<MDNode *>(DbgNode); - const MDNode *DN = D; - const Value *V = cast_or_null<Value>(DN); - Node->replaceAllUsesWith(const_cast<Value *>(V)); - MDNode::deleteTemporary(Node); + const MDNode *DN = D; + if (DbgNode == DN) { + SmallVector<Value *, 10> Ops(DbgNode->getNumOperands()); + for (size_t i = 0; i != Ops.size(); ++i) + Ops[i] = DbgNode->getOperand(i); + DN = MDNode::get(VMContext, Ops); } + + MDNode *Node = const_cast<MDNode *>(DbgNode); + const Value *V = cast_or_null<Value>(DN); + Node->replaceAllUsesWith(const_cast<Value *>(V)); + MDNode::deleteTemporary(Node); + DbgNode = D; } /// replaceAllUsesWith - Replace all uses of the MDNode used by this @@ -358,19 +364,12 @@ void DIType::replaceAllUsesWith(DIDescriptor &D) { void DIType::replaceAllUsesWith(MDNode *D) { assert(DbgNode && "Trying to replace an unverified type!"); - - // Since we use a TrackingVH for the node, its easy for clients to manufacture - // legitimate situations where they want to replaceAllUsesWith() on something - // which, due to uniquing, has merged with the source. We shield clients from - // this detail by allowing a value to be replaced with replaceAllUsesWith() - // itself. - if (DbgNode != D) { - MDNode *Node = const_cast<MDNode *>(DbgNode); - const MDNode *DN = D; - const Value *V = cast_or_null<Value>(DN); - Node->replaceAllUsesWith(const_cast<Value *>(V)); - MDNode::deleteTemporary(Node); - } + assert(DbgNode != D && "This replacement should always happen"); + MDNode *Node = const_cast<MDNode *>(DbgNode); + const MDNode *DN = D; + const Value *V = cast_or_null<Value>(DN); + Node->replaceAllUsesWith(const_cast<Value *>(V)); + MDNode::deleteTemporary(Node); } /// Verify - Verify that a compile unit is well formed. @@ -759,7 +758,7 @@ DIScopeRef DIScope::getContext() const { return DIScopeRef(DINameSpace(DbgNode).getContext()); assert((isFile() || isCompileUnit()) && "Unhandled type of scope."); - return DIScopeRef(NULL); + return DIScopeRef(nullptr); } // If the scope node has a name, return that, else return an empty string. diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp index 1a2521e..43360d3 100644 --- a/lib/IR/DebugLoc.cpp +++ b/lib/IR/DebugLoc.cpp @@ -18,7 +18,7 @@ using namespace llvm; //===----------------------------------------------------------------------===// MDNode *DebugLoc::getScope(const LLVMContext &Ctx) const { - if (ScopeIdx == 0) return 0; + if (ScopeIdx == 0) return nullptr; if (ScopeIdx > 0) { // Positive ScopeIdx is an index into ScopeRecords, which has no inlined-at @@ -37,7 +37,7 @@ MDNode *DebugLoc::getScope(const LLVMContext &Ctx) const { MDNode *DebugLoc::getInlinedAt(const LLVMContext &Ctx) const { // Positive ScopeIdx is an index into ScopeRecords, which has no inlined-at // position specified. Zero is invalid. - if (ScopeIdx >= 0) return 0; + if (ScopeIdx >= 0) return nullptr; // Otherwise, the index is in the ScopeInlinedAtRecords array.
assert(unsigned(-ScopeIdx) <= Ctx.pImpl->ScopeInlinedAtRecords.size() && @@ -49,7 +49,7 @@ MDNode *DebugLoc::getInlinedAt(const LLVMContext &Ctx) const { void DebugLoc::getScopeAndInlinedAt(MDNode *&Scope, MDNode *&IA, const LLVMContext &Ctx) const { if (ScopeIdx == 0) { - Scope = IA = 0; + Scope = IA = nullptr; return; } @@ -59,7 +59,7 @@ void DebugLoc::getScopeAndInlinedAt(MDNode *&Scope, MDNode *&IA, assert(unsigned(ScopeIdx) <= Ctx.pImpl->ScopeRecords.size() && "Invalid ScopeIdx!"); Scope = Ctx.pImpl->ScopeRecords[ScopeIdx-1].get(); - IA = 0; + IA = nullptr; return; } @@ -96,8 +96,8 @@ DebugLoc DebugLoc::get(unsigned Line, unsigned Col, DebugLoc Result; // If no scope is available, this is an unknown location. - if (Scope == 0) return Result; - + if (!Scope) return Result; + // Saturate line and col to "unknown". if (Col > 255) Col = 0; if (Line >= (1 << 24)) Line = 0; @@ -106,7 +106,7 @@ DebugLoc DebugLoc::get(unsigned Line, unsigned Col, LLVMContext &Ctx = Scope->getContext(); // If there is no inlined-at location, use the ScopeRecords array. - if (InlinedAt == 0) + if (!InlinedAt) Result.ScopeIdx = Ctx.pImpl->getOrAddScopeRecordIdxEntry(Scope, 0); else Result.ScopeIdx = Ctx.pImpl->getOrAddScopeInlinedAtIdxEntry(Scope, @@ -118,7 +118,7 @@ DebugLoc DebugLoc::get(unsigned Line, unsigned Col, /// getAsMDNode - This method converts the compressed DebugLoc node into a /// DILocation-compatible MDNode. MDNode *DebugLoc::getAsMDNode(const LLVMContext &Ctx) const { - if (isUnknown()) return 0; + if (isUnknown()) return nullptr; MDNode *Scope, *IA; getScopeAndInlinedAt(Scope, IA, Ctx); @@ -137,7 +137,7 @@ MDNode *DebugLoc::getAsMDNode(const LLVMContext &Ctx) const { DebugLoc DebugLoc::getFromDILocation(MDNode *N) { DILocation Loc(N); MDNode *Scope = Loc.getScope(); - if (Scope == 0) return DebugLoc(); + if (!Scope) return DebugLoc(); return get(Loc.getLineNumber(), Loc.getColumnNumber(), Scope, Loc.getOrigLocation()); } @@ -146,8 +146,9 @@ DebugLoc DebugLoc::getFromDILocation(MDNode *N) { DebugLoc DebugLoc::getFromDILexicalBlock(MDNode *N) { DILexicalBlock LexBlock(N); MDNode *Scope = LexBlock.getContext(); - if (Scope == 0) return DebugLoc(); - return get(LexBlock.getLineNumber(), LexBlock.getColumnNumber(), Scope, NULL); + if (!Scope) return DebugLoc(); + return get(LexBlock.getLineNumber(), LexBlock.getColumnNumber(), Scope, + nullptr); } void DebugLoc::dump(const LLVMContext &Ctx) const { @@ -166,6 +167,28 @@ void DebugLoc::dump(const LLVMContext &Ctx) const { #endif } +void DebugLoc::print(const LLVMContext &Ctx, raw_ostream &OS) const { + if (!isUnknown()) { + // Print source line info. + DIScope Scope(getScope(Ctx)); + assert((!Scope || Scope.isScope()) && + "Scope of a DebugLoc should be null or a DIScope."); + if (Scope) + OS << Scope.getFilename(); + else + OS << "<unknown>"; + OS << ':' << getLine(); + if (getCol() != 0) + OS << ':' << getCol(); + DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(getInlinedAt(Ctx)); + if (!InlinedAtDL.isUnknown()) { + OS << " @[ "; + InlinedAtDL.print(Ctx, OS); + OS << " ]"; + } + } +} + //===----------------------------------------------------------------------===// // DenseMap specialization //===----------------------------------------------------------------------===// @@ -234,7 +257,7 @@ void DebugRecVH::deleted() { // If this is a non-canonical reference, just drop the value to null, we know // it doesn't have a map entry.
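// Usage sketch for the DebugLoc::print() API added above (hypothetical
// caller; DL and Ctx are assumed to exist):
//   std::string Buf;
//   raw_string_ostream OS(Buf);
//   DL.print(Ctx, OS);  // yields e.g. "foo.c:12:3 @[ bar.h:40:1 ]"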
if (Idx == 0) { - setValPtr(0); + setValPtr(nullptr); return; } @@ -245,7 +268,7 @@ void DebugRecVH::deleted() { assert(Ctx->ScopeRecordIdx[Cur] == Idx && "Mapping out of date!"); Ctx->ScopeRecordIdx.erase(Cur); // Reset this VH to null and we're done. - setValPtr(0); + setValPtr(nullptr); Idx = 0; return; } @@ -259,7 +282,7 @@ void DebugRecVH::deleted() { MDNode *OldScope = Entry.first.get(); MDNode *OldInlinedAt = Entry.second.get(); - assert(OldScope != 0 && OldInlinedAt != 0 && + assert(OldScope && OldInlinedAt && "Entry should be non-canonical if either val dropped to null"); // Otherwise, we do have an entry in it, nuke it and we're done. @@ -269,7 +292,7 @@ void DebugRecVH::deleted() { // Reset this VH to null. Drop both 'Idx' values to null to indicate that // we're in non-canonical form now. - setValPtr(0); + setValPtr(nullptr); Entry.first.Idx = Entry.second.Idx = 0; } @@ -277,8 +300,8 @@ void DebugRecVH::allUsesReplacedWith(Value *NewVa) { // If being replaced with a non-mdnode value (e.g. undef) handle this as if // the mdnode got deleted. MDNode *NewVal = dyn_cast<MDNode>(NewVa); - if (NewVal == 0) return deleted(); - + if (!NewVal) return deleted(); + // If this is a non-canonical reference, just change it, we know it already // doesn't have a map entry. if (Idx == 0) { @@ -313,7 +336,7 @@ void DebugRecVH::allUsesReplacedWith(Value *NewVa) { MDNode *OldScope = Entry.first.get(); MDNode *OldInlinedAt = Entry.second.get(); - assert(OldScope != 0 && OldInlinedAt != 0 && + assert(OldScope && OldInlinedAt && "Entry should be non-canonical if either val dropped to null"); // Otherwise, we do have an entry in it, nuke it and we're done. diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp index d59d4cf..6eeb162 100644 --- a/lib/IR/DiagnosticInfo.cpp +++ b/lib/IR/DiagnosticInfo.cpp @@ -12,18 +12,80 @@ // Diagnostics reporting is still done as part of the LLVMContext. //===----------------------------------------------------------------------===// +#include "LLVMContextImpl.h" #include "llvm/ADT/Twine.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Atomic.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Regex.h" #include <string> using namespace llvm; +namespace { + +/// \brief Regular expression corresponding to the value given in one of the +/// -pass-remarks* command line flags. Passes whose name matches this regexp +/// will emit a diagnostic when calling the associated diagnostic function +/// (emitOptimizationRemark, emitOptimizationRemarkMissed or +/// emitOptimizationRemarkAnalysis). +struct PassRemarksOpt { + std::shared_ptr<Regex> Pattern; + + void operator=(const std::string &Val) { + // Create a regexp object to match pass names for emitOptimizationRemark.
+ if (!Val.empty()) { + Pattern = std::make_shared<Regex>(Val); + std::string RegexError; + if (!Pattern->isValid(RegexError)) + report_fatal_error("Invalid regular expression '" + Val + + "' in -pass-remarks: " + RegexError, + false); + } + }; +}; + +static PassRemarksOpt PassRemarksOptLoc; +static PassRemarksOpt PassRemarksMissedOptLoc; +static PassRemarksOpt PassRemarksAnalysisOptLoc; + +// -pass-remarks +// Command line flag to enable emitOptimizationRemark() +static cl::opt<PassRemarksOpt, true, cl::parser<std::string>> +PassRemarks("pass-remarks", cl::value_desc("pattern"), + cl::desc("Enable optimization remarks from passes whose name match " + "the given regular expression"), + cl::Hidden, cl::location(PassRemarksOptLoc), cl::ValueRequired, + cl::ZeroOrMore); + +// -pass-remarks-missed +// Command line flag to enable emitOptimizationRemarkMissed() +static cl::opt<PassRemarksOpt, true, cl::parser<std::string>> PassRemarksMissed( + "pass-remarks-missed", cl::value_desc("pattern"), + cl::desc("Enable missed optimization remarks from passes whose name match " + "the given regular expression"), + cl::Hidden, cl::location(PassRemarksMissedOptLoc), cl::ValueRequired, + cl::ZeroOrMore); + +// -pass-remarks-analysis +// Command line flag to enable emitOptimizationRemarkAnalysis() +static cl::opt<PassRemarksOpt, true, cl::parser<std::string>> +PassRemarksAnalysis( + "pass-remarks-analysis", cl::value_desc("pattern"), + cl::desc( + "Enable optimization analysis remarks from passes whose name match " + "the given regular expression"), + cl::Hidden, cl::location(PassRemarksAnalysisOptLoc), cl::ValueRequired, + cl::ZeroOrMore); +} int llvm::getNextAvailablePluginDiagnosticKind() { static sys::cas_flag PluginKindID = DK_FirstPluginKind; return (int)sys::AtomicIncrement(&PluginKindID); } @@ -64,3 +126,66 @@ void DiagnosticInfoSampleProfile::print(DiagnosticPrinter &DP) const { DP << getFileName() << ": "; DP << getMsg(); } + +bool DiagnosticInfoOptimizationRemarkBase::isLocationAvailable() const { + return getFunction().getParent()->getNamedMetadata("llvm.dbg.cu") != nullptr; +} + +void DiagnosticInfoOptimizationRemarkBase::getLocation(StringRef *Filename, + unsigned *Line, + unsigned *Column) const { + DILocation DIL(getDebugLoc().getAsMDNode(getFunction().getContext())); + *Filename = DIL.getFilename(); + *Line = DIL.getLineNumber(); + *Column = DIL.getColumnNumber(); +} + +const std::string DiagnosticInfoOptimizationRemarkBase::getLocationStr() const { + StringRef Filename("<unknown>"); + unsigned Line = 0; + unsigned Column = 0; + if (isLocationAvailable()) + getLocation(&Filename, &Line, &Column); + return Twine(Filename + ":" + Twine(Line) + ":" + Twine(Column)).str(); +} + +void DiagnosticInfoOptimizationRemarkBase::print(DiagnosticPrinter &DP) const { + DP << getLocationStr() << ": " << getMsg(); +} + +bool DiagnosticInfoOptimizationRemark::isEnabled() const { + return PassRemarksOptLoc.Pattern && + PassRemarksOptLoc.Pattern->match(getPassName()); +} + +bool DiagnosticInfoOptimizationRemarkMissed::isEnabled() const { + return PassRemarksMissedOptLoc.Pattern && + PassRemarksMissedOptLoc.Pattern->match(getPassName()); +} + +bool DiagnosticInfoOptimizationRemarkAnalysis::isEnabled() const { + return PassRemarksAnalysisOptLoc.Pattern && + PassRemarksAnalysisOptLoc.Pattern->match(getPassName()); +} + +void llvm::emitOptimizationRemark(LLVMContext &Ctx, const char *PassName, + const Function &Fn, const DebugLoc &DLoc, + const Twine &Msg) { + Ctx.diagnose(DiagnosticInfoOptimizationRemark(PassName, Fn, DLoc, Msg)); +} + +void llvm::emitOptimizationRemarkMissed(LLVMContext &Ctx, const char *PassName, + const Function &Fn, + const DebugLoc &DLoc, +
const Twine &Msg) { + Ctx.diagnose(DiagnosticInfoOptimizationRemarkMissed(PassName, Fn, DLoc, Msg)); +} + +void llvm::emitOptimizationRemarkAnalysis(LLVMContext &Ctx, + const char *PassName, + const Function &Fn, + const DebugLoc &DLoc, + const Twine &Msg) { + Ctx.diagnose( + DiagnosticInfoOptimizationRemarkAnalysis(PassName, Fn, DLoc, Msg)); +} diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp index c2ea0e1..fe32c46 100644 --- a/lib/IR/Function.cpp +++ b/lib/IR/Function.cpp @@ -44,7 +44,7 @@ void Argument::anchor() { } Argument::Argument(Type *Ty, const Twine &Name, Function *Par) : Value(Ty, Value::ArgumentVal) { - Parent = 0; + Parent = nullptr; // Make sure that we get added to a function LeakDetector::addGarbageObject(this); @@ -76,6 +76,14 @@ unsigned Argument::getArgNo() const { return ArgIdx; } +/// hasNonNullAttr - Return true if this argument has the nonnull attribute on +/// it in its containing function. +bool Argument::hasNonNullAttr() const { + if (!getType()->isPointerTy()) return false; + return getParent()->getAttributes(). + hasAttribute(getArgNo()+1, Attribute::NonNull); +} + /// hasByValAttr - Return true if this argument has the byval attribute on it /// in its containing function. bool Argument::hasByValAttr() const { @@ -209,8 +217,8 @@ void Function::eraseFromParent() { Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name, Module *ParentModule) - : GlobalValue(PointerType::getUnqual(Ty), - Value::FunctionVal, 0, 0, Linkage, name) { + : GlobalObject(PointerType::getUnqual(Ty), + Value::FunctionVal, nullptr, 0, Linkage, name) { assert(FunctionType::isValidReturnType(getReturnType()) && "invalid return type"); SymTab = new ValueSymbolTable(); @@ -293,7 +301,7 @@ void Function::dropAllReferences() { BasicBlocks.begin()->eraseFromParent(); // Prefix data is stored in a side table. - setPrefixData(0); + setPrefixData(nullptr); } void Function::addAttribute(unsigned i, Attribute::AttrKind attr) { @@ -348,10 +356,10 @@ void Function::clearGC() { GCNames->erase(this); if (GCNames->empty()) { delete GCNames; - GCNames = 0; + GCNames = nullptr; if (GCNamePool->empty()) { delete GCNamePool; - GCNamePool = 0; + GCNamePool = nullptr; } } } @@ -361,7 +369,7 @@ void Function::clearGC() { /// create a Function) from the Function Src to this one. void Function::copyAttributesFrom(const GlobalValue *Src) { assert(isa<Function>(Src) && "Expected a Function!"); - GlobalValue::copyAttributesFrom(Src); + GlobalObject::copyAttributesFrom(Src); const Function *SrcF = cast<Function>(Src); setCallingConv(SrcF->getCallingConv()); setAttributes(SrcF->getAttributes()); @@ -372,7 +380,7 @@ void Function::copyAttributesFrom(const GlobalValue *Src) { if (SrcF->hasPrefixData()) setPrefixData(SrcF->getPrefixData()); else - setPrefixData(0); + setPrefixData(nullptr); } /// getIntrinsicID - This method returns the ID number of the specified diff --git a/lib/IR/GCOV.cpp b/lib/IR/GCOV.cpp index f69bdc4..f2099d6 100644 --- a/lib/IR/GCOV.cpp +++ b/lib/IR/GCOV.cpp @@ -26,11 +26,6 @@ using namespace llvm; //===----------------------------------------------------------------------===// // GCOVFile implementation. -/// ~GCOVFile - Delete GCOVFile and its content. -GCOVFile::~GCOVFile() { - DeleteContainerPointers(Functions); -} - /// readGCNO - Read GCNO buffer.
bool GCOVFile::readGCNO(GCOVBuffer &Buffer) { if (!Buffer.readGCNOFormat()) return false; @@ -39,10 +34,10 @@ bool GCOVFile::readGCNO(GCOVBuffer &Buffer) { if (!Buffer.readInt(Checksum)) return false; while (true) { if (!Buffer.readFunctionTag()) break; - GCOVFunction *GFun = new GCOVFunction(*this); + auto GFun = make_unique<GCOVFunction>(*this); if (!GFun->readGCNO(Buffer, Version)) return false; - Functions.push_back(GFun); + Functions.push_back(std::move(GFun)); } GCNOInitialized = true; @@ -97,17 +92,15 @@ bool GCOVFile::readGCDA(GCOVBuffer &Buffer) { /// dump - Dump GCOVFile content to dbgs() for debugging purposes. void GCOVFile::dump() const { - for (SmallVectorImpl<GCOVFunction *>::const_iterator I = Functions.begin(), - E = Functions.end(); I != E; ++I) - (*I)->dump(); + for (const auto &FPtr : Functions) + FPtr->dump(); } /// collectLineCounts - Collect line counts. This must be used after /// reading .gcno and .gcda files. void GCOVFile::collectLineCounts(FileInfo &FI) { - for (SmallVectorImpl<GCOVFunction *>::iterator I = Functions.begin(), - E = Functions.end(); I != E; ++I) - (*I)->collectLineCounts(FI); + for (const auto &FPtr : Functions) + FPtr->collectLineCounts(FI); FI.setRunCount(RunCount); FI.setProgramCount(ProgramCount); } @@ -115,12 +108,6 @@ void GCOVFile::collectLineCounts(FileInfo &FI) { //===----------------------------------------------------------------------===// // GCOVFunction implementation. -/// ~GCOVFunction - Delete GCOVFunction and its content. -GCOVFunction::~GCOVFunction() { - DeleteContainerPointers(Blocks); - DeleteContainerPointers(Edges); -} - /// readGCNO - Read a function from the GCNO buffer. Return false if an error /// occurs. bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) { @@ -150,7 +137,7 @@ bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) { if (!Buff.readInt(BlockCount)) return false; for (uint32_t i = 0, e = BlockCount; i != e; ++i) { if (!Buff.readInt(Dummy)) return false; // Block flags; - Blocks.push_back(new GCOVBlock(*this, i)); + Blocks.push_back(make_unique<GCOVBlock>(*this, i)); } // read edges. @@ -168,8 +155,8 @@ bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) { for (uint32_t i = 0, e = EdgeCount; i != e; ++i) { uint32_t Dst; if (!Buff.readInt(Dst)) return false; - GCOVEdge *Edge = new GCOVEdge(Blocks[BlockNo], Blocks[Dst]); - Edges.push_back(Edge); + Edges.push_back(make_unique<GCOVEdge>(*Blocks[BlockNo], *Blocks[Dst])); + GCOVEdge *Edge = Edges.back().get(); Blocks[BlockNo]->addDstEdge(Edge); Blocks[Dst]->addSrcEdge(Edge); if (!Buff.readInt(Dummy)) return false; // Edge flag @@ -179,34 +166,46 @@ bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) { // read line table. while (Buff.readLineTag()) { uint32_t LineTableLength; + // Read the length of this line table. if (!Buff.readInt(LineTableLength)) return false; uint32_t EndPos = Buff.getCursor() + LineTableLength*4; uint32_t BlockNo; + // Read the block number this table is associated with. if (!Buff.readInt(BlockNo)) return false; if (BlockNo >= BlockCount) { errs() << "Unexpected block number: " << BlockNo << " (in " << Name << ").\n"; return false; } - GCOVBlock *Block = Blocks[BlockNo]; - if (!Buff.readInt(Dummy)) return false; // flag - while (Buff.getCursor() != (EndPos - 4)) { + GCOVBlock &Block = *Blocks[BlockNo]; + // Read the word that pads the beginning of the line table. This may be a + // flag of some sort, but seems to always be zero.
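// As this reader interprets it, each GCNO line-table record is a sequence of
// 32-bit words: [length] [block no] [pad/flag] [source filename string]
// [line]* [0 terminator] [trailing flag/pad]; a line value of zero marks
// compiler-injected instructions and is skipped below.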
+ if (!Buff.readInt(Dummy)) return false; + + // Line information starts here and continues up until the last word. + if (Buff.getCursor() != (EndPos - sizeof(uint32_t))) { StringRef F; + // Read the source file name. if (!Buff.readString(F)) return false; if (Filename != F) { errs() << "Multiple sources for a single basic block: " << Filename << " != " << F << " (in " << Name << ").\n"; return false; } - if (Buff.getCursor() == (EndPos - 4)) break; - while (true) { + // Read lines up to, but not including, the null terminator. + while (Buff.getCursor() < (EndPos - 2 * sizeof(uint32_t))) { uint32_t Line; if (!Buff.readInt(Line)) return false; - if (!Line) break; - Block->addLine(Line); + // Line 0 means this instruction was injected by the compiler. Skip it. + if (!Line) continue; + Block.addLine(Line); } + // Read the null terminator. + if (!Buff.readInt(Dummy)) return false; } - if (!Buff.readInt(Dummy)) return false; // flag + // The last word is either a flag or padding, it isn't clear which. Skip + // over it. + if (!Buff.readInt(Dummy)) return false; } return true; } @@ -300,9 +299,8 @@ uint64_t GCOVFunction::getExitCount() const { /// dump - Dump GCOVFunction content to dbgs() for debugging purposes. void GCOVFunction::dump() const { dbgs() << "===== " << Name << " @ " << Filename << ":" << LineNumber << "\n"; - for (SmallVectorImpl<GCOVBlock *>::const_iterator I = Blocks.begin(), - E = Blocks.end(); I != E; ++I) - (*I)->dump(); + for (const auto &Block : Blocks) + Block->dump(); } /// collectLineCounts - Collect line counts. This must be used after @@ -313,9 +311,8 @@ void GCOVFunction::collectLineCounts(FileInfo &FI) { if (LineNumber == 0) return; - for (SmallVectorImpl<GCOVBlock *>::iterator I = Blocks.begin(), - E = Blocks.end(); I != E; ++I) - (*I)->collectLineCounts(FI); + for (const auto &Block : Blocks) + Block->collectLineCounts(FI); FI.addFunctionLine(Filename, LineNumber, this); } @@ -335,8 +332,8 @@ void GCOVBlock::addCount(size_t DstEdgeNo, uint64_t N) { assert(DstEdgeNo < DstEdges.size()); // up to caller to ensure EdgeNo is valid DstEdges[DstEdgeNo]->Count = N; Counter += N; - if (!DstEdges[DstEdgeNo]->Dst->getNumDstEdges()) - DstEdges[DstEdgeNo]->Dst->Counter += N; + if (!DstEdges[DstEdgeNo]->Dst.getNumDstEdges()) + DstEdges[DstEdgeNo]->Dst.Counter += N; } /// sortDstEdges - Sort destination edges by block number, nop if already @@ -363,7 +360,7 @@ void GCOVBlock::dump() const { dbgs() << "\tSource Edges : "; for (EdgeIterator I = SrcEdges.begin(), E = SrcEdges.end(); I != E; ++I) { const GCOVEdge *Edge = *I; - dbgs() << Edge->Src->Number << " (" << Edge->Count << "), "; + dbgs() << Edge->Src.Number << " (" << Edge->Count << "), "; } dbgs() << "\n"; } @@ -371,7 +368,7 @@ void GCOVBlock::dump() const { dbgs() << "\tDestination Edges : "; for (EdgeIterator I = DstEdges.begin(), E = DstEdges.end(); I != E; ++I) { const GCOVEdge *Edge = *I; - dbgs() << Edge->Dst->Number << " (" << Edge->Count << "), "; + dbgs() << Edge->Dst.Number << " (" << Edge->Count << "), "; } dbgs() << "\n"; } @@ -435,11 +432,35 @@ static raw_ostream &operator<<(raw_ostream &OS, const formatBranchInfo &FBI) { return OS; } +namespace { +class LineConsumer { + std::unique_ptr<MemoryBuffer> Buffer; + StringRef Remaining; +public: + LineConsumer(StringRef Filename) { + if (error_code EC = MemoryBuffer::getFileOrSTDIN(Filename, Buffer)) { + errs() << Filename << ": " << EC.message() << "\n"; + Remaining = ""; + } else + Remaining = Buffer->getBuffer(); + } + bool empty() { return Remaining.empty(); } + void printNext(raw_ostream &OS, uint32_t
LineNum) { + StringRef Line; + if (empty()) + Line = "/*EOF*/"; + else + std::tie(Line, Remaining) = Remaining.split("\n"); + OS << format("%5u:", LineNum) << Line << "\n"; + } +}; +} + /// Convert a path to a gcov filename. If PreservePaths is true, this /// translates "/" to "#", ".." to "^", and drops ".", to match gcov. static std::string mangleCoveragePath(StringRef Filename, bool PreservePaths) { if (!PreservePaths) - return (sys::path::filename(Filename) + ".gcov").str(); + return sys::path::filename(Filename).str(); // This behaviour is defined by gcov in terms of text replacements, so it's // not likely to do anything useful on filesystems with different textual @@ -467,28 +488,52 @@ static std::string mangleCoveragePath(StringRef Filename, bool PreservePaths) { if (S < I) Result.append(S, I); - Result.append(".gcov"); return Result.str(); } +std::string FileInfo::getCoveragePath(StringRef Filename, + StringRef MainFilename) { + if (Options.NoOutput) + // This is probably a bug in gcov, but when -n is specified, paths aren't + // mangled at all, and the -l and -p options are ignored. Here, we do the + // same. + return Filename; + + std::string CoveragePath; + if (Options.LongFileNames && !Filename.equals(MainFilename)) + CoveragePath = + mangleCoveragePath(MainFilename, Options.PreservePaths) + "##"; + CoveragePath += + mangleCoveragePath(Filename, Options.PreservePaths) + ".gcov"; + return CoveragePath; +} + +std::unique_ptr<raw_ostream> +FileInfo::openCoveragePath(StringRef CoveragePath) { + if (Options.NoOutput) + return llvm::make_unique<raw_null_ostream>(); + + std::string ErrorInfo; + auto OS = llvm::make_unique<raw_fd_ostream>(CoveragePath.str().c_str(), + ErrorInfo, sys::fs::F_Text); + if (!ErrorInfo.empty()) { + errs() << ErrorInfo << "\n"; + return llvm::make_unique<raw_null_ostream>(); + } + return std::move(OS); +} + /// print - Print source files with collected line count information. -void FileInfo::print(StringRef GCNOFile, StringRef GCDAFile) { +void FileInfo::print(StringRef MainFilename, StringRef GCNOFile, + StringRef GCDAFile) { for (StringMap<LineData>::const_iterator I = LineInfo.begin(), E = LineInfo.end(); I != E; ++I) { StringRef Filename = I->first(); - std::unique_ptr<MemoryBuffer> Buff; - if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, Buff)) { - errs() << Filename << ": " << ec.message() << "\n"; - return; - } - StringRef AllLines = Buff->getBuffer(); + auto AllLines = LineConsumer(Filename); - std::string CoveragePath = mangleCoveragePath(Filename, - Options.PreservePaths); - std::string ErrorInfo; - raw_fd_ostream OS(CoveragePath.c_str(), ErrorInfo, sys::fs::F_Text); - if (!ErrorInfo.empty()) - errs() << ErrorInfo << "\n"; + std::string CoveragePath = getCoveragePath(Filename, MainFilename); + std::unique_ptr<raw_ostream> S = openCoveragePath(CoveragePath); + raw_ostream &OS = *S; OS << " -: 0:Source:" << Filename << "\n"; OS << " -: 0:Graph:" << GCNOFile << "\n"; @@ -498,7 +543,8 @@ void FileInfo::print(StringRef GCNOFile, StringRef GCDAFile) { const LineData &Line = I->second; GCOVCoverage FileCoverage(Filename); - for (uint32_t LineIndex = 0; !AllLines.empty(); ++LineIndex) { + for (uint32_t LineIndex = 0; + LineIndex < Line.LastLine || !AllLines.empty(); ++LineIndex) { if (Options.BranchInfo) { FunctionLines::const_iterator FuncsIt = Line.Functions.find(LineIndex); if (FuncsIt != Line.Functions.end()) @@ -509,9 +555,7 @@ if (BlocksIt == Line.Blocks.end()) { // No basic blocks are on this line. Not an executable line of code.
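// Worked example of the naming scheme above (paths hypothetical): for main
// file "a/b/main.cpp" including "a/b/x.h", with PreservePaths and
// LongFileNames set, getCoveragePath() yields "a#b#main.cpp##a#b#x.h.gcov";
// with NoOutput (-n) the name is returned unmangled and openCoveragePath()
// hands back a raw_null_ostream so nothing is written.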
OS << " -:"; - std::pair P = AllLines.split('\n'); - OS << format("%5u:", LineIndex+1) << P.first << "\n"; - AllLines = P.second; + AllLines.printNext(OS, LineIndex + 1); } else { const BlockVector &Blocks = BlocksIt->second; @@ -573,9 +617,7 @@ void FileInfo::print(StringRef GCNOFile, StringRef GCDAFile) { } ++FileCoverage.LogicalLines; - std::pair P = AllLines.split('\n'); - OS << format("%5u:", LineIndex+1) << P.first << "\n"; - AllLines = P.second; + AllLines.printNext(OS, LineIndex + 1); uint32_t BlockNo = 0; uint32_t EdgeNo = 0; @@ -605,10 +647,11 @@ void FileInfo::print(StringRef GCNOFile, StringRef GCDAFile) { if (Options.FuncCoverage) printFuncCoverage(); printFileCoverage(); + return; } /// printFunctionSummary - Print function and block summary. -void FileInfo::printFunctionSummary(raw_fd_ostream &OS, +void FileInfo::printFunctionSummary(raw_ostream &OS, const FunctionVector &Funcs) const { for (FunctionVector::const_iterator I = Funcs.begin(), E = Funcs.end(); I != E; ++I) { @@ -617,8 +660,8 @@ void FileInfo::printFunctionSummary(raw_fd_ostream &OS, uint32_t BlocksExec = 0; for (GCOVFunction::BlockIterator I = Func->block_begin(), E = Func->block_end(); I != E; ++I) { - const GCOVBlock *Block = *I; - if (Block->getNumDstEdges() && Block->getCount()) + const GCOVBlock &Block = **I; + if (Block.getNumDstEdges() && Block.getCount()) ++BlocksExec; } @@ -630,7 +673,7 @@ void FileInfo::printFunctionSummary(raw_fd_ostream &OS, } /// printBlockInfo - Output counts for each block. -void FileInfo::printBlockInfo(raw_fd_ostream &OS, const GCOVBlock &Block, +void FileInfo::printBlockInfo(raw_ostream &OS, const GCOVBlock &Block, uint32_t LineIndex, uint32_t &BlockNo) const { if (Block.getCount() == 0) OS << " $$$$$:"; @@ -640,7 +683,7 @@ void FileInfo::printBlockInfo(raw_fd_ostream &OS, const GCOVBlock &Block, } /// printBranchInfo - Print conditional branch probabilities. -void FileInfo::printBranchInfo(raw_fd_ostream &OS, const GCOVBlock &Block, +void FileInfo::printBranchInfo(raw_ostream &OS, const GCOVBlock &Block, GCOVCoverage &Coverage, uint32_t &EdgeNo) { SmallVector BranchCounts; uint64_t TotalCounts = 0; @@ -670,7 +713,7 @@ void FileInfo::printBranchInfo(raw_fd_ostream &OS, const GCOVBlock &Block, } /// printUncondBranchInfo - Print unconditional branch probabilities. -void FileInfo::printUncondBranchInfo(raw_fd_ostream &OS, uint32_t &EdgeNo, +void FileInfo::printUncondBranchInfo(raw_ostream &OS, uint32_t &EdgeNo, uint64_t Count) const { OS << format("unconditional %2u ", EdgeNo++) << formatBranchInfo(Options, Count, Count) << "\n"; @@ -716,6 +759,8 @@ void FileInfo::printFileCoverage() const { const GCOVCoverage &Coverage = I->second; outs() << "File '" << Coverage.Name << "'\n"; printCoverage(Coverage); - outs() << Coverage.Name << ":creating '" << Filename << "'\n\n"; + if (!Options.NoOutput) + outs() << Coverage.Name << ":creating '" << Filename << "'\n"; + outs() << "\n"; } } diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp index f338dd7..c905cfe 100644 --- a/lib/IR/Globals.cpp +++ b/lib/IR/Globals.cpp @@ -53,23 +53,41 @@ void GlobalValue::destroyConstant() { /// copyAttributesFrom - copy all additional attributes (those not needed to /// create a GlobalValue) from the GlobalValue Src to this one. 
void GlobalValue::copyAttributesFrom(const GlobalValue *Src) { - setAlignment(Src->getAlignment()); - setSection(Src->getSection()); setVisibility(Src->getVisibility()); setUnnamedAddr(Src->hasUnnamedAddr()); setDLLStorageClass(Src->getDLLStorageClass()); } -void GlobalValue::setAlignment(unsigned Align) { - assert((!isa<GlobalAlias>(this) || !Align) && - "GlobalAlias should not have an alignment!"); +unsigned GlobalValue::getAlignment() const { + if (auto *GA = dyn_cast<GlobalAlias>(this)) + return GA->getAliasee()->getAlignment(); + + return cast<GlobalObject>(this)->getAlignment(); +} + +void GlobalObject::setAlignment(unsigned Align) { assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!"); assert(Align <= MaximumAlignment && "Alignment is greater than MaximumAlignment!"); - Alignment = Log2_32(Align) + 1; + setGlobalValueSubClassData(Log2_32(Align) + 1); assert(getAlignment() == Align && "Alignment representation error!"); } +void GlobalObject::copyAttributesFrom(const GlobalValue *Src) { + const auto *GV = cast<GlobalObject>(Src); + GlobalValue::copyAttributesFrom(GV); + setAlignment(GV->getAlignment()); + setSection(GV->getSection()); +} + +const std::string &GlobalValue::getSection() const { + if (auto *GA = dyn_cast<GlobalAlias>(this)) + return GA->getAliasee()->getSection(); + return cast<GlobalObject>(this)->getSection(); +} + +void GlobalObject::setSection(StringRef S) { Section = S; } + bool GlobalValue::isDeclaration() const { // Globals are definitions if they have an initializer. if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(this)) @@ -83,22 +101,20 @@ bool GlobalValue::isDeclaration() const { assert(isa<Function>(this)); return false; } - +//===----------------------------------------------------------------------===// // GlobalVariable Implementation //===----------------------------------------------------------------------===// GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link, - Constant *InitVal, - const Twine &Name, ThreadLocalMode TLMode, - unsigned AddressSpace, + Constant *InitVal, const Twine &Name, + ThreadLocalMode TLMode, unsigned AddressSpace, bool isExternallyInitialized) - : GlobalValue(PointerType::get(Ty, AddressSpace), - Value::GlobalVariableVal, - OperandTraits<GlobalVariable>::op_begin(this), - InitVal != 0, Link, Name), - isConstantGlobal(constant), threadLocalMode(TLMode), - isExternallyInitializedConstant(isExternallyInitialized) { + : GlobalObject(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal, + OperandTraits<GlobalVariable>::op_begin(this), + InitVal != nullptr, Link, Name), + isConstantGlobal(constant), threadLocalMode(TLMode), + isExternallyInitializedConstant(isExternallyInitialized) { if (InitVal) { assert(InitVal->getType() == Ty && "Initializer should be the same type as the GlobalVariable!"); @@ -110,24 +126,22 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link, GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant, LinkageTypes Link, Constant *InitVal, - const Twine &Name, - GlobalVariable *Before, ThreadLocalMode TLMode, - unsigned AddressSpace, + const Twine &Name, GlobalVariable *Before, + ThreadLocalMode TLMode, unsigned AddressSpace, bool isExternallyInitialized) - : GlobalValue(PointerType::get(Ty, AddressSpace), - Value::GlobalVariableVal, - OperandTraits<GlobalVariable>::op_begin(this), - InitVal != 0, Link, Name), - isConstantGlobal(constant), threadLocalMode(TLMode), - isExternallyInitializedConstant(isExternallyInitialized) { + : GlobalObject(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal, + OperandTraits<GlobalVariable>::op_begin(this), + InitVal != nullptr, Link, Name), +
isConstantGlobal(constant), threadLocalMode(TLMode), + isExternallyInitializedConstant(isExternallyInitialized) { if (InitVal) { assert(InitVal->getType() == Ty && "Initializer should be the same type as the GlobalVariable!"); Op<0>() = InitVal; } - + LeakDetector::addGarbageObject(this); - + if (Before) Before->getParent()->getGlobalList().insert(Before, this); else @@ -171,9 +185,9 @@ void GlobalVariable::replaceUsesOfWithOnConstant(Value *From, Value *To, } void GlobalVariable::setInitializer(Constant *InitVal) { - if (InitVal == 0) { + if (!InitVal) { if (hasInitializer()) { - Op<0>().set(0); + Op<0>().set(nullptr); NumOperands = 0; } } else { @@ -189,7 +203,7 @@ void GlobalVariable::setInitializer(Constant *InitVal) { /// create a GlobalVariable) from the GlobalVariable Src to this one. void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) { assert(isa<GlobalVariable>(Src) && "Expected a GlobalVariable!"); - GlobalValue::copyAttributesFrom(Src); + GlobalObject::copyAttributesFrom(Src); const GlobalVariable *SrcVar = cast<GlobalVariable>(Src); setThreadLocalMode(SrcVar->getThreadLocalMode()); } @@ -199,20 +213,47 @@ void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) { // GlobalAlias Implementation //===----------------------------------------------------------------------===// -GlobalAlias::GlobalAlias(Type *Ty, LinkageTypes Link, - const Twine &Name, Constant* aliasee, +GlobalAlias::GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Link, + const Twine &Name, GlobalObject *Aliasee, Module *ParentModule) - : GlobalValue(Ty, Value::GlobalAliasVal, &Op<0>(), 1, Link, Name) { + : GlobalValue(PointerType::get(Ty, AddressSpace), Value::GlobalAliasVal, + &Op<0>(), 1, Link, Name) { LeakDetector::addGarbageObject(this); - - if (aliasee) - assert(aliasee->getType() == Ty && "Alias and aliasee types should match!"); - Op<0>() = aliasee; + Op<0>() = Aliasee; if (ParentModule) ParentModule->getAliasList().push_back(this); } +GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace, + LinkageTypes Link, const Twine &Name, + GlobalObject *Aliasee, Module *ParentModule) { + return new GlobalAlias(Ty, AddressSpace, Link, Name, Aliasee, ParentModule); +} + +GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace, + LinkageTypes Linkage, const Twine &Name, + Module *Parent) { + return create(Ty, AddressSpace, Linkage, Name, nullptr, Parent); +} + +GlobalAlias *GlobalAlias::create(Type *Ty, unsigned AddressSpace, + LinkageTypes Linkage, const Twine &Name, + GlobalObject *Aliasee) { + return create(Ty, AddressSpace, Linkage, Name, Aliasee, Aliasee->getParent()); +} + +GlobalAlias *GlobalAlias::create(LinkageTypes Link, const Twine &Name, + GlobalObject *Aliasee) { + PointerType *PTy = Aliasee->getType(); + return create(PTy->getElementType(), PTy->getAddressSpace(), Link, Name, + Aliasee); +} + +GlobalAlias *GlobalAlias::create(const Twine &Name, GlobalObject *Aliasee) { + return create(Aliasee->getLinkage(), Name, Aliasee); +} + void GlobalAlias::setParent(Module *parent) { if (getParent()) LeakDetector::addGarbageObject(this); @@ -229,42 +270,4 @@ void GlobalAlias::eraseFromParent() { getParent()->getAliasList().erase(this); } -void GlobalAlias::setAliasee(Constant *Aliasee) { - assert((!Aliasee || Aliasee->getType() == getType()) && - "Alias and aliasee types should match!"); - - setOperand(0, Aliasee); -} - -static GlobalValue *getAliaseeGV(GlobalAlias *GA) { - Constant *C = GA->getAliasee(); - assert(C && "Must alias something"); - - if (GlobalValue *GV = dyn_cast<GlobalValue>(C)) - return GV; - -
diff --git a/lib/IR/IRPrintingPasses.cpp b/lib/IR/IRPrintingPasses.cpp
index 099c27c..c8a1747 100644
--- a/lib/IR/IRPrintingPasses.cpp
+++ b/lib/IR/IRPrintingPasses.cpp
@@ -94,7 +94,7 @@ public:
     return false;
   }
 
-  void getAnalysisUsage(AnalysisUsage &AU) const override{
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
   }
 };
diff --git a/lib/IR/InlineAsm.cpp b/lib/IR/InlineAsm.cpp
index 62d191d..a3e1da3b1 100644
--- a/lib/IR/InlineAsm.cpp
+++ b/lib/IR/InlineAsm.cpp
@@ -274,7 +274,7 @@ bool InlineAsm::Verify(FunctionType *Ty, StringRef ConstStr) {
     break;
   default:
     StructType *STy = dyn_cast<StructType>(Ty->getReturnType());
-    if (STy == 0 || STy->getNumElements() != NumOutputs)
+    if (!STy || STy->getNumElements() != NumOutputs)
       return false;
     break;
   }
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index d31a92e..28cc4cb 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -23,7 +23,7 @@ using namespace llvm;
 
 Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
                          Instruction *InsertBefore)
-  : User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(0) {
+  : User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(nullptr) {
   // Make sure that we get added to a basicblock
   LeakDetector::addGarbageObject(this);
@@ -41,7 +41,7 @@ const DataLayout *Instruction::getDataLayout() const {
 
 Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
                          BasicBlock *InsertAtEnd)
-  : User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(0) {
+  : User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(nullptr) {
   // Make sure that we get added to a basicblock
   LeakDetector::addGarbageObject(this);
@@ -53,7 +53,7 @@ Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
 
 // Out of line virtual method, so the vtable, etc has a home.
 Instruction::~Instruction() {
-  assert(Parent == 0 && "Instruction still linked in the program!");
+  assert(!Parent && "Instruction still linked in the program!");
   if (hasMetadataHashEntry())
     clearMetadataHashEntries();
 }
@@ -262,6 +262,58 @@ const char *Instruction::getOpcodeName(unsigned OpCode) {
   }
 }
 
+/// Return true if both instructions have the same special state
+/// This must be kept in sync with lib/Transforms/IPO/MergeFunctions.cpp.
+static bool haveSameSpecialState(const Instruction *I1, const Instruction *I2,
+                                 bool IgnoreAlignment = false) {
+  assert(I1->getOpcode() == I2->getOpcode() &&
+         "Can not compare special state of different instructions");
+
+  if (const LoadInst *LI = dyn_cast<LoadInst>(I1))
+    return LI->isVolatile() == cast<LoadInst>(I2)->isVolatile() &&
+           (LI->getAlignment() == cast<LoadInst>(I2)->getAlignment() ||
+            IgnoreAlignment) &&
+           LI->getOrdering() == cast<LoadInst>(I2)->getOrdering() &&
+           LI->getSynchScope() == cast<LoadInst>(I2)->getSynchScope();
+  if (const StoreInst *SI = dyn_cast<StoreInst>(I1))
+    return SI->isVolatile() == cast<StoreInst>(I2)->isVolatile() &&
+           (SI->getAlignment() == cast<StoreInst>(I2)->getAlignment() ||
+            IgnoreAlignment) &&
+           SI->getOrdering() == cast<StoreInst>(I2)->getOrdering() &&
+           SI->getSynchScope() == cast<StoreInst>(I2)->getSynchScope();
+  if (const CmpInst *CI = dyn_cast<CmpInst>(I1))
+    return CI->getPredicate() == cast<CmpInst>(I2)->getPredicate();
+  if (const CallInst *CI = dyn_cast<CallInst>(I1))
+    return CI->isTailCall() == cast<CallInst>(I2)->isTailCall() &&
+           CI->getCallingConv() == cast<CallInst>(I2)->getCallingConv() &&
+           CI->getAttributes() == cast<CallInst>(I2)->getAttributes();
+  if (const InvokeInst *CI = dyn_cast<InvokeInst>(I1))
+    return CI->getCallingConv() == cast<InvokeInst>(I2)->getCallingConv() &&
+           CI->getAttributes() ==
+             cast<InvokeInst>(I2)->getAttributes();
+  if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(I1))
+    return IVI->getIndices() == cast<InsertValueInst>(I2)->getIndices();
+  if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I1))
+    return EVI->getIndices() == cast<ExtractValueInst>(I2)->getIndices();
+  if (const FenceInst *FI = dyn_cast<FenceInst>(I1))
+    return FI->getOrdering() == cast<FenceInst>(I2)->getOrdering() &&
+           FI->getSynchScope() == cast<FenceInst>(I2)->getSynchScope();
+  if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I1))
+    return CXI->isVolatile() == cast<AtomicCmpXchgInst>(I2)->isVolatile() &&
+           CXI->getSuccessOrdering() ==
+             cast<AtomicCmpXchgInst>(I2)->getSuccessOrdering() &&
+           CXI->getFailureOrdering() ==
+             cast<AtomicCmpXchgInst>(I2)->getFailureOrdering() &&
+           CXI->getSynchScope() == cast<AtomicCmpXchgInst>(I2)->getSynchScope();
+  if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I1))
+    return RMWI->getOperation() == cast<AtomicRMWInst>(I2)->getOperation() &&
+           RMWI->isVolatile() == cast<AtomicRMWInst>(I2)->isVolatile() &&
+           RMWI->getOrdering() == cast<AtomicRMWInst>(I2)->getOrdering() &&
+           RMWI->getSynchScope() == cast<AtomicRMWInst>(I2)->getSynchScope();
+
+  return true;
+}
+
 /// isIdenticalTo - Return true if the specified instruction is exactly
 /// identical to the current one. This means that all operands match and any
 /// extra information (e.g. load is volatile) agree.
@@ -284,51 +336,13 @@ bool Instruction::isIdenticalToWhenDefined(const Instruction *I) const {
   if (!std::equal(op_begin(), op_end(), I->op_begin()))
     return false;
 
-  // Check special state that is a part of some instructions.
-  if (const LoadInst *LI = dyn_cast<LoadInst>(this))
-    return LI->isVolatile() == cast<LoadInst>(I)->isVolatile() &&
-           LI->getAlignment() == cast<LoadInst>(I)->getAlignment() &&
-           LI->getOrdering() == cast<LoadInst>(I)->getOrdering() &&
-           LI->getSynchScope() == cast<LoadInst>(I)->getSynchScope();
-  if (const StoreInst *SI = dyn_cast<StoreInst>(this))
-    return SI->isVolatile() == cast<StoreInst>(I)->isVolatile() &&
-           SI->getAlignment() == cast<StoreInst>(I)->getAlignment() &&
-           SI->getOrdering() == cast<StoreInst>(I)->getOrdering() &&
-           SI->getSynchScope() == cast<StoreInst>(I)->getSynchScope();
-  if (const CmpInst *CI = dyn_cast<CmpInst>(this))
-    return CI->getPredicate() == cast<CmpInst>(I)->getPredicate();
-  if (const CallInst *CI = dyn_cast<CallInst>(this))
-    return CI->isTailCall() == cast<CallInst>(I)->isTailCall() &&
-           CI->getCallingConv() == cast<CallInst>(I)->getCallingConv() &&
-           CI->getAttributes() == cast<CallInst>(I)->getAttributes();
-  if (const InvokeInst *CI = dyn_cast<InvokeInst>(this))
-    return CI->getCallingConv() == cast<InvokeInst>(I)->getCallingConv() &&
-           CI->getAttributes() == cast<InvokeInst>(I)->getAttributes();
-  if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(this))
-    return IVI->getIndices() == cast<InsertValueInst>(I)->getIndices();
-  if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(this))
-    return EVI->getIndices() == cast<ExtractValueInst>(I)->getIndices();
-  if (const FenceInst *FI = dyn_cast<FenceInst>(this))
-    return FI->getOrdering() == cast<FenceInst>(FI)->getOrdering() &&
-           FI->getSynchScope() == cast<FenceInst>(FI)->getSynchScope();
-  if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(this))
-    return CXI->isVolatile() == cast<AtomicCmpXchgInst>(I)->isVolatile() &&
-           CXI->getSuccessOrdering() ==
-             cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() &&
-           CXI->getFailureOrdering() ==
-             cast<AtomicCmpXchgInst>(I)->getFailureOrdering() &&
-           CXI->getSynchScope() == cast<AtomicCmpXchgInst>(I)->getSynchScope();
-  if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(this))
-    return RMWI->getOperation() == cast<AtomicRMWInst>(I)->getOperation() &&
-           RMWI->isVolatile() == cast<AtomicRMWInst>(I)->isVolatile() &&
-           RMWI->getOrdering() == cast<AtomicRMWInst>(I)->getOrdering() &&
-           RMWI->getSynchScope() == cast<AtomicRMWInst>(I)->getSynchScope();
   if (const PHINode *thisPHI = dyn_cast<PHINode>(this)) {
     const PHINode *otherPHI = cast<PHINode>(I);
     return std::equal(thisPHI->block_begin(), thisPHI->block_end(),
                       otherPHI->block_begin());
   }
-  return true;
+
+  return haveSameSpecialState(this, I);
 }
 
 // isSameOperationAs
@@ -355,50 +369,7 @@ bool Instruction::isSameOperationAs(const Instruction *I,
       getOperand(i)->getType() != I->getOperand(i)->getType())
     return false;
 
-  // Check special state that is a part of some instructions.
-  if (const LoadInst *LI = dyn_cast<LoadInst>(this))
-    return LI->isVolatile() == cast<LoadInst>(I)->isVolatile() &&
-           (LI->getAlignment() == cast<LoadInst>(I)->getAlignment() ||
-            IgnoreAlignment) &&
-           LI->getOrdering() == cast<LoadInst>(I)->getOrdering() &&
-           LI->getSynchScope() == cast<LoadInst>(I)->getSynchScope();
-  if (const StoreInst *SI = dyn_cast<StoreInst>(this))
-    return SI->isVolatile() == cast<StoreInst>(I)->isVolatile() &&
-           (SI->getAlignment() == cast<StoreInst>(I)->getAlignment() ||
-            IgnoreAlignment) &&
-           SI->getOrdering() == cast<StoreInst>(I)->getOrdering() &&
-           SI->getSynchScope() == cast<StoreInst>(I)->getSynchScope();
-  if (const CmpInst *CI = dyn_cast<CmpInst>(this))
-    return CI->getPredicate() == cast<CmpInst>(I)->getPredicate();
-  if (const CallInst *CI = dyn_cast<CallInst>(this))
-    return CI->isTailCall() == cast<CallInst>(I)->isTailCall() &&
-           CI->getCallingConv() == cast<CallInst>(I)->getCallingConv() &&
-           CI->getAttributes() == cast<CallInst>(I)->getAttributes();
-  if (const InvokeInst *CI = dyn_cast<InvokeInst>(this))
-    return CI->getCallingConv() == cast<InvokeInst>(I)->getCallingConv() &&
-           CI->getAttributes() ==
-             cast<InvokeInst>(I)->getAttributes();
-  if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(this))
-    return IVI->getIndices() == cast<InsertValueInst>(I)->getIndices();
-  if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(this))
-    return EVI->getIndices() == cast<ExtractValueInst>(I)->getIndices();
-  if (const FenceInst *FI = dyn_cast<FenceInst>(this))
-    return FI->getOrdering() == cast<FenceInst>(I)->getOrdering() &&
-           FI->getSynchScope() == cast<FenceInst>(I)->getSynchScope();
-  if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(this))
-    return CXI->isVolatile() == cast<AtomicCmpXchgInst>(I)->isVolatile() &&
-           CXI->getSuccessOrdering() ==
-             cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() &&
-           CXI->getFailureOrdering() ==
-             cast<AtomicCmpXchgInst>(I)->getFailureOrdering() &&
-           CXI->getSynchScope() == cast<AtomicCmpXchgInst>(I)->getSynchScope();
-  if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(this))
-    return RMWI->getOperation() == cast<AtomicRMWInst>(I)->getOperation() &&
-           RMWI->isVolatile() == cast<AtomicRMWInst>(I)->isVolatile() &&
-           RMWI->getOrdering() == cast<AtomicRMWInst>(I)->getOrdering() &&
-           RMWI->getSynchScope() == cast<AtomicRMWInst>(I)->getSynchScope();
-
-  return true;
+  return haveSameSpecialState(this, I, IgnoreAlignment);
 }
 
 /// isUsedOutsideOfBlock - Return true if there are any uses of I outside of the
@@ -410,7 +381,7 @@ bool Instruction::isUsedOutsideOfBlock(const BasicBlock *BB) const {
     // instructions, just check to see whether the parent of the use matches up.
     const Instruction *I = cast<Instruction>(U.getUser());
     const PHINode *PN = dyn_cast<PHINode>(I);
-    if (PN == 0) {
+    if (!PN) {
       if (I->getParent() != BB)
         return true;
       continue;
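
A minimal sketch, not part of the patch, of what the refactoring preserves:
isIdenticalTo() still requires matching operands plus matching special state,
while isSameOperationAs() compares only opcode, types and special state. L1
and L2 are hypothetical LoadInst pointers.

    // Both loads must agree on volatility, alignment, ordering and scope.
    bool Identical = L1->isIdenticalTo(L2);   // also compares the operands
    bool SameOp = L1->isSameOperationAs(L2);  // ignores the operand values
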
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index 3aa8413..13c51b8 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -68,7 +68,7 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) {
     if (VT->getElementType() != Type::getInt1Ty(Op0->getContext()))
       return "vector select condition element type must be i1";
     VectorType *ET = dyn_cast<VectorType>(Op1->getType());
-    if (ET == 0)
+    if (!ET)
       return "selected values for vector select must be vectors";
     if (ET->getNumElements() != VT->getNumElements())
       return "vector select requires selected vectors to have "
@@ -76,7 +76,7 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) {
   } else if (Op0->getType() != Type::getInt1Ty(Op0->getContext())) {
     return "select condition must be i1 or <n x i1>";
   }
-  return 0;
+  return nullptr;
 }
 
@@ -123,7 +123,7 @@ Value *PHINode::removeIncomingValue(unsigned Idx, bool DeletePHIIfEmpty) {
   std::copy(block_begin() + Idx + 1, block_end(), block_begin() + Idx);
 
   // Nuke the last value.
-  Op<-1>().set(0);
+  Op<-1>().set(nullptr);
   --NumOperands;
 
   // If the PHI node is dead, because it has zero entries, nuke it now.
@@ -164,7 +164,7 @@ Value *PHINode::hasConstantValue() const {
   for (unsigned i = 1, e = getNumIncomingValues(); i != e; ++i)
     if (getIncomingValue(i) != ConstantValue && getIncomingValue(i) != this) {
       if (ConstantValue != this)
-        return 0; // Incoming values not all the same.
+        return nullptr; // Incoming values not all the same.
        // The case where the first value is this PHI.
       ConstantValue = getIncomingValue(i);
     }
@@ -180,14 +180,14 @@ Value *PHINode::hasConstantValue() const {
 LandingPadInst::LandingPadInst(Type *RetTy, Value *PersonalityFn,
                                unsigned NumReservedValues, const Twine &NameStr,
                                Instruction *InsertBefore)
-  : Instruction(RetTy, Instruction::LandingPad, 0, 0, InsertBefore) {
+  : Instruction(RetTy, Instruction::LandingPad, nullptr, 0, InsertBefore) {
   init(PersonalityFn, 1 + NumReservedValues, NameStr);
 }
 
 LandingPadInst::LandingPadInst(Type *RetTy, Value *PersonalityFn,
                                unsigned NumReservedValues, const Twine &NameStr,
                                BasicBlock *InsertAtEnd)
-  : Instruction(RetTy, Instruction::LandingPad, 0, 0, InsertAtEnd) {
+  : Instruction(RetTy, Instruction::LandingPad, nullptr, 0, InsertAtEnd) {
   init(PersonalityFn, 1 + NumReservedValues, NameStr);
 }
 
@@ -324,7 +324,7 @@ CallInst::CallInst(const CallInst &CI)
                    OperandTraits<CallInst>::op_end(this) - CI.getNumOperands(),
                    CI.getNumOperands()) {
   setAttributes(CI.getAttributes());
-  setTailCall(CI.isTailCall());
+  setTailCallKind(CI.getTailCallKind());
   setCallingConv(CI.getCallingConv());
 
   std::copy(CI.op_begin(), CI.op_end(), op_begin());
@@ -420,8 +420,8 @@ static Instruction *createMalloc(Instruction *InsertBefore,
   // prototype malloc as "void *malloc(size_t)"
   MallocFunc = M->getOrInsertFunction("malloc", BPTy, IntPtrTy, NULL);
   PointerType *AllocPtrType = PointerType::getUnqual(AllocTy);
-  CallInst *MCall = NULL;
-  Instruction *Result = NULL;
+  CallInst *MCall = nullptr;
+  Instruction *Result = nullptr;
   if (InsertBefore) {
     MCall = CallInst::Create(MallocFunc, AllocSize, "malloccall", InsertBefore);
     Result = MCall;
@@ -458,7 +458,7 @@ Instruction *CallInst::CreateMalloc(Instruction *InsertBefore,
                                     Value *AllocSize, Value *ArraySize,
                                     Function * MallocF,
                                     const Twine &Name) {
-  return createMalloc(InsertBefore, NULL, IntPtrTy, AllocTy, AllocSize,
+  return createMalloc(InsertBefore, nullptr, IntPtrTy, AllocTy, AllocSize,
                       ArraySize, MallocF, Name);
 }
 
@@ -474,7 +474,7 @@ Instruction *CallInst::CreateMalloc(BasicBlock *InsertAtEnd,
                                     Type *IntPtrTy, Type *AllocTy,
                                     Value *AllocSize, Value *ArraySize,
                                     Function *MallocF, const Twine &Name) {
-  return createMalloc(NULL, InsertAtEnd, IntPtrTy, AllocTy, AllocSize,
+  return createMalloc(nullptr, InsertAtEnd, IntPtrTy, AllocTy, AllocSize,
                       ArraySize, MallocF, Name);
 }
 
@@ -492,7 +492,7 @@ static Instruction* createFree(Value* Source, Instruction *InsertBefore,
   Type *IntPtrTy = Type::getInt8PtrTy(M->getContext());
   // prototype free as "void free(void*)"
   Value *FreeFunc = M->getOrInsertFunction("free", VoidTy, IntPtrTy, NULL);
-  CallInst* Result = NULL;
+  CallInst* Result = nullptr;
   Value *PtrCast = Source;
   if (InsertBefore) {
     if (Source->getType() != IntPtrTy)
@@ -512,14 +512,14 @@ static Instruction* createFree(Value* Source, Instruction *InsertBefore,
 /// CreateFree - Generate the IR for a call to the builtin free function.
 Instruction * CallInst::CreateFree(Value* Source, Instruction *InsertBefore) {
-  return createFree(Source, InsertBefore, NULL);
+  return createFree(Source, InsertBefore, nullptr);
 }
 
 /// CreateFree - Generate the IR for a call to the builtin free function.
 /// Note: This function does not add the call to the basic block, that is the
 /// responsibility of the caller.
 Instruction* CallInst::CreateFree(Value* Source, BasicBlock *InsertAtEnd) {
-  Instruction* FreeCall = createFree(Source, NULL, InsertAtEnd);
+  Instruction* FreeCall = createFree(Source, nullptr, InsertAtEnd);
   assert(FreeCall && "CreateFree did not create a CallInst");
   return FreeCall;
 }
@@ -699,11 +699,11 @@ BasicBlock *ResumeInst::getSuccessorV(unsigned idx) const {
 UnreachableInst::UnreachableInst(LLVMContext &Context,
                                  Instruction *InsertBefore)
   : TerminatorInst(Type::getVoidTy(Context), Instruction::Unreachable,
-                   0, 0, InsertBefore) {
+                   nullptr, 0, InsertBefore) {
 }
 UnreachableInst::UnreachableInst(LLVMContext &Context, BasicBlock *InsertAtEnd)
   : TerminatorInst(Type::getVoidTy(Context), Instruction::Unreachable,
-                   0, 0, InsertAtEnd) {
+                   nullptr, 0, InsertAtEnd) {
 }
 
 unsigned UnreachableInst::getNumSuccessorsV() const {
@@ -732,7 +732,7 @@ BranchInst::BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore)
   : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
                    OperandTraits<BranchInst>::op_end(this) - 1,
                    1, InsertBefore) {
-  assert(IfTrue != 0 && "Branch destination may not be null!");
+  assert(IfTrue && "Branch destination may not be null!");
   Op<-1>() = IfTrue;
 }
 BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
@@ -752,7 +752,7 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *InsertAtEnd)
   : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
                    OperandTraits<BranchInst>::op_end(this) - 1,
                    1, InsertAtEnd) {
-  assert(IfTrue != 0 && "Branch destination may not be null!");
+  assert(IfTrue && "Branch destination may not be null!");
   Op<-1>() = IfTrue;
 }
 
@@ -852,7 +852,7 @@ AllocaInst::AllocaInst(Type *Ty, Value *ArraySize,
 AllocaInst::AllocaInst(Type *Ty, const Twine &Name,
                        Instruction *InsertBefore)
   : UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
-                     getAISize(Ty->getContext(), 0), InsertBefore) {
+                     getAISize(Ty->getContext(), nullptr), InsertBefore) {
   setAlignment(0);
   assert(!Ty->isVoidTy() && "Cannot allocate void!");
   setName(Name);
@@ -861,7 +861,7 @@ AllocaInst::AllocaInst(Type *Ty, const Twine &Name,
 
 AllocaInst::AllocaInst(Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd)
   : UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
-                     getAISize(Ty->getContext(), 0), InsertAtEnd) {
+                     getAISize(Ty->getContext(), nullptr), InsertAtEnd) {
   setAlignment(0);
   assert(!Ty->isVoidTy() && "Cannot allocate void!");
   setName(Name);
@@ -1323,7 +1323,7 @@ AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
 FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
                      SynchronizationScope SynchScope,
                      Instruction *InsertBefore)
-  : Instruction(Type::getVoidTy(C), Fence, 0, 0, InsertBefore) {
+  : Instruction(Type::getVoidTy(C), Fence, nullptr, 0, InsertBefore) {
   setOrdering(Ordering);
   setSynchScope(SynchScope);
 }
@@ -1331,7 +1331,7 @@ FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
 FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
                      SynchronizationScope SynchScope,
                      BasicBlock *InsertAtEnd)
-  : Instruction(Type::getVoidTy(C), Fence, 0, 0, InsertAtEnd) {
+  : Instruction(Type::getVoidTy(C), Fence, nullptr, 0, InsertAtEnd) {
   setOrdering(Ordering);
   setSynchScope(SynchScope);
 }
 
@@ -1369,7 +1369,7 @@ GetElementPtrInst::GetElementPtrInst(const GetElementPtrInst &GEPI)
 template <typename IndexTy>
 static Type *getIndexedTypeInternal(Type *Ptr, ArrayRef<IndexTy> IdxList) {
   PointerType *PTy = dyn_cast<PointerType>(Ptr->getScalarType());
-  if (!PTy) return 0;   // Type isn't a pointer type!
+  if (!PTy) return nullptr;   // Type isn't a pointer type!
   Type *Agg = PTy->getElementType();
 
   // Handle the special case of the empty set index set, which is always valid.
@@ -1379,17 +1379,17 @@ static Type *getIndexedTypeInternal(Type *Ptr, ArrayRef<IndexTy> IdxList) {
   // If there is at least one index, the top level type must be sized, otherwise
   // it cannot be 'stepped over'.
   if (!Agg->isSized())
-    return 0;
+    return nullptr;
 
   unsigned CurIdx = 1;
   for (; CurIdx != IdxList.size(); ++CurIdx) {
     CompositeType *CT = dyn_cast<CompositeType>(Agg);
-    if (!CT || CT->isPointerTy()) return 0;
+    if (!CT || CT->isPointerTy()) return nullptr;
     IndexTy Index = IdxList[CurIdx];
-    if (!CT->indexValid(Index)) return 0;
+    if (!CT->indexValid(Index)) return nullptr;
     Agg = CT->getTypeAtIndex(Index);
   }
-  return CurIdx == IdxList.size() ? Agg : 0;
+  return CurIdx == IdxList.size() ? Agg : nullptr;
 }
 
 Type *GetElementPtrInst::getIndexedType(Type *Ptr, ArrayRef<Value *> IdxList) {
@@ -1479,7 +1479,7 @@ ExtractElementInst::ExtractElementInst(Value *Val, Value *Index,
 
 bool ExtractElementInst::isValidOperands(const Value *Val, const Value *Index) {
-  if (!Val->getType()->isVectorTy() || !Index->getType()->isIntegerTy(32))
+  if (!Val->getType()->isVectorTy() || !Index->getType()->isIntegerTy())
     return false;
   return true;
 }
@@ -1526,7 +1526,7 @@ bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt,
   if (Elt->getType() != cast<VectorType>(Vec->getType())->getElementType())
     return false;// Second operand of insertelement must be vector element type.
 
-  if (!Index->getType()->isIntegerTy(32))
+  if (!Index->getType()->isIntegerTy())
     return false;  // Third operand of insertelement must be i32.
   return true;
 }
@@ -1579,7 +1579,7 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2,
 
   // Mask must be vector of i32.
   VectorType *MaskTy = dyn_cast<VectorType>(Mask->getType());
-  if (MaskTy == 0 || !MaskTy->getElementType()->isIntegerTy(32))
+  if (!MaskTy || !MaskTy->getElementType()->isIntegerTy(32))
     return false;
 
   // Check to see if Mask is valid.
@@ -1721,13 +1721,13 @@ Type *ExtractValueInst::getIndexedType(Type *Agg,
     // as easy to check those manually as well.
     if (ArrayType *AT = dyn_cast<ArrayType>(Agg)) {
       if (Index >= AT->getNumElements())
-        return 0;
+        return nullptr;
     } else if (StructType *ST = dyn_cast<StructType>(Agg)) {
       if (Index >= ST->getNumElements())
-        return 0;
+        return nullptr;
     } else {
       // Not a valid type to index into.
-      return 0;
+      return nullptr;
     }
 
     Agg = cast<CompositeType>(Agg)->getTypeAtIndex(Index);
@@ -2130,7 +2130,7 @@ bool CastInst::isNoopCast(const DataLayout *DL) const {
     return isNoopCast(Type::getInt64Ty(getContext()));
   }
 
-  Type *PtrOpTy = 0;
+  Type *PtrOpTy = nullptr;
   if (getOpcode() == Instruction::PtrToInt)
     PtrOpTy = getOperand(0)->getType();
   else if (getOpcode() == Instruction::IntToPtr)
@@ -3361,7 +3361,7 @@ void SwitchInst::init(Value *Value, BasicBlock *Default, unsigned NumReserved) {
 SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
                        Instruction *InsertBefore)
   : TerminatorInst(Type::getVoidTy(Value->getContext()), Instruction::Switch,
-                   0, 0, InsertBefore) {
+                   nullptr, 0, InsertBefore) {
   init(Value, Default, 2+NumCases*2);
 }
 
@@ -3372,12 +3372,12 @@ SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
 SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
                        BasicBlock *InsertAtEnd)
   : TerminatorInst(Type::getVoidTy(Value->getContext()), Instruction::Switch,
-                   0, 0, InsertAtEnd) {
+                   nullptr, 0, InsertAtEnd) {
   init(Value, Default, 2+NumCases*2);
 }
 
 SwitchInst::SwitchInst(const SwitchInst &SI)
-  : TerminatorInst(SI.getType(), Instruction::Switch, 0, 0) {
+  : TerminatorInst(SI.getType(), Instruction::Switch, nullptr, 0) {
   init(SI.getCondition(), SI.getDefaultDest(), SI.getNumOperands());
   NumOperands = SI.getNumOperands();
   Use *OL = OperandList, *InOL = SI.OperandList;
@@ -3425,8 +3425,8 @@ void SwitchInst::removeCase(CaseIt i) {
   }
 
   // Nuke the last value.
-  OL[NumOps-2].set(0);
-  OL[NumOps-2+1].set(0);
+  OL[NumOps-2].set(nullptr);
+  OL[NumOps-2+1].set(nullptr);
   NumOperands = NumOps-2;
 }
 
@@ -3492,14 +3492,14 @@ void IndirectBrInst::growOperands() {
 IndirectBrInst::IndirectBrInst(Value *Address, unsigned NumCases,
                                Instruction *InsertBefore)
 : TerminatorInst(Type::getVoidTy(Address->getContext()),Instruction::IndirectBr,
-                 0, 0, InsertBefore) {
+                 nullptr, 0, InsertBefore) {
   init(Address, NumCases);
 }
 
 IndirectBrInst::IndirectBrInst(Value *Address, unsigned NumCases,
                                BasicBlock *InsertAtEnd)
 : TerminatorInst(Type::getVoidTy(Address->getContext()),Instruction::IndirectBr,
-                 0, 0, InsertAtEnd) {
+                 nullptr, 0, InsertAtEnd) {
   init(Address, NumCases);
 }
 
@@ -3541,7 +3541,7 @@ void IndirectBrInst::removeDestination(unsigned idx) {
   OL[idx+1] = OL[NumOps-1];
 
   // Nuke the last value.
-  OL[NumOps-1].set(0);
+  OL[NumOps-1].set(nullptr);
   NumOperands = NumOps-1;
 }
 
@@ -3587,9 +3587,10 @@ InsertValueInst *InsertValueInst::clone_impl() const {
 }
 
 AllocaInst *AllocaInst::clone_impl() const {
-  return new AllocaInst(getAllocatedType(),
-                        (Value*)getOperand(0),
-                        getAlignment());
+  AllocaInst *Result = new AllocaInst(getAllocatedType(),
+                                      (Value *)getOperand(0), getAlignment());
+  Result->setUsedWithInAlloca(isUsedWithInAlloca());
+  return Result;
 }
 
 LoadInst *LoadInst::clone_impl() const {
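
A minimal sketch, not from the patch, of the behavior the
AllocaInst::clone_impl() change guarantees; Ty and InsertPt are assumed to
exist in the caller.

    AllocaInst *AI = new AllocaInst(Ty, nullptr, "tmp", InsertPt);
    AI->setUsedWithInAlloca(true);
    AllocaInst *Clone = cast<AllocaInst>(AI->clone());
    assert(Clone->isUsedWithInAlloca() && "inalloca flag now survives cloning");
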
diff --git a/lib/IR/IntrinsicInst.cpp b/lib/IR/IntrinsicInst.cpp
index 554f2be..5725284 100644
--- a/lib/IR/IntrinsicInst.cpp
+++ b/lib/IR/IntrinsicInst.cpp
@@ -35,7 +35,7 @@ static Value *CastOperand(Value *C) {
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
     if (CE->isCast())
       return CE->getOperand(0);
-  return NULL;
+  return nullptr;
 }
 
 Value *DbgInfoIntrinsic::StripCast(Value *C) {
@@ -57,7 +57,7 @@ Value *DbgDeclareInst::getAddress() const {
   if (MDNode* MD = cast_or_null<MDNode>(getArgOperand(0)))
     return MD->getOperand(0);
   else
-    return NULL;
+    return nullptr;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp
index 1bfc515..de825f0 100644
--- a/lib/IR/LLVMContext.cpp
+++ b/lib/IR/LLVMContext.cpp
@@ -15,6 +15,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "LLVMContextImpl.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/Instruction.h"
@@ -114,6 +115,17 @@ void *LLVMContext::getDiagnosticContext() const {
   return pImpl->DiagnosticContext;
 }
 
+void LLVMContext::setYieldCallback(YieldCallbackTy Callback, void *OpaqueHandle)
+{
+  pImpl->YieldCallback = Callback;
+  pImpl->YieldOpaqueHandle = OpaqueHandle;
+}
+
+void LLVMContext::yield() {
+  if (pImpl->YieldCallback)
+    pImpl->YieldCallback(this, pImpl->YieldOpaqueHandle);
+}
+
 void LLVMContext::emitError(const Twine &ErrorStr) {
   diagnose(DiagnosticInfoInlineAsm(ErrorStr));
 }
@@ -125,10 +137,32 @@ void LLVMContext::emitError(const Instruction *I, const Twine &ErrorStr) {
 
 void LLVMContext::diagnose(const DiagnosticInfo &DI) {
   // If there is a report handler, use it.
-  if (pImpl->DiagnosticHandler != 0) {
+  if (pImpl->DiagnosticHandler) {
     pImpl->DiagnosticHandler(DI, pImpl->DiagnosticContext);
     return;
   }
+
+  // Optimization remarks are selective. They need to check whether the regexp
+  // pattern, passed via one of the -pass-remarks* flags, matches the name of
+  // the pass that is emitting the diagnostic. If there is no match, ignore the
+  // diagnostic and return.
+  switch (DI.getKind()) {
+  case llvm::DK_OptimizationRemark:
+    if (!cast<DiagnosticInfoOptimizationRemark>(DI).isEnabled())
+      return;
+    break;
+  case llvm::DK_OptimizationRemarkMissed:
+    if (!cast<DiagnosticInfoOptimizationRemarkMissed>(DI).isEnabled())
+      return;
+    break;
+  case llvm::DK_OptimizationRemarkAnalysis:
+    if (!cast<DiagnosticInfoOptimizationRemarkAnalysis>(DI).isEnabled())
+      return;
+    break;
+  default:
+    break;
+  }
+
   // Otherwise, print the message with a prefix based on the severity.
   std::string MsgStorage;
   raw_string_ostream Stream(MsgStorage);
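
Hedged illustration of the new remark gating, not part of the patch: a remark
emitted by a pass now reaches the default printer only when its pass name
matches the -pass-remarks regexp. F, DLoc and the pass name are assumptions.

    Ctx.diagnose(DiagnosticInfoOptimizationRemark(
        "licm", F, DLoc, "hoisted loop-invariant load"));
    // Printed only if the user passed something like -pass-remarks=licm.
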
diff --git a/lib/IR/LLVMContextImpl.cpp b/lib/IR/LLVMContextImpl.cpp
index ebff9d3..4c2791f 100644
--- a/lib/IR/LLVMContextImpl.cpp
+++ b/lib/IR/LLVMContextImpl.cpp
@@ -14,12 +14,13 @@
 #include "LLVMContextImpl.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Module.h"
 #include <cctype>
 using namespace llvm;
 
 LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
-  : TheTrueVal(0), TheFalseVal(0),
+  : TheTrueVal(nullptr), TheFalseVal(nullptr),
     VoidTy(C, Type::VoidTyID),
     LabelTy(C, Type::LabelTyID),
     HalfTy(C, Type::HalfTyID),
@@ -35,10 +36,12 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
     Int16Ty(C, 16),
     Int32Ty(C, 32),
     Int64Ty(C, 64) {
-  InlineAsmDiagHandler = 0;
-  InlineAsmDiagContext = 0;
-  DiagnosticHandler = 0;
-  DiagnosticContext = 0;
+  InlineAsmDiagHandler = nullptr;
+  InlineAsmDiagContext = nullptr;
+  DiagnosticHandler = nullptr;
+  DiagnosticContext = nullptr;
+  YieldCallback = nullptr;
+  YieldOpaqueHandle = nullptr;
   NamedStructTypesUniqueID = 0;
 }
 
@@ -46,8 +49,7 @@ namespace {
 struct DropReferences {
   // Takes the value_type of a ConstantUniqueMap's internal map, whose 'second'
   // is a Constant*.
-  template<typename PairT>
-  void operator()(const PairT &P) {
+  template <typename PairT> void operator()(const PairT &P) {
     P.second->dropAllReferences();
   }
 };
@@ -64,12 +66,11 @@ struct DropFirst {
 }
 
 LLVMContextImpl::~LLVMContextImpl() {
-  // NOTE: We need to delete the contents of OwnedModules, but we have to
-  // duplicate it into a temporary vector, because the destructor of Module
-  // will try to remove itself from OwnedModules set. This would cause
-  // iterator invalidation if we iterated on the set directly.
-  std::vector<Module*> Modules(OwnedModules.begin(), OwnedModules.end());
-  DeleteContainerPointers(Modules);
+  // NOTE: We need to delete the contents of OwnedModules, but Module's dtor
+  // will call LLVMContextImpl::removeModule, thus invalidating iterators into
+  // the container. Avoid iterators during this operation:
+  while (!OwnedModules.empty())
+    delete *OwnedModules.begin();
 
   // Free the constants. This is important to do here to ensure that they are
   // freed before the LeakDetector is torn down.
diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h
index dc77d29..808c239 100644
--- a/lib/IR/LLVMContextImpl.h
+++ b/lib/IR/LLVMContextImpl.h
@@ -37,6 +37,9 @@ namespace llvm {
 
 class ConstantInt;
 class ConstantFP;
+class DiagnosticInfoOptimizationRemark;
+class DiagnosticInfoOptimizationRemarkMissed;
+class DiagnosticInfoOptimizationRemarkAnalysis;
 class LLVMContext;
 class Type;
 class Value;
@@ -56,8 +59,8 @@ struct DenseMapAPIntKeyInfo {
       return hash_combine(Key.type, Key.val);
     }
   };
-  static inline KeyTy getEmptyKey() { return KeyTy(APInt(1,0), 0); }
-  static inline KeyTy getTombstoneKey() { return KeyTy(APInt(1,1), 0); }
+  static inline KeyTy getEmptyKey() { return KeyTy(APInt(1,0), nullptr); }
+  static inline KeyTy getTombstoneKey() { return KeyTy(APInt(1,1), nullptr); }
   static unsigned getHashValue(const KeyTy &Key) {
     return static_cast<unsigned>(hash_value(Key));
   }
@@ -242,6 +245,9 @@ public:
   LLVMContext::DiagnosticHandlerTy DiagnosticHandler;
   void *DiagnosticContext;
 
+  LLVMContext::YieldCallbackTy YieldCallback;
+  void *YieldOpaqueHandle;
+
   typedef DenseMap<DenseMapAPIntKeyInfo::KeyTy, ConstantInt *,
                    DenseMapAPIntKeyInfo> IntMapTy;
   IntMapTy IntConstants;
diff --git a/lib/IR/LeaksContext.h b/lib/IR/LeaksContext.h
index 5038dc9..52ac170 100644
--- a/lib/IR/LeaksContext.h
+++ b/lib/IR/LeaksContext.h
@@ -12,8 +12,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_IR_LEAKSCONTEXT_H
+#define LLVM_IR_LEAKSCONTEXT_H
+
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/IR/Value.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
 
@@ -30,10 +34,10 @@ struct PrinterTrait<Value> {
 template <typename T>
 struct LeakDetectorImpl {
   explicit LeakDetectorImpl(const char* const name = "") :
-    Cache(0), Name(name) { }
+    Cache(nullptr), Name(name) { }
 
   void clear() {
-    Cache = 0;
+    Cache = nullptr;
     Ts.clear();
   }
 
@@ -57,15 +61,15 @@ struct LeakDetectorImpl {
   void removeGarbage(const T* o) {
     if (o == Cache)
-      Cache = 0; // Cache hit
+      Cache = nullptr; // Cache hit
     else
       Ts.erase(o);
   }
 
   bool hasGarbage(const std::string& Message) {
-    addGarbage(0); // Flush the Cache
+    addGarbage(nullptr); // Flush the Cache
 
-    assert(Cache == 0 && "No value should be cached anymore!");
+    assert(!Cache && "No value should be cached anymore!");
 
     if (!Ts.empty()) {
       errs() << "Leaked " << Name << " objects found: " << Message << ":\n";
@@ -90,3 +94,5 @@ private:
 };
 
 }
+
+#endif // LLVM_IR_LEAKSCONTEXT_H
diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp
index 7c5cc68..d3f3482 100644
--- a/lib/IR/LegacyPassManager.cpp
+++ b/lib/IR/LegacyPassManager.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/LegacyPassManagers.h"
@@ -22,6 +23,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Mutex.h"
+#include "llvm/Support/TimeValue.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
@@ -118,7 +120,7 @@ bool PMDataManager::isPassDebuggingExecutionsOrMore() const {
 
 void PassManagerPrettyStackEntry::print(raw_ostream &OS) const {
-  if (V == 0 && M == 0)
+  if (!V && !M)
     OS << "Releasing pass '";
   else
     OS << "Running pass '";
@@ -129,7 +131,7 @@ void PassManagerPrettyStackEntry::print(raw_ostream &OS) const {
     OS << " on module '" << M->getModuleIdentifier() << "'.\n";
     return;
   }
-  if (V == 0) {
+  if (!V) {
     OS << '\n';
     return;
   }
@@ -484,11 +486,11 @@ public:
   /// getPassTimer - Return the timer for the specified pass if it exists.
   Timer *getPassTimer(Pass *P) {
     if (P->getAsPMDataManager())
-      return 0;
+      return nullptr;
 
     sys::SmartScopedLock<true> Lock(*TimingInfoMutex);
     Timer *&T = TimingData[P];
-    if (T == 0)
+    if (!T)
       T = new Timer(P->getPassName(), TG);
     return T;
   }
@@ -579,7 +581,7 @@ void PMTopLevelManager::collectLastUses(SmallVectorImpl<Pass *> &LastUses,
 }
 
 AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) {
-  AnalysisUsage *AnUsage = NULL;
+  AnalysisUsage *AnUsage = nullptr;
   DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.find(P);
   if (DMI != AnUsageMap.end())
     AnUsage = DMI->second;
@@ -626,7 +628,7 @@ void PMTopLevelManager::schedulePass(Pass *P) {
 
     if (!AnalysisPass) {
       const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
 
-      if (PI == NULL) {
+      if (!PI) {
         // Pass P is not in the global PassRegistry
         dbgs() << "Pass '"  << P->getPassName() << "' is not initialized." << "\n";
         dbgs() << "Verify if there is a pass dependency cycle." << "\n";
@@ -733,7 +735,7 @@ Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
     }
   }
 
-  return 0;
+  return nullptr;
 }
 
 // Print passes managed by this top level manager.
@@ -830,7 +832,7 @@ void PMDataManager::recordAvailableAnalysis(Pass *P) {
   // This pass is the current implementation of all of the interfaces it
   // implements as well.
   const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI);
-  if (PInf == 0) return;
+  if (!PInf) return;
   const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
   for (unsigned i = 0, e = II.size(); i != e; ++i)
     AvailableAnalysis[II[i]->getTypeInfo()] = P;
@@ -847,7 +849,7 @@ bool PMDataManager::preserveHigherLevelAnalysis(Pass *P) {
   for (SmallVectorImpl<Pass *>::iterator I = HigherLevelAnalysis.begin(),
          E = HigherLevelAnalysis.end(); I != E; ++I) {
     Pass *P1 = *I;
-    if (P1->getAsImmutablePass() == 0 &&
+    if (P1->getAsImmutablePass() == nullptr &&
         std::find(PreservedSet.begin(), PreservedSet.end(), P1->getPassID()) ==
           PreservedSet.end())
@@ -887,7 +889,7 @@ void PMDataManager::removeNotPreservedAnalysis(Pass *P) {
   for (DenseMap<AnalysisID, Pass*>::iterator I = AvailableAnalysis.begin(),
          E = AvailableAnalysis.end(); I != E; ) {
     DenseMap<AnalysisID, Pass*>::iterator Info = I++;
-    if (Info->second->getAsImmutablePass() == 0 &&
+    if (Info->second->getAsImmutablePass() == nullptr &&
         std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
         PreservedSet.end()) {
       // Remove this analysis
@@ -911,7 +913,7 @@ void PMDataManager::removeNotPreservedAnalysis(Pass *P) {
            I = InheritedAnalysis[Index]->begin(),
            E = InheritedAnalysis[Index]->end(); I != E; ) {
       DenseMap<AnalysisID, Pass *>::iterator Info = I++;
-      if (Info->second->getAsImmutablePass() == 0 &&
+      if (Info->second->getAsImmutablePass() == nullptr &&
          std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
              PreservedSet.end()) {
         // Remove this analysis
@@ -1028,7 +1030,7 @@ void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
   // Set P as P's last user until someone starts using P.
   // However, if P is a Pass Manager then it does not need
   // to record its last user.
-  if (P->getAsPMDataManager() == 0)
+  if (!P->getAsPMDataManager())
     LastUses.push_back(P);
   TPM->setLastUser(LastUses, P);
 
@@ -1095,7 +1097,7 @@ void PMDataManager::initializeAnalysisImpl(Pass *P) {
          I = AnUsage->getRequiredSet().begin(),
         E = AnUsage->getRequiredSet().end(); I != E; ++I) {
     Pass *Impl = findAnalysisPass(*I, true);
-    if (Impl == 0)
+    if (!Impl)
       // This may be analysis pass that is initialized on the fly.
      // If that is not the case then it will raise an assert when it is used.
       continue;
@@ -1119,7 +1121,7 @@ Pass *PMDataManager::findAnalysisPass(AnalysisID AID, bool SearchParent) {
 
   if (SearchParent)
     return TPM->findAnalysisPass(AID);
 
-  return NULL;
+  return nullptr;
 }
 
 // Print list of passes that are last used by P.
@@ -1158,7 +1160,8 @@ void PMDataManager::dumpPassInfo(Pass *P, enum PassDebuggingString S1,
                                  StringRef Msg) {
   if (PassDebugging < Executions)
     return;
-  dbgs() << (void*)this << std::string(getDepth()*2+1, ' ');
+  dbgs() << "[" << sys::TimeValue::now().str() << "] " << (void *)this
+         << std::string(getDepth() * 2 + 1, ' ');
   switch (S1) {
   case EXECUTION_MSG:
     dbgs() << "Executing Pass '" << P->getPassName();
@@ -1487,8 +1490,10 @@ bool FunctionPassManagerImpl::run(Function &F) {
   TimingInfo::createTheTimeInfo();
 
   initializeAllAnalysisInfo();
-  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
     Changed |= getContainedManager(Index)->runOnFunction(F);
+    F.getContext().yield();
+  }
 
   for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
     getContainedManager(Index)->cleanup();
@@ -1657,6 +1662,8 @@ void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
   assert((P->getPotentialPassManagerType() <
           RequiredPass->getPotentialPassManagerType()) &&
          "Unable to handle Pass that requires lower level Analysis pass");
+  if (!RequiredPass)
+    return;
 
   FunctionPassManagerImpl *FPP = OnTheFlyManagers[P];
   if (!FPP) {
@@ -1666,14 +1673,24 @@ void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
     OnTheFlyManagers[P] = FPP;
   }
-  FPP->add(RequiredPass);
+  const PassInfo * RequiredPassPI =
+    PassRegistry::getPassRegistry()->getPassInfo(RequiredPass->getPassID());
 
-  // Register P as the last user of RequiredPass.
-  if (RequiredPass) {
-    SmallVector<Pass *, 1> LU;
-    LU.push_back(RequiredPass);
-    FPP->setLastUser(LU,  P);
+  Pass *FoundPass = nullptr;
+  if (RequiredPassPI && RequiredPassPI->isAnalysis()) {
+    FoundPass =
+      ((PMTopLevelManager*)FPP)->findAnalysisPass(RequiredPass->getPassID());
   }
+  if (!FoundPass) {
+    FoundPass = RequiredPass;
+    // This should be guaranteed to add RequiredPass to the passmanager given
+    // that we checked for an available analysis above.
+    FPP->add(RequiredPass);
+  }
+  // Register P as the last user of FoundPass or RequiredPass.
+  SmallVector<Pass *, 1> LU;
+  LU.push_back(FoundPass);
+  FPP->setLastUser(LU, P);
 }
 
 /// Return function pass corresponding to PassInfo PI, that is
@@ -1709,8 +1726,10 @@ bool PassManagerImpl::run(Module &M) {
   }
 
   initializeAllAnalysisInfo();
-  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
     Changed |= getContainedManager(Index)->runOnModule(M);
+    M.getContext().yield();
+  }
 
   for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
        E = IPV.end(); I != E; ++I) {
@@ -1773,7 +1792,7 @@ void TimingInfo::createTheTimeInfo() {
 Timer *llvm::getPassTimer(Pass *P) {
   if (TheTimeInfo) return TheTimeInfo->getPassTimer(P);
-  return 0;
+  return nullptr;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/IR/MDBuilder.cpp b/lib/IR/MDBuilder.cpp
new file mode 100644
index 0000000..65cdf38
--- /dev/null
+++ b/lib/IR/MDBuilder.cpp
@@ -0,0 +1,139 @@
+//===---- llvm/MDBuilder.cpp - Builder for LLVM metadata ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MDBuilder class, which is used as a convenient way to
+// create LLVM metadata with a consistent and simplified interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Metadata.h"
+using namespace llvm;
+
+MDString *MDBuilder::createString(StringRef Str) {
+  return MDString::get(Context, Str);
+}
+
+MDNode *MDBuilder::createFPMath(float Accuracy) {
+  if (Accuracy == 0.0)
+    return nullptr;
+  assert(Accuracy > 0.0 && "Invalid fpmath accuracy!");
+  Value *Op = ConstantFP::get(Type::getFloatTy(Context), Accuracy);
+  return MDNode::get(Context, Op);
+}
+
+MDNode *MDBuilder::createBranchWeights(uint32_t TrueWeight,
+                                       uint32_t FalseWeight) {
+  uint32_t Weights[] = {TrueWeight, FalseWeight};
+  return createBranchWeights(Weights);
+}
+
+MDNode *MDBuilder::createBranchWeights(ArrayRef<uint32_t> Weights) {
+  assert(Weights.size() >= 2 && "Need at least two branch weights!");
+
+  SmallVector<Value *, 4> Vals(Weights.size() + 1);
+  Vals[0] = createString("branch_weights");
+
+  Type *Int32Ty = Type::getInt32Ty(Context);
+  for (unsigned i = 0, e = Weights.size(); i != e; ++i)
+    Vals[i + 1] = ConstantInt::get(Int32Ty, Weights[i]);
+
+  return MDNode::get(Context, Vals);
+}
+
+MDNode *MDBuilder::createRange(const APInt &Lo, const APInt &Hi) {
+  assert(Lo.getBitWidth() == Hi.getBitWidth() && "Mismatched bitwidths!");
+  // If the range is everything then it is useless.
+  if (Hi == Lo)
+    return nullptr;
+
+  // Return the range [Lo, Hi).
+  Type *Ty = IntegerType::get(Context, Lo.getBitWidth());
+  Value *Range[2] = {ConstantInt::get(Ty, Lo), ConstantInt::get(Ty, Hi)};
+  return MDNode::get(Context, Range);
+}
+
+MDNode *MDBuilder::createAnonymousTBAARoot() {
+  // To ensure uniqueness the root node is self-referential.
+  MDNode *Dummy = MDNode::getTemporary(Context, ArrayRef<Value *>());
+  MDNode *Root = MDNode::get(Context, Dummy);
+  // At this point we have
+  //   !0 = metadata !{}            <- dummy
+  //   !1 = metadata !{metadata !0} <- root
+  // Replace the dummy operand with the root node itself and delete the dummy.
+  Root->replaceOperandWith(0, Root);
+  MDNode::deleteTemporary(Dummy);
+  // We now have
+  //   !1 = metadata !{metadata !1} <- self-referential root
+  return Root;
+}
+
+MDNode *MDBuilder::createTBAARoot(StringRef Name) {
+  return MDNode::get(Context, createString(Name));
+}
+
+/// \brief Return metadata for a non-root TBAA node with the given name,
+/// parent in the TBAA tree, and value for 'pointsToConstantMemory'.
+MDNode *MDBuilder::createTBAANode(StringRef Name, MDNode *Parent,
+                                  bool isConstant) {
+  if (isConstant) {
+    Constant *Flags = ConstantInt::get(Type::getInt64Ty(Context), 1);
+    Value *Ops[3] = {createString(Name), Parent, Flags};
+    return MDNode::get(Context, Ops);
+  } else {
+    Value *Ops[2] = {createString(Name), Parent};
+    return MDNode::get(Context, Ops);
+  }
+}
+
+/// \brief Return metadata for a tbaa.struct node with the given
+/// struct field descriptions.
+MDNode *MDBuilder::createTBAAStructNode(ArrayRef<TBAAStructField> Fields) {
+  SmallVector<Value *, 4> Vals(Fields.size() * 3);
+  Type *Int64 = Type::getInt64Ty(Context);
+  for (unsigned i = 0, e = Fields.size(); i != e; ++i) {
+    Vals[i * 3 + 0] = ConstantInt::get(Int64, Fields[i].Offset);
+    Vals[i * 3 + 1] = ConstantInt::get(Int64, Fields[i].Size);
+    Vals[i * 3 + 2] = Fields[i].TBAA;
+  }
+  return MDNode::get(Context, Vals);
+}
+
+/// \brief Return metadata for a TBAA struct node in the type DAG
+/// with the given name, a list of pairs (offset, field type in the type DAG).
+MDNode *MDBuilder::createTBAAStructTypeNode(
+    StringRef Name, ArrayRef<std::pair<MDNode *, uint64_t>> Fields) {
+  SmallVector<Value *, 4> Ops(Fields.size() * 2 + 1);
+  Type *Int64 = Type::getInt64Ty(Context);
+  Ops[0] = createString(Name);
+  for (unsigned i = 0, e = Fields.size(); i != e; ++i) {
+    Ops[i * 2 + 1] = Fields[i].first;
+    Ops[i * 2 + 2] = ConstantInt::get(Int64, Fields[i].second);
+  }
+  return MDNode::get(Context, Ops);
+}
+
+/// \brief Return metadata for a TBAA scalar type node with the
+/// given name, an offset and a parent in the TBAA type DAG.
+MDNode *MDBuilder::createTBAAScalarTypeNode(StringRef Name, MDNode *Parent,
+                                            uint64_t Offset) {
+  ConstantInt *Off = ConstantInt::get(Type::getInt64Ty(Context), Offset);
+  Value *Ops[3] = {createString(Name), Parent, Off};
+  return MDNode::get(Context, Ops);
+}
+
+/// \brief Return metadata for a TBAA tag node with the given
+/// base type, access type and offset relative to the base type.
+MDNode *MDBuilder::createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType,
+                                           uint64_t Offset) {
+  Type *Int64 = Type::getInt64Ty(Context);
+  Value *Ops[3] = {BaseType, AccessType, ConstantInt::get(Int64, Offset)};
+  return MDNode::get(Context, Ops);
+}
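
Illustrative use of the relocated MDBuilder helpers (a sketch, not from the
patch); BI is an assumed conditional BranchInst and Ctx its LLVMContext.

    MDBuilder MDB(Ctx);
    // Annotate a branch as 90%/10% taken/not-taken.
    BI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(90, 10));
    // Metadata for a value known to lie in the half-open range [1, 17).
    MDNode *Range = MDB.createRange(APInt(8, 1), APInt(8, 17));
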
diff --git a/lib/IR/Mangler.cpp b/lib/IR/Mangler.cpp
index d82388f..27d973b 100644
--- a/lib/IR/Mangler.cpp
+++ b/lib/IR/Mangler.cpp
@@ -108,7 +108,7 @@ void Mangler::getNameWithPrefix(raw_ostream &OS, const GlobalValue *GV,
   }
 
   bool UseAt = false;
-  const Function *MSFunc = NULL;
+  const Function *MSFunc = nullptr;
   CallingConv::ID CC;
   if (DL->hasMicrosoftFastStdCallMangling()) {
     if ((MSFunc = dyn_cast<Function>(GV))) {
diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp
index ba39334..4d932d0 100644
--- a/lib/IR/Metadata.cpp
+++ b/lib/IR/Metadata.cpp
@@ -87,7 +87,7 @@ public:
 MDNodeOperand::~MDNodeOperand() {}
 
 void MDNodeOperand::deleted() {
-  getParent()->replaceOperand(this, 0);
+  getParent()->replaceOperand(this, nullptr);
 }
 
 void MDNodeOperand::allUsesReplacedWith(Value *NV) {
@@ -148,10 +148,10 @@ MDNode::~MDNode() {
 }
 
 static const Function *getFunctionForValue(Value *V) {
-  if (!V) return NULL;
+  if (!V) return nullptr;
   if (Instruction *I = dyn_cast<Instruction>(V)) {
     BasicBlock *BB = I->getParent();
-    return BB ? BB->getParent() : 0;
+    return BB ? BB->getParent() : nullptr;
   }
   if (Argument *A = dyn_cast<Argument>(V))
     return A->getParent();
@@ -159,15 +159,15 @@ static const Function *getFunctionForValue(Value *V) {
     return BB->getParent();
   if (MDNode *MD = dyn_cast<MDNode>(V))
     return MD->getFunction();
-  return NULL;
+  return nullptr;
 }
 
 #ifndef NDEBUG
 static const Function *assertLocalFunction(const MDNode *N) {
-  if (!N->isFunctionLocal()) return 0;
+  if (!N->isFunctionLocal()) return nullptr;
 
   // FIXME: This does not handle cyclic function local metadata.
-  const Function *F = 0, *NewF = 0;
+  const Function *F = nullptr, *NewF = nullptr;
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
     if (Value *V = N->getOperand(i)) {
       if (MDNode *MD = dyn_cast<MDNode>(V))
@@ -175,10 +175,11 @@ static const Function *assertLocalFunction(const MDNode *N) {
       else
         NewF = getFunctionForValue(V);
     }
-    if (F == 0)
+    if (!F)
       F = NewF;
-    else
-      assert((NewF == 0 || F == NewF) &&"inconsistent function-local metadata");
+    else
+      assert((NewF == nullptr || F == NewF) &&
+             "inconsistent function-local metadata");
   }
   return F;
 }
@@ -192,11 +193,11 @@ const Function *MDNode::getFunction() const {
 #ifndef NDEBUG
   return assertLocalFunction(this);
 #else
-  if (!isFunctionLocal()) return NULL;
+  if (!isFunctionLocal()) return nullptr;
   for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
     if (const Function *F = getFunctionForValue(getOperand(i)))
       return F;
-  return NULL;
+  return nullptr;
 #endif
 }
 
@@ -335,14 +336,14 @@ void MDNode::replaceOperand(MDNodeOperand *Op, Value *To) {
   // Likewise if the MDNode is function-local but for a different function.
   if (To && isFunctionLocalValue(To)) {
     if (!isFunctionLocal())
-      To = 0;
+      To = nullptr;
     else {
       const Function *F = getFunction();
       const Function *FV = getFunctionForValue(To);
       // Metadata can be function-local without having an associated function.
       // So only consider functions to have changed if non-null.
       if (F && FV && F != FV)
-        To = 0;
+        To = nullptr;
     }
   }
 
@@ -366,7 +367,7 @@ void MDNode::replaceOperand(MDNodeOperand *Op, Value *To) {
   // anymore. This commonly occurs during destruction, and uniquing these
   // brings little reuse. Also, this means we don't need to include
   // isFunctionLocal bits in FoldingSetNodeIDs for MDNodes.
-  if (To == 0) {
+  if (!To) {
     setIsNotUniqued();
     return;
   }
@@ -407,7 +408,7 @@ void MDNode::replaceOperand(MDNodeOperand *Op, Value *To) {
 
 MDNode *MDNode::getMostGenericFPMath(MDNode *A, MDNode *B) {
   if (!A || !B)
-    return NULL;
+    return nullptr;
 
   APFloat AVal = cast<ConstantFP>(A->getOperand(0))->getValueAPF();
   APFloat BVal = cast<ConstantFP>(B->getOperand(0))->getValueAPF();
@@ -457,7 +458,7 @@ MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
   // the ones that overlap.
 
   if (!A || !B)
-    return NULL;
+    return nullptr;
 
   if (A == B)
     return A;
@@ -512,7 +513,7 @@ MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
     ConstantRange Range(cast<ConstantInt>(EndPoints[0])->getValue(),
                         cast<ConstantInt>(EndPoints[1])->getValue());
 
     if (Range.isFullSet())
-      return NULL;
+      return nullptr;
   }
 
   return MDNode::get(A->getContext(), EndPoints);
@@ -527,7 +528,7 @@ static SmallVector<TrackingVH<MDNode>, 4> &getNMDOps(void *Operands) {
 }
 
 NamedMDNode::NamedMDNode(const Twine &N)
-  : Name(N.str()), Parent(0),
+  : Name(N.str()), Parent(nullptr),
     Operands(new SmallVector<TrackingVH<MDNode>, 4>()) {
 }
@@ -575,7 +576,7 @@ StringRef NamedMDNode::getName() const {
 //
 
 void Instruction::setMetadata(StringRef Kind, MDNode *Node) {
-  if (Node == 0 && !hasMetadata()) return;
+  if (!Node && !hasMetadata()) return;
   setMetadata(getContext().getMDKindID(Kind), Node);
 }
 
@@ -631,7 +632,7 @@ void Instruction::dropUnknownMetadata(ArrayRef<unsigned> KnownIDs) {
 /// node. This updates/replaces metadata if already present, or removes it if
 /// Node is null.
 void Instruction::setMetadata(unsigned KindID, MDNode *Node) {
-  if (Node == 0 && !hasMetadata()) return;
+  if (!Node && !hasMetadata()) return;
 
   // Handle 'dbg' as a special case since it is not stored in the hash table.
   if (KindID == LLVMContext::MD_dbg) {
@@ -691,7 +692,7 @@ MDNode *Instruction::getMetadataImpl(unsigned KindID) const {
   if (KindID == LLVMContext::MD_dbg)
     return DbgLoc.getAsMDNode(getContext());
 
-  if (!hasMetadataHashEntry()) return 0;
+  if (!hasMetadataHashEntry()) return nullptr;
 
   LLVMContextImpl::MDMapTy &Info = getContext().pImpl->MetadataStore[this];
   assert(!Info.empty() && "bit out of sync with hash table");
 
   for (const auto &I : Info)
     if (I.first == KindID)
       return I.second;
-  return 0;
+  return nullptr;
 }
 
 void Instruction::getAllMetadataImpl(SmallVectorImpl<std::pair<unsigned,
diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp
--- a/lib/IR/Module.cpp
+++ b/lib/IR/Module.cpp
 #include <algorithm>
 #include <cstdarg>
@@ -95,7 +96,7 @@ Constant *Module::getOrInsertFunction(StringRef Name,
                                       AttributeSet AttributeList) {
   // See if we have a definition for the specified function already.
   GlobalValue *F = getNamedValue(Name);
-  if (F == 0) {
+  if (!F) {
     // Nope, add it
     Function *New = Function::Create(Ty, GlobalVariable::ExternalLinkage, Name);
     if (!New->isIntrinsic())       // Intrinsics get attrs set on construction
@@ -183,7 +184,7 @@ GlobalVariable *Module::getGlobalVariable(StringRef Name, bool AllowLocal) {
         dyn_cast_or_null<GlobalVariable>(getNamedValue(Name)))
     if (AllowLocal || !Result->hasLocalLinkage())
       return Result;
-  return 0;
+  return nullptr;
 }
 
 /// getOrInsertGlobal - Look up the specified global in the module symbol table.
@@ -195,11 +196,11 @@ GlobalVariable *Module::getGlobalVariable(StringRef Name, bool AllowLocal) {
 Constant *Module::getOrInsertGlobal(StringRef Name, Type *Ty) {
   // See if we have a definition for the specified global already.
   GlobalVariable *GV = dyn_cast_or_null<GlobalVariable>(getNamedValue(Name));
-  if (GV == 0) {
+  if (!GV) {
     // Nope, add it
     GlobalVariable *New =
       new GlobalVariable(*this, Ty, false, GlobalVariable::ExternalLinkage,
-                         0, Name);
+                         nullptr, Name);
      return New;                    // Return the new declaration.
   }
 
@@ -284,7 +285,7 @@ Value *Module::getModuleFlag(StringRef Key) const {
     if (Key == MFE.Key->getString())
       return MFE.Val;
   }
-  return 0;
+  return nullptr;
 }
 
 /// getModuleFlagsMetadata - Returns the NamedMDNode in the module that
@@ -350,7 +351,7 @@ void Module::setDataLayout(const DataLayout *Other) {
 
 const DataLayout *Module::getDataLayout() const {
   if (DataLayoutStr.empty())
-    return 0;
+    return nullptr;
   return &DL;
 }
 
@@ -429,3 +430,10 @@ void Module::dropAllReferences() {
   for(Module::alias_iterator I = alias_begin(), E = alias_end(); I != E; ++I)
     I->dropAllReferences();
 }
+
+unsigned Module::getDwarfVersion() const {
+  Value *Val = getModuleFlag("Dwarf Version");
+  if (!Val)
+    return dwarf::DWARF_VERSION;
+  return cast<ConstantInt>(Val)->getZExtValue();
+}
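
A small sketch, not part of the patch, of the new accessor's fallback
behavior; M is an assumed Module.

    M.addModuleFlag(Module::Warning, "Dwarf Version", 2);
    unsigned V = M.getDwarfVersion();  // 2 here; dwarf::DWARF_VERSION if unset
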
diff --git a/lib/IR/Pass.cpp b/lib/IR/Pass.cpp
index e16c5b7..bb55d2a 100644
--- a/lib/IR/Pass.cpp
+++ b/lib/IR/Pass.cpp
@@ -22,6 +22,8 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "ir"
+
 //===----------------------------------------------------------------------===//
 // Pass Implementation
 //
@@ -44,7 +46,7 @@ PassManagerType ModulePass::getPotentialPassManagerType() const {
 }
 
 bool Pass::mustPreserveAnalysisID(char &AID) const {
-  return Resolver->getAnalysisIfAvailable(&AID, true) != 0;
+  return Resolver->getAnalysisIfAvailable(&AID, true) != nullptr;
 }
 
 // dumpPassStructure - Implement the -debug-pass=Structure option
@@ -90,11 +92,11 @@ void *Pass::getAdjustedAnalysisPointer(AnalysisID AID) {
 }
 
 ImmutablePass *Pass::getAsImmutablePass() {
-  return 0;
+  return nullptr;
 }
 
 PMDataManager *Pass::getAsPMDataManager() {
-  return 0;
+  return nullptr;
 }
 
 void Pass::setResolver(AnalysisResolver *AR) {
@@ -112,7 +114,7 @@ void Pass::print(raw_ostream &O,const Module*) const {
 
 // dump - call print(cerr);
 void Pass::dump() const {
-  print(dbgs(), 0);
+  print(dbgs(), nullptr);
 }
 
 //===----------------------------------------------------------------------===//
@@ -193,7 +195,7 @@ const PassInfo *Pass::lookupPassInfo(StringRef Arg) {
 Pass *Pass::createPass(AnalysisID ID) {
   const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(ID);
   if (!PI)
-    return NULL;
+    return nullptr;
   return PI->createPass();
 }
 
diff --git a/lib/IR/PassManager.cpp b/lib/IR/PassManager.cpp
index ea15455..0defb6a 100644
--- a/lib/IR/PassManager.cpp
+++ b/lib/IR/PassManager.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -32,6 +33,8 @@ PreservedAnalyses ModulePassManager::run(Module *M, ModuleAnalysisManager *AM) {
       if (AM)
         AM->invalidate(M, PassPA);
       PA.intersect(std::move(PassPA));
+
+      M->getContext().yield();
     }
 
   if (DebugPM)
@@ -59,7 +62,7 @@ ModuleAnalysisManager::ResultConceptT *
 ModuleAnalysisManager::getCachedResultImpl(void *PassID, Module *M) const {
   ModuleAnalysisResultMapT::const_iterator RI =
     ModuleAnalysisResults.find(PassID);
-  return RI == ModuleAnalysisResults.end() ? 0 : &*RI->second;
+  return RI == ModuleAnalysisResults.end() ? nullptr : &*RI->second;
 }
 
 void ModuleAnalysisManager::invalidateImpl(void *PassID, Module *M) {
@@ -92,6 +95,8 @@ PreservedAnalyses FunctionPassManager::run(Function *F,
       if (AM)
         AM->invalidate(F, PassPA);
       PA.intersect(std::move(PassPA));
+
+      F->getContext().yield();
     }
 
   if (DebugPM)
@@ -135,7 +140,7 @@ FunctionAnalysisManager::ResultConceptT *
 FunctionAnalysisManager::getCachedResultImpl(void *PassID, Function *F) const {
   FunctionAnalysisResultMapT::const_iterator RI =
     FunctionAnalysisResults.find(std::make_pair(PassID, F));
-  return RI == FunctionAnalysisResults.end() ? 0 : &*RI->second->second;
+  return RI == FunctionAnalysisResults.end() ? nullptr : &*RI->second->second;
 }
 
 void FunctionAnalysisManager::invalidateImpl(void *PassID, Function *F) {
@@ -165,6 +170,8 @@ void FunctionAnalysisManager::invalidateImpl(Function *F,
   while (!InvalidatedPassIDs.empty())
     FunctionAnalysisResults.erase(
       std::make_pair(InvalidatedPassIDs.pop_back_val(), F));
+  if (ResultsList.empty())
+    FunctionAnalysisResultLists.erase(F);
 }
 
 char FunctionAnalysisManagerModuleProxy::PassID;
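
Sketch of how a host embedding LLVM could use the yield hook that both the
legacy and the new pass managers now honor; all names here are assumptions,
not part of the patch.

    static void PumpHostEvents(LLVMContext *Ctx, void *Handle) {
      // e.g. keep a UI responsive during a long compile
    }
    ...
    Ctx.setYieldCallback(PumpHostEvents, nullptr);
    MPM.run(&M, &MAM);  // LLVMContext::yield() now fires between passes
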
registerPass(Registeree); InterfaceInfo = &Registeree; @@ -174,7 +169,7 @@ void PassRegistry::registerAnalysisGroup(const void *InterfaceID, "Cannot add a pass to the same analysis group more than once!"); AGI.Implementations.insert(ImplementationInfo); if (isDefault) { - assert(InterfaceInfo->getNormalCtor() == 0 && + assert(InterfaceInfo->getNormalCtor() == nullptr && "Default implementation for analysis group already specified!"); assert(ImplementationInfo->getNormalCtor() && "Cannot specify pass as default if it does not have a default ctor"); @@ -185,7 +180,8 @@ void PassRegistry::registerAnalysisGroup(const void *InterfaceID, } PassRegistryImpl *Impl = static_cast(getImpl()); - if (ShouldFree) Impl->ToFree.push_back(&Registeree); + if (ShouldFree) + Impl->ToFree.push_back(std::unique_ptr(&Registeree)); } void PassRegistry::addRegistrationListener(PassRegistrationListener *L) { diff --git a/lib/IR/SymbolTableListTraitsImpl.h b/lib/IR/SymbolTableListTraitsImpl.h index 5a383ee..8302597 100644 --- a/lib/IR/SymbolTableListTraitsImpl.h +++ b/lib/IR/SymbolTableListTraitsImpl.h @@ -65,7 +65,7 @@ void SymbolTableListTraits template void SymbolTableListTraits ::addNodeToList(ValueSubClass *V) { - assert(V->getParent() == 0 && "Value already in a container!!"); + assert(!V->getParent() && "Value already in a container!!"); ItemParentClass *Owner = getListOwner(); V->setParent(Owner); if (V->hasName()) @@ -76,7 +76,7 @@ void SymbolTableListTraits template void SymbolTableListTraits ::removeNodeFromList(ValueSubClass *V) { - V->setParent(0); + V->setParent(nullptr); if (V->hasName()) if (ValueSymbolTable *ST = TraitsClass::getSymTab(getListOwner())) ST->removeValueName(V->getValueName()); diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp index b02509f..1efde47 100644 --- a/lib/IR/Type.cpp +++ b/lib/IR/Type.cpp @@ -36,7 +36,7 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) { case MetadataTyID : return getMetadataTy(C); case X86_MMXTyID : return getX86_MMXTy(C); default: - return 0; + return nullptr; } } @@ -312,8 +312,8 @@ IntegerType *IntegerType::get(LLVMContext &C, unsigned NumBits) { } IntegerType *&Entry = C.pImpl->IntegerTypes[NumBits]; - - if (Entry == 0) + + if (!Entry) Entry = new (C.pImpl->TypeAllocator) IntegerType(C, NumBits); return Entry; @@ -448,7 +448,7 @@ void StructType::setName(StringRef Name) { if (SymbolTableEntry) { // Delete the old string data. ((EntryTy *)SymbolTableEntry)->Destroy(SymbolTable.getAllocator()); - SymbolTableEntry = 0; + SymbolTableEntry = nullptr; } return; } @@ -497,7 +497,7 @@ StructType *StructType::get(LLVMContext &Context, bool isPacked) { } StructType *StructType::get(Type *type, ...) { - assert(type != 0 && "Cannot create a struct type with no elements with this"); + assert(type && "Cannot create a struct type with no elements with this"); LLVMContext &Ctx = type->getContext(); va_list ap; SmallVector StructFields; @@ -538,7 +538,7 @@ StructType *StructType::create(ArrayRef Elements) { } StructType *StructType::create(StringRef Name, Type *type, ...) 
{ - assert(type != 0 && "Cannot create a struct type with no elements with this"); + assert(type && "Cannot create a struct type with no elements with this"); LLVMContext &Ctx = type->getContext(); va_list ap; SmallVector StructFields; @@ -576,13 +576,13 @@ bool StructType::isSized(SmallPtrSet *Visited) const { StringRef StructType::getName() const { assert(!isLiteral() && "Literal structs never have names"); - if (SymbolTableEntry == 0) return StringRef(); - + if (!SymbolTableEntry) return StringRef(); + return ((StringMapEntry *)SymbolTableEntry)->getKey(); } void StructType::setBody(Type *type, ...) { - assert(type != 0 && "Cannot create a struct type with no elements with this"); + assert(type && "Cannot create a struct type with no elements with this"); va_list ap; SmallVector StructFields; va_start(ap, type); @@ -680,8 +680,8 @@ ArrayType *ArrayType::get(Type *elementType, uint64_t NumElements) { LLVMContextImpl *pImpl = ElementType->getContext().pImpl; ArrayType *&Entry = pImpl->ArrayTypes[std::make_pair(ElementType, NumElements)]; - - if (Entry == 0) + + if (!Entry) Entry = new (pImpl->TypeAllocator) ArrayType(ElementType, NumElements); return Entry; } @@ -709,8 +709,8 @@ VectorType *VectorType::get(Type *elementType, unsigned NumElements) { LLVMContextImpl *pImpl = ElementType->getContext().pImpl; VectorType *&Entry = ElementType->getContext().pImpl ->VectorTypes[std::make_pair(ElementType, NumElements)]; - - if (Entry == 0) + + if (!Entry) Entry = new (pImpl->TypeAllocator) VectorType(ElementType, NumElements); return Entry; } @@ -734,7 +734,7 @@ PointerType *PointerType::get(Type *EltTy, unsigned AddressSpace) { PointerType *&Entry = AddressSpace == 0 ? CImpl->PointerTypes[EltTy] : CImpl->ASPointerTypes[std::make_pair(EltTy, AddressSpace)]; - if (Entry == 0) + if (!Entry) Entry = new (CImpl->TypeAllocator) PointerType(EltTy, AddressSpace); return Entry; } diff --git a/lib/IR/Use.cpp b/lib/IR/Use.cpp index 60a0c56..047861c 100644 --- a/lib/IR/Use.cpp +++ b/lib/IR/Use.cpp @@ -27,14 +27,14 @@ void Use::swap(Use &RHS) { Val = RHS.Val; Val->addUse(*this); } else { - Val = 0; + Val = nullptr; } if (OldVal) { RHS.Val = OldVal; RHS.Val->addUse(RHS); } else { - RHS.Val = 0; + RHS.Val = nullptr; } } @@ -49,7 +49,7 @@ unsigned Use::getOperandNo() const { return this - getUser()->op_begin(); } -// Sets up the waymarking algoritm's tags for a series of Uses. See the +// Sets up the waymarking algorithm's tags for a series of Uses. See the // algorithm details here: // // http://www.llvm.org/docs/ProgrammersManual.html#UserLayout diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp index 97a562e..d734e4e 100644 --- a/lib/IR/Value.cpp +++ b/lib/IR/Value.cpp @@ -44,7 +44,7 @@ static inline Type *checkType(Type *Ty) { Value::Value(Type *ty, unsigned scid) : SubclassID(scid), HasValueHandle(0), SubclassOptionalData(0), SubclassData(0), VTy((Type*)checkType(ty)), - UseList(0), Name(0) { + UseList(nullptr), Name(nullptr) { // FIXME: Why isn't this in the subclass gunk?? // Note, we cannot call isa before the CallInst has been // constructed. @@ -141,7 +141,7 @@ unsigned Value::getNumUses() const { } static bool getSymTab(Value *V, ValueSymbolTable *&ST) { - ST = 0; + ST = nullptr; if (Instruction *I = dyn_cast(V)) { if (BasicBlock *P = I->getParent()) if (Function *PP = P->getParent()) @@ -203,7 +203,7 @@ void Value::setName(const Twine &NewName) { if (NameRef.empty()) { // Free the name for this value. 
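The Type.cpp hunks above all touch the same idiom: LLVM's type-uniquing caches take a reference into a per-context map and allocate only while the slot is still null, so repeated requests for the same type yield one object. A standalone sketch of the get-or-create pattern, with a made-up VecType and plain new standing in for the context's bump allocator:

    #include <cstdint>
    #include <map>
    #include <utility>

    struct VecType {
      int EltKind;  // stand-in for the element Type*
      uint64_t N;   // element count
    };

    // One cache slot per (element, count) pair. operator[] default-constructs
    // a null slot on first use; holding a reference lets us fill it in place.
    static std::map<std::pair<int, uint64_t>, VecType *> Cache;

    VecType *getVecType(int EltKind, uint64_t N) {
      VecType *&Entry = Cache[{EltKind, N}];
      if (!Entry) // first request: allocate (LLVM uses a bump allocator here)
        Entry = new VecType{EltKind, N};
      return Entry; // every later request returns the same object
    }

    int main() {
      return getVecType(1, 4) == getVecType(1, 4) ? 0 : 1; // uniqued: exits 0
    }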
Name->Destroy(); - Name = 0; + Name = nullptr; return; } @@ -225,7 +225,7 @@ void Value::setName(const Twine &NewName) { // Remove old name. ST->removeValueName(Name); Name->Destroy(); - Name = 0; + Name = nullptr; if (NameRef.empty()) return; @@ -241,7 +241,7 @@ void Value::setName(const Twine &NewName) { void Value::takeName(Value *V) { assert(SubclassID != MDStringVal && "Cannot take the name of an MDString!"); - ValueSymbolTable *ST = 0; + ValueSymbolTable *ST = nullptr; // If this value has a name, drop it. if (hasName()) { // Get the symtab this is in. @@ -256,7 +256,7 @@ void Value::takeName(Value *V) { if (ST) ST->removeValueName(Name); Name->Destroy(); - Name = 0; + Name = nullptr; } // Now we know that this has no name. @@ -283,7 +283,7 @@ void Value::takeName(Value *V) { if (ST == VST) { // Take the name! Name = V->Name; - V->Name = 0; + V->Name = nullptr; Name->setValue(this); return; } @@ -294,17 +294,73 @@ void Value::takeName(Value *V) { if (VST) VST->removeValueName(V->Name); Name = V->Name; - V->Name = 0; + V->Name = nullptr; Name->setValue(this); if (ST) ST->reinsertValue(this); } +static GlobalObject &findReplacementForAliasUse(Value &C) { + if (auto *GO = dyn_cast(&C)) + return *GO; + if (auto *GA = dyn_cast(&C)) + return *GA->getAliasee(); + auto *CE = cast(&C); + assert(CE->getOpcode() == Instruction::BitCast || + CE->getOpcode() == Instruction::GetElementPtr || + CE->getOpcode() == Instruction::AddrSpaceCast); + if (CE->getOpcode() == Instruction::GetElementPtr) + assert(cast(CE)->hasAllZeroIndices()); + return findReplacementForAliasUse(*CE->getOperand(0)); +} + +static void replaceAliasUseWith(Use &U, Value *New) { + GlobalObject &Replacement = findReplacementForAliasUse(*New); + assert(&cast(*U) != &Replacement && + "replaceAliasUseWith cannot form an alias cycle"); + U.set(&Replacement); +} + +#ifndef NDEBUG +static bool contains(SmallPtrSet &Cache, ConstantExpr *Expr, + Constant *C) { + if (!Cache.insert(Expr)) + return false; + + for (auto &O : Expr->operands()) { + if (O == C) + return true; + auto *CE = dyn_cast(O); + if (!CE) + continue; + if (contains(Cache, CE, C)) + return true; + } + return false; +} + +static bool contains(Value *Expr, Value *V) { + if (Expr == V) + return true; + + auto *C = dyn_cast(V); + if (!C) + return false; + + auto *CE = dyn_cast(Expr); + if (!CE) + return false; + + SmallPtrSet Cache; + return contains(Cache, CE, C); +} +#endif void Value::replaceAllUsesWith(Value *New) { assert(New && "Value::replaceAllUsesWith() is invalid!"); - assert(New != this && "this->replaceAllUsesWith(this) is NOT valid!"); + assert(!contains(New, this) && + "this->replaceAllUsesWith(expr(this)) is NOT valid!"); assert(New->getType() == getType() && "replaceAllUses of value with new value of different type!"); @@ -316,7 +372,11 @@ void Value::replaceAllUsesWith(Value *New) { Use &U = *UseList; // Must handle Constants specially, we cannot call replaceUsesOfWith on a // constant because they are uniqued. - if (Constant *C = dyn_cast(U.getUser())) { + if (auto *C = dyn_cast(U.getUser())) { + if (isa(C)) { + replaceAliasUseWith(U, New); + continue; + } if (!isa(C)) { C->replaceUsesOfWithOnConstant(this, New, &U); continue; @@ -557,7 +617,7 @@ void ValueHandleBase::AddToUseList() { // If this value already has a ValueHandle, then it must be in the // ValueHandles map already. 
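The new contains() helpers above strengthen replaceAllUsesWith's old "New != this" assertion: they walk the replacement's constant-expression operands, with a visited set to cut off shared subexpressions, and reject any New that still reaches the value being replaced. The same reachability check over a generic operand graph, sketched with a hypothetical Node type:

    #include <unordered_set>
    #include <vector>

    struct Node {
      std::vector<Node *> Ops; // operands, possibly shared between nodes
    };

    // Returns true if Target is reachable from Expr through operand edges.
    // Visited prevents re-walking shared subexpressions, mirroring the
    // SmallPtrSet cache in the verifier-only (NDEBUG-guarded) check above.
    static bool contains(std::unordered_set<Node *> &Visited, Node *Expr,
                         Node *Target) {
      if (Expr == Target)
        return true;
      if (!Visited.insert(Expr).second)
        return false; // already explored this subtree
      for (Node *Op : Expr->Ops)
        if (contains(Visited, Op, Target))
          return true;
      return false;
    }

    int main() {
      Node A, B;
      B.Ops.push_back(&A); // B plays the role of expr(A)
      std::unordered_set<Node *> Visited;
      // Replacing A with B would embed A inside its own replacement:
      return contains(Visited, &B, &A) ? 0 : 1; // reachable: exits 0
    }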
ValueHandleBase *&Entry = pImpl->ValueHandles[VP.getPointer()]; - assert(Entry != 0 && "Value doesn't have any handles?"); + assert(Entry && "Value doesn't have any handles?"); AddToExistingUseList(&Entry); return; } @@ -571,7 +631,7 @@ void ValueHandleBase::AddToUseList() { const void *OldBucketPtr = Handles.getPointerIntoBucketsArray(); ValueHandleBase *&Entry = Handles[VP.getPointer()]; - assert(Entry == 0 && "Value really did already have handles?"); + assert(!Entry && "Value really did already have handles?"); AddToExistingUseList(&Entry); VP.getPointer()->HasValueHandle = true; @@ -652,7 +712,7 @@ void ValueHandleBase::ValueIsDeleted(Value *V) { break; case Weak: // Weak just goes to null, which will unlink it from the list. - Entry->operator=(0); + Entry->operator=(nullptr); break; case Callback: // Forward to the subclass's implementation. diff --git a/lib/IR/ValueSymbolTable.cpp b/lib/IR/ValueSymbolTable.cpp index fffacb3..e9e979a 100644 --- a/lib/IR/ValueSymbolTable.cpp +++ b/lib/IR/ValueSymbolTable.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "valuesymtab" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/ADT/SmallString.h" #include "llvm/IR/GlobalValue.h" @@ -20,6 +19,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "valuesymtab" + // Class destructor ValueSymbolTable::~ValueSymbolTable() { #ifndef NDEBUG // Only do this in -g mode... @@ -56,7 +57,7 @@ void ValueSymbolTable::reinsertValue(Value* V) { // Try insert the vmap entry with this suffix. ValueName &NewName = vmap.GetOrCreateValue(UniqueName); - if (NewName.getValue() == 0) { + if (!NewName.getValue()) { // Newly inserted name. Success! NewName.setValue(V); V->Name = &NewName; @@ -78,7 +79,7 @@ void ValueSymbolTable::removeValueName(ValueName *V) { ValueName *ValueSymbolTable::createValueName(StringRef Name, Value *V) { // In the common case, the name is not already in the symbol table. ValueName &Entry = vmap.GetOrCreateValue(Name); - if (Entry.getValue() == 0) { + if (!Entry.getValue()) { Entry.setValue(V); //DEBUG(dbgs() << " Inserted value: " << Entry.getKeyData() << ": " // << *V << "\n"); @@ -95,7 +96,7 @@ ValueName *ValueSymbolTable::createValueName(StringRef Name, Value *V) { // Try insert the vmap entry with this suffix. ValueName &NewName = vmap.GetOrCreateValue(UniqueName); - if (NewName.getValue() == 0) { + if (!NewName.getValue()) { // Newly inserted name. Success! NewName.setValue(V); //DEBUG(dbgs() << " Inserted value: " << UniqueName << ": " << *V << "\n"); diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 089ad1c..bcc38c1 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -61,6 +61,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" @@ -76,15 +77,71 @@ #include using namespace llvm; -static cl::opt DisableDebugInfoVerifier("disable-debug-info-verifier", - cl::init(true)); +static cl::opt VerifyDebugInfo("verify-debug-info", cl::init(false)); namespace { -class Verifier : public InstVisitor { - friend class InstVisitor; - +struct VerifierSupport { raw_ostream &OS; const Module *M; + + /// \brief Track the brokenness of the module while recursively visiting. 
+ bool Broken; + + explicit VerifierSupport(raw_ostream &OS) + : OS(OS), M(nullptr), Broken(false) {} + + void WriteValue(const Value *V) { + if (!V) + return; + if (isa(V)) { + OS << *V << '\n'; + } else { + V->printAsOperand(OS, true, M); + OS << '\n'; + } + } + + void WriteType(Type *T) { + if (!T) + return; + OS << ' ' << *T; + } + + // CheckFailed - A check failed, so print out the condition and the message + // that failed. This provides a nice place to put a breakpoint if you want + // to see why something is not correct. + void CheckFailed(const Twine &Message, const Value *V1 = nullptr, + const Value *V2 = nullptr, const Value *V3 = nullptr, + const Value *V4 = nullptr) { + OS << Message.str() << "\n"; + WriteValue(V1); + WriteValue(V2); + WriteValue(V3); + WriteValue(V4); + Broken = true; + } + + void CheckFailed(const Twine &Message, const Value *V1, Type *T2, + const Value *V3 = nullptr) { + OS << Message.str() << "\n"; + WriteValue(V1); + WriteType(T2); + WriteValue(V3); + Broken = true; + } + + void CheckFailed(const Twine &Message, Type *T1, Type *T2 = nullptr, + Type *T3 = nullptr) { + OS << Message.str() << "\n"; + WriteType(T1); + WriteType(T2); + WriteType(T3); + Broken = true; + } +}; +class Verifier : public InstVisitor, VerifierSupport { + friend class InstVisitor; + LLVMContext *Context; const DataLayout *DL; DominatorTree DT; @@ -104,15 +161,10 @@ class Verifier : public InstVisitor { /// personality function. const Value *PersonalityFn; - /// \brief Finder keeps track of all debug info MDNodes in a Module. - DebugInfoFinder Finder; - - /// \brief Track the brokenness of the module while recursively visiting. - bool Broken; - public: explicit Verifier(raw_ostream &OS = dbgs()) - : OS(OS), M(0), Context(0), DL(0), PersonalityFn(0), Broken(false) {} + : VerifierSupport(OS), Context(nullptr), DL(nullptr), + PersonalityFn(nullptr) {} bool verify(const Function &F) { M = F.getParent(); @@ -142,16 +194,11 @@ public: // FIXME: It's really gross that we have to cast away constness here. DT.recalculate(const_cast(F)); - Finder.reset(); Broken = false; // FIXME: We strip const here because the inst visitor strips const. visit(const_cast(F)); InstsInThisBlock.clear(); - PersonalityFn = 0; - - if (!DisableDebugInfoVerifier) - // Verify Debug Info. - verifyDebugInfo(); + PersonalityFn = nullptr; return !Broken; } @@ -159,7 +206,6 @@ public: bool verify(const Module &M) { this->M = &M; Context = &M.getContext(); - Finder.reset(); Broken = false; // Scan through, checking all of the external function's linkage now... @@ -187,13 +233,6 @@ public: visitModuleFlags(M); visitModuleIdents(M); - if (!DisableDebugInfoVerifier) { - Finder.reset(); - Finder.processModule(M); - // Verify Debug Info. 
- verifyDebugInfo(); - } - return !Broken; } @@ -262,6 +301,7 @@ private: void visitLandingPadInst(LandingPadInst &LPI); void VerifyCallSite(CallSite CS); + void verifyMustTailCall(CallInst &CI); bool PerformTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty, int VT, unsigned ArgNo, std::string &Suffix); bool VerifyIntrinsicType(Type *Ty, ArrayRef &Infos, @@ -278,56 +318,21 @@ private: void VerifyBitcastType(const Value *V, Type *DestTy, Type *SrcTy); void VerifyConstantExprBitcastType(const ConstantExpr *CE); +}; +class DebugInfoVerifier : public VerifierSupport { +public: + explicit DebugInfoVerifier(raw_ostream &OS = dbgs()) : VerifierSupport(OS) {} - void verifyDebugInfo(); - - void WriteValue(const Value *V) { - if (!V) - return; - if (isa(V)) { - OS << *V << '\n'; - } else { - V->printAsOperand(OS, true, M); - OS << '\n'; - } - } - - void WriteType(Type *T) { - if (!T) - return; - OS << ' ' << *T; - } - - // CheckFailed - A check failed, so print out the condition and the message - // that failed. This provides a nice place to put a breakpoint if you want - // to see why something is not correct. - void CheckFailed(const Twine &Message, const Value *V1 = 0, - const Value *V2 = 0, const Value *V3 = 0, - const Value *V4 = 0) { - OS << Message.str() << "\n"; - WriteValue(V1); - WriteValue(V2); - WriteValue(V3); - WriteValue(V4); - Broken = true; - } - - void CheckFailed(const Twine &Message, const Value *V1, Type *T2, - const Value *V3 = 0) { - OS << Message.str() << "\n"; - WriteValue(V1); - WriteType(T2); - WriteValue(V3); - Broken = true; + bool verify(const Module &M) { + this->M = &M; + verifyDebugInfo(); + return !Broken; } - void CheckFailed(const Twine &Message, Type *T1, Type *T2 = 0, Type *T3 = 0) { - OS << Message.str() << "\n"; - WriteType(T1); - WriteType(T2); - WriteType(T3); - Broken = true; - } +private: + void verifyDebugInfo(); + void processInstructions(DebugInfoFinder &Finder); + void processCallInst(DebugInfoFinder &Finder, const CallInst &CI); }; } // End anonymous namespace @@ -345,18 +350,14 @@ private: void Verifier::visit(Instruction &I) { for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) - Assert1(I.getOperand(i) != 0, "Operand is null", &I); + Assert1(I.getOperand(i) != nullptr, "Operand is null", &I); InstVisitor::visit(I); } void Verifier::visitGlobalValue(const GlobalValue &GV) { - Assert1(!GV.isDeclaration() || - GV.isMaterializable() || - GV.hasExternalLinkage() || - GV.hasExternalWeakLinkage() || - (isa(GV) && - (GV.hasLocalLinkage() || GV.hasWeakLinkage())), + Assert1(!GV.isDeclaration() || GV.isMaterializable() || + GV.hasExternalLinkage() || GV.hasExternalWeakLinkage(), "Global is external, but doesn't have external or weak linkage!", &GV); @@ -395,14 +396,22 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { "invalid linkage for intrinsic global variable", &GV); // Don't worry about emitting an error for it not being an array, // visitGlobalValue will complain on appending non-array. - if (ArrayType *ATy = dyn_cast(GV.getType())) { + if (ArrayType *ATy = dyn_cast(GV.getType()->getElementType())) { StructType *STy = dyn_cast(ATy->getElementType()); PointerType *FuncPtrTy = FunctionType::get(Type::getVoidTy(*Context), false)->getPointerTo(); - Assert1(STy && STy->getNumElements() == 2 && + // FIXME: Reject the 2-field form in LLVM 4.0. 
+ Assert1(STy && (STy->getNumElements() == 2 || + STy->getNumElements() == 3) && STy->getTypeAtIndex(0u)->isIntegerTy(32) && STy->getTypeAtIndex(1) == FuncPtrTy, "wrong type for intrinsic global variable", &GV); + if (STy->getNumElements() == 3) { + Type *ETy = STy->getTypeAtIndex(2); + Assert1(ETy->isPointerTy() && + cast(ETy)->getElementType()->isIntegerTy(8), + "wrong type for intrinsic global variable", &GV); + } } } @@ -472,11 +481,7 @@ void Verifier::visitGlobalAlias(const GlobalAlias &GA) { "Alias should have external or external weak linkage!", &GA); Assert1(GA.getAliasee(), "Aliasee cannot be NULL!", &GA); - Assert1(GA.getType() == GA.getAliasee()->getType(), - "Alias and aliasee types should match!", &GA); Assert1(!GA.hasUnnamedAddr(), "Alias cannot have unnamed_addr!", &GA); - Assert1(!GA.hasSection(), "Alias cannot have a section!", &GA); - Assert1(!GA.getAlignment(), "Alias connot have an alignment", &GA); const Constant *Aliasee = GA.getAliasee(); const GlobalValue *GV = dyn_cast(Aliasee); @@ -492,14 +497,7 @@ void Verifier::visitGlobalAlias(const GlobalAlias &GA) { "addrspacecast of GlobalValue", &GA); - if (CE->getOpcode() == Instruction::BitCast) { - unsigned SrcAS = GV->getType()->getPointerAddressSpace(); - unsigned DstAS = CE->getType()->getPointerAddressSpace(); - - Assert1(SrcAS == DstAS, - "Alias bitcasts cannot be between different address spaces", - &GA); - } + VerifyConstantExprBitcastType(CE); } Assert1(!GV->isDeclaration(), "Alias must point to a definition", &GA); if (const GlobalAlias *GAAliasee = dyn_cast(GV)) { @@ -507,10 +505,6 @@ void Verifier::visitGlobalAlias(const GlobalAlias &GA) { &GA); } - const GlobalValue *AG = GA.getAliasedGlobal(); - Assert1(AG, "Aliasing chain should end with function or global variable", - &GA); - visitGlobalValue(GA); } @@ -522,7 +516,7 @@ void Verifier::visitNamedMDNode(const NamedMDNode &NMD) { Assert1(!MD->isFunctionLocal(), "Named metadata operand cannot be function local!", MD); - visitMDNode(*MD, 0); + visitMDNode(*MD, nullptr); } } @@ -548,7 +542,7 @@ void Verifier::visitMDNode(MDNode &MD, Function *F) { // If this was an instruction, bb, or argument, verify that it is in the // function that we expect. - Function *ActualF = 0; + Function *ActualF = nullptr; if (Instruction *I = dyn_cast(Op)) ActualF = I->getParent()->getParent(); else if (BasicBlock *BB = dyn_cast(Op)) @@ -821,6 +815,7 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs, bool SawNest = false; bool SawReturned = false; + bool SawSRet = false; for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) { unsigned Idx = Attrs.getSlotIndex(i); @@ -851,8 +846,12 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs, SawReturned = true; } - if (Attrs.hasAttribute(Idx, Attribute::StructRet)) - Assert1(Idx == 1, "Attribute sret is not on first parameter!", V); + if (Attrs.hasAttribute(Idx, Attribute::StructRet)) { + Assert1(!SawSRet, "Cannot have multiple 'sret' parameters!", V); + Assert1(Idx == 1 || Idx == 2, + "Attribute 'sret' is not on first or second parameter!", V); + SawSRet = true; + } if (Attrs.hasAttribute(Idx, Attribute::InAlloca)) { Assert1(Idx == FT->getNumParams(), @@ -1489,6 +1488,16 @@ void Verifier::VerifyCallSite(CallSite CS) { // Verify call attributes. VerifyFunctionAttrs(FTy, Attrs, I); + // Conservatively check the inalloca argument. + // We have a bug if we can find that there is an underlying alloca without + // inalloca. 
+ if (CS.hasInAllocaArgument()) { + Value *InAllocaArg = CS.getArgument(FTy->getNumParams() - 1); + if (auto AI = dyn_cast(InAllocaArg->stripInBoundsOffsets())) + Assert2(AI->isUsedWithInAlloca(), + "inalloca argument for call has mismatched alloca", AI, I); + } + if (FTy->isVarArg()) { // FIXME? is 'nest' even legal here? bool SawNest = false; @@ -1530,7 +1539,7 @@ void Verifier::VerifyCallSite(CallSite CS) { } // Verify that there's no metadata unless it's a direct call to an intrinsic. - if (CS.getCalledFunction() == 0 || + if (CS.getCalledFunction() == nullptr || !CS.getCalledFunction()->getName().startswith("llvm.")) { for (FunctionType::param_iterator PI = FTy->param_begin(), PE = FTy->param_end(); PI != PE; ++PI) @@ -1541,9 +1550,102 @@ void Verifier::VerifyCallSite(CallSite CS) { visitInstruction(*I); } +/// Two types are "congruent" if they are identical, or if they are both pointer +/// types with different pointee types and the same address space. +static bool isTypeCongruent(Type *L, Type *R) { + if (L == R) + return true; + PointerType *PL = dyn_cast(L); + PointerType *PR = dyn_cast(R); + if (!PL || !PR) + return false; + return PL->getAddressSpace() == PR->getAddressSpace(); +} + +static AttrBuilder getParameterABIAttributes(int I, AttributeSet Attrs) { + static const Attribute::AttrKind ABIAttrs[] = { + Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca, + Attribute::InReg, Attribute::Returned}; + AttrBuilder Copy; + for (auto AK : ABIAttrs) { + if (Attrs.hasAttribute(I + 1, AK)) + Copy.addAttribute(AK); + } + if (Attrs.hasAttribute(I + 1, Attribute::Alignment)) + Copy.addAlignmentAttr(Attrs.getParamAlignment(I + 1)); + return Copy; +} + +void Verifier::verifyMustTailCall(CallInst &CI) { + Assert1(!CI.isInlineAsm(), "cannot use musttail call with inline asm", &CI); + + // - The caller and callee prototypes must match. Pointer types of + // parameters or return types may differ in pointee type, but not + // address space. + Function *F = CI.getParent()->getParent(); + auto GetFnTy = [](Value *V) { + return cast( + cast(V->getType())->getElementType()); + }; + FunctionType *CallerTy = GetFnTy(F); + FunctionType *CalleeTy = GetFnTy(CI.getCalledValue()); + Assert1(CallerTy->getNumParams() == CalleeTy->getNumParams(), + "cannot guarantee tail call due to mismatched parameter counts", &CI); + Assert1(CallerTy->isVarArg() == CalleeTy->isVarArg(), + "cannot guarantee tail call due to mismatched varargs", &CI); + Assert1(isTypeCongruent(CallerTy->getReturnType(), CalleeTy->getReturnType()), + "cannot guarantee tail call due to mismatched return types", &CI); + for (int I = 0, E = CallerTy->getNumParams(); I != E; ++I) { + Assert1( + isTypeCongruent(CallerTy->getParamType(I), CalleeTy->getParamType(I)), + "cannot guarantee tail call due to mismatched parameter types", &CI); + } + + // - The calling conventions of the caller and callee must match. + Assert1(F->getCallingConv() == CI.getCallingConv(), + "cannot guarantee tail call due to mismatched calling conv", &CI); + + // - All ABI-impacting function attributes, such as sret, byval, inreg, + // returned, and inalloca, must match. 
+ AttributeSet CallerAttrs = F->getAttributes(); + AttributeSet CalleeAttrs = CI.getAttributes(); + for (int I = 0, E = CallerTy->getNumParams(); I != E; ++I) { + AttrBuilder CallerABIAttrs = getParameterABIAttributes(I, CallerAttrs); + AttrBuilder CalleeABIAttrs = getParameterABIAttributes(I, CalleeAttrs); + Assert2(CallerABIAttrs == CalleeABIAttrs, + "cannot guarantee tail call due to mismatched ABI impacting " + "function attributes", &CI, CI.getOperand(I)); + } + + // - The call must immediately precede a :ref:`ret ` instruction, + // or a pointer bitcast followed by a ret instruction. + // - The ret instruction must return the (possibly bitcasted) value + // produced by the call or void. + Value *RetVal = &CI; + Instruction *Next = CI.getNextNode(); + + // Handle the optional bitcast. + if (BitCastInst *BI = dyn_cast_or_null(Next)) { + Assert1(BI->getOperand(0) == RetVal, + "bitcast following musttail call must use the call", BI); + RetVal = BI; + Next = BI->getNextNode(); + } + + // Check the return. + ReturnInst *Ret = dyn_cast_or_null(Next); + Assert1(Ret, "musttail call must precede a ret with an optional bitcast", + &CI); + Assert1(!Ret->getReturnValue() || Ret->getReturnValue() == RetVal, + "musttail call result must be returned", Ret); +} + void Verifier::visitCallInst(CallInst &CI) { VerifyCallSite(&CI); + if (CI.isMustTailCall()) + verifyMustTailCall(CI); + if (Function *F = CI.getCalledFunction()) if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID()) visitIntrinsicFunctionCall(ID, CI); @@ -1731,11 +1833,11 @@ void Verifier::visitLoadInst(LoadInst &LI) { "Atomic load must specify explicit alignment", &LI); if (!ElTy->isPointerTy()) { Assert2(ElTy->isIntegerTy(), - "atomic store operand must have integer type!", + "atomic load operand must have integer type!", &LI, ElTy); unsigned Size = ElTy->getPrimitiveSizeInBits(); Assert2(Size >= 8 && !(Size & (Size - 1)), - "atomic store operand must be power-of-two byte-sized integer", + "atomic load operand must be power-of-two byte-sized integer", &LI, ElTy); } } else { @@ -2020,8 +2122,8 @@ void Verifier::visitInstruction(Instruction &I) { // instruction, it is an error! for (Use &U : I.uses()) { if (Instruction *Used = dyn_cast(U.getUser())) - Assert2(Used->getParent() != 0, "Instruction referencing instruction not" - " embedded in a basic block!", &I, Used); + Assert2(Used->getParent() != nullptr, "Instruction referencing" - " instruction not embedded in a basic block!", &I, Used); else { CheckFailed("Use of instruction is not an instruction!", U); return; @@ -2029,7 +2131,7 @@ void Verifier::visitInstruction(Instruction &I) { } for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) { - Assert1(I.getOperand(i) != 0, "Instruction has null operand!", &I); + Assert1(I.getOperand(i) != nullptr, "Instruction has null operand!", &I); // Check to make sure that only first-class-values are operands to // instructions.
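verifyMustTailCall above checks everything needed to guarantee that the tail call can actually be emitted: prototypes that match up to pointee type, identical calling conventions and ABI-affecting attributes, and a ret (optionally through a bitcast) immediately after the call. The pointer-congruence rule is the subtle piece; below is a compile-and-run sketch of it over a deliberately tiny type model (real LLVM types are uniqued objects, so the real check can start with a pointer comparison):

    #include <cassert>

    // Tiny type model: scalars plus pointers-with-address-space. Pointee
    // types are not modeled because the congruence rule ignores them.
    struct Ty {
      enum Kind { Int, Float, Ptr } K;
      unsigned AddrSpace; // meaningful only when K == Ptr
    };

    // Mirrors the shape of the verifier's isTypeCongruent(): identical kinds
    // match, and two pointers match iff their address spaces do.
    static bool isTypeCongruent(const Ty &L, const Ty &R) {
      if (L.K != R.K)
        return false;
      if (L.K != Ty::Ptr)
        return true;
      return L.AddrSpace == R.AddrSpace;
    }

    int main() {
      Ty P0{Ty::Ptr, 0}, Q0{Ty::Ptr, 0}, P1{Ty::Ptr, 1};
      assert(isTypeCongruent(P0, Q0));  // i8* vs i32* in AS0: congruent
      assert(!isTypeCongruent(P0, P1)); // AS0 vs AS1: must not tail-call
      assert(isTypeCongruent(Ty{Ty::Int, 0}, Ty{Ty::Int, 0}));
    }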
@@ -2103,11 +2205,6 @@ void Verifier::visitInstruction(Instruction &I) { MDNode *MD = I.getMetadata(LLVMContext::MD_range); Assert1(!MD || isa(I), "Ranges are only for loads!", &I); - if (!DisableDebugInfoVerifier) { - MD = I.getMetadata(LLVMContext::MD_dbg); - Finder.processLocation(*M, DILocation(MD)); - } - InstsInThisBlock.insert(&I); } @@ -2137,18 +2234,18 @@ bool Verifier::VerifyIntrinsicType(Type *Ty, case IITDescriptor::Integer: return !Ty->isIntegerTy(D.Integer_Width); case IITDescriptor::Vector: { VectorType *VT = dyn_cast(Ty); - return VT == 0 || VT->getNumElements() != D.Vector_Width || + return !VT || VT->getNumElements() != D.Vector_Width || VerifyIntrinsicType(VT->getElementType(), Infos, ArgTys); } case IITDescriptor::Pointer: { PointerType *PT = dyn_cast(Ty); - return PT == 0 || PT->getAddressSpace() != D.Pointer_AddressSpace || + return !PT || PT->getAddressSpace() != D.Pointer_AddressSpace || VerifyIntrinsicType(PT->getElementType(), Infos, ArgTys); } case IITDescriptor::Struct: { StructType *ST = dyn_cast(Ty); - if (ST == 0 || ST->getNumElements() != D.Struct_NumElements) + if (!ST || ST->getNumElements() != D.Struct_NumElements) return true; for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i) @@ -2307,17 +2404,7 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) { MDNode *MD = cast(CI.getArgOperand(0)); Assert1(MD->getNumOperands() == 1, "invalid llvm.dbg.declare intrinsic call 2", &CI); - if (!DisableDebugInfoVerifier) - Finder.processDeclare(*M, cast(&CI)); } break; - case Intrinsic::dbg_value: { //llvm.dbg.value - if (!DisableDebugInfoVerifier) { - Assert1(CI.getArgOperand(0) && isa(CI.getArgOperand(0)), - "invalid llvm.dbg.value intrinsic call 1", &CI); - Finder.processValue(*M, cast(&CI)); - } - break; - } case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: @@ -2379,25 +2466,58 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) { } } -void Verifier::verifyDebugInfo() { +void DebugInfoVerifier::verifyDebugInfo() { + if (!VerifyDebugInfo) + return; + + DebugInfoFinder Finder; + Finder.processModule(*M); + processInstructions(Finder); + // Verify Debug Info. - if (!DisableDebugInfoVerifier) { - for (DICompileUnit CU : Finder.compile_units()) { - Assert1(CU.Verify(), "DICompileUnit does not Verify!", CU); - } - for (DISubprogram S : Finder.subprograms()) { - Assert1(S.Verify(), "DISubprogram does not Verify!", S); - } - for (DIGlobalVariable GV : Finder.global_variables()) { - Assert1(GV.Verify(), "DIGlobalVariable does not Verify!", GV); - } - for (DIType T : Finder.types()) { - Assert1(T.Verify(), "DIType does not Verify!", T); - } - for (DIScope S : Finder.scopes()) { - Assert1(S.Verify(), "DIScope does not Verify!", S); - } + // + // NOTE: The loud braces are necessary for MSVC compatibility. 
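Taken together, these Verifier.cpp hunks split debug-info checking into its own DebugInfoVerifier and hoist the shared plumbing (output stream, current module, Broken flag, the CheckFailed overloads) into a VerifierSupport base that both verifiers inherit. A skeletal model of that layout, with placeholder check bodies:

    #include <iostream>
    #include <string>

    struct Module {}; // opaque stand-in for llvm::Module

    // Shared diagnostic state: concrete verifiers report through
    // CheckFailed, which records brokenness instead of aborting, so one run
    // can accumulate several complaints before verify() returns false.
    struct VerifierSupport {
      std::ostream &OS;
      const Module *M = nullptr;
      bool Broken = false;

      explicit VerifierSupport(std::ostream &OS) : OS(OS) {}

      void CheckFailed(const std::string &Message) {
        OS << Message << "\n";
        Broken = true;
      }
    };

    struct IRVerifier : VerifierSupport {
      using VerifierSupport::VerifierSupport;
      bool verify(const Module &Mod) {
        M = &Mod;
        // ... structural IR checks would run here, calling CheckFailed() ...
        return !Broken;
      }
    };

    struct DebugInfoVerifier : VerifierSupport {
      using VerifierSupport::VerifierSupport;
      bool verify(const Module &Mod) {
        M = &Mod;
        // ... DICompileUnit/DISubprogram/... checks would run here ...
        return !Broken;
      }
    };

    int main() {
      Module M;
      IRVerifier V(std::cerr);
      DebugInfoVerifier DIV(std::cerr);
      // verifyModule() combines both results, inverted (true means broken):
      return (!V.verify(M) || !DIV.verify(M)) ? 1 : 0;
    }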
+ for (DICompileUnit CU : Finder.compile_units()) { + Assert1(CU.Verify(), "DICompileUnit does not Verify!", CU); } + for (DISubprogram S : Finder.subprograms()) { + Assert1(S.Verify(), "DISubprogram does not Verify!", S); + } + for (DIGlobalVariable GV : Finder.global_variables()) { + Assert1(GV.Verify(), "DIGlobalVariable does not Verify!", GV); + } + for (DIType T : Finder.types()) { + Assert1(T.Verify(), "DIType does not Verify!", T); + } + for (DIScope S : Finder.scopes()) { + Assert1(S.Verify(), "DIScope does not Verify!", S); + } +} + +void DebugInfoVerifier::processInstructions(DebugInfoFinder &Finder) { + for (const Function &F : *M) + for (auto I = inst_begin(&F), E = inst_end(&F); I != E; ++I) { + if (MDNode *MD = I->getMetadata(LLVMContext::MD_dbg)) + Finder.processLocation(*M, DILocation(MD)); + if (const CallInst *CI = dyn_cast(&*I)) + processCallInst(Finder, *CI); + } +} + +void DebugInfoVerifier::processCallInst(DebugInfoFinder &Finder, + const CallInst &CI) { + if (Function *F = CI.getCalledFunction()) + if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID()) + switch (ID) { + case Intrinsic::dbg_declare: + Finder.processDeclare(*M, cast(&CI)); + break; + case Intrinsic::dbg_value: + Finder.processValue(*M, cast(&CI)); + break; + default: + break; + } } //===----------------------------------------------------------------------===// @@ -2427,7 +2547,8 @@ bool llvm::verifyModule(const Module &M, raw_ostream *OS) { // Note that this function's return value is inverted from what you would // expect of a function called "verify". - return !V.verify(M) || Broken; + DebugInfoVerifier DIV(OS ? *OS : NullStr); + return !V.verify(M) || !DIV.verify(M) || Broken; } namespace { @@ -2463,15 +2584,48 @@ struct VerifierLegacyPass : public FunctionPass { AU.setPreservesAll(); } }; +struct DebugInfoVerifierLegacyPass : public ModulePass { + static char ID; + + DebugInfoVerifier V; + bool FatalErrors; + + DebugInfoVerifierLegacyPass() : ModulePass(ID), FatalErrors(true) { + initializeDebugInfoVerifierLegacyPassPass(*PassRegistry::getPassRegistry()); + } + explicit DebugInfoVerifierLegacyPass(bool FatalErrors) + : ModulePass(ID), V(dbgs()), FatalErrors(FatalErrors) { + initializeDebugInfoVerifierLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + if (!V.verify(M) && FatalErrors) + report_fatal_error("Broken debug info found, compilation aborted!"); + + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } +}; } char VerifierLegacyPass::ID = 0; INITIALIZE_PASS(VerifierLegacyPass, "verify", "Module Verifier", false, false) +char DebugInfoVerifierLegacyPass::ID = 0; +INITIALIZE_PASS(DebugInfoVerifierLegacyPass, "verify-di", "Debug Info Verifier", + false, false) + FunctionPass *llvm::createVerifierPass(bool FatalErrors) { return new VerifierLegacyPass(FatalErrors); } +ModulePass *llvm::createDebugInfoVerifierPass(bool FatalErrors) { + return new DebugInfoVerifierLegacyPass(FatalErrors); +} + PreservedAnalyses VerifierPass::run(Module *M) { if (verifyModule(*M, &dbgs()) && FatalErrors) report_fatal_error("Broken module found, compilation aborted!"); diff --git a/lib/IR/module.modulemap b/lib/IR/module.modulemap new file mode 100644 index 0000000..9698e91 --- /dev/null +++ b/lib/IR/module.modulemap @@ -0,0 +1 @@ +module IR { requires cplusplus umbrella "." 
module * { export * } } diff --git a/lib/IRReader/IRReader.cpp b/lib/IRReader/IRReader.cpp index 8be8ab8..f4ed437 100644 --- a/lib/IRReader/IRReader.cpp +++ b/lib/IRReader/IRReader.cpp @@ -42,12 +42,12 @@ Module *llvm::getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err, // ParseBitcodeFile does not take ownership of the Buffer in the // case of an error. delete Buffer; - return NULL; + return nullptr; } return ModuleOrErr.get(); } - return ParseAssembly(Buffer, 0, Err, Context); + return ParseAssembly(Buffer, nullptr, Err, Context); } Module *llvm::getLazyIRFileModule(const std::string &Filename, SMDiagnostic &Err, @@ -56,7 +56,7 @@ Module *llvm::getLazyIRFileModule(const std::string &Filename, SMDiagnostic &Err if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) { Err = SMDiagnostic(Filename, SourceMgr::DK_Error, "Could not open input file: " + ec.message()); - return 0; + return nullptr; } return getLazyIRModule(File.release(), Err, Context); @@ -69,7 +69,7 @@ Module *llvm::ParseIR(MemoryBuffer *Buffer, SMDiagnostic &Err, if (isBitcode((const unsigned char *)Buffer->getBufferStart(), (const unsigned char *)Buffer->getBufferEnd())) { ErrorOr ModuleOrErr = parseBitcodeFile(Buffer, Context); - Module *M = 0; + Module *M = nullptr; if (error_code EC = ModuleOrErr.getError()) Err = SMDiagnostic(Buffer->getBufferIdentifier(), SourceMgr::DK_Error, EC.message()); @@ -80,7 +80,7 @@ Module *llvm::ParseIR(MemoryBuffer *Buffer, SMDiagnostic &Err, return M; } - return ParseAssembly(Buffer, 0, Err, Context); + return ParseAssembly(Buffer, nullptr, Err, Context); } Module *llvm::ParseIRFile(const std::string &Filename, SMDiagnostic &Err, @@ -89,7 +89,7 @@ Module *llvm::ParseIRFile(const std::string &Filename, SMDiagnostic &Err, if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) { Err = SMDiagnostic(Filename, SourceMgr::DK_Error, "Could not open input file: " + ec.message()); - return 0; + return nullptr; } return ParseIR(File.release(), Err, Context); @@ -111,7 +111,7 @@ LLVMBool LLVMParseIRInContext(LLVMContextRef ContextRef, std::string buf; raw_string_ostream os(buf); - Diag.print(NULL, os, false); + Diag.print(nullptr, os, false); os.flush(); *OutMessage = strdup(buf.c_str()); diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp index 51d0899..99236bd 100644 --- a/lib/LTO/LTOCodeGenerator.cpp +++ b/lib/LTO/LTOCodeGenerator.cpp @@ -63,20 +63,20 @@ const char* LTOCodeGenerator::getVersionString() { } LTOCodeGenerator::LTOCodeGenerator() - : Context(getGlobalContext()), Linker(new Module("ld-temp.o", Context)), - TargetMach(NULL), EmitDwarfDebugInfo(false), ScopeRestrictionsDone(false), - CodeModel(LTO_CODEGEN_PIC_MODEL_DYNAMIC), NativeObjectFile(NULL), - DiagHandler(NULL), DiagContext(NULL) { + : Context(getGlobalContext()), IRLinker(new Module("ld-temp.o", Context)), + TargetMach(nullptr), EmitDwarfDebugInfo(false), + ScopeRestrictionsDone(false), CodeModel(LTO_CODEGEN_PIC_MODEL_DEFAULT), + NativeObjectFile(nullptr), DiagHandler(nullptr), DiagContext(nullptr) { initializeLTOPasses(); } LTOCodeGenerator::~LTOCodeGenerator() { delete TargetMach; delete NativeObjectFile; - TargetMach = NULL; - NativeObjectFile = NULL; + TargetMach = nullptr; + NativeObjectFile = nullptr; - Linker.deleteModule(); + IRLinker.deleteModule(); for (std::vector::iterator I = CodegenOptions.begin(), E = CodegenOptions.end(); @@ -114,7 +114,7 @@ void LTOCodeGenerator::initializeLTOPasses() { } bool LTOCodeGenerator::addModule(LTOModule* mod, std::string& errMsg) { - bool 
ret = Linker.linkInModule(mod->getLLVVMModule(), &errMsg); + bool ret = IRLinker.linkInModule(mod->getLLVVMModule(), &errMsg); const std::vector &undefs = mod->getAsmUndefinedRefs(); for (int i = 0, e = undefs.size(); i != e; ++i) @@ -140,7 +140,6 @@ void LTOCodeGenerator::setTargetOptions(TargetOptions options) { Options.StackAlignmentOverride = options.StackAlignmentOverride; Options.TrapFuncName = options.TrapFuncName; Options.PositionIndependentExecutable = options.PositionIndependentExecutable; - Options.EnableSegmentedStacks = options.EnableSegmentedStacks; Options.UseInitArray = options.UseInitArray; } @@ -162,6 +161,7 @@ void LTOCodeGenerator::setCodePICModel(lto_codegen_model model) { case LTO_CODEGEN_PIC_MODEL_STATIC: case LTO_CODEGEN_PIC_MODEL_DYNAMIC: case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC: + case LTO_CODEGEN_PIC_MODEL_DEFAULT: CodeModel = model; return; } @@ -186,7 +186,7 @@ bool LTOCodeGenerator::writeMergedModules(const char *path, } // write bitcode to it - WriteBitcodeToFile(Linker.getModule(), Out.os()); + WriteBitcodeToFile(IRLinker.getModule(), Out.os()); Out.os().close(); if (Out.os().has_error()) { @@ -245,7 +245,7 @@ const void* LTOCodeGenerator::compile(size_t* length, const char *name; if (!compile_to_file(&name, disableOpt, disableInline, disableGVNLoadPRE, errMsg)) - return NULL; + return nullptr; // remove old buffer if compile() called twice delete NativeObjectFile; @@ -255,7 +255,7 @@ const void* LTOCodeGenerator::compile(size_t* length, if (error_code ec = MemoryBuffer::getFile(name, BuffPtr, -1, false)) { errMsg = ec.message(); sys::fs::remove(NativeObjectPath); - return NULL; + return nullptr; } NativeObjectFile = BuffPtr.release(); @@ -263,24 +263,24 @@ const void* LTOCodeGenerator::compile(size_t* length, sys::fs::remove(NativeObjectPath); // return buffer, unless error - if (NativeObjectFile == NULL) - return NULL; + if (!NativeObjectFile) + return nullptr; *length = NativeObjectFile->getBufferSize(); return NativeObjectFile->getBufferStart(); } bool LTOCodeGenerator::determineTarget(std::string &errMsg) { - if (TargetMach != NULL) + if (TargetMach) return true; - std::string TripleStr = Linker.getModule()->getTargetTriple(); + std::string TripleStr = IRLinker.getModule()->getTargetTriple(); if (TripleStr.empty()) TripleStr = sys::getDefaultTargetTriple(); llvm::Triple Triple(TripleStr); // create target machine from info for merged modules const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg); - if (march == NULL) + if (!march) return false; // The relocation model is actually a static member of TargetMachine and @@ -296,10 +296,14 @@ bool LTOCodeGenerator::determineTarget(std::string &errMsg) { case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC: RelocModel = Reloc::DynamicNoPIC; break; + case LTO_CODEGEN_PIC_MODEL_DEFAULT: + // RelocModel is already the default, so leave it that way. + break; } - // construct LTOModule, hand over ownership of module and target - SubtargetFeatures Features; + // Construct LTOModule, hand over ownership of module and target. Use MAttr as + // the default set of features. + SubtargetFeatures Features(MAttr); Features.getDefaultSubtargetFeatures(Triple); std::string FeatureStr = Features.getString(); // Set a default CPU for Darwin triples. 
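The new LTO_CODEGEN_PIC_MODEL_DEFAULT case above differs from the other three: it deliberately leaves the relocation model at whatever the target chose instead of forcing one. The shape of that switch, reduced to a runnable example with stand-in enum names:

    #include <cassert>

    enum PicModel { PIC_STATIC, PIC_DYNAMIC, PIC_DYNAMIC_NO_PIC, PIC_DEFAULT };
    enum RelocModel { Reloc_Default, Reloc_Static, Reloc_PIC, Reloc_DynamicNoPIC };

    RelocModel pickRelocModel(PicModel CodeModel) {
      RelocModel R = Reloc_Default; // the target's own default
      switch (CodeModel) {
      case PIC_STATIC:         R = Reloc_Static;       break;
      case PIC_DYNAMIC:        R = Reloc_PIC;          break;
      case PIC_DYNAMIC_NO_PIC: R = Reloc_DynamicNoPIC; break;
      case PIC_DEFAULT:
        // New case: leave R alone so the backend's default wins.
        break;
      }
      return R;
    }

    int main() {
      assert(pickRelocModel(PIC_DEFAULT) == Reloc_Default);
      assert(pickRelocModel(PIC_DYNAMIC) == Reloc_PIC);
    }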
@@ -308,7 +312,8 @@ bool LTOCodeGenerator::determineTarget(std::string &errMsg) { MCpu = "core2"; else if (Triple.getArch() == llvm::Triple::x86) MCpu = "yonah"; - else if (Triple.getArch() == llvm::Triple::arm64) + else if (Triple.getArch() == llvm::Triple::arm64 || + Triple.getArch() == llvm::Triple::aarch64) MCpu = "cyclone"; } @@ -352,7 +357,7 @@ applyRestriction(GlobalValue &GV, static void findUsedValues(GlobalVariable *LLVMUsed, SmallPtrSet &UsedValues) { - if (LLVMUsed == 0) return; + if (!LLVMUsed) return; ConstantArray *Inits = cast(LLVMUsed->getInitializer()); for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) @@ -391,11 +396,12 @@ static void accumulateAndSortLibcalls(std::vector &Libcalls, void LTOCodeGenerator::applyScopeRestrictions() { if (ScopeRestrictionsDone) return; - Module *mergedModule = Linker.getModule(); + Module *mergedModule = IRLinker.getModule(); // Start off with a verification pass. PassManager passes; passes.add(createVerifierPass()); + passes.add(createDebugInfoVerifierPass()); // mark which symbols can not be internalized Mangler Mangler(TargetMach->getDataLayout()); @@ -424,9 +430,7 @@ void LTOCodeGenerator::applyScopeRestrictions() { if (!AsmUsed.empty()) { llvm::Type *i8PTy = llvm::Type::getInt8PtrTy(Context); std::vector asmUsed2; - for (SmallPtrSet::const_iterator i = AsmUsed.begin(), - e = AsmUsed.end(); i !=e; ++i) { - GlobalValue *GV = *i; + for (auto *GV : AsmUsed) { Constant *c = ConstantExpr::getBitCast(GV, i8PTy); asmUsed2.push_back(c); } @@ -458,7 +462,7 @@ bool LTOCodeGenerator::generateObjectFile(raw_ostream &out, if (!this->determineTarget(errMsg)) return false; - Module *mergedModule = Linker.getModule(); + Module *mergedModule = IRLinker.getModule(); // Mark which symbols can not be internalized this->applyScopeRestrictions(); @@ -468,6 +472,7 @@ bool LTOCodeGenerator::generateObjectFile(raw_ostream &out, // Start off with a verification pass. passes.add(createVerifierPass()); + passes.add(createDebugInfoVerifierPass()); // Add an appropriate DataLayout instance for this module... mergedModule->setDataLayout(TargetMach->getDataLayout()); @@ -489,6 +494,7 @@ bool LTOCodeGenerator::generateObjectFile(raw_ostream &out, // Make sure everything is still good. passes.add(createVerifierPass()); + passes.add(createDebugInfoVerifierPass()); PassManager codeGenPasses; @@ -576,7 +582,7 @@ LTOCodeGenerator::setDiagnosticHandler(lto_diagnostic_handler_t DiagHandler, this->DiagHandler = DiagHandler; this->DiagContext = Ctxt; if (!DiagHandler) - return Context.setDiagnosticHandler(NULL, NULL); + return Context.setDiagnosticHandler(nullptr, nullptr); // Register the LTOCodeGenerator stub in the LLVMContext to forward the // diagnostic to the external DiagHandler. 
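As the comment above describes, setDiagnosticHandler follows the usual C-API callback pattern: stash the client's function pointer plus an opaque context, then register one static trampoline that forwards to them. A self-contained sketch of the trampoline idiom, with invented names (the real signatures live in llvm-c/lto.h):

    #include <cstdio>

    typedef void (*diag_handler_t)(const char *Msg, void *Ctxt);

    class CodeGen {
      diag_handler_t Handler = nullptr;
      void *Context = nullptr;

      // Static trampoline: called with the void* handed over at
      // registration time; recover the CodeGen and forward to the client.
      static void Trampoline(const char *Msg, void *This) {
        CodeGen *CG = static_cast<CodeGen *>(This);
        if (CG->Handler)
          CG->Handler(Msg, CG->Context);
      }

    public:
      void setDiagnosticHandler(diag_handler_t H, void *Ctxt) {
        Handler = H;
        Context = Ctxt;
        // Real code registers (Trampoline, this) with the LLVMContext here,
        // or (nullptr, nullptr) when H is null to unregister.
      }

      void emitDiag(const char *Msg) { Trampoline(Msg, this); } // simulate one
    };

    int main() {
      CodeGen CG;
      CG.setDiagnosticHandler(
          [](const char *Msg, void *) { std::printf("diag: %s\n", Msg); },
          nullptr);
      CG.emitDiag("hello");
    }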
Context.setDiagnosticHandler(LTOCodeGenerator::DiagnosticHandler, this); diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp index 567da04..d117514 100644 --- a/lib/LTO/LTOModule.cpp +++ b/lib/LTO/LTOModule.cpp @@ -100,7 +100,7 @@ LTOModule *LTOModule::makeLTOModule(const char *path, TargetOptions options, std::unique_ptr buffer; if (error_code ec = MemoryBuffer::getFile(path, buffer)) { errMsg = ec.message(); - return NULL; + return nullptr; } return makeLTOModule(buffer.release(), options, errMsg); } @@ -120,7 +120,7 @@ LTOModule *LTOModule::makeLTOModule(int fd, const char *path, if (error_code ec = MemoryBuffer::getOpenFileSlice(fd, path, buffer, map_size, offset)) { errMsg = ec.message(); - return NULL; + return nullptr; } return makeLTOModule(buffer.release(), options, errMsg); } @@ -130,7 +130,7 @@ LTOModule *LTOModule::makeLTOModule(const void *mem, size_t length, std::string &errMsg, StringRef path) { std::unique_ptr buffer(makeBuffer(mem, length, path)); if (!buffer) - return NULL; + return nullptr; return makeLTOModule(buffer.release(), options, errMsg); } @@ -143,7 +143,7 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer, if (error_code EC = ModuleOrErr.getError()) { errMsg = EC.message(); delete buffer; - return NULL; + return nullptr; } std::unique_ptr m(ModuleOrErr.get()); @@ -155,7 +155,7 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer, // find machine architecture for this module const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg); if (!march) - return NULL; + return nullptr; // construct LTOModule, hand over ownership of module and target SubtargetFeatures Features; @@ -168,7 +168,8 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer, CPU = "core2"; else if (Triple.getArch() == llvm::Triple::x86) CPU = "yonah"; - else if (Triple.getArch() == llvm::Triple::arm64) + else if (Triple.getArch() == llvm::Triple::arm64 || + Triple.getArch() == llvm::Triple::aarch64) CPU = "cyclone"; } @@ -189,7 +190,7 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer, if (Ret->parseSymbols(errMsg)) { delete Ret; - return NULL; + return nullptr; } Ret->parseMetadata(); @@ -396,7 +397,7 @@ void LTOModule::addDefinedSymbol(const GlobalValue *def, bool isFunction) { // set alignment part log2() can have rounding errors uint32_t align = def->getAlignment(); - uint32_t attr = align ? countTrailingZeros(def->getAlignment()) : 0; + uint32_t attr = align ? countTrailingZeros(align) : 0; // set permissions part if (isFunction) { @@ -418,17 +419,17 @@ void LTOModule::addDefinedSymbol(const GlobalValue *def, bool isFunction) { attr |= LTO_SYMBOL_DEFINITION_REGULAR; // set scope part - if (def->hasHiddenVisibility()) + if (def->hasLocalLinkage()) + // Ignore visibility if linkage is local. 
+ attr |= LTO_SYMBOL_SCOPE_INTERNAL; + else if (def->hasHiddenVisibility()) attr |= LTO_SYMBOL_SCOPE_HIDDEN; else if (def->hasProtectedVisibility()) attr |= LTO_SYMBOL_SCOPE_PROTECTED; else if (canBeHidden(def)) attr |= LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN; - else if (def->hasExternalLinkage() || def->hasWeakLinkage() || - def->hasLinkOnceLinkage() || def->hasCommonLinkage()) - attr |= LTO_SYMBOL_SCOPE_DEFAULT; else - attr |= LTO_SYMBOL_SCOPE_INTERNAL; + attr |= LTO_SYMBOL_SCOPE_DEFAULT; StringSet::value_type &entry = _defines.GetOrCreateValue(Buffer); entry.setValue(1); @@ -460,7 +461,7 @@ void LTOModule::addAsmGlobalSymbol(const char *name, NameAndAttributes &info = _undefines[entry.getKey().data()]; - if (info.symbol == 0) { + if (info.symbol == nullptr) { // FIXME: This is trying to take care of module ASM like this: // // module asm ".zerofill __FOO, __foo, _bar_baz_qux, 0" @@ -474,7 +475,7 @@ void LTOModule::addAsmGlobalSymbol(const char *name, info.attributes = LTO_SYMBOL_PERMISSIONS_DATA | LTO_SYMBOL_DEFINITION_REGULAR | scope; info.isFunction = false; - info.symbol = 0; + info.symbol = nullptr; // add to table of symbols _symbols.push_back(info); @@ -502,13 +503,13 @@ void LTOModule::addAsmGlobalSymbolUndef(const char *name) { if (entry.getValue().name) return; - uint32_t attr = LTO_SYMBOL_DEFINITION_UNDEFINED;; + uint32_t attr = LTO_SYMBOL_DEFINITION_UNDEFINED; attr |= LTO_SYMBOL_SCOPE_DEFAULT; NameAndAttributes info; info.name = entry.getKey().data(); info.attributes = attr; info.isFunction = false; - info.symbol = 0; + info.symbol = nullptr; entry.setValue(info); } @@ -698,7 +699,8 @@ namespace { void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) override {} void EmitBytes(StringRef Data) override {} - void EmitValueImpl(const MCExpr *Value, unsigned Size) override {} + void EmitValueImpl(const MCExpr *Value, unsigned Size, + const SMLoc &Loc) override {} void EmitULEB128Value(const MCExpr *Value) override {} void EmitSLEB128Value(const MCExpr *Value) override {} void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value, @@ -709,9 +711,6 @@ namespace { bool EmitValueToOffset(const MCExpr *Offset, unsigned char Value) override { return false; } void EmitFileDirective(StringRef Filename) override {} - void EmitDwarfAdvanceLineAddr(int64_t LineDelta, const MCSymbol *LastLabel, - const MCSymbol *Label, - unsigned PointerSize) override {} void FinishImpl() override {} void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override { RecordProcEnd(Frame); @@ -738,7 +737,8 @@ bool LTOModule::addAsmGlobalSymbols(std::string &errMsg) { _target->getTargetTriple(), _target->getTargetCPU(), _target->getTargetFeatureString())); std::unique_ptr TAP( - T.createMCAsmParser(*STI, *Parser.get(), *MCII)); + T.createMCAsmParser(*STI, *Parser.get(), *MCII, + _target->Options.MCOptions)); if (!TAP) { errMsg = "target " + std::string(T.getName()) + " does not define AsmParser."; @@ -801,14 +801,8 @@ bool LTOModule::parseSymbols(std::string &errMsg) { return true; // add aliases - for (Module::alias_iterator a = _module->alias_begin(), - e = _module->alias_end(); a != e; ++a) { - if (isDeclaration(*a->getAliasedGlobal())) - // Is an alias to a declaration. 
- addPotentialUndefinedSymbol(a, false); - else - addDefinedDataSymbol(a); - } + for (const auto &Alias : _module->aliases()) + addDefinedDataSymbol(&Alias); // make symbols for all undefines for (StringMap::iterator u =_undefines.begin(), diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index c6476ce..45f2d4e 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -43,12 +43,12 @@ class TypeMapTy : public ValueMapTypeRemapper { /// we speculatively add types to MappedTypes, but keep track of them here in /// case we need to roll back. SmallVector SpeculativeTypes; - + /// SrcDefinitionsToResolve - This is a list of non-opaque structs in the /// source module that are mapped to an opaque struct in the destination /// module. SmallVector SrcDefinitionsToResolve; - + /// DstResolvedOpaqueTypes - This is the set of opaque types in the /// destination modules who are getting a body from the source module. SmallPtrSet DstResolvedOpaqueTypes; @@ -65,7 +65,7 @@ public: /// linkDefinedTypeBodies - Produce a body for an opaque type in the dest /// module from a type definition in the source module. void linkDefinedTypeBodies(); - + /// get - Return the mapped type to use for the specified input type from the /// source module. Type *get(Type *SrcTy); @@ -90,7 +90,7 @@ private: Type *remapType(Type *SrcTy) override { return get(SrcTy); } - + bool areTypesIsomorphic(Type *DstTy, Type *SrcTy); }; } @@ -98,12 +98,12 @@ private: void TypeMapTy::addTypeMapping(Type *DstTy, Type *SrcTy) { Type *&Entry = MappedTypes[SrcTy]; if (Entry) return; - + if (DstTy == SrcTy) { Entry = DstTy; return; } - + // Check to see if these types are recursively isomorphic and establish a // mapping between them if so. if (!areTypesIsomorphic(DstTy, SrcTy)) { @@ -132,7 +132,7 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { Entry = DstTy; return true; } - + // Okay, we have two types with identical kinds that we haven't seen before. // If this is an opaque struct type, special case it. @@ -158,18 +158,18 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { return true; } } - + // If the number of subtypes disagree between the two types, then we fail. if (SrcTy->getNumContainedTypes() != DstTy->getNumContainedTypes()) return false; - + // Fail if any of the extra properties (e.g. array size) of the type disagree. if (isa(DstTy)) return false; // bitwidth disagrees. if (PointerType *PT = dyn_cast(DstTy)) { if (PT->getAddressSpace() != cast(SrcTy)->getAddressSpace()) return false; - + } else if (FunctionType *FT = dyn_cast(DstTy)) { if (FT->isVarArg() != cast(SrcTy)->isVarArg()) return false; @@ -195,7 +195,7 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { if (!areTypesIsomorphic(DstTy->getContainedType(i), SrcTy->getContainedType(i))) return false; - + // If everything seems to have lined up, then everything is great. return true; } @@ -205,31 +205,31 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { void TypeMapTy::linkDefinedTypeBodies() { SmallVector Elements; SmallString<16> TmpName; - + // Note that processing entries in this loop (calling 'get') can add new // entries to the SrcDefinitionsToResolve vector. 
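The loop comment above flags a classic worklist subtlety: processing an entry can append new entries, so the loop must re-test emptiness on every iteration and must not cache iterators across a push. Schematically:

    #include <vector>

    struct Item { bool Done = false; };

    // Processing one item may discover more work; back()/pop_back draining
    // stays valid because empty() is re-checked after every step and no
    // iterator is held across a push_back.
    void drain(std::vector<Item *> &Worklist) {
      while (!Worklist.empty()) {
        Item *I = Worklist.back();
        Worklist.pop_back();
        if (I->Done)
          continue; // a previous iteration already resolved it
        I->Done = true;
        // ... resolving I may call Worklist.push_back(...) here ...
      }
    }

    int main() {
      Item A, B;
      std::vector<Item *> WL{&A, &B};
      drain(WL);
      return A.Done && B.Done ? 0 : 1;
    }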
while (!SrcDefinitionsToResolve.empty()) { StructType *SrcSTy = SrcDefinitionsToResolve.pop_back_val(); StructType *DstSTy = cast(MappedTypes[SrcSTy]); - + // TypeMap is a many-to-one mapping, if there were multiple types that // provide a body for DstSTy then previous iterations of this loop may have // already handled it. Just ignore this case. if (!DstSTy->isOpaque()) continue; assert(!SrcSTy->isOpaque() && "Not resolving a definition?"); - + // Map the body of the source type over to a new body for the dest type. Elements.resize(SrcSTy->getNumElements()); for (unsigned i = 0, e = Elements.size(); i != e; ++i) Elements[i] = getImpl(SrcSTy->getElementType(i)); - + DstSTy->setBody(Elements, SrcSTy->isPacked()); - + // If DstSTy has no name or has a longer name than STy, then viciously steal // STy's name. if (!SrcSTy->hasName()) continue; StringRef SrcName = SrcSTy->getName(); - + if (!DstSTy->hasName() || DstSTy->getName().size() > SrcName.size()) { TmpName.insert(TmpName.end(), SrcName.begin(), SrcName.end()); SrcSTy->setName(""); @@ -237,7 +237,7 @@ void TypeMapTy::linkDefinedTypeBodies() { TmpName.clear(); } } - + DstResolvedOpaqueTypes.clear(); } @@ -245,7 +245,7 @@ void TypeMapTy::linkDefinedTypeBodies() { /// source module. Type *TypeMapTy::get(Type *Ty) { Type *Result = getImpl(Ty); - + // If this caused a reference to any struct type, resolve it before returning. if (!SrcDefinitionsToResolve.empty()) linkDefinedTypeBodies(); @@ -257,7 +257,7 @@ Type *TypeMapTy::getImpl(Type *Ty) { // If we already have an entry for this type, return it. Type **Entry = &MappedTypes[Ty]; if (*Entry) return *Entry; - + // If this is not a named struct type, then just map all of the elements and // then rebuild the type from inside out. if (!isa(Ty) || cast(Ty)->isLiteral()) { @@ -265,7 +265,7 @@ Type *TypeMapTy::getImpl(Type *Ty) { // true for the anonymous {} struct, things like 'float', integers, etc. if (Ty->getNumContainedTypes() == 0) return *Entry = Ty; - + // Remap all of the elements, keeping track of whether any of them change. bool AnyChange = false; SmallVector ElementTypes; @@ -274,23 +274,23 @@ Type *TypeMapTy::getImpl(Type *Ty) { ElementTypes[i] = getImpl(Ty->getContainedType(i)); AnyChange |= ElementTypes[i] != Ty->getContainedType(i); } - + // If we found our type while recursively processing stuff, just use it. Entry = &MappedTypes[Ty]; if (*Entry) return *Entry; - + // If all of the element types mapped directly over, then the type is usable // as-is. if (!AnyChange) return *Entry = Ty; - + // Otherwise, rebuild a modified type. switch (Ty->getTypeID()) { default: llvm_unreachable("unknown derived type to remap"); case Type::ArrayTyID: return *Entry = ArrayType::get(ElementTypes[0], cast(Ty)->getNumElements()); - case Type::VectorTyID: + case Type::VectorTyID: return *Entry = VectorType::get(ElementTypes[0], cast(Ty)->getNumElements()); case Type::PointerTyID: @@ -331,7 +331,7 @@ Type *TypeMapTy::getImpl(Type *Ty) { // and is not required for the prettiness of the linked module, we just skip // it and always rebuild a type here. StructType *STy = cast(Ty); - + // If the type is opaque, we can just use it directly. if (STy->isOpaque()) { // A named structure type from src module is used. Add it to the Set of @@ -339,7 +339,7 @@ Type *TypeMapTy::getImpl(Type *Ty) { DstStructTypesSet.insert(STy); return *Entry = STy; } - + // Otherwise we create a new type and resolve its body later. This will be // resolved by the top level of get(). 
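Note the re-lookup in getImpl above: after the recursive calls, the code re-takes Entry = &MappedTypes[Ty] before using it. A small model of why that discipline matters, using std::unordered_map in place of LLVM's DenseMap:

    #include <string>
    #include <unordered_map>

    static std::unordered_map<std::string, int> Memo;

    // Convention from the linker code: a zero/null entry means "not computed".
    int compute(const std::string &Key, int Depth) {
      int *Entry = &Memo[Key];
      if (*Entry)
        return *Entry; // already memoized
      int Result = 1;
      if (Depth > 0)
        Result += compute(Key + "'", Depth - 1); // recursion mutates Memo
      // Re-acquire the slot before writing. LLVM's MappedTypes is a
      // DenseMap, whose insertions can reallocate the table and invalidate
      // Entry; and in any container the recursion may already have filled
      // in this key, so re-check before overwriting.
      Entry = &Memo[Key];
      if (*Entry)
        return *Entry;
      return *Entry = Result;
    }

    int main() { return compute("T", 3) == 4 ? 0 : 1; }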
SrcDefinitionsToResolve.push_back(STy); @@ -379,8 +379,8 @@ namespace { /// function, which is the entrypoint for this file. class ModuleLinker { Module *DstM, *SrcM; - - TypeMapTy TypeMap; + + TypeMapTy TypeMap; ValueMaterializerTy ValMaterializer; /// ValueMap - Mapping of values from what they used to be in Src, to what @@ -388,25 +388,27 @@ namespace { /// some overhead due to the use of Value handles which the Linker doesn't /// actually need, but this allows us to reuse the ValueMapper code. ValueToValueMapTy ValueMap; - + + std::vector> ReplaceWithAlias; + struct AppendingVarInfo { GlobalVariable *NewGV; // New aggregate global in dest module. Constant *DstInit; // Old initializer from dest module. Constant *SrcInit; // Old initializer from src module. }; - + std::vector AppendingVars; - + unsigned Mode; // Mode to treat source module. - + // Set of items not to link in from source. SmallPtrSet DoNotLinkFromSource; - + // Vector of functions to lazily link in. std::vector LazilyLinkFunctions; bool SuppressWarnings; - + public: std::string ErrorMsg; @@ -417,7 +419,7 @@ namespace { SuppressWarnings(SuppressWarnings) {} bool run(); - + private: /// emitError - Helper method for setting a message and returning an error /// code. @@ -425,7 +427,7 @@ namespace { ErrorMsg = Message.str(); return true; } - + /// getLinkageResult - This analyzes the two global values and determines /// what the result will look like in the destination module. bool getLinkageResult(GlobalValue *Dest, const GlobalValue *Src, @@ -439,29 +441,29 @@ namespace { // If the source has no name it can't link. If it has local linkage, // there is no name match-up going on. if (!SrcGV->hasName() || SrcGV->hasLocalLinkage()) - return 0; - + return nullptr; + // Otherwise see if we have a match in the destination module's symtab. GlobalValue *DGV = DstM->getNamedValue(SrcGV->getName()); - if (DGV == 0) return 0; - + if (!DGV) return nullptr; + // If we found a global with the same name in the dest module, but it has // internal linkage, we are really not doing any linkage here. if (DGV->hasLocalLinkage()) - return 0; + return nullptr; // Otherwise, we do in fact link to the destination global. return DGV; } - + void computeTypeMapping(); - + bool linkAppendingVarProto(GlobalVariable *DstGV, GlobalVariable *SrcGV); bool linkGlobalProto(GlobalVariable *SrcGV); bool linkFunctionProto(Function *SrcF); bool linkAliasProto(GlobalAlias *SrcA); bool linkModuleFlagsMetadata(); - + void linkAppendingVarInit(const AppendingVarInfo &AVI); void linkGlobalInits(); void linkFunctionBody(Function *Dst, Function *Src); @@ -495,10 +497,16 @@ static void forceRenaming(GlobalValue *GV, StringRef Name) { /// a GlobalValue) from the SrcGV to the DestGV. static void copyGVAttributes(GlobalValue *DestGV, const GlobalValue *SrcGV) { // Use the maximum alignment, rather than just copying the alignment of SrcGV. 
- unsigned Alignment = std::max(DestGV->getAlignment(), SrcGV->getAlignment()); + auto *DestGO = dyn_cast(DestGV); + unsigned Alignment; + if (DestGO) + Alignment = std::max(DestGO->getAlignment(), SrcGV->getAlignment()); + DestGV->copyAttributesFrom(SrcGV); - DestGV->setAlignment(Alignment); - + + if (DestGO) + DestGO->setAlignment(Alignment); + forceRenaming(DestGV, SrcGV->getName()); } @@ -518,7 +526,7 @@ static bool isLessConstraining(GlobalValue::VisibilityTypes a, Value *ValueMaterializerTy::materializeValueFor(Value *V) { Function *SF = dyn_cast(V); if (!SF) - return NULL; + return nullptr; Function *DF = Function::Create(TypeMap.get(SF->getFunctionType()), SF->getLinkage(), SF->getName(), DstM); @@ -541,10 +549,10 @@ bool ModuleLinker::getLinkageResult(GlobalValue *Dest, const GlobalValue *Src, assert(Dest && "Must have two globals being queried"); assert(!Src->hasLocalLinkage() && "If Src has internal linkage, Dest shouldn't be set!"); - + bool SrcIsDeclaration = Src->isDeclaration() && !Src->isMaterializable(); bool DestIsDeclaration = Dest->isDeclaration(); - + if (SrcIsDeclaration) { // If Src is external or if both Src & Dest are external.. Just link the // external globals, we aren't adding anything. @@ -598,6 +606,8 @@ bool ModuleLinker::getLinkageResult(GlobalValue *Dest, const GlobalValue *Src, // Compute the visibility. We follow the rules in the System V Application // Binary Interface. + assert(!GlobalValue::isLocalLinkage(LT) && + "Symbols with local linkage should not be merged"); Vis = isLessConstraining(Src->getVisibility(), Dest->getVisibility()) ? Dest->getVisibility() : Src->getVisibility(); return false; @@ -612,19 +622,19 @@ void ModuleLinker::computeTypeMapping() { for (Module::global_iterator I = SrcM->global_begin(), E = SrcM->global_end(); I != E; ++I) { GlobalValue *DGV = getLinkedToGlobal(I); - if (DGV == 0) continue; - + if (!DGV) continue; + if (!DGV->hasAppendingLinkage() || !I->hasAppendingLinkage()) { TypeMap.addTypeMapping(DGV->getType(), I->getType()); - continue; + continue; } - + // Unify the element type of appending arrays. ArrayType *DAT = cast(DGV->getType()->getElementType()); ArrayType *SAT = cast(I->getType()->getElementType()); TypeMap.addTypeMapping(DAT->getElementType(), SAT->getElementType()); } - + // Incorporate functions. for (Module::iterator I = SrcM->begin(), E = SrcM->end(); I != E; ++I) { if (GlobalValue *DGV = getLinkedToGlobal(I)) @@ -643,14 +653,14 @@ void ModuleLinker::computeTypeMapping() { for (unsigned i = 0, e = SrcStructTypes.size(); i != e; ++i) { StructType *ST = SrcStructTypes[i]; if (!ST->hasName()) continue; - + // Check to see if there is a dot in the name followed by a digit. size_t DotPos = ST->getName().rfind('.'); if (DotPos == 0 || DotPos == StringRef::npos || ST->getName().back() == '.' || !isdigit(static_cast(ST->getName()[DotPos+1]))) continue; - + // Check to see if the destination module has a struct with the prefix name. if (StructType *DST = DstM->getTypeByName(ST->getName().substr(0, DotPos))) // Don't use it if this actually came from the source module. They're in @@ -675,9 +685,9 @@ void ModuleLinker::computeTypeMapping() { } // Don't bother incorporating aliases, they aren't generally typed well. - + // Now that we have discovered all of the type equivalences, get a body for - // any 'opaque' types in the dest module that are now resolved. + // any 'opaque' types in the dest module that are now resolved. 
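// A minimal aside on the copyGVAttributes change above (illustrative
// numbers, not from the patch): when two definitions of one global are
// merged, the linked copy must satisfy both alignment requirements, so the
// stricter (larger) alignment wins.
#include <algorithm>
#include <cassert>

unsigned mergedAlignment(unsigned DstAlign, unsigned SrcAlign) {
  return std::max(DstAlign, SrcAlign);
}

int main() {
  assert(mergedAlignment(4, 16) == 16); // keep 16 so 16-byte users stay safe
  assert(mergedAlignment(8, 1) == 8);   // never weaken the destination
}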
   TypeMap.linkDefinedTypeBodies();
 }
 
@@ -685,26 +695,26 @@ void ModuleLinker::computeTypeMapping() {
 /// them together now.  Return true on error.
 bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
                                          GlobalVariable *SrcGV) {
-  
+
   if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage())
     return emitError("Linking globals named '" + SrcGV->getName() +
            "': can only link appending global with another appending global!");
-  
+
   ArrayType *DstTy = cast<ArrayType>(DstGV->getType()->getElementType());
   ArrayType *SrcTy =
     cast<ArrayType>(TypeMap.get(SrcGV->getType()->getElementType()));
   Type *EltTy = DstTy->getElementType();
-  
+
   // Check to see that the two arrays agree on type.
   if (EltTy != SrcTy->getElementType())
     return emitError("Appending variables with different element types!");
   if (DstGV->isConstant() != SrcGV->isConstant())
     return emitError("Appending variables linked with different const'ness!");
-  
+
   if (DstGV->getAlignment() != SrcGV->getAlignment())
     return emitError(
         "Appending variables with different alignment need to be linked!");
-  
+
   if (DstGV->getVisibility() != SrcGV->getVisibility())
     return emitError(
         "Appending variables with different visibility need to be linked!");
@@ -716,20 +726,20 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
   if (DstGV->getSection() != SrcGV->getSection())
     return emitError(
         "Appending variables with different section name need to be linked!");
-  
+
   uint64_t NewSize = DstTy->getNumElements() + SrcTy->getNumElements();
   ArrayType *NewType = ArrayType::get(EltTy, NewSize);
-  
+
   // Create the new global variable.
   GlobalVariable *NG =
     new GlobalVariable(*DstGV->getParent(), NewType, SrcGV->isConstant(),
-                       DstGV->getLinkage(), /*init*/0, /*name*/"", DstGV,
+                       DstGV->getLinkage(), /*init*/nullptr, /*name*/"", DstGV,
                        DstGV->getThreadLocalMode(),
                        DstGV->getType()->getAddressSpace());
-  
+
   // Propagate alignment, visibility and section info.
   copyGVAttributes(NG, DstGV);
-  
+
   AppendingVarInfo AVI;
   AVI.NewGV = NG;
   AVI.DstInit = DstGV->getInitializer();
@@ -742,10 +752,10 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
   DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType()));
   DstGV->eraseFromParent();
-  
+
   // Track the source variable so we don't try to link it.
   DoNotLinkFromSource.insert(SrcGV);
-  
+
   return false;
 }
 
@@ -760,7 +770,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
     // Concatenation of appending linkage variables is magic and handled later.
     if (DGV->hasAppendingLinkage() || SGV->hasAppendingLinkage())
       return linkAppendingVarProto(cast<GlobalVariable>(DGV), SGV);
-    
+
     // Determine whether linkage of these two globals follows the source
     // module's definition or the destination module's definition.
     GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
@@ -786,22 +796,22 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
       // Make sure to remember this mapping.
       ValueMap[SGV] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGV->getType()));
-      
-      // Track the source global so that we don't attempt to copy it over when 
+
+      // Track the source global so that we don't attempt to copy it over when
       // processing global initializers.
       DoNotLinkFromSource.insert(SGV);
-      
+
       return false;
     }
   }
-  
+
   // No linking to be performed or linking from the source: simply create an
   // identical version of the symbol over in the dest module... the
   // initializer will be filled in later by LinkGlobalInits.
GlobalVariable *NewDGV = new GlobalVariable(*DstM, TypeMap.get(SGV->getType()->getElementType()), - SGV->isConstant(), SGV->getLinkage(), /*init*/0, - SGV->getName(), /*insertbefore*/0, + SGV->isConstant(), SGV->getLinkage(), /*init*/nullptr, + SGV->getName(), /*insertbefore*/nullptr, SGV->getThreadLocalMode(), SGV->getType()->getAddressSpace()); // Propagate alignment, visibility and section info. @@ -814,7 +824,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) { DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDGV, DGV->getType())); DGV->eraseFromParent(); } - + // Make sure to remember this mapping. ValueMap[SGV] = NewDGV; return false; @@ -844,15 +854,15 @@ bool ModuleLinker::linkFunctionProto(Function *SF) { // Make sure to remember this mapping. ValueMap[SF] = ConstantExpr::getBitCast(DGV, TypeMap.get(SF->getType())); - - // Track the function from the source module so we don't attempt to remap + + // Track the function from the source module so we don't attempt to remap // it. DoNotLinkFromSource.insert(SF); - + return false; } } - + // If the function is to be lazily linked, don't create it just yet. // The ValueMaterializerTy will deal with creating it if it's used. if (!DGV && (SF->hasLocalLinkage() || SF->hasLinkOnceLinkage() || @@ -875,7 +885,7 @@ bool ModuleLinker::linkFunctionProto(Function *SF) { DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDF, DGV->getType())); DGV->eraseFromParent(); } - + ValueMap[SF] = NewDF; return false; } @@ -901,29 +911,27 @@ bool ModuleLinker::linkAliasProto(GlobalAlias *SGA) { // Make sure to remember this mapping. ValueMap[SGA] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGA->getType())); - + // Track the alias from the source module so we don't attempt to remap it. DoNotLinkFromSource.insert(SGA); - + return false; } } - + // If there is no linkage to be performed or we're linking from the source, // bring over SGA. - GlobalAlias *NewDA = new GlobalAlias(TypeMap.get(SGA->getType()), - SGA->getLinkage(), SGA->getName(), - /*aliasee*/0, DstM); + auto *PTy = cast(TypeMap.get(SGA->getType())); + auto *NewDA = + GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), + SGA->getLinkage(), SGA->getName(), DstM); copyGVAttributes(NewDA, SGA); if (NewVisibility) NewDA->setVisibility(*NewVisibility); - if (DGV) { - // Any uses of DGV need to change to NewDA, with cast. - DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDA, DGV->getType())); - DGV->eraseFromParent(); - } - + if (DGV) + ReplaceWithAlias.push_back(std::make_pair(DGV, NewDA)); + ValueMap[SGA] = NewDA; return false; } @@ -934,15 +942,15 @@ static void getArrayElements(Constant *C, SmallVectorImpl &Dest) { for (unsigned i = 0; i != NumElements; ++i) Dest.push_back(C->getAggregateElement(i)); } - + void ModuleLinker::linkAppendingVarInit(const AppendingVarInfo &AVI) { // Merge the initializer. SmallVector Elements; getArrayElements(AVI.DstInit, Elements); - + Constant *SrcInit = MapValue(AVI.SrcInit, ValueMap, RF_None, &TypeMap, &ValMaterializer); getArrayElements(SrcInit, Elements); - + ArrayType *NewType = cast(AVI.NewGV->getType()->getElementType()); AVI.NewGV->setInitializer(ConstantArray::get(NewType, Elements)); } @@ -953,10 +961,10 @@ void ModuleLinker::linkGlobalInits() { // Loop over all of the globals in the src module, mapping them over as we go for (Module::const_global_iterator I = SrcM->global_begin(), E = SrcM->global_end(); I != E; ++I) { - + // Only process initialized GV's or ones not already in dest. 
- if (!I->hasInitializer() || DoNotLinkFromSource.count(I)) continue; - + if (!I->hasInitializer() || DoNotLinkFromSource.count(I)) continue; + // Grab destination global variable. GlobalVariable *DGV = cast(ValueMap[I]); // Figure out what the initializer looks like in the dest module. @@ -984,7 +992,7 @@ void ModuleLinker::linkFunctionBody(Function *Dst, Function *Src) { if (Mode == Linker::DestroySource) { // Splice the body of the source function into the dest function. Dst->getBasicBlockList().splice(Dst->end(), Src->getBasicBlockList()); - + // At this point, all of the instructions and values of the function are now // copied over. The only problem is that they are still referencing values in // the Source function as operands. Loop through all of the operands of the @@ -993,19 +1001,32 @@ void ModuleLinker::linkFunctionBody(Function *Dst, Function *Src) { for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) RemapInstruction(I, ValueMap, RF_IgnoreMissingEntries, &TypeMap, &ValMaterializer); - + } else { // Clone the body of the function into the dest function. SmallVector Returns; // Ignore returns. - CloneFunctionInto(Dst, Src, ValueMap, false, Returns, "", NULL, + CloneFunctionInto(Dst, Src, ValueMap, false, Returns, "", nullptr, &TypeMap, &ValMaterializer); } - + // There is no need to map the arguments anymore. for (Function::arg_iterator I = Src->arg_begin(), E = Src->arg_end(); I != E; ++I) ValueMap.erase(I); - + +} + +static GlobalObject &getGlobalObjectInExpr(Constant &C) { + auto *GO = dyn_cast(&C); + if (GO) + return *GO; + auto *GA = dyn_cast(&C); + if (GA) + return *GA->getAliasee(); + auto &CE = cast(C); + assert(CE.getOpcode() == Instruction::BitCast || + CE.getOpcode() == Instruction::AddrSpaceCast); + return getGlobalObjectInExpr(*CE.getOperand(0)); } /// linkAliasBodies - Insert all of the aliases in Src into the Dest module. @@ -1016,10 +1037,27 @@ void ModuleLinker::linkAliasBodies() { continue; if (Constant *Aliasee = I->getAliasee()) { GlobalAlias *DA = cast(ValueMap[I]); - DA->setAliasee(MapValue(Aliasee, ValueMap, RF_None, - &TypeMap, &ValMaterializer)); + Constant *Val = + MapValue(Aliasee, ValueMap, RF_None, &TypeMap, &ValMaterializer); + DA->setAliasee(&getGlobalObjectInExpr(*Val)); } } + + // Any uses of DGV need to change to NewDA, with cast. + for (auto &Pair : ReplaceWithAlias) { + GlobalValue *DGV = Pair.first; + GlobalAlias *NewDA = Pair.second; + + for (auto *User : DGV->users()) { + if (auto *GA = dyn_cast(User)) { + if (GA == NewDA) + report_fatal_error("Linking these modules creates an alias cycle."); + } + } + + DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDA, DGV->getType())); + DGV->eraseFromParent(); + } } /// linkNamedMDNodes - Insert all of the named MDNodes in Src into the Dest @@ -1193,7 +1231,7 @@ bool ModuleLinker::linkModuleFlagsMetadata() { return HasErr; } - + bool ModuleLinker::run() { assert(DstM && "Null destination module"); assert(SrcM && "Null source module"); @@ -1264,13 +1302,13 @@ bool ModuleLinker::run() { for (unsigned i = 0, e = AppendingVars.size(); i != e; ++i) linkAppendingVarInit(AppendingVars[i]); - + // Link in the function bodies that are defined in the source module into // DstM. for (Module::iterator SF = SrcM->begin(), E = SrcM->end(); SF != E; ++SF) { // Skip if not linking from source. if (DoNotLinkFromSource.count(SF)) continue; - + Function *DF = cast(ValueMap[SF]); if (SF->hasPrefixData()) { // Link in the prefix data. 
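// A sketch of the walk getGlobalObjectInExpr performs above, over a toy
// representation (plain structs stand in for llvm::Constant): peel cast
// wrappers until the underlying object, or an alias's aliasee, is reached.
#include <cassert>

struct Node {
  enum Kind { Object, Alias, Cast } K;
  Node *Inner; // aliasee for Alias, operand 0 for Cast, unused for Object
};

Node &underlyingObject(Node &C) {
  if (C.K == Node::Object)
    return C;
  if (C.K == Node::Alias)
    return *C.Inner;                 // one level only, matching the above
  return underlyingObject(*C.Inner); // strip the cast and keep walking
}

int main() {
  Node GO{Node::Object, nullptr};
  Node CastOfGO{Node::Cast, &GO};
  Node CastOfCast{Node::Cast, &CastOfGO};
  assert(&underlyingObject(CastOfCast) == &GO);
}

// Deferring replaceAllUsesWith into linkAliasBodies (the ReplaceWithAlias
// loop above) is what makes the alias-cycle check possible: by then the new
// alias exists, so "is the new alias among the old value's users?" can be
// asked before any uses are rewritten.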
@@ -1285,7 +1323,7 @@ bool ModuleLinker::run() { if (SF->Materialize(&ErrorMsg)) return true; } - + linkFunctionBody(DF, SF); SF->Dematerialize(); } @@ -1310,9 +1348,9 @@ bool ModuleLinker::run() { bool LinkedInAnyFunctions; do { LinkedInAnyFunctions = false; - + for(std::vector::iterator I = LazilyLinkFunctions.begin(), - E = LazilyLinkFunctions.end(); I != E; ++I) { + E = LazilyLinkFunctions.end(); I != E; ++I) { Function *SF = *I; if (!SF) continue; @@ -1334,7 +1372,7 @@ bool ModuleLinker::run() { if (SF->Materialize(&ErrorMsg)) return true; } - + // Erase from vector *before* the function body is linked - linkFunctionBody could // invalidate I. LazilyLinkFunctions.erase(I); @@ -1349,11 +1387,11 @@ bool ModuleLinker::run() { break; } } while (LinkedInAnyFunctions); - + // Now that all of the types from the source are used, resolve any structs // copied over to the dest that didn't exist there. TypeMap.linkDefinedTypeBodies(); - + return false; } @@ -1369,7 +1407,7 @@ Linker::~Linker() { void Linker::deleteModule() { delete Composite; - Composite = NULL; + Composite = nullptr; } bool Linker::linkInModule(Module *Src, unsigned Mode, std::string *ErrorMsg) { @@ -1392,7 +1430,7 @@ bool Linker::linkInModule(Module *Src, unsigned Mode, std::string *ErrorMsg) { /// error occurs, true is returned and ErrorMsg (if not null) is set to indicate /// the problem. Upon failure, the Dest module could be in a modified state, /// and shouldn't be relied on to be consistent. -bool Linker::LinkModules(Module *Dest, Module *Src, unsigned Mode, +bool Linker::LinkModules(Module *Dest, Module *Src, unsigned Mode, std::string *ErrorMsg) { Linker L(Dest); return L.linkInModule(Src, Mode, ErrorMsg); @@ -1406,7 +1444,7 @@ LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src, LLVMLinkerMode Mode, char **OutMessages) { std::string Messages; LLVMBool Result = Linker::LinkModules(unwrap(Dest), unwrap(Src), - Mode, OutMessages? &Messages : 0); + Mode, OutMessages? 
&Messages : nullptr); if (OutMessages) *OutMessages = strdup(Messages.c_str()); return Result; diff --git a/lib/MC/Android.mk b/lib/MC/Android.mk index abf346b..975f4e3 100644 --- a/lib/MC/Android.mk +++ b/lib/MC/Android.mk @@ -20,7 +20,6 @@ mc_SRC_FILES := \ MCELFStreamer.cpp \ MCExpr.cpp \ MCExternalSymbolizer.cpp \ - MCFixup.cpp \ MCInst.cpp \ MCInstPrinter.cpp \ MCInstrAnalysis.cpp \ @@ -35,13 +34,14 @@ mc_SRC_FILES := \ MCRegisterInfo.cpp \ MCRelocationInfo.cpp \ MCSection.cpp \ - MCSectionCOFF.cpp \ + MCSectionCOFF.cpp \ MCSectionELF.cpp \ MCSectionMachO.cpp \ MCStreamer.cpp \ MCSubtargetInfo.cpp \ MCSymbol.cpp \ MCSymbolizer.cpp \ + MCTargetOptions.cpp \ MCValue.cpp \ MCWin64EH.cpp \ WinCOFFObjectWriter.cpp \ diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt index ab7dabc..6a384c1 100644 --- a/lib/MC/CMakeLists.txt +++ b/lib/MC/CMakeLists.txt @@ -16,7 +16,6 @@ add_llvm_library(LLVMMC MCELF.cpp MCELFObjectTargetWriter.cpp MCELFStreamer.cpp - MCFixup.cpp MCFunction.cpp MCExpr.cpp MCExternalSymbolizer.cpp @@ -45,6 +44,7 @@ add_llvm_library(LLVMMC MCSubtargetInfo.cpp MCSymbol.cpp MCSymbolizer.cpp + MCTargetOptions.cpp MCValue.cpp MCWin64EH.cpp MachObjectWriter.cpp diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index e9b8fe2..0a54627 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -27,6 +28,8 @@ #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/Object/StringTableBuilder.h" +#include "llvm/Support/Compression.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Endian.h" #include "llvm/Support/ELF.h" @@ -105,10 +108,9 @@ class ELFObjectWriter : public MCObjectWriter { static bool isFixupKindPCRel(const MCAssembler &Asm, unsigned Kind); static bool RelocNeedsGOT(MCSymbolRefExpr::VariantKind Variant); static uint64_t SymbolValue(MCSymbolData &Data, const MCAsmLayout &Layout); - static bool isInSymtab(const MCAssembler &Asm, const MCSymbolData &Data, + static bool isInSymtab(const MCAsmLayout &Layout, const MCSymbolData &Data, bool Used, bool Renamed); - static bool isLocal(const MCSymbolData &Data, bool isSignature, - bool isUsedInReloc); + static bool isLocal(const MCSymbolData &Data, bool isUsedInReloc); static bool IsELFMetaDataSection(const MCSectionData &SD); static uint64_t DataSectionSize(const MCSectionData &SD); static uint64_t GetSectionFileSize(const MCAsmLayout &Layout, @@ -131,11 +133,11 @@ class ELFObjectWriter : public MCObjectWriter { MCSymbolData *SymbolData; uint64_t StringIndex; uint32_t SectionIndex; + StringRef Name; // Support lexicographic sorting. 
bool operator<(const ELFSymbolData &RHS) const { - return SymbolData->getSymbol().getName() < - RHS.SymbolData->getSymbol().getName(); + return Name < RHS.Name; } }; @@ -148,13 +150,13 @@ class ELFObjectWriter : public MCObjectWriter { llvm::DenseMap> Relocations; - DenseMap SectionStringTableIndex; + StringTableBuilder ShStrTabBuilder; /// @} /// @name Symbol Table Data /// @{ - SmallString<256> StringTable; + StringTableBuilder StrTabBuilder; std::vector FileSymbolData; std::vector LocalSymbolData; std::vector ExternalSymbolData; @@ -214,7 +216,8 @@ class ELFObjectWriter : public MCObjectWriter { const MCAsmLayout &Layout, SectionIndexMapTy &SectionIndexMap); - bool shouldRelocateWithSymbol(const MCSymbolRefExpr *RefA, + bool shouldRelocateWithSymbol(const MCAssembler &Asm, + const MCSymbolRefExpr *RefA, const MCSymbolData *SD, uint64_t C, unsigned Type) const; @@ -253,6 +256,8 @@ class ELFObjectWriter : public MCObjectWriter { void CreateRelocationSections(MCAssembler &Asm, MCAsmLayout &Layout, RelMapTy &RelMap); + void CompressDebugSections(MCAssembler &Asm, MCAsmLayout &Layout); + void WriteRelocations(MCAssembler &Asm, MCAsmLayout &Layout, const RelMapTy &RelMap); @@ -481,43 +486,18 @@ void ELFObjectWriter::WriteHeader(const MCAssembler &Asm, Write16(ShstrtabIndex); } -uint64_t ELFObjectWriter::SymbolValue(MCSymbolData &OrigData, +uint64_t ELFObjectWriter::SymbolValue(MCSymbolData &Data, const MCAsmLayout &Layout) { - MCSymbolData *Data = &OrigData; - if (Data->isCommon() && Data->isExternal()) - return Data->getCommonAlignment(); - - const MCSymbol *Symbol = &Data->getSymbol(); - bool IsThumbFunc = OrigData.getFlags() & ELF_Other_ThumbFunc; - - uint64_t Res = 0; - if (Symbol->isVariable()) { - const MCExpr *Expr = Symbol->getVariableValue(); - MCValue Value; - if (!Expr->EvaluateAsRelocatable(Value, &Layout)) - llvm_unreachable("Invalid expression"); + if (Data.isCommon() && Data.isExternal()) + return Data.getCommonAlignment(); - assert(!Value.getSymB()); - - Res = Value.getConstant(); - - if (const MCSymbolRefExpr *A = Value.getSymA()) { - Symbol = &A->getSymbol(); - Data = &Layout.getAssembler().getSymbolData(*Symbol); - } else { - Symbol = 0; - Data = 0; - } - } + uint64_t Res; + if (!Layout.getSymbolOffset(&Data, Res)) + return 0; - if (IsThumbFunc) + if (Layout.getAssembler().isThumbFunc(&Data.getSymbol())) Res |= 1; - if (!Symbol || !Symbol->isInSection()) - return Res; - - Res += Layout.getSymbolOffset(Data); - return Res; } @@ -526,15 +506,17 @@ void ELFObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm, // The presence of symbol versions causes undefined symbols and // versions declared with @@@ to be renamed. - for (MCAssembler::symbol_iterator it = Asm.symbol_begin(), - ie = Asm.symbol_end(); it != ie; ++it) { - const MCSymbol &Alias = it->getSymbol(); - const MCSymbol &Symbol = Alias.AliasedSymbol(); - MCSymbolData &SD = Asm.getSymbolData(Symbol); + for (MCSymbolData &OriginalData : Asm.symbols()) { + const MCSymbol &Alias = OriginalData.getSymbol(); // Not an alias. - if (&Symbol == &Alias) + if (!Alias.isVariable()) + continue; + auto *Ref = dyn_cast(Alias.getVariableValue()); + if (!Ref) continue; + const MCSymbol &Symbol = Ref->getSymbol(); + MCSymbolData &SD = Asm.getSymbolData(Symbol); StringRef AliasName = Alias.getName(); size_t Pos = AliasName.find('@'); @@ -543,8 +525,8 @@ void ELFObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm, // Aliases defined with .symvar copy the binding from the symbol they alias. 
// This is the first place we are able to copy this information. - it->setExternal(SD.isExternal()); - MCELF::SetBinding(*it, MCELF::GetBinding(SD)); + OriginalData.setExternal(SD.isExternal()); + MCELF::SetBinding(OriginalData, MCELF::GetBinding(SD)); StringRef Rest = AliasName.substr(Pos); if (!Symbol.isUndefined() && !Rest.startswith("@@@")) @@ -594,26 +576,14 @@ static uint8_t mergeTypeForSet(uint8_t origType, uint8_t newType) { return Type; } -static const MCSymbol *getBaseSymbol(const MCAsmLayout &Layout, - const MCSymbol &Symbol) { - if (!Symbol.isVariable()) - return &Symbol; - - const MCExpr *Expr = Symbol.getVariableValue(); - MCValue Value; - if (!Expr->EvaluateAsRelocatable(Value, &Layout)) - llvm_unreachable("Invalid Expression"); - assert(!Value.getSymB()); - const MCSymbolRefExpr *A = Value.getSymA(); - if (!A) - return nullptr; - return getBaseSymbol(Layout, A->getSymbol()); -} - void ELFObjectWriter::WriteSymbol(SymbolTableWriter &Writer, ELFSymbolData &MSD, const MCAsmLayout &Layout) { MCSymbolData &OrigData = *MSD.SymbolData; - const MCSymbol *Base = getBaseSymbol(Layout, OrigData.getSymbol()); + assert((!OrigData.getFragment() || + (&OrigData.getFragment()->getParent()->getSection() == + &OrigData.getSymbol().getSection())) && + "The symbol's section doesn't match the fragment's symbol"); + const MCSymbol *Base = Layout.getBaseSymbol(OrigData.getSymbol()); // This has to be in sync with when computeSymbolTable uses SHN_ABS or // SHN_COMMON. @@ -627,8 +597,6 @@ void ELFObjectWriter::WriteSymbol(SymbolTableWriter &Writer, ELFSymbolData &MSD, BaseSD = &Layout.getAssembler().getSymbolData(*Base); Type = mergeTypeForSet(Type, MCELF::GetType(*BaseSD)); } - if (OrigData.getFlags() & ELF_Other_ThumbFunc) - Type = ELF::STT_FUNC; uint8_t Info = (Binding << ELF_STB_Shift) | (Type << ELF_STT_Shift); // Other and Visibility share the same byte with Visibility using the lower @@ -638,8 +606,6 @@ void ELFObjectWriter::WriteSymbol(SymbolTableWriter &Writer, ELFSymbolData &MSD, Other |= Visibility; uint64_t Value = SymbolValue(OrigData, Layout); - if (OrigData.getFlags() & ELF_Other_ThumbFunc) - Value |= 1; uint64_t Size = 0; const MCExpr *ESize = OrigData.getSize(); @@ -664,7 +630,6 @@ void ELFObjectWriter::WriteSymbolTable(MCDataFragment *SymtabF, SectionIndexMapTy &SectionIndexMap) { // The string table must be emitted first because we need the index // into the string table for all the symbol names. - assert(StringTable.size() && "Missing string table"); // FIXME: Make sure the start of the symbol table is aligned. @@ -725,7 +690,8 @@ void ELFObjectWriter::WriteSymbolTable(MCDataFragment *SymtabF, // It is always valid to create a relocation with a symbol. It is preferable // to use a relocation with a section if that is possible. Using the section // allows us to omit some local symbols from the symbol table. -bool ELFObjectWriter::shouldRelocateWithSymbol(const MCSymbolRefExpr *RefA, +bool ELFObjectWriter::shouldRelocateWithSymbol(const MCAssembler &Asm, + const MCSymbolRefExpr *RefA, const MCSymbolData *SD, uint64_t C, unsigned Type) const { @@ -809,11 +775,37 @@ bool ELFObjectWriter::shouldRelocateWithSymbol(const MCSymbolRefExpr *RefA, if (Flags & ELF::SHF_TLS) return true; + // If the symbol is a thumb function the final relocation must set the lowest + // bit. With a symbol that is done by just having the symbol have that bit + // set, so we would lose the bit if we relocated with the section. + // FIXME: We could use the section but add the bit to the relocation value. 
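// A sketch of the Thumb detail discussed above (illustrative values, not
// from the patch): ARM/Thumb interworking encodes the instruction-set mode
// in bit 0 of a function address, so a resolved Thumb symbol value must keep
// its low bit set, which a section-relative relocation would silently drop.
#include <cassert>
#include <cstdint>

uint64_t thumbSymbolValue(uint64_t Addr, bool IsThumbFunc) {
  return IsThumbFunc ? (Addr | 1) : Addr; // same effect as "Res |= 1" above
}

int main() {
  assert(thumbSymbolValue(0x8000, true) == 0x8001);  // Thumb entry point
  assert(thumbSymbolValue(0x8000, false) == 0x8000); // ARM address unchanged
}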
+ if (Asm.isThumbFunc(&Sym)) + return true; + if (TargetObjectWriter->needsRelocateWithSymbol(Type)) return true; return false; } +static const MCSymbol *getWeakRef(const MCSymbolRefExpr &Ref) { + const MCSymbol &Sym = Ref.getSymbol(); + + if (Ref.getKind() == MCSymbolRefExpr::VK_WEAKREF) + return &Sym; + + if (!Sym.isVariable()) + return nullptr; + + const MCExpr *Expr = Sym.getVariableValue(); + const auto *Inner = dyn_cast(Expr); + if (!Inner) + return nullptr; + + if (Inner->getKind() == MCSymbolRefExpr::VK_WEAKREF) + return &Inner->getSymbol(); + return nullptr; +} + void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, @@ -868,7 +860,7 @@ void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm, const MCSymbolData *SymAD = SymA ? &Asm.getSymbolData(*SymA) : nullptr; unsigned Type = GetRelocType(Target, Fixup, IsPCRel); - bool RelocateWithSymbol = shouldRelocateWithSymbol(RefA, SymAD, C, Type); + bool RelocateWithSymbol = shouldRelocateWithSymbol(Asm, RefA, SymAD, C, Type); if (!RelocateWithSymbol && SymA && !SymA->isUndefined()) C += Layout.getSymbolOffset(SymAD); @@ -899,8 +891,8 @@ void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm, if (const MCSymbol *R = Renames.lookup(SymA)) SymA = R; - if (RefA->getKind() == MCSymbolRefExpr::VK_WEAKREF) - WeakrefUsedInReloc.insert(SymA); + if (const MCSymbol *WeakRef = getWeakRef(*RefA)) + WeakrefUsedInReloc.insert(WeakRef); else UsedInReloc.insert(SymA); } @@ -913,13 +905,13 @@ void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm, uint64_t ELFObjectWriter::getSymbolIndexInSymbolTable(const MCAssembler &Asm, const MCSymbol *S) { - MCSymbolData &SD = Asm.getSymbolData(*S); + const MCSymbolData &SD = Asm.getSymbolData(*S); return SD.getIndex(); } -bool ELFObjectWriter::isInSymtab(const MCAssembler &Asm, - const MCSymbolData &Data, - bool Used, bool Renamed) { +bool ELFObjectWriter::isInSymtab(const MCAsmLayout &Layout, + const MCSymbolData &Data, bool Used, + bool Renamed) { const MCSymbol &Symbol = Data.getSymbol(); if (Symbol.isVariable()) { const MCExpr *Expr = Symbol.getVariableValue(); @@ -938,9 +930,11 @@ bool ELFObjectWriter::isInSymtab(const MCAssembler &Asm, if (Symbol.getName() == "_GLOBAL_OFFSET_TABLE_") return true; - const MCSymbol &A = Symbol.AliasedSymbol(); - if (Symbol.isVariable() && !A.isVariable() && A.isUndefined()) - return false; + if (Symbol.isVariable()) { + const MCSymbol *Base = Layout.getBaseSymbol(Symbol); + if (Base && Base->isUndefined()) + return false; + } bool IsGlobal = MCELF::GetBinding(Data) == ELF::STB_GLOBAL; if (!Symbol.isVariable() && Symbol.isUndefined() && !IsGlobal) @@ -952,20 +946,16 @@ bool ELFObjectWriter::isInSymtab(const MCAssembler &Asm, return true; } -bool ELFObjectWriter::isLocal(const MCSymbolData &Data, bool isSignature, - bool isUsedInReloc) { +bool ELFObjectWriter::isLocal(const MCSymbolData &Data, bool isUsedInReloc) { if (Data.isExternal()) return false; const MCSymbol &Symbol = Data.getSymbol(); - const MCSymbol &RefSymbol = Symbol.AliasedSymbol(); - - if (RefSymbol.isUndefined() && !RefSymbol.isVariable()) { - if (isSignature && !isUsedInReloc) - return true; + if (Symbol.isDefined()) + return true; + if (isUsedInReloc) return false; - } return true; } @@ -1013,58 +1003,36 @@ ELFObjectWriter::computeSymbolTable(MCAssembler &Asm, const MCAsmLayout &Layout, MCELF::SetBinding(Data, ELF::STB_GLOBAL); } - // Index 0 is always the empty string. 
- StringMap StringIndexMap; - StringTable += '\x00'; - - // FIXME: We could optimize suffixes in strtab in the same way we - // optimize them in shstrtab. - - for (MCAssembler::const_file_name_iterator it = Asm.file_names_begin(), - ie = Asm.file_names_end(); - it != ie; - ++it) { - StringRef Name = *it; - uint64_t &Entry = StringIndexMap[Name]; - if (!Entry) { - Entry = StringTable.size(); - StringTable += Name; - StringTable += '\x00'; - } - FileSymbolData.push_back(Entry); - } - // Add the data for the symbols. - for (MCAssembler::symbol_iterator it = Asm.symbol_begin(), - ie = Asm.symbol_end(); it != ie; ++it) { - const MCSymbol &Symbol = it->getSymbol(); + for (MCSymbolData &SD : Asm.symbols()) { + const MCSymbol &Symbol = SD.getSymbol(); bool Used = UsedInReloc.count(&Symbol); bool WeakrefUsed = WeakrefUsedInReloc.count(&Symbol); bool isSignature = RevGroupMap.count(&Symbol); - if (!isInSymtab(Asm, *it, + if (!isInSymtab(Layout, SD, Used || WeakrefUsed || isSignature, Renames.count(&Symbol))) continue; ELFSymbolData MSD; - MSD.SymbolData = it; - const MCSymbol *BaseSymbol = getBaseSymbol(Layout, Symbol); + MSD.SymbolData = &SD; + const MCSymbol *BaseSymbol = Layout.getBaseSymbol(Symbol); // Undefined symbols are global, but this is the first place we // are able to set it. - bool Local = isLocal(*it, isSignature, Used); - if (!Local && MCELF::GetBinding(*it) == ELF::STB_LOCAL) { + bool Local = isLocal(SD, Used); + if (!Local && MCELF::GetBinding(SD) == ELF::STB_LOCAL) { assert(BaseSymbol); - MCSymbolData &SD = Asm.getSymbolData(*BaseSymbol); - MCELF::SetBinding(*it, ELF::STB_GLOBAL); + MCSymbolData &BaseData = Asm.getSymbolData(*BaseSymbol); MCELF::SetBinding(SD, ELF::STB_GLOBAL); + MCELF::SetBinding(BaseData, ELF::STB_GLOBAL); } if (!BaseSymbol) { MSD.SectionIndex = ELF::SHN_ABS; - } else if (it->isCommon()) { + } else if (SD.isCommon()) { assert(!Local); MSD.SectionIndex = ELF::SHN_COMMON; } else if (BaseSymbol->isUndefined()) { @@ -1073,7 +1041,7 @@ ELFObjectWriter::computeSymbolTable(MCAssembler &Asm, const MCAsmLayout &Layout, else MSD.SectionIndex = ELF::SHN_UNDEF; if (!Used && WeakrefUsed) - MCELF::SetBinding(*it, ELF::STB_WEAK); + MCELF::SetBinding(SD, ELF::STB_WEAK); } else { const MCSectionELF &Section = static_cast(BaseSymbol->getSection()); @@ -1085,7 +1053,6 @@ ELFObjectWriter::computeSymbolTable(MCAssembler &Asm, const MCAsmLayout &Layout, // @@ in defined ones. 
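// A sketch of the @@@ rewrite the comment above describes (std::string for
// brevity; the real code builds the name into a SmallString):
#include <cassert>
#include <string>

// "foo@@@v1" marks v1 as the default version. In the emitted symtab the
// marker becomes "@@" for defined symbols and "@" for undefined ones.
std::string rewriteVersion(const std::string &Name, bool IsUndefined) {
  size_t Pos = Name.find("@@@");
  if (Pos == std::string::npos)
    return Name;
  size_t Skip = IsUndefined ? 2 : 1;
  return Name.substr(0, Pos) + Name.substr(Pos + Skip);
}

int main() {
  assert(rewriteVersion("foo@@@v1", false) == "foo@@v1"); // defined
  assert(rewriteVersion("foo@@@v1", true) == "foo@v1");   // undefined
}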
StringRef Name = Symbol.getName(); SmallString<32> Buf; - size_t Pos = Name.find("@@@"); if (Pos != StringRef::npos) { Buf += Name.substr(0, Pos); @@ -1093,14 +1060,8 @@ ELFObjectWriter::computeSymbolTable(MCAssembler &Asm, const MCAsmLayout &Layout, Buf += Name.substr(Pos + Skip); Name = Buf; } + MSD.Name = StrTabBuilder.add(Name); - uint64_t &Entry = StringIndexMap[Name]; - if (!Entry) { - Entry = StringTable.size(); - StringTable += Name; - StringTable += '\x00'; - } - MSD.StringIndex = Entry; if (MSD.SectionIndex == ELF::SHN_UNDEF) UndefinedSymbolData.push_back(MSD); else if (Local) @@ -1109,6 +1070,21 @@ ELFObjectWriter::computeSymbolTable(MCAssembler &Asm, const MCAsmLayout &Layout, ExternalSymbolData.push_back(MSD); } + for (auto i = Asm.file_names_begin(), e = Asm.file_names_end(); i != e; ++i) + StrTabBuilder.add(*i); + + StrTabBuilder.finalize(); + + for (auto i = Asm.file_names_begin(), e = Asm.file_names_end(); i != e; ++i) + FileSymbolData.push_back(StrTabBuilder.getOffset(*i)); + + for (ELFSymbolData& MSD : LocalSymbolData) + MSD.StringIndex = StrTabBuilder.getOffset(MSD.Name); + for (ELFSymbolData& MSD : ExternalSymbolData) + MSD.StringIndex = StrTabBuilder.getOffset(MSD.Name); + for (ELFSymbolData& MSD : UndefinedSymbolData) + MSD.StringIndex = StrTabBuilder.getOffset(MSD.Name); + // Symbols are required to be in lexicographic order. array_pod_sort(LocalSymbolData.begin(), LocalSymbolData.end()); array_pod_sort(ExternalSymbolData.begin(), ExternalSymbolData.end()); @@ -1168,6 +1144,151 @@ void ELFObjectWriter::CreateRelocationSections(MCAssembler &Asm, } } +static SmallVector +getUncompressedData(MCAsmLayout &Layout, + MCSectionData::FragmentListType &Fragments) { + SmallVector UncompressedData; + for (const MCFragment &F : Fragments) { + const SmallVectorImpl *Contents; + switch (F.getKind()) { + case MCFragment::FT_Data: + Contents = &cast(F).getContents(); + break; + case MCFragment::FT_Dwarf: + Contents = &cast(F).getContents(); + break; + case MCFragment::FT_DwarfFrame: + Contents = &cast(F).getContents(); + break; + default: + llvm_unreachable( + "Not expecting any other fragment types in a debug_* section"); + } + UncompressedData.append(Contents->begin(), Contents->end()); + } + return UncompressedData; +} + +// Include the debug info compression header: +// "ZLIB" followed by 8 bytes representing the uncompressed size of the section, +// useful for consumers to preallocate a buffer to decompress into. +static bool +prependCompressionHeader(uint64_t Size, + SmallVectorImpl &CompressedContents) { + static const StringRef Magic = "ZLIB"; + if (Size <= Magic.size() + sizeof(Size) + CompressedContents.size()) + return false; + if (sys::IsLittleEndianHost) + Size = sys::SwapByteOrder(Size); + CompressedContents.insert(CompressedContents.begin(), + Magic.size() + sizeof(Size), 0); + std::copy(Magic.begin(), Magic.end(), CompressedContents.begin()); + std::copy(reinterpret_cast(&Size), + reinterpret_cast(&Size + 1), + CompressedContents.begin() + Magic.size()); + return true; +} + +// Return a single fragment containing the compressed contents of the whole +// section. Null if the section was not compressed for any reason. +static std::unique_ptr +getCompressedFragment(MCAsmLayout &Layout, + MCSectionData::FragmentListType &Fragments) { + std::unique_ptr CompressedFragment(new MCDataFragment()); + + // Gather the uncompressed data from all the fragments, recording the + // alignment fragment, if seen, and any fixups. 
+ SmallVector UncompressedData = + getUncompressedData(Layout, Fragments); + + SmallVectorImpl &CompressedContents = CompressedFragment->getContents(); + + zlib::Status Success = zlib::compress( + StringRef(UncompressedData.data(), UncompressedData.size()), + CompressedContents); + if (Success != zlib::StatusOK) + return nullptr; + + if (!prependCompressionHeader(UncompressedData.size(), CompressedContents)) + return nullptr; + + return CompressedFragment; +} + +typedef DenseMap> +DefiningSymbolMap; + +static void UpdateSymbols(const MCAsmLayout &Layout, + const std::vector &Symbols, + MCFragment &NewFragment) { + for (MCSymbolData *Sym : Symbols) { + Sym->setOffset(Sym->getOffset() + + Layout.getFragmentOffset(Sym->getFragment())); + Sym->setFragment(&NewFragment); + } +} + +static void CompressDebugSection(MCAssembler &Asm, MCAsmLayout &Layout, + const DefiningSymbolMap &DefiningSymbols, + const MCSectionELF &Section, + MCSectionData &SD) { + StringRef SectionName = Section.getSectionName(); + MCSectionData::FragmentListType &Fragments = SD.getFragmentList(); + + std::unique_ptr CompressedFragment = + getCompressedFragment(Layout, Fragments); + + // Leave the section as-is if the fragments could not be compressed. + if (!CompressedFragment) + return; + + // Update the fragment+offsets of any symbols referring to fragments in this + // section to refer to the new fragment. + auto I = DefiningSymbols.find(&SD); + if (I != DefiningSymbols.end()) + UpdateSymbols(Layout, I->second, *CompressedFragment); + + // Invalidate the layout for the whole section since it will have new and + // different fragments now. + Layout.invalidateFragmentsFrom(&Fragments.front()); + Fragments.clear(); + + // Complete the initialization of the new fragment + CompressedFragment->setParent(&SD); + CompressedFragment->setLayoutOrder(0); + Fragments.push_back(CompressedFragment.release()); + + // Rename from .debug_* to .zdebug_* + Asm.getContext().renameELFSection(&Section, + (".z" + SectionName.drop_front(1)).str()); +} + +void ELFObjectWriter::CompressDebugSections(MCAssembler &Asm, + MCAsmLayout &Layout) { + if (!Asm.getContext().getAsmInfo()->compressDebugSections()) + return; + + DefiningSymbolMap DefiningSymbols; + + for (MCSymbolData &SD : Asm.symbols()) + if (MCFragment *F = SD.getFragment()) + DefiningSymbols[F->getParent()].push_back(&SD); + + for (MCSectionData &SD : Asm) { + const MCSectionELF &Section = + static_cast(SD.getSection()); + StringRef SectionName = Section.getSectionName(); + + // Compressing debug_frame requires handling alignment fragments which is + // more work (possibly generalizing MCAssembler.cpp:writeFragment to allow + // for writing to arbitrary buffers) for little benefit. 
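// A byte-level sketch of the header prependCompressionHeader builds (layout
// restated from the comment above; the test values are illustrative):
#include <cassert>
#include <cstdint>
#include <vector>

// 4-byte magic "ZLIB", then the uncompressed size as a big-endian uint64_t
// (hence the SwapByteOrder call on little-endian hosts above).
std::vector<uint8_t> makeHeader(uint64_t UncompressedSize) {
  std::vector<uint8_t> Header{'Z', 'L', 'I', 'B'};
  for (int Shift = 56; Shift >= 0; Shift -= 8) // most significant byte first
    Header.push_back(uint8_t(UncompressedSize >> Shift));
  return Header;
}

int main() {
  auto Header = makeHeader(0x1234);
  assert(Header.size() == 12);                      // 4 magic + 8 size bytes
  assert(Header[10] == 0x12 && Header[11] == 0x34); // big-endian tail
}

// Note the early return in prependCompressionHeader above: if magic + size
// field + compressed bytes would not beat the original section size, the
// section is left uncompressed.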
+ if (!SectionName.startswith(".debug_") || SectionName == ".debug_frame") + continue; + + CompressDebugSection(Asm, Layout, DefiningSymbols, Section, SD); + } +} + void ELFObjectWriter::WriteRelocations(MCAssembler &Asm, MCAsmLayout &Layout, const RelMapTy &RelMap) { for (MCAssembler::const_iterator it = Asm.begin(), @@ -1274,23 +1395,6 @@ void ELFObjectWriter::WriteRelocationsFragment(const MCAssembler &Asm, } } -static int compareBySuffix(const MCSectionELF *const *a, - const MCSectionELF *const *b) { - const StringRef &NameA = (*a)->getSectionName(); - const StringRef &NameB = (*b)->getSectionName(); - const unsigned sizeA = NameA.size(); - const unsigned sizeB = NameB.size(); - const unsigned len = std::min(sizeA, sizeB); - for (unsigned int i = 0; i < len; ++i) { - char ca = NameA[sizeA - i - 1]; - char cb = NameB[sizeB - i - 1]; - if (ca != cb) - return cb - ca; - } - - return sizeB - sizeA; -} - void ELFObjectWriter::CreateMetadataSections(MCAssembler &Asm, MCAsmLayout &Layout, SectionIndexMapTy &SectionIndexMap, @@ -1331,45 +1435,20 @@ void ELFObjectWriter::CreateMetadataSections(MCAssembler &Asm, WriteSymbolTable(F, Asm, Layout, SectionIndexMap); F = new MCDataFragment(&StrtabSD); - F->getContents().append(StringTable.begin(), StringTable.end()); + F->getContents().append(StrTabBuilder.data().begin(), + StrTabBuilder.data().end()); F = new MCDataFragment(&ShstrtabSD); - std::vector Sections; - for (MCAssembler::const_iterator it = Asm.begin(), - ie = Asm.end(); it != ie; ++it) { + // Section header string table. + for (auto it = Asm.begin(), ie = Asm.end(); it != ie; ++it) { const MCSectionELF &Section = static_cast(it->getSection()); - Sections.push_back(&Section); - } - array_pod_sort(Sections.begin(), Sections.end(), compareBySuffix); - - // Section header string table. - // - // The first entry of a string table holds a null character so skip - // section 0. - uint64_t Index = 1; - F->getContents().push_back('\x00'); - - for (unsigned int I = 0, E = Sections.size(); I != E; ++I) { - const MCSectionELF &Section = *Sections[I]; - - StringRef Name = Section.getSectionName(); - if (I != 0) { - StringRef PreviousName = Sections[I - 1]->getSectionName(); - if (PreviousName.endswith(Name)) { - SectionStringTableIndex[&Section] = Index - Name.size() - 1; - continue; - } - } - // Remember the index into the string table so we can write it - // into the sh_name field of the section header table. 
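// The hand-rolled suffix sharing being deleted above (sort sections by
// reversed name, then point a name at the tail of the previous entry) is
// what StringTableBuilder now encapsulates. A sketch of the effect, with
// illustrative offsets:
#include <cassert>
#include <string>

int main() {
  std::string Table("\0", 1);   // offset 0 holds the empty string
  Table += ".rela.text";        // emitted once...
  Table += '\0';
  size_t RelaOff = 1;           // ".rela.text" lives at offset 1
  size_t TextOff = RelaOff + 5; // ...and ".text" reuses its tail
  assert(Table.compare(TextOff, 5, ".text") == 0);
  assert(Table.size() == 12);   // one shared copy, not two strings
}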
- SectionStringTableIndex[&Section] = Index; - - Index += Name.size() + 1; - F->getContents().append(Name.begin(), Name.end()); - F->getContents().push_back('\x00'); + ShStrTabBuilder.add(Section.getSectionName()); } + ShStrTabBuilder.finalize(); + F->getContents().append(ShStrTabBuilder.data().begin(), + ShStrTabBuilder.data().end()); } void ELFObjectWriter::CreateIndexedSections(MCAssembler &Asm, @@ -1437,7 +1516,7 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm, switch(Section.getType()) { case ELF::SHT_DYNAMIC: - sh_link = SectionStringTableIndex[&Section]; + sh_link = ShStrTabBuilder.getOffset(Section.getSectionName()); sh_info = 0; break; @@ -1518,7 +1597,8 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm, } } - WriteSecHdrEntry(SectionStringTableIndex[&Section], Section.getType(), + WriteSecHdrEntry(ShStrTabBuilder.getOffset(Section.getSectionName()), + Section.getType(), Section.getFlags(), 0, Offset, Size, sh_link, sh_info, Alignment, Section.getEntrySize()); } @@ -1652,6 +1732,8 @@ void ELFObjectWriter::WriteObject(MCAssembler &Asm, unsigned NumUserSections = Asm.size(); + CompressDebugSections(Asm, const_cast(Layout)); + DenseMap RelMap; CreateRelocationSections(Asm, const_cast(Layout), RelMap); diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp index 9667145..c0777a6 100644 --- a/lib/MC/MCAsmInfo.cpp +++ b/lib/MC/MCAsmInfo.cpp @@ -61,8 +61,8 @@ MCAsmInfo::MCAsmInfo() { UsesELFSectionDirectiveForBSS = false; AlignmentIsInBytes = true; TextAlignFillValue = 0; - GPRel64Directive = 0; - GPRel32Directive = 0; + GPRel64Directive = nullptr; + GPRel32Directive = nullptr; GlobalDirective = "\t.globl\t"; HasSetDirective = true; HasAggressiveSymbolFolding = true; @@ -72,7 +72,7 @@ MCAsmInfo::MCAsmInfo() { HasSingleParameterDotFile = true; HasIdentDirective = false; HasNoDeadStrip = false; - WeakRefDirective = 0; + WeakRefDirective = nullptr; HasWeakDefDirective = false; HasWeakDefCanBeHiddenDirective = false; HasLinkOnceDirective = false; diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index 884ccf9..7f8ae54 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -9,6 +9,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCAsmBackend.h" @@ -31,6 +32,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/Path.h" #include +#include using namespace llvm; namespace { @@ -49,34 +51,24 @@ private: unsigned IsVerboseAsm : 1; unsigned ShowInst : 1; - unsigned UseCFI : 1; unsigned UseDwarfDirectory : 1; - enum EHSymbolFlags { EHGlobal = 1, - EHWeakDefinition = 1 << 1, - EHPrivateExtern = 1 << 2 }; - DenseMap FlagMap; - - DenseMap SymbolMap; - void EmitRegisterName(int64_t Register); void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override; void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override; public: MCAsmStreamer(MCContext &Context, formatted_raw_ostream &os, - bool isVerboseAsm, bool useCFI, bool useDwarfDirectory, + bool isVerboseAsm, bool useDwarfDirectory, MCInstPrinter *printer, MCCodeEmitter *emitter, MCAsmBackend *asmbackend, bool showInst) : MCStreamer(Context), OS(os), MAI(Context.getAsmInfo()), InstPrinter(printer), Emitter(emitter), AsmBackend(asmbackend), CommentStream(CommentToEmit), IsVerboseAsm(isVerboseAsm), - ShowInst(showInst), UseCFI(useCFI), - UseDwarfDirectory(useDwarfDirectory) { + ShowInst(showInst), UseDwarfDirectory(useDwarfDirectory) { if (InstPrinter && IsVerboseAsm) 
InstPrinter->setCommentStream(CommentStream); } - ~MCAsmStreamer() {} inline void EmitEOL() { // If we don't have any comments, just emit a \n. @@ -130,7 +122,6 @@ public: void EmitLabel(MCSymbol *Symbol) override; void EmitDebugLabel(MCSymbol *Symbol) override; - void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol) override; void EmitAssemblerFlag(MCAssemblerFlag Flag) override; void EmitLinkerOptions(ArrayRef Options) override; void EmitDataRegion(MCDataRegionType Kind) override; @@ -140,12 +131,6 @@ public: void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override; void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override; - void EmitDwarfAdvanceLineAddr(int64_t LineDelta, const MCSymbol *LastLabel, - const MCSymbol *Label, - unsigned PointerSize) override; - void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, - const MCSymbol *Label) override; - bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override; @@ -167,7 +152,7 @@ public: void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) override; - void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, + void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = nullptr, uint64_t Size = 0, unsigned ByteAlignment = 0) override; void EmitTBSSSymbol (const MCSection *Section, MCSymbol *Symbol, @@ -175,7 +160,8 @@ public: void EmitBytes(StringRef Data) override; - void EmitValueImpl(const MCExpr *Value, unsigned Size) override; + void EmitValueImpl(const MCExpr *Value, unsigned Size, + const SMLoc &Loc = SMLoc()) override; void EmitIntValue(uint64_t Value, unsigned Size) override; void EmitULEB128Value(const MCExpr *Value) override; @@ -254,8 +240,6 @@ public: void EmitRawTextImpl(StringRef String) override; void FinishImpl() override; - - virtual MCSymbolData &getOrCreateSymbolData(const MCSymbol *Symbol) override; }; } // end anonymous namespace. 
@@ -321,21 +305,6 @@ void MCAsmStreamer::ChangeSection(const MCSection *Section, Section->PrintSwitchToSection(*MAI, OS, Subsection); } -void MCAsmStreamer::EmitEHSymAttributes(const MCSymbol *Symbol, - MCSymbol *EHSymbol) { - if (UseCFI) - return; - - unsigned Flags = FlagMap.lookup(Symbol); - - if (Flags & EHGlobal) - EmitSymbolAttribute(EHSymbol, MCSA_Global); - if (Flags & EHWeakDefinition) - EmitSymbolAttribute(EHSymbol, MCSA_WeakDefinition); - if (Flags & EHPrivateExtern) - EmitSymbolAttribute(EHSymbol, MCSA_PrivateExtern); -} - void MCAsmStreamer::EmitLabel(MCSymbol *Symbol) { assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); MCStreamer::EmitLabel(Symbol); @@ -441,22 +410,6 @@ void MCAsmStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) { EmitEOL(); } -void MCAsmStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta, - const MCSymbol *LastLabel, - const MCSymbol *Label, - unsigned PointerSize) { - EmitDwarfSetLineAddr(LineDelta, Label, PointerSize); -} - -void MCAsmStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, - const MCSymbol *Label) { - EmitIntValue(dwarf::DW_CFA_advance_loc4, 1); - const MCExpr *AddrDelta = BuildSymbolDiff(getContext(), Label, LastLabel); - AddrDelta = ForceExpAbs(AddrDelta); - EmitValue(AddrDelta, 4); -} - - bool MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) { switch (Attribute) { @@ -486,7 +439,6 @@ bool MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol, return true; case MCSA_Global: // .globl/.global OS << MAI->getGlobalDirective(); - FlagMap[Symbol] |= EHGlobal; break; case MCSA_Hidden: OS << "\t.hidden\t"; break; case MCSA_IndirectSymbol: OS << "\t.indirect_symbol\t"; break; @@ -497,14 +449,12 @@ bool MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol, case MCSA_SymbolResolver: OS << "\t.symbol_resolver\t"; break; case MCSA_PrivateExtern: OS << "\t.private_extern\t"; - FlagMap[Symbol] |= EHPrivateExtern; break; case MCSA_Protected: OS << "\t.protected\t"; break; case MCSA_Reference: OS << "\t.reference\t"; break; case MCSA_Weak: OS << "\t.weak\t"; break; case MCSA_WeakDefinition: OS << "\t.weak_definition\t"; - FlagMap[Symbol] |= EHWeakDefinition; break; // .weak_reference case MCSA_WeakReference: OS << MAI->getWeakRefDirective(); break; @@ -560,7 +510,7 @@ void MCAsmStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) { void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) { // Common symbols do not belong to any actual section. - AssignSection(Symbol, NULL); + AssignSection(Symbol, nullptr); OS << "\t.comm\t" << *Symbol << ',' << Size; if (ByteAlignment != 0) { @@ -579,7 +529,7 @@ void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlign) { // Common symbols do not belong to any actual section. 
- AssignSection(Symbol, NULL); + AssignSection(Symbol, nullptr); OS << "\t.lcomm\t" << *Symbol << ',' << Size; if (ByteAlign > 1) { @@ -610,7 +560,7 @@ void MCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, const MCSectionMachO *MOSection = ((const MCSectionMachO*)Section); OS << MOSection->getSegmentName() << "," << MOSection->getSectionName(); - if (Symbol != NULL) { + if (Symbol) { OS << ',' << *Symbol << ',' << Size; if (ByteAlignment != 0) OS << ',' << Log2_32(ByteAlignment); @@ -625,7 +575,7 @@ void MCAsmStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) { AssignSection(Symbol, Section); - assert(Symbol != NULL && "Symbol shouldn't be NULL!"); + assert(Symbol && "Symbol shouldn't be NULL!"); // Instead of using the Section we'll just use the shortcut. // This is a mach-o specific directive and section. OS << ".tbss " << *Symbol << ", " << Size; @@ -702,11 +652,12 @@ void MCAsmStreamer::EmitIntValue(uint64_t Value, unsigned Size) { EmitValue(MCConstantExpr::Create(Value, getContext()), Size); } -void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size) { +void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, + const SMLoc &Loc) { assert(Size <= 8 && "Invalid size"); assert(getCurrentSection().first && "Cannot emit contents before setting section!"); - const char *Directive = 0; + const char *Directive = nullptr; switch (Size) { default: break; case 1: Directive = MAI->getData8bitsDirective(); break; @@ -775,13 +726,13 @@ void MCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) { } void MCAsmStreamer::EmitGPRel64Value(const MCExpr *Value) { - assert(MAI->getGPRel64Directive() != 0); + assert(MAI->getGPRel64Directive() != nullptr); OS << MAI->getGPRel64Directive() << *Value; EmitEOL(); } void MCAsmStreamer::EmitGPRel32Value(const MCExpr *Value) { - assert(MAI->getGPRel32Directive() != 0); + assert(MAI->getGPRel32Directive() != nullptr); OS << MAI->getGPRel32Directive() << *Value; EmitEOL(); } @@ -973,10 +924,6 @@ void MCAsmStreamer::EmitIdent(StringRef IdentString) { void MCAsmStreamer::EmitCFISections(bool EH, bool Debug) { MCStreamer::EmitCFISections(EH, Debug); - - if (!UseCFI) - return; - OS << "\t.cfi_sections "; if (EH) { OS << ".eh_frame"; @@ -990,11 +937,6 @@ void MCAsmStreamer::EmitCFISections(bool EH, bool Debug) { } void MCAsmStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) { - if (!UseCFI) { - RecordProcStart(Frame); - return; - } - OS << "\t.cfi_startproc"; if (Frame.IsSimple) OS << " simple"; @@ -1002,11 +944,6 @@ void MCAsmStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) { } void MCAsmStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) { - if (!UseCFI) { - RecordProcEnd(Frame); - return; - } - // Put a dummy non-null value in Frame.End to mark that this frame has been // closed. 
Frame.End = (MCSymbol *) 1; @@ -1027,10 +964,6 @@ void MCAsmStreamer::EmitRegisterName(int64_t Register) { void MCAsmStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) { MCStreamer::EmitCFIDefCfa(Register, Offset); - - if (!UseCFI) - return; - OS << "\t.cfi_def_cfa "; EmitRegisterName(Register); OS << ", " << Offset; @@ -1039,20 +972,12 @@ void MCAsmStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) { void MCAsmStreamer::EmitCFIDefCfaOffset(int64_t Offset) { MCStreamer::EmitCFIDefCfaOffset(Offset); - - if (!UseCFI) - return; - OS << "\t.cfi_def_cfa_offset " << Offset; EmitEOL(); } void MCAsmStreamer::EmitCFIDefCfaRegister(int64_t Register) { MCStreamer::EmitCFIDefCfaRegister(Register); - - if (!UseCFI) - return; - OS << "\t.cfi_def_cfa_register "; EmitRegisterName(Register); EmitEOL(); @@ -1060,10 +985,6 @@ void MCAsmStreamer::EmitCFIDefCfaRegister(int64_t Register) { void MCAsmStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) { this->MCStreamer::EmitCFIOffset(Register, Offset); - - if (!UseCFI) - return; - OS << "\t.cfi_offset "; EmitRegisterName(Register); OS << ", " << Offset; @@ -1073,50 +994,30 @@ void MCAsmStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) { void MCAsmStreamer::EmitCFIPersonality(const MCSymbol *Sym, unsigned Encoding) { MCStreamer::EmitCFIPersonality(Sym, Encoding); - - if (!UseCFI) - return; - OS << "\t.cfi_personality " << Encoding << ", " << *Sym; EmitEOL(); } void MCAsmStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) { MCStreamer::EmitCFILsda(Sym, Encoding); - - if (!UseCFI) - return; - OS << "\t.cfi_lsda " << Encoding << ", " << *Sym; EmitEOL(); } void MCAsmStreamer::EmitCFIRememberState() { MCStreamer::EmitCFIRememberState(); - - if (!UseCFI) - return; - OS << "\t.cfi_remember_state"; EmitEOL(); } void MCAsmStreamer::EmitCFIRestoreState() { MCStreamer::EmitCFIRestoreState(); - - if (!UseCFI) - return; - OS << "\t.cfi_restore_state"; EmitEOL(); } void MCAsmStreamer::EmitCFISameValue(int64_t Register) { MCStreamer::EmitCFISameValue(Register); - - if (!UseCFI) - return; - OS << "\t.cfi_same_value "; EmitRegisterName(Register); EmitEOL(); @@ -1124,10 +1025,6 @@ void MCAsmStreamer::EmitCFISameValue(int64_t Register) { void MCAsmStreamer::EmitCFIRelOffset(int64_t Register, int64_t Offset) { MCStreamer::EmitCFIRelOffset(Register, Offset); - - if (!UseCFI) - return; - OS << "\t.cfi_rel_offset "; EmitRegisterName(Register); OS << ", " << Offset; @@ -1136,50 +1033,30 @@ void MCAsmStreamer::EmitCFIRelOffset(int64_t Register, int64_t Offset) { void MCAsmStreamer::EmitCFIAdjustCfaOffset(int64_t Adjustment) { MCStreamer::EmitCFIAdjustCfaOffset(Adjustment); - - if (!UseCFI) - return; - OS << "\t.cfi_adjust_cfa_offset " << Adjustment; EmitEOL(); } void MCAsmStreamer::EmitCFISignalFrame() { MCStreamer::EmitCFISignalFrame(); - - if (!UseCFI) - return; - OS << "\t.cfi_signal_frame"; EmitEOL(); } void MCAsmStreamer::EmitCFIUndefined(int64_t Register) { MCStreamer::EmitCFIUndefined(Register); - - if (!UseCFI) - return; - OS << "\t.cfi_undefined " << Register; EmitEOL(); } void MCAsmStreamer::EmitCFIRegister(int64_t Register1, int64_t Register2) { MCStreamer::EmitCFIRegister(Register1, Register2); - - if (!UseCFI) - return; - OS << "\t.cfi_register " << Register1 << ", " << Register2; EmitEOL(); } void MCAsmStreamer::EmitCFIWindowSave() { MCStreamer::EmitCFIWindowSave(); - - if (!UseCFI) - return; - OS << "\t.cfi_window_save"; EmitEOL(); } @@ -1257,14 +1134,17 @@ void MCAsmStreamer::EmitWin64EHHandlerData() { void 
MCAsmStreamer::EmitWin64EHPushReg(unsigned Register) { MCStreamer::EmitWin64EHPushReg(Register); - OS << "\t.seh_pushreg " << Register; + OS << "\t.seh_pushreg "; + EmitRegisterName(Register); EmitEOL(); } void MCAsmStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) { MCStreamer::EmitWin64EHSetFrame(Register, Offset); - OS << "\t.seh_setframe " << Register << ", " << Offset; + OS << "\t.seh_setframe "; + EmitRegisterName(Register); + OS << ", " << Offset; EmitEOL(); } @@ -1278,14 +1158,18 @@ void MCAsmStreamer::EmitWin64EHAllocStack(unsigned Size) { void MCAsmStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) { MCStreamer::EmitWin64EHSaveReg(Register, Offset); - OS << "\t.seh_savereg " << Register << ", " << Offset; + OS << "\t.seh_savereg "; + EmitRegisterName(Register); + OS << ", " << Offset; EmitEOL(); } void MCAsmStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) { MCStreamer::EmitWin64EHSaveXMM(Register, Offset); - OS << "\t.seh_savexmm " << Register << ", " << Offset; + OS << "\t.seh_savexmm "; + EmitRegisterName(Register); + OS << ", " << Offset; EmitEOL(); } @@ -1455,26 +1339,13 @@ void MCAsmStreamer::FinishImpl() { EmitLabel(Label); } } - - if (!UseCFI) - EmitFrames(AsmBackend.get(), false); -} - -MCSymbolData &MCAsmStreamer::getOrCreateSymbolData(const MCSymbol *Symbol) { - MCSymbolData *&Entry = SymbolMap[Symbol]; - - if (!Entry) - Entry = new MCSymbolData(*Symbol, 0, 0, 0); - - return *Entry; } MCStreamer *llvm::createAsmStreamer(MCContext &Context, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useCFI, - bool useDwarfDirectory, MCInstPrinter *IP, - MCCodeEmitter *CE, MCAsmBackend *MAB, - bool ShowInst) { - return new MCAsmStreamer(Context, OS, isVerboseAsm, useCFI, useDwarfDirectory, - IP, CE, MAB, ShowInst); + bool isVerboseAsm, bool useDwarfDirectory, + MCInstPrinter *IP, MCCodeEmitter *CE, + MCAsmBackend *MAB, bool ShowInst) { + return new MCAsmStreamer(Context, OS, isVerboseAsm, useDwarfDirectory, IP, CE, + MAB, ShowInst); } diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index 724ca29..886a5f5 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "assembler" #include "llvm/MC/MCAssembler.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" @@ -28,12 +27,11 @@ #include "llvm/Support/LEB128.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Compression.h" -#include "llvm/Support/Host.h" - +#include using namespace llvm; +#define DEBUG_TYPE "assembler" + namespace { namespace stats { STATISTIC(EmittedFragments, "Number of emitted assembler fragments - total"); @@ -119,36 +117,89 @@ uint64_t MCAsmLayout::getFragmentOffset(const MCFragment *F) const { return F->Offset; } -uint64_t MCAsmLayout::getSymbolOffset(const MCSymbolData *SD) const { +// Simple getSymbolOffset helper for the non-variable case.
+static bool getLabelOffset(const MCAsmLayout &Layout, const MCSymbolData &SD, + bool ReportError, uint64_t &Val) { + if (!SD.getFragment()) { + if (ReportError) + report_fatal_error("unable to evaluate offset to undefined symbol '" + + SD.getSymbol().getName() + "'"); + return false; + } + Val = Layout.getFragmentOffset(SD.getFragment()) + SD.getOffset(); + return true; +} + +static bool getSymbolOffsetImpl(const MCAsmLayout &Layout, + const MCSymbolData *SD, bool ReportError, + uint64_t &Val) { const MCSymbol &S = SD->getSymbol(); - // If this is a variable, then recursively evaluate now. - if (S.isVariable()) { - MCValue Target; - if (!S.getVariableValue()->EvaluateAsRelocatable(Target, this)) - report_fatal_error("unable to evaluate offset for variable '" + - S.getName() + "'"); + if (!S.isVariable()) + return getLabelOffset(Layout, *SD, ReportError, Val); - // Verify that any used symbols are defined. - if (Target.getSymA() && Target.getSymA()->getSymbol().isUndefined()) - report_fatal_error("unable to evaluate offset to undefined symbol '" + - Target.getSymA()->getSymbol().getName() + "'"); - if (Target.getSymB() && Target.getSymB()->getSymbol().isUndefined()) - report_fatal_error("unable to evaluate offset to undefined symbol '" + - Target.getSymB()->getSymbol().getName() + "'"); - - uint64_t Offset = Target.getConstant(); - if (Target.getSymA()) - Offset += getSymbolOffset(&Assembler.getSymbolData( - Target.getSymA()->getSymbol())); - if (Target.getSymB()) - Offset -= getSymbolOffset(&Assembler.getSymbolData( - Target.getSymB()->getSymbol())); - return Offset; + // If SD is a variable, evaluate it. + MCValue Target; + if (!S.getVariableValue()->EvaluateAsValue(Target, &Layout)) + report_fatal_error("unable to evaluate offset for variable '" + + S.getName() + "'"); + + uint64_t Offset = Target.getConstant(); + + const MCAssembler &Asm = Layout.getAssembler(); + + const MCSymbolRefExpr *A = Target.getSymA(); + if (A) { + uint64_t ValA; + if (!getLabelOffset(Layout, Asm.getSymbolData(A->getSymbol()), ReportError, + ValA)) + return false; + Offset += ValA; } - assert(SD->getFragment() && "Invalid getOffset() on undefined symbol!"); - return getFragmentOffset(SD->getFragment()) + SD->getOffset(); + const MCSymbolRefExpr *B = Target.getSymB(); + if (B) { + uint64_t ValB; + if (!getLabelOffset(Layout, Asm.getSymbolData(B->getSymbol()), ReportError, + ValB)) + return false; + Offset -= ValB; + } + + Val = Offset; + return true; +} + +bool MCAsmLayout::getSymbolOffset(const MCSymbolData *SD, uint64_t &Val) const { + return getSymbolOffsetImpl(*this, SD, false, Val); +} + +uint64_t MCAsmLayout::getSymbolOffset(const MCSymbolData *SD) const { + uint64_t Val; + getSymbolOffsetImpl(*this, SD, true, Val); + return Val; +} + +const MCSymbol *MCAsmLayout::getBaseSymbol(const MCSymbol &Symbol) const { + if (!Symbol.isVariable()) + return &Symbol; + + const MCExpr *Expr = Symbol.getVariableValue(); + MCValue Value; + if (!Expr->EvaluateAsValue(Value, this)) + llvm_unreachable("Invalid Expression"); + + const MCSymbolRefExpr *RefB = Value.getSymB(); + if (RefB) + Assembler.getContext().FatalError( + SMLoc(), Twine("symbol '") + RefB->getSymbol().getName() + + "' could not be evaluated in a subtraction expression"); + + const MCSymbolRefExpr *A = Value.getSymA(); + if (!A) + return nullptr; + + return &A->getSymbol(); } uint64_t MCAsmLayout::getSectionAddressSize(const MCSectionData *SD) const { @@ -215,7 +266,7 @@ MCFragment::~MCFragment() { } MCFragment::MCFragment(FragmentType _Kind, MCSectionData 
*_Parent) - : Kind(_Kind), Parent(_Parent), Atom(0), Offset(~UINT64_C(0)) + : Kind(_Kind), Parent(_Parent), Atom(nullptr), Offset(~UINT64_C(0)) { if (Parent) Parent->getFragmentList().push_back(this); @@ -233,40 +284,7 @@ MCEncodedFragmentWithFixups::~MCEncodedFragmentWithFixups() { /* *** */ -const SmallVectorImpl &MCCompressedFragment::getCompressedContents() const { - assert(getParent()->size() == 1 && - "Only compress sections containing a single fragment"); - if (CompressedContents.empty()) { - std::unique_ptr CompressedSection; - zlib::Status Success = - zlib::compress(StringRef(getContents().data(), getContents().size()), - CompressedSection); - (void)Success; - assert(Success == zlib::StatusOK); - CompressedContents.push_back('Z'); - CompressedContents.push_back('L'); - CompressedContents.push_back('I'); - CompressedContents.push_back('B'); - uint64_t Size = getContents().size(); - if (sys::IsLittleEndianHost) - Size = sys::SwapByteOrder(Size); - CompressedContents.append(reinterpret_cast(&Size), - reinterpret_cast(&Size + 1)); - CompressedContents.append(CompressedSection->getBuffer().begin(), - CompressedSection->getBuffer().end()); - } - return CompressedContents; -} - -SmallVectorImpl &MCCompressedFragment::getContents() { - assert(CompressedContents.empty() && - "Fragment contents should not be altered after compression"); - return MCDataFragment::getContents(); -} - -/* *** */ - -MCSectionData::MCSectionData() : Section(0) {} +MCSectionData::MCSectionData() : Section(nullptr) {} MCSectionData::MCSectionData(const MCSection &_Section, MCAssembler *A) : Section(&_Section), @@ -286,7 +304,7 @@ MCSectionData::getSubsectionInsertionPoint(unsigned Subsection) { SmallVectorImpl >::iterator MI = std::lower_bound(SubsectionFragmentMap.begin(), SubsectionFragmentMap.end(), - std::make_pair(Subsection, (MCFragment *)0)); + std::make_pair(Subsection, (MCFragment *)nullptr)); bool ExactMatch = false; if (MI != SubsectionFragmentMap.end()) { ExactMatch = MI->first == Subsection; @@ -311,13 +329,13 @@ MCSectionData::getSubsectionInsertionPoint(unsigned Subsection) { /* *** */ -MCSymbolData::MCSymbolData() : Symbol(0) {} +MCSymbolData::MCSymbolData() : Symbol(nullptr) {} MCSymbolData::MCSymbolData(const MCSymbol &_Symbol, MCFragment *_Fragment, uint64_t _Offset, MCAssembler *A) : Symbol(&_Symbol), Fragment(_Fragment), Offset(_Offset), IsExternal(false), IsPrivateExtern(false), - CommonSize(0), SymbolSize(0), CommonAlign(0), + CommonSize(0), SymbolSize(nullptr), CommonAlign(0), Flags(0), Index(0) { if (A) @@ -358,6 +376,31 @@ void MCAssembler::reset() { getLOHContainer().reset(); } +bool MCAssembler::isThumbFunc(const MCSymbol *Symbol) const { + if (ThumbFuncs.count(Symbol)) + return true; + + if (!Symbol->isVariable()) + return false; + + // FIXME: It looks like gas supports some cases of the form "foo + 2". It + // is not clear if that is a bug or a feature. + const MCExpr *Expr = Symbol->getVariableValue(); + const MCSymbolRefExpr *Ref = dyn_cast(Expr); + if (!Ref) + return false; + + if (Ref->getKind() != MCSymbolRefExpr::VK_None) + return false; + + const MCSymbol &Sym = Ref->getSymbol(); + if (!isThumbFunc(&Sym)) + return false; + + ThumbFuncs.insert(Symbol); // Cache it. + return true; +} + bool MCAssembler::isSymbolLinkerVisible(const MCSymbol &Symbol) const { // Non-temporary labels should always be visible to the linker. 
if (!Symbol.isTemporary()) @@ -378,13 +421,13 @@ const MCSymbolData *MCAssembler::getAtom(const MCSymbolData *SD) const { // Absolute and undefined symbols have no defining atom. if (!SD->getFragment()) - return 0; + return nullptr; // Non-linker visible symbols in sections which can't be atomized have no // defining atom. if (!getBackend().isSectionAtomizable( SD->getFragment()->getParent()->getSection())) - return 0; + return nullptr; // Otherwise, return the atom for the containing fragment. return SD->getFragment()->getAtom(); @@ -467,8 +510,6 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, case MCFragment::FT_Relaxable: case MCFragment::FT_CompactEncodedInst: return cast(F).getContents().size(); - case MCFragment::FT_Compressed: - return cast(F).getCompressedContents().size(); case MCFragment::FT_Fill: return cast(F).getSize(); @@ -657,11 +698,6 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout, break; } - case MCFragment::FT_Compressed: - ++stats::EmittedDataFragments; - OW->WriteBytes(cast(F).getCompressedContents()); - break; - case MCFragment::FT_Data: ++stats::EmittedDataFragments; writeFragmentContents(F, OW); @@ -738,7 +774,6 @@ void MCAssembler::writeSectionData(const MCSectionData *SD, ie = SD->end(); it != ie; ++it) { switch (it->getKind()) { default: llvm_unreachable("Invalid fragment in virtual section!"); - case MCFragment::FT_Compressed: case MCFragment::FT_Data: { // Check that we aren't trying to write a non-zero contents (or fixups) // into a virtual section. This is to support clients which use standard @@ -992,7 +1027,7 @@ bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSectionData &SD) { // remain NULL if none were relaxed. // When a fragment is relaxed, all the fragments following it should get // invalidated because their offset is going to change. - MCFragment *FirstRelaxedFragment = NULL; + MCFragment *FirstRelaxedFragment = nullptr; // Attempt to relax all the fragments in the section. 
for (MCSectionData::iterator I = SD.begin(), IE = SD.end(); I != IE; ++I) { @@ -1070,8 +1105,6 @@ void MCFragment::dump() { switch (getKind()) { case MCFragment::FT_Align: OS << "MCAlignFragment"; break; case MCFragment::FT_Data: OS << "MCDataFragment"; break; - case MCFragment::FT_Compressed: - OS << "MCCompressedFragment"; break; case MCFragment::FT_CompactEncodedInst: OS << "MCCompactEncodedInstFragment"; break; case MCFragment::FT_Fill: OS << "MCFillFragment"; break; @@ -1098,7 +1131,6 @@ void MCFragment::dump() { << " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">"; break; } - case MCFragment::FT_Compressed: case MCFragment::FT_Data: { const MCDataFragment *DF = cast(this); OS << "\n "; diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp index 73ffdc0..c163268 100644 --- a/lib/MC/MCContext.cpp +++ b/lib/MC/MCContext.cpp @@ -29,19 +29,13 @@ using namespace llvm; -typedef std::pair SectionGroupPair; - -typedef StringMap MachOUniqueMapTy; -typedef std::map ELFUniqueMapTy; -typedef std::map COFFUniqueMapTy; - MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri, const MCObjectFileInfo *mofi, const SourceMgr *mgr, bool DoAutoReset) : SrcMgr(mgr), MAI(mai), MRI(mri), MOFI(mofi), Allocator(), Symbols(Allocator), UsedNames(Allocator), NextUniqueID(0), CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0), DwarfLocSeen(false), - GenDwarfForAssembly(false), GenDwarfFileNumber(0), + GenDwarfForAssembly(false), GenDwarfFileNumber(0), DwarfVersion(4), AllowTemporaryLabels(true), DwarfCompileUnitID(0), AutoReset(DoAutoReset) { @@ -49,12 +43,8 @@ MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri, if (EC) CompilationDir.clear(); - MachOUniquingMap = 0; - ELFUniquingMap = 0; - COFFUniquingMap = 0; - SecureLogFile = getenv("AS_SECURE_LOG_FILE"); - SecureLog = 0; + SecureLog = nullptr; SecureLogUsed = false; if (SrcMgr && SrcMgr->getNumBuffers() > 0) @@ -88,13 +78,9 @@ void MCContext::reset() { DwarfCompileUnitID = 0; CurrentDwarfLoc = MCDwarfLoc(0,0,0,DWARF2_FLAG_IS_STMT,0,0); - // If we have the MachO uniquing map, free it. - delete (MachOUniqueMapTy*)MachOUniquingMap; - delete (ELFUniqueMapTy*)ELFUniquingMap; - delete (COFFUniqueMapTy*)COFFUniquingMap; - MachOUniquingMap = 0; - ELFUniquingMap = 0; - COFFUniquingMap = 0; + MachOUniquingMap.clear(); + ELFUniquingMap.clear(); + COFFUniquingMap.clear(); NextUniqueID = 0; AllowTemporaryLabels = true; @@ -225,11 +211,6 @@ getMachOSection(StringRef Segment, StringRef Section, // may not have the same flags as the requested section, if so this should be // diagnosed by the client as an error. - // Create the map if it doesn't already exist. - if (MachOUniquingMap == 0) - MachOUniquingMap = new MachOUniqueMapTy(); - MachOUniqueMapTy &Map = *(MachOUniqueMapTy*)MachOUniquingMap; - // Form the name to look up. SmallString<64> Name; Name += Segment; @@ -237,7 +218,7 @@ getMachOSection(StringRef Segment, StringRef Section, Name += Section; // Do the lookup, if we have a hit, return it. - const MCSectionMachO *&Entry = Map[Name.str()]; + const MCSectionMachO *&Entry = MachOUniquingMap[Name.str()]; if (Entry) return Entry; // Otherwise, return a new section. 
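The rewritten section-uniquing code above (and the ELF/COFF variants that follow) all lean on the same map idiom: probe the map once with insert(), and construct the section only when the key was genuinely new. A minimal self-contained sketch of that idiom, using a plain std::map and hypothetical Section/getOrCreateSection/UniquingMap names rather than the real MC types:

    #include <map>
    #include <string>
    #include <utility>

    struct Section { std::string Name; };

    // A typed map held by value, mirroring the patch's move away from
    // lazily new'd maps hidden behind void* (types here are stand-ins).
    static std::map<std::string, Section *> UniquingMap;

    Section *getOrCreateSection(const std::string &Name) {
      // Single probe: insert a null placeholder, then fill it in only
      // if the key was not already present.
      auto IterBool = UniquingMap.insert(std::make_pair(Name, nullptr));
      if (!IterBool.second)
        return IterBool.first->second;   // Hit: return the cached section.
      Section *Result = new Section{IterBool.first->first};
      IterBool.first->second = Result;   // Miss: create once and cache.
      return Result;
    }

The real code goes one step further and has the new section reference the map's own key (the CachedName StringRef threaded through getELFSection and getCOFFSection), so the section name outlives any temporary string the caller passed in.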
@@ -251,42 +232,48 @@ getELFSection(StringRef Section, unsigned Type, unsigned Flags, return getELFSection(Section, Type, Flags, Kind, 0, ""); } +void MCContext::renameELFSection(const MCSectionELF *Section, StringRef Name) { + StringRef GroupName; + if (const MCSymbol *Group = Section->getGroup()) + GroupName = Group->getName(); + + ELFUniquingMap.erase(SectionGroupPair(Section->getSectionName(), GroupName)); + auto I = + ELFUniquingMap.insert(std::make_pair(SectionGroupPair(Name, GroupName), + Section)).first; + StringRef CachedName = I->first.first; + const_cast(Section)->setSectionName(CachedName); +} + const MCSectionELF *MCContext:: getELFSection(StringRef Section, unsigned Type, unsigned Flags, SectionKind Kind, unsigned EntrySize, StringRef Group) { - if (ELFUniquingMap == 0) - ELFUniquingMap = new ELFUniqueMapTy(); - ELFUniqueMapTy &Map = *(ELFUniqueMapTy*)ELFUniquingMap; - - SmallString<32> ZDebugName; - if (MAI->compressDebugSections() && Section.startswith(".debug_") && - Section != ".debug_frame" && Section != ".debug_line") - Section = (".z" + Section.drop_front(1)).toStringRef(ZDebugName); - // Do the lookup, if we have a hit, return it. - std::pair Entry = Map.insert( - std::make_pair(SectionGroupPair(Section, Group), (MCSectionELF *)0)); - if (!Entry.second) return Entry.first->second; + auto IterBool = ELFUniquingMap.insert( + std::make_pair(SectionGroupPair(Section, Group), nullptr)); + auto &Entry = *IterBool.first; + if (!IterBool.second) return Entry.second; // Possibly refine the entry size first. if (!EntrySize) { EntrySize = MCSectionELF::DetermineEntrySize(Kind); } - MCSymbol *GroupSym = NULL; + MCSymbol *GroupSym = nullptr; if (!Group.empty()) GroupSym = GetOrCreateSymbol(Group); - MCSectionELF *Result = new (*this) MCSectionELF( - Entry.first->first.first, Type, Flags, Kind, EntrySize, GroupSym); - Entry.first->second = Result; + StringRef CachedName = Entry.first.first; + MCSectionELF *Result = new (*this) + MCSectionELF(CachedName, Type, Flags, Kind, EntrySize, GroupSym); + Entry.second = Result; return Result; } const MCSectionELF *MCContext::CreateELFGroupSection() { MCSectionELF *Result = new (*this) MCSectionELF(".group", ELF::SHT_GROUP, 0, - SectionKind::getReadOnly(), 4, NULL); + SectionKind::getReadOnly(), 4, nullptr); return Result; } @@ -294,26 +281,21 @@ const MCSectionCOFF * MCContext::getCOFFSection(StringRef Section, unsigned Characteristics, SectionKind Kind, StringRef COMDATSymName, int Selection, const MCSectionCOFF *Assoc) { - if (COFFUniquingMap == 0) - COFFUniquingMap = new COFFUniqueMapTy(); - COFFUniqueMapTy &Map = *(COFFUniqueMapTy*)COFFUniquingMap; - // Do the lookup, if we have a hit, return it. 
SectionGroupPair P(Section, COMDATSymName); - std::pair Entry = - Map.insert(std::make_pair(P, (MCSectionCOFF *)0)); - COFFUniqueMapTy::iterator Iter = Entry.first; - if (!Entry.second) + auto IterBool = COFFUniquingMap.insert(std::make_pair(P, nullptr)); + auto Iter = IterBool.first; + if (!IterBool.second) return Iter->second; - const MCSymbol *COMDATSymbol = NULL; + const MCSymbol *COMDATSymbol = nullptr; if (!COMDATSymName.empty()) COMDATSymbol = GetOrCreateSymbol(COMDATSymName); - MCSectionCOFF *Result = - new (*this) MCSectionCOFF(Iter->first.first, Characteristics, - COMDATSymbol, Selection, Assoc, Kind); + StringRef CachedName = Iter->first.first; + MCSectionCOFF *Result = new (*this) MCSectionCOFF( + CachedName, Characteristics, COMDATSymbol, Selection, Assoc, Kind); Iter->second = Result; return Result; @@ -326,14 +308,10 @@ MCContext::getCOFFSection(StringRef Section, unsigned Characteristics, } const MCSectionCOFF *MCContext::getCOFFSection(StringRef Section) { - if (COFFUniquingMap == 0) - COFFUniquingMap = new COFFUniqueMapTy(); - COFFUniqueMapTy &Map = *(COFFUniqueMapTy*)COFFUniquingMap; - SectionGroupPair P(Section, ""); - COFFUniqueMapTy::iterator Iter = Map.find(P); - if (Iter == Map.end()) - return 0; + auto Iter = COFFUniquingMap.find(P); + if (Iter == COFFUniquingMap.end()) + return nullptr; return Iter->second; } @@ -361,7 +339,7 @@ bool MCContext::isValidDwarfFileNumber(unsigned FileNumber, unsigned CUID) { return !MCDwarfFiles[FileNumber].Name.empty(); } -void MCContext::FatalError(SMLoc Loc, const Twine &Msg) { +void MCContext::FatalError(SMLoc Loc, const Twine &Msg) const { // If we have a source manager and a location, use it. Otherwise just // use the generic report_fatal_error(). if (!SrcMgr || Loc == SMLoc()) diff --git a/lib/MC/MCDisassembler.cpp b/lib/MC/MCDisassembler.cpp index 7a2b1a1..77d9ce1 100644 --- a/lib/MC/MCDisassembler.cpp +++ b/lib/MC/MCDisassembler.cpp @@ -16,20 +16,6 @@ using namespace llvm; MCDisassembler::~MCDisassembler() { } -void MCDisassembler::setupForSymbolicDisassembly( - LLVMOpInfoCallback GetOpInfo, LLVMSymbolLookupCallback SymbolLookUp, - void *DisInfo, MCContext *Ctx, std::unique_ptr &RelInfo) { - this->GetOpInfo = GetOpInfo; - this->SymbolLookUp = SymbolLookUp; - this->DisInfo = DisInfo; - this->Ctx = Ctx; - assert(Ctx != 0 && "No MCContext given for symbolic disassembly"); - if (!Symbolizer) - Symbolizer.reset(new MCExternalSymbolizer(*Ctx, std::move(RelInfo), - GetOpInfo, SymbolLookUp, - DisInfo)); -} - bool MCDisassembler::tryAddingSymbolicOperand(MCInst &Inst, int64_t Value, uint64_t Address, bool IsBranch, uint64_t Offset, diff --git a/lib/MC/MCDisassembler/Disassembler.cpp b/lib/MC/MCDisassembler/Disassembler.cpp index b935b83..0530c26 100644 --- a/lib/MC/MCDisassembler/Disassembler.cpp +++ b/lib/MC/MCDisassembler/Disassembler.cpp @@ -41,20 +41,20 @@ LLVMDisasmContextRef LLVMCreateDisasmCPU(const char *Triple, const char *CPU, std::string Error; const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error); if (!TheTarget) - return 0; + return nullptr; const MCRegisterInfo *MRI = TheTarget->createMCRegInfo(Triple); if (!MRI) - return 0; + return nullptr; // Get the assembler info needed to setup the MCContext. 
const MCAsmInfo *MAI = TheTarget->createMCAsmInfo(*MRI, Triple); if (!MAI) - return 0; + return nullptr; const MCInstrInfo *MII = TheTarget->createMCInstrInfo(); if (!MII) - return 0; + return nullptr; // Package up features to be passed to target/subtarget std::string FeaturesStr; @@ -62,41 +62,40 @@ LLVMDisasmContextRef LLVMCreateDisasmCPU(const char *Triple, const char *CPU, const MCSubtargetInfo *STI = TheTarget->createMCSubtargetInfo(Triple, CPU, FeaturesStr); if (!STI) - return 0; + return nullptr; // Set up the MCContext for creating symbols and MCExpr's. - MCContext *Ctx = new MCContext(MAI, MRI, 0); + MCContext *Ctx = new MCContext(MAI, MRI, nullptr); if (!Ctx) - return 0; + return nullptr; // Set up disassembler. - MCDisassembler *DisAsm = TheTarget->createMCDisassembler(*STI); + MCDisassembler *DisAsm = TheTarget->createMCDisassembler(*STI, *Ctx); if (!DisAsm) - return 0; + return nullptr; std::unique_ptr RelInfo( TheTarget->createMCRelocationInfo(Triple, *Ctx)); if (!RelInfo) - return 0; + return nullptr; std::unique_ptr Symbolizer(TheTarget->createMCSymbolizer( Triple, GetOpInfo, SymbolLookUp, DisInfo, Ctx, RelInfo.release())); DisAsm->setSymbolizer(std::move(Symbolizer)); - DisAsm->setupForSymbolicDisassembly(GetOpInfo, SymbolLookUp, DisInfo, - Ctx, RelInfo); + // Set up the instruction printer. int AsmPrinterVariant = MAI->getAssemblerDialect(); MCInstPrinter *IP = TheTarget->createMCInstPrinter(AsmPrinterVariant, *MAI, *MII, *MRI, *STI); if (!IP) - return 0; + return nullptr; LLVMDisasmContext *DC = new LLVMDisasmContext(Triple, DisInfo, TagType, GetOpInfo, SymbolLookUp, TheTarget, MAI, MRI, STI, MII, Ctx, DisAsm, IP); if (!DC) - return 0; + return nullptr; DC->setCPU(CPU); return DC; diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp index 72836ff..be6731a 100644 --- a/lib/MC/MCDwarf.cpp +++ b/lib/MC/MCDwarf.cpp @@ -9,6 +9,7 @@ #include "llvm/MC/MCDwarf.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Twine.h" #include "llvm/Config/config.h" @@ -16,8 +17,8 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -62,7 +63,7 @@ static inline uint64_t ScaleAddrDelta(MCContext &Context, uint64_t AddrDelta) { // and if there is information from the last .loc directive that has yet to have // a line entry made for it is made. // -void MCLineEntry::Make(MCStreamer *MCOS, const MCSection *Section) { +void MCLineEntry::Make(MCObjectStreamer *MCOS, const MCSection *Section) { if (!MCOS->getContext().getDwarfLocSeen()) return; @@ -113,7 +114,7 @@ static inline const MCExpr *MakeStartMinusEndExpr(const MCStreamer &MCOS, // in the LineSection. // static inline void -EmitDwarfLineTable(MCStreamer *MCOS, const MCSection *Section, +EmitDwarfLineTable(MCObjectStreamer *MCOS, const MCSection *Section, const MCLineSection::MCLineEntryCollection &LineEntries) { unsigned FileNum = 1; unsigned LastLine = 1; @@ -121,7 +122,7 @@ EmitDwarfLineTable(MCStreamer *MCOS, const MCSection *Section, unsigned Flags = DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0; unsigned Isa = 0; unsigned Discriminator = 0; - MCSymbol *LastLabel = NULL; + MCSymbol *LastLabel = nullptr; // Loop through each MCLineEntry and encode the dwarf line number table. 
for (auto it = LineEntries.begin(), @@ -204,7 +205,7 @@ EmitDwarfLineTable(MCStreamer *MCOS, const MCSection *Section, // // This emits the Dwarf file and the line tables. // -void MCDwarfLineTable::Emit(MCStreamer *MCOS) { +void MCDwarfLineTable::Emit(MCObjectStreamer *MCOS) { MCContext &context = MCOS->getContext(); auto &LineTables = context.getMCDwarfLineTables(); @@ -318,7 +319,7 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, return std::make_pair(LineStartSym, LineEndSym); } -void MCDwarfLineTable::EmitCU(MCStreamer *MCOS) const { +void MCDwarfLineTable::EmitCU(MCObjectStreamer *MCOS) const { MCSymbol *LineEndSym = Header.Emit(MCOS).second; // Put out the line tables. @@ -644,8 +645,8 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS, const MCExpr *Length = MakeStartMinusEndExpr(*MCOS, *InfoStart, *InfoEnd, 4); MCOS->EmitAbsValue(Length, 4); - // The 2 byte DWARF version, which is 2. - MCOS->EmitIntValue(2, 2); + // The 2 byte DWARF version. + MCOS->EmitIntValue(context.getDwarfVersion(), 2); // The 4 byte offset to the debug abbrevs from the start of the .debug_abbrev, // it is at the start of that section so this is zero. @@ -688,7 +689,7 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS, const SmallVectorImpl &MCDwarfDirs = context.getMCDwarfDirs(); if (MCDwarfDirs.size() > 0) { MCOS->EmitBytes(MCDwarfDirs[0]); - MCOS->EmitBytes("/"); + MCOS->EmitBytes(sys::path::get_separator()); } const SmallVectorImpl &MCDwarfFiles = MCOS->getContext().getMCDwarfFiles(); @@ -727,28 +728,24 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS, // Third part: the list of label DIEs. // Loop on saved info for dwarf labels and create the DIEs for them. - const std::vector &Entries = - MCOS->getContext().getMCGenDwarfLabelEntries(); - for (std::vector::const_iterator it = - Entries.begin(), ie = Entries.end(); it != ie; - ++it) { - const MCGenDwarfLabelEntry *Entry = *it; - + const std::vector &Entries = + MCOS->getContext().getMCGenDwarfLabelEntries(); + for (const auto &Entry : Entries) { // The DW_TAG_label DIE abbrev (2). MCOS->EmitULEB128IntValue(2); // AT_name, of the label without any leading underbar. - MCOS->EmitBytes(Entry->getName()); + MCOS->EmitBytes(Entry.getName()); MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string. // AT_decl_file, index into the file table. - MCOS->EmitIntValue(Entry->getFileNumber(), 4); + MCOS->EmitIntValue(Entry.getFileNumber(), 4); // AT_decl_line, source line number. - MCOS->EmitIntValue(Entry->getLineNumber(), 4); + MCOS->EmitIntValue(Entry.getLineNumber(), 4); // AT_low_pc, start address of the label. - const MCExpr *AT_low_pc = MCSymbolRefExpr::Create(Entry->getLabel(), + const MCExpr *AT_low_pc = MCSymbolRefExpr::Create(Entry.getLabel(), MCSymbolRefExpr::VK_None, context); MCOS->EmitValue(AT_low_pc, AddrSize); @@ -761,14 +758,6 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS, // Add the NULL DIE terminating the DW_TAG_unspecified_parameters DIE's. MCOS->EmitIntValue(0, 1); } - // Deallocate the MCGenDwarfLabelEntry classes that saved away the info - // for the dwarf labels. - for (std::vector::const_iterator it = - Entries.begin(), ie = Entries.end(); it != ie; - ++it) { - const MCGenDwarfLabelEntry *Entry = *it; - delete Entry; - } // Add the NULL DIE terminating the Compile Unit DIE's. 
MCOS->EmitIntValue(0, 1); @@ -790,8 +779,8 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) { MCSymbol *LineSectionSymbol = nullptr; if (CreateDwarfSectionSymbols) LineSectionSymbol = MCOS->getDwarfLineTableSymbol(0); - MCSymbol *AbbrevSectionSymbol = NULL; - MCSymbol *InfoSectionSymbol = NULL; + MCSymbol *AbbrevSectionSymbol = nullptr; + MCSymbol *InfoSectionSymbol = nullptr; MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection()); if (CreateDwarfSectionSymbols) { InfoSectionSymbol = context.CreateTempSymbol(); @@ -856,9 +845,8 @@ void MCGenDwarfLabelEntry::Make(MCSymbol *Symbol, MCStreamer *MCOS, MCOS->EmitLabel(Label); // Create an entry for the info and add it to the other entries. - MCGenDwarfLabelEntry *Entry = - new MCGenDwarfLabelEntry(Name, FileNumber, LineNumber, Label); - MCOS->getContext().addMCGenDwarfLabelEntry(Entry); + MCOS->getContext().addMCGenDwarfLabelEntry( + MCGenDwarfLabelEntry(Name, FileNumber, LineNumber, Label)); } static int getDataAlignmentFactor(MCStreamer &streamer) { @@ -894,7 +882,7 @@ static unsigned getSizeForEncoding(MCStreamer &streamer, static void EmitFDESymbol(MCStreamer &streamer, const MCSymbol &symbol, unsigned symbolEncoding, bool isEH, - const char *comment = 0) { + const char *comment = nullptr) { MCContext &context = streamer.getContext(); const MCAsmInfo *asmInfo = context.getAsmInfo(); const MCExpr *v = asmInfo->getExprForFDESymbol(&symbol, @@ -923,13 +911,11 @@ namespace { class FrameEmitterImpl { int CFAOffset; int CIENum; - bool UsingCFI; bool IsEH; const MCSymbol *SectionStart; public: - FrameEmitterImpl(bool usingCFI, bool isEH) - : CFAOffset(0), CIENum(0), UsingCFI(usingCFI), IsEH(isEH), - SectionStart(0) {} + FrameEmitterImpl(bool isEH) + : CFAOffset(0), CIENum(0), IsEH(isEH), SectionStart(nullptr) {} void setSectionStart(const MCSymbol *Label) { SectionStart = Label; } @@ -937,20 +923,20 @@ namespace { void EmitCompactUnwind(MCStreamer &streamer, const MCDwarfFrameInfo &frame); - const MCSymbol &EmitCIE(MCStreamer &streamer, + const MCSymbol &EmitCIE(MCObjectStreamer &streamer, const MCSymbol *personality, unsigned personalityEncoding, const MCSymbol *lsda, bool IsSignalFrame, unsigned lsdaEncoding, bool IsSimple); - MCSymbol *EmitFDE(MCStreamer &streamer, + MCSymbol *EmitFDE(MCObjectStreamer &streamer, const MCSymbol &cieStart, const MCDwarfFrameInfo &frame); - void EmitCFIInstructions(MCStreamer &streamer, + void EmitCFIInstructions(MCObjectStreamer &streamer, ArrayRef<MCCFIInstruction> Instrs, MCSymbol *BaseLabel); - void EmitCFIInstruction(MCStreamer &Streamer, + void EmitCFIInstruction(MCObjectStreamer &Streamer, const MCCFIInstruction &Instr); }; @@ -1001,7 +987,7 @@ static void EmitEncodingByte(MCStreamer &Streamer, unsigned Encoding, Streamer.EmitIntValue(Encoding, 1); } -void FrameEmitterImpl::EmitCFIInstruction(MCStreamer &Streamer, +void FrameEmitterImpl::EmitCFIInstruction(MCObjectStreamer &Streamer, const MCCFIInstruction &Instr) { int dataAlignmentFactor = getDataAlignmentFactor(Streamer); bool VerboseAsm = Streamer.isVerboseAsm(); @@ -1153,7 +1139,7 @@ void FrameEmitterImpl::EmitCFIInstruction(MCStreamer &Streamer, /// EmitFrameMoves - Emit frame instructions to describe the layout of the frame.
-void FrameEmitterImpl::EmitCFIInstructions(MCStreamer &streamer, +void FrameEmitterImpl::EmitCFIInstructions(MCObjectStreamer &streamer, ArrayRef Instrs, MCSymbol *BaseLabel) { for (unsigned i = 0, N = Instrs.size(); i < N; ++i) { @@ -1214,7 +1200,7 @@ void FrameEmitterImpl::EmitCompactUnwind(MCStreamer &Streamer, Encoding |= 0x40000000; // Range Start - unsigned FDEEncoding = MOFI->getFDEEncoding(UsingCFI); + unsigned FDEEncoding = MOFI->getFDEEncoding(); unsigned Size = getSizeForEncoding(Streamer, FDEEncoding); if (VerboseAsm) Streamer.AddComment("Range Start"); Streamer.EmitSymbolValue(Frame.Function, Size); @@ -1248,7 +1234,7 @@ void FrameEmitterImpl::EmitCompactUnwind(MCStreamer &Streamer, Streamer.EmitIntValue(0, Size); // No LSDA } -const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer, +const MCSymbol &FrameEmitterImpl::EmitCIE(MCObjectStreamer &streamer, const MCSymbol *personality, unsigned personalityEncoding, const MCSymbol *lsda, @@ -1346,8 +1332,7 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer, EmitEncodingByte(streamer, lsdaEncoding, "LSDA Encoding"); // Encoding of the FDE pointers - EmitEncodingByte(streamer, MOFI->getFDEEncoding(UsingCFI), - "FDE Encoding"); + EmitEncodingByte(streamer, MOFI->getFDEEncoding(), "FDE Encoding"); } // Initial Instructions @@ -1356,7 +1341,7 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer, if (!IsSimple) { const std::vector &Instructions = MAI->getInitialFrameState(); - EmitCFIInstructions(streamer, Instructions, NULL); + EmitCFIInstructions(streamer, Instructions, nullptr); } // Padding @@ -1366,7 +1351,7 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer, return *sectionStart; } -MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer, +MCSymbol *FrameEmitterImpl::EmitFDE(MCObjectStreamer &streamer, const MCSymbol &cieStart, const MCDwarfFrameInfo &frame) { MCContext &context = streamer.getContext(); @@ -1405,8 +1390,8 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer, } // PC Begin - unsigned PCEncoding = IsEH ? MOFI->getFDEEncoding(UsingCFI) - : (unsigned)dwarf::DW_EH_PE_absptr; + unsigned PCEncoding = + IsEH ? MOFI->getFDEEncoding() : (unsigned)dwarf::DW_EH_PE_absptr; unsigned PCSize = getSizeForEncoding(streamer, PCEncoding); EmitFDESymbol(streamer, *frame.Begin, PCEncoding, IsEH, "FDE initial location"); @@ -1443,8 +1428,12 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer, namespace { struct CIEKey { - static const CIEKey getEmptyKey() { return CIEKey(0, 0, -1, false, false); } - static const CIEKey getTombstoneKey() { return CIEKey(0, -1, 0, false, false); } + static const CIEKey getEmptyKey() { + return CIEKey(nullptr, 0, -1, false, false); + } + static const CIEKey getTombstoneKey() { + return CIEKey(nullptr, -1, 0, false, false); + } CIEKey(const MCSymbol* Personality_, unsigned PersonalityEncoding_, unsigned LsdaEncoding_, bool IsSignalFrame_, bool IsSimple_) : @@ -1487,13 +1476,13 @@ namespace llvm { }; } -void MCDwarfFrameEmitter::Emit(MCStreamer &Streamer, MCAsmBackend *MAB, - bool UsingCFI, bool IsEH) { +void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB, + bool IsEH) { Streamer.generateCompactUnwindEncodings(MAB); MCContext &Context = Streamer.getContext(); const MCObjectFileInfo *MOFI = Context.getObjectFileInfo(); - FrameEmitterImpl Emitter(UsingCFI, IsEH); + FrameEmitterImpl Emitter(IsEH); ArrayRef FrameArray = Streamer.getFrameInfos(); // Emit the compact unwind info if available. 
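The CIEKey changes above only swap 0 for nullptr, but they sit on top of a pattern worth spelling out: DenseMap requires every key type to supply two reserved sentinel values (empty and tombstone) plus hashing and equality, via a DenseMapInfo specialization like the one this file provides for CIEKey. A rough sketch with a made-up two-field key (MyKey and its hash are illustrative, not the patch's actual CIEKey traits):

    #include "llvm/ADT/DenseMap.h"
    #include <cstdint>

    struct MyKey {
      const void *Ptr;
      unsigned Encoding;
    };

    namespace llvm {
    template <> struct DenseMapInfo<MyKey> {
      // Two values no real key may take, analogous to CIEKey's
      // (nullptr, 0, -1, ...) and (nullptr, -1, 0, ...) sentinels.
      static MyKey getEmptyKey() { return MyKey{nullptr, ~0u}; }
      static MyKey getTombstoneKey() { return MyKey{nullptr, ~0u - 1}; }
      static unsigned getHashValue(const MyKey &K) {
        return unsigned(reinterpret_cast<uintptr_t>(K.Ptr) * 37) ^ K.Encoding;
      }
      static bool isEqual(const MyKey &LHS, const MyKey &RHS) {
        return LHS.Ptr == RHS.Ptr && LHS.Encoding == RHS.Encoding;
      }
    };
    } // namespace llvm

    // With the traits in place, llvm::DenseMap<MyKey, int> works like any
    // other map: it hashes with getHashValue and compares with isEqual.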
@@ -1526,10 +1515,10 @@ void MCDwarfFrameEmitter::Emit(MCStreamer &Streamer, MCAsmBackend *MAB, Streamer.EmitLabel(SectionStart); Emitter.setSectionStart(SectionStart); - MCSymbol *FDEEnd = NULL; + MCSymbol *FDEEnd = nullptr; DenseMap CIEStarts; - const MCSymbol *DummyDebugKey = NULL; + const MCSymbol *DummyDebugKey = nullptr; NeedsEHFrameSection = !MOFI->getSupportsCompactUnwindWithoutEHFrame(); for (unsigned i = 0, n = FrameArray.size(); i < n; ++i) { const MCDwarfFrameInfo &Frame = FrameArray[i]; @@ -1537,7 +1526,7 @@ void MCDwarfFrameEmitter::Emit(MCStreamer &Streamer, MCAsmBackend *MAB, // Emit the label from the previous iteration if (FDEEnd) { Streamer.EmitLabel(FDEEnd); - FDEEnd = NULL; + FDEEnd = nullptr; } if (!NeedsEHFrameSection && Frame.CompactUnwindEncoding != @@ -1564,7 +1553,7 @@ void MCDwarfFrameEmitter::Emit(MCStreamer &Streamer, MCAsmBackend *MAB, Streamer.EmitLabel(FDEEnd); } -void MCDwarfFrameEmitter::EmitAdvanceLoc(MCStreamer &Streamer, +void MCDwarfFrameEmitter::EmitAdvanceLoc(MCObjectStreamer &Streamer, uint64_t AddrDelta) { MCContext &Context = Streamer.getContext(); SmallString<256> Tmp; diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp index f710c3e..767348c 100644 --- a/lib/MC/MCELFStreamer.cpp +++ b/lib/MC/MCELFStreamer.cpp @@ -275,11 +275,12 @@ void MCELFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, EmitCommonSymbol(Symbol, Size, ByteAlignment); } -void MCELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size) { +void MCELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, + const SMLoc &Loc) { if (getCurrentSectionData()->isBundleLocked()) report_fatal_error("Emitting values inside a locked bundle is forbidden"); fixSymbolsInTLSFixups(Value); - MCObjectStreamer::EmitValueImpl(Value, Size); + MCObjectStreamer::EmitValueImpl(Value, Size, Loc); } void MCELFStreamer::EmitValueToAlignment(unsigned ByteAlignment, @@ -537,7 +538,7 @@ void MCELFStreamer::Flush() { } void MCELFStreamer::FinishImpl() { - EmitFrames(NULL, true); + EmitFrames(nullptr); Flush(); @@ -559,10 +560,6 @@ void MCELFStreamer::EmitThumbFunc(MCSymbol *Func) { llvm_unreachable("Generic ELF doesn't support this directive"); } -MCSymbolData &MCELFStreamer::getOrCreateSymbolData(const MCSymbol *Symbol) { - return getAssembler().getOrCreateSymbolData(*Symbol); -} - void MCELFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) { llvm_unreachable("ELF doesn't support this directive"); } diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp index 7f2c478..f724716 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mcexpr" #include "llvm/MC/MCExpr.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringSwitch.h" @@ -23,6 +22,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "mcexpr" + namespace { namespace stats { STATISTIC(MCExprEvaluate, "Number of MCExpr evaluations"); @@ -270,6 +271,8 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_Mips_GOT_LO16: return "GOT_LO16"; case VK_Mips_CALL_HI16: return "CALL_HI16"; case VK_Mips_CALL_LO16: return "CALL_LO16"; + case VK_Mips_PCREL_HI16: return "PCREL_HI16"; + case VK_Mips_PCREL_LO16: return "PCREL_LO16"; case VK_COFF_IMGREL32: return "IMGREL32"; } llvm_unreachable("Invalid variant kind"); @@ -284,6 +287,8 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) { .Case("gotoff", VK_GOTOFF) .Case("GOTPCREL", 
VK_GOTPCREL) .Case("gotpcrel", VK_GOTPCREL) + .Case("GOT_PREL", VK_GOTPCREL) + .Case("got_prel", VK_GOTPCREL) .Case("GOTTPOFF", VK_GOTTPOFF) .Case("gottpoff", VK_GOTTPOFF) .Case("INDNTPOFF", VK_INDNTPOFF) @@ -444,12 +449,12 @@ void MCTargetExpr::anchor() {} /* *** */ bool MCExpr::EvaluateAsAbsolute(int64_t &Res) const { - return EvaluateAsAbsolute(Res, 0, 0, 0); + return EvaluateAsAbsolute(Res, nullptr, nullptr, nullptr); } bool MCExpr::EvaluateAsAbsolute(int64_t &Res, const MCAsmLayout &Layout) const { - return EvaluateAsAbsolute(Res, &Layout.getAssembler(), &Layout, 0); + return EvaluateAsAbsolute(Res, &Layout.getAssembler(), &Layout, nullptr); } bool MCExpr::EvaluateAsAbsolute(int64_t &Res, @@ -459,7 +464,7 @@ bool MCExpr::EvaluateAsAbsolute(int64_t &Res, } bool MCExpr::EvaluateAsAbsolute(int64_t &Res, const MCAssembler &Asm) const { - return EvaluateAsAbsolute(Res, &Asm, 0, 0); + return EvaluateAsAbsolute(Res, &Asm, nullptr, nullptr); } bool MCExpr::EvaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm, @@ -477,7 +482,8 @@ bool MCExpr::EvaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm, // absolutize differences across sections and that is what the MachO writer // uses Addrs for. bool IsRelocatable = - EvaluateAsRelocatableImpl(Value, Asm, Layout, Addrs, /*InSet*/ Addrs); + EvaluateAsRelocatableImpl(Value, Asm, Layout, Addrs, /*InSet*/ Addrs, + /*ForceVarExpansion*/ false); // Record the current value. Res = Value.getConstant(); @@ -505,8 +511,8 @@ static void AttemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, if (!Asm->getWriter().IsSymbolRefDifferenceFullyResolved(*Asm, A, B, InSet)) return; - MCSymbolData &AD = Asm->getSymbolData(SA); - MCSymbolData &BD = Asm->getSymbolData(SB); + const MCSymbolData &AD = Asm->getSymbolData(SA); + const MCSymbolData &BD = Asm->getSymbolData(SB); if (AD.getFragment() == BD.getFragment()) { Addend += (AD.getOffset() - BD.getOffset()); @@ -518,7 +524,7 @@ static void AttemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, // Clear the symbol expr pointers to indicate we have folded these // operands. - A = B = 0; + A = B = nullptr; return; } @@ -544,7 +550,7 @@ static void AttemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, // Clear the symbol expr pointers to indicate we have folded these // operands. - A = B = 0; + A = B = nullptr; } /// \brief Evaluate the result of an add between (conceptually) two MCValues. @@ -627,15 +633,21 @@ static bool EvaluateSymbolicAdd(const MCAssembler *Asm, bool MCExpr::EvaluateAsRelocatable(MCValue &Res, const MCAsmLayout *Layout) const { - MCAssembler *Assembler = Layout ? &Layout->getAssembler() : 0; - return EvaluateAsRelocatableImpl(Res, Assembler, Layout, 0, false); + MCAssembler *Assembler = Layout ? &Layout->getAssembler() : nullptr; + return EvaluateAsRelocatableImpl(Res, Assembler, Layout, nullptr, false, + /*ForceVarExpansion*/ false); } -bool MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, - const MCAssembler *Asm, +bool MCExpr::EvaluateAsValue(MCValue &Res, const MCAsmLayout *Layout) const { + MCAssembler *Assembler = Layout ? 
&Layout->getAssembler() : nullptr; + return EvaluateAsRelocatableImpl(Res, Assembler, Layout, nullptr, false, + /*ForceVarExpansion*/ true); +} + +bool MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, const MCAsmLayout *Layout, - const SectionAddrMap *Addrs, - bool InSet) const { + const SectionAddrMap *Addrs, bool InSet, + bool ForceVarExpansion) const { ++stats::MCExprEvaluate; switch (getKind()) { @@ -652,9 +664,9 @@ bool MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, const MCAsmInfo &MCAsmInfo = SRE->getMCAsmInfo(); // Evaluate recursively if this is a variable. - if (Sym.isVariable()) { - if (Sym.getVariableValue()->EvaluateAsRelocatableImpl(Res, Asm, Layout, - Addrs, true)) { + if (Sym.isVariable() && SRE->getKind() == MCSymbolRefExpr::VK_None) { + if (Sym.getVariableValue()->EvaluateAsRelocatableImpl( + Res, Asm, Layout, Addrs, true, ForceVarExpansion)) { const MCSymbolRefExpr *A = Res.getSymA(); const MCSymbolRefExpr *B = Res.getSymB(); @@ -668,15 +680,16 @@ bool MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, if (!A && !B) return true; } else { + if (ForceVarExpansion) + return true; bool IsSymbol = A && A->getSymbol().isDefined(); - bool IsWeakRef = SRE->getKind() == MCSymbolRefExpr::VK_WEAKREF; - if (!IsSymbol && !IsWeakRef) + if (!IsSymbol) return true; } } } - Res = MCValue::get(SRE, 0, 0); + Res = MCValue::get(SRE, nullptr, 0); return true; } @@ -684,8 +697,8 @@ bool MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, const MCUnaryExpr *AUE = cast(this); MCValue Value; - if (!AUE->getSubExpr()->EvaluateAsRelocatableImpl(Value, Asm, Layout, - Addrs, InSet)) + if (!AUE->getSubExpr()->EvaluateAsRelocatableImpl(Value, Asm, Layout, Addrs, + InSet, ForceVarExpansion)) return false; switch (AUE->getOpcode()) { @@ -718,10 +731,10 @@ bool MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, const MCBinaryExpr *ABE = cast(this); MCValue LHSValue, RHSValue; - if (!ABE->getLHS()->EvaluateAsRelocatableImpl(LHSValue, Asm, Layout, - Addrs, InSet) || - !ABE->getRHS()->EvaluateAsRelocatableImpl(RHSValue, Asm, Layout, - Addrs, InSet)) + if (!ABE->getLHS()->EvaluateAsRelocatableImpl(LHSValue, Asm, Layout, Addrs, + InSet, ForceVarExpansion) || + !ABE->getRHS()->EvaluateAsRelocatableImpl(RHSValue, Asm, Layout, Addrs, + InSet, ForceVarExpansion)) return false; // We only support a few operations on non-constant expressions, handle @@ -795,7 +808,7 @@ const MCSection *MCExpr::FindAssociatedSection() const { if (Sym.isDefined()) return &Sym.getSection(); - return 0; + return nullptr; } case Unary: diff --git a/lib/MC/MCExternalSymbolizer.cpp b/lib/MC/MCExternalSymbolizer.cpp index 839516e..7c3073a 100644 --- a/lib/MC/MCExternalSymbolizer.cpp +++ b/lib/MC/MCExternalSymbolizer.cpp @@ -83,7 +83,7 @@ bool MCExternalSymbolizer::tryAddingSymbolicOperand(MCInst &MI, return false; } - const MCExpr *Add = NULL; + const MCExpr *Add = nullptr; if (SymbolicOp.AddSymbol.Present) { if (SymbolicOp.AddSymbol.Name) { StringRef Name(SymbolicOp.AddSymbol.Name); @@ -94,7 +94,7 @@ bool MCExternalSymbolizer::tryAddingSymbolicOperand(MCInst &MI, } } - const MCExpr *Sub = NULL; + const MCExpr *Sub = nullptr; if (SymbolicOp.SubtractSymbol.Present) { if (SymbolicOp.SubtractSymbol.Name) { StringRef Name(SymbolicOp.SubtractSymbol.Name); @@ -105,7 +105,7 @@ bool MCExternalSymbolizer::tryAddingSymbolicOperand(MCInst &MI, } } - const MCExpr *Off = NULL; + const MCExpr *Off = nullptr; if (SymbolicOp.Value != 0) Off = MCConstantExpr::Create(SymbolicOp.Value, Ctx); @@ -116,17 +116,17 @@ bool 
MCExternalSymbolizer::tryAddingSymbolicOperand(MCInst &MI, LHS = MCBinaryExpr::CreateSub(Add, Sub, Ctx); else LHS = MCUnaryExpr::CreateMinus(Sub, Ctx); - if (Off != 0) + if (Off) Expr = MCBinaryExpr::CreateAdd(LHS, Off, Ctx); else Expr = LHS; } else if (Add) { - if (Off != 0) + if (Off) Expr = MCBinaryExpr::CreateAdd(Add, Off, Ctx); else Expr = Add; } else { - if (Off != 0) + if (Off) Expr = Off; else Expr = MCConstantExpr::Create(0, Ctx); @@ -189,7 +189,7 @@ MCSymbolizer *createMCSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo, void *DisInfo, MCContext *Ctx, MCRelocationInfo *RelInfo) { - assert(Ctx != 0 && "No MCContext given for symbolic disassembly"); + assert(Ctx && "No MCContext given for symbolic disassembly"); return new MCExternalSymbolizer(*Ctx, std::unique_ptr(RelInfo), diff --git a/lib/MC/MCFixup.cpp b/lib/MC/MCFixup.cpp deleted file mode 100644 index 8f15db5..0000000 --- a/lib/MC/MCFixup.cpp +++ /dev/null @@ -1,37 +0,0 @@ -//===- MCFixup.cpp - Assembly Fixup Implementation ------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCFixup.h" -using namespace llvm; - -static MCSymbolRefExpr::VariantKind getAccessVariant(const MCExpr *Expr) { - switch (Expr->getKind()) { - case MCExpr::Unary: - case MCExpr::Target: - llvm_unreachable("unsupported"); - - case MCExpr::Constant: - return MCSymbolRefExpr::VK_None; - - case MCExpr::SymbolRef: { - const MCSymbolRefExpr *SRE = cast(Expr); - return SRE->getKind(); - } - case MCExpr::Binary: { - const MCBinaryExpr *ABE = cast(Expr); - assert(getAccessVariant(ABE->getRHS()) == MCSymbolRefExpr::VK_None); - return getAccessVariant(ABE->getLHS()); - } - } - llvm_unreachable("unknown MCExpr kind"); -} - -MCSymbolRefExpr::VariantKind MCFixup::getAccessVariant() const { - return ::getAccessVariant(getValue()); -} diff --git a/lib/MC/MCFunction.cpp b/lib/MC/MCFunction.cpp index 767e1e0..1ddc250 100644 --- a/lib/MC/MCFunction.cpp +++ b/lib/MC/MCFunction.cpp @@ -20,22 +20,17 @@ MCFunction::MCFunction(StringRef Name, MCModule *Parent) : Name(Name), ParentModule(Parent) {} -MCFunction::~MCFunction() { - for (iterator I = begin(), E = end(); I != E; ++I) - delete *I; -} - MCBasicBlock &MCFunction::createBlock(const MCTextAtom &TA) { - MCBasicBlock *MCBB = new MCBasicBlock(TA, this); - Blocks.push_back(MCBB); - return *MCBB; + std::unique_ptr MCBB(new MCBasicBlock(TA, this)); + Blocks.push_back(std::move(MCBB)); + return *Blocks.back(); } MCBasicBlock *MCFunction::find(uint64_t StartAddr) { for (const_iterator I = begin(), E = end(); I != E; ++I) if ((*I)->getInsts()->getBeginAddr() == StartAddr) - return *I; - return 0; + return I->get(); + return nullptr; } const MCBasicBlock *MCFunction::find(uint64_t StartAddr) const { diff --git a/lib/MC/MCInst.cpp b/lib/MC/MCInst.cpp index 124cc14..d7b80f5 100644 --- a/lib/MC/MCInst.cpp +++ b/lib/MC/MCInst.cpp @@ -34,7 +34,7 @@ void MCOperand::print(raw_ostream &OS, const MCAsmInfo *MAI) const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void MCOperand::dump() const { - print(dbgs(), 0); + print(dbgs(), nullptr); dbgs() << "\n"; } #endif @@ -66,7 +66,7 @@ void MCInst::dump_pretty(raw_ostream &OS, const MCAsmInfo *MAI, #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void MCInst::dump() const { - print(dbgs(), 0); + print(dbgs(), nullptr); dbgs() << "\n"; } #endif 
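The MCFunction change above is part of a broader ownership cleanup in this patch (MCModule gets the same treatment below): containers of raw owning pointers with hand-written destructor loops become containers of std::unique_ptr. A compact sketch of the pattern, with invented Block/Function types standing in for MCBasicBlock/MCFunction:

    #include <cstdint>
    #include <memory>
    #include <vector>

    struct Block { uint64_t BeginAddr; };

    class Function {
      std::vector<std::unique_ptr<Block>> Blocks;  // Owns every Block.
    public:
      Block &createBlock(uint64_t Addr) {
        Blocks.push_back(std::unique_ptr<Block>(new Block{Addr}));
        return *Blocks.back();      // Hand out a reference, keep ownership.
      }
      Block *find(uint64_t Addr) {
        for (auto &B : Blocks)
          if (B->BeginAddr == Addr)
            return B.get();         // Non-owning pointer for callers.
        return nullptr;
      }
      // No ~Function() needed: unique_ptr frees each Block automatically,
      // which is exactly why the patch deletes MCFunction's destructor.
    };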
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp index 7e437f4..37d05e9 100644 --- a/lib/MC/MCMachOStreamer.cpp +++ b/lib/MC/MCMachOStreamer.cpp @@ -89,7 +89,7 @@ public: } void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) override; - void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, + void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = nullptr, uint64_t Size = 0, unsigned ByteAlignment = 0) override; virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment = 0) override; @@ -172,7 +172,7 @@ void MCMachOStreamer::EmitDataRegion(DataRegionData::KindTy Kind) { MCSymbol *Start = getContext().CreateTempSymbol(); EmitLabel(Start); // Record the region for the object writer to use. - DataRegionData Data = { Kind, Start, NULL }; + DataRegionData Data = { Kind, Start, nullptr }; std::vector &Regions = getAssembler().getDataRegions(); Regions.push_back(Data); } @@ -183,7 +183,7 @@ void MCMachOStreamer::EmitDataRegionEnd() { std::vector &Regions = getAssembler().getDataRegions(); assert(Regions.size() && "Mismatched .end_data_region!"); DataRegionData &Data = Regions.back(); - assert(Data.End == NULL && "Mismatched .end_data_region!"); + assert(!Data.End && "Mismatched .end_data_region!"); // Create a temporary label to mark the end of the data region. Data.End = getContext().CreateTempSymbol(); EmitLabel(Data.End); @@ -237,10 +237,6 @@ void MCMachOStreamer::EmitThumbFunc(MCSymbol *Symbol) { // Remember that the function is a thumb function. Fixup and relocation // values will need adjusted. getAssembler().setIsThumbFunc(Symbol); - - // Mark the thumb bit on the symbol. - MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); - SD.setFlags(SD.getFlags() | SF_ThumbFunc); } bool MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol, @@ -352,7 +348,7 @@ void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, // FIXME: Darwin 'as' does appear to allow redef of a .comm by itself. assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); - AssignSection(Symbol, NULL); + AssignSection(Symbol, nullptr); MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); SD.setExternal(true); @@ -422,7 +418,7 @@ void MCMachOStreamer::EmitInstToData(const MCInst &Inst, } void MCMachOStreamer::FinishImpl() { - EmitFrames(&getAssembler().getBackend(), true); + EmitFrames(&getAssembler().getBackend()); // We have to set the fragment atom associations so we can relax properly for // Mach-O. @@ -430,13 +426,12 @@ void MCMachOStreamer::FinishImpl() { // First, scan the symbol table to build a lookup table from fragments to // defining symbols. DenseMap DefiningSymbolMap; - for (MCAssembler::symbol_iterator it = getAssembler().symbol_begin(), - ie = getAssembler().symbol_end(); it != ie; ++it) { - if (getAssembler().isSymbolLinkerVisible(it->getSymbol()) && - it->getFragment()) { + for (MCSymbolData &SD : getAssembler().symbols()) { + if (getAssembler().isSymbolLinkerVisible(SD.getSymbol()) && + SD.getFragment()) { // An atom defining symbol should never be internal to a fragment. - assert(it->getOffset() == 0 && "Invalid offset in atom defining symbol!"); - DefiningSymbolMap[it->getFragment()] = it; + assert(SD.getOffset() == 0 && "Invalid offset in atom defining symbol!"); + DefiningSymbolMap[SD.getFragment()] = &SD; } } @@ -444,7 +439,7 @@ void MCMachOStreamer::FinishImpl() { // symbol. 
for (MCAssembler::iterator it = getAssembler().begin(), ie = getAssembler().end(); it != ie; ++it) { - MCSymbolData *CurrentAtom = 0; + MCSymbolData *CurrentAtom = nullptr; for (MCSectionData::iterator it2 = it->begin(), ie2 = it->end(); it2 != ie2; ++it2) { if (MCSymbolData *SD = DefiningSymbolMap.lookup(it2)) diff --git a/lib/MC/MCModule.cpp b/lib/MC/MCModule.cpp index 7e9e18a..3ed7356 100644 --- a/lib/MC/MCModule.cpp +++ b/lib/MC/MCModule.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/STLExtras.h" #include "llvm/MC/MCModule.h" #include "llvm/MC/MCAtom.h" #include "llvm/MC/MCFunction.h" @@ -77,7 +78,7 @@ const MCAtom *MCModule::findAtomContaining(uint64_t Addr) const { Addr, AtomComp); if (I != atom_end() && (*I)->getBeginAddr() <= Addr) return *I; - return 0; + return nullptr; } MCAtom *MCModule::findAtomContaining(uint64_t Addr) { @@ -90,7 +91,7 @@ const MCAtom *MCModule::findFirstAtomAfter(uint64_t Addr) const { Addr, AtomCompInv); if (I != atom_end()) return *I; - return 0; + return nullptr; } MCAtom *MCModule::findFirstAtomAfter(uint64_t Addr) { @@ -99,8 +100,9 @@ MCAtom *MCModule::findFirstAtomAfter(uint64_t Addr) { } MCFunction *MCModule::createFunction(StringRef Name) { - Functions.push_back(new MCFunction(Name, this)); - return Functions.back(); + std::unique_ptr MCF(new MCFunction(Name, this)); + Functions.push_back(std::move(MCF)); + return Functions.back().get(); } static bool CompBBToAtom(MCBasicBlock *BB, const MCTextAtom *Atom) { @@ -130,13 +132,11 @@ void MCModule::trackBBForAtom(const MCTextAtom *Atom, MCBasicBlock *BB) { BBsByAtom.insert(I, BB); } +MCModule::MCModule() : Entrypoint(0) { } + MCModule::~MCModule() { for (AtomListTy::iterator AI = atom_begin(), AE = atom_end(); AI != AE; ++AI) delete *AI; - for (FunctionListTy::iterator FI = func_begin(), - FE = func_end(); - FI != FE; ++FI) - delete *FI; } diff --git a/lib/MC/MCModuleYAML.cpp b/lib/MC/MCModuleYAML.cpp index 102971b..f81cb14 100644 --- a/lib/MC/MCModuleYAML.cpp +++ b/lib/MC/MCModuleYAML.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Object/YAML.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/YAMLTraits.h" #include @@ -162,12 +163,14 @@ template <> struct ScalarTraits { static void output(const MCModuleYAML::Operand &, void *, llvm::raw_ostream &); static StringRef input(StringRef, void *, MCModuleYAML::Operand &); + static bool mustQuote(StringRef) { return false; } }; template <> struct ScalarTraits { static void output(const MCModuleYAML::OpcodeEnum &, void *, llvm::raw_ostream &); static StringRef input(StringRef, void *, MCModuleYAML::OpcodeEnum &); + static bool mustQuote(StringRef) { return false; } }; void ScalarEnumerationTraits::enumeration( @@ -276,7 +279,7 @@ class MCModule2YAML { const MCModule &MCM; MCModuleYAML::Module YAMLModule; void dumpAtom(const MCAtom *MCA); - void dumpFunction(const MCFunction *MCF); + void dumpFunction(const MCFunction &MCF); void dumpBasicBlock(const MCBasicBlock *MCBB); public: @@ -300,7 +303,7 @@ MCModule2YAML::MCModule2YAML(const MCModule &MCM) : MCM(MCM), YAMLModule() { dumpAtom(*AI); for (MCModule::const_func_iterator FI = MCM.func_begin(), FE = MCM.func_end(); FI != FE; ++FI) - dumpFunction(*FI); + dumpFunction(**FI); } void MCModule2YAML::dumpAtom(const MCAtom *MCA) { @@ -328,22 +331,22 @@ void MCModule2YAML::dumpAtom(const MCAtom *MCA) { } } -void 
MCModule2YAML::dumpFunction(const MCFunction *MCF) { +void MCModule2YAML::dumpFunction(const MCFunction &MCF) { YAMLModule.Functions.resize(YAMLModule.Functions.size() + 1); MCModuleYAML::Function &F = YAMLModule.Functions.back(); - F.Name = MCF->getName(); - for (MCFunction::const_iterator BBI = MCF->begin(), BBE = MCF->end(); + F.Name = MCF.getName(); + for (MCFunction::const_iterator BBI = MCF.begin(), BBE = MCF.end(); BBI != BBE; ++BBI) { - const MCBasicBlock *MCBB = *BBI; + const MCBasicBlock &MCBB = **BBI; F.BasicBlocks.resize(F.BasicBlocks.size() + 1); MCModuleYAML::BasicBlock &BB = F.BasicBlocks.back(); - BB.Address = MCBB->getInsts()->getBeginAddr(); - for (MCBasicBlock::pred_const_iterator PI = MCBB->pred_begin(), - PE = MCBB->pred_end(); + BB.Address = MCBB.getInsts()->getBeginAddr(); + for (MCBasicBlock::pred_const_iterator PI = MCBB.pred_begin(), + PE = MCBB.pred_end(); PI != PE; ++PI) BB.Preds.push_back((*PI)->getInsts()->getBeginAddr()); - for (MCBasicBlock::succ_const_iterator SI = MCBB->succ_begin(), - SE = MCBB->succ_end(); + for (MCBasicBlock::succ_const_iterator SI = MCBB.succ_begin(), + SE = MCBB.succ_end(); SI != SE; ++SI) BB.Succs.push_back((*SI)->getInsts()->getBeginAddr()); } diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp index 894eada..4f2740e 100644 --- a/lib/MC/MCNullStreamer.cpp +++ b/lib/MC/MCNullStreamer.cpp @@ -41,11 +41,6 @@ namespace { void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override {} void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override {} - void EmitDwarfAdvanceLineAddr(int64_t LineDelta, - const MCSymbol *LastLabel, - const MCSymbol *Label, - unsigned PointerSize) override {} - bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override { return true; @@ -64,13 +59,14 @@ namespace { unsigned ByteAlignment) override {} void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) override {} - void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, + void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = nullptr, uint64_t Size = 0, unsigned ByteAlignment = 0) override {} void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) override {} void EmitBytes(StringRef Data) override {} - void EmitValueImpl(const MCExpr *Value, unsigned Size) override {} + void EmitValueImpl(const MCExpr *Value, unsigned Size, + const SMLoc &Loc = SMLoc()) override {} void EmitULEB128Value(const MCExpr *Value) override {} void EmitSLEB128Value(const MCExpr *Value) override {} void EmitGPRel32Value(const MCExpr *Value) override {} diff --git a/lib/MC/MCObjectDisassembler.cpp b/lib/MC/MCObjectDisassembler.cpp index 146da6d..8a258cb 100644 --- a/lib/MC/MCObjectDisassembler.cpp +++ b/lib/MC/MCObjectDisassembler.cpp @@ -31,10 +31,12 @@ using namespace llvm; using namespace object; +#define DEBUG_TYPE "mc" + MCObjectDisassembler::MCObjectDisassembler(const ObjectFile &Obj, const MCDisassembler &Dis, const MCInstrAnalysis &MIA) - : Obj(Obj), Dis(Dis), MIA(MIA), MOS(0) {} + : Obj(Obj), Dis(Dis), MIA(MIA), MOS(nullptr) {} uint64_t MCObjectDisassembler::getEntrypoint() { for (const SymbolRef &Symbol : Obj.symbols()) { @@ -115,8 +117,8 @@ void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) { Section.getName(SecName); if (isText) { - MCTextAtom *Text = 0; - MCDataAtom *InvalidData = 0; + MCTextAtom *Text = nullptr; + MCDataAtom *InvalidData = nullptr; uint64_t InstSize; for (uint64_t Index = 0; Index < SecSize; Index 
+= InstSize) { @@ -129,11 +131,11 @@ void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) { Text->setName(SecName); } Text->addInst(Inst, InstSize); - InvalidData = 0; + InvalidData = nullptr; } else { assert(InstSize && "getInstruction() consumed no bytes"); if (!InvalidData) { - Text = 0; + Text = nullptr; InvalidData = Module->createDataAtom(CurAddr, CurAddr+InstSize - 1); } for (uint64_t I = 0; I < InstSize; ++I) @@ -160,7 +162,7 @@ namespace { BBInfoSetTy Preds; MCObjectDisassembler::AddressSetTy SuccAddrs; - BBInfo() : Atom(0), BB(0) {} + BBInfo() : Atom(nullptr), BB(nullptr) {} void addSucc(BBInfo &Succ) { Succs.insert(&Succ); @@ -480,7 +482,7 @@ MCObjectDisassembler::createFunction(MCModule *Module, uint64_t BeginAddr, continue; // FIXME: MCModule should provide a findFunctionByAddr() if ((*FI)->getEntryBlock()->getInsts()->getBeginAddr() == BeginAddr) - return *FI; + return FI->get(); } // Finally, just create a new one. diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index 3b011c8..9d413af 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -10,6 +10,7 @@ #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionCOFF.h" @@ -22,12 +23,13 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { IsFunctionEHFrameSymbolPrivate = false; SupportsWeakOmittedEHFrame = false; - if (T.isOSDarwin() && T.getArch() == Triple::arm64) + if (T.isOSDarwin() && + (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64)) SupportsCompactUnwindWithoutEHFrame = true; PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; - LSDAEncoding = FDEEncoding = FDECFIEncoding = dwarf::DW_EH_PE_pcrel; + LSDAEncoding = FDECFIEncoding = dwarf::DW_EH_PE_pcrel; TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; @@ -44,7 +46,7 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { SectionKind::getDataRel()); // BSSSection might not be expected initialized on msvc. 
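
The Mach-O hunks above and below repeat the `T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64` test because this release still carried two separate 64-bit ARM backends. A standalone sketch of the predicate being duplicated (the enum and helper name are illustrative, not from the patch):

    #include <cassert>

    enum class Arch { x86, x86_64, arm64, aarch64 };

    // One name for "either 64-bit ARM backend", instead of repeating the
    // two-enumerator comparison at every use site.
    static bool isAArch64Flavor(Arch A) {
      return A == Arch::arm64 || A == Arch::aarch64;
    }

    int main() {
      assert(isAArch64Flavor(Arch::arm64));
      assert(isAArch64Flavor(Arch::aarch64));
      assert(!isAArch64Flavor(Arch::x86_64));
    }
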
- BSSSection = 0; + BSSSection = nullptr; TLSDataSection // .tdata = Ctx->getMachOSection("__DATA", "__thread_data", @@ -147,10 +149,11 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { LSDASection = Ctx->getMachOSection("__TEXT", "__gcc_except_tab", 0, SectionKind::getReadOnlyWithRel()); - COFFDebugSymbolsSection = 0; + COFFDebugSymbolsSection = nullptr; if ((T.isMacOSX() && !T.isMacOSXVersionLT(10, 6)) || - (T.isOSDarwin() && T.getArch() == Triple::arm64)) { + (T.isOSDarwin() && + (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64))) { CompactUnwindSection = Ctx->getMachOSection("__LD", "__compact_unwind", MachO::S_ATTR_DEBUG, @@ -158,7 +161,7 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::x86) CompactUnwindDwarfEHFrameOnly = 0x04000000; - else if (T.getArch() == Triple::arm64) + else if (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64) CompactUnwindDwarfEHFrameOnly = 0x03000000; } @@ -245,29 +248,40 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { } void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { - if (T.getArch() == Triple::mips || - T.getArch() == Triple::mipsel) + switch (T.getArch()) { + case Triple::mips: + case Triple::mipsel: FDECFIEncoding = dwarf::DW_EH_PE_sdata4; - else if (T.getArch() == Triple::mips64 || - T.getArch() == Triple::mips64el) + break; + case Triple::mips64: + case Triple::mips64el: FDECFIEncoding = dwarf::DW_EH_PE_sdata8; - else + break; + default: FDECFIEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; + break; + } - if (T.getArch() == Triple::x86) { + switch (T.getArch()) { + case Triple::arm: + case Triple::armeb: + case Triple::thumb: + case Triple::thumbeb: + if (Ctx->getAsmInfo()->getExceptionHandlingType() == ExceptionHandling::ARM) + break; + // Fallthrough if not using EHABI + case Triple::x86: PersonalityEncoding = (RelocM == Reloc::PIC_) ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_absptr; LSDAEncoding = (RelocM == Reloc::PIC_) ? dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_absptr; - FDEEncoding = (RelocM == Reloc::PIC_) - ? dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4 - : dwarf::DW_EH_PE_absptr; TTypeEncoding = (RelocM == Reloc::PIC_) ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_absptr; - } else if (T.getArch() == Triple::x86_64) { + break; + case Triple::x86_64: if (RelocM == Reloc::PIC_) { PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | ((CMModel == CodeModel::Small || CMModel == CodeModel::Medium) @@ -275,7 +289,6 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { LSDAEncoding = dwarf::DW_EH_PE_pcrel | (CMModel == CodeModel::Small ? dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_sdata8); - FDEEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | ((CMModel == CodeModel::Small || CMModel == CodeModel::Medium) ? dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_sdata8); @@ -285,12 +298,14 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_absptr; LSDAEncoding = (CMModel == CodeModel::Small) ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_absptr; - FDEEncoding = dwarf::DW_EH_PE_udata4; TTypeEncoding = (CMModel == CodeModel::Small) ? 
dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_absptr; } - } else if (T.getArch() == Triple::aarch64 || - T.getArch() == Triple::aarch64_be ) { + break; + case Triple::aarch64: + case Triple::aarch64_be: + case Triple::arm64: + case Triple::arm64_be: // The small model guarantees static code/data size < 4GB, but not where it // will be in memory. Most of these could end up >2GB away so even a signed // pc-relative 32-bit address is insufficient, theoretically. @@ -298,65 +313,64 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8; LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8; - FDEEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8; } else { PersonalityEncoding = dwarf::DW_EH_PE_absptr; LSDAEncoding = dwarf::DW_EH_PE_absptr; - FDEEncoding = dwarf::DW_EH_PE_udata4; TTypeEncoding = dwarf::DW_EH_PE_absptr; } - } else if (T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le) { + break; + case Triple::ppc64: + case Triple::ppc64le: PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8; LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8; - FDEEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8; TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8; - } else if (T.getArch() == Triple::sparc) { + break; + case Triple::sparc: if (RelocM == Reloc::PIC_) { LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; - FDEEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; } else { LSDAEncoding = dwarf::DW_EH_PE_absptr; PersonalityEncoding = dwarf::DW_EH_PE_absptr; - FDEEncoding = dwarf::DW_EH_PE_udata4; TTypeEncoding = dwarf::DW_EH_PE_absptr; } - } else if (T.getArch() == Triple::sparcv9) { + break; + case Triple::sparcv9: LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; if (RelocM == Reloc::PIC_) { PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; - FDEEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; } else { PersonalityEncoding = dwarf::DW_EH_PE_absptr; - FDEEncoding = dwarf::DW_EH_PE_udata4; TTypeEncoding = dwarf::DW_EH_PE_absptr; } - } else if (T.getArch() == Triple::systemz) { + break; + case Triple::systemz: // All currently-defined code models guarantee that 4-byte PC-relative // values will be in range. 
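
The encodings assigned in these hunks are single DW_EH_PE bytes composed from a value format in the low nibble and application modifiers in the high bits. A self-contained sketch of that composition (constants copied from the DWARF exception-handling spec; the decode asserts are illustrative):

    #include <cassert>
    #include <cstdint>

    static const uint8_t DW_EH_PE_sdata8   = 0x0c; // low nibble: value format
    static const uint8_t DW_EH_PE_pcrel    = 0x10; // high bits: how it is applied
    static const uint8_t DW_EH_PE_indirect = 0x80;

    int main() {
      // The AArch64 PIC personality encoding chosen above:
      uint8_t Enc = DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata8;
      assert((Enc & 0x0f) == DW_EH_PE_sdata8); // 8-byte signed value
      assert((Enc & 0x70) == DW_EH_PE_pcrel);  // PC-relative application
      assert(Enc & DW_EH_PE_indirect);         // loaded through a pointer slot
    }
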
if (RelocM == Reloc::PIC_) { PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; - FDEEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; } else { PersonalityEncoding = dwarf::DW_EH_PE_absptr; LSDAEncoding = dwarf::DW_EH_PE_absptr; - FDEEncoding = dwarf::DW_EH_PE_absptr; TTypeEncoding = dwarf::DW_EH_PE_absptr; } + break; + default: + break; } // Solaris requires different flags for .eh_frame to seemingly every other @@ -461,7 +475,7 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { ELF::SHF_ALLOC, SectionKind::getReadOnly()); - COFFDebugSymbolsSection = 0; + COFFDebugSymbolsSection = nullptr; // Debug Info Sections. DwarfAbbrevSection = @@ -548,6 +562,10 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) { + // The object file format cannot represent common symbols with explicit + // alignments. + CommDirectiveSupportsAlignment = false; + // COFF BSSSection = Ctx->getCOFFSection(".bss", @@ -716,7 +734,7 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) { DrectveSection = Ctx->getCOFFSection(".drectve", - COFF::IMAGE_SCN_LNK_INFO, + COFF::IMAGE_SCN_LNK_INFO | COFF::IMAGE_SCN_LNK_REMOVE, SectionKind::getMetadata()); PDataSection = @@ -751,17 +769,17 @@ void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm, IsFunctionEHFrameSymbolPrivate = true; SupportsCompactUnwindWithoutEHFrame = false; - PersonalityEncoding = LSDAEncoding = FDEEncoding = FDECFIEncoding = - TTypeEncoding = dwarf::DW_EH_PE_absptr; + PersonalityEncoding = LSDAEncoding = FDECFIEncoding = TTypeEncoding = + dwarf::DW_EH_PE_absptr; CompactUnwindDwarfEHFrameOnly = 0; - EHFrameSection = 0; // Created on demand. - CompactUnwindSection = 0; // Used only by selected targets. - DwarfAccelNamesSection = 0; // Used only by selected targets. - DwarfAccelObjCSection = 0; // Used only by selected targets. - DwarfAccelNamespaceSection = 0; // Used only by selected targets. - DwarfAccelTypesSection = 0; // Used only by selected targets. + EHFrameSection = nullptr; // Created on demand. + CompactUnwindSection = nullptr; // Used only by selected targets. + DwarfAccelNamesSection = nullptr; // Used only by selected targets. + DwarfAccelObjCSection = nullptr; // Used only by selected targets. + DwarfAccelNamespaceSection = nullptr; // Used only by selected targets. + DwarfAccelTypesSection = nullptr; // Used only by selected targets. Triple T(TT); Triple::ArchType Arch = T.getArch(); @@ -769,14 +787,15 @@ void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm, // cellspu-apple-darwin. Perhaps we should fix in Triple? 
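
InitMCObjectFileInfo, continued just below, dispatches on the triple to pick one of three per-format initializers. A reduced sketch of that selection (the enum and predicate parameters are illustrative; the real code tests llvm::Triple properties):

    #include <cassert>

    enum class Env { MachO, COFF, ELF };

    static Env pickEnv(bool IsDarwinOrMachO, bool IsWindowsCOFF) {
      if (IsDarwinOrMachO)
        return Env::MachO; // InitMachOMCObjectFileInfo
      if (IsWindowsCOFF)
        return Env::COFF;  // InitCOFFMCObjectFileInfo
      return Env::ELF;     // InitELFMCObjectFileInfo
    }

    int main() {
      assert(pickEnv(true, false) == Env::MachO);
      assert(pickEnv(false, true) == Env::COFF);
      assert(pickEnv(false, false) == Env::ELF);
    }
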
if ((Arch == Triple::x86 || Arch == Triple::x86_64 || Arch == Triple::arm || Arch == Triple::thumb || - Arch == Triple::arm64 || + Arch == Triple::arm64 || Arch == Triple::aarch64 || Arch == Triple::ppc || Arch == Triple::ppc64 || Arch == Triple::UnknownArch) && (T.isOSDarwin() || T.isOSBinFormatMachO())) { Env = IsMachO; InitMachOMCObjectFileInfo(T); - } else if ((Arch == Triple::x86 || Arch == Triple::x86_64) && - T.getObjectFormat() != Triple::ELF && T.isOSWindows()) { + } else if ((Arch == Triple::x86 || Arch == Triple::x86_64 || + Arch == Triple::arm || Arch == Triple::thumb) && + (T.isOSWindows() && T.getObjectFormat() == Triple::COFF)) { Env = IsCOFF; InitCOFFMCObjectFileInfo(T); } else { diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index 4451264..a1aa602 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -20,7 +20,6 @@ #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/MC/MCSectionELF.h" using namespace llvm; MCObjectStreamer::MCObjectStreamer(MCContext &Context, MCAsmBackend &TAB, @@ -28,12 +27,13 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context, MCAsmBackend &TAB, : MCStreamer(Context), Assembler(new MCAssembler(Context, TAB, *Emitter_, *TAB.createObjectWriter(OS), OS)), - CurSectionData(0) {} + CurSectionData(nullptr), EmitEHFrame(true), EmitDebugFrame(false) {} MCObjectStreamer::MCObjectStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter_, MCAssembler *_Assembler) - : MCStreamer(Context), Assembler(_Assembler), CurSectionData(0) {} + : MCStreamer(Context), Assembler(_Assembler), CurSectionData(nullptr), + EmitEHFrame(true), EmitDebugFrame(false) {} MCObjectStreamer::~MCObjectStreamer() { delete &Assembler->getBackend(); @@ -45,18 +45,31 @@ MCObjectStreamer::~MCObjectStreamer() { void MCObjectStreamer::reset() { if (Assembler) Assembler->reset(); - CurSectionData = 0; + CurSectionData = nullptr; CurInsertionPoint = MCSectionData::iterator(); + EmitEHFrame = true; + EmitDebugFrame = false; MCStreamer::reset(); } +void MCObjectStreamer::EmitFrames(MCAsmBackend *MAB) { + if (!getNumFrameInfos()) + return; + + if (EmitEHFrame) + MCDwarfFrameEmitter::Emit(*this, MAB, true); + + if (EmitDebugFrame) + MCDwarfFrameEmitter::Emit(*this, MAB, false); +} + MCFragment *MCObjectStreamer::getCurrentFragment() const { assert(getCurrentSectionData() && "No current section!"); if (CurInsertionPoint != getCurrentSectionData()->getFragmentList().begin()) return std::prev(CurInsertionPoint); - return 0; + return nullptr; } MCDataFragment *MCObjectStreamer::getOrCreateDataFragment() const { @@ -64,11 +77,7 @@ MCDataFragment *MCObjectStreamer::getOrCreateDataFragment() const { // When bundling is enabled, we don't want to add data to a fragment that // already has instructions (see MCELFStreamer::EmitInstToData for details) if (!F || (Assembler->isBundlingEnabled() && F->hasInstructions())) { - const auto *Sec = dyn_cast(&getCurrentSectionData()->getSection()); - if (Sec && Sec->getSectionName().startswith(".zdebug_")) - F = new MCCompressedFragment(); - else - F = new MCDataFragment(); + F = new MCDataFragment(); insert(F); } return F; @@ -102,7 +111,14 @@ const MCExpr *MCObjectStreamer::AddValueSymbols(const MCExpr *Value) { return Value; } -void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size) { +void MCObjectStreamer::EmitCFISections(bool EH, bool Debug) { + MCStreamer::EmitCFISections(EH, Debug); + EmitEHFrame = EH; + 
EmitDebugFrame = Debug; +} + +void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, + const SMLoc &Loc) { MCDataFragment *DF = getOrCreateDataFragment(); MCLineEntry::Make(this, getCurrentSection().first); @@ -115,7 +131,7 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size) { } DF->getFixups().push_back( MCFixup::Create(DF->getContents().size(), Value, - MCFixup::getKindForSize(Size, false))); + MCFixup::getKindForSize(Size, false), Loc)); DF->getContents().resize(DF->getContents().size() + Size, 0); } diff --git a/lib/MC/MCObjectSymbolizer.cpp b/lib/MC/MCObjectSymbolizer.cpp index ba80d15..b149596 100644 --- a/lib/MC/MCObjectSymbolizer.cpp +++ b/lib/MC/MCObjectSymbolizer.cpp @@ -215,11 +215,11 @@ const SectionRef *MCObjectSymbolizer::findSectionContaining(uint64_t Addr) { It = std::lower_bound(SortedSections.begin(), EndIt, Addr, SectionStartsBefore); if (It == EndIt) - return 0; + return nullptr; uint64_t SAddr; It->getAddress(SAddr); uint64_t SSize; It->getSize(SSize); if (Addr >= SAddr + SSize) - return 0; + return nullptr; return &*It; } @@ -229,7 +229,7 @@ const RelocationRef *MCObjectSymbolizer::findRelocationAt(uint64_t Addr) { AddrToRelocMap::const_iterator RI = AddrToReloc.find(Addr); if (RI == AddrToReloc.end()) - return 0; + return nullptr; return &RI->second; } @@ -257,40 +257,12 @@ void MCObjectSymbolizer::buildSectionList() { void MCObjectSymbolizer::buildRelocationByAddrMap() { for (const SectionRef &Section : Obj->sections()) { - section_iterator RelSecI = Section.getRelocatedSection(); - if (RelSecI == Obj->section_end()) - continue; - - uint64_t StartAddr; RelSecI->getAddress(StartAddr); - uint64_t Size; RelSecI->getSize(Size); - bool RequiredForExec; - RelSecI->isRequiredForExecution(RequiredForExec); - if (RequiredForExec == false || Size == 0) - continue; for (const RelocationRef &Reloc : Section.relocations()) { - // FIXME: libObject is inconsistent regarding error handling. The - // overwhelming majority of methods always return object_error::success, - // and assert for simple errors.. Here, ELFObjectFile::getRelocationOffset - // asserts when the file type isn't ET_REL. - // This workaround handles x86-64 elf, the only one that has a relocinfo. - uint64_t Offset; - if (Obj->isELF()) { - const ELF64LEObjectFile *ELFObj = dyn_cast(Obj); - if (ELFObj == 0) - break; - if (ELFObj->getELFFile()->getHeader()->e_type == ELF::ET_REL) { - Reloc.getOffset(Offset); - Offset += StartAddr; - } else { - Reloc.getAddress(Offset); - } - } else { - Reloc.getOffset(Offset); - Offset += StartAddr; - } + uint64_t Address; + Reloc.getAddress(Address); // At a specific address, only keep the first relocation. 
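
The relocation-map update just below implements a "first writer wins" rule at each address. std::map::insert alone expresses the same idiom, since it refuses to overwrite an existing key; a self-contained sketch:

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <string>

    int main() {
      std::map<uint64_t, std::string> AddrToReloc;
      AddrToReloc.insert({0x1000, "first"});
      AddrToReloc.insert({0x1000, "second"}); // no-op: key already present
      assert(AddrToReloc.at(0x1000) == "first");
    }
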
- if (AddrToReloc.find(Offset) == AddrToReloc.end()) - AddrToReloc[Offset] = Reloc; + if (AddrToReloc.find(Address) == AddrToReloc.end()) + AddrToReloc[Address] = Reloc; } } } diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp index a3b68d8..bca516e 100644 --- a/lib/MC/MCParser/AsmLexer.cpp +++ b/lib/MC/MCParser/AsmLexer.cpp @@ -22,8 +22,8 @@ using namespace llvm; AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { - CurBuf = NULL; - CurPtr = NULL; + CurBuf = nullptr; + CurPtr = nullptr; isAtStartOfLine = true; AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@"); } @@ -39,7 +39,7 @@ void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { else CurPtr = CurBuf->getBufferStart(); - TokStart = 0; + TokStart = nullptr; } /// ReturnError - Set the error to the specified string at the specified @@ -218,7 +218,7 @@ static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { // Look ahead to search for first non-hex digit, if it's [hH], then we treat the // integer as a hexadecimal, possibly with leading zeroes. static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) { - const char *FirstHex = 0; + const char *FirstHex = nullptr; const char *LookAhead = CurPtr; while (1) { if (isdigit(*LookAhead)) { diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 910a424..168597f 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -39,6 +39,7 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include +#include #include #include #include @@ -59,8 +60,9 @@ struct MCAsmMacroParameter { StringRef Name; MCAsmMacroArgument Value; bool Required; + bool Vararg; - MCAsmMacroParameter() : Required(false) { } + MCAsmMacroParameter() : Required(false), Vararg(false) {} }; typedef std::vector MCAsmMacroParameters; @@ -110,7 +112,7 @@ struct ParseStatementInfo { SmallVectorImpl *AsmRewrites; - ParseStatementInfo() : Opcode(~0U), ParseError(false), AsmRewrites(0) {} + ParseStatementInfo() : Opcode(~0U), ParseError(false), AsmRewrites(nullptr) {} ParseStatementInfo(SmallVectorImpl *rewrites) : Opcode(~0), ParseError(false), AsmRewrites(rewrites) {} @@ -292,7 +294,7 @@ private: void handleMacroExit(); /// \brief Extract AsmTokens for a macro argument. - bool parseMacroArgument(MCAsmMacroArgument &MA); + bool parseMacroArgument(MCAsmMacroArgument &MA, bool Vararg); /// \brief Parse all macro arguments for a given macro. bool parseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A); @@ -495,9 +497,9 @@ enum { DEFAULT_ADDRSPACE = 0 }; AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx, MCStreamer &_Out, const MCAsmInfo &_MAI) : Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM), - PlatformParser(0), CurBuffer(0), MacrosEnabledFlag(true), - CppHashLineNumber(0), AssemblerDialect(~0U), IsDarwin(false), - ParsingInlineAsm(false) { + PlatformParser(nullptr), CurBuffer(0), MacrosEnabledFlag(true), + HadError(false), CppHashLineNumber(0), AssemblerDialect(~0U), + IsDarwin(false), ParsingInlineAsm(false) { // Save the old handler. SavedDiagHandler = SrcMgr.getDiagHandler(); SavedDiagContext = SrcMgr.getDiagContext(); @@ -526,7 +528,8 @@ AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx, MCStreamer &_Out, } AsmParser::~AsmParser() { - assert(ActiveMacros.empty() && "Unexpected active macro instantiation!"); + assert((HadError || ActiveMacros.empty()) && + "Unexpected active macro instantiation!"); // Destroy any macros. 
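
The relaxed assertion in ~AsmParser above tolerates live macro instantiations once an error has been reported, because the error path may bail out mid-expansion. A sketch of that teardown pattern (names illustrative):

    #include <cassert>
    #include <vector>

    struct Parser {
      bool HadError = false;
      std::vector<int> ActiveMacros; // stand-in for macro instantiations
      ~Parser() {
        // The invariant only holds when parsing completed normally.
        assert((HadError || ActiveMacros.empty()) &&
               "Unexpected active macro instantiation!");
      }
    };

    int main() {
      Parser P;
      P.ActiveMacros.push_back(1);
      P.HadError = true; // error path: destructor assert stays quiet
    }
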
for (StringMap::iterator it = MacroMap.begin(), @@ -959,7 +962,7 @@ AsmParser::applyModifierToExpr(const MCExpr *E, switch (E->getKind()) { case MCExpr::Target: case MCExpr::Constant: - return 0; + return nullptr; case MCExpr::SymbolRef: { const MCSymbolRefExpr *SRE = cast(E); @@ -977,7 +980,7 @@ AsmParser::applyModifierToExpr(const MCExpr *E, const MCUnaryExpr *UE = cast(E); const MCExpr *Sub = applyModifierToExpr(UE->getSubExpr(), Variant); if (!Sub) - return 0; + return nullptr; return MCUnaryExpr::Create(UE->getOpcode(), Sub, getContext()); } @@ -987,7 +990,7 @@ AsmParser::applyModifierToExpr(const MCExpr *E, const MCExpr *RHS = applyModifierToExpr(BE->getRHS(), Variant); if (!LHS && !RHS) - return 0; + return nullptr; if (!LHS) LHS = BE->getLHS(); @@ -1013,7 +1016,7 @@ AsmParser::applyModifierToExpr(const MCExpr *E, /// bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) { // Parse the expression. - Res = 0; + Res = nullptr; if (parsePrimaryExpr(Res, EndLoc) || parseBinOpRHS(1, Res, EndLoc)) return true; @@ -1050,7 +1053,7 @@ bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) { } bool AsmParser::parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) { - Res = 0; + Res = nullptr; return parseParenExpr(Res, EndLoc) || parseBinOpRHS(1, Res, EndLoc); } @@ -1701,7 +1704,7 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) { if (Parser->SavedDiagHandler) Parser->SavedDiagHandler(Diag, Parser->SavedDiagContext); else - Diag.print(0, OS); + Diag.print(nullptr, OS); return; } @@ -1723,7 +1726,7 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) { if (Parser->SavedDiagHandler) Parser->SavedDiagHandler(NewDiag, Parser->SavedDiagContext); else - NewDiag.print(0, OS); + NewDiag.print(nullptr, OS); } // FIXME: This is mostly duplicated from the function in AsmLexer.cpp. The @@ -1739,6 +1742,7 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, ArrayRef Parameters, ArrayRef A, const SMLoc &L) { unsigned NParameters = Parameters.size(); + bool HasVararg = NParameters ? Parameters.back().Vararg : false; if ((!IsDarwin || NParameters != 0) && NParameters != A.size()) return Error(L, "Wrong number of arguments"); @@ -1820,13 +1824,16 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, Pos = I; } } else { + bool VarargParameter = HasVararg && Index == (NParameters - 1); for (MCAsmMacroArgument::const_iterator it = A[Index].begin(), ie = A[Index].end(); it != ie; ++it) - if (it->getKind() == AsmToken::String) - OS << it->getStringContents(); - else + // We expect no quotes around the string's contents when + // parsing for varargs. + if (it->getKind() != AsmToken::String || VarargParameter) OS << it->getString(); + else + OS << it->getStringContents(); Pos += 1 + Argument.size(); } @@ -1890,7 +1897,16 @@ private: }; } -bool AsmParser::parseMacroArgument(MCAsmMacroArgument &MA) { +bool AsmParser::parseMacroArgument(MCAsmMacroArgument &MA, bool Vararg) { + + if (Vararg) { + if (Lexer.isNot(AsmToken::EndOfStatement)) { + StringRef Str = parseStringToEndOfStatement(); + MA.push_back(AsmToken(AsmToken::String, Str)); + } + return false; + } + unsigned ParenLevel = 0; unsigned AddTokens = 0; @@ -1961,6 +1977,7 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M, // Parse two kinds of macro invocations: // - macros defined without any parameters accept an arbitrary number of them // - macros defined with parameters accept at most that many of them + bool HasVararg = NParameters ? 
M->Parameters.back().Vararg : false; for (unsigned Parameter = 0; !NParameters || Parameter < NParameters; ++Parameter) { SMLoc IDLoc = Lexer.getLoc(); @@ -1989,7 +2006,8 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M, return true; } - if (parseMacroArgument(FA.Value)) + bool Vararg = HasVararg && Parameter == (NParameters - 1); + if (parseMacroArgument(FA.Value, Vararg)) return true; unsigned PI = Parameter; @@ -2050,7 +2068,7 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M, const MCAsmMacro *AsmParser::lookupMacro(StringRef Name) { StringMap::iterator I = MacroMap.find(Name); - return (I == MacroMap.end()) ? NULL : I->getValue(); + return (I == MacroMap.end()) ? nullptr : I->getValue(); } void AsmParser::defineMacro(StringRef Name, const MCAsmMacro &Macro) { @@ -2364,7 +2382,7 @@ bool AsmParser::parseDirectiveValue(unsigned Size) { return Error(ExprLoc, "literal value out of range for directive"); getStreamer().EmitIntValue(IntValue, Size); } else - getStreamer().EmitValue(Value, Size); + getStreamer().EmitValue(Value, Size, ExprLoc); if (getLexer().is(AsmToken::EndOfStatement)) break; @@ -3240,6 +3258,12 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) { MCAsmMacroParameters Parameters; while (getLexer().isNot(AsmToken::EndOfStatement)) { + + if (Parameters.size() && Parameters.back().Vararg) + return Error(Lexer.getLoc(), + "Vararg parameter '" + Parameters.back().Name + + "' should be last one in the list of parameters."); + MCAsmMacroParameter Parameter; if (parseIdentifier(Parameter.Name)) return TokError("expected identifier in '.macro' directive"); @@ -3257,6 +3281,8 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) { if (Qualifier == "req") Parameter.Required = true; + else if (Qualifier == "vararg" && !IsDarwin) + Parameter.Vararg = true; else return Error(QualLoc, Qualifier + " is not a valid parameter qualifier " "for '" + Parameter.Name + "' in macro '" + Name + "'"); @@ -3268,7 +3294,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) { SMLoc ParamLoc; ParamLoc = Lexer.getLoc(); - if (parseMacroArgument(Parameter.Value)) + if (parseMacroArgument(Parameter.Value, /*Vararg=*/false )) return true; if (Parameter.Required) @@ -3906,9 +3932,9 @@ bool AsmParser::parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) { MCSymbol *Sym = getContext().LookupSymbol(Name); if (expect_defined) - TheCondState.CondMet = (Sym != NULL && !Sym->isUndefined()); + TheCondState.CondMet = (Sym && !Sym->isUndefined()); else - TheCondState.CondMet = (Sym == NULL || Sym->isUndefined()); + TheCondState.CondMet = (!Sym || Sym->isUndefined()); TheCondState.Ignore = !TheCondState.CondMet; } @@ -4151,7 +4177,7 @@ MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) { // Check whether we have reached the end of the file. if (getLexer().is(AsmToken::Eof)) { Error(DirectiveLoc, "no matching '.endr' in definition"); - return 0; + return nullptr; } if (Lexer.is(AsmToken::Identifier) && @@ -4166,7 +4192,7 @@ MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) { Lex(); if (Lexer.isNot(AsmToken::EndOfStatement)) { TokError("unexpected token in '.endr' directive"); - return 0; + return nullptr; } break; } @@ -4260,7 +4286,7 @@ bool AsmParser::parseDirectiveIrp(SMLoc DirectiveLoc) { Lex(); MCAsmMacroArguments A; - if (parseMacroArguments(0, A)) + if (parseMacroArguments(nullptr, A)) return true; // Eat the end of statement. 
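
The vararg support added above binds every trailing argument to the final macro parameter instead of rejecting the surplus. This sketch approximates that binding rule (the real parser hands the raw remainder of the statement to the last parameter; the function and names here are illustrative):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <string>
    #include <vector>

    static std::vector<std::string>
    bindArgs(const std::vector<std::string> &Args, size_t NParams,
             bool LastIsVararg) {
      std::vector<std::string> Bound(
          Args.begin(), Args.begin() + std::min(Args.size(), NParams));
      if (LastIsVararg && !Bound.empty() && Args.size() > NParams)
        // Fold the overflow back into the last parameter, comma-separated.
        for (size_t I = NParams; I < Args.size(); ++I)
          Bound.back() += "," + Args[I];
      return Bound;
    }

    int main() {
      std::vector<std::string> Bound = bindArgs({"a", "b", "c", "d"}, 2, true);
      assert(Bound.size() == 2 && Bound[1] == "b,c,d");
    }
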
@@ -4300,7 +4326,7 @@ bool AsmParser::parseDirectiveIrpc(SMLoc DirectiveLoc) { Lex(); MCAsmMacroArguments A; - if (parseMacroArguments(0, A)) + if (parseMacroArguments(nullptr, A)) return true; if (A.size() != 1 || A.front().size() != 1) diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp index 76d3f81..decf01c 100644 --- a/lib/MC/MCParser/COFFAsmParser.cpp +++ b/lib/MC/MCParser/COFFAsmParser.cpp @@ -293,7 +293,7 @@ bool COFFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Characteristics, SectionKind Kind) { return ParseSectionSwitch(Section, Characteristics, Kind, "", - COFF::IMAGE_COMDAT_SELECT_ANY, 0); + COFF::IMAGE_COMDAT_SELECT_ANY, nullptr); } bool COFFAsmParser::ParseSectionSwitch(StringRef Section, @@ -359,7 +359,7 @@ bool COFFAsmParser::ParseDirectiveSection(StringRef, SMLoc) { } COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY; - const MCSectionCOFF *Assoc = 0; + const MCSectionCOFF *Assoc = nullptr; StringRef COMDATSymName; if (getLexer().is(AsmToken::Comma)) { Lex(); @@ -504,7 +504,7 @@ bool COFFAsmParser::parseCOMDATTypeAndAssoc(COFF::COMDATType &Type, /// ::= .linkonce [ identifier [ identifier ] ] bool COFFAsmParser::ParseDirectiveLinkOnce(StringRef, SMLoc Loc) { COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY; - const MCSectionCOFF *Assoc = 0; + const MCSectionCOFF *Assoc = nullptr; if (getLexer().is(AsmToken::Identifier)) if (parseCOMDATTypeAndAssoc(Type, Assoc)) return true; diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp index 0856b6e..f74b30a 100644 --- a/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/lib/MC/MCParser/DarwinAsmParser.cpp @@ -17,6 +17,7 @@ #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" using namespace llvm; @@ -612,8 +613,8 @@ bool DarwinAsmParser::parseDirectivePopSection(StringRef, SMLoc) { /// ::= .previous bool DarwinAsmParser::parseDirectivePrevious(StringRef DirName, SMLoc) { MCSectionSubPair PreviousSection = getStreamer().getPreviousSection(); - if (PreviousSection.first == NULL) - return TokError(".previous without corresponding .section"); + if (!PreviousSection.first) + return TokError(".previous without corresponding .section"); getStreamer().SwitchSection(PreviousSection.first, PreviousSection.second); return false; } @@ -630,13 +631,13 @@ bool DarwinAsmParser::parseDirectiveSecureLogUnique(StringRef, SMLoc IDLoc) { // Get the secure log path. const char *SecureLogFile = getContext().getSecureLogFile(); - if (SecureLogFile == NULL) + if (!SecureLogFile) return Error(IDLoc, ".secure_log_unique used but AS_SECURE_LOG_FILE " "environment variable unset."); // Open the secure log file if we haven't already. 
raw_ostream *OS = getContext().getSecureLog(); - if (OS == NULL) { + if (!OS) { std::string Err; OS = new raw_fd_ostream(SecureLogFile, Err, sys::fs::F_Append | sys::fs::F_Text); diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp index d79dd67..95c4971 100644 --- a/lib/MC/MCParser/ELFAsmParser.cpp +++ b/lib/MC/MCParser/ELFAsmParser.cpp @@ -193,7 +193,7 @@ bool ELFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) { bool ELFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Type, unsigned Flags, SectionKind Kind) { - const MCExpr *Subsection = 0; + const MCExpr *Subsection = nullptr; if (getLexer().isNot(AsmToken::EndOfStatement)) { if (getParser().parseExpression(Subsection)) return true; @@ -411,7 +411,7 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush) { int64_t Size = 0; StringRef GroupName; unsigned Flags = 0; - const MCExpr *Subsection = 0; + const MCExpr *Subsection = nullptr; bool UseLastGroup = false; // Set the defaults first. @@ -554,7 +554,7 @@ EndStmt: bool ELFAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) { MCSectionSubPair PreviousSection = getStreamer().getPreviousSection(); - if (PreviousSection.first == NULL) + if (PreviousSection.first == nullptr) return TokError(".previous without corresponding .section"); getStreamer().SwitchSection(PreviousSection.first, PreviousSection.second); @@ -730,7 +730,7 @@ bool ELFAsmParser::ParseDirectiveWeakref(StringRef, SMLoc) { } bool ELFAsmParser::ParseDirectiveSubsection(StringRef, SMLoc) { - const MCExpr *Subsection = 0; + const MCExpr *Subsection = nullptr; if (getLexer().isNot(AsmToken::EndOfStatement)) { if (getParser().parseExpression(Subsection)) return true; diff --git a/lib/MC/MCParser/MCAsmLexer.cpp b/lib/MC/MCParser/MCAsmLexer.cpp index 3867691..530814b 100644 --- a/lib/MC/MCParser/MCAsmLexer.cpp +++ b/lib/MC/MCParser/MCAsmLexer.cpp @@ -13,7 +13,7 @@ using namespace llvm; MCAsmLexer::MCAsmLexer() : CurTok(AsmToken::Error, StringRef()), - TokStart(0), SkipSpace(true) { + TokStart(nullptr), SkipSpace(true) { } MCAsmLexer::~MCAsmLexer() { diff --git a/lib/MC/MCParser/MCAsmParser.cpp b/lib/MC/MCParser/MCAsmParser.cpp index 6e1ebad..e417aa9 100644 --- a/lib/MC/MCParser/MCAsmParser.cpp +++ b/lib/MC/MCParser/MCAsmParser.cpp @@ -17,7 +17,7 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -MCAsmParser::MCAsmParser() : TargetParser(0), ShowParsedOperands(0) { +MCAsmParser::MCAsmParser() : TargetParser(nullptr), ShowParsedOperands(0) { } MCAsmParser::~MCAsmParser() { diff --git a/lib/MC/MCRelocationInfo.cpp b/lib/MC/MCRelocationInfo.cpp index 7d2ec1f..a00c009 100644 --- a/lib/MC/MCRelocationInfo.cpp +++ b/lib/MC/MCRelocationInfo.cpp @@ -23,14 +23,14 @@ MCRelocationInfo::~MCRelocationInfo() { const MCExpr * MCRelocationInfo::createExprForRelocation(object::RelocationRef Rel) { - return 0; + return nullptr; } const MCExpr * MCRelocationInfo::createExprForCAPIVariantKind(const MCExpr *SubExpr, unsigned VariantKind) { if (VariantKind != LLVMDisassembler_VariantKind_None) - return 0; + return nullptr; return SubExpr; } diff --git a/lib/MC/MCSectionCOFF.cpp b/lib/MC/MCSectionCOFF.cpp index ad9ca88..335b8cd 100644 --- a/lib/MC/MCSectionCOFF.cpp +++ b/lib/MC/MCSectionCOFF.cpp @@ -34,7 +34,7 @@ void MCSectionCOFF::setSelection(int Selection, const MCSectionCOFF *Assoc) const { assert(Selection != 0 && "invalid COMDAT selection type"); assert((Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) == - (Assoc != 0) && + (Assoc != nullptr) && "associative COMDAT 
section must have an associated section"); this->Selection = Selection; this->Assoc = Assoc; @@ -62,7 +62,8 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, OS << 'r'; if (getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE) OS << 'n'; - + if (getCharacteristics() & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA) + OS << 'd'; OS << '"'; if (getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) { diff --git a/lib/MC/MCSectionMachO.cpp b/lib/MC/MCSectionMachO.cpp index 9cc534d..46beda4 100644 --- a/lib/MC/MCSectionMachO.cpp +++ b/lib/MC/MCSectionMachO.cpp @@ -20,7 +20,7 @@ static const struct { const char *AssemblerName, *EnumName; } SectionTypeDescriptors[MachO::LAST_KNOWN_SECTION_TYPE+1] = { { "regular", "S_REGULAR" }, // 0x00 - { 0, "S_ZEROFILL" }, // 0x01 + { nullptr, "S_ZEROFILL" }, // 0x01 { "cstring_literals", "S_CSTRING_LITERALS" }, // 0x02 { "4byte_literals", "S_4BYTE_LITERALS" }, // 0x03 { "8byte_literals", "S_8BYTE_LITERALS" }, // 0x04 @@ -31,11 +31,11 @@ static const struct { { "mod_init_funcs", "S_MOD_INIT_FUNC_POINTERS" }, // 0x09 { "mod_term_funcs", "S_MOD_TERM_FUNC_POINTERS" }, // 0x0A { "coalesced", "S_COALESCED" }, // 0x0B - { 0, /*FIXME??*/ "S_GB_ZEROFILL" }, // 0x0C + { nullptr, /*FIXME??*/ "S_GB_ZEROFILL" }, // 0x0C { "interposing", "S_INTERPOSING" }, // 0x0D { "16byte_literals", "S_16BYTE_LITERALS" }, // 0x0E - { 0, /*FIXME??*/ "S_DTRACE_DOF" }, // 0x0F - { 0, /*FIXME??*/ "S_LAZY_DYLIB_SYMBOL_POINTERS" }, // 0x10 + { nullptr, /*FIXME??*/ "S_DTRACE_DOF" }, // 0x0F + { nullptr, /*FIXME??*/ "S_LAZY_DYLIB_SYMBOL_POINTERS" }, // 0x10 { "thread_local_regular", "S_THREAD_LOCAL_REGULAR" }, // 0x11 { "thread_local_zerofill", "S_THREAD_LOCAL_ZEROFILL" }, // 0x12 { "thread_local_variables", "S_THREAD_LOCAL_VARIABLES" }, // 0x13 @@ -62,11 +62,11 @@ ENTRY("no_dead_strip", S_ATTR_NO_DEAD_STRIP) ENTRY("live_support", S_ATTR_LIVE_SUPPORT) ENTRY("self_modifying_code", S_ATTR_SELF_MODIFYING_CODE) ENTRY("debug", S_ATTR_DEBUG) -ENTRY(0 /*FIXME*/, S_ATTR_SOME_INSTRUCTIONS) -ENTRY(0 /*FIXME*/, S_ATTR_EXT_RELOC) -ENTRY(0 /*FIXME*/, S_ATTR_LOC_RELOC) +ENTRY(nullptr /*FIXME*/, S_ATTR_SOME_INSTRUCTIONS) +ENTRY(nullptr /*FIXME*/, S_ATTR_EXT_RELOC) +ENTRY(nullptr /*FIXME*/, S_ATTR_LOC_RELOC) #undef ENTRY - { 0, "none", 0 }, // used if section has no attributes but has a stub size + { 0, "none", nullptr }, // used if section has no attributes but has a stub size }; MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section, diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp index 8fa55aa..7dccf0d 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -37,8 +37,7 @@ void MCTargetStreamer::finish() {} void MCTargetStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {} MCStreamer::MCStreamer(MCContext &Ctx) - : Context(Ctx), EmitEHFrame(true), EmitDebugFrame(false), - CurrentW64UnwindInfo(0), LastSymbol(0) { + : Context(Ctx), CurrentW64UnwindInfo(nullptr), LastSymbol(nullptr) { SectionStack.push_back(std::pair()); } @@ -51,10 +50,8 @@ void MCStreamer::reset() { for (unsigned i = 0; i < getNumW64UnwindInfos(); ++i) delete W64UnwindInfos[i]; W64UnwindInfos.clear(); - EmitEHFrame = true; - EmitDebugFrame = false; - CurrentW64UnwindInfo = 0; - LastSymbol = 0; + CurrentW64UnwindInfo = nullptr; + LastSymbol = nullptr; SectionStack.clear(); SectionStack.push_back(std::pair()); } @@ -147,8 +144,9 @@ void MCStreamer::EmitAbsValue(const MCExpr *Value, unsigned Size) { } -void MCStreamer::EmitValue(const MCExpr *Value, unsigned Size) { - EmitValueImpl(Value, Size); 
+void MCStreamer::EmitValue(const MCExpr *Value, unsigned Size,
+                           const SMLoc &Loc) {
+  EmitValueImpl(Value, Size, Loc);
 }
 
 void MCStreamer::EmitSymbolValue(const MCSymbol *Sym, unsigned Size) {
@@ -203,7 +201,7 @@ MCSymbol *MCStreamer::getDwarfLineTableSymbol(unsigned CUID) {
 
 MCDwarfFrameInfo *MCStreamer::getCurrentFrameInfo() {
   if (FrameInfos.empty())
-    return 0;
+    return nullptr;
   return &FrameInfos.back();
 }
 
@@ -258,8 +256,6 @@ void MCStreamer::EmitCompactUnwindEncoding(uint32_t CompactUnwindEncoding) {
 
 void MCStreamer::EmitCFISections(bool EH, bool Debug) {
   assert(EH || Debug);
-  EmitEHFrame = EH;
-  EmitDebugFrame = Debug;
 }
 
 void MCStreamer::EmitCFIStartProc(bool IsSimple) {
@@ -278,6 +274,10 @@ void MCStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
 }
 
 void MCStreamer::RecordProcStart(MCDwarfFrameInfo &Frame) {
+  // Report an error if we haven't seen a symbol yet where we'd bind
+  // .cfi_startproc.
+  if (!LastSymbol)
+    report_fatal_error("No symbol to start a frame");
   Frame.Function = LastSymbol;
   // We need to create a local symbol to avoid relocations.
   Frame.Begin = getContext().CreateTempSymbol();
@@ -610,17 +610,6 @@ void MCStreamer::EmitRawText(const Twine &T) {
   EmitRawTextImpl(T.toStringRef(Str));
 }
 
-void MCStreamer::EmitFrames(MCAsmBackend *MAB, bool usingCFI) {
-  if (!getNumFrameInfos())
-    return;
-
-  if (EmitEHFrame)
-    MCDwarfFrameEmitter::Emit(*this, MAB, usingCFI, true);
-
-  if (EmitDebugFrame)
-    MCDwarfFrameEmitter::Emit(*this, MAB, usingCFI, false);
-}
-
 void MCStreamer::EmitW64Tables() {
   if (!getNumW64UnwindInfos())
     return;
@@ -639,11 +628,6 @@ void MCStreamer::Finish() {
   FinishImpl();
 }
 
-MCSymbolData &MCStreamer::getOrCreateSymbolData(const MCSymbol *Symbol) {
-  report_fatal_error("Not supported!");
-  return *(static_cast<MCSymbolData *>(0));
-}
-
 void MCStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
   Symbol->setVariableValue(Value);
diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp
index 8d8e290..4424c91 100644
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp
@@ -24,9 +24,7 @@ MCSchedModel MCSchedModel::DefaultSchedModel; // For unknown processors.
 
 void MCSubtargetInfo::InitMCProcessorInfo(StringRef CPU, StringRef FS) {
   SubtargetFeatures Features(FS);
-  FeatureBits = Features.getFeatureBits(CPU, ProcDesc, NumProcs,
-                                        ProcFeatures, NumFeatures);
-
+  FeatureBits = Features.getFeatureBits(CPU, ProcDesc, ProcFeatures);
   InitCPUSchedModel(CPU);
 }
 
@@ -40,16 +38,15 @@ MCSubtargetInfo::InitCPUSchedModel(StringRef CPU) {
 void
 MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS,
-                                     const SubtargetFeatureKV *PF,
-                                     const SubtargetFeatureKV *PD,
+                                     ArrayRef<SubtargetFeatureKV> PF,
+                                     ArrayRef<SubtargetFeatureKV> PD,
                                      const SubtargetInfoKV *ProcSched,
                                      const MCWriteProcResEntry *WPR,
                                      const MCWriteLatencyEntry *WL,
                                      const MCReadAdvanceEntry *RA,
                                      const InstrStage *IS,
                                      const unsigned *OC,
-                                     const unsigned *FP,
-                                     unsigned NF, unsigned NP) {
+                                     const unsigned *FP) {
   TargetTriple = TT;
   ProcFeatures = PF;
   ProcDesc = PD;
@@ -61,8 +58,6 @@ MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS,
   Stages = IS;
   OperandCycles = OC;
   ForwardingPaths = FP;
-  NumFeatures = NF;
-  NumProcs = NP;
 
   InitMCProcessorInfo(CPU, FS);
 }
@@ -78,8 +73,7 @@ uint64_t MCSubtargetInfo::ToggleFeature(uint64_t FB) {
 /// bits. This version will also change all implied bits.
uint64_t MCSubtargetInfo::ToggleFeature(StringRef FS) { SubtargetFeatures Features; - FeatureBits = Features.ToggleFeature(FeatureBits, FS, - ProcFeatures, NumFeatures); + FeatureBits = Features.ToggleFeature(FeatureBits, FS, ProcFeatures); return FeatureBits; } @@ -88,6 +82,7 @@ const MCSchedModel * MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const { assert(ProcSchedModels && "Processor machine model not available!"); + unsigned NumProcs = ProcDesc.size(); #ifndef NDEBUG for (size_t i = 1; i < NumProcs; i++) { assert(strcmp(ProcSchedModels[i - 1].Key, ProcSchedModels[i].Key) < 0 && diff --git a/lib/MC/MCTargetOptions.cpp b/lib/MC/MCTargetOptions.cpp new file mode 100644 index 0000000..8e946d5 --- /dev/null +++ b/lib/MC/MCTargetOptions.cpp @@ -0,0 +1,19 @@ +//===- lib/MC/MCTargetOptions.cpp - MC Target Options --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCTargetOptions.h" + +namespace llvm { + +MCTargetOptions::MCTargetOptions() + : SanitizeAddress(false), MCRelaxAll(false), MCNoExecStack(false), + MCSaveTempLabels(false), MCUseDwarfDirectory(false), + ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false) {} + +} // end namespace llvm diff --git a/lib/MC/MCValue.cpp b/lib/MC/MCValue.cpp index 68ecffb..9dfc56e 100644 --- a/lib/MC/MCValue.cpp +++ b/lib/MC/MCValue.cpp @@ -10,6 +10,7 @@ #include "llvm/MC/MCValue.h" #include "llvm/MC/MCExpr.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -38,6 +39,23 @@ void MCValue::print(raw_ostream &OS, const MCAsmInfo *MAI) const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void MCValue::dump() const { - print(dbgs(), 0); + print(dbgs(), nullptr); } #endif + +MCSymbolRefExpr::VariantKind MCValue::getAccessVariant() const { + const MCSymbolRefExpr *B = getSymB(); + if (B) { + if (B->getKind() != MCSymbolRefExpr::VK_None) + llvm_unreachable("unsupported"); + } + + const MCSymbolRefExpr *A = getSymA(); + if (!A) + return MCSymbolRefExpr::VK_None; + + MCSymbolRefExpr::VariantKind Kind = A->getKind(); + if (Kind == MCSymbolRefExpr::VK_WEAKREF) + return MCSymbolRefExpr::VK_None; + return Kind; +} diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp index 5fcea5f..cbaf0b8 100644 --- a/lib/MC/MachObjectWriter.cpp +++ b/lib/MC/MachObjectWriter.cpp @@ -26,6 +26,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "mc" + void MachObjectWriter::reset() { Relocations.clear(); IndirectSymBase.clear(); @@ -349,6 +351,9 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD, } } + if (Layout.getAssembler().isThumbFunc(&Symbol)) + Flags |= SF_ThumbFunc; + // struct nlist (12 bytes) Write32(MSD.StringIndex); @@ -516,15 +521,14 @@ ComputeSymbolTable(MCAssembler &Asm, SmallString<256> &StringTable, // table, then sort the symbols is chosen to match 'as'. Even though it // doesn't matter for correctness, this is important for letting us diff .o // files. - for (MCAssembler::symbol_iterator it = Asm.symbol_begin(), - ie = Asm.symbol_end(); it != ie; ++it) { - const MCSymbol &Symbol = it->getSymbol(); + for (MCSymbolData &SD : Asm.symbols()) { + const MCSymbol &Symbol = SD.getSymbol(); // Ignore non-linker visible symbols. 
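
A minimal illustration of the pointer+length to ArrayRef migration running through the MCSubtargetInfo and SubtargetFeatures hunks: one parameter carries both the data and its size, so the two can no longer drift apart (the helper below is illustrative, not from the patch):

    #include <cassert>
    #include <cstddef>
    #include "llvm/ADT/ArrayRef.h"

    static size_t countNonZero(llvm::ArrayRef<int> Values) {
      size_t N = 0;
      for (int V : Values) // no separate NumValues parameter needed
        N += (V != 0);
      return N;
    }

    int main() {
      const int Table[] = {1, 0, 1};
      assert(countNonZero(Table) == 2); // implicit conversion from a C array
    }
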
- if (!Asm.isSymbolLinkerVisible(it->getSymbol())) + if (!Asm.isSymbolLinkerVisible(SD.getSymbol())) continue; - if (!it->isExternal() && !Symbol.isUndefined()) + if (!SD.isExternal() && !Symbol.isUndefined()) continue; uint64_t &Entry = StringIndexMap[Symbol.getName()]; @@ -535,7 +539,7 @@ ComputeSymbolTable(MCAssembler &Asm, SmallString<256> &StringTable, } MachSymbolData MSD; - MSD.SymbolData = it; + MSD.SymbolData = &SD; MSD.StringIndex = Entry; if (Symbol.isUndefined()) { @@ -552,15 +556,14 @@ ComputeSymbolTable(MCAssembler &Asm, SmallString<256> &StringTable, } // Now add the data for local symbols. - for (MCAssembler::symbol_iterator it = Asm.symbol_begin(), - ie = Asm.symbol_end(); it != ie; ++it) { - const MCSymbol &Symbol = it->getSymbol(); + for (MCSymbolData &SD : Asm.symbols()) { + const MCSymbol &Symbol = SD.getSymbol(); // Ignore non-linker visible symbols. - if (!Asm.isSymbolLinkerVisible(it->getSymbol())) + if (!Asm.isSymbolLinkerVisible(SD.getSymbol())) continue; - if (it->isExternal() || Symbol.isUndefined()) + if (SD.isExternal() || Symbol.isUndefined()) continue; uint64_t &Entry = StringIndexMap[Symbol.getName()]; @@ -571,7 +574,7 @@ ComputeSymbolTable(MCAssembler &Asm, SmallString<256> &StringTable, } MachSymbolData MSD; - MSD.SymbolData = it; + MSD.SymbolData = &SD; MSD.StringIndex = Entry; if (Symbol.isAbsolute()) { @@ -621,10 +624,7 @@ void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm, void MachObjectWriter::markAbsoluteVariableSymbols(MCAssembler &Asm, const MCAsmLayout &Layout) { - for (MCAssembler::symbol_iterator i = Asm.symbol_begin(), - e = Asm.symbol_end(); - i != e; ++i) { - MCSymbolData &SD = *i; + for (MCSymbolData &SD : Asm.symbols()) { if (!SD.getSymbol().isVariable()) continue; @@ -669,7 +669,7 @@ IsSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm, // - addr(atom(B)) - offset(B) // and the offsets are not relocatable, so the fixup is fully resolved when // addr(atom(A)) - addr(atom(B)) == 0. - const MCSymbolData *A_Base = 0, *B_Base = 0; + const MCSymbolData *A_Base = nullptr, *B_Base = nullptr; const MCSymbol &SA = DataA.getSymbol().AliasedSymbol(); const MCSection &SecA = SA.getSection(); diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp index 2fb91f2..27525c7 100644 --- a/lib/MC/SubtargetFeature.cpp +++ b/lib/MC/SubtargetFeature.cpp @@ -51,40 +51,12 @@ static inline bool isEnabled(const StringRef Feature) { return Ch == '+'; } -/// PrependFlag - Return a string with a prepended flag; '+' or '-'. -/// -static inline std::string PrependFlag(const StringRef Feature, - bool IsEnabled) { - assert(!Feature.empty() && "Empty string"); - if (hasFlag(Feature)) - return Feature; - std::string Prefix = IsEnabled ? "+" : "-"; - Prefix += Feature; - return Prefix; -} - /// Split - Splits a string of comma separated items in to a vector of strings. /// static void Split(std::vector &V, const StringRef S) { - if (S.empty()) - return; - - // Start at beginning of string. 
-  size_t Pos = 0;
-  while (true) {
-    // Find the next comma
-    size_t Comma = S.find(',', Pos);
-    // If no comma found then the rest of the string is used
-    if (Comma == std::string::npos) {
-      // Add string to vector
-      V.push_back(S.substr(Pos));
-      break;
-    }
-    // Otherwise add substring to vector
-    V.push_back(S.substr(Pos, Comma - Pos));
-    // Advance to next item
-    Pos = Comma + 1;
-  }
+  SmallVector<StringRef, 0> Tmp;
+  S.split(Tmp, ",", -1, false /* KeepEmpty */);
+  V.assign(Tmp.begin(), Tmp.end());
 }
 
 /// Join a vector of strings to a string with a comma separating each element.
@@ -109,63 +81,55 @@ static std::string Join(const std::vector<std::string> &V) {
 }
 
 /// Adding features.
-void SubtargetFeatures::AddFeature(const StringRef String,
-                                   bool IsEnabled) {
-  // Don't add empty features
-  if (!String.empty()) {
-    // Convert to lowercase, prepend flag and add to vector
-    Features.push_back(PrependFlag(String.lower(), IsEnabled));
-  }
+void SubtargetFeatures::AddFeature(const StringRef String) {
+  // Don't add empty features or features we already have.
+  if (!String.empty())
+    // Convert to lowercase, prepend flag if we don't already have a flag.
+    Features.push_back(hasFlag(String) ? String.str() : "+" + String.lower());
 }
 
 /// Find KV in array using binary search.
-static const SubtargetFeatureKV *Find(StringRef S, const SubtargetFeatureKV *A,
-                                      size_t L) {
-  // Determine the end of the array
-  const SubtargetFeatureKV *Hi = A + L;
+static const SubtargetFeatureKV *Find(StringRef S,
+                                      ArrayRef<SubtargetFeatureKV> A) {
   // Binary search the array
-  const SubtargetFeatureKV *F = std::lower_bound(A, Hi, S);
+  auto F = std::lower_bound(A.begin(), A.end(), S);
   // If not found then return NULL
-  if (F == Hi || StringRef(F->Key) != S) return NULL;
+  if (F == A.end() || StringRef(F->Key) != S) return nullptr;
   // Return the found array item
   return F;
 }
 
 /// getLongestEntryLength - Return the length of the longest entry in the table.
 ///
-static size_t getLongestEntryLength(const SubtargetFeatureKV *Table,
-                                    size_t Size) {
+static size_t getLongestEntryLength(ArrayRef<SubtargetFeatureKV> Table) {
   size_t MaxLen = 0;
-  for (size_t i = 0; i < Size; i++)
-    MaxLen = std::max(MaxLen, std::strlen(Table[i].Key));
+  for (auto &I : Table)
+    MaxLen = std::max(MaxLen, std::strlen(I.Key));
   return MaxLen;
 }
 
 /// Display help for feature choices.
 ///
-static void Help(const SubtargetFeatureKV *CPUTable, size_t CPUTableSize,
-                 const SubtargetFeatureKV *FeatTable, size_t FeatTableSize) {
+static void Help(ArrayRef<SubtargetFeatureKV> CPUTable,
+                 ArrayRef<SubtargetFeatureKV> FeatTable) {
   // Determine the length of the longest CPU and Feature entries.
-  unsigned MaxCPULen  = getLongestEntryLength(CPUTable, CPUTableSize);
-  unsigned MaxFeatLen = getLongestEntryLength(FeatTable, FeatTableSize);
+  unsigned MaxCPULen  = getLongestEntryLength(CPUTable);
+  unsigned MaxFeatLen = getLongestEntryLength(FeatTable);
 
   // Print the CPU table.
   errs() << "Available CPUs for this target:\n\n";
-  for (size_t i = 0; i != CPUTableSize; i++)
-    errs() << format(" %-*s - %s.\n",
-                     MaxCPULen, CPUTable[i].Key, CPUTable[i].Desc);
+  for (auto &CPU : CPUTable)
+    errs() << format(" %-*s - %s.\n", MaxCPULen, CPU.Key, CPU.Desc);
   errs() << '\n';
 
   // Print the Feature table.
errs() << "Available features for this target:\n\n"; - for (size_t i = 0; i != FeatTableSize; i++) - errs() << format(" %-*s - %s.\n", - MaxFeatLen, FeatTable[i].Key, FeatTable[i].Desc); + for (auto &Feature : FeatTable) + errs() << format(" %-*s - %s.\n", MaxFeatLen, Feature.Key, Feature.Desc); errs() << '\n'; errs() << "Use +feature to enable a feature, or -feature to disable it.\n" "For example, llc -mcpu=mycpu -mattr=+feature1,-feature2\n"; - std::exit(1); } //===----------------------------------------------------------------------===// @@ -187,16 +151,13 @@ std::string SubtargetFeatures::getString() const { /// static void SetImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry, - const SubtargetFeatureKV *FeatureTable, - size_t FeatureTableSize) { - for (size_t i = 0; i < FeatureTableSize; ++i) { - const SubtargetFeatureKV &FE = FeatureTable[i]; - + ArrayRef FeatureTable) { + for (auto &FE : FeatureTable) { if (FeatureEntry->Value == FE.Value) continue; if (FeatureEntry->Implies & FE.Value) { Bits |= FE.Value; - SetImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize); + SetImpliedBits(Bits, &FE, FeatureTable); } } } @@ -206,16 +167,13 @@ void SetImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry, /// static void ClearImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry, - const SubtargetFeatureKV *FeatureTable, - size_t FeatureTableSize) { - for (size_t i = 0; i < FeatureTableSize; ++i) { - const SubtargetFeatureKV &FE = FeatureTable[i]; - + ArrayRef FeatureTable) { + for (auto &FE : FeatureTable) { if (FeatureEntry->Value == FE.Value) continue; if (FE.Implies & FeatureEntry->Value) { Bits &= ~FE.Value; - ClearImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize); + ClearImpliedBits(Bits, &FE, FeatureTable); } } } @@ -224,23 +182,23 @@ void ClearImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry, /// bits. uint64_t SubtargetFeatures::ToggleFeature(uint64_t Bits, const StringRef Feature, - const SubtargetFeatureKV *FeatureTable, - size_t FeatureTableSize) { + ArrayRef FeatureTable) { + // Find feature in table. const SubtargetFeatureKV *FeatureEntry = - Find(StripFlag(Feature), FeatureTable, FeatureTableSize); + Find(StripFlag(Feature), FeatureTable); // If there is a match if (FeatureEntry) { if ((Bits & FeatureEntry->Value) == FeatureEntry->Value) { Bits &= ~FeatureEntry->Value; // For each feature that implies this, clear it. - ClearImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize); + ClearImpliedBits(Bits, FeatureEntry, FeatureTable); } else { Bits |= FeatureEntry->Value; // For each feature that this implies, set it. - SetImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize); + SetImpliedBits(Bits, FeatureEntry, FeatureTable); } } else { errs() << "'" << Feature @@ -254,20 +212,20 @@ SubtargetFeatures::ToggleFeature(uint64_t Bits, const StringRef Feature, /// getFeatureBits - Get feature bits a CPU. 
/// -uint64_t SubtargetFeatures::getFeatureBits(const StringRef CPU, - const SubtargetFeatureKV *CPUTable, - size_t CPUTableSize, - const SubtargetFeatureKV *FeatureTable, - size_t FeatureTableSize) { - if (!FeatureTableSize || !CPUTableSize) +uint64_t +SubtargetFeatures::getFeatureBits(const StringRef CPU, + ArrayRef CPUTable, + ArrayRef FeatureTable) { + + if (CPUTable.empty() || FeatureTable.empty()) return 0; #ifndef NDEBUG - for (size_t i = 1; i < CPUTableSize; i++) { + for (size_t i = 1, e = CPUTable.size(); i != e; ++i) { assert(strcmp(CPUTable[i - 1].Key, CPUTable[i].Key) < 0 && "CPU table is not sorted"); } - for (size_t i = 1; i < FeatureTableSize; i++) { + for (size_t i = 1, e = FeatureTable.size(); i != e; ++i) { assert(strcmp(FeatureTable[i - 1].Key, FeatureTable[i].Key) < 0 && "CPU features table is not sorted"); } @@ -276,21 +234,21 @@ uint64_t SubtargetFeatures::getFeatureBits(const StringRef CPU, // Check if help is needed if (CPU == "help") - Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize); + Help(CPUTable, FeatureTable); // Find CPU entry if CPU name is specified. - if (!CPU.empty()) { - const SubtargetFeatureKV *CPUEntry = Find(CPU, CPUTable, CPUTableSize); + else if (!CPU.empty()) { + const SubtargetFeatureKV *CPUEntry = Find(CPU, CPUTable); + // If there is a match if (CPUEntry) { // Set base feature bits Bits = CPUEntry->Value; // Set the feature implied by this CPU feature, if any. - for (size_t i = 0; i < FeatureTableSize; ++i) { - const SubtargetFeatureKV &FE = FeatureTable[i]; + for (auto &FE : FeatureTable) { if (CPUEntry->Value & FE.Value) - SetImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize); + SetImpliedBits(Bits, &FE, FeatureTable); } } else { errs() << "'" << CPU @@ -300,16 +258,14 @@ uint64_t SubtargetFeatures::getFeatureBits(const StringRef CPU, } // Iterate through each feature - for (size_t i = 0, E = Features.size(); i < E; i++) { - const StringRef Feature = Features[i]; - + for (auto &Feature : Features) { // Check for help if (Feature == "+help") - Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize); + Help(CPUTable, FeatureTable); // Find feature in table. const SubtargetFeatureKV *FeatureEntry = - Find(StripFlag(Feature), FeatureTable, FeatureTableSize); + Find(StripFlag(Feature), FeatureTable); // If there is a match if (FeatureEntry) { // Enable/disable feature in bits @@ -317,12 +273,12 @@ uint64_t SubtargetFeatures::getFeatureBits(const StringRef CPU, Bits |= FeatureEntry->Value; // For each feature that this implies, set it. - SetImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize); + SetImpliedBits(Bits, FeatureEntry, FeatureTable); } else { Bits &= ~FeatureEntry->Value; // For each feature that implies this, clear it. - ClearImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize); + ClearImpliedBits(Bits, FeatureEntry, FeatureTable); } } else { errs() << "'" << Feature @@ -337,8 +293,8 @@ uint64_t SubtargetFeatures::getFeatureBits(const StringRef CPU, /// print - Print feature string. 
@@ -337,8 +293,8 @@ uint64_t SubtargetFeatures::getFeatureBits(const StringRef CPU,

 /// print - Print feature string.
 ///
 void SubtargetFeatures::print(raw_ostream &OS) const {
-  for (size_t i = 0, e = Features.size(); i != e; ++i)
-    OS << Features[i] << " ";
+  for (auto &F : Features)
+    OS << F << " ";
   OS << "\n";
 }

diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp
index 500acd8..961cbc6 100644
--- a/lib/MC/WinCOFFObjectWriter.cpp
+++ b/lib/MC/WinCOFFObjectWriter.cpp
@@ -11,12 +11,11 @@
 //
 //===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "WinCOFFObjectWriter"
-
 #include "llvm/MC/MCWinCOFFObjectWriter.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
@@ -35,6 +34,8 @@

 using namespace llvm;

+#define DEBUG_TYPE "WinCOFFObjectWriter"
+
 namespace {

 typedef SmallString<COFF::NameSize> name;

@@ -81,7 +82,7 @@ struct COFFRelocation {
   COFF::relocation Data;
   COFFSymbol *Symb;

-  COFFRelocation() : Symb(NULL) {}
+  COFFRelocation() : Symb(nullptr) {}
   static size_t size() { return COFF::RelocationSize; }
 };

@@ -118,8 +119,8 @@ public:
 class WinCOFFObjectWriter : public MCObjectWriter {
 public:

-  typedef std::vector<COFFSymbol *>  symbols;
-  typedef std::vector<COFFSection *> sections;
+  typedef std::vector<std::unique_ptr<COFFSymbol>> symbols;
+  typedef std::vector<std::unique_ptr<COFFSection>> sections;

   typedef DenseMap<MCSymbol const *, COFFSymbol *> symbol_map;
   typedef DenseMap<MCSection const *, COFFSection *> section_map;

@@ -137,7 +138,6 @@ public:
   symbol_map SymbolMap;

   WinCOFFObjectWriter(MCWinCOFFObjectTargetWriter *MOTW, raw_ostream &OS);
-  virtual ~WinCOFFObjectWriter();

   COFFSymbol *createSymbol(StringRef Name);
   COFFSymbol *GetOrCreateCOFFSymbol(const MCSymbol * Symbol);
@@ -160,7 +160,7 @@ public:
   // Entity writing methods.

   void WriteFileHeader(const COFF::header &Header);
-  void WriteSymbol(const COFFSymbol *S);
+  void WriteSymbol(const COFFSymbol &S);
   void WriteAuxiliarySymbols(const COFFSymbol::AuxiliarySymbols &S);
   void WriteSectionHeader(const COFF::section &S);
   void WriteRelocation(const COFF::relocation &R);
@@ -192,10 +192,10 @@ static inline void write_uint32_le(void *Data, uint32_t const &Value) {

 COFFSymbol::COFFSymbol(StringRef name)
   : Name(name.begin(), name.end())
-  , Other(NULL)
-  , Section(NULL)
+  , Other(nullptr)
+  , Section(nullptr)
   , Relocations(0)
-  , MCData(NULL) {
+  , MCData(nullptr) {
   memset(&Data, 0, sizeof(Data));
 }

@@ -214,7 +214,7 @@ void COFFSymbol::set_name_offset(uint32_t Offset) {
 /// logic to decide if the symbol should be reported in the symbol table
 bool COFFSymbol::should_keep() const {
   // no section means its external, keep it
-  if (Section == NULL)
+  if (!Section)
     return true;

   // if it has relocations pointing at it, keep it
@@ -244,8 +244,8 @@ bool COFFSymbol::should_keep() const {

 COFFSection::COFFSection(StringRef name)
   : Name(name)
-  , MCData(NULL)
-  , Symbol(NULL) {
+  , MCData(nullptr)
+  , Symbol(nullptr) {
   memset(&Header, 0, sizeof(Header));
 }

@@ -308,13 +308,6 @@ WinCOFFObjectWriter::WinCOFFObjectWriter(MCWinCOFFObjectTargetWriter *MOTW,
   Header.Machine = TargetObjectWriter->getMachine();
 }

-WinCOFFObjectWriter::~WinCOFFObjectWriter() {
-  for (symbols::iterator I = Symbols.begin(), E = Symbols.end(); I != E; ++I)
-    delete *I;
-  for (sections::iterator I = Sections.begin(), E = Sections.end(); I != E; ++I)
-    delete *I;
-}
-
 COFFSymbol *WinCOFFObjectWriter::createSymbol(StringRef Name) {
   return createCOFFEntity<COFFSymbol>(Name, Symbols);
 }
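The typedef change above, vectors of std::unique_ptr instead of raw pointers, is what allows the hand-written destructor to be deleted: ownership now lives in the container. In isolation the pattern looks like this sketch (it uses std::make_unique from C++14, whereas the patch relies on LLVM's own make_unique shim from STLExtras.h):

#include <memory>
#include <string>
#include <vector>

struct Entity {
  std::string Name;
  explicit Entity(std::string N) : Name(std::move(N)) {}
};

// The vector owns the objects; callers get a borrowed raw pointer.
// When the vector is destroyed, every Entity is destroyed with it,
// so no explicit cleanup loop (or destructor) is needed.
static Entity *createEntity(std::string Name,
                            std::vector<std::unique_ptr<Entity>> &List) {
  List.push_back(std::make_unique<Entity>(std::move(Name)));
  return List.back().get();
}

int main() {
  std::vector<std::unique_ptr<Entity>> Symbols;
  Entity *E = createEntity(".text", Symbols);
  return E->Name.size() == 5 ? 0 : 1;
}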
@@ -338,11 +331,9 @@ COFFSection *WinCOFFObjectWriter::createSection(StringRef Name) {
 template <typename object_t, typename list_t>
 object_t *WinCOFFObjectWriter::createCOFFEntity(StringRef Name,
                                                 list_t &List) {
-  object_t *Object = new object_t(Name);
-
-  List.push_back(Object);
+  List.push_back(make_unique<object_t>(Name));

-  return Object;
+  return List.back().get();
 }

 /// This function takes a section data object from the assembler
@@ -394,7 +385,19 @@ void WinCOFFObjectWriter::DefineSection(MCSectionData const &SectionData) {
   SectionMap[&SectionData.getSection()] = coff_section;
 }

-/// This function takes a section data object from the assembler
+static uint64_t getSymbolValue(const MCSymbolData &Data,
+                               const MCAsmLayout &Layout) {
+  if (Data.isCommon() && Data.isExternal())
+    return Data.getCommonSize();
+
+  uint64_t Res;
+  if (!Layout.getSymbolOffset(&Data, Res))
+    return 0;
+
+  return Res;
+}
+
+/// This function takes a symbol data object from the assembler
 /// and creates the associated COFF symbol staging object.
 void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData,
                                        MCAssembler &Assembler,
@@ -436,31 +439,29 @@ void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData,
     coff_symbol->MCData = &SymbolData;
   } else {
-    const MCSymbolData &ResSymData =
-      Assembler.getSymbolData(Symbol.AliasedSymbol());
-
-    if (Symbol.isVariable()) {
-      int64_t Addr;
-      if (Symbol.getVariableValue()->EvaluateAsAbsolute(Addr, Layout))
-        coff_symbol->Data.Value = Addr;
-    }
+    const MCSymbolData &ResSymData = Assembler.getSymbolData(Symbol);
+    const MCSymbol *Base = Layout.getBaseSymbol(Symbol);

+    coff_symbol->Data.Value = getSymbolValue(ResSymData, Layout);
     coff_symbol->Data.Type         = (ResSymData.getFlags() & 0x0000FFFF) >> 0;
     coff_symbol->Data.StorageClass = (ResSymData.getFlags() & 0x00FF0000) >> 16;

     // If no storage class was specified in the streamer, define it here.
     if (coff_symbol->Data.StorageClass == 0) {
-      bool external = ResSymData.isExternal() || (ResSymData.Fragment == NULL);
+      bool external = ResSymData.isExternal() || !ResSymData.Fragment;

       coff_symbol->Data.StorageClass =
        external ? COFF::IMAGE_SYM_CLASS_EXTERNAL : COFF::IMAGE_SYM_CLASS_STATIC;
     }

-    if (Symbol.isAbsolute() || Symbol.AliasedSymbol().isVariable())
+    if (!Base) {
       coff_symbol->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
-    else if (ResSymData.Fragment != NULL)
-      coff_symbol->Section =
-        SectionMap[&ResSymData.Fragment->getParent()->getSection()];
+    } else {
+      const MCSymbolData &BaseData = Assembler.getSymbolData(*Base);
+      if (BaseData.Fragment)
+        coff_symbol->Section =
+            SectionMap[&BaseData.Fragment->getParent()->getSection()];
+    }

     coff_symbol->MCData = &ResSymData;
   }
@@ -561,14 +562,14 @@ void WinCOFFObjectWriter::WriteFileHeader(const COFF::header &Header) {
   WriteLE16(Header.Characteristics);
 }

-void WinCOFFObjectWriter::WriteSymbol(const COFFSymbol *S) {
-  WriteBytes(StringRef(S->Data.Name, COFF::NameSize));
-  WriteLE32(S->Data.Value);
-  WriteLE16(S->Data.SectionNumber);
-  WriteLE16(S->Data.Type);
-  Write8(S->Data.StorageClass);
-  Write8(S->Data.NumberOfAuxSymbols);
-  WriteAuxiliarySymbols(S->Aux);
+void WinCOFFObjectWriter::WriteSymbol(const COFFSymbol &S) {
+  WriteBytes(StringRef(S.Data.Name, COFF::NameSize));
+  WriteLE32(S.Data.Value);
+  WriteLE16(S.Data.SectionNumber);
+  WriteLE16(S.Data.Type);
+  Write8(S.Data.StorageClass);
+  Write8(S.Data.NumberOfAuxSymbols);
+  WriteAuxiliarySymbols(S.Aux);
 }

 void WinCOFFObjectWriter::WriteAuxiliarySymbols(
@@ -640,16 +641,42 @@ void WinCOFFObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm,

   // "Define" each section & symbol. This creates section & symbol
   // entries in the staging area.
- for (MCAssembler::const_iterator i = Asm.begin(), e = Asm.end(); i != e; i++) - DefineSection(*i); + static_assert(sizeof(((COFF::AuxiliaryFile *)nullptr)->FileName) == COFF::SymbolSize, + "size mismatch for COFF::AuxiliaryFile::FileName"); + for (auto FI = Asm.file_names_begin(), FE = Asm.file_names_end(); + FI != FE; ++FI) { + // round up to calculate the number of auxiliary symbols required + unsigned Count = (FI->size() + COFF::SymbolSize - 1) / COFF::SymbolSize; + + COFFSymbol *file = createSymbol(".file"); + file->Data.SectionNumber = COFF::IMAGE_SYM_DEBUG; + file->Data.StorageClass = COFF::IMAGE_SYM_CLASS_FILE; + file->Aux.resize(Count); + + unsigned Offset = 0; + unsigned Length = FI->size(); + for (auto & Aux : file->Aux) { + Aux.AuxType = ATFile; + + if (Length > COFF::SymbolSize) { + memcpy(Aux.Aux.File.FileName, FI->c_str() + Offset, COFF::SymbolSize); + Length = Length - COFF::SymbolSize; + } else { + memcpy(Aux.Aux.File.FileName, FI->c_str() + Offset, Length); + memset(&Aux.Aux.File.FileName[Length], 0, COFF::SymbolSize - Length); + Length = 0; + } - for (MCAssembler::const_symbol_iterator i = Asm.symbol_begin(), - e = Asm.symbol_end(); - i != e; i++) { - if (ExportSymbol(*i, Asm)) { - DefineSymbol(*i, Asm, Layout); + Offset = Offset + COFF::SymbolSize; } } + + for (const auto & Section : Asm) + DefineSection(Section); + + for (MCSymbolData &SD : Asm.symbols()) + if (ExportSymbol(SD, Asm)) + DefineSymbol(SD, Asm, Layout); } void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm, @@ -659,7 +686,7 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm, MCValue Target, bool &IsPCRel, uint64_t &FixedValue) { - assert(Target.getSymA() != NULL && "Relocation must reference a symbol!"); + assert(Target.getSymA() && "Relocation must reference a symbol!"); const MCSymbol &Symbol = Target.getSymA()->getSymbol(); const MCSymbol &A = Symbol.AliasedSymbol(); @@ -668,7 +695,7 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm, Fixup.getLoc(), Twine("symbol '") + A.getName() + "' can not be undefined"); - MCSymbolData &A_SD = Asm.getSymbolData(A); + const MCSymbolData &A_SD = Asm.getSymbolData(A); MCSectionData const *SectionData = Fragment->getParent(); @@ -685,7 +712,7 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm, if (SymB) { const MCSymbol *B = &SymB->getSymbol(); - MCSymbolData &B_SD = Asm.getSymbolData(*B); + const MCSymbolData &B_SD = Asm.getSymbolData(*B); if (!B_SD.getFragment()) Asm.getContext().FatalError( Fixup.getLoc(), @@ -737,11 +764,52 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm, // FIXME: Can anyone explain what this does other than adjust for the size // of the offset? 
-  if (Reloc.Data.Type == COFF::IMAGE_REL_AMD64_REL32 ||
-      Reloc.Data.Type == COFF::IMAGE_REL_I386_REL32)
+  if ((Header.Machine == COFF::IMAGE_FILE_MACHINE_AMD64 &&
+       Reloc.Data.Type == COFF::IMAGE_REL_AMD64_REL32) ||
+      (Header.Machine == COFF::IMAGE_FILE_MACHINE_I386 &&
+       Reloc.Data.Type == COFF::IMAGE_REL_I386_REL32))
     FixedValue += 4;

-  coff_section->Relocations.push_back(Reloc);
+  if (Header.Machine == COFF::IMAGE_FILE_MACHINE_ARMNT) {
+    switch (Reloc.Data.Type) {
+    case COFF::IMAGE_REL_ARM_ABSOLUTE:
+    case COFF::IMAGE_REL_ARM_ADDR32:
+    case COFF::IMAGE_REL_ARM_ADDR32NB:
+    case COFF::IMAGE_REL_ARM_TOKEN:
+    case COFF::IMAGE_REL_ARM_SECTION:
+    case COFF::IMAGE_REL_ARM_SECREL:
+      break;
+    case COFF::IMAGE_REL_ARM_BRANCH11:
+    case COFF::IMAGE_REL_ARM_BLX11:
+      // IMAGE_REL_ARM_BRANCH11 and IMAGE_REL_ARM_BLX11 are only used for
+      // pre-ARMv7, which implicitly rules them out of ARMNT (they would be
+      // valid for Windows CE).
+    case COFF::IMAGE_REL_ARM_BRANCH24:
+    case COFF::IMAGE_REL_ARM_BLX24:
+    case COFF::IMAGE_REL_ARM_MOV32A:
+      // IMAGE_REL_ARM_BRANCH24, IMAGE_REL_ARM_BLX24, IMAGE_REL_ARM_MOV32A are
+      // only used for ARM mode code, which is documented as being unsupported
+      // by Windows on ARM. Empirical proof indicates that masm is able to
+      // generate these relocations, but the rest of the MSVC toolchain is
+      // unable to handle them.
+      llvm_unreachable("unsupported relocation");
+      break;
+    case COFF::IMAGE_REL_ARM_MOV32T:
+      break;
+    case COFF::IMAGE_REL_ARM_BRANCH20T:
+    case COFF::IMAGE_REL_ARM_BRANCH24T:
+    case COFF::IMAGE_REL_ARM_BLX23T:
+      // IMAGE_REL_ARM_BRANCH20T, IMAGE_REL_ARM_BRANCH24T, IMAGE_REL_ARM_BLX23T
+      // all perform a 4 byte adjustment to the relocation. Relative branches
+      // are offset by 4 on ARM; because there are no RELA relocations, all
+      // branches are offset by 4.
+      FixedValue = FixedValue + 4;
+      break;
+    }
+  }
+
+  if (TargetObjectWriter->recordRelocation(Fixup))
+    coff_section->Relocations.push_back(Reloc);
 }
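Given the FIXME above, the following is one plausible reading rather than a statement of the author's intent: a 32-bit PC-relative field is resolved by the processor relative to the end of the field, while the fixup records the field's start, so the writer folds the 4-byte field size into FixedValue once, up front. A toy calculation (helper invented here for illustration):

#include <cstdint>

// Toy model of a REL32 field: the CPU computes
//   Target = FieldAddr + 4 + Disp   (reference point is the field's end)
// so the writer must store
//   Disp = Target - FieldAddr - 4.
// Pre-adding 4 to FixedValue lets later machine-neutral code compute
// just Target - FieldAddr.
static int32_t rel32Displacement(uint64_t FieldAddr, uint64_t Target) {
  return static_cast<int32_t>(Target - FieldAddr - 4);
}

int main() {
  // Displacement field at 0x1000 targeting 0x1010: the stored value is
  // 0xC, and 0x1000 + 4 + 0xC == 0x1010.
  return rel32Displacement(0x1000, 0x1010) == 0xC ? 0 : 1;
}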

 void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
@@ -750,77 +818,64 @@
   Header.NumberOfSections = 0;

   DenseMap<COFFSection *, size_t> SectionIndices;
-  for (sections::iterator i = Sections.begin(),
-                          e = Sections.end(); i != e; i++) {
-    if (Layout.getSectionAddressSize((*i)->MCData) > 0) {
+  for (auto &Section : Sections) {
+    if (Layout.getSectionAddressSize(Section->MCData) > 0) {
       size_t Number = ++Header.NumberOfSections;
-      SectionIndices[*i] = Number;
-      MakeSectionReal(**i, Number);
+      SectionIndices[Section.get()] = Number;
+      MakeSectionReal(*Section, Number);
     } else {
-      (*i)->Number = -1;
+      Section->Number = -1;
     }
   }

   Header.NumberOfSymbols = 0;

-  for (symbols::iterator i = Symbols.begin(), e = Symbols.end(); i != e; i++) {
-    COFFSymbol *coff_symbol = *i;
-    MCSymbolData const *SymbolData = coff_symbol->MCData;
-
+  for (auto &Symbol : Symbols) {
     // Update section number & offset for symbols that have them.
-    if ((SymbolData != NULL) && (SymbolData->Fragment != NULL)) {
-      assert(coff_symbol->Section != NULL);
+    if (Symbol->Section)
+      Symbol->Data.SectionNumber = Symbol->Section->Number;

-      coff_symbol->Data.SectionNumber = coff_symbol->Section->Number;
-      coff_symbol->Data.Value = Layout.getFragmentOffset(SymbolData->Fragment)
-                              + SymbolData->Offset;
-    }
-
-    if (coff_symbol->should_keep()) {
-      MakeSymbolReal(*coff_symbol, Header.NumberOfSymbols++);
+    if (Symbol->should_keep()) {
+      MakeSymbolReal(*Symbol, Header.NumberOfSymbols++);

       // Update auxiliary symbol info.
-      coff_symbol->Data.NumberOfAuxSymbols = coff_symbol->Aux.size();
-      Header.NumberOfSymbols += coff_symbol->Data.NumberOfAuxSymbols;
+      Symbol->Data.NumberOfAuxSymbols = Symbol->Aux.size();
+      Header.NumberOfSymbols += Symbol->Data.NumberOfAuxSymbols;
     } else
-      coff_symbol->Index = -1;
+      Symbol->Index = -1;
   }

   // Fixup weak external references.
-  for (symbols::iterator i = Symbols.begin(), e = Symbols.end(); i != e; i++) {
-    COFFSymbol *coff_symbol = *i;
-    if (coff_symbol->Other != NULL) {
-      assert(coff_symbol->Index != -1);
-      assert(coff_symbol->Aux.size() == 1 &&
-             "Symbol must contain one aux symbol!");
-      assert(coff_symbol->Aux[0].AuxType == ATWeakExternal &&
+  for (auto &Symbol : Symbols) {
+    if (Symbol->Other) {
+      assert(Symbol->Index != -1);
+      assert(Symbol->Aux.size() == 1 && "Symbol must contain one aux symbol!");
+      assert(Symbol->Aux[0].AuxType == ATWeakExternal &&
             "Symbol's aux symbol must be a Weak External!");
-      coff_symbol->Aux[0].Aux.WeakExternal.TagIndex = coff_symbol->Other->Index;
+      Symbol->Aux[0].Aux.WeakExternal.TagIndex = Symbol->Other->Index;
     }
   }

   // Fixup associative COMDAT sections.
-  for (sections::iterator i = Sections.begin(),
-                          e = Sections.end(); i != e; i++) {
-    if ((*i)->Symbol->Aux[0].Aux.SectionDefinition.Selection !=
+  for (auto &Section : Sections) {
+    if (Section->Symbol->Aux[0].Aux.SectionDefinition.Selection !=
         COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
       continue;

-    const MCSectionCOFF &MCSec = static_cast<const MCSectionCOFF &>(
-                                                    (*i)->MCData->getSection());
+    const MCSectionCOFF &MCSec =
+        static_cast<const MCSectionCOFF &>(Section->MCData->getSection());

     COFFSection *Assoc = SectionMap.lookup(MCSec.getAssocSection());
-    if (!Assoc) {
+    if (!Assoc)
       report_fatal_error(Twine("Missing associated COMDAT section ") +
                          MCSec.getAssocSection()->getSectionName() +
                          " for section " + MCSec.getSectionName());
-    }

     // Skip this section if the associated section is unused.
if (Assoc->Number == -1) continue; - (*i)->Symbol->Aux[0].Aux.SectionDefinition.Number = SectionIndices[Assoc]; + Section->Symbol->Aux[0].Aux.SectionDefinition.Number = SectionIndices[Assoc]; } @@ -831,15 +886,13 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm, offset += COFF::HeaderSize; offset += COFF::SectionSize * Header.NumberOfSections; - for (MCAssembler::const_iterator i = Asm.begin(), - e = Asm.end(); - i != e; i++) { - COFFSection *Sec = SectionMap[&i->getSection()]; + for (const auto & Section : Asm) { + COFFSection *Sec = SectionMap[&Section.getSection()]; if (Sec->Number == -1) continue; - Sec->Header.SizeOfRawData = Layout.getSectionAddressSize(i); + Sec->Header.SizeOfRawData = Layout.getSectionAddressSize(&Section); if (IsPhysicalSection(Sec)) { Sec->Header.PointerToRawData = offset; @@ -866,16 +919,14 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm, offset += COFF::RelocationSize * Sec->Relocations.size(); - for (relocations::iterator cr = Sec->Relocations.begin(), - er = Sec->Relocations.end(); - cr != er; ++cr) { - assert((*cr).Symb->Index != -1); - (*cr).Data.SymbolTableIndex = (*cr).Symb->Index; + for (auto & Relocation : Sec->Relocations) { + assert(Relocation.Symb->Index != -1); + Relocation.Data.SymbolTableIndex = Relocation.Symb->Index; } } - assert(Sec->Symbol->Aux.size() == 1 - && "Section's symbol must have one aux!"); + assert(Sec->Symbol->Aux.size() == 1 && + "Section's symbol must have one aux!"); AuxSymbol &Aux = Sec->Symbol->Aux[0]; assert(Aux.AuxType == ATSectionDefinition && "Section's symbol's aux symbol must be a Section Definition!"); @@ -898,13 +949,13 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm, sections::iterator i, ie; MCAssembler::const_iterator j, je; - for (i = Sections.begin(), ie = Sections.end(); i != ie; i++) - if ((*i)->Number != -1) { - if ((*i)->Relocations.size() >= 0xffff) { - (*i)->Header.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL; - } - WriteSectionHeader((*i)->Header); + for (auto & Section : Sections) { + if (Section->Number != -1) { + if (Section->Relocations.size() >= 0xffff) + Section->Header.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL; + WriteSectionHeader(Section->Header); } + } for (i = Sections.begin(), ie = Sections.end(), j = Asm.begin(), je = Asm.end(); @@ -934,11 +985,8 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm, WriteRelocation(r); } - for (relocations::const_iterator k = (*i)->Relocations.begin(), - ke = (*i)->Relocations.end(); - k != ke; k++) { - WriteRelocation(k->Data); - } + for (const auto & Relocation : (*i)->Relocations) + WriteRelocation(Relocation.Data); } else assert((*i)->Header.PointerToRelocations == 0 && "Section::PointerToRelocations is insane!"); @@ -948,9 +996,9 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm, assert(OS.tell() == Header.PointerToSymbolTable && "Header::PointerToSymbolTable is insane!"); - for (symbols::iterator i = Symbols.begin(), e = Symbols.end(); i != e; i++) - if ((*i)->Index != -1) - WriteSymbol(*i); + for (auto & Symbol : Symbols) + if (Symbol->Index != -1) + WriteSymbol(*Symbol); OS.write((char const *)&Strings.Data.front(), Strings.Data.size()); } diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp index 5bd7b8f..e6df465 100644 --- a/lib/MC/WinCOFFStreamer.cpp +++ b/lib/MC/WinCOFFStreamer.cpp @@ -7,12 +7,11 @@ // //===----------------------------------------------------------------------===// // -// This file contains an implementation of a Win32 COFF object file streamer. 
+// This file contains an implementation of a Windows COFF object file streamer. // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "WinCOFFStreamer" - +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmLayout.h" @@ -27,6 +26,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/MCWin64EH.h" +#include "llvm/MC/MCWinCOFFStreamer.h" #include "llvm/Support/COFF.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -35,95 +35,33 @@ using namespace llvm; -namespace { -class WinCOFFStreamer : public MCObjectStreamer { -public: - MCSymbol const *CurSymbol; - - WinCOFFStreamer(MCContext &Context, - MCAsmBackend &MAB, - MCCodeEmitter &CE, - raw_ostream &OS); - - void AddCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment, bool External); - - // MCStreamer interface - - void InitSections() override; - void EmitLabel(MCSymbol *Symbol) override; - void EmitDebugLabel(MCSymbol *Symbol) override; - void EmitAssemblerFlag(MCAssemblerFlag Flag) override; - void EmitThumbFunc(MCSymbol *Func) override; - bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; - void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override; - void BeginCOFFSymbolDef(MCSymbol const *Symbol) override; - void EmitCOFFSymbolStorageClass(int StorageClass) override; - void EmitCOFFSymbolType(int Type) override; - void EndCOFFSymbolDef() override; - void EmitCOFFSectionIndex(MCSymbol const *Symbol) override; - void EmitCOFFSecRel32(MCSymbol const *Symbol) override; - void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) override; - void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment) override; - void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment) override; - void EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - uint64_t Size,unsigned ByteAlignment) override; - void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, - uint64_t Size, unsigned ByteAlignment) override; - void EmitFileDirective(StringRef Filename) override; - void EmitIdent(StringRef IdentString) override; - void EmitWin64EHHandlerData() override; - void FinishImpl() override; - -private: - void EmitInstToData(const MCInst &Inst, const MCSubtargetInfo &STI) override { - MCDataFragment *DF = getOrCreateDataFragment(); - - SmallVector Fixups; - SmallString<256> Code; - raw_svector_ostream VecOS(Code); - getAssembler().getEmitter().EncodeInstruction(Inst, VecOS, Fixups, STI); - VecOS.flush(); - - // Add the fixups and data. - for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { - Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size()); - DF->getFixups().push_back(Fixups[i]); - } - DF->getContents().append(Code.begin(), Code.end()); - } -}; -} // end anonymous namespace. 
- -WinCOFFStreamer::WinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, - MCCodeEmitter &CE, raw_ostream &OS) - : MCObjectStreamer(Context, MAB, OS, &CE), CurSymbol(NULL) {} - -void WinCOFFStreamer::AddCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment, bool External) { - assert(!Symbol->isInSection() && "Symbol must not already have a section!"); +#define DEBUG_TYPE "WinCOFFStreamer" - const MCSection *Section = getContext().getObjectFileInfo()->getBSSSection(); - MCSectionData &SectionData = getAssembler().getOrCreateSectionData(*Section); - if (SectionData.getAlignment() < ByteAlignment) - SectionData.setAlignment(ByteAlignment); +namespace llvm { +MCWinCOFFStreamer::MCWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, + MCCodeEmitter &CE, raw_ostream &OS) + : MCObjectStreamer(Context, MAB, OS, &CE), CurSymbol(nullptr) {} - MCSymbolData &SymbolData = getAssembler().getOrCreateSymbolData(*Symbol); - SymbolData.setExternal(External); +void MCWinCOFFStreamer::EmitInstToData(const MCInst &Inst, + const MCSubtargetInfo &STI) { + MCDataFragment *DF = getOrCreateDataFragment(); - AssignSection(Symbol, Section); + SmallVector Fixups; + SmallString<256> Code; + raw_svector_ostream VecOS(Code); + getAssembler().getEmitter().EncodeInstruction(Inst, VecOS, Fixups, STI); + VecOS.flush(); - if (ByteAlignment != 1) - new MCAlignFragment(ByteAlignment, 0, 0, ByteAlignment, &SectionData); + // Add the fixups and data. + for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { + Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size()); + DF->getFixups().push_back(Fixups[i]); + } - SymbolData.setFragment(new MCFillFragment(0, 0, Size, &SectionData)); + DF->getContents().append(Code.begin(), Code.end()); } -// MCStreamer interface - -void WinCOFFStreamer::InitSections() { +void MCWinCOFFStreamer::InitSections() { // FIXME: this is identical to the ELF one. // This emulates the same behavior of GNU as. This makes it easier // to compare the output as the major sections are in the same order. @@ -139,165 +77,182 @@ void WinCOFFStreamer::InitSections() { SwitchSection(getContext().getObjectFileInfo()->getTextSection()); } -void WinCOFFStreamer::EmitLabel(MCSymbol *Symbol) { +void MCWinCOFFStreamer::EmitLabel(MCSymbol *Symbol) { assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); MCObjectStreamer::EmitLabel(Symbol); } -void WinCOFFStreamer::EmitDebugLabel(MCSymbol *Symbol) { +void MCWinCOFFStreamer::EmitDebugLabel(MCSymbol *Symbol) { EmitLabel(Symbol); } -void WinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { + +void MCWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { llvm_unreachable("not implemented"); } -void WinCOFFStreamer::EmitThumbFunc(MCSymbol *Func) { +void MCWinCOFFStreamer::EmitThumbFunc(MCSymbol *Func) { llvm_unreachable("not implemented"); } -bool WinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol, - MCSymbolAttr Attribute) { +bool MCWinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol, + MCSymbolAttr Attribute) { assert(Symbol && "Symbol must be non-null!"); - assert((Symbol->isInSection() - ? 
Symbol->getSection().getVariant() == MCSection::SV_COFF - : true) && "Got non-COFF section in the COFF backend!"); + assert((!Symbol->isInSection() || + Symbol->getSection().getVariant() == MCSection::SV_COFF) && + "Got non-COFF section in the COFF backend!"); + + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); + switch (Attribute) { + default: return false; case MCSA_WeakReference: - case MCSA_Weak: { - MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); - SD.modifyFlags(COFF::SF_WeakExternal, COFF::SF_WeakExternal); - SD.setExternal(true); - } + case MCSA_Weak: + SD.modifyFlags(COFF::SF_WeakExternal, COFF::SF_WeakExternal); + SD.setExternal(true); break; - case MCSA_Global: - getAssembler().getOrCreateSymbolData(*Symbol).setExternal(true); + SD.setExternal(true); break; - - default: - return false; } return true; } -void WinCOFFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) { +void MCWinCOFFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) { llvm_unreachable("not implemented"); } -void WinCOFFStreamer::BeginCOFFSymbolDef(MCSymbol const *Symbol) { - assert((Symbol->isInSection() - ? Symbol->getSection().getVariant() == MCSection::SV_COFF - : true) && "Got non-COFF section in the COFF backend!"); - assert(CurSymbol == NULL && "EndCOFFSymbolDef must be called between calls " - "to BeginCOFFSymbolDef!"); +void MCWinCOFFStreamer::BeginCOFFSymbolDef(MCSymbol const *Symbol) { + assert((!Symbol->isInSection() || + Symbol->getSection().getVariant() == MCSection::SV_COFF) && + "Got non-COFF section in the COFF backend!"); + + if (CurSymbol) + FatalError("starting a new symbol definition without completing the " + "previous one"); CurSymbol = Symbol; } -void WinCOFFStreamer::EmitCOFFSymbolStorageClass(int StorageClass) { - assert(CurSymbol != NULL && "BeginCOFFSymbolDef must be called first!"); - assert((StorageClass & ~0xFF) == 0 && "StorageClass must only have data in " - "the first byte!"); +void MCWinCOFFStreamer::EmitCOFFSymbolStorageClass(int StorageClass) { + if (!CurSymbol) + FatalError("storage class specified outside of symbol definition"); + + if (StorageClass & ~0xff) + FatalError(Twine("storage class value '") + itostr(StorageClass) + + "' out of range"); - getAssembler().getOrCreateSymbolData(*CurSymbol).modifyFlags( - StorageClass << COFF::SF_ClassShift, - COFF::SF_ClassMask); + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*CurSymbol); + SD.modifyFlags(StorageClass << COFF::SF_ClassShift, COFF::SF_ClassMask); } -void WinCOFFStreamer::EmitCOFFSymbolType(int Type) { - assert(CurSymbol != NULL && "BeginCOFFSymbolDef must be called first!"); - assert((Type & ~0xFFFF) == 0 && "Type must only have data in the first 2 " - "bytes"); +void MCWinCOFFStreamer::EmitCOFFSymbolType(int Type) { + if (!CurSymbol) + FatalError("symbol type specified outside of a symbol definition"); + + if (Type & ~0xffff) + FatalError(Twine("type value '") + itostr(Type) + "' out of range"); - getAssembler().getOrCreateSymbolData(*CurSymbol).modifyFlags( - Type << COFF::SF_TypeShift, - COFF::SF_TypeMask); + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*CurSymbol); + SD.modifyFlags(Type << COFF::SF_TypeShift, COFF::SF_TypeMask); } -void WinCOFFStreamer::EndCOFFSymbolDef() { - assert(CurSymbol != NULL && "BeginCOFFSymbolDef must be called first!"); - CurSymbol = NULL; +void MCWinCOFFStreamer::EndCOFFSymbolDef() { + if (!CurSymbol) + FatalError("ending symbol definition without starting one"); + CurSymbol = nullptr; } -void 
WinCOFFStreamer::EmitCOFFSectionIndex(MCSymbol const *Symbol) { +void MCWinCOFFStreamer::EmitCOFFSectionIndex(MCSymbol const *Symbol) { MCDataFragment *DF = getOrCreateDataFragment(); - DF->getFixups().push_back(MCFixup::Create( - DF->getContents().size(), MCSymbolRefExpr::Create(Symbol, getContext()), - FK_SecRel_2)); + const MCSymbolRefExpr *SRE = MCSymbolRefExpr::Create(Symbol, getContext()); + MCFixup Fixup = MCFixup::Create(DF->getContents().size(), SRE, FK_SecRel_2); + DF->getFixups().push_back(Fixup); DF->getContents().resize(DF->getContents().size() + 4, 0); } -void WinCOFFStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol) { +void MCWinCOFFStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol) { MCDataFragment *DF = getOrCreateDataFragment(); - DF->getFixups().push_back(MCFixup::Create( - DF->getContents().size(), MCSymbolRefExpr::Create(Symbol, getContext()), - FK_SecRel_4)); + const MCSymbolRefExpr *SRE = MCSymbolRefExpr::Create(Symbol, getContext()); + MCFixup Fixup = MCFixup::Create(DF->getContents().size(), SRE, FK_SecRel_4); + DF->getFixups().push_back(Fixup); DF->getContents().resize(DF->getContents().size() + 4, 0); } -void WinCOFFStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) { - llvm_unreachable("not implemented"); +void MCWinCOFFStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) { + llvm_unreachable("not supported"); } -void WinCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment) { - assert((Symbol->isInSection() - ? Symbol->getSection().getVariant() == MCSection::SV_COFF - : true) && "Got non-COFF section in the COFF backend!"); - AddCommonSymbol(Symbol, Size, ByteAlignment, true); +void MCWinCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) { + assert((!Symbol->isInSection() || + Symbol->getSection().getVariant() == MCSection::SV_COFF) && + "Got non-COFF section in the COFF backend!"); + + if (ByteAlignment > 32) + report_fatal_error("alignment is limited to 32-bytes"); + + AssignSection(Symbol, nullptr); + + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); + SD.setExternal(true); + SD.setCommon(Size, ByteAlignment); } -void WinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, - unsigned ByteAlignment) { - assert((Symbol->isInSection() - ? 
Symbol->getSection().getVariant() == MCSection::SV_COFF - : true) && "Got non-COFF section in the COFF backend!"); - AddCommonSymbol(Symbol, Size, ByteAlignment, false); +void MCWinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) { + assert(!Symbol->isInSection() && "Symbol must not already have a section!"); + + const MCSection *Section = getContext().getObjectFileInfo()->getBSSSection(); + MCSectionData &SectionData = getAssembler().getOrCreateSectionData(*Section); + if (SectionData.getAlignment() < ByteAlignment) + SectionData.setAlignment(ByteAlignment); + + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); + SD.setExternal(false); + + AssignSection(Symbol, Section); + + if (ByteAlignment != 1) + new MCAlignFragment(ByteAlignment, /*_Value=*/0, /*_ValueSize=*/0, + ByteAlignment, &SectionData); + + MCFillFragment *Fragment = + new MCFillFragment(/*_Value=*/0, /*_ValueSize=*/0, Size, &SectionData); + SD.setFragment(Fragment); } -void WinCOFFStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, - uint64_t Size,unsigned ByteAlignment) { +void MCWinCOFFStreamer::EmitZerofill(const MCSection *Section, + MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) { llvm_unreachable("not implemented"); } -void WinCOFFStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, - uint64_t Size, unsigned ByteAlignment) { +void MCWinCOFFStreamer::EmitTBSSSymbol(const MCSection *Section, + MCSymbol *Symbol, uint64_t Size, + unsigned ByteAlignment) { llvm_unreachable("not implemented"); } -void WinCOFFStreamer::EmitFileDirective(StringRef Filename) { - // Ignore for now, linkers don't care, and proper debug - // info will be a much large effort. +void MCWinCOFFStreamer::EmitFileDirective(StringRef Filename) { + getAssembler().addFileName(Filename); } // TODO: Implement this if you want to emit .comment section in COFF obj files. -void WinCOFFStreamer::EmitIdent(StringRef IdentString) { - llvm_unreachable("unsupported directive"); +void MCWinCOFFStreamer::EmitIdent(StringRef IdentString) { + llvm_unreachable("not implemented"); } -void WinCOFFStreamer::EmitWin64EHHandlerData() { - MCStreamer::EmitWin64EHHandlerData(); - - // We have to emit the unwind info now, because this directive - // actually switches to the .xdata section! 
-  MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentW64UnwindInfo());
+void MCWinCOFFStreamer::EmitWin64EHHandlerData() {
+  llvm_unreachable("not implemented");
 }

-void WinCOFFStreamer::FinishImpl() {
-  EmitFrames(NULL, true);
-  EmitW64Tables();
+void MCWinCOFFStreamer::FinishImpl() {
   MCObjectStreamer::FinishImpl();
 }

-namespace llvm
-{
-  MCStreamer *createWinCOFFStreamer(MCContext &Context,
-                                    MCAsmBackend &MAB,
-                                    MCCodeEmitter &CE,
-                                    raw_ostream &OS,
-                                    bool RelaxAll) {
-    WinCOFFStreamer *S = new WinCOFFStreamer(Context, MAB, CE, OS);
-    S->getAssembler().setRelaxAll(RelaxAll);
-    return S;
-  }
+LLVM_ATTRIBUTE_NORETURN
+void MCWinCOFFStreamer::FatalError(const Twine &Msg) const {
+  getContext().FatalError(SMLoc(), Msg);
+}
 }
+
diff --git a/lib/Object/Android.mk b/lib/Object/Android.mk
index 7dfa44f..bd9659c 100644
--- a/lib/Object/Android.mk
+++ b/lib/Object/Android.mk
@@ -12,6 +12,7 @@ object_SRC_FILES := \
   MachOUniversal.cpp \
   Object.cpp \
   ObjectFile.cpp \
+  StringTableBuilder.cpp \
   SymbolicFile.cpp

diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp
index 999bf28..304ca47 100644
--- a/lib/Object/Archive.cpp
+++ b/lib/Object/Archive.cpp
@@ -13,7 +13,6 @@

 #include "llvm/Object/Archive.h"
 #include "llvm/ADT/APInt.h"
-#include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/Endian.h"
@@ -111,7 +110,7 @@ Archive::Child Archive::Child::getNext() const {

   // Check to see if this is past the end of the archive.
   if (NextLoc >= Parent->Data->getBufferEnd())
-    return Child(Parent, NULL);
+    return Child(Parent, nullptr);

   return Child(Parent, NextLoc);
 }
@@ -183,14 +182,6 @@ error_code Archive::Child::getMemoryBuffer(std::unique_ptr<MemoryBuffer> &Result,
   return error_code::success();
 }

-error_code Archive::Child::getMemoryBuffer(OwningPtr<MemoryBuffer> &Result,
-                                           bool FullPath) const {
-  std::unique_ptr<MemoryBuffer> MB;
-  error_code ec = getMemoryBuffer(MB, FullPath);
-  Result = std::move(MB);
-  return ec;
-}
-
 error_code Archive::Child::getAsBinary(std::unique_ptr<Binary> &Result,
                                        LLVMContext *Context) const {
   std::unique_ptr<Binary> ret;
@@ -204,14 +195,6 @@ error_code Archive::Child::getAsBinary(std::unique_ptr<Binary> &Result,
   return object_error::success;
 }

-error_code Archive::Child::getAsBinary(OwningPtr<Binary> &Result,
-                                       LLVMContext *Context) const {
-  std::unique_ptr<Binary> B;
-  error_code ec = getAsBinary(B, Context);
-  Result = std::move(B);
-  return ec;
-}
-
 ErrorOr<Archive *> Archive::create(MemoryBuffer *Source) {
   error_code EC;
   std::unique_ptr<Archive> Ret(new Archive(Source, EC));
@@ -349,7 +332,7 @@ Archive::child_iterator Archive::child_begin(bool SkipInternal) const {
 }

 Archive::child_iterator Archive::child_end() const {
-  return Child(this, NULL);
+  return Child(this, nullptr);
 }

 error_code Archive::Symbol::getName(StringRef &Result) const {
diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt
index dc18296..cd8c9ef 100644
--- a/lib/Object/CMakeLists.txt
+++ b/lib/Object/CMakeLists.txt
@@ -12,6 +12,7 @@ add_llvm_library(LLVMObject
   MachOUniversal.cpp
   Object.cpp
   ObjectFile.cpp
+  StringTableBuilder.cpp
   SymbolicFile.cpp
   YAML.cpp
   )
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index a75ebbf..262c040 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -138,7 +138,7 @@ error_code COFFObjectFile::getSymbolName(DataRefImpl Ref,
 error_code COFFObjectFile::getSymbolAddress(DataRefImpl Ref,
                                             uint64_t &Result) const {
   const coff_symbol *Symb = toSymb(Ref);
-  const coff_section *Section = NULL;
+  const coff_section *Section = nullptr;
   if
(error_code EC = getSection(Symb->SectionNumber, Section)) return EC; @@ -163,7 +163,7 @@ error_code COFFObjectFile::getSymbolType(DataRefImpl Ref, } else { uint32_t Characteristics = 0; if (!COFF::isReservedSectionNumber(Symb->SectionNumber)) { - const coff_section *Section = NULL; + const coff_section *Section = nullptr; if (error_code EC = getSection(Symb->SectionNumber, Section)) return EC; Characteristics = Section->Characteristics; @@ -208,7 +208,7 @@ error_code COFFObjectFile::getSymbolSize(DataRefImpl Ref, // in the same section as this symbol, and looking for either the next // symbol, or the end of the section. const coff_symbol *Symb = toSymb(Ref); - const coff_section *Section = NULL; + const coff_section *Section = nullptr; if (error_code EC = getSection(Symb->SectionNumber, Section)) return EC; @@ -227,7 +227,7 @@ error_code COFFObjectFile::getSymbolSection(DataRefImpl Ref, if (COFF::isReservedSectionNumber(Symb->SectionNumber)) { Result = section_end(); } else { - const coff_section *Sec = 0; + const coff_section *Sec = nullptr; if (error_code EC = getSection(Symb->SectionNumber, Sec)) return EC; DataRefImpl Ref; Ref.p = reinterpret_cast(Sec); @@ -334,7 +334,7 @@ error_code COFFObjectFile::sectionContainsSymbol(DataRefImpl SecRef, bool &Result) const { const coff_section *Sec = toSec(SecRef); const coff_symbol *Symb = toSymb(SymbRef); - const coff_section *SymbSec = 0; + const coff_section *SymbSec = nullptr; if (error_code EC = getSection(Symb->SectionNumber, SymbSec)) return EC; if (SymbSec == Sec) Result = true; @@ -389,11 +389,6 @@ relocation_iterator COFFObjectFile::section_rel_end(DataRefImpl Ref) const { return relocation_iterator(RelocationRef(Ret, this)); } -bool COFFObjectFile::section_rel_empty(DataRefImpl Ref) const { - const coff_section *Sec = toSec(Ref); - return Sec->NumberOfRelocations == 0; -} - // Initialize the pointer to the symbol table. error_code COFFObjectFile::initSymbolTablePtr() { if (error_code EC = getObject( @@ -512,10 +507,11 @@ error_code COFFObjectFile::initExportTablePtr() { COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &EC, bool BufferOwned) - : ObjectFile(Binary::ID_COFF, Object, BufferOwned), COFFHeader(0), - PE32Header(0), PE32PlusHeader(0), DataDirectory(0), SectionTable(0), - SymbolTable(0), StringTable(0), StringTableSize(0), ImportDirectory(0), - NumberOfImportDirectory(0), ExportDirectory(0) { + : ObjectFile(Binary::ID_COFF, Object, BufferOwned), COFFHeader(nullptr), + PE32Header(nullptr), PE32PlusHeader(nullptr), DataDirectory(nullptr), + SectionTable(nullptr), SymbolTable(nullptr), StringTable(nullptr), + StringTableSize(0), ImportDirectory(nullptr), NumberOfImportDirectory(0), + ExportDirectory(nullptr) { // Check that we at least have enough room for a header. if (!checkSize(Data, EC, sizeof(coff_file_header))) return; @@ -637,8 +633,8 @@ export_directory_iterator COFFObjectFile::export_directory_begin() const { } export_directory_iterator COFFObjectFile::export_directory_end() const { - if (ExportDirectory == 0) - return export_directory_iterator(ExportDirectoryEntryRef(0, 0, this)); + if (!ExportDirectory) + return export_directory_iterator(ExportDirectoryEntryRef(nullptr, 0, this)); ExportDirectoryEntryRef Ref(ExportDirectory, ExportDirectory->AddressTableEntries, this); return export_directory_iterator(Ref); @@ -728,7 +724,7 @@ error_code COFFObjectFile::getSection(int32_t Index, const coff_section *&Result) const { // Check for special index values. 
 if (COFF::isReservedSectionNumber(Index))
-    Result = NULL;
+    Result = nullptr;
   else if (Index > 0 && Index <= COFFHeader->NumberOfSections)
     // We already verified the section table data, so no need to check again.
     Result = SectionTable + (Index - 1);
@@ -778,7 +774,7 @@ error_code COFFObjectFile::getSymbolName(const coff_symbol *Symbol,

 ArrayRef<uint8_t> COFFObjectFile::getSymbolAuxData(
                                   const coff_symbol *Symbol) const {
-  const uint8_t *Aux = NULL;
+  const uint8_t *Aux = nullptr;

   if (Symbol->NumberOfAuxSymbols > 0) {
     // AUX data comes immediately after the symbol in COFF
@@ -923,6 +919,27 @@ error_code COFFObjectFile::getRelocationTypeName(DataRefImpl Rel,
       Res = "Unknown";
     }
     break;
+  case COFF::IMAGE_FILE_MACHINE_ARMNT:
+    switch (Reloc->Type) {
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_ABSOLUTE);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_ADDR32);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_ADDR32NB);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_BRANCH24);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_BRANCH11);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_TOKEN);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_BLX24);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_BLX11);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_SECTION);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_SECREL);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_MOV32A);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_MOV32T);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_BRANCH20T);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_BRANCH24T);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM_BLX23T);
+    default:
+      Res = "Unknown";
+    }
+    break;
   case COFF::IMAGE_FILE_MACHINE_I386:
     switch (Reloc->Type) {
     LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_ABSOLUTE);
@@ -952,7 +969,7 @@ error_code COFFObjectFile::getRelocationTypeName(DataRefImpl Rel,
 error_code COFFObjectFile::getRelocationValueString(DataRefImpl Rel,
                                           SmallVectorImpl<char> &Result) const {
   const coff_relocation *Reloc = toRel(Rel);
-  const coff_symbol *Symb = 0;
+  const coff_symbol *Symb = nullptr;
   if (error_code EC = getSymbol(Reloc->SymbolTableIndex, Symb)) return EC;
   DataRefImpl Sym;
   Sym.p = reinterpret_cast<uintptr_t>(Symb);
diff --git a/lib/Object/COFFYAML.cpp b/lib/Object/COFFYAML.cpp
index 94b72ff..49c5dda 100644
--- a/lib/Object/COFFYAML.cpp
+++ b/lib/Object/COFFYAML.cpp
@@ -38,6 +38,7 @@
 void ScalarEnumerationTraits<COFFYAML::WeakExternalCharacteristics>::enumeration(
     IO &IO, COFFYAML::WeakExternalCharacteristics &Value) {
+  IO.enumCase(Value, "0", 0);
   ECase(IMAGE_WEAK_EXTERN_SEARCH_NOLIBRARY);
   ECase(IMAGE_WEAK_EXTERN_SEARCH_LIBRARY);
   ECase(IMAGE_WEAK_EXTERN_SEARCH_ALIAS);
@@ -132,8 +133,8 @@ void ScalarEnumerationTraits<COFF::SymbolComplexType>::enumeration(
   ECase(IMAGE_SYM_DTYPE_ARRAY);
 }

-void ScalarEnumerationTraits<COFF::RelocationTypeX86>::enumeration(
-    IO &IO, COFF::RelocationTypeX86 &Value) {
+void ScalarEnumerationTraits<COFF::RelocationTypeI386>::enumeration(
+    IO &IO, COFF::RelocationTypeI386 &Value) {
   ECase(IMAGE_REL_I386_ABSOLUTE);
   ECase(IMAGE_REL_I386_DIR16);
   ECase(IMAGE_REL_I386_REL16);
   ECase(IMAGE_REL_I386_DIR32);
   ECase(IMAGE_REL_I386_DIR32NB);
   ECase(IMAGE_REL_I386_SEG12);
   ECase(IMAGE_REL_I386_SECTION);
   ECase(IMAGE_REL_I386_SECREL);
   ECase(IMAGE_REL_I386_TOKEN);
   ECase(IMAGE_REL_I386_SECREL7);
   ECase(IMAGE_REL_I386_REL32);
+}
+
+void ScalarEnumerationTraits<COFF::RelocationTypeAMD64>::enumeration(
+    IO &IO, COFF::RelocationTypeAMD64 &Value) {
   ECase(IMAGE_REL_AMD64_ABSOLUTE);
   ECase(IMAGE_REL_AMD64_ADDR64);
   ECase(IMAGE_REL_AMD64_ADDR32);
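The COFFYAML hunks here replace the single catch-all x86 relocation enumeration with per-machine ones chosen at mapping time. The ECase idiom is just a string-to-value table consulted through the IO object; a freestanding mimic with a toy IO type (all names below invented for illustration, except the real relocation values) shows the shape:

#include <cstdint>
#include <string>

// Toy stand-in for yaml::IO's enumCase: resolves the input string to the
// matching enumerator value.
struct ToyIO {
  std::string In;
  bool Matched = false;
  void enumCase(uint16_t &Val, const char *Name, uint16_t Case) {
    if (!Matched && In == Name) { Val = Case; Matched = true; }
  }
};

enum RelocI386 : uint16_t { IMAGE_REL_I386_ABSOLUTE = 0, IMAGE_REL_I386_REL32 = 20 };
enum RelocAMD64 : uint16_t { IMAGE_REL_AMD64_ABSOLUTE = 0, IMAGE_REL_AMD64_REL32 = 4 };

#define ECASE(X) IO.enumCase(Val, #X, X)
// Per-machine enumeration, dispatched the same way the patched mapping()
// dispatches on COFF::header::Machine pulled from the IO context.
static void enumerate(ToyIO &IO, uint16_t Machine, uint16_t &Val) {
  if (Machine == 0x14c) { // IMAGE_FILE_MACHINE_I386
    ECASE(IMAGE_REL_I386_ABSOLUTE);
    ECASE(IMAGE_REL_I386_REL32);
  } else {                // treat anything else as AMD64 in this sketch
    ECASE(IMAGE_REL_AMD64_ABSOLUTE);
    ECASE(IMAGE_REL_AMD64_REL32);
  }
}
#undef ECASE

int main() {
  ToyIO IO;
  IO.In = "IMAGE_REL_I386_REL32";
  uint16_t Val = 0;
  enumerate(IO, 0x14c, Val);
  return Val == 20 ? 0 : 1; // same name would mean 4 under AMD64
}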
@@ -272,22 +277,33 @@ struct NHeaderCharacteristics {
   COFF::Characteristics Characteristics;
 };

+template <typename RelocType>
 struct NType {
-  NType(IO &) : Type(COFF::RelocationTypeX86(0)) {}
-  NType(IO &, uint16_t T) : Type(COFF::RelocationTypeX86(T)) {}
+  NType(IO &) : Type(RelocType(0)) {}
+  NType(IO &, uint16_t T) : Type(RelocType(T)) {}
   uint16_t denormalize(IO &) { return Type; }
-  COFF::RelocationTypeX86 Type;
+  RelocType Type;
 };

 }

 void MappingTraits<COFFYAML::Relocation>::mapping(IO &IO,
                                                   COFFYAML::Relocation &Rel) {
-  MappingNormalization<NType, uint16_t> NT(IO, Rel.Type);
-
   IO.mapRequired("VirtualAddress", Rel.VirtualAddress);
   IO.mapRequired("SymbolName", Rel.SymbolName);
-  IO.mapRequired("Type", NT->Type);
+
+  COFF::header &H = *static_cast<COFF::header *>(IO.getContext());
+  if (H.Machine == COFF::IMAGE_FILE_MACHINE_I386) {
+    MappingNormalization<NType<COFF::RelocationTypeI386>, uint16_t> NT(
+        IO, Rel.Type);
+    IO.mapRequired("Type", NT->Type);
+  } else if (H.Machine == COFF::IMAGE_FILE_MACHINE_AMD64) {
+    MappingNormalization<NType<COFF::RelocationTypeAMD64>, uint16_t> NT(
+        IO, Rel.Type);
+    IO.mapRequired("Type", NT->Type);
+  } else {
+    IO.mapRequired("Type", Rel.Type);
+  }
 }

 void MappingTraits<COFF::header>::mapping(IO &IO, COFF::header &H) {
@@ -297,6 +313,7 @@ void MappingTraits<COFF::header>::mapping(IO &IO, COFF::header &H) {

   IO.mapRequired("Machine", NM->Machine);
   IO.mapOptional("Characteristics", NC->Characteristics);
+  IO.setContext(static_cast<void *>(&H));
 }

 void MappingTraits::mapping(
diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp
index e9a88bf..df4dd5e 100644
--- a/lib/Object/ELF.cpp
+++ b/lib/Object/ELF.cpp
@@ -159,6 +159,15 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) {
     LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL_HI16);
     LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL_LO16);
     LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GLOB_DAT);
+    LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC21_S2);
+    LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC26_S2);
+    LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC18_S3);
+    LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC19_S2);
+    LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PCHI16);
+    LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PCLO16);
+    LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS16_GOT16);
+    LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS16_HI16);
+    LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS16_LO16);
     LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_COPY);
     LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_JUMP_SLOT);
     LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_26_S1);
@@ -177,6 +186,7 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) {
     LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_TPREL_HI16);
     LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_TPREL_LO16);
     LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_NUM);
+    LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC32);
   default:
     break;
   }
diff --git a/lib/Object/ELFYAML.cpp b/lib/Object/ELFYAML.cpp
index d513670..7d50f23 100644
--- a/lib/Object/ELFYAML.cpp
+++ b/lib/Object/ELFYAML.cpp
@@ -12,8 +12,12 @@
 //===----------------------------------------------------------------------===//

 #include "llvm/Object/ELFYAML.h"
+#include "llvm/Support/Casting.h"

 namespace llvm {
+
+ELFYAML::Section::~Section() {}
+
 namespace yaml {

 void
@@ -239,44 +243,57 @@ void ScalarEnumerationTraits::enumeration(
 void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
                                                  ELFYAML::ELF_EF &Value) {
+  const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
+  assert(Object && "The IO context is not initialized");
 #define BCase(X) IO.bitSetCase(Value, #X, ELF::X);
-  BCase(EF_ARM_SOFT_FLOAT)
-  BCase(EF_ARM_VFP_FLOAT)
-  BCase(EF_ARM_EABI_UNKNOWN)
-  BCase(EF_ARM_EABI_VER1)
-  BCase(EF_ARM_EABI_VER2)
-  BCase(EF_ARM_EABI_VER3)
-  BCase(EF_ARM_EABI_VER4)
-  BCase(EF_ARM_EABI_VER5)
-  BCase(EF_ARM_EABIMASK)
-  BCase(EF_MIPS_NOREORDER)
-  BCase(EF_MIPS_PIC)
-  BCase(EF_MIPS_CPIC)
-  BCase(EF_MIPS_ABI2)
-  BCase(EF_MIPS_32BITMODE)
-  
BCase(EF_MIPS_ABI_O32) - BCase(EF_MIPS_MICROMIPS) - BCase(EF_MIPS_ARCH_ASE_M16) - BCase(EF_MIPS_ARCH_1) - BCase(EF_MIPS_ARCH_2) - BCase(EF_MIPS_ARCH_3) - BCase(EF_MIPS_ARCH_4) - BCase(EF_MIPS_ARCH_5) - BCase(EF_MIPS_ARCH_32) - BCase(EF_MIPS_ARCH_64) - BCase(EF_MIPS_ARCH_32R2) - BCase(EF_MIPS_ARCH_64R2) - BCase(EF_MIPS_ARCH) - BCase(EF_HEXAGON_MACH_V2) - BCase(EF_HEXAGON_MACH_V3) - BCase(EF_HEXAGON_MACH_V4) - BCase(EF_HEXAGON_MACH_V5) - BCase(EF_HEXAGON_ISA_MACH) - BCase(EF_HEXAGON_ISA_V2) - BCase(EF_HEXAGON_ISA_V3) - BCase(EF_HEXAGON_ISA_V4) - BCase(EF_HEXAGON_ISA_V5) +#define BCaseMask(X, M) IO.maskedBitSetCase(Value, #X, ELF::X, ELF::M); + switch (Object->Header.Machine) { + case ELF::EM_ARM: + BCase(EF_ARM_SOFT_FLOAT) + BCase(EF_ARM_VFP_FLOAT) + BCaseMask(EF_ARM_EABI_UNKNOWN, EF_ARM_EABIMASK) + BCaseMask(EF_ARM_EABI_VER1, EF_ARM_EABIMASK) + BCaseMask(EF_ARM_EABI_VER2, EF_ARM_EABIMASK) + BCaseMask(EF_ARM_EABI_VER3, EF_ARM_EABIMASK) + BCaseMask(EF_ARM_EABI_VER4, EF_ARM_EABIMASK) + BCaseMask(EF_ARM_EABI_VER5, EF_ARM_EABIMASK) + break; + case ELF::EM_MIPS: + BCase(EF_MIPS_NOREORDER) + BCase(EF_MIPS_PIC) + BCase(EF_MIPS_CPIC) + BCase(EF_MIPS_ABI2) + BCase(EF_MIPS_32BITMODE) + BCase(EF_MIPS_ABI_O32) + BCase(EF_MIPS_MICROMIPS) + BCase(EF_MIPS_ARCH_ASE_M16) + BCaseMask(EF_MIPS_ARCH_1, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_2, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_3, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_4, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_5, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_32, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_64, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_32R2, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_64R2, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_32R6, EF_MIPS_ARCH) + BCaseMask(EF_MIPS_ARCH_64R6, EF_MIPS_ARCH) + break; + case ELF::EM_HEXAGON: + BCase(EF_HEXAGON_MACH_V2) + BCase(EF_HEXAGON_MACH_V3) + BCase(EF_HEXAGON_MACH_V4) + BCase(EF_HEXAGON_MACH_V5) + BCase(EF_HEXAGON_ISA_V2) + BCase(EF_HEXAGON_ISA_V3) + BCase(EF_HEXAGON_ISA_V4) + BCase(EF_HEXAGON_ISA_V5) + break; + default: + llvm_unreachable("Unsupported architecture"); + } #undef BCase +#undef BCaseMask } void ScalarEnumerationTraits::enumeration( @@ -300,6 +317,23 @@ void ScalarEnumerationTraits::enumeration( ECase(SHT_PREINIT_ARRAY) ECase(SHT_GROUP) ECase(SHT_SYMTAB_SHNDX) + ECase(SHT_LOOS) + ECase(SHT_GNU_ATTRIBUTES) + ECase(SHT_GNU_HASH) + ECase(SHT_GNU_verdef) + ECase(SHT_GNU_verneed) + ECase(SHT_GNU_versym) + ECase(SHT_HIOS) + ECase(SHT_LOPROC) + ECase(SHT_ARM_EXIDX) + ECase(SHT_ARM_PREEMPTMAP) + ECase(SHT_ARM_ATTRIBUTES) + ECase(SHT_ARM_DEBUGOVERLAY) + ECase(SHT_ARM_OVERLAYSECTION) + ECase(SHT_HEX_ORDERED) + ECase(SHT_X86_64_UNWIND) + ECase(SHT_MIPS_REGINFO) + ECase(SHT_MIPS_OPTIONS) #undef ECase } @@ -334,6 +368,270 @@ void ScalarEnumerationTraits::enumeration( #undef ECase } +void ScalarEnumerationTraits::enumeration( + IO &IO, ELFYAML::ELF_REL &Value) { + const auto *Object = static_cast(IO.getContext()); + assert(Object && "The IO context is not initialized"); +#define ECase(X) IO.enumCase(Value, #X, ELF::X); + switch (Object->Header.Machine) { + case ELF::EM_X86_64: + ECase(R_X86_64_NONE) + ECase(R_X86_64_64) + ECase(R_X86_64_PC32) + ECase(R_X86_64_GOT32) + ECase(R_X86_64_PLT32) + ECase(R_X86_64_COPY) + ECase(R_X86_64_GLOB_DAT) + ECase(R_X86_64_JUMP_SLOT) + ECase(R_X86_64_RELATIVE) + ECase(R_X86_64_GOTPCREL) + ECase(R_X86_64_32) + ECase(R_X86_64_32S) + ECase(R_X86_64_16) + ECase(R_X86_64_PC16) + ECase(R_X86_64_8) + ECase(R_X86_64_PC8) + ECase(R_X86_64_DTPMOD64) + ECase(R_X86_64_DTPOFF64) + ECase(R_X86_64_TPOFF64) 
+ ECase(R_X86_64_TLSGD) + ECase(R_X86_64_TLSLD) + ECase(R_X86_64_DTPOFF32) + ECase(R_X86_64_GOTTPOFF) + ECase(R_X86_64_TPOFF32) + ECase(R_X86_64_PC64) + ECase(R_X86_64_GOTOFF64) + ECase(R_X86_64_GOTPC32) + ECase(R_X86_64_GOT64) + ECase(R_X86_64_GOTPCREL64) + ECase(R_X86_64_GOTPC64) + ECase(R_X86_64_GOTPLT64) + ECase(R_X86_64_PLTOFF64) + ECase(R_X86_64_SIZE32) + ECase(R_X86_64_SIZE64) + ECase(R_X86_64_GOTPC32_TLSDESC) + ECase(R_X86_64_TLSDESC_CALL) + ECase(R_X86_64_TLSDESC) + ECase(R_X86_64_IRELATIVE) + break; + case ELF::EM_MIPS: + ECase(R_MIPS_NONE) + ECase(R_MIPS_16) + ECase(R_MIPS_32) + ECase(R_MIPS_REL32) + ECase(R_MIPS_26) + ECase(R_MIPS_HI16) + ECase(R_MIPS_LO16) + ECase(R_MIPS_GPREL16) + ECase(R_MIPS_LITERAL) + ECase(R_MIPS_GOT16) + ECase(R_MIPS_PC16) + ECase(R_MIPS_CALL16) + ECase(R_MIPS_GPREL32) + ECase(R_MIPS_UNUSED1) + ECase(R_MIPS_UNUSED2) + ECase(R_MIPS_SHIFT5) + ECase(R_MIPS_SHIFT6) + ECase(R_MIPS_64) + ECase(R_MIPS_GOT_DISP) + ECase(R_MIPS_GOT_PAGE) + ECase(R_MIPS_GOT_OFST) + ECase(R_MIPS_GOT_HI16) + ECase(R_MIPS_GOT_LO16) + ECase(R_MIPS_SUB) + ECase(R_MIPS_INSERT_A) + ECase(R_MIPS_INSERT_B) + ECase(R_MIPS_DELETE) + ECase(R_MIPS_HIGHER) + ECase(R_MIPS_HIGHEST) + ECase(R_MIPS_CALL_HI16) + ECase(R_MIPS_CALL_LO16) + ECase(R_MIPS_SCN_DISP) + ECase(R_MIPS_REL16) + ECase(R_MIPS_ADD_IMMEDIATE) + ECase(R_MIPS_PJUMP) + ECase(R_MIPS_RELGOT) + ECase(R_MIPS_JALR) + ECase(R_MIPS_TLS_DTPMOD32) + ECase(R_MIPS_TLS_DTPREL32) + ECase(R_MIPS_TLS_DTPMOD64) + ECase(R_MIPS_TLS_DTPREL64) + ECase(R_MIPS_TLS_GD) + ECase(R_MIPS_TLS_LDM) + ECase(R_MIPS_TLS_DTPREL_HI16) + ECase(R_MIPS_TLS_DTPREL_LO16) + ECase(R_MIPS_TLS_GOTTPREL) + ECase(R_MIPS_TLS_TPREL32) + ECase(R_MIPS_TLS_TPREL64) + ECase(R_MIPS_TLS_TPREL_HI16) + ECase(R_MIPS_TLS_TPREL_LO16) + ECase(R_MIPS_GLOB_DAT) + ECase(R_MIPS_PC21_S2) + ECase(R_MIPS_PC26_S2) + ECase(R_MIPS_PC18_S3) + ECase(R_MIPS_PC19_S2) + ECase(R_MIPS_PCHI16) + ECase(R_MIPS_PCLO16) + ECase(R_MIPS16_GOT16) + ECase(R_MIPS16_HI16) + ECase(R_MIPS16_LO16) + ECase(R_MIPS_COPY) + ECase(R_MIPS_JUMP_SLOT) + ECase(R_MICROMIPS_26_S1) + ECase(R_MICROMIPS_HI16) + ECase(R_MICROMIPS_LO16) + ECase(R_MICROMIPS_GOT16) + ECase(R_MICROMIPS_PC16_S1) + ECase(R_MICROMIPS_CALL16) + ECase(R_MICROMIPS_GOT_DISP) + ECase(R_MICROMIPS_GOT_PAGE) + ECase(R_MICROMIPS_GOT_OFST) + ECase(R_MICROMIPS_TLS_GD) + ECase(R_MICROMIPS_TLS_LDM) + ECase(R_MICROMIPS_TLS_DTPREL_HI16) + ECase(R_MICROMIPS_TLS_DTPREL_LO16) + ECase(R_MICROMIPS_TLS_TPREL_HI16) + ECase(R_MICROMIPS_TLS_TPREL_LO16) + ECase(R_MIPS_NUM) + ECase(R_MIPS_PC32) + break; + case ELF::EM_HEXAGON: + ECase(R_HEX_NONE) + ECase(R_HEX_B22_PCREL) + ECase(R_HEX_B15_PCREL) + ECase(R_HEX_B7_PCREL) + ECase(R_HEX_LO16) + ECase(R_HEX_HI16) + ECase(R_HEX_32) + ECase(R_HEX_16) + ECase(R_HEX_8) + ECase(R_HEX_GPREL16_0) + ECase(R_HEX_GPREL16_1) + ECase(R_HEX_GPREL16_2) + ECase(R_HEX_GPREL16_3) + ECase(R_HEX_HL16) + ECase(R_HEX_B13_PCREL) + ECase(R_HEX_B9_PCREL) + ECase(R_HEX_B32_PCREL_X) + ECase(R_HEX_32_6_X) + ECase(R_HEX_B22_PCREL_X) + ECase(R_HEX_B15_PCREL_X) + ECase(R_HEX_B13_PCREL_X) + ECase(R_HEX_B9_PCREL_X) + ECase(R_HEX_B7_PCREL_X) + ECase(R_HEX_16_X) + ECase(R_HEX_12_X) + ECase(R_HEX_11_X) + ECase(R_HEX_10_X) + ECase(R_HEX_9_X) + ECase(R_HEX_8_X) + ECase(R_HEX_7_X) + ECase(R_HEX_6_X) + ECase(R_HEX_32_PCREL) + ECase(R_HEX_COPY) + ECase(R_HEX_GLOB_DAT) + ECase(R_HEX_JMP_SLOT) + ECase(R_HEX_RELATIVE) + ECase(R_HEX_PLT_B22_PCREL) + ECase(R_HEX_GOTREL_LO16) + ECase(R_HEX_GOTREL_HI16) + ECase(R_HEX_GOTREL_32) + ECase(R_HEX_GOT_LO16) + ECase(R_HEX_GOT_HI16) + 
ECase(R_HEX_GOT_32) + ECase(R_HEX_GOT_16) + ECase(R_HEX_DTPMOD_32) + ECase(R_HEX_DTPREL_LO16) + ECase(R_HEX_DTPREL_HI16) + ECase(R_HEX_DTPREL_32) + ECase(R_HEX_DTPREL_16) + ECase(R_HEX_GD_PLT_B22_PCREL) + ECase(R_HEX_GD_GOT_LO16) + ECase(R_HEX_GD_GOT_HI16) + ECase(R_HEX_GD_GOT_32) + ECase(R_HEX_GD_GOT_16) + ECase(R_HEX_IE_LO16) + ECase(R_HEX_IE_HI16) + ECase(R_HEX_IE_32) + ECase(R_HEX_IE_GOT_LO16) + ECase(R_HEX_IE_GOT_HI16) + ECase(R_HEX_IE_GOT_32) + ECase(R_HEX_IE_GOT_16) + ECase(R_HEX_TPREL_LO16) + ECase(R_HEX_TPREL_HI16) + ECase(R_HEX_TPREL_32) + ECase(R_HEX_TPREL_16) + ECase(R_HEX_6_PCREL_X) + ECase(R_HEX_GOTREL_32_6_X) + ECase(R_HEX_GOTREL_16_X) + ECase(R_HEX_GOTREL_11_X) + ECase(R_HEX_GOT_32_6_X) + ECase(R_HEX_GOT_16_X) + ECase(R_HEX_GOT_11_X) + ECase(R_HEX_DTPREL_32_6_X) + ECase(R_HEX_DTPREL_16_X) + ECase(R_HEX_DTPREL_11_X) + ECase(R_HEX_GD_GOT_32_6_X) + ECase(R_HEX_GD_GOT_16_X) + ECase(R_HEX_GD_GOT_11_X) + ECase(R_HEX_IE_32_6_X) + ECase(R_HEX_IE_16_X) + ECase(R_HEX_IE_GOT_32_6_X) + ECase(R_HEX_IE_GOT_16_X) + ECase(R_HEX_IE_GOT_11_X) + ECase(R_HEX_TPREL_32_6_X) + ECase(R_HEX_TPREL_16_X) + ECase(R_HEX_TPREL_11_X) + break; + case ELF::EM_386: + ECase(R_386_NONE) + ECase(R_386_32) + ECase(R_386_PC32) + ECase(R_386_GOT32) + ECase(R_386_PLT32) + ECase(R_386_COPY) + ECase(R_386_GLOB_DAT) + ECase(R_386_JUMP_SLOT) + ECase(R_386_RELATIVE) + ECase(R_386_GOTOFF) + ECase(R_386_GOTPC) + ECase(R_386_32PLT) + ECase(R_386_TLS_TPOFF) + ECase(R_386_TLS_IE) + ECase(R_386_TLS_GOTIE) + ECase(R_386_TLS_LE) + ECase(R_386_TLS_GD) + ECase(R_386_TLS_LDM) + ECase(R_386_16) + ECase(R_386_PC16) + ECase(R_386_8) + ECase(R_386_PC8) + ECase(R_386_TLS_GD_32) + ECase(R_386_TLS_GD_PUSH) + ECase(R_386_TLS_GD_CALL) + ECase(R_386_TLS_GD_POP) + ECase(R_386_TLS_LDM_32) + ECase(R_386_TLS_LDM_PUSH) + ECase(R_386_TLS_LDM_CALL) + ECase(R_386_TLS_LDM_POP) + ECase(R_386_TLS_LDO_32) + ECase(R_386_TLS_IE_32) + ECase(R_386_TLS_LE_32) + ECase(R_386_TLS_DTPMOD32) + ECase(R_386_TLS_DTPOFF32) + ECase(R_386_TLS_TPOFF32) + ECase(R_386_TLS_GOTDESC) + ECase(R_386_TLS_DESC_CALL) + ECase(R_386_TLS_DESC) + ECase(R_386_IRELATIVE) + ECase(R_386_NUM) + break; + default: + llvm_unreachable("Unsupported architecture"); + } +#undef ECase +} + void MappingTraits::mapping(IO &IO, ELFYAML::FileHeader &FileHdr) { IO.mapRequired("Class", FileHdr.Class); @@ -360,21 +658,72 @@ void MappingTraits::mapping( IO.mapOptional("Weak", Symbols.Weak); } -void MappingTraits::mapping(IO &IO, - ELFYAML::Section &Section) { +static void commonSectionMapping(IO &IO, ELFYAML::Section &Section) { IO.mapOptional("Name", Section.Name, StringRef()); IO.mapRequired("Type", Section.Type); IO.mapOptional("Flags", Section.Flags, ELFYAML::ELF_SHF(0)); IO.mapOptional("Address", Section.Address, Hex64(0)); - IO.mapOptional("Content", Section.Content); - IO.mapOptional("Link", Section.Link); + IO.mapOptional("Link", Section.Link, StringRef()); + IO.mapOptional("Info", Section.Info, StringRef()); IO.mapOptional("AddressAlign", Section.AddressAlign, Hex64(0)); } +static void sectionMapping(IO &IO, ELFYAML::RawContentSection &Section) { + commonSectionMapping(IO, Section); + IO.mapOptional("Content", Section.Content); + IO.mapOptional("Size", Section.Size, Hex64(Section.Content.binary_size())); +} + +static void sectionMapping(IO &IO, ELFYAML::RelocationSection &Section) { + commonSectionMapping(IO, Section); + IO.mapOptional("Relocations", Section.Relocations); +} + +void MappingTraits>::mapping( + IO &IO, std::unique_ptr &Section) { + ELFYAML::ELF_SHT sectionType; + if 
(IO.outputting()) + sectionType = Section->Type; + else + IO.mapRequired("Type", sectionType); + + switch (sectionType) { + case ELF::SHT_REL: + case ELF::SHT_RELA: + if (!IO.outputting()) + Section.reset(new ELFYAML::RelocationSection()); + sectionMapping(IO, *cast(Section.get())); + break; + default: + if (!IO.outputting()) + Section.reset(new ELFYAML::RawContentSection()); + sectionMapping(IO, *cast(Section.get())); + } +} + +StringRef MappingTraits>::validate( + IO &io, std::unique_ptr &Section) { + const auto *RawSection = dyn_cast(Section.get()); + if (!RawSection || RawSection->Size >= RawSection->Content.binary_size()) + return StringRef(); + return "Section size must be greater or equal to the content size"; +} + +void MappingTraits::mapping(IO &IO, + ELFYAML::Relocation &Rel) { + IO.mapRequired("Offset", Rel.Offset); + IO.mapRequired("Symbol", Rel.Symbol); + IO.mapRequired("Type", Rel.Type); + IO.mapOptional("Addend", Rel.Addend); +} + void MappingTraits::mapping(IO &IO, ELFYAML::Object &Object) { + assert(!IO.getContext() && "The IO context is initialized already"); + IO.setContext(&Object); IO.mapRequired("FileHeader", Object.Header); IO.mapOptional("Sections", Object.Sections); IO.mapOptional("Symbols", Object.Symbols); + IO.setContext(nullptr); } } // end namespace yaml diff --git a/lib/Object/LLVMBuild.txt b/lib/Object/LLVMBuild.txt index a87da6e..7813832 100644 --- a/lib/Object/LLVMBuild.txt +++ b/lib/Object/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = Object parent = Libraries -required_libraries = Support BitReader +required_libraries = BitReader Core Support diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 12132a4..c6bab03 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -420,7 +420,8 @@ MachOObjectFile::MachOObjectFile(MemoryBuffer *Object, bool IsLittleEndian, bool Is64bits, error_code &EC, bool BufferOwned) : ObjectFile(getMachOType(IsLittleEndian, Is64bits), Object, BufferOwned), - SymtabLoadCmd(NULL), DysymtabLoadCmd(NULL), DataInCodeLoadCmd(NULL) { + SymtabLoadCmd(nullptr), DysymtabLoadCmd(nullptr), + DataInCodeLoadCmd(nullptr) { uint32_t LoadCommandCount = this->getHeader().ncmds; MachO::LoadCommandType SegmentLoadType = is64Bit() ? 
MachO::LC_SEGMENT_64 : MachO::LC_SEGMENT; @@ -471,10 +472,18 @@ error_code MachOObjectFile::getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const { if (is64Bit()) { MachO::nlist_64 Entry = getSymbol64TableEntry(Symb); - Res = Entry.n_value; + if ((Entry.n_type & MachO::N_TYPE) == MachO::N_UNDF && + Entry.n_value == 0) + Res = UnknownAddressOrSize; + else + Res = Entry.n_value; } else { MachO::nlist Entry = getSymbolTableEntry(Symb); - Res = Entry.n_value; + if ((Entry.n_type & MachO::N_TYPE) == MachO::N_UNDF && + Entry.n_value == 0) + Res = UnknownAddressOrSize; + else + Res = Entry.n_value; } return object_error::success; } @@ -500,6 +509,10 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI, nlist_base Entry = getSymbolTableEntryBase(this, DRI); uint64_t Value; getSymbolAddress(DRI, Value); + if (Value == UnknownAddressOrSize) { + Result = UnknownAddressOrSize; + return object_error::success; + } BeginOffset = Value; @@ -518,6 +531,8 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI, DataRefImpl DRI = Symbol.getRawDataRefImpl(); Entry = getSymbolTableEntryBase(this, DRI); getSymbolAddress(DRI, Value); + if (Value == UnknownAddressOrSize) + continue; if (Entry.n_sect == SectionIndex && Value > BeginOffset) if (!EndOffset || Value < EndOffset) EndOffset = Value; @@ -577,7 +592,7 @@ uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const { if ((MachOType & MachO::N_TYPE) == MachO::N_UNDF) { uint64_t Value; getSymbolAddress(DRI, Value); - if (Value) + if (Value && Value != UnknownAddressOrSize) Result |= SymbolRef::SF_Common; } } @@ -685,15 +700,21 @@ MachOObjectFile::isSectionText(DataRefImpl Sec, bool &Res) const { return object_error::success; } -error_code MachOObjectFile::isSectionData(DataRefImpl DRI, bool &Result) const { - // FIXME: Unimplemented. - Result = false; +error_code MachOObjectFile::isSectionData(DataRefImpl Sec, bool &Result) const { + uint32_t Flags = getSectionFlags(this, Sec); + unsigned SectionType = Flags & MachO::SECTION_TYPE; + Result = !(Flags & MachO::S_ATTR_PURE_INSTRUCTIONS) && + !(SectionType == MachO::S_ZEROFILL || + SectionType == MachO::S_GB_ZEROFILL); return object_error::success; } -error_code MachOObjectFile::isSectionBSS(DataRefImpl DRI, bool &Result) const { - // FIXME: Unimplemented. 
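// A note on the pair isSectionData above / isSectionBSS just below: Mach-O
// keeps a section type and an attribute mask in one flags word, so the two
// predicates share the same inputs and differ only in the zerofill test.
// A condensed sketch of the rule, using Flags and SectionType exactly as
// computed in this patch:
//
//   bool NotCode  = !(Flags & MachO::S_ATTR_PURE_INSTRUCTIONS);
//   bool ZeroFill = SectionType == MachO::S_ZEROFILL ||
//                   SectionType == MachO::S_GB_ZEROFILL;
//   // isSectionData: NotCode && !ZeroFill
//   // isSectionBSS:  NotCode &&  ZeroFill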
- Result = false; +error_code MachOObjectFile::isSectionBSS(DataRefImpl Sec, bool &Result) const { + uint32_t Flags = getSectionFlags(this, Sec); + unsigned SectionType = Flags & MachO::SECTION_TYPE; + Result = !(Flags & MachO::S_ATTR_PURE_INSTRUCTIONS) && + (SectionType == MachO::S_ZEROFILL || + SectionType == MachO::S_GB_ZEROFILL); return object_error::success; } @@ -755,65 +776,50 @@ MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb, } relocation_iterator MachOObjectFile::section_rel_begin(DataRefImpl Sec) const { - uint32_t Offset; - if (is64Bit()) { - MachO::section_64 Sect = getSection64(Sec); - Offset = Sect.reloff; - } else { - MachO::section Sect = getSection(Sec); - Offset = Sect.reloff; - } - DataRefImpl Ret; - Ret.p = reinterpret_cast(getPtr(this, Offset)); + Ret.d.a = Sec.d.a; + Ret.d.b = 0; return relocation_iterator(RelocationRef(Ret, this)); } relocation_iterator MachOObjectFile::section_rel_end(DataRefImpl Sec) const { - uint32_t Offset; uint32_t Num; if (is64Bit()) { MachO::section_64 Sect = getSection64(Sec); - Offset = Sect.reloff; Num = Sect.nreloc; } else { MachO::section Sect = getSection(Sec); - Offset = Sect.reloff; Num = Sect.nreloc; } - const MachO::any_relocation_info *P = - reinterpret_cast(getPtr(this, Offset)); - DataRefImpl Ret; - Ret.p = reinterpret_cast(P + Num); + Ret.d.a = Sec.d.a; + Ret.d.b = Num; return relocation_iterator(RelocationRef(Ret, this)); } -bool MachOObjectFile::section_rel_empty(DataRefImpl Sec) const { - if (is64Bit()) { - MachO::section_64 Sect = getSection64(Sec); - return Sect.nreloc == 0; - } else { - MachO::section Sect = getSection(Sec); - return Sect.nreloc == 0; - } -} - void MachOObjectFile::moveRelocationNext(DataRefImpl &Rel) const { - const MachO::any_relocation_info *P = - reinterpret_cast(Rel.p); - Rel.p = reinterpret_cast(P + 1); + ++Rel.d.b; } error_code MachOObjectFile::getRelocationAddress(DataRefImpl Rel, uint64_t &Res) const { - report_fatal_error("getRelocationAddress not implemented in MachOObjectFile"); + uint64_t Offset; + getRelocationOffset(Rel, Offset); + + DataRefImpl Sec; + Sec.d.a = Rel.d.a; + uint64_t SecAddress; + getSectionAddress(Sec, SecAddress); + Res = SecAddress + Offset; + return object_error::success; } error_code MachOObjectFile::getRelocationOffset(DataRefImpl Rel, uint64_t &Res) const { + assert(getHeader().filetype == MachO::MH_OBJECT && + "Only implemented for MH_OBJECT"); MachO::any_relocation_info RE = getRelocation(Rel); Res = getAnyRelocationAddress(RE); return object_error::success; @@ -986,7 +992,7 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel, } case MachO::X86_64_RELOC_SUBTRACTOR: { DataRefImpl RelNext = Rel; - RelNext.d.a++; + moveRelocationNext(RelNext); MachO::any_relocation_info RENext = getRelocation(RelNext); // X86_64_RELOC_SUBTRACTOR must be followed by a relocation of type @@ -1034,7 +1040,7 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel, return object_error::success; case MachO::GENERIC_RELOC_SECTDIFF: { DataRefImpl RelNext = Rel; - RelNext.d.a++; + moveRelocationNext(RelNext); MachO::any_relocation_info RENext = getRelocation(RelNext); // X86 sect diff's must be followed by a relocation of type @@ -1056,7 +1062,7 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel, switch (Type) { case MachO::GENERIC_RELOC_LOCAL_SECTDIFF: { DataRefImpl RelNext = Rel; - RelNext.d.a++; + moveRelocationNext(RelNext); MachO::any_relocation_info RENext = getRelocation(RelNext); // X86 sect diff's must be followed by a relocation of type 
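// The relocation accessors above stop storing raw pointers in DataRefImpl
// and instead keep a (section index, relocation index) pair in Rel.d.a and
// Rel.d.b; moveRelocationNext becomes a plain increment and the pointer is
// recomputed on demand. A minimal sketch of a client-side walk under that
// scheme; error handling is trimmed and the Obj variable is hypothetical:
//
//   for (relocation_iterator RI = Obj->section_rel_begin(Sec),
//                            RE = Obj->section_rel_end(Sec);
//        RI != RE; ++RI) {       // ++ just bumps Rel.d.b
//     uint64_t Offset;
//     RI->getOffset(Offset);     // resolved lazily from the section reloff
//   }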
@@ -1095,7 +1101,7 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel, printRelocationTargetName(this, RE, fmt); DataRefImpl RelNext = Rel; - RelNext.d.a++; + moveRelocationNext(RelNext); MachO::any_relocation_info RENext = getRelocation(RelNext); // ARM half relocs must be followed by a relocation of type @@ -1172,13 +1178,7 @@ error_code MachOObjectFile::getLibraryPath(DataRefImpl LibData, } basic_symbol_iterator MachOObjectFile::symbol_begin_impl() const { - DataRefImpl DRI; - if (!SymtabLoadCmd) - return basic_symbol_iterator(SymbolRef(DRI, this)); - - MachO::symtab_command Symtab = getSymtabLoadCommand(); - DRI.p = reinterpret_cast(getPtr(this, Symtab.symoff)); - return basic_symbol_iterator(SymbolRef(DRI, this)); + return getSymbolByIndex(0); } basic_symbol_iterator MachOObjectFile::symbol_end_impl() const { @@ -1196,6 +1196,20 @@ basic_symbol_iterator MachOObjectFile::symbol_end_impl() const { return basic_symbol_iterator(SymbolRef(DRI, this)); } +basic_symbol_iterator MachOObjectFile::getSymbolByIndex(unsigned Index) const { + DataRefImpl DRI; + if (!SymtabLoadCmd) + return basic_symbol_iterator(SymbolRef(DRI, this)); + + MachO::symtab_command Symtab = getSymtabLoadCommand(); + assert(Index < Symtab.nsyms && "Requested symbol index is out of range."); + unsigned SymbolTableEntrySize = + is64Bit() ? sizeof(MachO::nlist_64) : sizeof(MachO::nlist); + DRI.p = reinterpret_cast(getPtr(this, Symtab.symoff)); + DRI.p += Index * SymbolTableEntrySize; + return basic_symbol_iterator(SymbolRef(DRI, this)); +} + section_iterator MachOObjectFile::section_begin() const { DataRefImpl DRI; return section_iterator(SectionRef(DRI, this)); @@ -1486,8 +1500,21 @@ MachOObjectFile::getVersionMinLoadCommand(const LoadCommandInfo &L) const { MachO::any_relocation_info MachOObjectFile::getRelocation(DataRefImpl Rel) const { - const char *P = reinterpret_cast(Rel.p); - return getStruct(this, P); + DataRefImpl Sec; + Sec.d.a = Rel.d.a; + uint32_t Offset; + if (is64Bit()) { + MachO::section_64 Sect = getSection64(Sec); + Offset = Sect.reloff; + } else { + MachO::section Sect = getSection(Sec); + Offset = Sect.reloff; + } + + auto P = reinterpret_cast( + getPtr(this, Offset)) + Rel.d.b; + return getStruct( + this, reinterpret_cast(P)); } MachO::data_in_code_entry diff --git a/lib/Object/MachOUniversal.cpp b/lib/Object/MachOUniversal.cpp index 70baa9f..5085efd 100644 --- a/lib/Object/MachOUniversal.cpp +++ b/lib/Object/MachOUniversal.cpp @@ -14,6 +14,7 @@ #include "llvm/Object/MachOUniversal.h" #include "llvm/Object/MachO.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Object/Archive.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Host.h" #include "llvm/Support/MemoryBuffer.h" @@ -57,7 +58,7 @@ static T getUniversalBinaryStruct(const char *Ptr) { MachOUniversalBinary::ObjectForArch::ObjectForArch( const MachOUniversalBinary *Parent, uint32_t Index) : Parent(Parent), Index(Index) { - if (Parent == 0 || Index > Parent->getNumberOfObjects()) { + if (!Parent || Index > Parent->getNumberOfObjects()) { clear(); } else { // Parse object header. 
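// getSymbolByIndex and the index-based getRelocation above both reduce to
// the same stride arithmetic over a load command. A sketch of the two
// computations, assuming a 64-bit object so the nlist_64 size applies:
//
//   symbol entry = base + symtab.symoff + Index   * sizeof(MachO::nlist_64)
//   reloc entry  = base + sect.reloff   + Rel.d.b * sizeof(MachO::any_relocation_info)
//
// which is what lets symbol_begin_impl() be rewritten as getSymbolByIndex(0).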
@@ -90,6 +91,25 @@ error_code MachOUniversalBinary::ObjectForArch::getAsObjectFile(
   return object_error::parse_failed;
 }
 
+error_code MachOUniversalBinary::ObjectForArch::getAsArchive(
+    std::unique_ptr<Archive> &Result) const {
+  if (Parent) {
+    StringRef ParentData = Parent->getData();
+    StringRef ObjectData = ParentData.substr(Header.offset, Header.size);
+    std::string ObjectName =
+        Parent->getFileName().str() + ":" +
+        Triple::getArchTypeName(MachOObjectFile::getArch(Header.cputype));
+    MemoryBuffer *ObjBuffer = MemoryBuffer::getMemBuffer(
+        ObjectData, ObjectName, false);
+    ErrorOr<Archive *> Obj = Archive::create(ObjBuffer);
+    if (error_code EC = Obj.getError())
+      return EC;
+    Result.reset(Obj.get());
+    return object_error::success;
+  }
+  return object_error::parse_failed;
+}
+
 void MachOUniversalBinary::anchor() { }
 
 ErrorOr<MachOUniversalBinary *>
diff --git a/lib/Object/Object.cpp b/lib/Object/Object.cpp
index 243bd44..b0068a8 100644
--- a/lib/Object/Object.cpp
+++ b/lib/Object/Object.cpp
@@ -60,7 +60,7 @@ wrap(const relocation_iterator *SI) {
 // ObjectFile creation
 LLVMObjectFileRef LLVMCreateObjectFile(LLVMMemoryBufferRef MemBuf) {
   ErrorOr<ObjectFile *> ObjOrErr(ObjectFile::createObjectFile(unwrap(MemBuf)));
-  ObjectFile *Obj = ObjOrErr ? ObjOrErr.get() : 0;
+  ObjectFile *Obj = ObjOrErr ? ObjOrErr.get() : nullptr;
   return wrap(Obj);
 }
 
@@ -184,13 +184,6 @@ uint64_t LLVMGetSymbolAddress(LLVMSymbolIteratorRef SI) {
   return ret;
 }
 
-uint64_t LLVMGetSymbolFileOffset(LLVMSymbolIteratorRef SI) {
-  uint64_t ret;
-  if (error_code ec = (*unwrap(SI))->getFileOffset(ret))
-    report_fatal_error(ec.message());
-  return ret;
-}
-
 uint64_t LLVMGetSymbolSize(LLVMSymbolIteratorRef SI) {
   uint64_t ret;
   if (error_code ec = (*unwrap(SI))->getSize(ret))
diff --git a/lib/Object/StringTableBuilder.cpp b/lib/Object/StringTableBuilder.cpp
new file mode 100644
index 0000000..9152834
--- /dev/null
+++ b/lib/Object/StringTableBuilder.cpp
@@ -0,0 +1,51 @@
+//===-- StringTableBuilder.cpp - String table building utility ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Object/StringTableBuilder.h"
+
+using namespace llvm;
+
+static bool compareBySuffix(StringRef a, StringRef b) {
+  size_t sizeA = a.size();
+  size_t sizeB = b.size();
+  size_t len = std::min(sizeA, sizeB);
+  for (size_t i = 0; i < len; ++i) {
+    char ca = a[sizeA - i - 1];
+    char cb = b[sizeB - i - 1];
+    if (ca != cb)
+      return ca > cb;
+  }
+  return sizeA > sizeB;
+}
+
+void StringTableBuilder::finalize() {
+  SmallVector<StringRef, 8> Strings;
+  for (auto i = StringIndexMap.begin(), e = StringIndexMap.end(); i != e; ++i)
+    Strings.push_back(i->getKey());
+
+  std::sort(Strings.begin(), Strings.end(), compareBySuffix);
+
+  // FIXME: Starting with a null byte is ELF specific. Generalize this so we
+  // can use the class with other object formats.
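// A worked example of the tail-merging loop that follows, using the suffix
// order produced by compareBySuffix: for the inputs {"bar", "foobar"} the
// sort yields ["foobar", "bar"], "foobar" is emitted at offset 1 giving the
// bytes "\0foobar\0", and since "foobar" ends with "bar" the shorter string
// is never emitted; it is assigned offset
//   StringTable.size() - 1 - 3  ==  8 - 1 - 3  ==  4,
// which points at the shared "bar\0" tail inside "foobar\0".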
+ StringTable += '\x00'; + + StringRef Previous; + for (StringRef s : Strings) { + if (Previous.endswith(s)) { + StringIndexMap[s] = StringTable.size() - 1 - s.size(); + continue; + } + + StringIndexMap[s] = StringTable.size(); + StringTable += s; + StringTable += '\x00'; + Previous = s; + } +} diff --git a/lib/Option/ArgList.cpp b/lib/Option/ArgList.cpp index fecd237..a5ab8d7 100644 --- a/lib/Option/ArgList.cpp +++ b/lib/Option/ArgList.cpp @@ -9,6 +9,7 @@ #include "llvm/Option/ArgList.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/Option/Arg.h" #include "llvm/Option/Option.h" @@ -32,11 +33,6 @@ void arg_iterator::SkipToNextArg() { } } -// - -ArgList::ArgList() { -} - ArgList::~ArgList() { } @@ -45,14 +41,9 @@ void ArgList::append(Arg *A) { } void ArgList::eraseArg(OptSpecifier Id) { - for (iterator it = begin(), ie = end(); it != ie; ) { - if ((*it)->getOption().matches(Id)) { - it = Args.erase(it); - ie = end(); - } else { - ++it; - } - } + Args.erase(std::remove_if(begin(), end(), + [=](Arg *A) { return A->getOption().matches(Id); }), + end()); } Arg *ArgList::getLastArgNoClaim(OptSpecifier Id) const { @@ -60,11 +51,11 @@ Arg *ArgList::getLastArgNoClaim(OptSpecifier Id) const { for (const_reverse_iterator it = rbegin(), ie = rend(); it != ie; ++it) if ((*it)->getOption().matches(Id)) return *it; - return 0; + return nullptr; } Arg *ArgList::getLastArg(OptSpecifier Id) const { - Arg *Res = 0; + Arg *Res = nullptr; for (const_iterator it = begin(), ie = end(); it != ie; ++it) { if ((*it)->getOption().matches(Id)) { Res = *it; @@ -76,7 +67,7 @@ Arg *ArgList::getLastArg(OptSpecifier Id) const { } Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1) const { - Arg *Res = 0; + Arg *Res = nullptr; for (const_iterator it = begin(), ie = end(); it != ie; ++it) { if ((*it)->getOption().matches(Id0) || (*it)->getOption().matches(Id1)) { @@ -91,7 +82,7 @@ Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1) const { Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1, OptSpecifier Id2) const { - Arg *Res = 0; + Arg *Res = nullptr; for (const_iterator it = begin(), ie = end(); it != ie; ++it) { if ((*it)->getOption().matches(Id0) || (*it)->getOption().matches(Id1) || @@ -106,7 +97,7 @@ Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1, Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1, OptSpecifier Id2, OptSpecifier Id3) const { - Arg *Res = 0; + Arg *Res = nullptr; for (const_iterator it = begin(), ie = end(); it != ie; ++it) { if ((*it)->getOption().matches(Id0) || (*it)->getOption().matches(Id1) || @@ -123,7 +114,7 @@ Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1, Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1, OptSpecifier Id2, OptSpecifier Id3, OptSpecifier Id4) const { - Arg *Res = 0; + Arg *Res = nullptr; for (const_iterator it = begin(), ie = end(); it != ie; ++it) { if ((*it)->getOption().matches(Id0) || (*it)->getOption().matches(Id1) || @@ -141,7 +132,7 @@ Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1, Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1, OptSpecifier Id2, OptSpecifier Id3, OptSpecifier Id4, OptSpecifier Id5) const { - Arg *Res = 0; + Arg *Res = nullptr; for (const_iterator it = begin(), ie = end(); it != ie; ++it) { if ((*it)->getOption().matches(Id0) || (*it)->getOption().matches(Id1) || @@ -161,7 +152,7 @@ Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1, OptSpecifier Id2, OptSpecifier Id3, OptSpecifier 
Id4, OptSpecifier Id5, OptSpecifier Id6) const { - Arg *Res = 0; + Arg *Res = nullptr; for (const_iterator it = begin(), ie = end(); it != ie; ++it) { if ((*it)->getOption().matches(Id0) || (*it)->getOption().matches(Id1) || @@ -182,7 +173,7 @@ Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1, OptSpecifier Id2, OptSpecifier Id3, OptSpecifier Id4, OptSpecifier Id5, OptSpecifier Id6, OptSpecifier Id7) const { - Arg *Res = 0; + Arg *Res = nullptr; for (const_iterator it = begin(), ie = end(); it != ie; ++it) { if ((*it)->getOption().matches(Id0) || (*it)->getOption().matches(Id1) || @@ -348,52 +339,50 @@ DerivedArgList::DerivedArgList(const InputArgList &_BaseArgs) : BaseArgs(_BaseArgs) { } -DerivedArgList::~DerivedArgList() { - // We only own the arguments we explicitly synthesized. - for (iterator it = SynthesizedArgs.begin(), ie = SynthesizedArgs.end(); - it != ie; ++it) - delete *it; -} +DerivedArgList::~DerivedArgList() {} const char *DerivedArgList::MakeArgString(StringRef Str) const { return BaseArgs.MakeArgString(Str); } +void DerivedArgList::AddSynthesizedArg(Arg *A) { + SynthesizedArgs.push_back(std::unique_ptr(A)); +} + Arg *DerivedArgList::MakeFlagArg(const Arg *BaseArg, const Option Opt) const { - Arg *A = new Arg(Opt, ArgList::MakeArgString(Twine(Opt.getPrefix()) + - Twine(Opt.getName())), - BaseArgs.MakeIndex(Opt.getName()), BaseArg); - SynthesizedArgs.push_back(A); - return A; + SynthesizedArgs.push_back(make_unique( + Opt, + ArgList::MakeArgString(Twine(Opt.getPrefix()) + Twine(Opt.getName())), + BaseArgs.MakeIndex(Opt.getName()), BaseArg)); + return SynthesizedArgs.back().get(); } Arg *DerivedArgList::MakePositionalArg(const Arg *BaseArg, const Option Opt, StringRef Value) const { unsigned Index = BaseArgs.MakeIndex(Value); - Arg *A = new Arg(Opt, ArgList::MakeArgString(Twine(Opt.getPrefix()) + - Twine(Opt.getName())), - Index, BaseArgs.getArgString(Index), BaseArg); - SynthesizedArgs.push_back(A); - return A; + SynthesizedArgs.push_back(make_unique( + Opt, + ArgList::MakeArgString(Twine(Opt.getPrefix()) + Twine(Opt.getName())), + Index, BaseArgs.getArgString(Index), BaseArg)); + return SynthesizedArgs.back().get(); } Arg *DerivedArgList::MakeSeparateArg(const Arg *BaseArg, const Option Opt, StringRef Value) const { unsigned Index = BaseArgs.MakeIndex(Opt.getName(), Value); - Arg *A = new Arg(Opt, ArgList::MakeArgString(Twine(Opt.getPrefix()) + - Twine(Opt.getName())), - Index, BaseArgs.getArgString(Index + 1), BaseArg); - SynthesizedArgs.push_back(A); - return A; + SynthesizedArgs.push_back(make_unique( + Opt, + ArgList::MakeArgString(Twine(Opt.getPrefix()) + Twine(Opt.getName())), + Index, BaseArgs.getArgString(Index + 1), BaseArg)); + return SynthesizedArgs.back().get(); } Arg *DerivedArgList::MakeJoinedArg(const Arg *BaseArg, const Option Opt, StringRef Value) const { unsigned Index = BaseArgs.MakeIndex(Opt.getName().str() + Value.str()); - Arg *A = new Arg(Opt, ArgList::MakeArgString(Twine(Opt.getPrefix()) + - Twine(Opt.getName())), Index, - BaseArgs.getArgString(Index) + Opt.getName().size(), - BaseArg); - SynthesizedArgs.push_back(A); - return A; + SynthesizedArgs.push_back(make_unique( + Opt, + ArgList::MakeArgString(Twine(Opt.getPrefix()) + Twine(Opt.getName())), + Index, BaseArgs.getArgString(Index) + Opt.getName().size(), BaseArg)); + return SynthesizedArgs.back().get(); } diff --git a/lib/Option/OptTable.cpp b/lib/Option/OptTable.cpp index 6fa459a..6842f4d 100644 --- a/lib/Option/OptTable.cpp +++ b/lib/Option/OptTable.cpp @@ -62,7 +62,7 @@ static 
inline bool operator<(const OptTable::Info &A, const OptTable::Info &B) { for (const char * const *APre = A.Prefixes, * const *BPre = B.Prefixes; - *APre != 0 && *BPre != 0; ++APre, ++BPre) { + *APre != nullptr && *BPre != nullptr; ++APre, ++BPre){ if (int N = StrCmpOptionName(*APre, *BPre)) return N < 0; } @@ -136,7 +136,7 @@ OptTable::OptTable(const Info *_OptionInfos, unsigned _NumOptionInfos, for (unsigned i = FirstSearchableIndex + 1, e = getNumOptions() + 1; i != e; ++i) { if (const char *const *P = getInfo(i).Prefixes) { - for (; *P != 0; ++P) { + for (; *P != nullptr; ++P) { PrefixesUnion.insert(*P); } } @@ -160,7 +160,7 @@ OptTable::~OptTable() { const Option OptTable::getOption(OptSpecifier Opt) const { unsigned id = Opt.getID(); if (id == 0) - return Option(0, 0); + return Option(nullptr, nullptr); assert((unsigned) (id - 1) < getNumOptions() && "Invalid ID."); return Option(&getInfo(id), this); } @@ -178,7 +178,7 @@ static bool isInput(const llvm::StringSet<> &Prefixes, StringRef Arg) { /// \returns Matched size. 0 means no match. static unsigned matchOption(const OptTable::Info *I, StringRef Str, bool IgnoreCase) { - for (const char * const *Pre = I->Prefixes; *Pre != 0; ++Pre) { + for (const char * const *Pre = I->Prefixes; *Pre != nullptr; ++Pre) { StringRef Prefix(*Pre); if (Str.startswith(Prefix)) { StringRef Rest = Str.substr(Prefix.size()); @@ -240,7 +240,7 @@ Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index, // Otherwise, see if this argument was missing values. if (Prev != Index) - return 0; + return nullptr; } // If we failed to find an option and this arg started with /, then it's diff --git a/lib/Option/Option.cpp b/lib/Option/Option.cpp index 7b5ff2b..10662a3 100644 --- a/lib/Option/Option.cpp +++ b/lib/Option/Option.cpp @@ -58,8 +58,8 @@ void Option::dump() const { if (Info->Prefixes) { llvm::errs() << " Prefixes:["; - for (const char * const *Pre = Info->Prefixes; *Pre != 0; ++Pre) { - llvm::errs() << '"' << *Pre << (*(Pre + 1) == 0 ? "\"" : "\", "); + for (const char * const *Pre = Info->Prefixes; *Pre != nullptr; ++Pre) { + llvm::errs() << '"' << *Pre << (*(Pre + 1) == nullptr ? "\"" : "\", "); } llvm::errs() << ']'; } @@ -116,7 +116,7 @@ Arg *Option::accept(const ArgList &Args, switch (getKind()) { case FlagClass: { if (ArgSize != strlen(Args.getArgString(Index))) - return 0; + return nullptr; Arg *A = new Arg(UnaliasedOption, Spelling, Index++); if (getAliasArgs()) { @@ -166,11 +166,11 @@ Arg *Option::accept(const ArgList &Args, // Matches iff this is an exact match. // FIXME: Avoid strlen. if (ArgSize != strlen(Args.getArgString(Index))) - return 0; + return nullptr; Index += 2; if (Index > Args.getNumInputArgStrings()) - return 0; + return nullptr; return new Arg(UnaliasedOption, Spelling, Index - 2, Args.getArgString(Index - 1)); @@ -178,11 +178,11 @@ Arg *Option::accept(const ArgList &Args, // Matches iff this is an exact match. // FIXME: Avoid strlen. if (ArgSize != strlen(Args.getArgString(Index))) - return 0; + return nullptr; Index += 1 + getNumArgs(); if (Index > Args.getNumInputArgStrings()) - return 0; + return nullptr; Arg *A = new Arg(UnaliasedOption, Spelling, Index - 1 - getNumArgs(), Args.getArgString(Index - getNumArgs())); @@ -201,7 +201,7 @@ Arg *Option::accept(const ArgList &Args, // Otherwise it must be separate. 
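// At this point in Option::accept() the option spelling sits in
// argv[Index] and the separate value, if any, must be argv[Index + 1];
// the Index += 2 below consumes both, and the bounds check catches an
// option given at the very end of the command line with no value after
// it. A sketch of the bookkeeping for a hypothetical tail ["-o", "out.o"]:
// after accept() returns, Index points past "out.o" and the new Arg
// records the value via Args.getArgString(Index - 1).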
Index += 2; if (Index > Args.getNumInputArgStrings()) - return 0; + return nullptr; return new Arg(UnaliasedOption, Spelling, Index - 2, Args.getArgString(Index - 1)); @@ -210,7 +210,7 @@ Arg *Option::accept(const ArgList &Args, // Always matches. Index += 2; if (Index > Args.getNumInputArgStrings()) - return 0; + return nullptr; return new Arg(UnaliasedOption, Spelling, Index - 2, Args.getArgString(Index - 2) + ArgSize, @@ -219,7 +219,7 @@ Arg *Option::accept(const ArgList &Args, // Matches iff this is an exact match. // FIXME: Avoid strlen. if (ArgSize != strlen(Args.getArgString(Index))) - return 0; + return nullptr; Arg *A = new Arg(UnaliasedOption, Spelling, Index++); while (Index < Args.getNumInputArgStrings()) A->getValues().push_back(Args.getArgString(Index++)); diff --git a/lib/ProfileData/Android.mk b/lib/ProfileData/Android.mk new file mode 100644 index 0000000..5ae5ba8 --- /dev/null +++ b/lib/ProfileData/Android.mk @@ -0,0 +1,33 @@ +LOCAL_PATH:= $(call my-dir) + +profiledata_SRC_FILES := \ + InstrProf.cpp \ + InstrProfReader.cpp \ + InstrProfWriter.cpp + + +# For the host +# ===================================================== +include $(CLEAR_VARS) + +LOCAL_MODULE:= libLLVMProfileData +LOCAL_MODULE_TAGS := optional +LOCAL_SRC_FILES := $(profiledata_SRC_FILES) + +include $(LLVM_HOST_BUILD_MK) +include $(LLVM_GEN_INTRINSICS_MK) +include $(BUILD_HOST_STATIC_LIBRARY) + +# For the device +# ===================================================== +ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS)) +include $(CLEAR_VARS) + +LOCAL_MODULE:= libLLVMProfileData +LOCAL_MODULE_TAGS := optional +LOCAL_SRC_FILES := $(profiledata_SRC_FILES) + +include $(LLVM_DEVICE_BUILD_MK) +include $(LLVM_GEN_INTRINSICS_MK) +include $(BUILD_STATIC_LIBRARY) +endif diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index 850f613..de2b13d 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -33,6 +33,8 @@ class InstrProfErrorCategoryType : public error_category { return "Invalid header"; case instrprof_error::unsupported_version: return "Unsupported format version"; + case instrprof_error::unsupported_hash_type: + return "Unsupported hash function"; case instrprof_error::too_large: return "Too much profile data"; case instrprof_error::truncated: @@ -50,7 +52,7 @@ class InstrProfErrorCategoryType : public error_category { } llvm_unreachable("A value of instrprof_error has no message."); } - error_condition default_error_condition(int EV) const { + error_condition default_error_condition(int EV) const override { if (EV == instrprof_error::success) return errc::success; return errc::invalid_argument; diff --git a/lib/ProfileData/InstrProfIndexed.h b/lib/ProfileData/InstrProfIndexed.h new file mode 100644 index 0000000..7761704 --- /dev/null +++ b/lib/ProfileData/InstrProfIndexed.h @@ -0,0 +1,55 @@ +//=-- InstrProfIndexed.h - Indexed profiling format support -------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Shared header for the instrumented profile data reader and writer. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PROFILEDATA_INSTRPROF_INDEXED_H_
+#define LLVM_PROFILEDATA_INSTRPROF_INDEXED_H_
+
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MD5.h"
+
+namespace llvm {
+
+namespace IndexedInstrProf {
+enum class HashT : uint32_t {
+  MD5,
+
+  Last = MD5
+};
+
+static inline uint64_t MD5Hash(StringRef Str) {
+  MD5 Hash;
+  Hash.update(Str);
+  llvm::MD5::MD5Result Result;
+  Hash.final(Result);
+  // Return the least significant 8 bytes. Our MD5 implementation returns the
+  // result in little endian, so we may need to swap bytes.
+  using namespace llvm::support;
+  return endian::read<uint64_t, little, unaligned>(Result);
+}
+
+static inline uint64_t ComputeHash(HashT Type, StringRef K) {
+  switch (Type) {
+  case HashT::MD5:
+    return IndexedInstrProf::MD5Hash(K);
+  }
+  llvm_unreachable("Unhandled hash type");
+}
+
+const uint64_t Magic = 0x8169666f72706cff; // "\xfflprofi\x81"
+const uint64_t Version = 1;
+const HashT HashType = HashT::MD5;
+}
+
+} // end namespace llvm
+
+#endif // LLVM_PROFILEDATA_INSTRPROF_INDEXED_H_
diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp
index b07f402..7014f5e 100644
--- a/lib/ProfileData/InstrProfReader.cpp
+++ b/lib/ProfileData/InstrProfReader.cpp
@@ -15,30 +15,62 @@
 #include "llvm/ProfileData/InstrProfReader.h"
 #include "llvm/ProfileData/InstrProf.h"
 
+#include "InstrProfIndexed.h"
+
 #include <cassert>
 
 using namespace llvm;
 
-error_code InstrProfReader::create(std::string Path,
-                                   std::unique_ptr<InstrProfReader> &Result) {
-  std::unique_ptr<MemoryBuffer> Buffer;
+static error_code setupMemoryBuffer(std::string Path,
+                                    std::unique_ptr<MemoryBuffer> &Buffer) {
   if (error_code EC = MemoryBuffer::getFileOrSTDIN(Path, Buffer))
     return EC;
 
   // Sanity check the file.
   if (Buffer->getBufferSize() > std::numeric_limits<unsigned>::max())
     return instrprof_error::too_large;
+  return instrprof_error::success;
+}
+
+static error_code initializeReader(InstrProfReader &Reader) {
+  return Reader.readHeader();
+}
+
+error_code InstrProfReader::create(std::string Path,
+                                   std::unique_ptr<InstrProfReader> &Result) {
+  // Set up the buffer to read.
+  std::unique_ptr<MemoryBuffer> Buffer;
+  if (error_code EC = setupMemoryBuffer(Path, Buffer))
+    return EC;
 
   // Create the reader.
-  if (RawInstrProfReader64::hasFormat(*Buffer))
+  if (IndexedInstrProfReader::hasFormat(*Buffer))
+    Result.reset(new IndexedInstrProfReader(std::move(Buffer)));
+  else if (RawInstrProfReader64::hasFormat(*Buffer))
     Result.reset(new RawInstrProfReader64(std::move(Buffer)));
   else if (RawInstrProfReader32::hasFormat(*Buffer))
     Result.reset(new RawInstrProfReader32(std::move(Buffer)));
   else
     Result.reset(new TextInstrProfReader(std::move(Buffer)));
 
-  // Read the header and return the result.
-  return Result->readHeader();
+  // Initialize the reader and return the result.
+  return initializeReader(*Result);
+}
+
+error_code IndexedInstrProfReader::create(
+    std::string Path, std::unique_ptr<IndexedInstrProfReader> &Result) {
+  // Set up the buffer to read.
+  std::unique_ptr<MemoryBuffer> Buffer;
+  if (error_code EC = setupMemoryBuffer(Path, Buffer))
+    return EC;
+
+  // Create the reader.
+  if (!IndexedInstrProfReader::hasFormat(*Buffer))
+    return instrprof_error::bad_magic;
+  Result.reset(new IndexedInstrProfReader(std::move(Buffer)));
+
+  // Initialize the reader and return the result.
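// On the indexed format introduced above: function names are keyed by the
// low 8 bytes of their MD5, and ComputeHash dispatches on the HashT enum so
// another hash kind can be added without touching callers. A minimal usage
// sketch, assuming InstrProfIndexed.h is in scope:
//
//   uint64_t Key = IndexedInstrProf::ComputeHash(IndexedInstrProf::HashType,
//                                                "main");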
+ return initializeReader(*Result); } void InstrProfIterator::Increment() { @@ -69,6 +101,8 @@ error_code TextInstrProfReader::readNextRecord(InstrProfRecord &Record) { return error(instrprof_error::truncated); if ((Line++)->getAsInteger(10, NumCounters)) return error(instrprof_error::malformed); + if (NumCounters == 0) + return error(instrprof_error::malformed); // Read each counter and fill our internal storage with the values. Counts.clear(); @@ -138,6 +172,29 @@ error_code RawInstrProfReader::readHeader() { return readHeader(*Header); } +template +error_code RawInstrProfReader::readNextHeader(const char *CurrentPos) { + const char *End = DataBuffer->getBufferEnd(); + // Skip zero padding between profiles. + while (CurrentPos != End && *CurrentPos == 0) + ++CurrentPos; + // If there's nothing left, we're done. + if (CurrentPos == End) + return instrprof_error::eof; + // If there isn't enough space for another header, this is probably just + // garbage at the end of the file. + if (CurrentPos + sizeof(RawHeader) > End) + return instrprof_error::malformed; + // The magic should have the same byte order as in the previous header. + uint64_t Magic = *reinterpret_cast(CurrentPos); + if (Magic != swap(getRawMagic())) + return instrprof_error::bad_magic; + + // There's another profile to read, so we need to process the header. + auto *Header = reinterpret_cast(CurrentPos); + return readHeader(*Header); +} + static uint64_t getRawVersion() { return 1; } @@ -156,16 +213,17 @@ error_code RawInstrProfReader::readHeader(const RawHeader &Header) { ptrdiff_t DataOffset = sizeof(RawHeader); ptrdiff_t CountersOffset = DataOffset + sizeof(ProfileData) * DataSize; ptrdiff_t NamesOffset = CountersOffset + sizeof(uint64_t) * CountersSize; - size_t FileSize = NamesOffset + sizeof(char) * NamesSize; + size_t ProfileSize = NamesOffset + sizeof(char) * NamesSize; - if (FileSize != DataBuffer->getBufferSize()) + auto *Start = reinterpret_cast(&Header); + if (Start + ProfileSize > DataBuffer->getBufferEnd()) return error(instrprof_error::bad_header); - const char *Start = DataBuffer->getBufferStart(); Data = reinterpret_cast(Start + DataOffset); DataEnd = Data + DataSize; CountersStart = reinterpret_cast(Start + CountersOffset); NamesStart = Start + NamesOffset; + ProfileEnd = Start + ProfileSize; return success(); } @@ -174,12 +232,15 @@ template error_code RawInstrProfReader::readNextRecord(InstrProfRecord &Record) { if (Data == DataEnd) - return error(instrprof_error::eof); + if (error_code EC = readNextHeader(ProfileEnd)) + return EC; // Get the raw data. StringRef RawName(getName(Data->NamePtr), swap(Data->NameSize)); - auto RawCounts = makeArrayRef(getCounter(Data->CounterPtr), - swap(Data->NumCounters)); + uint32_t NumCounters = swap(Data->NumCounters); + if (NumCounters == 0) + return error(instrprof_error::malformed); + auto RawCounts = makeArrayRef(getCounter(Data->CounterPtr), NumCounters); // Check bounds. 
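// Note the chaining behavior added above: a raw profile file may be several
// dumps concatenated back to back, so when Data reaches DataEnd the reader
// calls readNextHeader(ProfileEnd), which skips any zero padding, returns
// instrprof_error::eof at end of buffer, and otherwise re-checks the magic
// before parsing the next RawHeader in place. A file laid out as
//   [header A | counters A | names A | padding | header B | ...]
// therefore yields the records of A followed by the records of B.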
auto *NamesStartAsCounter = reinterpret_cast(NamesStart); @@ -210,3 +271,83 @@ namespace llvm { template class RawInstrProfReader; template class RawInstrProfReader; } + +InstrProfLookupTrait::hash_value_type +InstrProfLookupTrait::ComputeHash(StringRef K) { + return IndexedInstrProf::ComputeHash(HashType, K); +} + +bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) { + if (DataBuffer.getBufferSize() < 8) + return false; + using namespace support; + uint64_t Magic = + endian::read(DataBuffer.getBufferStart()); + return Magic == IndexedInstrProf::Magic; +} + +error_code IndexedInstrProfReader::readHeader() { + const unsigned char *Start = + (const unsigned char *)DataBuffer->getBufferStart(); + const unsigned char *Cur = Start; + if ((const unsigned char *)DataBuffer->getBufferEnd() - Cur < 24) + return error(instrprof_error::truncated); + + using namespace support; + + // Check the magic number. + uint64_t Magic = endian::readNext(Cur); + if (Magic != IndexedInstrProf::Magic) + return error(instrprof_error::bad_magic); + + // Read the version. + uint64_t Version = endian::readNext(Cur); + if (Version != IndexedInstrProf::Version) + return error(instrprof_error::unsupported_version); + + // Read the maximal function count. + MaxFunctionCount = endian::readNext(Cur); + + // Read the hash type and start offset. + IndexedInstrProf::HashT HashType = static_cast( + endian::readNext(Cur)); + if (HashType > IndexedInstrProf::HashT::Last) + return error(instrprof_error::unsupported_hash_type); + uint64_t HashOffset = endian::readNext(Cur); + + // The rest of the file is an on disk hash table. + Index.reset(InstrProfReaderIndex::Create(Start + HashOffset, Cur, Start, + InstrProfLookupTrait(HashType))); + // Set up our iterator for readNextRecord. + RecordIterator = Index->data_begin(); + + return success(); +} + +error_code IndexedInstrProfReader::getFunctionCounts( + StringRef FuncName, uint64_t &FuncHash, std::vector &Counts) { + const auto &Iter = Index->find(FuncName); + if (Iter == Index->end()) + return error(instrprof_error::unknown_function); + + // Found it. Make sure it's valid before giving back a result. + const InstrProfRecord &Record = *Iter; + if (Record.Name.empty()) + return error(instrprof_error::malformed); + FuncHash = Record.Hash; + Counts = Record.Counts; + return success(); +} + +error_code IndexedInstrProfReader::readNextRecord(InstrProfRecord &Record) { + // Are we out of records? + if (RecordIterator == Index->data_end()) + return error(instrprof_error::eof); + + // Read the next one. 
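// The fixed prefix consumed by IndexedInstrProfReader::readHeader above is,
// in file order, five 8-byte little-endian fields read with
// endian::readNext: the magic 0x8169666f72706cff, the format version, the
// maximal function count, the hash kind (rejected if greater than
// HashT::Last), and the file offset of the on-disk hash table that holds
// all records.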
+ Record = *RecordIterator; + ++RecordIterator; + if (Record.Name.empty()) + return error(instrprof_error::malformed); + return success(); +} diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp index 3024f96..83c41d9 100644 --- a/lib/ProfileData/InstrProfWriter.cpp +++ b/lib/ProfileData/InstrProfWriter.cpp @@ -13,10 +13,59 @@ //===----------------------------------------------------------------------===// #include "llvm/ProfileData/InstrProfWriter.h" -#include "llvm/Support/Endian.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/OnDiskHashTable.h" + +#include "InstrProfIndexed.h" using namespace llvm; +namespace { +class InstrProfRecordTrait { +public: + typedef StringRef key_type; + typedef StringRef key_type_ref; + + typedef const InstrProfWriter::CounterData *const data_type; + typedef const InstrProfWriter::CounterData *const data_type_ref; + + typedef uint64_t hash_value_type; + typedef uint64_t offset_type; + + static hash_value_type ComputeHash(key_type_ref K) { + return IndexedInstrProf::ComputeHash(IndexedInstrProf::HashType, K); + } + + static std::pair + EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) { + using namespace llvm::support; + endian::Writer LE(Out); + + offset_type N = K.size(); + LE.write(N); + + offset_type M = (1 + V->Counts.size()) * sizeof(uint64_t); + LE.write(M); + + return std::make_pair(N, M); + } + + static void EmitKey(raw_ostream &Out, key_type_ref K, offset_type N){ + Out.write(K.data(), N); + } + + static void EmitData(raw_ostream &Out, key_type_ref, data_type_ref V, + offset_type) { + using namespace llvm::support; + endian::Writer LE(Out); + LE.write(V->Hash); + for (uint64_t I : V->Counts) + LE.write(I); + } +}; +} + error_code InstrProfWriter::addFunctionCounts(StringRef FunctionName, uint64_t FunctionHash, ArrayRef Counters) { @@ -26,7 +75,7 @@ error_code InstrProfWriter::addFunctionCounts(StringRef FunctionName, auto &Data = FunctionData[FunctionName]; Data.Hash = FunctionHash; Data.Counts = Counters; - return instrprof_error::success;; + return instrprof_error::success; } auto &Data = Where->getValue(); @@ -45,16 +94,33 @@ error_code InstrProfWriter::addFunctionCounts(StringRef FunctionName, return instrprof_error::success; } -void InstrProfWriter::write(raw_ostream &OS) { - // Write out the counts for each function. +void InstrProfWriter::write(raw_fd_ostream &OS) { + OnDiskChainedHashTableGenerator Generator; + uint64_t MaxFunctionCount = 0; + + // Populate the hash table generator. for (const auto &I : FunctionData) { - StringRef Name = I.getKey(); - uint64_t Hash = I.getValue().Hash; - const std::vector &Counts = I.getValue().Counts; - - OS << Name << "\n" << Hash << "\n" << Counts.size() << "\n"; - for (uint64_t Count : Counts) - OS << Count << "\n"; - OS << "\n"; + Generator.insert(I.getKey(), &I.getValue()); + if (I.getValue().Counts[0] > MaxFunctionCount) + MaxFunctionCount = I.getValue().Counts[0]; } + + using namespace llvm::support; + endian::Writer LE(OS); + + // Write the header. + LE.write(IndexedInstrProf::Magic); + LE.write(IndexedInstrProf::Version); + LE.write(MaxFunctionCount); + LE.write(static_cast(IndexedInstrProf::HashType)); + + // Save a space to write the hash table start location. + uint64_t HashTableStartLoc = OS.tell(); + LE.write(0); + // Write the hash table. + uint64_t HashTableStart = Generator.Emit(OS); + + // Go back and fill in the hash table start. 
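// The seek below completes a reserve-and-backpatch: the writer cannot know
// where Generator.Emit() will place the table until it runs, so an 8-byte
// placeholder was written at HashTableStartLoc and is now overwritten with
// the real offset. This is also why write() takes a seekable raw_fd_ostream
// instead of a plain raw_ostream.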
+ OS.seek(HashTableStartLoc); + LE.write(HashTableStart); } diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp index 85ce31b..f9fe095 100644 --- a/lib/Support/APFloat.cpp +++ b/lib/Support/APFloat.cpp @@ -1358,7 +1358,7 @@ APFloat::addOrSubtractSpecials(const APFloat &rhs, bool subtract) { switch (PackCategoriesIntoKey(category, rhs.category)) { default: - llvm_unreachable(0); + llvm_unreachable(nullptr); case PackCategoriesIntoKey(fcNaN, fcZero): case PackCategoriesIntoKey(fcNaN, fcNormal): @@ -1485,7 +1485,7 @@ APFloat::multiplySpecials(const APFloat &rhs) { switch (PackCategoriesIntoKey(category, rhs.category)) { default: - llvm_unreachable(0); + llvm_unreachable(nullptr); case PackCategoriesIntoKey(fcNaN, fcZero): case PackCategoriesIntoKey(fcNaN, fcNormal): @@ -1529,7 +1529,7 @@ APFloat::divideSpecials(const APFloat &rhs) { switch (PackCategoriesIntoKey(category, rhs.category)) { default: - llvm_unreachable(0); + llvm_unreachable(nullptr); case PackCategoriesIntoKey(fcZero, fcNaN): case PackCategoriesIntoKey(fcNormal, fcNaN): @@ -1570,7 +1570,7 @@ APFloat::modSpecials(const APFloat &rhs) { switch (PackCategoriesIntoKey(category, rhs.category)) { default: - llvm_unreachable(0); + llvm_unreachable(nullptr); case PackCategoriesIntoKey(fcNaN, fcZero): case PackCategoriesIntoKey(fcNaN, fcNormal): @@ -1679,7 +1679,7 @@ APFloat::multiply(const APFloat &rhs, roundingMode rounding_mode) fs = multiplySpecials(rhs); if (isFiniteNonZero()) { - lostFraction lost_fraction = multiplySignificand(rhs, 0); + lostFraction lost_fraction = multiplySignificand(rhs, nullptr); fs = normalize(rounding_mode, lost_fraction); if (lost_fraction != lfExactlyZero) fs = (opStatus) (fs | opInexact); @@ -1882,7 +1882,7 @@ APFloat::compare(const APFloat &rhs) const switch (PackCategoriesIntoKey(category, rhs.category)) { default: - llvm_unreachable(0); + llvm_unreachable(nullptr); case PackCategoriesIntoKey(fcNaN, fcZero): case PackCategoriesIntoKey(fcNaN, fcNormal): @@ -2439,7 +2439,7 @@ APFloat::roundSignificandWithExponent(const integerPart *decSigParts, if (exp >= 0) { /* multiplySignificand leaves the precision-th bit set to 1. */ - calcLostFraction = decSig.multiplySignificand(pow5, NULL); + calcLostFraction = decSig.multiplySignificand(pow5, nullptr); powHUerr = powStatus != opOK; } else { calcLostFraction = decSig.divideSignificand(pow5); @@ -3331,7 +3331,7 @@ APFloat::initFromAPInt(const fltSemantics* Sem, const APInt& api) if (Sem == &PPCDoubleDouble) return initFromPPCDoubleDoubleAPInt(api); - llvm_unreachable(0); + llvm_unreachable(nullptr); } APFloat @@ -3795,7 +3795,7 @@ APFloat::opStatus APFloat::next(bool nextDown) { if (isSignaling()) { result = opInvalidOp; // For consistency, propagate the sign of the sNaN to the qNaN. - makeNaN(false, isNegative(), 0); + makeNaN(false, isNegative(), nullptr); } break; case fcZero: diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 0c46725..fa929eb 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "apint" #include "llvm/ADT/APInt.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/Hashing.h" @@ -28,6 +27,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "apint" + /// A utility function for allocating memory, checking for allocation failures, /// and ensuring the contents are zeroed. 
inline static uint64_t* getClearedMemory(unsigned numWords) { @@ -1683,10 +1684,10 @@ void APInt::divide(const APInt LHS, unsigned lhsWords, // Allocate space for the temporary values we need either on the stack, if // it will fit, or on the heap if it won't. unsigned SPACE[128]; - unsigned *U = 0; - unsigned *V = 0; - unsigned *Q = 0; - unsigned *R = 0; + unsigned *U = nullptr; + unsigned *V = nullptr; + unsigned *Q = nullptr; + unsigned *R = nullptr; if ((Remainder?4:3)*n+2*m+1 <= 128) { U = &SPACE[0]; V = &SPACE[m+n+1]; @@ -1872,7 +1873,7 @@ APInt APInt::udiv(const APInt& RHS) const { // We have to compute it the hard way. Invoke the Knuth divide algorithm. APInt Quotient(1,0); // to hold result. - divide(*this, lhsWords, RHS, rhsWords, &Quotient, 0); + divide(*this, lhsWords, RHS, rhsWords, &Quotient, nullptr); return Quotient; } @@ -1920,7 +1921,7 @@ APInt APInt::urem(const APInt& RHS) const { // We have to compute it the hard way. Invoke the Knuth divide algorithm. APInt Remainder(1,0); - divide(*this, lhsWords, RHS, rhsWords, 0, &Remainder); + divide(*this, lhsWords, RHS, rhsWords, nullptr, &Remainder); return Remainder; } diff --git a/lib/Support/Allocator.cpp b/lib/Support/Allocator.cpp index 7e17748..7c306b2 100644 --- a/lib/Support/Allocator.cpp +++ b/lib/Support/Allocator.cpp @@ -21,29 +21,10 @@ namespace llvm { -SlabAllocator::~SlabAllocator() { } - -MallocSlabAllocator::~MallocSlabAllocator() { } - -MemSlab *MallocSlabAllocator::Allocate(size_t Size) { - MemSlab *Slab = (MemSlab*)Allocator.Allocate(Size, 0); - Slab->Size = Size; - Slab->NextPtr = 0; - return Slab; -} - -void MallocSlabAllocator::Deallocate(MemSlab *Slab) { - Allocator.Deallocate(Slab); -} - -void BumpPtrAllocatorBase::PrintStats() const { - unsigned NumSlabs = 0; - size_t TotalMemory = 0; - for (MemSlab *Slab = CurSlab; Slab != 0; Slab = Slab->NextPtr) { - TotalMemory += Slab->Size; - ++NumSlabs; - } +namespace detail { +void printBumpPtrAllocatorStats(unsigned NumSlabs, size_t BytesAllocated, + size_t TotalMemory) { errs() << "\nNumber of memory regions: " << NumSlabs << '\n' << "Bytes used: " << BytesAllocated << '\n' << "Bytes allocated: " << TotalMemory << '\n' @@ -51,13 +32,7 @@ void BumpPtrAllocatorBase::PrintStats() const { << " (includes alignment, etc)\n"; } -size_t BumpPtrAllocatorBase::getTotalMemory() const { - size_t TotalMemory = 0; - for (MemSlab *Slab = CurSlab; Slab != 0; Slab = Slab->NextPtr) { - TotalMemory += Slab->Size; - } - return TotalMemory; -} +} // End namespace detail. void PrintRecyclerStats(size_t Size, size_t Align, diff --git a/lib/Support/Atomic.cpp b/lib/Support/Atomic.cpp index 9559ad7..2ef32b0 100644 --- a/lib/Support/Atomic.cpp +++ b/lib/Support/Atomic.cpp @@ -17,6 +17,7 @@ using namespace llvm; #if defined(_MSC_VER) +#include #include #undef MemoryFence #endif diff --git a/lib/Support/BlockFrequency.cpp b/lib/Support/BlockFrequency.cpp index 00cf75b..6f7e341 100644 --- a/lib/Support/BlockFrequency.cpp +++ b/lib/Support/BlockFrequency.cpp @@ -18,94 +18,8 @@ using namespace llvm; -/// Multiply FREQ by N and store result in W array. -static void mult96bit(uint64_t freq, uint32_t N, uint32_t W[3]) { - uint64_t u0 = freq & UINT32_MAX; - uint64_t u1 = freq >> 32; - - // Represent 96-bit value as W[2]:W[1]:W[0]; - uint64_t t = u0 * N; - uint64_t k = t >> 32; - W[0] = t; - t = u1 * N + k; - W[1] = t; - W[2] = t >> 32; -} - -/// Divide 96-bit value stored in W[2]:W[1]:W[0] by D. Since our word size is a -/// 32 bit unsigned integer, we can use a short division algorithm. 
-static uint64_t divrem96bit(uint32_t W[3], uint32_t D, uint32_t *Rout) { - // We assume that W[2] is non-zero since if W[2] is not then the user should - // just use hardware division. - assert(W[2] && "This routine assumes that W[2] is non-zero since if W[2] is " - "zero, the caller should just use 64/32 hardware."); - uint32_t Q[3] = { 0, 0, 0 }; - - // The generalized short division algorithm sets i to m + n - 1, where n is - // the number of words in the divisior and m is the number of words by which - // the divident exceeds the divisor (i.e. m + n == the length of the dividend - // in words). Due to our assumption that W[2] is non-zero, we know that the - // dividend is of length 3 implying since n is 1 that m = 2. Thus we set i to - // m + n - 1 = 2 + 1 - 1 = 2. - uint32_t R = 0; - for (int i = 2; i >= 0; --i) { - uint64_t PartialD = uint64_t(R) << 32 | W[i]; - if (PartialD == 0) { - Q[i] = 0; - R = 0; - } else if (PartialD < D) { - Q[i] = 0; - R = uint32_t(PartialD); - } else if (PartialD == D) { - Q[i] = 1; - R = 0; - } else { - Q[i] = uint32_t(PartialD / D); - R = uint32_t(PartialD - (Q[i] * D)); - } - } - - // If Q[2] is non-zero, then we overflowed. - uint64_t Result; - if (Q[2]) { - Result = UINT64_MAX; - R = D; - } else { - // Form the final uint64_t result, avoiding endianness issues. - Result = uint64_t(Q[0]) | (uint64_t(Q[1]) << 32); - } - - if (Rout) - *Rout = R; - - return Result; -} - -uint32_t BlockFrequency::scale(uint32_t N, uint32_t D) { - assert(D != 0 && "Division by zero"); - - // Calculate Frequency * N. - uint64_t MulLo = (Frequency & UINT32_MAX) * N; - uint64_t MulHi = (Frequency >> 32) * N; - uint64_t MulRes = (MulHi << 32) + MulLo; - - // If the product fits in 64 bits, just use built-in division. - if (MulHi <= UINT32_MAX && MulRes >= MulLo) { - Frequency = MulRes / D; - return MulRes % D; - } - - // Product overflowed, use 96-bit operations. - // 96-bit value represented as W[2]:W[1]:W[0]. 
- uint32_t W[3]; - uint32_t R; - mult96bit(Frequency, N, W); - Frequency = divrem96bit(W, D, &R); - return R; -} - BlockFrequency &BlockFrequency::operator*=(const BranchProbability &Prob) { - scale(Prob.getNumerator(), Prob.getDenominator()); + Frequency = Prob.scale(Frequency); return *this; } @@ -117,7 +31,7 @@ BlockFrequency::operator*(const BranchProbability &Prob) const { } BlockFrequency &BlockFrequency::operator/=(const BranchProbability &Prob) { - scale(Prob.getDenominator(), Prob.getNumerator()); + Frequency = Prob.scaleByInverse(Frequency); return *this; } @@ -156,8 +70,3 @@ BlockFrequency &BlockFrequency::operator>>=(const unsigned count) { Frequency |= Frequency == 0; return *this; } - -uint32_t BlockFrequency::scale(const BranchProbability &Prob) { - return scale(Prob.getNumerator(), Prob.getDenominator()); -} - diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp index e8b83e5..65878d6 100644 --- a/lib/Support/BranchProbability.cpp +++ b/lib/Support/BranchProbability.cpp @@ -18,19 +18,56 @@ using namespace llvm; -void BranchProbability::print(raw_ostream &OS) const { - OS << N << " / " << D << " = " << format("%g%%", ((double)N / D) * 100.0); +raw_ostream &BranchProbability::print(raw_ostream &OS) const { + return OS << N << " / " << D << " = " + << format("%g%%", ((double)N / D) * 100.0); } -void BranchProbability::dump() const { - dbgs() << *this << '\n'; -} +void BranchProbability::dump() const { print(dbgs()) << '\n'; } + +static uint64_t scale(uint64_t Num, uint32_t N, uint32_t D) { + assert(D && "divide by 0"); + + // Fast path for multiplying by 1.0. + if (!Num || D == N) + return Num; + + // Split Num into upper and lower parts to multiply, then recombine. + uint64_t ProductHigh = (Num >> 32) * N; + uint64_t ProductLow = (Num & UINT32_MAX) * N; + + // Split into 32-bit digits. + uint32_t Upper32 = ProductHigh >> 32; + uint32_t Lower32 = ProductLow & UINT32_MAX; + uint32_t Mid32Partial = ProductHigh & UINT32_MAX; + uint32_t Mid32 = Mid32Partial + (ProductLow >> 32); + + // Carry. + Upper32 += Mid32 < Mid32Partial; -namespace llvm { + // Check for overflow. + if (Upper32 >= D) + return UINT64_MAX; + + uint64_t Rem = (uint64_t(Upper32) << 32) | Mid32; + uint64_t UpperQ = Rem / D; + + // Check for overflow. + if (UpperQ > UINT32_MAX) + return UINT64_MAX; + + Rem = ((Rem % D) << 32) | Lower32; + uint64_t LowerQ = Rem / D; + uint64_t Q = (UpperQ << 32) + LowerQ; + + // Check for overflow. + return Q < LowerQ ? UINT64_MAX : Q; +} -raw_ostream &operator<<(raw_ostream &OS, const BranchProbability &Prob) { - Prob.print(OS); - return OS; +uint64_t BranchProbability::scale(uint64_t Num) const { + return ::scale(Num, N, D); } +uint64_t BranchProbability::scaleByInverse(uint64_t Num) const { + return ::scale(Num, D, N); } diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index b3c2614..37bbf48 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -38,6 +38,8 @@ using namespace llvm; using namespace cl; +#define DEBUG_TYPE "commandline" + //===----------------------------------------------------------------------===// // Template instantiations and anchors. // @@ -81,7 +83,7 @@ void StringSaver::anchor() {} // Globals for name and overview of program. Program name is not a string to // avoid static ctor/dtor issues. static char ProgramName[80] = ""; -static const char *ProgramOverview = 0; +static const char *ProgramOverview = nullptr; // This collects additional help to be printed. 
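// Back in BranchProbability.cpp: the static scale() helper above computes
// Num * N / D through a 96-bit intermediate and clamps to UINT64_MAX on
// overflow. Some worked values, checked against the digit arithmetic in
// the patch:
//
//   scale(100, 3, 4)          == 75           // exact 64-bit case
//   scale(UINT64_MAX, 7, 7)   == UINT64_MAX   // N == D fast path
//   scale(UINT64_MAX, 2, 1)   == UINT64_MAX   // Upper32 >= D, saturates
//
// BlockFrequency::operator*= and /= now route through scale() and
// scaleByInverse() instead of the removed 96-bit code in BlockFrequency.cpp.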
static ManagedStatic > MoreHelp; @@ -100,10 +102,10 @@ void cl::MarkOptionsChanged() { /// RegisteredOptionList - This is the list of the command line options that /// have statically constructed themselves. -static Option *RegisteredOptionList = 0; +static Option *RegisteredOptionList = nullptr; void Option::addArgument() { - assert(NextRegistered == 0 && "argument multiply registered!"); + assert(!NextRegistered && "argument multiply registered!"); NextRegistered = RegisteredOptionList; RegisteredOptionList = this; @@ -111,7 +113,7 @@ void Option::addArgument() { } void Option::removeArgument() { - assert(NextRegistered != 0 && "argument never registered"); + assert(NextRegistered && "argument never registered"); assert(RegisteredOptionList == this && "argument is not the last registered"); RegisteredOptionList = NextRegistered; MarkOptionsChanged(); @@ -144,7 +146,7 @@ static void GetOptionInfo(SmallVectorImpl &PositionalOpts, SmallVectorImpl &SinkOpts, StringMap &OptionsMap) { SmallVector OptionNames; - Option *CAOpt = 0; // The ConsumeAfter option if it exists. + Option *CAOpt = nullptr; // The ConsumeAfter option if it exists. for (Option *O = RegisteredOptionList; O; O = O->getNextRegisteredOption()) { // If this option wants to handle multiple option names, get the full set. // This handles enum options like "-O1 -O2" etc. @@ -189,7 +191,7 @@ static void GetOptionInfo(SmallVectorImpl &PositionalOpts, static Option *LookupOption(StringRef &Arg, StringRef &Value, const StringMap &OptionsMap) { // Reject all dashes. - if (Arg.empty()) return 0; + if (Arg.empty()) return nullptr; size_t EqualPos = Arg.find('='); @@ -197,14 +199,14 @@ static Option *LookupOption(StringRef &Arg, StringRef &Value, if (EqualPos == StringRef::npos) { // Look up the option. StringMap::const_iterator I = OptionsMap.find(Arg); - return I != OptionsMap.end() ? I->second : 0; + return I != OptionsMap.end() ? I->second : nullptr; } // If the argument before the = is a valid option name, we match. If not, // return Arg unmolested. StringMap::const_iterator I = OptionsMap.find(Arg.substr(0, EqualPos)); - if (I == OptionsMap.end()) return 0; + if (I == OptionsMap.end()) return nullptr; Value = Arg.substr(EqualPos+1); Arg = Arg.substr(0, EqualPos); @@ -219,7 +221,7 @@ static Option *LookupNearestOption(StringRef Arg, const StringMap &OptionsMap, std::string &NearestString) { // Reject all dashes. - if (Arg.empty()) return 0; + if (Arg.empty()) return nullptr; // Split on any equal sign. std::pair SplitArg = Arg.split('='); @@ -227,7 +229,7 @@ static Option *LookupNearestOption(StringRef Arg, StringRef &RHS = SplitArg.second; // Find the closest match. - Option *Best = 0; + Option *Best = nullptr; unsigned BestDistance = 0; for (StringMap::const_iterator it = OptionsMap.begin(), ie = OptionsMap.end(); it != ie; ++it) { @@ -300,7 +302,7 @@ static inline bool ProvideOption(Option *Handler, StringRef ArgName, // Enforce value requirements switch (Handler->getValueExpectedFlag()) { case ValueRequired: - if (Value.data() == 0) { // No value specified? + if (!Value.data()) { // No value specified? 
if (i+1 >= argc) return Handler->error("requires a value!"); // Steal the next argument, like for '-o filename' @@ -349,7 +351,7 @@ static inline bool ProvideOption(Option *Handler, StringRef ArgName, static bool ProvidePositionalOption(Option *Handler, StringRef Arg, int i) { int Dummy = i; - return ProvideOption(Handler, Handler->ArgStr, Arg, 0, 0, Dummy); + return ProvideOption(Handler, Handler->ArgStr, Arg, 0, nullptr, Dummy); } @@ -385,7 +387,7 @@ static Option *getOptionPred(StringRef Name, size_t &Length, Length = Name.size(); return OMI->second; // Found one! } - return 0; // No option found! + return nullptr; // No option found! } /// HandlePrefixedOrGroupedOption - The specified argument string (which started @@ -395,12 +397,12 @@ static Option *getOptionPred(StringRef Name, size_t &Length, static Option *HandlePrefixedOrGroupedOption(StringRef &Arg, StringRef &Value, bool &ErrorParsing, const StringMap &OptionsMap) { - if (Arg.size() == 1) return 0; + if (Arg.size() == 1) return nullptr; // Do the lookup! size_t Length = 0; Option *PGOpt = getOptionPred(Arg, Length, isPrefixedOrGrouping, OptionsMap); - if (PGOpt == 0) return 0; + if (!PGOpt) return nullptr; // If the option is a prefixed option, then the value is simply the // rest of the name... so fall through to later processing, by @@ -427,7 +429,7 @@ static Option *HandlePrefixedOrGroupedOption(StringRef &Arg, StringRef &Value, "Option can not be cl::Grouping AND cl::ValueRequired!"); int Dummy = 0; ErrorParsing |= ProvideOption(PGOpt, OneArgName, - StringRef(), 0, 0, Dummy); + StringRef(), 0, nullptr, Dummy); // Get the next grouping option. PGOpt = getOptionPred(Arg, Length, isGrouping, OptionsMap); @@ -746,7 +748,7 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv, argc = static_cast(newArgv.size()); // Copy the program name into ProgName, making sure not to overflow it. - std::string ProgName = sys::path::filename(argv[0]); + StringRef ProgName = sys::path::filename(argv[0]); size_t Len = std::min(ProgName.size(), size_t(79)); memcpy(ProgramName, ProgName.data(), Len); ProgramName[Len] = '\0'; @@ -760,7 +762,7 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv, // Determine whether or not there are an unlimited number of positionals bool HasUnlimitedPositionals = false; - Option *ConsumeAfterOpt = 0; + Option *ConsumeAfterOpt = nullptr; if (!PositionalOpts.empty()) { if (PositionalOpts[0]->getNumOccurrencesFlag() == cl::ConsumeAfter) { assert(PositionalOpts.size() > 1 && @@ -770,7 +772,7 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv, // Calculate how many positional values are _required_. bool UnboundedFound = false; - for (size_t i = ConsumeAfterOpt != 0, e = PositionalOpts.size(); + for (size_t i = ConsumeAfterOpt ? 1 : 0, e = PositionalOpts.size(); i != e; ++i) { Option *Opt = PositionalOpts[i]; if (RequiresValue(Opt)) @@ -806,13 +808,13 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv, // If the program has named positional arguments, and the name has been run // across, keep track of which positional argument was named. Otherwise put // the positional args into the PositionalVals list... - Option *ActivePositionalArg = 0; + Option *ActivePositionalArg = nullptr; // Loop over all of the arguments... processing them. bool DashDashFound = false; // Have we read '--'? 
for (int i = 1; i < argc; ++i) { - Option *Handler = 0; - Option *NearestHandler = 0; + Option *Handler = nullptr; + Option *NearestHandler = nullptr; std::string NearestHandlerString; StringRef Value; StringRef ArgName = ""; @@ -845,8 +847,7 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv, // All of the positional arguments have been fulfulled, give the rest to // the consume after option... if it's specified... // - if (PositionalVals.size() >= NumPositionalRequired && - ConsumeAfterOpt != 0) { + if (PositionalVals.size() >= NumPositionalRequired && ConsumeAfterOpt) { for (++i; i < argc; ++i) PositionalVals.push_back(std::make_pair(argv[i],i)); break; // Handle outside of the argument processing loop... @@ -884,18 +885,18 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv, Handler = LookupOption(ArgName, Value, Opts); // Check to see if this "option" is really a prefixed or grouped argument. - if (Handler == 0) + if (!Handler) Handler = HandlePrefixedOrGroupedOption(ArgName, Value, ErrorParsing, Opts); // Otherwise, look for the closest available option to report to the user // in the upcoming error. - if (Handler == 0 && SinkOpts.empty()) + if (!Handler && SinkOpts.empty()) NearestHandler = LookupNearestOption(ArgName, Opts, NearestHandlerString); } - if (Handler == 0) { + if (!Handler) { if (SinkOpts.empty()) { errs() << ProgramName << ": Unknown command line argument '" << argv[i] << "'. Try: '" << argv[0] << " -help'\n"; @@ -939,7 +940,7 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv, << " positional arguments: See: " << argv[0] << " -help\n"; ErrorParsing = true; - } else if (ConsumeAfterOpt == 0) { + } else if (!ConsumeAfterOpt) { // Positional args have already been handled if ConsumeAfter is specified. unsigned ValNo = 0, NumVals = static_cast(PositionalVals.size()); for (size_t i = 0, e = PositionalOpts.size(); i != e; ++i) { @@ -1044,7 +1045,7 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv, // bool Option::error(const Twine &Message, StringRef ArgName) { - if (ArgName.data() == 0) ArgName = ArgStr; + if (!ArgName.data()) ArgName = ArgStr; if (ArgName.empty()) errs() << HelpStr; // Be nice for positional arguments else @@ -1455,12 +1456,12 @@ public: outs() << "USAGE: " << ProgramName << " [options]"; // Print out the positional options. - Option *CAOpt = 0; // The cl::ConsumeAfter option, if it exists... + Option *CAOpt = nullptr; // The cl::ConsumeAfter option, if it exists... if (!PositionalOpts.empty() && PositionalOpts[0]->getNumOccurrencesFlag() == ConsumeAfter) CAOpt = PositionalOpts[0]; - for (size_t i = CAOpt != 0, e = PositionalOpts.size(); i != e; ++i) { + for (size_t i = CAOpt != nullptr, e = PositionalOpts.size(); i != e; ++i) { if (PositionalOpts[i]->ArgStr[0]) outs() << " --" << PositionalOpts[i]->ArgStr; outs() << " " << PositionalOpts[i]->HelpStr; @@ -1555,7 +1556,7 @@ protected: outs() << (*Category)->getName() << ":\n"; // Check if description is set. 
- if ((*Category)->getDescription() != 0) + if ((*Category)->getDescription() != nullptr) outs() << (*Category)->getDescription() << "\n\n"; else outs() << "\n"; @@ -1686,9 +1687,9 @@ void cl::PrintOptionValues() { Opts[i].second->printOptionValue(MaxArgLen, PrintAllOptions); } -static void (*OverrideVersionPrinter)() = 0; +static void (*OverrideVersionPrinter)() = nullptr; -static std::vector* ExtraVersionPrinters = 0; +static std::vector* ExtraVersionPrinters = nullptr; namespace { class VersionPrinter { @@ -1721,7 +1722,7 @@ public: void operator=(bool OptionWasSpecified) { if (!OptionWasSpecified) return; - if (OverrideVersionPrinter != 0) { + if (OverrideVersionPrinter != nullptr) { (*OverrideVersionPrinter)(); exit(0); } @@ -1729,7 +1730,7 @@ public: // Iterate over any registered extra printers and call them to add further // information. - if (ExtraVersionPrinters != 0) { + if (ExtraVersionPrinters != nullptr) { outs() << '\n'; for (std::vector::iterator I = ExtraVersionPrinters->begin(), E = ExtraVersionPrinters->end(); @@ -1779,7 +1780,7 @@ void cl::SetVersionPrinter(void (*func)()) { } void cl::AddExtraVersionPrinter(void (*func)()) { - if (ExtraVersionPrinters == 0) + if (!ExtraVersionPrinters) ExtraVersionPrinters = new std::vector; ExtraVersionPrinters->push_back(func); diff --git a/lib/Support/Compression.cpp b/lib/Support/Compression.cpp index 5e53361..c32eb213 100644 --- a/lib/Support/Compression.cpp +++ b/lib/Support/Compression.cpp @@ -16,7 +16,6 @@ #include "llvm/Config/config.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MemoryBuffer.h" #if LLVM_ENABLE_ZLIB == 1 && HAVE_ZLIB_H #include #endif @@ -47,36 +46,26 @@ static zlib::Status encodeZlibReturnValue(int ReturnValue) { bool zlib::isAvailable() { return true; } zlib::Status zlib::compress(StringRef InputBuffer, - std::unique_ptr &CompressedBuffer, + SmallVectorImpl &CompressedBuffer, CompressionLevel Level) { unsigned long CompressedSize = ::compressBound(InputBuffer.size()); - std::unique_ptr TmpBuffer(new char[CompressedSize]); + CompressedBuffer.resize(CompressedSize); int CLevel = encodeZlibCompressionLevel(Level); Status Res = encodeZlibReturnValue(::compress2( - (Bytef *)TmpBuffer.get(), &CompressedSize, + (Bytef *)CompressedBuffer.data(), &CompressedSize, (const Bytef *)InputBuffer.data(), InputBuffer.size(), CLevel)); - if (Res == StatusOK) { - CompressedBuffer.reset(MemoryBuffer::getMemBufferCopy( - StringRef(TmpBuffer.get(), CompressedSize))); - // Tell MSan that memory initialized by zlib is valid. - __msan_unpoison(CompressedBuffer->getBufferStart(), CompressedSize); - } + CompressedBuffer.resize(CompressedSize); return Res; } zlib::Status zlib::uncompress(StringRef InputBuffer, - std::unique_ptr &UncompressedBuffer, + SmallVectorImpl &UncompressedBuffer, size_t UncompressedSize) { - std::unique_ptr TmpBuffer(new char[UncompressedSize]); - Status Res = encodeZlibReturnValue( - ::uncompress((Bytef *)TmpBuffer.get(), (uLongf *)&UncompressedSize, - (const Bytef *)InputBuffer.data(), InputBuffer.size())); - if (Res == StatusOK) { - UncompressedBuffer.reset(MemoryBuffer::getMemBufferCopy( - StringRef(TmpBuffer.get(), UncompressedSize))); - // Tell MSan that memory initialized by zlib is valid. 
- __msan_unpoison(UncompressedBuffer->getBufferStart(), UncompressedSize); - } + UncompressedBuffer.resize(UncompressedSize); + Status Res = encodeZlibReturnValue(::uncompress( + (Bytef *)UncompressedBuffer.data(), (uLongf *)&UncompressedSize, + (const Bytef *)InputBuffer.data(), InputBuffer.size())); + UncompressedBuffer.resize(UncompressedSize); return Res; } @@ -87,12 +76,12 @@ uint32_t zlib::crc32(StringRef Buffer) { #else bool zlib::isAvailable() { return false; } zlib::Status zlib::compress(StringRef InputBuffer, - std::unique_ptr &CompressedBuffer, + SmallVectorImpl &CompressedBuffer, CompressionLevel Level) { return zlib::StatusUnsupported; } zlib::Status zlib::uncompress(StringRef InputBuffer, - std::unique_ptr &UncompressedBuffer, + SmallVectorImpl &UncompressedBuffer, size_t UncompressedSize) { return zlib::StatusUnsupported; } diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp index ccc0089..a426377 100644 --- a/lib/Support/CrashRecoveryContext.cpp +++ b/lib/Support/CrashRecoveryContext.cpp @@ -89,16 +89,16 @@ CrashRecoveryContext::~CrashRecoveryContext() { } bool CrashRecoveryContext::isRecoveringFromCrash() { - return tlIsRecoveringFromCrash->get() != 0; + return tlIsRecoveringFromCrash->get() != nullptr; } CrashRecoveryContext *CrashRecoveryContext::GetCurrent() { if (!gCrashRecoveryEnabled) - return 0; + return nullptr; const CrashRecoveryContextImpl *CRCI = CurrentContext->get(); if (!CRCI) - return 0; + return nullptr; return CRCI->CRC; } @@ -120,7 +120,7 @@ CrashRecoveryContext::unregisterCleanup(CrashRecoveryContextCleanup *cleanup) { if (cleanup == head) { head = cleanup->next; if (head) - head->prev = 0; + head->prev = nullptr; } else { cleanup->prev->next = cleanup->next; @@ -261,7 +261,7 @@ static void CrashRecoverySignalHandler(int Signal) { sigset_t SigMask; sigemptyset(&SigMask); sigaddset(&SigMask, Signal); - sigprocmask(SIG_UNBLOCK, &SigMask, 0); + sigprocmask(SIG_UNBLOCK, &SigMask, nullptr); if (CRCI) const_cast(CRCI)->HandleCrash(); @@ -296,12 +296,12 @@ void CrashRecoveryContext::Disable() { // Restore the previous signal handlers. for (unsigned i = 0; i != NumSignals; ++i) - sigaction(Signals[i], &PrevActions[i], 0); + sigaction(Signals[i], &PrevActions[i], nullptr); } #endif -bool CrashRecoveryContext::RunSafely(void (*Fn)(void*), void *UserData) { +bool CrashRecoveryContext::RunSafely(function_ref Fn) { // If crash recovery is disabled, do nothing. 
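
The Compression.cpp rewrite above drops the MemoryBuffer round-trip: callers hand
in a SmallVectorImpl<char>, compress() grows it to compressBound() and then shrinks
it to the real output size, and the __msan_unpoison step becomes unnecessary because
resize() initializes the storage. A caller sketch against the new signature
(DefaultCompression is assumed to be the header's default level enumerator):

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/Compression.h"
    using namespace llvm;

    // Returns true on success; Out holds the deflated bytes, sized exactly.
    static bool deflateBlob(StringRef Input, SmallVectorImpl<char> &Out) {
      return zlib::compress(Input, Out, zlib::DefaultCompression) ==
             zlib::StatusOK;
    }
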
if (gCrashRecoveryEnabled) { assert(!Impl && "Crash recovery context already initialized!"); @@ -313,7 +313,7 @@ bool CrashRecoveryContext::RunSafely(void (*Fn)(void*), void *UserData) { } } - Fn(UserData); + Fn(); return true; } @@ -334,8 +334,7 @@ const std::string &CrashRecoveryContext::getBacktrace() const { namespace { struct RunSafelyOnThreadInfo { - void (*Fn)(void*); - void *Data; + function_ref Fn; CrashRecoveryContext *CRC; bool Result; }; @@ -344,11 +343,11 @@ struct RunSafelyOnThreadInfo { static void RunSafelyOnThread_Dispatch(void *UserData) { RunSafelyOnThreadInfo *Info = reinterpret_cast(UserData); - Info->Result = Info->CRC->RunSafely(Info->Fn, Info->Data); + Info->Result = Info->CRC->RunSafely(Info->Fn); } -bool CrashRecoveryContext::RunSafelyOnThread(void (*Fn)(void*), void *UserData, +bool CrashRecoveryContext::RunSafelyOnThread(function_ref Fn, unsigned RequestedStackSize) { - RunSafelyOnThreadInfo Info = { Fn, UserData, this, false }; + RunSafelyOnThreadInfo Info = { Fn, this, false }; llvm_execute_on_thread(RunSafelyOnThread_Dispatch, &Info, RequestedStackSize); if (CrashRecoveryContextImpl *CRC = (CrashRecoveryContextImpl *)Impl) CRC->setSwitchedThread(); diff --git a/lib/Support/DAGDeltaAlgorithm.cpp b/lib/Support/DAGDeltaAlgorithm.cpp index 29acb7d..0d504ee 100644 --- a/lib/Support/DAGDeltaAlgorithm.cpp +++ b/lib/Support/DAGDeltaAlgorithm.cpp @@ -42,6 +42,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "dag-delta" + namespace { class DAGDeltaAlgorithmImpl { diff --git a/lib/Support/DataExtractor.cpp b/lib/Support/DataExtractor.cpp index a564d21..7b82921 100644 --- a/lib/Support/DataExtractor.cpp +++ b/lib/Support/DataExtractor.cpp @@ -44,7 +44,7 @@ static T *getUs(uint32_t *offset_ptr, T *dst, uint32_t count, // success return dst; } - return NULL; + return nullptr; } uint8_t DataExtractor::getU8(uint32_t *offset_ptr) const { @@ -125,7 +125,7 @@ const char *DataExtractor::getCStr(uint32_t *offset_ptr) const { *offset_ptr = pos + 1; return Data.data() + offset; } - return NULL; + return nullptr; } uint64_t DataExtractor::getULEB128(uint32_t *offset_ptr) const { diff --git a/lib/Support/DataStream.cpp b/lib/Support/DataStream.cpp index 1caeddf..eec8584 100644 --- a/lib/Support/DataStream.cpp +++ b/lib/Support/DataStream.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "Data-stream" #include "llvm/Support/DataStream.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/FileSystem.h" @@ -30,6 +29,8 @@ #endif using namespace llvm; +#define DEBUG_TYPE "Data-stream" + // Interface goals: // * StreamableMemoryObject doesn't care about complexities like using // threads/async callbacks to actually overlap download+compile @@ -83,7 +84,7 @@ DataStreamer *getDataFileStreamer(const std::string &Filename, if (error_code e = s->OpenFile(Filename)) { *StrError = std::string("Could not open ") + Filename + ": " + e.message() + "\n"; - return NULL; + return nullptr; } return s; } diff --git a/lib/Support/Debug.cpp b/lib/Support/Debug.cpp index d9cb8a9..ad4d4ef 100644 --- a/lib/Support/Debug.cpp +++ b/lib/Support/Debug.cpp @@ -109,7 +109,7 @@ raw_ostream &llvm::dbgs() { if (EnableDebugBuffering && DebugFlag && DebugBufferSize != 0) // TODO: Add a handler for SIGUSER1-type signals so the user can // force a debug dump. 
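
The CrashRecoveryContext hunks above collapse the C-style (function pointer,
void* cookie) pairs into llvm::function_ref, so call sites can pass capturing
lambdas directly. A minimal sketch (safeParse and its parser argument are
illustrative, not code from the patch):

    #include "llvm/Support/CrashRecoveryContext.h"
    using namespace llvm;

    // The lambda's captures replace the old void* UserData cookie.
    bool safeParse(const char *Path, bool (*ParseFile)(const char *)) {
      CrashRecoveryContext CRC;
      bool ParseOK = false;
      if (!CRC.RunSafely([&] { ParseOK = ParseFile(Path); }))
        return false;   // the body crashed and was recovered
      return ParseOK;
    }
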
- sys::AddSignalHandler(&debug_user_sig_handler, 0); + sys::AddSignalHandler(&debug_user_sig_handler, nullptr); // Otherwise we've already set the debug stream buffer size to // zero, disabling buffering so it will output directly to errs(). } diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp index 6604cc7..c9efa61 100644 --- a/lib/Support/Dwarf.cpp +++ b/lib/Support/Dwarf.cpp @@ -100,7 +100,7 @@ const char *llvm::dwarf::TagString(unsigned Tag) { return "DW_TAG_GNU_formal_parameter_pack"; case DW_TAG_APPLE_property: return "DW_TAG_APPLE_property"; } - return 0; + return nullptr; } /// ChildrenString - Return the string for the specified children flag. @@ -110,7 +110,7 @@ const char *llvm::dwarf::ChildrenString(unsigned Children) { case DW_CHILDREN_no: return "DW_CHILDREN_no"; case DW_CHILDREN_yes: return "DW_CHILDREN_yes"; } - return 0; + return nullptr; } /// AttributeString - Return the string for the specified attribute. @@ -271,7 +271,7 @@ const char *llvm::dwarf::AttributeString(unsigned Attribute) { case DW_AT_GNU_pubnames: return "DW_AT_GNU_pubnames"; case DW_AT_GNU_pubtypes: return "DW_AT_GNU_pubtypes"; } - return 0; + return nullptr; } /// FormEncodingString - Return the string for the specified form encoding. @@ -308,7 +308,7 @@ const char *llvm::dwarf::FormEncodingString(unsigned Encoding) { case DW_FORM_GNU_addr_index: return "DW_FORM_GNU_addr_index"; case DW_FORM_GNU_str_index: return "DW_FORM_GNU_str_index"; } - return 0; + return nullptr; } /// OperationEncodingString - Return the string for the specified operation @@ -477,7 +477,7 @@ const char *llvm::dwarf::OperationEncodingString(unsigned Encoding) { case DW_OP_GNU_addr_index: return "DW_OP_GNU_addr_index"; case DW_OP_GNU_const_index: return "DW_OP_GNU_const_index"; } - return 0; + return nullptr; } /// AttributeEncodingString - Return the string for the specified attribute @@ -503,7 +503,7 @@ const char *llvm::dwarf::AttributeEncodingString(unsigned Encoding) { case DW_ATE_lo_user: return "DW_ATE_lo_user"; case DW_ATE_hi_user: return "DW_ATE_hi_user"; } - return 0; + return nullptr; } /// DecimalSignString - Return the string for the specified decimal sign @@ -516,7 +516,7 @@ const char *llvm::dwarf::DecimalSignString(unsigned Sign) { case DW_DS_leading_separate: return "DW_DS_leading_separate"; case DW_DS_trailing_separate: return "DW_DS_trailing_separate"; } - return 0; + return nullptr; } /// EndianityString - Return the string for the specified endianity. @@ -529,7 +529,7 @@ const char *llvm::dwarf::EndianityString(unsigned Endian) { case DW_END_lo_user: return "DW_END_lo_user"; case DW_END_hi_user: return "DW_END_hi_user"; } - return 0; + return nullptr; } /// AccessibilityString - Return the string for the specified accessibility. @@ -541,7 +541,7 @@ const char *llvm::dwarf::AccessibilityString(unsigned Access) { case DW_ACCESS_protected: return "DW_ACCESS_protected"; case DW_ACCESS_private: return "DW_ACCESS_private"; } - return 0; + return nullptr; } /// VisibilityString - Return the string for the specified visibility. @@ -552,7 +552,7 @@ const char *llvm::dwarf::VisibilityString(unsigned Visibility) { case DW_VIS_exported: return "DW_VIS_exported"; case DW_VIS_qualified: return "DW_VIS_qualified"; } - return 0; + return nullptr; } /// VirtualityString - Return the string for the specified virtuality. 
@@ -563,7 +563,7 @@ const char *llvm::dwarf::VirtualityString(unsigned Virtuality) { case DW_VIRTUALITY_virtual: return "DW_VIRTUALITY_virtual"; case DW_VIRTUALITY_pure_virtual: return "DW_VIRTUALITY_pure_virtual"; } - return 0; + return nullptr; } /// LanguageString - Return the string for the specified language. @@ -600,7 +600,7 @@ const char *llvm::dwarf::LanguageString(unsigned Language) { case DW_LANG_lo_user: return "DW_LANG_lo_user"; case DW_LANG_hi_user: return "DW_LANG_hi_user"; } - return 0; + return nullptr; } /// CaseString - Return the string for the specified identifier case. @@ -612,7 +612,7 @@ const char *llvm::dwarf::CaseString(unsigned Case) { case DW_ID_down_case: return "DW_ID_down_case"; case DW_ID_case_insensitive: return "DW_ID_case_insensitive"; } - return 0; + return nullptr; } /// ConventionString - Return the string for the specified calling convention. @@ -625,7 +625,7 @@ const char *llvm::dwarf::ConventionString(unsigned Convention) { case DW_CC_lo_user: return "DW_CC_lo_user"; case DW_CC_hi_user: return "DW_CC_hi_user"; } - return 0; + return nullptr; } /// InlineCodeString - Return the string for the specified inline code. @@ -637,7 +637,7 @@ const char *llvm::dwarf::InlineCodeString(unsigned Code) { case DW_INL_declared_not_inlined: return "DW_INL_declared_not_inlined"; case DW_INL_declared_inlined: return "DW_INL_declared_inlined"; } - return 0; + return nullptr; } /// ArrayOrderString - Return the string for the specified array order. @@ -647,7 +647,7 @@ const char *llvm::dwarf::ArrayOrderString(unsigned Order) { case DW_ORD_row_major: return "DW_ORD_row_major"; case DW_ORD_col_major: return "DW_ORD_col_major"; } - return 0; + return nullptr; } /// DiscriminantString - Return the string for the specified discriminant @@ -657,7 +657,7 @@ const char *llvm::dwarf::DiscriminantString(unsigned Discriminant) { case DW_DSC_label: return "DW_DSC_label"; case DW_DSC_range: return "DW_DSC_range"; } - return 0; + return nullptr; } /// LNStandardString - Return the string for the specified line number standard. @@ -677,7 +677,7 @@ const char *llvm::dwarf::LNStandardString(unsigned Standard) { case DW_LNS_set_epilogue_begin: return "DW_LNS_set_epilogue_begin"; case DW_LNS_set_isa: return "DW_LNS_set_isa"; } - return 0; + return nullptr; } /// LNExtendedString - Return the string for the specified line number extended @@ -692,7 +692,7 @@ const char *llvm::dwarf::LNExtendedString(unsigned Encoding) { case DW_LNE_lo_user: return "DW_LNE_lo_user"; case DW_LNE_hi_user: return "DW_LNE_hi_user"; } - return 0; + return nullptr; } /// MacinfoString - Return the string for the specified macinfo type encodings. 
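
Every one of these dwarf::*String helpers keeps its contract and merely spells the
failure sentinel as nullptr instead of 0, so callers must still guard before
streaming the result. A minimal caller sketch (OS and Tag assumed in scope):

    if (const char *Name = llvm::dwarf::TagString(Tag))
      OS << Name;
    else
      OS << "DW_TAG_<unknown>";   // helper returned nullptr
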
@@ -706,7 +706,7 @@ const char *llvm::dwarf::MacinfoString(unsigned Encoding) { case DW_MACINFO_end_file: return "DW_MACINFO_end_file"; case DW_MACINFO_vendor_ext: return "DW_MACINFO_vendor_ext"; } - return 0; + return nullptr; } /// CallFrameString - Return the string for the specified call frame instruction @@ -745,7 +745,7 @@ const char *llvm::dwarf::CallFrameString(unsigned Encoding) { case DW_CFA_lo_user: return "DW_CFA_lo_user"; case DW_CFA_hi_user: return "DW_CFA_hi_user"; } - return 0; + return nullptr; } const char *llvm::dwarf::AtomTypeString(unsigned AT) { @@ -761,7 +761,7 @@ const char *llvm::dwarf::AtomTypeString(unsigned AT) { case DW_ATOM_type_flags: return "DW_ATOM_type_flags"; } - return 0; + return nullptr; } const char *llvm::dwarf::GDBIndexEntryKindString(GDBIndexEntryKind Kind) { diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp index 5d77153..82d7c0c 100644 --- a/lib/Support/DynamicLibrary.cpp +++ b/lib/Support/DynamicLibrary.cpp @@ -51,14 +51,14 @@ using namespace llvm::sys; //=== independent code. //===----------------------------------------------------------------------===// -static DenseSet *OpenedHandles = 0; +static DenseSet *OpenedHandles = nullptr; DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename, std::string *errMsg) { SmartScopedLock lock(*SymbolsMutex); void *handle = dlopen(filename, RTLD_LAZY|RTLD_GLOBAL); - if (handle == 0) { + if (!handle) { if (errMsg) *errMsg = dlerror(); return DynamicLibrary(); } @@ -66,11 +66,11 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename, #ifdef __CYGWIN__ // Cygwin searches symbols only in the main // with the handle of dlopen(NULL, RTLD_GLOBAL). - if (filename == NULL) + if (!filename) handle = RTLD_DEFAULT; #endif - if (OpenedHandles == 0) + if (!OpenedHandles) OpenedHandles = new DenseSet(); // If we've already loaded this library, dlclose() the handle in order to @@ -83,7 +83,7 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename, void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) { if (!isValid()) - return NULL; + return nullptr; return dlsym(Data, symbolName); } @@ -166,7 +166,7 @@ void* DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) { #endif #undef EXPLICIT_SYMBOL - return 0; + return nullptr; } #endif // LLVM_ON_WIN32 diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp index 1aa8303..342c4f0 100644 --- a/lib/Support/ErrorHandling.cpp +++ b/lib/Support/ErrorHandling.cpp @@ -34,8 +34,8 @@ using namespace llvm; -static fatal_error_handler_t ErrorHandler = 0; -static void *ErrorHandlerUserData = 0; +static fatal_error_handler_t ErrorHandler = nullptr; +static void *ErrorHandlerUserData = nullptr; void llvm::install_fatal_error_handler(fatal_error_handler_t handler, void *user_data) { @@ -47,7 +47,7 @@ void llvm::install_fatal_error_handler(fatal_error_handler_t handler, } void llvm::remove_fatal_error_handler() { - ErrorHandler = 0; + ErrorHandler = nullptr; } void llvm::report_fatal_error(const char *Reason, bool GenCrashDiag) { diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp index 8f2c9fc..49311c2 100644 --- a/lib/Support/FileOutputBuffer.cpp +++ b/lib/Support/FileOutputBuffer.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/FileOutputBuffer.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/raw_ostream.h" #include 
"llvm/Support/system_error.h" @@ -85,19 +84,9 @@ error_code FileOutputBuffer::create(StringRef FilePath, return error_code::success(); } -error_code FileOutputBuffer::create(StringRef FilePath, - size_t Size, - OwningPtr &Result, - unsigned Flags) { - std::unique_ptr FOB; - error_code ec = create(FilePath, Size, FOB, Flags); - Result = std::move(FOB); - return ec; -} - error_code FileOutputBuffer::commit(int64_t NewSmallerSize) { // Unmap buffer, letting OS flush dirty pages to file on disk. - Region.reset(0); + Region.reset(nullptr); // If requested, resize file as part of commit. if ( NewSmallerSize != -1 ) { diff --git a/lib/Support/FoldingSet.cpp b/lib/Support/FoldingSet.cpp index 145f12d..4635114 100644 --- a/lib/Support/FoldingSet.cpp +++ b/lib/Support/FoldingSet.cpp @@ -190,7 +190,7 @@ FoldingSetNodeID::Intern(BumpPtrAllocator &Allocator) const { static FoldingSetImpl::Node *GetNextPtr(void *NextInBucketPtr) { // The low bit is set if this is the pointer back to the bucket. if (reinterpret_cast(NextInBucketPtr) & 1) - return 0; + return nullptr; return static_cast(NextInBucketPtr); } @@ -262,7 +262,7 @@ void FoldingSetImpl::GrowHashTable() { while (Node *NodeInBucket = GetNextPtr(Probe)) { // Figure out the next link, remove NodeInBucket from the old link. Probe = NodeInBucket->getNextInBucket(); - NodeInBucket->SetNextInBucket(0); + NodeInBucket->SetNextInBucket(nullptr); // Insert the node into the new bucket, after recomputing the hash. InsertNode(NodeInBucket, @@ -285,7 +285,7 @@ FoldingSetImpl::Node void **Bucket = GetBucketFor(IDHash, Buckets, NumBuckets); void *Probe = *Bucket; - InsertPos = 0; + InsertPos = nullptr; FoldingSetNodeID TempID; while (Node *NodeInBucket = GetNextPtr(Probe)) { @@ -298,14 +298,14 @@ FoldingSetImpl::Node // Didn't find the node, return null with the bucket as the InsertPos. InsertPos = Bucket; - return 0; + return nullptr; } /// InsertNode - Insert the specified node into the folding set, knowing that it /// is not already in the map. InsertPos must be obtained from /// FindNodeOrInsertPos. void FoldingSetImpl::InsertNode(Node *N, void *InsertPos) { - assert(N->getNextInBucket() == 0); + assert(!N->getNextInBucket()); // Do we need to grow the hashtable? if (NumNodes+1 > NumBuckets*2) { GrowHashTable(); @@ -323,7 +323,7 @@ void FoldingSetImpl::InsertNode(Node *N, void *InsertPos) { // If this is the first insertion into this bucket, its next pointer will be // null. Pretend as if it pointed to itself, setting the low bit to indicate // that it is a pointer to the bucket. - if (Next == 0) + if (!Next) Next = reinterpret_cast(reinterpret_cast(Bucket)|1); // Set the node's next pointer, and make the bucket point to the node. @@ -337,10 +337,10 @@ bool FoldingSetImpl::RemoveNode(Node *N) { // Because each bucket is a circular list, we don't need to compute N's hash // to remove it. void *Ptr = N->getNextInBucket(); - if (Ptr == 0) return false; // Not in folding set. + if (!Ptr) return false; // Not in folding set. --NumNodes; - N->SetNextInBucket(0); + N->SetNextInBucket(nullptr); // Remember what N originally pointed to, either a bucket or another node. void *NodeNextPtr = Ptr; @@ -390,7 +390,7 @@ FoldingSetImpl::Node *FoldingSetImpl::GetOrInsertNode(FoldingSetImpl::Node *N) { FoldingSetIteratorImpl::FoldingSetIteratorImpl(void **Bucket) { // Skip to the first non-null non-self-cycle bucket. 
while (*Bucket != reinterpret_cast(-1) && - (*Bucket == 0 || GetNextPtr(*Bucket) == 0)) + (!*Bucket || !GetNextPtr(*Bucket))) ++Bucket; NodePtr = static_cast(*Bucket); @@ -410,7 +410,7 @@ void FoldingSetIteratorImpl::advance() { do { ++Bucket; } while (*Bucket != reinterpret_cast(-1) && - (*Bucket == 0 || GetNextPtr(*Bucket) == 0)); + (!*Bucket || !GetNextPtr(*Bucket))); NodePtr = static_cast(*Bucket); } @@ -420,5 +420,5 @@ void FoldingSetIteratorImpl::advance() { // FoldingSetBucketIteratorImpl Implementation FoldingSetBucketIteratorImpl::FoldingSetBucketIteratorImpl(void **Bucket) { - Ptr = (*Bucket == 0 || GetNextPtr(*Bucket) == 0) ? (void*) Bucket : *Bucket; + Ptr = (!*Bucket || !GetNextPtr(*Bucket)) ? (void*) Bucket : *Bucket; } diff --git a/lib/Support/FormattedStream.cpp b/lib/Support/FormattedStream.cpp index 9febf66..618ec26 100644 --- a/lib/Support/FormattedStream.cpp +++ b/lib/Support/FormattedStream.cpp @@ -81,7 +81,7 @@ void formatted_raw_ostream::write_impl(const char *Ptr, size_t Size) { TheStream->write(Ptr, Size); // Reset the scanning pointer. - Scanned = 0; + Scanned = nullptr; } /// fouts() - This returns a reference to a formatted_raw_ostream for diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp index 83aa255..f5b2943 100644 --- a/lib/Support/GraphWriter.cpp +++ b/lib/Support/GraphWriter.cpp @@ -83,7 +83,7 @@ static bool LLVM_ATTRIBUTE_UNUSED ExecGraphViewer(StringRef ExecPath, std::vector &args, StringRef Filename, bool wait, std::string &ErrMsg) { if (wait) { - if (sys::ExecuteAndWait(ExecPath, &args[0],0,0,0,0,&ErrMsg)) { + if (sys::ExecuteAndWait(ExecPath, &args[0],nullptr,nullptr,0,0,&ErrMsg)) { errs() << "Error: " << ErrMsg << "\n"; return false; } @@ -91,7 +91,7 @@ ExecGraphViewer(StringRef ExecPath, std::vector &args, errs() << " done. \n"; } else { - sys::ExecuteNoWait(ExecPath, &args[0],0,0,0,&ErrMsg); + sys::ExecuteNoWait(ExecPath, &args[0],nullptr,nullptr,0,&ErrMsg); errs() << "Remember to erase graph file: " << Filename.str() << "\n"; } return true; @@ -108,7 +108,7 @@ void llvm::DisplayGraph(StringRef FilenameRef, bool wait, std::vector args; args.push_back(Graphviz.c_str()); args.push_back(Filename.c_str()); - args.push_back(0); + args.push_back(nullptr); errs() << "Running 'Graphviz' program... "; if (!ExecGraphViewer(Graphviz, args, Filename, wait, ErrMsg)) diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index b6e2cb1..fd0472e 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -39,6 +39,8 @@ #include #endif +#define DEBUG_TYPE "host-detection" + //===----------------------------------------------------------------------===// // // Implementations of the CPU detection routines @@ -221,6 +223,7 @@ StringRef sys::getHostCPUName() { (EBX & 0x20); GetX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); bool Em64T = (EDX >> 29) & 0x1; + bool HasTBM = (ECX >> 21) & 0x1; if (memcmp(text.c, "GenuineIntel", 12) == 0) { switch (Family) { @@ -433,9 +436,11 @@ StringRef sys::getHostCPUName() { case 21: if (!HasAVX) // If the OS doesn't support AVX provide a sane fallback. 
return "btver1"; + if (Model >= 0x50) + return "bdver4"; // 50h-6Fh: Excavator if (Model >= 0x30) return "bdver3"; // 30h-3Fh: Steamroller - if (Model >= 0x10) + if (Model >= 0x10 || HasTBM) return "bdver2"; // 10h-1Fh: Piledriver return "bdver1"; // 00h-0Fh: Bulldozer case 22: @@ -681,7 +686,7 @@ StringRef sys::getHostCPUName() { } #endif -#if defined(__linux__) && defined(__arm__) +#if defined(__linux__) && (defined(__arm__) || defined(__aarch64__)) bool sys::getHostCPUFeatures(StringMap &Features) { std::string Err; DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err); @@ -710,8 +715,24 @@ bool sys::getHostCPUFeatures(StringMap &Features) { break; } +#if defined(__aarch64__) + // Keep track of which crypto features we have seen + enum { + CAP_AES = 0x1, + CAP_PMULL = 0x2, + CAP_SHA1 = 0x4, + CAP_SHA2 = 0x8 + }; + uint32_t crypto = 0; +#endif + for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) { StringRef LLVMFeatureStr = StringSwitch(CPUFeatures[I]) +#if defined(__aarch64__) + .Case("asimd", "neon") + .Case("fp", "fp-armv8") + .Case("crc32", "crc") +#else .Case("half", "fp16") .Case("neon", "neon") .Case("vfpv3", "vfp3") @@ -719,12 +740,32 @@ bool sys::getHostCPUFeatures(StringMap &Features) { .Case("vfpv4", "vfp4") .Case("idiva", "hwdiv-arm") .Case("idivt", "hwdiv") +#endif .Default(""); +#if defined(__aarch64__) + // We need to check crypto seperately since we need all of the crypto + // extensions to enable the subtarget feature + if (CPUFeatures[I] == "aes") + crypto |= CAP_AES; + else if (CPUFeatures[I] == "pmull") + crypto |= CAP_PMULL; + else if (CPUFeatures[I] == "sha1") + crypto |= CAP_SHA1; + else if (CPUFeatures[I] == "sha2") + crypto |= CAP_SHA2; +#endif + if (LLVMFeatureStr != "") Features.GetOrCreateValue(LLVMFeatureStr).setValue(true); } +#if defined(__aarch64__) + // If we have all crypto bits we can add the feature + if (crypto == (CAP_AES | CAP_PMULL | CAP_SHA1 | CAP_SHA2)) + Features.GetOrCreateValue("crypto").setValue(true); +#endif + return true; } #else diff --git a/lib/Support/IntervalMap.cpp b/lib/Support/IntervalMap.cpp index 4dfcc40..e11a7f2 100644 --- a/lib/Support/IntervalMap.cpp +++ b/lib/Support/IntervalMap.cpp @@ -58,7 +58,7 @@ void Path::moveLeft(unsigned Level) { } } else if (height() < Level) // end() may have created a height=0 path. - path.resize(Level + 1, Entry(0, 0, 0)); + path.resize(Level + 1, Entry(nullptr, 0, 0)); // NR is the subtree containing our left sibling. --path[l].offset; diff --git a/lib/Support/LineIterator.cpp b/lib/Support/LineIterator.cpp index 056d817..947a8fb 100644 --- a/lib/Support/LineIterator.cpp +++ b/lib/Support/LineIterator.cpp @@ -13,9 +13,10 @@ using namespace llvm; line_iterator::line_iterator(const MemoryBuffer &Buffer, char CommentMarker) - : Buffer(Buffer.getBufferSize() ? &Buffer : 0), + : Buffer(Buffer.getBufferSize() ? &Buffer : nullptr), CommentMarker(CommentMarker), LineNumber(1), - CurrentLine(Buffer.getBufferSize() ? Buffer.getBufferStart() : 0, 0) { + CurrentLine(Buffer.getBufferSize() ? Buffer.getBufferStart() : nullptr, + 0) { // Ensure that if we are constructed on a non-empty memory buffer that it is // a null terminated buffer. if (Buffer.getBufferSize()) { @@ -53,7 +54,7 @@ void line_iterator::advance() { if (*Pos == '\0') { // We've hit the end of the buffer, reset ourselves to the end state. 
- Buffer = 0; + Buffer = nullptr; CurrentLine = StringRef(); return; } diff --git a/lib/Support/LockFileManager.cpp b/lib/Support/LockFileManager.cpp index cd1cbcb..9b4bfbe 100644 --- a/lib/Support/LockFileManager.cpp +++ b/lib/Support/LockFileManager.cpp @@ -43,8 +43,11 @@ LockFileManager::readLockFile(StringRef LockFileName) { std::tie(Hostname, PIDStr) = getToken(MB->getBuffer(), " "); PIDStr = PIDStr.substr(PIDStr.find_first_not_of(" ")); int PID; - if (!PIDStr.getAsInteger(10, PID)) - return std::make_pair(std::string(Hostname), PID); + if (!PIDStr.getAsInteger(10, PID)) { + auto Owner = std::make_pair(std::string(Hostname), PID); + if (processStillExecuting(Owner.first, Owner.second)) + return Owner; + } // Delete the lock file. It's invalid anyway. sys::fs::remove(LockFileName); @@ -171,9 +174,9 @@ LockFileManager::~LockFileManager() { sys::fs::remove(UniqueLockFileName.str()); } -void LockFileManager::waitForUnlock() { +LockFileManager::WaitForUnlockResult LockFileManager::waitForUnlock() { if (getState() != LFS_Shared) - return; + return Res_Success; #if LLVM_ON_WIN32 unsigned long Interval = 1; @@ -193,7 +196,7 @@ void LockFileManager::waitForUnlock() { #if LLVM_ON_WIN32 Sleep(Interval); #else - nanosleep(&Interval, NULL); + nanosleep(&Interval, nullptr); #endif bool LockFileJustDisappeared = false; @@ -211,7 +214,7 @@ void LockFileManager::waitForUnlock() { // available now. if (LockFileGone) { if (sys::fs::exists(FileName.str())) { - return; + return Res_Success; } // The lock file is gone, so now we're waiting for the original file to @@ -234,7 +237,7 @@ void LockFileManager::waitForUnlock() { // owning the lock died without cleaning up, just bail out. if (!LockFileGone && !processStillExecuting((*Owner).first, (*Owner).second)) { - return; + return Res_OwnerDied; } // Exponentially increase the time we wait for the lock to be removed. @@ -257,4 +260,5 @@ void LockFileManager::waitForUnlock() { ); // Give up. + return Res_Timeout; } diff --git a/lib/Support/ManagedStatic.cpp b/lib/Support/ManagedStatic.cpp index 098cccb..6a1c2a5 100644 --- a/lib/Support/ManagedStatic.cpp +++ b/lib/Support/ManagedStatic.cpp @@ -17,15 +17,16 @@ #include using namespace llvm; -static const ManagedStaticBase *StaticList = 0; +static const ManagedStaticBase *StaticList = nullptr; void ManagedStaticBase::RegisterManagedStatic(void *(*Creator)(), void (*Deleter)(void*)) const { + assert(Creator); if (llvm_is_multithreaded()) { llvm_acquire_global_lock(); - if (Ptr == 0) { - void* tmp = Creator ? Creator() : 0; + if (!Ptr) { + void* tmp = Creator(); TsanHappensBefore(this); sys::MemoryFence(); @@ -45,9 +46,9 @@ void ManagedStaticBase::RegisterManagedStatic(void *(*Creator)(), llvm_release_global_lock(); } else { - assert(Ptr == 0 && DeleterFn == 0 && Next == 0 && + assert(!Ptr && !DeleterFn && !Next && "Partially initialized ManagedStatic!?"); - Ptr = Creator ? Creator() : 0; + Ptr = Creator(); DeleterFn = Deleter; // Add to list of managed statics. @@ -62,14 +63,14 @@ void ManagedStaticBase::destroy() const { "Not destroyed in reverse order of construction?"); // Unlink from list. StaticList = Next; - Next = 0; + Next = nullptr; // Destroy memory. DeleterFn(Ptr); // Cleanup. - Ptr = 0; - DeleterFn = 0; + Ptr = nullptr; + DeleterFn = nullptr; } /// llvm_shutdown - Deallocate and destroy all ManagedStatic variables. 
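
With the LockFileManager change above, waitForUnlock() stops swallowing failure
modes: callers can now tell whether the lock owner finished cleanly, died, or
simply never released the lock. A caller sketch against the new enum (OutputPath
and rebuild() are hypothetical):

    LockFileManager Locked(OutputPath);
    switch (Locked.waitForUnlock()) {
    case LockFileManager::Res_Success:
      break;              // the owner finished; its output can be reused
    case LockFileManager::Res_OwnerDied:
    case LockFileManager::Res_Timeout:
      rebuild();          // owner crashed or we gave up waiting; redo the work
      break;
    }
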
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index 2d593a8..629d885 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/MemoryBuffer.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallString.h" #include "llvm/Config/config.h" #include "llvm/Support/Errno.h" @@ -27,19 +26,11 @@ #include #include #include -#include #include #if !defined(_MSC_VER) && !defined(__MINGW32__) #include #else #include -// Simplistic definitinos of these macros for use in getOpenFile. -#ifndef S_ISREG -#define S_ISREG(x) (1) -#endif -#ifndef S_ISBLK -#define S_ISBLK(x) (0) -#endif #endif using namespace llvm; @@ -117,7 +108,7 @@ MemoryBuffer *MemoryBuffer::getMemBuffer(StringRef InputData, MemoryBuffer *MemoryBuffer::getMemBufferCopy(StringRef InputData, StringRef BufferName) { MemoryBuffer *Buf = getNewUninitMemBuffer(InputData.size(), BufferName); - if (!Buf) return 0; + if (!Buf) return nullptr; memcpy(const_cast(Buf->getBufferStart()), InputData.data(), InputData.size()); return Buf; @@ -137,7 +128,7 @@ MemoryBuffer *MemoryBuffer::getNewUninitMemBuffer(size_t Size, RoundUpToAlignment(sizeof(MemoryBufferMem) + BufferName.size() + 1, 16); size_t RealLen = AlignedStringLen + Size + 1; char *Mem = static_cast(operator new(RealLen, std::nothrow)); - if (!Mem) return 0; + if (!Mem) return nullptr; // The name is stored after the class itself. CopyStringRef(Mem + sizeof(MemoryBufferMem), BufferName); @@ -155,7 +146,7 @@ MemoryBuffer *MemoryBuffer::getNewUninitMemBuffer(size_t Size, /// the MemoryBuffer object. MemoryBuffer *MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) { MemoryBuffer *SB = getNewUninitMemBuffer(Size, BufferName); - if (!SB) return 0; + if (!SB) return nullptr; memset(const_cast(SB->getBufferStart()), 0, Size); return SB; } @@ -173,15 +164,6 @@ error_code MemoryBuffer::getFileOrSTDIN(StringRef Filename, return getFile(Filename, Result, FileSize); } -error_code MemoryBuffer::getFileOrSTDIN(StringRef Filename, - OwningPtr &Result, - int64_t FileSize) { - std::unique_ptr MB; - error_code ec = getFileOrSTDIN(Filename, MB, FileSize); - Result = std::move(MB); - return ec; -} - //===----------------------------------------------------------------------===// // MemoryBuffer::getFile implementation. @@ -252,44 +234,38 @@ static error_code getMemoryBufferForStream(int FD, static error_code getFileAux(const char *Filename, std::unique_ptr &Result, int64_t FileSize, - bool RequiresNullTerminator); + bool RequiresNullTerminator, + bool IsVolatileSize); error_code MemoryBuffer::getFile(Twine Filename, std::unique_ptr &Result, int64_t FileSize, - bool RequiresNullTerminator) { + bool RequiresNullTerminator, + bool IsVolatileSize) { // Ensure the path is null terminated. 
SmallString<256> PathBuf; StringRef NullTerminatedName = Filename.toNullTerminatedStringRef(PathBuf); return getFileAux(NullTerminatedName.data(), Result, FileSize, - RequiresNullTerminator); -} - -error_code MemoryBuffer::getFile(Twine Filename, - OwningPtr &Result, - int64_t FileSize, - bool RequiresNullTerminator) { - std::unique_ptr MB; - error_code ec = getFile(Filename, MB, FileSize, RequiresNullTerminator); - Result = std::move(MB); - return ec; + RequiresNullTerminator, IsVolatileSize); } static error_code getOpenFileImpl(int FD, const char *Filename, std::unique_ptr &Result, uint64_t FileSize, uint64_t MapSize, - int64_t Offset, bool RequiresNullTerminator); + int64_t Offset, bool RequiresNullTerminator, + bool IsVolatileSize); static error_code getFileAux(const char *Filename, std::unique_ptr &Result, int64_t FileSize, - bool RequiresNullTerminator) { + bool RequiresNullTerminator, + bool IsVolatileSize) { int FD; error_code EC = sys::fs::openFileForRead(Filename, FD); if (EC) return EC; error_code ret = getOpenFileImpl(FD, Filename, Result, FileSize, FileSize, 0, - RequiresNullTerminator); + RequiresNullTerminator, IsVolatileSize); close(FD); return ret; } @@ -299,7 +275,14 @@ static bool shouldUseMmap(int FD, size_t MapSize, off_t Offset, bool RequiresNullTerminator, - int PageSize) { + int PageSize, + bool IsVolatileSize) { + // mmap may leave the buffer without null terminator if the file size changed + // by the time the last page is mapped in, so avoid it if the file size is + // likely to change. + if (IsVolatileSize) + return false; + // We don't use mmap for small files because this can severely fragment our // address space. if (MapSize < 4 * 4096 || MapSize < (unsigned)PageSize) @@ -315,9 +298,8 @@ static bool shouldUseMmap(int FD, // RequiresNullTerminator = false and MapSize != -1. if (FileSize == size_t(-1)) { sys::fs::file_status Status; - error_code EC = sys::fs::status(FD, Status); - if (EC) - return EC; + if (sys::fs::status(FD, Status)) + return false; FileSize = Status.getSize(); } @@ -328,15 +310,6 @@ static bool shouldUseMmap(int FD, if (End != FileSize) return false; -#if defined(_WIN32) || defined(__CYGWIN__) - // Don't peek the next page if file is multiple of *physical* pagesize(4k) - // but is not multiple of AllocationGranularity(64k), - // when a null terminator is required. - // FIXME: It's not good to hardcode 4096 here. dwPageSize shows 4096. - if ((FileSize & (4096 - 1)) == 0) - return false; -#endif - // Don't try to map files that are exactly a multiple of the system page size // if we need a null terminator. if ((FileSize & (PageSize -1)) == 0) @@ -348,7 +321,8 @@ static bool shouldUseMmap(int FD, static error_code getOpenFileImpl(int FD, const char *Filename, std::unique_ptr &Result, uint64_t FileSize, uint64_t MapSize, - int64_t Offset, bool RequiresNullTerminator) { + int64_t Offset, bool RequiresNullTerminator, + bool IsVolatileSize) { static int PageSize = sys::process::get_self()->page_size(); // Default is to map the full file. 
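
The IsVolatileSize flag threaded through getFile/getOpenFile above exists because
mmap trusts the size observed at open time; if the file shrinks before the last
page is faulted in, the promised null terminator can vanish. Callers reading files
that mutate underneath them opt out of mmap; a sketch under those signatures:

    std::unique_ptr<MemoryBuffer> MB;
    // /proc pseudo-files report unstable sizes, so force the read() path.
    error_code EC = MemoryBuffer::getFile("/proc/cpuinfo", MB,
                                          /*FileSize=*/-1,
                                          /*RequiresNullTerminator=*/true,
                                          /*IsVolatileSize=*/true);
    if (EC)
      report_fatal_error("cannot read cpuinfo: " + EC.message());
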
@@ -375,7 +349,7 @@ static error_code getOpenFileImpl(int FD, const char *Filename, } if (shouldUseMmap(FD, FileSize, MapSize, Offset, RequiresNullTerminator, - PageSize)) { + PageSize, IsVolatileSize)) { error_code EC; Result.reset(new (NamedBufferAlloc(Filename)) MemoryBufferMMapFile( RequiresNullTerminator, FD, MapSize, Offset, EC)); @@ -412,9 +386,7 @@ static error_code getOpenFileImpl(int FD, const char *Filename, return error_code(errno, posix_category()); } if (NumRead == 0) { - assert(0 && "We got inaccurate FileSize value or fstat reported an " - "invalid file size."); - *BufPtr = '\0'; // null-terminate at the actual size. + memset(BufPtr, 0, BytesLeft); // zero-initialize rest of the buffer. break; } BytesLeft -= NumRead; @@ -428,35 +400,18 @@ static error_code getOpenFileImpl(int FD, const char *Filename, error_code MemoryBuffer::getOpenFile(int FD, const char *Filename, std::unique_ptr &Result, uint64_t FileSize, - bool RequiresNullTerminator) { + bool RequiresNullTerminator, + bool IsVolatileSize) { return getOpenFileImpl(FD, Filename, Result, FileSize, FileSize, 0, - RequiresNullTerminator); -} - -error_code MemoryBuffer::getOpenFile(int FD, const char *Filename, - OwningPtr &Result, - uint64_t FileSize, - bool RequiresNullTerminator) { - std::unique_ptr MB; - error_code ec = getOpenFileImpl(FD, Filename, MB, FileSize, FileSize, 0, - RequiresNullTerminator); - Result = std::move(MB); - return ec; + RequiresNullTerminator, IsVolatileSize); } error_code MemoryBuffer::getOpenFileSlice(int FD, const char *Filename, std::unique_ptr &Result, - uint64_t MapSize, int64_t Offset) { - return getOpenFileImpl(FD, Filename, Result, -1, MapSize, Offset, false); -} - -error_code MemoryBuffer::getOpenFileSlice(int FD, const char *Filename, - OwningPtr &Result, - uint64_t MapSize, int64_t Offset) { - std::unique_ptr MB; - error_code ec = getOpenFileImpl(FD, Filename, MB, -1, MapSize, Offset, false); - Result = std::move(MB); - return ec; + uint64_t MapSize, int64_t Offset, + bool IsVolatileSize) { + return getOpenFileImpl(FD, Filename, Result, -1, MapSize, Offset, false, + IsVolatileSize); } //===----------------------------------------------------------------------===// @@ -472,10 +427,3 @@ error_code MemoryBuffer::getSTDIN(std::unique_ptr &Result) { return getMemoryBufferForStream(0, "", Result); } - -error_code MemoryBuffer::getSTDIN(OwningPtr &Result) { - std::unique_ptr MB; - error_code ec = getSTDIN(MB); - Result = std::move(MB); - return ec; -} diff --git a/lib/Support/Mutex.cpp b/lib/Support/Mutex.cpp index 37c9d73..c8d3844 100644 --- a/lib/Support/Mutex.cpp +++ b/lib/Support/Mutex.cpp @@ -42,7 +42,7 @@ using namespace sys; // Construct a Mutex using pthread calls MutexImpl::MutexImpl( bool recursive) - : data_(0) + : data_(nullptr) { // Declare the pthread_mutex data structures pthread_mutex_t* mutex = @@ -75,7 +75,7 @@ MutexImpl::MutexImpl( bool recursive) MutexImpl::~MutexImpl() { pthread_mutex_t* mutex = static_cast(data_); - assert(mutex != 0); + assert(mutex != nullptr); pthread_mutex_destroy(mutex); free(mutex); } @@ -84,7 +84,7 @@ bool MutexImpl::acquire() { pthread_mutex_t* mutex = static_cast(data_); - assert(mutex != 0); + assert(mutex != nullptr); int errorcode = pthread_mutex_lock(mutex); return errorcode == 0; @@ -94,7 +94,7 @@ bool MutexImpl::release() { pthread_mutex_t* mutex = static_cast(data_); - assert(mutex != 0); + assert(mutex != nullptr); int errorcode = pthread_mutex_unlock(mutex); return errorcode == 0; @@ -104,7 +104,7 @@ bool MutexImpl::tryacquire() { 
pthread_mutex_t* mutex = static_cast(data_); - assert(mutex != 0); + assert(mutex != nullptr); int errorcode = pthread_mutex_trylock(mutex); return errorcode == 0; diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index 5b73631..b8d676f 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -569,6 +569,12 @@ bool is_separator(char value) { } } +static const char preferred_separator_string[] = { preferred_separator, '\0' }; + +const StringRef get_separator() { + return preferred_separator_string; +} + void system_temp_directory(bool erasedOnReboot, SmallVectorImpl &result) { result.clear(); @@ -577,7 +583,7 @@ void system_temp_directory(bool erasedOnReboot, SmallVectorImpl &result) { // macros defined in on darwin >= 9 int ConfName = erasedOnReboot? _CS_DARWIN_USER_TEMP_DIR : _CS_DARWIN_USER_CACHE_DIR; - size_t ConfLen = confstr(ConfName, 0, 0); + size_t ConfLen = confstr(ConfName, nullptr, 0); if (ConfLen > 0) { do { result.resize(ConfLen); diff --git a/lib/Support/PrettyStackTrace.cpp b/lib/Support/PrettyStackTrace.cpp index d4e205c..987778a 100644 --- a/lib/Support/PrettyStackTrace.cpp +++ b/lib/Support/PrettyStackTrace.cpp @@ -46,7 +46,7 @@ static unsigned PrintStack(const PrettyStackTraceEntry *Entry, raw_ostream &OS){ /// PrintCurStackTrace - Print the current stack trace to the specified stream. static void PrintCurStackTrace(raw_ostream &OS) { // Don't print an empty trace. - if (PrettyStackTraceHead->get() == 0) return; + if (!PrettyStackTraceHead->get()) return; // If there are pretty stack frames registered, walk and emit them. OS << "Stack dump:\n"; @@ -136,7 +136,7 @@ void PrettyStackTraceProgram::print(raw_ostream &OS) const { } static bool RegisterCrashPrinter() { - sys::AddSignalHandler(CrashHandler, 0); + sys::AddSignalHandler(CrashHandler, nullptr); return false; } diff --git a/lib/Support/RWMutex.cpp b/lib/Support/RWMutex.cpp index 6a34f2d..3b6309c 100644 --- a/lib/Support/RWMutex.cpp +++ b/lib/Support/RWMutex.cpp @@ -44,7 +44,7 @@ using namespace sys; // Construct a RWMutex using pthread calls RWMutexImpl::RWMutexImpl() - : data_(0) + : data_(nullptr) { // Declare the pthread_rwlock data structures pthread_rwlock_t* rwlock = @@ -56,7 +56,7 @@ RWMutexImpl::RWMutexImpl() #endif // Initialize the rwlock - int errorcode = pthread_rwlock_init(rwlock, NULL); + int errorcode = pthread_rwlock_init(rwlock, nullptr); (void)errorcode; assert(errorcode == 0); @@ -68,7 +68,7 @@ RWMutexImpl::RWMutexImpl() RWMutexImpl::~RWMutexImpl() { pthread_rwlock_t* rwlock = static_cast(data_); - assert(rwlock != 0); + assert(rwlock != nullptr); pthread_rwlock_destroy(rwlock); free(rwlock); } @@ -77,7 +77,7 @@ bool RWMutexImpl::reader_acquire() { pthread_rwlock_t* rwlock = static_cast(data_); - assert(rwlock != 0); + assert(rwlock != nullptr); int errorcode = pthread_rwlock_rdlock(rwlock); return errorcode == 0; @@ -87,7 +87,7 @@ bool RWMutexImpl::reader_release() { pthread_rwlock_t* rwlock = static_cast(data_); - assert(rwlock != 0); + assert(rwlock != nullptr); int errorcode = pthread_rwlock_unlock(rwlock); return errorcode == 0; @@ -97,7 +97,7 @@ bool RWMutexImpl::writer_acquire() { pthread_rwlock_t* rwlock = static_cast(data_); - assert(rwlock != 0); + assert(rwlock != nullptr); int errorcode = pthread_rwlock_wrlock(rwlock); return errorcode == 0; @@ -107,7 +107,7 @@ bool RWMutexImpl::writer_release() { pthread_rwlock_t* rwlock = static_cast(data_); - assert(rwlock != 0); + assert(rwlock != nullptr); int errorcode = pthread_rwlock_unlock(rwlock); return errorcode == 0; diff 
--git a/lib/Support/Regex.cpp b/lib/Support/Regex.cpp index 1115534..f7fe1e4 100644 --- a/lib/Support/Regex.cpp +++ b/lib/Support/Regex.cpp @@ -43,7 +43,7 @@ bool Regex::isValid(std::string &Error) { if (!error) return true; - size_t len = llvm_regerror(error, preg, NULL, 0); + size_t len = llvm_regerror(error, preg, nullptr, 0); Error.resize(len - 1); llvm_regerror(error, preg, &Error[0], len); diff --git a/lib/Support/SearchForAddressOfSpecialSymbol.cpp b/lib/Support/SearchForAddressOfSpecialSymbol.cpp index 2d23902..55f3320 100644 --- a/lib/Support/SearchForAddressOfSpecialSymbol.cpp +++ b/lib/Support/SearchForAddressOfSpecialSymbol.cpp @@ -48,7 +48,7 @@ static void *DoSearch(const char* symbolName) { #endif #undef EXPLICIT_SYMBOL - return 0; + return nullptr; } namespace llvm { diff --git a/lib/Support/SmallPtrSet.cpp b/lib/Support/SmallPtrSet.cpp index 844e416..a80e095 100644 --- a/lib/Support/SmallPtrSet.cpp +++ b/lib/Support/SmallPtrSet.cpp @@ -103,7 +103,7 @@ const void * const *SmallPtrSetImplBase::FindBucketFor(const void *Ptr) const { unsigned ArraySize = CurArraySize; unsigned ProbeAmt = 1; const void *const *Array = CurArray; - const void *const *Tombstone = 0; + const void *const *Tombstone = nullptr; while (1) { // Found Ptr's bucket? if (Array[Bucket] == Ptr) diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp index 4bfd96a..acd75fb 100644 --- a/lib/Support/SourceMgr.cpp +++ b/lib/Support/SourceMgr.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Support/Locale.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/system_error.h" using namespace llvm; @@ -60,7 +61,7 @@ size_t SourceMgr::AddIncludeFile(const std::string &Filename, // If the file didn't exist directly, see if it's in an include path. for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBuf; ++i) { - IncludedFile = IncludeDirectories[i] + "/" + Filename; + IncludedFile = IncludeDirectories[i] + sys::path::get_separator().data() + Filename; MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf); } @@ -114,7 +115,7 @@ SourceMgr::getLineAndColumn(SMLoc Loc, int BufferID) const { if (*Ptr == '\n') ++LineNo; // Allocate the line number cache if it doesn't exist. - if (LineNoCache == 0) + if (!LineNoCache) LineNoCache = new LineNoCacheTy(); // Update the line # cache. @@ -228,7 +229,7 @@ void SourceMgr::PrintMessage(raw_ostream &OS, SMLoc Loc, PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS); } - Diagnostic.print(0, OS, ShowColors); + Diagnostic.print(nullptr, OS, ShowColors); } void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind, diff --git a/lib/Support/StringMap.cpp b/lib/Support/StringMap.cpp index 9ac1f86..72a6d82 100644 --- a/lib/Support/StringMap.cpp +++ b/lib/Support/StringMap.cpp @@ -27,7 +27,7 @@ StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize) { } // Otherwise, initialize it with zero buckets to avoid the allocation. - TheTable = 0; + TheTable = nullptr; NumBuckets = 0; NumItems = 0; NumTombstones = 0; @@ -70,7 +70,7 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name) { while (1) { StringMapEntryBase *BucketItem = TheTable[BucketNo]; // If we found an empty bucket, this key isn't in the table yet, return it. - if (LLVM_LIKELY(BucketItem == 0)) { + if (LLVM_LIKELY(!BucketItem)) { // If we found a tombstone, we want to reuse the tombstone instead of an // empty bucket. This reduces probing. 
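
sys::path::get_separator(), added in the Path.cpp hunk above, gives SourceMgr a
platform-correct joiner instead of a hard-coded "/"; it returns a StringRef over a
static two-character array, so the call is allocation-free. The join, sketched
(IncludeDir and Filename are placeholders):

    SmallString<128> IncludedFile(IncludeDir);
    IncludedFile += sys::path::get_separator();
    IncludedFile += Filename;
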
if (FirstTombstone != -1) { @@ -124,7 +124,7 @@ int StringMapImpl::FindKey(StringRef Key) const { while (1) { StringMapEntryBase *BucketItem = TheTable[BucketNo]; // If we found an empty bucket, this key isn't in the table yet, return. - if (LLVM_LIKELY(BucketItem == 0)) + if (LLVM_LIKELY(!BucketItem)) return -1; if (BucketItem == getTombstoneVal()) { @@ -166,7 +166,7 @@ void StringMapImpl::RemoveKey(StringMapEntryBase *V) { /// table, returning it. If the key is not in the table, this returns null. StringMapEntryBase *StringMapImpl::RemoveKey(StringRef Key) { int Bucket = FindKey(Key); - if (Bucket == -1) return 0; + if (Bucket == -1) return nullptr; StringMapEntryBase *Result = TheTable[Bucket]; TheTable[Bucket] = getTombstoneVal(); @@ -212,7 +212,7 @@ void StringMapImpl::RehashTable() { // Fast case, bucket available. unsigned FullHash = HashTable[I]; unsigned NewBucket = FullHash & (NewSize-1); - if (NewTableArray[NewBucket] == 0) { + if (!NewTableArray[NewBucket]) { NewTableArray[FullHash & (NewSize-1)] = Bucket; NewHashArray[FullHash & (NewSize-1)] = FullHash; continue; diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp index bd2a37b..cde8258 100644 --- a/lib/Support/StringRef.cpp +++ b/lib/Support/StringRef.cpp @@ -281,7 +281,7 @@ void StringRef::split(SmallVectorImpl &A, // rest.data() is used to distinguish cases like "a," that splits into // "a" + "" and "a" that splits into "a" + 0. for (int splits = 0; - rest.data() != NULL && (MaxSplit < 0 || splits < MaxSplit); + rest.data() != nullptr && (MaxSplit < 0 || splits < MaxSplit); ++splits) { std::pair p = rest.split(Separators); @@ -290,7 +290,7 @@ void StringRef::split(SmallVectorImpl &A, rest = p.second; } // If we have a tail left, add it. - if (rest.data() != NULL && (rest.size() != 0 || KeepEmpty)) + if (rest.data() != nullptr && (rest.size() != 0 || KeepEmpty)) A.push_back(rest); } diff --git a/lib/Support/TargetRegistry.cpp b/lib/Support/TargetRegistry.cpp index 8d91a53..a008831 100644 --- a/lib/Support/TargetRegistry.cpp +++ b/lib/Support/TargetRegistry.cpp @@ -17,7 +17,7 @@ using namespace llvm; // Clients are responsible for avoid race conditions in registration. -static Target *FirstTarget = 0; +static Target *FirstTarget = nullptr; TargetRegistry::iterator TargetRegistry::begin() { return iterator(FirstTarget); @@ -29,7 +29,7 @@ const Target *TargetRegistry::lookupTarget(const std::string &ArchName, // Allocate target machine. First, check whether the user has explicitly // specified an architecture to compile for. If so we have to look it up by // name, because it might be a backend that has no mapping to a target triple. - const Target *TheTarget = 0; + const Target *TheTarget = nullptr; if (!ArchName.empty()) { for (TargetRegistry::iterator it = TargetRegistry::begin(), ie = TargetRegistry::end(); it != ie; ++it) { @@ -41,7 +41,7 @@ const Target *TargetRegistry::lookupTarget(const std::string &ArchName, if (!TheTarget) { Error = "error: invalid target '" + ArchName + "'.\n"; - return 0; + return nullptr; } // Adjust the triple to match (if known), otherwise stick with the @@ -53,11 +53,11 @@ const Target *TargetRegistry::lookupTarget(const std::string &ArchName, // Get the target specific parser. 
std::string TempError; TheTarget = TargetRegistry::lookupTarget(TheTriple.getTriple(), TempError); - if (TheTarget == 0) { + if (!TheTarget) { Error = ": error: unable to get target for '" + TheTriple.getTriple() + "', see --version and --triple.\n"; - return 0; + return nullptr; } } @@ -69,16 +69,16 @@ const Target *TargetRegistry::lookupTarget(const std::string &TT, // Provide special warning when no targets are initialized. if (begin() == end()) { Error = "Unable to find target for this triple (no targets are registered)"; - return 0; + return nullptr; } - const Target *Matching = 0; + const Target *Matching = nullptr; Triple::ArchType Arch = Triple(TT).getArch(); for (iterator it = begin(), ie = end(); it != ie; ++it) { if (it->ArchMatchFn(Arch)) { if (Matching) { Error = std::string("Cannot choose between targets \"") + Matching->Name + "\" and \"" + it->Name + "\""; - return 0; + return nullptr; } Matching = &*it; } @@ -87,7 +87,7 @@ const Target *TargetRegistry::lookupTarget(const std::string &TT, if (!Matching) { Error = "No available targets are compatible with this triple, " "see -version for the available targets."; - return 0; + return nullptr; } return Matching; @@ -121,7 +121,7 @@ const Target *TargetRegistry::getClosestTargetForJIT(std::string &Error) { if (TheTarget && !TheTarget->hasJIT()) { Error = "No JIT compatible target available for this host"; - return 0; + return nullptr; } return TheTarget; diff --git a/lib/Support/ThreadLocal.cpp b/lib/Support/ThreadLocal.cpp index aebbcad..2dec9eb 100644 --- a/lib/Support/ThreadLocal.cpp +++ b/lib/Support/ThreadLocal.cpp @@ -53,7 +53,7 @@ using namespace sys; ThreadLocalImpl::ThreadLocalImpl() : data() { static_assert(sizeof(pthread_key_t) <= sizeof(data), "size too big"); pthread_key_t* key = reinterpret_cast(&data); - int errorcode = pthread_key_create(key, NULL); + int errorcode = pthread_key_create(key, nullptr); assert(errorcode == 0); (void) errorcode; } @@ -78,7 +78,7 @@ const void* ThreadLocalImpl::getInstance() { } void ThreadLocalImpl::removeInstance() { - setInstance(0); + setInstance(nullptr); } } diff --git a/lib/Support/Threading.cpp b/lib/Support/Threading.cpp index 9d7ac6c..1acfa79 100644 --- a/lib/Support/Threading.cpp +++ b/lib/Support/Threading.cpp @@ -21,7 +21,7 @@ using namespace llvm; static bool multithreaded_mode = false; -static sys::Mutex* global_lock = 0; +static sys::Mutex* global_lock = nullptr; bool llvm::llvm_start_multithreaded() { #if LLVM_ENABLE_THREADS != 0 @@ -73,7 +73,7 @@ struct ThreadInfo { static void *ExecuteOnThread_Dispatch(void *Arg) { ThreadInfo *TI = reinterpret_cast(Arg); TI->UserFn(TI->UserData); - return 0; + return nullptr; } void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData, @@ -97,7 +97,7 @@ void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData, goto error; // Wait for the thread and clean up. 
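
TargetRegistry::lookupTarget keeps its two-channel contract in the hunks above:
nullptr on failure plus a human-readable Error string. The canonical caller shape,
sketched (TripleStr is a placeholder):

    std::string Error;
    const Target *T = TargetRegistry::lookupTarget(TripleStr, Error);
    if (!T) {
      errs() << Error << '\n';
      return 1;
    }
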
- ::pthread_join(Thread, 0); + ::pthread_join(Thread, nullptr); error: ::pthread_attr_destroy(&Attr); diff --git a/lib/Support/Timer.cpp b/lib/Support/Timer.cpp index 7cf4d37..61465ae 100644 --- a/lib/Support/Timer.cpp +++ b/lib/Support/Timer.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Mutex.h" @@ -77,7 +78,7 @@ raw_ostream *llvm::CreateInfoOutputFile() { } -static TimerGroup *DefaultTimerGroup = 0; +static TimerGroup *DefaultTimerGroup = nullptr; static TimerGroup *getDefaultTimerGroup() { TimerGroup *tmp = DefaultTimerGroup; sys::MemoryFence(); @@ -100,7 +101,7 @@ static TimerGroup *getDefaultTimerGroup() { //===----------------------------------------------------------------------===// void Timer::init(StringRef N) { - assert(TG == 0 && "Timer already initialized"); + assert(!TG && "Timer already initialized"); Name.assign(N.begin(), N.end()); Started = false; TG = getDefaultTimerGroup(); @@ -108,7 +109,7 @@ void Timer::init(StringRef N) { } void Timer::init(StringRef N, TimerGroup &tg) { - assert(TG == 0 && "Timer already initialized"); + assert(!TG && "Timer already initialized"); Name.assign(N.begin(), N.end()); Started = false; TG = &tg; @@ -235,11 +236,11 @@ static Timer &getNamedRegionTimer(StringRef Name) { NamedRegionTimer::NamedRegionTimer(StringRef Name, bool Enabled) - : TimeRegion(!Enabled ? 0 : &getNamedRegionTimer(Name)) {} + : TimeRegion(!Enabled ? nullptr : &getNamedRegionTimer(Name)) {} NamedRegionTimer::NamedRegionTimer(StringRef Name, StringRef GroupName, bool Enabled) - : TimeRegion(!Enabled ? 0 : &NamedGroupedTimers->get(Name, GroupName)) {} + : TimeRegion(!Enabled ? nullptr : &NamedGroupedTimers->get(Name, GroupName)){} //===----------------------------------------------------------------------===// // TimerGroup Implementation @@ -247,10 +248,10 @@ NamedRegionTimer::NamedRegionTimer(StringRef Name, StringRef GroupName, /// TimerGroupList - This is the global list of TimerGroups, maintained by the /// TimerGroup ctor/dtor and is protected by the TimerLock lock. -static TimerGroup *TimerGroupList = 0; +static TimerGroup *TimerGroupList = nullptr; TimerGroup::TimerGroup(StringRef name) - : Name(name.begin(), name.end()), FirstTimer(0) { + : Name(name.begin(), name.end()), FirstTimer(nullptr) { // Add the group to TimerGroupList. sys::SmartScopedLock L(*TimerLock); @@ -264,7 +265,7 @@ TimerGroup::TimerGroup(StringRef name) TimerGroup::~TimerGroup() { // If the timer group is destroyed before the timers it owns, accumulate and // print the timing data. - while (FirstTimer != 0) + while (FirstTimer) removeTimer(*FirstTimer); // Remove the group from the TimerGroupList. @@ -282,7 +283,7 @@ void TimerGroup::removeTimer(Timer &T) { if (T.Started) TimersToPrint.push_back(std::make_pair(T.Time, T.Name)); - T.TG = 0; + T.TG = nullptr; // Unlink the timer from our list. *T.Prev = T.Next; @@ -291,7 +292,7 @@ void TimerGroup::removeTimer(Timer &T) { // Print the report when all timers in this group are destroyed if some of // them were started. 
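// Hypothetical use of the Timer classes touched above; not part of this
// patch. A TimerGroup drains and reports its timers in its destructor
// (see ~TimerGroup() below), so the report prints when Group goes away.
#include "llvm/Support/Timer.h"
void timedWork() {
  llvm::TimerGroup Group("my passes");
  llvm::Timer T("my-region", Group); // init asserts !TG, as patched above
  T.startTimer();
  /* ... expensive work ... */
  T.stopTimer();
} // ~TimerGroup() accumulates and prints the timing data here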
- if (FirstTimer != 0 || TimersToPrint.empty()) + if (FirstTimer || TimersToPrint.empty()) return; raw_ostream *OutStream = CreateInfoOutputFile(); diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index 71abb9d..b3d48fb 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -24,6 +24,7 @@ const char *Triple::getArchTypeName(ArchType Kind) { case arm: return "arm"; case armeb: return "armeb"; case arm64: return "arm64"; + case arm64_be: return "arm64_be"; case hexagon: return "hexagon"; case mips: return "mips"; case mipsel: return "mipsel"; @@ -57,7 +58,7 @@ const char *Triple::getArchTypeName(ArchType Kind) { const char *Triple::getArchTypePrefix(ArchType Kind) { switch (Kind) { default: - return 0; + return nullptr; case aarch64: case aarch64_be: return "aarch64"; @@ -67,7 +68,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) { case thumb: case thumbeb: return "arm"; - case arm64: return "arm64"; + case arm64: + case arm64_be: return "arm64"; case ppc64: case ppc64le: @@ -178,6 +180,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("arm", arm) .Case("armeb", armeb) .Case("arm64", arm64) + .Case("arm64_be", arm64_be) .Case("mips", mips) .Case("mipsel", mipsel) .Case("mips64", mips64) @@ -210,7 +213,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { // Returns architecture name that is understood by the target assembler. const char *Triple::getArchNameForAssembler() { if (!isOSDarwin() && getVendor() != Triple::Apple) - return NULL; + return nullptr; return StringSwitch(getArchName()) .Case("i386", "i386") @@ -225,6 +228,7 @@ const char *Triple::getArchNameForAssembler() { .Cases("armv7", "thumbv7", "armv7") .Case("armeb", "armeb") .Case("arm64", "arm64") + .Case("arm64_be", "arm64") .Case("r600", "r600") .Case("nvptx", "nvptx") .Case("nvptx64", "nvptx64") @@ -232,7 +236,7 @@ const char *Triple::getArchNameForAssembler() { .Case("amdil", "amdil") .Case("spir", "spir") .Case("spir64", "spir64") - .Default(NULL); + .Default(nullptr); } static Triple::ArchType parseArch(StringRef ArchName) { @@ -257,6 +261,7 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("thumbeb", Triple::thumbeb) .StartsWith("thumbebv", Triple::thumbeb) .Case("arm64", Triple::arm64) + .Case("arm64_be", Triple::arm64_be) .Case("msp430", Triple::msp430) .Cases("mips", "mipseb", "mipsallegrex", Triple::mips) .Cases("mipsel", "mipsallegrexel", Triple::mipsel) @@ -797,6 +802,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { return 32; case llvm::Triple::arm64: + case llvm::Triple::arm64_be: case llvm::Triple::aarch64: case llvm::Triple::aarch64_be: case llvm::Triple::mips64: @@ -832,6 +838,7 @@ Triple Triple::get32BitArchVariant() const { case Triple::aarch64: case Triple::aarch64_be: case Triple::arm64: + case Triple::arm64_be: case Triple::msp430: case Triple::systemz: case Triple::ppc64le: @@ -899,6 +906,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::systemz: case Triple::x86_64: case Triple::arm64: + case Triple::arm64_be: // Already 64-bit. 
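// Quick probe of the new arm64_be enumerator; illustrative, not part of
// this patch. The triple parses via the .Case("arm64_be", ...) entries
// added above and reports 64-bit via getArchPointerBitWidth().
#include "llvm/ADT/Triple.h"
#include <cstdio>
int main() {
  llvm::Triple T("arm64_be-unknown-linux-gnu");
  std::printf("arch=%s is64=%d\n",
              llvm::Triple::getArchTypeName(T.getArch()), T.isArch64Bit());
  return 0; // arch=arm64_be is64=1
}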
break; diff --git a/lib/Support/Unix/Memory.inc b/lib/Support/Unix/Memory.inc index 08cd34d..23b49b7 100644 --- a/lib/Support/Unix/Memory.inc +++ b/lib/Support/Unix/Memory.inc @@ -121,7 +121,7 @@ Memory::allocateMappedMemory(size_t NumBytes, Protect, MMFlags, fd, 0); if (Addr == MAP_FAILED) { if (NearBlock) //Try again without a near hint - return allocateMappedMemory(NumBytes, 0, PFlags, EC); + return allocateMappedMemory(NumBytes, nullptr, PFlags, EC); EC = error_code(errno, system_category()); return MemoryBlock(); @@ -139,13 +139,13 @@ Memory::allocateMappedMemory(size_t NumBytes, error_code Memory::releaseMappedMemory(MemoryBlock &M) { - if (M.Address == 0 || M.Size == 0) + if (M.Address == nullptr || M.Size == 0) return error_code::success(); if (0 != ::munmap(M.Address, M.Size)) return error_code(errno, system_category()); - M.Address = 0; + M.Address = nullptr; M.Size = 0; return error_code::success(); @@ -153,7 +153,7 @@ Memory::releaseMappedMemory(MemoryBlock &M) { error_code Memory::protectMappedMemory(const MemoryBlock &M, unsigned Flags) { - if (M.Address == 0 || M.Size == 0) + if (M.Address == nullptr || M.Size == 0) return error_code::success(); if (!Flags) @@ -203,7 +203,7 @@ Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock, ; void* start = NearBlock ? (unsigned char*)NearBlock->base() + - NearBlock->size() : 0; + NearBlock->size() : nullptr; #if defined(__APPLE__) && (defined(__arm__) || defined(__arm64__)) void *pa = ::mmap(start, PageSize*NumPages, PROT_READ|PROT_EXEC, @@ -214,7 +214,7 @@ Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock, #endif if (pa == MAP_FAILED) { if (NearBlock) //Try again without a near hint - return AllocateRWX(NumBytes, 0); + return AllocateRWX(NumBytes, nullptr); MakeErrMsg(ErrMsg, "Can't allocate RWX Memory"); return MemoryBlock(); @@ -246,7 +246,7 @@ Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock, } bool Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) { - if (M.Address == 0 || M.Size == 0) return false; + if (M.Address == nullptr || M.Size == 0) return false; if (0 != ::munmap(M.Address, M.Size)) return MakeErrMsg(ErrMsg, "Can't release RWX Memory"); return false; diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index 1c91053..519a016 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -89,7 +89,7 @@ namespace { static error_code TempDir(SmallVectorImpl &result) { // FIXME: Don't use TMPDIR if program is SUID or SGID enabled. - const char *dir = 0; + const char *dir = nullptr; (dir = std::getenv("TMPDIR")) || (dir = std::getenv("TMP")) || (dir = std::getenv("TEMP")) || (dir = std::getenv("TEMPDIR")) || #ifdef P_tmpdir @@ -246,7 +246,7 @@ error_code current_path(SmallVectorImpl &result) { #endif while (true) { - if (::getcwd(result.data(), result.capacity()) == 0) { + if (::getcwd(result.data(), result.capacity()) == nullptr) { // See if there was a real error. 
if (errno != errc::not_enough_memory) return error_code(errno, system_category()); @@ -494,7 +494,7 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) { #ifdef MAP_FILE flags |= MAP_FILE; #endif - Mapping = ::mmap(0, Size, prot, flags, FD, Offset); + Mapping = ::mmap(nullptr, Size, prot, flags, FD, Offset); if (Mapping == MAP_FAILED) return error_code(errno, system_category()); return error_code::success(); @@ -525,7 +525,7 @@ mapped_file_region::mapped_file_region(const Twine &path, ec = init(ofd, true, offset); if (ec) - Mapping = 0; + Mapping = nullptr; } mapped_file_region::mapped_file_region(int fd, @@ -545,7 +545,7 @@ mapped_file_region::mapped_file_region(int fd, ec = init(fd, closefd, offset); if (ec) - Mapping = 0; + Mapping = nullptr; } mapped_file_region::~mapped_file_region() { @@ -555,7 +555,7 @@ mapped_file_region::~mapped_file_region() { mapped_file_region::mapped_file_region(mapped_file_region &&other) : Mode(other.Mode), Size(other.Size), Mapping(other.Mapping) { - other.Mapping = 0; + other.Mapping = nullptr; } mapped_file_region::mapmode mapped_file_region::flags() const { @@ -587,7 +587,7 @@ error_code detail::directory_iterator_construct(detail::DirIterState &it, StringRef path){ SmallString<128> path_null(path); DIR *directory = ::opendir(path_null.c_str()); - if (directory == 0) + if (!directory) return error_code(errno, system_category()); it.IterationHandle = reinterpret_cast(directory); @@ -608,9 +608,9 @@ error_code detail::directory_iterator_destruct(detail::DirIterState &it) { error_code detail::directory_iterator_increment(detail::DirIterState &it) { errno = 0; dirent *cur_dir = ::readdir(reinterpret_cast(it.IterationHandle)); - if (cur_dir == 0 && errno != 0) { + if (cur_dir == nullptr && errno != 0) { return error_code(errno, system_category()); - } else if (cur_dir != 0) { + } else if (cur_dir != nullptr) { StringRef name(cur_dir->d_name, NAMLEN(cur_dir)); if ((name.size() == 1 && name[0] == '.') || (name.size() == 2 && name[0] == '.' && name[1] == '.')) @@ -630,7 +630,7 @@ error_code get_magic(const Twine &path, uint32_t len, // Open path. std::FILE *file = std::fopen(Path.data(), "rb"); - if (file == 0) + if (!file) return error_code(errno, system_category()); // Reserve storage. @@ -667,7 +667,7 @@ error_code map_file_pages(const Twine &path, off_t file_offset, size_t size, #ifdef MAP_FILE flags |= MAP_FILE; #endif - result = ::mmap(0, size, prot, flags, fd, file_offset); + result = ::mmap(nullptr, size, prot, flags, fd, file_offset); if (result == MAP_FAILED) { return error_code(errno, system_category()); } diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc index 9fb4356..8faa638 100644 --- a/lib/Support/Unix/Process.inc +++ b/lib/Support/Unix/Process.inc @@ -270,7 +270,7 @@ static bool terminalHasColors(int fd) { MutexGuard G(M); int errret = 0; - if (setupterm((char *)0, fd, &errret) != 0) + if (setupterm((char *)nullptr, fd, &errret) != 0) // Regardless of why, if we can't get terminfo, we shouldn't try to print // colors. return false; @@ -292,7 +292,7 @@ static bool terminalHasColors(int fd) { // Now extract the structure allocated by setupterm and free its memory // through a really silly dance. - struct term *termp = set_curterm((struct term *)0); + struct term *termp = set_curterm((struct term *)nullptr); (void)del_curterm(termp); // Drop any errors here. // Return true if we found a color capabilities for the current terminal. 
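The Memory.inc changes above keep the allocate/release contract intact: releaseMappedMemory() resets Address to nullptr, so a stale MemoryBlock cannot be released twice silently. A minimal caller, sketched against the 3.5-era error_code API (not part of this patch):

    #include "llvm/Support/Memory.h"
    #include "llvm/Support/system_error.h"
    #include <cstdio>

    int main() {
      llvm::error_code EC;
      llvm::sys::MemoryBlock MB = llvm::sys::Memory::allocateMappedMemory(
          4096, /*NearBlock=*/nullptr,
          llvm::sys::Memory::MF_READ | llvm::sys::Memory::MF_WRITE, EC);
      if (EC) {
        std::fprintf(stderr, "%s\n", EC.message().c_str());
        return 1;
      }
      llvm::sys::Memory::releaseMappedMemory(MB); // Address becomes nullptr
      return 0;
    }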
diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc index b4df928..1225a9c 100644 --- a/lib/Support/Unix/Program.inc +++ b/lib/Support/Unix/Program.inc @@ -70,7 +70,7 @@ sys::FindProgramByName(const std::string& progName) { // Get the path. If its empty, we can't do anything to find it. const char *PathStr = getenv("PATH"); - if (PathStr == 0) + if (!PathStr) return ""; // Now we have a colon separated list of directories to search; try them. @@ -99,7 +99,7 @@ sys::FindProgramByName(const std::string& progName) { } static bool RedirectIO(const StringRef *Path, int FD, std::string* ErrMsg) { - if (Path == 0) // Noop + if (!Path) // Noop return false; std::string File; if (Path->empty()) @@ -129,7 +129,7 @@ static bool RedirectIO(const StringRef *Path, int FD, std::string* ErrMsg) { #ifdef HAVE_POSIX_SPAWN static bool RedirectIO_PS(const std::string *Path, int FD, std::string *ErrMsg, posix_spawn_file_actions_t *FileActions) { - if (Path == 0) // Noop + if (!Path) // Noop return false; const char *File; if (Path->empty()) @@ -195,7 +195,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args, #ifdef HAVE_POSIX_SPAWN if (memoryLimit == 0) { posix_spawn_file_actions_t FileActionsStore; - posix_spawn_file_actions_t *FileActions = 0; + posix_spawn_file_actions_t *FileActions = nullptr; // If we call posix_spawn_file_actions_addopen we have to make sure the // c strings we pass to it stay alive until the call to posix_spawn, @@ -203,7 +203,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args, std::string RedirectsStorage[3]; if (redirects) { - std::string *RedirectsStr[3] = {0, 0, 0}; + std::string *RedirectsStr[3] = {nullptr, nullptr, nullptr}; for (int I = 0; I < 3; ++I) { if (redirects[I]) { RedirectsStorage[I] = *redirects[I]; @@ -218,7 +218,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args, if (RedirectIO_PS(RedirectsStr[0], 0, ErrMsg, FileActions) || RedirectIO_PS(RedirectsStr[1], 1, ErrMsg, FileActions)) return false; - if (redirects[1] == 0 || redirects[2] == 0 || + if (redirects[1] == nullptr || redirects[2] == nullptr || *redirects[1] != *redirects[2]) { // Just redirect stderr if (RedirectIO_PS(RedirectsStr[2], 2, ErrMsg, FileActions)) @@ -242,8 +242,9 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args, // Explicitly initialized to prevent what appears to be a valgrind false // positive. pid_t PID = 0; - int Err = posix_spawn(&PID, Program.str().c_str(), FileActions, /*attrp*/0, - const_cast(args), const_cast(envp)); + int Err = posix_spawn(&PID, Program.str().c_str(), FileActions, + /*attrp*/nullptr, const_cast(args), + const_cast(envp)); if (FileActions) posix_spawn_file_actions_destroy(FileActions); @@ -294,7 +295,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, const char **args, // Execute! std::string PathStr = Program; - if (envp != 0) + if (envp != nullptr) execve(PathStr.c_str(), const_cast(args), const_cast(envp)); @@ -360,7 +361,7 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait, // Turn off the alarm and restore the signal handler alarm(0); - sigaction(SIGALRM, &Old, 0); + sigaction(SIGALRM, &Old, nullptr); // Wait for child to die if (wait(&status) != ChildPid) @@ -381,7 +382,7 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait, // We exited normally without timeout, so turn off the timer. 
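// Stripped-down analogue of the posix_spawn() call above: nullptr file
// actions and attributes select the defaults, exactly as the patched
// /*attrp*/nullptr argument does. Illustrative sketch, not part of this patch.
#include <spawn.h>
#include <sys/wait.h>
#include <cstdio>
extern char **environ;
int main() {
  pid_t PID = 0;
  char *Argv[] = {const_cast<char *>("/bin/echo"),
                  const_cast<char *>("hello"), nullptr};
  if (posix_spawn(&PID, Argv[0], nullptr, nullptr, Argv, environ) != 0) {
    std::perror("posix_spawn");
    return 1;
  }
  int Status = 0;
  ::waitpid(PID, &Status, 0);
  return WIFEXITED(Status) ? WEXITSTATUS(Status) : 1;
}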
if (SecondsToWait && !WaitUntilTerminates) { alarm(0); - sigaction(SIGALRM, &Old, 0); + sigaction(SIGALRM, &Old, nullptr); } // Return the proper exit status. Detect error conditions diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index b4c78d6..1841fea 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -44,7 +44,7 @@ static RETSIGTYPE SignalHandler(int Sig); // defined below. static SmartMutex SignalsMutex; /// InterruptFunction - The function to call if ctrl-c is pressed. -static void (*InterruptFunction)() = 0; +static void (*InterruptFunction)() = nullptr; static std::vector FilesToRemove; static std::vector > CallBacksToRun; @@ -55,7 +55,7 @@ static std::vector > CallBacksToRun; static const int IntSigs[] = { SIGHUP, SIGINT, SIGPIPE, SIGTERM, SIGUSR1, SIGUSR2 }; -static const int *const IntSigsEnd = array_endof(IntSigs); +static const int *const IntSigsEnd = std::end(IntSigs); // KillSigs - Signals that represent that we have a bug, and our prompt // termination has been ordered. @@ -74,7 +74,7 @@ static const int KillSigs[] = { , SIGEMT #endif }; -static const int *const KillSigsEnd = array_endof(KillSigs); +static const int *const KillSigsEnd = std::end(KillSigs); static unsigned NumRegisteredSignals = 0; static struct { @@ -113,7 +113,7 @@ static void UnregisterHandlers() { // Restore all of the signal handlers to how they were before we showed up. for (unsigned i = 0, e = NumRegisteredSignals; i != e; ++i) sigaction(RegisteredSignalInfo[i].SigNo, - &RegisteredSignalInfo[i].SA, 0); + &RegisteredSignalInfo[i].SA, nullptr); NumRegisteredSignals = 0; } @@ -160,7 +160,7 @@ static RETSIGTYPE SignalHandler(int Sig) { // Unmask all potentially blocked kill signals. sigset_t SigMask; sigfillset(&SigMask); - sigprocmask(SIG_UNBLOCK, &SigMask, 0); + sigprocmask(SIG_UNBLOCK, &SigMask, nullptr); SignalsMutex.acquire(); RemoveFilesToRemove(); @@ -169,7 +169,7 @@ static RETSIGTYPE SignalHandler(int Sig) { if (InterruptFunction) { void (*IF)() = InterruptFunction; SignalsMutex.release(); - InterruptFunction = 0; + InterruptFunction = nullptr; IF(); // run the interrupt function. return; } @@ -212,7 +212,7 @@ void llvm::sys::SetInterruptFunction(void (*IF)()) { bool llvm::sys::RemoveFileOnSignal(StringRef Filename, std::string* ErrMsg) { SignalsMutex.acquire(); - std::string *OldPtr = FilesToRemove.empty() ? 0 : &FilesToRemove[0]; + std::string *OldPtr = FilesToRemove.empty() ? 
nullptr : &FilesToRemove[0]; FilesToRemove.push_back(Filename); // We want to call 'c_str()' on every std::string in this vector so that if @@ -279,8 +279,8 @@ void llvm::sys::PrintStackTrace(FILE *FD) { const char* name = strrchr(dlinfo.dli_fname, '/'); int nwidth; - if (name == NULL) nwidth = strlen(dlinfo.dli_fname); - else nwidth = strlen(name) - 1; + if (!name) nwidth = strlen(dlinfo.dli_fname); + else nwidth = strlen(name) - 1; if (nwidth > width) width = nwidth; } @@ -292,22 +292,22 @@ void llvm::sys::PrintStackTrace(FILE *FD) { fprintf(FD, "%-2d", i); const char* name = strrchr(dlinfo.dli_fname, '/'); - if (name == NULL) fprintf(FD, " %-*s", width, dlinfo.dli_fname); - else fprintf(FD, " %-*s", width, name+1); + if (!name) fprintf(FD, " %-*s", width, dlinfo.dli_fname); + else fprintf(FD, " %-*s", width, name+1); fprintf(FD, " %#0*lx", (int)(sizeof(void*) * 2) + 2, (unsigned long)StackTrace[i]); - if (dlinfo.dli_sname != NULL) { + if (dlinfo.dli_sname != nullptr) { fputc(' ', FD); # if HAVE_CXXABI_H int res; - char* d = abi::__cxa_demangle(dlinfo.dli_sname, NULL, NULL, &res); + char* d = abi::__cxa_demangle(dlinfo.dli_sname, nullptr, nullptr, &res); # else char* d = NULL; # endif - if (d == NULL) fputs(dlinfo.dli_sname, FD); - else fputs(d, FD); + if (!d) fputs(dlinfo.dli_sname, FD); + else fputs(d, FD); free(d); // FIXME: When we move to C++11, use %t length modifier. It's not in @@ -331,7 +331,7 @@ static void PrintStackTraceSignalHandler(void *) { /// PrintStackTraceOnErrorSignal - When an error signal (such as SIGABRT or /// SIGSEGV) is delivered to the process, print a stack trace and then exit. void llvm::sys::PrintStackTraceOnErrorSignal() { - AddSignalHandler(PrintStackTraceSignalHandler, 0); + AddSignalHandler(PrintStackTraceSignalHandler, nullptr); #if defined(__APPLE__) && defined(ENABLE_CRASH_OVERRIDES) // Environment variable to disable any kind of crash dialog. diff --git a/lib/Support/Unix/TimeValue.inc b/lib/Support/Unix/TimeValue.inc index 80532b0..7d4acf7 100644 --- a/lib/Support/Unix/TimeValue.inc +++ b/lib/Support/Unix/TimeValue.inc @@ -26,15 +26,17 @@ std::string TimeValue::str() const { struct tm Storage; struct tm *LT = ::localtime_r(&OurTime, &Storage); assert(LT); - char Buffer[25]; - strftime(Buffer, 25, "%b %e %H:%M %Y", LT); - return std::string(Buffer); + char Buffer1[sizeof("YYYY-MM-DD HH:MM:SS")]; + strftime(Buffer1, sizeof(Buffer1), "%Y-%m-%d %H:%M:%S", LT); + char Buffer2[sizeof("YYYY-MM-DD HH:MM:SS.MMMUUUNNN")]; + snprintf(Buffer2, sizeof(Buffer2), "%s.%.9u", Buffer1, this->nanoseconds()); + return std::string(Buffer2); } TimeValue TimeValue::now() { struct timeval the_time; timerclear(&the_time); - if (0 != ::gettimeofday(&the_time,0)) { + if (0 != ::gettimeofday(&the_time,nullptr)) { // This is *really* unlikely to occur because the only gettimeofday // errors concern the timezone parameter which we're passing in as 0. // In the unlikely case it does happen, just return MinTime, no error diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc index 504471e..5d0278f 100644 --- a/lib/Support/Windows/DynamicLibrary.inc +++ b/lib/Support/Windows/DynamicLibrary.inc @@ -58,7 +58,7 @@ extern "C" { stricmp(ModuleName, "msvcr70") != 0 && #ifndef __MINGW32__ // Mingw32 uses msvcrt.dll by default. Don't ignore it. - // Otherwise, user should be aware, what he's doing :) + // Otherwise the user should be aware what they are doing. 
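// Standalone version of the demangling fallback in the stack-trace hunk
// above: abi::__cxa_demangle() returns nullptr on failure, in which case
// the mangled name is printed verbatim. Sketch only; needs <cxxabi.h>.
#include <cxxabi.h>
#include <cstdio>
#include <cstdlib>
static void printName(const char *Mangled) {
  int Status = 0;
  char *D = abi::__cxa_demangle(Mangled, nullptr, nullptr, &Status);
  std::puts(D ? D : Mangled);
  std::free(D); // free(nullptr) is a no-op
}
int main() {
  printName("_ZN4llvm3sys15PrintStackTraceEP8_IO_FILE"); // demangles
  printName("not_mangled");                              // printed as-is
  return 0;
}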
stricmp(ModuleName, "msvcrt") != 0 && #endif stricmp(ModuleName, "msvcrt20") != 0 && diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc index a87c9e8..c3df801 100644 --- a/lib/Support/Windows/Process.inc +++ b/lib/Support/Windows/Process.inc @@ -82,16 +82,14 @@ TimeValue self_process::get_system_time() const { return getTimeValueFromFILETIME(KernelTime); } -// This function retrieves the page size using GetSystemInfo and is present -// solely so it can be called once to initialize the self_process member below. +// This function retrieves the page size using GetNativeSystemInfo() and is +// present solely so it can be called once to initialize the self_process member +// below. static unsigned getPageSize() { - // NOTE: A 32-bit application running under WOW64 is supposed to use - // GetNativeSystemInfo. However, this interface is not present prior - // to Windows XP so to use it requires dynamic linking. It is not clear - // how this affects the reported page size, if at all. One could argue - // that LLVM ought to run as 64-bits on a 64-bit system, anyway. + // GetNativeSystemInfo() provides the physical page size which may differ + // from GetSystemInfo() in 32-bit applications running under WOW64. SYSTEM_INFO info; - GetSystemInfo(&info); + GetNativeSystemInfo(&info); // FIXME: FileOffset in MapViewOfFile() should be aligned to not dwPageSize, // but dwAllocationGranularity. return static_cast(info.dwPageSize); diff --git a/lib/Support/Windows/TimeValue.inc b/lib/Support/Windows/TimeValue.inc index 6c59024..0223ab4 100644 --- a/lib/Support/Windows/TimeValue.inc +++ b/lib/Support/Windows/TimeValue.inc @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "WindowsSupport.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" #include #include @@ -32,6 +34,7 @@ TimeValue TimeValue::now() { } std::string TimeValue::str() const { + std::string S; struct tm *LT; #ifdef __MINGW32__ // Old versions of mingw don't have _localtime64_s. Remove this once we drop support @@ -47,13 +50,11 @@ std::string TimeValue::str() const { LT = &Storage; #endif - char Buffer[25]; - // FIXME: the windows version of strftime doesn't support %e - strftime(Buffer, 25, "%b %d %H:%M %Y", LT); - assert((Buffer[3] == ' ' && isdigit(Buffer[5]) && Buffer[6] == ' ') && - "Unexpected format in strftime()!"); - // Emulate %e on %d to mute '0'. - if (Buffer[4] == '0') - Buffer[4] = ' '; - return std::string(Buffer); + char Buffer[sizeof("YYYY-MM-DD HH:MM:SS")]; + strftime(Buffer, sizeof(Buffer), "%Y-%m-%d %H:%M:%S", LT); + raw_string_ostream OS(S); + OS << format("%s.%.9u", static_cast(Buffer), + this->nanoseconds()); + OS.flush(); + return S; } diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp index 73ce5e0..3be02ee 100644 --- a/lib/Support/YAMLParser.cpp +++ b/lib/Support/YAMLParser.cpp @@ -1876,14 +1876,14 @@ Node *KeyValueNode::getValue() { void MappingNode::increment() { if (failed()) { IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; return; } if (CurrentEntry) { CurrentEntry->skip(); if (Type == MT_Inline) { IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; return; } } @@ -1896,13 +1896,13 @@ void MappingNode::increment() { case Token::TK_BlockEnd: getNext(); IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; break; default: setError("Unexpected token. 
Expected Key or Block End", T); case Token::TK_Error: IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; } } else { switch (T.Kind) { @@ -1915,14 +1915,14 @@ void MappingNode::increment() { case Token::TK_Error: // Set this to end iterator. IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; break; default: setError( "Unexpected token. Expected Key, Flow Entry, or Flow " "Mapping End." , T); IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; } } } @@ -1930,7 +1930,7 @@ void MappingNode::increment() { void SequenceNode::increment() { if (failed()) { IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; return; } if (CurrentEntry) @@ -1941,37 +1941,37 @@ void SequenceNode::increment() { case Token::TK_BlockEntry: getNext(); CurrentEntry = parseBlockNode(); - if (CurrentEntry == 0) { // An error occurred. + if (!CurrentEntry) { // An error occurred. IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; } break; case Token::TK_BlockEnd: getNext(); IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; break; default: setError( "Unexpected token. Expected Block Entry or Block End." , T); case Token::TK_Error: IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; } } else if (SeqType == ST_Indentless) { switch (T.Kind) { case Token::TK_BlockEntry: getNext(); CurrentEntry = parseBlockNode(); - if (CurrentEntry == 0) { // An error occurred. + if (!CurrentEntry) { // An error occurred. IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; } break; default: case Token::TK_Error: IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; } } else if (SeqType == ST_Flow) { switch (T.Kind) { @@ -1985,7 +1985,7 @@ void SequenceNode::increment() { case Token::TK_Error: // Set this to end iterator. IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; break; case Token::TK_StreamEnd: case Token::TK_DocumentEnd: @@ -1993,13 +1993,13 @@ void SequenceNode::increment() { setError("Could not find closing ]!", T); // Set this to end iterator. IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; break; default: if (!WasPreviousTokenFlowEntry) { setError("Expected , between entries!", T); IsAtEnd = true; - CurrentEntry = 0; + CurrentEntry = nullptr; break; } // Otherwise it must be a flow entry. @@ -2013,7 +2013,7 @@ void SequenceNode::increment() { } } -Document::Document(Stream &S) : stream(S), Root(0) { +Document::Document(Stream &S) : stream(S), Root(nullptr) { // Tag maps starts with two default mappings. TagMap["!"] = "!"; TagMap["!!"] = "tag:yaml.org,2002:"; @@ -2070,7 +2070,7 @@ parse_property: case Token::TK_Anchor: if (AnchorInfo.Kind == Token::TK_Anchor) { setError("Already encountered an anchor for this node!", T); - return 0; + return nullptr; } AnchorInfo = getNext(); // Consume TK_Anchor. T = peekNext(); @@ -2078,7 +2078,7 @@ parse_property: case Token::TK_Tag: if (TagInfo.Kind == Token::TK_Tag) { setError("Already encountered a tag for this node!", T); - return 0; + return nullptr; } TagInfo = getNext(); // Consume TK_Tag. T = peekNext(); @@ -2146,10 +2146,10 @@ parse_property: // !!null null. 
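// Driver for the parser whose end-of-iteration convention is patched above:
// every error path now ends with CurrentEntry = nullptr, and a failed parse
// surfaces as a null document root. Illustrative sketch, not part of this patch.
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/YAMLParser.h"
int main() {
  llvm::SourceMgr SM;
  llvm::yaml::Stream S("{a: 1, b: [2, 3]}", SM);
  for (llvm::yaml::document_iterator DI = S.begin(), DE = S.end(); DI != DE;
       ++DI)
    if (!DI->getRoot()) // parse failure
      return 1;
  return 0;
}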
return new (NodeAllocator) NullNode(stream.CurrentDoc); case Token::TK_Error: - return 0; + return nullptr; } llvm_unreachable("Control flow shouldn't reach here."); - return 0; + return nullptr; } bool Document::parseDirectives() { diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp index 5472e0e..e5f9494 100644 --- a/lib/Support/YAMLTraits.cpp +++ b/lib/Support/YAMLTraits.cpp @@ -47,7 +47,7 @@ Input::Input(StringRef InputContent, void *DiagHandlerCtxt) : IO(Ctxt), Strm(new Stream(InputContent, SrcMgr)), - CurrentNode(NULL) { + CurrentNode(nullptr) { if (DiagHandler) SrcMgr.setDiagHandler(DiagHandler, DiagHandlerCtxt); DocIterator = Strm->begin(); @@ -158,10 +158,9 @@ void Input::endMapping() { MapHNode *MN = dyn_cast_or_null(CurrentNode); if (!MN) return; - for (MapHNode::NameToNode::iterator i = MN->Mapping.begin(), - End = MN->Mapping.end(); i != End; ++i) { - if (!MN->isValidKey(i->first())) { - setError(i->second, Twine("unknown key '") + i->first() + "'"); + for (const auto &NN : MN->Mapping) { + if (!MN->isValidKey(NN.first())) { + setError(NN.second, Twine("unknown key '") + NN.first() + "'"); break; } } @@ -255,9 +254,8 @@ bool Input::bitSetMatch(const char *Str, bool) { return false; if (SequenceHNode *SQ = dyn_cast(CurrentNode)) { unsigned Index = 0; - for (std::vector::iterator i = SQ->Entries.begin(), - End = SQ->Entries.end(); i != End; ++i) { - if (ScalarHNode *SN = dyn_cast(*i)) { + for (HNode *N : SQ->Entries) { + if (ScalarHNode *SN = dyn_cast(N)) { if (SN->value().equals(Str)) { BitValuesUsed[Index] = true; return true; @@ -287,7 +285,7 @@ void Input::endBitSetScalar() { } } -void Input::scalarString(StringRef &S) { +void Input::scalarString(StringRef &S, bool) { if (ScalarHNode *SN = dyn_cast(CurrentNode)) { S = SN->value(); } else { @@ -319,9 +317,8 @@ Input::HNode *Input::createHNodes(Node *N) { return new ScalarHNode(N, KeyStr); } else if (SequenceNode *SQ = dyn_cast(N)) { SequenceHNode *SQHNode = new SequenceHNode(N); - for (SequenceNode::iterator i = SQ->begin(), End = SQ->end(); i != End; - ++i) { - HNode *Entry = this->createHNodes(i); + for (Node &SN : *SQ) { + HNode *Entry = this->createHNodes(&SN); if (EC) break; SQHNode->Entries.push_back(Entry); @@ -329,9 +326,8 @@ Input::HNode *Input::createHNodes(Node *N) { return SQHNode; } else if (MappingNode *Map = dyn_cast(N)) { MapHNode *mapHNode = new MapHNode(N); - for (MappingNode::iterator i = Map->begin(), End = Map->end(); i != End; - ++i) { - ScalarNode *KeyScalar = dyn_cast(i->getKey()); + for (KeyValueNode &KVN : *Map) { + ScalarNode *KeyScalar = dyn_cast(KVN.getKey()); StringStorage.clear(); StringRef KeyStr = KeyScalar->getValue(StringStorage); if (!StringStorage.empty()) { @@ -341,7 +337,7 @@ Input::HNode *Input::createHNodes(Node *N) { memcpy(Buf, &StringStorage[0], Len); KeyStr = StringRef(Buf, Len); } - HNode *ValueHNode = this->createHNodes(i->getValue()); + HNode *ValueHNode = this->createHNodes(KVN.getValue()); if (EC) break; mapHNode->Mapping[KeyStr] = ValueHNode; @@ -351,14 +347,13 @@ Input::HNode *Input::createHNodes(Node *N) { return new EmptyHNode(N); } else { setError(N, "unknown node kind"); - return NULL; + return nullptr; } } bool Input::MapHNode::isValidKey(StringRef Key) { - for (SmallVectorImpl::iterator i = ValidKeys.begin(), - End = ValidKeys.end(); i != End; ++i) { - if (Key.equals(*i)) + for (const char *K : ValidKeys) { + if (Key.equals(K)) return true; } return false; @@ -373,17 +368,13 @@ bool Input::canElideEmptySequence() { } Input::MapHNode::~MapHNode() { - 
for (MapHNode::NameToNode::iterator i = Mapping.begin(), End = Mapping.end(); - i != End; ++i) { - delete i->second; - } + for (auto &N : Mapping) + delete N.second; } Input::SequenceHNode::~SequenceHNode() { - for (std::vector::iterator i = Entries.begin(), End = Entries.end(); - i != End; ++i) { - delete *i; - } + for (HNode *N : Entries) + delete N; } @@ -550,10 +541,7 @@ void Output::endBitSetScalar() { this->outputUpToEndOfLine(" ]"); } -void Output::scalarString(StringRef &S) { - const char ScalarSafeChars[] = "abcdefghijklmnopqrstuvwxyz" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-/^., \t"; - +void Output::scalarString(StringRef &S, bool MustQuote) { this->newLineCheck(); if (S.empty()) { // Print '' for the empty string because leaving the field empty is not @@ -561,10 +549,8 @@ void Output::scalarString(StringRef &S) { this->outputUpToEndOfLine("''"); return; } - if (S.find_first_not_of(ScalarSafeChars) == StringRef::npos && - !isspace(S.front()) && !isspace(S.back())) { - // If the string consists only of safe characters, print it out without - // quotes. + if (!MustQuote) { + // Only quote if we must. this->outputUpToEndOfLine(S); return; } diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 3c45743..f55838e 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -87,8 +87,8 @@ void raw_ostream::SetBuffered() { void raw_ostream::SetBufferAndMode(char *BufferStart, size_t Size, BufferKind Mode) { - assert(((Mode == Unbuffered && BufferStart == 0 && Size == 0) || - (Mode != Unbuffered && BufferStart && Size)) && + assert(((Mode == Unbuffered && !BufferStart && Size == 0) || + (Mode != Unbuffered && BufferStart && Size != 0)) && "stream must be unbuffered or have at least one byte"); // Make sure the current buffer is free of content (we can't flush here; the // child buffer management logic will be in write_impl). @@ -433,7 +433,7 @@ void format_object_base::home() { raw_fd_ostream::raw_fd_ostream(const char *Filename, std::string &ErrorInfo, sys::fs::OpenFlags Flags) : Error(false), UseAtomicWrites(false), pos(0) { - assert(Filename != 0 && "Filename is null"); + assert(Filename && "Filename is null"); ErrorInfo.clear(); // Handle "-" as stdout. Note that when we do this, we consider ourself diff --git a/lib/Support/regengine.inc b/lib/Support/regengine.inc index 7e41f96..62d8c26 100644 --- a/lib/Support/regengine.inc +++ b/lib/Support/regengine.inc @@ -205,7 +205,7 @@ matcher(struct re_guts *g, const char *string, size_t nmatch, if (nmatch == 1 && !g->backrefs) break; /* no further info needed */ - /* oh my, he wants the subexpressions... */ + /* oh my, they want the subexpressions... */ if (m->pmatch == NULL) m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) * sizeof(llvm_regmatch_t)); diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp index fd81ab4..476026d 100644 --- a/lib/TableGen/Main.cpp +++ b/lib/TableGen/Main.cpp @@ -17,6 +17,7 @@ #include "TGParser.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/system_error.h" diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp index a43665b..c553a21 100644 --- a/lib/TableGen/Record.cpp +++ b/lib/TableGen/Record.cpp @@ -101,13 +101,13 @@ bool RecTy::baseClassOf(const RecTy *RHS) const{ } Init *BitRecTy::convertValue(BitsInit *BI) { - if (BI->getNumBits() != 1) return 0; // Only accept if just one bit! 
+ if (BI->getNumBits() != 1) return nullptr; // Only accept if just one bit! return BI->getBit(0); } Init *BitRecTy::convertValue(IntInit *II) { int64_t Val = II->getValue(); - if (Val != 0 && Val != 1) return 0; // Only accept 0 or 1 for a bit! + if (Val != 0 && Val != 1) return nullptr; // Only accept 0 or 1 for a bit! return BitInit::get(Val != 0); } @@ -116,7 +116,7 @@ Init *BitRecTy::convertValue(TypedInit *VI) { RecTy *Ty = VI->getType(); if (isa(Ty) || isa(Ty) || isa(Ty)) return VI; // Accept variable if it is already of bit type! - return 0; + return nullptr; } bool BitRecTy::baseClassOf(const RecTy *RHS) const{ @@ -151,7 +151,7 @@ Init *BitsRecTy::convertValue(UnsetInit *UI) { } Init *BitsRecTy::convertValue(BitInit *UI) { - if (Size != 1) return 0; // Can only convert single bit. + if (Size != 1) return nullptr; // Can only convert single bit. return BitsInit::get(UI); } @@ -170,7 +170,7 @@ Init *BitsRecTy::convertValue(IntInit *II) { int64_t Value = II->getValue(); // Make sure this bitfield is large enough to hold the integer value. if (!canFitInBitfield(Value, Size)) - return 0; + return nullptr; SmallVector NewBits(Size); @@ -184,7 +184,7 @@ Init *BitsRecTy::convertValue(BitsInit *BI) { // If the number of bits is right, return it. Otherwise we need to expand or // truncate. if (BI->getNumBits() == Size) return BI; - return 0; + return nullptr; } Init *BitsRecTy::convertValue(TypedInit *VI) { @@ -199,7 +199,7 @@ Init *BitsRecTy::convertValue(TypedInit *VI) { return BitsInit::get(NewBits); } - return 0; + return nullptr; } bool BitsRecTy::baseClassOf(const RecTy *RHS) const{ @@ -219,7 +219,7 @@ Init *IntRecTy::convertValue(BitsInit *BI) { if (BitInit *Bit = dyn_cast(BI->getBit(i))) { Result |= Bit->getValue() << i; } else { - return 0; + return nullptr; } return IntInit::get(Result); } @@ -227,7 +227,7 @@ Init *IntRecTy::convertValue(BitsInit *BI) { Init *IntRecTy::convertValue(TypedInit *TI) { if (TI->getType()->typeIsConvertibleTo(this)) return TI; // Accept variable if already of the right type! - return 0; + return nullptr; } bool IntRecTy::baseClassOf(const RecTy *RHS) const{ @@ -238,7 +238,7 @@ bool IntRecTy::baseClassOf(const RecTy *RHS) const{ Init *StringRecTy::convertValue(UnOpInit *BO) { if (BO->getOpcode() == UnOpInit::CAST) { Init *L = BO->getOperand()->convertInitializerTo(this); - if (L == 0) return 0; + if (!L) return nullptr; if (L != BO->getOperand()) return UnOpInit::get(UnOpInit::CAST, L, new StringRecTy); return BO; @@ -251,7 +251,7 @@ Init *StringRecTy::convertValue(BinOpInit *BO) { if (BO->getOpcode() == BinOpInit::STRCONCAT) { Init *L = BO->getLHS()->convertInitializerTo(this); Init *R = BO->getRHS()->convertInitializerTo(this); - if (L == 0 || R == 0) return 0; + if (!L || !R) return nullptr; if (L != BO->getLHS() || R != BO->getRHS()) return BinOpInit::get(BinOpInit::STRCONCAT, L, R, new StringRecTy); return BO; @@ -264,7 +264,7 @@ Init *StringRecTy::convertValue(BinOpInit *BO) { Init *StringRecTy::convertValue(TypedInit *TI) { if (isa(TI->getType())) return TI; // Accept variable if already of the right type! 
- return 0; + return nullptr; } std::string ListRecTy::getAsString() const { @@ -280,10 +280,10 @@ Init *ListRecTy::convertValue(ListInit *LI) { if (Init *CI = LI->getElement(i)->convertInitializerTo(Ty)) Elements.push_back(CI); else - return 0; + return nullptr; if (!isa(LI->getType())) - return 0; + return nullptr; return ListInit::get(Elements, this); } @@ -293,7 +293,7 @@ Init *ListRecTy::convertValue(TypedInit *TI) { if (ListRecTy *LRT = dyn_cast(TI->getType())) if (LRT->getElementType()->typeIsConvertibleTo(getElementType())) return TI; - return 0; + return nullptr; } bool ListRecTy::baseClassOf(const RecTy *RHS) const{ @@ -305,30 +305,30 @@ bool ListRecTy::baseClassOf(const RecTy *RHS) const{ Init *DagRecTy::convertValue(TypedInit *TI) { if (TI->getType()->typeIsConvertibleTo(this)) return TI; - return 0; + return nullptr; } Init *DagRecTy::convertValue(UnOpInit *BO) { if (BO->getOpcode() == UnOpInit::CAST) { Init *L = BO->getOperand()->convertInitializerTo(this); - if (L == 0) return 0; + if (!L) return nullptr; if (L != BO->getOperand()) return UnOpInit::get(UnOpInit::CAST, L, new DagRecTy); return BO; } - return 0; + return nullptr; } Init *DagRecTy::convertValue(BinOpInit *BO) { if (BO->getOpcode() == BinOpInit::CONCAT) { Init *L = BO->getLHS()->convertInitializerTo(this); Init *R = BO->getRHS()->convertInitializerTo(this); - if (L == 0 || R == 0) return 0; + if (!L || !R) return nullptr; if (L != BO->getLHS() || R != BO->getRHS()) return BinOpInit::get(BinOpInit::CONCAT, L, R, new DagRecTy); return BO; } - return 0; + return nullptr; } RecordRecTy *RecordRecTy::get(Record *R) { @@ -342,7 +342,7 @@ std::string RecordRecTy::getAsString() const { Init *RecordRecTy::convertValue(DefInit *DI) { // Ensure that DI is a subclass of Rec. if (!DI->getDef()->isSubClassOf(Rec)) - return 0; + return nullptr; return DI; } @@ -352,7 +352,7 @@ Init *RecordRecTy::convertValue(TypedInit *TI) { if (RRT->getRecord()->isSubClassOf(getRecord()) || RRT->getRecord() == getRecord()) return TI; - return 0; + return nullptr; } bool RecordRecTy::baseClassOf(const RecTy *RHS) const{ @@ -391,7 +391,7 @@ RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) { ++i) { RecordRecTy *SuperRecTy1 = RecordRecTy::get(*i); RecTy *NewType1 = resolveTypes(SuperRecTy1, T2); - if (NewType1 != 0) { + if (NewType1) { if (NewType1 != SuperRecTy1) { delete SuperRecTy1; } @@ -409,7 +409,7 @@ RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) { ++i) { RecordRecTy *SuperRecTy2 = RecordRecTy::get(*i); RecTy *NewType2 = resolveTypes(T1, SuperRecTy2); - if (NewType2 != 0) { + if (NewType2) { if (NewType2 != SuperRecTy2) { delete SuperRecTy2; } @@ -417,7 +417,7 @@ RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) { } } } - return 0; + return nullptr; } @@ -462,7 +462,7 @@ BitsInit *BitsInit::get(ArrayRef Range) { FoldingSetNodeID ID; ProfileBitsInit(ID, Range); - void *IP = 0; + void *IP = nullptr; if (BitsInit *I = ThePool.FindNodeOrInsertPos(ID, IP)) return I; @@ -482,7 +482,7 @@ BitsInit::convertInitializerBitRange(const std::vector &Bits) const { for (unsigned i = 0, e = Bits.size(); i != e; ++i) { if (Bits[i] >= getNumBits()) - return 0; + return nullptr; NewBits[i] = getBit(Bits[i]); } return BitsInit::get(NewBits); @@ -516,8 +516,8 @@ Init *BitsInit::resolveReferences(Record &R, const RecordVal *RV) const { bool Changed = false; SmallVector NewBits(getNumBits()); - Init *CachedInit = 0; - Init *CachedBitVar = 0; + Init *CachedInit = nullptr; + Init *CachedBitVar = nullptr; bool CachedBitVarChanged = false; for (unsigned i = 0, e = 
getNumBits(); i != e; ++i) { @@ -590,7 +590,7 @@ IntInit::convertInitializerBitRange(const std::vector &Bits) const { for (unsigned i = 0, e = Bits.size(); i != e; ++i) { if (Bits[i] >= 64) - return 0; + return nullptr; NewBits[i] = BitInit::get(Value & (INT64_C(1) << Bits[i])); } @@ -623,18 +623,18 @@ static void ProfileListInit(FoldingSetNodeID &ID, ListInit *ListInit::get(ArrayRef Range, RecTy *EltTy) { typedef FoldingSet Pool; static Pool ThePool; + static std::vector> TheActualPool; - // Just use the FoldingSetNodeID to compute a hash. Use a DenseMap - // for actual storage. FoldingSetNodeID ID; ProfileListInit(ID, Range, EltTy); - void *IP = 0; + void *IP = nullptr; if (ListInit *I = ThePool.FindNodeOrInsertPos(ID, IP)) return I; ListInit *I = new ListInit(Range, EltTy); ThePool.InsertNode(I, IP); + TheActualPool.push_back(std::unique_ptr(I)); return I; } @@ -651,7 +651,7 @@ ListInit::convertInitListSlice(const std::vector &Elements) const { std::vector Vals; for (unsigned i = 0, e = Elements.size(); i != e; ++i) { if (Elements[i] >= getSize()) - return 0; + return nullptr; Vals.push_back(getElement(Elements[i])); } return ListInit::get(Vals, getType()); @@ -660,7 +660,7 @@ ListInit::convertInitListSlice(const std::vector &Elements) const { Record *ListInit::getElementAsRecord(unsigned i) const { assert(i < Values.size() && "List element index out of range!"); DefInit *DI = dyn_cast(Values[i]); - if (DI == 0) + if (!DI) PrintFatalError("Expected record in list!"); return DI->getDef(); } @@ -690,14 +690,14 @@ Init *ListInit::resolveReferences(Record &R, const RecordVal *RV) const { Init *ListInit::resolveListElementReference(Record &R, const RecordVal *IRV, unsigned Elt) const { if (Elt >= getSize()) - return 0; // Out of range reference. + return nullptr; // Out of range reference. Init *E = getElement(Elt); // If the element is set to some value, or if we are resolving a reference // to a specific variable and that variable is explicitly unset, then // replace the VarListElementInit with it. if (IRV || !isa(E)) return E; - return 0; + return nullptr; } std::string ListInit::getAsString() const { @@ -714,7 +714,7 @@ Init *OpInit::resolveListElementReference(Record &R, const RecordVal *IRV, Init *Resolved = resolveReferences(R, IRV); OpInit *OResolved = dyn_cast(Resolved); if (OResolved) { - Resolved = OResolved->Fold(&R, 0); + Resolved = OResolved->Fold(&R, nullptr); } if (Resolved != this) { @@ -728,7 +728,7 @@ Init *OpInit::resolveListElementReference(Record &R, const RecordVal *IRV, } } - return 0; + return nullptr; } Init *OpInit::getBit(unsigned Bit) const { @@ -813,7 +813,7 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const { if (ListInit *LHSl = dyn_cast(LHS)) { if (LHSl->getSize() == 0) { assert(0 && "Empty list in car"); - return 0; + return nullptr; } return LHSl->getElement(0); } @@ -823,7 +823,7 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const { if (ListInit *LHSl = dyn_cast(LHS)) { if (LHSl->getSize() == 0) { assert(0 && "Empty list in cdr"); - return 0; + return nullptr; } // Note the +1. We can't just pass the result of getValues() // directly. 
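// The pattern behind the BitsInit::get()/ListInit::get() hunks above:
// profile the prospective node, probe the FoldingSet, and allocate only on
// a miss so equal inits share one object. Generic sketch with an invented
// node type; not part of this patch.
#include "llvm/ADT/FoldingSet.h"
struct MyNode : llvm::FoldingSetNode {
  int Value;
  explicit MyNode(int V) : Value(V) {}
  void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddInteger(Value); }
};
static MyNode *getUniqued(llvm::FoldingSet<MyNode> &Pool, int V) {
  llvm::FoldingSetNodeID ID;
  ID.AddInteger(V);
  void *IP = nullptr; // insertion-point cookie, as in the patched code
  if (MyNode *Existing = Pool.FindNodeOrInsertPos(ID, IP))
    return Existing; // already interned
  MyNode *N = new MyNode(V);
  Pool.InsertNode(N, IP);
  return N;
}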
@@ -862,8 +862,8 @@ Init *UnOpInit::resolveReferences(Record &R, const RecordVal *RV) const { Init *lhs = LHS->resolveReferences(R, RV); if (LHS != lhs) - return (UnOpInit::get(getOpcode(), lhs, getType()))->Fold(&R, 0); - return Fold(&R, 0); + return (UnOpInit::get(getOpcode(), lhs, getType()))->Fold(&R, nullptr); + return Fold(&R, nullptr); } std::string UnOpInit::getAsString() const { @@ -902,7 +902,7 @@ Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const { if (LHSs && RHSs) { DefInit *LOp = dyn_cast(LHSs->getOperator()); DefInit *ROp = dyn_cast(RHSs->getOperator()); - if (LOp == 0 || ROp == 0 || LOp->getDef() != ROp->getDef()) + if (!LOp || !ROp || LOp->getDef() != ROp->getDef()) PrintFatalError("Concated Dag operators do not match!"); std::vector Args; std::vector ArgNames; @@ -918,6 +918,18 @@ Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const { } break; } + case LISTCONCAT: { + ListInit *LHSs = dyn_cast(LHS); + ListInit *RHSs = dyn_cast(RHS); + if (LHSs && RHSs) { + std::vector Args; + Args.insert(Args.end(), LHSs->begin(), LHSs->end()); + Args.insert(Args.end(), RHSs->begin(), RHSs->end()); + return ListInit::get( + Args, static_cast(LHSs->getType())->getElementType()); + } + break; + } case STRCONCAT: { StringInit *LHSs = dyn_cast(LHS); StringInit *RHSs = dyn_cast(RHS); @@ -974,8 +986,8 @@ Init *BinOpInit::resolveReferences(Record &R, const RecordVal *RV) const { Init *rhs = RHS->resolveReferences(R, RV); if (LHS != lhs || RHS != rhs) - return (BinOpInit::get(getOpcode(), lhs, rhs, getType()))->Fold(&R, 0); - return Fold(&R, 0); + return (BinOpInit::get(getOpcode(), lhs, rhs, getType()))->Fold(&R,nullptr); + return Fold(&R, nullptr); } std::string BinOpInit::getAsString() const { @@ -987,6 +999,7 @@ std::string BinOpInit::getAsString() const { case SRA: Result = "!sra"; break; case SRL: Result = "!srl"; break; case EQ: Result = "!eq"; break; + case LISTCONCAT: Result = "!listconcat"; break; case STRCONCAT: Result = "!strconcat"; break; } return Result + "(" + LHS->getAsString() + ", " + RHS->getAsString() + ")"; @@ -1031,11 +1044,7 @@ static Init *EvaluateOperation(OpInit *RHSo, Init *LHS, Init *Arg, if (TArg && TArg->getType()->getAsString() == "dag") { Init *Result = ForeachHelper(LHS, Arg, RHSo, Type, CurRec, CurMultiClass); - if (Result != 0) { - return Result; - } else { - return 0; - } + return Result; } for (int i = 0; i < RHSo->getNumOperands(); ++i) { @@ -1044,7 +1053,7 @@ static Init *EvaluateOperation(OpInit *RHSo, Init *LHS, Init *Arg, if (RHSoo) { Init *Result = EvaluateOperation(RHSoo, LHS, Arg, Type, CurRec, CurMultiClass); - if (Result != 0) { + if (Result) { NewOperands.push_back(Result); } else { NewOperands.push_back(Arg); @@ -1059,10 +1068,7 @@ static Init *EvaluateOperation(OpInit *RHSo, Init *LHS, Init *Arg, // Now run the operator and use its result as the new leaf const OpInit *NewOp = RHSo->clone(NewOperands); Init *NewVal = NewOp->Fold(CurRec, CurMultiClass); - if (NewVal != NewOp) - return NewVal; - - return 0; + return (NewVal != NewOp) ? 
NewVal : nullptr; } static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type, @@ -1086,7 +1092,7 @@ static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type, Init *Val = MHSd->getOperator(); Init *Result = EvaluateOperation(RHSo, LHS, Val, Type, CurRec, CurMultiClass); - if (Result != 0) { + if (Result) { Val = Result; } @@ -1100,7 +1106,7 @@ static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type, // Process args Init *Result = EvaluateOperation(RHSo, LHS, Arg, Type, CurRec, CurMultiClass); - if (Result != 0) { + if (Result) { Arg = Result; } @@ -1138,7 +1144,7 @@ static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type, return ListInit::get(NewList, MHSl->getType()); } } - return 0; + return nullptr; } Init *TernOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const { @@ -1195,7 +1201,7 @@ Init *TernOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const { case FOREACH: { Init *Result = ForeachHelper(LHS, MHS, RHS, getType(), CurRec, CurMultiClass); - if (Result != 0) { + if (Result) { return Result; } break; @@ -1227,16 +1233,16 @@ Init *TernOpInit::resolveReferences(Record &R, IntInit *Value = dyn_cast(lhs); if (Init *I = lhs->convertInitializerTo(IntRecTy::get())) Value = dyn_cast(I); - if (Value != 0) { + if (Value) { // Short-circuit if (Value->getValue()) { Init *mhs = MHS->resolveReferences(R, RV); return (TernOpInit::get(getOpcode(), lhs, mhs, - RHS, getType()))->Fold(&R, 0); + RHS, getType()))->Fold(&R, nullptr); } else { Init *rhs = RHS->resolveReferences(R, RV); return (TernOpInit::get(getOpcode(), lhs, MHS, - rhs, getType()))->Fold(&R, 0); + rhs, getType()))->Fold(&R, nullptr); } } } @@ -1246,8 +1252,8 @@ Init *TernOpInit::resolveReferences(Record &R, if (LHS != lhs || MHS != mhs || RHS != rhs) return (TernOpInit::get(getOpcode(), lhs, mhs, rhs, - getType()))->Fold(&R, 0); - return Fold(&R, 0); + getType()))->Fold(&R, nullptr); + return Fold(&R, nullptr); } std::string TernOpInit::getAsString() const { @@ -1265,19 +1271,19 @@ RecTy *TypedInit::getFieldType(const std::string &FieldName) const { if (RecordRecTy *RecordType = dyn_cast(getType())) if (RecordVal *Field = RecordType->getRecord()->getValue(FieldName)) return Field->getType(); - return 0; + return nullptr; } Init * TypedInit::convertInitializerBitRange(const std::vector &Bits) const { BitsRecTy *T = dyn_cast(getType()); - if (T == 0) return 0; // Cannot subscript a non-bits variable. + if (!T) return nullptr; // Cannot subscript a non-bits variable. unsigned NumBits = T->getNumBits(); SmallVector NewBits(Bits.size()); for (unsigned i = 0, e = Bits.size(); i != e; ++i) { if (Bits[i] >= NumBits) - return 0; + return nullptr; NewBits[i] = VarBitInit::get(const_cast(this), Bits[i]); } @@ -1287,7 +1293,7 @@ TypedInit::convertInitializerBitRange(const std::vector &Bits) const { Init * TypedInit::convertInitListSlice(const std::vector &Elements) const { ListRecTy *T = dyn_cast(getType()); - if (T == 0) return 0; // Cannot subscript a non-list variable. + if (!T) return nullptr; // Cannot subscript a non-list variable. 
if (Elements.size() == 1) return VarListElementInit::get(const_cast(this), Elements[0]); @@ -1332,8 +1338,8 @@ Init *VarInit::getBit(unsigned Bit) const { Init *VarInit::resolveListElementReference(Record &R, const RecordVal *IRV, unsigned Elt) const { - if (R.isTemplateArg(getNameInit())) return 0; - if (IRV && IRV->getNameInit() != getNameInit()) return 0; + if (R.isTemplateArg(getNameInit())) return nullptr; + if (IRV && IRV->getNameInit() != getNameInit()) return nullptr; RecordVal *RV = R.getValue(getNameInit()); assert(RV && "Reference to a non-existent variable?"); @@ -1345,14 +1351,14 @@ Init *VarInit::resolveListElementReference(Record &R, } if (Elt >= LI->getSize()) - return 0; // Out of range reference. + return nullptr; // Out of range reference. Init *E = LI->getElement(Elt); // If the element is set to some value, or if we are resolving a reference // to a specific variable and that variable is explicitly unset, then // replace the VarListElementInit with it. if (IRV || !isa(E)) return E; - return 0; + return nullptr; } @@ -1360,7 +1366,7 @@ RecTy *VarInit::getFieldType(const std::string &FieldName) const { if (RecordRecTy *RTy = dyn_cast(getType())) if (const RecordVal *RV = RTy->getRecord()->getValue(FieldName)) return RV->getType(); - return 0; + return nullptr; } Init *VarInit::getFieldInit(Record &R, const RecordVal *RV, @@ -1368,15 +1374,15 @@ Init *VarInit::getFieldInit(Record &R, const RecordVal *RV, if (isa(getType())) if (const RecordVal *Val = R.getValue(VarName)) { if (RV != Val && (RV || isa(Val->getValue()))) - return 0; + return nullptr; Init *TheInit = Val->getValue(); assert(TheInit != this && "Infinite loop detected!"); if (Init *I = TheInit->getFieldInit(R, RV, FieldName)) return I; else - return 0; + return nullptr; } - return 0; + return nullptr; } /// resolveReferences - This method is used by classes that refer to other @@ -1386,7 +1392,7 @@ Init *VarInit::getFieldInit(Record &R, const RecordVal *RV, /// Init *VarInit::resolveReferences(Record &R, const RecordVal *RV) const { if (RecordVal *Val = R.getValue(VarName)) - if (RV == Val || (RV == 0 && !isa(Val->getValue()))) + if (RV == Val || (!RV && !isa(Val->getValue()))) return Val->getValue(); return const_cast(this); } @@ -1462,7 +1468,7 @@ Init *VarListElementInit:: resolveListElementReference(Record &R, return Result; } - return 0; + return nullptr; } DefInit *DefInit::get(Record *R) { @@ -1472,7 +1478,7 @@ DefInit *DefInit::get(Record *R) { RecTy *DefInit::getFieldType(const std::string &FieldName) const { if (const RecordVal *RV = Def->getValue(FieldName)) return RV->getType(); - return 0; + return nullptr; } Init *DefInit::getFieldInit(Record &R, const RecordVal *RV, @@ -1507,7 +1513,7 @@ Init *FieldInit::resolveListElementReference(Record &R, const RecordVal *RV, unsigned Elt) const { if (Init *ListVal = Rec->getFieldInit(R, RV, FieldName)) if (ListInit *LI = dyn_cast(ListVal)) { - if (Elt >= LI->getSize()) return 0; + if (Elt >= LI->getSize()) return nullptr; Init *E = LI->getElement(Elt); // If the element is set to some value, or if we are resolving a @@ -1516,7 +1522,7 @@ Init *FieldInit::resolveListElementReference(Record &R, const RecordVal *RV, if (RV || !isa(E)) return E; } - return 0; + return nullptr; } Init *FieldInit::resolveReferences(Record &R, const RecordVal *RV) const { @@ -1560,7 +1566,7 @@ DagInit::get(Init *V, const std::string &VN, FoldingSetNodeID ID; ProfileDagInit(ID, V, VN, ArgRange, NameRange); - void *IP = 0; + void *IP = nullptr; if (DagInit *I = 
ThePool.FindNodeOrInsertPos(ID, IP)) return I; @@ -1784,7 +1790,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const Record &R) { /// Init *Record::getValueInit(StringRef FieldName) const { const RecordVal *R = getValue(FieldName); - if (R == 0 || R->getValue() == 0) + if (!R || !R->getValue()) PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); return R->getValue(); @@ -1797,7 +1803,7 @@ Init *Record::getValueInit(StringRef FieldName) const { /// std::string Record::getValueAsString(StringRef FieldName) const { const RecordVal *R = getValue(FieldName); - if (R == 0 || R->getValue() == 0) + if (!R || !R->getValue()) PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); @@ -1813,7 +1819,7 @@ std::string Record::getValueAsString(StringRef FieldName) const { /// BitsInit *Record::getValueAsBitsInit(StringRef FieldName) const { const RecordVal *R = getValue(FieldName); - if (R == 0 || R->getValue() == 0) + if (!R || !R->getValue()) PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); @@ -1829,7 +1835,7 @@ BitsInit *Record::getValueAsBitsInit(StringRef FieldName) const { /// ListInit *Record::getValueAsListInit(StringRef FieldName) const { const RecordVal *R = getValue(FieldName); - if (R == 0 || R->getValue() == 0) + if (!R || !R->getValue()) PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); @@ -1864,7 +1870,7 @@ Record::getValueAsListOfDefs(StringRef FieldName) const { /// int64_t Record::getValueAsInt(StringRef FieldName) const { const RecordVal *R = getValue(FieldName); - if (R == 0 || R->getValue() == 0) + if (!R || !R->getValue()) PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); @@ -1918,7 +1924,7 @@ Record::getValueAsListOfStrings(StringRef FieldName) const { /// Record *Record::getValueAsDef(StringRef FieldName) const { const RecordVal *R = getValue(FieldName); - if (R == 0 || R->getValue() == 0) + if (!R || !R->getValue()) PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); @@ -1934,7 +1940,7 @@ Record *Record::getValueAsDef(StringRef FieldName) const { /// bool Record::getValueAsBit(StringRef FieldName) const { const RecordVal *R = getValue(FieldName); - if (R == 0 || R->getValue() == 0) + if (!R || !R->getValue()) PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); @@ -1946,7 +1952,7 @@ bool Record::getValueAsBit(StringRef FieldName) const { bool Record::getValueAsBitOrUnset(StringRef FieldName, bool &Unset) const { const RecordVal *R = getValue(FieldName); - if (R == 0 || R->getValue() == 0) + if (!R || !R->getValue()) PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName.str() + "'!\n"); @@ -1967,7 +1973,7 @@ bool Record::getValueAsBitOrUnset(StringRef FieldName, bool &Unset) const { /// DagInit *Record::getValueAsDag(StringRef FieldName) const { const RecordVal *R = getValue(FieldName); - if (R == 0 || R->getValue() == 0) + if (!R || !R->getValue()) PrintFatalError(getLoc(), "Record `" + getName() + "' does not have a field named `" + FieldName + "'!\n"); diff --git a/lib/TableGen/TGLexer.cpp b/lib/TableGen/TGLexer.cpp index c6be4f8..1ec2eea 100644 --- a/lib/TableGen/TGLexer.cpp +++ b/lib/TableGen/TGLexer.cpp @@ -30,7 +30,7 @@ 
TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) { CurBuffer = 0; CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); CurPtr = CurBuf->getBufferStart(); - TokStart = 0; + TokStart = nullptr; } SMLoc TGLexer::getLoc() const { @@ -389,12 +389,12 @@ tgtok::TokKind TGLexer::LexNumber() { return ReturnError(TokStart, "Invalid hexadecimal number"); errno = 0; - CurIntVal = strtoll(NumStart, 0, 16); + CurIntVal = strtoll(NumStart, nullptr, 16); if (errno == EINVAL) return ReturnError(TokStart, "Invalid hexadecimal number"); if (errno == ERANGE) { errno = 0; - CurIntVal = (int64_t)strtoull(NumStart, 0, 16); + CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16); if (errno == EINVAL) return ReturnError(TokStart, "Invalid hexadecimal number"); if (errno == ERANGE) @@ -410,7 +410,7 @@ tgtok::TokKind TGLexer::LexNumber() { // Requires at least one binary digit. if (CurPtr == NumStart) return ReturnError(CurPtr-2, "Invalid binary number"); - CurIntVal = strtoll(NumStart, 0, 2); + CurIntVal = strtoll(NumStart, nullptr, 2); return tgtok::IntVal; } } @@ -425,7 +425,7 @@ tgtok::TokKind TGLexer::LexNumber() { while (isdigit(CurPtr[0])) ++CurPtr; - CurIntVal = strtoll(TokStart, 0, 10); + CurIntVal = strtoll(TokStart, nullptr, 10); return tgtok::IntVal; } @@ -478,6 +478,7 @@ tgtok::TokKind TGLexer::LexExclaim() { .Case("empty", tgtok::XEmpty) .Case("subst", tgtok::XSubst) .Case("foreach", tgtok::XForEach) + .Case("listconcat", tgtok::XListConcat) .Case("strconcat", tgtok::XStrConcat) .Default(tgtok::Error); diff --git a/lib/TableGen/TGLexer.h b/lib/TableGen/TGLexer.h index d1bd70d..1e599f8 100644 --- a/lib/TableGen/TGLexer.h +++ b/lib/TableGen/TGLexer.h @@ -47,7 +47,7 @@ namespace tgtok { MultiClass, String, // !keywords. - XConcat, XADD, XSRA, XSRL, XSHL, XStrConcat, XCast, XSubst, + XConcat, XADD, XSRA, XSRL, XSHL, XListConcat, XStrConcat, XCast, XSubst, XForEach, XHead, XTail, XEmpty, XIf, XEq, // Integer value. 
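The new XListConcat token added above is lowered by the parser (TGParser.cpp, next) to a BinOpInit::LISTCONCAT opcode. The fold itself lives in lib/TableGen/Record.cpp and is not part of this excerpt; as a rough sketch only, assuming both operands have already resolved to ListInits of a common element type (LHS/RHS are the operand Inits, as elsewhere in this patch), the concatenation amounts to:

    // Sketch of folding LISTCONCAT once both sides are concrete lists.
    ListInit *LHSs = dyn_cast<ListInit>(LHS);
    ListInit *RHSs = dyn_cast<ListInit>(RHS);
    if (LHSs && RHSs) {
      SmallVector<Init *, 8> Args;
      Args.insert(Args.end(), LHSs->begin(), LHSs->end()); // LHS elements first
      Args.insert(Args.end(), RHSs->begin(), RHSs->end()); // then RHS elements
      return ListInit::get(Args,
                           cast<ListRecTy>(LHSs->getType())->getElementType());
    }
    // Otherwise leave the BinOpInit unresolved and fold again later.
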
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp index 4ba769c..038e018 100644 --- a/lib/TableGen/TGParser.cpp +++ b/lib/TableGen/TGParser.cpp @@ -29,18 +29,18 @@ struct SubClassReference { SMRange RefRange; Record *Rec; std::vector TemplateArgs; - SubClassReference() : Rec(0) {} + SubClassReference() : Rec(nullptr) {} - bool isInvalid() const { return Rec == 0; } + bool isInvalid() const { return Rec == nullptr; } }; struct SubMultiClassReference { SMRange RefRange; MultiClass *MC; std::vector TemplateArgs; - SubMultiClassReference() : MC(0) {} + SubMultiClassReference() : MC(nullptr) {} - bool isInvalid() const { return MC == 0; } + bool isInvalid() const { return MC == nullptr; } void dump() const; }; @@ -61,7 +61,7 @@ void SubMultiClassReference::dump() const { } // end namespace llvm bool TGParser::AddValue(Record *CurRec, SMLoc Loc, const RecordVal &RV) { - if (CurRec == 0) + if (!CurRec) CurRec = &CurMultiClass->Rec; if (RecordVal *ERV = CurRec->getValue(RV.getNameInit())) { @@ -83,10 +83,10 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName, const std::vector &BitList, Init *V) { if (!V) return false; - if (CurRec == 0) CurRec = &CurMultiClass->Rec; + if (!CurRec) CurRec = &CurMultiClass->Rec; RecordVal *RV = CurRec->getValue(ValName); - if (RV == 0) + if (!RV) return Error(Loc, "Value '" + ValName->getAsUnquotedString() + "' unknown!"); @@ -103,19 +103,19 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName, // if (!BitList.empty()) { BitsInit *CurVal = dyn_cast(RV->getValue()); - if (CurVal == 0) + if (!CurVal) return Error(Loc, "Value '" + ValName->getAsUnquotedString() + "' is not a bits type"); // Convert the incoming value to a bits type of the appropriate size... Init *BI = V->convertInitializerTo(BitsRecTy::get(BitList.size())); - if (BI == 0) { + if (!BI) { return Error(Loc, "Initializer is not compatible with bit range"); } // We should have a BitsInit type now. BitsInit *BInit = dyn_cast(BI); - assert(BInit != 0); + assert(BInit != nullptr); SmallVector NewBits(CurVal->getNumBits()); @@ -129,7 +129,7 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName, } for (unsigned i = 0, e = CurVal->getNumBits(); i != e; ++i) - if (NewBits[i] == 0) + if (!NewBits[i]) NewBits[i] = CurVal->getBit(i); V = BitsInit::get(NewBits); @@ -314,14 +314,14 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){ assert(IterVals.size() < Loops.size()); ForeachLoop &CurLoop = Loops[IterVals.size()]; ListInit *List = dyn_cast(CurLoop.ListValue); - if (List == 0) { + if (!List) { Error(Loc, "Loop list is not a list"); return true; } // Process each value. for (int64_t i = 0; i < List->getSize(); ++i) { - Init *ItemVal = List->resolveListElementReference(*CurRec, 0, i); + Init *ItemVal = List->resolveListElementReference(*CurRec, nullptr, i); IterVals.push_back(IterRecord(CurLoop.IterVar, ItemVal)); if (ProcessForeachDefs(CurRec, Loc, IterVals)) return true; @@ -339,7 +339,7 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){ for (unsigned i = 0, e = IterVals.size(); i != e; ++i) { VarInit *IterVar = IterVals[i].IterVar; TypedInit *IVal = dyn_cast(IterVals[i].IterValue); - if (IVal == 0) { + if (!IVal) { Error(Loc, "foreach iterator value is untyped"); return true; } @@ -400,21 +400,21 @@ Init *TGParser::ParseObjectName(MultiClass *CurMultiClass) { // These are all of the tokens that can begin an object body. 
// Some of these can also begin values but we disallow those cases // because they are unlikely to be useful. - return 0; + return nullptr; default: break; } - Record *CurRec = 0; + Record *CurRec = nullptr; if (CurMultiClass) CurRec = &CurMultiClass->Rec; - RecTy *Type = 0; + RecTy *Type = nullptr; if (CurRec) { const TypedInit *CurRecName = dyn_cast(CurRec->getNameInit()); if (!CurRecName) { TokError("Record name is not typed!"); - return 0; + return nullptr; } Type = CurRecName->getType(); } @@ -430,11 +430,11 @@ Init *TGParser::ParseObjectName(MultiClass *CurMultiClass) { Record *TGParser::ParseClassID() { if (Lex.getCode() != tgtok::Id) { TokError("expected name for ClassID"); - return 0; + return nullptr; } Record *Result = Records.getClass(Lex.getCurStrVal()); - if (Result == 0) + if (!Result) TokError("Couldn't find class '" + Lex.getCurStrVal() + "'"); Lex.Lex(); @@ -449,11 +449,11 @@ Record *TGParser::ParseClassID() { MultiClass *TGParser::ParseMultiClassID() { if (Lex.getCode() != tgtok::Id) { TokError("expected name for MultiClassID"); - return 0; + return nullptr; } MultiClass *Result = MultiClasses[Lex.getCurStrVal()]; - if (Result == 0) + if (!Result) TokError("Couldn't find multiclass '" + Lex.getCurStrVal() + "'"); Lex.Lex(); @@ -477,7 +477,7 @@ ParseSubClassReference(Record *CurRec, bool isDefm) { } else { Result.Rec = ParseClassID(); } - if (Result.Rec == 0) return Result; + if (!Result.Rec) return Result; // If there is no template arg list, we're done. if (Lex.getCode() != tgtok::less) { @@ -488,19 +488,19 @@ ParseSubClassReference(Record *CurRec, bool isDefm) { if (Lex.getCode() == tgtok::greater) { TokError("subclass reference requires a non-empty list of template values"); - Result.Rec = 0; + Result.Rec = nullptr; return Result; } Result.TemplateArgs = ParseValueList(CurRec, Result.Rec); if (Result.TemplateArgs.empty()) { - Result.Rec = 0; // Error parsing value list. + Result.Rec = nullptr; // Error parsing value list. return Result; } if (Lex.getCode() != tgtok::greater) { TokError("expected '>' in template value list"); - Result.Rec = 0; + Result.Rec = nullptr; return Result; } Lex.Lex(); @@ -522,7 +522,7 @@ ParseSubMultiClassReference(MultiClass *CurMC) { Result.RefRange.Start = Lex.getLoc(); Result.MC = ParseMultiClassID(); - if (Result.MC == 0) return Result; + if (!Result.MC) return Result; // If there is no template arg list, we're done. if (Lex.getCode() != tgtok::less) { @@ -533,19 +533,19 @@ ParseSubMultiClassReference(MultiClass *CurMC) { if (Lex.getCode() == tgtok::greater) { TokError("subclass reference requires a non-empty list of template values"); - Result.MC = 0; + Result.MC = nullptr; return Result; } Result.TemplateArgs = ParseValueList(&CurMC->Rec, &Result.MC->Rec); if (Result.TemplateArgs.empty()) { - Result.MC = 0; // Error parsing value list. + Result.MC = nullptr; // Error parsing value list. 
return Result; } if (Lex.getCode() != tgtok::greater) { TokError("expected '>' in template value list"); - Result.MC = 0; + Result.MC = nullptr; return Result; } Lex.Lex(); @@ -677,7 +677,7 @@ bool TGParser::ParseOptionalBitList(std::vector &Ranges) { /// RecTy *TGParser::ParseType() { switch (Lex.getCode()) { - default: TokError("Unknown token when expecting a type"); return 0; + default: TokError("Unknown token when expecting a type"); return nullptr; case tgtok::String: Lex.Lex(); return StringRecTy::get(); case tgtok::Code: Lex.Lex(); return StringRecTy::get(); case tgtok::Bit: Lex.Lex(); return BitRecTy::get(); @@ -685,20 +685,20 @@ RecTy *TGParser::ParseType() { case tgtok::Dag: Lex.Lex(); return DagRecTy::get(); case tgtok::Id: if (Record *R = ParseClassID()) return RecordRecTy::get(R); - return 0; + return nullptr; case tgtok::Bits: { if (Lex.Lex() != tgtok::less) { // Eat 'bits' TokError("expected '<' after bits type"); - return 0; + return nullptr; } if (Lex.Lex() != tgtok::IntVal) { // Eat '<' TokError("expected integer in bits type"); - return 0; + return nullptr; } uint64_t Val = Lex.getCurIntVal(); if (Lex.Lex() != tgtok::greater) { // Eat count. TokError("expected '>' at end of bits type"); - return 0; + return nullptr; } Lex.Lex(); // Eat '>' return BitsRecTy::get(Val); @@ -706,15 +706,15 @@ RecTy *TGParser::ParseType() { case tgtok::List: { if (Lex.Lex() != tgtok::less) { // Eat 'bits' TokError("expected '<' after list type"); - return 0; + return nullptr; } Lex.Lex(); // Eat '<' RecTy *SubType = ParseType(); - if (SubType == 0) return 0; + if (!SubType) return nullptr; if (Lex.getCode() != tgtok::greater) { TokError("expected '>' at end of list type"); - return 0; + return nullptr; } Lex.Lex(); // Eat '>' return ListRecTy::get(SubType); @@ -772,7 +772,7 @@ Init *TGParser::ParseIDValue(Record *CurRec, if (Mode == ParseValueMode) { Error(NameLoc, "Variable not defined: '" + Name + "'"); - return 0; + return nullptr; } return StringInit::get(Name); @@ -786,13 +786,13 @@ Init *TGParser::ParseOperation(Record *CurRec) { switch (Lex.getCode()) { default: TokError("unknown operation"); - return 0; + return nullptr; case tgtok::XHead: case tgtok::XTail: case tgtok::XEmpty: case tgtok::XCast: { // Value ::= !unop '(' Value ')' UnOpInit::UnaryOp Code; - RecTy *Type = 0; + RecTy *Type = nullptr; switch (Lex.getCode()) { default: llvm_unreachable("Unhandled code!"); @@ -802,9 +802,9 @@ Init *TGParser::ParseOperation(Record *CurRec) { Type = ParseOperatorType(); - if (Type == 0) { + if (!Type) { TokError("did not get type for unary operator"); - return 0; + return nullptr; } break; @@ -824,12 +824,12 @@ Init *TGParser::ParseOperation(Record *CurRec) { } if (Lex.getCode() != tgtok::l_paren) { TokError("expected '(' after unary operator"); - return 0; + return nullptr; } Lex.Lex(); // eat the '(' Init *LHS = ParseValue(CurRec); - if (LHS == 0) return 0; + if (!LHS) return nullptr; if (Code == UnOpInit::HEAD || Code == UnOpInit::TAIL @@ -837,36 +837,36 @@ Init *TGParser::ParseOperation(Record *CurRec) { ListInit *LHSl = dyn_cast(LHS); StringInit *LHSs = dyn_cast(LHS); TypedInit *LHSt = dyn_cast(LHS); - if (LHSl == 0 && LHSs == 0 && LHSt == 0) { + if (!LHSl && !LHSs && !LHSt) { TokError("expected list or string type argument in unary operator"); - return 0; + return nullptr; } if (LHSt) { ListRecTy *LType = dyn_cast(LHSt->getType()); StringRecTy *SType = dyn_cast(LHSt->getType()); - if (LType == 0 && SType == 0) { + if (!LType && !SType) { TokError("expected list or string type argumnet 
in unary operator"); - return 0; + return nullptr; } } if (Code == UnOpInit::HEAD || Code == UnOpInit::TAIL) { - if (LHSl == 0 && LHSt == 0) { + if (!LHSl && !LHSt) { TokError("expected list type argumnet in unary operator"); - return 0; + return nullptr; } if (LHSl && LHSl->getSize() == 0) { TokError("empty list argument in unary operator"); - return 0; + return nullptr; } if (LHSl) { Init *Item = LHSl->getElement(0); TypedInit *Itemt = dyn_cast(Item); - if (Itemt == 0) { + if (!Itemt) { TokError("untyped list element in unary operator"); - return 0; + return nullptr; } if (Code == UnOpInit::HEAD) { Type = Itemt->getType(); @@ -876,9 +876,9 @@ Init *TGParser::ParseOperation(Record *CurRec) { } else { assert(LHSt && "expected list type argument in unary operator"); ListRecTy *LType = dyn_cast(LHSt->getType()); - if (LType == 0) { + if (!LType) { TokError("expected list type argumnet in unary operator"); - return 0; + return nullptr; } if (Code == UnOpInit::HEAD) { Type = LType->getElementType(); @@ -891,7 +891,7 @@ Init *TGParser::ParseOperation(Record *CurRec) { if (Lex.getCode() != tgtok::r_paren) { TokError("expected ')' in unary operator"); - return 0; + return nullptr; } Lex.Lex(); // eat the ')' return (UnOpInit::get(Code, LHS, Type))->Fold(CurRec, CurMultiClass); @@ -903,13 +903,14 @@ Init *TGParser::ParseOperation(Record *CurRec) { case tgtok::XSRL: case tgtok::XSHL: case tgtok::XEq: + case tgtok::XListConcat: case tgtok::XStrConcat: { // Value ::= !binop '(' Value ',' Value ')' tgtok::TokKind OpTok = Lex.getCode(); SMLoc OpLoc = Lex.getLoc(); Lex.Lex(); // eat the operation BinOpInit::BinaryOp Code; - RecTy *Type = 0; + RecTy *Type = nullptr; switch (OpTok) { default: llvm_unreachable("Unhandled code!"); @@ -919,6 +920,10 @@ Init *TGParser::ParseOperation(Record *CurRec) { case tgtok::XSRL: Code = BinOpInit::SRL; Type = IntRecTy::get(); break; case tgtok::XSHL: Code = BinOpInit::SHL; Type = IntRecTy::get(); break; case tgtok::XEq: Code = BinOpInit::EQ; Type = BitRecTy::get(); break; + case tgtok::XListConcat: + Code = BinOpInit::LISTCONCAT; + // We don't know the list type until we parse the first argument + break; case tgtok::XStrConcat: Code = BinOpInit::STRCONCAT; Type = StringRecTy::get(); @@ -927,31 +932,44 @@ Init *TGParser::ParseOperation(Record *CurRec) { if (Lex.getCode() != tgtok::l_paren) { TokError("expected '(' after binary operator"); - return 0; + return nullptr; } Lex.Lex(); // eat the '(' SmallVector InitList; InitList.push_back(ParseValue(CurRec)); - if (InitList.back() == 0) return 0; + if (!InitList.back()) return nullptr; while (Lex.getCode() == tgtok::comma) { Lex.Lex(); // eat the ',' InitList.push_back(ParseValue(CurRec)); - if (InitList.back() == 0) return 0; + if (!InitList.back()) return nullptr; } if (Lex.getCode() != tgtok::r_paren) { TokError("expected ')' in operator"); - return 0; + return nullptr; } Lex.Lex(); // eat the ')' + // If we are doing !listconcat, we should know the type by now + if (OpTok == tgtok::XListConcat) { + if (VarInit *Arg0 = dyn_cast(InitList[0])) + Type = Arg0->getType(); + else if (ListInit *Arg0 = dyn_cast(InitList[0])) + Type = Arg0->getType(); + else { + InitList[0]->dump(); + Error(OpLoc, "expected a list"); + return nullptr; + } + } + // We allow multiple operands to associative operators like !strconcat as // shorthand for nesting them. 
- if (Code == BinOpInit::STRCONCAT) { + if (Code == BinOpInit::STRCONCAT || Code == BinOpInit::LISTCONCAT) { while (InitList.size() > 2) { Init *RHS = InitList.pop_back_val(); RHS = (BinOpInit::get(Code, InitList.back(), RHS, Type)) @@ -965,14 +983,14 @@ Init *TGParser::ParseOperation(Record *CurRec) { ->Fold(CurRec, CurMultiClass); Error(OpLoc, "expected two operands to operator"); - return 0; + return nullptr; } case tgtok::XIf: case tgtok::XForEach: case tgtok::XSubst: { // Value ::= !ternop '(' Value ',' Value ',' Value ')' TernOpInit::TernaryOp Code; - RecTy *Type = 0; + RecTy *Type = nullptr; tgtok::TokKind LexCode = Lex.getCode(); Lex.Lex(); // eat the operation @@ -990,42 +1008,42 @@ Init *TGParser::ParseOperation(Record *CurRec) { } if (Lex.getCode() != tgtok::l_paren) { TokError("expected '(' after ternary operator"); - return 0; + return nullptr; } Lex.Lex(); // eat the '(' Init *LHS = ParseValue(CurRec); - if (LHS == 0) return 0; + if (!LHS) return nullptr; if (Lex.getCode() != tgtok::comma) { TokError("expected ',' in ternary operator"); - return 0; + return nullptr; } Lex.Lex(); // eat the ',' Init *MHS = ParseValue(CurRec); - if (MHS == 0) return 0; + if (!MHS) return nullptr; if (Lex.getCode() != tgtok::comma) { TokError("expected ',' in ternary operator"); - return 0; + return nullptr; } Lex.Lex(); // eat the ',' Init *RHS = ParseValue(CurRec); - if (RHS == 0) return 0; + if (!RHS) return nullptr; if (Lex.getCode() != tgtok::r_paren) { TokError("expected ')' in binary operator"); - return 0; + return nullptr; } Lex.Lex(); // eat the ')' switch (LexCode) { default: llvm_unreachable("Unhandled code!"); case tgtok::XIf: { - RecTy *MHSTy = 0; - RecTy *RHSTy = 0; + RecTy *MHSTy = nullptr; + RecTy *RHSTy = nullptr; if (TypedInit *MHSt = dyn_cast(MHS)) MHSTy = MHSt->getType(); @@ -1049,7 +1067,7 @@ Init *TGParser::ParseOperation(Record *CurRec) { if (!MHSTy || !RHSTy) { TokError("could not get type for !if"); - return 0; + return nullptr; } if (MHSTy->typeIsConvertibleTo(RHSTy)) { @@ -1058,24 +1076,24 @@ Init *TGParser::ParseOperation(Record *CurRec) { Type = MHSTy; } else { TokError("inconsistent types for !if"); - return 0; + return nullptr; } break; } case tgtok::XForEach: { TypedInit *MHSt = dyn_cast(MHS); - if (MHSt == 0) { + if (!MHSt) { TokError("could not get type for !foreach"); - return 0; + return nullptr; } Type = MHSt->getType(); break; } case tgtok::XSubst: { TypedInit *RHSt = dyn_cast(RHS); - if (RHSt == 0) { + if (!RHSt) { TokError("could not get type for !subst"); - return 0; + return nullptr; } Type = RHSt->getType(); break; @@ -1093,24 +1111,24 @@ Init *TGParser::ParseOperation(Record *CurRec) { /// OperatorType ::= '<' Type '>' /// RecTy *TGParser::ParseOperatorType() { - RecTy *Type = 0; + RecTy *Type = nullptr; if (Lex.getCode() != tgtok::less) { TokError("expected type name for operator"); - return 0; + return nullptr; } Lex.Lex(); // eat the < Type = ParseType(); - if (Type == 0) { + if (!Type) { TokError("expected type name for operator"); - return 0; + return nullptr; } if (Lex.getCode() != tgtok::greater) { TokError("expected type name for operator"); - return 0; + return nullptr; } Lex.Lex(); // eat the > @@ -1134,11 +1152,12 @@ RecTy *TGParser::ParseOperatorType() { /// SimpleValue ::= SHLTOK '(' Value ',' Value ')' /// SimpleValue ::= SRATOK '(' Value ',' Value ')' /// SimpleValue ::= SRLTOK '(' Value ',' Value ')' +/// SimpleValue ::= LISTCONCATTOK '(' Value ',' Value ')' /// SimpleValue ::= STRCONCATTOK '(' Value ',' Value ')' /// Init 
*TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) { - Init *R = 0; + Init *R = nullptr; switch (Lex.getCode()) { default: TokError("Unknown token when parsing a value"); break; case tgtok::paste: @@ -1177,7 +1196,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, // Value ::= ID '<' ValueListNE '>' if (Lex.Lex() == tgtok::greater) { TokError("expected non-empty value list"); - return 0; + return nullptr; } // This is a CLASS expression. This is supposed to synthesize @@ -1186,15 +1205,15 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, Record *Class = Records.getClass(Name); if (!Class) { Error(NameLoc, "Expected a class name, got '" + Name + "'"); - return 0; + return nullptr; } std::vector ValueList = ParseValueList(CurRec, Class); - if (ValueList.empty()) return 0; + if (ValueList.empty()) return nullptr; if (Lex.getCode() != tgtok::greater) { TokError("expected '>' at end of value list"); - return 0; + return nullptr; } Lex.Lex(); // eat the '>' SMLoc EndLoc = Lex.getLoc(); @@ -1208,7 +1227,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, SCRef.TemplateArgs = ValueList; // Add info about the subclass to NewRec. if (AddSubClass(NewRec, SCRef)) - return 0; + return nullptr; if (!CurMultiClass) { NewRec->resolveReferences(); Records.addDef(NewRec); @@ -1250,11 +1269,11 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, if (Lex.getCode() != tgtok::r_brace) { Vals = ParseValueList(CurRec); - if (Vals.empty()) return 0; + if (Vals.empty()) return nullptr; } if (Lex.getCode() != tgtok::r_brace) { TokError("expected '}' at end of bit list value"); - return 0; + return nullptr; } Lex.Lex(); // eat the '}' @@ -1262,10 +1281,10 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, for (unsigned i = 0, e = Vals.size(); i != e; ++i) { Init *Bit = Vals[i]->convertInitializerTo(BitRecTy::get()); - if (Bit == 0) { + if (!Bit) { Error(BraceLoc, "Element #" + utostr(i) + " (" + Vals[i]->getAsString()+ ") is not convertable to a bit"); - return 0; + return nullptr; } NewBits[Vals.size()-i-1] = Bit; } @@ -1275,87 +1294,87 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, Lex.Lex(); // eat the '[' std::vector Vals; - RecTy *DeducedEltTy = 0; - ListRecTy *GivenListTy = 0; + RecTy *DeducedEltTy = nullptr; + ListRecTy *GivenListTy = nullptr; - if (ItemType != 0) { + if (ItemType) { ListRecTy *ListType = dyn_cast(ItemType); - if (ListType == 0) { + if (!ListType) { std::string s; raw_string_ostream ss(s); ss << "Type mismatch for list, expected list type, got " << ItemType->getAsString(); TokError(ss.str()); - return 0; + return nullptr; } GivenListTy = ListType; } if (Lex.getCode() != tgtok::r_square) { - Vals = ParseValueList(CurRec, 0, - GivenListTy ? GivenListTy->getElementType() : 0); - if (Vals.empty()) return 0; + Vals = ParseValueList(CurRec, nullptr, + GivenListTy ? 
GivenListTy->getElementType() : nullptr); + if (Vals.empty()) return nullptr; } if (Lex.getCode() != tgtok::r_square) { TokError("expected ']' at end of list value"); - return 0; + return nullptr; } Lex.Lex(); // eat the ']' - RecTy *GivenEltTy = 0; + RecTy *GivenEltTy = nullptr; if (Lex.getCode() == tgtok::less) { // Optional list element type Lex.Lex(); // eat the '<' GivenEltTy = ParseType(); - if (GivenEltTy == 0) { + if (!GivenEltTy) { // Couldn't parse element type - return 0; + return nullptr; } if (Lex.getCode() != tgtok::greater) { TokError("expected '>' at end of list element type"); - return 0; + return nullptr; } Lex.Lex(); // eat the '>' } // Check elements - RecTy *EltTy = 0; + RecTy *EltTy = nullptr; for (std::vector::iterator i = Vals.begin(), ie = Vals.end(); i != ie; ++i) { TypedInit *TArg = dyn_cast(*i); - if (TArg == 0) { + if (!TArg) { TokError("Untyped list element"); - return 0; + return nullptr; } - if (EltTy != 0) { + if (EltTy) { EltTy = resolveTypes(EltTy, TArg->getType()); - if (EltTy == 0) { + if (!EltTy) { TokError("Incompatible types in list elements"); - return 0; + return nullptr; } } else { EltTy = TArg->getType(); } } - if (GivenEltTy != 0) { - if (EltTy != 0) { + if (GivenEltTy) { + if (EltTy) { // Verify consistency if (!EltTy->typeIsConvertibleTo(GivenEltTy)) { TokError("Incompatible types in list elements"); - return 0; + return nullptr; } } EltTy = GivenEltTy; } - if (EltTy == 0) { - if (ItemType == 0) { + if (!EltTy) { + if (!ItemType) { TokError("No type for list"); - return 0; + return nullptr; } DeducedEltTy = GivenListTy->getElementType(); } else { @@ -1363,7 +1382,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, if (GivenListTy) { if (!EltTy->typeIsConvertibleTo(GivenListTy->getElementType())) { TokError("Element type mismatch for list"); - return 0; + return nullptr; } } DeducedEltTy = EltTy; @@ -1375,18 +1394,18 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, Lex.Lex(); // eat the '(' if (Lex.getCode() != tgtok::Id && Lex.getCode() != tgtok::XCast) { TokError("expected identifier in dag init"); - return 0; + return nullptr; } Init *Operator = ParseValue(CurRec); - if (Operator == 0) return 0; + if (!Operator) return nullptr; // If the operator name is present, parse it. std::string OperatorName; if (Lex.getCode() == tgtok::colon) { if (Lex.Lex() != tgtok::VarName) { // eat the ':' TokError("expected variable name in dag operator"); - return 0; + return nullptr; } OperatorName = Lex.getCurStrVal(); Lex.Lex(); // eat the VarName. 
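For reference, the element-type deduction in the list-literal case above boils down to folding resolveTypes over the elements; a condensed sketch using the same names as ParseSimpleValue, with the error messages elided:

    // Deduce a common element type for [v0, v1, ...].
    RecTy *EltTy = nullptr;
    for (Init *V : Vals) {
      TypedInit *TArg = dyn_cast<TypedInit>(V);
      if (!TArg)
        return nullptr;                                     // untyped list element
      EltTy = EltTy ? resolveTypes(EltTy, TArg->getType())  // unify with prior
                    : TArg->getType();                      // first element seeds it
      if (!EltTy)
        return nullptr;                                     // incompatible elements
    }
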
@@ -1395,12 +1414,12 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, std::vector > DagArgs; if (Lex.getCode() != tgtok::r_paren) { DagArgs = ParseDagArgList(CurRec); - if (DagArgs.empty()) return 0; + if (DagArgs.empty()) return nullptr; } if (Lex.getCode() != tgtok::r_paren) { TokError("expected ')' in dag init"); - return 0; + return nullptr; } Lex.Lex(); // eat the ')' @@ -1417,6 +1436,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, case tgtok::XSRL: case tgtok::XSHL: case tgtok::XEq: + case tgtok::XListConcat: case tgtok::XStrConcat: // Value ::= !binop '(' Value ',' Value ')' case tgtok::XIf: case tgtok::XForEach: @@ -1437,7 +1457,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, /// Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) { Init *Result = ParseSimpleValue(CurRec, ItemType, Mode); - if (Result == 0) return 0; + if (!Result) return nullptr; // Parse the suffixes now if present. while (1) { @@ -1451,20 +1471,20 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) { SMLoc CurlyLoc = Lex.getLoc(); Lex.Lex(); // eat the '{' std::vector Ranges = ParseRangeList(); - if (Ranges.empty()) return 0; + if (Ranges.empty()) return nullptr; // Reverse the bitlist. std::reverse(Ranges.begin(), Ranges.end()); Result = Result->convertInitializerBitRange(Ranges); - if (Result == 0) { + if (!Result) { Error(CurlyLoc, "Invalid bit range for value"); - return 0; + return nullptr; } // Eat the '}'. if (Lex.getCode() != tgtok::r_brace) { TokError("expected '}' at end of bit range list"); - return 0; + return nullptr; } Lex.Lex(); break; @@ -1473,18 +1493,18 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) { SMLoc SquareLoc = Lex.getLoc(); Lex.Lex(); // eat the '[' std::vector Ranges = ParseRangeList(); - if (Ranges.empty()) return 0; + if (Ranges.empty()) return nullptr; Result = Result->convertInitListSlice(Ranges); - if (Result == 0) { + if (!Result) { Error(SquareLoc, "Invalid range for list slice"); - return 0; + return nullptr; } // Eat the ']'. if (Lex.getCode() != tgtok::r_square) { TokError("expected ']' at end of list slice"); - return 0; + return nullptr; } Lex.Lex(); break; @@ -1492,12 +1512,12 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) { case tgtok::period: if (Lex.Lex() != tgtok::Id) { // eat the . TokError("expected field identifier after '.'"); - return 0; + return nullptr; } if (!Result->getFieldType(Lex.getCurStrVal())) { TokError("Cannot access field '" + Lex.getCurStrVal() + "' of value '" + Result->getAsString() + "'"); - return 0; + return nullptr; } Result = FieldInit::get(Result, Lex.getCurStrVal()); Lex.Lex(); // eat field name @@ -1512,14 +1532,14 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) { TypedInit *LHS = dyn_cast(Result); if (!LHS) { Error(PasteLoc, "LHS of paste is not typed!"); - return 0; + return nullptr; } if (LHS->getType() != StringRecTy::get()) { LHS = UnOpInit::get(UnOpInit::CAST, LHS, StringRecTy::get()); } - TypedInit *RHS = 0; + TypedInit *RHS = nullptr; Lex.Lex(); // Eat the '#'. 
switch (Lex.getCode()) { @@ -1539,7 +1559,7 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) { RHS = dyn_cast(RHSResult); if (!RHS) { Error(PasteLoc, "RHS of paste is not typed!"); - return 0; + return nullptr; } if (RHS->getType() != StringRecTy::get()) { @@ -1575,7 +1595,7 @@ TGParser::ParseDagArgList(Record *CurRec) { } else { // DagArg ::= Value (':' VARNAME)? Init *Val = ParseValue(CurRec); - if (Val == 0) + if (!Val) return std::vector >(); // If the variable name is present, add it. @@ -1610,7 +1630,7 @@ std::vector TGParser::ParseValueList(Record *CurRec, Record *ArgsRec, std::vector Result; RecTy *ItemType = EltTy; unsigned int ArgN = 0; - if (ArgsRec != 0 && EltTy == 0) { + if (ArgsRec && !EltTy) { const std::vector &TArgs = ArgsRec->getTemplateArgs(); if (!TArgs.size()) { TokError("template argument provided to non-template class"); @@ -1626,12 +1646,12 @@ std::vector TGParser::ParseValueList(Record *CurRec, Record *ArgsRec, ++ArgN; } Result.push_back(ParseValue(CurRec, ItemType)); - if (Result.back() == 0) return std::vector(); + if (!Result.back()) return std::vector(); while (Lex.getCode() == tgtok::comma) { Lex.Lex(); // Eat the comma - if (ArgsRec != 0 && EltTy == 0) { + if (ArgsRec && !EltTy) { const std::vector &TArgs = ArgsRec->getTemplateArgs(); if (ArgN >= TArgs.size()) { TokError("too many template arguments"); @@ -1643,7 +1663,7 @@ std::vector TGParser::ParseValueList(Record *CurRec, Record *ArgsRec, ++ArgN; } Result.push_back(ParseValue(CurRec, ItemType)); - if (Result.back() == 0) return std::vector(); + if (!Result.back()) return std::vector(); } return Result; @@ -1667,11 +1687,11 @@ Init *TGParser::ParseDeclaration(Record *CurRec, if (HasField) Lex.Lex(); RecTy *Type = ParseType(); - if (Type == 0) return 0; + if (!Type) return nullptr; if (Lex.getCode() != tgtok::Id) { TokError("Expected identifier in declaration"); - return 0; + return nullptr; } SMLoc IdLoc = Lex.getLoc(); @@ -1691,16 +1711,16 @@ Init *TGParser::ParseDeclaration(Record *CurRec, // Add the value. if (AddValue(CurRec, IdLoc, RecordVal(DeclName, Type, HasField))) - return 0; + return nullptr; // If a value is present, parse it. if (Lex.getCode() == tgtok::equal) { Lex.Lex(); SMLoc ValLoc = Lex.getLoc(); Init *Val = ParseValue(CurRec, Type); - if (Val == 0 || + if (!Val || SetValue(CurRec, ValLoc, DeclName, std::vector(), Val)) - return 0; + return nullptr; } return DeclName; @@ -1717,7 +1737,7 @@ Init *TGParser::ParseDeclaration(Record *CurRec, VarInit *TGParser::ParseForeachDeclaration(ListInit *&ForeachListValue) { if (Lex.getCode() != tgtok::Id) { TokError("Expected identifier in foreach declaration"); - return 0; + return nullptr; } Init *DeclName = StringInit::get(Lex.getCurStrVal()); @@ -1726,27 +1746,27 @@ VarInit *TGParser::ParseForeachDeclaration(ListInit *&ForeachListValue) { // If a value is present, parse it. 
if (Lex.getCode() != tgtok::equal) { TokError("Expected '=' in foreach declaration"); - return 0; + return nullptr; } Lex.Lex(); // Eat the '=' - RecTy *IterType = 0; + RecTy *IterType = nullptr; std::vector Ranges; switch (Lex.getCode()) { - default: TokError("Unknown token when expecting a range list"); return 0; + default: TokError("Unknown token when expecting a range list"); return nullptr; case tgtok::l_square: { // '[' ValueList ']' - Init *List = ParseSimpleValue(0, 0, ParseForeachMode); + Init *List = ParseSimpleValue(nullptr, nullptr, ParseForeachMode); ForeachListValue = dyn_cast(List); - if (ForeachListValue == 0) { + if (!ForeachListValue) { TokError("Expected a Value list"); - return 0; + return nullptr; } RecTy *ValueType = ForeachListValue->getType(); ListRecTy *ListType = dyn_cast(ValueType); - if (ListType == 0) { + if (!ListType) { TokError("Value list is not of list type"); - return 0; + return nullptr; } IterType = ListType->getElementType(); break; @@ -1754,7 +1774,7 @@ VarInit *TGParser::ParseForeachDeclaration(ListInit *&ForeachListValue) { case tgtok::IntVal: { // RangePiece. if (ParseRangePiece(Ranges)) - return 0; + return nullptr; break; } @@ -1763,7 +1783,7 @@ VarInit *TGParser::ParseForeachDeclaration(ListInit *&ForeachListValue) { Ranges = ParseRangeList(); if (Lex.getCode() != tgtok::r_brace) { TokError("expected '}' at end of bit range list"); - return 0; + return nullptr; } Lex.Lex(); break; @@ -1780,7 +1800,7 @@ VarInit *TGParser::ParseForeachDeclaration(ListInit *&ForeachListValue) { } if (!IterType) - return 0; + return nullptr; return VarInit::get(DeclName, IterType); } @@ -1800,7 +1820,7 @@ bool TGParser::ParseTemplateArgList(Record *CurRec) { // Read the first declaration. Init *TemplArg = ParseDeclaration(CurRec, true/*templateargs*/); - if (TemplArg == 0) + if (!TemplArg) return true; TheRecToAddTo->addTemplateArg(TemplArg); @@ -1810,7 +1830,7 @@ bool TGParser::ParseTemplateArgList(Record *CurRec) { // Read the following declarations. TemplArg = ParseDeclaration(CurRec, true/*templateargs*/); - if (TemplArg == 0) + if (!TemplArg) return true; TheRecToAddTo->addTemplateArg(TemplArg); } @@ -1828,7 +1848,7 @@ bool TGParser::ParseTemplateArgList(Record *CurRec) { /// BodyItem ::= LET ID OptionalBitList '=' Value ';' bool TGParser::ParseBodyItem(Record *CurRec) { if (Lex.getCode() != tgtok::Let) { - if (ParseDeclaration(CurRec, false) == 0) + if (!ParseDeclaration(CurRec, false)) return true; if (Lex.getCode() != tgtok::semi) @@ -1855,13 +1875,13 @@ bool TGParser::ParseBodyItem(Record *CurRec) { Lex.Lex(); // eat the '='. RecordVal *Field = CurRec->getValue(FieldName); - if (Field == 0) + if (!Field) return TokError("Value '" + FieldName + "' unknown!"); RecTy *Type = Field->getType(); Init *Val = ParseValue(CurRec, Type); - if (Val == 0) return true; + if (!Val) return true; if (Lex.getCode() != tgtok::semi) return TokError("expected ';' after let expression"); @@ -1927,7 +1947,7 @@ bool TGParser::ParseObjectBody(Record *CurRec) { SubClassReference SubClass = ParseSubClassReference(CurRec, false); while (1) { // Check for error. - if (SubClass.Rec == 0) return true; + if (!SubClass.Rec) return true; // Add it. if (AddSubClass(CurRec, SubClass)) @@ -1998,7 +2018,7 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) { } else if (ParseObjectBody(CurRec)) return true; - if (CurMultiClass == 0) // Def's in multiclasses aren't really defs. + if (!CurMultiClass) // Def's in multiclasses aren't really defs. // See Record::setName(). 
This resolve step will see any new name // for the def that might have been created when resolving // inheritance, values and arguments above. @@ -2040,9 +2060,9 @@ bool TGParser::ParseForeach(MultiClass *CurMultiClass) { // Make a temporary object to record items associated with the for // loop. - ListInit *ListValue = 0; + ListInit *ListValue = nullptr; VarInit *IterName = ParseForeachDeclaration(ListValue); - if (IterName == 0) + if (!IterName) return TokError("expected declaration in for"); if (Lex.getCode() != tgtok::In) @@ -2144,8 +2164,8 @@ std::vector TGParser::ParseLetList() { } Lex.Lex(); // eat the '='. - Init *Val = ParseValue(0); - if (Val == 0) return std::vector(); + Init *Val = ParseValue(nullptr); + if (!Val) return std::vector(); // Now that we have everything, add the record. Result.push_back(LetRecord(Name, Bits, Val, NameLoc)); @@ -2228,7 +2248,7 @@ bool TGParser::ParseMultiClass() { // If there are template args, parse them. if (Lex.getCode() == tgtok::less) - if (ParseTemplateArgList(0)) + if (ParseTemplateArgList(nullptr)) return true; bool inherits = false; @@ -2244,7 +2264,7 @@ bool TGParser::ParseMultiClass() { ParseSubMultiClassReference(CurMultiClass); while (1) { // Check for error. - if (SubMultiClass.MC == 0) return true; + if (!SubMultiClass.MC) return true; // Add it. if (AddSubMultiClass(CurMultiClass, SubMultiClass)) @@ -2283,7 +2303,7 @@ bool TGParser::ParseMultiClass() { Lex.Lex(); // eat the '}'. } - CurMultiClass = 0; + CurMultiClass = nullptr; return false; } @@ -2301,7 +2321,7 @@ InstantiateMulticlassDef(MultiClass &MC, // as a prefix. bool IsAnonymous = false; - if (DefmPrefix == 0) { + if (!DefmPrefix) { DefmPrefix = StringInit::get(GetNewAnonymousName()); IsAnonymous = true; } @@ -2310,7 +2330,7 @@ InstantiateMulticlassDef(MultiClass &MC, StringInit *DefNameString = dyn_cast(DefName); - if (DefNameString != 0) { + if (DefNameString) { // We have a fully expanded string so there are no operators to // resolve. We should concatenate the given prefix and name. DefName = @@ -2338,13 +2358,13 @@ InstantiateMulticlassDef(MultiClass &MC, Error(DefmPrefixRange.Start, "Could not resolve " + CurRec->getNameInitAsString() + ":NAME to '" + DefmPrefix->getAsUnquotedString() + "'"); - return 0; + return nullptr; } // If the DefNameString didn't resolve, we probably have a reference to // NAME and need to replace it. We need to do at least this much greedily, // otherwise nested multiclasses will end up with incorrect NAME expansions. - if (DefNameString == 0) { + if (!DefNameString) { RecordVal *DefNameRV = CurRec->getValue("NAME"); CurRec->resolveReferencesTo(DefNameRV); } @@ -2369,7 +2389,7 @@ InstantiateMulticlassDef(MultiClass &MC, Error(DefmPrefixRange.Start, "def '" + CurRec->getNameInitAsString() + "' already defined, instantiating defm with subdef '" + DefProto->getNameInitAsString() + "'"); - return 0; + return nullptr; } Records.addDef(CurRec); @@ -2453,7 +2473,7 @@ bool TGParser::ResolveMulticlassDef(MultiClass &MC, bool TGParser::ParseDefm(MultiClass *CurMultiClass) { assert(Lex.getCode() == tgtok::Defm && "Unexpected token!"); SMLoc DefmLoc = Lex.getLoc(); - Init *DefmPrefix = 0; + Init *DefmPrefix = nullptr; if (Lex.Lex() == tgtok::Id) { // eat the defm. 
DefmPrefix = ParseObjectName(CurMultiClass); @@ -2473,10 +2493,10 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) { Lex.Lex(); SMLoc SubClassLoc = Lex.getLoc(); - SubClassReference Ref = ParseSubClassReference(0, true); + SubClassReference Ref = ParseSubClassReference(nullptr, true); while (1) { - if (Ref.Rec == 0) return true; + if (!Ref.Rec) return true; // To instantiate a multiclass, we need to first get the multiclass, then // instantiate each def contained in the multiclass with the SubClassRef @@ -2522,21 +2542,21 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) { // A defm can inherit from regular classes (non-multiclass) as // long as they come in the end of the inheritance list. - InheritFromClass = (Records.getClass(Lex.getCurStrVal()) != 0); + InheritFromClass = (Records.getClass(Lex.getCurStrVal()) != nullptr); if (InheritFromClass) break; - Ref = ParseSubClassReference(0, true); + Ref = ParseSubClassReference(nullptr, true); } if (InheritFromClass) { // Process all the classes to inherit as if they were part of a // regular 'def' and inherit all record values. - SubClassReference SubClass = ParseSubClassReference(0, false); + SubClassReference SubClass = ParseSubClassReference(nullptr, false); while (1) { // Check for error. - if (SubClass.Rec == 0) return true; + if (!SubClass.Rec) return true; // Get the expanded definition prototypes and teach them about // the record values the current class to inherit has @@ -2553,7 +2573,7 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) { if (Lex.getCode() != tgtok::comma) break; Lex.Lex(); // eat ','. - SubClass = ParseSubClassReference(0, false); + SubClass = ParseSubClassReference(nullptr, false); } } diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h index ce31f8e..6fd442a 100644 --- a/lib/TableGen/TGParser.h +++ b/lib/TableGen/TGParser.h @@ -85,7 +85,7 @@ class TGParser { public: TGParser(SourceMgr &SrcMgr, RecordKeeper &records) - : Lex(SrcMgr), CurMultiClass(0), Records(records), AnonCounter(0) {} + : Lex(SrcMgr), CurMultiClass(nullptr), Records(records), AnonCounter(0) {} /// ParseFile - Main entrypoint for parsing a tblgen file. These parser /// routines return true on error, or false on success. @@ -131,7 +131,7 @@ private: // Semantic analysis methods. bool ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals); private: // Parser methods. - bool ParseObjectList(MultiClass *MC = 0); + bool ParseObjectList(MultiClass *MC = nullptr); bool ParseObject(MultiClass *MC); bool ParseClass(); bool ParseMultiClass(); @@ -169,12 +169,12 @@ private: // Parser methods. 
Init *ParseIDValue(Record *CurRec, const std::string &Name, SMLoc NameLoc, IDParseMode Mode = ParseValueMode); - Init *ParseSimpleValue(Record *CurRec, RecTy *ItemType = 0, + Init *ParseSimpleValue(Record *CurRec, RecTy *ItemType = nullptr, IDParseMode Mode = ParseValueMode); - Init *ParseValue(Record *CurRec, RecTy *ItemType = 0, + Init *ParseValue(Record *CurRec, RecTy *ItemType = nullptr, IDParseMode Mode = ParseValueMode); - std::vector ParseValueList(Record *CurRec, Record *ArgsRec = 0, - RecTy *EltTy = 0); + std::vector ParseValueList(Record *CurRec, Record *ArgsRec = nullptr, + RecTy *EltTy = nullptr); std::vector > ParseDagArgList(Record *); bool ParseOptionalRangeList(std::vector &Ranges); bool ParseOptionalBitList(std::vector &Ranges); diff --git a/lib/TableGen/module.modulemap b/lib/TableGen/module.modulemap new file mode 100644 index 0000000..8dac0a2 --- /dev/null +++ b/lib/TableGen/module.modulemap @@ -0,0 +1 @@ +module TableGen { requires cplusplus umbrella "." module * { export * } } diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 0297de1..1c022aa 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -1,4 +1,4 @@ -//==-- AArch64.h - Top-level interface for AArch64 representation -*- C++ -*-=// +//==-- AArch64.h - Top-level interface for AArch64 --------------*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -12,35 +12,38 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AARCH64_H -#define LLVM_TARGET_AARCH64_H +#ifndef TARGET_AArch64_H +#define TARGET_AArch64_H +#include "Utils/AArch64BaseInfo.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Support/DataTypes.h" namespace llvm { -class AArch64AsmPrinter; -class FunctionPass; class AArch64TargetMachine; -class MachineInstr; -class MCInst; - -FunctionPass *createAArch64ISelDAG(AArch64TargetMachine &TM, - CodeGenOpt::Level OptLevel); +class FunctionPass; +class MachineFunctionPass; + +FunctionPass *createAArch64DeadRegisterDefinitions(); +FunctionPass *createAArch64ConditionalCompares(); +FunctionPass *createAArch64AdvSIMDScalar(); +FunctionPass *createAArch64BranchRelaxation(); +FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM, + CodeGenOpt::Level OptLevel); +FunctionPass *createAArch64StorePairSuppressPass(); +FunctionPass *createAArch64ExpandPseudoPass(); +FunctionPass *createAArch64LoadStoreOptimizationPass(); +ModulePass *createAArch64PromoteConstantPass(); +FunctionPass *createAArch64AddressTypePromotionPass(); +/// \brief Creates an AArch64-specific Target Transformation Info pass. +ImmutablePass * +createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); -FunctionPass *createAArch64BranchFixupPass(); - -/// \brief Creates an AArch64-specific Target Transformation Info pass.
-ImmutablePass *createAArch64TargetTransformInfoPass( - const AArch64TargetMachine *TM); - -void LowerAArch64MachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, - AArch64AsmPrinter &AP); - - -} +FunctionPass *createAArch64CollectLOHPass(); +} // end namespace llvm #endif diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index e49afd6..1ad5ac8 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -1,4 +1,4 @@ -//===- AArch64.td - Describe the AArch64 Target Machine -------*- tblgen -*-==// +//=- AArch64.td - Describe the AArch64 Target Machine --------*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -7,12 +7,11 @@ // //===----------------------------------------------------------------------===// // -// This is the top level entry point for the AArch64 target. // //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Target-independent interfaces +// Target-independent interfaces which we are implementing //===----------------------------------------------------------------------===// include "llvm/Target/Target.td" @@ -22,7 +21,7 @@ include "llvm/Target/Target.td" // def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", - "Enable ARMv8 FP">; + "Enable ARMv8 FP">; def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", "Enable Advanced SIMD instructions", [FeatureFPARMv8]>; @@ -30,54 +29,106 @@ def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", "Enable cryptographic instructions">; +def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", + "Enable ARMv8 CRC-32 checksum instructions">; + +/// Cyclone has register move instructions which are "free". +def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", + "Has zero-cycle register moves">; + +/// Cyclone has instructions which zero registers for "free". +def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", + "Has zero-cycle zeroing instructions">; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "AArch64RegisterInfo.td" +include "AArch64CallingConvention.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions //===----------------------------------------------------------------------===// -// AArch64 Processors -// include "AArch64Schedule.td" +include "AArch64InstrInfo.td" -class ProcNoItin Features> - : Processor; +def AArch64InstrInfo : InstrInfo; -def : Processor<"generic", GenericItineraries, [FeatureFPARMv8, FeatureNEON]>; +//===----------------------------------------------------------------------===// +// AArch64 Processors supported. 
+// +include "AArch64SchedA53.td" +include "AArch64SchedCyclone.td" def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [FeatureFPARMv8, FeatureNEON, - FeatureCrypto]>; + FeatureCrypto, + FeatureCRC]>; def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", "Cortex-A57 ARM processors", [FeatureFPARMv8, FeatureNEON, - FeatureCrypto]>; + FeatureCrypto, + FeatureCRC]>; + +def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", + "Cyclone", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeatureZCRegMove, FeatureZCZeroing]>; + +def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, + FeatureNEON, + FeatureCRC]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; -def : Processor<"cortex-a57", NoItineraries, [ProcA57]>; +def : ProcessorModel<"cortex-a57", NoSchedModel, [ProcA57]>; +def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; //===----------------------------------------------------------------------===// -// Register File Description +// Assembly parser //===----------------------------------------------------------------------===// -include "AArch64RegisterInfo.td" +def GenericAsmParserVariant : AsmParserVariant { + int Variant = 0; + string Name = "generic"; +} -include "AArch64CallingConv.td" +def AppleAsmParserVariant : AsmParserVariant { + int Variant = 1; + string Name = "apple-neon"; +} //===----------------------------------------------------------------------===// -// Instruction Descriptions +// Assembly printer //===----------------------------------------------------------------------===// +// AArch64 Uses the MC printer for asm output, so make sure the TableGen +// AsmWriter bits get associated with the correct class. +def GenericAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + int Variant = 0; + bit isMCAsmWriter = 1; +} -include "AArch64InstrInfo.td" - -def AArch64InstrInfo : InstrInfo { - let noNamedPositionallyEncodedOperands = 1; +def AppleAsmWriter : AsmWriter { + let AsmWriterClassName = "AppleInstPrinter"; + int Variant = 1; + int isMCAsmWriter = 1; } //===----------------------------------------------------------------------===// -// Declare the target which we are implementing +// Target Declaration //===----------------------------------------------------------------------===// def AArch64 : Target { let InstructionSet = AArch64InstrInfo; + let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant]; + let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter]; } diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp new file mode 100644 index 0000000..04906f6 --- /dev/null +++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -0,0 +1,492 @@ +//===-- AArch64AddressTypePromotion.cpp --- Promote type for addr accesses -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass tries to promote the computations use to obtained a sign extended +// value used into memory accesses. +// E.g. 
+// a = add nsw i32 b, 3 +// d = sext i32 a to i64 +// e = getelementptr ..., i64 d +// +// => +// f = sext i32 b to i64 +// a = add nsw i64 f, 3 +// e = getelementptr ..., i64 a +// +// This is legal to do if the computations are marked with either nsw or nuw +// markers. +// Moreover, the current heuristic is simple: it does not create new sext +// operations, i.e., it gives up when a sext would have forked (e.g., if +// a = add i32 b, c, two sexts are required to promote the computation). +// +// FIXME: This pass may be useful for other targets too. +// ===---------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-type-promotion" + +static cl::opt +EnableAddressTypePromotion("aarch64-type-promotion", cl::Hidden, + cl::desc("Enable the type promotion pass"), + cl::init(true)); +static cl::opt +EnableMerge("aarch64-type-promotion-merge", cl::Hidden, + cl::desc("Enable merging of redundant sexts when one is dominating" + " the other."), + cl::init(true)); + +//===----------------------------------------------------------------------===// +// AArch64AddressTypePromotion +//===----------------------------------------------------------------------===// + +namespace llvm { +void initializeAArch64AddressTypePromotionPass(PassRegistry &); +} + +namespace { +class AArch64AddressTypePromotion : public FunctionPass { + +public: + static char ID; + AArch64AddressTypePromotion() + : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) { + initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "AArch64 Address Type Promotion"; + } + + /// Iterate over the functions and promote the computation of interesting + /// sext instructions. + bool runOnFunction(Function &F) override; + +private: + /// The current function. + Function *Func; + /// Filter out all sexts that do not have this type. + /// Currently initialized with Int64Ty. + Type *ConsideredSExtType; + + // This transformation requires dominator info. + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + FunctionPass::getAnalysisUsage(AU); + } + + typedef SmallPtrSet SetOfInstructions; + typedef SmallVector Instructions; + typedef DenseMap ValueToInsts; + + /// Check if it is profitable to move a sext through this instruction. + /// Currently, we consider it profitable if: + /// - Inst is used only once (no need to insert truncate). + /// - Inst has only one operand that will require a sext operation (we do + /// not create new sext operations). + bool shouldGetThrough(const Instruction *Inst); + + /// Check if it is possible and legal to move a sext through this + /// instruction. + /// Current heuristic considers that we can get through: + /// - Arithmetic operation marked with the nsw or nuw flag. + /// - Other sext operation. + /// - Truncate operation if it was just dropping sign extended bits.
+ bool canGetThrough(const Instruction *Inst); + + /// Move sext operations through safe-to-sext instructions. + bool propagateSignExtension(Instructions &SExtInsts); + + /// Check if this sext should be considered for code motion. + /// We look for sexts with ConsideredSExtType and uses in at least one + /// GetElementPtrInst. + bool shouldConsiderSExt(const Instruction *SExt) const; + + /// Collect all interesting sext operations, i.e., the ones with the right + /// type and used in memory accesses. + /// More precisely, a sext instruction is considered interesting if it + /// is used in a "complex" getelementptr or if there exists at least one + /// other sext instruction that sign extended the same initial value. + /// A getelementptr is considered "complex" if it has more than 2 + /// operands. + void analyzeSExtension(Instructions &SExtInsts); + + /// Merge redundant sign extension operations in a common dominator. + void mergeSExts(ValueToInsts &ValToSExtendedUses, + SetOfInstructions &ToRemove); +}; +} // end anonymous namespace. + +char AArch64AddressTypePromotion::ID = 0; + +INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion", + "AArch64 Type Promotion Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion", + "AArch64 Type Promotion Pass", false, false) + +FunctionPass *llvm::createAArch64AddressTypePromotionPass() { + return new AArch64AddressTypePromotion(); +} + +bool AArch64AddressTypePromotion::canGetThrough(const Instruction *Inst) { + if (isa(Inst)) + return true; + + const BinaryOperator *BinOp = dyn_cast(Inst); + if (BinOp && isa(BinOp) && + (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap())) + return true; + + // sext(trunc(sext)) --> sext + if (isa(Inst) && isa(Inst->getOperand(0))) { + const Instruction *Opnd = cast(Inst->getOperand(0)); + // Check that the truncate just drops sign extended bits. + if (Inst->getType()->getIntegerBitWidth() >= + Opnd->getOperand(0)->getType()->getIntegerBitWidth() && + Inst->getOperand(0)->getType()->getIntegerBitWidth() <= + ConsideredSExtType->getIntegerBitWidth()) + return true; + } + + return false; +} + +bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) { + // If the type of the sext is the same as the considered one, this sext + // will become useless. + // Otherwise, we will have to do something to preserve the original value, + // unless it is used once. + if (isa(Inst) && + (Inst->getType() == ConsideredSExtType || Inst->hasOneUse())) + return true; + + // If the Inst is used more than once, we may need to insert truncate + // operations and we don't do that at the moment. + if (!Inst->hasOneUse()) + return false; + + // This truncate is used only once, thus if we can get through, it will become + // useless. + if (isa(Inst)) + return true; + + // If both operands are not constant, a new sext will be created here. + // Current heuristic is: each step should be profitable. + // Therefore we don't allow increasing the number of sexts even if it may + // be profitable later on.
+ if (isa(Inst) && isa(Inst->getOperand(1))) + return true; + + return false; +} + +static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) { + if (isa(Inst) && OpIdx == 0) + return false; + return true; +} + +bool +AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const { + if (SExt->getType() != ConsideredSExtType) + return false; + + for (const Use &U : SExt->uses()) { + if (isa(*U)) + return true; + } + + return false; +} + +// Input: +// - SExtInsts contains all the sext instructions that are used directly in +// GetElementPtrInst, i.e., access to memory. +// Algorithm: +// - For each sext operation in SExtInsts: +// Let var be the operand of sext. +// while it is profitable (see shouldGetThrough), legal, and safe +// (see canGetThrough) to move sext through var's definition: +// * promote the type of var's definition. +// * fold var into sext uses. +// * move sext above var's definition. +// * update sext operand to use the operand of var that should be sign +// extended (by construction there is only one). +// +// E.g., +// a = ... i32 c, 3 +// b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a' +// ... +// = b +// => Yes, update the code +// b = sext i32 c to i64 +// a = ... i64 b, 3 +// ... +// = a +// Iterate on 'c'. +bool +AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { + DEBUG(dbgs() << "*** Propagate Sign Extension ***\n"); + + bool LocalChange = false; + SetOfInstructions ToRemove; + ValueToInsts ValToSExtendedUses; + while (!SExtInsts.empty()) { + // Get through simple chain. + Instruction *SExt = SExtInsts.pop_back_val(); + + DEBUG(dbgs() << "Consider:\n" << *SExt << '\n'); + + // If this SExt has already been merged, continue. + if (SExt->use_empty() && ToRemove.count(SExt)) { + DEBUG(dbgs() << "No uses => marked as delete\n"); + continue; + } + + // Now try to get through the chain of definitions. + while (isa(SExt->getOperand(0))) { + Instruction *Inst = dyn_cast(SExt->getOperand(0)); + DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n'); + if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) { + // We cannot get through something that is not an Instruction + // or not safe to SExt. + DEBUG(dbgs() << "Cannot get through\n"); + break; + } + + LocalChange = true; + // If this is a sign extend, it becomes useless. + if (isa(Inst) || isa(Inst)) { + DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n"); + // We cannot use replaceAllUsesWith here because we may trigger some + // assertion on the type as all involved sext operations may not have + // been moved yet. + while (!Inst->use_empty()) { + Value::use_iterator UseIt = Inst->use_begin(); + Instruction *UseInst = dyn_cast(*UseIt); + assert(UseInst && "Use of sext is not an Instruction!"); + UseInst->setOperand(UseIt->getOperandNo(), SExt); + } + ToRemove.insert(Inst); + SExt->setOperand(0, Inst->getOperand(0)); + SExt->moveBefore(Inst); + continue; + } + + // Get through the Instruction: + // 1. Update its type. + // 2. Replace the uses of SExt by Inst. + // 3. Sign extend each operand that needs to be sign extended. + + // Step #1. + Inst->mutateType(SExt->getType()); + // Step #2. + SExt->replaceAllUsesWith(Inst); + // Step #3.
+      Instruction *SExtForOpnd = SExt;
+
+      DEBUG(dbgs() << "Propagate SExt to operands\n");
+      for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx;
+           ++OpIdx) {
+        DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n');
+        if (Inst->getOperand(OpIdx)->getType() == SExt->getType() ||
+            !shouldSExtOperand(Inst, OpIdx)) {
+          DEBUG(dbgs() << "No need to propagate\n");
+          continue;
+        }
+        // Check if we can statically sign extend the operand.
+        Value *Opnd = Inst->getOperand(OpIdx);
+        if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
+          DEBUG(dbgs() << "Statically sign extend\n");
+          Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(),
+                                                         Cst->getSExtValue()));
+          continue;
+        }
+        // UndefValues are typed, so we have to statically sign extend them.
+        if (isa<UndefValue>(Opnd)) {
+          DEBUG(dbgs() << "Statically sign extend\n");
+          Inst->setOperand(OpIdx, UndefValue::get(SExt->getType()));
+          continue;
+        }
+
+        // Otherwise we have to explicitly sign extend it.
+        assert(SExtForOpnd &&
+               "Only one operand should have been sign extended");
+
+        SExtForOpnd->setOperand(0, Opnd);
+
+        DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n");
+        // Move the sign extension before the insertion point.
+        SExtForOpnd->moveBefore(Inst);
+        Inst->setOperand(OpIdx, SExtForOpnd);
+        // If more sexts are required, new instructions will have to be
+        // created.
+        SExtForOpnd = nullptr;
+      }
+      if (SExtForOpnd == SExt) {
+        DEBUG(dbgs() << "Sign extension is useless now\n");
+        ToRemove.insert(SExt);
+        break;
+      }
+    }
+
+    // If the sext is already of the right type, connect its uses to its
+    // argument and delete it.
+    // This can happen for an Instruction all of whose uses are sign extended.
+    if (!ToRemove.count(SExt) &&
+        SExt->getType() == SExt->getOperand(0)->getType()) {
+      DEBUG(dbgs() << "Sign extension is useless, attach its use to "
+                      "its argument\n");
+      SExt->replaceAllUsesWith(SExt->getOperand(0));
+      ToRemove.insert(SExt);
+    } else
+      ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt);
+  }
+
+  if (EnableMerge)
+    mergeSExts(ValToSExtendedUses, ToRemove);
+
+  // Remove all instructions marked as ToRemove.
+  for (Instruction *I : ToRemove)
+    I->eraseFromParent();
+  return LocalChange;
+}
+
+void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses,
+                                             SetOfInstructions &ToRemove) {
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  for (auto &Entry : ValToSExtendedUses) {
+    Instructions &Insts = Entry.second;
+    Instructions CurPts;
+    for (Instruction *Inst : Insts) {
+      if (ToRemove.count(Inst))
+        continue;
+      bool inserted = false;
+      for (auto &Pt : CurPts) {
+        if (DT.dominates(Inst, Pt)) {
+          DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n"
+                       << *Inst << '\n');
+          Pt->replaceAllUsesWith(Inst);
+          ToRemove.insert(Pt);
+          Pt = Inst;
+          inserted = true;
+          break;
+        }
+        if (!DT.dominates(Pt, Inst))
+          // Give up if we would need to merge in a common dominator, as the
+          // experiments show it is not profitable.
+          continue;
+
+        DEBUG(dbgs() << "Replace all uses of:\n" << *Inst << "\nwith:\n"
+                     << *Pt << '\n');
+        Inst->replaceAllUsesWith(Pt);
+        ToRemove.insert(Inst);
+        inserted = true;
+        break;
+      }
+      if (!inserted)
+        CurPts.push_back(Inst);
+    }
+  }
+}
+
+void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
+  DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n");
+
+  DenseMap<Value *, Instruction *> SeenChains;
+
+  for (auto &BB : *Func) {
+    for (auto &II : BB) {
+      Instruction *SExt = &II;
+
+      // Collect all sext operations per type.
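+      // With ConsideredSExtType == i64 (set in runOnFunction), this keeps
+      // only instructions of the form "sext iN %v to i64" whose result
+      // feeds a getelementptr; everything else is skipped here.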
+      if (!isa<SExtInst>(SExt) || !shouldConsiderSExt(SExt))
+        continue;
+
+      DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n');
+
+      // Cases where we actually perform the optimization:
+      // 1. SExt is used in a getelementptr with more than 2 operands =>
+      //    likely we can merge some computation if it is done on 64 bits.
+      // 2. The beginning of the SExt chain is sign extended several times =>
+      //    code sharing is possible.
+
+      bool insert = false;
+      // #1.
+      for (const Use &U : SExt->uses()) {
+        const Instruction *Inst = dyn_cast<GetElementPtrInst>(U);
+        if (Inst && Inst->getNumOperands() > 2) {
+          DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst
+                       << '\n');
+          insert = true;
+          break;
+        }
+      }
+
+      // #2.
+      // Check the head of the chain.
+      Instruction *Inst = SExt;
+      Value *Last;
+      do {
+        int OpdIdx = 0;
+        const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
+        if (BinOp && isa<ConstantInt>(BinOp->getOperand(0)))
+          OpdIdx = 1;
+        Last = Inst->getOperand(OpdIdx);
+        Inst = dyn_cast<Instruction>(Last);
+      } while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst));
+
+      DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n');
+      DenseMap<Value *, Instruction *>::iterator AlreadySeen =
+          SeenChains.find(Last);
+      if (insert || AlreadySeen != SeenChains.end()) {
+        DEBUG(dbgs() << "Insert\n");
+        SExtInsts.push_back(SExt);
+        if (AlreadySeen != SeenChains.end() && AlreadySeen->second != nullptr) {
+          DEBUG(dbgs() << "Insert chain member\n");
+          SExtInsts.push_back(AlreadySeen->second);
+          SeenChains[Last] = nullptr;
+        }
+      } else {
+        DEBUG(dbgs() << "Record its chain membership\n");
+        SeenChains[Last] = SExt;
+      }
+    }
+  }
+}
+
+bool AArch64AddressTypePromotion::runOnFunction(Function &F) {
+  if (!EnableAddressTypePromotion || F.isDeclaration())
+    return false;
+  Func = &F;
+  ConsideredSExtType = Type::getInt64Ty(Func->getContext());
+
+  DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n');
+
+  Instructions SExtInsts;
+  analyzeSExtension(SExtInsts);
+  return propagateSignExtension(SExtInsts);
+}
diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
new file mode 100644
index 0000000..734fb21
--- /dev/null
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -0,0 +1,387 @@
+//===-- AArch64AdvSIMDScalarPass.cpp - AdvSIMD scalar operation opt ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// When profitable, replace GPR targeting i64 instructions with their
+// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
+// as minimizing the number of cross-class register copies.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TODO: Graph based predicate heuristics.
+// Walking the instruction list linearly will get many, perhaps most, of
+// the cases, but to do a truly thorough job of this, we need a more
+// holistic approach.
+//
+// This optimization is very similar in spirit to the register allocator's
+// spill placement, only here we're determining where to place cross-class
+// register copies rather than spills. As such, a similar approach is
+// called for.
+//
+// We want to build up a set of graphs of all instructions which are candidates
+// for transformation along with instructions which generate their inputs and
+// consume their outputs. For each edge in the graph, we assign a weight
+// based on whether there is a copy required there (weight zero if not) and
+// the block frequency of the block containing the defining or using
+// instruction, whichever is less. Our optimization is then a graph problem
+// to minimize the total weight of all the graphs, then transform instructions
+// and add or remove copy instructions as called for to implement the
+// solution.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-simd-scalar"
+
+// Allow forcing all i64 operations with equivalent SIMD instructions to use
+// them. For stress-testing the transformation function.
+static cl::opt<bool>
+TransformAll("aarch64-simd-scalar-force-all",
+             cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
+             cl::init(false), cl::Hidden);
+
+STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
+STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
+STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");
+
+namespace {
+class AArch64AdvSIMDScalar : public MachineFunctionPass {
+  MachineRegisterInfo *MRI;
+  const AArch64InstrInfo *TII;
+
+private:
+  // isProfitableToTransform - Predicate function to determine whether an
+  // instruction should be transformed to its equivalent AdvSIMD scalar
+  // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
+  bool isProfitableToTransform(const MachineInstr *MI) const;
+
+  // transformInstruction - Perform the transformation of an instruction
+  // to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
+  // to be the correct register class, minimizing cross-class copies.
+  void transformInstruction(MachineInstr *MI);
+
+  // processMachineBasicBlock - Main optimization loop.
+  bool processMachineBasicBlock(MachineBasicBlock *MBB);
+
+public:
+  static char ID; // Pass identification, replacement for typeid.
+ explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &F) override; + + const char *getPassName() const override { + return "AdvSIMD Scalar Operation Optimization"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +char AArch64AdvSIMDScalar::ID = 0; +} // end anonymous namespace + +static bool isGPR64(unsigned Reg, unsigned SubReg, + const MachineRegisterInfo *MRI) { + if (SubReg) + return false; + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass); + return AArch64::GPR64RegClass.contains(Reg); +} + +static bool isFPR64(unsigned Reg, unsigned SubReg, + const MachineRegisterInfo *MRI) { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) && + SubReg == 0) || + (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) && + SubReg == AArch64::dsub); + // Physical register references just check the register class directly. + return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) || + (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub); +} + +// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64 +// copy instruction. Return zero_reg if the instruction is not a copy. +static unsigned getSrcFromCopy(const MachineInstr *MI, + const MachineRegisterInfo *MRI, + unsigned &SubReg) { + SubReg = 0; + // The "FMOV Xd, Dn" instruction is the typical form. + if (MI->getOpcode() == AArch64::FMOVDXr || + MI->getOpcode() == AArch64::FMOVXDr) + return MI->getOperand(1).getReg(); + // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see + // these at this stage, but it's easy to check for. + if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) { + SubReg = AArch64::dsub; + return MI->getOperand(1).getReg(); + } + // Or just a plain COPY instruction. This can be directly to/from FPR64, + // or it can be a dsub subreg reference to an FPR128. + if (MI->getOpcode() == AArch64::COPY) { + if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), + MRI) && + isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) + return MI->getOperand(1).getReg(); + if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), + MRI) && + isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), + MRI)) { + SubReg = MI->getOperand(1).getSubReg(); + return MI->getOperand(1).getReg(); + } + } + + // Otherwise, this is some other kind of instruction. + return 0; +} + +// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent +// that we're considering transforming to, return that AdvSIMD opcode. For all +// others, return the original opcode. +static int getTransformOpcode(unsigned Opc) { + switch (Opc) { + default: + break; + // FIXME: Lots more possibilities. + case AArch64::ADDXrr: + return AArch64::ADDv1i64; + case AArch64::SUBXrr: + return AArch64::SUBv1i64; + } + // No AdvSIMD equivalent, so just return the original opcode. + return Opc; +} + +static bool isTransformable(const MachineInstr *MI) { + int Opc = MI->getOpcode(); + return Opc != getTransformOpcode(Opc); +} + +// isProfitableToTransform - Predicate function to determine whether an +// instruction should be transformed to its equivalent AdvSIMD scalar +// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. 
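+// For example (hypothetical scenario, not from a test case): if both inputs
+// of an ADDXrr arrive through FMOVs from FPR64 registers and its only user is
+// another FMOV back to FPR64, every copy a transform would normally require
+// is subsumed by an existing one, so NumNewCopies drops to zero below while
+// NumRemovableCopies grows, and the transformation is taken.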
+bool
+AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
+  // If this instruction isn't eligible to be transformed (no SIMD equivalent),
+  // early exit since that's the common case.
+  if (!isTransformable(MI))
+    return false;
+
+  // Count the number of copies we'll need to add and approximate the number
+  // of copies that a transform will enable us to remove.
+  unsigned NumNewCopies = 3;
+  unsigned NumRemovableCopies = 0;
+
+  unsigned OrigSrc0 = MI->getOperand(1).getReg();
+  unsigned OrigSrc1 = MI->getOperand(2).getReg();
+  unsigned Src0 = 0, SubReg0;
+  unsigned Src1 = 0, SubReg1;
+  if (!MRI->def_empty(OrigSrc0)) {
+    MachineRegisterInfo::def_instr_iterator Def =
+        MRI->def_instr_begin(OrigSrc0);
+    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+    Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+    // If the source was from a copy, we don't need to insert a new copy.
+    if (Src0)
+      --NumNewCopies;
+    // If there are no other users of the original source, we can delete
+    // that instruction.
+    if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0))
+      ++NumRemovableCopies;
+  }
+  if (!MRI->def_empty(OrigSrc1)) {
+    MachineRegisterInfo::def_instr_iterator Def =
+        MRI->def_instr_begin(OrigSrc1);
+    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+    Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+    if (Src1)
+      --NumNewCopies;
+    // If there are no other users of the original source, we can delete
+    // that instruction.
+    if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1))
+      ++NumRemovableCopies;
+  }
+
+  // If any of the uses of the original instruction is a cross-class copy,
+  // that's a copy that will be removable if we transform. Likewise, if
+  // any of the uses is a transformable instruction, it's likely the transforms
+  // will chain, enabling us to save a copy there, too. This is an aggressive
+  // heuristic that approximates the graph based cost analysis described above.
+  unsigned Dst = MI->getOperand(0).getReg();
+  bool AllUsesAreCopies = true;
+  for (MachineRegisterInfo::use_instr_nodbg_iterator
+           Use = MRI->use_instr_nodbg_begin(Dst),
+           E = MRI->use_instr_nodbg_end();
+       Use != E; ++Use) {
+    unsigned SubReg;
+    if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use))
+      ++NumRemovableCopies;
+    // If the use is an INSERT_SUBREG, that's still something that can
+    // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
+    // preferable to have it use the FPR64 in most cases, as if the source
+    // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
+    // Ditto for a lane insert.
+    else if (Use->getOpcode() == AArch64::INSERT_SUBREG ||
+             Use->getOpcode() == AArch64::INSvi64gpr)
+      ;
+    else
+      AllUsesAreCopies = false;
+  }
+  // If all of the uses of the original destination register are copies to
+  // FPR64, then we won't end up having a new copy back to GPR64 either.
+  if (AllUsesAreCopies)
+    --NumNewCopies;
+
+  // If a transform will not increase the number of cross-class copies
+  // required, return true.
+  if (NumNewCopies <= NumRemovableCopies)
+    return true;
+
+  // Finally, even if we otherwise wouldn't transform, check if we're forcing
+  // transformation of everything.
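+  // (TransformAll is the cl::opt declared above; presumably it can be set
+  // for stress testing via "llc -aarch64-simd-scalar-force-all".)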
+  return TransformAll;
+}
+
+static MachineInstr *insertCopy(const AArch64InstrInfo *TII, MachineInstr *MI,
+                                unsigned Dst, unsigned Src, bool IsKill) {
+  MachineInstrBuilder MIB =
+      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY),
+              Dst)
+          .addReg(Src, getKillRegState(IsKill));
+  DEBUG(dbgs() << " adding copy: " << *MIB);
+  ++NumCopiesInserted;
+  return MIB;
+}
+
+// transformInstruction - Perform the transformation of an instruction
+// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
+// to be the correct register class, minimizing cross-class copies.
+void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
+  DEBUG(dbgs() << "Scalar transform: " << *MI);
+
+  MachineBasicBlock *MBB = MI->getParent();
+  int OldOpc = MI->getOpcode();
+  int NewOpc = getTransformOpcode(OldOpc);
+  assert(OldOpc != NewOpc && "transform an instruction to itself?!");
+
+  // Check if we need a copy for the source registers.
+  unsigned OrigSrc0 = MI->getOperand(1).getReg();
+  unsigned OrigSrc1 = MI->getOperand(2).getReg();
+  unsigned Src0 = 0, SubReg0;
+  unsigned Src1 = 0, SubReg1;
+  if (!MRI->def_empty(OrigSrc0)) {
+    MachineRegisterInfo::def_instr_iterator Def =
+        MRI->def_instr_begin(OrigSrc0);
+    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+    Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
+    // If there are no other users of the original source, we can delete
+    // that instruction.
+    if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) {
+      assert(Src0 && "Can't delete copy w/o a valid original source!");
+      Def->eraseFromParent();
+      ++NumCopiesDeleted;
+    }
+  }
+  if (!MRI->def_empty(OrigSrc1)) {
+    MachineRegisterInfo::def_instr_iterator Def =
+        MRI->def_instr_begin(OrigSrc1);
+    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
+    Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
+    // If there are no other users of the original source, we can delete
+    // that instruction.
+    if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) {
+      assert(Src1 && "Can't delete copy w/o a valid original source!");
+      Def->eraseFromParent();
+      ++NumCopiesDeleted;
+    }
+  }
+  // If we weren't able to reference the original source directly, create a
+  // copy.
+  if (!Src0) {
+    SubReg0 = 0;
+    Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+    insertCopy(TII, MI, Src0, OrigSrc0, true);
+  }
+  if (!Src1) {
+    SubReg1 = 0;
+    Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+    insertCopy(TII, MI, Src1, OrigSrc1, true);
+  }
+
+  // Create a vreg for the destination.
+  // FIXME: No need to do this if the ultimate user expects an FPR64.
+  // Check for that and avoid the copy if possible.
+  unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
+
+  // For now, all of the new instructions have the same simple three-register
+  // form, so no need to special case based on what instruction we're
+  // building.
+  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst)
+      .addReg(Src0, getKillRegState(true), SubReg0)
+      .addReg(Src1, getKillRegState(true), SubReg1);
+
+  // Now copy the result back out to a GPR.
+  // FIXME: Try to avoid this if all uses could actually just use the FPR64
+  // directly.
+  insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true);
+
+  // Erase the old instruction.
+  MI->eraseFromParent();
+
+  ++NumScalarInsnsUsed;
+}
+
+// processMachineBasicBlock - Main optimization loop.
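+// Note for the loop below: the iterator is advanced before the current
+// instruction is inspected, because transformInstruction() erases MI and
+// would otherwise invalidate the iterator.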
+bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
+  bool Changed = false;
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
+    MachineInstr *MI = I;
+    ++I;
+    if (isProfitableToTransform(MI)) {
+      transformInstruction(MI);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+// runOnMachineFunction - Pass entry point from PassManager.
+bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
+  bool Changed = false;
+  DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
+
+  const TargetMachine &TM = mf.getTarget();
+  MRI = &mf.getRegInfo();
+  TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+
+  // Just check things on a one-block-at-a-time basis.
+  for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
+    if (processMachineBasicBlock(I))
+      Changed = true;
+  return Changed;
+}
+
+// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine
+// to add the pass to the PassManager.
+FunctionPass *llvm::createAArch64AdvSIMDScalar() {
+  return new AArch64AdvSIMDScalar();
+}
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index f0b52d3..c3ee9bb 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64AsmPrinter.cpp - Print machine code to an AArch64 .s file --===//
+//===-- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -8,236 +8,337 @@
 //===----------------------------------------------------------------------===//
 //
 // This file contains a printer that converts from our internal representation
-// of machine-dependent LLVM code to GAS-format AArch64 assembly language.
+// of machine-dependent LLVM code to the AArch64 assembly language.
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
-#include "AArch64AsmPrinter.h"
+#include "AArch64.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64MCInstLower.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
 #include "InstPrinter/AArch64InstPrinter.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/Mangler.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/TargetRegistry.h"
-
 using namespace llvm;
 
-/// Try to print a floating-point register as if it belonged to a specified
-/// register-class. For example the inline asm operand modifier "b" requires its
-/// argument to be printed as "bN".
-static bool printModifiedFPRAsmOperand(const MachineOperand &MO,
-                                       const TargetRegisterInfo *TRI,
-                                       char RegType, raw_ostream &O) {
-  if (!MO.isReg())
-    return true;
-
-  for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) {
-    if (AArch64::FPR8RegClass.contains(*AR)) {
-      O << RegType << TRI->getEncodingValue(MO.getReg());
-      return false;
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+
+class AArch64AsmPrinter : public AsmPrinter {
+  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+  /// make the right decision when printing asm code for different targets.
+  const AArch64Subtarget *Subtarget;
+
+  AArch64MCInstLower MCInstLowering;
+  StackMaps SM;
+
+public:
+  AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer),
+        Subtarget(&TM.getSubtarget<AArch64Subtarget>()),
+        MCInstLowering(OutContext, *Mang, *this), SM(*this), AArch64FI(nullptr),
+        LOHLabelCounter(0) {}
+
+  const char *getPassName() const override {
+    return "AArch64 Assembly Printer";
+  }
+
+  /// \brief Wrapper for MCInstLowering.lowerOperand() for the
+  /// tblgen'erated pseudo lowering.
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
+    return MCInstLowering.lowerOperand(MO, MCOp);
+  }
+
+  void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+                     const MachineInstr &MI);
+  void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+                       const MachineInstr &MI);
+  /// \brief tblgen'erated driver function for lowering simple MI->MC
+  /// pseudo instructions.
+  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+                                   const MachineInstr *MI);
+
+  void EmitInstruction(const MachineInstr *MI) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AsmPrinter::getAnalysisUsage(AU);
+    AU.setPreservesAll();
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override {
+    AArch64FI = F.getInfo<AArch64FunctionInfo>();
+    return AsmPrinter::runOnMachineFunction(F);
+  }
+
+private:
+  MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
+  void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
+  bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
+  bool printAsmRegInClass(const MachineOperand &MO,
+                          const TargetRegisterClass *RC, bool isVector,
+                          raw_ostream &O);
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &O) override;
+
+  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+
+  void EmitFunctionBodyEnd() override;
+
+  MCSymbol *GetCPISymbol(unsigned CPID) const override;
+  void EmitEndOfAsmFile(Module &M) override;
+  AArch64FunctionInfo *AArch64FI;
+
+  /// \brief Emit the LOHs contained in AArch64FI.
+  void EmitLOHs();
+
+  typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol;
+  MInstToMCSymbol LOHInstToLabel;
+  unsigned LOHLabelCounter;
+};
+
+} // end of anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
+  if (Subtarget->isTargetMachO()) {
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the
+    // obvious implementation of multiple entry points). If this doesn't
+    // occur, the linker can safely perform dead code stripping. Since LLVM
+    // never generates code that does this, it is always safe to set.
+    OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+    SM.serializeToStackMapSection();
+  }
+
+  // Emit a .data.rel section containing any stubs that were created.
+  if (Subtarget->isTargetELF()) {
+    const TargetLoweringObjectFileELF &TLOFELF =
+        static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+
+    MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+    // Output stubs for external and common global variables.
+    MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+    if (!Stubs.empty()) {
+      OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
+      const DataLayout *TD = TM.getDataLayout();
+
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        OutStreamer.EmitLabel(Stubs[i].first);
+        OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
+                                    TD->getPointerSize(0));
+      }
+      Stubs.clear();
     }
   }
-  // The register doesn't correspond to anything floating-point like.
-  return true;
 }
 
-/// Implements the 'w' and 'x' inline asm operand modifiers, which print a GPR
-/// with the obvious type and an immediate 0 as either wzr or xzr.
-static bool printModifiedGPRAsmOperand(const MachineOperand &MO,
-                                       const TargetRegisterInfo *TRI,
-                                       const TargetRegisterClass &RegClass,
-                                       raw_ostream &O) {
-  char Prefix = &RegClass == &AArch64::GPR32RegClass ? 'w' : 'x';
+MachineLocation
+AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
+  MachineLocation Location;
+  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
+  // Frame address. Currently handles register +- offset only.
+  if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
+    Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
+  else {
+    DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
+  }
+  return Location;
+}
 
-  if (MO.isImm() && MO.getImm() == 0) {
-    O << Prefix << "zr";
-    return false;
-  } else if (MO.isReg()) {
-    if (MO.getReg() == AArch64::XSP || MO.getReg() == AArch64::WSP) {
-      O << (Prefix == 'x' ? "sp" : "wsp");
-      return false;
-    }
+void AArch64AsmPrinter::EmitLOHs() {
+  SmallVector<MCSymbol *, 3> MCArgs;
 
-    for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) {
-      if (RegClass.contains(*AR)) {
-        O << AArch64InstPrinter::getRegisterName(*AR);
-        return false;
-      }
+  for (const auto &D : AArch64FI->getLOHContainer()) {
+    for (const MachineInstr *MI : D.getArgs()) {
+      MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI);
+      assert(LabelIt != LOHInstToLabel.end() &&
+             "Label hasn't been inserted for LOH related instruction");
+      MCArgs.push_back(LabelIt->second);
     }
+    OutStreamer.EmitLOHDirective(D.getKind(), MCArgs);
+    MCArgs.clear();
   }
 }
 
-  return true;
+void AArch64AsmPrinter::EmitFunctionBodyEnd() {
+  if (!AArch64FI->getLOHRelated().empty())
+    EmitLOHs();
 }
 
-bool AArch64AsmPrinter::printSymbolicAddress(const MachineOperand &MO,
-                                             bool PrintImmediatePrefix,
-                                             StringRef Suffix, raw_ostream &O) {
-  StringRef Name;
-  StringRef Modifier;
+/// GetCPISymbol - Return the symbol for the specified constant pool entry.
+MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const {
+  // Darwin uses a linker-private symbol name for constant-pools (to
+  // avoid addends on the relocation?), ELF has no such concept and
+  // uses a normal private symbol.
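+  // (Concretely, entry 2 of function 3 would be named along the lines of
+  // "lCPI3_2" on MachO versus ".LCPI3_2" on ELF, assuming the usual "l" and
+  // ".L" private prefixes for those formats.)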
+  if (getDataLayout().getLinkerPrivateGlobalPrefix()[0])
+    return OutContext.GetOrCreateSymbol(
+        Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" +
+        Twine(getFunctionNumber()) + "_" + Twine(CPID));
+
+  return OutContext.GetOrCreateSymbol(
+      Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" +
+      Twine(getFunctionNumber()) + "_" + Twine(CPID));
+}
+
+void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
+                                     raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
   switch (MO.getType()) {
   default:
-    return true;
-  case MachineOperand::MO_GlobalAddress:
-    Name = getSymbol(MO.getGlobal())->getName();
-
-    // Global variables may be accessed either via a GOT or in various fun and
-    // interesting TLS-model specific ways. Set the prefix modifier as
-    // appropriate here.
-    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(MO.getGlobal())) {
-      Reloc::Model RelocM = TM.getRelocationModel();
-      if (GV->isThreadLocal()) {
-        switch (TM.getTLSModel(GV)) {
-        case TLSModel::GeneralDynamic:
-          Modifier = "tlsdesc";
-          break;
-        case TLSModel::LocalDynamic:
-          Modifier = "dtprel";
-          break;
-        case TLSModel::InitialExec:
-          Modifier = "gottprel";
-          break;
-        case TLSModel::LocalExec:
-          Modifier = "tprel";
-          break;
-        }
-      } else if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) {
-        Modifier = "got";
-      }
-    }
+    assert(0 && "<unknown operand type>");
+  case MachineOperand::MO_Register: {
+    unsigned Reg = MO.getReg();
+    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    O << AArch64InstPrinter::getRegisterName(Reg);
+    break;
+  }
+  case MachineOperand::MO_Immediate: {
+    int64_t Imm = MO.getImm();
+    O << '#' << Imm;
     break;
-  case MachineOperand::MO_BlockAddress:
-    Name = GetBlockAddressSymbol(MO.getBlockAddress())->getName();
+  }
+  }
+}
+
+bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
+                                          raw_ostream &O) {
+  unsigned Reg = MO.getReg();
+  switch (Mode) {
+  default:
+    return true; // Unknown mode.
+  case 'w':
+    Reg = getWRegFromXReg(Reg);
     break;
-  case MachineOperand::MO_ConstantPoolIndex:
-    Name = GetCPISymbol(MO.getIndex())->getName();
+  case 'x':
+    Reg = getXRegFromWReg(Reg);
     break;
   }
 
-  // Some instructions (notably ADRP) don't take the # prefix for
-  // immediates. Only print it if asked to.
-  if (PrintImmediatePrefix)
-    O << '#';
-
-  // Only need the joining "_" if both the prefix and the suffix are
-  // non-null. This little block simply takes care of the four possibly
-  // combinations involved there.
-  if (Modifier == "" && Suffix == "")
-    O << Name;
-  else if (Modifier == "" && Suffix != "")
-    O << ":" << Suffix << ':' << Name;
-  else if (Modifier != "" && Suffix == "")
-    O << ":" << Modifier << ':' << Name;
-  else
-    O << ":" << Modifier << '_' << Suffix << ':' << Name;
+  O << AArch64InstPrinter::getRegisterName(Reg);
+  return false;
+}
+
+// Prints the register in MO using class RC, keeping the register's encoding
+// offset in the new register class. This should not be used for cross-class
+// printing.
+bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
+                                           const TargetRegisterClass *RC,
+                                           bool isVector, raw_ostream &O) {
+  assert(MO.isReg() && "Should only get here with a register!");
+  const AArch64RegisterInfo *RI =
+      static_cast<const AArch64RegisterInfo *>(TM.getRegisterInfo());
+  unsigned Reg = MO.getReg();
+  unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
+  assert(RI->regsOverlap(RegToPrint, Reg));
+  O << AArch64InstPrinter::getRegisterName(
+           RegToPrint, isVector ?
AArch64::vreg : AArch64::NoRegAltName); return false; } bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { - const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); + const MachineOperand &MO = MI->getOperand(OpNum); - if (!ExtraCode) - ExtraCode = ""; + // First try the generic code, which knows about modifiers like 'c' and 'n'. + if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O)) + return false; - switch(ExtraCode[0]) { - default: - if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O)) + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) + return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: + return true; // Unknown modifier. + case 'w': // Print W register + case 'x': // Print X register + if (MO.isReg()) + return printAsmMRegister(MO, ExtraCode[0], O); + if (MO.isImm() && MO.getImm() == 0) { + unsigned Reg = ExtraCode[0] == 'w' ? AArch64::WZR : AArch64::XZR; + O << AArch64InstPrinter::getRegisterName(Reg); return false; - break; - case 'w': - // Output 32-bit general register operand, constant zero as wzr, or stack - // pointer as wsp. Ignored when used with other operand types. - if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::GPR32RegClass, O)) - return false; - break; - case 'x': - // Output 64-bit general register operand, constant zero as xzr, or stack - // pointer as sp. Ignored when used with other operand types. - if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI, - AArch64::GPR64RegClass, O)) - return false; - break; - case 'H': - // Output higher numbered of a 64-bit general register pair - case 'Q': - // Output least significant register of a 64-bit general register pair - case 'R': - // Output most significant register of a 64-bit general register pair - - // FIXME note: these three operand modifiers will require, to some extent, - // adding a paired GPR64 register class. Initial investigation suggests that - // assertions are hit unless it has a type and is made legal for that type - // in ISelLowering. After that step is made, the number of modifications - // needed explodes (operation legality, calling conventions, stores, reg - // copies ...). - llvm_unreachable("FIXME: Unimplemented register pairs"); - case 'b': - case 'h': - case 's': - case 'd': - case 'q': - if (!printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, - ExtraCode[0], O)) - return false; - break; - case 'A': - // Output symbolic address with appropriate relocation modifier (also - // suitable for ADRP). - if (!printSymbolicAddress(MI->getOperand(OpNum), false, "", O)) - return false; - break; - case 'L': - // Output bits 11:0 of symbolic address with appropriate :lo12: relocation - // modifier. - if (!printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O)) + } + printOperand(MI, OpNum, O); return false; - break; - case 'G': - // Output bits 23:12 of symbolic address with appropriate :hi12: relocation - // modifier (currently only for TLS local exec). - if (!printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O)) + case 'b': // Print B register. + case 'h': // Print H register. + case 's': // Print S register. + case 'd': // Print D register. + case 'q': // Print Q register. 
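+    // These modifiers select the width-specific alias of an FP/SIMD
+    // register while keeping its encoding: e.g., for an operand allocated
+    // to d5, a hypothetical "%s0" in the asm string would print "s5".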
+    if (MO.isReg()) {
+      const TargetRegisterClass *RC;
+      switch (ExtraCode[0]) {
+      case 'b':
+        RC = &AArch64::FPR8RegClass;
+        break;
+      case 'h':
+        RC = &AArch64::FPR16RegClass;
+        break;
+      case 's':
+        RC = &AArch64::FPR32RegClass;
+        break;
+      case 'd':
+        RC = &AArch64::FPR64RegClass;
+        break;
+      case 'q':
+        RC = &AArch64::FPR128RegClass;
+        break;
+      default:
+        return true;
+      }
+      return printAsmRegInClass(MO, RC, false /* vector */, O);
+    }
+    printOperand(MI, OpNum, O);
    return false;
-    break;
-  case 'a':
-    return PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+    }
  }
-  // There's actually no operand modifier, which leads to a slightly eclectic
-  // set of behaviour which we have to handle here.
-  const MachineOperand &MO = MI->getOperand(OpNum);
-  switch (MO.getType()) {
-  default:
-    llvm_unreachable("Unexpected operand for inline assembly");
-  case MachineOperand::MO_Register:
-    // GCC prints the unmodified operand of a 'w' constraint as the vector
-    // register. Technically, we could allocate the argument as a VPR128, but
-    // that leads to extremely dodgy copies being generated to get the data
-    // there.
-    if (printModifiedFPRAsmOperand(MO, TRI, 'v', O))
-      O << AArch64InstPrinter::getRegisterName(MO.getReg());
-    break;
-  case MachineOperand::MO_Immediate:
-    O << '#' << MO.getImm();
-    break;
-  case MachineOperand::MO_FPImmediate:
-    assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected");
-    O << "#0.0";
-    break;
-  case MachineOperand::MO_BlockAddress:
-  case MachineOperand::MO_ConstantPoolIndex:
-  case MachineOperand::MO_GlobalAddress:
-    return printSymbolicAddress(MO, false, "", O);
+  // According to ARM, we should emit x and v registers unless we have a
+  // modifier.
+  if (MO.isReg()) {
+    unsigned Reg = MO.getReg();
+
+    // If this is a w or x register, print an x register.
+    if (AArch64::GPR32allRegClass.contains(Reg) ||
+        AArch64::GPR64allRegClass.contains(Reg))
+      return printAsmMRegister(MO, 'x', O);
+
+    // If this is a b, h, s, d, or q register, print it as a v register.
+    return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */,
+                              O);
  }
+  printOperand(MI, OpNum, O);
  return false;
 }
 
@@ -246,15 +347,90 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
                                               unsigned AsmVariant,
                                               const char *ExtraCode,
                                               raw_ostream &O) {
-  // Currently both the memory constraints (m and Q) behave the same and amount
-  // to the address as a single register. In future, we may allow "m" to provide
-  // both a base and an offset.
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+
   const MachineOperand &MO = MI->getOperand(OpNum);
-  assert(MO.isReg() && "unexpected inline assembly memory operand");
-  O << '[' << AArch64InstPrinter::getRegisterName(MO.getReg()) << ']';
+  assert(MO.isReg() && "unexpected inline asm memory operand");
+  O << "[" << AArch64InstPrinter::getRegisterName(MO.getReg()) << "]";
   return false;
 }
 
+void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+                                               raw_ostream &OS) {
+  unsigned NOps = MI->getNumOperands();
+  assert(NOps == 4);
+  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+  // cast away const; DIetc do not take const operands for some reason.
+  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps - 1).getMetadata()));
+  OS << V.getName();
+  OS << " <- ";
+  // Frame address. Currently handles register +- offset only.
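+  // The resulting line looks something like
+  //   DEBUG_VALUE: myvar <- [x29+#-8]+#0
+  // prefixed by the target's comment marker (MAI->getCommentString());
+  // the operand spelling above is illustrative only.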
+  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+  OS << '[';
+  printOperand(MI, 0, OS);
+  OS << '+';
+  printOperand(MI, 1, OS);
+  OS << ']';
+  OS << "+";
+  printOperand(MI, NOps - 2, OS);
+}
+
+void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+                                      const MachineInstr &MI) {
+  unsigned NumNOPBytes = MI.getOperand(1).getImm();
+
+  SM.recordStackMap(MI);
+  // Emit padding.
+  assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+  for (unsigned i = 0; i < NumNOPBytes; i += 4)
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>
+void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+                                        const MachineInstr &MI) {
+  SM.recordPatchPoint(MI);
+
+  PatchPointOpers Opers(&MI);
+
+  int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
+  unsigned EncodedBytes = 0;
+  if (CallTarget) {
+    assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
+           "High 16 bits of call target should be zero.");
+    unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+    EncodedBytes = 16;
+    // Materialize the jump address:
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi)
+                                    .addReg(ScratchReg)
+                                    .addImm((CallTarget >> 32) & 0xFFFF)
+                                    .addImm(32));
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
+                                    .addReg(ScratchReg)
+                                    .addReg(ScratchReg)
+                                    .addImm((CallTarget >> 16) & 0xFFFF)
+                                    .addImm(16));
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
+                                    .addReg(ScratchReg)
+                                    .addReg(ScratchReg)
+                                    .addImm(CallTarget & 0xFFFF)
+                                    .addImm(0));
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg));
+  }
+  // Emit padding.
+  unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
+  assert(NumBytes >= EncodedBytes &&
+         "Patchpoint can't request size less than the length of a call.");
+  assert((NumBytes - EncodedBytes) % 4 == 0 &&
+         "Invalid number of NOP bytes requested!");
+  for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+}
+
+// Simple pseudo-instructions have their lowering (with expansion to real
+// instructions) auto-generated.
 #include "AArch64GenMCPseudoLowering.inc"
 
 void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
@@ -262,41 +438,87 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   if (emitPseudoExpansionLowering(OutStreamer, MI))
     return;
 
-  MCInst TmpInst;
-  LowerAArch64MachineInstrToMCInst(MI, TmpInst, *this);
-  EmitToStreamer(OutStreamer, TmpInst);
-}
+  if (AArch64FI->getLOHRelated().count(MI)) {
+    // Generate a label for LOH related instruction
+    MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++);
+    // Associate the instruction with the label
+    LOHInstToLabel[MI] = LOHLabel;
+    OutStreamer.EmitLabel(LOHLabel);
+  }
 
-void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
-  if (Subtarget->isTargetELF()) {
-    const TargetLoweringObjectFileELF &TLOFELF =
-        static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+  // Do any manual lowerings.
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::DBG_VALUE: {
+    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+      SmallString<128> TmpStr;
+      raw_svector_ostream OS(TmpStr);
+      PrintDebugValueComment(MI, OS);
+      OutStreamer.EmitRawText(StringRef(OS.str()));
+    }
+    return;
+  }
 
-    MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+  // Tail calls use pseudo instructions so they have the proper code-gen
+  // attributes (isCall, isReturn, etc.). We lower them to the real
+  // instruction here.
+  case AArch64::TCRETURNri: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(AArch64::BR);
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case AArch64::TCRETURNdi: {
+    MCOperand Dest;
+    MCInstLowering.lowerOperand(MI->getOperand(0), Dest);
+    MCInst TmpInst;
+    TmpInst.setOpcode(AArch64::B);
+    TmpInst.addOperand(Dest);
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case AArch64::TLSDESC_BLR: {
+    MCOperand Callee, Sym;
+    MCInstLowering.lowerOperand(MI->getOperand(0), Callee);
+    MCInstLowering.lowerOperand(MI->getOperand(1), Sym);
 
-    // Output stubs for external and common global variables.
-    MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
-    if (!Stubs.empty()) {
-      OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
-      const DataLayout *TD = TM.getDataLayout();
+    // First emit a relocation-annotation. This expands to no code, but
+    // requests that the following instruction gets an R_AARCH64_TLSDESC_CALL.
+    MCInst TLSDescCall;
+    TLSDescCall.setOpcode(AArch64::TLSDESCCALL);
+    TLSDescCall.addOperand(Sym);
+    EmitToStreamer(OutStreamer, TLSDescCall);
 
-      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
-        OutStreamer.EmitLabel(Stubs[i].first);
-        OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
-                                    TD->getPointerSize(0));
-      }
-      Stubs.clear();
-    }
+    // Other than that it's just a normal indirect call to the function loaded
+    // from the descriptor.
+    MCInst BLR;
+    BLR.setOpcode(AArch64::BLR);
+    BLR.addOperand(Callee);
+    EmitToStreamer(OutStreamer, BLR);
+
+    return;
+  }
+
+  case TargetOpcode::STACKMAP:
+    return LowerSTACKMAP(OutStreamer, SM, *MI);
+
+  case TargetOpcode::PATCHPOINT:
+    return LowerPATCHPOINT(OutStreamer, SM, *MI);
   }
-}
 
-bool AArch64AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
-  return AsmPrinter::runOnMachineFunction(MF);
+  // Finally, do the automated lowerings for everything else.
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  EmitToStreamer(OutStreamer, TmpInst);
 }
 
 // Force static initialization.
 extern "C" void LLVMInitializeAArch64AsmPrinter() {
-  RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget);
-  RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget);
-}
+  RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget);
+  RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget);
+  RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64leTarget);
+  RegisterAsmPrinter<AArch64AsmPrinter> W(TheARM64beTarget);
+}
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.h b/lib/Target/AArch64/AArch64AsmPrinter.h
deleted file mode 100644
index 824f003..0000000
--- a/lib/Target/AArch64/AArch64AsmPrinter.h
+++ /dev/null
@@ -1,76 +0,0 @@
-// AArch64AsmPrinter.h - Print machine code to an AArch64 .s file -*- C++ -*-=//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the AArch64 assembly printer class.
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_AARCH64ASMPRINTER_H -#define LLVM_AARCH64ASMPRINTER_H - -#include "AArch64.h" -#include "AArch64TargetMachine.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/Compiler.h" - -namespace llvm { - -class MCOperand; - -class LLVM_LIBRARY_VISIBILITY AArch64AsmPrinter : public AsmPrinter { - - /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can - /// make the right decision when printing asm code for different targets. - const AArch64Subtarget *Subtarget; - - // emitPseudoExpansionLowering - tblgen'erated. - bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, - const MachineInstr *MI); - - public: - explicit AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer) { - Subtarget = &TM.getSubtarget(); - } - - bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; - - MCOperand lowerSymbolOperand(const MachineOperand &MO, - const MCSymbol *Sym) const; - - void EmitInstruction(const MachineInstr *MI); - void EmitEndOfAsmFile(Module &M); - - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O); - bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O); - - /// printSymbolicAddress - Given some kind of reasonably bare symbolic - /// reference, print out the appropriate asm string to represent it. If - /// appropriate, a relocation-specifier will be produced, composed of a - /// general class derived from the MO parameter and an instruction-specific - /// suffix, provided in Suffix. E.g. ":got_lo12:" if a Suffix of "lo12" is - /// given. - bool printSymbolicAddress(const MachineOperand &MO, - bool PrintImmediatePrefix, - StringRef Suffix, raw_ostream &O); - - virtual const char *getPassName() const { - return "AArch64 Assembly Printer"; - } - - virtual bool runOnMachineFunction(MachineFunction &MF); -}; -} // end namespace llvm - -#endif diff --git a/lib/Target/AArch64/AArch64BranchFixupPass.cpp b/lib/Target/AArch64/AArch64BranchFixupPass.cpp deleted file mode 100644 index c03cdde..0000000 --- a/lib/Target/AArch64/AArch64BranchFixupPass.cpp +++ /dev/null @@ -1,600 +0,0 @@ -//===-- AArch64BranchFixupPass.cpp - AArch64 branch fixup -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass that fixes AArch64 branches which have ended up out -// of range for their immediate operands. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "aarch64-branch-fixup" -#include "AArch64.h" -#include "AArch64InstrInfo.h" -#include "Utils/AArch64BaseInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -STATISTIC(NumSplit, "Number of uncond branches inserted"); -STATISTIC(NumCBrFixed, "Number of cond branches fixed"); - -/// Return the worst case padding that could result from unknown offset bits. 
-/// This does not include alignment padding caused by known offset bits. -/// -/// @param LogAlign log2(alignment) -/// @param KnownBits Number of known low offset bits. -static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) { - if (KnownBits < LogAlign) - return (1u << LogAlign) - (1u << KnownBits); - return 0; -} - -namespace { - /// Due to limited PC-relative displacements, conditional branches to distant - /// blocks may need converting into an unconditional equivalent. For example: - /// tbz w1, #0, far_away - /// becomes - /// tbnz w1, #0, skip - /// b far_away - /// skip: - class AArch64BranchFixup : public MachineFunctionPass { - /// Information about the offset and size of a single basic block. - struct BasicBlockInfo { - /// Distance from the beginning of the function to the beginning of this - /// basic block. - /// - /// Offsets are computed assuming worst case padding before an aligned - /// block. This means that subtracting basic block offsets always gives a - /// conservative estimate of the real distance which may be smaller. - /// - /// Because worst case padding is used, the computed offset of an aligned - /// block may not actually be aligned. - unsigned Offset; - - /// Size of the basic block in bytes. If the block contains inline - /// assembly, this is a worst case estimate. - /// - /// The size does not include any alignment padding whether from the - /// beginning of the block, or from an aligned jump table at the end. - unsigned Size; - - /// The number of low bits in Offset that are known to be exact. The - /// remaining bits of Offset are an upper bound. - uint8_t KnownBits; - - /// When non-zero, the block contains instructions (inline asm) of unknown - /// size. The real size may be smaller than Size bytes by a multiple of 1 - /// << Unalign. - uint8_t Unalign; - - BasicBlockInfo() : Offset(0), Size(0), KnownBits(0), Unalign(0) {} - - /// Compute the number of known offset bits internally to this block. - /// This number should be used to predict worst case padding when - /// splitting the block. - unsigned internalKnownBits() const { - unsigned Bits = Unalign ? Unalign : KnownBits; - // If the block size isn't a multiple of the known bits, assume the - // worst case padding. - if (Size & ((1u << Bits) - 1)) - Bits = countTrailingZeros(Size); - return Bits; - } - - /// Compute the offset immediately following this block. If LogAlign is - /// specified, return the offset the successor block will get if it has - /// this alignment. - unsigned postOffset(unsigned LogAlign = 0) const { - unsigned PO = Offset + Size; - if (!LogAlign) - return PO; - // Add alignment padding from the terminator. - return PO + UnknownPadding(LogAlign, internalKnownBits()); - } - - /// Compute the number of known low bits of postOffset. If this block - /// contains inline asm, the number of known bits drops to the - /// instruction alignment. An aligned terminator may increase the number - /// of know bits. - /// If LogAlign is given, also consider the alignment of the next block. - unsigned postKnownBits(unsigned LogAlign = 0) const { - return std::max(LogAlign, internalKnownBits()); - } - }; - - std::vector BBInfo; - - /// One per immediate branch, keeping the machine instruction pointer, - /// conditional or unconditional, the max displacement, and (if IsCond is - /// true) the corresponding inverted branch opcode. 
- struct ImmBranch { - MachineInstr *MI; - unsigned OffsetBits : 31; - bool IsCond : 1; - ImmBranch(MachineInstr *mi, unsigned offsetbits, bool cond) - : MI(mi), OffsetBits(offsetbits), IsCond(cond) {} - }; - - /// Keep track of all the immediate branch instructions. - /// - std::vector ImmBranches; - - MachineFunction *MF; - const AArch64InstrInfo *TII; - public: - static char ID; - AArch64BranchFixup() : MachineFunctionPass(ID) {} - - virtual bool runOnMachineFunction(MachineFunction &MF); - - virtual const char *getPassName() const { - return "AArch64 branch fixup pass"; - } - - private: - void initializeFunctionInfo(); - MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); - void adjustBBOffsetsAfter(MachineBasicBlock *BB); - bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, - unsigned OffsetBits); - bool fixupImmediateBr(ImmBranch &Br); - bool fixupConditionalBr(ImmBranch &Br); - - void computeBlockSize(MachineBasicBlock *MBB); - unsigned getOffsetOf(MachineInstr *MI) const; - void dumpBBs(); - void verify(); - }; - char AArch64BranchFixup::ID = 0; -} - -/// check BBOffsets -void AArch64BranchFixup::verify() { -#ifndef NDEBUG - for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); - MBBI != E; ++MBBI) { - MachineBasicBlock *MBB = MBBI; - unsigned MBBId = MBB->getNumber(); - assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset); - } -#endif -} - -/// print block size and offset information - debugging -void AArch64BranchFixup::dumpBBs() { - DEBUG({ - for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) { - const BasicBlockInfo &BBI = BBInfo[J]; - dbgs() << format("%08x BB#%u\t", BBI.Offset, J) - << " kb=" << unsigned(BBI.KnownBits) - << " ua=" << unsigned(BBI.Unalign) - << format(" size=%#x\n", BBInfo[J].Size); - } - }); -} - -/// Returns an instance of the branch fixup pass. -FunctionPass *llvm::createAArch64BranchFixupPass() { - return new AArch64BranchFixup(); -} - -bool AArch64BranchFixup::runOnMachineFunction(MachineFunction &mf) { - MF = &mf; - DEBUG(dbgs() << "***** AArch64BranchFixup ******"); - TII = (const AArch64InstrInfo*)MF->getTarget().getInstrInfo(); - - // This pass invalidates liveness information when it splits basic blocks. - MF->getRegInfo().invalidateLiveness(); - - // Renumber all of the machine basic blocks in the function, guaranteeing that - // the numbers agree with the position of the block in the function. - MF->RenumberBlocks(); - - // Do the initial scan of the function, building up information about the - // sizes of each block and location of each immediate branch. - initializeFunctionInfo(); - - // Iteratively fix up branches until there is no change. - unsigned NoBRIters = 0; - bool MadeChange = false; - while (true) { - DEBUG(dbgs() << "Beginning iteration #" << NoBRIters << '\n'); - bool BRChange = false; - for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) - BRChange |= fixupImmediateBr(ImmBranches[i]); - if (BRChange && ++NoBRIters > 30) - report_fatal_error("Branch Fix Up pass failed to converge!"); - DEBUG(dumpBBs()); - - if (!BRChange) - break; - MadeChange = true; - } - - // After a while, this might be made debug-only, but it is not expensive. - verify(); - - DEBUG(dbgs() << '\n'; dumpBBs()); - - BBInfo.clear(); - ImmBranches.clear(); - - return MadeChange; -} - -/// Return true if the specified basic block can fallthrough into the block -/// immediately after it. -static bool BBHasFallthrough(MachineBasicBlock *MBB) { - // Get the next machine basic block in the function. 
- MachineFunction::iterator MBBI = MBB; - // Can't fall off end of function. - if (std::next(MBBI) == MBB->getParent()->end()) - return false; - - MachineBasicBlock *NextBB = std::next(MBBI); - for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); I != E; ++I) - if (*I == NextBB) - return true; - - return false; -} - -/// Do the initial scan of the function, building up information about the sizes -/// of each block, and each immediate branch. -void AArch64BranchFixup::initializeFunctionInfo() { - BBInfo.clear(); - BBInfo.resize(MF->getNumBlockIDs()); - - // First thing, compute the size of all basic blocks, and see if the function - // has any inline assembly in it. If so, we have to be conservative about - // alignment assumptions, as we don't know for sure the size of any - // instructions in the inline assembly. - for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - computeBlockSize(I); - - // The known bits of the entry block offset are determined by the function - // alignment. - BBInfo.front().KnownBits = MF->getAlignment(); - - // Compute block offsets and known bits. - adjustBBOffsetsAfter(MF->begin()); - - // Now go back through the instructions and build up our data structures. - for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); - MBBI != E; ++MBBI) { - MachineBasicBlock &MBB = *MBBI; - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - if (I->isDebugValue()) - continue; - - int Opc = I->getOpcode(); - if (I->isBranch()) { - bool IsCond = false; - - // The offsets encoded in instructions here scale by the instruction - // size (4 bytes), effectively increasing their range by 2 bits. - unsigned Bits = 0; - switch (Opc) { - default: - continue; // Ignore other JT branches - case AArch64::TBZxii: - case AArch64::TBZwii: - case AArch64::TBNZxii: - case AArch64::TBNZwii: - IsCond = true; - Bits = 14 + 2; - break; - case AArch64::Bcc: - case AArch64::CBZx: - case AArch64::CBZw: - case AArch64::CBNZx: - case AArch64::CBNZw: - IsCond = true; - Bits = 19 + 2; - break; - case AArch64::Bimm: - Bits = 26 + 2; - break; - } - - // Record this immediate branch. - ImmBranches.push_back(ImmBranch(I, Bits, IsCond)); - } - } - } -} - -/// Compute the size and some alignment information for MBB. This function -/// updates BBInfo directly. -void AArch64BranchFixup::computeBlockSize(MachineBasicBlock *MBB) { - BasicBlockInfo &BBI = BBInfo[MBB->getNumber()]; - BBI.Size = 0; - BBI.Unalign = 0; - - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; - ++I) { - BBI.Size += TII->getInstSizeInBytes(*I); - // For inline asm, GetInstSizeInBytes returns a conservative estimate. - // The actual size may be smaller, but still a multiple of the instr size. - if (I->isInlineAsm()) - BBI.Unalign = 2; - } -} - -/// Return the current offset of the specified machine instruction from the -/// start of the function. This offset changes as stuff is moved around inside -/// the function. -unsigned AArch64BranchFixup::getOffsetOf(MachineInstr *MI) const { - MachineBasicBlock *MBB = MI->getParent(); - - // The offset is composed of two things: the sum of the sizes of all MBB's - // before this instruction's block, and the offset from the start of the block - // it is in. - unsigned Offset = BBInfo[MBB->getNumber()].Offset; - - // Sum instructions before MI in MBB. 
- for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { - assert(I != MBB->end() && "Didn't find MI in its own basic block?"); - Offset += TII->getInstSizeInBytes(*I); - } - return Offset; -} - -/// Split the basic block containing MI into two blocks, which are joined by -/// an unconditional branch. Update data structures and renumber blocks to -/// account for this change and returns the newly created block. -MachineBasicBlock * -AArch64BranchFixup::splitBlockBeforeInstr(MachineInstr *MI) { - MachineBasicBlock *OrigBB = MI->getParent(); - - // Create a new MBB for the code after the OrigBB. - MachineBasicBlock *NewBB = - MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; ++MBBI; - MF->insert(MBBI, NewBB); - - // Splice the instructions starting with MI over to NewBB. - NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); - - // Add an unconditional branch from OrigBB to NewBB. - // Note the new unconditional branch is not being recorded. - // There doesn't seem to be meaningful DebugInfo available; this doesn't - // correspond to anything in the source. - BuildMI(OrigBB, DebugLoc(), TII->get(AArch64::Bimm)).addMBB(NewBB); - ++NumSplit; - - // Update the CFG. All succs of OrigBB are now succs of NewBB. - NewBB->transferSuccessors(OrigBB); - - // OrigBB branches to NewBB. - OrigBB->addSuccessor(NewBB); - - // Update internal data structures to account for the newly inserted MBB. - MF->RenumberBlocks(NewBB); - - // Insert an entry into BBInfo to align it properly with the (newly - // renumbered) block numbers. - BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); - - // Figure out how large the OrigBB is. As the first half of the original - // block, it cannot contain a tablejump. The size includes - // the new jump we added. (It should be possible to do this without - // recounting everything, but it's very confusing, and this is rarely - // executed.) - computeBlockSize(OrigBB); - - // Figure out how large the NewMBB is. As the second half of the original - // block, it may contain a tablejump. - computeBlockSize(NewBB); - - // All BBOffsets following these blocks must be modified. - adjustBBOffsetsAfter(OrigBB); - - return NewBB; -} - -void AArch64BranchFixup::adjustBBOffsetsAfter(MachineBasicBlock *BB) { - unsigned BBNum = BB->getNumber(); - for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) { - // Get the offset and known bits at the end of the layout predecessor. - // Include the alignment of the current block. - unsigned LogAlign = MF->getBlockNumbered(i)->getAlignment(); - unsigned Offset = BBInfo[i - 1].postOffset(LogAlign); - unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign); - - // This is where block i begins. Stop if the offset is already correct, - // and we have updated 2 blocks. This is the maximum number of blocks - // changed before calling this function. - if (i > BBNum + 2 && - BBInfo[i].Offset == Offset && - BBInfo[i].KnownBits == KnownBits) - break; - - BBInfo[i].Offset = Offset; - BBInfo[i].KnownBits = KnownBits; - } -} - -/// Returns true if the distance between specific MI and specific BB can fit in -/// MI's displacement field. 
-bool AArch64BranchFixup::isBBInRange(MachineInstr *MI, - MachineBasicBlock *DestBB, - unsigned OffsetBits) { - int64_t BrOffset = getOffsetOf(MI); - int64_t DestOffset = BBInfo[DestBB->getNumber()].Offset; - - DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber() - << " from BB#" << MI->getParent()->getNumber() - << " bits available=" << OffsetBits - << " from " << getOffsetOf(MI) << " to " << DestOffset - << " offset " << int(DestOffset-BrOffset) << "\t" << *MI); - - return isIntN(OffsetBits, DestOffset - BrOffset); -} - -/// Fix up an immediate branch whose destination is too far away to fit in its -/// displacement field. -bool AArch64BranchFixup::fixupImmediateBr(ImmBranch &Br) { - MachineInstr *MI = Br.MI; - MachineBasicBlock *DestBB = 0; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - if (MI->getOperand(i).isMBB()) { - DestBB = MI->getOperand(i).getMBB(); - break; - } - } - assert(DestBB && "Branch with no destination BB?"); - - // Check to see if the DestBB is already in-range. - if (isBBInRange(MI, DestBB, Br.OffsetBits)) - return false; - - assert(Br.IsCond && "Only conditional branches should need fixup"); - return fixupConditionalBr(Br); -} - -/// Fix up a conditional branch whose destination is too far away to fit in its -/// displacement field. It is converted to an inverse conditional branch + an -/// unconditional branch to the destination. -bool -AArch64BranchFixup::fixupConditionalBr(ImmBranch &Br) { - MachineInstr *MI = Br.MI; - MachineBasicBlock *MBB = MI->getParent(); - unsigned CondBrMBBOperand = 0; - - // The general idea is to add an unconditional branch to the destination and - // invert the conditional branch to jump over it. Complications occur around - // fallthrough and unreachable ends to the block. - // b.lt L1 - // => - // b.ge L2 - // b L1 - // L2: - - // First we invert the conditional branch, by creating a replacement if - // necessary. This if statement contains all the special handling of different - // branch types. - if (MI->getOpcode() == AArch64::Bcc) { - // The basic block is operand number 1 for Bcc - CondBrMBBOperand = 1; - - A64CC::CondCodes CC = (A64CC::CondCodes)MI->getOperand(0).getImm(); - CC = A64InvertCondCode(CC); - MI->getOperand(0).setImm(CC); - } else { - MachineInstrBuilder InvertedMI; - int InvertedOpcode; - switch (MI->getOpcode()) { - default: llvm_unreachable("Unknown branch type"); - case AArch64::TBZxii: InvertedOpcode = AArch64::TBNZxii; break; - case AArch64::TBZwii: InvertedOpcode = AArch64::TBNZwii; break; - case AArch64::TBNZxii: InvertedOpcode = AArch64::TBZxii; break; - case AArch64::TBNZwii: InvertedOpcode = AArch64::TBZwii; break; - case AArch64::CBZx: InvertedOpcode = AArch64::CBNZx; break; - case AArch64::CBZw: InvertedOpcode = AArch64::CBNZw; break; - case AArch64::CBNZx: InvertedOpcode = AArch64::CBZx; break; - case AArch64::CBNZw: InvertedOpcode = AArch64::CBZw; break; - } - - InvertedMI = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(InvertedOpcode)); - for (unsigned i = 0, e= MI->getNumOperands(); i != e; ++i) { - InvertedMI.addOperand(MI->getOperand(i)); - if (MI->getOperand(i).isMBB()) - CondBrMBBOperand = i; - } - - MI->eraseFromParent(); - MI = Br.MI = InvertedMI; - } - - // If the branch is at the end of its MBB and that has a fall-through block, - // direct the updated conditional branch to the fall-through - // block. Otherwise, split the MBB before the next instruction. 
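The isBBInRange check above reduces to a signed-fit test: the branch encodes OffsetBits bits, two of which are implicit zeros because instructions are 4-byte aligned (hence 19 + 2 bits for Bcc). A small sketch of the same test, mirroring llvm::isIntN with illustrative numbers:

    #include <cassert>
    #include <cstdint>

    // True if Delta fits in a signed N-bit immediate, mirroring llvm::isIntN.
    static bool isIntN(unsigned N, int64_t Delta) {
      return Delta >= -(INT64_C(1) << (N - 1)) && Delta < (INT64_C(1) << (N - 1));
    }

    int main() {
      // Bcc encodes 19 offset bits scaled by 4, i.e. 21 usable bits: +/-1MiB.
      assert(isIntN(21, 1048575));   // just in range
      assert(!isIntN(21, 1048576));  // one byte too far forward
      assert(isIntN(21, -1048576));  // the negative limit is one larger
      return 0;
    }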
- MachineInstr *BMI = &MBB->back(); - bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); - - ++NumCBrFixed; - if (BMI != MI) { - if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) && - BMI->getOpcode() == AArch64::Bimm) { - // Last MI in the BB is an unconditional branch. We can swap destinations: - // b.eq L1 (temporarily b.ne L1 after first change) - // b L2 - // => - // b.ne L2 - // b L1 - MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); - if (isBBInRange(MI, NewDest, Br.OffsetBits)) { - DEBUG(dbgs() << " Invert Bcc condition and swap its destination with " - << *BMI); - MachineBasicBlock *DestBB = MI->getOperand(CondBrMBBOperand).getMBB(); - BMI->getOperand(0).setMBB(DestBB); - MI->getOperand(CondBrMBBOperand).setMBB(NewDest); - return true; - } - } - } - - if (NeedSplit) { - MachineBasicBlock::iterator MBBI = MI; ++MBBI; - splitBlockBeforeInstr(MBBI); - // No need for the branch to the next block. We're adding an unconditional - // branch to the destination. - int delta = TII->getInstSizeInBytes(MBB->back()); - BBInfo[MBB->getNumber()].Size -= delta; - MBB->back().eraseFromParent(); - // BBInfo[SplitBB].Offset is wrong temporarily, fixed below - } - - // After splitting and removing the unconditional branch from the original BB, - // the structure is now: - // oldbb: - // [things] - // b.invertedCC L1 - // splitbb/fallthroughbb: - // [old b L2/real continuation] - // - // We now have to change the conditional branch to point to splitbb and add an - // unconditional branch after it to L1, giving the final structure: - // oldbb: - // [things] - // b.invertedCC splitbb - // b L1 - // splitbb/fallthroughbb: - // [old b L2/real continuation] - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); - - DEBUG(dbgs() << " Insert B to BB#" - << MI->getOperand(CondBrMBBOperand).getMBB()->getNumber() - << " also invert condition and change dest. to BB#" - << NextBB->getNumber() << "\n"); - - // Insert a new unconditional branch and fixup the destination of the - // conditional one. Also update the ImmBranch as well as adding a new entry - // for the new branch. - BuildMI(MBB, DebugLoc(), TII->get(AArch64::Bimm)) - .addMBB(MI->getOperand(CondBrMBBOperand).getMBB()); - MI->getOperand(CondBrMBBOperand).setMBB(NextBB); - - BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back()); - - // 26 bits written down in Bimm, specifying a multiple of 4. - unsigned OffsetBits = 26 + 2; - ImmBranches.push_back(ImmBranch(&MBB->back(), OffsetBits, false)); - - adjustBBOffsetsAfter(MBB); - return true; -} diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp new file mode 100644 index 0000000..5209452 --- /dev/null +++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -0,0 +1,510 @@ +//===-- AArch64BranchRelaxation.cpp - AArch64 branch relaxation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-branch-relax"
+
+static cl::opt<bool>
+BranchRelaxation("aarch64-branch-relax", cl::Hidden, cl::init(true),
+                 cl::desc("Relax out of range conditional branches"));
+
+static cl::opt<unsigned>
+TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
+                    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
+
+static cl::opt<unsigned>
+CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
+                    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
+
+static cl::opt<unsigned>
+BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
+                    cl::desc("Restrict range of Bcc instructions (DEBUG)"));
+
+STATISTIC(NumSplit, "Number of basic blocks split");
+STATISTIC(NumRelaxed, "Number of conditional branches relaxed");
+
+namespace {
+class AArch64BranchRelaxation : public MachineFunctionPass {
+  /// BasicBlockInfo - Information about the offset and size of a single
+  /// basic block.
+  struct BasicBlockInfo {
+    /// Offset - Distance from the beginning of the function to the beginning
+    /// of this basic block.
+    ///
+    /// The offset is always aligned as required by the basic block.
+    unsigned Offset;
+
+    /// Size - Size of the basic block in bytes. If the block contains
+    /// inline assembly, this is a worst case estimate.
+    ///
+    /// The size does not include any alignment padding whether from the
+    /// beginning of the block, or from an aligned jump table at the end.
+    unsigned Size;
+
+    BasicBlockInfo() : Offset(0), Size(0) {}
+
+    /// Compute the offset immediately following this block. If LogAlign is
+    /// specified, return the offset the successor block will get if it has
+    /// this alignment.
+    unsigned postOffset(unsigned LogAlign = 0) const {
+      unsigned PO = Offset + Size;
+      unsigned Align = 1 << LogAlign;
+      return (PO + Align - 1) / Align * Align;
+    }
+  };
+
+  SmallVector<BasicBlockInfo, 16> BlockInfo;
+
+  MachineFunction *MF;
+  const AArch64InstrInfo *TII;
+
+  bool relaxBranchInstructions();
+  void scanFunction();
+  MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
+  void adjustBlockOffsets(MachineBasicBlock &MBB);
+  bool isBlockInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+  bool fixupConditionalBranch(MachineInstr *MI);
+  void computeBlockSize(const MachineBasicBlock &MBB);
+  unsigned getInstrOffset(MachineInstr *MI) const;
+  void dumpBBs();
+  void verify();
+
+public:
+  static char ID;
+  AArch64BranchRelaxation() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "AArch64 branch relaxation pass";
+  }
+};
+char AArch64BranchRelaxation::ID = 0;
+}
+
+/// verify - check BBOffsets, BBSizes, alignment of islands
+void AArch64BranchRelaxation::verify() {
+#ifndef NDEBUG
+  unsigned PrevNum = MF->begin()->getNumber();
+  for (MachineBasicBlock &MBB : *MF) {
+    unsigned Align = MBB.getAlignment();
+    unsigned Num = MBB.getNumber();
+    assert(BlockInfo[Num].Offset % (1u << Align) == 0);
+    assert(!Num || BlockInfo[PrevNum].postOffset() <= BlockInfo[Num].Offset);
+    PrevNum = Num;
+  }
+#endif
+}
+
+/// print block size and offset information - debugging
+void AArch64BranchRelaxation::dumpBBs() {
+  for (auto &MBB : *MF) {
+    const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()];
+    dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset)
+           << format("size=%#x\n", BBI.Size);
+  }
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fallthrough
+/// into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+  // Get the next machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB;
+  // Can't fall off end of function.
+  MachineBasicBlock *NextBB = std::next(MBBI);
+  if (NextBB == MBB->getParent()->end())
+    return false;
+
+  for (MachineBasicBlock *S : MBB->successors())
+    if (S == NextBB)
+      return true;
+
+  return false;
+}
+
+/// scanFunction - Do the initial scan of the function, building up
+/// information about each block.
+void AArch64BranchRelaxation::scanFunction() {
+  BlockInfo.clear();
+  BlockInfo.resize(MF->getNumBlockIDs());
+
+  // First thing, compute the size of all basic blocks, and see if the function
+  // has any inline assembly in it. If so, we have to be conservative about
+  // alignment assumptions, as we don't know for sure the size of any
+  // instructions in the inline assembly.
+  for (MachineBasicBlock &MBB : *MF)
+    computeBlockSize(MBB);
+
+  // Compute block offsets and known bits.
+  adjustBlockOffsets(*MF->begin());
+}
+
+/// computeBlockSize - Compute the size for MBB.
+/// This function updates BlockInfo directly.
+void AArch64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) {
+  unsigned Size = 0;
+  for (const MachineInstr &MI : MBB)
+    Size += TII->GetInstSizeInBytes(&MI);
+  BlockInfo[MBB.getNumber()].Size = Size;
+}
+
+/// getInstrOffset - Return the current offset of the specified machine
+/// instruction from the start of the function. This offset changes as stuff is
+/// moved around inside the function.
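An aside on postOffset above: the rewritten pass drops the KnownBits/Unalign machinery entirely, so offsets are exact and the method can simply round up to the requested alignment with the classic add-and-divide trick. A small standalone check (values are illustrative):

    #include <cassert>

    // Round Offset + Size up to a multiple of 1 << LogAlign, as postOffset does.
    static unsigned postOffset(unsigned Offset, unsigned Size, unsigned LogAlign) {
      unsigned PO = Offset + Size;
      unsigned Align = 1u << LogAlign;
      return (PO + Align - 1) / Align * Align;
    }

    int main() {
      assert(postOffset(0, 10, 2) == 12);  // 10 rounded up to 4-byte alignment
      assert(postOffset(4, 12, 0) == 16);  // LogAlign = 0 leaves the sum as-is
      return 0;
    }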
+unsigned AArch64BranchRelaxation::getInstrOffset(MachineInstr *MI) const { + MachineBasicBlock *MBB = MI->getParent(); + + // The offset is composed of two things: the sum of the sizes of all MBB's + // before this instruction's block, and the offset from the start of the block + // it is in. + unsigned Offset = BlockInfo[MBB->getNumber()].Offset; + + // Sum instructions before MI in MBB. + for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { + assert(I != MBB->end() && "Didn't find MI in its own basic block?"); + Offset += TII->GetInstSizeInBytes(I); + } + return Offset; +} + +void AArch64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) { + unsigned PrevNum = Start.getNumber(); + for (auto &MBB : make_range(MachineFunction::iterator(Start), MF->end())) { + unsigned Num = MBB.getNumber(); + if (!Num) // block zero is never changed from offset zero. + continue; + // Get the offset and known bits at the end of the layout predecessor. + // Include the alignment of the current block. + unsigned LogAlign = MBB.getAlignment(); + BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(LogAlign); + PrevNum = Num; + } +} + +/// Split the basic block containing MI into two blocks, which are joined by +/// an unconditional branch. Update data structures and renumber blocks to +/// account for this change and returns the newly created block. +/// NOTE: Successor list of the original BB is out of date after this function, +/// and must be updated by the caller! Other transforms follow using this +/// utility function, so no point updating now rather than waiting. +MachineBasicBlock * +AArch64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { + MachineBasicBlock *OrigBB = MI->getParent(); + + // Create a new MBB for the code after the OrigBB. + MachineBasicBlock *NewBB = + MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); + MachineFunction::iterator MBBI = OrigBB; + ++MBBI; + MF->insert(MBBI, NewBB); + + // Splice the instructions starting with MI over to NewBB. + NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); + + // Add an unconditional branch from OrigBB to NewBB. + // Note the new unconditional branch is not being recorded. + // There doesn't seem to be meaningful DebugInfo available; this doesn't + // correspond to anything in the source. + BuildMI(OrigBB, DebugLoc(), TII->get(AArch64::B)).addMBB(NewBB); + + // Insert an entry into BlockInfo to align it properly with the block numbers. + BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); + + // Figure out how large the OrigBB is. As the first half of the original + // block, it cannot contain a tablejump. The size includes + // the new jump we added. (It should be possible to do this without + // recounting everything, but it's very confusing, and this is rarely + // executed.) + computeBlockSize(*OrigBB); + + // Figure out how large the NewMBB is. As the second half of the original + // block, it may contain a tablejump. + computeBlockSize(*NewBB); + + // All BBOffsets following these blocks must be modified. + adjustBlockOffsets(*OrigBB); + + ++NumSplit; + + return NewBB; +} + +/// isBlockInRange - Returns true if the distance between specific MI and +/// specific BB can fit in MI's displacement field. 
+bool AArch64BranchRelaxation::isBlockInRange(MachineInstr *MI, + MachineBasicBlock *DestBB, + unsigned Bits) { + unsigned MaxOffs = ((1 << (Bits - 1)) - 1) << 2; + unsigned BrOffset = getInstrOffset(MI); + unsigned DestOffset = BlockInfo[DestBB->getNumber()].Offset; + + DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber() + << " from BB#" << MI->getParent()->getNumber() + << " max delta=" << MaxOffs << " from " << getInstrOffset(MI) + << " to " << DestOffset << " offset " + << int(DestOffset - BrOffset) << "\t" << *MI); + + // Branch before the Dest. + if (BrOffset <= DestOffset) + return (DestOffset - BrOffset <= MaxOffs); + return (BrOffset - DestOffset <= MaxOffs); +} + +static bool isConditionalBranch(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::TBZW: + case AArch64::TBNZW: + case AArch64::TBZX: + case AArch64::TBNZX: + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + case AArch64::Bcc: + return true; + } +} + +static MachineBasicBlock *getDestBlock(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + assert(0 && "unexpected opcode!"); + case AArch64::TBZW: + case AArch64::TBNZW: + case AArch64::TBZX: + case AArch64::TBNZX: + return MI->getOperand(2).getMBB(); + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + case AArch64::Bcc: + return MI->getOperand(1).getMBB(); + } +} + +static unsigned getOppositeConditionOpcode(unsigned Opc) { + switch (Opc) { + default: + assert(0 && "unexpected opcode!"); + case AArch64::TBNZW: return AArch64::TBZW; + case AArch64::TBNZX: return AArch64::TBZX; + case AArch64::TBZW: return AArch64::TBNZW; + case AArch64::TBZX: return AArch64::TBNZX; + case AArch64::CBNZW: return AArch64::CBZW; + case AArch64::CBNZX: return AArch64::CBZX; + case AArch64::CBZW: return AArch64::CBNZW; + case AArch64::CBZX: return AArch64::CBNZX; + case AArch64::Bcc: return AArch64::Bcc; // Condition is an operand for Bcc. + } +} + +static unsigned getBranchDisplacementBits(unsigned Opc) { + switch (Opc) { + default: + assert(0 && "unexpected opcode!"); + case AArch64::TBNZW: + case AArch64::TBZW: + case AArch64::TBNZX: + case AArch64::TBZX: + return TBZDisplacementBits; + case AArch64::CBNZW: + case AArch64::CBZW: + case AArch64::CBNZX: + case AArch64::CBZX: + return CBZDisplacementBits; + case AArch64::Bcc: + return BCCDisplacementBits; + } +} + +static inline void invertBccCondition(MachineInstr *MI) { + assert(MI->getOpcode() == AArch64::Bcc && "Unexpected opcode!"); + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(0).getImm(); + CC = AArch64CC::getInvertedCondCode(CC); + MI->getOperand(0).setImm((int64_t)CC); +} + +/// fixupConditionalBranch - Fix up a conditional branch whose destination is +/// too far away to fit in its displacement field. It is converted to an inverse +/// conditional branch + an unconditional branch to the destination. +bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { + MachineBasicBlock *DestBB = getDestBlock(MI); + + // Add an unconditional branch to the destination and invert the branch + // condition to jump over it: + // tbz L1 + // => + // tbnz L2 + // b L1 + // L2: + + // If the branch is at the end of its MBB and that has a fall-through block, + // direct the updated conditional branch to the fall-through block. Otherwise, + // split the MBB before the next instruction. 
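isBlockInRange above turns an encodable bit count into a byte range via ((1 << (Bits - 1)) - 1) << 2: one bit is the sign, and the two low bits are implicit because branch targets are 4-byte aligned. A quick standalone check of the resulting ranges:

    #include <cassert>

    // Maximum forward byte displacement for a branch with Bits offset bits,
    // scaled by the 4-byte instruction size (as in isBlockInRange).
    static unsigned maxForwardOffset(unsigned Bits) {
      return ((1u << (Bits - 1)) - 1) << 2;
    }

    int main() {
      assert(maxForwardOffset(14) == 32764);    // TB(N)Z: just under +/-32KiB
      assert(maxForwardOffset(19) == 1048572);  // CB(N)Z, Bcc: just under +/-1MiB
      return 0;
    }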
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *BMI = &MBB->back();
+  bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+  if (BMI != MI) {
+    if (std::next(MachineBasicBlock::iterator(MI)) ==
+            std::prev(MBB->getLastNonDebugInstr()) &&
+        BMI->getOpcode() == AArch64::B) {
+      // Last MI in the BB is an unconditional branch. Can we simply invert the
+      // condition and swap destinations:
+      //   beq L1
+      //   b   L2
+      // =>
+      //   bne L2
+      //   b   L1
+      MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+      if (isBlockInRange(MI, NewDest,
+                         getBranchDisplacementBits(MI->getOpcode()))) {
+        DEBUG(dbgs() << "  Invert condition and swap its destination with "
+                     << *BMI);
+        BMI->getOperand(0).setMBB(DestBB);
+        unsigned OpNum = (MI->getOpcode() == AArch64::TBZW ||
+                          MI->getOpcode() == AArch64::TBNZW ||
+                          MI->getOpcode() == AArch64::TBZX ||
+                          MI->getOpcode() == AArch64::TBNZX)
+                             ? 2
+                             : 1;
+        MI->getOperand(OpNum).setMBB(NewDest);
+        MI->setDesc(TII->get(getOppositeConditionOpcode(MI->getOpcode())));
+        if (MI->getOpcode() == AArch64::Bcc)
+          invertBccCondition(MI);
+        return true;
+      }
+    }
+  }
+
+  if (NeedSplit) {
+    // Analyze the branch so we know how to update the successor lists.
+    MachineBasicBlock *TBB, *FBB;
+    SmallVector<MachineOperand, 4> Cond;
+    TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false);
+
+    MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI);
+    // No need for the branch to the next block. We're adding an unconditional
+    // branch to the destination.
+    int delta = TII->GetInstSizeInBytes(&MBB->back());
+    BlockInfo[MBB->getNumber()].Size -= delta;
+    MBB->back().eraseFromParent();
+    // BlockInfo[SplitBB].Offset is wrong temporarily, fixed below
+
+    // Update the successor lists according to the transformation to follow.
+    // Do it here since if there's no split, no update is needed.
+    MBB->replaceSuccessor(FBB, NewBB);
+    NewBB->addSuccessor(FBB);
+  }
+  MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
+
+  DEBUG(dbgs() << "  Insert B to BB#" << DestBB->getNumber()
+               << ", invert condition and change dest. to BB#"
+               << NextBB->getNumber() << "\n");
+
+  // Insert a new conditional branch and a new unconditional branch.
+  MachineInstrBuilder MIB = BuildMI(
+      MBB, DebugLoc(), TII->get(getOppositeConditionOpcode(MI->getOpcode())))
+                                .addOperand(MI->getOperand(0));
+  if (MI->getOpcode() == AArch64::TBZW || MI->getOpcode() == AArch64::TBNZW ||
+      MI->getOpcode() == AArch64::TBZX || MI->getOpcode() == AArch64::TBNZX)
+    MIB.addOperand(MI->getOperand(1));
+  if (MI->getOpcode() == AArch64::Bcc)
+    invertBccCondition(MIB);
+  MIB.addMBB(NextBB);
+  BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+  BuildMI(MBB, DebugLoc(), TII->get(AArch64::B)).addMBB(DestBB);
+  BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+
+  // Remove the old conditional branch. It may or may not still be in MBB.
+  BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+  MI->eraseFromParent();
+
+  // Finally, keep the block offsets up to date.
+  adjustBlockOffsets(*MBB);
+  return true;
+}
+
+bool AArch64BranchRelaxation::relaxBranchInstructions() {
+  bool Changed = false;
+  // Relaxing branches involves creating new basic blocks, so re-eval
+  // end() for termination.
+  for (auto &MBB : *MF) {
+    MachineInstr *MI = MBB.getFirstTerminator();
+    if (isConditionalBranch(MI->getOpcode()) &&
+        !isBlockInRange(MI, getDestBlock(MI),
+                        getBranchDisplacementBits(MI->getOpcode()))) {
+      fixupConditionalBranch(MI);
+      ++NumRelaxed;
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+
+  // If the pass is disabled, just bail early.
+  if (!BranchRelaxation)
+    return false;
+
+  DEBUG(dbgs() << "***** AArch64BranchRelaxation *****\n");
+
+  TII = (const AArch64InstrInfo *)MF->getTarget().getInstrInfo();
+
+  // Renumber all of the machine basic blocks in the function, guaranteeing that
+  // the numbers agree with the position of the block in the function.
+  MF->RenumberBlocks();
+
+  // Do the initial scan of the function, building up information about the
+  // sizes of each block.
+  scanFunction();
+
+  DEBUG(dbgs() << "  Basic blocks before relaxation\n");
+  DEBUG(dumpBBs());
+
+  bool MadeChange = false;
+  while (relaxBranchInstructions())
+    MadeChange = true;
+
+  // After a while, this might be made debug-only, but it is not expensive.
+  verify();
+
+  DEBUG(dbgs() << "  Basic blocks after relaxation\n");
+  DEBUG(dbgs() << '\n'; dumpBBs());
+
+  BlockInfo.clear();
+
+  return MadeChange;
+}
+
+/// createAArch64BranchRelaxation - returns an instance of the branch
+/// relaxation pass.
+FunctionPass *llvm::createAArch64BranchRelaxation() {
+  return new AArch64BranchRelaxation();
+}
diff --git a/lib/Target/AArch64/AArch64CallingConv.td b/lib/Target/AArch64/AArch64CallingConv.td
deleted file mode 100644
index 9fe6aae..0000000
--- a/lib/Target/AArch64/AArch64CallingConv.td
+++ /dev/null
@@ -1,197 +0,0 @@
-//==-- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tblgen -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// This describes the calling conventions for AArch64 architecture.
-//===----------------------------------------------------------------------===//
-
-
-// The AArch64 Procedure Call Standard is unfortunately specified at a slightly
-// higher level of abstraction than LLVM's target interface presents. In
-// particular, it refers (like other ABIs, in fact) directly to
-// structs. However, generic LLVM code takes the liberty of lowering structure
-// arguments to the component fields before we see them.
-//
-// As a result, the obvious direct map from LLVM IR to PCS concepts can't be
-// implemented, so the goals of this calling convention are, in decreasing
-// priority order:
-//     1. Expose *some* way to express the concepts required to implement the
-//        generic PCS from a front-end.
-//     2. Provide a sane ABI for pure LLVM.
-//     3. Follow the generic PCS as closely as is naturally possible.
-//
-// The suggested front-end implementation of PCS features is:
-//     * Integer, float and vector arguments of all sizes which end up in
-//       registers are passed and returned via the natural LLVM type.
-//     * Structure arguments with size <= 16 bytes are passed and returned in
-//       registers as similar integer or composite types. For example:
-//       [1 x i64], [2 x i64] or [1 x i128] (if alignment 16 needed).
-//     * HFAs in registers follow rules similar to small structs: appropriate
-//       composite types.
-// * Structure arguments with size > 16 bytes are passed via a pointer, -// handled completely by the front-end. -// * Structure return values > 16 bytes via an sret pointer argument. -// * Other stack-based arguments (not large structs) are passed using byval -// pointers. Padding arguments are added beforehand to guarantee a large -// struct doesn't later use integer registers. -// -// N.b. this means that it is the front-end's responsibility (if it cares about -// PCS compliance) to check whether enough registers are available for an -// argument when deciding how to pass it. - -class CCIfAlign: - CCIf<"ArgFlags.getOrigAlign() == " # Align, A>; - -def CC_A64_APCS : CallingConv<[ - // SRet is an LLVM-specific concept, so it takes precedence over general ABI - // concerns. However, this rule will be used by C/C++ frontends to implement - // structure return. - CCIfSRet>, - - // Put ByVal arguments directly on the stack. Minimum size and alignment of a - // slot is 64-bit. - CCIfByVal>, - - // Canonicalise the various types that live in different floating-point - // registers. This makes sense because the PCS does not distinguish Short - // Vectors and Floating-point types. - CCIfType<[v1i16, v2i8], CCBitConvertToType>, - CCIfType<[v1i32, v4i8, v2i16], CCBitConvertToType>, - CCIfType<[v8i8, v4i16, v2i32, v2f32, v1i64, v1f64], CCBitConvertToType>, - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCBitConvertToType>, - - // PCS: "C.1: If the argument is a Half-, Single-, Double- or Quad- precision - // Floating-point or Short Vector Type and the NSRN is less than 8, then the - // argument is allocated to the least significant bits of register - // v[NSRN]. The NSRN is incremented by one. The argument has now been - // allocated." - CCIfType<[v1i8], CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>, - CCIfType<[f16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>, - CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>, - CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, - CCIfType<[f128], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - - // PCS: "C.2: If the argument is an HFA and there are sufficient unallocated - // SIMD and Floating-point registers (NSRN - number of elements < 8), then the - // argument is allocated to SIMD and Floating-point registers (with one - // register per element of the HFA). The NSRN is incremented by the number of - // registers used. The argument has now been allocated." - // - // N.b. As above, this rule is the responsibility of the front-end. - - // "C.3: If the argument is an HFA then the NSRN is set to 8 and the size of - // the argument is rounded up to the nearest multiple of 8 bytes." - // - // "C.4: If the argument is an HFA, a Quad-precision Floating-point or Short - // Vector Type then the NSAA is rounded up to the larger of 8 or the Natural - // Alignment of the Argument's type." - // - // It is expected that these will be satisfied by adding dummy arguments to - // the prototype. - - // PCS: "C.5: If the argument is a Half- or Single- precision Floating-point - // type then the size of the argument is set to 8 bytes. The effect is as if - // the argument had been copied to the least significant bits of a 64-bit - // register and the remaining bits filled with unspecified values." 
- CCIfType<[f16, f32], CCPromoteToType>, - - // PCS: "C.6: If the argument is an HFA, a Half-, Single-, Double- or Quad- - // precision Floating-point or Short Vector Type, then the argument is copied - // to memory at the adjusted NSAA. The NSAA is incremented by the size of the - // argument. The argument has now been allocated." - CCIfType<[f64], CCAssignToStack<8, 8>>, - CCIfType<[f128], CCAssignToStack<16, 16>>, - - // PCS: "C.7: If the argument is an Integral Type, the size of the argument is - // less than or equal to 8 bytes and the NGRN is less than 8, the argument is - // copied to the least significant bits of x[NGRN]. The NGRN is incremented by - // one. The argument has now been allocated." - - // First we implement C.8 and C.9 (128-bit types get even registers). i128 is - // represented as two i64s, the first one being split. If we delayed this - // operation C.8 would never be reached. - CCIfType<[i64], - CCIfSplit>>, - - // Note: the promotion also implements C.14. - CCIfType<[i8, i16, i32], CCPromoteToType>, - - // And now the real implementation of C.7 - CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6, X7]>>, - - // PCS: "C.8: If the argument has an alignment of 16 then the NGRN is rounded - // up to the next even number." - // - // "C.9: If the argument is an Integral Type, the size of the argument is - // equal to 16 and the NGRN is less than 7, the argument is copied to x[NGRN] - // and x[NGRN+1], x[NGRN] shall contain the lower addressed double-word of the - // memory representation of the argument. The NGRN is incremented by two. The - // argument has now been allocated." - // - // Subtlety here: what if alignment is 16 but it is not an integral type? All - // floating-point types have been allocated already, which leaves composite - // types: this is why a front-end may need to produce i128 for a struct <= 16 - // bytes. - - // PCS: "C.10 If the argument is a Composite Type and the size in double-words - // of the argument is not more than 8 minus NGRN, then the argument is copied - // into consecutive general-purpose registers, starting at x[NGRN]. The - // argument is passed as though it had been loaded into the registers from a - // double-word aligned address with an appropriate sequence of LDR - // instructions loading consecutive registers from memory (the contents of any - // unused parts of the registers are unspecified by this standard). The NGRN - // is incremented by the number of registers used. The argument has now been - // allocated." - // - // Another one that's the responsibility of the front-end (sigh). - - // PCS: "C.11: The NGRN is set to 8." - CCCustom<"CC_AArch64NoMoreRegs">, - - // PCS: "C.12: The NSAA is rounded up to the larger of 8 or the Natural - // Alignment of the argument's type." - // - // PCS: "C.13: If the argument is a composite type then the argument is copied - // to memory at the adjusted NSAA. The NSAA is by the size of the - // argument. The argument has now been allocated." - // - // Note that the effect of this corresponds to a memcpy rather than register - // stores so that the struct ends up correctly addressable at the adjusted - // NSAA. - - // PCS: "C.14: If the size of the argument is less than 8 bytes then the size - // of the argument is set to 8 bytes. The effect is as if the argument was - // copied to the least significant bits of a 64-bit register and the remaining - // bits filled with unspecified values." - // - // Integer types were widened above. 
Floating-point and composite types have
-  // already been allocated completely. Nothing to do.
-
-  // PCS: "C.15: The argument is copied to memory at the adjusted NSAA. The NSAA
-  // is incremented by the size of the argument. The argument has now been
-  // allocated."
-  CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
-  CCIfType<[i64], CCAssignToStack<8, 8>>
-
-]>;
-
-// According to the PCS, X19-X30 are callee-saved, however only the low 64-bits
-// of vector registers (8-15) are callee-saved. The order here is picked up
-// by PrologEpilogInserter.cpp to allocate stack slots, starting from top of
-// stack upon entry. This gives the customary layout of x30 at [sp-8], x29 at
-// [sp-16], ...
-def CSR_PCS : CalleeSavedRegs<(add (sequence "X%u", 30, 19),
-                                   (sequence "D%u", 15, 8))>;
-
-
-// TLS descriptor calls are extremely restricted in their changes, to allow
-// optimisations in the (hopefully) more common fast path where no real action
-// is needed. They actually have to preserve all registers, except for the
-// unavoidable X30 and the return register X0.
-def TLSDesc : CalleeSavedRegs<(add (sequence "X%u", 29, 1),
-                                   (sequence "Q%u", 31, 0))>;
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
new file mode 100644
index 0000000..ded2e17
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -0,0 +1,240 @@
+//=- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for AArch64 architecture.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfAlign - Match of the original alignment of the arg
+class CCIfAlign<string Align, CCAction A> :
+  CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+/// CCIfBigEndian - Match only if we're in big endian mode.
+class CCIfBigEndian<CCAction A> :
+  CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>;
+
+class CCIfUnallocated<string Reg, CCAction A> :
+  CCIf<"!State.isAllocated(AArch64::" # Reg # ")", A>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS64 Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_AArch64_AAPCS : CallingConv<[
+  CCIfType<[v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+
+  // Big endian vectors must be passed as if they were 1-element vectors so that
+  // their lanes are in a consistent order.
+  CCIfBigEndian<CCIfType<[v2i32, v2i16, v4i16, v4i8, v8i8],
+                         CCBitConvertToType<f64>>>,
+  CCIfBigEndian<CCIfType<[v2i64, v4i32, v8i16, v16i8],
+                         CCBitConvertToType<v2i64>>>,
+
+  // An SRet is passed in X8, not X0 like a normal pointer parameter.
+  CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
+
+  // Put ByVal arguments directly on the stack. Minimum size and alignment of a
+  // slot is 64-bit.
+  CCIfByVal<CCPassByVal<8, 8>>,
+
+  // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
+  // up to eight each of GPR and FPR.
+  CCIfType<[i1, i8, i16], CCIfUnallocated<"X7", CCPromoteToType<i32>>>,
+  CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
+                                          [X0, X1, X2, X3, X4, X5, X6, X7]>>,
+  // i128 is split to two i64s, we can't fit half to register X7.
+  CCIfType<[i64], CCIfSplit<CCAssignToRegWithShadow<[X0, X2, X4, X6],
+                                                    [X0, X1, X3, X5]>>>,
+
+  // i128 is split to two i64s, and its stack alignment is 16 bytes.
+ CCIfType<[i64], CCIfSplit>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + // If more than will fit in registers, pass them on the stack instead. + CCIfType<[i1, i8, i16], CCAssignToStack<8, 8>>, + CCIfType<[i32, f32], CCAssignToStack<8, 8>>, + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], + CCAssignToStack<8, 8>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCAssignToStack<16, 16>> +]>; + +def RetCC_AArch64_AAPCS : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32], CCBitConvertToType>, + + // Big endian vectors must be passed as if they were 1-element vectors so that + // their lanes are in a consistent order. + CCIfBigEndian>>, + CCIfBigEndian>>, + + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> +]>; + + +// Darwin uses a calling convention which differs in only two ways +// from the standard one at this level: +// + i128s (i.e. split i64s) don't need even registers. +// + Stack slots are sized as needed rather than being at least 64-bit. +def CC_AArch64_DarwinPCS : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, + + // An SRet is passed in X8, not X0 like a normal pointer parameter. + CCIfSRet>>, + + // Put ByVal arguments directly on the stack. Minimum size and alignment of a + // slot is 64-bit. + CCIfByVal>, + + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, + // up to eight each of GPR and FPR. + CCIfType<[i1, i8, i16], CCIfUnallocated<"X7", CCPromoteToType>>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + // i128 is split to two i64s, we can't fit half to register X7. + CCIfType<[i64], + CCIfSplit>>, + // i128 is split to two i64s, and its stack alignment is 16 bytes. 
+ CCIfType<[i64], CCIfSplit>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + // If more than will fit in registers, pass them on the stack instead. + CCIfType<[i1, i8], CCAssignToStack<1, 1>>, + CCIfType<[i16], CCAssignToStack<2, 2>>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], + CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> +]>; + +def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, + + // Handle all scalar types as either i64 or f64. + CCIfType<[i8, i16, i32], CCPromoteToType>, + CCIfType<[f32], CCPromoteToType>, + + // Everything is on the stack. + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfType<[i64], CCIfSplit>>, + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> +]>; + +// The WebKit_JS calling convention only passes the first argument (the callee) +// in register and the remaining arguments on stack. We allow 32bit stack slots, +// so that WebKit can write partial values in the stack and define the other +// 32bit quantity as undef. +def CC_AArch64_WebKit_JS : CallingConv<[ + // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0). + CCIfType<[i1, i8, i16], CCIfUnallocated<"X0", CCPromoteToType>>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>, + + // Pass the remaining arguments on the stack instead. + CCIfType<[i1, i8, i16], CCAssignToStack<4, 4>>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64], CCAssignToStack<8, 8>> +]>; + +def RetCC_AArch64_WebKit_JS : CallingConv<[ + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> +]>; + +// FIXME: LR is only callee-saved in the sense that *we* preserve it and are +// presumably a callee to someone. External functions may not do so, but this +// is currently safe since BL has LR as an implicit-def and what happens after a +// tail call doesn't matter. +// +// It would be better to model its preservation semantics properly (create a +// vreg on entry, use it in RET & tail call generation; make that vreg def if we +// end up saving LR as part of a call frame). Watch this space... 
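The practical difference between the AAPCS and DarwinPCS stack rules above is slot sizing: AAPCS always burns an 8-byte, 8-aligned slot for a small integer, while DarwinPCS packs arguments at their natural size and alignment. A toy NSAA-style calculation (plain C++, not LLVM code) for three stack arguments of 1, 2 and 4 bytes:

    #include <cassert>
    #include <cstddef>

    // Allocate a stack slot of the given size/alignment, returning its offset.
    static size_t allocSlot(size_t &NSAA, size_t Size, size_t Align) {
      NSAA = (NSAA + Align - 1) & ~(Align - 1);  // round up (Align power of two)
      size_t Off = NSAA;
      NSAA += Size;
      return Off;
    }

    int main() {
      // AAPCS: every small integer takes an 8-byte, 8-aligned slot.
      size_t A = 0;
      allocSlot(A, 8, 8); allocSlot(A, 8, 8); allocSlot(A, 8, 8);
      assert(A == 24);

      // DarwinPCS: i8/i16/i32 take 1-, 2- and 4-byte slots respectively.
      size_t D = 0;
      allocSlot(D, 1, 1); allocSlot(D, 2, 2); allocSlot(D, 4, 4);
      assert(D == 8);  // 1 byte + 1 pad + 2 bytes + 4 bytes
      return 0;
    }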
+def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, + X23, X24, X25, X26, X27, X28, + D8, D9, D10, D11, + D12, D13, D14, D15)>; + +// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since +// 'this' and the pointer return value are both passed in X0 in these cases, +// this can be partially modelled by treating X0 as a callee-saved register; +// only the resulting RegMask is used; the SaveList is ignored +// +// (For generic ARM 64-bit ABI code, clang will not generate constructors or +// destructors with 'this' returns, so this RegMask will not be used in that +// case) +def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; + +// The function used by Darwin to obtain the address of a thread-local variable +// guarantees more than a normal AAPCS function. x16 and x17 are used on the +// fast path for calculation, but other registers except X0 (argument/return) +// and LR (it is a call, after all) are preserved. +def CSR_AArch64_TLS_Darwin + : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), + FP, + (sequence "Q%u", 0, 31))>; + +// The ELF stub used for TLS-descriptor access saves every feasible +// register. Only X0 and LR are clobbered. +def CSR_AArch64_TLS_ELF + : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP, + (sequence "Q%u", 0, 31))>; + +def CSR_AArch64_AllRegs + : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP, + (sequence "X%u", 0, 28), FP, LR, SP, + (sequence "B%u", 0, 31), (sequence "H%u", 0, 31), + (sequence "S%u", 0, 31), (sequence "D%u", 0, 31), + (sequence "Q%u", 0, 31))>; + diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp new file mode 100644 index 0000000..4d23dc5 --- /dev/null +++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -0,0 +1,147 @@ +//===-- AArch64CleanupLocalDynamicTLSPass.cpp ---------------------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Local-dynamic access to thread-local variables proceeds in three stages. +// +// 1. The offset of this Module's thread-local area from TPIDR_EL0 is calculated +// in much the same way as a general-dynamic TLS-descriptor access against +// the special symbol _TLS_MODULE_BASE. +// 2. The variable's offset from _TLS_MODULE_BASE_ is calculated using +// instructions with "dtprel" modifiers. +// 3. These two are added, together with TPIDR_EL0, to obtain the variable's +// true address. +// +// This is only better than general-dynamic access to the variable if two or +// more of the first stage TLS-descriptor calculations can be combined. This +// pass looks through a function and performs such combinations. 
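The cleanup described above is a pre-order walk of the dominator tree: the first _TLS_MODULE_BASE_ computation on a path from the root is kept and its result cached in a virtual register, and every dominated repeat is replaced by a copy. A much-simplified, LLVM-free sketch of that caching pattern (DomNode and countReusable are invented stand-ins, not names from the patch):

    #include <vector>

    // A node in a dominator tree; Calls counts base-address computations in the
    // corresponding block (illustrative stand-ins for real MachineInstrs).
    struct DomNode {
      int Calls = 0;
      std::vector<DomNode *> Children;
    };

    // Pre-order walk: the first computation seen on a path from the root is kept
    // (Cached becomes true); every dominated repeat could reuse its result.
    static int countReusable(DomNode *N, bool Cached) {
      int Reused = 0;
      for (int i = 0; i < N->Calls; ++i) {
        if (Cached)
          ++Reused;       // would become a COPY from the cached register
        else
          Cached = true;  // first computation: cache its result
      }
      for (DomNode *C : N->Children)
        Reused += countReusable(C, Cached);
      return Reused;
    }

    int main() {
      DomNode Leaf{1, {}}, Root{1, {&Leaf}};
      return countReusable(&Root, false) == 1 ? 0 : 1;  // leaf call is reusable
    }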
+//
+//===----------------------------------------------------------------------===//
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
+namespace {
+struct LDTLSCleanup : public MachineFunctionPass {
+  static char ID;
+  LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+    if (AFI->getNumLocalDynamicTLSAccesses() < 2) {
+      // No point folding accesses if there aren't at least two.
+      return false;
+    }
+
+    MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+    return VisitNode(DT->getRootNode(), 0);
+  }
+
+  // Visit the dominator subtree rooted at Node in pre-order.
+  // If TLSBaseAddrReg is non-null, then use that to replace any
+  // TLS_base_addr instructions. Otherwise, create the register
+  // when the first such instruction is seen, and then use it
+  // as we encounter more instructions.
+  bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+    MachineBasicBlock *BB = Node->getBlock();
+    bool Changed = false;
+
+    // Traverse the current block.
+    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+         ++I) {
+      switch (I->getOpcode()) {
+      case AArch64::TLSDESC_BLR:
+        // Make sure it's a local dynamic access.
+        if (!I->getOperand(1).isSymbol() ||
+            strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_"))
+          break;
+
+        if (TLSBaseAddrReg)
+          I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+        else
+          I = setRegister(I, &TLSBaseAddrReg);
+        Changed = true;
+        break;
+      default:
+        break;
+      }
+    }
+
+    // Visit the children of this block in the dominator tree.
+    for (MachineDomTreeNode *N : *Node) {
+      Changed |= VisitNode(N, TLSBaseAddrReg);
+    }
+
+    return Changed;
+  }
+
+  // Replace the TLS_base_addr instruction I with a copy from
+  // TLSBaseAddrReg, returning the new instruction.
+  MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I,
+                                       unsigned TLSBaseAddrReg) {
+    MachineFunction *MF = I->getParent()->getParent();
+    const AArch64TargetMachine *TM =
+        static_cast<const AArch64TargetMachine *>(&MF->getTarget());
+    const AArch64InstrInfo *TII = TM->getInstrInfo();
+
+    // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
+    // code sequence assumes the address will be.
+    MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+                                 TII->get(TargetOpcode::COPY),
+                                 AArch64::X0).addReg(TLSBaseAddrReg);
+
+    // Erase the TLS_base_addr instruction.
+    I->eraseFromParent();
+
+    return Copy;
+  }
+
+  // Create a virtual register in *TLSBaseAddrReg, and populate it by
+  // inserting a copy instruction after I. Returns the new instruction.
+  MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+    MachineFunction *MF = I->getParent()->getParent();
+    const AArch64TargetMachine *TM =
+        static_cast<const AArch64TargetMachine *>(&MF->getTarget());
+    const AArch64InstrInfo *TII = TM->getInstrInfo();
+
+    // Create a virtual register for the TLS base address.
+    MachineRegisterInfo &RegInfo = MF->getRegInfo();
+    *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass);
+
+    // Insert a copy from X0 to TLSBaseAddrReg for later.
+    MachineInstr *Next = I->getNextNode();
+    MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+                                 TII->get(TargetOpcode::COPY),
+                                 *TLSBaseAddrReg).addReg(AArch64::X0);
+
+    return Copy;
+  }
+
+  const char *getPassName() const override {
+    return "Local Dynamic TLS Access Clean-up";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass *llvm::createAArch64CleanupLocalDynamicTLSPass() {
+  return new LDTLSCleanup();
+}
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
new file mode 100644
index 0000000..6b1f096
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -0,0 +1,1117 @@
+//===---------- AArch64CollectLOH.cpp - AArch64 collect LOH pass --*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that collects the Linker Optimization Hints (LOH).
+// This pass should be run at the very end of the compilation flow, just before
+// assembly printer.
+// To be useful for the linker, the LOH must be printed into the assembly file.
+//
+// A LOH describes a sequence of instructions that may be optimized by the
+// linker.
+// This same sequence cannot be optimized by the compiler because some of
+// the information will only be known at link time.
+// For instance, consider the following sequence:
+//  L1: adrp xA, sym@PAGE
+//  L2: add xB, xA, sym@PAGEOFF
+//  L3: ldr xC, [xB, #imm]
+// This sequence can be turned into:
+// A literal load if sym@PAGE + sym@PAGEOFF + #imm - address(L3) is < 1MB:
+//  L3: ldr xC, sym+#imm
+// It may also be turned into either the following more efficient
+// code sequences:
+// - If sym@PAGEOFF + #imm fits the encoding space of L3.
+//  L1: adrp xA, sym@PAGE
+//  L3: ldr xC, [xB, sym@PAGEOFF + #imm]
+// - If sym@PAGE + sym@PAGEOFF - address(L1) < 1MB:
+//  L1: adr xA, sym
+//  L3: ldr xC, [xB, #imm]
+//
+// To be valid a LOH must meet all the requirements needed by all the related
+// possible linker transformations.
+// For instance, using the running example, the constraints to emit
+// ".loh AdrpAddLdr" are:
+// - L1, L2, and L3 instructions are of the expected type, i.e.,
+//   respectively ADRP, ADD (immediate), and LD.
+// - The result of L1 is used only by L2.
+// - The register argument (xA) used in the ADD instruction is defined
+//   only by L1.
+// - The result of L2 is used only by L3.
+// - The base address (xB) in L3 is defined only by L2.
+// - The ADRP in L1 and the ADD in L2 must reference the same symbol using
+//   @PAGE/@PAGEOFF with no additional constants.
+//
+// Currently supported LOHs are:
+// * So called non-ADRP-related:
+//   - .loh AdrpAddLdr L1, L2, L3:
+//       L1: adrp xA, sym@PAGE
+//       L2: add xB, xA, sym@PAGEOFF
+//       L3: ldr xC, [xB, #imm]
+//   - .loh AdrpLdrGotLdr L1, L2, L3:
+//       L1: adrp xA, sym@GOTPAGE
+//       L2: ldr xB, [xA, sym@GOTPAGEOFF]
+//       L3: ldr xC, [xB, #imm]
+//   - .loh AdrpLdr L1, L3:
+//       L1: adrp xA, sym@PAGE
+//       L3: ldr xC, [xA, sym@PAGEOFF]
+//   - .loh AdrpAddStr L1, L2, L3:
+//       L1: adrp xA, sym@PAGE
+//       L2: add xB, xA, sym@PAGEOFF
+//       L3: str xC, [xB, #imm]
+//   - .loh AdrpLdrGotStr L1, L2, L3:
+//       L1: adrp xA, sym@GOTPAGE
+//       L2: ldr xB, [xA, sym@GOTPAGEOFF]
+//       L3: str xC, [xB, #imm]
+//   - .loh AdrpAdd L1, L2:
+//       L1: adrp xA, sym@PAGE
+//       L2: add xB, xA, sym@PAGEOFF
+//   For all these LOHs, L1, L2, L3 form a simple chain:
+//   L1's result is used only by L2, and L2's result only by L3.
+//   L3's LOH-related argument is defined only by L2, and L2's LOH-related
+//   argument only by L1.
+//   All these LOHs aim at using more efficient load/store patterns by folding
+//   some instructions used to compute the address directly into the
+//   load/store.
+//
+// * So called ADRP-related:
+//   - .loh AdrpAdrp L2, L1:
+//       L2: ADRP xA, sym1@PAGE
+//       L1: ADRP xA, sym2@PAGE
+//       L2 dominates L1 and xA is not redefined between L2 and L1.
+//   This LOH aims at getting rid of redundant ADRP instructions.
+//
+// The overall design for emitting the LOHs is:
+// 1. AArch64CollectLOH (this pass) records the LOHs in the
+//    AArch64FunctionInfo.
+// 2. AArch64AsmPrinter reads the LOHs from AArch64FunctionInfo and it:
+//     1. Associates a label with each of them.
+//     2. Emits them via an MCStreamer (EmitLOHDirective).
+//         - The MCMachOStreamer records them into the MCAssembler.
+//         - The MCAsmStreamer prints them.
+//         - Other MCStreamers ignore them.
+// 3. On closing the MCStreamer:
+//     - The MachObjectWriter gets them from the MCAssembler and writes
+//       them in the object file.
+//     - Other ObjectWriters ignore them.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-collect-loh"
+
+static cl::opt<bool>
+PreCollectRegister("aarch64-collect-loh-pre-collect-register", cl::Hidden,
+                   cl::desc("Restrict analysis to registers involved"
+                            " in LOHs"),
+                   cl::init(true));
+
+static cl::opt<bool>
+BasicBlockScopeOnly("aarch64-collect-loh-bb-only", cl::Hidden,
+                    cl::desc("Restrict analysis to basic block scope"),
+                    cl::init(true));
+
+STATISTIC(NumADRPSimpleCandidate,
+          "Number of simplifiable ADRP dominated by another");
+STATISTIC(NumADRPComplexCandidate2,
+          "Number of simplifiable ADRP reachable by 2 defs");
+STATISTIC(NumADRPComplexCandidate3,
+          "Number of simplifiable ADRP reachable by 3 defs");
+STATISTIC(NumADRPComplexCandidateOther,
+          "Number of simplifiable ADRP reachable by 4 or more defs");
+STATISTIC(NumADDToSTRWithImm,
+          "Number of simplifiable STR with imm reachable by ADD");
+STATISTIC(NumLDRToSTRWithImm,
+          "Number of simplifiable STR with imm reachable by LDR");
+STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD");
+STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR");
+STATISTIC(NumADDToLDRWithImm,
+          "Number of simplifiable LDR with imm reachable by ADD");
+STATISTIC(NumLDRToLDRWithImm,
+          "Number of simplifiable LDR with imm reachable by LDR");
+STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD");
+STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR");
+STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP");
+STATISTIC(NumCplxLvl1, "Number of complex cases of level 1");
+STATISTIC(NumTooCplxLvl1, "Number of too complex cases of level 1");
+STATISTIC(NumCplxLvl2, "Number of complex cases of level 2");
+STATISTIC(NumTooCplxLvl2, "Number of too complex cases of level 2");
+STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD");
+STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD");
+
+namespace llvm {
+void initializeAArch64CollectLOHPass(PassRegistry &);
+}
+
+namespace {
+struct AArch64CollectLOH : public MachineFunctionPass {
+  static char ID;
+  AArch64CollectLOH() : MachineFunctionPass(ID) {
+    initializeAArch64CollectLOHPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "AArch64 Collect Linker Optimization Hint (LOH)";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+    AU.addRequired<MachineDominatorTree>();
+  }
+
+private:
+};
+
+/// A set of MachineInstrs.
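+/// A SetVector is used rather than a plain set so that iteration order is
+/// deterministic (it follows insertion order).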
+typedef SetVector<const MachineInstr *> SetOfMachineInstr;
+/// Map a basic block to a set of instructions per register.
+/// This is used to represent the exposed uses of a basic block
+/// per register.
+typedef MapVector<const MachineBasicBlock *, SetOfMachineInstr *>
+BlockToSetOfInstrsPerColor;
+/// Map a basic block to an instruction per register.
+/// This is used to represent the live-out definitions of a basic block
+/// per register.
+typedef MapVector<const MachineBasicBlock *, const MachineInstr **>
+BlockToInstrPerColor;
+/// Map an instruction to a set of instructions. Used to represent the
+/// mapping from a definition to its reachable uses or from a use to its
+/// definitions.
+typedef MapVector<const MachineInstr *, SetOfMachineInstr> InstrToInstrs;
+/// Map a basic block to a BitVector.
+/// This is used to record the killed registers per basic block.
+typedef MapVector<const MachineBasicBlock *, BitVector> BlockToRegSet;
+
+/// Map a register to a dense id.
+typedef DenseMap<unsigned, unsigned> MapRegToId;
+/// Map a dense id to a register. Used for debug purposes.
+typedef SmallVector<unsigned, 32> MapIdToReg;
+} // end anonymous namespace.
+
+char AArch64CollectLOH::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh",
+                      "AArch64 Collect Linker Optimization Hint (LOH)", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh",
+                    "AArch64 Collect Linker Optimization Hint (LOH)", false,
+                    false)
+
+/// Given a couple (MBB, reg) get the corresponding set of instructions from
+/// the given "sets".
+/// If this couple does not reference any set, an empty set is added to "sets"
+/// for this couple and returned.
+/// \param nbRegs is used internally to allocate some memory. It must be
+/// consistent with the way sets is used.
+static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets,
+                                 const MachineBasicBlock &MBB, unsigned reg,
+                                 unsigned nbRegs) {
+  SetOfMachineInstr *result;
+  BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB);
+  if (it != sets.end())
+    result = it->second;
+  else
+    result = sets[&MBB] = new SetOfMachineInstr[nbRegs];
+
+  return result[reg];
+}
+
+/// Given a couple (reg, MI) get the corresponding set of instructions from
+/// the given "sets".
+/// This is used to get the uses recorded in sets for a definition identified
+/// by MI and reg, i.e., MI defines reg.
+/// If the couple does not reference anything, an empty set is added to
+/// "sets[reg]".
+/// \pre set[reg] is valid.
+static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg,
+                                  const MachineInstr &MI) {
+  return sets[reg][&MI];
+}
+
+/// Same as getUses but does not modify the input map: sets.
+/// \return NULL if the couple (reg, MI) is not in sets.
+static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg,
+                                        const MachineInstr &MI) {
+  InstrToInstrs::const_iterator Res = sets[reg].find(&MI);
+  if (Res != sets[reg].end())
+    return &(Res->second);
+  return nullptr;
+}
+
+/// Initialize the reaching definition algorithm:
+/// For each basic block BB in MF, record:
+/// - its kill set.
+/// - its reachable uses (uses that are exposed to BB's predecessors).
+/// - the definitions it generates.
+/// \param DummyOp if not NULL, specifies a dummy operation to be added to
+/// the list of uses of exposed definitions.
+/// \param ADRPMode specifies to only consider ADRP instructions for generated
+/// definitions. It also considers definitions of ADRP instructions as uses and
+/// ignores other uses. ADRPMode is used to collect the information for LOHs
+/// that involve ADRP operations only.
+static void initReachingDef(MachineFunction &MF,
+                            InstrToInstrs *ColorOpToReachedUses,
+                            BlockToInstrPerColor &Gen, BlockToRegSet &Kill,
+                            BlockToSetOfInstrsPerColor &ReachableUses,
+                            const MapRegToId &RegToId,
+                            const MachineInstr *DummyOp, bool ADRPMode) {
+  const TargetMachine &TM = MF.getTarget();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+
+  unsigned NbReg = RegToId.size();
+
+  for (MachineBasicBlock &MBB : MF) {
+    const MachineInstr **&BBGen = Gen[&MBB];
+    BBGen = new const MachineInstr *[NbReg];
+    memset(BBGen, 0, sizeof(const MachineInstr *) * NbReg);
+
+    BitVector &BBKillSet = Kill[&MBB];
+    BBKillSet.resize(NbReg);
+    for (const MachineInstr &MI : MBB) {
+      bool IsADRP = MI.getOpcode() == AArch64::ADRP;
+
+      // Process uses first.
+      if (IsADRP || !ADRPMode)
+        for (const MachineOperand &MO : MI.operands()) {
+          // Treat ADRP defs as uses, as the goal of the analysis is to find
+          // ADRP defs reached by other ADRP defs.
+          if (!MO.isReg() || (!ADRPMode && !MO.isUse()) ||
+              (ADRPMode && (!IsADRP || !MO.isDef())))
+            continue;
+          unsigned CurReg = MO.getReg();
+          MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg);
+          if (ItCurRegId == RegToId.end())
+            continue;
+          CurReg = ItCurRegId->second;
+
+          // If CurReg has not been defined, this use is reachable.
+          if (!BBGen[CurReg] && !BBKillSet.test(CurReg))
+            getSet(ReachableUses, MBB, CurReg, NbReg).insert(&MI);
+          // The current basic block definition for this color, if any, is in
+          // Gen.
+          if (BBGen[CurReg])
+            getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(&MI);
+        }
+
+      // Process clobbers.
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isRegMask())
+          continue;
+        // Clobbers kill the related colors.
+        const uint32_t *PreservedRegs = MO.getRegMask();
+
+        // Set generated regs.
+        for (const auto Entry : RegToId) {
+          unsigned Reg = Entry.second;
+          // Use the global register ID when querying APIs external to this
+          // pass.
+          if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) {
+            // Do not record a definition for a clobber outside ADRP mode.
+            // Such a definition is not used anyway (otherwise register
+            // allocation would be wrong).
+            BBGen[Reg] = ADRPMode ? &MI : nullptr;
+            BBKillSet.set(Reg);
+          }
+        }
+      }
+
+      // Process register defs.
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg() || !MO.isDef())
+          continue;
+        unsigned CurReg = MO.getReg();
+        MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg);
+        if (ItCurRegId == RegToId.end())
+          continue;
+
+        for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) {
+          MapRegToId::const_iterator ItRegId = RegToId.find(*AI);
+          assert(ItRegId != RegToId.end() &&
+                 "Sub-register of an involved register, not recorded as "
+                 "involved!");
+          BBKillSet.set(ItRegId->second);
+          BBGen[ItRegId->second] = &MI;
+        }
+        BBGen[ItCurRegId->second] = &MI;
+      }
+    }
+
+    // If we restrict our analysis to basic block scope, conservatively add a
+    // dummy use for each generated value.
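+    // The dummy use makes every live-out definition look as if it had a
+    // non-local user, so the later chain analysis conservatively rejects it.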
+    if (!ADRPMode && DummyOp && !MBB.succ_empty())
+      for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg)
+        if (BBGen[CurReg])
+          getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(DummyOp);
+  }
+}
+
+/// Reaching def core algorithm:
+/// while an Out has changed
+///  for each bb
+///   for each color
+///    In[bb][color] = U Out[bb.predecessors][color]
+///    insert reachableUses[bb][color] in each in[bb][color]
+///     op.reachedUses
+///
+///    Out[bb] = Gen[bb] U (In[bb] - Kill[bb])
+static void reachingDefAlgorithm(MachineFunction &MF,
+                                 InstrToInstrs *ColorOpToReachedUses,
+                                 BlockToSetOfInstrsPerColor &In,
+                                 BlockToSetOfInstrsPerColor &Out,
+                                 BlockToInstrPerColor &Gen, BlockToRegSet &Kill,
+                                 BlockToSetOfInstrsPerColor &ReachableUses,
+                                 unsigned NbReg) {
+  bool HasChanged;
+  do {
+    HasChanged = false;
+    for (MachineBasicBlock &MBB : MF) {
+      unsigned CurReg;
+      for (CurReg = 0; CurReg < NbReg; ++CurReg) {
+        SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg);
+        SetOfMachineInstr &BBReachableUses =
+            getSet(ReachableUses, MBB, CurReg, NbReg);
+        SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg);
+        unsigned Size = BBOutSet.size();
+        // In[bb][color] = U Out[bb.predecessors][color]
+        for (MachineBasicBlock *PredMBB : MBB.predecessors()) {
+          SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg);
+          BBInSet.insert(PredOutSet.begin(), PredOutSet.end());
+        }
+        // Insert reachableUses[bb][color] in each in[bb][color] op.reachedUses.
+        for (const MachineInstr *MI : BBInSet) {
+          SetOfMachineInstr &OpReachedUses =
+              getUses(ColorOpToReachedUses, CurReg, *MI);
+          OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end());
+        }
+        // Out[bb] = Gen[bb] U (In[bb] - Kill[bb])
+        if (!Kill[&MBB].test(CurReg))
+          BBOutSet.insert(BBInSet.begin(), BBInSet.end());
+        if (Gen[&MBB][CurReg])
+          BBOutSet.insert(Gen[&MBB][CurReg]);
+        HasChanged |= BBOutSet.size() != Size;
+      }
+    }
+  } while (HasChanged);
+}
+
+/// Release all memory dynamically allocated during the reaching
+/// definition algorithm.
+static void finitReachingDef(BlockToSetOfInstrsPerColor &In,
+                             BlockToSetOfInstrsPerColor &Out,
+                             BlockToInstrPerColor &Gen,
+                             BlockToSetOfInstrsPerColor &ReachableUses) {
+  for (auto &IT : Out)
+    delete[] IT.second;
+  for (auto &IT : In)
+    delete[] IT.second;
+  for (auto &IT : ReachableUses)
+    delete[] IT.second;
+  for (auto &IT : Gen)
+    delete[] IT.second;
+}
+
+/// Reaching definition algorithm.
+/// \param MF function on which the algorithm will operate.
+/// \param[out] ColorOpToReachedUses will contain the result of the reaching
+/// def algorithm.
+/// \param ADRPMode specifies whether the reaching def algorithm should be
+/// tuned for ADRP optimization. \see initReachingDef for more details.
+/// \param DummyOp if not NULL, the algorithm will work at
+/// basic block scope and will set for every exposed definition a use to
+/// @p DummyOp.
+/// \pre ColorOpToReachedUses is an array of at least number of registers of
+/// InstrToInstrs.
+static void reachingDef(MachineFunction &MF,
+                        InstrToInstrs *ColorOpToReachedUses,
+                        const MapRegToId &RegToId, bool ADRPMode = false,
+                        const MachineInstr *DummyOp = nullptr) {
+  // Structures:
+  // For each basic block:
+  // Out: a set per color of definitions that reach the
+  //      out boundary of this block.
+  // In: same as Out but for the in boundary.
+  // Gen: color generated in this block (one operation per color).
+  // Kill: register set of killed colors in this block.
+  // ReachableUses: a set per color of uses (operations) reachable
+  //                by the "In" definitions.
+  BlockToSetOfInstrsPerColor Out, In, ReachableUses;
+  BlockToInstrPerColor Gen;
+  BlockToRegSet Kill;
+
+  // Initialize Gen, Kill, and ReachableUses.
+  initReachingDef(MF, ColorOpToReachedUses, Gen, Kill, ReachableUses, RegToId,
+                  DummyOp, ADRPMode);
+
+  // Algo.
+  if (!DummyOp)
+    reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill,
+                         ReachableUses, RegToId.size());
+
+  // finit.
+  finitReachingDef(In, Out, Gen, ReachableUses);
+}
+
+#ifndef NDEBUG
+/// Print the result of the reaching definition algorithm.
+static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses,
+                             unsigned NbReg, const TargetRegisterInfo *TRI,
+                             const MapIdToReg &IdToReg) {
+  unsigned CurReg;
+  for (CurReg = 0; CurReg < NbReg; ++CurReg) {
+    if (ColorOpToReachedUses[CurReg].empty())
+      continue;
+    DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n");
+
+    for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) {
+      DEBUG(dbgs() << "Def:\n");
+      DEBUG(DefsIt.first->print(dbgs()));
+      DEBUG(dbgs() << "Reachable uses:\n");
+      for (const MachineInstr *MI : DefsIt.second) {
+        DEBUG(MI->print(dbgs()));
+      }
+    }
+  }
+}
+#endif // NDEBUG
+
+/// Answer the following question: can Def be one of the definitions
+/// involved in a part of a LOH?
+static bool canDefBePartOfLOH(const MachineInstr *Def) {
+  unsigned Opc = Def->getOpcode();
+  // Accept ADRP, ADDLow and LOADGot.
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::ADRP:
+    return true;
+  case AArch64::ADDXri:
+    // Check the immediate to see whether it is an address.
+    switch (Def->getOperand(2).getType()) {
+    default:
+      return false;
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_JumpTableIndex:
+    case MachineOperand::MO_ConstantPoolIndex:
+    case MachineOperand::MO_BlockAddress:
+      return true;
+    }
+  case AArch64::LDRXui:
+    // Check the immediate to see whether it is an address.
+    switch (Def->getOperand(2).getType()) {
+    default:
+      return false;
+    case MachineOperand::MO_GlobalAddress:
+      return true;
+    }
+  }
+  // Unreachable.
+  return false;
+}
+
+/// Check whether the given instruction can be the end of a LOH chain
+/// involving a store.
+static bool isCandidateStore(const MachineInstr *Instr) {
+  switch (Instr->getOpcode()) {
+  default:
+    return false;
+  case AArch64::STRBui:
+  case AArch64::STRHui:
+  case AArch64::STRWui:
+  case AArch64::STRXui:
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STRQui:
+    // In case we have str xA, [xA, #imm], this is two different uses
+    // of xA and we cannot fold, otherwise the xA stored may be wrong,
+    // even if #imm == 0.
+    if (Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg())
+      return true;
+  }
+  return false;
+}
+
+/// Given the result of a reaching definition algorithm in
+/// ColorOpToReachedUses, build the use-to-defs information and filter out
+/// obvious non-LOH candidates.
+/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions.
+/// In non-ADRPMode, non-LOH candidates are "uses" with several definitions,
+/// i.e., no simple chain.
+/// \param ADRPMode -- \see initReachingDef.
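+/// \param RegToId maps each involved register to its dense id, as built by
+/// collectInvolvedReg.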
+static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs,
+                              const InstrToInstrs *ColorOpToReachedUses,
+                              const MapRegToId &RegToId,
+                              bool ADRPMode = false) {
+
+  SetOfMachineInstr NotCandidate;
+  unsigned NbReg = RegToId.size();
+  MapRegToId::const_iterator EndIt = RegToId.end();
+  for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) {
+    // If this color is never defined, continue.
+    if (ColorOpToReachedUses[CurReg].empty())
+      continue;
+
+    for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) {
+      for (const MachineInstr *MI : DefsIt.second) {
+        const MachineInstr *Def = DefsIt.first;
+        MapRegToId::const_iterator It;
+        // If the reaching def is not an ADRP, this use will not be
+        // simplifiable.
+        if ((ADRPMode && Def->getOpcode() != AArch64::ADRP) ||
+            (!ADRPMode && !canDefBePartOfLOH(Def)) ||
+            (!ADRPMode && isCandidateStore(MI) &&
+             // Stores are LOH candidates iff the end of the chain is used as
+             // the base.
+             ((It = RegToId.find((MI)->getOperand(1).getReg())) == EndIt ||
+              It->second != CurReg))) {
+          NotCandidate.insert(MI);
+          continue;
+        }
+        // Do not consider self-reaching as a simplifiable case for ADRP.
+        if (!ADRPMode || MI != DefsIt.first) {
+          UseToReachingDefs[MI].insert(DefsIt.first);
+          // If UsesIt has several reaching definitions, it is not a
+          // candidate for simplification in non-ADRPMode.
+          if (!ADRPMode && UseToReachingDefs[MI].size() > 1)
+            NotCandidate.insert(MI);
+        }
+      }
+    }
+  }
+  for (const MachineInstr *Elem : NotCandidate) {
+    DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n");
+    // It would have been better if we could just remove the entry
+    // from the map. Because of that, we have to filter the garbage
+    // (second.empty) in the subsequent analyses.
+    UseToReachingDefs[Elem].clear();
+  }
+}
+
+/// Based on the use-to-defs information (in ADRPMode), compute the
+/// ADRP-related LOH opportunities.
+static void computeADRP(const InstrToInstrs &UseToDefs,
+                        AArch64FunctionInfo &AArch64FI,
+                        const MachineDominatorTree *MDT) {
+  DEBUG(dbgs() << "*** Compute LOH for ADRP\n");
+  for (const auto &Entry : UseToDefs) {
+    unsigned Size = Entry.second.size();
+    if (Size == 0)
+      continue;
+    if (Size == 1) {
+      const MachineInstr *L2 = *Entry.second.begin();
+      const MachineInstr *L1 = Entry.first;
+      if (!MDT->dominates(L2, L1)) {
+        DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1
+                     << '\n');
+        continue;
+      }
+      DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n');
+      SmallVector<const MachineInstr *, 2> Args;
+      Args.push_back(L2);
+      Args.push_back(L1);
+      AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, Args);
+      ++NumADRPSimpleCandidate;
+    }
+#ifdef DEBUG
+    else if (Size == 2)
+      ++NumADRPComplexCandidate2;
+    else if (Size == 3)
+      ++NumADRPComplexCandidate3;
+    else
+      ++NumADRPComplexCandidateOther;
+#endif
+    // If Size < 1, the use should have been removed from the candidates.
+    assert(Size >= 1 && "No reaching defs for that use!");
+  }
+}
+
+/// Check whether the given instruction can be the end of a LOH chain
+/// involving a load.
+static bool isCandidateLoad(const MachineInstr *Instr) {
+  switch (Instr->getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDRSBWui:
+  case AArch64::LDRSBXui:
+  case AArch64::LDRSHWui:
+  case AArch64::LDRSHXui:
+  case AArch64::LDRSWui:
+  case AArch64::LDRBui:
+  case AArch64::LDRHui:
+  case AArch64::LDRWui:
+  case AArch64::LDRXui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+    if (Instr->getOperand(2).getTargetFlags() & AArch64II::MO_GOT)
+      return false;
+    return true;
+  }
+  // Unreachable.
+  return false;
+}
+
+/// Check whether the given instruction can load a literal.
+static bool supportLoadFromLiteral(const MachineInstr *Instr) {
+  switch (Instr->getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDRSWui:
+  case AArch64::LDRWui:
+  case AArch64::LDRXui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+    return true;
+  }
+  // Unreachable.
+  return false;
+}
+
+/// Check whether the given instruction is a LOH candidate.
+/// \param UseToDefs is used to check that Instr is at the end of a LOH
+/// supported chain.
+/// \pre UseToDefs contains only one def per use, i.e., obvious non-candidates
+/// have already been filtered out.
+static bool isCandidate(const MachineInstr *Instr,
+                        const InstrToInstrs &UseToDefs,
+                        const MachineDominatorTree *MDT) {
+  if (!isCandidateLoad(Instr) && !isCandidateStore(Instr))
+    return false;
+
+  const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin();
+  if (Def->getOpcode() != AArch64::ADRP) {
+    // At this point, Def is an ADDXri or LDRXui of the right type of
+    // symbol, because we filtered out the uses that were not defined
+    // by these kinds of instructions (+ ADRP).
+
+    // Check if this forms a simple chain: each intermediate node must
+    // dominate the next one.
+    if (!MDT->dominates(Def, Instr))
+      return false;
+    // Move one node up in the simple chain.
+    if (UseToDefs.find(Def) ==
+            UseToDefs.end()
+        // The map may contain garbage that we have to ignore.
+        ||
+        UseToDefs.find(Def)->second.empty())
+      return false;
+    Instr = Def;
+    Def = *UseToDefs.find(Def)->second.begin();
+  }
+  // Check if we reached the top of the simple chain:
+  // - the top is ADRP.
+  // - check the simple chain property: each intermediate node must
+  //   dominate the next one.
+  if (Def->getOpcode() == AArch64::ADRP)
+    return MDT->dominates(Def, Instr);
+  return false;
+}
+
+static bool registerADRCandidate(const MachineInstr &Use,
+                                 const InstrToInstrs &UseToDefs,
+                                 const InstrToInstrs *DefsPerColorToUses,
+                                 AArch64FunctionInfo &AArch64FI,
+                                 SetOfMachineInstr *InvolvedInLOHs,
+                                 const MapRegToId &RegToId) {
+  // Look for opportunities to turn ADRP -> ADD or
+  // ADRP -> LDR GOTPAGEOFF into ADR.
+  // If the ADRP has more than one use, give up.
+  if (Use.getOpcode() != AArch64::ADDXri &&
+      (Use.getOpcode() != AArch64::LDRXui ||
+       !(Use.getOperand(2).getTargetFlags() & AArch64II::MO_GOT)))
+    return false;
+  InstrToInstrs::const_iterator It = UseToDefs.find(&Use);
+  // The map may contain garbage that we need to ignore.
+  if (It == UseToDefs.end() || It->second.empty())
+    return false;
+  const MachineInstr &Def = **It->second.begin();
+  if (Def.getOpcode() != AArch64::ADRP)
+    return false;
+  // Check the number of users of the ADRP.
+  const SetOfMachineInstr *Users =
+      getUses(DefsPerColorToUses,
+              RegToId.find(Def.getOperand(0).getReg())->second, Def);
+  if (Users->size() > 1) {
+    ++NumADRComplexCandidate;
+    return false;
+  }
+  ++NumADRSimpleCandidate;
+  assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Def)) &&
+         "ADRP already involved in LOH.");
+  assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Use)) &&
+         "ADD already involved in LOH.");
+  DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n');
+
+  SmallVector<const MachineInstr *, 2> Args;
+  Args.push_back(&Def);
+  Args.push_back(&Use);
+
+  AArch64FI.addLOHDirective(Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd
+                                                               : MCLOH_AdrpLdrGot,
+                            Args);
+  return true;
+}
+
+/// Based on the use-to-defs information (in non-ADRPMode), compute the
+/// non-ADRP-related LOH opportunities.
+static void computeOthers(const InstrToInstrs &UseToDefs,
+                          const InstrToInstrs *DefsPerColorToUses,
+                          AArch64FunctionInfo &AArch64FI,
+                          const MapRegToId &RegToId,
+                          const MachineDominatorTree *MDT) {
+  SetOfMachineInstr *InvolvedInLOHs = nullptr;
+#ifdef DEBUG
+  SetOfMachineInstr InvolvedInLOHsStorage;
+  InvolvedInLOHs = &InvolvedInLOHsStorage;
+#endif // DEBUG
+  DEBUG(dbgs() << "*** Compute LOH for Others\n");
+  // Look for the ADRP -> ADD/LDR -> LDR/STR pattern.
+  // Fall back to the ADRP -> ADD pattern if we fail to catch the bigger one.
+
+  // FIXME: When the statistics are not important,
+  // this initial filtering loop can be merged into the next loop.
+  // Currently, we did not do it to keep the same code for both DEBUG and
+  // NDEBUG builds. Indeed, the iterator of the second loop would need
+  // to be changed.
+  SetOfMachineInstr PotentialCandidates;
+  SetOfMachineInstr PotentialADROpportunities;
+  for (auto &Use : UseToDefs) {
+    // If no definition is available, this is not a candidate.
+    if (Use.second.empty())
+      continue;
+    // Keep only instructions that are a load or store and at the end of
+    // an ADRP -> ADD/LDR/Nothing chain.
+    // We already filtered out the no-chain cases.
+    if (!isCandidate(Use.first, UseToDefs, MDT)) {
+      PotentialADROpportunities.insert(Use.first);
+      continue;
+    }
+    PotentialCandidates.insert(Use.first);
+  }
+
+  // Make the following distinctions for statistics, as the linker does
+  // know how to decode instructions:
+  // - ADD/LDR/Nothing make three different patterns.
+  // - LDR/STR make two different patterns.
+  // Hence, 6 - 1 base patterns
+  // (because ADRP -> Nothing -> STR is not simplifiable).
+
+  // The linker is only able to apply a simple semantic, i.e., if pattern A,
+  // then do B.
+  // However, we want to see the opportunities we may miss if we were able to
+  // catch more complex cases.
+
+  // PotentialCandidates are the results of a chain ADRP -> ADD/LDR ->
+  // LDR/STR. A potential candidate becomes a candidate if its current
+  // immediate operand is zero and each node of the chain has only one user.
+#ifdef DEBUG
+  SetOfMachineInstr DefsOfPotentialCandidates;
+#endif
+  for (const MachineInstr *Candidate : PotentialCandidates) {
+    // Get the definition of the candidate, i.e., ADD or LDR.
+    const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin();
+    // Record the elements of the chain.
+    const MachineInstr *L1 = Def;
+    const MachineInstr *L2 = nullptr;
+    unsigned ImmediateDefOpc = Def->getOpcode();
+    if (Def->getOpcode() != AArch64::ADRP) {
+      // Check the number of users of this node.
+      const SetOfMachineInstr *Users =
+          getUses(DefsPerColorToUses,
+                  RegToId.find(Def->getOperand(0).getReg())->second, *Def);
+      if (Users->size() > 1) {
+#ifdef DEBUG
+        // If all the uses of this def are in potential candidates, this is
+        // a complex candidate of level 2.
+        bool IsLevel2 = true;
+        for (const MachineInstr *MI : *Users) {
+          if (!PotentialCandidates.count(MI)) {
+            ++NumTooCplxLvl2;
+            IsLevel2 = false;
+            break;
+          }
+        }
+        if (IsLevel2)
+          ++NumCplxLvl2;
+#endif // DEBUG
+        PotentialADROpportunities.insert(Def);
+        continue;
+      }
+      L2 = Def;
+      Def = *UseToDefs.find(Def)->second.begin();
+      L1 = Def;
+    } // else the element in the middle of the chain is nothing, thus
+      // Def already contains the first element of the chain.
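+      // In that case, L2 stays null and the candidate forms a
+      // two-instruction chain (e.g., AdrpLdr).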
+
+    // Check the number of users of the first node in the chain, i.e., ADRP.
+    const SetOfMachineInstr *Users =
+        getUses(DefsPerColorToUses,
+                RegToId.find(Def->getOperand(0).getReg())->second, *Def);
+    if (Users->size() > 1) {
+#ifdef DEBUG
+      // If all the uses of this def are in the defs of the potential
+      // candidates, this is a complex candidate of level 1.
+      if (DefsOfPotentialCandidates.empty()) {
+        // Lazy init.
+        DefsOfPotentialCandidates = PotentialCandidates;
+        for (const MachineInstr *Candidate : PotentialCandidates) {
+          if (!UseToDefs.find(Candidate)->second.empty())
+            DefsOfPotentialCandidates.insert(
+                *UseToDefs.find(Candidate)->second.begin());
+        }
+      }
+      bool Found = false;
+      for (auto &Use : *Users) {
+        if (!DefsOfPotentialCandidates.count(Use)) {
+          ++NumTooCplxLvl1;
+          Found = true;
+          break;
+        }
+      }
+      if (!Found)
+        ++NumCplxLvl1;
+#endif // DEBUG
+      continue;
+    }
+
+    bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri);
+    // If the chain is three instructions long and ldr is the second element,
+    // then this ldr must load from the GOT, otherwise this is not a correct
+    // chain.
+    if (L2 && !IsL2Add &&
+        L2->getOperand(2).getTargetFlags() != AArch64II::MO_GOT)
+      continue;
+    SmallVector<const MachineInstr *, 3> Args;
+    MCLOHType Kind;
+    if (isCandidateLoad(Candidate)) {
+      if (!L2) {
+        // At this point, the candidate LOH indicates that the ldr instruction
+        // may use a direct access to the symbol. There is no such encoding
+        // for byte and half-word loads.
+        if (!supportLoadFromLiteral(Candidate))
+          continue;
+
+        DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate
+                     << '\n');
+        Kind = MCLOH_AdrpLdr;
+        Args.push_back(L1);
+        Args.push_back(Candidate);
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+               "L1 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+               "Candidate already involved in LOH.");
+        ++NumADRPToLDR;
+      } else {
+        DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot")
+                     << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate
+                     << '\n');
+
+        Kind = IsL2Add ? MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr;
+        Args.push_back(L1);
+        Args.push_back(L2);
+        Args.push_back(Candidate);
+
+        PotentialADROpportunities.remove(L2);
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+               "L1 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) &&
+               "L2 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+               "Candidate already involved in LOH.");
+#ifdef DEBUG
+        // Get the immediate of the load.
+        if (Candidate->getOperand(2).getImm() == 0)
+          if (ImmediateDefOpc == AArch64::ADDXri)
+            ++NumADDToLDR;
+          else
+            ++NumLDRToLDR;
+        else if (ImmediateDefOpc == AArch64::ADDXri)
+          ++NumADDToLDRWithImm;
+        else
+          ++NumLDRToLDRWithImm;
+#endif // DEBUG
+      }
+    } else {
+      if (ImmediateDefOpc == AArch64::ADRP)
+        continue;
+      else {
+
+        DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot")
+                     << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate
+                     << '\n');
+
+        Kind = IsL2Add ? MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr;
+        Args.push_back(L1);
+        Args.push_back(L2);
+        Args.push_back(Candidate);
+
+        PotentialADROpportunities.remove(L2);
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) &&
+               "L1 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) &&
+               "L2 already involved in LOH.");
+        assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) &&
+               "Candidate already involved in LOH.");
+#ifdef DEBUG
+        // Get the immediate of the store.
+        if (Candidate->getOperand(2).getImm() == 0)
+          if (ImmediateDefOpc == AArch64::ADDXri)
+            ++NumADDToSTR;
+          else
+            ++NumLDRToSTR;
+        else if (ImmediateDefOpc == AArch64::ADDXri)
+          ++NumADDToSTRWithImm;
+        else
+          ++NumLDRToSTRWithImm;
+#endif // DEBUG
+      }
+    }
+    AArch64FI.addLOHDirective(Kind, Args);
+  }
+
+  // Now that we grabbed all the big patterns, check ADR opportunities.
+  for (const MachineInstr *Candidate : PotentialADROpportunities)
+    registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, AArch64FI,
+                         InvolvedInLOHs, RegToId);
+}
+
+/// Look for every register defined by potential LOH candidates.
+/// Map these registers to dense ids in @p RegToId and vice versa in
+/// @p IdToReg. @p IdToReg is populated only in DEBUG mode.
+static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId,
+                               MapIdToReg &IdToReg,
+                               const TargetRegisterInfo *TRI) {
+  unsigned CurRegId = 0;
+  if (!PreCollectRegister) {
+    unsigned NbReg = TRI->getNumRegs();
+    for (; CurRegId < NbReg; ++CurRegId) {
+      RegToId[CurRegId] = CurRegId;
+      DEBUG(IdToReg.push_back(CurRegId));
+      DEBUG(assert(IdToReg[CurRegId] == CurRegId && "Reg index mismatches"));
+    }
+    return;
+  }
+
+  DEBUG(dbgs() << "** Collect Involved Register\n");
+  for (const auto &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      if (!canDefBePartOfLOH(&MI))
+        continue;
+
+      // Process defs.
+      for (MachineInstr::const_mop_iterator IO = MI.operands_begin(),
+                                            IOEnd = MI.operands_end();
+           IO != IOEnd; ++IO) {
+        if (!IO->isReg() || !IO->isDef())
+          continue;
+        unsigned CurReg = IO->getReg();
+        for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI)
+          if (RegToId.find(*AI) == RegToId.end()) {
+            DEBUG(IdToReg.push_back(*AI);
+                  assert(IdToReg[CurRegId] == *AI &&
+                         "Reg index mismatches insertion index."));
+            RegToId[*AI] = CurRegId++;
+            DEBUG(dbgs() << "Register: " << PrintReg(*AI, TRI) << '\n');
+          }
+      }
+    }
+  }
+}
+
+bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
+  const TargetMachine &TM = MF.getTarget();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+
+  MapRegToId RegToId;
+  MapIdToReg IdToReg;
+  AArch64FunctionInfo *AArch64FI = MF.getInfo<AArch64FunctionInfo>();
+  assert(AArch64FI && "No MachineFunctionInfo for this function!");
+
+  DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n');
+
+  collectInvolvedReg(MF, RegToId, IdToReg, TRI);
+  if (RegToId.empty())
+    return false;
+
+  MachineInstr *DummyOp = nullptr;
+  if (BasicBlockScopeOnly) {
+    const AArch64InstrInfo *TII =
+        static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+    // For local analysis, create a dummy operation to record uses that are not
+    // local.
+    DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc());
+  }
+
+  unsigned NbReg = RegToId.size();
+  bool Modified = false;
+
+  // Start with ADRP.
+  InstrToInstrs *ColorOpToReachedUses = new InstrToInstrs[NbReg];
+
+  // Compute the reaching defs in ADRP mode, meaning ADRP definitions
+  // are first considered as uses.
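+  // An ADRP definition reached by another ADRP definition of the same
+  // register is exactly the AdrpAdrp pattern described in the file header.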
+  reachingDef(MF, ColorOpToReachedUses, RegToId, true, DummyOp);
+  DEBUG(dbgs() << "ADRP reaching defs\n");
+  DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg));
+
+  // Translate the definition-to-uses map into a use-to-definitions map to
+  // ease statistic computation.
+  InstrToInstrs ADRPToReachingDefs;
+  reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true);
+
+  // Compute LOH for ADRP.
+  computeADRP(ADRPToReachingDefs, *AArch64FI, MDT);
+  delete[] ColorOpToReachedUses;
+
+  // Continue with the general ADRP -> ADD/LDR -> LDR/STR pattern.
+  ColorOpToReachedUses = new InstrToInstrs[NbReg];
+
+  // First perform a regular reaching def analysis.
+  reachingDef(MF, ColorOpToReachedUses, RegToId, false, DummyOp);
+  DEBUG(dbgs() << "All reaching defs\n");
+  DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg));
+
+  // Turn that into a use-to-defs map to ease statistic computation.
+  InstrToInstrs UsesToReachingDefs;
+  reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false);
+
+  // Compute the LOHs other than AdrpAdrp.
+  computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *AArch64FI, RegToId,
+                MDT);
+  delete[] ColorOpToReachedUses;
+
+  if (BasicBlockScopeOnly)
+    MF.DeleteMachineInstr(DummyOp);
+
+  return Modified;
+}
+
+/// createAArch64CollectLOHPass - returns an instance of the Linker
+/// Optimization Hint collection pass.
+FunctionPass *llvm::createAArch64CollectLOHPass() {
+  return new AArch64CollectLOH();
+}
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
new file mode 100644
index 0000000..452cdec
--- /dev/null
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -0,0 +1,919 @@
+//===-- AArch64ConditionalCompares.cpp --- CCMP formation for AArch64 -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64ConditionalCompares pass which reduces
+// branching and code size by using the conditional compare instructions CCMP,
+// CCMN, and FCCMP.
+//
+// The CFG transformations for forming conditional compares are very similar to
+// if-conversion, and this pass should run immediately before the early
+// if-conversion pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ccmp"
+
+// Absolute maximum number of instructions allowed per speculated block.
+// This bypasses all other heuristics, so it should be set fairly high.
+static cl::opt<unsigned> BlockInstrLimit(
+    "aarch64-ccmp-limit", cl::init(30), cl::Hidden,
+    cl::desc("Maximum number of instructions per speculated block."));
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("aarch64-stress-ccmp", cl::Hidden,
+                            cl::desc("Turn all knobs to 11"));
+
+STATISTIC(NumConsidered, "Number of ccmps considered");
+STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)");
+STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)");
+STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)");
+STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)");
+STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)");
+STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)");
+STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)");
+STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)");
+STATISTIC(NumMultNZCVUses, "Number of ccmps rejected (NZCV used)");
+STATISTIC(NumUnknNZCVDefs, "Number of ccmps rejected (NZCV def unknown)");
+
+STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)");
+
+STATISTIC(NumConverted, "Number of ccmp instructions created");
+STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted");
+
+//===----------------------------------------------------------------------===//
+//                                 SSACCmpConv
+//===----------------------------------------------------------------------===//
+//
+// The SSACCmpConv class performs ccmp-conversion on SSA form machine code
+// after determining if it is possible. The class contains no heuristics;
+// external code should be used to determine when ccmp-conversion is a good
+// idea.
+//
+// CCmp-formation works on a CFG representing chained conditions, typically
+// from C's short-circuit || and && operators:
+//
+//   From:                To:
+//   Head                 Head
+//    / |                 CmpBB
+//   /  |                 / |
+//  |  CmpBB             /  |
+//  |  / |            Tail  |
+//  | /  |              |   |
+//  Tail |              |   |
+//   |   |              |   |
+//  ...  ...           ... ...
+//
+// The Head block is terminated by a br.cond instruction, and the CmpBB block
+// contains compare + br.cond. Tail must be a successor of both.
+//
+// The cmp-conversion turns the compare instruction in CmpBB into a conditional
+// compare, and merges CmpBB into Head, speculatively executing its
+// instructions. The AArch64 conditional compare instructions have an immediate
+// operand that specifies the NZCV flag values when the condition is false and
+// the compare isn't executed. This makes it possible to chain compares with
+// different condition codes.
+//
+// Example:
+//
+//    if (a == 5 || b == 17)
+//      foo();
+//
+//  Head:
+//     cmp w0, #5
+//     b.eq Tail
+//  CmpBB:
+//     cmp w1, #17
+//     b.eq Tail
+//  ...
+//  Tail:
+//    bl _foo
+//
+//  Becomes:
+//
+//  Head:
+//     cmp w0, #5
+//     ccmp w1, #17, 4, ne  ; 4 = nZcv
+//     b.eq Tail
+//  ...
+//  Tail:
+//    bl _foo
+//
+// The ccmp condition code is the one that would cause the Head terminator to
+// branch to CmpBB.
+//
+// FIXME: It should also be possible to speculate a block on the critical edge
+// between Head and Tail, just like if-converting a diamond.
+//
+// FIXME: Handle PHIs in Tail by turning them into selects (if-conversion).
+
+namespace {
+class SSACCmpConv {
+  MachineFunction *MF;
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+
+public:
+  /// The first block containing a conditional branch, dominating everything
+  /// else.
+  MachineBasicBlock *Head;
+
+  /// The block containing cmp+br.cond with a successor shared with Head.
+  MachineBasicBlock *CmpBB;
+
+  /// The common successor for Head and CmpBB.
+  MachineBasicBlock *Tail;
+
+  /// The compare instruction in CmpBB that can be converted to a ccmp.
+  MachineInstr *CmpMI;
+
+private:
+  /// The branch condition in Head as determined by AnalyzeBranch.
+  SmallVector<MachineOperand, 4> HeadCond;
+
+  /// The condition code that makes Head branch to CmpBB.
+  AArch64CC::CondCode HeadCmpBBCC;
+
+  /// The branch condition in CmpBB.
+  SmallVector<MachineOperand, 4> CmpBBCond;
+
+  /// The condition code that makes CmpBB branch to Tail.
+  AArch64CC::CondCode CmpBBTailCC;
+
+  /// Check if the Tail PHIs are trivially convertible.
+  bool trivialTailPHIs();
+
+  /// Remove CmpBB from the Tail PHIs.
+  void updateTailPHIs();
+
+  /// Check if an operand defining DstReg is dead.
+  bool isDeadDef(unsigned DstReg);
+
+  /// Find the compare instruction in MBB that controls the conditional branch.
+  /// Return NULL if a convertible instruction can't be found.
+  MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB);
+
+  /// Return true if all non-terminator instructions in MBB can be safely
+  /// speculated.
+  bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI);
+
+public:
+  /// runOnMachineFunction - Initialize per-function data structures.
+  void runOnMachineFunction(MachineFunction &MF) {
+    this->MF = &MF;
+    TII = MF.getTarget().getInstrInfo();
+    TRI = MF.getTarget().getRegisterInfo();
+    MRI = &MF.getRegInfo();
+  }
+
+  /// If the sub-CFG headed by MBB can be cmp-converted, initialize the
+  /// internal state, and return true.
+  bool canConvert(MachineBasicBlock *MBB);
+
+  /// Cmp-convert the last block passed to canConvert(), assuming
+  /// it is possible. Add any erased blocks to RemovedBlocks.
+  void convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks);
+
+  /// Return the expected code size delta if the conversion into a
+  /// conditional compare is performed.
+  int expectedCodeSizeDelta() const;
+};
+} // end anonymous namespace
+
+// Check that all PHIs in Tail are selecting the same value from Head and
+// CmpBB. This means that no if-conversion is required when merging CmpBB into
+// Head.
+bool SSACCmpConv::trivialTailPHIs() {
+  for (auto &I : *Tail) {
+    if (!I.isPHI())
+      break;
+    unsigned HeadReg = 0, CmpBBReg = 0;
+    // PHI operands come in (VReg, MBB) pairs.
+    for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) {
+      MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB();
+      unsigned Reg = I.getOperand(oi).getReg();
+      if (MBB == Head) {
+        assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands");
+        HeadReg = Reg;
+      }
+      if (MBB == CmpBB) {
+        assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands");
+        CmpBBReg = Reg;
+      }
+    }
+    if (HeadReg != CmpBBReg)
+      return false;
+  }
+  return true;
+}
+
+// Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply
+// removing the CmpBB operands. The Head operands will be identical.
+void SSACCmpConv::updateTailPHIs() {
+  for (auto &I : *Tail) {
+    if (!I.isPHI())
+      break;
+    // I is a PHI. It can have multiple entries for CmpBB.
+    for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) {
+      // PHI operands are (Reg, MBB) at (oi-2, oi-1).
+      if (I.getOperand(oi - 1).getMBB() == CmpBB) {
+        I.RemoveOperand(oi - 1);
+        I.RemoveOperand(oi - 2);
+      }
+    }
+  }
+}
+
+// This pass runs before the AArch64DeadRegisterDefinitions pass, so compares
+// are still writing virtual registers without any uses.
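+// Hence an empty use list, rather than a dead flag, is what identifies a dead
+// definition at this point.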
+bool SSACCmpConv::isDeadDef(unsigned DstReg) {
+  // Writes to the zero register are dead.
+  if (DstReg == AArch64::WZR || DstReg == AArch64::XZR)
+    return true;
+  if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+    return false;
+  // A virtual register def without any uses will be marked dead later, and
+  // eventually replaced by the zero register.
+  return MRI->use_nodbg_empty(DstReg);
+}
+
+// Parse a condition code returned by AnalyzeBranch, and compute the CondCode
+// corresponding to TBB.
+// Return true if the condition could be parsed.
+static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
+  // A normal br.cond simply has the condition code.
+  if (Cond[0].getImm() != -1) {
+    assert(Cond.size() == 1 && "Unknown Cond array format");
+    CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+    return true;
+  }
+  // For tbz and cbz instructions, the opcode is next.
+  switch (Cond[1].getImm()) {
+  default:
+    // This includes tbz / tbnz branches which can't be converted to
+    // ccmp + br.cond.
+    return false;
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+    assert(Cond.size() == 3 && "Unknown Cond array format");
+    CC = AArch64CC::EQ;
+    return true;
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+    assert(Cond.size() == 3 && "Unknown Cond array format");
+    CC = AArch64CC::NE;
+    return true;
+  }
+}
+
+MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
+  MachineBasicBlock::iterator I = MBB->getFirstTerminator();
+  if (I == MBB->end())
+    return nullptr;
+  // The terminator must be controlled by the flags.
+  if (!I->readsRegister(AArch64::NZCV)) {
+    switch (I->getOpcode()) {
+    case AArch64::CBZW:
+    case AArch64::CBZX:
+    case AArch64::CBNZW:
+    case AArch64::CBNZX:
+      // These can be converted into a ccmp against #0.
+      return I;
+    }
+    ++NumCmpTermRejs;
+    DEBUG(dbgs() << "Flags not used by terminator: " << *I);
+    return nullptr;
+  }
+
+  // Now find the instruction controlling the terminator.
+  for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
+    --I;
+    assert(!I->isTerminator() && "Spurious terminator");
+    switch (I->getOpcode()) {
+    // cmp is an alias for subs with a dead destination register.
+    case AArch64::SUBSWri:
+    case AArch64::SUBSXri:
+    // cmn is an alias for adds with a dead destination register.
+    case AArch64::ADDSWri:
+    case AArch64::ADDSXri:
+      // Check that the immediate operand is within range, ccmp wants a uimm5.
+      // Rd = SUBSri Rn, imm, shift
+      if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) {
+        DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I);
+        ++NumImmRangeRejs;
+        return nullptr;
+      }
+      // Fall through.
+    case AArch64::SUBSWrr:
+    case AArch64::SUBSXrr:
+    case AArch64::ADDSWrr:
+    case AArch64::ADDSXrr:
+      if (isDeadDef(I->getOperand(0).getReg()))
+        return I;
+      DEBUG(dbgs() << "Can't convert compare with live destination: " << *I);
+      ++NumLiveDstRejs;
+      return nullptr;
+    case AArch64::FCMPSrr:
+    case AArch64::FCMPDrr:
+    case AArch64::FCMPESrr:
+    case AArch64::FCMPEDrr:
+      return I;
+    }
+
+    // Check for flag reads and clobbers.
+    MIOperands::PhysRegInfo PRI =
+        MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI);
+
+    if (PRI.Reads) {
+      // The ccmp doesn't produce exactly the same flags as the original
+      // compare, so reject the transform if there are uses of the flags
+      // besides the terminators.
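+      // The terminators themselves are fine: the NZCV immediate on the new
+      // ccmp is chosen in convert() so that they still see the flags they
+      // expect.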
+ DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I); + ++NumMultNZCVUses; + return nullptr; + } + + if (PRI.Clobbers) { + DEBUG(dbgs() << "Not convertible compare: " << *I); + ++NumUnknNZCVDefs; + return nullptr; + } + } + DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n'); + return nullptr; +} + +/// Determine if all the instructions in MBB can safely +/// be speculated. The terminators are not considered. +/// +/// Only CmpMI is allowed to clobber the flags. +/// +bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB, + const MachineInstr *CmpMI) { + // Reject any live-in physregs. It's probably NZCV/EFLAGS, and very hard to + // get right. + if (!MBB->livein_empty()) { + DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n"); + return false; + } + + unsigned InstrCount = 0; + + // Check all instructions, except the terminators. It is assumed that + // terminators never have side effects or define any used register values. + for (auto &I : make_range(MBB->begin(), MBB->getFirstTerminator())) { + if (I.isDebugValue()) + continue; + + if (++InstrCount > BlockInstrLimit && !Stress) { + DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than " + << BlockInstrLimit << " instructions.\n"); + return false; + } + + // There shouldn't normally be any phis in a single-predecessor block. + if (I.isPHI()) { + DEBUG(dbgs() << "Can't hoist: " << I); + return false; + } + + // Don't speculate loads. Note that it may be possible and desirable to + // speculate GOT or constant pool loads that are guaranteed not to trap, + // but we don't support that for now. + if (I.mayLoad()) { + DEBUG(dbgs() << "Won't speculate load: " << I); + return false; + } + + // We never speculate stores, so an AA pointer isn't necessary. + bool DontMoveAcrossStore = true; + if (!I.isSafeToMove(TII, nullptr, DontMoveAcrossStore)) { + DEBUG(dbgs() << "Can't speculate: " << I); + return false; + } + + // Only CmpMI is allowed to clobber the flags. + if (&I != CmpMI && I.modifiesRegister(AArch64::NZCV, TRI)) { + DEBUG(dbgs() << "Clobbers flags: " << I); + return false; + } + } + return true; +} + +/// Analyze the sub-cfg rooted in MBB, and return true if it is a potential +/// candidate for cmp-conversion. Fill out the internal state. +/// +bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { + Head = MBB; + Tail = CmpBB = nullptr; + + if (Head->succ_size() != 2) + return false; + MachineBasicBlock *Succ0 = Head->succ_begin()[0]; + MachineBasicBlock *Succ1 = Head->succ_begin()[1]; + + // CmpBB can only have a single predecessor. Tail is allowed many. + if (Succ0->pred_size() != 1) + std::swap(Succ0, Succ1); + + // Succ0 is our candidate for CmpBB. + if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2) + return false; + + CmpBB = Succ0; + Tail = Succ1; + + if (!CmpBB->isSuccessor(Tail)) + return false; + + // The CFG topology checks out. + DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() << " -> BB#" + << CmpBB->getNumber() << " -> BB#" << Tail->getNumber() << '\n'); + ++NumConsidered; + + // Tail is allowed to have many predecessors, but we can't handle PHIs yet. + // + // FIXME: Real PHIs could be if-converted as long as the CmpBB values are + // defined before The CmpBB cmp clobbers the flags. Alternatively, it should + // always be safe to sink the ccmp down to immediately before the CmpBB + // terminators. 
+  if (!trivialTailPHIs()) {
+    DEBUG(dbgs() << "Can't handle phis in Tail.\n");
+    ++NumPhiRejs;
+    return false;
+  }
+
+  if (!Tail->livein_empty()) {
+    DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n");
+    ++NumPhysRejs;
+    return false;
+  }
+
+  // CmpBB should never have PHIs since Head is its only predecessor.
+  // FIXME: Clean them up if it happens.
+  if (!CmpBB->empty() && CmpBB->front().isPHI()) {
+    DEBUG(dbgs() << "Can't handle phis in CmpBB.\n");
+    ++NumPhi2Rejs;
+    return false;
+  }
+
+  if (!CmpBB->livein_empty()) {
+    DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n");
+    ++NumPhysRejs;
+    return false;
+  }
+
+  // The branch we're looking to eliminate must be analyzable.
+  HeadCond.clear();
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) {
+    DEBUG(dbgs() << "Head branch not analyzable.\n");
+    ++NumHeadBranchRejs;
+    return false;
+  }
+
+  // This is weird, probably some sort of degenerate CFG, or an edge to a
+  // landing pad.
+  if (!TBB || HeadCond.empty()) {
+    DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n");
+    ++NumHeadBranchRejs;
+    return false;
+  }
+
+  if (!parseCond(HeadCond, HeadCmpBBCC)) {
+    DEBUG(dbgs() << "Unsupported branch type on Head\n");
+    ++NumHeadBranchRejs;
+    return false;
+  }
+
+  // Make sure the branch direction is right.
+  if (TBB != CmpBB) {
+    assert(TBB == Tail && "Unexpected TBB");
+    HeadCmpBBCC = AArch64CC::getInvertedCondCode(HeadCmpBBCC);
+  }
+
+  CmpBBCond.clear();
+  TBB = FBB = nullptr;
+  if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
+    DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
+    ++NumCmpBranchRejs;
+    return false;
+  }
+
+  if (!TBB || CmpBBCond.empty()) {
+    DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n");
+    ++NumCmpBranchRejs;
+    return false;
+  }
+
+  if (!parseCond(CmpBBCond, CmpBBTailCC)) {
+    DEBUG(dbgs() << "Unsupported branch type on CmpBB\n");
+    ++NumCmpBranchRejs;
+    return false;
+  }
+
+  if (TBB != Tail)
+    CmpBBTailCC = AArch64CC::getInvertedCondCode(CmpBBTailCC);
+
+  DEBUG(dbgs() << "Head->CmpBB on " << AArch64CC::getCondCodeName(HeadCmpBBCC)
+               << ", CmpBB->Tail on " << AArch64CC::getCondCodeName(CmpBBTailCC)
+               << '\n');
+
+  CmpMI = findConvertibleCompare(CmpBB);
+  if (!CmpMI)
+    return false;
+
+  if (!canSpeculateInstrs(CmpBB, CmpMI)) {
+    ++NumSpeculateRejs;
+    return false;
+  }
+  return true;
+}
+
+void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
+  DEBUG(dbgs() << "Merging BB#" << CmpBB->getNumber() << " into BB#"
+               << Head->getNumber() << ":\n" << *CmpBB);
+
+  // All CmpBB instructions are moved into Head, and CmpBB is deleted.
+  // Update the CFG first.
+  updateTailPHIs();
+  Head->removeSuccessor(CmpBB);
+  CmpBB->removeSuccessor(Tail);
+  Head->transferSuccessorsAndUpdatePHIs(CmpBB);
+  DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc();
+  TII->RemoveBranch(*Head);
+
+  // If the Head terminator was one of the cbz / tbz branches with built-in
+  // compare, we need to insert an explicit compare instruction in its place.
+  if (HeadCond[0].getImm() == -1) {
+    ++NumCompBranches;
+    unsigned Opc = 0;
+    switch (HeadCond[1].getImm()) {
+    case AArch64::CBZW:
+    case AArch64::CBNZW:
+      Opc = AArch64::SUBSWri;
+      break;
+    case AArch64::CBZX:
+    case AArch64::CBNZX:
+      Opc = AArch64::SUBSXri;
+      break;
+    default:
+      llvm_unreachable("Cannot convert Head branch");
+    }
+    const MCInstrDesc &MCID = TII->get(Opc);
+    // Create a dummy virtual register for the SUBS def.
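+    // The def is immediately dead (SUBS, unlike a plain compare, requires a
+    // destination register), so it is marked RegState::Dead below.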
+    unsigned DestReg =
+        MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF));
+    // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
+    BuildMI(*Head, Head->end(), TermDL, MCID)
+        .addReg(DestReg, RegState::Define | RegState::Dead)
+        .addOperand(HeadCond[2])
+        .addImm(0)
+        .addImm(0);
+    // SUBS uses the GPR*sp register classes.
+    MRI->constrainRegClass(HeadCond[2].getReg(),
+                           TII->getRegClass(MCID, 1, TRI, *MF));
+  }
+
+  Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end());
+
+  // Now replace CmpMI with a ccmp instruction that also considers the incoming
+  // flags.
+  unsigned Opc = 0;
+  unsigned FirstOp = 1;   // First CmpMI operand to copy.
+  bool isZBranch = false; // CmpMI is a cbz/cbnz instruction.
+  switch (CmpMI->getOpcode()) {
+  default:
+    llvm_unreachable("Unknown compare opcode");
+  case AArch64::SUBSWri: Opc = AArch64::CCMPWi; break;
+  case AArch64::SUBSWrr: Opc = AArch64::CCMPWr; break;
+  case AArch64::SUBSXri: Opc = AArch64::CCMPXi; break;
+  case AArch64::SUBSXrr: Opc = AArch64::CCMPXr; break;
+  case AArch64::ADDSWri: Opc = AArch64::CCMNWi; break;
+  case AArch64::ADDSWrr: Opc = AArch64::CCMNWr; break;
+  case AArch64::ADDSXri: Opc = AArch64::CCMNXi; break;
+  case AArch64::ADDSXrr: Opc = AArch64::CCMNXr; break;
+  case AArch64::FCMPSrr: Opc = AArch64::FCCMPSrr; FirstOp = 0; break;
+  case AArch64::FCMPDrr: Opc = AArch64::FCCMPDrr; FirstOp = 0; break;
+  case AArch64::FCMPESrr: Opc = AArch64::FCCMPESrr; FirstOp = 0; break;
+  case AArch64::FCMPEDrr: Opc = AArch64::FCCMPEDrr; FirstOp = 0; break;
+  case AArch64::CBZW:
+  case AArch64::CBNZW:
+    Opc = AArch64::CCMPWi;
+    FirstOp = 0;
+    isZBranch = true;
+    break;
+  case AArch64::CBZX:
+  case AArch64::CBNZX:
+    Opc = AArch64::CCMPXi;
+    FirstOp = 0;
+    isZBranch = true;
+    break;
+  }
+
+  // The ccmp instruction should set the flags according to the comparison when
+  // Head would have branched to CmpBB.
+  // The NZCV immediate operand should provide flags for the case where Head
+  // would have branched to Tail. These flags should cause the new Head
+  // terminator to branch to Tail.
+  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC);
+  const MCInstrDesc &MCID = TII->get(Opc);
+  MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(),
+                         TII->getRegClass(MCID, 0, TRI, *MF));
+  if (CmpMI->getOperand(FirstOp + 1).isReg())
+    MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(),
+                           TII->getRegClass(MCID, 1, TRI, *MF));
+  MachineInstrBuilder MIB =
+      BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
+          .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn
+  if (isZBranch)
+    MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0
+  else
+    MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
+  MIB.addImm(NZCV).addImm(HeadCmpBBCC);
+
+  // If CmpMI was a terminator, we need a new conditional branch to replace it.
+  // This now becomes a Head terminator.
+  if (isZBranch) {
+    bool isNZ = CmpMI->getOpcode() == AArch64::CBNZW ||
+                CmpMI->getOpcode() == AArch64::CBNZX;
+    BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc))
+        .addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ)
+        .addOperand(CmpMI->getOperand(1)); // Branch target.
+ } + CmpMI->eraseFromParent(); + Head->updateTerminator(); + + RemovedBlocks.push_back(CmpBB); + CmpBB->eraseFromParent(); + DEBUG(dbgs() << "Result:\n" << *Head); + ++NumConverted; +} + +int SSACCmpConv::expectedCodeSizeDelta() const { + int delta = 0; + // If the Head terminator was one of the cbz / tbz branches with built-in + // compare, we need to insert an explicit compare instruction in its place + // plus a branch instruction. + if (HeadCond[0].getImm() == -1) { + switch (HeadCond[1].getImm()) { + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + // Therefore delta += 1 + delta = 1; + break; + default: + llvm_unreachable("Cannot convert Head branch"); + } + } + // If the Cmp terminator was one of the cbz / tbz branches with a + // built-in compare, it will be turned into the compare instruction + // merged into Head, so we do not save any instruction. + // Otherwise, we save the branch instruction. + switch (CmpMI->getOpcode()) { + default: + --delta; + break; + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + break; + } + return delta; +} + +//===----------------------------------------------------------------------===// +// AArch64ConditionalCompares Pass +//===----------------------------------------------------------------------===// + +namespace { +class AArch64ConditionalCompares : public MachineFunctionPass { + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + const MCSchedModel *SchedModel; + // Whether the function being processed has the MinSize (Oz) attribute. + bool MinSize; + MachineRegisterInfo *MRI; + MachineDominatorTree *DomTree; + MachineLoopInfo *Loops; + MachineTraceMetrics *Traces; + MachineTraceMetrics::Ensemble *MinInstr; + SSACCmpConv CmpConv; + +public: + static char ID; + AArch64ConditionalCompares() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + const char *getPassName() const override { + return "AArch64 Conditional Compares"; + } + +private: + bool tryConvert(MachineBasicBlock *); + void updateDomTree(ArrayRef<MachineBasicBlock *> Removed); + void updateLoops(ArrayRef<MachineBasicBlock *> Removed); + void invalidateTraces(); + bool shouldConvert(); +}; +} // end anonymous namespace + +char AArch64ConditionalCompares::ID = 0; + +namespace llvm { +void initializeAArch64ConditionalComparesPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", + "AArch64 CCMP Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) +INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", + "AArch64 CCMP Pass", false, false) + +FunctionPass *llvm::createAArch64ConditionalCompares() { + return new AArch64ConditionalCompares(); +} + +void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineBranchProbabilityInfo>(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + AU.addRequired<MachineTraceMetrics>(); + AU.addPreserved<MachineTraceMetrics>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +/// Update the dominator tree after if-conversion erased some blocks. +void AArch64ConditionalCompares::updateDomTree( + ArrayRef<MachineBasicBlock *> Removed) { + // convert() removes CmpBB which was previously dominated by Head. + // CmpBB children should be transferred to Head. 
+ MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head); + for (unsigned i = 0, e = Removed.size(); i != e; ++i) { + MachineDomTreeNode *Node = DomTree->getNode(Removed[i]); + assert(Node != HeadNode && "Cannot erase the head node"); + assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head"); + while (Node->getNumChildren()) + DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); + DomTree->eraseNode(Removed[i]); + } +} + +/// Update LoopInfo after if-conversion. +void +AArch64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) { + if (!Loops) + return; + for (unsigned i = 0, e = Removed.size(); i != e; ++i) + Loops->removeBlock(Removed[i]); +} + +/// Invalidate MachineTraceMetrics before if-conversion. +void AArch64ConditionalCompares::invalidateTraces() { + Traces->invalidate(CmpConv.Head); + Traces->invalidate(CmpConv.CmpBB); +} + +/// Apply cost model and heuristics to the if-conversion in CmpConv. +/// Return true if the conversion is a good idea. +/// +bool AArch64ConditionalCompares::shouldConvert() { + // Stress testing mode disables all cost considerations. + if (Stress) + return true; + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + + // Head dominates CmpBB, so it is always included in its trace. + MachineTraceMetrics::Trace Trace = MinInstr->getTrace(CmpConv.CmpBB); + + // If code size is the main concern, check the expected code size delta. + if (MinSize) { + int CodeSizeDelta = CmpConv.expectedCodeSizeDelta(); + DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n'); + // If we are minimizing the code size, do the conversion whatever + // the cost is. + if (CodeSizeDelta < 0) + return true; + if (CodeSizeDelta > 0) { + DEBUG(dbgs() << "Code size is increasing, give up on this one.\n"); + return false; + } + // CodeSizeDelta == 0, continue with the regular heuristics + } + + // Heuristic: The compare conversion delays the execution of the branch + // instruction because we must wait for the inputs to the second compare as + // well. The branch has no dependent instructions, but delaying it increases + // the cost of a misprediction. + // + // Set a limit on the delay we will accept. + unsigned DelayLimit = SchedModel->MispredictPenalty * 3 / 4; + + // Instruction depths can be computed for all trace instructions above CmpBB. + unsigned HeadDepth = + Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth; + unsigned CmpBBDepth = + Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth; + DEBUG(dbgs() << "Head depth: " << HeadDepth + << "\nCmpBB depth: " << CmpBBDepth << '\n'); + if (CmpBBDepth > HeadDepth + DelayLimit) { + DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit + << " cycles.\n"); + return false; + } + + // Check the resource depth at the bottom of CmpBB - these instructions will + // be speculated. + unsigned ResDepth = Trace.getResourceDepth(true); + DEBUG(dbgs() << "Resources: " << ResDepth << '\n'); + + // Heuristic: The speculatively executed instructions must all be able to + // merge into the Head block. The Head critical path should dominate the + // resource cost of the speculated instructions. 
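+ // For example (illustrative numbers only): with a Head critical path of + // 8 cycles, a CmpBB whose speculated instructions need 10 cycles of issue + // resources is rejected below, since speculation would lengthen the + // merged block. 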
+ if (ResDepth > HeadDepth) { + DEBUG(dbgs() << "Too many instructions to speculate.\n"); + return false; + } + return true; +} + +bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { + bool Changed = false; + while (CmpConv.canConvert(MBB) && shouldConvert()) { + invalidateTraces(); + SmallVector<MachineBasicBlock *, 4> RemovedBlocks; + CmpConv.convert(RemovedBlocks); + Changed = true; + updateDomTree(RemovedBlocks); + updateLoops(RemovedBlocks); + } + return Changed; +} + +bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" + << "********** Function: " << MF.getName() << '\n'); + TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); + SchedModel = + MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel(); + MRI = &MF.getRegInfo(); + DomTree = &getAnalysis<MachineDominatorTree>(); + Loops = getAnalysisIfAvailable<MachineLoopInfo>(); + Traces = &getAnalysis<MachineTraceMetrics>(); + MinInstr = nullptr; + MinSize = MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::MinSize); + + bool Changed = false; + CmpConv.runOnMachineFunction(MF); + + // Visit blocks in dominator tree pre-order. The pre-order enables multiple + // cmp-conversions from the same head block. + // Note that updateDomTree() modifies the children of the DomTree node + // currently being visited. The df_iterator supports that; it doesn't look at + // child_begin() / child_end() until after a node has been visited. + for (auto *I : depth_first(DomTree)) + if (tryConvert(I->getBlock())) + Changed = true; + + return Changed; +} diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp new file mode 100644 index 0000000..a2d853c --- /dev/null +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -0,0 +1,134 @@ +//==-- AArch64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// When allowed by the instruction, replace a dead definition of a GPR with +// the zero register. This makes the code a bit friendlier towards the +// hardware's register renamer. +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64RegisterInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-dead-defs" + +STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced"); + +namespace { +class AArch64DeadRegisterDefinitions : public MachineFunctionPass { +private: + const TargetRegisterInfo *TRI; + bool implicitlyDefinesOverlappingReg(unsigned Reg, const MachineInstr &MI); + bool processMachineBasicBlock(MachineBasicBlock &MBB); + bool usesFrameIndex(const MachineInstr &MI); +public: + static char ID; // Pass identification, replacement for typeid. 
+ explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &F) override; + + const char *getPassName() const override { return "Dead register definitions"; } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +char AArch64DeadRegisterDefinitions::ID = 0; +} // end anonymous namespace + +bool AArch64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg( + unsigned Reg, const MachineInstr &MI) { + for (const MachineOperand &MO : MI.implicit_operands()) + if (MO.isReg() && MO.isDef()) + if (TRI->regsOverlap(Reg, MO.getReg())) + return true; + return false; +} + +bool AArch64DeadRegisterDefinitions::usesFrameIndex(const MachineInstr &MI) { + for (const MachineOperand &Op : MI.uses()) + if (Op.isFI()) + return true; + return false; +} + +bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( + MachineBasicBlock &MBB) { + bool Changed = false; + for (MachineInstr &MI : MBB) { + if (usesFrameIndex(MI)) { + // We need to skip this instruction because while it appears to have a + // dead def it uses a frame index which might expand into a multi- + // instruction sequence during PEI (prologue/epilogue insertion). + DEBUG(dbgs() << " Ignoring, operand is frame index\n"); + continue; + } + for (int i = 0, e = MI.getDesc().getNumDefs(); i != e; ++i) { + MachineOperand &MO = MI.getOperand(i); + if (MO.isReg() && MO.isDead() && MO.isDef()) { + assert(!MO.isImplicit() && "Unexpected implicit def!"); + DEBUG(dbgs() << " Dead def operand #" << i << " in:\n "; + MI.print(dbgs())); + // Be careful not to change the register if it's a tied operand. + if (MI.isRegTiedToUseOperand(i)) { + DEBUG(dbgs() << " Ignoring, def is tied operand.\n"); + continue; + } + // Don't change the register if there's an implicit def of a subreg or + // superreg. + if (implicitlyDefinesOverlappingReg(MO.getReg(), MI)) { + DEBUG(dbgs() << " Ignoring, implicitly defines overlap reg.\n"); + continue; + } + // Make sure the instruction takes a register class that contains + // the zero register, and replace the register if so. + unsigned NewReg; + switch (MI.getDesc().OpInfo[i].RegClass) { + default: + DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); + continue; + case AArch64::GPR32RegClassID: + NewReg = AArch64::WZR; + break; + case AArch64::GPR64RegClassID: + NewReg = AArch64::XZR; + break; + } + DEBUG(dbgs() << " Replacing with zero register. New:\n "); + MO.setReg(NewReg); + DEBUG(MI.print(dbgs())); + ++NumDeadDefsReplaced; + Changed = true; + } + } + } + return Changed; +} + +// Scan the function for instructions that have a dead definition of a +// register. Replace that register with the zero register when possible. 
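+// Illustrative example: in 'subs w0, w1, #1' where w0 is dead and only the +// flags are used, the def can be rewritten to 'subs wzr, w1, #1' (i.e. a +// 'cmp'), so the unused result no longer consumes a rename register. 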
+bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { + TRI = MF.getTarget().getRegisterInfo(); + bool Changed = false; + DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n"); + + for (auto &MBB : MF) + if (processMachineBasicBlock(MBB)) + Changed = true; + return Changed; +} + +FunctionPass *llvm::createAArch64DeadRegisterDefinitions() { + return new AArch64DeadRegisterDefinitions(); +} diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp new file mode 100644 index 0000000..a76fd76 --- /dev/null +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -0,0 +1,749 @@ +//==-- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that expands pseudo instructions into target +// instructions to allow proper scheduling and other late optimizations. This +// pass should be run after register allocation but before the post-regalloc +// scheduling pass. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "AArch64InstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/MathExtras.h" +using namespace llvm; + +namespace { +class AArch64ExpandPseudo : public MachineFunctionPass { +public: + static char ID; + AArch64ExpandPseudo() : MachineFunctionPass(ID) {} + + const AArch64InstrInfo *TII; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return "AArch64 pseudo instruction expansion pass"; + } + +private: + bool expandMBB(MachineBasicBlock &MBB); + bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned BitSize); +}; +char AArch64ExpandPseudo::ID = 0; +} + +/// \brief Transfer implicit operands on the pseudo instruction to the +/// instructions created from the expansion. +static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, + MachineInstrBuilder &DefMI) { + const MCInstrDesc &Desc = OldMI.getDesc(); + for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e; + ++i) { + const MachineOperand &MO = OldMI.getOperand(i); + assert(MO.isReg() && MO.getReg()); + if (MO.isUse()) + UseMI.addOperand(MO); + else + DefMI.addOperand(MO); + } +} + +/// \brief Helper function which extracts the specified 16-bit chunk from a +/// 64-bit value. +static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) { + assert(ChunkIdx < 4 && "Out of range chunk index specified!"); + + return (Imm >> (ChunkIdx * 16)) & 0xFFFF; +} + +/// \brief Helper function which replicates a 16-bit chunk within a 64-bit +/// value. Indices correspond to element numbers in a v4i16. +static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) { + assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!"); + const unsigned ShiftAmt = ToIdx * 16; + + // Replicate the source chunk to the destination position. + const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt; + // Clear the destination chunk. 
+ Imm &= ~(0xFFFFLL << ShiftAmt); + // Insert the replicated chunk. + return Imm | Chunk; +} + +/// \brief Helper function which tries to materialize a 64-bit value with an +/// ORR + MOVK instruction sequence. +static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const AArch64InstrInfo *TII, unsigned ChunkIdx) { + assert(ChunkIdx < 4 && "Out of range chunk index specified!"); + const unsigned ShiftAmt = ChunkIdx * 16; + + uint64_t Encoding; + if (AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) { + // Create the ORR-immediate instruction. + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) + .addOperand(MI.getOperand(0)) + .addReg(AArch64::XZR) + .addImm(Encoding); + + // Create the MOVK instruction. + const unsigned Imm16 = getChunk(UImm, ChunkIdx); + const unsigned DstReg = MI.getOperand(0).getReg(); + const bool DstIsDead = MI.getOperand(0).isDead(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); + + transferImpOps(MI, MIB, MIB1); + MI.eraseFromParent(); + return true; + } + + return false; +} + +/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width +/// can be materialized with an ORR instruction. +static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) { + Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk; + + return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding); +} + +/// \brief Check for identical 16-bit chunks within the constant and if so +/// materialize them with a single ORR instruction. The remaining one or two +/// 16-bit chunks will be materialized with MOVK instructions. +/// +/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order +/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with +/// an ORR instruction. +/// +static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const AArch64InstrInfo *TII) { + typedef DenseMap<uint64_t, unsigned> CountMap; + CountMap Counts; + + // Scan the constant and count how often every chunk occurs. + for (unsigned Idx = 0; Idx < 4; ++Idx) + ++Counts[getChunk(UImm, Idx)]; + + // Traverse the chunks to find one which occurs more than once. + for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end(); + Chunk != End; ++Chunk) { + const uint64_t ChunkVal = Chunk->first; + const unsigned Count = Chunk->second; + + uint64_t Encoding = 0; + + // We are looking for chunks which have two or three instances and can be + // materialized with an ORR instruction. + if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding)) + continue; + + const bool CountThree = Count == 3; + // Create the ORR-immediate instruction. + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) + .addOperand(MI.getOperand(0)) + .addReg(AArch64::XZR) + .addImm(Encoding); + + const unsigned DstReg = MI.getOperand(0).getReg(); + const bool DstIsDead = MI.getOperand(0).isDead(); + + unsigned ShiftAmt = 0; + uint64_t Imm16 = 0; + // Find the first chunk not materialized with the ORR instruction. 
+ for (; ShiftAmt < 64; ShiftAmt += 16) { + Imm16 = (UImm >> ShiftAmt) & 0xFFFF; + + if (Imm16 != ChunkVal) + break; + } + + // Create the first MOVK instruction. + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, + RegState::Define | getDeadRegState(DstIsDead && CountThree)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); + + // In case we have three instances the whole constant is now materialized + // and we can exit. + if (CountThree) { + transferImpOps(MI, MIB, MIB1); + MI.eraseFromParent(); + return true; + } + + // Find the remaining chunk which needs to be materialized. + for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) { + Imm16 = (UImm >> ShiftAmt) & 0xFFFF; + + if (Imm16 != ChunkVal) + break; + } + + // Create the second MOVK instruction. + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); + + transferImpOps(MI, MIB, MIB2); + MI.eraseFromParent(); + return true; + } + + return false; +} + +/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern +/// starts a contiguous sequence of ones if we look at the bits from the LSB +/// towards the MSB. +static bool isStartChunk(uint64_t Chunk) { + if (Chunk == 0 || Chunk == UINT64_MAX) + return false; + + return (CountLeadingOnes_64(Chunk) + countTrailingZeros(Chunk)) == 64; +} + +/// \brief Check whether this chunk matches the pattern '0...1...' This pattern +/// ends a contiguous sequence of ones if we look at the bits from the LSB +/// towards the MSB. +static bool isEndChunk(uint64_t Chunk) { + if (Chunk == 0 || Chunk == UINT64_MAX) + return false; + + return (countLeadingZeros(Chunk) + CountTrailingOnes_64(Chunk)) == 64; +} + +/// \brief Clear or set all bits in the chunk at the given index. +static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) { + const uint64_t Mask = 0xFFFF; + + if (Clear) + // Clear chunk in the immediate. + Imm &= ~(Mask << (Idx * 16)); + else + // Set all bits in the immediate for the particular chunk. + Imm |= Mask << (Idx * 16); + + return Imm; +} + +/// \brief Check whether the constant contains a sequence of contiguous ones, +/// which might be interrupted by one or two chunks. If so, materialize the +/// sequence of contiguous ones with an ORR instruction. +/// Materialize the chunks which are either interrupting the sequence or outside +/// of the sequence with a MOVK instruction. +/// +/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk +/// which ends the sequence (0...1...). Then we are looking for constants which +/// contain at least one S and E chunk. +/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|. +/// +/// We are also looking for constants like |S|A|B|E| where the contiguous +/// sequence of ones wraps around the MSB into the LSB. +/// +static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const AArch64InstrInfo *TII) { + const int NotSet = -1; + const uint64_t Mask = 0xFFFF; + + int StartIdx = NotSet; + int EndIdx = NotSet; + // Try to find the chunks which start/end a contiguous sequence of ones. + for (int Idx = 0; Idx < 4; ++Idx) { + int64_t Chunk = getChunk(UImm, Idx); + // Sign extend the 16-bit chunk to 64-bit. 
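+ // (For example, a chunk of 0x8000 becomes 0xFFFFFFFFFFFF8000, so the + // start/end checks below see the chunk in its 64-bit context.) 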
+ Chunk = (Chunk << 48) >> 48; + + if (isStartChunk(Chunk)) + StartIdx = Idx; + else if (isEndChunk(Chunk)) + EndIdx = Idx; + } + + // Early exit in case we can't find a start/end chunk. + if (StartIdx == NotSet || EndIdx == NotSet) + return false; + + // Outside of the contiguous sequence of ones everything needs to be zero. + uint64_t Outside = 0; + // Chunks between the start and end chunk need to have all their bits set. + uint64_t Inside = Mask; + + // If our contiguous sequence of ones wraps around from the MSB into the LSB, + // just swap indices and pretend we are materializing a contiguous sequence + // of zeros surrounded by a contiguous sequence of ones. + if (StartIdx > EndIdx) { + std::swap(StartIdx, EndIdx); + std::swap(Outside, Inside); + } + + uint64_t OrrImm = UImm; + int FirstMovkIdx = NotSet; + int SecondMovkIdx = NotSet; + + // Find out which chunks we need to patch up to obtain a contiguous sequence + // of ones. + for (int Idx = 0; Idx < 4; ++Idx) { + const uint64_t Chunk = getChunk(UImm, Idx); + + // Check whether we are looking at a chunk which is not part of the + // contiguous sequence of ones. + if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) { + OrrImm = updateImm(OrrImm, Idx, Outside == 0); + + // Remember the index we need to patch. + if (FirstMovkIdx == NotSet) + FirstMovkIdx = Idx; + else + SecondMovkIdx = Idx; + + // Check whether we are looking at a chunk which is part of the contiguous + // sequence of ones. + } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) { + OrrImm = updateImm(OrrImm, Idx, Inside != Mask); + + // Remember the index we need to patch. + if (FirstMovkIdx == NotSet) + FirstMovkIdx = Idx; + else + SecondMovkIdx = Idx; + } + } + assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!"); + + // Create the ORR-immediate instruction. + uint64_t Encoding = 0; + AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding); + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) + .addOperand(MI.getOperand(0)) + .addReg(AArch64::XZR) + .addImm(Encoding); + + const unsigned DstReg = MI.getOperand(0).getReg(); + const bool DstIsDead = MI.getOperand(0).isDead(); + + const bool SingleMovk = SecondMovkIdx == NotSet; + // Create the first MOVK instruction. + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, + RegState::Define | getDeadRegState(DstIsDead && SingleMovk)) + .addReg(DstReg) + .addImm(getChunk(UImm, FirstMovkIdx)) + .addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16)); + + // Early exit in case we only need to emit a single MOVK instruction. + if (SingleMovk) { + transferImpOps(MI, MIB, MIB1); + MI.eraseFromParent(); + return true; + } + + // Create the second MOVK instruction. + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addImm(getChunk(UImm, SecondMovkIdx)) + .addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16)); + + transferImpOps(MI, MIB, MIB2); + MI.eraseFromParent(); + return true; +} + +/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more +/// real move-immediate instructions to synthesize the immediate. 
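+/// For example (illustrative): 0x000012340000ABCD can be synthesized as +/// 'movz x0, #0x1234, lsl #32' followed by 'movk x0, #0xabcd', while +/// 0xFFFFFFFFFFFFFFFE needs only 'movn x0, #1'. 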
+bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned BitSize) { + MachineInstr &MI = *MBBI; + uint64_t Imm = MI.getOperand(1).getImm(); + const unsigned Mask = 0xFFFF; + + // Try a MOVI instruction (aka ORR-immediate with the zero register). + uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { + unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri); + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) + .addOperand(MI.getOperand(0)) + .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR) + .addImm(Encoding); + transferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + + // Scan the immediate and count the number of 16-bit chunks which are either + // all ones or all zeros. + unsigned OneChunks = 0; + unsigned ZeroChunks = 0; + for (unsigned Shift = 0; Shift < BitSize; Shift += 16) { + const unsigned Chunk = (Imm >> Shift) & Mask; + if (Chunk == Mask) + OneChunks++; + else if (Chunk == 0) + ZeroChunks++; + } + + // Since we can't materialize the constant with a single ORR instruction, + // let's see whether we can materialize 3/4 of the constant with an ORR + // instruction and use an additional MOVK instruction to materialize the + // remaining 1/4. + // + // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|. + // + // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR, + // we would create the following instruction sequence: + // + // ORR x0, xzr, |A|X|A|X| + // MOVK x0, |B|, LSL #16 + // + // Only look at 64-bit constants which can't be materialized with a single + // instruction, e.g. which have fewer than three all-zero or all-one + // chunks. + // + // Ignore 32-bit constants here, they can always be materialized with a + // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized + // with a single ORR, the best sequence we can achieve is an ORR + MOVK pair. + // Thus we fall back to the default code below which in the best case creates + // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one). + // + if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) { + // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2 + // identical? + if (getChunk(UImm, 0) == getChunk(UImm, 2)) { + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 3 into element 1. + uint64_t OrrImm = replicateChunk(UImm, 3, 1); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1)) + return true; + + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 1 into element 3. + OrrImm = replicateChunk(UImm, 1, 3); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3)) + return true; + + // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3 + // identical? + } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) { + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 2 into element 0. + uint64_t OrrImm = replicateChunk(UImm, 2, 0); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0)) + return true; + + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 0 into element 2. 
+ OrrImm = replicateChunk(UImm, 0, 2); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2)) + return true; + } + } + + // Check for identical 16-bit chunks within the constant and if so materialize + // them with a single ORR instruction. The remaining one or two 16-bit chunks + // will be materialized with MOVK instructions. + if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII)) + return true; + + // Check whether the constant contains a sequence of contiguous ones, which + // might be interrupted by one or two chunks. If so, materialize the sequence + // of contiguous ones with an ORR instruction. Materialize the chunks which + // are either interrupting the sequence or outside of the sequence with a + // MOVK instruction. + if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII)) + return true; + + // Use a MOVZ or MOVN instruction to set the high bits, followed by one or + // more MOVK instructions to insert additional 16-bit portions into the + // lower bits. + bool isNeg = false; + + // Use MOVN to materialize the high bits if we have more all one chunks + // than all zero chunks. + if (OneChunks > ZeroChunks) { + isNeg = true; + Imm = ~Imm; + } + + unsigned FirstOpc; + if (BitSize == 32) { + Imm &= (1LL << 32) - 1; + FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi); + } else { + FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi); + } + unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN + unsigned LastShift = 0; // LSL amount for last MOVK + if (Imm != 0) { + unsigned LZ = countLeadingZeros(Imm); + unsigned TZ = countTrailingZeros(Imm); + Shift = ((63 - LZ) / 16) * 16; + LastShift = (TZ / 16) * 16; + } + unsigned Imm16 = (Imm >> Shift) & Mask; + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc)) + .addReg(DstReg, RegState::Define | + getDeadRegState(DstIsDead && Shift == LastShift)) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); + + // If a MOVN was used for the high bits of a negative value, flip the rest + // of the bits back for use with MOVK. + if (isNeg) + Imm = ~Imm; + + if (Shift == LastShift) { + transferImpOps(MI, MIB1, MIB1); + MI.eraseFromParent(); + return true; + } + + MachineInstrBuilder MIB2; + unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi); + while (Shift != LastShift) { + Shift -= 16; + Imm16 = (Imm >> Shift) & Mask; + if (Imm16 == (isNeg ? Mask : 0)) + continue; // This 16-bit portion is already set correctly. + MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(DstIsDead && Shift == LastShift)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); + } + + transferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; +} + +/// \brief If MBBI references a pseudo instruction that should be expanded here, +/// do the expansion and return true. Otherwise return false. 
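+/// (For instance, the plain register-register form 'add w0, w1, w2' is +/// rewritten below into the equivalent shifted-register form +/// 'add w0, w1, w2, lsl #0'.) 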
+bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + break; + + case AArch64::ADDWrr: + case AArch64::SUBWrr: + case AArch64::ADDXrr: + case AArch64::SUBXrr: + case AArch64::ADDSWrr: + case AArch64::SUBSWrr: + case AArch64::ADDSXrr: + case AArch64::SUBSXrr: + case AArch64::ANDWrr: + case AArch64::ANDXrr: + case AArch64::BICWrr: + case AArch64::BICXrr: + case AArch64::ANDSWrr: + case AArch64::ANDSXrr: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::EONWrr: + case AArch64::EONXrr: + case AArch64::EORWrr: + case AArch64::EORXrr: + case AArch64::ORNWrr: + case AArch64::ORNXrr: + case AArch64::ORRWrr: + case AArch64::ORRXrr: { + unsigned Opcode; + switch (MI.getOpcode()) { + default: + return false; + case AArch64::ADDWrr: Opcode = AArch64::ADDWrs; break; + case AArch64::SUBWrr: Opcode = AArch64::SUBWrs; break; + case AArch64::ADDXrr: Opcode = AArch64::ADDXrs; break; + case AArch64::SUBXrr: Opcode = AArch64::SUBXrs; break; + case AArch64::ADDSWrr: Opcode = AArch64::ADDSWrs; break; + case AArch64::SUBSWrr: Opcode = AArch64::SUBSWrs; break; + case AArch64::ADDSXrr: Opcode = AArch64::ADDSXrs; break; + case AArch64::SUBSXrr: Opcode = AArch64::SUBSXrs; break; + case AArch64::ANDWrr: Opcode = AArch64::ANDWrs; break; + case AArch64::ANDXrr: Opcode = AArch64::ANDXrs; break; + case AArch64::BICWrr: Opcode = AArch64::BICWrs; break; + case AArch64::BICXrr: Opcode = AArch64::BICXrs; break; + case AArch64::ANDSWrr: Opcode = AArch64::ANDSWrs; break; + case AArch64::ANDSXrr: Opcode = AArch64::ANDSXrs; break; + case AArch64::BICSWrr: Opcode = AArch64::BICSWrs; break; + case AArch64::BICSXrr: Opcode = AArch64::BICSXrs; break; + case AArch64::EONWrr: Opcode = AArch64::EONWrs; break; + case AArch64::EONXrr: Opcode = AArch64::EONXrs; break; + case AArch64::EORWrr: Opcode = AArch64::EORWrs; break; + case AArch64::EORXrr: Opcode = AArch64::EORXrs; break; + case AArch64::ORNWrr: Opcode = AArch64::ORNWrs; break; + case AArch64::ORNXrr: Opcode = AArch64::ORNXrs; break; + case AArch64::ORRWrr: Opcode = AArch64::ORRWrs; break; + case AArch64::ORRXrr: Opcode = AArch64::ORRXrs; break; + } + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode), + MI.getOperand(0).getReg()) + .addOperand(MI.getOperand(1)) + .addOperand(MI.getOperand(2)) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + transferImpOps(MI, MIB1, MIB1); + MI.eraseFromParent(); + return true; + } + + case AArch64::FCVTSHpseudo: { + MachineOperand Src = MI.getOperand(1); + Src.setImplicit(); + unsigned SrcH = + TII->getRegisterInfo().getSubReg(Src.getReg(), AArch64::hsub); + auto MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::FCVTSHr)) + .addOperand(MI.getOperand(0)) + .addReg(SrcH, RegState::Undef) + .addOperand(Src); + transferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + case AArch64::LOADgot: { + // Expand into ADRP + LDR. 
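+ // (Illustrative: a GOT load of 'sym' becomes 'adrp x0, :got:sym' followed + // by 'ldr x0, [x0, :got_lo12:sym]'.) 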
+ unsigned DstReg = MI.getOperand(0).getReg(); + const MachineOperand &MO1 = MI.getOperand(1); + unsigned Flags = MO1.getTargetFlags(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg); + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui)) + .addOperand(MI.getOperand(0)) + .addReg(DstReg); + + if (MO1.isGlobal()) { + MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE); + MIB2.addGlobalAddress(MO1.getGlobal(), 0, + Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + } else if (MO1.isSymbol()) { + MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | AArch64II::MO_PAGE); + MIB2.addExternalSymbol(MO1.getSymbolName(), + Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + } else { + assert(MO1.isCPI() && + "Only expect globals, externalsymbols, or constant pools"); + MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), + Flags | AArch64II::MO_PAGE); + MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), + Flags | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + } + + transferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; + } + + case AArch64::MOVaddr: + case AArch64::MOVaddrJT: + case AArch64::MOVaddrCP: + case AArch64::MOVaddrBA: + case AArch64::MOVaddrTLS: + case AArch64::MOVaddrEXT: { + // Expand into ADRP + ADD. + unsigned DstReg = MI.getOperand(0).getReg(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg) + .addOperand(MI.getOperand(1)); + + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri)) + .addOperand(MI.getOperand(0)) + .addReg(DstReg) + .addOperand(MI.getOperand(2)) + .addImm(0); + + transferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; + } + + case AArch64::MOVi32imm: + return expandMOVImm(MBB, MBBI, 32); + case AArch64::MOVi64imm: + return expandMOVImm(MBB, MBBI, 64); + case AArch64::RET_ReallyLR: + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::RET)) + .addReg(AArch64::LR); + MI.eraseFromParent(); + return true; + } + return false; +} + +/// \brief Iterate over the instructions in basic block MBB and expand any +/// pseudo instructions. Return true if anything was modified. +bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineBasicBlock::iterator NMBBI = std::next(MBBI); + Modified |= expandMI(MBB, MBBI); + MBBI = NMBBI; + } + + return Modified; +} + +bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo()); + + bool Modified = false; + for (auto &MBB : MF) + Modified |= expandMBB(MBB); + return Modified; +} + +/// \brief Returns an instance of the pseudo instruction expansion pass. +FunctionPass *llvm::createAArch64ExpandPseudoPass() { + return new AArch64ExpandPseudo(); +} diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp new file mode 100644 index 0000000..c3b5369 --- /dev/null +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -0,0 +1,1981 @@ +//===-- AArch64FastISel.cpp - AArch64 FastISel implementation -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the AArch64-specific support for the FastISel class. Some +// of the target-specific code is generated by tablegen in the file +// AArch64GenFastISel.inc, which is #included here. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +namespace { + +class AArch64FastISel : public FastISel { + + class Address { + public: + typedef enum { + RegBase, + FrameIndexBase + } BaseKind; + + private: + BaseKind Kind; + union { + unsigned Reg; + int FI; + } Base; + int64_t Offset; + + public: + Address() : Kind(RegBase), Offset(0) { Base.Reg = 0; } + void setKind(BaseKind K) { Kind = K; } + BaseKind getKind() const { return Kind; } + bool isRegBase() const { return Kind == RegBase; } + bool isFIBase() const { return Kind == FrameIndexBase; } + void setReg(unsigned Reg) { + assert(isRegBase() && "Invalid base register access!"); + Base.Reg = Reg; + } + unsigned getReg() const { + assert(isRegBase() && "Invalid base register access!"); + return Base.Reg; + } + void setFI(unsigned FI) { + assert(isFIBase() && "Invalid base frame index access!"); + Base.FI = FI; + } + unsigned getFI() const { + assert(isFIBase() && "Invalid base frame index access!"); + return Base.FI; + } + void setOffset(int64_t O) { Offset = O; } + int64_t getOffset() { return Offset; } + + bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); } + }; + + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when generating code for different targets. + const AArch64Subtarget *Subtarget; + LLVMContext *Context; + +private: + // Selection routines. + bool SelectLoad(const Instruction *I); + bool SelectStore(const Instruction *I); + bool SelectBranch(const Instruction *I); + bool SelectIndirectBr(const Instruction *I); + bool SelectCmp(const Instruction *I); + bool SelectSelect(const Instruction *I); + bool SelectFPExt(const Instruction *I); + bool SelectFPTrunc(const Instruction *I); + bool SelectFPToInt(const Instruction *I, bool Signed); + bool SelectIntToFP(const Instruction *I, bool Signed); + bool SelectRem(const Instruction *I, unsigned ISDOpcode); + bool SelectCall(const Instruction *I, const char *IntrMemName); + bool SelectIntrinsicCall(const IntrinsicInst &I); + bool SelectRet(const Instruction *I); + bool SelectTrunc(const Instruction *I); + bool SelectIntExt(const Instruction *I); + bool SelectMul(const Instruction *I); + + // Utility helper routines. 
+ bool isTypeLegal(Type *Ty, MVT &VT); + bool isLoadStoreTypeLegal(Type *Ty, MVT &VT); + bool ComputeAddress(const Value *Obj, Address &Addr); + bool SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor, + bool UseUnscaled); + void AddLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, + unsigned Flags, bool UseUnscaled); + bool IsMemCpySmall(uint64_t Len, unsigned Alignment); + bool TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, + unsigned Alignment); + // Emit functions. + bool EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt); + bool EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, + bool UseUnscaled = false); + bool EmitStore(MVT VT, unsigned SrcReg, Address Addr, + bool UseUnscaled = false); + unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); + unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt); + + unsigned AArch64MaterializeFP(const ConstantFP *CFP, MVT VT); + unsigned AArch64MaterializeGV(const GlobalValue *GV); + + // Call handling routines. +private: + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const; + bool ProcessCallArgs(SmallVectorImpl<Value *> &Args, + SmallVectorImpl<unsigned> &ArgRegs, + SmallVectorImpl<MVT> &ArgVTs, + SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, + SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC, + unsigned &NumBytes); + bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, + const Instruction *I, CallingConv::ID CC, unsigned &NumBytes); + +public: + // Backend specific FastISel code. + unsigned TargetMaterializeAlloca(const AllocaInst *AI) override; + unsigned TargetMaterializeConstant(const Constant *C) override; + + explicit AArch64FastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) + : FastISel(funcInfo, libInfo) { + Subtarget = &TM.getSubtarget<AArch64Subtarget>(); + Context = &funcInfo.Fn->getContext(); + } + + bool TargetSelectInstruction(const Instruction *I) override; + +#include "AArch64GenFastISel.inc" +}; + +} // end anonymous namespace + +#include "AArch64GenCallingConv.inc" + +CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { + if (CC == CallingConv::WebKit_JS) + return CC_AArch64_WebKit_JS; + return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS; +} + +unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) { + assert(TLI.getValueType(AI->getType(), true) == MVT::i64 && + "Alloca should always return a pointer."); + + // Don't handle dynamic allocas. + if (!FuncInfo.StaticAllocaMap.count(AI)) + return 0; + + DenseMap<const AllocaInst *, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + + if (SI != FuncInfo.StaticAllocaMap.end()) { + unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), + ResultReg) + .addFrameIndex(SI->second) + .addImm(0) + .addImm(0); + return ResultReg; + } + + return 0; +} + +unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) { + if (VT != MVT::f32 && VT != MVT::f64) + return 0; + + const APFloat Val = CFP->getValueAPF(); + bool is64bit = (VT == MVT::f64); + + // This checks to see if we can use FMOV instructions to materialize + // a constant, otherwise we have to materialize via the constant pool. 
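+ // (Illustrative: 2.0 fits the 8-bit FMOV immediate encoding and becomes + // 'fmov d0, #2.0', whereas a value like 1.1 does not and is loaded from + // the constant pool via ADRP + LDR below.) 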
+ if (TLI.isFPImmLegal(Val, VT)) { + int Imm; + unsigned Opc; + if (is64bit) { + Imm = AArch64_AM::getFP64Imm(Val); + Opc = AArch64::FMOVDi; + } else { + Imm = AArch64_AM::getFP32Imm(Val); + Opc = AArch64::FMOVSi; + } + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addImm(Imm); + return ResultReg; + } + + // Materialize via constant pool. MachineConstantPool wants an explicit + // alignment. + unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); + if (Align == 0) + Align = DL.getTypeAllocSize(CFP->getType()); + + unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align); + unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg).addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGE); + + unsigned Opc = is64bit ? AArch64::LDRDui : AArch64::LDRSui; + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(ADRPReg) + .addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + return ResultReg; +} + +unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) { + // We can't handle thread-local variables quickly yet. Unfortunately we have + // to peer through any aliases to find out if that rule applies. + const GlobalValue *TLSGV = GV; + if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) + TLSGV = GA->getAliasee(); + + // MachO still uses GOT for large code-model accesses, but ELF requires + // movz/movk sequences, which FastISel doesn't handle yet. + if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO()) + return 0; + + if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(TLSGV)) + if (GVar->isThreadLocal()) + return 0; + + unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); + + EVT DestEVT = TLI.getValueType(GV->getType(), true); + if (!DestEVT.isSimple()) + return 0; + + unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); + unsigned ResultReg; + + if (OpFlags & AArch64II::MO_GOT) { + // ADRP + LDRX + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE); + + ResultReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui), + ResultReg) + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + } else { + // ADRP + ADDX + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg).addGlobalAddress(GV, 0, AArch64II::MO_PAGE); + + ResultReg = createResultReg(&AArch64::GPR64spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), + ResultReg) + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC) + .addImm(0); + } + return ResultReg; +} + +unsigned AArch64FastISel::TargetMaterializeConstant(const Constant *C) { + EVT CEVT = TLI.getValueType(C->getType(), true); + + // Only handle simple types. + if (!CEVT.isSimple()) + return 0; + MVT VT = CEVT.getSimpleVT(); + + // FIXME: Handle ConstantInt. + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) + return AArch64MaterializeFP(CFP, VT); + else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + return AArch64MaterializeGV(GV); + + return 0; +} + +// Computes the address to get to an object. 
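+// (Illustrative: for a GEP with constant indices, the element offsets are +// folded into Addr's offset and the base pointer is left in a register; +// unsupported forms fall back to materializing the address as a plain value.) 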
+bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) { + const User *U = nullptr; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast<Instruction>(Obj)) { + // Don't walk into other basic blocks unless the object is an alloca from + // another block, otherwise it may not have a virtual register assigned. + if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) || + FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + Opcode = I->getOpcode(); + U = I; + } + } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) { + Opcode = C->getOpcode(); + U = C; + } + + if (const PointerType *Ty = dyn_cast<PointerType>(Obj->getType())) + if (Ty->getAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + switch (Opcode) { + default: + break; + case Instruction::BitCast: { + // Look through bitcasts. + return ComputeAddress(U->getOperand(0), Addr); + } + case Instruction::IntToPtr: { + // Look past no-op inttoptrs. + if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + return ComputeAddress(U->getOperand(0), Addr); + break; + } + case Instruction::PtrToInt: { + // Look past no-op ptrtoints. + if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + return ComputeAddress(U->getOperand(0), Addr); + break; + } + case Instruction::GetElementPtr: { + Address SavedAddr = Addr; + uint64_t TmpOffset = Addr.getOffset(); + + // Iterate through the GEP folding the constants into offsets where + // we can. + gep_type_iterator GTI = gep_type_begin(U); + for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; + ++i, ++GTI) { + const Value *Op = *i; + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + const StructLayout *SL = DL.getStructLayout(STy); + unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); + TmpOffset += SL->getElementOffset(Idx); + } else { + uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + TmpOffset += CI->getSExtValue() * S; + break; + } + if (canFoldAddIntoGEP(U, Op)) { + // A compatible add with a constant operand. Fold the constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + TmpOffset += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + // Unsupported + goto unsupported_gep; + } + } + } + + // Try to grab the base operand now. + Addr.setOffset(TmpOffset); + if (ComputeAddress(U->getOperand(0), Addr)) + return true; + + // We failed, restore everything and try the other options. + Addr = SavedAddr; + + unsupported_gep: + break; + } + case Instruction::Alloca: { + const AllocaInst *AI = cast<AllocaInst>(Obj); + DenseMap<const AllocaInst *, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + if (SI != FuncInfo.StaticAllocaMap.end()) { + Addr.setKind(Address::FrameIndexBase); + Addr.setFI(SI->second); + return true; + } + break; + } + } + + // Try to get this in a register if nothing else has worked. + if (!Addr.isValid()) + Addr.setReg(getRegForValue(Obj)); + return Addr.isValid(); +} + +bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { + EVT evt = TLI.getValueType(Ty, true); + + // Only handle simple types. + if (evt == MVT::Other || !evt.isSimple()) + return false; + VT = evt.getSimpleVT(); + + // This is a legal type, but it's not something we handle in fast-isel. + if (VT == MVT::f128) + return false; + + // Handle all other legal types, i.e. a register that will directly hold this + // value. 
+ return TLI.isTypeLegal(VT); +} + +bool AArch64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) { + if (isTypeLegal(Ty, VT)) + return true; + + // If this is a type that can be sign- or zero-extended to a basic operation, + // go ahead and accept it now. For stores, this reflects truncation. + if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16) + return true; + + return false; +} + +bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT, + int64_t ScaleFactor, bool UseUnscaled) { + bool needsLowering = false; + int64_t Offset = Addr.getOffset(); + switch (VT.SimpleTy) { + default: + return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + case MVT::f32: + case MVT::f64: + if (!UseUnscaled) + // Using scaled, 12-bit, unsigned immediate offsets. + needsLowering = ((Offset & 0xfff) != Offset); + else + // Using unscaled, 9-bit, signed immediate offsets. + needsLowering = (Offset > 256 || Offset < -256); + break; + } + + // FIXME: If this is a stack pointer and the offset needs to be simplified + // then put the alloca address into a register, set the base type back to + // register and continue. This should almost never happen. + if (needsLowering && Addr.getKind() == Address::FrameIndexBase) { + return false; + } + + // Since the offset is too large for the load/store instruction get the + // reg+offset into a register. + if (needsLowering) { + uint64_t UnscaledOffset = Addr.getOffset() * ScaleFactor; + unsigned ResultReg = FastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), false, + UnscaledOffset, MVT::i64); + if (ResultReg == 0) + return false; + Addr.setReg(ResultReg); + Addr.setOffset(0); + } + return true; +} + +void AArch64FastISel::AddLoadStoreOperands(Address &Addr, + const MachineInstrBuilder &MIB, + unsigned Flags, bool UseUnscaled) { + int64_t Offset = Addr.getOffset(); + // Frame base works a bit differently. Handle it separately. + if (Addr.getKind() == Address::FrameIndexBase) { + int FI = Addr.getFI(); + // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size + // and alignment should be based on the VT. + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + // Now add the rest of the operands. + MIB.addFrameIndex(FI).addImm(Offset).addMemOperand(MMO); + } else { + // Now add the rest of the operands. + MIB.addReg(Addr.getReg()); + MIB.addImm(Offset); + } +} + +bool AArch64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, + bool UseUnscaled) { + // Negative offsets require unscaled, 9-bit, signed immediate offsets. + // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. + if (!UseUnscaled && Addr.getOffset() < 0) + UseUnscaled = true; + + unsigned Opc; + const TargetRegisterClass *RC; + bool VTIsi1 = false; + int64_t ScaleFactor = 0; + switch (VT.SimpleTy) { + default: + return false; + case MVT::i1: + VTIsi1 = true; + // Intentional fall-through. + case MVT::i8: + Opc = UseUnscaled ? AArch64::LDURBBi : AArch64::LDRBBui; + RC = &AArch64::GPR32RegClass; + ScaleFactor = 1; + break; + case MVT::i16: + Opc = UseUnscaled ? AArch64::LDURHHi : AArch64::LDRHHui; + RC = &AArch64::GPR32RegClass; + ScaleFactor = 2; + break; + case MVT::i32: + Opc = UseUnscaled ? AArch64::LDURWi : AArch64::LDRWui; + RC = &AArch64::GPR32RegClass; + ScaleFactor = 4; + break; + case MVT::i64: + Opc = UseUnscaled ? 
AArch64::LDURXi : AArch64::LDRXui; + RC = &AArch64::GPR64RegClass; + ScaleFactor = 8; + break; + case MVT::f32: + Opc = UseUnscaled ? AArch64::LDURSi : AArch64::LDRSui; + RC = TLI.getRegClassFor(VT); + ScaleFactor = 4; + break; + case MVT::f64: + Opc = UseUnscaled ? AArch64::LDURDi : AArch64::LDRDui; + RC = TLI.getRegClassFor(VT); + ScaleFactor = 8; + break; + } + // Scale the offset. + if (!UseUnscaled) { + int64_t Offset = Addr.getOffset(); + if (Offset & (ScaleFactor - 1)) + // Retry using an unscaled, 9-bit, signed immediate offset. + return EmitLoad(VT, ResultReg, Addr, /*UseUnscaled*/ true); + + Addr.setOffset(Offset / ScaleFactor); + } + + // Simplify this down to something we can handle. + if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled)) + return false; + + // Create the base instruction, then add the operands. + ResultReg = createResultReg(RC); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg); + AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, UseUnscaled); + + // Loading an i1 requires special handling. + if (VTIsi1) { + MRI.constrainRegClass(ResultReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ANDReg) + .addReg(ResultReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + ResultReg = ANDReg; + } + return true; +} + +bool AArch64FastISel::SelectLoad(const Instruction *I) { + MVT VT; + // Verify we have a legal type before going any further. Currently, we handle + // simple types that will directly fit in a register (i32/f32/i64/f64) or + // those that can be sign or zero-extended to a basic operation (i1/i8/i16). + if (!isLoadStoreTypeLegal(I->getType(), VT) || cast<LoadInst>(I)->isAtomic()) + return false; + + // See if we can handle this address. + Address Addr; + if (!ComputeAddress(I->getOperand(0), Addr)) + return false; + + unsigned ResultReg; + if (!EmitLoad(VT, ResultReg, Addr)) + return false; + + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr, + bool UseUnscaled) { + // Negative offsets require unscaled, 9-bit, signed immediate offsets. + // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. + if (!UseUnscaled && Addr.getOffset() < 0) + UseUnscaled = true; + + unsigned StrOpc; + bool VTIsi1 = false; + int64_t ScaleFactor = 0; + // Using scaled, 12-bit, unsigned immediate offsets. + switch (VT.SimpleTy) { + default: + return false; + case MVT::i1: + VTIsi1 = true; + case MVT::i8: + StrOpc = UseUnscaled ? AArch64::STURBBi : AArch64::STRBBui; + ScaleFactor = 1; + break; + case MVT::i16: + StrOpc = UseUnscaled ? AArch64::STURHHi : AArch64::STRHHui; + ScaleFactor = 2; + break; + case MVT::i32: + StrOpc = UseUnscaled ? AArch64::STURWi : AArch64::STRWui; + ScaleFactor = 4; + break; + case MVT::i64: + StrOpc = UseUnscaled ? AArch64::STURXi : AArch64::STRXui; + ScaleFactor = 8; + break; + case MVT::f32: + StrOpc = UseUnscaled ? AArch64::STURSi : AArch64::STRSui; + ScaleFactor = 4; + break; + case MVT::f64: + StrOpc = UseUnscaled ? AArch64::STURDi : AArch64::STRDui; + ScaleFactor = 8; + break; + } + // Scale the offset. + if (!UseUnscaled) { + int64_t Offset = Addr.getOffset(); + if (Offset & (ScaleFactor - 1)) + // Retry using an unscaled, 9-bit, signed immediate offset. 
+ return EmitStore(VT, SrcReg, Addr, /*UseUnscaled*/ true); + + Addr.setOffset(Offset / ScaleFactor); + } + + // Simplify this down to something we can handle. + if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled)) + return false; + + // Storing an i1 requires special handling. + if (VTIsi1) { + MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ANDReg) + .addReg(SrcReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + SrcReg = ANDReg; + } + // Create the base instruction, then add the operands. + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(StrOpc)).addReg(SrcReg); + AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, UseUnscaled); + return true; +} + +bool AArch64FastISel::SelectStore(const Instruction *I) { + MVT VT; + Value *Op0 = I->getOperand(0); + // Verify we have a legal type before going any further. Currently, we handle + // simple types that will directly fit in a register (i32/f32/i64/f64) or + // those that can be sign or zero-extended to a basic operation (i1/i8/i16). + if (!isLoadStoreTypeLegal(Op0->getType(), VT) || + cast(I)->isAtomic()) + return false; + + // Get the value to be stored into a register. + unsigned SrcReg = getRegForValue(Op0); + if (SrcReg == 0) + return false; + + // See if we can handle this address. + Address Addr; + if (!ComputeAddress(I->getOperand(1), Addr)) + return false; + + if (!EmitStore(VT, SrcReg, Addr)) + return false; + return true; +} + +static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) { + switch (Pred) { + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UEQ: + default: + // AL is our "false" for now. The other two need more compares. + return AArch64CC::AL; + case CmpInst::ICMP_EQ: + case CmpInst::FCMP_OEQ: + return AArch64CC::EQ; + case CmpInst::ICMP_SGT: + case CmpInst::FCMP_OGT: + return AArch64CC::GT; + case CmpInst::ICMP_SGE: + case CmpInst::FCMP_OGE: + return AArch64CC::GE; + case CmpInst::ICMP_UGT: + case CmpInst::FCMP_UGT: + return AArch64CC::HI; + case CmpInst::FCMP_OLT: + return AArch64CC::MI; + case CmpInst::ICMP_ULE: + case CmpInst::FCMP_OLE: + return AArch64CC::LS; + case CmpInst::FCMP_ORD: + return AArch64CC::VC; + case CmpInst::FCMP_UNO: + return AArch64CC::VS; + case CmpInst::FCMP_UGE: + return AArch64CC::PL; + case CmpInst::ICMP_SLT: + case CmpInst::FCMP_ULT: + return AArch64CC::LT; + case CmpInst::ICMP_SLE: + case CmpInst::FCMP_ULE: + return AArch64CC::LE; + case CmpInst::FCMP_UNE: + case CmpInst::ICMP_NE: + return AArch64CC::NE; + case CmpInst::ICMP_UGE: + return AArch64CC::HS; + case CmpInst::ICMP_ULT: + return AArch64CC::LO; + } +} + +bool AArch64FastISel::SelectBranch(const Instruction *I) { + const BranchInst *BI = cast(I); + MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + if (const CmpInst *CI = dyn_cast(BI->getCondition())) { + if (CI->hasOneUse() && (CI->getParent() == I->getParent())) { + // We may not handle every CC for now. + AArch64CC::CondCode CC = getCompareCC(CI->getPredicate()); + if (CC == AArch64CC::AL) + return false; + + // Emit the cmp. + if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) + return false; + + // Emit the branch. 
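// [Editor's note: sketch of the pattern SelectBranch folds, not part of
// the patch. A compare with a single use in the same block as the branch
// is lowered straight to a flag-setting compare plus a conditional
// branch, e.g.:
//
//   %c = icmp slt i32 %a, %b        ; one use, same block as the br
//   br i1 %c, label %T, label %F
//
// becomes, schematically:
//
//   subs wzr, w0, w1                ; EmitCmp
//   b.lt .LBB_T                     ; Bcc with CC from getCompareCC
//   b    .LBB_F                     ; or fall-through via FastEmitBranch
//
// FCMP_ONE and FCMP_UEQ are rejected by getCompareCC because, as its
// comment says, each would need more than one compare/branch. End note.]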
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+          .addImm(CC)
+          .addMBB(TBB);
+      FuncInfo.MBB->addSuccessor(TBB);
+
+      FastEmitBranch(FBB, DbgLoc);
+      return true;
+    }
+  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+    MVT SrcVT;
+    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+        (isLoadStoreTypeLegal(TI->getOperand(0)->getType(), SrcVT))) {
+      unsigned CondReg = getRegForValue(TI->getOperand(0));
+      if (CondReg == 0)
+        return false;
+
+      // Issue an extract_subreg to get the lower 32-bits.
+      if (SrcVT == MVT::i64)
+        CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true,
+                                             AArch64::sub_32);
+
+      MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass);
+      unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(AArch64::ANDWri), ANDReg)
+          .addReg(CondReg)
+          .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(AArch64::SUBSWri))
+          .addReg(ANDReg)
+          .addReg(ANDReg)
+          .addImm(0)
+          .addImm(0);
+
+      unsigned CC = AArch64CC::NE;
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        CC = AArch64CC::EQ;
+      }
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+          .addImm(CC)
+          .addMBB(TBB);
+      FuncInfo.MBB->addSuccessor(TBB);
+      FastEmitBranch(FBB, DbgLoc);
+      return true;
+    }
+  } else if (const ConstantInt *CI =
+                 dyn_cast<ConstantInt>(BI->getCondition())) {
+    uint64_t Imm = CI->getZExtValue();
+    MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B))
+        .addMBB(Target);
+    FuncInfo.MBB->addSuccessor(Target);
+    return true;
+  }
+
+  unsigned CondReg = getRegForValue(BI->getCondition());
+  if (CondReg == 0)
+    return false;
+
+  // We've been divorced from our compare! Our block was split, and
+  // now our compare lives in a predecessor block. We mustn't
+  // re-compare here, as the children of the compare aren't guaranteed
+  // live across the block boundary (we *could* check for this).
+  // Regardless, the compare has been done in the predecessor block,
+  // and it left a value for us in a virtual register. Ergo, we test
+  // the one-bit value left in the virtual register.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri),
+          AArch64::WZR)
+      .addReg(CondReg)
+      .addImm(0)
+      .addImm(0);
+
+  unsigned CC = AArch64CC::NE;
+  if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+    std::swap(TBB, FBB);
+    CC = AArch64CC::EQ;
+  }
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+      .addImm(CC)
+      .addMBB(TBB);
+  FuncInfo.MBB->addSuccessor(TBB);
+  FastEmitBranch(FBB, DbgLoc);
+  return true;
+}
+
+bool AArch64FastISel::SelectIndirectBr(const Instruction *I) {
+  const IndirectBrInst *BI = cast<IndirectBrInst>(I);
+  unsigned AddrReg = getRegForValue(BI->getOperand(0));
+  if (AddrReg == 0)
+    return false;
+
+  // Emit the indirect branch.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BR))
+      .addReg(AddrReg);
+
+  // Make sure the CFG is up-to-date.
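// [Editor's note: not part of the patch. The swap of TBB/FBB above is the
// usual fall-through optimization: when the true block is the layout
// successor, branching on the inverted condition (EQ instead of NE) lets
// the taken edge fall through, e.g.
//
//   subs wzr, w0, #0
//   b.eq .LBB_F      ; inverted test jumps to the false block
//   ; .LBB_T follows immediately, no extra branch needed
//
// FastEmitBranch then emits the unconditional branch only when its target
// is not the next block in layout. End note.]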
+ for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]); + + return true; +} + +bool AArch64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) { + Type *Ty = Src1Value->getType(); + EVT SrcEVT = TLI.getValueType(Ty, true); + if (!SrcEVT.isSimple()) + return false; + MVT SrcVT = SrcEVT.getSimpleVT(); + + // Check to see if the 2nd operand is a constant that we can encode directly + // in the compare. + uint64_t Imm; + bool UseImm = false; + bool isNegativeImm = false; + if (const ConstantInt *ConstInt = dyn_cast(Src2Value)) { + if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 || + SrcVT == MVT::i8 || SrcVT == MVT::i1) { + const APInt &CIVal = ConstInt->getValue(); + + Imm = (isZExt) ? CIVal.getZExtValue() : CIVal.getSExtValue(); + if (CIVal.isNegative()) { + isNegativeImm = true; + Imm = -Imm; + } + // FIXME: We can handle more immediates using shifts. + UseImm = ((Imm & 0xfff) == Imm); + } + } else if (const ConstantFP *ConstFP = dyn_cast(Src2Value)) { + if (SrcVT == MVT::f32 || SrcVT == MVT::f64) + if (ConstFP->isZero() && !ConstFP->isNegative()) + UseImm = true; + } + + unsigned ZReg; + unsigned CmpOpc; + bool isICmp = true; + bool needsExt = false; + switch (SrcVT.SimpleTy) { + default: + return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + needsExt = true; + // Intentional fall-through. + case MVT::i32: + ZReg = AArch64::WZR; + if (UseImm) + CmpOpc = isNegativeImm ? AArch64::ADDSWri : AArch64::SUBSWri; + else + CmpOpc = AArch64::SUBSWrr; + break; + case MVT::i64: + ZReg = AArch64::XZR; + if (UseImm) + CmpOpc = isNegativeImm ? AArch64::ADDSXri : AArch64::SUBSXri; + else + CmpOpc = AArch64::SUBSXrr; + break; + case MVT::f32: + isICmp = false; + CmpOpc = UseImm ? AArch64::FCMPSri : AArch64::FCMPSrr; + break; + case MVT::f64: + isICmp = false; + CmpOpc = UseImm ? AArch64::FCMPDri : AArch64::FCMPDrr; + break; + } + + unsigned SrcReg1 = getRegForValue(Src1Value); + if (SrcReg1 == 0) + return false; + + unsigned SrcReg2; + if (!UseImm) { + SrcReg2 = getRegForValue(Src2Value); + if (SrcReg2 == 0) + return false; + } + + // We have i1, i8, or i16, we need to either zero extend or sign extend. + if (needsExt) { + SrcReg1 = EmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt); + if (SrcReg1 == 0) + return false; + if (!UseImm) { + SrcReg2 = EmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt); + if (SrcReg2 == 0) + return false; + } + } + + if (isICmp) { + if (UseImm) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) + .addReg(ZReg) + .addReg(SrcReg1) + .addImm(Imm) + .addImm(0); + else + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) + .addReg(ZReg) + .addReg(SrcReg1) + .addReg(SrcReg2); + } else { + if (UseImm) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) + .addReg(SrcReg1); + else + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) + .addReg(SrcReg1) + .addReg(SrcReg2); + } + return true; +} + +bool AArch64FastISel::SelectCmp(const Instruction *I) { + const CmpInst *CI = cast(I); + + // We may not handle every CC for now. + AArch64CC::CondCode CC = getCompareCC(CI->getPredicate()); + if (CC == AArch64CC::AL) + return false; + + // Emit the cmp. + if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) + return false; + + // Now set a register based on the comparison. 
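// [Editor's note: illustrative, not part of the patch. EmitCmp accepts a
// constant RHS only when its magnitude fits the 12-bit unsigned immediate
// of SUBS/ADDS ((Imm & 0xfff) == Imm); a negative constant is negated and
// the opcode flipped, since x - (-c) and x + c set the same flags:
//
//   cmp w0, #5      ; icmp eq i32 %x, 5   -> SUBSWri, Imm = 5
//   cmn w0, #5      ; icmp eq i32 %x, -5  -> ADDSWri, Imm = 5
//
// (CMP/CMN are the WZR-destination aliases of SUBS/ADDS used here.)
// Floating-point compares only special-case a positive-zero RHS, which
// maps to the FCMP-with-#0.0 forms (FCMPSri/FCMPDri). End note.]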
+ AArch64CC::CondCode invertedCC = getInvertedCondCode(CC); + unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr), + ResultReg) + .addReg(AArch64::WZR) + .addReg(AArch64::WZR) + .addImm(invertedCC); + + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectSelect(const Instruction *I) { + const SelectInst *SI = cast(I); + + EVT DestEVT = TLI.getValueType(SI->getType(), true); + if (!DestEVT.isSimple()) + return false; + + MVT DestVT = DestEVT.getSimpleVT(); + if (DestVT != MVT::i32 && DestVT != MVT::i64 && DestVT != MVT::f32 && + DestVT != MVT::f64) + return false; + + unsigned CondReg = getRegForValue(SI->getCondition()); + if (CondReg == 0) + return false; + unsigned TrueReg = getRegForValue(SI->getTrueValue()); + if (TrueReg == 0) + return false; + unsigned FalseReg = getRegForValue(SI->getFalseValue()); + if (FalseReg == 0) + return false; + + + MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ANDReg) + .addReg(CondReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri)) + .addReg(ANDReg) + .addReg(ANDReg) + .addImm(0) + .addImm(0); + + unsigned SelectOpc; + switch (DestVT.SimpleTy) { + default: + return false; + case MVT::i32: + SelectOpc = AArch64::CSELWr; + break; + case MVT::i64: + SelectOpc = AArch64::CSELXr; + break; + case MVT::f32: + SelectOpc = AArch64::FCSELSrrr; + break; + case MVT::f64: + SelectOpc = AArch64::FCSELDrrr; + break; + } + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SelectOpc), + ResultReg) + .addReg(TrueReg) + .addReg(FalseReg) + .addImm(AArch64CC::NE); + + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectFPExt(const Instruction *I) { + Value *V = I->getOperand(0); + if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy()) + return false; + + unsigned Op = getRegForValue(V); + if (Op == 0) + return false; + + unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr), + ResultReg).addReg(Op); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectFPTrunc(const Instruction *I) { + Value *V = I->getOperand(0); + if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy()) + return false; + + unsigned Op = getRegForValue(V); + if (Op == 0) + return false; + + unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr), + ResultReg).addReg(Op); + UpdateValueMap(I, ResultReg); + return true; +} + +// FPToUI and FPToSI +bool AArch64FastISel::SelectFPToInt(const Instruction *I, bool Signed) { + MVT DestVT; + if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) + return false; + + unsigned SrcReg = getRegForValue(I->getOperand(0)); + if (SrcReg == 0) + return false; + + EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); + if (SrcVT == MVT::f128) + return false; + + unsigned Opc; + if (SrcVT == MVT::f64) { + if (Signed) + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWDr : AArch64::FCVTZSUXDr; + else + Opc = (DestVT == MVT::i32) ? 
AArch64::FCVTZUUWDr : AArch64::FCVTZUUXDr; + } else { + if (Signed) + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWSr : AArch64::FCVTZSUXSr; + else + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWSr : AArch64::FCVTZUUXSr; + } + unsigned ResultReg = createResultReg( + DestVT == MVT::i32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) { + MVT DestVT; + if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) + return false; + assert ((DestVT == MVT::f32 || DestVT == MVT::f64) && + "Unexpected value type."); + + unsigned SrcReg = getRegForValue(I->getOperand(0)); + if (SrcReg == 0) + return false; + + EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); + + // Handle sign-extension. + if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) { + SrcReg = + EmitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed); + if (SrcReg == 0) + return false; + } + + MRI.constrainRegClass(SrcReg, SrcVT == MVT::i64 ? &AArch64::GPR64RegClass + : &AArch64::GPR32RegClass); + + unsigned Opc; + if (SrcVT == MVT::i64) { + if (Signed) + Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUXSri : AArch64::SCVTFUXDri; + else + Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUXSri : AArch64::UCVTFUXDri; + } else { + if (Signed) + Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUWSri : AArch64::SCVTFUWDri; + else + Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri; + } + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::ProcessCallArgs( + SmallVectorImpl &Args, SmallVectorImpl &ArgRegs, + SmallVectorImpl &ArgVTs, SmallVectorImpl &ArgFlags, + SmallVectorImpl &RegArgs, CallingConv::ID CC, + unsigned &NumBytes) { + SmallVector ArgLocs; + CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context); + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC)); + + // Get a count of how many bytes are to be pushed on the stack. + NumBytes = CCInfo.getNextStackOffset(); + + // Issue CALLSEQ_START + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) + .addImm(NumBytes); + + // Process the args. + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + unsigned Arg = ArgRegs[VA.getValNo()]; + MVT ArgVT = ArgVTs[VA.getValNo()]; + + // Handle arg promotion: SExt, ZExt, AExt. + switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::SExt: { + MVT DestVT = VA.getLocVT(); + MVT SrcVT = ArgVT; + Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false); + if (Arg == 0) + return false; + ArgVT = DestVT; + break; + } + case CCValAssign::AExt: + // Intentional fall-through. + case CCValAssign::ZExt: { + MVT DestVT = VA.getLocVT(); + MVT SrcVT = ArgVT; + Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true); + if (Arg == 0) + return false; + ArgVT = DestVT; + break; + } + default: + llvm_unreachable("Unknown arg promotion!"); + } + + // Now copy/store arg to correct locations. 
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
+      RegArgs.push_back(VA.getLocReg());
+    } else if (VA.needsCustom()) {
+      // FIXME: Handle custom args.
+      return false;
+    } else {
+      assert(VA.isMemLoc() && "Assuming store on stack.");
+
+      // Need to store on the stack.
+      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+
+      unsigned BEAlign = 0;
+      if (ArgSize < 8 && !Subtarget->isLittleEndian())
+        BEAlign = 8 - ArgSize;
+
+      Address Addr;
+      Addr.setKind(Address::RegBase);
+      Addr.setReg(AArch64::SP);
+      Addr.setOffset(VA.getLocMemOffset() + BEAlign);
+
+      if (!EmitStore(ArgVT, Arg, Addr))
+        return false;
+    }
+  }
+  return true;
+}
+
+bool AArch64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                                 const Instruction *I, CallingConv::ID CC,
+                                 unsigned &NumBytes) {
+  // Issue CALLSEQ_END
+  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
+      .addImm(NumBytes)
+      .addImm(0);
+
+  // Now the return value.
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC));
+
+    // Only handle a single return value.
+    if (RVLocs.size() != 1)
+      return false;
+
+    // Copy all of the result registers out of their specified physreg.
+    MVT CopyVT = RVLocs[0].getValVT();
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(RVLocs[0].getLocReg());
+    UsedRegs.push_back(RVLocs[0].getLocReg());
+
+    // Finally update the result.
+    UpdateValueMap(I, ResultReg);
+  }
+
+  return true;
+}
+
+bool AArch64FastISel::SelectCall(const Instruction *I,
+                                 const char *IntrMemName = nullptr) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Don't handle inline asm or intrinsics.
+  if (isa<InlineAsm>(Callee))
+    return false;
+
+  // Only handle global variable Callees.
+  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+  if (!GV)
+    return false;
+
+  // Check the calling convention.
+  ImmutableCallSite CS(CI);
+  CallingConv::ID CC = CS.getCallingConv();
+
+  // Let SDISel handle vararg functions.
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  if (FTy->isVarArg())
+    return false;
+
+  // Handle *simple* calls for now.
+  MVT RetVT;
+  Type *RetTy = I->getType();
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT))
+    return false;
+
+  // Set up the argument vectors.
+  SmallVector<Value *, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+  Args.reserve(CS.arg_size());
+  ArgRegs.reserve(CS.arg_size());
+  ArgVTs.reserve(CS.arg_size());
+  ArgFlags.reserve(CS.arg_size());
+
+  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+       i != e; ++i) {
+    // If we're lowering a memory intrinsic instead of a regular call, skip the
+    // last two arguments, which shouldn't be passed to the underlying function.
+    if (IntrMemName && e - i <= 2)
+      break;
+
+    unsigned Arg = getRegForValue(*i);
+    if (Arg == 0)
+      return false;
+
+    ISD::ArgFlagsTy Flags;
+    unsigned AttrInd = i - CS.arg_begin() + 1;
+    if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+      Flags.setZExt();
+
+    // FIXME: Only handle *easy* calls for now.
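// [Editor's note: not part of the patch. When IntrMemName is set, the
// call being lowered is a memcpy/memmove/memset libcall synthesized from
// the intrinsic, whose IR form in this LLVM version carries two trailing
// operands that libc does not take, the alignment and the i1 volatile
// flag:
//
//   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %d, i8* %s, i64 %n,
//                                        i32 4, i1 false)
//
// hence the "e - i <= 2" check in the loop above that drops them.
// End note.]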
+    if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
+        CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
+        CS.paramHasAttr(AttrInd, Attribute::Nest) ||
+        CS.paramHasAttr(AttrInd, Attribute::ByVal))
+      return false;
+
+    MVT ArgVT;
+    Type *ArgTy = (*i)->getType();
+    if (!isTypeLegal(ArgTy, ArgVT) &&
+        !(ArgVT == MVT::i1 || ArgVT == MVT::i8 || ArgVT == MVT::i16))
+      return false;
+
+    // We don't handle vector parameters yet.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64)
+      return false;
+
+    unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(*i);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Handle the arguments now that we've gotten them.
+  SmallVector<unsigned, 4> RegArgs;
+  unsigned NumBytes;
+  if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes))
+    return false;
+
+  // Issue the call.
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BL));
+  if (!IntrMemName)
+    MIB.addGlobalAddress(GV, 0, 0);
+  else
+    MIB.addExternalSymbol(IntrMemName, 0);
+
+  // Add implicit physical register uses to the call.
+  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+    MIB.addReg(RegArgs[i], RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes))
+    return false;
+
+  // Set all unused physreg defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
+bool AArch64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) {
+  if (Alignment)
+    return Len / Alignment <= 4;
+  else
+    return Len < 32;
+}
+
+bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src,
+                                         uint64_t Len, unsigned Alignment) {
+  // Make sure we don't bloat code by inlining very large memcpy calls.
+  if (!IsMemCpySmall(Len, Alignment))
+    return false;
+
+  int64_t UnscaledOffset = 0;
+  Address OrigDest = Dest;
+  Address OrigSrc = Src;
+
+  while (Len) {
+    MVT VT;
+    if (!Alignment || Alignment >= 8) {
+      if (Len >= 8)
+        VT = MVT::i64;
+      else if (Len >= 4)
+        VT = MVT::i32;
+      else if (Len >= 2)
+        VT = MVT::i16;
+      else {
+        VT = MVT::i8;
+      }
+    } else {
+      // Bound based on alignment.
+      if (Len >= 4 && Alignment == 4)
+        VT = MVT::i32;
+      else if (Len >= 2 && Alignment == 2)
+        VT = MVT::i16;
+      else {
+        VT = MVT::i8;
+      }
+    }
+
+    bool RV;
+    unsigned ResultReg;
+    RV = EmitLoad(VT, ResultReg, Src);
+    assert(RV == true && "Should be able to handle this load.");
+    RV = EmitStore(VT, ResultReg, Dest);
+    assert(RV == true && "Should be able to handle this store.");
+    (void)RV;
+
+    int64_t Size = VT.getSizeInBits() / 8;
+    Len -= Size;
+    UnscaledOffset += Size;
+
+    // We need to recompute the unscaled offset for each iteration.
+    Dest.setOffset(OrigDest.getOffset() + UnscaledOffset);
+    Src.setOffset(OrigSrc.getOffset() + UnscaledOffset);
+  }
+
+  return true;
+}
+
+bool AArch64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
+  // FIXME: Handle more intrinsics.
+  switch (I.getIntrinsicID()) {
+  default:
+    return false;
+  case Intrinsic::memcpy:
+  case Intrinsic::memmove: {
+    const MemTransferInst &MTI = cast<MemTransferInst>(I);
+    // Don't handle volatile.
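// [Editor's note: worked example, not part of the patch. With Len = 15
// and alignment >= 8, TryEmitSmallMemCpy above issues chunks of
// decreasing size, recomputing the source/destination offsets each round:
//
//   i64 load/store at offset 0   (Len 15 -> 7)
//   i32 load/store at offset 8   (Len  7 -> 3)
//   i16 load/store at offset 12  (Len  3 -> 1)
//   i8  load/store at offset 14  (Len  1 -> 0)
//
// IsMemCpySmall gates this at Len / Alignment <= 4 operations (or Len < 32
// when the alignment is unknown), so the inline expansion stays to a
// handful of instructions. End note.]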
+ if (MTI.isVolatile()) + return false; + + // Disable inlining for memmove before calls to ComputeAddress. Otherwise, + // we would emit dead code because we don't currently handle memmoves. + bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy); + if (isa(MTI.getLength()) && isMemCpy) { + // Small memcpy's are common enough that we want to do them without a call + // if possible. + uint64_t Len = cast(MTI.getLength())->getZExtValue(); + unsigned Alignment = MTI.getAlignment(); + if (IsMemCpySmall(Len, Alignment)) { + Address Dest, Src; + if (!ComputeAddress(MTI.getRawDest(), Dest) || + !ComputeAddress(MTI.getRawSource(), Src)) + return false; + if (TryEmitSmallMemCpy(Dest, Src, Len, Alignment)) + return true; + } + } + + if (!MTI.getLength()->getType()->isIntegerTy(64)) + return false; + + if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + const char *IntrMemName = isa(I) ? "memcpy" : "memmove"; + return SelectCall(&I, IntrMemName); + } + case Intrinsic::memset: { + const MemSetInst &MSI = cast(I); + // Don't handle volatile. + if (MSI.isVolatile()) + return false; + + if (!MSI.getLength()->getType()->isIntegerTy(64)) + return false; + + if (MSI.getDestAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + return SelectCall(&I, "memset"); + } + case Intrinsic::trap: { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK)) + .addImm(1); + return true; + } + } + return false; +} + +bool AArch64FastISel::SelectRet(const Instruction *I) { + const ReturnInst *Ret = cast(I); + const Function &F = *I->getParent()->getParent(); + + if (!FuncInfo.CanLowerReturn) + return false; + + if (F.isVarArg()) + return false; + + // Build a list of return value registers. + SmallVector RetRegs; + + if (Ret->getNumOperands() > 0) { + CallingConv::ID CC = F.getCallingConv(); + SmallVector Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector ValLocs; + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, + I->getContext()); + CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; + CCInfo.AnalyzeReturn(Outs, RetCC); + + // Only handle a single return value for now. + if (ValLocs.size() != 1) + return false; + + CCValAssign &VA = ValLocs[0]; + const Value *RV = Ret->getOperand(0); + + // Don't bother handling odd stuff for now. + if (VA.getLocInfo() != CCValAssign::Full) + return false; + // Only handle register returns for now. + if (!VA.isRegLoc()) + return false; + unsigned Reg = getRegForValue(RV); + if (Reg == 0) + return false; + + unsigned SrcReg = Reg + VA.getValNo(); + unsigned DestReg = VA.getLocReg(); + // Avoid a cross-class copy. This is very unlikely. + if (!MRI.getRegClass(SrcReg)->contains(DestReg)) + return false; + + EVT RVEVT = TLI.getValueType(RV->getType()); + if (!RVEVT.isSimple()) + return false; + + // Vectors (of > 1 lane) in big endian need tricky handling. + if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1) + return false; + + MVT RVVT = RVEVT.getSimpleVT(); + if (RVVT == MVT::f128) + return false; + MVT DestVT = VA.getValVT(); + // Special handling for extended integers. 
+ if (RVVT != DestVT) { + if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16) + return false; + + if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) + return false; + + bool isZExt = Outs[0].Flags.isZExt(); + SrcReg = EmitIntExt(RVVT, SrcReg, DestVT, isZExt); + if (SrcReg == 0) + return false; + } + + // Make the copy. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg); + + // Add register to return instruction. + RetRegs.push_back(VA.getLocReg()); + } + + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::RET_ReallyLR)); + for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) + MIB.addReg(RetRegs[i], RegState::Implicit); + return true; +} + +bool AArch64FastISel::SelectTrunc(const Instruction *I) { + Type *DestTy = I->getType(); + Value *Op = I->getOperand(0); + Type *SrcTy = Op->getType(); + + EVT SrcEVT = TLI.getValueType(SrcTy, true); + EVT DestEVT = TLI.getValueType(DestTy, true); + if (!SrcEVT.isSimple()) + return false; + if (!DestEVT.isSimple()) + return false; + + MVT SrcVT = SrcEVT.getSimpleVT(); + MVT DestVT = DestEVT.getSimpleVT(); + + if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && + SrcVT != MVT::i8) + return false; + if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8 && + DestVT != MVT::i1) + return false; + + unsigned SrcReg = getRegForValue(Op); + if (!SrcReg) + return false; + + // If we're truncating from i64 to a smaller non-legal type then generate an + // AND. Otherwise, we know the high bits are undefined and a truncate doesn't + // generate any code. + if (SrcVT == MVT::i64) { + uint64_t Mask = 0; + switch (DestVT.SimpleTy) { + default: + // Trunc i64 to i32 is handled by the target-independent fast-isel. + return false; + case MVT::i1: + Mask = 0x1; + break; + case MVT::i8: + Mask = 0xff; + break; + case MVT::i16: + Mask = 0xffff; + break; + } + // Issue an extract_subreg to get the lower 32-bits. + unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true, + AArch64::sub_32); + MRI.constrainRegClass(Reg32, &AArch64::GPR32RegClass); + // Create the AND instruction which performs the actual truncation. + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ANDReg) + .addReg(Reg32) + .addImm(AArch64_AM::encodeLogicalImmediate(Mask, 32)); + SrcReg = ANDReg; + } + + UpdateValueMap(I, SrcReg); + return true; +} + +unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { + assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 || + DestVT == MVT::i64) && + "Unexpected value type."); + // Handle i8 and i16 as i32. + if (DestVT == MVT::i8 || DestVT == MVT::i16) + DestVT = MVT::i32; + + if (isZExt) { + MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass); + unsigned ResultReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ResultReg) + .addReg(SrcReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + + if (DestVT == MVT::i64) { + // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the + // upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd. 
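// [Editor's note: elaboration of the comment above, not part of the
// patch. On AArch64 every write to a W register zeroes the top 32 bits of
// the enclosing X register, so "and w8, w9, #1" already yields a correct
// 64-bit zero-extension; the SUBREG_TO_REG merely re-labels the 32-bit
// value as the sub_32 lane of a fresh 64-bit virtual register and emits
// no instruction at all. End note.]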
+ unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Reg64) + .addImm(0) + .addReg(ResultReg) + .addImm(AArch64::sub_32); + ResultReg = Reg64; + } + return ResultReg; + } else { + if (DestVT == MVT::i64) { + // FIXME: We're SExt i1 to i64. + return 0; + } + unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SBFMWri), + ResultReg) + .addReg(SrcReg) + .addImm(0) + .addImm(0); + return ResultReg; + } +} + +unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, + bool isZExt) { + assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?"); + unsigned Opc; + unsigned Imm = 0; + + switch (SrcVT.SimpleTy) { + default: + return 0; + case MVT::i1: + return Emiti1Ext(SrcReg, DestVT, isZExt); + case MVT::i8: + if (DestVT == MVT::i64) + Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + else + Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri; + Imm = 7; + break; + case MVT::i16: + if (DestVT == MVT::i64) + Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + else + Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri; + Imm = 15; + break; + case MVT::i32: + assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?"); + Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + Imm = 31; + break; + } + + // Handle i8 and i16 as i32. + if (DestVT == MVT::i8 || DestVT == MVT::i16) + DestVT = MVT::i32; + else if (DestVT == MVT::i64) { + unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Src64) + .addImm(0) + .addReg(SrcReg) + .addImm(AArch64::sub_32); + SrcReg = Src64; + } + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg) + .addImm(0) + .addImm(Imm); + + return ResultReg; +} + +bool AArch64FastISel::SelectIntExt(const Instruction *I) { + // On ARM, in general, integer casts don't involve legal types; this code + // handles promotable integers. The high bits for a type smaller than + // the register size are assumed to be undefined. + Type *DestTy = I->getType(); + Value *Src = I->getOperand(0); + Type *SrcTy = Src->getType(); + + bool isZExt = isa(I); + unsigned SrcReg = getRegForValue(Src); + if (!SrcReg) + return false; + + EVT SrcEVT = TLI.getValueType(SrcTy, true); + EVT DestEVT = TLI.getValueType(DestTy, true); + if (!SrcEVT.isSimple()) + return false; + if (!DestEVT.isSimple()) + return false; + + MVT SrcVT = SrcEVT.getSimpleVT(); + MVT DestVT = DestEVT.getSimpleVT(); + unsigned ResultReg = EmitIntExt(SrcVT, SrcReg, DestVT, isZExt); + if (ResultReg == 0) + return false; + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) { + EVT DestEVT = TLI.getValueType(I->getType(), true); + if (!DestEVT.isSimple()) + return false; + + MVT DestVT = DestEVT.getSimpleVT(); + if (DestVT != MVT::i64 && DestVT != MVT::i32) + return false; + + unsigned DivOpc; + bool is64bit = (DestVT == MVT::i64); + switch (ISDOpcode) { + default: + return false; + case ISD::SREM: + DivOpc = is64bit ? AArch64::SDIVXr : AArch64::SDIVWr; + break; + case ISD::UREM: + DivOpc = is64bit ? AArch64::UDIVXr : AArch64::UDIVWr; + break; + } + unsigned MSubOpc = is64bit ? 
AArch64::MSUBXrrr : AArch64::MSUBWrrr; + unsigned Src0Reg = getRegForValue(I->getOperand(0)); + if (!Src0Reg) + return false; + + unsigned Src1Reg = getRegForValue(I->getOperand(1)); + if (!Src1Reg) + return false; + + unsigned QuotReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), QuotReg) + .addReg(Src0Reg) + .addReg(Src1Reg); + // The remainder is computed as numerator - (quotient * denominator) using the + // MSUB instruction. + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MSubOpc), ResultReg) + .addReg(QuotReg) + .addReg(Src1Reg) + .addReg(Src0Reg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectMul(const Instruction *I) { + EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true); + if (!SrcEVT.isSimple()) + return false; + MVT SrcVT = SrcEVT.getSimpleVT(); + + // Must be simple value type. Don't handle vectors. + if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && + SrcVT != MVT::i8) + return false; + + unsigned Opc; + unsigned ZReg; + switch (SrcVT.SimpleTy) { + default: + return false; + case MVT::i8: + case MVT::i16: + case MVT::i32: + ZReg = AArch64::WZR; + Opc = AArch64::MADDWrrr; + break; + case MVT::i64: + ZReg = AArch64::XZR; + Opc = AArch64::MADDXrrr; + break; + } + + unsigned Src0Reg = getRegForValue(I->getOperand(0)); + if (!Src0Reg) + return false; + + unsigned Src1Reg = getRegForValue(I->getOperand(1)); + if (!Src1Reg) + return false; + + // Create the base instruction, then add the operands. + unsigned ResultReg = createResultReg(TLI.getRegClassFor(SrcVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(Src0Reg) + .addReg(Src1Reg) + .addReg(ZReg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::TargetSelectInstruction(const Instruction *I) { + switch (I->getOpcode()) { + default: + break; + case Instruction::Load: + return SelectLoad(I); + case Instruction::Store: + return SelectStore(I); + case Instruction::Br: + return SelectBranch(I); + case Instruction::IndirectBr: + return SelectIndirectBr(I); + case Instruction::FCmp: + case Instruction::ICmp: + return SelectCmp(I); + case Instruction::Select: + return SelectSelect(I); + case Instruction::FPExt: + return SelectFPExt(I); + case Instruction::FPTrunc: + return SelectFPTrunc(I); + case Instruction::FPToSI: + return SelectFPToInt(I, /*Signed=*/true); + case Instruction::FPToUI: + return SelectFPToInt(I, /*Signed=*/false); + case Instruction::SIToFP: + return SelectIntToFP(I, /*Signed=*/true); + case Instruction::UIToFP: + return SelectIntToFP(I, /*Signed=*/false); + case Instruction::SRem: + return SelectRem(I, ISD::SREM); + case Instruction::URem: + return SelectRem(I, ISD::UREM); + case Instruction::Call: + if (const IntrinsicInst *II = dyn_cast(I)) + return SelectIntrinsicCall(*II); + return SelectCall(I); + case Instruction::Ret: + return SelectRet(I); + case Instruction::Trunc: + return SelectTrunc(I); + case Instruction::ZExt: + case Instruction::SExt: + return SelectIntExt(I); + case Instruction::Mul: + // FIXME: This really should be handled by the target-independent selector. + return SelectMul(I); + } + return false; + // Silence warnings. 
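// [Editor's note: not part of the patch. SelectRem above relies on the
// identity n % d == n - (n / d) * d, since AArch64 has no hardware
// remainder instruction. MSUB Rd, Rn, Rm, Ra computes Ra - Rn * Rm, so
// the emitted sequence is:
//
//   sdiv w8, w0, w1        ; q = n / d
//   msub w9, w8, w1, w0    ; r = n - q * d
//
// e.g. n = 23, d = 5 gives q = 4 and r = 23 - 20 = 3. End note.]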
+ (void)&CC_AArch64_DarwinPCS_VarArg; +} + +namespace llvm { +llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) { + return new AArch64FastISel(funcInfo, libInfo); +} +} diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index b29587a..deb306a 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1,4 +1,4 @@ -//===- AArch64FrameLowering.cpp - AArch64 Frame Information ---------------===// +//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====// // // The LLVM Compiler Infrastructure // @@ -11,227 +11,444 @@ // //===----------------------------------------------------------------------===// -#include "AArch64.h" #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/MC/MachineLocation.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; -void AArch64FrameLowering::splitSPAdjustments(uint64_t Total, - uint64_t &Initial, - uint64_t &Residual) const { - // 0x1f0 here is a pessimistic (i.e. realistic) boundary: x-register LDP - // instructions have a 7-bit signed immediate scaled by 8, giving a reach of - // 0x1f8, but stack adjustment should always be a multiple of 16. - if (Total <= 0x1f0) { - Initial = Total; - Residual = 0; - } else { - Initial = 0x1f0; - Residual = Total - Initial; +#define DEBUG_TYPE "frame-info" + +static cl::opt EnableRedZone("aarch64-redzone", + cl::desc("enable use of redzone on AArch64"), + cl::init(false), cl::Hidden); + +STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); + +static unsigned estimateStackSize(MachineFunction &MF) { + const MachineFrameInfo *FFI = MF.getFrameInfo(); + int Offset = 0; + for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { + int FixedOff = -FFI->getObjectOffset(i); + if (FixedOff > Offset) + Offset = FixedOff; + } + for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) { + if (FFI->isDeadObjectIndex(i)) + continue; + Offset += FFI->getObjectSize(i); + unsigned Align = FFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = (Offset + Align - 1) / Align * Align; } + // This does not include the 16 bytes used for fp and lr. + return (unsigned)Offset; } -void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { - AArch64MachineFunctionInfo *FuncInfo = - MF.getInfo(); - MachineBasicBlock &MBB = MF.front(); - MachineBasicBlock::iterator MBBI = MBB.begin(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); +bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { + if (!EnableRedZone) + return false; + // Don't use the red zone if the function explicitly asks us not to. 
+  // This is typically used for kernel code.
+  if (MF.getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::NoRedZone))
+    return false;
+
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  unsigned NumBytes = AFI->getLocalStackSize();
+
+  // Note: currently hasFP() is always true for hasCalls(), but that's an
+  // implementation detail of the current code, not a strict requirement,
+  // so stay safe here and check both.
+  if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128)
+    return false;
+  return true;
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.
+bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+#ifndef NDEBUG
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  assert(!RegInfo->needsStackRealignment(MF) &&
+         "No stack realignment on AArch64!");
+#endif
+
+  return (MFI->hasCalls() || MFI->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken());
+}
+
+/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+/// not required, we reserve argument space for call sites in the function
+/// immediately on entry to the current function. This eliminates the need for
+/// add/sub sp brackets around call sites. Returns true if the call frame is
+/// included as part of the stack frame.
+bool
+AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+void AArch64FrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  const AArch64InstrInfo *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+  DebugLoc DL = I->getDebugLoc();
+  int Opc = I->getOpcode();
+  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  if (!TFI->hasReservedCallFrame(MF)) {
+    unsigned Align = getStackAlignment();
+
+    int64_t Amount = I->getOperand(0).getImm();
+    Amount = RoundUpToAlignment(Amount, Align);
+    if (!IsDestroy)
+      Amount = -Amount;
+
+    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
+    // doesn't have to pop anything), then the first operand will be zero too so
+    // this adjustment is a no-op.
+    if (CalleePopAmount == 0) {
+      // FIXME: in-function stack adjustment for calls is limited to 24-bits
+      // because there's no guaranteed temporary register available.
+      //
+      // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
+      // 1) For offset <= 12-bit, we use LSL #0
+      // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
+      //    LSL #0, and the other uses LSL #12.
+      //
+      // Mostly call frames will be allocated at the start of a function so
+      // this is OK, but it is a limitation that needs dealing with.
+      assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
+      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII);
+    }
+  } else if (CalleePopAmount != 0) {
+    // If the calling convention demands that the callee pops arguments from the
+    // stack, we want to add it back if we have a reserved call frame.
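// [Editor's note: not part of the patch. RoundUpToAlignment above is the
// usual power-of-two rounding helper from llvm/Support/MathExtras.h,
// i.e. ((Value + Align - 1) / Align) * Align, so with the 16-byte stack
// alignment an SP adjustment of 24 bytes is widened to 32 before
// emitFrameOffset materializes it; the sign flip for !IsDestroy turns the
// CALLSEQ_START allocation into a subtraction from SP. End note.]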
+ assert(CalleePopAmount < 0xffffff && "call frame too large"); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, + TII); + } + MBB.erase(I); +} + +void AArch64FrameLowering::emitCalleeSavedFrameMoves( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned FramePtr) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - bool NeedsFrameMoves = MMI.hasDebugInfo() - || MF.getFunction()->needsUnwindTableEntry(); - - uint64_t NumInitialBytes, NumResidualBytes; - - // Currently we expect the stack to be laid out by - // sub sp, sp, #initial - // stp x29, x30, [sp, #offset] - // ... - // str xxx, [sp, #offset] - // sub sp, sp, #rest (possibly via extra instructions). - if (MFI->getCalleeSavedInfo().size()) { - // If there are callee-saved registers, we want to store them efficiently as - // a block, and virtual base assignment happens too early to do it for us so - // we adjust the stack in two phases: first just for callee-saved fiddling, - // then to allocate the rest of the frame. - splitSPAdjustments(MFI->getStackSize(), NumInitialBytes, NumResidualBytes); - } else { - // If there aren't any callee-saved registers, two-phase adjustment is - // inefficient. It's more efficient to adjust with NumInitialBytes too - // because when we're in a "callee pops argument space" situation, that pop - // must be tacked onto Initial for correctness. - NumInitialBytes = MFI->getStackSize(); - NumResidualBytes = 0; - } + const AArch64InstrInfo *TII = TM.getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + // Add callee saved registers to move list. + const std::vector &CSI = MFI->getCalleeSavedInfo(); + if (CSI.empty()) + return; + + const DataLayout *TD = MF.getTarget().getDataLayout(); + bool HasFP = hasFP(MF); + + // Calculate amount of bytes used for return address storing. + int stackGrowth = -TD->getPointerSize(0); + + // Calculate offsets. + int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth; + unsigned TotalSkipped = 0; + for (const auto &Info : CSI) { + unsigned Reg = Info.getReg(); + int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) - + getOffsetOfLocalArea() + saveAreaOffset; + + // Don't output a new CFI directive if we're re-saving the frame pointer or + // link register. This happens when the PrologEpilogInserter has inserted an + // extra "STP" of the frame pointer and link register -- the "emitPrologue" + // method automatically generates the directives when frame pointers are + // used. If we generate CFI directives for the extra "STP"s, the linker will + // lose track of the correct values for the frame pointer and link register. + if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) { + TotalSkipped += stackGrowth; + continue; + } - // Tell everyone else how much adjustment we're expecting them to use. In - // particular if an adjustment is required for a tail call the epilogue could - // have a different view of things. - FuncInfo->setInitialStackAdjust(NumInitialBytes); - - emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16, -NumInitialBytes, - MachineInstr::FrameSetup); - - if (NeedsFrameMoves && NumInitialBytes) { - // We emit this update even if the CFA is set from a frame pointer later so - // that the CFA is valid in the interim. 
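// [Editor's note: not part of the patch. With stackGrowth == -8, the loop
// in emitCalleeSavedFrameMoves records one .cfi_offset per callee-saved
// register relative to the CFA, skipping x29/x30 when a frame pointer is
// used because emitPrologue already describes those slots; TotalSkipped
// keeps the remaining offsets packed. A save area holding x28 and x27
// under a frame pointer thus yields, e.g.:
//
//   .cfi_offset w27, -24
//   .cfi_offset w28, -32
//
// matching the worked diagram further down in emitPrologue. End note.]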
- MachineLocation Dst(MachineLocation::VirtualFP); - unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfa(nullptr, Reg, -NumInitialBytes)); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( + nullptr, DwarfReg, Offset - TotalSkipped)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } +} - // Otherwise we need to set the frame pointer and/or add a second stack - // adjustment. - - bool FPNeedsSetting = hasFP(MF); - for (; MBBI != MBB.end(); ++MBBI) { - // Note that this search makes strong assumptions about the operation used - // to store the frame-pointer: it must be "STP x29, x30, ...". This could - // change in future, but until then there's no point in implementing - // untestable more generic cases. - if (FPNeedsSetting && MBBI->getOpcode() == AArch64::LSPair64_STR - && MBBI->getOperand(0).getReg() == AArch64::X29) { - int64_t X29FrameIdx = MBBI->getOperand(2).getIndex(); - FuncInfo->setFramePointerOffset(MFI->getObjectOffset(X29FrameIdx)); - - ++MBBI; - emitRegUpdate(MBB, MBBI, DL, TII, AArch64::X29, AArch64::XSP, - AArch64::X29, - NumInitialBytes + MFI->getObjectOffset(X29FrameIdx), - MachineInstr::FrameSetup); +void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. + MachineBasicBlock::iterator MBBI = MBB.begin(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *Fn = MF.getFunction(); + const AArch64RegisterInfo *RegInfo = TM.getRegisterInfo(); + const AArch64InstrInfo *TII = TM.getInstrInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + AArch64FunctionInfo *AFI = MF.getInfo(); + bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); + bool HasFP = hasFP(MF); + DebugLoc DL = MBB.findDebugLoc(MBBI); - // The offset adjustment used when emitting debugging locations relative - // to whatever frame base is set. AArch64 uses the default frame base (FP - // or SP) and this adjusts the calculations to be correct. - MFI->setOffsetAdjustment(- MFI->getObjectOffset(X29FrameIdx) - - MFI->getStackSize()); - - if (NeedsFrameMoves) { - unsigned Reg = MRI->getDwarfRegNum(AArch64::X29, true); - unsigned Offset = MFI->getObjectOffset(X29FrameIdx); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfa(nullptr, Reg, Offset)); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } + int NumBytes = (int)MFI->getStackSize(); + if (!AFI->hasStackFrame()) { + assert(!HasFP && "unexpected function without stack frame but with FP"); + + // All of the stack allocation is for locals. + AFI->setLocalStackSize(NumBytes); - FPNeedsSetting = false; + // Label used to tie together the PROLOG_LABEL and the MachineMoves. + MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol(); + + // REDZONE: If the stack size is less than 128 bytes, we don't need + // to actually allocate. + if (NumBytes && !canUseRedZone(MF)) { + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); + + // Encode the stack size of the leaf function. 
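// [Editor's note: not part of the patch. The red zone lets a leaf
// function keep up to 128 bytes of locals below SP without adjusting it,
// so the SP-moving prologue and epilogue disappear entirely. canUseRedZone
// (above) requires the -aarch64-redzone flag, no calls, no frame pointer,
// no noredzone attribute, and locals of at most 128 bytes; a qualifying
// function then addresses its locals at negative offsets such as:
//
//   str w0, [sp, #-4]    ; no "sub sp, sp, #16" was emitted
//
// This is only safe where the ABI guarantees the area below SP is not
// clobbered asynchronously, which is why kernel code opts out. End note.]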
+ unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } else if (NumBytes) { + ++NumRedZoneFunctions; } - if (!MBBI->getFlag(MachineInstr::FrameSetup)) - break; + return; } - assert(!FPNeedsSetting && "Frame pointer couldn't be set"); + // Only set up FP if we actually need to. + int FPOffset = 0; + if (HasFP) { + // First instruction must a) allocate the stack and b) have an immediate + // that is a multiple of -2. + assert((MBBI->getOpcode() == AArch64::STPXpre || + MBBI->getOpcode() == AArch64::STPDpre) && + MBBI->getOperand(3).getReg() == AArch64::SP && + MBBI->getOperand(4).getImm() < 0 && + (MBBI->getOperand(4).getImm() & 1) == 0); + + // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space + // required for the callee saved register area we get the frame pointer + // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8. + FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8; + assert(FPOffset >= 0 && "Bad Framepointer Offset"); + } - emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16, -NumResidualBytes, - MachineInstr::FrameSetup); + // Move past the saves of the callee-saved registers. + while (MBBI->getOpcode() == AArch64::STPXi || + MBBI->getOpcode() == AArch64::STPDi || + MBBI->getOpcode() == AArch64::STPXpre || + MBBI->getOpcode() == AArch64::STPDpre) { + ++MBBI; + NumBytes -= 16; + } + assert(NumBytes >= 0 && "Negative stack allocation size!?"); + if (HasFP) { + // Issue sub fp, sp, FPOffset or + // mov fp,sp when FPOffset is zero. + // Note: All stores of callee-saved registers are marked as "FrameSetup". + // This code marks the instruction(s) that set the FP also. + emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII, + MachineInstr::FrameSetup); + } - // Now we emit the rest of the frame setup information, if necessary: we've - // already noted the FP and initial SP moves so we're left with the prologue's - // final SP update and callee-saved register locations. - if (!NeedsFrameMoves) - return; + // All of the remaining stack allocations are for locals. + AFI->setLocalStackSize(NumBytes); - // The rest of the stack adjustment - if (!hasFP(MF) && NumResidualBytes) { - MachineLocation Dst(MachineLocation::VirtualFP); - unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true); - unsigned Offset = NumResidualBytes + NumInitialBytes; - unsigned CFIIndex = - MMI.addFrameInst(MCCFIInstruction::createDefCfa(nullptr, Reg, -Offset)); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + // Allocate space for the rest of the frame. + if (NumBytes) { + // If we're a leaf function, try using the red zone. + if (!canUseRedZone(MF)) + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); } - // And any callee-saved registers (it's fine to leave them to the end here, - // because the old values are still valid at this point. - const std::vector &CSI = MFI->getCalleeSavedInfo(); - if (CSI.size()) { - for (std::vector::const_iterator I = CSI.begin(), - E = CSI.end(); I != E; ++I) { - unsigned Offset = MFI->getObjectOffset(I->getFrameIdx()); - unsigned Reg = MRI->getDwarfRegNum(I->getReg(), true); + // If we need a base pointer, set it up here. It's whatever the value of the + // stack pointer is at this point. 
Any variable size objects will be allocated + // after this, so we can still use the base pointer to reference locals. + // + // FIXME: Clarify FrameSetup flags here. + // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is + // needed. + // + if (RegInfo->hasBasePointer(MF)) + TII->copyPhysReg(MBB, MBBI, DL, AArch64::X19, AArch64::SP, false); + + if (needsFrameMoves) { + const DataLayout *TD = MF.getTarget().getDataLayout(); + const int StackGrowth = -TD->getPointerSize(0); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + // An example of the prologue: + // + // .globl __foo + // .align 2 + // __foo: + // Ltmp0: + // .cfi_startproc + // .cfi_personality 155, ___gxx_personality_v0 + // Leh_func_begin: + // .cfi_lsda 16, Lexception33 + // + // stp xa,bx, [sp, -#offset]! + // ... + // stp x28, x27, [sp, #offset-32] + // stp fp, lr, [sp, #offset-16] + // add fp, sp, #offset - 16 + // sub sp, sp, #1360 + // + // The Stack: + // +-------------------------------------------+ + // 10000 | ........ | ........ | ........ | ........ | + // 10004 | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // 10008 | ........ | ........ | ........ | ........ | + // 1000c | ........ | ........ | ........ | ........ | + // +===========================================+ + // 10010 | X28 Register | + // 10014 | X28 Register | + // +-------------------------------------------+ + // 10018 | X27 Register | + // 1001c | X27 Register | + // +===========================================+ + // 10020 | Frame Pointer | + // 10024 | Frame Pointer | + // +-------------------------------------------+ + // 10028 | Link Register | + // 1002c | Link Register | + // +===========================================+ + // 10030 | ........ | ........ | ........ | ........ | + // 10034 | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // 10038 | ........ | ........ | ........ | ........ | + // 1003c | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // + // [sp] = 10030 :: >>initial value<< + // sp = 10020 :: stp fp, lr, [sp, #-16]! + // fp = sp == 10020 :: mov fp, sp + // [sp] == 10020 :: stp x28, x27, [sp, #-16]! + // sp == 10010 :: >>final value<< + // + // The frame pointer (w29) points to address 10020. If we use an offset of + // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 + // for w27, and -32 for w28: + // + // Ltmp1: + // .cfi_def_cfa w29, 16 + // Ltmp2: + // .cfi_offset w30, -8 + // Ltmp3: + // .cfi_offset w29, -16 + // Ltmp4: + // .cfi_offset w27, -24 + // Ltmp5: + // .cfi_offset w28, -32 + + if (HasFP) { + // Define the current CFA rule to use the provided FP. 
+    if (HasFP) {
+      // Define the current CFA rule to use the provided FP.
+      unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
       unsigned CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createOffset(nullptr, Reg, Offset));
-      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+
+      // Record the location of the stored LR
+      unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true);
+      CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, LR, StackGrowth));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+
+      // Record the location of the stored FP
+      CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    } else {
+      // Encode the stack size of the leaf function.
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize()));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
+
+    // Now emit the moves for whatever callee saved regs we have.
+    emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr);
   }
 }

-void
-AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
-                                   MachineBasicBlock &MBB) const {
-  AArch64MachineFunctionInfo *FuncInfo =
-    MF.getInfo<AArch64MachineFunctionInfo>();
+static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) {
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    if (Reg == CSRegs[i])
+      return true;
+  return false;
+}
+
+static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) {
+  unsigned RtIdx = 0;
+  if (MI->getOpcode() == AArch64::LDPXpost ||
+      MI->getOpcode() == AArch64::LDPDpost)
+    RtIdx = 1;
+
+  if (MI->getOpcode() == AArch64::LDPXpost ||
+      MI->getOpcode() == AArch64::LDPDpost ||
+      MI->getOpcode() == AArch64::LDPXi || MI->getOpcode() == AArch64::LDPDi) {
+    if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) ||
+        !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) ||
+        MI->getOperand(RtIdx + 2).getReg() != AArch64::SP)
+      return false;
+    return true;
+  }
+  return false;
+}
+
+void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
+                                        MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const AArch64InstrInfo *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
   DebugLoc DL = MBBI->getDebugLoc();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-  MachineFrameInfo &MFI = *MF.getFrameInfo();
   unsigned RetOpcode = MBBI->getOpcode();
+  int NumBytes = MFI->getStackSize();
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();

+  // Initial and residual are named for consistency with the prologue. Note that
   // in the epilogue, the residual adjustment is executed first.
-  uint64_t NumInitialBytes = FuncInfo->getInitialStackAdjust();
-  uint64_t NumResidualBytes = MFI.getStackSize() - NumInitialBytes;
   uint64_t ArgumentPopSize = 0;
-  if (RetOpcode == AArch64::TC_RETURNdi ||
-      RetOpcode == AArch64::TC_RETURNxi) {
-    MachineOperand &JumpTarget = MBBI->getOperand(0);
+  if (RetOpcode == AArch64::TCRETURNdi || RetOpcode == AArch64::TCRETURNri) {
     MachineOperand &StackAdjust = MBBI->getOperand(1);
-    MachineInstrBuilder MIB;
-    if (RetOpcode == AArch64::TC_RETURNdi) {
-      MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::TAIL_Bimm));
-      if (JumpTarget.isGlobal()) {
-        MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
-                             JumpTarget.getTargetFlags());
-      } else {
-        assert(JumpTarget.isSymbol() && "unexpected tail call destination");
-        MIB.addExternalSymbol(JumpTarget.getSymbolName(),
-                              JumpTarget.getTargetFlags());
-      }
-    } else {
-      assert(RetOpcode == AArch64::TC_RETURNxi && JumpTarget.isReg()
-             && "Unexpected tail call");
-
-      MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::TAIL_BRx));
-      MIB.addReg(JumpTarget.getReg(), RegState::Kill);
-    }
-
-    // Add the extra operands onto the new tail call instruction even though
-    // they're not used directly (so that liveness is tracked properly etc).
-    for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i)
-      MIB->addOperand(MBBI->getOperand(i));
-
-    // Delete the pseudo instruction TC_RETURN.
-    MachineInstr *NewMI = std::prev(MBBI);
-    MBB.erase(MBBI);
-    MBBI = NewMI;
-
     // For a tail-call in a callee-pops-arguments environment, some or all of
     // the stack may actually be in use for the call's arguments, this is
     // calculated during LowerCall and consumed here...
@@ -241,386 +458,434 @@ AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     // conveniently stored in the MachineFunctionInfo by
     // LowerFormalArguments. This will, of course, be zero for the C calling
     // convention.
-    ArgumentPopSize = FuncInfo->getArgumentStackToRestore();
+    ArgumentPopSize = AFI->getArgumentStackToRestore();
   }

-  assert(NumInitialBytes % 16 == 0 && NumResidualBytes % 16 == 0
-         && "refusing to adjust stack by misaligned amt");
-
-  // We may need to address callee-saved registers differently, so find out the
-  // bound on the frame indices.
-  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
-  int MinCSFI = 0;
-  int MaxCSFI = -1;
-
-  if (CSI.size()) {
-    MinCSFI = CSI[0].getFrameIdx();
-    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+  // The stack frame should look like the following,
+  //
+  //      ----------------------                     ---
+  //      |                    |                      |
+  //      | BytesInStackArgArea|              CalleeArgStackSize
+  //      | (NumReusableBytes) |                (of tail call)
+  //      |                    |                     ---
+  //      |                    |                      |
+  //      ---------------------|        ---           |
+  //      |                    |         |            |
+  //      |   CalleeSavedReg   |         |            |
+  //      | (NumRestores * 16) |         |            |
+  //      |                    |         |            |
+  //      ---------------------|         |         NumBytes
+  //      |                    |     StackSize  (StackAdjustUp)
+  //      |   LocalStackSize   |         |            |
+  //      | (covering callee   |         |            |
+  //      |       args)        |         |            |
+  //      |                    |         |            |
+  //      ----------------------        ---          ---
+  //
+  // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
+  //             = StackSize + ArgumentPopSize
+  //
+  // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
+  // it as the 2nd argument of AArch64ISD::TC_RETURN.
+  NumBytes += ArgumentPopSize;
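The NumBytes bookkeeping is easy to see in isolation. A small sketch (plain
C++ added for illustration; the concrete sizes are assumed example values, not
taken from the patch):

    #include <cassert>

    int main() {
      // Suppose the function allocated a 96-byte frame, 32 bytes of which
      // hold two callee-saved register pairs, and a tail call pops 16 bytes
      // of argument space.
      int StackSize = 96, ArgumentPopSize = 16, NumRestores = 2;
      int NumBytes = StackSize + ArgumentPopSize; // 112
      // After walking past the restore instructions below, only the local
      // area remains for the final SP adjustment.
      NumBytes -= NumRestores * 16;               // 80
      assert(NumBytes == 80);
      return 0;
    }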
+  unsigned NumRestores = 0;
+  // Move past the restores of the callee-saved registers.
+  MachineBasicBlock::iterator LastPopI = MBBI;
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+  if (LastPopI != MBB.begin()) {
+    do {
+      ++NumRestores;
+      --LastPopI;
+    } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs));
+    if (!isCSRestore(LastPopI, CSRegs)) {
+      ++LastPopI;
+      --NumRestores;
+    }
+  }
-
-  // The "residual" stack update comes first from this direction and guarantees
-  // that SP is NumInitialBytes below its value on function entry, either by a
-  // direct update or restoring it from the frame pointer.
-  if (NumInitialBytes + ArgumentPopSize != 0) {
-    emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16,
-                 NumInitialBytes + ArgumentPopSize);
-    --MBBI;
+  NumBytes -= NumRestores * 16;
+  assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+  if (!hasFP(MF)) {
+    // If this was a redzone leaf function, we don't need to restore the
+    // stack pointer.
+    if (!canUseRedZone(MF))
+      emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes,
+                      TII);
+    return;
   }

+  // Restore the original stack pointer.
+  // FIXME: Rather than doing the math here, we should instead just use
+  // non-post-indexed loads for the restores if we aren't actually going to
+  // be able to save any instructions.
+  if (NumBytes || MFI->hasVarSizedObjects())
+    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
+                    -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags);
+}

-  // MBBI now points to the instruction just past the last callee-saved
-  // restoration (either RET/B if NumInitialBytes == 0, or the "ADD sp, sp"
-  // otherwise).
+/// getFrameIndexOffset - Returns the displacement from the frame register to
+/// the stack frame of the specified index.
+int AArch64FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                              int FI) const {
+  unsigned FrameReg;
+  return getFrameIndexReference(MF, FI, FrameReg);
+}

-  // Now we need to find out where to put the bulk of the stack adjustment
-  MachineBasicBlock::iterator FirstEpilogue = MBBI;
-  while (MBBI != MBB.begin()) {
-    --MBBI;
+/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
+/// debug info. It's the same as what we use for resolving the code-gen
+/// references for now. FIXME: This can go wrong when references are
+/// SP-relative and simple call frames aren't used.
+int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
+                                                 int FI,
+                                                 unsigned &FrameReg) const {
+  return resolveFrameIndexReference(MF, FI, FrameReg);
+}

-    unsigned FrameOp;
-    for (FrameOp = 0; FrameOp < MBBI->getNumOperands(); ++FrameOp) {
-      if (MBBI->getOperand(FrameOp).isFI())
-        break;
+int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
+                                                     int FI, unsigned &FrameReg,
+                                                     bool PreferFP) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  int FPOffset = MFI->getObjectOffset(FI) + 16;
+  int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
+  bool isFixed = MFI->isFixedObjectIndex(FI);
+
+  // Use frame pointer to reference fixed objects. Use it for locals if
+  // there are VLAs (and thus the SP isn't reliable as a base).
+  // Make sure useFPForScavengingIndex() does the right thing for the emergency
+  // spill slot.
+  bool UseFP = false;
+  if (AFI->hasStackFrame()) {
+    // Note: Keeping the following as multiple 'if' statements rather than
+    // merging to a single expression for readability.
+    //
+    // Argument access should always use the FP.
+    if (isFixed) {
+      UseFP = hasFP(MF);
+    } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF)) {
+      // Use SP or FP, whichever gives us the best chance of the offset
+      // being in range for direct access. If the FPOffset is positive,
+      // that'll always be best, as the SP will be even further away.
+      // If the FPOffset is negative, we have to keep in mind that the
+      // available offset range for negative offsets is smaller than for
+      // positive ones. If we have variable sized objects, we're stuck with
+      // using the FP regardless, though, as the SP offset is unknown
+      // and we don't have a base pointer available. If an offset is
+      // available via the FP and the SP, use whichever is closest.
+      if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 ||
+          (FPOffset >= -256 && Offset > -FPOffset))
+        UseFP = true;
     }
-
-    // If this instruction doesn't have a frame index we've reached the end of
-    // the callee-save restoration.
-    if (FrameOp == MBBI->getNumOperands())
-      break;
-
-    // Likewise if it *is* a local reference, but not to a callee-saved object.
-    int FrameIdx = MBBI->getOperand(FrameOp).getIndex();
-    if (FrameIdx < MinCSFI || FrameIdx > MaxCSFI)
-      break;
-
-    FirstEpilogue = MBBI;
   }

-  if (MF.getFrameInfo()->hasVarSizedObjects()) {
-    int64_t StaticFrameBase;
-    StaticFrameBase = -(NumInitialBytes + FuncInfo->getFramePointerOffset());
-    emitRegUpdate(MBB, FirstEpilogue, DL, TII,
-                  AArch64::XSP, AArch64::X29, AArch64::NoRegister,
-                  StaticFrameBase);
-  } else {
-    emitSPUpdate(MBB, FirstEpilogue, DL, TII, AArch64::X16, NumResidualBytes);
+  if (UseFP) {
+    FrameReg = RegInfo->getFrameRegister(MF);
+    return FPOffset;
   }
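The core of the FP-versus-SP heuristic above can be read off in isolation
(ignoring the fixed-object and base-pointer gating). A minimal sketch (plain
C++ for illustration; the offsets are assumed example values):

    // Mirror of the heuristic: prefer whichever base register puts the
    // offset in range of the available addressing modes.
    bool useFPFor(int FPOffset, int SPOffset, bool PreferFP, bool HasVLAs) {
      return PreferFP || HasVLAs || FPOffset >= 0 ||
             (FPOffset >= -256 && SPOffset > -FPOffset);
    }

    int main() {
      // A local 8 bytes below the FP in a 512-byte frame: FP wins, since
      // the unscaled [fp, #-8] form is in range and the SP-relative offset
      // would be 488 bytes away.
      return useFPFor(/*FPOffset=*/-8, /*SPOffset=*/488, false, false) ? 0 : 1;
    }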
-}

-int64_t
-AArch64FrameLowering::resolveFrameIndexReference(MachineFunction &MF,
-                                                 int FrameIndex,
-                                                 unsigned &FrameReg,
-                                                 int SPAdj,
-                                                 bool IsCalleeSaveOp) const {
-  AArch64MachineFunctionInfo *FuncInfo =
-    MF.getInfo<AArch64MachineFunctionInfo>();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-
-  int64_t TopOfFrameOffset = MFI->getObjectOffset(FrameIndex);
-
-  assert(!(IsCalleeSaveOp && FuncInfo->getInitialStackAdjust() == 0)
-         && "callee-saved register in unexpected place");
-
-  // If the frame for this function is particularly large, we adjust the stack
-  // in two phases which means the callee-save related operations see a
-  // different (intermediate) stack size.
-  int64_t FrameRegPos;
-  if (IsCalleeSaveOp) {
-    FrameReg = AArch64::XSP;
-    FrameRegPos = -static_cast<int64_t>(FuncInfo->getInitialStackAdjust());
-  } else if (useFPForAddressing(MF)) {
-    // Have to use the frame pointer since we have no idea where SP is.
-    FrameReg = AArch64::X29;
-    FrameRegPos = FuncInfo->getFramePointerOffset();
-  } else {
-    FrameReg = AArch64::XSP;
-    FrameRegPos = -static_cast<int64_t>(MFI->getStackSize()) + SPAdj;
+  // Use the base pointer if we have one.
+  if (RegInfo->hasBasePointer(MF))
+    FrameReg = RegInfo->getBaseRegister();
+  else {
+    FrameReg = AArch64::SP;
+    // If we're using the red zone for this function, the SP won't actually
+    // be adjusted, so the offsets will be negative. They're also all
+    // within range of the signed 9-bit immediate instructions.
+    if (canUseRedZone(MF))
+      Offset -= AFI->getLocalStackSize();
   }

-  return TopOfFrameOffset - FrameRegPos;
+  return Offset;
 }

-void
-AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                                       RegScavenger *RS) const {
-  const AArch64RegisterInfo *RegInfo =
-    static_cast<const AArch64RegisterInfo *>(MF.getTarget().getRegisterInfo());
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  const AArch64InstrInfo &TII =
-    *static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
-
-  if (hasFP(MF)) {
-    MF.getRegInfo().setPhysRegUsed(AArch64::X29);
-    MF.getRegInfo().setPhysRegUsed(AArch64::X30);
-  }
+static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
+  if (Reg != AArch64::LR)
+    return getKillRegState(true);

-  // If addressing of local variables is going to be more complicated than
-  // shoving a base register and an offset into the instruction then we may well
-  // need to scavenge registers. We should either specifically add a
-  // callee-save register for this purpose or allocate an extra spill slot.
-  bool BigStack =
-    MFI->estimateStackSize(MF) >= TII.estimateRSStackLimit(MF)
-    || MFI->hasVarSizedObjects() // Access will be from X29: messes things up
-    || (MFI->adjustsStack() && !hasReservedCallFrame(MF));
-
-  if (!BigStack)
-    return;
-
-  // We certainly need some slack space for the scavenger, preferably an extra
-  // register.
-  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
-  uint16_t ExtraReg = AArch64::NoRegister;
-
-  for (unsigned i = 0; CSRegs[i]; ++i) {
-    if (AArch64::GPR64RegClass.contains(CSRegs[i]) &&
-        !MF.getRegInfo().isPhysRegUsed(CSRegs[i])) {
-      ExtraReg = CSRegs[i];
-      break;
-    }
-  }
-
-  if (ExtraReg != 0) {
-    MF.getRegInfo().setPhysRegUsed(ExtraReg);
-  } else {
-    assert(RS && "Expect register scavenger to be available");
-
-    // Create a stack slot for scavenging purposes. PrologEpilogInserter
-    // helpfully places it near either SP or FP for us to avoid
-    // infinite regression during scavenging.
-    const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
-    RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
-                                                       RC->getAlignment(),
-                                                       false));
-  }
+  // LR may be referred to later by an @llvm.returnaddress intrinsic.
+  bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR);
+  bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken());
+  return getKillRegState(LRKill);
 }

-bool AArch64FrameLowering::determinePrologueDeath(MachineBasicBlock &MBB,
-                                                  unsigned Reg) const {
-  // If @llvm.returnaddress is called then it will refer to X30 by some means;
-  // the prologue store does not kill the register.
-  if (Reg == AArch64::X30) {
-    if (MBB.getParent()->getFrameInfo()->isReturnAddressTaken()
-        && MBB.getParent()->getRegInfo().isLiveIn(Reg))
-      return false;
+bool AArch64FrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  unsigned Count = CSI.size();
+  DebugLoc DL;
+  assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+
+  if (MI != MBB.end())
+    DL = MI->getDebugLoc();
+
+  for (unsigned i = 0; i < Count; i += 2) {
+    unsigned idx = Count - i - 2;
+    unsigned Reg1 = CSI[idx].getReg();
+    unsigned Reg2 = CSI[idx + 1].getReg();
+    // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
+    // list to come in sorted by frame index so that we can issue the store
+    // pair instructions directly. Assert if we see anything otherwise.
+    //
+    // The order of the registers in the list is controlled by
+    // getCalleeSavedRegs(), so they will always be in-order, as well.
+    assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() &&
+           "Out of order callee saved regs!");
+    unsigned StrOpc;
+    assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+    assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
+    // Issue sequence of non-sp increment and pi sp spills for cs regs. The
+    // first spill is a pre-increment that allocates the stack.
+    // For example:
+    //    stp x22, x21, [sp, #-48]!   // addImm(-6)
+    //    stp x20, x19, [sp, #16]     // addImm(+2)
+    //    stp fp, lr, [sp, #32]       // addImm(+4)
+    // Rationale: This sequence saves uop updates compared to a sequence of
+    // pre-increment spills like stp xi,xj,[sp,#-16]!
+    // Note: Similar rationale and sequence for restores in the epilogue.
+    if (AArch64::GPR64RegClass.contains(Reg1)) {
+      assert(AArch64::GPR64RegClass.contains(Reg2) &&
+             "Expected GPR64 callee-saved register pair!");
+      // For first spill use pre-increment store.
+      if (i == 0)
+        StrOpc = AArch64::STPXpre;
+      else
+        StrOpc = AArch64::STPXi;
+    } else if (AArch64::FPR64RegClass.contains(Reg1)) {
+      assert(AArch64::FPR64RegClass.contains(Reg2) &&
+             "Expected FPR64 callee-saved register pair!");
+      // For first spill use pre-increment store.
+      if (i == 0)
+        StrOpc = AArch64::STPDpre;
+      else
+        StrOpc = AArch64::STPDi;
+    } else
+      llvm_unreachable("Unexpected callee saved register!");
+    DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", "
+                 << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx()
+                 << ", " << CSI[idx + 1].getFrameIdx() << ")\n");
+    // Compute offset: i = 0 => offset = -Count;
+    // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc.
+    const int Offset = (i == 0) ? -Count : i;
+    assert((Offset >= -64 && Offset <= 63) &&
+           "Offset out of bounds for STP immediate");
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
+    if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre)
+      MIB.addReg(AArch64::SP, RegState::Define);
+
+    MIB.addReg(Reg2, getPrologueDeath(MF, Reg2))
+        .addReg(Reg1, getPrologueDeath(MF, Reg1))
+        .addReg(AArch64::SP)
+        .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit
+        .setMIFlag(MachineInstr::FrameSetup);
   }
-
-  // In all other cases, physical registers are dead after they've been saved
-  // but live at the beginning of the prologue block.
-  MBB.addLiveIn(Reg);
   return true;
 }

-void
-AArch64FrameLowering::emitFrameMemOps(bool isPrologue, MachineBasicBlock &MBB,
-                                      MachineBasicBlock::iterator MBBI,
-                                      const std::vector<CalleeSavedInfo> &CSI,
-                                      const TargetRegisterInfo *TRI,
-                                      const LoadStoreMethod PossClasses[],
-                                      unsigned NumClasses) const {
-  DebugLoc DL = MBB.findDebugLoc(MBBI);
+bool AArch64FrameLowering::restoreCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
-  MachineFrameInfo &MFI = *MF.getFrameInfo();
   const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  unsigned Count = CSI.size();
+  DebugLoc DL;
+  assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+
+  if (MI != MBB.end())
+    DL = MI->getDebugLoc();
+
+  for (unsigned i = 0; i < Count; i += 2) {
+    unsigned Reg1 = CSI[i].getReg();
+    unsigned Reg2 = CSI[i + 1].getReg();
+    // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
+    // list to come in sorted by frame index so that we can issue the load
+    // pair instructions directly. Assert if we see anything otherwise.
+    assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() &&
+           "Out of order callee saved regs!");
+    // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only
+    // the last load is sp-pi post-increment and de-allocates the stack:
+    // For example:
+    //    ldp fp, lr, [sp, #32]       // addImm(+4)
+    //    ldp x20, x19, [sp, #16]     // addImm(+2)
+    //    ldp x22, x21, [sp], #48     // addImm(+6)
+    // Note: see comment in spillCalleeSavedRegisters()
+    unsigned LdrOpc;
+
+    assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+    assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
+    if (AArch64::GPR64RegClass.contains(Reg1)) {
+      assert(AArch64::GPR64RegClass.contains(Reg2) &&
+             "Expected GPR64 callee-saved register pair!");
+      if (i == Count - 2)
+        LdrOpc = AArch64::LDPXpost;
+      else
+        LdrOpc = AArch64::LDPXi;
+    } else if (AArch64::FPR64RegClass.contains(Reg1)) {
+      assert(AArch64::FPR64RegClass.contains(Reg2) &&
+             "Expected FPR64 callee-saved register pair!");
+      if (i == Count - 2)
+        LdrOpc = AArch64::LDPDpost;
+      else
+        LdrOpc = AArch64::LDPDi;
+    } else
+      llvm_unreachable("Unexpected callee saved register!");
+    DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", "
+                 << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx()
+                 << ", " << CSI[i + 1].getFrameIdx() << ")\n");
+
+    // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4;
+    // etc.
+    const int Offset = (i == Count - 2) ? Count : Count - i - 2;
+    assert((Offset >= -64 && Offset <= 63) &&
+           "Offset out of bounds for LDP immediate");
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
+    if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost)
+      MIB.addReg(AArch64::SP, RegState::Define);
+
+    MIB.addReg(Reg2, getDefRegState(true))
+        .addReg(Reg1, getDefRegState(true))
+        .addReg(AArch64::SP)
+        .addImm(Offset); // [sp], #offset * 8  or  [sp, #offset * 8]
+                         // where the factor * 8 is implicit
+  }
+  return true;
+}
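The two offset formulas in spillCalleeSavedRegisters() and
restoreCalleeSavedRegisters() above are just mirror images of each other; a
minimal standalone sketch of the arithmetic (plain C++, assuming three
callee-saved pairs, i.e. Count == 6):

    #include <cassert>

    int main() {
      const int Count = 6; // three callee-saved register pairs
      // Spill: the first store pre-decrements by the whole area; later
      // stores use positive offsets from the new SP.
      for (int i = 0; i < Count; i += 2) {
        int Offset = (i == 0) ? -Count : i;    // -6, 2, 4 (each x8 bytes)
        assert(Offset >= -64 && Offset <= 63); // signed imm7 of STP/LDP
      }
      // Restore: the last load post-increments by the whole area.
      for (int i = 0; i < Count; i += 2) {
        int Offset = (i == Count - 2) ? Count : Count - i - 2; // 4, 2, 6
        assert(Offset >= -64 && Offset <= 63);
      }
      return 0;
    }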
-  // A certain amount of implicit contract is present here. The actual stack
-  // offsets haven't been allocated officially yet, so for strictly correct code
-  // we rely on the fact that the elements of CSI are allocated in order
-  // starting at SP, purely as dictated by size and alignment. In practice since
-  // this function handles the only accesses to those slots it's not quite so
-  // important.
-  //
-  // We have also ordered the Callee-saved register list in AArch64CallingConv
-  // so that the above scheme puts registers in order: in particular we want
-  // &X30 to be &X29+8 for an ABI-correct frame record (PCS 5.2.2)
-  for (unsigned i = 0, e = CSI.size(); i < e; ++i) {
-    unsigned Reg = CSI[i].getReg();
-
-    // First we need to find out which register class the register belongs to so
-    // that we can use the correct load/store instructions.
-    unsigned ClassIdx;
-    for (ClassIdx = 0; ClassIdx < NumClasses; ++ClassIdx) {
-      if (PossClasses[ClassIdx].RegClass->contains(Reg))
-        break;
-    }
-    assert(ClassIdx != NumClasses
-           && "Asked to store register in unexpected class");
-    const TargetRegisterClass &TheClass = *PossClasses[ClassIdx].RegClass;
-
-    // Now we need to decide whether it's possible to emit a paired instruction:
-    // for this we want the next register to be in the same class.
-    MachineInstrBuilder NewMI;
-    bool Pair = false;
-    if (i + 1 < CSI.size() && TheClass.contains(CSI[i+1].getReg())) {
-      Pair = true;
-      unsigned StLow = 0, StHigh = 0;
-      if (isPrologue) {
-        // Most of these registers will be live-in to the MBB and killed by our
-        // store, though there are exceptions (see determinePrologueDeath).
-        StLow = getKillRegState(determinePrologueDeath(MBB, CSI[i+1].getReg()));
-        StHigh = getKillRegState(determinePrologueDeath(MBB, CSI[i].getReg()));
-      } else {
-        StLow = RegState::Define;
-        StHigh = RegState::Define;
-      }
+void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(
+    MachineFunction &MF, RegScavenger *RS) const {
+  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  SmallVector<unsigned, 4> UnspilledCSGPRs;
+  SmallVector<unsigned, 4> UnspilledCSFPRs;

-      NewMI = BuildMI(MBB, MBBI, DL, TII.get(PossClasses[ClassIdx].PairOpcode))
-                .addReg(CSI[i+1].getReg(), StLow)
-                .addReg(CSI[i].getReg(), StHigh);
+  // The frame record needs to be created by saving the appropriate registers
+  if (hasFP(MF)) {
+    MRI->setPhysRegUsed(AArch64::FP);
+    MRI->setPhysRegUsed(AArch64::LR);
+  }

-      // If it's a paired op, we've consumed two registers
-      ++i;
-    } else {
-      unsigned State;
-      if (isPrologue) {
-        State = getKillRegState(determinePrologueDeath(MBB, CSI[i].getReg()));
+  // Spill the BasePtr if it's used. Do this first thing so that the
+  // getCalleeSavedRegs() below will get the right answer.
+  if (RegInfo->hasBasePointer(MF))
+    MRI->setPhysRegUsed(RegInfo->getBaseRegister());
+
+  // If any callee-saved registers are used, the frame cannot be eliminated.
+  unsigned NumGPRSpilled = 0;
+  unsigned NumFPRSpilled = 0;
+  bool ExtraCSSpill = false;
+  bool CanEliminateFrame = true;
+  DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:");
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+
+  // Check pairs of consecutive callee-saved registers.
+  for (unsigned i = 0; CSRegs[i]; i += 2) {
+    assert(CSRegs[i + 1] && "Odd number of callee-saved registers!");
+
+    const unsigned OddReg = CSRegs[i];
+    const unsigned EvenReg = CSRegs[i + 1];
+    assert((AArch64::GPR64RegClass.contains(OddReg) &&
+            AArch64::GPR64RegClass.contains(EvenReg)) ^
+               (AArch64::FPR64RegClass.contains(OddReg) &&
+                AArch64::FPR64RegClass.contains(EvenReg)) &&
+           "Register class mismatch!");
+
+    const bool OddRegUsed = MRI->isPhysRegUsed(OddReg);
+    const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg);
+
+    // Early exit if none of the registers in the register pair is actually
+    // used.
+    if (!OddRegUsed && !EvenRegUsed) {
+      if (AArch64::GPR64RegClass.contains(OddReg)) {
+        UnspilledCSGPRs.push_back(OddReg);
+        UnspilledCSGPRs.push_back(EvenReg);
       } else {
-        State = RegState::Define;
+        UnspilledCSFPRs.push_back(OddReg);
+        UnspilledCSFPRs.push_back(EvenReg);
       }
+      continue;
+    }

-      NewMI = BuildMI(MBB, MBBI, DL,
-                      TII.get(PossClasses[ClassIdx].SingleOpcode))
-                .addReg(CSI[i].getReg(), State);
+    unsigned Reg = AArch64::NoRegister;
+    // If only one of the registers of the register pair is used, make sure to
+    // mark the other one as used as well.
+    if (OddRegUsed ^ EvenRegUsed) {
+      // Find out which register is the additional spill.
+      Reg = OddRegUsed ? EvenReg : OddReg;
+      MRI->setPhysRegUsed(Reg);
     }

-    // Note that the FrameIdx refers to the second register in a pair: it will
-    // be allocated the smaller numeric address and so is the one an LDP/STP
-    // address must use.
-    int FrameIdx = CSI[i].getFrameIdx();
-    MachineMemOperand::MemOperandFlags Flags;
-    Flags = isPrologue ? MachineMemOperand::MOStore : MachineMemOperand::MOLoad;
-    MachineMemOperand *MMO =
-      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
-                              Flags,
-                              Pair ? TheClass.getSize() * 2 : TheClass.getSize(),
-                              MFI.getObjectAlignment(FrameIdx));
-
-    NewMI.addFrameIndex(FrameIdx)
-      .addImm(0) // address-register offset
-      .addMemOperand(MMO);
-
-    if (isPrologue)
-      NewMI.setMIFlags(MachineInstr::FrameSetup);
-
-    // For aesthetic reasons, during an epilogue we want to emit complementary
-    // operations to the prologue, but in the opposite order. So we still
-    // iterate through the CalleeSavedInfo list in order, but we put the
-    // instructions successively earlier in the MBB.
-    if (!isPrologue)
-      --MBBI;
+    DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo));
+    DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo));
+
+    assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) ||
+            (RegInfo->getEncodingValue(OddReg) + 1 ==
+             RegInfo->getEncodingValue(EvenReg))) &&
+           "Register pair of non-adjacent registers!");
+    if (AArch64::GPR64RegClass.contains(OddReg)) {
+      NumGPRSpilled += 2;
+      // If it's not a reserved register, we can use it in lieu of an
+      // emergency spill slot for the register scavenger.
+      // FIXME: It would be better to instead keep looking and choose another
+      // unspilled register that isn't reserved, if there is one.
+      if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg))
+        ExtraCSSpill = true;
+    } else
+      NumFPRSpilled += 2;
+
+    CanEliminateFrame = false;
   }
-}
-
-bool
-AArch64FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                        MachineBasicBlock::iterator MBBI,
-                                        const std::vector<CalleeSavedInfo> &CSI,
-                                        const TargetRegisterInfo *TRI) const {
-  if (CSI.empty())
-    return false;
-
-  static const LoadStoreMethod PossibleClasses[] = {
-    {&AArch64::GPR64RegClass, AArch64::LSPair64_STR, AArch64::LS64_STR},
-    {&AArch64::FPR64RegClass, AArch64::LSFPPair64_STR, AArch64::LSFP64_STR},
-  };
-  const unsigned NumClasses = llvm::array_lengthof(PossibleClasses);
-
-  emitFrameMemOps(/* isPrologue = */ true, MBB, MBBI, CSI, TRI,
-                  PossibleClasses, NumClasses);
-
-  return true;
-}
-
-bool
-AArch64FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                        MachineBasicBlock::iterator MBBI,
-                                        const std::vector<CalleeSavedInfo> &CSI,
-                                        const TargetRegisterInfo *TRI) const {
-
-  if (CSI.empty())
-    return false;
-
-  static const LoadStoreMethod PossibleClasses[] = {
-    {&AArch64::GPR64RegClass, AArch64::LSPair64_LDR, AArch64::LS64_LDR},
-    {&AArch64::FPR64RegClass, AArch64::LSFPPair64_LDR, AArch64::LSFP64_LDR},
-  };
-  const unsigned NumClasses = llvm::array_lengthof(PossibleClasses);
-
-  emitFrameMemOps(/* isPrologue = */ false, MBB, MBBI, CSI, TRI,
-                  PossibleClasses, NumClasses);
-
-  return true;
-}
-
-bool
-AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const TargetRegisterInfo *RI = MF.getTarget().getRegisterInfo();
-
-  // This is a decision of ABI compliance. The AArch64 PCS gives various options
-  // for conformance, and even at the most stringent level more or less permits
-  // elimination for leaf functions because there's no loss of functionality
-  // (for debugging etc.).
-  if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->hasCalls())
-    return true;
-
-  // The following are hard-limits: incorrect code will be generated if we try
-  // to omit the frame.
-  return (RI->needsStackRealignment(MF) ||
-          MFI->hasVarSizedObjects() ||
-          MFI->isFrameAddressTaken());
-}
-
-bool
-AArch64FrameLowering::useFPForAddressing(const MachineFunction &MF) const {
-  return MF.getFrameInfo()->hasVarSizedObjects();
-}
-
-bool
-AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-
-  // Of the various reasons for having a frame pointer, it's actually only
-  // variable-sized objects that prevent reservation of a call frame.
-  return !(hasFP(MF) && MFI->hasVarSizedObjects());
-}
-
-void
-AArch64FrameLowering::eliminateCallFramePseudoInstr(
-    MachineFunction &MF,
-    MachineBasicBlock &MBB,
-    MachineBasicBlock::iterator MI) const {
-  const AArch64InstrInfo &TII =
-    *static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
-  DebugLoc dl = MI->getDebugLoc();
-  int Opcode = MI->getOpcode();
-  bool IsDestroy = Opcode == TII.getCallFrameDestroyOpcode();
-  uint64_t CalleePopAmount = IsDestroy ? MI->getOperand(1).getImm() : 0;
-
-  if (!hasReservedCallFrame(MF)) {
-    unsigned Align = getStackAlignment();
-
-    int64_t Amount = MI->getOperand(0).getImm();
-    Amount = RoundUpToAlignment(Amount, Align);
-    if (!IsDestroy) Amount = -Amount;
+  // FIXME: Set BigStack if any stack slot references may be out of range.
+  // For now, just conservatively guestimate based on unscaled indexing
+  // range. We'll end up allocating an unnecessary spill slot a lot, but
+  // realistically that's not a big deal at this stage of the game.
+  // The CSR spill slots have not been allocated yet, so estimateStackSize
+  // won't include them.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned CFSize = estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled);
+  DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
+  bool BigStack = (CFSize >= 256);
+  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
+    AFI->setHasStackFrame(true);
+
+  // Estimate if we might need to scavenge a register at some point in order
+  // to materialize a stack offset. If so, either spill one additional
+  // callee-saved register or reserve a special spill slot to facilitate
+  // register scavenging. If we already spilled an extra callee-saved register
+  // above to keep the number of spills even, we don't need to do anything else
+  // here.
+  if (BigStack && !ExtraCSSpill) {
+
+    // If we're adding a register to spill here, we have to add two of them
+    // to keep the number of regs to spill even.
+    assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!");
+    unsigned Count = 0;
+    while (!UnspilledCSGPRs.empty() && Count < 2) {
+      unsigned Reg = UnspilledCSGPRs.back();
+      UnspilledCSGPRs.pop_back();
+      DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo)
+                   << " to get a scratch register.\n");
+      MRI->setPhysRegUsed(Reg);
+      ExtraCSSpill = true;
+      ++Count;
+    }

-    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
-    // doesn't have to pop anything), then the first operand will be zero too so
-    // this adjustment is a no-op.
-    if (CalleePopAmount == 0) {
-      // FIXME: in-function stack adjustment for calls is limited to 12-bits
-      // because there's no guaranteed temporary register available. Mostly call
-      // frames will be allocated at the start of a function so this is OK, but
-      // it is a limitation that needs dealing with.
-      assert(Amount > -0xfff && Amount < 0xfff && "call frame too large");
-      emitSPUpdate(MBB, MI, dl, TII, AArch64::NoRegister, Amount);
+    // If we didn't find an extra callee-saved register to spill, create
+    // an emergency spill slot.
+    if (!ExtraCSSpill) {
+      const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
+      int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false);
+      RS->addScavengingFrameIndex(FI);
+      DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
+                   << " as the emergency spill slot.\n");
     }
-  } else if (CalleePopAmount != 0) {
-    // If the calling convention demands that the callee pops arguments from the
-    // stack, we want to add it back if we have a reserved call frame.
-    assert(CalleePopAmount < 0xfff && "call frame too large");
-    emitSPUpdate(MBB, MI, dl, TII, AArch64::NoRegister, -CalleePopAmount);
   }
-
-  MBB.erase(MI);
 }
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 032dd90..0e00d16 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -1,4 +1,4 @@
-//==- AArch64FrameLowering.h - Define frame lowering for AArch64 -*- C++ -*--=//
+//==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,100 +7,67 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This class implements the AArch64-specific parts of the TargetFrameLowering
-// class.
+//
 //
 //===----------------------------------------------------------------------===//

-#ifndef LLVM_AARCH64_FRAMEINFO_H
-#define LLVM_AARCH64_FRAMEINFO_H
+#ifndef AArch64_FRAMELOWERING_H
+#define AArch64_FRAMELOWERING_H

-#include "AArch64Subtarget.h"
 #include "llvm/Target/TargetFrameLowering.h"

 namespace llvm {
+
 class AArch64Subtarget;
+class AArch64TargetMachine;

 class AArch64FrameLowering : public TargetFrameLowering {
-private:
-  // In order to unify the spilling and restoring of callee-saved registers into
-  // emitFrameMemOps, we need to be able to specify which instructions to use
-  // for the relevant memory operations on each register class. An array of the
-  // following struct is populated and passed in to achieve this.
-  struct LoadStoreMethod {
-    const TargetRegisterClass *RegClass; // E.g. GPR64RegClass
-
-    // The preferred instruction.
-    unsigned PairOpcode; // E.g. LSPair64_STR
-
-    // Sometimes only a single register can be handled at once.
-    unsigned SingleOpcode; // E.g. LS64_STR
-  };
-protected:
-  const AArch64Subtarget &STI;
+  const AArch64TargetMachine &TM;

 public:
-  explicit AArch64FrameLowering(const AArch64Subtarget &sti)
-    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 16, 0, 16),
-      STI(sti) {
-  }
+  explicit AArch64FrameLowering(const AArch64TargetMachine &TM,
+                                const AArch64Subtarget &STI)
+      : TargetFrameLowering(StackGrowsDown, 16, 0, 16,
+                            false /*StackRealignable*/),
+        TM(TM) {}

-  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
-  /// the function.
-  virtual void emitPrologue(MachineFunction &MF) const;
-  virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
-
-  /// Decides how much stack adjustment to perform in each phase of the prologue
-  /// and epilogue.
-  void splitSPAdjustments(uint64_t Total, uint64_t &Initial,
-                          uint64_t &Residual) const;
-
-  int64_t resolveFrameIndexReference(MachineFunction &MF, int FrameIndex,
-                                     unsigned &FrameReg, int SPAdj,
-                                     bool IsCalleeSaveOp) const;
-
-  virtual void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                                    RegScavenger *RS) const;
-
-  virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                         MachineBasicBlock::iterator MI,
-                                         const std::vector<CalleeSavedInfo> &CSI,
-                                         const TargetRegisterInfo *TRI) const;
-  virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                           MachineBasicBlock::iterator MI,
-                                           const std::vector<CalleeSavedInfo> &CSI,
-                                           const TargetRegisterInfo *TRI) const;
+  void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 unsigned FramePtr) const;

   void eliminateCallFramePseudoInstr(MachineFunction &MF,
-                                     MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator MI) const;
-
-  /// If the register is X30 (i.e. LR) and the return address is used in the
-  /// function then the callee-save store doesn't actually kill the register,
-  /// otherwise it does.
-  bool determinePrologueDeath(MachineBasicBlock &MBB, unsigned Reg) const;
-
-  /// This function emits the loads or stores required during prologue and
-  /// epilogue as efficiently as possible.
-  ///
-  /// The operations involved in setting up and tearing down the frame are
-  /// similar enough to warrant a shared function, particularly as discrepancies
-  /// between the two would be disastrous.
-  void emitFrameMemOps(bool isStore, MachineBasicBlock &MBB,
-                       MachineBasicBlock::iterator MI,
-                       const std::vector<CalleeSavedInfo> &CSI,
-                       const TargetRegisterInfo *TRI,
-                       const LoadStoreMethod PossibleClasses[],
-                       unsigned NumClasses) const;
-
-  virtual bool hasFP(const MachineFunction &MF) const;
-
-  virtual bool useFPForAddressing(const MachineFunction &MF) const;
-
-  /// On AA
-  virtual bool hasReservedCallFrame(const MachineFunction &MF) const;
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const override;

+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
+  int resolveFrameIndexReference(const MachineFunction &MF, int FI,
+                                 unsigned &FrameReg,
+                                 bool PreferFP = false) const;
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   const TargetRegisterInfo *TRI) const override;
+
+  /// \brief Can this function use the red zone for local allocations.
+  bool canUseRedZone(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const override;
 };

 } // End llvm namespace
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index dac4b32..7007ffc 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -11,118 +11,119 @@
 //
 //===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "aarch64-isel"
-#include "AArch64.h"
-#include "AArch64InstrInfo.h"
-#include "AArch64Subtarget.h"
 #include "AArch64TargetMachine.h"
-#include "Utils/AArch64BaseInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Function.h" // To access function attributes.
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"

 using namespace llvm;

+#define DEBUG_TYPE "aarch64-isel"
+
 //===--------------------------------------------------------------------===//
-/// AArch64 specific code to select AArch64 machine instructions for
-/// SelectionDAG operations.
+/// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine
+/// instructions for SelectionDAG operations.
 ///
 namespace {

 class AArch64DAGToDAGISel : public SelectionDAGISel {
   AArch64TargetMachine &TM;

-  /// Keep a pointer to the AArch64Subtarget around so that we can
+  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
   const AArch64Subtarget *Subtarget;

+  bool ForCodeSize;
+
 public:
   explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
                                CodeGenOpt::Level OptLevel)
-    : SelectionDAGISel(tm, OptLevel), TM(tm),
-      Subtarget(&TM.getSubtarget<AArch64Subtarget>()) {
-    }
+      : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(nullptr),
+        ForCodeSize(false) {}

-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "AArch64 Instruction Selection";
   }

-  // Include the pieces autogenerated from the target description.
-#include "AArch64GenDAGISel.inc"
-
-  template<unsigned MemSize>
-  bool SelectOffsetUImm12(SDValue N, SDValue &UImm12) {
-    const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
-    if (!CN || CN->getZExtValue() % MemSize != 0
-        || CN->getZExtValue() / MemSize > 0xfff)
-      return false;
-
-    UImm12 = CurDAG->getTargetConstant(CN->getZExtValue() / MemSize, MVT::i64);
-    return true;
-  }
-
-  template<unsigned RegWidth>
-  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
-    return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
-  }
-
-  /// Used for pre-lowered address-reference nodes, so we already know
-  /// the fields match. This operand's job is simply to add an
-  /// appropriate shift operand to the MOVZ/MOVK instruction.
-  template<unsigned LogShift>
-  bool SelectMOVWAddressRef(SDValue N, SDValue &Imm, SDValue &Shift) {
-    Imm = N;
-    Shift = CurDAG->getTargetConstant(LogShift, MVT::i32);
-    return true;
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    AttributeSet FnAttrs = MF.getFunction()->getAttributes();
+    ForCodeSize =
+        FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+                             Attribute::OptimizeForSize) ||
+        FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+    Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+    return SelectionDAGISel::runOnMachineFunction(MF);
   }

-  bool SelectFPZeroOperand(SDValue N, SDValue &Dummy);
-
-  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
-                                unsigned RegWidth);
+  SDNode *Select(SDNode *Node) override;

+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
   bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
-                                    std::vector<SDValue> &OutOps);
-
-  bool SelectLogicalImm(SDValue N, SDValue &Imm);
-
-  template<unsigned RegWidth>
-  bool SelectTSTBOperand(SDValue N, SDValue &FixedPos) {
-    return SelectTSTBOperand(N, FixedPos, RegWidth);
+                                    std::vector<SDValue> &OutOps) override;
+
+  SDNode *SelectMLAV64LaneV128(SDNode *N);
+  SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N);
+  bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
+  bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
+  bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
+  bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
+    return SelectShiftedRegister(N, false, Reg, Shift);
+  }
+  bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
+    return SelectShiftedRegister(N, true, Reg, Shift);
+  }
+  bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 1, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 2, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 4, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 8, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 16, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 1, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 2, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 4, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 8, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 16, Base, OffImm);
   }

-  bool SelectTSTBOperand(SDValue N, SDValue &FixedPos, unsigned RegWidth);
-
-  SDNode *SelectAtomic(SDNode *N, unsigned Op8, unsigned Op16, unsigned Op32,
-                       unsigned Op64);
-
-  /// Put the given constant into a pool and return a DAG which will give its
-  /// address.
-  SDValue getConstantPoolItemAddress(SDLoc DL, const Constant *CV);
-
-  SDNode *TrySelectToMoveImm(SDNode *N);
-  SDNode *LowerToFPLitPool(SDNode *Node);
-  SDNode *SelectToLitPool(SDNode *N);
-
-  SDNode* Select(SDNode*);
-private:
-  /// Get the opcode for table lookup instruction
-  unsigned getTBLOpc(bool IsExt, bool Is64Bit, unsigned NumOfVec);
-
-  /// Select NEON table lookup intrinsics. NumVecs should be 1, 2, 3 or 4.
-  /// IsExt is to indicate if the result will be extended with an argument.
-  SDNode *SelectVTBL(SDNode *N, unsigned NumVecs, bool IsExt);
+  template <int Width>
+  bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
+                         SDValue &SignExtend, SDValue &DoShift) {
+    return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
+  }

-  /// Select NEON load intrinsics. NumVecs should be 1, 2, 3 or 4.
-  SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
-                    const uint16_t *Opcode);
+  template <int Width>
+  bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset,
+                         SDValue &SignExtend, SDValue &DoShift) {
+    return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
+  }

-  /// Select NEON store intrinsics. NumVecs should be 1, 2, 3 or 4.
-  SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
-                    const uint16_t *Opcodes);

   /// Form sequences of consecutive 64/128-bit registers for use in NEON
   /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
@@ -136,315 +137,713 @@ private:
   SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[],
                       unsigned SubRegs[]);

-  /// Select NEON load-duplicate intrinsics. NumVecs should be 2, 3 or 4.
-  /// The opcode array specifies the instructions used for load.
-  SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
-                       const uint16_t *Opcodes);
+  SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
+
+  SDNode *SelectIndexedLoad(SDNode *N, bool &Done);
+
+  SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+                     unsigned SubRegIdx);
+  SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+                         unsigned SubRegIdx);
+  SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+  SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+
+  SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+  SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+  SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+  SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+
+  SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node);
+  SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node);
+
+  SDNode *SelectBitfieldExtractOp(SDNode *N);
+  SDNode *SelectBitfieldInsertOp(SDNode *N);
+
+  SDNode *SelectLIBM(SDNode *N);
+
+// Include the pieces autogenerated from the target description.
+#include "AArch64GenDAGISel.inc"
+
+private:
+  bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
+                             SDValue &Shift);
+  bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
+                             SDValue &OffImm);
+  bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
+                              SDValue &OffImm);
+  bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base,
+                         SDValue &Offset, SDValue &SignExtend,
+                         SDValue &DoShift);
+  bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
+                         SDValue &Offset, SDValue &SignExtend,
+                         SDValue &DoShift);
+  bool isWorthFolding(SDValue V) const;
+  bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
+                         SDValue &Offset, SDValue &SignExtend);

-  /// Select NEON load/store lane intrinsics. NumVecs should be 2, 3 or 4.
-  /// The opcode arrays specify the instructions used for load/store.
-  SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
-                          unsigned NumVecs, const uint16_t *Opcodes);
+  template <unsigned RegWidth>
+  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
+    return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
+  }

-  SDValue getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD,
-                               SDValue Operand);
+  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
 };
+} // end anonymous namespace
+
+/// isIntImmediate - This method tests to see if the node is a constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
+  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+    Imm = C->getZExtValue();
+    return true;
+  }
+  return false;
 }

-bool
-AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
-                                              unsigned RegWidth) {
-  const ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
-  if (!CN) return false;
+// isIntImmediate - This method tests to see if the value is a constant
+// operand. If so Imm will receive the value.
+static bool isIntImmediate(SDValue N, uint64_t &Imm) {
+  return isIntImmediate(N.getNode(), Imm);
+}

-  // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
-  // is between 1 and 32 for a destination w-register, or 1 and 64 for an
-  // x-register.
-  //
-  // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
-  // want THIS_NODE to be 2^fbits. This is much easier to deal with using
-  // integers.
-  bool IsExact;
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has an immediate integer right operand.
+// If so Imm will receive the 32 bit value.
+static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
+                                  uint64_t &Imm) {
+  return N->getOpcode() == Opc &&
+         isIntImmediate(N->getOperand(1).getNode(), Imm);
+}

-  // fbits is between 1 and 64 in the worst-case, which means the fmul
-  // could have 2^64 as an actual operand. Need 65 bits of precision.
-  APSInt IntVal(65, true);
-  CN->getValueAPF().convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
+bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
+    const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps) {
+  assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
+  // Require the address to be in a register. That is safe for all AArch64
+  // variants and it is hard to do anything much smarter without knowing
+  // how the operand is used.
+  OutOps.push_back(Op);
+  return false;
+}

-  // N.b. isPowerOf2 also checks for > 0.
-  if (!IsExact || !IntVal.isPowerOf2()) return false;
-  unsigned FBits = IntVal.logBase2();
+/// SelectArithImmed - Select an immediate value that can be represented as
+/// a 12-bit value shifted left by either 0 or 12. If so, return true with
+/// Val set to the 12-bit value and Shift set to the shifter operand.
+bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
+                                           SDValue &Shift) {
+  // This function is called from the addsub_shifted_imm ComplexPattern,
+  // which lists [imm] as the list of opcode it's interested in, however
+  // we still need to check whether the operand is actually an immediate
+  // here because the ComplexPattern opcode list is only used in
+  // root-level opcode matching.
+  if (!isa<ConstantSDNode>(N.getNode()))
+    return false;

-  // Checks above should have guaranteed that we haven't lost information in
-  // finding FBits, but it must still be in range.
-  if (FBits == 0 || FBits > RegWidth) return false;
+  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+  unsigned ShiftAmt;

-  FixedPos = CurDAG->getTargetConstant(64 - FBits, MVT::i32);
+  if (Immed >> 12 == 0) {
+    ShiftAmt = 0;
+  } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
+    ShiftAmt = 12;
+    Immed = Immed >> 12;
+  } else
+    return false;
+
+  unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
+  Val = CurDAG->getTargetConstant(Immed, MVT::i32);
+  Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
   return true;
 }

-bool
-AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
-                                                  char ConstraintCode,
-                                                  std::vector<SDValue> &OutOps) {
-  switch (ConstraintCode) {
-  default: llvm_unreachable("Unrecognised AArch64 memory constraint");
-  case 'm':
-    // FIXME: more freedom is actually permitted for 'm'. We can go
-    // hunting for a base and an offset if we want. Of course, since
-    // we don't really know how the operand is going to be used we're
-    // probably restricted to the load/store pair's simm7 as an offset
-    // range anyway.
-  case 'Q':
-    OutOps.push_back(Op);
+/// SelectNegArithImmed - As above, but negates the value before trying to
+/// select it.
+bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
+                                              SDValue &Shift) {
+  // This function is called from the addsub_shifted_imm ComplexPattern,
+  // which lists [imm] as the list of opcode it's interested in, however
+  // we still need to check whether the operand is actually an immediate
+  // here because the ComplexPattern opcode list is only used in
+  // root-level opcode matching.
+  if (!isa<ConstantSDNode>(N.getNode()))
+    return false;
+
+  // The immediate operand must be a 24-bit zero-extended immediate.
+  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+
+  // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
+  // have the opposite effect on the C flag, so this pattern mustn't match under
+  // those circumstances.
+  if (Immed == 0)
+    return false;
+
+  if (N.getValueType() == MVT::i32)
+    Immed = ~((uint32_t)Immed) + 1;
+  else
+    Immed = ~Immed + 1ULL;
+  if (Immed & 0xFFFFFFFFFF000000ULL)
+    return false;
+
+  Immed &= 0xFFFFFFULL;
+  return SelectArithImmed(CurDAG->getConstant(Immed, MVT::i32), Val, Shift);
+}
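The immediate checks in SelectArithImmed above amount to a pair of bit tests;
a minimal standalone sketch (plain C++ added for illustration, not part of
the patch):

    #include <cassert>
    #include <cstdint>

    // True if Immed fits an AArch64 add/sub immediate: a 12-bit value,
    // optionally shifted left by 12 (mirrors the check in SelectArithImmed).
    bool isArithImmed(uint64_t Immed) {
      return (Immed >> 12) == 0 ||
             ((Immed & 0xfff) == 0 && (Immed >> 24) == 0);
    }

    int main() {
      assert(isArithImmed(0xfff));      // add x0, x1, #4095
      assert(isArithImmed(0xabc000));   // add x0, x1, #0xabc, lsl #12
      assert(!isArithImmed(0x1001000)); // needs more than 24 bits
      assert(!isArithImmed(0xfff001));  // low bits set alongside high bits
      return 0;
    }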
+static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) { + switch (N.getOpcode()) { + default: + return AArch64_AM::InvalidShiftExtend; + case ISD::SHL: + return AArch64_AM::LSL; + case ISD::SRL: + return AArch64_AM::LSR; + case ISD::SRA: + return AArch64_AM::ASR; + case ISD::ROTR: + return AArch64_AM::ROR; } +} +/// \brief Determine wether it is worth to fold V into an extended register. +bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const { + // it hurts if the a value is used at least twice, unless we are optimizing + // for code size. + if (ForCodeSize || V.hasOneUse()) + return true; return false; } -bool -AArch64DAGToDAGISel::SelectFPZeroOperand(SDValue N, SDValue &Dummy) { - ConstantFPSDNode *Imm = dyn_cast(N); - if (!Imm || !Imm->getValueAPF().isPosZero()) +/// SelectShiftedRegister - Select a "shifted register" operand. If the value +/// is not shifted, set the Shift operand to default of "LSL 0". The logical +/// instructions allow the shifted register to be rotated, but the arithmetic +/// instructions do not. The AllowROR parameter specifies whether ROR is +/// supported. +bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR, + SDValue &Reg, SDValue &Shift) { + AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N); + if (ShType == AArch64_AM::InvalidShiftExtend) + return false; + if (!AllowROR && ShType == AArch64_AM::ROR) return false; - // Doesn't actually carry any information, but keeps TableGen quiet. - Dummy = CurDAG->getTargetConstant(0, MVT::i32); - return true; + if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { + unsigned BitSize = N.getValueType().getSizeInBits(); + unsigned Val = RHS->getZExtValue() & (BitSize - 1); + unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val); + + Reg = N.getOperand(0); + Shift = CurDAG->getTargetConstant(ShVal, MVT::i32); + return isWorthFolding(N); + } + + return false; +} + +/// getExtendTypeForNode - Translate an extend node to the corresponding +/// ExtendType value. +static AArch64_AM::ShiftExtendType +getExtendTypeForNode(SDValue N, bool IsLoadStore = false) { + if (N.getOpcode() == ISD::SIGN_EXTEND || + N.getOpcode() == ISD::SIGN_EXTEND_INREG) { + EVT SrcVT; + if (N.getOpcode() == ISD::SIGN_EXTEND_INREG) + SrcVT = cast(N.getOperand(1))->getVT(); + else + SrcVT = N.getOperand(0).getValueType(); + + if (!IsLoadStore && SrcVT == MVT::i8) + return AArch64_AM::SXTB; + else if (!IsLoadStore && SrcVT == MVT::i16) + return AArch64_AM::SXTH; + else if (SrcVT == MVT::i32) + return AArch64_AM::SXTW; + assert(SrcVT != MVT::i64 && "extend from 64-bits?"); + + return AArch64_AM::InvalidShiftExtend; + } else if (N.getOpcode() == ISD::ZERO_EXTEND || + N.getOpcode() == ISD::ANY_EXTEND) { + EVT SrcVT = N.getOperand(0).getValueType(); + if (!IsLoadStore && SrcVT == MVT::i8) + return AArch64_AM::UXTB; + else if (!IsLoadStore && SrcVT == MVT::i16) + return AArch64_AM::UXTH; + else if (SrcVT == MVT::i32) + return AArch64_AM::UXTW; + assert(SrcVT != MVT::i64 && "extend from 64-bits?"); + + return AArch64_AM::InvalidShiftExtend; + } else if (N.getOpcode() == ISD::AND) { + ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); + if (!CSD) + return AArch64_AM::InvalidShiftExtend; + uint64_t AndMask = CSD->getZExtValue(); + + switch (AndMask) { + default: + return AArch64_AM::InvalidShiftExtend; + case 0xFF: + return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend; + case 0xFFFF: + return !IsLoadStore ? 
AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend; + case 0xFFFFFFFF: + return AArch64_AM::UXTW; + } + } + + return AArch64_AM::InvalidShiftExtend; } -bool AArch64DAGToDAGISel::SelectLogicalImm(SDValue N, SDValue &Imm) { - uint32_t Bits; - uint32_t RegWidth = N.getValueType().getSizeInBits(); +// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts. +static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) { + if (DL->getOpcode() != AArch64ISD::DUPLANE16 && + DL->getOpcode() != AArch64ISD::DUPLANE32) + return false; - ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N); - if (!CN) return false; + SDValue SV = DL->getOperand(0); + if (SV.getOpcode() != ISD::INSERT_SUBVECTOR) + return false; - if (!A64Imms::isLogicalImm(RegWidth, CN->getZExtValue(), Bits)) + SDValue EV = SV.getOperand(1); + if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR) return false; - Imm = CurDAG->getTargetConstant(Bits, MVT::i32); + ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode()); + ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode()); + LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue(); + LaneOp = EV.getOperand(0); + return true; } -SDNode *AArch64DAGToDAGISel::TrySelectToMoveImm(SDNode *Node) { - SDNode *ResNode; - SDLoc dl(Node); - EVT DestType = Node->getValueType(0); - unsigned DestWidth = DestType.getSizeInBits(); - - unsigned MOVOpcode; - EVT MOVType; - int UImm16, Shift; - uint32_t LogicalBits; - - uint64_t BitPat = cast<ConstantSDNode>(Node)->getZExtValue(); - if (A64Imms::isMOVZImm(DestWidth, BitPat, UImm16, Shift)) { - MOVType = DestType; - MOVOpcode = DestWidth == 64 ? AArch64::MOVZxii : AArch64::MOVZwii; - } else if (A64Imms::isMOVNImm(DestWidth, BitPat, UImm16, Shift)) { - MOVType = DestType; - MOVOpcode = DestWidth == 64 ? AArch64::MOVNxii : AArch64::MOVNwii; - } else if (DestWidth == 64 && A64Imms::isMOVNImm(32, BitPat, UImm16, Shift)) { - // To get something like 0x0000_0000_ffff_1234 into a 64-bit register we can - // use a 32-bit instruction: "movn w0, 0xedbc". - MOVType = MVT::i32; - MOVOpcode = AArch64::MOVNwii; - } else if (A64Imms::isLogicalImm(DestWidth, BitPat, LogicalBits)) { - MOVOpcode = DestWidth == 64 ? AArch64::ORRxxi : AArch64::ORRwwi; - uint16_t ZR = DestWidth == 64 ? AArch64::XZR : AArch64::WZR; - - return CurDAG->getMachineNode(MOVOpcode, dl, DestType, - CurDAG->getRegister(ZR, DestType), - CurDAG->getTargetConstant(LogicalBits, MVT::i32)); - } else { - // Can't handle it in one instruction. There's scope for permitting two (or - // more) instructions, but that'll need more thought. - return NULL; +// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is +// a high lane extract. +static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, + SDValue &LaneOp, int &LaneIdx) { + + if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) { + std::swap(Op0, Op1); + if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) + return false; + } + StdOp = Op1; + return true; +} + +/// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand +/// is a lane in the upper half of a 128-bit vector. Recognize and select this +/// so that we don't emit unnecessary lane extracts. +SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue MLAOp1; // Will hold ordinary multiplicand for MLA. + SDValue MLAOp2; // Will hold lane-accessed multiplicand for MLA. + int LaneIdx = -1; // Will hold the lane index.
+ + if (Op1.getOpcode() != ISD::MUL || + !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, + LaneIdx)) { + std::swap(Op0, Op1); + if (Op1.getOpcode() != ISD::MUL || + !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, + LaneIdx)) + return nullptr; + } + + SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); + + SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal }; + + unsigned MLAOpc = ~0U; + + switch (N->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unrecognized MLA."); + case MVT::v4i16: + MLAOpc = AArch64::MLAv4i16_indexed; + break; + case MVT::v8i16: + MLAOpc = AArch64::MLAv8i16_indexed; + break; + case MVT::v2i32: + MLAOpc = AArch64::MLAv2i32_indexed; + break; + case MVT::v4i32: + MLAOpc = AArch64::MLAv4i32_indexed; + break; } - ResNode = CurDAG->getMachineNode(MOVOpcode, dl, MOVType, - CurDAG->getTargetConstant(UImm16, MVT::i32), - CurDAG->getTargetConstant(Shift, MVT::i32)); + return CurDAG->getMachineNode(MLAOpc, SDLoc(N), N->getValueType(0), Ops); +} + +SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { + SDValue SMULLOp0; + SDValue SMULLOp1; + int LaneIdx; + + if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1, + LaneIdx)) + return nullptr; + + SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); + + SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal }; + + unsigned SMULLOpc = ~0U; + + if (IntNo == Intrinsic::aarch64_neon_smull) { + switch (N->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unrecognized SMULL."); + case MVT::v4i32: + SMULLOpc = AArch64::SMULLv4i16_indexed; + break; + case MVT::v2i64: + SMULLOpc = AArch64::SMULLv2i32_indexed; + break; + } + } else if (IntNo == Intrinsic::aarch64_neon_umull) { + switch (N->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unrecognized UMULL."); + case MVT::v4i32: + SMULLOpc = AArch64::UMULLv4i16_indexed; + break; + case MVT::v2i64: + SMULLOpc = AArch64::UMULLv2i32_indexed; + break; + } + } else + llvm_unreachable("Unrecognized intrinsic."); + + return CurDAG->getMachineNode(SMULLOpc, SDLoc(N), N->getValueType(0), Ops); +} + +/// Instructions that accept extend modifiers like UXTW expect the register +/// being extended to be a GPR32, but the incoming DAG might be acting on a +/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if +/// this is the case. +static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) { + if (N.getValueType() == MVT::i32) + return N; + + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + SDLoc(N), MVT::i32, N, SubReg); + return SDValue(Node, 0); +} + + +/// SelectArithExtendedRegister - Select an "extended register" operand. This +/// operand folds in an extend followed by an optional left shift.
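// Aside: an illustrative model (standalone sketch, not part of this patch)
// of the operand folded below: ADD/SUB may take "extend(Wm), LSL #amount"
// as the second source, with the shift amount limited by the ISA to 0..4,
// e.g. "add x0, x0, w1, sxtb #2".
#include <cstdint>

static uint64_t extendedRegOperand(uint64_t Base, uint32_t Wm, bool IsSigned,
                                   unsigned FromBits /* 8, 16 or 32 */,
                                   unsigned ShiftAmt /* 0 .. 4 */) {
  uint64_t V = Wm & ((1ULL << FromBits) - 1);  // UXTB / UXTH / UXTW
  if (IsSigned && (V >> (FromBits - 1)) != 0)  // SXTB / SXTH / SXTW
    V |= ~0ULL << FromBits;
  return Base + (V << ShiftAmt);
}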
+bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, + SDValue &Shift) { + unsigned ShiftVal = 0; + AArch64_AM::ShiftExtendType Ext; + + if (N.getOpcode() == ISD::SHL) { + ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!CSD) + return false; + ShiftVal = CSD->getZExtValue(); + if (ShiftVal > 4) + return false; + + Ext = getExtendTypeForNode(N.getOperand(0)); + if (Ext == AArch64_AM::InvalidShiftExtend) + return false; + + Reg = N.getOperand(0).getOperand(0); + } else { + Ext = getExtendTypeForNode(N); + if (Ext == AArch64_AM::InvalidShiftExtend) + return false; - if (MOVType != DestType) { - ResNode = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, dl, - MVT::i64, MVT::i32, MVT::Other, - CurDAG->getTargetConstant(0, MVT::i64), - SDValue(ResNode, 0), - CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32)); + Reg = N.getOperand(0); } - return ResNode; + // AArch64 mandates that the RHS of the operation must use the smallest + // register class that could contain the size being extended from. Thus, + // if we're folding a (sext i8), we need the RHS to be a GPR32, even though + // there might not be an actual 32-bit value in the program. We can + // (harmlessly) synthesize one by injecting an EXTRACT_SUBREG here. + assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX); + Reg = narrowIfNeeded(CurDAG, Reg); + Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), MVT::i32); + return isWorthFolding(N); } -SDValue -AArch64DAGToDAGISel::getConstantPoolItemAddress(SDLoc DL, - const Constant *CV) { - EVT PtrVT = getTargetLowering()->getPointerTy(); - - switch (getTargetLowering()->getTargetMachine().getCodeModel()) { - case CodeModel::Small: { - unsigned Alignment = - getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType()); - return CurDAG->getNode( - AArch64ISD::WrapperSmall, DL, PtrVT, - CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_NO_FLAG), - CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_LO12), - CurDAG->getConstant(Alignment, MVT::i32)); - } - case CodeModel::Large: { - SDNode *LitAddr; - LitAddr = CurDAG->getMachineNode( - AArch64::MOVZxii, DL, PtrVT, - CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G3), - CurDAG->getTargetConstant(3, MVT::i32)); - LitAddr = CurDAG->getMachineNode( - AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0), - CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC), - CurDAG->getTargetConstant(2, MVT::i32)); - LitAddr = CurDAG->getMachineNode( - AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0), - CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC), - CurDAG->getTargetConstant(1, MVT::i32)); - LitAddr = CurDAG->getMachineNode( - AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0), - CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC), - CurDAG->getTargetConstant(0, MVT::i32)); - return SDValue(LitAddr, 0); +/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit +/// immediate" address. The "Size" argument is the size in bytes of the memory +/// reference, which determines the scale.
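// Aside: a standalone sketch (hypothetical helpers, not part of this patch)
// of the two offset ranges involved: the routine below wants a scaled
// offset, a non-negative multiple of the access size whose scaled value
// fits in 12 bits; SelectAddrModeUnscaled takes the leftovers, a signed
// 9-bit byte offset.
#include <cstdint>

static bool isLegalScaledOffset(int64_t Off, unsigned Size /* power of 2 */) {
  return Off >= 0 && Off % Size == 0 && Off / Size < 0x1000; // ldr x0, [x1, #Off]
}

static bool isLegalUnscaledOffset(int64_t Off) {
  return Off >= -256 && Off <= 255;                          // ldur x0, [x1, #Off]
}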
+bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, + SDValue &Base, SDValue &OffImm) { + const TargetLowering *TLI = getTargetLowering(); + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + OffImm = CurDAG->getTargetConstant(0, MVT::i64); + return true; } - default: - llvm_unreachable("Only small and large code models supported now"); + + if (N.getOpcode() == AArch64ISD::ADDlow) { + GlobalAddressSDNode *GAN = + dyn_cast(N.getOperand(1).getNode()); + Base = N.getOperand(0); + OffImm = N.getOperand(1); + if (!GAN) + return true; + + const GlobalValue *GV = GAN->getGlobal(); + unsigned Alignment = GV->getAlignment(); + const DataLayout *DL = TLI->getDataLayout(); + if (Alignment == 0 && !Subtarget->isTargetDarwin()) + Alignment = DL->getABITypeAlignment(GV->getType()->getElementType()); + + if (Alignment >= Size) + return true; + } + + if (CurDAG->isBaseWithConstantOffset(N)) { + if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { + int64_t RHSC = (int64_t)RHS->getZExtValue(); + unsigned Scale = Log2_32(Size); + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC >> Scale, MVT::i64); + return true; + } + } } + + // Before falling back to our general case, check if the unscaled + // instructions can handle this. If so, that's preferable. + if (SelectAddrModeUnscaled(N, Size, Base, OffImm)) + return false; + + // Base only. The address will be materialized into a register before + // the memory is accessed. + // add x0, Xbase, #offset + // ldr x0, [x0] + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i64); + return true; } -SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) { - SDLoc DL(Node); - uint64_t UnsignedVal = cast(Node)->getZExtValue(); - int64_t SignedVal = cast(Node)->getSExtValue(); - EVT DestType = Node->getValueType(0); - - // Since we may end up loading a 64-bit constant from a 32-bit entry the - // constant in the pool may have a different type to the eventual node. - ISD::LoadExtType Extension; - EVT MemType; - - assert((DestType == MVT::i64 || DestType == MVT::i32) - && "Only expect integer constants at the moment"); - - if (DestType == MVT::i32) { - Extension = ISD::NON_EXTLOAD; - MemType = MVT::i32; - } else if (UnsignedVal <= UINT32_MAX) { - Extension = ISD::ZEXTLOAD; - MemType = MVT::i32; - } else if (SignedVal >= INT32_MIN && SignedVal <= INT32_MAX) { - Extension = ISD::SEXTLOAD; - MemType = MVT::i32; - } else { - Extension = ISD::NON_EXTLOAD; - MemType = MVT::i64; - } - - Constant *CV = ConstantInt::get(Type::getIntNTy(*CurDAG->getContext(), - MemType.getSizeInBits()), - UnsignedVal); - SDValue PoolAddr = getConstantPoolItemAddress(DL, CV); - unsigned Alignment = - getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType()); - - return CurDAG->getExtLoad(Extension, DL, DestType, CurDAG->getEntryNode(), - PoolAddr, - MachinePointerInfo::getConstantPool(), MemType, - /* isVolatile = */ false, - /* isNonTemporal = */ false, - Alignment).getNode(); +/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit +/// immediate" address. This should only match when there is an offset that +/// is not valid for a scaled immediate addressing mode. 
The "Size" argument +/// is the size in bytes of the memory reference, which is needed here to know +/// what is valid for a scaled immediate. +bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size, + SDValue &Base, + SDValue &OffImm) { + if (!CurDAG->isBaseWithConstantOffset(N)) + return false; + if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { + int64_t RHSC = RHS->getSExtValue(); + // If the offset is valid as a scaled immediate, don't match here. + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && + RHSC < (0x1000 << Log2_32(Size))) + return false; + if (RHSC >= -256 && RHSC < 256) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast(Base)->getIndex(); + const TargetLowering *TLI = getTargetLowering(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i64); + return true; + } + } + return false; } -SDNode *AArch64DAGToDAGISel::LowerToFPLitPool(SDNode *Node) { - SDLoc DL(Node); - const ConstantFP *FV = cast(Node)->getConstantFPValue(); - EVT DestType = Node->getValueType(0); - - unsigned Alignment = - getTargetLowering()->getDataLayout()->getABITypeAlignment(FV->getType()); - SDValue PoolAddr = getConstantPoolItemAddress(DL, FV); - - return CurDAG->getLoad(DestType, DL, CurDAG->getEntryNode(), PoolAddr, - MachinePointerInfo::getConstantPool(), - /* isVolatile = */ false, - /* isNonTemporal = */ false, - /* isInvariant = */ true, - Alignment).getNode(); +static SDValue Widen(SelectionDAG *CurDAG, SDValue N) { + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + SDValue ImpDef = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SDLoc(N), MVT::i64), + 0); + MachineSDNode *Node = CurDAG->getMachineNode( + TargetOpcode::INSERT_SUBREG, SDLoc(N), MVT::i64, ImpDef, N, SubReg); + return SDValue(Node, 0); } -bool -AArch64DAGToDAGISel::SelectTSTBOperand(SDValue N, SDValue &FixedPos, - unsigned RegWidth) { - const ConstantSDNode *CN = dyn_cast(N); - if (!CN) return false; +/// \brief Check if the given SHL node (\p N), can be used to form an +/// extended register for an addressing mode. +bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size, + bool WantExtend, SDValue &Offset, + SDValue &SignExtend) { + assert(N.getOpcode() == ISD::SHL && "Invalid opcode."); + ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); + if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue()) + return false; + + if (WantExtend) { + AArch64_AM::ShiftExtendType Ext = + getExtendTypeForNode(N.getOperand(0), true); + if (Ext == AArch64_AM::InvalidShiftExtend) + return false; - uint64_t Val = CN->getZExtValue(); + Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32); + } else { + Offset = N.getOperand(0); + SignExtend = CurDAG->getTargetConstant(0, MVT::i32); + } - if (!isPowerOf2_64(Val)) return false; + unsigned LegalShiftVal = Log2_32(Size); + unsigned ShiftVal = CSD->getZExtValue(); - unsigned TestedBit = Log2_64(Val); - // Checks above should have guaranteed that we haven't lost information in - // finding TestedBit, but it must still be in range. 
- if (TestedBit >= RegWidth) return false; + if (ShiftVal != 0 && ShiftVal != LegalShiftVal) + return false; - FixedPos = CurDAG->getTargetConstant(TestedBit, MVT::i64); - return true; + if (isWorthFolding(N)) + return true; + + return false; } -SDNode *AArch64DAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8, - unsigned Op16,unsigned Op32, - unsigned Op64) { - // Mostly direct translation to the given operations, except that we preserve - // the AtomicOrdering for use later on. - AtomicSDNode *AN = cast(Node); - EVT VT = AN->getMemoryVT(); - - unsigned Op; - if (VT == MVT::i8) - Op = Op8; - else if (VT == MVT::i16) - Op = Op16; - else if (VT == MVT::i32) - Op = Op32; - else if (VT == MVT::i64) - Op = Op64; - else - llvm_unreachable("Unexpected atomic operation"); +bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, + SDValue &Base, SDValue &Offset, + SDValue &SignExtend, + SDValue &DoShift) { + if (N.getOpcode() != ISD::ADD) + return false; + SDValue LHS = N.getOperand(0); + SDValue RHS = N.getOperand(1); - SmallVector Ops; - for (unsigned i = 1; i < AN->getNumOperands(); ++i) - Ops.push_back(AN->getOperand(i)); + // We don't want to match immediate adds here, because they are better lowered + // to the register-immediate addressing modes. + if (isa(LHS) || isa(RHS)) + return false; + + // Check if this particular node is reused in any non-memory related + // operation. If yes, do not try to fold this node into the address + // computation, since the computation will be kept. + const SDNode *Node = N.getNode(); + for (SDNode *UI : Node->uses()) { + if (!isa(*UI)) + return false; + } + + // Remember if it is worth folding N when it produces extended register. + bool IsExtendedRegisterWorthFolding = isWorthFolding(N); + + // Try to match a shifted extend on the RHS. + if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) { + Base = LHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } + + // Try to match a shifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) { + Base = RHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } + + // There was no shift, whatever else we find. + DoShift = CurDAG->getTargetConstant(false, MVT::i32); + + AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend; + // Try to match an unshifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && + (Ext = getExtendTypeForNode(LHS, true)) != + AArch64_AM::InvalidShiftExtend) { + Base = RHS; + Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32); + if (isWorthFolding(LHS)) + return true; + } + + // Try to match an unshifted extend on the RHS. 
+ if (IsExtendedRegisterWorthFolding && + (Ext = getExtendTypeForNode(RHS, true)) != + AArch64_AM::InvalidShiftExtend) { + Base = LHS; + Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32); + if (isWorthFolding(RHS)) + return true; + } + + return false; +} + +bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, + SDValue &Base, SDValue &Offset, + SDValue &SignExtend, + SDValue &DoShift) { + if (N.getOpcode() != ISD::ADD) + return false; + SDValue LHS = N.getOperand(0); + SDValue RHS = N.getOperand(1); + + // We don't want to match immediate adds here, because they are better lowered + // to the register-immediate addressing modes. + if (isa(LHS) || isa(RHS)) + return false; + + // Check if this particular node is reused in any non-memory related + // operation. If yes, do not try to fold this node into the address + // computation, since the computation will be kept. + const SDNode *Node = N.getNode(); + for (SDNode *UI : Node->uses()) { + if (!isa(*UI)) + return false; + } + + // Remember if it is worth folding N when it produces extended register. + bool IsExtendedRegisterWorthFolding = isWorthFolding(N); + + // Try to match a shifted extend on the RHS. + if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) { + Base = LHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } - Ops.push_back(CurDAG->getTargetConstant(AN->getOrdering(), MVT::i32)); - Ops.push_back(AN->getOperand(0)); // Chain moves to the end + // Try to match a shifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) { + Base = RHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } - return CurDAG->SelectNodeTo(Node, Op, - AN->getValueType(0), MVT::Other, - &Ops[0], Ops.size()); + // Match any non-shifted, non-extend, non-immediate add expression. + Base = LHS; + Offset = RHS; + SignExtend = CurDAG->getTargetConstant(false, MVT::i32); + DoShift = CurDAG->getTargetConstant(false, MVT::i32); + // Reg1 + Reg2 is free: no check needed. 
+ return true; } SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef Regs) { - static unsigned RegClassIDs[] = { AArch64::DPairRegClassID, - AArch64::DTripleRegClassID, - AArch64::DQuadRegClassID }; - static unsigned SubRegs[] = { AArch64::dsub_0, AArch64::dsub_1, - AArch64::dsub_2, AArch64::dsub_3 }; + static unsigned RegClassIDs[] = { + AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID}; + static unsigned SubRegs[] = { AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2, AArch64::dsub3 }; return createTuple(Regs, RegClassIDs, SubRegs); } SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef Regs) { - static unsigned RegClassIDs[] = { AArch64::QPairRegClassID, - AArch64::QTripleRegClassID, - AArch64::QQuadRegClassID }; - static unsigned SubRegs[] = { AArch64::qsub_0, AArch64::qsub_1, - AArch64::qsub_2, AArch64::qsub_3 }; + static unsigned RegClassIDs[] = { + AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID}; + static unsigned SubRegs[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; return createTuple(Regs, RegClassIDs, SubRegs); } @@ -478,1100 +877,2159 @@ SDValue AArch64DAGToDAGISel::createTuple(ArrayRef Regs, return SDValue(N, 0); } +SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, + unsigned Opc, bool isExt) { + SDLoc dl(N); + EVT VT = N->getValueType(0); -// Get the register stride update opcode of a VLD/VST instruction that -// is otherwise equivalent to the given fixed stride updating instruction. -static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { - switch (Opc) { - default: break; - case AArch64::LD1WB_8B_fixed: return AArch64::LD1WB_8B_register; - case AArch64::LD1WB_4H_fixed: return AArch64::LD1WB_4H_register; - case AArch64::LD1WB_2S_fixed: return AArch64::LD1WB_2S_register; - case AArch64::LD1WB_1D_fixed: return AArch64::LD1WB_1D_register; - case AArch64::LD1WB_16B_fixed: return AArch64::LD1WB_16B_register; - case AArch64::LD1WB_8H_fixed: return AArch64::LD1WB_8H_register; - case AArch64::LD1WB_4S_fixed: return AArch64::LD1WB_4S_register; - case AArch64::LD1WB_2D_fixed: return AArch64::LD1WB_2D_register; - - case AArch64::LD2WB_8B_fixed: return AArch64::LD2WB_8B_register; - case AArch64::LD2WB_4H_fixed: return AArch64::LD2WB_4H_register; - case AArch64::LD2WB_2S_fixed: return AArch64::LD2WB_2S_register; - case AArch64::LD2WB_16B_fixed: return AArch64::LD2WB_16B_register; - case AArch64::LD2WB_8H_fixed: return AArch64::LD2WB_8H_register; - case AArch64::LD2WB_4S_fixed: return AArch64::LD2WB_4S_register; - case AArch64::LD2WB_2D_fixed: return AArch64::LD2WB_2D_register; - - case AArch64::LD3WB_8B_fixed: return AArch64::LD3WB_8B_register; - case AArch64::LD3WB_4H_fixed: return AArch64::LD3WB_4H_register; - case AArch64::LD3WB_2S_fixed: return AArch64::LD3WB_2S_register; - case AArch64::LD3WB_16B_fixed: return AArch64::LD3WB_16B_register; - case AArch64::LD3WB_8H_fixed: return AArch64::LD3WB_8H_register; - case AArch64::LD3WB_4S_fixed: return AArch64::LD3WB_4S_register; - case AArch64::LD3WB_2D_fixed: return AArch64::LD3WB_2D_register; - - case AArch64::LD4WB_8B_fixed: return AArch64::LD4WB_8B_register; - case AArch64::LD4WB_4H_fixed: return AArch64::LD4WB_4H_register; - case AArch64::LD4WB_2S_fixed: return AArch64::LD4WB_2S_register; - case AArch64::LD4WB_16B_fixed: return AArch64::LD4WB_16B_register; - case AArch64::LD4WB_8H_fixed: return AArch64::LD4WB_8H_register; - case AArch64::LD4WB_4S_fixed: return AArch64::LD4WB_4S_register; - case AArch64::LD4WB_2D_fixed: return 
AArch64::LD4WB_2D_register; - - case AArch64::LD1x2WB_8B_fixed: return AArch64::LD1x2WB_8B_register; - case AArch64::LD1x2WB_4H_fixed: return AArch64::LD1x2WB_4H_register; - case AArch64::LD1x2WB_2S_fixed: return AArch64::LD1x2WB_2S_register; - case AArch64::LD1x2WB_1D_fixed: return AArch64::LD1x2WB_1D_register; - case AArch64::LD1x2WB_16B_fixed: return AArch64::LD1x2WB_16B_register; - case AArch64::LD1x2WB_8H_fixed: return AArch64::LD1x2WB_8H_register; - case AArch64::LD1x2WB_4S_fixed: return AArch64::LD1x2WB_4S_register; - case AArch64::LD1x2WB_2D_fixed: return AArch64::LD1x2WB_2D_register; - - case AArch64::LD1x3WB_8B_fixed: return AArch64::LD1x3WB_8B_register; - case AArch64::LD1x3WB_4H_fixed: return AArch64::LD1x3WB_4H_register; - case AArch64::LD1x3WB_2S_fixed: return AArch64::LD1x3WB_2S_register; - case AArch64::LD1x3WB_1D_fixed: return AArch64::LD1x3WB_1D_register; - case AArch64::LD1x3WB_16B_fixed: return AArch64::LD1x3WB_16B_register; - case AArch64::LD1x3WB_8H_fixed: return AArch64::LD1x3WB_8H_register; - case AArch64::LD1x3WB_4S_fixed: return AArch64::LD1x3WB_4S_register; - case AArch64::LD1x3WB_2D_fixed: return AArch64::LD1x3WB_2D_register; - - case AArch64::LD1x4WB_8B_fixed: return AArch64::LD1x4WB_8B_register; - case AArch64::LD1x4WB_4H_fixed: return AArch64::LD1x4WB_4H_register; - case AArch64::LD1x4WB_2S_fixed: return AArch64::LD1x4WB_2S_register; - case AArch64::LD1x4WB_1D_fixed: return AArch64::LD1x4WB_1D_register; - case AArch64::LD1x4WB_16B_fixed: return AArch64::LD1x4WB_16B_register; - case AArch64::LD1x4WB_8H_fixed: return AArch64::LD1x4WB_8H_register; - case AArch64::LD1x4WB_4S_fixed: return AArch64::LD1x4WB_4S_register; - case AArch64::LD1x4WB_2D_fixed: return AArch64::LD1x4WB_2D_register; - - case AArch64::ST1WB_8B_fixed: return AArch64::ST1WB_8B_register; - case AArch64::ST1WB_4H_fixed: return AArch64::ST1WB_4H_register; - case AArch64::ST1WB_2S_fixed: return AArch64::ST1WB_2S_register; - case AArch64::ST1WB_1D_fixed: return AArch64::ST1WB_1D_register; - case AArch64::ST1WB_16B_fixed: return AArch64::ST1WB_16B_register; - case AArch64::ST1WB_8H_fixed: return AArch64::ST1WB_8H_register; - case AArch64::ST1WB_4S_fixed: return AArch64::ST1WB_4S_register; - case AArch64::ST1WB_2D_fixed: return AArch64::ST1WB_2D_register; - - case AArch64::ST2WB_8B_fixed: return AArch64::ST2WB_8B_register; - case AArch64::ST2WB_4H_fixed: return AArch64::ST2WB_4H_register; - case AArch64::ST2WB_2S_fixed: return AArch64::ST2WB_2S_register; - case AArch64::ST2WB_16B_fixed: return AArch64::ST2WB_16B_register; - case AArch64::ST2WB_8H_fixed: return AArch64::ST2WB_8H_register; - case AArch64::ST2WB_4S_fixed: return AArch64::ST2WB_4S_register; - case AArch64::ST2WB_2D_fixed: return AArch64::ST2WB_2D_register; - - case AArch64::ST3WB_8B_fixed: return AArch64::ST3WB_8B_register; - case AArch64::ST3WB_4H_fixed: return AArch64::ST3WB_4H_register; - case AArch64::ST3WB_2S_fixed: return AArch64::ST3WB_2S_register; - case AArch64::ST3WB_16B_fixed: return AArch64::ST3WB_16B_register; - case AArch64::ST3WB_8H_fixed: return AArch64::ST3WB_8H_register; - case AArch64::ST3WB_4S_fixed: return AArch64::ST3WB_4S_register; - case AArch64::ST3WB_2D_fixed: return AArch64::ST3WB_2D_register; - - case AArch64::ST4WB_8B_fixed: return AArch64::ST4WB_8B_register; - case AArch64::ST4WB_4H_fixed: return AArch64::ST4WB_4H_register; - case AArch64::ST4WB_2S_fixed: return AArch64::ST4WB_2S_register; - case AArch64::ST4WB_16B_fixed: return AArch64::ST4WB_16B_register; - case AArch64::ST4WB_8H_fixed: return 
AArch64::ST4WB_8H_register; - case AArch64::ST4WB_4S_fixed: return AArch64::ST4WB_4S_register; - case AArch64::ST4WB_2D_fixed: return AArch64::ST4WB_2D_register; - - case AArch64::ST1x2WB_8B_fixed: return AArch64::ST1x2WB_8B_register; - case AArch64::ST1x2WB_4H_fixed: return AArch64::ST1x2WB_4H_register; - case AArch64::ST1x2WB_2S_fixed: return AArch64::ST1x2WB_2S_register; - case AArch64::ST1x2WB_1D_fixed: return AArch64::ST1x2WB_1D_register; - case AArch64::ST1x2WB_16B_fixed: return AArch64::ST1x2WB_16B_register; - case AArch64::ST1x2WB_8H_fixed: return AArch64::ST1x2WB_8H_register; - case AArch64::ST1x2WB_4S_fixed: return AArch64::ST1x2WB_4S_register; - case AArch64::ST1x2WB_2D_fixed: return AArch64::ST1x2WB_2D_register; - - case AArch64::ST1x3WB_8B_fixed: return AArch64::ST1x3WB_8B_register; - case AArch64::ST1x3WB_4H_fixed: return AArch64::ST1x3WB_4H_register; - case AArch64::ST1x3WB_2S_fixed: return AArch64::ST1x3WB_2S_register; - case AArch64::ST1x3WB_1D_fixed: return AArch64::ST1x3WB_1D_register; - case AArch64::ST1x3WB_16B_fixed: return AArch64::ST1x3WB_16B_register; - case AArch64::ST1x3WB_8H_fixed: return AArch64::ST1x3WB_8H_register; - case AArch64::ST1x3WB_4S_fixed: return AArch64::ST1x3WB_4S_register; - case AArch64::ST1x3WB_2D_fixed: return AArch64::ST1x3WB_2D_register; - - case AArch64::ST1x4WB_8B_fixed: return AArch64::ST1x4WB_8B_register; - case AArch64::ST1x4WB_4H_fixed: return AArch64::ST1x4WB_4H_register; - case AArch64::ST1x4WB_2S_fixed: return AArch64::ST1x4WB_2S_register; - case AArch64::ST1x4WB_1D_fixed: return AArch64::ST1x4WB_1D_register; - case AArch64::ST1x4WB_16B_fixed: return AArch64::ST1x4WB_16B_register; - case AArch64::ST1x4WB_8H_fixed: return AArch64::ST1x4WB_8H_register; - case AArch64::ST1x4WB_4S_fixed: return AArch64::ST1x4WB_4S_register; - case AArch64::ST1x4WB_2D_fixed: return AArch64::ST1x4WB_2D_register; - - // Post-index of duplicate loads - case AArch64::LD2R_WB_8B_fixed: return AArch64::LD2R_WB_8B_register; - case AArch64::LD2R_WB_4H_fixed: return AArch64::LD2R_WB_4H_register; - case AArch64::LD2R_WB_2S_fixed: return AArch64::LD2R_WB_2S_register; - case AArch64::LD2R_WB_1D_fixed: return AArch64::LD2R_WB_1D_register; - case AArch64::LD2R_WB_16B_fixed: return AArch64::LD2R_WB_16B_register; - case AArch64::LD2R_WB_8H_fixed: return AArch64::LD2R_WB_8H_register; - case AArch64::LD2R_WB_4S_fixed: return AArch64::LD2R_WB_4S_register; - case AArch64::LD2R_WB_2D_fixed: return AArch64::LD2R_WB_2D_register; - - case AArch64::LD3R_WB_8B_fixed: return AArch64::LD3R_WB_8B_register; - case AArch64::LD3R_WB_4H_fixed: return AArch64::LD3R_WB_4H_register; - case AArch64::LD3R_WB_2S_fixed: return AArch64::LD3R_WB_2S_register; - case AArch64::LD3R_WB_1D_fixed: return AArch64::LD3R_WB_1D_register; - case AArch64::LD3R_WB_16B_fixed: return AArch64::LD3R_WB_16B_register; - case AArch64::LD3R_WB_8H_fixed: return AArch64::LD3R_WB_8H_register; - case AArch64::LD3R_WB_4S_fixed: return AArch64::LD3R_WB_4S_register; - case AArch64::LD3R_WB_2D_fixed: return AArch64::LD3R_WB_2D_register; - - case AArch64::LD4R_WB_8B_fixed: return AArch64::LD4R_WB_8B_register; - case AArch64::LD4R_WB_4H_fixed: return AArch64::LD4R_WB_4H_register; - case AArch64::LD4R_WB_2S_fixed: return AArch64::LD4R_WB_2S_register; - case AArch64::LD4R_WB_1D_fixed: return AArch64::LD4R_WB_1D_register; - case AArch64::LD4R_WB_16B_fixed: return AArch64::LD4R_WB_16B_register; - case AArch64::LD4R_WB_8H_fixed: return AArch64::LD4R_WB_8H_register; - case AArch64::LD4R_WB_4S_fixed: return 
AArch64::LD4R_WB_4S_register; - case AArch64::LD4R_WB_2D_fixed: return AArch64::LD4R_WB_2D_register; - - // Post-index of lane loads - case AArch64::LD2LN_WB_B_fixed: return AArch64::LD2LN_WB_B_register; - case AArch64::LD2LN_WB_H_fixed: return AArch64::LD2LN_WB_H_register; - case AArch64::LD2LN_WB_S_fixed: return AArch64::LD2LN_WB_S_register; - case AArch64::LD2LN_WB_D_fixed: return AArch64::LD2LN_WB_D_register; - - case AArch64::LD3LN_WB_B_fixed: return AArch64::LD3LN_WB_B_register; - case AArch64::LD3LN_WB_H_fixed: return AArch64::LD3LN_WB_H_register; - case AArch64::LD3LN_WB_S_fixed: return AArch64::LD3LN_WB_S_register; - case AArch64::LD3LN_WB_D_fixed: return AArch64::LD3LN_WB_D_register; - - case AArch64::LD4LN_WB_B_fixed: return AArch64::LD4LN_WB_B_register; - case AArch64::LD4LN_WB_H_fixed: return AArch64::LD4LN_WB_H_register; - case AArch64::LD4LN_WB_S_fixed: return AArch64::LD4LN_WB_S_register; - case AArch64::LD4LN_WB_D_fixed: return AArch64::LD4LN_WB_D_register; - - // Post-index of lane stores - case AArch64::ST2LN_WB_B_fixed: return AArch64::ST2LN_WB_B_register; - case AArch64::ST2LN_WB_H_fixed: return AArch64::ST2LN_WB_H_register; - case AArch64::ST2LN_WB_S_fixed: return AArch64::ST2LN_WB_S_register; - case AArch64::ST2LN_WB_D_fixed: return AArch64::ST2LN_WB_D_register; - - case AArch64::ST3LN_WB_B_fixed: return AArch64::ST3LN_WB_B_register; - case AArch64::ST3LN_WB_H_fixed: return AArch64::ST3LN_WB_H_register; - case AArch64::ST3LN_WB_S_fixed: return AArch64::ST3LN_WB_S_register; - case AArch64::ST3LN_WB_D_fixed: return AArch64::ST3LN_WB_D_register; - - case AArch64::ST4LN_WB_B_fixed: return AArch64::ST4LN_WB_B_register; - case AArch64::ST4LN_WB_H_fixed: return AArch64::ST4LN_WB_H_register; - case AArch64::ST4LN_WB_S_fixed: return AArch64::ST4LN_WB_S_register; - case AArch64::ST4LN_WB_D_fixed: return AArch64::ST4LN_WB_D_register; - } - return Opc; // If not one we handle, return it unchanged. -} - -SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, - unsigned NumVecs, - const uint16_t *Opcodes) { - assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); + unsigned ExtOff = isExt; - EVT VT = N->getValueType(0); - unsigned OpcodeIndex; - bool is64BitVector = VT.is64BitVector(); - switch (VT.getScalarType().getSizeInBits()) { - case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; - case 16: OpcodeIndex = is64BitVector ? 1 : 5; break; - case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; - case 64: OpcodeIndex = is64BitVector ? 3 : 7; break; - default: llvm_unreachable("unhandled vector load type"); - } - unsigned Opc = Opcodes[OpcodeIndex]; + // Form a REG_SEQUENCE to force register allocation. + unsigned Vec0Off = ExtOff + 1; + SmallVector Regs(N->op_begin() + Vec0Off, + N->op_begin() + Vec0Off + NumVecs); + SDValue RegSeq = createQTuple(Regs); - SmallVector Ops; - unsigned AddrOpIdx = isUpdating ? 
1 : 2; - Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address + SmallVector Ops; + if (isExt) + Ops.push_back(N->getOperand(1)); + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(NumVecs + ExtOff + 1)); + return CurDAG->getMachineNode(Opc, dl, VT, Ops); +} - if (isUpdating) { - SDValue Inc = N->getOperand(AddrOpIdx + 1); - if (!isa(Inc.getNode())) // Increment in Register - Opc = getVLDSTRegisterUpdateOpcode(Opc); - Ops.push_back(Inc); +SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { + LoadSDNode *LD = cast(N); + if (LD->isUnindexed()) + return nullptr; + EVT VT = LD->getMemoryVT(); + EVT DstVT = N->getValueType(0); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC; + + // We're not doing validity checking here. That was done when checking + // if we should mark the load as indexed or not. We're just selecting + // the right instruction. + unsigned Opcode = 0; + + ISD::LoadExtType ExtType = LD->getExtensionType(); + bool InsertTo64 = false; + if (VT == MVT::i64) + Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost; + else if (VT == MVT::i32) { + if (ExtType == ISD::NON_EXTLOAD) + Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost; + else if (ExtType == ISD::SEXTLOAD) + Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost; + else { + Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost; + InsertTo64 = true; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::i16) { + if (ExtType == ISD::SEXTLOAD) { + if (DstVT == MVT::i64) + Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost; + else + Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost; + } else { + Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost; + InsertTo64 = DstVT == MVT::i64; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::i8) { + if (ExtType == ISD::SEXTLOAD) { + if (DstVT == MVT::i64) + Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost; + else + Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost; + } else { + Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost; + InsertTo64 = DstVT == MVT::i64; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::f32) { + Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost; + } else if (VT == MVT::f64 || VT.is64BitVector()) { + Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost; + } else if (VT.is128BitVector()) { + Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost; + } else + return nullptr; + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + ConstantSDNode *OffsetOp = cast(LD->getOffset()); + int OffsetVal = (int)OffsetOp->getZExtValue(); + SDValue Offset = CurDAG->getTargetConstant(OffsetVal, MVT::i64); + SDValue Ops[] = { Base, Offset, Chain }; + SDNode *Res = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i64, DstVT, + MVT::Other, Ops); + // Either way, we're replacing the node, so tell the caller that. 
+ Done = true; + SDValue LoadedVal = SDValue(Res, 1); + if (InsertTo64) { + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + LoadedVal = + SDValue(CurDAG->getMachineNode( + AArch64::SUBREG_TO_REG, SDLoc(N), MVT::i64, + CurDAG->getTargetConstant(0, MVT::i64), LoadedVal, SubReg), + 0); } - Ops.push_back(N->getOperand(0)); // Push back the Chain + ReplaceUses(SDValue(N, 0), LoadedVal); + ReplaceUses(SDValue(N, 1), SDValue(Res, 0)); + ReplaceUses(SDValue(N, 2), SDValue(Res, 2)); - SmallVector ResTys; - // Push back the type of return super register - if (NumVecs == 1) - ResTys.push_back(VT); - else if (NumVecs == 3) - ResTys.push_back(MVT::Untyped); - else { - EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, - is64BitVector ? NumVecs : NumVecs * 2); - ResTys.push_back(ResTy); - } - - if (isUpdating) - ResTys.push_back(MVT::i64); // Type of the updated register - ResTys.push_back(MVT::Other); // Type of the Chain + return nullptr; +} + +SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, + unsigned Opc, unsigned SubRegIdx) { SDLoc dl(N); - SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); - // Transfer memoperands. - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(N)->getMemOperand(); - cast(VLd)->setMemRefs(MemOp, MemOp + 1); + SmallVector Ops; + Ops.push_back(N->getOperand(2)); // Mem operand; + Ops.push_back(Chain); - if (NumVecs == 1) - return VLd; - - // If NumVecs > 1, the return result is a super register containing 2-4 - // consecutive vector registers. - SDValue SuperReg = SDValue(VLd, 0); - - unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0; - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - ReplaceUses(SDValue(N, Vec), - CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); - // Update users of the Chain - ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); - if (isUpdating) - ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2)); - - return NULL; + std::vector ResTys; + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + SDValue SuperReg = SDValue(Ld, 0); + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), + CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); + + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); + return nullptr; } -SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, - unsigned NumVecs, - const uint16_t *Opcodes) { - assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); +SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, + unsigned Opc, unsigned SubRegIdx) { SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(N)->getMemOperand(); + SmallVector Ops; + Ops.push_back(N->getOperand(1)); // Mem operand + Ops.push_back(N->getOperand(2)); // Incremental + Ops.push_back(Chain); - unsigned AddrOpIdx = isUpdating ? 1 : 2; - unsigned Vec0Idx = 3; - EVT VT = N->getOperand(Vec0Idx).getValueType(); - unsigned OpcodeIndex; - bool is64BitVector = VT.is64BitVector(); - switch (VT.getScalarType().getSizeInBits()) { - case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; - case 16: OpcodeIndex = is64BitVector ? 1 : 5; break; - case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; - case 64: OpcodeIndex = is64BitVector ? 
3 : 7; break; - default: llvm_unreachable("unhandled vector store type"); - } - unsigned Opc = Opcodes[OpcodeIndex]; + std::vector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Update uses of write back register + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); + + // Update uses of vector list + SDValue SuperReg = SDValue(Ld, 1); + if (NumVecs == 1) + ReplaceUses(SDValue(N, 0), SuperReg); + else + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), + CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); + + // Update the chain + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); + return nullptr; +} + +SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + + // Form a REG_SEQUENCE to force register allocation. + bool Is128Bit = VT.getSizeInBits() == 128; + SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(NumVecs + 2)); + Ops.push_back(N->getOperand(0)); + SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); + + return St; +} +SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); SmallVector ResTys; - if (isUpdating) - ResTys.push_back(MVT::i64); + ResTys.push_back(MVT::i64); // Type of the write back register ResTys.push_back(MVT::Other); // Type for the Chain + // Form a REG_SEQUENCE to force register allocation. + bool Is128Bit = VT.getSizeInBits() == 128; + SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); + SmallVector Ops; - Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(NumVecs + 1)); // base register + Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental + Ops.push_back(N->getOperand(0)); // Chain + SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + return St; +} + +/// WidenVector - Given a value in the V64 register class, produce the +/// equivalent value in the V128 register class. +class WidenVector { + SelectionDAG &DAG; - if (isUpdating) { - SDValue Inc = N->getOperand(AddrOpIdx + 1); - if (!isa(Inc.getNode())) // Increment in Register - Opc = getVLDSTRegisterUpdateOpcode(Opc); - Ops.push_back(Inc); +public: + WidenVector(SelectionDAG &DAG) : DAG(DAG) {} + + SDValue operator()(SDValue V64Reg) { + EVT VT = V64Reg.getValueType(); + unsigned NarrowSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); + SDLoc DL(V64Reg); + + SDValue Undef = + SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0); + return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg); } +}; + +/// NarrowVector - Given a value in the V128 register class, produce the +/// equivalent value in the V64 register class. 
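// Aside: a conceptual model (plain C++, not SelectionDAG machinery) of the
// WidenVector/NarrowVector pair defined here: widening places a 64-bit
// vector in the low (dsub) half of a 128-bit register whose high half is
// left undefined, and narrowing recovers that low half. The lane load/store
// instructions only exist in 128-bit form, hence the round trip.
#include <cstdint>

struct V128 { uint64_t Lo, Hi; };

static V128 widen(uint64_t V64) {
  return V128{V64, 0}; // the high half is really undef; zero stands in here
}
static uint64_t narrow(const V128 &V) { return V.Lo; }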
+static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { + EVT VT = V128Reg.getValueType(); + unsigned WideSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); + + return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy, + V128Reg); +} + +SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. + SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + std::vector ResTys; + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); - SmallVector Regs(N->op_begin() + Vec0Idx, - N->op_begin() + Vec0Idx + NumVecs); - SDValue SrcReg = is64BitVector ? createDTuple(Regs) : createQTuple(Regs); - Ops.push_back(SrcReg); + unsigned LaneNo = + cast(N->getOperand(NumVecs + 2))->getZExtValue(); - // Push back the Chain + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); + Ops.push_back(N->getOperand(NumVecs + 3)); Ops.push_back(N->getOperand(0)); + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + SDValue SuperReg = SDValue(Ld, 0); + + EVT WideVT = RegSeq.getOperand(1)->getValueType(0); + static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, + AArch64::qsub3 }; + for (unsigned i = 0; i < NumVecs; ++i) { + SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); + if (Narrow) + NV = NarrowVector(NV, *CurDAG); + ReplaceUses(SDValue(N, i), NV); + } - // Transfer memoperands. - SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - cast(VSt)->setMemRefs(MemOp, MemOp + 1); + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - return VSt; + return Ld; } -SDValue -AArch64DAGToDAGISel::getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD, - SDValue Operand) { - SDNode *Reg = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, DL, - VT, VTD, MVT::Other, - CurDAG->getTargetConstant(0, MVT::i64), - Operand, - CurDAG->getTargetConstant(AArch64::sub_64, MVT::i32)); - return SDValue(Reg, 0); +SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. 
+ SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + std::vector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 1))->getZExtValue(); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number + Ops.push_back(N->getOperand(NumVecs + 2)); // Base register + Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental + Ops.push_back(N->getOperand(0)); + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Update uses of the write back register + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); + + // Update uses of the vector list + SDValue SuperReg = SDValue(Ld, 1); + if (NumVecs == 1) { + ReplaceUses(SDValue(N, 0), + Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg); + } else { + EVT WideVT = RegSeq.getOperand(1)->getValueType(0); + static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, + AArch64::qsub3 }; + for (unsigned i = 0; i < NumVecs; ++i) { + SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, + SuperReg); + if (Narrow) + NV = NarrowVector(NV, *CurDAG); + ReplaceUses(SDValue(N, i), NV); + } + } + + // Update the Chain + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); + + return Ld; } -SDNode *AArch64DAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, - unsigned NumVecs, - const uint16_t *Opcodes) { - assert(NumVecs >=2 && NumVecs <= 4 && "Load Dup NumVecs out-of-range"); +SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. + SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 2))->getZExtValue(); - EVT VT = N->getValueType(0); - unsigned OpcodeIndex; - bool is64BitVector = VT.is64BitVector(); - switch (VT.getScalarType().getSizeInBits()) { - case 8: OpcodeIndex = is64BitVector ? 0 : 4; break; - case 16: OpcodeIndex = is64BitVector ? 1 : 5; break; - case 32: OpcodeIndex = is64BitVector ? 2 : 6; break; - case 64: OpcodeIndex = is64BitVector ? 3 : 7; break; - default: llvm_unreachable("unhandled vector duplicate lane load type"); - } - unsigned Opc = Opcodes[OpcodeIndex]; - - SDValue SuperReg; SmallVector Ops; - Ops.push_back(N->getOperand(1)); // Push back the Memory Address - if (isUpdating) { - SDValue Inc = N->getOperand(2); - if (!isa(Inc.getNode())) // Increment in Register - Opc = getVLDSTRegisterUpdateOpcode(Opc); - Ops.push_back(Inc); - } - Ops.push_back(N->getOperand(0)); // Push back the Chain - - SmallVector ResTys; - // Push back the type of return super register - if (NumVecs == 3) - ResTys.push_back(MVT::Untyped); - else { - EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, - is64BitVector ? 
NumVecs : NumVecs * 2); - ResTys.push_back(ResTy); - } - if (isUpdating) - ResTys.push_back(MVT::i64); // Type of the updated register - ResTys.push_back(MVT::Other); // Type of the Chain - SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); + Ops.push_back(N->getOperand(NumVecs + 3)); + Ops.push_back(N->getOperand(0)); + SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); // Transfer memoperands. MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); - cast(VLdDup)->setMemRefs(MemOp, MemOp + 1); - - SuperReg = SDValue(VLdDup, 0); - unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0; - // Update uses of each registers in super register - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - ReplaceUses(SDValue(N, Vec), - CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg)); - // Update uses of the Chain - ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1)); - if (isUpdating) - ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2)); - return NULL; + cast(St)->setMemRefs(MemOp, MemOp + 1); + + return St; } -// We only have 128-bit vector type of load/store lane instructions. -// If it is 64-bit vector, we also select it to the 128-bit instructions. -// Just use SUBREG_TO_REG to adapt the input to 128-bit vector and -// EXTRACT_SUBREG to get the 64-bit vector from the 128-bit vector output. -SDNode *AArch64DAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, - bool isUpdating, unsigned NumVecs, - const uint16_t *Opcodes) { - assert(NumVecs >= 2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); +SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); - unsigned AddrOpIdx = isUpdating ? 1 : 2; - unsigned Vec0Idx = 3; + EVT VT = N->getOperand(2)->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; - SDValue Chain = N->getOperand(0); - unsigned Lane = - cast(N->getOperand(Vec0Idx + NumVecs))->getZExtValue(); - EVT VT = N->getOperand(Vec0Idx).getValueType(); - bool is64BitVector = VT.is64BitVector(); - EVT VT64; // 64-bit Vector Type - - if (is64BitVector) { - VT64 = VT; - VT = EVT::getVectorVT(*CurDAG->getContext(), VT.getVectorElementType(), - VT.getVectorNumElements() * 2); - } - - unsigned OpcodeIndex; - switch (VT.getScalarType().getSizeInBits()) { - case 8: OpcodeIndex = 0; break; - case 16: OpcodeIndex = 1; break; - case 32: OpcodeIndex = 2; break; - case 64: OpcodeIndex = 3; break; - default: llvm_unreachable("unhandled vector lane load/store type"); - } - unsigned Opc = Opcodes[OpcodeIndex]; - - SmallVector ResTys; - if (IsLoad) { - // Push back the type of return super register - if (NumVecs == 3) - ResTys.push_back(MVT::Untyped); - else { - EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, - is64BitVector ? 
NumVecs : NumVecs * 2); - ResTys.push_back(ResTy); - } - } - if (isUpdating) - ResTys.push_back(MVT::i64); // Type of the updated register - ResTys.push_back(MVT::Other); // Type of Chain - SmallVector Ops; - Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address - if (isUpdating) { - SDValue Inc = N->getOperand(AddrOpIdx + 1); - if (!isa(Inc.getNode())) // Increment in Register - Opc = getVLDSTRegisterUpdateOpcode(Opc); - Ops.push_back(Inc); - } - - SmallVector Regs(N->op_begin() + Vec0Idx, - N->op_begin() + Vec0Idx + NumVecs); - if (is64BitVector) - for (unsigned i = 0; i < Regs.size(); i++) - Regs[i] = getTargetSubregToReg(AArch64::sub_64, dl, VT, VT64, Regs[i]); - SDValue SuperReg = createQTuple(Regs); - - Ops.push_back(SuperReg); // Source Reg - SDValue LaneValue = CurDAG->getTargetConstant(Lane, MVT::i32); - Ops.push_back(LaneValue); - Ops.push_back(Chain); // Push back the Chain - - SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + // Form a REG_SEQUENCE to force register allocation. + SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + SmallVector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Other); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 1))->getZExtValue(); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); + Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register + Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental + Ops.push_back(N->getOperand(0)); + SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Transfer memoperands. MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); - cast(VLdLn)->setMemRefs(MemOp, MemOp + 1); - if (!IsLoad) - return VLdLn; - - // Extract the subregisters. - SuperReg = SDValue(VLdLn, 0); - unsigned Sub0 = AArch64::qsub_0; - // Update uses of each registers in super register - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) { - SDValue SUB0 = CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg); - if (is64BitVector) { - SUB0 = CurDAG->getTargetExtractSubreg(AArch64::sub_64, dl, VT64, SUB0); - } - ReplaceUses(SDValue(N, Vec), SUB0); + cast(St)->setMemRefs(MemOp, MemOp + 1); + + return St; +} + +static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, + unsigned &Opc, SDValue &Opd0, + unsigned &LSB, unsigned &MSB, + unsigned NumberOfIgnoredLowBits, + bool BiggerPattern) { + assert(N->getOpcode() == ISD::AND && + "N must be a AND operation to call this function"); + + EVT VT = N->getValueType(0); + + // Here we can test the type of VT and return false when the type does not + // match, but since it is done prior to that call in the current context + // we turned that into an assert to avoid redundant code. + assert((VT == MVT::i32 || VT == MVT::i64) && + "Type checking must have been done before calling this function"); + + // FIXME: simplify-demanded-bits in DAGCombine will probably have + // changed the AND node to a 32-bit mask operation. We'll have to + // undo that as part of the transform here if we want to catch all + // the opportunities. + // Currently the NumberOfIgnoredLowBits argument helps to recover + // form these situations when matching bigger pattern (bitfield insert). 
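// Aside: a worked example (standalone sketch, not part of this patch) of
// the AND + SRL form this function rewrites into a UBFM: for
// "(x >> 4) & 0xff" the mask has 8 trailing ones, so the extract covers
// bits [4, 11], i.e. LSB = 4 and MSB = 4 + 8 - 1 = 11.
#include <cstdint>

static constexpr uint64_t ubfx(uint64_t X, unsigned LSB, unsigned MSB) {
  return (X >> LSB) & ((1ULL << (MSB - LSB + 1)) - 1); // what UBFM computes
}
static_assert(ubfx(0xabcdULL, 4, 11) == ((0xabcdULL >> 4) & 0xff),
              "AND+SRL form and UBFM form agree");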
+ + // For unsigned extracts, check for a shift right and mask + uint64_t And_imm = 0; + if (!isOpcWithIntImmediate(N, ISD::AND, And_imm)) + return false; + + const SDNode *Op0 = N->getOperand(0).getNode(); + + // Because of simplify-demanded-bits in DAGCombine, the mask may have been + // simplified. Try to undo that. + And_imm |= (1 << NumberOfIgnoredLowBits) - 1; + + // The immediate is a mask of the low bits iff imm & (imm+1) == 0 + if (And_imm & (And_imm + 1)) + return false; + + bool ClampMSB = false; + uint64_t Srl_imm = 0; + // Handle the SRL + ANY_EXTEND case. + if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND && + isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) { + // Extend the incoming operand of the SRL to 64-bit. + Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0)); + // Make sure to clamp the MSB so that we preserve the semantics of the + // original operations. + ClampMSB = true; + } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE && + isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, + Srl_imm)) { + // If the shift result was truncated, we can still combine them. + Opd0 = Op0->getOperand(0).getOperand(0); + + // Use the type of SRL node. + VT = Opd0->getValueType(0); + } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) { + Opd0 = Op0->getOperand(0); + } else if (BiggerPattern) { + // Let's pretend a 0 shift right has been performed. + // The resulting code will be at least as good as the original one + // plus it may expose more opportunities for the bitfield insert pattern. + // FIXME: Currently we limit this to the bigger pattern, because + // some optimizations expect AND and not UBFM + Opd0 = N->getOperand(0); + } else + return false; + + assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) && + "bad amount in shift node!"); + + LSB = Srl_imm; + MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm) + : CountTrailingOnes_64(And_imm)) - + 1; + if (ClampMSB) + // Since we're moving the extend before the right shift operation, we need + // to clamp the MSB to make sure we don't shift in undefined bits instead of + // the zeros which would get shifted in with the original right shift + // operation. + MSB = MSB > 31 ? 31 : MSB; + + Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri; + return true; +} + +static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, + unsigned &LSB, unsigned &MSB) { + // We are looking for the following pattern which basically extracts a single + // bit from the source value and places it in the LSB of the destination + // value, while all other bits of the destination value are set to zero: + // + // Value2 = AND Value, MaskImm + // SRL Value2, ShiftImm + // + // with MaskImm >> ShiftImm == 1. + // + // This gets selected into a single UBFM: + // + // UBFM Value, ShiftImm, ShiftImm + // + + if (N->getOpcode() != ISD::SRL) + return false; + + uint64_t And_mask = 0; + if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask)) + return false; + + Opd0 = N->getOperand(0).getOperand(0); + + uint64_t Srl_imm = 0; + if (!isIntImmediate(N->getOperand(1), Srl_imm)) + return false; + + // Check whether we really have a one bit extract here.
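[Editorial aside, not part of the patch: the one-bit-extract test that follows, MaskImm >> ShiftImm == 1, has a compact standalone form. The names below are illustrative, not LLVM APIs.]

    // Sketch: (Val & Mask) >> Shift keeps exactly one bit iff Mask >> Shift == 1,
    // and then maps onto UBFM Val, Shift, Shift (LSB == MSB == Shift).
    #include <cassert>
    #include <cstdint>

    static bool isOneBitExtract(uint64_t Mask, unsigned Shift, unsigned &BitPos) {
      if ((Mask >> Shift) != 1)
        return false;
      BitPos = Shift; // serves as both UBFM immediates
      return true;
    }

    int main() {
      unsigned Pos;
      assert(isOneBitExtract(0x20, 5, Pos) && Pos == 5); // (x & 0x20) >> 5
      assert(!isOneBitExtract(0x30, 4, Pos));            // two bits survive
    }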
+ if (And_mask >> Srl_imm == 0x1) { + if (N->getValueType(0) == MVT::i32) + Opc = AArch64::UBFMWri; + else + Opc = AArch64::UBFMXri; + + LSB = MSB = Srl_imm; + + return true; } - ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1)); - if (isUpdating) - ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2)); - return NULL; + + return false; } -unsigned AArch64DAGToDAGISel::getTBLOpc(bool IsExt, bool Is64Bit, - unsigned NumOfVec) { - assert(NumOfVec >= 1 && NumOfVec <= 4 && "VST NumVecs out-of-range"); +static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, + unsigned &LSB, unsigned &MSB, + bool BiggerPattern) { + assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && + "N must be a SHR/SRA operation to call this function"); + + EVT VT = N->getValueType(0); + + // Here we can test the type of VT and return false when the type does not + // match, but since it is done prior to that call in the current context + // we turned that into an assert to avoid redundant code. + assert((VT == MVT::i32 || VT == MVT::i64) && + "Type checking must have been done before calling this function"); + + // Check for AND + SRL doing a one bit extract. + if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB)) + return true; - unsigned Opc = 0; - switch (NumOfVec) { + // we're looking for a shift of a shift + uint64_t Shl_imm = 0; + uint64_t Trunc_bits = 0; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { + Opd0 = N->getOperand(0).getOperand(0); + } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL && + N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) { + // We are looking for a shift of truncate. Truncate from i64 to i32 could + // be considered as setting high 32 bits as zero. Our strategy here is to + // always generate 64bit UBFM. This consistency will help the CSE pass + // later find more redundancy. + Opd0 = N->getOperand(0).getOperand(0); + Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits(); + VT = Opd0->getValueType(0); + assert(VT == MVT::i64 && "the promoted type should be i64"); + } else if (BiggerPattern) { + // Let's pretend a 0 shift left has been performed. + // FIXME: Currently we limit this to the bigger pattern case, + // because some optimizations expect AND and not UBFM + Opd0 = N->getOperand(0); + } else + return false; + + assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!"); + uint64_t Srl_imm = 0; + if (!isIntImmediate(N->getOperand(1), Srl_imm)) + return false; + + assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() && + "bad amount in shift node!"); + // Note: The width operand is encoded as width-1. + unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1; + int sLSB = Srl_imm - Shl_imm; + if (sLSB < 0) + return false; + LSB = sLSB; + MSB = LSB + Width; + // SRA requires a signed extraction + if (VT == MVT::i32) + Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri; + else + Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri; + return true; +} + +static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, + SDValue &Opd0, unsigned &LSB, unsigned &MSB, + unsigned NumberOfIgnoredLowBits = 0, + bool BiggerPattern = false) { + if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64) + return false; + + switch (N->getOpcode()) { default: + if (!N->isMachineOpcode()) + return false; break; - case 1: - if (IsExt) - Opc = Is64Bit ? 
AArch64::TBX1_8b : AArch64::TBX1_16b; + case ISD::AND: + return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB, + NumberOfIgnoredLowBits, BiggerPattern); + case ISD::SRL: + case ISD::SRA: + return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern); + } + + unsigned NOpc = N->getMachineOpcode(); + switch (NOpc) { + default: + return false; + case AArch64::SBFMWri: + case AArch64::UBFMWri: + case AArch64::SBFMXri: + case AArch64::UBFMXri: + Opc = NOpc; + Opd0 = N->getOperand(0); + LSB = cast(N->getOperand(1).getNode())->getZExtValue(); + MSB = cast(N->getOperand(2).getNode())->getZExtValue(); + return true; + } + // Unreachable + return false; +} + +SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { + unsigned Opc, LSB, MSB; + SDValue Opd0; + if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB)) + return nullptr; + + EVT VT = N->getValueType(0); + + // If the bit extract operation is 64bit but the original type is 32bit, we + // need to add one EXTRACT_SUBREG. + if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) { + SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64), + CurDAG->getTargetConstant(MSB, MVT::i64)}; + + SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64); + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + MachineSDNode *Node = + CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32, + SDValue(BFM, 0), SubReg); + return Node; + } + + SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT), + CurDAG->getTargetConstant(MSB, VT)}; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); +} + +/// Does DstMask form a complementary pair with the mask provided by +/// BitsToBeInserted, suitable for use in a BFI instruction? Roughly speaking, +/// this asks whether DstMask zeroes precisely those bits that will be set by +/// the other half. +static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted, + unsigned NumberOfIgnoredHighBits, EVT VT) { + assert((VT == MVT::i32 || VT == MVT::i64) && + "i32 or i64 mask type expected!"); + unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits; + + APInt SignificantDstMask = APInt(BitWidth, DstMask); + APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth); + + return (SignificantDstMask & SignificantBitsToBeInserted) == 0 && + (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue(); +} + +// Look for bits that will be useful for later uses. +// A bit is considered useless as soon as it is dropped and never used +// before it has been dropped. +// E.g., looking for the useful bits of x: +// 1. y = x & 0x7 +// 2. z = y >> 2 +// After #1, x's useful bits are 0x7; these useful bits of x live through +// y. +// After #2, the useful bits of x are 0x4. +// However, if x is used in an unpredictable instruction, then all its bits +// are useful. +// E.g. +// 1. y = x & 0x7 +// 2. z = y >> 2 +// 3.
str x, [@x] +static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0); + +static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast(Op.getOperand(1).getNode())->getZExtValue(); + Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth()); + UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm); + getUsefulBits(Op, UsefulBits, Depth + 1); +} + +static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits, + uint64_t Imm, uint64_t MSB, + unsigned Depth) { + // inherit the bitwidth value + APInt OpUsefulBits(UsefulBits); + OpUsefulBits = 1; + + if (MSB >= Imm) { + OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); + --OpUsefulBits; + // The interesting part will be in the lower part of the result + getUsefulBits(Op, OpUsefulBits, Depth + 1); + // The interesting part was starting at Imm in the argument + OpUsefulBits = OpUsefulBits.shl(Imm); + } else { + OpUsefulBits = OpUsefulBits.shl(MSB + 1); + --OpUsefulBits; + // The interesting part will be shifted in the result + OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm); + getUsefulBits(Op, OpUsefulBits, Depth + 1); + // The interesting part was at zero in the argument + OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm); + } + + UsefulBits &= OpUsefulBits; +} + +static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast(Op.getOperand(1).getNode())->getZExtValue(); + uint64_t MSB = + cast(Op.getOperand(2).getNode())->getZExtValue(); + + getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); +} + +static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t ShiftTypeAndValue = + cast(Op.getOperand(2).getNode())->getZExtValue(); + APInt Mask(UsefulBits); + Mask.clearAllBits(); + Mask.flipAllBits(); + + if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) { + // Shift Left + uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue); + Mask = Mask.shl(ShiftAmt); + getUsefulBits(Op, Mask, Depth + 1); + Mask = Mask.lshr(ShiftAmt); + } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) { + // Shift Right + // We do not handle AArch64_AM::ASR, because the sign will change the + // number of useful bits + uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue); + Mask = Mask.lshr(ShiftAmt); + getUsefulBits(Op, Mask, Depth + 1); + Mask = Mask.shl(ShiftAmt); + } else + return; + + UsefulBits &= Mask; +} + +static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast(Op.getOperand(2).getNode())->getZExtValue(); + uint64_t MSB = + cast(Op.getOperand(3).getNode())->getZExtValue(); + + if (Op.getOperand(1) == Orig) + return getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); + + APInt OpUsefulBits(UsefulBits); + OpUsefulBits = 1; + + if (MSB >= Imm) { + OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); + --OpUsefulBits; + UsefulBits &= ~OpUsefulBits; + getUsefulBits(Op, UsefulBits, Depth + 1); + } else { + OpUsefulBits = OpUsefulBits.shl(MSB + 1); + --OpUsefulBits; + UsefulBits = ~(OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm)); + getUsefulBits(Op, UsefulBits, Depth + 1); + } +} + +static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits, + SDValue Orig, unsigned Depth) { + + // Users of this node should have already been instruction selected + // FIXME: Can we turn that into 
an assert? + if (!UserNode->isMachineOpcode()) + return; + + switch (UserNode->getMachineOpcode()) { + default: + return; + case AArch64::ANDSWri: + case AArch64::ANDSXri: + case AArch64::ANDWri: + case AArch64::ANDXri: + // We increment Depth only when we call getUsefulBits + return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits, + Depth); + case AArch64::UBFMWri: + case AArch64::UBFMXri: + return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth); + + case AArch64::ORRWrs: + case AArch64::ORRXrs: + if (UserNode->getOperand(1) != Orig) + return; + return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits, + Depth); + case AArch64::BFMWri: + case AArch64::BFMXri: + return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth); + } +} + +static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) { + if (Depth >= 6) + return; + // Initialize UsefulBits + if (!Depth) { + unsigned Bitwidth = Op.getValueType().getScalarType().getSizeInBits(); + // At the beginning, assume every produced bit is useful + UsefulBits = APInt(Bitwidth, 0); + UsefulBits.flipAllBits(); + } + APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0); + + for (SDNode *Node : Op.getNode()->uses()) { + // A use cannot produce useful bits + APInt UsefulBitsForUse = APInt(UsefulBits); + getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth); + UsersUsefulBits |= UsefulBitsForUse; + } + // UsefulBits contains the produced bits that are meaningful for the + // current definition, thus a user cannot make a bit meaningful at + // this point + UsefulBits &= UsersUsefulBits; +} + +/// Create a machine node performing a notional SHL of Op by ShlAmount. If +/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is +/// 0, return Op unchanged. +static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) { + if (ShlAmount == 0) + return Op; + + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri; + + SDNode *ShiftNode; + if (ShlAmount > 0) { + // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt + ShiftNode = CurDAG->getMachineNode( + UBFMOpc, SDLoc(Op), VT, Op, + CurDAG->getTargetConstant(BitWidth - ShlAmount, VT), + CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT)); + } else { + // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1 + assert(ShlAmount < 0 && "expected right shift"); + int ShrAmount = -ShlAmount; + ShiftNode = CurDAG->getMachineNode( + UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT), + CurDAG->getTargetConstant(BitWidth - 1, VT)); + } + + return SDValue(ShiftNode, 0); +} + +/// Does this tree qualify as an attempt to move a bitfield into position, +/// essentially "(and (shl VAL, N), Mask)"? +static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, + SDValue &Src, int &ShiftAmount, + int &MaskWidth) { + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + (void)BitWidth; + assert(BitWidth == 32 || BitWidth == 64); + + APInt KnownZero, KnownOne; + CurDAG->computeKnownBits(Op, KnownZero, KnownOne); + + // Non-zero in the sense that they're not provably zero, which is the key + // point if we want to use this value + uint64_t NonZeroBits = (~KnownZero).getZExtValue(); + + // Discard a constant AND mask if present. It's safe because the node will + // already have been factored into the computeKnownBits calculation above.
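[Editorial aside, not part of the patch: getLeftShift above leans on the fact that AArch64's immediate shifts are UBFM aliases. The sketch below computes the immediates from those alias identities; struct and function names are made up for illustration.]

    // Sketch of the shift aliases used by getLeftShift:
    //   LSL Wd, Wn, #Amt == UBFM Wd, Wn, #(Width - Amt), #(Width - 1 - Amt)
    //   LSR Wd, Wn, #Amt == UBFM Wd, Wn, #Amt, #(Width - 1)
    #include <cassert>

    struct Ubfm { unsigned ImmR, ImmS; };

    static Ubfm lslAsUbfm(unsigned Width, unsigned Amt) {
      return {Width - Amt, Width - 1 - Amt};
    }

    static Ubfm lsrAsUbfm(unsigned Width, unsigned Amt) {
      return {Amt, Width - 1};
    }

    int main() {
      Ubfm L = lslAsUbfm(32, 4); // LSL #4 -> UBFM #28, #27
      assert(L.ImmR == 28 && L.ImmS == 27);
      Ubfm R = lsrAsUbfm(64, 12); // LSR #12 -> UBFM #12, #63
      assert(R.ImmR == 12 && R.ImmS == 63);
    }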
+ uint64_t AndImm; + if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) { + assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0); + Op = Op.getOperand(0); + } + + uint64_t ShlImm; + if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm)) + return false; + Op = Op.getOperand(0); + + if (!isShiftedMask_64(NonZeroBits)) + return false; + + ShiftAmount = countTrailingZeros(NonZeroBits); + MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount); + + // BFI encompasses sufficiently many nodes that it's worth inserting an extra + // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL + // amount. + Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount); + + return true; +} + +// Given an OR operation, check if we have the following pattern +// ubfm c, b, imm, imm2 (or something that does the same job, see +// isBitfieldExtractOp) +// d = e & mask2 ; where mask2 is a binary sequence of 1..10..0 and +// countTrailingZeros(mask2) == imm2 - imm + 1 +// f = d | c +// If yes, the given reference arguments will be updated so that one can replace +// the OR instruction with: +// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 +static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, + SDValue &Src, unsigned &ImmR, + unsigned &ImmS, SelectionDAG *CurDAG) { + assert(N->getOpcode() == ISD::OR && "Expect an OR operation"); + + // Set Opc + EVT VT = N->getValueType(0); + if (VT == MVT::i32) + Opc = AArch64::BFMWri; + else if (VT == MVT::i64) + Opc = AArch64::BFMXri; + else + return false; + + // Because of simplify-demanded-bits in DAGCombine, involved masks may not + // have the expected shape. Try to undo that. + APInt UsefulBits; + getUsefulBits(SDValue(N, 0), UsefulBits); + + unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); + unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); + + // OR is commutative, check both possibilities (does llvm provide a + // way to do that directly, e.g., via code matcher?) + SDValue OrOpd1Val = N->getOperand(1); + SDNode *OrOpd0 = N->getOperand(0).getNode(); + SDNode *OrOpd1 = N->getOperand(1).getNode(); + for (int i = 0; i < 2; + ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) { + unsigned BFXOpc; + int DstLSB, Width; + if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS, + NumberOfIgnoredLowBits, true)) { + // Check that the returned opcode is compatible with the pattern, + // i.e., same type and zero extended (U and not S) + if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) || + (BFXOpc != AArch64::UBFMWri && VT == MVT::i32)) + continue; + + // Compute the width of the bitfield insertion + DstLSB = 0; + Width = ImmS - ImmR + 1; + // FIXME: This constraint is to catch bitfield insertion; we may + // want to widen the pattern if we want to grab the general bitfield + // move case + if (Width <= 0) + continue; + + // If the mask on the insertee is correct, we have a BFXIL operation. We + // can share the ImmR and ImmS values from the already-computed UBFM. + } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src, + DstLSB, Width)) { + ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); + ImmS = Width - 1; + } else + continue; + + // Check the second part of the pattern + EVT VT = OrOpd1->getValueType(0); + assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand"); + + // Compute the Known Zero for the candidate of the first operand.
+ // This allows us to catch more general cases than just looking for + // an AND with imm. Indeed, simplify-demanded-bits may have removed + // the AND instruction because it proves it was useless. + APInt KnownZero, KnownOne; + CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne); + + // Check if there is enough room for the second operand to appear + // in the first one + APInt BitsToBeInserted = + APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width); + + if ((BitsToBeInserted & ~KnownZero) != 0) + continue; + + // Set the first operand + uint64_t Imm; + if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) && + isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT)) + // In that case, we can eliminate the AND + Dst = OrOpd1->getOperand(0); else - Opc = Is64Bit ? AArch64::TBL1_8b : AArch64::TBL1_16b; + // Maybe the AND has been removed by simplify-demanded-bits + // or is useful because it discards more bits + Dst = OrOpd1Val; + + // Both parts match + return true; + } + + return false; +} + +SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { + if (N->getOpcode() != ISD::OR) + return nullptr; + + unsigned Opc; + unsigned LSB, MSB; + SDValue Opd0, Opd1; + + if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG)) + return nullptr; + + EVT VT = N->getValueType(0); + SDValue Ops[] = { Opd0, + Opd1, + CurDAG->getTargetConstant(LSB, VT), + CurDAG->getTargetConstant(MSB, VT) }; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); +} + +SDNode *AArch64DAGToDAGISel::SelectLIBM(SDNode *N) { + EVT VT = N->getValueType(0); + unsigned Variant; + unsigned Opc; + unsigned FRINTXOpcs[] = { AArch64::FRINTXSr, AArch64::FRINTXDr }; + + if (VT == MVT::f32) { + Variant = 0; + } else if (VT == MVT::f64) { + Variant = 1; + } else + return nullptr; // Unrecognized argument type. Fall back on default codegen. + + // Pick the FRINTX variant needed to set the flags. + unsigned FRINTXOpc = FRINTXOpcs[Variant]; + + switch (N->getOpcode()) { + default: + return nullptr; // Unrecognized libm ISD node. Fall back on default codegen. + case ISD::FCEIL: { + unsigned FRINTPOpcs[] = { AArch64::FRINTPSr, AArch64::FRINTPDr }; + Opc = FRINTPOpcs[Variant]; break; - case 2: - if (IsExt) - Opc = Is64Bit ? AArch64::TBX2_8b : AArch64::TBX2_16b; - else - Opc = Is64Bit ? AArch64::TBL2_8b : AArch64::TBL2_16b; + } + case ISD::FFLOOR: { + unsigned FRINTMOpcs[] = { AArch64::FRINTMSr, AArch64::FRINTMDr }; + Opc = FRINTMOpcs[Variant]; break; - case 3: - if (IsExt) - Opc = Is64Bit ? AArch64::TBX3_8b : AArch64::TBX3_16b; - else - Opc = Is64Bit ? AArch64::TBL3_8b : AArch64::TBL3_16b; + } + case ISD::FTRUNC: { + unsigned FRINTZOpcs[] = { AArch64::FRINTZSr, AArch64::FRINTZDr }; + Opc = FRINTZOpcs[Variant]; break; - case 4: - if (IsExt) - Opc = Is64Bit ? AArch64::TBX4_8b : AArch64::TBX4_16b; - else - Opc = Is64Bit ?
AArch64::TBL4_8b : AArch64::TBL4_16b; + } + case ISD::FROUND: { + unsigned FRINTAOpcs[] = { AArch64::FRINTASr, AArch64::FRINTADr }; + Opc = FRINTAOpcs[Variant]; break; } + } - return Opc; + SDLoc dl(N); + SDValue In = N->getOperand(0); + SmallVector Ops; + Ops.push_back(In); + + if (!TM.Options.UnsafeFPMath) { + SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In); + Ops.push_back(SDValue(FRINTX, 1)); + } + + return CurDAG->getMachineNode(Opc, dl, VT, Ops); } -SDNode *AArch64DAGToDAGISel::SelectVTBL(SDNode *N, unsigned NumVecs, - bool IsExt) { - assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); - SDLoc dl(N); +bool +AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, + unsigned RegWidth) { + APFloat FVal(0.0); + if (ConstantFPSDNode *CN = dyn_cast(N)) + FVal = CN->getValueAPF(); + else if (LoadSDNode *LN = dyn_cast(N)) { + // Some otherwise illegal constants are allowed in this case. + if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow || + !isa(LN->getOperand(1)->getOperand(1))) + return false; + + ConstantPoolSDNode *CN = + dyn_cast(LN->getOperand(1)->getOperand(1)); + FVal = cast(CN->getConstVal())->getValueAPF(); + } else + return false; + + // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits + // is between 1 and 32 for a destination w-register, or 1 and 64 for an + // x-register. + // + // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we + // want THIS_NODE to be 2^fbits. This is much easier to deal with using + // integers. + bool IsExact; - // Check the element of look up table is 64-bit or not - unsigned Vec0Idx = IsExt ? 2 : 1; - assert(!N->getOperand(Vec0Idx + 0).getValueType().is64BitVector() && - "The element of lookup table for vtbl and vtbx must be 128-bit"); + // fbits is between 1 and 64 in the worst-case, which means the fmul + // could have 2^64 as an actual operand. Need 65 bits of precision. + APSInt IntVal(65, true); + FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact); - // Check the return value type is 64-bit or not - EVT ResVT = N->getValueType(0); - bool is64BitRes = ResVT.is64BitVector(); + // N.b. isPowerOf2 also checks for > 0. + if (!IsExact || !IntVal.isPowerOf2()) return false; + unsigned FBits = IntVal.logBase2(); - // Create new SDValue for vector list - SmallVector Regs(N->op_begin() + Vec0Idx, - N->op_begin() + Vec0Idx + NumVecs); - SDValue TblReg = createQTuple(Regs); - unsigned Opc = getTBLOpc(IsExt, is64BitRes, NumVecs); + // Checks above should have guaranteed that we haven't lost information in + // finding FBits, but it must still be in range. + if (FBits == 0 || FBits > RegWidth) return false; - SmallVector Ops; - if (IsExt) - Ops.push_back(N->getOperand(1)); - Ops.push_back(TblReg); - Ops.push_back(N->getOperand(Vec0Idx + NumVecs)); - return CurDAG->getMachineNode(Opc, dl, ResVT, Ops); + FixedPos = CurDAG->getTargetConstant(FBits, MVT::i32); + return true; } SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { // Dump information about the Node being selected - DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n"); + DEBUG(errs() << "Selecting: "); + DEBUG(Node->dump(CurDAG)); + DEBUG(errs() << "\n"); + // If we have a custom node, we already have selected! 
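[Editorial aside, not part of the patch: for full-width masks, the complementary-mask test used by isBitfieldDstMask earlier reduces to two integer checks. The sketch below simplifies away the APInt plumbing and the ignored-high-bits handling; masksFormBfi is a made-up name.]

    // Sketch: an OR can become a single BFM/BFI when the destination mask
    // zeroes exactly the bits the inserted field will set.
    #include <cassert>
    #include <cstdint>

    static bool masksFormBfi(uint64_t DstMask, uint64_t InsertedBits) {
      return (DstMask & InsertedBits) == 0 &&   // no overlap
             (DstMask | InsertedBits) == ~0ULL; // together cover every bit
    }

    int main() {
      // dst = (dst & ~0x0ff0) | (field << 4), with an 8-bit field:
      // BFI dst, src, #4, #8 replaces the AND/SHL/OR sequence.
      assert(masksFormBfi(~0x0ff0ULL, 0x0ff0ULL));
      assert(!masksFormBfi(~0x0ff0ULL, 0x0f00ULL)); // leaves bits uncovered
    }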
if (Node->isMachineOpcode()) { - DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n"); + DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); Node->setNodeId(-1); - return NULL; + return nullptr; } - switch (Node->getOpcode()) { - case ISD::ATOMIC_LOAD_ADD: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_ADD_I8, - AArch64::ATOMIC_LOAD_ADD_I16, - AArch64::ATOMIC_LOAD_ADD_I32, - AArch64::ATOMIC_LOAD_ADD_I64); - case ISD::ATOMIC_LOAD_SUB: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_SUB_I8, - AArch64::ATOMIC_LOAD_SUB_I16, - AArch64::ATOMIC_LOAD_SUB_I32, - AArch64::ATOMIC_LOAD_SUB_I64); - case ISD::ATOMIC_LOAD_AND: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_AND_I8, - AArch64::ATOMIC_LOAD_AND_I16, - AArch64::ATOMIC_LOAD_AND_I32, - AArch64::ATOMIC_LOAD_AND_I64); - case ISD::ATOMIC_LOAD_OR: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_OR_I8, - AArch64::ATOMIC_LOAD_OR_I16, - AArch64::ATOMIC_LOAD_OR_I32, - AArch64::ATOMIC_LOAD_OR_I64); - case ISD::ATOMIC_LOAD_XOR: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_XOR_I8, - AArch64::ATOMIC_LOAD_XOR_I16, - AArch64::ATOMIC_LOAD_XOR_I32, - AArch64::ATOMIC_LOAD_XOR_I64); - case ISD::ATOMIC_LOAD_NAND: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_NAND_I8, - AArch64::ATOMIC_LOAD_NAND_I16, - AArch64::ATOMIC_LOAD_NAND_I32, - AArch64::ATOMIC_LOAD_NAND_I64); - case ISD::ATOMIC_LOAD_MIN: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_MIN_I8, - AArch64::ATOMIC_LOAD_MIN_I16, - AArch64::ATOMIC_LOAD_MIN_I32, - AArch64::ATOMIC_LOAD_MIN_I64); - case ISD::ATOMIC_LOAD_MAX: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_MAX_I8, - AArch64::ATOMIC_LOAD_MAX_I16, - AArch64::ATOMIC_LOAD_MAX_I32, - AArch64::ATOMIC_LOAD_MAX_I64); - case ISD::ATOMIC_LOAD_UMIN: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_UMIN_I8, - AArch64::ATOMIC_LOAD_UMIN_I16, - AArch64::ATOMIC_LOAD_UMIN_I32, - AArch64::ATOMIC_LOAD_UMIN_I64); - case ISD::ATOMIC_LOAD_UMAX: - return SelectAtomic(Node, - AArch64::ATOMIC_LOAD_UMAX_I8, - AArch64::ATOMIC_LOAD_UMAX_I16, - AArch64::ATOMIC_LOAD_UMAX_I32, - AArch64::ATOMIC_LOAD_UMAX_I64); - case ISD::ATOMIC_SWAP: - return SelectAtomic(Node, - AArch64::ATOMIC_SWAP_I8, - AArch64::ATOMIC_SWAP_I16, - AArch64::ATOMIC_SWAP_I32, - AArch64::ATOMIC_SWAP_I64); - case ISD::ATOMIC_CMP_SWAP: - return SelectAtomic(Node, - AArch64::ATOMIC_CMP_SWAP_I8, - AArch64::ATOMIC_CMP_SWAP_I16, - AArch64::ATOMIC_CMP_SWAP_I32, - AArch64::ATOMIC_CMP_SWAP_I64); - case ISD::FrameIndex: { - int FI = cast(Node)->getIndex(); - EVT PtrTy = getTargetLowering()->getPointerTy(); - SDValue TFI = CurDAG->getTargetFrameIndex(FI, PtrTy); - return CurDAG->SelectNodeTo(Node, AArch64::ADDxxi_lsl0_s, PtrTy, - TFI, CurDAG->getTargetConstant(0, PtrTy)); - } - case ISD::Constant: { - SDNode *ResNode = 0; - if (cast(Node)->getZExtValue() == 0) { - // XZR and WZR are probably even better than an actual move: most of the - // time they can be folded into another instruction with *no* cost. - - EVT Ty = Node->getValueType(0); - assert((Ty == MVT::i32 || Ty == MVT::i64) && "unexpected type"); - uint16_t Register = Ty == MVT::i32 ? AArch64::WZR : AArch64::XZR; - ResNode = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), - SDLoc(Node), - Register, Ty).getNode(); - } + // Few custom selection stuff. + SDNode *ResNode = nullptr; + EVT VT = Node->getValueType(0); - // Next best option is a move-immediate, see if we can do that. 
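[Editorial aside, not part of the patch: the removed code above documents the old constant-materialisation ladder: zero register first, then a move-immediate, then a literal-pool entry. A toy classifier of that preference order, with a deliberately simplified "fits one MOVZ" test and made-up names:]

    #include <cassert>
    #include <cstdint>

    enum class MatKind { ZeroReg, MoveImm, LitPool };

    static MatKind classifyImm64(uint64_t Imm) {
      if (Imm == 0)
        return MatKind::ZeroReg; // XZR/WZR copy, often folded away for free
      if ((Imm & 0xffffULL) == Imm)
        return MatKind::MoveImm; // a single MOVZ covers it
      return MatKind::LitPool;   // fallback (or a MOVZ/MOVN/MOVK sequence)
    }

    int main() {
      assert(classifyImm64(0) == MatKind::ZeroReg);
      assert(classifyImm64(0x1234) == MatKind::MoveImm);
      assert(classifyImm64(0x123456789ULL) == MatKind::LitPool);
    }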
- if (!ResNode) { - ResNode = TrySelectToMoveImm(Node); - } - - if (ResNode) - return ResNode; + switch (Node->getOpcode()) { + default: + break; - // If even that fails we fall back to a lit-pool entry at the moment. Future - // tuning may change this to a sequence of MOVZ/MOVN/MOVK instructions. - ResNode = SelectToLitPool(Node); - assert(ResNode && "We need *some* way to materialise a constant"); + case ISD::ADD: + if (SDNode *I = SelectMLAV64LaneV128(Node)) + return I; + break; - // We want to continue selection at this point since the litpool access - // generated used generic nodes for simplicity. - ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0)); - Node = ResNode; + case ISD::LOAD: { + // Try to select as an indexed load. Fall through to normal processing + // if we can't. + bool Done = false; + SDNode *I = SelectIndexedLoad(Node, Done); + if (Done) + return I; break; } - case ISD::ConstantFP: { - if (A64Imms::isFPImm(cast(Node)->getValueAPF())) { - // FMOV will take care of it from TableGen - break; - } - SDNode *ResNode = LowerToFPLitPool(Node); - ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0)); + case ISD::SRL: + case ISD::AND: + case ISD::SRA: + if (SDNode *I = SelectBitfieldExtractOp(Node)) + return I; + break; - // We want to continue selection at this point since the litpool access - // generated used generic nodes for simplicity. - Node = ResNode; + case ISD::OR: + if (SDNode *I = SelectBitfieldInsertOp(Node)) + return I; break; + + case ISD::EXTRACT_VECTOR_ELT: { + // Extracting lane zero is a special case where we can just use a plain + // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for + // the rest of the compiler, especially the register allocator and copy + // propagation, to reason about, so is preferred when it's possible to + // use it. + ConstantSDNode *LaneNode = cast(Node->getOperand(1)); + // Bail and use the default Select() for non-zero lanes. + if (LaneNode->getZExtValue() != 0) + break; + // If the element type is not the same as the result type, likewise + // bail and use the default Select(), as there's more to do than just + // a cross-class COPY. This catches extracts of i8 and i16 elements + // since they will need an explicit zext.
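[Editorial aside, not part of the patch: the EXTRACT_VECTOR_ELT case that continues below handles only 32- and 64-bit lane-zero extracts as a subregister copy; narrower lanes bail out because they need an explicit zext. A standalone sketch of that dispatch, with placeholder enum values standing in for AArch64::dsub / AArch64::ssub:]

    #include <cassert>

    enum SubRegIdx { dsub, ssub };

    static SubRegIdx subRegForLane0(unsigned ElemBits) {
      switch (ElemBits) {
      case 64: return dsub; // e.g. lane 0 of v2i64 / v2f64
      case 32: return ssub; // e.g. lane 0 of v4i32 / v4f32
      default:
        assert(0 && "i8/i16 lanes need an explicit zext, not a plain copy");
        return ssub;
      }
    }

    int main() {
      assert(subRegForLane0(64) == dsub);
      assert(subRegForLane0(32) == ssub);
    }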
+ if (VT != Node->getOperand(0).getValueType().getVectorElementType()) + break; + unsigned SubReg; + switch (Node->getOperand(0) + .getValueType() + .getVectorElementType() + .getSizeInBits()) { + default: + assert(0 && "Unexpected vector element type!"); + case 64: + SubReg = AArch64::dsub; + break; + case 32: + SubReg = AArch64::ssub; + break; + case 16: // FALLTHROUGH + case 8: + llvm_unreachable("unexpected zext-requiring extract element!"); + } + SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT, + Node->getOperand(0)); + DEBUG(dbgs() << "ISEL: Custom selection!\n=> "); + DEBUG(Extract->dumpr(CurDAG)); + DEBUG(dbgs() << "\n"); + return Extract.getNode(); } - case AArch64ISD::NEON_LD1_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD1WB_8B_fixed, AArch64::LD1WB_4H_fixed, - AArch64::LD1WB_2S_fixed, AArch64::LD1WB_1D_fixed, - AArch64::LD1WB_16B_fixed, AArch64::LD1WB_8H_fixed, - AArch64::LD1WB_4S_fixed, AArch64::LD1WB_2D_fixed - }; - return SelectVLD(Node, true, 1, Opcodes); - } - case AArch64ISD::NEON_LD2_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD2WB_8B_fixed, AArch64::LD2WB_4H_fixed, - AArch64::LD2WB_2S_fixed, AArch64::LD1x2WB_1D_fixed, - AArch64::LD2WB_16B_fixed, AArch64::LD2WB_8H_fixed, - AArch64::LD2WB_4S_fixed, AArch64::LD2WB_2D_fixed - }; - return SelectVLD(Node, true, 2, Opcodes); - } - case AArch64ISD::NEON_LD3_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD3WB_8B_fixed, AArch64::LD3WB_4H_fixed, - AArch64::LD3WB_2S_fixed, AArch64::LD1x3WB_1D_fixed, - AArch64::LD3WB_16B_fixed, AArch64::LD3WB_8H_fixed, - AArch64::LD3WB_4S_fixed, AArch64::LD3WB_2D_fixed - }; - return SelectVLD(Node, true, 3, Opcodes); - } - case AArch64ISD::NEON_LD4_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD4WB_8B_fixed, AArch64::LD4WB_4H_fixed, - AArch64::LD4WB_2S_fixed, AArch64::LD1x4WB_1D_fixed, - AArch64::LD4WB_16B_fixed, AArch64::LD4WB_8H_fixed, - AArch64::LD4WB_4S_fixed, AArch64::LD4WB_2D_fixed - }; - return SelectVLD(Node, true, 4, Opcodes); - } - case AArch64ISD::NEON_LD1x2_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD1x2WB_8B_fixed, AArch64::LD1x2WB_4H_fixed, - AArch64::LD1x2WB_2S_fixed, AArch64::LD1x2WB_1D_fixed, - AArch64::LD1x2WB_16B_fixed, AArch64::LD1x2WB_8H_fixed, - AArch64::LD1x2WB_4S_fixed, AArch64::LD1x2WB_2D_fixed - }; - return SelectVLD(Node, true, 2, Opcodes); - } - case AArch64ISD::NEON_LD1x3_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD1x3WB_8B_fixed, AArch64::LD1x3WB_4H_fixed, - AArch64::LD1x3WB_2S_fixed, AArch64::LD1x3WB_1D_fixed, - AArch64::LD1x3WB_16B_fixed, AArch64::LD1x3WB_8H_fixed, - AArch64::LD1x3WB_4S_fixed, AArch64::LD1x3WB_2D_fixed - }; - return SelectVLD(Node, true, 3, Opcodes); - } - case AArch64ISD::NEON_LD1x4_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD1x4WB_8B_fixed, AArch64::LD1x4WB_4H_fixed, - AArch64::LD1x4WB_2S_fixed, AArch64::LD1x4WB_1D_fixed, - AArch64::LD1x4WB_16B_fixed, AArch64::LD1x4WB_8H_fixed, - AArch64::LD1x4WB_4S_fixed, AArch64::LD1x4WB_2D_fixed - }; - return SelectVLD(Node, true, 4, Opcodes); - } - case AArch64ISD::NEON_ST1_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST1WB_8B_fixed, AArch64::ST1WB_4H_fixed, - AArch64::ST1WB_2S_fixed, AArch64::ST1WB_1D_fixed, - AArch64::ST1WB_16B_fixed, AArch64::ST1WB_8H_fixed, - AArch64::ST1WB_4S_fixed, AArch64::ST1WB_2D_fixed - }; - return SelectVST(Node, true, 1, Opcodes); - } - case AArch64ISD::NEON_ST2_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST2WB_8B_fixed, AArch64::ST2WB_4H_fixed, - 
AArch64::ST2WB_2S_fixed, AArch64::ST1x2WB_1D_fixed, - AArch64::ST2WB_16B_fixed, AArch64::ST2WB_8H_fixed, - AArch64::ST2WB_4S_fixed, AArch64::ST2WB_2D_fixed - }; - return SelectVST(Node, true, 2, Opcodes); - } - case AArch64ISD::NEON_ST3_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST3WB_8B_fixed, AArch64::ST3WB_4H_fixed, - AArch64::ST3WB_2S_fixed, AArch64::ST1x3WB_1D_fixed, - AArch64::ST3WB_16B_fixed, AArch64::ST3WB_8H_fixed, - AArch64::ST3WB_4S_fixed, AArch64::ST3WB_2D_fixed - }; - return SelectVST(Node, true, 3, Opcodes); - } - case AArch64ISD::NEON_ST4_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST4WB_8B_fixed, AArch64::ST4WB_4H_fixed, - AArch64::ST4WB_2S_fixed, AArch64::ST1x4WB_1D_fixed, - AArch64::ST4WB_16B_fixed, AArch64::ST4WB_8H_fixed, - AArch64::ST4WB_4S_fixed, AArch64::ST4WB_2D_fixed - }; - return SelectVST(Node, true, 4, Opcodes); - } - case AArch64ISD::NEON_LD2DUP: { - static const uint16_t Opcodes[] = { - AArch64::LD2R_8B, AArch64::LD2R_4H, AArch64::LD2R_2S, - AArch64::LD2R_1D, AArch64::LD2R_16B, AArch64::LD2R_8H, - AArch64::LD2R_4S, AArch64::LD2R_2D - }; - return SelectVLDDup(Node, false, 2, Opcodes); - } - case AArch64ISD::NEON_LD3DUP: { - static const uint16_t Opcodes[] = { - AArch64::LD3R_8B, AArch64::LD3R_4H, AArch64::LD3R_2S, - AArch64::LD3R_1D, AArch64::LD3R_16B, AArch64::LD3R_8H, - AArch64::LD3R_4S, AArch64::LD3R_2D - }; - return SelectVLDDup(Node, false, 3, Opcodes); - } - case AArch64ISD::NEON_LD4DUP: { - static const uint16_t Opcodes[] = { - AArch64::LD4R_8B, AArch64::LD4R_4H, AArch64::LD4R_2S, - AArch64::LD4R_1D, AArch64::LD4R_16B, AArch64::LD4R_8H, - AArch64::LD4R_4S, AArch64::LD4R_2D - }; - return SelectVLDDup(Node, false, 4, Opcodes); - } - case AArch64ISD::NEON_LD2DUP_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD2R_WB_8B_fixed, AArch64::LD2R_WB_4H_fixed, - AArch64::LD2R_WB_2S_fixed, AArch64::LD2R_WB_1D_fixed, - AArch64::LD2R_WB_16B_fixed, AArch64::LD2R_WB_8H_fixed, - AArch64::LD2R_WB_4S_fixed, AArch64::LD2R_WB_2D_fixed - }; - return SelectVLDDup(Node, true, 2, Opcodes); - } - case AArch64ISD::NEON_LD3DUP_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD3R_WB_8B_fixed, AArch64::LD3R_WB_4H_fixed, - AArch64::LD3R_WB_2S_fixed, AArch64::LD3R_WB_1D_fixed, - AArch64::LD3R_WB_16B_fixed, AArch64::LD3R_WB_8H_fixed, - AArch64::LD3R_WB_4S_fixed, AArch64::LD3R_WB_2D_fixed - }; - return SelectVLDDup(Node, true, 3, Opcodes); - } - case AArch64ISD::NEON_LD4DUP_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD4R_WB_8B_fixed, AArch64::LD4R_WB_4H_fixed, - AArch64::LD4R_WB_2S_fixed, AArch64::LD4R_WB_1D_fixed, - AArch64::LD4R_WB_16B_fixed, AArch64::LD4R_WB_8H_fixed, - AArch64::LD4R_WB_4S_fixed, AArch64::LD4R_WB_2D_fixed - }; - return SelectVLDDup(Node, true, 4, Opcodes); - } - case AArch64ISD::NEON_LD2LN_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD2LN_WB_B_fixed, AArch64::LD2LN_WB_H_fixed, - AArch64::LD2LN_WB_S_fixed, AArch64::LD2LN_WB_D_fixed - }; - return SelectVLDSTLane(Node, true, true, 2, Opcodes); - } - case AArch64ISD::NEON_LD3LN_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD3LN_WB_B_fixed, AArch64::LD3LN_WB_H_fixed, - AArch64::LD3LN_WB_S_fixed, AArch64::LD3LN_WB_D_fixed - }; - return SelectVLDSTLane(Node, true, true, 3, Opcodes); - } - case AArch64ISD::NEON_LD4LN_UPD: { - static const uint16_t Opcodes[] = { - AArch64::LD4LN_WB_B_fixed, AArch64::LD4LN_WB_H_fixed, - AArch64::LD4LN_WB_S_fixed, AArch64::LD4LN_WB_D_fixed - }; - return SelectVLDSTLane(Node, true, true, 4, Opcodes); - } - case 
AArch64ISD::NEON_ST2LN_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST2LN_WB_B_fixed, AArch64::ST2LN_WB_H_fixed, - AArch64::ST2LN_WB_S_fixed, AArch64::ST2LN_WB_D_fixed - }; - return SelectVLDSTLane(Node, false, true, 2, Opcodes); - } - case AArch64ISD::NEON_ST3LN_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST3LN_WB_B_fixed, AArch64::ST3LN_WB_H_fixed, - AArch64::ST3LN_WB_S_fixed, AArch64::ST3LN_WB_D_fixed - }; - return SelectVLDSTLane(Node, false, true, 3, Opcodes); - } - case AArch64ISD::NEON_ST4LN_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST4LN_WB_B_fixed, AArch64::ST4LN_WB_H_fixed, - AArch64::ST4LN_WB_S_fixed, AArch64::ST4LN_WB_D_fixed - }; - return SelectVLDSTLane(Node, false, true, 4, Opcodes); - } - case AArch64ISD::NEON_ST1x2_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST1x2WB_8B_fixed, AArch64::ST1x2WB_4H_fixed, - AArch64::ST1x2WB_2S_fixed, AArch64::ST1x2WB_1D_fixed, - AArch64::ST1x2WB_16B_fixed, AArch64::ST1x2WB_8H_fixed, - AArch64::ST1x2WB_4S_fixed, AArch64::ST1x2WB_2D_fixed - }; - return SelectVST(Node, true, 2, Opcodes); - } - case AArch64ISD::NEON_ST1x3_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST1x3WB_8B_fixed, AArch64::ST1x3WB_4H_fixed, - AArch64::ST1x3WB_2S_fixed, AArch64::ST1x3WB_1D_fixed, - AArch64::ST1x3WB_16B_fixed, AArch64::ST1x3WB_8H_fixed, - AArch64::ST1x3WB_4S_fixed, AArch64::ST1x3WB_2D_fixed - }; - return SelectVST(Node, true, 3, Opcodes); - } - case AArch64ISD::NEON_ST1x4_UPD: { - static const uint16_t Opcodes[] = { - AArch64::ST1x4WB_8B_fixed, AArch64::ST1x4WB_4H_fixed, - AArch64::ST1x4WB_2S_fixed, AArch64::ST1x4WB_1D_fixed, - AArch64::ST1x4WB_16B_fixed, AArch64::ST1x4WB_8H_fixed, - AArch64::ST1x4WB_4S_fixed, AArch64::ST1x4WB_2D_fixed - }; - return SelectVST(Node, true, 4, Opcodes); - } - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntNo = cast(Node->getOperand(0))->getZExtValue(); - bool IsExt = false; - switch (IntNo) { - default: - break; - case Intrinsic::aarch64_neon_vtbx1: - IsExt = true; - case Intrinsic::aarch64_neon_vtbl1: - return SelectVTBL(Node, 1, IsExt); - case Intrinsic::aarch64_neon_vtbx2: - IsExt = true; - case Intrinsic::aarch64_neon_vtbl2: - return SelectVTBL(Node, 2, IsExt); - case Intrinsic::aarch64_neon_vtbx3: - IsExt = true; - case Intrinsic::aarch64_neon_vtbl3: - return SelectVTBL(Node, 3, IsExt); - case Intrinsic::aarch64_neon_vtbx4: - IsExt = true; - case Intrinsic::aarch64_neon_vtbl4: - return SelectVTBL(Node, 4, IsExt); + case ISD::Constant: { + // Materialize zero constants as copies from WZR/XZR. This allows + // the coalescer to propagate these into other instructions. + ConstantSDNode *ConstNode = cast(Node); + if (ConstNode->isNullValue()) { + if (VT == MVT::i32) + return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), + AArch64::WZR, MVT::i32).getNode(); + else if (VT == MVT::i64) + return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), + AArch64::XZR, MVT::i64).getNode(); } break; } - case ISD::INTRINSIC_VOID: + + case ISD::FrameIndex: { + // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm. 
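[Editorial aside, not part of the patch: the FrameIndex case at this point selects ADDXri FI, #0, LSL #0, which becomes ADD x, sp, #offset once frame indices are lowered. The shifter-immediate packing below is an assumption about the layout produced by AArch64_AM::getShifterImm, not a quote of it.]

    #include <cassert>

    // Assumed layout: shift type in the high bits, amount in the low six bits.
    static unsigned shifterImm(unsigned ShiftType, unsigned Amount) {
      return (ShiftType << 6) | (Amount & 0x3f);
    }

    int main() {
      const unsigned LSL = 0;
      // LSL #0 is the no-op shift used by the FrameIndex selection here.
      assert(shifterImm(LSL, 0) == 0);
    }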
+ int FI = cast(Node)->getIndex(); + unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); + const TargetLowering *TLI = getTargetLowering(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32), + CurDAG->getTargetConstant(Shifter, MVT::i32) }; + return CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops); + } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); switch (IntNo) { default: break; - case Intrinsic::arm_neon_vld1: { - static const uint16_t Opcodes[] = { - AArch64::LD1_8B, AArch64::LD1_4H, AArch64::LD1_2S, AArch64::LD1_1D, - AArch64::LD1_16B, AArch64::LD1_8H, AArch64::LD1_4S, AArch64::LD1_2D - }; - return SelectVLD(Node, false, 1, Opcodes); - } - case Intrinsic::arm_neon_vld2: { - static const uint16_t Opcodes[] = { - AArch64::LD2_8B, AArch64::LD2_4H, AArch64::LD2_2S, AArch64::LD1x2_1D, - AArch64::LD2_16B, AArch64::LD2_8H, AArch64::LD2_4S, AArch64::LD2_2D - }; - return SelectVLD(Node, false, 2, Opcodes); + case Intrinsic::aarch64_ldaxp: + case Intrinsic::aarch64_ldxp: { + unsigned Op = + IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX; + SDValue MemAddr = Node->getOperand(2); + SDLoc DL(Node); + SDValue Chain = Node->getOperand(0); + + SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64, + MVT::Other, MemAddr, Chain); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(Node)->getMemOperand(); + cast(Ld)->setMemRefs(MemOp, MemOp + 1); + return Ld; } - case Intrinsic::arm_neon_vld3: { - static const uint16_t Opcodes[] = { - AArch64::LD3_8B, AArch64::LD3_4H, AArch64::LD3_2S, AArch64::LD1x3_1D, - AArch64::LD3_16B, AArch64::LD3_8H, AArch64::LD3_4S, AArch64::LD3_2D - }; - return SelectVLD(Node, false, 3, Opcodes); + case Intrinsic::aarch64_stlxp: + case Intrinsic::aarch64_stxp: { + unsigned Op = + IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX; + SDLoc DL(Node); + SDValue Chain = Node->getOperand(0); + SDValue ValLo = Node->getOperand(2); + SDValue ValHi = Node->getOperand(3); + SDValue MemAddr = Node->getOperand(4); + + // Place arguments in the right order. + SmallVector Ops; + Ops.push_back(ValLo); + Ops.push_back(ValHi); + Ops.push_back(MemAddr); + Ops.push_back(Chain); + + SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops); + // Transfer memoperands. 
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(Node)->getMemOperand(); + cast(St)->setMemRefs(MemOp, MemOp + 1); + + return St; } - case Intrinsic::arm_neon_vld4: { - static const uint16_t Opcodes[] = { - AArch64::LD4_8B, AArch64::LD4_4H, AArch64::LD4_2S, AArch64::LD1x4_1D, - AArch64::LD4_16B, AArch64::LD4_8H, AArch64::LD4_4S, AArch64::LD4_2D - }; - return SelectVLD(Node, false, 4, Opcodes); - } - case Intrinsic::aarch64_neon_vld1x2: { - static const uint16_t Opcodes[] = { - AArch64::LD1x2_8B, AArch64::LD1x2_4H, AArch64::LD1x2_2S, - AArch64::LD1x2_1D, AArch64::LD1x2_16B, AArch64::LD1x2_8H, - AArch64::LD1x2_4S, AArch64::LD1x2_2D - }; - return SelectVLD(Node, false, 2, Opcodes); - } - case Intrinsic::aarch64_neon_vld1x3: { - static const uint16_t Opcodes[] = { - AArch64::LD1x3_8B, AArch64::LD1x3_4H, AArch64::LD1x3_2S, - AArch64::LD1x3_1D, AArch64::LD1x3_16B, AArch64::LD1x3_8H, - AArch64::LD1x3_4S, AArch64::LD1x3_2D - }; - return SelectVLD(Node, false, 3, Opcodes); - } - case Intrinsic::aarch64_neon_vld1x4: { - static const uint16_t Opcodes[] = { - AArch64::LD1x4_8B, AArch64::LD1x4_4H, AArch64::LD1x4_2S, - AArch64::LD1x4_1D, AArch64::LD1x4_16B, AArch64::LD1x4_8H, - AArch64::LD1x4_4S, AArch64::LD1x4_2D - }; - return SelectVLD(Node, false, 4, Opcodes); - } - case Intrinsic::arm_neon_vst1: { - static const uint16_t Opcodes[] = { - AArch64::ST1_8B, AArch64::ST1_4H, AArch64::ST1_2S, AArch64::ST1_1D, - AArch64::ST1_16B, AArch64::ST1_8H, AArch64::ST1_4S, AArch64::ST1_2D - }; - return SelectVST(Node, false, 1, Opcodes); - } - case Intrinsic::arm_neon_vst2: { - static const uint16_t Opcodes[] = { - AArch64::ST2_8B, AArch64::ST2_4H, AArch64::ST2_2S, AArch64::ST1x2_1D, - AArch64::ST2_16B, AArch64::ST2_8H, AArch64::ST2_4S, AArch64::ST2_2D - }; - return SelectVST(Node, false, 2, Opcodes); + case Intrinsic::aarch64_neon_ld1x2: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld1x3: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return 
SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld1x4: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld3: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld4: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return 
SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld3r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld4r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2lane: + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectLoadLane(Node, 2, AArch64::LD2i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectLoadLane(Node, 2, AArch64::LD2i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectLoadLane(Node, 2, AArch64::LD2i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectLoadLane(Node, 2, AArch64::LD2i64); + break; + case Intrinsic::aarch64_neon_ld3lane: + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectLoadLane(Node, 3, AArch64::LD3i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectLoadLane(Node, 3, AArch64::LD3i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectLoadLane(Node, 3, 
+  case Intrinsic::aarch64_neon_ld2lane:
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectLoadLane(Node, 2, AArch64::LD2i8);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectLoadLane(Node, 2, AArch64::LD2i16);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectLoadLane(Node, 2, AArch64::LD2i32);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectLoadLane(Node, 2, AArch64::LD2i64);
+    break;
+  case Intrinsic::aarch64_neon_ld3lane:
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectLoadLane(Node, 3, AArch64::LD3i8);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectLoadLane(Node, 3, AArch64::LD3i16);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectLoadLane(Node, 3, AArch64::LD3i32);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectLoadLane(Node, 3, AArch64::LD3i64);
+    break;
+  case Intrinsic::aarch64_neon_ld4lane:
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectLoadLane(Node, 4, AArch64::LD4i8);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectLoadLane(Node, 4, AArch64::LD4i16);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectLoadLane(Node, 4, AArch64::LD4i32);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectLoadLane(Node, 4, AArch64::LD4i64);
+    break;
    }
-  case Intrinsic::arm_neon_vst3: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST3_8B, AArch64::ST3_4H, AArch64::ST3_2S, AArch64::ST1x3_1D,
-      AArch64::ST3_16B, AArch64::ST3_8H, AArch64::ST3_4S, AArch64::ST3_2D
-    };
-    return SelectVST(Node, false, 3, Opcodes);
+  } break;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_tbl2:
+      return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two
+                                                  : AArch64::TBLv16i8Two,
+                         false);
+    case Intrinsic::aarch64_neon_tbl3:
+      return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
+                                                  : AArch64::TBLv16i8Three,
+                         false);
+    case Intrinsic::aarch64_neon_tbl4:
+      return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
+                                                  : AArch64::TBLv16i8Four,
+                         false);
+    case Intrinsic::aarch64_neon_tbx2:
+      return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBXv8i8Two
+                                                  : AArch64::TBXv16i8Two,
+                         true);
+    case Intrinsic::aarch64_neon_tbx3:
+      return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
+                                                  : AArch64::TBXv16i8Three,
+                         true);
+    case Intrinsic::aarch64_neon_tbx4:
+      return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
+                                                  : AArch64::TBXv16i8Four,
+                         true);
+    case Intrinsic::aarch64_neon_smull:
+    case Intrinsic::aarch64_neon_umull:
+      if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node))
+        return N;
+      break;
    }
-  case Intrinsic::arm_neon_vst4: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST4_8B, AArch64::ST4_4H, AArch64::ST4_2S, AArch64::ST1x4_1D,
-      AArch64::ST4_16B, AArch64::ST4_8H, AArch64::ST4_4S, AArch64::ST4_2D
-    };
-    return SelectVST(Node, false, 4, Opcodes);
+    break;
+  }
+  case ISD::INTRINSIC_VOID: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    if (Node->getNumOperands() >= 3)
+      VT = Node->getOperand(2)->getValueType(0);
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_st1x2: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 2, AArch64::ST1Twov8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 2, AArch64::ST1Twov16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 2, AArch64::ST1Twov4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 2, AArch64::ST1Twov8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 2, AArch64::ST1Twov2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 2, AArch64::ST1Twov4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 2, AArch64::ST1Twov2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 2, AArch64::ST1Twov1d);
+      break;
    }
-  case Intrinsic::aarch64_neon_vst1x2: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST1x2_8B, AArch64::ST1x2_4H, AArch64::ST1x2_2S,
-      AArch64::ST1x2_1D, AArch64::ST1x2_16B, AArch64::ST1x2_8H,
-      AArch64::ST1x2_4S, AArch64::ST1x2_2D
-    };
-    return SelectVST(Node, false, 2, Opcodes);
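Note where the intrinsic number lives in the two dispatchers above: a chained node (ISD::INTRINSIC_W_CHAIN, ISD::INTRINSIC_VOID) carries the chain in operand 0, so the ID sits in operand 1 and the data operands start at 2, whereas an unchained ISD::INTRINSIC_WO_CHAIN node keeps the ID in operand 0. In isolation (a sketch mirroring the two cast<ConstantSDNode> calls above, not part of the patch):

    // Unchained node: (intrinsic-id, arg0, arg1, ...)
    static unsigned getIntrinsicID(const SDNode *N) {
      return cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    }
    // Chained node: (chain, intrinsic-id, arg0, arg1, ...)
    static unsigned getChainedIntrinsicID(const SDNode *N) {
      return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    }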
+    case Intrinsic::aarch64_neon_st1x3: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 3, AArch64::ST1Threev8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 3, AArch64::ST1Threev16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 3, AArch64::ST1Threev4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 3, AArch64::ST1Threev8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 3, AArch64::ST1Threev2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 3, AArch64::ST1Threev4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 3, AArch64::ST1Threev2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 3, AArch64::ST1Threev1d);
+      break;
    }
-  case Intrinsic::aarch64_neon_vst1x3: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST1x3_8B, AArch64::ST1x3_4H, AArch64::ST1x3_2S,
-      AArch64::ST1x3_1D, AArch64::ST1x3_16B, AArch64::ST1x3_8H,
-      AArch64::ST1x3_4S, AArch64::ST1x3_2D
-    };
-    return SelectVST(Node, false, 3, Opcodes);
+    case Intrinsic::aarch64_neon_st1x4: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 4, AArch64::ST1Fourv8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 4, AArch64::ST1Fourv16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 4, AArch64::ST1Fourv4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 4, AArch64::ST1Fourv8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 4, AArch64::ST1Fourv2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 4, AArch64::ST1Fourv4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 4, AArch64::ST1Fourv2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 4, AArch64::ST1Fourv1d);
+      break;
    }
-  case Intrinsic::aarch64_neon_vst1x4: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST1x4_8B, AArch64::ST1x4_4H, AArch64::ST1x4_2S,
-      AArch64::ST1x4_1D, AArch64::ST1x4_16B, AArch64::ST1x4_8H,
-      AArch64::ST1x4_4S, AArch64::ST1x4_2D
-    };
-    return SelectVST(Node, false, 4, Opcodes);
+    case Intrinsic::aarch64_neon_st2: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 2, AArch64::ST2Twov8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 2, AArch64::ST2Twov16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 2, AArch64::ST2Twov4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 2, AArch64::ST2Twov8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 2, AArch64::ST2Twov2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 2, AArch64::ST2Twov4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 2, AArch64::ST2Twov2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 2, AArch64::ST1Twov1d);
+      break;
    }
-  case Intrinsic::arm_neon_vld2lane: {
-    static const uint16_t Opcodes[] = {
-      AArch64::LD2LN_B, AArch64::LD2LN_H, AArch64::LD2LN_S, AArch64::LD2LN_D
-    };
-    return SelectVLDSTLane(Node, true, false, 2, Opcodes);
+    case Intrinsic::aarch64_neon_st3: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 3, AArch64::ST3Threev8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 3, AArch64::ST3Threev16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 3, AArch64::ST3Threev4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 3, AArch64::ST3Threev8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 3, AArch64::ST3Threev2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 3, AArch64::ST3Threev4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 3, AArch64::ST3Threev2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 3, AArch64::ST1Threev1d);
+      break;
    }
-  case Intrinsic::arm_neon_vld3lane: {
-    static const uint16_t Opcodes[] = {
-      AArch64::LD3LN_B, AArch64::LD3LN_H, AArch64::LD3LN_S, AArch64::LD3LN_D
-    };
-    return SelectVLDSTLane(Node, true, false, 3, Opcodes);
+    case Intrinsic::aarch64_neon_st4: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 4, AArch64::ST4Fourv8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 4, AArch64::ST4Fourv16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 4, AArch64::ST4Fourv4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 4, AArch64::ST4Fourv8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 4, AArch64::ST4Fourv2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 4, AArch64::ST4Fourv4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 4, AArch64::ST4Fourv2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 4, AArch64::ST1Fourv1d);
+      break;
    }
-  case Intrinsic::arm_neon_vld4lane: {
-    static const uint16_t Opcodes[] = {
-      AArch64::LD4LN_B, AArch64::LD4LN_H, AArch64::LD4LN_S, AArch64::LD4LN_D
-    };
-    return SelectVLDSTLane(Node, true, false, 4, Opcodes);
+    case Intrinsic::aarch64_neon_st2lane: {
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectStoreLane(Node, 2, AArch64::ST2i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectStoreLane(Node, 2, AArch64::ST2i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectStoreLane(Node, 2, AArch64::ST2i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectStoreLane(Node, 2, AArch64::ST2i64);
+      break;
    }
-  case Intrinsic::arm_neon_vst2lane: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST2LN_B, AArch64::ST2LN_H, AArch64::ST2LN_S, AArch64::ST2LN_D
-    };
-    return SelectVLDSTLane(Node, false, false, 2, Opcodes);
+    case Intrinsic::aarch64_neon_st3lane: {
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectStoreLane(Node, 3, AArch64::ST3i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectStoreLane(Node, 3, AArch64::ST3i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectStoreLane(Node, 3, AArch64::ST3i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectStoreLane(Node, 3, AArch64::ST3i64);
+      break;
    }
-  case Intrinsic::arm_neon_vst3lane: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST3LN_B, AArch64::ST3LN_H, AArch64::ST3LN_S, AArch64::ST3LN_D
-    };
-    return SelectVLDSTLane(Node, false, false, 3, Opcodes);
+    case Intrinsic::aarch64_neon_st4lane: {
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectStoreLane(Node, 4, AArch64::ST4i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectStoreLane(Node, 4, AArch64::ST4i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectStoreLane(Node, 4, AArch64::ST4i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectStoreLane(Node, 4, AArch64::ST4i64);
+      break;
    }
-  case Intrinsic::arm_neon_vst4lane: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST4LN_B, AArch64::ST4LN_H, AArch64::ST4LN_S, AArch64::ST4LN_D
-    };
-    return SelectVLDSTLane(Node, false, false, 4, Opcodes);
    }
-  } // End of switch IntNo
+  }
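The AArch64ISD::LDNpost/STNpost cases that follow select the post-indexed, write-back forms (e.g. ld2 { v0.4s, v1.4s }, [x0], #32); note also that the v1i64/v1f64 entries fall back to the LD1/ST1 multi-register opcodes, since the structured LDn/STn instructions have no .1d arrangement. The typical source of such nodes is a strided NEON loop (a sketch, assuming <arm_neon.h>):

    #include <arm_neon.h>
    // Deinterleaves n pairs of float32x4 groups. Folding the base-pointer
    // increment into the load is what produces an LD2post node.
    void deinterleave(const float *p, float32x4x2_t *out, int n) {
      for (int i = 0; i < n; ++i)
        out[i] = vld2q_f32(p + 8 * i);
    }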
+  case AArch64ISD::LD2post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD3post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD4post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1x2post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1x3post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1x4post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1DUPpost: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD2DUPpost: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD3DUPpost: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD4DUPpost: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
+    break;
+  }
+  case AArch64ISD::LD2LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
+    break;
+  }
+  case AArch64ISD::LD3LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
+    break;
+  }
+  case AArch64ISD::LD4LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
+    break;
+  }
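Unlike the whole-vector cases, the lane-indexed cases are width-agnostic: v2i32 and v4i32 both select LD2i32, because a lane access always operates on the full 128-bit register and only the element size matters. For reference (a sketch, assuming <arm_neon.h>):

    #include <arm_neon.h>
    // Reloads lane 1 of each vector in v; this maps to the LD2i32 pattern
    // whether the vectors are 64-bit or, as here, 128-bit wide.
    float32x4x2_t reload_lane1(const float *p, float32x4x2_t v) {
      return vld2q_lane_f32(p, v, 1);
    }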
+  case AArch64ISD::ST2post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+    break;
+  }
+  case AArch64ISD::ST3post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+    break;
+  }
+  case AArch64ISD::ST4post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+    break;
+  }
+  case AArch64ISD::ST1x2post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
+    break;
+  }
+  case AArch64ISD::ST1x3post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
+    break;
+  }
+  case AArch64ISD::ST1x4post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
+    break;
+  }
+  case AArch64ISD::ST2LANEpost: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
+    break;
+  }
+  case AArch64ISD::ST3LANEpost: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
+    break;
+  }
+  case AArch64ISD::ST4LANEpost: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
    break;
-  } // End of case ISD::INTRINSIC_VOID and :ISD::INTRINSIC_W_CHAIN
-  default:
-    break; // Let generic code handle it
  }
-  SDNode *ResNode = SelectCode(Node);
+  case ISD::FCEIL:
+  case ISD::FFLOOR:
+  case ISD::FTRUNC:
+  case ISD::FROUND:
+    if (SDNode *I = SelectLIBM(Node))
+      return I;
+    break;
+  }
-  DEBUG(dbgs() << "=> ";
-        if (ResNode == NULL || ResNode == Node)
-          Node->dump(CurDAG);
-        else
-          ResNode->dump(CurDAG);
-        dbgs() << "\n");
+  // Select the default instruction
+  ResNode = SelectCode(Node);
+
+  DEBUG(errs() << "=> ");
+  if (ResNode == nullptr || ResNode == Node)
+    DEBUG(Node->dump(CurDAG));
+  else
+    DEBUG(ResNode->dump(CurDAG));
+  DEBUG(errs() << "\n");
   return ResNode;
 }
-/// This pass converts a legalized DAG into a AArch64-specific DAG, ready for
-/// instruction scheduling.
-FunctionPass *llvm::createAArch64ISelDAG(AArch64TargetMachine &TM,
+/// createAArch64ISelDag - This pass converts a legalized DAG into a
+/// AArch64-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
                                          CodeGenOpt::Level OptLevel) {
   return new AArch64DAGToDAGISel(TM, OptLevel);
 }
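For context, this factory is what the target's pass configuration hands to the codegen pipeline; roughly (an approximate shape for illustration, not part of this patch):

    // In AArch64TargetMachine.cpp, the pass config's instruction-selection
    // hook constructs the DAG-to-DAG pass (sketch; exact wording may differ):
    bool AArch64PassConfig::addInstSelector() {
      addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel()));
      return false;
    }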
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 388973a..80d6669 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation -----===//
+//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -7,46 +7,87 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the interfaces that AArch64 uses to lower LLVM code into a
-// selection DAG.
+// This file implements the AArch64TargetLowering class.
 //
 //===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "aarch64-isel"
-#include "AArch64.h"
 #include "AArch64ISelLowering.h"
+#include "AArch64PerfectShuffle.h"
+#include "AArch64Subtarget.h"
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64TargetMachine.h"
 #include "AArch64TargetObjectFile.h"
-#include "Utils/AArch64BaseInfo.h"
-#include "llvm/CodeGen/Analysis.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/Support/MathExtras.h"
-
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
-static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
-  assert (TM.getSubtarget<AArch64Subtarget>().isTargetELF() &&
-          "unknown subtarget type");
-  return new AArch64ElfTargetObjectFile();
-}
+#define DEBUG_TYPE "aarch64-lower"
 
-AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
-  : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) {
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+
+enum AlignMode {
+  StrictAlign,
+  NoStrictAlign
+};
+
+static cl::opt<AlignMode>
+Align(cl::desc("Load/store alignment support"),
+      cl::Hidden, cl::init(NoStrictAlign),
+      cl::values(
+          clEnumValN(StrictAlign,   "aarch64-strict-align",
+                     "Disallow all unaligned memory accesses"),
+          clEnumValN(NoStrictAlign, "aarch64-no-strict-align",
+                     "Allow unaligned memory accesses"),
+          clEnumValEnd));
+
+// Place holder until extr generation is tested fully.
+static cl::opt<bool>
+EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
+                            cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
+                            cl::init(true));
+
+static cl::opt<bool>
+EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
+                           cl::desc("Allow AArch64 SLI/SRI formation"),
+                           cl::init(false));
+
+//===----------------------------------------------------------------------===//
+// AArch64 Lowering public interface.
+//===----------------------------------------------------------------------===//
+static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
+  if (TM.getSubtarget<AArch64Subtarget>().isTargetDarwin())
+    return new AArch64_MachoTargetObjectFile();
-  const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+  return new AArch64_ELFTargetObjectFile();
+}
-  // SIMD compares set the entire lane's bits to 1
+AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
+    : TargetLowering(TM, createTLOF(TM)) {
+  Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+
+  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
+  // we have to make something up. Arbitrarily, choose ZeroOrOne.
+  setBooleanContents(ZeroOrOneBooleanContent);
+  // When comparing vectors the result sets the different elements in the
+  // vector to all-one or all-zero.
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
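The two boolean-contents calls above tell target-independent DAG combines what a "true" produced by this target looks like: 0/1 in a scalar register, all-zeros/all-ones per lane in a vector. That is what licenses folds such as dropping an "and x, 1" mask from a setcc result, or lowering a vselect mask directly to BSL. Roughly how a combine consumes it (a sketch against the era's TargetLowering API, not from this patch):

    // With ZeroOrOneBooleanContent the combiner knows every bit above bit 0
    // of a setcc result is zero, so masking the result with 1 is a no-op.
    if (TLI.getBooleanContents(/*isVec=*/false) ==
        TargetLowering::ZeroOrOneBooleanContent) {
      // (and (setcc ...), 1) --> (setcc ...)
    }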
-  // Scalar register <-> type mapping
-  addRegisterClass(MVT::i32, &AArch64::GPR32RegClass);
-  addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
+  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
 
   if (Subtarget->hasFPARMv8()) {
     addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
@@ -56,201 +97,86 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
   }
 
   if (Subtarget->hasNEON()) {
-    // And the vectors
-    addRegisterClass(MVT::v1i8,  &AArch64::FPR8RegClass);
-    addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass);
-    addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass);
-    addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
-    addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass);
-    addRegisterClass(MVT::v8i8,  &AArch64::FPR64RegClass);
-    addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass);
-    addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass);
-    addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
-    addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass);
-    addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass);
-    addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass);
-    addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass);
-    addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass);
-    addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass);
-    addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
+    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
+    // Someone set us up the NEON.
+    addDRTypeForNEON(MVT::v2f32);
+    addDRTypeForNEON(MVT::v8i8);
+    addDRTypeForNEON(MVT::v4i16);
+    addDRTypeForNEON(MVT::v2i32);
+    addDRTypeForNEON(MVT::v1i64);
+    addDRTypeForNEON(MVT::v1f64);
+
+    addQRTypeForNEON(MVT::v4f32);
+    addQRTypeForNEON(MVT::v2f64);
+    addQRTypeForNEON(MVT::v16i8);
+    addQRTypeForNEON(MVT::v8i16);
+    addQRTypeForNEON(MVT::v4i32);
+    addQRTypeForNEON(MVT::v2i64);
   }
+  // Compute derived properties from the register classes
   computeRegisterProperties();
 
-  // We combine OR nodes for bitfield and NEON BSL operations.
-  setTargetDAGCombine(ISD::OR);
-
-  setTargetDAGCombine(ISD::AND);
-  setTargetDAGCombine(ISD::SRA);
-  setTargetDAGCombine(ISD::SRL);
-  setTargetDAGCombine(ISD::SHL);
-
-  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
-  setTargetDAGCombine(ISD::INTRINSIC_VOID);
-  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
-
-  // AArch64 does not have i1 loads, or much of anything for i1 really.
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
-
-  setStackPointerRegisterToSaveRestore(AArch64::XSP);
-  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
-  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
-  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
-
-  // We'll lower globals to wrappers for selection.
+  // Provide all sorts of operation actions
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
   setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
-
-  // A64 instructions have the comparison predicate attached to the user of the
-  // result, but having a separate comparison is valuable for matching.
+  setOperationAction(ISD::SETCC, MVT::i32, Custom);
+  setOperationAction(ISD::SETCC, MVT::i64, Custom);
+  setOperationAction(ISD::SETCC, MVT::f32, Custom);
+  setOperationAction(ISD::SETCC, MVT::f64, Custom);
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
   setOperationAction(ISD::BR_CC, MVT::i64, Custom);
   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
-
   setOperationAction(ISD::SELECT, MVT::i32, Custom);
   setOperationAction(ISD::SELECT, MVT::i64, Custom);
   setOperationAction(ISD::SELECT, MVT::f32, Custom);
   setOperationAction(ISD::SELECT, MVT::f64, Custom);
-
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
-
-  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
-
-  setOperationAction(ISD::SETCC, MVT::i32, Custom);
-  setOperationAction(ISD::SETCC, MVT::i64, Custom);
-  setOperationAction(ISD::SETCC, MVT::f32, Custom);
-  setOperationAction(ISD::SETCC, MVT::f64, Custom);
-
   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
-  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
-  setOperationAction(ISD::VASTART, MVT::Other, Custom);
-  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
-  setOperationAction(ISD::VAEND, MVT::Other, Expand);
-  setOperationAction(ISD::VAARG, MVT::Other, Expand);
-
-  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
-  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
-
-  setOperationAction(ISD::ROTL, MVT::i32, Expand);
-  setOperationAction(ISD::ROTL, MVT::i64, Expand);
-
-  setOperationAction(ISD::UREM, MVT::i32, Expand);
-  setOperationAction(ISD::UREM, MVT::i64, Expand);
-  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
-  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
-
-  setOperationAction(ISD::SREM, MVT::i32, Expand);
-  setOperationAction(ISD::SREM, MVT::i64, Expand);
-  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
-  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
-
-  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
-  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
-  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
-  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
-
-  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
-  setOperationAction(ISD::CTPOP, MVT::i64, Expand);
-
-  // Legal floating-point operations.
-  setOperationAction(ISD::FABS, MVT::f32, Legal);
-  setOperationAction(ISD::FABS, MVT::f64, Legal);
-
-  setOperationAction(ISD::FCEIL, MVT::f32, Legal);
-  setOperationAction(ISD::FCEIL, MVT::f64, Legal);
-
-  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
-  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
-
-  setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
-  setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
-
-  setOperationAction(ISD::FNEG, MVT::f32, Legal);
-  setOperationAction(ISD::FNEG, MVT::f64, Legal);
-
-  setOperationAction(ISD::FRINT, MVT::f32, Legal);
-  setOperationAction(ISD::FRINT, MVT::f64, Legal);
-
-  setOperationAction(ISD::FSQRT, MVT::f32, Legal);
-  setOperationAction(ISD::FSQRT, MVT::f64, Legal);
-
-  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
-  setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
-
-  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
-  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
-  setOperationAction(ISD::ConstantFP, MVT::f128, Legal);
-
-  // Illegal floating-point operations.
-  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
-  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
-
-  setOperationAction(ISD::FCOS, MVT::f32, Expand);
-  setOperationAction(ISD::FCOS, MVT::f64, Expand);
-
-  setOperationAction(ISD::FEXP, MVT::f32, Expand);
-  setOperationAction(ISD::FEXP, MVT::f64, Expand);
-
-  setOperationAction(ISD::FEXP2, MVT::f32, Expand);
-  setOperationAction(ISD::FEXP2, MVT::f64, Expand);
-
-  setOperationAction(ISD::FLOG, MVT::f32, Expand);
-  setOperationAction(ISD::FLOG, MVT::f64, Expand);
-
-  setOperationAction(ISD::FLOG2, MVT::f32, Expand);
-  setOperationAction(ISD::FLOG2, MVT::f64, Expand);
-
-  setOperationAction(ISD::FLOG10, MVT::f32, Expand);
-  setOperationAction(ISD::FLOG10, MVT::f64, Expand);
-
-  setOperationAction(ISD::FPOW, MVT::f32, Expand);
-  setOperationAction(ISD::FPOW, MVT::f64, Expand);
-
-  setOperationAction(ISD::FPOWI, MVT::f32, Expand);
-  setOperationAction(ISD::FPOWI, MVT::f64, Expand);
+  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
 
   setOperationAction(ISD::FREM, MVT::f32, Expand);
   setOperationAction(ISD::FREM, MVT::f64, Expand);
+  setOperationAction(ISD::FREM, MVT::f80, Expand);
 
-  setOperationAction(ISD::FSIN, MVT::f32, Expand);
-  setOperationAction(ISD::FSIN, MVT::f64, Expand);
-
-  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
-  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+  // Custom lowering hooks are needed for XOR
+  // to fold it into CSINC/CSINV.
+  setOperationAction(ISD::XOR, MVT::i32, Custom);
+  setOperationAction(ISD::XOR, MVT::i64, Custom);
 
   // Virtually no operation on f128 is legal, but LLVM can't expand them when
   // there's a valid register class, so we need custom operations in most cases.
-  setOperationAction(ISD::FABS, MVT::f128, Expand);
-  setOperationAction(ISD::FADD, MVT::f128, Custom);
-  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
-  setOperationAction(ISD::FCOS, MVT::f128, Expand);
-  setOperationAction(ISD::FDIV, MVT::f128, Custom);
-  setOperationAction(ISD::FMA, MVT::f128, Expand);
-  setOperationAction(ISD::FMUL, MVT::f128, Custom);
-  setOperationAction(ISD::FNEG, MVT::f128, Expand);
-  setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand);
-  setOperationAction(ISD::FP_ROUND, MVT::f128, Expand);
-  setOperationAction(ISD::FPOW, MVT::f128, Expand);
-  setOperationAction(ISD::FREM, MVT::f128, Expand);
-  setOperationAction(ISD::FRINT, MVT::f128, Expand);
-  setOperationAction(ISD::FSIN, MVT::f128, Expand);
-  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
-  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
-  setOperationAction(ISD::FSUB, MVT::f128, Custom);
-  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
-  setOperationAction(ISD::SETCC, MVT::f128, Custom);
-  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
-  setOperationAction(ISD::SELECT, MVT::f128, Expand);
-  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
-  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+  setOperationAction(ISD::FABS, MVT::f128, Expand);
+  setOperationAction(ISD::FADD, MVT::f128, Custom);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
+  setOperationAction(ISD::FCOS, MVT::f128, Expand);
+  setOperationAction(ISD::FDIV, MVT::f128, Custom);
+  setOperationAction(ISD::FMA, MVT::f128, Expand);
+  setOperationAction(ISD::FMUL, MVT::f128, Custom);
+  setOperationAction(ISD::FNEG, MVT::f128, Expand);
+  setOperationAction(ISD::FPOW, MVT::f128, Expand);
+  setOperationAction(ISD::FREM, MVT::f128, Expand);
+  setOperationAction(ISD::FRINT, MVT::f128, Expand);
+  setOperationAction(ISD::FSIN, MVT::f128, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
+  setOperationAction(ISD::FSUB, MVT::f128, Custom);
+  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
+  setOperationAction(ISD::SETCC, MVT::f128, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
+  setOperationAction(ISD::SELECT, MVT::f128, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
+  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
 
   // Lowering for many of the conversions is actually specified by the non-f128
   // type. The LowerXXX function will be trivial when f128 isn't involved.
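For f128, "Custom" here mostly means "emit a libcall": FPR128 gives the type a register class, so the generic expander will not break it apart, and the custom hooks route arithmetic to the soft-float runtime instead. For example (assuming the usual compiler-rt/libgcc routine names):

    // long double is IEEE binary128 on AArch64; with the hooks above this
    // addition becomes a call to __addtf3 rather than a native instruction.
    long double qadd(long double a, long double b) {
      return a + b;
    }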
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + // Variable arguments. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Custom); + setOperationAction(ISD::VACOPY, MVT::Other, Custom); + setOperationAction(ISD::VAEND, MVT::Other, Expand); - setTruncStoreAction(MVT::f128, MVT::f64, Expand); - setTruncStoreAction(MVT::f128, MVT::f32, Expand); - setTruncStoreAction(MVT::f128, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f32, MVT::f16, Expand); + // Variable-sized objects. + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); + // Exception handling. + // FIXME: These are guesses. Has this been defined yet? setExceptionPointerRegister(AArch64::X0); setExceptionSelectorRegister(AArch64::X1); - if (Subtarget->hasNEON()) { - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v1i64, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v16i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Expand); - - setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v1i16, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v1i32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); - - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); - - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i32, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal); - setOperationAction(ISD::CONCAT_VECTORS, 
MVT::v4i32, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal); - - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i8, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i16, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); - - setOperationAction(ISD::SETCC, MVT::v8i8, Custom); - setOperationAction(ISD::SETCC, MVT::v16i8, Custom); - setOperationAction(ISD::SETCC, MVT::v4i16, Custom); - setOperationAction(ISD::SETCC, MVT::v8i16, Custom); - setOperationAction(ISD::SETCC, MVT::v2i32, Custom); - setOperationAction(ISD::SETCC, MVT::v4i32, Custom); - setOperationAction(ISD::SETCC, MVT::v1i64, Custom); - setOperationAction(ISD::SETCC, MVT::v2i64, Custom); - setOperationAction(ISD::SETCC, MVT::v2f32, Custom); - setOperationAction(ISD::SETCC, MVT::v4f32, Custom); - setOperationAction(ISD::SETCC, MVT::v1f64, Custom); - setOperationAction(ISD::SETCC, MVT::v2f64, Custom); - - setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::v1f64, Legal); - setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); - - setOperationAction(ISD::FCEIL, MVT::v2f32, Legal); - setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); - setOperationAction(ISD::FCEIL, MVT::v1f64, Legal); - setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); - - setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::v1f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); - - setOperationAction(ISD::FRINT, MVT::v2f32, Legal); - setOperationAction(ISD::FRINT, MVT::v4f32, Legal); - setOperationAction(ISD::FRINT, MVT::v1f64, Legal); - setOperationAction(ISD::FRINT, MVT::v2f64, Legal); - - setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); - - setOperationAction(ISD::FROUND, MVT::v2f32, Legal); - setOperationAction(ISD::FROUND, MVT::v4f32, Legal); - setOperationAction(ISD::FROUND, MVT::v1f64, Legal); - setOperationAction(ISD::FROUND, MVT::v2f64, Legal); - - setOperationAction(ISD::SINT_TO_FP, MVT::v1i8, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v1i16, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v1i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); + // Constant pool entries + setOperationAction(ISD::ConstantPool, MVT::i64, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v1i8, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v1i16, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v1i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); + // BlockAddress + setOperationAction(ISD::BlockAddress, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v1i8, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v1i16, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v1i32, Custom); - 
setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Custom); - - setOperationAction(ISD::FP_TO_UINT, MVT::v1i8, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v1i16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v1i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Custom); - - // Neon does not support vector divide/remainder operations except - // floating-point divide. - setOperationAction(ISD::SDIV, MVT::v1i8, Expand); - setOperationAction(ISD::SDIV, MVT::v8i8, Expand); - setOperationAction(ISD::SDIV, MVT::v16i8, Expand); - setOperationAction(ISD::SDIV, MVT::v1i16, Expand); - setOperationAction(ISD::SDIV, MVT::v4i16, Expand); - setOperationAction(ISD::SDIV, MVT::v8i16, Expand); - setOperationAction(ISD::SDIV, MVT::v1i32, Expand); - setOperationAction(ISD::SDIV, MVT::v2i32, Expand); - setOperationAction(ISD::SDIV, MVT::v4i32, Expand); - setOperationAction(ISD::SDIV, MVT::v1i64, Expand); - setOperationAction(ISD::SDIV, MVT::v2i64, Expand); - - setOperationAction(ISD::UDIV, MVT::v1i8, Expand); - setOperationAction(ISD::UDIV, MVT::v8i8, Expand); - setOperationAction(ISD::UDIV, MVT::v16i8, Expand); - setOperationAction(ISD::UDIV, MVT::v1i16, Expand); - setOperationAction(ISD::UDIV, MVT::v4i16, Expand); - setOperationAction(ISD::UDIV, MVT::v8i16, Expand); - setOperationAction(ISD::UDIV, MVT::v1i32, Expand); - setOperationAction(ISD::UDIV, MVT::v2i32, Expand); - setOperationAction(ISD::UDIV, MVT::v4i32, Expand); - setOperationAction(ISD::UDIV, MVT::v1i64, Expand); - setOperationAction(ISD::UDIV, MVT::v2i64, Expand); - - setOperationAction(ISD::SREM, MVT::v1i8, Expand); - setOperationAction(ISD::SREM, MVT::v8i8, Expand); - setOperationAction(ISD::SREM, MVT::v16i8, Expand); - setOperationAction(ISD::SREM, MVT::v1i16, Expand); - setOperationAction(ISD::SREM, MVT::v4i16, Expand); - setOperationAction(ISD::SREM, MVT::v8i16, Expand); - setOperationAction(ISD::SREM, MVT::v1i32, Expand); - setOperationAction(ISD::SREM, MVT::v2i32, Expand); - setOperationAction(ISD::SREM, MVT::v4i32, Expand); - setOperationAction(ISD::SREM, MVT::v1i64, Expand); - setOperationAction(ISD::SREM, MVT::v2i64, Expand); - - setOperationAction(ISD::UREM, MVT::v1i8, Expand); - setOperationAction(ISD::UREM, MVT::v8i8, Expand); - setOperationAction(ISD::UREM, MVT::v16i8, Expand); - setOperationAction(ISD::UREM, MVT::v1i16, Expand); - setOperationAction(ISD::UREM, MVT::v4i16, Expand); - setOperationAction(ISD::UREM, MVT::v8i16, Expand); - setOperationAction(ISD::UREM, MVT::v1i32, Expand); - setOperationAction(ISD::UREM, MVT::v2i32, Expand); - setOperationAction(ISD::UREM, MVT::v4i32, Expand); - setOperationAction(ISD::UREM, MVT::v1i64, Expand); - setOperationAction(ISD::UREM, MVT::v2i64, Expand); - - setOperationAction(ISD::FREM, MVT::v2f32, Expand); - setOperationAction(ISD::FREM, MVT::v4f32, Expand); - setOperationAction(ISD::FREM, MVT::v1f64, Expand); - setOperationAction(ISD::FREM, MVT::v2f64, Expand); - - setOperationAction(ISD::SELECT, MVT::v8i8, Expand); - setOperationAction(ISD::SELECT, MVT::v16i8, Expand); - setOperationAction(ISD::SELECT, MVT::v4i16, Expand); - setOperationAction(ISD::SELECT, MVT::v8i16, Expand); - setOperationAction(ISD::SELECT, MVT::v2i32, Expand); - setOperationAction(ISD::SELECT, MVT::v4i32, Expand); - setOperationAction(ISD::SELECT, MVT::v1i64, 
Expand); - setOperationAction(ISD::SELECT, MVT::v2i64, Expand); - setOperationAction(ISD::SELECT, MVT::v2f32, Expand); - setOperationAction(ISD::SELECT, MVT::v4f32, Expand); - setOperationAction(ISD::SELECT, MVT::v1f64, Expand); - setOperationAction(ISD::SELECT, MVT::v2f64, Expand); - - setOperationAction(ISD::SELECT_CC, MVT::v8i8, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v16i8, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v4i16, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v8i16, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v2i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v4i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v1i64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v2i64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v2f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v4f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v1f64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::v2f64, Custom); - - // Vector ExtLoad and TruncStore are expanded. - for (unsigned I = MVT::FIRST_VECTOR_VALUETYPE; - I <= MVT::LAST_VECTOR_VALUETYPE; ++I) { - MVT VT = (MVT::SimpleValueType) I; - setLoadExtAction(ISD::SEXTLOAD, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, Expand); - for (unsigned II = MVT::FIRST_VECTOR_VALUETYPE; - II <= MVT::LAST_VECTOR_VALUETYPE; ++II) { - MVT VT1 = (MVT::SimpleValueType) II; - // A TruncStore has two vector types of the same number of elements - // and different element sizes. - if (VT.getVectorNumElements() == VT1.getVectorNumElements() && - VT.getVectorElementType().getSizeInBits() - > VT1.getVectorElementType().getSizeInBits()) - setTruncStoreAction(VT, VT1, Expand); - } - } + // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. + setOperationAction(ISD::ADDC, MVT::i32, Custom); + setOperationAction(ISD::ADDE, MVT::i32, Custom); + setOperationAction(ISD::SUBC, MVT::i32, Custom); + setOperationAction(ISD::SUBE, MVT::i32, Custom); + setOperationAction(ISD::ADDC, MVT::i64, Custom); + setOperationAction(ISD::ADDE, MVT::i64, Custom); + setOperationAction(ISD::SUBC, MVT::i64, Custom); + setOperationAction(ISD::SUBE, MVT::i64, Custom); + + // AArch64 lacks both left-rotate and popcount instructions. + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); - // There is no v1i64/v2i64 multiply, expand v1i64/v2i64 to GPR i64 multiply. - // FIXME: For a v2i64 multiply, we copy VPR to GPR and do 2 i64 multiplies, - // and then copy back to VPR. This solution may be optimized by Following 3 - // NEON instructions: - // pmull v2.1q, v0.1d, v1.1d - // pmull2 v3.1q, v0.2d, v1.2d - // ins v2.d[1], v3.d[0] - // As currently we can't verify the correctness of such assumption, we can - // do such optimization in the future. - setOperationAction(ISD::MUL, MVT::v1i64, Expand); - setOperationAction(ISD::MUL, MVT::v2i64, Expand); + // AArch64 doesn't have {U|S}MUL_LOHI. 
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::FCOS, MVT::v2f64, Expand); - setOperationAction(ISD::FCOS, MVT::v4f32, Expand); - setOperationAction(ISD::FCOS, MVT::v2f32, Expand); - setOperationAction(ISD::FSIN, MVT::v2f64, Expand); - setOperationAction(ISD::FSIN, MVT::v4f32, Expand); - setOperationAction(ISD::FSIN, MVT::v2f32, Expand); - setOperationAction(ISD::FPOW, MVT::v2f64, Expand); - setOperationAction(ISD::FPOW, MVT::v4f32, Expand); - setOperationAction(ISD::FPOW, MVT::v2f32, Expand); - } - setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::VSELECT); -} + // Expand the undefined-at-zero variants of cttz/ctlz to their defined-at-zero + // counterparts, which AArch64 supports directly. + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); -EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { - // It's reasonably important that this value matches the "natural" legal - // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself - // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64). - if (!VT.isVector()) return MVT::i32; - return VT.changeVectorElementTypeToInteger(); -} + setOperationAction(ISD::CTPOP, MVT::i32, Custom); + setOperationAction(ISD::CTPOP, MVT::i64, Custom); -static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord, - unsigned &LdrOpc, - unsigned &StrOpc) { - static const unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword, - AArch64::LDXR_word, AArch64::LDXR_dword}; - static const unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword, - AArch64::LDAXR_word, AArch64::LDAXR_dword}; - static const unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword, - AArch64::STXR_word, AArch64::STXR_dword}; - static const unsigned StoreRels[] = {AArch64::STLXR_byte,AArch64::STLXR_hword, - AArch64::STLXR_word, AArch64::STLXR_dword}; - - const unsigned *LoadOps, *StoreOps; - if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent) - LoadOps = LoadAcqs; - else - LoadOps = LoadBares; + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); - if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent) - StoreOps = StoreRels; - else - StoreOps = StoreBares; + // Custom lower Add/Sub/Mul with overflow.
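
[Editor's note: the "Custom lower Add/Sub/Mul with overflow" comment above introduces the ISD::SADDO/UADDO/SSUBO/USUBO/SMULO/UMULO actions set just below; getAArch64XALUOOp later in this patch lowers them onto flag-setting arithmetic (ADDS/SUBS plus a condition such as VS or HS). A minimal standalone sketch of the semantics being implemented, using GCC/Clang builtins and illustrative helper names, not any LLVM API:]

    #include <cstdint>
    #include <cassert>

    // ISD::SADDO / ISD::UADDO on i32: a value plus an overflow flag, the
    // analogue of ADDS setting V (signed) or C (unsigned) in NZCV.
    static bool saddo32(int32_t a, int32_t b, int32_t &out) {
      return __builtin_add_overflow(a, b, &out); // true iff signed overflow
    }
    static bool uaddo32(uint32_t a, uint32_t b, uint32_t &out) {
      return __builtin_add_overflow(a, b, &out); // true iff unsigned carry-out
    }

    int main() {
      int32_t s; uint32_t u;
      assert(saddo32(INT32_MAX, 1, s));       // overflows: the VS case
      assert(!uaddo32(1u, 2u, u) && u == 3u); // no carry
    }
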
+ setOperationAction(ISD::SADDO, MVT::i32, Custom); + setOperationAction(ISD::SADDO, MVT::i64, Custom); + setOperationAction(ISD::UADDO, MVT::i32, Custom); + setOperationAction(ISD::UADDO, MVT::i64, Custom); + setOperationAction(ISD::SSUBO, MVT::i32, Custom); + setOperationAction(ISD::SSUBO, MVT::i64, Custom); + setOperationAction(ISD::USUBO, MVT::i32, Custom); + setOperationAction(ISD::USUBO, MVT::i64, Custom); + setOperationAction(ISD::SMULO, MVT::i32, Custom); + setOperationAction(ISD::SMULO, MVT::i64, Custom); + setOperationAction(ISD::UMULO, MVT::i32, Custom); + setOperationAction(ISD::UMULO, MVT::i64, Custom); - assert(isPowerOf2_32(Size) && Size <= 8 && - "unsupported size for atomic binary op!"); + setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FSIN, MVT::f64, Expand); + setOperationAction(ISD::FCOS, MVT::f32, Expand); + setOperationAction(ISD::FCOS, MVT::f64, Expand); + setOperationAction(ISD::FPOW, MVT::f32, Expand); + setOperationAction(ISD::FPOW, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + + // AArch64 has implementations of a lot of rounding-like FP operations. + static MVT RoundingTypes[] = { MVT::f32, MVT::f64}; + for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) { + MVT Ty = RoundingTypes[I]; + setOperationAction(ISD::FFLOOR, Ty, Legal); + setOperationAction(ISD::FNEARBYINT, Ty, Legal); + setOperationAction(ISD::FCEIL, Ty, Legal); + setOperationAction(ISD::FRINT, Ty, Legal); + setOperationAction(ISD::FTRUNC, Ty, Legal); + setOperationAction(ISD::FROUND, Ty, Legal); + } - LdrOpc = LoadOps[Log2_32(Size)]; - StrOpc = StoreOps[Log2_32(Size)]; -} + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); -// FIXME: AArch64::DTripleRegClass and AArch64::QTripleRegClass don't really -// have value type mapped, and they are both being defined as MVT::untyped. -// Without knowing the MVT type, MachineLICM::getRegisterClassIDAndCost -// would fail to figure out the register pressure correctly. -std::pair<const TargetRegisterClass*, uint8_t> -AArch64TargetLowering::findRepresentativeClass(MVT VT) const{ - const TargetRegisterClass *RRC = 0; - uint8_t Cost = 1; - switch (VT.SimpleTy) { - default: - return TargetLowering::findRepresentativeClass(VT); - case MVT::v4i64: - RRC = &AArch64::QPairRegClass; - Cost = 2; - break; - case MVT::v8i64: - RRC = &AArch64::QQuadRegClass; - Cost = 4; - break; + if (Subtarget->isTargetMachO()) { + // For iOS, we don't want the normal expansion of a libcall to + // sincos. We want to issue a libcall to __sincos_stret to avoid memory + // traffic. + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + } else { + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } - return std::make_pair(RRC, Cost); -} -MachineBasicBlock * -AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, - unsigned Size, - unsigned BinOpcode) const { - // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + // AArch64 does not have floating-point extending loads, i1 sign-extending + // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
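
[Editor's note: the comment above names operations that the lines just below mark Expand. For a fused FP extending load or truncating store, "Expand" means the legalizer splits the operation into a plain memory access plus an explicit conversion. A hedged standalone illustration in plain C++, not the legalizer itself:]

    // An f32 -> f64 extending load becomes a plain f32 load plus an fpext.
    static double load_ext(const float *p) {
      float f = *p;                    // plain f32 load
      return static_cast<double>(f);   // separate extend
    }
    // An f64 -> f32 truncating store becomes an fptrunc plus a plain store.
    static void store_trunc(float *p, double d) {
      float f = static_cast<float>(d); // separate truncate
      *p = f;                          // plain f32 store
    }
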
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f128, MVT::f80, Expand); + setTruncStoreAction(MVT::f128, MVT::f64, Expand); + setTruncStoreAction(MVT::f128, MVT::f32, Expand); + setTruncStoreAction(MVT::f128, MVT::f16, Expand); + // Indexed loads and stores are supported. + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, MVT::i8, Legal); + setIndexedLoadAction(im, MVT::i16, Legal); + setIndexedLoadAction(im, MVT::i32, Legal); + setIndexedLoadAction(im, MVT::i64, Legal); + setIndexedLoadAction(im, MVT::f64, Legal); + setIndexedLoadAction(im, MVT::f32, Legal); + setIndexedStoreAction(im, MVT::i8, Legal); + setIndexedStoreAction(im, MVT::i16, Legal); + setIndexedStoreAction(im, MVT::i32, Legal); + setIndexedStoreAction(im, MVT::i64, Legal); + setIndexedStoreAction(im, MVT::f64, Legal); + setIndexedStoreAction(im, MVT::f32, Legal); + } - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction *MF = BB->getParent(); - MachineFunction::iterator It = BB; - ++It; + // Trap. + setOperationAction(ISD::TRAP, MVT::Other, Legal); - unsigned dest = MI->getOperand(0).getReg(); - unsigned ptr = MI->getOperand(1).getReg(); - unsigned incr = MI->getOperand(2).getReg(); - AtomicOrdering Ord = static_cast(MI->getOperand(3).getImm()); - DebugLoc dl = MI->getDebugLoc(); - - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - - unsigned ldrOpc, strOpc; - getExclusiveOperation(Size, Ord, ldrOpc, strOpc); - - MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MF->insert(It, loopMBB); - MF->insert(It, exitMBB); - - // Transfer the remainder of BB and its successor edges to exitMBB. - exitMBB->splice(exitMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - exitMBB->transferSuccessorsAndUpdatePHIs(BB); - - const TargetRegisterClass *TRC - = Size == 8 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; - unsigned scratch = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC); - - // thisMBB: - // ... - // fallthrough --> loopMBB - BB->addSuccessor(loopMBB); - - // loopMBB: - // ldxr dest, ptr - // scratch, dest, incr - // stxr stxr_status, scratch, ptr - // cbnz stxr_status, loopMBB - // fallthrough --> exitMBB - BB = loopMBB; - BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); - if (BinOpcode) { - // All arithmetic operations we'll be creating are designed to take an extra - // shift or extend operand, which we can conveniently set to zero. - - // Operand order needs to go the other way for NAND. - if (BinOpcode == AArch64::BICwww_lsl || BinOpcode == AArch64::BICxxx_lsl) - BuildMI(BB, dl, TII->get(BinOpcode), scratch) - .addReg(incr).addReg(dest).addImm(0); - else - BuildMI(BB, dl, TII->get(BinOpcode), scratch) - .addReg(dest).addReg(incr).addImm(0); - } + // We combine OR nodes for bitfield operations. 
+ setTargetDAGCombine(ISD::OR); - // From the stxr, the register is GPR32; from the cmp it's GPR32wsp - unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass); + // Vector add and sub nodes may conceal a high-half opportunity. + // Also, try to fold ADD into CSINC/CSINV. + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); - BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr); - BuildMI(BB, dl, TII->get(AArch64::CBNZw)) - .addReg(stxr_status).addMBB(loopMBB); + setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::SINT_TO_FP); + setTargetDAGCombine(ISD::UINT_TO_FP); - BB->addSuccessor(loopMBB); - BB->addSuccessor(exitMBB); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); - // exitMBB: - // ... - BB = exitMBB; + setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::BITCAST); + setTargetDAGCombine(ISD::CONCAT_VECTORS); + setTargetDAGCombine(ISD::STORE); - MI->eraseFromParent(); // The instruction is gone now. + setTargetDAGCombine(ISD::MUL); - return BB; -} + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::VSELECT); -MachineBasicBlock * -AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI, - MachineBasicBlock *BB, - unsigned Size, - unsigned CmpOp, - A64CC::CondCodes Cond) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction *MF = BB->getParent(); - MachineFunction::iterator It = BB; - ++It; + MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; + MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; + MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; - unsigned dest = MI->getOperand(0).getReg(); - unsigned ptr = MI->getOperand(1).getReg(); - unsigned incr = MI->getOperand(2).getReg(); - AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm()); + setStackPointerRegisterToSaveRestore(AArch64::SP); - unsigned oldval = dest; - DebugLoc dl = MI->getDebugLoc(); + setSchedulingPreference(Sched::Hybrid); - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const TargetRegisterClass *TRC, *TRCsp; - if (Size == 8) { - TRC = &AArch64::GPR64RegClass; - TRCsp = &AArch64::GPR64xspRegClass; - } else { - TRC = &AArch64::GPR32RegClass; - TRCsp = &AArch64::GPR32wspRegClass; - } + // Enable TBZ/TBNZ + MaskAndBranchFoldingIsLegal = true; + + setMinFunctionAlignment(2); + + RequireStrictAlign = (Align == StrictAlign); + + setHasExtractBitsInsn(true); + + if (Subtarget->hasNEON()) { + // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to + // silliness like this: + setOperationAction(ISD::FABS, MVT::v1f64, Expand); + setOperationAction(ISD::FADD, MVT::v1f64, Expand); + setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); + setOperationAction(ISD::FCOS, MVT::v1f64, Expand); + setOperationAction(ISD::FDIV, MVT::v1f64, Expand); + setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); + setOperationAction(ISD::FMA, MVT::v1f64, Expand); + setOperationAction(ISD::FMUL, MVT::v1f64, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); + setOperationAction(ISD::FNEG, MVT::v1f64, Expand); + setOperationAction(ISD::FPOW, MVT::v1f64, Expand); +
setOperationAction(ISD::FREM, MVT::v1f64, Expand); + setOperationAction(ISD::FROUND, MVT::v1f64, Expand); + setOperationAction(ISD::FRINT, MVT::v1f64, Expand); + setOperationAction(ISD::FSIN, MVT::v1f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); + setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); + setOperationAction(ISD::FSUB, MVT::v1f64, Expand); + setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); + setOperationAction(ISD::SETCC, MVT::v1f64, Expand); + setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); + setOperationAction(ISD::SELECT, MVT::v1f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); + + setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); + + setOperationAction(ISD::MUL, MVT::v1i64, Expand); + + // AArch64 doesn't have direct vector ->f32 conversion instructions for + // elements smaller than i32, so promote the input to i32 first. + setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote); + // Similarly, there is no direct i32 -> f64 vector conversion instruction. + setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); - unsigned ldrOpc, strOpc; - getExclusiveOperation(Size, Ord, ldrOpc, strOpc); + // AArch64 doesn't have MUL.2d: + setOperationAction(ISD::MUL, MVT::v2i64, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + // Likewise, narrowing and extending vector loads/stores aren't handled + // directly. + for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { + + setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, + Expand); + + setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); + + setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); + + for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) + setTruncStoreAction((MVT::SimpleValueType)VT, + (MVT::SimpleValueType)InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); + setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); + } - MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MF->insert(It, loopMBB); - MF->insert(It, exitMBB); + // AArch64 has implementations of a lot of rounding-like FP operations.
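
[Editor's note: the loop that follows marks FFLOOR/FNEARBYINT/FCEIL/FRINT/FTRUNC/FROUND as Legal for the vector FP types too. As far as I can tell these correspond to the libm rounding family and to AArch64's FRINTM/FRINTP/FRINTZ/FRINTA/FRINTX/FRINTI instructions. A self-checking sketch of the intended semantics, assuming the default FP environment (round-to-nearest-even):]

    #include <cmath>
    #include <cassert>

    int main() {
      assert(std::floor(-1.5) == -2.0);    // FFLOOR     ~ frintm (toward -inf)
      assert(std::ceil(-1.5) == -1.0);     // FCEIL      ~ frintp (toward +inf)
      assert(std::trunc(-1.5) == -1.0);    // FTRUNC     ~ frintz (toward zero)
      assert(std::round(-1.5) == -2.0);    // FROUND     ~ frinta (ties away)
      assert(std::rint(2.5) == 2.0);       // FRINT      ~ frintx (current mode)
      assert(std::nearbyint(2.5) == 2.0);  // FNEARBYINT ~ frinti (no inexact)
      return 0;
    }
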
+ static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 }; + for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) { + MVT Ty = RoundingVecTypes[I]; + setOperationAction(ISD::FFLOOR, Ty, Legal); + setOperationAction(ISD::FNEARBYINT, Ty, Legal); + setOperationAction(ISD::FCEIL, Ty, Legal); + setOperationAction(ISD::FRINT, Ty, Legal); + setOperationAction(ISD::FTRUNC, Ty, Legal); + setOperationAction(ISD::FROUND, Ty, Legal); + } + } +} - // Transfer the remainder of BB and its successor edges to exitMBB. - exitMBB->splice(exitMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - exitMBB->transferSuccessorsAndUpdatePHIs(BB); +void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { + if (VT == MVT::v2f32) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); - unsigned scratch = MRI.createVirtualRegister(TRC); - MRI.constrainRegClass(scratch, TRCsp); + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); + } else if (VT == MVT::v2f64 || VT == MVT::v4f32) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); - // thisMBB: - // ... - // fallthrough --> loopMBB - BB->addSuccessor(loopMBB); + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64); + } - // loopMBB: - // ldxr dest, ptr - // cmp incr, dest (, sign extend if necessary) - // csel scratch, dest, incr, cond - // stxr stxr_status, scratch, ptr - // cbnz stxr_status, loopMBB - // fallthrough --> exitMBB - BB = loopMBB; - BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); + // Mark vector float intrinsics as expand. + if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { + setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); + } - // Build compare and cmov instructions. 
- MRI.constrainRegClass(incr, TRCsp); - BuildMI(BB, dl, TII->get(CmpOp)) - .addReg(incr).addReg(oldval).addImm(0); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); + setOperationAction(ISD::AND, VT.getSimpleVT(), Custom); + setOperationAction(ISD::OR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); + + setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); + setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); + setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand); + + // CNT supports only B element sizes. + if (VT != MVT::v8i8 && VT != MVT::v16i8) + setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand); + + setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); + + setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); + + if (Subtarget->isLittleEndian()) { + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, VT.getSimpleVT(), Legal); + setIndexedStoreAction(im, VT.getSimpleVT(), Legal); + } + } +} - BuildMI(BB, dl, TII->get(Size == 8 ? AArch64::CSELxxxc : AArch64::CSELwwwc), - scratch) - .addReg(oldval).addReg(incr).addImm(Cond); +void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { + addRegisterClass(VT, &AArch64::FPR64RegClass); + addTypeForNEON(VT, MVT::v2i32); +} - unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass); +void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { + addRegisterClass(VT, &AArch64::FPR128RegClass); + addTypeForNEON(VT, MVT::v4i32); +} - BuildMI(BB, dl, TII->get(strOpc), stxr_status) - .addReg(scratch).addReg(ptr); - BuildMI(BB, dl, TII->get(AArch64::CBNZw)) - .addReg(stxr_status).addMBB(loopMBB); +EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { + if (!VT.isVector()) + return MVT::i32; + return VT.changeVectorElementTypeToInteger(); +} - BB->addSuccessor(loopMBB); - BB->addSuccessor(exitMBB); +/// computeKnownBitsForTargetNode - Determine which of the bits specified in +/// Mask are known to be either zero or one and return them in the +/// KnownZero/KnownOne bitsets. 
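
[Editor's note: in the function defined next, the AArch64ISD::CSEL case intersects the known-bit masks of both operands, since a bit of a select is certain only if both possible results agree on it. A standalone analogue on plain 64-bit masks; struct and helper names are illustrative, not LLVM's APInt API:]

    #include <cstdint>
    #include <cassert>

    struct Known { uint64_t Zero, One; };        // disjoint certainty masks
    // Known bits of a select: keep only bits known identically on both arms.
    static Known knownForSelect(Known A, Known B) {
      return { A.Zero & B.Zero, A.One & B.One };
    }

    int main() {
      Known a{0xF0, 0x0F}, b{0xF1, 0x0E};
      Known s = knownForSelect(a, b);
      assert(s.Zero == 0xF0 && s.One == 0x0E); // only agreed-upon bits survive
    }
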
+void AArch64TargetLowering::computeKnownBitsForTargetNode( + const SDValue Op, APInt &KnownZero, APInt &KnownOne, + const SelectionDAG &DAG, unsigned Depth) const { + switch (Op.getOpcode()) { + default: + break; + case AArch64ISD::CSEL: { + APInt KnownZero2, KnownOne2; + DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1); + DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1); + KnownZero &= KnownZero2; + KnownOne &= KnownOne2; + break; + } + case ISD::INTRINSIC_W_CHAIN: { + ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); + Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); + switch (IntID) { + default: return; + case Intrinsic::aarch64_ldaxr: + case Intrinsic::aarch64_ldxr: { + unsigned BitWidth = KnownOne.getBitWidth(); + EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); + unsigned MemBits = VT.getScalarType().getSizeInBits(); + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); + return; + } + } + break; + } + case ISD::INTRINSIC_WO_CHAIN: + case ISD::INTRINSIC_VOID: { + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + switch (IntNo) { + default: + break; + case Intrinsic::aarch64_neon_umaxv: + case Intrinsic::aarch64_neon_uminv: { + // Figure out the datatype of the vector operand. The UMINV instruction + // will zero extend the result, so we can mark as known zero all the + // bits larger than the element datatype. 32-bit or larger doesn't need + // this as those are legal types and will be handled by isel directly. + MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); + unsigned BitWidth = KnownZero.getBitWidth(); + if (VT == MVT::v8i8 || VT == MVT::v16i8) { + assert(BitWidth >= 8 && "Unexpected width!"); + APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); + KnownZero |= Mask; + } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { + assert(BitWidth >= 16 && "Unexpected width!"); + APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); + KnownZero |= Mask; + } + break; + } break; + } + } + } +} - // exitMBB: - // ... - BB = exitMBB; +MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const { + return MVT::i64; +} - MI->eraseFromParent(); // The instruction is gone now. +unsigned AArch64TargetLowering::getMaximalGlobalOffset() const { + // FIXME: On AArch64, this depends on the type. + // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(), + // and the offset has to be a multiple of the related size in bytes. + return 4095; +} - return BB; +FastISel * +AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const { + return AArch64::createFastISel(funcInfo, libInfo); } -MachineBasicBlock * -AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI, - MachineBasicBlock *BB, - unsigned Size) const { - unsigned dest = MI->getOperand(0).getReg(); - unsigned ptr = MI->getOperand(1).getReg(); - unsigned oldval = MI->getOperand(2).getReg(); - unsigned newval = MI->getOperand(3).getReg(); - AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm()); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - DebugLoc dl = MI->getDebugLoc(); - - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const TargetRegisterClass *TRCsp; - TRCsp = Size == 8 ?
&AArch64::GPR64xspRegClass : &AArch64::GPR32wspRegClass; - - unsigned ldrOpc, strOpc; - getExclusiveOperation(Size, Ord, ldrOpc, strOpc); - - MachineFunction *MF = BB->getParent(); - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; // insert the new blocks after the current block - - MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MF->insert(It, loop1MBB); - MF->insert(It, loop2MBB); - MF->insert(It, exitMBB); - - // Transfer the remainder of BB and its successor edges to exitMBB. - exitMBB->splice(exitMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); - exitMBB->transferSuccessorsAndUpdatePHIs(BB); - - // thisMBB: - // ... - // fallthrough --> loop1MBB - BB->addSuccessor(loop1MBB); - - // loop1MBB: - // ldxr dest, [ptr] - // cmp dest, oldval - // b.ne exitMBB - BB = loop1MBB; - BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); - - unsigned CmpOp = Size == 8 ? AArch64::CMPxx_lsl : AArch64::CMPww_lsl; - MRI.constrainRegClass(dest, TRCsp); - BuildMI(BB, dl, TII->get(CmpOp)) - .addReg(dest).addReg(oldval).addImm(0); - BuildMI(BB, dl, TII->get(AArch64::Bcc)) - .addImm(A64CC::NE).addMBB(exitMBB); - BB->addSuccessor(loop2MBB); - BB->addSuccessor(exitMBB); - - // loop2MBB: - // strex stxr_status, newval, [ptr] - // cbnz stxr_status, loop1MBB - BB = loop2MBB; - unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass); - - BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr); - BuildMI(BB, dl, TII->get(AArch64::CBNZw)) - .addReg(stxr_status).addMBB(loop1MBB); - BB->addSuccessor(loop1MBB); - BB->addSuccessor(exitMBB); - - // exitMBB: - // ... - BB = exitMBB; - - MI->eraseFromParent(); // The instruction is gone now. 
- - return BB; +const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: + return nullptr; + case AArch64ISD::CALL: return "AArch64ISD::CALL"; + case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; + case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; + case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; + case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; + case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; + case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; + case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; + case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; + case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; + case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; + case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; + case AArch64ISD::TLSDESC_CALL: return "AArch64ISD::TLSDESC_CALL"; + case AArch64ISD::ADC: return "AArch64ISD::ADC"; + case AArch64ISD::SBC: return "AArch64ISD::SBC"; + case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; + case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; + case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; + case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; + case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; + case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; + case AArch64ISD::FMIN: return "AArch64ISD::FMIN"; + case AArch64ISD::FMAX: return "AArch64ISD::FMAX"; + case AArch64ISD::DUP: return "AArch64ISD::DUP"; + case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; + case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; + case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; + case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; + case AArch64ISD::MOVI: return "AArch64ISD::MOVI"; + case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; + case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; + case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; + case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; + case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; + case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; + case AArch64ISD::BICi: return "AArch64ISD::BICi"; + case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; + case AArch64ISD::BSL: return "AArch64ISD::BSL"; + case AArch64ISD::NEG: return "AArch64ISD::NEG"; + case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; + case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; + case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; + case AArch64ISD::UZP1: return "AArch64ISD::UZP1"; + case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; + case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; + case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; + case AArch64ISD::REV16: return "AArch64ISD::REV16"; + case AArch64ISD::REV32: return "AArch64ISD::REV32"; + case AArch64ISD::REV64: return "AArch64ISD::REV64"; + case AArch64ISD::EXT: return "AArch64ISD::EXT"; + case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; + case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; + case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; + case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; + case AArch64ISD::CMGE: return "AArch64ISD::CMGE"; + case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; + case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; + case AArch64ISD::CMHS: return "AArch64ISD::CMHS"; + case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; + case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; + case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; + case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; + case AArch64ISD::CMGEz: return 
"AArch64ISD::CMGEz"; + case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; + case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; + case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; + case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; + case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; + case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; + case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; + case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; + case AArch64ISD::NOT: return "AArch64ISD::NOT"; + case AArch64ISD::BIT: return "AArch64ISD::BIT"; + case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; + case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; + case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; + case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; + case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; + case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; + case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; + case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; + case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; + case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; + case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; + case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; + case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; + case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; + case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; + case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; + case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; + case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; + case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; + case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; + case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; + case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; + case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post"; + case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; + case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; + case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost"; + case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; + case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; + case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; + case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; + case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; + case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; + case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; + case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; + case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; + case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; + } } MachineBasicBlock * AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, MachineBasicBlock *MBB) const { - // We materialise the F128CSEL pseudo-instruction using conditional branches - // and loads, giving an instruciton sequence like: - // str q0, [sp] - // b.ne IfTrue - // b Finish - // IfTrue: - // str q1, [sp] - // Finish: - // ldr q0, [sp] - // - // Using virtual registers would probably not be beneficial since COPY - // instructions are expensive for f128 (there's no actual instruction to - // implement them). - // - // An alternative would be to do an integer-CSEL on some address. E.g.: - // mov x0, sp - // add x1, sp, #16 - // str q0, [x0] - // str q1, [x1] - // csel x0, x0, x1, ne - // ldr q0, [x0] - // - // It's unclear which approach is actually optimal. 
+ // We materialise the F128CSEL pseudo-instruction as some control flow and a + // phi node: + + // OrigBB: + // [... previous instrs leading to comparison ...] + // b.ne TrueBB + // b EndBB + // TrueBB: + // ; Fallthrough + // EndBB: + // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); MachineFunction *MF = MBB->getParent(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); @@ -906,49 +792,24 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, MBB->end()); EndBB->transferSuccessorsAndUpdatePHIs(MBB); - // We need somewhere to store the f128 value needed. - int ScratchFI = MF->getFrameInfo()->CreateSpillStackObject(16, 16); - - // [... start of incoming MBB ...] - // str qIFFALSE, [sp] - // b.cc IfTrue - // b Done - BuildMI(MBB, DL, TII->get(AArch64::LSFP128_STR)) - .addReg(IfFalseReg) - .addFrameIndex(ScratchFI) - .addImm(0); - BuildMI(MBB, DL, TII->get(AArch64::Bcc)) - .addImm(CondCode) - .addMBB(TrueBB); - BuildMI(MBB, DL, TII->get(AArch64::Bimm)) - .addMBB(EndBB); + BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); + BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); MBB->addSuccessor(TrueBB); MBB->addSuccessor(EndBB); + // TrueBB falls through to the end. + TrueBB->addSuccessor(EndBB); + if (!NZCVKilled) { - // NZCV is live-through TrueBB. TrueBB->addLiveIn(AArch64::NZCV); EndBB->addLiveIn(AArch64::NZCV); } - // IfTrue: - // str qIFTRUE, [sp] - BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR)) - .addReg(IfTrueReg) - .addFrameIndex(ScratchFI) - .addImm(0); - - // Note: fallthrough. We can rely on LLVM adding a branch if it reorders the - // blocks. - TrueBB->addSuccessor(EndBB); - - // Done: - // ldr qDEST, [sp] - // [... rest of incoming MBB ...] 
- MachineInstr *StartOfEnd = EndBB->begin(); - BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg) - .addFrameIndex(ScratchFI) - .addImm(0); + BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) + .addReg(IfTrueReg) + .addMBB(TrueBB) + .addReg(IfFalseReg) + .addMBB(MBB); MI->eraseFromParent(); return EndBB; @@ -956,853 +817,1140 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, MachineBasicBlock * AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *MBB) const { + MachineBasicBlock *BB) const { switch (MI->getOpcode()) { - default: llvm_unreachable("Unhandled instruction with custom inserter"); - case AArch64::F128CSEL: - return EmitF128CSEL(MI, MBB); - case AArch64::ATOMIC_LOAD_ADD_I8: - return emitAtomicBinary(MI, MBB, 1, AArch64::ADDwww_lsl); - case AArch64::ATOMIC_LOAD_ADD_I16: - return emitAtomicBinary(MI, MBB, 2, AArch64::ADDwww_lsl); - case AArch64::ATOMIC_LOAD_ADD_I32: - return emitAtomicBinary(MI, MBB, 4, AArch64::ADDwww_lsl); - case AArch64::ATOMIC_LOAD_ADD_I64: - return emitAtomicBinary(MI, MBB, 8, AArch64::ADDxxx_lsl); - - case AArch64::ATOMIC_LOAD_SUB_I8: - return emitAtomicBinary(MI, MBB, 1, AArch64::SUBwww_lsl); - case AArch64::ATOMIC_LOAD_SUB_I16: - return emitAtomicBinary(MI, MBB, 2, AArch64::SUBwww_lsl); - case AArch64::ATOMIC_LOAD_SUB_I32: - return emitAtomicBinary(MI, MBB, 4, AArch64::SUBwww_lsl); - case AArch64::ATOMIC_LOAD_SUB_I64: - return emitAtomicBinary(MI, MBB, 8, AArch64::SUBxxx_lsl); - - case AArch64::ATOMIC_LOAD_AND_I8: - return emitAtomicBinary(MI, MBB, 1, AArch64::ANDwww_lsl); - case AArch64::ATOMIC_LOAD_AND_I16: - return emitAtomicBinary(MI, MBB, 2, AArch64::ANDwww_lsl); - case AArch64::ATOMIC_LOAD_AND_I32: - return emitAtomicBinary(MI, MBB, 4, AArch64::ANDwww_lsl); - case AArch64::ATOMIC_LOAD_AND_I64: - return emitAtomicBinary(MI, MBB, 8, AArch64::ANDxxx_lsl); - - case AArch64::ATOMIC_LOAD_OR_I8: - return emitAtomicBinary(MI, MBB, 1, AArch64::ORRwww_lsl); - case AArch64::ATOMIC_LOAD_OR_I16: - return emitAtomicBinary(MI, MBB, 2, AArch64::ORRwww_lsl); - case AArch64::ATOMIC_LOAD_OR_I32: - return emitAtomicBinary(MI, MBB, 4, AArch64::ORRwww_lsl); - case AArch64::ATOMIC_LOAD_OR_I64: - return emitAtomicBinary(MI, MBB, 8, AArch64::ORRxxx_lsl); - - case AArch64::ATOMIC_LOAD_XOR_I8: - return emitAtomicBinary(MI, MBB, 1, AArch64::EORwww_lsl); - case AArch64::ATOMIC_LOAD_XOR_I16: - return emitAtomicBinary(MI, MBB, 2, AArch64::EORwww_lsl); - case AArch64::ATOMIC_LOAD_XOR_I32: - return emitAtomicBinary(MI, MBB, 4, AArch64::EORwww_lsl); - case AArch64::ATOMIC_LOAD_XOR_I64: - return emitAtomicBinary(MI, MBB, 8, AArch64::EORxxx_lsl); - - case AArch64::ATOMIC_LOAD_NAND_I8: - return emitAtomicBinary(MI, MBB, 1, AArch64::BICwww_lsl); - case AArch64::ATOMIC_LOAD_NAND_I16: - return emitAtomicBinary(MI, MBB, 2, AArch64::BICwww_lsl); - case AArch64::ATOMIC_LOAD_NAND_I32: - return emitAtomicBinary(MI, MBB, 4, AArch64::BICwww_lsl); - case AArch64::ATOMIC_LOAD_NAND_I64: - return emitAtomicBinary(MI, MBB, 8, AArch64::BICxxx_lsl); - - case AArch64::ATOMIC_LOAD_MIN_I8: - return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::GT); - case AArch64::ATOMIC_LOAD_MIN_I16: - return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::GT); - case AArch64::ATOMIC_LOAD_MIN_I32: - return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::GT); - case AArch64::ATOMIC_LOAD_MIN_I64: - return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::GT); - - case 
AArch64::ATOMIC_LOAD_MAX_I8: - return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::LT); - case AArch64::ATOMIC_LOAD_MAX_I16: - return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::LT); - case AArch64::ATOMIC_LOAD_MAX_I32: - return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LT); - case AArch64::ATOMIC_LOAD_MAX_I64: - return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LT); - - case AArch64::ATOMIC_LOAD_UMIN_I8: - return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::HI); - case AArch64::ATOMIC_LOAD_UMIN_I16: - return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::HI); - case AArch64::ATOMIC_LOAD_UMIN_I32: - return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::HI); - case AArch64::ATOMIC_LOAD_UMIN_I64: - return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::HI); - - case AArch64::ATOMIC_LOAD_UMAX_I8: - return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::LO); - case AArch64::ATOMIC_LOAD_UMAX_I16: - return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::LO); - case AArch64::ATOMIC_LOAD_UMAX_I32: - return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LO); - case AArch64::ATOMIC_LOAD_UMAX_I64: - return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LO); - - case AArch64::ATOMIC_SWAP_I8: - return emitAtomicBinary(MI, MBB, 1, 0); - case AArch64::ATOMIC_SWAP_I16: - return emitAtomicBinary(MI, MBB, 2, 0); - case AArch64::ATOMIC_SWAP_I32: - return emitAtomicBinary(MI, MBB, 4, 0); - case AArch64::ATOMIC_SWAP_I64: - return emitAtomicBinary(MI, MBB, 8, 0); - - case AArch64::ATOMIC_CMP_SWAP_I8: - return emitAtomicCmpSwap(MI, MBB, 1); - case AArch64::ATOMIC_CMP_SWAP_I16: - return emitAtomicCmpSwap(MI, MBB, 2); - case AArch64::ATOMIC_CMP_SWAP_I32: - return emitAtomicCmpSwap(MI, MBB, 4); - case AArch64::ATOMIC_CMP_SWAP_I64: - return emitAtomicCmpSwap(MI, MBB, 8); - } -} + default: +#ifndef NDEBUG + MI->dump(); +#endif + assert(0 && "Unexpected instruction for custom inserter!"); + break; + case AArch64::F128CSEL: + return EmitF128CSEL(MI, BB); -const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (Opcode) { - case AArch64ISD::BR_CC: return "AArch64ISD::BR_CC"; - case AArch64ISD::Call: return "AArch64ISD::Call"; - case AArch64ISD::FPMOV: return "AArch64ISD::FPMOV"; - case AArch64ISD::GOTLoad: return "AArch64ISD::GOTLoad"; - case AArch64ISD::BFI: return "AArch64ISD::BFI"; - case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; - case AArch64ISD::Ret: return "AArch64ISD::Ret"; - case AArch64ISD::SBFX: return "AArch64ISD::SBFX"; - case AArch64ISD::SELECT_CC: return "AArch64ISD::SELECT_CC"; - case AArch64ISD::SETCC: return "AArch64ISD::SETCC"; - case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; - case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; - case AArch64ISD::TLSDESCCALL: return "AArch64ISD::TLSDESCCALL"; - case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; - case AArch64ISD::WrapperSmall: return "AArch64ISD::WrapperSmall"; - - case AArch64ISD::NEON_MOVIMM: - return "AArch64ISD::NEON_MOVIMM"; - case AArch64ISD::NEON_MVNIMM: - return "AArch64ISD::NEON_MVNIMM"; - case AArch64ISD::NEON_FMOVIMM: - return "AArch64ISD::NEON_FMOVIMM"; - case AArch64ISD::NEON_CMP: - return "AArch64ISD::NEON_CMP"; - case AArch64ISD::NEON_CMPZ: - return "AArch64ISD::NEON_CMPZ"; - case AArch64ISD::NEON_TST: - return "AArch64ISD::NEON_TST"; - case 
AArch64ISD::NEON_QSHLs: - return "AArch64ISD::NEON_QSHLs"; - case AArch64ISD::NEON_QSHLu: - return "AArch64ISD::NEON_QSHLu"; - case AArch64ISD::NEON_VDUP: - return "AArch64ISD::NEON_VDUP"; - case AArch64ISD::NEON_VDUPLANE: - return "AArch64ISD::NEON_VDUPLANE"; - case AArch64ISD::NEON_REV16: - return "AArch64ISD::NEON_REV16"; - case AArch64ISD::NEON_REV32: - return "AArch64ISD::NEON_REV32"; - case AArch64ISD::NEON_REV64: - return "AArch64ISD::NEON_REV64"; - case AArch64ISD::NEON_UZP1: - return "AArch64ISD::NEON_UZP1"; - case AArch64ISD::NEON_UZP2: - return "AArch64ISD::NEON_UZP2"; - case AArch64ISD::NEON_ZIP1: - return "AArch64ISD::NEON_ZIP1"; - case AArch64ISD::NEON_ZIP2: - return "AArch64ISD::NEON_ZIP2"; - case AArch64ISD::NEON_TRN1: - return "AArch64ISD::NEON_TRN1"; - case AArch64ISD::NEON_TRN2: - return "AArch64ISD::NEON_TRN2"; - case AArch64ISD::NEON_LD1_UPD: - return "AArch64ISD::NEON_LD1_UPD"; - case AArch64ISD::NEON_LD2_UPD: - return "AArch64ISD::NEON_LD2_UPD"; - case AArch64ISD::NEON_LD3_UPD: - return "AArch64ISD::NEON_LD3_UPD"; - case AArch64ISD::NEON_LD4_UPD: - return "AArch64ISD::NEON_LD4_UPD"; - case AArch64ISD::NEON_ST1_UPD: - return "AArch64ISD::NEON_ST1_UPD"; - case AArch64ISD::NEON_ST2_UPD: - return "AArch64ISD::NEON_ST2_UPD"; - case AArch64ISD::NEON_ST3_UPD: - return "AArch64ISD::NEON_ST3_UPD"; - case AArch64ISD::NEON_ST4_UPD: - return "AArch64ISD::NEON_ST4_UPD"; - case AArch64ISD::NEON_LD1x2_UPD: - return "AArch64ISD::NEON_LD1x2_UPD"; - case AArch64ISD::NEON_LD1x3_UPD: - return "AArch64ISD::NEON_LD1x3_UPD"; - case AArch64ISD::NEON_LD1x4_UPD: - return "AArch64ISD::NEON_LD1x4_UPD"; - case AArch64ISD::NEON_ST1x2_UPD: - return "AArch64ISD::NEON_ST1x2_UPD"; - case AArch64ISD::NEON_ST1x3_UPD: - return "AArch64ISD::NEON_ST1x3_UPD"; - case AArch64ISD::NEON_ST1x4_UPD: - return "AArch64ISD::NEON_ST1x4_UPD"; - case AArch64ISD::NEON_LD2DUP: - return "AArch64ISD::NEON_LD2DUP"; - case AArch64ISD::NEON_LD3DUP: - return "AArch64ISD::NEON_LD3DUP"; - case AArch64ISD::NEON_LD4DUP: - return "AArch64ISD::NEON_LD4DUP"; - case AArch64ISD::NEON_LD2DUP_UPD: - return "AArch64ISD::NEON_LD2DUP_UPD"; - case AArch64ISD::NEON_LD3DUP_UPD: - return "AArch64ISD::NEON_LD3DUP_UPD"; - case AArch64ISD::NEON_LD4DUP_UPD: - return "AArch64ISD::NEON_LD4DUP_UPD"; - case AArch64ISD::NEON_LD2LN_UPD: - return "AArch64ISD::NEON_LD2LN_UPD"; - case AArch64ISD::NEON_LD3LN_UPD: - return "AArch64ISD::NEON_LD3LN_UPD"; - case AArch64ISD::NEON_LD4LN_UPD: - return "AArch64ISD::NEON_LD4LN_UPD"; - case AArch64ISD::NEON_ST2LN_UPD: - return "AArch64ISD::NEON_ST2LN_UPD"; - case AArch64ISD::NEON_ST3LN_UPD: - return "AArch64ISD::NEON_ST3LN_UPD"; - case AArch64ISD::NEON_ST4LN_UPD: - return "AArch64ISD::NEON_ST4LN_UPD"; - case AArch64ISD::NEON_VEXTRACT: - return "AArch64ISD::NEON_VEXTRACT"; - default: - return NULL; + case TargetOpcode::STACKMAP: + case TargetOpcode::PATCHPOINT: + return emitPatchPoint(MI, BB); } + llvm_unreachable("Unexpected instruction for custom inserter!"); } -static const uint16_t AArch64FPRArgRegs[] = { - AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, - AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7 -}; -static const unsigned NumFPRArgRegs = llvm::array_lengthof(AArch64FPRArgRegs); - -static const uint16_t AArch64ArgRegs[] = { - AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, - AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7 -}; -static const unsigned NumArgRegs = llvm::array_lengthof(AArch64ArgRegs); +//===----------------------------------------------------------------------===// +// AArch64 
Lowering private implementation. +//===----------------------------------------------------------------------===// -static bool CC_AArch64NoMoreRegs(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - // Mark all remaining general purpose registers as allocated. We don't - // backtrack: if (for example) an i128 gets put on the stack, no subsequent - // i64 will go in registers (C.11). - for (unsigned i = 0; i < NumArgRegs; ++i) - State.AllocateReg(AArch64ArgRegs[i]); +//===----------------------------------------------------------------------===// +// Lowering Code +//===----------------------------------------------------------------------===// - return false; +/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 +/// CC +static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { + switch (CC) { + default: + llvm_unreachable("Unknown condition code!"); + case ISD::SETNE: + return AArch64CC::NE; + case ISD::SETEQ: + return AArch64CC::EQ; + case ISD::SETGT: + return AArch64CC::GT; + case ISD::SETGE: + return AArch64CC::GE; + case ISD::SETLT: + return AArch64CC::LT; + case ISD::SETLE: + return AArch64CC::LE; + case ISD::SETUGT: + return AArch64CC::HI; + case ISD::SETUGE: + return AArch64CC::HS; + case ISD::SETULT: + return AArch64CC::LO; + case ISD::SETULE: + return AArch64CC::LS; + } } -#include "AArch64GenCallingConv.inc" - -CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { - - switch(CC) { - default: llvm_unreachable("Unsupported calling convention"); - case CallingConv::Fast: - case CallingConv::C: - return CC_A64_APCS; +/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. +static void changeFPCCToAArch64CC(ISD::CondCode CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + llvm_unreachable("Unknown FP condition!"); + case ISD::SETEQ: + case ISD::SETOEQ: + CondCode = AArch64CC::EQ; + break; + case ISD::SETGT: + case ISD::SETOGT: + CondCode = AArch64CC::GT; + break; + case ISD::SETGE: + case ISD::SETOGE: + CondCode = AArch64CC::GE; + break; + case ISD::SETOLT: + CondCode = AArch64CC::MI; + break; + case ISD::SETOLE: + CondCode = AArch64CC::LS; + break; + case ISD::SETONE: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GT; + break; + case ISD::SETO: + CondCode = AArch64CC::VC; + break; + case ISD::SETUO: + CondCode = AArch64CC::VS; + break; + case ISD::SETUEQ: + CondCode = AArch64CC::EQ; + CondCode2 = AArch64CC::VS; + break; + case ISD::SETUGT: + CondCode = AArch64CC::HI; + break; + case ISD::SETUGE: + CondCode = AArch64CC::PL; + break; + case ISD::SETLT: + case ISD::SETULT: + CondCode = AArch64CC::LT; + break; + case ISD::SETLE: + case ISD::SETULE: + CondCode = AArch64CC::LE; + break; + case ISD::SETNE: + case ISD::SETUNE: + CondCode = AArch64CC::NE; + break; } } -void -AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, - SDLoc DL, SDValue &Chain) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - AArch64MachineFunctionInfo *FuncInfo - = MF.getInfo(); - - SmallVector MemOps; +/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 +/// CC usable with the vector instructions. Fewer operations are available +/// without a real NZCV register, so we have to use less efficient combinations +/// to get the same effect. 
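
[Editor's note: changeVectorFPCCToAArch64CC, defined next, leans on the identity that an unordered predicate is the negation of the complementary ordered one (the comment's "ULE == !OGT"), because IEEE ordered compares are false whenever an operand is NaN. A self-checking standalone sketch:]

    #include <cmath>
    #include <cassert>

    static bool ogt(double a, double b) { return a > b; }      // ordered >
    static bool ule(double a, double b) { return !ogt(a, b); } // unordered <=

    int main() {
      double n = std::nan("");
      assert(ule(n, 1.0) && !ogt(n, 1.0)); // NaN: OGT is false, so ULE is true
      assert(!ule(2.0, 1.0));              // ordinary ordering still works
      assert(ule(1.0, 1.0));
    }
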
+static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2, + bool &Invert) { + Invert = false; + switch (CC) { + default: + // Mostly the scalar mappings work fine. + changeFPCCToAArch64CC(CC, CondCode, CondCode2); + break; + case ISD::SETUO: + Invert = true; // Fallthrough + case ISD::SETO: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GE; + break; + case ISD::SETUEQ: + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETUGT: + case ISD::SETUGE: + // All of the compare-mask comparisons are ordered, but we can switch + // between the two by a double inversion. E.g. ULE == !OGT. + Invert = true; + changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); + break; + } +} - unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(AArch64ArgRegs, - NumArgRegs); - unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(AArch64FPRArgRegs, - NumFPRArgRegs); +static bool isLegalArithImmed(uint64_t C) { + // Matches AArch64DAGToDAGISel::SelectArithImmed(). + return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); +} - unsigned GPRSaveSize = 8 * (NumArgRegs - FirstVariadicGPR); - int GPRIdx = 0; - if (GPRSaveSize != 0) { - GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); +static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDLoc dl, SelectionDAG &DAG) { + EVT VT = LHS.getValueType(); + + if (VT.isFloatingPoint()) + return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); + + // The CMP instruction is just an alias for SUBS, and representing it as + // SUBS means that it's possible to get CSE with subtract operations. + // A later phase can perform the optimization of setting the destination + // register to WZR/XZR if it ends up being unused. + unsigned Opcode = AArch64ISD::SUBS; + + if (RHS.getOpcode() == ISD::SUB && isa(RHS.getOperand(0)) && + cast(RHS.getOperand(0))->getZExtValue() == 0 && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on + // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags + // can be set differently by this operation. It comes down to whether + // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then + // everything is fine. If not then the optimization is wrong. Thus general + // comparisons are only valid if op2 != 0. + + // So, finally, the only LLVM-native comparisons that don't mention C and V + // are SETEQ and SETNE. They're the only ones we can safely use CMN for in + // the absence of information about op2. + Opcode = AArch64ISD::ADDS; + RHS = RHS.getOperand(1); + } else if (LHS.getOpcode() == ISD::AND && isa(RHS) && + cast(RHS)->getZExtValue() == 0 && + !isUnsignedIntSetCC(CC)) { + // Similarly, (CMP (and X, Y), 0) can be implemented with a TST + // (a.k.a. ANDS) except that the flags are only guaranteed to work for one + // of the signed comparisons. 
+ Opcode = AArch64ISD::ANDS; + RHS = LHS.getOperand(1); + LHS = LHS.getOperand(0); + } - SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); + return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) + .getValue(1); +} - for (unsigned i = FirstVariadicGPR; i < NumArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(AArch64ArgRegs[i], &AArch64::GPR64RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); - SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 8), - false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(8, getPointerTy())); +static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { + if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { + EVT VT = RHS.getValueType(); + uint64_t C = RHSC->getZExtValue(); + if (!isLegalArithImmed(C)) { + // Constant does not fit, try adjusting it by one? + switch (CC) { + default: + break; + case ISD::SETLT: + case ISD::SETGE: + if ((VT == MVT::i32 && C != 0x80000000 && + isLegalArithImmed((uint32_t)(C - 1))) || + (VT == MVT::i64 && C != 0x80000000ULL && + isLegalArithImmed(C - 1ULL))) { + CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; + C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; + RHS = DAG.getConstant(C, VT); + } + break; + case ISD::SETULT: + case ISD::SETUGE: + if ((VT == MVT::i32 && C != 0 && + isLegalArithImmed((uint32_t)(C - 1))) || + (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { + CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; + C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; + RHS = DAG.getConstant(C, VT); + } + break; + case ISD::SETLE: + case ISD::SETGT: + if ((VT == MVT::i32 && C != 0x7fffffff && + isLegalArithImmed((uint32_t)(C + 1))) || + (VT == MVT::i64 && C != 0x7ffffffffffffffULL && + isLegalArithImmed(C + 1ULL))) { + CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; + C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; + RHS = DAG.getConstant(C, VT); + } + break; + case ISD::SETULE: + case ISD::SETUGT: + if ((VT == MVT::i32 && C != 0xffffffff && + isLegalArithImmed((uint32_t)(C + 1))) || + (VT == MVT::i64 && C != 0xfffffffffffffffULL && + isLegalArithImmed(C + 1ULL))) { + CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; + C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; + RHS = DAG.getConstant(C, VT); + } + break; + } } } - if (getSubtarget()->hasFPARMv8()) { - unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); - int FPRIdx = 0; - // According to the AArch64 Procedure Call Standard, section B.1/B.3, we - // can omit a register save area if we know we'll never use registers of - // that class. 
- if (FPRSaveSize != 0) { - FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); - - SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); + AArch64cc = DAG.getConstant(AArch64CC, MVT::i32); + return Cmp; +} - for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i], - &AArch64::FPR128RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); - SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 16), - false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(16, getPointerTy())); +static std::pair +getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { + assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && + "Unsupported value type"); + SDValue Value, Overflow; + SDLoc DL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + unsigned Opc = 0; + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown overflow instruction!"); + case ISD::SADDO: + Opc = AArch64ISD::ADDS; + CC = AArch64CC::VS; + break; + case ISD::UADDO: + Opc = AArch64ISD::ADDS; + CC = AArch64CC::HS; + break; + case ISD::SSUBO: + Opc = AArch64ISD::SUBS; + CC = AArch64CC::VS; + break; + case ISD::USUBO: + Opc = AArch64ISD::SUBS; + CC = AArch64CC::LO; + break; + // Multiply needs a little bit extra work. + case ISD::SMULO: + case ISD::UMULO: { + CC = AArch64CC::NE; + bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false; + if (Op.getValueType() == MVT::i32) { + unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + // For a 32 bit multiply with overflow check we want the instruction + // selector to generate a widening multiply (SMADDL/UMADDL). For that we + // need to generate the following pattern: + // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) + LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); + RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); + SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); + SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, + DAG.getConstant(0, MVT::i64)); + // On AArch64 the upper 32 bits are always zero extended for a 32 bit + // operation. We need to clear out the upper 32 bits, because we used a + // widening multiply that wrote all 64 bits. In the end this should be a + // noop. + Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); + if (IsSigned) { + // The signed overflow check requires more than just a simple check for + // any bit set in the upper 32 bits of the result. These bits could be + // just the sign bits of a negative number. To perform the overflow + // check we have to arithmetic shift right the 32nd bit of the result by + // 31 bits. Then we compare the result to the upper 32 bits. + SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, + DAG.getConstant(32, MVT::i64)); + UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); + SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, + DAG.getConstant(31, MVT::i64)); + // It is important that LowerBits is last, otherwise the arithmetic + // shift will not be folded into the compare (SUBS). 
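
[Editor's note: the signed 32-bit SMULO path above widens to a 64-bit multiply and flags overflow when the upper 32 bits of the product are not the sign-extension of bit 31 of the truncated result. The same check as a hedged standalone sketch in plain C++; it assumes arithmetic right shift of signed values, as AArch64 provides:]

    #include <cstdint>
    #include <cassert>

    // Signed 32x32 overflow check via a widening multiply (SMULL-style).
    static bool smulo32(int32_t a, int32_t b, int32_t &lo) {
      int64_t wide = (int64_t)a * (int64_t)b;
      lo = (int32_t)wide;
      // Overflow iff the upper 32 bits differ from the low half's sign bits.
      return (int32_t)(wide >> 32) != (lo >> 31);
    }

    int main() {
      int32_t r;
      assert(!smulo32(46340, 46340, r) && r == 2147395600); // fits in i32
      assert(smulo32(46341, 46341, r));                     // exceeds INT32_MAX
    }
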
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); + Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) + .getValue(1); + } else { + // The overflow check for unsigned multiply is easy. We only need to + // check if any of the upper 32 bits are set. This can be done with a + // CMP (shifted register). For that we need to generate the following + // pattern: + // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) + SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, + DAG.getConstant(32, MVT::i64)); + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = + DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), + UpperBits).getValue(1); } + break; + } + assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); + // For the 64 bit multiply + Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); + if (IsSigned) { + SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); + SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, + DAG.getConstant(63, MVT::i64)); + // It is important that LowerBits is last, otherwise the arithmetic + // shift will not be folded into the compare (SUBS). + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) + .getValue(1); + } else { + SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = + DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), + UpperBits).getValue(1); } - FuncInfo->setVariadicFPRIdx(FPRIdx); - FuncInfo->setVariadicFPRSize(FPRSaveSize); + break; } + } // switch (...) - unsigned StackOffset = RoundUpToAlignment(CCInfo.getNextStackOffset(), 8); - int StackIdx = MFI->CreateFixedObject(8, StackOffset, true); + if (Opc) { + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); - FuncInfo->setVariadicStackIdx(StackIdx); - FuncInfo->setVariadicGPRIdx(GPRIdx); - FuncInfo->setVariadicGPRSize(GPRSaveSize); - - if (!MemOps.empty()) { - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], - MemOps.size()); + // Emit the AArch64 operation with overflow check. + Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); + Overflow = Value.getValue(1); } + return std::make_pair(Value, Overflow); } +SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const { + SmallVector Ops; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) + Ops.push_back(Op.getOperand(i)); -SDValue -AArch64TargetLowering::LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { - MachineFunction &MF = DAG.getMachineFunction(); - AArch64MachineFunctionInfo *FuncInfo - = MF.getInfo(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, + SDLoc(Op)).first; +} - SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); +static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { + SDValue Sel = Op.getOperand(0); + SDValue Other = Op.getOperand(1); - SmallVector ArgValues; + // If neither operand is a SELECT_CC, give up. 
+  if (Sel.getOpcode() != ISD::SELECT_CC)
+    std::swap(Sel, Other);
+  if (Sel.getOpcode() != ISD::SELECT_CC)
+    return Op;
-  SDValue ArgValue;
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
-    ISD::ArgFlagsTy Flags = Ins[i].Flags;
+  // The folding we want to perform is:
+  // (xor x, (select_cc a, b, cc, 0, -1) )
+  //   -->
+  // (csel x, (xor x, -1), cc ...)
+  //
+  // The latter will get matched to a CSINV instruction.
-    if (Flags.isByVal()) {
-      // Byval is used for small structs and HFAs in the PCS, but the system
-      // should work in a non-compliant manner for larger structs.
-      EVT PtrTy = getPointerTy();
-      int Size = Flags.getByValSize();
-      unsigned NumRegs = (Size + 7) / 8;
+  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
+  SDValue LHS = Sel.getOperand(0);
+  SDValue RHS = Sel.getOperand(1);
+  SDValue TVal = Sel.getOperand(2);
+  SDValue FVal = Sel.getOperand(3);
+  SDLoc dl(Sel);
-      uint32_t BEAlign = 0;
-      if (Size < 8 && !getSubtarget()->isLittle())
-        BEAlign = 8-Size;
-      unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs,
-                                                 VA.getLocMemOffset() + BEAlign,
-                                                 false);
-      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
-      InVals.push_back(FrameIdxN);
+  // FIXME: This could be generalized to non-integer comparisons.
+  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
+    return Op;
-      continue;
-    } else if (VA.isRegLoc()) {
-      MVT RegVT = VA.getLocVT();
-      const TargetRegisterClass *RC = getRegClassFor(RegVT);
-      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
+  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
-      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
-    } else { // VA.isRegLoc()
-      assert(VA.isMemLoc());
+  // If the values aren't constants, this isn't the pattern we're looking for.
+  if (!CFVal || !CTVal)
+    return Op;
-      int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
-                                      VA.getLocMemOffset(), true);
+  // We can commute the SELECT_CC by inverting the condition. This
+  // might be needed to make this fit into a CSINV pattern.
+  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+    std::swap(TVal, FVal);
+    std::swap(CTVal, CFVal);
+    CC = ISD::getSetCCInverse(CC, true);
+  }
-      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
-      ArgValue = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
-                             MachinePointerInfo::getFixedStack(FI),
-                             false, false, false, 0);
+  // If the constants line up, perform the transform!
+  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
+    SDValue CCVal;
+    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+    FVal = Other;
+    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
+                       DAG.getConstant(-1ULL, Other.getValueType()));
-    }
+    return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
+                       CCVal, Cmp);
+  }
-    switch (VA.getLocInfo()) {
-    default: llvm_unreachable("Unknown loc info!");
-    case CCValAssign::Full: break;
-    case CCValAssign::BCvt:
-      ArgValue = DAG.getNode(ISD::BITCAST,dl, VA.getValVT(), ArgValue);
-      break;
-    case CCValAssign::SExt:
-    case CCValAssign::ZExt:
-    case CCValAssign::AExt:
-    case CCValAssign::FPExt: {
-      unsigned DestSize = VA.getValVT().getSizeInBits();
-      unsigned DestSubReg;
-
-      switch (DestSize) {
-      case 8: DestSubReg = AArch64::sub_8; break;
-      case 16: DestSubReg = AArch64::sub_16; break;
-      case 32: DestSubReg = AArch64::sub_32; break;
-      case 64: DestSubReg = AArch64::sub_64; break;
-      default: llvm_unreachable("Unexpected argument promotion");
-      }
+  return Op;
+}
-      ArgValue = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
-                                            VA.getValVT(), ArgValue,
-                                            DAG.getTargetConstant(DestSubReg, MVT::i32)),
-                         0);
-      break;
-    }
-    }
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
-    InVals.push_back(ArgValue);
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+  unsigned Opc;
+  bool ExtraOp = false;
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("Invalid code");
+  case ISD::ADDC:
+    Opc = AArch64ISD::ADDS;
+    break;
+  case ISD::SUBC:
+    Opc = AArch64ISD::SUBS;
+    break;
+  case ISD::ADDE:
+    Opc = AArch64ISD::ADCS;
+    ExtraOp = true;
+    break;
+  case ISD::SUBE:
+    Opc = AArch64ISD::SBCS;
+    ExtraOp = true;
+    break;
+  }
-  if (isVarArg)
-    SaveVarArgRegisters(CCInfo, DAG, dl, Chain);
+  if (!ExtraOp)
+    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
+  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
+                     Op.getOperand(2));
+}
+
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+    return SDValue();
-  unsigned StackArgSize = CCInfo.getNextStackOffset();
-  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
-    // This is a non-standard ABI so by fiat I say we're allowed to make full
-    // use of the stack area to be popped, which must be aligned to 16 bytes in
-    // any case:
-    StackArgSize = RoundUpToAlignment(StackArgSize, 16);
+  AArch64CC::CondCode CC;
+  // The actual operation that sets the overflow or carry flag.
+  SDValue Value, Overflow;
+  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
-    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
-    // a multiple of 16.
-    FuncInfo->setArgumentStackToRestore(StackArgSize);
+  // We use 0 and 1 as false and true values.
+  SDValue TVal = DAG.getConstant(1, MVT::i32);
+  SDValue FVal = DAG.getConstant(0, MVT::i32);
-    // This realignment carries over to the available bytes below. Our own
-    // callers will guarantee the space is free by giving an aligned value to
-    // CALLSEQ_START.
-  }
-  // Even if we're not expected to free up the space, it's useful to know how
-  // much is there while considering tail calls (because we can reuse it).
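
The ADDS/ADCS (and SUBS/SBCS) pairing selected in LowerADDC_ADDE_SUBC_SUBE above is the standard flag-setting/flag-consuming split for multi-word arithmetic. A standalone sketch of the same idea for a 128-bit add:

    #include <cstdint>

    // Low half sets the carry (ADDC -> ADDS), high half consumes it
    // (ADDE -> ADCS).
    static void add128(uint64_t AL, uint64_t AH, uint64_t BL, uint64_t BH,
                       uint64_t &RL, uint64_t &RH) {
      RL = AL + BL;             // flag-setting add
      uint64_t Carry = RL < AL; // carry-out of the low word
      RH = AH + BH + Carry;     // add-with-carry
    }
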
-  FuncInfo->setBytesInStackArgArea(StackArgSize);
+  // We use an inverted condition, because the conditional select is inverted
+  // too. This will allow it to be selected to a single instruction:
+  // CSINC Wd, WZR, WZR, invert(cond).
+  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32);
+  Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal,
+                         CCVal, Overflow);
-  return Chain;
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+  return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
 }
-SDValue
-AArch64TargetLowering::LowerReturn(SDValue Chain,
-                                   CallingConv::ID CallConv, bool isVarArg,
-                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
-                                   const SmallVectorImpl<SDValue> &OutVals,
-                                   SDLoc dl, SelectionDAG &DAG) const {
-  // CCValAssign - represent the assignment of the return value to a location.
-  SmallVector<CCValAssign, 16> RVLocs;
+// Prefetch operands are:
+// 1: Address to prefetch
+// 2: bool isWrite
+// 3: int locality (0 = no locality ... 3 = extreme locality)
+// 4: bool isDataCache
+static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+  // The isDataCache operand (4) is currently unused.
+  // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+
+  bool IsStream = !Locality;
+  // When a locality hint is given, remap it onto the PRFM cache-level
+  // encoding.
+  if (Locality) {
+    // The front-end should have filtered out the out-of-range values.
+    assert(Locality <= 3 && "Prefetch locality out-of-range");
+    // The hint's locality degree is the inverse of the target cache level,
+    // and the encoding starts at 0 for L1, so flip the number around.
+    Locality = 3 - Locality;
+  }
-  // CCState - Info about the registers and stack slots.
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  // Build the operand encoding the expected behavior.
+  unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
+                   (Locality << 1) |   // Cache level bits
+                   (unsigned)IsStream; // Stream bit
+  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1));
+}
-  // Analyze outgoing return values.
-  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv));
+SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
-  SDValue Flag;
-  SmallVector<SDValue, 4> RetOps(1, Chain);
+  RTLIB::Libcall LC;
+  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
-  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
-    // PCS: "If the type, T, of the result of a function is such that
-    // void func(T arg) would require that arg be passed as a value in a
-    // register (or set of registers) according to the rules in 5.4, then the
-    // result is returned in the same registers as would be used for such an
-    // argument.
-    //
-    // Otherwise, the caller shall reserve a block of memory of sufficient
-    // size and alignment to hold the result. The address of the memory block
-    // shall be passed as an additional argument to the function in x8."
-    //
-    // This is implemented in two places. The register-return values are dealt
-    // with here, more complex returns are passed as an sret parameter, which
-    // means we don't have to worry about it during actual return.
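
The PrfOp computed in LowerPREFETCH above packs three fields into the PRFM immediate. Assuming the layout the shifts imply (load/store in bit 4, target cache level in bits 2:1, stream bit in bit 0), the encoding reduces to this sketch:

    #include <cassert>

    // PRFM operand: bit 4 = prefetch-for-store, bits 2:1 = target cache level
    // (0 = L1), bit 0 = streaming (non-temporal) hint.
    static unsigned encodePrfOp(bool IsWrite, unsigned Locality) {
      assert(Locality <= 3 && "prefetch locality out of range");
      bool IsStream = Locality == 0;                     // no locality => STRM
      unsigned CacheLevel = IsStream ? 0 : 3 - Locality; // locality 3 => L1
      return ((unsigned)IsWrite << 4) | (CacheLevel << 1) | (unsigned)IsStream;
    }
    // e.g. encodePrfOp(false, 3) == 0, i.e. PLDL1KEEP.
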
-    CCValAssign &VA = RVLocs[i];
-    assert(VA.isRegLoc() && "Only register-returns should be created by PCS");
+  return LowerF128Call(Op, DAG, LC);
+}
+SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  if (Op.getOperand(0).getValueType() != MVT::f128) {
+    // It's legal except when f128 is involved.
+    return Op;
+  }
-    SDValue Arg = OutVals[i];
+  RTLIB::Libcall LC;
+  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
-    // There's no convenient note in the ABI about this as there is for normal
-    // arguments, but it says return values are passed in the same registers as
-    // an argument would be. I believe that includes the comments about
-    // unspecified higher bits, putting the burden of widening on the *caller*
-    // for return values.
-    switch (VA.getLocInfo()) {
-    default: llvm_unreachable("Unknown loc info");
-    case CCValAssign::Full: break;
-    case CCValAssign::SExt:
-    case CCValAssign::ZExt:
-    case CCValAssign::AExt:
-      // Floating-point values should only be extended when they're going into
-      // memory, which can't happen here so an integer extend is acceptable.
-      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
-      break;
-    case CCValAssign::BCvt:
-      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
-      break;
-    }
+  // FP_ROUND node has a second operand indicating whether it is known to be
+  // precise. That doesn't take part in the LibCall so we can't directly use
+  // LowerF128Call.
+  SDValue SrcVal = Op.getOperand(0);
+  return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
+                     /*isSigned*/ false, SDLoc(Op)).first;
+}
-    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
-    Flag = Chain.getValue(1);
-    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
-  }
+static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
+  // Any additional optimization in this function should be recorded
+  // in the cost tables.
+  EVT InVT = Op.getOperand(0).getValueType();
+  EVT VT = Op.getValueType();
-  RetOps[0] = Chain; // Update chain.
+  // FP_TO_XINT conversions between types of the same size are legal.
+  if (VT.getSizeInBits() == InVT.getSizeInBits())
+    return Op;
-  // Add the flag if we have it.
-  if (Flag.getNode())
-    RetOps.push_back(Flag);
+  if (InVT == MVT::v2f64 || InVT == MVT::v4f32) {
+    SDLoc dl(Op);
+    SDValue Cv =
+        DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
+                    Op.getOperand(0));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
+  } else if (InVT == MVT::v2f32) {
+    SDLoc dl(Op);
+    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
+    return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
+  }
-  return DAG.getNode(AArch64ISD::Ret, dl, MVT::Other,
-                     &RetOps[0], RetOps.size());
+  // Type changing conversions are illegal.
+  return SDValue();
 }
-unsigned AArch64TargetLowering::getByValTypeAlignment(Type *Ty) const {
-  // This is a new backend. For anything more precise than this a FE should
-  // set an explicit alignment.
- return 4; -} +SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getOperand(0).getValueType().isVector()) + return LowerVectorFP_TO_INT(Op, DAG); -SDValue -AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const { - SelectionDAG &DAG = CLI.DAG; - SDLoc &dl = CLI.DL; - SmallVectorImpl &Outs = CLI.Outs; - SmallVectorImpl &OutVals = CLI.OutVals; - SmallVectorImpl &Ins = CLI.Ins; - SDValue Chain = CLI.Chain; - SDValue Callee = CLI.Callee; - bool &IsTailCall = CLI.IsTailCall; - CallingConv::ID CallConv = CLI.CallConv; - bool IsVarArg = CLI.IsVarArg; + if (Op.getOperand(0).getValueType() != MVT::f128) { + // It's legal except when f128 is involved + return Op; + } - MachineFunction &MF = DAG.getMachineFunction(); - AArch64MachineFunctionInfo *FuncInfo - = MF.getInfo(); - bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; - bool IsStructRet = !Outs.empty() && Outs[0].Flags.isSRet(); - bool IsSibCall = false; + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::FP_TO_SINT) + LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); + else + LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); - if (IsTailCall) { - IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, - IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), - Outs, OutVals, Ins, DAG); + SmallVector Ops; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) + Ops.push_back(Op.getOperand(i)); - // A sibling call is one where we're under the usual C ABI and not planning - // to change that but can still do a tail call: - if (!TailCallOpt && IsTailCall) - IsSibCall = true; - } + return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, + SDLoc(Op)).first; +} - SmallVector ArgLocs; - CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); +static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { + // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. + // Any additional optimization in this function should be recorded + // in the cost tables. + EVT VT = Op.getValueType(); + SDLoc dl(Op); + SDValue In = Op.getOperand(0); + EVT InVT = In.getValueType(); - // On AArch64 (and all other architectures I'm aware of) the most this has to - // do is adjust the stack pointer. - unsigned NumBytes = RoundUpToAlignment(CCInfo.getNextStackOffset(), 16); - if (IsSibCall) { - // Since we're not changing the ABI to make this a tail call, the memory - // operands are already available in the caller's incoming argument space. - NumBytes = 0; + // v2i32 to v2f32 is legal. + if (VT == MVT::v2f32 && InVT == MVT::v2i32) + return Op; + + // This function only handles v2f64 outputs. + if (VT == MVT::v2f64) { + // Extend the input argument to a v2i64 that we can feed into the + // floating point conversion. Zero or sign extend based on whether + // we're doing a signed or unsigned float conversion. + unsigned Opc = + Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; + assert(Op.getNumOperands() == 1 && "FP conversions take one argument"); + SDValue Promoted = DAG.getNode(Opc, dl, MVT::v2i64, Op.getOperand(0)); + return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted); } - // FPDiff is the byte offset of the call's argument area from the callee's. 
- // Stores to callee stack arguments will be placed in FixedStackSlots offset - // by this amount for a tail call. In a sibling call it must be 0 because the - // caller will deallocate the entire stack and the callee still expects its - // arguments to begin at SP+0. Completely unused for non-tail calls. - int FPDiff = 0; + // Scalarize v2i64 to v2f32 conversions. + std::vector BuildVectorOps; + for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { + SDValue Sclr = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, In, + DAG.getConstant(i, MVT::i64)); + Sclr = DAG.getNode(Op->getOpcode(), dl, MVT::f32, Sclr); + BuildVectorOps.push_back(Sclr); + } - if (IsTailCall && !IsSibCall) { - unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, BuildVectorOps); +} - // FPDiff will be negative if this tail call requires more space than we - // would automatically have in our incoming argument space. Positive if we - // can actually shrink the stack. - FPDiff = NumReusableBytes - NumBytes; +SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getValueType().isVector()) + return LowerVectorINT_TO_FP(Op, DAG); - // The stack pointer must be 16-byte aligned at all times it's used for a - // memory operation, which in practice means at *all* times and in - // particular across call boundaries. Therefore our own arguments started at - // a 16-byte aligned SP and the delta applied for the tail call should - // satisfy the same constraint. - assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); + // i128 conversions are libcalls. + if (Op.getOperand(0).getValueType() == MVT::i128) + return SDValue(); + + // Other conversions are legal, unless it's to the completely software-based + // fp128. + if (Op.getValueType() != MVT::f128) + return Op; + + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::SINT_TO_FP) + LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + else + LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + + return LowerF128Call(Op, DAG, LC); +} + +SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, + SelectionDAG &DAG) const { + // For iOS, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two S / D registers. + SDLoc dl(Op); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + ArgListTy Args; + ArgListEntry Entry; + + Entry.Node = Arg; + Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + const char *LibcallName = + (ArgVT == MVT::f64) ? 
"__sincos_stret" : "__sincosf_stret"; + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + + StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) + .setCallee(CallingConv::Fast, RetTy, Callee, &Args, 0); + + std::pair CallResult = LowerCallTo(CLI); + return CallResult.first; +} + +SDValue AArch64TargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + llvm_unreachable("unimplemented operand"); + return SDValue(); + case ISD::GlobalAddress: + return LowerGlobalAddress(Op, DAG); + case ISD::GlobalTLSAddress: + return LowerGlobalTLSAddress(Op, DAG); + case ISD::SETCC: + return LowerSETCC(Op, DAG); + case ISD::BR_CC: + return LowerBR_CC(Op, DAG); + case ISD::SELECT: + return LowerSELECT(Op, DAG); + case ISD::SELECT_CC: + return LowerSELECT_CC(Op, DAG); + case ISD::JumpTable: + return LowerJumpTable(Op, DAG); + case ISD::ConstantPool: + return LowerConstantPool(Op, DAG); + case ISD::BlockAddress: + return LowerBlockAddress(Op, DAG); + case ISD::VASTART: + return LowerVASTART(Op, DAG); + case ISD::VACOPY: + return LowerVACOPY(Op, DAG); + case ISD::VAARG: + return LowerVAARG(Op, DAG); + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: + return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); + case ISD::SADDO: + case ISD::UADDO: + case ISD::SSUBO: + case ISD::USUBO: + case ISD::SMULO: + case ISD::UMULO: + return LowerXALUO(Op, DAG); + case ISD::FADD: + return LowerF128Call(Op, DAG, RTLIB::ADD_F128); + case ISD::FSUB: + return LowerF128Call(Op, DAG, RTLIB::SUB_F128); + case ISD::FMUL: + return LowerF128Call(Op, DAG, RTLIB::MUL_F128); + case ISD::FDIV: + return LowerF128Call(Op, DAG, RTLIB::DIV_F128); + case ISD::FP_ROUND: + return LowerFP_ROUND(Op, DAG); + case ISD::FP_EXTEND: + return LowerFP_EXTEND(Op, DAG); + case ISD::FRAMEADDR: + return LowerFRAMEADDR(Op, DAG); + case ISD::RETURNADDR: + return LowerRETURNADDR(Op, DAG); + case ISD::INSERT_VECTOR_ELT: + return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: + return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::BUILD_VECTOR: + return LowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: + return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: + return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::SRA: + case ISD::SRL: + case ISD::SHL: + return LowerVectorSRA_SRL_SHL(Op, DAG); + case ISD::SHL_PARTS: + return LowerShiftLeftParts(Op, DAG); + case ISD::SRL_PARTS: + case ISD::SRA_PARTS: + return LowerShiftRightParts(Op, DAG); + case ISD::CTPOP: + return LowerCTPOP(Op, DAG); + case ISD::FCOPYSIGN: + return LowerFCOPYSIGN(Op, DAG); + case ISD::AND: + return LowerVectorAND(Op, DAG); + case ISD::OR: + return LowerVectorOR(Op, DAG); + case ISD::XOR: + return LowerXOR(Op, DAG); + case ISD::PREFETCH: + return LowerPREFETCH(Op, DAG); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + return LowerINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return LowerFP_TO_INT(Op, DAG); + case ISD::FSINCOS: + return LowerFSINCOS(Op, DAG); } +} - if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), - dl); +/// getFunctionAlignment - Return the Log2 alignment of this function. 
+unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
+  return 2;
+}
-  SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP,
-                                        getPointerTy());
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
-  SmallVector<SDValue, 8> MemOpChains;
-  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+#include "AArch64GenCallingConv.inc"
+
+/// Selects the correct CCAssignFn for the given CallingConvention
+/// value.
+CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+                                                     bool IsVarArg) const {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unsupported calling convention.");
+  case CallingConv::WebKit_JS:
+    return CC_AArch64_WebKit_JS;
+  case CallingConv::C:
+  case CallingConv::Fast:
+    if (!Subtarget->isTargetDarwin())
+      return CC_AArch64_AAPCS;
+    return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
+  }
+}
+SDValue AArch64TargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  // At this point, Ins[].VT may already be promoted to i32. To correctly
+  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
+  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
+  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
+  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
+  // LocVT.
+  unsigned NumArgs = Ins.size();
+  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
+  unsigned CurArgIdx = 0;
+  for (unsigned i = 0; i != NumArgs; ++i) {
+    MVT ValVT = Ins[i].VT;
+    std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Ins[i].OrigArgIndex;
+
+    // Get type of the original argument.
+    EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
+    MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
+    // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+    if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+      ValVT = MVT::i8;
+    else if (ActualMVT == MVT::i16)
+      ValVT = MVT::i16;
+
+    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+    bool Res =
+        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
+    assert(!Res && "Call operand has unhandled type");
+    (void)Res;
+  }
+  assert(ArgLocs.size() == Ins.size());
+  SmallVector<SDValue, 16> ArgValues;
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
-    ISD::ArgFlagsTy Flags = Outs[i].Flags;
-    SDValue Arg = OutVals[i];
-
-    // Callee does the actual widening, so all extensions just use an implicit
-    // definition of the rest of the Loc. Aesthetically, this would be nicer as
-    // an ANY_EXTEND, but that isn't valid for floating-point types and this
-    // alternative works on integer types too.
- switch (VA.getLocInfo()) { - default: llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: break; - case CCValAssign::SExt: - case CCValAssign::ZExt: - case CCValAssign::AExt: - case CCValAssign::FPExt: { - unsigned SrcSize = VA.getValVT().getSizeInBits(); - unsigned SrcSubReg; - - switch (SrcSize) { - case 8: SrcSubReg = AArch64::sub_8; break; - case 16: SrcSubReg = AArch64::sub_16; break; - case 32: SrcSubReg = AArch64::sub_32; break; - case 64: SrcSubReg = AArch64::sub_64; break; - default: llvm_unreachable("Unexpected argument promotion"); - } - Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, - VA.getLocVT(), - DAG.getUNDEF(VA.getLocVT()), - Arg, - DAG.getTargetConstant(SrcSubReg, MVT::i32)), - 0); + if (Ins[i].Flags.isByVal()) { + // Byval is used for HFAs in the PCS, but the system should work in a + // non-compliant manner for larger structs. + EVT PtrTy = getPointerTy(); + int Size = Ins[i].Flags.getByValSize(); + unsigned NumRegs = (Size + 7) / 8; - break; - } - case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); - break; - } + // FIXME: This works on big-endian for composite byvals, which are the common + // case. It should also work for fundamental types too. + unsigned FrameIdx = + MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); + SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); + InVals.push_back(FrameIdxN); - if (VA.isRegLoc()) { - // A normal register (sub-) argument. For now we just note it down because - // we want to copy things into registers as late as possible to avoid - // register-pressure (and possibly worse). - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); continue; - } + } if (VA.isRegLoc()) { + // Arguments stored in registers. + EVT RegVT = VA.getLocVT(); + + SDValue ArgValue; + const TargetRegisterClass *RC; + + if (RegVT == MVT::i32) + RC = &AArch64::GPR32RegClass; + else if (RegVT == MVT::i64) + RC = &AArch64::GPR64RegClass; + else if (RegVT == MVT::f32) + RC = &AArch64::FPR32RegClass; + else if (RegVT == MVT::f64 || RegVT.is64BitVector()) + RC = &AArch64::FPR64RegClass; + else if (RegVT == MVT::f128 || RegVT.is128BitVector()) + RC = &AArch64::FPR128RegClass; + else + llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); - assert(VA.isMemLoc() && "unexpected argument location"); + // Transform the arguments in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); - SDValue DstAddr; - MachinePointerInfo DstInfo; - if (IsTailCall) { - uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize() : - VA.getLocVT().getSizeInBits(); - OpSize = (OpSize + 7) / 8; - int32_t Offset = VA.getLocMemOffset() + FPDiff; - int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + // If this is an 8, 16 or 32-bit value, it is really passed promoted + // to 64 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); + break; + case CCValAssign::AExt: + case CCValAssign::SExt: + case CCValAssign::ZExt: + // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt + // nodes after our lowering. 
+ assert(RegVT == Ins[i].VT && "incorrect register location selected"); + break; + } - DstAddr = DAG.getFrameIndex(FI, getPointerTy()); - DstInfo = MachinePointerInfo::getFixedStack(FI); + InVals.push_back(ArgValue); + + } else { // VA.isRegLoc() + assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); + unsigned ArgOffset = VA.getLocMemOffset(); + unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8; - // Make sure any stack arguments overlapping with where we're storing are - // loaded before this eventual operation. Otherwise they'll be clobbered. - Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); - } else { - uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize()*8 : - VA.getLocVT().getSizeInBits(); - OpSize = (OpSize + 7) / 8; uint32_t BEAlign = 0; - if (OpSize < 8 && !getSubtarget()->isLittle()) - BEAlign = 8-OpSize; - SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() + BEAlign); + if (ArgSize < 8 && !Subtarget->isLittleEndian()) + BEAlign = 8 - ArgSize; - DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); - DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset()); - } + int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); - if (Flags.isByVal()) { - SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i64); - SDValue Cpy = DAG.getMemcpy(Chain, dl, DstAddr, Arg, SizeNode, - Flags.getByValAlign(), - /*isVolatile = */ false, - /*alwaysInline = */ false, - DstInfo, MachinePointerInfo(0)); - MemOpChains.push_back(Cpy); - } else { - // Normal stack argument, put it where it's needed. - SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo, - false, false, 0); - MemOpChains.push_back(Store); - } - } + // Create load nodes to retrieve arguments from the stack. + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue ArgValue; - // The loads and stores generated above shouldn't clash with each - // other. Combining them with this TokenFactor notes that fact for the rest of - // the backend. - if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; + switch (VA.getLocInfo()) { + default: + break; + case CCValAssign::SExt: + ExtType = ISD::SEXTLOAD; + break; + case CCValAssign::ZExt: + ExtType = ISD::ZEXTLOAD; + break; + case CCValAssign::AExt: + ExtType = ISD::EXTLOAD; + break; + } - // Most of the rest of the instructions need to be glued together; we don't - // want assignments to actual registers used by a call to be rearranged by a - // well-meaning scheduler. - SDValue InFlag; + ArgValue = DAG.getExtLoad(ExtType, DL, VA.getValVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(FI), + VA.getLocVT(), + false, false, false, 0); - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); + InVals.push_back(ArgValue); + } } - // The linker is responsible for inserting veneers when necessary to put a - // function call destination in range, so we don't need to bother with a - // wrapper here. 
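
The BEAlign adjustment above exists because a sub-8-byte argument occupies the high-addressed end of its 8-byte stack slot on big-endian targets. A sketch of the offset computation, assuming 8-byte slots:

    #include <cstdint>

    // On big-endian AArch64 a small stack argument sits at the end of its
    // 8-byte slot, so its address is the slot offset plus the padding before
    // it.
    static uint64_t argOffsetInSlot(uint64_t SlotOffset, unsigned ArgSize,
                                    bool IsLittleEndian) {
      unsigned BEAlign = 0;
      if (ArgSize < 8 && !IsLittleEndian)
        BEAlign = 8 - ArgSize;
      return SlotOffset + BEAlign; // e.g. a 2-byte arg at slot 16 loads from 22
    }
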
- if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - const GlobalValue *GV = G->getGlobal(); - Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); - } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { - const char *Sym = S->getSymbol(); - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy()); - } + // varargs + if (isVarArg) { + if (!Subtarget->isTargetDarwin()) { + // The AAPCS variadic function ABI is identical to the non-variadic + // one. As a result there may be more arguments in registers and we should + // save them for future reference. + saveVarArgRegisters(CCInfo, DAG, DL, Chain); + } - // We don't usually want to end the call-sequence here because we would tidy - // the frame up *after* the call, however in the ABI-changing tail-call case - // we've carefully laid out the parameters so that when sp is reset they'll be - // in the correct location. - if (IsTailCall && !IsSibCall) { - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag, dl); - InFlag = Chain.getValue(1); + AArch64FunctionInfo *AFI = MF.getInfo(); + // This will point to the next argument passed via stack. + unsigned StackOffset = CCInfo.getNextStackOffset(); + // We currently pass all varargs at 8-byte alignment. + StackOffset = ((StackOffset + 7) & ~7); + AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); } - // We produce the following DAG scheme for the actual call instruction: - // (AArch64Call Chain, Callee, reg1, ..., regn, preserveMask, inflag? - // - // Most arguments aren't going to be used and just keep the values live as - // far as LLVM is concerned. It's expected to be selected as simply "bl - // callee" (for a direct, non-tail call). - std::vector Ops; - Ops.push_back(Chain); - Ops.push_back(Callee); + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + unsigned StackArgSize = CCInfo.getNextStackOffset(); + bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { + // This is a non-standard ABI so by fiat I say we're allowed to make full + // use of the stack area to be popped, which must be aligned to 16 bytes in + // any case: + StackArgSize = RoundUpToAlignment(StackArgSize, 16); - if (IsTailCall) { - // Each tail call may have to adjust the stack by a different amount, so - // this information must travel along with the operation for eventual - // consumption by emitEpilogue. - Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32)); + // If we're expected to restore the stack (e.g. fastcc) then we'll be adding + // a multiple of 16. + FuncInfo->setArgumentStackToRestore(StackArgSize); + + // This realignment carries over to the available bytes below. Our own + // callers will guarantee the space is free by giving an aligned value to + // CALLSEQ_START. } + // Even if we're not expected to free up the space, it's useful to know how + // much is there while considering tail calls (because we can reuse it). 
+ FuncInfo->setBytesInStackArgArea(StackArgSize); - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); + return Chain; +} +void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, + SelectionDAG &DAG, SDLoc DL, + SDValue &Chain) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + AArch64FunctionInfo *FuncInfo = MF.getInfo(); - // Add a register mask operand representing the call-preserved registers. This - // is used later in codegen to constrain register-allocation. - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); - assert(Mask && "Missing call preserved mask for calling convention"); - Ops.push_back(DAG.getRegisterMask(Mask)); + SmallVector MemOps; - // If we needed glue, put it in as the last argument. - if (InFlag.getNode()) - Ops.push_back(InFlag); + static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7 }; + static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); + unsigned FirstVariadicGPR = + CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs); - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); + int GPRIdx = 0; + if (GPRSaveSize != 0) { + GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); - if (IsTailCall) { - return DAG.getNode(AArch64ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); + SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); + + for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { + unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); + SDValue Store = + DAG.getStore(Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(i * 8), false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, + DAG.getConstant(8, getPointerTy())); + } } + FuncInfo->setVarArgsGPRIndex(GPRIdx); + FuncInfo->setVarArgsGPRSize(GPRSaveSize); - Chain = DAG.getNode(AArch64ISD::Call, dl, NodeTys, &Ops[0], Ops.size()); - InFlag = Chain.getValue(1); + if (Subtarget->hasFPARMv8()) { + static const MCPhysReg FPRArgRegs[] = { + AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, + AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; + static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); + unsigned FirstVariadicFPR = + CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs); + + unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); + int FPRIdx = 0; + if (FPRSaveSize != 0) { + FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); - // Now we can reclaim the stack, just as well do it before working out where - // our return value is. - if (!IsSibCall) { - uint64_t CalleePopBytes - = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? 
NumBytes : 0; + SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(CalleePopBytes, true), - InFlag, dl); - InFlag = Chain.getValue(1); + for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { + unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); + + SDValue Store = + DAG.getStore(Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(i * 16), false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, + DAG.getConstant(16, getPointerTy())); + } + } + FuncInfo->setVarArgsFPRIndex(FPRIdx); + FuncInfo->setVarArgsFPRSize(FPRSaveSize); } - return LowerCallResult(Chain, InFlag, CallConv, - IsVarArg, Ins, dl, DAG, InVals); + if (!MemOps.empty()) { + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); + } } -SDValue -AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, - CallingConv::ID CallConv, bool IsVarArg, - const SmallVectorImpl &Ins, - SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { +/// LowerCallResult - Lower the result values of a call into the +/// appropriate copies out of appropriate physical registers. +SDValue AArch64TargetLowering::LowerCallResult( + SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals, bool isThisReturn, + SDValue ThisVal) const { + CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS + ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; // Assign locations to each value returned by this call. SmallVector RVLocs; - CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), getTargetMachine(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv)); + CCInfo.AnalyzeCallResult(Ins, RetCC); + // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign VA = RVLocs[i]; - // Return values that are too big to fit into registers should use an sret - // pointer, so this can be a lot simpler than the main argument code. - assert(VA.isRegLoc() && "Memory locations not expected for call return"); + // Pass 'this' value directly from the argument to return value, to avoid + // reg unit interference + if (i == 0 && isThisReturn) { + assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && + "unexpected return calling convention register assignment"); + InVals.push_back(ThisVal); + continue; + } - SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), - InFlag); + SDValue Val = + DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); Chain = Val.getValue(1); InFlag = Val.getValue(2); switch (VA.getLocInfo()) { - default: llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: break; - case CCValAssign::BCvt: - Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; - case CCValAssign::ZExt: - case CCValAssign::SExt: - case CCValAssign::AExt: - // Floating-point arguments only get extended/truncated if they're going - // in memory, so using the integer operation is acceptable here. 
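
The save-area sizing in saveVarArgRegisters above depends only on how many of the eight argument registers were consumed by named arguments; whatever remains must be spilled so va_arg can find it. A sketch of the computation (eight X registers at 8 bytes each, eight Q registers at 16 bytes each):

    #include <utility>

    // Sizes of the GPR and FPR varargs save areas.
    static std::pair<unsigned, unsigned>
    varArgSaveAreaSizes(unsigned FirstVariadicGPR, unsigned FirstVariadicFPR) {
      const unsigned NumGPRArgRegs = 8; // X0..X7, 8 bytes each
      const unsigned NumFPRArgRegs = 8; // Q0..Q7, 16 bytes each
      unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
      unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
      return {GPRSaveSize, FPRSaveSize}; // e.g. {40, 96} after 3 GPRs, 2 FPRs
    }
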
- Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; } @@ -1812,17 +1960,12 @@ AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, return Chain; } -bool -AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, - CallingConv::ID CalleeCC, - bool IsVarArg, - bool IsCalleeStructRet, - bool IsCallerStructRet, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, - SelectionDAG& DAG) const { - +bool AArch64TargetLowering::isEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, SelectionDAG &DAG) const { // For CallingConv::C this function knows whether the ABI needs // changing. That's not true for other conventions so they will have to opt in // manually. @@ -1838,7 +1981,8 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // we want to reuse during a tail call. Working around this *is* possible (see // X86) but less efficient and uglier in LowerCall. for (Function::const_arg_iterator i = CallerF->arg_begin(), - e = CallerF->arg_end(); i != e; ++i) + e = CallerF->arg_end(); + i != e; ++i) if (i->hasByValAttr()) return false; @@ -1854,10 +1998,10 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // I want anyone implementing a new calling convention to think long and hard // about this assert. - assert((!IsVarArg || CalleeCC == CallingConv::C) - && "Unexpected variadic calling convention"); + assert((!isVarArg || CalleeCC == CallingConv::C) && + "Unexpected variadic calling convention"); - if (IsVarArg && !Outs.empty()) { + if (isVarArg && !Outs.empty()) { // At least two cases here: if caller is fastcc then we can't have any // memory arguments (we'd be expected to clean up the stack afterwards). If // caller is C then we could potentially use its argument area. @@ -1865,10 +2009,10 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // FIXME: for now we take the most conservative of these in both cases: // disallow all variadic memory operands. 
SmallVector ArgLocs; - CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(), + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), getTargetMachine(), ArgLocs, *DAG.getContext()); - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) if (!ArgLocs[i].isRegLoc()) return false; @@ -1880,12 +2024,12 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, SmallVector RVLocs1; CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), getTargetMachine(), RVLocs1, *DAG.getContext()); - CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC)); + CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); SmallVector RVLocs2; CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), getTargetMachine(), RVLocs2, *DAG.getContext()); - CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC)); + CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); if (RVLocs1.size() != RVLocs2.size()) return false; @@ -1909,28 +2053,18 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return true; SmallVector ArgLocs; - CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(), + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), getTargetMachine(), ArgLocs, *DAG.getContext()); - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); - const AArch64MachineFunctionInfo *FuncInfo - = MF.getInfo(); + const AArch64FunctionInfo *FuncInfo = MF.getInfo(); // If the stack arguments for this call would fit into our own save area then // the call can be made tail. return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); } -bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, - bool TailCallOpt) const { - return CallCC == CallingConv::Fast && TailCallOpt; -} - -bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { - return CallCC == CallingConv::Fast; -} - SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo *MFI, @@ -1946,7 +2080,8 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, // Add a chain value for each stack argument corresponding for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), - UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U) + UE = DAG.getEntryNode().getNode()->use_end(); + U != UE; ++U) if (LoadSDNode *L = dyn_cast(*U)) if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) if (FI->getIndex() < 0) { @@ -1959,625 +2094,609 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, ArgChains.push_back(SDValue(L, 1)); } - // Build a tokenfactor for all the chains. - return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, - &ArgChains[0], ArgChains.size()); + // Build a tokenfactor for all the chains. 
+ return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); } -static A64CC::CondCodes IntCCToA64CC(ISD::CondCode CC) { - switch (CC) { - case ISD::SETEQ: return A64CC::EQ; - case ISD::SETGT: return A64CC::GT; - case ISD::SETGE: return A64CC::GE; - case ISD::SETLT: return A64CC::LT; - case ISD::SETLE: return A64CC::LE; - case ISD::SETNE: return A64CC::NE; - case ISD::SETUGT: return A64CC::HI; - case ISD::SETUGE: return A64CC::HS; - case ISD::SETULT: return A64CC::LO; - case ISD::SETULE: return A64CC::LS; - default: llvm_unreachable("Unexpected condition code"); - } +bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, + bool TailCallOpt) const { + return CallCC == CallingConv::Fast && TailCallOpt; } -bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const { - // icmp is implemented using adds/subs immediate, which take an unsigned - // 12-bit immediate, optionally shifted left by 12 bits. +bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { + return CallCC == CallingConv::Fast; +} - // Symmetric by using adds/subs - if (Val < 0) - Val = -Val; +/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, +/// and add input and output parameter nodes. +SDValue +AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc &DL = CLI.DL; + SmallVector &Outs = CLI.Outs; + SmallVector &OutVals = CLI.OutVals; + SmallVector &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &IsTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool IsVarArg = CLI.IsVarArg; - return (Val & ~0xfff) == 0 || (Val & ~0xfff000) == 0; -} + MachineFunction &MF = DAG.getMachineFunction(); + bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool IsThisReturn = false; -SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS, - ISD::CondCode CC, SDValue &A64cc, - SelectionDAG &DAG, SDLoc &dl) const { - if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { - int64_t C = 0; - EVT VT = RHSC->getValueType(0); - bool knownInvalid = false; - - // I'm not convinced the rest of LLVM handles these edge cases properly, but - // we can at least get it right. - if (isSignedIntSetCC(CC)) { - C = RHSC->getSExtValue(); - } else if (RHSC->getZExtValue() > INT64_MAX) { - // A 64-bit constant not representable by a signed 64-bit integer is far - // too big to fit into a SUBS immediate anyway. - knownInvalid = true; - } else { - C = RHSC->getZExtValue(); - } + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + bool IsSibCall = false; - if (!knownInvalid && !isLegalICmpImmediate(C)) { - // Constant does not fit, try adjusting it by one? - switch (CC) { - default: break; - case ISD::SETLT: - case ISD::SETGE: - if (isLegalICmpImmediate(C-1)) { - CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; - RHS = DAG.getConstant(C-1, VT); - } - break; - case ISD::SETULT: - case ISD::SETUGE: - if (isLegalICmpImmediate(C-1)) { - CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; - RHS = DAG.getConstant(C-1, VT); - } - break; - case ISD::SETLE: - case ISD::SETGT: - if (isLegalICmpImmediate(C+1)) { - CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; - RHS = DAG.getConstant(C+1, VT); - } - break; - case ISD::SETULE: - case ISD::SETUGT: - if (isLegalICmpImmediate(C+1)) { - CC = (CC == ISD::SETULE) ? 
ISD::SETULT : ISD::SETUGE; - RHS = DAG.getConstant(C+1, VT); - } - break; - } - } + if (IsTailCall) { + // Check if it's really possible to do a tail call. + IsTailCall = isEligibleForTailCallOptimization( + Callee, CallConv, IsVarArg, IsStructRet, + MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); + if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); + + // A sibling call is one where we're under the usual C ABI and not planning + // to change that but can still do a tail call: + if (!TailCallOpt && IsTailCall) + IsSibCall = true; + + if (IsTailCall) + ++NumTailCalls; } - A64CC::CondCodes CondCode = IntCCToA64CC(CC); - A64cc = DAG.getConstant(CondCode, MVT::i32); - return DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, - DAG.getCondCode(CC)); -} + // Analyze operands of the call, assigning locations to each operand. + SmallVector ArgLocs; + CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); -static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC, - A64CC::CondCodes &Alternative) { - A64CC::CondCodes CondCode = A64CC::Invalid; - Alternative = A64CC::Invalid; + if (IsVarArg) { + // Handle fixed and variable vector arguments differently. + // Variable vector arguments always go into memory. + unsigned NumArgs = Outs.size(); + + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ArgVT = Outs[i].VT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, + /*IsVarArg=*/ !Outs[i].IsFixed); + bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + } else { + // At this point, Outs[].VT may already be promoted to i32. To correctly + // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and + // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. + // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here + // we use a special version of AnalyzeCallOperands to pass in ValVT and + // LocVT. + unsigned NumArgs = Outs.size(); + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ValVT = Outs[i].VT; + // Get type of the original argument. + EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty, + /*AllowUnknown*/ true); + MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
+ if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) + ValVT = MVT::i8; + else if (ActualMVT == MVT::i16) + ValVT = MVT::i16; + + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); + bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + } - switch (CC) { - default: llvm_unreachable("Unknown FP condition!"); - case ISD::SETEQ: - case ISD::SETOEQ: CondCode = A64CC::EQ; break; - case ISD::SETGT: - case ISD::SETOGT: CondCode = A64CC::GT; break; - case ISD::SETGE: - case ISD::SETOGE: CondCode = A64CC::GE; break; - case ISD::SETOLT: CondCode = A64CC::MI; break; - case ISD::SETOLE: CondCode = A64CC::LS; break; - case ISD::SETONE: CondCode = A64CC::MI; Alternative = A64CC::GT; break; - case ISD::SETO: CondCode = A64CC::VC; break; - case ISD::SETUO: CondCode = A64CC::VS; break; - case ISD::SETUEQ: CondCode = A64CC::EQ; Alternative = A64CC::VS; break; - case ISD::SETUGT: CondCode = A64CC::HI; break; - case ISD::SETUGE: CondCode = A64CC::PL; break; - case ISD::SETLT: - case ISD::SETULT: CondCode = A64CC::LT; break; - case ISD::SETLE: - case ISD::SETULE: CondCode = A64CC::LE; break; - case ISD::SETNE: - case ISD::SETUNE: CondCode = A64CC::NE; break; + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + + if (IsSibCall) { + // Since we're not changing the ABI to make this a tail call, the memory + // operands are already available in the caller's incoming argument space. + NumBytes = 0; } - return CondCode; -} -SDValue -AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT PtrVT = getPointerTy(); - const BlockAddress *BA = cast(Op)->getBlockAddress(); + // FPDiff is the byte offset of the call's argument area from the callee's. + // Stores to callee stack arguments will be placed in FixedStackSlots offset + // by this amount for a tail call. In a sibling call it must be 0 because the + // caller will deallocate the entire stack and the callee still expects its + // arguments to begin at SP+0. Completely unused for non-tail calls. + int FPDiff = 0; - switch(getTargetMachine().getCodeModel()) { - case CodeModel::Small: - // The most efficient code is PC-relative anyway for the small memory model, - // so we don't need to worry about relocation model. - return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, - DAG.getTargetBlockAddress(BA, PtrVT, 0, - AArch64II::MO_NO_FLAG), - DAG.getTargetBlockAddress(BA, PtrVT, 0, - AArch64II::MO_LO12), - DAG.getConstant(/*Alignment=*/ 4, MVT::i32)); - case CodeModel::Large: - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G3), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G2_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G1_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G0_NC)); - default: - llvm_unreachable("Only small and large code models supported now"); + if (IsTailCall && !IsSibCall) { + unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); + + // Since callee will pop argument stack as a tail call, we must keep the + // popped size 16-byte aligned. + NumBytes = RoundUpToAlignment(NumBytes, 16); + + // FPDiff will be negative if this tail call requires more space than we + // would automatically have in our incoming argument space. Positive if we + // can actually shrink the stack. 
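
In scalar form, the FPDiff computation the next few lines perform looks like the sketch below (it assumes the caller's reusable argument area is already 16-byte aligned, as LowerFormalArguments arranges):

    #include <cassert>

    // Tail-call stack delta: negative means this call needs more argument
    // space than our own incoming argument area provides.
    static int computeFPDiff(unsigned NumReusableBytes, unsigned NumBytes) {
      NumBytes = (NumBytes + 15) & ~15u; // callee pops a 16-byte-aligned area
      int FPDiff = (int)NumReusableBytes - (int)NumBytes;
      assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
      return FPDiff;
    }
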
+ FPDiff = NumReusableBytes - NumBytes; + + // The stack pointer must be 16-byte aligned at all times it's used for a + // memory operation, which in practice means at *all* times and in + // particular across call boundaries. Therefore our own arguments started at + // a 16-byte aligned SP and the delta applied for the tail call should + // satisfy the same constraint. + assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); } -} + // Adjust the stack pointer for the new arguments... + // These operations are automatically eliminated by the prolog/epilog pass + if (!IsSibCall) + Chain = + DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL); -// (BRCOND chain, val, dest) -SDValue -AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); - SDValue Chain = Op.getOperand(0); - SDValue TheBit = Op.getOperand(1); - SDValue DestBB = Op.getOperand(2); + SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy()); - // AArch64 BooleanContents is the default UndefinedBooleanContent, which means - // that as the consumer we are responsible for ignoring rubbish in higher - // bits. - TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit, - DAG.getConstant(1, MVT::i32)); + SmallVector, 8> RegsToPass; + SmallVector MemOpChains; - SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit, - DAG.getConstant(0, TheBit.getValueType()), - DAG.getCondCode(ISD::SETNE)); + // Walk the register/memloc assignments, inserting copies/loads. + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[realArgIdx]; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; - return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, Chain, - A64CMP, DAG.getConstant(A64CC::NE, MVT::i32), - DestBB); -} + // Promote the value if needed. + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + if (Outs[realArgIdx].ArgVT == MVT::i1) { + // AAPCS requires i1 to be zero-extended to 8-bits by the caller. 
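(The truncate/extend pair the next hunk emits implements the rule just stated; here is a minimal standalone sketch, with a hypothetical helper name, of what it does to the bits.)

#include <cstdint>

// AAPCS i1 rule: only bit 0 is meaningful, and the caller must hand the
// value over zero-extended to a byte, so the lowering truncates to i1 and
// re-extends instead of trusting whatever the high bits contain.
static uint8_t passI1(uint32_t PromotedArg) {
  return uint8_t(PromotedArg & 1); // TRUNCATE to i1, then ZERO_EXTEND to i8
}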
+ Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); + Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); + } + Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::FPExt: + Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); + break; + } -// (BR_CC chain, condcode, lhs, rhs, dest) -SDValue -AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); - SDValue Chain = Op.getOperand(0); - ISD::CondCode CC = cast(Op.getOperand(1))->get(); - SDValue LHS = Op.getOperand(2); - SDValue RHS = Op.getOperand(3); - SDValue DestBB = Op.getOperand(4); + if (VA.isRegLoc()) { + if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { + assert(VA.getLocVT() == MVT::i64 && + "unexpected calling convention register assignment"); + assert(!Ins.empty() && Ins[0].VT == MVT::i64 && + "unexpected use of 'returned'"); + IsThisReturn = true; + } + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else { + assert(VA.isMemLoc()); - if (LHS.getValueType() == MVT::f128) { - // f128 comparisons are lowered to runtime calls by a routine which sets - // LHS, RHS and CC appropriately for the rest of this function to continue. - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + SDValue DstAddr; + MachinePointerInfo DstInfo; - // If softenSetCCOperands returned a scalar, we need to compare the result - // against zero to select between true and false values. - if (RHS.getNode() == 0) { - RHS = DAG.getConstant(0, LHS.getValueType()); - CC = ISD::SETNE; + // FIXME: This works on big-endian for composite byvals, which are the + // common case. It should also work for fundamental types too. + uint32_t BEAlign = 0; + unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 + : VA.getLocVT().getSizeInBits(); + OpSize = (OpSize + 7) / 8; + if (!Subtarget->isLittleEndian() && !Flags.isByVal()) { + if (OpSize < 8) + BEAlign = 8 - OpSize; + } + unsigned LocMemOffset = VA.getLocMemOffset(); + int32_t Offset = LocMemOffset + BEAlign; + SDValue PtrOff = DAG.getIntPtrConstant(Offset); + PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + + if (IsTailCall) { + Offset = Offset + FPDiff; + int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + + DstAddr = DAG.getFrameIndex(FI, getPointerTy()); + DstInfo = MachinePointerInfo::getFixedStack(FI); + + // Make sure any stack arguments overlapping with where we're storing + // are loaded before this eventual operation. Otherwise they'll be + // clobbered. + Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); + } else { + SDValue PtrOff = DAG.getIntPtrConstant(Offset); + + DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + DstInfo = MachinePointerInfo::getStack(LocMemOffset); + } + + if (Outs[i].Flags.isByVal()) { + SDValue SizeNode = + DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64); + SDValue Cpy = DAG.getMemcpy( + Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), + /*isVolatile = */ false, + /*alwaysInline = */ false, DstInfo, MachinePointerInfo()); + + MemOpChains.push_back(Cpy); + } else { + // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already + // promoted to a legal register type i32, we should truncate Arg back to + // i1/i8/i16. 
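(A standalone sketch of the BEAlign computation in the hunk above; the helper name is hypothetical. On big-endian AArch64 a non-byval operand narrower than its 8-byte slot sits at the high-address end of the slot, so the store offset gains the unused byte count.)

#include <cstdint>

static unsigned argStackOffset(unsigned LocMemOffset, unsigned OpSizeBits,
                               bool IsLittleEndian, bool IsByVal) {
  unsigned OpSize = (OpSizeBits + 7) / 8;  // bits -> bytes, rounded up
  unsigned BEAlign = 0;
  if (!IsLittleEndian && !IsByVal && OpSize < 8)
    BEAlign = 8 - OpSize;                  // pad from the front of the slot
  return LocMemOffset + BEAlign;
}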
+ if (Arg.getValueType().isSimple() && + Arg.getValueType().getSimpleVT() == MVT::i32 && + (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 || + VA.getLocVT() == MVT::i16)) + Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg); + + SDValue Store = + DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); + MemOpChains.push_back(Store); + } } } - if (LHS.getValueType().isInteger()) { - SDValue A64cc; + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); - // Integers are handled in a separate function because the combinations of - // immediates and tests can get hairy and we may want to fiddle things. - SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. + SDValue InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } - return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, - Chain, CmpOp, A64cc, DestBB); + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every + // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol + // node so that legalize doesn't hack it. + if (getTargetMachine().getCodeModel() == CodeModel::Large && + Subtarget->isTargetMachO()) { + if (GlobalAddressSDNode *G = dyn_cast(Callee)) { + const GlobalValue *GV = G->getGlobal(); + bool InternalLinkage = GV->hasInternalLinkage(); + if (InternalLinkage) + Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); + else { + Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, + AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee); + } + } else if (ExternalSymbolSDNode *S = + dyn_cast(Callee)) { + const char *Sym = S->getSymbol(); + Callee = + DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee); + } + } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { + const GlobalValue *GV = G->getGlobal(); + Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); + } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { + const char *Sym = S->getSymbol(); + Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0); } - // Note that some LLVM floating-point CondCodes can't be lowered to a single - // conditional branch, hence FPCCToA64CC can set a second test, where either - // passing is sufficient. - A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; - CondCode = FPCCToA64CC(CC, Alternative); - SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); - SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, - DAG.getCondCode(CC)); - SDValue A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, - Chain, SetCC, A64cc, DestBB); + // We don't usually want to end the call-sequence here because we would tidy + // the frame up *after* the call, however in the ABI-changing tail-call case + // we've carefully laid out the parameters so that when sp is reset they'll be + // in the correct location. 
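(One more aside on the truncation at the top of the hunk above, sketched standalone with a hypothetical helper: the argument travels in an i32 register after promotion, but its stack slot is only as wide as the original type, so only the low bytes are written back.)

#include <cstdint>

// Store an i8/i16 argument at its original width (little-endian byte
// order shown; see the BEAlign note above for the big-endian offset).
static void storeSmallArg(uint8_t *Slot, uint32_t Promoted, unsigned Bytes) {
  for (unsigned i = 0; i != Bytes; ++i) // TRUNCATE to LocVT, then store
    Slot[i] = uint8_t(Promoted >> (8 * i));
}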
+ if (IsTailCall && !IsSibCall) { + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(0, true), InFlag, DL); + InFlag = Chain.getValue(1); + } - if (Alternative != A64CC::Invalid) { - A64cc = DAG.getConstant(Alternative, MVT::i32); - A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, - A64BR_CC, SetCC, A64cc, DestBB); + std::vector<SDValue> Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + if (IsTailCall) { + // Each tail call may have to adjust the stack by a different amount, so + // this information must travel along with the operation for eventual + // consumption by emitEpilogue. + Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32)); } - return A64BR_CC; -} + // Add argument registers to the end of the list so that they are known live + // into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); -SDValue -AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const { - ArgListTy Args; - ArgListEntry Entry; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { - EVT ArgVT = Op.getOperand(i).getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy; - Entry.isSExt = false; - Entry.isZExt = false; - Args.push_back(Entry); - } - SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call), getPointerTy()); + // Add a register mask operand representing the call-preserved registers. + const uint32_t *Mask; + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const AArch64RegisterInfo *ARI = + static_cast<const AArch64RegisterInfo *>(TRI); + if (IsThisReturn) { + // For 'this' returns, use the X0-preserving mask if applicable + Mask = ARI->getThisReturnPreservedMask(CallConv); + if (!Mask) { + IsThisReturn = false; + Mask = ARI->getCallPreservedMask(CallConv); + } + } else + Mask = ARI->getCallPreservedMask(CallConv); - Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext()); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); - // By default, the input chain to this libcall is the entry node of the - // function. If the libcall is going to be emitted as a tail call then - // isUsedByReturnOnly will change it to the right chain if the return - // node which is being folded has a non-entry input chain. - SDValue InChain = DAG.getEntryNode(); + if (InFlag.getNode()) + Ops.push_back(InFlag); - // isTailCall may be true since the callee does not reference caller stack - // frame. Check if it's in the right position. - SDValue TCChain = InChain; - bool isTailCall = isInTailCallPosition(DAG, Op.getNode(), TCChain); - if (isTailCall) - InChain = TCChain; + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - TargetLowering:: - CallLoweringInfo CLI(InChain, RetTy, false, false, false, false, - 0, getLibcallCallingConv(Call), isTailCall, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Callee, Args, DAG, SDLoc(Op)); - std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); + // If we're doing a tail call, use a TC_RETURN here rather than an + // actual call instruction. + if (IsTailCall) + return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); - if (!CallInfo.second.getNode()) - // It's a tailcall, return the chain (which is the DAG root). - return DAG.getRoot(); + // Returns a chain and a flag for retval copy to use. 
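(Looking ahead to the CALLSEQ_END just below: a standalone sketch, with a hypothetical helper name, of how many bytes the callee pops on return. Only conventions where the callee restores the stack pop anything, and the amount stays 16-byte aligned so SP alignment survives.)

static unsigned calleePopBytes(bool CalleeRestoresStack, unsigned NumBytes) {
  return CalleeRestoresStack ? ((NumBytes + 15) & ~15u) : 0;
}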
+ Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); + InFlag = Chain.getValue(1); - return CallInfo.first; -} + uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) + ? RoundUpToAlignment(NumBytes, 16) + : 0; -SDValue -AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { - if (Op.getOperand(0).getValueType() != MVT::f128) { - // It's legal except when f128 is involved - return Op; - } + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(CalleePopBytes, true), + InFlag, DL); + if (!Ins.empty()) + InFlag = Chain.getValue(1); - RTLIB::Libcall LC; - LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); + // Handle result values, copying them out of physregs into vregs that we + // return. + return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, + InVals, IsThisReturn, + IsThisReturn ? OutVals[0] : SDValue()); +} - SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; +bool AArch64TargetLowering::CanLowerReturn( + CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, + const SmallVectorImpl &Outs, LLVMContext &Context) const { + CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS + ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; + SmallVector RVLocs; + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context); + return CCInfo.CheckReturn(Outs, RetCC); } SDValue -AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); +AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS + ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; + SmallVector RVLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCInfo.AnalyzeReturn(Outs, RetCC); - RTLIB::Libcall LC; - LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); + // Copy the result values into the output registers. + SDValue Flag; + SmallVector RetOps(1, Chain); + for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); + ++i, ++realRVLocIdx) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + SDValue Arg = OutVals[realRVLocIdx]; - return LowerF128ToCall(Op, DAG, LC); -} + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + if (Outs[i].ArgVT == MVT::i1) { + // AAPCS requires i1 to be zero-extended to i8 by the producer of the + // value. This is strictly redundant on Darwin (which uses "zeroext + // i1"), but will be optimised out before ISel. + Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); + Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); + } + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + } -static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG, - bool IsSigned) { - SDLoc dl(Op); - EVT VT = Op.getValueType(); - SDValue Vec = Op.getOperand(0); - EVT OpVT = Vec.getValueType(); - unsigned Opc = IsSigned ? 
ISD::FP_TO_SINT : ISD::FP_TO_UINT; - - if (VT.getVectorNumElements() == 1) { - assert(OpVT == MVT::v1f64 && "Unexpected vector type!"); - if (VT.getSizeInBits() == OpVT.getSizeInBits()) - return Op; - return DAG.UnrollVectorOp(Op.getNode()); - } - - if (VT.getSizeInBits() > OpVT.getSizeInBits()) { - assert(Vec.getValueType() == MVT::v2f32 && VT == MVT::v2i64 && - "Unexpected vector type!"); - Vec = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Vec); - return DAG.getNode(Opc, dl, VT, Vec); - } else if (VT.getSizeInBits() < OpVT.getSizeInBits()) { - EVT CastVT = EVT::getIntegerVT(*DAG.getContext(), - OpVT.getVectorElementType().getSizeInBits()); - CastVT = - EVT::getVectorVT(*DAG.getContext(), CastVT, VT.getVectorNumElements()); - Vec = DAG.getNode(Opc, dl, CastVT, Vec); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Vec); - } - return DAG.getNode(Opc, dl, VT, Vec); -} - -static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { - // We custom lower concat_vectors with 4, 8, or 16 operands that are all the - // same operand and of type v1* using the DUP instruction. - unsigned NumOps = Op->getNumOperands(); - if (NumOps == 2) { - assert(Op.getValueType().getSizeInBits() == 128 && "unexpected concat"); - return Op; + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } - if (NumOps != 4 && NumOps != 8 && NumOps != 16) - return SDValue(); + RetOps[0] = Chain; // Update chain. - // Must be a single value for VDUP. - SDValue Op0 = Op.getOperand(0); - for (unsigned i = 1; i < NumOps; ++i) { - SDValue OpN = Op.getOperand(i); - if (Op0 != OpN) - return SDValue(); - } + // Add the flag if we have it. + if (Flag.getNode()) + RetOps.push_back(Flag); - // Verify the value type. - EVT EltVT = Op0.getValueType(); - switch (NumOps) { - default: llvm_unreachable("Unexpected number of operands"); - case 4: - if (EltVT != MVT::v1i16 && EltVT != MVT::v1i32) - return SDValue(); - break; - case 8: - if (EltVT != MVT::v1i8 && EltVT != MVT::v1i16) - return SDValue(); - break; - case 16: - if (EltVT != MVT::v1i8) - return SDValue(); - break; - } + return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); +} +//===----------------------------------------------------------------------===// +// Other Lowering Code +//===----------------------------------------------------------------------===// + +SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + EVT PtrVT = getPointerTy(); SDLoc DL(Op); - EVT VT = Op.getValueType(); - // VDUP produces better code for constants. - if (Op0->getOpcode() == ISD::BUILD_VECTOR) - return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Op0->getOperand(0)); - return DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, Op0, - DAG.getConstant(0, MVT::i64)); + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + unsigned char OpFlags = + Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); + + assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && + "unexpected offset in global node"); + + // This also catches the large code model case for Darwin. + if ((OpFlags & AArch64II::MO_GOT) != 0) { + SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes instead of using a wrapper node. 
+ return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + } + + if (getTargetMachine().getCodeModel() == CodeModel::Large) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + } else { + // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and + // the only correct model on Darwin. + SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, + OpFlags | AArch64II::MO_PAGE); + unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; + SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); + + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + } } +/// \brief Convert a TLS address reference into the correct sequence of loads +/// and calls to compute the variable's address (for Darwin, currently) and +/// return an SDValue containing the final node. + +/// Darwin only has one TLS scheme which must be capable of dealing with the +/// fully general situation, in the worst case. This means: +/// + "extern __thread" declaration. +/// + Defined in a possibly unknown dynamic library. +/// +/// The general system is that each __thread variable has a [3 x i64] descriptor +/// which contains information used by the runtime to calculate the address. The +/// only part of this the compiler needs to know about is the first xword, which +/// contains a function pointer that must be called with the address of the +/// entire descriptor in "x0". +/// +/// Since this descriptor may be in a different unit, in general even the +/// descriptor must be accessed via an indirect load. The "ideal" code sequence +/// is: +/// adrp x0, _var@TLVPPAGE +/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor +/// ldr x1, [x0] ; x1 contains 1st entry of descriptor, +/// ; the function pointer +/// blr x1 ; Uses descriptor address in x0 +/// ; Address of _var is now in x0. +/// +/// If the address of _var's descriptor *is* known to the linker, then it can +/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for +/// a slight efficiency gain. 
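(A standalone sketch of the address split behind the small-model ADRP/ADDlow sequence above and the adrp/ldr pairs in the Darwin TLS sequence just described; the helper name is hypothetical. MO_PAGE resolves to the symbol's 4 KiB page and MO_PAGEOFF|MO_NC to its low 12 bits, so the pair reassembles the full address.)

#include <cstdint>

static uint64_t adrpAddLow(uint64_t SymAddr) {
  uint64_t Page = SymAddr & ~uint64_t(0xFFF); // ADRP: page containing symbol
  uint64_t Lo12 = SymAddr & 0xFFF;            // :lo12: offset within the page
  return Page + Lo12;                         // ADD Xd, Xd, #:lo12:sym
}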
SDValue -AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, - bool IsSigned) const { - if (Op.getValueType().isVector()) - return LowerVectorFP_TO_INT(Op, DAG, IsSigned); - if (Op.getOperand(0).getValueType() != MVT::f128) { - // It's legal except when f128 is involved - return Op; - } - - RTLIB::Libcall LC; - if (IsSigned) - LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); - else - LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); - - return LowerF128ToCall(Op, DAG, LC); -} - -SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - MFI->setReturnAddressIsTaken(true); +AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); - if (verifyReturnAddressArgumentIsConstant(Op, DAG)) - return SDValue(); - - EVT VT = Op.getValueType(); - SDLoc dl(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - if (Depth) { - SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - SDValue Offset = DAG.getConstant(8, MVT::i64); - return DAG.getLoad(VT, dl, DAG.getEntryNode(), - DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), - MachinePointerInfo(), false, false, false, 0); - } + SDLoc DL(Op); + MVT PtrVT = getPointerTy(); + const GlobalValue *GV = cast(Op)->getGlobal(); - // Return X30, which contains the return address. Mark it an implicit live-in. - unsigned Reg = MF.addLiveIn(AArch64::X30, getRegClassFor(MVT::i64)); - return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, MVT::i64); -} + SDValue TLVPAddr = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); + // The first entry in the descriptor is a function pointer that we must call + // to obtain the address of the variable. 
+ SDValue Chain = DAG.getEntryNode(); + SDValue FuncTLVGet = + DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), + false, true, true, 8); + Chain = FuncTLVGet.getValue(1); -SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) - const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setFrameAddressIsTaken(true); - - EVT VT = Op.getValueType(); - SDLoc dl(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - unsigned FrameReg = AArch64::X29; - SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); - while (Depth--) - FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, - MachinePointerInfo(), - false, false, false, 0); - return FrameAddr; -} - -SDValue -AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op, - SelectionDAG &DAG) const { - assert(getTargetMachine().getCodeModel() == CodeModel::Large); - assert(getTargetMachine().getRelocationModel() == Reloc::Static); - - EVT PtrVT = getPointerTy(); - SDLoc dl(Op); - const GlobalAddressSDNode *GN = cast(Op); - const GlobalValue *GV = GN->getGlobal(); - - SDValue GlobalAddr = DAG.getNode( - AArch64ISD::WrapperLarge, dl, PtrVT, - DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G3), - DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G2_NC), - DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G1_NC), - DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G0_NC)); - - if (GN->getOffset() != 0) - return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr, - DAG.getConstant(GN->getOffset(), PtrVT)); - - return GlobalAddr; -} - -SDValue -AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op, - SelectionDAG &DAG) const { - assert(getTargetMachine().getCodeModel() == CodeModel::Small); - - EVT PtrVT = getPointerTy(); - SDLoc dl(Op); - const GlobalAddressSDNode *GN = cast(Op); - const GlobalValue *GV = GN->getGlobal(); - unsigned Alignment = GV->getAlignment(); - Reloc::Model RelocM = getTargetMachine().getRelocationModel(); - if (GV->isWeakForLinker() && GV->isDeclaration() && RelocM == Reloc::Static) { - // Weak undefined symbols can't use ADRP/ADD pair since they should evaluate - // to zero when they remain undefined. In PIC mode the GOT can take care of - // this, but in absolute mode we use a constant pool load. - SDValue PoolAddr; - PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, - DAG.getTargetConstantPool(GV, PtrVT, 0, 0, - AArch64II::MO_NO_FLAG), - DAG.getTargetConstantPool(GV, PtrVT, 0, 0, - AArch64II::MO_LO12), - DAG.getConstant(8, MVT::i32)); - SDValue GlobalAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr, - MachinePointerInfo::getConstantPool(), - /*isVolatile=*/ false, - /*isNonTemporal=*/ true, - /*isInvariant=*/ true, 8); - if (GN->getOffset() != 0) - return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr, - DAG.getConstant(GN->getOffset(), PtrVT)); - - return GlobalAddr; - } - - if (Alignment == 0) { - const PointerType *GVPtrTy = cast(GV->getType()); - if (GVPtrTy->getElementType()->isSized()) { - Alignment - = getDataLayout()->getABITypeAlignment(GVPtrTy->getElementType()); - } else { - // Be conservative if we can't guess, not that it really matters: - // functions and labels aren't valid for loads, and the methods used to - // actually calculate an address work with any alignment. 
- Alignment = 1; - } - } - - unsigned char HiFixup, LoFixup; - bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM); - - if (UseGOT) { - HiFixup = AArch64II::MO_GOT; - LoFixup = AArch64II::MO_GOT_LO12; - Alignment = 8; - } else { - HiFixup = AArch64II::MO_NO_FLAG; - LoFixup = AArch64II::MO_LO12; - } - - // AArch64's small model demands the following sequence: - // ADRP x0, somewhere - // ADD x0, x0, #:lo12:somewhere ; (or LDR directly). - SDValue GlobalRef = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, - DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, - HiFixup), - DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, - LoFixup), - DAG.getConstant(Alignment, MVT::i32)); - - if (UseGOT) { - GlobalRef = DAG.getNode(AArch64ISD::GOTLoad, dl, PtrVT, DAG.getEntryNode(), - GlobalRef); - } - - if (GN->getOffset() != 0) - return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalRef, - DAG.getConstant(GN->getOffset(), PtrVT)); - - return GlobalRef; -} + MFI->setAdjustsStack(true); -SDValue -AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op, - SelectionDAG &DAG) const { - // TableGen doesn't have easy access to the CodeModel or RelocationModel, so - // we make those distinctions here. - - switch (getTargetMachine().getCodeModel()) { - case CodeModel::Small: - return LowerGlobalAddressELFSmall(Op, DAG); - case CodeModel::Large: - return LowerGlobalAddressELFLarge(Op, DAG); - default: - llvm_unreachable("Only small and large code models supported now"); - } -} - -SDValue -AArch64TargetLowering::LowerConstantPool(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT PtrVT = getPointerTy(); - ConstantPoolSDNode *CN = cast(Op); - const Constant *C = CN->getConstVal(); - - switch(getTargetMachine().getCodeModel()) { - case CodeModel::Small: - // The most efficient code is PC-relative anyway for the small memory model, - // so we don't need to worry about relocation model. - return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, - DAG.getTargetConstantPool(C, PtrVT, 0, 0, - AArch64II::MO_NO_FLAG), - DAG.getTargetConstantPool(C, PtrVT, 0, 0, - AArch64II::MO_LO12), - DAG.getConstant(CN->getAlignment(), MVT::i32)); - case CodeModel::Large: - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G3), - DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC), - DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC), - DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC)); - default: - llvm_unreachable("Only small and large code models supported now"); - } + // TLS calls preserve all registers except those that absolutely must be + // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be + // silly). + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const AArch64RegisterInfo *ARI = + static_cast(TRI); + const uint32_t *Mask = ARI->getTLSCallPreservedMask(); + + // Finally, we can make the call. This is just a degenerate version of a + // normal AArch64 call node: x0 takes the address of the descriptor, and + // returns the address of the variable in this thread. 
+ Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); + Chain = + DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), + Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), + DAG.getRegisterMask(Mask), Chain.getValue(1)); + return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); } -SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr, - SDValue DescAddr, - SDLoc DL, - SelectionDAG &DAG) const { +/// When accessing thread-local variables under either the general-dynamic or +/// local-dynamic system, we make a "TLS-descriptor" call. The variable will +/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry +/// is a function pointer to carry out the resolution. This function takes the +/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All +/// other registers (except LR, NZCV) are preserved. +/// +/// Thus, the ideal call sequence on AArch64 is: +/// +/// adrp x0, :tlsdesc:thread_var +/// ldr x8, [x0, :tlsdesc_lo12:thread_var] +/// add x0, x0, :tlsdesc_lo12:thread_var +/// .tlsdesccall thread_var +/// blr x8 +/// (TPIDR_EL0 offset now in x0). +/// +/// The ".tlsdesccall" directive instructs the assembler to insert a particular +/// relocation to help the linker relax this sequence if it turns out to be too +/// conservative. +/// +/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this +/// is harmless. +SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr, + SDValue DescAddr, SDLoc DL, + SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); // The function we need to call is simply the first entry in the GOT for this // descriptor, load it in preparation. - SDValue Func, Chain; - Func = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(), - DescAddr); + SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be + // silly). + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const AArch64RegisterInfo *ARI = + static_cast(TRI); + const uint32_t *Mask = ARI->getTLSCallPreservedMask(); // The function takes only one argument: the address of the descriptor itself // in X0. - SDValue Glue; + SDValue Glue, Chain; Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue); Glue = Chain.getValue(1); - // Finally, there's a special calling-convention which means that the lookup - // must preserve all registers (except X0, obviously). - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const AArch64RegisterInfo *A64RI - = static_cast(TRI); - const uint32_t *Mask = A64RI->getTLSDescCallPreservedMask(); - // We're now ready to populate the argument list, as with a normal call: - std::vector Ops; + SmallVector Ops; Ops.push_back(Chain); Ops.push_back(Func); Ops.push_back(SymAddr); @@ -2586,22 +2705,18 @@ SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr, Ops.push_back(Glue); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(AArch64ISD::TLSDESCCALL, DL, NodeTys, &Ops[0], - Ops.size()); + Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops); Glue = Chain.getValue(1); - // After the call, the offset from TPIDR_EL0 is in X0, copy it out and pass it - // back to the generic handling code. 
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); } SDValue -AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, - SelectionDAG &DAG) const { - assert(getSubtarget()->isTargetELF() && - "TLS not implemented for non-ELF targets"); - assert(getTargetMachine().getCodeModel() == CodeModel::Small - && "TLS only supported in small memory model"); +AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetELF() && "This function expects an ELF target"); + assert(getTargetMachine().getCodeModel() == CodeModel::Small && + "ELF TLS only supported in small memory model"); const GlobalAddressSDNode *GA = cast(Op); TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); @@ -2613,39 +2728,22 @@ AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); - if (Model == TLSModel::InitialExec) { - TPOff = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - AArch64II::MO_GOTTPREL), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - AArch64II::MO_GOTTPREL_LO12), - DAG.getConstant(8, MVT::i32)); - TPOff = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(), - TPOff); - } else if (Model == TLSModel::LocalExec) { - SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, - AArch64II::MO_TPREL_G1); - SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, - AArch64II::MO_TPREL_G0_NC); - - TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, - DAG.getTargetConstant(1, MVT::i32)), 0); - TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, - TPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), 0); - } else if (Model == TLSModel::GeneralDynamic) { - // Accesses used in this sequence go via the TLS descriptor which lives in - // the GOT. Prepare an address we can use to handle this. - SDValue HiDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - AArch64II::MO_TLSDESC); - SDValue LoDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - AArch64II::MO_TLSDESC_LO12); - SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, - HiDesc, LoDesc, - DAG.getConstant(8, MVT::i32)); - SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0); - - TPOff = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG); + if (Model == TLSModel::LocalExec) { + SDValue HiVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1); + SDValue LoVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, + AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); + + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, + DAG.getTargetConstant(16, MVT::i32)), + 0); + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, + DAG.getTargetConstant(0, MVT::i32)), + 0); + } else if (Model == TLSModel::InitialExec) { + TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); } else if (Model == TLSModel::LocalDynamic) { // Local-dynamic accesses proceed in two phases. A general-dynamic TLS // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate @@ -2653,367 +2751,354 @@ AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, // calculation. // These accesses will need deduplicating if there's more than one. 
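(A sketch, with hypothetical names, of the address arithmetic this local-dynamic branch builds: one descriptor call yields the module's offset from TPIDR_EL0, and each variable then adds its own :dtprel: displacement within the module.)

#include <cstdint>

static uint64_t localDynamicAddr(uint64_t ThreadBase,   // TPIDR_EL0
                                 uint64_t ModuleTPOff,  // from the TLSDESC call
                                 uint64_t VarDTPOff) {  // MOVZ/MOVK immediate
  return ThreadBase + (ModuleTPOff + VarDTPOff);        // TPOff += DTPOff
}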
- AArch64MachineFunctionInfo* MFI = DAG.getMachineFunction() - .getInfo(); + AArch64FunctionInfo *MFI = + DAG.getMachineFunction().getInfo(); MFI->incNumLocalDynamicTLSAccesses(); - - // Get the location of _TLS_MODULE_BASE_: - SDValue HiDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, - AArch64II::MO_TLSDESC); - SDValue LoDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, - AArch64II::MO_TLSDESC_LO12); - SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, - HiDesc, LoDesc, - DAG.getConstant(8, MVT::i32)); - SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT); - - ThreadBase = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG); - - // Get the variable's offset from _TLS_MODULE_BASE_ - SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, - AArch64II::MO_DTPREL_G1); - SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, - AArch64II::MO_DTPREL_G0_NC); - - TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, - DAG.getTargetConstant(0, MVT::i32)), 0); - TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, - TPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), 0); + // Accesses used in this sequence go via the TLS descriptor which lives in + // the GOT. Prepare an address we can use to handle this. + SDValue HiDesc = DAG.getTargetExternalSymbol( + "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE); + SDValue LoDesc = DAG.getTargetExternalSymbol( + "_TLS_MODULE_BASE_", PtrVT, + AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + // First argument to the descriptor call is the address of the descriptor + // itself. + SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc); + DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); + + // The call needs a relocation too for linker relaxation. It doesn't make + // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of + // the address. + SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, + AArch64II::MO_TLS); + + // Now we can calculate the offset from TPIDR_EL0 to this module's + // thread-local area. + TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); + + // Now use :dtprel_whatever: operations to calculate this variable's offset + // in its thread-storage area. + SDValue HiVar = DAG.getTargetGlobalAddress( + GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1); + SDValue LoVar = DAG.getTargetGlobalAddress( + GV, DL, MVT::i64, 0, + AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); + + SDValue DTPOff = + SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, + DAG.getTargetConstant(16, MVT::i32)), + 0); + DTPOff = + SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar, + DAG.getTargetConstant(0, MVT::i32)), + 0); + + TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff); + } else if (Model == TLSModel::GeneralDynamic) { + // Accesses used in this sequence go via the TLS descriptor which lives in + // the GOT. Prepare an address we can use to handle this. + SDValue HiDesc = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE); + SDValue LoDesc = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, + AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + // First argument to the descriptor call is the address of the descriptor + // itself. 
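(Before the descriptor-call code continues below, a standalone sketch of the MOVZ/MOVK pairs used here and in the local-exec path above, which assemble the offset 16 bits at a time; the helper name is hypothetical.)

#include <cstdint>

static uint64_t movzMovk(uint16_t G1, uint16_t G0) {
  uint64_t R = uint64_t(G1) << 16;     // MOVZ Xd, #:..._g1:var, lsl #16
  return (R & ~uint64_t(0xFFFF)) | G0; // MOVK Xd, #:..._g0_nc:var (keeps rest)
}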
+ SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc); + DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); + + // The call needs a relocation too for linker relaxation. It doesn't make + // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of + // the address. + SDValue SymAddr = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + + // Finally we can make a call to calculate the offset from tpidr_el0. + TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); } else - llvm_unreachable("Unsupported TLS access model"); - + llvm_unreachable("Unsupported ELF TLS access model"); return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); } -static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG, - bool IsSigned) { +SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + if (Subtarget->isTargetDarwin()) + return LowerDarwinGlobalTLSAddress(Op, DAG); + else if (Subtarget->isTargetELF()) + return LowerELFGlobalTLSAddress(Op, DAG); + + llvm_unreachable("Unexpected platform trying to use TLS"); +} +SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast(Op.getOperand(1))->get(); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue Dest = Op.getOperand(4); SDLoc dl(Op); - EVT VT = Op.getValueType(); - SDValue Vec = Op.getOperand(0); - unsigned Opc = IsSigned ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; - if (VT.getVectorNumElements() == 1) { - assert(VT == MVT::v1f64 && "Unexpected vector type!"); - if (VT.getSizeInBits() == Vec.getValueSizeInBits()) - return Op; - return DAG.UnrollVectorOp(Op.getNode()); - } + // Handle f128 first, since lowering it will result in comparing the return + // value of a libcall against zero, which is just what the rest of LowerBR_CC + // is expecting to deal with. + if (LHS.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - if (VT.getSizeInBits() < Vec.getValueSizeInBits()) { - assert(Vec.getValueType() == MVT::v2i64 && VT == MVT::v2f32 && - "Unexpected vector type!"); - Vec = DAG.getNode(Opc, dl, MVT::v2f64, Vec); - return DAG.getNode(ISD::FP_ROUND, dl, VT, Vec, DAG.getIntPtrConstant(0)); - } else if (VT.getSizeInBits() > Vec.getValueSizeInBits()) { - unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - EVT CastVT = EVT::getIntegerVT(*DAG.getContext(), - VT.getVectorElementType().getSizeInBits()); - CastVT = - EVT::getVectorVT(*DAG.getContext(), CastVT, VT.getVectorNumElements()); - Vec = DAG.getNode(CastOpc, dl, CastVT, Vec); + // If softenSetCCOperands returned a scalar, we need to compare the result + // against zero to select between true and false values. + if (!RHS.getNode()) { + RHS = DAG.getConstant(0, LHS.getValueType()); + CC = ISD::SETNE; + } } - return DAG.getNode(Opc, dl, VT, Vec); -} + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch + // instruction. + unsigned Opc = LHS.getOpcode(); + if (LHS.getResNo() == 1 && isa(RHS) && + cast(RHS)->isOne() && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { + assert((CC == ISD::SETEQ || CC == ISD::SETNE) && + "Unexpected condition code."); + // Only lower legal XALUO ops. 
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) + return SDValue(); -SDValue -AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, - bool IsSigned) const { - if (Op.getValueType().isVector()) - return LowerVectorINT_TO_FP(Op, DAG, IsSigned); - if (Op.getValueType() != MVT::f128) { - // Legal for everything except f128. - return Op; - } + // The actual operation with overflow check. + AArch64CC::CondCode OFCC; + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); - RTLIB::Libcall LC; - if (IsSigned) - LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); - else - LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + if (CC == ISD::SETNE) + OFCC = getInvertedCondCode(OFCC); + SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); - return LowerF128ToCall(Op, DAG, LC); -} + return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest, + CCVal, Overflow); + } + if (LHS.getValueType().isInteger()) { + assert((LHS.getValueType() == RHS.getValueType()) && + (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); + + // If the RHS of the comparison is zero, we can potentially fold this + // to a specialized branch. + const ConstantSDNode *RHSC = dyn_cast(RHS); + if (RHSC && RHSC->getZExtValue() == 0) { + if (CC == ISD::SETEQ) { + // See if we can use a TBZ to fold in an AND as well. + // TBZ has a smaller branch displacement than CBZ. If the offset is + // out of bounds, a late MI-layer pass rewrites branches. + // 403.gcc is an example that hits this case. + if (LHS.getOpcode() == ISD::AND && + isa(LHS.getOperand(1)) && + isPowerOf2_64(LHS.getConstantOperandVal(1))) { + SDValue Test = LHS.getOperand(0); + uint64_t Mask = LHS.getConstantOperandVal(1); + + // TBZ only operates on i64's, but the ext should be free. + if (Test.getValueType() == MVT::i32) + Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); + + return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, + DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); + } -SDValue -AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { - JumpTableSDNode *JT = cast(Op); - SDLoc dl(JT); - EVT PtrVT = getPointerTy(); + return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); + } else if (CC == ISD::SETNE) { + // See if we can use a TBZ to fold in an AND as well. + // TBZ has a smaller branch displacement than CBZ. If the offset is + // out of bounds, a late MI-layer pass rewrites branches. + // 403.gcc is an example that hits this case. + if (LHS.getOpcode() == ISD::AND && + isa(LHS.getOperand(1)) && + isPowerOf2_64(LHS.getConstantOperandVal(1))) { + SDValue Test = LHS.getOperand(0); + uint64_t Mask = LHS.getConstantOperandVal(1); + + // TBNZ only operates on i64's, but the ext should be free. + if (Test.getValueType() == MVT::i32) + Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); + + return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, + DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); + } - // When compiling PIC, jump tables get put in the code section so a static - // relocation-style is acceptable for both cases. 
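(A standalone sketch of the TBZ/TBNZ fold above, with a hypothetical helper: branching on (X & Mask) == 0 with a power-of-two Mask is a test of the single bit at index log2(Mask).)

#include <cstdint>

static bool tbzTaken(uint64_t X, uint64_t Mask) {
  // Assumes Mask is a nonzero power of two, as the isPowerOf2_64 guard
  // above ensures.
  unsigned Bit = 0;
  while ((Mask >> Bit) != 1)
    ++Bit;                        // Log2_64(Mask)
  return ((X >> Bit) & 1) == 0;   // TBZ branches when the bit is clear
}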
- switch (getTargetMachine().getCodeModel()) { - case CodeModel::Small: - return DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, - DAG.getTargetJumpTable(JT->getIndex(), PtrVT), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - AArch64II::MO_LO12), - DAG.getConstant(1, MVT::i32)); - case CodeModel::Large: - return DAG.getNode( - AArch64ISD::WrapperLarge, dl, PtrVT, - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G3), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G2_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G1_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G0_NC)); - default: - llvm_unreachable("Only small and large code models supported now"); - } -} + return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); + } + } -// (SELECT testbit, iftrue, iffalse) -SDValue -AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); - SDValue TheBit = Op.getOperand(0); - SDValue IfTrue = Op.getOperand(1); - SDValue IfFalse = Op.getOperand(2); + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); + return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Cmp); + } - // AArch64 BooleanContents is the default UndefinedBooleanContent, which means - // that as the consumer we are responsible for ignoring rubbish in higher - // bits. - TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit, - DAG.getConstant(1, MVT::i32)); - SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit, - DAG.getConstant(0, TheBit.getValueType()), - DAG.getCondCode(ISD::SETNE)); + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two branches to implement. + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + AArch64CC::CondCode CC1, CC2; + changeFPCCToAArch64CC(CC, CC1, CC2); + SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + SDValue BR1 = + DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); + if (CC2 != AArch64CC::AL) { + SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); + return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, + Cmp); + } - return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), - A64CMP, IfTrue, IfFalse, - DAG.getConstant(A64CC::NE, MVT::i32)); + return BR1; } -static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) { - SDLoc DL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - ISD::CondCode CC = cast(Op.getOperand(2))->get(); +SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, + SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - bool Invert = false; - SDValue Op0, Op1; - unsigned Opcode; + SDLoc DL(Op); - if (LHS.getValueType().isInteger()) { + SDValue In1 = Op.getOperand(0); + SDValue In2 = Op.getOperand(1); + EVT SrcVT = In2.getValueType(); + if (SrcVT != VT) { + if (SrcVT == MVT::f32 && VT == MVT::f64) + In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); + else if (SrcVT == MVT::f64 && VT == MVT::f32) + In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0)); + else + // FIXME: Src type is different, bail out for now. Can VT really be a + // vector type? + return SDValue(); + } - // Attempt to use Vector Integer Compare Mask Test instruction. - // TST = icmp ne (and (op0, op1), zero). 
- if (CC == ISD::SETNE) { - if (((LHS.getOpcode() == ISD::AND) && - ISD::isBuildVectorAllZeros(RHS.getNode())) || - ((RHS.getOpcode() == ISD::AND) && - ISD::isBuildVectorAllZeros(LHS.getNode()))) { - - SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS; - SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0)); - SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1)); - return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS); - } + EVT VecVT; + EVT EltVT; + SDValue EltMask, VecVal1, VecVal2; + if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { + EltVT = MVT::i32; + VecVT = MVT::v4i32; + EltMask = DAG.getConstant(0x80000000ULL, EltVT); + + if (!VT.isVector()) { + VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, + DAG.getUNDEF(VecVT), In1); + VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, + DAG.getUNDEF(VecVT), In2); + } else { + VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); + VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); } - - // Attempt to use Vector Integer Compare Mask against Zero instr (Signed). - // Note: Compare against Zero does not support unsigned predicates. - if ((ISD::isBuildVectorAllZeros(RHS.getNode()) || - ISD::isBuildVectorAllZeros(LHS.getNode())) && - !isUnsignedIntSetCC(CC)) { - - // If LHS is the zero value, swap operands and CondCode. - if (ISD::isBuildVectorAllZeros(LHS.getNode())) { - CC = getSetCCSwappedOperands(CC); - Op0 = RHS; - } else - Op0 = LHS; - - // Ensure valid CondCode for Compare Mask against Zero instruction: - // EQ, GE, GT, LE, LT. - if (ISD::SETNE == CC) { - Invert = true; - CC = ISD::SETEQ; - } - - // Using constant type to differentiate integer and FP compares with zero. - Op1 = DAG.getConstant(0, MVT::i32); - Opcode = AArch64ISD::NEON_CMPZ; - + } else if (VT == MVT::f64 || VT == MVT::v2f64) { + EltVT = MVT::i64; + VecVT = MVT::v2i64; + + // We want to materialize a mask with the high bit set, but the AdvSIMD + // immediate moves cannot materialize that in a single instruction for + // 64-bit elements. Instead, materialize zero and then negate it. + EltMask = DAG.getConstant(0, EltVT); + + if (!VT.isVector()) { + VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, + DAG.getUNDEF(VecVT), In1); + VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, + DAG.getUNDEF(VecVT), In2); } else { - // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned). - // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT. - bool Swap = false; - switch (CC) { - default: - llvm_unreachable("Illegal integer comparison."); - case ISD::SETEQ: - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETUGT: - case ISD::SETUGE: - break; - case ISD::SETNE: - Invert = true; - CC = ISD::SETEQ; - break; - case ISD::SETULT: - case ISD::SETULE: - case ISD::SETLT: - case ISD::SETLE: - Swap = true; - CC = getSetCCSwappedOperands(CC); - } - - if (Swap) - std::swap(LHS, RHS); - - Opcode = AArch64ISD::NEON_CMP; - Op0 = LHS; - Op1 = RHS; + VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); + VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); } + } else { + llvm_unreachable("Invalid type for copysign!"); + } - // Generate Compare Mask instr or Compare Mask against Zero instr. 
- SDValue NeonCmp = - DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC)); + std::vector BuildVectorOps; + for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i) + BuildVectorOps.push_back(EltMask); - if (Invert) - NeonCmp = DAG.getNOT(DL, NeonCmp, VT); + SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps); - return NeonCmp; + // If we couldn't materialize the mask above, then the mask vector will be + // the zero vector, and we need to negate it here. + if (VT == MVT::f64 || VT == MVT::v2f64) { + BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); + BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); + BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); } - // Now handle Floating Point cases. - // Attempt to use Vector Floating Point Compare Mask against Zero instruction. - if (ISD::isBuildVectorAllZeros(RHS.getNode()) || - ISD::isBuildVectorAllZeros(LHS.getNode())) { + SDValue Sel = + DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); - // If LHS is the zero value, swap operands and CondCode. - if (ISD::isBuildVectorAllZeros(LHS.getNode())) { - CC = getSetCCSwappedOperands(CC); - Op0 = RHS; - } else - Op0 = LHS; - - // Using constant type to differentiate integer and FP compares with zero. - Op1 = DAG.getConstantFP(0, MVT::f32); - Opcode = AArch64ISD::NEON_CMPZ; - } else { - // Attempt to use Vector Floating Point Compare Mask instruction. - Op0 = LHS; - Op1 = RHS; - Opcode = AArch64ISD::NEON_CMP; - } + if (VT == MVT::f32) + return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); + else if (VT == MVT::f64) + return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); + else + return DAG.getNode(ISD::BITCAST, DL, VT, Sel); +} - SDValue NeonCmpAlt; - // Some register compares have to be implemented with swapped CC and operands, - // e.g.: OLT implemented as OGT with swapped operands. - bool SwapIfRegArgs = false; +SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { + if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) + return SDValue(); - // Ensure valid CondCode for FP Compare Mask against Zero instruction: - // EQ, GE, GT, LE, LT. - // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT. - switch (CC) { - default: - llvm_unreachable("Illegal FP comparison"); - case ISD::SETUNE: - case ISD::SETNE: - Invert = true; // Fallthrough - case ISD::SETOEQ: - case ISD::SETEQ: - CC = ISD::SETEQ; - break; - case ISD::SETOLT: - case ISD::SETLT: - CC = ISD::SETLT; - SwapIfRegArgs = true; - break; - case ISD::SETOGT: - case ISD::SETGT: - CC = ISD::SETGT; - break; - case ISD::SETOLE: - case ISD::SETLE: - CC = ISD::SETLE; - SwapIfRegArgs = true; - break; - case ISD::SETOGE: - case ISD::SETGE: - CC = ISD::SETGE; - break; - case ISD::SETUGE: - Invert = true; - CC = ISD::SETLT; - SwapIfRegArgs = true; - break; - case ISD::SETULE: - Invert = true; - CC = ISD::SETGT; - break; - case ISD::SETUGT: - Invert = true; - CC = ISD::SETLE; - SwapIfRegArgs = true; - break; - case ISD::SETULT: - Invert = true; - CC = ISD::SETGE; - break; - case ISD::SETUEQ: - Invert = true; // Fallthrough - case ISD::SETONE: - // Expand this to (OGT |OLT). - NeonCmpAlt = - DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT)); - CC = ISD::SETLT; - SwapIfRegArgs = true; - break; - case ISD::SETUO: - Invert = true; // Fallthrough - case ISD::SETO: - // Expand this to (OGE | OLT). 
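(A standalone sketch of the AArch64ISD::BIT select the new FCOPYSIGN lowering builds above, shown for f32 with a hypothetical helper. The mask holds only the sign bit, 0x80000000 for f32; for f64 the code obtains the equivalent mask by negating zero, since -0.0 has exactly the sign bit set.)

#include <cstdint>
#include <cstring>

static float copysignViaBit(float Mag, float Sign) {
  uint32_t A, B;
  std::memcpy(&A, &Mag, sizeof A);
  std::memcpy(&B, &Sign, sizeof B);
  const uint32_t Mask = 0x80000000u;      // sign bit only
  uint32_t R = (A & ~Mask) | (B & Mask);  // BIT: insert In2's bits under Mask
  float Out;
  std::memcpy(&Out, &R, sizeof Out);
  return Out;
}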
- NeonCmpAlt = - DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE)); - CC = ISD::SETLT; - SwapIfRegArgs = true; - break; - } + // While there is no integer popcount instruction, it can + // be more efficiently lowered to the following sequence that uses + // AdvSIMD registers/instructions as long as the copies to/from + // the AdvSIMD registers are cheap. + // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd + // CNT V0.8B, V0.8B // 8xbyte pop-counts + // ADDV B0, V0.8B // sum 8xbyte pop-counts + // UMOV X0, V0.B[0] // copy byte result back to integer reg + SDValue Val = Op.getOperand(0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8); - if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) { - CC = getSetCCSwappedOperands(CC); - std::swap(Op0, Op1); + SDValue VecVal; + if (VT == MVT::i32) { + VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val); + VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec, + VecVal); + } else { + VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); } - // Generate FP Compare Mask instr or FP Compare Mask against Zero instr - SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC)); + SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal); + SDValue UaddLV = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop); - if (NeonCmpAlt.getNode()) - NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt); + if (VT == MVT::i64) + UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); + return UaddLV; +} - if (Invert) - NeonCmp = DAG.getNOT(DL, NeonCmp, VT); +SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - return NeonCmp; -} + if (Op.getValueType().isVector()) + return LowerVSETCC(Op, DAG); -// (SETCC lhs, rhs, condcode) -SDValue -AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast(Op.getOperand(2))->get(); - EVT VT = Op.getValueType(); + SDLoc dl(Op); - if (VT.isVector()) - return LowerVectorSETCC(Op, DAG); + // We chose ZeroOrOneBooleanContents, so use zero and one. + EVT VT = Op.getValueType(); + SDValue TVal = DAG.getConstant(1, VT); + SDValue FVal = DAG.getConstant(0, VT); + // Handle f128 first, since one possible outcome is a normal integer + // comparison which gets picked up by the next if statement. if (LHS.getValueType() == MVT::f128) { - // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS - // for the rest of the function (some i32 or i64 values). softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); // If softenSetCCOperands returned a scalar, use it. - if (RHS.getNode() == 0) { + if (!RHS.getNode()) { assert(LHS.getValueType() == Op.getValueType() && "Unexpected setcc expansion!"); return LHS; @@ -3021,205 +3106,403 @@ AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { } if (LHS.getValueType().isInteger()) { - SDValue A64cc; + SDValue CCVal; + SDValue Cmp = + getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); + + // Note that we inverted the condition above, so we reverse the order of + // the true and false operands here. This will allow the setcc to be + // matched to a single CSINC instruction. + return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); + } + + // Now we know we're dealing with FP values. 
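The CTPOP lowering above relies on CNT producing one popcount per byte lane and ADDV summing the lanes. The same computation in scalar C++, as an illustrative sketch (the real sequence runs inside a single D register):

    #include <cstdint>
    #include <cstdio>

    // Per-byte popcount (CNT V0.8B) followed by a horizontal sum (ADDV B0).
    static unsigned popcount64_bytewise(uint64_t V) {
      unsigned Sum = 0;
      for (int Byte = 0; Byte < 8; ++Byte) {
        uint8_t B = static_cast<uint8_t>(V >> (8 * Byte)); // one 8-bit lane
        while (B) {                                        // CNT on that lane
          Sum += B & 1u;
          B >>= 1;
        }
      }
      return Sum;                                          // ADDV across lanes
    }

    int main() {
      std::printf("%u\n", popcount64_bytewise(0xf0f0f0f0f0f0f0f0ULL)); // 32
    }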
+ assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - // Integers are handled in a separate function because the combinations of - // immediates and tests can get hairy and we may want to fiddle things. - SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); + // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead + // and do the comparison. + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - return DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, - CmpOp, DAG.getConstant(1, VT), DAG.getConstant(0, VT), - A64cc); + AArch64CC::CondCode CC1, CC2; + changeFPCCToAArch64CC(CC, CC1, CC2); + if (CC2 == AArch64CC::AL) { + changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); + SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + + // Note that we inverted the condition above, so we reverse the order of + // the true and false operands here. This will allow the setcc to be + // matched to a single CSINC instruction. + return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); + } else { + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't + // totally clean. Some of them require two CSELs to implement. As is in + // this case, we emit the first CSEL and then emit a second using the output + // of the first as the RHS. We're effectively OR'ing the two CC's together. + + // FIXME: It would be nice if we could match the two CSELs to two CSINCs. + SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + SDValue CS1 = + DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); + + SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } +} - // Note that some LLVM floating-point CondCodes can't be lowered to a single - // conditional branch, hence FPCCToA64CC can set a second test, where either - // passing is sufficient. - A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; - CondCode = FPCCToA64CC(CC, Alternative); - SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); - SDValue CmpOp = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, - DAG.getCondCode(CC)); - SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, - CmpOp, DAG.getConstant(1, VT), - DAG.getConstant(0, VT), A64cc); +/// A SELECT_CC operation is really some kind of max or min if both values being +/// compared are, in some sense, equal to the results in either case. However, +/// it is permissible to compare f32 values and produce directly extended f64 +/// values. +/// +/// Extending the comparison operands would also be allowed, but is less likely +/// to happen in practice since their use is right here. Note that truncate +/// operations would *not* be semantically equivalent. 
+static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { + if (Cmp == Result) + return true; - if (Alternative != A64CC::Invalid) { - A64cc = DAG.getConstant(Alternative, MVT::i32); - A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp, - DAG.getConstant(1, VT), A64SELECT_CC, A64cc); + ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp); + ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result); + if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && + Result.getValueType() == MVT::f64) { + bool Lossy; + APFloat CmpVal = CCmp->getValueAPF(); + CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); + return CResult->getValueAPF().bitwiseIsEqual(CmpVal); } - return A64SELECT_CC; + return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; } -static SDValue LowerVectorSELECT_CC(SDValue Op, SelectionDAG &DAG) { - SDLoc dl(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue IfTrue = Op.getOperand(2); - SDValue IfFalse = Op.getOperand(3); - EVT IfTrueVT = IfTrue.getValueType(); - EVT CondVT = IfTrueVT.changeVectorElementTypeToInteger(); - ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); +SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, + SelectionDAG &DAG) const { + SDValue CC = Op->getOperand(0); + SDValue TVal = Op->getOperand(1); + SDValue FVal = Op->getOperand(2); + SDLoc DL(Op); - // If LHS & RHS are floating point and IfTrue & IfFalse are vectors, we will - // use NEON compare. - if ((LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64)) { - EVT EltVT = LHS.getValueType(); - unsigned EltNum = 128 / EltVT.getSizeInBits(); - EVT VT = EVT::getVectorVT(*DAG.getContext(), EltVT, EltNum); - unsigned SubConstant = - (LHS.getValueType() == MVT::f32) ? AArch64::sub_32 :AArch64::sub_64; - EVT CEltT = (LHS.getValueType() == MVT::f32) ? MVT::i32 : MVT::i64; - EVT CVT = EVT::getVectorVT(*DAG.getContext(), CEltT, EltNum); - - LHS - = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl, - VT, DAG.getTargetConstant(0, MVT::i32), LHS, - DAG.getTargetConstant(SubConstant, MVT::i32)), 0); - RHS - = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl, - VT, DAG.getTargetConstant(0, MVT::i32), RHS, - DAG.getTargetConstant(SubConstant, MVT::i32)), 0); - - SDValue VSetCC = DAG.getSetCC(dl, CVT, LHS, RHS, CC); - SDValue ResCC = LowerVectorSETCC(VSetCC, DAG); - if (CEltT.getSizeInBits() < IfTrueVT.getSizeInBits()) { - EVT DUPVT = - EVT::getVectorVT(*DAG.getContext(), CEltT, - IfTrueVT.getSizeInBits() / CEltT.getSizeInBits()); - ResCC = DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, DUPVT, ResCC, - DAG.getConstant(0, MVT::i64, false)); - - ResCC = DAG.getNode(ISD::BITCAST, dl, CondVT, ResCC); - } else { - // FIXME: If IfTrue & IfFalse hold v1i8, v1i16 or v1i32, this function - // can't handle them and will hit this assert. - assert(CEltT.getSizeInBits() == IfTrueVT.getSizeInBits() && - "Vector of IfTrue & IfFalse is too small."); - - unsigned ExEltNum = - EltNum * IfTrueVT.getSizeInBits() / ResCC.getValueSizeInBits(); - EVT ExVT = EVT::getVectorVT(*DAG.getContext(), CEltT, ExEltNum); - ResCC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExVT, ResCC, - DAG.getConstant(0, MVT::i64, false)); - ResCC = DAG.getNode(ISD::BITCAST, dl, CondVT, ResCC); - } - SDValue VSelect = DAG.getNode(ISD::VSELECT, dl, IfTrue.getValueType(), - ResCC, IfTrue, IfFalse); - return VSelect; - } - - // Here we handle the case that LHS & RHS are integer and IfTrue & IfFalse are - // vectors.
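LowerSETCC above falls back to a second CSEL whenever changeFPCCToAArch64CC yields two condition codes. A small sketch of why, using ONE ("ordered and not equal") as the example: it has no single AArch64 condition and decomposes into a disjunction (illustrative only):

    #include <cmath>
    #include <cstdio>

    // ONE is "a < b or a > b": two ordered comparisons OR'd together, which
    // the lowering expresses as a second CSEL consuming the first's result.
    static bool one_via_two_conditions(double a, double b) {
      return (a < b) || (a > b);
    }

    int main() {
      std::printf("%d\n", one_via_two_conditions(1.0, 2.0));          // 1
      std::printf("%d\n", one_via_two_conditions(std::nan(""), 2.0)); // 0: unordered
    }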
- A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; - CondCode = FPCCToA64CC(CC, Alternative); - SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); - SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, - DAG.getCondCode(CC)); - EVT SEVT = MVT::i32; - if (IfTrue.getValueType().getVectorElementType().getSizeInBits() > 32) - SEVT = MVT::i64; - SDValue AllOne = DAG.getConstant(-1, SEVT); - SDValue AllZero = DAG.getConstant(0, SEVT); - SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, SEVT, SetCC, - AllOne, AllZero, A64cc); - - if (Alternative != A64CC::Invalid) { - A64cc = DAG.getConstant(Alternative, MVT::i32); - A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), - SetCC, AllOne, A64SELECT_CC, A64cc); - } - SDValue VDup; - if (IfTrue.getValueType().getVectorNumElements() == 1) - VDup = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, CondVT, A64SELECT_CC); - else - VDup = DAG.getNode(AArch64ISD::NEON_VDUP, dl, CondVT, A64SELECT_CC); - SDValue VSelect = DAG.getNode(ISD::VSELECT, dl, IfTrue.getValueType(), - VDup, IfTrue, IfFalse); - return VSelect; -} + unsigned Opc = CC.getOpcode(); + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select + // instruction. + if (CC.getResNo() == 1 && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { + // Only lower legal XALUO ops. + if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0))) + return SDValue(); -// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode) -SDValue -AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - SDLoc dl(Op); + AArch64CC::CondCode OFCC; + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG); + SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); + + return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, + CCVal, Overflow); + } + + if (CC.getOpcode() == ISD::SETCC) + return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal, + cast(CC.getOperand(2))->get()); + else + return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal, + FVal, ISD::SETNE); +} + +SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, + SelectionDAG &DAG) const { + ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - SDValue IfTrue = Op.getOperand(2); - SDValue IfFalse = Op.getOperand(3); - ISD::CondCode CC = cast(Op.getOperand(4))->get(); - - if (IfTrue.getValueType().isVector()) - return LowerVectorSELECT_CC(Op, DAG); + SDValue TVal = Op.getOperand(2); + SDValue FVal = Op.getOperand(3); + SDLoc dl(Op); + // Handle f128 first, because it will result in a comparison of some RTLIB + // call result against zero. if (LHS.getValueType() == MVT::f128) { - // f128 comparisons are lowered to libcalls, but slot in nicely here - // afterwards. softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. - if (RHS.getNode() == 0) { + if (!RHS.getNode()) { RHS = DAG.getConstant(0, LHS.getValueType()); CC = ISD::SETNE; } } + // Handle integers first. 
 if (LHS.getValueType().isInteger()) { - SDValue A64cc; + assert((LHS.getValueType() == RHS.getValueType()) && + (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); + + unsigned Opcode = AArch64ISD::CSEL; + + // If both the TVal and the FVal are constants, see if we can swap them in + // order to form a CSINV or CSINC out of them. + ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); + ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); + + if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } else if (TVal.getOpcode() == ISD::XOR) { + // If TVal is a NOT we want to swap TVal and FVal so that we can match + // with a CSINV rather than a CSEL. + ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1)); + + if (CVal && CVal->isAllOnesValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + } else if (TVal.getOpcode() == ISD::SUB) { + // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so + // that we can match with a CSNEG rather than a CSEL. + ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0)); + + if (CVal && CVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + } else if (CTVal && CFVal) { + const int64_t TrueVal = CTVal->getSExtValue(); + const int64_t FalseVal = CFVal->getSExtValue(); + bool Swap = false; + + // If both TVal and FVal are constants, see if FVal is the + // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC + // instead of a CSEL in that case. + if (TrueVal == ~FalseVal) { + Opcode = AArch64ISD::CSINV; + } else if (TrueVal == -FalseVal) { + Opcode = AArch64ISD::CSNEG; + } else if (TVal.getValueType() == MVT::i32) { + // If our operands are only 32-bit wide, make sure we use 32-bit + // arithmetic for the check whether we can use CSINC. This ensures that + // the addition in the check will wrap around properly in case there is + // an overflow (which would not be the case if we do the check with + // 64-bit arithmetic). + const uint32_t TrueVal32 = CTVal->getZExtValue(); + const uint32_t FalseVal32 = CFVal->getZExtValue(); + + if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { + Opcode = AArch64ISD::CSINC; + + if (TrueVal32 > FalseVal32) { + Swap = true; + } + } + // 64-bit check whether we can use CSINC. + } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { + Opcode = AArch64ISD::CSINC; + + if (TrueVal > FalseVal) { + Swap = true; + } + } + + // Swap TVal and FVal if necessary. + if (Swap) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + + if (Opcode != AArch64ISD::CSEL) { + // Drop FVal since we can get its value by simply inverting/negating + // TVal. + FVal = TVal; + } + } + + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); + + EVT VT = Op.getValueType(); + return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); + } + + // Now we know we're dealing with FP values. + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + assert(LHS.getValueType() == RHS.getValueType()); + EVT VT = Op.getValueType(); + + // Try to match this select into a max/min operation, which has a dedicated + // opcode in the instruction set.
+ // FIXME: This is not correct in the presence of NaNs, so we only enable this + // in no-NaNs mode. + if (getTargetMachine().Options.NoNaNsFPMath) { + SDValue MinMaxLHS = TVal, MinMaxRHS = FVal; + if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) && + selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) { + CC = ISD::getSetCCSwappedOperands(CC); + std::swap(MinMaxLHS, MinMaxRHS); + } - // Integers are handled in a separate function because the combinations of - // immediates and tests can get hairy and we may want to fiddle things. - SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); + if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) && + selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) { + switch (CC) { + default: + break; + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: + case ISD::SETOGT: + case ISD::SETOGE: + return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS); + break; + case ISD::SETLT: + case ISD::SETLE: + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETOLT: + case ISD::SETOLE: + return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS); + break; + } + } + } - return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), CmpOp, - IfTrue, IfFalse, A64cc); + // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead + // and do the comparison. + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two CSELs to implement. + AArch64CC::CondCode CC1, CC2; + changeFPCCToAArch64CC(CC, CC1, CC2); + SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); + + // If we need a second CSEL, emit it, using the output of the first as the + // RHS. We're effectively OR'ing the two CC's together. + if (CC2 != AArch64CC::AL) { + SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } - // Note that some LLVM floating-point CondCodes can't be lowered to a single - // conditional branch, hence FPCCToA64CC can set a second test, where either - // passing is sufficient. - A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; - CondCode = FPCCToA64CC(CC, Alternative); - SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); - SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, - DAG.getCondCode(CC)); - SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, - Op.getValueType(), - SetCC, IfTrue, IfFalse, A64cc); + // Otherwise, return the output of the first CSEL. + return CS1; +} - if (Alternative != A64CC::Invalid) { - A64cc = DAG.getConstant(Alternative, MVT::i32); - A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), - SetCC, IfTrue, A64SELECT_CC, A64cc); +SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, + SelectionDAG &DAG) const { + // Jump table entries as PC relative offsets. No additional tweaking + // is necessary here. Just get the address of the jump table. 
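Two asides on the SELECT_CC lowering that completes above. First, the constant-pair cases reduce to a single conditional instruction whenever the false value can be recomputed from the true value; a simplified sketch of that classification (it ignores the operand swaps, the non-constant XOR/SUB forms, and the 32-bit wrap-around handling):

    #include <cstdint>
    #include <cstdio>

    enum CondSelKind { UseCSEL, UseCSINC, UseCSINV, UseCSNEG };

    // select cc, T, F with both operands constant: one instruction suffices
    // when F is the bitwise NOT, the negation, or an off-by-one of T.
    static CondSelKind classify(int64_t TVal, int64_t FVal) {
      if (TVal == ~FVal) return UseCSINV;
      if (TVal == -FVal) return UseCSNEG;
      if (TVal == FVal + 1 || TVal + 1 == FVal) return UseCSINC;
      return UseCSEL;
    }

    int main() {
      std::printf("%d %d %d\n", classify(1, 0), // UseCSINC (1)
                  classify(-1, 0),              // UseCSINV (2)
                  classify(5, -5));             // UseCSNEG (3)
    }

Second, the FMAX/FMIN match is gated on NoNaNsFPMath because the select form and the instruction disagree on NaN inputs (illustrative only):

    #include <cmath>
    #include <cstdio>

    int main() {
      double a = std::nan(""), b = 1.0;
      // The select yields b because the compare is false on an unordered
      // input; AArch64 FMAX would instead propagate the NaN.
      double SelectResult = (a > b) ? a : b;
      std::printf("%f\n", SelectResult); // 1.000000
    }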
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + if (getTargetMachine().getCodeModel() == CodeModel::Large && + !Subtarget->isTargetMachO()) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + AArch64II::MO_G0 | MO_NC)); } + SDValue Hi = + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); } -SDValue -AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { - const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); - const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); +SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, + SelectionDAG &DAG) const { + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + + if (getTargetMachine().getCodeModel() == CodeModel::Large) { + // Use the GOT for the large code model on iOS. + if (Subtarget->isTargetMachO()) { + SDValue GotAddr = DAG.getTargetConstantPool( + CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), + AArch64II::MO_GOT); + return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + } - // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes - // rather than just 8. - return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), - Op.getOperand(1), Op.getOperand(2), - DAG.getConstant(32, MVT::i32), 8, false, false, - MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G3), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G2 | MO_NC), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G1 | MO_NC), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G0 | MO_NC)); + } else { + // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on + // ELF, the only valid one on Darwin.
+ SDValue Hi = + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetConstantPool( + CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + } } -SDValue -AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { +SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + if (getTargetMachine().getCodeModel() == CodeModel::Large && + !Subtarget->isTargetMachO()) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + } else { + SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + } +} + +SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, + SelectionDAG &DAG) const { + AArch64FunctionInfo *FuncInfo = + DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); + + SDLoc DL(Op); + SDValue FR = + DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); +} + +SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, + SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call // Standard, section B.3.
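For reference, the AAPCS64 B.3 structure that LowerAAPCS_VASTART below fills in, and that LowerVACOPY later copies as 32 bytes (field names as in the ABI document; sketch only):

    #include <cstddef>

    struct AArch64VaList {
      void *__stack;   // offset 0:  next stacked (memory) argument
      void *__gr_top;  // offset 8:  end of the GP register save area
      void *__vr_top;  // offset 16: end of the FP/SIMD register save area
      int __gr_offs;   // offset 24: negative offset to next GP register arg
      int __vr_offs;   // offset 28: negative offset to next FP register arg
    };

    static_assert(sizeof(AArch64VaList) == 32,
                  "the 32-byte size LowerVACOPY copies");
    static_assert(offsetof(AArch64VaList, __vr_offs) == 28,
                  "matches the store at offset 28");

    int main() {}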
 MachineFunction &MF = DAG.getMachineFunction(); - AArch64MachineFunctionInfo *FuncInfo - = MF.getInfo<AArch64MachineFunctionInfo>(); + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); @@ -3228,1471 +3511,1911 @@ AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SmallVector<SDValue, 4> MemOps; // void *__stack at offset 0 - SDValue Stack = DAG.getFrameIndex(FuncInfo->getVariadicStackIdx(), - getPointerTy()); + SDValue Stack = + DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, - MachinePointerInfo(SV), false, false, 0)); + MachinePointerInfo(SV), false, false, 8)); // void *__gr_top at offset 8 - int GPRSize = FuncInfo->getVariadicGPRSize(); + int GPRSize = FuncInfo->getVarArgsGPRSize(); if (GPRSize > 0) { SDValue GRTop, GRTopAddr; GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, DAG.getConstant(8, getPointerTy())); - GRTop = DAG.getFrameIndex(FuncInfo->getVariadicGPRIdx(), getPointerTy()); + GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, DAG.getConstant(GPRSize, getPointerTy())); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, - MachinePointerInfo(SV, 8), - false, false, 0)); + MachinePointerInfo(SV, 8), false, false, 8)); } // void *__vr_top at offset 16 - int FPRSize = FuncInfo->getVariadicFPRSize(); + int FPRSize = FuncInfo->getVarArgsFPRSize(); if (FPRSize > 0) { SDValue VRTop, VRTopAddr; VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, DAG.getConstant(16, getPointerTy())); - VRTop = DAG.getFrameIndex(FuncInfo->getVariadicFPRIdx(), getPointerTy()); + VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, DAG.getConstant(FPRSize, getPointerTy())); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, - MachinePointerInfo(SV, 16), - false, false, 0)); + MachinePointerInfo(SV, 16), false, false, 8)); } // int __gr_offs at offset 24 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, DAG.getConstant(24, getPointerTy())); MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), - GROffsAddr, MachinePointerInfo(SV, 24), - false, false, 0)); + GROffsAddr, MachinePointerInfo(SV, 24), false, + false, 4)); // int __vr_offs at offset 28 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, DAG.getConstant(28, getPointerTy())); MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), - VROffsAddr, MachinePointerInfo(SV, 28), - false, false, 0)); + VROffsAddr, MachinePointerInfo(SV, 28), false, + false, 4)); - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], - MemOps.size()); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } -SDValue -AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { - switch (Op.getOpcode()) { - default: llvm_unreachable("Don't know how to custom lower this!"); - case ISD::FADD: return LowerF128ToCall(Op, DAG, RTLIB::ADD_F128); - case ISD::FSUB: return LowerF128ToCall(Op, DAG, RTLIB::SUB_F128); - case ISD::FMUL: return LowerF128ToCall(Op, DAG, RTLIB::MUL_F128); - case ISD::FDIV: return LowerF128ToCall(Op, DAG, RTLIB::DIV_F128); - case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, true); - case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG, false); - case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG, true); - case ISD::UINT_TO_FP: return
LowerINT_TO_FP(Op, DAG, false); - case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); - case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); - case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); - case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); - - case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); - case ISD::SRL_PARTS: - case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); - - case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); - case ISD::BRCOND: return LowerBRCOND(Op, DAG); - case ISD::BR_CC: return LowerBR_CC(Op, DAG); - case ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG); - case ISD::ConstantPool: return LowerConstantPool(Op, DAG); - case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); - case ISD::JumpTable: return LowerJumpTable(Op, DAG); - case ISD::SELECT: return LowerSELECT(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); - case ISD::SETCC: return LowerSETCC(Op, DAG); - case ISD::VACOPY: return LowerVACOPY(Op, DAG); - case ISD::VASTART: return LowerVASTART(Op, DAG); - case ISD::BUILD_VECTOR: - return LowerBUILD_VECTOR(Op, DAG, getSubtarget()); - case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); - case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); - } - - return SDValue(); +SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, + SelectionDAG &DAG) const { + return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) + : LowerAAPCS_VASTART(Op, DAG); } -/// Check if the specified splat value corresponds to a valid vector constant -/// for a Neon instruction with a "modified immediate" operand (e.g., MOVI). If -/// so, return the encoded 8-bit immediate and the OpCmode instruction fields -/// values. -static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, - unsigned SplatBitSize, SelectionDAG &DAG, - bool is128Bits, NeonModImmType type, EVT &VT, - unsigned &Imm, unsigned &OpCmode) { - switch (SplatBitSize) { - default: - llvm_unreachable("unexpected size for isNeonModifiedImm"); - case 8: { - if (type != Neon_Mov_Imm) - return false; - assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); - // Neon movi per byte: Op=0, Cmode=1110. - OpCmode = 0xe; - Imm = SplatBits; - VT = is128Bits ? MVT::v16i8 : MVT::v8i8; - break; - } - case 16: { - // Neon move inst per halfword - VT = is128Bits ? MVT::v8i16 : MVT::v4i16; - if ((SplatBits & ~0xff) == 0) { - // Value = 0x00nn is 0x00nn LSL 0 - // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000 - // bic: Op=1, Cmode=1001; orr: Op=0, Cmode=1001 - // Op=x, Cmode=100y - Imm = SplatBits; - OpCmode = 0x8; - break; - } - if ((SplatBits & ~0xff00) == 0) { - // Value = 0xnn00 is 0x00nn LSL 8 - // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010 - // bic: Op=1, Cmode=1011; orr: Op=0, Cmode=1011 - // Op=x, Cmode=101x - Imm = SplatBits >> 8; - OpCmode = 0xa; - break; - } - // can't handle any other - return false; - } - - case 32: { - // First the LSL variants (MSL is unusable by some interested instructions). +SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, + SelectionDAG &DAG) const { + // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single + // pointer. + unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32; + const Value *DestSV = cast(Op.getOperand(3))->getValue(); + const Value *SrcSV = cast(Op.getOperand(4))->getValue(); - // Neon move instr per word, shift zeros - VT = is128Bits ? 
MVT::v4i32 : MVT::v2i32; - if ((SplatBits & ~0xff) == 0) { - // Value = 0x000000nn is 0x000000nn LSL 0 - // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000 - // bic: Op=1, Cmode= 0001; orr: Op=0, Cmode= 0001 - // Op=x, Cmode=000x - Imm = SplatBits; - OpCmode = 0; - break; - } - if ((SplatBits & ~0xff00) == 0) { - // Value = 0x0000nn00 is 0x000000nn LSL 8 - // movi: Op=0, Cmode= 0010; mvni: Op=1, Cmode= 0010 - // bic: Op=1, Cmode= 0011; orr : Op=0, Cmode= 0011 - // Op=x, Cmode=001x - Imm = SplatBits >> 8; - OpCmode = 0x2; - break; - } - if ((SplatBits & ~0xff0000) == 0) { - // Value = 0x00nn0000 is 0x000000nn LSL 16 - // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100 - // bic: Op=1, Cmode= 0101; orr: Op=0, Cmode= 0101 - // Op=x, Cmode=010x - Imm = SplatBits >> 16; - OpCmode = 0x4; - break; - } - if ((SplatBits & ~0xff000000) == 0) { - // Value = 0xnn000000 is 0x000000nn LSL 24 - // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110 - // bic: Op=1, Cmode= 0111; orr: Op=0, Cmode= 0111 - // Op=x, Cmode=011x - Imm = SplatBits >> 24; - OpCmode = 0x6; - break; - } + return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1), + Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32), + 8, false, false, MachinePointerInfo(DestSV), + MachinePointerInfo(SrcSV)); +} - // Now the MSL immediates. +SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && + "automatic va_arg instruction only works on Darwin"); - // Neon move instr per word, shift ones - if ((SplatBits & ~0xffff) == 0 && - ((SplatBits | SplatUndef) & 0xff) == 0xff) { - // Value = 0x0000nnff is 0x000000nn MSL 8 - // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100 - // Op=x, Cmode=1100 - Imm = SplatBits >> 8; - OpCmode = 0xc; - break; - } - if ((SplatBits & ~0xffffff) == 0 && - ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { - // Value = 0x00nnffff is 0x000000nn MSL 16 - // movi: Op=1, Cmode= 1101; mvni: Op=1, Cmode= 1101 - // Op=x, Cmode=1101 - Imm = SplatBits >> 16; - OpCmode = 0xd; - break; - } - // can't handle any other - return false; + const Value *V = cast(Op.getOperand(2))->getValue(); + EVT VT = Op.getValueType(); + SDLoc DL(Op); + SDValue Chain = Op.getOperand(0); + SDValue Addr = Op.getOperand(1); + unsigned Align = Op.getConstantOperandVal(3); + + SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr, + MachinePointerInfo(V), false, false, false, 0); + Chain = VAList.getValue(1); + + if (Align > 8) { + assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); + VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(Align - 1, getPointerTy())); + VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, + DAG.getConstant(-(int64_t)Align, getPointerTy())); } - case 64: { - if (type != Neon_Mov_Imm) - return false; - // Neon move instr bytemask, where each byte is either 0x00 or 0xff. - // movi Op=1, Cmode=1110. - OpCmode = 0x1e; - uint64_t BitMask = 0xff; - uint64_t Val = 0; - unsigned ImmMask = 1; - Imm = 0; - for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { - if (((SplatBits | SplatUndef) & BitMask) == BitMask) { - Val |= BitMask; - Imm |= ImmMask; - } else if ((SplatBits & BitMask) != 0) { - return false; - } - BitMask <<= 8; - ImmMask <<= 1; - } - SplatBits = Val; - VT = is128Bits ? 
MVT::v2i64 : MVT::v1i64; - break; + Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); + uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); + + // Scalar integer and FP values smaller than 64 bits are implicitly extended + // up to 64 bits. At the very least, we have to increase the striding of the + // vaargs list to match this, and for FP values we need to introduce + // FP_ROUND nodes as well. + if (VT.isInteger() && !VT.isVector()) + ArgSize = 8; + bool NeedFPTrunc = false; + if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { + ArgSize = 8; + NeedFPTrunc = true; } + + // Increment the pointer, VAList, to the next vaarg + SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(ArgSize, getPointerTy())); + // Store the incremented VAList to the legalized pointer + SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), + false, false, 0); + + // Load the actual argument out of the pointer VAList + if (NeedFPTrunc) { + // Load the value as an f64. + SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, + MachinePointerInfo(), false, false, false, 0); + // Round the value down to an f32. + SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), + DAG.getIntPtrConstant(1)); + SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; + // Merge the rounded value with the chain output of the load. + return DAG.getMergeValues(Ops, DL); } - return true; + return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, + false, false, 0); } -static SDValue PerformANDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); - - // We're looking for an SRA/SHL pair which form an SBFX. - - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); - - if (!isa(N->getOperand(1))) - return SDValue(); - - uint64_t TruncMask = N->getConstantOperandVal(1); - if (!isMask_64(TruncMask)) - return SDValue(); - - uint64_t Width = CountPopulation_64(TruncMask); - SDValue Shift = N->getOperand(0); - - if (Shift.getOpcode() != ISD::SRL) - return SDValue(); +SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, + SelectionDAG &DAG) const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setFrameAddressIsTaken(true); - if (!isa(Shift->getOperand(1))) - return SDValue(); - uint64_t LSB = Shift->getConstantOperandVal(1); + EVT VT = Op.getValueType(); + SDLoc DL(Op); + unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + SDValue FrameAddr = + DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), false, false, false, 0); + return FrameAddr; +} - if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits()) - return SDValue(); +// FIXME? Maybe this could be a TableGen attribute on some registers and +// this table could be generated automatically from RegInfo. 
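The over-aligned va_arg case above rounds the list pointer up with an ADD of Align-1 followed by an AND with -Align. The same computation in plain C++ (assuming, as the assert does, that Align is a power of two):

    #include <cstdint>
    #include <cstdio>

    // Round P up to the next multiple of Align (Align a power of two;
    // ~(Align - 1) is the same mask as -Align in two's complement).
    static uint64_t align_up(uint64_t P, uint64_t Align) {
      return (P + Align - 1) & ~(Align - 1);
    }

    int main() {
      std::printf("%llu\n", (unsigned long long)align_up(41, 16)); // 48
    }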
+unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, + EVT VT) const { + unsigned Reg = StringSwitch<unsigned>(RegName) + .Case("sp", AArch64::SP) + .Default(0); + if (Reg) + return Reg; + report_fatal_error("Invalid register name global variable"); +} - return DAG.getNode(AArch64ISD::UBFX, DL, VT, Shift.getOperand(0), - DAG.getConstant(LSB, MVT::i64), - DAG.getConstant(LSB + Width - 1, MVT::i64)); -} -/// For a true bitfield insert, the bits getting into that contiguous mask -/// should come from the low part of an existing value: they must be formed from -/// a compatible SHL operation (unless they're already low). This function -/// checks that condition and returns the least-significant bit that's -/// intended. If the operation is not a field preparation, -1 is returned. -static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT, - SDValue &MaskedVal, uint64_t Mask) { - if (!isShiftedMask_64(Mask)) - return -1; - - // Now we need to alter MaskedVal so that it is an appropriate input for a BFI - // instruction. BFI will do a left-shift by LSB before applying the mask we've - // spotted, so in general we should pre-emptively "undo" that by making sure - // the incoming bits have had a right-shift applied to them. - // - // This right shift, however, will combine with existing left/right shifts. In - // the simplest case of a completely straight bitfield operation, it will be - // expected to completely cancel out with an existing SHL. More complicated - // cases (e.g. bitfield to bitfield copy) may still need a real shift before - // the BFI. - - uint64_t LSB = countTrailingZeros(Mask); - int64_t ShiftRightRequired = LSB; - if (MaskedVal.getOpcode() == ISD::SHL && - isa<ConstantSDNode>(MaskedVal.getOperand(1))) { - ShiftRightRequired -= MaskedVal.getConstantOperandVal(1); - MaskedVal = MaskedVal.getOperand(0); - } else if (MaskedVal.getOpcode() == ISD::SRL && - isa<ConstantSDNode>(MaskedVal.getOperand(1))) { - ShiftRightRequired += MaskedVal.getConstantOperandVal(1); - MaskedVal = MaskedVal.getOperand(0); - } - - if (ShiftRightRequired > 0) - MaskedVal = DAG.getNode(ISD::SRL, DL, VT, MaskedVal, - DAG.getConstant(ShiftRightRequired, MVT::i64)); - else if (ShiftRightRequired < 0) { - // We could actually end up with a residual left shift, for example with - // "struc.bitfield = val << 1". - MaskedVal = DAG.getNode(ISD::SHL, DL, VT, MaskedVal, - DAG.getConstant(-ShiftRightRequired, MVT::i64)); - } - - return LSB; -} -/// Searches from N for an existing AArch64ISD::BFI node, possibly surrounded by -/// a mask and an extension. Returns true if a BFI was found and provides -/// information on its surroundings. -static bool findMaskedBFI(SDValue N, SDValue &BFI, uint64_t &Mask, - bool &Extended) { - Extended = false; - if (N.getOpcode() == ISD::ZERO_EXTEND) { - Extended = true; - N = N.getOperand(0); - } - - if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) { - Mask = N->getConstantOperandVal(1); - N = N.getOperand(0); - } else { - // Mask is the whole width.
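getRegisterByName above accepts only "sp"; every other name falls through to the fatal error. A plain stand-in for the StringSwitch idiom it uses (hypothetical register encoding, illustrative only):

    #include <cstdio>
    #include <cstring>

    static unsigned regByName(const char *Name) {
      if (std::strcmp(Name, "sp") == 0)
        return 1; // stand-in for AArch64::SP; "sp" is the only accepted name
      return 0;   // the .Default(0) case: the caller reports a fatal error
    }

    int main() { std::printf("%u %u\n", regByName("sp"), regByName("x0")); } // 1 0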
- Mask = -1ULL >> (64 - N.getValueType().getSizeInBits()); - } +SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MFI->setReturnAddressIsTaken(true); - if (N.getOpcode() == AArch64ISD::BFI) { - BFI = N; - return true; + EVT VT = Op.getValueType(); + SDLoc DL(Op); + unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + if (Depth) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = DAG.getConstant(8, getPointerTy()); + return DAG.getLoad(VT, DL, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), + MachinePointerInfo(), false, false, false, 0); } - return false; + // Return LR, which contains the return address. Mark it an implicit live-in. + unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); + return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); } -/// Try to combine a subtree (rooted at an OR) into a "masked BFI" node, which -/// is roughly equivalent to (and (BFI ...), mask). This form is used because it -/// can often be further combined with a larger mask. Ultimately, we want mask -/// to be 2^32-1 or 2^64-1 so the AND can be skipped. -static SDValue tryCombineToBFI(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const AArch64Subtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); - - assert(N->getOpcode() == ISD::OR && "Unexpected root"); - - // We need the LHS to be (and SOMETHING, MASK). Find out what that mask is or - // abandon the effort. - SDValue LHS = N->getOperand(0); - if (LHS.getOpcode() != ISD::AND) - return SDValue(); +/// LowerShiftRightParts - Lower SRA_PARTS, which returns two +/// i64 values and take a 2 x i64 value to shift plus a shift amount. +SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + SDValue ARMcc; + unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; - uint64_t LHSMask; - if (isa(LHS.getOperand(1))) - LHSMask = LHS->getConstantOperandVal(1); - else - return SDValue(); + assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); - // We also need the RHS to be (and SOMETHING, MASK). Find out what that mask - // is or abandon the effort. 
- SDValue RHS = N->getOperand(1); - if (RHS.getOpcode() != ISD::AND) - return SDValue(); + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, + DAG.getConstant(VTBits, MVT::i64), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, + DAG.getConstant(VTBits, MVT::i64)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - uint64_t RHSMask; - if (isa(RHS.getOperand(1))) - RHSMask = RHS->getConstantOperandVal(1); - else - return SDValue(); + SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), + ISD::SETGE, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32); + + SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + SDValue Lo = + DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + + // AArch64 shifts larger than the register width are wrapped rather than + // clamped, so we can't just emit "hi >> x". + SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue TrueValHi = Opc == ISD::SRA + ? DAG.getNode(Opc, dl, VT, ShOpHi, + DAG.getConstant(VTBits - 1, MVT::i64)) + : DAG.getConstant(0, VT); + SDValue Hi = + DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); - // Can't do anything if the masks are incompatible. - if (LHSMask & RHSMask) - return SDValue(); + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); +} - // Now we need one of the masks to be a contiguous field. Without loss of - // generality that should be the RHS one. - SDValue Bitfield = LHS.getOperand(0); - if (getLSBForBFI(DAG, DL, VT, Bitfield, LHSMask) != -1) { - // We know that LHS is a candidate new value, and RHS isn't already a better - // one. - std::swap(LHS, RHS); - std::swap(LHSMask, RHSMask); - } +/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two +/// i64 values and take a 2 x i64 value to shift plus a shift amount. +SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + SDValue ARMcc; - // We've done our best to put the right operands in the right places, all we - // can do now is check whether a BFI exists. 
- Bitfield = RHS.getOperand(0); - int32_t LSB = getLSBForBFI(DAG, DL, VT, Bitfield, RHSMask); - if (LSB == -1) - return SDValue(); + assert(Op.getOpcode() == ISD::SHL_PARTS); + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, + DAG.getConstant(VTBits, MVT::i64), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, + DAG.getConstant(VTBits, MVT::i64)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - uint32_t Width = CountPopulation_64(RHSMask); - assert(Width && "Expected non-zero bitfield width"); + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue BFI = DAG.getNode(AArch64ISD::BFI, DL, VT, - LHS.getOperand(0), Bitfield, - DAG.getConstant(LSB, MVT::i64), - DAG.getConstant(Width, MVT::i64)); + SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), + ISD::SETGE, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32); + SDValue Hi = + DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); - // Mask is trivial - if ((LHSMask | RHSMask) == (-1ULL >> (64 - VT.getSizeInBits()))) - return BFI; + // AArch64 shifts of larger than register sizes are wrapped rather than + // clamped, so we can't just emit "lo << a" if a is too big. + SDValue TrueValLo = DAG.getConstant(0, VT); + SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Lo = + DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); - return DAG.getNode(ISD::AND, DL, VT, BFI, - DAG.getConstant(LHSMask | RHSMask, VT)); + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); } -/// Search for the bitwise combining (with careful masks) of a MaskedBFI and its -/// original input. This is surprisingly common because SROA splits things up -/// into i8 chunks, so the originally detected MaskedBFI may actually only act -/// on the low (say) byte of a word. This is then orred into the rest of the -/// word afterwards. -/// -/// Basic input: (or (and OLDFIELD, MASK1), (MaskedBFI MASK2, OLDFIELD, ...)). -/// -/// If MASK1 and MASK2 are compatible, we can fold the whole thing into the -/// MaskedBFI. We can also deal with a certain amount of extend/truncate being -/// involved. -static SDValue tryCombineToLargerBFI(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const AArch64Subtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); +bool AArch64TargetLowering::isOffsetFoldingLegal( + const GlobalAddressSDNode *GA) const { + // The AArch64 target doesn't support folding offsets into global addresses. + return false; +} - // First job is to hunt for a MaskedBFI on either the left or right. Swap - // operands if it's actually on the right. - SDValue BFI; - SDValue PossExtraMask; - uint64_t ExistingMask = 0; - bool Extended = false; - if (findMaskedBFI(N->getOperand(0), BFI, ExistingMask, Extended)) - PossExtraMask = N->getOperand(1); - else if (findMaskedBFI(N->getOperand(1), BFI, ExistingMask, Extended)) - PossExtraMask = N->getOperand(0); - else - return SDValue(); +bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. + // FIXME: We should be able to handle f128 as well with a clever lowering. 
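The two shift-parts lowerings above pick between a combined result (amount below 64) and an "extra shift" result (amount of 64 or more) with a GE-conditional select, because AArch64 hardware shifts wrap rather than clamp. The two cases for the left-shift variant, sketched in scalar C++ (amounts below 128 assumed):

    #include <cstdint>
    #include <cstdio>

    // 128-bit logical left shift built from two i64 halves.
    static void shl128(uint64_t Lo, uint64_t Hi, unsigned Amt,
                       uint64_t &OutLo, uint64_t &OutHi) {
      if (Amt == 0) { OutLo = Lo; OutHi = Hi; return; }
      if (Amt < 64) {           // combine both halves
        OutHi = (Hi << Amt) | (Lo >> (64 - Amt));
        OutLo = Lo << Amt;
      } else {                  // ExtraShAmt = Amt - 64 applies; low half is 0
        OutHi = Lo << (Amt - 64);
        OutLo = 0;
      }
    }

    int main() {
      uint64_t Lo, Hi;
      shl128(0x8000000000000001ULL, 0, 1, Lo, Hi);
      std::printf("%llx %llx\n", (unsigned long long)Hi,
                  (unsigned long long)Lo); // 1 2
    }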
+ if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) + return true; - // We can only combine a BFI with another compatible mask. - if (PossExtraMask.getOpcode() != ISD::AND || - !isa(PossExtraMask.getOperand(1))) - return SDValue(); + if (VT == MVT::f64) + return AArch64_AM::getFP64Imm(Imm) != -1; + else if (VT == MVT::f32) + return AArch64_AM::getFP32Imm(Imm) != -1; + return false; +} - uint64_t ExtraMask = PossExtraMask->getConstantOperandVal(1); +//===----------------------------------------------------------------------===// +// AArch64 Optimization Hooks +//===----------------------------------------------------------------------===// - // Masks must be compatible. - if (ExtraMask & ExistingMask) - return SDValue(); +//===----------------------------------------------------------------------===// +// AArch64 Inline Assembly Support +//===----------------------------------------------------------------------===// - SDValue OldBFIVal = BFI.getOperand(0); - SDValue NewBFIVal = BFI.getOperand(1); - if (Extended) { - // We skipped a ZERO_EXTEND above, so the input to the MaskedBFIs should be - // 32-bit and we'll be forming a 64-bit MaskedBFI. The MaskedBFI arguments - // need to be made compatible. - assert(VT == MVT::i64 && BFI.getValueType() == MVT::i32 - && "Invalid types for BFI"); - OldBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, OldBFIVal); - NewBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NewBFIVal); +// Table of Constraints +// TODO: This is the current set of constraints supported by ARM for the +// compiler, not all of them may make sense, e.g. S may be difficult to support. +// +// r - A general register +// w - An FP/SIMD register of some size in the range v0-v31 +// x - An FP/SIMD register of some size in the range v0-v15 +// I - Constant that can be used with an ADD instruction +// J - Constant that can be used with a SUB instruction +// K - Constant that can be used with a 32-bit logical instruction +// L - Constant that can be used with a 64-bit logical instruction +// M - Constant that can be used as a 32-bit MOV immediate +// N - Constant that can be used as a 64-bit MOV immediate +// Q - A memory reference with base register and no offset +// S - A symbolic address +// Y - Floating point constant zero +// Z - Integer constant zero +// +// Note that general register operands will be output using their 64-bit x +// register name, whatever the size of the variable, unless the asm operand +// is prefixed by the %w modifier. Floating-point and SIMD register operands +// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or +// %q modifier. + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +AArch64TargetLowering::ConstraintType +AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: + break; + case 'z': + return C_Other; + case 'x': + case 'w': + return C_RegisterClass; + // An address with a single base register. Due to the way we + // currently handle addresses it is the same as 'r'. + case 'Q': + return C_Memory; + } } + return TargetLowering::getConstraintType(Constraint); +} - // We need the MaskedBFI to be combined with a mask of the *same* value. - if (PossExtraMask.getOperand(0) != OldBFIVal) - return SDValue(); - - BFI = DAG.getNode(AArch64ISD::BFI, DL, VT, - OldBFIVal, NewBFIVal, - BFI.getOperand(2), BFI.getOperand(3)); - - // If the masking is trivial, we don't need to create it. 
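isFPImmLegal above defers to getFP32Imm/getFP64Imm, which test for FMOV's 8-bit immediate encoding: +/- (1 + m/16) * 2^e with m in 0..15 and e in -3..4, 256 values in all. A brute-force check of that set (illustrative only):

    #include <cmath>
    #include <cstdio>

    static bool is_fmov_encodable(double V) {
      for (int s = 0; s <= 1; ++s)
        for (int e = -3; e <= 4; ++e)
          for (int m = 0; m <= 15; ++m)
            if (V == (s ? -1.0 : 1.0) * (1.0 + m / 16.0) * std::ldexp(1.0, e))
              return true;
      return false;
    }

    int main() {
      std::printf("%d %d\n", is_fmov_encodable(0.5), // 1: 1.0 * 2^-1
                  is_fmov_encodable(0.1));           // 0: not representable
    }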
- if ((ExtraMask | ExistingMask) == (-1ULL >> (64 - VT.getSizeInBits()))) - return BFI; - - return DAG.getNode(ISD::AND, DL, VT, BFI, - DAG.getConstant(ExtraMask | ExistingMask, VT)); +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight +AArch64TargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (!CallOperandVal) + return CW_Default; + Type *type = CallOperandVal->getType(); + // Look at the constraint type. + switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break; + case 'x': + case 'w': + if (type->isFloatingPointTy() || type->isVectorTy()) + weight = CW_Register; + break; + case 'z': + weight = CW_Constant; + break; + } + return weight; } -/// An EXTR instruction is made up of two shifts, ORed together. This helper -/// searches for and classifies those shifts. -static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, - bool &FromHi) { - if (N.getOpcode() == ISD::SHL) - FromHi = false; - else if (N.getOpcode() == ISD::SRL) - FromHi = true; - else - return false; +std::pair<unsigned, const TargetRegisterClass *> +AArch64TargetLowering::getRegForInlineAsmConstraint( + const std::string &Constraint, MVT VT) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'r': + if (VT.getSizeInBits() == 64) + return std::make_pair(0U, &AArch64::GPR64commonRegClass); + return std::make_pair(0U, &AArch64::GPR32commonRegClass); + case 'w': + if (VT == MVT::f32) + return std::make_pair(0U, &AArch64::FPR32RegClass); + if (VT.getSizeInBits() == 64) + return std::make_pair(0U, &AArch64::FPR64RegClass); + if (VT.getSizeInBits() == 128) + return std::make_pair(0U, &AArch64::FPR128RegClass); + break; + // The instructions that this constraint is designed for can + // only take 128-bit registers so just use that regclass. + case 'x': + if (VT.getSizeInBits() == 128) + return std::make_pair(0U, &AArch64::FPR128_loRegClass); + break; + } + } + if (StringRef("{cc}").equals_lower(Constraint)) + return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); - if (!isa<ConstantSDNode>(N.getOperand(1))) - return false; + // Use the default implementation in TargetLowering to convert the register + // constraint into a member of a register class. + std::pair<unsigned, const TargetRegisterClass *> Res; + Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + + // Not found as a standard register? + if (!Res.second) { + unsigned Size = Constraint.size(); + if ((Size == 4 || Size == 5) && Constraint[0] == '{' && + tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { + const std::string Reg = + std::string(&Constraint[2], &Constraint[Size - 1]); + int RegNo = atoi(Reg.c_str()); + if (RegNo >= 0 && RegNo <= 31) { + // v0 - v31 are aliases of q0 - q31. + // By default we'll emit v0-v31 for this unless there's a modifier where + // we'll emit the correct register as well.
+ Res.first = AArch64::FPR128RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR128RegClass; + } + } + } - ShiftAmount = N->getConstantOperandVal(1); - Src = N->getOperand(0); - return true; + return Res; } -/// EXTR instruction extracts a contiguous chunk of bits from two existing -/// registers viewed as a high/low pair. This function looks for the pattern: -/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an -/// EXTR. Can't quite be done in TableGen because the two immediates aren't -/// independent. -static SDValue tryCombineToEXTR(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); +/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops +/// vector. If it is invalid, don't add anything to Ops. +void AArch64TargetLowering::LowerAsmOperandForConstraint( + SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, + SelectionDAG &DAG) const { + SDValue Result; - assert(N->getOpcode() == ISD::OR && "Unexpected root"); + // Currently only support length 1 constraints. + if (Constraint.length() != 1) + return; - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { + default: + break; - SDValue LHS; - uint32_t ShiftLHS = 0; - bool LHSFromHi = 0; - if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) - return SDValue(); + // This set of constraints deals with valid constants for various instructions. + // Validate and return a target constant for them if we can. + case 'z': { + // 'z' maps to xzr or wzr so it needs an input of 0. + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); + if (!C || C->getZExtValue() != 0) + return; - SDValue RHS; - uint32_t ShiftRHS = 0; - bool RHSFromHi = 0; - if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) - return SDValue(); + if (Op.getValueType() == MVT::i64) + Result = DAG.getRegister(AArch64::XZR, MVT::i64); + else + Result = DAG.getRegister(AArch64::WZR, MVT::i32); + break; + } - // If they're both trying to come from the high part of the register, they're - // not really an EXTR. - if (LHSFromHi == RHSFromHi) - return SDValue(); + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); + if (!C) + return; - if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) - return SDValue(); + // Grab the value and do some validation. + uint64_t CVal = C->getZExtValue(); + switch (ConstraintLetter) { + // The I constraint applies only to simple ADD or SUB immediate operands: + // i.e. 0 to 4095 with optional shift by 12 + // The J constraint applies only to ADD or SUB immediates that would be + // valid when negated, i.e. if [an add pattern] were to be output as a SUB + // instruction [or vice versa], in other words -1 to -4095 with optional + // left shift by 12. + case 'I': + if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) + break; + return; + case 'J': { + uint64_t NVal = -C->getSExtValue(); + if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) + break; + return; + } + // The K and L constraints apply *only* to logical immediates, including + // what used to be the MOVI alias for ORR (though the MOVI alias has now + // been removed and MOV should be used).
So these constraints have to + // distinguish between bit patterns that are valid 32-bit or 64-bit + // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but + // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice + // versa. + case 'K': + if (AArch64_AM::isLogicalImmediate(CVal, 32)) + break; + return; + case 'L': + if (AArch64_AM::isLogicalImmediate(CVal, 64)) + break; + return; + // The M and N constraints are a superset of K and L respectively, for use + // with the MOV (immediate) alias. As well as the logical immediates they + // also match 32 or 64-bit immediates that can be loaded either using a + // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca + // (M) or 64-bit 0x1234000000000000 (N) etc. + // As a note some of this code is liberally stolen from the asm parser. + case 'M': { + if (!isUInt<32>(CVal)) + return; + if (AArch64_AM::isLogicalImmediate(CVal, 32)) + break; + if ((CVal & 0xFFFF) == CVal) + break; + if ((CVal & 0xFFFF0000ULL) == CVal) + break; + uint64_t NCVal = ~(uint32_t)CVal; + if ((NCVal & 0xFFFFULL) == NCVal) + break; + if ((NCVal & 0xFFFF0000ULL) == NCVal) + break; + return; + } + case 'N': { + if (AArch64_AM::isLogicalImmediate(CVal, 64)) + break; + if ((CVal & 0xFFFFULL) == CVal) + break; + if ((CVal & 0xFFFF0000ULL) == CVal) + break; + if ((CVal & 0xFFFF00000000ULL) == CVal) + break; + if ((CVal & 0xFFFF000000000000ULL) == CVal) + break; + uint64_t NCVal = ~CVal; + if ((NCVal & 0xFFFFULL) == NCVal) + break; + if ((NCVal & 0xFFFF0000ULL) == NCVal) + break; + if ((NCVal & 0xFFFF00000000ULL) == NCVal) + break; + if ((NCVal & 0xFFFF000000000000ULL) == NCVal) + break; + return; + } + default: + return; + } - if (LHSFromHi) { - std::swap(LHS, RHS); - std::swap(ShiftLHS, ShiftRHS); + // All assembler immediates are 64-bit integers. + Result = DAG.getTargetConstant(CVal, MVT::i64); + break; } - return DAG.getNode(AArch64ISD::EXTR, DL, VT, - LHS, RHS, - DAG.getConstant(ShiftRHS, MVT::i64)); -} - -/// Target-specific dag combine xforms for ISD::OR -static SDValue PerformORCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const AArch64Subtarget *Subtarget) { + if (Result.getNode()) { + Ops.push_back(Result); + return; + } - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} - if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) - return SDValue(); +//===----------------------------------------------------------------------===// +// AArch64 Advanced SIMD Support +//===----------------------------------------------------------------------===// - // Attempt to recognise bitfield-insert operations. - SDValue Res = tryCombineToBFI(N, DCI, Subtarget); - if (Res.getNode()) - return Res; +/// WidenVector - Given a value in the V64 register class, produce the +/// equivalent value in the V128 register class. +static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { + EVT VT = V64Reg.getValueType(); + unsigned NarrowSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); + SDLoc DL(V64Reg); + + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), + V64Reg, DAG.getConstant(0, MVT::i32)); +} - // Attempt to combine an existing MaskedBFI operation into one with a larger - // mask. 
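// ---- Illustrative aside (not part of the patch) --------------------------
// A standalone sketch of the 'I'/'J' validation in
// LowerAsmOperandForConstraint above, with the llvm::isUInt /
// llvm::isShiftedUInt templates expanded by hand (helper names are mine):
#include <cstdint>

// ADD/SUB immediates: a 12-bit unsigned value, optionally shifted left by 12.
static bool isLegalAddSubImm(uint64_t V) {
  return (V >> 12) == 0 ||                        // isUInt<12>
         ((V & 0xFFFULL) == 0 && (V >> 24) == 0); // isShiftedUInt<12, 12>
}

// 'I' accepts the value itself; 'J' accepts a value whose negation encodes,
// so the ADD can be flipped to a SUB (or vice versa).
static bool matchesI(uint64_t CVal) { return isLegalAddSubImm(CVal); }
static bool matchesJ(int64_t CVal) { return isLegalAddSubImm((uint64_t)-CVal); }
// ---------------------------------------------------------------------------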
- Res = tryCombineToLargerBFI(N, DCI, Subtarget);
- if (Res.getNode())
- return Res;
+/// getExtFactor - Determine the adjustment factor for the position when
+/// generating an "extract from vector registers" instruction.
+static unsigned getExtFactor(SDValue &V) {
+ EVT EltType = V.getValueType().getVectorElementType();
+ return EltType.getSizeInBits() / 8;
+}
- Res = tryCombineToEXTR(N, DCI);
- if (Res.getNode())
- return Res;
+/// NarrowVector - Given a value in the V128 register class, produce the
+/// equivalent value in the V64 register class.
+static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
+ EVT VT = V128Reg.getValueType();
+ unsigned WideSize = VT.getVectorNumElements();
+ MVT EltTy = VT.getVectorElementType().getSimpleVT();
+ MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
+ SDLoc DL(V128Reg);
- if (!Subtarget->hasNEON())
- return SDValue();
+ return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
+}
- // Attempt to use vector immediate-form BSL
- // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
+// Gather data to see if the operation can be modelled as a
+// shuffle in combination with VEXTs.
+SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
- SDValue N0 = N->getOperand(0);
- if (N0.getOpcode() != ISD::AND)
- return SDValue();
+ SmallVector<SDValue, 2> SourceVecs;
+ SmallVector<unsigned, 2> MinElts;
+ SmallVector<unsigned, 2> MaxElts;
- SDValue N1 = N->getOperand(1);
- if (N1.getOpcode() != ISD::AND)
- return SDValue();
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.getOpcode() == ISD::UNDEF)
+ continue;
+ else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+ // A shuffle can only come from building a vector from various
+ // elements of other vectors.
+ return SDValue();
+ }
- if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
- APInt SplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
- BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
- APInt SplatBits0;
- if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
- HasAnyUndefs) &&
- !HasAnyUndefs) {
- BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
- APInt SplatBits1;
- if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
- HasAnyUndefs) && !HasAnyUndefs &&
- SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
- SplatBits0 == ~SplatBits1) {
-
- return DAG.getNode(ISD::VSELECT, DL, VT, N0->getOperand(1),
- N0->getOperand(0), N1->getOperand(0));
+ // Record this extraction against the appropriate vector if possible...
+ SDValue SourceVec = V.getOperand(0);
+ unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+ bool FoundSource = false;
+ for (unsigned j = 0; j < SourceVecs.size(); ++j) {
+ if (SourceVecs[j] == SourceVec) {
+ if (MinElts[j] > EltNo)
+ MinElts[j] = EltNo;
+ if (MaxElts[j] < EltNo)
+ MaxElts[j] = EltNo;
+ FoundSource = true;
+ break;
 }
 }
+
+ // Or record a new source if not...
+ if (!FoundSource) {
+ SourceVecs.push_back(SourceVec);
+ MinElts.push_back(EltNo);
+ MaxElts.push_back(EltNo);
+ }
 }
- return SDValue();
-}
+ // Currently only do something sane when at most two source vectors are
+ // involved. 
+ if (SourceVecs.size() > 2) + return SDValue(); -/// Target-specific dag combine xforms for ISD::SRA -static SDValue PerformSRACombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { + SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; + int VEXTOffsets[2] = { 0, 0 }; - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); + // This loop extracts the usage patterns of the source vectors + // and prepares appropriate SDValues for a shuffle if possible. + for (unsigned i = 0; i < SourceVecs.size(); ++i) { + if (SourceVecs[i].getValueType() == VT) { + // No VEXT necessary + ShuffleSrcs[i] = SourceVecs[i]; + VEXTOffsets[i] = 0; + continue; + } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { + // We can pad out the smaller vector for free, so if it's part of a + // shuffle... + ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, SourceVecs[i], + DAG.getUNDEF(SourceVecs[i].getValueType())); + continue; + } - // We're looking for an SRA/SHL pair which form an SBFX. + // Don't attempt to extract subvectors from BUILD_VECTOR sources + // that expand or trunc the original value. + // TODO: We can try to bitcast and ANY_EXTEND the result but + // we need to consider the cost of vector ANY_EXTEND, and the + // legality of all the types. + if (SourceVecs[i].getValueType().getVectorElementType() != + VT.getVectorElementType()) + return SDValue(); - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); + // Since only 64-bit and 128-bit vectors are legal on ARM and + // we've eliminated the other cases... + assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts && + "unexpected vector sizes in ReconstructShuffle"); - if (!isa(N->getOperand(1))) - return SDValue(); + if (MaxElts[i] - MinElts[i] >= NumElts) { + // Span too large for a VEXT to cope + return SDValue(); + } - uint64_t ExtraSignBits = N->getConstantOperandVal(1); - SDValue Shift = N->getOperand(0); + if (MinElts[i] >= NumElts) { + // The extraction can just take the second half + VEXTOffsets[i] = NumElts; + ShuffleSrcs[i] = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], + DAG.getIntPtrConstant(NumElts)); + } else if (MaxElts[i] < NumElts) { + // The extraction can just take the first half + VEXTOffsets[i] = 0; + ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + SourceVecs[i], DAG.getIntPtrConstant(0)); + } else { + // An actual VEXT is needed + VEXTOffsets[i] = MinElts[i]; + SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + SourceVecs[i], DAG.getIntPtrConstant(0)); + SDValue VEXTSrc2 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], + DAG.getIntPtrConstant(NumElts)); + unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1); + ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2, + DAG.getConstant(Imm, MVT::i32)); + } + } - if (Shift.getOpcode() != ISD::SHL) - return SDValue(); + SmallVector Mask; - if (!isa(Shift->getOperand(1))) - return SDValue(); + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Entry = Op.getOperand(i); + if (Entry.getOpcode() == ISD::UNDEF) { + Mask.push_back(-1); + continue; + } - uint64_t BitsOnLeft = Shift->getConstantOperandVal(1); - uint64_t Width = VT.getSizeInBits() - ExtraSignBits; - uint64_t LSB = VT.getSizeInBits() - Width - BitsOnLeft; + SDValue ExtractVec = Entry.getOperand(0); + int ExtractElt = + cast(Op.getOperand(i).getOperand(1))->getSExtValue(); + if (ExtractVec == SourceVecs[0]) { + Mask.push_back(ExtractElt - VEXTOffsets[0]); + } 
else { + Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); + } + } - if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits()) - return SDValue(); + // Final check before we try to produce nonsense... + if (isShuffleMaskLegal(Mask, VT)) + return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], + &Mask[0]); - return DAG.getNode(AArch64ISD::SBFX, DL, VT, Shift.getOperand(0), - DAG.getConstant(LSB, MVT::i64), - DAG.getConstant(LSB + Width - 1, MVT::i64)); + return SDValue(); } -/// Check if this is a valid build_vector for the immediate operand of -/// a vector shift operation, where all the elements of the build_vector -/// must have the same constant integer value. -static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { - // Ignore bit_converts. - while (Op.getOpcode() == ISD::BITCAST) - Op = Op.getOperand(0); - BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, - HasAnyUndefs, ElementBits) || - SplatBitSize > ElementBits) - return false; - Cnt = SplatBits.getSExtValue(); - return true; -} +// check if an EXT instruction can handle the shuffle mask when the +// vector sources of the shuffle are the same. +static bool isSingletonEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { + unsigned NumElts = VT.getVectorNumElements(); -/// Check if this is a valid build_vector for the immediate operand of -/// a vector shift left operation. That value must be in the range: -/// 0 <= Value < ElementBits -static bool isVShiftLImm(SDValue Op, EVT VT, int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); - if (!getVShiftImm(Op, ElementBits, Cnt)) + // Assume that the first shuffle index is not UNDEF. Fail if it is. + if (M[0] < 0) return false; - return (Cnt >= 0 && Cnt < ElementBits); -} -/// Check if this is a valid build_vector for the immediate operand of a -/// vector shift right operation. The value must be in the range: -/// 1 <= Value <= ElementBits -static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); - if (!getVShiftImm(Op, ElementBits, Cnt)) - return false; - return (Cnt >= 1 && Cnt <= ElementBits); -} + Imm = M[0]; -static SDValue GenForSextInreg(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - EVT SrcVT, EVT DestVT, EVT SubRegVT, - const int *Mask, SDValue Src) { - SelectionDAG &DAG = DCI.DAG; - SDValue Bitcast - = DAG.getNode(ISD::BITCAST, SDLoc(N), SrcVT, Src); - SDValue Sext - = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), DestVT, Bitcast); - SDValue ShuffleVec - = DAG.getVectorShuffle(DestVT, SDLoc(N), Sext, DAG.getUNDEF(DestVT), Mask); - SDValue ExtractSubreg - = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), - SubRegVT, ShuffleVec, - DAG.getTargetConstant(AArch64::sub_64, MVT::i32)), 0); - return ExtractSubreg; -} - -/// Checks for vector shifts and lowers them. -static SDValue PerformShiftCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const AArch64Subtarget *ST) { - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64)) - return PerformSRACombine(N, DCI); + // If this is a VEXT shuffle, the immediate value is the index of the first + // element. 
The other shuffle indices must be the successive elements after
+ // the first one.
+ unsigned ExpectedElt = Imm;
+ for (unsigned i = 1; i < NumElts; ++i) {
+ // Increment the expected index. If it wraps around, just follow it
+ // back to index zero and keep going.
+ ++ExpectedElt;
+ if (ExpectedElt == NumElts)
+ ExpectedElt = 0;
- // We're looking for an SRA/SHL pair to help generating instruction
- // sshll v0.8h, v0.8b, #0
- // The instruction STXL is also the alias of this instruction.
- //
- // For example, for DAG like below,
- // v2i32 = sra (v2i32 (shl v2i32, 16)), 16
- // we can transform it into
- // v2i32 = EXTRACT_SUBREG
- // (v4i32 (shuffle_vector
- // (v4i32 (sext (v4i16 (bitcast v2i32))),
- // undef, (0, 2, u, u)),
- // sub_64
- //
- // With this transformation we expect to generate "SSHLL + UZIP1"
- // Sometimes UZIP1 can be optimized away by combining with other context.
- int64_t ShrCnt, ShlCnt;
- if (N->getOpcode() == ISD::SRA
- && (VT == MVT::v2i32 || VT == MVT::v4i16)
- && isVShiftRImm(N->getOperand(1), VT, ShrCnt)
- && N->getOperand(0).getOpcode() == ISD::SHL
- && isVShiftRImm(N->getOperand(0).getOperand(1), VT, ShlCnt)) {
- SDValue Src = N->getOperand(0).getOperand(0);
- if (VT == MVT::v2i32 && ShrCnt == 16 && ShlCnt == 16) {
- // sext_inreg(v2i32, v2i16)
- // We essentially only care the Mask {0, 2, u, u}
- int Mask[4] = {0, 2, 4, 6};
- return GenForSextInreg(N, DCI, MVT::v4i16, MVT::v4i32, MVT::v2i32,
- Mask, Src);
- }
- else if (VT == MVT::v2i32 && ShrCnt == 24 && ShlCnt == 24) {
- // sext_inreg(v2i16, v2i8)
- // We essentially only care the Mask {0, u, 4, u, u, u, u, u, u, u, u, u}
- int Mask[8] = {0, 2, 4, 6, 8, 10, 12, 14};
- return GenForSextInreg(N, DCI, MVT::v8i8, MVT::v8i16, MVT::v2i32,
- Mask, Src);
- }
- else if (VT == MVT::v4i16 && ShrCnt == 8 && ShlCnt == 8) {
- // sext_inreg(v4i16, v4i8)
- // We essentially only care the Mask {0, 2, 4, 6, u, u, u, u, u, u, u, u}
- int Mask[8] = {0, 2, 4, 6, 8, 10, 12, 14};
- return GenForSextInreg(N, DCI, MVT::v8i8, MVT::v8i16, MVT::v4i16,
- Mask, Src);
- }
+ if (M[i] < 0)
+ continue; // ignore UNDEF indices
+ if (ExpectedElt != static_cast<unsigned>(M[i]))
+ return false;
 }
- // Nothing to be done for scalar shifts.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!VT.isVector() || !TLI.isTypeLegal(VT))
- return SDValue();
-
- assert(ST->hasNEON() && "unexpected vector shift");
- int64_t Cnt;
+ return true;
+}
- switch (N->getOpcode()) {
- default:
- llvm_unreachable("unexpected shift opcode");
+// check if an EXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are different.
+static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
+ unsigned &Imm) {
+ // Look for the first non-undef element.
+ const int *FirstRealElt = std::find_if(M.begin(), M.end(),
+ [](int Elt) {return Elt >= 0;});
- case ISD::SHL:
- if (isVShiftLImm(N->getOperand(1), VT, Cnt)) {
- SDValue RHS =
- DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
- DAG.getConstant(Cnt, MVT::i32));
- return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS);
- }
- break;
+ // Benefit from APInt to handle overflow when calculating expected element.
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
+ APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
+ // The following shuffle indices must be the successive elements after the
+ // first real element. 
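// ---- Illustrative aside (not part of the patch) --------------------------
// The shape isSingletonEXTMask above accepts, reduced to plain C++: the first
// index fixes a rotation and every later index must continue it modulo
// NumElts, with -1 (undef) matching anything. E.g. {2, 3, 0, 1} matches with
// Imm == 2 (an EXT by two elements). The helper name is mine:
#include <vector>

static bool isRotatedIdentity(const std::vector<int> &M, unsigned NumElts,
                              unsigned &Imm) {
  if (M.empty() || M[0] < 0)
    return false;                       // conservatively fail on leading undef
  Imm = M[0];
  unsigned Expected = Imm;
  for (size_t i = 1; i < M.size(); ++i) {
    Expected = (Expected + 1) % NumElts;          // wrap back to lane 0
    if (M[i] >= 0 && (unsigned)M[i] != Expected)
      return false;
  }
  return true;
}
// ---------------------------------------------------------------------------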
+ const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
+ [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
+ if (FirstWrongElt != M.end())
+ return false;
- case ISD::SRA:
- case ISD::SRL:
- if (isVShiftRImm(N->getOperand(1), VT, Cnt)) {
- SDValue RHS =
- DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
- DAG.getConstant(Cnt, MVT::i32));
- return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS);
- }
- break;
- }
+ // The index of an EXT is the first element if it is not UNDEF.
+ // Watch out for the beginning UNDEFs. The EXT index should be the expected
+ // value of the first element. E.g.
+ // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
+ // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
+ // ExpectedElt is the last mask index plus 1.
+ Imm = ExpectedElt.getZExtValue();
+
+ // There are two different cases that require reversing the input vectors.
+ // For example, for vector <4 x i32> we have the following cases,
+ // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
+ // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
+ // For both cases, we finally use mask <5, 6, 7, 0>, which requires
+ // reversing the two input vectors.
+ if (Imm < NumElts)
+ ReverseEXT = true;
+ else
+ Imm -= NumElts;
- return SDValue();
+ return true;
 }
-/// ARM-specific DAG combining for intrinsics.
-static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
-
- switch (IntNo) {
- default:
- // Don't do anything for most intrinsics.
- break;
-
- case Intrinsic::arm_neon_vqshifts:
- case Intrinsic::arm_neon_vqshiftu:
- EVT VT = N->getOperand(1).getValueType();
- int64_t Cnt;
- if (!isVShiftLImm(N->getOperand(2), VT, Cnt))
- break;
- unsigned VShiftOpc = (IntNo == Intrinsic::arm_neon_vqshifts)
- ? AArch64ISD::NEON_QSHLs
- : AArch64ISD::NEON_QSHLu;
- return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
- N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
+/// isREVMask - Check if a vector shuffle corresponds to a REV
+/// instruction with the specified blocksize. (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+ assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+ "Only possible block sizes for REV are: 16, 32, 64");
+
+ unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+ if (EltSz == 64)
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned BlockElts = M[0] + 1;
+ // If the first shuffle index is UNDEF, be optimistic.
+ if (M[0] < 0)
+ BlockElts = BlockSize / EltSz;
+
+ if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+ return false;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ if (M[i] < 0)
+ continue; // ignore UNDEF indices
+ if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+ return false;
 }
- return SDValue();
+ return true;
 }
-/// Target-specific DAG combine function for NEON load/store intrinsics
-/// to merge base address updates.
-static SDValue CombineBaseUpdate(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
- return SDValue();
+static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+ unsigned NumElts = VT.getVectorNumElements();
+ WhichResult = (M[0] == 0 ? 
0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != Idx) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) + return false; + Idx += 1; + } - SelectionDAG &DAG = DCI.DAG; - bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || - N->getOpcode() == ISD::INTRINSIC_W_CHAIN); - unsigned AddrOpIdx = (isIntrinsic ? 2 : 1); - SDValue Addr = N->getOperand(AddrOpIdx); + return true; +} - // Search for a use of the address operand that is an increment. - for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), - UE = Addr.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - if (User->getOpcode() != ISD::ADD || - UI.getUse().getResNo() != Addr.getResNo()) - continue; +static bool isUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i != NumElts; ++i) { + if (M[i] < 0) + continue; // ignore UNDEF indices + if ((unsigned)M[i] != 2 * i + WhichResult) + return false; + } - // Check that the add is independent of the load/store. Otherwise, folding - // it would create a cycle. - if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) - continue; + return true; +} - // Find the new opcode for the updating load/store. - bool isLoad = true; - bool isLaneOp = false; - unsigned NewOpc = 0; - unsigned NumVecs = 0; - if (isIntrinsic) { - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); - switch (IntNo) { - default: llvm_unreachable("unexpected intrinsic for Neon base update"); - case Intrinsic::arm_neon_vld1: NewOpc = AArch64ISD::NEON_LD1_UPD; - NumVecs = 1; break; - case Intrinsic::arm_neon_vld2: NewOpc = AArch64ISD::NEON_LD2_UPD; - NumVecs = 2; break; - case Intrinsic::arm_neon_vld3: NewOpc = AArch64ISD::NEON_LD3_UPD; - NumVecs = 3; break; - case Intrinsic::arm_neon_vld4: NewOpc = AArch64ISD::NEON_LD4_UPD; - NumVecs = 4; break; - case Intrinsic::arm_neon_vst1: NewOpc = AArch64ISD::NEON_ST1_UPD; - NumVecs = 1; isLoad = false; break; - case Intrinsic::arm_neon_vst2: NewOpc = AArch64ISD::NEON_ST2_UPD; - NumVecs = 2; isLoad = false; break; - case Intrinsic::arm_neon_vst3: NewOpc = AArch64ISD::NEON_ST3_UPD; - NumVecs = 3; isLoad = false; break; - case Intrinsic::arm_neon_vst4: NewOpc = AArch64ISD::NEON_ST4_UPD; - NumVecs = 4; isLoad = false; break; - case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD; - NumVecs = 2; break; - case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD; - NumVecs = 3; break; - case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD; - NumVecs = 4; break; - case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD; - NumVecs = 2; isLoad = false; break; - case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD; - NumVecs = 3; isLoad = false; break; - case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD; - NumVecs = 4; isLoad = false; break; - case Intrinsic::arm_neon_vld2lane: NewOpc = AArch64ISD::NEON_LD2LN_UPD; - NumVecs = 2; isLaneOp = true; break; - case Intrinsic::arm_neon_vld3lane: NewOpc = AArch64ISD::NEON_LD3LN_UPD; - NumVecs = 3; isLaneOp = true; break; - case Intrinsic::arm_neon_vld4lane: NewOpc = AArch64ISD::NEON_LD4LN_UPD; - NumVecs = 4; isLaneOp = true; break; - case Intrinsic::arm_neon_vst2lane: NewOpc = AArch64ISD::NEON_ST2LN_UPD; - NumVecs = 2; isLoad = false; isLaneOp = true; break; - case Intrinsic::arm_neon_vst3lane: NewOpc = 
AArch64ISD::NEON_ST3LN_UPD; - NumVecs = 3; isLoad = false; isLaneOp = true; break; - case Intrinsic::arm_neon_vst4lane: NewOpc = AArch64ISD::NEON_ST4LN_UPD; - NumVecs = 4; isLoad = false; isLaneOp = true; break; - } - } else { - isLaneOp = true; - switch (N->getOpcode()) { - default: llvm_unreachable("unexpected opcode for Neon base update"); - case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD; - NumVecs = 2; break; - case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD; - NumVecs = 3; break; - case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD; - NumVecs = 4; break; - } - } +static bool isTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) + return false; + } + return true; +} - // Find the size of memory referenced by the load/store. - EVT VecTy; - if (isLoad) - VecTy = N->getValueType(0); - else - VecTy = N->getOperand(AddrOpIdx + 1).getValueType(); - unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; - if (isLaneOp) - NumBytes /= VecTy.getVectorNumElements(); +/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. +static bool isZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != Idx) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) + return false; + Idx += 1; + } - // If the increment is a constant, it must match the memory ref size. - SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); - if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { - uint32_t IncVal = CInc->getZExtValue(); - if (IncVal != NumBytes) - continue; - Inc = DAG.getTargetConstant(IncVal, MVT::i32); - } + return true; +} - // Create the new updating load/store node. - EVT Tys[6]; - unsigned NumResultVecs = (isLoad ? NumVecs : 0); - unsigned n; - for (n = 0; n < NumResultVecs; ++n) - Tys[n] = VecTy; - Tys[n++] = MVT::i64; - Tys[n] = MVT::Other; - SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs + 2); - SmallVector Ops; - Ops.push_back(N->getOperand(0)); // incoming chain - Ops.push_back(N->getOperand(AddrOpIdx)); - Ops.push_back(Inc); - for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) { - Ops.push_back(N->getOperand(i)); +/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, +static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned Half = VT.getVectorNumElements() / 2; + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned j = 0; j != 2; ++j) { + unsigned Idx = WhichResult; + for (unsigned i = 0; i != Half; ++i) { + int MIdx = M[i + j * Half]; + if (MIdx >= 0 && (unsigned)MIdx != Idx) + return false; + Idx += 2; } - MemIntrinsicSDNode *MemInt = cast(N); - SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, - Ops.data(), Ops.size(), - MemInt->getMemoryVT(), - MemInt->getMemOperand()); + } - // Update the uses. 
- std::vector NewResults; - for (unsigned i = 0; i < NumResultVecs; ++i) { - NewResults.push_back(SDValue(UpdN.getNode(), i)); - } - NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain - DCI.CombineTo(N, NewResults); - DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + return true; +} - break; +/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. +static bool isTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) + return false; } - return SDValue(); + return true; } -/// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) -/// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs. -/// If so, combine them to a vldN-dup operation and return true. -static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); +static bool isINSMask(ArrayRef M, int NumInputElements, + bool &DstIsLeft, int &Anomaly) { + if (M.size() != static_cast(NumInputElements)) + return false; - // Check if the VDUPLANE operand is a vldN-dup intrinsic. - SDNode *VLD = N->getOperand(0).getNode(); - if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) - return SDValue(); - unsigned NumVecs = 0; - unsigned NewOpc = 0; - unsigned IntNo = cast(VLD->getOperand(1))->getZExtValue(); - if (IntNo == Intrinsic::arm_neon_vld2lane) { - NumVecs = 2; - NewOpc = AArch64ISD::NEON_LD2DUP; - } else if (IntNo == Intrinsic::arm_neon_vld3lane) { - NumVecs = 3; - NewOpc = AArch64ISD::NEON_LD3DUP; - } else if (IntNo == Intrinsic::arm_neon_vld4lane) { - NumVecs = 4; - NewOpc = AArch64ISD::NEON_LD4DUP; - } else { - return SDValue(); - } + int NumLHSMatch = 0, NumRHSMatch = 0; + int LastLHSMismatch = -1, LastRHSMismatch = -1; - // First check that all the vldN-lane uses are VDUPLANEs and that the lane - // numbers match the load. - unsigned VLDLaneNo = - cast(VLD->getOperand(NumVecs + 3))->getZExtValue(); - for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); - UI != UE; ++UI) { - // Ignore uses of the chain result. - if (UI.getUse().getResNo() == NumVecs) + for (int i = 0; i < NumInputElements; ++i) { + if (M[i] == -1) { + ++NumLHSMatch; + ++NumRHSMatch; continue; - SDNode *User = *UI; - if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE || - VLDLaneNo != cast(User->getOperand(1))->getZExtValue()) - return SDValue(); - } + } - // Create the vldN-dup node. - EVT Tys[5]; - unsigned n; - for (n = 0; n < NumVecs; ++n) - Tys[n] = VT; - Tys[n] = MVT::Other; - SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1); - SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; - MemIntrinsicSDNode *VLDMemInt = cast(VLD); - SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2, - VLDMemInt->getMemoryVT(), - VLDMemInt->getMemOperand()); - - // Update the uses. - for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); - UI != UE; ++UI) { - unsigned ResNo = UI.getUse().getResNo(); - // Ignore uses of the chain result. 
- if (ResNo == NumVecs) - continue; - SDNode *User = *UI; - DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); + if (M[i] == i) + ++NumLHSMatch; + else + LastLHSMismatch = i; + + if (M[i] == i + NumInputElements) + ++NumRHSMatch; + else + LastRHSMismatch = i; } - // Now the vldN-lane intrinsic is dead except for its chain result. - // Update uses of the chain. - std::vector VLDDupResults; - for (unsigned n = 0; n < NumVecs; ++n) - VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); - VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); - DCI.CombineTo(VLD, VLDDupResults); + if (NumLHSMatch == NumInputElements - 1) { + DstIsLeft = true; + Anomaly = LastLHSMismatch; + return true; + } else if (NumRHSMatch == NumInputElements - 1) { + DstIsLeft = false; + Anomaly = LastRHSMismatch; + return true; + } - return SDValue(N, 0); + return false; } -// v1i1 setcc -> -// v1i1 (bitcast (i1 setcc (extract_vector_elt, extract_vector_elt)) -// FIXME: Currently the type legalizer can't handle SETCC having v1i1 as result. -// If it can legalize "v1i1 SETCC" correctly, no need to combine such SETCC. -static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { - EVT ResVT = N->getValueType(0); - - if (!ResVT.isVector() || ResVT.getVectorNumElements() != 1 || - ResVT.getVectorElementType() != MVT::i1) - return SDValue(); - - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - EVT CmpVT = LHS.getValueType(); - LHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), - CmpVT.getVectorElementType(), LHS, - DAG.getConstant(0, MVT::i64)); - RHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), - CmpVT.getVectorElementType(), RHS, - DAG.getConstant(0, MVT::i64)); - SDValue SetCC = - DAG.getSetCC(SDLoc(N), MVT::i1, LHS, RHS, - cast(N->getOperand(2))->get()); - return DAG.getNode(ISD::BITCAST, SDLoc(N), ResVT, SetCC); -} +static bool isConcatMask(ArrayRef Mask, EVT VT, bool SplitLHS) { + if (VT.getSizeInBits() != 128) + return false; -// vselect (v1i1 setcc) -> -// vselect (v1iXX setcc) (XX is the size of the compared operand type) -// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as -// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine -// such VSELECT. -static SDValue PerformVSelectCombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); - EVT CCVT = N0.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); - if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || - CCVT.getVectorElementType() != MVT::i1) - return SDValue(); + for (int I = 0, E = NumElts / 2; I != E; I++) { + if (Mask[I] != I) + return false; + } - EVT ResVT = N->getValueType(0); - EVT CmpVT = N0.getOperand(0).getValueType(); - // Only combine when the result type is of the same size as the compared - // operands. 
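// ---- Illustrative aside (not part of the patch) --------------------------
// For a 4-element shuffle the predicates above accept exactly these masks
// (WhichResult picks the "1" or "2" form of the instruction):
//   ZIP1 <0,4,1,5>  ZIP2 <2,6,3,7>   interleave the low / high halves
//   UZP1 <0,2,4,6>  UZP2 <1,3,5,7>   keep even / odd elements
//   TRN1 <0,4,2,6>  TRN2 <1,5,3,7>   transpose element pairs
// and the *_v_undef variants accept the same shapes folded onto one source,
// e.g. ZIP1 of <v,v> becomes <0,0,1,1>. isINSMask instead looks for an
// identity mask with exactly one mismatch, which becomes a single INS.
// A plain C++ restatement of that last test (the helper name is mine):
#include <vector>

static bool matchesInsPattern(const std::vector<int> &M, int N,
                              bool &DstIsLeft, int &Anomaly) {
  int LHSMatch = 0, RHSMatch = 0, LastL = -1, LastR = -1;
  for (int i = 0; i < N; ++i) {
    if (M[i] == -1) { ++LHSMatch; ++RHSMatch; continue; } // undef fits both
    if (M[i] == i) ++LHSMatch; else LastL = i;
    if (M[i] == i + N) ++RHSMatch; else LastR = i;
  }
  if (LHSMatch == N - 1) { DstIsLeft = true;  Anomaly = LastL; return true; }
  if (RHSMatch == N - 1) { DstIsLeft = false; Anomaly = LastR; return true; }
  return false;
}
// e.g. <0,1,6,3> over 4 elements: the destination is the left vector and the
// anomaly lane 2 is filled from lane 6-4=2 of the right vector.
// ---------------------------------------------------------------------------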
- if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) - return SDValue(); + int Offset = NumElts / 2; + for (int I = NumElts / 2, E = NumElts; I != E; I++) { + if (Mask[I] != I + SplitLHS * Offset) + return false; + } - SDValue IfTrue = N->getOperand(1); - SDValue IfFalse = N->getOperand(2); - SDValue SetCC = - DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), - N0.getOperand(0), N0.getOperand(1), - cast(N0.getOperand(2))->get()); - return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, - IfTrue, IfFalse); + return true; } -// sign_extend (extract_vector_elt (v1i1 setcc)) -> -// extract_vector_elt (v1iXX setcc) -// (XX is the size of the compared operand type) -static SDValue PerformSignExtendCombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); - SDValue Vec = N0.getOperand(0); +static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue V0 = Op.getOperand(0); + SDValue V1 = Op.getOperand(1); + ArrayRef Mask = cast(Op)->getMask(); - if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - Vec.getOpcode() != ISD::SETCC) + if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || + VT.getVectorElementType() != V1.getValueType().getVectorElementType()) return SDValue(); - EVT ResVT = N->getValueType(0); - EVT CmpVT = Vec.getOperand(0).getValueType(); - // Only optimize when the result type is of the same size as the element - // type of the compared operand. - if (ResVT.getSizeInBits() != CmpVT.getVectorElementType().getSizeInBits()) - return SDValue(); + bool SplitV0 = V0.getValueType().getSizeInBits() == 128; - SDValue Lane = N0.getOperand(1); - SDValue SetCC = - DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), - Vec.getOperand(0), Vec.getOperand(1), - cast(Vec.getOperand(2))->get()); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ResVT, - SetCC, Lane); -} + if (!isConcatMask(Mask, VT, SplitV0)) + return SDValue(); -SDValue -AArch64TargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - switch (N->getOpcode()) { - default: break; - case ISD::AND: return PerformANDCombine(N, DCI); - case ISD::OR: return PerformORCombine(N, DCI, getSubtarget()); - case ISD::SHL: - case ISD::SRA: - case ISD::SRL: - return PerformShiftCombine(N, DCI, getSubtarget()); - case ISD::SETCC: return PerformSETCCCombine(N, DCI.DAG); - case ISD::VSELECT: return PerformVSelectCombine(N, DCI.DAG); - case ISD::SIGN_EXTEND: return PerformSignExtendCombine(N, DCI.DAG); - case ISD::INTRINSIC_WO_CHAIN: - return PerformIntrinsicCombine(N, DCI.DAG); - case AArch64ISD::NEON_VDUPLANE: - return CombineVLDDUP(N, DCI); - case AArch64ISD::NEON_LD2DUP: - case AArch64ISD::NEON_LD3DUP: - case AArch64ISD::NEON_LD4DUP: - return CombineBaseUpdate(N, DCI); - case ISD::INTRINSIC_VOID: - case ISD::INTRINSIC_W_CHAIN: - switch (cast(N->getOperand(1))->getZExtValue()) { - case Intrinsic::arm_neon_vld1: - case Intrinsic::arm_neon_vld2: - case Intrinsic::arm_neon_vld3: - case Intrinsic::arm_neon_vld4: - case Intrinsic::arm_neon_vst1: - case Intrinsic::arm_neon_vst2: - case Intrinsic::arm_neon_vst3: - case Intrinsic::arm_neon_vst4: - case Intrinsic::arm_neon_vld2lane: - case Intrinsic::arm_neon_vld3lane: - case Intrinsic::arm_neon_vld4lane: - case Intrinsic::aarch64_neon_vld1x2: - case Intrinsic::aarch64_neon_vld1x3: - case Intrinsic::aarch64_neon_vld1x4: - case Intrinsic::aarch64_neon_vst1x2: - case Intrinsic::aarch64_neon_vst1x3: - case Intrinsic::aarch64_neon_vst1x4: - case 
Intrinsic::arm_neon_vst2lane: - case Intrinsic::arm_neon_vst3lane: - case Intrinsic::arm_neon_vst4lane: - return CombineBaseUpdate(N, DCI); - default: - break; - } + EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + if (SplitV0) { + V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, + DAG.getConstant(0, MVT::i64)); } - return SDValue(); + if (V1.getValueType().getSizeInBits() == 128) { + V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, + DAG.getConstant(0, MVT::i64)); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); } -bool -AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - VT = VT.getScalarType(); +/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit +/// the specified operations to build the shuffle. +static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, + SDValue RHS, SelectionDAG &DAG, + SDLoc dl) { + unsigned OpNum = (PFEntry >> 26) & 0x0F; + unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); + unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); + + enum { + OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> + OP_VREV, + OP_VDUP0, + OP_VDUP1, + OP_VDUP2, + OP_VDUP3, + OP_VEXT1, + OP_VEXT2, + OP_VEXT3, + OP_VUZPL, // VUZP, left result + OP_VUZPR, // VUZP, right result + OP_VZIPL, // VZIP, left result + OP_VZIPR, // VZIP, right result + OP_VTRNL, // VTRN, left result + OP_VTRNR // VTRN, right result + }; + + if (OpNum == OP_COPY) { + if (LHSID == (1 * 9 + 2) * 9 + 3) + return LHS; + assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); + return RHS; + } - if (!VT.isSimple()) - return false; + SDValue OpLHS, OpRHS; + OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); + OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); + EVT VT = OpLHS.getValueType(); - switch (VT.getSimpleVT().SimpleTy) { - case MVT::f16: - case MVT::f32: - case MVT::f64: - return true; - case MVT::f128: - return false; + switch (OpNum) { default: - break; - } + llvm_unreachable("Unknown shuffle opcode!"); + case OP_VREV: + // VREV divides the vector in half and swaps within the half. 
+ if (VT.getVectorElementType() == MVT::i32 || + VT.getVectorElementType() == MVT::f32) + return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); + // vrev <4 x i16> -> REV32 + if (VT.getVectorElementType() == MVT::i16) + return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); + // vrev <4 x i8> -> REV16 + assert(VT.getVectorElementType() == MVT::i8); + return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); + case OP_VDUP0: + case OP_VDUP1: + case OP_VDUP2: + case OP_VDUP3: { + EVT EltTy = VT.getVectorElementType(); + unsigned Opcode; + if (EltTy == MVT::i8) + Opcode = AArch64ISD::DUPLANE8; + else if (EltTy == MVT::i16) + Opcode = AArch64ISD::DUPLANE16; + else if (EltTy == MVT::i32 || EltTy == MVT::f32) + Opcode = AArch64ISD::DUPLANE32; + else if (EltTy == MVT::i64 || EltTy == MVT::f64) + Opcode = AArch64ISD::DUPLANE64; + else + llvm_unreachable("Invalid vector element type?"); - return false; + if (VT.getSizeInBits() == 64) + OpLHS = WidenVector(OpLHS, DAG); + SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64); + return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); + } + case OP_VEXT1: + case OP_VEXT2: + case OP_VEXT3: { + unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); + return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, + DAG.getConstant(Imm, MVT::i32)); + } + case OP_VUZPL: + return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VUZPR: + return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VZIPL: + return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VZIPR: + return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VTRNL: + return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VTRNR: + return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + } } -// Check whether a shuffle_vector could be presented as concat_vector. -bool AArch64TargetLowering::isConcatVector(SDValue Op, SelectionDAG &DAG, - SDValue V0, SDValue V1, - const int *Mask, - SDValue &Res) const { + +static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, + SelectionDAG &DAG) { + // Check to see if we can use the TBL instruction. 
+ SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); SDLoc DL(Op); - EVT VT = Op.getValueType(); - if (VT.getSizeInBits() != 128) - return false; - if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || - VT.getVectorElementType() != V1.getValueType().getVectorElementType()) - return false; - unsigned NumElts = VT.getVectorNumElements(); - bool isContactVector = true; - bool splitV0 = false; - if (V0.getValueType().getSizeInBits() == 128) - splitV0 = true; + EVT EltVT = Op.getValueType().getVectorElementType(); + unsigned BytesPerElt = EltVT.getSizeInBits() / 8; - for (int I = 0, E = NumElts / 2; I != E; I++) { - if (Mask[I] != I) { - isContactVector = false; - break; + SmallVector TBLMask; + for (int Val : ShuffleMask) { + for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { + unsigned Offset = Byte + Val * BytesPerElt; + TBLMask.push_back(DAG.getConstant(Offset, MVT::i32)); } } - if (isContactVector) { - int offset = NumElts / 2; - for (int I = NumElts / 2, E = NumElts; I != E; I++) { - if (Mask[I] != I + splitV0 * offset) { - isContactVector = false; - break; - } - } + MVT IndexVT = MVT::v8i8; + unsigned IndexLen = 8; + if (Op.getValueType().getSizeInBits() == 128) { + IndexVT = MVT::v16i8; + IndexLen = 16; } - if (isContactVector) { - EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - NumElts / 2); - if (splitV0) { - V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, - DAG.getConstant(0, MVT::i64)); - } - if (V1.getValueType().getSizeInBits() == 128) { - V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, - DAG.getConstant(0, MVT::i64)); + SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); + SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); + + SDValue Shuffle; + if (V2.getNode()->getOpcode() == ISD::UNDEF) { + if (IndexLen == 8) + V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); + } else { + if (IndexLen == 8) { + V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); + } else { + // FIXME: We cannot, for the moment, emit a TBL2 instruction because we + // cannot currently represent the register constraints on the input + // table registers. + // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, + // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + // &TBLMask[0], IndexLen)); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); } - Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); - return true; } - return false; + return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); } -// Check whether a Build Vector could be presented as Shuffle Vector. -// This Shuffle Vector maybe not legalized, so the length of its operand and -// the length of result may not equal. 
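// ---- Illustrative aside (not part of the patch) --------------------------
// TBL indexes bytes rather than elements, so GenerateTBL above widens each
// element index into BytesPerElt consecutive byte offsets. For a <4 x i16>
// mask <3,1,2,0> that yields the byte mask 6 7 2 3 4 5 0 1:
#include <cstdio>
#include <vector>

int main() {
  const int Mask[] = {3, 1, 2, 0};
  const unsigned BytesPerElt = 2; // i16 elements
  std::vector<unsigned> TBLMask;
  for (int Val : Mask)
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte)
      TBLMask.push_back(Byte + Val * BytesPerElt); // same formula as above
  for (unsigned B : TBLMask)
    printf("%u ", B);
  printf("\n");
  return 0;
}
// ---------------------------------------------------------------------------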
-bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, - SDValue &V0, SDValue &V1, - int *Mask) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - unsigned NumElts = VT.getVectorNumElements(); - unsigned V0NumElts = 0; - - // Check if all elements are extracted from less than 3 vectors. - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Elt = Op.getOperand(i); - if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - Elt.getOperand(0).getValueType().getVectorElementType() != - VT.getVectorElementType()) - return false; - - if (V0.getNode() == 0) { - V0 = Elt.getOperand(0); - V0NumElts = V0.getValueType().getVectorNumElements(); - } - if (Elt.getOperand(0) == V0) { - Mask[i] = (cast(Elt->getOperand(1))->getZExtValue()); - continue; - } else if (V1.getNode() == 0) { - V1 = Elt.getOperand(0); - } - if (Elt.getOperand(0) == V1) { - unsigned Lane = cast(Elt->getOperand(1))->getZExtValue(); - Mask[i] = (Lane + V0NumElts); - continue; - } else { - return false; - } - } - return true; +static unsigned getDUPLANEOp(EVT EltType) { + if (EltType == MVT::i8) + return AArch64ISD::DUPLANE8; + if (EltType == MVT::i16) + return AArch64ISD::DUPLANE16; + if (EltType == MVT::i32 || EltType == MVT::f32) + return AArch64ISD::DUPLANE32; + if (EltType == MVT::i64 || EltType == MVT::f64) + return AArch64ISD::DUPLANE64; + + llvm_unreachable("Invalid vector element type?"); } -// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which returns two -/// i64 values and take a 2 x i64 value to shift plus a shift amount. -SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getNumOperands() == 3 && "Not a quad-shift!"); +SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); EVT VT = Op.getValueType(); - unsigned VTBits = VT.getSizeInBits(); - SDLoc dl(Op); - SDValue ShOpLo = Op.getOperand(0); - SDValue ShOpHi = Op.getOperand(1); - SDValue ShAmt = Op.getOperand(2); - unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; - assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); - SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, - DAG.getConstant(VTBits, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); - SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, - DAG.getConstant(VTBits, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); - SDValue Tmp3 = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + ShuffleVectorSDNode *SVN = cast(Op.getNode()); + + // Convert shuffles that are directly supported on NEON to target-specific + // DAG nodes, instead of keeping them as shuffles and matching them again + // during code selection. This is more efficient and avoids the possibility + // of inconsistencies between legalization and selection. + ArrayRef ShuffleMask = SVN->getMask(); - SDValue A64cc; - SDValue CmpOp = getSelectableIntSetCC(ExtraShAmt, - DAG.getConstant(0, MVT::i64), - ISD::SETGE, A64cc, - DAG, dl); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], + V1.getValueType().getSimpleVT())) { + int Lane = SVN->getSplatIndex(); + // If this is undef splat, generate it via "just" vdup, if possible. 
+ if (Lane == -1)
+ Lane = 0;
+
+ if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
+ V1.getOperand(0));
+ // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
+ // constant. If so, we can just reference the lane's definition directly.
+ if (V1.getOpcode() == ISD::BUILD_VECTOR &&
+ !isa<ConstantSDNode>(V1.getOperand(Lane)))
+ return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
+
+ // Otherwise, duplicate from the lane of the input vector.
+ unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
+
+ // SelectionDAGBuilder may have "helpfully" already extracted or concatenated
+ // to make a vector of the same size as this SHUFFLE. We can ignore the
+ // extract entirely, and canonicalise the concat using WidenVector.
+ if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+ V1 = V1.getOperand(0);
+ } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
+ unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
+ Lane -= Idx * VT.getVectorNumElements() / 2;
+ V1 = WidenVector(V1.getOperand(Idx), DAG);
+ } else if (VT.getSizeInBits() == 64)
+ V1 = WidenVector(V1, DAG);
+
+ return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64));
+ }
- SDValue Hi = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
- DAG.getConstant(0, Tmp3.getValueType()), Tmp3,
- A64cc);
- SDValue Lo = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
- TrueVal, FalseVal, A64cc);
+ if (isREVMask(ShuffleMask, VT, 64))
+ return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
+ if (isREVMask(ShuffleMask, VT, 32))
+ return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
+ if (isREVMask(ShuffleMask, VT, 16))
+ return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
+
+ bool ReverseEXT = false;
+ unsigned Imm;
+ if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
+ if (ReverseEXT)
+ std::swap(V1, V2);
+ Imm *= getExtFactor(V1);
+ return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
+ DAG.getConstant(Imm, MVT::i32));
+ } else if (V2->getOpcode() == ISD::UNDEF &&
+ isSingletonEXTMask(ShuffleMask, VT, Imm)) {
+ Imm *= getExtFactor(V1);
+ return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
+ DAG.getConstant(Imm, MVT::i32));
+ }
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, 2, dl);
+ unsigned WhichResult;
+ if (isZIPMask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+ }
+ if (isUZPMask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+ }
+ if (isTRNMask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+ }
+
+ if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+ }
+ if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+ return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+ }
+ if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? 
AArch64ISD::TRN1 : AArch64ISD::TRN2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); + } + + SDValue Concat = tryFormConcatFromShuffle(Op, DAG); + if (Concat.getNode()) + return Concat; + + bool DstIsLeft; + int Anomaly; + int NumInputElements = V1.getValueType().getVectorNumElements(); + if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { + SDValue DstVec = DstIsLeft ? V1 : V2; + SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64); + + SDValue SrcVec = V1; + int SrcLane = ShuffleMask[Anomaly]; + if (SrcLane >= NumInputElements) { + SrcVec = V2; + SrcLane -= VT.getVectorNumElements(); + } + SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64); + + EVT ScalarVT = VT.getVectorElementType(); + if (ScalarVT.getSizeInBits() < 32) + ScalarVT = MVT::i32; + + return DAG.getNode( + ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), + DstLaneV); + } + + // If the shuffle is not directly supported and it has 4 elements, use + // the PerfectShuffle-generated table to synthesize it from other shuffles. + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 4) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (ShuffleMask[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = ShuffleMask[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + } + + return GenerateTBL(Op, ShuffleMask, DAG); } -/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two -/// i64 values and take a 2 x i64 value to shift plus a shift amount. 
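// ---- Illustrative aside (not part of the patch) --------------------------
// How the 32-bit perfect-shuffle entries used above are packed: four mask
// indices (0-7, with 8 meaning undef) form a base-9 index into the table,
// and each entry carries a cost plus an opcode and two operand IDs that
// GeneratePerfectShuffle expands recursively. Helper names are mine:
#include <cstdio>

struct PFFields { unsigned Cost, OpNum, LHSID, RHSID; };

static PFFields decodePFEntry(unsigned PFEntry) {
  PFFields F;
  F.Cost = PFEntry >> 30;             // compared against 4 before expanding
  F.OpNum = (PFEntry >> 26) & 0x0F;   // OP_COPY, OP_VREV, OP_VDUP0, ...
  F.LHSID = (PFEntry >> 13) & 0x1FFF; // left sub-problem
  F.RHSID = PFEntry & 0x1FFF;         // right sub-problem
  return F;
}

static unsigned pfTableIndex(const unsigned Idx[4]) { // each in 0..8
  return ((Idx[0] * 9 + Idx[1]) * 9 + Idx[2]) * 9 + Idx[3];
}

int main() {
  const unsigned M[4] = {0, 8, 2, 3}; // lane 1 undef
  printf("table index %u\n", pfTableIndex(M));
  return 0;
}
// ---------------------------------------------------------------------------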
-SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getNumOperands() == 3 && "Not a quad-shift!"); - EVT VT = Op.getValueType(); - unsigned VTBits = VT.getSizeInBits(); +static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, + APInt &UndefBits) { + EVT VT = BVN->getValueType(0); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; + + for (unsigned i = 0; i < NumSplats; ++i) { + CnstBits <<= SplatBitSize; + UndefBits <<= SplatBitSize; + CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); + UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); + } + + return true; + } + + return false; +} + +SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, + SelectionDAG &DAG) const { + BuildVectorSDNode *BVN = + dyn_cast(Op.getOperand(1).getNode()); + SDValue LHS = Op.getOperand(0); SDLoc dl(Op); - SDValue ShOpLo = Op.getOperand(0); - SDValue ShOpHi = Op.getOperand(1); - SDValue ShAmt = Op.getOperand(2); + EVT VT = Op.getValueType(); - assert(Op.getOpcode() == ISD::SHL_PARTS); - SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, - DAG.getConstant(VTBits, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); - SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, - DAG.getConstant(VTBits, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); - SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue Tmp4 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + if (!BVN) + return Op; - SDValue A64cc; - SDValue CmpOp = getSelectableIntSetCC(ExtraShAmt, - DAG.getConstant(0, MVT::i64), - ISD::SETGE, A64cc, - DAG, dl); + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We only have BIC vector immediate instruction, which is and-not. + CnstBits = ~CnstBits; + + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } - SDValue Lo = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp, - DAG.getConstant(0, Tmp4.getValueType()), Tmp4, - A64cc); - SDValue Hi = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp, - Tmp3, FalseVal, A64cc); + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } - SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, 2, dl); + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + } + + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = ~UndefBits; + goto AttemptModImm; + } + +// We can always fall back to a non-immediate AND. +FailedModImm: + return Op; } -// If this is a case we can't handle, return null and let the default -// expansion code take care of it. -SDValue -AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, - const AArch64Subtarget *ST) const { +// Specialized code to quickly find if PotentialBVec is a BuildVector that +// consists of only the same constant int value, returned in reference arg +// ConstVal +static bool isAllConstantBuildVector(const SDValue &PotentialBVec, + uint64_t &ConstVal) { + BuildVectorSDNode *Bvec = dyn_cast(PotentialBVec); + if (!Bvec) + return false; + ConstantSDNode *FirstElt = dyn_cast(Bvec->getOperand(0)); + if (!FirstElt) + return false; + EVT VT = Bvec->getValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + for (unsigned i = 1; i < NumElts; ++i) + if (dyn_cast(Bvec->getOperand(i)) != FirstElt) + return false; + ConstVal = FirstElt->getZExtValue(); + return true; +} - BuildVectorSDNode *BVN = cast(Op.getNode()); - SDLoc DL(Op); +static unsigned getIntrinsicID(const SDNode *N) { + unsigned Opcode = N->getOpcode(); + switch (Opcode) { + default: + return Intrinsic::not_intrinsic; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IID = cast(N->getOperand(0))->getZExtValue(); + if (IID < Intrinsic::num_intrinsics) + return IID; + return Intrinsic::not_intrinsic; + } + } +} + +// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), +// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a +// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. +// Also, logical shift right -> sri, with the same structure. 
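[Note: the LowerVectorAND hunk above inverts the constant (CnstBits = ~CnstBits) because AdvSIMD only has BIC, an AND-NOT, with a modified immediate. A one-line scalar check of that identity; bic is a hypothetical helper, not part of this patch.]

    #include <cassert>
    #include <cstdint>

    // "and x, C" is matched as "bic x, ~C": BIC clears the bits set in its
    // immediate, so inverting C before immediate-matching preserves meaning.
    static uint32_t bic(uint32_t x, uint32_t imm) { return x & ~imm; }

    int main() {
      const uint32_t X = 0x12345678, C = 0x00FF00FF;
      assert((X & C) == bic(X, ~C));
    }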
+static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  if (!VT.isVector())
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Is the first op an AND?
+  const SDValue And = N->getOperand(0);
+  if (And.getOpcode() != ISD::AND)
+    return SDValue();
+
+  // Is the second op a shl or lshr?
+  SDValue Shift = N->getOperand(1);
+  // This will have been turned into: AArch64ISD::VSHL vector, #shift
+  // or AArch64ISD::VLSHR vector, #shift
+  unsigned ShiftOpc = Shift.getOpcode();
+  if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
+    return SDValue();
+  bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
+
+  // Is the shift amount constant?
+  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+  if (!C2node)
+    return SDValue();
+
+  // Is the and mask vector all constant?
+  uint64_t C1;
+  if (!isAllConstantBuildVector(And.getOperand(1), C1))
+    return SDValue();
+
+  // Is C1 == ~C2, taking into account how much one can shift elements of a
+  // particular size?
+  uint64_t C2 = C2node->getZExtValue();
+  unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits();
+  if (C2 > ElemSizeInBits)
+    return SDValue();
+  unsigned ElemMask = (1 << ElemSizeInBits) - 1;
+  if ((C1 & ElemMask) != (~C2 & ElemMask))
+    return SDValue();
+
+  SDValue X = And.getOperand(0);
+  SDValue Y = Shift.getOperand(0);
+
+  unsigned Intrin =
+      IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
+  SDValue ResultSLI =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                  DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1));
+
+  DEBUG(dbgs() << "aarch64-lower: transformed: \n");
+  DEBUG(N->dump(&DAG));
+  DEBUG(dbgs() << "into: \n");
+  DEBUG(ResultSLI->dump(&DAG));
+
+  ++NumShiftInserts;
+  return ResultSLI;
+}
+
+SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
+  if (EnableAArch64SlrGeneration) {
+    SDValue Res = tryLowerToSLI(Op.getNode(), DAG);
+    if (Res.getNode())
+      return Res;
+  }
+
+  BuildVectorSDNode *BVN =
+      dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
+  SDValue LHS = Op.getOperand(1);
+  SDLoc dl(Op);
   EVT VT = Op.getValueType();
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
+  // OR commutes, so try swapping the operands.
+  if (!BVN) {
+    LHS = Op.getOperand(0);
+    BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+  }
+  if (!BVN)
+    return Op;
-  unsigned UseNeonMov = VT.getSizeInBits() >= 64;
-
-  // Note we favor lowering MOVI over MVNI.
-  // This has implications on the definition of patterns in TableGen to select
-  // BIC immediate instructions but not ORR immediate instructions.
-  // If this lowering order is changed, TableGen patterns for BIC immediate and
-  // ORR immediate instructions have to be updated.
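[Note: a scalar model of what the SLI/SRI instructions produced by tryLowerToSLI above compute per lane, assuming a shift amount strictly between 0 and the element width; the matcher's exact legality checks are in the hunk above, and this sketch is not part of the patch. The removed MOVI/MVNI lowering resumes below.]

    #include <cassert>
    #include <cstdint>

    // SLI keeps the low `sh` bits of x and shifts y into the rest; SRI is
    // the mirror image, keeping the high `sh` bits of x. (0 < sh < 8 here.)
    static uint8_t sli8(uint8_t x, uint8_t y, unsigned sh) {
      return static_cast<uint8_t>((y << sh) | (x & ((1u << sh) - 1)));
    }
    static uint8_t sri8(uint8_t x, uint8_t y, unsigned sh) {
      return static_cast<uint8_t>((y >> sh) | (x & ~(0xFFu >> sh)));
    }

    int main() {
      // (or (and x, 0x0F), (shl y, 4)) computes exactly sli8(x, y, 4).
      assert(((0xABu & 0x0Fu) | uint8_t(0x05u << 4)) == sli8(0xAB, 0x05, 4));
      assert(sri8(0xAB, 0x05, 4) == 0xA0);
    }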
- if (UseNeonMov && - BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - if (SplatBitSize <= 64) { - // First attempt to use vector immediate-form MOVI - EVT NeonMovVT; - unsigned Imm = 0; - unsigned OpCmode = 0; - - if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), - SplatBitSize, DAG, VT.is128BitVector(), - Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) { - SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32); - SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32); - - if (ImmVal.getNode() && OpCmodeVal.getNode()) { - SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT, - ImmVal, OpCmodeVal); - return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov); - } + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); } - // Then attempt to use vector immediate-form MVNI - uint64_t NegatedImm = (~SplatBits).getZExtValue(); - if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, - DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT, - Imm, OpCmode)) { - SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32); - SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32); - if (ImmVal.getNode() && OpCmodeVal.getNode()) { - SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT, - ImmVal, OpCmodeVal); - return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov); - } + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); } - // Attempt to use vector immediate-form FMOV - if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) || - (VT == MVT::v2f64 && SplatBitSize == 64)) { - APFloat RealVal( - SplatBitSize == 32 ? APFloat::IEEEsingle : APFloat::IEEEdouble, - SplatBits); - uint32_t ImmVal; - if (A64Imms::isFPImm(RealVal, ImmVal)) { - SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32); - return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val); - } + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); } } + + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = UndefBits; + goto AttemptModImm; } +// We can always fall back to a non-immediate OR. +FailedModImm: + return Op; +} + +SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + BuildVectorSDNode *BVN = cast(Op.getNode()); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + // Certain magic vector constants (used to express things like NOT + // and NEG) are passed through unmodified. This allows codegen patterns + // for these operations to match. Special-purpose patterns will lower + // these immediates to MOVIs if it proves necessary. + if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) + return Op; + + // The many faces of MOVI... + if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal); + if (VT.getSizeInBits() == 128) { + SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + // Support the V64 version via subregister insertion. + SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(264, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(272, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; + SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + // The few faces of FMOV... + if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32; + SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) && + VT.getSizeInBits() == 128) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal); + SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + // The many faces of MVNI... 
+ CnstVal = ~CnstVal; + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(264, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(272, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + } + + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = UndefBits; + goto AttemptModImm; + } +FailedModImm: + + // Scan through the operands to find some interesting properties we can + // exploit: + // 1) If only one value is used, we can use a DUP, or + // 2) if only the low element is not undef, we can just insert that, or + // 3) if only one constant value is used (w/ some non-constant lanes), + // we can splat the constant value into the whole vector then fill + // in the non-constant lanes. 
+ // 4) FIXME: If different constant values are used, but we can intelligently + // select the values we'll be overwriting for the non-constant + // lanes such that we can directly materialize the vector + // some other way (MOVI, e.g.), we can be sneaky. unsigned NumElts = VT.getVectorNumElements(); bool isOnlyLowElement = true; bool usesOnlyOneValue = true; - bool hasDominantValue = false; + bool usesOnlyOneConstantValue = true; bool isConstant = true; - - // Map of the number of times a particular SDValue appears in the - // element list. - DenseMap ValueCounts; + unsigned NumConstantLanes = 0; SDValue Value; + SDValue ConstantValue; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) @@ -4702,143 +5425,90 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (!isa(V) && !isa(V)) isConstant = false; - ValueCounts.insert(std::make_pair(V, 0)); - unsigned &Count = ValueCounts[V]; + if (isa(V) || isa(V)) { + ++NumConstantLanes; + if (!ConstantValue.getNode()) + ConstantValue = V; + else if (ConstantValue != V) + usesOnlyOneConstantValue = false; + } - // Is this value dominant? (takes up more than half of the lanes) - if (++Count > (NumElts / 2)) { - hasDominantValue = true; + if (!Value.getNode()) Value = V; - } + else if (V != Value) + usesOnlyOneValue = false; } - if (ValueCounts.size() != 1) - usesOnlyOneValue = false; - if (!Value.getNode() && ValueCounts.size() > 0) - Value = ValueCounts.begin()->first; - if (ValueCounts.size() == 0) + if (!Value.getNode()) return DAG.getUNDEF(VT); if (isOnlyLowElement) - return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value); + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - if (hasDominantValue && EltSize <= 64) { - // Use VDUP for non-constant splats. + // Use DUP for non-constant splats. For f32 constant splats, reduce to + // i32 and try again. + if (usesOnlyOneValue) { if (!isConstant) { - SDValue N; - - // If we are DUPing a value that comes directly from a vector, we could - // just use DUPLANE. We can only do this if the lane being extracted - // is at a constant index, as the DUP from lane instructions only have - // constant-index forms. - // - // If there is a TRUNCATE between EXTRACT_VECTOR_ELT and DUP, we can - // remove TRUNCATE for DUPLANE by apdating the source vector to - // appropriate vector type and lane index. - // - // FIXME: for now we have v1i8, v1i16, v1i32 legal vector types, if they - // are not legal any more, no need to check the type size in bits should - // be large than 64. - SDValue V = Value; - if (Value->getOpcode() == ISD::TRUNCATE) - V = Value->getOperand(0); - if (V->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa(V->getOperand(1)) && - V->getOperand(0).getValueType().getSizeInBits() >= 64) { - - // If the element size of source vector is larger than DUPLANE - // element size, we can do transformation by, - // 1) bitcasting source register to smaller element vector - // 2) mutiplying the lane index by SrcEltSize/ResEltSize - // For example, we can lower - // "v8i16 vdup_lane(v4i32, 1)" - // to be - // "v8i16 vdup_lane(v8i16 bitcast(v4i32), 2)". 
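[Note: the added scan above classifies the BUILD_VECTOR operands (only-low-element, single value, constant lanes) before choosing a lowering strategy; a rough scalar model of that bookkeeping, with -1 standing in for undef. Not part of the patch; the removed DUPLANE handling continues below.]

    #include <cstdio>

    int main() {
      const int Lanes[] = {5, 5, -1, 5}; // -1 marks an undef lane
      const unsigned NumElts = 4;
      bool isOnlyLowElement = true, usesOnlyOneValue = true;
      unsigned NumConstantLanes = 0;
      int Value = -1;
      for (unsigned i = 0; i < NumElts; ++i) {
        if (Lanes[i] == -1)
          continue;                 // undef lanes don't participate
        if (i > 0)
          isOnlyLowElement = false;
        ++NumConstantLanes;         // every defined lane is "constant" here
        if (Value == -1)
          Value = Lanes[i];
        else if (Lanes[i] != Value)
          usesOnlyOneValue = false;
      }
      std::printf("low-only=%d one-value=%d const-lanes=%u\n",
                  isOnlyLowElement, usesOnlyOneValue, NumConstantLanes);
    }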
- SDValue SrcVec = V->getOperand(0); - unsigned SrcEltSize = - SrcVec.getValueType().getVectorElementType().getSizeInBits(); - unsigned ResEltSize = VT.getVectorElementType().getSizeInBits(); - if (SrcEltSize > ResEltSize) { - assert((SrcEltSize % ResEltSize == 0) && "Invalid element size"); - SDValue BitCast; - unsigned SrcSize = SrcVec.getValueType().getSizeInBits(); - unsigned ResSize = VT.getSizeInBits(); - - if (SrcSize > ResSize) { - assert((SrcSize % ResSize == 0) && "Invalid vector size"); - EVT CastVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - SrcSize / ResEltSize); - BitCast = DAG.getNode(ISD::BITCAST, DL, CastVT, SrcVec); - } else { - assert((SrcSize == ResSize) && "Invalid vector size of source vec"); - BitCast = DAG.getNode(ISD::BITCAST, DL, VT, SrcVec); - } + if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Value.getValueType() != VT) + return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); - unsigned LaneIdx = V->getConstantOperandVal(1); - SDValue Lane = - DAG.getConstant((SrcEltSize / ResEltSize) * LaneIdx, MVT::i64); - N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, BitCast, Lane); - } else { - assert((SrcEltSize == ResEltSize) && - "Invalid element size of source vec"); - N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, V->getOperand(0), - V->getOperand(1)); - } - } else - N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value); - - if (!usesOnlyOneValue) { - // The dominant value was splatted as 'N', but we now have to insert - // all differing elements. - for (unsigned I = 0; I < NumElts; ++I) { - if (Op.getOperand(I) == Value) - continue; - SmallVector Ops; - Ops.push_back(N); - Ops.push_back(Op.getOperand(I)); - Ops.push_back(DAG.getConstant(I, MVT::i64)); - N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3); - } - } - return N; + // This is actually a DUPLANExx operation, which keeps everything vectory. + + // DUPLANE works on 128-bit vectors, widen it if necessary. + SDValue Lane = Value.getOperand(1); + Value = Value.getOperand(0); + if (Value.getValueType().getSizeInBits() == 64) + Value = WidenVector(Value, DAG); + + unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); + return DAG.getNode(Opcode, dl, VT, Value, Lane); + } + + if (VT.getVectorElementType().isFloatingPoint()) { + SmallVector Ops; + MVT NewType = + (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); + SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); + Val = LowerBUILD_VECTOR(Val, DAG); + if (Val.getNode()) + return DAG.getNode(ISD::BITCAST, dl, VT, Val); } - if (usesOnlyOneValue && isConstant) { - return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value); + } + + // If there was only one constant value used and for more than one lane, + // start by splatting that value, then replace the non-constant lanes. This + // is better than the default, which will perform a separate initialization + // for each lane. + if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { + SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); + // Now insert the non-constant lanes. 
+ for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + SDValue LaneIdx = DAG.getConstant(i, MVT::i64); + if (!isa(V) && !isa(V)) { + // Note that type legalization likely mucked about with the VT of the + // source operand, so we may have to convert it here before inserting. + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); + } } + return Val; } + // If all elements are constants and the case above didn't get hit, fall back // to the default expansion, which will generate a load from the constant // pool. if (isConstant) return SDValue(); - // Try to lower this in lowering ShuffleVector way. - SDValue V0, V1; - int Mask[16]; - if (isKnownShuffleVector(Op, DAG, V0, V1, Mask)) { - unsigned V0NumElts = V0.getValueType().getVectorNumElements(); - if (!V1.getNode() && V0NumElts == NumElts * 2) { - V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0, - DAG.getConstant(NumElts, MVT::i64)); - V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0, - DAG.getConstant(0, MVT::i64)); - V0NumElts = V0.getValueType().getVectorNumElements(); - } - - if (V1.getNode() && NumElts == V0NumElts && - V0NumElts == V1.getValueType().getVectorNumElements()) { - SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask); - if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) - return Shuffle; - else - return LowerVECTOR_SHUFFLE(Shuffle, DAG); - } else { - SDValue Res; - if (isConcatVector(Op, DAG, V0, V1, Mask, Res)) - return Res; - } + // Empirical tests suggest this is rarely worth it for vectors of length <= 2. + if (NumElts >= 4) { + SDValue shuffle = ReconstructShuffle(Op, DAG); + if (shuffle != SDValue()) + return shuffle; } // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we @@ -4849,550 +5519,2419 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // on the stack followed by a load for everything else. if (!isConstant && !usesOnlyOneValue) { SDValue Vec = DAG.getUNDEF(VT); - for (unsigned i = 0 ; i < NumElts; ++i) { + SDValue Op0 = Op.getOperand(0); + unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); + unsigned i = 0; + // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to + // a) Avoid a RMW dependency on the full vector register, and + // b) Allow the register coalescer to fold away the copy if the + // value is already in an S or D register. + if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) { + unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; + MachineSDNode *N = + DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, + DAG.getTargetConstant(SubIdx, MVT::i32)); + Vec = SDValue(N, 0); + ++i; + } + for (; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) continue; SDValue LaneIdx = DAG.getConstant(i, MVT::i64); - Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); } return Vec; } + + // Just use the default expansion. We failed to find a better alternative. return SDValue(); } -/// isREVMask - Check if a vector shuffle corresponds to a REV -/// instruction with the specified blocksize. (The order of the elements -/// within each block of the vector is reversed.) 
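[Note: a standalone illustration of the mask shape the isREVMask predicate accepts (its removed body follows; the check itself is still used by the added isShuffleMaskLegal later in this patch): within each block of BlockSize bits, lane order is reversed. Not part of the patch.]

    #include <cstdio>

    int main() {
      // REV32 on 8-bit lanes of a 64-bit vector: 4 lanes per 32-bit block.
      const unsigned EltSz = 8, BlockSize = 32, NumElts = 8;
      const unsigned BlockElts = BlockSize / EltSz;
      for (unsigned i = 0; i < NumElts; ++i)
        std::printf("%u ",
                    (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts));
      std::printf("\n"); // prints the accepted mask: 3 2 1 0 7 6 5 4
    }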
-static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { - assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && - "Only possible block sizes for REV are: 16, 32, 64"); +SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); - unsigned EltSz = VT.getVectorElementType().getSizeInBits(); - if (EltSz == 64) - return false; + // Check for non-constant lane. + if (!isa(Op.getOperand(2))) + return SDValue(); - unsigned NumElts = VT.getVectorNumElements(); - unsigned BlockElts = M[0] + 1; - // If the first shuffle index is UNDEF, be optimistic. - if (M[0] < 0) - BlockElts = BlockSize / EltSz; + EVT VT = Op.getOperand(0).getValueType(); - if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) - return false; + // Insertion/extraction are legal for V128 types. + if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || + VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) + return Op; - for (unsigned i = 0; i < NumElts; ++i) { - if (M[i] < 0) - continue; // ignore UNDEF indices - if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) - return false; - } + if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && + VT != MVT::v1i64 && VT != MVT::v2f32) + return SDValue(); - return true; + // For V64 types, we perform insertion by expanding the value + // to a V128 type and perform the insertion on that. + SDLoc DL(Op); + SDValue WideVec = WidenVector(Op.getOperand(0), DAG); + EVT WideTy = WideVec.getValueType(); + + SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, + Op.getOperand(1), Op.getOperand(2)); + // Re-narrow the resultant vector. + return NarrowVector(Node, DAG); +} + +SDValue +AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); + + // Check for non-constant lane. + if (!isa(Op.getOperand(1))) + return SDValue(); + + EVT VT = Op.getOperand(0).getValueType(); + + // Insertion/extraction are legal for V128 types. + if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || + VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) + return Op; + + if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && + VT != MVT::v1i64 && VT != MVT::v2f32) + return SDValue(); + + // For V64 types, we perform extraction by expanding the value + // to a V128 type and perform the extraction on that. + SDLoc DL(Op); + SDValue WideVec = WidenVector(Op.getOperand(0), DAG); + EVT WideTy = WideVec.getValueType(); + + EVT ExtrTy = WideTy.getVectorElementType(); + if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) + ExtrTy = MVT::i32; + + // For extractions, we just return the result directly. + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, + Op.getOperand(1)); +} + +SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getOperand(0).getValueType(); + SDLoc dl(Op); + // Just in case... 
+ if (!VT.isVector()) + return SDValue(); + + ConstantSDNode *Cst = dyn_cast(Op.getOperand(1)); + if (!Cst) + return SDValue(); + unsigned Val = Cst->getZExtValue(); + + unsigned Size = Op.getValueType().getSizeInBits(); + if (Val == 0) { + switch (Size) { + case 8: + return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(), + Op.getOperand(0)); + case 16: + return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(), + Op.getOperand(0)); + case 32: + return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(), + Op.getOperand(0)); + case 64: + return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(), + Op.getOperand(0)); + default: + llvm_unreachable("Unexpected vector type in extract_subvector!"); + } + } + // If this is extracting the upper 64-bits of a 128-bit vector, we match + // that directly. + if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) + return Op; + + return SDValue(); +} + +bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, + EVT VT) const { + if (VT.getVectorNumElements() == 4 && + (VT.is128BitVector() || VT.is64BitVector())) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (M[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = M[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return true; + } + + bool DummyBool; + int DummyInt; + unsigned DummyUnsigned; + + return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || + isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || + isEXTMask(M, VT, DummyBool, DummyUnsigned) || + // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. + isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || + isZIPMask(M, VT, DummyUnsigned) || + isTRN_v_undef_Mask(M, VT, DummyUnsigned) || + isUZP_v_undef_Mask(M, VT, DummyUnsigned) || + isZIP_v_undef_Mask(M, VT, DummyUnsigned) || + isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || + isConcatMask(M, VT, VT.getSizeInBits() == 128)); +} + +/// getVShiftImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift operation, where all the elements of the +/// build_vector must have the same constant integer value. +static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { + // Ignore bit_converts. + while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, + HasAnyUndefs, ElementBits) || + SplatBitSize > ElementBits) + return false; + Cnt = SplatBits.getSExtValue(); + return true; +} + +/// isVShiftLImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift left operation. That value must be in the range: +/// 0 <= Value < ElementBits for a left shift; or +/// 0 <= Value <= ElementBits for a long left shift. +static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 0 && (isLong ? 
Cnt - 1 : Cnt) < ElementBits); +} + +/// isVShiftRImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift right operation. For a shift opcode, the value +/// is positive, but for an intrinsic the value count must be negative. The +/// absolute value must be in the range: +/// 1 <= |Value| <= ElementBits for a right shift; or +/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, + int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + if (isIntrinsic) + Cnt = -Cnt; + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); +} + +SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc DL(Op); + int64_t Cnt; + + if (!Op.getOperand(1).getValueType().isVector()) + return Op; + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + + switch (Op.getOpcode()) { + default: + llvm_unreachable("unexpected shift opcode"); + + case ISD::SHL: + if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) + return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0), + DAG.getConstant(Cnt, MVT::i32)); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32), + Op.getOperand(0), Op.getOperand(1)); + case ISD::SRA: + case ISD::SRL: + // Right shift immediate + if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) && + Cnt < EltSize) { + unsigned Opc = + (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; + return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0), + DAG.getConstant(Cnt, MVT::i32)); + } + + // Right shift register. Note, there is not a shift right register + // instruction, but the shift left register instruction takes a signed + // value, where negative numbers specify a right shift. + unsigned Opc = (Op.getOpcode() == ISD::SRA) ? 
Intrinsic::aarch64_neon_sshl + : Intrinsic::aarch64_neon_ushl; + // negate the shift amount + SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1)); + SDValue NegShiftLeft = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift); + return NegShiftLeft; + } + + return SDValue(); +} + +static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, + AArch64CC::CondCode CC, bool NoNans, EVT VT, + SDLoc dl, SelectionDAG &DAG) { + EVT SrcVT = LHS.getValueType(); + + BuildVectorSDNode *BVN = dyn_cast(RHS.getNode()); + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); + bool IsZero = IsCnst && (CnstBits == 0); + + if (SrcVT.getVectorElementType().isFloatingPoint()) { + switch (CC) { + default: + return SDValue(); + case AArch64CC::NE: { + SDValue Fcmeq; + if (IsZero) + Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); + else + Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); + return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq); + } + case AArch64CC::EQ: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); + case AArch64CC::GE: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); + case AArch64CC::GT: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); + case AArch64CC::LS: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); + case AArch64CC::LT: + if (!NoNans) + return SDValue(); + // If we ignore NaNs then we can use to the MI implementation. + // Fallthrough. 
+ case AArch64CC::MI: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); + } + } + + switch (CC) { + default: + return SDValue(); + case AArch64CC::NE: { + SDValue Cmeq; + if (IsZero) + Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); + else + Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); + return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq); + } + case AArch64CC::EQ: + if (IsZero) + return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); + case AArch64CC::GE: + if (IsZero) + return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); + case AArch64CC::GT: + if (IsZero) + return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); + case AArch64CC::LE: + if (IsZero) + return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); + case AArch64CC::LS: + return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); + case AArch64CC::LO: + return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); + case AArch64CC::LT: + if (IsZero) + return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); + case AArch64CC::HI: + return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); + case AArch64CC::HS: + return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); + } +} + +SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, + SelectionDAG &DAG) const { + ISD::CondCode CC = cast(Op.getOperand(2))->get(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDLoc dl(Op); + + if (LHS.getValueType().getVectorElementType().isInteger()) { + assert(LHS.getValueType() == RHS.getValueType()); + AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); + return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(), + dl, DAG); + } + + assert(LHS.getValueType().getVectorElementType() == MVT::f32 || + LHS.getValueType().getVectorElementType() == MVT::f64); + + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two branches to implement. + AArch64CC::CondCode CC1, CC2; + bool ShouldInvert; + changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); + + bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; + SDValue Cmp = + EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG); + if (!Cmp.getNode()) + return SDValue(); + + if (CC2 != AArch64CC::AL) { + SDValue Cmp2 = + EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG); + if (!Cmp2.getNode()) + return SDValue(); + + Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2); + } + + if (ShouldInvert) + return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); + + return Cmp; +} + +/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as +/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment +/// specified in the intrinsic calls. 
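[Note: LowerVSETCC above may decompose one IEEE predicate into two AArch64 vector compares ORed together, optionally inverted (changeVectorFPCCToAArch64CC fills in CC1/CC2/ShouldInvert). A scalar model for two such predicates; one() and ueq() are hypothetical names, not part of the patch. The getTgtMemIntrinsic body follows below.]

    #include <cassert>
    #include <cmath>

    // "ONE" (ordered, not equal) needs GT || LT; "UEQ" is its inversion.
    static bool one(float a, float b) { return (a > b) || (a < b); }
    static bool ueq(float a, float b) { return !one(a, b); }

    int main() {
      assert(one(1.0f, 2.0f) && !one(1.0f, 1.0f));
      assert(!one(NAN, 1.0f)); // unordered: both halves compare false
      assert(ueq(NAN, 1.0f) && ueq(3.0f, 3.0f));
    }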
+bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + switch (Intrinsic) { + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_ld4: + case Intrinsic::aarch64_neon_ld1x2: + case Intrinsic::aarch64_neon_ld1x3: + case Intrinsic::aarch64_neon_ld1x4: + case Intrinsic::aarch64_neon_ld2lane: + case Intrinsic::aarch64_neon_ld3lane: + case Intrinsic::aarch64_neon_ld4lane: + case Intrinsic::aarch64_neon_ld2r: + case Intrinsic::aarch64_neon_ld3r: + case Intrinsic::aarch64_neon_ld4r: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + // Conservatively set memVT to the entire set of vectors loaded. + uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); + Info.offset = 0; + Info.align = 0; + Info.vol = false; // volatile loads with NEON intrinsics not supported + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::aarch64_neon_st2: + case Intrinsic::aarch64_neon_st3: + case Intrinsic::aarch64_neon_st4: + case Intrinsic::aarch64_neon_st1x2: + case Intrinsic::aarch64_neon_st1x3: + case Intrinsic::aarch64_neon_st1x4: + case Intrinsic::aarch64_neon_st2lane: + case Intrinsic::aarch64_neon_st3lane: + case Intrinsic::aarch64_neon_st4lane: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored. + unsigned NumElts = 0; + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { + Type *ArgTy = I.getArgOperand(ArgI)->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); + Info.offset = 0; + Info.align = 0; + Info.vol = false; // volatile stores with NEON intrinsics not supported + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::aarch64_ldaxr: + case Intrinsic::aarch64_ldxr: { + PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::aarch64_stlxr: + case Intrinsic::aarch64_stxr: { + PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(1); + Info.offset = 0; + Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::aarch64_ldaxp: + case Intrinsic::aarch64_ldxp: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i128; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = 16; + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::aarch64_stlxp: + case Intrinsic::aarch64_stxp: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i128; + Info.ptrVal = I.getArgOperand(2); + Info.offset = 0; + Info.align = 16; + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + 
} + default: + break; + } + + return false; +} + +// Truncations from 64-bit GPR to 32-bit GPR is free. +bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + if (NumBits1 <= NumBits2) + return false; + return true; +} +bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { + if (!VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + if (NumBits1 <= NumBits2) + return false; + return true; +} + +// All 32-bit GPR operations implicitly zero the high-half of the corresponding +// 64-bit GPR. +bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + if (NumBits1 == 32 && NumBits2 == 64) + return true; + return false; +} +bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { + if (!VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + if (NumBits1 == 32 && NumBits2 == 64) + return true; + return false; +} + +bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + EVT VT1 = Val.getValueType(); + if (isZExtFree(VT1, VT2)) { + return true; + } + + if (Val.getOpcode() != ISD::LOAD) + return false; + + // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. + return (VT1.isSimple() && VT1.isInteger() && VT2.isSimple() && + VT2.isInteger() && VT1.getSizeInBits() <= 32); +} + +bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType, + unsigned &RequiredAligment) const { + if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) + return false; + // Cyclone supports unaligned accesses. + RequiredAligment = 0; + unsigned NumBits = LoadedType->getPrimitiveSizeInBits(); + return NumBits == 32 || NumBits == 64; +} + +bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, + unsigned &RequiredAligment) const { + if (!LoadedType.isSimple() || + (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) + return false; + // Cyclone supports unaligned accesses. + RequiredAligment = 0; + unsigned NumBits = LoadedType.getSizeInBits(); + return NumBits == 32 || NumBits == 64; +} + +static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, + unsigned AlignCheck) { + return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && + (DstAlign == 0 || DstAlign % AlignCheck == 0)); +} + +EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + // Don't use AdvSIMD to implement 16-byte memset. It would have taken one + // instruction to materialize the v2i64 zero and one store (with restrictive + // addressing mode). Just do two i64 store of zero-registers. + bool Fast; + const Function *F = MF.getFunction(); + if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && + !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoImplicitFloat) && + (memOpAlign(SrcAlign, DstAlign, 16) || + (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast))) + return MVT::f128; + + return Size >= 8 ? 
MVT::i64 : MVT::i32; +} + +// 12-bit optionally shifted immediates are legal for adds. +bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { + if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) + return true; + return false; +} + +// Integer comparisons are implemented with ADDS/SUBS, so the range of valid +// immediates is the same as for an add or a sub. +bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { + if (Immed < 0) + Immed *= -1; + return isLegalAddImmediate(Immed); +} + +/// isLegalAddressingMode - Return true if the addressing mode represented +/// by AM is legal for this target, for a load/store of the specified type. +bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const { + // AArch64 has five basic addressing modes: + // reg + // reg + 9-bit signed offset + // reg + SIZE_IN_BYTES * 12-bit unsigned offset + // reg1 + reg2 + // reg + SIZE_IN_BYTES * reg + + // No global is ever allowed as a base. + if (AM.BaseGV) + return false; + + // No reg+reg+imm addressing. + if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) + return false; + + // check reg + imm case: + // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 + uint64_t NumBytes = 0; + if (Ty->isSized()) { + uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty); + NumBytes = NumBits / 8; + if (!isPowerOf2_64(NumBits)) + NumBytes = 0; + } + + if (!AM.Scale) { + int64_t Offset = AM.BaseOffs; + + // 9-bit signed offset + if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1) + return true; + + // 12-bit unsigned offset + unsigned shift = Log2_64(NumBytes); + if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && + // Must be a multiple of NumBytes (NumBytes is a power of 2) + (Offset >> shift) << shift == Offset) + return true; + return false; + } + + // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 + + if (!AM.Scale || AM.Scale == 1 || + (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) + return true; + return false; +} + +int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM, + Type *Ty) const { + // Scaling factors are not free at all. + // Operands | Rt Latency + // ------------------------------------------- + // Rt, [Xn, Xm] | 4 + // ------------------------------------------- + // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 + // Rt, [Xn, Wm, #imm] | + if (isLegalAddressingMode(AM, Ty)) + // Scale represents reg2 * scale, thus account for 1 if + // it is not equal to 0 or 1. + return AM.Scale != 0 && AM.Scale != 1; + return -1; +} + +bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + case MVT::f64: + return true; + default: + break; + } + + return false; +} + +const MCPhysReg * +AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { + // LR is a callee-save register, but we must treat it as clobbered by any call + // site. Hence we include LR in the scratch registers, which are in turn added + // as implicit-defs for stackmaps and patchpoints. + static const MCPhysReg ScratchRegs[] = { + AArch64::X16, AArch64::X17, AArch64::LR, 0 + }; + return ScratchRegs; +} + +bool +AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const { + EVT VT = N->getValueType(0); + // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine + // it with shift to let it be lowered to UBFX. 
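[Note: a standalone mirror of the reg+imm cases in isLegalAddressingMode above: an offset is legal if it fits a 9-bit signed range, or is a positive multiple of the (power-of-two) access size within a scaled 12-bit unsigned range. isLegalOffset is a hypothetical name, and the real code additionally rejects non-power-of-two access sizes. The UBFX check whose comment closes the hunk above continues below.]

    #include <cassert>
    #include <cstdint>

    static bool isLegalOffset(int64_t Offset, int64_t NumBytes) {
      if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
        return true;                    // 9-bit signed (LDUR/STUR-style)
      return NumBytes > 0 && Offset > 0 && Offset % NumBytes == 0 &&
             Offset / NumBytes <= (1LL << 12) - 1; // scaled 12-bit unsigned
    }

    int main() {
      assert(isLegalOffset(-256, 8));   // inside the 9-bit signed range
      assert(isLegalOffset(32760, 8));  // 4095 * 8: scaled-offset form
      assert(!isLegalOffset(32761, 8)); // not a multiple of the access size
      assert(!isLegalOffset(40960, 8)); // 5120 * 8: exceeds the 12-bit scale
    }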
+  if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
+      isa<ConstantSDNode>(N->getOperand(1))) {
+    uint64_t TruncMask = N->getConstantOperandVal(1);
+    if (isMask_64(TruncMask) &&
+        N->getOperand(0).getOpcode() == ISD::SRL &&
+        isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
+      return false;
+  }
+  return true;
+}
+
+bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                              Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return false;
+
+  int64_t Val = Imm.getSExtValue();
+  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
+    return true;
+
+  if ((int64_t)Val < 0)
+    Val = ~Val;
+  if (BitSize == 32)
+    Val &= (1LL << 32) - 1;
+
+  unsigned LZ = countLeadingZeros((uint64_t)Val);
+  unsigned Shift = (63 - LZ) / 16;
+  // MOVZ is free so return true for one or fewer MOVK.
+  return (Shift < 3) ? true : false;
+}
+
+// Generate SUBS and CSEL for integer abs.
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+  // and change it to SUB and CSEL.
+  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+      N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
+      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
+    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
+      if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
+        SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
+                                  N0.getOperand(0));
+        // Generate SUBS & CSEL.
+        SDValue Cmp =
+            DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
+                        N0.getOperand(0), DAG.getConstant(0, VT));
+        return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
+                           DAG.getConstant(AArch64CC::PL, MVT::i32),
+                           SDValue(Cmp.getNode(), 1));
+      }
+  return SDValue();
+}
+
+// performXorCombine - Attempts to handle integer ABS.
+static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const AArch64Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  return performIntegerAbsCombine(N, DAG);
+}
+
+static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const AArch64Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // Multiplication of a power of two plus/minus one can be done more
+  // cheaply as a shift+add/sub. For now, this is true unilaterally. If
+  // future CPUs have a cheaper MADD instruction, this may need to be
+  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
+  // 64-bit is 5 cycles, so this is always a win.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+    APInt Value = C->getAPIntValue();
+    EVT VT = N->getValueType(0);
+    APInt VP1 = Value + 1;
+    if (VP1.isPowerOf2()) {
+      // Multiplying by one less than a power of two, replace with a shift
+      // and a subtract.
+      SDValue ShiftedVal =
+          DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+                      DAG.getConstant(VP1.logBase2(), MVT::i64));
+      return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
+    }
+    APInt VM1 = Value - 1;
+    if (VM1.isPowerOf2()) {
+      // Multiplying by one more than a power of two, replace with a shift
+      // and an add.
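[Note: a standalone check of the identity performMulCombine relies on, as the comments above describe: x*(2^n - 1) is a shift and a subtract, x*(2^n + 1) a shift and an add. Not part of the patch; the shift+add case of the combine continues below.]

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint64_t x = 0; x < 1000; x += 37) {
        assert(x * 15 == (x << 4) - x); // 15 = 2^4 - 1: shift then subtract
        assert(x * 17 == (x << 4) + x); // 17 = 2^4 + 1: shift then add
      }
    }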
+      SDValue ShiftedVal =
+          DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+                      DAG.getConstant(VM1.logBase2(), MVT::i64));
+      return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
+    }
+  }
+  return SDValue();
+}
+
+static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return SDValue();
+  // Only optimize when the source and destination types have the same width.
+  if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
+    return SDValue();
+
+  // If the result of an integer load is only used by an integer-to-float
+  // conversion, use a floating-point load and an AdvSIMD scalar {S|U}CVTF
+  // instead. This eliminates an "integer-to-vector-move" UOP and improves
+  // throughput.
+  SDValue N0 = N->getOperand(0);
+  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+      // Do not change the width of a volatile load.
+      !cast<LoadSDNode>(N0)->isVolatile()) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
+                               LN0->getPointerInfo(), LN0->isVolatile(),
+                               LN0->isNonTemporal(), LN0->isInvariant(),
+                               LN0->getAlignment());
+
+    // Make sure successors of the original load stay after it by updating them
+    // to use the new Chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
+
+    unsigned Opcode =
+        (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
+    return DAG.getNode(Opcode, SDLoc(N), VT, Load);
+  }
+
+  return SDValue();
+}
+
+/// An EXTR instruction is made up of two shifts, ORed together. This helper
+/// searches for and classifies those shifts.
+static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
+                         bool &FromHi) {
+  if (N.getOpcode() == ISD::SHL)
+    FromHi = false;
+  else if (N.getOpcode() == ISD::SRL)
+    FromHi = true;
+  else
+    return false;
+
+  if (!isa<ConstantSDNode>(N.getOperand(1)))
+    return false;
+
+  ShiftAmount = N->getConstantOperandVal(1);
+  Src = N->getOperand(0);
+  return true;
+}
+
+/// EXTR instruction extracts a contiguous chunk of bits from two existing
+/// registers viewed as a high/low pair. This function looks for the pattern:
+/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
+/// EXTR. Can't quite be done in TableGen because the two immediates aren't
+/// independent.
+static SDValue tryCombineToEXTR(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  assert(N->getOpcode() == ISD::OR && "Unexpected root");
+
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  SDValue LHS;
+  uint32_t ShiftLHS = 0;
+  bool LHSFromHi = 0;
+  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
+    return SDValue();
+
+  SDValue RHS;
+  uint32_t ShiftRHS = 0;
+  bool RHSFromHi = 0;
+  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
+    return SDValue();
+
+  // If they're both trying to come from the high part of the register, they're
+  // not really an EXTR.
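+  // A concrete i32 example (shift amounts are illustrative):
+  //   (or (shl x, #24), (srl y, #8)) --> (EXTR x, y, #8)
+  // Exactly one operand must supply the high part of the pair.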
+ if (LHSFromHi == RHSFromHi) + return SDValue(); + + if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) + return SDValue(); + + if (LHSFromHi) { + std::swap(LHS, RHS); + std::swap(ShiftLHS, ShiftRHS); + } + + return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, + DAG.getConstant(ShiftRHS, MVT::i64)); +} + +static SDValue tryCombineToBSL(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + if (!VT.isVector()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() != ISD::AND) + return SDValue(); + + // We only have to look for constant vectors here since the general, variable + // case can be handled in TableGen. + unsigned Bits = VT.getVectorElementType().getSizeInBits(); + uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); + for (int i = 1; i >= 0; --i) + for (int j = 1; j >= 0; --j) { + BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(i)); + BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(j)); + if (!BVN0 || !BVN1) + continue; + + bool FoundMatch = true; + for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { + ConstantSDNode *CN0 = dyn_cast(BVN0->getOperand(k)); + ConstantSDNode *CN1 = dyn_cast(BVN1->getOperand(k)); + if (!CN0 || !CN1 || + CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { + FoundMatch = false; + break; + } + } + + if (FoundMatch) + return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), + N0->getOperand(1 - i), N1->getOperand(1 - j)); + } + + return SDValue(); +} + +static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) + if (!EnableAArch64ExtrGeneration) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + SDValue Res = tryCombineToEXTR(N, DCI); + if (Res.getNode()) + return Res; + + Res = tryCombineToBSL(N, DCI); + if (Res.getNode()) + return Res; + + return SDValue(); +} + +static SDValue performBitcastCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + // Remove extraneous bitcasts around an extract_subvector. + // For example, + // (v4i16 (bitconvert + // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) + // becomes + // (extract_subvector ((v8i16 ...), (i64 4))) + + // Only interested in 64-bit vectors as the ultimate result. + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + if (VT.getSimpleVT().getSizeInBits() != 64) + return SDValue(); + // Is the operand an extract_subvector starting at the beginning or halfway + // point of the vector? A low half may also come through as an + // EXTRACT_SUBREG, so look for that, too. 
+ SDValue Op0 = N->getOperand(0); + if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && + !(Op0->isMachineOpcode() && + Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) + return SDValue(); + uint64_t idx = cast(Op0->getOperand(1))->getZExtValue(); + if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { + if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) + return SDValue(); + } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { + if (idx != AArch64::dsub) + return SDValue(); + // The dsub reference is equivalent to a lane zero subvector reference. + idx = 0; + } + // Look through the bitcast of the input to the extract. + if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) + return SDValue(); + SDValue Source = Op0->getOperand(0)->getOperand(0); + // If the source type has twice the number of elements as our destination + // type, we know this is an extract of the high or low half of the vector. + EVT SVT = Source->getValueType(0); + if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) + return SDValue(); + + DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); + + // Create the simplified form to just extract the low or high half of the + // vector directly rather than bothering with the bitcasts. + SDLoc dl(N); + unsigned NumElements = VT.getVectorNumElements(); + if (idx) { + SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); + } else { + SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32); + return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, + Source, SubReg), + 0); + } +} + +static SDValue performConcatVectorsCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDLoc dl(N); + EVT VT = N->getValueType(0); + + // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector + // splat. The indexed instructions are going to be expecting a DUPLANE64, so + // canonicalise to that. + if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) { + assert(VT.getVectorElementType().getSizeInBits() == 64); + return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, + WidenVector(N->getOperand(0), DAG), + DAG.getConstant(0, MVT::i64)); + } + + // Canonicalise concat_vectors so that the right-hand vector has as few + // bit-casts as possible before its real operation. The primary matching + // destination for these operations will be the narrowing "2" instructions, + // which depend on the operation being performed on this right-hand vector. + // For example, + // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) + // becomes + // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) + + SDValue Op1 = N->getOperand(1); + if (Op1->getOpcode() != ISD::BITCAST) + return SDValue(); + SDValue RHS = Op1->getOperand(0); + MVT RHSTy = RHS.getValueType().getSimpleVT(); + // If the RHS is not a vector, this is not the pattern we're looking for. 
+ if (!RHSTy.isVector()) + return SDValue(); + + DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); + + MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), + RHSTy.getVectorNumElements() * 2); + return DAG.getNode( + ISD::BITCAST, dl, VT, + DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, + DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS)); +} + +static SDValue tryCombineFixedPointConvert(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + // Transform a scalar conversion of a value from a lane extract into a + // lane extract of a vector conversion. E.g., from foo1 to foo2: + // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } + // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } + // + // The second form interacts better with instruction selection and the + // register allocator to avoid cross-class register copies that aren't + // coalescable due to a lane reference. + + // Check the operand and see if it originates from a lane extract. + SDValue Op1 = N->getOperand(1); + if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + // Yep, no additional predication needed. Perform the transform. + SDValue IID = N->getOperand(0); + SDValue Shift = N->getOperand(2); + SDValue Vec = Op1.getOperand(0); + SDValue Lane = Op1.getOperand(1); + EVT ResTy = N->getValueType(0); + EVT VecResTy; + SDLoc DL(N); + + // The vector width should be 128 bits by the time we get here, even + // if it started as 64 bits (the extract_vector handling will have + // done so). + assert(Vec.getValueType().getSizeInBits() == 128 && + "unexpected vector size on extract_vector_elt!"); + if (Vec.getValueType() == MVT::v4i32) + VecResTy = MVT::v4f32; + else if (Vec.getValueType() == MVT::v2i64) + VecResTy = MVT::v2f64; + else + assert(0 && "unexpected vector type!"); + + SDValue Convert = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); + } + return SDValue(); } -// isPermuteMask - Check whether the vector shuffle matches to UZP, ZIP and -// TRN instruction. -static unsigned isPermuteMask(ArrayRef M, EVT VT, bool isV2undef) { - unsigned NumElts = VT.getVectorNumElements(); - if (NumElts < 4) - return 0; +// AArch64 high-vector "long" operations are formed by performing the non-high +// version on an extract_subvector of each operand which gets the high half: +// +// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) +// +// However, there are cases which don't have an extract_high explicitly, but +// have another operation that can be made compatible with one for free. For +// example: +// +// (dupv64 scalar) --> (extract_high (dup128 scalar)) +// +// This routine does the actual conversion of such DUPs, once outer routines +// have determined that everything else is in order. +static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { + // We can handle most types of duplicate, but the lane ones have an extra + // operand saying *which* lane, so we need to know. 
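+  // Sketch of the rewrite (types are illustrative):
+  //   (v4i16 (DUP w0)) --> (extract_subvector (v8i16 (DUP w0)), #4)
+  // i.e. the narrow DUP becomes an extract_high of a widened DUP.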
+  bool IsDUPLANE;
+  switch (N.getOpcode()) {
+  case AArch64ISD::DUP:
+    IsDUPLANE = false;
+    break;
+  case AArch64ISD::DUPLANE8:
+  case AArch64ISD::DUPLANE16:
+  case AArch64ISD::DUPLANE32:
+  case AArch64ISD::DUPLANE64:
+    IsDUPLANE = true;
+    break;
+  default:
+    return SDValue();
+  }
+
+  MVT NarrowTy = N.getSimpleValueType();
+  if (!NarrowTy.is64BitVector())
+    return SDValue();
+
+  MVT ElementTy = NarrowTy.getVectorElementType();
+  unsigned NumElems = NarrowTy.getVectorNumElements();
+  MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+
+  SDValue NewDUP;
+  if (IsDUPLANE)
+    NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0),
+                         N.getOperand(1));
+  else
+    NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0));
+
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy,
+                     NewDUP, DAG.getConstant(NumElems, MVT::i64));
+}
+
+static bool isEssentiallyExtractSubvector(SDValue N) {
+  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+    return true;
+
+  return N.getOpcode() == ISD::BITCAST &&
+         N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
+}
+
+/// \brief Helper structure to keep track of ISD::SET_CC operands.
+struct GenericSetCCInfo {
+  const SDValue *Opnd0;
+  const SDValue *Opnd1;
+  ISD::CondCode CC;
+};
+
+/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
+struct AArch64SetCCInfo {
+  const SDValue *Cmp;
+  AArch64CC::CondCode CC;
+};
+
+/// \brief Helper structure to keep track of SetCC information.
+union SetCCInfo {
+  GenericSetCCInfo Generic;
+  AArch64SetCCInfo AArch64;
+};
+
+/// \brief Helper structure to be able to read SetCC information. If IsAArch64
+/// is true, Info is an AArch64SetCCInfo; otherwise Info is a GenericSetCCInfo.
+struct SetCCInfoAndKind {
+  SetCCInfo Info;
+  bool IsAArch64;
+};
+
+/// \brief Check whether or not \p Op is a SET_CC operation, either a generic
+/// or an AArch64 lowered one.
+/// \p SetCCInfo is filled accordingly.
+/// \post SetCCInfo is meaningful only when this function returns true.
+/// \return True when Op is a kind of SET_CC operation.
+static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
+  // If this is a setcc, this is straightforward.
+  if (Op.getOpcode() == ISD::SETCC) {
+    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
+    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
+    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+    SetCCInfo.IsAArch64 = false;
+    return true;
+  }
+  // Otherwise, check if this is a matching csel instruction.
+  // In other words:
+  // - csel 1, 0, cc
+  // - csel 0, 1, !cc
+  if (Op.getOpcode() != AArch64ISD::CSEL)
+    return false;
+  // Set the information about the operands.
+  // TODO: we want the operands of the Cmp not the csel
+  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
+  SetCCInfo.IsAArch64 = true;
+  SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
+      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+  // Check that the operands match the constraints:
+  // (1) Both operands must be constants.
+  // (2) One must be 1 and the other must be 0.
+  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+
+  // Check (1).
+  if (!TValue || !FValue)
+    return false;
+
+  // Check (2).
+  if (!TValue->isOne()) {
+    // Update the comparison when we are interested in !cc.
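+    // e.g., (csel 0, 1, cc, cmp) is handled as (csel 1, 0, !cc, cmp).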
+    std::swap(TValue, FValue);
+    SetCCInfo.Info.AArch64.CC =
+        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
+  }
+  return TValue->isOne() && FValue->isNullValue();
+}
+
+// Returns true if Op is setcc or zext of setcc.
+static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
+  if (isSetCC(Op, Info))
+    return true;
+  return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
+          isSetCC(Op->getOperand(0), Info));
+}
+
+// The folding we want to perform is:
+// (add x, [zext] (setcc cc ...) )
+// -->
+// (csel x, (add x, 1), !cc ...)
+//
+// The latter will get matched to a CSINC instruction.
+static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
+  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
+  SDValue LHS = Op->getOperand(0);
+  SDValue RHS = Op->getOperand(1);
+  SetCCInfoAndKind InfoAndKind;
+
+  // If neither operand is a SET_CC, give up.
+  if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
+    std::swap(LHS, RHS);
+    if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
+      return SDValue();
+  }
+
+  // FIXME: This could be generalized to work for FP comparisons.
+  EVT CmpVT = InfoAndKind.IsAArch64
+                  ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
+                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
+  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
+    return SDValue();
+
+  SDValue CCVal;
+  SDValue Cmp;
+  SDLoc dl(Op);
+  if (InfoAndKind.IsAArch64) {
+    CCVal = DAG.getConstant(
+        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32);
+    Cmp = *InfoAndKind.Info.AArch64.Cmp;
+  } else
+    Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
+                        *InfoAndKind.Info.Generic.Opnd1,
+                        ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
+                        CCVal, DAG, dl);
+
+  EVT VT = Op->getValueType(0);
+  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT));
+  return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
+}
+
+// The basic add/sub long vector instructions have variants with "2" on the end
+// which act on the high-half of their inputs. They are normally matched by
+// patterns like:
+//
+// (add (zeroext (extract_high LHS)),
+//      (zeroext (extract_high RHS)))
+// -> uaddl2 vD, vN, vM
+//
+// However, if one of the extracts is something like a duplicate, this
+// instruction can still be used profitably. This function puts the DAG into a
+// more appropriate form for those patterns to trigger.
+static SDValue performAddSubLongCombine(SDNode *N,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  MVT VT = N->getSimpleValueType(0);
+  if (!VT.is128BitVector()) {
+    if (N->getOpcode() == ISD::ADD)
+      return performSetccAddFolding(N, DAG);
+    return SDValue();
+  }
+
+  // Make sure both branches are extended in the same way.
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
+       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
+      LHS.getOpcode() != RHS.getOpcode())
+    return SDValue();
+
+  unsigned ExtType = LHS.getOpcode();
+
+  // It's not worth doing if at least one of the inputs isn't already an
+  // extract, but we don't know which it'll be so we have to try both.
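+  // e.g., (add (zext (extract_high V)), (zext (DUP s))): rewriting the DUP as
+  // an extract_high of a widened DUP lets the uaddl2-style patterns match.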
+ if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { + RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); + if (!RHS.getNode()) + return SDValue(); + + RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); + } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { + LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); + if (!LHS.getNode()) + return SDValue(); + + LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); + } + + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); +} + +// Massage DAGs which we can use the high-half "long" operations on into +// something isel will recognize better. E.g. +// +// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> +// (aarch64_neon_umull (extract_high (v2i64 vec))) +// (extract_high (v2i64 (dup128 scalar))))) +// +static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + assert(LHS.getValueType().is64BitVector() && + RHS.getValueType().is64BitVector() && + "unexpected shape for long operation"); + + // Either node could be a DUP, but it's not worth doing both of them (you'd + // just as well use the non-high version) so look for a corresponding extract + // operation on the other "wing". + if (isEssentiallyExtractSubvector(LHS)) { + RHS = tryExtendDUPToExtractHigh(RHS, DAG); + if (!RHS.getNode()) + return SDValue(); + } else if (isEssentiallyExtractSubvector(RHS)) { + LHS = tryExtendDUPToExtractHigh(LHS, DAG); + if (!LHS.getNode()) + return SDValue(); + } + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), + N->getOperand(0), LHS, RHS); +} + +static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { + MVT ElemTy = N->getSimpleValueType(0).getScalarType(); + unsigned ElemBits = ElemTy.getSizeInBits(); + + int64_t ShiftAmount; + if (BuildVectorSDNode *BVN = dyn_cast(N->getOperand(2))) { + APInt SplatValue, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, + HasAnyUndefs, ElemBits) || + SplatBitSize != ElemBits) + return SDValue(); + + ShiftAmount = SplatValue.getSExtValue(); + } else if (ConstantSDNode *CVN = dyn_cast(N->getOperand(2))) { + ShiftAmount = CVN->getSExtValue(); + } else + return SDValue(); + + unsigned Opcode; + bool IsRightShift; + switch (IID) { + default: + llvm_unreachable("Unknown shift intrinsic"); + case Intrinsic::aarch64_neon_sqshl: + Opcode = AArch64ISD::SQSHL_I; + IsRightShift = false; + break; + case Intrinsic::aarch64_neon_uqshl: + Opcode = AArch64ISD::UQSHL_I; + IsRightShift = false; + break; + case Intrinsic::aarch64_neon_srshl: + Opcode = AArch64ISD::SRSHR_I; + IsRightShift = true; + break; + case Intrinsic::aarch64_neon_urshl: + Opcode = AArch64ISD::URSHR_I; + IsRightShift = true; + break; + case Intrinsic::aarch64_neon_sqshlu: + Opcode = AArch64ISD::SQSHLU_I; + IsRightShift = false; + break; + } + + if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) + return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), + DAG.getConstant(-ShiftAmount, MVT::i32)); + else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount <= ElemBits) + return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), + DAG.getConstant(ShiftAmount, MVT::i32)); + + return SDValue(); +} + +// The CRC32[BH] instructions ignore the high bits of their data operand. 
Since +// the intrinsics must be legal and take an i32, this means there's almost +// certainly going to be a zext in the DAG which we can eliminate. +static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { + SDValue AndN = N->getOperand(2); + if (AndN.getOpcode() != ISD::AND) + return SDValue(); + + ConstantSDNode *CMask = dyn_cast(AndN.getOperand(1)); + if (!CMask || CMask->getZExtValue() != Mask) + return SDValue(); + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, + N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); +} + +static SDValue performIntrinsicCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; + unsigned IID = getIntrinsicID(N); + switch (IID) { + default: + break; + case Intrinsic::aarch64_neon_vcvtfxs2fp: + case Intrinsic::aarch64_neon_vcvtfxu2fp: + return tryCombineFixedPointConvert(N, DCI, DAG); + break; + case Intrinsic::aarch64_neon_fmax: + return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fmin: + return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: + case Intrinsic::aarch64_neon_pmull: + case Intrinsic::aarch64_neon_sqdmull: + return tryCombineLongOpWithDup(IID, N, DCI, DAG); + case Intrinsic::aarch64_neon_sqshl: + case Intrinsic::aarch64_neon_uqshl: + case Intrinsic::aarch64_neon_sqshlu: + case Intrinsic::aarch64_neon_srshl: + case Intrinsic::aarch64_neon_urshl: + return tryCombineShiftImm(IID, N, DAG); + case Intrinsic::aarch64_crc32b: + case Intrinsic::aarch64_crc32cb: + return tryCombineCRC32(0xff, N, DAG); + case Intrinsic::aarch64_crc32h: + case Intrinsic::aarch64_crc32ch: + return tryCombineCRC32(0xffff, N, DAG); + } + return SDValue(); +} + +static SDValue performExtendCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then + // we can convert that DUP into another extract_high (of a bigger DUP), which + // helps the backend to decide that an sabdl2 would be useful, saving a real + // extract_high operation. + if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && + N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { + SDNode *ABDNode = N->getOperand(0).getNode(); + unsigned IID = getIntrinsicID(ABDNode); + if (IID == Intrinsic::aarch64_neon_sabd || + IID == Intrinsic::aarch64_neon_uabd) { + SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); + if (!NewABD.getNode()) + return SDValue(); + + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), + NewABD); + } + } + + // This is effectively a custom type legalization for AArch64. + // + // Type legalization will split an extend of a small, legal, type to a larger + // illegal type by first splitting the destination type, often creating + // illegal source types, which then get legalized in isel-confusing ways, + // leading to really terrible codegen. E.g., + // %result = v8i32 sext v8i8 %value + // becomes + // %losrc = extract_subreg %value, ... + // %hisrc = extract_subreg %value, ... + // %lo = v4i32 sext v4i8 %losrc + // %hi = v4i32 sext v4i8 %hisrc + // Things go rapidly downhill from there. 
+ // + // For AArch64, the [sz]ext vector instructions can only go up one element + // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 + // take two instructions. + // + // This implies that the most efficient way to do the extend from v8i8 + // to two v4i32 values is to first extend the v8i8 to v8i16, then do + // the normal splitting to happen for the v8i16->v8i32. + + // This is pre-legalization to catch some cases where the default + // type legalization will create ill-tempered code. + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + + // We're only interested in cleaning things up for non-legal vector types + // here. If both the source and destination are legal, things will just + // work naturally without any fiddling. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT ResVT = N->getValueType(0); + if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) + return SDValue(); + // If the vector type isn't a simple VT, it's beyond the scope of what + // we're worried about here. Let legalization do its thing and hope for + // the best. + if (!ResVT.isSimple()) + return SDValue(); + + SDValue Src = N->getOperand(0); + MVT SrcVT = Src->getValueType(0).getSimpleVT(); + // If the source VT is a 64-bit vector, we can play games and get the + // better results we want. + if (SrcVT.getSizeInBits() != 64) + return SDValue(); + + unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); + unsigned ElementCount = SrcVT.getVectorNumElements(); + SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); + SDLoc DL(N); + Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); + + // Now split the rest of the operation into two halves, each with a 64 + // bit source. + EVT LoVT, HiVT; + SDValue Lo, Hi; + unsigned NumElements = ResVT.getVectorNumElements(); + assert(!(NumElements & 1) && "Splitting vector, but not in half!"); + LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), + ResVT.getVectorElementType(), NumElements / 2); + + EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), + LoVT.getVectorNumElements()); + Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, + DAG.getIntPtrConstant(0)); + Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, + DAG.getIntPtrConstant(InNVT.getVectorNumElements())); + Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); + Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); + + // Now combine the parts back together so we still have a single result + // like the combiner expects. + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); +} + +/// Replace a splat of a scalar to a vector store by scalar stores of the scalar +/// value. The load store optimizer pass will merge them to store pair stores. +/// This has better performance than a splat of the scalar followed by a split +/// vector store. Even if the stores are not merged it is four stores vs a dup, +/// followed by an ext.b and two stores. +static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { + SDValue StVal = St->getValue(); + EVT VT = StVal.getValueType(); + + // Don't replace floating point stores, they possibly won't be transformed to + // stp because of the store pair suppress pass. + if (VT.isFloatingPoint()) + return SDValue(); + + // Check for insert vector elements. + if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) + return SDValue(); + + // We can express a splat as store pair(s) for 2 or 4 elements. 
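+  // e.g., a splatted v4i32 store becomes four word stores at offsets 0, 4, 8
+  // and 12, which the load/store optimizer can then merge into two stp's.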
+  unsigned NumVecElts = VT.getVectorNumElements();
+  if (NumVecElts != 4 && NumVecElts != 2)
+    return SDValue();
+  SDValue SplatVal = StVal.getOperand(1);
+  unsigned RemainInsertElts = NumVecElts - 1;
+
+  // Check that this is a splat.
+  while (--RemainInsertElts) {
+    SDValue NextInsertElt = StVal.getOperand(0);
+    if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
+      return SDValue();
+    if (NextInsertElt.getOperand(1) != SplatVal)
+      return SDValue();
+    StVal = NextInsertElt;
+  }
+  unsigned OrigAlignment = St->getAlignment();
+  unsigned EltOffset = NumVecElts == 4 ? 4 : 8;
+  unsigned Alignment = std::min(OrigAlignment, EltOffset);
+
+  // Create scalar stores. This is at least as good as the code sequence for a
+  // split unaligned store which is a dup.s, ext.b, and two stores.
+  // Most of the time the three stores should be replaced by store pair
+  // instructions (stp).
+  SDLoc DL(St);
+  SDValue BasePtr = St->getBasePtr();
+  SDValue NewST1 =
+      DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
+                   St->isVolatile(), St->isNonTemporal(), St->getAlignment());
+
+  unsigned Offset = EltOffset;
+  while (--NumVecElts) {
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+                                    DAG.getConstant(Offset, MVT::i64));
+    NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
+                          St->getPointerInfo(), St->isVolatile(),
+                          St->isNonTemporal(), Alignment);
+    Offset += EltOffset;
+  }
+  return NewST1;
+}
+
+static SDValue performSTORECombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   SelectionDAG &DAG,
+                                   const AArch64Subtarget *Subtarget) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  StoreSDNode *S = cast<StoreSDNode>(N);
+  if (S->isVolatile())
+    return SDValue();
+
+  // Cyclone has bad performance on unaligned 16B stores when crossing line and
+  // page boundaries. We want to split such stores.
+  if (!Subtarget->isCyclone())
+    return SDValue();
+
+  // Don't split at Oz.
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute(
+      AttributeSet::FunctionIndex, Attribute::MinSize);
+  if (IsMinSize)
+    return SDValue();
+
+  SDValue StVal = S->getValue();
+  EVT VT = StVal.getValueType();
+
+  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
+  // those up regresses performance on micro-benchmarks and olden/bh.
+  if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
+    return SDValue();
+
+  // Split unaligned 16B stores. They are terrible for performance.
+  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
+  // extensions can use this to mark that it does not want splitting to happen
+  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
+  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
+  if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
+      S->getAlignment() <= 2)
+    return SDValue();
+
+  // If we get a splat of a scalar convert this vector store to a store of
+  // scalars. They will be merged into store pairs thereby removing two
+  // instructions.
+  SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S);
+  if (ReplacedSplat != SDValue())
+    return ReplacedSplat;
+
+  SDLoc DL(S);
+  unsigned NumElts = VT.getVectorNumElements() / 2;
+  // Split VT into two.
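+  // e.g., an unaligned v4i32 (16B) store becomes two 8B stores of the halved
+  // vector type at offsets 0 and 8.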
+  EVT HalfVT =
+      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
+  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
+                                   DAG.getIntPtrConstant(0));
+  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
+                                   DAG.getIntPtrConstant(NumElts));
+  SDValue BasePtr = S->getBasePtr();
+  SDValue NewST1 =
+      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
+                   S->isVolatile(), S->isNonTemporal(), S->getAlignment());
+  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+                                  DAG.getConstant(8, MVT::i64));
+  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
+                      S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(),
+                      S->getAlignment());
+}
+
+/// Target-specific DAG combine function for post-increment LD1 (lane) and
+/// post-increment LD1R.
+static SDValue performPostLD1Combine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     bool IsLaneOp) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  unsigned LoadIdx = IsLaneOp ? 1 : 0;
+  SDNode *LD = N->getOperand(LoadIdx).getNode();
+  // If it is not a LOAD, we cannot do this combine.
+  if (LD->getOpcode() != ISD::LOAD)
+    return SDValue();
+
+  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
+  EVT MemVT = LoadSDN->getMemoryVT();
+  // Check if memory operand is the same type as the vector element.
+  if (MemVT != VT.getVectorElementType())
+    return SDValue();
+
+  // Check if there are other uses. If so, do not combine as it will introduce
+  // an extra load.
+  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
+       ++UI) {
+    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
+      continue;
+    if (*UI != N)
+      return SDValue();
+  }
+
+  SDValue Addr = LD->getOperand(1);
+  SDValue Vector = N->getOperand(0);
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
+       Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD
+        || UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load. Otherwise, folding it
+    // would create a cycle.
+    if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
+      continue;
+    // Also check that add is not used in the vector operand. This would also
+    // create a cycle.
+    if (User->isPredecessorOf(Vector.getNode()))
+      continue;
-  bool ismatch = true;
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint32_t IncVal = CInc->getZExtValue();
+      unsigned NumBytes = VT.getScalarSizeInBits() / 8;
+      if (IncVal != NumBytes)
+        continue;
+      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
+    }
+
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(LD->getOperand(0));  // Chain
+    if (IsLaneOp) {
+      Ops.push_back(Vector);           // The vector to be inserted
+      Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
+    }
+    Ops.push_back(Addr);
+    Ops.push_back(Inc);
+
+    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
+    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3));
+    unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
+                                           MemVT,
+                                           LoadSDN->getMemOperand());
+
+    // Update the uses.
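+    // e.g., (ld1r { v0.4s }, [x0]) followed by (add x0, x0, #4) folds to
+    // ld1r { v0.4s }, [x0], #4, with the incremented address produced as the
+    // write-back result.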
+  std::vector<SDValue> NewResults;
+  NewResults.push_back(SDValue(LD, 0));             // The result of load
+  NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
+  DCI.CombineTo(LD, NewResults);
+  DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
+  DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
+
+  break;
+  }
+  return SDValue();
+}
+
+/// Target-specific DAG combine function for NEON load/store intrinsics
+/// to merge base address updates.
+static SDValue performNEONPostLDSTCombine(SDNode *N,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  unsigned AddrOpIdx = N->getNumOperands() - 1;
+  SDValue Addr = N->getOperand(AddrOpIdx);
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store. Otherwise, folding
+    // it would create a cycle.
+    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool IsStore = false;
+    bool IsLaneOp = false;
+    bool IsDupOp = false;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default: llvm_unreachable("unexpected intrinsic for Neon base update");
+    case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
+      NumVecs = 2; break;
+    case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
+      NumVecs = 3; break;
+    case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
+      NumVecs = 4; break;
+    case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
+      NumVecs = 2; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
+      NumVecs = 3; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
+      NumVecs = 4; IsStore = true; break;
+    case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
+      NumVecs = 2; break;
+    case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
+      NumVecs = 3; break;
+    case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
+      NumVecs = 4; break;
+    case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
+      NumVecs = 2; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
+      NumVecs = 3; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
+      NumVecs = 4; IsStore = true; break;
+    case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
+      NumVecs = 2; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
+      NumVecs = 3; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
+      NumVecs = 4; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
+      NumVecs = 2; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
+      NumVecs = 3; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
+      NumVecs = 4; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
+      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
+    case
Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; + NumVecs = 3; IsStore = true; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; + NumVecs = 4; IsStore = true; IsLaneOp = true; break; + } + + EVT VecTy; + if (IsStore) + VecTy = N->getOperand(2).getValueType(); + else + VecTy = N->getValueType(0); + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { + uint32_t IncVal = CInc->getZExtValue(); + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + if (IsLaneOp || IsDupOp) + NumBytes /= VecTy.getVectorNumElements(); + if (IncVal != NumBytes) + continue; + Inc = DAG.getRegister(AArch64::XZR, MVT::i64); + } + SmallVector Ops; + Ops.push_back(N->getOperand(0)); // Incoming chain + // Load lane and store have vector list as input. + if (IsLaneOp || IsStore) + for (unsigned i = 2; i < AddrOpIdx; ++i) + Ops.push_back(N->getOperand(i)); + Ops.push_back(Addr); // Base register + Ops.push_back(Inc); + + // Return Types. + EVT Tys[6]; + unsigned NumResultVecs = (IsStore ? 0 : NumVecs); + unsigned n; + for (n = 0; n < NumResultVecs; ++n) + Tys[n] = VecTy; + Tys[n++] = MVT::i64; // Type of write back register + Tys[n] = MVT::Other; // Type of the chain + SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); - // Check UZP1 - for (unsigned i = 0; i < NumElts; ++i) { - unsigned answer = i * 2; - if (isV2undef && answer >= NumElts) - answer -= NumElts; - if (M[i] != -1 && (unsigned)M[i] != answer) { - ismatch = false; - break; - } - } - if (ismatch) - return AArch64ISD::NEON_UZP1; + MemIntrinsicSDNode *MemInt = cast(N); + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, + MemInt->getMemoryVT(), + MemInt->getMemOperand()); - // Check UZP2 - ismatch = true; - for (unsigned i = 0; i < NumElts; ++i) { - unsigned answer = i * 2 + 1; - if (isV2undef && answer >= NumElts) - answer -= NumElts; - if (M[i] != -1 && (unsigned)M[i] != answer) { - ismatch = false; - break; + // Update the uses. + std::vector NewResults; + for (unsigned i = 0; i < NumResultVecs; ++i) { + NewResults.push_back(SDValue(UpdN.getNode(), i)); } - } - if (ismatch) - return AArch64ISD::NEON_UZP2; + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); + DCI.CombineTo(N, NewResults); + DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); - // Check ZIP1 - ismatch = true; - for (unsigned i = 0; i < NumElts; ++i) { - unsigned answer = i / 2 + NumElts * (i % 2); - if (isV2undef && answer >= NumElts) - answer -= NumElts; - if (M[i] != -1 && (unsigned)M[i] != answer) { - ismatch = false; - break; - } + break; } - if (ismatch) - return AArch64ISD::NEON_ZIP1; + return SDValue(); +} - // Check ZIP2 - ismatch = true; - for (unsigned i = 0; i < NumElts; ++i) { - unsigned answer = (NumElts + i) / 2 + NumElts * (i % 2); - if (isV2undef && answer >= NumElts) - answer -= NumElts; - if (M[i] != -1 && (unsigned)M[i] != answer) { - ismatch = false; - break; - } - } - if (ismatch) - return AArch64ISD::NEON_ZIP2; +// Optimize compare with zero and branch. 
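+// e.g., a conditional branch on (SUBS x, #0) with EQ/NE becomes (CBZ x, dest)
+// or (CBNZ x, dest), dropping the explicit compare when the flags have no
+// other users.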
+static SDValue performBRCONDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + SDValue Chain = N->getOperand(0); + SDValue Dest = N->getOperand(1); + SDValue CCVal = N->getOperand(2); + SDValue Cmp = N->getOperand(3); + + assert(isa(CCVal) && "Expected a ConstantSDNode here!"); + unsigned CC = cast(CCVal)->getZExtValue(); + if (CC != AArch64CC::EQ && CC != AArch64CC::NE) + return SDValue(); - // Check TRN1 - ismatch = true; - for (unsigned i = 0; i < NumElts; ++i) { - unsigned answer = i + (NumElts - 1) * (i % 2); - if (isV2undef && answer >= NumElts) - answer -= NumElts; - if (M[i] != -1 && (unsigned)M[i] != answer) { - ismatch = false; - break; - } - } - if (ismatch) - return AArch64ISD::NEON_TRN1; + unsigned CmpOpc = Cmp.getOpcode(); + if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) + return SDValue(); - // Check TRN2 - ismatch = true; - for (unsigned i = 0; i < NumElts; ++i) { - unsigned answer = 1 + i + (NumElts - 1) * (i % 2); - if (isV2undef && answer >= NumElts) - answer -= NumElts; - if (M[i] != -1 && (unsigned)M[i] != answer) { - ismatch = false; - break; - } - } - if (ismatch) - return AArch64ISD::NEON_TRN2; + // Only attempt folding if there is only one use of the flag and no use of the + // value. + if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) + return SDValue(); - return 0; -} + SDValue LHS = Cmp.getOperand(0); + SDValue RHS = Cmp.getOperand(1); -SDValue -AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, - SelectionDAG &DAG) const { - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - SDLoc dl(Op); - EVT VT = Op.getValueType(); - ShuffleVectorSDNode *SVN = cast(Op.getNode()); + assert(LHS.getValueType() == RHS.getValueType() && + "Expected the value type to be the same for both operands!"); + if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) + return SDValue(); - // Convert shuffles that are directly supported on NEON to target-specific - // DAG nodes, instead of keeping them as shuffles and matching them again - // during code selection. This is more efficient and avoids the possibility - // of inconsistencies between legalization and selection. - ArrayRef ShuffleMask = SVN->getMask(); + if (isa(LHS) && cast(LHS)->isNullValue()) + std::swap(LHS, RHS); - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - if (EltSize > 64) + if (!isa(RHS) || !cast(RHS)->isNullValue()) return SDValue(); - if (isREVMask(ShuffleMask, VT, 64)) - return DAG.getNode(AArch64ISD::NEON_REV64, dl, VT, V1); - if (isREVMask(ShuffleMask, VT, 32)) - return DAG.getNode(AArch64ISD::NEON_REV32, dl, VT, V1); - if (isREVMask(ShuffleMask, VT, 16)) - return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1); + if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || + LHS.getOpcode() == ISD::SRL) + return SDValue(); - unsigned ISDNo; - if (V2.getOpcode() == ISD::UNDEF) - ISDNo = isPermuteMask(ShuffleMask, VT, true); + // Fold the compare into the branch instruction. + SDValue BR; + if (CC == AArch64CC::EQ) + BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); else - ISDNo = isPermuteMask(ShuffleMask, VT, false); + BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); - if (ISDNo) { - if (V2.getOpcode() == ISD::UNDEF) - return DAG.getNode(ISDNo, dl, VT, V1, V1); - else - return DAG.getNode(ISDNo, dl, VT, V1, V2); - } + // Do not add new nodes to DAG combiner worklist. 
+ DCI.CombineTo(N, BR, false); - SDValue Res; - if (isConcatVector(Op, DAG, V1, V2, &ShuffleMask[0], Res)) - return Res; + return SDValue(); +} - // If the element of shuffle mask are all the same constant, we can - // transform it into either NEON_VDUP or NEON_VDUPLANE - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { - int Lane = SVN->getSplatIndex(); - // If this is undef splat, generate it via "just" vdup, if possible. - if (Lane == -1) Lane = 0; +// vselect (v1i1 setcc) -> +// vselect (v1iXX setcc) (XX is the size of the compared operand type) +// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as +// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine +// such VSELECT. +static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + EVT CCVT = N0.getValueType(); - // Test if V1 is a SCALAR_TO_VECTOR. - if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { - return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0)); - } - // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR. - if (V1.getOpcode() == ISD::BUILD_VECTOR) { - bool IsScalarToVector = true; - for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i) - if (V1.getOperand(i).getOpcode() != ISD::UNDEF && - i != (unsigned)Lane) { - IsScalarToVector = false; - break; - } - if (IsScalarToVector) - return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, - V1.getOperand(Lane)); - } + if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || + CCVT.getVectorElementType() != MVT::i1) + return SDValue(); - // Test if V1 is a EXTRACT_SUBVECTOR. - if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { - int ExtLane = cast(V1.getOperand(1))->getZExtValue(); - return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1.getOperand(0), - DAG.getConstant(Lane + ExtLane, MVT::i64)); - } - // Test if V1 is a CONCAT_VECTORS. - if (V1.getOpcode() == ISD::CONCAT_VECTORS && - V1.getOperand(1).getOpcode() == ISD::UNDEF) { - SDValue Op0 = V1.getOperand(0); - assert((unsigned)Lane < Op0.getValueType().getVectorNumElements() && - "Invalid vector lane access"); - return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, Op0, - DAG.getConstant(Lane, MVT::i64)); - } + EVT ResVT = N->getValueType(0); + EVT CmpVT = N0.getOperand(0).getValueType(); + // Only combine when the result type is of the same size as the compared + // operands. + if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) + return SDValue(); - return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1, - DAG.getConstant(Lane, MVT::i64)); - } + SDValue IfTrue = N->getOperand(1); + SDValue IfFalse = N->getOperand(2); + SDValue SetCC = + DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), + N0.getOperand(0), N0.getOperand(1), + cast(N0.getOperand(2))->get()); + return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, + IfTrue, IfFalse); +} - int Length = ShuffleMask.size(); - int V1EltNum = V1.getValueType().getVectorNumElements(); +/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with +/// the compare-mask instructions rather than going via NZCV, even if LHS and +/// RHS are really scalar. This replaces any scalar setcc in the above pattern +/// with a vector one followed by a DUP shuffle on the result. 
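+/// e.g., (select (setcc f64 %a, %b, olt), v2f64 %t, v2f64 %f): compare
+/// (scalar_to_vector %a) and (scalar_to_vector %b) as v2f64, giving a v2i64
+/// mask, splat lane 0 of the mask, then vselect (types are illustrative).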
+static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + EVT ResVT = N->getValueType(0); - // If the number of v1 elements is the same as the number of shuffle mask - // element and the shuffle masks are sequential values, we can transform - // it into NEON_VEXTRACT. - if (V1EltNum == Length) { - // Check if the shuffle mask is sequential. - int SkipUndef = 0; - while (ShuffleMask[SkipUndef] == -1) { - SkipUndef++; - } - int CurMask = ShuffleMask[SkipUndef]; - if (CurMask >= SkipUndef) { - bool IsSequential = true; - for (int I = SkipUndef; I < Length; ++I) { - if (ShuffleMask[I] != -1 && ShuffleMask[I] != CurMask) { - IsSequential = false; - break; - } - CurMask++; - } - if (IsSequential) { - assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect"); - unsigned VecSize = EltSize * V1EltNum; - unsigned Index = (EltSize / 8) * (ShuffleMask[SkipUndef] - SkipUndef); - if (VecSize == 64 || VecSize == 128) - return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2, - DAG.getConstant(Index, MVT::i64)); - } - } - } + if (!N->getOperand(1).getValueType().isVector()) + return SDValue(); - // For shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate insert - // by element from V2 to V1 . - // If shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 would be a - // better choice to be inserted than V1 as less insert needed, so we count - // element to be inserted for both V1 and V2, and select less one as insert - // target. - - // Collect elements need to be inserted and their index. - SmallVector NV1Elt; - SmallVector N1Index; - SmallVector NV2Elt; - SmallVector N2Index; - for (int I = 0; I != Length; ++I) { - if (ShuffleMask[I] != I) { - NV1Elt.push_back(ShuffleMask[I]); - N1Index.push_back(I); - } - } - for (int I = 0; I != Length; ++I) { - if (ShuffleMask[I] != (I + V1EltNum)) { - NV2Elt.push_back(ShuffleMask[I]); - N2Index.push_back(I); - } - } + if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1) + return SDValue(); - // Decide which to be inserted. If all lanes mismatch, neither V1 nor V2 - // will be inserted. - SDValue InsV = V1; - SmallVector InsMasks = NV1Elt; - SmallVector InsIndex = N1Index; - if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) { - if (NV1Elt.size() > NV2Elt.size()) { - InsV = V2; - InsMasks = NV2Elt; - InsIndex = N2Index; - } - } else { - InsV = DAG.getNode(ISD::UNDEF, dl, VT); - } + SDLoc DL(N0); - for (int I = 0, E = InsMasks.size(); I != E; ++I) { - SDValue ExtV = V1; - int Mask = InsMasks[I]; - if (Mask >= V1EltNum) { - ExtV = V2; - Mask -= V1EltNum; - } - // Any value type smaller than i32 is illegal in AArch64, and this lower - // function is called after legalize pass, so we need to legalize - // the result here. - EVT EltVT; - if (VT.getVectorElementType().isFloatingPoint()) - EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32; - else - EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32; + EVT SrcVT = N0.getOperand(0).getValueType(); + SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, + ResVT.getSizeInBits() / SrcVT.getSizeInBits()); + EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); + + // First perform a vector comparison, where lane 0 is the one we're interested + // in. 
+ SDValue LHS = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); + SDValue RHS = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); + SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); + + // Now duplicate the comparison mask we want across all other lanes. + SmallVector DUPMask(CCVT.getVectorNumElements(), 0); + SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); + Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), + Mask); + + return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); +} - if (Mask >= 0) { - ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV, - DAG.getConstant(Mask, MVT::i64)); - InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV, - DAG.getConstant(InsIndex[I], MVT::i64)); +SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + switch (N->getOpcode()) { + default: + break; + case ISD::ADD: + case ISD::SUB: + return performAddSubLongCombine(N, DCI, DAG); + case ISD::XOR: + return performXorCombine(N, DAG, DCI, Subtarget); + case ISD::MUL: + return performMulCombine(N, DAG, DCI, Subtarget); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + return performIntToFpCombine(N, DAG); + case ISD::OR: + return performORCombine(N, DCI, Subtarget); + case ISD::INTRINSIC_WO_CHAIN: + return performIntrinsicCombine(N, DCI, Subtarget); + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + return performExtendCombine(N, DCI, DAG); + case ISD::BITCAST: + return performBitcastCombine(N, DCI, DAG); + case ISD::CONCAT_VECTORS: + return performConcatVectorsCombine(N, DCI, DAG); + case ISD::SELECT: + return performSelectCombine(N, DAG); + case ISD::VSELECT: + return performVSelectCombine(N, DCI.DAG); + case ISD::STORE: + return performSTORECombine(N, DCI, DAG, Subtarget); + case AArch64ISD::BRCOND: + return performBRCONDCombine(N, DCI, DAG); + case AArch64ISD::DUP: + return performPostLD1Combine(N, DCI, false); + case ISD::INSERT_VECTOR_ELT: + return performPostLD1Combine(N, DCI, true); + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: + switch (cast(N->getOperand(1))->getZExtValue()) { + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_ld4: + case Intrinsic::aarch64_neon_ld1x2: + case Intrinsic::aarch64_neon_ld1x3: + case Intrinsic::aarch64_neon_ld1x4: + case Intrinsic::aarch64_neon_ld2lane: + case Intrinsic::aarch64_neon_ld3lane: + case Intrinsic::aarch64_neon_ld4lane: + case Intrinsic::aarch64_neon_ld2r: + case Intrinsic::aarch64_neon_ld3r: + case Intrinsic::aarch64_neon_ld4r: + case Intrinsic::aarch64_neon_st2: + case Intrinsic::aarch64_neon_st3: + case Intrinsic::aarch64_neon_st4: + case Intrinsic::aarch64_neon_st1x2: + case Intrinsic::aarch64_neon_st1x3: + case Intrinsic::aarch64_neon_st1x4: + case Intrinsic::aarch64_neon_st2lane: + case Intrinsic::aarch64_neon_st3lane: + case Intrinsic::aarch64_neon_st4lane: + return performNEONPostLDSTCombine(N, DCI, DAG); + default: + break; } } - return InsV; + return SDValue(); } -AArch64TargetLowering::ConstraintType -AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { - if (Constraint.size() == 1) { - switch (Constraint[0]) { - default: break; - case 'w': // An FP/SIMD vector register - return C_RegisterClass; - case 'I': // Constant that can be used with an ADD instruction - case 'J': // Constant that can be used with a 
SUB instruction - case 'K': // Constant that can be used with a 32-bit logical instruction - case 'L': // Constant that can be used with a 64-bit logical instruction - case 'M': // Constant that can be used as a 32-bit MOV immediate - case 'N': // Constant that can be used as a 64-bit MOV immediate - case 'Y': // Floating point constant zero - case 'Z': // Integer constant zero - return C_Other; - case 'Q': // A memory reference with base register and no offset - return C_Memory; - case 'S': // A symbolic address - return C_Other; - } +// Check if the return value is used as only a return value, as otherwise +// we can't perform a tail-call. In particular, we need to check for +// target ISD nodes that are returns and any other "odd" constructs +// that the generic analysis code won't necessarily catch. +bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, + SDValue &Chain) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + SDValue TCChain = Chain; + SDNode *Copy = *N->use_begin(); + if (Copy->getOpcode() == ISD::CopyToReg) { + // If the copy has a glue operand, we conservatively assume it isn't safe to + // perform a tail call. + if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == + MVT::Glue) + return false; + TCChain = Copy->getOperand(0); + } else if (Copy->getOpcode() != ISD::FP_EXTEND) + return false; + + bool HasRet = false; + for (SDNode *Node : Copy->uses()) { + if (Node->getOpcode() != AArch64ISD::RET_FLAG) + return false; + HasRet = true; } - // FIXME: Ump, Utf, Usa, Ush - // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes, - // whatever they may be - // Utf: A memory address suitable for ldp/stp in TF mode, whatever it may be - // Usa: An absolute symbolic address - // Ush: The high part (bits 32:12) of a pc-relative symbolic address - assert(Constraint != "Ump" && Constraint != "Utf" && Constraint != "Usa" - && Constraint != "Ush" && "Unimplemented constraints"); + if (!HasRet) + return false; - return TargetLowering::getConstraintType(Constraint); + Chain = TCChain; + return true; } -TargetLowering::ConstraintWeight -AArch64TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &Info, - const char *Constraint) const { +// Return whether the an instruction can potentially be optimized to a tail +// call. This will cause the optimizers to attempt to move, or duplicate, +// return instructions to help enable tail call optimizations for this +// instruction. +bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { + if (!CI->isTailCall()) + return false; - llvm_unreachable("Constraint weight unimplemented"); + return true; } -void -AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op, - std::string &Constraint, - std::vector &Ops, - SelectionDAG &DAG) const { - SDValue Result(0, 0); - - // Only length 1 constraints are C_Other. - if (Constraint.size() != 1) return; +bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + bool &IsInc, + SelectionDAG &DAG) const { + if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) + return false; - // Only C_Other constraints get lowered like this. That means constants for us - // so return early if there's no hope the constraint can be lowered. + Base = Op->getOperand(0); + // All of the indexed addressing mode instructions take a signed + // 9 bit immediate offset. 
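+  // e.g., ldr x1, [x0, #16]! (pre-index) or ldr x1, [x0], #16 (post-index);
+  // the check below conservatively accepts constant offsets in (-256, 256).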
+ if (ConstantSDNode *RHS = dyn_cast(Op->getOperand(1))) { + int64_t RHSC = (int64_t)RHS->getZExtValue(); + if (RHSC >= 256 || RHSC <= -256) + return false; + IsInc = (Op->getOpcode() == ISD::ADD); + Offset = Op->getOperand(1); + return true; + } + return false; +} - switch(Constraint[0]) { - default: break; - case 'I': case 'J': case 'K': case 'L': - case 'M': case 'N': case 'Z': { - ConstantSDNode *C = dyn_cast(Op); - if (!C) - return; +bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const { + EVT VT; + SDValue Ptr; + if (LoadSDNode *LD = dyn_cast(N)) { + VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); + } else if (StoreSDNode *ST = dyn_cast(N)) { + VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); + } else + return false; - uint64_t CVal = C->getZExtValue(); - uint32_t Bits; + bool IsInc; + if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) + return false; + AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; + return true; +} - switch (Constraint[0]) { - default: - // FIXME: 'M' and 'N' are MOV pseudo-insts -- unsupported in assembly. 'J' - // is a peculiarly useless SUB constraint. - llvm_unreachable("Unimplemented C_Other constraint"); - case 'I': - if (CVal <= 0xfff) - break; - return; - case 'K': - if (A64Imms::isLogicalImm(32, CVal, Bits)) - break; - return; - case 'L': - if (A64Imms::isLogicalImm(64, CVal, Bits)) - break; - return; - case 'Z': - if (CVal == 0) - break; - return; - } +bool AArch64TargetLowering::getPostIndexedAddressParts( + SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { + EVT VT; + SDValue Ptr; + if (LoadSDNode *LD = dyn_cast(N)) { + VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); + } else if (StoreSDNode *ST = dyn_cast(N)) { + VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); + } else + return false; - Result = DAG.getTargetConstant(CVal, Op.getValueType()); - break; - } - case 'S': { - // An absolute symbolic address or label reference. - if (const GlobalAddressSDNode *GA = dyn_cast(Op)) { - Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), - GA->getValueType(0)); - } else if (const BlockAddressSDNode *BA - = dyn_cast(Op)) { - Result = DAG.getTargetBlockAddress(BA->getBlockAddress(), - BA->getValueType(0)); - } else if (const ExternalSymbolSDNode *ES - = dyn_cast(Op)) { - Result = DAG.getTargetExternalSymbol(ES->getSymbol(), - ES->getValueType(0)); - } else - return; - break; - } - case 'Y': - if (const ConstantFPSDNode *CFP = dyn_cast(Op)) { - if (CFP->isExactlyValue(0.0)) { - Result = DAG.getTargetConstantFP(0.0, CFP->getValueType(0)); - break; - } - } - return; - } + bool IsInc; + if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) + return false; + // Post-indexing updates the base, so it's not a valid transform + // if that's not the same as the load's pointer. + if (Ptr != Base) + return false; + AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; + return true; +} - if (Result.getNode()) { - Ops.push_back(Result); +void AArch64TargetLowering::ReplaceNodeResults( + SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { + switch (N->getOpcode()) { + default: + llvm_unreachable("Don't know how to custom expand this"); + case ISD::FP_TO_UINT: + case ISD::FP_TO_SINT: + assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); + // Let normal code take care of it by not adding anything to Results. 
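+    // (The i128 conversion is expected to become a libcall during
+    // legalization, so leaving Results empty here is deliberate.)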
return; } +} - // It's an unknown constraint for us. Let generic code have a go. - TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { + // Loads and stores less than 128-bits are already atomic; ones above that + // are doomed anyway, so defer to the default libcall and blame the OS when + // things go wrong: + if (StoreInst *SI = dyn_cast(Inst)) + return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128; + else if (LoadInst *LI = dyn_cast(Inst)) + return LI->getType()->getPrimitiveSizeInBits() == 128; + + // For the real atomic operations, we have ldxr/stxr up to 128 bits. + return Inst->getType()->getPrimitiveSizeInBits() <= 128; } -std::pair -AArch64TargetLowering::getRegForInlineAsmConstraint( - const std::string &Constraint, - MVT VT) const { - if (Constraint.size() == 1) { - switch (Constraint[0]) { - case 'r': - if (VT.getSizeInBits() <= 32) - return std::make_pair(0U, &AArch64::GPR32RegClass); - else if (VT == MVT::i64) - return std::make_pair(0U, &AArch64::GPR64RegClass); - break; - case 'w': - if (VT == MVT::f16) - return std::make_pair(0U, &AArch64::FPR16RegClass); - else if (VT == MVT::f32) - return std::make_pair(0U, &AArch64::FPR32RegClass); - else if (VT.getSizeInBits() == 64) - return std::make_pair(0U, &AArch64::FPR64RegClass); - else if (VT.getSizeInBits() == 128) - return std::make_pair(0U, &AArch64::FPR128RegClass); - break; - } +Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Type *ValTy = cast(Addr->getType())->getElementType(); + bool IsAcquire = + Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent; + + // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd + // intrinsic must return {i64, i64} and we have to recombine them into a + // single i128 here. + if (ValTy->getPrimitiveSizeInBits() == 128) { + Intrinsic::ID Int = + IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; + Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); + + Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); + Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); + + Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); + Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); + Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); + Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); + return Builder.CreateOr( + Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); } - // Use the default implementation in TargetLowering to convert the register - // constraint into a member of a register class. - return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + Type *Tys[] = { Addr->getType() }; + Intrinsic::ID Int = + IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; + Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); + + return Builder.CreateTruncOrBitCast( + Builder.CreateCall(Ldxr, Addr), + cast(Addr->getType())->getElementType()); } -/// Represent NEON load and store intrinsics as MemIntrinsicNodes. -/// The associated MachineMemOperands record the alignment specified -/// in the intrinsic calls. 
-bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, - unsigned Intrinsic) const { - switch (Intrinsic) { - case Intrinsic::arm_neon_vld1: - case Intrinsic::arm_neon_vld2: - case Intrinsic::arm_neon_vld3: - case Intrinsic::arm_neon_vld4: - case Intrinsic::aarch64_neon_vld1x2: - case Intrinsic::aarch64_neon_vld1x3: - case Intrinsic::aarch64_neon_vld1x4: - case Intrinsic::arm_neon_vld2lane: - case Intrinsic::arm_neon_vld3lane: - case Intrinsic::arm_neon_vld4lane: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - // Conservatively set memVT to the entire set of vectors loaded. - uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; - Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); - Info.align = cast(AlignArg)->getZExtValue(); - Info.vol = false; // volatile loads with NEON intrinsics not supported - Info.readMem = true; - Info.writeMem = false; - return true; - } - case Intrinsic::arm_neon_vst1: - case Intrinsic::arm_neon_vst2: - case Intrinsic::arm_neon_vst3: - case Intrinsic::arm_neon_vst4: - case Intrinsic::aarch64_neon_vst1x2: - case Intrinsic::aarch64_neon_vst1x3: - case Intrinsic::aarch64_neon_vst1x4: - case Intrinsic::arm_neon_vst2lane: - case Intrinsic::arm_neon_vst3lane: - case Intrinsic::arm_neon_vst4lane: { - Info.opc = ISD::INTRINSIC_VOID; - // Conservatively set memVT to the entire set of vectors stored. - unsigned NumElts = 0; - for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { - Type *ArgTy = I.getArgOperand(ArgI)->getType(); - if (!ArgTy->isVectorTy()) - break; - NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; - } - Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); - Info.align = cast(AlignArg)->getZExtValue(); - Info.vol = false; // volatile stores with NEON intrinsics not supported - Info.readMem = false; - Info.writeMem = true; - return true; - } - default: - break; +Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, + Value *Val, Value *Addr, + AtomicOrdering Ord) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + bool IsRelease = + Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent; + + // Since the intrinsics must have legal type, the i128 intrinsics take two + // parameters: "i64, i64". We must marshal Val into the appropriate form + // before the call. + if (Val->getType()->getPrimitiveSizeInBits() == 128) { + Intrinsic::ID Int = + IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; + Function *Stxr = Intrinsic::getDeclaration(M, Int); + Type *Int64Ty = Type::getInt64Ty(M->getContext()); + + Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); + Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); + Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); + return Builder.CreateCall3(Stxr, Lo, Hi, Addr); } - return false; + Intrinsic::ID Int = + IsRelease ? 
Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; + Type *Tys[] = { Addr->getType() }; + Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); + + return Builder.CreateCall2( + Stxr, Builder.CreateZExtOrBitCast( + Val, Stxr->getFunctionType()->getParamType(0)), + Addr); } diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index e946b25..de16c4d 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -12,364 +12,453 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AARCH64_ISELLOWERING_H -#define LLVM_TARGET_AARCH64_ISELLOWERING_H +#ifndef LLVM_TARGET_AArch64_ISELLOWERING_H +#define LLVM_TARGET_AArch64_ISELLOWERING_H -#include "Utils/AArch64BaseInfo.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/CallingConv.h" #include "llvm/Target/TargetLowering.h" namespace llvm { + namespace AArch64ISD { - enum NodeType { - // Start the numbering from where ISD NodeType finishes. - FIRST_NUMBER = ISD::BUILTIN_OP_END, - - // This is a conditional branch which also notes the flag needed - // (eq/sgt/...). A64 puts this information on the branches rather than - // compares as LLVM does. - BR_CC, - - // A node to be selected to an actual call operation: either BL or BLR in - // the absence of tail calls. - Call, - - // Indicates a floating-point immediate which fits into the format required - // by the FMOV instructions. First (and only) operand is the 8-bit encoded - // value of that immediate. - FPMOV, - - // Corresponds directly to an EXTR instruction. Operands are an LHS an RHS - // and an LSB. - EXTR, - - // Wraps a load from the GOT, which should always be performed with a 64-bit - // load instruction. This prevents the DAG combiner folding a truncate to - // form a smaller memory access. - GOTLoad, - - // Performs a bitfield insert. Arguments are: the value being inserted into; - // the value being inserted; least significant bit changed; width of the - // field. - BFI, - - // Simply a convenient node inserted during ISelLowering to represent - // procedure return. Will almost certainly be selected to "RET". - Ret, - - /// Extracts a field of contiguous bits from the source and sign extends - /// them into a single register. Arguments are: source; immr; imms. Note - /// these are pre-encoded since DAG matching can't cope with combining LSB - /// and Width into these values itself. - SBFX, - - /// This is an A64-ification of the standard LLVM SELECT_CC operation. The - /// main difference is that it only has the values and an A64 condition, - /// which will be produced by a setcc instruction. - SELECT_CC, - - /// This serves most of the functions of the LLVM SETCC instruction, for two - /// purposes. First, it prevents optimisations from fiddling with the - /// compare after we've moved the CondCode information onto the SELECT_CC or - /// BR_CC instructions. Second, it gives a legal instruction for the actual - /// comparison. - /// - /// It keeps a record of the condition flags asked for because certain - /// instructions are only valid for a subset of condition codes. - SETCC, - - // Designates a node which is a tail call: both a call and a return - // instruction as far as selction is concerned. It should be selected to an - // unconditional branch. 
Has the usual plethora of call operands, but: 1st - // is callee, 2nd is stack adjustment required immediately before branch. - TC_RETURN, - - // Designates a call used to support the TLS descriptor ABI. The call itself - // will be indirect ("BLR xN") but a relocation-specifier (".tlsdesccall - // var") must be attached somehow during code generation. It takes two - // operands: the callee and the symbol to be relocated against. - TLSDESCCALL, - - // Leaf node which will be lowered to an appropriate MRS to obtain the - // thread pointer: TPIDR_EL0. - THREAD_POINTER, - - /// Extracts a field of contiguous bits from the source and zero extends - /// them into a single register. Arguments are: source; immr; imms. Note - /// these are pre-encoded since DAG matching can't cope with combining LSB - /// and Width into these values itself. - UBFX, - - // Wraps an address which the ISelLowering phase has decided should be - // created using the large memory model style: i.e. a sequence of four - // movz/movk instructions. - WrapperLarge, - - // Wraps an address which the ISelLowering phase has decided should be - // created using the small memory model style: i.e. adrp/add or - // adrp/mem-op. This exists to prevent bare TargetAddresses which may never - // get selected. - WrapperSmall, - - // Vector move immediate - NEON_MOVIMM, - - // Vector Move Inverted Immediate - NEON_MVNIMM, - - // Vector FP move immediate - NEON_FMOVIMM, - - // Vector permute - NEON_UZP1, - NEON_UZP2, - NEON_ZIP1, - NEON_ZIP2, - NEON_TRN1, - NEON_TRN2, - - // Vector Element reverse - NEON_REV64, - NEON_REV32, - NEON_REV16, - - // Vector compare - NEON_CMP, - - // Vector compare zero - NEON_CMPZ, - - // Vector compare bitwise test - NEON_TST, - - // Vector saturating shift - NEON_QSHLs, - NEON_QSHLu, - - // Vector dup - NEON_VDUP, - - // Vector dup by lane - NEON_VDUPLANE, - - // Vector extract - NEON_VEXTRACT, - - // NEON duplicate lane loads - NEON_LD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, - NEON_LD3DUP, - NEON_LD4DUP, - - // NEON loads with post-increment base updates: - NEON_LD1_UPD, - NEON_LD2_UPD, - NEON_LD3_UPD, - NEON_LD4_UPD, - NEON_LD1x2_UPD, - NEON_LD1x3_UPD, - NEON_LD1x4_UPD, - - // NEON stores with post-increment base updates: - NEON_ST1_UPD, - NEON_ST2_UPD, - NEON_ST3_UPD, - NEON_ST4_UPD, - NEON_ST1x2_UPD, - NEON_ST1x3_UPD, - NEON_ST1x4_UPD, - - // NEON duplicate lane loads with post-increment base updates: - NEON_LD2DUP_UPD, - NEON_LD3DUP_UPD, - NEON_LD4DUP_UPD, - - // NEON lane loads with post-increment base updates: - NEON_LD2LN_UPD, - NEON_LD3LN_UPD, - NEON_LD4LN_UPD, - - // NEON lane store with post-increment base updates: - NEON_ST2LN_UPD, - NEON_ST3LN_UPD, - NEON_ST4LN_UPD - }; -} +enum { + FIRST_NUMBER = ISD::BUILTIN_OP_END, + WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. + CALL, // Function call. + + // Almost the same as a normal call node, except that a TLSDesc relocation is + // needed so the linker can relax it correctly if possible. + TLSDESC_CALL, + ADRP, // Page address of a TargetGlobalAddress operand. + ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. + LOADgot, // Load from automatically generated descriptor (e.g. Global + // Offset Table, TLS record). + RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand. + BRCOND, // Conditional branch instruction; "b.cond". + CSEL, + FCSEL, // Conditional move instruction. + CSINV, // Conditional select invert. + CSNEG, // Conditional select negate. + CSINC, // Conditional select increment. 
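+  // For reference, these compute: CSEL d = c ? n : m; CSINV d = c ? n : ~m;
+  // CSNEG d = c ? n : -m; CSINC d = c ? n : m + 1.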
+ + // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on + // ELF. + THREAD_POINTER, + ADC, + SBC, // adc, sbc instructions + + // Arithmetic instructions which write flags. + ADDS, + SUBS, + ADCS, + SBCS, + ANDS, + + // Floating point comparison + FCMP, + + // Floating point max and min instructions. + FMAX, + FMIN, + + // Scalar extract + EXTR, + + // Scalar-to-vector duplication + DUP, + DUPLANE8, + DUPLANE16, + DUPLANE32, + DUPLANE64, + + // Vector immedate moves + MOVI, + MOVIshift, + MOVIedit, + MOVImsl, + FMOV, + MVNIshift, + MVNImsl, + + // Vector immediate ops + BICi, + ORRi, + + // Vector bit select: similar to ISD::VSELECT but not all bits within an + // element must be identical. + BSL, + + // Vector arithmetic negation + NEG, + + // Vector shuffles + ZIP1, + ZIP2, + UZP1, + UZP2, + TRN1, + TRN2, + REV16, + REV32, + REV64, + EXT, + + // Vector shift by scalar + VSHL, + VLSHR, + VASHR, + + // Vector shift by scalar (again) + SQSHL_I, + UQSHL_I, + SQSHLU_I, + SRSHR_I, + URSHR_I, + + // Vector comparisons + CMEQ, + CMGE, + CMGT, + CMHI, + CMHS, + FCMEQ, + FCMGE, + FCMGT, + + // Vector zero comparisons + CMEQz, + CMGEz, + CMGTz, + CMLEz, + CMLTz, + FCMEQz, + FCMGEz, + FCMGTz, + FCMLEz, + FCMLTz, + + // Vector bitwise negation + NOT, + + // Vector bitwise selection + BIT, + + // Compare-and-branch + CBZ, + CBNZ, + TBZ, + TBNZ, + + // Tail calls + TC_RETURN, + + // Custom prefetch handling + PREFETCH, + + // {s|u}int to FP within a FP register. + SITOF, + UITOF, + + // NEON Load/Store with post-increment base updates + LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, + LD3post, + LD4post, + ST2post, + ST3post, + ST4post, + LD1x2post, + LD1x3post, + LD1x4post, + ST1x2post, + ST1x3post, + ST1x4post, + LD1DUPpost, + LD2DUPpost, + LD3DUPpost, + LD4DUPpost, + LD1LANEpost, + LD2LANEpost, + LD3LANEpost, + LD4LANEpost, + ST2LANEpost, + ST3LANEpost, + ST4LANEpost +}; + +} // end namespace AArch64ISD class AArch64Subtarget; class AArch64TargetMachine; class AArch64TargetLowering : public TargetLowering { + bool RequireStrictAlign; + public: explicit AArch64TargetLowering(AArch64TargetMachine &TM); - const char *getTargetNodeName(unsigned Opcode) const; + /// Selects the correct CCAssignFn for a the given CallingConvention + /// value. + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; + + /// computeKnownBitsForTargetNode - Determine which of the bits specified in + /// Mask are known to be either zero or one and return them in the + /// KnownZero/KnownOne bitsets. + void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, + APInt &KnownOne, const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + MVT getScalarShiftAmountTy(EVT LHSTy) const override; + + /// allowsUnalignedMemoryAccesses - Returns true if the target allows + /// unaligned memory accesses. of the specified type. + bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, + bool *Fast = nullptr) const override { + if (RequireStrictAlign) + return false; + // FIXME: True for Cyclone, but not necessary others. + if (Fast) + *Fast = true; + return true; + } - CCAssignFn *CCAssignFnForNode(CallingConv::ID CC) const; + /// LowerOperation - Provide custom lowering hooks for some operations. 
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - SDValue LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const; + const char *getTargetNodeName(unsigned Opcode) const override; - SDValue LowerReturn(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc dl, SelectionDAG &DAG) const; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - virtual unsigned getByValTypeAlignment(Type *Ty) const override; + /// getFunctionAlignment - Return the Log2 alignment of this function. + unsigned getFunctionAlignment(const Function *F) const; - SDValue LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const; + /// getMaximalGlobalOffset - Returns the maximal possible offset which can + /// be used for loads / stores from the global. + unsigned getMaximalGlobalOffset() const override; - SDValue LowerCallResult(SDValue Chain, SDValue InFlag, - CallingConv::ID CallConv, bool IsVarArg, - const SmallVectorImpl &Ins, - SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const; + /// Returns true if a cast between SrcAS and DestAS is a noop. + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + // Addrspacecasts are always noops. + return true; + } - SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; + /// createFastISel - This method returns a target specific FastISel object, + /// or null if the target does not support "fast" ISel. + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const override; - bool isConcatVector(SDValue Op, SelectionDAG &DAG, SDValue V0, SDValue V1, - const int *Mask, SDValue &Res) const; + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; - bool isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, SDValue &V0, - SDValue &V1, int *Mask) const; + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, - const AArch64Subtarget *ST) const; + /// isShuffleMaskLegal - Return true if the given shuffle mask can be + /// codegen'd directly, or if it should be stack expanded. + bool isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const override; - SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + /// getSetCCResultType - Return the ISD::SETCC ValueType + EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; - void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, - SDValue &Chain) const; + SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; - /// IsEligibleForTailCallOptimization - Check whether the call is eligible - /// for tail call optimization. Targets which want to do tail call - /// optimization should implement this function. - bool IsEligibleForTailCallOptimization(SDValue Callee, - CallingConv::ID CalleeCC, - bool IsVarArg, - bool IsCalleeStructRet, - bool IsCallerStructRet, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, - SelectionDAG& DAG) const; + MachineBasicBlock *EmitF128CSEL(MachineInstr *MI, + MachineBasicBlock *BB) const; - /// Finds the incoming stack arguments which overlap the given fixed stack - /// object and incorporates their load into the current chain. 
This prevents - /// an upcoming store from clobbering the stack argument before it's used. - SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, - MachineFrameInfo *MFI, int ClobberedFI) const; + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const override; - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const; + bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + unsigned Intrinsic) const override; - bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; + bool isTruncateFree(Type *Ty1, Type *Ty2) const override; + bool isTruncateFree(EVT VT1, EVT VT2) const override; - bool IsTailCallConvention(CallingConv::ID CallCC) const; + bool isZExtFree(Type *Ty1, Type *Ty2) const override; + bool isZExtFree(EVT VT1, EVT VT2) const override; + bool isZExtFree(SDValue Val, EVT VT2) const override; - SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + bool hasPairedLoad(Type *LoadedType, + unsigned &RequiredAligment) const override; + bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; - bool isLegalICmpImmediate(int64_t Val) const; - SDValue getSelectableIntSetCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &A64cc, SelectionDAG &DAG, SDLoc &dl) const; + bool isLegalAddImmediate(int64_t) const override; + bool isLegalICmpImmediate(int64_t) const override; - virtual MachineBasicBlock * - EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const; + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + MachineFunction &MF) const override; - MachineBasicBlock * - emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *MBB, - unsigned Size, unsigned Opcode) const; + /// isLegalAddressingMode - Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. + bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; - MachineBasicBlock * - emitAtomicBinaryMinMax(MachineInstr *MI, MachineBasicBlock *BB, - unsigned Size, unsigned CmpOp, - A64CC::CondCodes Cond) const; - MachineBasicBlock * - emitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *BB, - unsigned Size) const; + /// \brief Return the cost of the scaling factor used in the addressing + /// mode represented by AM for this target, for a load/store + /// of the specified type. + /// If the AM is supported, the return value must be >= 0. + /// If the AM is not supported, it returns a negative value. + int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override; - MachineBasicBlock * - EmitF128CSEL(MachineInstr *MI, MachineBasicBlock *MBB) const; + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster + /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be + /// expanded to FMAs when this method returns true, otherwise fmuladd is + /// expanded to fmul + fadd. 
+ bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; - SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerF128ToCall(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const; - SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, bool IsSigned) const; - SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; - SDValue LowerGlobalAddressELFSmall(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalAddressELFLarge(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const; + /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask. + bool isDesirableToCommuteWithShift(const SDNode *N) const override; - SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + /// \brief Returns true if it is beneficial to convert a load of a constant + /// to just the constant itself. + bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; - SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL, - SelectionDAG &DAG) const; - SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool IsSigned) const; - SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const override; + Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, + Value *Addr, AtomicOrdering Ord) const override; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + bool shouldExpandAtomicInIR(Instruction *Inst) const override; - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster - /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be - /// expanded to FMAs when this method returns true, otherwise fmuladd is - /// expanded to fmul + fadd. - virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const; +private: + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when generating code for different targets. 
+ const AArch64Subtarget *Subtarget; - ConstraintType getConstraintType(const std::string &Constraint) const; + void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT); + void addDRTypeForNEON(MVT VT); + void addQRTypeForNEON(MVT VT); - ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, - const char *Constraint) const; - void LowerAsmOperandForConstraint(SDValue Op, - std::string &Constraint, - std::vector &Ops, - SelectionDAG &DAG) const; + SDValue + LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, + SelectionDAG &DAG, + SmallVectorImpl &InVals) const override; - std::pair - getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const; + SDValue LowerCall(CallLoweringInfo & /*CLI*/, + SmallVectorImpl &InVals) const override; - virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, - unsigned Intrinsic) const override; + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, + SelectionDAG &DAG, SmallVectorImpl &InVals, + bool isThisReturn, SDValue ThisVal) const; + + bool isEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, SelectionDAG &DAG) const; -protected: - std::pair - findRepresentativeClass(MVT VT) const; + /// Finds the incoming stack arguments which overlap the given fixed stack + /// object and incorporates their load into the current chain. This prevents + /// an upcoming store from clobbering the stack argument before it's used. + SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, + MachineFrameInfo *MFI, int ClobberedFI) const; -private: - const InstrItineraryData *Itins; + bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; - const AArch64Subtarget *getSubtarget() const { - return &getTargetMachine().getSubtarget(); - } -}; -enum NeonModImmType { - Neon_Mov_Imm, - Neon_Mvn_Imm + bool IsTailCallConvention(CallingConv::ID CallCC) const; + + void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, + SDValue &Chain) const; + + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl &Outs, + LLVMContext &Context) const override; + + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, SDLoc DL, + SelectionDAG &DAG) const override; + + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL, + SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue 
LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const; + SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; + + ConstraintType + getConstraintType(const std::string &Constraint) const override; + unsigned getRegisterByName(const char* RegName, EVT VT) const override; + + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. 
+ ConstraintWeight + getSingleConstraintMatchWeight(AsmOperandInfo &info, + const char *constraint) const override; + + std::pair + getRegForInlineAsmConstraint(const std::string &Constraint, + MVT VT) const override; + void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, + std::vector &Ops, + SelectionDAG &DAG) const override; + + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; + bool mayBeEmittedAsTailCall(CallInst *CI) const override; + bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, bool &IsInc, + SelectionDAG &DAG) const; + bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; + bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, + SDValue &Offset, ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; + + void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, + SelectionDAG &DAG) const override; }; -extern SDValue ScanBUILD_VECTOR(SDValue Op, bool &isOnlyLowElement, - bool &usesOnlyOneValue, bool &hasDominantValue, - bool &isConstant, bool &isUNDEF); -} // namespace llvm +namespace AArch64 { +FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo); +} // end namespace AArch64 + +} // end namespace llvm -#endif // LLVM_TARGET_AARCH64_ISELLOWERING_H +#endif // LLVM_TARGET_AArch64_ISELLOWERING_H diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td new file mode 100644 index 0000000..3b9e3c6 --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -0,0 +1,364 @@ +//=- AArch64InstrAtomics.td - AArch64 Atomic codegen support -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// AArch64 Atomic operand code-gen constructs. +// +//===----------------------------------------------------------------------===// + +//===---------------------------------- +// Atomic fences +//===---------------------------------- +def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>; +def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; + +//===---------------------------------- +// Atomic loads +//===---------------------------------- + +// When they're actually atomic, only one addressing mode (GPR64sp) is +// supported, but when they're relaxed and anything can be used, all the +// standard modes would be valid and may give efficiency gains. + +// A atomic load operation that actually needs acquire semantics. +class acquiring_load + : PatFrag<(ops node:$ptr), (base node:$ptr), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + assert(Ordering != AcquireRelease && "unexpected load ordering"); + return Ordering == Acquire || Ordering == SequentiallyConsistent; +}]>; + +// An atomic load operation that does not need either acquire or release +// semantics. 
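+// (Such loads can use the ordinary LDR/LDUR patterns below, so every
+// register-offset and immediate addressing mode stays available.)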
+class relaxed_load + : PatFrag<(ops node:$ptr), (base node:$ptr), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + return Ordering == Monotonic || Ordering == Unordered; +}]>; + +// 8-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend8:$offset)), + (LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)>; +def : Pat<(relaxed_load (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend8:$offset)), + (LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$offset)>; +def : Pat<(relaxed_load (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(relaxed_load + (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; + +// 16-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend)), + (LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; +def : Pat<(relaxed_load (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend)), + (LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; +def : Pat<(relaxed_load (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)), + (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(relaxed_load + (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), + (LDURHHi GPR64sp:$Rn, simm9:$offset)>; + +// 32-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend)), + (LDRWroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; +def : Pat<(relaxed_load (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend)), + (LDRWroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; +def : Pat<(relaxed_load (am_indexed32 GPR64sp:$Rn, + uimm12s4:$offset)), + (LDRWui GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(relaxed_load + (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), + (LDURWi GPR64sp:$Rn, simm9:$offset)>; + +// 64-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend)), + (LDRXroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; +def : Pat<(relaxed_load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend)), + (LDRXroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +def : Pat<(relaxed_load (am_indexed64 GPR64sp:$Rn, + uimm12s8:$offset)), + (LDRXui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(relaxed_load + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (LDURXi GPR64sp:$Rn, simm9:$offset)>; + +//===---------------------------------- +// Atomic stores +//===---------------------------------- + +// When they're actually atomic, only one addressing mode (GPR64sp) is +// supported, but when they're relaxed and anything can be used, all the +// standard modes would be valid and may give efficiency gains. + +// A store operation that actually needs release semantics. +class releasing_store + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + assert(Ordering != AcquireRelease && "unexpected store ordering"); + return Ordering == Release || Ordering == SequentiallyConsistent; +}]>; + +// An atomic store operation that doesn't actually need to be atomic on AArch64. 
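+// (These select plain STR/STUR forms; only release and seq_cst stores need
+// the STLR instruction.)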
+class relaxed_store + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + return Ordering == Monotonic || Ordering == Unordered; +}]>; + +// 8-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), + (STLRB GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store + (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend), + GPR32:$val), + (STRBBroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend)>; +def : Pat<(relaxed_store + (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend), + GPR32:$val), + (STRBBroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend)>; +def : Pat<(relaxed_store + (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset), GPR32:$val), + (STRBBui GPR32:$val, GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(relaxed_store + (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURBBi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 16-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), + (STLRH GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend), + GPR32:$val), + (STRHHroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; +def : Pat<(relaxed_store (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend), + GPR32:$val), + (STRHHroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; +def : Pat<(relaxed_store + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset), GPR32:$val), + (STRHHui GPR32:$val, GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(relaxed_store + (am_unscaled16 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURHHi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 32-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), + (STLRW GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend), + GPR32:$val), + (STRWroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; +def : Pat<(relaxed_store (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend), + GPR32:$val), + (STRWroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; +def : Pat<(relaxed_store + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), GPR32:$val), + (STRWui GPR32:$val, GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(relaxed_store + (am_unscaled32 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURWi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 64-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR64:$val), + (STLRX GPR64:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend), + GPR64:$val), + (STRXroW GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; +def : Pat<(relaxed_store (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend), + GPR64:$val), + (STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +def : Pat<(relaxed_store + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), GPR64:$val), + (STRXui GPR64:$val, GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(relaxed_store + (am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val), + (STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>; + +//===---------------------------------- +// Low-level exclusive operations +//===---------------------------------- + +// Load-exclusives. 
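+// (int_aarch64_ldxr produces an i64 result whatever the access width; the
+// PatFrags below recover the real width from the node's memory VT so each
+// size selects the matching LDXRB/LDXRH/LDXRW/LDXRX.)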
+ +def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + +def : Pat<(ldxr_1 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_2 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_4 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_8 GPR64sp:$addr), (LDXRX GPR64sp:$addr)>; + +def : Pat<(and (ldxr_1 GPR64sp:$addr), 0xff), + (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldxr_2 GPR64sp:$addr), 0xffff), + (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff), + (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>; + +// Load-exclusives. + +def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + +def : Pat<(ldaxr_1 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_2 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_4 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_8 GPR64sp:$addr), (LDAXRX GPR64sp:$addr)>; + +def : Pat<(and (ldaxr_1 GPR64sp:$addr), 0xff), + (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldaxr_2 GPR64sp:$addr), 0xffff), + (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff), + (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>; + +// Store-exclusives. 
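+// (The zext/and patterns below additionally fold away the masking that
+// IR-level truncation to the stored width leaves on the value operand.)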
+ +def stxr_1 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def stxr_2 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def stxr_4 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def stxr_8 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + + +def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr), + (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_2 GPR64:$val, GPR64sp:$addr), + (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_4 GPR64:$val, GPR64sp:$addr), + (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_8 GPR64:$val, GPR64sp:$addr), + (STXRX GPR64:$val, GPR64sp:$addr)>; + +def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr), + (STXRB GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr), + (STXRH GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stxr_4 (zext GPR32:$val), GPR64sp:$addr), + (STXRW GPR32:$val, GPR64sp:$addr)>; + +def : Pat<(stxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr), + (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr), + (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), + (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; + +// Store-release-exclusives. + +def stlxr_1 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def stlxr_2 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def stlxr_4 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def stlxr_8 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + + +def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr), + (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_2 GPR64:$val, GPR64sp:$addr), + (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_4 GPR64:$val, GPR64sp:$addr), + (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_8 GPR64:$val, GPR64sp:$addr), + (STLXRX GPR64:$val, GPR64sp:$addr)>; + +def : Pat<(stlxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr), + (STLXRB GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stlxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr), + (STLXRH GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stlxr_4 (zext GPR32:$val), GPR64sp:$addr), + (STLXRW GPR32:$val, GPR64sp:$addr)>; + +def : Pat<(stlxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr), + (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr), + (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), + (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; + + +// And clear exclusive. 
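+// (CLREX clears the local exclusive monitor; #15 is the assembler's
+// canonical CRm immediate for it.)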
+ +def : Pat<(int_aarch64_clrex), (CLREX 0xf)>; diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 4cc3813..d455d7e 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -1,4 +1,4 @@ -//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tablegen -*-=// +//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===// // // The LLVM Compiler Infrastructure // @@ -6,1482 +6,8569 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// This file describes AArch64 instruction formats, down to the level of the -// instruction's overall class. -//===----------------------------------------------------------------------===// - //===----------------------------------------------------------------------===// -// A64 Instruction Format Definitions. -//===----------------------------------------------------------------------===// +// Describe AArch64 instructions format here +// -// A64 is currently the only instruction set supported by the AArch64 -// architecture. -class A64Inst patterns, - InstrItinClass itin> - : Instruction { - // All A64 instructions are 32-bit. This field will be filled in - // gradually going down the hierarchy. - field bits<32> Inst; +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. +class Format val> { + bits<2> Value = val; +} + +def PseudoFrm : Format<0>; +def NormalFrm : Format<1>; // Do we need any others? +// AArch64 Instruction Format +class AArch64Inst : Instruction { + field bits<32> Inst; // Instruction encoding. + // Mask of bits that cause an encoding to be UNPREDICTABLE. + // If a bit is set, then if the corresponding bit in the + // target encoding differs from its value in the "Inst" field, + // the instruction is UNPREDICTABLE (SoftFail in abstract parlance). field bits<32> Unpredictable = 0; // SoftFail is the generic name for this field, but we alias it so // as to make it more obvious what it means in ARM-land. field bits<32> SoftFail = Unpredictable; - - // LLVM-level model of the AArch64/A64 distinction. - let Namespace = "AArch64"; - let DecoderNamespace = "A64"; - let Size = 4; - - // Set the templated fields - let OutOperandList = outs; - let InOperandList = ins; - let AsmString = asmstr; - let Pattern = patterns; - let Itinerary = itin; + let Namespace = "AArch64"; + Format F = f; + bits<2> Form = F.Value; + let Pattern = []; + let Constraints = cstr; } -class PseudoInst patterns> : Instruction { - let Namespace = "AArch64"; - - let OutOperandList = outs; - let InOperandList= ins; - let Pattern = patterns; - let isCodeGenOnly = 1; - let isPseudo = 1; +// Pseudo instructions (don't have encoding information) +class Pseudo pattern, string cstr = ""> + : AArch64Inst { + dag OutOperandList = oops; + dag InOperandList = iops; + let Pattern = pattern; + let isCodeGenOnly = 1; } -// Represents a pseudo-instruction that represents a single A64 instruction for -// whatever reason, the eventual result will be a 32-bit real instruction. -class A64PseudoInst patterns> - : PseudoInst { +// Real instructions (have encoding information) +class EncodedI pattern> : AArch64Inst { + let Pattern = pattern; let Size = 4; } -// As above, this will be a single A64 instruction, but we can actually give the -// expansion in TableGen. 
-class A64PseudoExpand patterns, dag Result> - : A64PseudoInst, - PseudoInstExpansion; +// Normal instructions +class I pattern> + : EncodedI { + dag OutOperandList = oops; + dag InOperandList = iops; + let AsmString = !strconcat(asm, operands); +} +class TriOpFrag : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; +class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; +class UnOpFrag : PatFrag<(ops node:$LHS), res>; -// First, some common cross-hierarchy register formats. +// Helper fragment for an extract of the high portion of a 128-bit vector. +def extract_high_v16i8 : + UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>; +def extract_high_v8i16 : + UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>; +def extract_high_v4i32 : + UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>; +def extract_high_v2i64 : + UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>; -class A64InstRd patterns, InstrItinClass itin> - : A64Inst { - bits<5> Rd; +//===----------------------------------------------------------------------===// +// Asm Operand Classes. +// - let Inst{4-0} = Rd; +// Shifter operand for arithmetic shifted encodings. +def ShifterOperand : AsmOperandClass { + let Name = "Shifter"; } -class A64InstRt patterns, InstrItinClass itin> - : A64Inst { - bits<5> Rt; - - let Inst{4-0} = Rt; +// Shifter operand for mov immediate encodings. +def MovImm32ShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MovImm32Shifter"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "InvalidMovImm32Shift"; +} +def MovImm64ShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MovImm64Shifter"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "InvalidMovImm64Shift"; } +// Shifter operand for arithmetic register shifted encodings. +class ArithmeticShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "ArithmeticShifter" # width; + let PredicateMethod = "isArithmeticShifter<" # width # ">"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "AddSubRegShift" # width; +} -class A64InstRdn patterns, InstrItinClass itin> - : A64InstRd { - // Inherit rdt - bits<5> Rn; +def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>; +def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>; - let Inst{9-5} = Rn; +// Shifter operand for logical register shifted encodings. +class LogicalShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "LogicalShifter" # width; + let PredicateMethod = "isLogicalShifter<" # width # ">"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "AddSubRegShift" # width; } -class A64InstRtn patterns, InstrItinClass itin> - : A64InstRt { - // Inherit rdt - bits<5> Rn; +def LogicalShifterOperand32 : LogicalShifterOperand<32>; +def LogicalShifterOperand64 : LogicalShifterOperand<64>; - let Inst{9-5} = Rn; +// Shifter operand for logical vector 128/64-bit shifted encodings. 
+def LogicalVecShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "LogicalVecShifter"; + let RenderMethod = "addShifterOperands"; +} +def LogicalVecHalfWordShifterOperand : AsmOperandClass { + let SuperClasses = [LogicalVecShifterOperand]; + let Name = "LogicalVecHalfWordShifter"; + let RenderMethod = "addShifterOperands"; } -// Instructions taking Rt,Rt2,Rn -class A64InstRtt2n patterns, InstrItinClass itin> - : A64InstRtn { - bits<5> Rt2; - - let Inst{14-10} = Rt2; +// The "MSL" shifter on the vector MOVI instruction. +def MoveVecShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MoveVecShifter"; + let RenderMethod = "addShifterOperands"; } -class A64InstRdnm patterns, InstrItinClass itin> - : A64InstRdn { - bits<5> Rm; +// Extend operand for arithmetic encodings. +def ExtendOperand : AsmOperandClass { + let Name = "Extend"; + let DiagnosticType = "AddSubRegExtendLarge"; +} +def ExtendOperand64 : AsmOperandClass { + let SuperClasses = [ExtendOperand]; + let Name = "Extend64"; + let DiagnosticType = "AddSubRegExtendSmall"; +} +// 'extend' that's a lsl of a 64-bit register. +def ExtendOperandLSL64 : AsmOperandClass { + let SuperClasses = [ExtendOperand]; + let Name = "ExtendLSL64"; + let RenderMethod = "addExtend64Operands"; + let DiagnosticType = "AddSubRegExtendLarge"; +} - let Inst{20-16} = Rm; +// 8-bit floating-point immediate encodings. +def FPImmOperand : AsmOperandClass { + let Name = "FPImm"; + let ParserMethod = "tryParseFPImm"; + let DiagnosticType = "InvalidFPImm"; } -class A64InstRtnm patterns, InstrItinClass itin> - : A64InstRtn { - bits<5> Rm; +def CondCode : AsmOperandClass { + let Name = "CondCode"; + let DiagnosticType = "InvalidCondCode"; +} - let Inst{20-16} = Rm; +// A 32-bit register pasrsed as 64-bit +def GPR32as64Operand : AsmOperandClass { + let Name = "GPR32as64"; +} +def GPR32as64 : RegisterOperand { + let ParserMatchClass = GPR32as64Operand; } -//===----------------------------------------------------------------------===// -// -// Actual A64 Instruction Formats -// +// 8-bit immediate for AdvSIMD where 64-bit values of the form: +// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh +// are encoded as the eight bit value 'abcdefgh'. +def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; } -// Format for Add-subtract (extended register) instructions. -class A64I_addsubext opt, bits<3> option, - dag outs, dag ins, string asmstr, list patterns, - InstrItinClass itin> - : A64InstRdnm { - bits<3> Imm3; - - let Inst{31} = sf; - let Inst{30} = op; - let Inst{29} = S; - let Inst{28-24} = 0b01011; - let Inst{23-22} = opt; - let Inst{21} = 0b1; - // Rm inherited in 20-16 - let Inst{15-13} = option; - let Inst{12-10} = Imm3; - // Rn inherited in 9-5 - // Rd inherited in 4-0 -} - -// Format for Add-subtract (immediate) instructions. -class A64I_addsubimm shift, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bits<12> Imm12; - let Inst{31} = sf; - let Inst{30} = op; - let Inst{29} = S; - let Inst{28-24} = 0b10001; - let Inst{23-22} = shift; - let Inst{21-10} = Imm12; -} - -// Format for Add-subtract (shifted register) instructions. 
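[Aside: the SImm7Scaled class just above is instantiated for scales 4, 8 and 16 below; it models the LDP/STP-style offsets. A small C++ sketch of the check and field encoding it describes; encodeSImm7Scaled is a hypothetical helper:]

    #include <cstdint>
    #include <optional>

    // simm7sN: the byte offset must be a multiple of the access size
    // (Scale = 4, 8 or 16) in [-64*Scale, 63*Scale]; the hardware field
    // stores offset/Scale as a 7-bit two's-complement value.
    std::optional<int8_t> encodeSImm7Scaled(int64_t Offset, int64_t Scale) {
      if (Offset % Scale != 0)
        return std::nullopt;
      int64_t Scaled = Offset / Scale;
      if (Scaled < -64 || Scaled > 63)
        return std::nullopt;
      return static_cast<int8_t>(Scaled); // low 7 bits are the field
    }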
-class A64I_addsubshift shift, - dag outs, dag ins, string asmstr, list patterns, - InstrItinClass itin> - : A64InstRdnm { - bits<6> Imm6; - - let Inst{31} = sf; - let Inst{30} = op; - let Inst{29} = S; - let Inst{28-24} = 0b01011; - let Inst{23-22} = shift; - let Inst{21} = 0b0; - // Rm inherited in 20-16 - let Inst{15-10} = Imm6; - // Rn inherited in 9-5 - // Rd inherited in 4-0 -} - -// Format for Add-subtract (with carry) instructions. -class A64I_addsubcarry opcode2, - dag outs, dag ins, string asmstr, list patterns, - InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = sf; - let Inst{30} = op; - let Inst{29} = S; - let Inst{28-21} = 0b11010000; - // Rm inherited in 20-16 - let Inst{15-10} = opcode2; - // Rn inherited in 9-5 - // Rd inherited in 4-0 -} - - -// Format for Bitfield instructions -class A64I_bitfield opc, bit n, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bits<6> ImmR; - bits<6> ImmS; +//===----------------------------------------------------------------------===// +// Operand Definitions. +// - let Inst{31} = sf; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100110; - let Inst{22} = n; - let Inst{21-16} = ImmR; - let Inst{15-10} = ImmS; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +// ADR[P] instruction labels. +def AdrpOperand : AsmOperandClass { + let Name = "AdrpLabel"; + let ParserMethod = "tryParseAdrpLabel"; + let DiagnosticType = "InvalidLabel"; +} +def adrplabel : Operand { + let EncoderMethod = "getAdrLabelOpValue"; + let PrintMethod = "printAdrpLabel"; + let ParserMatchClass = AdrpOperand; } -// Format for compare and branch (immediate) instructions. -class A64I_cmpbr patterns, InstrItinClass itin> - : A64InstRt { - bits<19> Label; - - let Inst{31} = sf; - let Inst{30-25} = 0b011010; - let Inst{24} = op; - let Inst{23-5} = Label; - // Inherit Rt in 4-0 +def AdrOperand : AsmOperandClass { + let Name = "AdrLabel"; + let ParserMethod = "tryParseAdrLabel"; + let DiagnosticType = "InvalidLabel"; +} +def adrlabel : Operand { + let EncoderMethod = "getAdrLabelOpValue"; + let ParserMatchClass = AdrOperand; } -// Format for conditional branch (immediate) instructions. -class A64I_condbr patterns, InstrItinClass itin> - : A64Inst { - bits<19> Label; - bits<4> Cond; +// simm9 predicate - True if the immediate is in the range [-256, 255]. +def SImm9Operand : AsmOperandClass { + let Name = "SImm9"; + let DiagnosticType = "InvalidMemoryIndexedSImm9"; +} +def simm9 : Operand, ImmLeaf= -256 && Imm < 256; }]> { + let ParserMatchClass = SImm9Operand; +} - let Inst{31-25} = 0b0101010; - let Inst{24} = o1; - let Inst{23-5} = Label; - let Inst{4} = o0; - let Inst{3-0} = Cond; +// simm7sN predicate - True if the immediate is a multiple of N in the range +// [-64 * N, 63 * N]. +class SImm7Scaled : AsmOperandClass { + let Name = "SImm7s" # Scale; + let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm7"; } -// Format for conditional compare (immediate) instructions. 
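[Aside: the SIMDImmType10 operand introduced earlier ("aaaaaaaa bbbbbbbb ... hhhhhhhh encoded as 'abcdefgh'") expands each bit of the 8-bit immediate into a full byte of the 64-bit value. A self-contained C++ sketch of both directions; function names are illustrative:]

    #include <cstdint>
    #include <optional>

    // Expand 'abcdefgh' so bit i of imm8 becomes byte i of the result.
    uint64_t expandImmType10(uint8_t imm8) {
      uint64_t v = 0;
      for (int i = 0; i < 8; ++i)
        if (imm8 & (1u << i))
          v |= 0xffULL << (8 * i);
      return v;
    }

    // The inverse: encodable iff every byte is 0x00 or 0xff.
    std::optional<uint8_t> encodeImmType10(uint64_t v) {
      uint8_t imm8 = 0;
      for (int i = 0; i < 8; ++i) {
        uint8_t byte = (v >> (8 * i)) & 0xff;
        if (byte == 0xff)
          imm8 |= 1u << i;
        else if (byte != 0x00)
          return std::nullopt;
      }
      return imm8;
    }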
-class A64I_condcmpimm patterns, InstrItinClass itin> - : A64Inst { - bits<5> Rn; - bits<5> UImm5; - bits<4> NZCVImm; - bits<4> Cond; +def SImm7s4Operand : SImm7Scaled<4>; +def SImm7s8Operand : SImm7Scaled<8>; +def SImm7s16Operand : SImm7Scaled<16>; - let Inst{31} = sf; - let Inst{30} = op; - let Inst{29} = s; - let Inst{28-21} = 0b11010010; - let Inst{20-16} = UImm5; - let Inst{15-12} = Cond; - let Inst{11} = 0b1; - let Inst{10} = o2; - let Inst{9-5} = Rn; - let Inst{4} = o3; - let Inst{3-0} = NZCVImm; +def simm7s4 : Operand { + let ParserMatchClass = SImm7s4Operand; + let PrintMethod = "printImmScale<4>"; } -// Format for conditional compare (register) instructions. -class A64I_condcmpreg patterns, InstrItinClass itin> - : A64Inst { - bits<5> Rn; - bits<5> Rm; - bits<4> NZCVImm; - bits<4> Cond; +def simm7s8 : Operand { + let ParserMatchClass = SImm7s8Operand; + let PrintMethod = "printImmScale<8>"; +} +def simm7s16 : Operand { + let ParserMatchClass = SImm7s16Operand; + let PrintMethod = "printImmScale<16>"; +} - let Inst{31} = sf; - let Inst{30} = op; - let Inst{29} = s; - let Inst{28-21} = 0b11010010; - let Inst{20-16} = Rm; - let Inst{15-12} = Cond; - let Inst{11} = 0b0; - let Inst{10} = o2; - let Inst{9-5} = Rn; - let Inst{4} = o3; - let Inst{3-0} = NZCVImm; +class AsmImmRange : AsmOperandClass { + let Name = "Imm" # Low # "_" # High; + let DiagnosticType = "InvalidImm" # Low # "_" # High; } -// Format for conditional select instructions. -class A64I_condsel op2, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - bits<4> Cond; +def Imm1_8Operand : AsmImmRange<1, 8>; +def Imm1_16Operand : AsmImmRange<1, 16>; +def Imm1_32Operand : AsmImmRange<1, 32>; +def Imm1_64Operand : AsmImmRange<1, 64>; - let Inst{31} = sf; - let Inst{30} = op; - let Inst{29} = s; - let Inst{28-21} = 0b11010100; - // Inherit Rm in 20-16 - let Inst{15-12} = Cond; - let Inst{11-10} = op2; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +def MovZSymbolG3AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG3"; + let RenderMethod = "addImmOperands"; } -// Format for data processing (1 source) instructions -class A64I_dp_1src opcode2, bits<6> opcode, - string asmstr, dag outs, dag ins, - list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31} = sf; - let Inst{30} = 0b1; - let Inst{29} = S; - let Inst{28-21} = 0b11010110; - let Inst{20-16} = opcode2; - let Inst{15-10} = opcode; -} - -// Format for data processing (2 source) instructions -class A64I_dp_2src opcode, bit S, - string asmstr, dag outs, dag ins, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = sf; - let Inst{30} = 0b0; - let Inst{29} = S; - let Inst{28-21} = 0b11010110; - let Inst{15-10} = opcode; +def movz_symbol_g3 : Operand { + let ParserMatchClass = MovZSymbolG3AsmOperand; } -// Format for data-processing (3 source) instructions +def MovZSymbolG2AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG2"; + let RenderMethod = "addImmOperands"; +} -class A64I_dp3 opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = sf; - let Inst{30-29} = opcode{5-4}; - let Inst{28-24} = 0b11011; - let Inst{23-21} = opcode{3-1}; - // Inherits Rm in 20-16 - let Inst{15} = opcode{0}; - // {14-10} mostly Ra, but unspecified for SMULH/UMULH - // Inherits Rn in 9-5 - // Inherits Rd in 4-0 -} - -// Format for exception generation instructions -class A64I_exception opc, bits<3> op2, bits<2> ll, - dag outs, dag ins, string asmstr, - list 
patterns, InstrItinClass itin> - : A64Inst { - bits<16> UImm16; +def movz_symbol_g2 : Operand { + let ParserMatchClass = MovZSymbolG2AsmOperand; +} - let Inst{31-24} = 0b11010100; - let Inst{23-21} = opc; - let Inst{20-5} = UImm16; - let Inst{4-2} = op2; - let Inst{1-0} = ll; +def MovZSymbolG1AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG1"; + let RenderMethod = "addImmOperands"; } -// Format for extract (immediate) instructions -class A64I_extract op, bit n, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - bits<6> LSB; +def movz_symbol_g1 : Operand { + let ParserMatchClass = MovZSymbolG1AsmOperand; +} - let Inst{31} = sf; - let Inst{30-29} = op{2-1}; - let Inst{28-23} = 0b100111; - let Inst{22} = n; - let Inst{21} = op{0}; - // Inherits Rm in bits 20-16 - let Inst{15-10} = LSB; - // Inherits Rn in 9-5 - // Inherits Rd in 4-0 +def MovZSymbolG0AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG0"; + let RenderMethod = "addImmOperands"; } -let Predicates = [HasFPARMv8] in { +def movz_symbol_g0 : Operand { + let ParserMatchClass = MovZSymbolG0AsmOperand; +} -// Format for floating-point compare instructions. -class A64I_fpcmp type, bits<2> op, bits<5> opcode2, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64Inst { - bits<5> Rn; - bits<5> Rm; +def MovKSymbolG3AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG3"; + let RenderMethod = "addImmOperands"; +} - let Inst{31} = m; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b1; - let Inst{20-16} = Rm; - let Inst{15-14} = op; - let Inst{13-10} = 0b1000; - let Inst{9-5} = Rn; - let Inst{4-0} = opcode2; +def movk_symbol_g3 : Operand { + let ParserMatchClass = MovKSymbolG3AsmOperand; } -// Format for floating-point conditional compare instructions. -class A64I_fpccmp type, bit op, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bits<5> Rn; - bits<5> Rm; - bits<4> NZCVImm; - bits<4> Cond; +def MovKSymbolG2AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG2"; + let RenderMethod = "addImmOperands"; +} - let Inst{31} = m; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b1; - let Inst{20-16} = Rm; - let Inst{15-12} = Cond; - let Inst{11-10} = 0b01; - let Inst{9-5} = Rn; - let Inst{4} = op; - let Inst{3-0} = NZCVImm; +def movk_symbol_g2 : Operand { + let ParserMatchClass = MovKSymbolG2AsmOperand; } -// Format for floating-point conditional select instructions. -class A64I_fpcondsel type, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - bits<4> Cond; +def MovKSymbolG1AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG1"; + let RenderMethod = "addImmOperands"; +} - let Inst{31} = m; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b1; - // Inherit Rm in 20-16 - let Inst{15-12} = Cond; - let Inst{11-10} = 0b11; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +def movk_symbol_g1 : Operand { + let ParserMatchClass = MovKSymbolG1AsmOperand; } +def MovKSymbolG0AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG0"; + let RenderMethod = "addImmOperands"; +} -// Format for floating-point data-processing (1 source) instructions. 
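[Aside: the movz/movk :g0:..:g3: operands above name the four 16-bit chunks of a 64-bit value for relocation purposes. A rough C++ illustration of how a constant decomposes into one MOVZ plus MOVKs over those chunks; this is a sketch, not the compiler's actual immediate materializer (which, for instance, also uses MOVN for mostly-ones values):]

    #include <cstdint>
    #include <cstdio>

    void printMovSequence(uint64_t v) {
      bool first = true;
      for (int g = 0; g < 4; ++g) {          // g matches the :gN: specifier
        uint16_t chunk = static_cast<uint16_t>(v >> (16 * g));
        if (chunk == 0)
          continue;                          // MOVZ already zeroed this chunk
        std::printf("%s x0, #0x%x, lsl #%d\n", first ? "movz" : "movk",
                    (unsigned)chunk, 16 * g);
        first = false;
      }
      if (first)
        std::printf("movz x0, #0\n");        // v == 0 still needs one insn
    }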
-class A64I_fpdp1 type, bits<6> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31} = m; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b1; - let Inst{20-15} = opcode; - let Inst{14-10} = 0b10000; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format for floating-point data-processing (2 sources) instructions. -class A64I_fpdp2 type, bits<4> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = m; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b1; - // Inherit Rm in 20-16 - let Inst{15-12} = opcode; - let Inst{11-10} = 0b10; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +def movk_symbol_g0 : Operand { + let ParserMatchClass = MovKSymbolG0AsmOperand; } -// Format for floating-point data-processing (3 sources) instructions. -class A64I_fpdp3 type, bit o1, bit o0, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - bits<5> Ra; +class fixedpoint_i32 + : Operand, + ComplexPattern", [fpimm, ld]> { + let EncoderMethod = "getFixedPointScaleOpValue"; + let DecoderMethod = "DecodeFixedPointScaleImm32"; + let ParserMatchClass = Imm1_32Operand; +} - let Inst{31} = m; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11111; - let Inst{23-22} = type; - let Inst{21} = o1; - // Inherit Rm in 20-16 - let Inst{15} = o0; - let Inst{14-10} = Ra; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +class fixedpoint_i64 + : Operand, + ComplexPattern", [fpimm, ld]> { + let EncoderMethod = "getFixedPointScaleOpValue"; + let DecoderMethod = "DecodeFixedPointScaleImm64"; + let ParserMatchClass = Imm1_64Operand; } -// Format for floating-point <-> fixed-point conversion instructions. 
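[Aside: the fixedpoint_i32/fixedpoint_i64 operands just defined carry the "#fbits" of the fixed-point conversions: FCVTZS/FCVTZU scale by 2^fbits before truncating toward zero, and SCVTF/UCVTF undo that. A behavioral C++ sketch, ignoring the saturation the hardware performs on overflow:]

    #include <cmath>
    #include <cstdint>

    int32_t toFixedQ(double x, unsigned fbits) {
      // fcvtzs w0, s0, #fbits  ~  trunc(x * 2^fbits)
      return static_cast<int32_t>(std::trunc(std::ldexp(x, fbits)));
    }

    double fromFixedQ(int32_t q, unsigned fbits) {
      // scvtf s0, w0, #fbits  ~  q / 2^fbits
      return std::ldexp(static_cast<double>(q), -fbits);
    }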
-class A64I_fpfixed type, bits<2> mode, bits<3> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bits<6> Scale; +def fixedpoint_f32_i32 : fixedpoint_i32; +def fixedpoint_f64_i32 : fixedpoint_i32; - let Inst{31} = sf; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b0; - let Inst{20-19} = mode; - let Inst{18-16} = opcode; - let Inst{15-10} = Scale; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +def fixedpoint_f32_i64 : fixedpoint_i64; +def fixedpoint_f64_i64 : fixedpoint_i64; + +def vecshiftR8 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); +}]> { + let EncoderMethod = "getVecShiftR8OpValue"; + let DecoderMethod = "DecodeVecShiftR8Imm"; + let ParserMatchClass = Imm1_8Operand; +} +def vecshiftR16 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); +}]> { + let EncoderMethod = "getVecShiftR16OpValue"; + let DecoderMethod = "DecodeVecShiftR16Imm"; + let ParserMatchClass = Imm1_16Operand; +} +def vecshiftR16Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); +}]> { + let EncoderMethod = "getVecShiftR16OpValue"; + let DecoderMethod = "DecodeVecShiftR16ImmNarrow"; + let ParserMatchClass = Imm1_8Operand; +} +def vecshiftR32 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); +}]> { + let EncoderMethod = "getVecShiftR32OpValue"; + let DecoderMethod = "DecodeVecShiftR32Imm"; + let ParserMatchClass = Imm1_32Operand; +} +def vecshiftR32Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); +}]> { + let EncoderMethod = "getVecShiftR32OpValue"; + let DecoderMethod = "DecodeVecShiftR32ImmNarrow"; + let ParserMatchClass = Imm1_16Operand; +} +def vecshiftR64 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 65); +}]> { + let EncoderMethod = "getVecShiftR64OpValue"; + let DecoderMethod = "DecodeVecShiftR64Imm"; + let ParserMatchClass = Imm1_64Operand; +} +def vecshiftR64Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); +}]> { + let EncoderMethod = "getVecShiftR64OpValue"; + let DecoderMethod = "DecodeVecShiftR64ImmNarrow"; + let ParserMatchClass = Imm1_32Operand; } -// Format for floating-point <-> integer conversion instructions. -class A64I_fpint type, bits<2> rmode, bits<3> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31} = sf; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0b000000; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +def Imm0_7Operand : AsmImmRange<0, 7>; +def Imm0_15Operand : AsmImmRange<0, 15>; +def Imm0_31Operand : AsmImmRange<0, 31>; +def Imm0_63Operand : AsmImmRange<0, 63>; + +def vecshiftL8 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL8OpValue"; + let DecoderMethod = "DecodeVecShiftL8Imm"; + let ParserMatchClass = Imm0_7Operand; +} +def vecshiftL16 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL16OpValue"; + let DecoderMethod = "DecodeVecShiftL16Imm"; + let ParserMatchClass = Imm0_15Operand; +} +def vecshiftL32 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL32OpValue"; + let DecoderMethod = "DecodeVecShiftL32Imm"; + let ParserMatchClass = Imm0_31Operand; +} +def vecshiftL64 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL64OpValue"; + let DecoderMethod = "DecodeVecShiftL64Imm"; + let ParserMatchClass = Imm0_63Operand; } -// Format for floating-point immediate instructions. 
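[Aside: the vecshiftR*/vecshiftL* ImmLeaf predicates above encode an asymmetry worth calling out: immediate right shifts are valid for 1..esize, immediate left shifts for 0..esize-1, and the Narrow variants use the halved destination element size (e.g. vecshiftR16Narrow accepts 1..8). A one-function C++ restatement:]

    bool isValidVecShift(unsigned esize, unsigned amount, bool isRight) {
      if (isRight)
        return amount >= 1 && amount <= esize; // e.g. USHR .8B by 1..8
      return amount < esize;                   // e.g. SHL  .8B by 0..7
    }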
-class A64I_fpimm type, bits<5> imm5, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRd { - bits<8> Imm8; +// Crazy immediate formats used by 32-bit and 64-bit logical immediate +// instructions for splatting repeating bit patterns across the immediate. +def logical_imm32_XFORM : SDNodeXFormgetZExtValue(), 32); + return CurDAG->getTargetConstant(enc, MVT::i32); +}]>; +def logical_imm64_XFORM : SDNodeXFormgetZExtValue(), 64); + return CurDAG->getTargetConstant(enc, MVT::i32); +}]>; + +def LogicalImm32Operand : AsmOperandClass { + let Name = "LogicalImm32"; + let DiagnosticType = "LogicalSecondSource"; +} +def LogicalImm64Operand : AsmOperandClass { + let Name = "LogicalImm64"; + let DiagnosticType = "LogicalSecondSource"; +} +def logical_imm32 : Operand, PatLeaf<(imm), [{ + return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 32); +}], logical_imm32_XFORM> { + let PrintMethod = "printLogicalImm32"; + let ParserMatchClass = LogicalImm32Operand; +} +def logical_imm64 : Operand, PatLeaf<(imm), [{ + return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 64); +}], logical_imm64_XFORM> { + let PrintMethod = "printLogicalImm64"; + let ParserMatchClass = LogicalImm64Operand; +} - let Inst{31} = m; - let Inst{30} = 0b0; - let Inst{29} = s; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0b1; - let Inst{20-13} = Imm8; - let Inst{12-10} = 0b100; - let Inst{9-5} = imm5; - // Inherit Rd in 4-0 +// imm0_65535 predicate - True if the immediate is in the range [0,65535]. +def Imm0_65535Operand : AsmImmRange<0, 65535>; +def imm0_65535 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_65535Operand; + let PrintMethod = "printHexImm"; } +// imm0_255 predicate - True if the immediate is in the range [0,255]. +def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; } +def imm0_255 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_255Operand; + let PrintMethod = "printHexImm"; } -// Format for load-register (literal) instructions. -class A64I_LDRlit opc, bit v, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRt { - bits<19> Imm19; +// imm0_127 predicate - True if the immediate is in the range [0,127] +def Imm0_127Operand : AsmImmRange<0, 127>; +def imm0_127 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_127Operand; + let PrintMethod = "printHexImm"; +} - let Inst{31-30} = opc; - let Inst{29-27} = 0b011; - let Inst{26} = v; - let Inst{25-24} = 0b00; - let Inst{23-5} = Imm19; - // Inherit Rt in 4-0 +// NOTE: These imm0_N operands have to be of type i64 because i64 is the size +// for all shift-amounts. + +// imm0_63 predicate - True if the immediate is in the range [0,63] +def imm0_63 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_63Operand; } -// Format for load-store exclusive instructions. 
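[Aside: the "crazy" logical immediates above are values built by repeating a 2/4/8/16/32/64-bit element that is some rotation of a contiguous run of ones. A self-contained C++ validity test in the spirit of AArch64_AM::isLogicalImmediate; the real helper also derives the N:immr:imms encoding, which is omitted here:]

    #include <cstdint>

    static bool isMask(uint64_t v) { return v != 0 && (v & (v + 1)) == 0; }
    // One contiguous run of ones somewhere in the word.
    static bool isShiftedMask(uint64_t v) { return v != 0 && isMask((v - 1) | v); }

    bool isLogicalImm(uint64_t v, unsigned regSize /* 32 or 64 */) {
      uint64_t regMask = (regSize == 64) ? ~0ULL : 0xffffffffULL;
      v &= regMask;
      if (v == 0 || v == regMask)
        return false;                // all-zeros/all-ones are not encodable
      // Halve the element size while both halves agree.
      unsigned size = regSize;
      while (size > 2) {
        unsigned half = size / 2;
        uint64_t mask = (1ULL << half) - 1;
        if (((v >> half) & mask) != (v & mask))
          break;
        size = half;
        v &= mask;
      }
      uint64_t eltMask = (size == 64) ? ~0ULL : ((1ULL << size) - 1);
      // A rotated run of ones is either itself contiguous, or wraps around,
      // in which case its complement within the element is contiguous.
      return isShiftedMask(v) || isShiftedMask(~v & eltMask);
    }

[For example, isLogicalImm(0x00ff00ff00ff00ffULL, 64) holds (16-bit element 0x00ff), as does isLogicalImm(0x80000001, 32) (a wrapped run), while isLogicalImm(0x123, 64) does not.]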
-class A64I_LDSTex_tn size, bit o2, bit L, bit o1, bit o0, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn { - let Inst{31-30} = size; - let Inst{29-24} = 0b001000; - let Inst{23} = o2; - let Inst{22} = L; - let Inst{21} = o1; - let Inst{15} = o0; +// imm0_31 predicate - True if the immediate is in the range [0,31] +def imm0_31 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_31Operand; } -class A64I_LDSTex_tt2n size, bit o2, bit L, bit o1, bit o0, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin>: - A64I_LDSTex_tn{ - bits<5> Rt2; - let Inst{14-10} = Rt2; +// imm0_15 predicate - True if the immediate is in the range [0,15] +def imm0_15 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_15Operand; } -class A64I_LDSTex_stn size, bit o2, bit L, bit o1, bit o0, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin>: - A64I_LDSTex_tn{ - bits<5> Rs; - let Inst{20-16} = Rs; +// imm0_7 predicate - True if the immediate is in the range [0,7] +def imm0_7 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_7Operand; } -class A64I_LDSTex_stt2n size, bit o2, bit L, bit o1, bit o0, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin>: - A64I_LDSTex_stn{ - bits<5> Rt2; - let Inst{14-10} = Rt2; +// An arithmetic shifter operand: +// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr +// {5-0} - imm6 +class arith_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = !cast( + "ArithmeticShifterOperand" # width); } -// Format for load-store register (immediate post-indexed) instructions -class A64I_LSpostind size, bit v, bits<2> opc, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn { - bits<9> SImm9; +def arith_shift32 : arith_shift; +def arith_shift64 : arith_shift; - let Inst{31-30} = size; - let Inst{29-27} = 0b111; - let Inst{26} = v; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0b0; - let Inst{20-12} = SImm9; - let Inst{11-10} = 0b01; - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 +class arith_shifted_reg + : Operand, + ComplexPattern { + let PrintMethod = "printShiftedRegister"; + let MIOperandInfo = (ops regclass, !cast("arith_shift" # width)); } -// Format for load-store register (immediate pre-indexed) instructions -class A64I_LSpreind size, bit v, bits<2> opc, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn { - bits<9> SImm9; - +def arith_shifted_reg32 : arith_shifted_reg; +def arith_shifted_reg64 : arith_shifted_reg; - let Inst{31-30} = size; - let Inst{29-27} = 0b111; - let Inst{26} = v; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0b0; - let Inst{20-12} = SImm9; - let Inst{11-10} = 0b11; - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 +// An arithmetic shifter operand: +// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror +// {5-0} - imm6 +class logical_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = !cast( + "LogicalShifterOperand" # width); } -// Format for load-store register (unprivileged) instructions -class A64I_LSunpriv size, bit v, bits<2> opc, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn { - bits<9> SImm9; +def logical_shift32 : logical_shift<32>; +def logical_shift64 : logical_shift<64>; - - let Inst{31-30} = size; - let Inst{29-27} = 0b111; - let Inst{26} = v; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0b0; - let Inst{20-12} = 
SImm9; - let Inst{11-10} = 0b10; - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 +class logical_shifted_reg + : Operand, + ComplexPattern { + let PrintMethod = "printShiftedRegister"; + let MIOperandInfo = (ops regclass, shiftop); } -// Format for load-store (unscaled immediate) instructions. -class A64I_LSunalimm size, bit v, bits<2> opc, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn { - bits<9> SImm9; +def logical_shifted_reg32 : logical_shifted_reg; +def logical_shifted_reg64 : logical_shifted_reg; - let Inst{31-30} = size; - let Inst{29-27} = 0b111; - let Inst{26} = v; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0b0; - let Inst{20-12} = SImm9; - let Inst{11-10} = 0b00; - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 +// A logical vector shifter operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0, #8, #16, or #24 +def logical_vec_shift : Operand { + let PrintMethod = "printShifter"; + let EncoderMethod = "getVecShifterOpValue"; + let ParserMatchClass = LogicalVecShifterOperand; } +// A logical vector half-word shifter operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0 or #8 +def logical_vec_hw_shift : Operand { + let PrintMethod = "printShifter"; + let EncoderMethod = "getVecShifterOpValue"; + let ParserMatchClass = LogicalVecHalfWordShifterOperand; +} -// Format for load-store (unsigned immediate) instructions. -class A64I_LSunsigimm size, bit v, bits<2> opc, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn { - bits<12> UImm12; +// A vector move shifter operand: +// {0} - imm1: #8 or #16 +def move_vec_shift : Operand { + let PrintMethod = "printShifter"; + let EncoderMethod = "getMoveVecShifterOpValue"; + let ParserMatchClass = MoveVecShifterOperand; +} - let Inst{31-30} = size; - let Inst{29-27} = 0b111; - let Inst{26} = v; - let Inst{25-24} = 0b01; - let Inst{23-22} = opc; - let Inst{21-10} = UImm12; +def AddSubImmOperand : AsmOperandClass { + let Name = "AddSubImm"; + let ParserMethod = "tryParseAddSubImm"; + let DiagnosticType = "AddSubSecondSource"; +} +// An ADD/SUB immediate shifter operand: +// second operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0 or #12 +class addsub_shifted_imm + : Operand, ComplexPattern { + let PrintMethod = "printAddSubImm"; + let EncoderMethod = "getAddSubImmOpValue"; + let ParserMatchClass = AddSubImmOperand; + let MIOperandInfo = (ops i32imm, i32imm); } -// Format for load-store register (register offset) instructions. -class A64I_LSregoff size, bit v, bits<2> opc, bit optionlo, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn { - bits<5> Rm; +def addsub_shifted_imm32 : addsub_shifted_imm; +def addsub_shifted_imm64 : addsub_shifted_imm; - // Complex operand selection needed for these instructions, so they - // need an "addr" field for encoding/decoding to be generated. 
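[Aside: the addsub_shifted_imm class above captures the ADD/SUB immediate form: a 12-bit unsigned value optionally shifted left by 12 ('00' => lsl #0, '01' => lsl #12). A one-liner C++ check for whether a constant fits; larger adjustments must be split or go through a register:]

    #include <cstdint>

    bool isAddSubImm(uint64_t v) {
      return (v & ~0xfffULL) == 0 || (v & ~(0xfffULL << 12)) == 0;
    }

[So #0xfff and #0x1000 are accepted while #0x1001 is not, which is why, e.g., large stack adjustments are emitted as instruction pairs.]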
- bits<3> Ext; - // OptionHi = Ext{2-1} - // S = Ext{0} +class neg_addsub_shifted_imm + : Operand, ComplexPattern { + let PrintMethod = "printAddSubImm"; + let EncoderMethod = "getAddSubImmOpValue"; + let ParserMatchClass = AddSubImmOperand; + let MIOperandInfo = (ops i32imm, i32imm); +} - let Inst{31-30} = size; - let Inst{29-27} = 0b111; - let Inst{26} = v; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0b1; - let Inst{20-16} = Rm; - let Inst{15-14} = Ext{2-1}; - let Inst{13} = optionlo; - let Inst{12} = Ext{0}; - let Inst{11-10} = 0b10; - // Inherits Rn in 9-5 - // Inherits Rt in 4-0 +def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm; +def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm; - let AddedComplexity = 50; +// An extend operand: +// {5-3} - extend type +// {2-0} - imm3 +def arith_extend : Operand { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperand; +} +def arith_extend64 : Operand { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperand64; } -// Format for Load-store register pair (offset) instructions -class A64I_LSPoffset opc, bit v, bit l, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtt2n { - bits<7> SImm7; - - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = v; - let Inst{25-23} = 0b010; - let Inst{22} = l; - let Inst{21-15} = SImm7; - // Inherit Rt2 in 14-10 - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 +// 'extend' that's a lsl of a 64-bit register. +def arith_extendlsl64 : Operand { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperandLSL64; } -// Format for Load-store register pair (post-indexed) instructions -class A64I_LSPpostind opc, bit v, bit l, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtt2n { - bits<7> SImm7; +class arith_extended_reg32 : Operand, + ComplexPattern { + let PrintMethod = "printExtendedRegister"; + let MIOperandInfo = (ops GPR32, arith_extend); +} - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = v; - let Inst{25-23} = 0b001; - let Inst{22} = l; - let Inst{21-15} = SImm7; - // Inherit Rt2 in 14-10 - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 +class arith_extended_reg32to64 : Operand, + ComplexPattern { + let PrintMethod = "printExtendedRegister"; + let MIOperandInfo = (ops GPR32, arith_extend64); } -// Format for Load-store register pair (pre-indexed) instructions -class A64I_LSPpreind opc, bit v, bit l, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtt2n { - bits<7> SImm7; +// Floating-point immediate. 
+def fpimm32 : Operand, + PatLeaf<(f32 fpimm), [{ + return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1; + }], SDNodeXFormgetValueAPF(); + uint32_t enc = AArch64_AM::getFP32Imm(InVal); + return CurDAG->getTargetConstant(enc, MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} +def fpimm64 : Operand, + PatLeaf<(f64 fpimm), [{ + return AArch64_AM::getFP64Imm(N->getValueAPF()) != -1; + }], SDNodeXFormgetValueAPF(); + uint32_t enc = AArch64_AM::getFP64Imm(InVal); + return CurDAG->getTargetConstant(enc, MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = v; - let Inst{25-23} = 0b011; - let Inst{22} = l; - let Inst{21-15} = SImm7; - // Inherit Rt2 in 14-10 - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 +def fpimm8 : Operand { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; } -// Format for Load-store non-temporal register pair (offset) instructions -class A64I_LSPnontemp opc, bit v, bit l, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtt2n { - bits<7> SImm7; +def fpimm0 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = v; - let Inst{25-23} = 0b000; - let Inst{22} = l; - let Inst{21-15} = SImm7; - // Inherit Rt2 in 14-10 - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 -} - -// Format for Logical (immediate) instructions -class A64I_logicalimm opc, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bit N; - bits<6> ImmR; - bits<6> ImmS; - - // N, ImmR and ImmS have no separate existence in any assembly syntax (or for - // selection), so we'll combine them into a single field here. 
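[Aside: the fpimm32/fpimm64 leaves above accept exactly the FMOV-encodable 8-bit floating-point immediates: +/- (16+frac)/16 * 2^exp with frac in [0,15] and exp in [-3,4]; getFP32Imm/getFP64Imm return -1 for everything else. A C++ decoder for the imm8 layout (sign : b6 : 2 exponent bits : 4 fraction bits), with an illustrative name:]

    #include <cmath>
    #include <cstdint>

    double decodeFPImm8(uint8_t imm8) {
      int sign = (imm8 >> 7) & 1;
      int b6   = (imm8 >> 6) & 1;       // selects the exponent half-range
      int hi   = (imm8 >> 4) & 3;
      int frac = imm8 & 0xf;
      int exp  = b6 ? hi - 3 : hi + 1;  // -3..0 when b6==1, 1..4 when b6==0
      double v = std::ldexp((16.0 + frac) / 16.0, exp);
      return sign ? -v : v;
    }
    // e.g. decodeFPImm8(0x70) == 1.0 and decodeFPImm8(0x00) == 2.0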
- bits<13> Imm; - // N = Imm{12}; - // ImmR = Imm{11-6}; - // ImmS = Imm{5-0}; +// Vector lane operands +class AsmVectorIndex : AsmOperandClass { + let Name = "VectorIndex" # Suffix; + let DiagnosticType = "InvalidIndex" # Suffix; +} +def VectorIndex1Operand : AsmVectorIndex<"1">; +def VectorIndexBOperand : AsmVectorIndex<"B">; +def VectorIndexHOperand : AsmVectorIndex<"H">; +def VectorIndexSOperand : AsmVectorIndex<"S">; +def VectorIndexDOperand : AsmVectorIndex<"D">; + +def VectorIndex1 : Operand, ImmLeaf { + let ParserMatchClass = VectorIndex1Operand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexB : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexBOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexH : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexHOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexS : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexSOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexD : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexDOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} - let Inst{31} = sf; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100100; - let Inst{22} = Imm{12}; - let Inst{21-16} = Imm{11-6}; - let Inst{15-10} = Imm{5-0}; - // Rn inherited in 9-5 - // Rd inherited in 4-0 +// 8-bit immediate for AdvSIMD where 64-bit values of the form: +// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh +// are encoded as the eight bit value 'abcdefgh'. +def simdimmtype10 : Operand, + PatLeaf<(f64 fpimm), [{ + return AArch64_AM::isAdvSIMDModImmType10(N->getValueAPF() + .bitcastToAPInt() + .getZExtValue()); + }], SDNodeXFormgetValueAPF(); + uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF() + .bitcastToAPInt() + .getZExtValue()); + return CurDAG->getTargetConstant(enc, MVT::i32); + }]>> { + let ParserMatchClass = SIMDImmType10Operand; + let PrintMethod = "printSIMDType10Operand"; } -// Format for Logical (shifted register) instructions -class A64I_logicalshift opc, bits<2> shift, bit N, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - bits<6> Imm6; - let Inst{31} = sf; - let Inst{30-29} = opc; - let Inst{28-24} = 0b01010; - let Inst{23-22} = shift; - let Inst{21} = N; - // Rm inherited - let Inst{15-10} = Imm6; - // Rn inherited - // Rd inherited -} - -// Format for Move wide (immediate) -class A64I_movw opc, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRd { - bits<16> UImm16; - bits<2> Shift; // Called "hw" officially +//--- +// System management +//--- - let Inst{31} = sf; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100101; - let Inst{22-21} = Shift; - let Inst{20-5} = UImm16; - // Inherits Rd in 4-0 +// Base encoding for system instruction operands. +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class BaseSystemI + : I { + let Inst{31-22} = 0b1101010100; + let Inst{21} = L; } -// Format for PC-relative addressing instructions, ADR and ADRP. -class A64I_PCADR patterns, InstrItinClass itin> - : A64InstRd { - bits<21> Label; - - let Inst{31} = op; - let Inst{30-29} = Label{1-0}; - let Inst{28-24} = 0b10000; - let Inst{23-5} = Label{20-2}; +// System instructions which do not have an Rt register. 
+class SimpleSystemI + : BaseSystemI { + let Inst{4-0} = 0b11111; } -// Format for system instructions -class A64I_system patterns, InstrItinClass itin> - : A64Inst { - bits<2> Op0; - bits<3> Op1; - bits<4> CRn; - bits<4> CRm; - bits<3> Op2; +// System instructions which have an Rt register. +class RtSystemI + : BaseSystemI, + Sched<[WriteSys]> { bits<5> Rt; + let Inst{4-0} = Rt; +} - let Inst{31-22} = 0b1101010100; - let Inst{21} = l; - let Inst{20-19} = Op0; - let Inst{18-16} = Op1; - let Inst{15-12} = CRn; +// Hint instructions that take both a CRm and a 3-bit immediate. +class HintI + : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">, + Sched<[WriteHint]> { + bits <7> imm; + let Inst{20-12} = 0b000110010; + let Inst{11-5} = imm; +} + +// System instructions taking a single literal operand which encodes into +// CRm. op2 differentiates the opcodes. +def BarrierAsmOperand : AsmOperandClass { + let Name = "Barrier"; + let ParserMethod = "tryParseBarrierOperand"; +} +def barrier_op : Operand { + let PrintMethod = "printBarrierOption"; + let ParserMatchClass = BarrierAsmOperand; +} +class CRmSystemI opc, string asm> + : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">, + Sched<[WriteBarrier]> { + bits<4> CRm; + let Inst{20-12} = 0b000110011; let Inst{11-8} = CRm; - let Inst{7-5} = Op2; - let Inst{4-0} = Rt; + let Inst{7-5} = opc; +} - // These instructions can do horrible things. - let hasSideEffects = 1; +// MRS/MSR system instructions. These have different operand classes because +// a different subset of registers can be accessed through each instruction. +def MRSSystemRegisterOperand : AsmOperandClass { + let Name = "MRSSystemRegister"; + let ParserMethod = "tryParseSysReg"; + let DiagnosticType = "MRS"; +} +// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate. +def mrs_sysreg_op : Operand { + let ParserMatchClass = MRSSystemRegisterOperand; + let DecoderMethod = "DecodeMRSSystemRegister"; + let PrintMethod = "printMRSSystemRegister"; } -// Format for unconditional branch (immediate) instructions -class A64I_Bimm patterns, InstrItinClass itin> - : A64Inst { - // Doubly special in not even sharing register fields with other - // instructions, so we create our own Rn here. - bits<26> Label; +def MSRSystemRegisterOperand : AsmOperandClass { + let Name = "MSRSystemRegister"; + let ParserMethod = "tryParseSysReg"; + let DiagnosticType = "MSR"; +} +def msr_sysreg_op : Operand { + let ParserMatchClass = MSRSystemRegisterOperand; + let DecoderMethod = "DecodeMSRSystemRegister"; + let PrintMethod = "printMSRSystemRegister"; +} - let Inst{31} = op; - let Inst{30-26} = 0b00101; - let Inst{25-0} = Label; +class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), + "mrs", "\t$Rt, $systemreg"> { + bits<15> systemreg; + let Inst{20} = 1; + let Inst{19-5} = systemreg; } -// Format for Test & branch (immediate) instructions -class A64I_TBimm patterns, InstrItinClass itin> - : A64InstRt { - // Doubly special in not even sharing register fields with other - // instructions, so we create our own Rn here. - bits<6> Imm; - bits<14> Label; +// FIXME: Some of these def NZCV, others don't. Best way to model that? +// Explicitly modeling each of the system register as a register class +// would do it, but feels like overkill at this point. 
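[Aside: HintI above packs a 7-bit CRm:op2 payload (hence the imm0_127 operand); the architected aliases sit at fixed points in that space and unallocated values execute as NOPs. A small C++ mapping for the common ones:]

    #include <string>

    std::string hintName(unsigned imm7) {
      switch (imm7) {
      case 0:  return "nop";
      case 1:  return "yield";
      case 2:  return "wfe";
      case 3:  return "wfi";
      case 4:  return "sev";
      case 5:  return "sevl";
      default: return "hint #" + std::to_string(imm7);
      }
    }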
+class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), + "msr", "\t$systemreg, $Rt"> { + bits<15> systemreg; + let Inst{20} = 1; + let Inst{19-5} = systemreg; +} - let Inst{31} = Imm{5}; - let Inst{30-25} = 0b011011; - let Inst{24} = op; - let Inst{23-19} = Imm{4-0}; - let Inst{18-5} = Label; - // Inherit Rt in 4-0 +def SystemPStateFieldOperand : AsmOperandClass { + let Name = "SystemPStateField"; + let ParserMethod = "tryParseSysReg"; +} +def pstatefield_op : Operand { + let ParserMatchClass = SystemPStateFieldOperand; + let PrintMethod = "printSystemPStateField"; +} + +let Defs = [NZCV] in +class MSRpstateI + : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm), + "msr", "\t$pstate_field, $imm">, + Sched<[WriteSys]> { + bits<6> pstatefield; + bits<4> imm; + let Inst{20-19} = 0b00; + let Inst{18-16} = pstatefield{5-3}; + let Inst{15-12} = 0b0100; + let Inst{11-8} = imm; + let Inst{7-5} = pstatefield{2-0}; + + let DecoderMethod = "DecodeSystemPStateInstruction"; +} + +// SYS and SYSL generic system instructions. +def SysCRAsmOperand : AsmOperandClass { + let Name = "SysCR"; + let ParserMethod = "tryParseSysCROperand"; +} + +def sys_cr_op : Operand { + let PrintMethod = "printSysCROperand"; + let ParserMatchClass = SysCRAsmOperand; +} + +class SystemXtI + : RtSystemI { + bits<3> op1; + bits<4> Cn; + bits<4> Cm; + bits<3> op2; + let Inst{20-19} = 0b01; + let Inst{18-16} = op1; + let Inst{15-12} = Cn; + let Inst{11-8} = Cm; + let Inst{7-5} = op2; +} + +class SystemLXtI + : RtSystemI { + bits<3> op1; + bits<4> Cn; + bits<4> Cm; + bits<3> op2; + let Inst{20-19} = 0b01; + let Inst{18-16} = op1; + let Inst{15-12} = Cn; + let Inst{11-8} = Cm; + let Inst{7-5} = op2; } -// Format for Unconditional branch (register) instructions, including -// RET. Shares no fields with instructions further up the hierarchy -// so top-level. -class A64I_Breg opc, bits<5> op2, bits<6> op3, bits<5> op4, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64Inst { - // Doubly special in not even sharing register fields with other - // instructions, so we create our own Rn here. - bits<5> Rn; +// Branch (register) instructions: +// +// case opc of +// 0001 blr +// 0000 br +// 0101 dret +// 0100 eret +// 0010 ret +// otherwise UNDEFINED +class BaseBranchReg opc, dag oops, dag iops, string asm, + string operands, list pattern> + : I, Sched<[WriteBrReg]> { let Inst{31-25} = 0b1101011; let Inst{24-21} = opc; - let Inst{20-16} = op2; - let Inst{15-10} = op3; - let Inst{9-5} = Rn; - let Inst{4-0} = op4; + let Inst{20-16} = 0b11111; + let Inst{15-10} = 0b000000; + let Inst{4-0} = 0b00000; } +class BranchReg opc, string asm, list pattern> + : BaseBranchReg { + bits<5> Rn; + let Inst{9-5} = Rn; +} -//===----------------------------------------------------------------------===// -// -// Neon Instruction Format Definitions. -// +let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in +class SpecialReturn opc, string asm> + : BaseBranchReg { + let Inst{9-5} = 0b11111; +} -let Predicates = [HasNEON] in { +//--- +// Conditional branch instruction. +//--- -class NeonInstAlias - : InstAlias { +// Condition code. +// 4-bit immediate. 
Pretty-printed as +def ccode : Operand { + let PrintMethod = "printCondCode"; + let ParserMatchClass = CondCode; +} +def inv_ccode : Operand { + let PrintMethod = "printInverseCondCode"; + let ParserMatchClass = CondCode; } -// Format AdvSIMD bitwise extract -class NeonI_BitExtract op2, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29-24} = 0b101110; - let Inst{23-22} = op2; - let Inst{21} = 0b0; - // Inherit Rm in 20-16 - let Inst{15} = 0b0; - // imm4 in 14-11 - let Inst{10} = 0b0; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD perm -class NeonI_Perm size, bits<3> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29-24} = 0b001110; - let Inst{23-22} = size; - let Inst{21} = 0b0; - // Inherit Rm in 20-16 - let Inst{15} = 0b0; - let Inst{14-12} = opcode; - let Inst{11-10} = 0b10; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +// Conditional branch target. 19-bit immediate. The low two bits of the target +// offset are implied zero and so are not part of the immediate. +def PCRelLabel19Operand : AsmOperandClass { + let Name = "PCRelLabel19"; + let DiagnosticType = "InvalidLabel"; +} +def am_brcond : Operand { + let EncoderMethod = "getCondBranchTargetOpValue"; + let DecoderMethod = "DecodePCRelLabel19"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = PCRelLabel19Operand; } -// Format AdvSIMD table lookup -class NeonI_TBL op2, bits<2> len, bit op, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29-24} = 0b001110; - let Inst{23-22} = op2; - let Inst{21} = 0b0; - // Inherit Rm in 20-16 - let Inst{15} = 0b0; - let Inst{14-13} = len; - let Inst{12} = op; - let Inst{11-10} = 0b00; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD 3 vector registers with same vector type -class NeonI_3VSame size, bits<5> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = u; +class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target), + "b", ".$cond\t$target", "", + [(AArch64brcond bb:$target, imm:$cond, NZCV)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + let Uses = [NZCV]; + + bits<4> cond; + bits<19> target; + let Inst{31-24} = 0b01010100; + let Inst{23-5} = target; + let Inst{4} = 0; + let Inst{3-0} = cond; +} + +//--- +// Compare-and-branch instructions. +//--- +class BaseCmpBranch + : I<(outs), (ins regtype:$Rt, am_brcond:$target), + asm, "\t$Rt, $target", "", + [(node regtype:$Rt, bb:$target)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + + bits<5> Rt; + bits<19> target; + let Inst{30-25} = 0b011010; + let Inst{24} = op; + let Inst{23-5} = target; + let Inst{4-0} = Rt; +} + +multiclass CmpBranch { + def W : BaseCmpBranch { + let Inst{31} = 0; + } + def X : BaseCmpBranch { + let Inst{31} = 1; + } +} + +//--- +// Test-bit-and-branch instructions. +//--- +// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of +// the target offset are implied zero and so are not part of the immediate. 
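[Aside: the conditional-branch target above stores (target - pc) >> 2 in a 19-bit signed field, giving +/-1MiB reach at 4-byte granularity; by the same construction TBZ/TBNZ get 14 bits (+/-32KiB) and B/BL get 26 (+/-128MiB). A C++ sketch of the 19-bit case, with a hypothetical helper name:]

    #include <cstdint>
    #include <optional>

    std::optional<uint32_t> encodeBranch19(int64_t byteOffset) {
      if (byteOffset & 3)
        return std::nullopt;                    // must be 4-byte aligned
      int64_t imm = byteOffset >> 2;
      if (imm < -(1 << 18) || imm >= (1 << 18))
        return std::nullopt;                    // outside 19-bit signed range
      return static_cast<uint32_t>(imm & 0x7ffff);
    }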
+def BranchTarget14Operand : AsmOperandClass { + let Name = "BranchTarget14"; +} +def am_tbrcond : Operand { + let EncoderMethod = "getTestBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget14Operand; +} + +// AsmOperand classes to emit (or not) special diagnostics +def TBZImm0_31Operand : AsmOperandClass { + let Name = "TBZImm0_31"; + let PredicateMethod = "isImm0_31"; + let RenderMethod = "addImm0_31Operands"; +} +def TBZImm32_63Operand : AsmOperandClass { + let Name = "Imm32_63"; + let DiagnosticType = "InvalidImm0_63"; +} + +class tbz_imm0_31 : Operand, ImmLeaf { + let ParserMatchClass = matcher; +} + +def tbz_imm0_31_diag : tbz_imm0_31; +def tbz_imm0_31_nodiag : tbz_imm0_31; + +def tbz_imm32_63 : Operand, ImmLeaf 31) && (((uint32_t)Imm) < 64); +}]> { + let ParserMatchClass = TBZImm32_63Operand; +} + +class BaseTestBranch + : I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target), + asm, "\t$Rt, $bit_off, $target", "", + [(node regtype:$Rt, immtype:$bit_off, bb:$target)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + + bits<5> Rt; + bits<6> bit_off; + bits<14> target; + + let Inst{30-25} = 0b011011; + let Inst{24} = op; + let Inst{23-19} = bit_off{4-0}; + let Inst{18-5} = target; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeTestAndBranch"; +} + +multiclass TestBranch { + def W : BaseTestBranch { + let Inst{31} = 0; + } + + def X : BaseTestBranch { + let Inst{31} = 1; + } + + // Alias X-reg with 0-31 imm to W-Reg. + def : InstAlias(NAME#"W") GPR32as64:$Rd, + tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>; + def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target), + (!cast(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32), + tbz_imm0_31_diag:$imm, bb:$target)>; +} + +//--- +// Unconditional branch (immediate) instructions. +//--- +def BranchTarget26Operand : AsmOperandClass { + let Name = "BranchTarget26"; + let DiagnosticType = "InvalidLabel"; +} +def am_b_target : Operand { + let EncoderMethod = "getBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget26Operand; +} +def am_bl_target : Operand { + let EncoderMethod = "getBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget26Operand; +} + +class BImm pattern> + : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> { + bits<26> addr; + let Inst{31} = op; + let Inst{30-26} = 0b00101; + let Inst{25-0} = addr; + + let DecoderMethod = "DecodeUnconditionalBranch"; +} + +class BranchImm pattern> + : BImm; +class CallImm pattern> + : BImm; + +//--- +// Basic one-operand data processing instructions. 
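[Aside: the TestBranch multiclass above splits the 6-bit test-bit number across two encodings: the W form covers bits 0-31 (Inst{31}=0) and the X form bits 32-63 (Inst{31}=1, i.e. bit 5 of the bit number), with the InstAlias/Pat retargeting "tbz xN, #<0-31>, label" onto the W encoding. A C++ restatement of that form selection:]

    #include <cstdio>

    void pickTbzForm(unsigned bitNum /* 0..63 */) {
      if (bitNum < 32)
        std::printf("W form: Inst{31}=0, Inst{23-19}=%u\n", bitNum);
      else
        std::printf("X form: Inst{31}=1, Inst{23-19}=%u\n", bitNum & 31);
    }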
+//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseOneOperandData opc, RegisterClass regtype, string asm, + SDPatternOperator node> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", + [(set regtype:$Rd, (node regtype:$Rn))]>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + + let Inst{30-13} = 0b101101011000000000; + let Inst{12-10} = opc; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass OneOperandData opc, string asm, + SDPatternOperator node = null_frag> { + def Wr : BaseOneOperandData { + let Inst{31} = 0; + } + + def Xr : BaseOneOperandData { + let Inst{31} = 1; + } +} + +class OneWRegData opc, string asm, SDPatternOperator node> + : BaseOneOperandData { + let Inst{31} = 0; +} + +class OneXRegData opc, string asm, SDPatternOperator node> + : BaseOneOperandData { + let Inst{31} = 1; +} + +//--- +// Basic two-operand data processing instructions. +//--- +class BaseBaseAddSubCarry pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{30} = isSub; + let Inst{28-21} = 0b11010000; + let Inst{20-16} = Rm; + let Inst{15-10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseAddSubCarry + : BaseBaseAddSubCarry; + +class BaseAddSubCarrySetFlags + : BaseBaseAddSubCarry { + let Defs = [NZCV]; +} + +multiclass AddSubCarry { + def Wr : BaseAddSubCarry { + let Inst{31} = 0; + let Inst{29} = 0; + } + def Xr : BaseAddSubCarry { + let Inst{31} = 1; + let Inst{29} = 0; + } + + // Sets flags. + def SWr : BaseAddSubCarrySetFlags { + let Inst{31} = 0; + let Inst{29} = 1; + } + def SXr : BaseAddSubCarrySetFlags { + let Inst{31} = 1; + let Inst{29} = 1; + } +} + +class BaseTwoOperand opc, RegisterClass regtype, string asm, + SDPatternOperator OpNode> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{30-21} = 0b0011010110; + let Inst{20-16} = Rm; + let Inst{15-14} = 0b00; + let Inst{13-10} = opc; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseDiv + : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> { + let Inst{10} = isSigned; +} + +multiclass Div { + def Wr : BaseDiv, + Sched<[WriteID32, ReadID, ReadID]> { + let Inst{31} = 0; + } + def Xr : BaseDiv, + Sched<[WriteID64, ReadID, ReadID]> { + let Inst{31} = 1; + } +} + +class BaseShift shift_type, RegisterClass regtype, string asm, + SDPatternOperator OpNode = null_frag> + : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>, + Sched<[WriteIS, ReadI]> { + let Inst{11-10} = shift_type; +} + +multiclass Shift shift_type, string asm, SDNode OpNode> { + def Wr : BaseShift { + let Inst{31} = 0; + } + + def Xr : BaseShift { + let Inst{31} = 1; + } + + def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)), + (!cast(NAME # "Wr") GPR32:$Rn, + (EXTRACT_SUBREG i64:$Rm, sub_32))>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))), + (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))), + (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))), + (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; +} + +class ShiftAlias + : InstAlias; + +class BaseMulAccum opc, RegisterClass multype, + RegisterClass addtype, string asm, + list 
pattern> + : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra), + asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<5> Ra; + let Inst{30-24} = 0b0011011; + let Inst{23-21} = opc; + let Inst{20-16} = Rm; + let Inst{15} = isSub; + let Inst{14-10} = Ra; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass MulAccum { + def Wrrr : BaseMulAccum, + Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> { + let Inst{31} = 0; + } + + def Xrrr : BaseMulAccum, + Sched<[WriteIM64, ReadIMA, ReadIM, ReadIM]> { + let Inst{31} = 1; + } +} + +class WideMulAccum opc, string asm, + SDNode AccNode, SDNode ExtNode> + : BaseMulAccum, + Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> { + let Inst{31} = 1; +} + +class MulHi opc, string asm, SDNode OpNode> + : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>, + Sched<[WriteIM64, ReadIM, ReadIM]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-24} = 0b10011011; + let Inst{23-21} = opc; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 + // (i.e. all bits 1) but is ignored by the processor. + let PostEncoderMethod = "fixMulHigh"; +} + +class MulAccumWAlias + : InstAlias; +class MulAccumXAlias + : InstAlias; +class WideMulAccumAlias + : InstAlias; + +class BaseCRC32 sz, bit C, RegisterClass StreamReg, + SDPatternOperator OpNode, string asm> + : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + + let Inst{31} = sf; + let Inst{30-21} = 0b0011010110; + let Inst{20-16} = Rm; + let Inst{15-13} = 0b010; + let Inst{12} = C; + let Inst{11-10} = sz; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + let Predicates = [HasCRC]; +} + +//--- +// Address generation. +//--- + +class ADRI pattern> + : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "", + pattern>, + Sched<[WriteI]> { + bits<5> Xd; + bits<21> label; + let Inst{31} = page; + let Inst{30-29} = label{1-0}; + let Inst{28-24} = 0b10000; + let Inst{23-5} = label{20-2}; + let Inst{4-0} = Xd; + + let DecoderMethod = "DecodeAdrInstruction"; +} + +//--- +// Move immediate. 
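[Aside: MulHi above models SMULH/UMULH, which return the high 64 bits of a 128-bit product (their unused Ra field is forced to all-ones by the fixMulHigh post-encoder noted in the comment). A two-line C++ equivalent, assuming the GCC/Clang __int128 extension:]

    #include <cstdint>

    uint64_t umulh(uint64_t a, uint64_t b) {
      return static_cast<uint64_t>(
          (static_cast<unsigned __int128>(a) * b) >> 64);
    }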
+//--- + +def movimm32_imm : Operand { + let ParserMatchClass = Imm0_65535Operand; + let EncoderMethod = "getMoveWideImmOpValue"; + let PrintMethod = "printHexImm"; +} +def movimm32_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = MovImm32ShifterOperand; +} +def movimm64_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = MovImm64ShifterOperand; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseMoveImmediate opc, RegisterClass regtype, Operand shifter, + string asm> + : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift), + asm, "\t$Rd, $imm$shift", "", []>, + Sched<[WriteImm]> { + bits<5> Rd; + bits<16> imm; + bits<6> shift; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100101; + let Inst{22-21} = shift{5-4}; + let Inst{20-5} = imm; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeMoveImmInstruction"; +} + +multiclass MoveImmediate opc, string asm> { + def Wi : BaseMoveImmediate { + let Inst{31} = 0; + } + + def Xi : BaseMoveImmediate { + let Inst{31} = 1; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseInsertImmediate opc, RegisterClass regtype, Operand shifter, + string asm> + : I<(outs regtype:$Rd), + (ins regtype:$src, movimm32_imm:$imm, shifter:$shift), + asm, "\t$Rd, $imm$shift", "$src = $Rd", []>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<16> imm; + bits<6> shift; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100101; + let Inst{22-21} = shift{5-4}; + let Inst{20-5} = imm; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeMoveImmInstruction"; +} + +multiclass InsertImmediate opc, string asm> { + def Wi : BaseInsertImmediate { + let Inst{31} = 0; + } + + def Xi : BaseInsertImmediate { + let Inst{31} = 1; + } +} + +//--- +// Add/Subtract +//--- + +class BaseAddSubImm + : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "", + [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<14> imm; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b10001; + let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12 + let Inst{21-10} = imm{11-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + let DecoderMethod = "DecodeBaseAddSubImm"; +} + +class BaseAddSubRegPseudo + : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteI, ReadI, ReadI]>; + +class BaseAddSubSReg + : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + // The operands are in order to match the 'addr' MI operands, so we + // don't need an encoder method and by-name matching. Just use the default + // in-order handling. Since we're using by-order, make sure the names + // do not match. 
+ bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<8> shift; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-22} = shift{7-6}; + let Inst{21} = 0; + let Inst{20-16} = src2; + let Inst{15-10} = shift{5-0}; + let Inst{9-5} = src1; + let Inst{4-0} = dst; + + let DecoderMethod = "DecodeThreeAddrSRegInstruction"; +} + +class BaseAddSubEReg + : I<(outs dstRegtype:$R1), + (ins src1Regtype:$R2, src2Regtype:$R3), + asm, "\t$R1, $R2, $R3", "", + [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>, + Sched<[WriteIEReg, ReadI, ReadIEReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> ext; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-21} = 0b001; + let Inst{20-16} = Rm; + let Inst{15-13} = ext{5-3}; + let Inst{12-10} = ext{2-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeAddSubERegInstruction"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseAddSubEReg64 + : I<(outs dstRegtype:$Rd), + (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext), + asm, "\t$Rd, $Rn, $Rm$ext", "", []>, + Sched<[WriteIEReg, ReadI, ReadIEReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> ext; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-21} = 0b001; + let Inst{20-16} = Rm; + let Inst{15} = ext{5}; + let Inst{12-10} = ext{2-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeAddSubERegInstruction"; +} + +// Aliases for register+register add/subtract. +class AddSubRegAlias + : InstAlias; + +multiclass AddSub { + let hasSideEffects = 0 in { + // Add/Subtract immediate + def Wri : BaseAddSubImm { + let Inst{31} = 0; + } + def Xri : BaseAddSubImm { + let Inst{31} = 1; + } + + // Add/Subtract register - Only used for CodeGen + def Wrr : BaseAddSubRegPseudo; + def Xrr : BaseAddSubRegPseudo; + + // Add/Subtract shifted register + def Wrs : BaseAddSubSReg { + let Inst{31} = 0; + } + def Xrs : BaseAddSubSReg { + let Inst{31} = 1; + } + } + + // Add/Subtract extended register + let AddedComplexity = 1, hasSideEffects = 0 in { + def Wrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 0; + } + def Xrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 1; + } + } + + def Xrx64 : BaseAddSubEReg64 { + // UXTX and SXTX only. + let Inst{14-13} = 0b11; + let Inst{31} = 1; + } + + // Register/register aliases with no shift when SP is not used. + def : AddSubRegAlias(NAME#"Wrs"), + GPR32, GPR32, GPR32, 0>; + def : AddSubRegAlias(NAME#"Xrs"), + GPR64, GPR64, GPR64, 0>; + + // Register/register aliases with no shift when either the destination or + // first source register is SP. 
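+  // (e.g. "add sp, x0, x1" cannot be encoded with the shifted-register
+  // form, which does not accept SP, so the aliases below map it onto the
+  // extended-register form with an implicit "uxtx #0".)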
+ def : AddSubRegAlias(NAME#"Wrx"), + GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias(NAME#"Wrx"), + GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias(NAME#"Xrx64"), + GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0 + def : AddSubRegAlias(NAME#"Xrx64"), + GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0 +} + +multiclass AddSubS { + let isCompare = 1, Defs = [NZCV] in { + // Add/Subtract immediate + def Wri : BaseAddSubImm { + let Inst{31} = 0; + } + def Xri : BaseAddSubImm { + let Inst{31} = 1; + } + + // Add/Subtract register + def Wrr : BaseAddSubRegPseudo; + def Xrr : BaseAddSubRegPseudo; + + // Add/Subtract shifted register + def Wrs : BaseAddSubSReg { + let Inst{31} = 0; + } + def Xrs : BaseAddSubSReg { + let Inst{31} = 1; + } + + // Add/Subtract extended register + let AddedComplexity = 1 in { + def Wrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 0; + } + def Xrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 1; + } + } + + def Xrx64 : BaseAddSubEReg64 { + // UXTX and SXTX only. + let Inst{14-13} = 0b11; + let Inst{31} = 1; + } + } // Defs = [NZCV] + + // Compare aliases + def : InstAlias(NAME#"Wri") + WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>; + def : InstAlias(NAME#"Xri") + XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>; + def : InstAlias(NAME#"Wrx") + WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; + def : InstAlias(NAME#"Xrx") + XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; + def : InstAlias(NAME#"Xrx64") + XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>; + def : InstAlias(NAME#"Wrs") + WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>; + def : InstAlias(NAME#"Xrs") + XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; + + // Compare shorthands + def : InstAlias(NAME#"Wrs") + WZR, GPR32:$src1, GPR32:$src2, 0), 5>; + def : InstAlias(NAME#"Xrs") + XZR, GPR64:$src1, GPR64:$src2, 0), 5>; + + // Register/register aliases with no shift when SP is not used. + def : AddSubRegAlias(NAME#"Wrs"), + GPR32, GPR32, GPR32, 0>; + def : AddSubRegAlias(NAME#"Xrs"), + GPR64, GPR64, GPR64, 0>; + + // Register/register aliases with no shift when the first source register + // is SP. + def : AddSubRegAlias(NAME#"Wrx"), + GPR32, GPR32sponly, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias(NAME#"Xrx64"), + GPR64, GPR64sponly, GPR64, 24>; // UXTX #0 +} + +//--- +// Extract +//--- +def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisPtrTy<3>]>; +def AArch64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>; + +class BaseExtractImm patterns> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm), + asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>, + Sched<[WriteExtr, ReadExtrHi]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> imm; + + let Inst{30-23} = 0b00100111; + let Inst{21} = 0; + let Inst{20-16} = Rm; + let Inst{15-10} = imm; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass ExtractImm { + def Wrri : BaseExtractImm { + let Inst{31} = 0; + let Inst{22} = 0; + // imm<5> must be zero. 
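+    // (The W form of EXTR extracts from a concatenation of two 32-bit
+    // registers, so the LSB index $imm must lie in [0,31].)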
+ let imm{5} = 0; + } + def Xrri : BaseExtractImm { + + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +//--- +// Bitfield +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseBitfieldImm opc, + RegisterClass regtype, Operand imm_type, string asm> + : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms), + asm, "\t$Rd, $Rn, $immr, $imms", "", []>, + Sched<[WriteIS, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<6> immr; + bits<6> imms; + + let Inst{30-29} = opc; + let Inst{28-23} = 0b100110; + let Inst{21-16} = immr; + let Inst{15-10} = imms; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass BitfieldImm opc, string asm> { + def Wri : BaseBitfieldImm { + let Inst{31} = 0; + let Inst{22} = 0; + // imms<5> and immr<5> must be zero, else ReservedValue(). + let Inst{21} = 0; + let Inst{15} = 0; + } + def Xri : BaseBitfieldImm { + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseBitfieldImmWith2RegArgs opc, + RegisterClass regtype, Operand imm_type, string asm> + : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr, + imm_type:$imms), + asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>, + Sched<[WriteIS, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<6> immr; + bits<6> imms; + + let Inst{30-29} = opc; + let Inst{28-23} = 0b100110; + let Inst{21-16} = immr; + let Inst{15-10} = imms; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass BitfieldImmWith2RegArgs opc, string asm> { + def Wri : BaseBitfieldImmWith2RegArgs { + let Inst{31} = 0; + let Inst{22} = 0; + // imms<5> and immr<5> must be zero, else ReservedValue(). + let Inst{21} = 0; + let Inst{15} = 0; + } + def Xri : BaseBitfieldImmWith2RegArgs { + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +//--- +// Logical +//--- + +// Logical (immediate) +class BaseLogicalImm opc, RegisterClass dregtype, + RegisterClass sregtype, Operand imm_type, string asm, + list pattern> + : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm), + asm, "\t$Rd, $Rn, $imm", "", pattern>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<13> imm; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100100; + let Inst{22} = imm{12}; + let Inst{21-16} = imm{11-6}; + let Inst{15-10} = imm{5-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeLogicalImmInstruction"; +} + +// Logical (shifted register) +class BaseLogicalSReg opc, bit N, RegisterClass regtype, + logical_shifted_reg shifted_regtype, string asm, + list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + // The operands are in order to match the 'addr' MI operands, so we + // don't need an encoder method and by-name matching. Just use the default + // in-order handling. Since we're using by-order, make sure the names + // do not match. + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<8> shift; + let Inst{30-29} = opc; + let Inst{28-24} = 0b01010; + let Inst{23-22} = shift{7-6}; + let Inst{21} = N; + let Inst{20-16} = src2; + let Inst{15-10} = shift{5-0}; + let Inst{9-5} = src1; + let Inst{4-0} = dst; + + let DecoderMethod = "DecodeThreeAddrSRegInstruction"; +} + +// Aliases for register+register logical instructions. 
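+// (e.g. "and w0, w1, w2" is written without a shift but encodes the
+// shifted-register form with an implicit "lsl #0". For reference, the
+// instruction definitions instantiate these along the lines of
+// "defm AND : LogicalReg<0b00, 0, "and", and>;".)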
+class LogicalRegAlias + : InstAlias; + +let AddedComplexity = 6 in +multiclass LogicalImm opc, string mnemonic, SDNode OpNode> { + def Wri : BaseLogicalImm { + let Inst{31} = 0; + let Inst{22} = 0; // 64-bit version has an additional bit of immediate. + } + def Xri : BaseLogicalImm { + let Inst{31} = 1; + } +} + +multiclass LogicalImmS opc, string mnemonic, SDNode OpNode> { + let isCompare = 1, Defs = [NZCV] in { + def Wri : BaseLogicalImm { + let Inst{31} = 0; + let Inst{22} = 0; // 64-bit version has an additional bit of immediate. + } + def Xri : BaseLogicalImm { + let Inst{31} = 1; + } + } // end Defs = [NZCV] +} + +class BaseLogicalRegPseudo + : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteI, ReadI, ReadI]>; + +// Split from LogicalImm as not all instructions have both. +multiclass LogicalReg opc, bit N, string mnemonic, + SDPatternOperator OpNode> { + def Wrr : BaseLogicalRegPseudo; + def Xrr : BaseLogicalRegPseudo; + + def Wrs : BaseLogicalSReg { + let Inst{31} = 0; + } + def Xrs : BaseLogicalSReg { + let Inst{31} = 1; + } + + def : LogicalRegAlias(NAME#"Wrs"), GPR32>; + def : LogicalRegAlias(NAME#"Xrs"), GPR64>; +} + +// Split from LogicalReg to allow setting NZCV Defs +multiclass LogicalRegS opc, bit N, string mnemonic, + SDPatternOperator OpNode = null_frag> { + let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def Wrr : BaseLogicalRegPseudo; + def Xrr : BaseLogicalRegPseudo; + + def Wrs : BaseLogicalSReg { + let Inst{31} = 0; + } + def Xrs : BaseLogicalSReg { + let Inst{31} = 1; + } + } // Defs = [NZCV] + + def : LogicalRegAlias(NAME#"Wrs"), GPR32>; + def : LogicalRegAlias(NAME#"Xrs"), GPR64>; +} + +//--- +// Conditionally set flags +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseCondSetFlagsImm + : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $imm, $nzcv, $cond", "", []>, + Sched<[WriteI, ReadI]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + + bits<5> Rn; + bits<5> imm; + bits<4> nzcv; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b111010010; + let Inst{20-16} = imm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4} = 0b0; + let Inst{3-0} = nzcv; +} + +multiclass CondSetFlagsImm { + def Wi : BaseCondSetFlagsImm { + let Inst{31} = 0; + } + def Xi : BaseCondSetFlagsImm { + let Inst{31} = 1; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseCondSetFlagsReg + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + + bits<5> Rn; + bits<5> Rm; + bits<4> nzcv; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b111010010; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4} = 0b0; + let Inst{3-0} = nzcv; +} + +multiclass CondSetFlagsReg { + def Wr : BaseCondSetFlagsReg { + let Inst{31} = 0; + } + def Xr : BaseCondSetFlagsReg { + let Inst{31} = 1; + } +} + +//--- +// Conditional select +//--- + +class BaseCondSelect op2, RegisterClass regtype, string asm> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), + asm, "\t$Rd, $Rn, $Rm, $cond", "", + [(set regtype:$Rd, + (AArch64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + + bits<5> Rd; + bits<5> Rn; + 
bits<5> Rm;
+  bits<4> cond;
+
+  let Inst{30} = op;
+  let Inst{29-21} = 0b011010100;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = op2;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rd;
+}
+
+multiclass CondSelect<bit op, bits<2> op2, string asm> {
+  def Wr : BaseCondSelect<op, op2, GPR32, asm> {
+    let Inst{31} = 0;
+  }
+  def Xr : BaseCondSelect<op, op2, GPR64, asm> {
+    let Inst{31} = 1;
+  }
+}
+
+class BaseCondSelectOp<bit op, bits<2> op2, RegisterClass regtype, string asm,
+                       PatFrag frag>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+        asm, "\t$Rd, $Rn, $Rm, $cond", "",
+        [(set regtype:$Rd,
+              (AArch64csel regtype:$Rn, (frag regtype:$Rm),
+               (i32 imm:$cond), NZCV))]>,
+      Sched<[WriteI, ReadI, ReadI]> {
+  let Uses = [NZCV];
+
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> cond;
+
+  let Inst{30} = op;
+  let Inst{29-21} = 0b011010100;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = op2;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rd;
+}
+
+def inv_cond_XFORM : SDNodeXForm<imm, [{
+  AArch64CC::CondCode CC = static_cast<AArch64CC::CondCode>(N->getZExtValue());
+  return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), MVT::i32);
+}]>;
+
+multiclass CondSelectOp<bit op, bits<2> op2, string asm, PatFrag frag> {
+  def Wr : BaseCondSelectOp<op, op2, GPR32, asm, frag> {
+    let Inst{31} = 0;
+  }
+  def Xr : BaseCondSelectOp<op, op2, GPR64, asm, frag> {
+    let Inst{31} = 1;
+  }
+
+  def : Pat<(AArch64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV),
+            (!cast<Instruction>(NAME # Wr) GPR32:$Rn, GPR32:$Rm,
+             (inv_cond_XFORM imm:$cond))>;
+
+  def : Pat<(AArch64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV),
+            (!cast<Instruction>(NAME # Xr) GPR64:$Rn, GPR64:$Rm,
+             (inv_cond_XFORM imm:$cond))>;
+}
+
+//---
+// Special Mask Value
+//---
+def maski8_or_more : Operand<i32>,
+  ImmLeaf<i32, [{ return (Imm & 0xff) == 0xff; }]> {
+}
+def maski16_or_more : Operand<i32>,
+  ImmLeaf<i32, [{ return (Imm & 0xffff) == 0xffff; }]> {
+}
+
+
+//---
+// Load/store
+//---
+
+// (unsigned immediate)
+// Indexed for 8-bit registers. offset is in range [0,4095].
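+// Wider accesses scale the offset by the access size: e.g. an 8-byte load
+// such as "ldr x0, [x1, #32]" encodes 32/8 = 4 in the 12-bit field, so byte
+// offsets up to 4095*8 are reachable in steps of 8 (see uimm12s* below).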
+def am_indexed8 : ComplexPattern; +def am_indexed16 : ComplexPattern; +def am_indexed32 : ComplexPattern; +def am_indexed64 : ComplexPattern; +def am_indexed128 : ComplexPattern; + +class UImm12OffsetOperand : AsmOperandClass { + let Name = "UImm12Offset" # Scale; + let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">"; + let PredicateMethod = "isUImm12Offset<" # Scale # ">"; + let DiagnosticType = "InvalidMemoryIndexed" # Scale; +} + +def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>; +def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>; +def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>; +def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>; +def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>; + +class uimm12_scaled : Operand { + let ParserMatchClass + = !cast("UImm12OffsetScale" # Scale # "Operand"); + let EncoderMethod + = "getLdStUImm12OpValue"; + let PrintMethod = "printUImm12Offset<" # Scale # ">"; +} + +def uimm12s1 : uimm12_scaled<1>; +def uimm12s2 : uimm12_scaled<2>; +def uimm12s4 : uimm12_scaled<4>; +def uimm12s8 : uimm12_scaled<8>; +def uimm12s16 : uimm12_scaled<16>; + +class BaseLoadStoreUI sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, list pattern> + : I { + bits<5> Rt; + + bits<5> Rn; + bits<12> offset; + + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b01; + let Inst{23-22} = opc; + let Inst{21-10} = offset; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeUnsignedLdStInstruction"; +} + +multiclass LoadUI sz, bit V, bits<2> opc, RegisterClass regtype, + Operand indextype, string asm, list pattern> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def ui : BaseLoadStoreUI, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUI sz, bit V, bits<2> opc, RegisterClass regtype, + Operand indextype, string asm, list pattern> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def ui : BaseLoadStoreUI, + Sched<[WriteST]>; + + def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +def PrefetchOperand : AsmOperandClass { + let Name = "Prefetch"; + let ParserMethod = "tryParsePrefetch"; +} +def prfop : Operand { + let PrintMethod = "printPrefetchOp"; + let ParserMatchClass = PrefetchOperand; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class PrefetchUI sz, bit V, bits<2> opc, string asm, list pat> + : BaseLoadStoreUI, + Sched<[WriteLD]>; + +//--- +// Load literal +//--- + +// Load literal address: 19-bit immediate. The low two bits of the target +// offset are implied zero and so are not part of the immediate. 
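+// The 19-bit signed word offset gives a +/-1MiB range from the instruction;
+// e.g. an encoded offset of 1 addresses PC+4.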
+def am_ldrlit : Operand { + let EncoderMethod = "getLoadLiteralOpValue"; + let DecoderMethod = "DecodePCRelLabel19"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = PCRelLabel19Operand; +} + +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class LoadLiteral opc, bit V, RegisterClass regtype, string asm> + : I<(outs regtype:$Rt), (ins am_ldrlit:$label), + asm, "\t$Rt, $label", "", []>, + Sched<[WriteLD]> { + bits<5> Rt; + bits<19> label; + let Inst{31-30} = opc; + let Inst{29-27} = 0b011; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-5} = label; + let Inst{4-0} = Rt; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class PrefetchLiteral opc, bit V, string asm, list pat> + : I<(outs), (ins prfop:$Rt, am_ldrlit:$label), + asm, "\t$Rt, $label", "", pat>, + Sched<[WriteLD]> { + bits<5> Rt; + bits<19> label; + let Inst{31-30} = opc; + let Inst{29-27} = 0b011; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-5} = label; + let Inst{4-0} = Rt; +} + +//--- +// Load/store register offset +//--- + +def ro_Xindexed8 : ComplexPattern", []>; +def ro_Xindexed16 : ComplexPattern", []>; +def ro_Xindexed32 : ComplexPattern", []>; +def ro_Xindexed64 : ComplexPattern", []>; +def ro_Xindexed128 : ComplexPattern", []>; + +def ro_Windexed8 : ComplexPattern", []>; +def ro_Windexed16 : ComplexPattern", []>; +def ro_Windexed32 : ComplexPattern", []>; +def ro_Windexed64 : ComplexPattern", []>; +def ro_Windexed128 : ComplexPattern", []>; + +class MemExtendOperand : AsmOperandClass { + let Name = "Mem" # Reg # "Extend" # Width; + let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">"; + let RenderMethod = "addMemExtendOperands"; + let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width; +} + +def MemWExtend8Operand : MemExtendOperand<"W", 8> { + // The address "[x0, x1, lsl #0]" actually maps to the variant which performs + // the trivial shift. + let RenderMethod = "addMemExtend8Operands"; +} +def MemWExtend16Operand : MemExtendOperand<"W", 16>; +def MemWExtend32Operand : MemExtendOperand<"W", 32>; +def MemWExtend64Operand : MemExtendOperand<"W", 64>; +def MemWExtend128Operand : MemExtendOperand<"W", 128>; + +def MemXExtend8Operand : MemExtendOperand<"X", 8> { + // The address "[x0, x1, lsl #0]" actually maps to the variant which performs + // the trivial shift. + let RenderMethod = "addMemExtend8Operands"; +} +def MemXExtend16Operand : MemExtendOperand<"X", 16>; +def MemXExtend32Operand : MemExtendOperand<"X", 32>; +def MemXExtend64Operand : MemExtendOperand<"X", 64>; +def MemXExtend128Operand : MemExtendOperand<"X", 128>; + +class ro_extend + : Operand { + let ParserMatchClass = ParserClass; + let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">"; + let DecoderMethod = "DecodeMemExtend"; + let EncoderMethod = "getMemExtendOpValue"; + let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift); +} + +def ro_Wextend8 : ro_extend; +def ro_Wextend16 : ro_extend; +def ro_Wextend32 : ro_extend; +def ro_Wextend64 : ro_extend; +def ro_Wextend128 : ro_extend; + +def ro_Xextend8 : ro_extend; +def ro_Xextend16 : ro_extend; +def ro_Xextend32 : ro_extend; +def ro_Xextend64 : ro_extend; +def ro_Xextend128 : ro_extend; + +class ROAddrMode { + // CodeGen-level pattern covering the entire addressing mode. + ComplexPattern Wpat = windex; + ComplexPattern Xpat = xindex; + + // Asm-level Operand covering the valid "uxtw #3" style syntax. 
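+  // (The shift amount, when present, must be log2 of the access size: for
+  // the 8-byte ro64 mode below, "[x0, w1, uxtw #3]" scales the index by 8,
+  // while "[x0, w1, uxtw]" leaves it unscaled.)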
+ Operand Wext = wextend; + Operand Xext = xextend; +} + +def ro8 : ROAddrMode; +def ro16 : ROAddrMode; +def ro32 : ROAddrMode; +def ro64 : ROAddrMode; +def ro128 : ROAddrMode; + +class LoadStore8RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +class ROInstAlias + : InstAlias; + +multiclass Load8RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore8RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore8RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store8RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore8RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore8RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore16RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load16RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore16RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore16RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store16RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore16RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore16RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore32RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? 
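+  // (Together with Inst{13}, set by the multiclasses below to select a W or
+  // X index register, this encodes e.g. "[x0, w1, sxtw #2]" for a 4-byte
+  // access as extend = 0b11: sign-extended and scaled.)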
+ let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load32RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore32RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore32RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store32RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore32RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore32RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore64RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load64RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roW : LoadStore64RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roX : LoadStore64RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store64RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roW : LoadStore64RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roX : LoadStore64RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore128RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? 
+ let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load128RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roW : LoadStore128RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roX : LoadStore128RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store128RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roW : LoadStore128RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roX : LoadStore128RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class BasePrefetchRO sz, bit V, bits<2> opc, dag outs, dag ins, + string asm, list pat> + : I, + Sched<[WriteLD]> { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass PrefetchRO sz, bit V, bits<2> opc, string asm> { + def roW : BasePrefetchRO { + let Inst{13} = 0b0; + } + + def roX : BasePrefetchRO { + let Inst{13} = 0b1; + } + + def : InstAlias<"prfm $Rt, [$Rn, $Rm]", + (!cast(NAME # "roX") prfop:$Rt, + GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; +} + +//--- +// Load/store unscaled immediate +//--- + +def am_unscaled8 : ComplexPattern; +def am_unscaled16 : ComplexPattern; +def am_unscaled32 : ComplexPattern; +def am_unscaled64 : ComplexPattern; +def am_unscaled128 :ComplexPattern; + +class BaseLoadStoreUnscale sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, list pattern> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +multiclass LoadUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, list pattern> { + let AddedComplexity = 1 in // try this before LoadUI + def i : BaseLoadStoreUnscale, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, list pattern> { + let AddedComplexity = 1 in // try this before StoreUI + def i : BaseLoadStoreUnscale, + Sched<[WriteST]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass PrefetchUnscaled sz, bit V, bits<2> opc, string asm, + list pat> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in + def i : BaseLoadStoreUnscale, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store unscaled immediate, unprivileged +//--- + +class BaseLoadStoreUnprivileged 
sz, bit V, bits<2> opc, + dag oops, dag iops, string asm> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +multiclass LoadUnprivileged sz, bit V, bits<2> opc, + RegisterClass regtype, string asm> { + let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in + def i : BaseLoadStoreUnprivileged, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUnprivileged sz, bit V, bits<2> opc, + RegisterClass regtype, string asm> { + let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in + def i : BaseLoadStoreUnprivileged, + Sched<[WriteST]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store pre-indexed +//--- + +class BaseLoadStorePreIdx sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, string cstr, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b11; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPreIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm> + : BaseLoadStorePreIdx, + Sched<[WriteLD, WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class StorePreIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, SDPatternOperator storeop, ValueType Ty> + : BaseLoadStorePreIdx, + Sched<[WriteAdr, WriteST]>; +} // hasSideEffects = 0 + +//--- +// Load/store post-indexed +//--- + +// (pre-index) load/stores. 
+class BaseLoadStorePostIdx sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, string cstr, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0b0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPostIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm> + : BaseLoadStorePostIdx, + Sched<[WriteLD, WriteI]>; + +let mayStore = 1, mayLoad = 0 in +class StorePostIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, SDPatternOperator storeop, ValueType Ty> + : BaseLoadStorePostIdx, + Sched<[WriteAdr, WriteST, ReadAdrBase]>; +} // hasSideEffects = 0 + + +//--- +// Load/store pair +//--- + +// (indexed, offset) + +class BaseLoadStorePairOffset opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b010; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +multiclass LoadPairOffset opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in + def i : BaseLoadStorePairOffset, + Sched<[WriteLD, WriteLDHi]>; + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + + +multiclass StorePairOffset opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in + def i : BaseLoadStorePairOffset, + Sched<[WriteSTP]>; + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +// (pre-indexed) +class BaseLoadStorePairPreIdx opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b011; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPairPreIdx opc, bit V, RegisterClass regtype, + Operand indextype, string asm> + : BaseLoadStorePairPreIdx, + Sched<[WriteLD, WriteLDHi, WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class StorePairPreIdx opc, bit V, RegisterClass regtype, + Operand indextype, string asm> + : BaseLoadStorePairPreIdx, + Sched<[WriteAdr, WriteSTP]>; +} // hasSideEffects = 0 + +// (post-indexed) + +class BaseLoadStorePairPostIdx opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b001; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPairPostIdx opc, bit V, RegisterClass regtype, + Operand idxtype, string asm> + : BaseLoadStorePairPostIdx, + Sched<[WriteLD, WriteLDHi, 
WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class StorePairPostIdx opc, bit V, RegisterClass regtype, + Operand idxtype, string asm> + : BaseLoadStorePairPostIdx, + Sched<[WriteAdr, WriteSTP]>; +} // hasSideEffects = 0 + +// (no-allocate) + +class BaseLoadStorePairNoAlloc opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b000; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +multiclass LoadPairNoAlloc opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in + def i : BaseLoadStorePairNoAlloc, + Sched<[WriteLD, WriteLDHi]>; + + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +multiclass StorePairNoAlloc opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in + def i : BaseLoadStorePairNoAlloc, + Sched<[WriteSTP]>; + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store exclusive +//--- + +// True exclusive operations write to and/or read from the system's exclusive +// monitors, which as far as a compiler is concerned can be modelled as a +// random shared memory address. Hence LoadExclusive mayStore. +// +// Since these instructions have the undefined register bits set to 1 in +// their canonical form, we need a post encoder method to set those bits +// to 1 when encoding these instructions. We do this using the +// fixLoadStoreExclusive function. This function has template parameters: +// +// fixLoadStoreExclusive +// +// hasRs indicates that the instruction uses the Rs field, so we won't set +// it to 1 (and the same for Rt2). We don't need template parameters for +// the other register fields since Rt and Rn are always used. +// +let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in +class BaseLoadStoreExclusive sz, bit o2, bit L, bit o1, bit o0, + dag oops, dag iops, string asm, string operands> + : I { + let Inst{31-30} = sz; + let Inst{29-24} = 0b001000; + let Inst{23} = o2; + let Inst{22} = L; + let Inst{21} = o1; + let Inst{15} = o0; + + let DecoderMethod = "DecodeExclusiveLdStInstruction"; +} + +// Neither Rs nor Rt2 operands. +class LoadStoreExclusiveSimple sz, bit o2, bit L, bit o1, bit o0, + dag oops, dag iops, string asm, string operands> + : BaseLoadStoreExclusive { + bits<5> Rt; + bits<5> Rn; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let PostEncoderMethod = "fixLoadStoreExclusive<0,0>"; +} + +// Simple load acquires don't set the exclusive monitor +let mayLoad = 1, mayStore = 0 in +class LoadAcquire sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple, + Sched<[WriteLD]>; + +class LoadExclusive sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple, + Sched<[WriteLD]>; + +class LoadExclusivePair sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive, + Sched<[WriteLD, WriteLDHi]> { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let PostEncoderMethod = "fixLoadStoreExclusive<0,1>"; +} + +// Simple store release operations do not check the exclusive monitor. 
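+// (i.e. "stlr w0, [x1]" always succeeds and only imposes release ordering;
+// only the store-exclusive forms below, e.g. "stxr w2, w0, [x1]", report
+// the monitor's success or failure in their status register.)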
+let mayLoad = 0, mayStore = 1 in +class StoreRelease sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple, + Sched<[WriteST]>; + +let mayLoad = 1, mayStore = 1 in +class StoreExclusive sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive, + Sched<[WriteSTX]> { + bits<5> Ws; + bits<5> Rt; + bits<5> Rn; + let Inst{20-16} = Ws; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let Constraints = "@earlyclobber $Ws"; + let PostEncoderMethod = "fixLoadStoreExclusive<1,0>"; +} + +class StoreExclusivePair sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive, + Sched<[WriteSTX]> { + bits<5> Ws; + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + let Inst{20-16} = Ws; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let Constraints = "@earlyclobber $Ws"; +} + +//--- +// Exception generation +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class ExceptionGeneration op1, bits<2> ll, string asm> + : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>, + Sched<[WriteSys]> { + bits<16> imm; + let Inst{31-24} = 0b11010100; + let Inst{23-21} = op1; + let Inst{20-5} = imm; + let Inst{4-2} = 0b000; + let Inst{1-0} = ll; +} + +let Predicates = [HasFPARMv8] in { + +//--- +// Floating point to integer conversion +//--- + +class BaseFPToIntegerUnscaled type, bits<2> rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + string asm, list pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn), + asm, "\t$Rd, $Rn", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-29} = 0b00; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseFPToInteger type, bits<2> rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + Operand immType, string asm, list pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), + asm, "\t$Rd, $Rn, $scale", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-29} = 0b00; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 0; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = scale; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPToIntegerUnscaled rmode, bits<3> opcode, string asm, + SDPatternOperator OpN> { + // Unscaled single-precision to 32-bit + def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm, + [(set GPR32:$Rd, (OpN FPR32:$Rn))]> { + let Inst{31} = 0; // 32-bit GPR flag + } + + // Unscaled single-precision to 64-bit + def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm, + [(set GPR64:$Rd, (OpN FPR32:$Rn))]> { + let Inst{31} = 1; // 64-bit GPR flag + } + + // Unscaled double-precision to 32-bit + def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm, + [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> { + let Inst{31} = 0; // 32-bit GPR flag + } + + // Unscaled double-precision to 64-bit + def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm, + [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } +} + +multiclass FPToIntegerScaled rmode, bits<3> opcode, string asm, + SDPatternOperator OpN> { + // Scaled single-precision to 
32-bit + def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32, + fixedpoint_f32_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR32:$Rn, + fixedpoint_f32_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + } + + // Scaled single-precision to 64-bit + def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64, + fixedpoint_f32_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR32:$Rn, + fixedpoint_f32_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } + + // Scaled double-precision to 32-bit + def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32, + fixedpoint_f64_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR64:$Rn, + fixedpoint_f64_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + } + + // Scaled double-precision to 64-bit + def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64, + fixedpoint_f64_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR64:$Rn, + fixedpoint_f64_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } +} + +//--- +// Integer to floating point conversion +//--- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseIntegerToFP pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), + asm, "\t$Rd, $Rn, $scale", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-23} = 0b00111100; + let Inst{21-17} = 0b00001; + let Inst{16} = isUnsigned; + let Inst{15-10} = scale; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseIntegerToFPUnscaled + : I<(outs dstType:$Rd), (ins srcType:$Rn), + asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-23} = 0b00111100; + let Inst{21-17} = 0b10001; + let Inst{16} = isUnsigned; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass IntegerToFP { + // Unscaled + def UWSri: BaseIntegerToFPUnscaled { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def UWDri: BaseIntegerToFPUnscaled { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + def UXSri: BaseIntegerToFPUnscaled { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def UXDri: BaseIntegerToFPUnscaled { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + // Scaled + def SWSri: BaseIntegerToFP { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + let scale{5} = 1; + } + + def SWDri: BaseIntegerToFP { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + let scale{5} = 1; + } + + def SXSri: BaseIntegerToFP { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def SXDri: BaseIntegerToFP { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } +} + +//--- +// Unscaled integer <-> floating point conversion (i.e. FMOV) +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversion rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + string asm> + : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", + // We use COPY_TO_REGCLASS for these bitconvert operations. + // copyPhysReg() expands the resultant COPY instructions after + // regalloc is done. This gives greater freedom for the allocator + // and related passes (coalescing, copy propagation, et. al.) to + // be more effective. 
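+    // (So an IR-level bitcast between i64 and f64, say, becomes a plain
+    // cross-class COPY that copyPhysReg() later expands to the FMOV defined
+    // here, rather than being matched via the disabled pattern below.)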
+ [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-23} = 0b00111100; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversionToHigh rmode, bits<3> opcode, + RegisterClass srcType, RegisterOperand dstType, string asm, + string kind> + : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, + "{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-23} = 0b00111101; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeFMOVLaneInstruction"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversionFromHigh rmode, bits<3> opcode, + RegisterOperand srcType, RegisterClass dstType, string asm, + string kind> + : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, + "{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-23} = 0b00111101; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeFMOVLaneInstruction"; +} + + + +multiclass UnscaledConversion { + def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128, + asm, ".d"> { + let Inst{31} = 1; + let Inst{22} = 0; + } + + def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64, + asm, ".d"> { + let Inst{31} = 1; + let Inst{22} = 0; + } +} + +//--- +// Floating point conversion +//--- + +class BaseFPConversion type, bits<2> opcode, RegisterClass dstType, + RegisterClass srcType, string asm, list pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-24} = 0b00011110; + let Inst{23-22} = type; + let Inst{21-17} = 0b10001; + let Inst{16-15} = opcode; + let Inst{14-10} = 0b10000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPConversion { + // Double-precision to Half-precision + def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, + [(set FPR16:$Rd, (fround FPR64:$Rn))]>; + + // Double-precision to Single-precision + def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm, + [(set FPR32:$Rd, (fround FPR64:$Rn))]>; + + // Half-precision to Double-precision + def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, + [(set FPR64:$Rd, (fextend FPR16:$Rn))]>; + + // Half-precision to Single-precision + def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, + [(set FPR32:$Rd, (fextend FPR16:$Rn))]>; + + // 
Single-precision to Double-precision + def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, + [(set FPR64:$Rd, (fextend FPR32:$Rn))]>; + + // Single-precision to Half-precision + def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, + [(set FPR16:$Rd, (fround FPR32:$Rn))]>; +} + +//--- +// Single operand floating point data processing +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSingleOperandFPData opcode, RegisterClass regtype, + ValueType vt, string asm, SDPatternOperator node> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", + [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>, + Sched<[WriteF]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-23} = 0b000111100; + let Inst{21-19} = 0b100; + let Inst{18-15} = opcode; + let Inst{14-10} = 0b10000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SingleOperandFPData opcode, string asm, + SDPatternOperator node = null_frag> { + def Sr : BaseSingleOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Dr : BaseSingleOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + +//--- +// Two operand floating point data processing +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseTwoOperandFPData opcode, RegisterClass regtype, + string asm, list pat> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pat>, + Sched<[WriteF]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass TwoOperandFPData opcode, string asm, + SDPatternOperator node = null_frag> { + def Srr : BaseTwoOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Drr : BaseTwoOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + +multiclass TwoOperandFPDataNeg opcode, string asm, SDNode node> { + def Srr : BaseTwoOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Drr : BaseTwoOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + + +//--- +// Three operand floating point data processing +//--- + +class BaseThreeOperandFPData pat> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra), + asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>, + Sched<[WriteFMul]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<5> Ra; + let Inst{31-23} = 0b000111110; + let Inst{21} = isNegated; + let Inst{20-16} = Rm; + let Inst{15} = isSub; + let Inst{14-10} = Ra; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass ThreeOperandFPData { + def Srrr : BaseThreeOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Drrr : BaseThreeOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + +//--- +// Floating point data comparisons +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseOneOperandFPComparison pat> + : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>, + Sched<[WriteFCmp]> { + bits<5> Rn; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + + let Inst{15-10} = 0b001000; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = 0b1000; + + // Rm should be 0b00000 canonically, but we need to accept any value. 
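+  // (This is the "fcmp $Rn, #0.0" form: no operand occupies the Rm field,
+  // so the post-encoder simply rewrites whatever lands there to the
+  // canonical all-zero value.)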
+ let PostEncoderMethod = "fixOneOperandFPComparison"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseTwoOperandFPComparison pat> + : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>, + Sched<[WriteFCmp]> { + bits<5> Rm; + bits<5> Rn; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-10} = 0b001000; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = 0b0000; +} + +multiclass FPComparison { + let Defs = [NZCV] in { + def Srr : BaseTwoOperandFPComparison { + let Inst{22} = 0; + } + + def Sri : BaseOneOperandFPComparison { + let Inst{22} = 0; + } + + def Drr : BaseTwoOperandFPComparison { + let Inst{22} = 1; + } + + def Dri : BaseOneOperandFPComparison { + let Inst{22} = 1; + } + } // Defs = [NZCV] +} + +//--- +// Floating point conditional comparisons +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseFPCondComparison + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, + Sched<[WriteFCmp]> { + bits<5> Rn; + bits<5> Rm; + bits<4> nzcv; + bits<4> cond; + + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = nzcv; +} + +multiclass FPCondComparison { + let Defs = [NZCV], Uses = [NZCV] in { + def Srr : BaseFPCondComparison { + let Inst{22} = 0; + } + + def Drr : BaseFPCondComparison { + let Inst{22} = 1; + } + } // Defs = [NZCV], Uses = [NZCV] +} + +//--- +// Floating point conditional select +//--- + +class BaseFPCondSelect + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), + asm, "\t$Rd, $Rn, $Rm, $cond", "", + [(set regtype:$Rd, + (AArch64csel (vt regtype:$Rn), regtype:$Rm, + (i32 imm:$cond), NZCV))]>, + Sched<[WriteF]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> cond; + + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b11; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPCondSelect { + let Uses = [NZCV] in { + def Srrr : BaseFPCondSelect { + let Inst{22} = 0; + } + + def Drrr : BaseFPCondSelect { + let Inst{22} = 1; + } + } // Uses = [NZCV] +} + +//--- +// Floating move immediate +//--- + +class BaseFPMoveImmediate + : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "", + [(set regtype:$Rd, fpimmtype:$imm)]>, + Sched<[WriteFImm]> { + bits<5> Rd; + bits<8> imm; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-13} = imm; + let Inst{12-5} = 0b10000000; + let Inst{4-0} = Rd; +} + +multiclass FPMoveImmediate { + def Si : BaseFPMoveImmediate { + let Inst{22} = 0; + } + + def Di : BaseFPMoveImmediate { + let Inst{22} = 1; + } +} +} // end of 'let Predicates = [HasFPARMv8]' + +//---------------------------------------------------------------------------- +// AdvSIMD +//---------------------------------------------------------------------------- + +let Predicates = [HasNEON] in { + +//---------------------------------------------------------------------------- +// AdvSIMD three register vector instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVector size, bits<5> opcode, + RegisterOperand regtype, string asm, string kind, + list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), 
asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVectorTied size, bits<5> opcode, + RegisterOperand regtype, string asm, string kind, + list pattern> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// All operand sizes distinguished in the encoding. +multiclass SIMDThreeSameVector opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + asm, ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + asm, ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; + def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128, + asm, ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; +} + +// As above, but D sized elements unsupported. 
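+// (Illustrative instantiation, a sketch in the style of AArch64InstrInfo.td,
+// where the opcode value below is the one used for integer multiply:
+//   defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
+// This expands to .8b/.16b/.4h/.8h/.2s/.4s forms, with no .2d variant.)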
+multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+                                      asm, ".8b",
+    [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>;
+  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+                                      asm, ".16b",
+    [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>;
+  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+                                      asm, ".4h",
+    [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>;
+  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+                                      asm, ".8h",
+    [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>;
+  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+                                      asm, ".2s",
+    [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>;
+  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+                                      asm, ".4s",
+    [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>;
+}
+
+multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm,
+                                      SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64,
+                                          asm, ".8b",
+    [(set (v8i8 V64:$dst),
+          (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128,
+                                          asm, ".16b",
+    [(set (v16i8 V128:$dst),
+          (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+  def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64,
+                                          asm, ".4h",
+    [(set (v4i16 V64:$dst),
+          (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128,
+                                          asm, ".8h",
+    [(set (v8i16 V128:$dst),
+          (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64,
+                                          asm, ".2s",
+    [(set (v2i32 V64:$dst),
+          (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128,
+                                          asm, ".4s",
+    [(set (v4i32 V128:$dst),
+          (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+}
+
+// As above, but only B sized elements supported.
+multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+                                      asm, ".8b",
+    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+                                      asm, ".16b",
+    [(set (v16i8 V128:$Rd),
+          (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+}
+
+// As above, but only S and D sized floating point elements supported.
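+// (Illustrative instantiation, a sketch following AArch64InstrInfo.td usage:
+//   defm FADD : SIMDThreeSameVectorFP<0, 0, 0b11010, "fadd", fadd>;
+// the extra S argument becomes the high bit of the size field.)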
+multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<5> opc,
+                                 string asm, SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
+                                      asm, ".2s",
+    [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
+                                      asm, ".4s",
+    [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
+                                      asm, ".2d",
+    [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<5> opc,
+                                    string asm,
+                                    SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
+                                      asm, ".2s",
+    [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
+                                      asm, ".4s",
+    [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
+                                      asm, ".2d",
+    [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc,
+                                     string asm, SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64,
+                                          asm, ".2s",
+    [(set (v2f32 V64:$dst),
+          (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128,
+                                          asm, ".4s",
+    [(set (v4f32 V128:$dst),
+          (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128,
+                                          asm, ".2d",
+    [(set (v2f64 V128:$dst),
+          (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+// As above, but D and B sized elements unsupported.
+multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
+                                 SDPatternOperator OpNode> {
+  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+                                      asm, ".4h",
+    [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+                                      asm, ".8h",
+    [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+                                      asm, ".2s",
+    [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+                                      asm, ".4s",
+    [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+}
+
+// Logical three vector ops share opcode bits, and only use B sized elements.
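+// (For these, the size field selects the operation rather than the element
+// width; illustrative instantiations in the style of AArch64InstrInfo.td:
+//   defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
+//   defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;)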
+multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64,
+                                      asm, ".8b",
+    [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>;
+  def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128,
+                                      asm, ".16b",
+    [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>;
+
+  def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
+            (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+  def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
+            (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+  def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)),
+            (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+
+  def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
+            (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+  def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
+            (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+  def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
+            (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+}
+
+multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
+                                      string asm, SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64,
+                                          asm, ".8b",
+    [(set (v8i8 V64:$dst),
+          (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128,
+                                          asm, ".16b",
+    [(set (v16i8 V128:$dst),
+          (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+                  (v16i8 V128:$Rm)))]>;
+
+  def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
+                           (v4i16 V64:$RHS))),
+            (!cast<Instruction>(NAME#"v8i8")
+              V64:$LHS, V64:$MHS, V64:$RHS)>;
+  def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
+                           (v2i32 V64:$RHS))),
+            (!cast<Instruction>(NAME#"v8i8")
+              V64:$LHS, V64:$MHS, V64:$RHS)>;
+  def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
+                           (v1i64 V64:$RHS))),
+            (!cast<Instruction>(NAME#"v8i8")
+              V64:$LHS, V64:$MHS, V64:$RHS)>;
+
+  def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
+                           (v8i16 V128:$RHS))),
+            (!cast<Instruction>(NAME#"v16i8")
+              V128:$LHS, V128:$MHS, V128:$RHS)>;
+  def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
+                           (v4i32 V128:$RHS))),
+            (!cast<Instruction>(NAME#"v16i8")
+              V128:$LHS, V128:$MHS, V128:$RHS)>;
+  def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
+                           (v2i64 V128:$RHS))),
+            (!cast<Instruction>(NAME#"v16i8")
+              V128:$LHS, V128:$MHS, V128:$RHS)>;
+}
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD two register vector instructions.
+//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoSameVector size, bits<5> opcode, + RegisterOperand regtype, string asm, string dstkind, + string srckind, list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # + "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoSameVectorTied size, bits<5> opcode, + RegisterOperand regtype, string asm, string dstkind, + string srckind, list pattern> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # + "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// Supports B, H, and S element sizes. +multiclass SIMDTwoVectorBHS opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + +class BaseSIMDVectorLShiftLongBySize size, + RegisterOperand regtype, string asm, string dstkind, + string srckind, string amount> + : I<(outs V128:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount # + "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-24} = 0b101110; + let Inst{23-22} = size; + let Inst{21-10} = 0b100001001110; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDVectorLShiftLongBySizeBHS { + let neverHasSideEffects = 1 in { + def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64, + "shll", ".8h", ".8b", "8">; + def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128, + "shll2", ".8h", ".16b", "8">; + def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64, + "shll", ".4s", ".4h", "16">; + def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128, + "shll2", ".4s", ".8h", "16">; + def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64, + "shll", ".2d", ".2s", "32">; + def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128, + "shll2", ".2d", ".4s", "32">; + } +} + +// Supports all element sizes. 
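+// (Illustrative instantiation, sketched from AArch64InstrInfo.td: the
+// widening pairwise adds saddlp/uaddlp use this form, e.g.
+//   defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp",
+//                                   int_aarch64_neon_saddlp>;
+// while the accumulating sadalp/uadalp use the Tied variant below.)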
+multiclass SIMDLongTwoVector opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".4h", ".8b", + [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".8h", ".16b", + [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".2s", ".4h", + [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".4s", ".8h", + [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + asm, ".1d", ".2s", + [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + asm, ".2d", ".4s", + [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + +multiclass SIMDLongTwoVectorTied opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + asm, ".4h", ".8b", + [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), + (v8i8 V64:$Rn)))]>; + def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + asm, ".8h", ".16b", + [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), + (v16i8 V128:$Rn)))]>; + def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + asm, ".2s", ".4h", + [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), + (v4i16 V64:$Rn)))]>; + def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + asm, ".4s", ".8h", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), + (v8i16 V128:$Rn)))]>; + def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + asm, ".1d", ".2s", + [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), + (v2i32 V64:$Rn)))]>; + def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + asm, ".2d", ".4s", + [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), + (v4i32 V128:$Rn)))]>; +} + +// Supports all element sizes, except 1xD. 
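+// (Illustrative instantiation, sketched from AArch64InstrInfo.td:
+//   defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",
+//                                       int_aarch64_neon_suqadd>;
+// the accumulating ops are tied because $Rd is both a source and the dest.)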
+multiclass SIMDTwoVectorBHSDTied opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; + def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; +} + +multiclass SIMDTwoVectorBHSD opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; +} + + +// Supports only B element sizes. +multiclass SIMDTwoVectorB size, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + +} + +// Supports only B and H element sizes. +multiclass SIMDTwoVectorBH opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; +} + +// Supports only S and D element sizes, uses high bit of the size field +// as an extra opcode bit. 
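+// (Illustrative instantiation, sketched from AArch64InstrInfo.td:
+//   defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;
+// the second argument is the S bit that lands in size{1}.)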
+multiclass SIMDTwoVectorFP opc, string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; +} + +// Supports only S element size. +multiclass SIMDTwoVectorS opc, string asm, + SDPatternOperator OpNode> { + def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + + +multiclass SIMDTwoVectorFPToInt opc, string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; +} + +multiclass SIMDTwoVectorIntToFP opc, string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; +} + + +class BaseSIMDMixedTwoVector size, bits<5> opcode, + RegisterOperand inreg, RegisterOperand outreg, + string asm, string outkind, string inkind, + list pattern> + : I<(outs outreg:$Rd), (ins inreg:$Rn), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind # + "|" # outkind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseSIMDMixedTwoVectorTied size, bits<5> opcode, + RegisterOperand inreg, RegisterOperand outreg, + string asm, string outkind, string inkind, + list pattern> + : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind # + "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDMixedTwoVector opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64, + asm, ".8b", ".8h", + [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128, + asm#"2", ".16b", ".8h", []>; + def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64, + asm, ".4h", ".4s", + [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v8i16 : 
BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128, + asm#"2", ".8h", ".4s", []>; + def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64, + asm, ".2s", ".2d", + [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>; + def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; + + def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))), + (!cast(NAME # "v16i8") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))), + (!cast(NAME # "v8i16") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))), + (!cast(NAME # "v4i32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +} + +class BaseSIMDCmpTwoVector size, bits<5> opcode, + RegisterOperand regtype, + string asm, string kind, string zero, + ValueType dty, ValueType sty, SDNode OpNode> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # + "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", + [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// Comparisons support all element sizes, except 1xD. +multiclass SIMDCmpTwoVector opc, string asm, + SDNode OpNode> { + def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64, + asm, ".8b", "0", + v8i8, v8i8, OpNode>; + def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128, + asm, ".16b", "0", + v16i8, v16i8, OpNode>; + def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64, + asm, ".4h", "0", + v4i16, v4i16, OpNode>; + def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128, + asm, ".8h", "0", + v8i16, v8i16, OpNode>; + def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64, + asm, ".2s", "0", + v2i32, v2i32, OpNode>; + def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128, + asm, ".4s", "0", + v4i32, v4i32, OpNode>; + def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128, + asm, ".2d", "0", + v2i64, v2i64, OpNode>; +} + +// FP Comparisons support only S and D element sizes. 
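+// (These are the compare-against-#0.0 forms; an illustrative instantiation,
+// sketched from AArch64InstrInfo.td:
+//   defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;)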
+multiclass SIMDFPCmpTwoVector opc, + string asm, SDNode OpNode> { + + def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64, + asm, ".2s", "0.0", + v2i32, v2f32, OpNode>; + def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128, + asm, ".4s", "0.0", + v4i32, v4f32, OpNode>; + def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128, + asm, ".2d", "0.0", + v2i64, v2f64, OpNode>; + + def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; + def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; + def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; + def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDFPCvtTwoVector size, bits<5> opcode, + RegisterOperand outtype, RegisterOperand intype, + string asm, string VdTy, string VnTy, + list pattern> + : I<(outs outtype:$Rd), (ins intype:$Rn), asm, + !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseSIMDFPCvtTwoVectorTied size, bits<5> opcode, + RegisterOperand outtype, RegisterOperand intype, + string asm, string VdTy, string VnTy, + list pattern> + : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm, + !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDFPWidenTwoVector opc, string asm> { + def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64, + asm, ".4s", ".4h", []>; + def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128, + asm#"2", ".4s", ".8h", []>; + def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64, + asm, ".2d", ".2s", []>; + def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128, + asm#"2", ".2d", ".4s", []>; +} + +multiclass SIMDFPNarrowTwoVector opc, string asm> { + def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128, + asm, ".4h", ".4s", []>; + def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128, + asm#"2", ".8h", ".4s", []>; + def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, + asm, ".2s", ".2d", []>; + def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; +} + +multiclass SIMDFPInexactCvtTwoVector opc, string asm, + Intrinsic OpNode> { + def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, + asm, ".2s", ".2d", + [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>; + def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; + + def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))), + (!cast(NAME # "v4f32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD three register different-size vector instructions. 
+//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDDifferentThreeVector size, bits<4> opcode, + RegisterOperand outtype, RegisterOperand intype1, + RegisterOperand intype2, string asm, + string outkind, string inkind1, string inkind2, + list pattern> + : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # + "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29} = U; let Inst{28-24} = 0b01110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDDifferentThreeVectorTied size, bits<4> opcode, + RegisterOperand outtype, RegisterOperand intype1, + RegisterOperand intype2, string asm, + string outkind, string inkind1, string inkind2, + list pattern> + : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # + "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// FIXME: TableGen doesn't know how to deal with expanded types that also +// change the element count (in this case, placing the results in +// the high elements of the result register rather than the low +// elements). Until that's fixed, we can't code-gen those. +multiclass SIMDNarrowThreeVectorBHS opc, string asm, + Intrinsic IntOp> { + def v8i16_v8i8 : BaseSIMDDifferentThreeVector; + def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v4i16 : BaseSIMDDifferentThreeVector; + def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v2i64_v2i32 : BaseSIMDDifferentThreeVector; + def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied; + + + // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in + // a version attached to an instruction. 
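+  // Instead, each '2' variant is matched by a standalone Pat<> below, which
+  // folds the concat_vectors of the low half into the tied instruction.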
+ def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), + (v8i16 V128:$Rm))), + (!cast(NAME # "v8i16_v16i8") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), + (v4i32 V128:$Rm))), + (!cast(NAME # "v4i32_v8i16") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), + (v2i64 V128:$Rm))), + (!cast(NAME # "v2i64_v4i32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +} + +multiclass SIMDDifferentThreeVectorBD opc, string asm, + Intrinsic IntOp> { + def v8i8 : BaseSIMDDifferentThreeVector; + def v16i8 : BaseSIMDDifferentThreeVector; + let Predicates = [HasCrypto] in { + def v1i64 : BaseSIMDDifferentThreeVector; + def v2i64 : BaseSIMDDifferentThreeVector; + } + + def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)), + (v8i8 (extract_high_v16i8 V128:$Rm)))), + (!cast(NAME#"v16i8") V128:$Rn, V128:$Rm)>; +} + +multiclass SIMDLongThreeVectorHS opc, string asm, + SDPatternOperator OpNode> { + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +multiclass SIMDLongThreeVectorBHSabdl opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +multiclass SIMDLongThreeVectorTiedBHSabal opc, + string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; +} + +multiclass SIMDLongThreeVectorBHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +multiclass SIMDLongThreeVectorTiedBHS opc, + string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; +} + +multiclass SIMDLongThreeVectorSQDMLXTiedHS opc, string asm, + SDPatternOperator Accum> { + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; +} + +multiclass SIMDWideThreeVectorBHS opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : 
BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +//---------------------------------------------------------------------------- +// AdvSIMD bitwise extract from vector +//---------------------------------------------------------------------------- + +class BaseSIMDBitwiseExtract + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" # + "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "", + [(set (vty regtype:$Rd), + (AArch64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> imm; + let Inst{31} = 0; + let Inst{30} = size; + let Inst{29-21} = 0b101110000; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-11} = imm; + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +multiclass SIMDBitwiseExtract { + def v8i8 : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> { + let imm{3} = 0; + } + def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">; +} + +//---------------------------------------------------------------------------- +// AdvSIMD zip vector +//---------------------------------------------------------------------------- + +class BaseSIMDZipVector size, bits<3> opc, RegisterOperand regtype, + string asm, string kind, SDNode OpNode, ValueType valty> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm}", "", + [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29-24} = 0b001110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 0; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-12} = opc; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDZipVectoropc, string asm, + SDNode OpNode> { + def v8i8 : BaseSIMDZipVector<0b000, opc, V64, + asm, ".8b", OpNode, v8i8>; + def v16i8 : BaseSIMDZipVector<0b001, opc, V128, + asm, ".16b", OpNode, v16i8>; + def v4i16 : BaseSIMDZipVector<0b010, opc, V64, + asm, ".4h", OpNode, v4i16>; + def v8i16 : BaseSIMDZipVector<0b011, opc, V128, + asm, ".8h", OpNode, v8i16>; + def v2i32 : BaseSIMDZipVector<0b100, opc, V64, + asm, ".2s", OpNode, v2i32>; + def v4i32 : BaseSIMDZipVector<0b101, opc, V128, + asm, ".4s", OpNode, v4i32>; + def v2i64 : BaseSIMDZipVector<0b111, opc, V128, + asm, ".2d", OpNode, v2i64>; + + def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)), + (!cast(NAME#"v2i32") V64:$Rn, V64:$Rm)>; + def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)), + (!cast(NAME#"v4i32") V128:$Rn, V128:$Rm)>; + def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)), + (!cast(NAME#"v2i64") V128:$Rn, V128:$Rm)>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD three register scalar instructions +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDThreeScalar size, bits<5> opcode, + RegisterClass regtype, string asm, + list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, + "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21} = 0b1; - // Inherit 
Rm in 20-16 + let Inst{21} = 1; + let Inst{20-16} = Rm; let Inst{15-11} = opcode; - let Inst{10} = 0b1; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD 3 vector registers with different vector type -class NeonI_3VDiff size, bits<4> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = u; - let Inst{28-24} = 0b01110; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDThreeScalarD opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDThreeScalar; +} + +multiclass SIMDThreeScalarBHSD opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDThreeScalar; + def v1i32 : BaseSIMDThreeScalar; + def v1i16 : BaseSIMDThreeScalar; + def v1i8 : BaseSIMDThreeScalar; + + def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (!cast(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; + def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))), + (!cast(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>; +} + +multiclass SIMDThreeScalarHS opc, string asm, + SDPatternOperator OpNode> { + def v1i32 : BaseSIMDThreeScalar; + def v1i16 : BaseSIMDThreeScalar; +} + +multiclass SIMDThreeScalarSD opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def #NAME#64 : BaseSIMDThreeScalar; + def #NAME#32 : BaseSIMDThreeScalar; + } + + def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; +} + +multiclass SIMDThreeScalarFPCmp opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def #NAME#64 : BaseSIMDThreeScalar; + def #NAME#32 : BaseSIMDThreeScalar; + } + + def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; +} + +class BaseSIMDThreeScalarMixed size, bits<5> opcode, + dag oops, dag iops, string asm, string cstr, list pat> + : I, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21} = 0b1; - // Inherit Rm in 20-16 - let Inst{15-12} = opcode; - let Inst{11} = 0b0; - let Inst{10} = 0b0; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD two registers and an element -class NeonI_2VElem size, bits<4> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = u; - let Inst{28-24} = 0b01111; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDThreeScalarMixedHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def i16 : BaseSIMDThreeScalarMixed; + def i32 : BaseSIMDThreeScalarMixed; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDThreeScalarMixedTiedHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def i16 : BaseSIMDThreeScalarMixed; + def i32 : BaseSIMDThreeScalarMixed; +} + +//---------------------------------------------------------------------------- +// AdvSIMD two register scalar instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoScalar size, bits<5> opcode, + 
RegisterClass regtype, RegisterClass regtype2, + string asm, list pat> + : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, + "\t$Rd, $Rn", "", pat>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; let Inst{23-22} = size; - // l in Inst{21} - // m in Inst{20} - // Inherit Rm in 19-16 - let Inst{15-12} = opcode; - // h in Inst{11} - let Inst{10} = 0b0; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD 1 vector register with modified immediate -class NeonI_1VModImm patterns, InstrItinClass itin> - : A64InstRd { - bits<8> Imm; - bits<4> cmode; - let Inst{31} = 0b0; - let Inst{30} = q; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoScalarTied size, bits<5> opcode, + RegisterClass regtype, RegisterClass regtype2, + string asm, list pat> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm, + "\t$Rd, $Rn", "$Rd = $dst", pat>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDCmpTwoScalar size, bits<5> opcode, + RegisterClass regtype, string asm, string zero> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "\t$Rd, $Rn, #" # zero, "", []>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SIMDInexactCvtTwoScalar opcode, string asm> + : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "", + [(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-17} = 0b011111100110000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDCmpTwoScalarD opc, string asm, + SDPatternOperator OpNode> { + def v1i64rz : BaseSIMDCmpTwoScalar; + + def : Pat<(v1i64 (OpNode FPR64:$Rn)), + (!cast(NAME # v1i64rz) FPR64:$Rn)>; +} + +multiclass SIMDCmpTwoScalarSD opc, string asm, + SDPatternOperator OpNode> { + def v1i64rz : BaseSIMDCmpTwoScalar; + def v1i32rz : BaseSIMDCmpTwoScalar; + + def : InstAlias(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; + def : InstAlias(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; + + def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), + (!cast(NAME # v1i64rz) FPR64:$Rn)>; +} + +multiclass SIMDTwoScalarD opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v1i64 : BaseSIMDTwoScalar; + + def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), + (!cast(NAME # "v1i64") FPR64:$Rn)>; +} + +multiclass SIMDTwoScalarSD opc, string asm> { + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; +} + +multiclass SIMDTwoScalarCVTSD opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; +} + +multiclass SIMDTwoScalarBHSD opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; + def v1i16 : 
BaseSIMDTwoScalar; + def v1i8 : BaseSIMDTwoScalar; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), + (!cast(NAME # v1i64) FPR64:$Rn)>; +} + +multiclass SIMDTwoScalarBHSDTied opc, string asm, + Intrinsic OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v1i64 : BaseSIMDTwoScalarTied; + def v1i32 : BaseSIMDTwoScalarTied; + def v1i16 : BaseSIMDTwoScalarTied; + def v1i8 : BaseSIMDTwoScalarTied; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))), + (!cast(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>; +} + + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDTwoScalarMixedBHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v1i32 : BaseSIMDTwoScalar; + def v1i16 : BaseSIMDTwoScalar; + def v1i8 : BaseSIMDTwoScalar; +} + +//---------------------------------------------------------------------------- +// AdvSIMD scalar pairwise instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDPairwiseScalar size, bits<5> opcode, + RegisterOperand regtype, RegisterOperand vectype, + string asm, string kind> + : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, + "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b11000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDPairwiseScalarD opc, string asm> { + def v2i64p : BaseSIMDPairwiseScalar; +} + +multiclass SIMDPairwiseScalarSD opc, string asm> { + def v2i32p : BaseSIMDPairwiseScalar; + def v2i64p : BaseSIMDPairwiseScalar; +} + +//---------------------------------------------------------------------------- +// AdvSIMD across lanes instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDAcrossLanes size, bits<5> opcode, + RegisterClass regtype, RegisterOperand vectype, + string asm, string kind, list pattern> + : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, + "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b11000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDAcrossLanesBHS opcode, + string asm> { + def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8, V64, + asm, ".8b", []>; + def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128, + asm, ".16b", []>; + def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64, + asm, ".4h", []>; + def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128, + asm, ".8h", []>; + def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128, + asm, ".4s", []>; +} + +multiclass SIMDAcrossLanesHSD opcode, string asm> { + def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64, + asm, ".8b", []>; + def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128, + asm, ".16b", []>; + def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64, + asm, ".4h", []>; + def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128, + asm, ".8h", []>; + def v4i32v : BaseSIMDAcrossLanes<1, U, 
0b10, opcode, FPR64, V128, + asm, ".4s", []>; +} + +multiclass SIMDAcrossLanesS opcode, bit sz1, string asm, + Intrinsic intOp> { + def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, + asm, ".4s", + [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD INS/DUP instructions +//---------------------------------------------------------------------------- + +// FIXME: There has got to be a better way to factor these. ugh. + +class BaseSIMDInsDup pattern> + : I, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; let Inst{29} = op; + let Inst{28-21} = 0b01110000; + let Inst{15} = 0; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SIMDDupFromMain imm5, string size, ValueType vectype, + RegisterOperand vecreg, RegisterClass regtype> + : BaseSIMDInsDup { + let Inst{20-16} = imm5; + let Inst{14-11} = 0b0001; +} + +class SIMDDupFromElement + : BaseSIMDInsDup { + let Inst{14-11} = 0b0000; +} + +class SIMDDup64FromElement + : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128, + VectorIndexD, i64, AArch64duplane64> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; +} + +class SIMDDup32FromElement + : SIMDDupFromElement { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; +} + +class SIMDDup16FromElement + : SIMDDupFromElement { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; +} + +class SIMDDup8FromElement + : SIMDDupFromElement { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; +} + +class BaseSIMDMov imm4, RegisterClass regtype, + Operand idxtype, string asm, list pattern> + : BaseSIMDInsDup { + let Inst{14-11} = imm4; +} + +class SIMDSMov + : BaseSIMDMov; +class SIMDUMov + : BaseSIMDMov; + +class SIMDMovAlias + : InstAlias; + +multiclass SMov { + def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } +} + +multiclass UMov { + def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } + def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + } + def : SIMDMovAlias<"mov", ".s", + !cast(NAME#"vi32"), + GPR32, VectorIndexS>; + def : SIMDMovAlias<"mov", ".d", + !cast(NAME#"vi64"), + GPR64, VectorIndexD>; +} + +class SIMDInsFromMain + : BaseSIMDInsDup<1, 0, (outs V128:$dst), + (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins", + "{\t$Rd" # size # "$idx, $Rn" # + "|" # size # "\t$Rd$idx, $Rn}", + "$Rd = $dst", + [(set V128:$dst, + (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> { + let Inst{14-11} = 0b0011; +} 
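+
+// e.g. "ins v0.s[1], w0" writes w0 into element 1 of v0 and leaves the
+// remaining elements intact, which is why $Rd is tied to $dst above.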
+ +class SIMDInsFromElement + : BaseSIMDInsDup<1, 1, (outs V128:$dst), + (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins", + "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" # + "|" # size # "\t$Rd$idx, $Rn$idx2}", + "$Rd = $dst", + [(set V128:$dst, + (vector_insert + (vectype V128:$Rd), + (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)), + idxtype:$idx))]>; + +class SIMDInsMainMovAlias + : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # + "|" # size #"\t$dst$idx, $src}", + (inst V128:$dst, idxtype:$idx, regtype:$src)>; +class SIMDInsElementMovAlias + : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # + # "|" # size #" $dst$idx, $src$idx2}", + (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; + + +multiclass SIMDIns { + def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } + def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + } + + def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> { + bits<4> idx; + bits<4> idx2; + let Inst{20-17} = idx; + let Inst{16} = 1; + let Inst{14-11} = idx2; + } + def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> { + bits<3> idx; + bits<3> idx2; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + let Inst{14-12} = idx2; + let Inst{11} = 0; + } + def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> { + bits<2> idx; + bits<2> idx2; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + let Inst{14-13} = idx2; + let Inst{12-11} = 0; + } + def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> { + bits<1> idx; + bits<1> idx2; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + let Inst{14} = idx2; + let Inst{13-11} = 0; + } + + // For all forms of the INS instruction, the "mov" mnemonic is the + // preferred alias. Why they didn't just call the instruction "mov" in + // the first place is a very good question indeed... 
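+  // e.g. "mov v0.b[3], w1" is accepted (and printed) in place of
+  // "ins v0.b[3], w1".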
+  def : SIMDInsMainMovAlias<".b", !cast<Instruction>(NAME#"vi8gpr"),
+                            GPR32, VectorIndexB>;
+  def : SIMDInsMainMovAlias<".h", !cast<Instruction>(NAME#"vi16gpr"),
+                            GPR32, VectorIndexH>;
+  def : SIMDInsMainMovAlias<".s", !cast<Instruction>(NAME#"vi32gpr"),
+                            GPR32, VectorIndexS>;
+  def : SIMDInsMainMovAlias<".d", !cast<Instruction>(NAME#"vi64gpr"),
+                            GPR64, VectorIndexD>;
+
+  def : SIMDInsElementMovAlias<".b", !cast<Instruction>(NAME#"vi8lane"),
+                               VectorIndexB>;
+  def : SIMDInsElementMovAlias<".h", !cast<Instruction>(NAME#"vi16lane"),
+                               VectorIndexH>;
+  def : SIMDInsElementMovAlias<".s", !cast<Instruction>(NAME#"vi32lane"),
+                               VectorIndexS>;
+  def : SIMDInsElementMovAlias<".d", !cast<Instruction>(NAME#"vi64lane"),
+                               VectorIndexD>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD TBL/TBX
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDTableLookup<bit Q, bits<2> len, bit op, RegisterOperand vectype,
+                          RegisterOperand listtype, string asm, string kind>
+  : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm,
+       "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>,
+    Sched<[WriteV]> {
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+  let Inst{31} = 0;
+  let Inst{30} = Q;
+  let Inst{29-21} = 0b001110000;
+  let Inst{20-16} = Vm;
+  let Inst{15} = 0;
+  let Inst{14-13} = len;
+  let Inst{12} = op;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5} = Vn;
+  let Inst{4-0} = Vd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDTableLookupTied<bit Q, bits<2> len, bit op, RegisterOperand vectype,
+                              RegisterOperand listtype, string asm, string kind>
+  : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm,
+       "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>,
+    Sched<[WriteV]> {
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+  let Inst{31} = 0;
+  let Inst{30} = Q;
+  let Inst{29-21} = 0b001110000;
+  let Inst{20-16} = Vm;
+  let Inst{15} = 0;
+  let Inst{14-13} = len;
+  let Inst{12} = op;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5} = Vn;
+  let Inst{4-0} = Vd;
+}
+
+class SIMDTableLookupAlias<string asm, Instruction inst,
+                           RegisterOperand vectype, RegisterOperand listtype>
+    : InstAlias<!strconcat(asm, "\t$dst, $lst, $index"),
+                (inst vectype:$dst, listtype:$lst, vectype:$index), 0>;
+
+multiclass SIMDTableLookup<bit op, string asm> {
+  def v8i8One   : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b,
+                                      asm, ".8b">;
+  def v8i8Two   : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b,
+                                      asm, ".8b">;
+  def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b,
+                                      asm, ".8b">;
+  def v8i8Four  : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b,
+                                      asm, ".8b">;
+  def v16i8One  : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b,
+                                      asm, ".16b">;
+  def v16i8Two  : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b,
+                                      asm, ".16b">;
+  def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b,
+                                      asm, ".16b">;
+  def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b,
+                                      asm, ".16b">;
+
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8One"),
+                         V64, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Two"),
+                         V64, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Three"),
+                         V64, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Four"),
+                         V64, VecListFour128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8One"),
+                         V128, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Two"),
+                         V128, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Three"),
+                         V128, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Four"),
+                         V128, VecListFour128>;
+}
+
+multiclass SIMDTableLookupTied<bit op, string asm> {
+  def v8i8One   : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b,
+                                      asm, ".8b">;
+  def v8i8Two   : BaseSIMDTableLookupTied<0, 0b01, op, V64, VecListTwo16b,
+                                      asm, ".8b">;
+  def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b,
+                                      asm, ".8b">;
+  def v8i8Four  : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b,
+                                      asm, ".8b">;
+  def v16i8One  : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b,
+                                      asm, ".16b">;
+  def v16i8Two  : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b,
+                                      asm, ".16b">;
+  def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b,
+                                      asm, ".16b">;
+  def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b,
+                                      asm, ".16b">;
+
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8One"),
+                         V64, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Two"),
+                         V64, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Three"),
+                         V64, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Four"),
+                         V64, VecListFour128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8One"),
+                         V128, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Two"),
+                         V128, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Three"),
+                         V128, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Four"),
+                         V128, VecListFour128>;
+}
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar CPY
+//----------------------------------------------------------------------------
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
+                        string kind, Operand idxtype>
+  : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov",
+       "{\t$dst, $src" # kind # "$idx" #
+       "|\t$dst, $src$idx}", "", []>,
+    Sched<[WriteV]> {
+  bits<5> dst;
+  bits<5> src;
+  let Inst{31-21} = 0b01011110000;
+  let Inst{15-10} = 0b000001;
+  let Inst{9-5} = src;
+  let Inst{4-0} = dst;
+}
+
+class SIMDScalarCPYAlias<string asm, string size, Instruction inst,
+      RegisterClass regtype, RegisterOperand vectype, Operand idxtype>
+    : InstAlias<asm # "{\t$dst, $src" # size # "$index" #
+                      "|\t$dst, $src$index}",
+                (inst regtype:$dst, vectype:$src, idxtype:$index), 0>;
+
+
+multiclass SIMDScalarCPY<string asm> {
+  def i8  : BaseSIMDScalarCPY<FPR8,  V128, ".b", VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def i16 : BaseSIMDScalarCPY<FPR16, V128, ".h", VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def i32 : BaseSIMDScalarCPY<FPR32, V128, ".s", VectorIndexS> {
+    bits<2> idx;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+  }
+  def i64 : BaseSIMDScalarCPY<FPR64, V128, ".d", VectorIndexD> {
+    bits<1> idx;
+    let Inst{20} = idx;
+    let Inst{19-16} = 0b1000;
+  }
+
+  def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src),
+                                                          VectorIndexD:$idx)))),
+            (!cast<Instruction>(NAME # i64) V128:$src, VectorIndexD:$idx)>;
+
+  // 'DUP' mnemonic aliases.
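[Reviewer note, not part of the patch: the aliases below are parse-only, since SIMDScalarCPYAlias as reconstructed above passes 0 as the InstAlias emit flag, so disassembly keeps the "mov" spelling:

    // dup b0, v1.b[3]   ; accepted by the assembler via the alias
    // mov b0, v1.b[3]   ; same encoding, and the form the printer emits
]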
+ def : SIMDScalarCPYAlias<"dup", ".b", + !cast(NAME#"i8"), + FPR8, V128, VectorIndexB>; + def : SIMDScalarCPYAlias<"dup", ".h", + !cast(NAME#"i16"), + FPR16, V128, VectorIndexH>; + def : SIMDScalarCPYAlias<"dup", ".s", + !cast(NAME#"i32"), + FPR32, V128, VectorIndexS>; + def : SIMDScalarCPYAlias<"dup", ".d", + !cast(NAME#"i64"), + FPR64, V128, VectorIndexD>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD modified immediate instructions +//---------------------------------------------------------------------------- + +class BaseSIMDModifiedImm pattern> + : I, + Sched<[WriteV]> { + bits<5> Rd; + bits<8> imm8; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = op; let Inst{28-19} = 0b0111100000; + let Inst{18-16} = imm8{7-5}; + let Inst{11-10} = 0b01; + let Inst{9-5} = imm8{4-0}; + let Inst{4-0} = Rd; +} + +class BaseSIMDModifiedImmVector pattern> + : BaseSIMDModifiedImm { + let DecoderMethod = "DecodeModImmInstruction"; +} + +class BaseSIMDModifiedImmVectorTied pattern> + : BaseSIMDModifiedImm { + let DecoderMethod = "DecodeModImmTiedInstruction"; +} + +class BaseSIMDModifiedImmVectorShift b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVector { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14-13} = shift; + let Inst{12} = b15_b12{0}; +} + +class BaseSIMDModifiedImmVectorShiftTied b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVectorTied { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14-13} = shift; + let Inst{12} = b15_b12{0}; +} + + +class BaseSIMDModifiedImmVectorShiftHalf b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVector { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14} = 0; + let Inst{13} = shift{0}; + let Inst{12} = b15_b12{0}; +} + +class BaseSIMDModifiedImmVectorShiftHalfTied b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVectorTied { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14} = 0; + let Inst{13} = shift{0}; + let Inst{12} = b15_b12{0}; +} + +multiclass SIMDModifiedImmVectorShift hw_cmode, bits<2> w_cmode, + string asm> { + def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64, + asm, ".4h", []>; + def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128, + asm, ".8h", []>; + + def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64, + asm, ".2s", []>; + def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128, + asm, ".4s", []>; +} + +multiclass SIMDModifiedImmVectorShiftTied hw_cmode, + bits<2> w_cmode, string asm, + SDNode OpNode> { + def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64, + asm, ".4h", + [(set (v4i16 V64:$dst), (OpNode V64:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128, + asm, ".8h", + [(set (v8i16 V128:$dst), (OpNode V128:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + + def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64, + asm, ".2s", + [(set (v2i32 V64:$dst), (OpNode V64:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128, + asm, ".4s", + [(set (v4i32 V128:$dst), (OpNode V128:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; +} + +class SIMDModifiedImmMoveMSL cmode, + RegisterOperand vectype, string asm, + string kind, list pattern> 
+ : BaseSIMDModifiedImmVector { + bits<1> shift; + let Inst{15-13} = cmode{3-1}; + let Inst{12} = shift; +} + +class SIMDModifiedImmVectorNoShift cmode, + RegisterOperand vectype, + Operand imm_type, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVector { let Inst{15-12} = cmode; - let Inst{11} = 0b0; // o2 - let Inst{10} = 1; - // Inherit Rd in 4-0 - let Inst{18-16} = Imm{7-5}; // imm a:b:c - let Inst{9-5} = Imm{4-0}; // imm d:e:f:g:h } -// Format AdvSIMD 3 scalar registers with same type +class SIMDModifiedImmScalarNoShift cmode, string asm, + list pattern> + : BaseSIMDModifiedImm { + let Inst{15-12} = cmode; + let DecoderMethod = "DecodeModImmInstruction"; +} -class NeonI_Scalar3Same size, bits<5> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31} = 0b0; - let Inst{30} = 0b1; - let Inst{29} = u; - let Inst{28-24} = 0b11110; +//---------------------------------------------------------------------------- +// AdvSIMD indexed element +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDIndexed size, bits<4> opc, + RegisterOperand dst_reg, RegisterOperand lhs_reg, + RegisterOperand rhs_reg, Operand vec_idx, string asm, + string apple_kind, string dst_kind, string lhs_kind, + string rhs_kind, list pattern> + : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), + asm, + "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # + "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28} = Scalar; + let Inst{27-24} = 0b1111; let Inst{23-22} = size; - let Inst{21} = 0b1; - // Inherit Rm in 20-16 - let Inst{15-11} = opcode; - let Inst{10} = 0b1; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 + // Bit 21 must be set by the derived class. + let Inst{20-16} = Rm; + let Inst{15-12} = opc; + // Bit 11 must be set by the derived class. + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; } +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDIndexedTied size, bits<4> opc, + RegisterOperand dst_reg, RegisterOperand lhs_reg, + RegisterOperand rhs_reg, Operand vec_idx, string asm, + string apple_kind, string dst_kind, string lhs_kind, + string rhs_kind, list pattern> + : I<(outs dst_reg:$dst), + (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm, + "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # + "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; -// Format AdvSIMD 2 vector registers miscellaneous -class NeonI_2VMisc size, bits<5> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = u; - let Inst{28-24} = 0b01110; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28} = Scalar; + let Inst{27-24} = 0b1111; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; + // Bit 21 must be set by the derived class. + let Inst{20-16} = Rm; + let Inst{15-12} = opc; + // Bit 11 must be set by the derived class. 
+ let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDFPIndexedSD opc, string asm, + SDPatternOperator OpNode> { + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2f32 V64:$Rd), + (OpNode (v2f32 V64:$Rn), + (v2f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4f32 V128:$Rd), + (OpNode (v4f32 V128:$Rn), + (v4f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc, + V128, V128, + V128, VectorIndexD, + asm, ".2d", ".2d", ".2d", ".d", + [(set (v2f64 V128:$Rd), + (OpNode (v2f64 V128:$Rn), + (v2f64 (AArch64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (f32 FPR32Op:$Rd), + (OpNode (f32 FPR32Op:$Rn), + (f32 (vector_extract (v4f32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc, + FPR64Op, FPR64Op, V128, VectorIndexD, + asm, ".d", "", "", ".d", + [(set (f64 FPR64Op:$Rd), + (OpNode (f64 FPR64Op:$Rn), + (f64 (vector_extract (v2f64 V128:$Rm), + VectorIndexD:$idx))))]> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } +} + +multiclass SIMDFPIndexedSDTiedPatterns { + // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64duplane32 (v4f32 V128:$Rm), + VectorIndexS:$idx))), + (!cast(INST # v2i32_indexed) + V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64dup (f32 FPR32Op:$Rm)))), + (!cast(INST # "v2i32_indexed") V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + + // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar. + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64duplane32 (v4f32 V128:$Rm), + VectorIndexS:$idx))), + (!cast(INST # "v4i32_indexed") + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64dup (f32 FPR32Op:$Rm)))), + (!cast(INST # "v4i32_indexed") V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar. 
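[Reviewer note, not part of the patch: the second pattern of each pair below covers the case where the by-element operand is a plain FP scalar rather than a vector lane. SUBREG_TO_REG retags the scalar as lane 0 of a 128-bit register, so the indexed instruction can still be used with a constant index of 0. A sketch, using the record names from this multiclass:

    // (OpNode $Rd, $Rn, (AArch64dup (f32 FPR32Op:$Rm)))
    //   ==> (v2i32_indexed $Rd, $Rn,
    //        (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))
]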
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64duplane64 (v2f64 V128:$Rm),
+                                             VectorIndexD:$idx))),
+            (!cast<Instruction>(INST # "v2i64_indexed")
+              V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexD:$idx)>;
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64dup (f64 FPR64Op:$Rm)))),
+            (!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
+
+  // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
+                V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
+                (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
+
+  // 1 variant for 64-bit scalar version: extract from .1d or from .2d
+  def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
+                         (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
+            (!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn,
+                V128:$Rm, VectorIndexD:$idx)>;
+}
+
+multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> {
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64,
+                                          V128, VectorIndexS,
+                                          asm, ".2s", ".2s", ".2s", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                          V128, V128,
+                                          V128, VectorIndexS,
+                                          asm, ".4s", ".4s", ".4s", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc,
+                                          V128, V128,
+                                          V128, VectorIndexD,
+                                          asm, ".2d", ".2d", ".2d", ".d", []> {
+    bits<1> idx;
+    let Inst{11} = idx{0};
+    let Inst{21} = 0;
+  }
+
+
+  def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+                                      FPR32Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc,
+                                      FPR64Op, FPR64Op, V128, VectorIndexD,
+                                      asm, ".d", "", "", ".d", []> {
+    bits<1> idx;
+    let Inst{11} = idx{0};
+    let Inst{21} = 0;
+  }
+}
+
+multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
+                         SDPatternOperator OpNode> {
+  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4i16 V64:$Rd),
+        (OpNode (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".8h", ".8h", ".8h", ".h",
+    [(set (v8i16 V128:$Rd),
+       (OpNode (v8i16 V128:$Rn),
+         (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V64, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2s", ".2s", ".2s", ".s",
+    [(set (v2i32 V64:$Rd),
+       (OpNode (v2i32 V64:$Rn),
+          (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4i32 V128:$Rd),
+
(OpNode (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (i32 FPR32Op:$Rd), + (OpNode FPR32Op:$Rn, + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedHS opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$Rd), + (OpNode (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$Rd), + (OpNode (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedHSTied opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 
(v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDIndexedLongSD opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, + FPR32Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR64Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDIndexedLongSQDMLXSDTied opc, string asm, + SDPatternOperator Accum> { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqdmull + (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an + // intermediate EXTRACT_SUBREG would be untyped. 
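[Reviewer note, not part of the patch: the pattern below is the workaround for that FIXME. It widens $Rd to a vector with SUBREG_TO_REG, runs the full .4s form of the operation, and extracts lane 0 again, so every intermediate value keeps a vector type:

    // i32 (Accum $Rd, (vector_extract (sqdmull ...), 0))
    //   ==> (EXTRACT_SUBREG
    //          (v4i16_indexed (SUBREG_TO_REG (i32 0), $Rd, ssub),
    //                         $Rn, $Rm, $idx),
    //          ssub)
]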
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), + (i32 (vector_extract (v4i32 + (int_aarch64_neon_sqdmull (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))), + (i64 0))))), + (EXTRACT_SUBREG + (!cast(NAME # v4i16_indexed) + (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn, + V128_lo:$Rm, VectorIndexH:$idx), + ssub)>; + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqdmull + (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 + (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$dst), + (Accum (v2i64 V128:$Rd), + (v2i64 (int_aarch64_neon_sqdmull + (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$dst), + (Accum (v2i64 V128:$Rd), + (v2i64 (int_aarch64_neon_sqdmull + (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 + (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, + FPR32Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + + def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, + FPR64Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (i64 FPR64Op:$dst), + (Accum (i64 FPR64Op:$Rd), + (i64 (int_aarch64_neon_sqdmulls_scalar + (i32 FPR32Op:$Rn), + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedLongSD opc, string asm, + SDPatternOperator OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", 
".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + } +} + +multiclass SIMDVectorIndexedLongSDTied opc, string asm, + SDPatternOperator OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), + (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), + (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + } +} + +//---------------------------------------------------------------------------- +// AdvSIMD scalar shift by immediate +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDScalarShift opc, bits<7> fixed_imm, + RegisterClass regtype1, RegisterClass regtype2, + Operand immtype, string asm, list pattern> + : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<7> imm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-23} = 0b111110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDScalarShiftTied opc, bits<7> fixed_imm, + RegisterClass regtype1, RegisterClass regtype2, + Operand immtype, string asm, list pattern> + : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<7> imm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-23} = 0b111110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +multiclass SIMDScalarRShiftSD opc, string asm> { + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } +} + +multiclass SIMDScalarRShiftD opc, string asm, + SDPatternOperator OpNode> { + 
def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), + (!cast(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>; +} + +multiclass SIMDScalarRShiftDTied opc, string asm, + SDPatternOperator OpNode = null_frag> { + def d : BaseSIMDScalarShiftTied { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftR64:$imm))), + (!cast(NAME # "d") FPR64:$Rd, FPR64:$Rn, + vecshiftR64:$imm)>; +} + +multiclass SIMDScalarLShiftD opc, string asm, + SDPatternOperator OpNode> { + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +multiclass SIMDScalarLShiftDTied opc, string asm> { + def d : BaseSIMDScalarShiftTied { + let Inst{21-16} = imm{5-0}; + } +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +multiclass SIMDScalarRShiftBHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def b : BaseSIMDScalarShift { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } +} + +multiclass SIMDScalarLShiftBHSD opc, string asm, + SDPatternOperator OpNode> { + def b : BaseSIMDScalarShift { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), + (!cast(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>; +} - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +multiclass SIMDScalarRShiftBHSD opc, string asm> { + def b : BaseSIMDScalarShift { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } } -// Format AdvSIMD 2 vector 1 immediate shift -class NeonI_2VShiftImm opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bits<7> Imm; - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = u; +//---------------------------------------------------------------------------- +// AdvSIMD vector x indexed element +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDVectorShift opc, bits<7> fixed_imm, + RegisterOperand dst_reg, RegisterOperand src_reg, + Operand immtype, + string asm, string dst_kind, string src_kind, + list pattern> + : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm), + asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # + "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; let Inst{28-23} = 0b011110; - let Inst{22-16} = Imm; - let Inst{15-11} = opcode; - let Inst{10} = 0b1; - - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD duplicate and insert -class NeonI_copy imm4, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bits<5> Imm5; - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = op; - let Inst{28-21} = 0b01110000; - let Inst{20-16} = Imm5; - let Inst{15} = 0b0; - let Inst{14-11} = imm4; - let Inst{10} = 0b1; - - // Inherit 
Rn in 9-5 - // Inherit Rd in 4-0 -} -// Format AdvSIMD insert from element to vector -class NeonI_insert patterns, InstrItinClass itin> - : A64InstRdn { - bits<5> Imm5; - bits<4> Imm4; - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = op; - let Inst{28-21} = 0b01110000; - let Inst{20-16} = Imm5; - let Inst{15} = 0b0; - let Inst{14-11} = Imm4; - let Inst{10} = 0b1; - - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD scalar pairwise -class NeonI_ScalarPair size, bits<5> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31} = 0b0; - let Inst{30} = 0b1; - let Inst{29} = u; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21-17} = 0b11000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDVectorShiftTied opc, bits<7> fixed_imm, + RegisterOperand vectype1, RegisterOperand vectype2, + Operand immtype, + string asm, string dst_kind, string src_kind, + list pattern> + : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm), + asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # + "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-23} = 0b011110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; } -// Format AdvSIMD 2 vector across lanes -class NeonI_2VAcross size, bits<5> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn -{ - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29} = u; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b11000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; +multiclass SIMDVectorRShiftSD opc, string asm, + Intrinsic OpNode> { + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +multiclass SIMDVectorRShiftSDToFP opc, string asm, + Intrinsic OpNode> { + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", + [(set (v2f64 
V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
-// Format AdvSIMD scalar two registers miscellaneous
-class NeonI_Scalar2SameMisc<bit u, bits<2> size, bits<5> opcode,
-                            dag outs, dag ins,
-                            string asmstr, list<dag> patterns,
-                            InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = 0b0;
-  let Inst{30} = 0b1;
-  let Inst{29} = u;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
-}
-
-// Format AdvSIMD vector load/store multiple N-element structure
-class NeonI_LdStMult<bit q, bit l, bits<4> opcode, bits<2> size,
-                     dag outs, dag ins, string asmstr,
-                     list<dag> patterns, InstrItinClass itin>
-  : A64InstRtn<outs, ins, asmstr, patterns, itin>
-{
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
+multiclass SIMDVectorRShiftNarrowBHS<bit U, bits<5> opc, string asm,
+                                     SDPatternOperator OpNode> {
+  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+                                       V64, V128, vecshiftR16Narrow,
+                                       asm, ".8b", ".8h",
+      [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+                                            V128, V128, vecshiftR16Narrow,
+                                            asm#"2", ".16b", ".8h", []> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+    let hasSideEffects = 0;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                        V64, V128, vecshiftR32Narrow,
+                                        asm, ".4h", ".4s",
+      [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+                                            V128, V128, vecshiftR32Narrow,
+                                            asm#"2", ".8h", ".4s", []> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+    let hasSideEffects = 0;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                        V64, V128, vecshiftR64Narrow,
+                                        asm, ".2s", ".2d",
+      [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+                                            V128, V128, vecshiftR64Narrow,
+                                            asm#"2", ".4s", ".2d", []> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+    let hasSideEffects = 0;
+  }
+
+  // TableGen doesn't like patterns w/ INSERT_SUBREG on the instructions
+  // themselves, so put them here instead.
+
+  // Patterns involving what's effectively an insert high and a normal
+  // intrinsic, represented by CONCAT_VECTORS.
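[Reviewer note, not part of the patch: in SelectionDAG terms, a "2" (write-high) narrowing shift shows up as a concat_vectors of the untouched low half and the freshly narrowed value, which the patterns below map onto the tied v16i8/v8i16/v4i32 records:

    // concat_vectors(v8i8 $Rd, (OpNode (v8i16 $Rn), #imm))
    //   ==> (v16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), $Rd, dsub),
    //                    $Rn, #imm)
]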
+ def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn), + vecshiftR16Narrow:$imm)), + (!cast(NAME # "v16i8_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR16Narrow:$imm)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), + vecshiftR32Narrow:$imm)), + (!cast(NAME # "v8i16_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR32Narrow:$imm)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), + vecshiftR64Narrow:$imm)), + (!cast(NAME # "v4i32_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR64Narrow:$imm)>; +} + +multiclass SIMDVectorLShiftBHSD opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftL8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftL16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftL32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftL64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), + (i32 vecshiftL64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorRShiftBHSD opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftR8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftR8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + 
V64, V64, vecshiftR32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), + (i32 vecshiftR64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDVectorRShiftBHSDTied opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftR8, asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftR8, asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), + (i32 vecshiftR64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorLShiftBHSDTied opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftL8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftL16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = 
imm; + } + + def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftL32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftL64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), + (i32 vecshiftL64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorLShiftLongBHSD opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V128, V64, vecshiftL8, asm, ".8h", ".8b", + [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm#"2", ".8h", ".16b", + [(set (v8i16 V128:$Rd), + (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V128, V64, vecshiftL16, asm, ".4s", ".4h", + [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm#"2", ".4s", ".8h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> { + + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V128, V64, vecshiftL32, asm, ".2d", ".2s", + [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm#"2", ".2d", ".4s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> { + bits<5> imm; + let Inst{20-16} = imm; + } +} + + +//--- +// Vector load/store +//--- +// SIMD ldX/stX no-index memory references don't allow the optional +// ", #0" constant and handle post-indexing explicitly, so we use +// a more specialized parse method for them. Otherwise, it's the same as +// the general GPR64sp handling. 
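[Reviewer note, not part of the patch: concretely, the specialized parse method accepts these spellings, and only the post-indexed forms produce a write-back of the base register:

    // ld1 { v0.16b }, [x0]          ; no trailing ", #0" accepted
    // ld1 { v0.16b }, [x0], #16     ; immediate post-index
    // ld1 { v0.16b }, [x0], x2      ; register post-index
]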
+ +class BaseSIMDLdSt opcode, bits<2> size, + string asm, dag oops, dag iops, list pattern> + : I { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; let Inst{29-23} = 0b0011000; - let Inst{22} = l; + let Inst{22} = L; let Inst{21-16} = 0b000000; let Inst{15-12} = opcode; let Inst{11-10} = size; - - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 -} - -// Format AdvSIMD vector load/store multiple N-element structure (post-index) -class NeonI_LdStMult_Post opcode, bits<2> size, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtnm -{ - let Inst{31} = 0b0; - let Inst{30} = q; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +class BaseSIMDLdStPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : I { + bits<5> Vt; + bits<5> Rn; + bits<5> Xm; + let Inst{31} = 0; + let Inst{30} = Q; let Inst{29-23} = 0b0011001; - let Inst{22} = l; - let Inst{21} = 0b0; - // Inherit Rm in 20-16 + let Inst{22} = L; + let Inst{21} = 0; + let Inst{20-16} = Xm; let Inst{15-12} = opcode; let Inst{11-10} = size; - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 -} - -// Format AdvSIMD vector load Single N-element structure to all lanes -class NeonI_LdOne_Dup opcode, bits<2> size, dag outs, - dag ins, string asmstr, list patterns, - InstrItinClass itin> - : A64InstRtn -{ - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29-23} = 0b0011010; - let Inst{22} = 0b1; - let Inst{21} = r; - let Inst{20-16} = 0b00000; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +// The immediate form of AdvSIMD post-indexed addressing is encoded with +// register post-index addressing from the zero register. +multiclass SIMDLdStAliases { + // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16" + // "ld1\t$Vt, [$Rn], #16" + // may get mapped to + // (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # layout):$Vt, + XZR), 1>; + + // E.g. "ld1.8b { v0, v1 }, [x1], #16" + // "ld1.8b\t$Vt, [$Rn], #16" + // may get mapped to + // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + XZR), 0>; + + // E.g. "ld1.8b { v0, v1 }, [x1]" + // "ld1\t$Vt, [$Rn]" + // may get mapped to + // (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn) + def : InstAlias(NAME # Count # "v" # layout) + !cast("VecList" # Count # Size):$Vt, + GPR64sp:$Rn), 0>; + + // E.g. 
"ld1.8b { v0, v1 }, [x1], x2" + // "ld1\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm) + def : InstAlias(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + !cast("GPR64pi" # Offset):$Xm), 0>; +} + +multiclass BaseSIMDLdN opcode> { + let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm, + (outs !cast(veclist # "16b"):$Vt), + (ins GPR64sp:$Rn), []>; + def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm, + (outs !cast(veclist # "8h"):$Vt), + (ins GPR64sp:$Rn), []>; + def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm, + (outs !cast(veclist # "4s"):$Vt), + (ins GPR64sp:$Rn), []>; + def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm, + (outs !cast(veclist # "2d"):$Vt), + (ins GPR64sp:$Rn), []>; + def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm, + (outs !cast(veclist # "8b"):$Vt), + (ins GPR64sp:$Rn), []>; + def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm, + (outs !cast(veclist # "4h"):$Vt), + (ins GPR64sp:$Rn), []>; + def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm, + (outs !cast(veclist # "2s"):$Vt), + (ins GPR64sp:$Rn), []>; + + + def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm, + (outs GPR64sp:$wback, + !cast(veclist # "16b"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm, + (outs GPR64sp:$wback, + !cast(veclist # "8h"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm, + (outs GPR64sp:$wback, + !cast(veclist # "4s"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm, + (outs GPR64sp:$wback, + !cast(veclist # "2d"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm, + (outs GPR64sp:$wback, + !cast(veclist # "8b"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm, + (outs GPR64sp:$wback, + !cast(veclist # "4h"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm, + (outs GPR64sp:$wback, + !cast(veclist # "2s"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; +} + +// Only ld1/st1 has a v1d version. 
+multiclass BaseSIMDStN opcode> { + let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in { + def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs), + (ins !cast(veclist # "16b"):$Vt, + GPR64sp:$Rn), []>; + def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs), + (ins !cast(veclist # "8h"):$Vt, + GPR64sp:$Rn), []>; + def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs), + (ins !cast(veclist # "4s"):$Vt, + GPR64sp:$Rn), []>; + def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs), + (ins !cast(veclist # "2d"):$Vt, + GPR64sp:$Rn), []>; + def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs), + (ins !cast(veclist # "8b"):$Vt, + GPR64sp:$Rn), []>; + def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs), + (ins !cast(veclist # "4h"):$Vt, + GPR64sp:$Rn), []>; + def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs), + (ins !cast(veclist # "2s"):$Vt, + GPR64sp:$Rn), []>; + + def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "16b"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "8h"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "4s"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "2d"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "8b"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "4h"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "2s"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; +} + +multiclass BaseSIMDLd1 opcode> + : BaseSIMDLdN { + + // LD1 instructions have extra "1d" variants. + let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm, + (outs !cast(veclist # "1d"):$Vt), + (ins GPR64sp:$Rn), []>; + + def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm, + (outs GPR64sp:$wback, + !cast(veclist # "1d"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; +} + +multiclass BaseSIMDSt1 opcode> + : BaseSIMDStN { + + // ST1 instructions have extra "1d" variants. 
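[Reviewer note, not part of the patch: the extra variants give, for example:

    // st1 { v0.1d }, [x0]            ; valid, ST1 has a .1d layout
    // st2 { v0.1d, v1.1d }, [x0]     ; invalid, only ld1/st1 define v1d
]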
+ let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { + def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs), + (ins !cast(veclist # "1d"):$Vt, + GPR64sp:$Rn), []>; + + def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "1d"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; +} + +multiclass SIMDLd1Multiple { + defm One : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8, 0b0111>; + defm Two : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; + defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>; + defm Four : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>; +} + +multiclass SIMDSt1Multiple { + defm One : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8, 0b0111>; + defm Two : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; + defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>; + defm Four : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>; +} + +multiclass SIMDLd2Multiple { + defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; +} + +multiclass SIMDSt2Multiple { + defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; +} + +multiclass SIMDLd3Multiple { + defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>; +} + +multiclass SIMDSt3Multiple { + defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>; +} + +multiclass SIMDLd4Multiple { + defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>; +} + +multiclass SIMDSt4Multiple { + defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>; +} + +//--- +// AdvSIMD Load/store single-element +//--- + +class BaseSIMDLdStSingle opcode, + string asm, string operands, string cst, + dag oops, dag iops, list pattern> + : I { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{29-24} = 0b001101; + let Inst{22} = L; + let Inst{21} = R; + let Inst{15-13} = opcode; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +class BaseSIMDLdStSingleTied opcode, + string asm, string operands, string cst, + dag oops, dag iops, list pattern> + : I { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{29-24} = 0b001101; + let Inst{22} = L; + let Inst{21} = R; let Inst{15-13} = opcode; - let Inst{12} = 0b0; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + + +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDLdR opcode, bit S, bits<2> size, string asm, + Operand listtype> + : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "", + (outs listtype:$Vt), (ins GPR64sp:$Rn), + []> { + let Inst{30} = Q; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = S; let Inst{11-10} = size; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDLdRPost opcode, bit S, bits<2> size, + string asm, Operand listtype, Operand GPR64pi> + : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm", + "$Rn = $wback", + (outs GPR64sp:$wback, listtype:$Vt), + (ins GPR64sp:$Rn, GPR64pi:$Xm), []> { + bits<5> Xm; + let Inst{30} = Q; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = S; + let Inst{11-10} = size; +} + +multiclass SIMDLdrAliases { + // E.g. "ld1r { v0.8b }, [x1], #1" + // "ld1r.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # layout):$Vt, + XZR), 1>; + + // E.g. 
"ld1r.8b { v0 }, [x1], #1" + // "ld1r.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + XZR), 0>; + + // E.g. "ld1r.8b { v0 }, [x1]" + // "ld1r.8b\t$Vt, [$Rn]" + // may get mapped to + // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) + def : InstAlias(NAME # "v" # layout) + !cast("VecList" # Count # Size):$Vt, + GPR64sp:$Rn), 0>; + + // E.g. "ld1r.8b { v0 }, [x1], x2" + // "ld1r.8b\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) + def : InstAlias(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + !cast("GPR64pi" # Offset):$Xm), 0>; +} + +multiclass SIMDLdR opcode, bit S, string asm, string Count, + int Offset1, int Offset2, int Offset4, int Offset8> { + def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm, + !cast("VecList" # Count # "8b")>; + def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm, + !cast("VecList" # Count #"16b")>; + def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm, + !cast("VecList" # Count #"4h")>; + def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm, + !cast("VecList" # Count #"8h")>; + def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm, + !cast("VecList" # Count #"2s")>; + def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm, + !cast("VecList" # Count #"4s")>; + def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm, + !cast("VecList" # Count #"1d")>; + def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm, + !cast("VecList" # Count #"2d")>; + + def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm, + !cast("VecList" # Count # "8b"), + !cast("GPR64pi" # Offset1)>; + def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm, + !cast("VecList" # Count # "16b"), + !cast("GPR64pi" # Offset1)>; + def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm, + !cast("VecList" # Count # "4h"), + !cast("GPR64pi" # Offset2)>; + def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm, + !cast("VecList" # Count # "8h"), + !cast("GPR64pi" # Offset2)>; + def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm, + !cast("VecList" # Count # "2s"), + !cast("GPR64pi" # Offset4)>; + def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm, + !cast("VecList" # Count # "4s"), + !cast("GPR64pi" # Offset4)>; + def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm, + !cast("VecList" # Count # "1d"), + !cast("GPR64pi" # Offset8)>; + def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm, + !cast("VecList" # Count # "2d"), + !cast("GPR64pi" # Offset8)>; + + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; +} - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 -} - -// Format AdvSIMD vector load/store Single N-element structure to/from one lane -class NeonI_LdStOne_Lane op2_1, bit op0, dag outs, - dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtn -{ - bits<4> lane; - let Inst{31} = 0b0; - let Inst{29-23} = 0b0011010; - let Inst{22} = l; - let Inst{21} = r; +class SIMDLdStSingleB opcode, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size fields. 
+ bits<4> idx; + let Inst{30} = idx{3}; + let Inst{23} = 0; let Inst{20-16} = 0b00000; - let Inst{15-14} = op2_1; - let Inst{13} = op0; - - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 -} - -// Format AdvSIMD post-index vector load Single N-element structure to all lanes -class NeonI_LdOne_Dup_Post opcode, bits<2> size, dag outs, - dag ins, string asmstr, list patterns, - InstrItinClass itin> - : A64InstRtnm -{ - let Inst{31} = 0b0; - let Inst{30} = q; - let Inst{29-23} = 0b0011011; - let Inst{22} = 0b1; - let Inst{21} = r; - // Inherit Rm in 20-16 - let Inst{15-13} = opcode; - let Inst{12} = 0b0; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBTied opcode, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size fields. + bits<4> idx; + let Inst{30} = idx{3}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBPost opcode, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size fields. + bits<4> idx; + bits<5> Xm; + let Inst{30} = idx{3}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBTiedPost opcode, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size fields. + bits<4> idx; + bits<5> Xm; + let Inst{30} = idx{3}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} + +class SIMDLdStSingleH opcode, bit size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + let Inst{30} = idx{2}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleHTied opcode, bit size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + let Inst{30} = idx{2}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} + +class SIMDLdStSingleHPost opcode, bit size, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + bits<5> Xm; + let Inst{30} = idx{2}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleHTiedPost opcode, bit size, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + bits<5> Xm; + let Inst{30} = idx{2}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleS opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q:S fields. + bits<2> idx; + let Inst{30} = idx{1}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSTied opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S fields. 
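As the H, S and D variants here show, wider elements hard-wire more of the low Q:S:size bits for the element size itself, leaving fewer bits for the lane index. A small sketch of the resulting index widths (illustrative only, not LLVM API):

// Lane-index width in the single-element formats: B uses Q:S:size (4 bits),
// H uses Q:S:size{1} (3 bits), S uses Q:S (2 bits), D uses Q alone (1 bit).
static unsigned laneIndexBits(unsigned ElemBytes) {
  switch (ElemBytes) {
  case 1: return 4; // 16 byte lanes
  case 2: return 3; // 8 halfword lanes
  case 4: return 2; // 4 word lanes
  case 8: return 1; // 2 doubleword lanes
  default: return 0; // not a valid element size in these formats
  }
}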
+ bits<2> idx; + let Inst{30} = idx{1}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q:S fields. + bits<2> idx; + bits<5> Xm; + let Inst{30} = idx{1}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSTiedPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S fields. + bits<2> idx; + bits<5> Xm; + let Inst{30} = idx{1}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleD opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q field. + bits<1> idx; + let Inst{30} = idx; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = 0; let Inst{11-10} = size; +} +class SIMDLdStSingleDTied opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q field. + bits<1> idx; + let Inst{30} = idx; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = 0; + let Inst{11-10} = size; +} +class SIMDLdStSingleDPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q field. + bits<1> idx; + bits<5> Xm; + let Inst{30} = idx; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = 0; + let Inst{11-10} = size; +} +class SIMDLdStSingleDTiedPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q field. + bits<1> idx; + bits<5> Xm; + let Inst{30} = idx; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = 0; + let Inst{11-10} = size; +} - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 -} - -// Format AdvSIMD post-index vector load/store Single N-element structure -// to/from one lane -class NeonI_LdStOne_Lane_Post op2_1, bit op0, dag outs, - dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRtnm -{ - bits<4> lane; - let Inst{31} = 0b0; - let Inst{29-23} = 0b0011011; - let Inst{22} = l; - let Inst{21} = r; - // Inherit Rm in 20-16 - let Inst{15-14} = op2_1; - let Inst{13} = op0; - - // Inherit Rn in 9-5 - // Inherit Rt in 4-0 -} - -// Format AdvSIMD 3 scalar registers with different type - -class NeonI_Scalar3Diff size, bits<4> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31-30} = 0b01; - let Inst{29} = u; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21} = 0b1; - // Inherit Rm in 20-16 - let Inst{15-12} = opcode; - let Inst{11-10} = 0b00; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleBTied opcode, string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i8 : SIMDLdStSingleBTied<1, R, opcode, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn), []>; + + def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleHTied opcode, bit size, string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm, + (outs 
listtype:$dst), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn), []>; + + def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleSTied opcode, bits<2> size,string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn), []>; + + def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleDTied opcode, bits<2> size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn), []>; + + def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleB opcode, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i8 : SIMDLdStSingleB<0, R, opcode, asm, + (outs), (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn), []>; + + def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleH opcode, bit size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i16 : SIMDLdStSingleH<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn), []>; + + def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleS opcode, bits<2> size,string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i32 : SIMDLdStSingleS<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn), []>; + + def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleD opcode, bits<2> size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i64 : SIMDLdStSingleD<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn), []>; + + def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; } -// Format AdvSIMD scalar shift by immediate +multiclass SIMDLdStSingleAliases { + // E.g. "ld1 { v0.8b }[0], [x1], #1" + // "ld1\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # layout):$Vt, + idxtype:$idx, XZR), 1>; + + // E.g. 
"ld1.8b { v0 }[0], [x1], #1" + // "ld1.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # "128"):$Vt, + idxtype:$idx, XZR), 0>; + + // E.g. "ld1.8b { v0 }[0], [x1]" + // "ld1.8b\t$Vt, [$Rn]" + // may get mapped to + // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) + def : InstAlias(NAME # Type) + !cast("VecList" # Count # "128"):$Vt, + idxtype:$idx, GPR64sp:$Rn), 0>; + + // E.g. "ld1.8b { v0 }[0], [x1], x2" + // "ld1.8b\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) + def : InstAlias(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # "128"):$Vt, + idxtype:$idx, + !cast("GPR64pi" # Offset):$Xm), 0>; +} -class NeonI_ScalarShiftImm opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - bits<4> Imm4; - bits<3> Imm3; - let Inst{31-30} = 0b01; - let Inst{29} = u; - let Inst{28-23} = 0b111110; - let Inst{22-19} = Imm4; - let Inst{18-16} = Imm3; - let Inst{15-11} = opcode; - let Inst{10} = 0b1; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +multiclass SIMDLdSt1SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; } -// Format AdvSIMD crypto AES -class NeonI_Crypto_AES size, bits<5> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31-24} = 0b01001110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10100; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 +multiclass SIMDLdSt2SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; } -// Format AdvSIMD crypto SHA -class NeonI_Crypto_SHA size, bits<5> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdn { - let Inst{31-24} = 0b01011110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10100; - let Inst{16-12} = opcode; +multiclass SIMDLdSt3SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; +} + +multiclass SIMDLdSt4SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; +} +} // end of 'let Predicates = [HasNEON]' + +//---------------------------------------------------------------------------- +// Crypto extensions +//---------------------------------------------------------------------------- + +let Predicates = [HasCrypto] in { +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class AESBase opc, string asm, dag outs, dag ins, string cstr, + list pat> + : I, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + let Inst{31-16} = 0b0100111000101000; + let Inst{15-12} = opc; let Inst{11-10} = 0b10; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; } -// Format AdvSIMD crypto 3V SHA -class NeonI_Crypto_3VSHA size, bits<3> opcode, - dag outs, dag ins, string asmstr, - list patterns, InstrItinClass itin> - : A64InstRdnm { - let Inst{31-24} = 0b01011110; - let Inst{23-22} = size; - let Inst{21} = 0b0; - // Inherit Rm in 20-16 - let Inst{15} = 0b0; - let Inst{14-12} = opcode; +class AESInst opc, string asm, Intrinsic OpNode> + : AESBase; + +class AESTiedInst opc, string asm, 
Intrinsic OpNode> + : AESBase; + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class SHA3OpTiedInst opc, string asm, string dst_lhs_kind, + dag oops, dag iops, list pat> + : I, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-21} = 0b01011110000; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-12} = opc; let Inst{11-10} = 0b00; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} - -// Format AdvSIMD scalar x indexed element -class NeonI_ScalarXIndexedElem opcode, dag outs, dag ins, - string asmstr, list patterns, - InstrItinClass itin> - : A64InstRdnm -{ - let Inst{31} = 0b0; - let Inst{30} = 0b1; - let Inst{29} = u; - let Inst{28-24} = 0b11111; - let Inst{23} = szhi; - let Inst{22} = szlo; - // l in Inst{21} - // m in Instr{20} - // Inherit Rm in 19-16 - let Inst{15-12} = opcode; - // h in Inst{11} - let Inst{10} = 0b0; - // Inherit Rn in 9-5 - // Inherit Rd in 4-0 -} -// Format AdvSIMD scalar copy - insert from element to scalar -class NeonI_ScalarCopy patterns, InstrItinClass itin> - : NeonI_copy<0b1, 0b0, 0b0000, outs, ins, asmstr, patterns, itin> { - let Inst{28} = 0b1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; } + +class SHATiedInstQSV opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst; + +class SHATiedInstVVV opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst; + +class SHATiedInstQQV opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst; + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class SHA2OpInst opc, string asm, string kind, + string cstr, dag oops, dag iops, + list pat> + : I, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + let Inst{31-16} = 0b0101111000101000; + let Inst{15-12} = opc; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; } +class SHATiedInstVV opc, string asm, Intrinsic OpNode> + : SHA2OpInst; + +class SHAInstSS opc, string asm, Intrinsic OpNode> + : SHA2OpInst; +} // end of 'let Predicates = [HasCrypto]' + +// Allow the size specifier tokens to be upper case, not just lower. 
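The TokenAlias records that follow make the arrangement suffixes case-insensitive, so ".8B" assembles exactly like ".8b". Conceptually this is just case-folding the suffix before matching; a C++ sketch of the idea (the TableGen records, not this loop, are the real mechanism):

#include <cctype>
#include <string>

// Lower-case an arrangement token such as ".8B" -> ".8b" so either spelling
// matches the canonical lower-case form used by the instruction definitions.
static std::string canonicalizeArrangement(std::string Tok) {
  for (char &C : Tok)
    C = static_cast<char>(std::tolower(static_cast<unsigned char>(C)));
  return Tok;
}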
+def : TokenAlias<".8B", ".8b">; +def : TokenAlias<".4H", ".4h">; +def : TokenAlias<".2S", ".2s">; +def : TokenAlias<".1D", ".1d">; +def : TokenAlias<".16B", ".16b">; +def : TokenAlias<".8H", ".8h">; +def : TokenAlias<".4S", ".4s">; +def : TokenAlias<".2D", ".2d">; +def : TokenAlias<".1Q", ".1q">; +def : TokenAlias<".B", ".b">; +def : TokenAlias<".H", ".h">; +def : TokenAlias<".S", ".s">; +def : TokenAlias<".D", ".d">; +def : TokenAlias<".Q", ".q">; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index afb2034..ff115c0 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -11,257 +11,83 @@ // //===----------------------------------------------------------------------===// -#include "AArch64.h" #include "AArch64InstrInfo.h" -#include "AArch64MachineFunctionInfo.h" -#include "AArch64TargetMachine.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "Utils/AArch64BaseInfo.h" -#include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineDominators.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Function.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/MC/MCInst.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" -#include + +using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" -using namespace llvm; - AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) - : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), - Subtarget(STI) {} + : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), + RI(this, &STI), Subtarget(STI) {} -void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { - unsigned Opc = 0; - unsigned ZeroReg = 0; - if (DestReg == AArch64::XSP || SrcReg == AArch64::XSP) { - // E.g. ADD xDst, xsp, #0 (, lsl #0) - BuildMI(MBB, I, DL, get(AArch64::ADDxxi_lsl0_s), DestReg) - .addReg(SrcReg) - .addImm(0); - return; - } else if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { - // E.g. ADD wDST, wsp, #0 (, lsl #0) - BuildMI(MBB, I, DL, get(AArch64::ADDwwi_lsl0_s), DestReg) - .addReg(SrcReg) - .addImm(0); - return; - } else if (DestReg == AArch64::NZCV) { - assert(AArch64::GPR64RegClass.contains(SrcReg)); - // E.g. MSR NZCV, xDST - BuildMI(MBB, I, DL, get(AArch64::MSRix)) - .addImm(A64SysReg::NZCV) - .addReg(SrcReg); - } else if (SrcReg == AArch64::NZCV) { - assert(AArch64::GPR64RegClass.contains(DestReg)); - // E.g. 
MRS xDST, NZCV - BuildMI(MBB, I, DL, get(AArch64::MRSxi), DestReg) - .addImm(A64SysReg::NZCV); - } else if (AArch64::GPR64RegClass.contains(DestReg)) { - if(AArch64::GPR64RegClass.contains(SrcReg)){ - Opc = AArch64::ORRxxx_lsl; - ZeroReg = AArch64::XZR; - } else{ - assert(AArch64::FPR64RegClass.contains(SrcReg)); - BuildMI(MBB, I, DL, get(AArch64::FMOVxd), DestReg) - .addReg(SrcReg); - return; - } - } else if (AArch64::GPR32RegClass.contains(DestReg)) { - if(AArch64::GPR32RegClass.contains(SrcReg)){ - Opc = AArch64::ORRwww_lsl; - ZeroReg = AArch64::WZR; - } else{ - assert(AArch64::FPR32RegClass.contains(SrcReg)); - BuildMI(MBB, I, DL, get(AArch64::FMOVws), DestReg) - .addReg(SrcReg); - return; - } - } else if (AArch64::FPR32RegClass.contains(DestReg)) { - if(AArch64::FPR32RegClass.contains(SrcReg)){ - BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg) - .addReg(SrcReg); - return; - } - else { - assert(AArch64::GPR32RegClass.contains(SrcReg)); - BuildMI(MBB, I, DL, get(AArch64::FMOVsw), DestReg) - .addReg(SrcReg); - return; - } - } else if (AArch64::FPR64RegClass.contains(DestReg)) { - if(AArch64::FPR64RegClass.contains(SrcReg)){ - BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg) - .addReg(SrcReg); - return; - } - else { - assert(AArch64::GPR64RegClass.contains(SrcReg)); - BuildMI(MBB, I, DL, get(AArch64::FMOVdx), DestReg) - .addReg(SrcReg); - return; - } - } else if (AArch64::FPR128RegClass.contains(DestReg)) { - assert(AArch64::FPR128RegClass.contains(SrcReg)); +/// GetInstSize - Return the number of bytes of code the specified +/// instruction may be. This returns the maximum number of bytes. +unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { + const MCInstrDesc &Desc = MI->getDesc(); - // If NEON is enable, we use ORR to implement this copy. - // If NEON isn't available, emit STR and LDR to handle this. - if(getSubTarget().hasNEON()) { - BuildMI(MBB, I, DL, get(AArch64::ORRvvv_16B), DestReg) - .addReg(SrcReg) - .addReg(SrcReg); - return; - } else { - BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP) - .addReg(SrcReg) - .addReg(AArch64::XSP) - .addImm(0x1ff & -16); - - BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg) - .addReg(AArch64::XSP, RegState::Define) - .addReg(AArch64::XSP) - .addImm(16); - return; - } - } else if (AArch64::FPR8RegClass.contains(DestReg, SrcReg)) { - // The copy of two FPR8 registers is implemented by the copy of two FPR32 - const TargetRegisterInfo *TRI = &getRegisterInfo(); - unsigned Dst = TRI->getMatchingSuperReg(DestReg, AArch64::sub_8, - &AArch64::FPR32RegClass); - unsigned Src = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_8, - &AArch64::FPR32RegClass); - BuildMI(MBB, I, DL, get(AArch64::FMOVss), Dst) - .addReg(Src); - return; - } else if (AArch64::FPR16RegClass.contains(DestReg, SrcReg)) { - // The copy of two FPR16 registers is implemented by the copy of two FPR32 - const TargetRegisterInfo *TRI = &getRegisterInfo(); - unsigned Dst = TRI->getMatchingSuperReg(DestReg, AArch64::sub_16, - &AArch64::FPR32RegClass); - unsigned Src = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_16, - &AArch64::FPR32RegClass); - BuildMI(MBB, I, DL, get(AArch64::FMOVss), Dst) - .addReg(Src); - return; - } else { - CopyPhysRegTuple(MBB, I, DL, DestReg, SrcReg); - return; + switch (Desc.getOpcode()) { + default: + // Anything not explicitly designated otherwise is a normal 4-byte insn.
+ return 4; + case TargetOpcode::DBG_VALUE: + case TargetOpcode::EH_LABEL: + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + return 0; } - // E.g. ORR xDst, xzr, xSrc, lsl #0 - BuildMI(MBB, I, DL, get(Opc), DestReg) - .addReg(ZeroReg) - .addReg(SrcReg) - .addImm(0); -} - -void AArch64InstrInfo::CopyPhysRegTuple(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, - unsigned SrcReg) const { - unsigned SubRegs; - bool IsQRegs; - if (AArch64::DPairRegClass.contains(DestReg, SrcReg)) { - SubRegs = 2; - IsQRegs = false; - } else if (AArch64::DTripleRegClass.contains(DestReg, SrcReg)) { - SubRegs = 3; - IsQRegs = false; - } else if (AArch64::DQuadRegClass.contains(DestReg, SrcReg)) { - SubRegs = 4; - IsQRegs = false; - } else if (AArch64::QPairRegClass.contains(DestReg, SrcReg)) { - SubRegs = 2; - IsQRegs = true; - } else if (AArch64::QTripleRegClass.contains(DestReg, SrcReg)) { - SubRegs = 3; - IsQRegs = true; - } else if (AArch64::QQuadRegClass.contains(DestReg, SrcReg)) { - SubRegs = 4; - IsQRegs = true; - } else - llvm_unreachable("Unknown register class"); - - unsigned BeginIdx = IsQRegs ? AArch64::qsub_0 : AArch64::dsub_0; - int Spacing = 1; - const TargetRegisterInfo *TRI = &getRegisterInfo(); - // Copy register tuples backward when the first Dest reg overlaps - // with SrcReg. - if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) { - BeginIdx = BeginIdx + (SubRegs - 1); - Spacing = -1; - } - - unsigned Opc = IsQRegs ? AArch64::ORRvvv_16B : AArch64::ORRvvv_8B; - for (unsigned i = 0; i != SubRegs; ++i) { - unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing); - unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing); - assert(Dst && Src && "Bad sub-register"); - BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst) - .addReg(Src) - .addReg(Src); - } - return; -} - -/// Does the Opcode represent a conditional branch that we can remove and re-add -/// at the end of a basic block? -static bool isCondBranch(unsigned Opc) { - return Opc == AArch64::Bcc || Opc == AArch64::CBZw || Opc == AArch64::CBZx || - Opc == AArch64::CBNZw || Opc == AArch64::CBNZx || - Opc == AArch64::TBZwii || Opc == AArch64::TBZxii || - Opc == AArch64::TBNZwii || Opc == AArch64::TBNZxii; -} - -/// Takes apart a given conditional branch MachineInstr (see isCondBranch), -/// setting TBB to the destination basic block and populating the Cond vector -/// with data necessary to recreate the conditional branch at a later -/// date. First element will be the opcode, and subsequent ones define the -/// conditions being branched on in an instruction-specific manner. -static void classifyCondBranch(MachineInstr *I, MachineBasicBlock *&TBB, - SmallVectorImpl &Cond) { - switch(I->getOpcode()) { - case AArch64::Bcc: - case AArch64::CBZw: - case AArch64::CBZx: - case AArch64::CBNZw: - case AArch64::CBNZx: - // These instructions just have one predicate operand in position 0 (either - // a condition code or a register being compared). - Cond.push_back(MachineOperand::CreateImm(I->getOpcode())); - Cond.push_back(I->getOperand(0)); - TBB = I->getOperand(1).getMBB(); - return; - case AArch64::TBZwii: - case AArch64::TBZxii: - case AArch64::TBNZwii: - case AArch64::TBNZxii: - // These have two predicate operands: a register and a bit position. 
- Cond.push_back(MachineOperand::CreateImm(I->getOpcode())); - Cond.push_back(I->getOperand(0)); - Cond.push_back(I->getOperand(1)); - TBB = I->getOperand(2).getMBB(); - return; + llvm_unreachable("GetInstSizeInBytes() - Unable to determine insn size"); +} + +static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, + SmallVectorImpl<MachineOperand> &Cond) { + // Block ends with fall-through condbranch. + switch (LastInst->getOpcode()) { default: - llvm_unreachable("Unknown conditional branch to classify"); + llvm_unreachable("Unknown branch instruction?"); + case AArch64::Bcc: + Target = LastInst->getOperand(1).getMBB(); + Cond.push_back(LastInst->getOperand(0)); + break; + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + Target = LastInst->getOperand(1).getMBB(); + Cond.push_back(MachineOperand::CreateImm(-1)); + Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); + Cond.push_back(LastInst->getOperand(0)); + break; + case AArch64::TBZW: + case AArch64::TBZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + Target = LastInst->getOperand(2).getMBB(); + Cond.push_back(MachineOperand::CreateImm(-1)); + Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); + Cond.push_back(LastInst->getOperand(0)); + Cond.push_back(LastInst->getOperand(1)); } } - -bool -AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const { +// Branch analysis. +bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) @@ -281,15 +107,16 @@ AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, // If there is only one terminator instruction, process it. unsigned LastOpc = LastInst->getOpcode(); if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - if (LastOpc == AArch64::Bimm) { + if (isUncondBranchOpcode(LastOpc)) { TBB = LastInst->getOperand(0).getMBB(); return false; } - if (isCondBranch(LastOpc)) { - classifyCondBranch(LastInst, TBB, Cond); + if (isCondBranchOpcode(LastOpc)) { + // Block ends with fall-through condbranch. + parseCondBranch(LastInst, TBB, Cond); return false; } - return true; // Can't handle indirect branch. + return true; // Can't handle indirect branch. } // Get the instruction before it if it is a terminator. @@ -298,8 +125,8 @@ AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, // If AllowModify is true and the block ends with two or more unconditional // branches, delete all but the first unconditional branch. - if (AllowModify && LastOpc == AArch64::Bimm) { - while (SecondLastOpc == AArch64::Bimm) { + if (AllowModify && isUncondBranchOpcode(LastOpc)) { + while (isUncondBranchOpcode(SecondLastOpc)) { LastInst->eraseFromParent(); LastInst = SecondLastInst; LastOpc = LastInst->getOpcode(); @@ -319,23 +146,15 @@ AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, return true; // If the block ends with a B and a Bcc, handle it.
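parseCondBranch above fixes the convention this file uses for the Cond vector: a plain b.cc leaves its condition-code operand in Cond[0], while the folded compare-and-branch forms push a -1 marker, then the opcode, then the branch operands. A hypothetical C++ mirror of that layout (the real consumers are ReverseBranchCondition and instantiateCondBranch below):

#include <vector>

// Cond layouts produced by parseCondBranch:
//   b.cc     -> { cc }
//   cbz/cbnz -> { -1, opcode, reg }
//   tbz/tbnz -> { -1, opcode, reg, bit }
struct CondView {
  bool IsFolded;   // true for cbz/cbnz/tbz/tbnz
  long Opcode;     // meaningful only when IsFolded
  unsigned NumOps; // operands following the opcode
};

static CondView decodeCond(const std::vector<long> &Cond) {
  if (Cond.empty() || Cond[0] != -1)
    return {false, 0, 0}; // plain conditional branch, Cond[0] is the CC
  return {true, Cond[1], static_cast<unsigned>(Cond.size()) - 2};
}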
- if (LastOpc == AArch64::Bimm) { - if (SecondLastOpc == AArch64::Bcc) { - TBB = SecondLastInst->getOperand(1).getMBB(); - Cond.push_back(MachineOperand::CreateImm(AArch64::Bcc)); - Cond.push_back(SecondLastInst->getOperand(0)); - FBB = LastInst->getOperand(0).getMBB(); - return false; - } else if (isCondBranch(SecondLastOpc)) { - classifyCondBranch(SecondLastInst, TBB, Cond); - FBB = LastInst->getOperand(0).getMBB(); - return false; - } + if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + parseCondBranch(SecondLastInst, TBB, Cond); + FBB = LastInst->getOperand(0).getMBB(); + return false; } // If the block ends with two unconditional branches, handle it. The second // one is not executed, so remove it. - if (SecondLastOpc == AArch64::Bimm && LastOpc == AArch64::Bimm) { + if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { TBB = SecondLastInst->getOperand(0).getMBB(); I = LastInst; if (AllowModify) @@ -343,84 +162,72 @@ AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, return false; } + // ...likewise if it ends with an indirect branch followed by an unconditional + // branch. + if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return true; + } + // Otherwise, can't handle this. return true; } bool AArch64InstrInfo::ReverseBranchCondition( - SmallVectorImpl &Cond) const { - switch (Cond[0].getImm()) { - case AArch64::Bcc: { - A64CC::CondCodes CC = static_cast(Cond[1].getImm()); - CC = A64InvertCondCode(CC); - Cond[1].setImm(CC); - return false; - } - case AArch64::CBZw: - Cond[0].setImm(AArch64::CBNZw); - return false; - case AArch64::CBZx: - Cond[0].setImm(AArch64::CBNZx); - return false; - case AArch64::CBNZw: - Cond[0].setImm(AArch64::CBZw); - return false; - case AArch64::CBNZx: - Cond[0].setImm(AArch64::CBZx); - return false; - case AArch64::TBZwii: - Cond[0].setImm(AArch64::TBNZwii); - return false; - case AArch64::TBZxii: - Cond[0].setImm(AArch64::TBNZxii); - return false; - case AArch64::TBNZwii: - Cond[0].setImm(AArch64::TBZwii); - return false; - case AArch64::TBNZxii: - Cond[0].setImm(AArch64::TBZxii); - return false; - default: - llvm_unreachable("Unknown branch type"); - } -} - - -unsigned -AArch64InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl &Cond, - DebugLoc DL) const { - if (FBB == 0 && Cond.empty()) { - BuildMI(&MBB, DL, get(AArch64::Bimm)).addMBB(TBB); - return 1; - } else if (FBB == 0) { - MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm())); - for (int i = 1, e = Cond.size(); i != e; ++i) - MIB.addOperand(Cond[i]); - MIB.addMBB(TBB); - return 1; + SmallVectorImpl &Cond) const { + if (Cond[0].getImm() != -1) { + // Regular Bcc + AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); + Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); + } else { + // Folded compare-and-branch + switch (Cond[1].getImm()) { + default: + llvm_unreachable("Unknown conditional branch!"); + case AArch64::CBZW: + Cond[1].setImm(AArch64::CBNZW); + break; + case AArch64::CBNZW: + Cond[1].setImm(AArch64::CBZW); + break; + case AArch64::CBZX: + Cond[1].setImm(AArch64::CBNZX); + break; + case AArch64::CBNZX: + Cond[1].setImm(AArch64::CBZX); + break; + case AArch64::TBZW: + Cond[1].setImm(AArch64::TBNZW); + break; + case AArch64::TBNZW: + Cond[1].setImm(AArch64::TBZW); + break; + case AArch64::TBZX: + Cond[1].setImm(AArch64::TBNZX); 
+ break; + case AArch64::TBNZX: + Cond[1].setImm(AArch64::TBZX); + break; + } } - MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm())); - for (int i = 1, e = Cond.size(); i != e; ++i) - MIB.addOperand(Cond[i]); - MIB.addMBB(TBB); - - BuildMI(&MBB, DL, get(AArch64::Bimm)).addMBB(FBB); - return 2; + return false; } unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) return 0; + if (I == MBB.begin()) + return 0; --I; while (I->isDebugValue()) { if (I == MBB.begin()) return 0; --I; } - if (I->getOpcode() != AArch64::Bimm && !isCondBranch(I->getOpcode())) + if (!isUncondBranchOpcode(I->getOpcode()) && + !isCondBranchOpcode(I->getOpcode())) return 0; // Remove the branch. @@ -428,9 +235,10 @@ unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { I = MBB.end(); - if (I == MBB.begin()) return 1; + if (I == MBB.begin()) + return 1; --I; - if (!isCondBranch(I->getOpcode())) + if (!isCondBranchOpcode(I->getOpcode())) return 1; // Remove the branch. @@ -438,542 +246,1838 @@ unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return 2; } -bool -AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const { - MachineInstr &MI = *MBBI; - MachineBasicBlock &MBB = *MI.getParent(); +void AArch64InstrInfo::instantiateCondBranch( + MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, + const SmallVectorImpl &Cond) const { + if (Cond[0].getImm() != -1) { + // Regular Bcc + BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); + } else { + // Folded compare-and-branch + const MachineInstrBuilder MIB = + BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg()); + if (Cond.size() > 3) + MIB.addImm(Cond[3].getImm()); + MIB.addMBB(TBB); + } +} - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - case AArch64::TLSDESC_BLRx: { - MachineInstr *NewMI = - BuildMI(MBB, MBBI, MI.getDebugLoc(), get(AArch64::TLSDESCCALL)) - .addOperand(MI.getOperand(1)); - MI.setDesc(get(AArch64::BLRx)); - - llvm::finalizeBundle(MBB, NewMI, *++MBBI); - return true; - } +unsigned AArch64InstrInfo::InsertBranch( + MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, + const SmallVectorImpl &Cond, DebugLoc DL) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + + if (!FBB) { + if (Cond.empty()) // Unconditional branch? + BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); + else + instantiateCondBranch(MBB, DL, TBB, Cond); + return 1; + } + + // Two-way conditional branch. + instantiateCondBranch(MBB, DL, TBB, Cond); + BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); + return 2; +} + +// Find the original register that VReg is copied from. +static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { + while (TargetRegisterInfo::isVirtualRegister(VReg)) { + const MachineInstr *DefMI = MRI.getVRegDef(VReg); + if (!DefMI->isFullCopy()) + return VReg; + VReg = DefMI->getOperand(1).getReg(); + } + return VReg; +} + +// Determine if VReg is defined by an instruction that can be folded into a +// csel instruction. If so, return the folded opcode, and the replacement +// register. 
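The folding performed by canFoldIntoCSel, defined next, rests on three identities of the conditional-select family: csinc yields b+1, csinv yields ~b, and csneg yields -b on the not-taken path, so an add-of-one, an orn-from-zero, or a sub-from-zero feeding a select can be absorbed into the select itself. The algebra, spelled out as plain C++:

#include <cstdint>

// csel computes c ? a : b; the variants fold one operation into the false arm:
//   csinc -> c ? a : b + 1  (absorbs "add b, b, #1")
//   csinv -> c ? a : ~b     (absorbs "orn b, xzr, b", i.e. not)
//   csneg -> c ? a : -b     (absorbs "sub b, xzr, b", i.e. neg)
static uint64_t csinc(bool c, uint64_t a, uint64_t b) { return c ? a : b + 1; }
static uint64_t csinv(bool c, uint64_t a, uint64_t b) { return c ? a : ~b; }
static uint64_t csneg(bool c, uint64_t a, uint64_t b) { return c ? a : 0 - b; }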
+static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, + unsigned *NewVReg = nullptr) { + VReg = removeCopies(MRI, VReg); + if (!TargetRegisterInfo::isVirtualRegister(VReg)) + return 0; + + bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); + const MachineInstr *DefMI = MRI.getVRegDef(VReg); + unsigned Opc = 0; + unsigned SrcOpNum = 0; + switch (DefMI->getOpcode()) { + case AArch64::ADDSXri: + case AArch64::ADDSWri: + // if NZCV is used, do not fold. + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) + return 0; + // fall-through to ADDXri and ADDWri. + case AArch64::ADDXri: + case AArch64::ADDWri: + // add x, 1 -> csinc. + if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || + DefMI->getOperand(3).getImm() != 0) + return 0; + SrcOpNum = 1; + Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; + break; + + case AArch64::ORNXrr: + case AArch64::ORNWrr: { + // not x -> csinv, represented as orn dst, xzr, src. + unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); + if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) + return 0; + SrcOpNum = 2; + Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; + break; + } + + case AArch64::SUBSXrr: + case AArch64::SUBSWrr: + // if NZCV is used, do not fold. + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) + return 0; + // fall-through to SUBXrr and SUBWrr. + case AArch64::SUBXrr: + case AArch64::SUBWrr: { + // neg x -> csneg, represented as sub dst, xzr, src. + unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); + if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) + return 0; + SrcOpNum = 2; + Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr; + break; + } default: + return 0; + } + assert(Opc && SrcOpNum && "Missing parameters"); + + if (NewVReg) + *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); + return Opc; +} + +bool AArch64InstrInfo::canInsertSelect( + const MachineBasicBlock &MBB, const SmallVectorImpl &Cond, + unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, + int &FalseCycles) const { + // Check register classes. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = + RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + if (!RC) return false; + + // Expanding cbz/tbz requires an extra cycle of latency on the condition. + unsigned ExtraCondLat = Cond.size() != 1; + + // GPRs are handled by csel. + // FIXME: Fold in x+1, -x, and ~x when applicable. + if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || + AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + // Single-cycle csel, csinc, csinv, and csneg. + CondCycles = 1 + ExtraCondLat; + TrueCycles = FalseCycles = 1; + if (canFoldIntoCSel(MRI, TrueReg)) + TrueCycles = 0; + else if (canFoldIntoCSel(MRI, FalseReg)) + FalseCycles = 0; + return true; } + // Scalar floating point is handled by fcsel. + // FIXME: Form fabs, fmin, and fmax when applicable. + if (AArch64::FPR64RegClass.hasSubClassEq(RC) || + AArch64::FPR32RegClass.hasSubClassEq(RC)) { + CondCycles = 5 + ExtraCondLat; + TrueCycles = FalseCycles = 2; + return true; + } + + // Can't do vectors. 
return false; } -void -AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, - int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - DebugLoc DL = MBB.findDebugLoc(MBBI); - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FrameIdx); - - MachineMemOperand *MMO - = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOStore, - MFI.getObjectSize(FrameIdx), - Align); - - unsigned StoreOp = 0; - if (RC->hasType(MVT::i64) || RC->hasType(MVT::i32)) { - switch(RC->getSize()) { - case 4: StoreOp = AArch64::LS32_STR; break; - case 8: StoreOp = AArch64::LS64_STR; break; +void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DstReg, + const SmallVectorImpl &Cond, + unsigned TrueReg, unsigned FalseReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + // Parse the condition code, see parseCondBranch() above. + AArch64CC::CondCode CC; + switch (Cond.size()) { + default: + llvm_unreachable("Unknown condition opcode in Cond"); + case 1: // b.cc + CC = AArch64CC::CondCode(Cond[0].getImm()); + break; + case 3: { // cbz/cbnz + // We must insert a compare against 0. + bool Is64Bit; + switch (Cond[1].getImm()) { default: - llvm_unreachable("Unknown size for regclass"); - } - } else if (AArch64::FPR8RegClass.hasSubClassEq(RC)) { - StoreOp = AArch64::LSFP8_STR; - } else if (AArch64::FPR16RegClass.hasSubClassEq(RC)) { - StoreOp = AArch64::LSFP16_STR; - } else if (RC->hasType(MVT::f32) || RC->hasType(MVT::f64) || - RC->hasType(MVT::f128)) { - switch (RC->getSize()) { - case 4: StoreOp = AArch64::LSFP32_STR; break; - case 8: StoreOp = AArch64::LSFP64_STR; break; - case 16: StoreOp = AArch64::LSFP128_STR; break; + llvm_unreachable("Unknown branch opcode in Cond"); + case AArch64::CBZW: + Is64Bit = 0; + CC = AArch64CC::EQ; + break; + case AArch64::CBZX: + Is64Bit = 1; + CC = AArch64CC::EQ; + break; + case AArch64::CBNZW: + Is64Bit = 0; + CC = AArch64CC::NE; + break; + case AArch64::CBNZX: + Is64Bit = 1; + CC = AArch64CC::NE; + break; + } + unsigned SrcReg = Cond[2].getReg(); + if (Is64Bit) { + // cmp reg, #0 is actually subs xzr, reg, #0. + MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); + BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) + .addReg(SrcReg) + .addImm(0) + .addImm(0); + } else { + MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); + BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) + .addReg(SrcReg) + .addImm(0) + .addImm(0); + } + break; + } + case 4: { // tbz/tbnz + // We must insert a tst instruction. 
+ switch (Cond[1].getImm()) { default: - llvm_unreachable("Unknown size for regclass"); - } - } else { // For a super register class has more than one sub registers - if (AArch64::DPairRegClass.hasSubClassEq(RC)) - StoreOp = AArch64::ST1x2_8B; - else if (AArch64::DTripleRegClass.hasSubClassEq(RC)) - StoreOp = AArch64::ST1x3_8B; - else if (AArch64::DQuadRegClass.hasSubClassEq(RC)) - StoreOp = AArch64::ST1x4_8B; - else if (AArch64::QPairRegClass.hasSubClassEq(RC)) - StoreOp = AArch64::ST1x2_16B; - else if (AArch64::QTripleRegClass.hasSubClassEq(RC)) - StoreOp = AArch64::ST1x3_16B; - else if (AArch64::QQuadRegClass.hasSubClassEq(RC)) - StoreOp = AArch64::ST1x4_16B; + llvm_unreachable("Unknown branch opcode in Cond"); + case AArch64::TBZW: + case AArch64::TBZX: + CC = AArch64CC::EQ; + break; + case AArch64::TBNZW: + case AArch64::TBNZX: + CC = AArch64CC::NE; + break; + } + // cmp reg, #foo is actually ands xzr, reg, #1<hasType(MVT::i64) || RC->hasType(MVT::i32)) { - switch(RC->getSize()) { - case 4: LoadOp = AArch64::LS32_LDR; break; - case 8: LoadOp = AArch64::LS64_LDR; break; - default: - llvm_unreachable("Unknown size for regclass"); - } - } else if (AArch64::FPR8RegClass.hasSubClassEq(RC)) { - LoadOp = AArch64::LSFP8_LDR; - } else if (AArch64::FPR16RegClass.hasSubClassEq(RC)) { - LoadOp = AArch64::LSFP16_LDR; - } else if (RC->hasType(MVT::f32) || RC->hasType(MVT::f64) || - RC->hasType(MVT::f128)) { - switch (RC->getSize()) { - case 4: LoadOp = AArch64::LSFP32_LDR; break; - case 8: LoadOp = AArch64::LSFP64_LDR; break; - case 16: LoadOp = AArch64::LSFP128_LDR; break; - default: - llvm_unreachable("Unknown size for regclass"); - } - } else { // For a super register class has more than one sub registers - if (AArch64::DPairRegClass.hasSubClassEq(RC)) - LoadOp = AArch64::LD1x2_8B; - else if (AArch64::DTripleRegClass.hasSubClassEq(RC)) - LoadOp = AArch64::LD1x3_8B; - else if (AArch64::DQuadRegClass.hasSubClassEq(RC)) - LoadOp = AArch64::LD1x4_8B; - else if (AArch64::QPairRegClass.hasSubClassEq(RC)) - LoadOp = AArch64::LD1x2_16B; - else if (AArch64::QTripleRegClass.hasSubClassEq(RC)) - LoadOp = AArch64::LD1x3_16B; - else if (AArch64::QQuadRegClass.hasSubClassEq(RC)) - LoadOp = AArch64::LD1x4_16B; - else - llvm_unreachable("Unknown reg class"); +bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { + switch (MI.getOpcode()) { + default: + return false; + case AArch64::SBFMXri: // aka sxtw + case AArch64::UBFMXri: // aka uxtw + // Check for the 32 -> 64 bit extension case, these instructions can do + // much more. + if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) + return false; + // This is a signed or unsigned 32 -> 64 bit extension. + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + SubIdx = AArch64::sub_32; + return true; + } +} - MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(LoadOp), DestReg); - // Vector load has different operands from other load instructions. - NewMI.addFrameIndex(FrameIdx) - .addMemOperand(MMO); - return; +/// analyzeCompare - For a comparison instruction, return the source registers +/// in SrcReg and SrcReg2, and the value it compares against in CmpValue. +/// Return true if the comparison instruction can be analyzed. 
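Before the definition that follows, a concrete illustration of that contract. On AArch64 a compare is a subtraction into the zero register, so for "cmp x0, #7" (really "subs xzr, x0, #7") the function reports the compared register and the immediate; the struct below is a hypothetical mirror, not LLVM API:

// Out-parameters as analyzeCompare would fill them for "cmp x0, #7":
struct CompareInfo {
  unsigned SrcReg;  // register number for x0 (placeholder here)
  unsigned SrcReg2; // 0: the ri form has no second source register
  int CmpMask;      // ~0
  int CmpValue;     // 7, taken from the immediate operand
};

static CompareInfo exampleCmpX0Imm7() {
  return {/*SrcReg=*/0u, /*SrcReg2=*/0u, /*CmpMask=*/~0, /*CmpValue=*/7};
  // Real code reads MI->getOperand(1).getReg() and getOperand(2).getImm().
}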
+bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::SUBSWrr: + case AArch64::SUBSWrs: + case AArch64::SUBSWrx: + case AArch64::SUBSXrr: + case AArch64::SUBSXrs: + case AArch64::SUBSXrx: + case AArch64::ADDSWrr: + case AArch64::ADDSWrs: + case AArch64::ADDSWrx: + case AArch64::ADDSXrr: + case AArch64::ADDSXrs: + case AArch64::ADDSXrx: + // Replace SUBSWrr with SUBWrr if NZCV is not used. + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = MI->getOperand(2).getReg(); + CmpMask = ~0; + CmpValue = 0; + return true; + case AArch64::SUBSWri: + case AArch64::ADDSWri: + case AArch64::SUBSXri: + case AArch64::ADDSXri: + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = MI->getOperand(2).getImm(); + return true; + case AArch64::ANDSWri: + case AArch64::ANDSXri: + // ANDS does not use the same encoding scheme as the others xxxS + // instructions. + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = AArch64_AM::decodeLogicalImmediate( + MI->getOperand(2).getImm(), + MI->getOpcode() == AArch64::ANDSWri ? 32 : 64); + return true; + } + + return false; +} + +static bool UpdateOperandRegClass(MachineInstr *Instr) { + MachineBasicBlock *MBB = Instr->getParent(); + assert(MBB && "Can't get MachineBasicBlock here"); + MachineFunction *MF = MBB->getParent(); + assert(MF && "Can't get MachineFunction here"); + const TargetMachine *TM = &MF->getTarget(); + const TargetInstrInfo *TII = TM->getInstrInfo(); + const TargetRegisterInfo *TRI = TM->getRegisterInfo(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + + for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; + ++OpIdx) { + MachineOperand &MO = Instr->getOperand(OpIdx); + const TargetRegisterClass *OpRegCstraints = + Instr->getRegClassConstraint(OpIdx, TII, TRI); + + // If there's no constraint, there's nothing to do. + if (!OpRegCstraints) + continue; + // If the operand is a frame index, there's nothing to do here. + // A frame index operand will resolve correctly during PEI. + if (MO.isFI()) + continue; + + assert(MO.isReg() && + "Operand has register constraints without being a register!"); + + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (!OpRegCstraints->contains(Reg)) + return false; + } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && + !MRI->constrainRegClass(Reg, OpRegCstraints)) + return false; } - MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(LoadOp), DestReg); - NewMI.addFrameIndex(FrameIdx) - .addImm(0) - .addMemOperand(MMO); + return true; } -unsigned AArch64InstrInfo::estimateRSStackLimit(MachineFunction &MF) const { - unsigned Limit = (1 << 16) - 1; - for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) { - for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); - I != E; ++I) { - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - if (!I->getOperand(i).isFI()) continue; +/// optimizeCompareInstr - Convert the instruction supplying the argument to the +/// comparison into one that sets the zero bit in the flags register. +bool AArch64InstrInfo::optimizeCompareInstr( + MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + int CmpValue, const MachineRegisterInfo *MRI) const { + + // Replace SUBSWrr with SUBWrr if NZCV is not used. 
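That comment names the first rewrite in the body that follows: when the NZCV def is dead, the flag-setting arithmetic is demoted to its plain form. A standalone, string-based sketch of the opcode mapping (hypothetical; the real code switches over opcodes):

#include <string>

// Demote a flag-setting add/sub opcode name, e.g. "SUBSWrr" -> "SUBWrr",
// "ADDSXri" -> "ADDXri". Anything else is returned unchanged.
static std::string dropFlagSetting(const std::string &Opc) {
  if (Opc.size() > 4 &&
      (Opc.compare(0, 4, "ADDS") == 0 || Opc.compare(0, 4, "SUBS") == 0))
    return Opc.substr(0, 3) + Opc.substr(4); // drop the 'S'
  return Opc;
}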
+ int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true); + if (Cmp_NZCV != -1) { + unsigned NewOpc; + switch (CmpInstr->getOpcode()) { + default: + return false; + case AArch64::ADDSWrr: NewOpc = AArch64::ADDWrr; break; + case AArch64::ADDSWri: NewOpc = AArch64::ADDWri; break; + case AArch64::ADDSWrs: NewOpc = AArch64::ADDWrs; break; + case AArch64::ADDSWrx: NewOpc = AArch64::ADDWrx; break; + case AArch64::ADDSXrr: NewOpc = AArch64::ADDXrr; break; + case AArch64::ADDSXri: NewOpc = AArch64::ADDXri; break; + case AArch64::ADDSXrs: NewOpc = AArch64::ADDXrs; break; + case AArch64::ADDSXrx: NewOpc = AArch64::ADDXrx; break; + case AArch64::SUBSWrr: NewOpc = AArch64::SUBWrr; break; + case AArch64::SUBSWri: NewOpc = AArch64::SUBWri; break; + case AArch64::SUBSWrs: NewOpc = AArch64::SUBWrs; break; + case AArch64::SUBSWrx: NewOpc = AArch64::SUBWrx; break; + case AArch64::SUBSXrr: NewOpc = AArch64::SUBXrr; break; + case AArch64::SUBSXri: NewOpc = AArch64::SUBXri; break; + case AArch64::SUBSXrs: NewOpc = AArch64::SUBXrs; break; + case AArch64::SUBSXrx: NewOpc = AArch64::SUBXrx; break; + } + + const MCInstrDesc &MCID = get(NewOpc); + CmpInstr->setDesc(MCID); + CmpInstr->RemoveOperand(Cmp_NZCV); + bool succeeded = UpdateOperandRegClass(CmpInstr); + (void)succeeded; + assert(succeeded && "Some operands reg class are incompatible!"); + return true; + } + + // Continue only if we have a "ri" where immediate is zero. + if (CmpValue != 0 || SrcReg2 != 0) + return false; + + // CmpInstr is a Compare instruction if destination register is not used. + if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) + return false; + + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) + return false; + + // We iterate backward, starting from the instruction before CmpInstr and + // stop when reaching the definition of the source register or done with the + // basic block, to check whether NZCV is used or modified in between. + MachineBasicBlock::iterator I = CmpInstr, E = MI, + B = CmpInstr->getParent()->begin(); + + // Early exit if CmpInstr is at the beginning of the BB. + if (I == B) + return false; + + // Check whether the definition of SrcReg is in the same basic block as + // Compare. If not, we can't optimize away the Compare. + if (MI->getParent() != CmpInstr->getParent()) + return false; + + // Check that NZCV isn't set between the comparison instruction and the one we + // want to change. + const TargetRegisterInfo *TRI = &getRegisterInfo(); + for (--I; I != E; --I) { + const MachineInstr &Instr = *I; - // When using ADDxxi_lsl0_s to get the address of a stack object, 0xfff - // is the largest offset guaranteed to fit in the immediate offset. - if (I->getOpcode() == AArch64::ADDxxi_lsl0_s) { - Limit = std::min(Limit, 0xfffu); - break; - } + if (Instr.modifiesRegister(AArch64::NZCV, TRI) || + Instr.readsRegister(AArch64::NZCV, TRI)) + // This instruction modifies or uses NZCV after the one we want to + // change. We can't do this transformation. + return false; + if (I == B) + // The 'and' is below the comparison instruction. 
+ return false; + } + + unsigned NewOpc = MI->getOpcode(); + switch (MI->getOpcode()) { + default: + return false; + case AArch64::ADDSWrr: + case AArch64::ADDSWri: + case AArch64::ADDSXrr: + case AArch64::ADDSXri: + case AArch64::SUBSWrr: + case AArch64::SUBSWri: + case AArch64::SUBSXrr: + case AArch64::SUBSXri: + break; + case AArch64::ADDWrr: NewOpc = AArch64::ADDSWrr; break; + case AArch64::ADDWri: NewOpc = AArch64::ADDSWri; break; + case AArch64::ADDXrr: NewOpc = AArch64::ADDSXrr; break; + case AArch64::ADDXri: NewOpc = AArch64::ADDSXri; break; + case AArch64::ADCWr: NewOpc = AArch64::ADCSWr; break; + case AArch64::ADCXr: NewOpc = AArch64::ADCSXr; break; + case AArch64::SUBWrr: NewOpc = AArch64::SUBSWrr; break; + case AArch64::SUBWri: NewOpc = AArch64::SUBSWri; break; + case AArch64::SUBXrr: NewOpc = AArch64::SUBSXrr; break; + case AArch64::SUBXri: NewOpc = AArch64::SUBSXri; break; + case AArch64::SBCWr: NewOpc = AArch64::SBCSWr; break; + case AArch64::SBCXr: NewOpc = AArch64::SBCSXr; break; + case AArch64::ANDWri: NewOpc = AArch64::ANDSWri; break; + case AArch64::ANDXri: NewOpc = AArch64::ANDSXri; break; + } - int AccessScale, MinOffset, MaxOffset; - getAddressConstraints(*I, AccessScale, MinOffset, MaxOffset); - Limit = std::min(Limit, static_cast(MaxOffset)); + // Scan forward for the use of NZCV. + // When checking against MI: if it's a conditional code requires + // checking of V bit, then this is not safe to do. + // It is safe to remove CmpInstr if NZCV is redefined or killed. + // If we are done with the basic block, we need to check whether NZCV is + // live-out. + bool IsSafe = false; + for (MachineBasicBlock::iterator I = CmpInstr, + E = CmpInstr->getParent()->end(); + !IsSafe && ++I != E;) { + const MachineInstr &Instr = *I; + for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO; + ++IO) { + const MachineOperand &MO = Instr.getOperand(IO); + if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) { + IsSafe = true; + break; + } + if (!MO.isReg() || MO.getReg() != AArch64::NZCV) + continue; + if (MO.isDef()) { + IsSafe = true; + break; + } - break; // At most one FI per instruction + // Decode the condition code. + unsigned Opc = Instr.getOpcode(); + AArch64CC::CondCode CC; + switch (Opc) { + default: + return false; + case AArch64::Bcc: + CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm(); + break; + case AArch64::CSINVWr: + case AArch64::CSINVXr: + case AArch64::CSINCWr: + case AArch64::CSINCXr: + case AArch64::CSELWr: + case AArch64::CSELXr: + case AArch64::CSNEGWr: + case AArch64::CSNEGXr: + case AArch64::FCSELSrrr: + case AArch64::FCSELDrrr: + CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm(); + break; + } + + // It is not safe to remove Compare instruction if Overflow(V) is used. + switch (CC) { + default: + // NZCV can be used multiple times, we should continue. + break; + case AArch64CC::VS: + case AArch64CC::VC: + case AArch64CC::GE: + case AArch64CC::LT: + case AArch64CC::GT: + case AArch64CC::LE: + return false; } } } - return Limit; + // If NZCV is not killed nor re-defined, we should check whether it is + // live-out. If it is live-out, do not optimize. + if (!IsSafe) { + MachineBasicBlock *ParentBlock = CmpInstr->getParent(); + for (auto *MBB : ParentBlock->successors()) + if (MBB->isLiveIn(AArch64::NZCV)) + return false; + } + + // Update the instruction to set NZCV. 
+ MI->setDesc(get(NewOpc)); + CmpInstr->eraseFromParent(); + bool succeeded = UpdateOperandRegClass(MI); + (void)succeeded; + assert(succeeded && "Some operands reg class are incompatible!"); + MI->addRegisterDefined(AArch64::NZCV, TRI); + return true; } -void AArch64InstrInfo::getAddressConstraints(const MachineInstr &MI, - int &AccessScale, int &MinOffset, - int &MaxOffset) const { - switch (MI.getOpcode()) { + +/// Return true if this instruction has a shifted register with a non-zero shift value +bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { + switch (MI->getOpcode()) { default: - llvm_unreachable("Unknown load/store kind"); - case TargetOpcode::DBG_VALUE: - AccessScale = 1; - MinOffset = INT_MIN; - MaxOffset = INT_MAX; - return; - case AArch64::LS8_LDR: case AArch64::LS8_STR: - case AArch64::LSFP8_LDR: case AArch64::LSFP8_STR: - case AArch64::LDRSBw: - case AArch64::LDRSBx: - AccessScale = 1; - MinOffset = 0; - MaxOffset = 0xfff; - return; - case AArch64::LS16_LDR: case AArch64::LS16_STR: - case AArch64::LSFP16_LDR: case AArch64::LSFP16_STR: - case AArch64::LDRSHw: - case AArch64::LDRSHx: - AccessScale = 2; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LS32_LDR: case AArch64::LS32_STR: - case AArch64::LSFP32_LDR: case AArch64::LSFP32_STR: - case AArch64::LDRSWx: - case AArch64::LDPSWx: - AccessScale = 4; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LS64_LDR: case AArch64::LS64_STR: - case AArch64::LSFP64_LDR: case AArch64::LSFP64_STR: - case AArch64::PRFM: - AccessScale = 8; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LSFP128_LDR: case AArch64::LSFP128_STR: - AccessScale = 16; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LSPair32_LDR: case AArch64::LSPair32_STR: - case AArch64::LSFPPair32_LDR: case AArch64::LSFPPair32_STR: - AccessScale = 4; - MinOffset = -0x40 * AccessScale; - MaxOffset = 0x3f * AccessScale; - return; - case AArch64::LSPair64_LDR: case AArch64::LSPair64_STR: - case AArch64::LSFPPair64_LDR: case AArch64::LSFPPair64_STR: - AccessScale = 8; - MinOffset = -0x40 * AccessScale; - MaxOffset = 0x3f * AccessScale; - return; - case AArch64::LSFPPair128_LDR: case AArch64::LSFPPair128_STR: - AccessScale = 16; - MinOffset = -0x40 * AccessScale; - MaxOffset = 0x3f * AccessScale; - return; - case AArch64::LD1x2_8B: case AArch64::ST1x2_8B: - AccessScale = 16; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LD1x3_8B: case AArch64::ST1x3_8B: - AccessScale = 24; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LD1x4_8B: case AArch64::ST1x4_8B: - case AArch64::LD1x2_16B: case AArch64::ST1x2_16B: - AccessScale = 32; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LD1x3_16B: case AArch64::ST1x3_16B: - AccessScale = 48; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; - case AArch64::LD1x4_16B: case AArch64::ST1x4_16B: - AccessScale = 64; - MinOffset = 0; - MaxOffset = 0xfff * AccessScale; - return; + break; + case AArch64::ADDSWrs: + case AArch64::ADDSXrs: + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::ANDSWrs: + case AArch64::ANDSXrs: + case AArch64::ANDWrs: + case AArch64::ANDXrs: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + case AArch64::BICWrs: + case AArch64::BICXrs: + case AArch64::CRC32Brr: + case AArch64::CRC32CBrr: + case AArch64::CRC32CHrr: + case AArch64::CRC32CWrr: + case AArch64::CRC32CXrr: + case AArch64::CRC32Hrr: + 
case AArch64::CRC32Wrr: + case AArch64::CRC32Xrr: + case AArch64::EONWrs: + case AArch64::EONXrs: + case AArch64::EORWrs: + case AArch64::EORXrs: + case AArch64::ORNWrs: + case AArch64::ORNXrs: + case AArch64::ORRWrs: + case AArch64::ORRXrs: + case AArch64::SUBSWrs: + case AArch64::SUBSXrs: + case AArch64::SUBWrs: + case AArch64::SUBXrs: + if (MI->getOperand(3).isImm()) { + unsigned val = MI->getOperand(3).getImm(); + return (val != 0); + } + break; + } + return false; } -unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { - const MCInstrDesc &MCID = MI.getDesc(); - const MachineBasicBlock &MBB = *MI.getParent(); - const MachineFunction &MF = *MBB.getParent(); - const MCAsmInfo &MAI = *MF.getTarget().getMCAsmInfo(); +/// Return true if this instruction has an extended register with a non-zero extend value +bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::ADDSWrx: + case AArch64::ADDSXrx: + case AArch64::ADDSXrx64: + case AArch64::ADDWrx: + case AArch64::ADDXrx: + case AArch64::ADDXrx64: + case AArch64::SUBSWrx: + case AArch64::SUBSXrx: + case AArch64::SUBSXrx64: + case AArch64::SUBWrx: + case AArch64::SUBXrx: + case AArch64::SUBXrx64: + if (MI->getOperand(3).isImm()) { + unsigned val = MI->getOperand(3).getImm(); + return (val != 0); + } + break; + } - if (MCID.getSize()) - return MCID.getSize(); + return false; +} - if (MI.getOpcode() == AArch64::INLINEASM) - return getInlineAsmLength(MI.getOperand(0).getSymbolName(), MAI); +// Return true if this instruction simply sets its single destination register +// to zero. This is equivalent to a register rename of the zero-register. +bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::MOVZWi: + case AArch64::MOVZXi: // movz Rd, #0 (LSL #0) + if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) { + assert(MI->getDesc().getNumOperands() == 3 && + MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands"); + return true; + } + break; + case AArch64::ANDWri: // and Rd, Rzr, #imm + return MI->getOperand(1).getReg() == AArch64::WZR; + case AArch64::ANDXri: + return MI->getOperand(1).getReg() == AArch64::XZR; + case TargetOpcode::COPY: + return MI->getOperand(1).getReg() == AArch64::WZR; + } + return false; +} - switch (MI.getOpcode()) { - case TargetOpcode::BUNDLE: - return getInstBundleLength(MI); - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - case TargetOpcode::CFI_INSTRUCTION: - case TargetOpcode::EH_LABEL: - case TargetOpcode::GC_LABEL: - case TargetOpcode::DBG_VALUE: - case AArch64::TLSDESCCALL: - return 0; +// Return true if this instruction simply renames a general register without +// modifying bits.
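Besides a plain COPY, two ALU idioms qualify as pure renames and are matched by the function below: "orr Xd, xzr, Xm" and "add Xd, Xn, #0", both with LSL #0. A sketch of how the zero-materialization predicate above could be used by a peephole (usage hypothetical; TII and MRI assumed in scope):

// Any instruction matched by isGPRZero only zeroes its destination, so its
// uses can read the zero register directly (32-bit case shown).
if (TII->isGPRZero(MI)) {
  unsigned DefReg = MI->getOperand(0).getReg();
  if (TargetRegisterInfo::isVirtualRegister(DefReg)) {
    MRI->replaceRegWith(DefReg, AArch64::WZR);
    MI->eraseFromParent();
  }
}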
+bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { + switch (MI->getOpcode()) { default: - llvm_unreachable("Unknown instruction class"); + break; + case TargetOpcode::COPY: { + // GPR32 copies will be lowered to ORRXrs + unsigned DstReg = MI->getOperand(0).getReg(); + return (AArch64::GPR32RegClass.contains(DstReg) || + AArch64::GPR64RegClass.contains(DstReg)); } + case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) + if (MI->getOperand(1).getReg() == AArch64::XZR) { + assert(MI->getDesc().getNumOperands() == 4 && + MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands"); + return true; + } + break; + case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) + if (MI->getOperand(2).getImm() == 0) { + assert(MI->getDesc().getNumOperands() == 4 && + MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands"); + return true; + } + } + return false; } -unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const { - unsigned Size = 0; - MachineBasicBlock::const_instr_iterator I = MI; - MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); - while (++I != E && I->isInsideBundle()) { - assert(!I->isBundle() && "No nested bundle!"); - Size += getInstSizeInBytes(*I); +// Return true if this instruction simply renames a general register without +// modifying bits. +bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case TargetOpcode::COPY: { + // FPR64 copies will be lowered to ORR.16b + unsigned DstReg = MI->getOperand(0).getReg(); + return (AArch64::FPR64RegClass.contains(DstReg) || + AArch64::FPR128RegClass.contains(DstReg)); } - return Size; + case AArch64::ORRv16i8: + if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { + assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() && + "invalid ORRv16i8 operands"); + return true; + } + } + return false; } -bool llvm::rewriteA64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, - const AArch64InstrInfo &TII) { - MachineBasicBlock &MBB = *MI.getParent(); - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); +unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::LDRWui: + case AArch64::LDRXui: + case AArch64::LDRBui: + case AArch64::LDRHui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } - MFI.getObjectOffset(FrameRegIdx); - llvm_unreachable("Unimplemented rewriteFrameIndex"); + return 0; } -void llvm::emitRegUpdate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - DebugLoc dl, const TargetInstrInfo &TII, - unsigned DstReg, unsigned SrcReg, unsigned ScratchReg, - int64_t NumBytes, MachineInstr::MIFlag MIFlags) { - if (NumBytes == 0 && DstReg == SrcReg) - return; - else if (abs64(NumBytes) & ~0xffffff) { - // Generically, we have to materialize the offset into a temporary register - // and subtract it. There are a couple of ways this could be done, for now - // we'll use a movz/movk or movn/movk sequence.
- uint64_t Bits = static_cast<uint64_t>(abs64(NumBytes)); - BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVZxii), ScratchReg) - .addImm(0xffff & Bits).addImm(0) - .setMIFlags(MIFlags); - - Bits >>= 16; - if (Bits & 0xffff) { - BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg) - .addReg(ScratchReg) - .addImm(0xffff & Bits).addImm(1) - .setMIFlags(MIFlags); - } - - Bits >>= 16; - if (Bits & 0xffff) { - BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg) - .addReg(ScratchReg) - .addImm(0xffff & Bits).addImm(2) - .setMIFlags(MIFlags); - } - - Bits >>= 16; - if (Bits & 0xffff) { - BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg) - .addReg(ScratchReg) - .addImm(0xffff & Bits).addImm(3) - .setMIFlags(MIFlags); - } - - // ADD DST, SRC, xTMP (, lsl #0) - unsigned AddOp = NumBytes > 0 ? AArch64::ADDxxx_uxtx : AArch64::SUBxxx_uxtx; - BuildMI(MBB, MBBI, dl, TII.get(AddOp), DstReg) - .addReg(SrcReg, RegState::Kill) - .addReg(ScratchReg, RegState::Kill) - .addImm(0) - .setMIFlag(MIFlags); - return; +unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::STRWui: + case AArch64::STRXui: + case AArch64::STRBui: + case AArch64::STRHui: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; } + return 0; +} - // Now we know that the adjustment can be done in at most two add/sub - // (immediate) instructions, which is always more efficient than a - // literal-pool load, or even a hypothetical movz/movk/add sequence +/// Return true if this load/store scales or extends its register offset. +/// This refers to scaling a dynamic index as opposed to scaled immediates. +/// MI should be a memory op that allows scaled addressing.
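At the assembly level, the register-offset forms the function below inspects look like this (examples illustrative):

//   ldr x0, [x1, w2, sxtw #3]   ; extends (SXTW) and scales by 8 -> true
//   ldr x0, [x1, x2, lsl #3]    ; UXTX but shifted               -> true
//   ldr x0, [x1, x2]            ; UXTX, no shift                 -> false

The decode uses the same helpers as the code below (MemMI is a hypothetical register-offset memory instruction):

unsigned Val = MemMI->getOperand(3).getImm();
bool Extends = AArch64_AM::getMemExtendType(Val) != AArch64_AM::UXTX;
bool Shifts = AArch64_AM::getMemDoShift(Val);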
+bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::LDRBBroW: + case AArch64::LDRBroW: + case AArch64::LDRDroW: + case AArch64::LDRHHroW: + case AArch64::LDRHroW: + case AArch64::LDRQroW: + case AArch64::LDRSBWroW: + case AArch64::LDRSBXroW: + case AArch64::LDRSHWroW: + case AArch64::LDRSHXroW: + case AArch64::LDRSWroW: + case AArch64::LDRSroW: + case AArch64::LDRWroW: + case AArch64::LDRXroW: + case AArch64::STRBBroW: + case AArch64::STRBroW: + case AArch64::STRDroW: + case AArch64::STRHHroW: + case AArch64::STRHroW: + case AArch64::STRQroW: + case AArch64::STRSroW: + case AArch64::STRWroW: + case AArch64::STRXroW: + case AArch64::LDRBBroX: + case AArch64::LDRBroX: + case AArch64::LDRDroX: + case AArch64::LDRHHroX: + case AArch64::LDRHroX: + case AArch64::LDRQroX: + case AArch64::LDRSBWroX: + case AArch64::LDRSBXroX: + case AArch64::LDRSHWroX: + case AArch64::LDRSHXroX: + case AArch64::LDRSWroX: + case AArch64::LDRSroX: + case AArch64::LDRWroX: + case AArch64::LDRXroX: + case AArch64::STRBBroX: + case AArch64::STRBroX: + case AArch64::STRDroX: + case AArch64::STRHHroX: + case AArch64::STRHroX: + case AArch64::STRQroX: + case AArch64::STRSroX: + case AArch64::STRWroX: + case AArch64::STRXroX: + + unsigned Val = MI->getOperand(3).getImm(); + AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val); + return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val); + } + return false; +} - // Decide whether we're doing addition or subtraction - unsigned LowOp, HighOp; - if (NumBytes >= 0) { - LowOp = AArch64::ADDxxi_lsl0_s; - HighOp = AArch64::ADDxxi_lsl12_s; - } else { - LowOp = AArch64::SUBxxi_lsl0_s; - HighOp = AArch64::SUBxxi_lsl12_s; - NumBytes = abs64(NumBytes); +/// Check all MachineMemOperands for a hint to suppress pairing. +bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const { + assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && + "Too many target MO flags"); + for (auto *MM : MI->memoperands()) { + if (MM->getFlags() & + (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) { + return true; + } } + return false; +} - // If we're here, at the very least a move needs to be produced, which just - // happens to be materializable by an ADD. - if ((NumBytes & 0xfff) || NumBytes == 0) { - BuildMI(MBB, MBBI, dl, TII.get(LowOp), DstReg) - .addReg(SrcReg, RegState::Kill) - .addImm(NumBytes & 0xfff) - .setMIFlag(MIFlags); +/// Set a flag on the first MachineMemOperand to suppress pairing. +void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const { + if (MI->memoperands_empty()) + return; - // Next update should use the register we've just defined. 
- SrcReg = DstReg; - } + assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && + "Too many target MO flags"); + (*MI->memoperands_begin()) + ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit); +} + +bool +AArch64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const { + switch (LdSt->getOpcode()) { + default: + return false; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: + if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + return false; + BaseReg = LdSt->getOperand(1).getReg(); + MachineFunction &MF = *LdSt->getParent()->getParent(); + unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize(); + Offset = LdSt->getOperand(2).getImm() * Width; + return true; + }; +} - if (NumBytes & 0xfff000) { - BuildMI(MBB, MBBI, dl, TII.get(HighOp), DstReg) - .addReg(SrcReg, RegState::Kill) - .addImm(NumBytes >> 12) - .setMIFlag(MIFlags); +/// Detect opportunities for ldp/stp formation. +/// +/// Only called for LdSt for which getLdStBaseRegImmOfs returns true. +bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const { + // Only cluster up to a single pair. + if (NumLoads > 1) + return false; + if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) + return false; + // getLdStBaseRegImmOfs guarantees that operand 2 isImm. + unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); + // Allow 6 bits of positive range. + if (Ofs1 > 64) + return false; + // The caller should already have ordered First/SecondLdSt by offset. + unsigned Ofs2 = SecondLdSt->getOperand(2).getImm(); + return Ofs1 + 1 == Ofs2; +} + +bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, + MachineInstr *Second) const { + // Cyclone can fuse CMN, CMP followed by Bcc. + + // FIXME: B0 can also fuse: + // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ.
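A concrete pair accepted by the checks below, with CMP being an alias of the flag-setting subtract (assembly illustrative):

//   subs wzr, w8, #7    ; First  == SUBSWri  (cmp w8, #7)
//   b.eq .LBB0_4        ; Second == Bcc
// Opcode pairs outside the switch below conservatively return false until
// the FIXME above is addressed.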
+ if (Second->getOpcode() != AArch64::Bcc) + return false; + switch (First->getOpcode()) { + default: + return false; + case AArch64::SUBSWri: + case AArch64::ADDSWri: + case AArch64::ANDSWri: + case AArch64::SUBSXri: + case AArch64::ADDSXri: + case AArch64::ANDSXri: + return true; } } -void llvm::emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - DebugLoc dl, const TargetInstrInfo &TII, - unsigned ScratchReg, int64_t NumBytes, - MachineInstr::MIFlag MIFlags) { - emitRegUpdate(MBB, MI, dl, TII, AArch64::XSP, AArch64::XSP, AArch64::X16, - NumBytes, MIFlags); +MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, + int FrameIx, + uint64_t Offset, + const MDNode *MDPtr, + DebugLoc DL) const { + MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE)) + .addFrameIndex(FrameIx) + .addImm(0) + .addImm(Offset) + .addMetadata(MDPtr); + return &*MIB; } +static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, + unsigned Reg, unsigned SubIdx, + unsigned State, + const TargetRegisterInfo *TRI) { + if (!SubIdx) + return MIB.addReg(Reg, State); -namespace { - struct LDTLSCleanup : public MachineFunctionPass { - static char ID; - LDTLSCleanup() : MachineFunctionPass(ID) {} + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); + return MIB.addReg(Reg, State, SubIdx); +} - virtual bool runOnMachineFunction(MachineFunction &MF) { - AArch64MachineFunctionInfo* MFI - = MF.getInfo<AArch64MachineFunctionInfo>(); - if (MFI->getNumLocalDynamicTLSAccesses() < 2) { - // No point folding accesses if there isn't at least two. - return false; - } +static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, + unsigned NumRegs) { + // We really want the positive remainder mod 32 here, that happens to be + // easily obtainable with a mask. + return ((DestReg - SrcReg) & 0x1f) < NumRegs; +} - MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>(); - return VisitNode(DT->getRootNode(), 0); - } - - // Visit the dominator subtree rooted at Node in pre-order. - // If TLSBaseAddrReg is non-null, then use that to replace any - // TLS_base_addr instructions. Otherwise, create the register - // when the first such instruction is seen, and then use it - // as we encounter more instructions. - bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) { - MachineBasicBlock *BB = Node->getBlock(); - bool Changed = false; - - // Traverse the current block. - for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; - ++I) { - switch (I->getOpcode()) { - case AArch64::TLSDESC_BLRx: - // Make sure it's a local dynamic access.
- if (!I->getOperand(1).isSymbol() || - strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_")) - break; - - if (TLSBaseAddrReg) - I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg); - else - I = SetRegister(I, &TLSBaseAddrReg); - Changed = true; - break; - default: - break; - } - } +void AArch64InstrInfo::copyPhysRegTuple( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, + llvm::ArrayRef<unsigned> Indices) const { + assert(getSubTarget().hasNEON() && + "Unexpected register copy without NEON"); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + uint16_t DestEncoding = TRI->getEncodingValue(DestReg); + uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); + unsigned NumRegs = Indices.size(); + + int SubReg = 0, End = NumRegs, Incr = 1; + if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { + SubReg = NumRegs - 1; + End = -1; + Incr = -1; + } - // Visit the children of this block in the dominator tree. - for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end(); - I != E; ++I) { - Changed |= VisitNode(*I, TLSBaseAddrReg); + for (; SubReg != End; SubReg += Incr) { + const MachineInstrBuilder &MIB = BuildMI(MBB, I, DL, get(Opcode)); + AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); + AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); + AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); + } +} + +void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + if (AArch64::GPR32spRegClass.contains(DestReg) && + (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { + // If either operand is WSP, expand to ADD #0. + if (Subtarget.hasZeroCycleRegMove()) { + // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. + unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + // This instruction is reading and writing X registers. This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegX, but a proper + // value from SrcReg. + BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) + .addReg(SrcRegX, RegState::Undef) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } + } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) { + BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg).addImm(0).addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + if (Subtarget.hasZeroCycleRegMove()) { + // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. + unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + // This instruction is reading and writing X registers.
This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegX, but a proper + // value from SrcReg. + BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) + .addReg(AArch64::XZR) + .addReg(SrcRegX, RegState::Undef) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); + } else { + // Otherwise, expand to ORR WZR. + BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) + .addReg(AArch64::WZR) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + } + return; + } - return Changed; + if (AArch64::GPR64spRegClass.contains(DestReg) && + (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { + if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { + // If either operand is SP, expand to ADD #0. + BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) { + BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg).addImm(0).addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + // Otherwise, expand to ORR XZR. + BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) + .addReg(AArch64::XZR) + .addReg(SrcReg, getKillRegState(KillSrc)); } + return; + } - // Replace the TLS_base_addr instruction I with a copy from - // TLSBaseAddrReg, returning the new instruction. - MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I, - unsigned TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); - const AArch64TargetMachine *TM = - static_cast<const AArch64TargetMachine *>(&MF->getTarget()); - const AArch64InstrInfo *TII = TM->getInstrInfo(); + // Copy a DDDD register quad by copying the individual sub-registers. + if (AArch64::DDDDRegClass.contains(DestReg) && + AArch64::DDDDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2, AArch64::dsub3 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } - // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the - // code sequence assumes the address will be. - MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - AArch64::X0) - .addReg(TLSBaseAddrReg); + // Copy a DDD register triple by copying the individual sub-registers. + if (AArch64::DDDRegClass.contains(DestReg) && + AArch64::DDDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } - // Erase the TLS_base_addr instruction. - I->eraseFromParent(); + // Copy a DD register pair by copying the individual sub-registers. + if (AArch64::DDRegClass.contains(DestReg) && + AArch64::DDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } + + // Copy a QQQQ register quad by copying the individual sub-registers.
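A worked example of the overlap test in forwardCopyWillClobberTuple (encodings illustrative): copying Q1_Q2_Q3 into Q2_Q3_Q4 gives (DestEncoding - SrcEncoding) & 0x1f == 1, which is less than NumRegs == 3, so a forward per-sub-register copy would overwrite q2 and q3 before reading them. copyPhysRegTuple therefore iterates in reverse for such overlaps:

//   orr v4.16b, v3.16b, v3.16b   ; qsub2 copied first
//   orr v3.16b, v2.16b, v2.16b
//   orr v2.16b, v1.16b, v1.16b
// Disjoint tuples, e.g. Q0_Q1 copied from Q8_Q9, iterate forward instead.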
+ if (AArch64::QQQQRegClass.contains(DestReg) && + AArch64::QQQQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } + + // Copy a QQQ register triple by copying the individual sub-registers. + if (AArch64::QQQRegClass.contains(DestReg) && + AArch64::QQQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } + + // Copy a QQ register pair by copying the individual sub-registers. + if (AArch64::QQRegClass.contains(DestReg) && + AArch64::QQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } - return Copy; + if (AArch64::FPR128RegClass.contains(DestReg) && + AArch64::FPR128RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::STRQpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addReg(AArch64::SP) + .addImm(-16); + BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(DestReg, RegState::Define) + .addReg(AArch64::SP) + .addImm(16); } + return; + } - // Create a virtual register in *TLSBaseAddrReg, and populate it by - // inserting a copy instruction after I. Returns the new instruction. - MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); - const AArch64TargetMachine *TM = - static_cast<const AArch64TargetMachine *>(&MF->getTarget()); - const AArch64InstrInfo *TII = TM->getInstrInfo(); + if (AArch64::FPR64RegClass.contains(DestReg) && + AArch64::FPR64RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } - // Create a virtual register for the TLS base address. - MachineRegisterInfo &RegInfo = MF->getRegInfo(); - *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass); + if (AArch64::FPR32RegClass.contains(DestReg) && + AArch64::FPR32RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } - // Insert a copy from X0 to TLSBaseAddrReg for later.
- MachineInstr *Next = I->getNextNode(); - MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - *TLSBaseAddrReg) - .addReg(AArch64::X0); + if (AArch64::FPR16RegClass.contains(DestReg) && + AArch64::FPR16RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR32RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, + &AArch64::FPR32RegClass); + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } - return Copy; + if (AArch64::FPR8RegClass.contains(DestReg) && + AArch64::FPR8RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, + &AArch64::FPR32RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, + &AArch64::FPR32RegClass); + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } + return; + } + + // Copies between GPR64 and FPR64. + if (AArch64::FPR64RegClass.contains(DestReg) && + AArch64::GPR64RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (AArch64::GPR64RegClass.contains(DestReg) && + AArch64::FPR64RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + // Copies between GPR32 and FPR32. 
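These cross-register-file moves are raw bit transfers: FMOV between a GPR and an FPR of the same width neither rounds nor reinterprets the value. In assembly (illustrative):

//   fmov d0, x0   ; GPR64 -> FPR64 (FMOVXDr, built above)
//   fmov x0, d0   ; FPR64 -> GPR64 (FMOVDXr, built above)
//   fmov s0, w0   ; GPR32 -> FPR32 (FMOVWSr, built below)
//   fmov w0, s0   ; FPR32 -> GPR32 (FMOVSWr, built below)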
+ if (AArch64::FPR32RegClass.contains(DestReg) && + AArch64::GPR32RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (AArch64::GPR32RegClass.contains(DestReg) && + AArch64::FPR32RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } - virtual const char *getPassName() const { - return "Local Dynamic TLS Access Clean-up"; + if (DestReg == AArch64::NZCV) { + assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); + BuildMI(MBB, I, DL, get(AArch64::MSR)) + .addImm(AArch64SysReg::NZCV) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); + return; + } + + if (SrcReg == AArch64::NZCV) { + assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); + BuildMI(MBB, I, DL, get(AArch64::MRS)) + .addReg(DestReg) + .addImm(AArch64SysReg::NZCV) + .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); + return; + } + + llvm_unreachable("unimplemented reg-to-reg copy"); +} + +void AArch64InstrInfo::storeRegToStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, + bool isKill, int FI, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + + MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); + unsigned Opc = 0; + bool Offset = true; + switch (RC->getSize()) { + case 1: + if (AArch64::FPR8RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRBui; + break; + case 2: + if (AArch64::FPR16RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRHui; + break; + case 4: + if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::STRWui; + if (TargetRegisterInfo::isVirtualRegister(SrcReg)) + MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); + else + assert(SrcReg != AArch64::WSP); + } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRSui; + break; + case 8: + if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::STRXui; + if (TargetRegisterInfo::isVirtualRegister(SrcReg)) + MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); + else + assert(SrcReg != AArch64::SP); + } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRDui; + break; + case 16: + if (AArch64::FPR128RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRQui; + else if (AArch64::DDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Twov1d, Offset = false; + } + break; + case 24: + if (AArch64::DDDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Threev1d, Offset = false; + } + break; + case 32: + if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Fourv1d, Offset = false; + } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Twov2d, Offset = false; } + 
break; + case 48: + if (AArch64::QQQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Threev2d, Offset = false; + } + break; + case 64: + if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Fourv2d, Offset = false; + } + break; + } + assert(Opc && "Unknown register class"); + + const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI); + + if (Offset) + MI.addImm(0); + MI.addMemOperand(MMO); +} + +void AArch64InstrInfo::loadRegFromStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, + int FI, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); - MachineFunctionPass::getAnalysisUsage(AU); + unsigned Opc = 0; + bool Offset = true; + switch (RC->getSize()) { + case 1: + if (AArch64::FPR8RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRBui; + break; + case 2: + if (AArch64::FPR16RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRHui; + break; + case 4: + if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::LDRWui; + if (TargetRegisterInfo::isVirtualRegister(DestReg)) + MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); + else + assert(DestReg != AArch64::WSP); + } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRSui; + break; + case 8: + if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::LDRXui; + if (TargetRegisterInfo::isVirtualRegister(DestReg)) + MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); + else + assert(DestReg != AArch64::SP); + } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRDui; + break; + case 16: + if (AArch64::FPR128RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRQui; + else if (AArch64::DDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Twov1d, Offset = false; } - }; + break; + case 24: + if (AArch64::DDDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Threev1d, Offset = false; + } + break; + case 32: + if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Fourv1d, Offset = false; + } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Twov2d, Offset = false; + } + break; + case 48: + if (AArch64::QQQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Threev2d, Offset = false; + } + break; + case 64: + if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = 
AArch64::LD1Fourv2d, Offset = false; + } + break; + } + assert(Opc && "Unknown register class"); + + const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) + .addReg(DestReg, getDefRegState(true)) + .addFrameIndex(FI); + if (Offset) + MI.addImm(0); + MI.addMemOperand(MMO); +} + +void llvm::emitFrameOffset(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, int Offset, + const AArch64InstrInfo *TII, + MachineInstr::MIFlag Flag, bool SetNZCV) { + if (DestReg == SrcReg && Offset == 0) + return; + + bool isSub = Offset < 0; + if (isSub) + Offset = -Offset; + + // FIXME: If the offset won't fit in 24-bits, compute the offset into a + // scratch register. If DestReg is a virtual register, use it as the + // scratch register; otherwise, create a new virtual register (to be + // replaced by the scavenger at the end of PEI). That case can be optimized + // slightly if DestReg is SP which is always 16-byte aligned, so the scratch + // register can be loaded with offset%8 and the add/sub can use an extending + // instruction with LSL#3. + // Currently the function handles any offsets but generates a poor sequence + // of code. + // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); + + unsigned Opc; + if (SetNZCV) + Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri; + else + Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri; + const unsigned MaxEncoding = 0xfff; + const unsigned ShiftSize = 12; + const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; + while (((unsigned)Offset) >= (1 << ShiftSize)) { + unsigned ThisVal; + if (((unsigned)Offset) > MaxEncodableValue) { + ThisVal = MaxEncodableValue; + } else { + ThisVal = Offset & MaxEncodableValue; + } + assert((ThisVal >> ShiftSize) <= MaxEncoding && + "Encoding cannot handle value that big"); + BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + .addReg(SrcReg) + .addImm(ThisVal >> ShiftSize) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize)) + .setMIFlag(Flag); + + SrcReg = DestReg; + Offset -= ThisVal; + if (Offset == 0) + return; + } + BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + .addReg(SrcReg) + .addImm(Offset) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .setMIFlag(Flag); } -char LDTLSCleanup::ID = 0; -FunctionPass* -llvm::createAArch64CleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); } +MachineInstr * +AArch64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + int FrameIndex) const { + // This is a bit of a hack. Consider this instruction: + // + // %vreg0 = COPY %SP; GPR64all:%vreg0 + // + // We explicitly chose GPR64all for the virtual register so such a copy might + // be eliminated by RegisterCoalescer. However, that may not be possible, and + // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all + // register class, TargetInstrInfo::foldMemoryOperand() is going to try. + // + // To prevent that, we are going to constrain the %vreg0 register class here.
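Returning to the decomposition loop in emitFrameOffset above, a concrete trace (offset value illustrative): Offset = 0x12345 takes the ThisVal = Offset & MaxEncodableValue branch, emitting the shifted piece first and leaving 0x345 for the final unshifted instruction:

//   add x0, x29, #0x12, lsl #12   ; ThisVal == 0x12000, emitted in the loop
//   add x0, x0, #0x345            ; remainder, LSL #0, emitted after the loop
// Each ADD/SUB immediate encodes 12 bits plus an optional left shift by 12.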
+ // + // + // + if (MI->isCopy()) { + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + if (SrcReg == AArch64::SP && + TargetRegisterInfo::isVirtualRegister(DstReg)) { + MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); + return nullptr; + } + if (DstReg == AArch64::SP && + TargetRegisterInfo::isVirtualRegister(SrcReg)) { + MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); + return nullptr; + } + } + + // Cannot fold. + return nullptr; +} + +int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, + bool *OutUseUnscaledOp, + unsigned *OutUnscaledOp, + int *EmittableOffset) { + int Scale = 1; + bool IsSigned = false; + // The ImmIdx should be changed case by case if it is not 2. + unsigned ImmIdx = 2; + unsigned UnscaledOp = 0; + // Set output values in case of early exit. + if (EmittableOffset) + *EmittableOffset = 0; + if (OutUseUnscaledOp) + *OutUseUnscaledOp = false; + if (OutUnscaledOp) + *OutUnscaledOp = 0; + switch (MI.getOpcode()) { + default: + assert(0 && "unhandled opcode in rewriteAArch64FrameIndex"); + // Vector spills/fills can't take an immediate offset. + case AArch64::LD1Twov2d: + case AArch64::LD1Threev2d: + case AArch64::LD1Fourv2d: + case AArch64::LD1Twov1d: + case AArch64::LD1Threev1d: + case AArch64::LD1Fourv1d: + case AArch64::ST1Twov2d: + case AArch64::ST1Threev2d: + case AArch64::ST1Fourv2d: + case AArch64::ST1Twov1d: + case AArch64::ST1Threev1d: + case AArch64::ST1Fourv1d: + return AArch64FrameOffsetCannotUpdate; + case AArch64::PRFMui: + Scale = 8; + UnscaledOp = AArch64::PRFUMi; + break; + case AArch64::LDRXui: + Scale = 8; + UnscaledOp = AArch64::LDURXi; + break; + case AArch64::LDRWui: + Scale = 4; + UnscaledOp = AArch64::LDURWi; + break; + case AArch64::LDRBui: + Scale = 1; + UnscaledOp = AArch64::LDURBi; + break; + case AArch64::LDRHui: + Scale = 2; + UnscaledOp = AArch64::LDURHi; + break; + case AArch64::LDRSui: + Scale = 4; + UnscaledOp = AArch64::LDURSi; + break; + case AArch64::LDRDui: + Scale = 8; + UnscaledOp = AArch64::LDURDi; + break; + case AArch64::LDRQui: + Scale = 16; + UnscaledOp = AArch64::LDURQi; + break; + case AArch64::LDRBBui: + Scale = 1; + UnscaledOp = AArch64::LDURBBi; + break; + case AArch64::LDRHHui: + Scale = 2; + UnscaledOp = AArch64::LDURHHi; + break; + case AArch64::LDRSBXui: + Scale = 1; + UnscaledOp = AArch64::LDURSBXi; + break; + case AArch64::LDRSBWui: + Scale = 1; + UnscaledOp = AArch64::LDURSBWi; + break; + case AArch64::LDRSHXui: + Scale = 2; + UnscaledOp = AArch64::LDURSHXi; + break; + case AArch64::LDRSHWui: + Scale = 2; + UnscaledOp = AArch64::LDURSHWi; + break; + case AArch64::LDRSWui: + Scale = 4; + UnscaledOp = AArch64::LDURSWi; + break; + + case AArch64::STRXui: + Scale = 8; + UnscaledOp = AArch64::STURXi; + break; + case AArch64::STRWui: + Scale = 4; + UnscaledOp = AArch64::STURWi; + break; + case AArch64::STRBui: + Scale = 1; + UnscaledOp = AArch64::STURBi; + break; + case AArch64::STRHui: + Scale = 2; + UnscaledOp = AArch64::STURHi; + break; + case AArch64::STRSui: + Scale = 4; + UnscaledOp = AArch64::STURSi; + break; + case AArch64::STRDui: + Scale = 8; + UnscaledOp = AArch64::STURDi; + break; + case AArch64::STRQui: + Scale = 16; + UnscaledOp = AArch64::STURQi; + break; + case AArch64::STRBBui: + Scale = 1; + UnscaledOp = AArch64::STURBBi; + break; + case AArch64::STRHHui: + Scale = 2; + UnscaledOp = AArch64::STURHHi; + break; + + case AArch64::LDPXi: + case AArch64::LDPDi: + case AArch64::STPXi: + case 
AArch64::STPDi: + IsSigned = true; + Scale = 8; + break; + case AArch64::LDPQi: + case AArch64::STPQi: + IsSigned = true; + Scale = 16; + break; + case AArch64::LDPWi: + case AArch64::LDPSi: + case AArch64::STPWi: + case AArch64::STPSi: + IsSigned = true; + Scale = 4; + break; + + case AArch64::LDURXi: + case AArch64::LDURWi: + case AArch64::LDURBi: + case AArch64::LDURHi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBXi: + case AArch64::LDURSBWi: + case AArch64::LDURSHXi: + case AArch64::LDURSHWi: + case AArch64::LDURSWi: + case AArch64::STURXi: + case AArch64::STURWi: + case AArch64::STURBi: + case AArch64::STURHi: + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURBBi: + case AArch64::STURHHi: + Scale = 1; + break; + } + + Offset += MI.getOperand(ImmIdx).getImm() * Scale; + + bool useUnscaledOp = false; + // If the offset doesn't match the scale, we rewrite the instruction to + // use the unscaled instruction instead. Likewise, if we have a negative + // offset (and have an unscaled op to use). + if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0)) + useUnscaledOp = true; + + // Use an unscaled addressing mode if the instruction has a negative offset + // (or if the instruction is already using an unscaled addressing mode). + unsigned MaskBits; + if (IsSigned) { + // ldp/stp instructions. + MaskBits = 7; + Offset /= Scale; + } else if (UnscaledOp == 0 || useUnscaledOp) { + MaskBits = 9; + IsSigned = true; + Scale = 1; + } else { + MaskBits = 12; + IsSigned = false; + Offset /= Scale; + } + + // Attempt to fold address computation. + int MaxOff = (1 << (MaskBits - IsSigned)) - 1; + int MinOff = (IsSigned ? (-MaxOff - 1) : 0); + if (Offset >= MinOff && Offset <= MaxOff) { + if (EmittableOffset) + *EmittableOffset = Offset; + Offset = 0; + } else { + int NewOff = Offset < 0 ? MinOff : MaxOff; + if (EmittableOffset) + *EmittableOffset = NewOff; + Offset = (Offset - NewOff) * Scale; + } + if (OutUseUnscaledOp) + *OutUseUnscaledOp = useUnscaledOp; + if (OutUnscaledOp) + *OutUnscaledOp = UnscaledOp; + return AArch64FrameOffsetCanUpdate | + (Offset == 0 ? AArch64FrameOffsetIsLegal : 0); +} + +bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const AArch64InstrInfo *TII) { + unsigned Opcode = MI.getOpcode(); + unsigned ImmIdx = FrameRegIdx + 1; + + if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { + Offset += MI.getOperand(ImmIdx).getImm(); + emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), + MI.getOperand(0).getReg(), FrameReg, Offset, TII, + MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); + MI.eraseFromParent(); + Offset = 0; + return true; + } + + int NewOffset; + unsigned UnscaledOp; + bool UseUnscaledOp; + int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, + &UnscaledOp, &NewOffset); + if (Status & AArch64FrameOffsetCanUpdate) { + if (Status & AArch64FrameOffsetIsLegal) + // Replace the FrameIndex with FrameReg. 
+ MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + if (UseUnscaledOp) + MI.setDesc(TII->get(UnscaledOp)); + + MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); + return Offset == 0; + } + + return false; +} + +void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { + NopInst.setOpcode(AArch64::HINT); + NopInst.addOperand(MCOperand::CreateImm(0)); +} diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index ad20f9c..90ce75f 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -11,9 +11,10 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AARCH64INSTRINFO_H -#define LLVM_TARGET_AARCH64INSTRINFO_H +#ifndef LLVM_TARGET_AArch64INSTRINFO_H +#define LLVM_TARGET_AArch64INSTRINFO_H +#include "AArch64.h" #include "AArch64RegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" @@ -23,89 +24,208 @@ namespace llvm { class AArch64Subtarget; +class AArch64TargetMachine; class AArch64InstrInfo : public AArch64GenInstrInfo { + // Reserve bits in the MachineMemOperand target hint flags, starting at 1. + // They will be shifted into MOTargetStartBit when accessed. + enum TargetMemOperandFlags { + MOSuppressPair = 1 + }; + const AArch64RegisterInfo RI; const AArch64Subtarget &Subtarget; + public: - explicit AArch64InstrInfo(const AArch64Subtarget &TM); + explicit AArch64InstrInfo(const AArch64Subtarget &STI); /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). - /// - const TargetRegisterInfo &getRegisterInfo() const { return RI; } + const AArch64RegisterInfo &getRegisterInfo() const { return RI; } const AArch64Subtarget &getSubTarget() const { return Subtarget; } - void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const; - void CopyPhysRegTuple(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg) const; + unsigned GetInstSizeInBytes(const MachineInstr *MI) const; + + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, + unsigned &DstReg, unsigned &SubIdx) const override; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + + /// Returns true if there is a shiftable register and the shift value + /// is non-zero. + bool hasShiftedReg(const MachineInstr *MI) const; + + /// Returns true if there is an extendable register and the extending + /// value is non-zero. + bool hasExtendedReg(const MachineInstr *MI) const; + + /// \brief Does this instruction set its full destination register to zero? + bool isGPRZero(const MachineInstr *MI) const; + + /// \brief Does this instruction rename a GPR without modifying bits? + bool isGPRCopy(const MachineInstr *MI) const; + + /// \brief Does this instruction rename an FPR without modifying bits? + bool isFPRCopy(const MachineInstr *MI) const; + + /// Return true if this load/store scales or extends its register offset. + /// This refers to scaling a dynamic index as opposed to scaled immediates. + /// MI should be a memory op that allows scaled addressing.
+ bool isScaledAddr(const MachineInstr *MI) const; + + /// Return true if pairing the given load or store is hinted to be + /// unprofitable. + bool isLdStPairSuppressed(const MachineInstr *MI) const; + + /// Hint that pairing the given load or store is unprofitable. + void suppressLdStPair(MachineInstr *MI) const; + + bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const override; + + bool enableClusterLoads() const override { return true; } + + bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, + unsigned NumLoads) const override; + + bool shouldScheduleAdjacent(MachineInstr *First, + MachineInstr *Second) const override; + + MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, + uint64_t Offset, const MDNode *MDPtr, + DebugLoc DL) const; + void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, + bool KillSrc, unsigned Opcode, + llvm::ArrayRef<unsigned> Indices) const; + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, + MachineBasicBlock::iterator MBBI, unsigned SrcReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo *TRI) const override; + void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + MachineBasicBlock::iterator MBBI, unsigned DestReg, + int FrameIndex, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + MachineInstr * + foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + int FrameIndex) const override; bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify = false) const; + bool AllowModify = false) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, - DebugLoc DL) const; - unsigned RemoveBranch(MachineBasicBlock &MBB) const; - bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; - - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; - - /// Look through the instructions in this function and work out the largest - /// the stack frame can be while maintaining the ability to address local - /// slots with no complexities. - unsigned estimateRSStackLimit(MachineFunction &MF) const; - - /// getAddressConstraints - For loads and stores (and PRFMs) taking an - /// immediate offset, this function determines the constraints required for - /// the immediate.
It must satisfy: - /// + MinOffset <= imm <= MaxOffset - /// + imm % OffsetScale == 0 - void getAddressConstraints(const MachineInstr &MI, int &AccessScale, - int &MinOffset, int &MaxOffset) const; - - - unsigned getInstSizeInBytes(const MachineInstr &MI) const; - - unsigned getInstBundleLength(const MachineInstr &MI) const; - + DebugLoc DL) const override; + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + bool canInsertSelect(const MachineBasicBlock &, + const SmallVectorImpl<MachineOperand> &Cond, unsigned, + unsigned, int &, int &, int &) const override; + void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + DebugLoc DL, unsigned DstReg, + const SmallVectorImpl<MachineOperand> &Cond, + unsigned TrueReg, unsigned FalseReg) const override; + void getNoopForMachoTarget(MCInst &NopInst) const override; + + /// analyzeCompare - For a comparison instruction, return the source registers + /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. + /// Return true if the comparison instruction can be analyzed. + bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const override; + /// optimizeCompareInstr - Convert the instruction supplying the argument to + /// the comparison into one that sets the zero bit in the flags register. + bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + unsigned SrcReg2, int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const override; + +private: + void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, + MachineBasicBlock *TBB, + const SmallVectorImpl<MachineOperand> &Cond) const; }; -bool rewriteA64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, - const AArch64InstrInfo &TII); - +/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg +/// plus Offset. This is intended to be used from within the prolog/epilog +/// insertion (PEI) pass, where a virtual scratch register may be allocated +/// if necessary, to be replaced by the scavenger at the end of PEI. +void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset, + const AArch64InstrInfo *TII, + MachineInstr::MIFlag = MachineInstr::NoFlags, + bool SetNZCV = false); + +/// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the +/// FP. Return false if the offset could not be handled directly in MI, and +/// return the left-over portion by reference. +bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const AArch64InstrInfo *TII); + +/// \brief Used to report the frame offset status in isAArch64FrameOffsetLegal. +enum AArch64FrameOffsetStatus { + AArch64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply. + AArch64FrameOffsetIsLegal = 0x1, ///< Offset is legal. + AArch64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly. }; -void emitRegUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - DebugLoc dl, const TargetInstrInfo &TII, - unsigned DstReg, unsigned SrcReg, unsigned ScratchReg, - int64_t NumBytes, - MachineInstr::MIFlag MIFlags = MachineInstr::NoFlags); +/// \brief Check if the @p Offset is a valid frame offset for @p MI. +/// The returned value reports the validity of the frame offset for @p MI. +/// It uses the values defined by AArch64FrameOffsetStatus for that.
+/// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to +/// use an offset. +/// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be +/// rewritten in @p MI. +/// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the +/// amount that is beyond the limit of the legal offset. +/// If set, @p OutUseUnscaledOp will contain whether @p MI should be +/// turned into an unscaled operator, whose opcode is in @p OutUnscaledOp. +/// If set, @p EmittableOffset contains the amount that can be set in @p MI +/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that +/// is a legal offset. +int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, + bool *OutUseUnscaledOp = nullptr, + unsigned *OutUnscaledOp = nullptr, + int *EmittableOffset = nullptr); + +static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; } + +static inline bool isCondBranchOpcode(int Opc) { + switch (Opc) { + case AArch64::Bcc: + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + case AArch64::TBZW: + case AArch64::TBZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + return true; + default: + return false; + } +} -void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - DebugLoc dl, const TargetInstrInfo &TII, - unsigned ScratchReg, int64_t NumBytes, - MachineInstr::MIFlag MIFlags = MachineInstr::NoFlags); +static inline bool isIndirectBranchOpcode(int Opc) { return Opc == AArch64::BR; } -} +} // end namespace llvm #endif diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 7d7a641..9ad36e8 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -1,4 +1,4 @@ -//===----- AArch64InstrInfo.td - AArch64 Instruction Info ----*- tablegen -*-=// +//=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file describes the AArch64 scalar instructions in TableGen format. +// AArch64 Instruction definitions. // //===----------------------------------------------------------------------===// @@ -19,5368 +19,5266 @@ def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, def HasNEON : Predicate<"Subtarget->hasNEON()">, AssemblerPredicate<"FeatureNEON", "neon">; def HasCrypto : Predicate<"Subtarget->hasCrypto()">, - AssemblerPredicate<"FeatureCrypto","crypto">; - -// Use fused MAC if more precision in FP computation is allowed. -def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion ==" - " FPOpFusion::Fast)">; -include "AArch64InstrFormats.td" + AssemblerPredicate<"FeatureCrypto", "crypto">; +def HasCRC : Predicate<"Subtarget->hasCRC()">, + AssemblerPredicate<"FeatureCRC", "crc">; +def IsLE : Predicate<"Subtarget->isLittleEndian()">; +def IsBE : Predicate<"!Subtarget->isLittleEndian()">; //===----------------------------------------------------------------------===// -// AArch64 specific pattern fragments. +// AArch64-specific DAG Nodes. // -// An 'fmul' node with a single use.
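To make the isAArch64FrameOffsetLegal contract documented above concrete, here is a minimal caller-side sketch. This is hypothetical code, not part of the patch; it assumes the declarations from this header and only builds inside the LLVM tree:

  #include "AArch64InstrInfo.h"
  using namespace llvm;

  // Sketch: interpret the status bits returned by isAArch64FrameOffsetLegal.
  static void tryFoldFrameOffset(const MachineInstr &MI, int Offset) {
    bool UseUnscaledOp = false;
    unsigned UnscaledOp = 0;
    int EmittableOffset = 0;
    int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
                                           &UnscaledOp, &EmittableOffset);
    if (Status == AArch64FrameOffsetCannotUpdate)
      return; // MI cannot take an immediate offset at all.
    if (Status & AArch64FrameOffsetIsLegal) {
      // The whole offset can be folded into MI (switching to the unscaled
      // opcode UnscaledOp when UseUnscaledOp was set).
    } else if (Status & AArch64FrameOffsetCanUpdate) {
      // Only EmittableOffset fits into MI; Offset was updated to hold the
      // left-over part that still has to be materialized separately.
    }
  }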
-def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{ - return N->hasOneUse(); -}]>; - - -//===----------------------------------------------------------------------===// -// Target-specific ISD nodes and profiles -//===----------------------------------------------------------------------===// - -def SDT_A64ret : SDTypeProfile<0, 0, []>; -def A64ret : SDNode<"AArch64ISD::Ret", SDT_A64ret, [SDNPHasChain, - SDNPOptInGlue, - SDNPVariadic]>; - -// (ins NZCV, Condition, Dest) -def SDT_A64br_cc : SDTypeProfile<0, 3, [SDTCisVT<0, i32>]>; -def A64br_cc : SDNode<"AArch64ISD::BR_CC", SDT_A64br_cc, [SDNPHasChain]>; - -// (outs Result), (ins NZCV, IfTrue, IfFalse, Condition) -def SDT_A64select_cc : SDTypeProfile<1, 4, [SDTCisVT<1, i32>, - SDTCisSameAs<0, 2>, - SDTCisSameAs<2, 3>]>; -def A64select_cc : SDNode<"AArch64ISD::SELECT_CC", SDT_A64select_cc>; - -// (outs NZCV), (ins LHS, RHS, Condition) -def SDT_A64setcc : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, - SDTCisSameAs<1, 2>]>; -def A64setcc : SDNode<"AArch64ISD::SETCC", SDT_A64setcc>; - - -// (outs GPR64), (ins) -def A64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; - -// A64 compares don't care about the cond really (they set all flags) so a -// simple binary operator is useful. -def A64cmp : PatFrag<(ops node:$lhs, node:$rhs), - (A64setcc node:$lhs, node:$rhs, cond)>; - - -// When matching a notional (CMP op1, (sub 0, op2)), we'd like to use a CMN -// instruction on the grounds that "op1 - (-op2) == op1 + op2". However, the C -// and V flags can be set differently by this operation. It comes down to -// whether "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are -// then everything is fine. If not then the optimization is wrong. Thus general -// comparisons are only valid if op2 != 0. - -// So, finally, the only LLVM-native comparisons that don't mention C and V are -// SETEQ and SETNE. They're the only ones we can safely use CMN for in the -// absence of information about op2. -def equality_cond : PatLeaf<(cond), [{ - return N->get() == ISD::SETEQ || N->get() == ISD::SETNE; -}]>; - -def A64cmn : PatFrag<(ops node:$lhs, node:$rhs), - (A64setcc node:$lhs, (sub 0, node:$rhs), equality_cond)>; - -// There are two layers of indirection here, driven by the following -// considerations. -// + TableGen does not know CodeModel or Reloc so that decision should be -// made for a variable/address at ISelLowering. 
-// + The output of ISelLowering should be selectable (hence the Wrapper, -// rather than a bare target opcode) -def SDTAArch64WrapperLarge : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisSameAs<0, 3>, - SDTCisSameAs<0, 4>, - SDTCisPtrTy<0>]>; - -def A64WrapperLarge :SDNode<"AArch64ISD::WrapperLarge", SDTAArch64WrapperLarge>; - -def SDTAArch64WrapperSmall : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, - SDTCisSameAs<1, 2>, - SDTCisVT<3, i32>, - SDTCisPtrTy<0>]>; - -def A64WrapperSmall :SDNode<"AArch64ISD::WrapperSmall", SDTAArch64WrapperSmall>; - - -def SDTAArch64GOTLoad : SDTypeProfile<1, 1, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; -def A64GOTLoad : SDNode<"AArch64ISD::GOTLoad", SDTAArch64GOTLoad, - [SDNPHasChain]>; +// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS +def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS +def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<0>, + SDTCisVT<3, i32>]>; + +// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS +def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, + SDTCisVT<4, i32>]>; + +def SDT_AArch64Brcond : SDTypeProfile<0, 3, + [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; +def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; +def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisVT<2, OtherVT>]>; + + +def SDT_AArch64CSel : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<3>, + SDTCisVT<4, i32>]>; +def SDT_AArch64FCmp : SDTypeProfile<0, 2, + [SDTCisFP<0>, + SDTCisSameAs<0, 1>]>; +def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; +def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def SDT_AArch64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>; +def SDT_AArch64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; +def SDT_AArch64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisInt<2>, SDTCisInt<3>]>; +def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisInt<3>]>; +def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; + +def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>; +def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>; +def SDT_AArch64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; +def SDT_AArch64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>; +def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; +def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>; + +def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>; + +def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, + SDTCisPtrTy<1>]>; +def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4, + [SDTCisVT<0, i64>, SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, + SDTCisSameAs<1, 4>]>; + + +// Node 
definitions. +def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>; +def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>; +def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>; +def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", + SDCallSeqStart<[ SDTCisVT<0, i32> ]>, + [SDNPHasChain, SDNPOutGlue]>; +def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", + SDCallSeqEnd<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64call : SDNode<"AArch64ISD::CALL", + SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def AArch64brcond : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond, + [SDNPHasChain]>; +def AArch64cbz : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz, + [SDNPHasChain]>; +def AArch64cbnz : SDNode<"AArch64ISD::CBNZ", SDT_AArch64cbz, + [SDNPHasChain]>; +def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz, + [SDNPHasChain]>; +def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz, + [SDNPHasChain]>; + + +def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>; +def AArch64csinv : SDNode<"AArch64ISD::CSINV", SDT_AArch64CSel>; +def AArch64csneg : SDNode<"AArch64ISD::CSNEG", SDT_AArch64CSel>; +def AArch64csinc : SDNode<"AArch64ISD::CSINC", SDT_AArch64CSel>; +def AArch64retflag : SDNode<"AArch64ISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def AArch64adc : SDNode<"AArch64ISD::ADC", SDTBinaryArithWithFlagsIn >; +def AArch64sbc : SDNode<"AArch64ISD::SBC", SDTBinaryArithWithFlagsIn>; +def AArch64add_flag : SDNode<"AArch64ISD::ADDS", SDTBinaryArithWithFlagsOut, + [SDNPCommutative]>; +def AArch64sub_flag : SDNode<"AArch64ISD::SUBS", SDTBinaryArithWithFlagsOut>; +def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut, + [SDNPCommutative]>; +def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; +def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; + +def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; + +def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; + +def AArch64fmax : SDNode<"AArch64ISD::FMAX", SDTFPBinOp>; +def AArch64fmin : SDNode<"AArch64ISD::FMIN", SDTFPBinOp>; + +def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>; +def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>; +def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>; +def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>; +def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>; + +def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>; +def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>; +def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>; +def AArch64uzp2 : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>; +def AArch64trn1 : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>; +def AArch64trn2 : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>; + +def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>; +def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>; +def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>; +def AArch64mvni_shift : SDNode<"AArch64ISD::MVNIshift", SDT_AArch64MOVIshift>; +def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>; +def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>; +def AArch64fmov : SDNode<"AArch64ISD::FMOV", 
SDT_AArch64MOVIedit>; + +def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>; +def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>; +def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>; +def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>; + +def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>; +def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>; +def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>; +def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>; +def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>; +def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>; +def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>; +def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>; + +def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>; +def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>; +def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>; + +def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>; +def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>; +def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>; +def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>; +def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>; + +def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>; +def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>; +def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>; + +def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>; +def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>; +def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>; +def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>; +def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>; +def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS), + (AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>; + +def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>; +def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>; +def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>; +def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>; +def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>; + +def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>; +def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>; + +def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>; + +def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def AArch64Prefetch : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH, + [SDNPHasChain, SDNPSideEffect]>; + +def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>; +def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>; + +def AArch64tlsdesc_call : SDNode<"AArch64ISD::TLSDESC_CALL", + SDT_AArch64TLSDescCall, + [SDNPInGlue, SDNPOutGlue, SDNPHasChain, + SDNPVariadic]>; + +def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge", + SDT_AArch64WrapperLarge>; -// (A64BFI LHS, RHS, LSB, Width) -def SDTA64BFI : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, - SDTCisSameAs<1, 2>, - SDTCisVT<3, i64>, - SDTCisVT<4, i64>]>; -def A64Bfi : SDNode<"AArch64ISD::BFI", SDTA64BFI>; +//===----------------------------------------------------------------------===// -// (A64EXTR HiReg, LoReg, LSB) -def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, 
SDTCisSameAs<1, 2>, - SDTCisVT<3, i64>]>; -def A64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>; +//===----------------------------------------------------------------------===// -// (A64[SU]BFX Field, ImmR, ImmS). +// AArch64 Instruction Predicate Definitions. // -// Note that ImmR and ImmS are already encoded for the actual instructions. The -// more natural LSB and Width mix together to form ImmR and ImmS, something -// which TableGen can't handle. -def SDTA64BFX : SDTypeProfile<1, 3, [SDTCisVT<2, i64>, SDTCisVT<3, i64>]>; -def A64Sbfx : SDNode<"AArch64ISD::SBFX", SDTA64BFX>; +def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; +def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; +def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; +def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; +def ForCodeSize : Predicate<"ForCodeSize">; +def NotForCodeSize : Predicate<"!ForCodeSize">; -def A64Ubfx : SDNode<"AArch64ISD::UBFX", SDTA64BFX>; +include "AArch64InstrFormats.td" -class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; +//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Call sequence pseudo-instructions +// Miscellaneous instructions. //===----------------------------------------------------------------------===// +let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), + [(AArch64callseq_start timm:$amt)]>; +def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(AArch64callseq_end timm:$amt1, timm:$amt2)]>; +} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 + +let isReMaterializable = 1, isCodeGenOnly = 1 in { +// FIXME: The following pseudo instructions are only needed because remat +// cannot handle multiple instructions. When that changes, they can be +// removed, along with the AArch64Wrapper node. + +let AddedComplexity = 10 in +def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr), + [(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>, + Sched<[WriteLDAdr]>; + +// The MOVaddr instruction should match only when the add is not folded +// into a load or store address. 
+def MOVaddr + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi), + tglobaladdr:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrJT + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi), + tjumptable:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrCP + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi), + tconstpool:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrBA + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi), + tblockaddress:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrTLS + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi), + tglobaltlsaddr:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrEXT + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi), + texternalsym:$low))]>, + Sched<[WriteAdrAdr]>; + +} // isReMaterializable, isCodeGenOnly + +def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr), + (LOADgot tglobaltlsaddr:$addr)>; + +def : Pat<(AArch64LOADgot texternalsym:$addr), + (LOADgot texternalsym:$addr)>; + +def : Pat<(AArch64LOADgot tconstpool:$addr), + (LOADgot tconstpool:$addr)>; -def SDT_AArch64Call : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; -def AArch64Call : SDNode<"AArch64ISD::Call", SDT_AArch64Call, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; - -def AArch64tcret : SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64Call, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; - -// The TLSDESCCALL node is a variant call which goes to an indirectly calculated -// destination but needs a relocation against a fixed symbol. As such it has two -// certain operands: the callee and the relocated variable. -// -// The TLS ABI only allows it to be selected to a BLR instruction (with -// appropriate relocation). -def SDTTLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; - -def A64tlsdesc_blr : SDNode<"AArch64ISD::TLSDESCCALL", SDTTLSDescCall, - [SDNPInGlue, SDNPOutGlue, SDNPHasChain, - SDNPVariadic]>; - - -def SDT_AArch64CallSeqStart : SDCallSeqStart<[ SDTCisPtrTy<0> ]>; -def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_AArch64CallSeqStart, - [SDNPHasChain, SDNPOutGlue]>; - -def SDT_AArch64CallSeqEnd : SDCallSeqEnd<[ SDTCisPtrTy<0>, SDTCisPtrTy<1> ]>; -def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_AArch64CallSeqEnd, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; - - +//===----------------------------------------------------------------------===// +// System instructions. +//===----------------------------------------------------------------------===// -// These pseudo-instructions have special semantics by virtue of being passed to -// the InstrInfo constructor. CALLSEQ_START/CALLSEQ_END are produced by -// LowerCall to (in our case) tell the back-end about stack adjustments for -// arguments passed on the stack. Here we select those markers to -// pseudo-instructions which explicitly set the stack, and finally in the -// RegisterInfo we convert them to a true stack adjustment.
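As an aside on the MOVaddr pseudos above: ADRP materializes (roughly) the 4KiB page of the target address, and the ADDlow/:lo12: add supplies the remaining low 12 bits. A self-contained sketch of that split, with a made-up address:

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint64_t Addr = 0x0000123456789abcULL;   // stand-in for a symbol address
    uint64_t Page = Addr & ~uint64_t(0xfff); // what the ADRP contributes
    uint64_t Lo12 = Addr & 0xfff;            // what the :lo12: ADD contributes
    printf("adrp page = %#llx, lo12 = %#llx\n",
           (unsigned long long)Page, (unsigned long long)Lo12);
    return (Page | Lo12) == Addr ? 0 : 1;    // the pair recovers the address
  }

(ADRP is really PC-relative, producing a page delta from the current PC; the absolute split shown here is only the arithmetic idea.)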
-let Defs = [XSP], Uses = [XSP] in { - def ADJCALLSTACKDOWN : PseudoInst<(outs), (ins i64imm:$amt), - [(AArch64callseq_start timm:$amt)]>; +def HINT : HintI<"hint">; +def : InstAlias<"nop", (HINT 0b000)>; +def : InstAlias<"yield",(HINT 0b001)>; +def : InstAlias<"wfe", (HINT 0b010)>; +def : InstAlias<"wfi", (HINT 0b011)>; +def : InstAlias<"sev", (HINT 0b100)>; +def : InstAlias<"sevl", (HINT 0b101)>; - def ADJCALLSTACKUP : PseudoInst<(outs), (ins i64imm:$amt1, i64imm:$amt2), - [(AArch64callseq_end timm:$amt1, timm:$amt2)]>; -} + // As far as LLVM is concerned this writes to the system's exclusive monitors. +let mayLoad = 1, mayStore = 1 in +def CLREX : CRmSystemI; -//===----------------------------------------------------------------------===// -// Atomic operation pseudo-instructions -//===----------------------------------------------------------------------===// +def DMB : CRmSystemI; +def DSB : CRmSystemI; +def ISB : CRmSystemI; +def : InstAlias<"clrex", (CLREX 0xf)>; +def : InstAlias<"isb", (ISB 0xf)>; -// These get selected from C++ code as a pretty much direct translation from the -// generic DAG nodes. The one exception is the AtomicOrdering is added as an -// operand so that the eventual lowering can make use of it and choose -// acquire/release operations when required. - -let usesCustomInserter = 1, hasCtrlDep = 1, mayLoad = 1, mayStore = 1 in { -multiclass AtomicSizes { - def _I8 : PseudoInst<(outs GPR32:$dst), - (ins GPR64xsp:$ptr, GPR32:$incr, i32imm:$ordering), []>; - def _I16 : PseudoInst<(outs GPR32:$dst), - (ins GPR64xsp:$ptr, GPR32:$incr, i32imm:$ordering), []>; - def _I32 : PseudoInst<(outs GPR32:$dst), - (ins GPR64xsp:$ptr, GPR32:$incr, i32imm:$ordering), []>; - def _I64 : PseudoInst<(outs GPR64:$dst), - (ins GPR64xsp:$ptr, GPR64:$incr, i32imm:$ordering), []>; -} -} +def MRS : MRSI; +def MSR : MSRI; +def MSRpstate: MSRpstateI; -defm ATOMIC_LOAD_ADD : AtomicSizes; -defm ATOMIC_LOAD_SUB : AtomicSizes; -defm ATOMIC_LOAD_AND : AtomicSizes; -defm ATOMIC_LOAD_OR : AtomicSizes; -defm ATOMIC_LOAD_XOR : AtomicSizes; -defm ATOMIC_LOAD_NAND : AtomicSizes; -defm ATOMIC_SWAP : AtomicSizes; -let Defs = [NZCV] in { - // These operations need a CMP to calculate the correct value - defm ATOMIC_LOAD_MIN : AtomicSizes; - defm ATOMIC_LOAD_MAX : AtomicSizes; - defm ATOMIC_LOAD_UMIN : AtomicSizes; - defm ATOMIC_LOAD_UMAX : AtomicSizes; -} +// The thread pointer (on Linux, at least, where this has been implemented) is +// TPIDR_EL0. +def : Pat<(AArch64threadpointer), (MRS 0xde82)>; -class AtomicCmpSwap - : PseudoInst<(outs GPRData:$dst), - (ins GPR64xsp:$ptr, GPRData:$old, GPRData:$new, - i32imm:$ordering), []> { - let usesCustomInserter = 1; - let hasCtrlDep = 1; - let mayLoad = 1; - let mayStore = 1; - let Defs = [NZCV]; -} +// Generic system instructions +def SYSxt : SystemXtI<0, "sys">; +def SYSLxt : SystemLXtI<1, "sysl">; -def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap; -def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap; -def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap; -def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap; +def : InstAlias<"sys $op1, $Cn, $Cm, $op2", + (SYSxt imm0_7:$op1, sys_cr_op:$Cn, + sys_cr_op:$Cm, imm0_7:$op2, XZR)>; //===----------------------------------------------------------------------===// -// Add-subtract (extended register) instructions +// Move immediate instructions. 
//===----------------------------------------------------------------------===// -// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP - -// The RHS of these operations is conceptually a sign/zero-extended -// register, optionally shifted left by 1-4. The extension can be a -// NOP (e.g. "sxtx" sign-extending a 64-bit register to 64-bits) but -// must be specified with one exception: - -// If one of the registers is sp/wsp then LSL is an alias for UXTW in -// 32-bit instructions and UXTX in 64-bit versions, the shift amount -// is not optional in that case (but can explicitly be 0), and the -// entire suffix can be skipped (e.g. "add sp, x3, x2"). - -multiclass extend_operands { - def _asmoperand : AsmOperandClass { - let Name = PREFIX; - let RenderMethod = "addRegExtendOperands"; - let PredicateMethod = "isRegExtend"; - let DiagnosticType = "AddSubRegExtend" # Diag; - } - - def _operand : Operand, - ImmLeaf= 0 && Imm <= 4; }]> { - let PrintMethod = "printRegExtendOperand"; - let DecoderMethod = "DecodeRegExtendOperand"; - let ParserMatchClass = !cast(PREFIX # "_asmoperand"); - } -} -defm UXTB : extend_operands<"UXTB", "Small">; -defm UXTH : extend_operands<"UXTH", "Small">; -defm UXTW : extend_operands<"UXTW", "Small">; -defm UXTX : extend_operands<"UXTX", "Large">; -defm SXTB : extend_operands<"SXTB", "Small">; -defm SXTH : extend_operands<"SXTH", "Small">; -defm SXTW : extend_operands<"SXTW", "Small">; -defm SXTX : extend_operands<"SXTX", "Large">; - -def LSL_extasmoperand : AsmOperandClass { - let Name = "RegExtendLSL"; - let RenderMethod = "addRegExtendOperands"; - let DiagnosticType = "AddSubRegExtendLarge"; -} +defm MOVK : InsertImmediate<0b11, "movk">; +defm MOVN : MoveImmediate<0b00, "movn">; -def LSL_extoperand : Operand { - let ParserMatchClass = LSL_extasmoperand; -} +let PostEncoderMethod = "fixMOVZ" in +defm MOVZ : MoveImmediate<0b10, "movz">; +// First group of aliases covers an implicit "lsl #0". +def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>; -// The patterns for various sign-extensions are a little ugly and -// non-uniform because everything has already been promoted to the -// legal i64 and i32 types. We'll wrap the various variants up in a -// class for use later. -class extend_types { - dag uxtb; dag uxth; dag uxtw; dag uxtx; - dag sxtb; dag sxth; dag sxtw; dag sxtx; - ValueType ty; - RegisterClass GPR; -} +// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax. 
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; -def extends_to_i64 : extend_types { - let uxtb = (and (anyext i32:$Rm), 255); - let uxth = (and (anyext i32:$Rm), 65535); - let uxtw = (zext i32:$Rm); - let uxtx = (i64 $Rm); +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; - let sxtb = (sext_inreg (anyext i32:$Rm), i8); - let sxth = (sext_inreg (anyext i32:$Rm), i16); - let sxtw = (sext i32:$Rm); - let sxtx = (i64 $Rm); +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>; - let ty = i64; - let GPR = GPR64xsp; -} +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; -def extends_to_i32 : extend_types { - let uxtb = (and i32:$Rm, 255); - let uxth = (and i32:$Rm, 65535); - let uxtw = (i32 i32:$Rm); - let uxtx = (i32 i32:$Rm); +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>; - let sxtb = (sext_inreg i32:$Rm, i8); - let sxth = (sext_inreg i32:$Rm, i16); - let sxtw = (i32 i32:$Rm); - let sxtx = (i32 i32:$Rm); +// Final group of aliases covers true "mov $Rd, $imm" cases. +multiclass movw_mov_alias { + def _asmoperand : AsmOperandClass { + let Name = basename # width # "_lsl" # shift # "MovAlias"; + let PredicateMethod = "is" # basename # "MovAlias<" # width # ", " + # shift # ">"; + let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">"; + } - let ty = i32; - let GPR = GPR32wsp; -} + def _movimm : Operand { + let ParserMatchClass = !cast(NAME # "_asmoperand"); + } -// Now, six of the extensions supported are easy and uniform: if the source size -// is 32-bits or less, then Rm is always a 32-bit register. We'll instantiate -// those instructions in one block. - -// The uxtx/sxtx could potentially be merged in, but three facts dissuaded me: -// + It would break the naming scheme: either ADDxx_uxtx or ADDww_uxtx would -// be impossible. -// + Patterns are very different as well. -// + Passing different registers would be ugly (more fields in extend_types -// would probably be the best option). 
-multiclass addsub_exts { - def w_uxtb : A64I_addsubext, - Sched<[WriteALU, ReadALU, ReadALU]>; - def w_uxth : A64I_addsubext, - Sched<[WriteALU, ReadALU, ReadALU]>; - def w_uxtw : A64I_addsubext, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def w_sxtb : A64I_addsubext, - Sched<[WriteALU, ReadALU, ReadALU]>; - def w_sxth : A64I_addsubext, - Sched<[WriteALU, ReadALU, ReadALU]>; - def w_sxtw : A64I_addsubext, - Sched<[WriteALU, ReadALU, ReadALU]>; -} + def : InstAlias<"mov $Rd, $imm", + (INST GPR:$Rd, !cast(NAME # "_movimm"):$imm, shift)>; +} + +defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>; +defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>; + +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>; + +defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>; +defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>; + +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>; + +let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1, + isAsCheapAsAMove = 1 in { +// FIXME: The following pseudo instructions are only needed because remat +// cannot handle multiple instructions. When that changes, we can select +// directly to the real instructions and get rid of these pseudos. + +def MOVi32imm + : Pseudo<(outs GPR32:$dst), (ins i32imm:$src), + [(set GPR32:$dst, imm:$src)]>, + Sched<[WriteImm]>; +def MOVi64imm + : Pseudo<(outs GPR64:$dst), (ins i64imm:$src), + [(set GPR64:$dst, imm:$src)]>, + Sched<[WriteImm]>; +} // isReMaterializable, isCodeGenOnly + +// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the +// eventual expansion code fewer bits to worry about getting right. Marshalling +// the types is a little tricky though: +def i64imm_32bit : ImmLeaf(Imm); +}]>; -// These two could be merge in with the above, but their patterns aren't really -// necessary and the naming-scheme would necessarily break: -multiclass addsub_xxtx { - def x_uxtx : A64I_addsubext<0b1, op, S, 0b00, 0b011, - outs, - (ins GPR64xsp:$Rn, GPR64:$Rm, UXTX_operand:$Imm3), - !strconcat(asmop, "$Rn, $Rm, $Imm3"), - [(opfrag i64:$Rn, (shl i64:$Rm, UXTX_operand:$Imm3))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def x_sxtx : A64I_addsubext<0b1, op, S, 0b00, 0b111, - outs, - (ins GPR64xsp:$Rn, GPR64:$Rm, SXTX_operand:$Imm3), - !strconcat(asmop, "$Rn, $Rm, $Imm3"), - [/* No Pattern: same as uxtx */], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; -} +def trunc_imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), MVT::i32); +}]>; -multiclass addsub_wxtx { - def w_uxtx : A64I_addsubext<0b0, op, S, 0b00, 0b011, - outs, (ins GPR32wsp:$Rn, GPR32:$Rm, UXTX_operand:$Imm3), - !strconcat(asmop, "$Rn, $Rm, $Imm3"), - [/* No pattern: probably same as uxtw */], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def w_sxtx : A64I_addsubext<0b0, op, S, 0b00, 0b111, - outs, (ins GPR32wsp:$Rn, GPR32:$Rm, SXTX_operand:$Imm3), - !strconcat(asmop, "$Rn, $Rm, $Imm3"), - [/* No Pattern: probably same as uxtw */], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; -} +def : Pat<(i64 i64imm_32bit:$src), + (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; + +// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK +// sequences. 
+def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, + tglobaladdr:$g1, tglobaladdr:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48), + tglobaladdr:$g2, 32), + tglobaladdr:$g1, 16), + tglobaladdr:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2, + tblockaddress:$g1, tblockaddress:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48), + tblockaddress:$g2, 32), + tblockaddress:$g1, 16), + tblockaddress:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2, + tconstpool:$g1, tconstpool:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48), + tconstpool:$g2, 32), + tconstpool:$g1, 16), + tconstpool:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2, + tjumptable:$g1, tjumptable:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48), + tjumptable:$g2, 32), + tjumptable:$g1, 16), + tjumptable:$g0, 0)>; -class SetRD - : PatFrag<(ops node:$lhs, node:$rhs), (set RC:$Rd, (op node:$lhs, node:$rhs))>; -class SetNZCV - : PatFrag<(ops node:$lhs, node:$rhs), (set NZCV, (op node:$lhs, node:$rhs))>; - -defm ADDxx :addsub_exts<0b1, 0b0, 0b0, "add\t$Rd, ", SetRD, - (outs GPR64xsp:$Rd), extends_to_i64>, - addsub_xxtx< 0b0, 0b0, "add\t$Rd, ", SetRD, - (outs GPR64xsp:$Rd)>; -defm ADDww :addsub_exts<0b0, 0b0, 0b0, "add\t$Rd, ", SetRD, - (outs GPR32wsp:$Rd), extends_to_i32>, - addsub_wxtx< 0b0, 0b0, "add\t$Rd, ", - (outs GPR32wsp:$Rd)>; -defm SUBxx :addsub_exts<0b1, 0b1, 0b0, "sub\t$Rd, ", SetRD, - (outs GPR64xsp:$Rd), extends_to_i64>, - addsub_xxtx< 0b1, 0b0, "sub\t$Rd, ", SetRD, - (outs GPR64xsp:$Rd)>; -defm SUBww :addsub_exts<0b0, 0b1, 0b0, "sub\t$Rd, ", SetRD, - (outs GPR32wsp:$Rd), extends_to_i32>, - addsub_wxtx< 0b1, 0b0, "sub\t$Rd, ", - (outs GPR32wsp:$Rd)>; - -let Defs = [NZCV] in { -defm ADDSxx :addsub_exts<0b1, 0b0, 0b1, "adds\t$Rd, ", SetRD, - (outs GPR64:$Rd), extends_to_i64>, - addsub_xxtx< 0b0, 0b1, "adds\t$Rd, ", SetRD, - (outs GPR64:$Rd)>; -defm ADDSww :addsub_exts<0b0, 0b0, 0b1, "adds\t$Rd, ", SetRD, - (outs GPR32:$Rd), extends_to_i32>, - addsub_wxtx< 0b0, 0b1, "adds\t$Rd, ", - (outs GPR32:$Rd)>; -defm SUBSxx :addsub_exts<0b1, 0b1, 0b1, "subs\t$Rd, ", SetRD, - (outs GPR64:$Rd), extends_to_i64>, - addsub_xxtx< 0b1, 0b1, "subs\t$Rd, ", SetRD, - (outs GPR64:$Rd)>; -defm SUBSww :addsub_exts<0b0, 0b1, 0b1, "subs\t$Rd, ", SetRD, - (outs GPR32:$Rd), extends_to_i32>, - addsub_wxtx< 0b1, 0b1, "subs\t$Rd, ", - (outs GPR32:$Rd)>; - - -let SchedRW = [WriteCMP, ReadCMP, ReadCMP], Rd = 0b11111, isCompare = 1 in { -defm CMNx : addsub_exts<0b1, 0b0, 0b1, "cmn\t", SetNZCV, - (outs), extends_to_i64>, - addsub_xxtx< 0b0, 0b1, "cmn\t", SetNZCV, (outs)>; -defm CMNw : addsub_exts<0b0, 0b0, 0b1, "cmn\t", SetNZCV, - (outs), extends_to_i32>, - addsub_wxtx< 0b0, 0b1, "cmn\t", (outs)>; -defm CMPx : addsub_exts<0b1, 0b1, 0b1, "cmp\t", SetNZCV, - (outs), extends_to_i64>, - addsub_xxtx< 0b1, 0b1, "cmp\t", SetNZCV, (outs)>; -defm CMPw : addsub_exts<0b0, 0b1, 0b1, "cmp\t", SetNZCV, - (outs), extends_to_i32>, - addsub_wxtx< 0b1, 0b1, "cmp\t", (outs)>; -} -} -// Now patterns for the operation without a shift being needed. No patterns are -// created for uxtx/sxtx since they're non-uniform and it's expected that -// add/sub (shifted register) will handle those cases anyway. 
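The AArch64WrapperLarge patterns above build a 64-bit address out of four 16-bit chunks: a MOVZ for g3 (lsl #48) followed by three MOVKs for g2..g0. A standalone sketch of that chunking (illustrative values only, not code from the patch):

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint64_t Addr = 0x1122334455667788ULL; // stand-in for the symbol address
    // g3 goes into the initial MOVZ; g2..g0 are merged in by the MOVK chain.
    uint64_t Acc = 0;
    for (int Shift = 48; Shift >= 0; Shift -= 16) {
      uint64_t Chunk = (Addr >> Shift) & 0xffff;
      Acc |= Chunk << Shift; // what MOVZ/MOVK contributes at this step
      printf("g%d = %#6llx, lsl #%d\n", Shift / 16,
             (unsigned long long)Chunk, Shift);
    }
    return Acc == Addr ? 0 : 1;
  }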
-multiclass addsubext_noshift_patterns { - def : Pat<(nodeop exts.ty:$Rn, exts.uxtb), - (!cast(prefix # "w_uxtb") $Rn, $Rm, 0)>; - def : Pat<(nodeop exts.ty:$Rn, exts.uxth), - (!cast(prefix # "w_uxth") $Rn, $Rm, 0)>; - def : Pat<(nodeop exts.ty:$Rn, exts.uxtw), - (!cast(prefix # "w_uxtw") $Rn, $Rm, 0)>; - - def : Pat<(nodeop exts.ty:$Rn, exts.sxtb), - (!cast(prefix # "w_sxtb") $Rn, $Rm, 0)>; - def : Pat<(nodeop exts.ty:$Rn, exts.sxth), - (!cast(prefix # "w_sxth") $Rn, $Rm, 0)>; - def : Pat<(nodeop exts.ty:$Rn, exts.sxtw), - (!cast(prefix # "w_sxtw") $Rn, $Rm, 0)>; -} +//===----------------------------------------------------------------------===// +// Arithmetic instructions. +//===----------------------------------------------------------------------===// -defm : addsubext_noshift_patterns<"ADDxx", add, extends_to_i64>; -defm : addsubext_noshift_patterns<"ADDww", add, extends_to_i32>; -defm : addsubext_noshift_patterns<"SUBxx", sub, extends_to_i64>; -defm : addsubext_noshift_patterns<"SUBww", sub, extends_to_i32>; +// Add/subtract with carry. +defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>; +defm SBC : AddSubCarry<1, "sbc", "sbcs", AArch64sbc, AArch64sbc_flag>; + +def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>; +def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>; +def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>; +def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>; + +// Add/subtract +defm ADD : AddSub<0, "add", add>; +defm SUB : AddSub<1, "sub">; + +def : InstAlias<"mov $dst, $src", + (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>; + +defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn">; +defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp">; + +// Use SUBS instead of SUB to enable CSE between SUBS and SUB. +def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm), + (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>; +def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm), + (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>; +def : Pat<(sub GPR32:$Rn, GPR32:$Rm), + (SUBSWrr GPR32:$Rn, GPR32:$Rm)>; +def : Pat<(sub GPR64:$Rn, GPR64:$Rm), + (SUBSXrr GPR64:$Rn, GPR64:$Rm)>; +def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), + (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>; +def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), + (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; +def : Pat<(sub GPR32sp:$R2, arith_extended_reg32:$R3), + (SUBSWrx GPR32sp:$R2, arith_extended_reg32:$R3)>; +def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64:$R3), + (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64:$R3)>; + +// Because of the immediate format for add/sub-imm instructions, the +// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). +// These patterns capture that transformation. 
+let AddedComplexity = 1 in { +def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +} + +// Because of the immediate format for add/sub-imm instructions, the +// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). +// These patterns capture that transformation. +let AddedComplexity = 1 in { +def : Pat<(AArch64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(AArch64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +def : Pat<(AArch64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +} + +def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; +def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; +def : InstAlias<"neg $dst, $src$shift", + (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; +def : InstAlias<"neg $dst, $src$shift", + (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; + +def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; +def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; +def : InstAlias<"negs $dst, $src$shift", + (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; +def : InstAlias<"negs $dst, $src$shift", + (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; + + +// Unsigned/Signed divide +defm UDIV : Div<0, "udiv", udiv>; +defm SDIV : Div<1, "sdiv", sdiv>; +let isCodeGenOnly = 1 in { +defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>; +defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>; +} + +// Variable shift +defm ASRV : Shift<0b10, "asr", sra>; +defm LSLV : Shift<0b00, "lsl", shl>; +defm LSRV : Shift<0b01, "lsr", srl>; +defm RORV : Shift<0b11, "ror", rotr>; + +def : ShiftAlias<"asrv", ASRVWr, GPR32>; +def : ShiftAlias<"asrv", ASRVXr, GPR64>; +def : ShiftAlias<"lslv", LSLVWr, GPR32>; +def : ShiftAlias<"lslv", LSLVXr, GPR64>; +def : ShiftAlias<"lsrv", LSRVWr, GPR32>; +def : ShiftAlias<"lsrv", LSRVXr, GPR64>; +def : ShiftAlias<"rorv", RORVWr, GPR32>; +def : ShiftAlias<"rorv", RORVXr, GPR64>; + +// Multiply-add +let AddedComplexity = 7 in { +defm MADD : MulAccum<0, "madd", add>; +defm MSUB : MulAccum<1, "msub", sub>; + +def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)), + (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; +def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)), + (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; + +def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))), + (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; +def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))), + (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; +} // AddedComplexity = 7 + +let AddedComplexity = 5 in { +def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>; +def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>; +def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>; +def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>; + +def : Pat<(i64 (mul (sext GPR32:$Rn), 
(sext GPR32:$Rm))), + (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))), + (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; + +def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))), + (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))), + (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +} // AddedComplexity = 5 + +def : MulAccumWAlias<"mul", MADDWrrr>; +def : MulAccumXAlias<"mul", MADDXrrr>; +def : MulAccumWAlias<"mneg", MSUBWrrr>; +def : MulAccumXAlias<"mneg", MSUBXrrr>; +def : WideMulAccumAlias<"smull", SMADDLrrr>; +def : WideMulAccumAlias<"smnegl", SMSUBLrrr>; +def : WideMulAccumAlias<"umull", UMADDLrrr>; +def : WideMulAccumAlias<"umnegl", UMSUBLrrr>; + +// Multiply-high +def SMULHrr : MulHi<0b010, "smulh", mulhs>; +def UMULHrr : MulHi<0b110, "umulh", mulhu>; + +// CRC32 +def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_aarch64_crc32b, "crc32b">; +def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_aarch64_crc32h, "crc32h">; +def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_aarch64_crc32w, "crc32w">; +def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_aarch64_crc32x, "crc32x">; + +def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_aarch64_crc32cb, "crc32cb">; +def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">; +def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">; +def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">; -defm : addsubext_noshift_patterns<"CMNx", A64cmn, extends_to_i64>; -defm : addsubext_noshift_patterns<"CMNw", A64cmn, extends_to_i32>; -defm : addsubext_noshift_patterns<"CMPx", A64cmp, extends_to_i64>; -defm : addsubext_noshift_patterns<"CMPw", A64cmp, extends_to_i32>; -// An extend of "lsl #imm" is valid if and only if one of Rn and Rd is -// sp/wsp. It is synonymous with uxtx/uxtw depending on the size of the -// operation. Also permitted in this case is complete omission of the argument, -// which implies "lsl #0". -multiclass lsl_aliases { - def : InstAlias; +//===----------------------------------------------------------------------===// +// Logical instructions. +//===----------------------------------------------------------------------===// - def : InstAlias; +// (immediate) +defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag>; +defm AND : LogicalImm<0b00, "and", and>; +defm EOR : LogicalImm<0b10, "eor", xor>; +defm ORR : LogicalImm<0b01, "orr", or>; + +// FIXME: these aliases *are* canonical sometimes (when movz can't be +// used). Actually, it seems to be working right now, but putting logical_immXX +// here is a bit dodgy on the AsmParser side too. 
+def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR, + logical_imm32:$imm), 0>; +def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR, + logical_imm64:$imm), 0>; + + +// (register) +defm ANDS : LogicalRegS<0b11, 0, "ands", AArch64and_flag>; +defm BICS : LogicalRegS<0b11, 1, "bics", + BinOpFrag<(AArch64and_flag node:$LHS, (not node:$RHS))>>; +defm AND : LogicalReg<0b00, 0, "and", and>; +defm BIC : LogicalReg<0b00, 1, "bic", + BinOpFrag<(and node:$LHS, (not node:$RHS))>>; +defm EON : LogicalReg<0b10, 1, "eon", + BinOpFrag<(xor node:$LHS, (not node:$RHS))>>; +defm EOR : LogicalReg<0b10, 0, "eor", xor>; +defm ORN : LogicalReg<0b01, 1, "orn", + BinOpFrag<(or node:$LHS, (not node:$RHS))>>; +defm ORR : LogicalReg<0b01, 0, "orr", or>; + +def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>; +def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>; + +def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>; +def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>; + +def : InstAlias<"mvn $Wd, $Wm$sh", + (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>; +def : InstAlias<"mvn $Xd, $Xm$sh", + (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>; + +def : InstAlias<"tst $src1, $src2", + (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>; +def : InstAlias<"tst $src1, $src2", + (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>; + +def : InstAlias<"tst $src1, $src2", + (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>; +def : InstAlias<"tst $src1, $src2", + (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>; + +def : InstAlias<"tst $src1, $src2$sh", + (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>; +def : InstAlias<"tst $src1, $src2$sh", + (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>; + + +def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>; +def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; -} -defm : lsl_aliases<"add", ADDxxx_uxtx, Rxsp, GPR64xsp, GPR64>; -defm : lsl_aliases<"add", ADDxxx_uxtx, GPR64xsp, Rxsp, GPR64>; -defm : lsl_aliases<"add", ADDwww_uxtw, Rwsp, GPR32wsp, GPR32>; -defm : lsl_aliases<"add", ADDwww_uxtw, GPR32wsp, Rwsp, GPR32>; -defm : lsl_aliases<"sub", SUBxxx_uxtx, Rxsp, GPR64xsp, GPR64>; -defm : lsl_aliases<"sub", SUBxxx_uxtx, GPR64xsp, Rxsp, GPR64>; -defm : lsl_aliases<"sub", SUBwww_uxtw, Rwsp, GPR32wsp, GPR32>; -defm : lsl_aliases<"sub", SUBwww_uxtw, GPR32wsp, Rwsp, GPR32>; - -// Rd cannot be sp for flag-setting variants so only half of the aliases are -// needed. -defm : lsl_aliases<"adds", ADDSxxx_uxtx, GPR64, Rxsp, GPR64>; -defm : lsl_aliases<"adds", ADDSwww_uxtw, GPR32, Rwsp, GPR32>; -defm : lsl_aliases<"subs", SUBSxxx_uxtx, GPR64, Rxsp, GPR64>; -defm : lsl_aliases<"subs", SUBSwww_uxtw, GPR32, Rwsp, GPR32>; - -// CMP unfortunately has to be different because the instruction doesn't have a -// dest register. -multiclass cmp_lsl_aliases { - def : InstAlias; - - def : InstAlias; -} +//===----------------------------------------------------------------------===// +// One operand data processing instructions. 
+//===----------------------------------------------------------------------===// -defm : cmp_lsl_aliases<"cmp", CMPxx_uxtx, Rxsp, GPR64>; -defm : cmp_lsl_aliases<"cmp", CMPww_uxtw, Rwsp, GPR32>; -defm : cmp_lsl_aliases<"cmn", CMNxx_uxtx, Rxsp, GPR64>; -defm : cmp_lsl_aliases<"cmn", CMNww_uxtw, Rwsp, GPR32>; +defm CLS : OneOperandData<0b101, "cls">; +defm CLZ : OneOperandData<0b100, "clz", ctlz>; +defm RBIT : OneOperandData<0b000, "rbit">; +def REV16Wr : OneWRegData<0b001, "rev16", + UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>; +def REV16Xr : OneXRegData<0b001, "rev16", null_frag>; + +def : Pat<(cttz GPR32:$Rn), + (CLZWr (RBITWr GPR32:$Rn))>; +def : Pat<(cttz GPR64:$Rn), + (CLZXr (RBITXr GPR64:$Rn))>; +def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)), + (i32 1))), + (CLSWr GPR32:$Rn)>; +def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)), + (i64 1))), + (CLSXr GPR64:$Rn)>; + +// Unlike the other one operand instructions, the instructions with the "rev" +// mnemonic do *not* just differ in the size bit, but actually use different +// opcode bits for the different sizes. +def REVWr : OneWRegData<0b010, "rev", bswap>; +def REVXr : OneXRegData<0b011, "rev", bswap>; +def REV32Xr : OneXRegData<0b010, "rev32", + UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>; + +// The bswap commutes with the rotr so we want a pattern for both possible +// orders. +def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>; +def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; //===----------------------------------------------------------------------===// -// Add-subtract (immediate) instructions +// Bitfield immediate extraction instruction. //===----------------------------------------------------------------------===// -// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP, MOV - -// These instructions accept a 12-bit unsigned immediate, optionally shifted -// left by 12 bits. Official assembly format specifies a 12 bit immediate with -// one of "", "LSL #0", "LSL #12" supplementary operands. - -// There are surprisingly few ways to make this work with TableGen, so this -// implementation has separate instructions for the "LSL #0" and "LSL #12" -// variants. - -// If the MCInst retained a single combined immediate (which could be 0x123000, -// for example) then both components (imm & shift) would have to be delegated to -// a single assembly operand. This would entail a separate operand parser -// (because the LSL would have to live in the same AArch64Operand as the -// immediate to be accessible); assembly parsing is rather complex and -// error-prone C++ code. -// -// By splitting the immediate, we can delegate handling this optional operand to -// an InstAlias. Supporting functions to generate the correct MCInst are still -// required, but these are essentially trivial and parsing can remain generic. -// -// Rejected plans with rationale: -// ------------------------------ -// -// In an ideal world you'd have two first class immediate operands (in -// InOperandList, specifying imm12 and shift). Unfortunately this is not -// selectable by any means I could discover. -// -// An Instruction with two MCOperands hidden behind a single entry in -// InOperandList (expanded by ComplexPatterns and MIOperandInfo) was functional, -// but required more C++ code to handle encoding/decoding. Parsing (the intended -// main beneficiary) ended up equally complex because of the optional nature of -// "LSL #0".
-// -// Attempting to circumvent the need for a custom OperandParser above by giving -// InstAliases without the "lsl #0" failed. add/sub could be accommodated but -// the cmp/cmn aliases didn't use the MIOperandInfo to determine how operands -// should be parsed: there was no way to accommodate an "lsl #12". - -let ParserMethod = "ParseImmWithLSLOperand", - RenderMethod = "addImmWithLSLOperands" in { - // Derived PredicateMethod fields are different for each - def addsubimm_lsl0_asmoperand : AsmOperandClass { - let Name = "AddSubImmLSL0"; - // If an error is reported against this operand, instruction could also be a - // register variant. - let DiagnosticType = "AddSubSecondSource"; - } +let neverHasSideEffects = 1 in +defm EXTR : ExtractImm<"extr">; +def : InstAlias<"ror $dst, $src, $shift", + (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>; +def : InstAlias<"ror $dst, $src, $shift", + (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>; + +def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)), + (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>; +def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)), + (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>; - def addsubimm_lsl12_asmoperand : AsmOperandClass { - let Name = "AddSubImmLSL12"; - let DiagnosticType = "AddSubSecondSource"; - } +//===----------------------------------------------------------------------===// +// Other bitfield immediate instructions. +//===----------------------------------------------------------------------===// +let neverHasSideEffects = 1 in { +defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">; +defm SBFM : BitfieldImm<0b00, "sbfm">; +defm UBFM : BitfieldImm<0b10, "ubfm">; } -def shr_12_XFORM : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getSExtValue() >> 12, MVT::i32); +def i32shift_a : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = (32 - N->getZExtValue()) & 0x1f; + return CurDAG->getTargetConstant(enc, MVT::i64); }]>; -def shr_12_neg_XFORM : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant((-N->getSExtValue()) >> 12, MVT::i32); +def i32shift_b : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = 31 - N->getZExtValue(); + return CurDAG->getTargetConstant(enc, MVT::i64); }]>; -def neg_XFORM : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(-N->getSExtValue(), MVT::i32); +// min(7, 31 - shift_amt) +def i32shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = 31 - N->getZExtValue(); + enc = enc > 7 ? 7 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); }]>; +// min(15, 31 - shift_amt) +def i32shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = 31 - N->getZExtValue(); + enc = enc > 15 ?
15 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; -multiclass addsub_imm_operands<ValueType ty> { - let PrintMethod = "printAddSubImmLSL0Operand", - EncoderMethod = "getAddSubImmOpValue", - ParserMatchClass = addsubimm_lsl0_asmoperand in { - def _posimm_lsl0 : Operand<ty>, - ImmLeaf<ty, [{ return Imm >= 0 && (Imm & ~0xfff) == 0; }]>; - def _negimm_lsl0 : Operand<ty>, - ImmLeaf<ty, [{ return Imm < 0 && (-Imm & ~0xfff) == 0; }], neg_XFORM>; - } - - let PrintMethod = "printAddSubImmLSL12Operand", - EncoderMethod = "getAddSubImmOpValue", - ParserMatchClass = addsubimm_lsl12_asmoperand in { - def _posimm_lsl12 : Operand<ty>, - ImmLeaf<ty, [{ return Imm >= 0 && (Imm & ~0xfff000) == 0; }], - shr_12_XFORM>; - - def _negimm_lsl12 : Operand<ty>, - ImmLeaf<ty, [{ return Imm < 0 && (-Imm & ~0xfff000) == 0; }], shr_12_neg_XFORM>; - } -} +def i64shift_a : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = (64 - N->getZExtValue()) & 0x3f; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; -// The add operands don't need any transformation -defm addsubimm_operand_i32 : addsub_imm_operands<i32>; -defm addsubimm_operand_i64 : addsub_imm_operands<i64>; - -multiclass addsubimm_varieties<string prefix, bit sf, bit op, bits<2> shift, - string asmop, string cmpasmop, - Operand imm_operand, Operand cmp_imm_operand, - RegisterClass GPR, RegisterClass GPRsp, - AArch64Reg ZR, ValueType Ty> { - // All registers for non-S variants allow SP - def _s : A64I_addsubimm, - Sched<[WriteALU, ReadALU]>; - - - // S variants can read SP but would write to ZR - def _S : A64I_addsubimm, - Sched<[WriteALU, ReadALU]> { - let Defs = [NZCV]; - } +def i64shift_b : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = 63 - N->getZExtValue(); + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; - // Note that the pattern here for ADDS is subtle. Canonically CMP - // a, b becomes SUBS a, b. If b < 0 then this is equivalent to - // ADDS a, (-b). This is not true in general. - def _cmp : A64I_addsubimm, - Sched<[WriteCMP, ReadCMP]> { - let Rd = 0b11111; - let Defs = [NZCV]; - let isCompare = 1; - } -} +// min(7, 63 - shift_amt) +def i64shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = 63 - N->getZExtValue(); + enc = enc > 7 ? 7 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; +// min(15, 63 - shift_amt) +def i64shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = 63 - N->getZExtValue(); + enc = enc > 15 ? 15 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; -multiclass addsubimm_shifts<string prefix, bit sf, bit op, - string asmop, string cmpasmop, - string operand, string cmpoperand, - RegisterClass GPR, RegisterClass GPRsp, - AArch64Reg ZR, ValueType Ty> { - defm _lsl0 : addsubimm_varieties<prefix # "_lsl0", sf, op, 0b00, - asmop, cmpasmop, - !cast<Operand>(operand # "_lsl0"), - !cast<Operand>(cmpoperand # "_lsl0"), - GPR, GPRsp, ZR, Ty>; - - defm _lsl12 : addsubimm_varieties<prefix # "_lsl12", sf, op, 0b01, - asmop, cmpasmop, - !cast<Operand>(operand # "_lsl12"), - !cast<Operand>(cmpoperand # "_lsl12"), - GPR, GPRsp, ZR, Ty>; -} +// min(31, 63 - shift_amt) +def i64shift_sext_i32 : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = 63 - N->getZExtValue(); + enc = enc > 31 ?
-defm ADDwwi : addsubimm_shifts<"ADDwi", 0b0, 0b0, "add", "cmn",
-                               "addsubimm_operand_i32_posimm",
-                               "addsubimm_operand_i32_negimm",
-                               GPR32, GPR32wsp, WZR, i32>;
-defm ADDxxi : addsubimm_shifts<"ADDxi", 0b1, 0b0, "add", "cmn",
-                               "addsubimm_operand_i64_posimm",
-                               "addsubimm_operand_i64_negimm",
-                               GPR64, GPR64xsp, XZR, i64>;
-defm SUBwwi : addsubimm_shifts<"SUBwi", 0b0, 0b1, "sub", "cmp",
-                               "addsubimm_operand_i32_negimm",
-                               "addsubimm_operand_i32_posimm",
-                               GPR32, GPR32wsp, WZR, i32>;
-defm SUBxxi : addsubimm_shifts<"SUBxi", 0b1, 0b1, "sub", "cmp",
-                               "addsubimm_operand_i64_negimm",
-                               "addsubimm_operand_i64_posimm",
-                               GPR64, GPR64xsp, XZR, i64>;
-
-multiclass MOVsp {
-  def _fromsp : InstAlias<"mov $Rd, $Rn",
-                          (addop GPRsp:$Rd, SP:$Rn, 0),
-                          0b1>;
-
-  def _tosp : InstAlias<"mov $Rd, $Rn",
-                        (addop SP:$Rd, GPRsp:$Rn, 0),
-                        0b1>;
-}
+def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)),
+          (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+                              (i64 (i32shift_b imm0_31:$imm)))>;
+def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)),
+          (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+                              (i64 (i64shift_b imm0_63:$imm)))>;
-// Recall Rxsp is a RegisterClass containing *just* xsp.
-defm MOVxx : MOVsp;
-defm MOVww : MOVsp;
+let AddedComplexity = 10 in {
+def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
+def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
+}
+
+def : InstAlias<"asr $dst, $src, $shift",
+                (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
+def : InstAlias<"asr $dst, $src, $shift",
+                (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
+def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
+def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
+def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
+def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
+def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
+
+def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)),
+          (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
+def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)),
+          (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
+
+def : InstAlias<"lsr $dst, $src, $shift",
+                (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
+def : InstAlias<"lsr $dst, $src, $shift",
+                (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
+def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
+def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
+def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
+def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
+def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
 
 //===----------------------------------------------------------------------===//
-// Add-subtract (shifted register) instructions
+// Conditionally set flags instructions.
 //===----------------------------------------------------------------------===//
-// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP, NEG, NEGS
-
-//===-------------------------------
-// 1. The "shifted register" operands. Shared with logical insts.
-//===-------------------------------
-
-multiclass shift_operands {
-  def _asmoperand_i32 : AsmOperandClass {
-    let Name = "Shift" # form # "i32";
-    let RenderMethod = "addShiftOperands";
-    let PredicateMethod = "isShift";
-    let DiagnosticType = "AddSubRegShift32";
-  }
-
-  // Note that the operand type is intentionally i64 because the DAGCombiner
-  // puts these into a canonical form.
-  def _i32 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> {
-    let ParserMatchClass
-          = !cast<AsmOperandClass>(prefix # "_asmoperand_i32");
-    let PrintMethod = "printShiftOperand";
-    let DecoderMethod = "Decode32BitShiftOperand";
-  }
+defm CCMN : CondSetFlagsImm<0, "ccmn">;
+defm CCMP : CondSetFlagsImm<1, "ccmp">;
-  def _asmoperand_i64 : AsmOperandClass {
-    let Name = "Shift" # form # "i64";
-    let RenderMethod = "addShiftOperands";
-    let PredicateMethod = "isShift";
-    let DiagnosticType = "AddSubRegShift64";
-  }
-
-  def _i64 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 63; }]> {
-    let ParserMatchClass
-          = !cast<AsmOperandClass>(prefix # "_asmoperand_i64");
-    let PrintMethod = "printShiftOperand";
-  }
-}
+defm CCMN : CondSetFlagsReg<0, "ccmn">;
+defm CCMP : CondSetFlagsReg<1, "ccmp">;
-defm lsl_operand : shift_operands<"lsl_operand", "LSL">;
-defm lsr_operand : shift_operands<"lsr_operand", "LSR">;
-defm asr_operand : shift_operands<"asr_operand", "ASR">;
-
-// Not used for add/sub, but defined here for completeness. The "logical
-// (shifted register)" instructions *do* have an ROR variant.
-defm ror_operand : shift_operands<"ror_operand", "ROR">;
-
-//===-------------------------------
-// 2. The basic 3.5-operand ADD/SUB/ADDS/SUBS instructions.
-//===-------------------------------
-
-// N.b. the commutable parameter is just !N. It will be first against the wall
-// when the revolution comes.
-multiclass addsub_shifts<string prefix, bit sf, bit op, bit s, bit commutable,
-                         string asmop, SDPatternOperator opfrag, ValueType ty,
-                         RegisterClass GPR, list<Register> defs> {
-  let isCommutable = commutable, Defs = defs in {
-  def _lsl : A64I_addsubshift("lsl_operand_" # ty):$Imm6),
-                 !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
-                 [(set GPR:$Rd, (opfrag ty:$Rn, (shl ty:$Rm,
-                      !cast("lsl_operand_" # ty):$Imm6))
-                 )],
-                 NoItinerary>,
-             Sched<[WriteALU, ReadALU]>;
-
-  def _lsr : A64I_addsubshift("lsr_operand_" # ty):$Imm6),
-                 !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
-                 [(set ty:$Rd, (opfrag ty:$Rn, (srl ty:$Rm,
-                      !cast("lsr_operand_" # ty):$Imm6))
-                 )],
-                 NoItinerary>,
-             Sched<[WriteALU, ReadALU]>;
-
-  def _asr : A64I_addsubshift("asr_operand_" # ty):$Imm6),
-                 !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
-                 [(set ty:$Rd, (opfrag ty:$Rn, (sra ty:$Rm,
-                      !cast("asr_operand_" # ty):$Imm6))
-                 )],
-                 NoItinerary>,
-             Sched<[WriteALU, ReadALU]>;
-  }
+//===----------------------------------------------------------------------===//
+// Conditional select instructions.
+//===----------------------------------------------------------------------===//
+defm CSEL : CondSelect<0, 0b00, "csel">;
+
+def inc : PatFrag<(ops node:$in), (add node:$in, 1)>;
+defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>;
+defm CSINV : CondSelectOp<1, 0b00, "csinv", not>;
+defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>;
+
+def : Pat<(AArch64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+          (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+          (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+          (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+          (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+          (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+          (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+
+def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV),
+          (CSINCWr WZR, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV),
+          (CSINCXr XZR, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV),
+          (CSINVWr WZR, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV),
+          (CSINVXr XZR, XZR, (i32 imm:$cc))>;
+
+// Each of these aliases encodes the inverse of the condition code written in
+// the alias; the parser already inverts the condition code for these aliases
+// before building the underlying instruction.
+def : InstAlias<"cset $dst, $cc",
+                (CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
+def : InstAlias<"cset $dst, $cc",
+                (CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;
+
+def : InstAlias<"csetm $dst, $cc",
+                (CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
+def : InstAlias<"csetm $dst, $cc",
+                (CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;
+
+def : InstAlias<"cinc $dst, $src, $cc",
+                (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cinc $dst, $src, $cc",
+                (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+def : InstAlias<"cinv $dst, $src, $cc",
+                (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cinv $dst, $src, $cc",
+                (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+def : InstAlias<"cneg $dst, $src, $cc",
+                (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cneg $dst, $src, $cc",
+                (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
-  def _noshift
-      : InstAlias(prefix # "_lsl") GPR:$Rd, GPR:$Rn,
-                  GPR:$Rm, 0)>;
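To make the inversion concrete: "cset w0, eq" assembles to "csinc w0, wzr, wzr, ne" — when the inverted condition holds it selects WZR (0), otherwise it returns WZR + 1. A small C++ model of CSINC and the cset mapping (illustrative only; csinc/cset are made-up helper names, not from this patch):

#include <cassert>
#include <cstdint>

// Model of CSINC Rd, Rn, Rm, cond: if cond holds return Rn, else Rm + 1.
static uint64_t csinc(bool cond, uint64_t tval, uint64_t fval) {
  return cond ? tval : fval + 1;
}

// cset dst, cc  ==  csinc dst, zr, zr, !cc
static uint64_t cset(bool cc) { return csinc(!cc, 0, 0); }

int main() {
  assert(cset(true) == 1);   // condition true  -> 1
  assert(cset(false) == 0);  // condition false -> 0
}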
+//===----------------------------------------------------------------------===//
+// PC-relative instructions.
+//===----------------------------------------------------------------------===//
+let isReMaterializable = 1 in {
+let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in {
+def ADR  : ADRI<0, "adr", adrlabel, []>;
+} // neverHasSideEffects = 1
-  def : Pat<(opfrag ty:$Rn, ty:$Rm),
-            (!cast(prefix # "_lsl") $Rn, $Rm, 0)>;
-}
+def ADRP : ADRI<1, "adrp", adrplabel,
+                [(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>;
+} // isReMaterializable = 1
-multiclass addsub_sizes defs> {
-  defm xxx : addsub_shifts;
-  defm www : addsub_shifts;
-}
+// page address of a constant pool entry, block address
+def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>;
+def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>;
+
+//===----------------------------------------------------------------------===//
+// Unconditional branch (register) instructions.
+//===----------------------------------------------------------------------===//
-defm ADD : addsub_sizes<"ADD", 0b0, 0b0, 0b1, "add", add, []>;
-defm SUB : addsub_sizes<"SUB", 0b1, 0b0, 0b0, "sub", sub, []>;
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+def RET  : BranchReg<0b0010, "ret", []>;
+def DRPS : SpecialReturn<0b0101, "drps">;
+def ERET : SpecialReturn<0b0100, "eret">;
+} // isReturn = 1, isTerminator = 1, isBarrier = 1
-defm ADDS : addsub_sizes<"ADDS", 0b0, 0b1, 0b1, "adds", addc, [NZCV]>;
-defm SUBS : addsub_sizes<"SUBS", 0b1, 0b1, 0b0, "subs", subc, [NZCV]>;
+// Default to the LR register.
+def : InstAlias<"ret", (RET LR)>;
-//===-------------------------------
-// 1. The NEG/NEGS aliases
-//===-------------------------------
+let isCall = 1, Defs = [LR], Uses = [SP] in {
+def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>;
+} // isCall
-multiclass neg_alias {
-  def : InstAlias<"neg $Rd, $Rm, $Imm6",
-                  (INST GPR:$Rd, ZR, GPR:$Rm, shift_operand:$Imm6)>;
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
+} // isBranch, isTerminator, isBarrier, isIndirectBranch
-  def : Pat<(sub 0, (shiftop ty:$Rm, shift_operand:$Imm6)),
-            (INST ZR, $Rm, shift_operand:$Imm6)>;
-}
+// Create a separate pseudo-instruction for codegen to use so that we don't
+// flag lr as used in every function. It'll be restored before the RET by the
+// epilogue if it's legitimately used.
+def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isReturn = 1;
+}
-defm : neg_alias;
-defm : neg_alias;
-defm : neg_alias;
-def : InstAlias<"neg $Rd, $Rm", (SUBwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>;
-def : Pat<(sub 0, i32:$Rm), (SUBwww_lsl WZR, $Rm, 0)>;
-
-defm : neg_alias;
-defm : neg_alias;
-defm : neg_alias;
-def : InstAlias<"neg $Rd, $Rm", (SUBxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>;
-def : Pat<(sub 0, i64:$Rm), (SUBxxx_lsl XZR, $Rm, 0)>;
-
-// NEGS doesn't get any patterns yet: defining multiple outputs means C++ has to
-// be involved.
-class negs_alias
-  : InstAlias<"negs $Rd, $Rm, $Imm6",
-              (INST GPR:$Rd, ZR, GPR:$Rm, shift_operand:$Imm6)>;
-
-def : negs_alias;
-def : negs_alias;
-def : negs_alias;
-def : InstAlias<"negs $Rd, $Rm", (SUBSwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>;
-
-def : negs_alias;
-def : negs_alias;
-def : negs_alias;
-def : InstAlias<"negs $Rd, $Rm", (SUBSxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>;
-//===-------------------------------
-// 1. The CMP/CMN aliases
-//===-------------------------------
-
-multiclass cmp_shifts {
-  let isCommutable = commutable, Rd = 0b11111, Defs = [NZCV] in {
-  def _lsl : A64I_addsubshift("lsl_operand_" # ty):$Imm6),
-                 !strconcat(asmop, "\t$Rn, $Rm, $Imm6"),
-                 [(set NZCV, (opfrag ty:$Rn, (shl ty:$Rm,
-                      !cast("lsl_operand_" # ty):$Imm6))
-                 )],
-                 NoItinerary>,
-             Sched<[WriteCMP, ReadCMP, ReadCMP]>;
-
-  def _lsr : A64I_addsubshift("lsr_operand_" # ty):$Imm6),
-                 !strconcat(asmop, "\t$Rn, $Rm, $Imm6"),
-                 [(set NZCV, (opfrag ty:$Rn, (srl ty:$Rm,
-                      !cast("lsr_operand_" # ty):$Imm6))
-                 )],
-                 NoItinerary>,
-             Sched<[WriteCMP, ReadCMP, ReadCMP]>;
-
-  def _asr : A64I_addsubshift("asr_operand_" # ty):$Imm6),
-                 !strconcat(asmop, "\t$Rn, $Rm, $Imm6"),
-                 [(set NZCV, (opfrag ty:$Rn, (sra ty:$Rm,
-                      !cast("asr_operand_" # ty):$Imm6))
-                 )],
-                 NoItinerary>,
-             Sched<[WriteCMP, ReadCMP, ReadCMP]>;
-  }
-
-  def _noshift
-      : InstAlias(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>;
-
-  def : Pat<(opfrag ty:$Rn, ty:$Rm),
-            (!cast(prefix # "_lsl") $Rn, $Rm, 0)>;
-}
+// This is a directive-like pseudo-instruction. The purpose is to insert an
+// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
+// (which in the usual case is a BLR).
+let hasSideEffects = 1 in
+def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> {
+  let AsmString = ".tlsdesccall $sym";
+}
-defm CMPww : cmp_shifts<"CMPww", 0b0, 0b1, 0b0, "cmp", A64cmp, i32, GPR32>;
-defm CMPxx : cmp_shifts<"CMPxx", 0b1, 0b1, 0b0, "cmp", A64cmp, i64, GPR64>;
-
-defm CMNww : cmp_shifts<"CMNww", 0b0, 0b0, 0b1, "cmn", A64cmn, i32, GPR32>;
-defm CMNxx : cmp_shifts<"CMNxx", 0b1, 0b0, 0b1, "cmn", A64cmn, i64, GPR64>;
+// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It
+// gets expanded to two MCInsts during lowering.
+let isCall = 1, Defs = [LR] in
+def TLSDESC_BLR
+    : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym),
+             [(AArch64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>;
+
+def : Pat<(AArch64tlsdesc_call GPR64:$dest, texternalsym:$sym),
+          (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>;
 
 //===----------------------------------------------------------------------===//
-// Add-subtract (with carry) instructions
+// Conditional branch (immediate) instruction.
 //===----------------------------------------------------------------------===//
-// Contains: ADC, ADCS, SBC, SBCS + aliases NGC, NGCS
-
-multiclass A64I_addsubcarrySizes<bit op, bit s, string asmop> {
-  let Uses = [NZCV] in {
-    def www : A64I_addsubcarry<0b0, op, s, 0b000000,
-                               (outs GPR32:$Rd), (ins GPR32:$Rn, GPR32:$Rm),
-                               !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
-                               [], NoItinerary>,
-              Sched<[WriteALU, ReadALU, ReadALU]>;
-
-    def xxx : A64I_addsubcarry<0b1, op, s, 0b000000,
-                               (outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
-                               !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
-                               [], NoItinerary>,
-              Sched<[WriteALU, ReadALU, ReadALU]>;
-  }
-}
-
-let isCommutable = 1 in {
-  defm ADC : A64I_addsubcarrySizes<0b0, 0b0, "adc">;
-}
-
-defm SBC : A64I_addsubcarrySizes<0b1, 0b0, "sbc">;
-
-let Defs = [NZCV] in {
-  let isCommutable = 1 in {
-    defm ADCS : A64I_addsubcarrySizes<0b0, 0b1, "adcs">;
-  }
-
-  defm SBCS : A64I_addsubcarrySizes<0b1, 0b1, "sbcs">;
-}
+def Bcc : BranchCond;
-def : InstAlias<"ngc $Rd, $Rm", (SBCwww GPR32:$Rd, WZR, GPR32:$Rm)>;
-def : InstAlias<"ngc $Rd, $Rm", (SBCxxx GPR64:$Rd, XZR, GPR64:$Rm)>;
-def : InstAlias<"ngcs $Rd, $Rm", (SBCSwww GPR32:$Rd, WZR, GPR32:$Rm)>;
-def : InstAlias<"ngcs $Rd, $Rm", (SBCSxxx GPR64:$Rd, XZR, GPR64:$Rm)>;
+//===----------------------------------------------------------------------===//
+// Compare-and-branch instructions.
+//===----------------------------------------------------------------------===//
+defm CBZ  : CmpBranch<0, "cbz", AArch64cbz>;
+defm CBNZ : CmpBranch<1, "cbnz", AArch64cbnz>;
-// Note that adde and sube can form a chain longer than two (e.g. for 256-bit
-// addition). So the flag-setting instructions are appropriate.
-def : Pat<(adde i32:$Rn, i32:$Rm), (ADCSwww $Rn, $Rm)>;
-def : Pat<(adde i64:$Rn, i64:$Rm), (ADCSxxx $Rn, $Rm)>;
-def : Pat<(sube i32:$Rn, i32:$Rm), (SBCSwww $Rn, $Rm)>;
-def : Pat<(sube i64:$Rn, i64:$Rm), (SBCSxxx $Rn, $Rm)>;
+//===----------------------------------------------------------------------===//
+// Test-bit-and-branch instructions.
+//===----------------------------------------------------------------------===//
+defm TBZ  : TestBranch<0, "tbz", AArch64tbz>;
+defm TBNZ : TestBranch<1, "tbnz", AArch64tbnz>;
 
 //===----------------------------------------------------------------------===//
-// Bitfield
+// Unconditional branch (immediate) instructions.
 //===----------------------------------------------------------------------===//
-// Contains: SBFM, BFM, UBFM, [SU]XT[BHW], ASR, LSR, LSL, SBFI[ZX], BFI, BFXIL,
-// UBFIZ, UBFX
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+def B : BranchImm<0, "b", [(br bb:$addr)]>;
+} // isBranch, isTerminator, isBarrier
-// Because of the rather complicated nearly-overlapping aliases, the decoding of
-// this range of instructions is handled manually. The architectural
-// instructions are BFM, SBFM and UBFM but a disassembler should never produce
-// these.
-//
-// In the end, the best option was to use BFM instructions for decoding under
-// almost all circumstances, but to create aliasing *Instructions* for each of
-// the canonical forms and specify a completely custom decoder which would
-// substitute the correct MCInst as needed.
-//
-// This also simplifies instruction selection, parsing etc because the MCInsts
-// have a shape that's closer to their use in code.
+let isCall = 1, Defs = [LR], Uses = [SP] in {
+def BL : CallImm<1, "bl", [(AArch64call tglobaladdr:$addr)]>;
+} // isCall
+def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>;
-//===-------------------------------
-// 1. The architectural BFM instructions
-//===-------------------------------
+//===----------------------------------------------------------------------===//
+// Exception generation instructions.
+//===----------------------------------------------------------------------===//
+def BRK   : ExceptionGeneration<0b001, 0b00, "brk">;
+def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
+def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
+def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">;
+def HLT   : ExceptionGeneration<0b010, 0b00, "hlt">;
+def HVC   : ExceptionGeneration<0b000, 0b10, "hvc">;
+def SMC   : ExceptionGeneration<0b000, 0b11, "smc">;
+def SVC   : ExceptionGeneration<0b000, 0b01, "svc">;
+
+// DCPSn defaults to an immediate operand of zero if unspecified.
+def : InstAlias<"dcps1", (DCPS1 0)>;
+def : InstAlias<"dcps2", (DCPS2 0)>;
+def : InstAlias<"dcps3", (DCPS3 0)>;
-def uimm5_asmoperand : AsmOperandClass {
-  let Name = "UImm5";
-  let PredicateMethod = "isUImm<5>";
-  let RenderMethod = "addImmOperands";
-  let DiagnosticType = "UImm5";
-}
+//===----------------------------------------------------------------------===//
+// Load instructions.
+//===----------------------------------------------------------------------===//
-def uimm6_asmoperand : AsmOperandClass {
-  let Name = "UImm6";
-  let PredicateMethod = "isUImm<6>";
-  let RenderMethod = "addImmOperands";
-  let DiagnosticType = "UImm6";
-}
+// Pair (indexed, offset)
+defm LDPW  : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">;
+defm LDPX  : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">;
+defm LDPS  : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">;
+defm LDPD  : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">;
+defm LDPQ  : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">;
+
+defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+// Pair (pre-indexed)
+def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">;
+def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">;
+def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">;
+def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">;
+def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+
+def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+// Pair (post-indexed)
+def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">;
+def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">;
+def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">;
+def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">;
+def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+
+def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+
+// Pair (no allocate)
+defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">;
+defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">;
+defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">;
+defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">;
+defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">;
+
+//---
+// (register offset)
+//---
+
+// Integer
+defm LDRBB : Load8RO<0b00,  0, 0b01, GPR32, "ldrb", i32, zextloadi8>;
+defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>;
+defm LDRW  : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>;
+defm LDRX  : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>;
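The register-offset forms above correspond to the AArch64 addressing mode in which the offset register is optionally sign- or zero-extended and then shifted left by log2 of the access size. A C++ sketch of the address computation (illustrative only; roAddress is a made-up helper, not LLVM code):

#include <cstdint>

// Address math for AArch64 "register offset" loads/stores, e.g.
//   ldr x0, [x1, w2, sxtw #3]
// address = base + extend(offsetReg) << scale, scale = log2(access size).
static uint64_t roAddress(uint64_t base, uint32_t wOffset, bool isSigned,
                          unsigned scale) {
  int64_t extended = isSigned ? int64_t(int32_t(wOffset))   // SXTW
                              : int64_t(uint64_t(wOffset)); // UXTW
  return base + (uint64_t(extended) << scale);
}

int main() {
  // [x1, w2, sxtw #3] with x1 = 0x1000 and w2 = -1 yields 0x1000 - 8.
  return roAddress(0x1000, 0xFFFFFFFFu, true, 3) == 0xFF8 ? 0 : 1;
}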
+
+// Floating-point
+defm LDRB : Load8RO<0b00,   1, 0b01, FPR8,   "ldr", untyped, load>;
+defm LDRH : Load16RO<0b01,  1, 0b01, FPR16,  "ldr", f16, load>;
+defm LDRS : Load32RO<0b10,  1, 0b01, FPR32,  "ldr", f32, load>;
+defm LDRD : Load64RO<0b11,  1, 0b01, FPR64,  "ldr", f64, load>;
+defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>;
+
+// Load sign-extended half-word
+defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>;
+defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>;
+
+// Load sign-extended byte
+defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>;
+defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>;
+
+// Load sign-extended word
+defm LDRSW  : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
+
+// Pre-fetch.
+defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
+
+// For regular load, we do not have any alignment requirement.
+// Thus, it is safe to directly map the vector loads with interesting
+// addressing modes.
+// FIXME: We could do the same for bitconvert to floating point vectors.
+multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
+                              ValueType ScalTy, ValueType VecTy,
+                              Instruction LOADW, Instruction LOADX,
+                              SubRegIndex sub> {
+  def : Pat<(VecTy (scalar_to_vector (ScalTy
+              (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
+            (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
+                           (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
+                           sub)>;
+
+  def : Pat<(VecTy (scalar_to_vector (ScalTy
+              (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
+            (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
+                           (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
+                           sub)>;
+}
-def bitfield32_imm : Operand<i64>,
-                     ImmLeaf<i64, [{ return Imm >= 0 && Imm < 32; }]> {
-  let ParserMatchClass = uimm5_asmoperand;
+let AddedComplexity = 10 in {
+defm : ScalToVecROLoadPat;
+defm : ScalToVecROLoadPat;
-  let DecoderMethod = "DecodeBitfield32ImmOperand";
-}
+defm : ScalToVecROLoadPat;
+defm : ScalToVecROLoadPat;
+defm : ScalToVecROLoadPat;
+defm : ScalToVecROLoadPat;
-def bitfield64_imm : Operand<i64>,
-                     ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
-  let ParserMatchClass = uimm6_asmoperand;
+defm : ScalToVecROLoadPat;
+defm : ScalToVecROLoadPat;
-  // Default decoder works in 64-bit case: the 6-bit field can take any value.
-}
+defm : ScalToVecROLoadPat;
-multiclass A64I_bitfieldSizes<bits<2> opc, string asmop> {
-  def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd),
-                    (ins GPR32:$Rn, bitfield32_imm:$ImmR, bitfield32_imm:$ImmS),
-                    !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
-                    [], NoItinerary>,
-             Sched<[WriteALU, ReadALU]> {
-    let DecoderMethod = "DecodeBitfieldInstruction";
-  }
+defm : ScalToVecROLoadPat;
-  def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd),
-                    (ins GPR64:$Rn, bitfield64_imm:$ImmR, bitfield64_imm:$ImmS),
-                    !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
-                    [], NoItinerary>,
-             Sched<[WriteALU, ReadALU]> {
-    let DecoderMethod = "DecodeBitfieldInstruction";
-  }
-}
-defm SBFM : A64I_bitfieldSizes<0b00, "sbfm">;
-defm UBFM : A64I_bitfieldSizes<0b10, "ubfm">;
-
-// BFM instructions modify the destination register rather than defining it
-// completely.
-def BFMwwii :
-  A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
-        (ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bitfield32_imm:$ImmS),
-        "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
-  Sched<[WriteALU, ReadALU, ReadALU]> {
-  let DecoderMethod = "DecodeBitfieldInstruction";
-  let Constraints = "$src = $Rd";
-}
+def : Pat <(v1i64 (scalar_to_vector (i64
+             (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                  ro_Wextend64:$extend))))),
+           (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
-def BFMxxii :
-  A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
-        (ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bitfield64_imm:$ImmS),
-        "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
-  Sched<[WriteALU, ReadALU, ReadALU]> {
-  let DecoderMethod = "DecodeBitfieldInstruction";
-  let Constraints = "$src = $Rd";
-}
+def : Pat <(v1i64 (scalar_to_vector (i64
+             (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                  ro_Xextend64:$extend))))),
+           (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+}
+
+// Match all load 64 bits width whose type is compatible with FPR64
+multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
+                        Instruction LOADW, Instruction LOADX> {
-//===-------------------------------
-// 2. Extend aliases to 64-bit dest
-//===-------------------------------
-
-// Unfortunately the extensions that end up as 64-bits cannot be handled by an
-// instruction alias: their syntax is (for example) "SXTB x0, w0", which needs
-// to be mapped to "SBFM x0, x0, #0, 7" (changing the class of Rn). InstAlias is
-// not capable of such a map as far as I'm aware
-
-// Note that these instructions are strictly more specific than the
-// BFM ones (in ImmR) so they can handle their own decoding.
-class A64I_bf_ext<bit sf, bits<2> opc, RegisterClass GPRDest, ValueType dty,
-                  string asmop, bits<6> imms, dag pattern>
-  : A64I_bitfield,
-    Sched<[WriteALU, ReadALU]> {
-  let ImmR = 0b000000;
-  let ImmS = imms;
-}
+  def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+            (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
-// Signed extensions
-def SXTBxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxtb", 7,
-                         (sext_inreg (anyext i32:$Rn), i8)>;
-def SXTBww : A64I_bf_ext<0b0, 0b00, GPR32, i32, "sxtb", 7,
-                         (sext_inreg i32:$Rn, i8)>;
-def SXTHxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxth", 15,
-                         (sext_inreg (anyext i32:$Rn), i16)>;
-def SXTHww : A64I_bf_ext<0b0, 0b00, GPR32, i32, "sxth", 15,
-                         (sext_inreg i32:$Rn, i16)>;
-def SXTWxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxtw", 31, (sext i32:$Rn)>;
-
-// Unsigned extensions
-def UXTBww : A64I_bf_ext<0b0, 0b10, GPR32, i32, "uxtb", 7,
-                         (and i32:$Rn, 255)>;
-def UXTHww : A64I_bf_ext<0b0, 0b10, GPR32, i32, "uxth", 15,
-                         (and i32:$Rn, 65535)>;
-
-// The 64-bit unsigned variants are not strictly architectural but recommended
-// for consistency.
-let isAsmParserOnly = 1 in {
-  def UXTBxw : A64I_bf_ext<0b0, 0b10, GPR64, i64, "uxtb", 7,
-                           (and (anyext i32:$Rn), 255)>;
-  def UXTHxw : A64I_bf_ext<0b0, 0b10, GPR64, i64, "uxth", 15,
-                           (and (anyext i32:$Rn), 65535)>;
-}
+  def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+            (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
-// Extra patterns for when the source register is actually 64-bits
-// too. There's no architectural difference here, it's just LLVM
-// shinanigans. There's no need for equivalent zero-extension patterns
-// because they'll already be caught by logical (immediate) matching.
-def : Pat<(sext_inreg i64:$Rn, i8),
-          (SXTBxw (EXTRACT_SUBREG $Rn, sub_32))>;
-def : Pat<(sext_inreg i64:$Rn, i16),
-          (SXTHxw (EXTRACT_SUBREG $Rn, sub_32))>;
-def : Pat<(sext_inreg i64:$Rn, i32),
-          (SXTWxw (EXTRACT_SUBREG $Rn, sub_32))>;
-
-
-//===-------------------------------
-// 3. Aliases for ASR and LSR (the simple shifts)
-//===-------------------------------
-
-// These also handle their own decoding because ImmS being set makes
-// them take precedence over BFM.
-multiclass A64I_shift<bits<2> opc, string asmop, SDNode opnode> {
-  def wwi : A64I_bitfield<0b0, opc, 0b0,
-                    (outs GPR32:$Rd), (ins GPR32:$Rn, bitfield32_imm:$ImmR),
-                    !strconcat(asmop, "\t$Rd, $Rn, $ImmR"),
-                    [(set i32:$Rd, (opnode i32:$Rn, bitfield32_imm:$ImmR))],
-                    NoItinerary>,
-            Sched<[WriteALU, ReadALU]> {
-    let ImmS = 31;
-  }
-
-  def xxi : A64I_bitfield<0b1, opc, 0b1,
-                    (outs GPR64:$Rd), (ins GPR64:$Rn, bitfield64_imm:$ImmR),
-                    !strconcat(asmop, "\t$Rd, $Rn, $ImmR"),
-                    [(set i64:$Rd, (opnode i64:$Rn, bitfield64_imm:$ImmR))],
-                    NoItinerary>,
-            Sched<[WriteALU, ReadALU]> {
-    let ImmS = 63;
-  }
-}
+let AddedComplexity = 10 in {
+let Predicates = [IsLE] in {
+  // We must do vector loads with LD1 in big-endian.
+  defm : VecROLoadPat;
+  defm : VecROLoadPat;
+  defm : VecROLoadPat;
+  defm : VecROLoadPat;
+}
+
+defm : VecROLoadPat;
+defm : VecROLoadPat;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must do vector loads with LD1 in big-endian.
+  defm : VecROLoadPat;
+  defm : VecROLoadPat;
+  defm : VecROLoadPat;
+  defm : VecROLoadPat;
+  defm : VecROLoadPat;
+  defm : VecROLoadPat;
+}
+} // AddedComplexity = 10
+
+// zextload -> i64
+multiclass ExtLoadTo64ROPat<ROAddrMode ro, SDPatternOperator loadop,
+                            Instruction INSTW, Instruction INSTX> {
+  def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+            (SUBREG_TO_REG (i64 0),
+                           (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
+                           sub_32)>;
+
+  def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+            (SUBREG_TO_REG (i64 0),
+                           (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
+                           sub_32)>;
+}
-defm ASR : A64I_shift<0b00, "asr", sra>;
-defm LSR : A64I_shift<0b10, "lsr", srl>;
-
-//===-------------------------------
-// 4. Aliases for LSL
-//===-------------------------------
+let AddedComplexity = 10 in {
+  defm : ExtLoadTo64ROPat;
+  defm : ExtLoadTo64ROPat;
+  defm : ExtLoadTo64ROPat;
-// Unfortunately LSL and subsequent aliases are much more complicated. We need
-// to be able to say certain output instruction fields depend in a complex
-// manner on combinations of input assembly fields).
-//
-// MIOperandInfo *might* have been able to do it, but at the cost of
-// significantly more C++ code.
-
-// N.b. contrary to usual practice these operands store the shift rather than
-// the machine bits in an MCInst. The complexity overhead of consistency
-// outweighed the benefits in this case (custom asmparser, printer and selection
-// vs custom encoder).
-def bitfield32_lsl_imm : Operand<i64>,
-                         ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> {
-  let ParserMatchClass = uimm5_asmoperand;
-  let EncoderMethod = "getBitfield32LSLOpValue";
-}
+  // zextloadi1 -> zextloadi8
+  defm : ExtLoadTo64ROPat;
-def bitfield64_lsl_imm : Operand<i64>,
-                         ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 63; }]> {
-  let ParserMatchClass = uimm6_asmoperand;
-  let EncoderMethod = "getBitfield64LSLOpValue";
-}
+  // extload -> zextload
+  defm : ExtLoadTo64ROPat;
+  defm : ExtLoadTo64ROPat;
+  defm : ExtLoadTo64ROPat;
-class A64I_bitfield_lsl<bit sf, RegisterClass GPR, ValueType ty,
                        Operand operand>
-  : A64I_bitfield,
-    Sched<[WriteALU, ReadALU]> {
-  bits<12> FullImm;
-  let ImmR = FullImm{5-0};
-  let ImmS = FullImm{11-6};
-
-  // No disassembler allowed because it would overlap with BFM which does the
-  // actual work.
-  let isAsmParserOnly = 1;
-}
+  // extloadi1 -> zextloadi8
+  defm : ExtLoadTo64ROPat;
+}
-def LSLwwi : A64I_bitfield_lsl<0b0, GPR32, i32, bitfield32_lsl_imm>;
-def LSLxxi : A64I_bitfield_lsl<0b1, GPR64, i64, bitfield64_lsl_imm>;
-
-//===-------------------------------
-// 5. Aliases for bitfield extract instructions
-//===-------------------------------
-
-def bfx32_width_asmoperand : AsmOperandClass {
-  let Name = "BFX32Width";
-  let PredicateMethod = "isBitfieldWidth<32>";
-  let RenderMethod = "addBFXWidthOperands";
-  let DiagnosticType = "Width32";
-}
+// extload/zextload -> i32
+multiclass ExtLoadTo32ROPat<ROAddrMode ro, SDPatternOperator loadop,
+                            Instruction INSTW, Instruction INSTX> {
+  def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+            (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
-def bfx32_width : Operand, ImmLeaf {
-  let PrintMethod = "printBFXWidthOperand";
-  let ParserMatchClass = bfx32_width_asmoperand;
-}
+  def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+            (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
-def bfx64_width_asmoperand : AsmOperandClass {
-  let Name = "BFX64Width";
-  let PredicateMethod = "isBitfieldWidth<64>";
-  let RenderMethod = "addBFXWidthOperands";
-  let DiagnosticType = "Width64";
-}
-
-def bfx64_width : Operand<i64> {
-  let PrintMethod = "printBFXWidthOperand";
-  let ParserMatchClass = bfx64_width_asmoperand;
-}
+let AddedComplexity = 10 in {
+  // extload -> zextload
+  defm : ExtLoadTo32ROPat;
+  defm : ExtLoadTo32ROPat;
+  defm : ExtLoadTo32ROPat;
+
+  // zextloadi1 -> zextloadi8
+  defm : ExtLoadTo32ROPat;
+}
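The SUBREG_TO_REG idiom in ExtLoadTo64ROPat works because, on AArch64, writing a W register implicitly zeroes the upper 32 bits of the enclosing X register, so a zero-extending load to i64 is just the 32-bit-or-narrower load. A minimal C++ illustration of the semantics (loadZExt32To64 is a made-up name, not from this patch):

#include <cassert>
#include <cstdint>

// LDRWui and friends write a W register, which already clears the high
// 32 bits of the X register; SUBREG_TO_REG only records that fact for
// the register allocator, emitting no extra instruction.
static uint64_t loadZExt32To64(const uint32_t *p) {
  uint32_t w = *p;
  return uint64_t(w); // no explicit zero-extension needed
}

int main() {
  uint32_t v = 0xCAFEBABEu;
  assert(loadZExt32To64(&v) == 0x00000000CAFEBABEull);
}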
+//---
+// (unsigned immediate)
+//---
+defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr",
+                   [(set GPR64:$Rt,
+                         (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
+defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr",
+                   [(set GPR32:$Rt,
+                         (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
+defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr",
+                   [(set FPR8:$Rt,
+                         (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
+defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr",
+                   [(set (f16 FPR16:$Rt),
+                         (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>;
+defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr",
+                   [(set (f32 FPR32:$Rt),
+                         (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
+defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr",
+                   [(set (f64 FPR64:$Rt),
+                         (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
+defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr",
+                   [(set (f128 FPR128:$Rt),
+                         (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;
+
+// For regular load, we do not have any alignment requirement.
+// Thus, it is safe to directly map the vector loads with interesting
+// addressing modes.
+// FIXME: We could do the same for bitconvert to floating point vectors.
+def : Pat <(v8i8 (scalar_to_vector (i32
+             (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+def : Pat <(v16i8 (scalar_to_vector (i32
+             (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+def : Pat <(v4i16 (scalar_to_vector (i32
+             (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+def : Pat <(v8i16 (scalar_to_vector (i32
+             (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+def : Pat <(v2i32 (scalar_to_vector (i32
+             (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+           (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+def : Pat <(v4i32 (scalar_to_vector (i32
+             (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+           (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+             (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+           (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat <(v2i64 (scalar_to_vector (i64
+             (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+           (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+                          (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
+
+// Match all load 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  // We must use LD1 to perform vector loads in big-endian.
+  def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+}
+def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+          (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+          (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use LD1 to perform vector loads in big-endian.
+  def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+}
+def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+          (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+
+defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh",
+                    [(set GPR32:$Rt,
+                          (zextloadi16 (am_indexed16 GPR64sp:$Rn,
+                                                     uimm12s2:$offset)))]>;
+defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb",
+                    [(set GPR32:$Rt,
+                          (zextloadi8 (am_indexed8 GPR64sp:$Rn,
+                                                   uimm12s1:$offset)))]>;
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
+
+// zextloadi1 -> zextloadi8
+def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+
+// extload -> zextload
+def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+          (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
+def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
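The am_indexed<N> operands used throughout this section accept the "LDR-style" scaled offset: a 12-bit unsigned immediate implicitly multiplied by the access size. A C++ sketch of the check (illustrative only; isScaledUImm12 is a made-up helper, not LLVM's actual predicate):

#include <cstdint>

// An offset is encodable as a scaled uimm12 when it is a non-negative
// multiple of the access size and the quotient fits in 12 bits.
static bool isScaledUImm12(int64_t byteOffset, unsigned accessBytes) {
  if (byteOffset % accessBytes != 0)
    return false;                       // must be a multiple of the size
  int64_t scaled = byteOffset / accessBytes;
  return scaled >= 0 && scaled < 4096;  // fits in 12 bits
}

int main() {
  bool ok = isScaledUImm12(32760, 8)    // ldr x0, [x1, #32760]: max uimm12s8
            && !isScaledUImm12(-8, 8)   // negative -> needs LDUR
            && !isScaledUImm12(3, 8);   // unaligned -> needs LDUR
  return ok ? 0 : 1;
}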
+// load sign-extended half-word
+defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh",
+                     [(set GPR32:$Rt,
+                           (sextloadi16 (am_indexed16 GPR64sp:$Rn,
+                                                      uimm12s2:$offset)))]>;
+defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh",
+                     [(set GPR64:$Rt,
+                           (sextloadi16 (am_indexed16 GPR64sp:$Rn,
+                                                      uimm12s2:$offset)))]>;
+
+// load sign-extended byte
+defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb",
+                     [(set GPR32:$Rt,
+                           (sextloadi8 (am_indexed8 GPR64sp:$Rn,
+                                                    uimm12s1:$offset)))]>;
+defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb",
+                     [(set GPR64:$Rt,
+                           (sextloadi8 (am_indexed8 GPR64sp:$Rn,
+                                                    uimm12s1:$offset)))]>;
+
+// load sign-extended word
+defm LDRSW  : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
+                     [(set GPR64:$Rt,
+                           (sextloadi32 (am_indexed32 GPR64sp:$Rn,
+                                                      uimm12s4:$offset)))]>;
+
+// load zero-extended word
+def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+
+// Pre-fetch.
+def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
+                        [(AArch64Prefetch imm:$Rt,
+                                          (am_indexed64 GPR64sp:$Rn,
+                                                        uimm12s8:$offset))]>;
+
+def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;
+
+//---
+// (literal)
+def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">;
+def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">;
+def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">;
+def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">;
+def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">;
+
+// load sign-extended word
+def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">;
+
+// prefetch
+def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
+//                   [(AArch64Prefetch imm:$Rt, tglobaladdr:$label)]>;
+
+//---
+// (unscaled immediate)
+defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur",
+                    [(set GPR64:$Rt,
+                          (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur",
+                    [(set GPR32:$Rt,
+                          (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur",
+                    [(set FPR8:$Rt,
+                          (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur",
+                    [(set FPR16:$Rt,
+                          (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur",
+                    [(set (f32 FPR32:$Rt),
+                          (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur",
+                    [(set (f64 FPR64:$Rt),
+                          (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur",
+                    [(set (f128 FPR128:$Rt),
+                          (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>;
+
+defm LDURHH
+    : LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh",
+             [(set GPR32:$Rt,
+                   (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURBB
+    : LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb",
+             [(set GPR32:$Rt,
+                   (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// Match all load 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+}
+def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+          (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+          (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+}
+
+// anyext -> zext
+def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+// unscaled zext
+def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+
+//---
+// LDR mnemonics fall back to LDUR for negative or unaligned offsets.
+
+// Define new assembler match classes as we want to only match these when
+// they don't otherwise match the scaled addressing mode for LDR/STR. Don't
+// associate a DiagnosticType either, as we want the diagnostic for the
+// canonical form (the scaled operand) to take precedence.
+class SImm9OffsetOperand<int Width> : AsmOperandClass {
+  let Name = "SImm9OffsetFB" # Width;
+  let PredicateMethod = "isSImm9OffsetFB<" # Width # ">";
+  let RenderMethod = "addImmOperands";
+}
+
+def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>;
+def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>;
+def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>;
+def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>;
+def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>;
+
+def simm9_offset_fb8 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB8Operand;
+}
+def simm9_offset_fb16 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB16Operand;
+}
+def simm9_offset_fb32 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB32Operand;
+}
+def simm9_offset_fb64 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB64Operand;
+}
+def simm9_offset_fb128 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB128Operand;
+}
+
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+// load sign-extended half-word
+defm LDURSHW
+    : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh",
+             [(set GPR32:$Rt,
+                   (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSHX
+    : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh",
+             [(set GPR64:$Rt,
+                   (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended byte
+defm LDURSBW
+    : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb",
+             [(set GPR32:$Rt,
+                   (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSBX
+    : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb",
+             [(set GPR64:$Rt,
+                   (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended word
+defm LDURSW
+    : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw",
+             [(set GPR64:$Rt,
+                   (sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// zero and sign extending aliases from generic LDR* mnemonics to LDUR*.
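A sketch of the isSImm9OffsetFB<Width> idea used by these fall-back operands: accept a 9-bit signed byte offset, but only when the canonical scaled (uimm12) form could not encode it, so the LDR mnemonic degrades to LDUR instead of failing. Illustrative C++ (made-up helper names; the real predicate lives in the assembly parser):

#include <cstdint>

static bool isScaledUImm12(int64_t off, unsigned bytes) {
  return off % bytes == 0 && off / bytes >= 0 && off / bytes < 4096;
}

// Fall-back operand: fits in simm9 AND is not already encodable scaled.
static bool isSImm9OffsetFB(int64_t off, unsigned bytes) {
  bool fitsSImm9 = off >= -256 && off <= 255;
  return fitsSImm9 && !isScaledUImm12(off, bytes);
}

int main() {
  // "ldr x0, [x1, #-8]" has no scaled encoding, so it matches the
  // fall-back operand and is emitted as "ldur x0, [x1, #-8]".
  return isSImm9OffsetFB(-8, 8) && !isSImm9OffsetFB(8, 8) ? 0 : 1;
}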
+def : InstAlias<"ldrb $Rt, [$Rn, $offset]", + (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrh $Rt, [$Rn, $offset]", + (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", + (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", + (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", + (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", + (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsw $Rt, [$Rn, $offset]", + (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; + +// Pre-fetch. +defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum", + [(AArch64Prefetch imm:$Rt, + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; + +//--- +// (unscaled immediate, unprivileged) +defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">; +defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">; + +defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">; +defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">; + +// load sign-extended half-word +defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">; +defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">; + +// load sign-extended byte +defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">; +defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">; + +// load sign-extended word +defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">; + +//--- +// (immediate pre-indexed) +def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">; +def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">; +def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">; +def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">; +def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">; +def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">; +def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">; + +// load sign-extended half-word +def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">; +def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">; + +// load sign-extended byte +def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">; +def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">; + +// load zero-extended byte +def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">; +def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">; + +// load sign-extended word +def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">; + +//--- +// (immediate post-indexed) +def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">; +def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">; +def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">; +def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">; +def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">; +def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">; +def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">; + +// load sign-extended half-word +def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">; +def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">; + +// load sign-extended byte +def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">; +def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">; + +// load zero-extended byte +def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">; +def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">; + +// load 
+// load sign-extended word
+def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
-multiclass A64I_bitfield_extract<bits<2> opc, string asmop, SDNode op> {
-  def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd),
-                       (ins GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS),
-                       !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
-                       [(set i32:$Rd, (op i32:$Rn, imm:$ImmR, imm:$ImmS))],
-                       NoItinerary>,
-              Sched<[WriteALU, ReadALU]> {
-    // As above, no disassembler allowed.
-    let isAsmParserOnly = 1;
-  }
-
-  def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd),
-                       (ins GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS),
-                       !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
-                       [(set i64:$Rd, (op i64:$Rn, imm:$ImmR, imm:$ImmS))],
-                       NoItinerary>,
-              Sched<[WriteALU, ReadALU]> {
-    // As above, no disassembler allowed.
-    let isAsmParserOnly = 1;
-  }
-}
+//===----------------------------------------------------------------------===//
+// Store instructions.
+//===----------------------------------------------------------------------===//
-defm SBFX : A64I_bitfield_extract<0b00, "sbfx", A64Sbfx>;
-defm UBFX : A64I_bitfield_extract<0b10, "ubfx", A64Ubfx>;
-
-// Again, variants based on BFM modify Rd so need it as an input too.
-def BFXILwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
-                      (ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS),
-                      "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
-                Sched<[WriteALU, ReadALU, ReadALU]> {
-  // As above, no disassembler allowed.
-  let isAsmParserOnly = 1;
-  let Constraints = "$src = $Rd";
-}
+// Pair (indexed, offset)
+// FIXME: Use dedicated range-checked addressing mode operand here.
+defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">;
+defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">;
+defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">;
+defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">;
+defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (pre-indexed)
+def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">;
+def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">;
+def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">;
+def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">;
+def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (post-indexed)
+def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">;
+def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">;
+def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">;
+def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">;
+def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (no allocate)
+defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">;
+defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">;
+defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">;
+defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">;
+defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">;
+
+//---
+// (Register offset)
+
+// Integer
+defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>;
+defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>;
+defm STRW  : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>;
+defm STRX  : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>;
+
+
+// Floating-point
+defm STRB : Store8RO< 0b00, 1, 0b00, FPR8,  "str", untyped, store>;
+defm STRH : Store16RO<0b01, 1, 0b00, FPR16, "str", f16, store>;
+defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>;
+defm STRD : Store64RO<0b11,  1, 0b00, FPR64,  "str", f64, store>;
+defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>;
+
+multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop,
+                                 Instruction STRW, Instruction STRX> {
+
+  def : Pat<(storeop GPR64:$Rt,
+                     (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+            (STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+                  GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+  def : Pat<(storeop GPR64:$Rt,
+                     (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+            (STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+                  GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
-def BFXILxxii : A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
-                      (ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS),
-                      "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
-                Sched<[WriteALU, ReadALU, ReadALU]> {
-  // As above, no disassembler allowed.
-  let isAsmParserOnly = 1;
-  let Constraints = "$src = $Rd";
-}
+let AddedComplexity = 10 in {
+  // truncstore i64
+  defm : TruncStoreFrom64ROPat;
+  defm : TruncStoreFrom64ROPat;
+  defm : TruncStoreFrom64ROPat;
+}
-// SBFX instructions can do a 1-instruction sign-extension of boolean values.
-def : Pat<(sext_inreg i64:$Rn, i1), (SBFXxxii $Rn, 0, 0)>;
-def : Pat<(sext_inreg i32:$Rn, i1), (SBFXwwii $Rn, 0, 0)>;
-def : Pat<(i64 (sext_inreg (anyext i32:$Rn), i1)),
-          (SBFXxxii (SUBREG_TO_REG (i64 0), $Rn, sub_32), 0, 0)>;
-
-// UBFX makes sense as an implementation of a 64-bit zero-extension too. Could
-// use either 64-bit or 32-bit variant, but 32-bit might be more efficient.
-def : Pat<(i64 (zext i32:$Rn)), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31),
-                                                sub_32)>;
-
-//===-------------------------------
-// 6. Aliases for bitfield insert instructions
-//===-------------------------------
-
-def bfi32_lsb_asmoperand : AsmOperandClass {
-  let Name = "BFI32LSB";
-  let PredicateMethod = "isUImm<5>";
-  let RenderMethod = "addBFILSBOperands<32>";
-  let DiagnosticType = "UImm5";
-}
+multiclass VecROStorePat<ROAddrMode ro, ValueType VecTy, RegisterClass FPR,
+                         Instruction STRW, Instruction STRX> {
+  def : Pat<(store (VecTy FPR:$Rt),
+                   (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+            (STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
-def bfi32_lsb : Operand<i64>,
-                ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> {
-  let PrintMethod = "printBFILSBOperand<32>";
-  let ParserMatchClass = bfi32_lsb_asmoperand;
-}
+  def : Pat<(store (VecTy FPR:$Rt),
+                   (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+            (STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
-def bfi64_lsb_asmoperand : AsmOperandClass {
-  let Name = "BFI64LSB";
-  let PredicateMethod = "isUImm<6>";
-  let RenderMethod = "addBFILSBOperands<64>";
-  let DiagnosticType = "UImm6";
-}
+let AddedComplexity = 10 in {
+// Match all store 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  defm : VecROStorePat;
+  defm : VecROStorePat;
+  defm : VecROStorePat;
+  defm : VecROStorePat;
+}
+
+defm : VecROStorePat;
+defm : VecROStorePat;
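The EXTRACT_SUBREG in TruncStoreFrom64ROPat relies on truncation to i32 being exactly "keep the low 32 bits", so a truncating store of an X register is just a 32-bit-or-narrower store of its sub_32 sub-register. A minimal C++ illustration (truncStore32 is a made-up helper, not LLVM code):

#include <cstdint>
#include <cstring>

// truncstorei32 of a 64-bit value == a plain 32-bit store of its low half.
static void truncStore32(uint8_t *mem, uint64_t x) {
  uint32_t low = uint32_t(x);           // EXTRACT_SUBREG ..., sub_32
  std::memcpy(mem, &low, sizeof(low));  // STRWui / STRWro*
}

int main() {
  uint8_t buf[4];
  truncStore32(buf, 0x1122334455667788ULL);
  uint32_t v;
  std::memcpy(&v, buf, 4);
  return v == 0x55667788u ? 0 : 1;
}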
+  defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v4i32, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v4f32, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
+}
+} // AddedComplexity = 10
+
+//---
+// (unsigned immediate)
+defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str",
+                    [(store GPR64:$Rt,
+                            (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
+defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str",
+                    [(store GPR32:$Rt,
+                            (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
+defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str",
+                    [(store FPR8:$Rt,
+                            (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
+defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str",
+                    [(store (f16 FPR16:$Rt),
+                            (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>;
+defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str",
+                    [(store (f32 FPR32:$Rt),
+                            (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
+defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str",
+                    [(store (f64 FPR64:$Rt),
+                            (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
+defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>;
+
+defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh",
+                     [(truncstorei16 GPR32:$Rt,
+                                     (am_indexed16 GPR64sp:$Rn,
+                                                   uimm12s2:$offset))]>;
+defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1, "strb",
+                     [(truncstorei8 GPR32:$Rt,
+                                    (am_indexed8 GPR64sp:$Rn,
+                                                 uimm12s1:$offset))]>;
+
+// Match all store 64 bits width whose type is compatible with FPR64
+let AddedComplexity = 10 in {
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v2f32 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v8i8 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v4i16 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v2i32 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+}
+def : Pat<(store (v1f64 FPR64:$Rt),
+                 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+          (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt),
+                 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+          (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+ def : Pat<(store (v4f32 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v2f64 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v16i8 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v8i16 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v4i32 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v2i64 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; +} +def : Pat<(store (f128 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + +// truncstore i64 +def : Pat<(truncstorei32 GPR64:$Rt, + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)), + (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(truncstorei16 GPR64:$Rt, + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), + (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)), + (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>; + +} // AddedComplexity = 10 + +//--- +// (unscaled immediate) +defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur", + [(store GPR64:$Rt, + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; +defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur", + [(store GPR32:$Rt, + (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; +defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur", + [(store FPR8:$Rt, + (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; +defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur", + [(store (f16 FPR16:$Rt), + (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; +defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur", + [(store (f32 FPR32:$Rt), + (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; +defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur", + [(store (f64 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; +defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur", + [(store (f128 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>; +defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh", + [(truncstorei16 GPR32:$Rt, + (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; +defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb", + [(truncstorei8 GPR32:$Rt, + (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; + +// Match all store 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. 
+  def : Pat<(store (v2f32 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v8i8 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v4i16 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2i32 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+}
+def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v4f32 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2f64 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v16i8 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v8i16 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v4i32 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2i64 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+}
+
+// unscaled i64 truncating stores
+def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
+  (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
+  (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+  (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+
+//---
+// STR mnemonics fall back to STUR for negative or unaligned offsets.
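+// (e.g. "str x0, [x1, #1]": offset 1 is not a multiple of 8, so no scaled
+// uimm12 STR encoding exists and the alias picks the unscaled STURXi form.)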
+def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>; + +def : InstAlias<"strb $Rt, [$Rn, $offset]", + (STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"strh $Rt, [$Rn, $offset]", + (STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; + +//--- +// (unscaled immediate, unprivileged) +defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">; +defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">; + +defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">; +defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">; + +//--- +// (immediate pre-indexed) +def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str", pre_store, i32>; +def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str", pre_store, i64>; +def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str", pre_store, untyped>; +def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str", pre_store, f16>; +def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str", pre_store, f32>; +def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str", pre_store, f64>; +def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, f128>; + +def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8, i32>; +def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>; + +// truncstore i64 +def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; + +def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; + +def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, 
simm9:$off)>; +def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; + +//--- +// (immediate post-indexed) +def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str", post_store, i32>; +def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str", post_store, i64>; +def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str", post_store, untyped>; +def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str", post_store, f16>; +def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str", post_store, f32>; +def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str", post_store, f64>; +def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>; + +def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>; +def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>; + +// truncstore i64 +def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; + +def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; + +def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def bfi64_lsb : Operand, - ImmLeaf= 0 && Imm <= 63; }]> { - let PrintMethod = "printBFILSBOperand<64>"; - let ParserMatchClass = bfi64_lsb_asmoperand; -} +//===----------------------------------------------------------------------===// +// Load/store exclusive instructions. +//===----------------------------------------------------------------------===// -// Width verification is performed during conversion so width operand can be -// shared between 32/64-bit cases. 
Still needed for the print method though -// because ImmR encodes "width - 1". -def bfi32_width_asmoperand : AsmOperandClass { - let Name = "BFI32Width"; - let PredicateMethod = "isBitfieldWidth<32>"; - let RenderMethod = "addBFIWidthOperands"; - let DiagnosticType = "Width32"; -} +def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">; +def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">; +def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">; +def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">; -def bfi32_width : Operand, - ImmLeaf= 1 && Imm <= 32; }]> { - let PrintMethod = "printBFIWidthOperand"; - let ParserMatchClass = bfi32_width_asmoperand; -} +def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">; +def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">; +def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">; +def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">; -def bfi64_width_asmoperand : AsmOperandClass { - let Name = "BFI64Width"; - let PredicateMethod = "isBitfieldWidth<64>"; - let RenderMethod = "addBFIWidthOperands"; - let DiagnosticType = "Width64"; -} +def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">; +def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">; +def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">; +def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">; -def bfi64_width : Operand, - ImmLeaf= 1 && Imm <= 64; }]> { - let PrintMethod = "printBFIWidthOperand"; - let ParserMatchClass = bfi64_width_asmoperand; -} +def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">; +def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">; +def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">; +def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">; -multiclass A64I_bitfield_insert opc, string asmop> { - def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd), - (ins GPR32:$Rn, bfi32_lsb:$ImmR, bfi32_width:$ImmS), - !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"), - [], NoItinerary>, - Sched<[WriteALU, ReadALU]> { - // As above, no disassembler allowed. - let isAsmParserOnly = 1; - } +def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">; +def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">; +def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">; +def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">; - def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd), - (ins GPR64:$Rn, bfi64_lsb:$ImmR, bfi64_width:$ImmS), - !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"), - [], NoItinerary>, - Sched<[WriteALU, ReadALU]> { - // As above, no disassembler allowed. 
- let isAsmParserOnly = 1; - } -} +def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">; +def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">; +def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">; +def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">; -defm SBFIZ : A64I_bitfield_insert<0b00, "sbfiz">; -defm UBFIZ : A64I_bitfield_insert<0b10, "ubfiz">; +def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">; +def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">; +def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">; +def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">; -def BFIwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd), - (ins GPR32:$src, GPR32:$Rn, bfi32_lsb:$ImmR, bfi32_width:$ImmS), - "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]> { - // As above, no disassembler allowed. - let isAsmParserOnly = 1; - let Constraints = "$src = $Rd"; -} +def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">; +def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">; -def BFIxxii : A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd), - (ins GPR64:$src, GPR64:$Rn, bfi64_lsb:$ImmR, bfi64_width:$ImmS), - "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]> { - // As above, no disassembler allowed. - let isAsmParserOnly = 1; - let Constraints = "$src = $Rd"; -} +def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">; +def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">; //===----------------------------------------------------------------------===// -// Compare and branch (immediate) +// Scaled floating point to integer conversion instructions. //===----------------------------------------------------------------------===// -// Contains: CBZ, CBNZ - -class label_asmoperand : AsmOperandClass { - let Name = "Label" # width # "_" # scale; - let PredicateMethod = "isLabel<" # width # "," # scale # ">"; - let RenderMethod = "addLabelOperands<" # width # ", " # scale # ">"; - let DiagnosticType = "Label"; -} -def label_wid19_scal4_asmoperand : label_asmoperand<19, 4>; - -// All conditional immediate branches are the same really: 19 signed bits scaled -// by the instruction-size (4). -def bcc_target : Operand { - // This label is a 19-bit offset from PC, scaled by the instruction-width: 4. 
- let ParserMatchClass = label_wid19_scal4_asmoperand; - let PrintMethod = "printLabelOperand<19, 4>"; - let EncoderMethod = "getLabelOpValue"; - let OperandType = "OPERAND_PCREL"; -} - -multiclass cmpbr_sizes { - let isBranch = 1, isTerminator = 1 in { - def x : A64I_cmpbr<0b1, op, - (outs), - (ins GPR64:$Rt, bcc_target:$Label), - !strconcat(asmop,"\t$Rt, $Label"), - [(A64br_cc (A64cmp i64:$Rt, 0), SETOP, bb:$Label)], - NoItinerary>, - Sched<[WriteBr, ReadBr]>; - - def w : A64I_cmpbr<0b0, op, - (outs), - (ins GPR32:$Rt, bcc_target:$Label), - !strconcat(asmop,"\t$Rt, $Label"), - [(A64br_cc (A64cmp i32:$Rt, 0), SETOP, bb:$Label)], - NoItinerary>, - Sched<[WriteBr, ReadBr]>; - } +defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_aarch64_neon_fcvtas>; +defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_aarch64_neon_fcvtau>; +defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_aarch64_neon_fcvtms>; +defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_aarch64_neon_fcvtmu>; +defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns>; +defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>; +defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>; +defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>; +defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; +defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; +defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; +defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; +let isCodeGenOnly = 1 in { +defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; +defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; } -defm CBZ : cmpbr_sizes<0b0, "cbz", ImmLeaf >; -defm CBNZ : cmpbr_sizes<0b1, "cbnz", ImmLeaf >; - //===----------------------------------------------------------------------===// -// Conditional branch (immediate) instructions +// Scaled integer to floating point conversion instructions. //===----------------------------------------------------------------------===// -// Contains: B.cc - -def cond_code_asmoperand : AsmOperandClass { - let Name = "CondCode"; - let DiagnosticType = "CondCode"; -} - -def cond_code : Operand, ImmLeaf= 0 && Imm <= 15; -}]> { - let PrintMethod = "printCondCodeOperand"; - let ParserMatchClass = cond_code_asmoperand; -} -def Bcc : A64I_condbr<0b0, 0b0, (outs), - (ins cond_code:$Cond, bcc_target:$Label), - "b.$Cond $Label", [(A64br_cc NZCV, (i32 imm:$Cond), bb:$Label)], - NoItinerary>, - Sched<[WriteBr]> { - let Uses = [NZCV]; - let isBranch = 1; - let isTerminator = 1; -} +defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>; +defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; //===----------------------------------------------------------------------===// -// Conditional compare (immediate) instructions +// Unscaled integer to floating point conversion instruction. 
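+// (FMOV between a GPR and an FP/SIMD register transfers the raw bits with no
+// numeric conversion, e.g. "fmov w0, s0" or "fmov d0, x1".)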
//===----------------------------------------------------------------------===// -// Contains: CCMN, CCMP -def uimm4_asmoperand : AsmOperandClass { - let Name = "UImm4"; - let PredicateMethod = "isUImm<4>"; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "UImm4"; -} - -def uimm4 : Operand { - let ParserMatchClass = uimm4_asmoperand; -} +defm FMOV : UnscaledConversion<"fmov">; -def uimm5 : Operand { - let ParserMatchClass = uimm5_asmoperand; -} +def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>; +def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>; -// The only difference between this operand and the one for instructions like -// B.cc is that it's parsed manually. The other get parsed implicitly as part of -// the mnemonic handling. -def cond_code_op_asmoperand : AsmOperandClass { - let Name = "CondCodeOp"; - let RenderMethod = "addCondCodeOperands"; - let PredicateMethod = "isCondCode"; - let ParserMethod = "ParseCondCodeOperand"; - let DiagnosticType = "CondCode"; -} +//===----------------------------------------------------------------------===// +// Floating point conversion instruction. +//===----------------------------------------------------------------------===// -def cond_code_op : Operand { - let PrintMethod = "printCondCodeOperand"; - let ParserMatchClass = cond_code_op_asmoperand; -} +defm FCVT : FPConversion<"fcvt">; -class A64I_condcmpimmImpl - : A64I_condcmpimm, - Sched<[WriteCMP, ReadCMP]> { - let Defs = [NZCV]; -} +def : Pat<(f32_to_f16 FPR32:$Rn), + (i32 (COPY_TO_REGCLASS + (f32 (SUBREG_TO_REG (i32 0), (FCVTHSr FPR32:$Rn), hsub)), + GPR32))>; -def CCMNwi : A64I_condcmpimmImpl<0b0, 0b0, GPR32, "ccmn">; -def CCMNxi : A64I_condcmpimmImpl<0b1, 0b0, GPR64, "ccmn">; -def CCMPwi : A64I_condcmpimmImpl<0b0, 0b1, GPR32, "ccmp">; -def CCMPxi : A64I_condcmpimmImpl<0b1, 0b1, GPR64, "ccmp">; +def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn), + [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>; //===----------------------------------------------------------------------===// -// Conditional compare (register) instructions +// Floating point single operand instructions. //===----------------------------------------------------------------------===// -// Contains: CCMN, CCMP - -class A64I_condcmpregImpl - : A64I_condcmpreg, - Sched<[WriteCMP, ReadCMP, ReadCMP]> { - let Defs = [NZCV]; -} -def CCMNww : A64I_condcmpregImpl<0b0, 0b0, GPR32, "ccmn">; -def CCMNxx : A64I_condcmpregImpl<0b1, 0b0, GPR64, "ccmn">; -def CCMPww : A64I_condcmpregImpl<0b0, 0b1, GPR32, "ccmp">; -def CCMPxx : A64I_condcmpregImpl<0b1, 0b1, GPR64, "ccmp">; +defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>; +defm FMOV : SingleOperandFPData<0b0000, "fmov">; +defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>; +defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>; +defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>; +defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>; +defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>; +defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; -//===----------------------------------------------------------------------===// -// Conditional select instructions -//===----------------------------------------------------------------------===// -// Contains: CSEL, CSINC, CSINV, CSNEG + aliases CSET, CSETM, CINC, CINV, CNEG - -// Condition code which is encoded as the inversion (semantically rather than -// bitwise) in the instruction. 
-def inv_cond_code_op_asmoperand : AsmOperandClass { - let Name = "InvCondCodeOp"; - let RenderMethod = "addInvCondCodeOperands"; - let PredicateMethod = "isCondCode"; - let ParserMethod = "ParseCondCodeOperand"; - let DiagnosticType = "CondCode"; -} +def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))), + (FRINTNDr FPR64:$Rn)>; -def inv_cond_code_op : Operand { - let ParserMatchClass = inv_cond_code_op_asmoperand; +// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior +// in the C spec. Setting hasSideEffects ensures it is not DCE'd. +// +// TODO: We should really model the FPSR flags correctly. This is really ugly. +let hasSideEffects = 1 in { +defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; } -// Having a separate operand for the selectable use-case is debatable, but gives -// consistency with cond_code. -def inv_cond_XFORM : SDNodeXForm(N->getZExtValue()); - return CurDAG->getTargetConstant(A64InvertCondCode(CC), MVT::i32); -}]>; +defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; -def inv_cond_code - : ImmLeaf= 0 && Imm <= 15; }], inv_cond_XFORM>; - - -multiclass A64I_condselSizes op2, string asmop, - SDPatternOperator select> { - let Uses = [NZCV] in { - def wwwc : A64I_condsel<0b0, op, 0b0, op2, - (outs GPR32:$Rd), - (ins GPR32:$Rn, GPR32:$Rm, cond_code_op:$Cond), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Cond"), - [(set i32:$Rd, (select i32:$Rn, i32:$Rm))], - NoItinerary>, - Sched<[WriteCMP, ReadCMP, ReadCMP]>; - - - def xxxc : A64I_condsel<0b1, op, 0b0, op2, - (outs GPR64:$Rd), - (ins GPR64:$Rn, GPR64:$Rm, cond_code_op:$Cond), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Cond"), - [(set i64:$Rd, (select i64:$Rn, i64:$Rm))], - NoItinerary>, - Sched<[WriteCMP, ReadCMP, ReadCMP]>; - } +let SchedRW = [WriteFDiv] in { +defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>; } -def simple_select - : PatFrag<(ops node:$lhs, node:$rhs), - (A64select_cc NZCV, node:$lhs, node:$rhs, (i32 imm:$Cond))>; - -class complex_select - : PatFrag<(ops node:$lhs, node:$rhs), - (A64select_cc NZCV, node:$lhs, (opnode node:$rhs), (i32 imm:$Cond))>; - - -defm CSEL : A64I_condselSizes<0b0, 0b00, "csel", simple_select>; -defm CSINC : A64I_condselSizes<0b0, 0b01, "csinc", - complex_select>>; -defm CSINV : A64I_condselSizes<0b1, 0b00, "csinv", complex_select>; -defm CSNEG : A64I_condselSizes<0b1, 0b01, "csneg", complex_select>; - -// Now the instruction aliases, which fit nicely into LLVM's model: - -def : InstAlias<"cset $Rd, $Cond", - (CSINCwwwc GPR32:$Rd, WZR, WZR, inv_cond_code_op:$Cond)>; -def : InstAlias<"cset $Rd, $Cond", - (CSINCxxxc GPR64:$Rd, XZR, XZR, inv_cond_code_op:$Cond)>; -def : InstAlias<"csetm $Rd, $Cond", - (CSINVwwwc GPR32:$Rd, WZR, WZR, inv_cond_code_op:$Cond)>; -def : InstAlias<"csetm $Rd, $Cond", - (CSINVxxxc GPR64:$Rd, XZR, XZR, inv_cond_code_op:$Cond)>; -def : InstAlias<"cinc $Rd, $Rn, $Cond", - (CSINCwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>; -def : InstAlias<"cinc $Rd, $Rn, $Cond", - (CSINCxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>; -def : InstAlias<"cinv $Rd, $Rn, $Cond", - (CSINVwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>; -def : InstAlias<"cinv $Rd, $Rn, $Cond", - (CSINVxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>; -def : InstAlias<"cneg $Rd, $Rn, $Cond", - (CSNEGwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>; -def : InstAlias<"cneg $Rd, $Rn, $Cond", - (CSNEGxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>; - -// Finally some 
helper patterns. - -// For CSET (a.k.a. zero-extension of icmp) -def : Pat<(A64select_cc NZCV, 0, 1, cond_code:$Cond), - (CSINCwwwc WZR, WZR, cond_code:$Cond)>; -def : Pat<(A64select_cc NZCV, 1, 0, inv_cond_code:$Cond), - (CSINCwwwc WZR, WZR, inv_cond_code:$Cond)>; - -def : Pat<(A64select_cc NZCV, 0, 1, cond_code:$Cond), - (CSINCxxxc XZR, XZR, cond_code:$Cond)>; -def : Pat<(A64select_cc NZCV, 1, 0, inv_cond_code:$Cond), - (CSINCxxxc XZR, XZR, inv_cond_code:$Cond)>; - -// For CSETM (a.k.a. sign-extension of icmp) -def : Pat<(A64select_cc NZCV, 0, -1, cond_code:$Cond), - (CSINVwwwc WZR, WZR, cond_code:$Cond)>; -def : Pat<(A64select_cc NZCV, -1, 0, inv_cond_code:$Cond), - (CSINVwwwc WZR, WZR, inv_cond_code:$Cond)>; - -def : Pat<(A64select_cc NZCV, 0, -1, cond_code:$Cond), - (CSINVxxxc XZR, XZR, cond_code:$Cond)>; -def : Pat<(A64select_cc NZCV, -1, 0, inv_cond_code:$Cond), - (CSINVxxxc XZR, XZR, inv_cond_code:$Cond)>; - -// CINC, CINV and CNEG get dealt with automatically, which leaves the issue of -// commutativity. The instructions are to complex for isCommutable to be used, -// so we have to create the patterns manually: - -// No commutable pattern for CSEL since the commuted version is isomorphic. - -// CSINC -def :Pat<(A64select_cc NZCV, (add i32:$Rm, 1), i32:$Rn, inv_cond_code:$Cond), - (CSINCwwwc $Rn, $Rm, inv_cond_code:$Cond)>; -def :Pat<(A64select_cc NZCV, (add i64:$Rm, 1), i64:$Rn, inv_cond_code:$Cond), - (CSINCxxxc $Rn, $Rm, inv_cond_code:$Cond)>; - -// CSINV -def :Pat<(A64select_cc NZCV, (not i32:$Rm), i32:$Rn, inv_cond_code:$Cond), - (CSINVwwwc $Rn, $Rm, inv_cond_code:$Cond)>; -def :Pat<(A64select_cc NZCV, (not i64:$Rm), i64:$Rn, inv_cond_code:$Cond), - (CSINVxxxc $Rn, $Rm, inv_cond_code:$Cond)>; - -// CSNEG -def :Pat<(A64select_cc NZCV, (ineg i32:$Rm), i32:$Rn, inv_cond_code:$Cond), - (CSNEGwwwc $Rn, $Rm, inv_cond_code:$Cond)>; -def :Pat<(A64select_cc NZCV, (ineg i64:$Rm), i64:$Rn, inv_cond_code:$Cond), - (CSNEGxxxc $Rn, $Rm, inv_cond_code:$Cond)>; - //===----------------------------------------------------------------------===// -// Data Processing (1 source) instructions +// Floating point two operand instructions. //===----------------------------------------------------------------------===// -// Contains: RBIT, REV16, REV, REV32, CLZ, CLS. - -// We define an unary operator which always fails. We will use this to -// define unary operators that cannot be matched. 
- -class A64I_dp_1src_impl opcode, string asmop, - list patterns, RegisterClass GPRrc, - InstrItinClass itin>: - A64I_dp_1src, - Sched<[WriteALU, ReadALU]>; - -multiclass A64I_dp_1src opcode, string asmop> { - let hasSideEffects = 0 in { - def ww : A64I_dp_1src_impl<0b0, opcode, asmop, [], GPR32, NoItinerary>; - def xx : A64I_dp_1src_impl<0b1, opcode, asmop, [], GPR64, NoItinerary>; - } -} -defm RBIT : A64I_dp_1src<0b000000, "rbit">; -defm CLS : A64I_dp_1src<0b000101, "cls">; -defm CLZ : A64I_dp_1src<0b000100, "clz">; - -def : Pat<(ctlz i32:$Rn), (CLZww $Rn)>; -def : Pat<(ctlz i64:$Rn), (CLZxx $Rn)>; -def : Pat<(ctlz_zero_undef i32:$Rn), (CLZww $Rn)>; -def : Pat<(ctlz_zero_undef i64:$Rn), (CLZxx $Rn)>; - -def : Pat<(cttz i32:$Rn), (CLZww (RBITww $Rn))>; -def : Pat<(cttz i64:$Rn), (CLZxx (RBITxx $Rn))>; -def : Pat<(cttz_zero_undef i32:$Rn), (CLZww (RBITww $Rn))>; -def : Pat<(cttz_zero_undef i64:$Rn), (CLZxx (RBITxx $Rn))>; - - -def REVww : A64I_dp_1src_impl<0b0, 0b000010, "rev", - [(set i32:$Rd, (bswap i32:$Rn))], - GPR32, NoItinerary>; -def REVxx : A64I_dp_1src_impl<0b1, 0b000011, "rev", - [(set i64:$Rd, (bswap i64:$Rn))], - GPR64, NoItinerary>; -def REV32xx : A64I_dp_1src_impl<0b1, 0b000010, "rev32", - [(set i64:$Rd, (bswap (rotr i64:$Rn, (i64 32))))], - GPR64, NoItinerary>; -def REV16ww : A64I_dp_1src_impl<0b0, 0b000001, "rev16", - [(set i32:$Rd, (bswap (rotr i32:$Rn, (i64 16))))], - GPR32, - NoItinerary>; -def REV16xx : A64I_dp_1src_impl<0b1, 0b000001, "rev16", [], GPR64, NoItinerary>; +defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; +let SchedRW = [WriteFDiv] in { +defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; +} +defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_aarch64_neon_fmaxnm>; +defm FMAX : TwoOperandFPData<0b0100, "fmax", AArch64fmax>; +defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_aarch64_neon_fminnm>; +defm FMIN : TwoOperandFPData<0b0101, "fmin", AArch64fmin>; +let SchedRW = [WriteFMul] in { +defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; +defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; +} +defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; + +def : Pat<(v1f64 (AArch64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (AArch64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMINDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (int_aarch64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (int_aarch64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>; //===----------------------------------------------------------------------===// -// Data Processing (2 sources) instructions +// Floating point three operand instructions. 
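+// (These select the fused ISD::FMA node, i.e. a multiply-add with a single
+// rounding step: "fmadd s0, s1, s2, s3" computes s1 * s2 + s3.)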
//===----------------------------------------------------------------------===// -// Contains: CRC32C?[BHWX], UDIV, SDIV, LSLV, LSRV, ASRV, RORV + aliases LSL, -// LSR, ASR, ROR - - -class dp_2src_impl opcode, string asmop, list patterns, - RegisterClass GPRsp, - InstrItinClass itin>: - A64I_dp_2src, - Sched<[WriteALU, ReadALU, ReadALU]>; - -multiclass dp_2src_crc { - def B_www : dp_2src_impl<0b0, {0, 1, 0, c, 0, 0}, - !strconcat(asmop, "b"), [], GPR32, NoItinerary>; - def H_www : dp_2src_impl<0b0, {0, 1, 0, c, 0, 1}, - !strconcat(asmop, "h"), [], GPR32, NoItinerary>; - def W_www : dp_2src_impl<0b0, {0, 1, 0, c, 1, 0}, - !strconcat(asmop, "w"), [], GPR32, NoItinerary>; - def X_wwx : A64I_dp_2src<0b1, {0, 1, 0, c, 1, 1}, 0b0, - !strconcat(asmop, "x\t$Rd, $Rn, $Rm"), - (outs GPR32:$Rd), (ins GPR32:$Rn, GPR64:$Rm), [], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; -} -multiclass dp_2src_zext opcode, string asmop, SDPatternOperator op> { - def www : dp_2src_impl<0b0, - opcode, - asmop, - [(set i32:$Rd, - (op i32:$Rn, (i64 (zext i32:$Rm))))], - GPR32, - NoItinerary>; - def xxx : dp_2src_impl<0b1, - opcode, - asmop, - [(set i64:$Rd, (op i64:$Rn, i64:$Rm))], - GPR64, - NoItinerary>; -} +defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>; +defm FMSUB : ThreeOperandFPData<0, 1, "fmsub", + TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >; +defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd", + TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >; +defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", + TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >; +// The following def pats catch the case where the LHS of an FMA is negated. +// The TriOpFrag above catches the case where the middle operand is negated. -multiclass dp_2src opcode, string asmop, SDPatternOperator op> { - def www : dp_2src_impl<0b0, - opcode, - asmop, - [(set i32:$Rd, (op i32:$Rn, i32:$Rm))], - GPR32, - NoItinerary>; - def xxx : dp_2src_impl<0b1, - opcode, - asmop, - [(set i64:$Rd, (op i64:$Rn, i64:$Rm))], - GPR64, - NoItinerary>; -} +// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike +// the NEON variant. +def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)), + (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; -// Here we define the data processing 2 source instructions. -defm CRC32 : dp_2src_crc<0b0, "crc32">; -defm CRC32C : dp_2src_crc<0b1, "crc32c">; +def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)), + (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; -let SchedRW = [WriteDiv, ReadDiv, ReadDiv] in { - defm UDIV : dp_2src<0b000010, "udiv", udiv>; - defm SDIV : dp_2src<0b000011, "sdiv", sdiv>; -} +// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and +// "(-a) + b*(-c)". +def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))), + (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; -let SchedRW = [WriteALUs, ReadALU, ReadALU] in { - defm LSLV : dp_2src_zext<0b001000, "lsl", shl>; - defm LSRV : dp_2src_zext<0b001001, "lsr", srl>; - defm ASRV : dp_2src_zext<0b001010, "asr", sra>; - defm RORV : dp_2src_zext<0b001011, "ror", rotr>; -} +def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))), + (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; -// Extra patterns for an incoming 64-bit value for a 32-bit -// operation. Since the LLVM operations are undefined (as in C) if the -// RHS is out of range, it's perfectly permissible to discard the high -// bits of the GPR64. 
-def : Pat<(shl i32:$Rn, i64:$Rm), - (LSLVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>; -def : Pat<(srl i32:$Rn, i64:$Rm), - (LSRVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>; -def : Pat<(sra i32:$Rn, i64:$Rm), - (ASRVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>; -def : Pat<(rotr i32:$Rn, i64:$Rm), - (RORVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>; - -// Here we define the aliases for the data processing 2 source instructions. -def LSL_mnemonic : MnemonicAlias<"lslv", "lsl">; -def LSR_mnemonic : MnemonicAlias<"lsrv", "lsr">; -def ASR_menmonic : MnemonicAlias<"asrv", "asr">; -def ROR_menmonic : MnemonicAlias<"rorv", "ror">; +def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))), + (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; + +def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))), + (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; //===----------------------------------------------------------------------===// -// Data Processing (3 sources) instructions +// Floating point comparison instructions. //===----------------------------------------------------------------------===// -// Contains: MADD, MSUB, SMADDL, SMSUBL, SMULH, UMADDL, UMSUBL, UMULH -// + aliases MUL, MNEG, SMULL, SMNEGL, UMULL, UMNEGL - -class A64I_dp3_4operand opcode, RegisterClass AccReg, - ValueType AccTy, RegisterClass SrcReg, - string asmop, dag pattern> - : A64I_dp3, - Sched<[WriteMAC, ReadMAC, ReadMAC, ReadMAC]> { - bits<5> Ra; - let Inst{14-10} = Ra; - - RegisterClass AccGPR = AccReg; - RegisterClass SrcGPR = SrcReg; -} - -def MADDwwww : A64I_dp3_4operand<0b0, 0b000000, GPR32, i32, GPR32, "madd", - (add i32:$Ra, (mul i32:$Rn, i32:$Rm))>; -def MADDxxxx : A64I_dp3_4operand<0b1, 0b000000, GPR64, i64, GPR64, "madd", - (add i64:$Ra, (mul i64:$Rn, i64:$Rm))>; - -def MSUBwwww : A64I_dp3_4operand<0b0, 0b000001, GPR32, i32, GPR32, "msub", - (sub i32:$Ra, (mul i32:$Rn, i32:$Rm))>; -def MSUBxxxx : A64I_dp3_4operand<0b1, 0b000001, GPR64, i64, GPR64, "msub", - (sub i64:$Ra, (mul i64:$Rn, i64:$Rm))>; - -def SMADDLxwwx : A64I_dp3_4operand<0b1, 0b000010, GPR64, i64, GPR32, "smaddl", - (add i64:$Ra, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>; -def SMSUBLxwwx : A64I_dp3_4operand<0b1, 0b000011, GPR64, i64, GPR32, "smsubl", - (sub i64:$Ra, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>; - -def UMADDLxwwx : A64I_dp3_4operand<0b1, 0b001010, GPR64, i64, GPR32, "umaddl", - (add i64:$Ra, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>; -def UMSUBLxwwx : A64I_dp3_4operand<0b1, 0b001011, GPR64, i64, GPR32, "umsubl", - (sub i64:$Ra, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>; - -let isCommutable = 1, PostEncoderMethod = "fixMulHigh" in { - def UMULHxxx : A64I_dp3<0b1, 0b001100, (outs GPR64:$Rd), - (ins GPR64:$Rn, GPR64:$Rm), - "umulh\t$Rd, $Rn, $Rm", - [(set i64:$Rd, (mulhu i64:$Rn, i64:$Rm))], - NoItinerary>, - Sched<[WriteMAC, ReadMAC, ReadMAC]>; - - def SMULHxxx : A64I_dp3<0b1, 0b000100, (outs GPR64:$Rd), - (ins GPR64:$Rn, GPR64:$Rm), - "smulh\t$Rd, $Rn, $Rm", - [(set i64:$Rd, (mulhs i64:$Rn, i64:$Rm))], - NoItinerary>, - Sched<[WriteMAC, ReadMAC, ReadMAC]>; -} -multiclass A64I_dp3_3operand { - def : InstAlias; +defm FCMPE : FPComparison<1, "fcmpe">; +defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>; - def : Pat; -} +//===----------------------------------------------------------------------===// +// Floating point conditional comparison instructions. 
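+// (FCCMP/FCCMPE compare Rn with Rm only when the input condition holds;
+// otherwise NZCV is set to the immediate, e.g. "fccmp s0, s1, #4, ne".)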
+//===----------------------------------------------------------------------===// -defm : A64I_dp3_3operand<"mul", MADDwwww, WZR, (mul i32:$Rn, i32:$Rm)>; -defm : A64I_dp3_3operand<"mul", MADDxxxx, XZR, (mul i64:$Rn, i64:$Rm)>; +defm FCCMPE : FPCondComparison<1, "fccmpe">; +defm FCCMP : FPCondComparison<0, "fccmp">; -defm : A64I_dp3_3operand<"mneg", MSUBwwww, WZR, - (sub 0, (mul i32:$Rn, i32:$Rm))>; -defm : A64I_dp3_3operand<"mneg", MSUBxxxx, XZR, - (sub 0, (mul i64:$Rn, i64:$Rm))>; +//===----------------------------------------------------------------------===// +// Floating point conditional select instruction. +//===----------------------------------------------------------------------===// -defm : A64I_dp3_3operand<"smull", SMADDLxwwx, XZR, - (mul (i64 (sext i32:$Rn)), (sext i32:$Rm))>; -defm : A64I_dp3_3operand<"smnegl", SMSUBLxwwx, XZR, - (sub 0, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>; +defm FCSEL : FPCondSelect<"fcsel">; -defm : A64I_dp3_3operand<"umull", UMADDLxwwx, XZR, - (mul (i64 (zext i32:$Rn)), (zext i32:$Rm))>; -defm : A64I_dp3_3operand<"umnegl", UMSUBLxwwx, XZR, - (sub 0, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>; +// CSEL instructions providing f128 types need to be handled by a +// pseudo-instruction since the eventual code will need to introduce basic +// blocks and control flow. +def F128CSEL : Pseudo<(outs FPR128:$Rd), + (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond), + [(set (f128 FPR128:$Rd), + (AArch64csel FPR128:$Rn, FPR128:$Rm, + (i32 imm:$cond), NZCV))]> { + let Uses = [NZCV]; + let usesCustomInserter = 1; +} //===----------------------------------------------------------------------===// -// Exception generation +// Floating point immediate move. //===----------------------------------------------------------------------===// -// Contains: SVC, HVC, SMC, BRK, HLT, DCPS1, DCPS2, DCPS3 - -def uimm16_asmoperand : AsmOperandClass { - let Name = "UImm16"; - let PredicateMethod = "isUImm<16>"; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "UImm16"; -} -def uimm16 : Operand { - let ParserMatchClass = uimm16_asmoperand; +let isReMaterializable = 1 in { +defm FMOV : FPMoveImmediate<"fmov">; } -class A64I_exceptImpl opc, bits<2> ll, string asmop> - : A64I_exception, - Sched<[WriteBr]> { - let isBranch = 1; - let isTerminator = 1; -} +//===----------------------------------------------------------------------===// +// Advanced SIMD two vector instructions. 
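+// (Unary per-lane operations: abs/neg, counts, conversions, and the
+// compare-against-zero forms such as "cmeq v0.4s, v1.4s, #0".)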
+//===----------------------------------------------------------------------===// -def SVCi : A64I_exceptImpl<0b000, 0b01, "svc">; -def HVCi : A64I_exceptImpl<0b000, 0b10, "hvc">; -def SMCi : A64I_exceptImpl<0b000, 0b11, "smc">; -def BRKi : A64I_exceptImpl<0b001, 0b00, "brk">; -def HLTi : A64I_exceptImpl<0b010, 0b00, "hlt">; +defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>; +defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>; +defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; +defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>; +defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", AArch64cmgez>; +defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>; +defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>; +defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>; +defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>; +defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>; + +defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; +defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>; +defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; +defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", AArch64fcmlez>; +defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; +defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>; +defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>; +defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">; +def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))), + (FCVTLv4i16 V64:$Rn)>; +def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), + (i64 4)))), + (FCVTLv8i16 V128:$Rn)>; +def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; +def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn), + (i64 2))))), + (FCVTLv4i32 V128:$Rn)>; + +defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>; +defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>; +defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>; +defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>; +defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">; +def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))), + (FCVTNv4i16 V128:$Rn)>; +def : Pat<(concat_vectors V64:$Rd, + (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), + (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; +def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))), + (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>; +defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>; +defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", + int_aarch64_neon_fcvtxn>; +defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; +defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; +let isCodeGenOnly = 1 in { +defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", + int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", + int_aarch64_neon_fcvtzu>; +} +defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; +defm FRECPE : SIMDTwoVectorFP<0, 1, 
0b11101, "frecpe", int_aarch64_neon_frecpe>; +defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>; +defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; +defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; +defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>; +defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>; +defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>; +defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>; +defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>; +defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>; +defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg", + UnOpFrag<(sub immAllZerosV, node:$LHS)> >; +defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>; +// Aliases for MVN -> NOT. +def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}", + (NOTv8i8 V64:$Vd, V64:$Vn)>; +def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}", + (NOTv16i8 V128:$Vd, V128:$Vn)>; + +def : Pat<(AArch64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>; +def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>; +def : Pat<(AArch64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>; +def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>; +def : Pat<(AArch64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>; +def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>; +def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>; + +def : Pat<(AArch64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; + +def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; + +defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>; +defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>; +defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>; +defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>; +defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp", + BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >; +defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>; +defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>; +defm SHLL : SIMDVectorLShiftLongBySizeBHS; +defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; +defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; +defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>; +defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>; +defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>; +defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", + BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >; +defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", + int_aarch64_neon_uaddlp>; +defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>; +defm UQXTN 
: SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>;
+defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>;
+defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>;
+defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
+defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
+
+def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
+
+// Patterns for vector long shift (by element width). These need to match all
+// three of zext, sext and anyext so it's easier to pull the patterns out of the
+// definition.
+multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
+  def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
+            (SHLLv8i8 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
+            (SHLLv16i8 V128:$Rn)>;
+  def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
+            (SHLLv4i16 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
+            (SHLLv8i16 V128:$Rn)>;
+  def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
+            (SHLLv2i32 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
+            (SHLLv4i32 V128:$Rn)>;
+}
+
+defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;

-def DCPS1i : A64I_exceptImpl<0b101, 0b01, "dcps1">;
-def DCPS2i : A64I_exceptImpl<0b101, 0b10, "dcps2">;
-def DCPS3i : A64I_exceptImpl<0b101, 0b11, "dcps3">;

+//===----------------------------------------------------------------------===//
+// Advanced SIMD three vector instructions.
+//===----------------------------------------------------------------------===//

-// The immediate is optional for the DCPS instructions, defaulting to 0.
-def : InstAlias<"dcps1", (DCPS1i 0)>; -def : InstAlias<"dcps2", (DCPS2i 0)>; -def : InstAlias<"dcps3", (DCPS3i 0)>; +defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>; +defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>; +defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>; +defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>; +defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; +defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>; +defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>; +defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>; +defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>; +defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>; +defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>; +defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>; +defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; +defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>; +defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; +defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_aarch64_neon_fmaxnm>; +defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>; +defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", AArch64fmax>; +defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>; +defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_aarch64_neon_fminnm>; +defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>; +defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", AArch64fmin>; + +// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the +// instruction expects the addend first, while the fma intrinsic puts it last. +defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla", + TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; +defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls", + TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; + +// The following def pats catch the case where the LHS of an FMA is negated. +// The TriOpFrag above catches the case where the middle operand is negated. 
+def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)), + (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), + (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>; + +def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), + (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; + +defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>; +defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>; +defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>; +defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", + TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; +defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", + TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >; +defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; +defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; +defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >; +defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>; +defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>; +defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; +defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; +defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_aarch64_neon_smax>; +defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>; +defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_aarch64_neon_smin>; +defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>; +defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>; +defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>; +defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>; +defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>; +defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>; +defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; +defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; +defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; +defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >; +defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>; +defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>; +defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; +defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; +defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_aarch64_neon_umax>; +defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>; +defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_aarch64_neon_umin>; +defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; +defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; +defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; +defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", 
int_aarch64_neon_uqsub>; +defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>; +defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; +defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; + +defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; +defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", + BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; +defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; +defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; +defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", + TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; +defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; +defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", + BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; +defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; + +def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; + +def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; +def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; +def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; +def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; + +def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>; +def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; +def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; +def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; + +def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" # + "|cmls.8b\t$dst, $src1, $src2}", + (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" # + "|cmls.16b\t$dst, $src1, $src2}", + (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" # + "|cmls.4h\t$dst, $src1, $src2}", + (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" # + "|cmls.8h\t$dst, $src1, $src2}", + (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" # + "|cmls.2s\t$dst, $src1, $src2}", + (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" # + "|cmls.4s\t$dst, $src1, $src2}", + (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" # + "|cmls.2d\t$dst, $src1, $src2}", + (CMHSv2i64 
V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" # + "|cmlo.8b\t$dst, $src1, $src2}", + (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" # + "|cmlo.16b\t$dst, $src1, $src2}", + (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" # + "|cmlo.4h\t$dst, $src1, $src2}", + (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" # + "|cmlo.8h\t$dst, $src1, $src2}", + (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" # + "|cmlo.2s\t$dst, $src1, $src2}", + (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" # + "|cmlo.4s\t$dst, $src1, $src2}", + (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" # + "|cmlo.2d\t$dst, $src1, $src2}", + (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" # + "|cmle.8b\t$dst, $src1, $src2}", + (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" # + "|cmle.16b\t$dst, $src1, $src2}", + (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" # + "|cmle.4h\t$dst, $src1, $src2}", + (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" # + "|cmle.8h\t$dst, $src1, $src2}", + (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" # + "|cmle.2s\t$dst, $src1, $src2}", + (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" # + "|cmle.4s\t$dst, $src1, $src2}", + (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" # + "|cmle.2d\t$dst, $src1, $src2}", + (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" # + "|cmlt.8b\t$dst, $src1, $src2}", + (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" # + "|cmlt.16b\t$dst, $src1, $src2}", + (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" # + "|cmlt.4h\t$dst, $src1, $src2}", + (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" # + "|cmlt.8h\t$dst, $src1, $src2}", + (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" # + "|cmlt.2s\t$dst, $src1, $src2}", + (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" # + "|cmlt.4s\t$dst, $src1, $src2}", + (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # + "|cmlt.2d\t$dst, $src1, $src2}", + (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" # + "|fcmle.2s\t$dst, $src1, $src2}", + (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" # + "|fcmle.4s\t$dst, $src1, $src2}", + (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # + "|fcmle.2d\t$dst, $src1, $src2}", + (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # + 
"|fcmlt.2s\t$dst, $src1, $src2}", + (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" # + "|fcmlt.4s\t$dst, $src1, $src2}", + (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # + "|fcmlt.2d\t$dst, $src1, $src2}", + (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # + "|facle.2s\t$dst, $src1, $src2}", + (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" # + "|facle.4s\t$dst, $src1, $src2}", + (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # + "|facle.2d\t$dst, $src1, $src2}", + (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # + "|faclt.2s\t$dst, $src1, $src2}", + (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" # + "|faclt.4s\t$dst, $src1, $src2}", + (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" # + "|faclt.2d\t$dst, $src1, $src2}", + (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; //===----------------------------------------------------------------------===// -// Extract (immediate) +// Advanced SIMD three scalar instructions. //===----------------------------------------------------------------------===// -// Contains: EXTR + alias ROR - -def EXTRwwwi : A64I_extract<0b0, 0b000, 0b0, - (outs GPR32:$Rd), - (ins GPR32:$Rn, GPR32:$Rm, bitfield32_imm:$LSB), - "extr\t$Rd, $Rn, $Rm, $LSB", - [(set i32:$Rd, - (A64Extr i32:$Rn, i32:$Rm, imm:$LSB))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; -def EXTRxxxi : A64I_extract<0b1, 0b000, 0b1, - (outs GPR64:$Rd), - (ins GPR64:$Rn, GPR64:$Rm, bitfield64_imm:$LSB), - "extr\t$Rd, $Rn, $Rm, $LSB", - [(set i64:$Rd, - (A64Extr i64:$Rn, i64:$Rm, imm:$LSB))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - -def : InstAlias<"ror $Rd, $Rs, $LSB", - (EXTRwwwi GPR32:$Rd, GPR32:$Rs, GPR32:$Rs, bitfield32_imm:$LSB)>; -def : InstAlias<"ror $Rd, $Rs, $LSB", - (EXTRxxxi GPR64:$Rd, GPR64:$Rs, GPR64:$Rs, bitfield64_imm:$LSB)>; - -def : Pat<(rotr i32:$Rn, bitfield32_imm:$LSB), - (EXTRwwwi $Rn, $Rn, bitfield32_imm:$LSB)>; -def : Pat<(rotr i64:$Rn, bitfield64_imm:$LSB), - (EXTRxxxi $Rn, $Rn, bitfield64_imm:$LSB)>; + +defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>; +defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", AArch64cmeq>; +defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", AArch64cmge>; +defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>; +defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>; +defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>; +defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>; +defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>; +def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FABD64 FPR64:$Rn, FPR64:$Rm)>; +defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge", + int_aarch64_neon_facge>; +defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt", + int_aarch64_neon_facgt>; +defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; +defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", 
int_aarch64_neon_fmulx>; +defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>; +defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; +defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; +defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_aarch64_neon_sqrshl>; +defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl>; +defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub>; +defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_aarch64_neon_srshl>; +defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_aarch64_neon_sshl>; +defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>; +defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd>; +defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_aarch64_neon_uqrshl>; +defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>; +defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>; +defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>; +defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>; + +def : InstAlias<"cmls $dst, $src1, $src2", + (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmle $dst, $src1, $src2", + (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmlo $dst, $src1, $src2", + (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmlt $dst, $src1, $src2", + (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"fcmle $dst, $src1, $src2", + (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"fcmle $dst, $src1, $src2", + (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"fcmlt $dst, $src1, $src2", + (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"fcmlt $dst, $src1, $src2", + (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"facle $dst, $src1, $src2", + (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"facle $dst, $src1, $src2", + (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"faclt $dst, $src1, $src2", + (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"faclt $dst, $src1, $src2", + (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; //===----------------------------------------------------------------------===// -// Floating-point compare instructions +// Advanced SIMD three scalar instructions (mixed operands). 
 //===----------------------------------------------------------------------===//
-// Contains: FCMP, FCMPE
+defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
+                                        int_aarch64_neon_sqdmulls_scalar>;
+defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
+defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;
+
+def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd),
+                   (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+                                                          (i32 FPR32:$Rm))))),
+          (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
+                   (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+                                                          (i32 FPR32:$Rm))))),
+          (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
-def fpzero_asmoperand : AsmOperandClass {
-  let Name = "FPZero";
-  let ParserMethod = "ParseFPImmOperand";
-  let DiagnosticType = "FPZero";
-}
+//===----------------------------------------------------------------------===//
+// Advanced SIMD two scalar instructions.
+//===----------------------------------------------------------------------===//
-def fpz32 : Operand<f32>,
-            ComplexPattern<f32, 1, "SelectFPZeroOperand", [fpimm]> {
-  let ParserMatchClass = fpzero_asmoperand;
-  let PrintMethod = "printFPZeroOperand";
-  let DecoderMethod = "DecodeFPZeroOperand";
-}
-
-def fpz64 : Operand<f64>,
-            ComplexPattern<f64, 1, "SelectFPZeroOperand", [fpimm]> {
-  let ParserMatchClass = fpzero_asmoperand;
-  let PrintMethod = "printFPZeroOperand";
-  let DecoderMethod = "DecodeFPZeroOperand";
-}
-
-def fpz64movi : Operand<i64>,
-            ComplexPattern<f64, 1, "SelectFPZeroOperand", [fpimm]> {
-  let ParserMatchClass = fpzero_asmoperand;
-  let PrintMethod = "printFPZeroOperand";
-  let DecoderMethod = "DecodeFPZeroOperand";
-}
-
-multiclass A64I_fpcmpSignal<bits<2> type, bit imm, dag ins, dag pattern> {
-  def _quiet : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b0, imm, 0b0, 0b0, 0b0},
-                          (outs), ins, "fcmp\t$Rn, $Rm", [pattern],
-                          NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-    let Defs = [NZCV];
-  }
-
-  def _sig : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b1, imm, 0b0, 0b0, 0b0},
-                        (outs), ins, "fcmpe\t$Rn, $Rm", [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-    let Defs = [NZCV];
-  }
-}
-
-defm FCMPss : A64I_fpcmpSignal<0b00, 0b0, (ins FPR32:$Rn, FPR32:$Rm),
-                               (set NZCV, (A64cmp f32:$Rn, f32:$Rm))>;
-defm FCMPdd : A64I_fpcmpSignal<0b01, 0b0, (ins FPR64:$Rn, FPR64:$Rm),
-                               (set NZCV, (A64cmp f64:$Rn, f64:$Rm))>;
-
-// What would be Rm should be written as 0; note that even though it's called
-// "$Rm" here to fit in with the InstrFormats, it's actually an immediate.
-defm FCMPsi : A64I_fpcmpSignal<0b00, 0b1, (ins FPR32:$Rn, fpz32:$Rm),
-                               (set NZCV, (A64cmp f32:$Rn, fpz32:$Rm))>;
-
-defm FCMPdi : A64I_fpcmpSignal<0b01, 0b1, (ins FPR64:$Rn, fpz64:$Rm),
-                               (set NZCV, (A64cmp f64:$Rn, fpz64:$Rm))>;
-
-
-//===----------------------------------------------------------------------===//
-// Floating-point conditional compare instructions
-//===----------------------------------------------------------------------===//
-// Contains: FCCMP, FCCMPE
-
-class A64I_fpccmpImpl<bits<2> type, bit op, RegisterClass FPR, string asmop>
-  : A64I_fpccmp<0b0, 0b0, type, op,
-                (outs),
-                (ins FPR:$Rn, FPR:$Rm, uimm4:$NZCVImm, cond_code_op:$Cond),
-                !strconcat(asmop, "\t$Rn, $Rm, $NZCVImm, $Cond"),
-                [], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-  let Defs = [NZCV];
-}
-
-def FCCMPss : A64I_fpccmpImpl<0b00, 0b0, FPR32, "fccmp">;
-def FCCMPEss : A64I_fpccmpImpl<0b00, 0b1, FPR32, "fccmpe">;
-def FCCMPdd : A64I_fpccmpImpl<0b01, 0b0, FPR64, "fccmp">;
-def FCCMPEdd : A64I_fpccmpImpl<0b01, 0b1, FPR64, "fccmpe">;
-
-//===----------------------------------------------------------------------===//
-// Floating-point conditional select instructions
-//===----------------------------------------------------------------------===//
-// Contains: FCSEL
-
-let Uses = [NZCV] in {
-  def FCSELsssc : A64I_fpcondsel<0b0, 0b0, 0b00, (outs FPR32:$Rd),
-                                 (ins FPR32:$Rn, FPR32:$Rm, cond_code_op:$Cond),
-                                 "fcsel\t$Rd, $Rn, $Rm, $Cond",
-                                 [(set f32:$Rd,
-                                       (simple_select f32:$Rn, f32:$Rm))],
-                                 NoItinerary>,
-                  Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-
-  def FCSELdddc : A64I_fpcondsel<0b0, 0b0, 0b01, (outs FPR64:$Rd),
-                                 (ins FPR64:$Rn, FPR64:$Rm, cond_code_op:$Cond),
-                                 "fcsel\t$Rd, $Rn, $Rm, $Cond",
-                                 [(set f64:$Rd,
-                                       (simple_select f64:$Rn, f64:$Rm))],
-                                 NoItinerary>,
-                  Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-}
-
-//===----------------------------------------------------------------------===//
-// Floating-point data-processing (1 source)
-//===----------------------------------------------------------------------===//
-// Contains: FMOV, FABS, FNEG, FSQRT, FCVT, FRINT[NPMZAXI].
-
-def FPNoUnop : PatFrag<(ops node:$val), (fneg node:$val),
-                       [{ (void)N; return false; }]>;
-
-// First we do the fairly trivial bunch with uniform "OP s, s" and "OP d, d"
-// syntax. Default to no pattern because most are odd enough not to have one.
-multiclass A64I_fpdp1sizes<bits<6> opcode, string asmstr,
-                           SDPatternOperator opnode = FPNoUnop> {
-  def ss : A64I_fpdp1<0b0, 0b0, 0b00, opcode, (outs FPR32:$Rd), (ins FPR32:$Rn),
-                      !strconcat(asmstr, "\t$Rd, $Rn"),
-                      [(set f32:$Rd, (opnode f32:$Rn))],
-                      NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def dd : A64I_fpdp1<0b0, 0b0, 0b01, opcode, (outs FPR64:$Rd), (ins FPR64:$Rn),
-                      !strconcat(asmstr, "\t$Rd, $Rn"),
-                      [(set f64:$Rd, (opnode f64:$Rn))],
-                      NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm FMOV : A64I_fpdp1sizes<0b000000, "fmov">;
-defm FABS : A64I_fpdp1sizes<0b000001, "fabs", fabs>;
-defm FNEG : A64I_fpdp1sizes<0b000010, "fneg", fneg>;
-let SchedRW = [WriteFPSqrt, ReadFPSqrt] in {
-  defm FSQRT : A64I_fpdp1sizes<0b000011, "fsqrt", fsqrt>;
-}
-
-defm FRINTN : A64I_fpdp1sizes<0b001000, "frintn">;
-defm FRINTP : A64I_fpdp1sizes<0b001001, "frintp", fceil>;
-defm FRINTM : A64I_fpdp1sizes<0b001010, "frintm", ffloor>;
-defm FRINTZ : A64I_fpdp1sizes<0b001011, "frintz", ftrunc>;
-defm FRINTA : A64I_fpdp1sizes<0b001100, "frinta">;
-defm FRINTX : A64I_fpdp1sizes<0b001110, "frintx", frint>;
-defm FRINTI : A64I_fpdp1sizes<0b001111, "frinti", fnearbyint>;
-
-// The FCVT instructions have different source and destination register-types,
-// but the fields are uniform everywhere a D-register (say) crops up. Package
-// this information in a Record.
-class FCVTRegType<RegisterClass rc, bits<2> fld, ValueType vt> {
-  RegisterClass Class = rc;
-  ValueType VT = vt;
-  bit t1 = fld{1};
-  bit t0 = fld{0};
-}
-
-def FCVT16 : FCVTRegType<FPR16, 0b11, f16>;
-def FCVT32 : FCVTRegType<FPR32, 0b00, f32>;
-def FCVT64 : FCVTRegType<FPR64, 0b01, f64>;
-
-class A64I_fpdp1_fcvt<FCVTRegType DestReg, FCVTRegType SrcReg,
-                      SDPatternOperator opnode>
-  : A64I_fpdp1<0b0, 0b0, {SrcReg.t1, SrcReg.t0},
-               {0,0,0,1, DestReg.t1, DestReg.t0},
-               (outs DestReg.Class:$Rd), (ins SrcReg.Class:$Rn),
-               "fcvt\t$Rd, $Rn",
-               [(set DestReg.VT:$Rd, (opnode SrcReg.VT:$Rn))], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-def FCVTds : A64I_fpdp1_fcvt<FCVT64, FCVT32, fextend>;
-def FCVThs : A64I_fpdp1_fcvt<FCVT16, FCVT32, fround>;
-def FCVTsd : A64I_fpdp1_fcvt<FCVT32, FCVT64, fround>;
-def FCVThd : A64I_fpdp1_fcvt<FCVT16, FCVT64, fround>;
-def FCVTsh : A64I_fpdp1_fcvt<FCVT32, FCVT16, fextend>;
-def FCVTdh : A64I_fpdp1_fcvt<FCVT64, FCVT16, fextend>;
-
-
-//===----------------------------------------------------------------------===//
-// Floating-point data-processing (2 sources) instructions
-//===----------------------------------------------------------------------===//
-// Contains: FMUL, FDIV, FADD, FSUB, FMAX, FMIN, FMAXNM, FMINNM, FNMUL
-
-def FPNoBinop : PatFrag<(ops node:$lhs, node:$rhs), (fadd node:$lhs, node:$rhs),
-                        [{ (void)N; return false; }]>;
-
-multiclass A64I_fpdp2sizes<bits<4> opcode, string asmstr,
-                           SDPatternOperator opnode> {
-  def sss : A64I_fpdp2<0b0, 0b0, 0b00, opcode,
-                       (outs FPR32:$Rd),
-                       (ins FPR32:$Rn, FPR32:$Rm),
-                       !strconcat(asmstr, "\t$Rd, $Rn, $Rm"),
-                       [(set f32:$Rd, (opnode f32:$Rn, f32:$Rm))],
-                       NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-  def ddd : A64I_fpdp2<0b0, 0b0, 0b01, opcode,
-                       (outs FPR64:$Rd),
-                       (ins FPR64:$Rn, FPR64:$Rm),
-                       !strconcat(asmstr, "\t$Rd, $Rn, $Rm"),
-                       [(set f64:$Rd, (opnode f64:$Rn, f64:$Rm))],
-                       NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-}
-
-let isCommutable = 1 in {
-  let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-    defm FMUL : A64I_fpdp2sizes<0b0000, "fmul", fmul>;
-  }
-  defm FADD : A64I_fpdp2sizes<0b0010, "fadd", fadd>;
-
-  // No patterns for these.
-  defm FMAX : A64I_fpdp2sizes<0b0100, "fmax", FPNoBinop>;
-  defm FMIN : A64I_fpdp2sizes<0b0101, "fmin", FPNoBinop>;
-  defm FMAXNM : A64I_fpdp2sizes<0b0110, "fmaxnm", FPNoBinop>;
-  defm FMINNM : A64I_fpdp2sizes<0b0111, "fminnm", FPNoBinop>;
-
-  let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-    defm FNMUL : A64I_fpdp2sizes<0b1000, "fnmul",
-                                 PatFrag<(ops node:$lhs, node:$rhs),
-                                         (fneg (fmul node:$lhs, node:$rhs))> >;
-  }
-}
-
-let SchedRW = [WriteFPDiv, ReadFPDiv, ReadFPDiv] in {
-  defm FDIV : A64I_fpdp2sizes<0b0001, "fdiv", fdiv>;
-}
-defm FSUB : A64I_fpdp2sizes<0b0011, "fsub", fsub>;
-
-//===----------------------------------------------------------------------===//
-// Floating-point data-processing (3 sources) instructions
-//===----------------------------------------------------------------------===//
-// Contains: FMADD, FMSUB, FNMADD, FNMSUB
-
-def fmsub : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
-                    (fma (fneg node:$Rn), node:$Rm, node:$Ra)>;
-def fnmsub : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
-                     (fma node:$Rn, node:$Rm, (fneg node:$Ra))>;
-def fnmadd : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
-                     (fma (fneg node:$Rn), node:$Rm, (fneg node:$Ra))>;
-
-class A64I_fpdp3Impl<string asmop, RegisterClass FPR, ValueType VT,
-                     bits<2> type, bit o1, bit o0, SDPatternOperator fmakind>
-  : A64I_fpdp3<0b0, 0b0, type, o1, o0, (outs FPR:$Rd),
-               (ins FPR:$Rn, FPR:$Rm, FPR:$Ra),
-               !strconcat(asmop,"\t$Rd, $Rn, $Rm, $Ra"),
-               [(set VT:$Rd, (fmakind VT:$Rn, VT:$Rm, VT:$Ra))],
-               NoItinerary>,
-    Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]>;
-
-def FMADDssss : A64I_fpdp3Impl<"fmadd", FPR32, f32, 0b00, 0b0, 0b0, fma>;
-def FMSUBssss : A64I_fpdp3Impl<"fmsub", FPR32, f32, 0b00, 0b0, 0b1, fmsub>;
-def FNMADDssss : A64I_fpdp3Impl<"fnmadd", FPR32, f32, 0b00, 0b1, 0b0, fnmadd>;
-def FNMSUBssss : A64I_fpdp3Impl<"fnmsub", FPR32, f32, 0b00, 0b1, 0b1, fnmsub>;
-
-def FMADDdddd : A64I_fpdp3Impl<"fmadd", FPR64, f64, 0b01, 0b0, 0b0, fma>;
-def FMSUBdddd : A64I_fpdp3Impl<"fmsub", FPR64, f64, 0b01, 0b0, 0b1, fmsub>;
-def FNMADDdddd : A64I_fpdp3Impl<"fnmadd", FPR64, f64, 0b01, 0b1, 0b0, fnmadd>;
-def FNMSUBdddd : A64I_fpdp3Impl<"fnmsub", FPR64, f64, 0b01, 0b1, 0b1, fnmsub>;
-
-// Extra patterns for when we're allowed to optimise separate multiplication and
-// addition.
-let Predicates = [HasFPARMv8, UseFusedMAC] in {
-def : Pat<(f32 (fadd FPR32:$Ra, (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))),
-          (FMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(f32 (fsub FPR32:$Ra, (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))),
-          (FMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(f32 (fsub (f32 (fneg FPR32:$Ra)), (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))),
-          (FNMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(f32 (fsub (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)), FPR32:$Ra)),
-          (FNMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-
-def : Pat<(f64 (fadd FPR64:$Ra, (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))),
-          (FMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(f64 (fsub FPR64:$Ra, (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))),
-          (FMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(f64 (fsub (f64 (fneg FPR64:$Ra)), (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))),
-          (FNMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(f64 (fsub (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)), FPR64:$Ra)),
-          (FNMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-}
-
-
-//===----------------------------------------------------------------------===//
-// Floating-point <-> fixed-point conversion instructions
-//===----------------------------------------------------------------------===//
-// Contains: FCVTZS, FCVTZU, SCVTF, UCVTF
-
-// #1-#32 allowed, encoded as "64 - <immediate>".
-def fixedpos_asmoperand_i32 : AsmOperandClass {
-  let Name = "CVTFixedPos32";
-  let RenderMethod = "addCVTFixedPosOperands";
-  let PredicateMethod = "isCVTFixedPos<32>";
-  let DiagnosticType = "CVTFixedPos32";
-}
-
-// Also encoded as "64 - <immediate>" but #1-#64 allowed.
-def fixedpos_asmoperand_i64 : AsmOperandClass {
-  let Name = "CVTFixedPos64";
-  let RenderMethod = "addCVTFixedPosOperands";
-  let PredicateMethod = "isCVTFixedPos<64>";
-  let DiagnosticType = "CVTFixedPos64";
-}
-
-// We need the cartesian product of f32/f64 i32/i64 operands for
-// conversions:
-//   + Selection needs to use operands of correct floating type
-//   + Assembly parsing and decoding depend on integer width
-class cvtfix_i32_op<ValueType FloatVT>
-  : Operand<i32>,
-    ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<32>", [fpimm]> {
-  let ParserMatchClass = fixedpos_asmoperand_i32;
-  let DecoderMethod = "DecodeCVT32FixedPosOperand";
-  let PrintMethod = "printCVTFixedPosOperand";
-}
-
-class cvtfix_i64_op<ValueType FloatVT>
-  : Operand<i64>,
-    ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<64>", [fpimm]> {
-  let ParserMatchClass = fixedpos_asmoperand_i64;
-  let PrintMethod = "printCVTFixedPosOperand";
-}
-
-// Because of the proliferation of weird operands, it's not really
-// worth going for a multiclass here. Oh well.
-
-class A64I_fptofix<bit sf, bits<2> type, bits<3> opcode,
-                   RegisterClass GPR, RegisterClass FPR,
-                   ValueType DstTy, ValueType SrcTy,
-                   Operand scale_op, string asmop, SDNode cvtop>
-  : A64I_fpfixed<sf, 0b0, type, 0b11, opcode,
-                 (outs GPR:$Rd), (ins FPR:$Rn, scale_op:$Scale),
-                 !strconcat(asmop, "\t$Rd, $Rn, $Scale"),
-                 [(set DstTy:$Rd, (cvtop (fmul SrcTy:$Rn, scale_op:$Scale)))],
-                 NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-def FCVTZSwsi : A64I_fptofix<0b0, 0b00, 0b000, GPR32, FPR32, i32, f32,
-                             cvtfix_i32_op<f32>, "fcvtzs", fp_to_sint>;
-def FCVTZSxsi : A64I_fptofix<0b1, 0b00, 0b000, GPR64, FPR32, i64, f32,
-                             cvtfix_i64_op<f32>, "fcvtzs", fp_to_sint>;
-def FCVTZUwsi : A64I_fptofix<0b0, 0b00, 0b001, GPR32, FPR32, i32, f32,
-                             cvtfix_i32_op<f32>, "fcvtzu", fp_to_uint>;
-def FCVTZUxsi : A64I_fptofix<0b1, 0b00, 0b001, GPR64, FPR32, i64, f32,
-                             cvtfix_i64_op<f32>, "fcvtzu", fp_to_uint>;
-
-def FCVTZSwdi : A64I_fptofix<0b0, 0b01, 0b000, GPR32, FPR64, i32, f64,
-                             cvtfix_i32_op<f64>, "fcvtzs", fp_to_sint>;
-def FCVTZSxdi : A64I_fptofix<0b1, 0b01, 0b000, GPR64, FPR64, i64, f64,
-                             cvtfix_i64_op<f64>, "fcvtzs", fp_to_sint>;
-def FCVTZUwdi : A64I_fptofix<0b0, 0b01, 0b001, GPR32, FPR64, i32, f64,
-                             cvtfix_i32_op<f64>, "fcvtzu", fp_to_uint>;
-def FCVTZUxdi : A64I_fptofix<0b1, 0b01, 0b001, GPR64, FPR64, i64, f64,
-                             cvtfix_i64_op<f64>, "fcvtzu", fp_to_uint>;
-
-
-class A64I_fixtofp<bit sf, bits<2> type, bits<3> opcode,
-                   RegisterClass FPR, RegisterClass GPR,
-                   ValueType DstTy, ValueType SrcTy,
-                   Operand scale_op, string asmop, SDNode cvtop>
-  : A64I_fpfixed<sf, 0b0, type, 0b00, opcode,
-                 (outs FPR:$Rd), (ins GPR:$Rn, scale_op:$Scale),
-                 !strconcat(asmop, "\t$Rd, $Rn, $Scale"),
-                 [(set DstTy:$Rd, (fdiv (cvtop SrcTy:$Rn), scale_op:$Scale))],
-                 NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-def SCVTFswi : A64I_fixtofp<0b0, 0b00, 0b010, FPR32, GPR32, f32, i32,
-                            cvtfix_i32_op<f32>, "scvtf", sint_to_fp>;
-def SCVTFsxi : A64I_fixtofp<0b1, 0b00, 0b010, FPR32, GPR64, f32, i64,
-                            cvtfix_i64_op<f32>, "scvtf", sint_to_fp>;
-def UCVTFswi : A64I_fixtofp<0b0, 0b00, 0b011, FPR32, GPR32, f32, i32,
-                            cvtfix_i32_op<f32>, "ucvtf", uint_to_fp>;
-def UCVTFsxi : A64I_fixtofp<0b1, 0b00, 0b011, FPR32, GPR64, f32, i64,
-                            cvtfix_i64_op<f32>, "ucvtf", uint_to_fp>;
-def SCVTFdwi : A64I_fixtofp<0b0, 0b01, 0b010, FPR64, GPR32, f64, i32,
-                            cvtfix_i32_op<f64>, "scvtf", sint_to_fp>;
-def SCVTFdxi : A64I_fixtofp<0b1, 0b01, 0b010, FPR64, GPR64, f64, i64,
-                            cvtfix_i64_op<f64>, "scvtf", sint_to_fp>;
-def UCVTFdwi : A64I_fixtofp<0b0, 0b01, 0b011, FPR64, GPR32, f64, i32,
-                            cvtfix_i32_op<f64>, "ucvtf", uint_to_fp>;
-def UCVTFdxi : A64I_fixtofp<0b1, 0b01, 0b011, FPR64, GPR64, f64, i64,
-                            cvtfix_i64_op<f64>, "ucvtf", uint_to_fp>;
-
-//===----------------------------------------------------------------------===//
-// Floating-point <-> integer conversion instructions
-//===----------------------------------------------------------------------===//
-// Contains: FCVTZS, FCVTZU, SCVTF, UCVTF
-
-class A64I_fpintI<bit sf, bits<2> type, bits<2> rmode, bits<3> opcode,
-                  RegisterClass DestPR, RegisterClass SrcPR, string asmop>
-  : A64I_fpint<sf, 0b0, type, rmode, opcode,
-               (outs DestPR:$Rd), (ins SrcPR:$Rn),
-               !strconcat(asmop, "\t$Rd, $Rn"), [], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass A64I_fptointRM<bits<2> rmode, bit o2, string asmop> {
-  def Sws : A64I_fpintI<0b0, 0b00, rmode, {o2, 0, 0},
-                        GPR32, FPR32, asmop # "s">;
-  def Sxs : A64I_fpintI<0b1, 0b00, rmode, {o2, 0, 0},
-                        GPR64, FPR32, asmop # "s">;
-  def Uws : A64I_fpintI<0b0, 0b00, rmode, {o2, 0, 1},
-                        GPR32, FPR32, asmop # "u">;
-  def Uxs : A64I_fpintI<0b1, 0b00, rmode, {o2, 0, 1},
-                        GPR64, FPR32, asmop # "u">;
-
-  def Swd : A64I_fpintI<0b0, 0b01, rmode, {o2, 0, 0},
-                        GPR32, FPR64, asmop # "s">;
-  def Sxd : A64I_fpintI<0b1, 0b01, rmode, {o2, 0, 0},
-                        GPR64, FPR64, asmop # "s">;
-  def Uwd : A64I_fpintI<0b0, 0b01, rmode, {o2, 0, 1},
-                        GPR32, FPR64, asmop # "u">;
-  def Uxd : A64I_fpintI<0b1, 0b01, rmode, {o2, 0, 1},
-                        GPR64, FPR64, asmop # "u">;
-}
-
-defm FCVTN : A64I_fptointRM<0b00, 0b0, "fcvtn">;
-defm FCVTP : A64I_fptointRM<0b01, 0b0, "fcvtp">;
-defm FCVTM : A64I_fptointRM<0b10, 0b0, "fcvtm">;
-defm FCVTZ : A64I_fptointRM<0b11, 0b0, "fcvtz">;
-defm FCVTA : A64I_fptointRM<0b00, 0b1, "fcvta">;
-
-let Predicates = [HasFPARMv8] in {
-def : Pat<(i32 (fp_to_sint f32:$Rn)), (FCVTZSws $Rn)>;
-def : Pat<(i64 (fp_to_sint f32:$Rn)), (FCVTZSxs $Rn)>;
-def : Pat<(i32 (fp_to_uint f32:$Rn)), (FCVTZUws $Rn)>;
-def : Pat<(i64 (fp_to_uint f32:$Rn)), (FCVTZUxs $Rn)>;
-def : Pat<(i32 (fp_to_sint f64:$Rn)), (FCVTZSwd $Rn)>;
-def : Pat<(i64 (fp_to_sint f64:$Rn)), (FCVTZSxd $Rn)>;
-def : Pat<(i32 (fp_to_uint f64:$Rn)), (FCVTZUwd $Rn)>;
-def : Pat<(i64 (fp_to_uint f64:$Rn)), (FCVTZUxd $Rn)>;
-}
-
-multiclass A64I_inttofp<bit o0, string asmop> {
-  def CVTFsw : A64I_fpintI<0b0, 0b00, 0b00, {0, 1, o0}, FPR32, GPR32, asmop>;
-  def CVTFsx : A64I_fpintI<0b1, 0b00, 0b00, {0, 1, o0}, FPR32, GPR64, asmop>;
-  def CVTFdw : A64I_fpintI<0b0, 0b01, 0b00, {0, 1, o0}, FPR64, GPR32, asmop>;
-  def CVTFdx : A64I_fpintI<0b1, 0b01, 0b00, {0, 1, o0}, FPR64, GPR64, asmop>;
-}
-
-defm S : A64I_inttofp<0b0, "scvtf">;
-defm U : A64I_inttofp<0b1, "ucvtf">;
-
-let Predicates = [HasFPARMv8] in {
-def : Pat<(f32 (sint_to_fp i32:$Rn)), (SCVTFsw $Rn)>;
-def : Pat<(f32 (sint_to_fp i64:$Rn)), (SCVTFsx $Rn)>;
-def : Pat<(f64 (sint_to_fp i32:$Rn)), (SCVTFdw $Rn)>;
-def : Pat<(f64 (sint_to_fp i64:$Rn)), (SCVTFdx $Rn)>;
-def : Pat<(f32 (uint_to_fp i32:$Rn)), (UCVTFsw $Rn)>;
-def : Pat<(f32 (uint_to_fp i64:$Rn)), (UCVTFsx $Rn)>;
-def : Pat<(f64 (uint_to_fp i32:$Rn)), (UCVTFdw $Rn)>;
-def : Pat<(f64 (uint_to_fp i64:$Rn)), (UCVTFdx $Rn)>;
-}
-
-def FMOVws : A64I_fpintI<0b0, 0b00, 0b00, 0b110, GPR32, FPR32, "fmov">;
-def FMOVsw : A64I_fpintI<0b0, 0b00, 0b00, 0b111, FPR32, GPR32, "fmov">;
-def FMOVxd : A64I_fpintI<0b1, 0b01, 0b00, 0b110, GPR64, FPR64, "fmov">;
-def FMOVdx : A64I_fpintI<0b1, 0b01, 0b00, 0b111, FPR64, GPR64, "fmov">;
-
-let Predicates = [HasFPARMv8] in {
-def : Pat<(i32 (bitconvert f32:$Rn)), (FMOVws $Rn)>;
-def : Pat<(f32 (bitconvert i32:$Rn)), (FMOVsw $Rn)>;
-def : Pat<(i64 (bitconvert f64:$Rn)), (FMOVxd $Rn)>;
-def : Pat<(f64 (bitconvert i64:$Rn)), (FMOVdx $Rn)>;
-}
-
-def lane1_asmoperand : AsmOperandClass {
-  let Name = "Lane1";
-  let RenderMethod = "addImmOperands";
-  let DiagnosticType = "Lane1";
-}
-
-def lane1 : Operand<i32> {
-  let ParserMatchClass = lane1_asmoperand;
-  let PrintMethod = "printBareImmOperand";
-}
-
-let DecoderMethod = "DecodeFMOVLaneInstruction" in {
-  def FMOVxv : A64I_fpint<0b1, 0b0, 0b10, 0b01, 0b110,
-                          (outs GPR64:$Rd), (ins VPR128:$Rn, lane1:$Lane),
-                          "fmov\t$Rd, $Rn.d[$Lane]", [], NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU]>;
-
-  def FMOVvx : A64I_fpint<0b1, 0b0, 0b10, 0b01, 0b111,
-                          (outs VPR128:$Rd), (ins GPR64:$Rn, lane1:$Lane),
-                          "fmov\t$Rd.d[$Lane], $Rn", [], NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-let Predicates = [HasFPARMv8] in {
-def : InstAlias<"fmov $Rd, $Rn.2d[$Lane]",
-                (FMOVxv GPR64:$Rd, VPR128:$Rn, lane1:$Lane), 0b0>;
-
-def : InstAlias<"fmov $Rd.2d[$Lane], $Rn",
-                (FMOVvx VPR128:$Rd, GPR64:$Rn, lane1:$Lane), 0b0>;
-}
+defm ABS    : SIMDTwoScalarD<    0, 0b01011, "abs", int_aarch64_neon_abs>;
+defm CMEQ   : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>;
+defm CMGE   : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>;
+defm CMGT   : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>;
+defm CMLE   : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>;
+defm CMLT   : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>;
+defm FCMEQ  : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
+defm FCMGE  : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
+defm FCMGT  : SIMDCmpTwoScalarSD<0, 1,
0b01100, "fcmgt", AArch64fcmgtz>; +defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", AArch64fcmlez>; +defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; +defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">; +defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">; +defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">; +defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">; +defm FCVTNS : SIMDTwoScalarSD< 0, 0, 0b11010, "fcvtns">; +defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">; +defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">; +defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">; +def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; +defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">; +defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">; +defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">; +defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">; +defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">; +defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", + UnOpFrag<(sub immAllZerosV, node:$LHS)> >; +defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", AArch64sitof>; +defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; +defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; +defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>; +defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>; +defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", + int_aarch64_neon_suqadd>; +defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", AArch64uitof>; +defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>; +defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", + int_aarch64_neon_usqadd>; + +def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>; + +def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))), + (FCVTASv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))), + (FCVTAUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtms (v1f64 FPR64:$Rn))), + (FCVTMSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtmu (v1f64 FPR64:$Rn))), + (FCVTMUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtns (v1f64 FPR64:$Rn))), + (FCVTNSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtnu (v1f64 FPR64:$Rn))), + (FCVTNUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))), + (FCVTPSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))), + (FCVTPUv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))), + (FRECPEv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))), + (FRECPXv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))), + (FRECPXv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))), + (FRSQRTEv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; + +// If an integer is about to be converted to a floating point value, +// just load it on the floating point unit. +// Here are the patterns for 8 and 16-bits to float. 
+// 8-bits -> float.
+multiclass UIntToFPROLoadPat<ValueType DstTy, ValueType SrcTy,
+                             SDPatternOperator loadop, Instruction UCVTF,
+                             ROAddrMode ro, Instruction LDRW, Instruction LDRX,
+                             SubRegIndex sub> {
+  def : Pat<(DstTy (uint_to_fp (SrcTy
+                     (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm,
+                                      ro.Wext:$extend))))),
+           (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
+                                 (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
+                                 sub))>;
+
+  def : Pat<(DstTy (uint_to_fp (SrcTy
+                     (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm,
+                                      ro.Wext:$extend))))),
+           (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
+                                 (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
+                                 sub))>;
+}
+
+defm : UIntToFPROLoadPat<f32, i32, zextloadi8, UCVTFv1i32, ro8,
+                         LDRBroW, LDRBroX, bsub>;
+def : Pat <(f32 (uint_to_fp (i32
+               (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
+def : Pat <(f32 (uint_to_fp (i32
+               (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
+// 16-bits -> float.
+defm : UIntToFPROLoadPat<f32, i32, zextloadi16, UCVTFv1i32, ro16,
+                         LDRHroW, LDRHroX, hsub>;
+def : Pat <(f32 (uint_to_fp (i32
+               (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
+def : Pat <(f32 (uint_to_fp (i32
+               (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
+// 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// 64-bits integer to 32-bits floating point, not possible with
+// UCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, zextloadi8, UCVTFv1i64, ro8,
+                         LDRBroW, LDRBroX, bsub>;
+def : Pat <(f64 (uint_to_fp (i32
+               (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
+def : Pat <(f64 (uint_to_fp (i32
+               (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
+// 16-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, zextloadi16, UCVTFv1i64, ro16,
+                         LDRHroW, LDRHroX, hsub>;
+def : Pat <(f64 (uint_to_fp (i32
+               (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
+def : Pat <(f64 (uint_to_fp (i32
+               (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
+// 32-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, load, UCVTFv1i64, ro32,
+                         LDRSroW, LDRSroX, ssub>;
+def : Pat <(f64 (uint_to_fp (i32
+               (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>;
+def : Pat <(f64 (uint_to_fp (i32
+               (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
+// 64-bits -> double are handled in target specific dag combine:
+// performIntToFpCombine.
 //===----------------------------------------------------------------------===//
-// Floating-point immediate instructions
+// Advanced SIMD three different-sized vector instructions.
 //===----------------------------------------------------------------------===//
-// Contains: FMOV
-def fpimm_asmoperand : AsmOperandClass {
-  let Name = "FMOVImm";
-  let ParserMethod = "ParseFPImmOperand";
-  let DiagnosticType = "FPImm";
-}
-
-// The MCOperand for these instructions are the encoded 8-bit values.
-def SDXF_fpimm : SDNodeXForm<fpimm, [{
-  uint32_t Imm8;
-  A64Imms::isFPImm(N->getValueAPF(), Imm8);
-  return CurDAG->getTargetConstant(Imm8, MVT::i32);
+defm ADDHN  : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>;
+defm SUBHN  : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
+defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
+defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
+defm PMULL  : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>;
+defm SABAL  : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
+                                             int_aarch64_neon_sabd>;
+defm SABDL   : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
+                                          int_aarch64_neon_sabd>;
+defm SADDL   : SIMDLongThreeVectorBHS<0, 0b0000, "saddl",
+                 BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
+defm SADDW   : SIMDWideThreeVectorBHS<0, 0b0001, "saddw",
+                 BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
+defm SMLAL   : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMLSL   : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMULL   : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>;
+defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
+                                               int_aarch64_neon_sqadd>;
+defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
+                                               int_aarch64_neon_sqsub>;
+defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
+                                     int_aarch64_neon_sqdmull>;
+defm SSUBL   : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
+                 BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
+defm SSUBW   : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
+                 BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
+defm UABAL   : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
+                                              int_aarch64_neon_uabd>;
+defm UABDL   : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
+                                          int_aarch64_neon_uabd>;
+defm UADDL   : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
+                 BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
+defm UADDW   : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
+                 BinOpFrag<(add node:$LHS, (zext node:$RHS))>>;
+defm UMLAL   : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMLSL   : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMULL   : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>;
+defm USUBL   : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
+                 BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
+defm USUBW   : SIMDWideThreeVectorBHS<1, 0b0011, "usubw",
+                 BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
+
+// Patterns for 64-bit pmull
+def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
+          (PMULLv1i64 V64:$Rn, V64:$Rm)>;
+def : Pat<(int_aarch64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)),
+                                    (vector_extract (v2i64 V128:$Rm), (i64 1))),
+          (PMULLv2i64 V128:$Rn, V128:$Rm)>;
+
+// CodeGen patterns for addhn and subhn instructions, which can actually be
+// written in LLVM IR without too much difficulty.
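[Editor's note: a scalar C++ reference, a sketch and not part of the patch, for what one ADDHN lane computes; SUBHN is identical with a subtraction. This is exactly the trunc/lshr/add shape the patterns below match:]

    #include <cstdint>

    int8_t addhn_lane(int16_t a, int16_t b) {
      // trunc(lshr(add(a, b), 8)): keep the high half of the 16-bit sum.
      uint16_t sum = static_cast<uint16_t>(a + b);
      return static_cast<int8_t>(sum >> 8);
    }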
+ +// ADDHN +def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))), + (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; +def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 16))))), + (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; +def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 32))))), + (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v8i8 V64:$Rd), + (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 8))))), + (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v4i16 V64:$Rd), + (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 16))))), + (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v2i32 V64:$Rd), + (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 32))))), + (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + +// SUBHN +def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))), + (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; +def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 16))))), + (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; +def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 32))))), + (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v8i8 V64:$Rd), + (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 8))))), + (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v4i16 V64:$Rd), + (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 16))))), + (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v2i32 V64:$Rd), + (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 32))))), + (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + +//---------------------------------------------------------------------------- +// AdvSIMD bitwise extract from vector instruction. +//---------------------------------------------------------------------------- + +defm EXT : SIMDBitwiseExtract<"ext">; + +def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; + +// We use EXT to handle extract_subvector to copy the upper 64-bits of a +// 128-bit vector. 
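[Editor's note: how the EXTv16i8-based patterns below work, as a hypothetical scalar sketch (not part of the patch). EXT Vd, Vn, Vn, #8 reads 16 bytes starting at byte 8 of the concatenation Vn:Vn, i.e. it rotates the vector so the high 64 bits land in the low half, where dsub can take them:]

    #include <cstdint>
    #include <cstring>

    void upper_half(const uint8_t v[16], uint8_t out[8]) {
      uint8_t rot[16];
      for (int i = 0; i != 16; ++i)
        rot[i] = v[(i + 8) % 16];  // EXT v, v, #8 (byte rotate)
      std::memcpy(out, rot, 8);    // EXTRACT_SUBREG ..., dsub
    }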
+def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; + + +//---------------------------------------------------------------------------- +// AdvSIMD zip vector +//---------------------------------------------------------------------------- + +defm TRN1 : SIMDZipVector<0b010, "trn1", AArch64trn1>; +defm TRN2 : SIMDZipVector<0b110, "trn2", AArch64trn2>; +defm UZP1 : SIMDZipVector<0b001, "uzp1", AArch64uzp1>; +defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>; +defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>; +defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>; + +//---------------------------------------------------------------------------- +// AdvSIMD TBL/TBX instructions +//---------------------------------------------------------------------------- + +defm TBL : SIMDTableLookup< 0, "tbl">; +defm TBX : SIMDTableLookupTied<1, "tbx">; + +def : Pat<(v8i8 (int_aarch64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), + (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>; +def : Pat<(v16i8 (int_aarch64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))), + (TBLv16i8One V128:$Ri, V128:$Rn)>; + +def : Pat<(v8i8 (int_aarch64_neon_tbx1 (v8i8 V64:$Rd), + (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), + (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>; +def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd), + (v16i8 V128:$Ri), (v16i8 V128:$Rn))), + (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>; + + +//---------------------------------------------------------------------------- +// AdvSIMD scalar CPY instruction +//---------------------------------------------------------------------------- + +defm CPY : SIMDScalarCPY<"cpy">; + +//---------------------------------------------------------------------------- +// AdvSIMD scalar pairwise instructions +//---------------------------------------------------------------------------- + +defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; +defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">; +defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">; +defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">; +defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">; +defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">; +def : Pat<(i64 (int_aarch64_neon_saddv (v2i64 V128:$Rn))), + (ADDPv2i64p V128:$Rn)>; +def : Pat<(i64 (int_aarch64_neon_uaddv (v2i64 V128:$Rn))), + (ADDPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))), + (FADDPv2i32p V64:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))), + (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>; +def : Pat<(f64 (int_aarch64_neon_faddv (v2f64 V128:$Rn))), + (FADDPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fmaxnmv (v2f32 V64:$Rn))), + (FMAXNMPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fmaxnmv (v2f64 V128:$Rn))), + (FMAXNMPv2i64p V128:$Rn)>; +def : Pat<(f32 
(int_aarch64_neon_fmaxv (v2f32 V64:$Rn))), + (FMAXPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fmaxv (v2f64 V128:$Rn))), + (FMAXPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fminnmv (v2f32 V64:$Rn))), + (FMINNMPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fminnmv (v2f64 V128:$Rn))), + (FMINNMPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fminv (v2f32 V64:$Rn))), + (FMINPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fminv (v2f64 V128:$Rn))), + (FMINPv2i64p V128:$Rn)>; + +//---------------------------------------------------------------------------- +// AdvSIMD INS/DUP instructions +//---------------------------------------------------------------------------- + +def DUPv8i8gpr : SIMDDupFromMain<0, 0b00001, ".8b", v8i8, V64, GPR32>; +def DUPv16i8gpr : SIMDDupFromMain<1, 0b00001, ".16b", v16i8, V128, GPR32>; +def DUPv4i16gpr : SIMDDupFromMain<0, 0b00010, ".4h", v4i16, V64, GPR32>; +def DUPv8i16gpr : SIMDDupFromMain<1, 0b00010, ".8h", v8i16, V128, GPR32>; +def DUPv2i32gpr : SIMDDupFromMain<0, 0b00100, ".2s", v2i32, V64, GPR32>; +def DUPv4i32gpr : SIMDDupFromMain<1, 0b00100, ".4s", v4i32, V128, GPR32>; +def DUPv2i64gpr : SIMDDupFromMain<1, 0b01000, ".2d", v2i64, V128, GPR64>; + +def DUPv2i64lane : SIMDDup64FromElement; +def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>; +def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>; +def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>; +def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>; +def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>; +def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>; + +def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))), + (v2f32 (DUPv2i32lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), + (i64 0)))>; +def : Pat<(v4f32 (AArch64dup (f32 FPR32:$Rn))), + (v4f32 (DUPv4i32lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), + (i64 0)))>; +def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))), + (v2f64 (DUPv2i64lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub), + (i64 0)))>; + +def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), + (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>; +def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), + (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>; +def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)), + (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>; + +// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane +// instruction even if the types don't match: we just have to remap the lane +// carefully. N.b. this trick only applies to truncations. 
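[Editor's note: the scale factor behind the VecIndex_x* transforms that follow, as a small sketch (not part of the patch; the helper is hypothetical). On a little-endian vector, lane i of a wide element starts at the same byte as lane i * (wide/narrow) of the register viewed with narrow elements, which is why a truncating dup can simply multiply the lane index:]

    #include <cstdint>

    // e.g. remap_lane(i, 32, 16) == 2 * i, matching VecIndex_x2 below.
    uint64_t remap_lane(uint64_t lane, unsigned srcBits, unsigned dstBits) {
      return lane * (srcBits / dstBits);
    }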
+def VecIndex_x2 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(2 * N->getZExtValue(), MVT::i64);
 }]>;
-
-class fmov_operand<ValueType FT>
-  : Operand<FT>,
-    PatLeaf<(FT fpimm), [{ return A64Imms::isFPImm(N->getValueAPF()); }],
-            SDXF_fpimm> {
-  let PrintMethod = "printFPImmOperand";
-  let ParserMatchClass = fpimm_asmoperand;
-}
-
-def fmov32_operand : fmov_operand<f32>;
-def fmov64_operand : fmov_operand<f64>;
-
-class A64I_fpimm_impl<bits<2> type, RegisterClass Reg, ValueType VT,
-                      Operand fmov_operand>
-  : A64I_fpimm<0b0, 0b0, type, 0b00000,
-               (outs Reg:$Rd),
-               (ins fmov_operand:$Imm8),
-               "fmov\t$Rd, $Imm8",
-               [(set VT:$Rd, fmov_operand:$Imm8)],
-               NoItinerary>,
-    Sched<[WriteFPALU]>;
-
-def FMOVsi : A64I_fpimm_impl<0b00, FPR32, f32, fmov32_operand>;
-def FMOVdi : A64I_fpimm_impl<0b01, FPR64, f64, fmov64_operand>;
-
-//===----------------------------------------------------------------------===//
-// Load-register (literal) instructions
-//===----------------------------------------------------------------------===//
-// Contains: LDR, LDRSW, PRFM
-
-def ldrlit_label_asmoperand : AsmOperandClass {
-  let Name = "LoadLitLabel";
-  let RenderMethod = "addLabelOperands<19, 4>";
-  let DiagnosticType = "Label";
-}
-
-def ldrlit_label : Operand<i64> {
-  let EncoderMethod = "getLoadLitLabelOpValue";
-
-  // This label is a 19-bit offset from PC, scaled by the instruction-width: 4.
-  let PrintMethod = "printLabelOperand<19, 4>";
-  let ParserMatchClass = ldrlit_label_asmoperand;
-  let OperandType = "OPERAND_PCREL";
-}
-
-// Various instructions take an immediate value (which can always be used),
-// where some numbers have a symbolic name to make things easier. These operands
-// and the associated functions abstract away the differences.
-multiclass namedimm<string prefix, string mapper> {
-  def _asmoperand : AsmOperandClass {
-    let Name = "NamedImm" # prefix;
-    let PredicateMethod = "isUImm";
-    let RenderMethod = "addImmOperands";
-    let ParserMethod = "ParseNamedImmOperand<" # mapper # ">";
-    let DiagnosticType = "NamedImm_" # prefix;
-  }
-
-  def _op : Operand<i32> {
-    let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_asmoperand");
-    let PrintMethod = "printNamedImmOperand<" # mapper # ">";
-    let DecoderMethod = "DecodeNamedImmOperand<" # mapper # ">";
-  }
-}
-
-defm prefetch : namedimm<"prefetch", "A64PRFM::PRFMMapper">;
-
-class A64I_LDRlitSimple<bits<2> opc, bit v, RegisterClass OutReg,
-                        list<dag> patterns = []>
-  : A64I_LDRlit<opc, v, (outs OutReg:$Rt), (ins ldrlit_label:$Imm19),
-                "ldr\t$Rt, $Imm19", patterns, NoItinerary>,
-    Sched<[WriteLd]>;
-
-let mayLoad = 1 in {
-  def LDRw_lit : A64I_LDRlitSimple<0b00, 0b0, GPR32>;
-  def LDRx_lit : A64I_LDRlitSimple<0b01, 0b0, GPR64>;
-}
-
-let Predicates = [HasFPARMv8] in {
-def LDRs_lit : A64I_LDRlitSimple<0b00, 0b1, FPR32>;
-def LDRd_lit : A64I_LDRlitSimple<0b01, 0b1, FPR64>;
-}
-
-let mayLoad = 1 in {
-  let Predicates = [HasFPARMv8] in {
-  def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>;
-  }
-
-  def LDRSWx_lit : A64I_LDRlit<0b10, 0b0,
-                               (outs GPR64:$Rt),
-                               (ins ldrlit_label:$Imm19),
-                               "ldrsw\t$Rt, $Imm19",
-                               [], NoItinerary>,
-                   Sched<[WriteLd]>;
-
-  def PRFM_lit : A64I_LDRlit<0b11, 0b0,
-                             (outs), (ins prefetch_op:$Rt, ldrlit_label:$Imm19),
-                             "prfm\t$Rt, $Imm19",
-                             [], NoItinerary>,
-                 Sched<[WriteLd, ReadLd]>;
-}
-
-//===----------------------------------------------------------------------===//
-// Load-store exclusive instructions
-//===----------------------------------------------------------------------===//
-// Contains: STXRB, STXRH, STXR, LDXRB, LDXRH, LDXR.  STXP, LDXP, STLXRB,
-//                  STLXRH, STLXR, LDAXRB, LDAXRH, LDAXR, STLXP, LDAXP, STLRB,
-//                  STLRH, STLR, LDARB, LDARH, LDAR
-
-// Since these instructions have the undefined register bits set to 1 in
-// their canonical form, we need a post encoder method to set those bits
-// to 1 when encoding these instructions. We do this using the
-// fixLoadStoreExclusive function. This function has template parameters:
-//
-// fixLoadStoreExclusive<int hasRs, int hasRt2>
-//
-// hasRs indicates that the instruction uses the Rs field, so we won't set
-// it to 1 (and the same for Rt2). We don't need template parameters for
-// the other register fields since Rt and Rn are always used.
-
-// This operand parses a GPR64xsp register, followed by an optional immediate
-// #0.
-def GPR64xsp0_asmoperand : AsmOperandClass {
-  let Name = "GPR64xsp0";
-  let PredicateMethod = "isWrappedReg";
-  let RenderMethod = "addRegOperands";
-  let ParserMethod = "ParseLSXAddressOperand";
-  // Diagnostics are provided by ParserMethod
-}
-
-def GPR64xsp0 : RegisterOperand<GPR64xsp> {
-  let ParserMatchClass = GPR64xsp0_asmoperand;
-}
-
-//===----------------------------------
-// Store-exclusive (releasing & normal)
-//===----------------------------------
-
-class A64I_SRexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
-                      dag ins, list<dag> pat,
-                      InstrItinClass itin> :
-       A64I_LDSTex_stn {
-  let mayStore = 1;
-  let PostEncoderMethod = "fixLoadStoreExclusive<1,0>";
-  let Constraints = "@earlyclobber $Rs";
-}
-
-multiclass A64I_SRex<string asmstr, bits<3> opcode, string prefix> {
-  def _byte:  A64I_SRexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
-                              (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
-                              [], NoItinerary>,
-              Sched<[WriteSt, ReadSt, ReadSt]>;
-
-  def _hword: A64I_SRexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
-                              (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
-                              [],NoItinerary>,
-              Sched<[WriteSt, ReadSt, ReadSt]>;
-
-  def _word:  A64I_SRexs_impl<0b10, opcode, asmstr,
-                              (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
-                              [], NoItinerary>,
-              Sched<[WriteSt, ReadSt, ReadSt]>;
-
-  def _dword: A64I_SRexs_impl<0b11, opcode, asmstr,
-                              (outs GPR32:$Rs), (ins GPR64:$Rt, GPR64xsp0:$Rn),
-                              [], NoItinerary>,
-              Sched<[WriteSt, ReadSt, ReadSt]>;
-}
-
-defm STXR  : A64I_SRex<"stxr",  0b000, "STXR">;
-defm STLXR : A64I_SRex<"stlxr", 0b001, "STLXR">;
-
-//===----------------------------------
-// Loads
-//===----------------------------------
-
-class A64I_LRexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
-                      dag ins, list<dag> pat,
-                      InstrItinClass itin> :
-       A64I_LDSTex_tn {
-  let mayLoad = 1;
-  let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
-}
-
-multiclass A64I_LRex<string asmstr, bits<3> opcode> {
-  def _byte:  A64I_LRexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
-                            (outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
-                            [], NoItinerary>,
-              Sched<[WriteLd]>;
-
-  def _hword: A64I_LRexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
-                            (outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
-                            [], NoItinerary>,
-              Sched<[WriteLd]>;
-
-  def _word:  A64I_LRexs_impl<0b10, opcode, asmstr,
-                            (outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
-                            [], NoItinerary>,
-              Sched<[WriteLd]>;
-
-  def _dword: A64I_LRexs_impl<0b11, opcode, asmstr,
-                            (outs GPR64:$Rt), (ins GPR64xsp0:$Rn),
-                            [], NoItinerary>,
-              Sched<[WriteLd]>;
-}
-
-defm LDXR  : A64I_LRex<"ldxr",  0b000>;
-defm LDAXR : A64I_LRex<"ldaxr", 0b001>;
-defm LDAR  : A64I_LRex<"ldar",  0b101>;
-
-class acquiring_load<PatFrag base>
-  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
-  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
-  return Ordering == Acquire || Ordering == SequentiallyConsistent;
+def VecIndex_x4 : SDNodeXForm<imm, [{
SDNodeXFormgetTargetConstant(4 * N->getZExtValue(), MVT::i64); }]>; - -def atomic_load_acquire_8 : acquiring_load; -def atomic_load_acquire_16 : acquiring_load; -def atomic_load_acquire_32 : acquiring_load; -def atomic_load_acquire_64 : acquiring_load; - -def : Pat<(atomic_load_acquire_8 i64:$Rn), (LDAR_byte $Rn)>; -def : Pat<(atomic_load_acquire_16 i64:$Rn), (LDAR_hword $Rn)>; -def : Pat<(atomic_load_acquire_32 i64:$Rn), (LDAR_word $Rn)>; -def : Pat<(atomic_load_acquire_64 i64:$Rn), (LDAR_dword $Rn)>; - -//===---------------------------------- -// Store-release (no exclusivity) -//===---------------------------------- - -class A64I_SLexs_impl size, bits<3> opcode, string asm, dag outs, - dag ins, list pat, - InstrItinClass itin> : - A64I_LDSTex_tn { - let mayStore = 1; - let PostEncoderMethod = "fixLoadStoreExclusive<0,0>"; -} - -class releasing_store - : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - return Ordering == Release || Ordering == SequentiallyConsistent; +def VecIndex_x8 : SDNodeXFormgetTargetConstant(8 * N->getZExtValue(), MVT::i64); }]>; -def atomic_store_release_8 : releasing_store; -def atomic_store_release_16 : releasing_store; -def atomic_store_release_32 : releasing_store; -def atomic_store_release_64 : releasing_store; - -multiclass A64I_SLex opcode, string prefix> { - def _byte: A64I_SLexs_impl<0b00, opcode, !strconcat(asmstr, "b"), - (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn), - [(atomic_store_release_8 i64:$Rn, i32:$Rt)], - NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt]>; - - def _hword: A64I_SLexs_impl<0b01, opcode, !strconcat(asmstr, "h"), - (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn), - [(atomic_store_release_16 i64:$Rn, i32:$Rt)], - NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt]>; - - def _word: A64I_SLexs_impl<0b10, opcode, asmstr, - (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn), - [(atomic_store_release_32 i64:$Rn, i32:$Rt)], - NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt]>; - - def _dword: A64I_SLexs_impl<0b11, opcode, asmstr, - (outs), (ins GPR64:$Rt, GPR64xsp0:$Rn), - [(atomic_store_release_64 i64:$Rn, i64:$Rt)], - NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt]>; -} - -defm STLR : A64I_SLex<"stlr", 0b101, "STLR">; - -//===---------------------------------- -// Store-exclusive pair (releasing & normal) -//===---------------------------------- - -class A64I_SPexs_impl size, bits<3> opcode, string asm, dag outs, - dag ins, list pat, - InstrItinClass itin> : - A64I_LDSTex_stt2n { - let mayStore = 1; -} - - -multiclass A64I_SPex opcode> { - def _word: A64I_SPexs_impl<0b10, opcode, asmstr, (outs), - (ins GPR32:$Rs, GPR32:$Rt, GPR32:$Rt2, - GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>; - - def _dword: A64I_SPexs_impl<0b11, opcode, asmstr, (outs), - (ins GPR32:$Rs, GPR64:$Rt, GPR64:$Rt2, - GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>; -} - -defm STXP : A64I_SPex<"stxp", 0b010>; -defm STLXP : A64I_SPex<"stlxp", 0b011>; - -//===---------------------------------- -// Load-exclusive pair (acquiring & normal) -//===---------------------------------- - -class A64I_LPexs_impl size, bits<3> opcode, string asm, dag outs, - dag ins, list pat, - InstrItinClass itin> : - A64I_LDSTex_tt2n { - let mayLoad = 1; - let DecoderMethod = "DecodeLoadPairExclusiveInstruction"; - let PostEncoderMethod = "fixLoadStoreExclusive<0,1>"; -} - -multiclass A64I_LPex opcode> { - def _word: A64I_LPexs_impl<0b10, opcode, asmstr, - (outs GPR32:$Rt, GPR32:$Rt2), - (ins 
GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteLd, WriteLd, ReadLd]>; - - def _dword: A64I_LPexs_impl<0b11, opcode, asmstr, - (outs GPR64:$Rt, GPR64:$Rt2), - (ins GPR64xsp0:$Rn), - [], NoItinerary>, - Sched<[WriteLd, WriteLd, ReadLd]>; -} - -defm LDXP : A64I_LPex<"ldxp", 0b010>; -defm LDAXP : A64I_LPex<"ldaxp", 0b011>; +multiclass DUPWithTruncPats { + def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn), + imm:$idx)))), + (DUP V128:$Rn, (IdxXFORM imm:$idx))>; + + def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn), + imm:$idx)))), + (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>; +} + +defm : DUPWithTruncPats; +defm : DUPWithTruncPats; +defm : DUPWithTruncPats; + +defm : DUPWithTruncPats; +defm : DUPWithTruncPats; +defm : DUPWithTruncPats; + +multiclass DUPWithTrunci64Pats { + def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn), + imm:$idx))))), + (DUP V128:$Rn, (IdxXFORM imm:$idx))>; + + def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn), + imm:$idx))))), + (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>; +} + +defm : DUPWithTrunci64Pats; +defm : DUPWithTrunci64Pats; +defm : DUPWithTrunci64Pats; + +defm : DUPWithTrunci64Pats; +defm : DUPWithTrunci64Pats; +defm : DUPWithTrunci64Pats; + +// SMOV and UMOV definitions, with some extra patterns for convenience +defm SMOV : SMov; +defm UMOV : UMov; + +def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8), + (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8), + (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16), + (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>; +def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16), + (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>; +def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16), + (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>; +def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))), + (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>; + +// Extracting i8 or i16 elements will have the zero-extend transformed to +// an 'and' mask by type legalization since neither i8 nor i16 are legal types +// for AArch64. Match these patterns here since UMOV already zeroes out the high +// bits of the destination register. 
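As a rough C-level illustration of what the sext_inreg patterns above and the UMOV patterns just below are matching (function names are mine, not from the patch; exact assembly depends on the compiler):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Signed lane reads: the later widening is expected to fold into a
       single smov, rather than a umov/mov followed by sxtb or sxth. */
    int32_t lane3_s8(int8x16_t v)  { return vgetq_lane_s8(v, 3);  /* smov w0, v0.b[3] */ }
    int64_t lane5_s16(int16x8_t v) { return vgetq_lane_s16(v, 5); /* smov x0, v0.h[5] */ }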
+def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), + (i32 0xff)), + (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx), + (i32 0xffff)), + (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>; + +defm INS : SIMDIns; + +def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; +def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; + +def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; +def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; + +def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))), + (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), + (i32 FPR32:$Rn), ssub))>; +def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))), + (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (i32 FPR32:$Rn), ssub))>; +def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), + (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + (i64 FPR64:$Rn), dsub))>; + +def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; +def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; +def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>; + +def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), + (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), + (EXTRACT_SUBREG + (INSvi32lane + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), + VectorIndexS:$imm, + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), + (i64 0)), + dsub)>; +def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn), + (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), + (INSvi32lane + V128:$Rn, VectorIndexS:$imm, + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), + (i64 0))>; +def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn), + (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))), + (INSvi64lane + V128:$Rn, VectorIndexD:$imm, + (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)), + (i64 0))>; + +// Copy an element at a constant index in one vector into a constant indexed +// element of another. 
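The vector_insert patterns above all funnel into INS (with SUBREG_TO_REG and EXTRACT_SUBREG shuffling for the 64-bit vector forms), and the int_aarch64_neon_vcopy_lane patterns below cover the lane-to-lane case. A minimal sketch in C, with illustrative names:

    #include <arm_neon.h>

    /* Inserting a scalar into a lane should select a single INS,
       e.g. "mov v0.s[1], v1.s[0]" for the float case. */
    float32x4_t set_lane1(float32x4_t v, float s) {
        return vsetq_lane_f32(s, v, 1);
    }

    /* Lane-to-lane copy, matching the vcopy_lane patterns:
       "ins v0.s[0], v1.s[2]". */
    int32x4_t copy_lane(int32x4_t d, int32x4_t s) {
        return vcopyq_laneq_s32(d, 0, s, 2);
    }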
+// FIXME refactor to a shared class/def parameterized on vector type, vector
+// index type and INS extension
+def : Pat<(v16i8 (int_aarch64_neon_vcopy_lane
+                   (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs),
+                   VectorIndexB:$idx2)),
+          (v16i8 (INSvi8lane
+                   V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2)
+          )>;
+def : Pat<(v8i16 (int_aarch64_neon_vcopy_lane
+                   (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs),
+                   VectorIndexH:$idx2)),
+          (v8i16 (INSvi16lane
+                   V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2)
+          )>;
+def : Pat<(v4i32 (int_aarch64_neon_vcopy_lane
+                   (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs),
+                   VectorIndexS:$idx2)),
+          (v4i32 (INSvi32lane
+                   V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2)
+          )>;
+def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane
+                   (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs),
+                   VectorIndexD:$idx2)),
+          (v2i64 (INSvi64lane
+                   V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2)
+          )>;
+
+multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
+                                ValueType VTScal, Instruction INS> {
+  def : Pat<(VT128 (vector_insert V128:$src,
+                        (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>;
+
+  def : Pat<(VT128 (vector_insert V128:$src,
+                        (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (INS V128:$src, imm:$Immd,
+                 (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>;
+
+  def : Pat<(VT64 (vector_insert V64:$src,
+                        (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub),
+                                 imm:$Immd, V128:$Rn, imm:$Immn),
+                            dsub)>;
+
+  def : Pat<(VT64 (vector_insert V64:$src,
+                        (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (EXTRACT_SUBREG
+                (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd,
+                     (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn),
+                dsub)>;
+}
+
+defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
+defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
+defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, INSvi8lane>;
+defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, INSvi16lane>;
+defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, INSvi32lane>;
+defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, INSvi64lane>;
+
+
+// Floating point vector extractions are codegen'd as either a sequence of
+// subregister extractions, possibly fed by an INS if the lane number is
+// anything other than zero.
+def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
+          (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
+def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
+          (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
+def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
+          (f64 (EXTRACT_SUBREG
+                 (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0,
+                              V128:$Rn, VectorIndexD:$idx),
+                 dsub))>;
+def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
+          (f32 (EXTRACT_SUBREG
+                 (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0,
+                              V128:$Rn, VectorIndexS:$idx),
+                 ssub))>;
+
+// All concat_vectors operations are canonicalised to act on i64 vectors for
+// AArch64. In the general case we need an instruction, which might as well be
+// INS.
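Two consequences of the extraction and concat rules above, sketched in C (illustrative only): lane 0 of a float vector is free because s0/d0 are subregisters of v0, and vcombine becomes the single INS that the ConcatPat class defined next selects.

    #include <arm_neon.h>

    float lane0(float32x4_t v) { return vgetq_lane_f32(v, 0); } /* no instruction: s0 aliases v0 */
    float lane2(float32x4_t v) { return vgetq_lane_f32(v, 2); } /* needs a lane move first */

    /* concat_vectors of two 64-bit halves: one "ins v0.d[1], v1.d[0]". */
    int32x4_t concat(int32x2_t lo, int32x2_t hi) { return vcombine_s32(lo, hi); }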
+class ConcatPat + : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)), + (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1, + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>; + +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; + +// If the high lanes are undef, though, we can just ignore them: +class ConcatUndefPat + : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>; + +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; + +//---------------------------------------------------------------------------- +// AdvSIMD across lanes instructions +//---------------------------------------------------------------------------- + +defm ADDV : SIMDAcrossLanesBHS<0, 0b11011, "addv">; +defm SMAXV : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">; +defm SMINV : SIMDAcrossLanesBHS<0, 0b11010, "sminv">; +defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">; +defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">; +defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">; +defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">; +defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; +defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; +defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; +defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>; + +multiclass SIMDAcrossLanesSignedIntrinsic { +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it + def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + (i64 0)))>; + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + (i64 0)))>; +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it +def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + (i64 0)))>; +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it +def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + (i64 0)))>; +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it +def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + (i64 0)))>; + +def : Pat<(i32 (intOp (v4i32 V128:$Rn))), + (i32 
(EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), + ssub))>; +} + +multiclass SIMDAcrossLanesUnsignedIntrinsic { +// If there is a masking operation keeping only what has been actually +// generated, consume it. + def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + ssub))>; + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + ssub))>; +// If there is a masking operation keeping only what has been actually +// generated, consume it. +def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + ssub))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + ssub))>; + +// If there is a masking operation keeping only what has been actually +// generated, consume it. +def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + ssub))>; +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + ssub))>; +// If there is a masking operation keeping only what has been actually +// generated, consume it. +def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + ssub))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + ssub))>; + +def : Pat<(i32 (intOp (v4i32 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), + ssub))>; + +} + +multiclass SIMDAcrossLanesSignedLongIntrinsic { + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), + (i64 0)))>; + +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), + ssub))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), + ssub))>; + +def : Pat<(i64 (intOp (v4i32 V128:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), + dsub))>; +} + +multiclass SIMDAcrossLanesUnsignedLongIntrinsic { + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), + ssub))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + 
(!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), + ssub))>; + +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), + ssub))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), + ssub))>; + +def : Pat<(i64 (intOp (v4i32 V128:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), + dsub))>; +} + +defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", int_aarch64_neon_saddv>; +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def : Pat<(i32 (int_aarch64_neon_saddv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", int_aarch64_neon_uaddv>; +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def : Pat<(i32 (int_aarch64_neon_uaddv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_aarch64_neon_smaxv>; +def : Pat<(i32 (int_aarch64_neon_smaxv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_aarch64_neon_sminv>; +def : Pat<(i32 (int_aarch64_neon_sminv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_aarch64_neon_umaxv>; +def : Pat<(i32 (int_aarch64_neon_umaxv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_aarch64_neon_uminv>; +def : Pat<(i32 (int_aarch64_neon_uminv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>; +defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>; + +// The vaddlv_s32 intrinsic gets mapped to SADDLP. +def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (SADDLPv2i32_v1i64 V64:$Rn), dsub), + dsub))>; +// The vaddlv_u32 intrinsic gets mapped to UADDLP. 
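The two-lane v2i32 reductions are special-cased to pairwise ops, since a full across-lanes instruction buys nothing for two lanes. A hedged C sketch of the intrinsics these patterns target (the UADDLP mapping is the pattern completed just below; names are mine):

    #include <arm_neon.h>
    #include <stdint.h>

    uint32_t hsum(uint32x2_t v)  { return vaddv_u32(v);  /* addp v0.2s, v0.2s, v0.2s */ }
    uint64_t hsuml(uint32x2_t v) { return vaddlv_u32(v); /* uaddlp v0.1d, v0.2s */ }
    int8_t   hmax(int8x16_t v)   { return vmaxvq_s8(v);  /* smaxv b0, v0.16b */ }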
+def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (UADDLPv2i32_v1i64 V64:$Rn), dsub), + dsub))>; + +//------------------------------------------------------------------------------ +// AdvSIMD modified immediate instructions +//------------------------------------------------------------------------------ + +// AdvSIMD BIC +defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>; +// AdvSIMD ORR +defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>; + +def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>; + +def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>; + +def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +// AdvSIMD FMOV +def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, + "fmov", ".2d", + [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, + "fmov", ".2s", + [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, + "fmov", ".4s", + [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; + +// AdvSIMD MOVI + +// EDIT byte mask: scalar +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi", + [(set FPR64:$Rd, simdimmtype10:$imm8)]>; +// The movi_edit node has the immediate value already encoded, so we use +// a plain imm0_255 here. +def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)), + (MOVID imm0_255:$shift)>; + +def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>; + +def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; + +// EDIT byte mask: 2d + +// The movi_edit node has the immediate value already encoded, so we use +// a plain imm0_255 in the pattern +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, + simdimmtype10, + "movi", ".2d", + [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; + + +// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. 
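As a rough illustration of what the modified-immediate MOVI patterns (including the HasZCZ zeroing patterns just below) are expected to catch, under the usual NEON intrinsics (function names are mine):

    #include <arm_neon.h>

    uint32x4_t zeros(void) { return vdupq_n_u32(0); }    /* movi v0.2d, #0 */
    uint8x16_t ones(void)  { return vdupq_n_u8(0xff); }  /* movi with the all-ones byte mask */
    float      fzero(void) { return 0.0f; }              /* movi v0.2d, #0 on cores with
                                                            zero-cycle zeroing; s0 aliases v0 */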
+// Complexity is added to break a tie with a plain MOVI. +let AddedComplexity = 1 in { +def : Pat<(f32 fpimm0), + (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, + Requires<[HasZCZ]>; +def : Pat<(f64 fpimm0), + (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, + Requires<[HasZCZ]>; +} + +def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>; + +def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>; + +def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>; +def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>; + +// EDIT per word & halfword: 2s, 4h, 4s, & 8h +defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">; + +def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : Pat<(v2i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv2i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv4i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv4i16 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv8i16 imm0_255:$imm8, imm:$shift)>; + +// EDIT per word: 2s & 4s with MSL shifter +def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s", + [(set (v2i32 V64:$Rd), + (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; +def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", + [(set (v4i32 V128:$Rd), + (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; + +// Per byte: 8b & 16b +def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, + "movi", ".8b", + [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>; +def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, + "movi", ".16b", + [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>; + +// AdvSIMD MVNI + +// EDIT per word & halfword: 2s, 4h, 4s, & 8h +defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">; + +def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : 
InstAlias<"mvni.4s $Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : Pat<(v2i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv2i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv4i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv4i16 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv8i16 imm0_255:$imm8, imm:$shift)>; + +// EDIT per word: 2s & 4s with MSL shifter +def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s", + [(set (v2i32 V64:$Rd), + (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; +def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", + [(set (v4i32 V128:$Rd), + (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; + +//---------------------------------------------------------------------------- +// AdvSIMD indexed element +//---------------------------------------------------------------------------- + +let neverHasSideEffects = 1 in { + defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">; + defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">; +} + +// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the +// instruction expects the addend first, while the intrinsic expects it last. + +// On the other hand, there are quite a few valid combinatorial options due to +// the commutativity of multiplication and the fact that (-x) * y = x * (-y). +defm : SIMDFPIndexedSDTiedPatterns<"FMLA", + TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; +defm : SIMDFPIndexedSDTiedPatterns<"FMLA", + TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; + +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; + +multiclass FMLSIndexedAfterNegPatterns { + // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit + // and DUP scalar. + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64duplane32 (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (v2f32 (AArch64duplane32 + (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), + VectorIndexS:$idx)))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64dup (f32 (fneg FPR32Op:$Rm))))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit + // and DUP scalar. 
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64duplane32 (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm, + VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (v4f32 (AArch64duplane32 + (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), + VectorIndexS:$idx)))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64dup (f32 (fneg FPR32Op:$Rm))))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar + // (DUPLANE from 64-bit would be trivial). + def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64duplane64 (v2f64 (fneg V128:$Rm)), + VectorIndexD:$idx))), + (FMLSv2i64_indexed + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64dup (f64 (fneg FPR64Op:$Rm))))), + (FMLSv2i64_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; + + // 2 variants for 32-bit scalar version: extract from .2s or from .4s + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, + V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v2f32 (fneg V64:$Rm)), + VectorIndexS:$idx))), + (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; + + // 1 variant for 64-bit scalar version: extract from .1d or from .2d + def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), + (vector_extract (v2f64 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn, + V128:$Rm, VectorIndexS:$idx)>; +} + +defm : FMLSIndexedAfterNegPatterns< + TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; +defm : FMLSIndexedAfterNegPatterns< + TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; + +defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>; + +def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), + (FMULv2i32_indexed V64:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), + (i64 0))>; +def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))), + (FMULv4i32_indexed V128:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), + (i64 0))>; +def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), + (FMULv2i64_indexed V128:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub), + (i64 0))>; + +defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>; +defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", + TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>; +defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", + TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>; +defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; +defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; +defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, 
node:$RHS))>>; +defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", + int_aarch64_neon_smull>; +defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", + int_aarch64_neon_sqadd>; +defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", + int_aarch64_neon_sqsub>; +defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>; +defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", + int_aarch64_neon_umull>; + +// A scalar sqdmull with the second operand being a vector lane can be +// handled directly with the indexed instruction encoding. +def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), + (vector_extract (v4i32 V128:$Vm), + VectorIndexS:$idx)), + (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>; + +//---------------------------------------------------------------------------- +// AdvSIMD scalar shift instructions +//---------------------------------------------------------------------------- +defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; +defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; +defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; +defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; +// Codegen patterns for the above. We don't put these directly on the +// instructions because TableGen's type inference can't handle the truth. +// Having the same base pattern for fp <--> int totally freaks it out. +def : Pat<(int_aarch64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm), + (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm), + (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)), + (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)), + (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn), + vecshiftR64:$imm)), + (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn), + vecshiftR64:$imm)), + (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm), + (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm), + (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), + (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), + (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn), + vecshiftR64:$imm)), + (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn), + vecshiftR64:$imm)), + (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; + +defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>; +defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">; +defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn", + int_aarch64_neon_sqrshrn>; +defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun", + int_aarch64_neon_sqrshrun>; +defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; +defm SQSHL : 
SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; +defm SQSHRN : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn", + int_aarch64_neon_sqshrn>; +defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun", + int_aarch64_neon_sqshrun>; +defm SRI : SIMDScalarRShiftDTied< 1, 0b01000, "sri">; +defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", AArch64srshri>; +defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra", + TriOpFrag<(add node:$LHS, + (AArch64srshri node:$MHS, node:$RHS))>>; +defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", AArch64vashr>; +defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra", + TriOpFrag<(add node:$LHS, + (AArch64vashr node:$MHS, node:$RHS))>>; +defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn", + int_aarch64_neon_uqrshrn>; +defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; +defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn", + int_aarch64_neon_uqshrn>; +defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", AArch64urshri>; +defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra", + TriOpFrag<(add node:$LHS, + (AArch64urshri node:$MHS, node:$RHS))>>; +defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", AArch64vlshr>; +defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", + TriOpFrag<(add node:$LHS, + (AArch64vlshr node:$MHS, node:$RHS))>>; + +//---------------------------------------------------------------------------- +// AdvSIMD vector shift instructions +//---------------------------------------------------------------------------- +defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>; +defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>; +defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", + int_aarch64_neon_vcvtfxs2fp>; +defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", + int_aarch64_neon_rshrn>; +defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>; +defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn", + BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>; +defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>; +def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftL64:$imm))), + (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; +defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", + int_aarch64_neon_sqrshrn>; +defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun", + int_aarch64_neon_sqrshrun>; +defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; +defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; +defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", + int_aarch64_neon_sqshrn>; +defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", + int_aarch64_neon_sqshrun>; +defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>; +def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftR64:$imm))), + (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; +defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>; +defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra", + TriOpFrag<(add node:$LHS, + (AArch64srshri node:$MHS, node:$RHS))> >; +defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", + BinOpFrag<(AArch64vshl (sext node:$LHS), node:$RHS)>>; + +defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>; +defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", + TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, 
node:$RHS))>>; +defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", + int_aarch64_neon_vcvtfxu2fp>; +defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", + int_aarch64_neon_uqrshrn>; +defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; +defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn", + int_aarch64_neon_uqshrn>; +defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>; +defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra", + TriOpFrag<(add node:$LHS, + (AArch64urshri node:$MHS, node:$RHS))> >; +defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll", + BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>; +defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>; +defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra", + TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >; + +// SHRN patterns for when a logical right shift was used instead of arithmetic +// (the immediate guarantees no sign bits actually end up in the result so it +// doesn't matter). +def : Pat<(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))), + (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>; +def : Pat<(v4i16 (trunc (AArch64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))), + (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>; +def : Pat<(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))), + (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>; + +def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd), + (trunc (AArch64vlshr (v8i16 V128:$Rn), + vecshiftR16Narrow:$imm)))), + (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR16Narrow:$imm)>; +def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd), + (trunc (AArch64vlshr (v4i32 V128:$Rn), + vecshiftR32Narrow:$imm)))), + (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR32Narrow:$imm)>; +def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd), + (trunc (AArch64vlshr (v2i64 V128:$Rn), + vecshiftR64Narrow:$imm)))), + (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR32Narrow:$imm)>; + +// Vector sign and zero extensions are implemented with SSHLL and USSHLL. +// Anyexts are implemented as zexts. +def : Pat<(v8i16 (sext (v8i8 V64:$Rn))), (SSHLLv8i8_shift V64:$Rn, (i32 0))>; +def : Pat<(v8i16 (zext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>; +def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>; +def : Pat<(v4i32 (sext (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>; +def : Pat<(v4i32 (zext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>; +def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>; +def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>; +def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; +def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; +// Also match an extend from the upper half of a 128 bit source register. 
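In C terms, the zero-shift SSHLL/USHLL extension patterns above, and the upper-half (sshll2/ushll2) patterns matched just below, correspond to the standard widening intrinsics; a minimal sketch with illustrative names:

    #include <arm_neon.h>

    /* Whole-vector widening: sshll/ushll with a #0 shift. */
    int16x8_t  widen_lo(int8x8_t v)   { return vmovl_s8(v);        /* sshll v0.8h, v0.8b, #0 */ }
    uint32x4_t widen_u(uint16x4_t v)  { return vmovl_u16(v);       /* ushll v0.4s, v0.4h, #0 */ }

    /* Widening the upper half of a 128-bit register: the extract_subvector
       patterns below select the "2" forms directly. */
    int32x4_t  widen_hi(int16x8_t v)  { return vmovl_high_s16(v);  /* sshll2 v0.4s, v0.8h, #0 */ }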
+def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (USHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (USHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (SSHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (USHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (USHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (SSHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (USHLLv4i32_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (USHLLv4i32_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (SSHLLv4i32_shift V128:$Rn, (i32 0))>; + +// Vector shift sxtl aliases +def : InstAlias<"sxtl.8h $dst, $src1", + (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.8h, $src1.8b", + (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl.4s $dst, $src1", + (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.4s, $src1.4h", + (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl.2d $dst, $src1", + (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.2d, $src1.2s", + (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; + +// Vector shift sxtl2 aliases +def : InstAlias<"sxtl2.8h $dst, $src1", + (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.8h, $src1.16b", + (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2.4s $dst, $src1", + (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.4s, $src1.8h", + (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2.2d $dst, $src1", + (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.2d, $src1.4s", + (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; + +// Vector shift uxtl aliases +def : InstAlias<"uxtl.8h $dst, $src1", + (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.8h, $src1.8b", + (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl.4s $dst, $src1", + (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.4s, $src1.4h", + (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl.2d $dst, $src1", + (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.2d, $src1.2s", + (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; + +// Vector shift uxtl2 aliases +def : InstAlias<"uxtl2.8h $dst, $src1", + (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.8h, $src1.16b", + (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2.4s $dst, $src1", + (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.4s, $src1.8h", + (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2.2d $dst, $src1", + (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.2d, $src1.4s", + (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; + +// If an integer is about to be converted to a floating point value, +// just load it on the floating point unit. +// These patterns are more complex because floating point loads do not +// support sign extension. 
+// The sign extension has to be explicitly added and is only supported for
+// one step: byte-to-half, half-to-word, word-to-doubleword.
+// SCVTF GPR -> FPR is 9 cycles.
+// SCVTF FPR -> FPR is 4 cycles.
+// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
+// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
+// and still be faster.
+// However, this is not good for code size.
+// 8-bits -> float. 2 sizes step-up.
+class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))),
+        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+                            (SSHLLv4i16_shift
+                              (f64
+                                (EXTRACT_SUBREG
+                                  (SSHLLv8i8_shift
+                                    (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                        INST,
+                                        bsub),
+                                    0),
+                                  dsub)),
+                              0),
+                            ssub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
+                          (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
+                          (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bits -> float. 1 size step-up.
+class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+                            (SSHLLv4i16_shift
+                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                  INST,
+                                  hsub),
+                                0),
+                            ssub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bits to 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// 64-bits integer to 32-bits floating point, not possible with
+// SCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double. 3 size step-up: give up.
+// 16-bits -> double. 2 size step.
+class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
+  : Pat <(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+           (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+                              (SSHLLv2i32_shift
+                                 (f64
+                                  (EXTRACT_SUBREG
+                                    (SSHLLv4i16_shift
+                                     (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                        INST,
+                                        hsub),
+                                     0),
+                                   dsub)),
+                               0),
+                             dsub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+// 32-bits -> double. 1 size step-up.
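Before the 32-bit variant defined next, the C-level shape these patterns are after is simply a small sign-extending load feeding a conversion; a hedged sketch:

    #include <stdint.h>

    /* With these patterns enabled (and not optimizing for size), the value can
       stay on the FP unit: roughly "ldr h0, [x0]; sshll v0.4s, v0.4h, #0;
       scvtf s0, s0" instead of "ldrsh w8, [x0]; scvtf s0, w8". */
    float  s16_to_f32(const int16_t *p) { return (float)*p; }
    double s32_to_f64(const int32_t *p) { return (double)*p; }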
+class SExtLoadi32CVTf64Pat + : Pat <(f64 (sint_to_fp (i32 (load addrmode)))), + (SCVTFv1i64 (f64 (EXTRACT_SUBREG + (SSHLLv2i32_shift + (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + INST, + ssub), + 0), + dsub)))>, Requires<[NotForCodeSize]>; + +def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext), + (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>; +def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext), + (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>; +def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>; +def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset), + (LDURSi GPR64sp:$Rn, simm9:$offset)>; + +// 64-bits -> double are handled in target specific dag combine: +// performIntToFpCombine. + + +//---------------------------------------------------------------------------- +// AdvSIMD Load-Store Structure +//---------------------------------------------------------------------------- +defm LD1 : SIMDLd1Multiple<"ld1">; +defm LD2 : SIMDLd2Multiple<"ld2">; +defm LD3 : SIMDLd3Multiple<"ld3">; +defm LD4 : SIMDLd4Multiple<"ld4">; + +defm ST1 : SIMDSt1Multiple<"st1">; +defm ST2 : SIMDSt2Multiple<"st2">; +defm ST3 : SIMDSt3Multiple<"st3">; +defm ST4 : SIMDSt4Multiple<"st4">; + +class Ld1Pat + : Pat<(ty (load GPR64sp:$Rn)), (INST GPR64sp:$Rn)>; + +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; + +class St1Pat + : Pat<(store ty:$Vt, GPR64sp:$Rn), + (INST ty:$Vt, GPR64sp:$Rn)>; + +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; + +//--- +// Single-element +//--- + +defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>; +defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>; +defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>; +defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>; +let mayLoad = 1, neverHasSideEffects = 1 in { +defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>; +defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>; +defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>; +defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>; +defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>; +defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>; +defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>; +defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>; +defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>; +defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>; +defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>; +defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>; +defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>; +defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>; +defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, GPR64pi16>; +defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>; +} + +def : Pat<(v8i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))), + (LD1Rv8b GPR64sp:$Rn)>; +def : Pat<(v16i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))), + (LD1Rv16b GPR64sp:$Rn)>; +def : Pat<(v4i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))), + (LD1Rv4h GPR64sp:$Rn)>; +def : 
Pat<(v8i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))), + (LD1Rv8h GPR64sp:$Rn)>; +def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))), + (LD1Rv2s GPR64sp:$Rn)>; +def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))), + (LD1Rv4s GPR64sp:$Rn)>; +def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))), + (LD1Rv2d GPR64sp:$Rn)>; +def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))), + (LD1Rv1d GPR64sp:$Rn)>; +// Grab the floating point version too +def : Pat<(v2f32 (AArch64dup (f32 (load GPR64sp:$Rn)))), + (LD1Rv2s GPR64sp:$Rn)>; +def : Pat<(v4f32 (AArch64dup (f32 (load GPR64sp:$Rn)))), + (LD1Rv4s GPR64sp:$Rn)>; +def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))), + (LD1Rv2d GPR64sp:$Rn)>; +def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))), + (LD1Rv1d GPR64sp:$Rn)>; + +class Ld1Lane128Pat + : Pat<(vector_insert (VTy VecListOne128:$Rd), + (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), + (LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>; + +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; + +class Ld1Lane64Pat + : Pat<(vector_insert (VTy VecListOne64:$Rd), + (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), + (EXTRACT_SUBREG + (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub), + VecIndex:$idx, GPR64sp:$Rn), + dsub)>; + +def : Ld1Lane64Pat; +def : Ld1Lane64Pat; +def : Ld1Lane64Pat; +def : Ld1Lane64Pat; + + +defm LD1 : SIMDLdSt1SingleAliases<"ld1">; +defm LD2 : SIMDLdSt2SingleAliases<"ld2">; +defm LD3 : SIMDLdSt3SingleAliases<"ld3">; +defm LD4 : SIMDLdSt4SingleAliases<"ld4">; + +// Stores +defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>; +defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>; +defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>; +defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>; + +let AddedComplexity = 15 in +class St1Lane128Pat + : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn), + (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>; + +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; + +let AddedComplexity = 15 in +class St1Lane64Pat + : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn)>; + +def : St1Lane64Pat; +def : St1Lane64Pat; +def : St1Lane64Pat; +def : St1Lane64Pat; + +multiclass St1LanePost64Pat { + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, offset), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn, XZR)>; + + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, GPR64:$Rm), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn, $Rm)>; +} + +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; + +multiclass St1LanePost128Pat { + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, offset), + (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>; + + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, GPR64:$Rm), + (ST1 VecListOne128:$Vt, 
VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
+}
+
+defm : St1LanePost128Pat;
+defm : St1LanePost128Pat;
+defm : St1LanePost128Pat;
+defm : St1LanePost128Pat;
+defm : St1LanePost128Pat;
+defm : St1LanePost128Pat;
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>;
+defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>;
+defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>;
+defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>;
+defm ST3 : SIMDStSingleB<0, 0b001, "st3", VecListThreeb, GPR64pi3>;
+defm ST3 : SIMDStSingleH<0, 0b011, 0, "st3", VecListThreeh, GPR64pi6>;
+defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
+defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
+defm ST4 : SIMDStSingleB<1, 0b001, "st4", VecListFourb, GPR64pi4>;
+defm ST4 : SIMDStSingleH<1, 0b011, 0, "st4", VecListFourh, GPR64pi8>;
+defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>;
+defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>;
+}
+
+defm ST1 : SIMDLdSt1SingleAliases<"st1">;
+defm ST2 : SIMDLdSt2SingleAliases<"st2">;
+defm ST3 : SIMDLdSt3SingleAliases<"st3">;
+defm ST4 : SIMDLdSt4SingleAliases<"st4">;
+
+//----------------------------------------------------------------------------
+// Crypto extensions
+//----------------------------------------------------------------------------
+
+def AESErr   : AESTiedInst<0b0100, "aese",   int_aarch64_crypto_aese>;
+def AESDrr   : AESTiedInst<0b0101, "aesd",   int_aarch64_crypto_aesd>;
+def AESMCrr  : AESInst<    0b0110, "aesmc",  int_aarch64_crypto_aesmc>;
+def AESIMCrr : AESInst<    0b0111, "aesimc", int_aarch64_crypto_aesimc>;
+
+def SHA1Crrr     : SHATiedInstQSV<0b000, "sha1c",   int_aarch64_crypto_sha1c>;
+def SHA1Prrr     : SHATiedInstQSV<0b001, "sha1p",   int_aarch64_crypto_sha1p>;
+def SHA1Mrrr     : SHATiedInstQSV<0b010, "sha1m",   int_aarch64_crypto_sha1m>;
+def SHA1SU0rrr   : SHATiedInstVVV<0b011, "sha1su0", int_aarch64_crypto_sha1su0>;
+def SHA256Hrrr   : SHATiedInstQQV<0b100, "sha256h", int_aarch64_crypto_sha256h>;
+def SHA256H2rrr  : SHATiedInstQQV<0b101, "sha256h2", int_aarch64_crypto_sha256h2>;
+def SHA256SU1rrr : SHATiedInstVVV<0b110, "sha256su1", int_aarch64_crypto_sha256su1>;
+
+def SHA1Hrr     : SHAInstSS<    0b0000, "sha1h",     int_aarch64_crypto_sha1h>;
+def SHA1SU1rr   : SHATiedInstVV<0b0001, "sha1su1",   int_aarch64_crypto_sha1su1>;
+def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0", int_aarch64_crypto_sha256su0>;
+
+//----------------------------------------------------------------------------
+// Compiler-pseudos
+//----------------------------------------------------------------------------
+// FIXME: Like for X86, these should go in their own separate .td file.
+
+// Any instruction that defines a 32-bit result zeroes the high half of the
+// destination register, with two exceptions: a truncate can be lowered to an
+// EXTRACT_SUBREG, and CopyFromReg may be copying from a truncate, so neither
+// of those guarantees anything about the high bits. Any other 32-bit
+// operation zero-extends up to 64 bits.
+// FIXME: X86 also checks for CMOV here. Do we need something similar?
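The def32 predicate that follows relies on the architectural rule just described: writing a W register clears the upper 32 bits of the enclosing X register, while a truncate merely reinterprets bits already in a register. A minimal C++ model of that rule (the writeW helper is hypothetical, for illustration only, not LLVM API):

    #include <cassert>
    #include <cstdint>

    // Model of an AArch64 W-register write: the 32-bit result lands in the
    // low half of the X register and the high half is cleared.
    static uint64_t writeW(uint64_t xreg, uint32_t result) {
      (void)xreg;                        // the old high half is discarded
      return static_cast<uint64_t>(result);
    }

    int main() {
      uint64_t x0 = 0xdeadbeef00000000ULL;
      // "add w0, wzr, #1" leaves x0 == 1: the high half is zeroed, which is
      // why (zext def32:$src) can be a free SUBREG_TO_REG instead of a UBFM.
      x0 = writeW(x0, 1u);
      assert(x0 == 1);
      // A truncate is different: it reinterprets the low 32 bits of an
      // existing X register and emits no instruction, so the high bits of
      // that register are unrelated to the i32 value.
      uint64_t x1 = 0xdeadbeef12345678ULL;
      uint32_t w1 = static_cast<uint32_t>(x1);
      assert(w1 == 0x12345678u);
      return 0;
    }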
+def def32 : PatLeaf<(i32 GPR32:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg;
+}]>;
-//===----------------------------------------------------------------------===//
-// Load-store register (unscaled immediate) instructions
-//===----------------------------------------------------------------------===//
-// Contains: LDURB, LDURH, LDURSB, LDURSH, LDURSW, STUR, STURB, STURH and PRFUM
-//
-// and
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
+
+// For an anyext, we don't care what the high bits are, so we can perform an
+// INSERT_SUBREG into an IMPLICIT_DEF.
+def : Pat<(i64 (anyext GPR32:$src)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
+
+// When we need to explicitly zero-extend, we use an unsigned bitfield move
+// instruction (UBFM) on the enclosing super-reg.
+def : Pat<(i64 (zext GPR32:$src)),
+          (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+
+// To sign extend, we use a signed bitfield move instruction (SBFM) on the
+// containing super-reg.
+def : Pat<(i64 (sext GPR32:$src)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i8)), (SBFMXri GPR64:$src, 0, 7)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i1)), (SBFMXri GPR64:$src, 0, 0)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i8)), (SBFMWri GPR32:$src, 0, 7)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i1)), (SBFMWri GPR32:$src, 0, 0)>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+                   (i64 (i32shift_sext_i8 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+                   (i64 (i64shift_sext_i8 imm0_63:$imm)))>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+                   (i64 (i32shift_sext_i16 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+                   (i64 (i64shift_sext_i16 imm0_63:$imm)))>;
+
+def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+                   (i64 (i64shift_a imm0_63:$imm)),
+                   (i64 (i64shift_sext_i32 imm0_63:$imm)))>;
+
+// sra patterns have an AddedComplexity of 10, so make sure we have a higher
+// AddedComplexity for the following patterns since we want to match sext + sra
+// patterns before we attempt to match a single sra node.
+let AddedComplexity = 20 in {
+// We support all sext + sra combinations which preserve at least one bit of
+// the original value which is to be sign extended, i.e. we support shifts of
+// up to bitwidth-1 bits.
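A minimal C++ model of the SBFM semantics assumed by the sign-extension patterns above and by the sra folds immediately below. Only the immr <= imms case is modeled, which is the only case these patterns use; the semantics are paraphrased from the ARMv8 ARM and the helper name is illustrative:

    #include <cassert>
    #include <cstdint>

    // SBFM Wd, Wn, #immr, #imms (immr <= imms) extracts bits [imms:immr] of
    // Wn into bit 0 of Wd and sign-extends the result.
    static int32_t sbfm32(uint32_t wn, unsigned immr, unsigned imms) {
      assert(immr <= imms && imms < 32);
      // Shift the field up to the sign bit, then arithmetic-shift it back.
      return static_cast<int32_t>(wn << (31 - imms)) >> (31 - imms + immr);
    }

    int main() {
      uint32_t w = 0x12345680u;          // low byte 0x80 == -128 as i8
      // (sext_inreg w, i8) is SBFM w, #0, #7, i.e. sxtb:
      assert(sbfm32(w, 0, 7) == -128);
      // (sra (sext_inreg w, i8), 2) folds to the single SBFM w, #2, #7:
      assert(sbfm32(w, 2, 7) == (-128 >> 2));
      // Shifting by more than 7 would discard every bit of the original i8,
      // which is why the folds below cap the shift amount (imm0_7, imm0_15).
      return 0;
    }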
+def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)), + (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>; +def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)), + (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>; + +def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)), + (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>; +def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)), + (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>; + +def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)), + (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), + (i64 imm0_31:$imm), 31)>; +} // AddedComplexity = 20 + +// To truncate, we can simply extract from a subregister. +def : Pat<(i32 (trunc GPR64sp:$src)), + (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>; + +// __builtin_trap() uses the BRK instruction on AArch64. +def : Pat<(trap), (BRK 1)>; + +// Conversions within AdvSIMD types in the same register size are free. +// But because we need a consistent lane ordering, in big endian many +// conversions require one or more REV instructions. // -//===----------------------------------------------------------------------===// -// Load-store register (register offset) instructions -//===----------------------------------------------------------------------===// -// Contains: LDRB, LDRH, LDRSB, LDRSH, LDRSW, STR, STRB, STRH and PRFM -// -// and -// -//===----------------------------------------------------------------------===// -// Load-store register (unsigned immediate) instructions -//===----------------------------------------------------------------------===// -// Contains: LDRB, LDRH, LDRSB, LDRSH, LDRSW, STR, STRB, STRH and PRFM +// Consider a simple memory load followed by a bitconvert then a store. +// v0 = load v2i32 +// v1 = BITCAST v2i32 v0 to v4i16 +// store v4i16 v2 // -// and +// In big endian mode every memory access has an implicit byte swap. LDR and +// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that +// is, they treat the vector as a sequence of elements to be byte-swapped. +// The two pairs of instructions are fundamentally incompatible. We've decided +// to use LD1/ST1 only to simplify compiler implementation. // -//===----------------------------------------------------------------------===// -// Load-store register (immediate post-indexed) instructions -//===----------------------------------------------------------------------===// -// Contains: STRB, STRH, STR, LDRB, LDRH, LDR, LDRSB, LDRSH, LDRSW +// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes +// the original code sequence: +// v0 = load v2i32 +// v1 = REV v2i32 (implicit) +// v2 = BITCAST v2i32 v1 to v4i16 +// v3 = REV v4i16 v2 (implicit) +// store v4i16 v3 // -// and +// But this is now broken - the value stored is different to the value loaded +// due to lane reordering. To fix this, on every BITCAST we must perform two +// other REVs: +// v0 = load v2i32 +// v1 = REV v2i32 (implicit) +// v2 = REV v2i32 +// v3 = BITCAST v2i32 v2 to v4i16 +// v4 = REV v4i16 +// v5 = REV v4i16 v4 (implicit) +// store v4i16 v5 // -//===----------------------------------------------------------------------===// -// Load-store register (immediate pre-indexed) instructions -//===----------------------------------------------------------------------===// -// Contains: STRB, STRH, STR, LDRB, LDRH, LDR, LDRSB, LDRSH, LDRSW - -// Note that patterns are much later on in a completely separate section (they -// need ADRPxi to be defined). 
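The REV-combining claim in the big-endian discussion above, (REV64_2s (REV64_4h X)) === (REV32_4h X), can be checked with a small C++ simulation of the three REV forms on one 64-bit register (helper names are illustrative, not compiler code):

    #include <cassert>
    #include <cstdint>

    // "rev64 Vd.4h": reverse the 16-bit elements within the 64-bit container,
    // so lanes A B C D (low first) become D C B A.
    static uint64_t rev64_4h(uint64_t v) {
      uint64_t r = 0;
      for (int i = 0; i < 4; ++i)
        r |= ((v >> (16 * i)) & 0xffff) << (16 * (3 - i));
      return r;
    }

    // "rev64 Vd.2s": reverse the 32-bit elements within the 64-bit container.
    static uint64_t rev64_2s(uint64_t v) { return (v >> 32) | (v << 32); }

    // "rev32 Vd.4h": reverse the 16-bit elements within each 32-bit word.
    static uint64_t rev32_4h(uint64_t v) {
      uint64_t lo = v & 0xffffffffu, hi = v >> 32;
      auto swap16 = [](uint64_t w) { return ((w >> 16) | (w << 16)) & 0xffffffffu; };
      return swap16(lo) | (swap16(hi) << 32);
    }

    int main() {
      uint64_t x = 0x4444333322221111ULL;   // h lanes, low first: 1111..4444
      // The two REVs inserted around a v2i32 <-> v4i16 bitcast collapse into
      // one REV32, exactly as the comment above states:
      assert(rev64_2s(rev64_4h(x)) == rev32_4h(x));
      return 0;
    }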
- -//===------------------------------- -// 1. Various operands needed -//===------------------------------- - -//===------------------------------- -// 1.1 Unsigned 12-bit immediate operands -//===------------------------------- -// The addressing mode for these instructions consists of an unsigned 12-bit -// immediate which is scaled by the size of the memory access. +// This means an extra two instructions, but actually in most cases the two REV +// instructions can be combined into one. For example: +// (REV64_2s (REV64_4h X)) === (REV32_4h X) // -// We represent this in the MC layer by two operands: -// 1. A base register. -// 2. A 12-bit immediate: not multiplied by access size, so "LDR x0,[x0,#8]" -// would have '1' in this field. -// This means that separate functions are needed for converting representations -// which *are* aware of the intended access size. - -// Anything that creates an MCInst (Decoding, selection and AsmParsing) has to -// know the access size via some means. An isolated operand does not have this -// information unless told from here, which means we need separate tablegen -// Operands for each access size. This multiclass takes care of instantiating -// the correct template functions in the rest of the backend. - -//===------------------------------- -// 1.1 Unsigned 12-bit immediate operands -//===------------------------------- - -multiclass offsets_uimm12 { - def uimm12_asmoperand : AsmOperandClass { - let Name = "OffsetUImm12_" # MemSize; - let PredicateMethod = "isOffsetUImm12<" # MemSize # ">"; - let RenderMethod = "addOffsetUImm12Operands<" # MemSize # ">"; - let DiagnosticType = "LoadStoreUImm12_" # MemSize; - } - - // Pattern is really no more than an ImmLeaf, but predicated on MemSize which - // complicates things beyond TableGen's ken. - def uimm12 : Operand, - ComplexPattern"> { - let ParserMatchClass - = !cast(prefix # uimm12_asmoperand); - - let PrintMethod = "printOffsetUImm12Operand<" # MemSize # ">"; - let EncoderMethod = "getOffsetUImm12OpValue<" # MemSize # ">"; - } -} - -defm byte_ : offsets_uimm12<1, "byte_">; -defm hword_ : offsets_uimm12<2, "hword_">; -defm word_ : offsets_uimm12<4, "word_">; -defm dword_ : offsets_uimm12<8, "dword_">; -defm qword_ : offsets_uimm12<16, "qword_">; - -//===------------------------------- -// 1.1 Signed 9-bit immediate operands -//===------------------------------- - -// The MCInst is expected to store the bit-wise encoding of the value, -// which amounts to lopping off the extended sign bits. -def SDXF_simm9 : SDNodeXFormgetTargetConstant(N->getZExtValue() & 0x1ff, MVT::i32); -}]>; - -def simm9_asmoperand : AsmOperandClass { - let Name = "SImm9"; - let PredicateMethod = "isSImm<9>"; - let RenderMethod = "addSImmOperands<9>"; - let DiagnosticType = "LoadStoreSImm9"; -} - -def simm9 : Operand, - ImmLeaf= -0x100 && Imm <= 0xff; }], - SDXF_simm9> { - let PrintMethod = "printOffsetSImm9Operand"; - let ParserMatchClass = simm9_asmoperand; -} - - -//===------------------------------- -// 1.3 Register offset extensions -//===------------------------------- - -// The assembly-syntax for these addressing-modes is: -// [, {, {}}] +// There is also no 128-bit REV instruction. This must be synthesized with an +// EXT instruction. // -// The essential semantics are: -// + is a shift: # or #0 -// + can be W or X. -// + If is W, can be UXTW or SXTW -// + If is X, can be LSL or SXTX +// Most bitconverts require some sort of conversion. 
The only exceptions are: +// a) Identity conversions - vNfX <-> vNiX +// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX // -// The trickiest of those constraints is that Rm can be either GPR32 or GPR64, -// which will need separate instructions for LLVM type-consistency. We'll also -// need separate operands, of course. -multiclass regexts { - def regext_asmoperand : AsmOperandClass { - let Name = "AddrRegExtend_" # MemSize # "_" # Rm; - let PredicateMethod = "isAddrRegExtend<" # MemSize # "," # RmSize # ">"; - let RenderMethod = "addAddrRegExtendOperands<" # MemSize # ">"; - let DiagnosticType = "LoadStoreExtend" # RmSize # "_" # MemSize; - } - - def regext : Operand { - let PrintMethod - = "printAddrRegExtendOperand<" # MemSize # ", " # RmSize # ">"; - - let DecoderMethod = "DecodeAddrRegExtendOperand"; - let ParserMatchClass - = !cast(prefix # regext_asmoperand); - } -} - -multiclass regexts_wx { - // Rm is an X-register if LSL or SXTX are specified as the shift. - defm Xm_ : regexts; - - // Rm is a W-register if UXTW or SXTW are specified as the shift. - defm Wm_ : regexts; -} - -defm byte_ : regexts_wx<1, "byte_">; -defm hword_ : regexts_wx<2, "hword_">; -defm word_ : regexts_wx<4, "word_">; -defm dword_ : regexts_wx<8, "dword_">; -defm qword_ : regexts_wx<16, "qword_">; - - -//===------------------------------ -// 2. The instructions themselves. -//===------------------------------ - -// We have the following instructions to implement: -// | | B | H | W | X | -// |-----------------+-------+-------+-------+--------| -// | unsigned str | STRB | STRH | STR | STR | -// | unsigned ldr | LDRB | LDRH | LDR | LDR | -// | signed ldr to W | LDRSB | LDRSH | - | - | -// | signed ldr to X | LDRSB | LDRSH | LDRSW | (PRFM) | - -// This will instantiate the LDR/STR instructions you'd expect to use for an -// unsigned datatype (first two rows above) or floating-point register, which is -// reasonably uniform across all access sizes. - - -//===------------------------------ -// 2.1 Regular instructions -//===------------------------------ - -// This class covers the basic unsigned or irrelevantly-signed loads and stores, -// to general-purpose and floating-point registers. - -class AddrParams { - Operand uimm12 = !cast(prefix # "_uimm12"); - - Operand regextWm = !cast(prefix # "_Wm_regext"); - Operand regextXm = !cast(prefix # "_Xm_regext"); -} - -def byte_addrparams : AddrParams<"byte">; -def hword_addrparams : AddrParams<"hword">; -def word_addrparams : AddrParams<"word">; -def dword_addrparams : AddrParams<"dword">; -def qword_addrparams : AddrParams<"qword">; - -multiclass A64I_LDRSTR_unsigned size, bit v, - bit high_opc, string asmsuffix, - RegisterClass GPR, AddrParams params> { - // Unsigned immediate - def _STR : A64I_LSunsigimm, - Sched<[WriteSt, ReadSt, ReadSt]> { - let mayStore = 1; - } - def : InstAlias<"str" # asmsuffix # " $Rt, [$Rn]", - (!cast(prefix # "_STR") GPR:$Rt, GPR64xsp:$Rn, 0)>; - - def _LDR : A64I_LSunsigimm, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; - } - def : InstAlias<"ldr" # asmsuffix # " $Rt, [$Rn]", - (!cast(prefix # "_LDR") GPR:$Rt, GPR64xsp:$Rn, 0)>; - - // Register offset (four of these: load/store and Wm/Xm). 
- let mayLoad = 1 in { - def _Wm_RegOffset_LDR : A64I_LSregoff, - Sched<[WriteLd, ReadLd, ReadLd]>; - - def _Xm_RegOffset_LDR : A64I_LSregoff, - Sched<[WriteLd, ReadLd, ReadLd]>; - } - def : InstAlias<"ldr" # asmsuffix # " $Rt, [$Rn, $Rm]", - (!cast(prefix # "_Xm_RegOffset_LDR") GPR:$Rt, GPR64xsp:$Rn, - GPR64:$Rm, 2)>; - - let mayStore = 1 in { - def _Wm_RegOffset_STR : A64I_LSregoff, - Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>; - - def _Xm_RegOffset_STR : A64I_LSregoff, - Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>; - } - def : InstAlias<"str" # asmsuffix # " $Rt, [$Rn, $Rm]", - (!cast(prefix # "_Xm_RegOffset_STR") GPR:$Rt, GPR64xsp:$Rn, - GPR64:$Rm, 2)>; - - // Unaligned immediate - def _STUR : A64I_LSunalimm, - Sched<[WriteSt, ReadSt, ReadSt]> { - let mayStore = 1; - } - def : InstAlias<"stur" # asmsuffix # " $Rt, [$Rn]", - (!cast(prefix # "_STUR") GPR:$Rt, GPR64xsp:$Rn, 0)>; - - def _LDUR : A64I_LSunalimm, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; - } - def : InstAlias<"ldur" # asmsuffix # " $Rt, [$Rn]", - (!cast(prefix # "_LDUR") GPR:$Rt, GPR64xsp:$Rn, 0)>; - - // Post-indexed - def _PostInd_STR : A64I_LSpostind, - Sched<[WriteSt, ReadSt, ReadSt]> { - let Constraints = "$Rn = $Rn_wb"; - let mayStore = 1; - - // Decoder only needed for unpredictability checking (FIXME). - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - - def _PostInd_LDR : A64I_LSpostind, - Sched<[WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - - // Pre-indexed - def _PreInd_STR : A64I_LSpreind, - Sched<[WriteSt, ReadSt, ReadSt]> { - let Constraints = "$Rn = $Rn_wb"; - let mayStore = 1; - - // Decoder only needed for unpredictability checking (FIXME). - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - - def _PreInd_LDR : A64I_LSpreind, - Sched<[WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - -} - -// STRB/LDRB: First define the instructions -defm LS8 - : A64I_LDRSTR_unsigned<"LS8", 0b00, 0b0, 0b0, "b", GPR32, byte_addrparams>; - -// STRH/LDRH -defm LS16 - : A64I_LDRSTR_unsigned<"LS16", 0b01, 0b0, 0b0, "h", GPR32, hword_addrparams>; - - -// STR/LDR to/from a W register -defm LS32 - : A64I_LDRSTR_unsigned<"LS32", 0b10, 0b0, 0b0, "", GPR32, word_addrparams>; - -// STR/LDR to/from an X register -defm LS64 - : A64I_LDRSTR_unsigned<"LS64", 0b11, 0b0, 0b0, "", GPR64, dword_addrparams>; - -let Predicates = [HasFPARMv8] in { -// STR/LDR to/from a B register -defm LSFP8 - : A64I_LDRSTR_unsigned<"LSFP8", 0b00, 0b1, 0b0, "", FPR8, byte_addrparams>; - -// STR/LDR to/from an H register -defm LSFP16 - : A64I_LDRSTR_unsigned<"LSFP16", 0b01, 0b1, 0b0, "", FPR16, hword_addrparams>; - -// STR/LDR to/from an S register -defm LSFP32 - : A64I_LDRSTR_unsigned<"LSFP32", 0b10, 0b1, 0b0, "", FPR32, word_addrparams>; -// STR/LDR to/from a D register -defm LSFP64 - : A64I_LDRSTR_unsigned<"LSFP64", 0b11, 0b1, 0b0, "", FPR64, dword_addrparams>; -// STR/LDR to/from a Q register -defm LSFP128 - : A64I_LDRSTR_unsigned<"LSFP128", 0b00, 0b1, 0b1, "", FPR128, - qword_addrparams>; -} - -//===------------------------------ -// 2.3 Signed loads -//===------------------------------ - -// Byte and half-word signed loads can both go into either an X or a W register, -// so it's worth factoring out. Signed word loads don't fit because there is no -// W version. 
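The removed operand classes in this hunk encode load/store offsets three different ways: a 12-bit unsigned immediate scaled by the access size (so "LDR x0, [x0, #8]" carries 1 in the field for an 8-byte access), an unscaled signed 9-bit immediate whose sign bits SDXF_simm9 masks off, and, for the pair instructions further down, a scaled signed 7-bit immediate. A C++ sketch of those encodings under the stated assumptions (function names are illustrative, not the backend's actual helpers):

    #include <cassert>
    #include <cstdint>
    #include <optional>

    static std::optional<uint32_t> encodeUImm12(int64_t byteOff, unsigned size) {
      if (byteOff < 0 || byteOff % size != 0) return std::nullopt;
      int64_t scaled = byteOff / size;        // "LDR x0, [x0, #8]" stores 1
      if (scaled > 4095) return std::nullopt; // 12-bit field
      return static_cast<uint32_t>(scaled);
    }

    static std::optional<uint32_t> encodeSImm9(int64_t byteOff) {
      if (byteOff < -256 || byteOff > 255) return std::nullopt;
      return static_cast<uint32_t>(byteOff) & 0x1ff; // lop off the sign bits
    }

    static std::optional<uint32_t> encodeSImm7(int64_t byteOff, unsigned size) {
      if (byteOff % size != 0) return std::nullopt;
      int64_t scaled = byteOff / size;
      if (scaled < -64 || scaled > 63) return std::nullopt;
      return static_cast<uint32_t>(scaled) & 0x7f;
    }

    int main() {
      assert(*encodeUImm12(8, 8) == 1);    // dword access: #8 encodes as 1
      assert(!encodeUImm12(4, 8));         // unaligned: LDUR territory
      assert(*encodeSImm9(-256) == 0x100); // matches the SDXF_simm9 masking
      assert(*encodeSImm7(-32, 4) == 0x78);// word pair: -8 scaled, 7 bits
      return 0;
    }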
-multiclass A64I_LDR_signed size, string asmopcode, AddrParams params, - string prefix> { - // Unsigned offset - def w : A64I_LSunsigimm, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; - } - def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn]", - (!cast(prefix # w) GPR32:$Rt, GPR64xsp:$Rn, 0)>; - - def x : A64I_LSunsigimm, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; - } - def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn]", - (!cast(prefix # x) GPR64:$Rt, GPR64xsp:$Rn, 0)>; - - // Register offset - let mayLoad = 1 in { - def w_Wm_RegOffset : A64I_LSregoff, - Sched<[WriteLd, ReadLd, ReadLd]>; - - def w_Xm_RegOffset : A64I_LSregoff, - Sched<[WriteLd, ReadLd, ReadLd]>; - - def x_Wm_RegOffset : A64I_LSregoff, - Sched<[WriteLd, ReadLd, ReadLd]>; - - def x_Xm_RegOffset : A64I_LSregoff, - Sched<[WriteLd, ReadLd, ReadLd]>; - } - def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn, $Rm]", - (!cast(prefix # "w_Xm_RegOffset") GPR32:$Rt, GPR64xsp:$Rn, - GPR64:$Rm, 2)>; - - def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn, $Rm]", - (!cast(prefix # "x_Xm_RegOffset") GPR64:$Rt, GPR64xsp:$Rn, - GPR64:$Rm, 2)>; - - - let mayLoad = 1 in { - // Unaligned offset - def w_U : A64I_LSunalimm, - Sched<[WriteLd, ReadLd]>; - - def x_U : A64I_LSunalimm, - Sched<[WriteLd, ReadLd]>; - - - // Post-indexed - def w_PostInd : A64I_LSpostind, - Sched<[WriteLd, WriteLd, ReadLd]> { - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - - def x_PostInd : A64I_LSpostind, - Sched<[WriteLd, WriteLd, ReadLd]> { - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - - // Pre-indexed - def w_PreInd : A64I_LSpreind, - Sched<[WriteLd, WriteLd, ReadLd]> { - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - - def x_PreInd : A64I_LSpreind, - Sched<[WriteLd, WriteLd, ReadLd]> { - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; - } - } // let mayLoad = 1 -} - -// LDRSB -defm LDRSB : A64I_LDR_signed<0b00, "b", byte_addrparams, "LDRSB">; -// LDRSH -defm LDRSH : A64I_LDR_signed<0b01, "h", hword_addrparams, "LDRSH">; - -// LDRSW: load a 32-bit register, sign-extending to 64-bits. 
-def LDRSWx - : A64I_LSunsigimm<0b10, 0b0, 0b10, - (outs GPR64:$Rt), - (ins GPR64xsp:$Rn, word_uimm12:$UImm12), - "ldrsw\t$Rt, [$Rn, $UImm12]", - [], NoItinerary>, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; -} -def : InstAlias<"ldrsw $Rt, [$Rn]", (LDRSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>; - -let mayLoad = 1 in { - def LDRSWx_Wm_RegOffset : A64I_LSregoff<0b10, 0b0, 0b10, 0b0, - (outs GPR64:$Rt), - (ins GPR64xsp:$Rn, GPR32:$Rm, word_Wm_regext:$Ext), - "ldrsw\t$Rt, [$Rn, $Rm, $Ext]", - [], NoItinerary>, - Sched<[WriteLd, ReadLd, ReadLd]>; - - def LDRSWx_Xm_RegOffset : A64I_LSregoff<0b10, 0b0, 0b10, 0b1, - (outs GPR64:$Rt), - (ins GPR64xsp:$Rn, GPR64:$Rm, word_Xm_regext:$Ext), - "ldrsw\t$Rt, [$Rn, $Rm, $Ext]", - [], NoItinerary>, - Sched<[WriteLd, ReadLd, ReadLd]>; -} -def : InstAlias<"ldrsw $Rt, [$Rn, $Rm]", - (LDRSWx_Xm_RegOffset GPR64:$Rt, GPR64xsp:$Rn, GPR64:$Rm, 2)>; - - -def LDURSWx - : A64I_LSunalimm<0b10, 0b0, 0b10, - (outs GPR64:$Rt), - (ins GPR64xsp:$Rn, simm9:$SImm9), - "ldursw\t$Rt, [$Rn, $SImm9]", - [], NoItinerary>, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; -} -def : InstAlias<"ldursw $Rt, [$Rn]", (LDURSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>; - -def LDRSWx_PostInd - : A64I_LSpostind<0b10, 0b0, 0b10, - (outs GPR64:$Rt, GPR64xsp:$Rn_wb), - (ins GPR64xsp:$Rn, simm9:$SImm9), - "ldrsw\t$Rt, [$Rn], $SImm9", - [], NoItinerary>, - Sched<[WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; -} - -def LDRSWx_PreInd : A64I_LSpreind<0b10, 0b0, 0b10, - (outs GPR64:$Rt, GPR64xsp:$Rn_wb), - (ins GPR64xsp:$Rn, simm9:$SImm9), - "ldrsw\t$Rt, [$Rn, $SImm9]!", - [], NoItinerary>, - Sched<[WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeSingleIndexedInstruction"; -} - -//===------------------------------ -// 2.4 Prefetch operations -//===------------------------------ - -def PRFM : A64I_LSunsigimm<0b11, 0b0, 0b10, (outs), - (ins prefetch_op:$Rt, GPR64xsp:$Rn, dword_uimm12:$UImm12), - "prfm\t$Rt, [$Rn, $UImm12]", - [], NoItinerary>, - Sched<[WritePreLd, ReadPreLd]> { - let mayLoad = 1; -} -def : InstAlias<"prfm $Rt, [$Rn]", - (PRFM prefetch_op:$Rt, GPR64xsp:$Rn, 0)>; - -let mayLoad = 1 in { - def PRFM_Wm_RegOffset : A64I_LSregoff<0b11, 0b0, 0b10, 0b0, (outs), - (ins prefetch_op:$Rt, GPR64xsp:$Rn, - GPR32:$Rm, dword_Wm_regext:$Ext), - "prfm\t$Rt, [$Rn, $Rm, $Ext]", - [], NoItinerary>, - Sched<[WritePreLd, ReadPreLd]>; - def PRFM_Xm_RegOffset : A64I_LSregoff<0b11, 0b0, 0b10, 0b1, (outs), - (ins prefetch_op:$Rt, GPR64xsp:$Rn, - GPR64:$Rm, dword_Xm_regext:$Ext), - "prfm\t$Rt, [$Rn, $Rm, $Ext]", - [], NoItinerary>, - Sched<[WritePreLd, ReadPreLd]>; -} - -def : InstAlias<"prfm $Rt, [$Rn, $Rm]", - (PRFM_Xm_RegOffset prefetch_op:$Rt, GPR64xsp:$Rn, - GPR64:$Rm, 2)>; - - -def PRFUM : A64I_LSunalimm<0b11, 0b0, 0b10, (outs), - (ins prefetch_op:$Rt, GPR64xsp:$Rn, simm9:$SImm9), - "prfum\t$Rt, [$Rn, $SImm9]", - [], NoItinerary>, - Sched<[WritePreLd, ReadPreLd]> { - let mayLoad = 1; -} -def : InstAlias<"prfum $Rt, [$Rn]", - (PRFUM prefetch_op:$Rt, GPR64xsp:$Rn, 0)>; - -//===----------------------------------------------------------------------===// -// Load-store register (unprivileged) instructions -//===----------------------------------------------------------------------===// -// Contains: LDTRB, LDTRH, LDTRSB, LDTRSH, LDTRSW, STTR, STTRB and STTRH - -// These instructions very much mirror the "unscaled immediate" loads, but since -// there are no floating-point 
variants we need to split them out into their own -// section to avoid instantiation of "ldtr d0, [sp]" etc. - -multiclass A64I_LDTRSTTR size, string asmsuffix, RegisterClass GPR, - string prefix> { - def _UnPriv_STR : A64I_LSunpriv, - Sched<[WriteLd, ReadLd]> { - let mayStore = 1; - } - - def : InstAlias<"sttr" # asmsuffix # " $Rt, [$Rn]", - (!cast(prefix # "_UnPriv_STR") GPR:$Rt, GPR64xsp:$Rn, 0)>; - - def _UnPriv_LDR : A64I_LSunpriv, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; - } - - def : InstAlias<"ldtr" # asmsuffix # " $Rt, [$Rn]", - (!cast(prefix # "_UnPriv_LDR") GPR:$Rt, GPR64xsp:$Rn, 0)>; - -} - -// STTRB/LDTRB: First define the instructions -defm LS8 : A64I_LDTRSTTR<0b00, "b", GPR32, "LS8">; - -// STTRH/LDTRH -defm LS16 : A64I_LDTRSTTR<0b01, "h", GPR32, "LS16">; - -// STTR/LDTR to/from a W register -defm LS32 : A64I_LDTRSTTR<0b10, "", GPR32, "LS32">; - -// STTR/LDTR to/from an X register -defm LS64 : A64I_LDTRSTTR<0b11, "", GPR64, "LS64">; - -// Now a class for the signed instructions that can go to either 32 or 64 -// bits... -multiclass A64I_LDTR_signed size, string asmopcode, string prefix> { - let mayLoad = 1 in { - def w : A64I_LSunpriv, - Sched<[WriteLd, ReadLd]>; - - def x : A64I_LSunpriv, - Sched<[WriteLd, ReadLd]>; - } - - def : InstAlias<"ldtrs" # asmopcode # " $Rt, [$Rn]", - (!cast(prefix # "w") GPR32:$Rt, GPR64xsp:$Rn, 0)>; - - def : InstAlias<"ldtrs" # asmopcode # " $Rt, [$Rn]", - (!cast(prefix # "x") GPR64:$Rt, GPR64xsp:$Rn, 0)>; - -} - -// LDTRSB -defm LDTRSB : A64I_LDTR_signed<0b00, "b", "LDTRSB">; -// LDTRSH -defm LDTRSH : A64I_LDTR_signed<0b01, "h", "LDTRSH">; - -// And finally LDTRSW which only goes to 64 bits. -def LDTRSWx : A64I_LSunpriv<0b10, 0b0, 0b10, - (outs GPR64:$Rt), - (ins GPR64xsp:$Rn, simm9:$SImm9), - "ldtrsw\t$Rt, [$Rn, $SImm9]", - [], NoItinerary>, - Sched<[WriteLd, ReadLd]> { - let mayLoad = 1; -} -def : InstAlias<"ldtrsw $Rt, [$Rn]", (LDTRSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>; - -//===----------------------------------------------------------------------===// -// Load-store register pair (offset) instructions -//===----------------------------------------------------------------------===// -// -// and -// -//===----------------------------------------------------------------------===// -// Load-store register pair (post-indexed) instructions -//===----------------------------------------------------------------------===// -// Contains: STP, LDP, LDPSW -// -// and -// -//===----------------------------------------------------------------------===// -// Load-store register pair (pre-indexed) instructions -//===----------------------------------------------------------------------===// -// Contains: STP, LDP, LDPSW -// -// and -// -//===----------------------------------------------------------------------===// -// Load-store non-temporal register pair (offset) instructions -//===----------------------------------------------------------------------===// -// Contains: STNP, LDNP - - -// Anything that creates an MCInst (Decoding, selection and AsmParsing) has to -// know the access size via some means. An isolated operand does not have this -// information unless told from here, which means we need separate tablegen -// Operands for each access size. This multiclass takes care of instantiating -// the correct template functions in the rest of the backend. 
- -multiclass offsets_simm7 { - // The bare signed 7-bit immediate is used in post-indexed instructions, but - // because of the scaling performed a generic "simm7" operand isn't - // appropriate here either. - def simm7_asmoperand : AsmOperandClass { - let Name = "SImm7_Scaled" # MemSize; - let PredicateMethod = "isSImm7Scaled<" # MemSize # ">"; - let RenderMethod = "addSImm7ScaledOperands<" # MemSize # ">"; - let DiagnosticType = "LoadStoreSImm7_" # MemSize; - } - - def simm7 : Operand { - let PrintMethod = "printSImm7ScaledOperand<" # MemSize # ">"; - let ParserMatchClass = !cast(prefix # "simm7_asmoperand"); - } -} - -defm word_ : offsets_simm7<"4", "word_">; -defm dword_ : offsets_simm7<"8", "dword_">; -defm qword_ : offsets_simm7<"16", "qword_">; - -multiclass A64I_LSPsimple opc, bit v, RegisterClass SomeReg, - Operand simm7, string prefix> { - def _STR : A64I_LSPoffset, - Sched<[WriteLd, ReadLd]> { - let mayStore = 1; - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - def : InstAlias<"stp $Rt, $Rt2, [$Rn]", - (!cast(prefix # "_STR") SomeReg:$Rt, - SomeReg:$Rt2, GPR64xsp:$Rn, 0)>; - - def _LDR : A64I_LSPoffset, - Sched<[WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - def : InstAlias<"ldp $Rt, $Rt2, [$Rn]", - (!cast(prefix # "_LDR") SomeReg:$Rt, - SomeReg:$Rt2, GPR64xsp:$Rn, 0)>; - - def _PostInd_STR : A64I_LSPpostind, - Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> { - let mayStore = 1; - let Constraints = "$Rn = $Rn_wb"; - - // Decoder only needed for unpredictability checking (FIXME). - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - - def _PostInd_LDR : A64I_LSPpostind, - Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - - def _PreInd_STR : A64I_LSPpreind, - Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> { - let mayStore = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - - def _PreInd_LDR : A64I_LSPpreind, - Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - - def _NonTemp_STR : A64I_LSPnontemp, - Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> { - let mayStore = 1; - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - def : InstAlias<"stnp $Rt, $Rt2, [$Rn]", - (!cast(prefix # "_NonTemp_STR") SomeReg:$Rt, - SomeReg:$Rt2, GPR64xsp:$Rn, 0)>; - - def _NonTemp_LDR : A64I_LSPnontemp, - Sched<[WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let DecoderMethod = "DecodeLDSTPairInstruction"; - } - def : InstAlias<"ldnp $Rt, $Rt2, [$Rn]", - (!cast(prefix # "_NonTemp_LDR") SomeReg:$Rt, - SomeReg:$Rt2, GPR64xsp:$Rn, 0)>; - -} - - -defm LSPair32 : A64I_LSPsimple<0b00, 0b0, GPR32, word_simm7, "LSPair32">; -defm LSPair64 : A64I_LSPsimple<0b10, 0b0, GPR64, dword_simm7, "LSPair64">; - -let Predicates = [HasFPARMv8] in { -defm LSFPPair32 : A64I_LSPsimple<0b00, 0b1, FPR32, word_simm7, "LSFPPair32">; -defm LSFPPair64 : A64I_LSPsimple<0b01, 0b1, FPR64, dword_simm7, "LSFPPair64">; -defm LSFPPair128 : A64I_LSPsimple<0b10, 0b1, FPR128, qword_simm7, - "LSFPPair128">; -} - - -def LDPSWx : A64I_LSPoffset<0b01, 0b0, 0b1, - (outs GPR64:$Rt, GPR64:$Rt2), - (ins GPR64xsp:$Rn, word_simm7:$SImm7), - "ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>, - Sched<[WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let DecoderMethod = "DecodeLDSTPairInstruction"; -} -def : InstAlias<"ldpsw $Rt, $Rt2, [$Rn]", 
- (LDPSWx GPR64:$Rt, GPR64:$Rt2, GPR64xsp:$Rn, 0)>; - -def LDPSWx_PostInd : A64I_LSPpostind<0b01, 0b0, 0b1, - (outs GPR64:$Rt, GPR64:$Rt2, GPR64:$Rn_wb), - (ins GPR64xsp:$Rn, word_simm7:$SImm7), - "ldpsw\t$Rt, $Rt2, [$Rn], $SImm7", - [], NoItinerary>, - Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeLDSTPairInstruction"; -} - -def LDPSWx_PreInd : A64I_LSPpreind<0b01, 0b0, 0b1, - (outs GPR64:$Rt, GPR64:$Rt2, GPR64:$Rn_wb), - (ins GPR64xsp:$Rn, word_simm7:$SImm7), - "ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]!", - [], NoItinerary>, - Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> { - let mayLoad = 1; - let Constraints = "$Rn = $Rn_wb"; - let DecoderMethod = "DecodeLDSTPairInstruction"; -} - -//===----------------------------------------------------------------------===// -// Logical (immediate) instructions -//===----------------------------------------------------------------------===// -// Contains: AND, ORR, EOR, ANDS, + aliases TST, MOV - -multiclass logical_imm_operands { - def _asmoperand : AsmOperandClass { - let Name = "LogicalImm" # note # size; - let PredicateMethod = "isLogicalImm" # note # "<" # size # ">"; - let RenderMethod = "addLogicalImmOperands<" # size # ">"; - let DiagnosticType = "LogicalSecondSource"; - } - - def _operand - : Operand, ComplexPattern { - let ParserMatchClass = !cast(prefix # "_asmoperand"); - let PrintMethod = "printLogicalImmOperand<" # size # ">"; - let DecoderMethod = "DecodeLogicalImmOperand<" # size # ">"; - } -} - -defm logical_imm32 : logical_imm_operands<"logical_imm32", "", 32, i32>; -defm logical_imm64 : logical_imm_operands<"logical_imm64", "", 64, i64>; - -// The mov versions only differ in assembly parsing, where they -// exclude values representable with either MOVZ or MOVN. 
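A sketch of the MOVZ/MOVN screen described above, for 32-bit immediates. The predicate names echo the PredicateMethod strings in the operand classes, but this implementation is illustrative only:

    #include <cassert>
    #include <cstdint>

    // Representable by MOVZ iff at most one 16-bit chunk, at an aligned
    // position, is nonzero.
    static bool isMOVZImm32(uint32_t v) {
      return (v & 0xffff0000u) == 0 || (v & 0x0000ffffu) == 0;
    }

    // MOVN writes the bitwise NOT of a (possibly shifted) 16-bit chunk.
    static bool isMOVNImm32(uint32_t v) { return isMOVZImm32(~v); }

    int main() {
      assert(isMOVZImm32(0x12340000u));   // movz w0, #0x1234, lsl #16
      assert(isMOVNImm32(0xffff5678u));   // movn w0, #0xa987
      // 0xffffffff is also a valid logical immediate; since MOVN already
      // covers it, the MOV alias for ORR (immediate) must reject it, which
      // is exactly what the *_mov operand variants above arrange.
      assert(isMOVNImm32(0xffffffffu));
      return 0;
    }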
-defm logical_imm32_mov - : logical_imm_operands<"logical_imm32_mov", "MOV", 32, i32>; -defm logical_imm64_mov - : logical_imm_operands<"logical_imm64_mov", "MOV", 64, i64>; - - -multiclass A64I_logimmSizes opc, string asmop, SDNode opnode> { - def wwi : A64I_logicalimm<0b0, opc, (outs GPR32wsp:$Rd), - (ins GPR32:$Rn, logical_imm32_operand:$Imm), - !strconcat(asmop, "\t$Rd, $Rn, $Imm"), - [(set i32:$Rd, - (opnode i32:$Rn, logical_imm32_operand:$Imm))], - NoItinerary>, - Sched<[WriteALU, ReadALU]>; - - def xxi : A64I_logicalimm<0b1, opc, (outs GPR64xsp:$Rd), - (ins GPR64:$Rn, logical_imm64_operand:$Imm), - !strconcat(asmop, "\t$Rd, $Rn, $Imm"), - [(set i64:$Rd, - (opnode i64:$Rn, logical_imm64_operand:$Imm))], - NoItinerary>, - Sched<[WriteALU, ReadALU]>; -} - -defm AND : A64I_logimmSizes<0b00, "and", and>; -defm ORR : A64I_logimmSizes<0b01, "orr", or>; -defm EOR : A64I_logimmSizes<0b10, "eor", xor>; - -let Defs = [NZCV] in { - def ANDSwwi : A64I_logicalimm<0b0, 0b11, (outs GPR32:$Rd), - (ins GPR32:$Rn, logical_imm32_operand:$Imm), - "ands\t$Rd, $Rn, $Imm", - [], NoItinerary>, - Sched<[WriteALU, ReadALU]>; - - def ANDSxxi : A64I_logicalimm<0b1, 0b11, (outs GPR64:$Rd), - (ins GPR64:$Rn, logical_imm64_operand:$Imm), - "ands\t$Rd, $Rn, $Imm", - [], NoItinerary>, - Sched<[WriteALU, ReadALU]>; -} - - -def : InstAlias<"tst $Rn, $Imm", - (ANDSwwi WZR, GPR32:$Rn, logical_imm32_operand:$Imm)>; -def : InstAlias<"tst $Rn, $Imm", - (ANDSxxi XZR, GPR64:$Rn, logical_imm64_operand:$Imm)>; -def : InstAlias<"mov $Rd, $Imm", - (ORRwwi GPR32wsp:$Rd, WZR, logical_imm32_mov_operand:$Imm)>; -def : InstAlias<"mov $Rd, $Imm", - (ORRxxi GPR64xsp:$Rd, XZR, logical_imm64_mov_operand:$Imm)>; - -//===----------------------------------------------------------------------===// -// Logical (shifted register) instructions -//===----------------------------------------------------------------------===// -// Contains: AND, BIC, ORR, ORN, EOR, EON, ANDS, BICS + aliases TST, MVN, MOV - -// Operand for optimizing (icmp (and LHS, RHS), 0, SomeCode). In theory "ANDS" -// behaves differently for unsigned comparisons, so we defensively only allow -// signed or n/a as the operand. In practice "unsigned greater than 0" is "not -// equal to 0" and LLVM gives us this. -def signed_cond : PatLeaf<(cond), [{ - return !isUnsignedIntSetCC(N->get()); -}]>; - - -// These instructions share their "shift" operands with add/sub (shifted -// register instructions). They are defined there. - -// N.b. the commutable parameter is just !N. It will be first against the wall -// when the revolution comes. 
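A small C++ model of why the signed_cond restriction above is safe: ANDS sets N and Z from the bitwise result and clears C and V, so only comparisons against zero that need no carry information can consume its flags, and the one unsigned predicate LLVM might produce, "ugt 0", is equivalent to "ne 0". The struct layout below is illustrative:

    #include <cassert>
    #include <cstdint>

    struct NZCV { bool n, z, c, v; };

    // Flag-setting AND: N and Z from the result, C and V cleared.
    static NZCV ands32(uint32_t a, uint32_t b) {
      uint32_t r = a & b;
      return {static_cast<int32_t>(r) < 0, r == 0, false, false};
    }

    int main() {
      NZCV f = ands32(0x80000000u, 0xffffffffu);
      assert(f.n && !f.z);   // signed "lt 0" is decidable from N alone
      // "unsigned greater than 0" is "not equal to 0", which needs only Z:
      for (uint32_t x : {0u, 1u, 0x80000000u})
        assert((x > 0u) == (x != 0u));
      return 0;
    }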
-multiclass logical_shifts opc, - bit N, bit commutable, - string asmop, SDPatternOperator opfrag, ValueType ty, - RegisterClass GPR, list defs> { - let isCommutable = commutable, Defs = defs in { - def _lsl : A64I_logicalshift("lsl_operand_" # ty):$Imm6), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), - [(set ty:$Rd, (opfrag ty:$Rn, (shl ty:$Rm, - !cast("lsl_operand_" # ty):$Imm6)) - )], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def _lsr : A64I_logicalshift("lsr_operand_" # ty):$Imm6), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), - [(set ty:$Rd, (opfrag ty:$Rn, (srl ty:$Rm, - !cast("lsr_operand_" # ty):$Imm6)) - )], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def _asr : A64I_logicalshift("asr_operand_" # ty):$Imm6), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), - [(set ty:$Rd, (opfrag ty:$Rn, (sra ty:$Rm, - !cast("asr_operand_" # ty):$Imm6)) - )], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def _ror : A64I_logicalshift("ror_operand_" # ty):$Imm6), - !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), - [(set ty:$Rd, (opfrag ty:$Rn, (rotr ty:$Rm, - !cast("ror_operand_" # ty):$Imm6)) - )], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - } - - def _noshift - : InstAlias(prefix # "_lsl") GPR:$Rd, GPR:$Rn, - GPR:$Rm, 0)>; - - def : Pat<(opfrag ty:$Rn, ty:$Rm), - (!cast(prefix # "_lsl") $Rn, $Rm, 0)>; -} - -multiclass logical_sizes opc, bit N, bit commutable, - string asmop, SDPatternOperator opfrag, - list defs> { - defm xxx : logical_shifts; - defm www : logical_shifts; -} - - -defm AND : logical_sizes<"AND", 0b00, 0b0, 0b1, "and", and, []>; -defm ORR : logical_sizes<"ORR", 0b01, 0b0, 0b1, "orr", or, []>; -defm EOR : logical_sizes<"EOR", 0b10, 0b0, 0b1, "eor", xor, []>; -defm ANDS : logical_sizes<"ANDS", 0b11, 0b0, 0b1, "ands", - PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), - [{ (void)N; return false; }]>, - [NZCV]>; - -defm BIC : logical_sizes<"BIC", 0b00, 0b1, 0b0, "bic", - PatFrag<(ops node:$lhs, node:$rhs), - (and node:$lhs, (not node:$rhs))>, []>; -defm ORN : logical_sizes<"ORN", 0b01, 0b1, 0b0, "orn", - PatFrag<(ops node:$lhs, node:$rhs), - (or node:$lhs, (not node:$rhs))>, []>; -defm EON : logical_sizes<"EON", 0b10, 0b1, 0b0, "eon", - PatFrag<(ops node:$lhs, node:$rhs), - (xor node:$lhs, (not node:$rhs))>, []>; -defm BICS : logical_sizes<"BICS", 0b11, 0b1, 0b0, "bics", - PatFrag<(ops node:$lhs, node:$rhs), - (and node:$lhs, (not node:$rhs)), - [{ (void)N; return false; }]>, - [NZCV]>; - -multiclass tst_shifts { - let isCommutable = 1, Rd = 0b11111, Defs = [NZCV] in { - def _lsl : A64I_logicalshift("lsl_operand_" # ty):$Imm6), - "tst\t$Rn, $Rm, $Imm6", - [(set NZCV, (A64setcc (and ty:$Rn, (shl ty:$Rm, - !cast("lsl_operand_" # ty):$Imm6)), - 0, signed_cond))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - - def _lsr : A64I_logicalshift("lsr_operand_" # ty):$Imm6), - "tst\t$Rn, $Rm, $Imm6", - [(set NZCV, (A64setcc (and ty:$Rn, (srl ty:$Rm, - !cast("lsr_operand_" # ty):$Imm6)), - 0, signed_cond))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def _asr : A64I_logicalshift("asr_operand_" # ty):$Imm6), - "tst\t$Rn, $Rm, $Imm6", - [(set NZCV, (A64setcc (and ty:$Rn, (sra ty:$Rm, - !cast("asr_operand_" # ty):$Imm6)), - 0, signed_cond))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def _ror : A64I_logicalshift("ror_operand_" # ty):$Imm6), - "tst\t$Rn, $Rm, $Imm6", - [(set NZCV, (A64setcc (and ty:$Rn, (rotr ty:$Rm, - !cast("ror_operand_" # ty):$Imm6)), - 0, signed_cond))], - 
NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - } - - def _noshift : InstAlias<"tst $Rn, $Rm", - (!cast(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>; - - def : Pat<(A64setcc (and ty:$Rn, ty:$Rm), 0, signed_cond), - (!cast(prefix # "_lsl") $Rn, $Rm, 0)>; -} - -defm TSTxx : tst_shifts<"TSTxx", 0b1, i64, GPR64>; -defm TSTww : tst_shifts<"TSTww", 0b0, i32, GPR32>; - - -multiclass mvn_shifts { - let isCommutable = 0, Rn = 0b11111 in { - def _lsl : A64I_logicalshift("lsl_operand_" # ty):$Imm6), - "mvn\t$Rd, $Rm, $Imm6", - [(set ty:$Rd, (not (shl ty:$Rm, - !cast("lsl_operand_" # ty):$Imm6)))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - - def _lsr : A64I_logicalshift("lsr_operand_" # ty):$Imm6), - "mvn\t$Rd, $Rm, $Imm6", - [(set ty:$Rd, (not (srl ty:$Rm, - !cast("lsr_operand_" # ty):$Imm6)))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def _asr : A64I_logicalshift("asr_operand_" # ty):$Imm6), - "mvn\t$Rd, $Rm, $Imm6", - [(set ty:$Rd, (not (sra ty:$Rm, - !cast("asr_operand_" # ty):$Imm6)))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - - def _ror : A64I_logicalshift("ror_operand_" # ty):$Imm6), - "mvn\t$Rd, $Rm, $Imm6", - [(set ty:$Rd, (not (rotr ty:$Rm, - !cast("lsl_operand_" # ty):$Imm6)))], - NoItinerary>, - Sched<[WriteALU, ReadALU, ReadALU]>; - } - - def _noshift : InstAlias<"mvn $Rn, $Rm", - (!cast(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>; - - def : Pat<(not ty:$Rm), - (!cast(prefix # "_lsl") $Rm, 0)>; -} - -defm MVNxx : mvn_shifts<"MVNxx", 0b1, i64, GPR64>; -defm MVNww : mvn_shifts<"MVNww", 0b0, i32, GPR32>; - -def MOVxx :InstAlias<"mov $Rd, $Rm", (ORRxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>; -def MOVww :InstAlias<"mov $Rd, $Rm", (ORRwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>; - -//===----------------------------------------------------------------------===// -// Move wide (immediate) instructions -//===----------------------------------------------------------------------===// -// Contains: MOVN, MOVZ, MOVK + MOV aliases - -// A wide variety of different relocations are needed for variants of these -// instructions, so it turns out that we need a different operand for all of -// them. 
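The A64WrapperLarge pattern later in this hunk materializes a 64-bit address as one MOVZ plus three MOVKs, one 16-bit chunk per instruction, highest chunk first. A sketch of that chunking, with printf standing in for instruction emission:

    #include <cstdint>
    #include <cstdio>

    static void materialize64(uint64_t imm) {
      std::printf("movz x0, #0x%llx, lsl #48\n",
                  (unsigned long long)((imm >> 48) & 0xffff));
      for (int shift = 32; shift >= 0; shift -= 16)
        std::printf("movk x0, #0x%llx, lsl #%d\n",
                    (unsigned long long)((imm >> shift) & 0xffff), shift);
    }

    int main() {
      materialize64(0x1122334455667788ULL);
      // Prints:
      //   movz x0, #0x1122, lsl #48
      //   movk x0, #0x3344, lsl #32
      //   movk x0, #0x5566, lsl #16
      //   movk x0, #0x7788, lsl #0
      return 0;
    }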
-multiclass movw_operands { - def _imm_asmoperand : AsmOperandClass { - let Name = instname # width # "Shifted" # shift; - let PredicateMethod = "is" # instname # width # "Imm"; - let RenderMethod = "addMoveWideImmOperands"; - let ParserMethod = "ParseImmWithLSLOperand"; - let DiagnosticType = "MOVWUImm16"; - } - - def _imm : Operand { - let ParserMatchClass = !cast(prefix # "_imm_asmoperand"); - let PrintMethod = "printMoveWideImmOperand"; - let EncoderMethod = "getMoveWideImmOpValue"; - let DecoderMethod = "DecodeMoveWideImmOperand<" # width # ">"; - - let MIOperandInfo = (ops uimm16:$UImm16, imm:$Shift); - } -} - -defm movn32 : movw_operands<"movn32", "MOVN", 32>; -defm movn64 : movw_operands<"movn64", "MOVN", 64>; -defm movz32 : movw_operands<"movz32", "MOVZ", 32>; -defm movz64 : movw_operands<"movz64", "MOVZ", 64>; -defm movk32 : movw_operands<"movk32", "MOVK", 32>; -defm movk64 : movw_operands<"movk64", "MOVK", 64>; - -multiclass A64I_movwSizes opc, string asmop, dag ins32bit, - dag ins64bit> { - - def wii : A64I_movw<0b0, opc, (outs GPR32:$Rd), ins32bit, - !strconcat(asmop, "\t$Rd, $FullImm"), - [], NoItinerary>, - Sched<[WriteALU]> { - bits<18> FullImm; - let UImm16 = FullImm{15-0}; - let Shift = FullImm{17-16}; - } - - def xii : A64I_movw<0b1, opc, (outs GPR64:$Rd), ins64bit, - !strconcat(asmop, "\t$Rd, $FullImm"), - [], NoItinerary>, - Sched<[WriteALU]> { - bits<18> FullImm; - let UImm16 = FullImm{15-0}; - let Shift = FullImm{17-16}; - } -} - -let isMoveImm = 1, isReMaterializable = 1, - isAsCheapAsAMove = 1, hasSideEffects = 0 in { - defm MOVN : A64I_movwSizes<0b00, "movn", - (ins movn32_imm:$FullImm), - (ins movn64_imm:$FullImm)>; - - // Some relocations are able to convert between a MOVZ and a MOVN. If these - // are applied the instruction must be emitted with the corresponding bits as - // 0, which means a MOVZ needs to override that bit from the default. - let PostEncoderMethod = "fixMOVZ" in - defm MOVZ : A64I_movwSizes<0b10, "movz", - (ins movz32_imm:$FullImm), - (ins movz64_imm:$FullImm)>; -} - -let Constraints = "$src = $Rd", - SchedRW = [WriteALU, ReadALU] in -defm MOVK : A64I_movwSizes<0b11, "movk", - (ins GPR32:$src, movk32_imm:$FullImm), - (ins GPR64:$src, movk64_imm:$FullImm)>; - - -// And now the "MOV" aliases. These also need their own operands because what -// they accept is completely different to what the base instructions accept. -multiclass movalias_operand { - def _asmoperand : AsmOperandClass { - let Name = basename # width # "MovAlias"; - let PredicateMethod - = "isMoveWideMovAlias<" # width # ", A64Imms::" # immpredicate # ">"; - let RenderMethod - = "addMoveWideMovAliasOperands<" # width # ", " - # "A64Imms::" # immpredicate # ">"; - } - - def _movimm : Operand { - let ParserMatchClass = !cast(prefix # "_asmoperand"); - - let MIOperandInfo = (ops uimm16:$UImm16, imm:$Shift); - } -} - -defm movz32 : movalias_operand<"movz32", "MOVZ", "isMOVZImm", 32>; -defm movz64 : movalias_operand<"movz64", "MOVZ", "isMOVZImm", 64>; -defm movn32 : movalias_operand<"movn32", "MOVN", "isOnlyMOVNImm", 32>; -defm movn64 : movalias_operand<"movn64", "MOVN", "isOnlyMOVNImm", 64>; - -// FIXME: these are officially canonical aliases, but TableGen is too limited to -// print them at the moment. I believe in this case an "AliasPredicate" method -// will need to be implemented. to allow it, as well as the more generally -// useful handling of non-register, non-constant operands. 
-class movalias - : InstAlias<"mov $Rd, $FullImm", (INST GPR:$Rd, operand:$FullImm)>; - -def : movalias; -def : movalias; -def : movalias; -def : movalias; - -def movw_addressref_g0 : ComplexPattern">; -def movw_addressref_g1 : ComplexPattern">; -def movw_addressref_g2 : ComplexPattern">; -def movw_addressref_g3 : ComplexPattern">; - -def : Pat<(A64WrapperLarge movw_addressref_g3:$G3, movw_addressref_g2:$G2, - movw_addressref_g1:$G1, movw_addressref_g0:$G0), - (MOVKxii (MOVKxii (MOVKxii (MOVZxii movw_addressref_g3:$G3), - movw_addressref_g2:$G2), - movw_addressref_g1:$G1), - movw_addressref_g0:$G0)>; - -//===----------------------------------------------------------------------===// -// PC-relative addressing instructions -//===----------------------------------------------------------------------===// -// Contains: ADR, ADRP - -def adr_label : Operand { - let EncoderMethod = "getLabelOpValue"; - - // This label is a 21-bit offset from PC, unscaled - let PrintMethod = "printLabelOperand<21, 1>"; - let ParserMatchClass = label_asmoperand<21, 1>; - let OperandType = "OPERAND_PCREL"; -} - -def adrp_label_asmoperand : AsmOperandClass { - let Name = "AdrpLabel"; - let RenderMethod = "addLabelOperands<21, 4096>"; - let DiagnosticType = "Label"; -} - -def adrp_label : Operand { - let EncoderMethod = "getAdrpLabelOpValue"; - - // This label is a 21-bit offset from PC, scaled by the page-size: 4096. - let PrintMethod = "printLabelOperand<21, 4096>"; - let ParserMatchClass = adrp_label_asmoperand; - let OperandType = "OPERAND_PCREL"; -} - -let hasSideEffects = 0 in { - def ADRxi : A64I_PCADR<0b0, (outs GPR64:$Rd), (ins adr_label:$Label), - "adr\t$Rd, $Label", [], NoItinerary>, - Sched<[WriteALUs]>; - - def ADRPxi : A64I_PCADR<0b1, (outs GPR64:$Rd), (ins adrp_label:$Label), - "adrp\t$Rd, $Label", [], NoItinerary>, - Sched<[WriteALUs]>; -} - -//===----------------------------------------------------------------------===// -// System instructions -//===----------------------------------------------------------------------===// -// Contains: HINT, CLREX, DSB, DMB, ISB, MSR, SYS, SYSL, MRS -// + aliases IC, DC, AT, TLBI, NOP, YIELD, WFE, WFI, SEV, SEVL - -// Op1 and Op2 fields are sometimes simple 3-bit unsigned immediate values. -def uimm3_asmoperand : AsmOperandClass { - let Name = "UImm3"; - let PredicateMethod = "isUImm<3>"; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "UImm3"; -} - -def uimm3 : Operand { - let ParserMatchClass = uimm3_asmoperand; -} - -// The HINT alias can accept a simple unsigned 7-bit immediate. -def uimm7_asmoperand : AsmOperandClass { - let Name = "UImm7"; - let PredicateMethod = "isUImm<7>"; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "UImm7"; -} - -def uimm7 : Operand { - let ParserMatchClass = uimm7_asmoperand; -} - -// Multiclass namedimm is defined with the prefetch operands. Most of these fit -// into the NamedImmMapper scheme well: they either accept a named operand or -// any immediate under a particular value (which may be 0, implying no immediate -// is allowed). 
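A sketch of the name-to-immediate lookup that a NamedImmMapper-style operand performs, using some of the DMB/DSB barrier options as the table. The CRm values shown are the architectural encodings, but the table and function here are illustrative only:

    #include <cassert>
    #include <cstring>

    struct NamedImm { const char *name; unsigned value; };

    static const NamedImm DBarrierOptions[] = {
        {"oshld", 0x1}, {"oshst", 0x2}, {"osh", 0x3},
        {"nshld", 0x5}, {"nshst", 0x6}, {"nsh", 0x7},
        {"ishld", 0x9}, {"ishst", 0xa}, {"ish", 0xb},
        {"ld",    0xd}, {"st",    0xe}, {"sy",  0xf},
    };

    static int lookupBarrier(const char *name) {
      for (const NamedImm &e : DBarrierOptions)
        if (std::strcmp(e.name, name) == 0)
          return static_cast<int>(e.value);
      return -1;  // the parser would then try a raw immediate below the limit
    }

    int main() {
      assert(lookupBarrier("ish") == 0xb);  // the "DMB ISH" used for fences
      assert(lookupBarrier("sy") == 0xf);
      assert(lookupBarrier("bogus") == -1);
      return 0;
    }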
-defm dbarrier : namedimm<"dbarrier", "A64DB::DBarrierMapper">; -defm isb : namedimm<"isb", "A64ISB::ISBMapper">; -defm ic : namedimm<"ic", "A64IC::ICMapper">; -defm dc : namedimm<"dc", "A64DC::DCMapper">; -defm at : namedimm<"at", "A64AT::ATMapper">; -defm tlbi : namedimm<"tlbi", "A64TLBI::TLBIMapper">; - -// However, MRS and MSR are more complicated for a few reasons: -// * There are ~1000 generic names S3____ which have an -// implementation-defined effect -// * Most registers are shared, but some are read-only or write-only. -// * There is a variant of MSR which accepts the same register name (SPSel), -// but which would have a different encoding. - -// In principle these could be resolved in with more complicated subclasses of -// NamedImmMapper, however that imposes an overhead on other "named -// immediates". Both in concrete terms with virtual tables and in unnecessary -// abstraction. - -// The solution adopted here is to take the MRS/MSR Mappers out of the usual -// hierarchy (they're not derived from NamedImmMapper) and to add logic for -// their special situation. -def mrs_asmoperand : AsmOperandClass { - let Name = "MRS"; - let ParserMethod = "ParseSysRegOperand"; - let DiagnosticType = "MRS"; -} - -def mrs_op : Operand { - let ParserMatchClass = mrs_asmoperand; - let PrintMethod = "printMRSOperand"; - let DecoderMethod = "DecodeMRSOperand"; -} - -def msr_asmoperand : AsmOperandClass { - let Name = "MSRWithReg"; - - // Note that SPSel is valid for both this and the pstate operands, but with - // different immediate encodings. This is why these operands provide a string - // AArch64Operand rather than an immediate. The overlap is small enough that - // it could be resolved with hackery now, but who can say in future? - let ParserMethod = "ParseSysRegOperand"; - let DiagnosticType = "MSR"; -} - -def msr_op : Operand { - let ParserMatchClass = msr_asmoperand; - let PrintMethod = "printMSROperand"; - let DecoderMethod = "DecodeMSROperand"; -} - -def pstate_asmoperand : AsmOperandClass { - let Name = "MSRPState"; - // See comment above about parser. - let ParserMethod = "ParseSysRegOperand"; - let DiagnosticType = "MSR"; -} - -def pstate_op : Operand { - let ParserMatchClass = pstate_asmoperand; - let PrintMethod = "printNamedImmOperand"; - let DecoderMethod = "DecodeNamedImmOperand"; -} - -// When is specified, an assembler should accept something like "C4", not -// the usual "#4" immediate. -def CRx_asmoperand : AsmOperandClass { - let Name = "CRx"; - let PredicateMethod = "isUImm<4>"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "ParseCRxOperand"; - // Diagnostics are handled in all cases by ParseCRxOperand. -} - -def CRx : Operand { - let ParserMatchClass = CRx_asmoperand; - let PrintMethod = "printCRxOperand"; -} - - -// Finally, we can start defining the instructions. - -// HINT is straightforward, with a few aliases. -def HINTi : A64I_system<0b0, (outs), (ins uimm7:$UImm7), "hint\t$UImm7", - [], NoItinerary> { - bits<7> UImm7; - let CRm = UImm7{6-3}; - let Op2 = UImm7{2-0}; - - let Op0 = 0b00; - let Op1 = 0b011; - let CRn = 0b0010; - let Rt = 0b11111; -} - -def : InstAlias<"nop", (HINTi 0)>; -def : InstAlias<"yield", (HINTi 1)>; -def : InstAlias<"wfe", (HINTi 2)>; -def : InstAlias<"wfi", (HINTi 3)>; -def : InstAlias<"sev", (HINTi 4)>; -def : InstAlias<"sevl", (HINTi 5)>; - -// Quite a few instructions then follow a similar pattern of fixing common -// fields in the bitpattern, we'll define a helper-class for them. 
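The HINT definition above scatters its 7-bit immediate across two system-instruction fields: CRm takes bits [6:3] and Op2 takes bits [2:0]. A sketch of that packing (helper names illustrative):

    #include <cassert>

    struct SysFields { unsigned crm, op2; };

    static SysFields splitHintImm(unsigned uimm7) {
      return {(uimm7 >> 3) & 0xf, uimm7 & 0x7};
    }

    static unsigned joinHintImm(SysFields f) { return (f.crm << 3) | f.op2; }

    int main() {
      // The aliases above: nop=0, yield=1, wfe=2, wfi=3, sev=4, sevl=5.
      SysFields wfi = splitHintImm(3);
      assert(wfi.crm == 0 && wfi.op2 == 3);
      SysFields big = splitHintImm(0x45);       // hint #69
      assert(big.crm == 0x8 && big.op2 == 0x5);
      assert(joinHintImm(big) == 0x45);
      return 0;
    }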
-class simple_sys op0, bits<3> op1, bits<4> crn, bits<3> op2, - Operand operand, string asmop> - : A64I_system<0b0, (outs), (ins operand:$CRm), !strconcat(asmop, "\t$CRm"), - [], NoItinerary> { - let Op0 = op0; - let Op1 = op1; - let CRn = crn; - let Op2 = op2; - let Rt = 0b11111; -} - - -def CLREXi : simple_sys<0b00, 0b011, 0b0011, 0b010, uimm4, "clrex">; -def DSBi : simple_sys<0b00, 0b011, 0b0011, 0b100, dbarrier_op, "dsb">; -def DMBi : simple_sys<0b00, 0b011, 0b0011, 0b101, dbarrier_op, "dmb">; -def ISBi : simple_sys<0b00, 0b011, 0b0011, 0b110, isb_op, "isb">; - -def : InstAlias<"clrex", (CLREXi 0b1111)>; -def : InstAlias<"isb", (ISBi 0b1111)>; - -// (DMBi 0xb) is a "DMB ISH" instruciton, appropriate for Linux SMP -// configurations at least. -def : Pat<(atomic_fence imm, imm), (DMBi 0xb)>; - -// Any SYS bitpattern can be represented with a complex and opaque "SYS" -// instruction. -def SYSiccix : A64I_system<0b0, (outs), - (ins uimm3:$Op1, CRx:$CRn, CRx:$CRm, - uimm3:$Op2, GPR64:$Rt), - "sys\t$Op1, $CRn, $CRm, $Op2, $Rt", - [], NoItinerary> { - let Op0 = 0b01; -} - -// You can skip the Xt argument whether it makes sense or not for the generic -// SYS instruction. -def : InstAlias<"sys $Op1, $CRn, $CRm, $Op2", - (SYSiccix uimm3:$Op1, CRx:$CRn, CRx:$CRm, uimm3:$Op2, XZR)>; - - -// But many have aliases, which obviously don't fit into -class SYSalias - : A64I_system<0b0, (outs), ins, asmstring, [], NoItinerary> { - let isAsmParserOnly = 1; - - bits<14> SysOp; - let Op0 = 0b01; - let Op1 = SysOp{13-11}; - let CRn = SysOp{10-7}; - let CRm = SysOp{6-3}; - let Op2 = SysOp{2-0}; -} - -def ICix : SYSalias<(ins ic_op:$SysOp, GPR64:$Rt), "ic\t$SysOp, $Rt">; - -def ICi : SYSalias<(ins ic_op:$SysOp), "ic\t$SysOp"> { - let Rt = 0b11111; -} - -def DCix : SYSalias<(ins dc_op:$SysOp, GPR64:$Rt), "dc\t$SysOp, $Rt">; -def ATix : SYSalias<(ins at_op:$SysOp, GPR64:$Rt), "at\t$SysOp, $Rt">; - -def TLBIix : SYSalias<(ins tlbi_op:$SysOp, GPR64:$Rt), "tlbi\t$SysOp, $Rt">; - -def TLBIi : SYSalias<(ins tlbi_op:$SysOp), "tlbi\t$SysOp"> { - let Rt = 0b11111; -} - - -def SYSLxicci : A64I_system<0b1, (outs GPR64:$Rt), - (ins uimm3:$Op1, CRx:$CRn, CRx:$CRm, uimm3:$Op2), - "sysl\t$Rt, $Op1, $CRn, $CRm, $Op2", - [], NoItinerary> { - let Op0 = 0b01; -} - -// The instructions themselves are rather simple for MSR and MRS. -def MSRix : A64I_system<0b0, (outs), (ins msr_op:$SysReg, GPR64:$Rt), - "msr\t$SysReg, $Rt", [], NoItinerary> { - bits<16> SysReg; - let Op0 = SysReg{15-14}; - let Op1 = SysReg{13-11}; - let CRn = SysReg{10-7}; - let CRm = SysReg{6-3}; - let Op2 = SysReg{2-0}; -} - -def MRSxi : A64I_system<0b1, (outs GPR64:$Rt), (ins mrs_op:$SysReg), - "mrs\t$Rt, $SysReg", [], NoItinerary> { - bits<16> SysReg; - let Op0 = SysReg{15-14}; - let Op1 = SysReg{13-11}; - let CRn = SysReg{10-7}; - let CRm = SysReg{6-3}; - let Op2 = SysReg{2-0}; -} - -def MSRii : A64I_system<0b0, (outs), (ins pstate_op:$PState, uimm4:$CRm), - "msr\t$PState, $CRm", [], NoItinerary> { - bits<6> PState; - - let Op0 = 0b00; - let Op1 = PState{5-3}; - let CRn = 0b0100; - let Op2 = PState{2-0}; - let Rt = 0b11111; -} - -//===----------------------------------------------------------------------===// -// Test & branch (immediate) instructions -//===----------------------------------------------------------------------===// -// Contains: TBZ, TBNZ - -// The bit to test is a simple unsigned 6-bit immediate in the X-register -// versions. 
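TBZ/TBNZ below encode a bit number, while the DAG presents an AND with a power-of-two mask; the tstb64_pat/tstb32_pat ComplexPatterns bridge the two. A sketch of the selection arithmetic and of the branch condition (selectTestBit is illustrative; __builtin_ctzll assumes GCC or Clang):

    #include <cassert>
    #include <cstdint>

    // Accept only a single-bit mask and recover its bit number.
    static bool selectTestBit(uint64_t mask, unsigned &bit) {
      if (mask == 0 || (mask & (mask - 1)) != 0)
        return false;                    // not a power of two
      bit = __builtin_ctzll(mask);       // log2 of the mask
      return true;
    }

    // "tbz xN, #bit, label" is taken iff the tested bit is clear.
    static bool tbz(uint64_t x, unsigned bit) { return ((x >> bit) & 1) == 0; }

    int main() {
      unsigned bit;
      assert(selectTestBit(0x80, bit) && bit == 7);
      assert(!selectTestBit(0x81, bit));  // two bits set: needs TST + B.cond
      assert(tbz(0x7f, 7) && !tbz(0x80, 7));
      return 0;
    }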
-def uimm6 : Operand { - let ParserMatchClass = uimm6_asmoperand; -} - -def label_wid14_scal4_asmoperand : label_asmoperand<14, 4>; - -def tbimm_target : Operand { - let EncoderMethod = "getLabelOpValue"; - - // This label is a 14-bit offset from PC, scaled by the instruction-width: 4. - let PrintMethod = "printLabelOperand<14, 4>"; - let ParserMatchClass = label_wid14_scal4_asmoperand; - - let OperandType = "OPERAND_PCREL"; -} - -def A64eq : ImmLeaf; -def A64ne : ImmLeaf; - -// These instructions correspond to patterns involving "and" with a power of -// two, which we need to be able to select. -def tstb64_pat : ComplexPattern">; -def tstb32_pat : ComplexPattern">; - -let isBranch = 1, isTerminator = 1 in { - def TBZxii : A64I_TBimm<0b0, (outs), - (ins GPR64:$Rt, uimm6:$Imm, tbimm_target:$Label), - "tbz\t$Rt, $Imm, $Label", - [(A64br_cc (A64cmp (and i64:$Rt, tstb64_pat:$Imm), 0), - A64eq, bb:$Label)], - NoItinerary>, - Sched<[WriteBr]>; - - def TBNZxii : A64I_TBimm<0b1, (outs), - (ins GPR64:$Rt, uimm6:$Imm, tbimm_target:$Label), - "tbnz\t$Rt, $Imm, $Label", - [(A64br_cc (A64cmp (and i64:$Rt, tstb64_pat:$Imm), 0), - A64ne, bb:$Label)], - NoItinerary>, - Sched<[WriteBr]>; - - - // Note, these instructions overlap with the above 64-bit patterns. This is - // intentional, "tbz x3, #1, somewhere" and "tbz w3, #1, somewhere" would both - // do the same thing and are both permitted assembly. They also both have - // sensible DAG patterns. - def TBZwii : A64I_TBimm<0b0, (outs), - (ins GPR32:$Rt, uimm5:$Imm, tbimm_target:$Label), - "tbz\t$Rt, $Imm, $Label", - [(A64br_cc (A64cmp (and i32:$Rt, tstb32_pat:$Imm), 0), - A64eq, bb:$Label)], - NoItinerary>, - Sched<[WriteBr]> { - let Imm{5} = 0b0; - } - - def TBNZwii : A64I_TBimm<0b1, (outs), - (ins GPR32:$Rt, uimm5:$Imm, tbimm_target:$Label), - "tbnz\t$Rt, $Imm, $Label", - [(A64br_cc (A64cmp (and i32:$Rt, tstb32_pat:$Imm), 0), - A64ne, bb:$Label)], - NoItinerary>, - Sched<[WriteBr]> { - let Imm{5} = 0b0; - } -} - -//===----------------------------------------------------------------------===// -// Unconditional branch (immediate) instructions -//===----------------------------------------------------------------------===// -// Contains: B, BL - -def label_wid26_scal4_asmoperand : label_asmoperand<26, 4>; - -def bimm_target : Operand { - let EncoderMethod = "getLabelOpValue"; - - // This label is a 26-bit offset from PC, scaled by the instruction-width: 4. - let PrintMethod = "printLabelOperand<26, 4>"; - let ParserMatchClass = label_wid26_scal4_asmoperand; - - let OperandType = "OPERAND_PCREL"; -} - -def blimm_target : Operand { - let EncoderMethod = "getLabelOpValue"; - - // This label is a 26-bit offset from PC, scaled by the instruction-width: 4. 
- let PrintMethod = "printLabelOperand<26, 4>"; - let ParserMatchClass = label_wid26_scal4_asmoperand; - - let OperandType = "OPERAND_PCREL"; -} - -class A64I_BimmImpl patterns, Operand lbl_type> - : A64I_Bimm, - Sched<[WriteBr]>; - -let isBranch = 1 in { - def Bimm : A64I_BimmImpl<0b0, "b", [(br bb:$Label)], bimm_target> { - let isTerminator = 1; - let isBarrier = 1; - } - - let SchedRW = [WriteBrL] in { - def BLimm : A64I_BimmImpl<0b1, "bl", - [(AArch64Call tglobaladdr:$Label)], blimm_target> { - let isCall = 1; - let Defs = [X30]; - } - } -} - -def : Pat<(AArch64Call texternalsym:$Label), (BLimm texternalsym:$Label)>; - -//===----------------------------------------------------------------------===// -// Unconditional branch (register) instructions -//===----------------------------------------------------------------------===// -// Contains: BR, BLR, RET, ERET, DRP. - -// Most of the notional opcode fields in the A64I_Breg format are fixed in A64 -// at the moment. -class A64I_BregImpl opc, - dag outs, dag ins, string asmstr, list patterns, - InstrItinClass itin = NoItinerary> - : A64I_Breg, - Sched<[WriteBr]> { - let isBranch = 1; - let isIndirectBranch = 1; -} - -// Note that these are not marked isCall or isReturn because as far as LLVM is -// concerned they're not. "ret" is just another jump unless it has been selected -// by LLVM as the function's return. - -let isBranch = 1 in { - def BRx : A64I_BregImpl<0b0000,(outs), (ins GPR64:$Rn), - "br\t$Rn", [(brind i64:$Rn)]> { - let isBarrier = 1; - let isTerminator = 1; - } - - let SchedRW = [WriteBrL] in { - def BLRx : A64I_BregImpl<0b0001, (outs), (ins GPR64:$Rn), - "blr\t$Rn", [(AArch64Call i64:$Rn)]> { - let isBarrier = 0; - let isCall = 1; - let Defs = [X30]; - } - } - - def RETx : A64I_BregImpl<0b0010, (outs), (ins GPR64:$Rn), - "ret\t$Rn", []> { - let isBarrier = 1; - let isTerminator = 1; - let isReturn = 1; - } - - // Create a separate pseudo-instruction for codegen to use so that we don't - // flag x30 as used in every function. It'll be restored before the RET by the - // epilogue if it's legitimately used. - def RET : A64PseudoExpand<(outs), (ins), [(A64ret)], (RETx (ops X30))> { - let isTerminator = 1; - let isBarrier = 1; - let isReturn = 1; - } - - def ERET : A64I_BregImpl<0b0100, (outs), (ins), "eret", []> { - let Rn = 0b11111; - let isBarrier = 1; - let isTerminator = 1; - let isReturn = 1; - } - - def DRPS : A64I_BregImpl<0b0101, (outs), (ins), "drps", []> { - let Rn = 0b11111; - let isBarrier = 1; - } -} - -def RETAlias : InstAlias<"ret", (RETx X30)>; - - -//===----------------------------------------------------------------------===// -// Address generation patterns -//===----------------------------------------------------------------------===// - -// Primary method of address generation for the small/absolute memory model is -// an ADRP/ADR pair: -// ADRP x0, some_variable -// ADD x0, x0, #:lo12:some_variable -// -// The load/store elision of the ADD is accomplished when selecting -// addressing-modes. This just mops up the cases where that doesn't work and we -// really need an address in some register. - -// This wrapper applies a LO12 modifier to the address. Otherwise we could just -// use the same address. 
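// To make the split concrete, here is a standalone llvm-tblgen sketch (record
// names hypothetical; illustration only, not part of the patch) of the page
// arithmetic the ADRP/ADD pair performs: ADRP materialises the 4KiB-aligned
// page address (modulo its PC-relativity) and the ADD supplies the low 12 bits.
class PageSplit<int addr> {
  int Page = !and(addr, -4096);  // what ADRP produces: addr with low 12 bits cleared
  int Lo12 = !and(addr, 4095);   // the #:lo12: addend carried by the ADD
}
def split_0x12678 : PageSplit<75384>;  // Page = 73728 (0x12000), Lo12 = 1656 (0x678)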
- -class ADRP_ADD - : Pat<(Wrapper addrop:$Hi, addrop:$Lo12, (i32 imm)), - (ADDxxi_lsl0_s (ADRPxi addrop:$Hi), addrop:$Lo12)>; - -def : ADRP_ADD; -def : ADRP_ADD; -def : ADRP_ADD; -def : ADRP_ADD; -def : ADRP_ADD; -def : ADRP_ADD; - -//===----------------------------------------------------------------------===// -// GOT access patterns -//===----------------------------------------------------------------------===// - -class GOTLoadSmall - : Pat<(A64GOTLoad (A64WrapperSmall addrfrag:$Hi, addrfrag:$Lo12, 8)), - (LS64_LDR (ADRPxi addrfrag:$Hi), addrfrag:$Lo12)>; - -def : GOTLoadSmall; -def : GOTLoadSmall; -def : GOTLoadSmall; - -//===----------------------------------------------------------------------===// -// Tail call handling -//===----------------------------------------------------------------------===// - -let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [XSP] in { - def TC_RETURNdi - : PseudoInst<(outs), (ins i64imm:$dst, i32imm:$FPDiff), - [(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff))]>; - - def TC_RETURNxi - : PseudoInst<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), - [(AArch64tcret i64:$dst, (i32 timm:$FPDiff))]>; -} - -let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, - Uses = [XSP] in { - def TAIL_Bimm : A64PseudoExpand<(outs), (ins bimm_target:$Label), [], - (Bimm bimm_target:$Label)>; - - def TAIL_BRx : A64PseudoExpand<(outs), (ins tcGPR64:$Rd), [], - (BRx GPR64:$Rd)>; -} - +let Predicates = [IsLE] in { +def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; + +def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i8 (bitconvert GPR64:$Xn)), + (REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v4i16 (bitconvert GPR64:$Xn)), + (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v2i32 (bitconvert GPR64:$Xn)), + (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v2f32 (bitconvert GPR64:$Xn)), + (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; + +def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), + (REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), + (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), + (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), + (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +} +def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; + +def : 
Pat<(f32 (bitconvert (i32 GPR32:$Xn))), + (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>; +def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))), + (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>; +def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), + (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>; +def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; + +let Predicates = [IsLE] in { +def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), + (v1i64 (REV64v2i32 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), + (v1i64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), + (v1i64 (REV64v8i8 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), + (v1i64 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), + (v2i32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), + (v2i32 (REV32v8i8 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), + (v4i16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), + (v4i16 (REV16v8i8 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), + (v4i16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 
FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), + (v8i8 (REV32v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), + (v8i8 (REV16v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), + (v8i8 (REV32v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), + (f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), + (f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), + (f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), + (f64 (REV64v8i8 FPR64:$src))>; +} +def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), + (v1f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), + (v1f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), + (v1f64 (REV64v8i8 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), + (v1f64 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), + (v2f32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), + (v2f32 (REV32v8i8 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 
FPR128:$src)>; +def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), + (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; +def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), + (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), + (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), + (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; +def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), + (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), + (f128 (EXTv16i8 (REV64v16i8 FPR128:$src), + (REV64v16i8 FPR128:$src), (i32 8)))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), + (v2f64 (EXTv16i8 FPR128:$src, + FPR128:$src, (i32 8)))>; +def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), + (v2f64 (REV64v4i32 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), + (v2f64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), + (v2f64 (REV64v16i8 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), + (v2f64 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), + (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), + (v4f32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), + (v4f32 (REV32v16i8 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), + (v4f32 (REV64v4i32 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), + (v4f32 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), + (v2i64 (EXTv16i8 FPR128:$src, 
+ FPR128:$src, (i32 8)))>; +def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), + (v2i64 (REV64v4i32 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), + (v2i64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), + (v2i64 (REV64v16i8 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), + (v2i64 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), + (v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), + (i32 8)))>; +def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), + (v4i32 (REV64v4i32 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), + (v4i32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), + (v4i32 (REV32v16i8 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), + (v4i32 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), + (v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), + (i32 8)))>; +def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), + (v8i16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), + (v8i16 (REV32v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), + (v8i16 (REV16v16i8 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), + (v8i16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), + (v8i16 (REV32v8i16 FPR128:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), + (v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src), + (REV64v16i8 FPR128:$src), + (i32 8)))>; +def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), + (v16i8 (REV64v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), + (v16i8 (REV32v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), + (v16i8 (REV16v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v2f64 
FPR128:$src))), + (v16i8 (REV64v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), + (v16i8 (REV32v16i8 FPR128:$src))>; +} + +def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; + +// A 64-bit subvector insert to the first 128-bit vector position +// is a subregister copy that needs no instruction. +def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; + +// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64 +// or v2f32. +def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)), + (vector_extract (v2i64 FPR128:$Rn), (i64 1)))), + (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>; +def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), + (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), + (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>; + // vector_extract on 64-bit vectors gets promoted to a 128 bit vector, + // so we match on v4f32 here, not v2f32. This will also catch adding + // the low two lanes of a true v4f32 vector. +def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), + (vector_extract (v4f32 FPR128:$Rn), (i64 1))), + (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; + +// Scalar 64-bit shifts in FPR64 registers. +def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; + +// Tail call return handling. These are all compiler pseudo-instructions, +// so no encoding information or anything like that. 
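// (FPDiff here, as in the tail-call lowering that produces these nodes, is the
// byte difference between the caller's and the callee's stack-passed-argument
// areas; a non-zero value means the outgoing arguments must be moved into
// place before the jump.)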
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { + def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>; + def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>; +} + +def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)), + (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>; +def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)), + (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), - (TC_RETURNdi texternalsym:$dst, imm:$FPDiff)>; - -//===----------------------------------------------------------------------===// -// Thread local storage -//===----------------------------------------------------------------------===// - -// This is a pseudo-instruction representing the ".tlsdesccall" directive in -// assembly. Its effect is to insert an R_AARCH64_TLSDESC_CALL relocation at the -// current location. It should always be immediately followed by a BLR -// instruction, and is intended solely for relaxation by the linker. - -def : Pat<(A64threadpointer), (MRSxi 0xde82)>; - -def TLSDESCCALL : PseudoInst<(outs), (ins i64imm:$Lbl), []> { - let hasSideEffects = 1; -} - -def TLSDESC_BLRx : PseudoInst<(outs), (ins GPR64:$Rn, i64imm:$Var), - [(A64tlsdesc_blr i64:$Rn, tglobaltlsaddr:$Var)]> { - let isCall = 1; - let Defs = [X30]; -} - -def : Pat<(A64tlsdesc_blr i64:$Rn, texternalsym:$Var), - (TLSDESC_BLRx $Rn, texternalsym:$Var)>; - -//===----------------------------------------------------------------------===// -// Bitfield patterns -//===----------------------------------------------------------------------===// - -def bfi32_lsb_to_immr : SDNodeXFormgetTargetConstant((32 - N->getZExtValue()) % 32, MVT::i64); -}]>; - -def bfi64_lsb_to_immr : SDNodeXFormgetTargetConstant((64 - N->getZExtValue()) % 64, MVT::i64); -}]>; - -def bfi_width_to_imms : SDNodeXFormgetTargetConstant(N->getZExtValue() - 1, MVT::i64); -}]>; - - -// The simpler patterns deal with cases where no AND mask is actually needed -// (either all bits are used or the low 32 bits are used). -let AddedComplexity = 10 in { - -def : Pat<(A64Bfi i64:$src, i64:$Rn, imm:$ImmR, imm:$ImmS), - (BFIxxii $src, $Rn, - (bfi64_lsb_to_immr (i64 imm:$ImmR)), - (bfi_width_to_imms (i64 imm:$ImmS)))>; - -def : Pat<(A64Bfi i32:$src, i32:$Rn, imm:$ImmR, imm:$ImmS), - (BFIwwii $src, $Rn, - (bfi32_lsb_to_immr (i64 imm:$ImmR)), - (bfi_width_to_imms (i64 imm:$ImmS)))>; - - -def : Pat<(and (A64Bfi i64:$src, i64:$Rn, imm:$ImmR, imm:$ImmS), - (i64 4294967295)), - (SUBREG_TO_REG (i64 0), - (BFIwwii (EXTRACT_SUBREG $src, sub_32), - (EXTRACT_SUBREG $Rn, sub_32), - (bfi32_lsb_to_immr (i64 imm:$ImmR)), - (bfi_width_to_imms (i64 imm:$ImmS))), - sub_32)>; - -} - -//===----------------------------------------------------------------------===// -// Miscellaneous patterns -//===----------------------------------------------------------------------===// - -// Truncation from 64 to 32-bits just involves renaming your register. -def : Pat<(i32 (trunc i64:$val)), (EXTRACT_SUBREG $val, sub_32)>; - -// Similarly, extension where we don't care about the high bits is -// just a rename. -def : Pat<(i64 (anyext i32:$val)), - (INSERT_SUBREG (IMPLICIT_DEF), $val, sub_32)>; - -// SELECT instructions providing f128 types need to be handled by a -// pseudo-instruction since the eventual code will need to introduce basic -// blocks and control flow. 
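// (Concretely: there is no FCSEL for the 128-bit floating-point register
// class, so the custom inserter expands this pseudo into a conditional branch
// over newly created blocks, with a PHI merging the two f128 inputs.)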
-def F128CSEL : PseudoInst<(outs FPR128:$Rd), - (ins FPR128:$Rn, FPR128:$Rm, cond_code_op:$Cond), - [(set f128:$Rd, (simple_select f128:$Rn, f128:$Rm))]> { - let Uses = [NZCV]; - let usesCustomInserter = 1; -} - -//===----------------------------------------------------------------------===// -// Load/store patterns -//===----------------------------------------------------------------------===// - -// There are lots of patterns here, because we need to allow at least three -// parameters to vary independently. -// 1. Instruction: "ldrb w9, [sp]", "ldrh w9, [sp]", ... -// 2. LLVM source: zextloadi8, anyextloadi8, ... -// 3. Address-generation: A64Wrapper, (add BASE, OFFSET), ... -// -// The biggest problem turns out to be the address-generation variable. At the -// point of instantiation we need to produce two DAGs, one for the pattern and -// one for the instruction. Doing this at the lowest level of classes doesn't -// work. -// -// Consider the simple uimm12 addressing mode, and the desire to match both (add -// GPR64xsp:$Rn, uimm12:$Offset) and GPR64xsp:$Rn, particularly on the -// instruction side. We'd need to insert either "GPR64xsp" and "uimm12" or -// "GPR64xsp" and "0" into an unknown dag. !subst is not capable of this -// operation, and PatFrags are for selection not output. -// -// As a result, the address-generation patterns are the final -// instantiations. However, we do still need to vary the operand for the address -// further down (At the point we're deciding A64WrapperSmall, we don't know -// the memory width of the operation). - -//===------------------------------ -// 1. Basic infrastructural defs -//===------------------------------ - -// First, some simple classes for !foreach and !subst to use: -class Decls { - dag pattern; -} - -def decls : Decls; -def ALIGN; -def INST; -def OFFSET; -def SHIFT; - -// You can't use !subst on an actual immediate, but you *can* use it on an -// operand record that happens to match a single immediate. So we do. -def imm_eq0 : ImmLeaf; -def imm_eq1 : ImmLeaf; -def imm_eq2 : ImmLeaf; -def imm_eq3 : ImmLeaf; -def imm_eq4 : ImmLeaf; - -// If the low bits of a pointer are known to be 0 then an "or" is just as good -// as addition for computing an offset. This fragment forwards that check for -// TableGen's use. -def add_like_or : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs), -[{ - return CurDAG->isBaseWithConstantOffset(SDValue(N, 0)); -}]>; - -// Load/store (unsigned immediate) operations with relocations against global -// symbols (for lo12) are only valid if those symbols have correct alignment -// (since the immediate offset is divided by the access scale, it can't have a -// remainder). -// -// The guaranteed alignment is provided as part of the WrapperSmall -// operation, and checked against one of these. -def any_align : ImmLeaf; -def min_align2 : ImmLeaf= 2; }]>; -def min_align4 : ImmLeaf= 4; }]>; -def min_align8 : ImmLeaf= 8; }]>; -def min_align16 : ImmLeaf= 16; }]>; - -// "Normal" load/store instructions can be used on atomic operations, provided -// the ordering parameter is at most "monotonic". Anything above that needs -// special handling with acquire/release instructions. 
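// (For reference, LLVM's AtomicOrdering enum numbers unordered and monotonic
// below acquire, release, acq_rel and seq_cst, so the "getOrdering() <=
// Monotonic" check in the fragments below accepts exactly the non-atomic,
// unordered and monotonic cases; the stronger orderings are left to the
// load-acquire/store-release instructions.)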
-class simple_load - : PatFrag<(ops node:$ptr), (base node:$ptr), [{ - return cast(N)->getOrdering() <= Monotonic; -}]>; - -def atomic_load_simple_i8 : simple_load; -def atomic_load_simple_i16 : simple_load; -def atomic_load_simple_i32 : simple_load; -def atomic_load_simple_i64 : simple_load; - -class simple_store - : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ - return cast(N)->getOrdering() <= Monotonic; -}]>; - -def atomic_store_simple_i8 : simple_store; -def atomic_store_simple_i16 : simple_store; -def atomic_store_simple_i32 : simple_store; -def atomic_store_simple_i64 : simple_store; - -//===------------------------------ -// 2. UImm12 and SImm9 -//===------------------------------ - -// These instructions have two operands providing the address so they can be -// treated similarly for most purposes. - -//===------------------------------ -// 2.1 Base patterns covering extend/truncate semantics -//===------------------------------ - -// Atomic patterns can be shared between integer operations of all sizes, a -// quick multiclass here allows reuse. -multiclass ls_atomic_pats { - def : Pat<(!cast("atomic_load_simple_" # sty) address), - (LOAD Base, Offset)>; - - def : Pat<(!cast("atomic_store_simple_" # sty) address, transty:$Rt), - (STORE $Rt, Base, Offset)>; -} - -// Instructions accessing a memory chunk smaller than a register (or, in a -// pinch, the same size) have a characteristic set of patterns they want to -// match: extending loads and truncating stores. This class deals with the -// sign-neutral version of those patterns. -// -// It will be instantiated across multiple addressing-modes. -multiclass ls_small_pats - : ls_atomic_pats { - def : Pat<(!cast(zextload # sty) address), (LOAD Base, Offset)>; - - def : Pat<(!cast(extload # sty) address), (LOAD Base, Offset)>; - - // For zero-extension to 64-bits we have to tell LLVM that the whole 64-bit - // register was actually set. - def : Pat<(i64 (!cast(zextload # sty) address)), - (SUBREG_TO_REG (i64 0), (LOAD Base, Offset), sub_32)>; - - def : Pat<(i64 (!cast(extload # sty) address)), - (SUBREG_TO_REG (i64 0), (LOAD Base, Offset), sub_32)>; - - def : Pat<(!cast(truncstore # sty) i32:$Rt, address), - (STORE $Rt, Base, Offset)>; - - // For truncating store from 64-bits, we have to manually tell LLVM to - // ignore the high bits of the x register. - def : Pat<(!cast(truncstore # sty) i64:$Rt, address), - (STORE (EXTRACT_SUBREG $Rt, sub_32), Base, Offset)>; -} - -// Next come patterns for sign-extending loads. -multiclass load_signed_pats { - def : Pat<(i32 (!cast("sextload" # sty) address)), - (!cast("LDRS" # T # "w" # U) Base, Offset)>; - - def : Pat<(i64 (!cast("sextload" # sty) address)), - (!cast("LDRS" # T # "x" # U) Base, Offset)>; - -} - -// and finally "natural-width" loads and stores come next. -multiclass ls_neutral_pats { - def : Pat<(sty (load address)), (LOAD Base, Offset)>; - def : Pat<(store sty:$Rt, address), (STORE $Rt, Base, Offset)>; -} - -// Integer operations also get atomic instructions to select for. -multiclass ls_int_neutral_pats - : ls_neutral_pats, - ls_atomic_pats; - -//===------------------------------ -// 2.2. 
Addressing-mode instantiations
-//===------------------------------
-
-multiclass uimm12_pats<dag address, dag Base, dag Offset> {
-  defm : ls_small_pats;
-  defm : ls_small_pats;
-  defm : ls_small_pats;
-
-  defm : ls_int_neutral_pats;
-
-  defm : ls_int_neutral_pats;
-
-  defm : ls_neutral_pats;
-
-  defm : ls_neutral_pats;
-
-  defm : ls_neutral_pats;
-
-  defm : ls_neutral_pats;
-
-  defm : load_signed_pats<"B", "", Base,
-                          !foreach(decls.pattern, Offset,
-                                   !subst(OFFSET, byte_uimm12, decls.pattern)),
-                          !foreach(decls.pattern, address,
-                                   !subst(OFFSET, byte_uimm12,
-                                   !subst(ALIGN, any_align, decls.pattern))),
-                          i8>;
-
-  defm : load_signed_pats<"H", "", Base,
-                          !foreach(decls.pattern, Offset,
-                                   !subst(OFFSET, hword_uimm12, decls.pattern)),
-                          !foreach(decls.pattern, address,
-                                   !subst(OFFSET, hword_uimm12,
-                                   !subst(ALIGN, min_align2, decls.pattern))),
-                          i16>;
-
-  def : Pat<(sextloadi32 !foreach(decls.pattern, address,
-                                  !subst(OFFSET, word_uimm12,
-                                  !subst(ALIGN, min_align4, decls.pattern)))),
-            (LDRSWx Base, !foreach(decls.pattern, Offset,
-                                   !subst(OFFSET, word_uimm12, decls.pattern)))>;
-}
-
-// Straightforward patterns of last resort: a pointer with or without an
-// appropriate offset.
-defm : uimm12_pats<(i64 i64:$Rn), (i64 i64:$Rn), (i64 0)>;
-defm : uimm12_pats<(add i64:$Rn, OFFSET:$UImm12),
-                   (i64 i64:$Rn), (i64 OFFSET:$UImm12)>;
-
-// The offset could be hidden behind an "or", of course:
-defm : uimm12_pats<(add_like_or i64:$Rn, OFFSET:$UImm12),
-                   (i64 i64:$Rn), (i64 OFFSET:$UImm12)>;
-
-// Global addresses under the small-absolute model should use these
-// instructions. There are ELF relocations specifically for it.
-defm : uimm12_pats<(A64WrapperSmall tglobaladdr:$Hi, tglobaladdr:$Lo12, ALIGN),
-                   (ADRPxi tglobaladdr:$Hi), (i64 tglobaladdr:$Lo12)>;
-
-defm : uimm12_pats<(A64WrapperSmall tglobaltlsaddr:$Hi, tglobaltlsaddr:$Lo12,
-                                    ALIGN),
-                   (ADRPxi tglobaltlsaddr:$Hi), (i64 tglobaltlsaddr:$Lo12)>;
-
-// External symbols that make it this far should also get standard relocations.
-defm : uimm12_pats<(A64WrapperSmall texternalsym:$Hi, texternalsym:$Lo12,
-                                    ALIGN),
-                   (ADRPxi texternalsym:$Hi), (i64 texternalsym:$Lo12)>;
-
-defm : uimm12_pats<(A64WrapperSmall tconstpool:$Hi, tconstpool:$Lo12, ALIGN),
-                   (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>;
-
-// We also want to use uimm12 instructions for local variables at the moment.
-def tframeindex_XFORM : SDNodeXForm<frameindex, [{
-  int FI = cast<FrameIndexSDNode>(N)->getIndex();
-  return CurDAG->getTargetFrameIndex(FI, MVT::i64);
-}]>;
-
-defm : uimm12_pats<(i64 frameindex:$Rn),
-                   (tframeindex_XFORM tframeindex:$Rn), (i64 0)>;
-
-// These can be much simpler than uimm12 because we don't have to change the
-// operand type (e.g. LDURB and LDURH take the same operands).
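// The OFFSET/ALIGN/SHIFT placeholders used by these addressing-mode
// multiclasses are ordinary records that !subst swaps for a concrete operand
// once the access width is known. As a string analogue of that dag-level
// substitution, a standalone llvm-tblgen sketch (hypothetical names, not part
// of the patch):
class FillOffset<string templ> {
  string Result = !subst("OFFSET", "byte_uimm12", templ);
}
def demo_subst : FillOffset<"(add GPR64xsp:$Rn, OFFSET:$UImm12)">;
// demo_subst.Result = "(add GPR64xsp:$Rn, byte_uimm12:$UImm12)"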
-multiclass simm9_pats { - defm : ls_small_pats; - defm : ls_small_pats; - - defm : ls_int_neutral_pats; - defm : ls_int_neutral_pats; - - defm : ls_neutral_pats; - defm : ls_neutral_pats; - defm : ls_neutral_pats; - defm : ls_neutral_pats; - - def : Pat<(i64 (zextloadi32 address)), - (SUBREG_TO_REG (i64 0), (LS32_LDUR Base, Offset), sub_32)>; - - def : Pat<(truncstorei32 i64:$Rt, address), - (LS32_STUR (EXTRACT_SUBREG $Rt, sub_32), Base, Offset)>; - - defm : load_signed_pats<"B", "_U", Base, Offset, address, i8>; - defm : load_signed_pats<"H", "_U", Base, Offset, address, i16>; - def : Pat<(sextloadi32 address), (LDURSWx Base, Offset)>; -} - -defm : simm9_pats<(add i64:$Rn, simm9:$SImm9), - (i64 $Rn), (SDXF_simm9 simm9:$SImm9)>; - -defm : simm9_pats<(add_like_or i64:$Rn, simm9:$SImm9), - (i64 $Rn), (SDXF_simm9 simm9:$SImm9)>; - - -//===------------------------------ -// 3. Register offset patterns -//===------------------------------ - -// Atomic patterns can be shared between integer operations of all sizes, a -// quick multiclass here allows reuse. -multiclass ro_atomic_pats { - def : Pat<(!cast("atomic_load_simple_" # sty) address), - (LOAD Base, Offset, Extend)>; - - def : Pat<(!cast("atomic_store_simple_" # sty) address, transty:$Rt), - (STORE $Rt, Base, Offset, Extend)>; -} - -// The register offset instructions take three operands giving the instruction, -// and have an annoying split between instructions where Rm is 32-bit and -// 64-bit. So we need a special hierarchy to describe them. Other than that the -// same operations should be supported as for simm9 and uimm12 addressing. - -multiclass ro_small_pats - : ro_atomic_pats { - def : Pat<(!cast(zextload # sty) address), - (LOAD Base, Offset, Extend)>; - - def : Pat<(!cast(extload # sty) address), - (LOAD Base, Offset, Extend)>; - - // For zero-extension to 64-bits we have to tell LLVM that the whole 64-bit - // register was actually set. - def : Pat<(i64 (!cast(zextload # sty) address)), - (SUBREG_TO_REG (i64 0), (LOAD Base, Offset, Extend), sub_32)>; - - def : Pat<(i64 (!cast(extload # sty) address)), - (SUBREG_TO_REG (i64 0), (LOAD Base, Offset, Extend), sub_32)>; - - def : Pat<(!cast(truncstore # sty) i32:$Rt, address), - (STORE $Rt, Base, Offset, Extend)>; - - // For truncating store from 64-bits, we have to manually tell LLVM to - // ignore the high bits of the x register. - def : Pat<(!cast(truncstore # sty) i64:$Rt, address), - (STORE (EXTRACT_SUBREG $Rt, sub_32), Base, Offset, Extend)>; - -} - -// Next come patterns for sign-extending loads. -multiclass ro_signed_pats { - def : Pat<(i32 (!cast("sextload" # sty) address)), - (!cast("LDRS" # T # "w_" # Rm # "_RegOffset") - Base, Offset, Extend)>; - - def : Pat<(i64 (!cast("sextload" # sty) address)), - (!cast("LDRS" # T # "x_" # Rm # "_RegOffset") - Base, Offset, Extend)>; -} - -// and finally "natural-width" loads and stores come next. 
-multiclass ro_neutral_pats { - def : Pat<(sty (load address)), (LOAD Base, Offset, Extend)>; - def : Pat<(store sty:$Rt, address), - (STORE $Rt, Base, Offset, Extend)>; -} - -multiclass ro_int_neutral_pats - : ro_neutral_pats, - ro_atomic_pats; - -multiclass regoff_pats { - defm : ro_small_pats("LS8_" # Rm # "_RegOffset_LDR"), - !cast("LS8_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq0, decls.pattern)), - i8>; - defm : ro_small_pats("LS16_" # Rm # "_RegOffset_LDR"), - !cast("LS16_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq1, decls.pattern)), - i16>; - defm : ro_small_pats("LS32_" # Rm # "_RegOffset_LDR"), - !cast("LS32_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq2, decls.pattern)), - i32>; - - defm : ro_int_neutral_pats< - !cast("LS32_" # Rm # "_RegOffset_LDR"), - !cast("LS32_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq2, decls.pattern)), - i32>; - - defm : ro_int_neutral_pats< - !cast("LS64_" # Rm # "_RegOffset_LDR"), - !cast("LS64_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq3, decls.pattern)), - i64>; - - defm : ro_neutral_pats("LSFP16_" # Rm # "_RegOffset_LDR"), - !cast("LSFP16_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq1, decls.pattern)), - f16>; - - defm : ro_neutral_pats("LSFP32_" # Rm # "_RegOffset_LDR"), - !cast("LSFP32_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq2, decls.pattern)), - f32>; - - defm : ro_neutral_pats("LSFP64_" # Rm # "_RegOffset_LDR"), - !cast("LSFP64_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq3, decls.pattern)), - f64>; - - defm : ro_neutral_pats("LSFP128_" # Rm # "_RegOffset_LDR"), - !cast("LSFP128_" # Rm # "_RegOffset_STR"), - Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq4, decls.pattern)), - f128>; - - defm : ro_signed_pats<"B", Rm, Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq0, decls.pattern)), - i8>; - - defm : ro_signed_pats<"H", Rm, Base, Offset, Extend, - !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq1, decls.pattern)), - i16>; - - def : Pat<(sextloadi32 !foreach(decls.pattern, address, - !subst(SHIFT, imm_eq2, decls.pattern))), - (!cast("LDRSWx_" # Rm # "_RegOffset") - Base, Offset, Extend)>; -} - - -// Finally we're in a position to tell LLVM exactly what addresses are reachable -// using register-offset instructions. Essentially a base plus a possibly -// extended, possibly shifted (by access size) offset. 
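// For a 32-bit access (where SHIFT becomes imm_eq2), for instance, the
// selectable forms correspond to assembly like:
//   ldr w0, [x1, w2, sxtw]     ; base + sext(w2)
//   ldr w0, [x1, w2, sxtw #2]  ; base + (sext(w2) << 2)
//   ldr w0, [x1, w2, uxtw #2]  ; base + (zext(w2) << 2)
//   ldr w0, [x1, x2]           ; base + x2
//   ldr w0, [x1, x2, lsl #2]   ; base + (x2 << 2)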
- -defm : regoff_pats<"Wm", (add i64:$Rn, (sext i32:$Rm)), - (i64 i64:$Rn), (i32 i32:$Rm), (i64 6)>; - -defm : regoff_pats<"Wm", (add i64:$Rn, (shl (sext i32:$Rm), SHIFT)), - (i64 i64:$Rn), (i32 i32:$Rm), (i64 7)>; - -defm : regoff_pats<"Wm", (add i64:$Rn, (zext i32:$Rm)), - (i64 i64:$Rn), (i32 i32:$Rm), (i64 2)>; - -defm : regoff_pats<"Wm", (add i64:$Rn, (shl (zext i32:$Rm), SHIFT)), - (i64 i64:$Rn), (i32 i32:$Rm), (i64 3)>; - -defm : regoff_pats<"Xm", (add i64:$Rn, i64:$Rm), - (i64 i64:$Rn), (i64 i64:$Rm), (i64 2)>; - -defm : regoff_pats<"Xm", (add i64:$Rn, (shl i64:$Rm, SHIFT)), - (i64 i64:$Rn), (i64 i64:$Rm), (i64 3)>; - -//===----------------------------------------------------------------------===// -// Advanced SIMD (NEON) Support -// + (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; -include "AArch64InstrNEON.td" +include "AArch64InstrAtomics.td" diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td deleted file mode 100644 index 0b97e3b..0000000 --- a/lib/Target/AArch64/AArch64InstrNEON.td +++ /dev/null @@ -1,9476 +0,0 @@ -//===-- AArch64InstrNEON.td - NEON support for AArch64 -----*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file describes the AArch64 NEON instruction set. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// NEON-specific DAG Nodes. -//===----------------------------------------------------------------------===// - -// (outs Result), (ins Imm, OpCmode) -def SDT_Neon_movi : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVT<1, i32>]>; - -def Neon_movi : SDNode<"AArch64ISD::NEON_MOVIMM", SDT_Neon_movi>; - -def Neon_mvni : SDNode<"AArch64ISD::NEON_MVNIMM", SDT_Neon_movi>; - -// (outs Result), (ins Imm) -def Neon_fmovi : SDNode<"AArch64ISD::NEON_FMOVIMM", SDTypeProfile<1, 1, - [SDTCisVec<0>, SDTCisVT<1, i32>]>>; - -// (outs Result), (ins LHS, RHS, CondCode) -def Neon_cmp : SDNode<"AArch64ISD::NEON_CMP", SDTypeProfile<1, 3, - [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>; - -// (outs Result), (ins LHS, 0/0.0 constant, CondCode) -def Neon_cmpz : SDNode<"AArch64ISD::NEON_CMPZ", SDTypeProfile<1, 3, - [SDTCisVec<0>, SDTCisVec<1>]>>; - -// (outs Result), (ins LHS, RHS) -def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2, - [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>; - -def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, - SDTCisVT<2, i32>]>; -def Neon_sqrshlImm : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>; -def Neon_uqrshlImm : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>; - -def SDTPERMUTE : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>]>; -def Neon_uzp1 : SDNode<"AArch64ISD::NEON_UZP1", SDTPERMUTE>; -def Neon_uzp2 : SDNode<"AArch64ISD::NEON_UZP2", SDTPERMUTE>; -def Neon_zip1 : SDNode<"AArch64ISD::NEON_ZIP1", SDTPERMUTE>; -def Neon_zip2 : SDNode<"AArch64ISD::NEON_ZIP2", SDTPERMUTE>; -def Neon_trn1 : SDNode<"AArch64ISD::NEON_TRN1", SDTPERMUTE>; -def Neon_trn2 : SDNode<"AArch64ISD::NEON_TRN2", SDTPERMUTE>; - -def SDTVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; -def Neon_rev64 : SDNode<"AArch64ISD::NEON_REV64", SDTVSHUF>; -def Neon_rev32 : SDNode<"AArch64ISD::NEON_REV32", SDTVSHUF>; -def Neon_rev16 : 
SDNode<"AArch64ISD::NEON_REV16", SDTVSHUF>; -def Neon_vdup : SDNode<"AArch64ISD::NEON_VDUP", SDTypeProfile<1, 1, - [SDTCisVec<0>]>>; -def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", SDTypeProfile<1, 2, - [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i64>]>>; -def Neon_vextract : SDNode<"AArch64ISD::NEON_VEXTRACT", SDTypeProfile<1, 3, - [SDTCisVec<0>, SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, SDTCisVT<3, i64>]>>; - -//===----------------------------------------------------------------------===// -// Addressing-mode instantiations -//===----------------------------------------------------------------------===// - -multiclass ls_64_pats { -defm : ls_neutral_pats; -} - -multiclass ls_128_pats { -defm : ls_neutral_pats; -} - -multiclass uimm12_neon_pats { - defm : ls_64_pats; - defm : ls_64_pats; - defm : ls_64_pats; - defm : ls_64_pats; - defm : ls_64_pats; - defm : ls_64_pats; - - defm : ls_128_pats; - defm : ls_128_pats; - defm : ls_128_pats; - defm : ls_128_pats; - defm : ls_128_pats; - defm : ls_128_pats; -} - -defm : uimm12_neon_pats<(A64WrapperSmall - tconstpool:$Hi, tconstpool:$Lo12, ALIGN), - (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>; - -//===----------------------------------------------------------------------===// -// Multiclasses -//===----------------------------------------------------------------------===// - -multiclass NeonI_3VSame_B_sizes size, bits<5> opcode, - string asmop, SDPatternOperator opnode8B, - SDPatternOperator opnode16B, - bit Commutable = 0> { - let isCommutable = Commutable in { - def _8B : NeonI_3VSame<0b0, u, size, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), - asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b", - [(set (v8i8 VPR64:$Rd), - (v8i8 (opnode8B (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def _16B : NeonI_3VSame<0b1, u, size, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b", - [(set (v16i8 VPR128:$Rd), - (v16i8 (opnode16B (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } - -} - -multiclass NeonI_3VSame_HS_sizes opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> { - let isCommutable = Commutable in { - def _4H : NeonI_3VSame<0b0, u, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), - asmop # "\t$Rd.4h, $Rn.4h, $Rm.4h", - [(set (v4i16 VPR64:$Rd), - (v4i16 (opnode (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def _8H : NeonI_3VSame<0b1, u, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.8h, $Rn.8h, $Rm.8h", - [(set (v8i16 VPR128:$Rd), - (v8i16 (opnode (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def _2S : NeonI_3VSame<0b0, u, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), - asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s", - [(set (v2i32 VPR64:$Rd), - (v2i32 (opnode (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def _4S : NeonI_3VSame<0b1, u, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s", - [(set (v4i32 VPR128:$Rd), - (v4i32 (opnode (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} -multiclass NeonI_3VSame_BHS_sizes opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> - : NeonI_3VSame_HS_sizes { - 
let isCommutable = Commutable in { - def _8B : NeonI_3VSame<0b0, u, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), - asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b", - [(set (v8i8 VPR64:$Rd), - (v8i8 (opnode (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def _16B : NeonI_3VSame<0b1, u, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b", - [(set (v16i8 VPR128:$Rd), - (v16i8 (opnode (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} - -multiclass NeonI_3VSame_BHSD_sizes opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> - : NeonI_3VSame_BHS_sizes { - let isCommutable = Commutable in { - def _2D : NeonI_3VSame<0b1, u, 0b11, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d", - [(set (v2i64 VPR128:$Rd), - (v2i64 (opnode (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} - -// Multiclass NeonI_3VSame_SD_sizes: Operand types are floating point types, -// but Result types can be integer or floating point types. -multiclass NeonI_3VSame_SD_sizes opcode, - string asmop, SDPatternOperator opnode, - ValueType ResTy2S, ValueType ResTy4S, - ValueType ResTy2D, bit Commutable = 0> { - let isCommutable = Commutable in { - def _2S : NeonI_3VSame<0b0, u, {size, 0b0}, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), - asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s", - [(set (ResTy2S VPR64:$Rd), - (ResTy2S (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def _4S : NeonI_3VSame<0b1, u, {size, 0b0}, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s", - [(set (ResTy4S VPR128:$Rd), - (ResTy4S (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def _2D : NeonI_3VSame<0b1, u, {size, 0b1}, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d", - [(set (ResTy2D VPR128:$Rd), - (ResTy2D (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} - -//===----------------------------------------------------------------------===// -// Instruction Definitions -//===----------------------------------------------------------------------===// - -// Vector Arithmetic Instructions - -// Vector Add (Integer and Floating-Point) - -defm ADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b10000, "add", add, 1>; -defm FADDvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11010, "fadd", fadd, - v2f32, v4f32, v2f64, 1>; - -// Patterns to match add of v1i8/v1i16/v1i32 types -def : Pat<(v1i8 (add FPR8:$Rn, FPR8:$Rm)), - (EXTRACT_SUBREG - (ADDvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)), - sub_8)>; -def : Pat<(v1i16 (add FPR16:$Rn, FPR16:$Rm)), - (EXTRACT_SUBREG - (ADDvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)), - sub_16)>; -def : Pat<(v1i32 (add FPR32:$Rn, FPR32:$Rm)), - (EXTRACT_SUBREG - (ADDvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)), - sub_32)>; - -// Vector Sub (Integer and Floating-Point) - -defm SUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10000, "sub", sub, 0>; -defm FSUBvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11010, "fsub", fsub, - v2f32, 
v4f32, v2f64, 0>; - -// Patterns to match sub of v1i8/v1i16/v1i32 types -def : Pat<(v1i8 (sub FPR8:$Rn, FPR8:$Rm)), - (EXTRACT_SUBREG - (SUBvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)), - sub_8)>; -def : Pat<(v1i16 (sub FPR16:$Rn, FPR16:$Rm)), - (EXTRACT_SUBREG - (SUBvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)), - sub_16)>; -def : Pat<(v1i32 (sub FPR32:$Rn, FPR32:$Rm)), - (EXTRACT_SUBREG - (SUBvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)), - sub_32)>; - -// Vector Multiply (Integer and Floating-Point) - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -defm MULvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10011, "mul", mul, 1>; -defm FMULvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11011, "fmul", fmul, - v2f32, v4f32, v2f64, 1>; -} - -// Patterns to match mul of v1i8/v1i16/v1i32 types -def : Pat<(v1i8 (mul FPR8:$Rn, FPR8:$Rm)), - (EXTRACT_SUBREG - (MULvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)), - sub_8)>; -def : Pat<(v1i16 (mul FPR16:$Rn, FPR16:$Rm)), - (EXTRACT_SUBREG - (MULvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)), - sub_16)>; -def : Pat<(v1i32 (mul FPR32:$Rn, FPR32:$Rm)), - (EXTRACT_SUBREG - (MULvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)), - sub_32)>; - -// Vector Multiply (Polynomial) - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -defm PMULvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b10011, "pmul", - int_arm_neon_vmulp, int_arm_neon_vmulp, 1>; -} - -// Vector Multiply-accumulate and Multiply-subtract (Integer) - -// class NeonI_3VSame_Constraint_impl: NeonI_3VSame with no data type and -// two operands constraints. 
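// (The "$src = $Rd" constraint the class declares is what models the
// read-modify-write nature of these operations: the accumulator arrives in
// $src, and the register allocator must assign it the same register as $Rd.)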
-class NeonI_3VSame_Constraint_impl size, - bits<5> opcode, SDPatternOperator opnode> - : NeonI_3VSame, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; -} - -def Neon_mla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (add node:$Ra, (mul node:$Rn, node:$Rm))>; - -def Neon_mls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (sub node:$Ra, (mul node:$Rn, node:$Rm))>; - - -let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC] in { -def MLAvvv_8B: NeonI_3VSame_Constraint_impl<"mla", ".8b", VPR64, v8i8, - 0b0, 0b0, 0b00, 0b10010, Neon_mla>; -def MLAvvv_16B: NeonI_3VSame_Constraint_impl<"mla", ".16b", VPR128, v16i8, - 0b1, 0b0, 0b00, 0b10010, Neon_mla>; -def MLAvvv_4H: NeonI_3VSame_Constraint_impl<"mla", ".4h", VPR64, v4i16, - 0b0, 0b0, 0b01, 0b10010, Neon_mla>; -def MLAvvv_8H: NeonI_3VSame_Constraint_impl<"mla", ".8h", VPR128, v8i16, - 0b1, 0b0, 0b01, 0b10010, Neon_mla>; -def MLAvvv_2S: NeonI_3VSame_Constraint_impl<"mla", ".2s", VPR64, v2i32, - 0b0, 0b0, 0b10, 0b10010, Neon_mla>; -def MLAvvv_4S: NeonI_3VSame_Constraint_impl<"mla", ".4s", VPR128, v4i32, - 0b1, 0b0, 0b10, 0b10010, Neon_mla>; - -def MLSvvv_8B: NeonI_3VSame_Constraint_impl<"mls", ".8b", VPR64, v8i8, - 0b0, 0b1, 0b00, 0b10010, Neon_mls>; -def MLSvvv_16B: NeonI_3VSame_Constraint_impl<"mls", ".16b", VPR128, v16i8, - 0b1, 0b1, 0b00, 0b10010, Neon_mls>; -def MLSvvv_4H: NeonI_3VSame_Constraint_impl<"mls", ".4h", VPR64, v4i16, - 0b0, 0b1, 0b01, 0b10010, Neon_mls>; -def MLSvvv_8H: NeonI_3VSame_Constraint_impl<"mls", ".8h", VPR128, v8i16, - 0b1, 0b1, 0b01, 0b10010, Neon_mls>; -def MLSvvv_2S: NeonI_3VSame_Constraint_impl<"mls", ".2s", VPR64, v2i32, - 0b0, 0b1, 0b10, 0b10010, Neon_mls>; -def MLSvvv_4S: NeonI_3VSame_Constraint_impl<"mls", ".4s", VPR128, v4i32, - 0b1, 0b1, 0b10, 0b10010, Neon_mls>; -} - -// Vector Multiply-accumulate and Multiply-subtract (Floating Point) - -def Neon_fmla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (fadd node:$Ra, (fmul_su node:$Rn, node:$Rm))>; - -def Neon_fmls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (fsub node:$Ra, (fmul_su node:$Rn, node:$Rm))>; - -let Predicates = [HasNEON, UseFusedMAC], - SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC] in { -def FMLAvvv_2S: NeonI_3VSame_Constraint_impl<"fmla", ".2s", VPR64, v2f32, - 0b0, 0b0, 0b00, 0b11001, Neon_fmla>; -def FMLAvvv_4S: NeonI_3VSame_Constraint_impl<"fmla", ".4s", VPR128, v4f32, - 0b1, 0b0, 0b00, 0b11001, Neon_fmla>; -def FMLAvvv_2D: NeonI_3VSame_Constraint_impl<"fmla", ".2d", VPR128, v2f64, - 0b1, 0b0, 0b01, 0b11001, Neon_fmla>; - -def FMLSvvv_2S: NeonI_3VSame_Constraint_impl<"fmls", ".2s", VPR64, v2f32, - 0b0, 0b0, 0b10, 0b11001, Neon_fmls>; -def FMLSvvv_4S: NeonI_3VSame_Constraint_impl<"fmls", ".4s", VPR128, v4f32, - 0b1, 0b0, 0b10, 0b11001, Neon_fmls>; -def FMLSvvv_2D: NeonI_3VSame_Constraint_impl<"fmls", ".2d", VPR128, v2f64, - 0b1, 0b0, 0b11, 0b11001, Neon_fmls>; -} - -// We're also allowed to match the fma instruction regardless of compile -// options. 
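// That is: an explicit fma node (e.g. from the @llvm.fma intrinsic) may always
// select fmla/fmls, whereas contracting a separate fmul+fadd pair via the
// Neon_fmla/Neon_fmls fragments above changes rounding behaviour and is
// therefore gated on UseFusedMAC.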
-def : Pat<(v2f32 (fma VPR64:$Rn, VPR64:$Rm, VPR64:$Ra)), - (FMLAvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>; -def : Pat<(v4f32 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)), - (FMLAvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; -def : Pat<(v2f64 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)), - (FMLAvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; - -def : Pat<(v2f32 (fma (fneg VPR64:$Rn), VPR64:$Rm, VPR64:$Ra)), - (FMLSvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>; -def : Pat<(v4f32 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)), - (FMLSvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; -def : Pat<(v2f64 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)), - (FMLSvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>; - -// Vector Divide (Floating-Point) - -let SchedRW = [WriteFPDiv, ReadFPDiv, ReadFPDiv] in { -defm FDIVvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11111, "fdiv", fdiv, - v2f32, v4f32, v2f64, 0>; -} - -// Vector Bitwise Operations - -// Vector Bitwise AND - -defm ANDvvv : NeonI_3VSame_B_sizes<0b0, 0b00, 0b00011, "and", and, and, 1>; - -// Vector Bitwise Exclusive OR - -defm EORvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b00011, "eor", xor, xor, 1>; - -// Vector Bitwise OR - -defm ORRvvv : NeonI_3VSame_B_sizes<0b0, 0b10, 0b00011, "orr", or, or, 1>; - -// ORR disassembled as MOV if Vn==Vm - -// Vector Move - register -// Alias for ORR if Vn=Vm. -// FIXME: This is actually the preferred syntax but TableGen can't deal with -// custom printing of aliases. -def : NeonInstAlias<"mov $Rd.8b, $Rn.8b", - (ORRvvv_8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rn), 0>; -def : NeonInstAlias<"mov $Rd.16b, $Rn.16b", - (ORRvvv_16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rn), 0>; - -// The MOVI instruction takes two immediate operands. The first is the -// immediate encoding, while the second is the cmode. A cmode of 14, or -// 0b1110, produces a MOVI operation, rather than a MVNI, ORR, or BIC. 
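To make the cmode comment above concrete before the Neon_AllZero/Neon_AllOne fragments that use it: with a cmode of 14 (0b1110) the 8-bit immediate is replicated into every byte lane, so the operand pair (0, 14) denotes the all-zeros vector and (255, 14) the all-ones vector. A standalone C++ sketch of that one case; expandMoviByteSplat is a hypothetical helper, not an LLVM API:

#include <cstdint>

// Per-byte replication performed by MOVI with cmode == 0b1110: every byte of
// the 64-bit result is the 8-bit immediate, so 0 gives 0x0000000000000000 and
// 255 gives 0xFFFFFFFFFFFFFFFF (the Neon_AllZero / Neon_AllOne cases below).
uint64_t expandMoviByteSplat(uint8_t Imm8) {
  uint64_t Result = 0;
  for (int Byte = 0; Byte < 8; ++Byte)
    Result |= uint64_t(Imm8) << (8 * Byte);
  return Result;
}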
-def Neon_AllZero : PatFrag<(ops), (Neon_movi (i32 0), (i32 14))>; -def Neon_AllOne : PatFrag<(ops), (Neon_movi (i32 255), (i32 14))>; - -def Neon_not8B : PatFrag<(ops node:$in), - (xor node:$in, (bitconvert (v8i8 Neon_AllOne)))>; -def Neon_not16B : PatFrag<(ops node:$in), - (xor node:$in, (bitconvert (v16i8 Neon_AllOne)))>; - -def Neon_orn8B : PatFrag<(ops node:$Rn, node:$Rm), - (or node:$Rn, (Neon_not8B node:$Rm))>; - -def Neon_orn16B : PatFrag<(ops node:$Rn, node:$Rm), - (or node:$Rn, (Neon_not16B node:$Rm))>; - -def Neon_bic8B : PatFrag<(ops node:$Rn, node:$Rm), - (and node:$Rn, (Neon_not8B node:$Rm))>; - -def Neon_bic16B : PatFrag<(ops node:$Rn, node:$Rm), - (and node:$Rn, (Neon_not16B node:$Rm))>; - - -// Vector Bitwise OR NOT - register - -defm ORNvvv : NeonI_3VSame_B_sizes<0b0, 0b11, 0b00011, "orn", - Neon_orn8B, Neon_orn16B, 0>; - -// Vector Bitwise Bit Clear (AND NOT) - register - -defm BICvvv : NeonI_3VSame_B_sizes<0b0, 0b01, 0b00011, "bic", - Neon_bic8B, Neon_bic16B, 0>; - -multiclass Neon_bitwise2V_patterns { - def : Pat<(v2i32 (opnode8B VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v4i16 (opnode8B VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v1i64 (opnode8B VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v4i32 (opnode16B VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v8i16 (opnode16B VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v2i64 (opnode16B VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$Rn, VPR128:$Rm)>; -} - -// Additional patterns for bitwise instructions AND, EOR, ORR, BIC, ORN -defm : Neon_bitwise2V_patterns; -defm : Neon_bitwise2V_patterns; -defm : Neon_bitwise2V_patterns; -defm : Neon_bitwise2V_patterns; -defm : Neon_bitwise2V_patterns; - -// Vector Bitwise Select -def BSLvvv_8B : NeonI_3VSame_Constraint_impl<"bsl", ".8b", VPR64, v8i8, - 0b0, 0b1, 0b01, 0b00011, vselect>; - -def BSLvvv_16B : NeonI_3VSame_Constraint_impl<"bsl", ".16b", VPR128, v16i8, - 0b1, 0b1, 0b01, 0b00011, vselect>; - -multiclass Neon_bitwise3V_patterns { - // Disassociate type from instruction definition - def : Pat<(v8i8 (opnode (v8i8 VPR64:$src), VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v2i32 (opnode (v2i32 VPR64:$src), VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v2f32 (opnode (v2i32 VPR64:$src), VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v4i16 (opnode (v4i16 VPR64:$src), VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v1i64 (opnode (v1i64 VPR64:$src), VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v1f64 (opnode (v1i64 VPR64:$src), VPR64:$Rn, VPR64:$Rm)), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v16i8 (opnode (v16i8 VPR128:$src), VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v4i32 (opnode (v4i32 VPR128:$src), VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v8i16 (opnode (v8i16 VPR128:$src), VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v2i64 (opnode (v2i64 VPR128:$src), VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v2f64 (opnode (v2i64 VPR128:$src), VPR128:$Rn, VPR128:$Rm)), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v4f32 (opnode (v4i32 VPR128:$src), VPR128:$Rn, 
VPR128:$Rm)), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - - // Allow to match BSL instruction pattern with non-constant operand - def : Pat<(v8i8 (or (and VPR64:$Rn, VPR64:$Rd), - (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), - (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v4i16 (or (and VPR64:$Rn, VPR64:$Rd), - (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), - (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v2i32 (or (and VPR64:$Rn, VPR64:$Rd), - (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), - (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v1i64 (or (and VPR64:$Rn, VPR64:$Rd), - (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))), - (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v16i8 (or (and VPR128:$Rn, VPR128:$Rd), - (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), - (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v8i16 (or (and VPR128:$Rn, VPR128:$Rd), - (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), - (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v4i32 (or (and VPR128:$Rn, VPR128:$Rd), - (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), - (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v2i64 (or (and VPR128:$Rn, VPR128:$Rd), - (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))), - (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>; - - // Allow to match llvm.arm.* intrinsics. - def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 VPR64:$src), - (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 VPR64:$src), - (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 VPR64:$src), - (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 VPR64:$src), - (v1i64 VPR64:$Rn), (v1i64 VPR64:$Rm))), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 VPR64:$src), - (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v1f64 (int_arm_neon_vbsl (v1f64 VPR64:$src), - (v1f64 VPR64:$Rn), (v1f64 VPR64:$Rm))), - (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; - def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 VPR128:$src), - (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 VPR128:$src), - (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 VPR128:$src), - (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 VPR128:$src), - (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 VPR128:$src), - (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; - def : Pat<(v2f64 (int_arm_neon_vbsl (v2f64 VPR128:$src), - (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))), - (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; -} - -// Additional patterns for bitwise instruction BSL -defm: Neon_bitwise3V_patterns; - -def Neon_NoBSLop : PatFrag<(ops node:$src, node:$Rn, node:$Rm), - (vselect node:$src, node:$Rn, node:$Rm), - [{ (void)N; return false; }]>; - -// Vector Bitwise Insert if True - -def BITvvv_8B : NeonI_3VSame_Constraint_impl<"bit", ".8b", VPR64, v8i8, - 0b0, 0b1, 0b10, 0b00011, Neon_NoBSLop>; -def BITvvv_16B : 
NeonI_3VSame_Constraint_impl<"bit", ".16b", VPR128, v16i8, - 0b1, 0b1, 0b10, 0b00011, Neon_NoBSLop>; - -// Vector Bitwise Insert if False - -def BIFvvv_8B : NeonI_3VSame_Constraint_impl<"bif", ".8b", VPR64, v8i8, - 0b0, 0b1, 0b11, 0b00011, Neon_NoBSLop>; -def BIFvvv_16B : NeonI_3VSame_Constraint_impl<"bif", ".16b", VPR128, v16i8, - 0b1, 0b1, 0b11, 0b00011, Neon_NoBSLop>; - -// Vector Absolute Difference and Accumulate (Signed, Unsigned) - -def Neon_uaba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (add node:$Ra, (int_arm_neon_vabdu node:$Rn, node:$Rm))>; -def Neon_saba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (add node:$Ra, (int_arm_neon_vabds node:$Rn, node:$Rm))>; - -// Vector Absolute Difference and Accumulate (Unsigned) -def UABAvvv_8B : NeonI_3VSame_Constraint_impl<"uaba", ".8b", VPR64, v8i8, - 0b0, 0b1, 0b00, 0b01111, Neon_uaba>; -def UABAvvv_16B : NeonI_3VSame_Constraint_impl<"uaba", ".16b", VPR128, v16i8, - 0b1, 0b1, 0b00, 0b01111, Neon_uaba>; -def UABAvvv_4H : NeonI_3VSame_Constraint_impl<"uaba", ".4h", VPR64, v4i16, - 0b0, 0b1, 0b01, 0b01111, Neon_uaba>; -def UABAvvv_8H : NeonI_3VSame_Constraint_impl<"uaba", ".8h", VPR128, v8i16, - 0b1, 0b1, 0b01, 0b01111, Neon_uaba>; -def UABAvvv_2S : NeonI_3VSame_Constraint_impl<"uaba", ".2s", VPR64, v2i32, - 0b0, 0b1, 0b10, 0b01111, Neon_uaba>; -def UABAvvv_4S : NeonI_3VSame_Constraint_impl<"uaba", ".4s", VPR128, v4i32, - 0b1, 0b1, 0b10, 0b01111, Neon_uaba>; - -// Vector Absolute Difference and Accumulate (Signed) -def SABAvvv_8B : NeonI_3VSame_Constraint_impl<"saba", ".8b", VPR64, v8i8, - 0b0, 0b0, 0b00, 0b01111, Neon_saba>; -def SABAvvv_16B : NeonI_3VSame_Constraint_impl<"saba", ".16b", VPR128, v16i8, - 0b1, 0b0, 0b00, 0b01111, Neon_saba>; -def SABAvvv_4H : NeonI_3VSame_Constraint_impl<"saba", ".4h", VPR64, v4i16, - 0b0, 0b0, 0b01, 0b01111, Neon_saba>; -def SABAvvv_8H : NeonI_3VSame_Constraint_impl<"saba", ".8h", VPR128, v8i16, - 0b1, 0b0, 0b01, 0b01111, Neon_saba>; -def SABAvvv_2S : NeonI_3VSame_Constraint_impl<"saba", ".2s", VPR64, v2i32, - 0b0, 0b0, 0b10, 0b01111, Neon_saba>; -def SABAvvv_4S : NeonI_3VSame_Constraint_impl<"saba", ".4s", VPR128, v4i32, - 0b1, 0b0, 0b10, 0b01111, Neon_saba>; - - -// Vector Absolute Difference (Signed, Unsigned) -defm UABDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01110, "uabd", int_arm_neon_vabdu, 0>; -defm SABDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01110, "sabd", int_arm_neon_vabds, 0>; - -// Vector Absolute Difference (Floating Point) -defm FABDvvv: NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11010, "fabd", - int_arm_neon_vabds, v2f32, v4f32, v2f64, 0>; - -// Vector Reciprocal Step (Floating Point) -defm FRECPSvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11111, "frecps", - int_arm_neon_vrecps, - v2f32, v4f32, v2f64, 0>; - -// Vector Reciprocal Square Root Step (Floating Point) -defm FRSQRTSvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", - int_arm_neon_vrsqrts, - v2f32, v4f32, v2f64, 0>; - -// Vector Comparisons - -def Neon_cmeq : PatFrag<(ops node:$lhs, node:$rhs), - (Neon_cmp node:$lhs, node:$rhs, SETEQ)>; -def Neon_cmphs : PatFrag<(ops node:$lhs, node:$rhs), - (Neon_cmp node:$lhs, node:$rhs, SETUGE)>; -def Neon_cmge : PatFrag<(ops node:$lhs, node:$rhs), - (Neon_cmp node:$lhs, node:$rhs, SETGE)>; -def Neon_cmhi : PatFrag<(ops node:$lhs, node:$rhs), - (Neon_cmp node:$lhs, node:$rhs, SETUGT)>; -def Neon_cmgt : PatFrag<(ops node:$lhs, node:$rhs), - (Neon_cmp node:$lhs, node:$rhs, SETGT)>; - -// NeonI_compare_aliases class: swaps register operands to implement -// comparison aliases, e.g., CMLE is alias for CMGE 
with operands reversed. -class NeonI_compare_aliases - : NeonInstAlias; - -// Vector Comparisons (Integer) - -// Vector Compare Mask Equal (Integer) -let isCommutable =1 in { -defm CMEQvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10001, "cmeq", Neon_cmeq, 0>; -} - -// Vector Compare Mask Higher or Same (Unsigned Integer) -defm CMHSvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00111, "cmhs", Neon_cmphs, 0>; - -// Vector Compare Mask Greater Than or Equal (Integer) -defm CMGEvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00111, "cmge", Neon_cmge, 0>; - -// Vector Compare Mask Higher (Unsigned Integer) -defm CMHIvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00110, "cmhi", Neon_cmhi, 0>; - -// Vector Compare Mask Greater Than (Integer) -defm CMGTvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00110, "cmgt", Neon_cmgt, 0>; - -// Vector Compare Mask Bitwise Test (Integer) -defm CMTSTvvv: NeonI_3VSame_BHSD_sizes<0b0, 0b10001, "cmtst", Neon_tst, 0>; - -// Vector Compare Mask Less or Same (Unsigned Integer) -// CMLS is alias for CMHS with operands reversed. -def CMLSvvv_8B : NeonI_compare_aliases<"cmls", ".8b", CMHSvvv_8B, VPR64>; -def CMLSvvv_16B : NeonI_compare_aliases<"cmls", ".16b", CMHSvvv_16B, VPR128>; -def CMLSvvv_4H : NeonI_compare_aliases<"cmls", ".4h", CMHSvvv_4H, VPR64>; -def CMLSvvv_8H : NeonI_compare_aliases<"cmls", ".8h", CMHSvvv_8H, VPR128>; -def CMLSvvv_2S : NeonI_compare_aliases<"cmls", ".2s", CMHSvvv_2S, VPR64>; -def CMLSvvv_4S : NeonI_compare_aliases<"cmls", ".4s", CMHSvvv_4S, VPR128>; -def CMLSvvv_2D : NeonI_compare_aliases<"cmls", ".2d", CMHSvvv_2D, VPR128>; - -// Vector Compare Mask Less Than or Equal (Integer) -// CMLE is alias for CMGE with operands reversed. -def CMLEvvv_8B : NeonI_compare_aliases<"cmle", ".8b", CMGEvvv_8B, VPR64>; -def CMLEvvv_16B : NeonI_compare_aliases<"cmle", ".16b", CMGEvvv_16B, VPR128>; -def CMLEvvv_4H : NeonI_compare_aliases<"cmle", ".4h", CMGEvvv_4H, VPR64>; -def CMLEvvv_8H : NeonI_compare_aliases<"cmle", ".8h", CMGEvvv_8H, VPR128>; -def CMLEvvv_2S : NeonI_compare_aliases<"cmle", ".2s", CMGEvvv_2S, VPR64>; -def CMLEvvv_4S : NeonI_compare_aliases<"cmle", ".4s", CMGEvvv_4S, VPR128>; -def CMLEvvv_2D : NeonI_compare_aliases<"cmle", ".2d", CMGEvvv_2D, VPR128>; - -// Vector Compare Mask Lower (Unsigned Integer) -// CMLO is alias for CMHI with operands reversed. -def CMLOvvv_8B : NeonI_compare_aliases<"cmlo", ".8b", CMHIvvv_8B, VPR64>; -def CMLOvvv_16B : NeonI_compare_aliases<"cmlo", ".16b", CMHIvvv_16B, VPR128>; -def CMLOvvv_4H : NeonI_compare_aliases<"cmlo", ".4h", CMHIvvv_4H, VPR64>; -def CMLOvvv_8H : NeonI_compare_aliases<"cmlo", ".8h", CMHIvvv_8H, VPR128>; -def CMLOvvv_2S : NeonI_compare_aliases<"cmlo", ".2s", CMHIvvv_2S, VPR64>; -def CMLOvvv_4S : NeonI_compare_aliases<"cmlo", ".4s", CMHIvvv_4S, VPR128>; -def CMLOvvv_2D : NeonI_compare_aliases<"cmlo", ".2d", CMHIvvv_2D, VPR128>; - -// Vector Compare Mask Less Than (Integer) -// CMLT is alias for CMGT with operands reversed. 
-def CMLTvvv_8B : NeonI_compare_aliases<"cmlt", ".8b", CMGTvvv_8B, VPR64>; -def CMLTvvv_16B : NeonI_compare_aliases<"cmlt", ".16b", CMGTvvv_16B, VPR128>; -def CMLTvvv_4H : NeonI_compare_aliases<"cmlt", ".4h", CMGTvvv_4H, VPR64>; -def CMLTvvv_8H : NeonI_compare_aliases<"cmlt", ".8h", CMGTvvv_8H, VPR128>; -def CMLTvvv_2S : NeonI_compare_aliases<"cmlt", ".2s", CMGTvvv_2S, VPR64>; -def CMLTvvv_4S : NeonI_compare_aliases<"cmlt", ".4s", CMGTvvv_4S, VPR128>; -def CMLTvvv_2D : NeonI_compare_aliases<"cmlt", ".2d", CMGTvvv_2D, VPR128>; - - -def neon_uimm0_asmoperand : AsmOperandClass -{ - let Name = "UImm0"; - let PredicateMethod = "isUImm<0>"; - let RenderMethod = "addImmOperands"; -} - -def neon_uimm0 : Operand, ImmLeaf { - let ParserMatchClass = neon_uimm0_asmoperand; - let PrintMethod = "printNeonUImm0Operand"; - -} - -multiclass NeonI_cmpz_sizes opcode, string asmop, CondCode CC> -{ - def _8B : NeonI_2VMisc<0b0, u, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), - asmop # "\t$Rd.8b, $Rn.8b, $Imm", - [(set (v8i8 VPR64:$Rd), - (v8i8 (Neon_cmpz (v8i8 VPR64:$Rn), (i32 imm:$Imm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _16B : NeonI_2VMisc<0b1, u, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), - asmop # "\t$Rd.16b, $Rn.16b, $Imm", - [(set (v16i8 VPR128:$Rd), - (v16i8 (Neon_cmpz (v16i8 VPR128:$Rn), (i32 imm:$Imm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _4H : NeonI_2VMisc<0b0, u, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), - asmop # "\t$Rd.4h, $Rn.4h, $Imm", - [(set (v4i16 VPR64:$Rd), - (v4i16 (Neon_cmpz (v4i16 VPR64:$Rn), (i32 imm:$Imm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _8H : NeonI_2VMisc<0b1, u, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), - asmop # "\t$Rd.8h, $Rn.8h, $Imm", - [(set (v8i16 VPR128:$Rd), - (v8i16 (Neon_cmpz (v8i16 VPR128:$Rn), (i32 imm:$Imm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _2S : NeonI_2VMisc<0b0, u, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm), - asmop # "\t$Rd.2s, $Rn.2s, $Imm", - [(set (v2i32 VPR64:$Rd), - (v2i32 (Neon_cmpz (v2i32 VPR64:$Rn), (i32 imm:$Imm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _4S : NeonI_2VMisc<0b1, u, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), - asmop # "\t$Rd.4s, $Rn.4s, $Imm", - [(set (v4i32 VPR128:$Rd), - (v4i32 (Neon_cmpz (v4i32 VPR128:$Rn), (i32 imm:$Imm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _2D : NeonI_2VMisc<0b1, u, 0b11, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm), - asmop # "\t$Rd.2d, $Rn.2d, $Imm", - [(set (v2i64 VPR128:$Rd), - (v2i64 (Neon_cmpz (v2i64 VPR128:$Rn), (i32 imm:$Imm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -// Vector Compare Mask Equal to Zero (Integer) -defm CMEQvvi : NeonI_cmpz_sizes<0b0, 0b01001, "cmeq", SETEQ>; - -// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer) -defm CMGEvvi : NeonI_cmpz_sizes<0b1, 0b01000, "cmge", SETGE>; - -// Vector Compare Mask Greater Than Zero (Signed Integer) -defm CMGTvvi : NeonI_cmpz_sizes<0b0, 0b01000, "cmgt", SETGT>; - -// Vector Compare Mask Less Than or Equal To Zero (Signed Integer) -defm CMLEvvi : NeonI_cmpz_sizes<0b1, 0b01001, "cmle", SETLE>; - -// Vector Compare Mask Less Than Zero (Signed Integer) -defm CMLTvvi : NeonI_cmpz_sizes<0b0, 0b01010, "cmlt", SETLT>; - -// Vector Comparisons (Floating Point) - -// Vector Compare 
Mask Equal (Floating Point) -let isCommutable =1 in { -defm FCMEQvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11100, "fcmeq", Neon_cmeq, - v2i32, v4i32, v2i64, 0>; -} - -// Vector Compare Mask Greater Than Or Equal (Floating Point) -defm FCMGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11100, "fcmge", Neon_cmge, - v2i32, v4i32, v2i64, 0>; - -// Vector Compare Mask Greater Than (Floating Point) -defm FCMGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11100, "fcmgt", Neon_cmgt, - v2i32, v4i32, v2i64, 0>; - -// Vector Compare Mask Less Than Or Equal (Floating Point) -// FCMLE is alias for FCMGE with operands reversed. -def FCMLEvvv_2S : NeonI_compare_aliases<"fcmle", ".2s", FCMGEvvv_2S, VPR64>; -def FCMLEvvv_4S : NeonI_compare_aliases<"fcmle", ".4s", FCMGEvvv_4S, VPR128>; -def FCMLEvvv_2D : NeonI_compare_aliases<"fcmle", ".2d", FCMGEvvv_2D, VPR128>; - -// Vector Compare Mask Less Than (Floating Point) -// FCMLT is alias for FCMGT with operands reversed. -def FCMLTvvv_2S : NeonI_compare_aliases<"fcmlt", ".2s", FCMGTvvv_2S, VPR64>; -def FCMLTvvv_4S : NeonI_compare_aliases<"fcmlt", ".4s", FCMGTvvv_4S, VPR128>; -def FCMLTvvv_2D : NeonI_compare_aliases<"fcmlt", ".2d", FCMGTvvv_2D, VPR128>; - -def fpzero_izero_asmoperand : AsmOperandClass { - let Name = "FPZeroIZero"; - let ParserMethod = "ParseFPImm0AndImm0Operand"; - let DiagnosticType = "FPZero"; -} - -def fpzz32 : Operand, - ComplexPattern { - let ParserMatchClass = fpzero_izero_asmoperand; - let PrintMethod = "printFPZeroOperand"; - let DecoderMethod = "DecodeFPZeroOperand"; -} - -multiclass NeonI_fpcmpz_sizes opcode, - string asmop, CondCode CC> -{ - def _2S : NeonI_2VMisc<0b0, u, {size, 0b0}, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn, fpzz32:$FPImm), - asmop # "\t$Rd.2s, $Rn.2s, $FPImm", - [(set (v2i32 VPR64:$Rd), - (v2i32 (Neon_cmpz (v2f32 VPR64:$Rn), (f32 fpzz32:$FPImm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _4S : NeonI_2VMisc<0b1, u, {size, 0b0}, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, fpzz32:$FPImm), - asmop # "\t$Rd.4s, $Rn.4s, $FPImm", - [(set (v4i32 VPR128:$Rd), - (v4i32 (Neon_cmpz (v4f32 VPR128:$Rn), (f32 fpzz32:$FPImm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _2D : NeonI_2VMisc<0b1, u, {size, 0b1}, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, fpzz32:$FPImm), - asmop # "\t$Rd.2d, $Rn.2d, $FPImm", - [(set (v2i64 VPR128:$Rd), - (v2i64 (Neon_cmpz (v2f64 VPR128:$Rn), (f32 fpzz32:$FPImm), CC)))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -// Vector Compare Mask Equal to Zero (Floating Point) -defm FCMEQvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01101, "fcmeq", SETEQ>; - -// Vector Compare Mask Greater Than or Equal to Zero (Floating Point) -defm FCMGEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01100, "fcmge", SETGE>; - -// Vector Compare Mask Greater Than Zero (Floating Point) -defm FCMGTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01100, "fcmgt", SETGT>; - -// Vector Compare Mask Less Than or Equal To Zero (Floating Point) -defm FCMLEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01101, "fcmle", SETLE>; - -// Vector Compare Mask Less Than Zero (Floating Point) -defm FCMLTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01110, "fcmlt", SETLT>; - -// Vector Absolute Comparisons (Floating Point) - -// Vector Absolute Compare Mask Greater Than Or Equal (Floating Point) -defm FACGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11101, "facge", - int_arm_neon_vacge, - v2i32, v4i32, v2i64, 0>; - -// Vector Absolute Compare Mask Greater Than (Floating Point) -defm FACGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11101, "facgt", - 
int_arm_neon_vacgt, - v2i32, v4i32, v2i64, 0>; - -// Vector Absolute Compare Mask Less Than Or Equal (Floating Point) -// FACLE is alias for FACGE with operands reversed. -def FACLEvvv_2S : NeonI_compare_aliases<"facle", ".2s", FACGEvvv_2S, VPR64>; -def FACLEvvv_4S : NeonI_compare_aliases<"facle", ".4s", FACGEvvv_4S, VPR128>; -def FACLEvvv_2D : NeonI_compare_aliases<"facle", ".2d", FACGEvvv_2D, VPR128>; - -// Vector Absolute Compare Mask Less Than (Floating Point) -// FACLT is alias for FACGT with operands reversed. -def FACLTvvv_2S : NeonI_compare_aliases<"faclt", ".2s", FACGTvvv_2S, VPR64>; -def FACLTvvv_4S : NeonI_compare_aliases<"faclt", ".4s", FACGTvvv_4S, VPR128>; -def FACLTvvv_2D : NeonI_compare_aliases<"faclt", ".2d", FACGTvvv_2D, VPR128>; - -// Vector halving add (Integer Signed, Unsigned) -defm SHADDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00000, "shadd", - int_arm_neon_vhadds, 1>; -defm UHADDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00000, "uhadd", - int_arm_neon_vhaddu, 1>; - -// Vector halving sub (Integer Signed, Unsigned) -defm SHSUBvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00100, "shsub", - int_arm_neon_vhsubs, 0>; -defm UHSUBvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00100, "uhsub", - int_arm_neon_vhsubu, 0>; - -// Vector rounding halving add (Integer Signed, Unsigned) -defm SRHADDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00010, "srhadd", - int_arm_neon_vrhadds, 1>; -defm URHADDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00010, "urhadd", - int_arm_neon_vrhaddu, 1>; - -// Vector Saturating add (Integer Signed, Unsigned) -defm SQADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00001, "sqadd", - int_arm_neon_vqadds, 1>; -defm UQADDvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00001, "uqadd", - int_arm_neon_vqaddu, 1>; - -// Vector Saturating sub (Integer Signed, Unsigned) -defm SQSUBvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00101, "sqsub", - int_arm_neon_vqsubs, 1>; -defm UQSUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00101, "uqsub", - int_arm_neon_vqsubu, 1>; - -// Vector Shift Left (Signed and Unsigned Integer) -defm SSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01000, "sshl", - int_arm_neon_vshifts, 1>; -defm USHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01000, "ushl", - int_arm_neon_vshiftu, 1>; - -// Vector Saturating Shift Left (Signed and Unsigned Integer) -defm SQSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01001, "sqshl", - int_arm_neon_vqshifts, 1>; -defm UQSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01001, "uqshl", - int_arm_neon_vqshiftu, 1>; - -// Vector Rounding Shift Left (Signed and Unsigned Integer) -defm SRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01010, "srshl", - int_arm_neon_vrshifts, 1>; -defm URSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01010, "urshl", - int_arm_neon_vrshiftu, 1>; - -// Vector Saturating Rounding Shift Left (Signed and Unsigned Integer) -defm SQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01011, "sqrshl", - int_arm_neon_vqrshifts, 1>; -defm UQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01011, "uqrshl", - int_arm_neon_vqrshiftu, 1>; - -// Vector Maximum (Signed and Unsigned Integer) -defm SMAXvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01100, "smax", int_arm_neon_vmaxs, 1>; -defm UMAXvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01100, "umax", int_arm_neon_vmaxu, 1>; - -// Vector Minimum (Signed and Unsigned Integer) -defm SMINvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01101, "smin", int_arm_neon_vmins, 1>; -defm UMINvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01101, "umin", int_arm_neon_vminu, 1>; - -// Vector Maximum (Floating Point) -defm FMAXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11110, "fmax", - int_arm_neon_vmaxs, - v2f32, v4f32, v2f64, 1>; - -// Vector Minimum
(Floating Point) -defm FMINvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11110, "fmin", - int_arm_neon_vmins, - v2f32, v4f32, v2f64, 1>; - -// Vector maxNum (Floating Point) - prefer a number over a quiet NaN -defm FMAXNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11000, "fmaxnm", - int_aarch64_neon_vmaxnm, - v2f32, v4f32, v2f64, 1>; - -// Vector minNum (Floating Point) - prefer a number over a quiet NaN -defm FMINNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11000, "fminnm", - int_aarch64_neon_vminnm, - v2f32, v4f32, v2f64, 1>; - -// Vector Maximum Pairwise (Signed and Unsigned Integer) -defm SMAXPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10100, "smaxp", int_arm_neon_vpmaxs, 1>; -defm UMAXPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10100, "umaxp", int_arm_neon_vpmaxu, 1>; - -// Vector Minimum Pairwise (Signed and Unsigned Integer) -defm SMINPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10101, "sminp", int_arm_neon_vpmins, 1>; -defm UMINPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10101, "uminp", int_arm_neon_vpminu, 1>; - -// Vector Maximum Pairwise (Floating Point) -defm FMAXPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11110, "fmaxp", - int_arm_neon_vpmaxs, v2f32, v4f32, v2f64, 1>; - -// Vector Minimum Pairwise (Floating Point) -defm FMINPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11110, "fminp", - int_arm_neon_vpmins, v2f32, v4f32, v2f64, 1>; - -// Vector maxNum Pairwise (Floating Point) - prefer a number over a quiet NaN -defm FMAXNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11000, "fmaxnmp", - int_aarch64_neon_vpmaxnm, - v2f32, v4f32, v2f64, 1>; - -// Vector minNum Pairwise (Floating Point) - prefer a number over a quiet NaN -defm FMINNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11000, "fminnmp", - int_aarch64_neon_vpminnm, - v2f32, v4f32, v2f64, 1>; - -// Vector Addition Pairwise (Integer) -defm ADDP : NeonI_3VSame_BHSD_sizes<0b0, 0b10111, "addp", int_arm_neon_vpadd, 1>; - -// Vector Addition Pairwise (Floating Point) -defm FADDP : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11010, "faddp", - int_arm_neon_vpadd, - v2f32, v4f32, v2f64, 1>; - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -// Vector Saturating Doubling Multiply High -defm SQDMULHvvv : NeonI_3VSame_HS_sizes<0b0, 0b10110, "sqdmulh", - int_arm_neon_vqdmulh, 1>; - -// Vector Saturating Rounding Doubling Multiply High -defm SQRDMULHvvv : NeonI_3VSame_HS_sizes<0b1, 0b10110, "sqrdmulh", - int_arm_neon_vqrdmulh, 1>; - -// Vector Multiply Extended (Floating Point) -defm FMULXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11011, "fmulx", - int_aarch64_neon_vmulx, - v2f32, v4f32, v2f64, 1>; -} - -// Patterns to match llvm.aarch64.* intrinsic for -// ADDP, SMINP, UMINP, SMAXP, UMAXP having i32 as output -class Neon_VectorPair_v2i32_pattern - : Pat<(v1i32 (opnode (v2i32 VPR64:$Rn))), - (EXTRACT_SUBREG - (v2i32 (INST (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rn))), - sub_32)>; - -def : Neon_VectorPair_v2i32_pattern; -def : Neon_VectorPair_v2i32_pattern; -def : Neon_VectorPair_v2i32_pattern; -def : Neon_VectorPair_v2i32_pattern; -def : Neon_VectorPair_v2i32_pattern; - -// Vector Immediate Instructions - -multiclass neon_mov_imm_shift_asmoperands -{ - def _asmoperand : AsmOperandClass - { - let Name = "NeonMovImmShift" # PREFIX; - let RenderMethod = "addNeonMovImmShift" # PREFIX # "Operands"; - let PredicateMethod = "isNeonMovImmShift" # PREFIX; - } -} - -// Definition of vector immediates shift operands - -// The selectable use-cases extract the shift operation -// information from the OpCmode fields encoded in the immediate.
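As a rough picture of what the XForm below extracts from OpCmode, here is a standalone C++ sketch of the AdvSIMD modified-immediate shift fields; it is illustrative only and does not claim to reproduce the exact contract of the in-tree A64Imms::decodeNeonModShiftImm helper:

// Shift information carried by the 4-bit OpCmode field: the LSL forms keep
// the amount in cmode<2:1> (32-bit lanes) or cmode<1> (16-bit lanes), and
// the two MSL encodings (cmode 0b110x) shift ones in from the right.
bool decodeModShiftSketch(unsigned Cmode, unsigned &ShiftImm, bool &OnesIn) {
  OnesIn = false;
  if ((Cmode & 0x8) == 0) {      // 0xxx: 32-bit lanes, LSL #0/8/16/24
    ShiftImm = 8 * ((Cmode >> 1) & 0x3);
    return true;
  }
  if ((Cmode & 0xC) == 0x8) {    // 10xx: 16-bit lanes, LSL #0/8
    ShiftImm = 8 * ((Cmode >> 1) & 0x1);
    return true;
  }
  if ((Cmode & 0xE) == 0xC) {    // 110x: 32-bit lanes, MSL #8/16
    ShiftImm = 8u << (Cmode & 0x1);
    OnesIn = true;
    return true;
  }
  return false;                  // cmode 111x carries no shift field
}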
-def neon_mod_shift_imm_XFORM : SDNodeXFormgetZExtValue(); - unsigned ShiftImm; - unsigned ShiftOnesIn; - unsigned HasShift = - A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn); - if (!HasShift) return SDValue(); - return CurDAG->getTargetConstant(ShiftImm, MVT::i32); -}]>; - -// Vector immediates shift operands which accept LSL and MSL -// shift operators with shift value in the range of 0, 8, 16, 24 (LSL), -// or 0, 8 (LSLH) or 8, 16 (MSL). -defm neon_mov_imm_LSL : neon_mov_imm_shift_asmoperands<"LSL">; -defm neon_mov_imm_MSL : neon_mov_imm_shift_asmoperands<"MSL">; -// LSLH restricts shift amount to 0, 8 out of 0, 8, 16, 24 -defm neon_mov_imm_LSLH : neon_mov_imm_shift_asmoperands<"LSLH">; - -multiclass neon_mov_imm_shift_operands -{ - def _operand : Operand, ImmLeaf - { - let PrintMethod = - "printNeonMovImmShiftOperand"; - let DecoderMethod = - "DecodeNeonMovImmShiftOperand"; - let ParserMatchClass = - !cast("neon_mov_imm_" # PREFIX # HALF # "_asmoperand"); - } -} - -defm neon_mov_imm_LSL : neon_mov_imm_shift_operands<"LSL", "", "false", [{ - unsigned ShiftImm; - unsigned ShiftOnesIn; - unsigned HasShift = - A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); - return (HasShift && !ShiftOnesIn); -}]>; - -defm neon_mov_imm_MSL : neon_mov_imm_shift_operands<"MSL", "", "false", [{ - unsigned ShiftImm; - unsigned ShiftOnesIn; - unsigned HasShift = - A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); - return (HasShift && ShiftOnesIn); -}]>; - -defm neon_mov_imm_LSLH : neon_mov_imm_shift_operands<"LSL", "H", "true", [{ - unsigned ShiftImm; - unsigned ShiftOnesIn; - unsigned HasShift = - A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn); - return (HasShift && !ShiftOnesIn); -}]>; - -def neon_uimm1_asmoperand : AsmOperandClass -{ - let Name = "UImm1"; - let PredicateMethod = "isUImm<1>"; - let RenderMethod = "addImmOperands"; -} - -def neon_uimm2_asmoperand : AsmOperandClass -{ - let Name = "UImm2"; - let PredicateMethod = "isUImm<2>"; - let RenderMethod = "addImmOperands"; -} - -def neon_uimm8_asmoperand : AsmOperandClass -{ - let Name = "UImm8"; - let PredicateMethod = "isUImm<8>"; - let RenderMethod = "addImmOperands"; -} - -def neon_uimm8 : Operand, ImmLeaf { - let ParserMatchClass = neon_uimm8_asmoperand; - let PrintMethod = "printUImmHexOperand"; -} - -def neon_uimm64_mask_asmoperand : AsmOperandClass -{ - let Name = "NeonUImm64Mask"; - let PredicateMethod = "isNeonUImm64Mask"; - let RenderMethod = "addNeonUImm64MaskOperands"; -} - -// MCOperand for 64-bit bytemask with each byte having only the -// value 0x00 and 0xff is encoded as an unsigned 8-bit value -def neon_uimm64_mask : Operand, ImmLeaf { - let ParserMatchClass = neon_uimm64_mask_asmoperand; - let PrintMethod = "printNeonUImm64MaskOperand"; -} - -multiclass NeonI_mov_imm_lsl_sizes -{ - // shift zeros, per word - def _2S : NeonI_1VModImm<0b0, op, - (outs VPR64:$Rd), - (ins neon_uimm8:$Imm, - neon_mov_imm_LSL_operand:$Simm), - !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"), - [(set (v2i32 VPR64:$Rd), - (v2i32 (opnode (timm:$Imm), - (neon_mov_imm_LSL_operand:$Simm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - bits<2> Simm; - let cmode = {0b0, Simm{1}, Simm{0}, 0b0}; - } - - def _4S : NeonI_1VModImm<0b1, op, - (outs VPR128:$Rd), - (ins neon_uimm8:$Imm, - neon_mov_imm_LSL_operand:$Simm), - !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"), - [(set (v4i32 VPR128:$Rd), - (v4i32 (opnode (timm:$Imm), - (neon_mov_imm_LSL_operand:$Simm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - bits<2> Simm; - let cmode = 
{0b0, Simm{1}, Simm{0}, 0b0}; - } - - // shift zeros, per halfword - def _4H : NeonI_1VModImm<0b0, op, - (outs VPR64:$Rd), - (ins neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm), - !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"), - [(set (v4i16 VPR64:$Rd), - (v4i16 (opnode (timm:$Imm), - (neon_mov_imm_LSLH_operand:$Simm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - bit Simm; - let cmode = {0b1, 0b0, Simm, 0b0}; - } - - def _8H : NeonI_1VModImm<0b1, op, - (outs VPR128:$Rd), - (ins neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm), - !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"), - [(set (v8i16 VPR128:$Rd), - (v8i16 (opnode (timm:$Imm), - (neon_mov_imm_LSLH_operand:$Simm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - bit Simm; - let cmode = {0b1, 0b0, Simm, 0b0}; - } -} - -multiclass NeonI_mov_imm_with_constraint_lsl_sizes -{ - let Constraints = "$src = $Rd" in { - // shift zeros, per word - def _2S : NeonI_1VModImm<0b0, op, - (outs VPR64:$Rd), - (ins VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSL_operand:$Simm), - !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"), - [(set (v2i32 VPR64:$Rd), - (v2i32 (opnode (v2i32 VPR64:$src), - (v2i32 (neonopnode timm:$Imm, - neon_mov_imm_LSL_operand:$Simm)))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]> { - bits<2> Simm; - let cmode = {0b0, Simm{1}, Simm{0}, 0b1}; - } - - def _4S : NeonI_1VModImm<0b1, op, - (outs VPR128:$Rd), - (ins VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSL_operand:$Simm), - !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"), - [(set (v4i32 VPR128:$Rd), - (v4i32 (opnode (v4i32 VPR128:$src), - (v4i32 (neonopnode timm:$Imm, - neon_mov_imm_LSL_operand:$Simm)))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]> { - bits<2> Simm; - let cmode = {0b0, Simm{1}, Simm{0}, 0b1}; - } - - // shift zeros, per halfword - def _4H : NeonI_1VModImm<0b0, op, - (outs VPR64:$Rd), - (ins VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm), - !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"), - [(set (v4i16 VPR64:$Rd), - (v4i16 (opnode (v4i16 VPR64:$src), - (v4i16 (neonopnode timm:$Imm, - neon_mov_imm_LSL_operand:$Simm)))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]> { - bit Simm; - let cmode = {0b1, 0b0, Simm, 0b1}; - } - - def _8H : NeonI_1VModImm<0b1, op, - (outs VPR128:$Rd), - (ins VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm), - !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"), - [(set (v8i16 VPR128:$Rd), - (v8i16 (opnode (v8i16 VPR128:$src), - (v8i16 (neonopnode timm:$Imm, - neon_mov_imm_LSL_operand:$Simm)))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]> { - bit Simm; - let cmode = {0b1, 0b0, Simm, 0b1}; - } - } -} - -multiclass NeonI_mov_imm_msl_sizes -{ - // shift ones, per word - def _2S : NeonI_1VModImm<0b0, op, - (outs VPR64:$Rd), - (ins neon_uimm8:$Imm, - neon_mov_imm_MSL_operand:$Simm), - !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"), - [(set (v2i32 VPR64:$Rd), - (v2i32 (opnode (timm:$Imm), - (neon_mov_imm_MSL_operand:$Simm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - bit Simm; - let cmode = {0b1, 0b1, 0b0, Simm}; - } - - def _4S : NeonI_1VModImm<0b1, op, - (outs VPR128:$Rd), - (ins neon_uimm8:$Imm, - neon_mov_imm_MSL_operand:$Simm), - !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"), - [(set (v4i32 VPR128:$Rd), - (v4i32 (opnode (timm:$Imm), - (neon_mov_imm_MSL_operand:$Simm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - bit Simm; - let cmode = {0b1, 0b1, 0b0, Simm}; - } -} - -// Vector Move Immediate Shifted -let isReMaterializable = 1 in { -defm MOVIvi_lsl : NeonI_mov_imm_lsl_sizes<"movi", 0b0, Neon_movi>; -} - -// Vector Move 
Inverted Immediate Shifted -let isReMaterializable = 1 in { -defm MVNIvi_lsl : NeonI_mov_imm_lsl_sizes<"mvni", 0b1, Neon_mvni>; -} - -// Vector Bitwise Bit Clear (AND NOT) - immediate -let isReMaterializable = 1 in { -defm BICvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"bic", 0b1, - and, Neon_mvni>; -} - -// Vector Bitwise OR - immediate - -let isReMaterializable = 1 in { -defm ORRvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"orr", 0b0, - or, Neon_movi>; -} - -// Additional patterns for Vector Bitwise Bit Clear (AND NOT) - immediate -// LowerBUILD_VECTOR favors lowering MOVI over MVNI. -// BIC immediate instruction selection requires additional patterns to -// transform Neon_movi operands into BIC immediate operands - -def neon_mov_imm_LSLH_transform_XFORM : SDNodeXFormgetZExtValue(); - unsigned ShiftImm; - unsigned ShiftOnesIn; - (void)A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn); - // LSLH restricts shift amount to 0, 8 which are encoded as 0 and 1 - // Transform encoded shift amount 0 to 1 and 1 to 0. - return CurDAG->getTargetConstant(!ShiftImm, MVT::i32); -}]>; - -def neon_mov_imm_LSLH_transform_operand - : ImmLeaf; - -// Transform (and A, (4h Neon_movi 0xff)) -> BIC 4h (A, 0xff, LSL 8) -// Transform (and A, (4h Neon_movi 0xff LSL #8)) -> BIC 4h (A, 0xff) -def : Pat<(v4i16 (and VPR64:$src, - (v4i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm)))), - (BICvi_lsl_4H VPR64:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; - -// Transform (and A, (8h Neon_movi 0xff)) -> BIC 8h (A, 0xff, LSL 8) -// Transform (and A, (8h Neon_movi 0xff LSL #8)) -> BIC 8h (A, 0xff) -def : Pat<(v8i16 (and VPR128:$src, - (v8i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm)))), - (BICvi_lsl_8H VPR128:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; - -def : Pat<(v8i8 (and VPR64:$src, - (bitconvert(v4i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm))))), - (BICvi_lsl_4H VPR64:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; -def : Pat<(v2i32 (and VPR64:$src, - (bitconvert(v4i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm))))), - (BICvi_lsl_4H VPR64:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; -def : Pat<(v1i64 (and VPR64:$src, - (bitconvert(v4i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm))))), - (BICvi_lsl_4H VPR64:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; - -def : Pat<(v16i8 (and VPR128:$src, - (bitconvert(v8i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm))))), - (BICvi_lsl_8H VPR128:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; -def : Pat<(v4i32 (and VPR128:$src, - (bitconvert(v8i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm))))), - (BICvi_lsl_8H VPR128:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; -def : Pat<(v2i64 (and VPR128:$src, - (bitconvert(v8i16 (Neon_movi 255, - neon_mov_imm_LSLH_transform_operand:$Simm))))), - (BICvi_lsl_8H VPR128:$src, 255, - neon_mov_imm_LSLH_transform_operand:$Simm)>; - -multiclass Neon_bitwiseVi_patterns { - def : Pat<(v8i8 (opnode VPR64:$src, - (bitconvert(v4i16 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST4H VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v2i32 (opnode VPR64:$src, - (bitconvert(v4i16 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST4H VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v1i64 (opnode VPR64:$src, - (bitconvert(v4i16
(neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST4H VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - - def : Pat<(v16i8 (opnode VPR128:$src, - (bitconvert(v8i16 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST8H VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v4i32 (opnode VPR128:$src, - (bitconvert(v8i16 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST8H VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v2i64 (opnode VPR128:$src, - (bitconvert(v8i16 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST8H VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - - def : Pat<(v8i8 (opnode VPR64:$src, - (bitconvert(v2i32 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST2S VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v4i16 (opnode VPR64:$src, - (bitconvert(v2i32 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST2S VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v1i64 (opnode VPR64:$src, - (bitconvert(v2i32 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST2S VPR64:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - - def : Pat<(v16i8 (opnode VPR128:$src, - (bitconvert(v4i32 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST4S VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v8i16 (opnode VPR128:$src, - (bitconvert(v4i32 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST4S VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; - def : Pat<(v2i64 (opnode VPR128:$src, - (bitconvert(v4i32 (neonopnode timm:$Imm, - neon_mov_imm_LSLH_operand:$Simm))))), - (INST4S VPR128:$src, neon_uimm8:$Imm, - neon_mov_imm_LSLH_operand:$Simm)>; -} - -// Additional patterns for Vector Bitwise Bit Clear (AND NOT) - immediate -defm : Neon_bitwiseVi_patterns; - -// Additional patterns for Vector Bitwise OR - immediate -defm : Neon_bitwiseVi_patterns; - - -// Vector Move Immediate Masked -let isReMaterializable = 1 in { -defm MOVIvi_msl : NeonI_mov_imm_msl_sizes<"movi", 0b0, Neon_movi>; -} - -// Vector Move Inverted Immediate Masked -let isReMaterializable = 1 in { -defm MVNIvi_msl : NeonI_mov_imm_msl_sizes<"mvni", 0b1, Neon_mvni>; -} - -class NeonI_mov_imm_lsl_aliases - : NeonInstAlias; - -// Aliases for Vector Move Immediate Shifted -def : NeonI_mov_imm_lsl_aliases<"movi", ".2s", MOVIvi_lsl_2S, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"movi", ".4s", MOVIvi_lsl_4S, VPR128>; -def : NeonI_mov_imm_lsl_aliases<"movi", ".4h", MOVIvi_lsl_4H, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"movi", ".8h", MOVIvi_lsl_8H, VPR128>; - -// Aliases for Vector Move Inverted Immediate Shifted -def : NeonI_mov_imm_lsl_aliases<"mvni", ".2s", MVNIvi_lsl_2S, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"mvni", ".4s", MVNIvi_lsl_4S, VPR128>; -def : NeonI_mov_imm_lsl_aliases<"mvni", ".4h", MVNIvi_lsl_4H, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"mvni", ".8h", MVNIvi_lsl_8H, VPR128>; - -// Aliases for Vector Bitwise Bit Clear (AND NOT) - immediate -def : NeonI_mov_imm_lsl_aliases<"bic", ".2s", BICvi_lsl_2S, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"bic", ".4s", BICvi_lsl_4S, VPR128>; -def : NeonI_mov_imm_lsl_aliases<"bic", ".4h", BICvi_lsl_4H, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"bic", ".8h", BICvi_lsl_8H, VPR128>; - -// Aliases for Vector Bitwise OR - 
immediate -def : NeonI_mov_imm_lsl_aliases<"orr", ".2s", ORRvi_lsl_2S, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"orr", ".4s", ORRvi_lsl_4S, VPR128>; -def : NeonI_mov_imm_lsl_aliases<"orr", ".4h", ORRvi_lsl_4H, VPR64>; -def : NeonI_mov_imm_lsl_aliases<"orr", ".8h", ORRvi_lsl_8H, VPR128>; - -// Vector Move Immediate - per byte -let isReMaterializable = 1 in { -def MOVIvi_8B : NeonI_1VModImm<0b0, 0b0, - (outs VPR64:$Rd), (ins neon_uimm8:$Imm), - "movi\t$Rd.8b, $Imm", - [(set (v8i8 VPR64:$Rd), - (v8i8 (Neon_movi (timm:$Imm), (i32 imm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - let cmode = 0b1110; -} - -def MOVIvi_16B : NeonI_1VModImm<0b1, 0b0, - (outs VPR128:$Rd), (ins neon_uimm8:$Imm), - "movi\t$Rd.16b, $Imm", - [(set (v16i8 VPR128:$Rd), - (v16i8 (Neon_movi (timm:$Imm), (i32 imm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - let cmode = 0b1110; -} -} - -// Vector Move Immediate - bytemask, per double word -let isReMaterializable = 1 in { -def MOVIvi_2D : NeonI_1VModImm<0b1, 0b1, - (outs VPR128:$Rd), (ins neon_uimm64_mask:$Imm), - "movi\t $Rd.2d, $Imm", - [(set (v2i64 VPR128:$Rd), - (v2i64 (Neon_movi (timm:$Imm), (i32 imm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - let cmode = 0b1110; -} -} - -// Vector Move Immediate - bytemask, one doubleword - -let isReMaterializable = 1 in { -def MOVIdi : NeonI_1VModImm<0b0, 0b1, - (outs FPR64:$Rd), (ins neon_uimm64_mask:$Imm), - "movi\t $Rd, $Imm", - [(set (v1i64 FPR64:$Rd), - (v1i64 (Neon_movi (timm:$Imm), (i32 imm))))], - NoItinerary>, - Sched<[WriteFPALU]> { - let cmode = 0b1110; -} -} - -// Vector Floating Point Move Immediate - -class NeonI_FMOV_impl - : NeonI_1VModImm, - Sched<[WriteFPALU]> { - let cmode = 0b1111; - } - -let isReMaterializable = 1 in { -def FMOVvi_2S : NeonI_FMOV_impl<".2s", VPR64, v2f32, fmov32_operand, 0b0, 0b0>; -def FMOVvi_4S : NeonI_FMOV_impl<".4s", VPR128, v4f32, fmov32_operand, 0b1, 0b0>; -def FMOVvi_2D : NeonI_FMOV_impl<".2d", VPR128, v2f64, fmov64_operand, 0b1, 0b1>; -} - -// Vector Shift (Immediate) - -// Shift Right/Left Immediate - The immh:immb field of these shifts is encoded -// as follows: -// -// Offset Encoding -// 8 immh:immb<6:3> = '0001xxx', the immediate is encoded in immh:immb<2:0> -// 16 immh:immb<6:4> = '001xxxx', the immediate is encoded in immh:immb<3:0> -// 32 immh:immb<6:5> = '01xxxxx', the immediate is encoded in immh:immb<4:0> -// 64 immh:immb<6> = '1xxxxxx', the immediate is encoded in immh:immb<5:0> -// -// The shift right immediate amount, in the range 1 to element bits, is computed -// as (2 * Offset) - UInt(immh:immb). The shift left immediate amount, in the range 0 -// to element bits - 1, is computed as UInt(immh:immb) - Offset.
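Before the shr_imm/shl_imm operand classes below, a worked version of the rule above. Since UInt(immh:immb) always lies in [Offset, 2*Offset - 1], a right-shift amount decodes to 2*Offset - UInt(immh:immb) and a left-shift amount to UInt(immh:immb) - Offset. The helpers are standalone C++ illustrations, not the EncoderMethod/DecoderMethod callbacks named below:

#include <cassert>

// Offset is determined by the position of the leading 1 bit in immh
// (bits 6:3 of the 7-bit immh:immb value).
unsigned offsetOfImmhImmb(unsigned ImmhImmb) {
  assert((ImmhImmb >> 3) != 0 && "immh == 0000 is reserved for these shifts");
  if (ImmhImmb & 0x40) return 64; // 1xxxxxx
  if (ImmhImmb & 0x20) return 32; // 01xxxxx
  if (ImmhImmb & 0x10) return 16; // 001xxxx
  return 8;                       // 0001xxx
}

unsigned decodeShiftRight(unsigned ImmhImmb) { // range 1 .. element bits
  return 2 * offsetOfImmhImmb(ImmhImmb) - ImmhImmb;
}

unsigned decodeShiftLeft(unsigned ImmhImmb) {  // range 0 .. element bits - 1
  return ImmhImmb - offsetOfImmhImmb(ImmhImmb);
}

// Example: immh:immb = 0b0001101 (Offset 8) decodes to a right shift of
// 16 - 13 = 3 or a left shift of 13 - 8 = 5, depending on the opcode.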
- -class shr_imm_asmoperands : AsmOperandClass { - let Name = "ShrImm" # OFFSET; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "ShrImm" # OFFSET; -} - -class shr_imm : Operand { - let EncoderMethod = "getShiftRightImm" # OFFSET; - let DecoderMethod = "DecodeShiftRightImm" # OFFSET; - let ParserMatchClass = - !cast("shr_imm" # OFFSET # "_asmoperand"); -} - -def shr_imm8_asmoperand : shr_imm_asmoperands<"8">; -def shr_imm16_asmoperand : shr_imm_asmoperands<"16">; -def shr_imm32_asmoperand : shr_imm_asmoperands<"32">; -def shr_imm64_asmoperand : shr_imm_asmoperands<"64">; - -def shr_imm8 : shr_imm<"8">, ImmLeaf 0 && Imm <= 8;}]>; -def shr_imm16 : shr_imm<"16">, ImmLeaf 0 && Imm <= 16;}]>; -def shr_imm32 : shr_imm<"32">, ImmLeaf 0 && Imm <= 32;}]>; -def shr_imm64 : shr_imm<"64">, ImmLeaf 0 && Imm <= 64;}]>; - -class shl_imm_asmoperands : AsmOperandClass { - let Name = "ShlImm" # OFFSET; - let RenderMethod = "addImmOperands"; - let DiagnosticType = "ShlImm" # OFFSET; -} - -class shl_imm : Operand { - let EncoderMethod = "getShiftLeftImm" # OFFSET; - let DecoderMethod = "DecodeShiftLeftImm" # OFFSET; - let ParserMatchClass = - !cast("shl_imm" # OFFSET # "_asmoperand"); -} - -def shl_imm8_asmoperand : shl_imm_asmoperands<"8">; -def shl_imm16_asmoperand : shl_imm_asmoperands<"16">; -def shl_imm32_asmoperand : shl_imm_asmoperands<"32">; -def shl_imm64_asmoperand : shl_imm_asmoperands<"64">; - -def shl_imm8 : shl_imm<"8">, ImmLeaf= 0 && Imm < 8;}]>; -def shl_imm16 : shl_imm<"16">, ImmLeaf= 0 && Imm < 16;}]>; -def shl_imm32 : shl_imm<"32">, ImmLeaf= 0 && Imm < 32;}]>; -def shl_imm64 : shl_imm<"64">, ImmLeaf= 0 && Imm < 64;}]>; - -class N2VShift opcode, string asmop, string T, - RegisterOperand VPRC, ValueType Ty, Operand ImmTy, SDNode OpNode> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU]>; - -multiclass NeonI_N2VShL opcode, string asmop> { - // 64-bit vector types. - def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, shl> { - let Inst{22-19} = 0b0001; // immh:immb = 0001xxx - } - - def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, shl> { - let Inst{22-20} = 0b001; // immh:immb = 001xxxx - } - - def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, shl> { - let Inst{22-21} = 0b01; // immh:immb = 01xxxxx - } - - // 128-bit vector types. 
- def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, shl> { - let Inst{22-19} = 0b0001; // immh:immb = 0001xxx - } - - def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, shl> { - let Inst{22-20} = 0b001; // immh:immb = 001xxxx - } - - def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, shl> { - let Inst{22-21} = 0b01; // immh:immb = 01xxxxx - } - - def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, shl> { - let Inst{22} = 0b1; // immh:immb = 1xxxxxx - } -} - -multiclass NeonI_N2VShR opcode, string asmop, SDNode OpNode> { - def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, - OpNode> { - let Inst{22} = 0b1; - } -} - -// Shift left - -defm SHLvvi : NeonI_N2VShL<0b0, 0b01010, "shl">; - -// Additional patterns to match vector shift left by immediate. -// (v1i8/v1i16/v1i32 types) -def : Pat<(v1i8 (shl (v1i8 FPR8:$Rn), - (v1i8 (Neon_vdup (i32 (shl_imm8:$Imm)))))), - (EXTRACT_SUBREG - (SHLvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - shl_imm8:$Imm), - sub_8)>; -def : Pat<(v1i16 (shl (v1i16 FPR16:$Rn), - (v1i16 (Neon_vdup (i32 (shl_imm16:$Imm)))))), - (EXTRACT_SUBREG - (SHLvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - shl_imm16:$Imm), - sub_16)>; -def : Pat<(v1i32 (shl (v1i32 FPR32:$Rn), - (v1i32 (Neon_vdup (i32 (shl_imm32:$Imm)))))), - (EXTRACT_SUBREG - (SHLvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - shl_imm32:$Imm), - sub_32)>; - -// Shift right -defm SSHRvvi : NeonI_N2VShR<0b0, 0b00000, "sshr", sra>; -defm USHRvvi : NeonI_N2VShR<0b1, 0b00000, "ushr", srl>; - -// Additional patterns to match vector shift right by immediate. 
-// (v1i8/v1i16/v1i32 types) -def : Pat<(v1i8 (sra (v1i8 FPR8:$Rn), - (v1i8 (Neon_vdup (i32 (shr_imm8:$Imm)))))), - (EXTRACT_SUBREG - (SSHRvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - shr_imm8:$Imm), - sub_8)>; -def : Pat<(v1i16 (sra (v1i16 FPR16:$Rn), - (v1i16 (Neon_vdup (i32 (shr_imm16:$Imm)))))), - (EXTRACT_SUBREG - (SSHRvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - shr_imm16:$Imm), - sub_16)>; -def : Pat<(v1i32 (sra (v1i32 FPR32:$Rn), - (v1i32 (Neon_vdup (i32 (shr_imm32:$Imm)))))), - (EXTRACT_SUBREG - (SSHRvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - shr_imm32:$Imm), - sub_32)>; -def : Pat<(v1i8 (srl (v1i8 FPR8:$Rn), - (v1i8 (Neon_vdup (i32 (shr_imm8:$Imm)))))), - (EXTRACT_SUBREG - (USHRvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - shr_imm8:$Imm), - sub_8)>; -def : Pat<(v1i16 (srl (v1i16 FPR16:$Rn), - (v1i16 (Neon_vdup (i32 (shr_imm16:$Imm)))))), - (EXTRACT_SUBREG - (USHRvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - shr_imm16:$Imm), - sub_16)>; -def : Pat<(v1i32 (srl (v1i32 FPR32:$Rn), - (v1i32 (Neon_vdup (i32 (shr_imm32:$Imm)))))), - (EXTRACT_SUBREG - (USHRvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - shr_imm32:$Imm), - sub_32)>; - -def Neon_High16B : PatFrag<(ops node:$in), - (extract_subvector (v16i8 node:$in), (iPTR 8))>; -def Neon_High8H : PatFrag<(ops node:$in), - (extract_subvector (v8i16 node:$in), (iPTR 4))>; -def Neon_High4S : PatFrag<(ops node:$in), - (extract_subvector (v4i32 node:$in), (iPTR 2))>; -def Neon_High2D : PatFrag<(ops node:$in), - (extract_subvector (v2i64 node:$in), (iPTR 1))>; -def Neon_High4float : PatFrag<(ops node:$in), - (extract_subvector (v4f32 node:$in), (iPTR 2))>; -def Neon_High2double : PatFrag<(ops node:$in), - (extract_subvector (v2f64 node:$in), (iPTR 1))>; - -def Neon_Low16B : PatFrag<(ops node:$in), - (v8i8 (extract_subvector (v16i8 node:$in), - (iPTR 0)))>; -def Neon_Low8H : PatFrag<(ops node:$in), - (v4i16 (extract_subvector (v8i16 node:$in), - (iPTR 0)))>; -def Neon_Low4S : PatFrag<(ops node:$in), - (v2i32 (extract_subvector (v4i32 node:$in), - (iPTR 0)))>; -def Neon_Low2D : PatFrag<(ops node:$in), - (v1i64 (extract_subvector (v2i64 node:$in), - (iPTR 0)))>; -def Neon_Low4float : PatFrag<(ops node:$in), - (v2f32 (extract_subvector (v4f32 node:$in), - (iPTR 0)))>; -def Neon_Low2double : PatFrag<(ops node:$in), - (v1f64 (extract_subvector (v2f64 node:$in), - (iPTR 0)))>; - -class N2VShiftLong opcode, string asmop, string DestT, - string SrcT, ValueType DestTy, ValueType SrcTy, - Operand ImmTy, SDPatternOperator ExtOp> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU]>; - -class N2VShiftLongHigh opcode, string asmop, string DestT, - string SrcT, ValueType DestTy, ValueType SrcTy, - int StartIndex, Operand ImmTy, - SDPatternOperator ExtOp, PatFrag getTop> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU]>; - -multiclass NeonI_N2VShLL opcode, string asmop, - SDNode ExtOp> { - // 64-bit vector types. 
- def _8B : N2VShiftLong<0b0, u, opcode, asmop, "8h", "8b", v8i16, v8i8, - shl_imm8, ExtOp> { - let Inst{22-19} = 0b0001; // immh:immb = 0001xxx - } - - def _4H : N2VShiftLong<0b0, u, opcode, asmop, "4s", "4h", v4i32, v4i16, - shl_imm16, ExtOp> { - let Inst{22-20} = 0b001; // immh:immb = 001xxxx - } - - def _2S : N2VShiftLong<0b0, u, opcode, asmop, "2d", "2s", v2i64, v2i32, - shl_imm32, ExtOp> { - let Inst{22-21} = 0b01; // immh:immb = 01xxxxx - } - - // 128-bit vector types - def _16B : N2VShiftLongHigh<0b1, u, opcode, asmop, "8h", "16b", v8i16, v8i8, - 8, shl_imm8, ExtOp, Neon_High16B> { - let Inst{22-19} = 0b0001; // immh:immb = 0001xxx - } - - def _8H : N2VShiftLongHigh<0b1, u, opcode, asmop, "4s", "8h", v4i32, v4i16, - 4, shl_imm16, ExtOp, Neon_High8H> { - let Inst{22-20} = 0b001; // immh:immb = 001xxxx - } - - def _4S : N2VShiftLongHigh<0b1, u, opcode, asmop, "2d", "4s", v2i64, v2i32, - 2, shl_imm32, ExtOp, Neon_High4S> { - let Inst{22-21} = 0b01; // immh:immb = 01xxxxx - } - - // Use other patterns to match when the immediate is 0. - def : Pat<(v8i16 (ExtOp (v8i8 VPR64:$Rn))), - (!cast(prefix # "_8B") VPR64:$Rn, 0)>; - - def : Pat<(v4i32 (ExtOp (v4i16 VPR64:$Rn))), - (!cast(prefix # "_4H") VPR64:$Rn, 0)>; - - def : Pat<(v2i64 (ExtOp (v2i32 VPR64:$Rn))), - (!cast(prefix # "_2S") VPR64:$Rn, 0)>; - - def : Pat<(v8i16 (ExtOp (v8i8 (Neon_High16B VPR128:$Rn)))), - (!cast(prefix # "_16B") VPR128:$Rn, 0)>; - - def : Pat<(v4i32 (ExtOp (v4i16 (Neon_High8H VPR128:$Rn)))), - (!cast(prefix # "_8H") VPR128:$Rn, 0)>; - - def : Pat<(v2i64 (ExtOp (v2i32 (Neon_High4S VPR128:$Rn)))), - (!cast(prefix # "_4S") VPR128:$Rn, 0)>; -} - -// Shift left long -defm SSHLLvvi : NeonI_N2VShLL<"SSHLLvvi", 0b0, 0b10100, "sshll", sext>; -defm USHLLvvi : NeonI_N2VShLL<"USHLLvvi", 0b1, 0b10100, "ushll", zext>; - -class NeonI_ext_len_alias - : NeonInstAlias; - -// Signed integer lengthen (vector) is alias for SSHLL Vd, Vn, #0 -// Signed integer lengthen (vector, second part) is alias for SSHLL2 Vd, Vn, #0 -// FIXME: This is actually the preferred syntax but TableGen can't deal with -// custom printing of aliases. -def SXTLvv_8B : NeonI_ext_len_alias<"sxtl", ".8h", ".8b", SSHLLvvi_8B, VPR128, VPR64>; -def SXTLvv_4H : NeonI_ext_len_alias<"sxtl", ".4s", ".4h", SSHLLvvi_4H, VPR128, VPR64>; -def SXTLvv_2S : NeonI_ext_len_alias<"sxtl", ".2d", ".2s", SSHLLvvi_2S, VPR128, VPR64>; -def SXTL2vv_16B : NeonI_ext_len_alias<"sxtl2", ".8h", ".16b", SSHLLvvi_16B, VPR128, VPR128>; -def SXTL2vv_8H : NeonI_ext_len_alias<"sxtl2", ".4s", ".8h", SSHLLvvi_8H, VPR128, VPR128>; -def SXTL2vv_4S : NeonI_ext_len_alias<"sxtl2", ".2d", ".4s", SSHLLvvi_4S, VPR128, VPR128>; - -// Unsigned integer lengthen (vector) is alias for USHLL Vd, Vn, #0 -// Unsigned integer lengthen (vector, second part) is alias for USHLL2 Vd, Vn, #0 -// FIXME: This is actually the preferred syntax but TableGen can't deal with -// custom printing of aliases. 
-def UXTLvv_8B : NeonI_ext_len_alias<"uxtl", ".8h", ".8b", USHLLvvi_8B, VPR128, VPR64>; -def UXTLvv_4H : NeonI_ext_len_alias<"uxtl", ".4s", ".4h", USHLLvvi_4H, VPR128, VPR64>; -def UXTLvv_2S : NeonI_ext_len_alias<"uxtl", ".2d", ".2s", USHLLvvi_2S, VPR128, VPR64>; -def UXTL2vv_16B : NeonI_ext_len_alias<"uxtl2", ".8h", ".16b", USHLLvvi_16B, VPR128, VPR128>; -def UXTL2vv_8H : NeonI_ext_len_alias<"uxtl2", ".4s", ".8h", USHLLvvi_8H, VPR128, VPR128>; -def UXTL2vv_4S : NeonI_ext_len_alias<"uxtl2", ".2d", ".4s", USHLLvvi_4S, VPR128, VPR128>; - -def : Pat<(v8i16 (anyext (v8i8 VPR64:$Rn))), (USHLLvvi_8B VPR64:$Rn, 0)>; -def : Pat<(v4i32 (anyext (v4i16 VPR64:$Rn))), (USHLLvvi_4H VPR64:$Rn, 0)>; -def : Pat<(v2i64 (anyext (v2i32 VPR64:$Rn))), (USHLLvvi_2S VPR64:$Rn, 0)>; - -// Rounding/Saturating shift -class N2VShift_RQ opcode, string asmop, string T, - RegisterOperand VPRC, ValueType Ty, Operand ImmTy, - SDPatternOperator OpNode> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU]>; - -// shift right (vector by immediate) -multiclass NeonI_N2VShR_RQ opcode, string asmop, - SDPatternOperator OpNode> { - def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, - OpNode> { - let Inst{22} = 0b1; - } -} - -multiclass NeonI_N2VShL_Q opcode, string asmop, - SDPatternOperator OpNode> { - // 64-bit vector types. - def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - // 128-bit vector types. 
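// (A note on the recurring Inst{22-19}/Inst{22-20}/Inst{22-21}/Inst{22}
// overrides in these shift-by-immediate multiclasses: bits 22-16 of the
// encoding form the immh:immb field, and the position of the leading 1 in
// immh selects the element size -- 0001xxx = byte, 001xxxx = halfword,
// 01xxxxx = word, 1xxxxxx = doubleword -- while the bits below the leading 1
// carry the shift amount. Each def therefore pins only the high bits and
// leaves the remainder to the immediate operand.)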
- def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, - OpNode> { - let Inst{22} = 0b1; - } -} - -// Rounding shift right -defm SRSHRvvi : NeonI_N2VShR_RQ<0b0, 0b00100, "srshr", - int_aarch64_neon_vsrshr>; -defm URSHRvvi : NeonI_N2VShR_RQ<0b1, 0b00100, "urshr", - int_aarch64_neon_vurshr>; - -// Saturating shift left unsigned -defm SQSHLUvvi : NeonI_N2VShL_Q<0b1, 0b01100, "sqshlu", int_aarch64_neon_vsqshlu>; - -// Saturating shift left -defm SQSHLvvi : NeonI_N2VShL_Q<0b0, 0b01110, "sqshl", Neon_sqrshlImm>; -defm UQSHLvvi : NeonI_N2VShL_Q<0b1, 0b01110, "uqshl", Neon_uqrshlImm>; - -class N2VShiftAdd opcode, string asmop, string T, - RegisterOperand VPRC, ValueType Ty, Operand ImmTy, - SDNode OpNode> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; -} - -// Shift Right accumulate -multiclass NeonI_N2VShRAdd opcode, string asmop, SDNode OpNode> { - def _8B : N2VShiftAdd<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _4H : N2VShiftAdd<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _2S : N2VShiftAdd<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _16B : N2VShiftAdd<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _8H : N2VShiftAdd<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _4S : N2VShiftAdd<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _2D : N2VShiftAdd<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, - OpNode> { - let Inst{22} = 0b1; - } -} - -// Shift right and accumulate -defm SSRAvvi : NeonI_N2VShRAdd<0, 0b00010, "ssra", sra>; -defm USRAvvi : NeonI_N2VShRAdd<1, 0b00010, "usra", srl>; - -// Rounding shift accumulate -class N2VShiftAdd_R opcode, string asmop, string T, - RegisterOperand VPRC, ValueType Ty, Operand ImmTy, - SDPatternOperator OpNode> - : NeonI_2VShiftImm, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; -} - -multiclass NeonI_N2VShRAdd_R opcode, string asmop, - SDPatternOperator OpNode> { - def _8B : N2VShiftAdd_R<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _4H : N2VShiftAdd_R<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _2S : N2VShiftAdd_R<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _16B : N2VShiftAdd_R<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8, - OpNode> { - let Inst{22-19} = 0b0001; - } - - def _8H : N2VShiftAdd_R<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16, - OpNode> { - let Inst{22-20} = 0b001; - } - - def _4S : N2VShiftAdd_R<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32, - OpNode> { - let Inst{22-21} = 0b01; - } - - def _2D : N2VShiftAdd_R<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64, - OpNode> { - let Inst{22} = 0b1; - } -} - -// 
Rounding shift right and accumulate
-defm SRSRAvvi : NeonI_N2VShRAdd_R<0, 0b00110, "srsra", int_aarch64_neon_vsrshr>;
-defm URSRAvvi : NeonI_N2VShRAdd_R<1, 0b00110, "ursra", int_aarch64_neon_vurshr>;
-
-// Shift insert by immediate
-class N2VShiftIns opcode, string asmop, string T,
-                  RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
-                  SDPatternOperator OpNode>
-  : NeonI_2VShiftImm,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-}
-
-// shift left insert (vector by immediate)
-multiclass NeonI_N2VShLIns opcode, string asmop> {
-  def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8,
-                        int_aarch64_neon_vsli> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16,
-                        int_aarch64_neon_vsli> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32,
-                        int_aarch64_neon_vsli> {
-    let Inst{22-21} = 0b01;
-  }
-
-  // 128-bit vector types
-  def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8,
-                         int_aarch64_neon_vsli> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16,
-                        int_aarch64_neon_vsli> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32,
-                        int_aarch64_neon_vsli> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64,
-                        int_aarch64_neon_vsli> {
-    let Inst{22} = 0b1;
-  }
-}
-
-// shift right insert (vector by immediate)
-multiclass NeonI_N2VShRIns opcode, string asmop> {
-  // 64-bit vector types.
-  def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
-                        int_aarch64_neon_vsri> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
-                        int_aarch64_neon_vsri> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
-                        int_aarch64_neon_vsri> {
-    let Inst{22-21} = 0b01;
-  }
-
-  // 128-bit vector types
-  def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
-                         int_aarch64_neon_vsri> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
-                        int_aarch64_neon_vsri> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
-                        int_aarch64_neon_vsri> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
-                        int_aarch64_neon_vsri> {
-    let Inst{22} = 0b1;
-  }
-}
-
-// Shift left and insert
-defm SLIvvi : NeonI_N2VShLIns<0b1, 0b01010, "sli">;
-
-// Shift right and insert
-defm SRIvvi : NeonI_N2VShRIns<0b1, 0b01000, "sri">;
-
-class N2VShR_Narrow opcode, string asmop, string DestT,
-                    string SrcT, Operand ImmTy>
-  : NeonI_2VShiftImm,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-class N2VShR_Narrow_Hi opcode, string asmop, string DestT,
-                       string SrcT, Operand ImmTy>
-  : NeonI_2VShiftImm,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-}
-
-// shift right narrow (vector by immediate)
-multiclass NeonI_N2VShR_Narrow opcode, string asmop> {
-  def _8B : N2VShR_Narrow<0b0, u, opcode, asmop, "8b", "8h", shr_imm8> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _4H : N2VShR_Narrow<0b0, u, opcode, asmop, "4h", "4s", shr_imm16> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _2S : N2VShR_Narrow<0b0, u, opcode, asmop, "2s", "2d",
shr_imm32> { - let Inst{22-21} = 0b01; - } - - // Shift Narrow High - def _16B : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "16b", "8h", - shr_imm8> { - let Inst{22-19} = 0b0001; - } - - def _8H : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "8h", "4s", - shr_imm16> { - let Inst{22-20} = 0b001; - } - - def _4S : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "4s", "2d", - shr_imm32> { - let Inst{22-21} = 0b01; - } -} - -// Shift right narrow -defm SHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10000, "shrn">; - -// Shift right narrow (prefix Q is saturating, prefix R is rounding) -defm QSHRUNvvi :NeonI_N2VShR_Narrow<0b1, 0b10000, "sqshrun">; -defm RSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10001, "rshrn">; -defm QRSHRUNvvi : NeonI_N2VShR_Narrow<0b1, 0b10001, "sqrshrun">; -defm SQSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10010, "sqshrn">; -defm UQSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10010, "uqshrn">; -defm SQRSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10011, "sqrshrn">; -defm UQRSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10011, "uqrshrn">; - -def Neon_combine_2D : PatFrag<(ops node:$Rm, node:$Rn), - (v2i64 (concat_vectors (v1i64 node:$Rm), - (v1i64 node:$Rn)))>; -def Neon_combine_8H : PatFrag<(ops node:$Rm, node:$Rn), - (v8i16 (concat_vectors (v4i16 node:$Rm), - (v4i16 node:$Rn)))>; -def Neon_combine_4S : PatFrag<(ops node:$Rm, node:$Rn), - (v4i32 (concat_vectors (v2i32 node:$Rm), - (v2i32 node:$Rn)))>; -def Neon_combine_4f : PatFrag<(ops node:$Rm, node:$Rn), - (v4f32 (concat_vectors (v2f32 node:$Rm), - (v2f32 node:$Rn)))>; -def Neon_combine_2d : PatFrag<(ops node:$Rm, node:$Rn), - (v2f64 (concat_vectors (v1f64 node:$Rm), - (v1f64 node:$Rn)))>; - -def Neon_lshrImm8H : PatFrag<(ops node:$lhs, node:$rhs), - (v8i16 (srl (v8i16 node:$lhs), - (v8i16 (Neon_vdup (i32 node:$rhs)))))>; -def Neon_lshrImm4S : PatFrag<(ops node:$lhs, node:$rhs), - (v4i32 (srl (v4i32 node:$lhs), - (v4i32 (Neon_vdup (i32 node:$rhs)))))>; -def Neon_lshrImm2D : PatFrag<(ops node:$lhs, node:$rhs), - (v2i64 (srl (v2i64 node:$lhs), - (v2i64 (Neon_vdup (i32 node:$rhs)))))>; -def Neon_ashrImm8H : PatFrag<(ops node:$lhs, node:$rhs), - (v8i16 (sra (v8i16 node:$lhs), - (v8i16 (Neon_vdup (i32 node:$rhs)))))>; -def Neon_ashrImm4S : PatFrag<(ops node:$lhs, node:$rhs), - (v4i32 (sra (v4i32 node:$lhs), - (v4i32 (Neon_vdup (i32 node:$rhs)))))>; -def Neon_ashrImm2D : PatFrag<(ops node:$lhs, node:$rhs), - (v2i64 (sra (v2i64 node:$lhs), - (v2i64 (Neon_vdup (i32 node:$rhs)))))>; - -// Normal shift right narrow is matched by IR (srl/sra, trunc, concat_vectors) -multiclass Neon_shiftNarrow_patterns { - def : Pat<(v8i8 (trunc (!cast("Neon_" # shr # "Imm8H") VPR128:$Rn, - (i32 shr_imm8:$Imm)))), - (SHRNvvi_8B VPR128:$Rn, imm:$Imm)>; - def : Pat<(v4i16 (trunc (!cast("Neon_" # shr # "Imm4S") VPR128:$Rn, - (i32 shr_imm16:$Imm)))), - (SHRNvvi_4H VPR128:$Rn, imm:$Imm)>; - def : Pat<(v2i32 (trunc (!cast("Neon_" # shr # "Imm2D") VPR128:$Rn, - (i32 shr_imm32:$Imm)))), - (SHRNvvi_2S VPR128:$Rn, imm:$Imm)>; - - def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert - (v8i8 (trunc (!cast("Neon_" # shr # "Imm8H") - VPR128:$Rn, (i32 shr_imm8:$Imm))))))), - (SHRNvvi_16B (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), - VPR128:$Rn, imm:$Imm)>; - def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert - (v4i16 (trunc (!cast("Neon_" # shr # "Imm4S") - VPR128:$Rn, (i32 shr_imm16:$Imm))))))), - (SHRNvvi_8H (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), - VPR128:$Rn, imm:$Imm)>; - def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert - (v2i32 (trunc 
(!cast("Neon_" # shr # "Imm2D")
-                           VPR128:$Rn, (i32 shr_imm32:$Imm))))))),
-            (SHRNvvi_4S (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
-                        VPR128:$Rn, imm:$Imm)>;
-}
-
-multiclass Neon_shiftNarrow_QR_patterns {
-  def : Pat<(v8i8 (op (v8i16 VPR128:$Rn), shr_imm8:$Imm)),
-            (!cast(prefix # "_8B") VPR128:$Rn, imm:$Imm)>;
-  def : Pat<(v4i16 (op (v4i32 VPR128:$Rn), shr_imm16:$Imm)),
-            (!cast(prefix # "_4H") VPR128:$Rn, imm:$Imm)>;
-  def : Pat<(v2i32 (op (v2i64 VPR128:$Rn), shr_imm32:$Imm)),
-            (!cast(prefix # "_2S") VPR128:$Rn, imm:$Imm)>;
-
-  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
-                (v1i64 (bitconvert (v8i8
-                    (op (v8i16 VPR128:$Rn), shr_imm8:$Imm))))),
-            (!cast(prefix # "_16B")
-                (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
-                VPR128:$Rn, imm:$Imm)>;
-  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
-                (v1i64 (bitconvert (v4i16
-                    (op (v4i32 VPR128:$Rn), shr_imm16:$Imm))))),
-            (!cast(prefix # "_8H")
-                (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
-                VPR128:$Rn, imm:$Imm)>;
-  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
-                (v1i64 (bitconvert (v2i32
-                    (op (v2i64 VPR128:$Rn), shr_imm32:$Imm))))),
-            (!cast(prefix # "_4S")
-                (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
-                VPR128:$Rn, imm:$Imm)>;
-}
-
-defm : Neon_shiftNarrow_patterns<"lshr">;
-defm : Neon_shiftNarrow_patterns<"ashr">;
-
-defm : Neon_shiftNarrow_QR_patterns;
-defm : Neon_shiftNarrow_QR_patterns;
-defm : Neon_shiftNarrow_QR_patterns;
-defm : Neon_shiftNarrow_QR_patterns;
-defm : Neon_shiftNarrow_QR_patterns;
-defm : Neon_shiftNarrow_QR_patterns;
-defm : Neon_shiftNarrow_QR_patterns;
-
-// Conversion between fixed-point and floating-point
-class N2VCvt_Fx opcode, string asmop, string T,
-                RegisterOperand VPRC, ValueType DestTy, ValueType SrcTy,
-                Operand ImmTy, SDPatternOperator IntOp>
-  : NeonI_2VShiftImm,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_N2VCvt_Fx2fp opcode, string asmop,
-                              SDPatternOperator IntOp> {
-  def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2f32, v2i32,
-                      shr_imm32, IntOp> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4f32, v4i32,
-                      shr_imm32, IntOp> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2f64, v2i64,
-                      shr_imm64, IntOp> {
-    let Inst{22} = 0b1;
-  }
-}
-
-multiclass NeonI_N2VCvt_Fp2fx opcode, string asmop,
-                              SDPatternOperator IntOp> {
-  def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2i32, v2f32,
-                      shr_imm32, IntOp> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4i32, v4f32,
-                      shr_imm32, IntOp> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2i64, v2f64,
-                      shr_imm64, IntOp> {
-    let Inst{22} = 0b1;
-  }
-}
-
-// Convert fixed-point to floating-point
-defm VCVTxs2f : NeonI_N2VCvt_Fx2fp<0, 0b11100, "scvtf",
-                                   int_arm_neon_vcvtfxs2fp>;
-defm VCVTxu2f : NeonI_N2VCvt_Fx2fp<1, 0b11100, "ucvtf",
-                                   int_arm_neon_vcvtfxu2fp>;
-
-// Convert floating-point to fixed-point
-defm VCVTf2xs : NeonI_N2VCvt_Fp2fx<0, 0b11111, "fcvtzs",
-                                   int_arm_neon_vcvtfp2fxs>;
-defm VCVTf2xu : NeonI_N2VCvt_Fp2fx<1, 0b11111, "fcvtzu",
-                                   int_arm_neon_vcvtfp2fxu>;
-
-multiclass Neon_sshll2_0
-{
-  def _v8i8 : PatFrag<(ops node:$Rn),
-                      (v8i16 (ext (v8i8 (Neon_High16B node:$Rn))))>;
-  def _v4i16 : PatFrag<(ops node:$Rn),
-                       (v4i32 (ext (v4i16 (Neon_High8H node:$Rn))))>;
-  def _v2i32 : PatFrag<(ops node:$Rn),
-                       (v2i64 (ext (v2i32 (Neon_High4S node:$Rn))))>;
-}
-
-defm NI_sext_high : Neon_sshll2_0;
-defm NI_zext_high : Neon_sshll2_0;
-
-
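Since several of the families above (shift-left long, rounding shift right,
shift-right narrow, and the fixed-point converts) all hang off the same
NeonI_2VShiftImm encoding, a small scalar model helps pin down the per-lane
semantics the patterns are matching. This is an illustrative sketch, not code
from the patch; the helper names are invented here.

  #include <cstdint>
  #include <cstdio>

  // sshll vd.8h, vn.8b, #n : sign-extend each byte, then shift left by n.
  static void sshll_8h_8b(int16_t d[8], const int8_t src[8], unsigned n) {
    for (int i = 0; i < 8; ++i)
      d[i] = (int16_t)((int16_t)src[i] << n);   // widen first, shift second
  }

  // sshll2 vd.8h, vn.16b, #n : the same on bytes 8..15 (Neon_High16B).
  static void sshll2_8h_16b(int16_t d[8], const int8_t src[16], unsigned n) {
    for (int i = 0; i < 8; ++i)
      d[i] = (int16_t)((int16_t)src[8 + i] << n);
  }

  // urshr vd.4s, vn.4s, #n : unsigned shift right with rounding, n in 1..32.
  static uint32_t urshr_lane(uint32_t x, unsigned n) {
    return (uint32_t)(((uint64_t)x + (1ull << (n - 1))) >> n);
  }

  // scvtf vd.2s, vn.2s, #fbits : signed fixed-point to float, x / 2^fbits.
  static float scvtf_lane(int32_t x, unsigned fbits) {
    return (float)x / (float)(1ull << fbits);
  }

  int main() {
    int8_t v[16];
    for (int i = 0; i < 16; ++i) v[i] = (int8_t)(i - 8);  // lanes -8..7
    int16_t lo[8], hi[8];
    sshll_8h_8b(lo, v, 1);    // low half, doubled
    sshll2_8h_16b(hi, v, 0);  // shift #0 is what the SXTL2 alias prints
    printf("sshll: %d %d  sshll2: %d %d\n", lo[0], lo[7], hi[0], hi[7]);
    printf("urshr(10,#2)=%u  scvtf(12,#4)=%f\n", urshr_lane(10, 2),
           scvtf_lane(12, 4));
    return 0;
  }

The "widen first, shift second" order is the point of the shift-long forms:
the shift happens in the destination element width, so no bits are lost.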
-//===----------------------------------------------------------------------===// -// Multiclasses for NeonI_Across -//===----------------------------------------------------------------------===// - -// Variant 1 - -multiclass NeonI_2VAcross_1 opcode, - string asmop, SDPatternOperator opnode> -{ - def _1h8b: NeonI_2VAcross<0b0, u, 0b00, opcode, - (outs FPR16:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd, $Rn.8b", - [(set (v1i16 FPR16:$Rd), - (v1i16 (opnode (v8i8 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _1h16b: NeonI_2VAcross<0b1, u, 0b00, opcode, - (outs FPR16:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd, $Rn.16b", - [(set (v1i16 FPR16:$Rd), - (v1i16 (opnode (v16i8 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _1s4h: NeonI_2VAcross<0b0, u, 0b01, opcode, - (outs FPR32:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd, $Rn.4h", - [(set (v1i32 FPR32:$Rd), - (v1i32 (opnode (v4i16 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _1s8h: NeonI_2VAcross<0b1, u, 0b01, opcode, - (outs FPR32:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd, $Rn.8h", - [(set (v1i32 FPR32:$Rd), - (v1i32 (opnode (v8i16 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - // _1d2s doesn't exist! - - def _1d4s: NeonI_2VAcross<0b1, u, 0b10, opcode, - (outs FPR64:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd, $Rn.4s", - [(set (v1i64 FPR64:$Rd), - (v1i64 (opnode (v4i32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm SADDLV : NeonI_2VAcross_1<0b0, 0b00011, "saddlv", int_aarch64_neon_saddlv>; -defm UADDLV : NeonI_2VAcross_1<0b1, 0b00011, "uaddlv", int_aarch64_neon_uaddlv>; - -// Variant 2 - -multiclass NeonI_2VAcross_2 opcode, - string asmop, SDPatternOperator opnode> -{ - def _1b8b: NeonI_2VAcross<0b0, u, 0b00, opcode, - (outs FPR8:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd, $Rn.8b", - [(set (v1i8 FPR8:$Rd), - (v1i8 (opnode (v8i8 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _1b16b: NeonI_2VAcross<0b1, u, 0b00, opcode, - (outs FPR8:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd, $Rn.16b", - [(set (v1i8 FPR8:$Rd), - (v1i8 (opnode (v16i8 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _1h4h: NeonI_2VAcross<0b0, u, 0b01, opcode, - (outs FPR16:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd, $Rn.4h", - [(set (v1i16 FPR16:$Rd), - (v1i16 (opnode (v4i16 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def _1h8h: NeonI_2VAcross<0b1, u, 0b01, opcode, - (outs FPR16:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd, $Rn.8h", - [(set (v1i16 FPR16:$Rd), - (v1i16 (opnode (v8i16 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - // _1s2s doesn't exist! 
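// (Aside, inferred rather than taken from the patch: the missing _1d2s and
// _1s2s variants noted above reflect the ISA, which defines no across-lanes
// reduction with a .2s source; reducing two lanes is left to the pairwise
// instructions such as addp/saddlp.)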
- - def _1s4s: NeonI_2VAcross<0b1, u, 0b10, opcode, - (outs FPR32:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd, $Rn.4s", - [(set (v1i32 FPR32:$Rd), - (v1i32 (opnode (v4i32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm SMAXV : NeonI_2VAcross_2<0b0, 0b01010, "smaxv", int_aarch64_neon_smaxv>; -defm UMAXV : NeonI_2VAcross_2<0b1, 0b01010, "umaxv", int_aarch64_neon_umaxv>; - -defm SMINV : NeonI_2VAcross_2<0b0, 0b11010, "sminv", int_aarch64_neon_sminv>; -defm UMINV : NeonI_2VAcross_2<0b1, 0b11010, "uminv", int_aarch64_neon_uminv>; - -defm ADDV : NeonI_2VAcross_2<0b0, 0b11011, "addv", int_aarch64_neon_vaddv>; - -// Variant 3 - -multiclass NeonI_2VAcross_3 opcode, bits<2> size, - string asmop, SDPatternOperator opnode> { - def _1s4s: NeonI_2VAcross<0b1, u, size, opcode, - (outs FPR32:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd, $Rn.4s", - [(set (f32 FPR32:$Rd), - (f32 (opnode (v4f32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm FMAXNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b00, "fmaxnmv", - int_aarch64_neon_vmaxnmv>; -defm FMINNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b10, "fminnmv", - int_aarch64_neon_vminnmv>; - -defm FMAXV : NeonI_2VAcross_3<0b1, 0b01111, 0b00, "fmaxv", - int_aarch64_neon_vmaxv>; -defm FMINV : NeonI_2VAcross_3<0b1, 0b01111, 0b10, "fminv", - int_aarch64_neon_vminv>; - -// The followings are for instruction class (Perm) - -class NeonI_Permute size, bits<3> opcode, - string asmop, RegisterOperand OpVPR, string OpS, - SDPatternOperator opnode, ValueType Ty> - : NeonI_Perm, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -multiclass NeonI_Perm_pat opcode, string asmop, - SDPatternOperator opnode> { - def _8b : NeonI_Permute<0b0, 0b00, opcode, asmop, - VPR64, "8b", opnode, v8i8>; - def _16b : NeonI_Permute<0b1, 0b00, opcode, asmop, - VPR128, "16b",opnode, v16i8>; - def _4h : NeonI_Permute<0b0, 0b01, opcode, asmop, - VPR64, "4h", opnode, v4i16>; - def _8h : NeonI_Permute<0b1, 0b01, opcode, asmop, - VPR128, "8h", opnode, v8i16>; - def _2s : NeonI_Permute<0b0, 0b10, opcode, asmop, - VPR64, "2s", opnode, v2i32>; - def _4s : NeonI_Permute<0b1, 0b10, opcode, asmop, - VPR128, "4s", opnode, v4i32>; - def _2d : NeonI_Permute<0b1, 0b11, opcode, asmop, - VPR128, "2d", opnode, v2i64>; -} - -defm UZP1vvv : NeonI_Perm_pat<0b001, "uzp1", Neon_uzp1>; -defm TRN1vvv : NeonI_Perm_pat<0b010, "trn1", Neon_trn1>; -defm ZIP1vvv : NeonI_Perm_pat<0b011, "zip1", Neon_zip1>; -defm UZP2vvv : NeonI_Perm_pat<0b101, "uzp2", Neon_uzp2>; -defm TRN2vvv : NeonI_Perm_pat<0b110, "trn2", Neon_trn2>; -defm ZIP2vvv : NeonI_Perm_pat<0b111, "zip2", Neon_zip2>; - -multiclass NeonI_Perm_float_pat { - def : Pat<(v2f32 (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))), - (!cast(INS # "_2s") VPR64:$Rn, VPR64:$Rm)>; - - def : Pat<(v4f32 (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))), - (!cast(INS # "_4s") VPR128:$Rn, VPR128:$Rm)>; - - def : Pat<(v2f64 (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))), - (!cast(INS # "_2d") VPR128:$Rn, VPR128:$Rm)>; -} - -defm : NeonI_Perm_float_pat<"UZP1vvv", Neon_uzp1>; -defm : NeonI_Perm_float_pat<"UZP2vvv", Neon_uzp2>; -defm : NeonI_Perm_float_pat<"ZIP1vvv", Neon_zip1>; -defm : NeonI_Perm_float_pat<"ZIP2vvv", Neon_zip2>; -defm : NeonI_Perm_float_pat<"TRN1vvv", Neon_trn1>; -defm : NeonI_Perm_float_pat<"TRN2vvv", Neon_trn2>; - -// The followings are for instruction class (3V Diff) - -// normal long/long2 pattern -class NeonI_3VDL size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, SDPatternOperator ext, - 
RegisterOperand OpVPR, - ValueType ResTy, ValueType OpTy> - : NeonI_3VDiff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -multiclass NeonI_3VDL_s opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, sext, VPR64, v8i16, v8i8>; - def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, sext, VPR64, v4i32, v4i16>; - def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, sext, VPR64, v2i64, v2i32>; - } -} - -multiclass NeonI_3VDL2_s opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b", - opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>; - def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h", - opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>; - def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s", - opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>; - } -} - -multiclass NeonI_3VDL_u opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, zext, VPR64, v8i16, v8i8>; - def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, zext, VPR64, v4i32, v4i16>; - def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, zext, VPR64, v2i64, v2i32>; - } -} - -multiclass NeonI_3VDL2_u opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b", - opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>; - def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h", - opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>; - def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s", - opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>; - } -} - -defm SADDLvvv : NeonI_3VDL_s<0b0, 0b0000, "saddl", add, 1>; -defm UADDLvvv : NeonI_3VDL_u<0b1, 0b0000, "uaddl", add, 1>; - -defm SADDL2vvv : NeonI_3VDL2_s<0b0, 0b0000, "saddl2", add, 1>; -defm UADDL2vvv : NeonI_3VDL2_u<0b1, 0b0000, "uaddl2", add, 1>; - -defm SSUBLvvv : NeonI_3VDL_s<0b0, 0b0010, "ssubl", sub, 0>; -defm USUBLvvv : NeonI_3VDL_u<0b1, 0b0010, "usubl", sub, 0>; - -defm SSUBL2vvv : NeonI_3VDL2_s<0b0, 0b0010, "ssubl2", sub, 0>; -defm USUBL2vvv : NeonI_3VDL2_u<0b1, 0b0010, "usubl2", sub, 0>; - -// normal wide/wide2 pattern -class NeonI_3VDW size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, SDPatternOperator ext, - RegisterOperand OpVPR, - ValueType ResTy, ValueType OpTy> - : NeonI_3VDiff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -multiclass NeonI_3VDW_s opcode, string asmop, - SDPatternOperator opnode> { - def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, sext, VPR64, v8i16, v8i8>; - def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, sext, VPR64, v4i32, v4i16>; - def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, sext, VPR64, v2i64, v2i32>; -} - -defm SADDWvvv : NeonI_3VDW_s<0b0, 0b0001, "saddw", add>; -defm SSUBWvvv : NeonI_3VDW_s<0b0, 0b0011, "ssubw", sub>; - -multiclass NeonI_3VDW2_s opcode, string asmop, - SDPatternOperator opnode> { - def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b", - opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>; - def _4s8h : 
NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h", - opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>; - def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s", - opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>; -} - -defm SADDW2vvv : NeonI_3VDW2_s<0b0, 0b0001, "saddw2", add>; -defm SSUBW2vvv : NeonI_3VDW2_s<0b0, 0b0011, "ssubw2", sub>; - -multiclass NeonI_3VDW_u opcode, string asmop, - SDPatternOperator opnode> { - def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, zext, VPR64, v8i16, v8i8>; - def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, zext, VPR64, v4i32, v4i16>; - def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, zext, VPR64, v2i64, v2i32>; -} - -defm UADDWvvv : NeonI_3VDW_u<0b1, 0b0001, "uaddw", add>; -defm USUBWvvv : NeonI_3VDW_u<0b1, 0b0011, "usubw", sub>; - -multiclass NeonI_3VDW2_u opcode, string asmop, - SDPatternOperator opnode> { - def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b", - opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>; - def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h", - opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>; - def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s", - opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>; -} - -defm UADDW2vvv : NeonI_3VDW2_u<0b1, 0b0001, "uaddw2", add>; -defm USUBW2vvv : NeonI_3VDW2_u<0b1, 0b0011, "usubw2", sub>; - -// Get the high half part of the vector element. -multiclass NeonI_get_high { - def _8h : PatFrag<(ops node:$Rn), - (v8i8 (trunc (v8i16 (srl (v8i16 node:$Rn), - (v8i16 (Neon_vdup (i32 8)))))))>; - def _4s : PatFrag<(ops node:$Rn), - (v4i16 (trunc (v4i32 (srl (v4i32 node:$Rn), - (v4i32 (Neon_vdup (i32 16)))))))>; - def _2d : PatFrag<(ops node:$Rn), - (v2i32 (trunc (v2i64 (srl (v2i64 node:$Rn), - (v2i64 (Neon_vdup (i32 32)))))))>; -} - -defm NI_get_hi : NeonI_get_high; - -// pattern for addhn/subhn with 2 operands -class NeonI_3VDN_addhn_2Op size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, SDPatternOperator get_hi, - ValueType ResTy, ValueType OpTy> - : NeonI_3VDiff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -multiclass NeonI_3VDN_addhn_2Op opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8b8h : NeonI_3VDN_addhn_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h", - opnode, NI_get_hi_8h, v8i8, v8i16>; - def _4h4s : NeonI_3VDN_addhn_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s", - opnode, NI_get_hi_4s, v4i16, v4i32>; - def _2s2d : NeonI_3VDN_addhn_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d", - opnode, NI_get_hi_2d, v2i32, v2i64>; - } -} - -defm ADDHNvvv : NeonI_3VDN_addhn_2Op<0b0, 0b0100, "addhn", add, 1>; -defm SUBHNvvv : NeonI_3VDN_addhn_2Op<0b0, 0b0110, "subhn", sub, 0>; - -// pattern for operation with 2 operands -class NeonI_3VD_2Op size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, - RegisterOperand ResVPR, RegisterOperand OpVPR, - ValueType ResTy, ValueType OpTy> - : NeonI_3VDiff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -// normal narrow pattern -multiclass NeonI_3VDN_2Op opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8b8h : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h", - opnode, VPR64, VPR128, v8i8, v8i16>; - def _4h4s : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s", - opnode, VPR64, VPR128, v4i16, v4i32>; - def _2s2d : NeonI_3VD_2Op<0b0, u, 0b10, 
opcode, asmop, "2s", "2d", - opnode, VPR64, VPR128, v2i32, v2i64>; - } -} - -defm RADDHNvvv : NeonI_3VDN_2Op<0b1, 0b0100, "raddhn", int_arm_neon_vraddhn, 1>; -defm RSUBHNvvv : NeonI_3VDN_2Op<0b1, 0b0110, "rsubhn", int_arm_neon_vrsubhn, 0>; - -// pattern for acle intrinsic with 3 operands -class NeonI_3VDN_3Op size, bits<4> opcode, - string asmop, string ResS, string OpS> - : NeonI_3VDiff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - let neverHasSideEffects = 1; -} - -multiclass NeonI_3VDN_3Op_v1 opcode, string asmop> { - def _16b8h : NeonI_3VDN_3Op<0b1, u, 0b00, opcode, asmop, "16b", "8h">; - def _8h4s : NeonI_3VDN_3Op<0b1, u, 0b01, opcode, asmop, "8h", "4s">; - def _4s2d : NeonI_3VDN_3Op<0b1, u, 0b10, opcode, asmop, "4s", "2d">; -} - -defm ADDHN2vvv : NeonI_3VDN_3Op_v1<0b0, 0b0100, "addhn2">; -defm SUBHN2vvv : NeonI_3VDN_3Op_v1<0b0, 0b0110, "subhn2">; - -defm RADDHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0100, "raddhn2">; -defm RSUBHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0110, "rsubhn2">; - -// Patterns have to be separate because there's a SUBREG_TO_REG in the output -// part. -class NarrowHighHalfPat - : Pat<(Neon_combine_2D (v1i64 VPR64:$src), - (v1i64 (bitconvert (DstTy (coreop (SrcTy VPR128:$Rn), - (SrcTy VPR128:$Rm)))))), - (INST (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), - VPR128:$Rn, VPR128:$Rm)>; - -// addhn2 patterns -def : NarrowHighHalfPat>; -def : NarrowHighHalfPat>; -def : NarrowHighHalfPat>; - -// subhn2 patterns -def : NarrowHighHalfPat>; -def : NarrowHighHalfPat>; -def : NarrowHighHalfPat>; - -// raddhn2 patterns -def : NarrowHighHalfPat; -def : NarrowHighHalfPat; -def : NarrowHighHalfPat; - -// rsubhn2 patterns -def : NarrowHighHalfPat; -def : NarrowHighHalfPat; -def : NarrowHighHalfPat; - -// pattern that need to extend result -class NeonI_3VDL_Ext size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, - RegisterOperand OpVPR, - ValueType ResTy, ValueType OpTy, ValueType OpSTy> - : NeonI_3VDiff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -multiclass NeonI_3VDL_zext opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h8b : NeonI_3VDL_Ext<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, VPR64, v8i16, v8i8, v8i8>; - def _4s4h : NeonI_3VDL_Ext<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, VPR64, v4i32, v4i16, v4i16>; - def _2d2s : NeonI_3VDL_Ext<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, VPR64, v2i64, v2i32, v2i32>; - } -} - -defm SABDLvvv : NeonI_3VDL_zext<0b0, 0b0111, "sabdl", int_arm_neon_vabds, 1>; -defm UABDLvvv : NeonI_3VDL_zext<0b1, 0b0111, "uabdl", int_arm_neon_vabdu, 1>; - -multiclass NeonI_Op_High { - def _16B : PatFrag<(ops node:$Rn, node:$Rm), - (op (v8i8 (Neon_High16B node:$Rn)), - (v8i8 (Neon_High16B node:$Rm)))>; - def _8H : PatFrag<(ops node:$Rn, node:$Rm), - (op (v4i16 (Neon_High8H node:$Rn)), - (v4i16 (Neon_High8H node:$Rm)))>; - def _4S : PatFrag<(ops node:$Rn, node:$Rm), - (op (v2i32 (Neon_High4S node:$Rn)), - (v2i32 (Neon_High4S node:$Rm)))>; -} - -defm NI_sabdl_hi : NeonI_Op_High; -defm NI_uabdl_hi : NeonI_Op_High; -defm NI_smull_hi : NeonI_Op_High; -defm NI_umull_hi : NeonI_Op_High; -defm NI_qdmull_hi : NeonI_Op_High; -defm NI_pmull_hi : NeonI_Op_High; - -multiclass NeonI_3VDL_Abd_u opcode, string asmop, string opnode, - bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h8b : NeonI_3VDL_Ext<0b1, u, 0b00, opcode, asmop, "8h", "16b", - !cast(opnode # "_16B"), - VPR128, v8i16, v16i8, 
v8i8>; - def _4s4h : NeonI_3VDL_Ext<0b1, u, 0b01, opcode, asmop, "4s", "8h", - !cast(opnode # "_8H"), - VPR128, v4i32, v8i16, v4i16>; - def _2d2s : NeonI_3VDL_Ext<0b1, u, 0b10, opcode, asmop, "2d", "4s", - !cast(opnode # "_4S"), - VPR128, v2i64, v4i32, v2i32>; - } -} - -defm SABDL2vvv : NeonI_3VDL_Abd_u<0b0, 0b0111, "sabdl2", "NI_sabdl_hi", 1>; -defm UABDL2vvv : NeonI_3VDL_Abd_u<0b1, 0b0111, "uabdl2", "NI_uabdl_hi", 1>; - -// For pattern that need two operators being chained. -class NeonI_3VDL_Aba size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, SDPatternOperator subop, - RegisterOperand OpVPR, - ValueType ResTy, ValueType OpTy, ValueType OpSTy> - : NeonI_3VDiff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; -} - -multiclass NeonI_3VDL_Aba_v1 opcode, string asmop, - SDPatternOperator opnode, SDPatternOperator subop>{ - def _8h8b : NeonI_3VDL_Aba<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, subop, VPR64, v8i16, v8i8, v8i8>; - def _4s4h : NeonI_3VDL_Aba<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, subop, VPR64, v4i32, v4i16, v4i16>; - def _2d2s : NeonI_3VDL_Aba<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, subop, VPR64, v2i64, v2i32, v2i32>; -} - -defm SABALvvv : NeonI_3VDL_Aba_v1<0b0, 0b0101, "sabal", - add, int_arm_neon_vabds>; -defm UABALvvv : NeonI_3VDL_Aba_v1<0b1, 0b0101, "uabal", - add, int_arm_neon_vabdu>; - -multiclass NeonI_3VDL2_Aba_v1 opcode, string asmop, - SDPatternOperator opnode, string subop> { - def _8h8b : NeonI_3VDL_Aba<0b1, u, 0b00, opcode, asmop, "8h", "16b", - opnode, !cast(subop # "_16B"), - VPR128, v8i16, v16i8, v8i8>; - def _4s4h : NeonI_3VDL_Aba<0b1, u, 0b01, opcode, asmop, "4s", "8h", - opnode, !cast(subop # "_8H"), - VPR128, v4i32, v8i16, v4i16>; - def _2d2s : NeonI_3VDL_Aba<0b1, u, 0b10, opcode, asmop, "2d", "4s", - opnode, !cast(subop # "_4S"), - VPR128, v2i64, v4i32, v2i32>; -} - -defm SABAL2vvv : NeonI_3VDL2_Aba_v1<0b0, 0b0101, "sabal2", add, - "NI_sabdl_hi">; -defm UABAL2vvv : NeonI_3VDL2_Aba_v1<0b1, 0b0101, "uabal2", add, - "NI_uabdl_hi">; - -// Long pattern with 2 operands -multiclass NeonI_3VDL_2Op opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable, - SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { - def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, VPR128, VPR64, v8i16, v8i8>; - def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, VPR128, VPR64, v4i32, v4i16>; - def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, VPR128, VPR64, v2i64, v2i32>; - } -} - -defm SMULLvvv : NeonI_3VDL_2Op<0b0, 0b1100, "smull", int_arm_neon_vmulls, 1>; -defm UMULLvvv : NeonI_3VDL_2Op<0b1, 0b1100, "umull", int_arm_neon_vmullu, 1>; - -class NeonI_3VDL2_2Op_mull size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, - ValueType ResTy, ValueType OpTy> - : NeonI_3VDiff, - Sched<[WriteFPMul, ReadFPMul, ReadFPMul]>; - -multiclass NeonI_3VDL2_2Op_mull_v1 opcode, string asmop, - string opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b", - !cast(opnode # "_16B"), - v8i16, v16i8>; - def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h", - !cast(opnode # "_8H"), - v4i32, v8i16>; - def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s", - !cast(opnode # "_4S"), - v2i64, v4i32>; - } -} - -defm 
SMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b0, 0b1100, "smull2", - "NI_smull_hi", 1>; -defm UMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b1, 0b1100, "umull2", - "NI_umull_hi", 1>; - -// Long pattern with 3 operands -class NeonI_3VDL_3Op size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator opnode, - ValueType ResTy, ValueType OpTy> - : NeonI_3VDiff, - Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> { - let Constraints = "$src = $Rd"; -} - -multiclass NeonI_3VDL_3Op_v1 opcode, string asmop, - SDPatternOperator opnode> { - def _8h8b : NeonI_3VDL_3Op<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode, v8i16, v8i8>; - def _4s4h : NeonI_3VDL_3Op<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, v4i32, v4i16>; - def _2d2s : NeonI_3VDL_3Op<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, v2i64, v2i32>; -} - -def Neon_smlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), - (add node:$Rd, - (int_arm_neon_vmulls node:$Rn, node:$Rm))>; - -def Neon_umlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), - (add node:$Rd, - (int_arm_neon_vmullu node:$Rn, node:$Rm))>; - -def Neon_smlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), - (sub node:$Rd, - (int_arm_neon_vmulls node:$Rn, node:$Rm))>; - -def Neon_umlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), - (sub node:$Rd, - (int_arm_neon_vmullu node:$Rn, node:$Rm))>; - -defm SMLALvvv : NeonI_3VDL_3Op_v1<0b0, 0b1000, "smlal", Neon_smlal>; -defm UMLALvvv : NeonI_3VDL_3Op_v1<0b1, 0b1000, "umlal", Neon_umlal>; - -defm SMLSLvvv : NeonI_3VDL_3Op_v1<0b0, 0b1010, "smlsl", Neon_smlsl>; -defm UMLSLvvv : NeonI_3VDL_3Op_v1<0b1, 0b1010, "umlsl", Neon_umlsl>; - -class NeonI_3VDL2_3Op_mlas size, bits<4> opcode, - string asmop, string ResS, string OpS, - SDPatternOperator subop, SDPatternOperator opnode, - RegisterOperand OpVPR, - ValueType ResTy, ValueType OpTy> - : NeonI_3VDiff, - Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> { - let Constraints = "$src = $Rd"; -} - -multiclass NeonI_3VDL2_3Op_mlas_v1 opcode, string asmop, - SDPatternOperator subop, string opnode> { - def _8h16b : NeonI_3VDL2_3Op_mlas<0b1, u, 0b00, opcode, asmop, "8h", "16b", - subop, !cast(opnode # "_16B"), - VPR128, v8i16, v16i8>; - def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h", - subop, !cast(opnode # "_8H"), - VPR128, v4i32, v8i16>; - def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s", - subop, !cast(opnode # "_4S"), - VPR128, v2i64, v4i32>; -} - -defm SMLAL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1000, "smlal2", - add, "NI_smull_hi">; -defm UMLAL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1000, "umlal2", - add, "NI_umull_hi">; - -defm SMLSL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1010, "smlsl2", - sub, "NI_smull_hi">; -defm UMLSL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1010, "umlsl2", - sub, "NI_umull_hi">; - -multiclass NeonI_3VDL_qdmlal_3Op_v2 opcode, string asmop, - SDPatternOperator opnode> { - def _4s4h : NeonI_3VDL2_3Op_mlas<0b0, u, 0b01, opcode, asmop, "4s", "4h", - opnode, int_arm_neon_vqdmull, - VPR64, v4i32, v4i16>; - def _2d2s : NeonI_3VDL2_3Op_mlas<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, int_arm_neon_vqdmull, - VPR64, v2i64, v2i32>; -} - -defm SQDMLALvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1001, "sqdmlal", - int_arm_neon_vqadds>; -defm SQDMLSLvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1011, "sqdmlsl", - int_arm_neon_vqsubs>; - -multiclass NeonI_3VDL_v2 opcode, string asmop, - SDPatternOperator opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", 
"4h", - opnode, VPR128, VPR64, v4i32, v4i16>; - def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s", - opnode, VPR128, VPR64, v2i64, v2i32>; - } -} - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -defm SQDMULLvvv : NeonI_3VDL_v2<0b0, 0b1101, "sqdmull", - int_arm_neon_vqdmull, 1>; -} - -multiclass NeonI_3VDL2_2Op_mull_v2 opcode, string asmop, - string opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h", - !cast(opnode # "_8H"), - v4i32, v8i16>; - def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s", - !cast(opnode # "_4S"), - v2i64, v4i32>; - } -} - -defm SQDMULL2vvv : NeonI_3VDL2_2Op_mull_v2<0b0, 0b1101, "sqdmull2", - "NI_qdmull_hi", 1>; - -multiclass NeonI_3VDL2_3Op_qdmlal_v2 opcode, string asmop, - SDPatternOperator opnode> { - def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h", - opnode, NI_qdmull_hi_8H, - VPR128, v4i32, v8i16>; - def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s", - opnode, NI_qdmull_hi_4S, - VPR128, v2i64, v4i32>; -} - -defm SQDMLAL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1001, "sqdmlal2", - int_arm_neon_vqadds>; -defm SQDMLSL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1011, "sqdmlsl2", - int_arm_neon_vqsubs>; - -multiclass NeonI_3VDL_v3 opcode, string asmop, - SDPatternOperator opnode_8h8b, - SDPatternOperator opnode_1q1d, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b", - opnode_8h8b, VPR128, VPR64, v8i16, v8i8>; - - def _1q1d : NeonI_3VD_2Op<0b0, u, 0b11, opcode, asmop, "1q", "1d", - opnode_1q1d, VPR128, VPR64, v16i8, v1i64>; - } -} - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in -defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp, - int_aarch64_neon_vmull_p64, 1>; - -multiclass NeonI_3VDL2_2Op_mull_v3 opcode, string asmop, - string opnode, bit Commutable = 0> { - let isCommutable = Commutable in { - def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b", - !cast(opnode # "_16B"), - v8i16, v16i8>; - - def _1q2d : - NeonI_3VDiff<0b1, u, 0b11, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), - asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d", - [(set (v16i8 VPR128:$Rd), - (v16i8 (int_aarch64_neon_vmull_p64 - (v1i64 (scalar_to_vector - (i64 (vector_extract (v2i64 VPR128:$Rn), 1)))), - (v1i64 (scalar_to_vector - (i64 (vector_extract (v2i64 VPR128:$Rm), 1)))))))], - NoItinerary>, - Sched<[WriteFPMul, ReadFPMul, ReadFPMul]>; - } - - def : Pat<(v16i8 (int_aarch64_neon_vmull_p64 - (v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 1))), - (v1i64 (extract_subvector (v2i64 VPR128:$Rm), (i64 1))))), - (!cast(NAME # "_1q2d") VPR128:$Rn, VPR128:$Rm)>; -} - -defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2", "NI_pmull_hi", - 1>; - -// End of implementation for instruction class (3V Diff) - -// The followings are vector load/store multiple N-element structure -// (class SIMD lselem). - -// ld1: load multiple 1-element structure to 1/2/3/4 registers. -// ld2/ld3/ld4: load multiple N-element structure to N registers (N = 2, 3, 4). -// The structure consists of a sequence of sets of N values. -// The first element of the structure is placed in the first lane -// of the first first vector, the second element in the first lane -// of the second vector, and so on. -// E.g. 
LD1_3V_2S will load 32-bit elements {A, B, C, D, E, F} sequentially into -// the three 64-bit vectors list {BA, DC, FE}. -// E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the three -// 64-bit vectors list {DA, EB, FC}. -// Store instructions store multiple structure to N registers like load. - - -class NeonI_LDVList opcode, bits<2> size, - RegisterOperand VecList, string asmop> - : NeonI_LdStMult, - Sched<[WriteVecLd, ReadVecLd]> { - let mayLoad = 1; - let neverHasSideEffects = 1; -} - -multiclass LDVList_BHSD opcode, string List, string asmop> { - def _8B : NeonI_LDVList<0, opcode, 0b00, - !cast(List # "8B_operand"), asmop>; - - def _4H : NeonI_LDVList<0, opcode, 0b01, - !cast(List # "4H_operand"), asmop>; - - def _2S : NeonI_LDVList<0, opcode, 0b10, - !cast(List # "2S_operand"), asmop>; - - def _16B : NeonI_LDVList<1, opcode, 0b00, - !cast(List # "16B_operand"), asmop>; - - def _8H : NeonI_LDVList<1, opcode, 0b01, - !cast(List # "8H_operand"), asmop>; - - def _4S : NeonI_LDVList<1, opcode, 0b10, - !cast(List # "4S_operand"), asmop>; - - def _2D : NeonI_LDVList<1, opcode, 0b11, - !cast(List # "2D_operand"), asmop>; -} - -// Load multiple N-element structure to N consecutive registers (N = 1,2,3,4) -defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">; -def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">; - -defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">; - -defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">; - -defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">; - -// Load multiple 1-element structure to N consecutive registers (N = 2,3,4) -defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">; -def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">; - -defm LD1x3 : LDVList_BHSD<0b0110, "VTriple", "ld1">; -def LD1x3_1D : NeonI_LDVList<0, 0b0110, 0b11, VTriple1D_operand, "ld1">; - -defm LD1x4 : LDVList_BHSD<0b0010, "VQuad", "ld1">; -def LD1x4_1D : NeonI_LDVList<0, 0b0010, 0b11, VQuad1D_operand, "ld1">; - -class NeonI_STVList opcode, bits<2> size, - RegisterOperand VecList, string asmop> - : NeonI_LdStMult, - Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> { - let mayStore = 1; - let neverHasSideEffects = 1; -} - -multiclass STVList_BHSD opcode, string List, string asmop> { - def _8B : NeonI_STVList<0, opcode, 0b00, - !cast(List # "8B_operand"), asmop>; - - def _4H : NeonI_STVList<0, opcode, 0b01, - !cast(List # "4H_operand"), asmop>; - - def _2S : NeonI_STVList<0, opcode, 0b10, - !cast(List # "2S_operand"), asmop>; - - def _16B : NeonI_STVList<1, opcode, 0b00, - !cast(List # "16B_operand"), asmop>; - - def _8H : NeonI_STVList<1, opcode, 0b01, - !cast(List # "8H_operand"), asmop>; - - def _4S : NeonI_STVList<1, opcode, 0b10, - !cast(List # "4S_operand"), asmop>; - - def _2D : NeonI_STVList<1, opcode, 0b11, - !cast(List # "2D_operand"), asmop>; -} - -// Store multiple N-element structures from N registers (N = 1,2,3,4) -defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">; -def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">; - -defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">; - -defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">; - -defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">; - -// Store multiple 1-element structures from N consecutive registers (N = 2,3,4) -defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">; -def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">; - -defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">; -def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">; - -defm ST1x4 : STVList_BHSD<0b0010, "VQuad", 
"st1">; -def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">; - -def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; -def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; - -def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; -def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; - -def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>; -def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>; - -def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; -def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; - -def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; -def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; - -def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>; -def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>; - -def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr), - (ST1_2D GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr), - (ST1_2D GPR64xsp:$addr, VPR128:$value)>; - -def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr), - (ST1_4S GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr), - (ST1_4S GPR64xsp:$addr, VPR128:$value)>; - -def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr), - (ST1_8H GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr), - (ST1_16B GPR64xsp:$addr, VPR128:$value)>; - -def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr), - (ST1_1D GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr), - (ST1_1D GPR64xsp:$addr, VPR64:$value)>; - -def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr), - (ST1_2S GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr), - (ST1_2S GPR64xsp:$addr, VPR64:$value)>; - -def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr), - (ST1_4H GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr), - (ST1_8B GPR64xsp:$addr, VPR64:$value)>; - -// Match load/store of v1i8/v1i16/v1i32 type to FPR8/FPR16/FPR32 load/store. -// FIXME: for now we have v1i8, v1i16, v1i32 legal types, if they are illegal, -// these patterns are not needed any more. 
-def : Pat<(v1i8 (load GPR64xsp:$addr)), (LSFP8_LDR $addr, 0)>; -def : Pat<(v1i16 (load GPR64xsp:$addr)), (LSFP16_LDR $addr, 0)>; -def : Pat<(v1i32 (load GPR64xsp:$addr)), (LSFP32_LDR $addr, 0)>; - -def : Pat<(store (v1i8 FPR8:$value), GPR64xsp:$addr), - (LSFP8_STR $value, $addr, 0)>; -def : Pat<(store (v1i16 FPR16:$value), GPR64xsp:$addr), - (LSFP16_STR $value, $addr, 0)>; -def : Pat<(store (v1i32 FPR32:$value), GPR64xsp:$addr), - (LSFP32_STR $value, $addr, 0)>; - - -// End of vector load/store multiple N-element structure(class SIMD lselem) - -// The followings are post-index vector load/store multiple N-element -// structure(class SIMD lselem-post) -def exact1_asmoperand : AsmOperandClass { - let Name = "Exact1"; - let PredicateMethod = "isExactImm<1>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact1 : Operand, ImmLeaf { - let ParserMatchClass = exact1_asmoperand; -} - -def exact2_asmoperand : AsmOperandClass { - let Name = "Exact2"; - let PredicateMethod = "isExactImm<2>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact2 : Operand, ImmLeaf { - let ParserMatchClass = exact2_asmoperand; -} - -def exact3_asmoperand : AsmOperandClass { - let Name = "Exact3"; - let PredicateMethod = "isExactImm<3>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact3 : Operand, ImmLeaf { - let ParserMatchClass = exact3_asmoperand; -} - -def exact4_asmoperand : AsmOperandClass { - let Name = "Exact4"; - let PredicateMethod = "isExactImm<4>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact4 : Operand, ImmLeaf { - let ParserMatchClass = exact4_asmoperand; -} - -def exact6_asmoperand : AsmOperandClass { - let Name = "Exact6"; - let PredicateMethod = "isExactImm<6>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact6 : Operand, ImmLeaf { - let ParserMatchClass = exact6_asmoperand; -} - -def exact8_asmoperand : AsmOperandClass { - let Name = "Exact8"; - let PredicateMethod = "isExactImm<8>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact8 : Operand, ImmLeaf { - let ParserMatchClass = exact8_asmoperand; -} - -def exact12_asmoperand : AsmOperandClass { - let Name = "Exact12"; - let PredicateMethod = "isExactImm<12>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact12 : Operand, ImmLeaf { - let ParserMatchClass = exact12_asmoperand; -} - -def exact16_asmoperand : AsmOperandClass { - let Name = "Exact16"; - let PredicateMethod = "isExactImm<16>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact16 : Operand, ImmLeaf { - let ParserMatchClass = exact16_asmoperand; -} - -def exact24_asmoperand : AsmOperandClass { - let Name = "Exact24"; - let PredicateMethod = "isExactImm<24>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact24 : Operand, ImmLeaf { - let ParserMatchClass = exact24_asmoperand; -} - -def exact32_asmoperand : AsmOperandClass { - let Name = "Exact32"; - let PredicateMethod = "isExactImm<32>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact32 : Operand, ImmLeaf { - let ParserMatchClass = exact32_asmoperand; -} - -def exact48_asmoperand : AsmOperandClass { - let Name = "Exact48"; - let PredicateMethod = "isExactImm<48>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact48 : Operand, ImmLeaf { - let ParserMatchClass = exact48_asmoperand; -} - -def exact64_asmoperand : AsmOperandClass { - let Name = "Exact64"; - let PredicateMethod = "isExactImm<64>"; - let RenderMethod = "addImmOperands"; -} -def uimm_exact64 : Operand, ImmLeaf { - let ParserMatchClass = exact64_asmoperand; -} - -multiclass NeonI_LDWB_VList 
opcode, bits<2> size, - RegisterOperand VecList, Operand ImmTy, - string asmop> { - let Constraints = "$Rn = $wb", mayLoad = 1, neverHasSideEffects = 1, - DecoderMethod = "DecodeVLDSTPostInstruction" in { - def _fixed : NeonI_LdStMult_Post, - Sched<[WriteVecLd, WriteVecLd, ReadVecLd]> { - let Rm = 0b11111; - } - - def _register : NeonI_LdStMult_Post, - Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]>; - } -} - -multiclass LDWB_VList_BHSD opcode, string List, Operand ImmTy, - Operand ImmTy2, string asmop> { - defm _8B : NeonI_LDWB_VList<0, opcode, 0b00, - !cast(List # "8B_operand"), - ImmTy, asmop>; - - defm _4H : NeonI_LDWB_VList<0, opcode, 0b01, - !cast(List # "4H_operand"), - ImmTy, asmop>; - - defm _2S : NeonI_LDWB_VList<0, opcode, 0b10, - !cast(List # "2S_operand"), - ImmTy, asmop>; - - defm _16B : NeonI_LDWB_VList<1, opcode, 0b00, - !cast(List # "16B_operand"), - ImmTy2, asmop>; - - defm _8H : NeonI_LDWB_VList<1, opcode, 0b01, - !cast(List # "8H_operand"), - ImmTy2, asmop>; - - defm _4S : NeonI_LDWB_VList<1, opcode, 0b10, - !cast(List # "4S_operand"), - ImmTy2, asmop>; - - defm _2D : NeonI_LDWB_VList<1, opcode, 0b11, - !cast(List # "2D_operand"), - ImmTy2, asmop>; -} - -// Post-index load multiple N-element structures from N registers (N = 1,2,3,4) -defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">; -defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, - "ld1">; - -defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">; - -defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, - "ld3">; - -defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">; - -// Post-index load multiple 1-element structures from N consecutive registers -// (N = 2,3,4) -defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, - "ld1">; -defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand, - uimm_exact16, "ld1">; - -defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, - "ld1">; -defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand, - uimm_exact24, "ld1">; - -defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, - "ld1">; -defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand, - uimm_exact32, "ld1">; - -multiclass NeonI_STWB_VList opcode, bits<2> size, - RegisterOperand VecList, Operand ImmTy, - string asmop> { - let Constraints = "$Rn = $wb", mayStore = 1, neverHasSideEffects = 1, - DecoderMethod = "DecodeVLDSTPostInstruction" in { - def _fixed : NeonI_LdStMult_Post, - Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> { - let Rm = 0b11111; - } - - def _register : NeonI_LdStMult_Post, - Sched<[WriteVecSt, ReadVecSt, ReadVecSt, ReadVecSt]>; - } -} - -multiclass STWB_VList_BHSD opcode, string List, Operand ImmTy, - Operand ImmTy2, string asmop> { - defm _8B : NeonI_STWB_VList<0, opcode, 0b00, - !cast(List # "8B_operand"), ImmTy, asmop>; - - defm _4H : NeonI_STWB_VList<0, opcode, 0b01, - !cast(List # "4H_operand"), - ImmTy, asmop>; - - defm _2S : NeonI_STWB_VList<0, opcode, 0b10, - !cast(List # "2S_operand"), - ImmTy, asmop>; - - defm _16B : NeonI_STWB_VList<1, opcode, 0b00, - !cast(List # "16B_operand"), - ImmTy2, asmop>; - - defm _8H : NeonI_STWB_VList<1, opcode, 0b01, - !cast(List # "8H_operand"), - ImmTy2, asmop>; - - defm _4S : NeonI_STWB_VList<1, opcode, 0b10, - !cast(List # "4S_operand"), - ImmTy2, asmop>; - - defm _2D : NeonI_STWB_VList<1, opcode, 0b11, - !cast(List # 
"2D_operand"), - ImmTy2, asmop>; -} - -// Post-index load multiple N-element structures from N registers (N = 1,2,3,4) -defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">; -defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, - "st1">; - -defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">; - -defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, - "st3">; - -defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">; - -// Post-index load multiple 1-element structures from N consecutive registers -// (N = 2,3,4) -defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, - "st1">; -defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand, - uimm_exact16, "st1">; - -defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, - "st1">; -defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand, - uimm_exact24, "st1">; - -defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, - "st1">; -defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand, - uimm_exact32, "st1">; - -// End of post-index vector load/store multiple N-element structure -// (class SIMD lselem-post) - -// The followings are vector load/store single N-element structure -// (class SIMD lsone). -def neon_uimm0_bare : Operand, - ImmLeaf { - let ParserMatchClass = neon_uimm0_asmoperand; - let PrintMethod = "printUImmBareOperand"; -} - -def neon_uimm1_bare : Operand, - ImmLeaf { - let ParserMatchClass = neon_uimm1_asmoperand; - let PrintMethod = "printUImmBareOperand"; -} - -def neon_uimm2_bare : Operand, - ImmLeaf { - let ParserMatchClass = neon_uimm2_asmoperand; - let PrintMethod = "printUImmBareOperand"; -} - -def neon_uimm3_bare : Operand, - ImmLeaf { - let ParserMatchClass = uimm3_asmoperand; - let PrintMethod = "printUImmBareOperand"; -} - -def neon_uimm4_bare : Operand, - ImmLeaf { - let ParserMatchClass = uimm4_asmoperand; - let PrintMethod = "printUImmBareOperand"; -} - -class NeonI_LDN_Dup opcode, bits<2> size, - RegisterOperand VecList, string asmop> - : NeonI_LdOne_Dup, - Sched<[WriteVecLd, ReadVecLd]> { - let mayLoad = 1; - let neverHasSideEffects = 1; -} - -multiclass LDN_Dup_BHSD opcode, string List, string asmop> { - def _8B : NeonI_LDN_Dup<0, r, opcode, 0b00, - !cast(List # "8B_operand"), asmop>; - - def _4H : NeonI_LDN_Dup<0, r, opcode, 0b01, - !cast(List # "4H_operand"), asmop>; - - def _2S : NeonI_LDN_Dup<0, r, opcode, 0b10, - !cast(List # "2S_operand"), asmop>; - - def _1D : NeonI_LDN_Dup<0, r, opcode, 0b11, - !cast(List # "1D_operand"), asmop>; - - def _16B : NeonI_LDN_Dup<1, r, opcode, 0b00, - !cast(List # "16B_operand"), asmop>; - - def _8H : NeonI_LDN_Dup<1, r, opcode, 0b01, - !cast(List # "8H_operand"), asmop>; - - def _4S : NeonI_LDN_Dup<1, r, opcode, 0b10, - !cast(List # "4S_operand"), asmop>; - - def _2D : NeonI_LDN_Dup<1, r, opcode, 0b11, - !cast(List # "2D_operand"), asmop>; -} - -// Load single 1-element structure to all lanes of 1 register -defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">; - -// Load single N-element structure to all lanes of N consecutive -// registers (N = 2,3,4) -defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">; -defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">; -defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">; - - -class LD1R_pattern - : Pat<(VTy (Neon_vdup (DTy (LoadOp GPR64xsp:$Rn)))), - (VTy (INST GPR64xsp:$Rn))>; - -// Match all LD1R 
-
-
-class LD1R_pattern <ValueType VTy, ValueType DTy, PatFrag LoadOp,
-                    Instruction INST>
-  : Pat<(VTy (Neon_vdup (DTy (LoadOp GPR64xsp:$Rn)))),
-        (VTy (INST GPR64xsp:$Rn))>;
-
-// Match all LD1R instructions
-def : LD1R_pattern<v8i8, i32, extloadi8, LD1R_8B>;
-
-def : LD1R_pattern<v16i8, i32, extloadi8, LD1R_16B>;
-
-def : LD1R_pattern<v4i16, i32, extloadi16, LD1R_4H>;
-
-def : LD1R_pattern<v8i16, i32, extloadi16, LD1R_8H>;
-
-def : LD1R_pattern<v2i32, i32, load, LD1R_2S>;
-def : LD1R_pattern<v4i32, i32, load, LD1R_4S>;
-
-def : LD1R_pattern<v2f32, f32, load, LD1R_2S>;
-def : LD1R_pattern<v4f32, f32, load, LD1R_4S>;
-
-def : LD1R_pattern<v2i64, i64, load, LD1R_2D>;
-def : LD1R_pattern<v2f64, f64, load, LD1R_2D>;
-
-class LD1R_pattern_v1 <ValueType VTy, ValueType DTy, PatFrag LoadOp,
-                       Instruction INST>
-  : Pat<(VTy (scalar_to_vector (DTy (LoadOp GPR64xsp:$Rn)))),
-        (VTy (INST GPR64xsp:$Rn))>;
-
-def : LD1R_pattern_v1<v1i64, i64, load, LD1R_1D>;
-def : LD1R_pattern_v1<v1f64, f64, load, LD1R_1D>;
-
-multiclass VectorList_Bare_BHSD<string PREFIX, int Count,
-                                RegisterClass RegList> {
-  defm B : VectorList_operands<PREFIX, "B", Count, RegList>;
-  defm H : VectorList_operands<PREFIX, "H", Count, RegList>;
-  defm S : VectorList_operands<PREFIX, "S", Count, RegList>;
-  defm D : VectorList_operands<PREFIX, "D", Count, RegList>;
-}
-
-// Special vector list operand of 128-bit vectors with bare layout.
-// i.e. only show ".b", ".h", ".s", ".d"
-defm VOne : VectorList_Bare_BHSD<"VOne", 1, FPR128>;
-defm VPair : VectorList_Bare_BHSD<"VPair", 2, QPair>;
-defm VTriple : VectorList_Bare_BHSD<"VTriple", 3, QTriple>;
-defm VQuad : VectorList_Bare_BHSD<"VQuad", 4, QQuad>;
-
-class NeonI_LDN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
-                     Operand ImmOp, string asmop>
-  : NeonI_LdStOne_Lane<1, r, op2_1, op0,
-                       (outs VList:$Rt),
-                       (ins GPR64xsp:$Rn, VList:$src, ImmOp:$lane),
-                       asmop # "\t$Rt[$lane], [$Rn]",
-                       [],
-                       NoItinerary>,
-    Sched<[WriteVecLd, ReadVecLd, ReadVecLd]> {
-  let mayLoad = 1;
-  let neverHasSideEffects = 1;
-  let hasExtraDefRegAllocReq = 1;
-  let Constraints = "$src = $Rt";
-}
-
-multiclass LDN_Lane_BHSD<bit r, bit op0, string List, string asmop> {
-  def _B : NeonI_LDN_Lane<r, 0b00, op0,
-                          !cast<RegisterOperand>(List # "B_operand"),
-                          neon_uimm4_bare, asmop> {
-    let Inst{12-10} = lane{2-0};
-    let Inst{30} = lane{3};
-  }
-
-  def _H : NeonI_LDN_Lane<r, 0b01, op0,
-                          !cast<RegisterOperand>(List # "H_operand"),
-                          neon_uimm3_bare, asmop> {
-    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
-    let Inst{30} = lane{2};
-  }
-
-  def _S : NeonI_LDN_Lane<r, 0b10, op0,
-                          !cast<RegisterOperand>(List # "S_operand"),
-                          neon_uimm2_bare, asmop> {
-    let Inst{12-10} = {lane{0}, 0b0, 0b0};
-    let Inst{30} = lane{1};
-  }
-
-  def _D : NeonI_LDN_Lane<r, 0b10, op0,
-                          !cast<RegisterOperand>(List # "D_operand"),
-                          neon_uimm1_bare, asmop> {
-    let Inst{12-10} = 0b001;
-    let Inst{30} = lane{0};
-  }
-}
-
-// Load single 1-element structure to one lane of 1 register.
-defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">; - -// Load single N-element structure to one lane of N consecutive registers -// (N = 2,3,4) -defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">; -defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">; -defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">; - -multiclass LD1LN_patterns { - def : Pat<(VTy (vector_insert (VTy VPR64:$src), - (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))), - (VTy (EXTRACT_SUBREG - (INST GPR64xsp:$Rn, - (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), - ImmOp:$lane), - sub_64))>; - - def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src), - (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))), - (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>; -} - -// Match all LD1LN instructions -defm : LD1LN_patterns; - -defm : LD1LN_patterns; - -defm : LD1LN_patterns; -defm : LD1LN_patterns; - -defm : LD1LN_patterns; -defm : LD1LN_patterns; - -class NeonI_STN_Lane op2_1, bit op0, RegisterOperand VList, - Operand ImmOp, string asmop> - : NeonI_LdStOne_Lane<0, r, op2_1, op0, - (outs), (ins GPR64xsp:$Rn, VList:$Rt, ImmOp:$lane), - asmop # "\t$Rt[$lane], [$Rn]", - [], - NoItinerary>, - Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> { - let mayStore = 1; - let neverHasSideEffects = 1; - let hasExtraDefRegAllocReq = 1; -} - -multiclass STN_Lane_BHSD { - def _B : NeonI_STN_Lane(List # "B_operand"), - neon_uimm4_bare, asmop> { - let Inst{12-10} = lane{2-0}; - let Inst{30} = lane{3}; - } - - def _H : NeonI_STN_Lane(List # "H_operand"), - neon_uimm3_bare, asmop> { - let Inst{12-10} = {lane{1}, lane{0}, 0b0}; - let Inst{30} = lane{2}; - } - - def _S : NeonI_STN_Lane(List # "S_operand"), - neon_uimm2_bare, asmop> { - let Inst{12-10} = {lane{0}, 0b0, 0b0}; - let Inst{30} = lane{1}; - } - - def _D : NeonI_STN_Lane(List # "D_operand"), - neon_uimm1_bare, asmop>{ - let Inst{12-10} = 0b001; - let Inst{30} = lane{0}; - } -} - -// Store single 1-element structure from one lane of 1 register. -defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">; - -// Store single N-element structure from one lane of N consecutive registers -// (N = 2,3,4) -defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">; -defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">; -defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">; - -multiclass ST1LN_patterns { - def : Pat<(StoreOp (DTy (vector_extract (VTy VPR64:$Rt), ImmOp:$lane)), - GPR64xsp:$Rn), - (INST GPR64xsp:$Rn, - (SUBREG_TO_REG (i64 0), VPR64:$Rt, sub_64), - ImmOp:$lane)>; - - def : Pat<(StoreOp (DTy (vector_extract (VTy2 VPR128:$Rt), ImmOp2:$lane)), - GPR64xsp:$Rn), - (INST GPR64xsp:$Rn, VPR128:$Rt, ImmOp2:$lane)>; -} - -// Match all ST1LN instructions -defm : ST1LN_patterns; - -defm : ST1LN_patterns; - -defm : ST1LN_patterns; -defm : ST1LN_patterns; - -defm : ST1LN_patterns; -defm : ST1LN_patterns; - -// End of vector load/store single N-element structure (class SIMD lsone). 
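A sketch (not from the patch) of the single-lane store the ST1LN patterns
above match, i.e. (StoreOp (vector_extract ...)) going straight to memory
without moving the lane through a general-purpose register (function name is
illustrative):

    #include <arm_neon.h>

    void store_third_lane(float *p, float32x4_t v) {
        vst1q_lane_f32(p, v, 2); /* may lower to: st1 {v0.s}[2], [x0] */
    }
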
- - -// The following are post-index load/store single N-element instructions -// (class SIMD lsone-post) - -multiclass NeonI_LDN_WB_Dup opcode, bits<2> size, - RegisterOperand VecList, Operand ImmTy, - string asmop> { - let mayLoad = 1, neverHasSideEffects = 1, Constraints = "$wb = $Rn", - DecoderMethod = "DecodeVLDSTLanePostInstruction" in { - def _fixed : NeonI_LdOne_Dup_Post, - Sched<[WriteVecLd, WriteVecLd, ReadVecLd]> { - let Rm = 0b11111; - } - - def _register : NeonI_LdOne_Dup_Post, - Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]>; - } -} - -multiclass LDWB_Dup_BHSD opcode, string List, string asmop, - Operand uimm_b, Operand uimm_h, - Operand uimm_s, Operand uimm_d> { - defm _8B : NeonI_LDN_WB_Dup<0, r, opcode, 0b00, - !cast(List # "8B_operand"), - uimm_b, asmop>; - - defm _4H : NeonI_LDN_WB_Dup<0, r, opcode, 0b01, - !cast(List # "4H_operand"), - uimm_h, asmop>; - - defm _2S : NeonI_LDN_WB_Dup<0, r, opcode, 0b10, - !cast(List # "2S_operand"), - uimm_s, asmop>; - - defm _1D : NeonI_LDN_WB_Dup<0, r, opcode, 0b11, - !cast(List # "1D_operand"), - uimm_d, asmop>; - - defm _16B : NeonI_LDN_WB_Dup<1, r, opcode, 0b00, - !cast(List # "16B_operand"), - uimm_b, asmop>; - - defm _8H : NeonI_LDN_WB_Dup<1, r, opcode, 0b01, - !cast(List # "8H_operand"), - uimm_h, asmop>; - - defm _4S : NeonI_LDN_WB_Dup<1, r, opcode, 0b10, - !cast(List # "4S_operand"), - uimm_s, asmop>; - - defm _2D : NeonI_LDN_WB_Dup<1, r, opcode, 0b11, - !cast(List # "2D_operand"), - uimm_d, asmop>; -} - -// Post-index load single 1-element structure to all lanes of 1 register -defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1, - uimm_exact2, uimm_exact4, uimm_exact8>; - -// Post-index load single N-element structure to all lanes of N consecutive -// registers (N = 2,3,4) -defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; - -let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, - Constraints = "$Rn = $wb, $Rt = $src", - DecoderMethod = "DecodeVLDSTLanePostInstruction" in { - class LDN_WBFx_Lane op2_1, bit op0, RegisterOperand VList, - Operand ImmTy, Operand ImmOp, string asmop> - : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0, - (outs VList:$Rt, GPR64xsp:$wb), - (ins GPR64xsp:$Rn, ImmTy:$amt, - VList:$src, ImmOp:$lane), - asmop # "\t$Rt[$lane], [$Rn], $amt", - [], - NoItinerary>, - Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]> { - let Rm = 0b11111; - } - - class LDN_WBReg_Lane op2_1, bit op0, RegisterOperand VList, - Operand ImmTy, Operand ImmOp, string asmop> - : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0, - (outs VList:$Rt, GPR64xsp:$wb), - (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, - VList:$src, ImmOp:$lane), - asmop # "\t$Rt[$lane], [$Rn], $Rm", - [], - NoItinerary>, - Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd, ReadVecLd]>; -} - -multiclass LD_Lane_WB_BHSD { - def _B_fixed : LDN_WBFx_Lane(List # "B_operand"), - uimm_b, neon_uimm4_bare, asmop> { - let Inst{12-10} = lane{2-0}; - let Inst{30} = lane{3}; - } - - def _B_register : LDN_WBReg_Lane(List # "B_operand"), - uimm_b, neon_uimm4_bare, asmop> { - let Inst{12-10} = lane{2-0}; - let Inst{30} = lane{3}; - } - - def _H_fixed : LDN_WBFx_Lane(List # "H_operand"), - uimm_h, neon_uimm3_bare, asmop> { - let Inst{12-10} = {lane{1}, lane{0}, 
0b0}; - let Inst{30} = lane{2}; - } - - def _H_register : LDN_WBReg_Lane(List # "H_operand"), - uimm_h, neon_uimm3_bare, asmop> { - let Inst{12-10} = {lane{1}, lane{0}, 0b0}; - let Inst{30} = lane{2}; - } - - def _S_fixed : LDN_WBFx_Lane(List # "S_operand"), - uimm_s, neon_uimm2_bare, asmop> { - let Inst{12-10} = {lane{0}, 0b0, 0b0}; - let Inst{30} = lane{1}; - } - - def _S_register : LDN_WBReg_Lane(List # "S_operand"), - uimm_s, neon_uimm2_bare, asmop> { - let Inst{12-10} = {lane{0}, 0b0, 0b0}; - let Inst{30} = lane{1}; - } - - def _D_fixed : LDN_WBFx_Lane(List # "D_operand"), - uimm_d, neon_uimm1_bare, asmop> { - let Inst{12-10} = 0b001; - let Inst{30} = lane{0}; - } - - def _D_register : LDN_WBReg_Lane(List # "D_operand"), - uimm_d, neon_uimm1_bare, asmop> { - let Inst{12-10} = 0b001; - let Inst{30} = lane{0}; - } -} - -// Post-index load single 1-element structure to one lane of 1 register. -defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1, - uimm_exact2, uimm_exact4, uimm_exact8>; - -// Post-index load single N-element structure to one lane of N consecutive -// registers -// (N = 2,3,4) -defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; - -let mayStore = 1, neverHasSideEffects = 1, - hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb", - DecoderMethod = "DecodeVLDSTLanePostInstruction" in { - class STN_WBFx_Lane op2_1, bit op0, RegisterOperand VList, - Operand ImmTy, Operand ImmOp, string asmop> - : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0, - (outs GPR64xsp:$wb), - (ins GPR64xsp:$Rn, ImmTy:$amt, - VList:$Rt, ImmOp:$lane), - asmop # "\t$Rt[$lane], [$Rn], $amt", - [], - NoItinerary>, - Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> { - let Rm = 0b11111; - } - - class STN_WBReg_Lane op2_1, bit op0, RegisterOperand VList, - Operand ImmTy, Operand ImmOp, string asmop> - : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0, - (outs GPR64xsp:$wb), - (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VList:$Rt, - ImmOp:$lane), - asmop # "\t$Rt[$lane], [$Rn], $Rm", - [], - NoItinerary>, - Sched<[WriteVecSt, ReadVecSt, ReadVecSt, ReadVecSt]>; -} - -multiclass ST_Lane_WB_BHSD { - def _B_fixed : STN_WBFx_Lane(List # "B_operand"), - uimm_b, neon_uimm4_bare, asmop> { - let Inst{12-10} = lane{2-0}; - let Inst{30} = lane{3}; - } - - def _B_register : STN_WBReg_Lane(List # "B_operand"), - uimm_b, neon_uimm4_bare, asmop> { - let Inst{12-10} = lane{2-0}; - let Inst{30} = lane{3}; - } - - def _H_fixed : STN_WBFx_Lane(List # "H_operand"), - uimm_h, neon_uimm3_bare, asmop> { - let Inst{12-10} = {lane{1}, lane{0}, 0b0}; - let Inst{30} = lane{2}; - } - - def _H_register : STN_WBReg_Lane(List # "H_operand"), - uimm_h, neon_uimm3_bare, asmop> { - let Inst{12-10} = {lane{1}, lane{0}, 0b0}; - let Inst{30} = lane{2}; - } - - def _S_fixed : STN_WBFx_Lane(List # "S_operand"), - uimm_s, neon_uimm2_bare, asmop> { - let Inst{12-10} = {lane{0}, 0b0, 0b0}; - let Inst{30} = lane{1}; - } - - def _S_register : STN_WBReg_Lane(List # "S_operand"), - uimm_s, neon_uimm2_bare, asmop> { - let Inst{12-10} = {lane{0}, 0b0, 0b0}; - let Inst{30} = lane{1}; - } - - def _D_fixed : STN_WBFx_Lane(List # "D_operand"), - uimm_d, neon_uimm1_bare, asmop> { - let Inst{12-10} = 0b001; - let Inst{30} = lane{0}; - } - - def _D_register : STN_WBReg_Lane(List # 
"D_operand"), - uimm_d, neon_uimm1_bare, asmop> { - let Inst{12-10} = 0b001; - let Inst{30} = lane{0}; - } -} - -// Post-index store single 1-element structure from one lane of 1 register. -defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1, - uimm_exact2, uimm_exact4, uimm_exact8>; - -// Post-index store single N-element structure from one lane of N consecutive -// registers (N = 2,3,4) -defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; - -// End of post-index load/store single N-element instructions -// (class SIMD lsone-post) - -// Neon Scalar instructions implementation -// Scalar Three Same - -class NeonI_Scalar3Same_size size, bits<5> opcode, string asmop, - RegisterClass FPRC> - : NeonI_Scalar3Same, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -class NeonI_Scalar3Same_D_size opcode, string asmop> - : NeonI_Scalar3Same_size; - -multiclass NeonI_Scalar3Same_HS_sizes opcode, string asmop, - bit Commutable = 0> { - let isCommutable = Commutable in { - def hhh : NeonI_Scalar3Same_size; - def sss : NeonI_Scalar3Same_size; - } -} - -multiclass NeonI_Scalar3Same_SD_sizes opcode, - string asmop, bit Commutable = 0> { - let isCommutable = Commutable in { - def sss : NeonI_Scalar3Same_size; - def ddd : NeonI_Scalar3Same_size; - } -} - -multiclass NeonI_Scalar3Same_BHSD_sizes opcode, - string asmop, bit Commutable = 0> { - let isCommutable = Commutable in { - def bbb : NeonI_Scalar3Same_size; - def hhh : NeonI_Scalar3Same_size; - def sss : NeonI_Scalar3Same_size; - def ddd : NeonI_Scalar3Same_size; - } -} - -multiclass Neon_Scalar3Same_D_size_patterns { - def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), - (INSTD FPR64:$Rn, FPR64:$Rm)>; -} - -multiclass Neon_Scalar3Same_BHSD_size_patterns - : Neon_Scalar3Same_D_size_patterns { - def: Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))), - (INSTB FPR8:$Rn, FPR8:$Rm)>; - def: Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (INSTH FPR16:$Rn, FPR16:$Rm)>; - def: Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (INSTS FPR32:$Rn, FPR32:$Rm)>; -} - -multiclass Neon_Scalar3Same_HS_size_patterns { - def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (INSTH FPR16:$Rn, FPR16:$Rm)>; - def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (INSTS FPR32:$Rn, FPR32:$Rm)>; -} - -multiclass Neon_Scalar3Same_SD_size_patterns { - def : Pat<(SResTy (opnode (STy FPR32:$Rn), (STy FPR32:$Rm))), - (INSTS FPR32:$Rn, FPR32:$Rm)>; - def : Pat<(DResTy (opnode (DTy FPR64:$Rn), (DTy FPR64:$Rm))), - (INSTD FPR64:$Rn, FPR64:$Rm)>; -} - -class Neon_Scalar3Same_cmp_V1_D_size_patterns - : Pat<(v1i64 (Neon_cmp (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm), CC)), - (INSTD FPR64:$Rn, FPR64:$Rm)>; - -// Scalar Three Different - -class NeonI_Scalar3Diff_size size, bits<4> opcode, string asmop, - RegisterClass FPRCD, RegisterClass FPRCS> - : NeonI_Scalar3Diff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -multiclass NeonI_Scalar3Diff_HS_size opcode, string asmop> { - def shh : NeonI_Scalar3Diff_size; - def dss : NeonI_Scalar3Diff_size; -} - -multiclass NeonI_Scalar3Diff_ml_HS_size opcode, string asmop> { - let Constraints = "$Src = $Rd" in { - def shh : NeonI_Scalar3Diff, - Sched<[WriteFPALU, ReadFPALU, 
ReadFPALU, ReadFPALU]>; - def dss : NeonI_Scalar3Diff, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]>; - } -} - -multiclass Neon_Scalar3Diff_HS_size_patterns { - def : Pat<(v1i32 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (INSTH FPR16:$Rn, FPR16:$Rm)>; - def : Pat<(v1i64 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (INSTS FPR32:$Rn, FPR32:$Rm)>; -} - -multiclass Neon_Scalar3Diff_ml_HS_size_patterns { - def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (INSTH FPR32:$Src, FPR16:$Rn, FPR16:$Rm)>; - def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (INSTS FPR64:$Src, FPR32:$Rn, FPR32:$Rm)>; -} - -// Scalar Two Registers Miscellaneous - -class NeonI_Scalar2SameMisc_size size, bits<5> opcode, string asmop, - RegisterClass FPRCD, RegisterClass FPRCS> - : NeonI_Scalar2SameMisc, - Sched<[WriteFPALU, ReadFPALU]>; - -multiclass NeonI_Scalar2SameMisc_SD_size opcode, - string asmop> { - def ss : NeonI_Scalar2SameMisc_size; - def dd : NeonI_Scalar2SameMisc_size; -} - -multiclass NeonI_Scalar2SameMisc_D_size opcode, string asmop> { - def dd : NeonI_Scalar2SameMisc_size; -} - -multiclass NeonI_Scalar2SameMisc_BHSD_size opcode, string asmop> - : NeonI_Scalar2SameMisc_D_size { - def bb : NeonI_Scalar2SameMisc_size; - def hh : NeonI_Scalar2SameMisc_size; - def ss : NeonI_Scalar2SameMisc_size; -} - -class NeonI_Scalar2SameMisc_fcvtxn_D_size opcode, string asmop> - : NeonI_Scalar2SameMisc_size; - -multiclass NeonI_Scalar2SameMisc_narrow_HSD_size opcode, - string asmop> { - def bh : NeonI_Scalar2SameMisc_size; - def hs : NeonI_Scalar2SameMisc_size; - def sd : NeonI_Scalar2SameMisc_size; -} - -class NeonI_Scalar2SameMisc_accum_size size, bits<5> opcode, - string asmop, RegisterClass FPRC> - : NeonI_Scalar2SameMisc, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -multiclass NeonI_Scalar2SameMisc_accum_BHSD_size opcode, - string asmop> { - - let Constraints = "$Src = $Rd" in { - def bb : NeonI_Scalar2SameMisc_accum_size; - def hh : NeonI_Scalar2SameMisc_accum_size; - def ss : NeonI_Scalar2SameMisc_accum_size; - def dd : NeonI_Scalar2SameMisc_accum_size; - } -} - -class Neon_Scalar2SameMisc_fcvtxn_D_size_patterns - : Pat<(f32 (opnode (f64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; - -multiclass Neon_Scalar2SameMisc_fcvt_SD_size_patterns { - def : Pat<(v1i32 (opnode (f32 FPR32:$Rn))), - (INSTS FPR32:$Rn)>; - def : Pat<(v1i64 (opnode (f64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; -} - -class Neon_Scalar2SameMisc_vcvt_D_size_patterns - : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; - -multiclass Neon_Scalar2SameMisc_cvt_SD_size_patterns { - def : Pat<(f32 (opnode (v1i32 FPR32:$Rn))), - (INSTS FPR32:$Rn)>; - def : Pat<(f64 (opnode (v1i64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; -} - -multiclass Neon_Scalar2SameMisc_SD_size_patterns { - def : Pat<(f32 (opnode (f32 FPR32:$Rn))), - (INSTS FPR32:$Rn)>; - def : Pat<(f64 (opnode (f64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; -} - -class Neon_Scalar2SameMisc_V1_D_size_patterns - : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; - -class NeonI_Scalar2SameMisc_cmpz_D_size opcode, string asmop> - : NeonI_Scalar2SameMisc, - Sched<[WriteFPALU, ReadFPALU]>; - -multiclass NeonI_Scalar2SameMisc_cmpz_SD_size opcode, - string asmop> { - def ssi : NeonI_Scalar2SameMisc, - Sched<[WriteFPALU, ReadFPALU]>; - def ddi : NeonI_Scalar2SameMisc, - Sched<[WriteFPALU, ReadFPALU]>; -} - -class Neon_Scalar2SameMisc_cmpz_D_size_patterns - : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), - (v1i64 (bitconvert 
(v8i8 Neon_AllZero))))), - (INSTD FPR64:$Rn, 0)>; - -class Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns - : Pat<(v1i64 (Neon_cmpz (v1i64 FPR64:$Rn), - (i32 neon_uimm0:$Imm), CC)), - (INSTD FPR64:$Rn, neon_uimm0:$Imm)>; - -multiclass Neon_Scalar2SameMisc_cmpz_SD_size_patterns { - def : Pat<(v1i32 (opnode (f32 FPR32:$Rn), (f32 fpzz32:$FPImm))), - (INSTS FPR32:$Rn, fpzz32:$FPImm)>; - def : Pat<(v1i64 (opnode (f64 FPR64:$Rn), (f32 fpzz32:$FPImm))), - (INSTD FPR64:$Rn, fpzz32:$FPImm)>; - def : Pat<(v1i64 (Neon_cmpz (v1f64 FPR64:$Rn), (f32 fpzz32:$FPImm), CC)), - (INSTD FPR64:$Rn, fpzz32:$FPImm)>; -} - -multiclass Neon_Scalar2SameMisc_D_size_patterns { - def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; -} - -multiclass Neon_Scalar2SameMisc_BHSD_size_patterns - : Neon_Scalar2SameMisc_D_size_patterns { - def : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn))), - (INSTB FPR8:$Rn)>; - def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn))), - (INSTH FPR16:$Rn)>; - def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn))), - (INSTS FPR32:$Rn)>; -} - -multiclass Neon_Scalar2SameMisc_narrow_HSD_size_patterns< - SDPatternOperator opnode, - Instruction INSTH, - Instruction INSTS, - Instruction INSTD> { - def : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn))), - (INSTH FPR16:$Rn)>; - def : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn))), - (INSTS FPR32:$Rn)>; - def : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; - -} - -multiclass Neon_Scalar2SameMisc_accum_BHSD_size_patterns< - SDPatternOperator opnode, - Instruction INSTB, - Instruction INSTH, - Instruction INSTS, - Instruction INSTD> { - def : Pat<(v1i8 (opnode (v1i8 FPR8:$Src), (v1i8 FPR8:$Rn))), - (INSTB FPR8:$Src, FPR8:$Rn)>; - def : Pat<(v1i16 (opnode (v1i16 FPR16:$Src), (v1i16 FPR16:$Rn))), - (INSTH FPR16:$Src, FPR16:$Rn)>; - def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i32 FPR32:$Rn))), - (INSTS FPR32:$Src, FPR32:$Rn)>; - def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn))), - (INSTD FPR64:$Src, FPR64:$Rn)>; -} - -// Scalar Shift By Immediate - -class NeonI_ScalarShiftImm_size opcode, string asmop, - RegisterClass FPRC, Operand ImmTy> - : NeonI_ScalarShiftImm, - Sched<[WriteFPALU, ReadFPALU]>; - -multiclass NeonI_ScalarShiftRightImm_D_size opcode, - string asmop> { - def ddi : NeonI_ScalarShiftImm_size { - bits<6> Imm; - let Inst{22} = 0b1; // immh:immb = 1xxxxxx - let Inst{21-16} = Imm; - } -} - -multiclass NeonI_ScalarShiftRightImm_BHSD_size opcode, - string asmop> - : NeonI_ScalarShiftRightImm_D_size { - def bbi : NeonI_ScalarShiftImm_size { - bits<3> Imm; - let Inst{22-19} = 0b0001; // immh:immb = 0001xxx - let Inst{18-16} = Imm; - } - def hhi : NeonI_ScalarShiftImm_size { - bits<4> Imm; - let Inst{22-20} = 0b001; // immh:immb = 001xxxx - let Inst{19-16} = Imm; - } - def ssi : NeonI_ScalarShiftImm_size { - bits<5> Imm; - let Inst{22-21} = 0b01; // immh:immb = 01xxxxx - let Inst{20-16} = Imm; - } -} - -multiclass NeonI_ScalarShiftLeftImm_D_size opcode, - string asmop> { - def ddi : NeonI_ScalarShiftImm_size { - bits<6> Imm; - let Inst{22} = 0b1; // immh:immb = 1xxxxxx - let Inst{21-16} = Imm; - } -} - -multiclass NeonI_ScalarShiftLeftImm_BHSD_size opcode, - string asmop> - : NeonI_ScalarShiftLeftImm_D_size { - def bbi : NeonI_ScalarShiftImm_size { - bits<3> Imm; - let Inst{22-19} = 0b0001; // immh:immb = 0001xxx - let Inst{18-16} = Imm; - } - def hhi : NeonI_ScalarShiftImm_size { - bits<4> Imm; - let Inst{22-20} = 0b001; // immh:immb = 001xxxx - let Inst{19-16} = Imm; - } - def ssi : NeonI_ScalarShiftImm_size { - bits<5> Imm; - let Inst{22-21} = 0b01; 
// immh:immb = 01xxxxx - let Inst{20-16} = Imm; - } -} - -class NeonI_ScalarShiftRightImm_accum_D_size opcode, string asmop> - : NeonI_ScalarShiftImm, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - bits<6> Imm; - let Inst{22} = 0b1; // immh:immb = 1xxxxxx - let Inst{21-16} = Imm; - let Constraints = "$Src = $Rd"; -} - -class NeonI_ScalarShiftLeftImm_accum_D_size opcode, string asmop> - : NeonI_ScalarShiftImm, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - bits<6> Imm; - let Inst{22} = 0b1; // immh:immb = 1xxxxxx - let Inst{21-16} = Imm; - let Constraints = "$Src = $Rd"; -} - -class NeonI_ScalarShiftImm_narrow_size opcode, string asmop, - RegisterClass FPRCD, RegisterClass FPRCS, - Operand ImmTy> - : NeonI_ScalarShiftImm, - Sched<[WriteFPALU, ReadFPALU]>; - -multiclass NeonI_ScalarShiftImm_narrow_HSD_size opcode, - string asmop> { - def bhi : NeonI_ScalarShiftImm_narrow_size { - bits<3> Imm; - let Inst{22-19} = 0b0001; // immh:immb = 0001xxx - let Inst{18-16} = Imm; - } - def hsi : NeonI_ScalarShiftImm_narrow_size { - bits<4> Imm; - let Inst{22-20} = 0b001; // immh:immb = 001xxxx - let Inst{19-16} = Imm; - } - def sdi : NeonI_ScalarShiftImm_narrow_size { - bits<5> Imm; - let Inst{22-21} = 0b01; // immh:immb = 01xxxxx - let Inst{20-16} = Imm; - } -} - -multiclass NeonI_ScalarShiftImm_cvt_SD_size opcode, string asmop> { - def ssi : NeonI_ScalarShiftImm_size { - bits<5> Imm; - let Inst{22-21} = 0b01; // immh:immb = 01xxxxx - let Inst{20-16} = Imm; - } - def ddi : NeonI_ScalarShiftImm_size { - bits<6> Imm; - let Inst{22} = 0b1; // immh:immb = 1xxxxxx - let Inst{21-16} = Imm; - } -} - -multiclass Neon_ScalarShiftRImm_D_size_patterns { - def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), - (INSTD FPR64:$Rn, imm:$Imm)>; -} - -multiclass Neon_ScalarShiftLImm_D_size_patterns { - def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shl_imm64:$Imm))), - (INSTD FPR64:$Rn, imm:$Imm)>; -} - -class Neon_ScalarShiftLImm_V1_D_size_patterns - : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), - (v1i64 (Neon_vdup (i32 shl_imm64:$Imm))))), - (INSTD FPR64:$Rn, imm:$Imm)>; - -class Neon_ScalarShiftRImm_V1_D_size_patterns - : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), - (v1i64 (Neon_vdup (i32 shr_imm64:$Imm))))), - (INSTD FPR64:$Rn, imm:$Imm)>; - -multiclass Neon_ScalarShiftLImm_BHSD_size_patterns - : Neon_ScalarShiftLImm_D_size_patterns { - def bbi : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (i32 shl_imm8:$Imm))), - (INSTB FPR8:$Rn, imm:$Imm)>; - def hhi : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (i32 shl_imm16:$Imm))), - (INSTH FPR16:$Rn, imm:$Imm)>; - def ssi : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (i32 shl_imm32:$Imm))), - (INSTS FPR32:$Rn, imm:$Imm)>; -} - -class Neon_ScalarShiftLImm_accum_D_size_patterns - : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn), - (i32 shl_imm64:$Imm))), - (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>; - -class Neon_ScalarShiftRImm_accum_D_size_patterns - : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn), - (i32 shr_imm64:$Imm))), - (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>; - -multiclass Neon_ScalarShiftImm_narrow_HSD_size_patterns< - SDPatternOperator opnode, - Instruction INSTH, - Instruction INSTS, - Instruction INSTD> { - def bhi : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn), (i32 shr_imm16:$Imm))), - (INSTH FPR16:$Rn, imm:$Imm)>; - def hsi : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))), - (INSTS FPR32:$Rn, imm:$Imm)>; - def sdi : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), - (INSTD FPR64:$Rn, imm:$Imm)>; -} - -multiclass 
Neon_ScalarShiftImm_scvtf_SD_size_patterns<SDPatternOperator opnode,
-                                           Instruction INSTS,
-                                           Instruction INSTD> {
-  def ssi : Pat<(f32 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
-                (INSTS FPR32:$Rn, imm:$Imm)>;
-  def ddi : Pat<(f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
-                (INSTD FPR64:$Rn, imm:$Imm)>;
-}
-
-multiclass Neon_ScalarShiftImm_fcvts_SD_size_patterns<SDPatternOperator opnode,
-                                                      Instruction INSTS,
-                                                      Instruction INSTD> {
-  def ssi : Pat<(v1i32 (opnode (f32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
-                (INSTS FPR32:$Rn, imm:$Imm)>;
-  def ddi : Pat<(v1i64 (opnode (f64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
-                (INSTD FPR64:$Rn, imm:$Imm)>;
-}
-
-// Scalar Signed Shift Right (Immediate)
-defm SSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00000, "sshr">;
-defm : Neon_ScalarShiftRImm_D_size_patterns;
-// Pattern to match llvm.arm.* intrinsic.
-def : Neon_ScalarShiftRImm_V1_D_size_patterns<sra, SSHRddi>;
-
-// Scalar Unsigned Shift Right (Immediate)
-defm USHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00000, "ushr">;
-defm : Neon_ScalarShiftRImm_D_size_patterns;
-// Pattern to match llvm.arm.* intrinsic.
-def : Neon_ScalarShiftRImm_V1_D_size_patterns<srl, USHRddi>;
-
-// Scalar Signed Rounding Shift Right (Immediate)
-defm SRSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00100, "srshr">;
-defm : Neon_ScalarShiftRImm_D_size_patterns;
-
-// Scalar Unsigned Rounding Shift Right (Immediate)
-defm URSHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00100, "urshr">;
-defm : Neon_ScalarShiftRImm_D_size_patterns;
-
-// Scalar Signed Shift Right and Accumulate (Immediate)
-def SSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00010, "ssra">;
-def : Neon_ScalarShiftRImm_accum_D_size_patterns
-          ;
-
-// Scalar Unsigned Shift Right and Accumulate (Immediate)
-def USRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00010, "usra">;
-def : Neon_ScalarShiftRImm_accum_D_size_patterns
-          ;
-
-// Scalar Signed Rounding Shift Right and Accumulate (Immediate)
-def SRSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00110, "srsra">;
-def : Neon_ScalarShiftRImm_accum_D_size_patterns
-          ;
-
-// Scalar Unsigned Rounding Shift Right and Accumulate (Immediate)
-def URSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00110, "ursra">;
-def : Neon_ScalarShiftRImm_accum_D_size_patterns
-          ;
-
-// Scalar Shift Left (Immediate)
-defm SHL : NeonI_ScalarShiftLeftImm_D_size<0b0, 0b01010, "shl">;
-defm : Neon_ScalarShiftLImm_D_size_patterns;
-// Pattern to match llvm.arm.* intrinsic.
-def : Neon_ScalarShiftLImm_V1_D_size_patterns<shl, SHLddi>;
-
-// Signed Saturating Shift Left (Immediate)
-defm SQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b0, 0b01110, "sqshl">;
-defm : Neon_ScalarShiftLImm_BHSD_size_patterns;
-// Pattern to match llvm.arm.* intrinsic.
-defm : Neon_ScalarShiftLImm_D_size_patterns;
-
-// Unsigned Saturating Shift Left (Immediate)
-defm UQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01110, "uqshl">;
-defm : Neon_ScalarShiftLImm_BHSD_size_patterns;
-// Pattern to match llvm.arm.* intrinsic.
-defm : Neon_ScalarShiftLImm_D_size_patterns; - -// Signed Saturating Shift Left Unsigned (Immediate) -defm SQSHLU : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01100, "sqshlu">; -defm : Neon_ScalarShiftLImm_BHSD_size_patterns; - -// Shift Right And Insert (Immediate) -def SRI : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b01000, "sri">; -def : Neon_ScalarShiftRImm_accum_D_size_patterns - ; - -// Shift Left And Insert (Immediate) -def SLI : NeonI_ScalarShiftLeftImm_accum_D_size<0b1, 0b01010, "sli">; -def : Neon_ScalarShiftLImm_accum_D_size_patterns - ; - -// Signed Saturating Shift Right Narrow (Immediate) -defm SQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10010, "sqshrn">; -defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; - -// Unsigned Saturating Shift Right Narrow (Immediate) -defm UQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10010, "uqshrn">; -defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; - -// Signed Saturating Rounded Shift Right Narrow (Immediate) -defm SQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10011, "sqrshrn">; -defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; - -// Unsigned Saturating Rounded Shift Right Narrow (Immediate) -defm UQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10011, "uqrshrn">; -defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; - -// Signed Saturating Shift Right Unsigned Narrow (Immediate) -defm SQSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10000, "sqshrun">; -defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; - -// Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate) -defm SQRSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10001, "sqrshrun">; -defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; - -// Scalar Signed Fixed-point Convert To Floating-Point (Immediate) -defm SCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11100, "scvtf">; -defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns; - -// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate) -defm UCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11100, "ucvtf">; -defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns; - -// Scalar Floating-point Convert To Signed Fixed-point (Immediate) -defm FCVTZS_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11111, "fcvtzs">; -defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns; - -// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate) -defm FCVTZU_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11111, "fcvtzu">; -defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns; - -// Patterns For Convert Instructions Between v1f64 and v1i64 -class Neon_ScalarShiftImm_cvtf_v1f64_pattern - : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), - (INST FPR64:$Rn, imm:$Imm)>; - -class Neon_ScalarShiftImm_fcvt_v1f64_pattern - : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))), - (INST FPR64:$Rn, imm:$Imm)>; - -def : Neon_ScalarShiftImm_cvtf_v1f64_pattern; - -def : Neon_ScalarShiftImm_cvtf_v1f64_pattern; - -def : Neon_ScalarShiftImm_fcvt_v1f64_pattern; - -def : Neon_ScalarShiftImm_fcvt_v1f64_pattern; - -// Scalar Integer Add -let isCommutable = 1 in { -def ADDddd : NeonI_Scalar3Same_D_size<0b0, 0b10000, "add">; -} - -// Scalar Integer Sub -def SUBddd : NeonI_Scalar3Same_D_size<0b1, 0b10000, "sub">; - -// Pattern for Scalar Integer Add and Sub with D register only -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; - -// Patterns to match llvm.aarch64.* intrinsic for Scalar Add, Sub -defm : Neon_Scalar3Same_D_size_patterns; -defm : 
Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; - -// Scalar Integer Saturating Add (Signed, Unsigned) -defm SQADD : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00001, "sqadd", 1>; -defm UQADD : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00001, "uqadd", 1>; - -// Scalar Integer Saturating Sub (Signed, Unsigned) -defm SQSUB : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00101, "sqsub", 0>; -defm UQSUB : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00101, "uqsub", 0>; - - -// Patterns to match llvm.aarch64.* intrinsic for -// Scalar Integer Saturating Add, Sub (Signed, Unsigned) -defm : Neon_Scalar3Same_BHSD_size_patterns; -defm : Neon_Scalar3Same_BHSD_size_patterns; -defm : Neon_Scalar3Same_BHSD_size_patterns; -defm : Neon_Scalar3Same_BHSD_size_patterns; - -// Scalar Integer Saturating Doubling Multiply Half High -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in -defm SQDMULH : NeonI_Scalar3Same_HS_sizes<0b0, 0b10110, "sqdmulh", 1>; - -// Scalar Integer Saturating Rounding Doubling Multiply Half High -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -defm SQRDMULH : NeonI_Scalar3Same_HS_sizes<0b1, 0b10110, "sqrdmulh", 1>; -} - -// Patterns to match llvm.arm.* intrinsic for -// Scalar Integer Saturating Doubling Multiply Half High and -// Scalar Integer Saturating Rounding Doubling Multiply Half High -defm : Neon_Scalar3Same_HS_size_patterns; -defm : Neon_Scalar3Same_HS_size_patterns; - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in { -// Scalar Floating-point Multiply Extended -defm FMULX : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11011, "fmulx", 1>; -} - -// Scalar Floating-point Reciprocal Step -defm FRECPS : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11111, "frecps", 0>; -defm : Neon_Scalar3Same_SD_size_patterns; -def : Pat<(v1f64 (int_arm_neon_vrecps (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FRECPSddd FPR64:$Rn, FPR64:$Rm)>; - -// Scalar Floating-point Reciprocal Square Root Step -defm FRSQRTS : NeonI_Scalar3Same_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", 0>; -defm : Neon_Scalar3Same_SD_size_patterns; -def : Pat<(v1f64 (int_arm_neon_vrsqrts (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FRSQRTSddd FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (fsqrt (v1f64 FPR64:$Rn))), (FSQRTdd FPR64:$Rn)>; - -// Patterns to match llvm.aarch64.* intrinsic for -// Scalar Floating-point Multiply Extended, -multiclass Neon_Scalar3Same_MULX_SD_size_patterns { - def : Pat<(f32 (opnode (f32 FPR32:$Rn), (f32 FPR32:$Rm))), - (INSTS FPR32:$Rn, FPR32:$Rm)>; - def : Pat<(f64 (opnode (f64 FPR64:$Rn), (f64 FPR64:$Rm))), - (INSTD FPR64:$Rn, FPR64:$Rm)>; -} - -defm : Neon_Scalar3Same_MULX_SD_size_patterns; -def : Pat<(v1f64 (int_aarch64_neon_vmulx (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FMULXddd FPR64:$Rn, FPR64:$Rm)>; - -// Scalar Integer Shift Left (Signed, Unsigned) -def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">; -def USHLddd : NeonI_Scalar3Same_D_size<0b1, 0b01000, "ushl">; - -// Patterns to match llvm.arm.* intrinsic for -// Scalar Integer Shift Left (Signed, Unsigned) -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; - -// Patterns to match llvm.aarch64.* intrinsic for -// Scalar Integer Shift Left (Signed, Unsigned) -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; - -// Scalar Integer Saturating Shift Left (Signed, Unsigned) -defm SQSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01001, "sqshl", 0>; -defm UQSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01001, "uqshl", 0>; - -// 
Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Saturating Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_BHSD_size_patterns;
-defm : Neon_Scalar3Same_BHSD_size_patterns;
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Saturating Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns;
-defm : Neon_Scalar3Same_D_size_patterns;
-
-// Scalar Integer Rounding Shift Left (Signed, Unsigned)
-def SRSHLddd: NeonI_Scalar3Same_D_size<0b0, 0b01010, "srshl">;
-def URSHLddd: NeonI_Scalar3Same_D_size<0b1, 0b01010, "urshl">;
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns;
-defm : Neon_Scalar3Same_D_size_patterns;
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns;
-defm : Neon_Scalar3Same_D_size_patterns;
-
-// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
-defm SQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01011, "sqrshl", 0>;
-defm UQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01011, "uqrshl", 0>;
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_BHSD_size_patterns;
-defm : Neon_Scalar3Same_BHSD_size_patterns;
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns;
-defm : Neon_Scalar3Same_D_size_patterns;
-
-let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC] in {
-// Signed Saturating Doubling Multiply-Add Long
-defm SQDMLAL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1001, "sqdmlal">;
-}
-defm : Neon_Scalar3Diff_ml_HS_size_patterns;
-
-// Signed Saturating Doubling Multiply-Subtract Long
-let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC] in {
-defm SQDMLSL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1011, "sqdmlsl">;
-}
-defm : Neon_Scalar3Diff_ml_HS_size_patterns;
-
-// Signed Saturating Doubling Multiply Long
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in {
-defm SQDMULL : NeonI_Scalar3Diff_HS_size<0b0, 0b1101, "sqdmull">;
-}
-defm : Neon_Scalar3Diff_HS_size_patterns;
-
-// Scalar Signed Integer Convert To Floating-point
-defm SCVTF : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11101, "scvtf">;
-defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns;
-
-// Scalar Unsigned Integer Convert To Floating-point
-defm UCVTF : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11101, "ucvtf">;
-defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns;
-
-// Scalar Floating-point Converts
-def FCVTXN : NeonI_Scalar2SameMisc_fcvtxn_D_size<0b1, 0b10110, "fcvtxn">;
-def : Neon_Scalar2SameMisc_fcvtxn_D_size_patterns;
-
-defm FCVTNS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11010, "fcvtns">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns;
-
-defm FCVTNU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11010, "fcvtnu">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns;
-
-defm FCVTMS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11011, "fcvtms">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns;
-
-defm FCVTMU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11011, "fcvtmu">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns;
-
-defm FCVTAS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11100, "fcvtas">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns;
-
-defm FCVTAU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11100, "fcvtau">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns;
-
-defm FCVTPS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11010, "fcvtps">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns;
-
-defm FCVTPU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11010, "fcvtpu">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns;
-
-defm FCVTZS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11011, "fcvtzs">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns;
-
-defm FCVTZU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11011, "fcvtzu">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns;
-
-// Patterns For Convert Instructions Between v1f64 and v1i64
-class Neon_Scalar2SameMisc_cvtf_v1f64_pattern<SDPatternOperator opnode,
-                                              Instruction INST>
-  : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn))), (INST FPR64:$Rn)>;
-
-class Neon_Scalar2SameMisc_fcvt_v1f64_pattern<SDPatternOperator opnode,
-                                              Instruction INST>
-  : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
-
-def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern;
-def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern;
-
-def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern;
-def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern;
-
-// Scalar Floating-point Reciprocal Estimate
-defm FRECPE : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11101, "frecpe">;
-defm : Neon_Scalar2SameMisc_SD_size_patterns;
-def : Neon_Scalar2SameMisc_V1_D_size_patterns;
-
-// Scalar Floating-point Reciprocal Exponent
-defm FRECPX : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11111, "frecpx">;
-defm : Neon_Scalar2SameMisc_SD_size_patterns;
-
-// Scalar Floating-point Reciprocal Square Root Estimate
-defm FRSQRTE: NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11101, "frsqrte">;
-defm : Neon_Scalar2SameMisc_SD_size_patterns;
-def : Neon_Scalar2SameMisc_V1_D_size_patterns;
-
-// Scalar Floating-point Round
-class Neon_ScalarFloatRound_pattern<SDPatternOperator opnode, Instruction INST>
-  : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
-
-def : Neon_ScalarFloatRound_pattern;
-def : Neon_ScalarFloatRound_pattern;
-def : Neon_ScalarFloatRound_pattern;
-def : Neon_ScalarFloatRound_pattern;
-def : Neon_ScalarFloatRound_pattern;
-def : Neon_ScalarFloatRound_pattern;
-def : Neon_ScalarFloatRound_pattern;
-
-// Scalar Integer Compare
-
-// Scalar Compare Bitwise Equal
-def CMEQddd: NeonI_Scalar3Same_D_size<0b1, 0b10001, "cmeq">;
-defm : Neon_Scalar3Same_D_size_patterns;
-
-class Neon_Scalar3Same_cmp_D_size_v1_patterns<SDPatternOperator opnode,
-                                              Instruction INSTD,
-                                              CondCode CC>
-  : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm), CC)),
-        (INSTD FPR64:$Rn, FPR64:$Rm)>;
-
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns;
-
-// Scalar Compare Signed Greater Than Or Equal
-def CMGEddd: NeonI_Scalar3Same_D_size<0b0, 0b00111, "cmge">;
-defm : Neon_Scalar3Same_D_size_patterns;
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns;
-
-// Scalar Compare Unsigned Higher Or Same
-def CMHSddd: NeonI_Scalar3Same_D_size<0b1, 0b00111, "cmhs">;
-defm : Neon_Scalar3Same_D_size_patterns;
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns;
-
-// Scalar Compare Unsigned Higher
-def CMHIddd: NeonI_Scalar3Same_D_size<0b1, 0b00110, "cmhi">;
-defm : Neon_Scalar3Same_D_size_patterns;
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns;
-
-// Scalar Compare Signed Greater Than
-def CMGTddd: NeonI_Scalar3Same_D_size<0b0, 0b00110, "cmgt">;
-defm : Neon_Scalar3Same_D_size_patterns;
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns;
-
-// Scalar Compare Bitwise Test Bits
-def CMTSTddd: NeonI_Scalar3Same_D_size<0b0, 0b10001, "cmtst">;
-defm : Neon_Scalar3Same_D_size_patterns;
-defm : Neon_Scalar3Same_D_size_patterns;
-
-// Scalar Compare Bitwise Equal To Zero
-def CMEQddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01001, "cmeq">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns;
-
-// Scalar Compare Signed Greater Than Or Equal To Zero
-def CMGEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01000, "cmge">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns;
-
-// Scalar Compare Signed Greater Than Zero
-def CMGTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01000, "cmgt">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns;
-
-// Scalar Compare Signed Less Than Or Equal To Zero
-def CMLEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01001, "cmle">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns;
-
-// Scalar Compare Less Than Zero
-def CMLTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01010, "cmlt">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns;
-
-// Scalar Floating-point Compare
-
-// Scalar Floating-point Compare Mask Equal
-defm FCMEQ: NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11100, "fcmeq">;
-defm : Neon_Scalar3Same_SD_size_patterns;
-def : Neon_Scalar3Same_cmp_V1_D_size_patterns;
-
-// Scalar Floating-point Compare Mask Equal To Zero
-defm FCMEQZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01101, "fcmeq">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns;
-
-// Scalar Floating-point Compare Mask Greater Than Or Equal
-defm FCMGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11100, "fcmge">;
-defm : Neon_Scalar3Same_SD_size_patterns;
-def : Neon_Scalar3Same_cmp_V1_D_size_patterns;
-
-// Scalar Floating-point Compare Mask Greater Than Or Equal To Zero
-defm FCMGEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01100, "fcmge">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns;
-
-// Scalar Floating-point Compare Mask Greater Than
-defm FCMGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11100, "fcmgt">;
-defm : Neon_Scalar3Same_SD_size_patterns;
-def : Neon_Scalar3Same_cmp_V1_D_size_patterns;
-
-// Scalar Floating-point Compare Mask Greater Than Zero
-defm FCMGTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01100, "fcmgt">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns;
-
-// Scalar Floating-point Compare Mask Less Than Or Equal To Zero
-defm FCMLEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01101, "fcmle">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns;
-
-// Scalar Floating-point Compare Mask Less Than Zero
-defm FCMLTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01110, "fcmlt">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns;
-
-// Scalar Floating-point Absolute Compare Mask Greater Than Or Equal
-defm FACGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11101, "facge">;
-defm : Neon_Scalar3Same_SD_size_patterns;
-def : Pat<(v1i64 (int_arm_neon_vacge (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FACGEddd FPR64:$Rn, FPR64:$Rm)>;
-
-// Scalar Floating-point Absolute Compare Mask Greater Than
-defm FACGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11101, "facgt">;
-defm : Neon_Scalar3Same_SD_size_patterns;
-def :
Pat<(v1i64 (int_arm_neon_vacgt (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FACGTddd FPR64:$Rn, FPR64:$Rm)>; - -// Scalar Floating-point Absolute Difference -defm FABD: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11010, "fabd">; -defm : Neon_Scalar3Same_SD_size_patterns; - -// Scalar Absolute Value -defm ABS : NeonI_Scalar2SameMisc_D_size<0b0, 0b01011, "abs">; -defm : Neon_Scalar2SameMisc_D_size_patterns; - -// Scalar Signed Saturating Absolute Value -defm SQABS : NeonI_Scalar2SameMisc_BHSD_size<0b0, 0b00111, "sqabs">; -defm : Neon_Scalar2SameMisc_BHSD_size_patterns; - -// Scalar Negate -defm NEG : NeonI_Scalar2SameMisc_D_size<0b1, 0b01011, "neg">; -defm : Neon_Scalar2SameMisc_D_size_patterns; - -// Scalar Signed Saturating Negate -defm SQNEG : NeonI_Scalar2SameMisc_BHSD_size<0b1, 0b00111, "sqneg">; -defm : Neon_Scalar2SameMisc_BHSD_size_patterns; - -// Scalar Signed Saturating Accumulated of Unsigned Value -defm SUQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b0, 0b00011, "suqadd">; -defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns; - -// Scalar Unsigned Saturating Accumulated of Signed Value -defm USQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b1, 0b00011, "usqadd">; -defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns; - -def : Pat<(v1i64 (int_aarch64_neon_suqadd (v1i64 FPR64:$Src), - (v1i64 FPR64:$Rn))), - (SUQADDdd FPR64:$Src, FPR64:$Rn)>; - -def : Pat<(v1i64 (int_aarch64_neon_usqadd (v1i64 FPR64:$Src), - (v1i64 FPR64:$Rn))), - (USQADDdd FPR64:$Src, FPR64:$Rn)>; - -def : Pat<(v1i64 (int_arm_neon_vabs (v1i64 FPR64:$Rn))), - (ABSdd FPR64:$Rn)>; - -def : Pat<(v1i64 (int_arm_neon_vqabs (v1i64 FPR64:$Rn))), - (SQABSdd FPR64:$Rn)>; - -def : Pat<(v1i64 (int_arm_neon_vqneg (v1i64 FPR64:$Rn))), - (SQNEGdd FPR64:$Rn)>; - -def : Pat<(v1i64 (sub (v1i64 (bitconvert (v8i8 Neon_AllZero))), - (v1i64 FPR64:$Rn))), - (NEGdd FPR64:$Rn)>; - -// Scalar Signed Saturating Extract Unsigned Narrow -defm SQXTUN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10010, "sqxtun">; -defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns; - -// Scalar Signed Saturating Extract Narrow -defm SQXTN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b0, 0b10100, "sqxtn">; -defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns; - -// Scalar Unsigned Saturating Extract Narrow -defm UQXTN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10100, "uqxtn">; -defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns; - -// Scalar Reduce Pairwise - -multiclass NeonI_ScalarPair_D_sizes opcode, - string asmop, bit Commutable = 0> { - let isCommutable = Commutable in { - def _D_2D : NeonI_ScalarPair, - Sched<[WriteFPALU, ReadFPALU]>; - } -} - -multiclass NeonI_ScalarPair_SD_sizes opcode, - string asmop, bit Commutable = 0> - : NeonI_ScalarPair_D_sizes { - let isCommutable = Commutable in { - def _S_2S : NeonI_ScalarPair, - Sched<[WriteFPALU, ReadFPALU]>; - } -} - -// Scalar Reduce Addition Pairwise (Integer) with -// Pattern to match llvm.arm.* intrinsic -defm ADDPvv : NeonI_ScalarPair_D_sizes<0b0, 0b1, 0b11011, "addp", 0>; - -// Pattern to match llvm.aarch64.* intrinsic for -// Scalar Reduce Addition Pairwise (Integer) -def : Pat<(v1i64 (int_aarch64_neon_vpadd (v2i64 VPR128:$Rn))), - (ADDPvv_D_2D VPR128:$Rn)>; -def : Pat<(v1i64 (int_aarch64_neon_vaddv (v2i64 VPR128:$Rn))), - (ADDPvv_D_2D VPR128:$Rn)>; - -// Scalar Reduce Addition Pairwise (Floating Point) -defm FADDPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01101, "faddp", 0>; - -// Scalar Reduce Maximum Pairwise (Floating Point) -defm FMAXPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01111, "fmaxp", 0>; 
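A sketch (not from the patch) of what the scalar pairwise reductions compute:
each op folds the two elements of a vector into one scalar result, which is
why the v2i64 int_aarch64_neon_vpadd pattern above can map onto ADDPvv_D_2D.
In C with ACLE intrinsics (function name is illustrative):

    #include <arm_neon.h>

    float horizontal_add(float32x2_t v) {
        return vpadds_f32(v); /* faddp s0, v0.2s : lane0 + lane1 */
    }
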
-
-// Scalar Reduce Minimum Pairwise (Floating Point)
-defm FMINPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01111, "fminp", 0>;
-
-// Scalar Reduce maxNum Pairwise (Floating Point)
-defm FMAXNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01100, "fmaxnmp", 0>;
-
-// Scalar Reduce minNum Pairwise (Floating Point)
-defm FMINNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01100, "fminnmp", 0>;
-
-multiclass Neon_ScalarPair_SD_size_patterns<SDPatternOperator opnode,
-                                            Instruction INSTS,
-                                            Instruction INSTD> {
-  def : Pat<(f32 (opnode (v2f32 VPR64:$Rn))),
-            (INSTS VPR64:$Rn)>;
-  def : Pat<(f64 (opnode (v2f64 VPR128:$Rn))),
-            (INSTD VPR128:$Rn)>;
-}
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Reduce Add, Max, Min, MaxNum, MinNum Pairwise (Floating Point)
-defm : Neon_ScalarPair_SD_size_patterns;
-
-defm : Neon_ScalarPair_SD_size_patterns;
-
-defm : Neon_ScalarPair_SD_size_patterns;
-
-defm : Neon_ScalarPair_SD_size_patterns;
-
-defm : Neon_ScalarPair_SD_size_patterns;
-
-def : Pat<(f32 (int_aarch64_neon_vpfadd (v4f32 VPR128:$Rn))),
-          (FADDPvv_S_2S (v2f32
-              (EXTRACT_SUBREG
-                  (v4f32 (FADDP_4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rn))),
-                  sub_64)))>;
-
-// Scalar by element Arithmetic
-
-class NeonI_ScalarXIndexedElemArith<string asmop, bits<4> opcode,
-                                    string rmlane, bit u, bit szhi, bit szlo,
-                                    RegisterClass ResFPR, RegisterClass OpFPR,
-                                    RegisterOperand OpVPR, Operand OpImm>
-  : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode,
-                             (outs ResFPR:$Rd),
-                             (ins OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
-                             asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
-                             [],
-                             NoItinerary>,
-    Sched<[WriteFPMul, ReadFPMul, ReadFPMul]> {
-  bits<3> Imm;
-  bits<5> MRm;
-}
-
-class NeonI_ScalarXIndexedElemArith_Constraint_Impl<string asmop,
-                                                    bits<4> opcode,
-                                                    string rmlane,
-                                                    bit u, bit szhi, bit szlo,
-                                                    RegisterClass ResFPR,
-                                                    RegisterClass OpFPR,
-                                                    RegisterOperand OpVPR,
-                                                    Operand OpImm>
-  : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode,
-                             (outs ResFPR:$Rd),
-                             (ins ResFPR:$src, OpFPR:$Rn, OpVPR:$MRm,
-                                  OpImm:$Imm),
-                             asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
-                             [],
-                             NoItinerary>,
-    Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> {
-  let Constraints = "$src = $Rd";
-  bits<3> Imm;
-  bits<5> MRm;
-}
-
-// Scalar Floating Point multiply (scalar, by element)
-def FMULssv_4S : NeonI_ScalarXIndexedElemArith<"fmul",
-  0b1001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
-  let Inst{11} = Imm{1}; // h
-  let Inst{21} = Imm{0}; // l
-  let Inst{20-16} = MRm;
-}
-def FMULddv_2D : NeonI_ScalarXIndexedElemArith<"fmul",
-  0b1001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
-  let Inst{11} = Imm{0}; // h
-  let Inst{21} = 0b0;    // l
-  let Inst{20-16} = MRm;
-}
-
-// Scalar Floating Point multiply extended (scalar, by element)
-def FMULXssv_4S : NeonI_ScalarXIndexedElemArith<"fmulx",
-  0b1001, ".s", 0b1, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
-  let Inst{11} = Imm{1}; // h
-  let Inst{21} = Imm{0}; // l
-  let Inst{20-16} = MRm;
-}
-def FMULXddv_2D : NeonI_ScalarXIndexedElemArith<"fmulx",
-  0b1001, ".d", 0b1, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
-  let Inst{11} = Imm{0}; // h
-  let Inst{21} = 0b0;    // l
-  let Inst{20-16} = MRm;
-}
-
-multiclass Neon_ScalarXIndexedElem_MUL_MULX_Patterns<
-  SDPatternOperator opnode,
-  Instruction INST,
-  ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm,
-  ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
-
-  def : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
-              (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)))),
-            (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
-
-  def : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
-              (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)))),
-            (ResTy (INST (ResTy FPRC:$Rn),
-              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
-              OpNImm:$Imm))>;
-
-  // swapped operands
-  def : Pat<(ResTy (opnode
-              (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
-              (ResTy FPRC:$Rn))),
-            (ResTy (INST (ResTy FPRC:$Rn), (OpTy
VPR128:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (opnode - (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), - (ResTy FPRC:$Rn))), - (ResTy (INST (ResTy FPRC:$Rn), - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), - OpNImm:$Imm))>; -} - -// Patterns for Scalar Floating Point multiply (scalar, by element) -defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; - -// Patterns for Scalar Floating Point multiply extended (scalar, by element) -defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; - -// Scalar Floating Point fused multiply-add (scalar, by element) -def FMLAssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla", - 0b0001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def FMLAddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla", - 0b0001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { - let Inst{11} = Imm{0}; // h - let Inst{21} = 0b0; // l - let Inst{20-16} = MRm; -} - -// Scalar Floating Point fused multiply-subtract (scalar, by element) -def FMLSssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls", - 0b0101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def FMLSddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls", - 0b0101, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { - let Inst{11} = Imm{0}; // h - let Inst{21} = 0b0; // l - let Inst{20-16} = MRm; -} -// We are allowed to match the fma instruction regardless of compile options. -multiclass Neon_ScalarXIndexedElem_FMA_Patterns< - Instruction FMLAI, Instruction FMLSI, - ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm, - ValueType OpNTy, ValueType ExTy, Operand OpNImm> { - // fmla - def : Pat<(ResTy (fma (ResTy FPRC:$Rn), - (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)), - (ResTy FPRC:$Ra))), - (ResTy (FMLAI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (fma (ResTy FPRC:$Rn), - (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), - (ResTy FPRC:$Ra))), - (ResTy (FMLAI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), - OpNImm:$Imm))>; - - // swapped fmla operands - def : Pat<(ResTy (fma - (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)), - (ResTy FPRC:$Rn), - (ResTy FPRC:$Ra))), - (ResTy (FMLAI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (fma - (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), - (ResTy FPRC:$Rn), - (ResTy FPRC:$Ra))), - (ResTy (FMLAI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), - OpNImm:$Imm))>; - - // fmls - def : Pat<(ResTy (fma (ResTy FPRC:$Rn), - (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))), - (ResTy FPRC:$Ra))), - (ResTy (FMLSI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (fma (ResTy FPRC:$Rn), - (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))), - (ResTy FPRC:$Ra))), - (ResTy (FMLSI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), - OpNImm:$Imm))>; - - // swapped fmls operands - def : Pat<(ResTy (fma - (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))), - (ResTy 
FPRC:$Rn), - (ResTy FPRC:$Ra))), - (ResTy (FMLSI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (fma - (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))), - (ResTy FPRC:$Rn), - (ResTy FPRC:$Ra))), - (ResTy (FMLSI (ResTy FPRC:$Ra), - (ResTy FPRC:$Rn), - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), - OpNImm:$Imm))>; -} - -// Scalar Floating Point fused multiply-add and -// multiply-subtract (scalar, by element) -defm : Neon_ScalarXIndexedElem_FMA_Patterns; -defm : Neon_ScalarXIndexedElem_FMA_Patterns; -defm : Neon_ScalarXIndexedElem_FMA_Patterns; - -// Scalar Signed saturating doubling multiply long (scalar, by element) -def SQDMULLshv_4H : NeonI_ScalarXIndexedElemArith<"sqdmull", - 0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMULLshv_8H : NeonI_ScalarXIndexedElemArith<"sqdmull", - 0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { - let Inst{11} = Imm{2}; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMULLdsv_2S : NeonI_ScalarXIndexedElemArith<"sqdmull", - 0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def SQDMULLdsv_4S : NeonI_ScalarXIndexedElemArith<"sqdmull", - 0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} - -multiclass Neon_ScalarXIndexedElem_MUL_Patterns< - SDPatternOperator opnode, - Instruction INST, - ValueType ResTy, RegisterClass FPRC, - ValueType OpVTy, ValueType OpTy, - ValueType VecOpTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> { - - def : Pat<(ResTy (opnode (OpVTy FPRC:$Rn), - (OpVTy (scalar_to_vector - (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))))), - (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (opnode (OpVTy FPRC:$Rn), - (OpVTy (extract_subvector (VecOpTy VPRC:$MRm), OpImm:$Imm)))), - (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>; - - //swapped operands - def : Pat<(ResTy (opnode - (OpVTy (scalar_to_vector - (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))), - (OpVTy FPRC:$Rn))), - (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (opnode - (OpVTy (extract_subvector (VecOpTy VPRC:$MRm), OpImm:$Imm)), - (OpVTy FPRC:$Rn))), - (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>; -} - - -// Patterns for Scalar Signed saturating doubling -// multiply long (scalar, by element) -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; - -// Scalar Signed saturating doubling multiply-add long (scalar, by element) -def SQDMLALshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", - 0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMLALshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", - 0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { - let Inst{11} = Imm{2}; // h - let Inst{21} = Imm{1}; // l - let Inst{20} 
= Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMLALdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", - 0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def SQDMLALdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", - 0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} - -// Scalar Signed saturating doubling -// multiply-subtract long (scalar, by element) -def SQDMLSLshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", - 0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMLSLshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", - 0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { - let Inst{11} = Imm{2}; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMLSLdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", - 0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def SQDMLSLdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", - 0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} - -multiclass Neon_ScalarXIndexedElem_MLAL_Patterns< - SDPatternOperator opnode, - SDPatternOperator coreopnode, - Instruction INST, - ValueType ResTy, RegisterClass ResFPRC, RegisterClass FPRC, - ValueType OpTy, - ValueType OpVTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> { - - def : Pat<(ResTy (opnode - (ResTy ResFPRC:$Ra), - (ResTy (coreopnode (OpTy FPRC:$Rn), - (OpTy (scalar_to_vector - (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))))))), - (ResTy (INST (ResTy ResFPRC:$Ra), - (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (opnode - (ResTy ResFPRC:$Ra), - (ResTy (coreopnode (OpTy FPRC:$Rn), - (OpTy (extract_subvector (OpVTy VPRC:$MRm), OpImm:$Imm)))))), - (ResTy (INST (ResTy ResFPRC:$Ra), - (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>; - - // swapped operands - def : Pat<(ResTy (opnode - (ResTy ResFPRC:$Ra), - (ResTy (coreopnode - (OpTy (scalar_to_vector - (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))), - (OpTy FPRC:$Rn))))), - (ResTy (INST (ResTy ResFPRC:$Ra), - (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>; - - def : Pat<(ResTy (opnode - (ResTy ResFPRC:$Ra), - (ResTy (coreopnode - (OpTy (extract_subvector (OpVTy VPRC:$MRm), OpImm:$Imm)), - (OpTy FPRC:$Rn))))), - (ResTy (INST (ResTy ResFPRC:$Ra), - (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>; -} - -// Patterns for Scalar Signed saturating -// doubling multiply-add long (scalar, by element) -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; - -// Patterns for Scalar Signed saturating -// doubling multiply-sub long (scalar, by element) -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; -defm : Neon_ScalarXIndexedElem_MLAL_Patterns; - -// Scalar Signed 
saturating doubling multiply returning -// high half (scalar, by element) -def SQDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqdmulh", - 0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqdmulh", - 0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> { - let Inst{11} = Imm{2}; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqdmulh", - 0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def SQDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqdmulh", - 0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} - -// Patterns for Scalar Signed saturating doubling multiply returning -// high half (scalar, by element) -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; - -// Scalar Signed saturating rounding doubling multiply -// returning high half (scalar, by element) -def SQRDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqrdmulh", - 0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQRDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqrdmulh", - 0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> { - let Inst{11} = Imm{2}; // h - let Inst{21} = Imm{1}; // l - let Inst{20} = Imm{0}; // m - let Inst{19-16} = MRm{3-0}; -} -def SQRDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqrdmulh", - 0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> { - let Inst{11} = 0b0; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} -def SQRDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqrdmulh", - 0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { - let Inst{11} = Imm{1}; // h - let Inst{21} = Imm{0}; // l - let Inst{20-16} = MRm; -} - -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; -defm : Neon_ScalarXIndexedElem_MUL_Patterns; - -// Scalar general arithmetic operation -class Neon_Scalar_GeneralMath2D_pattern - : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>; - -class Neon_Scalar_GeneralMath3D_pattern - : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (INST FPR64:$Rn, FPR64:$Rm)>; - -class Neon_Scalar_GeneralMath4D_pattern - : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm), - (v1f64 FPR64:$Ra))), - (INST FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; - -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; -def : Neon_Scalar_GeneralMath3D_pattern; - -def : Neon_Scalar_GeneralMath2D_pattern; -def : Neon_Scalar_GeneralMath2D_pattern; - -def : Neon_Scalar_GeneralMath4D_pattern; -def : 
Neon_Scalar_GeneralMath4D_pattern; - -// Scalar Copy - DUP element to scalar -class NeonI_Scalar_DUP - : NeonI_ScalarCopy<(outs ResRC:$Rd), (ins VPRC:$Rn, OpImm:$Imm), - asmop # "\t$Rd, $Rn." # asmlane # "[$Imm]", - [], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]> { - bits<4> Imm; -} - -def DUPbv_B : NeonI_Scalar_DUP<"dup", "b", FPR8, VPR128, neon_uimm4_bare> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def DUPhv_H : NeonI_Scalar_DUP<"dup", "h", FPR16, VPR128, neon_uimm3_bare> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def DUPsv_S : NeonI_Scalar_DUP<"dup", "s", FPR32, VPR128, neon_uimm2_bare> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} -def DUPdv_D : NeonI_Scalar_DUP<"dup", "d", FPR64, VPR128, neon_uimm1_bare> { - let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; -} - -def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 0)), - (f32 (EXTRACT_SUBREG (v4f32 VPR128:$Rn), sub_32))>; -def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 1)), - (f32 (DUPsv_S (v4f32 VPR128:$Rn), 1))>; -def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 2)), - (f32 (DUPsv_S (v4f32 VPR128:$Rn), 2))>; -def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 3)), - (f32 (DUPsv_S (v4f32 VPR128:$Rn), 3))>; - -def : Pat<(f64 (vector_extract (v2f64 VPR128:$Rn), 0)), - (f64 (EXTRACT_SUBREG (v2f64 VPR128:$Rn), sub_64))>; -def : Pat<(f64 (vector_extract (v2f64 VPR128:$Rn), 1)), - (f64 (DUPdv_D (v2f64 VPR128:$Rn), 1))>; - -def : Pat<(f32 (vector_extract (v2f32 VPR64:$Rn), 0)), - (f32 (EXTRACT_SUBREG (v2f32 VPR64:$Rn), sub_32))>; -def : Pat<(f32 (vector_extract (v2f32 VPR64:$Rn), 1)), - (f32 (DUPsv_S (v4f32 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - 1))>; - -def : Pat<(f64 (vector_extract (v1f64 VPR64:$Rn), 0)), - (f64 (EXTRACT_SUBREG (v1f64 VPR64:$Rn), sub_64))>; - -multiclass NeonI_Scalar_DUP_Ext_Vec_pattern { - - def : Pat<(ResTy (extract_subvector (OpTy VPR128:$Rn), OpLImm:$Imm)), - (ResTy (DUPI VPR128:$Rn, OpLImm:$Imm))>; - - def : Pat<(ResTy (extract_subvector (NOpTy VPR64:$Rn), OpNImm:$Imm)), - (ResTy (DUPI - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - OpNImm:$Imm))>; -} - -// Patterns for extract subvectors of v1ix data using scalar DUP instructions. -defm : NeonI_Scalar_DUP_Ext_Vec_pattern; -defm : NeonI_Scalar_DUP_Ext_Vec_pattern; -defm : NeonI_Scalar_DUP_Ext_Vec_pattern; - -multiclass NeonI_Scalar_DUP_Copy_pattern1 { - - def : Pat<(ResTy (vector_insert (ResTy undef), - (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)), - (neon_uimm0_bare:$Imm))), - (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>; - - def : Pat<(ResTy (vector_insert (ResTy undef), - (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)), - (OpNImm:$Imm))), - (ResTy (DUPI - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - OpNImm:$Imm))>; -} - -multiclass NeonI_Scalar_DUP_Copy_pattern2 { - - def : Pat<(ResTy (scalar_to_vector - (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)))), - (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>; - - def : Pat<(ResTy (scalar_to_vector - (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)))), - (ResTy (DUPI - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - OpNImm:$Imm))>; -} - -// Patterns for vector copy to v1ix and v1fx vectors using scalar DUP -// instructions. 
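Before the vector-copy defms that follow, a minimal C++ sketch of the imm5 lane/size scheme that the NeonI_Scalar_DUP definitions above spell out by hand in their `let Inst{20-16}` bit lists: the element size is marked by the lowest set bit of imm5 and the lane index sits in the bits above it. `dupImm5` is a hypothetical helper written for this note, not an LLVM API; it assumes only the bit layouts visible in the defs.

  #include <cassert>
  #include <cstdint>

  // dupImm5 (hypothetical helper): build the imm5 field of DUP (element).
  // sizeLog2 selects B/H/S/D (0..3); lane is the source element index.
  // The size marker is the lowest set bit; the lane occupies the bits
  // above it, mirroring the {Imm..., 0b1, ...} lists in the defs above.
  uint32_t dupImm5(uint32_t sizeLog2, uint32_t lane) {
    assert(sizeLog2 < 4 && lane < (16u >> sizeLog2)); // 16 B lanes .. 2 D lanes
    return (lane << (sizeLog2 + 1)) | (1u << sizeLog2);
  }

  int main() {
    assert(dupImm5(0, 5) == 0b01011u); // DUPbv_B lane 5: {0,1,0,1, 1}
    assert(dupImm5(2, 1) == 0b01100u); // DUPsv_S lane 1: {0,1, 1,0,0}
    assert(dupImm5(3, 1) == 0b11000u); // DUPdv_D lane 1: {1, 1,0,0,0}
    return 0;
  }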
-defm : NeonI_Scalar_DUP_Copy_pattern1; -defm : NeonI_Scalar_DUP_Copy_pattern1; -defm : NeonI_Scalar_DUP_Copy_pattern1; -defm : NeonI_Scalar_DUP_Copy_pattern1; -defm : NeonI_Scalar_DUP_Copy_pattern2; -defm : NeonI_Scalar_DUP_Copy_pattern2; -defm : NeonI_Scalar_DUP_Copy_pattern2; -defm : NeonI_Scalar_DUP_Copy_pattern2; - -multiclass NeonI_Scalar_DUP_alias { - def : NeonInstAlias; -} - -// Aliases for Scalar copy - DUP element (scalar) -// FIXME: This is actually the preferred syntax but TableGen can't deal with -// custom printing of aliases. -defm : NeonI_Scalar_DUP_alias<"mov", ".b", DUPbv_B, neon_uimm4_bare, FPR8>; -defm : NeonI_Scalar_DUP_alias<"mov", ".h", DUPhv_H, neon_uimm3_bare, FPR16>; -defm : NeonI_Scalar_DUP_alias<"mov", ".s", DUPsv_S, neon_uimm2_bare, FPR32>; -defm : NeonI_Scalar_DUP_alias<"mov", ".d", DUPdv_D, neon_uimm1_bare, FPR64>; - -multiclass NeonI_SDUP { - def : Pat<(ResTy (GetLow VPR128:$Rn)), - (ResTy (EXTRACT_SUBREG (OpTy VPR128:$Rn), sub_64))>; - def : Pat<(ResTy (GetHigh VPR128:$Rn)), - (ResTy (DUPdv_D (OpTy VPR128:$Rn), 1))>; -} - -defm : NeonI_SDUP; -defm : NeonI_SDUP; -defm : NeonI_SDUP; -defm : NeonI_SDUP; -defm : NeonI_SDUP; -defm : NeonI_SDUP; - -// The following is for sext/zext from v1xx to v1xx -multiclass NeonI_ext { - // v1i32 -> v1i64 - def : Pat<(v1i64 (ExtOp (v1i32 FPR32:$Rn))), - (EXTRACT_SUBREG - (v2i64 (!cast(prefix # "_2S") - (v2i32 (SUBREG_TO_REG (i64 0), $Rn, sub_32)), 0)), - sub_64)>; - - // v1i16 -> v1i32 - def : Pat<(v1i32 (ExtOp (v1i16 FPR16:$Rn))), - (EXTRACT_SUBREG - (v4i32 (!cast(prefix # "_4H") - (v4i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)), 0)), - sub_32)>; - - // v1i8 -> v1i16 - def : Pat<(v1i16 (ExtOp (v1i8 FPR8:$Rn))), - (EXTRACT_SUBREG - (v8i16 (!cast(prefix # "_8B") - (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)), - sub_16)>; -} - -defm NeonI_zext : NeonI_ext<"USHLLvvi", zext>; -defm NeonI_sext : NeonI_ext<"SSHLLvvi", sext>; - -// zext v1i8 -> v1i32 -def : Pat<(v1i32 (zext (v1i8 FPR8:$Rn))), - (v1i32 (EXTRACT_SUBREG - (v1i64 (SUBREG_TO_REG (i64 0), - (v1i8 (DUPbv_B - (v16i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), - 0)), - sub_8)), - sub_32))>; - -// zext v1i8 -> v1i64 -def : Pat<(v1i64 (zext (v1i8 FPR8:$Rn))), - (v1i64 (SUBREG_TO_REG (i64 0), - (v1i8 (DUPbv_B - (v16i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), - 0)), - sub_8))>; - -// zext v1i16 -> v1i64 -def : Pat<(v1i64 (zext (v1i16 FPR16:$Rn))), - (v1i64 (SUBREG_TO_REG (i64 0), - (v1i16 (DUPhv_H - (v8i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)), - 0)), - sub_16))>; - -// sext v1i8 -> v1i32 -def : Pat<(v1i32 (sext (v1i8 FPR8:$Rn))), - (EXTRACT_SUBREG - (v4i32 (SSHLLvvi_4H - (v4i16 (SUBREG_TO_REG (i64 0), - (v1i16 (EXTRACT_SUBREG - (v8i16 (SSHLLvvi_8B - (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)), - sub_16)), - sub_16)), 0)), - sub_32)>; - -// sext v1i8 -> v1i64 -def : Pat<(v1i64 (sext (v1i8 FPR8:$Rn))), - (EXTRACT_SUBREG - (v2i64 (SSHLLvvi_2S - (v2i32 (SUBREG_TO_REG (i64 0), - (v1i32 (EXTRACT_SUBREG - (v4i32 (SSHLLvvi_4H - (v4i16 (SUBREG_TO_REG (i64 0), - (v1i16 (EXTRACT_SUBREG - (v8i16 (SSHLLvvi_8B - (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)), - sub_16)), - sub_16)), 0)), - sub_32)), - sub_32)), 0)), - sub_64)>; - - -// sext v1i16 -> v1i64 -def : Pat<(v1i64 (sext (v1i16 FPR16:$Rn))), - (EXTRACT_SUBREG - (v2i64 (SSHLLvvi_2S - (v2i32 (SUBREG_TO_REG (i64 0), - (v1i32 (EXTRACT_SUBREG - (v4i32 (SSHLLvvi_4H - (v4i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)), 0)), - sub_32)), - sub_32)), 0)), - sub_64)>; - -//===----------------------------------------------------------------------===// 
-// Non-Instruction Patterns
-//===----------------------------------------------------------------------===//
-
-// 64-bit vector bitcasts...
-
-def : Pat<(v1i64 (bitconvert (v8i8 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v8i8 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v8i8 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v8i8 VPR64:$src))), (v4i16 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v4i16 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v4i16 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v4i16 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v4i16 VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v2i32 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v2i32 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v2i32 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v2i32 VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v2f32 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v2f32 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v2f32 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v2f32 VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v2f32 (bitconvert (v1i64 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v1f64 VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v1f64 VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v1f64 VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v1f64 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v1f64 VPR64:$src))), (v8i8 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v1f64 VPR64:$src))), (f64 VPR64:$src)>;
-
-def : Pat<(v1f64 (bitconvert (v1i64 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v2f32 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v2i32 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v4i16 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v8i8 VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (f64 VPR64:$src))), (v1f64 VPR64:$src)>;
-
-// ...and 128-bit vector bitcasts...
-
-def : Pat<(v2f64 (bitconvert (v16i8 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v16i8 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v16i8 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v16i8 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v16i8 VPR128:$src))), (v8i16 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v8i16 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v8i16 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v8i16 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v8i16 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v8i16 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v4i32 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v4i32 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v4i32 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v4i32 VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v4i32 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v4f32 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v4f32 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v4f32 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v4f32 VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v4f32 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v2i64 VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v2i64 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v2i64 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v2i64 VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v2i64 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2i64 (bitconvert (v2f64 VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v2f64 VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v2f64 VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v2f64 VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (v16i8 VPR128:$src)>;
-
-// ...and scalar bitcasts...
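The scalar bitcasts resume just below; first, a short C++ analogue of why every pattern in these blocks maps bitconvert to its operand unchanged: the cast only relabels the bytes already sitting in the D or Q register, so no instruction is emitted. `bit_cast` here is a local pre-C++20 stand-in written for this note, not an LLVM API.

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  // Local bit_cast stand-in: memcpy is the defined way to reinterpret
  // object bytes, just as the Pats above reuse the register contents.
  template <typename To, typename From>
  To bit_cast(const From &src) {
    static_assert(sizeof(To) == sizeof(From), "bitconvert keeps the width");
    To dst;
    std::memcpy(&dst, &src, sizeof dst);
    return dst;
  }

  int main() {
    float v2f32[2] = {1.0f, 2.0f};              // a v2f32 D-register payload
    uint64_t v1i64 = bit_cast<uint64_t>(v2f32); // "bitconvert": same bytes
    std::printf("0x%016llx\n", (unsigned long long)v1i64);
    return 0;
  }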
-def : Pat<(f16 (bitconvert (v1i16 FPR16:$src))), (f16 FPR16:$src)>;
-def : Pat<(f32 (bitconvert (v1i32 FPR32:$src))), (f32 FPR32:$src)>;
-def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
-def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;
-
-def : Pat<(i64 (bitconvert (v1i64 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v1f64 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v2i32 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v2f32 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v4i16 FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v8i8 FPR64:$src))), (FMOVxd $src)>;
-
-def : Pat<(i32 (bitconvert (v1i32 FPR32:$src))), (FMOVws $src)>;
-
-def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>;
-
-def : Pat<(f64 (bitconvert (v8i8 VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v4i16 VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v2i32 VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v2f32 VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64 (bitconvert (v1i64 VPR64:$src))), (f64 VPR64:$src)>;
-
-def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))), (f128 VPR128:$src)>;
-
-def : Pat<(v1i16 (bitconvert (f16 FPR16:$src))), (v1i16 FPR16:$src)>;
-def : Pat<(v1i32 (bitconvert (f32 FPR32:$src))), (v1i32 FPR32:$src)>;
-def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v1f64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v2i32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v2f32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v4i16 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v8i8 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
-
-def : Pat<(v1i32 (bitconvert (i32 GPR32:$src))), (FMOVsw $src)>;
-
-def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
-def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
-
-def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
-def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
-
-// Scalar Three Same
-
-def neon_uimm3 : Operand,
-                 ImmLeaf {
-  let ParserMatchClass = uimm3_asmoperand;
-  let PrintMethod
= "printUImmHexOperand"; -} - -// Bitwise Extract -class NeonI_Extract op2, string asmop, - string OpS, RegisterOperand OpVPR, Operand OpImm> - : NeonI_BitExtract, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>{ - bits<4> Index; -} - -def EXTvvvi_8b : NeonI_Extract<0b0, 0b00, "ext", "8b", - VPR64, neon_uimm3> { - let Inst{14-11} = {0b0, Index{2}, Index{1}, Index{0}}; -} - -def EXTvvvi_16b: NeonI_Extract<0b1, 0b00, "ext", "16b", - VPR128, neon_uimm4> { - let Inst{14-11} = Index; -} - -class NI_Extract - : Pat<(OpTy (Neon_vextract (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm), - (i64 OpImm:$Imm))), - (INST OpVPR:$Rn, OpVPR:$Rm, OpImm:$Imm)>; - -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; -def : NI_Extract; - -// Table lookup -class NI_TBL op2, bits<2> len, bit op, - string asmop, string OpS, RegisterOperand OpVPR, - RegisterOperand VecList> - : NeonI_TBL, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - -// The vectors in look up table are always 16b -multiclass NI_TBL_pat len, bit op, string asmop, string List> { - def _8b : NI_TBL<0, 0b00, len, op, asmop, "8b", VPR64, - !cast(List # "16B_operand")>; - - def _16b : NI_TBL<1, 0b00, len, op, asmop, "16b", VPR128, - !cast(List # "16B_operand")>; -} - -defm TBL1 : NI_TBL_pat<0b00, 0b0, "tbl", "VOne">; -defm TBL2 : NI_TBL_pat<0b01, 0b0, "tbl", "VPair">; -defm TBL3 : NI_TBL_pat<0b10, 0b0, "tbl", "VTriple">; -defm TBL4 : NI_TBL_pat<0b11, 0b0, "tbl", "VQuad">; - -// Table lookup extension -class NI_TBX op2, bits<2> len, bit op, - string asmop, string OpS, RegisterOperand OpVPR, - RegisterOperand VecList> - : NeonI_TBL, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; -} - -// The vectors in look up table are always 16b -multiclass NI_TBX_pat len, bit op, string asmop, string List> { - def _8b : NI_TBX<0, 0b00, len, op, asmop, "8b", VPR64, - !cast(List # "16B_operand")>; - - def _16b : NI_TBX<1, 0b00, len, op, asmop, "16b", VPR128, - !cast(List # "16B_operand")>; -} - -defm TBX1 : NI_TBX_pat<0b00, 0b1, "tbx", "VOne">; -defm TBX2 : NI_TBX_pat<0b01, 0b1, "tbx", "VPair">; -defm TBX3 : NI_TBX_pat<0b10, 0b1, "tbx", "VTriple">; -defm TBX4 : NI_TBX_pat<0b11, 0b1, "tbx", "VQuad">; - -class NeonI_INS_main - : NeonI_copy<0b1, 0b0, 0b0011, - (outs VPR128:$Rd), (ins VPR128:$src, OpGPR:$Rn, OpImm:$Imm), - asmop # "\t$Rd." 
# Res # "[$Imm], $Rn", - [(set (ResTy VPR128:$Rd), - (ResTy (vector_insert - (ResTy VPR128:$src), - (OpTy OpGPR:$Rn), - (OpImm:$Imm))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - bits<4> Imm; - let Constraints = "$src = $Rd"; -} - -//Insert element (vector, from main) -def INSbw : NeonI_INS_main<"ins", "b", v16i8, GPR32, i32, - neon_uimm4_bare> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def INShw : NeonI_INS_main<"ins", "h", v8i16, GPR32, i32, - neon_uimm3_bare> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def INSsw : NeonI_INS_main<"ins", "s", v4i32, GPR32, i32, - neon_uimm2_bare> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} -def INSdx : NeonI_INS_main<"ins", "d", v2i64, GPR64, i64, - neon_uimm1_bare> { - let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; -} - -def : NeonInstAlias<"mov $Rd.b[$Imm], $Rn", - (INSbw VPR128:$Rd, GPR32:$Rn, neon_uimm4_bare:$Imm), 0>; -def : NeonInstAlias<"mov $Rd.h[$Imm], $Rn", - (INShw VPR128:$Rd, GPR32:$Rn, neon_uimm3_bare:$Imm), 0>; -def : NeonInstAlias<"mov $Rd.s[$Imm], $Rn", - (INSsw VPR128:$Rd, GPR32:$Rn, neon_uimm2_bare:$Imm), 0>; -def : NeonInstAlias<"mov $Rd.d[$Imm], $Rn", - (INSdx VPR128:$Rd, GPR64:$Rn, neon_uimm1_bare:$Imm), 0>; - -class Neon_INS_main_pattern - : Pat<(ResTy (vector_insert - (ResTy VPR64:$src), - (OpTy OpGPR:$Rn), - (OpImm:$Imm))), - (ResTy (EXTRACT_SUBREG - (ExtResTy (INS (ExtResTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), - OpGPR:$Rn, OpImm:$Imm)), sub_64))>; - -def INSbw_pattern : Neon_INS_main_pattern; -def INShw_pattern : Neon_INS_main_pattern; -def INSsw_pattern : Neon_INS_main_pattern; -def INSdx_pattern : Neon_INS_main_pattern; - -class NeonI_INS_element - : NeonI_insert<0b1, 0b1, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, - ResImm:$Immd, ResImm:$Immn), - asmop # "\t$Rd." # Res # "[$Immd], $Rn." # Res # "[$Immn]", - [], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - bits<4> Immd; - bits<4> Immn; -} - -//Insert element (vector, from element) -def INSELb : NeonI_INS_element<"ins", "b", neon_uimm4_bare> { - let Inst{20-16} = {Immd{3}, Immd{2}, Immd{1}, Immd{0}, 0b1}; - let Inst{14-11} = {Immn{3}, Immn{2}, Immn{1}, Immn{0}}; -} -def INSELh : NeonI_INS_element<"ins", "h", neon_uimm3_bare> { - let Inst{20-16} = {Immd{2}, Immd{1}, Immd{0}, 0b1, 0b0}; - let Inst{14-11} = {Immn{2}, Immn{1}, Immn{0}, 0b0}; - // bit 11 is unspecified, but should be set to zero. -} -def INSELs : NeonI_INS_element<"ins", "s", neon_uimm2_bare> { - let Inst{20-16} = {Immd{1}, Immd{0}, 0b1, 0b0, 0b0}; - let Inst{14-11} = {Immn{1}, Immn{0}, 0b0, 0b0}; - // bits 11-12 are unspecified, but should be set to zero. -} -def INSELd : NeonI_INS_element<"ins", "d", neon_uimm1_bare> { - let Inst{20-16} = {Immd, 0b1, 0b0, 0b0, 0b0}; - let Inst{14-11} = {Immn{0}, 0b0, 0b0, 0b0}; - // bits 11-13 are unspecified, but should be set to zero. 
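// [Note added in review, not in the original source] The four INSEL defs
// share one scheme: imm5 (Inst{20-16}) holds the destination lane above a
// size-marker bit, while imm4 (Inst{14-11}) holds the source lane shifted
// left by log2(element size); the vacated low imm4 bits are exactly the
// "unspecified, should be set to zero" bits called out in the comments.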
-} - -def : NeonInstAlias<"mov $Rd.b[$Immd], $Rn.b[$Immn]", - (INSELb VPR128:$Rd, VPR128:$Rn, - neon_uimm4_bare:$Immd, neon_uimm4_bare:$Immn), 0>; -def : NeonInstAlias<"mov $Rd.h[$Immd], $Rn.h[$Immn]", - (INSELh VPR128:$Rd, VPR128:$Rn, - neon_uimm3_bare:$Immd, neon_uimm3_bare:$Immn), 0>; -def : NeonInstAlias<"mov $Rd.s[$Immd], $Rn.s[$Immn]", - (INSELs VPR128:$Rd, VPR128:$Rn, - neon_uimm2_bare:$Immd, neon_uimm2_bare:$Immn), 0>; -def : NeonInstAlias<"mov $Rd.d[$Immd], $Rn.d[$Immn]", - (INSELd VPR128:$Rd, VPR128:$Rn, - neon_uimm1_bare:$Immd, neon_uimm1_bare:$Immn), 0>; - -multiclass Neon_INS_elt_pattern { -def : Pat<(ResTy (vector_insert - (ResTy VPR128:$src), - (MidTy (vector_extract - (ResTy VPR128:$Rn), - (StImm:$Immn))), - (StImm:$Immd))), - (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn), - StImm:$Immd, StImm:$Immn)>; - -def : Pat <(ResTy (vector_insert - (ResTy VPR128:$src), - (MidTy (vector_extract - (NaTy VPR64:$Rn), - (NaImm:$Immn))), - (StImm:$Immd))), - (INS (ResTy VPR128:$src), - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), - StImm:$Immd, NaImm:$Immn)>; - -def : Pat <(NaTy (vector_insert - (NaTy VPR64:$src), - (MidTy (vector_extract - (ResTy VPR128:$Rn), - (StImm:$Immn))), - (NaImm:$Immd))), - (NaTy (EXTRACT_SUBREG - (ResTy (INS - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), - (ResTy VPR128:$Rn), - NaImm:$Immd, StImm:$Immn)), - sub_64))>; - -def : Pat <(NaTy (vector_insert - (NaTy VPR64:$src), - (MidTy (vector_extract - (NaTy VPR64:$Rn), - (NaImm:$Immn))), - (NaImm:$Immd))), - (NaTy (EXTRACT_SUBREG - (ResTy (INS - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), - NaImm:$Immd, NaImm:$Immn)), - sub_64))>; -} - -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; - -multiclass Neon_INS_elt_float_pattern { -def : Pat <(ResTy (vector_insert - (ResTy VPR128:$src), - (MidTy OpFPR:$Rn), - (ResImm:$Imm))), - (INS (ResTy VPR128:$src), - (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)), - ResImm:$Imm, - (i64 0))>; - -def : Pat <(NaTy (vector_insert - (NaTy VPR64:$src), - (MidTy OpFPR:$Rn), - (ResImm:$Imm))), - (NaTy (EXTRACT_SUBREG - (ResTy (INS - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), - (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)), - ResImm:$Imm, - (i64 0))), - sub_64))>; -} - -defm : Neon_INS_elt_float_pattern; -defm : Neon_INS_elt_float_pattern; - -class NeonI_SMOV - : NeonI_copy, - Sched<[WriteFPALU, ReadFPALU]> { - bits<4> Imm; -} - -//Signed integer move (main, from element) -def SMOVwb : NeonI_SMOV<"smov", "b", 0b0, v16i8, i8, neon_uimm4_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def SMOVwh : NeonI_SMOV<"smov", "h", 0b0, v8i16, i16, neon_uimm3_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def SMOVxb : NeonI_SMOV<"smov", "b", 0b1, v16i8, i8, neon_uimm4_bare, - GPR64, i64> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def SMOVxh : NeonI_SMOV<"smov", "h", 0b1, v8i16, i16, neon_uimm3_bare, - GPR64, i64> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def SMOVxs : NeonI_SMOV<"smov", "s", 0b1, v4i32, i32, neon_uimm2_bare, - GPR64, i64> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} - -multiclass Neon_SMOVx_pattern { - def : Pat<(i64 (sext_inreg - (i64 (anyext - (i32 (vector_extract - (StTy VPR128:$Rn), 
(StImm:$Imm))))), - eleTy)), - (SMOVI VPR128:$Rn, StImm:$Imm)>; - - def : Pat<(i64 (sext - (i32 (vector_extract - (StTy VPR128:$Rn), (StImm:$Imm))))), - (SMOVI VPR128:$Rn, StImm:$Imm)>; - - def : Pat<(i64 (sext_inreg - (i64 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))), - eleTy)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; - - def : Pat<(i64 (sext_inreg - (i64 (anyext - (i32 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))))), - eleTy)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; - - def : Pat<(i64 (sext - (i32 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))))), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; -} - -defm : Neon_SMOVx_pattern; -defm : Neon_SMOVx_pattern; -defm : Neon_SMOVx_pattern; - -class Neon_SMOVw_pattern - : Pat<(i32 (sext_inreg - (i32 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))), - eleTy)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; - -def : Neon_SMOVw_pattern; -def : Neon_SMOVw_pattern; - -class NeonI_UMOV - : NeonI_copy, - Sched<[WriteFPALU, ReadFPALU]> { - bits<4> Imm; -} - -//Unsigned integer move (main, from element) -def UMOVwb : NeonI_UMOV<"umov", "b", 0b0, v16i8, neon_uimm4_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def UMOVwh : NeonI_UMOV<"umov", "h", 0b0, v8i16, neon_uimm3_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def UMOVws : NeonI_UMOV<"umov", "s", 0b0, v4i32, neon_uimm2_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} -def UMOVxd : NeonI_UMOV<"umov", "d", 0b1, v2i64, neon_uimm1_bare, - GPR64, i64> { - let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; -} - -def : NeonInstAlias<"mov $Rd, $Rn.s[$Imm]", - (UMOVws GPR32:$Rd, VPR128:$Rn, neon_uimm2_bare:$Imm), 0>; -def : NeonInstAlias<"mov $Rd, $Rn.d[$Imm]", - (UMOVxd GPR64:$Rd, VPR128:$Rn, neon_uimm1_bare:$Imm), 0>; - -class Neon_UMOV_pattern - : Pat<(ResTy (vector_extract - (NaTy VPR64:$Rn), NaImm:$Imm)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; - -def : Neon_UMOV_pattern; -def : Neon_UMOV_pattern; -def : Neon_UMOV_pattern; - -def : Pat<(i32 (and - (i32 (vector_extract - (v16i8 VPR128:$Rn), (neon_uimm4_bare:$Imm))), - 255)), - (UMOVwb VPR128:$Rn, neon_uimm4_bare:$Imm)>; - -def : Pat<(i32 (and - (i32 (vector_extract - (v8i16 VPR128:$Rn), (neon_uimm3_bare:$Imm))), - 65535)), - (UMOVwh VPR128:$Rn, neon_uimm3_bare:$Imm)>; - -def : Pat<(i64 (zext - (i32 (vector_extract - (v2i64 VPR128:$Rn), (neon_uimm1_bare:$Imm))))), - (UMOVxd VPR128:$Rn, neon_uimm1_bare:$Imm)>; - -def : Pat<(i32 (and - (i32 (vector_extract - (v8i8 VPR64:$Rn), (neon_uimm3_bare:$Imm))), - 255)), - (UMOVwb (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), - neon_uimm3_bare:$Imm)>; - -def : Pat<(i32 (and - (i32 (vector_extract - (v4i16 VPR64:$Rn), (neon_uimm2_bare:$Imm))), - 65535)), - (UMOVwh (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), - neon_uimm2_bare:$Imm)>; - -def : Pat<(i64 (zext - (i32 (vector_extract - (v1i64 VPR64:$Rn), (neon_uimm0_bare:$Imm))))), - (UMOVxd (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), - neon_uimm0_bare:$Imm)>; - -// Additional copy patterns for scalar types -def : Pat<(i32 (vector_extract (v1i8 FPR8:$Rn), (i64 0))), - (UMOVwb (v16i8 - (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8)), (i64 0))>; - -def : Pat<(i32 (vector_extract (v1i16 FPR16:$Rn), (i64 0))), - (UMOVwh (v8i16 - (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16)), (i64 0))>; - -def : Pat<(i32 
(vector_extract (v1i32 FPR32:$Rn), (i64 0))), - (FMOVws FPR32:$Rn)>; - -def : Pat<(i64 (vector_extract (v1i64 FPR64:$Rn), (i64 0))), - (FMOVxd FPR64:$Rn)>; - -def : Pat<(f64 (vector_extract (v1f64 FPR64:$Rn), (i64 0))), - (f64 FPR64:$Rn)>; - -def : Pat<(v1i8 (scalar_to_vector GPR32:$Rn)), - (v1i8 (EXTRACT_SUBREG (v16i8 - (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))), - sub_8))>; - -def : Pat<(v1i16 (scalar_to_vector GPR32:$Rn)), - (v1i16 (EXTRACT_SUBREG (v8i16 - (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))), - sub_16))>; - -def : Pat<(v1i32 (scalar_to_vector GPR32:$src)), - (FMOVsw $src)>; - -def : Pat<(v1i64 (scalar_to_vector GPR64:$src)), - (FMOVdx $src)>; - -def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), - (v8i8 (EXTRACT_SUBREG (v16i8 - (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))), - sub_64))>; - -def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)), - (v4i16 (EXTRACT_SUBREG (v8i16 - (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))), - sub_64))>; - -def : Pat<(v2i32 (scalar_to_vector GPR32:$Rn)), - (v2i32 (EXTRACT_SUBREG (v16i8 - (INSsw (v4i32 (IMPLICIT_DEF)), $Rn, (i64 0))), - sub_64))>; - -def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), - (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))>; - -def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)), - (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))>; - -def : Pat<(v4i32 (scalar_to_vector GPR32:$Rn)), - (INSsw (v4i32 (IMPLICIT_DEF)), $Rn, (i64 0))>; - -def : Pat<(v2i64 (scalar_to_vector GPR64:$Rn)), - (INSdx (v2i64 (IMPLICIT_DEF)), $Rn, (i64 0))>; - -def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), - (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)>; -def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), - (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)>; - -def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))), - (v1f64 FPR64:$Rn)>; - -def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))), - (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), - (f64 FPR64:$src), sub_64)>; - -class NeonI_DUP_Elt - : NeonI_copy, - Sched<[WriteFPALU, ReadFPALU]> { - bits<4> Imm; -} - -def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128, - neon_uimm4_bare> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} - -def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128, - neon_uimm3_bare> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} - -def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128, - neon_uimm2_bare> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} - -def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128, - neon_uimm1_bare> { - let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; -} - -def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64, - neon_uimm4_bare> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} - -def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64, - neon_uimm3_bare> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} - -def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64, - neon_uimm2_bare> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} - -multiclass NeonI_DUP_Elt_pattern { -def : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)), - (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>; - -def : Pat<(ResTy (Neon_vduplane - (NaTy VPR64:$Rn), OpNImm:$Imm)), - (ResTy (DUPELT - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>; -} -defm : NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; -defm : 
NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; -defm : NeonI_DUP_Elt_pattern; - -def : Pat<(v2f32 (Neon_vdup (f32 FPR32:$Rn))), - (v2f32 (DUPELT2s - (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (i64 0)))>; -def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))), - (v4f32 (DUPELT4s - (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (i64 0)))>; -def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))), - (v2f64 (DUPELT2d - (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64), - (i64 0)))>; - -multiclass NeonI_DUP_pattern { -def : Pat<(ResTy (Neon_vduplane (OpTy OpRC:$Rn), OpNImm:$Imm)), - (ResTy (DUPELT - (SUBREG_TO_REG (i64 0), OpRC:$Rn, SubIndex), OpNImm:$Imm))>; -} - -defm : NeonI_DUP_pattern; -defm : NeonI_DUP_pattern; -defm : NeonI_DUP_pattern; -defm : NeonI_DUP_pattern; -defm : NeonI_DUP_pattern; - -class NeonI_DUP - : NeonI_copy, - Sched<[WriteFPALU, ReadFPALU]>; - -def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> { - let Inst{20-16} = 0b00001; - // bits 17-20 are unspecified, but should be set to zero. -} - -def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> { - let Inst{20-16} = 0b00010; - // bits 18-20 are unspecified, but should be set to zero. -} - -def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> { - let Inst{20-16} = 0b00100; - // bits 19-20 are unspecified, but should be set to zero. -} - -def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> { - let Inst{20-16} = 0b01000; - // bit 20 is unspecified, but should be set to zero. -} - -def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> { - let Inst{20-16} = 0b00001; - // bits 17-20 are unspecified, but should be set to zero. -} - -def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> { - let Inst{20-16} = 0b00010; - // bits 18-20 are unspecified, but should be set to zero. -} - -def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> { - let Inst{20-16} = 0b00100; - // bits 19-20 are unspecified, but should be set to zero. 
-} - -// patterns for CONCAT_VECTORS -multiclass Concat_Vector_Pattern { -def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)), - (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>; -def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))), - (INSELd - (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)), - (i64 1), - (i64 0))>; -def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))), - (DUPELT2d - (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - (i64 0))> ; -} - -defm : Concat_Vector_Pattern; -defm : Concat_Vector_Pattern; -defm : Concat_Vector_Pattern; -defm : Concat_Vector_Pattern; -defm : Concat_Vector_Pattern; -defm : Concat_Vector_Pattern; - -def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), undef)), - (v2i32 (SUBREG_TO_REG(i64 0), $Rn, sub_32))>; -def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (EXTRACT_SUBREG - (v4i32 (INSELs - (v4i32 (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)), - (v4i32 (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)), - (i64 1), - (i64 0))), - sub_64)>; -def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rn))), - (DUPELT2s (v4i32 (SUBREG_TO_REG(i64 0), $Rn, sub_32)), 0)>; - -//patterns for EXTRACT_SUBVECTOR -def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))), - (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))), - (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))), - (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))), - (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))), - (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))), - (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; - -// The followings are for instruction class (3V Elem) - -// Variant 1 - -class NI_2VE size, bits<4> opcode, - string asmop, string ResS, string OpS, string EleOpS, - Operand OpImm, RegisterOperand ResVPR, - RegisterOperand OpVPR, RegisterOperand EleOpVPR> - : NeonI_2VElem, - Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> { - bits<3> Index; - bits<5> Re; - - let Constraints = "$src = $Rd"; -} - -multiclass NI_2VE_v1 opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
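// [Note added in review, not in the original source] The v0-v15 limit is
// an encoding constraint: an H-element index needs three bits (h:l:m),
// and with Inst{11} = h and Inst{21-20} = l:m, only the four bits
// Inst{19-16} remain for the element register Re, hence VPR128Lo below.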
- def _4h8h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", - neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } - - def _8h8h : NI_2VE<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", - neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } -} - -defm MLAvve : NI_2VE_v1<0b1, 0b0000, "mla">; -defm MLSvve : NI_2VE_v1<0b1, 0b0100, "mls">; - -// Pattern for lane in 128-bit vector -class NI_2VE_laneq - : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), - (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST ResVPR:$src, OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VE_lane - : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), - (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST ResVPR:$src, OpVPR:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; - -multiclass NI_2VE_v1_pat -{ - def : NI_2VE_laneq(subop # "_2s4s"), neon_uimm2_bare, - op, VPR64, VPR64, VPR128, v2i32, v2i32, v4i32>; - - def : NI_2VE_laneq(subop # "_4s4s"), neon_uimm2_bare, - op, VPR128, VPR128, VPR128, v4i32, v4i32, v4i32>; - - def : NI_2VE_laneq(subop # "_4h8h"), neon_uimm3_bare, - op, VPR64, VPR64, VPR128Lo, v4i16, v4i16, v8i16>; - - def : NI_2VE_laneq(subop # "_8h8h"), neon_uimm3_bare, - op, VPR128, VPR128, VPR128Lo, v8i16, v8i16, v8i16>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_lane(subop # "_2s4s"), neon_uimm1_bare, - op, VPR64, VPR64, VPR64, v2i32, v2i32, v2i32>; - - def : NI_2VE_lane(subop # "_4h8h"), neon_uimm2_bare, - op, VPR64, VPR64, VPR64Lo, v4i16, v4i16, v4i16>; -} - -defm MLA_lane_v1 : NI_2VE_v1_pat<"MLAvve", Neon_mla>; -defm MLS_lane_v1 : NI_2VE_v1_pat<"MLSvve", Neon_mls>; - -class NI_2VE_2op size, bits<4> opcode, - string asmop, string ResS, string OpS, string EleOpS, - Operand OpImm, RegisterOperand ResVPR, - RegisterOperand OpVPR, RegisterOperand EleOpVPR> - : NeonI_2VElem, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - bits<3> Index; - bits<5> Re; -} - -multiclass NI_2VE_v1_2op opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
- def _4h8h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", - neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } - - def _8h8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", - neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } -} - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">; -defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">; -defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">; -} - -// Pattern for lane in 128-bit vector -class NI_2VE_mul_laneq - : Pat<(ResTy (op (OpTy OpVPR:$Rn), - (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VE_mul_lane - : Pat<(ResTy (op (OpTy OpVPR:$Rn), - (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST OpVPR:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; - -multiclass NI_2VE_mul_v1_pat { - def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, - op, VPR64, VPR128, v2i32, v2i32, v4i32>; - - def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, - op, VPR128, VPR128, v4i32, v4i32, v4i32>; - - def : NI_2VE_mul_laneq(subop # "_4h8h"), neon_uimm3_bare, - op, VPR64, VPR128Lo, v4i16, v4i16, v8i16>; - - def : NI_2VE_mul_laneq(subop # "_8h8h"), neon_uimm3_bare, - op, VPR128, VPR128Lo, v8i16, v8i16, v8i16>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, - op, VPR64, VPR64, v2i32, v2i32, v2i32>; - - def : NI_2VE_mul_lane(subop # "_4h8h"), neon_uimm2_bare, - op, VPR64, VPR64Lo, v4i16, v4i16, v4i16>; -} - -defm MUL_lane_v1 : NI_2VE_mul_v1_pat<"MULve", mul>; -defm SQDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQDMULHve", int_arm_neon_vqdmulh>; -defm SQRDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQRDMULHve", int_arm_neon_vqrdmulh>; - -// Variant 2 - -multiclass NI_2VE_v2_2op opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // _1d2d doesn't exist! 
- - def _2d2d : NI_2VE_2op<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", - neon_uimm1_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{0}}; - let Inst{21} = 0b0; - let Inst{20-16} = Re; - } -} - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">; -defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">; -} - -class NI_2VE_mul_lane_2d - : Pat<(ResTy (op (OpTy OpVPR:$Rn), - (OpTy (coreop (EleOpTy EleOpVPR:$Re), (EleOpTy EleOpVPR:$Re))))), - (INST OpVPR:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 0)>; - -multiclass NI_2VE_mul_v2_pat { - def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, - op, VPR64, VPR128, v2f32, v2f32, v4f32>; - - def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, - op, VPR128, VPR128, v4f32, v4f32, v4f32>; - - def : NI_2VE_mul_laneq(subop # "_2d2d"), neon_uimm1_bare, - op, VPR128, VPR128, v2f64, v2f64, v2f64>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, - op, VPR64, VPR64, v2f32, v2f32, v2f32>; - - def : NI_2VE_mul_lane_2d(subop # "_2d2d"), neon_uimm1_bare, - op, VPR128, VPR64, v2f64, v2f64, v1f64, - BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; -} - -defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>; -defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>; - -def : Pat<(v2f32 (fmul (v2f32 (Neon_vdup (f32 FPR32:$Re))), - (v2f32 VPR64:$Rn))), - (FMULve_2s4s VPR64:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; - -def : Pat<(v4f32 (fmul (v4f32 (Neon_vdup (f32 FPR32:$Re))), - (v4f32 VPR128:$Rn))), - (FMULve_4s4s VPR128:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; - -def : Pat<(v2f64 (fmul (v2f64 (Neon_vdup (f64 FPR64:$Re))), - (v2f64 VPR128:$Rn))), - (FMULve_2d2d VPR128:$Rn, (SUBREG_TO_REG (i64 0), $Re, sub_64), 0)>; - -// The followings are patterns using fma -// -ffp-contract=fast generates fma - -multiclass NI_2VE_v2 opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // _1d2d doesn't exist! 
- - def _2d2d : NI_2VE<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", - neon_uimm1_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{0}}; - let Inst{21} = 0b0; - let Inst{20-16} = Re; - } -} - -defm FMLAvve : NI_2VE_v2<0b0, 0b0001, "fmla">; -defm FMLSvve : NI_2VE_v2<0b0, 0b0101, "fmls">; - -// Pattern for lane in 128-bit vector -class NI_2VEswap_laneq - : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), - (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), - (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane 0 -class NI_2VEfma_lane0 - : Pat<(ResTy (op (ResTy ResVPR:$Rn), - (ResTy (Neon_vdup (f32 FPR32:$Re))), - (ResTy ResVPR:$src))), - (INST ResVPR:$src, ResVPR:$Rn, - (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; - -// Pattern for lane in 64-bit vector -class NI_2VEswap_lane - : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), - (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), - (INST ResVPR:$src, ResVPR:$Rn, - (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VEswap_lane_2d2d - : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (OpTy OpVPR:$Re))), - (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), - (INST ResVPR:$src, ResVPR:$Rn, - (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), 0)>; - - -multiclass NI_2VE_fma_v2_pat { - def : NI_2VEswap_laneq(subop # "_2s4s"), - neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - - def : NI_2VEfma_lane0(subop # "_2s4s"), - op, VPR64, v2f32>; - - def : NI_2VEswap_laneq(subop # "_4s4s"), - neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - - def : NI_2VEfma_lane0(subop # "_4s4s"), - op, VPR128, v4f32>; - - def : NI_2VEswap_laneq(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VEswap_lane(subop # "_2s4s"), - neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - - def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, - BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; -} - -defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>; - -// Pattern for lane 0 -class NI_2VEfms_lane0 - : Pat<(ResTy (op (ResTy (fneg ResVPR:$Rn)), - (ResTy (Neon_vdup (f32 FPR32:$Re))), - (ResTy ResVPR:$src))), - (INST ResVPR:$src, ResVPR:$Rn, - (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; - -multiclass NI_2VE_fms_v2_pat -{ - def : NI_2VEswap_laneq(subop # "_2s4s"), - neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, - BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_laneq(subop # "_2s4s"), - neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, - BinOpFrag<(Neon_vduplane - (fneg node:$LHS), node:$RHS)>>; - - def : NI_2VEfms_lane0(subop # "_2s4s"), - op, VPR64, v2f32>; - - def : NI_2VEswap_laneq(subop # "_4s4s"), - neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, - BinOpFrag<(fneg (Neon_vduplane - node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_laneq(subop # "_4s4s"), - neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, - BinOpFrag<(Neon_vduplane - (fneg node:$LHS), node:$RHS)>>; - - def : NI_2VEfms_lane0(subop # "_4s4s"), - op, VPR128, v4f32>; - - def : NI_2VEswap_laneq(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, - BinOpFrag<(fneg (Neon_vduplane - node:$LHS, 
node:$RHS))>>; - - def : NI_2VEswap_laneq(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, - BinOpFrag<(Neon_vduplane - (fneg node:$LHS), node:$RHS)>>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VEswap_lane(subop # "_2s4s"), - neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, - BinOpFrag<(fneg (Neon_vduplane - node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_lane(subop # "_2s4s"), - neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, - BinOpFrag<(Neon_vduplane - (fneg node:$LHS), node:$RHS)>>; - - def : NI_2VEswap_lane(subop # "_4s4s"), - neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, - BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_lane(subop # "_4s4s"), - neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, - BinOpFrag<(Neon_vduplane (fneg node:$LHS), node:$RHS)>>; - - def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, - BinOpFrag<(fneg (Neon_combine_2d - node:$LHS, node:$RHS))>>; - - def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, - BinOpFrag<(Neon_combine_2d - (fneg node:$LHS), (fneg node:$RHS))>>; -} - -defm FMLS_lane_v2_s : NI_2VE_fms_v2_pat<"FMLSvve", fma>; - -// Variant 3: Long type -// E.g. SMLAL : 4S/4H/H (v0-v15), 2D/2S/S -// SMLAL2: 4S/8H/H (v0-v15), 2D/4S/S - -multiclass NI_2VE_v3 opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2d2s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", - neon_uimm2_bare, VPR128, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _2d4s : NI_2VE<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // Index operations on 16-bit(H) elements are restricted to using v0-v15. - def _4s8h : NI_2VE<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h", - neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } - - def _4s4h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", - neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } -} - -defm SMLALvve : NI_2VE_v3<0b0, 0b0010, "smlal">; -defm UMLALvve : NI_2VE_v3<0b1, 0b0010, "umlal">; -defm SMLSLvve : NI_2VE_v3<0b0, 0b0110, "smlsl">; -defm UMLSLvve : NI_2VE_v3<0b1, 0b0110, "umlsl">; -defm SQDMLALvve : NI_2VE_v3<0b0, 0b0011, "sqdmlal">; -defm SQDMLSLvve : NI_2VE_v3<0b0, 0b0111, "sqdmlsl">; - -multiclass NI_2VE_v3_2op opcode, string asmop> { - // vector register class for element is always 128-bit to cover the max index - def _2d2s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", - neon_uimm2_bare, VPR128, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _2d4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
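// (Illustrative note, not in the original patch: the "h"-element forms
// encode the element register in just four bits, Inst{19-16} = Re{3-0},
// so only v0-v15 can be named; that is why these definitions use the
// VPR128Lo register class.)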
- def _4s8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h", - neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } - - def _4s4h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", - neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } -} - -let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in { -defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">; -defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">; -defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">; -} - -def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))), - (FMOVdd $src)>; - -// Pattern for lane in 128-bit vector -class NI_2VEL2_laneq - : Pat<(ResTy (op (ResTy VPR128:$src), - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vduplane - (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST VPR128:$src, VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VEL2_lane - : Pat<(ResTy (op (ResTy VPR128:$src), - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vduplane - (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST VPR128:$src, VPR128:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; - -class NI_2VEL2_lane0 - : Pat<(ResTy (op (ResTy VPR128:$src), - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))), - (INST VPR128:$src, VPR128:$Rn, (DupInst $Re), 0)>; - -multiclass NI_2VEL_v3_pat { - def : NI_2VE_laneq(subop # "_4s4h"), neon_uimm3_bare, - op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; - - def : NI_2VE_laneq(subop # "_2d2s"), neon_uimm2_bare, - op, VPR128, VPR64, VPR128, v2i64, v2i32, v4i32>; - - def : NI_2VEL2_laneq(subop # "_4s8h"), neon_uimm3_bare, - op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_laneq(subop # "_2d4s"), neon_uimm2_bare, - op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; - - def : NI_2VEL2_lane0(subop # "_4s8h"), - op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; - - def : NI_2VEL2_lane0(subop # "_2d4s"), - op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_lane(subop # "_4s4h"), neon_uimm2_bare, - op, VPR128, VPR64, VPR64Lo, v4i32, v4i16, v4i16>; - - def : NI_2VE_lane(subop # "_2d2s"), neon_uimm1_bare, - op, VPR128, VPR64, VPR64, v2i64, v2i32, v2i32>; - - def : NI_2VEL2_lane(subop # "_4s8h"), neon_uimm2_bare, - op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_lane(subop # "_2d4s"), neon_uimm1_bare, - op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>; -} - -defm SMLAL_lane_v3 : NI_2VEL_v3_pat<"SMLALvve", Neon_smlal>; -defm UMLAL_lane_v3 : NI_2VEL_v3_pat<"UMLALvve", Neon_umlal>; -defm SMLSL_lane_v3 : NI_2VEL_v3_pat<"SMLSLvve", Neon_smlsl>; -defm UMLSL_lane_v3 : NI_2VEL_v3_pat<"UMLSLvve", Neon_umlsl>; - -// Pattern for lane in 128-bit vector -class NI_2VEL2_mul_laneq - : Pat<(ResTy (op - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vduplane - (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; - -// Pattern for lane in 64-bit vector -class NI_2VEL2_mul_lane - : Pat<(ResTy (op - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vduplane - (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST VPR128:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 
OpImm:$Index)>; - -// Pattern for fixed lane 0 -class NI_2VEL2_mul_lane0 - : Pat<(ResTy (op - (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))), - (INST VPR128:$Rn, (DupInst $Re), 0)>; - -multiclass NI_2VEL_mul_v3_pat { - def : NI_2VE_mul_laneq(subop # "_4s4h"), neon_uimm3_bare, - op, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; - - def : NI_2VE_mul_laneq(subop # "_2d2s"), neon_uimm2_bare, - op, VPR64, VPR128, v2i64, v2i32, v4i32>; - - def : NI_2VEL2_mul_laneq(subop # "_4s8h"), neon_uimm3_bare, - op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_mul_laneq(subop # "_2d4s"), neon_uimm2_bare, - op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; - - def : NI_2VEL2_mul_lane0(subop # "_4s8h"), - op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; - - def : NI_2VEL2_mul_lane0(subop # "_2d4s"), - op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_mul_lane(subop # "_4s4h"), neon_uimm2_bare, - op, VPR64, VPR64Lo, v4i32, v4i16, v4i16>; - - def : NI_2VE_mul_lane(subop # "_2d2s"), neon_uimm1_bare, - op, VPR64, VPR64, v2i64, v2i32, v2i32>; - - def : NI_2VEL2_mul_lane(subop # "_4s8h"), neon_uimm2_bare, - op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_mul_lane(subop # "_2d4s"), neon_uimm1_bare, - op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>; -} - -defm SMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SMULLve", int_arm_neon_vmulls>; -defm UMULL_lane_v3 : NI_2VEL_mul_v3_pat<"UMULLve", int_arm_neon_vmullu>; -defm SQDMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SQDMULLve", int_arm_neon_vqdmull>; - -multiclass NI_qdma { - def _4s : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (op node:$Ra, - (v4i32 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; - - def _2d : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), - (op node:$Ra, - (v2i64 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; -} - -defm Neon_qdmlal : NI_qdma; -defm Neon_qdmlsl : NI_qdma; - -multiclass NI_2VEL_v3_qdma_pat { - def : NI_2VE_laneq(subop # "_4s4h"), neon_uimm3_bare, - !cast(op # "_4s"), VPR128, VPR64, VPR128Lo, - v4i32, v4i16, v8i16>; - - def : NI_2VE_laneq(subop # "_2d2s"), neon_uimm2_bare, - !cast(op # "_2d"), VPR128, VPR64, VPR128, - v2i64, v2i32, v4i32>; - - def : NI_2VEL2_laneq(subop # "_4s8h"), neon_uimm3_bare, - !cast(op # "_4s"), VPR128Lo, - v4i32, v8i16, v8i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_laneq(subop # "_2d4s"), neon_uimm2_bare, - !cast(op # "_2d"), VPR128, - v2i64, v4i32, v4i32, v2i32, Neon_High4S>; - - def : NI_2VEL2_lane0(subop # "_4s8h"), - !cast(op # "_4s"), - v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; - - def : NI_2VEL2_lane0(subop # "_2d4s"), - !cast(op # "_2d"), - v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; - - // Index can only be half of the max value for lane in 64-bit vector - - def : NI_2VE_lane(subop # "_4s4h"), neon_uimm2_bare, - !cast(op # "_4s"), VPR128, VPR64, VPR64Lo, - v4i32, v4i16, v4i16>; - - def : NI_2VE_lane(subop # "_2d2s"), neon_uimm1_bare, - !cast(op # "_2d"), VPR128, VPR64, VPR64, - v2i64, v2i32, v2i32>; - - def : NI_2VEL2_lane(subop # "_4s8h"), neon_uimm2_bare, - !cast(op # "_4s"), VPR64Lo, - v4i32, v8i16, v4i16, v4i16, Neon_High8H>; - - def : NI_2VEL2_lane(subop # "_2d4s"), neon_uimm1_bare, - !cast(op # "_2d"), VPR64, - v2i64, v4i32, v2i32, v2i32, Neon_High4S>; -} - -defm SQDMLAL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLALvve", "Neon_qdmlal">; -defm SQDMLSL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLSLvve", "Neon_qdmlsl">; - -// End of implementation for instruction class (3V 
Elem) - -class NeonI_REV size, bit Q, bit U, - bits<5> opcode, RegisterOperand ResVPR, ValueType ResTy, - SDPatternOperator Neon_Rev> - : NeonI_2VMisc, - Sched<[WriteFPALU, ReadFPALU]>; - -def REV64_16b : NeonI_REV<"rev64", "16b", 0b00, 0b1, 0b0, 0b00000, VPR128, - v16i8, Neon_rev64>; -def REV64_8h : NeonI_REV<"rev64", "8h", 0b01, 0b1, 0b0, 0b00000, VPR128, - v8i16, Neon_rev64>; -def REV64_4s : NeonI_REV<"rev64", "4s", 0b10, 0b1, 0b0, 0b00000, VPR128, - v4i32, Neon_rev64>; -def REV64_8b : NeonI_REV<"rev64", "8b", 0b00, 0b0, 0b0, 0b00000, VPR64, - v8i8, Neon_rev64>; -def REV64_4h : NeonI_REV<"rev64", "4h", 0b01, 0b0, 0b0, 0b00000, VPR64, - v4i16, Neon_rev64>; -def REV64_2s : NeonI_REV<"rev64", "2s", 0b10, 0b0, 0b0, 0b00000, VPR64, - v2i32, Neon_rev64>; - -def : Pat<(v4f32 (Neon_rev64 (v4f32 VPR128:$Rn))), (REV64_4s VPR128:$Rn)>; -def : Pat<(v2f32 (Neon_rev64 (v2f32 VPR64:$Rn))), (REV64_2s VPR64:$Rn)>; - -def REV32_16b : NeonI_REV<"rev32", "16b", 0b00, 0b1, 0b1, 0b00000, VPR128, - v16i8, Neon_rev32>; -def REV32_8h : NeonI_REV<"rev32", "8h", 0b01, 0b1, 0b1, 0b00000, VPR128, - v8i16, Neon_rev32>; -def REV32_8b : NeonI_REV<"rev32", "8b", 0b00, 0b0, 0b1, 0b00000, VPR64, - v8i8, Neon_rev32>; -def REV32_4h : NeonI_REV<"rev32", "4h", 0b01, 0b0, 0b1, 0b00000, VPR64, - v4i16, Neon_rev32>; - -def REV16_16b : NeonI_REV<"rev16", "16b", 0b00, 0b1, 0b0, 0b00001, VPR128, - v16i8, Neon_rev16>; -def REV16_8b : NeonI_REV<"rev16", "8b", 0b00, 0b0, 0b0, 0b00001, VPR64, - v8i8, Neon_rev16>; - -multiclass NeonI_PairwiseAdd opcode, - SDPatternOperator Neon_Padd> { - def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.8h, $Rn.16b", - [(set (v8i16 VPR128:$Rd), - (v8i16 (Neon_Padd (v16i8 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.4h, $Rn.8b", - [(set (v4i16 VPR64:$Rd), - (v4i16 (Neon_Padd (v8i8 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.8h", - [(set (v4i32 VPR128:$Rd), - (v4i32 (Neon_Padd (v8i16 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.4h", - [(set (v2i32 VPR64:$Rd), - (v2i32 (Neon_Padd (v4i16 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.2d, $Rn.4s", - [(set (v2i64 VPR128:$Rd), - (v2i64 (Neon_Padd (v4i32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.1d, $Rn.2s", - [(set (v1i64 VPR64:$Rd), - (v1i64 (Neon_Padd (v2i32 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm SADDLP : NeonI_PairwiseAdd<"saddlp", 0b0, 0b00010, - int_arm_neon_vpaddls>; -defm UADDLP : NeonI_PairwiseAdd<"uaddlp", 0b1, 0b00010, - int_arm_neon_vpaddlu>; - -def : Pat<(v1i64 (int_aarch64_neon_saddlv (v2i32 VPR64:$Rn))), - (SADDLP2s1d $Rn)>; -def : Pat<(v1i64 (int_aarch64_neon_uaddlv (v2i32 VPR64:$Rn))), - (UADDLP2s1d $Rn)>; - -multiclass NeonI_PairwiseAddAcc opcode, - SDPatternOperator Neon_Padd> { - let Constraints = "$src = $Rd" in { - def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), 
- asmop # "\t$Rd.8h, $Rn.16b", - [(set (v8i16 VPR128:$Rd), - (v8i16 (Neon_Padd - (v8i16 VPR128:$src), (v16i8 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), - asmop # "\t$Rd.4h, $Rn.8b", - [(set (v4i16 VPR64:$Rd), - (v4i16 (Neon_Padd - (v4i16 VPR64:$src), (v8i8 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.8h", - [(set (v4i32 VPR128:$Rd), - (v4i32 (Neon_Padd - (v4i32 VPR128:$src), (v8i16 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.4h", - [(set (v2i32 VPR64:$Rd), - (v2i32 (Neon_Padd - (v2i32 VPR64:$src), (v4i16 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "\t$Rd.2d, $Rn.4s", - [(set (v2i64 VPR128:$Rd), - (v2i64 (Neon_Padd - (v2i64 VPR128:$src), (v4i32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), - asmop # "\t$Rd.1d, $Rn.2s", - [(set (v1i64 VPR64:$Rd), - (v1i64 (Neon_Padd - (v1i64 VPR64:$src), (v2i32 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} - -defm SADALP : NeonI_PairwiseAddAcc<"sadalp", 0b0, 0b00110, - int_arm_neon_vpadals>; -defm UADALP : NeonI_PairwiseAddAcc<"uadalp", 0b1, 0b00110, - int_arm_neon_vpadalu>; - -multiclass NeonI_2VMisc_BHSDsize_1Arg opcode> { - def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.16b, $Rn.16b", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.8h, $Rn.8h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.4s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.2d, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.8b, $Rn.8b", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.4h, $Rn.4h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.2s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm SQABS : NeonI_2VMisc_BHSDsize_1Arg<"sqabs", 0b0, 0b00111>; -defm SQNEG : NeonI_2VMisc_BHSDsize_1Arg<"sqneg", 0b1, 0b00111>; -defm ABS : NeonI_2VMisc_BHSDsize_1Arg<"abs", 0b0, 0b01011>; -defm NEG : NeonI_2VMisc_BHSDsize_1Arg<"neg", 0b1, 0b01011>; - -multiclass NeonI_2VMisc_BHSD_1Arg_Pattern { - def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$Rn))), - (v16i8 (!cast(Prefix # 16b) (v16i8 VPR128:$Rn)))>; - - def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$Rn))), - (v8i16 (!cast(Prefix # 8h) (v8i16 VPR128:$Rn)))>; - - def : 
Pat<(v4i32 (Neon_Op (v4i32 VPR128:$Rn))), - (v4i32 (!cast(Prefix # 4s) (v4i32 VPR128:$Rn)))>; - - def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$Rn))), - (v2i64 (!cast(Prefix # 2d) (v2i64 VPR128:$Rn)))>; - - def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$Rn))), - (v8i8 (!cast(Prefix # 8b) (v8i8 VPR64:$Rn)))>; - - def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$Rn))), - (v4i16 (!cast(Prefix # 4h) (v4i16 VPR64:$Rn)))>; - - def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$Rn))), - (v2i32 (!cast(Prefix # 2s) (v2i32 VPR64:$Rn)))>; -} - -defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQABS", int_arm_neon_vqabs>; -defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQNEG", int_arm_neon_vqneg>; -defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"ABS", int_arm_neon_vabs>; - -def : Pat<(v16i8 (sub - (v16i8 Neon_AllZero), - (v16i8 VPR128:$Rn))), - (v16i8 (NEG16b (v16i8 VPR128:$Rn)))>; -def : Pat<(v8i8 (sub - (v8i8 Neon_AllZero), - (v8i8 VPR64:$Rn))), - (v8i8 (NEG8b (v8i8 VPR64:$Rn)))>; -def : Pat<(v8i16 (sub - (v8i16 (bitconvert (v16i8 Neon_AllZero))), - (v8i16 VPR128:$Rn))), - (v8i16 (NEG8h (v8i16 VPR128:$Rn)))>; -def : Pat<(v4i16 (sub - (v4i16 (bitconvert (v8i8 Neon_AllZero))), - (v4i16 VPR64:$Rn))), - (v4i16 (NEG4h (v4i16 VPR64:$Rn)))>; -def : Pat<(v4i32 (sub - (v4i32 (bitconvert (v16i8 Neon_AllZero))), - (v4i32 VPR128:$Rn))), - (v4i32 (NEG4s (v4i32 VPR128:$Rn)))>; -def : Pat<(v2i32 (sub - (v2i32 (bitconvert (v8i8 Neon_AllZero))), - (v2i32 VPR64:$Rn))), - (v2i32 (NEG2s (v2i32 VPR64:$Rn)))>; -def : Pat<(v2i64 (sub - (v2i64 (bitconvert (v16i8 Neon_AllZero))), - (v2i64 VPR128:$Rn))), - (v2i64 (NEG2d (v2i64 VPR128:$Rn)))>; - -multiclass NeonI_2VMisc_BHSDsize_2Args opcode> { - let Constraints = "$src = $Rd" in { - def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "\t$Rd.16b, $Rn.16b", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "\t$Rd.8h, $Rn.8h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.4s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "\t$Rd.2d, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), - asmop # "\t$Rd.8b, $Rn.8b", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), - asmop # "\t$Rd.4h, $Rn.4h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.2s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} - -defm SUQADD : NeonI_2VMisc_BHSDsize_2Args<"suqadd", 0b0, 0b00011>; -defm USQADD : NeonI_2VMisc_BHSDsize_2Args<"usqadd", 0b1, 0b00011>; - -multiclass NeonI_2VMisc_BHSD_2Args_Pattern { - def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$src), (v16i8 VPR128:$Rn))), - (v16i8 (!cast(Prefix # 16b) - (v16i8 VPR128:$src), (v16i8 VPR128:$Rn)))>; - - def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$src), (v8i16 VPR128:$Rn))), - (v8i16 (!cast(Prefix # 8h) - (v8i16 VPR128:$src), (v8i16 VPR128:$Rn)))>; - - def : Pat<(v4i32 (Neon_Op (v4i32 
VPR128:$src), (v4i32 VPR128:$Rn))), - (v4i32 (!cast(Prefix # 4s) - (v4i32 VPR128:$src), (v4i32 VPR128:$Rn)))>; - - def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$src), (v2i64 VPR128:$Rn))), - (v2i64 (!cast(Prefix # 2d) - (v2i64 VPR128:$src), (v2i64 VPR128:$Rn)))>; - - def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$src), (v8i8 VPR64:$Rn))), - (v8i8 (!cast(Prefix # 8b) - (v8i8 VPR64:$src), (v8i8 VPR64:$Rn)))>; - - def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$src), (v4i16 VPR64:$Rn))), - (v4i16 (!cast(Prefix # 4h) - (v4i16 VPR64:$src), (v4i16 VPR64:$Rn)))>; - - def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$src), (v2i32 VPR64:$Rn))), - (v2i32 (!cast(Prefix # 2s) - (v2i32 VPR64:$src), (v2i32 VPR64:$Rn)))>; -} - -defm : NeonI_2VMisc_BHSD_2Args_Pattern<"SUQADD", int_aarch64_neon_suqadd>; -defm : NeonI_2VMisc_BHSD_2Args_Pattern<"USQADD", int_aarch64_neon_usqadd>; - -multiclass NeonI_2VMisc_BHSsizes { - def 16b : NeonI_2VMisc<0b1, U, 0b00, 0b00100, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.16b, $Rn.16b", - [(set (v16i8 VPR128:$Rd), - (v16i8 (Neon_Op (v16i8 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8h : NeonI_2VMisc<0b1, U, 0b01, 0b00100, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.8h, $Rn.8h", - [(set (v8i16 VPR128:$Rd), - (v8i16 (Neon_Op (v8i16 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4s : NeonI_2VMisc<0b1, U, 0b10, 0b00100, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.4s", - [(set (v4i32 VPR128:$Rd), - (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8b : NeonI_2VMisc<0b0, U, 0b00, 0b00100, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.8b, $Rn.8b", - [(set (v8i8 VPR64:$Rd), - (v8i8 (Neon_Op (v8i8 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4h : NeonI_2VMisc<0b0, U, 0b01, 0b00100, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.4h, $Rn.4h", - [(set (v4i16 VPR64:$Rd), - (v4i16 (Neon_Op (v4i16 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s : NeonI_2VMisc<0b0, U, 0b10, 0b00100, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.2s", - [(set (v2i32 VPR64:$Rd), - (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm CLS : NeonI_2VMisc_BHSsizes<"cls", 0b0, int_arm_neon_vcls>; -defm CLZ : NeonI_2VMisc_BHSsizes<"clz", 0b1, ctlz>; - -multiclass NeonI_2VMisc_Bsize size, - bits<5> Opcode> { - def 16b : NeonI_2VMisc<0b1, U, size, Opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.16b, $Rn.16b", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8b : NeonI_2VMisc<0b0, U, size, Opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.8b, $Rn.8b", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm CNT : NeonI_2VMisc_Bsize<"cnt", 0b0, 0b00, 0b00101>; -defm NOT : NeonI_2VMisc_Bsize<"not", 0b1, 0b00, 0b00101>; -defm RBIT : NeonI_2VMisc_Bsize<"rbit", 0b1, 0b01, 0b00101>; - -def : NeonInstAlias<"mvn $Rd.16b, $Rn.16b", - (NOT16b VPR128:$Rd, VPR128:$Rn), 0>; -def : NeonInstAlias<"mvn $Rd.8b, $Rn.8b", - (NOT8b VPR64:$Rd, VPR64:$Rn), 0>; - -def : Pat<(v16i8 (ctpop (v16i8 VPR128:$Rn))), - (v16i8 (CNT16b (v16i8 VPR128:$Rn)))>; -def : Pat<(v8i8 (ctpop (v8i8 VPR64:$Rn))), - (v8i8 (CNT8b (v8i8 VPR64:$Rn)))>; - -def : Pat<(v16i8 (xor - (v16i8 VPR128:$Rn), - (v16i8 Neon_AllOne))), - (v16i8 (NOT16b (v16i8 VPR128:$Rn)))>; -def : Pat<(v8i8 (xor - (v8i8 VPR64:$Rn), - (v8i8 Neon_AllOne))), - (v8i8 (NOT8b (v8i8 VPR64:$Rn)))>; -def 
: Pat<(v8i16 (xor - (v8i16 VPR128:$Rn), - (v8i16 (bitconvert (v16i8 Neon_AllOne))))), - (NOT16b VPR128:$Rn)>; -def : Pat<(v4i16 (xor - (v4i16 VPR64:$Rn), - (v4i16 (bitconvert (v8i8 Neon_AllOne))))), - (NOT8b VPR64:$Rn)>; -def : Pat<(v4i32 (xor - (v4i32 VPR128:$Rn), - (v4i32 (bitconvert (v16i8 Neon_AllOne))))), - (NOT16b VPR128:$Rn)>; -def : Pat<(v2i32 (xor - (v2i32 VPR64:$Rn), - (v2i32 (bitconvert (v8i8 Neon_AllOne))))), - (NOT8b VPR64:$Rn)>; -def : Pat<(v2i64 (xor - (v2i64 VPR128:$Rn), - (v2i64 (bitconvert (v16i8 Neon_AllOne))))), - (NOT16b VPR128:$Rn)>; - -def : Pat<(v16i8 (int_aarch64_neon_rbit (v16i8 VPR128:$Rn))), - (v16i8 (RBIT16b (v16i8 VPR128:$Rn)))>; -def : Pat<(v8i8 (int_aarch64_neon_rbit (v8i8 VPR64:$Rn))), - (v8i8 (RBIT8b (v8i8 VPR64:$Rn)))>; - -multiclass NeonI_2VMisc_SDsizes opcode, - SDPatternOperator Neon_Op> { - def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.4s", - [(set (v4f32 VPR128:$Rd), - (v4f32 (Neon_Op (v4f32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.2d, $Rn.2d", - [(set (v2f64 VPR128:$Rd), - (v2f64 (Neon_Op (v2f64 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.2s", - [(set (v2f32 VPR64:$Rd), - (v2f32 (Neon_Op (v2f32 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm FABS : NeonI_2VMisc_SDsizes<"fabs", 0b0, 0b01111, fabs>; -defm FNEG : NeonI_2VMisc_SDsizes<"fneg", 0b1, 0b01111, fneg>; - -multiclass NeonI_2VMisc_HSD_Narrow opcode> { - def 8h8b : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.8b, $Rn.8h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4s4h : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.4h, $Rn.4s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2d2s : NeonI_2VMisc<0b0, U, 0b10, opcode, - (outs VPR64:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.2s, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - let Constraints = "$Rd = $src" in { - def 8h16b : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "2\t$Rd.16b, $Rn.8h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 4s8h : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "2\t$Rd.8h, $Rn.4s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 2d4s : NeonI_2VMisc<0b1, U, 0b10, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "2\t$Rd.4s, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} - -defm XTN : NeonI_2VMisc_HSD_Narrow<"xtn", 0b0, 0b10010>; -defm SQXTUN : NeonI_2VMisc_HSD_Narrow<"sqxtun", 0b1, 0b10010>; -defm SQXTN : NeonI_2VMisc_HSD_Narrow<"sqxtn", 0b0, 0b10100>; -defm UQXTN : NeonI_2VMisc_HSD_Narrow<"uqxtn", 0b1, 0b10100>; - -multiclass NeonI_2VMisc_Narrow_Patterns { - def : Pat<(v8i8 (Neon_Op (v8i16 VPR128:$Rn))), - (v8i8 (!cast(Prefix # 8h8b) (v8i16 VPR128:$Rn)))>; - - def : Pat<(v4i16 (Neon_Op (v4i32 VPR128:$Rn))), - (v4i16 (!cast(Prefix # 4s4h) (v4i32 VPR128:$Rn)))>; - - def : Pat<(v2i32 (Neon_Op (v2i64 VPR128:$Rn))), - (v2i32 (!cast(Prefix # 2d2s) (v2i64 VPR128:$Rn)))>; - - def : Pat<(v16i8 (concat_vectors - (v8i8 VPR64:$src), - (v8i8 
(Neon_Op (v8i16 VPR128:$Rn))))), - (!cast(Prefix # 8h16b) - (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), - VPR128:$Rn)>; - - def : Pat<(v8i16 (concat_vectors - (v4i16 VPR64:$src), - (v4i16 (Neon_Op (v4i32 VPR128:$Rn))))), - (!cast(Prefix # 4s8h) - (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), - VPR128:$Rn)>; - - def : Pat<(v4i32 (concat_vectors - (v2i32 VPR64:$src), - (v2i32 (Neon_Op (v2i64 VPR128:$Rn))))), - (!cast(Prefix # 2d4s) - (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), - VPR128:$Rn)>; -} - -defm : NeonI_2VMisc_Narrow_Patterns<"XTN", trunc>; -defm : NeonI_2VMisc_Narrow_Patterns<"SQXTUN", int_arm_neon_vqmovnsu>; -defm : NeonI_2VMisc_Narrow_Patterns<"SQXTN", int_arm_neon_vqmovns>; -defm : NeonI_2VMisc_Narrow_Patterns<"UQXTN", int_arm_neon_vqmovnu>; - -multiclass NeonI_2VMisc_SHIFT opcode> { - let DecoderMethod = "DecodeSHLLInstruction" in { - def 8b8h : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR128:$Rd), - (ins VPR64:$Rn, uimm_exact8:$Imm), - asmop # "\t$Rd.8h, $Rn.8b, $Imm", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4h4s : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR128:$Rd), - (ins VPR64:$Rn, uimm_exact16:$Imm), - asmop # "\t$Rd.4s, $Rn.4h, $Imm", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s2d : NeonI_2VMisc<0b0, U, 0b10, opcode, - (outs VPR128:$Rd), - (ins VPR64:$Rn, uimm_exact32:$Imm), - asmop # "\t$Rd.2d, $Rn.2s, $Imm", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), - (ins VPR128:$Rn, uimm_exact8:$Imm), - asmop # "2\t$Rd.8h, $Rn.16b, $Imm", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), - (ins VPR128:$Rn, uimm_exact16:$Imm), - asmop # "2\t$Rd.4s, $Rn.8h, $Imm", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, - (outs VPR128:$Rd), - (ins VPR128:$Rn, uimm_exact32:$Imm), - asmop # "2\t$Rd.2d, $Rn.4s, $Imm", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - } -} - -defm SHLL : NeonI_2VMisc_SHIFT<"shll", 0b1, 0b10011>; - -class NeonI_SHLL_Patterns - : Pat<(DesTy (shl - (DesTy (ExtOp (OpTy VPR64:$Rn))), - (DesTy (Neon_vdup - (i32 Neon_Imm:$Imm))))), - (!cast("SHLL" # suffix) VPR64:$Rn, Neon_Imm:$Imm)>; - -class NeonI_SHLL_High_Patterns - : Pat<(DesTy (shl - (DesTy (ExtOp - (OpTy (GetHigh VPR128:$Rn)))), - (DesTy (Neon_vdup - (i32 Neon_Imm:$Imm))))), - (!cast("SHLL" # suffix) VPR128:$Rn, Neon_Imm:$Imm)>; - -def : NeonI_SHLL_Patterns; -def : NeonI_SHLL_Patterns; -def : NeonI_SHLL_Patterns; -def : NeonI_SHLL_Patterns; -def : NeonI_SHLL_Patterns; -def : NeonI_SHLL_Patterns; -def : NeonI_SHLL_High_Patterns; -def : NeonI_SHLL_High_Patterns; -def : NeonI_SHLL_High_Patterns; -def : NeonI_SHLL_High_Patterns; -def : NeonI_SHLL_High_Patterns; -def : NeonI_SHLL_High_Patterns; - -multiclass NeonI_2VMisc_SD_Narrow opcode> { - def 4s4h : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR64:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.4h, $Rn.4s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.2s, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - let Constraints = "$src = $Rd" in { - def 4s8h : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "2\t$Rd.8h, $Rn.4s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - - def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode, - 
(outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "2\t$Rd.4s, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>; - } -} - -defm FCVTN : NeonI_2VMisc_SD_Narrow<"fcvtn", 0b0, 0b10110>; - -multiclass NeonI_2VMisc_Narrow_Pattern { - - def : Pat<(v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))), - (!cast(prefix # "4s4h") (v4f32 VPR128:$Rn))>; - - def : Pat<(v8i16 (concat_vectors - (v4i16 VPR64:$src), - (v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))))), - (!cast(prefix # "4s8h") - (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), - (v4f32 VPR128:$Rn))>; - - def : Pat<(v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))), - (!cast(prefix # "2d2s") (v2f64 VPR128:$Rn))>; - - def : Pat<(v4f32 (concat_vectors - (v2f32 VPR64:$src), - (v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))))), - (!cast(prefix # "2d4s") - (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), - (v2f64 VPR128:$Rn))>; -} - -defm : NeonI_2VMisc_Narrow_Pattern<"FCVTN", int_arm_neon_vcvtfp2hf, fround>; - -multiclass NeonI_2VMisc_D_Narrow opcode> { - def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR64:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.2s, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), - asmop # "2\t$Rd.4s, $Rn.2d", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - } - - def : Pat<(v2f32 (int_aarch64_neon_vcvtxn (v2f64 VPR128:$Rn))), - (!cast(prefix # "2d2s") VPR128:$Rn)>; - - def : Pat<(v4f32 (concat_vectors - (v2f32 VPR64:$src), - (v2f32 (int_aarch64_neon_vcvtxn (v2f64 VPR128:$Rn))))), - (!cast(prefix # "2d4s") - (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), - VPR128:$Rn)>; -} - -defm FCVTXN : NeonI_2VMisc_D_Narrow<"fcvtxn","FCVTXN", 0b1, 0b10110>; - -def Neon_High4Float : PatFrag<(ops node:$in), - (extract_subvector (v4f32 node:$in), (iPTR 2))>; - -multiclass NeonI_2VMisc_HS_Extend opcode> { - def 4h4s : NeonI_2VMisc<0b0, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.4s, $Rn.4h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s2d : NeonI_2VMisc<0b0, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.2d, $Rn.2s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 8h4s : NeonI_2VMisc<0b1, U, 0b00, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "2\t$Rd.4s, $Rn.8h", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 4s2d : NeonI_2VMisc<0b1, U, 0b01, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "2\t$Rd.2d, $Rn.4s", - [], NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm FCVTL : NeonI_2VMisc_HS_Extend<"fcvtl", 0b0, 0b10111>; - -multiclass NeonI_2VMisc_Extend_Pattern { - def : Pat<(v4f32 (int_arm_neon_vcvthf2fp (v4i16 VPR64:$Rn))), - (!cast(prefix # "4h4s") VPR64:$Rn)>; - - def : Pat<(v4f32 (int_arm_neon_vcvthf2fp - (v4i16 (Neon_High8H - (v8i16 VPR128:$Rn))))), - (!cast(prefix # "8h4s") VPR128:$Rn)>; - - def : Pat<(v2f64 (fextend (v2f32 VPR64:$Rn))), - (!cast(prefix # "2s2d") VPR64:$Rn)>; - - def : Pat<(v2f64 (fextend - (v2f32 (Neon_High4Float - (v4f32 VPR128:$Rn))))), - (!cast(prefix # "4s2d") VPR128:$Rn)>; -} - -defm : NeonI_2VMisc_Extend_Pattern<"FCVTL">; - -multiclass NeonI_2VMisc_SD_Conv opcode, - ValueType ResTy4s, ValueType OpTy4s, - ValueType ResTy2d, ValueType OpTy2d, - ValueType ResTy2s, ValueType OpTy2s, - SDPatternOperator Neon_Op> { - - def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode, - (outs VPR128:$Rd), (ins 
VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.4s", - [(set (ResTy4s VPR128:$Rd), - (ResTy4s (Neon_Op (OpTy4s VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2d : NeonI_2VMisc<0b1, U, {Size, 0b1}, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.2d, $Rn.2d", - [(set (ResTy2d VPR128:$Rd), - (ResTy2d (Neon_Op (OpTy2d VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.2s", - [(set (ResTy2s VPR64:$Rd), - (ResTy2s (Neon_Op (OpTy2s VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -multiclass NeonI_2VMisc_fp_to_int opcode, SDPatternOperator Neon_Op> { - defm _ : NeonI_2VMisc_SD_Conv; -} - -defm FCVTNS : NeonI_2VMisc_fp_to_int<"fcvtns", 0b0, 0b0, 0b11010, - int_arm_neon_vcvtns>; -defm FCVTNU : NeonI_2VMisc_fp_to_int<"fcvtnu", 0b0, 0b1, 0b11010, - int_arm_neon_vcvtnu>; -defm FCVTPS : NeonI_2VMisc_fp_to_int<"fcvtps", 0b1, 0b0, 0b11010, - int_arm_neon_vcvtps>; -defm FCVTPU : NeonI_2VMisc_fp_to_int<"fcvtpu", 0b1, 0b1, 0b11010, - int_arm_neon_vcvtpu>; -defm FCVTMS : NeonI_2VMisc_fp_to_int<"fcvtms", 0b0, 0b0, 0b11011, - int_arm_neon_vcvtms>; -defm FCVTMU : NeonI_2VMisc_fp_to_int<"fcvtmu", 0b0, 0b1, 0b11011, - int_arm_neon_vcvtmu>; -defm FCVTZS : NeonI_2VMisc_fp_to_int<"fcvtzs", 0b1, 0b0, 0b11011, fp_to_sint>; -defm FCVTZU : NeonI_2VMisc_fp_to_int<"fcvtzu", 0b1, 0b1, 0b11011, fp_to_uint>; -defm FCVTAS : NeonI_2VMisc_fp_to_int<"fcvtas", 0b0, 0b0, 0b11100, - int_arm_neon_vcvtas>; -defm FCVTAU : NeonI_2VMisc_fp_to_int<"fcvtau", 0b0, 0b1, 0b11100, - int_arm_neon_vcvtau>; - -multiclass NeonI_2VMisc_int_to_fp opcode, SDPatternOperator Neon_Op> { - defm _ : NeonI_2VMisc_SD_Conv; -} - -defm SCVTF : NeonI_2VMisc_int_to_fp<"scvtf", 0b0, 0b0, 0b11101, sint_to_fp>; -defm UCVTF : NeonI_2VMisc_int_to_fp<"ucvtf", 0b0, 0b1, 0b11101, uint_to_fp>; - -multiclass NeonI_2VMisc_fp_to_fp opcode, SDPatternOperator Neon_Op> { - defm _ : NeonI_2VMisc_SD_Conv; -} - -defm FRINTN : NeonI_2VMisc_fp_to_fp<"frintn", 0b0, 0b0, 0b11000, - int_aarch64_neon_frintn>; -defm FRINTA : NeonI_2VMisc_fp_to_fp<"frinta", 0b0, 0b1, 0b11000, frnd>; -defm FRINTP : NeonI_2VMisc_fp_to_fp<"frintp", 0b1, 0b0, 0b11000, fceil>; -defm FRINTM : NeonI_2VMisc_fp_to_fp<"frintm", 0b0, 0b0, 0b11001, ffloor>; -defm FRINTX : NeonI_2VMisc_fp_to_fp<"frintx", 0b0, 0b1, 0b11001, frint>; -defm FRINTZ : NeonI_2VMisc_fp_to_fp<"frintz", 0b1, 0b0, 0b11001, ftrunc>; -defm FRINTI : NeonI_2VMisc_fp_to_fp<"frinti", 0b1, 0b1, 0b11001, fnearbyint>; -defm FRECPE : NeonI_2VMisc_fp_to_fp<"frecpe", 0b1, 0b0, 0b11101, - int_arm_neon_vrecpe>; -defm FRSQRTE : NeonI_2VMisc_fp_to_fp<"frsqrte", 0b1, 0b1, 0b11101, - int_arm_neon_vrsqrte>; -let SchedRW = [WriteFPSqrt, ReadFPSqrt] in { -defm FSQRT : NeonI_2VMisc_fp_to_fp<"fsqrt", 0b1, 0b1, 0b11111, fsqrt>; -} - -multiclass NeonI_2VMisc_S_Conv opcode, SDPatternOperator Neon_Op> { - def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode, - (outs VPR128:$Rd), (ins VPR128:$Rn), - asmop # "\t$Rd.4s, $Rn.4s", - [(set (v4i32 VPR128:$Rd), - (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; - - def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode, - (outs VPR64:$Rd), (ins VPR64:$Rn), - asmop # "\t$Rd.2s, $Rn.2s", - [(set (v2i32 VPR64:$Rd), - (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))], - NoItinerary>, - Sched<[WriteFPALU, ReadFPALU]>; -} - -defm URECPE : NeonI_2VMisc_S_Conv<"urecpe", 0b1, 0b0, 0b11100, - int_arm_neon_vrecpe>; -defm URSQRTE : 
NeonI_2VMisc_S_Conv<"ursqrte", 0b1, 0b1, 0b11100, - int_arm_neon_vrsqrte>; - -// Crypto Class -class NeonI_Cryptoaes_2v size, bits<5> opcode, - string asmop, SDPatternOperator opnode> - : NeonI_Crypto_AES, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - let Predicates = [HasNEON, HasCrypto]; -} - -def AESE : NeonI_Cryptoaes_2v<0b00, 0b00100, "aese", int_arm_neon_aese>; -def AESD : NeonI_Cryptoaes_2v<0b00, 0b00101, "aesd", int_arm_neon_aesd>; - -class NeonI_Cryptoaes size, bits<5> opcode, - string asmop, SDPatternOperator opnode> - : NeonI_Crypto_AES, - Sched<[WriteFPALU, ReadFPALU]>; - -def AESMC : NeonI_Cryptoaes<0b00, 0b00110, "aesmc", int_arm_neon_aesmc>; -def AESIMC : NeonI_Cryptoaes<0b00, 0b00111, "aesimc", int_arm_neon_aesimc>; - -class NeonI_Cryptosha_vv size, bits<5> opcode, - string asmop, SDPatternOperator opnode> - : NeonI_Crypto_SHA, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - let Predicates = [HasNEON, HasCrypto]; -} - -def SHA1SU1 : NeonI_Cryptosha_vv<0b00, 0b00001, "sha1su1", - int_arm_neon_sha1su1>; -def SHA256SU0 : NeonI_Cryptosha_vv<0b00, 0b00010, "sha256su0", - int_arm_neon_sha256su0>; - -class NeonI_Cryptosha_ss size, bits<5> opcode, - string asmop, SDPatternOperator opnode> - : NeonI_Crypto_SHA, - Sched<[WriteFPALU, ReadFPALU]> { - let Predicates = [HasNEON, HasCrypto]; - let hasSideEffects = 0; -} - -def SHA1H : NeonI_Cryptosha_ss<0b00, 0b00000, "sha1h", int_arm_neon_sha1h>; -def : Pat<(i32 (int_arm_neon_sha1h i32:$Rn)), - (COPY_TO_REGCLASS (SHA1H (COPY_TO_REGCLASS i32:$Rn, FPR32)), GPR32)>; - - -class NeonI_Cryptosha3_vvv size, bits<3> opcode, string asmop, - SDPatternOperator opnode> - : NeonI_Crypto_3VSHA, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - let Predicates = [HasNEON, HasCrypto]; -} - -def SHA1SU0 : NeonI_Cryptosha3_vvv<0b00, 0b011, "sha1su0", - int_arm_neon_sha1su0>; -def SHA256SU1 : NeonI_Cryptosha3_vvv<0b00, 0b110, "sha256su1", - int_arm_neon_sha256su1>; - -class NeonI_Cryptosha3_qqv size, bits<3> opcode, string asmop, - SDPatternOperator opnode> - : NeonI_Crypto_3VSHA, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - let Predicates = [HasNEON, HasCrypto]; -} - -def SHA256H : NeonI_Cryptosha3_qqv<0b00, 0b100, "sha256h", - int_arm_neon_sha256h>; -def SHA256H2 : NeonI_Cryptosha3_qqv<0b00, 0b101, "sha256h2", - int_arm_neon_sha256h2>; - -class NeonI_Cryptosha3_qsv size, bits<3> opcode, string asmop> - : NeonI_Crypto_3VSHA, - Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> { - let Constraints = "$src = $Rd"; - let hasSideEffects = 0; - let Predicates = [HasNEON, HasCrypto]; -} - -def SHA1C : NeonI_Cryptosha3_qsv<0b00, 0b000, "sha1c">; -def SHA1P : NeonI_Cryptosha3_qsv<0b00, 0b001, "sha1p">; -def SHA1M : NeonI_Cryptosha3_qsv<0b00, 0b010, "sha1m">; - -def : Pat<(int_arm_neon_sha1c v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk), - (SHA1C v4i32:$hash_abcd, - (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>; -def : Pat<(int_arm_neon_sha1m v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk), - (SHA1M v4i32:$hash_abcd, - (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>; -def : Pat<(int_arm_neon_sha1p v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk), - (SHA1P v4i32:$hash_abcd, - (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>; - -// Additional patterns to match shl to USHL. 
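// (Illustrative note, not in the original patch: SSHL/USHL shift each
// lane left when the signed per-lane amount is positive and right when
// it is negative. A generic IR 'shl' by a vector of amounts therefore
// selects directly to USHL, e.g.
//   shl <2 x i32> %v, <i32 3, i32 1>  ==>  ushl v0.2s, v0.2s, v1.2s
// while the sra/srl patterns further below first negate the amount vector.)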
-def : Pat<(v8i8 (shl (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))), - (USHLvvv_8B $Rn, $Rm)>; -def : Pat<(v4i16 (shl (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))), - (USHLvvv_4H $Rn, $Rm)>; -def : Pat<(v2i32 (shl (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))), - (USHLvvv_2S $Rn, $Rm)>; -def : Pat<(v1i64 (shl (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), - (USHLddd $Rn, $Rm)>; -def : Pat<(v16i8 (shl (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))), - (USHLvvv_16B $Rn, $Rm)>; -def : Pat<(v8i16 (shl (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))), - (USHLvvv_8H $Rn, $Rm)>; -def : Pat<(v4i32 (shl (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))), - (USHLvvv_4S $Rn, $Rm)>; -def : Pat<(v2i64 (shl (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))), - (USHLvvv_2D $Rn, $Rm)>; - -def : Pat<(v1i8 (shl (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))), - (EXTRACT_SUBREG - (USHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)), - sub_8)>; -def : Pat<(v1i16 (shl (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (EXTRACT_SUBREG - (USHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)), - sub_16)>; -def : Pat<(v1i32 (shl (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (EXTRACT_SUBREG - (USHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)), - sub_32)>; - -// Additional patterns to match sra, srl. -// For a vector right shift by vector, the shift amounts of SSHL/USHL are -// negative. Negate the vector of shift amount first. -def : Pat<(v8i8 (srl (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))), - (USHLvvv_8B $Rn, (NEG8b $Rm))>; -def : Pat<(v4i16 (srl (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))), - (USHLvvv_4H $Rn, (NEG4h $Rm))>; -def : Pat<(v2i32 (srl (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))), - (USHLvvv_2S $Rn, (NEG2s $Rm))>; -def : Pat<(v1i64 (srl (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), - (USHLddd $Rn, (NEGdd $Rm))>; -def : Pat<(v16i8 (srl (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))), - (USHLvvv_16B $Rn, (NEG16b $Rm))>; -def : Pat<(v8i16 (srl (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))), - (USHLvvv_8H $Rn, (NEG8h $Rm))>; -def : Pat<(v4i32 (srl (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))), - (USHLvvv_4S $Rn, (NEG4s $Rm))>; -def : Pat<(v2i64 (srl (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))), - (USHLvvv_2D $Rn, (NEG2d $Rm))>; - -def : Pat<(v1i8 (srl (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))), - (EXTRACT_SUBREG - (USHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - (NEG8b (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8))), - sub_8)>; -def : Pat<(v1i16 (srl (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (EXTRACT_SUBREG - (USHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - (NEG4h (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16))), - sub_16)>; -def : Pat<(v1i32 (srl (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (EXTRACT_SUBREG - (USHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (NEG2s (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32))), - sub_32)>; - -def : Pat<(v8i8 (sra (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))), - (SSHLvvv_8B $Rn, (NEG8b $Rm))>; -def : Pat<(v4i16 (sra (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))), - (SSHLvvv_4H $Rn, (NEG4h $Rm))>; -def : Pat<(v2i32 (sra (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))), - (SSHLvvv_2S $Rn, (NEG2s $Rm))>; -def : Pat<(v1i64 (sra (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), - (SSHLddd $Rn, (NEGdd $Rm))>; -def : Pat<(v16i8 (sra (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))), - (SSHLvvv_16B $Rn, (NEG16b $Rm))>; -def : Pat<(v8i16 (sra (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))), - (SSHLvvv_8H $Rn, (NEG8h $Rm))>; -def : Pat<(v4i32 (sra (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))), - (SSHLvvv_4S $Rn, (NEG4s $Rm))>; -def 
: Pat<(v2i64 (sra (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))), - (SSHLvvv_2D $Rn, (NEG2d $Rm))>; - -def : Pat<(v1i8 (sra (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))), - (EXTRACT_SUBREG - (SSHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8), - (NEG8b (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8))), - sub_8)>; -def : Pat<(v1i16 (sra (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (EXTRACT_SUBREG - (SSHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16), - (NEG4h (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16))), - sub_16)>; -def : Pat<(v1i32 (sra (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (EXTRACT_SUBREG - (SSHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (NEG2s (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32))), - sub_32)>; - -// -// Patterns for handling half-precision values -// - -// Convert between f16 value and f32 value -def : Pat<(f32 (f16_to_f32 (i32 GPR32:$Rn))), - (FCVTsh (EXTRACT_SUBREG (FMOVsw $Rn), sub_16))>; -def : Pat<(i32 (f32_to_f16 (f32 FPR32:$Rn))), - (FMOVws (SUBREG_TO_REG (i64 0), (f16 (FCVThs $Rn)), sub_16))>; - -// Convert f16 value coming in as i16 value to f32 -def : Pat<(f32 (f16_to_f32 (i32 (and (i32 GPR32:$Rn), 65535)))), - (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>; -def : Pat<(f32 (f16_to_f32 (i32 (assertzext GPR32:$Rn)))), - (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>; - -def : Pat<(f32 (f16_to_f32 (i32 (assertzext (i32 ( - f32_to_f16 (f32 FPR32:$Rn))))))), - (f32 FPR32:$Rn)>; - -// Patterns for vector extract of half-precision FP value in i16 storage type -def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract - (v4i16 VPR64:$Rn), neon_uimm2_bare:$Imm)), 65535)))), - (FCVTsh (f16 (DUPhv_H - (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - neon_uimm2_bare:$Imm)))>; - -def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract - (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)), 65535)))), - (FCVTsh (f16 (DUPhv_H (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)))>; - -// Patterns for vector insert of half-precision FP value 0 in i16 storage type -def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), - (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))), - (neon_uimm3_bare:$Imm))), - (v8i16 (INSELh (v8i16 VPR128:$Rn), - (v8i16 (SUBREG_TO_REG (i64 0), - (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)), - sub_16)), - neon_uimm3_bare:$Imm, 0))>; - -def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn), - (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))), - (neon_uimm2_bare:$Imm))), - (v4i16 (EXTRACT_SUBREG - (v8i16 (INSELh - (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - (v8i16 (SUBREG_TO_REG (i64 0), - (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)), - sub_16)), - neon_uimm2_bare:$Imm, 0)), - sub_64))>; - -// Patterns for vector insert of half-precision FP value in i16 storage type -def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), - (i32 (assertsext (i32 (fp_to_sint - (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))), - (neon_uimm3_bare:$Imm))), - (v8i16 (INSELh (v8i16 VPR128:$Rn), - (v8i16 (SUBREG_TO_REG (i64 0), - (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)), - sub_16)), - neon_uimm3_bare:$Imm, 0))>; - -def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn), - (i32 (assertsext (i32 (fp_to_sint - (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))), - (neon_uimm2_bare:$Imm))), - (v4i16 (EXTRACT_SUBREG - (v8i16 (INSELh - (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - (v8i16 (SUBREG_TO_REG (i64 0), - (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)), - sub_16)), - neon_uimm2_bare:$Imm, 
0)), - sub_64))>; - -def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), - (i32 (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)), - (neon_uimm3_bare:$Imm1))), - (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src), - neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>; - -// Patterns for vector copy of half-precision FP value in i16 storage type -def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn), - (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32 - (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)), - 65535)))))))), - (neon_uimm3_bare:$Imm1))), - (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src), - neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>; - -def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn), - (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32 - (vector_extract (v4i16 VPR64:$src), neon_uimm3_bare:$Imm2)), - 65535)))))))), - (neon_uimm3_bare:$Imm1))), - (v4i16 (EXTRACT_SUBREG - (v8i16 (INSELh - (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), - neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2)), - sub_64))>; - - diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp new file mode 100644 index 0000000..e7454be --- /dev/null +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -0,0 +1,942 @@ +//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that performs load / store related peephole +// optimizations. This pass should be run after register allocation. +// +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-ldst-opt" + +/// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine +/// load / store instructions to form ldp / stp instructions. 
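// Illustrative example (not part of the original patch): given two
// adjacent loads from consecutive slots,
//   ldr x0, [sp, #8]
//   ldr x1, [sp, #16]
// the pass emits a single load-pair,
//   ldp x0, x1, [sp, #8]
// and can likewise fold a base-register update into the memory access,
// e.g. "add x2, x2, #16; ldr x0, [x2]" becomes "ldr x0, [x2, #16]!".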
+ +STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); +STATISTIC(NumPostFolded, "Number of post-index updates folded"); +STATISTIC(NumPreFolded, "Number of pre-index updates folded"); +STATISTIC(NumUnscaledPairCreated, + "Number of load/store from unscaled generated"); + +static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", cl::init(20), + cl::Hidden); + +// Placeholder while testing unscaled load/store combining +static cl::opt<bool> +EnableAArch64UnscaledMemOp("aarch64-unscaled-mem-op", cl::Hidden, + cl::desc("Allow AArch64 unscaled load/store combining"), + cl::init(true)); + +namespace { +struct AArch64LoadStoreOpt : public MachineFunctionPass { + static char ID; + AArch64LoadStoreOpt() : MachineFunctionPass(ID) {} + + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + + // Scan the instructions looking for a load/store that can be combined + // with the current instruction into a load/store pair. + // Return the matching instruction if one is found, else MBB->end(). + // If a matching instruction is found, mergeForward is set to true if the + // merge is to remove the first instruction and replace the second with + // a pair-wise insn, and false if the reverse is true. + MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, + bool &mergeForward, + unsigned Limit); + // Merge the two instructions indicated into a single pair-wise instruction. + // If mergeForward is true, erase the first instruction and fold its + // operation into the second. If false, the reverse. Return the instruction + // following the first instruction (which may change during processing). + MachineBasicBlock::iterator + mergePairedInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, bool mergeForward); + + // Scan the instruction list to find a base register update that can + // be combined with the current instruction (a load or store) using + // pre or post indexed addressing with writeback. Scan forwards. + MachineBasicBlock::iterator + findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, + int Value); + + // Scan the instruction list to find a base register update that can + // be combined with the current instruction (a load or store) using + // pre or post indexed addressing with writeback. Scan backwards. + MachineBasicBlock::iterator + findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); + + // Merge a pre-index base register update into a ld/st instruction. + MachineBasicBlock::iterator + mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update); + + // Merge a post-index base register update into a ld/st instruction.
+ MachineBasicBlock::iterator + mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update); + + bool optimizeBlock(MachineBasicBlock &MBB); + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return "AArch64 load / store optimization pass"; + } + +private: + int getMemSize(MachineInstr *MemMI); +}; +char AArch64LoadStoreOpt::ID = 0; +} + +static bool isUnscaledLdst(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::STURSi: + return true; + case AArch64::STURDi: + return true; + case AArch64::STURQi: + return true; + case AArch64::STURWi: + return true; + case AArch64::STURXi: + return true; + case AArch64::LDURSi: + return true; + case AArch64::LDURDi: + return true; + case AArch64::LDURQi: + return true; + case AArch64::LDURWi: + return true; + case AArch64::LDURXi: + return true; + } +} + +// Size in bytes of the data moved by a load or store +int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { + switch (MemMI->getOpcode()) { + default: + llvm_unreachable("Opcode has unknown size!"); + case AArch64::STRSui: + case AArch64::STURSi: + return 4; + case AArch64::STRDui: + case AArch64::STURDi: + return 8; + case AArch64::STRQui: + case AArch64::STURQi: + return 16; + case AArch64::STRWui: + case AArch64::STURWi: + return 4; + case AArch64::STRXui: + case AArch64::STURXi: + return 8; + case AArch64::LDRSui: + case AArch64::LDURSi: + return 4; + case AArch64::LDRDui: + case AArch64::LDURDi: + return 8; + case AArch64::LDRQui: + case AArch64::LDURQi: + return 16; + case AArch64::LDRWui: + case AArch64::LDURWi: + return 4; + case AArch64::LDRXui: + case AArch64::LDURXi: + return 8; + } +} + +static unsigned getMatchingPairOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no pairwise equivalent!"); + case AArch64::STRSui: + case AArch64::STURSi: + return AArch64::STPSi; + case AArch64::STRDui: + case AArch64::STURDi: + return AArch64::STPDi; + case AArch64::STRQui: + case AArch64::STURQi: + return AArch64::STPQi; + case AArch64::STRWui: + case AArch64::STURWi: + return AArch64::STPWi; + case AArch64::STRXui: + case AArch64::STURXi: + return AArch64::STPXi; + case AArch64::LDRSui: + case AArch64::LDURSi: + return AArch64::LDPSi; + case AArch64::LDRDui: + case AArch64::LDURDi: + return AArch64::LDPDi; + case AArch64::LDRQui: + case AArch64::LDURQi: + return AArch64::LDPQi; + case AArch64::LDRWui: + case AArch64::LDURWi: + return AArch64::LDPWi; + case AArch64::LDRXui: + case AArch64::LDURXi: + return AArch64::LDPXi; + } +} + +static unsigned getPreIndexedOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no pre-indexed equivalent!"); + case AArch64::STRSui: return AArch64::STRSpre; + case AArch64::STRDui: return AArch64::STRDpre; + case AArch64::STRQui: return AArch64::STRQpre; + case AArch64::STRWui: return AArch64::STRWpre; + case AArch64::STRXui: return AArch64::STRXpre; + case AArch64::LDRSui: return AArch64::LDRSpre; + case AArch64::LDRDui: return AArch64::LDRDpre; + case AArch64::LDRQui: return AArch64::LDRQpre; + case AArch64::LDRWui: return AArch64::LDRWpre; + case AArch64::LDRXui: return AArch64::LDRXpre; + } +} + +static unsigned getPostIndexedOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no post-indexed equivalent!"); + case AArch64::STRSui: + return AArch64::STRSpost; + case AArch64::STRDui: + return AArch64::STRDpost; + case AArch64::STRQui: +
return AArch64::STRQpost; + case AArch64::STRWui: + return AArch64::STRWpost; + case AArch64::STRXui: + return AArch64::STRXpost; + case AArch64::LDRSui: + return AArch64::LDRSpost; + case AArch64::LDRDui: + return AArch64::LDRDpost; + case AArch64::LDRQui: + return AArch64::LDRQpost; + case AArch64::LDRWui: + return AArch64::LDRWpost; + case AArch64::LDRXui: + return AArch64::LDRXpost; + } +} + +MachineBasicBlock::iterator +AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + bool mergeForward) { + MachineBasicBlock::iterator NextI = I; + ++NextI; + // If NextI is the second of the two instructions to be merged, we need + // to skip one further. Either way we merge will invalidate the iterator, + // and we don't need to scan the new instruction, as it's a pairwise + // instruction, which we're not considering for further action anyway. + if (NextI == Paired) + ++NextI; + + bool IsUnscaled = isUnscaledLdst(I->getOpcode()); + int OffsetStride = + IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1; + + unsigned NewOpc = getMatchingPairOpcode(I->getOpcode()); + // Insert our new paired instruction after whichever of the paired + // instructions mergeForward indicates. + MachineBasicBlock::iterator InsertionPoint = mergeForward ? Paired : I; + // mergeForward also determines which instruction we copy the base register + // operand from, so that the merged instruction gets flags compatible with + // the input code. + MachineOperand &BaseRegOp = + mergeForward ? Paired->getOperand(1) : I->getOperand(1); + + // Which register is Rt and which is Rt2 depends on the offset order. + MachineInstr *RtMI, *Rt2MI; + if (I->getOperand(2).getImm() == + Paired->getOperand(2).getImm() + OffsetStride) { + RtMI = Paired; + Rt2MI = I; + } else { + RtMI = I; + Rt2MI = Paired; + } + // For unscaled inputs, convert the byte offset into the element offset + // used by the scaled pair instruction. + int OffsetImm = RtMI->getOperand(2).getImm(); + if (IsUnscaled && EnableAArch64UnscaledMemOp) + OffsetImm /= OffsetStride; + + // Construct the new instruction. + MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint, + I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(RtMI->getOperand(0)) + .addOperand(Rt2MI->getOperand(0)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + (void)MIB; + + // FIXME: Do we need/want to copy the mem operands from the source + // instructions? Probably. What uses them after this? + + DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Paired->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. + I->eraseFromParent(); + Paired->eraseFromParent(); + + return NextI; +} + +/// trackRegDefsUses - Remember what registers the specified instruction uses +/// and modifies.
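/// Note (illustrative): registers are recorded together with all of their
/// aliases via MCRegAliasIterator (with IncludeSelf), so e.g. a def of w8
/// also marks x8 as modified. This is what makes the later
/// ModifiedRegs[BaseReg]-style queries safe in the presence of sub- and
/// super-register writes.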
+static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, + BitVector &UsedRegs, + const TargetRegisterInfo *TRI) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (MO.isRegMask()) + ModifiedRegs.setBitsNotInMask(MO.getRegMask()); + + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (MO.isDef()) { + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + ModifiedRegs.set(*AI); + } else { + assert(MO.isUse() && "Reg operand not a def and not a use?!?"); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + UsedRegs.set(*AI); + } + } +} + +static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { + if (!IsUnscaled && (Offset > 63 || Offset < -64)) + return false; + if (IsUnscaled) { + // Convert the byte-offset used by unscaled into an "element" offset used + // by the scaled pair load/store instructions. + int elemOffset = Offset / OffsetStride; + if (elemOffset > 63 || elemOffset < -64) + return false; + } + return true; +} + +// Do alignment, specialized to power of 2 and for signed ints, +// avoiding having to do a C-style cast from uint64_t to int when +// using RoundUpToAlignment from include/llvm/Support/MathExtras.h. +// FIXME: Move this function to include/MathExtras.h? +static int alignTo(int Num, int PowOf2) { + return (Num + PowOf2 - 1) & ~(PowOf2 - 1); +} + +/// findMatchingInsn - Scan the instructions looking for a load/store that can +/// be combined with the current instruction into a load/store pair. +MachineBasicBlock::iterator +AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, + bool &mergeForward, unsigned Limit) { + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator MBBI = I; + MachineInstr *FirstMI = I; + ++MBBI; + + int Opc = FirstMI->getOpcode(); + bool mayLoad = FirstMI->mayLoad(); + bool IsUnscaled = isUnscaledLdst(Opc); + unsigned Reg = FirstMI->getOperand(0).getReg(); + unsigned BaseReg = FirstMI->getOperand(1).getReg(); + int Offset = FirstMI->getOperand(2).getImm(); + + // Early exit if the first instruction modifies the base register. + // e.g., ldr x0, [x0] + // Early exit if the offset is not possible to match. (6 bits of positive + // range, plus allow an extra one in case we find a later insn that matches + // with Offset-1.) + if (FirstMI->modifiesRegister(BaseReg, TRI)) + return E; + int OffsetStride = + IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1; + if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) + return E; + + // Track which registers have been modified and used between the first insn + // (inclusive) and the second insn. + BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + + // Now that we know this is a real instruction, count it. + ++Count; + + if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) { + // If we've found another instruction with the same opcode, check to see + // if the base and offset are compatible with our starting instruction. + // These instructions all have scaled immediate operands, so we just + // check for +1/-1.
Make sure to check the new instruction offset is + // actually an immediate and not a symbolic reference destined for + // a relocation. + // + // Pairwise instructions have a 7-bit signed offset field. Single insns + // have a 12-bit unsigned offset field. To be a valid combine, the + // final offset must be in range. + unsigned MIBaseReg = MI->getOperand(1).getReg(); + int MIOffset = MI->getOperand(2).getImm(); + if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || + (Offset + OffsetStride == MIOffset))) { + int MinOffset = Offset < MIOffset ? Offset : MIOffset; + // If this is a volatile load/store that otherwise matched, stop looking + // as something is going on that we don't have enough information to + // safely transform. Similarly, stop if we see a hint to avoid pairs. + if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) + return E; + // If the resultant immediate offset of merging these instructions + // is out of range for a pairwise instruction, bail and keep looking. + bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode()); + if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + continue; + } + // If the alignment requirements of the paired (scaled) instruction + // can't express the offset of the unscaled input, bail and keep + // looking. + if (IsUnscaled && EnableAArch64UnscaledMemOp && + (alignTo(MinOffset, OffsetStride) != MinOffset)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + continue; + } + // If the destination register of the loads is the same register, bail + // and keep looking. A load-pair instruction with both destination + // registers the same is UNPREDICTABLE and will result in an exception. + if (mayLoad && Reg == MI->getOperand(0).getReg()) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + continue; + } + + // If the Rt of the second instruction was not modified or used between + // the two instructions, we can combine the second into the first. + if (!ModifiedRegs[MI->getOperand(0).getReg()] && + !UsedRegs[MI->getOperand(0).getReg()]) { + mergeForward = false; + return MBBI; + } + + // Likewise, if the Rt of the first instruction is not modified or used + // between the two instructions, we can combine the first into the + // second. + if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] && + !UsedRegs[FirstMI->getOperand(0).getReg()]) { + mergeForward = true; + return MBBI; + } + // Unable to combine these instructions due to interference in between. + // Keep looking. + } + } + + // If the instruction wasn't a matching load or store, but does (or can) + // modify memory, stop searching, as we don't have alias analysis or + // anything like that to tell us whether the access is tromping on the + // locations we care about. The big one we want to catch is calls. + // + // FIXME: Theoretically, we can do better than that for SP and FP based + // references since we can effectively know where those are touching. It's + // unclear if it's worth the extra code, though. Most paired instructions + // will be sequential, perhaps with a few intervening non-memory related + // instructions. + if (MI->mayStore() || MI->isCall()) + return E; + // Likewise, if we're matching a store instruction, we don't want to + // move across a load, as it may be reading the same location. + if (FirstMI->mayStore() && MI->mayLoad()) + return E; + + // Update modified / uses register lists. 
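      // (Illustrative example of the interference checks above: in
      //    ldr x0, [x2]      // FirstMI
      //    add x3, x0, #1    // reads x0 between the two loads
      //    ldr x1, [x2, #8]  // candidate MI
      //  x0 is used in between, so FirstMI cannot sink down to MI, but x1 is
      //  untouched, so MI can hoist up and "ldp x0, x1, [x2]" is emitted at
      //  FirstMI's position, i.e. mergeForward == false.)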
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg]) + return E; + } + return E; +} + +MachineBasicBlock::iterator +AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update) { + assert((Update->getOpcode() == AArch64::ADDXri || + Update->getOpcode() == AArch64::SUBXri) && + "Unexpected base register update instruction to merge!"); + MachineBasicBlock::iterator NextI = I; + // Return the instruction following the merged instruction, which is + // the instruction following our unmerged load. Unless that's the add/sub + // instruction we're merging, in which case it's the one after that. + if (++NextI == Update) + ++NextI; + + int Value = Update->getOperand(2).getImm(); + assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && + "Can't merge 1 << 12 offset into pre-indexed load / store"); + if (Update->getOpcode() == AArch64::SUBXri) + Value = -Value; + + unsigned NewOpc = getPreIndexedOpcode(I->getOpcode()); + MachineInstrBuilder MIB = + BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(Update->getOperand(0)) + .addOperand(I->getOperand(0)) + .addOperand(I->getOperand(1)) + .addImm(Value); + (void)MIB; + + DEBUG(dbgs() << "Creating pre-indexed load/store."); + DEBUG(dbgs() << " Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Update->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions for the block. + I->eraseFromParent(); + Update->eraseFromParent(); + + return NextI; +} + +MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( + MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) { + assert((Update->getOpcode() == AArch64::ADDXri || + Update->getOpcode() == AArch64::SUBXri) && + "Unexpected base register update instruction to merge!"); + MachineBasicBlock::iterator NextI = I; + // Return the instruction following the merged instruction, which is + // the instruction following our unmerged load. Unless that's the add/sub + // instruction we're merging, in which case it's the one after that. + if (++NextI == Update) + ++NextI; + + int Value = Update->getOperand(2).getImm(); + assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && + "Can't merge 1 << 12 offset into post-indexed load / store"); + if (Update->getOpcode() == AArch64::SUBXri) + Value = -Value; + + unsigned NewOpc = getPostIndexedOpcode(I->getOpcode()); + MachineInstrBuilder MIB = + BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(Update->getOperand(0)) + .addOperand(I->getOperand(0)) + .addOperand(I->getOperand(1)) + .addImm(Value); + (void)MIB; + + DEBUG(dbgs() << "Creating post-indexed load/store."); + DEBUG(dbgs() << " Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Update->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions for the block. + I->eraseFromParent(); + Update->eraseFromParent(); + + return NextI; +} + +static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, + int Offset) { + switch (MI->getOpcode()) { + default: + break; + case AArch64::SUBXri: + // Negate the offset for a SUB instruction. 
+ Offset *= -1; + // FALLTHROUGH + case AArch64::ADDXri: + // Make sure it's a vanilla immediate operand, not a relocation or + // anything else we can't handle. + if (!MI->getOperand(2).isImm()) + break; + // Watch out for 1 << 12 shifted value. + if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) + break; + // If the instruction has the base register as source and dest and the + // immediate will fit in a signed 9-bit integer, then we have a match. + if (MI->getOperand(0).getReg() == BaseReg && + MI->getOperand(1).getReg() == BaseReg && + MI->getOperand(2).getImm() <= 255 && + MI->getOperand(2).getImm() >= -256) { + // If we have a non-zero Offset, we check that it matches the amount + // we're adding to the register. + if (!Offset || Offset == MI->getOperand(2).getImm()) + return true; + } + break; + } + return false; +} + +MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( + MachineBasicBlock::iterator I, unsigned Limit, int Value) { + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineInstr *MemMI = I; + MachineBasicBlock::iterator MBBI = I; + const MachineFunction &MF = *MemMI->getParent()->getParent(); + + unsigned DestReg = MemMI->getOperand(0).getReg(); + unsigned BaseReg = MemMI->getOperand(1).getReg(); + int Offset = MemMI->getOperand(2).getImm() * + TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + + // If the base register overlaps the destination register, we can't + // merge the update. + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + + // Scan forward looking for post-index opportunities. + // Updating instructions can't be formed if the memory insn already + // has an offset other than the value we're looking for. + if (Offset != Value) + return E; + + // Track which registers have been modified and used between the first insn + // (inclusive) and the second insn. + BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + ++MBBI; + for (unsigned Count = 0; MBBI != E; ++MBBI) { + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + + // Now that we know this is a real instruction, count it. + ++Count; + + // If we found a match, return it. + if (isMatchingUpdateInsn(MI, BaseReg, Value)) + return MBBI; + + // Update the status of what the instruction clobbered and used. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is used or modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) + return E; + } + return E; +} + +MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( + MachineBasicBlock::iterator I, unsigned Limit) { + MachineBasicBlock::iterator B = I->getParent()->begin(); + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineInstr *MemMI = I; + MachineBasicBlock::iterator MBBI = I; + const MachineFunction &MF = *MemMI->getParent()->getParent(); + + unsigned DestReg = MemMI->getOperand(0).getReg(); + unsigned BaseReg = MemMI->getOperand(1).getReg(); + int Offset = MemMI->getOperand(2).getImm(); + unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + + // If the load/store is the first instruction in the block, there's obviously + // not any matching update. Ditto if the memory offset isn't zero. 
+ if (MBBI == B || Offset != 0) + return E; + // If the base register overlaps the destination register, we can't + // merge the update. + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + + // Track which registers have been modified and used between the first insn + // (inclusive) and the second insn. + BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + --MBBI; + for (unsigned Count = 0; MBBI != B; --MBBI) { + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + + // Now that we know this is a real instruction, count it. + ++Count; + + // If we found a match, return it. + if (isMatchingUpdateInsn(MI, BaseReg, RegSize)) + return MBBI; + + // Update the status of what the instruction clobbered and used. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is used or modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) + return E; + } + return E; +} + +bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { + bool Modified = false; + // Two transformations to do here: + // 1) Find loads and stores that can be merged into a single load or store + // pair instruction. + // e.g., + // ldr x0, [x2] + // ldr x1, [x2, #8] + // ; becomes + // ldp x0, x1, [x2] + // 2) Find base register updates that can be merged into the load or store + // as a base-reg writeback. + // e.g., + // ldr x0, [x2] + // add x2, x2, #4 + // ; becomes + // ldr x0, [x2], #4 + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + MachineInstr *MI = MBBI; + switch (MI->getOpcode()) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: + // do the unscaled versions as well + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: { + // If this is a volatile load/store, don't mess with it. + if (MI->hasOrderedMemoryRef()) { + ++MBBI; + break; + } + // Make sure this is a reg+imm (as opposed to an address reloc). + if (!MI->getOperand(2).isImm()) { + ++MBBI; + break; + } + // Check if this load/store has a hint to avoid pair formation. + // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. + if (TII->isLdStPairSuppressed(MI)) { + ++MBBI; + break; + } + // Look ahead up to ScanLimit instructions for a pairable instruction. + bool mergeForward = false; + MachineBasicBlock::iterator Paired = + findMatchingInsn(MBBI, mergeForward, ScanLimit); + if (Paired != E) { + // Merge the loads into a pair. Keeping the iterator straight is a + // pain, so we let the merge routine tell us what the next instruction + // is after it's done mucking about. + MBBI = mergePairedInsns(MBBI, Paired, mergeForward); + + Modified = true; + ++NumPairCreated; + if (isUnscaledLdst(MI->getOpcode())) + ++NumUnscaledPairCreated; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions.
+ } + } + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + MachineInstr *MI = MBBI; + // Do update merging. It's simpler to keep this separate from the above + // switch, though not strictly necessary. + int Opc = MI->getOpcode(); + switch (Opc) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: + // do the unscaled versions as well + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: { + // Make sure this is a reg+imm (as opposed to an address reloc). + if (!MI->getOperand(2).isImm()) { + ++MBBI; + break; + } + // Look ahead up to ScanLimit instructions for a mergeable instruction. + MachineBasicBlock::iterator Update = + findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); + if (Update != E) { + // Merge the update into the ld/st. + MBBI = mergePostIdxUpdateInsn(MBBI, Update); + Modified = true; + ++NumPostFolded; + break; + } + // Don't know how to handle pre/post-index versions, so move to the next + // instruction. + if (isUnscaledLdst(Opc)) { + ++MBBI; + break; + } + + // Look back to try to find a pre-index instruction. For example, + // add x0, x0, #8 + // ldr x1, [x0] + // merged into: + // ldr x1, [x0, #8]! + Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); + if (Update != E) { + // Merge the update into the ld/st. + MBBI = mergePreIdxUpdateInsn(MBBI, Update); + Modified = true; + ++NumPreFolded; + break; + } + + // Look forward to try to find a base register update that can be folded + // in as a pre-index writeback. For example, + // ldr x1, [x0, #64] + // add x0, x0, #64 + // merged into: + // ldr x1, [x0, #64]! + + // The immediate in the load/store is scaled by the size of the register + // being loaded. The immediate in the add we're looking for, + // however, is not, so adjust here. + int Value = MI->getOperand(2).getImm() * + TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent())) + ->getSize(); + Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value); + if (Update != E) { + // Merge the update into the ld/st. + MBBI = mergePreIdxUpdateInsn(MBBI, Update); + Modified = true; + ++NumPreFolded; + break; + } + + // Nothing found. Just move to the next instruction. + ++MBBI; + break; + } + // FIXME: Do the other instructions. + } + } + + return Modified; +} + +bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { + const TargetMachine &TM = Fn.getTarget(); + TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo()); + TRI = TM.getRegisterInfo(); + + bool Modified = false; + for (auto &MBB : Fn) + Modified |= optimizeBlock(MBB); + + return Modified; +} + +// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep +// loads and stores near one another? + +/// createAArch64LoadStoreOptimizationPass - returns an instance of the load / store +/// optimization pass.
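/// (Illustrative) The pass is meant to run after register allocation; a
/// target would register it from a late PassConfig hook, e.g. something like:
///   if (TM->getOptLevel() != CodeGenOpt::None)
///     addPass(createAArch64LoadStoreOptimizationPass());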
+FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { + return new AArch64LoadStoreOpt(); +} diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index 3842bfd..ab6d375 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -1,4 +1,4 @@ -//===-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst -==// +//==-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst --==// // // The LLVM Compiler Infrastructure // @@ -12,146 +12,191 @@ // //===----------------------------------------------------------------------===// -#include "AArch64AsmPrinter.h" -#include "AArch64TargetMachine.h" +#include "AArch64MCInstLower.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "Utils/AArch64BaseInfo.h" -#include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/Mangler.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" - +#include "llvm/Support/CodeGen.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; -MCOperand -AArch64AsmPrinter::lowerSymbolOperand(const MachineOperand &MO, - const MCSymbol *Sym) const { - const MCExpr *Expr = 0; +AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, Mangler &mang, + AsmPrinter &printer) + : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {} - Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, OutContext); +MCSymbol * +AArch64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { + return Printer.getSymbol(MO.getGlobal()); +} - switch (MO.getTargetFlags()) { - case AArch64II::MO_GOT: - Expr = AArch64MCExpr::CreateGOT(Expr, OutContext); - break; - case AArch64II::MO_GOT_LO12: - Expr = AArch64MCExpr::CreateGOTLo12(Expr, OutContext); - break; - case AArch64II::MO_LO12: - Expr = AArch64MCExpr::CreateLo12(Expr, OutContext); - break; - case AArch64II::MO_DTPREL_G1: - Expr = AArch64MCExpr::CreateDTPREL_G1(Expr, OutContext); - break; - case AArch64II::MO_DTPREL_G0_NC: - Expr = AArch64MCExpr::CreateDTPREL_G0_NC(Expr, OutContext); - break; - case AArch64II::MO_GOTTPREL: - Expr = AArch64MCExpr::CreateGOTTPREL(Expr, OutContext); - break; - case AArch64II::MO_GOTTPREL_LO12: - Expr = AArch64MCExpr::CreateGOTTPRELLo12(Expr, OutContext); - break; - case AArch64II::MO_TLSDESC: - Expr = AArch64MCExpr::CreateTLSDesc(Expr, OutContext); - break; - case AArch64II::MO_TLSDESC_LO12: - Expr = AArch64MCExpr::CreateTLSDescLo12(Expr, OutContext); - break; - case AArch64II::MO_TPREL_G1: - Expr = AArch64MCExpr::CreateTPREL_G1(Expr, OutContext); - break; - case AArch64II::MO_TPREL_G0_NC: - Expr = AArch64MCExpr::CreateTPREL_G0_NC(Expr, OutContext); - break; - case AArch64II::MO_ABS_G3: - Expr = AArch64MCExpr::CreateABS_G3(Expr, OutContext); - break; - case AArch64II::MO_ABS_G2_NC: - Expr = AArch64MCExpr::CreateABS_G2_NC(Expr, OutContext); - break; - case AArch64II::MO_ABS_G1_NC: - Expr = AArch64MCExpr::CreateABS_G1_NC(Expr, OutContext); - break; - case AArch64II::MO_ABS_G0_NC: - Expr = AArch64MCExpr::CreateABS_G0_NC(Expr, OutContext); - break; - case AArch64II::MO_NO_FLAG: - // Expr is already correct - break; - default: - llvm_unreachable("Unexpected MachineOperand flag"); +MCSymbol * +AArch64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const { + return 
Printer.GetExternalSymbolSymbol(MO.getSymbolName()); +} + +MCOperand AArch64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO, + MCSymbol *Sym) const { + // FIXME: We would like an efficient form for this, so we don't have to do a + // lot of extra uniquing. + MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + if ((MO.getTargetFlags() & AArch64II::MO_GOT) != 0) { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_GOTPAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF; + else + assert(0 && "Unexpected target flags with MO_GOT on GV operand"); + } else if ((MO.getTargetFlags() & AArch64II::MO_TLS) != 0) { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_TLVPPAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF; + else + llvm_unreachable("Unexpected target flags with MO_TLS on GV operand"); + } else { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_PAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_PAGEOFF; } + const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::CreateAdd( + Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); + return MCOperand::CreateExpr(Expr); +} + +MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, + MCSymbol *Sym) const { + uint32_t RefFlags = 0; + if (MO.getTargetFlags() & AArch64II::MO_GOT) + RefFlags |= AArch64MCExpr::VK_GOT; + else if (MO.getTargetFlags() & AArch64II::MO_TLS) { + TLSModel::Model Model; + if (MO.isGlobal()) { + const GlobalValue *GV = MO.getGlobal(); + Model = Printer.TM.getTLSModel(GV); + } else { + assert(MO.isSymbol() && + StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" && + "unexpected external TLS symbol"); + Model = TLSModel::GeneralDynamic; + } + switch (Model) { + case TLSModel::InitialExec: + RefFlags |= AArch64MCExpr::VK_GOTTPREL; + break; + case TLSModel::LocalExec: + RefFlags |= AArch64MCExpr::VK_TPREL; + break; + case TLSModel::LocalDynamic: + RefFlags |= AArch64MCExpr::VK_DTPREL; + break; + case TLSModel::GeneralDynamic: + RefFlags |= AArch64MCExpr::VK_TLSDESC; + break; + } + } else { + // No modifier means this is a generic reference, classified as absolute for + // the cases where it matters (:abs_g0: etc). 
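    // (Illustrative) e.g. materializing a full 64-bit absolute address:
    //   movz x0, #:abs_g3:sym       // VK_ABS | VK_G3
    //   movk x0, #:abs_g2_nc:sym    // VK_ABS | VK_G2 | VK_NC
    //   movk x0, #:abs_g1_nc:sym    // VK_ABS | VK_G1 | VK_NC
    //   movk x0, #:abs_g0_nc:sym    // VK_ABS | VK_G0 | VK_NC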
+ RefFlags |= AArch64MCExpr::VK_ABS; + } + + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefFlags |= AArch64MCExpr::VK_PAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefFlags |= AArch64MCExpr::VK_PAGEOFF; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G3) + RefFlags |= AArch64MCExpr::VK_G3; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G2) + RefFlags |= AArch64MCExpr::VK_G2; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G1) + RefFlags |= AArch64MCExpr::VK_G1; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G0) + RefFlags |= AArch64MCExpr::VK_G0; + + if (MO.getTargetFlags() & AArch64II::MO_NC) + RefFlags |= AArch64MCExpr::VK_NC; + + const MCExpr *Expr = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx); if (!MO.isJTI() && MO.getOffset()) - Expr = MCBinaryExpr::CreateAdd(Expr, - MCConstantExpr::Create(MO.getOffset(), - OutContext), - OutContext); + Expr = MCBinaryExpr::CreateAdd( + Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); + + AArch64MCExpr::VariantKind RefKind; + RefKind = static_cast<AArch64MCExpr::VariantKind>(RefFlags); + Expr = AArch64MCExpr::Create(Expr, RefKind, Ctx); return MCOperand::CreateExpr(Expr); } -bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO, - MCOperand &MCOp) const { +MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO, + MCSymbol *Sym) const { + if (TargetTriple.isOSDarwin()) + return lowerSymbolOperandDarwin(MO, Sym); + + assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target"); + return lowerSymbolOperandELF(MO, Sym); +} + +bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO, + MCOperand &MCOp) const { switch (MO.getType()) { - default: llvm_unreachable("unknown operand type"); + default: + assert(0 && "unknown operand type"); case MachineOperand::MO_Register: + // Ignore all implicit register operands. if (MO.isImplicit()) return false; - assert(!MO.getSubReg() && "Subregs should be eliminated!"); MCOp = MCOperand::CreateReg(MO.getReg()); break; + case MachineOperand::MO_RegisterMask: + // Regmasks are like implicit defs.
+ return false; case MachineOperand::MO_Immediate: MCOp = MCOperand::CreateImm(MO.getImm()); break; - case MachineOperand::MO_FPImmediate: { - assert(MO.getFPImm()->isZero() && "Only fp imm 0.0 is supported"); - MCOp = MCOperand::CreateFPImm(0.0); - break; - } - case MachineOperand::MO_BlockAddress: - MCOp = lowerSymbolOperand(MO, GetBlockAddressSymbol(MO.getBlockAddress())); - break; - case MachineOperand::MO_ExternalSymbol: - MCOp = lowerSymbolOperand(MO, GetExternalSymbolSymbol(MO.getSymbolName())); + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::CreateExpr( + MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx)); break; case MachineOperand::MO_GlobalAddress: - MCOp = lowerSymbolOperand(MO, getSymbol(MO.getGlobal())); + MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); break; - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( - MO.getMBB()->getSymbol(), OutContext)); + case MachineOperand::MO_ExternalSymbol: + MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); break; case MachineOperand::MO_JumpTableIndex: - MCOp = lowerSymbolOperand(MO, GetJTISymbol(MO.getIndex())); + MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex())); break; case MachineOperand::MO_ConstantPoolIndex: - MCOp = lowerSymbolOperand(MO, GetCPISymbol(MO.getIndex())); + MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex())); + break; + case MachineOperand::MO_BlockAddress: + MCOp = LowerSymbolOperand( + MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress())); break; - case MachineOperand::MO_RegisterMask: - // Ignore call clobbers - return false; - } - return true; } -void llvm::LowerAArch64MachineInstrToMCInst(const MachineInstr *MI, - MCInst &OutMI, - AArch64AsmPrinter &AP) { +void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - MCOperand MCOp; - if (AP.lowerOperand(MO, MCOp)) + if (lowerOperand(MI->getOperand(i), MCOp)) OutMI.addOperand(MCOp); } } diff --git a/lib/Target/AArch64/AArch64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h new file mode 100644 index 0000000..ba50ba9 --- /dev/null +++ b/lib/Target/AArch64/AArch64MCInstLower.h @@ -0,0 +1,52 @@ +//===-- AArch64MCInstLower.h - Lower MachineInstr to MCInst ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64_MCINSTLOWER_H +#define AArch64_MCINSTLOWER_H + +#include "llvm/ADT/Triple.h" +#include "llvm/Support/Compiler.h" + +namespace llvm { +class AsmPrinter; +class MCAsmInfo; +class MCContext; +class MCInst; +class MCOperand; +class MCSymbol; +class MachineInstr; +class MachineModuleInfoMachO; +class MachineOperand; +class Mangler; + +/// AArch64MCInstLower - This class is used to lower a MachineInstr +/// into an MCInst.
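/// Typical use from an AsmPrinter subclass looks like this (a sketch; the
/// OutContext, Mang, and OutStreamer members are the usual AsmPrinter fields,
/// assumed here rather than taken from this patch):
///   AArch64MCInstLower MCInstLowering(OutContext, *Mang, *this);
///   MCInst TmpInst;
///   MCInstLowering.Lower(MI, TmpInst);
///   EmitToStreamer(OutStreamer, TmpInst);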
+class LLVM_LIBRARY_VISIBILITY AArch64MCInstLower { + MCContext &Ctx; + AsmPrinter &Printer; + Triple TargetTriple; + +public: + AArch64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer); + + bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; + void Lower(const MachineInstr *MI, MCInst &OutMI) const; + + MCOperand lowerSymbolOperandDarwin(const MachineOperand &MO, + MCSymbol *Sym) const; + MCOperand lowerSymbolOperandELF(const MachineOperand &MO, + MCSymbol *Sym) const; + MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; + + MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; + MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; +}; +} + +#endif diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp deleted file mode 100644 index f45d8f7..0000000 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===-- AArch64MachineFuctionInfo.cpp - AArch64 machine function info -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file just contains the anchor for the AArch64MachineFunctionInfo to -// force vtable emission. -// -//===----------------------------------------------------------------------===// -#include "AArch64MachineFunctionInfo.h" - -using namespace llvm; - -void AArch64MachineFunctionInfo::anchor() { } diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 33da54f..7c257ba 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -1,4 +1,4 @@ -//=- AArch64MachineFuctionInfo.h - AArch64 machine function info -*- C++ -*-==// +//=- AArch64MachineFunctionInfo.h - AArch64 machine function info -*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -11,17 +11,19 @@ // //===----------------------------------------------------------------------===// -#ifndef AARCH64MACHINEFUNCTIONINFO_H -#define AARCH64MACHINEFUNCTIONINFO_H +#ifndef AArch64MACHINEFUNCTIONINFO_H +#define AArch64MACHINEFUNCTIONINFO_H +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/MC/MCLinkerOptimizationHint.h" namespace llvm { -/// This class is derived from MachineFunctionInfo and contains private AArch64 -/// target-specific information for each MachineFunction. -class AArch64MachineFunctionInfo : public MachineFunctionInfo { - virtual void anchor(); +/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and +/// contains private AArch64-specific information for each MachineFunction. +class AArch64FunctionInfo : public MachineFunctionInfo { /// Number of bytes of arguments this function has on the stack. If the callee /// is expected to restore the argument stack this should be a multiple of 16, @@ -39,111 +41,123 @@ class AArch64MachineFunctionInfo : public MachineFunctionInfo { /// callee is expected to pop the args. unsigned ArgumentStackToRestore; - /// If the stack needs to be adjusted on frame entry in two stages, this - /// records the size of the first adjustment just prior to storing - /// callee-saved registers. The callee-saved slots are addressed assuming - /// SP == - InitialStackAdjust.
- unsigned InitialStackAdjust; + /// HasStackFrame - True if this function has a stack frame. Set by + /// processFunctionBeforeCalleeSavedScan(). + bool HasStackFrame; - /// Number of local-dynamic TLS accesses. - unsigned NumLocalDynamics; + /// \brief Amount of stack frame size, not including callee-saved registers. + unsigned LocalStackSize; - /// @see AArch64 Procedure Call Standard, B.3 - /// - /// The Frame index of the area where LowerFormalArguments puts the - /// general-purpose registers that might contain variadic parameters. - int VariadicGPRIdx; + /// \brief Number of TLS accesses using the special (combinable) + /// _TLS_MODULE_BASE_ symbol. + unsigned NumLocalDynamicTLSAccesses; - /// @see AArch64 Procedure Call Standard, B.3 - /// - /// The size of the frame object used to store the general-purpose registers - /// which might contain variadic arguments. This is the offset from - /// VariadicGPRIdx to what's stored in __gr_top. - unsigned VariadicGPRSize; + /// \brief FrameIndex for start of varargs area for arguments passed on the + /// stack. + int VarArgsStackIndex; - /// @see AArch64 Procedure Call Standard, B.3 - /// - /// The Frame index of the area where LowerFormalArguments puts the - /// floating-point registers that might contain variadic parameters. - int VariadicFPRIdx; + /// \brief FrameIndex for start of varargs area for arguments passed in + /// general purpose registers. + int VarArgsGPRIndex; - /// @see AArch64 Procedure Call Standard, B.3 - /// - /// The size of the frame object used to store the floating-point registers - /// which might contain variadic arguments. This is the offset from - /// VariadicFPRIdx to what's stored in __vr_top. - unsigned VariadicFPRSize; + /// \brief Size of the varargs area for arguments passed in general purpose + /// registers. + unsigned VarArgsGPRSize; - /// @see AArch64 Procedure Call Standard, B.3 - /// - /// The Frame index of an object pointing just past the last known stacked - /// argument on entry to a variadic function. This goes into the __stack field - /// of the va_list type. - int VariadicStackIdx; + /// \brief FrameIndex for start of varargs area for arguments passed in + /// floating-point registers. + int VarArgsFPRIndex; - /// The offset of the frame pointer from the stack pointer on function - /// entry. This is expected to be negative. - int FramePointerOffset; + /// \brief Size of the varargs area for arguments passed in floating-point + /// registers. 
+ unsigned VarArgsFPRSize; public: - AArch64MachineFunctionInfo() - : BytesInStackArgArea(0), - ArgumentStackToRestore(0), - InitialStackAdjust(0), - NumLocalDynamics(0), - VariadicGPRIdx(0), - VariadicGPRSize(0), - VariadicFPRIdx(0), - VariadicFPRSize(0), - VariadicStackIdx(0), - FramePointerOffset(0) {} - - explicit AArch64MachineFunctionInfo(MachineFunction &MF) - : BytesInStackArgArea(0), - ArgumentStackToRestore(0), - InitialStackAdjust(0), - NumLocalDynamics(0), - VariadicGPRIdx(0), - VariadicGPRSize(0), - VariadicFPRIdx(0), - VariadicFPRSize(0), - VariadicStackIdx(0), - FramePointerOffset(0) {} + AArch64FunctionInfo() + : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), + NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {} + + explicit AArch64FunctionInfo(MachineFunction &MF) + : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), + NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) { + (void)MF; + } unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } - void setBytesInStackArgArea (unsigned bytes) { BytesInStackArgArea = bytes;} + void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; } unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; } void setArgumentStackToRestore(unsigned bytes) { ArgumentStackToRestore = bytes; } - unsigned getInitialStackAdjust() const { return InitialStackAdjust; } - void setInitialStackAdjust(unsigned bytes) { InitialStackAdjust = bytes; } + bool hasStackFrame() const { return HasStackFrame; } + void setHasStackFrame(bool s) { HasStackFrame = s; } - unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } - void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } + void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } + unsigned getLocalStackSize() const { return LocalStackSize; } - int getVariadicGPRIdx() const { return VariadicGPRIdx; } - void setVariadicGPRIdx(int Idx) { VariadicGPRIdx = Idx; } + void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } + unsigned getNumLocalDynamicTLSAccesses() const { + return NumLocalDynamicTLSAccesses; + } - unsigned getVariadicGPRSize() const { return VariadicGPRSize; } - void setVariadicGPRSize(unsigned Size) { VariadicGPRSize = Size; } + int getVarArgsStackIndex() const { return VarArgsStackIndex; } + void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; } - int getVariadicFPRIdx() const { return VariadicFPRIdx; } - void setVariadicFPRIdx(int Idx) { VariadicFPRIdx = Idx; } + int getVarArgsGPRIndex() const { return VarArgsGPRIndex; } + void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; } - unsigned getVariadicFPRSize() const { return VariadicFPRSize; } - void setVariadicFPRSize(unsigned Size) { VariadicFPRSize = Size; } + unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; } + void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; } - int getVariadicStackIdx() const { return VariadicStackIdx; } - void setVariadicStackIdx(int Idx) { VariadicStackIdx = Idx; } + int getVarArgsFPRIndex() const { return VarArgsFPRIndex; } + void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; } - int getFramePointerOffset() const { return FramePointerOffset; } - void setFramePointerOffset(int Idx) { FramePointerOffset = Idx; } + unsigned getVarArgsFPRSize() const { return 
VarArgsFPRSize; } + void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; } -}; + typedef SmallPtrSet<const MachineInstr *, 16> SetOfInstructions; + + const SetOfInstructions &getLOHRelated() const { return LOHRelated; } + + // Shortcuts for LOH related types. + class MILOHDirective { + MCLOHType Kind; + /// Arguments of this directive. Order matters. + SmallVector<const MachineInstr *, 3> Args; + + public: + typedef SmallVectorImpl<const MachineInstr *> LOHArgs; + + MILOHDirective(MCLOHType Kind, const LOHArgs &Args) + : Kind(Kind), Args(Args.begin(), Args.end()) { + assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!"); + } + + MCLOHType getKind() const { return Kind; } + const LOHArgs &getArgs() const { return Args; } + }; + + typedef MILOHDirective::LOHArgs MILOHArgs; + typedef SmallVector<MILOHDirective, 32> MILOHContainer; + + const MILOHContainer &getLOHContainer() const { return LOHContainerSet; } + + /// Add a LOH directive of this @p Kind and this @p Args. + void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) { + LOHContainerSet.push_back(MILOHDirective(Kind, Args)); + LOHRelated.insert(Args.begin(), Args.end()); + } + +private: + // Hold the lists of LOHs. + MILOHContainer LOHContainerSet; + SetOfInstructions LOHRelated; +}; } // End llvm namespace -#endif +#endif // AArch64MACHINEFUNCTIONINFO_H diff --git a/lib/Target/AArch64/AArch64PerfectShuffle.h b/lib/Target/AArch64/AArch64PerfectShuffle.h new file mode 100644 index 0000000..b22fa24 --- /dev/null +++ b/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -0,0 +1,6586 @@ +//===-- AArch64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file, which was autogenerated by llvm-PerfectShuffle, contains data +// for the optimal way to build a perfect shuffle using AdvSIMD instructions. +// +//===----------------------------------------------------------------------===// + +// 31 entries have cost 0 +// 242 entries have cost 1 +// 1447 entries have cost 2 +// 3602 entries have cost 3 +// 1237 entries have cost 4 +// 2 entries have cost 5 + +// This table is 6561*4 = 26244 bytes in size.
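// Sketch of how a consumer decodes one 32-bit entry (this mirrors the
// decoding in the target's GeneratePerfectShuffle lowering helper; the field
// layout is assumed from that code, since this header stores only raw words):
//   [31:30] cost, [29:26] operation number, [25:13] LHS id, [12:0] RHS id
static inline unsigned getPerfectShuffleCost(unsigned PFEntry) {
  return PFEntry >> 30;                  // matches the "Cost N" annotations
}
static inline unsigned getPerfectShuffleOpNum(unsigned PFEntry) {
  return (PFEntry >> 26) & 0x0F;         // which AdvSIMD op (dup/ext/zip/...)
}
static inline unsigned getPerfectShuffleLHSID(unsigned PFEntry) {
  return (PFEntry >> 13) & ((1u << 13) - 1);
}
static inline unsigned getPerfectShuffleRHSID(unsigned PFEntry) {
  return PFEntry & ((1u << 13) - 1);
}
// The table index for a four-element mask M0..M3, each in [0,8] (8 = undef):
//   PFTableIndex = ((M0 * 9 + M1) * 9 + M2) * 9 + M3;   // 9^4 == 6561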
+static const unsigned PerfectShuffleTable[6561+1] = { + 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS + 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS + 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> + 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS + 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> + 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> + 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS + 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> + 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS + 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> + 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> + 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> + 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> + 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> + 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS + 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> + 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> + 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS + 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> + 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> + 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> + 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> + 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS + 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> + 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> + 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> + 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> + 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> + 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> + 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> + 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> + 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> + 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> + 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> + 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS + 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> + 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> + 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> + 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> + 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> + 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> + 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> + 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> + 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> + 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> + 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS + 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> + 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> + 
3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS + 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> + 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> + 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> + 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> + 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0> + 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> + 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> + 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> + 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> + 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> + 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> + 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS + 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS + 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS + 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> + 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS + 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> + 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS + 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> + 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> + 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> + 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> + 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> + 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> + 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> + 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS + 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> + 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> + 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> + 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> + 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS + 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> + 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> + 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> + 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS + 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS + 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> + 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> + 835584U, // <0,1,2,3>: Cost 0 copy LHS + 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS + 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> + 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> + 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> + 835584U, // <0,1,2,u>: Cost 0 copy LHS + 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> + 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> + 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> + 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> + 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS + 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> + 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> + 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> + 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> + 2568159334U, // <0,1,4,0>: Cost 3 vext1 
<3,0,1,4>, LHS + 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> + 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> + 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> + 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS + 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4> + 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS + 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> + 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> + 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> + 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> + 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> + 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> + 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> + 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> + 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> + 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS + 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> + 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> + 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> + 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS + 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> + 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> + 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> + 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> + 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> + 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> + 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> + 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> + 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> + 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> + 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> + 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> + 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> + 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS + 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS + 835584U, // <0,1,u,3>: Cost 0 copy LHS + 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS + 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> + 835584U, // <0,1,u,u>: Cost 0 copy LHS + 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> + 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS + 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0> + 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> + 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> + 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> + 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> + 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS + 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> + 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> + 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> + 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS + 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> + 
2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> + 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> + 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> + 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> + 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> + 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> + 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS + 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> + 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> + 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> + 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS + 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> + 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> + 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> + 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> + 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> + 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS + 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3> + 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> + 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> + 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS + 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS + 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS + 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> + 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> + 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> + 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> + 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> + 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> + 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> + 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS + 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> + 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> + 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> + 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> + 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> + 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> + 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> + 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> + 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> + 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> + 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> + 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> + 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> + 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> + 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2> + 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> + 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> + 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678563118U, // <0,2,u,2>: Cost 2 
vuzpl LHS, LHS + 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS + 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS + 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS + 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0> + 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> + 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> + 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> + 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS + 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> + 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> + 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> + 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS + 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> + 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> + 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> + 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> + 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> + 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> + 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> + 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> + 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> + 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS + 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> + 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> + 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS + 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598154746U, // <0,3,2,6>: Cost 3 vext1 , <6,2,7,3> + 2598155258U, // <0,3,2,7>: Cost 3 vext1 , <7,0,1,2> + 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS + 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> + 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> + 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> + 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3> + 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> + 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> + 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> + 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> + 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> + 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> + 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> + 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> + 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> + 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> + 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> + 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS + 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> + 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> + 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS + 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> + 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> + 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> + 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> + 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> + 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> + 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0> + 
2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> + 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> + 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> + 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7> + 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> + 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> + 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6> + 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> + 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> + 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> + 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> + 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> + 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> + 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> + 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> + 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> + 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> + 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> + 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> + 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS + 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> + 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> + 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS + 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598203898U, // <0,3,u,6>: Cost 3 vext1 , <6,2,7,3> + 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> + 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS + 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> + 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> + 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> + 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> + 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> + 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0> + 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS + 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS + 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> + 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> + 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> + 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS + 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS + 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS + 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS + 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS + 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> + 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> + 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> + 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS + 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS + 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS + 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> + 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> + 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> + 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3> + 3692349954U, 
// <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> + 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> + 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS + 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> + 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> + 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> + 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> + 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> + 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> + 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> + 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS + 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS + 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> + 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS + 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS + 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> + 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> + 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> + 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS + 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> + 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> + 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> + 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> + 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> + 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> + 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> + 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> + 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> + 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> + 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> + 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS + 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> + 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> + 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4> + 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS + 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> + 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> + 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS + 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS + 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> + 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS + 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS + 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS + 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> + 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS + 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> + 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS + 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> + 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> + 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS + 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> + 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> + 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS + 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, 
LHS + 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS + 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> + 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> + 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> + 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> + 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> + 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> + 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> + 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS + 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS + 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> + 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> + 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> + 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS + 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> + 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> + 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS + 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS + 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> + 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> + 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> + 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> + 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> + 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> + 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> + 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> + 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> + 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> + 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> + 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> + 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> + 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> + 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS + 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> + 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> + 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS + 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> + 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> + 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> + 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> + 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> + 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> + 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> + 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> + 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> + 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS + 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> + 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> + 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> + 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS + 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0> + 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> + 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> + 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0> + 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS + 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> + 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> + 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2> + 2562510134U, // 
<0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS + 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> + 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> + 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> + 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS + 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS + 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS + 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> + 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> + 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> + 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS + 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> + 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> + 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> + 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS + 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> + 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> + 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> + 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> + 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> + 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS + 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS + 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS + 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> + 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3> + 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> + 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS + 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> + 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> + 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS + 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS + 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> + 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> + 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> + 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2> + 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> + 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> + 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS + 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS + 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> + 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> + 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> + 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> + 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> + 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> + 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> + 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> + 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> + 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS + 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2> + 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> + 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> + 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS + 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> + 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS + 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS + 
2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> + 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> + 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> + 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> + 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> + 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> + 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS + 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> + 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> + 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> + 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> + 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> + 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> + 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> + 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> + 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> + 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> + 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> + 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> + 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> + 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> + 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> + 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> + 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> + 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> + 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS + 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> + 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> + 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> + 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> + 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS + 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS + 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> + 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS + 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> + 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> + 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> + 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> + 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> + 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> + 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS + 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> + 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> + 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> + 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> + 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> + 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> + 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> + 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> + 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> + 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS + 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> + 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> + 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> + 2586504502U, // <0,7,2,4>: Cost 
3 vext1 <6,0,7,2>, RHS + 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> + 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> + 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> + 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> + 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> + 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> + 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> + 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> + 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> + 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> + 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> + 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> + 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS + 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> + 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> + 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> + 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> + 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS + 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> + 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> + 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS + 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> + 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> + 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> + 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> + 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> + 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> + 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> + 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> + 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> + 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> + 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> + 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> + 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> + 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS + 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> + 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> + 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> + 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> + 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> + 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> + 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> + 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> + 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS + 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> + 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> + 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> + 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> + 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> + 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> + 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> + 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> + 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> + 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS + 2669066421U, // <0,7,u,6>: Cost 3 vext2 , + 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> + 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> + 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS + 1544896614U, // <0,u,0,1>: Cost 2 
vext2 <0,2,0,u>, LHS + 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS + 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, + 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS + 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, + 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> + 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS + 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> + 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS + 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS + 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS + 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, + 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS + 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS + 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> + 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS + 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, + 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS + 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> + 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> + 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> + 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> + 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS + 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> + 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS + 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS + 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, + 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS + 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS + 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> + 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, + 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, + 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS + 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> + 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS + 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS + 2262496983U, // <0,u,6,1>: Cost 3 vrev + 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> + 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, + 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS + 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> + 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> + 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> + 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> + 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS + 2562728854U, // <0,u,7,1>: Cost 3 vext1 
<2,0,u,7>, <1,2,3,0> + 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> + 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> + 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS + 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, + 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> + 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> + 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS + 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS + 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS + 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS + 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> + 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> + 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> + 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> + 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> + 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> + 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> + 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> + 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> + 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS + 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> + 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> + 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS + 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> + 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> + 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> + 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS + 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> + 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> + 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0> + 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> + 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> + 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> + 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> + 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> + 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> + 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> + 67944550U, // <1,0,3,2>: Cost 1 vrev LHS + 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> + 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS + 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> + 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> + 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> + 68386972U, // <1,0,3,u>: Cost 1 vrev LHS + 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> + 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> + 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> + 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> + 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1> + 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS + 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> + 3666383940U, // <1,0,4,7>: 
Cost 4 vext1 <7,1,0,4>, <7,1,0,4> + 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS + 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> + 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS + 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> + 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> + 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> + 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> + 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS + 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS + 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> + 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> + 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> + 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> + 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> + 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> + 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> + 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> + 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> + 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> + 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> + 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> + 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> + 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> + 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> + 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> + 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> + 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> + 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> + 67985515U, // <1,0,u,2>: Cost 1 vrev LHS + 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> + 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> + 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS + 2669082807U, // <1,0,u,6>: Cost 3 vext2 , + 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u> + 68427937U, // <1,0,u,u>: Cost 1 vrev LHS + 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> + 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS + 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> + 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> + 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> + 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> + 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> + 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> + 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> + 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS + 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> + 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> + 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> + 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> + 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS + 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2> + 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> + 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> + 2622637734U, // <1,1,2,3>: Cost 3 vext2 
<0,u,1,1>, <2,3,0,1> + 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS + 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> + 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> + 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> + 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> + 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> + 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> + 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> + 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS + 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> + 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> + 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> + 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> + 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS + 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS + 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> + 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> + 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> + 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS + 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS + 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> + 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS + 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> + 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> + 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> + 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> + 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> + 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> + 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> + 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> + 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> + 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> + 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> + 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7> + 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> + 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> + 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> + 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> + 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> + 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> + 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> + 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS + 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> + 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> + 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> + 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> + 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS + 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, + 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS + 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS + 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, + 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS + 2635251712U, // <1,2,0,0>: 
Cost 3 vext2 <3,0,1,2>, <0,0,0,0> + 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS + 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> + 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> + 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> + 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> + 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> + 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> + 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS + 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> + 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> + 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> + 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS + 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS + 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> + 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> + 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> + 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS + 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> + 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> + 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> + 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> + 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> + 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> + 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> + 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> + 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> + 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS + 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> + 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS + 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS + 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2> + 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> + 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> + 2598759198U, // <1,2,4,3>: Cost 3 vext1 , <3,u,1,2> + 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> + 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS + 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> + 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> + 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS + 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS + 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> + 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> + 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS + 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> + 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> + 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> + 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS + 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1> + 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> + 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> + 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> + 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> + 3763488716U, // <1,2,6,5>: 
Cost 4 vext3 <0,u,1,1>, <2,6,5,7> + 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> + 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> + 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7> + 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> + 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> + 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> + 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> + 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> + 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> + 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> + 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> + 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> + 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS + 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> + 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS + 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS + 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS + 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> + 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS + 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> + 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> + 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> + 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> + 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> + 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> + 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS + 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> + 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> + 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> + 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS + 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS + 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7> + 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7> + 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1> + 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS + 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3> + 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5> + 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3> + 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS + 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3> + 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2> + 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3> + 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS + 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2598826490U, // <1,3,3,6>: Cost 3 vext1 , <6,2,7,3> + 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7> + 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS + 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS + 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3> + 3832761290U, // <1,3,4,2>: 
Cost 4 vuzpl <1,2,3,4>, <4,1,2,3> + 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4> + 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS + 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6> + 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4> + 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS + 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS + 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7> + 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5> + 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5> + 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS + 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5> + 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4> + 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS + 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS + 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0> + 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1> + 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3> + 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7> + 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3> + 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, + 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3> + 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> + 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3> + 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7> + 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3> + 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6> + 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5> + 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1> + 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7> + 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1> + 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS + 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u> + 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2> + 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS + 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS + 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6> + 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS + 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS + 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4> + 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS + 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4> + 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5> + 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1> + 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2> + 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1> + 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1> + 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2> + 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4> + 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0> + 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6> + 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1> + 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS + 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0> + 2892402217U, // 
<1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+ 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+ 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+ 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+ 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+ 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+ 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+ 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+ 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+ 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+ 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+ 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+ 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+ 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+ 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+ 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+ 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+ 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+ 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+ 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+ 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+ 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+ 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+ 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+ 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+ 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+ 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+ 2598913774U, // <1,4,5,2>: Cost 3 vext1 , <2,3,u,1>
+ 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+ 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+ 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+ 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+ 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+ 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+ 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+ 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+ 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+ 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+ 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+ 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+ 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+ 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+ 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+ 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+ 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+ 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+ 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+ 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+ 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+ 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+ 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+ 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>,
+ 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+ 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+ 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+ 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+ 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+ 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+ 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+ 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+ 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+ 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+ 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+ 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+ 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+ 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+ 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+ 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+ 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+ 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+ 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+ 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+ 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+ 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+ 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+ 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+ 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+ 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+ 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+ 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+ 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+ 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+ 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+ 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+ 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+ 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+ 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+ 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+ 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+ 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+ 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+ 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+ 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+ 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+ 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+ 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+ 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+ 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+ 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+ 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+ 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+ 2667794530U, // <1,5,5,6>: Cost 3 vext2 , <5,6,7,0>
+ 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+ 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+ 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+ 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+ 2667794938U, // <1,5,6,2>: Cost 3 vext2 , <6,2,7,3>
+ 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+ 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+ 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+ 2667795256U, // <1,5,6,6>: Cost 3 vext2 , <6,6,6,6>
+ 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+ 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+ 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+ 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+ 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+ 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+ 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+ 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+ 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+ 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+ 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>,
+ 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+ 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>,
+ 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>,
+ 1594054682U, // <1,5,u,4>: Cost 2 vext2 ,
+ 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+ 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>,
+ 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+ 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+ 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+ 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+ 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+ 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+ 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+ 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+ 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+ 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+ 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+ 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+ 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+ 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+ 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+ 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+ 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+ 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+ 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+ 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+ 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+ 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+ 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+ 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+ 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+ 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+ 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+ 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+ 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+ 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>,
+ 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+ 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+ 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+ 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+ 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+ 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+ 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+ 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+ 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+ 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+ 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+ 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+ 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+ 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+ 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+ 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+ 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+ 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+ 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+ 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+ 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+ 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+ 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+ 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+ 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+ 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+ 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+ 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+ 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+ 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+ 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+ 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+ 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+ 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+ 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+ 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+ 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+ 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+ 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+ 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>,
+ 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+ 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+ 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>,
+ 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+ 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+ 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+ 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+ 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+ 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+ 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+ 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+ 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+ 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+ 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+ 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+ 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+ 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+ 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+ 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+ 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+ 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+ 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+ 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+ 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+ 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+ 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+ 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+ 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+ 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+ 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+ 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+ 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+ 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+ 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+ 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+ 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+ 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+ 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+ 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+ 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+ 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+ 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+ 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+ 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+ 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+ 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+ 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+ 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+ 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+ 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+ 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+ 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+ 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+ 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+ 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+ 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+ 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+ 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+ 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+ 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+ 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+ 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+ 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+ 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+ 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+ 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+ 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+ 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+ 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+ 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+ 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+ 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+ 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+ 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+ 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+ 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+ 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+ 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+ 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+ 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+ 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+ 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+ 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+ 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+ 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+ 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+ 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+ 1658771190U, // <1,u,0,5>: Cost 2 vext3 ,
+ 2736789248U, // <1,u,0,6>: Cost 3 vext3 ,
+ 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+ 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+ 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+ 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+ 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+ 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+ 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+ 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+ 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+ 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+ 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+ 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+ 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+ 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+ 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+ 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+ 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+ 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+ 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+ 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+ 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+ 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+ 115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+ 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+ 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+ 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+ 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+ 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+ 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+ 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+ 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+ 2263217967U, // <1,u,4,2>: Cost 3 vrev
+ 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+ 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+ 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+ 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>,
+ 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+ 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+ 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+ 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+ 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+ 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+ 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+ 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+ 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+ 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+ 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>,
+ 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+ 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>,
+ 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+ 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>,
+ 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+ 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+ 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+ 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+ 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+ 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+ 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+ 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+ 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+ 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>,
+ 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+ 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+ 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+ 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+ 115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+ 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+ 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+ 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+ 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+ 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+ 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+ 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+ 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+ 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+ 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+ 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+ 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+ 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+ 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+ 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+ 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+ 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+ 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+ 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+ 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+ 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+ 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+ 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+ 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+ 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+ 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+ 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+ 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+ 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+ 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+ 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+ 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+ 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+ 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+ 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+ 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+ 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+ 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+ 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+ 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+ 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+ 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+ 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+ 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+ 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+ 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+ 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+ 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+ 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+ 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+ 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+ 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+ 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+ 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+ 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+ 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+ 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+ 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+ 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+ 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+ 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+ 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+ 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+ 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+ 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+ 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+ 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+ 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+ 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+ 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+ 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+ 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+ 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+ 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+ 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+ 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+ 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+ 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+ 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+ 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+ 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+ 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+ 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+ 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+ 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+ 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+ 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+ 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+ 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+ 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+ 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+ 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+ 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+ 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+ 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+ 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+ 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+ 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+ 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+ 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+ 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+ 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+ 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+ 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+ 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+ 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+ 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+ 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+ 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+ 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+ 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+ 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+ 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+ 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+ 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+ 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+ 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+ 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+ 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+ 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+ 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+ 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+ 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+ 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+ 3740913668U, // <2,1,5,5>: Cost 4 vext2 , <5,5,5,5>
+ 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+ 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+ 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+ 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+ 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+ 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+ 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+ 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+ 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+ 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+ 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+ 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+ 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+ 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+ 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+ 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+ 3740915046U, // <2,1,7,4>: Cost 4 vext2 , <7,4,5,6>
+ 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+ 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+ 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+ 2669827714U, // <2,1,7,u>: Cost 3 vext2 , <7,u,1,2>
+ 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+ 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+ 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+ 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+ 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+ 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+ 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+ 2669828370U, // <2,1,u,7>: Cost 3 vext2 ,
+ 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+ 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+ 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+ 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+ 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+ 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+ 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+ 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+ 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+ 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+ 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+ 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+ 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+ 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+ 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+ 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+ 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+ 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+ 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+ 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+ 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+ 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+ 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+ 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+ 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+ 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+ 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+ 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+ 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+ 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+ 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+ 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+ 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+ 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+ 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+ 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+ 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+ 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+ 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+ 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+ 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+ 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+ 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+ 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+ 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+ 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+ 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+ 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+ 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+ 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+ 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+ 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+ 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+ 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+ 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+ 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+ 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+ 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+ 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+ 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+ 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+ 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+ 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+ 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+ 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+ 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+ 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+ 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+ 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+ 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+ 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+ 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+ 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+ 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+ 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+ 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+ 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+ 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+ 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+ 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+ 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+ 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+ 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+ 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+ 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+ 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+ 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+ 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+ 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+ 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+ 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+ 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+ 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+ 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+ 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+ 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+ 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+ 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+ 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+ 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+ 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+ 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+ 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+ 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+ 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+ 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+ 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+ 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+ 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+ 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+ 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+ 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+ 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+ 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+ 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+ 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+ 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+ 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+ 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS,
+ 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+ 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS,
+ 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS,
+ 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS,
+ 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+ 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS,
+ 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS,
+ 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+ 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+ 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+ 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+ 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+ 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+ 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+ 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+ 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+ 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+ 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+ 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+ 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+ 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+ 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+ 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+ 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+ 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+ 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+ 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+ 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+ 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+ 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+ 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+ 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+ 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+ 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+ 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+ 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+ 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+ 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+ 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+ 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+ 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+ 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+ 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+ 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+ 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+ 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+ 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+ 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+ 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+ 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+ 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+ 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+ 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+ 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+ 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+ 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+ 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+ 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+ 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+ 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+ 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+ 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+ 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+ 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+ 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+ 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+ 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+ 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+ 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+ 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+ 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+ 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+ 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+ 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+ 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+ 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+ 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+ 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+ 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+ 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+ 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+ 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+ 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+ 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+ 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+ 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+ 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+ 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+ 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+ 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+ 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+ 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+ 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+ 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+ 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+ 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+ 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+ 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+ 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+ 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+ 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+ 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+ 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+ 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+ 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+ 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+ 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+ 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+ 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+ 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+ 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+ 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+ 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+ 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+ 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+ 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+ 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+ 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+ 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+ 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+ 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+ 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+ 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+ 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+ 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+ 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+ 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+ 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+ 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+ 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+ 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+ 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+ 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+ 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+ 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+ 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+ 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+ 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+ 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+ 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>,
+ 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+ 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+ 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+ 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+ 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+ 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+ 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+ 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+ 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+ 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+ 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+ 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+ 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+ 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+ 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+ 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+ 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+ 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+ 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+ 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+ 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+ 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+ 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+ 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+ 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+ 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+ 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+ 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+ 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+ 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+ 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+ 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+ 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+ 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+ 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+ 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+ 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+ 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+ 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+ 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+ 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+ 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2667875700U, // <2,6,4,6>: Cost 3 vext2 , <4,6,4,6>
+ 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+ 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+ 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+ 2667876048U, // <2,6,5,1>: Cost 3 vext2 , <5,1,7,3>
+ 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+ 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+ 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+ 2667876356U, // <2,6,5,5>: Cost 3 vext2 , <5,5,5,5>
+ 2667876450U, // <2,6,5,6>: Cost 3 vext2 , <5,6,7,0>
+ 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+ 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+ 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+ 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+ 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+ 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+ 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+ 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+ 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+ 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+ 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+ 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+ 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+ 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+ 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+ 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+ 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+ 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+ 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>,
+ 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+ 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>,
+ 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+ 1594136612U, // <2,6,u,4>: Cost 2 vext2 ,
+ 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+ 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>,
+ 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+ 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+ 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+ 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+ 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+ 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+ 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+ 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+ 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+ 2599760953U, // <2,7,0,7>: Cost 3 vext1 , <7,0,u,2>
+ 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+ 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+ 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+ 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+ 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+ 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+ 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+ 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+ 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+ 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+ 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+ 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+ 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+ 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+ 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+ 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+ 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+ 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+ 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+ 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+ 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+ 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+ 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+ 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+ 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+ 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+ 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+ 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+ 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+ 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+ 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+ 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+ 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+ 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+ 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+ 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+ 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+ 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+ 2599800118U, // <2,7,5,4>: Cost 3 vext1 , RHS
+ 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+ 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+ 2599802214U, // <2,7,5,7>: Cost 3 vext1 , <7,4,5,6>
+ 2599802670U, // <2,7,5,u>: Cost 3 vext1 , LHS
+ 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+ 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+ 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+ 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+ 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+ 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+ 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+ 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+ 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+ 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+ 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+ 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+ 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+ 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+ 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+ 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+ 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+ 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+ 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+ 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+ 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+ 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+ 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+ 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+ 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+ 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+ 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+ 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+ 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+ 1658631909U, // <2,u,0,3>: Cost 2 vext3 ,
+ 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+ 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+ 1658853120U, // <2,u,0,6>: Cost 2 vext3 ,
+ 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+ 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+ 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+ 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+ 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+ 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+ 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+ 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+ 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+ 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>,
+ 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+ 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+ 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+ 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+ 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+ 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+ 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+ 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+ 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+ 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+ 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+ 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+ 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+ 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+ 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+ 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+ 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+ 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+ 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+ 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+ 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+ 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+ 1190213513U, // <2,u,4,3>: Cost 2 vrev
+ 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+ 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+ 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+ 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+ 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+ 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+ 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+ 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+ 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>,
+ 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+ 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+ 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+ 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+ 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+ 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+ 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>,
+ 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+ 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+ 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+ 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+ 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+ 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+ 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+ 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+ 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+ 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+ 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+ 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+ 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+ 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+ 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+ 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+ 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+ 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+ 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+ 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+ 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+ 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+ 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+ 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+ 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+ 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+ 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+ 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+ 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+ 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+ 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+ 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+ 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+ 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+ 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+ 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+ 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+ 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+ 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+ 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+ 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+ 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+ 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+ 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+ 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+ 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+ 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+ 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+ 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+ 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+ 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+ 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+ 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+ 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+ 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+ 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+ 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+ 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+ 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+ 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+ 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+ 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+ 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+ 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+ 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2666573172U, // <3,0,4,6>: Cost 3 vext2 , <4,6,4,6>
+ 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+ 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+ 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+ 2666573520U, // <3,0,5,1>: Cost 3 vext2 , <5,1,7,3>
+ 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+ 2666573766U, // <3,0,5,4>: Cost 3 vext2 , <5,4,7,6>
+ 2666573828U, // <3,0,5,5>: Cost 3 vext2 , <5,5,5,5>
+ 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+ 2666573992U, // <3,0,5,7>: Cost 3 vext2 , <5,7,5,7>
+ 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+ 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+ 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+ 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+ 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+ 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+ 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+ 2666574648U, // <3,0,6,6>: Cost 3 vext2 , <6,6,6,6>
+ 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+ 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+ 2666574842U, // <3,0,7,0>: Cost 3 vext2 , <7,0,1,2>
+ 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+ 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+ 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+ 2666575206U, // <3,0,7,4>: Cost 3 vext2 , <7,4,5,6>
+ 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+ 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+ 2666575468U, // <3,0,7,7>: Cost 3 vext2 , <7,7,7,7>
+ 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+ 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+ 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+ 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+ 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+ 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+ 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+ 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>,
+ 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+ 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+ 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+ 2618802278U, // <3,1,0,1>:
Cost 3 vext2 <0,2,3,1>, LHS + 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1> + 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS + 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1> + 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6> + 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0> + 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> + 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> + 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1> + 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> + 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5> + 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3> + 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> + 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1> + 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2> + 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0> + 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS + 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7> + 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0> + 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0> + 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS + 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1> + 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS + 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3> + 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS + 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5> + 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5> + 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5> + 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS + 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS + 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS + 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4> + 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5> + 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> + 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> + 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> + 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> + 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7> + 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0> + 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1> + 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7> + 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5> + 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7> + 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0> + 2685633806U, // <3,1,6,u>: Cost 
3 vext3 LHS, <1,6,u,7> + 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS + 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1> + 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2> + 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS + 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7> + 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7> + 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7> + 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS + 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS + 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> + 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0> + 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS + 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3> + 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> + 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS + 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0> + 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> + 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7> + 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0> + 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS + 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> + 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> + 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0> + 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS + 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> + 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1> + 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> + 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> + 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> + 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> + 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6> + 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3> + 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3> + 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> + 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> + 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2> + 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3> + 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> + 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> + 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2> + 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> + 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> + 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS + 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4> + 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4> + 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS + 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0> + 
2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4> + 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS + 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS + 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> + 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> + 2665263108U, // <3,2,5,5>: Cost 3 vext2 , <5,5,5,5> + 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2665263272U, // <3,2,5,7>: Cost 3 vext2 , <5,7,5,7> + 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> + 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> + 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> + 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> + 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> + 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2665263950U, // <3,2,6,7>: Cost 3 vext2 , <6,7,0,1> + 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 2665264122U, // <3,2,7,0>: Cost 3 vext2 , <7,0,1,2> + 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3> + 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2> + 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS + 2665264486U, // <3,2,7,4>: Cost 3 vext2 , <7,4,5,6> + 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7> + 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7> + 2665264748U, // <3,2,7,7>: Cost 3 vext2 , <7,7,7,7> + 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS + 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> + 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS + 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> + 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0> + 2665265408U, // <3,2,u,7>: Cost 3 vext2 , + 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> + 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0> + 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2> + 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0> + 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2> + 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> + 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2> + 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2> + 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7> + 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> + 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> + 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> + 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3> + 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1> + 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS + 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> + 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3> + 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3> + 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3> + 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS + 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3> + 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3> + 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0> + 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS + 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> + 2685634918U, // <3,3,2,6>: Cost 3 
vext3 LHS, <3,2,6,3> + 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3> + 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> + 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS + 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3> + 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3> + 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS + 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5> + 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7> + 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3> + 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS + 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS + 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> + 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4> + 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6> + 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4> + 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> + 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS + 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7> + 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> + 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS + 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> + 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> + 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5> + 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS + 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5> + 2665934946U, // <3,3,5,6>: Cost 3 vext2 , <5,6,7,0> + 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS + 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> + 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7> + 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> + 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7> + 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7> + 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7> + 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6> + 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3> + 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3> + 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS + 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> + 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7> + 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3> + 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS + 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7> + 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3> + 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7> + 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS + 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS + 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> + 1592858504U, // <3,3,u,2>: Cost 2 vext2 , + 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS + 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> + 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3> + 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS + 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0> + 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS + 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> + 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4> + 2624799058U, // <3,4,0,4>: Cost 
3 vext2 <1,2,3,4>, <0,4,1,5> + 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1> + 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> + 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0> + 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS + 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> + 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> + 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> + 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3> + 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS + 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0> + 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4> + 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> + 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS + 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> + 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> + 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1> + 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3> + 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3> + 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0> + 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4> + 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0> + 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> + 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> + 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> + 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3> + 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1> + 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS + 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS + 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1> + 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2> + 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS + 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4> + 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> + 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS + 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> + 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4> + 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS + 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS + 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> + 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2> + 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS + 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7> + 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS + 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5> + 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS + 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1> + 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6> + 2666607098U, // <3,4,6,2>: Cost 3 vext2 , <6,2,7,3> + 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6> + 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7> + 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4> + 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> + 2666607610U, // <3,4,7,0>: Cost 3 vext2 , <7,0,1,2> + 
3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5> + 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> + 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7> + 2666607974U, // <3,4,7,4>: Cost 3 vext2 , <7,4,5,6> + 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0> + 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0> + 2666608236U, // <3,4,7,7>: Cost 3 vext2 , <7,7,7,7> + 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4> + 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS + 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS + 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> + 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, + 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS + 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS + 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS + 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u> + 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS + 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0> + 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS + 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5> + 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4> + 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> + 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1> + 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0> + 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS + 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS + 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1> + 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5> + 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5> + 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> + 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3> + 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7> + 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3> + 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3> + 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3> + 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> + 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2> + 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5> + 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5> + 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> + 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7> + 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3> + 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5> + 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2> + 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5> + 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4> + 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3> + 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> + 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5> + 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6> + 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS + 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS + 2600304742U, // <3,5,4,0>: Cost 3 vext1 , LHS + 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5> + 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4> + 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0> + 2600308022U, // <3,5,4,4>: Cost 3 vext1 , RHS + 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS + 2772454710U, // <3,5,4,6>: 
Cost 3 vuzpl <3,4,5,6>, RHS + 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> + 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6> + 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS + 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3> + 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> + 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5> + 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS + 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6> + 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7> + 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7> + 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1> + 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7> + 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6> + 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4> + 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5> + 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7> + 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7> + 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0> + 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0> + 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS + 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> + 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> + 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> + 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS + 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0> + 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7> + 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS + 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS + 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> + 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2> + 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2> + 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS + 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> + 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3> + 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS + 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS + 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> + 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4> + 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4> + 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> + 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> + 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> + 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2> + 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3> + 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1> + 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6> + 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1> + 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> + 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3> + 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3> + 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3> + 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6> + 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS + 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> + 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> + 3763581395U, // <3,6,2,3>: Cost 4 vext3 
LHS, <6,2,3,0> + 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> + 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6> + 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3> + 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3> + 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3> + 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2> + 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3> + 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3> + 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3> + 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6> + 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6> + 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2> + 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS + 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6> + 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> + 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5> + 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6> + 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> + 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> + 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0> + 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS + 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6> + 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS + 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2> + 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> + 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5> + 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6> + 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6> + 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5> + 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS + 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1> + 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> + 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3> + 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6> + 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> + 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> + 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> + 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> + 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> + 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> + 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7> + 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3> + 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> + 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1> + 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2> + 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS + 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1> + 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> + 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> + 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u> + 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6> + 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> + 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> + 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3> + 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, 
<6,u,u,1> + 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0> + 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2> + 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0> + 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5> + 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> + 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> + 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> + 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> + 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1> + 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0> + 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7> + 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS + 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> + 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> + 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> + 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7> + 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS + 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3> + 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2> + 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1> + 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS + 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7> + 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> + 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> + 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> + 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> + 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3> + 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> + 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3> + 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> + 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7> + 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7> + 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7> + 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2> + 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS + 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> + 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> + 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7> + 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS + 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS + 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4> + 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6> + 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS + 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2> + 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3> + 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3> + 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> + 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5> + 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5> + 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7> + 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS + 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS + 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> + 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> + 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> + 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> + 
2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> + 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> + 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> + 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> + 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> + 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS + 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> + 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> + 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> + 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS + 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> + 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> + 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> + 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, + 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, + 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS + 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, + 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS + 1595545808U, // <3,7,u,6>: Cost 2 vext2 , + 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS + 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, + 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, + 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, + 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, + 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, + 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, + 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, + 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> + 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> + 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS + 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, + 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS + 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> + 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> + 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, + 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> + 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> + 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, + 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, + 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> + 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, + 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, + 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, + 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> + 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, + 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS + 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, + 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, + 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, + 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS + 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS + 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, + 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, + 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, + 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, + 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, + 
1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, + 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, + 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS + 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, + 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> + 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, + 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS + 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS + 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, + 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS + 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, + 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> + 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, + 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, + 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, + 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, + 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, + 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS + 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> + 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> + 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS + 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, + 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS + 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, + 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, + 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS + 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS + 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, + 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, + 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS + 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, + 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS + 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> + 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> + 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> + 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> + 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> + 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> + 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> + 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> + 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> + 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS + 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> + 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> + 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS + 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> + 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> + 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> + 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> + 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4> + 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> + 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> + 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> + 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> + 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 3668358453U, // <4,0,2,7>: Cost 4 
vext1 <7,4,0,2>, <7,4,0,2> + 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> + 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS + 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4> + 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> + 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS + 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> + 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> + 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> + 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS + 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> + 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> + 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> + 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS + 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS + 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> + 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> + 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS + 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS + 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS + 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> + 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> + 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS + 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> + 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> + 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5> + 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS + 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS + 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> + 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS + 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> + 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS + 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> + 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> + 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> + 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS + 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2> + 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS + 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS + 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0> + 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> + 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> + 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> + 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> + 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS + 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS + 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS + 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> + 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS + 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS + 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> + 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> + 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS + 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> + 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> + 2622857554U, // 
<4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> + 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> + 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> + 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> + 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS + 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> + 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> + 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> + 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> + 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> + 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> + 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> + 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> + 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> + 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS + 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> + 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> + 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> + 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS + 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> + 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> + 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> + 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> + 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS + 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> + 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> + 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4> + 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS + 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> + 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> + 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> + 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> + 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> + 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> + 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS + 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> + 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS + 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> + 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0> + 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS + 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> + 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5> + 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> + 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> + 2600686074U, // <4,1,5,6>: Cost 3 vext1 , <6,2,7,3> + 2600686586U, // <4,1,5,7>: Cost 3 vext1 , <7,0,1,2> + 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS + 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS + 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> + 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> + 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS + 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS + 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> + 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> + 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> + 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS 
+ 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> + 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> + 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> + 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> + 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> + 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> + 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> + 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS + 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> + 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> + 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> + 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS + 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS + 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2600686586U, // <4,1,u,7>: Cost 3 vext1 , <7,0,1,2> + 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS + 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> + 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS + 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> + 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> + 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> + 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> + 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> + 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> + 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS + 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> + 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> + 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> + 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS + 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> + 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> + 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> + 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> + 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> + 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2> + 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> + 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> + 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> + 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> + 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> + 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> + 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> + 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> + 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> + 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> + 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5> + 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> + 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4> + 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4> + 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1> + 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS + 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> + 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4> + 2699208469U, // 
<4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4> + 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS + 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4> + 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0> + 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS + 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS + 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0> + 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2> + 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS + 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS + 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7> + 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7> + 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS + 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS + 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS + 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2> + 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2> + 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS + 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS + 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6> + 2600767994U, // <4,2,6,6>: Cost 3 vext1 , <6,2,7,3> + 2600768506U, // <4,2,6,7>: Cost 3 vext1 , <7,0,1,2> + 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS + 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2> + 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2> + 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4> + 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4> + 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7> + 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4> + 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7> + 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS + 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2> + 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2> + 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS + 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS + 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u> + 2600784890U, // <4,2,u,7>: Cost 3 vext1 , <7,0,1,2> + 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS + 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0> + 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2> + 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4> + 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3> + 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1> + 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> + 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0> + 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0> + 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2> + 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> + 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1> + 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> + 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4> + 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0> + 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3> + 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1> + 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3> + 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4> + 3626770534U, // <4,3,2,0>: 
Cost 4 vext1 <0,4,3,2>, LHS + 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> + 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2> + 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4> + 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS + 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4> + 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3> + 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3> + 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3> + 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1> + 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1> + 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3> + 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4> + 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7> + 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7> + 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7> + 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> + 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> + 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4> + 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0> + 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5> + 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6> + 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4> + 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2> + 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1> + 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS + 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5> + 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5> + 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3> + 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS + 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5> + 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5> + 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4> + 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS + 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS + 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6> + 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6> + 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3> + 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS + 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> + 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6> + 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4> + 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6> + 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1> + 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5> + 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7> + 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7> + 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5> + 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7> + 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7> + 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4> + 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1> + 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS + 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2> + 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u> + 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4> + 2559053110U, // 
<4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS + 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> + 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u> + 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4> + 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u> + 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4> + 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1> + 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0> + 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1> + 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2> + 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0> + 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS + 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2> + 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1> + 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> + 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3> + 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3> + 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4> + 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3> + 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3> + 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3> + 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4> + 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4> + 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2> + 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4> + 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4> + 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7> + 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4> + 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4> + 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4> + 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> + 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> + 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> + 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> + 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> + 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4> + 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> + 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS + 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> + 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> + 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> + 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS + 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS + 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS + 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS + 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS + 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> + 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> + 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> + 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS + 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS + 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> + 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 
2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS + 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> + 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> + 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> + 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS + 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> + 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS + 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> + 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS + 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> + 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> + 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> + 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> + 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> + 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> + 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> + 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS + 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> + 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS + 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS + 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> + 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS + 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> + 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> + 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> + 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> + 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7> + 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> + 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> + 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS + 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2> + 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> + 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> + 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS + 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> + 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> + 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> + 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> + 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS + 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS + 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> + 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> + 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> + 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> + 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> + 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> + 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS + 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> + 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> + 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> + 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> + 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> + 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, 
<3,4,5,0> + 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> + 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> + 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> + 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> + 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS + 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> + 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> + 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> + 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS + 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> + 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> + 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS + 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS + 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> + 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> + 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> + 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> + 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> + 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> + 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS + 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS + 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS + 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> + 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> + 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6> + 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS + 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> + 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> + 27705344U, // <4,5,6,7>: Cost 0 copy RHS + 27705344U, // <4,5,6,u>: Cost 0 copy RHS + 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS + 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4> + 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> + 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> + 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS + 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> + 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> + 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> + 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS + 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS + 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, + 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> + 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS + 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, + 27705344U, // <4,5,u,7>: Cost 0 copy RHS + 27705344U, // <4,5,u,u>: Cost 0 copy RHS + 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> + 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> + 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> + 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> + 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7> + 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> + 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS + 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS + 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> + 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> + 2618917782U, 
// <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> + 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> + 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> + 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> + 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> + 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS + 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> + 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> + 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> + 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> + 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> + 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> + 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> + 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> + 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> + 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> + 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> + 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> + 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> + 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> + 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> + 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> + 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> + 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> + 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> + 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS + 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> + 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> + 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> + 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS + 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS + 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> + 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS + 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS + 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3> + 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> + 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> + 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> + 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> + 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> + 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS + 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS + 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS + 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> + 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> + 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> + 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> + 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> + 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> + 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS + 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS + 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> + 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> + 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> + 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> + 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> + 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 3852915914U, // <4,6,7,6>: Cost 4 
vuzpl RHS, <7,2,6,3> + 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2> + 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS + 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS + 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, + 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> + 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS + 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS + 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS + 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> + 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> + 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> + 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> + 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> + 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> + 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> + 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS + 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> + 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> + 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> + 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> + 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS + 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> + 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> + 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> + 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> + 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS + 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> + 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2> + 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> + 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> + 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> + 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> + 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> + 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> + 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> + 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> + 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> + 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> + 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> + 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> + 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> + 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> + 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> + 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> + 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> + 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> + 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> + 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> + 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS + 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> + 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> + 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS + 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> + 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> + 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, 
<2,3,4,5> + 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> + 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> + 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> + 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> + 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> + 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> + 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS + 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> + 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> + 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> + 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS + 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> + 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> + 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS + 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS + 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> + 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> + 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> + 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> + 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> + 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> + 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> + 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> + 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS + 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> + 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> + 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS + 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> + 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3> + 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS + 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> + 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u> + 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, + 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> + 2265397305U, // <4,u,0,5>: Cost 3 vrev + 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> + 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> + 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS + 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> + 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> + 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> + 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, + 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> + 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> + 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, + 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS + 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> + 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> + 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> + 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> + 2733864859U, // <4,u,2,5>: Cost 3 vext3 , + 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> + 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, + 1561118822U, // 
<4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> + 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> + 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> + 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> + 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> + 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> + 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, + 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> + 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> + 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS + 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, + 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> + 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> + 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS + 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS + 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, + 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS + 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS + 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> + 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS + 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS + 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS + 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS + 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> + 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS + 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> + 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS + 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> + 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS + 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4> + 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> + 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> + 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS + 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> + 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS + 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS + 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> + 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS + 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> + 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> + 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> + 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> + 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> + 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> + 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> + 3669005700U, // <5,0,0,7>: 
Cost 4 vext1 <7,5,0,0>, <7,5,0,0> + 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> + 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS + 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> + 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> + 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS + 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> + 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> + 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> + 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> + 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> + 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> + 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> + 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> + 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> + 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> + 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> + 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> + 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> + 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5> + 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> + 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> + 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> + 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> + 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> + 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS + 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> + 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> + 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS + 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS + 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5> + 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> + 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS + 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS + 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> + 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> + 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> + 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> + 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS + 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS + 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> + 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> + 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> + 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> + 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> + 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> + 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5> + 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS + 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS + 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> + 
2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> + 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> + 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS + 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> + 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> + 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> + 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS + 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> + 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> + 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> + 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> + 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS + 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> + 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> + 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> + 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> + 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> + 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> + 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> + 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0> + 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS + 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> + 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> + 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> + 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> + 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5> + 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> + 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> + 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> + 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> + 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> + 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> + 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2> + 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> + 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> + 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> + 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> + 3808199610U, // <5,1,2,7>: Cost 4 vext3 , <1,2,7,0> + 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> + 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS + 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> + 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> + 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> + 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> + 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> + 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> + 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> + 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> + 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> + 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> + 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> + 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> + 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> + 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS + 
2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> + 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> + 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> + 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> + 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> + 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> + 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> + 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> + 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> + 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> + 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> + 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> + 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS + 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> + 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> + 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> + 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS + 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> + 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> + 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> + 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> + 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS + 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> + 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> + 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS + 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS + 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> + 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> + 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> + 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS + 1591662326U, // <5,1,u,0>: Cost 2 vext2 , + 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> + 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS + 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> + 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, + 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, + 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS + 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> + 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> + 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> + 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> + 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> + 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> + 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> + 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS + 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> + 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> + 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> + 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> + 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS + 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> + 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> + 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> + 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> + 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS + 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> + 2687125096U, // <5,2,2,2>: 
Cost 3 vext3 <0,4,1,5>, <2,2,2,2> + 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> + 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> + 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> + 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> + 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> + 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> + 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> + 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> + 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> + 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> + 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> + 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> + 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> + 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> + 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> + 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> + 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> + 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> + 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> + 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> + 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2> + 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> + 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS + 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS + 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> + 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> + 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS + 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS + 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> + 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> + 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> + 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS + 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS + 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3> + 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> + 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> + 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> + 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> + 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> + 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> + 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7> + 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS + 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> + 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> + 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS + 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS + 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> + 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> + 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> + 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS + 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> + 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> + 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> + 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> + 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2702092405U, // 
<5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> + 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> + 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> + 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> + 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> + 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> + 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> + 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> + 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> + 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> + 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> + 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> + 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> + 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> + 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> + 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> + 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> + 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> + 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> + 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> + 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> + 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> + 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> + 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> + 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> + 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> + 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> + 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> + 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3> + 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> + 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> + 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> + 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> + 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> + 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> + 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5> + 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> + 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> + 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> + 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> + 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> + 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> + 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> + 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> + 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> + 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> + 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> + 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> + 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS + 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5> + 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> + 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> + 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS + 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> + 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> + 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> + 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS + 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS + 
3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> + 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> + 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> + 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS + 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> + 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> + 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> + 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS + 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS + 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> + 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> + 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> + 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS + 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> + 2601513466U, // <5,3,7,6>: Cost 3 vext1 , <6,2,7,3> + 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> + 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS + 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS + 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> + 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> + 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> + 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS + 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6> + 2601521658U, // <5,3,u,6>: Cost 3 vext1 , <6,2,7,3> + 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u> + 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS + 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS + 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS + 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2> + 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> + 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> + 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> + 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> + 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> + 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS + 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> + 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4> + 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> + 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> + 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> + 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> + 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> + 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> + 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> + 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> + 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> + 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> + 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> + 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> + 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> + 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> + 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> + 2666752099U, // <5,4,2,u>: Cost 3 vext2 , <2,u,4,5> + 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS + 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> + 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> + 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> + 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> + 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> + 2710719634U, 
// <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> + 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> + 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> + 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS + 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> + 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> + 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> + 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> + 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> + 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> + 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> + 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> + 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS + 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> + 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> + 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> + 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS + 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> + 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS + 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS + 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS + 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7> + 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> + 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> + 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS + 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> + 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> + 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5> + 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS + 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS + 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4> + 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7> + 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7> + 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS + 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5> + 94817590U, // <5,4,7,6>: Cost 1 vrev RHS + 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7> + 94965064U, // <5,4,7,u>: Cost 1 vrev RHS + 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS + 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u> + 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u> + 2667419628U, // <5,4,u,3>: Cost 3 vext2 , + 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS + 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5> + 94825783U, // <5,4,u,6>: Cost 1 vrev RHS + 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5> + 94973257U, // <5,4,u,u>: Cost 1 vrev RHS + 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0> + 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2> + 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2> + 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> + 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0> + 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7> + 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS + 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS + 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> + 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5> + 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> + 2646852568U, // 
<5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3> + 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> + 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7> + 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7> + 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3> + 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5> + 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS + 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3> + 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2> + 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4> + 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3> + 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3> + 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> + 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7> + 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4> + 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> + 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5> + 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3> + 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5> + 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> + 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5> + 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7> + 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1> + 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2> + 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> + 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5> + 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3> + 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4> + 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS + 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5> + 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6> + 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> + 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3> + 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2> + 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2> + 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS + 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0> + 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7> + 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS + 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS + 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6> + 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3> + 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6> + 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5> + 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5> + 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6> + 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1> + 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1> + 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS + 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7> + 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5> + 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7> + 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS + 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5> + 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6> + 
2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS + 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS + 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS + 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5> + 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u> + 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS + 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, + 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS + 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS + 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0> + 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS + 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2> + 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> + 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5> + 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6> + 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7> + 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS + 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS + 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2> + 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> + 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> + 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3> + 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6> + 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> + 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7> + 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6> + 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2> + 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3> + 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> + 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1> + 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6> + 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> + 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> + 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3> + 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6> + 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2> + 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3> + 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6> + 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> + 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> + 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6> + 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7> + 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS + 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> + 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS + 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5> + 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5> + 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> + 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6> + 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS + 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6> + 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5> + 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS + 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS + 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> + 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, 
<6,5,2,6> + 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4> + 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6> + 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5> + 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1> + 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS + 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS + 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS + 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4> + 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3> + 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6> + 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS + 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6> + 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6> + 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS + 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS + 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS + 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> + 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> + 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7> + 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS + 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS + 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS + 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS + 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS + 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2> + 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS + 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0> + 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2> + 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0> + 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5> + 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7> + 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7> + 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0> + 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2> + 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1> + 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0> + 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7> + 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS + 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7> + 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7> + 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7> + 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7> + 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7> + 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3> + 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2> + 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1> + 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7> + 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> + 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7> + 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7> + 2625636411U, // <5,7,2,u>: Cost 3 vext2 
<1,3,5,7>, <2,u,0,1> + 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> + 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5> + 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1> + 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3> + 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> + 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0> + 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7> + 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7> + 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2> + 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS + 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> + 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0> + 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4> + 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS + 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7> + 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS + 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS + 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3> + 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3> + 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7> + 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS + 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5> + 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7> + 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS + 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS + 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0> + 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5> + 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2> + 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6> + 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4> + 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u> + 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7> + 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u> + 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS + 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1> + 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2> + 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3> + 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS + 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7> + 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3> + 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7> + 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS + 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS + 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, + 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS + 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS + 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, + 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS + 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0> + 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS + 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2> + 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, + 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, + 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, + 
2266134675U, // <5,u,0,6>: Cost 3 vrev + 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0> + 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS + 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2> + 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1> + 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u> + 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u> + 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, + 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u> + 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0> + 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2> + 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, + 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u> + 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u> + 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7> + 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, + 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1> + 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, + 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u> + 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u> + 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3> + 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> + 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0> + 2734610422U, // <5,u,3,6>: Cost 3 vext3 , + 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u> + 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> + 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u> + 1661163546U, // <5,u,4,1>: Cost 2 vext3 , + 2734463012U, // <5,u,4,2>: Cost 3 vext3 , + 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, + 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS + 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u> + 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS + 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5> + 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, + 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, + 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS + 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS + 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS + 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS + 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6> + 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, + 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS + 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS + 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, + 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS + 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> + 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS + 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS + 
1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 118708378U, // <5,u,7,6>: Cost 1 vrev RHS + 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS + 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS + 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS + 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS + 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS + 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS + 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS + 118716571U, // <5,u,u,6>: Cost 1 vrev RHS + 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS + 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS + 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> + 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> + 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> + 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> + 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> + 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> + 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> + 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> + 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> + 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS + 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> + 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> + 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS + 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> + 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> + 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> + 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> + 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> + 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6> + 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> + 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> + 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> + 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6> + 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> + 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> + 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> + 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> + 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> + 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> + 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> + 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> + 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> + 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> + 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> + 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> + 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> + 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> + 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> + 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> + 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> + 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> + 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> + 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS + 2687795620U, // <6,0,5,1>: Cost 3 
vext3 <0,5,1,6>, <0,5,1,6> + 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> + 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> + 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> + 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> + 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> + 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> + 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS + 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS + 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> + 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> + 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> + 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> + 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> + 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS + 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS + 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0> + 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> + 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> + 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS + 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> + 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> + 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> + 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS + 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> + 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> + 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> + 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> + 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> + 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS + 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS + 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> + 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> + 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS + 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> + 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> + 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> + 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> + 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> + 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> + 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> + 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> + 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> + 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> + 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> + 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> + 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3> + 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS + 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> + 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> + 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> + 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS + 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, 
<1,2,5,3> + 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> + 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> + 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> + 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS + 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> + 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> + 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> + 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> + 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> + 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> + 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> + 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> + 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> + 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> + 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> + 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> + 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS + 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6> + 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> + 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> + 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> + 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> + 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> + 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6> + 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> + 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> + 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> + 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> + 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS + 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> + 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS + 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> + 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> + 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS + 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS + 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> + 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> + 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> + 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS + 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS + 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> + 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> + 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS + 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS + 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> + 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> + 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> + 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS + 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS + 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> + 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> + 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0> + 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> + 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> + 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> + 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> + 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> + 2646900736U, // <6,2,0,0>: Cost 3 vext2 
<4,u,6,2>, <0,0,0,0> + 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> + 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> + 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> + 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> + 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> + 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> + 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS + 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> + 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> + 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> + 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS + 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> + 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> + 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> + 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> + 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> + 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> + 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> + 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> + 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> + 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> + 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> + 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> + 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> + 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> + 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> + 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> + 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> + 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> + 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5> + 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> + 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> + 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> + 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1> + 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> + 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> + 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> + 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> + 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> + 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> + 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> + 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> + 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> + 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> + 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> + 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> + 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> + 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0> + 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> + 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> + 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> + 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> + 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> + 
2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> + 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> + 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> + 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> + 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> + 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS + 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> + 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> + 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS + 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS + 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> + 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> + 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS + 1591744256U, // <6,2,u,0>: Cost 2 vext2 , + 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> + 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS + 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5> + 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> + 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS + 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> + 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> + 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> + 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> + 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> + 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> + 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> + 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> + 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> + 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3> + 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1> + 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3> + 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1> + 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3> + 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0> + 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3> + 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4> + 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2> + 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0> + 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6> + 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7> + 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7> + 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6> + 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1> + 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3> + 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> + 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3> + 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> + 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5> + 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7> + 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7> + 2707999179U, // <6,3,3,u>: 
Cost 3 vext3 <3,u,5,6>, <3,3,u,5> + 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS + 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3> + 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> + 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> + 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS + 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> + 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6> + 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4> + 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> + 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS + 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7> + 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5> + 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5> + 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> + 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7> + 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6> + 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0> + 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6> + 2602164326U, // <6,3,6,0>: Cost 3 vext1 , LHS + 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3> + 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6> + 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1> + 2602167524U, // <6,3,6,4>: Cost 3 vext1 , <4,4,6,6> + 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7> + 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6> + 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7> + 2602170158U, // <6,3,6,u>: Cost 3 vext1 , LHS + 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS + 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> + 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> + 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2> + 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS + 2602176208U, // <6,3,7,5>: Cost 3 vext1 , <5,1,7,3> + 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> + 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS + 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS + 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2> + 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> + 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2> + 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS + 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> + 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0> + 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS + 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0> + 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6> + 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1> + 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> + 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> + 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2> + 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0> + 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS + 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> + 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1> + 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0> + 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS + 3628166454U, // <6,4,1,4>: Cost 
4 vext1 <0,6,4,1>, RHS + 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0> + 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3> + 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1> + 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1> + 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4> + 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3> + 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2> + 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1> + 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> + 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS + 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0> + 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0> + 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4> + 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2> + 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3> + 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6> + 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3> + 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> + 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6> + 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5> + 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7> + 2668817222U, // <6,4,3,u>: Cost 3 vext2 , <3,u,5,6> + 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS + 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4> + 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> + 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4> + 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4> + 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4> + 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS + 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3> + 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> + 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6> + 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS + 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5> + 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> + 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> + 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> + 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS + 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3> + 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3> + 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2> + 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS + 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS + 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> + 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5> + 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7> + 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS + 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5> + 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6> + 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7> + 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS + 1504608358U, // 
<6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u> + 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6> + 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6> + 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS + 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0> + 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS + 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6> + 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2> + 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1> + 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5> + 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7> + 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> + 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0> + 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1> + 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5> + 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0> + 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS + 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6> + 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7> + 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4> + 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3> + 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3> + 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS + 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5> + 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2> + 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5> + 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS + 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6> + 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> + 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS + 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5> + 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2> + 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5> + 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3> + 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3> + 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> + 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6> + 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0> + 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3> + 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3> + 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS + 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5> + 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5> + 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> + 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS + 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS + 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0> + 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> + 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> + 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS + 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2> + 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2> + 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2> + 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS + 
2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> + 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> + 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> + 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> + 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> + 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> + 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> + 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> + 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> + 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> + 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> + 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> + 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> + 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS + 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> + 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> + 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2> + 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS + 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> + 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> + 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS + 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS + 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS + 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> + 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> + 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, + 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS + 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS + 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> + 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u> + 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> + 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS + 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> + 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> + 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> + 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3> + 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> + 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS + 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS + 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> + 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> + 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> + 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> + 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> + 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> + 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> + 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS + 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> + 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> + 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> + 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1> + 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> + 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> + 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> + 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> + 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> + 2646935702U, // <6,6,3,0>: Cost 3 vext2 
<4,u,6,6>, <3,0,1,2> + 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> + 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> + 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> + 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> + 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> + 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> + 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS + 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> + 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS + 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> + 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> + 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> + 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS + 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> + 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> + 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> + 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS + 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> + 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> + 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> + 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> + 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> + 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> + 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS + 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS + 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS + 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> + 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> + 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> + 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS + 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> + 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS + 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> + 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS + 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS + 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> + 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> + 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> + 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS + 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> + 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> + 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS + 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS + 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS + 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> + 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS + 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS + 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS + 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS + 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS + 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS + 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> + 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> + 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> 
+ 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> + 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS + 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> + 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> + 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> + 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> + 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> + 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> + 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> + 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> + 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> + 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> + 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> + 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS + 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> + 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS + 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2> + 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3> + 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6> + 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7> + 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7> + 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> + 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7> + 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7> + 
1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7> + 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> + 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2> + 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, + 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS + 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, + 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, + 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, + 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS + 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, + 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, + 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS + 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS + 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, + 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> + 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 1193130221U, // <6,u,0,7>: Cost 2 vrev + 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS + 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS + 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, + 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS + 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS + 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u> + 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, + 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3> + 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, + 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, + 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 1661245476U, // <6,u,4,2>: Cost 2 vext3 , + 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, + 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS + 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1661614161U, // <6,u,4,7>: Cost 2 vext3 , + 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS + 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS + 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> + 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, + 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7> 
+ [... ~1,240 machine-generated perfect-shuffle table entries, one per line: 32-bit constants of the form "NNNNNNNNNNU, // <i,j,k,l>: Cost N <op>, <operands>" (ops: vext1/vext2/vext3, vdup0-3, vzipl/vzipr, vtrnl/vtrnr, vuzpl/vuzpr, vrev, copy), covering <6,u,5,u> through the <u,6,7,u> block ...]
432355118U, // : Cost 1 vext1 RHS, LHS + 432357478U, // : Cost 1 vext1 RHS, LHS + 1545475886U, // : Cost 2 vext2 <0,2,u,6>, LHS + 1506100840U, // : Cost 2 vext1 RHS, <2,2,2,2> + 1506101398U, // : Cost 2 vext1 RHS, <3,0,1,2> + 432361002U, // : Cost 1 vext1 RHS, RHS + 1545476250U, // : Cost 2 vext2 <0,2,u,6>, RHS + 296144182U, // : Cost 1 vdup2 RHS + 1880370486U, // : Cost 2 vzipr LHS, RHS + 432363310U, // : Cost 1 vext1 RHS, LHS + 1571356672U, // : Cost 2 vext2 RHS, <0,0,0,0> + 497614950U, // : Cost 1 vext2 RHS, LHS + 1571356836U, // : Cost 2 vext2 RHS, <0,2,0,2> + 2573880146U, // : Cost 3 vext1 <3,u,7,0>, <3,u,7,0> + 1571357010U, // : Cost 2 vext2 RHS, <0,4,1,5> + 1512083716U, // : Cost 2 vext1 <5,u,7,0>, <5,u,7,0> + 2621874741U, // : Cost 3 vext2 <0,6,u,7>, <0,6,u,7> + 2585826298U, // : Cost 3 vext1 <5,u,7,0>, <7,0,1,2> + 497615517U, // : Cost 1 vext2 RHS, LHS + 1571357430U, // : Cost 2 vext2 RHS, <1,0,3,2> + 1571357492U, // : Cost 2 vext2 RHS, <1,1,1,1> + 1571357590U, // : Cost 2 vext2 RHS, <1,2,3,0> + 1552114715U, // : Cost 2 vext2 <1,3,u,7>, <1,3,u,7> + 2573888822U, // : Cost 3 vext1 <3,u,7,1>, RHS + 1553441981U, // : Cost 2 vext2 <1,5,u,7>, <1,5,u,7> + 2627847438U, // : Cost 3 vext2 <1,6,u,7>, <1,6,u,7> + 2727408775U, // : Cost 3 vext3 <7,1,7,u>, <7,1,7,u> + 1555432880U, // : Cost 2 vext2 <1,u,u,7>, <1,u,u,7> + 2629838337U, // : Cost 3 vext2 <2,0,u,7>, <2,0,u,7> + 1188058754U, // : Cost 2 vrev <7,u,1,2> + 1571358312U, // : Cost 2 vext2 RHS, <2,2,2,2> + 1571358374U, // : Cost 2 vext2 RHS, <2,3,0,1> + 2632492869U, // : Cost 3 vext2 <2,4,u,7>, <2,4,u,7> + 2633156502U, // : Cost 3 vext2 <2,5,u,7>, <2,5,u,7> + 1560078311U, // : Cost 2 vext2 <2,6,u,7>, <2,6,u,7> + 2728072408U, // : Cost 3 vext3 <7,2,7,u>, <7,2,7,u> + 1561405577U, // : Cost 2 vext2 <2,u,u,7>, <2,u,u,7> + 1571358870U, // : Cost 2 vext2 RHS, <3,0,1,2> + 2627184913U, // : Cost 3 vext2 <1,5,u,7>, <3,1,5,u> + 2633820523U, // : Cost 3 vext2 <2,6,u,7>, <3,2,6,u> + 1571359132U, // : Cost 2 vext2 RHS, <3,3,3,3> + 1571359234U, // : Cost 2 vext2 RHS, <3,4,5,6> + 1512108295U, // : Cost 2 vext1 <5,u,7,3>, <5,u,7,3> + 1518080992U, // : Cost 2 vext1 <6,u,7,3>, <6,u,7,3> + 2640456465U, // : Cost 3 vext2 <3,7,u,7>, <3,7,u,7> + 1571359518U, // : Cost 2 vext2 RHS, <3,u,1,2> + 1571359634U, // : Cost 2 vext2 RHS, <4,0,5,1> + 2573911067U, // : Cost 3 vext1 <3,u,7,4>, <1,3,u,7> + 2645101622U, // : Cost 3 vext2 RHS, <4,2,5,3> + 2573912918U, // : Cost 3 vext1 <3,u,7,4>, <3,u,7,4> + 1571359952U, // : Cost 2 vext2 RHS, <4,4,4,4> + 497618248U, // : Cost 1 vext2 RHS, RHS + 1571360116U, // : Cost 2 vext2 RHS, <4,6,4,6> + 2645102024U, // : Cost 3 vext2 RHS, <4,7,5,0> + 497618473U, // : Cost 1 vext2 RHS, RHS + 2645102152U, // : Cost 3 vext2 RHS, <5,0,1,2> + 1571360464U, // : Cost 2 vext2 RHS, <5,1,7,3> + 2645102334U, // : Cost 3 vext2 RHS, <5,2,3,4> + 2645102447U, // : Cost 3 vext2 RHS, <5,3,7,0> + 1571360710U, // : Cost 2 vext2 RHS, <5,4,7,6> + 1571360772U, // : Cost 2 vext2 RHS, <5,5,5,5> + 1571360866U, // : Cost 2 vext2 RHS, <5,6,7,0> + 1571360936U, // : Cost 2 vext2 RHS, <5,7,5,7> + 1571361017U, // : Cost 2 vext2 RHS, <5,u,5,7> + 1530044518U, // : Cost 2 vext1 , LHS + 2645103016U, // : Cost 3 vext2 RHS, <6,1,7,2> + 1571361274U, // : Cost 2 vext2 RHS, <6,2,7,3> + 2645103154U, // : Cost 3 vext2 RHS, <6,3,4,5> + 1530047798U, // : Cost 2 vext1 , RHS + 1188386474U, // : Cost 2 vrev <7,u,5,6> + 1571361592U, // : Cost 2 vext2 RHS, <6,6,6,6> + 1571361614U, // : Cost 2 vext2 RHS, <6,7,0,1> + 1571361695U, // : Cost 2 vext2 RHS, <6,u,0,1> + 1571361786U, // : Cost 
2 vext2 RHS, <7,0,1,2> + 2573935616U, // : Cost 3 vext1 <3,u,7,7>, <1,3,5,7> + 2645103781U, // : Cost 3 vext2 RHS, <7,2,2,2> + 2573937497U, // : Cost 3 vext1 <3,u,7,7>, <3,u,7,7> + 1571362150U, // : Cost 2 vext2 RHS, <7,4,5,6> + 1512141067U, // : Cost 2 vext1 <5,u,7,7>, <5,u,7,7> + 1518113764U, // : Cost 2 vext1 <6,u,7,7>, <6,u,7,7> + 363253046U, // : Cost 1 vdup3 RHS + 363253046U, // : Cost 1 vdup3 RHS + 1571362515U, // : Cost 2 vext2 RHS, + 497620782U, // : Cost 1 vext2 RHS, LHS + 1571362693U, // : Cost 2 vext2 RHS, + 1571362748U, // : Cost 2 vext2 RHS, + 1571362879U, // : Cost 2 vext2 RHS, + 497621146U, // : Cost 1 vext2 RHS, RHS + 1571363024U, // : Cost 2 vext2 RHS, + 363253046U, // : Cost 1 vdup3 RHS + 497621349U, // : Cost 1 vext2 RHS, LHS + 135053414U, // : Cost 1 vdup0 LHS + 471081121U, // : Cost 1 vext2 LHS, LHS + 1544822948U, // : Cost 2 vext2 LHS, <0,2,0,2> + 1616140005U, // : Cost 2 vext3 LHS, + 1544823122U, // : Cost 2 vext2 LHS, <0,4,1,5> + 1512157453U, // : Cost 2 vext1 <5,u,u,0>, <5,u,u,0> + 1662220032U, // : Cost 2 vext3 RHS, + 1194457487U, // : Cost 2 vrev + 471081629U, // : Cost 1 vext2 LHS, LHS + 1544823542U, // : Cost 2 vext2 LHS, <1,0,3,2> + 202162278U, // : Cost 1 vdup1 LHS + 537753390U, // : Cost 1 vext3 LHS, LHS + 1544823768U, // : Cost 2 vext2 LHS, <1,3,1,3> + 1494248758U, // : Cost 2 vext1 <2,u,u,1>, RHS + 1544823952U, // : Cost 2 vext2 LHS, <1,5,3,7> + 1518138343U, // : Cost 2 vext1 <6,u,u,1>, <6,u,u,1> + 1640322907U, // : Cost 2 vext3 RHS, + 537753444U, // : Cost 1 vext3 LHS, LHS + 1482309734U, // : Cost 2 vext1 <0,u,u,2>, LHS + 1194031451U, // : Cost 2 vrev + 269271142U, // : Cost 1 vdup2 LHS + 835584U, // : Cost 0 copy LHS + 1482313014U, // : Cost 2 vext1 <0,u,u,2>, RHS + 2618566504U, // : Cost 3 vext2 LHS, <2,5,3,6> + 1544824762U, // : Cost 2 vext2 LHS, <2,6,3,7> + 1638479788U, // : Cost 2 vext3 RHS, + 835584U, // : Cost 0 copy LHS + 408576723U, // : Cost 1 vext1 LHS, LHS + 1482318582U, // : Cost 2 vext1 LHS, <1,0,3,2> + 120371557U, // : Cost 1 vrev LHS + 336380006U, // : Cost 1 vdup3 LHS + 408579382U, // : Cost 1 vext1 LHS, RHS + 1616140271U, // : Cost 2 vext3 LHS, + 1530098170U, // : Cost 2 vext1 LHS, <6,2,7,3> + 1880329544U, // : Cost 2 vzipr LHS, RHS + 408581934U, // : Cost 1 vext1 LHS, LHS + 1488298086U, // : Cost 2 vext1 <1,u,u,4>, LHS + 1488299437U, // : Cost 2 vext1 <1,u,u,4>, <1,u,u,4> + 1659271204U, // : Cost 2 vext3 LHS, + 1194195311U, // : Cost 2 vrev + 161926454U, // : Cost 1 vdup0 RHS + 471084342U, // : Cost 1 vext2 LHS, RHS + 1571368308U, // : Cost 2 vext2 RHS, <4,6,4,6> + 1640323153U, // : Cost 2 vext3 RHS, + 471084585U, // : Cost 1 vext2 LHS, RHS + 1494278246U, // : Cost 2 vext1 <2,u,u,5>, LHS + 1571368656U, // : Cost 2 vext2 RHS, <5,1,7,3> + 1494280327U, // : Cost 2 vext1 <2,u,u,5>, <2,u,u,5> + 1616140415U, // : Cost 2 vext3 LHS, + 1494281526U, // : Cost 2 vext1 <2,u,u,5>, RHS + 229035318U, // : Cost 1 vdup1 RHS + 537753754U, // : Cost 1 vext3 LHS, RHS + 1750355254U, // : Cost 2 vuzpr LHS, RHS + 537753772U, // : Cost 1 vext3 LHS, RHS + 1482342502U, // : Cost 2 vext1 <0,u,u,6>, LHS + 2556084982U, // : Cost 3 vext1 <0,u,u,6>, <1,0,3,2> + 1571369466U, // : Cost 2 vext2 RHS, <6,2,7,3> + 1611938000U, // : Cost 2 vext3 LHS, + 1482345782U, // : Cost 2 vext1 <0,u,u,6>, RHS + 1194359171U, // : Cost 2 vrev + 296144182U, // : Cost 1 vdup2 RHS + 27705344U, // : Cost 0 copy RHS + 27705344U, // : Cost 0 copy RHS + 432496742U, // : Cost 1 vext1 RHS, LHS + 1488324016U, // : Cost 2 vext1 <1,u,u,7>, <1,u,u,7> + 1494296713U, // : Cost 2 vext1 <2,u,u,7>, 
<2,u,u,7>
+  1906901148U, // : Cost 2 vzipr RHS, LHS
+  432500283U, // : Cost 1 vext1 RHS, RHS
+  1506242256U, // : Cost 2 vext1 RHS, <5,1,7,3>
+  120699277U, // : Cost 1 vrev RHS
+  363253046U, // : Cost 1 vdup3 RHS
+  432502574U, // : Cost 1 vext1 RHS, LHS
+  408617688U, // : Cost 1 vext1 LHS, LHS
+  471086894U, // : Cost 1 vext2 LHS, LHS
+  537753957U, // : Cost 1 vext3 LHS, LHS
+  835584U, // : Cost 0 copy LHS
+  408620342U, // : Cost 1 vext1 LHS, RHS
+  471087258U, // : Cost 1 vext2 LHS, RHS
+  537753997U, // : Cost 1 vext3 LHS, RHS
+  27705344U, // : Cost 0 copy RHS
+  835584U, // : Cost 0 copy LHS
+  0
+};
diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp
new file mode 100644
index 0000000..4723cc4
--- /dev/null
+++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -0,0 +1,578 @@
+//=- AArch64PromoteConstant.cpp --- Promote constant to global for AArch64 -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64PromoteConstant pass which promotes constants
+// to global variables when this is likely to be more efficient. Currently only
+// types related to constant vectors (i.e., constant vector, array of constant
+// vectors, constant structure with a constant vector field, etc.) are promoted
+// to global variables. Constant vectors are likely to be lowered in target
+// constant pool during instruction selection already; therefore, the access
+// will remain the same (memory load), but the structure types are not split
+// into different constant pool accesses for each field. A bonus side effect is
+// that created globals may be merged by the global merge pass.
+//
+// FIXME: This pass may be useful for other targets too.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-promote-const"
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("aarch64-stress-promote-const", cl::Hidden,
+                            cl::desc("Promote all vector constants"));
+
+STATISTIC(NumPromoted, "Number of promoted constants");
+STATISTIC(NumPromotedUses, "Number of promoted constants uses");
+
+//===----------------------------------------------------------------------===//
+//                       AArch64PromoteConstant
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Promotes interesting constants into global variables.
+/// The motivating example is:
+/// static const uint16_t TableA[32] = {
+///     41944, 40330, 38837, 37450, 36158, 34953, 33826, 32768,
+///     31776, 30841, 29960, 29128, 28340, 27595, 26887, 26215,
+///     25576, 24967, 24386, 23832, 23302, 22796, 22311, 21846,
+///     21400, 20972, 20561, 20165, 19785, 19419, 19066, 18725,
+/// };
+///
+/// uint8x16x4_t LoadStatic(void) {
+///   uint8x16x4_t ret;
+///   ret.val[0] = vld1q_u16(TableA + 0);
+///   ret.val[1] = vld1q_u16(TableA + 8);
+///   ret.val[2] = vld1q_u16(TableA + 16);
+///   ret.val[3] = vld1q_u16(TableA + 24);
+///   return ret;
+/// }
+///
+/// The constants in this example are folded into the uses. Thus, 4 different
+/// constants are created.
+///
+/// As their type is a vector, the cheapest way to create them is to load them
+/// from memory.
+///
+/// Therefore the final assembly has 4 different loads. With this pass enabled,
+/// only one load is issued for the constants.
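[Editor's note: to make the class comment above concrete, the core rewrite the pass performs for a single use boils down to the following minimal sketch. This is an illustration against the LLVM 3.5-era C++ IR API, not code from the patch; promoteOneUse is a hypothetical helper name.]

    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    // Replace one operand of UserInstr, currently the constant Cst, with a
    // load of an internal constant global initialized with Cst.
    static LoadInst *promoteOneUse(Module &M, Constant *Cst,
                                   Instruction *UserInstr, unsigned OpIdx) {
      GlobalVariable *GV = new GlobalVariable(
          M, Cst->getType(), /*isConstant=*/true, GlobalValue::InternalLinkage,
          Cst, "_PromotedConst");
      IRBuilder<> Builder(UserInstr); // emit the load just before the use
      LoadInst *Load = Builder.CreateLoad(GV);
      UserInstr->setOperand(OpIdx, Load);
      return Load;
    }

The actual pass is more careful than this sketch: it creates a single global per module and places as few loads as dominance allows; the class members that follow implement that bookkeeping.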
+class AArch64PromoteConstant : public ModulePass {
+
+public:
+  static char ID;
+  AArch64PromoteConstant() : ModulePass(ID) {}
+
+  const char *getPassName() const override {
+    return "AArch64 Promote Constant";
+  }
+
+  /// Iterate over the functions and promote the interesting constants into
+  /// global variables with module scope.
+  bool runOnModule(Module &M) override {
+    DEBUG(dbgs() << getPassName() << '\n');
+    bool Changed = false;
+    for (auto &MF : M) {
+      Changed |= runOnFunction(MF);
+    }
+    return Changed;
+  }
+
+private:
+  /// Look for interesting constants used within the given function.
+  /// Promote them into global variables, load these global variables within
+  /// the related function, so that the number of inserted loads is minimal.
+  bool runOnFunction(Function &F);
+
+  // This transformation requires dominator info.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+
+  /// Type to store a list of users.
+  typedef SmallVector<Value::user_iterator, 4> Users;
+  /// Map an insertion point to all the uses it dominates.
+  typedef DenseMap<Instruction *, Users> InsertionPoints;
+  /// Map a function to the required insertion point of load for a
+  /// global variable.
+  typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc;
+
+  /// Find the closest point that dominates the given Use.
+  Instruction *findInsertionPoint(Value::user_iterator &Use);
+
+  /// Check if the given insertion point is dominated by an existing
+  /// insertion point.
+  /// If true, the given use is added to the list of dominated uses for
+  /// the related existing point.
+  /// \param NewPt the insertion point to be checked
+  /// \param UseIt the use to be added into the list of dominated uses
+  /// \param InsertPts existing insertion points
+  /// \pre NewPt and all instructions in InsertPts belong to the same function
+  /// \return true if one of the insertion points in InsertPts dominates NewPt,
+  ///         false otherwise
+  bool isDominated(Instruction *NewPt, Value::user_iterator &UseIt,
+                   InsertionPoints &InsertPts);
+
+  /// Check if the given insertion point can be merged with an existing
+  /// insertion point in a common dominator.
+  /// If true, the given use is added to the list of the created insertion
+  /// point.
+  /// \param NewPt the insertion point to be checked
+  /// \param UseIt the use to be added into the list of dominated uses
+  /// \param InsertPts existing insertion points
+  /// \pre NewPt and all instructions in InsertPts belong to the same function
+  /// \pre isDominated returns false for the exact same parameters.
+  /// \return true if there exists an insertion point in InsertPts that could
+  ///         have been merged with NewPt in a common dominator,
+  ///         false otherwise
+  bool tryAndMerge(Instruction *NewPt, Value::user_iterator &UseIt,
+                   InsertionPoints &InsertPts);
+
+  /// Compute the minimal insertion points to dominate all the interesting
+  /// uses of value.
+  /// Insertion points are grouped per function and each insertion point
+  /// contains a list of all the uses it dominates within the related function.
+  /// \param Val constant to be examined
+  /// \param[out] InsPtsPerFunc output storage of the analysis
+  void computeInsertionPoints(Constant *Val,
+                              InsertionPointsPerFunc &InsPtsPerFunc);
+
+  /// Insert a definition of a new global variable at each point contained in
+  /// InsPtsPerFunc and update the related uses (also contained in
+  /// InsPtsPerFunc).
+  bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc);
+
+  /// Compute the minimal insertion points to dominate all the interesting
+  /// uses of Val and insert a definition of a new global variable
+  /// at these points.
+  /// Also update the uses of Val accordingly.
+  /// Currently a use of Val is considered interesting if:
+  /// - Val is not UndefValue
+  /// - Val is not zeroinitialized
+  /// - Replacing Val by a load of a global variable is valid.
+  /// \see shouldConvert for more details
+  bool computeAndInsertDefinitions(Constant *Val);
+
+  /// Promote the given constant into a global variable if it is expected to
+  /// be profitable.
+  /// \return true if Cst has been promoted
+  bool promoteConstant(Constant *Cst);
+
+  /// Transfer the list of dominated uses of IPI to NewPt in InsertPts.
+  /// Append UseIt to this list and delete the entry of IPI in InsertPts.
+  static void appendAndTransferDominatedUses(Instruction *NewPt,
+                                             Value::user_iterator &UseIt,
+                                             InsertionPoints::iterator &IPI,
+                                             InsertionPoints &InsertPts) {
+    // Record the dominated use.
+    IPI->second.push_back(UseIt);
+    // Transfer the dominated uses of IPI to NewPt.
+    // Inserting into the DenseMap may invalidate existing iterators.
+    // Keep a copy of the key to find the iterator to erase.
+    Instruction *OldInstr = IPI->first;
+    InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second));
+    // Erase IPI.
+    IPI = InsertPts.find(OldInstr);
+    InsertPts.erase(IPI);
+  }
+};
+} // end anonymous namespace
+
+char AArch64PromoteConstant::ID = 0;
+
+namespace llvm {
+void initializeAArch64PromoteConstantPass(PassRegistry &);
+}
+
+INITIALIZE_PASS_BEGIN(AArch64PromoteConstant, "aarch64-promote-const",
+                      "AArch64 Promote Constant Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AArch64PromoteConstant, "aarch64-promote-const",
+                    "AArch64 Promote Constant Pass", false, false)
+
+ModulePass *llvm::createAArch64PromoteConstantPass() {
+  return new AArch64PromoteConstant();
+}
+
+/// Check if the given type uses a vector type.
+static bool isConstantUsingVectorTy(const Type *CstTy) {
+  if (CstTy->isVectorTy())
+    return true;
+  if (CstTy->isStructTy()) {
+    for (unsigned EltIdx = 0, EndEltIdx = CstTy->getStructNumElements();
+         EltIdx < EndEltIdx; ++EltIdx)
+      if (isConstantUsingVectorTy(CstTy->getStructElementType(EltIdx)))
+        return true;
+  } else if (CstTy->isArrayTy())
+    return isConstantUsingVectorTy(CstTy->getArrayElementType());
+  return false;
+}
+
+/// Check if the given use (Instruction + OpIdx) of Cst should be converted
+/// into a load of a global variable initialized with Cst.
+/// A use should be converted if it is legal to do so.
+/// For instance, it is not legal to turn the mask operand of a shuffle vector
+/// into a load of a global variable.
+static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
+                             unsigned OpIdx) {
+  // shufflevector instruction expects a const for the mask argument, i.e., the
+  // third argument. Do not promote this use in that case.
+  if (isa<const ShuffleVectorInst>(Instr) && OpIdx == 2)
+    return false;
+
+  // extractvalue instruction expects a const idx.
+  if (isa<const ExtractValueInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // insertvalue instruction expects a const idx.
+  if (isa<const InsertValueInst>(Instr) && OpIdx > 1)
+    return false;
+
+  if (isa<const AllocaInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Alignment argument must be constant.
+  if (isa<const LoadInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Alignment argument must be constant.
+  if (isa<const StoreInst>(Instr) && OpIdx > 1)
+    return false;
+
+  // Index must be constant.
+  if (isa<const GetElementPtrInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Personality function and filters must be constant.
+  // Give up on that instruction.
+  if (isa<const LandingPadInst>(Instr))
+    return false;
+
+  // Switch instruction expects constants to compare to.
+  if (isa<const SwitchInst>(Instr))
+    return false;
+
+  // Expected address must be a constant.
+  if (isa<const IndirectBrInst>(Instr))
+    return false;
+
+  // Do not mess with intrinsics.
+  if (isa<const IntrinsicInst>(Instr))
+    return false;
+
+  // Do not mess with inline asm.
+  const CallInst *CI = dyn_cast<const CallInst>(Instr);
+  if (CI && isa<const InlineAsm>(CI->getCalledValue()))
+    return false;
+
+  return true;
+}
+
+/// Check if the given Cst should be converted into
+/// a load of a global variable initialized with Cst.
+/// A constant should be converted if it is likely that the materialization of
+/// the constant will be tricky. Thus, we give up on zero or undef values.
+///
+/// \todo Currently, accept only vector-related types.
+/// Also we give up on all simple vector types to keep the existing
+/// behavior. Otherwise, we should push here all the checks of the lowering of
+/// BUILD_VECTOR. By giving up, we lose the potential benefit of merging
+/// constants via global merge and the fact that the same constant is stored
+/// only once with this method (versus one copy per function that uses the
+/// constant with the regular approach, even for float).
+/// Again, the simplest solution would be to promote every
+/// constant and rematerialize them when they are actually cheap to create.
+static bool shouldConvert(const Constant *Cst) {
+  if (isa<const UndefValue>(Cst))
+    return false;
+
+  // FIXME: In some cases, it may be interesting to promote in memory
+  // a zero initialized constant.
+  // E.g., when the type of Cst requires more instructions than the
+  // adrp/add/load sequence or when this sequence can be shared by several
+  // instances of Cst.
+  // Ideally, we could promote this into a global and rematerialize the
+  // constant when that turns out to be a bad idea.
+  if (Cst->isZeroValue())
+    return false;
+
+  if (Stress)
+    return true;
+
+  // FIXME: see function \todo
+  if (Cst->getType()->isVectorTy())
+    return false;
+  return isConstantUsingVectorTy(Cst->getType());
+}
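[Editor's note: a tiny self-contained illustration of the filtering above, assuming the LLVM 3.5-era type API. Both types below "use" a vector type, but only a constant of the aggregate type survives shouldConvert, because of the isVectorTy() early exit.]

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/LLVMContext.h"
    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      Type *I32 = Type::getInt32Ty(Ctx);
      VectorType *V4I32 = VectorType::get(I32, 4);   // <4 x i32>
      ArrayType *A2V4I32 = ArrayType::get(V4I32, 2); // [2 x <4 x i32>]
      // isConstantUsingVectorTy is true for both types, but a constant of the
      // plain vector type V4I32 is rejected by shouldConvert, while a constant
      // of the aggregate type A2V4I32 is a promotion candidate.
      return 0;
    }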
+Instruction *
+AArch64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) {
+  // If this user is a phi, the insertion point is in the related
+  // incoming basic block.
+  PHINode *PhiInst = dyn_cast<PHINode>(*Use);
+  Instruction *InsertionPoint;
+  if (PhiInst)
+    InsertionPoint =
+        PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator();
+  else
+    InsertionPoint = dyn_cast<Instruction>(*Use);
+  assert(InsertionPoint && "User is not an instruction!");
+  return InsertionPoint;
+}
+
+bool AArch64PromoteConstant::isDominated(Instruction *NewPt,
+                                         Value::user_iterator &UseIt,
+                                         InsertionPoints &InsertPts) {
+
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+      *NewPt->getParent()->getParent()).getDomTree();
+
+  // Traverse all the existing insertion points and check if one is dominating
+  // NewPt. If it is, remember that.
+  for (auto &IPI : InsertPts) {
+    if (NewPt == IPI.first || DT.dominates(IPI.first, NewPt) ||
+        // When IPI.first is a terminator instruction, DT may think that
+        // the result is defined on the edge.
+        // Here we are testing the insertion point, not the definition.
+        (IPI.first->getParent() != NewPt->getParent() &&
+         DT.dominates(IPI.first->getParent(), NewPt->getParent()))) {
+      // No need to insert this point. Just record the dominated use.
+      DEBUG(dbgs() << "Insertion point dominated by:\n");
+      DEBUG(IPI.first->print(dbgs()));
+      DEBUG(dbgs() << '\n');
+      IPI.second.push_back(UseIt);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt,
+                                         Value::user_iterator &UseIt,
+                                         InsertionPoints &InsertPts) {
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+      *NewPt->getParent()->getParent()).getDomTree();
+  BasicBlock *NewBB = NewPt->getParent();
+
+  // Traverse all the existing insertion points and check if one is dominated
+  // by NewPt and thus useless, or if one can be combined with NewPt into a
+  // common dominator.
+  for (InsertionPoints::iterator IPI = InsertPts.begin(),
+                                 EndIPI = InsertPts.end();
+       IPI != EndIPI; ++IPI) {
+    BasicBlock *CurBB = IPI->first->getParent();
+    if (NewBB == CurBB) {
+      // Instructions are in the same block.
+      // By construction, NewPt is dominating the other.
+      // Indeed, isDominated returned false with the exact same arguments.
+      DEBUG(dbgs() << "Merge insertion point with:\n");
+      DEBUG(IPI->first->print(dbgs()));
+      DEBUG(dbgs() << "\nat considered insertion point.\n");
+      appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
+      return true;
+    }
+
+    // Look for a common dominator.
+    BasicBlock *CommonDominator = DT.findNearestCommonDominator(NewBB, CurBB);
+    // If none exists, we cannot merge these two points.
+    if (!CommonDominator)
+      continue;
+
+    if (CommonDominator != NewBB) {
+      // By construction, the CommonDominator cannot be CurBB.
+      assert(CommonDominator != CurBB &&
+             "Instruction has not been rejected during isDominated check!");
+      // Take the last instruction of the CommonDominator as insertion point.
+      NewPt = CommonDominator->getTerminator();
+    }
+    // else, CommonDominator is the block of NewBB, hence NewBB is the last
+    // possible insertion point in that block.
+    DEBUG(dbgs() << "Merge insertion point with:\n");
+    DEBUG(IPI->first->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+    DEBUG(NewPt->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+    appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
+    return true;
+  }
+  return false;
+}
+
+void AArch64PromoteConstant::computeInsertionPoints(
+    Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) {
+  DEBUG(dbgs() << "** Compute insertion points **\n");
+  for (Value::user_iterator UseIt = Val->user_begin(),
+                            EndUseIt = Val->user_end();
+       UseIt != EndUseIt; ++UseIt) {
+    // If the user is not an Instruction, we cannot modify it.
+    if (!isa<Instruction>(*UseIt))
+      continue;
+
+    // Filter out uses that should not be converted.
+    if (!shouldConvertUse(Val, cast<Instruction>(*UseIt), UseIt.getOperandNo()))
+      continue;
+
+    DEBUG(dbgs() << "Considered use, opidx " << UseIt.getOperandNo() << ":\n");
+    DEBUG((*UseIt)->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+
+    Instruction *InsertionPoint = findInsertionPoint(UseIt);
+
+    DEBUG(dbgs() << "Considered insertion point:\n");
+    DEBUG(InsertionPoint->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+
+    // Check if the current insertion point is useless, i.e., it is dominated
+    // by another one.
+    InsertionPoints &InsertPts =
+        InsPtsPerFunc[InsertionPoint->getParent()->getParent()];
+    if (isDominated(InsertionPoint, UseIt, InsertPts))
+      continue;
+    // This insertion point is useful; check if we can merge some insertion
+    // point in a common dominator or if NewPt dominates an existing one.
+    if (tryAndMerge(InsertionPoint, UseIt, InsertPts))
+      continue;
+
+    DEBUG(dbgs() << "Keep considered insertion point\n");
+
+    // It is definitely useful by its own.
+    InsertPts[InsertionPoint].push_back(UseIt);
+  }
+}
+
+bool AArch64PromoteConstant::insertDefinitions(
+    Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc) {
+  // We will create one global variable per Module.
+  DenseMap<Module *, GlobalVariable *> ModuleToMergedGV;
+  bool HasChanged = false;
+
+  // Traverse all insertion points in all the functions.
+  for (InsertionPointsPerFunc::iterator FctToInstPtsIt = InsPtsPerFunc.begin(),
+                                        EndIt = InsPtsPerFunc.end();
+       FctToInstPtsIt != EndIt; ++FctToInstPtsIt) {
+    InsertionPoints &InsertPts = FctToInstPtsIt->second;
+// Do more checking for debug purposes.
+#ifndef NDEBUG
+    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+        *FctToInstPtsIt->first).getDomTree();
+#endif
+    GlobalVariable *PromotedGV;
+    assert(!InsertPts.empty() && "Empty uses does not need a definition");
+
+    Module *M = FctToInstPtsIt->first->getParent();
+    DenseMap<Module *, GlobalVariable *>::iterator MapIt =
+        ModuleToMergedGV.find(M);
+    if (MapIt == ModuleToMergedGV.end()) {
+      PromotedGV = new GlobalVariable(
+          *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr,
+          "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
+      PromotedGV->setInitializer(Cst);
+      ModuleToMergedGV[M] = PromotedGV;
+      DEBUG(dbgs() << "Global replacement: ");
+      DEBUG(PromotedGV->print(dbgs()));
+      DEBUG(dbgs() << '\n');
+      ++NumPromoted;
+      HasChanged = true;
+    } else {
+      PromotedGV = MapIt->second;
+    }
+
+    for (InsertionPoints::iterator IPI = InsertPts.begin(),
+                                   EndIPI = InsertPts.end();
+         IPI != EndIPI; ++IPI) {
+      // Create the load of the global variable.
+      IRBuilder<> Builder(IPI->first->getParent(), IPI->first);
+      LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV);
+      DEBUG(dbgs() << "**********\n");
+      DEBUG(dbgs() << "New def: ");
+      DEBUG(LoadedCst->print(dbgs()));
+      DEBUG(dbgs() << '\n');
+
+      // Update the dominated uses.
+      Users &DominatedUsers = IPI->second;
+      for (Value::user_iterator Use : DominatedUsers) {
+#ifndef NDEBUG
+        assert((DT.dominates(LoadedCst, cast<Instruction>(*Use)) ||
+                (isa<PHINode>(*Use) &&
+                 DT.dominates(LoadedCst, findInsertionPoint(Use)))) &&
+               "Inserted definition does not dominate all its uses!");
+#endif
+        DEBUG(dbgs() << "Use to update " << Use.getOperandNo() << ":");
+        DEBUG(Use->print(dbgs()));
+        DEBUG(dbgs() << '\n');
+        Use->setOperand(Use.getOperandNo(), LoadedCst);
+        ++NumPromotedUses;
+      }
+    }
+  }
+  return HasChanged;
+}
+
+bool AArch64PromoteConstant::computeAndInsertDefinitions(Constant *Val) {
+  InsertionPointsPerFunc InsertPtsPerFunc;
+  computeInsertionPoints(Val, InsertPtsPerFunc);
+  return insertDefinitions(Val, InsertPtsPerFunc);
+}
+
+bool AArch64PromoteConstant::promoteConstant(Constant *Cst) {
+  assert(Cst && "Given variable is not a valid constant.");
+
+  if (!shouldConvert(Cst))
+    return false;
+
+  DEBUG(dbgs() << "******************************\n");
+  DEBUG(dbgs() << "Candidate constant: ");
+  DEBUG(Cst->print(dbgs()));
+  DEBUG(dbgs() << '\n');
+
+  return computeAndInsertDefinitions(Cst);
+}
+
+bool AArch64PromoteConstant::runOnFunction(Function &F) {
+  // Look for instructions using constant vectors. Promote such a constant to
+  // a global variable. Create as few loads of this variable as possible and
+  // update the uses accordingly.
+  bool LocalChange = false;
+  SmallSet<Constant *, 16> AlreadyChecked;
+
+  for (auto &MBB : F) {
+    for (auto &MI : MBB) {
+      // Traverse the operands, looking for constant vectors. Replace them by
+      // a load of a global variable of constant vector type.
+      for (unsigned OpIdx = 0, EndOpIdx = MI.getNumOperands();
+           OpIdx != EndOpIdx; ++OpIdx) {
+        Constant *Cst = dyn_cast<Constant>(MI.getOperand(OpIdx));
+        // There is no point in promoting global values as they are already
+        // global. Do not promote constant expressions either, as they may
+        // require some code expansion.
+        if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
+            AlreadyChecked.insert(Cst))
+          LocalChange |= promoteConstant(Cst);
+      }
+    }
+  }
+  return LocalChange;
+}
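[Editor's note: the pass is wired into the AArch64 pass pipeline by the backend itself; as a hedged sketch of how the factory above would typically be scheduled with the LLVM 3.5-era legacy pass manager — runPromoteConst is a hypothetical driver, not code from the patch:]

    #include "AArch64.h"
    #include "llvm/IR/Module.h"
    #include "llvm/PassManager.h"
    using namespace llvm;

    static void runPromoteConst(Module &M) {
      PassManager PM;                             // legacy pass manager
      PM.add(createAArch64PromoteConstantPass()); // factory defined above
      PM.run(M);                                  // promotes eligible constants
    }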
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 06e1ffb..01b9587 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -12,175 +12,393 @@
 //
 //===----------------------------------------------------------------------===//
 
-
 #include "AArch64RegisterInfo.h"
 #include "AArch64FrameLowering.h"
-#include "AArch64MachineFunctionInfo.h"
-#include "AArch64TargetMachine.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
 
 #define GET_REGINFO_TARGET_DESC
 #include "AArch64GenRegisterInfo.inc"
 
-using namespace llvm;
+AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo *tii,
+                                         const AArch64Subtarget *sti)
+    : AArch64GenRegisterInfo(AArch64::LR), TII(tii), STI(sti) {}
 
-AArch64RegisterInfo::AArch64RegisterInfo()
-  : AArch64GenRegisterInfo(AArch64::X30) {
-}
-
-const uint16_t *
+const MCPhysReg *
 AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
-  return CSR_PCS_SaveList;
-}
-
-const uint32_t*
-AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID) const {
-  return CSR_PCS_RegMask;
+  assert(MF && "Invalid MachineFunction pointer.");
+  if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg)
+    return CSR_AArch64_AllRegs_SaveList;
+  else
+    return CSR_AArch64_AAPCS_SaveList;
 }
 
-const uint32_t *AArch64RegisterInfo::getTLSDescCallPreservedMask() const {
-  return TLSDesc_RegMask;
+const uint32_t *
+AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  if (CC == CallingConv::AnyReg)
+    return CSR_AArch64_AllRegs_RegMask;
+  else
+    return CSR_AArch64_AAPCS_RegMask;
 }
 
-const TargetRegisterClass *
-AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
-  if (RC == &AArch64::FlagClassRegClass)
-    return &AArch64::GPR64RegClass;
+const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
+  if (STI->isTargetDarwin())
+    return CSR_AArch64_TLS_Darwin_RegMask;
 
-  return RC;
+  assert(STI->isTargetELF() && "only expect Darwin or ELF TLS");
+  return CSR_AArch64_TLS_ELF_RegMask;
 }
 
-
+const uint32_t *
+AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
+  // This should return a register mask that is the same as that returned by
+  // getCallPreservedMask but that additionally preserves the register used for
+  // the first i64 argument (which must also be the register used to return a
+  // single i64 return value)
+  //
+  // In case that the calling convention does not use the same register for
+  // both, the function should return NULL (does not currently apply)
+  return CSR_AArch64_AAPCS_ThisReturn_RegMask;
+}
 
 BitVector
 AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
-  BitVector Reserved(getNumRegs());
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
-  Reserved.set(AArch64::XSP);
-  Reserved.set(AArch64::WSP);
-
+  // FIXME: avoid re-calculating this every time.
+  BitVector Reserved(getNumRegs());
+  Reserved.set(AArch64::SP);
   Reserved.set(AArch64::XZR);
+  Reserved.set(AArch64::WSP);
   Reserved.set(AArch64::WZR);
 
-  if (TFI->hasFP(MF)) {
-    Reserved.set(AArch64::X29);
+  if (TFI->hasFP(MF) || STI->isTargetDarwin()) {
+    Reserved.set(AArch64::FP);
     Reserved.set(AArch64::W29);
   }
 
+  if (STI->isTargetDarwin()) {
+    Reserved.set(AArch64::X18); // Platform register
+    Reserved.set(AArch64::W18);
+  }
+
+  if (hasBasePointer(MF)) {
+    Reserved.set(AArch64::X19);
+    Reserved.set(AArch64::W19);
+  }
+
   return Reserved;
 }
 
-static bool hasFrameOffset(int opcode) {
-  return opcode != AArch64::LD1x2_8B && opcode != AArch64::LD1x3_8B &&
-         opcode != AArch64::LD1x4_8B && opcode != AArch64::ST1x2_8B &&
-         opcode != AArch64::ST1x3_8B && opcode != AArch64::ST1x4_8B &&
-         opcode != AArch64::LD1x2_16B && opcode != AArch64::LD1x3_16B &&
-         opcode != AArch64::LD1x4_16B && opcode != AArch64::ST1x2_16B &&
-         opcode != AArch64::ST1x3_16B && opcode != AArch64::ST1x4_16B;
-}
+bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
+                                        unsigned Reg) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
-void
-AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
-                                         int SPAdj,
-                                         unsigned FIOperandNum,
-                                         RegScavenger *RS) const {
-  assert(SPAdj == 0 && "Cannot deal with nonzero SPAdj yet");
-  MachineInstr &MI = *MBBI;
-  MachineBasicBlock &MBB = *MI.getParent();
-  MachineFunction &MF = *MBB.getParent();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  const AArch64FrameLowering *TFI =
-      static_cast<const AArch64FrameLowering *>(MF.getTarget().getFrameLowering());
-
-  // In order to work out the base and offset for addressing, the FrameLowering
-  // code needs to know (sometimes) whether the instruction is storing/loading a
-  // callee-saved register, or whether it's a more generic
-  // operation. Fortunately the frame indices are used *only* for that purpose
-  // and are contiguous, so we can check here.
-  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-  int MinCSFI = 0;
-  int MaxCSFI = -1;
-
-  if (CSI.size()) {
-    MinCSFI = CSI[0].getFrameIdx();
-    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+  switch (Reg) {
+  default:
+    break;
+  case AArch64::SP:
+  case AArch64::XZR:
+  case AArch64::WSP:
+  case AArch64::WZR:
+    return true;
+  case AArch64::X18:
+  case AArch64::W18:
+    return STI->isTargetDarwin();
+  case AArch64::FP:
+  case AArch64::W29:
+    return TFI->hasFP(MF) || STI->isTargetDarwin();
+  case AArch64::W19:
+  case AArch64::X19:
+    return hasBasePointer(MF);
   }
 
-  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
-  bool IsCalleeSaveOp = FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI;
+  return false;
+}
 
-  unsigned FrameReg;
-  int64_t Offset;
-  Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj,
-                                           IsCalleeSaveOp);
-  // A vector load/store instruction doesn't have an offset operand.
-  bool HasOffsetOp = hasFrameOffset(MI.getOpcode());
-  if (HasOffsetOp)
-    Offset += MI.getOperand(FIOperandNum + 1).getImm();
+const TargetRegisterClass *
+AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                        unsigned Kind) const {
+  return &AArch64::GPR64RegClass;
+}
 
-  // DBG_VALUE instructions have no real restrictions so they can be handled
-  // easily.
-  if (MI.isDebugValue()) {
-    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/ false);
-    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
-    return;
-  }
+const TargetRegisterClass *
+AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  if (RC == &AArch64::CCRRegClass)
+    return &AArch64::GPR64RegClass; // Only MSR & MRS copy NZCV.
+  return RC;
+}
 
-  const AArch64InstrInfo &TII =
-      *static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
-  int MinOffset, MaxOffset, OffsetScale;
-  if (MI.getOpcode() == AArch64::ADDxxi_lsl0_s || !HasOffsetOp) {
-    MinOffset = 0;
-    MaxOffset = 0xfff;
-    OffsetScale = 1;
-  } else {
-    // Load/store of a stack object
-    TII.getAddressConstraints(MI, OffsetScale, MinOffset, MaxOffset);
-  }
+unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; }
 
-  // There are two situations we don't use frame + offset directly in the
-  // instruction:
-  // (1) The offset can't really be scaled
-  // (2) Can't encode offset as it doesn't have an offset operand
-  if ((Offset % OffsetScale != 0 || Offset < MinOffset || Offset > MaxOffset) ||
-      (!HasOffsetOp && Offset != 0)) {
-    unsigned BaseReg =
-        MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
-    emitRegUpdate(MBB, MBBI, MBBI->getDebugLoc(), TII,
-                  BaseReg, FrameReg, BaseReg, Offset);
-    FrameReg = BaseReg;
-    Offset = 0;
-  }
+bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
 
-  // Negative offsets are expected if we address from FP, but for
-  // now this checks nothing has gone horribly wrong.
-  assert(Offset >= 0 && "Unexpected negative offset from SP");
+  // In the presence of variable sized objects, if the fixed stack size is
+  // large enough that referencing from the FP won't result in things being
+  // in range relatively often, we can use a base pointer to allow access
+  // from the other direction like the SP normally works.
+  if (MFI->hasVarSizedObjects()) {
+    // Conservatively estimate whether the negative offset from the frame
+    // pointer will be sufficient to reach. If a function has a smallish
+    // frame, it's less likely to have lots of spills and callee saved
+    // space, so it's all more likely to be within range of the frame pointer.
+    // If it's wrong, we'll materialize the constant and still get to the
+    // object; it's just suboptimal. Negative offsets use the unscaled
+    // load/store instructions, which have a 9-bit signed immediate.
+    if (MFI->getLocalFrameSize() < 256)
+      return false;
+    return true;
+  }
 
-  MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, true);
-  if (HasOffsetOp)
-    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset / OffsetScale);
+  return false;
 }
 
 unsigned
 AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
-  if (TFI->hasFP(MF))
-    return AArch64::X29;
-  else
-    return AArch64::XSP;
+  return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP;
+}
+
+bool AArch64RegisterInfo::requiresRegisterScavenging(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+bool AArch64RegisterInfo::requiresVirtualBaseRegisters(
+    const MachineFunction &MF) const {
+  return true;
 }
 
 bool
 AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // AArch64FrameLowering::resolveFrameIndexReference() can always fall back
+  // to the stack pointer, so only put the emergency spill slot next to the
+  // FP when there's no better way to access it (SP or base pointer).
+  return MFI->hasVarSizedObjects() && !hasBasePointer(MF);
+}
+
+bool AArch64RegisterInfo::requiresFrameIndexScavenging(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+bool
+AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Only consider eliminating leaf frames.
+  if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) &&
+                          MFI->adjustsStack()))
+    return true;
+  return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
+}
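[Editor's note: the needsFrameBaseReg hook that follows works on deliberately rough estimates. As a worked example under its own stated assumptions: a load of a fixed local at entry-SP offset -40 in a function with a 512-byte local frame is checked twice — FP-relative, as FPOffset = -40 - 16*20 = -360, and SP-relative, as -40 + 512 + 128 = 600. Only if both estimated offsets are illegal for the instruction does the hook request a virtual base register.]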
+
+/// needsFrameBaseReg - Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
+                                            int64_t Offset) const {
+  for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i)
+    assert(i < MI->getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+
+  // It's the load/store FI references that cause issues, as it can be
+  // difficult to materialize the offset if it won't fit in the literal field.
+  // Estimate based on the size of the local frame and some conservative
+  // assumptions about the rest of the stack frame (note, this is pre-regalloc,
+  // so we don't know everything for certain yet) whether this offset is likely
+  // to be out of range of the immediate. Return true if so.
+
+  // We only generate virtual base registers for loads and stores, so
+  // return false for everything else.
+  if (!MI->mayLoad() && !MI->mayStore())
+    return false;
+
+  // Without a virtual base register, if the function has variable sized
+  // objects, all fixed-size local references will be via the frame pointer.
+  // Approximate the offset and see if it's legal for the instruction.
+  // Note that the incoming offset is based on the SP value at function entry,
+  // so it'll be negative.
+  MachineFunction &MF = *MI->getParent()->getParent();
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-  const AArch64FrameLowering *AFI
-    = static_cast<const AArch64FrameLowering*>(TFI);
-  return AFI->useFPForAddressing(MF);
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Estimate an offset from the frame pointer.
+  // Conservatively assume all GPR callee-saved registers get pushed.
+  // FP, LR, X19-X28, D8-D15. 64-bits each.
+  int64_t FPOffset = Offset - 16 * 20;
+  // Estimate an offset from the stack pointer.
+  // The incoming offset is relating to the SP at the start of the function,
+  // but when we access the local it'll be relative to the SP after local
+  // allocation, so adjust our SP-relative offset by that allocation size.
+  Offset += MFI->getLocalFrameSize();
+  // Assume that we'll have at least some spill slots allocated.
+  // FIXME: This is a total SWAG number. We should run some statistics
+  // and pick a real one.
+  Offset += 128; // 128 bytes of spill slots
+
+  // If there is a frame pointer, try using it.
+  // The FP is only available if there is no dynamic realignment. We
+  // don't know for sure yet whether we'll need that, so we guess based
+  // on whether there are any local variables that would trigger it.
+  if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset))
+    return false;
+
+  // If we can reference via the stack pointer or base pointer, try that.
+  // FIXME: This (and the code that resolves the references) can be improved
+  //        to only disallow SP relative references in the live range of
+  //        the VLA(s). In practice, it's unclear how much difference that
+  //        would make, but it may be worth doing.
+  if (isFrameOffsetLegal(MI, Offset))
+    return false;
+
+  // The offset likely isn't legal; we want to allocate a virtual base register.
+  return true;
+}
+
+bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+                                             int64_t Offset) const {
+  assert(Offset <= INT_MAX && "Offset too big to fit in int.");
+  assert(MI && "Unable to get the legal offset for nil instruction.");
+  int SaveOffset = Offset;
+  return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal;
+}
+
+/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx
+/// at the beginning of the basic block.
+void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                                                       unsigned BaseReg,
+                                                       int FrameIdx,
+                                                       int64_t Offset) const {
+  MachineBasicBlock::iterator Ins = MBB->begin();
+  DebugLoc DL; // Defaults to "unknown"
+  if (Ins != MBB->end())
+    DL = Ins->getDebugLoc();
+
+  const MCInstrDesc &MCID = TII->get(AArch64::ADDXri);
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  const MachineFunction &MF = *MBB->getParent();
+  MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF));
+  unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
+
+  BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+      .addFrameIndex(FrameIdx)
+      .addImm(Offset)
+      .addImm(Shifter);
+}
+
+void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                            int64_t Offset) const {
+  int Off = Offset; // ARM doesn't need the general 64-bit offsets
+  unsigned i = 0;
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+  bool Done = rewriteAArch64FrameIndex(MI, i, BaseReg, Off, TII);
+  assert(Done && "Unable to resolve frame index!");
+  (void)Done;
+}
+
+void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                              int SPAdj, unsigned FIOperandNum,
+                                              RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64FrameLowering *TFI = static_cast<const AArch64FrameLowering *>(
+      MF.getTarget().getFrameLowering());
+
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  unsigned FrameReg;
+  int Offset;
+
+  // Special handling of dbg_value, stackmap and patchpoint instructions.
+  if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
+      MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+    Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
+                                             /*PreferFP=*/true);
+    Offset += MI.getOperand(FIOperandNum + 1).getImm();
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // Modify MI as necessary to handle as much of 'Offset' as possible
+  Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg);
+  if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
+    return;
+
+  assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) &&
+         "Emergency spill slot is out of reach");
+
+  // If we get here, the immediate doesn't fit into the instruction. We folded
+  // as much as possible above. Handle the rest, providing a register that is
+  // SP+LargeImm.
+  unsigned ScratchReg =
+      MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+  emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
+  MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
+}
+
+namespace llvm {
+
+unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                                  MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case AArch64::GPR32RegClassID:
+  case AArch64::GPR32spRegClassID:
+  case AArch64::GPR32allRegClassID:
+  case AArch64::GPR64spRegClassID:
+  case AArch64::GPR64allRegClassID:
+  case AArch64::GPR64RegClassID:
+  case AArch64::GPR32commonRegClassID:
+  case AArch64::GPR64commonRegClassID:
+    return 32 - 1                                      // XZR/SP
+           - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP
+           - STI->isTargetDarwin() // X18 reserved as platform register
+           - hasBasePointer(MF);   // X19
+  case AArch64::FPR8RegClassID:
+  case AArch64::FPR16RegClassID:
+  case AArch64::FPR32RegClassID:
+  case AArch64::FPR64RegClassID:
+  case AArch64::FPR128RegClassID:
+    return 32;
+
+  case AArch64::DDRegClassID:
+  case AArch64::DDDRegClassID:
+  case AArch64::DDDDRegClassID:
+  case AArch64::QQRegClassID:
+  case AArch64::QQQRegClassID:
+  case AArch64::QQQQRegClassID:
+    return 32;
+
+  case AArch64::FPR128_loRegClassID:
+    return 16;
+  }
+}
+
+} // namespace llvm
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h
index 4d67943..76af1ed 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -1,4 +1,4 @@
-//==- AArch64RegisterInfo.h - AArch64 Register Information Impl -*- C++ -*-===//
+//==- AArch64RegisterInfo.h - AArch64 Register Information Impl --*- C++ -*-==//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -7,14 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains the AArch64 implementation of the MCRegisterInfo class.
+// This file contains the AArch64 implementation of the MRegisterInfo class.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_AARCH64REGISTERINFO_H
-#define LLVM_TARGET_AARCH64REGISTERINFO_H
-
-#include "llvm/Target/TargetRegisterInfo.h"
+#ifndef LLVM_TARGET_AArch64REGISTERINFO_H
+#define LLVM_TARGET_AArch64REGISTERINFO_H
 
 #define GET_REGINFO_HEADER
 #include "AArch64GenRegisterInfo.inc"
@@ -23,49 +21,81 @@
 namespace llvm {
 
 class AArch64InstrInfo;
 class AArch64Subtarget;
+class MachineFunction;
+class RegScavenger;
+class TargetRegisterClass;
 
 struct AArch64RegisterInfo : public AArch64GenRegisterInfo {
-  AArch64RegisterInfo();
+private:
+  const AArch64InstrInfo *TII;
+  const AArch64Subtarget *STI;
 
-  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
-  const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+public:
+  AArch64RegisterInfo(const AArch64InstrInfo *tii, const AArch64Subtarget *sti);
 
-  const uint32_t *getTLSDescCallPreservedMask() const;
+  bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
 
-  BitVector getReservedRegs(const MachineFunction &MF) const;
-  unsigned getFrameRegister(const MachineFunction &MF) const;
+  /// Code Generation virtual methods...
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+  const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
 
-  void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
-                           unsigned FIOperandNum,
-                           RegScavenger *Rs = NULL) const;
+  unsigned getCSRFirstUseCost() const override {
+    // The cost will be compared against BlockFrequency where entry has the
+    // value of 1 << 14. A value of 5 will choose to spill or split really
+    // cold path instead of using a callee-saved register.
+    return 5;
+  }
 
-  /// getCrossCopyRegClass - Returns a legal register class to copy a register
-  /// in the specified class to or from. Returns original class if it is
-  /// possible to copy between a two registers of the specified class.
+  // Calls involved in thread-local variable lookup save more registers than
+  // normal calls, so they need a different mask to represent this.
+  const uint32_t *getTLSCallPreservedMask() const;
+
+  /// getThisReturnPreservedMask - Returns a call preserved mask specific to
+  /// the case that 'returned' is on an i64 first argument if the calling
+  /// convention is one that can (partially) model this attribute with a
+  /// preserved mask (i.e. it is a calling convention that uses the same
+  /// register for the first i64 argument and an i64 return value)
+  ///
+  /// Should return NULL in the case that the calling convention does not have
+  /// this property
+  const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
   const TargetRegisterClass *
-  getCrossCopyRegClass(const TargetRegisterClass *RC) const;
-
-  /// getLargestLegalSuperClass - Returns the largest super class of RC that is
-  /// legal to use in the current sub-target and has the same spill size.
-  const TargetRegisterClass*
-  getLargestLegalSuperClass(const TargetRegisterClass *RC) const {
-    if (RC == &AArch64::tcGPR64RegClass)
-      return &AArch64::GPR64RegClass;
-
-    return RC;
-  }
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+  const TargetRegisterClass *
+  getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+  bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
+  bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+  bool isFrameOffsetLegal(const MachineInstr *MI,
+                          int64_t Offset) const override;
+  void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg,
+                                    int FrameIdx,
+                                    int64_t Offset) const override;
+  void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                         int64_t Offset) const override;
+  void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+  bool cannotEliminateFrame(const MachineFunction &MF) const;
 
-  bool requiresRegisterScavenging(const MachineFunction &MF) const {
-    return true;
-  }
+  bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
+  bool hasBasePointer(const MachineFunction &MF) const;
+  unsigned getBaseRegister() const;
 
-  bool requiresFrameIndexScavenging(const MachineFunction &MF) const {
-    return true;
-  }
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
 
-  bool useFPForScavengingIndex(const MachineFunction &MF) const;
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
 };
 
 } // end namespace llvm
 
-#endif // LLVM_TARGET_AARCH64REGISTERINFO_H
+#endif // LLVM_TARGET_AArch64REGISTERINFO_H
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index 9de7abd..21c927f 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -1,4 +1,4 @@
-//===- AArch64RegisterInfo.td - ARM Register defs ----------*- tablegen -*-===//
+//=- AArch64RegisterInfo.td - Describe the AArch64 Registers --*- tablegen -*-=//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -7,284 +7,587 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains declarations that describe the AArch64 register file
 //
 //===----------------------------------------------------------------------===//
 
-let Namespace = "AArch64" in {
-def sub_128 : SubRegIndex<128>;
-def sub_64 : SubRegIndex<64>;
-def sub_32 : SubRegIndex<32>;
-def sub_16 : SubRegIndex<16>;
-def sub_8 : SubRegIndex<8>;
-
-// Note: Code depends on these having consecutive numbers.
-def qqsub : SubRegIndex<256, 256>;
-
-def qsub_0 : SubRegIndex<128>;
-def qsub_1 : SubRegIndex<128, 128>;
-def qsub_2 : ComposedSubRegIndex<qqsub, qsub_0>;
-def qsub_3 : ComposedSubRegIndex<qqsub, qsub_1>;
-
-def dsub_0 : SubRegIndex<64>;
-def dsub_1 : SubRegIndex<64, 64>;
-def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>;
-def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>;
-}
-
-// Registers are identified with 5-bit ID numbers.
-class AArch64Reg enc, string n> : Register { +class AArch64Reg enc, string n, list subregs = [], + list altNames = []> + : Register { let HWEncoding = enc; let Namespace = "AArch64"; + let SubRegs = subregs; } -class AArch64RegWithSubs enc, string n, list subregs = [], - list inds = []> - : AArch64Reg { - let SubRegs = subregs; - let SubRegIndices = inds; +let Namespace = "AArch64" in { + def sub_32 : SubRegIndex<32>; + + def bsub : SubRegIndex<8>; + def hsub : SubRegIndex<16>; + def ssub : SubRegIndex<32>; + def dsub : SubRegIndex<32>; + def qhisub : SubRegIndex<64>; + def qsub : SubRegIndex<64>; + // Note: Code depends on these having consecutive numbers + def dsub0 : SubRegIndex<64>; + def dsub1 : SubRegIndex<64>; + def dsub2 : SubRegIndex<64>; + def dsub3 : SubRegIndex<64>; + // Note: Code depends on these having consecutive numbers + def qsub0 : SubRegIndex<128>; + def qsub1 : SubRegIndex<128>; + def qsub2 : SubRegIndex<128>; + def qsub3 : SubRegIndex<128>; +} + +let Namespace = "AArch64" in { + def vreg : RegAltNameIndex; + def vlist1 : RegAltNameIndex; } //===----------------------------------------------------------------------===// -// Integer registers: w0-w30, wzr, wsp, x0-x30, xzr, sp +// Registers //===----------------------------------------------------------------------===// - -foreach Index = 0-30 in { - def W#Index : AArch64Reg< Index, "w"#Index>, DwarfRegNum<[Index]>; +def W0 : AArch64Reg<0, "w0" >, DwarfRegNum<[0]>; +def W1 : AArch64Reg<1, "w1" >, DwarfRegNum<[1]>; +def W2 : AArch64Reg<2, "w2" >, DwarfRegNum<[2]>; +def W3 : AArch64Reg<3, "w3" >, DwarfRegNum<[3]>; +def W4 : AArch64Reg<4, "w4" >, DwarfRegNum<[4]>; +def W5 : AArch64Reg<5, "w5" >, DwarfRegNum<[5]>; +def W6 : AArch64Reg<6, "w6" >, DwarfRegNum<[6]>; +def W7 : AArch64Reg<7, "w7" >, DwarfRegNum<[7]>; +def W8 : AArch64Reg<8, "w8" >, DwarfRegNum<[8]>; +def W9 : AArch64Reg<9, "w9" >, DwarfRegNum<[9]>; +def W10 : AArch64Reg<10, "w10">, DwarfRegNum<[10]>; +def W11 : AArch64Reg<11, "w11">, DwarfRegNum<[11]>; +def W12 : AArch64Reg<12, "w12">, DwarfRegNum<[12]>; +def W13 : AArch64Reg<13, "w13">, DwarfRegNum<[13]>; +def W14 : AArch64Reg<14, "w14">, DwarfRegNum<[14]>; +def W15 : AArch64Reg<15, "w15">, DwarfRegNum<[15]>; +def W16 : AArch64Reg<16, "w16">, DwarfRegNum<[16]>; +def W17 : AArch64Reg<17, "w17">, DwarfRegNum<[17]>; +def W18 : AArch64Reg<18, "w18">, DwarfRegNum<[18]>; +def W19 : AArch64Reg<19, "w19">, DwarfRegNum<[19]>; +def W20 : AArch64Reg<20, "w20">, DwarfRegNum<[20]>; +def W21 : AArch64Reg<21, "w21">, DwarfRegNum<[21]>; +def W22 : AArch64Reg<22, "w22">, DwarfRegNum<[22]>; +def W23 : AArch64Reg<23, "w23">, DwarfRegNum<[23]>; +def W24 : AArch64Reg<24, "w24">, DwarfRegNum<[24]>; +def W25 : AArch64Reg<25, "w25">, DwarfRegNum<[25]>; +def W26 : AArch64Reg<26, "w26">, DwarfRegNum<[26]>; +def W27 : AArch64Reg<27, "w27">, DwarfRegNum<[27]>; +def W28 : AArch64Reg<28, "w28">, DwarfRegNum<[28]>; +def W29 : AArch64Reg<29, "w29">, DwarfRegNum<[29]>; +def W30 : AArch64Reg<30, "w30">, DwarfRegNum<[30]>; +def WSP : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>; +def WZR : AArch64Reg<31, "wzr">, DwarfRegAlias; + +let SubRegIndices = [sub_32] in { +def X0 : AArch64Reg<0, "x0", [W0]>, DwarfRegAlias; +def X1 : AArch64Reg<1, "x1", [W1]>, DwarfRegAlias; +def X2 : AArch64Reg<2, "x2", [W2]>, DwarfRegAlias; +def X3 : AArch64Reg<3, "x3", [W3]>, DwarfRegAlias; +def X4 : AArch64Reg<4, "x4", [W4]>, DwarfRegAlias; +def X5 : AArch64Reg<5, "x5", [W5]>, DwarfRegAlias; +def X6 : AArch64Reg<6, "x6", [W6]>, DwarfRegAlias; +def X7 : AArch64Reg<7, 
"x7", [W7]>, DwarfRegAlias; +def X8 : AArch64Reg<8, "x8", [W8]>, DwarfRegAlias; +def X9 : AArch64Reg<9, "x9", [W9]>, DwarfRegAlias; +def X10 : AArch64Reg<10, "x10", [W10]>, DwarfRegAlias; +def X11 : AArch64Reg<11, "x11", [W11]>, DwarfRegAlias; +def X12 : AArch64Reg<12, "x12", [W12]>, DwarfRegAlias; +def X13 : AArch64Reg<13, "x13", [W13]>, DwarfRegAlias; +def X14 : AArch64Reg<14, "x14", [W14]>, DwarfRegAlias; +def X15 : AArch64Reg<15, "x15", [W15]>, DwarfRegAlias; +def X16 : AArch64Reg<16, "x16", [W16]>, DwarfRegAlias; +def X17 : AArch64Reg<17, "x17", [W17]>, DwarfRegAlias; +def X18 : AArch64Reg<18, "x18", [W18]>, DwarfRegAlias; +def X19 : AArch64Reg<19, "x19", [W19]>, DwarfRegAlias; +def X20 : AArch64Reg<20, "x20", [W20]>, DwarfRegAlias; +def X21 : AArch64Reg<21, "x21", [W21]>, DwarfRegAlias; +def X22 : AArch64Reg<22, "x22", [W22]>, DwarfRegAlias; +def X23 : AArch64Reg<23, "x23", [W23]>, DwarfRegAlias; +def X24 : AArch64Reg<24, "x24", [W24]>, DwarfRegAlias; +def X25 : AArch64Reg<25, "x25", [W25]>, DwarfRegAlias; +def X26 : AArch64Reg<26, "x26", [W26]>, DwarfRegAlias; +def X27 : AArch64Reg<27, "x27", [W27]>, DwarfRegAlias; +def X28 : AArch64Reg<28, "x28", [W28]>, DwarfRegAlias; +def FP : AArch64Reg<29, "x29", [W29]>, DwarfRegAlias; +def LR : AArch64Reg<30, "x30", [W30]>, DwarfRegAlias; +def SP : AArch64Reg<31, "sp", [WSP]>, DwarfRegAlias; +def XZR : AArch64Reg<31, "xzr", [WZR]>, DwarfRegAlias; } -def WSP : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>; -def WZR : AArch64Reg<31, "wzr">; +// Condition code register. +def NZCV : AArch64Reg<0, "nzcv">; -// Could be combined with previous loop, but this way leaves w and x registers -// consecutive as LLVM register numbers, which makes for easier debugging. -foreach Index = 0-30 in { - def X#Index : AArch64RegWithSubs("W"#Index)], [sub_32]>, - DwarfRegNum<[Index]>; +// GPR register classes with the intersections of GPR32/GPR32sp and +// GPR64/GPR64sp for use by the coalescer. +def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> { + let AltOrders = [(rotl GPR32common, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64common : RegisterClass<"AArch64", [i64], 64, + (add (sequence "X%u", 0, 28), FP, LR)> { + let AltOrders = [(rotl GPR64common, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +// GPR register classes which exclude SP/WSP. +def GPR32 : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR)> { + let AltOrders = [(rotl GPR32, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64 : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR)> { + let AltOrders = [(rotl GPR64, 8)]; + let AltOrderSelect = [{ return 1; }]; } -def XSP : AArch64RegWithSubs<31, "sp", [WSP], [sub_32]>, DwarfRegNum<[31]>; -def XZR : AArch64RegWithSubs<31, "xzr", [WZR], [sub_32]>; +// GPR register classes which include SP/WSP. +def GPR32sp : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WSP)> { + let AltOrders = [(rotl GPR32sp, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64sp : RegisterClass<"AArch64", [i64], 64, (add GPR64common, SP)> { + let AltOrders = [(rotl GPR64sp, 8)]; + let AltOrderSelect = [{ return 1; }]; +} -// Most instructions treat register 31 as zero for reads and a black-hole for -// writes. 
+def GPR32sponly : RegisterClass<"AArch64", [i32], 32, (add WSP)>; +def GPR64sponly : RegisterClass<"AArch64", [i64], 64, (add SP)>; -// Note that the order of registers is important for the Disassembler here: -// tablegen uses it to form MCRegisterClass::getRegister, which we assume can -// take an encoding value. -def GPR32 : RegisterClass<"AArch64", [i32], 32, - (add (sequence "W%u", 0, 30), WZR)> { +def GPR64spPlus0Operand : AsmOperandClass { + let Name = "GPR64sp0"; + let RenderMethod = "addRegOperands"; + let ParserMethod = "tryParseGPR64sp0Operand"; } -def GPR64 : RegisterClass<"AArch64", [i64], 64, - (add (sequence "X%u", 0, 30), XZR)> { +def GPR64sp0 : RegisterOperand { + let ParserMatchClass = GPR64spPlus0Operand; } -def GPR32nowzr : RegisterClass<"AArch64", [i32], 32, - (sequence "W%u", 0, 30)> { +// GPR register classes which include WZR/XZR AND SP/WSP. This is not a +// constraint used by any instructions; it is only used as a common +// super-class. +def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>; +def GPR64all : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR, SP)>; + +// For tail calls, we can't use callee-saved registers, as they are restored +// to the saved value before the tail call, which would clobber a call address. +// This is for indirect tail calls to store the address of the destination. +def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X21, + X22, X23, X24, X25, X26, + X27, X28)>; + +// GPR register classes for the post-increment amount of vector load/store +// instructions. These have alternate printing when Rm=31: they print a +// constant immediate value equal to the total number of bytes transferred. + +// FIXME: TableGen *should* be able to do these itself now. There appears to be +// a bug in counting how many operands a Post-indexed MCInst should have which +// means the aliases don't trigger. +def GPR64pi1 : RegisterOperand">; +def GPR64pi2 : RegisterOperand">; +def GPR64pi3 : RegisterOperand">; +def GPR64pi4 : RegisterOperand">; +def GPR64pi6 : RegisterOperand">; +def GPR64pi8 : RegisterOperand">; +def GPR64pi12 : RegisterOperand">; +def GPR64pi16 : RegisterOperand">; +def GPR64pi24 : RegisterOperand">; +def GPR64pi32 : RegisterOperand">; +def GPR64pi48 : RegisterOperand">; +def GPR64pi64 : RegisterOperand">; + +// Condition code regclass. +def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> { + let CopyCost = -1; // Don't allow copying of status registers. + + // CCR is not allocatable. + let isAllocatable = 0; } -def GPR64noxzr : RegisterClass<"AArch64", [i64], 64, - (sequence "X%u", 0, 30)> { -} +//===----------------------------------------------------------------------===// +// Floating Point Scalar Registers +//===----------------------------------------------------------------------===// -// For tail calls, we can't use callee-saved registers or the structure-return -// register, as they are supposed to be live across function calls and may be -// clobbered by the epilogue.
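The tcGPR64 set difference above encodes a calling-convention fact: x19-x28 are callee-saved under AAPCS64, so the epilogue restores them before the tail-call branch, and an indirect call address kept there would be clobbered. A sketch that derives the same usable set by register number (plain C++, illustrative only):

#include <iostream>
#include <set>

int main() {
  // GPR64common covers x0-x28 plus fp (x29) and lr (x30); model by number.
  std::set<int> Usable;
  for (int R = 0; R <= 30; ++R)
    Usable.insert(R);

  // AAPCS64 callee-saved GPRs x19-x28 are restored before the tail-call
  // branch, so remove them, mirroring (sub GPR64common, X19, ..., X28).
  for (int R = 19; R <= 28; ++R)
    Usable.erase(R);

  std::cout << "registers usable for an indirect tail-call target:";
  for (int R : Usable)
    std::cout << " x" << R;
  std::cout << "\n";
  return 0;
}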
-def tcGPR64 : RegisterClass<"AArch64", [i64], 64, - (add (sequence "X%u", 0, 7), - (sequence "X%u", 9, 18))> { +def B0 : AArch64Reg<0, "b0">, DwarfRegNum<[64]>; +def B1 : AArch64Reg<1, "b1">, DwarfRegNum<[65]>; +def B2 : AArch64Reg<2, "b2">, DwarfRegNum<[66]>; +def B3 : AArch64Reg<3, "b3">, DwarfRegNum<[67]>; +def B4 : AArch64Reg<4, "b4">, DwarfRegNum<[68]>; +def B5 : AArch64Reg<5, "b5">, DwarfRegNum<[69]>; +def B6 : AArch64Reg<6, "b6">, DwarfRegNum<[70]>; +def B7 : AArch64Reg<7, "b7">, DwarfRegNum<[71]>; +def B8 : AArch64Reg<8, "b8">, DwarfRegNum<[72]>; +def B9 : AArch64Reg<9, "b9">, DwarfRegNum<[73]>; +def B10 : AArch64Reg<10, "b10">, DwarfRegNum<[74]>; +def B11 : AArch64Reg<11, "b11">, DwarfRegNum<[75]>; +def B12 : AArch64Reg<12, "b12">, DwarfRegNum<[76]>; +def B13 : AArch64Reg<13, "b13">, DwarfRegNum<[77]>; +def B14 : AArch64Reg<14, "b14">, DwarfRegNum<[78]>; +def B15 : AArch64Reg<15, "b15">, DwarfRegNum<[79]>; +def B16 : AArch64Reg<16, "b16">, DwarfRegNum<[80]>; +def B17 : AArch64Reg<17, "b17">, DwarfRegNum<[81]>; +def B18 : AArch64Reg<18, "b18">, DwarfRegNum<[82]>; +def B19 : AArch64Reg<19, "b19">, DwarfRegNum<[83]>; +def B20 : AArch64Reg<20, "b20">, DwarfRegNum<[84]>; +def B21 : AArch64Reg<21, "b21">, DwarfRegNum<[85]>; +def B22 : AArch64Reg<22, "b22">, DwarfRegNum<[86]>; +def B23 : AArch64Reg<23, "b23">, DwarfRegNum<[87]>; +def B24 : AArch64Reg<24, "b24">, DwarfRegNum<[88]>; +def B25 : AArch64Reg<25, "b25">, DwarfRegNum<[89]>; +def B26 : AArch64Reg<26, "b26">, DwarfRegNum<[90]>; +def B27 : AArch64Reg<27, "b27">, DwarfRegNum<[91]>; +def B28 : AArch64Reg<28, "b28">, DwarfRegNum<[92]>; +def B29 : AArch64Reg<29, "b29">, DwarfRegNum<[93]>; +def B30 : AArch64Reg<30, "b30">, DwarfRegNum<[94]>; +def B31 : AArch64Reg<31, "b31">, DwarfRegNum<[95]>; + +let SubRegIndices = [bsub] in { +def H0 : AArch64Reg<0, "h0", [B0]>, DwarfRegAlias; +def H1 : AArch64Reg<1, "h1", [B1]>, DwarfRegAlias; +def H2 : AArch64Reg<2, "h2", [B2]>, DwarfRegAlias; +def H3 : AArch64Reg<3, "h3", [B3]>, DwarfRegAlias; +def H4 : AArch64Reg<4, "h4", [B4]>, DwarfRegAlias; +def H5 : AArch64Reg<5, "h5", [B5]>, DwarfRegAlias; +def H6 : AArch64Reg<6, "h6", [B6]>, DwarfRegAlias; +def H7 : AArch64Reg<7, "h7", [B7]>, DwarfRegAlias; +def H8 : AArch64Reg<8, "h8", [B8]>, DwarfRegAlias; +def H9 : AArch64Reg<9, "h9", [B9]>, DwarfRegAlias; +def H10 : AArch64Reg<10, "h10", [B10]>, DwarfRegAlias; +def H11 : AArch64Reg<11, "h11", [B11]>, DwarfRegAlias; +def H12 : AArch64Reg<12, "h12", [B12]>, DwarfRegAlias; +def H13 : AArch64Reg<13, "h13", [B13]>, DwarfRegAlias; +def H14 : AArch64Reg<14, "h14", [B14]>, DwarfRegAlias; +def H15 : AArch64Reg<15, "h15", [B15]>, DwarfRegAlias; +def H16 : AArch64Reg<16, "h16", [B16]>, DwarfRegAlias; +def H17 : AArch64Reg<17, "h17", [B17]>, DwarfRegAlias; +def H18 : AArch64Reg<18, "h18", [B18]>, DwarfRegAlias; +def H19 : AArch64Reg<19, "h19", [B19]>, DwarfRegAlias; +def H20 : AArch64Reg<20, "h20", [B20]>, DwarfRegAlias; +def H21 : AArch64Reg<21, "h21", [B21]>, DwarfRegAlias; +def H22 : AArch64Reg<22, "h22", [B22]>, DwarfRegAlias; +def H23 : AArch64Reg<23, "h23", [B23]>, DwarfRegAlias; +def H24 : AArch64Reg<24, "h24", [B24]>, DwarfRegAlias; +def H25 : AArch64Reg<25, "h25", [B25]>, DwarfRegAlias; +def H26 : AArch64Reg<26, "h26", [B26]>, DwarfRegAlias; +def H27 : AArch64Reg<27, "h27", [B27]>, DwarfRegAlias; +def H28 : AArch64Reg<28, "h28", [B28]>, DwarfRegAlias; +def H29 : AArch64Reg<29, "h29", [B29]>, DwarfRegAlias; +def H30 : AArch64Reg<30, "h30", [B30]>, DwarfRegAlias; +def H31 : AArch64Reg<31, "h31", [B31]>, 
DwarfRegAlias; } +let SubRegIndices = [hsub] in { +def S0 : AArch64Reg<0, "s0", [H0]>, DwarfRegAlias; +def S1 : AArch64Reg<1, "s1", [H1]>, DwarfRegAlias; +def S2 : AArch64Reg<2, "s2", [H2]>, DwarfRegAlias; +def S3 : AArch64Reg<3, "s3", [H3]>, DwarfRegAlias; +def S4 : AArch64Reg<4, "s4", [H4]>, DwarfRegAlias; +def S5 : AArch64Reg<5, "s5", [H5]>, DwarfRegAlias; +def S6 : AArch64Reg<6, "s6", [H6]>, DwarfRegAlias; +def S7 : AArch64Reg<7, "s7", [H7]>, DwarfRegAlias; +def S8 : AArch64Reg<8, "s8", [H8]>, DwarfRegAlias; +def S9 : AArch64Reg<9, "s9", [H9]>, DwarfRegAlias; +def S10 : AArch64Reg<10, "s10", [H10]>, DwarfRegAlias; +def S11 : AArch64Reg<11, "s11", [H11]>, DwarfRegAlias; +def S12 : AArch64Reg<12, "s12", [H12]>, DwarfRegAlias; +def S13 : AArch64Reg<13, "s13", [H13]>, DwarfRegAlias; +def S14 : AArch64Reg<14, "s14", [H14]>, DwarfRegAlias; +def S15 : AArch64Reg<15, "s15", [H15]>, DwarfRegAlias; +def S16 : AArch64Reg<16, "s16", [H16]>, DwarfRegAlias; +def S17 : AArch64Reg<17, "s17", [H17]>, DwarfRegAlias; +def S18 : AArch64Reg<18, "s18", [H18]>, DwarfRegAlias; +def S19 : AArch64Reg<19, "s19", [H19]>, DwarfRegAlias; +def S20 : AArch64Reg<20, "s20", [H20]>, DwarfRegAlias; +def S21 : AArch64Reg<21, "s21", [H21]>, DwarfRegAlias; +def S22 : AArch64Reg<22, "s22", [H22]>, DwarfRegAlias; +def S23 : AArch64Reg<23, "s23", [H23]>, DwarfRegAlias; +def S24 : AArch64Reg<24, "s24", [H24]>, DwarfRegAlias; +def S25 : AArch64Reg<25, "s25", [H25]>, DwarfRegAlias; +def S26 : AArch64Reg<26, "s26", [H26]>, DwarfRegAlias; +def S27 : AArch64Reg<27, "s27", [H27]>, DwarfRegAlias; +def S28 : AArch64Reg<28, "s28", [H28]>, DwarfRegAlias; +def S29 : AArch64Reg<29, "s29", [H29]>, DwarfRegAlias; +def S30 : AArch64Reg<30, "s30", [H30]>, DwarfRegAlias; +def S31 : AArch64Reg<31, "s31", [H31]>, DwarfRegAlias; +} -// Certain addressing-useful instructions accept sp directly. Again the order of -// registers is important to the Disassembler. 
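The bsub/hsub/ssub chains threaded through the B/H/S registers above model successively wider views of one 128-bit storage unit, so writing a D register also defines the S, H and B registers beneath it. A rough analogy in plain C++ (illustrative only; prints 0 because the low word of double 1.0 is all zero bits):

#include <cstdint>
#include <cstring>
#include <iostream>

int main() {
  // One 128-bit FP/SIMD register, of which B/H/S/D are the low 8/16/32/64
  // bits, mirroring the bsub/hsub/ssub/dsub sub-register indices.
  uint8_t Q0[16] = {};

  double D0 = 1.0;
  std::memcpy(Q0, &D0, sizeof D0); // writing d0 defines the low half of q0

  float S0;
  std::memcpy(&S0, Q0, sizeof S0); // s0 is the low 32 bits of that storage
  std::cout << S0 << "\n";         // prints 0
  return 0;
}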
-def GPR32wsp : RegisterClass<"AArch64", [i32], 32, - (add (sequence "W%u", 0, 30), WSP)> { +let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in { +def D0 : AArch64Reg<0, "d0", [S0], ["v0", ""]>, DwarfRegAlias; +def D1 : AArch64Reg<1, "d1", [S1], ["v1", ""]>, DwarfRegAlias; +def D2 : AArch64Reg<2, "d2", [S2], ["v2", ""]>, DwarfRegAlias; +def D3 : AArch64Reg<3, "d3", [S3], ["v3", ""]>, DwarfRegAlias; +def D4 : AArch64Reg<4, "d4", [S4], ["v4", ""]>, DwarfRegAlias; +def D5 : AArch64Reg<5, "d5", [S5], ["v5", ""]>, DwarfRegAlias; +def D6 : AArch64Reg<6, "d6", [S6], ["v6", ""]>, DwarfRegAlias; +def D7 : AArch64Reg<7, "d7", [S7], ["v7", ""]>, DwarfRegAlias; +def D8 : AArch64Reg<8, "d8", [S8], ["v8", ""]>, DwarfRegAlias; +def D9 : AArch64Reg<9, "d9", [S9], ["v9", ""]>, DwarfRegAlias; +def D10 : AArch64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias; +def D11 : AArch64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias; +def D12 : AArch64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias; +def D13 : AArch64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias; +def D14 : AArch64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias; +def D15 : AArch64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias; +def D16 : AArch64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias; +def D17 : AArch64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias; +def D18 : AArch64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias; +def D19 : AArch64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias; +def D20 : AArch64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias; +def D21 : AArch64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias; +def D22 : AArch64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias; +def D23 : AArch64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias; +def D24 : AArch64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias; +def D25 : AArch64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias; +def D26 : AArch64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias; +def D27 : AArch64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias; +def D28 : AArch64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias; +def D29 : AArch64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias; +def D30 : AArch64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias; +def D31 : AArch64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias; } -def GPR64xsp : RegisterClass<"AArch64", [i64], 64, - (add (sequence "X%u", 0, 30), XSP)> { +let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in { +def Q0 : AArch64Reg<0, "q0", [D0], ["v0", ""]>, DwarfRegAlias; +def Q1 : AArch64Reg<1, "q1", [D1], ["v1", ""]>, DwarfRegAlias; +def Q2 : AArch64Reg<2, "q2", [D2], ["v2", ""]>, DwarfRegAlias; +def Q3 : AArch64Reg<3, "q3", [D3], ["v3", ""]>, DwarfRegAlias; +def Q4 : AArch64Reg<4, "q4", [D4], ["v4", ""]>, DwarfRegAlias; +def Q5 : AArch64Reg<5, "q5", [D5], ["v5", ""]>, DwarfRegAlias; +def Q6 : AArch64Reg<6, "q6", [D6], ["v6", ""]>, DwarfRegAlias; +def Q7 : AArch64Reg<7, "q7", [D7], ["v7", ""]>, DwarfRegAlias; +def Q8 : AArch64Reg<8, "q8", [D8], ["v8", ""]>, DwarfRegAlias; +def Q9 : AArch64Reg<9, "q9", [D9], ["v9", ""]>, DwarfRegAlias; +def Q10 : AArch64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias; +def Q11 : AArch64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias; +def Q12 : AArch64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias; +def Q13 : AArch64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias; +def Q14 : AArch64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias; +def Q15 : AArch64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias; +def Q16 : 
AArch64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias; +def Q17 : AArch64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias; +def Q18 : AArch64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias; +def Q19 : AArch64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias; +def Q20 : AArch64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias; +def Q21 : AArch64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias; +def Q22 : AArch64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias; +def Q23 : AArch64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias; +def Q24 : AArch64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias; +def Q25 : AArch64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias; +def Q26 : AArch64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias; +def Q27 : AArch64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias; +def Q28 : AArch64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias; +def Q29 : AArch64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias; +def Q30 : AArch64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias; +def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias; } -// Some aliases *only* apply to SP (e.g. MOV uses different encoding for SP and -// non-SP variants). We can't use a bare register in those patterns because -// TableGen doesn't like it, so we need a class containing just stack registers -def Rxsp : RegisterClass<"AArch64", [i64], 64, - (add XSP)> { +def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> { + let Size = 8; } +def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> { + let Size = 16; +} +def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>; +def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, + v1i64], + 64, (sequence "D%u", 0, 31)>; +// We don't (yet) have an f128 legal type, so don't use that here. We +// normalize 128-bit vectors to v2f64 for arg passing and such, so use +// that here. +def FPR128 : RegisterClass<"AArch64", + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128], + 128, (sequence "Q%u", 0, 31)>; -def Rwsp : RegisterClass<"AArch64", [i32], 32, - (add WSP)> { +// The lower 16 vector registers. Some instructions can only take registers +// in this range. +def FPR128_lo : RegisterClass<"AArch64", + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (trunc FPR128, 16)>; + +// Pairs, triples, and quads of 64-bit vector registers. +def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>; +def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2)]>; +def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2), (rotl FPR64, 3)]>; +def DD : RegisterClass<"AArch64", [untyped], 64, (add DSeqPairs)> { + let Size = 128; +} +def DDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqTriples)> { + let Size = 196; +} +def DDDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqQuads)> { + let Size = 256; } -//===----------------------------------------------------------------------===// -// Scalar registers in the vector unit: -// b0-b31, h0-h31, s0-s31, d0-d31, q0-q31 -//===----------------------------------------------------------------------===// +// Pairs, triples, and quads of 128-bit vector registers. 
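The RegisterTuples defs pair each rotation of the base class with itself, so pair i is (Di, D(i+1 mod 32)) and the sequence wraps around; the Q sequences below are built the same way. A quick sketch of that rotl construction (plain C++, illustrative only):

#include <iostream>
#include <vector>

// Mimic TableGen's (rotl FPR64, N): the 32-entry register list rotated left.
std::vector<int> rotl(int Size, int N) {
  std::vector<int> Out;
  for (int I = 0; I < Size; ++I)
    Out.push_back((I + N) % Size);
  return Out;
}

int main() {
  // DSeqPairs zips (rotl FPR64, 0) with (rotl FPR64, 1):
  // D0_D1, D1_D2, ..., D31_D0.
  const std::vector<int> First = rotl(32, 0);
  const std::vector<int> Second = rotl(32, 1);
  for (int I = 0; I < 32; ++I)
    std::cout << "D" << First[I] << "_D" << Second[I] << "\n";
  return 0;
}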
+def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>; +def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2)]>; +def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2), (rotl FPR128, 3)]>; +def QQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqPairs)> { + let Size = 256; +} +def QQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqTriples)> { + let Size = 384; +} +def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> { + let Size = 512; +} -foreach Index = 0-31 in { - def B # Index : AArch64Reg< Index, "b" # Index>, - DwarfRegNum<[!add(Index, 64)]>; - def H # Index : AArch64RegWithSubs("B" # Index)], [sub_8]>, - DwarfRegNum<[!add(Index, 64)]>; +// Vector operand versions of the FP registers. Alternate name printing and +// assembler matching. +def VectorReg64AsmOperand : AsmOperandClass { + let Name = "VectorReg64"; + let PredicateMethod = "isVectorReg"; +} +def VectorReg128AsmOperand : AsmOperandClass { + let Name = "VectorReg128"; + let PredicateMethod = "isVectorReg"; +} - def S # Index : AArch64RegWithSubs("H" # Index)], [sub_16]>, - DwarfRegNum<[!add(Index, 64)]>; +def V64 : RegisterOperand { + let ParserMatchClass = VectorReg64AsmOperand; +} - def D # Index : AArch64RegWithSubs("S" # Index)], [sub_32]>, - DwarfRegNum<[!add(Index, 64)]>; +def V128 : RegisterOperand { + let ParserMatchClass = VectorReg128AsmOperand; +} - def Q # Index : AArch64RegWithSubs("D" # Index)], [sub_64]>, - DwarfRegNum<[!add(Index, 64)]>; +def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; } +def V128_lo : RegisterOperand { + let ParserMatchClass = VectorRegLoAsmOperand; } +class TypedVecListAsmOperand + : AsmOperandClass { + let Name = "TypedVectorList" # count # "_" # lanes # kind; -def FPR8 : RegisterClass<"AArch64", [v1i8], 8, - (sequence "B%u", 0, 31)> { + let PredicateMethod + = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>"; + let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">"; } -def FPR16 : RegisterClass<"AArch64", [f16, v1i16], 16, - (sequence "H%u", 0, 31)> { -} +class TypedVecListRegOperand + : RegisterOperand">; -def FPR32 : RegisterClass<"AArch64", [f32, v1i32], 32, - (sequence "S%u", 0, 31)> { -} +multiclass VectorList { + // With implicit types (probably on instruction instead). E.g. { v0, v1 } + def _64AsmOperand : AsmOperandClass { + let Name = NAME # "64"; + let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; + let RenderMethod = "addVectorList64Operands<" # count # ">"; + } -def FPR64 : RegisterClass<"AArch64", - [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64], - 64, (sequence "D%u", 0, 31)>; + def "64" : RegisterOperand { + let ParserMatchClass = !cast(NAME # "_64AsmOperand"); + } -def FPR128 : RegisterClass<"AArch64", - [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], - 128, (sequence "Q%u", 0, 31)>; + def _128AsmOperand : AsmOperandClass { + let Name = NAME # "128"; + let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; + let RenderMethod = "addVectorList128Operands<" # count # ">"; + } + + def "128" : RegisterOperand { + let ParserMatchClass = !cast(NAME # "_128AsmOperand"); + } -def FPR64Lo : RegisterClass<"AArch64", - [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64], - 64, (sequence "D%u", 0, 15)>; + // 64-bit register lists with explicit type.
-def FPR128Lo : RegisterClass<"AArch64", - [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], - 128, (sequence "Q%u", 0, 15)>; + // { v0.8b, v1.8b } + def _8bAsmOperand : TypedVecListAsmOperand; + def "8b" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_8bAsmOperand"); + } -//===----------------------------------------------------------------------===// -// Vector registers: -//===----------------------------------------------------------------------===// + // { v0.4h, v1.4h } + def _4hAsmOperand : TypedVecListAsmOperand; + def "4h" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_4hAsmOperand"); + } -def VPR64AsmOperand : AsmOperandClass { - let Name = "VPR"; - let PredicateMethod = "isReg"; - let RenderMethod = "addRegOperands"; -} + // { v0.2s, v1.2s } + def _2sAsmOperand : TypedVecListAsmOperand; + def "2s" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_2sAsmOperand"); + } + + // { v0.1d, v1.1d } + def _1dAsmOperand : TypedVecListAsmOperand; + def "1d" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_1dAsmOperand"); + } -def VPR64 : RegisterOperand; + // 128-bit register lists with explicit type -def VPR128 : RegisterOperand; + // { v0.16b, v1.16b } + def _16bAsmOperand : TypedVecListAsmOperand; + def "16b" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_16bAsmOperand"); + } -def VPR64Lo : RegisterOperand; + // { v0.8h, v1.8h } + def _8hAsmOperand : TypedVecListAsmOperand; + def "8h" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_8hAsmOperand"); + } -def VPR128Lo : RegisterOperand; + // { v0.4s, v1.4s } + def _4sAsmOperand : TypedVecListAsmOperand; + def "4s" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_4sAsmOperand"); + } -// Flags register -def NZCV : Register<"nzcv"> { - let Namespace = "AArch64"; -} + // { v0.2d, v1.2d } + def _2dAsmOperand : TypedVecListAsmOperand; + def "2d" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_2dAsmOperand"); + } -def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> { - let CopyCost = -1; - let isAllocatable = 0; -} + // { v0.b, v1.b } + def _bAsmOperand : TypedVecListAsmOperand; + def "b" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_bAsmOperand"); + } -//===----------------------------------------------------------------------===// -// Consecutive vector registers -//===----------------------------------------------------------------------===// -// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D31_D0 -def Tuples2D : RegisterTuples<[dsub_0, dsub_1], - [(rotl FPR64, 0), (rotl FPR64, 1)]>; - -// 3 Consecutive 64-bit registers: D0_D1_D2, ..., D31_D0_D1 -def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2], - [(rotl FPR64, 0), (rotl FPR64, 1), - (rotl FPR64, 2)]>; - -// 4 Consecutive 64-bit registers: D0_D1_D2_D3, ..., D31_D0_D1_D2 -def Tuples4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3], - [(rotl FPR64, 0), (rotl FPR64, 1), - (rotl FPR64, 2), (rotl FPR64, 3)]>; - -// 2 Consecutive 128-bit registers: Q0_Q1, Q1_Q2, ..., Q30_Q31 -def Tuples2Q : RegisterTuples<[qsub_0, qsub_1], - [(rotl FPR128, 0), (rotl FPR128, 1)]>; - -// 3 Consecutive 128-bit registers: Q0_Q1_Q2, ..., Q31_Q0_Q1 -def Tuples3Q : RegisterTuples<[qsub_0, qsub_1, qsub_2], - [(rotl FPR128, 0), (rotl FPR128, 1), - (rotl FPR128, 2)]>; - -// 4 Consecutive 128-bit registers: Q0_Q1_Q2_Q3, ..., Q31_Q0_Q1_Q2 -def Tuples4Q : RegisterTuples<[qsub_0, qsub_1, qsub_2, qsub_3], - [(rotl FPR128, 0), (rotl 
FPR128, 1), - (rotl FPR128, 2), (rotl FPR128, 3)]>; - -// The followings are super register classes to model 2/3/4 consecutive -// 64-bit/128-bit registers. - -def DPair : RegisterClass<"AArch64", [v2i64], 64, (add Tuples2D)>; - -def DTriple : RegisterClass<"AArch64", [untyped], 64, (add Tuples3D)> { - let Size = 192; // 3 x 64 bits, we have no predefined type of that size. -} - -def DQuad : RegisterClass<"AArch64", [v4i64], 64, (add Tuples4D)>; - -def QPair : RegisterClass<"AArch64", [v4i64], 128, (add Tuples2Q)>; - -def QTriple : RegisterClass<"AArch64", [untyped], 128, (add Tuples3Q)> { - let Size = 384; // 3 x 128 bits, we have no predefined type of that size. -} - -def QQuad : RegisterClass<"AArch64", [v8i64], 128, (add Tuples4Q)>; - - -// The followings are vector list operands -multiclass VectorList_operands { - def _asmoperand : AsmOperandClass { - let Name = PREFIX # LAYOUT # Count; - let RenderMethod = "addVectorListOperands"; - let PredicateMethod = - "isVectorList"; - let ParserMethod = "ParseVectorList"; + // { v0.h, v1.h } + def _hAsmOperand : TypedVecListAsmOperand; + def "h" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_hAsmOperand"); } - def _operand : RegisterOperand"> { - let ParserMatchClass = - !cast(PREFIX # LAYOUT # "_asmoperand"); + // { v0.s, v1.s } + def _sAsmOperand : TypedVecListAsmOperand; + def "s" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_sAsmOperand"); } -} -multiclass VectorList_BHSD { - defm 8B : VectorList_operands; - defm 4H : VectorList_operands; - defm 2S : VectorList_operands; - defm 1D : VectorList_operands; - defm 16B : VectorList_operands; - defm 8H : VectorList_operands; - defm 4S : VectorList_operands; - defm 2D : VectorList_operands; + // { v0.d, v1.d } + def _dAsmOperand : TypedVecListAsmOperand; + def "d" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_dAsmOperand"); + } + + } -// Vector list operand with 1/2/3/4 registers: VOne8B_operand,..., VQuad2D_operand -defm VOne : VectorList_BHSD<"VOne", 1, FPR64, FPR128>; -defm VPair : VectorList_BHSD<"VPair", 2, DPair, QPair>; -defm VTriple : VectorList_BHSD<"VTriple", 3, DTriple, QTriple>; -defm VQuad : VectorList_BHSD<"VQuad", 4, DQuad, QQuad>; +defm VecListOne : VectorList<1, FPR64, FPR128>; +defm VecListTwo : VectorList<2, DD, QQ>; +defm VecListThree : VectorList<3, DDD, QQQ>; +defm VecListFour : VectorList<4, DDDD, QQQQ>; + + +// Register operand versions of the scalar FP registers. +def FPR16Op : RegisterOperand; +def FPR32Op : RegisterOperand; +def FPR64Op : RegisterOperand; +def FPR128Op : RegisterOperand; diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td new file mode 100644 index 0000000..0c3949e --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedA53.td @@ -0,0 +1,291 @@ +//==- AArch64SchedA53.td - Cortex-A53 Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the ARM Cortex A53 processors. +// +//===----------------------------------------------------------------------===// + +// ===---------------------------------------------------------------------===// +// The following definitions describe the simpler per-operand machine model. +// This works with MachineScheduler. 
See MCSchedModel.h for details. + +// Cortex-A53 machine model for scheduling and other instruction cost heuristics. +def CortexA53Model : SchedMachineModel { + let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order. + let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. + let MinLatency = 1; // OperandCycles are interpreted as MinLatency. + let LoadLatency = 3; // Optimistic load latency assuming bypass. + // This is overridden by OperandCycles if the + // Itineraries are queried instead. + let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation + // Specification - Instruction Timings" + // v 1.0 Spreadsheet +} + + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available. + +// Model each pipeline as a ProcResource with BufferSize = 0, since +// Cortex-A53 is in-order. + +def A53UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU +def A53UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC +def A53UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division +def A53UnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store +def A53UnitB : ProcResource<1> { let BufferSize = 0; } // Branch +def A53UnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU +def A53UnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt + + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedWrite types which both map the ProcResources and +// set the latency. + +let SchedModel = CortexA53Model in { + +// ALU - Despite having a full latency of 4, most of the ALU instructions can +// forward a cycle earlier and then two cycles earlier in the case of a +// shift-only instruction. These latencies will be incorrect when the +// result cannot be forwarded, but modeling isn't rocket surgery. +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 3; } + +// MAC +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Div +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Load +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd +// below, we choose the median of 3, which makes the latency 6. +// May model this more carefully in the future. The remaining +// A53WriteVLD# types represent the 1-5 cycle issues explicitly.
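The default vector-load entry defined next picks that median issue count (three cycles of the single load/store pipe) for a latency of 6, while A53WriteVLD1-5 pin down the exact 1-5 cycle cases; note how each extra issue cycle adds one cycle of result latency. In plain C++ (illustrative only):

#include <iostream>

int main() {
  // Mirror the A53WriteVLD1..A53WriteVLD5 pattern: each extra issue cycle
  // on the single load/store pipe adds one cycle of result latency.
  for (int IssueCycles = 1; IssueCycles <= 5; ++IssueCycles) {
    const int Latency = 3 + IssueCycles;
    std::cout << "A53WriteVLD" << IssueCycles << ": occupies LdSt for "
              << IssueCycles << " cycle(s), result latency " << Latency
              << "\n";
  }
  return 0;
}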
+def : WriteRes { let Latency = 6; + let ResourceCycles = [3]; } +def A53WriteVLD1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; } +def A53WriteVLD2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5; + let ResourceCycles = [2]; } +def A53WriteVLD3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6; + let ResourceCycles = [3]; } +def A53WriteVLD4 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 7; + let ResourceCycles = [4]; } +def A53WriteVLD5 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 8; + let ResourceCycles = [5]; } + +// Pre/Post Indexing - Performed as part of address generation which is already +// accounted for in the WriteST* latencies below. +def : WriteRes { let Latency = 0; } + +// Store +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Vector Store - Similar to vector loads, can take 1-3 cycles to issue. +def : WriteRes { let Latency = 5; + let ResourceCycles = [2];} +def A53WriteVST1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; } +def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5; + let ResourceCycles = [2]; } +def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6; + let ResourceCycles = [3]; } + +// Branch +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; + +// FP ALU +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } + +// FP Mul, Div, Sqrt +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 33; + let ResourceCycles = [29]; } +def A53WriteFMAC : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 10; } +def A53WriteFDivSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 18; + let ResourceCycles = [14]; } +def A53WriteFDivDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33; + let ResourceCycles = [29]; } +def A53WriteFSqrtSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 17; + let ResourceCycles = [13]; } +def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32; + let ResourceCycles = [28]; } + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedRead types. + +// No forwarding for these reads. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable +// operands are needed one cycle later if and only if they are to be +// shifted. Otherwise, they too are needed two cycles later. This same +// ReadAdvance applies to Extended registers as well, even though there is +// a separate SchedPredicate for them. +def : ReadAdvance; +def A53ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; +def A53ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; +def A53ReadISReg : SchedReadVariant<[ + SchedVar, + SchedVar]>; +def : SchedAlias; + +def A53ReadIEReg : SchedReadVariant<[ + SchedVar, + SchedVar]>; +def : SchedAlias; + +// MAC - Operands are generally needed one cycle later in the MAC pipe. +// Accumulator operands are needed two cycles later.
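A ReadAdvance shortens a dependent operand's wait: a read with advance K sees the producer's result K cycles early, so the 4-cycle A53 MAC chains accumulator-to-accumulator in 2 cycles. The arithmetic, as a sketch (plain C++, illustrative only):

#include <algorithm>
#include <iostream>

// Effective latency seen through a ReadAdvance: the consumer needs the
// operand Advance cycles after issue, so the producer's latency is
// shortened by that amount (never below zero).
int effectiveLatency(int ProducerLatency, int Advance) {
  return std::max(0, ProducerLatency - Advance);
}

int main() {
  // A53 integer MAC: latency 4; the accumulator is read 2 cycles late,
  // so back-to-back multiply-accumulates chain every 2 cycles.
  std::cout << effectiveLatency(4, 2) << "\n"; // 2
  // Ordinary MAC operands are read 1 cycle late.
  std::cout << effectiveLatency(4, 1) << "\n"; // 3
  return 0;
}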
+def : ReadAdvance; +def : ReadAdvance; + +// Div +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Subtarget-specific InstRWs. + +//--- +// Miscellaneous +//--- +def : InstRW<[WriteI], (instrs COPY)>; + +//--- +// Vector Loads +//--- +def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>; + +def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; + +//--- +// Vector Stores +//--- +def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def 
: InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>; + +def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; + +//--- +// Floating Point MAC, DIV, SQRT +//--- +def : InstRW<[A53WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>; +def : InstRW<[A53WriteFMAC], (instregex "^FML(A|S).*")>; +def : InstRW<[A53WriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[A53WriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[A53WriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[A53WriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[A53WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[A53WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; + +} diff --git a/lib/Target/AArch64/AArch64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td new file mode 100644 index 0000000..a2a1802 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedCyclone.td @@ -0,0 +1,865 @@ +//=- AArch64SchedCyclone.td - AArch64 Cyclone Scheduling Defs -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for AArch64 Cyclone to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def CycloneModel : SchedMachineModel { + let IssueWidth = 6; // 6 micro-ops are dispatched per cycle. + let MicroOpBufferSize = 192; // Based on the reorder buffer. + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 16; // 14-19 cycles are typical.
+} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Cyclone. + +// 4 integer pipes +def CyUnitI : ProcResource<4> { + let BufferSize = 48; +} + +// 2 branch units: I[0..1] +def CyUnitB : ProcResource<2> { + let Super = CyUnitI; + let BufferSize = 24; +} + +// 1 indirect-branch unit: I[0] +def CyUnitBR : ProcResource<1> { + let Super = CyUnitB; +} + +// 2 shifter pipes: I[2..3] +// When an instruction consumes a CyUnitIS, it also consumes a CyUnitI +def CyUnitIS : ProcResource<2> { + let Super = CyUnitI; + let BufferSize = 24; +} + +// 1 mul pipe: I[0] +def CyUnitIM : ProcResource<1> { + let Super = CyUnitBR; + let BufferSize = 32; +} + +// 1 div pipe: I[1] +def CyUnitID : ProcResource<1> { + let Super = CyUnitB; + let BufferSize = 16; +} + +// 1 integer division unit. This is driven by the ID pipe, but only +// consumes the pipe for one cycle at issue and another cycle at writeback. +def CyUnitIntDiv : ProcResource<1>; + +// 2 ld/st pipes. +def CyUnitLS : ProcResource<2> { + let BufferSize = 28; +} + +// 3 fp/vector pipes. +def CyUnitV : ProcResource<3> { + let BufferSize = 48; +} +// 2 fp/vector arithmetic and multiply pipes: V[0-1] +def CyUnitVM : ProcResource<2> { + let Super = CyUnitV; + let BufferSize = 32; +} +// 1 fp/vector division/sqrt pipe: V[2] +def CyUnitVD : ProcResource<1> { + let Super = CyUnitV; + let BufferSize = 16; +} +// 1 fp compare pipe: V[0] +def CyUnitVC : ProcResource<1> { + let Super = CyUnitVM; + let BufferSize = 16; +} + +// 2 fp division/square-root units. These are driven by the VD pipe, +// but only consume the pipe for one cycle at issue and a cycle at writeback. +def CyUnitFloatDiv : ProcResource<2>; + +//===----------------------------------------------------------------------===// +// Define scheduler read/write resources and latency on Cyclone. +// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1. + +let SchedModel = CycloneModel in { + +//--- +// 7.8.1. Moves +//--- + +// A single nop micro-op (uX). +def WriteX : SchedWriteRes<[]> { let Latency = 0; } + +// Move zero is a register rename (to machine register zero). +// The move is replaced by a single nop micro-op. +// MOVZ Rd, #0 +// AND Rd, Rzr, #imm +def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>; +def WriteImmZ : SchedWriteVariant<[ + SchedVar, + SchedVar]>; +def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>; + +// Move GPR is a register rename and single nop micro-op. +// ORR Xd, XZR, Xm +// ADD Xd, Xn, #0 +def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>; +def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>; +def WriteMov : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; +def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>; + +// Move non-zero immediate is an integer ALU op. +// MOVN,MOVZ,MOVK +def : WriteRes; + +//--- +// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional, +// Shifts and Bitfield Operations +//--- + +// ADR,ADRP +// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri +// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr +// ADC(S),SBC(S) +// Aliases: CMN, CMP, TST +// +// Conditional operations. +// CCMNi,CCMPi,CCMNr,CCMPr, +// CSEL,CSINC,CSINV,CSNEG +// +// Bit counting and reversal operations. +// CLS,CLZ,RBIT,REV,REV16,REV32 +def : WriteRes; + +// ADD with shifted register operand is a single micro-op that +// consumes a shift pipeline for two cycles. 
+// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs +// EXAMPLE: ADDrs Xn, Xm LSL #imm +def : WriteRes { + let Latency = 2; + let ResourceCycles = [2]; +} + +// ADD with extended register operand is the same as shifted reg operand. +// ADD(S)re,SUB(S)re +// EXAMPLE: ADDXre Xn, Xm, UXTB #1 +def : WriteRes { + let Latency = 2; + let ResourceCycles = [2]; +} + +// Variable shift and bitfield operations. +// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM +def : WriteRes; + +// EXTR Shifts a pair of registers and requires two micro-ops. +// The second micro-op is delayed, as modeled by ReadExtrHi. +// EXTR Xn, Xm, #imm +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} + +// EXTR's first register read is delayed by one cycle, effectively +// shortening its writer's latency. +// EXTR Xn, Xm, #imm +def : ReadAdvance; + +//--- +// 7.8.6. Multiplies +//--- + +// MUL/MNEG are aliases for MADD/MSUB. +// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL +def : WriteRes { + let Latency = 4; +} +// MADDX,MSUBX,SMULH,UMULH +def : WriteRes { + let Latency = 5; +} + +//--- +// 7.8.7. Divide +//--- + +// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient. +// The ID pipe is consumed for 2 cycles: issue and writeback. +// SDIVW,UDIVW +def : WriteRes { + let Latency = 10; + let ResourceCycles = [2, 10]; +} +// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient. +// The ID pipe is consumed for 2 cycles: issue and writeback. +// SDIVX,UDIVX +def : WriteRes { + let Latency = 13; + let ResourceCycles = [2, 13]; +} + +//--- +// 7.8.8,7.8.10. Load/Store, single element +//--- + +// Integer loads take 4 cycles and use one LS unit for one cycle. +def : WriteRes { + let Latency = 4; +} + +// Store-load forwarding is 4 cycles. +// +// Note: The store-exclusive sequence incorporates this +// latency. However, general heuristics should not model the +// dependence between a store and subsequent may-alias load because +// hardware speculation works. +def : WriteRes { + let Latency = 4; +} + +// Load from base address plus an optionally scaled register offset. +// Rt latency is latency WriteIS + WriteLD. +// EXAMPLE: LDR Xn, Xm [, lsl 3] +def CyWriteLDIdx : SchedWriteVariant<[ + SchedVar, // Load from scaled register. + SchedVar]>; // Load from register offset. +def : SchedAlias; // Map AArch64->Cyclone type. + +// EXAMPLE: STR Xn, Xm [, lsl 3] +def CyWriteSTIdx : SchedWriteVariant<[ + SchedVar, // Store to scaled register. + SchedVar]>; // Store to register offset. +def : SchedAlias; // Map AArch64->Cyclone type. + +// Read the (unshifted) base register Xn in the second micro-op one cycle later. +// EXAMPLE: LDR Xn, Xm [, lsl 3] +def ReadBaseRS : SchedReadAdvance<1>; +def CyReadAdrBase : SchedReadVariant<[ + SchedVar, // Read base reg after shifting offset. + SchedVar]>; // Read base reg with no shift. +def : SchedAlias; // Map AArch64->Cyclone type. + +//--- +// 7.8.9,7.8.11. Load/Store, paired +//--- + +// Address pre/post increment is a simple ALU op with one cycle latency. +def : WriteRes; + +// LDP high register write is fused with the load, but a nop micro-op remains. +def : WriteRes { + let Latency = 4; +} + +// STP is a vector op and store, except for QQ, which is just two stores. +def : SchedAlias; +def : InstRW<[WriteST, WriteST], (instrs STPQi)>; + +//--- +// 7.8.13. Branches +//--- + +// Branches take a single micro-op. +// The misprediction penalty is defined as a SchedMachineModel property. 
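Since the branch writes below are zero-latency and the misprediction penalty lives on the machine model, its cost is statistical: with predictor accuracy P, the expected overhead is (1 - P) * 16 cycles per branch. Back-of-envelope (plain C++, illustrative only):

#include <iostream>

int main() {
  const double Penalty = 16.0; // CycloneModel's MispredictPenalty
  for (const double Accuracy : {0.90, 0.95, 0.99})
    std::cout << "predictor accuracy " << Accuracy << ": average "
              << (1.0 - Accuracy) * Penalty << " penalty cycles per branch\n";
  return 0;
}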
+def : WriteRes {let Latency = 0;} +def : WriteRes {let Latency = 0;} + +//--- +// 7.8.14. Never-issued Instructions, Barrier and Hint Operations +//--- + +// NOP,SEV,SEVL,WFE,WFI,YIELD +def : WriteRes {let Latency = 0;} +// ISB +def : InstRW<[WriteI], (instrs ISB)>; +// SLREX,DMB,DSB +def : WriteRes; + +// System instructions get an invalid latency because the latency of +// other operations across them is meaningless. +def : WriteRes {let Latency = -1;} + +//===----------------------------------------------------------------------===// +// 7.9 Vector Unit Instructions + +// Simple vector operations take 2 cycles. +def : WriteRes {let Latency = 2;} + +// Define some longer latency vector op types for Cyclone. +def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;} +def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;} +def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;} +def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;} + +// Simple floating-point operations take 2 cycles. +def : WriteRes {let Latency = 2;} + +//--- +// 7.9.1 Vector Moves +//--- + +// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently +// generates expensive int-float conversion instead: +// FMOVDi Dd, #0.0 +// FMOVv2f64ns Vd.2d, #0.0 + +// FMOVSi,FMOVDi +def : WriteRes {let Latency = 2;} + +// MOVI,MVNI are WriteV +// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV + +// Move FPR is a register rename and single nop micro-op. +// ORR.16b Vd,Vn,Vn +// COPY is handled above in the WriteMov Variant. +def WriteVMov : SchedWriteVariant<[ + SchedVar, + SchedVar]>; +def : InstRW<[WriteVMov], (instrs ORRv16i8)>; + +// FMOVSr,FMOVDr are WriteF. + +// MOV V,V is a WriteV. + +// CPY D,V[x] is a WriteV + +// INS V[x],V[y] is a WriteV. + +// FMOVWSr,FMOVXDr,FMOVXDHighr +def : WriteRes { + let Latency = 5; +} + +// FMOVSWr,FMOVDXr +def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>; + +// INS V[x],R +def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>; +def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>; + +// SMOV,UMOV R,V[x] +def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>; +def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>; + +// DUP V,R +def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>; + +// DUP V,V[x] is a WriteV. + +//--- +// 7.9.2 Integer Arithmetic, Logical, and Comparisons +//--- + +// BIC,ORR V,#imm are WriteV + +def : InstRW<[CyWriteV3], (instregex "ABSv")>; + +// MVN,NEG,NOT are WriteV + +def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>; + +// ADDP is a WriteV. +def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;} +def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>; + +def : InstRW<[CyWriteV3], + (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>; + +def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>; + +// ADD,SUB are WriteV + +// Forward declare. +def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;} + +// Add/Diff and accumulate uses the vector multiply unit. 
+def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;} +def CyReadVAccum : SchedReadAdvance<1, + [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>; + +def : InstRW<[CyWriteVAccum, CyReadVAccum], + (instregex "SADALP","UADALP")>; + +def : InstRW<[CyWriteVAccum, CyReadVAccum], + (instregex "SABAv","UABAv","SABALv","UABALv")>; + +def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>; + +def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>; + +def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>; + +// WriteV includes: +// AND,BIC,CMTST,EOR,ORN,ORR +// ADDP +// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD +// SADDL,SSUBL,UADDL,USUBL +// SADDW,SSUBW,UADDW,USUBW + +def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv", + "CMLEv","CMLTv", + "CMHIv","CMHSv")>; + +def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv", + "SMAXPv","SMINPv","UMAXPv","UMINPv")>; + +def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv", + "SABDLv","UABDLv")>; + +//--- +// 7.9.3 Floating Point Arithmetic and Comparisons +//--- + +// FABS,FNEG are WriteF + +def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>; +def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>; + +def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i", + "FMINPv2i","FMINNMPv2i")>; + +def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>; + +def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32, + FSUBSrr,FSUBv2f32,FSUBv4f32, + FADDPv2f32,FADDPv4f32, + FABD32,FABDv2f32,FABDv4f32)>; +def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64, + FSUBDrr,FSUBv2f64, + FADDPv2f64, + FABD64,FABDv2f64)>; + +def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>; + +def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT", + "FMAXS","FMAXD","FMAXv", + "FMINS","FMIND","FMINv", + "FMAXNMS","FMAXNMD","FMAXNMv", + "FMINNMS","FMINNMD","FMINNMv", + "FMAXPv2f","FMAXPv4f", + "FMINPv2f","FMINPv4f", + "FMAXNMPv2f","FMAXNMPv4f", + "FMINNMPv2f","FMINNMPv4f")>; + +// FCMP,FCMPE,FCCMP,FCCMPE +def : WriteRes {let Latency = 4;} + +// FCSEL is a WriteF. + +//--- +// 7.9.4 Shifts and Bitfield Operations +//--- + +// SHL is a WriteV + +def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;} +def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>; + +def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;} +def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>; + +// Shift and accumulate uses the vector multiply unit. +def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;} +def CyReadVShiftAcc : SchedReadAdvance<1, + [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>; +def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc], + (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>; + +// SSHL,USHL are WriteV. + +def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>; + +// SQSHL,SQSHLU,UQSHL are WriteV. + +def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>; + +// WriteV includes: +// SHLL,SSHLL,USHLL +// SLI,SRI +// BIF,BIT,BSL +// EXT +// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN +// XTN2 + +def : InstRW<[CyWriteV4], + (instregex "RSHRNv","SHRNv", + "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv", + "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>; + +//--- +// 7.9.5 Multiplication +//--- + +def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;} +def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv", + "SQDMULLv","SQDMULHv","SQRDMULHv")>; + +// FMUL,FMULX,FNMUL default to WriteFMul. 
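CyWriteVMul above at latency 4, combined with the one-cycle accumulator advance CyReadVMulAcc that follows, means dependent multiply-accumulates chain every 3 cycles; saturating both VM pipes in a reduction therefore needs several independent accumulators. A back-of-envelope sketch (plain C++, illustrative only):

#include <iostream>

int main() {
  const int MulLatency = 4; // CyWriteVMul
  const int AccAdvance = 1; // CyReadVMulAcc
  const int VMPipes = 2;    // CyUnitVM is a ProcResource<2>

  // Cycles between dependent MLAs in one accumulation chain.
  const int ChainLatency = MulLatency - AccAdvance;

  // Independent accumulators needed to issue one MLA per pipe per cycle.
  std::cout << "accumulators needed: " << ChainLatency * VMPipes << "\n"; // 6
  return 0;
}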
+def : WriteRes { let Latency = 4;} + +def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;} +def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed, + FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>; + +def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>; +def : InstRW<[CyWriteVMul, CyReadVMulAcc], + (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL", + "SQDMLAL","SQDMLSL")>; + +def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;} +def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;} +def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>; +def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>; + +def : InstRW<[CyWriteSMul, CyReadSMul], + (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr, + FMLAv2f32,FMLAv4f32, + FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>; +def : InstRW<[CyWriteDMul, CyReadDMul], + (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr, + FMLAv2f64,FMLAv2i64_indexed, + FMLSv2f64,FMLSv2i64_indexed)>; + +def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; } +def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>; + +//--- +// 7.9.6 Divide and Square Root +//--- + +// FDIV,FSQRT +// TODO: Add 64-bit variant with 19 cycle latency. +// TODO: Specialize FSQRT for longer latency. +def : WriteRes { + let Latency = 17; + let ResourceCycles = [2, 17]; +} + +def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>; + +def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; } +def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>; + +def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; } +def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; } +def : InstRW<[WriteFRECPS], (instregex "FRECPSv")>; +def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>; + +//--- +// 7.9.7 Integer-FP Conversions +//--- + +// FCVT lengthen f16/s32 +def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>; + +// FCVT,FCVTN,FCVTXN +// SCVTF,UCVTF V,V +// FRINT(AIMNPXZ) V,V +def : WriteRes {let Latency = 4;} + +// SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles. +def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>; +def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>; + +// FCVT Rd, S/D = V6+LD4: 10 cycles +def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>; +def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>; + +// FCVTL is a WriteV + +//--- +// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup +//--- + +def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;} +def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr, + AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr, + SHA1SU0rrr)>; + +def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;} +def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>; + +def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;} +def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr, + SHA256Hrrr,SHA256H2rrr)>; + +// TRN,UZP,ZUP are WriteV. + +// TBL,TBX are WriteV. + +//--- +// 7.9.11-7.9.14 Load/Store, single element and paired +//--- + +// Loading into the vector unit takes 5 cycles vs 4 for integer loads. +def : WriteRes { + let Latency = 5; +} + +// Store-load forwarding is 4 cycles. +def : WriteRes { + let Latency = 4; +} + +// WriteVLDPair/VSTPair sequences are expanded by the target description. + +//--- +// 7.9.15 Load, element operations +//--- + +// Only the first WriteVLD and WriteAdr for writeback matches def operands. 
+// Subsequent WriteVLDs consume resources. Since all loaded values have the +// same latency, this is acceptable. + +// Vd is read 5 cycles after issuing the vector load. +def : ReadAdvance; + +def : InstRW<[WriteVLD], + (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr], + (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; + +// Register writes from the load's high half are fused micro-ops. +def : InstRW<[WriteVLD], + (instregex "LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVLD, WriteAdr], + (instregex "LD1Twov(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVLD, WriteVLD], + (instregex "LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD], + (instregex "LD1Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLD, WriteVLD], + (instregex "LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD], + (instregex "LD1Threev(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVLD, WriteVLD, WriteVLD], + (instregex "LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD], + (instregex "LD1Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLD, WriteVLD], + (instregex "LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD], + (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD], + (instregex "LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD], + (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD], + (instregex "LD1i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr], + (instregex "LD1i(8|16|32)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD], (instrs LD1i64)>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>; + +def : InstRW<[WriteVLDShuffle], + (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr], + (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[WriteVLDShuffle, WriteV], + (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], + (instregex "LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle], + (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle], + (instregex "LD2Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], + (instregex "LD2i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], + (instregex "LD2i(8|16|32)_POST")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], + (instregex "LD2i64$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], + (instregex "LD2i64_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteV], + (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], + (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], + (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], + (instregex "LD3Threev(8b|4h|2s)_POST")>; +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle], + (instregex "LD3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle], + (instregex "LD3Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV], + (instregex "LD3i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV], + (instregex "LD3i(8|16|32)_POST")>; + +def : 
InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
+             (instregex "LD3i64$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
+             (instregex "LD3i64_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
+             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
+             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
+             (instrs LD3Rv1d,LD3Rv2d)>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
+             (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
+             (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
+             (instregex "LD4Fourv(8b|4h|2s)_POST")>;
+def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
+              WriteVLDPairShuffle, WriteVLDPairShuffle],
+             (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
+              WriteVLDPairShuffle, WriteVLDPairShuffle],
+             (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
+             (instregex "LD4i(8|16|32)$")>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
+             (instregex "LD4i(8|16|32)_POST")>;
+
+
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
+             (instrs LD4i64)>;
+def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
+             (instrs LD4i64_POST)>;
+
+def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
+             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
+             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;
+
+def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
+             (instrs LD4Rv1d,LD4Rv2d)>;
+def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
+             (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
+
+//---
+// 7.9.16 Store, element operations
+//---
+
+// Only the WriteAdr for writeback matches a def operand.
+// Subsequent WriteVSTs only consume resources.
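
The two comment lines above are the key to reading every multi-write InstRW list in this section: writes beyond an instruction's def operands contribute no latency and only occupy their pipeline, so listing WriteVST twice models throughput, not a longer result delay. A toy illustration, where one busy cycle per write is an assumption for the example rather than a stated Cyclone number:

    #include <cstdio>
    #include <vector>

    // Extra writes in an InstRW list keep their unit busy even with no def
    // operand to receive a latency; throughput is bounded by the total.
    static int busyCycles(const std::vector<int> &perWriteCycles) {
      int total = 0;
      for (int c : perWriteCycles)
        total += c; // cycles the store pipe is occupied by this instruction
      return total;
    }

    int main() {
      // ST1Twov(16b|8h|4s|2d) is modeled as [WriteVST, WriteVST]:
      std::printf("two-register ST1: %d busy cycle(s)\n", busyCycles({1, 1}));
      return 0;
    }
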
+ +def : InstRW<[WriteVST], + (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST], + (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], + (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], + (instregex "ST1Twov(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVST, WriteVST], + (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST, WriteVST], + (instregex "ST1Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle, WriteVST], + (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST], + (instregex "ST1Threev(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVST, WriteVST, WriteVST], + (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST], + (instregex "ST1Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], + (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST], + (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instregex "ST1i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instrs ST1i64)>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>; + +def : InstRW<[WriteVSTShuffle], + (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], + (instregex "ST2Twov(8b|4h|2s)_POST")>; +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST2Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instregex "ST2i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>; +def : InstRW<[WriteVSTShuffle], (instrs ST2i64)>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>; + +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(8b|4h|2s)_POST")>; +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instregex "ST3i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>; + +def :InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64)>; +def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>; + +def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle, + WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle, + WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTPairShuffle], (instregex "ST4i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex 
"ST4i(8|16|32)_POST")>; + +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>; + +//--- +// Unused SchedRead types +//--- + +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +} // SchedModel = CycloneModel diff --git a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td index ec8450b..eaa9110 100644 --- a/lib/Target/AArch64/AArch64Schedule.td +++ b/lib/Target/AArch64/AArch64Schedule.td @@ -1,4 +1,4 @@ -//===- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===// +//==-- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -7,74 +7,98 @@ // //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// Generic processor itineraries for legacy compatibility. - -def GenericItineraries : ProcessorItineraries<[], [], []>; - - -//===----------------------------------------------------------------------===// -// Base SchedReadWrite types - -// Basic ALU -def WriteALU : SchedWrite; // Generic: may contain shift and/or ALU operation -def WriteALUs : SchedWrite; // Shift only with no ALU operation -def ReadALU : SchedRead; // Operand not needed for shifting -def ReadALUs : SchedRead; // Operand needed for shifting - -// Multiply with optional accumulate -def WriteMAC : SchedWrite; -def ReadMAC : SchedRead; - -// Compares -def WriteCMP : SchedWrite; -def ReadCMP : SchedRead; - -// Division -def WriteDiv : SchedWrite; -def ReadDiv : SchedRead; - -// Loads -def WriteLd : SchedWrite; -def WritePreLd : SchedWrite; -def WriteVecLd : SchedWrite; -def ReadLd : SchedRead; -def ReadPreLd : SchedRead; -def ReadVecLd : SchedRead; - -// Stores -def WriteSt : SchedWrite; -def WriteVecSt : SchedWrite; -def ReadSt : SchedRead; -def ReadVecSt : SchedRead; - -// Branches -def WriteBr : SchedWrite; -def WriteBrL : SchedWrite; -def ReadBr : SchedRead; - -// Floating Point ALU -def WriteFPALU : SchedWrite; -def ReadFPALU : SchedRead; - -// Floating Point MAC, Mul, Div, Sqrt -// Most processors will simply send all of these down a dedicated pipe, but -// they're explicitly seperated here for flexibility of modeling later. May -// consider consolidating them into a single WriteFPXXXX type in the future. -def WriteFPMAC : SchedWrite; -def WriteFPMul : SchedWrite; -def WriteFPDiv : SchedWrite; -def WriteFPSqrt : SchedWrite; -def ReadFPMAC : SchedRead; -def ReadFPMul : SchedRead; -def ReadFPDiv : SchedRead; -def ReadFPSqrt : SchedRead; - -// Noop -def WriteNoop : SchedWrite; - - -//===----------------------------------------------------------------------===// -// Subtarget specific Machine Models. - -include "AArch64ScheduleA53.td" +// Define TII for use in SchedVariant Predicates. +// const MachineInstr *MI and const TargetSchedModel *SchedModel +// are defined by default. +def : PredicateProlog<[{ + const AArch64InstrInfo *TII = + static_cast(SchedModel->getInstrInfo()); + (void)TII; +}]>; + +// AArch64 Scheduler Definitions + +def WriteImm : SchedWrite; // MOVN, MOVZ +// TODO: Provide variants for MOV32/64imm Pseudos that dynamically +// select the correct sequence of WriteImms. 
+
+def WriteI : SchedWrite; // ALU
+def WriteISReg : SchedWrite; // ALU of Shifted-Reg
+def WriteIEReg : SchedWrite; // ALU of Extended-Reg
+def ReadI : SchedRead; // ALU
+def ReadISReg : SchedRead; // ALU of Shifted-Reg
+def ReadIEReg : SchedRead; // ALU of Extended-Reg
+def WriteExtr : SchedWrite; // EXTR shifts a reg pair
+def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair
+def WriteIS : SchedWrite; // Shift/Scale
+def WriteID32 : SchedWrite; // 32-bit Divide
+def WriteID64 : SchedWrite; // 64-bit Divide
+def ReadID : SchedRead; // 32/64-bit Divide
+def WriteIM32 : SchedWrite; // 32-bit Multiply
+def WriteIM64 : SchedWrite; // 64-bit Multiply
+def ReadIM : SchedRead; // 32/64-bit Multiply
+def ReadIMA : SchedRead; // 32/64-bit Multiply Accumulate
+def WriteBr : SchedWrite; // Branch
+def WriteBrReg : SchedWrite; // Indirect Branch
+
+def WriteLD : SchedWrite; // Load from base addr plus immediate offset
+def WriteST : SchedWrite; // Store to base addr plus immediate offset
+def WriteSTP : SchedWrite; // Store a register pair.
+def WriteAdr : SchedWrite; // Address pre/post increment.
+
+def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled).
+def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
+def ReadAdrBase : SchedRead; // Read the base register of a reg-offset LD/ST.
+
+// Predicate for determining when a shiftable register is shifted.
+def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>;
+
+// Predicate for determining when an extendable register is extended.
+def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>;
+
+// ScaledIdxPred is true if a WriteLDIdx operand will be
+// scaled. Subtargets can use this to dynamically select resources and
+// latency for WriteLDIdx and ReadAdrBase.
+def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>;
+
+// Serialized two-level address load.
+// EXAMPLE: LOADGot
+def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>;
+
+// Serialized two-level address lookup.
+// EXAMPLE: MOVaddr...
+def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>;
+
+// The second register of a load-pair.
+// LDP,LDPSW,LDNP,LDXP,LDAXP
+def WriteLDHi : SchedWrite;
+
+// Store-exclusive is a store followed by a dependent load.
+def WriteSTX : WriteSequence<[WriteST, WriteLD]>;
+
+def WriteSys : SchedWrite; // Long, variable latency system ops.
+def WriteBarrier : SchedWrite; // Memory barrier.
+def WriteHint : SchedWrite; // Hint instruction.
+
+def WriteF : SchedWrite; // General floating-point ops.
+def WriteFCmp : SchedWrite; // Floating-point compare.
+def WriteFCvt : SchedWrite; // Float conversion.
+def WriteFCopy : SchedWrite; // Float-int register copy.
+def WriteFImm : SchedWrite; // Floating-point immediate.
+def WriteFMul : SchedWrite; // Floating-point multiply.
+def WriteFDiv : SchedWrite; // Floating-point division.
+
+def WriteV : SchedWrite; // Vector ops.
+def WriteVLD : SchedWrite; // Vector loads.
+def WriteVST : SchedWrite; // Vector stores.
+
+// Read the unwritten lanes of the VLD's destination registers.
+def ReadVLD : SchedRead;
+
+// Sequential vector load and shuffle.
+def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>;
+def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>;
+
+// Store a shuffled vector.
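
Since the predicates defined above are the first place this file splices C++ into the machine model, a sketch of what one of them decides may help. The types below are illustrative stand-ins, not the TableGen-generated API, and the nonzero-shift test is an assumption about what hasShiftedReg checks:

    #include <cstdio>

    struct MI { unsigned shiftImm; };     // stand-in for MachineInstr
    struct TII_t {                        // stand-in for AArch64InstrInfo
      bool hasShiftedReg(const MI &mi) const { return mi.shiftImm != 0; }
    };

    // What RegShiftedPred effectively selects: charge the shifted-register
    // ALU cost only when the instruction really shifts its operand.
    static const char *aluWriteFor(const TII_t &TII, const MI &mi) {
      return TII.hasShiftedReg(mi) ? "WriteISReg" : "WriteI";
    }

    int main() {
      TII_t TII;
      MI plainAdd{0}, shiftedAdd{12};
      std::printf("ADD x0,x1,x2:         %s\n", aluWriteFor(TII, plainAdd));
      std::printf("ADD x0,x1,x2,lsl #12: %s\n", aluWriteFor(TII, shiftedAdd));
      return 0;
    }
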
+def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>; +def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>; diff --git a/lib/Target/AArch64/AArch64ScheduleA53.td b/lib/Target/AArch64/AArch64ScheduleA53.td deleted file mode 100644 index 20a14e7..0000000 --- a/lib/Target/AArch64/AArch64ScheduleA53.td +++ /dev/null @@ -1,144 +0,0 @@ -//=- AArch64ScheduleA53.td - ARM Cortex-A53 Scheduling Definitions -*- tablegen -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the itinerary class data for the ARM Cortex A53 processors. -// -//===----------------------------------------------------------------------===// - -// ===---------------------------------------------------------------------===// -// The following definitions describe the simpler per-operand machine model. -// This works with MachineScheduler. See MCSchedModel.h for details. - -// Cortex-A53 machine model for scheduling and other instruction cost heuristics. -def CortexA53Model : SchedMachineModel { - let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. - let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency. - let LoadLatency = 2; // Optimistic load latency assuming bypass. - // This is overriden by OperandCycles if the - // Itineraries are queried instead. - let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation - // Specification - Instruction Timings" - // v 1.0 Spreadsheet -} - - -//===----------------------------------------------------------------------===// -// Define each kind of processor resource and number available. - -// Modeling each pipeline as a ProcResource using the default BufferSize = -1. -// Cortex-A53 is in-order and therefore should be using BufferSize = 0. The -// current configuration performs better with the basic latencies provided so -// far. Will revisit BufferSize once the latency information is more accurate. - -let SchedModel = CortexA53Model in { - -def A53UnitALU : ProcResource<2>; // Int ALU -def A53UnitMAC : ProcResource<1>; // Int MAC -def A53UnitDiv : ProcResource<1>; // Int Division -def A53UnitLdSt : ProcResource<1>; // Load/Store -def A53UnitB : ProcResource<1>; // Branch -def A53UnitFPALU : ProcResource<1>; // FP ALU -def A53UnitFPMDS : ProcResource<1>; // FP Mult/Div/Sqrt - - -//===----------------------------------------------------------------------===// -// Subtarget-specific SchedWrite types which both map the ProcResources and -// set the latency. - -// Issue - Every instruction must consume an A53WriteIssue. Optionally, -// instructions that cannot be dual-issued will also include the -// A53WriteIssue2nd in their SchedRW list. That second WriteRes will -// ensure that a second issue slot is consumed. -def A53WriteIssue : SchedWriteRes<[]>; -def A53WriteIssue2nd : SchedWriteRes<[]> { let Latency = 0; } - -// ALU - These are reduced to 1 despite a true latency of 4 in order to easily -// model forwarding logic. Once forwarding is properly modelled, then -// they'll be corrected. -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } - -// MAC -def : WriteRes { let Latency = 4; } - -// Div -def : WriteRes { let Latency = 4; } - -// Load - Note: Vector loads take 1-5 cycles to issue. 
For the WriteVecLd below, -// choosing the median of 3 which makes the latency 6. May model this more -// carefully in the future. -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 6; } - -// Store - Note: Vector stores take 1-3 cycles to issue. For the ReadVecSt below, -// choosing the median of 2 which makes the latency 5. May model this more -// carefully in the future. -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 5; } - -// Branch -def : WriteRes; -def : WriteRes; - -// FP ALU -def : WriteRes {let Latency = 6; } - -// FP MAC, Mul, Div, Sqrt -// Using Double Precision numbers for now as a worst case. Additionally, not -// modeling the exact hazard but instead treating the whole pipe as a hazard. -// As an example VMUL, VMLA, and others are actually pipelined. VDIV and VSQRT -// have a total latency of 33 and 32 respectively but only a hazard of 29 and -// 28 (double-prescion example). -def : WriteRes { let Latency = 10; } -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 33; - let ResourceCycles = [29]; } -def : WriteRes { let Latency = 32; - let ResourceCycles = [28]; } - - -//===----------------------------------------------------------------------===// -// Subtarget-specific SchedRead types. - -// No forwarding defined for ReadALU yet. -def : ReadAdvance; - -// No forwarding defined for ReadCMP yet. -def : ReadAdvance; - -// No forwarding defined for ReadBr yet. -def : ReadAdvance; - -// No forwarding defined for ReadMAC yet. -def : ReadAdvance; - -// No forwarding defined for ReadDiv yet. -def : ReadAdvance; - -// No forwarding defined for ReadLd, ReadPreLd, ReadVecLd yet. -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; - -// No forwarding defined for ReadSt and ReadVecSt yet. -def : ReadAdvance; -def : ReadAdvance; - -// No forwarding defined for ReadFPALU yet. -def : ReadAdvance; - -// No forwarding defined for ReadFPMAC/Mul/Div/Sqrt yet. -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; - -} diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 6bbe075..5c65b75 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -11,15 +11,49 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "arm-selectiondag-info" #include "AArch64TargetMachine.h" -#include "llvm/CodeGen/SelectionDAG.h" using namespace llvm; -AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const AArch64TargetMachine &TM) - : TargetSelectionDAGInfo(TM), - Subtarget(&TM.getSubtarget()) { -} +#define DEBUG_TYPE "aarch64-selectiondag-info" + +AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const TargetMachine &TM) + : TargetSelectionDAGInfo(TM), + Subtarget(&TM.getSubtarget()) {} + +AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {} + +SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { + // Check to see if there is a specialized entry-point for memory zeroing. + ConstantSDNode *V = dyn_cast(Src); + ConstantSDNode *SizeValue = dyn_cast(Size); + const char *bzeroEntry = + (V && V->isNullValue()) ? Subtarget->getBZeroEntry() : nullptr; + // For small size (< 256), it is not beneficial to use bzero + // instead of memset. 
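
The guard that follows packs the whole bzero policy into one expression, so here it is restated as a standalone predicate. This is a sketch: in the real code the first operand comes from getBZeroEntry(), which only yields "bzero" on Darwin:

    #include <cstdint>
    #include <cstdio>

    // Prefer bzero only when the target provides it, the stored value is
    // zero, and the size is unknown at compile time or above 256 bytes.
    static bool shouldUseBZero(bool haveBZero, bool storesZero,
                               bool sizeKnown, uint64_t size) {
      return haveBZero && storesZero && (!sizeKnown || size > 256);
    }

    int main() {
      std::printf("memset(p, 0, 64): bzero? %d\n",
                  shouldUseBZero(true, true, true, 64)); // small: keep memset
      std::printf("memset(p, 0, n):  bzero? %d\n",
                  shouldUseBZero(true, true, false, 0)); // unknown: bzero
      return 0;
    }
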
+ if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { + const AArch64TargetLowering &TLI = + *static_cast( + DAG.getTarget().getTargetLowering()); -AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() { + EVT IntPtr = TLI.getPointerTy(); + Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Node = Dst; + Entry.Ty = IntPtrTy; + Args.push_back(Entry); + Entry.Node = Size; + Args.push_back(Entry); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0) + .setDiscardResult(); + std::pair CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; + } + return SDValue(); } diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h index d412ed2..8381f99 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -11,22 +11,27 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64SELECTIONDAGINFO_H -#define LLVM_AARCH64SELECTIONDAGINFO_H +#ifndef AArch64SELECTIONDAGINFO_H +#define AArch64SELECTIONDAGINFO_H #include "llvm/Target/TargetSelectionDAGInfo.h" namespace llvm { -class AArch64TargetMachine; - class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo { + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. const AArch64Subtarget *Subtarget; + public: - explicit AArch64SelectionDAGInfo(const AArch64TargetMachine &TM); + explicit AArch64SelectionDAGInfo(const TargetMachine &TM); ~AArch64SelectionDAGInfo(); -}; + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, + SDValue Dst, SDValue Src, SDValue Size, + unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const override; +}; } #endif diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp new file mode 100644 index 0000000..45f8ddb --- /dev/null +++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -0,0 +1,168 @@ +//===--- AArch64StorePairSuppress.cpp --- Suppress store pair formation ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass identifies floating point stores that should not be combined into +// store pairs. Later we may do the same for floating point loads. 
+// ===---------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-stp-suppress" + +namespace { +class AArch64StorePairSuppress : public MachineFunctionPass { + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + MachineFunction *MF; + TargetSchedModel SchedModel; + MachineTraceMetrics *Traces; + MachineTraceMetrics::Ensemble *MinInstr; + +public: + static char ID; + AArch64StorePairSuppress() : MachineFunctionPass(ID) {} + + virtual const char *getPassName() const override { + return "AArch64 Store Pair Suppression"; + } + + bool runOnMachineFunction(MachineFunction &F) override; + +private: + bool shouldAddSTPToBlock(const MachineBasicBlock *BB); + + bool isNarrowFPStore(const MachineInstr &MI); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +char AArch64StorePairSuppress::ID = 0; +} // anonymous + +FunctionPass *llvm::createAArch64StorePairSuppressPass() { + return new AArch64StorePairSuppress(); +} + +/// Return true if an STP can be added to this block without increasing the +/// critical resource height. STP is good to form in Ld/St limited blocks and +/// bad to form in float-point limited blocks. This is true independent of the +/// critical path. If the critical path is longer than the resource height, the +/// extra vector ops can limit physreg renaming. Otherwise, it could simply +/// oversaturate the vector units. +bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) { + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + + MachineTraceMetrics::Trace BBTrace = MinInstr->getTrace(BB); + unsigned ResLength = BBTrace.getResourceLength(); + + // Get the machine model's scheduling class for STPQi. + // Bypass TargetSchedule's SchedClass resolution since we only have an opcode. + unsigned SCIdx = TII->get(AArch64::STPDi).getSchedClass(); + const MCSchedClassDesc *SCDesc = + SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); + + // If a subtarget does not define resources for STPQi, bail here. + if (SCDesc->isValid() && !SCDesc->isVariant()) { + unsigned ResLenWithSTP = BBTrace.getResourceLength( + ArrayRef(), SCDesc); + if (ResLenWithSTP > ResLength) { + DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber() + << " resources " << ResLength << " -> " << ResLenWithSTP + << "\n"); + return false; + } + } + return true; +} + +/// Return true if this is a floating-point store smaller than the V reg. On +/// cyclone, these require a vector shuffle before storing a pair. +/// Ideally we would call getMatchingPairOpcode() and have the machine model +/// tell us if it's profitable with no cpu knowledge here. +/// +/// FIXME: We plan to develop a decent Target abstraction for simple loads and +/// stores. Until then use a nasty switch similar to AArch64LoadStoreOptimizer. 
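
Stepping back to shouldAddSTPToBlock above: the whole heuristic reduces to a single comparison of trace resource heights, restated here as a sketch:

    #include <cstdio>

    // Form STP only if adding one STP's resources to the block's trace does
    // not lengthen the critical resource height; if it does, the block is
    // FP/vector-limited and pairing is suppressed instead.
    static bool stpHelpsBlock(unsigned resLength, unsigned resLengthWithSTP) {
      return resLengthWithSTP <= resLength;
    }

    int main() {
      std::printf("ld/st-limited block, form STP? %d\n", stpHelpsBlock(10, 10));
      std::printf("vector-limited block, form STP? %d\n", stpHelpsBlock(10, 12));
      return 0;
    }
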
+bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STURSi: + case AArch64::STURDi: + return true; + } +} + +bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + TII = static_cast(MF->getTarget().getInstrInfo()); + TRI = MF->getTarget().getRegisterInfo(); + MRI = &MF->getRegInfo(); + const TargetSubtargetInfo &ST = + MF->getTarget().getSubtarget(); + SchedModel.init(*ST.getSchedModel(), &ST, TII); + + Traces = &getAnalysis(); + MinInstr = nullptr; + + DEBUG(dbgs() << "*** " << getPassName() << ": " << MF->getName() << '\n'); + + if (!SchedModel.hasInstrSchedModel()) { + DEBUG(dbgs() << " Skipping pass: no machine model present.\n"); + return false; + } + + // Check for a sequence of stores to the same base address. We don't need to + // precisely determine whether a store pair can be formed. But we do want to + // filter out most situations where we can't form store pairs to avoid + // computing trace metrics in those cases. + for (auto &MBB : *MF) { + bool SuppressSTP = false; + unsigned PrevBaseReg = 0; + for (auto &MI : MBB) { + if (!isNarrowFPStore(MI)) + continue; + unsigned BaseReg; + unsigned Offset; + if (TII->getLdStBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) { + if (PrevBaseReg == BaseReg) { + // If this block can take STPs, skip ahead to the next block. + if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) + break; + // Otherwise, continue unpairing the stores in this block. + DEBUG(dbgs() << "Unpairing store " << MI << "\n"); + SuppressSTP = true; + TII->suppressLdStPair(&MI); + } + PrevBaseReg = BaseReg; + } else + PrevBaseReg = 0; + } + } + // This pass just sets some internal MachineMemOperand flags. It can't really + // invalidate anything. + return false; +} diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index 9140bbd..cd69994 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -1,4 +1,4 @@ -//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information --------------===// +//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,57 +7,110 @@ // //===----------------------------------------------------------------------===// // -// This file implements the AArch64 specific subclass of TargetSubtargetInfo. +// This file implements the AArch64 specific subclass of TargetSubtarget. // //===----------------------------------------------------------------------===// +#include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" -#include "AArch64RegisterInfo.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/GlobalValue.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-subtarget" -#define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR +#define GET_SUBTARGETINFO_TARGET_DESC #include "AArch64GenSubtargetInfo.inc" -using namespace llvm; - -// Pin the vtable to this file. 
-void AArch64Subtarget::anchor() {} +static cl::opt +EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if " + "converter pass"), cl::init(true), cl::Hidden); -AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS, - bool LittleEndian) +AArch64Subtarget::AArch64Subtarget(const std::string &TT, + const std::string &CPU, + const std::string &FS, bool LittleEndian) : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - HasFPARMv8(false), HasNEON(false), HasCrypto(false), TargetTriple(TT), - CPUString(CPU), IsLittleEndian(LittleEndian) { + HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), + HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU), + TargetTriple(TT), IsLittleEndian(LittleEndian) { + // Determine default and user-specified characteristics + + if (CPUString.empty()) + CPUString = "generic"; - initializeSubtargetFeatures(CPU, FS); + ParseSubtargetFeatures(CPUString, FS); } -void AArch64Subtarget::initializeSubtargetFeatures(StringRef CPU, - StringRef FS) { - if (CPU.empty()) - CPUString = "generic"; +/// ClassifyGlobalReference - Find the target operand flags that describe +/// how a global value should be referenced for the current subtarget. +unsigned char +AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM) const { + + // Determine whether this is a reference to a definition or a declaration. + // Materializable GVs (in JIT lazy compilation mode) do not require an extra + // load from stub. + bool isDecl = GV->hasAvailableExternallyLinkage(); + if (GV->isDeclaration() && !GV->isMaterializable()) + isDecl = true; + + // MachO large model always goes via a GOT, simply to get a single 8-byte + // absolute relocation on all global addresses. + if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) + return AArch64II::MO_GOT; + + // The small code mode's direct accesses use ADRP, which cannot necessarily + // produce the value 0 (if the code is above 4GB). Therefore they must use the + // GOT. + if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl) + return AArch64II::MO_GOT; + + // If symbol visibility is hidden, the extra load is not needed if + // the symbol is definitely defined in the current translation unit. - std::string FullFS = FS; - if (CPUString == "generic") { - // Enable FP by default. - if (FullFS.empty()) - FullFS = "+fp-armv8"; + // The handling of non-hidden symbols in PIC mode is rather target-dependent: + // + On MachO, if the symbol is defined in this module the GOT can be + // skipped. + // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually + // defined could end up in unexpected places. Use a GOT. + if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) { + if (isTargetMachO()) + return (isDecl || GV->isWeakForLinker()) ? AArch64II::MO_GOT + : AArch64II::MO_NO_FLAG; else - FullFS = "+fp-armv8," + FullFS; + // No need to go through the GOT for local symbols on ELF. + return GV->hasLocalLinkage() ? 
AArch64II::MO_NO_FLAG : AArch64II::MO_GOT; } - ParseSubtargetFeatures(CPU, FullFS); + return AArch64II::MO_NO_FLAG; } -bool AArch64Subtarget::GVIsIndirectSymbol(const GlobalValue *GV, - Reloc::Model RelocM) const { - if (RelocM == Reloc::Static) - return false; +/// This function returns the name of a function which has an interface +/// like the non-standard bzero function, if such a function exists on +/// the current subtarget and it is considered prefereable over +/// memset with zero passed as the second argument. Otherwise it +/// returns null. +const char *AArch64Subtarget::getBZeroEntry() const { + // Prefer bzero on Darwin only. + if(isTargetDarwin()) + return "bzero"; + + return nullptr; +} + +void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, MachineInstr *end, + unsigned NumRegionInstrs) const { + // LNT run (at least on Cyclone) showed reasonably significant gains for + // bi-directional scheduling. 253.perlbmk. + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; +} - return !GV->hasLocalLinkage() && !GV->hasHiddenVisibility(); +bool AArch64Subtarget::enableEarlyIfConversion() const { + return EnableEarlyIfConvert; } diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 68c6c4b..590ea05 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -1,4 +1,4 @@ -//==-- AArch64Subtarget.h - Define Subtarget for the AArch64 ---*- C++ -*--===// +//===--- AArch64Subtarget.h - Define Subtarget for the AArch64 -*- C++ -*--===// // // The LLVM Compiler Infrastructure // @@ -7,29 +7,27 @@ // //===----------------------------------------------------------------------===// // -// This file declares the AArch64 specific subclass of TargetSubtargetInfo. +// This file declares the AArch64 specific subclass of TargetSubtarget. // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AARCH64_SUBTARGET_H -#define LLVM_TARGET_AARCH64_SUBTARGET_H +#ifndef AArch64SUBTARGET_H +#define AArch64SUBTARGET_H -#include "llvm/ADT/Triple.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include "AArch64RegisterInfo.h" +#include #define GET_SUBTARGETINFO_HEADER #include "AArch64GenSubtargetInfo.inc" -#include - namespace llvm { -class StringRef; class GlobalValue; +class StringRef; class AArch64Subtarget : public AArch64GenSubtargetInfo { - virtual void anchor(); protected: - enum ARMProcFamilyEnum {Others, CortexA53, CortexA57}; + enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone}; /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. ARMProcFamilyEnum ARMProcFamily; @@ -37,47 +35,76 @@ protected: bool HasFPARMv8; bool HasNEON; bool HasCrypto; + bool HasCRC; - /// TargetTriple - What processor and OS we're targeting. - Triple TargetTriple; + // HasZeroCycleRegMove - Has zero-cycle register mov instructions. + bool HasZeroCycleRegMove; + + // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. + bool HasZeroCycleZeroing; /// CPUString - String name of used CPU. std::string CPUString; - /// IsLittleEndian - The target is Little Endian - bool IsLittleEndian; + /// TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; -private: - void initializeSubtargetFeatures(StringRef CPU, StringRef FS); + /// IsLittleEndian - Is the target little endian? + bool IsLittleEndian; public: /// This constructor initializes the data members to match that /// of the specified triple. 
- /// - AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS, - bool LittleEndian); + AArch64Subtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, bool LittleEndian); - virtual bool enableMachineScheduler() const { - return true; - } - - /// ParseSubtargetFeatures - Parses features string setting specified - /// subtarget options. Definition of function is auto generated by tblgen. - void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + bool enableMachineScheduler() const override { return true; } - bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const; + bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } - bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } - bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } bool hasFPARMv8() const { return HasFPARMv8; } bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } + bool hasCRC() const { return HasCRC; } + + bool isLittleEndian() const { return IsLittleEndian; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + + bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } + + bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + + bool isCyclone() const { return CPUString == "cyclone"; } + + /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size + /// that still makes it profitable to inline the call. + unsigned getMaxInlineSizeThreshold() const { return 64; } + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + /// ClassifyGlobalReference - Find the target operand flags that describe + /// how a global value should be referenced for the current subtarget. + unsigned char ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM) const; + + /// This function returns the name of a function which has an interface + /// like the non-standard bzero function, if such a function exists on + /// the current subtarget and it is considered prefereable over + /// memset with zero passed as the second argument. Otherwise it + /// returns null. + const char *getBZeroEntry() const; - bool isLittle() const { return IsLittleEndian; } + void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const override; - const std::string & getCPUString() const { return CPUString; } + bool enableEarlyIfConversion() const override; }; } // End llvm namespace -#endif // LLVM_TARGET_AARCH64_SUBTARGET_H +#endif // AArch64SUBTARGET_H diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index d9c990d..0b5dd2f 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -7,41 +7,80 @@ // //===----------------------------------------------------------------------===// // -// This file contains the implementation of the AArch64TargetMachine -// methods. Principally just setting up the passes needed to generate correct -// code on this architecture. 
// //===----------------------------------------------------------------------===// #include "AArch64.h" #include "AArch64TargetMachine.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/PassManager.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" - +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; +static cl::opt +EnableCCMP("aarch64-ccmp", cl::desc("Enable the CCMP formation pass"), + cl::init(true), cl::Hidden); + +static cl::opt +EnableStPairSuppress("aarch64-stp-suppress", cl::desc("Suppress STP for AArch64"), + cl::init(true), cl::Hidden); + +static cl::opt +EnableAdvSIMDScalar("aarch64-simd-scalar", cl::desc("Enable use of AdvSIMD scalar" + " integer instructions"), cl::init(false), cl::Hidden); + +static cl::opt +EnablePromoteConstant("aarch64-promote-const", cl::desc("Enable the promote " + "constant pass"), cl::init(true), cl::Hidden); + +static cl::opt +EnableCollectLOH("aarch64-collect-loh", cl::desc("Enable the pass that emits the" + " linker optimization hints (LOH)"), cl::init(true), + cl::Hidden); + +static cl::opt +EnableDeadRegisterElimination("aarch64-dead-def-elimination", cl::Hidden, + cl::desc("Enable the pass that removes dead" + " definitons and replaces stores to" + " them with stores to the zero" + " register"), + cl::init(true)); + +static cl::opt +EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair" + " optimization pass"), cl::init(true), cl::Hidden); + extern "C" void LLVMInitializeAArch64Target() { + // Register the target. RegisterTargetMachine X(TheAArch64leTarget); RegisterTargetMachine Y(TheAArch64beTarget); + + RegisterTargetMachine Z(TheARM64leTarget); + RegisterTargetMachine W(TheARM64beTarget); } +/// TargetMachine ctor - Create an AArch64 architecture model. +/// AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool LittleEndian) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, LittleEndian), - InstrInfo(Subtarget), - DL(LittleEndian ? - "e-m:e-i64:64-i128:128-n32:64-S128" : - "E-m:e-i64:64-i128:128-n32:64-S128"), - TLInfo(*this), - TSInfo(*this), - FrameLowering(Subtarget) { + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, LittleEndian), + // This nested ternary is horrible, but DL needs to be properly + // initialized + // before TLInfo is constructed. + DL(Subtarget.isTargetMachO() + ? "e-m:o-i64:64-i128:128-n32:64-S128" + : (LittleEndian ? "e-m:e-i64:64-i128:128-n32:64-S128" + : "E-m:e-i64:64-i128:128-n32:64-S128")), + InstrInfo(Subtarget), TLInfo(*this), FrameLowering(*this, Subtarget), + TSInfo(*this) { initAsmInfo(); } @@ -63,50 +102,107 @@ AArch64beTargetMachine(const Target &T, StringRef TT, CodeGenOpt::Level OL) : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} -void AArch64TargetMachine::addAnalysisPasses(PassManagerBase &PM) { - // Add first the target-independent BasicTTI pass, then our AArch64 pass. This - // allows the AArch64 pass to delegate to the target independent layer when - // appropriate. - PM.add(createBasicTargetTransformInfoPass(this)); - PM.add(createAArch64TargetTransformInfoPass(this)); -} - namespace { /// AArch64 Code Generator Pass Configuration Options. 
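
The constructor's nested data-layout ternary above is easier to audit written out flat. This is the same selection (string values copied from the constructor), not a proposed refactor:

    #include <cstdio>
    #include <string>

    // MachO mangling wins over endianness; otherwise ELF mangling with the
    // e/E prefix chosen by byte order.
    static std::string aarch64DataLayout(bool isMachO, bool littleEndian) {
      if (isMachO)
        return "e-m:o-i64:64-i128:128-n32:64-S128";
      return littleEndian ? "e-m:e-i64:64-i128:128-n32:64-S128"
                          : "E-m:e-i64:64-i128:128-n32:64-S128";
    }

    int main() {
      std::printf("%s\n", aarch64DataLayout(true, true).c_str());
      return 0;
    }
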
class AArch64PassConfig : public TargetPassConfig { public: AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) {} AArch64TargetMachine &getAArch64TargetMachine() const { return getTM(); } - const AArch64Subtarget &getAArch64Subtarget() const { - return *getAArch64TargetMachine().getSubtargetImpl(); - } - - virtual bool addInstSelector(); - virtual bool addPreEmitPass(); + bool addPreISel() override; + bool addInstSelector() override; + bool addILPOpts() override; + bool addPreRegAlloc() override; + bool addPostRegAlloc() override; + bool addPreSched2() override; + bool addPreEmitPass() override; }; } // namespace +void AArch64TargetMachine::addAnalysisPasses(PassManagerBase &PM) { + // Add first the target-independent BasicTTI pass, then our AArch64 pass. This + // allows the AArch64 pass to delegate to the target independent layer when + // appropriate. + PM.add(createBasicTargetTransformInfoPass(this)); + PM.add(createAArch64TargetTransformInfoPass(this)); +} + TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { return new AArch64PassConfig(this, PM); } -bool AArch64PassConfig::addPreEmitPass() { - addPass(&UnpackMachineBundlesID); - addPass(createAArch64BranchFixupPass()); - return true; +// Pass Pipeline Configuration +bool AArch64PassConfig::addPreISel() { + // Run promote constant before global merge, so that the promoted constants + // get a chance to be merged + if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant) + addPass(createAArch64PromoteConstantPass()); + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createGlobalMergePass(TM)); + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createAArch64AddressTypePromotionPass()); + + // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg + // ourselves. + addPass(createAtomicExpandLoadLinkedPass(TM)); + + return false; } bool AArch64PassConfig::addInstSelector() { - addPass(createAArch64ISelDAG(getAArch64TargetMachine(), getOptLevel())); + addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel())); - // For ELF, cleanup any local-dynamic TLS accesses. - if (getAArch64Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None) + // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many + // references to _TLS_MODULE_BASE_ as possible. + if (TM->getSubtarget().isTargetELF() && + getOptLevel() != CodeGenOpt::None) addPass(createAArch64CleanupLocalDynamicTLSPass()); return false; } + +bool AArch64PassConfig::addILPOpts() { + if (EnableCCMP) + addPass(createAArch64ConditionalCompares()); + addPass(&EarlyIfConverterID); + if (EnableStPairSuppress) + addPass(createAArch64StorePairSuppressPass()); + return true; +} + +bool AArch64PassConfig::addPreRegAlloc() { + // Use AdvSIMD scalar instructions whenever profitable. + if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) + addPass(createAArch64AdvSIMDScalar()); + return true; +} + +bool AArch64PassConfig::addPostRegAlloc() { + // Change dead register definitions to refer to the zero register. + if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) + addPass(createAArch64DeadRegisterDefinitions()); + return true; +} + +bool AArch64PassConfig::addPreSched2() { + // Expand some pseudo instructions to allow proper scheduling. + addPass(createAArch64ExpandPseudoPass()); + // Use load/store pair instructions when possible. 
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) + addPass(createAArch64LoadStoreOptimizationPass()); + return true; +} + +bool AArch64PassConfig::addPreEmitPass() { + // Relax conditional branch instructions if they're otherwise out of + // range of their destination. + addPass(createAArch64BranchRelaxation()); + if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH && + TM->getSubtarget().isTargetMachO()) + addPass(createAArch64CollectLOHPass()); + return true; +} diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 4297c92..079b19b 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -1,4 +1,4 @@ -//=== AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-===// +//==-- AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -11,60 +11,60 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64TARGETMACHINE_H -#define LLVM_AARCH64TARGETMACHINE_H +#ifndef AArch64TARGETMACHINE_H +#define AArch64TARGETMACHINE_H -#include "AArch64FrameLowering.h" -#include "AArch64ISelLowering.h" #include "AArch64InstrInfo.h" -#include "AArch64SelectionDAGInfo.h" +#include "AArch64ISelLowering.h" #include "AArch64Subtarget.h" +#include "AArch64FrameLowering.h" +#include "AArch64SelectionDAGInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/MC/MCStreamer.h" namespace llvm { class AArch64TargetMachine : public LLVMTargetMachine { - AArch64Subtarget Subtarget; - AArch64InstrInfo InstrInfo; - const DataLayout DL; - AArch64TargetLowering TLInfo; - AArch64SelectionDAGInfo TSInfo; - AArch64FrameLowering FrameLowering; +protected: + AArch64Subtarget Subtarget; + +private: + const DataLayout DL; + AArch64InstrInfo InstrInfo; + AArch64TargetLowering TLInfo; + AArch64FrameLowering FrameLowering; + AArch64SelectionDAGInfo TSInfo; public: AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool LittleEndian); + CodeGenOpt::Level OL, bool IsLittleEndian); - const AArch64InstrInfo *getInstrInfo() const { - return &InstrInfo; + const AArch64Subtarget *getSubtargetImpl() const override { + return &Subtarget; } - - const AArch64FrameLowering *getFrameLowering() const { + const AArch64TargetLowering *getTargetLowering() const override { + return &TLInfo; + } + const DataLayout *getDataLayout() const override { return &DL; } + const AArch64FrameLowering *getFrameLowering() const override { return &FrameLowering; } - - const AArch64TargetLowering *getTargetLowering() const { - return &TLInfo; + const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; } + const AArch64RegisterInfo *getRegisterInfo() const override { + return &InstrInfo.getRegisterInfo(); } - - const AArch64SelectionDAGInfo *getSelectionDAGInfo() const { + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } - const AArch64Subtarget *getSubtargetImpl() const { return &Subtarget; } - - const DataLayout *getDataLayout() const { return &DL; } - - const TargetRegisterInfo *getRegisterInfo() const { - return &InstrInfo.getRegisterInfo(); - } - TargetPassConfig *createPassConfig(PassManagerBase &PM); + // Pass Pipeline Configuration + TargetPassConfig *createPassConfig(PassManagerBase &PM) 
override; - virtual void addAnalysisPasses(PassManagerBase &PM); + /// \brief Register AArch64 analysis passes with a pass manager. + void addAnalysisPasses(PassManagerBase &PM) override; }; // AArch64leTargetMachine - AArch64 little endian target machine. @@ -72,8 +72,8 @@ public: class AArch64leTargetMachine : public AArch64TargetMachine { virtual void anchor(); public: - AArch64leTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, + AArch64leTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; @@ -83,12 +83,12 @@ public: class AArch64beTargetMachine : public AArch64TargetMachine { virtual void anchor(); public: - AArch64beTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, + AArch64beTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; -} // End llvm namespace +} // end namespace llvm #endif diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp index 663d619..4069038 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -6,19 +6,47 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file deals with any AArch64 specific requirements on object files. -// -//===----------------------------------------------------------------------===// - #include "AArch64TargetObjectFile.h" - +#include "AArch64TargetMachine.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/Dwarf.h" using namespace llvm; +using namespace dwarf; -void -AArch64ElfTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { +void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); InitializeELF(TM.Options.UseInitArray); } + +const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference( + const GlobalValue *GV, unsigned Encoding, Mangler &Mang, + const TargetMachine &TM, MachineModuleInfo *MMI, + MCStreamer &Streamer) const { + // On Darwin, we can reference dwarf symbols with foo@GOT-., which + // is an indirect pc-relative reference. The default implementation + // won't reference using the GOT, so we need this target-specific + // version. 
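
For reference, the encoding test just below combines two standard DWARF EH pointer-encoding bits. Isolated, the check is (the constants carry their standard values, as in llvm/Support/Dwarf.h):

    // DWARF EH pointer-encoding flag bits.
    enum : unsigned { DW_EH_PE_pcrel = 0x10, DW_EH_PE_indirect = 0x80 };

    // True when the reference must be emitted in the pc-relative,
    // GOT-indirect form Sym@GOT - Lpc built in the body below.
    static bool needsGOTForm(unsigned encoding) {
      return (encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) != 0;
    }
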
+ if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { + const MCSymbol *Sym = TM.getSymbol(GV, Mang); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); + MCSymbol *PCSym = getContext().CreateTempSymbol(); + Streamer.EmitLabel(PCSym); + const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext()); + return MCBinaryExpr::CreateSub(Res, PC, getContext()); + } + + return TargetLoweringObjectFileMachO::getTTypeGlobalReference( + GV, Encoding, Mang, TM, MMI, Streamer); +} + +MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol( + const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, + MachineModuleInfo *MMI) const { + return TM.getSymbol(GV, Mang); +} diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h index 0f00a78..de63cb4 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -1,4 +1,4 @@ -//===-- AArch64TargetObjectFile.h - AArch64 Object Info ---------*- C++ -*-===// +//===-- AArch64TargetObjectFile.h - AArch64 Object Info -*- C++ ---------*-===// // // The LLVM Compiler Infrastructure // @@ -6,25 +6,34 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file deals with any AArch64 specific requirements on object files. -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AARCH64_TARGETOBJECTFILE_H -#define LLVM_TARGET_AARCH64_TARGETOBJECTFILE_H +#ifndef LLVM_TARGET_AArch64_TARGETOBJECTFILE_H +#define LLVM_TARGET_AArch64_TARGETOBJECTFILE_H #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Target/TargetMachine.h" namespace llvm { +class AArch64TargetMachine; + +/// This implementation is used for AArch64 ELF targets (Linux in particular). +class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF { + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; +}; + +/// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin. +class AArch64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { +public: + const MCExpr *getTTypeGlobalReference(const GlobalValue *GV, + unsigned Encoding, Mangler &Mang, + const TargetMachine &TM, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; - /// AArch64ElfTargetObjectFile - This implementation is used for ELF - /// AArch64 targets. 
- class AArch64ElfTargetObjectFile : public TargetLoweringObjectFileELF { - virtual void Initialize(MCContext &Ctx, const TargetMachine &TM); - }; + MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, + const TargetMachine &TM, + MachineModuleInfo *MMI) const override; +}; } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e2a1647..33e482a 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1,4 +1,4 @@ -//===- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass ---------===// +//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===// // // The LLVM Compiler Infrastructure // @@ -14,15 +14,18 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "aarch64tti" #include "AArch64.h" #include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" +#include using namespace llvm; +#define DEBUG_TYPE "aarch64tti" + // Declare the pass initialization routine locally as target-specific passes // don't have a target-wide initialization entry point, and so we rely on the // pass constructor initialization. @@ -33,25 +36,28 @@ void initializeAArch64TTIPass(PassRegistry &); namespace { class AArch64TTI final : public ImmutablePass, public TargetTransformInfo { + const AArch64TargetMachine *TM; const AArch64Subtarget *ST; const AArch64TargetLowering *TLI; + /// Estimate the overhead of scalarizing an instruction. Insert and Extract + /// are set if the result needs to be inserted and/or extracted from vectors. + unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; + public: - AArch64TTI() : ImmutablePass(ID), ST(0), TLI(0) { + AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { llvm_unreachable("This pass cannot be directly constructed"); } AArch64TTI(const AArch64TargetMachine *TM) - : ImmutablePass(ID), ST(TM->getSubtargetImpl()), + : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), TLI(TM->getTargetLowering()) { initializeAArch64TTIPass(*PassRegistry::getPassRegistry()); } - virtual void initializePass() override { - pushTTIStack(this); - } + void initializePass() override { pushTTIStack(this); } - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + void getAnalysisUsage(AnalysisUsage &AU) const override { TargetTransformInfo::getAnalysisUsage(AU); } @@ -59,31 +65,37 @@ public: static char ID; /// Provide necessary pointer adjustments for the two base classes. 
- virtual void *getAdjustedAnalysisPointer(const void *ID) override { + void *getAdjustedAnalysisPointer(const void *ID) override { if (ID == &TargetTransformInfo::ID) - return (TargetTransformInfo*)this; + return (TargetTransformInfo *)this; return this; } /// \name Scalar TTI Implementations /// @{ + unsigned getIntImmCost(int64_t Val) const; + unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; + unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) const override; + unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) const override; + PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override; /// @} - /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector) const { + unsigned getNumberOfRegisters(bool Vector) const override { if (Vector) { if (ST->hasNEON()) return 32; return 0; } - return 32; + return 31; } - unsigned getRegisterBitWidth(bool Vector) const { + unsigned getRegisterBitWidth(bool Vector) const override { if (Vector) { if (ST->hasNEON()) return 128; @@ -92,6 +104,26 @@ public: return 64; } + unsigned getMaximumUnrollFactor() const override { return 2; } + + unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const + override; + + unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const + override; + + unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, + OperandValueKind Opd1Info = OK_AnyValue, + OperandValueKind Opd2Info = OK_AnyValue) const + override; + + unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override; + + unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const + override; + + unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) const override; /// @} }; @@ -105,3 +137,328 @@ ImmutablePass * llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) { return new AArch64TTI(TM); } + +/// \brief Calculate the cost of materializing a 64-bit value. This helper +/// method might only calculate a fraction of a larger immediate. Therefore it +/// is valid to return a cost of ZERO. +unsigned AArch64TTI::getIntImmCost(int64_t Val) const { + // Check if the immediate can be encoded within an instruction. + if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) + return 0; + + if (Val < 0) + Val = ~Val; + + // Calculate how many moves we will need to materialize this constant. + unsigned LZ = countLeadingZeros((uint64_t)Val); + return (64 - LZ + 15) / 16; +} + +/// \brief Calculate the cost of materializing the given constant. +unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return ~0U; + + // Sign-extend all constants to a multiple of 64-bit. + APInt ImmVal = Imm; + if (BitSize & 0x3f) + ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); + + // Split the constant into 64-bit chunks and calculate the cost for each + // chunk. + unsigned Cost = 0; + for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { + APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); + int64_t Val = Tmp.getSExtValue(); + Cost += getIntImmCost(Val); + } + // We need at least one instruction to materialize the constant.
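+  // Worked example (illustrative values, not from the source): the
+  // constant 0x0000123456789ABC has 19 leading zeros, so the per-chunk
+  // helper above returns (64 - 19 + 15) / 16 == 3, i.e. one MOVZ plus
+  // two MOVKs; a negative value such as 0xFFFFFFFFFFFFFFF0 is inverted
+  // first and costs a single MOVN-style move.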
+ return std::max(1U, Cost); +} + +unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + // There is no cost model for constants with a bit size of 0. Return TCC_Free + // here, so that constant hoisting will ignore this constant. + if (BitSize == 0) + return TCC_Free; + + unsigned ImmIdx = ~0U; + switch (Opcode) { + default: + return TCC_Free; + case Instruction::GetElementPtr: + // Always hoist the base address of a GetElementPtr. + if (Idx == 0) + return 2 * TCC_Basic; + return TCC_Free; + case Instruction::Store: + ImmIdx = 0; + break; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + ImmIdx = 1; + break; + // Always return TCC_Free for the shift value of a shift instruction. + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + if (Idx == 1) + return TCC_Free; + break; + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::IntToPtr: + case Instruction::PtrToInt: + case Instruction::BitCast: + case Instruction::PHI: + case Instruction::Call: + case Instruction::Select: + case Instruction::Ret: + case Instruction::Load: + break; + } + + if (Idx == ImmIdx) { + unsigned NumConstants = (BitSize + 63) / 64; + unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty); + return (Cost <= NumConstants * TCC_Basic) + ? static_cast<unsigned>(TCC_Free) : Cost; + } + return AArch64TTI::getIntImmCost(Imm, Ty); +} + +unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + // There is no cost model for constants with a bit size of 0. Return TCC_Free + // here, so that constant hoisting will ignore this constant. + if (BitSize == 0) + return TCC_Free; + + switch (IID) { + default: + return TCC_Free; + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + if (Idx == 1) { + unsigned NumConstants = (BitSize + 63) / 64; + unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty); + return (Cost <= NumConstants * TCC_Basic) + ? static_cast<unsigned>(TCC_Free) : Cost; + } + break; + case Intrinsic::experimental_stackmap: + if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) + return TCC_Free; + break; + case Intrinsic::experimental_patchpoint_void: + case Intrinsic::experimental_patchpoint_i64: + if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) + return TCC_Free; + break; + } + return AArch64TTI::getIntImmCost(Imm, Ty); +} + +AArch64TTI::PopcntSupportKind +AArch64TTI::getPopcntSupport(unsigned TyWidth) const { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + if (TyWidth == 32 || TyWidth == 64) + return PSK_FastHardware; + // TODO: AArch64TargetLowering::LowerCTPOP() supports 128-bit popcount.
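+  // For illustration (lowering details paraphrased, not from the patch):
+  // i32/i64 ctpop is implemented on the SIMD side, roughly an fmov into
+  // a vector register, CNT over the bytes, then a horizontal add back to
+  // the integer file; that is why 32/64-bit widths report fast hardware
+  // support while wider types currently take the software path below.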
+ return PSK_Software; +} + +unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) const { + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + EVT SrcTy = TLI->getValueType(Src); + EVT DstTy = TLI->getValueType(Dst); + + if (!SrcTy.isSimple() || !DstTy.isSimple()) + return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); + + static const TypeConversionCostTblEntry ConversionTbl[] = { + // LowerVectorINT_TO_FP: + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + // LowerVectorFP_TO_INT + { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 }, + { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 4 }, + { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 4 }, + { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 }, + { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 }, + { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 }, + }; + + int Idx = ConvertCostTableLookup( + ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(), + SrcTy.getSimpleVT()); + if (Idx != -1) + return ConversionTbl[Idx].Cost; + + return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); +} + +unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) const { + assert(Val->isVectorTy() && "This must be a vector type"); + + if (Index != -1U) { + // Legalize the type. + std::pair LT = TLI->getTypeLegalizationCost(Val); + + // This type is legalized to a scalar type. + if (!LT.second.isVector()) + return 0; + + // The type may be split. Normalize the index to the new type. + unsigned Width = LT.second.getVectorNumElements(); + Index = Index % Width; + + // The element at index zero is already inside the vector. + if (Index == 0) + return 0; + } + + // All other insert/extracts cost this much. + return 2; +} + +unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, + OperandValueKind Opd1Info, + OperandValueKind Opd2Info) const { + // Legalize the type. + std::pair LT = TLI->getTypeLegalizationCost(Ty); + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + + switch (ISD) { + default: + return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info, + Opd2Info); + case ISD::ADD: + case ISD::MUL: + case ISD::XOR: + case ISD::OR: + case ISD::AND: + // These nodes are marked as 'custom' for combining purposes only. + // We know that they are legal. See LowerAdd in ISelLowering. + return 1 * LT.first; + } +} + +unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { + // Address computations in vectorized code with non-consecutive addresses will + // likely result in more instructions compared to scalar code where the + // computation can more often be merged into the index mode. The resulting + // extra micro-ops can significantly decrease throughput. 
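+  // An illustrative case (not from the patch): a gathered access such as
+  // a[idx[i]] keeps its address arithmetic as separate ADD/LSL micro-ops
+  // in vector code, while the scalar loop folds the same arithmetic into
+  // the load's [base, index, lsl #3] addressing mode.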
+ unsigned NumVectorInstToHideOverhead = 10; + + if (Ty->isVectorTy() && IsComplex) + return NumVectorInstToHideOverhead; + + // In many cases the address computation is not merged into the instruction + // addressing mode. + return 1; +} + +unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) const { + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + // We don't do a good job lowering vector selects that are wider than the + // register width. + if (ValTy->isVectorTy() && ISD == ISD::SELECT) { + // We would need this many instructions to hide the scalarization happening. + unsigned AmortizationCost = 20; + static const TypeConversionCostTblEntry + VectorSelectTbl[] = { + { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost }, + { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost }, + { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost }, + { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, + { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, + { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } + }; + + EVT SelCondTy = TLI->getValueType(CondTy); + EVT SelValTy = TLI->getValueType(ValTy); + if (SelCondTy.isSimple() && SelValTy.isSimple()) { + int Idx = + ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(), + SelValTy.getSimpleVT()); + if (Idx != -1) + return VectorSelectTbl[Idx].Cost; + } + } + return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); +} + +unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const { + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); + + if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 && + Src->getVectorElementType()->isIntegerTy(64)) { + // Unaligned stores are extremely inefficient. We don't split + // unaligned v2i64 stores because of the negative impact that has been + // shown in practice on inlined memcpy code. + // We make v2i64 stores expensive so that we will only vectorize if there + // are 6 other instructions getting vectorized. + unsigned AmortizationCost = 6; + + return LT.first * 2 * AmortizationCost; + } + + if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) && + Src->getVectorNumElements() < 8) { + // We scalarize the loads/stores because there is no v.4b register and we + // have to promote the elements to v.4h. + unsigned NumVecElts = Src->getVectorNumElements(); + unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; + // We generate 2 instructions per vector element.
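+  // Worked numbers (illustrative): an unaligned <2 x i64> store with
+  // LT.first == 1 comes back as 1 * 2 * 6 = 12, and a <4 x i8> store as
+  // (4 * 2) * 4 * 2 = 64, versus 1 for an ordinary legal vector store;
+  // steep enough that these shapes are only vectorized when plenty of
+  // other work vectorizes alongside them.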
+ return NumVectorizableInstsToAmortize * NumVecElts * 2; + } + + return LT.first; +} diff --git a/lib/Target/AArch64/Android.mk b/lib/Target/AArch64/Android.mk index 144c2d3..d0a50da 100644 --- a/lib/Target/AArch64/Android.mk +++ b/lib/Target/AArch64/Android.mk @@ -3,31 +3,41 @@ LOCAL_PATH := $(call my-dir) arm64_codegen_TBLGEN_TABLES := \ AArch64GenRegisterInfo.inc \ AArch64GenInstrInfo.inc \ - AArch64GenCodeEmitter.inc \ - AArch64GenMCCodeEmitter.inc \ - AArch64GenMCPseudoLowering.inc \ AArch64GenAsmWriter.inc \ - AArch64GenAsmMatcher.inc \ + AArch64GenAsmWriter1.inc \ AArch64GenDAGISel.inc \ - AArch64GenFastISel.inc \ AArch64GenCallingConv.inc \ + AArch64GenAsmMatcher.inc \ AArch64GenSubtargetInfo.inc \ - AArch64GenDisassemblerTables.inc + AArch64GenMCCodeEmitter.inc \ + AArch64GenFastISel.inc \ + AArch64GenDisassemblerTables.inc \ + AArch64GenMCPseudoLowering.inc \ arm64_codegen_SRC_FILES := \ + AArch64AddressTypePromotion.cpp \ + AArch64AdvSIMDScalarPass.cpp \ AArch64AsmPrinter.cpp \ + AArch64BranchRelaxation.cpp \ + AArch64CleanupLocalDynamicTLSPass.cpp \ + AArch64CollectLOH.cpp \ + AArch64ConditionalCompares.cpp \ + AArch64DeadRegisterDefinitionsPass.cpp \ + AArch64ExpandPseudoInsts.cpp \ + AArch64FastISel.cpp \ AArch64FrameLowering.cpp \ - AArch64ISelDAGToDAG.cpp \ - AArch64MachineFunctionInfo.cpp \ - AArch64RegisterInfo.cpp \ - AArch64Subtarget.cpp \ - AArch64TargetObjectFile.cpp \ - AArch64BranchFixupPass.cpp \ AArch64InstrInfo.cpp \ + AArch64ISelDAGToDAG.cpp \ AArch64ISelLowering.cpp \ + AArch64LoadStoreOptimizer.cpp \ AArch64MCInstLower.cpp \ + AArch64PromoteConstant.cpp \ + AArch64RegisterInfo.cpp \ AArch64SelectionDAGInfo.cpp \ + AArch64StorePairSuppress.cpp \ + AArch64Subtarget.cpp \ AArch64TargetMachine.cpp \ + AArch64TargetObjectFile.cpp \ AArch64TargetTransformInfo.cpp # For the host diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index e933ec1..65b77c5 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -6,34 +6,31 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file contains the (GNU-style) assembly parser for the AArch64 -// architecture. 
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "Utils/AArch64BaseInfo.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCTargetAsmParser.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" - +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include using namespace llvm; namespace { @@ -41,21 +38,74 @@ namespace { class AArch64Operand; class AArch64AsmParser : public MCTargetAsmParser { +public: + typedef SmallVectorImpl OperandVector; + +private: + StringRef Mnemonic; ///< Instruction mnemonic. MCSubtargetInfo &STI; MCAsmParser &Parser; + MCAsmParser &getParser() const { return Parser; } + MCAsmLexer &getLexer() const { return Parser.getLexer(); } + + SMLoc getLoc() const { return Parser.getTok().getLoc(); } + + bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); + AArch64CC::CondCode parseCondCodeString(StringRef Cond); + bool parseCondCode(OperandVector &Operands, bool invertCondCode); + int tryParseRegister(); + int tryMatchVectorRegister(StringRef &Kind, bool expected); + bool parseRegister(OperandVector &Operands); + bool parseSymbolicImmVal(const MCExpr *&ImmVal); + bool parseVectorList(OperandVector &Operands); + bool parseOperand(OperandVector &Operands, bool isCondCode, + bool invertCondCode); + + void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } + bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } + bool showMatchError(SMLoc Loc, unsigned ErrCode); + + bool parseDirectiveWord(unsigned Size, SMLoc L); + bool parseDirectiveTLSDescCall(SMLoc L); + + bool parseDirectiveLOH(StringRef LOH, SMLoc L); + + bool validateInstruction(MCInst &Inst, SmallVectorImpl &Loc); + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) override; +/// @name Auto-generated Match Functions +/// { + #define GET_ASSEMBLER_HEADER #include "AArch64GenAsmMatcher.inc" + /// } + + OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands); + OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands); + OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands); + OperandMatchResultTy tryParseSysReg(OperandVector &Operands); + OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); + OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); + OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); + OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); + OperandMatchResultTy 
tryParseFPImm(OperandVector &Operands); + OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands); + OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands); + bool tryParseVectorRegister(OperandVector &Operands); + public: enum AArch64MatchResultTy { - Match_FirstAArch64 = FIRST_TARGET_MATCH_RESULT_TY, + Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY, #define GET_OPERAND_DIAGNOSTIC_TYPES #include "AArch64GenAsmMatcher.inc" }; - AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, - const MCInstrInfo &MII) + const MCInstrInfo &MII, + const MCTargetOptions &Options) : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { MCAsmParserExtension::Initialize(_Parser); @@ -63,191 +113,197 @@ public: setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); } - // These are the public interface of the MCTargetAsmParser - bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, - SmallVectorImpl &Operands); - - bool ParseDirective(AsmToken DirectiveID); - bool ParseDirectiveTLSDescCall(SMLoc L); - bool ParseDirectiveWord(unsigned Size, SMLoc L); - - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl &Operands, - MCStreamer&Out, unsigned &ErrorInfo, - bool MatchingInlineAsm); - - // The rest of the sub-parsers have more freedom over interface: they return - // an OperandMatchResultTy because it's less ambiguous than true/false or - // -1/0/1 even if it is more verbose - OperandMatchResultTy - ParseOperand(SmallVectorImpl &Operands, - StringRef Mnemonic); - - OperandMatchResultTy ParseImmediate(const MCExpr *&ExprVal); - - OperandMatchResultTy ParseRelocPrefix(AArch64MCExpr::VariantKind &RefKind); - - OperandMatchResultTy - ParseNEONLane(SmallVectorImpl &Operands, - uint32_t NumLanes); - - OperandMatchResultTy - ParseRegister(SmallVectorImpl &Operands, - uint32_t &NumLanes); - - OperandMatchResultTy - ParseImmWithLSLOperand(SmallVectorImpl &Operands); - - OperandMatchResultTy - ParseCondCodeOperand(SmallVectorImpl &Operands); - - OperandMatchResultTy - ParseCRxOperand(SmallVectorImpl &Operands); - - OperandMatchResultTy - ParseFPImmOperand(SmallVectorImpl &Operands); - - OperandMatchResultTy - ParseFPImm0AndImm0Operand( SmallVectorImpl &Operands); - - template OperandMatchResultTy - ParseNamedImmOperand(SmallVectorImpl &Operands) { - return ParseNamedImmOperand(SomeNamedImmMapper(), Operands); - } - - OperandMatchResultTy - ParseNamedImmOperand(const NamedImmMapper &Mapper, - SmallVectorImpl &Operands); - - OperandMatchResultTy - ParseLSXAddressOperand(SmallVectorImpl &Operands); - - OperandMatchResultTy - ParseShiftExtend(SmallVectorImpl &Operands); - - OperandMatchResultTy - ParseSysRegOperand(SmallVectorImpl &Operands); - - bool TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, StringRef &Layout, - SMLoc &LayoutLoc); - - OperandMatchResultTy ParseVectorList(SmallVectorImpl &); - - bool validateInstruction(MCInst &Inst, - const SmallVectorImpl &Operands); - - /// Scan the next token (which had better be an identifier) and determine - /// whether it represents a general-purpose or vector register. It returns - /// true if an identifier was found and populates its reference arguments. It - /// does not consume the token. 
- bool - IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc, StringRef &LayoutSpec, - SMLoc &LayoutLoc) const; - + SMLoc NameLoc, OperandVector &Operands) override; + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + bool ParseDirective(AsmToken DirectiveID) override; + unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, + unsigned Kind) override; + + static bool classifySymbolRef(const MCExpr *Expr, + AArch64MCExpr::VariantKind &ELFRefKind, + MCSymbolRefExpr::VariantKind &DarwinRefKind, + int64_t &Addend); }; - -} +} // end anonymous namespace namespace { -/// Instances of this class represent a parsed AArch64 machine instruction. +/// AArch64Operand - Instances of this class represent a parsed AArch64 machine +/// instruction. class AArch64Operand : public MCParsedAsmOperand { private: enum KindTy { - k_ImmWithLSL, // #uimm {, LSL #amt } - k_CondCode, // eq/ne/... - k_FPImmediate, // Limited-precision floating-point imm - k_Immediate, // Including expressions referencing symbols + k_Immediate, + k_ShiftedImm, + k_CondCode, k_Register, + k_VectorList, + k_VectorIndex, + k_Token, + k_SysReg, + k_SysCR, + k_Prefetch, k_ShiftExtend, - k_VectorList, // A sequential list of 1 to 4 registers. - k_SysReg, // The register operand of MRS and MSR instructions - k_Token, // The mnemonic; other raw tokens the auto-generated - k_WrappedRegister // Load/store exclusive permit a wrapped register. + k_FPImm, + k_Barrier } Kind; SMLoc StartLoc, EndLoc; - struct ImmWithLSLOp { - const MCExpr *Val; - unsigned ShiftAmount; - bool ImplicitAmount; + struct TokOp { + const char *Data; + unsigned Length; + bool IsSuffix; // Is the operand actually a suffix on the mnemonic. }; - struct CondCodeOp { - A64CC::CondCodes Code; + struct RegOp { + unsigned RegNum; + bool isVector; }; - struct FPImmOp { - double Val; + struct VectorListOp { + unsigned RegNum; + unsigned Count; + unsigned NumElements; + unsigned ElementKind; + }; + + struct VectorIndexOp { + unsigned Val; }; struct ImmOp { const MCExpr *Val; }; - struct RegOp { - unsigned RegNum; + struct ShiftedImmOp { + const MCExpr *Val; + unsigned ShiftAmount; }; - struct ShiftExtendOp { - A64SE::ShiftExtSpecifiers ShiftType; - unsigned Amount; - bool ImplicitAmount; + struct CondCodeOp { + AArch64CC::CondCode Code; }; - // A vector register list is a sequential list of 1 to 4 registers. - struct VectorListOp { - unsigned RegNum; - unsigned Count; - A64Layout::VectorLayout Layout; + struct FPImmOp { + unsigned Val; // Encoded 8-bit representation. + }; + + struct BarrierOp { + unsigned Val; // Not the enum since not all values have names. }; struct SysRegOp { const char *Data; unsigned Length; + uint64_t FeatureBits; // We need to pass through information about which + // core we are compiling for so that the SysReg + // Mappers can appropriately conditionalize. 
}; - struct TokOp { - const char *Data; - unsigned Length; + struct SysCRImmOp { + unsigned Val; + }; + + struct PrefetchOp { + unsigned Val; + }; + + struct ShiftExtendOp { + AArch64_AM::ShiftExtendType Type; + unsigned Amount; + bool HasExplicitAmount; + }; + + struct ExtendOp { + unsigned Val; }; union { - struct ImmWithLSLOp ImmWithLSL; - struct CondCodeOp CondCode; - struct FPImmOp FPImm; - struct ImmOp Imm; + struct TokOp Tok; struct RegOp Reg; - struct ShiftExtendOp ShiftExtend; struct VectorListOp VectorList; + struct VectorIndexOp VectorIndex; + struct ImmOp Imm; + struct ShiftedImmOp ShiftedImm; + struct CondCodeOp CondCode; + struct FPImmOp FPImm; + struct BarrierOp Barrier; struct SysRegOp SysReg; - struct TokOp Tok; + struct SysCRImmOp SysCRImm; + struct PrefetchOp Prefetch; + struct ShiftExtendOp ShiftExtend; }; - AArch64Operand(KindTy K, SMLoc S, SMLoc E) - : MCParsedAsmOperand(), Kind(K), StartLoc(S), EndLoc(E) {} + // Keep the MCContext around as the MCExprs may need manipulated during + // the add<>Operands() calls. + MCContext &Ctx; + + AArch64Operand(KindTy K, MCContext &_Ctx) + : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {} public: - AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand() { + AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) { + Kind = o.Kind; + StartLoc = o.StartLoc; + EndLoc = o.EndLoc; + switch (Kind) { + case k_Token: + Tok = o.Tok; + break; + case k_Immediate: + Imm = o.Imm; + break; + case k_ShiftedImm: + ShiftedImm = o.ShiftedImm; + break; + case k_CondCode: + CondCode = o.CondCode; + break; + case k_FPImm: + FPImm = o.FPImm; + break; + case k_Barrier: + Barrier = o.Barrier; + break; + case k_Register: + Reg = o.Reg; + break; + case k_VectorList: + VectorList = o.VectorList; + break; + case k_VectorIndex: + VectorIndex = o.VectorIndex; + break; + case k_SysReg: + SysReg = o.SysReg; + break; + case k_SysCR: + SysCRImm = o.SysCRImm; + break; + case k_Prefetch: + Prefetch = o.Prefetch; + break; + case k_ShiftExtend: + ShiftExtend = o.ShiftExtend; + break; + } } - SMLoc getStartLoc() const { return StartLoc; } - SMLoc getEndLoc() const { return EndLoc; } - void print(raw_ostream&) const; - void dump() const; + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const override { return StartLoc; } + /// getEndLoc - Get the location of the last token of this operand. 
+ SMLoc getEndLoc() const override { return EndLoc; } StringRef getToken() const { assert(Kind == k_Token && "Invalid access!"); return StringRef(Tok.Data, Tok.Length); } - unsigned getReg() const { - assert((Kind == k_Register || Kind == k_WrappedRegister) - && "Invalid access!"); - return Reg.RegNum; + bool isTokenSuffix() const { + assert(Kind == k_Token && "Invalid access!"); + return Tok.IsSuffix; } const MCExpr *getImm() const { @@ -255,731 +311,779 @@ public: return Imm.Val; } - A64CC::CondCodes getCondCode() const { - assert(Kind == k_CondCode && "Invalid access!"); - return CondCode.Code; + const MCExpr *getShiftedImmVal() const { + assert(Kind == k_ShiftedImm && "Invalid access!"); + return ShiftedImm.Val; } - static bool isNonConstantExpr(const MCExpr *E, - AArch64MCExpr::VariantKind &Variant) { - if (const AArch64MCExpr *A64E = dyn_cast(E)) { - Variant = A64E->getKind(); - return true; - } else if (!isa(E)) { - Variant = AArch64MCExpr::VK_AARCH64_None; - return true; - } - - return false; + unsigned getShiftedImmShift() const { + assert(Kind == k_ShiftedImm && "Invalid access!"); + return ShiftedImm.ShiftAmount; } - bool isCondCode() const { return Kind == k_CondCode; } - bool isToken() const { return Kind == k_Token; } - bool isReg() const { return Kind == k_Register; } - bool isImm() const { return Kind == k_Immediate; } - bool isMem() const { return false; } - bool isFPImm() const { return Kind == k_FPImmediate; } - bool isShiftOrExtend() const { return Kind == k_ShiftExtend; } - bool isSysReg() const { return Kind == k_SysReg; } - bool isImmWithLSL() const { return Kind == k_ImmWithLSL; } - bool isWrappedReg() const { return Kind == k_WrappedRegister; } - - bool isAddSubImmLSL0() const { - if (!isImmWithLSL()) return false; - if (ImmWithLSL.ShiftAmount != 0) return false; - - AArch64MCExpr::VariantKind Variant; - if (isNonConstantExpr(ImmWithLSL.Val, Variant)) { - return Variant == AArch64MCExpr::VK_AARCH64_LO12 - || Variant == AArch64MCExpr::VK_AARCH64_DTPREL_LO12 - || Variant == AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC - || Variant == AArch64MCExpr::VK_AARCH64_TPREL_LO12 - || Variant == AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC - || Variant == AArch64MCExpr::VK_AARCH64_TLSDESC_LO12; - } - - // Otherwise it should be a real immediate in range: - const MCConstantExpr *CE = cast(ImmWithLSL.Val); - return CE->getValue() >= 0 && CE->getValue() <= 0xfff; + AArch64CC::CondCode getCondCode() const { + assert(Kind == k_CondCode && "Invalid access!"); + return CondCode.Code; } - bool isAddSubImmLSL12() const { - if (!isImmWithLSL()) return false; - if (ImmWithLSL.ShiftAmount != 12) return false; - - AArch64MCExpr::VariantKind Variant; - if (isNonConstantExpr(ImmWithLSL.Val, Variant)) { - return Variant == AArch64MCExpr::VK_AARCH64_DTPREL_HI12 - || Variant == AArch64MCExpr::VK_AARCH64_TPREL_HI12; - } - - // Otherwise it should be a real immediate in range: - const MCConstantExpr *CE = cast(ImmWithLSL.Val); - return CE->getValue() >= 0 && CE->getValue() <= 0xfff; + unsigned getFPImm() const { + assert(Kind == k_FPImm && "Invalid access!"); + return FPImm.Val; } - template bool isAddrRegExtend() const { - if (!isShiftOrExtend()) return false; - - A64SE::ShiftExtSpecifiers Ext = ShiftExtend.ShiftType; - if (RmSize == 32 && !(Ext == A64SE::UXTW || Ext == A64SE::SXTW)) - return false; - - if (RmSize == 64 && !(Ext == A64SE::LSL || Ext == A64SE::SXTX)) - return false; - - return ShiftExtend.Amount == Log2_32(MemSize) || ShiftExtend.Amount == 0; + unsigned getBarrier() const { + assert(Kind 
== k_Barrier && "Invalid access!"); + return Barrier.Val; } - bool isAdrpLabel() const { - if (!isImm()) return false; - - AArch64MCExpr::VariantKind Variant; - if (isNonConstantExpr(getImm(), Variant)) { - return Variant == AArch64MCExpr::VK_AARCH64_None - || Variant == AArch64MCExpr::VK_AARCH64_GOT - || Variant == AArch64MCExpr::VK_AARCH64_GOTTPREL - || Variant == AArch64MCExpr::VK_AARCH64_TLSDESC; - } - - return isLabel<21, 4096>(); + unsigned getReg() const override { + assert(Kind == k_Register && "Invalid access!"); + return Reg.RegNum; } - template bool isBitfieldWidth() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; - - return CE->getValue() >= 1 && CE->getValue() <= RegWidth; + unsigned getVectorListStart() const { + assert(Kind == k_VectorList && "Invalid access!"); + return VectorList.RegNum; } - template - bool isCVTFixedPos() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; - - return CE->getValue() >= 1 && CE->getValue() <= RegWidth; + unsigned getVectorListCount() const { + assert(Kind == k_VectorList && "Invalid access!"); + return VectorList.Count; } - bool isFMOVImm() const { - if (!isFPImm()) return false; - - APFloat RealVal(FPImm.Val); - uint32_t ImmVal; - return A64Imms::isFPImm(RealVal, ImmVal); + unsigned getVectorIndex() const { + assert(Kind == k_VectorIndex && "Invalid access!"); + return VectorIndex.Val; } - bool isFPZero() const { - if (!isFPImm()) return false; - - APFloat RealVal(FPImm.Val); - return RealVal.isPosZero(); + StringRef getSysReg() const { + assert(Kind == k_SysReg && "Invalid access!"); + return StringRef(SysReg.Data, SysReg.Length); } - template - bool isLabel() const { - if (!isImm()) return false; - - if (dyn_cast(Imm.Val)) { - return true; - } else if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { - int64_t Val = CE->getValue(); - int64_t Min = - (scale * (1LL << (field_width - 1))); - int64_t Max = scale * ((1LL << (field_width - 1)) - 1); - return (Val % scale) == 0 && Val >= Min && Val <= Max; - } - - // N.b. this disallows explicit relocation specifications via an - // AArch64MCExpr. Users needing that behaviour - return false; + uint64_t getSysRegFeatureBits() const { + assert(Kind == k_SysReg && "Invalid access!"); + return SysReg.FeatureBits; } - bool isLane1() const { - if (!isImm()) return false; - - // Because it's come through custom assembly parsing, it must always be a - // constant expression. 
- return cast(getImm())->getValue() == 1; + unsigned getSysCR() const { + assert(Kind == k_SysCR && "Invalid access!"); + return SysCRImm.Val; } - bool isLoadLitLabel() const { - if (!isImm()) return false; - - AArch64MCExpr::VariantKind Variant; - if (isNonConstantExpr(getImm(), Variant)) { - return Variant == AArch64MCExpr::VK_AARCH64_None - || Variant == AArch64MCExpr::VK_AARCH64_GOTTPREL; - } - - return isLabel<19, 4>(); + unsigned getPrefetch() const { + assert(Kind == k_Prefetch && "Invalid access!"); + return Prefetch.Val; } - template bool isLogicalImm() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(Imm.Val); - if (!CE) return false; - - uint32_t Bits; - return A64Imms::isLogicalImm(RegWidth, CE->getValue(), Bits); + AArch64_AM::ShiftExtendType getShiftExtendType() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.Type; } - template bool isLogicalImmMOV() const { - if (!isLogicalImm()) return false; - - const MCConstantExpr *CE = cast(Imm.Val); - - // The move alias for ORR is only valid if the immediate cannot be - // represented with a move (immediate) instruction; they take priority. - int UImm16, Shift; - return !A64Imms::isMOVZImm(RegWidth, CE->getValue(), UImm16, Shift) - && !A64Imms::isMOVNImm(RegWidth, CE->getValue(), UImm16, Shift); + unsigned getShiftExtendAmount() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.Amount; } - template - bool isOffsetUImm12() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - - // Assume they know what they're doing for now if they've given us a - // non-constant expression. In principle we could check for ridiculous - // things that can't possibly work or relocations that would almost - // certainly break resulting code. - if (!CE) - return true; - - int64_t Val = CE->getValue(); - - // Must be a multiple of the access size in bytes. - if ((Val & (MemSize - 1)) != 0) return false; - - // Must be 12-bit unsigned - return Val >= 0 && Val <= 0xfff * MemSize; + bool hasShiftExtendAmount() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.HasExplicitAmount; } - template - bool isShift() const { - if (!isShiftOrExtend()) return false; - - if (ShiftExtend.ShiftType != SHKind) + bool isImm() const override { return Kind == k_Immediate; } + bool isMem() const override { return false; } + bool isSImm9() const { + if (!isImm()) return false; - - return is64Bit ? 
ShiftExtend.Amount <= 63 : ShiftExtend.Amount <= 31; - } - - bool isMOVN32Imm() const { - static const AArch64MCExpr::VariantKind PermittedModifiers[] = { - AArch64MCExpr::VK_AARCH64_SABS_G0, - AArch64MCExpr::VK_AARCH64_SABS_G1, - AArch64MCExpr::VK_AARCH64_DTPREL_G1, - AArch64MCExpr::VK_AARCH64_DTPREL_G0, - AArch64MCExpr::VK_AARCH64_GOTTPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G0, - }; - const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); - - return isMoveWideImm(32, PermittedModifiers, NumModifiers); - } - - bool isMOVN64Imm() const { - static const AArch64MCExpr::VariantKind PermittedModifiers[] = { - AArch64MCExpr::VK_AARCH64_SABS_G0, - AArch64MCExpr::VK_AARCH64_SABS_G1, - AArch64MCExpr::VK_AARCH64_SABS_G2, - AArch64MCExpr::VK_AARCH64_DTPREL_G2, - AArch64MCExpr::VK_AARCH64_DTPREL_G1, - AArch64MCExpr::VK_AARCH64_DTPREL_G0, - AArch64MCExpr::VK_AARCH64_GOTTPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G2, - AArch64MCExpr::VK_AARCH64_TPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G0, - }; - const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); - - return isMoveWideImm(64, PermittedModifiers, NumModifiers); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -256 && Val < 256); } - - - bool isMOVZ32Imm() const { - static const AArch64MCExpr::VariantKind PermittedModifiers[] = { - AArch64MCExpr::VK_AARCH64_ABS_G0, - AArch64MCExpr::VK_AARCH64_ABS_G1, - AArch64MCExpr::VK_AARCH64_SABS_G0, - AArch64MCExpr::VK_AARCH64_SABS_G1, - AArch64MCExpr::VK_AARCH64_DTPREL_G1, - AArch64MCExpr::VK_AARCH64_DTPREL_G0, - AArch64MCExpr::VK_AARCH64_GOTTPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G0, - }; - const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); - - return isMoveWideImm(32, PermittedModifiers, NumModifiers); - } - - bool isMOVZ64Imm() const { - static const AArch64MCExpr::VariantKind PermittedModifiers[] = { - AArch64MCExpr::VK_AARCH64_ABS_G0, - AArch64MCExpr::VK_AARCH64_ABS_G1, - AArch64MCExpr::VK_AARCH64_ABS_G2, - AArch64MCExpr::VK_AARCH64_ABS_G3, - AArch64MCExpr::VK_AARCH64_SABS_G0, - AArch64MCExpr::VK_AARCH64_SABS_G1, - AArch64MCExpr::VK_AARCH64_SABS_G2, - AArch64MCExpr::VK_AARCH64_DTPREL_G2, - AArch64MCExpr::VK_AARCH64_DTPREL_G1, - AArch64MCExpr::VK_AARCH64_DTPREL_G0, - AArch64MCExpr::VK_AARCH64_GOTTPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G2, - AArch64MCExpr::VK_AARCH64_TPREL_G1, - AArch64MCExpr::VK_AARCH64_TPREL_G0, - }; - const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); - - return isMoveWideImm(64, PermittedModifiers, NumModifiers); + bool isSImm7s4() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -256 && Val <= 252 && (Val & 3) == 0); } - - bool isMOVK32Imm() const { - static const AArch64MCExpr::VariantKind PermittedModifiers[] = { - AArch64MCExpr::VK_AARCH64_ABS_G0_NC, - AArch64MCExpr::VK_AARCH64_ABS_G1_NC, - AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC, - AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC, - AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC, - AArch64MCExpr::VK_AARCH64_TPREL_G1_NC, - AArch64MCExpr::VK_AARCH64_TPREL_G0_NC, - }; - const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); - - return isMoveWideImm(32, PermittedModifiers, NumModifiers); - } - - bool isMOVK64Imm() const { - static const AArch64MCExpr::VariantKind 
PermittedModifiers[] = { - AArch64MCExpr::VK_AARCH64_ABS_G0_NC, - AArch64MCExpr::VK_AARCH64_ABS_G1_NC, - AArch64MCExpr::VK_AARCH64_ABS_G2_NC, - AArch64MCExpr::VK_AARCH64_ABS_G3, - AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC, - AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC, - AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC, - AArch64MCExpr::VK_AARCH64_TPREL_G1_NC, - AArch64MCExpr::VK_AARCH64_TPREL_G0_NC, - }; - const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); - - return isMoveWideImm(64, PermittedModifiers, NumModifiers); + bool isSImm7s8() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -512 && Val <= 504 && (Val & 7) == 0); } - - bool isMoveWideImm(unsigned RegWidth, - const AArch64MCExpr::VariantKind *PermittedModifiers, - unsigned NumModifiers) const { - if (!isImmWithLSL()) return false; - - if (ImmWithLSL.ShiftAmount % 16 != 0) return false; - if (ImmWithLSL.ShiftAmount >= RegWidth) return false; - - AArch64MCExpr::VariantKind Modifier; - if (isNonConstantExpr(ImmWithLSL.Val, Modifier)) { - // E.g. "#:abs_g0:sym, lsl #16" makes no sense. - if (!ImmWithLSL.ImplicitAmount) return false; - - for (unsigned i = 0; i < NumModifiers; ++i) - if (PermittedModifiers[i] == Modifier) return true; - + bool isSImm7s16() const { + if (!isImm()) return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0); + } + + bool isSymbolicUImm12Offset(const MCExpr *Expr, unsigned Scale) const { + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, + Addend)) { + // If we don't understand the expression, assume the best and + // let the fixup and relocation code deal with it. + return true; } - const MCConstantExpr *CE = dyn_cast(ImmWithLSL.Val); - return CE && CE->getValue() >= 0 && CE->getValue() <= 0xffff; - } - - template - bool isMoveWideMovAlias() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; - - int UImm16, Shift; - uint64_t Value = CE->getValue(); - - // If this is a 32-bit instruction then all bits above 32 should be the - // same: either of these is fine because signed/unsigned values should be - // permitted. - if (RegWidth == 32) { - if ((Value >> 32) != 0 && (Value >> 32) != 0xffffffff) - return false; - - Value &= 0xffffffffULL; + if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || + ELFRefKind == AArch64MCExpr::VK_LO12 || + ELFRefKind == AArch64MCExpr::VK_GOT_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) { + // Note that we don't range-check the addend. It's adjusted modulo page + // size when converted, so there is no "out of range" condition when using + // @pageoff. + return Addend >= 0 && (Addend % Scale) == 0; + } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) { + // @gotpageoff/@tlvppageoff can only be used directly, not with an addend. 
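+      // E.g. (illustrative operands): "ldr x0, [x8, _var@GOTPAGEOFF]" passes
+      // the check below, while "_var@GOTPAGEOFF+8" does not; a plain
+      // "_var@PAGEOFF+8" is accepted by the branch above because page
+      // offsets are adjusted modulo the page size at fixup time.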
+ return Addend == 0; } - return isValidImm(RegWidth, Value, UImm16, Shift); + return false; } - bool isMSRWithReg() const { - if (!isSysReg()) return false; + template bool isUImm12Offset() const { + if (!isImm()) + return false; - bool IsKnownRegister; - StringRef Name(SysReg.Data, SysReg.Length); - A64SysReg::MSRMapper().fromString(Name, IsKnownRegister); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return isSymbolicUImm12Offset(getImm(), Scale); - return IsKnownRegister; + int64_t Val = MCE->getValue(); + return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000; } - bool isMSRPState() const { - if (!isSysReg()) return false; - - bool IsKnownRegister; - StringRef Name(SysReg.Data, SysReg.Length); - A64PState::PStateMapper().fromString(Name, IsKnownRegister); - - return IsKnownRegister; + bool isImm0_7() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 8); } - - bool isMRS() const { - if (!isSysReg()) return false; - - // First check against specific MSR-only (write-only) registers - bool IsKnownRegister; - StringRef Name(SysReg.Data, SysReg.Length); - A64SysReg::MRSMapper().fromString(Name, IsKnownRegister); - - return IsKnownRegister; + bool isImm1_8() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val > 0 && Val < 9); } - - bool isPRFM() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - - if (!CE) + bool isImm0_15() const { + if (!isImm()) return false; - - return CE->getValue() >= 0 && CE->getValue() <= 31; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 16); } - - template bool isRegExtend() const { - if (!isShiftOrExtend()) return false; - - if (ShiftExtend.ShiftType != SHKind) + bool isImm1_16() const { + if (!isImm()) return false; - - return ShiftExtend.Amount <= 4; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val > 0 && Val < 17); } - - bool isRegExtendLSL() const { - if (!isShiftOrExtend()) return false; - - if (ShiftExtend.ShiftType != A64SE::LSL) + bool isImm0_31() const { + if (!isImm()) return false; - - return !ShiftExtend.ImplicitAmount && ShiftExtend.Amount <= 4; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 32); } - - // if 0 < value <= w, return true - bool isShrFixedWidth(int w) const { + bool isImm1_31() const { if (!isImm()) return false; - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) return false; - int64_t Value = CE->getValue(); - return Value > 0 && Value <= w; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 32); } - - bool isShrImm8() const { return isShrFixedWidth(8); } - - bool isShrImm16() const { return isShrFixedWidth(16); } - - bool isShrImm32() const { return isShrFixedWidth(32); } - - bool isShrImm64() const { return isShrFixedWidth(64); } - - // if 0 <= value < w, return true - bool isShlFixedWidth(int w) const { + bool isImm1_32() const { if (!isImm()) return false; - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) return false; - 
int64_t Value = CE->getValue(); - return Value >= 0 && Value < w; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 33); } + bool isImm0_63() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 64); + } + bool isImm1_63() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 64); + } + bool isImm1_64() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 65); + } + bool isImm0_127() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 128); + } + bool isImm0_255() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 256); + } + bool isImm0_65535() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 65536); + } + bool isImm32_63() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 32 && Val < 64); + } + bool isLogicalImm32() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + return AArch64_AM::isLogicalImmediate(MCE->getValue(), 32); + } + bool isLogicalImm64() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64); + } + bool isShiftedImm() const { return Kind == k_ShiftedImm; } + bool isAddSubImm() const { + if (!isShiftedImm() && !isImm()) + return false; - bool isShlImm8() const { return isShlFixedWidth(8); } - - bool isShlImm16() const { return isShlFixedWidth(16); } + const MCExpr *Expr; - bool isShlImm32() const { return isShlFixedWidth(32); } + // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'. 
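+    // For example (illustrative operands): "add x0, x1, #16, lsl #12"
+    // encodes 65536 as the 12-bit value 16 shifted by 12, and
+    // "add x0, x1, #0xfff" fits with lsl #0; a value such as #0x1001 needs
+    // bits in both halves, fails this predicate, and must be materialized
+    // separately.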
+ if (isShiftedImm()) { + unsigned Shift = ShiftedImm.ShiftAmount; + Expr = ShiftedImm.Val; + if (Shift != 0 && Shift != 12) + return false; + } else { + Expr = getImm(); + } - bool isShlImm64() const { return isShlFixedWidth(64); } + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, + DarwinRefKind, Addend)) { + return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF + || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF + || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0) + || ELFRefKind == AArch64MCExpr::VK_LO12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC + || ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 + || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 + || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC + || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12; + } - bool isNeonMovImmShiftLSL() const { - if (!isShiftOrExtend()) + // Otherwise it should be a real immediate in range: + const MCConstantExpr *CE = cast(Expr); + return CE->getValue() >= 0 && CE->getValue() <= 0xfff; + } + bool isCondCode() const { return Kind == k_CondCode; } + bool isSIMDImmType10() const { + if (!isImm()) return false; - - if (ShiftExtend.ShiftType != A64SE::LSL) + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) return false; - - // Valid shift amount is 0, 8, 16 and 24. - return ShiftExtend.Amount % 8 == 0 && ShiftExtend.Amount <= 24; + return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue()); } - - bool isNeonMovImmShiftLSLH() const { - if (!isShiftOrExtend()) + bool isBranchTarget26() const { + if (!isImm()) return false; - - if (ShiftExtend.ShiftType != A64SE::LSL) + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) return false; - - // Valid shift amount is 0 and 8. - return ShiftExtend.Amount == 0 || ShiftExtend.Amount == 8; + return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2)); + } + bool isPCRelLabel19() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) + return false; + return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2)); + } + bool isBranchTarget14() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) + return false; + return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2)); } - bool isNeonMovImmShiftMSL() const { - if (!isShiftOrExtend()) + bool + isMovWSymbol(ArrayRef AllowedModifiers) const { + if (!isImm()) return false; - if (ShiftExtend.ShiftType != A64SE::MSL) + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!AArch64AsmParser::classifySymbolRef(getImm(), ELFRefKind, + DarwinRefKind, Addend)) { + return false; + } + if (DarwinRefKind != MCSymbolRefExpr::VK_None) return false; - // Valid shift amount is 8 and 16. 
- return ShiftExtend.Amount == 8 || ShiftExtend.Amount == 16; - } + for (unsigned i = 0; i != AllowedModifiers.size(); ++i) { + if (ELFRefKind == AllowedModifiers[i]) + return Addend == 0; + } - template - bool isVectorList() const { - return Kind == k_VectorList && VectorList.Layout == Layout && - VectorList.Count == Count; + return false; } - template bool isSImm7Scaled() const { - if (!isImm()) - return false; + bool isMovZSymbolG3() const { + static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 }; + return isMovWSymbol(Variants); + } - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; + bool isMovZSymbolG2() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S, + AArch64MCExpr::VK_TPREL_G2, AArch64MCExpr::VK_DTPREL_G2}; + return isMovWSymbol(Variants); + } - int64_t Val = CE->getValue(); - if (Val % MemSize != 0) return false; + bool isMovZSymbolG1() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S, + AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1, + AArch64MCExpr::VK_DTPREL_G1, + }; + return isMovWSymbol(Variants); + } - Val /= MemSize; + bool isMovZSymbolG0() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S, + AArch64MCExpr::VK_TPREL_G0, AArch64MCExpr::VK_DTPREL_G0}; + return isMovWSymbol(Variants); + } - return Val >= -64 && Val < 64; + bool isMovKSymbolG3() const { + static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 }; + return isMovWSymbol(Variants); } - template - bool isSImm() const { - if (!isImm()) return false; + bool isMovKSymbolG2() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G2_NC}; + return isMovWSymbol(Variants); + } - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; + bool isMovKSymbolG1() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G1_NC, AArch64MCExpr::VK_TPREL_G1_NC, + AArch64MCExpr::VK_DTPREL_G1_NC + }; + return isMovWSymbol(Variants); + } - return CE->getValue() >= -(1LL << (BitWidth - 1)) - && CE->getValue() < (1LL << (BitWidth - 1)); + bool isMovKSymbolG0() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC, + AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC + }; + return isMovWSymbol(Variants); } - template - bool isUImm() const { + template + bool isMOVZMovAlias() const { if (!isImm()) return false; const MCConstantExpr *CE = dyn_cast(getImm()); if (!CE) return false; + uint64_t Value = CE->getValue(); - return CE->getValue() >= 0 && CE->getValue() < (1LL << bitWidth); - } + if (RegWidth == 32) + Value &= 0xffffffffULL; - bool isUImm() const { - if (!isImm()) return false; + // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0". + if (Value == 0 && Shift != 0) + return false; - return isa(getImm()); + return (Value & ~(0xffffULL << Shift)) == 0; } - bool isNeonUImm64Mask() const { - if (!isImm()) - return false; + template + bool isMOVNMovAlias() const { + if (!isImm()) return false; const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) - return false; - + if (!CE) return false; uint64_t Value = CE->getValue(); - // i64 value with each byte being either 0x00 or 0xff. 
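// Aside: the isMOVZMovAlias/isMOVNMovAlias predicates in this hunk implement
// the rule that "mov Rd, #imm" is encoded as MOVZ whenever a single shifted
// 16-bit chunk covers the value, and falls back to MOVN (which writes the
// bitwise NOT) only when MOVZ cannot -- hence the "MOVZ takes precedence"
// loop. A standalone sketch of the decision (illustrative; names are ours):
#include <cstdint>

static bool movzEncodable(uint64_t V) {
  for (int Shift = 0; Shift <= 48; Shift += 16)
    if ((V & ~(0xffffULL << Shift)) == 0)
      return true;                       // one shifted 16-bit chunk suffices
  return false;
}

static bool movnEncodable(uint64_t V) {
  return !movzEncodable(V) && movzEncodable(~V);
}
// mov x0, #0x12340000          -> MOVZ x0, #0x1234, lsl #16
// mov x0, #0xfffffffffffffffe  -> MOVN x0, #1   (NOT of 0x1)
// For 32-bit (w) registers the value is truncated to 32 bits first, as the
// RegWidth == 32 paths in the predicates show.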
- for (unsigned i = 0; i < 8; ++i, Value >>= 8) - if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff) + // MOVZ takes precedence over MOVN. + for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16) + if ((Value & ~(0xffffULL << MOVZShift)) == 0) return false; - return true; + + Value = ~Value; + if (RegWidth == 32) + Value &= 0xffffffffULL; + + return (Value & ~(0xffffULL << Shift)) == 0; } - // if value == N, return true - template - bool isExactImm() const { - if (!isImm()) return false; + bool isFPImm() const { return Kind == k_FPImm; } + bool isBarrier() const { return Kind == k_Barrier; } + bool isSysReg() const { return Kind == k_SysReg; } + bool isMRSSystemRegister() const { + if (!isSysReg()) return false; - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; + bool IsKnownRegister; + auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits()); + Mapper.fromString(getSysReg(), IsKnownRegister); - return CE->getValue() == N; + return IsKnownRegister; } + bool isMSRSystemRegister() const { + if (!isSysReg()) return false; + + bool IsKnownRegister; + auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits()); + Mapper.fromString(getSysReg(), IsKnownRegister); - bool isFPZeroIZero() const { - return isFPZero(); + return IsKnownRegister; } + bool isSystemPStateField() const { + if (!isSysReg()) return false; - static AArch64Operand *CreateImmWithLSL(const MCExpr *Val, - unsigned ShiftAmount, - bool ImplicitAmount, - SMLoc S,SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_ImmWithLSL, S, E); - Op->ImmWithLSL.Val = Val; - Op->ImmWithLSL.ShiftAmount = ShiftAmount; - Op->ImmWithLSL.ImplicitAmount = ImplicitAmount; - return Op; + bool IsKnownRegister; + AArch64PState::PStateMapper().fromString(getSysReg(), IsKnownRegister); + + return IsKnownRegister; + } + bool isReg() const override { return Kind == k_Register && !Reg.isVector; } + bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } + bool isVectorRegLo() const { + return Kind == k_Register && Reg.isVector && + AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( + Reg.RegNum); + } + bool isGPR32as64() const { + return Kind == k_Register && !Reg.isVector && + AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum); } - static AArch64Operand *CreateCondCode(A64CC::CondCodes Code, - SMLoc S, SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_CondCode, S, E); - Op->CondCode.Code = Code; - return Op; + bool isGPR64sp0() const { + return Kind == k_Register && !Reg.isVector && + AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum); } - static AArch64Operand *CreateFPImm(double Val, - SMLoc S, SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_FPImmediate, S, E); - Op->FPImm.Val = Val; - return Op; + /// Is this a vector list with the type implicit (presumably attached to the + /// instruction itself)? 
+ template bool isImplicitlyTypedVectorList() const { + return Kind == k_VectorList && VectorList.Count == NumRegs && + !VectorList.ElementKind; } - static AArch64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_Immediate, S, E); - Op->Imm.Val = Val; - return Op; + template + bool isTypedVectorList() const { + if (Kind != k_VectorList) + return false; + if (VectorList.Count != NumRegs) + return false; + if (VectorList.ElementKind != ElementKind) + return false; + return VectorList.NumElements == NumElements; } - static AArch64Operand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_Register, S, E); - Op->Reg.RegNum = RegNum; - return Op; + bool isVectorIndex1() const { + return Kind == k_VectorIndex && VectorIndex.Val == 1; } + bool isVectorIndexB() const { + return Kind == k_VectorIndex && VectorIndex.Val < 16; + } + bool isVectorIndexH() const { + return Kind == k_VectorIndex && VectorIndex.Val < 8; + } + bool isVectorIndexS() const { + return Kind == k_VectorIndex && VectorIndex.Val < 4; + } + bool isVectorIndexD() const { + return Kind == k_VectorIndex && VectorIndex.Val < 2; + } + bool isToken() const override { return Kind == k_Token; } + bool isTokenEqual(StringRef Str) const { + return Kind == k_Token && getToken() == Str; + } + bool isSysCR() const { return Kind == k_SysCR; } + bool isPrefetch() const { return Kind == k_Prefetch; } + bool isShiftExtend() const { return Kind == k_ShiftExtend; } + bool isShifter() const { + if (!isShiftExtend()) + return false; - static AArch64Operand *CreateWrappedReg(unsigned RegNum, SMLoc S, SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_WrappedRegister, S, E); - Op->Reg.RegNum = RegNum; - return Op; + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR || + ST == AArch64_AM::ASR || ST == AArch64_AM::ROR || + ST == AArch64_AM::MSL); } + bool isExtend() const { + if (!isShiftExtend()) + return false; - static AArch64Operand *CreateShiftExtend(A64SE::ShiftExtSpecifiers ShiftTyp, - unsigned Amount, - bool ImplicitAmount, - SMLoc S, SMLoc E) { - AArch64Operand *Op = new AArch64Operand(k_ShiftExtend, S, E); - Op->ShiftExtend.ShiftType = ShiftTyp; - Op->ShiftExtend.Amount = Amount; - Op->ShiftExtend.ImplicitAmount = ImplicitAmount; - return Op; + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::UXTB || ET == AArch64_AM::SXTB || + ET == AArch64_AM::UXTH || ET == AArch64_AM::SXTH || + ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW || + ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX || + ET == AArch64_AM::LSL) && + getShiftExtendAmount() <= 4; } - static AArch64Operand *CreateSysReg(StringRef Str, SMLoc S) { - AArch64Operand *Op = new AArch64Operand(k_SysReg, S, S); - Op->Tok.Data = Str.data(); - Op->Tok.Length = Str.size(); - return Op; + bool isExtend64() const { + if (!isExtend()) + return false; + // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class). 
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return ET != AArch64_AM::UXTX && ET != AArch64_AM::SXTX;
+  }
+  bool isExtendLSL64() const {
+    if (!isExtend())
+      return false;
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX ||
+            ET == AArch64_AM::LSL) &&
+           getShiftExtendAmount() <= 4;
+  }
-  static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count,
-                                          A64Layout::VectorLayout Layout,
-                                          SMLoc S, SMLoc E) {
-    AArch64Operand *Op = new AArch64Operand(k_VectorList, S, E);
-    Op->VectorList.RegNum = RegNum;
-    Op->VectorList.Count = Count;
-    Op->VectorList.Layout = Layout;
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
+  template <int Width> bool isMemXExtend() const {
+    if (!isExtend())
+      return false;
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::LSL || ET == AArch64_AM::SXTX) &&
+           (getShiftExtendAmount() == Log2_32(Width / 8) ||
+            getShiftExtendAmount() == 0);
   }
-  static AArch64Operand *CreateToken(StringRef Str, SMLoc S) {
-    AArch64Operand *Op = new AArch64Operand(k_Token, S, S);
-    Op->Tok.Data = Str.data();
-    Op->Tok.Length = Str.size();
-    return Op;
+  template <int Width> bool isMemWExtend() const {
+    if (!isExtend())
+      return false;
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW) &&
+           (getShiftExtendAmount() == Log2_32(Width / 8) ||
+            getShiftExtendAmount() == 0);
   }
+  template <unsigned width>
+  bool isArithmeticShifter() const {
+    if (!isShifter())
+      return false;
-  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
-    // Add as immediates when possible.
-    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
-      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
-    else
-      Inst.addOperand(MCOperand::CreateExpr(Expr));
+    // An arithmetic shifter is LSL, LSR, or ASR.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
+            ST == AArch64_AM::ASR) && getShiftExtendAmount() < width;
   }
-  template <unsigned RegWidth>
-  void addBFILSBOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
-    unsigned EncodedVal = (RegWidth - CE->getValue()) % RegWidth;
-    Inst.addOperand(MCOperand::CreateImm(EncodedVal));
-  }
+  template <unsigned width>
+  bool isLogicalShifter() const {
+    if (!isShifter())
+      return false;
-  void addBFIWidthOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
-    Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1));
+    // A logical shifter is LSL, LSR, ASR or ROR.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
+            ST == AArch64_AM::ASR || ST == AArch64_AM::ROR) &&
+           getShiftExtendAmount() < width;
   }
-  void addBFXWidthOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
+  bool isMovImm32Shifter() const {
+    if (!isShifter())
+      return false;
-    uint64_t LSB = Inst.getOperand(Inst.getNumOperands()-1).getImm();
-    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+    // A MOVZ/MOVN shifter for a 32-bit register is LSL of 0 or 16.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    if (ST != AArch64_AM::LSL)
+      return false;
+    uint64_t Val = getShiftExtendAmount();
+    return (Val == 0 || Val == 16);
+  }
+
+  bool isMovImm64Shifter() const {
+    if (!isShifter())
+      return false;
-    Inst.addOperand(MCOperand::CreateImm(LSB + CE->getValue() - 1));
+    // A MOVZ/MOVN shifter for a 64-bit register is LSL of 0, 16, 32, or 48.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    if (ST != AArch64_AM::LSL)
+      return false;
+    uint64_t Val = getShiftExtendAmount();
+    return (Val == 0 || Val == 16 || Val == 32 || Val == 48);
+  }
-  void addCondCodeOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getCondCode()));
+  bool isLogicalVecShifter() const {
+    if (!isShifter())
+      return false;
+
+    // A logical vector shifter is a left shift by 0, 8, 16, or 24.
+    unsigned Shift = getShiftExtendAmount();
+    return getShiftExtendType() == AArch64_AM::LSL &&
+           (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24);
   }
-  void addCVTFixedPosOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
+  bool isLogicalVecHalfWordShifter() const {
+    if (!isLogicalVecShifter())
+      return false;
-    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
-    Inst.addOperand(MCOperand::CreateImm(64 - CE->getValue()));
+    // A logical vector half-word shifter is a left shift by 0 or 8.
+    unsigned Shift = getShiftExtendAmount();
+    return getShiftExtendType() == AArch64_AM::LSL &&
+           (Shift == 0 || Shift == 8);
   }
-  void addFMOVImmOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
+  bool isMoveVecShifter() const {
+    if (!isShiftExtend())
+      return false;
-    APFloat RealVal(FPImm.Val);
-    uint32_t ImmVal;
-    A64Imms::isFPImm(RealVal, ImmVal);
+    // A move vector shifter is an MSL left shift by 8 or 16.
+    unsigned Shift = getShiftExtendAmount();
+    return getShiftExtendType() == AArch64_AM::MSL &&
+           (Shift == 8 || Shift == 16);
+  }
-    Inst.addOperand(MCOperand::CreateImm(ImmVal));
+  // Fallback unscaled operands are for aliases of LDR/STR that fall back
+  // to LDUR/STUR when the offset is not legal for the former but is for
+  // the latter. As such, in addition to checking for being a legal unscaled
+  // address, also check that it is not a legal scaled address. This avoids
+  // ambiguity in the matcher.
+  template <int Width>
+  bool isSImm9OffsetFB() const {
+    return isSImm9() && !isUImm12Offset<Width / 8>();
   }
-  void addFPZeroOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands");
-    Inst.addOperand(MCOperand::CreateImm(0));
+  bool isAdrpLabel() const {
+    // Validation was handled during parsing, so we just sanity check that
+    // something didn't go haywire.
+    if (!isImm())
+      return false;
+
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Min = - (4096 * (1LL << (21 - 1)));
+      int64_t Max = 4096 * ((1LL << (21 - 1)) - 1);
+      return (Val % 4096) == 0 && Val >= Min && Val <= Max;
+    }
+
+    return true;
   }
-  void addFPZeroIZeroOperands(MCInst &Inst, unsigned N) const {
-    addFPZeroOperands(Inst, N);
+  bool isAdrLabel() const {
+    // Validation was handled during parsing, so we just sanity check that
+    // something didn't go haywire.
+ if (!isImm()) + return false; + + if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { + int64_t Val = CE->getValue(); + int64_t Min = - (1LL << (21 - 1)); + int64_t Max = ((1LL << (21 - 1)) - 1); + return Val >= Min && Val <= Max; + } + + return true; } - void addInvCondCodeOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - unsigned Encoded = A64InvertCondCode(getCondCode()); - Inst.addOperand(MCOperand::CreateImm(Encoded)); + void addExpr(MCInst &Inst, const MCExpr *Expr) const { + // Add as immediates when possible. Null MCExpr = 0. + if (!Expr) + Inst.addOperand(MCOperand::CreateImm(0)); + else if (const MCConstantExpr *CE = dyn_cast(Expr)) + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + else + Inst.addOperand(MCOperand::CreateExpr(Expr)); } void addRegOperands(MCInst &Inst, unsigned N) const { @@ -987,1330 +1091,2026 @@ public: Inst.addOperand(MCOperand::CreateReg(getReg())); } - void addImmOperands(MCInst &Inst, unsigned N) const { + void addGPR32as64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - addExpr(Inst, getImm()); - } + assert( + AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(getReg())); - template - void addSImm7ScaledOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); + const MCRegisterInfo *RI = Ctx.getRegisterInfo(); + uint32_t Reg = RI->getRegClass(AArch64::GPR32RegClassID).getRegister( + RI->getEncodingValue(getReg())); - const MCConstantExpr *CE = cast(getImm()); - uint64_t Val = CE->getValue() / MemSize; - Inst.addOperand(MCOperand::CreateImm(Val & 0x7f)); + Inst.addOperand(MCOperand::CreateReg(Reg)); } - template - void addSImmOperands(MCInst &Inst, unsigned N) const { + void addVectorReg64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - - const MCConstantExpr *CE = cast(getImm()); - uint64_t Val = CE->getValue(); - Inst.addOperand(MCOperand::CreateImm(Val & ((1ULL << BitWidth) - 1))); + assert( + AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg())); + Inst.addOperand(MCOperand::CreateReg(AArch64::D0 + getReg() - AArch64::Q0)); } - void addImmWithLSLOperands(MCInst &Inst, unsigned N) const { - assert (N == 1 && "Invalid number of operands!"); - - addExpr(Inst, ImmWithLSL.Val); + void addVectorReg128Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + assert( + AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg())); + Inst.addOperand(MCOperand::CreateReg(getReg())); } - template - void addLabelOperands(MCInst &Inst, unsigned N) const { + void addVectorRegLoOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getReg())); + } - const MCConstantExpr *CE = dyn_cast(Imm.Val); + template + void addVectorList64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + static unsigned FirstRegs[] = { AArch64::D0, AArch64::D0_D1, + AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 }; + unsigned FirstReg = FirstRegs[NumRegs - 1]; - if (!CE) { - addExpr(Inst, Imm.Val); - return; - } + Inst.addOperand( + MCOperand::CreateReg(FirstReg + getVectorListStart() - AArch64::Q0)); + } - int64_t Val = CE->getValue(); - assert(Val % scale == 0 && "Unaligned immediate in instruction"); - Val /= scale; + template + void addVectorList128Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number 
of operands!"); + static unsigned FirstRegs[] = { AArch64::Q0, AArch64::Q0_Q1, + AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 }; + unsigned FirstReg = FirstRegs[NumRegs - 1]; - Inst.addOperand(MCOperand::CreateImm(Val & ((1LL << field_width) - 1))); + Inst.addOperand( + MCOperand::CreateReg(FirstReg + getVectorListStart() - AArch64::Q0)); } - template - void addOffsetUImm12Operands(MCInst &Inst, unsigned N) const { + void addVectorIndex1Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } - if (const MCConstantExpr *CE = dyn_cast(getImm())) { - Inst.addOperand(MCOperand::CreateImm(CE->getValue() / MemSize)); + void addVectorIndexBOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addVectorIndexHOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addVectorIndexSOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addVectorIndexDOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // If this is a pageoff symrefexpr with an addend, adjust the addend + // to be only the page-offset portion. Otherwise, just add the expr + // as-is. + addExpr(Inst, getImm()); + } + + void addAddSubImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + if (isShiftedImm()) { + addExpr(Inst, getShiftedImmVal()); + Inst.addOperand(MCOperand::CreateImm(getShiftedImmShift())); } else { - Inst.addOperand(MCOperand::CreateExpr(getImm())); + addExpr(Inst, getImm()); + Inst.addOperand(MCOperand::CreateImm(0)); } } - template - void addLogicalImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands"); - const MCConstantExpr *CE = cast(Imm.Val); + void addCondCodeOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getCondCode())); + } - uint32_t Bits; - A64Imms::isLogicalImm(RegWidth, CE->getValue(), Bits); + void addAdrpLabelOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + addExpr(Inst, getImm()); + else + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 12)); + } - Inst.addOperand(MCOperand::CreateImm(Bits)); + void addAdrLabelOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); } - void addMRSOperands(MCInst &Inst, unsigned N) const { + template + void addUImm12OffsetOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); - bool Valid; - StringRef Name(SysReg.Data, SysReg.Length); - uint32_t Bits = A64SysReg::MRSMapper().fromString(Name, Valid); + if (!MCE) { + Inst.addOperand(MCOperand::CreateExpr(getImm())); + return; + } + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / Scale)); + } - Inst.addOperand(MCOperand::CreateImm(Bits)); + void addSImm9Operands(MCInst &Inst, unsigned N) const { + assert(N 
== 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); } - void addMSRWithRegOperands(MCInst &Inst, unsigned N) const { + void addSImm7s4Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4)); + } - bool Valid; - StringRef Name(SysReg.Data, SysReg.Length); - uint32_t Bits = A64SysReg::MSRMapper().fromString(Name, Valid); + void addSImm7s8Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8)); + } - Inst.addOperand(MCOperand::CreateImm(Bits)); + void addSImm7s16Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16)); } - void addMSRPStateOperands(MCInst &Inst, unsigned N) const { + void addImm0_7Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - bool Valid; - StringRef Name(SysReg.Data, SysReg.Length); - uint32_t Bits = A64PState::PStateMapper().fromString(Name, Valid); + void addImm1_8Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - Inst.addOperand(MCOperand::CreateImm(Bits)); + void addImm0_15Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); } - void addMoveWideImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 2 && "Invalid number of operands!"); + void addImm1_16Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - addExpr(Inst, ImmWithLSL.Val); + void addImm0_31Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - AArch64MCExpr::VariantKind Variant; - if (!isNonConstantExpr(ImmWithLSL.Val, Variant)) { - Inst.addOperand(MCOperand::CreateImm(ImmWithLSL.ShiftAmount / 16)); - return; - } + void addImm1_31Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - // We know it's relocated - switch 
(Variant) { - case AArch64MCExpr::VK_AARCH64_ABS_G0: - case AArch64MCExpr::VK_AARCH64_ABS_G0_NC: - case AArch64MCExpr::VK_AARCH64_SABS_G0: - case AArch64MCExpr::VK_AARCH64_DTPREL_G0: - case AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC: - case AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC: - case AArch64MCExpr::VK_AARCH64_TPREL_G0: - case AArch64MCExpr::VK_AARCH64_TPREL_G0_NC: - Inst.addOperand(MCOperand::CreateImm(0)); - break; - case AArch64MCExpr::VK_AARCH64_ABS_G1: - case AArch64MCExpr::VK_AARCH64_ABS_G1_NC: - case AArch64MCExpr::VK_AARCH64_SABS_G1: - case AArch64MCExpr::VK_AARCH64_DTPREL_G1: - case AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC: - case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1: - case AArch64MCExpr::VK_AARCH64_TPREL_G1: - case AArch64MCExpr::VK_AARCH64_TPREL_G1_NC: - Inst.addOperand(MCOperand::CreateImm(1)); - break; - case AArch64MCExpr::VK_AARCH64_ABS_G2: - case AArch64MCExpr::VK_AARCH64_ABS_G2_NC: - case AArch64MCExpr::VK_AARCH64_SABS_G2: - case AArch64MCExpr::VK_AARCH64_DTPREL_G2: - case AArch64MCExpr::VK_AARCH64_TPREL_G2: - Inst.addOperand(MCOperand::CreateImm(2)); - break; - case AArch64MCExpr::VK_AARCH64_ABS_G3: - Inst.addOperand(MCOperand::CreateImm(3)); - break; - default: llvm_unreachable("Inappropriate move wide relocation"); - } + void addImm1_32Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); } - template - void addMoveWideMovAliasOperands(MCInst &Inst, unsigned N) const { - assert(N == 2 && "Invalid number of operands!"); - int UImm16, Shift; + void addImm0_63Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - const MCConstantExpr *CE = cast(getImm()); - uint64_t Value = CE->getValue(); + void addImm1_63Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - if (RegWidth == 32) { - Value &= 0xffffffffULL; - } + void addImm1_64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - bool Valid = isValidImm(RegWidth, Value, UImm16, Shift); - (void)Valid; - assert(Valid && "Invalid immediates should have been weeded out by now"); + void addImm0_127Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - Inst.addOperand(MCOperand::CreateImm(UImm16)); - Inst.addOperand(MCOperand::CreateImm(Shift)); + void addImm0_255Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); } - void addPRFMOperands(MCInst &Inst, unsigned N) const { + void 
addImm0_65535Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - const MCConstantExpr *CE = cast(getImm()); - assert(CE->getValue() >= 0 && CE->getValue() <= 31 - && "PRFM operand should be 5-bits"); + void addImm32_63Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } - Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + void addLogicalImm32Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid logical immediate operand!"); + uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 32); + Inst.addOperand(MCOperand::CreateImm(encoding)); } - // For Add-sub (extended register) operands. - void addRegExtendOperands(MCInst &Inst, unsigned N) const { + void addLogicalImm64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid logical immediate operand!"); + uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 64); + Inst.addOperand(MCOperand::CreateImm(encoding)); + } - Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount)); + void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid immediate operand!"); + uint64_t encoding = AArch64_AM::encodeAdvSIMDModImmType10(MCE->getValue()); + Inst.addOperand(MCOperand::CreateImm(encoding)); } - // For Vector Immediates shifted imm operands. - void addNeonMovImmShiftLSLOperands(MCInst &Inst, unsigned N) const { + void addBranchTarget26Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); + } - if (ShiftExtend.Amount % 8 != 0 || ShiftExtend.Amount > 24) - llvm_unreachable("Invalid shift amount for vector immediate inst."); + void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); + } - // Encode LSL shift amount 0, 8, 16, 24 as 0, 1, 2, 3. 
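// Aside: the two branch-operand writers above (and addBranchTarget14Operands
// just below) emit MCE->getValue() >> 2 because AArch64 PC-relative branches
// store word offsets; the matching is* predicates earlier reject offsets with
// either low bit set. The resulting byte ranges are +/-128MiB (26 bits),
// +/-1MiB (19 bits) and +/-32KiB (14 bits). A standalone range check under
// those assumptions (illustrative; the helper name is ours):
#include <cstdint>

bool fitsBranchTarget(int64_t ByteOff, unsigned Bits) { // Bits = 26, 19 or 14
  if (ByteOff & 3)
    return false;                        // must be 4-byte aligned
  int64_t Lo = -(1LL << (Bits + 1));     // e.g. Bits == 26 -> -2^27
  int64_t Hi = (1LL << (Bits + 1)) - 4;  //                 ->  2^27 - 4
  return ByteOff >= Lo && ByteOff <= Hi;
}
// This reproduces e.g. isBranchTarget26's bounds: -(0x2000000 << 2) == -2^27
// and (0x1ffffff << 2) == 2^27 - 4.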
- int64_t Imm = ShiftExtend.Amount / 8; - Inst.addOperand(MCOperand::CreateImm(Imm)); + void addBranchTarget14Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); } - void addNeonMovImmShiftLSLHOperands(MCInst &Inst, unsigned N) const { + void addFPImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getFPImm())); + } - if (ShiftExtend.Amount != 0 && ShiftExtend.Amount != 8) - llvm_unreachable("Invalid shift amount for vector immediate inst."); + void addBarrierOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getBarrier())); + } - // Encode LSLH shift amount 0, 8 as 0, 1. - int64_t Imm = ShiftExtend.Amount / 8; - Inst.addOperand(MCOperand::CreateImm(Imm)); + void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + bool Valid; + auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits()); + uint32_t Bits = Mapper.fromString(getSysReg(), Valid); + + Inst.addOperand(MCOperand::CreateImm(Bits)); } - void addNeonMovImmShiftMSLOperands(MCInst &Inst, unsigned N) const { + void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - if (ShiftExtend.Amount != 8 && ShiftExtend.Amount != 16) - llvm_unreachable("Invalid shift amount for vector immediate inst."); + bool Valid; + auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits()); + uint32_t Bits = Mapper.fromString(getSysReg(), Valid); - // Encode MSL shift amount 8, 16 as 0, 1. - int64_t Imm = ShiftExtend.Amount / 8 - 1; - Inst.addOperand(MCOperand::CreateImm(Imm)); + Inst.addOperand(MCOperand::CreateImm(Bits)); } - // For the extend in load-store (register offset) instructions. 
- template - void addAddrRegExtendOperands(MCInst &Inst, unsigned N) const { - addAddrRegExtendOperands(Inst, N, MemSize); + void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + bool Valid; + uint32_t Bits = + AArch64PState::PStateMapper().fromString(getSysReg(), Valid); + + Inst.addOperand(MCOperand::CreateImm(Bits)); } - void addAddrRegExtendOperands(MCInst &Inst, unsigned N, - unsigned MemSize) const { + void addSysCROperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getSysCR())); + } - // First bit of Option is set in instruction classes, the high two bits are - // as follows: - unsigned OptionHi = 0; - switch (ShiftExtend.ShiftType) { - case A64SE::UXTW: - case A64SE::LSL: - OptionHi = 1; - break; - case A64SE::SXTW: - case A64SE::SXTX: - OptionHi = 3; - break; - default: - llvm_unreachable("Invalid extend type for register offset"); - } + void addPrefetchOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getPrefetch())); + } - unsigned S = 0; - if (MemSize == 1 && !ShiftExtend.ImplicitAmount) - S = 1; - else if (MemSize != 1 && ShiftExtend.Amount != 0) - S = 1; + void addShifterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + unsigned Imm = + AArch64_AM::getShifterImm(getShiftExtendType(), getShiftExtendAmount()); + Inst.addOperand(MCOperand::CreateImm(Imm)); + } - Inst.addOperand(MCOperand::CreateImm((OptionHi << 1) | S)); + void addExtendOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTW; + unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount()); + Inst.addOperand(MCOperand::CreateImm(Imm)); } - void addShiftOperands(MCInst &Inst, unsigned N) const { + + void addExtend64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTX; + unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount()); + Inst.addOperand(MCOperand::CreateImm(Imm)); + } - Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount)); + void addMemExtendOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX; + Inst.addOperand(MCOperand::CreateImm(IsSigned)); + Inst.addOperand(MCOperand::CreateImm(getShiftExtendAmount() != 0)); } - void addNeonUImm64MaskOperands(MCInst &Inst, unsigned N) const { + // For 8-bit load/store instructions with a register offset, both the + // "DoShift" and "NoShift" variants have a shift of 0. Because of this, + // they're disambiguated by whether the shift was explicit or implicit rather + // than its size. 
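// Aside: for register-offset loads/stores the shift amount, when written,
// must equal log2 of the access size in bytes (LDR Xt -> #3, LDR Wt -> #2,
// LDRH -> #1), and the encoder records only *whether* the offset is shifted,
// since the amount is implied by the access size. A sketch of the legality
// rule behind isMemXExtend/isMemWExtend (illustrative; name is ours):
#include <cstdint>

bool validMemShift(unsigned AccessBytes, unsigned ShiftAmount) { // s in 0..4
  // Either no shift, or a shift matching the access size (1 << s == bytes).
  return ShiftAmount == 0 || (1u << ShiftAmount) == AccessBytes;
}
// For AccessBytes == 1 both encodings have amount 0, so they are told apart
// by whether "#0" was written explicitly -- which is why the 8-bit variant
// below passes hasShiftExtendAmount() instead of the amount itself.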
+ void addMemExtend8Operands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX; + Inst.addOperand(MCOperand::CreateImm(IsSigned)); + Inst.addOperand(MCOperand::CreateImm(hasShiftExtendAmount())); + } + + template + void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - // A bit from each byte in the constant forms the encoded immediate - const MCConstantExpr *CE = dyn_cast(getImm()); + const MCConstantExpr *CE = cast(getImm()); uint64_t Value = CE->getValue(); - - unsigned Imm = 0; - for (unsigned i = 0; i < 8; ++i, Value >>= 8) { - Imm |= (Value & 1) << i; - } - Inst.addOperand(MCOperand::CreateImm(Imm)); + Inst.addOperand(MCOperand::CreateImm((Value >> Shift) & 0xffff)); } - void addVectorListOperands(MCInst &Inst, unsigned N) const { + template + void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum)); + + const MCConstantExpr *CE = cast(getImm()); + uint64_t Value = CE->getValue(); + Inst.addOperand(MCOperand::CreateImm((~Value >> Shift) & 0xffff)); } -}; -} // end anonymous namespace. + void print(raw_ostream &OS) const override; -AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseOperand(SmallVectorImpl &Operands, - StringRef Mnemonic) { + static AArch64Operand *CreateToken(StringRef Str, bool IsSuffix, SMLoc S, + MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_Token, Ctx); + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + Op->Tok.IsSuffix = IsSuffix; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } - // See if the operand has a custom parser - OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + static AArch64Operand *CreateReg(unsigned RegNum, bool isVector, SMLoc S, + SMLoc E, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_Register, Ctx); + Op->Reg.RegNum = RegNum; + Op->Reg.isVector = isVector; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } - // It could either succeed, fail or just not care. - if (ResTy != MatchOperand_NoMatch) - return ResTy; + static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count, + unsigned NumElements, char ElementKind, + SMLoc S, SMLoc E, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_VectorList, Ctx); + Op->VectorList.RegNum = RegNum; + Op->VectorList.Count = Count; + Op->VectorList.NumElements = NumElements; + Op->VectorList.ElementKind = ElementKind; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } - switch (getLexer().getKind()) { - default: - Error(Parser.getTok().getLoc(), "unexpected token in operand"); - return MatchOperand_ParseFail; - case AsmToken::Identifier: { - // It might be in the LSL/UXTB family ... - OperandMatchResultTy GotShift = ParseShiftExtend(Operands); + static AArch64Operand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, + MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_VectorIndex, Ctx); + Op->VectorIndex.Val = Idx; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } - // We can only continue if no tokens were eaten. 
- if (GotShift != MatchOperand_NoMatch) - return GotShift; + static AArch64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, + MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_Immediate, Ctx); + Op->Imm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } - // ... or it might be a register ... - uint32_t NumLanes = 0; - OperandMatchResultTy GotReg = ParseRegister(Operands, NumLanes); - assert(GotReg != MatchOperand_ParseFail - && "register parsing shouldn't partially succeed"); - - if (GotReg == MatchOperand_Success) { - if (Parser.getTok().is(AsmToken::LBrac)) - return ParseNEONLane(Operands, NumLanes); - else - return MatchOperand_Success; - } - // ... or it might be a symbolish thing - } - // Fall through - case AsmToken::LParen: // E.g. (strcmp-4) - case AsmToken::Integer: // 1f, 2b labels - case AsmToken::String: // quoted labels - case AsmToken::Dot: // . is Current location - case AsmToken::Dollar: // $ is PC - case AsmToken::Colon: { - SMLoc StartLoc = Parser.getTok().getLoc(); - SMLoc EndLoc; - const MCExpr *ImmVal = 0; - - if (ParseImmediate(ImmVal) != MatchOperand_Success) - return MatchOperand_ParseFail; + static AArch64Operand *CreateShiftedImm(const MCExpr *Val, + unsigned ShiftAmount, SMLoc S, + SMLoc E, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_ShiftedImm, Ctx); + Op->ShiftedImm .Val = Val; + Op->ShiftedImm.ShiftAmount = ShiftAmount; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } - EndLoc = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - Operands.push_back(AArch64Operand::CreateImm(ImmVal, StartLoc, EndLoc)); - return MatchOperand_Success; + static AArch64Operand *CreateCondCode(AArch64CC::CondCode Code, SMLoc S, + SMLoc E, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_CondCode, Ctx); + Op->CondCode.Code = Code; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; } - case AsmToken::Hash: { // Immediates - SMLoc StartLoc = Parser.getTok().getLoc(); - SMLoc EndLoc; - const MCExpr *ImmVal = 0; - Parser.Lex(); - if (ParseImmediate(ImmVal) != MatchOperand_Success) - return MatchOperand_ParseFail; + static AArch64Operand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_FPImm, Ctx); + Op->FPImm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } - EndLoc = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - Operands.push_back(AArch64Operand::CreateImm(ImmVal, StartLoc, EndLoc)); - return MatchOperand_Success; + static AArch64Operand *CreateBarrier(unsigned Val, SMLoc S, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_Barrier, Ctx); + Op->Barrier.Val = Val; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; } - case AsmToken::LBrac: { - SMLoc Loc = Parser.getTok().getLoc(); - Operands.push_back(AArch64Operand::CreateToken("[", Loc)); - Parser.Lex(); // Eat '[' - // There's no comma after a '[', so we can parse the next operand - // immediately. 
-    return ParseOperand(Operands, Mnemonic);
+  static AArch64Operand *CreateSysReg(StringRef Str, SMLoc S,
+                                      uint64_t FeatureBits, MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_SysReg, Ctx);
+    Op->SysReg.Data = Str.data();
+    Op->SysReg.Length = Str.size();
+    Op->SysReg.FeatureBits = FeatureBits;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
   }
-  // The following will likely be useful later, but not in very early cases
-  case AsmToken::LCurly: // SIMD vector list is not parsed here
-    llvm_unreachable("Don't know how to deal with '{' in operand");
-    return MatchOperand_ParseFail;
+
+  static AArch64Operand *CreateSysCR(unsigned Val, SMLoc S, SMLoc E,
+                                     MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_SysCR, Ctx);
+    Op->SysCRImm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
   }
-}
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseImmediate(const MCExpr *&ExprVal) {
-  if (getLexer().is(AsmToken::Colon)) {
-    AArch64MCExpr::VariantKind RefKind;
+  static AArch64Operand *CreatePrefetch(unsigned Val, SMLoc S, MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_Prefetch, Ctx);
+    Op->Prefetch.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
-    OperandMatchResultTy ResTy = ParseRelocPrefix(RefKind);
-    if (ResTy != MatchOperand_Success)
-      return ResTy;
+  static AArch64Operand *CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp,
+                                           unsigned Val, bool HasExplicitAmount,
+                                           SMLoc S, SMLoc E, MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_ShiftExtend, Ctx);
+    Op->ShiftExtend.Type = ShOp;
+    Op->ShiftExtend.Amount = Val;
+    Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+};
-    const MCExpr *SubExprVal;
-    if (getParser().parseExpression(SubExprVal))
-      return MatchOperand_ParseFail;
+} // end anonymous namespace.
-    ExprVal = AArch64MCExpr::Create(RefKind, SubExprVal, getContext());
-    return MatchOperand_Success;
+void AArch64Operand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case k_FPImm:
+    OS << "<fpimm " << getFPImm() << "("
+       << AArch64_AM::getFPImmFloat(getFPImm()) << ") >";
+    break;
+  case k_Barrier: {
+    bool Valid;
+    StringRef Name = AArch64DB::DBarrierMapper().toString(getBarrier(), Valid);
+    if (Valid)
+      OS << "<barrier " << Name << ">";
+    else
+      OS << "<barrier invalid #" << getBarrier() << ">";
+    break;
+  }
+  case k_Immediate:
+    getImm()->print(OS);
+    break;
+  case k_ShiftedImm: {
+    unsigned Shift = getShiftedImmShift();
+    OS << "<shiftedimm ";
+    getShiftedImmVal()->print(OS);
+    OS << ", lsl #" << AArch64_AM::getShiftValue(Shift) << ">";
+    break;
+  }
+  case k_CondCode:
+    OS << "<condcode " << getCondCode() << ">";
+    break;
+  case k_Register:
+    OS << "<register " << getReg() << ">";
+    break;
+  case k_VectorList: {
+    OS << "<vectorlist ";
+    unsigned Reg = getVectorListStart();
+    for (unsigned i = 0, e = getVectorListCount(); i != e; ++i)
+      OS << Reg + i << " ";
+    OS << ">";
+    break;
+  }
+  case k_VectorIndex:
+    OS << "<vectorindex " << getVectorIndex() << ">";
+    break;
+  case k_SysReg:
+    OS << "<sysreg: " << getSysReg() << '>';
+    break;
+  case k_Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case k_SysCR:
+    OS << "c" << getSysCR();
+    break;
+  case k_Prefetch: {
+    bool Valid;
+    StringRef Name = AArch64PRFM::PRFMMapper().toString(getPrefetch(), Valid);
+    if (Valid)
+      OS << "<prfop " << Name << ">";
+    else
+      OS << "<prfop invalid #" << getPrefetch() << ">";
+    break;
+  }
+  case k_ShiftExtend: {
+    OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
+       << getShiftExtendAmount();
+    if (!hasShiftExtendAmount())
+      OS << "<imp>";
+    OS << '>';
+    break;
+  }
+  }
+}
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
-  // No weird AArch64MCExpr prefix
-  return getParser().parseExpression(ExprVal)
-             ?
MatchOperand_ParseFail : MatchOperand_Success; +/// } + +static unsigned matchVectorRegName(StringRef Name) { + return StringSwitch(Name) + .Case("v0", AArch64::Q0) + .Case("v1", AArch64::Q1) + .Case("v2", AArch64::Q2) + .Case("v3", AArch64::Q3) + .Case("v4", AArch64::Q4) + .Case("v5", AArch64::Q5) + .Case("v6", AArch64::Q6) + .Case("v7", AArch64::Q7) + .Case("v8", AArch64::Q8) + .Case("v9", AArch64::Q9) + .Case("v10", AArch64::Q10) + .Case("v11", AArch64::Q11) + .Case("v12", AArch64::Q12) + .Case("v13", AArch64::Q13) + .Case("v14", AArch64::Q14) + .Case("v15", AArch64::Q15) + .Case("v16", AArch64::Q16) + .Case("v17", AArch64::Q17) + .Case("v18", AArch64::Q18) + .Case("v19", AArch64::Q19) + .Case("v20", AArch64::Q20) + .Case("v21", AArch64::Q21) + .Case("v22", AArch64::Q22) + .Case("v23", AArch64::Q23) + .Case("v24", AArch64::Q24) + .Case("v25", AArch64::Q25) + .Case("v26", AArch64::Q26) + .Case("v27", AArch64::Q27) + .Case("v28", AArch64::Q28) + .Case("v29", AArch64::Q29) + .Case("v30", AArch64::Q30) + .Case("v31", AArch64::Q31) + .Default(0); } -// A lane attached to a NEON register. "[N]", which should yield three tokens: -// '[', N, ']'. A hash is not allowed to precede the immediate here. -AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseNEONLane(SmallVectorImpl &Operands, - uint32_t NumLanes) { - SMLoc Loc = Parser.getTok().getLoc(); +static bool isValidVectorKind(StringRef Name) { + return StringSwitch(Name.lower()) + .Case(".8b", true) + .Case(".16b", true) + .Case(".4h", true) + .Case(".8h", true) + .Case(".2s", true) + .Case(".4s", true) + .Case(".1d", true) + .Case(".2d", true) + .Case(".1q", true) + // Accept the width neutral ones, too, for verbose syntax. If those + // aren't used in the right places, the token operand won't match so + // all will work out. + .Case(".b", true) + .Case(".h", true) + .Case(".s", true) + .Case(".d", true) + .Default(false); +} - assert(Parser.getTok().is(AsmToken::LBrac) && "inappropriate operand"); - Operands.push_back(AArch64Operand::CreateToken("[", Loc)); - Parser.Lex(); // Eat '[' +static void parseValidVectorKind(StringRef Name, unsigned &NumElements, + char &ElementKind) { + assert(isValidVectorKind(Name)); - if (Parser.getTok().isNot(AsmToken::Integer)) { - Error(Parser.getTok().getLoc(), "expected lane number"); - return MatchOperand_ParseFail; - } + ElementKind = Name.lower()[Name.size() - 1]; + NumElements = 0; - if (Parser.getTok().getIntVal() >= NumLanes) { - Error(Parser.getTok().getLoc(), "lane number incompatible with layout"); - return MatchOperand_ParseFail; + if (Name.size() == 2) + return; + + // Parse the lane count + Name = Name.drop_front(); + while (isdigit(Name.front())) { + NumElements = 10 * NumElements + (Name.front() - '0'); + Name = Name.drop_front(); } +} - const MCExpr *Lane = MCConstantExpr::Create(Parser.getTok().getIntVal(), - getContext()); - SMLoc S = Parser.getTok().getLoc(); - Parser.Lex(); // Eat actual lane - SMLoc E = Parser.getTok().getLoc(); - Operands.push_back(AArch64Operand::CreateImm(Lane, S, E)); +bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) { + StartLoc = getLoc(); + RegNo = tryParseRegister(); + EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1); + return (RegNo == (unsigned)-1); +} +/// tryParseRegister - Try to parse a register name. The token must be an +/// Identifier when called, and if it is a register name the token is eaten and +/// the register is added to the operand list. 
+int AArch64AsmParser::tryParseRegister() { + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + + std::string lowerCase = Tok.getString().lower(); + unsigned RegNum = MatchRegisterName(lowerCase); + // Also handle a few aliases of registers. + if (RegNum == 0) + RegNum = StringSwitch(lowerCase) + .Case("fp", AArch64::FP) + .Case("lr", AArch64::LR) + .Case("x31", AArch64::XZR) + .Case("w31", AArch64::WZR) + .Default(0); + + if (RegNum == 0) + return -1; + + Parser.Lex(); // Eat identifier token. + return RegNum; +} - if (Parser.getTok().isNot(AsmToken::RBrac)) { - Error(Parser.getTok().getLoc(), "expected ']' after lane"); - return MatchOperand_ParseFail; +/// tryMatchVectorRegister - Try to parse a vector register name with optional +/// kind specifier. If it is a register specifier, eat the token and return it. +int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) { + if (Parser.getTok().isNot(AsmToken::Identifier)) { + TokError("vector register expected"); + return -1; + } + + StringRef Name = Parser.getTok().getString(); + // If there is a kind specifier, it's separated from the register name by + // a '.'. + size_t Start = 0, Next = Name.find('.'); + StringRef Head = Name.slice(Start, Next); + unsigned RegNum = matchVectorRegName(Head); + if (RegNum) { + if (Next != StringRef::npos) { + Kind = Name.slice(Next, StringRef::npos); + if (!isValidVectorKind(Kind)) { + TokError("invalid vector kind qualifier"); + return -1; + } + } + Parser.Lex(); // Eat the register token. + return RegNum; } - Operands.push_back(AArch64Operand::CreateToken("]", Loc)); - Parser.Lex(); // Eat ']' - - return MatchOperand_Success; + if (expected) + TokError("vector register expected"); + return -1; } +/// tryParseSysCROperand - Try to parse a system instruction CR operand name. 
AArch64AsmParser::OperandMatchResultTy -AArch64AsmParser::ParseRelocPrefix(AArch64MCExpr::VariantKind &RefKind) { - assert(getLexer().is(AsmToken::Colon) && "expected a ':'"); - Parser.Lex(); +AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) { + SMLoc S = getLoc(); - if (getLexer().isNot(AsmToken::Identifier)) { - Error(Parser.getTok().getLoc(), - "expected relocation specifier in operand after ':'"); + if (Parser.getTok().isNot(AsmToken::Identifier)) { + Error(S, "Expected cN operand where 0 <= N <= 15"); return MatchOperand_ParseFail; } - std::string LowerCase = Parser.getTok().getIdentifier().lower(); - RefKind = StringSwitch(LowerCase) - .Case("got", AArch64MCExpr::VK_AARCH64_GOT) - .Case("got_lo12", AArch64MCExpr::VK_AARCH64_GOT_LO12) - .Case("lo12", AArch64MCExpr::VK_AARCH64_LO12) - .Case("abs_g0", AArch64MCExpr::VK_AARCH64_ABS_G0) - .Case("abs_g0_nc", AArch64MCExpr::VK_AARCH64_ABS_G0_NC) - .Case("abs_g1", AArch64MCExpr::VK_AARCH64_ABS_G1) - .Case("abs_g1_nc", AArch64MCExpr::VK_AARCH64_ABS_G1_NC) - .Case("abs_g2", AArch64MCExpr::VK_AARCH64_ABS_G2) - .Case("abs_g2_nc", AArch64MCExpr::VK_AARCH64_ABS_G2_NC) - .Case("abs_g3", AArch64MCExpr::VK_AARCH64_ABS_G3) - .Case("abs_g0_s", AArch64MCExpr::VK_AARCH64_SABS_G0) - .Case("abs_g1_s", AArch64MCExpr::VK_AARCH64_SABS_G1) - .Case("abs_g2_s", AArch64MCExpr::VK_AARCH64_SABS_G2) - .Case("dtprel_g2", AArch64MCExpr::VK_AARCH64_DTPREL_G2) - .Case("dtprel_g1", AArch64MCExpr::VK_AARCH64_DTPREL_G1) - .Case("dtprel_g1_nc", AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC) - .Case("dtprel_g0", AArch64MCExpr::VK_AARCH64_DTPREL_G0) - .Case("dtprel_g0_nc", AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC) - .Case("dtprel_hi12", AArch64MCExpr::VK_AARCH64_DTPREL_HI12) - .Case("dtprel_lo12", AArch64MCExpr::VK_AARCH64_DTPREL_LO12) - .Case("dtprel_lo12_nc", AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC) - .Case("gottprel_g1", AArch64MCExpr::VK_AARCH64_GOTTPREL_G1) - .Case("gottprel_g0_nc", AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC) - .Case("gottprel", AArch64MCExpr::VK_AARCH64_GOTTPREL) - .Case("gottprel_lo12", AArch64MCExpr::VK_AARCH64_GOTTPREL_LO12) - .Case("tprel_g2", AArch64MCExpr::VK_AARCH64_TPREL_G2) - .Case("tprel_g1", AArch64MCExpr::VK_AARCH64_TPREL_G1) - .Case("tprel_g1_nc", AArch64MCExpr::VK_AARCH64_TPREL_G1_NC) - .Case("tprel_g0", AArch64MCExpr::VK_AARCH64_TPREL_G0) - .Case("tprel_g0_nc", AArch64MCExpr::VK_AARCH64_TPREL_G0_NC) - .Case("tprel_hi12", AArch64MCExpr::VK_AARCH64_TPREL_HI12) - .Case("tprel_lo12", AArch64MCExpr::VK_AARCH64_TPREL_LO12) - .Case("tprel_lo12_nc", AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC) - .Case("tlsdesc", AArch64MCExpr::VK_AARCH64_TLSDESC) - .Case("tlsdesc_lo12", AArch64MCExpr::VK_AARCH64_TLSDESC_LO12) - .Default(AArch64MCExpr::VK_AARCH64_None); - - if (RefKind == AArch64MCExpr::VK_AARCH64_None) { - Error(Parser.getTok().getLoc(), - "expected relocation specifier in operand after ':'"); + StringRef Tok = Parser.getTok().getIdentifier(); + if (Tok[0] != 'c' && Tok[0] != 'C') { + Error(S, "Expected cN operand where 0 <= N <= 15"); return MatchOperand_ParseFail; } - Parser.Lex(); // Eat identifier - if (getLexer().isNot(AsmToken::Colon)) { - Error(Parser.getTok().getLoc(), - "expected ':' after relocation specifier"); + uint32_t CRNum; + bool BadNum = Tok.drop_front().getAsInteger(10, CRNum); + if (BadNum || CRNum > 15) { + Error(S, "Expected cN operand where 0 <= N <= 15"); return MatchOperand_ParseFail; } - Parser.Lex(); + + Parser.Lex(); // Eat identifier token. 
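// Aside: tryParseSysCROperand above accepts exactly the tokens c0..c15; the
// CRn/CRm fields of SYS/SYSL (and the AT/DC/IC/TLBI aliases) are 4 bits wide.
// The token check can be reproduced in isolation (illustrative sketch; the
// helper name is ours, LLVM uses StringRef::getAsInteger instead of strtoul):
#include <cstdlib>

bool parseCRField(const char *Tok, unsigned &CR) {
  if (Tok[0] != 'c' && Tok[0] != 'C')
    return false;
  char *End;
  unsigned long N = std::strtoul(Tok + 1, &End, 10);
  if (End == Tok + 1 || *End != '\0' || N > 15)
    return false;                  // rejects "c", "c16", "cx", trailing junk
  CR = static_cast<unsigned>(N);
  return true;
}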
+  Operands.push_back(
+      AArch64Operand::CreateSysCR(CRNum, S, getLoc(), getContext()));
   return MatchOperand_Success;
 }
 
+/// tryParsePrefetch - Try to parse a prefetch operand.
 AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseImmWithLSLOperand(
-    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  // Either an identifier for named values or a 5-bit immediate.
+  bool Hash = Tok.is(AsmToken::Hash);
+  if (Hash || Tok.is(AsmToken::Integer)) {
+    if (Hash)
+      Parser.Lex(); // Eat hash token.
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return MatchOperand_ParseFail;
 
-  SMLoc S = Parser.getTok().getLoc();
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      TokError("immediate value expected for prefetch operand");
+      return MatchOperand_ParseFail;
+    }
+    unsigned prfop = MCE->getValue();
+    if (prfop > 31) {
+      TokError("prefetch operand out of range, [0,31] expected");
+      return MatchOperand_ParseFail;
+    }
 
-  if (Parser.getTok().is(AsmToken::Hash))
-    Parser.Lex(); // Eat '#'
-  else if (Parser.getTok().isNot(AsmToken::Integer))
-    // Operand should start from # or should be integer, emit error otherwise.
-    return MatchOperand_NoMatch;
+    Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext()));
+    return MatchOperand_Success;
+  }
 
-  const MCExpr *Imm;
-  if (ParseImmediate(Imm) != MatchOperand_Success)
+  if (Tok.isNot(AsmToken::Identifier)) {
+    TokError("pre-fetch hint expected");
     return MatchOperand_ParseFail;
-  else if (Parser.getTok().isNot(AsmToken::Comma)) {
-    SMLoc E = Parser.getTok().getLoc();
-    Operands.push_back(AArch64Operand::CreateImmWithLSL(Imm, 0, true, S, E));
-    return MatchOperand_Success;
   }
 
-  // Eat ','
-  Parser.Lex();
+  bool Valid;
+  unsigned prfop = AArch64PRFM::PRFMMapper().fromString(Tok.getString(), Valid);
+  if (!Valid) {
+    TokError("pre-fetch hint expected");
+    return MatchOperand_ParseFail;
+  }
 
-  // The optional operand must be "lsl #N" where N is non-negative.
-  if (Parser.getTok().is(AsmToken::Identifier)
-      && Parser.getTok().getIdentifier().equals_lower("lsl")) {
-    Parser.Lex();
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext()));
+  return MatchOperand_Success;
+}
 
-    if (Parser.getTok().is(AsmToken::Hash)) {
-      Parser.Lex();
+/// tryParseAdrpLabel - Parse and validate a source label for the ADRP
+/// instruction.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  const MCExpr *Expr;
 
-      if (Parser.getTok().isNot(AsmToken::Integer)) {
-        Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
-        return MatchOperand_ParseFail;
-      }
-    }
+  if (Parser.getTok().is(AsmToken::Hash)) {
+    Parser.Lex(); // Eat hash token.
   }
 
-  int64_t ShiftAmount = Parser.getTok().getIntVal();
-
-  if (ShiftAmount < 0) {
-    Error(Parser.getTok().getLoc(), "positive shift amount required");
+  if (parseSymbolicImmVal(Expr))
     return MatchOperand_ParseFail;
+
+  AArch64MCExpr::VariantKind ELFRefKind;
+  MCSymbolRefExpr::VariantKind DarwinRefKind;
+  int64_t Addend;
+  if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
+    if (DarwinRefKind == MCSymbolRefExpr::VK_None &&
+        ELFRefKind == AArch64MCExpr::VK_INVALID) {
+      // No modifier was specified at all; this is the syntax for an ELF basic
+      // ADRP relocation (unfortunately).
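+      // The AArch64MCExpr::VK_ABS_PAGE wrapper created just below makes that
+      // implied page-relative modifier explicit for the fixup/relocation code.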
+      Expr =
+          AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext());
+    } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE ||
+                DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) &&
+               Addend != 0) {
+      Error(S, "gotpage label reference not allowed an addend");
+      return MatchOperand_ParseFail;
+    } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE &&
+               DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE &&
+               DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE &&
+               ELFRefKind != AArch64MCExpr::VK_GOT_PAGE &&
+               ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE &&
+               ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) {
+      // The operand must be an @page or @gotpage qualified symbolref.
+      Error(S, "page or gotpage label reference expected");
+      return MatchOperand_ParseFail;
+    }
   }
 
-  Parser.Lex(); // Eat the number
-  SMLoc E = Parser.getTok().getLoc();
-  Operands.push_back(AArch64Operand::CreateImmWithLSL(Imm, ShiftAmount,
-                                                      false, S, E));
+  // We have either a label reference possibly with addend or an immediate. The
+  // addend is a raw value here. The linker will adjust it to only reference the
+  // page.
+  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+
   return MatchOperand_Success;
 }
 
-
+/// tryParseAdrLabel - Parse and validate a source label for the ADR
+/// instruction.
 AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseCondCodeOperand(
-    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
-  if (Parser.getTok().isNot(AsmToken::Identifier))
-    return MatchOperand_NoMatch;
+AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  const MCExpr *Expr;
 
-  StringRef Tok = Parser.getTok().getIdentifier();
-  A64CC::CondCodes CondCode = A64StringToCondCode(Tok);
+  if (Parser.getTok().is(AsmToken::Hash)) {
+    Parser.Lex(); // Eat hash token.
+  }
 
-  if (CondCode == A64CC::Invalid)
-    return MatchOperand_NoMatch;
+  if (getParser().parseExpression(Expr))
+    return MatchOperand_ParseFail;
 
-  SMLoc S = Parser.getTok().getLoc();
-  Parser.Lex(); // Eat condition code
-  SMLoc E = Parser.getTok().getLoc();
+  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
 
-  Operands.push_back(AArch64Operand::CreateCondCode(CondCode, S, E));
   return MatchOperand_Success;
 }
 
+/// tryParseFPImm - A floating point immediate expression operand.
 AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseCRxOperand(
-    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
-  SMLoc S = Parser.getTok().getLoc();
-  if (Parser.getTok().isNot(AsmToken::Identifier)) {
-    Error(S, "Expected cN operand where 0 <= N <= 15");
-    return MatchOperand_ParseFail;
-  }
+AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
+  SMLoc S = getLoc();
 
-  StringRef Tok = Parser.getTok().getIdentifier();
-  if (Tok[0] != 'c' && Tok[0] != 'C') {
-    Error(S, "Expected cN operand where 0 <= N <= 15");
-    return MatchOperand_ParseFail;
+  bool Hash = false;
+  if (Parser.getTok().is(AsmToken::Hash)) {
+    Parser.Lex(); // Eat '#'
+    Hash = true;
   }
 
-  uint32_t CRNum;
-  bool BadNum = Tok.drop_front().getAsInteger(10, CRNum);
-  if (BadNum || CRNum > 15) {
-    Error(S, "Expected cN operand where 0 <= N <= 15");
-    return MatchOperand_ParseFail;
+  // Handle negation, as that still comes through as a separate token.
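+  // (E.g. in a hypothetical "fmov d0, #-0.5" the '-' is lexed as
+  // AsmToken::Minus, separate from the Real token; illustration only.)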
+  bool isNegative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    isNegative = true;
+    Parser.Lex();
+  }
+
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.is(AsmToken::Real)) {
+    APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+    uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+    // If we had a '-' in front, toggle the sign bit.
+    IntVal ^= (uint64_t)isNegative << 63;
+    int Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
+    Parser.Lex(); // Eat the token.
+    // Check for out of range values. As an exception, we let Zero through,
+    // as we handle that special case in post-processing before matching in
+    // order to use the zero register for it.
+    if (Val == -1 && !RealVal.isZero()) {
+      TokError("expected compatible register or floating-point constant");
+      return MatchOperand_ParseFail;
+    }
+    Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
+    return MatchOperand_Success;
+  }
+  if (Tok.is(AsmToken::Integer)) {
+    int64_t Val;
+    if (!isNegative && Tok.getString().startswith("0x")) {
+      Val = Tok.getIntVal();
+      if (Val > 255 || Val < 0) {
+        TokError("encoded floating point value out of range");
+        return MatchOperand_ParseFail;
+      }
+    } else {
+      APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+      uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+      // If we had a '-' in front, toggle the sign bit.
+      IntVal ^= (uint64_t)isNegative << 63;
+      Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
+    }
+    Parser.Lex(); // Eat the token.
+    Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
+    return MatchOperand_Success;
   }
 
-  const MCExpr *CRImm = MCConstantExpr::Create(CRNum, getContext());
-
-  Parser.Lex();
-  SMLoc E = Parser.getTok().getLoc();
+  if (!Hash)
+    return MatchOperand_NoMatch;
 
-  Operands.push_back(AArch64Operand::CreateImm(CRImm, S, E));
-  return MatchOperand_Success;
+  TokError("invalid floating point immediate");
+  return MatchOperand_ParseFail;
 }
 
+/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand
 AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseFPImmOperand(
-    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
+  SMLoc S = getLoc();
+
+  if (Parser.getTok().is(AsmToken::Hash))
+    Parser.Lex(); // Eat '#'
+  else if (Parser.getTok().isNot(AsmToken::Integer))
+    // Operand should start from # or should be integer, emit error otherwise.
+    return MatchOperand_NoMatch;
+
+  const MCExpr *Imm;
+  if (parseSymbolicImmVal(Imm))
+    return MatchOperand_ParseFail;
+  else if (Parser.getTok().isNot(AsmToken::Comma)) {
+    uint64_t ShiftAmount = 0;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Imm);
+    if (MCE) {
+      int64_t Val = MCE->getValue();
+      if (Val > 0xfff && (Val & 0xfff) == 0) {
+        Imm = MCConstantExpr::Create(Val >> 12, getContext());
+        ShiftAmount = 12;
+      }
+    }
+    SMLoc E = Parser.getTok().getLoc();
+    Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E,
+                                                        getContext()));
+    return MatchOperand_Success;
+  }
+
+  // Eat ','
+  Parser.Lex();
+
+  // The optional operand must be "lsl #N" where N is non-negative.
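+  // (For example "add x0, x1, #1, lsl #12"; 'lsl' is the only shift the
+  // parser accepts here, as the error message below indicates.)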
+  if (!Parser.getTok().is(AsmToken::Identifier) ||
+      !Parser.getTok().getIdentifier().equals_lower("lsl")) {
+    Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
+    return MatchOperand_ParseFail;
+  }
 
-  SMLoc S = Parser.getTok().getLoc();
+  // Eat 'lsl'
+  Parser.Lex();
 
-  bool Hash = false;
   if (Parser.getTok().is(AsmToken::Hash)) {
-    Parser.Lex(); // Eat '#'
-    Hash = true;
+    Parser.Lex();
   }
 
-  bool Negative = false;
-  if (Parser.getTok().is(AsmToken::Minus)) {
-    Negative = true;
-    Parser.Lex(); // Eat '-'
-  } else if (Parser.getTok().is(AsmToken::Plus)) {
-    Parser.Lex(); // Eat '+'
+  if (Parser.getTok().isNot(AsmToken::Integer)) {
+    Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
+    return MatchOperand_ParseFail;
   }
 
-  if (Parser.getTok().isNot(AsmToken::Real)) {
-    if (!Hash)
-      return MatchOperand_NoMatch;
-    Error(S, "Expected floating-point immediate");
+  int64_t ShiftAmount = Parser.getTok().getIntVal();
+
+  if (ShiftAmount < 0) {
+    Error(Parser.getTok().getLoc(), "positive shift amount required");
     return MatchOperand_ParseFail;
   }
+  Parser.Lex(); // Eat the number
 
-  APFloat RealVal(APFloat::IEEEdouble, Parser.getTok().getString());
-  if (Negative) RealVal.changeSign();
-  double DblVal = RealVal.convertToDouble();
-
-  Parser.Lex(); // Eat real number
   SMLoc E = Parser.getTok().getLoc();
-
-  Operands.push_back(AArch64Operand::CreateFPImm(DblVal, S, E));
+  Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount,
+                                                      S, E, getContext()));
   return MatchOperand_Success;
 }
 
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseFPImm0AndImm0Operand(
-    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+/// parseCondCodeString - Parse a Condition Code string.
+AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
+  AArch64CC::CondCode CC = StringSwitch<AArch64CC::CondCode>(Cond.lower())
+                               .Case("eq", AArch64CC::EQ)
+                               .Case("ne", AArch64CC::NE)
+                               .Case("cs", AArch64CC::HS)
+                               .Case("hs", AArch64CC::HS)
+                               .Case("cc", AArch64CC::LO)
+                               .Case("lo", AArch64CC::LO)
+                               .Case("mi", AArch64CC::MI)
+                               .Case("pl", AArch64CC::PL)
+                               .Case("vs", AArch64CC::VS)
+                               .Case("vc", AArch64CC::VC)
+                               .Case("hi", AArch64CC::HI)
+                               .Case("ls", AArch64CC::LS)
+                               .Case("ge", AArch64CC::GE)
+                               .Case("lt", AArch64CC::LT)
+                               .Case("gt", AArch64CC::GT)
+                               .Case("le", AArch64CC::LE)
+                               .Case("al", AArch64CC::AL)
+                               .Case("nv", AArch64CC::NV)
+                               .Default(AArch64CC::Invalid);
+  return CC;
+}
 
-  SMLoc S = Parser.getTok().getLoc();
+/// parseCondCode - Parse a Condition Code operand.
+bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
+                                     bool invertCondCode) {
+  SMLoc S = getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
 
-  bool Hash = false;
-  if (Parser.getTok().is(AsmToken::Hash)) {
-    Parser.Lex(); // Eat '#'
-    Hash = true;
-  }
+  StringRef Cond = Tok.getString();
+  AArch64CC::CondCode CC = parseCondCodeString(Cond);
+  if (CC == AArch64CC::Invalid)
+    return TokError("invalid condition code");
+  Parser.Lex(); // Eat identifier token.
-  APFloat RealVal(0.0);
-  if (Parser.getTok().is(AsmToken::Real)) {
-    if(Parser.getTok().getString() != "0.0") {
-      Error(S, "only #0.0 is acceptable as immediate");
-      return MatchOperand_ParseFail;
-    }
-  }
-  else if (Parser.getTok().is(AsmToken::Integer)) {
-    if(Parser.getTok().getIntVal() != 0) {
-      Error(S, "only #0.0 is acceptable as immediate");
+  if (invertCondCode)
+    CC = AArch64CC::getInvertedCondCode(AArch64CC::CondCode(CC));
+
+  Operands.push_back(
+      AArch64Operand::CreateCondCode(CC, S, getLoc(), getContext()));
+  return false;
+}
+
+/// tryParseOptionalShiftExtend - Some operands take an optional shift
+/// argument. Parse them if present.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  std::string LowerID = Tok.getString().lower();
+  AArch64_AM::ShiftExtendType ShOp =
+      StringSwitch<AArch64_AM::ShiftExtendType>(LowerID)
+          .Case("lsl", AArch64_AM::LSL)
+          .Case("lsr", AArch64_AM::LSR)
+          .Case("asr", AArch64_AM::ASR)
+          .Case("ror", AArch64_AM::ROR)
+          .Case("msl", AArch64_AM::MSL)
+          .Case("uxtb", AArch64_AM::UXTB)
+          .Case("uxth", AArch64_AM::UXTH)
+          .Case("uxtw", AArch64_AM::UXTW)
+          .Case("uxtx", AArch64_AM::UXTX)
+          .Case("sxtb", AArch64_AM::SXTB)
+          .Case("sxth", AArch64_AM::SXTH)
+          .Case("sxtw", AArch64_AM::SXTW)
+          .Case("sxtx", AArch64_AM::SXTX)
+          .Default(AArch64_AM::InvalidShiftExtend);
+
+  if (ShOp == AArch64_AM::InvalidShiftExtend)
+    return MatchOperand_NoMatch;
+
+  SMLoc S = Tok.getLoc();
+  Parser.Lex();
+
+  bool Hash = getLexer().is(AsmToken::Hash);
+  if (!Hash && getLexer().isNot(AsmToken::Integer)) {
+    if (ShOp == AArch64_AM::LSL || ShOp == AArch64_AM::LSR ||
+        ShOp == AArch64_AM::ASR || ShOp == AArch64_AM::ROR ||
+        ShOp == AArch64_AM::MSL) {
+      // We expect a number here.
+      TokError("expected #imm after shift specifier");
       return MatchOperand_ParseFail;
     }
+
+    // "extend" type operations don't need an immediate, #0 is implicit.
+    SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(
+        AArch64Operand::CreateShiftExtend(ShOp, 0, false, S, E, getContext()));
+    return MatchOperand_Success;
   }
-  else {
-    if (!Hash)
-      return MatchOperand_NoMatch;
-    Error(S, "only #0.0 is acceptable as immediate");
+
+  if (Hash)
+    Parser.Lex(); // Eat the '#'.
+
+  // Make sure we do actually have a number
+  if (!Parser.getTok().is(AsmToken::Integer)) {
+    Error(Parser.getTok().getLoc(),
+          "expected integer shift amount");
     return MatchOperand_ParseFail;
   }
 
-  Parser.Lex(); // Eat real number
-  SMLoc E = Parser.getTok().getLoc();
+  const MCExpr *ImmVal;
+  if (getParser().parseExpression(ImmVal))
+    return MatchOperand_ParseFail;
+
+  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+  if (!MCE) {
+    TokError("expected #imm after shift specifier");
+    return MatchOperand_ParseFail;
+  }
 
-  Operands.push_back(AArch64Operand::CreateFPImm(0.0, S, E));
+  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  Operands.push_back(AArch64Operand::CreateShiftExtend(
+      ShOp, MCE->getValue(), true, S, E, getContext()));
   return MatchOperand_Success;
 }
 
-// Automatically generated
-static unsigned MatchRegisterName(StringRef Name);
-
-bool
-AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc,
-                                   StringRef &Layout,
-                                   SMLoc &LayoutLoc) const {
-  const AsmToken &Tok = Parser.getTok();
-
-  if (Tok.isNot(AsmToken::Identifier))
-    return false;
+/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
+/// the SYS instruction. Parse them specially so that we create a SYS MCInst.
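+/// For example, "ic ialluis" is parsed as if it were written
+/// "sys #0, c7, c1, #0" (see the SYS_ALIAS expansions below).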
+bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
+                                     OperandVector &Operands) {
+  if (Name.find('.') != StringRef::npos)
+    return TokError("invalid operand");
 
-  std::string LowerReg = Tok.getString().lower();
-  size_t DotPos = LowerReg.find('.');
+  Mnemonic = Name;
+  Operands.push_back(
+      AArch64Operand::CreateToken("sys", false, NameLoc, getContext()));
 
-  bool IsVec128 = false;
+  const AsmToken &Tok = Parser.getTok();
+  StringRef Op = Tok.getString();
   SMLoc S = Tok.getLoc();
-  RegEndLoc = SMLoc::getFromPointer(S.getPointer() + DotPos);
 
-  if (DotPos == std::string::npos) {
-    Layout = StringRef();
-  } else {
-    // Everything afterwards needs to be a literal token, expected to be
-    // '.2d','.b' etc for vector registers.
-
-    // This StringSwitch validates the input and (perhaps more importantly)
-    // gives us a permanent string to use in the token (a pointer into LowerReg
-    // would go out of scope when we return).
-    LayoutLoc = SMLoc::getFromPointer(S.getPointer() + DotPos + 1);
-    StringRef LayoutText = StringRef(LowerReg).substr(DotPos);
-
-    // See if it's a 128-bit layout first.
-    Layout = StringSwitch<const char *>(LayoutText)
-                 .Case(".q", ".q").Case(".1q", ".1q")
-                 .Case(".d", ".d").Case(".2d", ".2d")
-                 .Case(".s", ".s").Case(".4s", ".4s")
-                 .Case(".h", ".h").Case(".8h", ".8h")
-                 .Case(".b", ".b").Case(".16b", ".16b")
-                 .Default("");
-
-    if (Layout.size() != 0)
-      IsVec128 = true;
-    else {
-      Layout = StringSwitch<const char *>(LayoutText)
-                   .Case(".1d", ".1d")
-                   .Case(".2s", ".2s")
-                   .Case(".4h", ".4h")
-                   .Case(".8b", ".8b")
-                   .Default("");
+  const MCExpr *Expr = nullptr;
+
+#define SYS_ALIAS(op1, Cn, Cm, op2)                                            \
+  do {                                                                         \
+    Expr = MCConstantExpr::Create(op1, getContext());                          \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));           \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext()));           \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext()));           \
+    Expr = MCConstantExpr::Create(op2, getContext());                          \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));           \
+  } while (0)
+
+  if (Mnemonic == "ic") {
+    if (!Op.compare_lower("ialluis")) {
+      // SYS #0, C7, C1, #0
+      SYS_ALIAS(0, 7, 1, 0);
+    } else if (!Op.compare_lower("iallu")) {
+      // SYS #0, C7, C5, #0
+      SYS_ALIAS(0, 7, 5, 0);
+    } else if (!Op.compare_lower("ivau")) {
+      // SYS #3, C7, C5, #1
+      SYS_ALIAS(3, 7, 5, 1);
+    } else {
+      return TokError("invalid operand for IC instruction");
     }
-
-    if (Layout.size() == 0) {
-      // If we've still not pinned it down the register is malformed.
-      return false;
+  } else if (Mnemonic == "dc") {
+    if (!Op.compare_lower("zva")) {
+      // SYS #3, C7, C4, #1
+      SYS_ALIAS(3, 7, 4, 1);
+    } else if (!Op.compare_lower("ivac")) {
+      // SYS #0, C7, C6, #1
+      SYS_ALIAS(0, 7, 6, 1);
+    } else if (!Op.compare_lower("isw")) {
+      // SYS #0, C7, C6, #2
+      SYS_ALIAS(0, 7, 6, 2);
+    } else if (!Op.compare_lower("cvac")) {
+      // SYS #3, C7, C10, #1
+      SYS_ALIAS(3, 7, 10, 1);
+    } else if (!Op.compare_lower("csw")) {
+      // SYS #0, C7, C10, #2
+      SYS_ALIAS(0, 7, 10, 2);
+    } else if (!Op.compare_lower("cvau")) {
+      // SYS #3, C7, C11, #1
+      SYS_ALIAS(3, 7, 11, 1);
+    } else if (!Op.compare_lower("civac")) {
+      // SYS #3, C7, C14, #1
+      SYS_ALIAS(3, 7, 14, 1);
+    } else if (!Op.compare_lower("cisw")) {
+      // SYS #0, C7, C14, #2
+      SYS_ALIAS(0, 7, 14, 2);
+    } else {
+      return TokError("invalid operand for DC instruction");
+    }
+  } else if (Mnemonic == "at") {
+    if (!Op.compare_lower("s1e1r")) {
+      // SYS #0, C7, C8, #0
+      SYS_ALIAS(0, 7, 8, 0);
+    } else if (!Op.compare_lower("s1e2r")) {
+      // SYS #4, C7, C8, #0
+      SYS_ALIAS(4, 7, 8, 0);
+    } else if (!Op.compare_lower("s1e3r")) {
+      // SYS #6, C7, C8, #0
+      SYS_ALIAS(6, 7, 8, 0);
+    } else if (!Op.compare_lower("s1e1w")) {
+      // SYS #0, C7, C8, #1
+      SYS_ALIAS(0, 7, 8, 1);
+    } else if (!Op.compare_lower("s1e2w")) {
+      // SYS #4, C7, C8, #1
+      SYS_ALIAS(4, 7, 8, 1);
+    } else if (!Op.compare_lower("s1e3w")) {
+      // SYS #6, C7, C8, #1
+      SYS_ALIAS(6, 7, 8, 1);
+    } else if (!Op.compare_lower("s1e0r")) {
+      // SYS #0, C7, C8, #2
+      SYS_ALIAS(0, 7, 8, 2);
+    } else if (!Op.compare_lower("s1e0w")) {
+      // SYS #0, C7, C8, #3
+      SYS_ALIAS(0, 7, 8, 3);
+    } else if (!Op.compare_lower("s12e1r")) {
+      // SYS #4, C7, C8, #4
+      SYS_ALIAS(4, 7, 8, 4);
+    } else if (!Op.compare_lower("s12e1w")) {
+      // SYS #4, C7, C8, #5
+      SYS_ALIAS(4, 7, 8, 5);
+    } else if (!Op.compare_lower("s12e0r")) {
+      // SYS #4, C7, C8, #6
+      SYS_ALIAS(4, 7, 8, 6);
+    } else if (!Op.compare_lower("s12e0w")) {
+      // SYS #4, C7, C8, #7
+      SYS_ALIAS(4, 7, 8, 7);
+    } else {
+      return TokError("invalid operand for AT instruction");
+    }
+  } else if (Mnemonic == "tlbi") {
+    if (!Op.compare_lower("vmalle1is")) {
+      // SYS #0, C8, C3, #0
+      SYS_ALIAS(0, 8, 3, 0);
+    } else if (!Op.compare_lower("alle2is")) {
+      // SYS #4, C8, C3, #0
+      SYS_ALIAS(4, 8, 3, 0);
+    } else if (!Op.compare_lower("alle3is")) {
+      // SYS #6, C8, C3, #0
+      SYS_ALIAS(6, 8, 3, 0);
+    } else if (!Op.compare_lower("vae1is")) {
+      // SYS #0, C8, C3, #1
+      SYS_ALIAS(0, 8, 3, 1);
+    } else if (!Op.compare_lower("vae2is")) {
+      // SYS #4, C8, C3, #1
+      SYS_ALIAS(4, 8, 3, 1);
+    } else if (!Op.compare_lower("vae3is")) {
+      // SYS #6, C8, C3, #1
+      SYS_ALIAS(6, 8, 3, 1);
+    } else if (!Op.compare_lower("aside1is")) {
+      // SYS #0, C8, C3, #2
+      SYS_ALIAS(0, 8, 3, 2);
+    } else if (!Op.compare_lower("vaae1is")) {
+      // SYS #0, C8, C3, #3
+      SYS_ALIAS(0, 8, 3, 3);
+    } else if (!Op.compare_lower("alle1is")) {
+      // SYS #4, C8, C3, #4
+      SYS_ALIAS(4, 8, 3, 4);
+    } else if (!Op.compare_lower("vale1is")) {
+      // SYS #0, C8, C3, #5
+      SYS_ALIAS(0, 8, 3, 5);
+    } else if (!Op.compare_lower("vaale1is")) {
+      // SYS #0, C8, C3, #7
+      SYS_ALIAS(0, 8, 3, 7);
+    } else if (!Op.compare_lower("vmalle1")) {
+      // SYS #0, C8, C7, #0
+      SYS_ALIAS(0, 8, 7, 0);
+    } else if (!Op.compare_lower("alle2")) {
+      // SYS #4, C8, C7, #0
+      SYS_ALIAS(4, 8, 7, 0);
+    } else if (!Op.compare_lower("vale2is")) {
+      // SYS #4, C8, C3, #5
+      SYS_ALIAS(4, 8, 3, 5);
+    } else if (!Op.compare_lower("vale3is")) {
+      // SYS #6, C8, C3, #5
+      SYS_ALIAS(6, 8, 3, 5);
+    } else if (!Op.compare_lower("alle3")) {
+      // SYS #6, C8, C7, #0
+      SYS_ALIAS(6, 8, 7, 0);
+    } else if (!Op.compare_lower("vae1")) {
+      // SYS #0, C8, C7, #1
+      SYS_ALIAS(0, 8, 7, 1);
+    } else if (!Op.compare_lower("vae2")) {
+      // SYS #4, C8, C7, #1
+      SYS_ALIAS(4, 8, 7, 1);
+    } else if (!Op.compare_lower("vae3")) {
+      // SYS #6, C8, C7, #1
+      SYS_ALIAS(6, 8, 7, 1);
+    } else if (!Op.compare_lower("aside1")) {
+      // SYS #0, C8, C7, #2
+      SYS_ALIAS(0, 8, 7, 2);
+    } else if (!Op.compare_lower("vaae1")) {
+      // SYS #0, C8, C7, #3
+      SYS_ALIAS(0, 8, 7, 3);
+    } else if (!Op.compare_lower("alle1")) {
+      // SYS #4, C8, C7, #4
+      SYS_ALIAS(4, 8, 7, 4);
+    } else if (!Op.compare_lower("vale1")) {
+      // SYS #0, C8, C7, #5
+      SYS_ALIAS(0, 8, 7, 5);
+    } else if (!Op.compare_lower("vale2")) {
+      // SYS #4, C8, C7, #5
+      SYS_ALIAS(4, 8, 7, 5);
+    } else if (!Op.compare_lower("vale3")) {
+      // SYS #6, C8, C7, #5
+      SYS_ALIAS(6, 8, 7, 5);
+    } else if (!Op.compare_lower("vaale1")) {
+      // SYS #0, C8, C7, #7
+      SYS_ALIAS(0, 8, 7, 7);
+    } else if (!Op.compare_lower("ipas2e1")) {
+      // SYS #4, C8, C4, #1
+      SYS_ALIAS(4, 8, 4, 1);
+    } else if (!Op.compare_lower("ipas2le1")) {
+      // SYS #4, C8, C4, #5
+      SYS_ALIAS(4, 8, 4, 5);
+    } else if (!Op.compare_lower("ipas2e1is")) {
+      // SYS #4, C8, C0, #1
+      SYS_ALIAS(4, 8, 0, 1);
+    } else if (!Op.compare_lower("ipas2le1is")) {
+      // SYS #4, C8, C0, #5
+      SYS_ALIAS(4, 8, 0, 5);
+    } else if (!Op.compare_lower("vmalls12e1")) {
+      // SYS #4, C8, C7, #6
+      SYS_ALIAS(4, 8, 7, 6);
+    } else if (!Op.compare_lower("vmalls12e1is")) {
+      // SYS #4, C8, C3, #6
+      SYS_ALIAS(4, 8, 3, 6);
+    } else {
+      return TokError("invalid operand for TLBI instruction");
     }
   }
 
-  RegNum = MatchRegisterName(LowerReg.substr(0, DotPos));
-  if (RegNum == AArch64::NoRegister) {
-    RegNum = StringSwitch<unsigned>(LowerReg.substr(0, DotPos))
-                 .Case("ip0", AArch64::X16)
-                 .Case("ip1", AArch64::X17)
-                 .Case("fp", AArch64::X29)
-                 .Case("lr", AArch64::X30)
-                 .Case("v0", IsVec128 ? AArch64::Q0 : AArch64::D0)
-                 .Case("v1", IsVec128 ? AArch64::Q1 : AArch64::D1)
-                 .Case("v2", IsVec128 ? AArch64::Q2 : AArch64::D2)
-                 .Case("v3", IsVec128 ? AArch64::Q3 : AArch64::D3)
-                 .Case("v4", IsVec128 ? AArch64::Q4 : AArch64::D4)
-                 .Case("v5", IsVec128 ? AArch64::Q5 : AArch64::D5)
-                 .Case("v6", IsVec128 ? AArch64::Q6 : AArch64::D6)
-                 .Case("v7", IsVec128 ? AArch64::Q7 : AArch64::D7)
-                 .Case("v8", IsVec128 ? AArch64::Q8 : AArch64::D8)
-                 .Case("v9", IsVec128 ? AArch64::Q9 : AArch64::D9)
-                 .Case("v10", IsVec128 ? AArch64::Q10 : AArch64::D10)
-                 .Case("v11", IsVec128 ? AArch64::Q11 : AArch64::D11)
-                 .Case("v12", IsVec128 ? AArch64::Q12 : AArch64::D12)
-                 .Case("v13", IsVec128 ? AArch64::Q13 : AArch64::D13)
-                 .Case("v14", IsVec128 ? AArch64::Q14 : AArch64::D14)
-                 .Case("v15", IsVec128 ? AArch64::Q15 : AArch64::D15)
-                 .Case("v16", IsVec128 ? AArch64::Q16 : AArch64::D16)
-                 .Case("v17", IsVec128 ? AArch64::Q17 : AArch64::D17)
-                 .Case("v18", IsVec128 ? AArch64::Q18 : AArch64::D18)
-                 .Case("v19", IsVec128 ? AArch64::Q19 : AArch64::D19)
-                 .Case("v20", IsVec128 ? AArch64::Q20 : AArch64::D20)
-                 .Case("v21", IsVec128 ? AArch64::Q21 : AArch64::D21)
-                 .Case("v22", IsVec128 ? AArch64::Q22 : AArch64::D22)
-                 .Case("v23", IsVec128 ? AArch64::Q23 : AArch64::D23)
-                 .Case("v24", IsVec128 ? AArch64::Q24 : AArch64::D24)
-                 .Case("v25", IsVec128 ? AArch64::Q25 : AArch64::D25)
-                 .Case("v26", IsVec128 ? AArch64::Q26 : AArch64::D26)
-                 .Case("v27", IsVec128 ? AArch64::Q27 : AArch64::D27)
-                 .Case("v28", IsVec128 ? AArch64::Q28 : AArch64::D28)
-                 .Case("v29", IsVec128 ? AArch64::Q29 : AArch64::D29)
-                 .Case("v30", IsVec128 ? AArch64::Q30 : AArch64::D30)
-                 .Case("v31", IsVec128 ? AArch64::Q31 : AArch64::D31)
-                 .Default(AArch64::NoRegister);
-  }
-  if (RegNum == AArch64::NoRegister)
-    return false;
+#undef SYS_ALIAS
 
-  return true;
-}
+  Parser.Lex(); // Eat operand.
 
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseRegister(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
-                                uint32_t &NumLanes) {
-  unsigned RegNum;
-  StringRef Layout;
-  SMLoc RegEndLoc, LayoutLoc;
-  SMLoc S = Parser.getTok().getLoc();
-
-  if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc))
-    return MatchOperand_NoMatch;
+  bool ExpectRegister = (Op.lower().find("all") == StringRef::npos);
+  bool HasRegister = false;
 
-  Operands.push_back(AArch64Operand::CreateReg(RegNum, S, RegEndLoc));
+  // Check for the optional register operand.
+  if (getLexer().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat comma.
 
-  if (Layout.size() != 0) {
-    unsigned long long TmpLanes = 0;
-    llvm::getAsUnsignedInteger(Layout.substr(1), 10, TmpLanes);
-    if (TmpLanes != 0) {
-      NumLanes = TmpLanes;
-    } else {
-      // If the number of lanes isn't specified explicitly, a valid instruction
-      // will have an element specifier and be capable of acting on the entire
-      // vector register.
-      switch (Layout.back()) {
-      default: llvm_unreachable("Invalid layout specifier");
-      case 'b': NumLanes = 16; break;
-      case 'h': NumLanes = 8; break;
-      case 's': NumLanes = 4; break;
-      case 'd': NumLanes = 2; break;
-      case 'q': NumLanes = 1; break;
-      }
-    }
+    if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands))
+      return TokError("expected register operand");
 
-    Operands.push_back(AArch64Operand::CreateToken(Layout, LayoutLoc));
+    HasRegister = true;
   }
 
-  Parser.Lex();
-  return MatchOperand_Success;
-}
-
-bool
-AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
-                                SMLoc &EndLoc) {
-  // This callback is used for things like DWARF frame directives in
-  // assembly. They don't care about things like NEON layouts or lanes, they
-  // just want to be able to produce the DWARF register number.
-  StringRef LayoutSpec;
-  SMLoc RegEndLoc, LayoutLoc;
-  StartLoc = Parser.getTok().getLoc();
-
-  if (!IdentifyRegister(RegNo, RegEndLoc, LayoutSpec, LayoutLoc))
-    return true;
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Parser.eatToEndOfStatement();
+    return TokError("unexpected token in argument list");
+  }
 
-  Parser.Lex();
-  EndLoc = Parser.getTok().getLoc();
+  if (ExpectRegister && !HasRegister) {
+    return TokError("specified " + Mnemonic + " op requires a register");
+  }
+  else if (!ExpectRegister && HasRegister) {
+    return TokError("specified " + Mnemonic + " op does not use a register");
+  }
 
+  Parser.Lex(); // Consume the EndOfStatement
   return false;
 }
 
 AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseNamedImmOperand(const NamedImmMapper &Mapper,
-                                       SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
-  // Since these operands occur in very limited circumstances, without
-  // alternatives, we actually signal an error if there is no match. If relaxing
-  // this, beware of unintended consequences: an immediate will be accepted
-  // during matching, no matter how it gets into the AArch64Operand.
+AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
   const AsmToken &Tok = Parser.getTok();
-  SMLoc S = Tok.getLoc();
 
-  if (Tok.is(AsmToken::Identifier)) {
-    bool ValidName;
-    uint32_t Code = Mapper.fromString(Tok.getString().lower(), ValidName);
-
-    if (!ValidName) {
-      Error(S, "operand specifier not recognised");
+  // Can be either a #imm style literal or an option name
+  bool Hash = Tok.is(AsmToken::Hash);
+  if (Hash || Tok.is(AsmToken::Integer)) {
+    // Immediate operand.
+    if (Hash)
+      Parser.Lex(); // Eat the '#'
+    const MCExpr *ImmVal;
+    SMLoc ExprLoc = getLoc();
+    if (getParser().parseExpression(ImmVal))
+      return MatchOperand_ParseFail;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      Error(ExprLoc, "immediate value expected for barrier operand");
       return MatchOperand_ParseFail;
     }
-
-    Parser.Lex(); // We're done with the identifier. Eat it
-
-    SMLoc E = Parser.getTok().getLoc();
-    const MCExpr *Imm = MCConstantExpr::Create(Code, getContext());
-    Operands.push_back(AArch64Operand::CreateImm(Imm, S, E));
+    if (MCE->getValue() < 0 || MCE->getValue() > 15) {
+      Error(ExprLoc, "barrier operand out of range");
+      return MatchOperand_ParseFail;
+    }
+    Operands.push_back(
+        AArch64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext()));
     return MatchOperand_Success;
-  } else if (Tok.is(AsmToken::Hash)) {
-    Parser.Lex();
+  }
 
-    const MCExpr *ImmVal;
-    if (ParseImmediate(ImmVal) != MatchOperand_Success)
-      return MatchOperand_ParseFail;
+  if (Tok.isNot(AsmToken::Identifier)) {
+    TokError("invalid operand for instruction");
+    return MatchOperand_ParseFail;
+  }
 
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal);
-    if (!CE || CE->getValue() < 0 || !Mapper.validImm(CE->getValue())) {
-      Error(S, "Invalid immediate for instruction");
-      return MatchOperand_ParseFail;
-    }
+  bool Valid;
+  unsigned Opt = AArch64DB::DBarrierMapper().fromString(Tok.getString(), Valid);
+  if (!Valid) {
+    TokError("invalid barrier option name");
+    return MatchOperand_ParseFail;
+  }
 
-    SMLoc E = Parser.getTok().getLoc();
-    Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E));
-    return MatchOperand_Success;
+  // The only valid named option for ISB is 'sy'
+  if (Mnemonic == "isb" && Opt != AArch64DB::SY) {
+    TokError("'sy' or #imm operand expected");
+    return MatchOperand_ParseFail;
   }
 
-  Error(S, "unexpected operand for instruction");
-  return MatchOperand_ParseFail;
+  Operands.push_back(
+      AArch64Operand::CreateBarrier(Opt, getLoc(), getContext()));
+  Parser.Lex(); // Consume the option
+
+  return MatchOperand_Success;
 }
 
 AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseSysRegOperand(
-    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
   const AsmToken &Tok = Parser.getTok();
 
-  // Any MSR/MRS operand will be an identifier, and we want to store it as some
-  // kind of string: SPSel is valid for two different forms of MSR with two
-  // different encodings. There's no collision at the moment, but the potential
-  // is there.
-  if (!Tok.is(AsmToken::Identifier)) {
+  if (Tok.isNot(AsmToken::Identifier))
     return MatchOperand_NoMatch;
-  }
 
-  SMLoc S = Tok.getLoc();
-  Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), S));
+  Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), getLoc(),
+                                                  STI.getFeatureBits(), getContext()));
   Parser.Lex(); // Eat identifier
 
   return MatchOperand_Success;
 }
 
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseLSXAddressOperand(
-    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
-  SMLoc S = Parser.getTok().getLoc();
-
-  unsigned RegNum;
-  SMLoc RegEndLoc, LayoutLoc;
-  StringRef Layout;
-  if(!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc)
-     || !AArch64MCRegisterClasses[AArch64::GPR64xspRegClassID].contains(RegNum)
-     || Layout.size() != 0) {
-    // Check Layout.size because we don't want to let "x3.4s" or similar
-    // through.
-    return MatchOperand_NoMatch;
-  }
-  Parser.Lex(); // Eat register
+/// tryParseVectorRegister - Parse a vector register operand.
+bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
+  if (Parser.getTok().isNot(AsmToken::Identifier))
+    return true;
 
-  if (Parser.getTok().is(AsmToken::RBrac)) {
-    // We're done
-    SMLoc E = Parser.getTok().getLoc();
-    Operands.push_back(AArch64Operand::CreateWrappedReg(RegNum, S, E));
-    return MatchOperand_Success;
-  }
+  SMLoc S = getLoc();
+  // Check for a vector register specifier first.
+  StringRef Kind;
+  int64_t Reg = tryMatchVectorRegister(Kind, false);
+  if (Reg == -1)
+    return true;
+  Operands.push_back(
+      AArch64Operand::CreateReg(Reg, true, S, getLoc(), getContext()));
+  // If there was an explicit qualifier, that goes on as a literal text
+  // operand.
+  if (!Kind.empty())
+    Operands.push_back(
+        AArch64Operand::CreateToken(Kind, false, S, getContext()));
+
+  // If there is an index specifier following the register, parse that too.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    SMLoc SIdx = getLoc();
+    Parser.Lex(); // Eat left bracket token.
 
-  // Otherwise, only ", #0" is valid
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      TokError("immediate value expected for vector index");
+      return false;
+    }
 
-  if (Parser.getTok().isNot(AsmToken::Comma)) {
-    Error(Parser.getTok().getLoc(), "expected ',' or ']' after register");
-    return MatchOperand_ParseFail;
-  }
-  Parser.Lex(); // Eat ','
+    SMLoc E = getLoc();
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      Error(E, "']' expected");
+      return false;
+    }
 
-  if (Parser.getTok().isNot(AsmToken::Hash)) {
-    Error(Parser.getTok().getLoc(), "expected '#0'");
-    return MatchOperand_ParseFail;
-  }
-  Parser.Lex(); // Eat '#'
+    Parser.Lex(); // Eat right bracket token.
-  if (Parser.getTok().isNot(AsmToken::Integer)
-      || Parser.getTok().getIntVal() != 0 ) {
-    Error(Parser.getTok().getLoc(), "expected '#0'");
-    return MatchOperand_ParseFail;
+    Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
+                                                         E, getContext()));
   }
-  Parser.Lex(); // Eat '0'
 
-  SMLoc E = Parser.getTok().getLoc();
-  Operands.push_back(AArch64Operand::CreateWrappedReg(RegNum, S, E));
-  return MatchOperand_Success;
+  return false;
 }
 
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseShiftExtend(
-    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
-  StringRef IDVal = Parser.getTok().getIdentifier();
-  std::string LowerID = IDVal.lower();
-
-  A64SE::ShiftExtSpecifiers Spec =
-      StringSwitch<A64SE::ShiftExtSpecifiers>(LowerID)
-          .Case("lsl", A64SE::LSL)
-          .Case("msl", A64SE::MSL)
-          .Case("lsr", A64SE::LSR)
-          .Case("asr", A64SE::ASR)
-          .Case("ror", A64SE::ROR)
-          .Case("uxtb", A64SE::UXTB)
-          .Case("uxth", A64SE::UXTH)
-          .Case("uxtw", A64SE::UXTW)
-          .Case("uxtx", A64SE::UXTX)
-          .Case("sxtb", A64SE::SXTB)
-          .Case("sxth", A64SE::SXTH)
-          .Case("sxtw", A64SE::SXTW)
-          .Case("sxtx", A64SE::SXTX)
-          .Default(A64SE::Invalid);
-
-  if (Spec == A64SE::Invalid)
-    return MatchOperand_NoMatch;
+/// parseRegister - Parse a non-vector register operand.
+bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  // Try for a vector register.
+  if (!tryParseVectorRegister(Operands))
+    return false;
 
-  // Eat the shift
-  SMLoc S, E;
-  S = Parser.getTok().getLoc();
-  Parser.Lex();
+  // Try for a scalar register.
+  int64_t Reg = tryParseRegister();
+  if (Reg == -1)
+    return true;
+  Operands.push_back(
+      AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
 
-  if (Spec != A64SE::LSL && Spec != A64SE::LSR && Spec != A64SE::ASR &&
-      Spec != A64SE::ROR && Spec != A64SE::MSL) {
-    // The shift amount can be omitted for the extending versions, but not real
-    // shifts:
-    //     add x0, x0, x0, uxtb
-    // is valid, and equivalent to
-    //     add x0, x0, x0, uxtb #0
-
-    if (Parser.getTok().is(AsmToken::Comma) ||
-        Parser.getTok().is(AsmToken::EndOfStatement) ||
-        Parser.getTok().is(AsmToken::RBrac)) {
-      Operands.push_back(AArch64Operand::CreateShiftExtend(Spec, 0, true,
-                                                           S, E));
-      return MatchOperand_Success;
+  // A small number of instructions (FMOVXDhighr, for example) have "[1]"
+  // as a string token in the instruction itself.
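+  // (E.g. "fmov x0, v0.d[1]": the "[1]" is matched as literal tokens here
+  // rather than as a vector index operand.)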
+  if (getLexer().getKind() == AsmToken::LBrac) {
+    SMLoc LBracS = getLoc();
+    Parser.Lex();
+    const AsmToken &Tok = Parser.getTok();
+    if (Tok.is(AsmToken::Integer)) {
+      SMLoc IntS = getLoc();
+      int64_t Val = Tok.getIntVal();
+      if (Val == 1) {
+        Parser.Lex();
+        if (getLexer().getKind() == AsmToken::RBrac) {
+          SMLoc RBracS = getLoc();
+          Parser.Lex();
+          Operands.push_back(
+              AArch64Operand::CreateToken("[", false, LBracS, getContext()));
+          Operands.push_back(
+              AArch64Operand::CreateToken("1", false, IntS, getContext()));
+          Operands.push_back(
+              AArch64Operand::CreateToken("]", false, RBracS, getContext()));
+          return false;
+        }
+      }
     }
   }
 
-  // Eat # at beginning of immediate
-  if (!Parser.getTok().is(AsmToken::Hash)) {
-    Error(Parser.getTok().getLoc(),
-          "expected #imm after shift specifier");
-    return MatchOperand_ParseFail;
-  }
-  Parser.Lex();
+  return false;
+}
 
-  // Make sure we do actually have a number
-  if (!Parser.getTok().is(AsmToken::Integer)) {
-    Error(Parser.getTok().getLoc(),
-          "expected integer shift amount");
-    return MatchOperand_ParseFail;
-  }
-  unsigned Amount = Parser.getTok().getIntVal();
-  Parser.Lex();
-  E = Parser.getTok().getLoc();
+bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
+  bool HasELFModifier = false;
+  AArch64MCExpr::VariantKind RefKind;
 
-  Operands.push_back(AArch64Operand::CreateShiftExtend(Spec, Amount, false,
-                                                       S, E));
+  if (Parser.getTok().is(AsmToken::Colon)) {
+    Parser.Lex(); // Eat ':'
+    HasELFModifier = true;
 
-  return MatchOperand_Success;
-}
+    if (Parser.getTok().isNot(AsmToken::Identifier)) {
+      Error(Parser.getTok().getLoc(),
+            "expect relocation specifier in operand after ':'");
+      return true;
+    }
 
-/// Try to parse a vector register token, If it is a vector register,
-/// the token is eaten and return true. Otherwise return false.
-bool AArch64AsmParser::TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc,
-                                      StringRef &Layout, SMLoc &LayoutLoc) {
-  bool IsVector = true;
-
-  if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc))
-    IsVector = false;
-  else if (!AArch64MCRegisterClasses[AArch64::FPR64RegClassID]
-                .contains(RegNum) &&
-           !AArch64MCRegisterClasses[AArch64::FPR128RegClassID]
-                .contains(RegNum))
-    IsVector = false;
-  else if (Layout.size() == 0)
-    IsVector = false;
-
-  if (!IsVector)
-    Error(Parser.getTok().getLoc(), "expected vector type register");
-
-  Parser.Lex(); // Eat this token.
-  return IsVector;
-}
+    std::string LowerCase = Parser.getTok().getIdentifier().lower();
+    RefKind = StringSwitch<AArch64MCExpr::VariantKind>(LowerCase)
+                  .Case("lo12", AArch64MCExpr::VK_LO12)
+                  .Case("abs_g3", AArch64MCExpr::VK_ABS_G3)
+                  .Case("abs_g2", AArch64MCExpr::VK_ABS_G2)
+                  .Case("abs_g2_s", AArch64MCExpr::VK_ABS_G2_S)
+                  .Case("abs_g2_nc", AArch64MCExpr::VK_ABS_G2_NC)
+                  .Case("abs_g1", AArch64MCExpr::VK_ABS_G1)
+                  .Case("abs_g1_s", AArch64MCExpr::VK_ABS_G1_S)
+                  .Case("abs_g1_nc", AArch64MCExpr::VK_ABS_G1_NC)
+                  .Case("abs_g0", AArch64MCExpr::VK_ABS_G0)
+                  .Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S)
+                  .Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC)
+                  .Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2)
+                  .Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1)
+                  .Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC)
+                  .Case("dtprel_g0", AArch64MCExpr::VK_DTPREL_G0)
+                  .Case("dtprel_g0_nc", AArch64MCExpr::VK_DTPREL_G0_NC)
+                  .Case("dtprel_hi12", AArch64MCExpr::VK_DTPREL_HI12)
+                  .Case("dtprel_lo12", AArch64MCExpr::VK_DTPREL_LO12)
+                  .Case("dtprel_lo12_nc", AArch64MCExpr::VK_DTPREL_LO12_NC)
+                  .Case("tprel_g2", AArch64MCExpr::VK_TPREL_G2)
+                  .Case("tprel_g1", AArch64MCExpr::VK_TPREL_G1)
+                  .Case("tprel_g1_nc", AArch64MCExpr::VK_TPREL_G1_NC)
+                  .Case("tprel_g0", AArch64MCExpr::VK_TPREL_G0)
+                  .Case("tprel_g0_nc", AArch64MCExpr::VK_TPREL_G0_NC)
+                  .Case("tprel_hi12", AArch64MCExpr::VK_TPREL_HI12)
+                  .Case("tprel_lo12", AArch64MCExpr::VK_TPREL_LO12)
+                  .Case("tprel_lo12_nc", AArch64MCExpr::VK_TPREL_LO12_NC)
+                  .Case("tlsdesc_lo12", AArch64MCExpr::VK_TLSDESC_LO12)
+                  .Case("got", AArch64MCExpr::VK_GOT_PAGE)
+                  .Case("got_lo12", AArch64MCExpr::VK_GOT_LO12)
+                  .Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE)
+                  .Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC)
+                  .Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1)
+                  .Case("gottprel_g0_nc", AArch64MCExpr::VK_GOTTPREL_G0_NC)
+                  .Case("tlsdesc", AArch64MCExpr::VK_TLSDESC_PAGE)
+                  .Default(AArch64MCExpr::VK_INVALID);
+
+    if (RefKind == AArch64MCExpr::VK_INVALID) {
+      Error(Parser.getTok().getLoc(),
+            "expect relocation specifier in operand after ':'");
+      return true;
+    }
+    Parser.Lex(); // Eat identifier
 
-// A vector list contains 1-4 consecutive registers.
-// Now there are two kinds of vector list when number of vector > 1:
-//   (1) {Vn.layout, Vn+1.layout, ... , Vm.layout}
-//   (2) {Vn.layout - Vm.layout}
-// If the layout is like .b/.h/.s/.d, also parse the lane.
-AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList(
-    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
-  if (Parser.getTok().isNot(AsmToken::LCurly)) {
-    Error(Parser.getTok().getLoc(), "'{' expected");
-    return MatchOperand_ParseFail;
+    if (Parser.getTok().isNot(AsmToken::Colon)) {
+      Error(Parser.getTok().getLoc(), "expect ':' after relocation specifier");
+      return true;
+    }
+    Parser.Lex(); // Eat ':'
   }
-  SMLoc SLoc = Parser.getTok().getLoc();
-  Parser.Lex(); // Eat '{' token.
 
-  unsigned Reg, Count = 1;
-  StringRef LayoutStr;
-  SMLoc RegEndLoc, LayoutLoc;
-  if (!TryParseVector(Reg, RegEndLoc, LayoutStr, LayoutLoc))
-    return MatchOperand_ParseFail;
+  if (getParser().parseExpression(ImmVal))
+    return true;
+
+  if (HasELFModifier)
+    ImmVal = AArch64MCExpr::Create(ImmVal, RefKind, getContext());
+
+  return false;
+}
+
+/// parseVectorList - Parse a vector list operand for AdvSIMD instructions.
+bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
+  assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket");
+  SMLoc S = getLoc();
+  Parser.Lex(); // Eat left bracket token.
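+  // Lists take the forms "{ v0.8b, v1.8b, v2.8b }" or "{ v0.8b - v2.8b }";
+  // every register in the list must carry the same kind suffix (checked below).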
+  StringRef Kind;
+  int64_t FirstReg = tryMatchVectorRegister(Kind, true);
+  if (FirstReg == -1)
+    return true;
+  int64_t PrevReg = FirstReg;
+  unsigned Count = 1;
 
   if (Parser.getTok().is(AsmToken::Minus)) {
     Parser.Lex(); // Eat the minus.
 
-    unsigned Reg2;
-    StringRef LayoutStr2;
-    SMLoc RegEndLoc2, LayoutLoc2;
-    SMLoc RegLoc2 = Parser.getTok().getLoc();
+    SMLoc Loc = getLoc();
+    StringRef NextKind;
+    int64_t Reg = tryMatchVectorRegister(NextKind, true);
+    if (Reg == -1)
+      return true;
+    // Any kind suffixes must match on all regs in the list.
+    if (Kind != NextKind)
+      return Error(Loc, "mismatched register size suffix");
 
-    if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2))
-      return MatchOperand_ParseFail;
-    unsigned Space = (Reg < Reg2) ? (Reg2 - Reg) : (Reg2 + 32 - Reg);
+    unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg);
 
-    if (LayoutStr != LayoutStr2) {
-      Error(LayoutLoc2, "expected the same vector layout");
-      return MatchOperand_ParseFail;
-    }
     if (Space == 0 || Space > 3) {
-      Error(RegLoc2, "invalid number of vectors");
-      return MatchOperand_ParseFail;
+      return Error(Loc, "invalid number of vectors");
     }
 
     Count += Space;
-  } else {
-    unsigned LastReg = Reg;
+  }
+  else {
     while (Parser.getTok().is(AsmToken::Comma)) {
-      Parser.Lex(); // Eat the comma.
-      unsigned Reg2;
-      StringRef LayoutStr2;
-      SMLoc RegEndLoc2, LayoutLoc2;
-      SMLoc RegLoc2 = Parser.getTok().getLoc();
+      Parser.Lex(); // Eat the comma token.
 
-      if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2))
-        return MatchOperand_ParseFail;
-      unsigned Space = (LastReg < Reg2) ? (Reg2 - LastReg)
-                                        : (Reg2 + 32 - LastReg);
-      Count++;
-
-      // The space between two vectors should be 1. And they should have the same layout.
-      // Total count shouldn't be great than 4
-      if (Space != 1) {
-        Error(RegLoc2, "invalid space between two vectors");
-        return MatchOperand_ParseFail;
-      }
-      if (LayoutStr != LayoutStr2) {
-        Error(LayoutLoc2, "expected the same vector layout");
-        return MatchOperand_ParseFail;
-      }
-      if (Count > 4) {
-        Error(RegLoc2, "invalid number of vectors");
-        return MatchOperand_ParseFail;
-      }
+      SMLoc Loc = getLoc();
+      StringRef NextKind;
+      int64_t Reg = tryMatchVectorRegister(NextKind, true);
+      if (Reg == -1)
+        return true;
+      // Any kind suffixes must match on all regs in the list.
+      if (Kind != NextKind)
+        return Error(Loc, "mismatched register size suffix");
+
+      // Registers must be incremental (with wraparound at 31)
+      if (getContext().getRegisterInfo()->getEncodingValue(Reg) !=
+          (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32)
+        return Error(Loc, "registers must be sequential");
 
-      LastReg = Reg2;
+      PrevReg = Reg;
+      ++Count;
     }
   }
 
-  if (Parser.getTok().isNot(AsmToken::RCurly)) {
-    Error(Parser.getTok().getLoc(), "'}' expected");
-    return MatchOperand_ParseFail;
-  }
-  SMLoc ELoc = Parser.getTok().getLoc();
-  Parser.Lex(); // Eat '}' token.
+  if (Parser.getTok().isNot(AsmToken::RCurly))
+    return Error(getLoc(), "'}' expected");
+  Parser.Lex(); // Eat the '}' token.
 
-  A64Layout::VectorLayout Layout = A64StringToVectorLayout(LayoutStr);
-  if (Count > 1) { // If count > 1, create vector list using super register.
-    bool IsVec64 = (Layout < A64Layout::VL_16B);
-    static unsigned SupRegIDs[3][2] = {
-      { AArch64::QPairRegClassID, AArch64::DPairRegClassID },
-      { AArch64::QTripleRegClassID, AArch64::DTripleRegClassID },
-      { AArch64::QQuadRegClassID, AArch64::DQuadRegClassID }
-    };
-    unsigned SupRegID = SupRegIDs[Count - 2][static_cast<unsigned>(IsVec64)];
-    unsigned Sub0 = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
-    const MCRegisterInfo *MRI = getContext().getRegisterInfo();
-    Reg = MRI->getMatchingSuperReg(Reg, Sub0,
-                                   &AArch64MCRegisterClasses[SupRegID]);
-  }
-  Operands.push_back(
-      AArch64Operand::CreateVectorList(Reg, Count, Layout, SLoc, ELoc));
+  if (Count > 4)
+    return Error(S, "invalid number of vectors");
+
+  unsigned NumElements = 0;
+  char ElementKind = 0;
+  if (!Kind.empty())
+    parseValidVectorKind(Kind, NumElements, ElementKind);
+
+  Operands.push_back(AArch64Operand::CreateVectorList(
+      FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext()));
+
+  // If there is an index specifier following the list, parse that too.
   if (Parser.getTok().is(AsmToken::LBrac)) {
-    uint32_t NumLanes = 0;
-    switch(Layout) {
-    case A64Layout::VL_B : NumLanes = 16; break;
-    case A64Layout::VL_H : NumLanes = 8; break;
-    case A64Layout::VL_S : NumLanes = 4; break;
-    case A64Layout::VL_D : NumLanes = 2; break;
-    default:
-      SMLoc Loc = getLexer().getLoc();
-      Error(Loc, "expected comma before next operand");
-      return MatchOperand_ParseFail;
+    SMLoc SIdx = getLoc();
+    Parser.Lex(); // Eat left bracket token.
+
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      TokError("immediate value expected for vector index");
+      return false;
     }
-    return ParseNEONLane(Operands, NumLanes);
-  } else {
-    return MatchOperand_Success;
+
+    SMLoc E = getLoc();
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      Error(E, "']' expected");
+      return false;
+    }
+
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
+                                                         E, getContext()));
   }
+  return false;
 }
 
-// FIXME: We would really like to be able to tablegen'erate this.
-bool AArch64AsmParser::
-validateInstruction(MCInst &Inst,
-                    const SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
-  switch (Inst.getOpcode()) {
-  case AArch64::BFIwwii:
-  case AArch64::BFIxxii:
-  case AArch64::SBFIZwwii:
-  case AArch64::SBFIZxxii:
-  case AArch64::UBFIZwwii:
-  case AArch64::UBFIZxxii: {
-    unsigned ImmOps = Inst.getNumOperands() - 2;
-    int64_t ImmR = Inst.getOperand(ImmOps).getImm();
-    int64_t ImmS = Inst.getOperand(ImmOps+1).getImm();
-
-    if (ImmR != 0 && ImmS >= ImmR) {
-      return Error(Operands[4]->getStartLoc(),
-                   "requested insert overflows register");
-    }
-    return false;
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  unsigned RegNum = MatchRegisterName(Tok.getString().lower());
+
+  MCContext &Ctx = getContext();
+  const MCRegisterInfo *RI = Ctx.getRegisterInfo();
+  if (!RI->getRegClass(AArch64::GPR64spRegClassID).contains(RegNum))
+    return MatchOperand_NoMatch;
+
+  SMLoc S = getLoc();
+  Parser.Lex(); // Eat register
+
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Operands.push_back(
+        AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+    return MatchOperand_Success;
   }
-  case AArch64::BFXILwwii:
-  case AArch64::BFXILxxii:
-  case AArch64::SBFXwwii:
-  case AArch64::SBFXxxii:
-  case AArch64::UBFXwwii:
-  case AArch64::UBFXxxii: {
-    unsigned ImmOps = Inst.getNumOperands() - 2;
-    int64_t ImmR = Inst.getOperand(ImmOps).getImm();
-    int64_t ImmS = Inst.getOperand(ImmOps+1).getImm();
-    int64_t RegWidth = 0;
-    switch (Inst.getOpcode()) {
-    case AArch64::SBFXxxii: case AArch64::UBFXxxii: case AArch64::BFXILxxii:
-      RegWidth = 64;
-      break;
-    case AArch64::SBFXwwii: case AArch64::UBFXwwii: case AArch64::BFXILwwii:
-      RegWidth = 32;
-      break;
-    }
+  Parser.Lex(); // Eat comma.
+
+  if (Parser.getTok().is(AsmToken::Hash))
+    Parser.Lex(); // Eat hash
+
+  if (Parser.getTok().isNot(AsmToken::Integer)) {
+    Error(getLoc(), "index must be absent or #0");
+    return MatchOperand_ParseFail;
+  }
+
+  const MCExpr *ImmVal;
+  if (Parser.parseExpression(ImmVal) || !isa<MCConstantExpr>(ImmVal) ||
+      cast<MCConstantExpr>(ImmVal)->getValue() != 0) {
+    Error(getLoc(), "index must be absent or #0");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(
+      AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+  return MatchOperand_Success;
+}
 
-    if (ImmS >= RegWidth || ImmS < ImmR) {
-      return Error(Operands[4]->getStartLoc(),
-                   "requested extract overflows register");
-    }
+/// parseOperand - Parse an AArch64 instruction operand. For now this parses
+/// the operand regardless of the mnemonic.
+bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
+                                    bool invertCondCode) {
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
     return false;
-  }
-  case AArch64::ICix: {
-    int64_t ImmVal = Inst.getOperand(0).getImm();
-    A64IC::ICValues ICOp = static_cast<A64IC::ICValues>(ImmVal);
-    if (!A64IC::NeedsRegister(ICOp)) {
-      return Error(Operands[1]->getStartLoc(),
-                   "specified IC op does not use a register");
-    }
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
+  // Nothing custom, so do general case parsing.
+  SMLoc S, E;
+  switch (getLexer().getKind()) {
+  default: {
+    SMLoc S = getLoc();
+    const MCExpr *Expr;
+    if (parseSymbolicImmVal(Expr))
+      return Error(S, "invalid operand");
+
+    SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
     return false;
   }
-  case AArch64::ICi: {
-    int64_t ImmVal = Inst.getOperand(0).getImm();
-    A64IC::ICValues ICOp = static_cast<A64IC::ICValues>(ImmVal);
-    if (A64IC::NeedsRegister(ICOp)) {
-      return Error(Operands[1]->getStartLoc(),
-                   "specified IC op requires a register");
-    }
-    return false;
+  case AsmToken::LBrac: {
+    SMLoc Loc = Parser.getTok().getLoc();
+    Operands.push_back(AArch64Operand::CreateToken("[", false, Loc,
+                                                   getContext()));
+    Parser.Lex(); // Eat '['
+
+    // There's no comma after a '[', so we can parse the next operand
+    // immediately.
+    return parseOperand(Operands, false, false);
   }
-  case AArch64::TLBIix: {
-    int64_t ImmVal = Inst.getOperand(0).getImm();
-    A64TLBI::TLBIValues TLBIOp = static_cast<A64TLBI::TLBIValues>(ImmVal);
-    if (!A64TLBI::NeedsRegister(TLBIOp)) {
-      return Error(Operands[1]->getStartLoc(),
-                   "specified TLBI op does not use a register");
-    }
+  case AsmToken::LCurly:
+    return parseVectorList(Operands);
+  case AsmToken::Identifier: {
+    // If we're expecting a Condition Code operand, then just parse that.
+    if (isCondCode)
+      return parseCondCode(Operands, invertCondCode);
+
+    // If it's a register name, parse it.
+    if (!parseRegister(Operands))
+      return false;
+
+    // This could be an optional "shift" or "extend" operand.
+    OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands);
+    // We can only continue if no tokens were eaten.
+    if (GotShift != MatchOperand_NoMatch)
+      return GotShift;
+
+    // This was not a register so parse other operands that start with an
+    // identifier (like labels) as expressions and create them as immediates.
+    const MCExpr *IdVal;
+    S = getLoc();
+    if (getParser().parseExpression(IdVal))
+      return true;
+
+    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext()));
     return false;
   }
-  case AArch64::TLBIi: {
-    int64_t ImmVal = Inst.getOperand(0).getImm();
-    A64TLBI::TLBIValues TLBIOp = static_cast<A64TLBI::TLBIValues>(ImmVal);
-    if (A64TLBI::NeedsRegister(TLBIOp)) {
-      return Error(Operands[1]->getStartLoc(),
-                   "specified TLBI op requires a register");
+  case AsmToken::Integer:
+  case AsmToken::Real:
+  case AsmToken::Hash: {
+    // #42 -> immediate.
+    S = getLoc();
+    if (getLexer().is(AsmToken::Hash))
+      Parser.Lex();
+
+    // Parse a negative sign
+    bool isNegative = false;
+    if (Parser.getTok().is(AsmToken::Minus)) {
+      isNegative = true;
+      // We need to consume this token only when we have a Real, otherwise
+      // we let parseSymbolicImmVal take care of it
+      if (Parser.getLexer().peekTok().is(AsmToken::Real))
+        Parser.Lex();
+    }
+
+    // The only Real that should come through here is a literal #0.0 for
+    // the fcmp[e] r, #0.0 instructions. They expect raw token operands,
+    // so convert the value.
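+    // (That is why "#0" and ".0" are pushed as two literal tokens below,
+    // mirroring how the matcher expects the instruction to be written.)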
+    const AsmToken &Tok = Parser.getTok();
+    if (Tok.is(AsmToken::Real)) {
+      APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+      uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+      if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" &&
+          Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" &&
+          Mnemonic != "fcmlt")
+        return TokError("unexpected floating point literal");
+      else if (IntVal != 0 || isNegative)
+        return TokError("expected floating-point constant #0.0");
+      Parser.Lex(); // Eat the token.
+
+      Operands.push_back(
+          AArch64Operand::CreateToken("#0", false, S, getContext()));
+      Operands.push_back(
+          AArch64Operand::CreateToken(".0", false, S, getContext()));
+      return false;
     }
+
+    const MCExpr *ImmVal;
+    if (parseSymbolicImmVal(ImmVal))
+      return true;
+
+    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext()));
     return false;
   }
   }
-
-  return false;
 }
-
-// Parses the instruction *together with* all operands, appending each parsed
-// operand to the "Operands" list
+/// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its
+/// operands.
 bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
                                         StringRef Name, SMLoc NameLoc,
-                                        SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
-  StringRef PatchedName = StringSwitch<StringRef>(Name.lower())
-                              .Case("beq", "b.eq")
-                              .Case("bne", "b.ne")
-                              .Case("bhs", "b.hs")
-                              .Case("bcs", "b.cs")
-                              .Case("blo", "b.lo")
-                              .Case("bcc", "b.cc")
-                              .Case("bmi", "b.mi")
-                              .Case("bpl", "b.pl")
-                              .Case("bvs", "b.vs")
-                              .Case("bvc", "b.vc")
-                              .Case("bhi", "b.hi")
-                              .Case("bls", "b.ls")
-                              .Case("bge", "b.ge")
-                              .Case("blt", "b.lt")
-                              .Case("bgt", "b.gt")
-                              .Case("ble", "b.le")
-                              .Case("bal", "b.al")
-                              .Case("bnv", "b.nv")
-                              .Default(Name);
-
-  size_t CondCodePos = PatchedName.find('.');
-
-  StringRef Mnemonic = PatchedName.substr(0, CondCodePos);
-  Operands.push_back(AArch64Operand::CreateToken(Mnemonic, NameLoc));
-
-  if (CondCodePos != StringRef::npos) {
-    // We have a condition code
-    SMLoc S = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos + 1);
-    StringRef CondStr = PatchedName.substr(CondCodePos + 1, StringRef::npos);
-    A64CC::CondCodes Code;
-
-    Code = A64StringToCondCode(CondStr);
-
-    if (Code == A64CC::Invalid) {
-      Error(S, "invalid condition code");
+                                        OperandVector &Operands) {
+  Name = StringSwitch<StringRef>(Name.lower())
+             .Case("beq", "b.eq")
+             .Case("bne", "b.ne")
+             .Case("bhs", "b.hs")
+             .Case("bcs", "b.cs")
+             .Case("blo", "b.lo")
+             .Case("bcc", "b.cc")
+             .Case("bmi", "b.mi")
+             .Case("bpl", "b.pl")
+             .Case("bvs", "b.vs")
+             .Case("bvc", "b.vc")
+             .Case("bhi", "b.hi")
+             .Case("bls", "b.ls")
+             .Case("bge", "b.ge")
+             .Case("blt", "b.lt")
+             .Case("bgt", "b.gt")
+             .Case("ble", "b.le")
+             .Case("bal", "b.al")
+             .Case("bnv", "b.nv")
+             .Default(Name);
+
+  // Create the leading tokens for the mnemonic, split by '.' characters.
+  size_t Start = 0, Next = Name.find('.');
+  StringRef Head = Name.slice(Start, Next);
+
+  // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction.
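+  // (They are handled by parseSysAlias above, which rewrites them into an
+  // explicit SYS MCInst.)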
+ if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi") { + bool IsError = parseSysAlias(Head, NameLoc, Operands); + if (IsError && getLexer().isNot(AsmToken::EndOfStatement)) Parser.eatToEndOfStatement(); - return true; - } - - SMLoc DotL = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos); - - Operands.push_back(AArch64Operand::CreateToken(".", DotL)); - SMLoc E = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos + 3); - Operands.push_back(AArch64Operand::CreateCondCode(Code, S, E)); + return IsError; } - // Now we parse the operands of this instruction + Operands.push_back( + AArch64Operand::CreateToken(Head, false, NameLoc, getContext())); + Mnemonic = Head; + + // Handle condition codes for a branch mnemonic + if (Head == "b" && Next != StringRef::npos) { + Start = Next; + Next = Name.find('.', Start + 1); + Head = Name.slice(Start + 1, Next); + + SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + + (Head.data() - Name.data())); + AArch64CC::CondCode CC = parseCondCodeString(Head); + if (CC == AArch64CC::Invalid) + return Error(SuffixLoc, "invalid condition code"); + Operands.push_back( + AArch64Operand::CreateToken(".", true, SuffixLoc, getContext())); + Operands.push_back( + AArch64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext())); + } + + // Add the remaining tokens in the mnemonic. + while (Next != StringRef::npos) { + Start = Next; + Next = Name.find('.', Start + 1); + Head = Name.slice(Start, Next); + SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + + (Head.data() - Name.data()) + 1); + Operands.push_back( + AArch64Operand::CreateToken(Head, true, SuffixLoc, getContext())); + } + + // Conditional compare instructions have a Condition Code operand, which needs + // to be parsed and an immediate operand created. + bool condCodeFourthOperand = + (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" || + Head == "fccmpe" || Head == "fcsel" || Head == "csel" || + Head == "csinc" || Head == "csinv" || Head == "csneg"); + + // These instructions are aliases to some of the conditional select + // instructions. However, the condition code is inverted in the aliased + // instruction. + // + // FIXME: Is this the correct way to handle these? Or should the parser + // generate the aliased instructions directly? + bool condCodeSecondOperand = (Head == "cset" || Head == "csetm"); + bool condCodeThirdOperand = + (Head == "cinc" || Head == "cinv" || Head == "cneg"); + + // Read the remaining operands. if (getLexer().isNot(AsmToken::EndOfStatement)) { // Read the first operand. - if (ParseOperand(Operands, Mnemonic)) { + if (parseOperand(Operands, false, false)) { Parser.eatToEndOfStatement(); return true; } + unsigned N = 2; while (getLexer().is(AsmToken::Comma)) { - Parser.Lex(); // Eat the comma. + Parser.Lex(); // Eat the comma. // Parse and remember the operand. - if (ParseOperand(Operands, Mnemonic)) { + if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) || + (N == 3 && condCodeThirdOperand) || + (N == 2 && condCodeSecondOperand), + condCodeSecondOperand || condCodeThirdOperand)) { Parser.eatToEndOfStatement(); return true; } - // After successfully parsing some operands there are two special cases to // consider (i.e. notional operands not separated by commas). Both are due // to memory specifiers: @@ -2321,52 +3121,716 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, // in the given context! 
if (Parser.getTok().is(AsmToken::RBrac)) { SMLoc Loc = Parser.getTok().getLoc(); - Operands.push_back(AArch64Operand::CreateToken("]", Loc)); + Operands.push_back(AArch64Operand::CreateToken("]", false, Loc, + getContext())); Parser.Lex(); } if (Parser.getTok().is(AsmToken::Exclaim)) { SMLoc Loc = Parser.getTok().getLoc(); - Operands.push_back(AArch64Operand::CreateToken("!", Loc)); + Operands.push_back(AArch64Operand::CreateToken("!", false, Loc, + getContext())); Parser.Lex(); } + + ++N; } } if (getLexer().isNot(AsmToken::EndOfStatement)) { - SMLoc Loc = getLexer().getLoc(); + SMLoc Loc = Parser.getTok().getLoc(); Parser.eatToEndOfStatement(); - return Error(Loc, "expected comma before next operand"); + return Error(Loc, "unexpected token in argument list"); } - // Eat the EndOfStatement - Parser.Lex(); - + Parser.Lex(); // Consume the EndOfStatement return false; } +// FIXME: This entire function is a giant hack to provide us with decent +// operand range validation/diagnostics until TableGen/MC can be extended +// to support autogeneration of this kind of validation. +bool AArch64AsmParser::validateInstruction(MCInst &Inst, + SmallVectorImpl &Loc) { + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + // Check for indexed addressing modes w/ the base register being the + // same as a destination/source register or pair load where + // the Rt == Rt2. All of those are undefined behaviour. + switch (Inst.getOpcode()) { + case AArch64::LDPSWpre: + case AArch64::LDPWpost: + case AArch64::LDPWpre: + case AArch64::LDPXpost: + case AArch64::LDPXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + unsigned Rn = Inst.getOperand(3).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable LDP instruction, writeback base " + "is also a destination"); + if (RI->isSubRegisterEq(Rn, Rt2)) + return Error(Loc[1], "unpredictable LDP instruction, writeback base " + "is also a destination"); + // FALLTHROUGH + } + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPWi: + case AArch64::LDPXi: { + unsigned Rt = Inst.getOperand(0).getReg(); + unsigned Rt2 = Inst.getOperand(1).getReg(); + if (Rt == Rt2) + return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); + break; + } + case AArch64::LDPDpost: + case AArch64::LDPDpre: + case AArch64::LDPQpost: + case AArch64::LDPQpre: + case AArch64::LDPSpost: + case AArch64::LDPSpre: + case AArch64::LDPSWpost: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + if (Rt == Rt2) + return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); + break; + } + case AArch64::STPDpost: + case AArch64::STPDpre: + case AArch64::STPQpost: + case AArch64::STPQpre: + case AArch64::STPSpost: + case AArch64::STPSpre: + case AArch64::STPWpost: + case AArch64::STPWpre: + case AArch64::STPXpost: + case AArch64::STPXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + unsigned Rn = Inst.getOperand(3).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable STP instruction, writeback base " + "is also a source"); + if (RI->isSubRegisterEq(Rn, Rt2)) + return Error(Loc[1], "unpredictable STP instruction, writeback base " + "is also a source"); + break; + } + case AArch64::LDRBBpre: + case AArch64::LDRBpre: + case AArch64::LDRHHpre: + case AArch64::LDRHpre: + case AArch64::LDRSBWpre: + case AArch64::LDRSBXpre: + case 
AArch64::LDRSHWpre: + case AArch64::LDRSHXpre: + case AArch64::LDRSWpre: + case AArch64::LDRWpre: + case AArch64::LDRXpre: + case AArch64::LDRBBpost: + case AArch64::LDRBpost: + case AArch64::LDRHHpost: + case AArch64::LDRHpost: + case AArch64::LDRSBWpost: + case AArch64::LDRSBXpost: + case AArch64::LDRSHWpost: + case AArch64::LDRSHXpost: + case AArch64::LDRSWpost: + case AArch64::LDRWpost: + case AArch64::LDRXpost: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rn = Inst.getOperand(2).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable LDR instruction, writeback base " + "is also a source"); + break; + } + case AArch64::STRBBpost: + case AArch64::STRBpost: + case AArch64::STRHHpost: + case AArch64::STRHpost: + case AArch64::STRWpost: + case AArch64::STRXpost: + case AArch64::STRBBpre: + case AArch64::STRBpre: + case AArch64::STRHHpre: + case AArch64::STRHpre: + case AArch64::STRWpre: + case AArch64::STRXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rn = Inst.getOperand(2).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable STR instruction, writeback base " + "is also a source"); + break; + } + } + + // Now check immediate ranges. Separate from the above as there is overlap + // in the instructions being checked and this keeps the nested conditionals + // to a minimum. + switch (Inst.getOpcode()) { + case AArch64::ADDSWri: + case AArch64::ADDSXri: + case AArch64::ADDWri: + case AArch64::ADDXri: + case AArch64::SUBSWri: + case AArch64::SUBSXri: + case AArch64::SUBWri: + case AArch64::SUBXri: { + // Annoyingly we can't do this in the isAddSubImm predicate, so there is + // some slight duplication here. + if (Inst.getOperand(2).isExpr()) { + const MCExpr *Expr = Inst.getOperand(2).getExpr(); + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { + return Error(Loc[2], "invalid immediate expression"); + } + + // Only allow these with ADDXri. 
+ if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) && + Inst.getOpcode() == AArch64::ADDXri) + return false; + + // Only allow these with ADDXri/ADDWri + if ((ELFRefKind == AArch64MCExpr::VK_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) && + (Inst.getOpcode() == AArch64::ADDXri || + Inst.getOpcode() == AArch64::ADDWri)) + return false; + + // Don't allow expressions in the immediate field otherwise + return Error(Loc[2], "invalid immediate expression"); + } + return false; + } + default: + return false; + } +} + +bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { + switch (ErrCode) { + case Match_MissingFeature: + return Error(Loc, + "instruction requires a CPU feature not currently enabled"); + case Match_InvalidOperand: + return Error(Loc, "invalid operand for instruction"); + case Match_InvalidSuffix: + return Error(Loc, "invalid type suffix for instruction"); + case Match_InvalidCondCode: + return Error(Loc, "expected AArch64 condition code"); + case Match_AddSubRegExtendSmall: + return Error(Loc, + "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]"); + case Match_AddSubRegExtendLarge: + return Error(Loc, + "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]"); + case Match_AddSubSecondSource: + return Error(Loc, + "expected compatible register, symbol or integer in range [0, 4095]"); + case Match_LogicalSecondSource: + return Error(Loc, "expected compatible register or logical immediate"); + case Match_InvalidMovImm32Shift: + return Error(Loc, "expected 'lsl' with optional integer 0 or 16"); + case Match_InvalidMovImm64Shift: + return Error(Loc, "expected 'lsl' with optional integer 0, 16, 32 or 48"); + case Match_AddSubRegShift32: + return Error(Loc, + "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]"); + case Match_AddSubRegShift64: + return Error(Loc, + "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]"); + case Match_InvalidFPImm: + return Error(Loc, + "expected compatible register or floating-point constant"); + case Match_InvalidMemoryIndexedSImm9: + return Error(Loc, "index must be an integer in range [-256, 255]."); + case Match_InvalidMemoryIndexed4SImm7: + return Error(Loc, "index must be a multiple of 4 in range [-256, 252]."); + case Match_InvalidMemoryIndexed8SImm7: + return Error(Loc, "index must be a multiple of 8 in range [-512, 504]."); + case Match_InvalidMemoryIndexed16SImm7: + return Error(Loc, "index must be a multiple of 16 in range [-1024, 1008]."); + case Match_InvalidMemoryWExtend8: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0"); + case Match_InvalidMemoryWExtend16: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1"); + case Match_InvalidMemoryWExtend32: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2"); + case Match_InvalidMemoryWExtend64: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3"); + case Match_InvalidMemoryWExtend128: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #4"); + case Match_InvalidMemoryXExtend8: + return Error(Loc, + "expected 
'lsl' or 'sxtx' with optional shift of #0"); + case Match_InvalidMemoryXExtend16: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #1"); + case Match_InvalidMemoryXExtend32: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #2"); + case Match_InvalidMemoryXExtend64: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #3"); + case Match_InvalidMemoryXExtend128: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #4"); + case Match_InvalidMemoryIndexed1: + return Error(Loc, "index must be an integer in range [0, 4095]."); + case Match_InvalidMemoryIndexed2: + return Error(Loc, "index must be a multiple of 2 in range [0, 8190]."); + case Match_InvalidMemoryIndexed4: + return Error(Loc, "index must be a multiple of 4 in range [0, 16380]."); + case Match_InvalidMemoryIndexed8: + return Error(Loc, "index must be a multiple of 8 in range [0, 32760]."); + case Match_InvalidMemoryIndexed16: + return Error(Loc, "index must be a multiple of 16 in range [0, 65520]."); + case Match_InvalidImm0_7: + return Error(Loc, "immediate must be an integer in range [0, 7]."); + case Match_InvalidImm0_15: + return Error(Loc, "immediate must be an integer in range [0, 15]."); + case Match_InvalidImm0_31: + return Error(Loc, "immediate must be an integer in range [0, 31]."); + case Match_InvalidImm0_63: + return Error(Loc, "immediate must be an integer in range [0, 63]."); + case Match_InvalidImm0_127: + return Error(Loc, "immediate must be an integer in range [0, 127]."); + case Match_InvalidImm0_65535: + return Error(Loc, "immediate must be an integer in range [0, 65535]."); + case Match_InvalidImm1_8: + return Error(Loc, "immediate must be an integer in range [1, 8]."); + case Match_InvalidImm1_16: + return Error(Loc, "immediate must be an integer in range [1, 16]."); + case Match_InvalidImm1_32: + return Error(Loc, "immediate must be an integer in range [1, 32]."); + case Match_InvalidImm1_64: + return Error(Loc, "immediate must be an integer in range [1, 64]."); + case Match_InvalidIndex1: + return Error(Loc, "expected lane specifier '[1]'"); + case Match_InvalidIndexB: + return Error(Loc, "vector lane must be an integer in range [0, 15]."); + case Match_InvalidIndexH: + return Error(Loc, "vector lane must be an integer in range [0, 7]."); + case Match_InvalidIndexS: + return Error(Loc, "vector lane must be an integer in range [0, 3]."); + case Match_InvalidIndexD: + return Error(Loc, "vector lane must be an integer in range [0, 1]."); + case Match_InvalidLabel: + return Error(Loc, "expected label or encodable integer pc offset"); + case Match_MRS: + return Error(Loc, "expected readable system register"); + case Match_MSR: + return Error(Loc, "expected writable system register or pstate"); + case Match_MnemonicFail: + return Error(Loc, "unrecognized instruction mnemonic"); + default: + assert(0 && "unexpected error code!"); + return Error(Loc, "invalid instruction format"); + } +} + +static const char *getSubtargetFeatureName(unsigned Val); + +bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) { + assert(!Operands.empty() && "Unexpect empty operand list!"); + AArch64Operand *Op = static_cast(Operands[0]); + assert(Op->isToken() && "Leading operand should always be a mnemonic!"); + + StringRef Tok = Op->getToken(); + unsigned NumOperands = Operands.size(); + + if (NumOperands == 4 && Tok 
== "lsl") { + AArch64Operand *Op2 = static_cast(Operands[2]); + AArch64Operand *Op3 = static_cast(Operands[3]); + if (Op2->isReg() && Op3->isImm()) { + const MCConstantExpr *Op3CE = dyn_cast(Op3->getImm()); + if (Op3CE) { + uint64_t Op3Val = Op3CE->getValue(); + uint64_t NewOp3Val = 0; + uint64_t NewOp4Val = 0; + if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains( + Op2->getReg())) { + NewOp3Val = (32 - Op3Val) & 0x1f; + NewOp4Val = 31 - Op3Val; + } else { + NewOp3Val = (64 - Op3Val) & 0x3f; + NewOp4Val = 63 - Op3Val; + } + + const MCExpr *NewOp3 = MCConstantExpr::Create(NewOp3Val, getContext()); + const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext()); + + Operands[0] = AArch64Operand::CreateToken( + "ubfm", false, Op->getStartLoc(), getContext()); + Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3->getStartLoc(), + Op3->getEndLoc(), getContext()); + Operands.push_back(AArch64Operand::CreateImm( + NewOp4, Op3->getStartLoc(), Op3->getEndLoc(), getContext())); + delete Op3; + delete Op; + } + } + } else if (NumOperands == 5) { + // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and + // UBFIZ -> UBFM aliases. + if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") { + AArch64Operand *Op1 = static_cast(Operands[1]); + AArch64Operand *Op3 = static_cast(Operands[3]); + AArch64Operand *Op4 = static_cast(Operands[4]); + + if (Op1->isReg() && Op3->isImm() && Op4->isImm()) { + const MCConstantExpr *Op3CE = dyn_cast(Op3->getImm()); + const MCConstantExpr *Op4CE = dyn_cast(Op4->getImm()); + + if (Op3CE && Op4CE) { + uint64_t Op3Val = Op3CE->getValue(); + uint64_t Op4Val = Op4CE->getValue(); + + uint64_t RegWidth = 0; + if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Op1->getReg())) + RegWidth = 64; + else + RegWidth = 32; + + if (Op3Val >= RegWidth) + return Error(Op3->getStartLoc(), + "expected integer in range [0, 31]"); + if (Op4Val < 1 || Op4Val > RegWidth) + return Error(Op4->getStartLoc(), + "expected integer in range [1, 32]"); + + uint64_t NewOp3Val = 0; + if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains( + Op1->getReg())) + NewOp3Val = (32 - Op3Val) & 0x1f; + else + NewOp3Val = (64 - Op3Val) & 0x3f; + + uint64_t NewOp4Val = Op4Val - 1; + + if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val) + return Error(Op4->getStartLoc(), + "requested insert overflows register"); + + const MCExpr *NewOp3 = + MCConstantExpr::Create(NewOp3Val, getContext()); + const MCExpr *NewOp4 = + MCConstantExpr::Create(NewOp4Val, getContext()); + Operands[3] = AArch64Operand::CreateImm( + NewOp3, Op3->getStartLoc(), Op3->getEndLoc(), getContext()); + Operands[4] = AArch64Operand::CreateImm( + NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext()); + if (Tok == "bfi") + Operands[0] = AArch64Operand::CreateToken( + "bfm", false, Op->getStartLoc(), getContext()); + else if (Tok == "sbfiz") + Operands[0] = AArch64Operand::CreateToken( + "sbfm", false, Op->getStartLoc(), getContext()); + else if (Tok == "ubfiz") + Operands[0] = AArch64Operand::CreateToken( + "ubfm", false, Op->getStartLoc(), getContext()); + else + llvm_unreachable("No valid mnemonic for alias?"); + + delete Op; + delete Op3; + delete Op4; + } + } + + // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and + // UBFX -> UBFM aliases. 
+ } else if (NumOperands == 5 && + (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) { + AArch64Operand *Op1 = static_cast<AArch64Operand *>(Operands[1]); + AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]); + AArch64Operand *Op4 = static_cast<AArch64Operand *>(Operands[4]); + + if (Op1->isReg() && Op3->isImm() && Op4->isImm()) { + const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm()); + const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm()); + + if (Op3CE && Op4CE) { + uint64_t Op3Val = Op3CE->getValue(); + uint64_t Op4Val = Op4CE->getValue(); + + uint64_t RegWidth = 0; + if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Op1->getReg())) + RegWidth = 64; + else + RegWidth = 32; + + if (Op3Val >= RegWidth) + return Error(Op3->getStartLoc(), + "expected integer in range [0, 31]"); + if (Op4Val < 1 || Op4Val > RegWidth) + return Error(Op4->getStartLoc(), + "expected integer in range [1, 32]"); + + uint64_t NewOp4Val = Op3Val + Op4Val - 1; + + if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val) + return Error(Op4->getStartLoc(), + "requested extract overflows register"); + + const MCExpr *NewOp4 = + MCConstantExpr::Create(NewOp4Val, getContext()); + Operands[4] = AArch64Operand::CreateImm( + NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext()); + if (Tok == "bfxil") + Operands[0] = AArch64Operand::CreateToken( + "bfm", false, Op->getStartLoc(), getContext()); + else if (Tok == "sbfx") + Operands[0] = AArch64Operand::CreateToken( + "sbfm", false, Op->getStartLoc(), getContext()); + else if (Tok == "ubfx") + Operands[0] = AArch64Operand::CreateToken( + "ubfm", false, Op->getStartLoc(), getContext()); + else + llvm_unreachable("No valid mnemonic for alias?"); + + delete Op; + delete Op4; + } + } + } + } + // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands. + // InstAlias can't quite handle this since the reg classes aren't + // subclasses. + if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) { + // The source register can be Wn here, but the matcher expects a + // GPR64. Twiddle it here if necessary. + AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[2]); + if (Op->isReg()) { + unsigned Reg = getXRegFromWReg(Op->getReg()); + Operands[2] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(), + Op->getEndLoc(), getContext()); + delete Op; + } + } + // FIXME: Likewise for sxt[bh] with a Xd dst operand + else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) { + AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]); + if (Op->isReg() && + AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Op->getReg())) { + // The source register can be Wn here, but the matcher expects a + // GPR64. Twiddle it here if necessary. + AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[2]); + if (Op->isReg()) { + unsigned Reg = getXRegFromWReg(Op->getReg()); + Operands[2] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(), + Op->getEndLoc(), getContext()); + delete Op; + } + } + } + // FIXME: Likewise for uxt[bh] with a Xd dst operand + else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) { + AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]); + if (Op->isReg() && + AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Op->getReg())) { + // The source register can be Wn here, but the matcher expects a + // GPR32. Twiddle it here if necessary.
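// Illustrative standalone sketch (not from the patch): the W<->X "twiddling"
// mentioned above swaps a register id for its other-width alias so the
// matcher sees the class it expects. With hypothetical parallel id tables
// standing in for AArch64::W0..W3 and AArch64::X0..X3:
static const unsigned WRegs[] = {100, 101, 102, 103};
static const unsigned XRegs[] = {200, 201, 202, 203};

static unsigned getWRegFromXRegSketch(unsigned XReg) {
  for (unsigned I = 0; I != sizeof(XRegs) / sizeof(XRegs[0]); ++I)
    if (XRegs[I] == XReg)
      return WRegs[I]; // same index, 32-bit register class
  return XReg; // not an X register: leave it unchanged
}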
+ AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]); + if (Op->isReg()) { + unsigned Reg = getWRegFromXReg(Op->getReg()); + Operands[1] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(), + Op->getEndLoc(), getContext()); + delete Op; + } + } + } + + // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR. + if (NumOperands == 3 && Tok == "fmov") { + AArch64Operand *RegOp = static_cast<AArch64Operand *>(Operands[1]); + AArch64Operand *ImmOp = static_cast<AArch64Operand *>(Operands[2]); + if (RegOp->isReg() && ImmOp->isFPImm() && + ImmOp->getFPImm() == (unsigned)-1) { + unsigned zreg = + AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains( + RegOp->getReg()) + ? AArch64::WZR + : AArch64::XZR; + Operands[2] = AArch64Operand::CreateReg(zreg, false, Op->getStartLoc(), + Op->getEndLoc(), getContext()); + delete ImmOp; + } + } + + MCInst Inst; + // First try to match against the secondary set of tables containing the + // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2"). + unsigned MatchResult = + MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1); + + // If that fails, try against the alternate table containing long-form NEON: + // "fadd v0.2s, v1.2s, v2.2s" + if (MatchResult != Match_Success) + MatchResult = + MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0); + + switch (MatchResult) { + case Match_Success: { + // Perform range checking and other semantic validations + SmallVector<SMLoc, 8> OperandLocs; + NumOperands = Operands.size(); + for (unsigned i = 1; i < NumOperands; ++i) + OperandLocs.push_back(Operands[i]->getStartLoc()); + if (validateInstruction(Inst, OperandLocs)) + return true; + + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst, STI); + return false; + } + case Match_MissingFeature: { + assert(ErrorInfo && "Unknown missing feature!"); + // Special case the error message for the very common case where only + // a single subtarget feature is missing (neon, e.g.). + std::string Msg = "instruction requires:"; + unsigned Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { + if (ErrorInfo & Mask) { + Msg += " "; + Msg += getSubtargetFeatureName(ErrorInfo & Mask); + } + Mask <<= 1; + } + return Error(IDLoc, Msg); + } + case Match_MnemonicFail: + return showMatchError(IDLoc, MatchResult); + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0U) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + + ErrorLoc = ((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + } + // If the match failed on a suffix token operand, tweak the diagnostic + // accordingly.
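// Illustrative standalone sketch (not from the patch): the
// Match_MissingFeature case above walks the ErrorInfo bitmask bit by bit and
// appends the name of every missing subtarget feature. A self-contained
// version, with featureName() as a hypothetical stand-in for the generated
// getSubtargetFeatureName():
#include <cstdint>
#include <string>

static std::string featureName(uint64_t Bit) {
  return Bit == 1 ? "neon" : "fp-armv8"; // placeholder lookup for illustration
}

static std::string missingFeatureMessage(uint64_t ErrorInfo) {
  std::string Msg = "instruction requires:";
  for (uint64_t Mask = 1; Mask != 0; Mask <<= 1) // each set bit is a feature
    if (ErrorInfo & Mask)
      Msg += " " + featureName(ErrorInfo & Mask);
  return Msg;
}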
+ if (((AArch64Operand *)Operands[ErrorInfo])->isToken() && + ((AArch64Operand *)Operands[ErrorInfo])->isTokenSuffix()) + MatchResult = Match_InvalidSuffix; + + return showMatchError(ErrorLoc, MatchResult); + } + case Match_InvalidMemoryIndexed1: + case Match_InvalidMemoryIndexed2: + case Match_InvalidMemoryIndexed4: + case Match_InvalidMemoryIndexed8: + case Match_InvalidMemoryIndexed16: + case Match_InvalidCondCode: + case Match_AddSubRegExtendSmall: + case Match_AddSubRegExtendLarge: + case Match_AddSubSecondSource: + case Match_LogicalSecondSource: + case Match_AddSubRegShift32: + case Match_AddSubRegShift64: + case Match_InvalidMovImm32Shift: + case Match_InvalidMovImm64Shift: + case Match_InvalidFPImm: + case Match_InvalidMemoryWExtend8: + case Match_InvalidMemoryWExtend16: + case Match_InvalidMemoryWExtend32: + case Match_InvalidMemoryWExtend64: + case Match_InvalidMemoryWExtend128: + case Match_InvalidMemoryXExtend8: + case Match_InvalidMemoryXExtend16: + case Match_InvalidMemoryXExtend32: + case Match_InvalidMemoryXExtend64: + case Match_InvalidMemoryXExtend128: + case Match_InvalidMemoryIndexed4SImm7: + case Match_InvalidMemoryIndexed8SImm7: + case Match_InvalidMemoryIndexed16SImm7: + case Match_InvalidMemoryIndexedSImm9: + case Match_InvalidImm0_7: + case Match_InvalidImm0_15: + case Match_InvalidImm0_31: + case Match_InvalidImm0_63: + case Match_InvalidImm0_127: + case Match_InvalidImm0_65535: + case Match_InvalidImm1_8: + case Match_InvalidImm1_16: + case Match_InvalidImm1_32: + case Match_InvalidImm1_64: + case Match_InvalidIndex1: + case Match_InvalidIndexB: + case Match_InvalidIndexH: + case Match_InvalidIndexS: + case Match_InvalidIndexD: + case Match_InvalidLabel: + case Match_MSR: + case Match_MRS: { + // Any time we get here, there's nothing fancy to do. Just get the + // operand SMLoc and display the diagnostic. + SMLoc ErrorLoc = ((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + return showMatchError(ErrorLoc, MatchResult); + } + } + + llvm_unreachable("Implement any new match types added!"); + return true; +} + +/// ParseDirective parses the arm specific directives bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getIdentifier(); + SMLoc Loc = DirectiveID.getLoc(); if (IDVal == ".hword") - return ParseDirectiveWord(2, DirectiveID.getLoc()); - else if (IDVal == ".word") - return ParseDirectiveWord(4, DirectiveID.getLoc()); - else if (IDVal == ".xword") - return ParseDirectiveWord(8, DirectiveID.getLoc()); - else if (IDVal == ".tlsdesccall") - return ParseDirectiveTLSDescCall(DirectiveID.getLoc()); - - return true; + return parseDirectiveWord(2, Loc); + if (IDVal == ".word") + return parseDirectiveWord(4, Loc); + if (IDVal == ".xword") + return parseDirectiveWord(8, Loc); + if (IDVal == ".tlsdesccall") + return parseDirectiveTLSDescCall(Loc); + + return parseDirectiveLOH(IDVal, Loc); } /// parseDirectiveWord /// ::= .word [ expression (, expression)* ] -bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { +bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; if (getParser().parseExpression(Value)) - return false; + return true; getParser().getStreamer().EmitValue(Value, Size); @@ -2374,10 +3838,8 @@ bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { break; // FIXME: Improve diagnostic. 
- if (getLexer().isNot(AsmToken::Comma)) { - Error(L, "unexpected token in directive"); - return false; - } + if (getLexer().isNot(AsmToken::Comma)) + return Error(L, "unexpected token in directive"); Parser.Lex(); } } @@ -2388,15 +3850,14 @@ bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { // parseDirectiveTLSDescCall: // ::= .tlsdesccall symbol -bool AArch64AsmParser::ParseDirectiveTLSDescCall(SMLoc L) { +bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) { StringRef Name; - if (getParser().parseIdentifier(Name)) { - Error(L, "expected symbol after directive"); - return false; - } + if (getParser().parseIdentifier(Name)) + return Error(L, "expected symbol after directive"); MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext()); + const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext()); + Expr = AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_TLSDESC, getContext()); MCInst Inst; Inst.setOpcode(AArch64::TLSDESCCALL); @@ -2406,271 +3867,181 @@ bool AArch64AsmParser::ParseDirectiveTLSDescCall(SMLoc L) { return false; } +/// ::= .loh label1, ..., labelN +/// The number of arguments depends on the loh identifier. +bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { + if (IDVal != MCLOHDirectiveName()) + return true; + MCLOHType Kind; + if (getParser().getTok().isNot(AsmToken::Identifier)) { + if (getParser().getTok().isNot(AsmToken::Integer)) + return TokError("expected an identifier or a number in directive"); + // We successfully get a numeric value for the identifier. + // Check if it is valid. + int64_t Id = getParser().getTok().getIntVal(); + Kind = (MCLOHType)Id; + // Check that Id does not overflow MCLOHType. + if (!isValidMCLOHType(Kind) || Id != Kind) + return TokError("invalid numeric identifier in directive"); + } else { + StringRef Name = getTok().getIdentifier(); + // We successfully parse an identifier. + // Check if it is a recognized one. + int Id = MCLOHNameToId(Name); + + if (Id == -1) + return TokError("invalid identifier in directive"); + Kind = (MCLOHType)Id; + } + // Consume the identifier. + Lex(); + // Get the number of arguments of this LOH. 
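// Illustrative standalone sketch (not from the patch): .loh accepts the kind
// either by name or as a raw number, and a numeric id is only valid if it
// survives the round trip through the enum unchanged. Simplified here with
// an assumed two-value enum in place of MCLOHType:
enum LOHKindSketch { LOH_AdrpAdrp = 0, LOH_AdrpLdr = 1, LOH_Last = LOH_AdrpLdr };

static bool isValidLOHIdSketch(long long Id) {
  LOHKindSketch Kind = static_cast<LOHKindSketch>(Id);
  // Out-of-range ids either overflow the enum or change under the cast.
  return Id >= 0 && Id <= LOH_Last && static_cast<long long>(Kind) == Id;
}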
+ int NbArgs = MCLOHIdToNbArgs(Kind); + + assert(NbArgs != -1 && "Invalid number of arguments"); + + SmallVector<MCSymbol *, 3> Args; + for (int Idx = 0; Idx < NbArgs; ++Idx) { + StringRef Name; + if (getParser().parseIdentifier(Name)) + return TokError("expected identifier in directive"); + Args.push_back(getContext().GetOrCreateSymbol(Name)); + + if (Idx + 1 == NbArgs) + break; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); + Lex(); + } + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); -bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - SmallVectorImpl<MCParsedAsmOperand*> &Operands, - MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm) { - MCInst Inst; - unsigned MatchResult; - MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, - MatchingInlineAsm); + getStreamer().EmitLOHDirective((MCLOHType)Kind, Args); + return false; +} - if (ErrorInfo != ~0U && ErrorInfo >= Operands.size()) - return Error(IDLoc, "too few operands for instruction"); +bool +AArch64AsmParser::classifySymbolRef(const MCExpr *Expr, + AArch64MCExpr::VariantKind &ELFRefKind, + MCSymbolRefExpr::VariantKind &DarwinRefKind, + int64_t &Addend) { + ELFRefKind = AArch64MCExpr::VK_INVALID; + DarwinRefKind = MCSymbolRefExpr::VK_None; + Addend = 0; + + if (const AArch64MCExpr *AE = dyn_cast<AArch64MCExpr>(Expr)) { + ELFRefKind = AE->getKind(); + Expr = AE->getSubExpr(); + } + + const MCSymbolRefExpr *SE = dyn_cast<MCSymbolRefExpr>(Expr); + if (SE) { + // It's a simple symbol reference with no addend. + DarwinRefKind = SE->getKind(); + return true; + } - switch (MatchResult) { - default: break; - case Match_Success: - if (validateInstruction(Inst, Operands)) - return true; + const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr); + if (!BE) + return false; - Out.EmitInstruction(Inst, STI); + SE = dyn_cast<MCSymbolRefExpr>(BE->getLHS()); + if (!SE) return false; - case Match_MissingFeature: - Error(IDLoc, "instruction requires a CPU feature not currently enabled"); - return true; - case Match_InvalidOperand: { - SMLoc ErrorLoc = IDLoc; - if (ErrorInfo != ~0U) { - ErrorLoc = ((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(); - if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; - } + DarwinRefKind = SE->getKind(); - return Error(ErrorLoc, "invalid operand for instruction"); - } - case Match_MnemonicFail: - return Error(IDLoc, "invalid instruction"); + if (BE->getOpcode() != MCBinaryExpr::Add && + BE->getOpcode() != MCBinaryExpr::Sub) + return false; - case Match_AddSubRegExtendSmall: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]"); - case Match_AddSubRegExtendLarge: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]"); - case Match_AddSubRegShift32: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]"); - case Match_AddSubRegShift64: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]"); - case Match_AddSubSecondSource: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected compatible register, symbol or integer in range [0, 4095]"); - case Match_CVTFixedPos32: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in 
range [1, 32]"); - case Match_CVTFixedPos64: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [1, 64]"); - case Match_CondCode: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected AArch64 condition code"); - case Match_FPImm: - // Any situation which allows a nontrivial floating-point constant also - // allows a register. - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected compatible register or floating-point constant"); - case Match_FPZero: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected floating-point constant #0.0 or invalid register type"); - case Match_Label: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected label or encodable integer pc offset"); - case Match_Lane1: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected lane specifier '[1]'"); - case Match_LoadStoreExtend32_1: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'uxtw' or 'sxtw' with optional shift of #0"); - case Match_LoadStoreExtend32_2: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1"); - case Match_LoadStoreExtend32_4: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2"); - case Match_LoadStoreExtend32_8: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3"); - case Match_LoadStoreExtend32_16: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl' or 'sxtw' with optional shift of #0 or #4"); - case Match_LoadStoreExtend64_1: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl' or 'sxtx' with optional shift of #0"); - case Match_LoadStoreExtend64_2: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl' or 'sxtx' with optional shift of #0 or #1"); - case Match_LoadStoreExtend64_4: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl' or 'sxtx' with optional shift of #0 or #2"); - case Match_LoadStoreExtend64_8: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl' or 'sxtx' with optional shift of #0 or #3"); - case Match_LoadStoreExtend64_16: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'lsl' or 'sxtx' with optional shift of #0 or #4"); - case Match_LoadStoreSImm7_4: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer multiple of 4 in range [-256, 252]"); - case Match_LoadStoreSImm7_8: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer multiple of 8 in range [-512, 504]"); - case Match_LoadStoreSImm7_16: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer multiple of 16 in range [-1024, 1008]"); - case Match_LoadStoreSImm9: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [-256, 255]"); - case Match_LoadStoreUImm12_1: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected symbolic reference or integer in range [0, 4095]"); - case Match_LoadStoreUImm12_2: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected symbolic reference or 
integer in range [0, 8190]"); - case Match_LoadStoreUImm12_4: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected symbolic reference or integer in range [0, 16380]"); - case Match_LoadStoreUImm12_8: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected symbolic reference or integer in range [0, 32760]"); - case Match_LoadStoreUImm12_16: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected symbolic reference or integer in range [0, 65520]"); - case Match_LogicalSecondSource: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected compatible register or logical immediate"); - case Match_MOVWUImm16: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected relocated symbol or integer in range [0, 65535]"); - case Match_MRS: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected readable system register"); - case Match_MSR: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected writable system register or pstate"); - case Match_NamedImm_at: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected symbolic 'at' operand: s1e[0-3][rw] or s12e[01][rw]"); - case Match_NamedImm_dbarrier: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 15] or symbolic barrier operand"); - case Match_NamedImm_dc: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected symbolic 'dc' operand"); - case Match_NamedImm_ic: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected 'ic' operand: 'ialluis', 'iallu' or 'ivau'"); - case Match_NamedImm_isb: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 15] or 'sy'"); - case Match_NamedImm_prefetch: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected prefetch hint: p(ld|st|i)l[123](strm|keep)"); - case Match_NamedImm_tlbi: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected translation buffer invalidation operand"); - case Match_UImm16: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 65535]"); - case Match_UImm3: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 7]"); - case Match_UImm4: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 15]"); - case Match_UImm5: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 31]"); - case Match_UImm6: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 63]"); - case Match_UImm7: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 127]"); - case Match_Width32: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [, 31]"); - case Match_Width64: - return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [, 63]"); - case Match_ShrImm8: - return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [1, 8]"); - case Match_ShrImm16: - return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [1, 16]"); - case Match_ShrImm32: - return 
Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [1, 32]"); - case Match_ShrImm64: - return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [1, 64]"); - case Match_ShlImm8: - return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 7]"); - case Match_ShlImm16: - return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 15]"); - case Match_ShlImm32: - return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 31]"); - case Match_ShlImm64: - return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(), - "expected integer in range [0, 63]"); - } + // See if the addend is a constant, otherwise there's more going + // on here than we can deal with. + auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS()); + if (!AddendExpr) + return false; - llvm_unreachable("Implement any new match types added!"); - return true; + Addend = AddendExpr->getValue(); + if (BE->getOpcode() == MCBinaryExpr::Sub) + Addend = -Addend; + + // It's some symbol reference + a constant addend, but really + // shouldn't use both Darwin and ELF syntax. + return ELFRefKind == AArch64MCExpr::VK_INVALID || + DarwinRefKind == MCSymbolRefExpr::VK_None; } -void AArch64Operand::print(raw_ostream &OS) const { +/// Force static initialization. +extern "C" void LLVMInitializeAArch64AsmParser() { + RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64leTarget); + RegisterMCAsmParser<AArch64AsmParser> Y(TheAArch64beTarget); + + RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64leTarget); + RegisterMCAsmParser<AArch64AsmParser> W(TheARM64beTarget); +} + +#define GET_REGISTER_MATCHER +#define GET_SUBTARGET_FEATURE_NAME +#define GET_MATCHER_IMPLEMENTATION +#include "AArch64GenAsmMatcher.inc" + +// Define this matcher function after the auto-generated include so we +// have the match class enum definitions. +unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, + unsigned Kind) { + AArch64Operand *Op = static_cast<AArch64Operand *>(AsmOp); + // If the kind is a token for a literal immediate, check if our asm + // operand matches. This is for InstAliases which have a fixed-value + // immediate in the syntax.
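// Illustrative standalone sketch (not from the patch): the switch that
// follows maps a match-class kind such as MCK__35_16 (the literal token
// "#16") to the fixed value it demands, then accepts the operand only if it
// is a constant expression with exactly that value. Reduced to its core:
#include <cstdint>

struct ConstOperandSketch { bool IsConst; int64_t Value; };

static bool matchesFixedImmediate(const ConstOperandSketch &Op,
                                  int64_t Expected) {
  // Registers and symbolic expressions never match a fixed-value class.
  return Op.IsConst && Op.Value == Expected;
}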
+ int64_t ExpectedVal; switch (Kind) { - case k_CondCode: - OS << ""; + default: + return Match_InvalidOperand; + case MCK__35_0: + ExpectedVal = 0; break; - case k_FPImmediate: - OS << ""; + case MCK__35_1: + ExpectedVal = 1; break; - case k_ImmWithLSL: - OS << ""; + case MCK__35_12: + ExpectedVal = 12; break; - case k_Immediate: - getImm()->print(OS); + case MCK__35_16: + ExpectedVal = 16; break; - case k_Register: - OS << "'; + case MCK__35_2: + ExpectedVal = 2; break; - case k_Token: - OS << '\'' << getToken() << '\''; + case MCK__35_24: + ExpectedVal = 24; break; - case k_ShiftExtend: - OS << ""; + case MCK__35_3: + ExpectedVal = 3; break; - case k_SysReg: { - StringRef Name(SysReg.Data, SysReg.Length); - OS << "'; + case MCK__35_32: + ExpectedVal = 32; break; - } - default: - llvm_unreachable("No idea how to print this kind of operand"); + case MCK__35_4: + ExpectedVal = 4; + break; + case MCK__35_48: + ExpectedVal = 48; + break; + case MCK__35_6: + ExpectedVal = 6; + break; + case MCK__35_64: + ExpectedVal = 64; + break; + case MCK__35_8: + ExpectedVal = 8; break; } + if (!Op->isImm()) + return Match_InvalidOperand; + const MCConstantExpr *CE = dyn_cast(Op->getImm()); + if (!CE) + return Match_InvalidOperand; + if (CE->getValue() == ExpectedVal) + return Match_Success; + return Match_InvalidOperand; } - -void AArch64Operand::dump() const { - print(errs()); -} - - -/// Force static initialization. -extern "C" void LLVMInitializeAArch64AsmParser() { - RegisterMCAsmParser X(TheAArch64leTarget); - RegisterMCAsmParser Y(TheAArch64beTarget); -} - -#define GET_REGISTER_MATCHER -#define GET_MATCHER_IMPLEMENTATION -#include "AArch64GenAsmMatcher.inc" diff --git a/lib/Target/AArch64/AsmParser/CMakeLists.txt b/lib/Target/AArch64/AsmParser/CMakeLists.txt index e81ec70..cc0a9d8 100644 --- a/lib/Target/AArch64/AsmParser/CMakeLists.txt +++ b/lib/Target/AArch64/AsmParser/CMakeLists.txt @@ -1,3 +1,6 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + add_llvm_library(LLVMAArch64AsmParser AArch64AsmParser.cpp ) + diff --git a/lib/Target/AArch64/AsmParser/LLVMBuild.txt b/lib/Target/AArch64/AsmParser/LLVMBuild.txt index 2d8f632..11eb9d5 100644 --- a/lib/Target/AArch64/AsmParser/LLVMBuild.txt +++ b/lib/Target/AArch64/AsmParser/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AArch64/AsmParser/LLVMBuild.txt -------------*- Conf -*--===; +;===- ./lib/Target/AArch64/AsmParser/LLVMBuild.txt ---------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; diff --git a/lib/Target/AArch64/AsmParser/Makefile b/lib/Target/AArch64/AsmParser/Makefile index 56c9ef5..00268c7 100644 --- a/lib/Target/AArch64/AsmParser/Makefile +++ b/lib/Target/AArch64/AsmParser/Makefile @@ -9,7 +9,7 @@ LEVEL = ../../../.. LIBRARYNAME = LLVMAArch64AsmParser -# Hack: we need to include 'main' target directory to grab private headers +# Hack: we need to include 'main' ARM target directory to grab private headers CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index dfc10af..789d549 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -1,37 +1,51 @@ set(LLVM_TARGET_DEFINITIONS AArch64.td) -tablegen(LLVM AArch64GenAsmMatcher.inc -gen-asm-matcher) -tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer) -tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv) -tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info) tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info) tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter -mc-emitter) tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering) -tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) +tablegen(LLVM AArch64GenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel) +tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) +tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv) tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) add_public_tablegen_target(AArch64CommonTableGen) add_llvm_target(AArch64CodeGen + AArch64AddressTypePromotion.cpp + AArch64AdvSIMDScalarPass.cpp AArch64AsmPrinter.cpp - AArch64BranchFixupPass.cpp + AArch64BranchRelaxation.cpp + AArch64CleanupLocalDynamicTLSPass.cpp + AArch64CollectLOH.cpp + AArch64ConditionalCompares.cpp + AArch64DeadRegisterDefinitionsPass.cpp + AArch64ExpandPseudoInsts.cpp + AArch64FastISel.cpp AArch64FrameLowering.cpp AArch64ISelDAGToDAG.cpp AArch64ISelLowering.cpp AArch64InstrInfo.cpp - AArch64MachineFunctionInfo.cpp + AArch64LoadStoreOptimizer.cpp AArch64MCInstLower.cpp + AArch64PromoteConstant.cpp AArch64RegisterInfo.cpp AArch64SelectionDAGInfo.cpp + AArch64StorePairSuppress.cpp AArch64Subtarget.cpp AArch64TargetMachine.cpp AArch64TargetObjectFile.cpp AArch64TargetTransformInfo.cpp - ) +) +add_dependencies(LLVMAArch64CodeGen intrinsics_gen) + +add_subdirectory(TargetInfo) add_subdirectory(AsmParser) add_subdirectory(Disassembler) add_subdirectory(InstPrinter) add_subdirectory(MCTargetDesc) -add_subdirectory(TargetInfo) add_subdirectory(Utils) diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 9bd363a..6de27d6 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -1,4 +1,4 @@ -//===- AArch64Disassembler.cpp - Disassembler for AArch64 ISA -------------===// +//===- AArch64Disassembler.cpp - Disassembler for AArch64 -------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,244 +7,169 @@ // //===----------------------------------------------------------------------===// // -// This file contains the functions necessary to decode AArch64 instruction -// bitpatterns into MCInsts (with the help of TableGenerated information from -// the instruction definitions). 
// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "arm-disassembler" - -#include "AArch64.h" -#include "AArch64RegisterInfo.h" +#include "AArch64Disassembler.h" +#include "AArch64ExternalSymbolizer.h" #include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDisassembler.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryObject.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; -typedef MCDisassembler::DecodeStatus DecodeStatus; - -namespace { -/// AArch64 disassembler for all AArch64 platforms. -class AArch64Disassembler : public MCDisassembler { - OwningPtr RegInfo; -public: - /// Initializes the disassembler. - /// - AArch64Disassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info) - : MCDisassembler(STI), RegInfo(Info) { - } - - ~AArch64Disassembler() {} +#define DEBUG_TYPE "aarch64-disassembler" - /// See MCDisassembler. - DecodeStatus getInstruction(MCInst &instr, - uint64_t &size, - const MemoryObject ®ion, - uint64_t address, - raw_ostream &vStream, - raw_ostream &cStream) const; +// Pull DecodeStatus and its enum values into the global namespace. +typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; - const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); } -}; - -} - -// Forward-declarations used in the auto-generated files. -static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus -DecodeGPR64xspRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus -DecodeGPR32wspRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); +// Forward declare these because the autogenerated code will reference them. +// Definitions are further down. 
static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); - -static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); - -static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst, - unsigned OptionHiS, - uint64_t Address, - const void *Decoder); - - -static DecodeStatus DecodeBitfield32ImmOperand(llvm::MCInst &Inst, - unsigned Imm6Bits, - uint64_t Address, +static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeCVT32FixedPosOperand(llvm::MCInst &Inst, - unsigned Imm6Bits, - uint64_t Address, +static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst, - unsigned RmBits, - uint64_t Address, - const void *Decoder); - -static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val, +static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val, - uint64_t Address, - const 
                                            void *Decoder);
-
-template<unsigned RegWidth>
-static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
-                                             unsigned FullImm,
-                                             uint64_t Address,
-                                             const void *Decoder);
-
-template<unsigned RegWidth>
-static DecodeStatus DecodeLogicalImmOperand(llvm::MCInst &Inst,
-                                            unsigned Bits,
+static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder);
+static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                             uint64_t Address,
                                             const void *Decoder);
-
-static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst,
-                                           unsigned ShiftAmount,
+static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                          uint64_t Address,
+                                          const void *Decoder);
+static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                            uint64_t Address,
                                            const void *Decoder);
-template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf>
-static DecodeStatus
-DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount,
-                             uint64_t Address, const void *Decoder);
-
-static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst,
-                                            unsigned ShiftAmount,
+static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                             uint64_t Address,
                                             const void *Decoder);
-static DecodeStatus DecodeBitfieldInstruction(llvm::MCInst &Inst, unsigned Insn,
+
+static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm,
+                                       uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm,
+                                    uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+                                            uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+                                            uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
                                               uint64_t Address,
                                               const void *Decoder);
-
-static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                            uint64_t Address,
+                                            const void *Decoder);
+static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                         uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
+                                        uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
                                               uint64_t Address,
                                               const void *Decoder);
+static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
+                                        uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeLDSTPairInstruction(llvm::MCInst &Inst,
-                                              unsigned Insn,
+static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
                                               uint64_t Address,
                                               const void *Decoder);
-
-static DecodeStatus DecodeLoadPairExclusiveInstruction(llvm::MCInst &Inst,
-                                                       unsigned Val,
-                                                       uint64_t Address,
-                                                       const void *Decoder);
-
-template<typename SomeNamedImmMapper>
-static DecodeStatus DecodeNamedImmOperand(llvm::MCInst &Inst,
-                                          unsigned Val,
-                                          uint64_t Address,
-                                          const void *Decoder);
-
-static DecodeStatus
-DecodeSysRegOperand(const A64SysReg::SysRegMapper &InstMapper,
-                    llvm::MCInst &Inst, unsigned Val,
-                    uint64_t Address, const void *Decoder);
-
-static DecodeStatus DecodeMRSOperand(llvm::MCInst &Inst,
-                                     unsigned Val,
-                                     uint64_t Address,
-                                     const void *Decoder);
-
-static DecodeStatus DecodeMSROperand(llvm::MCInst &Inst,
-                                     unsigned Val,
-                                     uint64_t Address,
-                                     const void *Decoder);
-
-
-static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst,
-                                                   unsigned Val,
-                                                   uint64_t Address,
-                                                   const void *Decoder);
-
-static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Val,
-                                               uint64_t Address,
+static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
                                                const void *Decoder);
-
-static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
-                                                   uint64_t Address,
-                                                   const void *Decoder);
-
-static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const void *Decoder);
-
-static bool Check(DecodeStatus &Out, DecodeStatus In);
-
-#include "AArch64GenDisassemblerTables.inc"
+static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder);
+static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder);
+static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder);
 
 static bool Check(DecodeStatus &Out, DecodeStatus In) {
   switch (In) {
@@ -261,486 +186,479 @@ static bool Check(DecodeStatus &Out, DecodeStatus In) {
   llvm_unreachable("Invalid DecodeStatus!");
 }
 
+#include "AArch64GenDisassemblerTables.inc"
+#include "AArch64GenInstrInfo.inc"
+
+#define Success llvm::MCDisassembler::Success
+#define Fail llvm::MCDisassembler::Fail
+#define SoftFail llvm::MCDisassembler::SoftFail
+
+static MCDisassembler *createAArch64Disassembler(const Target &T,
+                                                 const MCSubtargetInfo &STI,
+                                                 MCContext &Ctx) {
+  return new AArch64Disassembler(STI, Ctx);
+}
+
 DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
-                                                  const MemoryObject &Region,
-                                                  uint64_t Address,
-                                                  raw_ostream &os,
-                                                  raw_ostream &cs) const {
+                                                 const MemoryObject &Region,
+                                                 uint64_t Address,
+                                                 raw_ostream &os,
+                                                 raw_ostream &cs) const {
   CommentStream = &cs;
 
   uint8_t bytes[4];
+  Size = 0;
 
   // We want to read exactly 4 bytes of data.
-  if (Region.readBytes(Address, 4, bytes) == -1) {
-    Size = 0;
-    return MCDisassembler::Fail;
-  }
+  if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1)
+    return Fail;
+  Size = 4;
 
   // Encoded as a small-endian 32-bit word in the stream.
-  uint32_t insn = (bytes[3] << 24) |
-                  (bytes[2] << 16) |
-                  (bytes[1] << 8) |
-                  (bytes[0] << 0);
+  uint32_t insn =
+      (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0);
 
   // Calling the auto-generated decoder function.
-  DecodeStatus result = decodeInstruction(DecoderTableA6432, MI, insn, Address,
-                                          this, STI);
-  if (result != MCDisassembler::Fail) {
-    Size = 4;
-    return result;
-  }
-
-  MI.clear();
-  Size = 0;
-  return MCDisassembler::Fail;
+  return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI);
 }
 
-static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
-  const AArch64Disassembler *Dis = static_cast<const AArch64Disassembler*>(D);
-  return Dis->getRegInfo()->getRegClass(RC).getRegister(RegNo);
+static MCSymbolizer *
+createAArch64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
+                                LLVMSymbolLookupCallback SymbolLookUp,
+                                void *DisInfo, MCContext *Ctx,
+                                MCRelocationInfo *RelInfo) {
+  return new llvm::AArch64ExternalSymbolizer(
+      *Ctx,
+      std::unique_ptr<MCRelocationInfo>(RelInfo),
+      GetOpInfo, SymbolLookUp, DisInfo);
 }
 
-static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address, const void *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
+extern "C" void LLVMInitializeAArch64Disassembler() {
+  TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget,
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget,
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCSymbolizer(TheAArch64leTarget,
+                                       createAArch64ExternalSymbolizer);
+  TargetRegistry::RegisterMCSymbolizer(TheAArch64beTarget,
+                                       createAArch64ExternalSymbolizer);
 
-  uint16_t Register = getReg(Decoder, AArch64::GPR64RegClassID, RegNo);
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  TargetRegistry::RegisterMCDisassembler(TheARM64leTarget,
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCDisassembler(TheARM64beTarget,
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCSymbolizer(TheARM64leTarget,
+                                       createAArch64ExternalSymbolizer);
+  TargetRegistry::RegisterMCSymbolizer(TheARM64beTarget,
+                                       createAArch64ExternalSymbolizer);
 }
 
-static DecodeStatus
-DecodeGPR64xspRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                            uint64_t Address, const void *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  uint16_t Register = getReg(Decoder, AArch64::GPR64xspRegClassID, RegNo);
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
-}
+static const unsigned FPR128DecoderTable[] = {
+    AArch64::Q0,  AArch64::Q1,  AArch64::Q2,  AArch64::Q3,  AArch64::Q4,
+    AArch64::Q5,  AArch64::Q6,  AArch64::Q7,  AArch64::Q8,  AArch64::Q9,
+    AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14,
+    AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19,
+    AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24,
+    AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29,
+    AArch64::Q30, AArch64::Q31
+};
 
-static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const void *Decoder) {
+static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
   if (RegNo > 31)
-    return MCDisassembler::Fail;
+    return Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::GPR32RegClassID, RegNo);
+  unsigned Register = FPR128DecoderTable[RegNo];
   Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  return Success;
 }
 
-static DecodeStatus
-DecodeGPR32wspRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                            uint64_t Address, const void *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  uint16_t Register = getReg(Decoder, AArch64::GPR32wspRegClassID, RegNo);
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                 uint64_t Addr,
+                                                 const void *Decoder) {
+  if (RegNo > 15)
+    return Fail;
+  return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder);
 }
 
-static DecodeStatus
-DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                        uint64_t Address, const void *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  uint16_t Register = getReg(Decoder, AArch64::FPR8RegClassID, RegNo);
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
-}
+static const unsigned FPR64DecoderTable[] = {
+    AArch64::D0,  AArch64::D1,  AArch64::D2,  AArch64::D3,  AArch64::D4,
+    AArch64::D5,  AArch64::D6,  AArch64::D7,  AArch64::D8,  AArch64::D9,
+    AArch64::D10, AArch64::D11, AArch64::D12, AArch64::D13, AArch64::D14,
+    AArch64::D15, AArch64::D16, AArch64::D17, AArch64::D18, AArch64::D19,
+    AArch64::D20, AArch64::D21, AArch64::D22, AArch64::D23, AArch64::D24,
+    AArch64::D25, AArch64::D26, AArch64::D27, AArch64::D28, AArch64::D29,
+    AArch64::D30, AArch64::D31
+};
 
-static DecodeStatus
-DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                         uint64_t Address, const void *Decoder) {
+static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
   if (RegNo > 31)
-    return MCDisassembler::Fail;
+    return Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::FPR16RegClassID, RegNo);
+  unsigned Register = FPR64DecoderTable[RegNo];
   Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  return Success;
 }
 
+static const unsigned FPR32DecoderTable[] = {
+    AArch64::S0,  AArch64::S1,  AArch64::S2,  AArch64::S3,  AArch64::S4,
+    AArch64::S5,  AArch64::S6,  AArch64::S7,  AArch64::S8,  AArch64::S9,
+    AArch64::S10, AArch64::S11, AArch64::S12, AArch64::S13, AArch64::S14,
+    AArch64::S15, AArch64::S16, AArch64::S17, AArch64::S18, AArch64::S19,
+    AArch64::S20, AArch64::S21, AArch64::S22, AArch64::S23, AArch64::S24,
+    AArch64::S25, AArch64::S26, AArch64::S27, AArch64::S28, AArch64::S29,
+    AArch64::S30, AArch64::S31
+};
+
-static DecodeStatus
-DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                         uint64_t Address, const void *Decoder) {
+static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
   if (RegNo > 31)
-    return MCDisassembler::Fail;
+    return Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::FPR32RegClassID, RegNo);
+  unsigned Register = FPR32DecoderTable[RegNo];
   Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  return Success;
 }
 
+static const unsigned FPR16DecoderTable[] = {
+    AArch64::H0,  AArch64::H1,  AArch64::H2,  AArch64::H3,  AArch64::H4,
+    AArch64::H5,  AArch64::H6,  AArch64::H7,  AArch64::H8,  AArch64::H9,
+    AArch64::H10, AArch64::H11, AArch64::H12, AArch64::H13, AArch64::H14,
+    AArch64::H15, AArch64::H16, AArch64::H17, AArch64::H18, AArch64::H19,
+    AArch64::H20, AArch64::H21, AArch64::H22, AArch64::H23, AArch64::H24,
+    AArch64::H25, AArch64::H26, AArch64::H27, AArch64::H28, AArch64::H29,
+    AArch64::H30, AArch64::H31
+};
+
-static DecodeStatus
-DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                         uint64_t Address, const void *Decoder) {
+static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
   if (RegNo > 31)
-    return MCDisassembler::Fail;
+    return Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::FPR64RegClassID, RegNo);
+  unsigned Register = FPR16DecoderTable[RegNo];
   Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  return Success;
 }
 
-static DecodeStatus
-DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                           uint64_t Address, const void *Decoder) {
-  if (RegNo > 15)
-    return MCDisassembler::Fail;
-
-  return DecodeFPR64RegisterClass(Inst, RegNo, Address, Decoder);
-}
+static const unsigned FPR8DecoderTable[] = {
+    AArch64::B0,  AArch64::B1,  AArch64::B2,  AArch64::B3,  AArch64::B4,
+    AArch64::B5,  AArch64::B6,  AArch64::B7,  AArch64::B8,  AArch64::B9,
+    AArch64::B10, AArch64::B11, AArch64::B12, AArch64::B13, AArch64::B14,
+    AArch64::B15, AArch64::B16, AArch64::B17, AArch64::B18, AArch64::B19,
+    AArch64::B20, AArch64::B21, AArch64::B22, AArch64::B23, AArch64::B24,
+    AArch64::B25, AArch64::B26, AArch64::B27, AArch64::B28, AArch64::B29,
+    AArch64::B30, AArch64::B31
+};
 
-static DecodeStatus
-DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                          uint64_t Address, const void *Decoder) {
+static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
   if (RegNo > 31)
-    return MCDisassembler::Fail;
+    return Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::FPR128RegClassID, RegNo);
+  unsigned Register = FPR8DecoderTable[RegNo];
   Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  return Success;
 }
 
-static DecodeStatus
-DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                            uint64_t Address, const void *Decoder) {
-  if (RegNo > 15)
-    return MCDisassembler::Fail;
-
-  return DecodeFPR128RegisterClass(Inst, RegNo, Address, Decoder);
-}
+static const unsigned GPR64DecoderTable[] = {
+    AArch64::X0,  AArch64::X1,  AArch64::X2,  AArch64::X3,  AArch64::X4,
+    AArch64::X5,  AArch64::X6,  AArch64::X7,  AArch64::X8,  AArch64::X9,
+    AArch64::X10, AArch64::X11, AArch64::X12, AArch64::X13, AArch64::X14,
+    AArch64::X15, AArch64::X16, AArch64::X17, AArch64::X18, AArch64::X19,
+    AArch64::X20, AArch64::X21, AArch64::X22, AArch64::X23, AArch64::X24,
+    AArch64::X25, AArch64::X26, AArch64::X27, AArch64::X28, AArch64::FP,
+    AArch64::LR,  AArch64::XZR
+};
 
-static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst,
-                                                  unsigned RegNo,
-                                                  uint64_t Address,
-                                                  const void *Decoder) {
-  if (RegNo > 30)
-    return MCDisassembler::Fail;
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::GPR64noxzrRegClassID, RegNo);
+  unsigned Register = GPR64DecoderTable[RegNo];
   Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  return Success;
 }
 
-static DecodeStatus DecodeRegisterClassByID(llvm::MCInst &Inst, unsigned RegNo,
-                                            unsigned RegID,
-                                            const void *Decoder) {
+static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
   if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  uint16_t Register = getReg(Decoder, RegID, RegNo);
+    return Fail;
+  unsigned Register = GPR64DecoderTable[RegNo];
+  if (Register == AArch64::XZR)
+    Register = AArch64::SP;
   Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  return Success;
 }
 
-static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const void *Decoder) {
-  return DecodeRegisterClassByID(Inst, RegNo, AArch64::DPairRegClassID,
-                                 Decoder);
-}
+static const unsigned GPR32DecoderTable[] = {
+    AArch64::W0,  AArch64::W1,  AArch64::W2,  AArch64::W3,  AArch64::W4,
+    AArch64::W5,  AArch64::W6,  AArch64::W7,  AArch64::W8,  AArch64::W9,
+    AArch64::W10, AArch64::W11, AArch64::W12, AArch64::W13, AArch64::W14,
+    AArch64::W15, AArch64::W16, AArch64::W17, AArch64::W18, AArch64::W19,
+    AArch64::W20, AArch64::W21, AArch64::W22, AArch64::W23, AArch64::W24,
+    AArch64::W25, AArch64::W26, AArch64::W27, AArch64::W28, AArch64::W29,
+    AArch64::W30, AArch64::WZR
+};
 
-static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
                                              const void *Decoder) {
-  return DecodeRegisterClassByID(Inst, RegNo, AArch64::QPairRegClassID,
-                                 Decoder);
-}
+  if (RegNo > 31)
+    return Fail;
 
-static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst,
-                                               unsigned RegNo, uint64_t Address,
-                                               const void *Decoder) {
-  return DecodeRegisterClassByID(Inst, RegNo, AArch64::DTripleRegClassID,
-                                 Decoder);
+  unsigned Register = GPR32DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst,
-                                               unsigned RegNo, uint64_t Address,
+static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Addr,
                                                const void *Decoder) {
-  return DecodeRegisterClassByID(Inst, RegNo, AArch64::QTripleRegClassID,
-                                 Decoder);
-}
+  if (RegNo > 31)
+    return Fail;
 
-static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const void *Decoder) {
-  return DecodeRegisterClassByID(Inst, RegNo, AArch64::DQuadRegClassID,
-                                 Decoder);
+  unsigned Register = GPR32DecoderTable[RegNo];
+  if (Register == AArch64::WZR)
+    Register = AArch64::WSP;
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const void *Decoder) {
-  return DecodeRegisterClassByID(Inst, RegNo, AArch64::QQuadRegClassID,
-                                 Decoder);
-}
+static const unsigned VectorDecoderTable[] = {
+    AArch64::Q0,  AArch64::Q1,  AArch64::Q2,  AArch64::Q3,  AArch64::Q4,
+    AArch64::Q5,  AArch64::Q6,  AArch64::Q7,  AArch64::Q8,  AArch64::Q9,
+    AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14,
+    AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19,
+    AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24,
+    AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29,
+    AArch64::Q30, AArch64::Q31
+};
 
-static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst,
-                                               unsigned OptionHiS,
-                                               uint64_t Address,
-                                               const void *Decoder) {
-  // Option{1} must be 1. OptionHiS is made up of {Option{2}, Option{1},
-  // S}. Hence we want to check bit 1.
-  if (!(OptionHiS & 2))
-    return MCDisassembler::Fail;
+static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
 
-  Inst.addOperand(MCOperand::CreateImm(OptionHiS));
-  return MCDisassembler::Success;
+  unsigned Register = VectorDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeBitfield32ImmOperand(llvm::MCInst &Inst,
-                                               unsigned Imm6Bits,
-                                               uint64_t Address,
-                                               const void *Decoder) {
-  // In the 32-bit variant, bit 6 must be zero. I.e. the immediate must be
-  // between 0 and 31.
-  if (Imm6Bits > 31)
-    return MCDisassembler::Fail;
+static const unsigned QQDecoderTable[] = {
+    AArch64::Q0_Q1,   AArch64::Q1_Q2,   AArch64::Q2_Q3,   AArch64::Q3_Q4,
+    AArch64::Q4_Q5,   AArch64::Q5_Q6,   AArch64::Q6_Q7,   AArch64::Q7_Q8,
+    AArch64::Q8_Q9,   AArch64::Q9_Q10,  AArch64::Q10_Q11, AArch64::Q11_Q12,
+    AArch64::Q12_Q13, AArch64::Q13_Q14, AArch64::Q14_Q15, AArch64::Q15_Q16,
+    AArch64::Q16_Q17, AArch64::Q17_Q18, AArch64::Q18_Q19, AArch64::Q19_Q20,
+    AArch64::Q20_Q21, AArch64::Q21_Q22, AArch64::Q22_Q23, AArch64::Q23_Q24,
+    AArch64::Q24_Q25, AArch64::Q25_Q26, AArch64::Q26_Q27, AArch64::Q27_Q28,
+    AArch64::Q28_Q29, AArch64::Q29_Q30, AArch64::Q30_Q31, AArch64::Q31_Q0
+};
 
-  Inst.addOperand(MCOperand::CreateImm(Imm6Bits));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                          uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = QQDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeCVT32FixedPosOperand(llvm::MCInst &Inst,
-                                               unsigned Imm6Bits,
-                                               uint64_t Address,
-                                               const void *Decoder) {
-  // 1 <= Imm <= 32. Encoded as 64 - Imm so: 63 >= Encoded >= 32.
-  if (Imm6Bits < 32)
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::CreateImm(Imm6Bits));
-  return MCDisassembler::Success;
-}
+static const unsigned QQQDecoderTable[] = {
+    AArch64::Q0_Q1_Q2,    AArch64::Q1_Q2_Q3,    AArch64::Q2_Q3_Q4,
+    AArch64::Q3_Q4_Q5,    AArch64::Q4_Q5_Q6,    AArch64::Q5_Q6_Q7,
+    AArch64::Q6_Q7_Q8,    AArch64::Q7_Q8_Q9,    AArch64::Q8_Q9_Q10,
+    AArch64::Q9_Q10_Q11,  AArch64::Q10_Q11_Q12, AArch64::Q11_Q12_Q13,
+    AArch64::Q12_Q13_Q14, AArch64::Q13_Q14_Q15, AArch64::Q14_Q15_Q16,
+    AArch64::Q15_Q16_Q17, AArch64::Q16_Q17_Q18, AArch64::Q17_Q18_Q19,
+    AArch64::Q18_Q19_Q20, AArch64::Q19_Q20_Q21, AArch64::Q20_Q21_Q22,
+    AArch64::Q21_Q22_Q23, AArch64::Q22_Q23_Q24, AArch64::Q23_Q24_Q25,
+    AArch64::Q24_Q25_Q26, AArch64::Q25_Q26_Q27, AArch64::Q26_Q27_Q28,
+    AArch64::Q27_Q28_Q29, AArch64::Q28_Q29_Q30, AArch64::Q29_Q30_Q31,
+    AArch64::Q30_Q31_Q0,  AArch64::Q31_Q0_Q1
+};
 
-static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
-                                        unsigned RmBits,
-                                        uint64_t Address,
-                                        const void *Decoder) {
-  // Any bits are valid in the instruction (they're architecturally ignored),
-  // but a code generator should insert 0.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = QQQDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const void *Decoder) {
-  Inst.addOperand(MCOperand::CreateImm(8 - Val));
-  return MCDisassembler::Success;
-}
+static const unsigned QQQQDecoderTable[] = {
+    AArch64::Q0_Q1_Q2_Q3,     AArch64::Q1_Q2_Q3_Q4,     AArch64::Q2_Q3_Q4_Q5,
+    AArch64::Q3_Q4_Q5_Q6,     AArch64::Q4_Q5_Q6_Q7,     AArch64::Q5_Q6_Q7_Q8,
+    AArch64::Q6_Q7_Q8_Q9,     AArch64::Q7_Q8_Q9_Q10,    AArch64::Q8_Q9_Q10_Q11,
+    AArch64::Q9_Q10_Q11_Q12,  AArch64::Q10_Q11_Q12_Q13, AArch64::Q11_Q12_Q13_Q14,
+    AArch64::Q12_Q13_Q14_Q15, AArch64::Q13_Q14_Q15_Q16, AArch64::Q14_Q15_Q16_Q17,
+    AArch64::Q15_Q16_Q17_Q18, AArch64::Q16_Q17_Q18_Q19, AArch64::Q17_Q18_Q19_Q20,
+    AArch64::Q18_Q19_Q20_Q21, AArch64::Q19_Q20_Q21_Q22, AArch64::Q20_Q21_Q22_Q23,
+    AArch64::Q21_Q22_Q23_Q24, AArch64::Q22_Q23_Q24_Q25, AArch64::Q23_Q24_Q25_Q26,
+    AArch64::Q24_Q25_Q26_Q27, AArch64::Q25_Q26_Q27_Q28, AArch64::Q26_Q27_Q28_Q29,
+    AArch64::Q27_Q28_Q29_Q30, AArch64::Q28_Q29_Q30_Q31, AArch64::Q29_Q30_Q31_Q0,
+    AArch64::Q30_Q31_Q0_Q1,   AArch64::Q31_Q0_Q1_Q2
+};
 
-static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const void *Decoder) {
-  Inst.addOperand(MCOperand::CreateImm(16 - Val));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = QQQQDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const void *Decoder) {
-  Inst.addOperand(MCOperand::CreateImm(32 - Val));
-  return MCDisassembler::Success;
-}
+static const unsigned DDDecoderTable[] = {
+    AArch64::D0_D1,   AArch64::D1_D2,   AArch64::D2_D3,   AArch64::D3_D4,
+    AArch64::D4_D5,   AArch64::D5_D6,   AArch64::D6_D7,   AArch64::D7_D8,
+    AArch64::D8_D9,   AArch64::D9_D10,  AArch64::D10_D11, AArch64::D11_D12,
+    AArch64::D12_D13, AArch64::D13_D14, AArch64::D14_D15, AArch64::D15_D16,
+    AArch64::D16_D17, AArch64::D17_D18, AArch64::D18_D19, AArch64::D19_D20,
+    AArch64::D20_D21, AArch64::D21_D22, AArch64::D22_D23, AArch64::D23_D24,
+    AArch64::D24_D25, AArch64::D25_D26, AArch64::D26_D27, AArch64::D27_D28,
+    AArch64::D28_D29, AArch64::D29_D30, AArch64::D30_D31, AArch64::D31_D0
+};
 
-static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const void *Decoder) {
-  Inst.addOperand(MCOperand::CreateImm(64 - Val));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                          uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = DDDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val,
-                                        uint64_t Address,
-                                        const void *Decoder) {
-  if (Val > 7)
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::CreateImm(Val));
-  return MCDisassembler::Success;
+static const unsigned DDDDecoderTable[] = {
+    AArch64::D0_D1_D2,    AArch64::D1_D2_D3,    AArch64::D2_D3_D4,
+    AArch64::D3_D4_D5,    AArch64::D4_D5_D6,    AArch64::D5_D6_D7,
+    AArch64::D6_D7_D8,    AArch64::D7_D8_D9,    AArch64::D8_D9_D10,
+    AArch64::D9_D10_D11,  AArch64::D10_D11_D12, AArch64::D11_D12_D13,
+    AArch64::D12_D13_D14, AArch64::D13_D14_D15, AArch64::D14_D15_D16,
+    AArch64::D15_D16_D17, AArch64::D16_D17_D18, AArch64::D17_D18_D19,
+    AArch64::D18_D19_D20, AArch64::D19_D20_D21, AArch64::D20_D21_D22,
+    AArch64::D21_D22_D23, AArch64::D22_D23_D24, AArch64::D23_D24_D25,
+    AArch64::D24_D25_D26, AArch64::D25_D26_D27, AArch64::D26_D27_D28,
+    AArch64::D27_D28_D29, AArch64::D28_D29_D30, AArch64::D29_D30_D31,
+    AArch64::D30_D31_D0,  AArch64::D31_D0_D1
+};
+
+static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = DDDDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const void *Decoder) {
-  if (Val > 15)
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::CreateImm(Val));
-  return MCDisassembler::Success;
+static const unsigned DDDDDecoderTable[] = {
+    AArch64::D0_D1_D2_D3,     AArch64::D1_D2_D3_D4,     AArch64::D2_D3_D4_D5,
+    AArch64::D3_D4_D5_D6,     AArch64::D4_D5_D6_D7,     AArch64::D5_D6_D7_D8,
+    AArch64::D6_D7_D8_D9,     AArch64::D7_D8_D9_D10,    AArch64::D8_D9_D10_D11,
+    AArch64::D9_D10_D11_D12,  AArch64::D10_D11_D12_D13, AArch64::D11_D12_D13_D14,
+    AArch64::D12_D13_D14_D15, AArch64::D13_D14_D15_D16, AArch64::D14_D15_D16_D17,
+    AArch64::D15_D16_D17_D18, AArch64::D16_D17_D18_D19, AArch64::D17_D18_D19_D20,
+    AArch64::D18_D19_D20_D21, AArch64::D19_D20_D21_D22, AArch64::D20_D21_D22_D23,
+    AArch64::D21_D22_D23_D24, AArch64::D22_D23_D24_D25, AArch64::D23_D24_D25_D26,
+    AArch64::D24_D25_D26_D27, AArch64::D25_D26_D27_D28, AArch64::D26_D27_D28_D29,
+    AArch64::D27_D28_D29_D30, AArch64::D28_D29_D30_D31, AArch64::D29_D30_D31_D0,
+    AArch64::D30_D31_D0_D1,   AArch64::D31_D0_D1_D2
+};
+
+static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = DDDDDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const void *Decoder) {
-  if (Val > 31)
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::CreateImm(Val));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  // scale{5} is asserted as 1 in tblgen.
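+  // The operand below is emitted as 64 - Imm; with scale{5} forced to 1 the
+  // effective encoding is 0x20..0x3f, so the decoded fixed-point scale
+  // (#fbits) always lies in the range 1..32 (e.g. a raw scale field of 0x1f
+  // decodes to #1, and a raw field of 0x00 decodes to #32).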
+  Imm |= 0x20;
+  Inst.addOperand(MCOperand::CreateImm(64 - Imm));
+  return Success;
 }
 
-static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const void *Decoder) {
-  if (Val > 63)
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::CreateImm(Val));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(64 - Imm));
+  return Success;
 }
 
-template<unsigned RegWidth>
-static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
-                                             unsigned FullImm,
-                                             uint64_t Address,
-                                             const void *Decoder) {
-  unsigned Imm16 = FullImm & 0xffff;
-  unsigned Shift = FullImm >> 16;
+static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm,
+                                       uint64_t Addr, const void *Decoder) {
+  int64_t ImmVal = Imm;
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  // Sign-extend 19-bit immediate.
+  if (ImmVal & (1 << (19 - 1)))
+    ImmVal |= ~((1LL << 19) - 1);
 
-  if (RegWidth == 32 && Shift > 1) return MCDisassembler::Fail;
+  if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal << 2, Addr,
+                                     Inst.getOpcode() != AArch64::LDRXl, 0, 4))
+    Inst.addOperand(MCOperand::CreateImm(ImmVal));
+  return Success;
+}
 
-  Inst.addOperand(MCOperand::CreateImm(Imm16));
-  Inst.addOperand(MCOperand::CreateImm(Shift));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm,
+                                    uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm((Imm >> 1) & 1));
+  Inst.addOperand(MCOperand::CreateImm(Imm & 1));
+  return Success;
 }
 
-template<unsigned RegWidth>
-static DecodeStatus DecodeLogicalImmOperand(llvm::MCInst &Inst,
-                                            unsigned Bits,
+static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
                                             uint64_t Address,
                                             const void *Decoder) {
-  uint64_t Imm;
-  if (!A64Imms::isLogicalImmBits(RegWidth, Bits, Imm))
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::CreateImm(Bits));
-  return MCDisassembler::Success;
-}
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+  const MCSubtargetInfo &STI = Dis->getSubtargetInfo();
+  Imm |= 0x8000;
+  Inst.addOperand(MCOperand::CreateImm(Imm));
 
-static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst,
-                                           unsigned ShiftAmount,
-                                           uint64_t Address,
-                                           const void *Decoder) {
-  // Only values 0-4 are valid for this 3-bit field
-  if (ShiftAmount > 4)
-    return MCDisassembler::Fail;
+  bool ValidNamed;
+  (void)AArch64SysReg::MRSMapper(STI.getFeatureBits())
+      .toString(Imm, ValidNamed);
 
-  Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
-  return MCDisassembler::Success;
+  return ValidNamed ? Success : Fail;
 }
 
-static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst,
-                                            unsigned ShiftAmount,
+static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
                                             uint64_t Address, const void *Decoder) {
-  // Only values below 32 are valid for a 32-bit register
-  if (ShiftAmount > 31)
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeBitfieldInstruction(llvm::MCInst &Inst, unsigned Insn,
-                                              uint64_t Address,
-                                              const void *Decoder) {
-  unsigned Rd = fieldFromInstruction(Insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
-  unsigned ImmS = fieldFromInstruction(Insn, 10, 6);
-  unsigned ImmR = fieldFromInstruction(Insn, 16, 6);
-  unsigned SF = fieldFromInstruction(Insn, 31, 1);
-
-  // Undef for 0b11 just in case it occurs. Don't want the compiler to optimise
-  // out assertions that it thinks should never be hit.
-  enum OpcTypes { SBFM = 0, BFM, UBFM, Undef } Opc;
-  Opc = (OpcTypes)fieldFromInstruction(Insn, 29, 2);
-
-  if (!SF) {
-    // ImmR and ImmS must be between 0 and 31 for 32-bit instructions.
-    if (ImmR > 31 || ImmS > 31)
-      return MCDisassembler::Fail;
-  }
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+  const MCSubtargetInfo &STI = Dis->getSubtargetInfo();
 
-  if (SF) {
-    DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
-    // BFM MCInsts use Rd as a source too.
-    if (Opc == BFM) DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
-    DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder);
-  } else {
-    DecodeGPR32RegisterClass(Inst, Rd, Address, Decoder);
-    // BFM MCInsts use Rd as a source too.
-    if (Opc == BFM) DecodeGPR32RegisterClass(Inst, Rd, Address, Decoder);
-    DecodeGPR32RegisterClass(Inst, Rn, Address, Decoder);
-  }
-
-  // ASR and LSR have more specific patterns so they won't get here:
-  assert(!(ImmS == 31 && !SF && Opc != BFM)
-         && "shift should have used auto decode");
-  assert(!(ImmS == 63 && SF && Opc != BFM)
-         && "shift should have used auto decode");
-
-  // Extension instructions similarly:
-  if (Opc == SBFM && ImmR == 0) {
-    assert((ImmS != 7 && ImmS != 15) && "extension got here");
-    assert((ImmS != 31 || SF == 0) && "extension got here");
-  } else if (Opc == UBFM && ImmR == 0) {
-    assert((SF != 0 || (ImmS != 7 && ImmS != 15)) && "extension got here");
-  }
+  Imm |= 0x8000;
+  Inst.addOperand(MCOperand::CreateImm(Imm));
 
-  if (Opc == UBFM) {
-    // It might be a LSL instruction, which actually takes the shift amount
-    // itself as an MCInst operand.
-    if (SF && (ImmS + 1) % 64 == ImmR) {
-      Inst.setOpcode(AArch64::LSLxxi);
-      Inst.addOperand(MCOperand::CreateImm(63 - ImmS));
-      return MCDisassembler::Success;
-    } else if (!SF && (ImmS + 1) % 32 == ImmR) {
-      Inst.setOpcode(AArch64::LSLwwi);
-      Inst.addOperand(MCOperand::CreateImm(31 - ImmS));
-      return MCDisassembler::Success;
-    }
-  }
-
-  // Otherwise it's definitely either an extract or an insert depending on which
-  // of ImmR or ImmS is larger.
-  unsigned ExtractOp, InsertOp;
-  switch (Opc) {
-  default: llvm_unreachable("unexpected instruction trying to decode bitfield");
-  case SBFM:
-    ExtractOp = SF ? AArch64::SBFXxxii : AArch64::SBFXwwii;
-    InsertOp = SF ? AArch64::SBFIZxxii : AArch64::SBFIZwwii;
-    break;
-  case BFM:
-    ExtractOp = SF ? AArch64::BFXILxxii : AArch64::BFXILwwii;
-    InsertOp = SF ? AArch64::BFIxxii : AArch64::BFIwwii;
-    break;
-  case UBFM:
-    ExtractOp = SF ? AArch64::UBFXxxii : AArch64::UBFXwwii;
-    InsertOp = SF ? AArch64::UBFIZxxii : AArch64::UBFIZwwii;
-    break;
-  }
-
-  // Otherwise it's a boring insert or extract
-  Inst.addOperand(MCOperand::CreateImm(ImmR));
-  Inst.addOperand(MCOperand::CreateImm(ImmS));
-
-
-  if (ImmS < ImmR)
-    Inst.setOpcode(InsertOp);
-  else
-    Inst.setOpcode(ExtractOp);
+  bool ValidNamed;
+  (void)AArch64SysReg::MSRMapper(STI.getFeatureBits())
+      .toString(Imm, ValidNamed);
 
-  return MCDisassembler::Success;
+  return ValidNamed ? Success : Fail;
 }
 
 static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
@@ -763,811 +681,879 @@ static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
   // Add the lane
   Inst.addOperand(MCOperand::CreateImm(1));
 
-  return MCDisassembler::Success;
+  return Success;
 }
 
+static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm,
+                                       unsigned Add) {
+  Inst.addOperand(MCOperand::CreateImm(Add - Imm));
+  return Success;
+}
 
-static DecodeStatus DecodeLDSTPairInstruction(llvm::MCInst &Inst,
-                                              unsigned Insn,
-                                              uint64_t Address,
-                                              const void *Decoder) {
-  DecodeStatus Result = MCDisassembler::Success;
-  unsigned Rt = fieldFromInstruction(Insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
-  unsigned Rt2 = fieldFromInstruction(Insn, 10, 5);
-  unsigned SImm7 = fieldFromInstruction(Insn, 15, 7);
-  unsigned L = fieldFromInstruction(Insn, 22, 1);
-  unsigned V = fieldFromInstruction(Insn, 26, 1);
-  unsigned Opc = fieldFromInstruction(Insn, 30, 2);
-
-  // Not an official name, but it turns out that bit 23 distinguishes indexed
-  // from non-indexed operations.
-  unsigned Indexed = fieldFromInstruction(Insn, 23, 1);
-
-  if (Indexed && L == 0) {
-    // The MCInst for an indexed store has an out operand and 4 ins:
-    //    Rn_wb, Rt, Rt2, Rn, Imm
-    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-  }
-
-  // You shouldn't load to the same register twice in an instruction...
-  if (L && Rt == Rt2)
-    Result = MCDisassembler::SoftFail;
-
-  // ... or do any operation that writes-back to a transfer register. But note
-  // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different.
-  if (Indexed && V == 0 && Rn != 31 && (Rt == Rn || Rt2 == Rn))
-    Result = MCDisassembler::SoftFail;
-
-  // Exactly how we decode the MCInst's registers depends on the Opc and V
-  // fields of the instruction. These also obviously determine the size of the
-  // operation so we can fill in that information while we're at it.
-  if (V) {
-    // The instruction operates on the FP/SIMD registers
-    switch (Opc) {
-    default: return MCDisassembler::Fail;
-    case 0:
-      DecodeFPR32RegisterClass(Inst, Rt, Address, Decoder);
-      DecodeFPR32RegisterClass(Inst, Rt2, Address, Decoder);
-      break;
-    case 1:
-      DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
-      DecodeFPR64RegisterClass(Inst, Rt2, Address, Decoder);
-      break;
-    case 2:
-      DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
-      DecodeFPR128RegisterClass(Inst, Rt2, Address, Decoder);
-      break;
-    }
-  } else {
-    switch (Opc) {
-    default: return MCDisassembler::Fail;
-    case 0:
-      DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder);
-      DecodeGPR32RegisterClass(Inst, Rt2, Address, Decoder);
-      break;
-    case 1:
-      assert(L && "unexpected \"store signed\" attempt");
-      DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder);
-      DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder);
-      break;
-    case 2:
-      DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder);
-      DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder);
-      break;
-    }
-  }
-
-  if (Indexed && L == 1) {
-    // The MCInst for an indexed load has 3 out operands and an 3 ins:
-    //    Rt, Rt2, Rn_wb, Rt2, Rn, Imm
-    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-  }
-
-
-  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-  Inst.addOperand(MCOperand::CreateImm(SImm7));
+static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm,
+                                       unsigned Add) {
+  Inst.addOperand(MCOperand::CreateImm((Imm + Add) & (Add - 1)));
+  return Success;
+}
 
-  return Result;
+static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 64);
 }
 
-static DecodeStatus DecodeLoadPairExclusiveInstruction(llvm::MCInst &Inst,
-                                                       uint32_t Val,
-                                                       uint64_t Address,
-                                                       const void *Decoder) {
-  unsigned Rt = fieldFromInstruction(Val, 0, 5);
-  unsigned Rn = fieldFromInstruction(Val, 5, 5);
-  unsigned Rt2 = fieldFromInstruction(Val, 10, 5);
-  unsigned MemSize = fieldFromInstruction(Val, 30, 2);
-
-  DecodeStatus S = MCDisassembler::Success;
-  if (Rt == Rt2) S = MCDisassembler::SoftFail;
-
-  switch (MemSize) {
-  case 2:
-    if (!Check(S, DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder)))
-      return MCDisassembler::Fail;
-    if (!Check(S, DecodeGPR32RegisterClass(Inst, Rt2, Address, Decoder)))
-      return MCDisassembler::Fail;
-    break;
-  case 3:
-    if (!Check(S, DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder)))
-      return MCDisassembler::Fail;
-    if (!Check(S, DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder)))
-      return MCDisassembler::Fail;
-    break;
-  default:
-    llvm_unreachable("Invalid MemSize in DecodeLoadPairExclusiveInstruction");
-  }
+static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm | 0x20, 64);
+}
 
-  if (!Check(S, DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder)))
-    return MCDisassembler::Fail;
+static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 32);
+}
 
-  return S;
+static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm | 0x10, 32);
 }
 
-template<typename SomeNamedImmMapper>
-static DecodeStatus DecodeNamedImmOperand(llvm::MCInst &Inst,
-                                          unsigned Val,
-                                          uint64_t Address,
-                                          const void *Decoder) {
-  SomeNamedImmMapper Mapper;
-  bool ValidNamed;
-  Mapper.toString(Val, ValidNamed);
-  if (ValidNamed || Mapper.validImm(Val)) {
-    Inst.addOperand(MCOperand::CreateImm(Val));
-    return MCDisassembler::Success;
-  }
+static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 16);
+}
 
-  return MCDisassembler::Fail;
+static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm | 0x8, 16);
 }
 
-static DecodeStatus DecodeSysRegOperand(const A64SysReg::SysRegMapper &Mapper,
-                                        llvm::MCInst &Inst,
-                                        unsigned Val,
-                                        uint64_t Address,
-                                        const void *Decoder) {
-  bool ValidNamed;
-  Mapper.toString(Val, ValidNamed);
+static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 8);
+}
 
-  Inst.addOperand(MCOperand::CreateImm(Val));
+static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 64);
+}
 
-  return ValidNamed ? MCDisassembler::Success : MCDisassembler::Fail;
+static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 32);
 }
 
-static DecodeStatus DecodeMRSOperand(llvm::MCInst &Inst,
-                                     unsigned Val,
-                                     uint64_t Address,
-                                     const void *Decoder) {
-  return DecodeSysRegOperand(A64SysReg::MRSMapper(), Inst, Val, Address,
-                             Decoder);
+static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 16);
 }
 
-static DecodeStatus DecodeMSROperand(llvm::MCInst &Inst,
-                                     unsigned Val,
-                                     uint64_t Address,
-                                     const void *Decoder) {
-  return DecodeSysRegOperand(A64SysReg::MSRMapper(), Inst, Val, Address,
-                             Decoder);
+static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 8);
 }
 
-static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst,
-                                                   unsigned Insn,
-                                                   uint64_t Address,
+static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn, uint64_t Addr,
                                                    const void *Decoder) {
-  unsigned Rt = fieldFromInstruction(Insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
-  unsigned Imm9 = fieldFromInstruction(Insn, 12, 9);
-
-  unsigned Opc = fieldFromInstruction(Insn, 22, 2);
-  unsigned V = fieldFromInstruction(Insn, 26, 1);
-  unsigned Size = fieldFromInstruction(Insn, 30, 2);
-
-  if (Opc == 0 || (V == 1 && Opc == 2)) {
-    // It's a store, the MCInst gets: Rn_wb, Rt, Rn, Imm
-    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rm = fieldFromInstruction(insn, 16, 5);
+  unsigned shiftHi = fieldFromInstruction(insn, 22, 2);
+  unsigned shiftLo = fieldFromInstruction(insn, 10, 6);
+  unsigned shift = (shiftHi << 6) | shiftLo;
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::ADDWrs:
+  case AArch64::ADDSWrs:
+  case AArch64::SUBWrs:
+  case AArch64::SUBSWrs:
+    // if shift == '11' then ReservedValue()
+    if (shiftHi == 0x3)
+      return Fail;
+    // Deliberate fallthrough
+  case AArch64::ANDWrs:
+  case AArch64::ANDSWrs:
+  case AArch64::BICWrs:
+  case AArch64::BICSWrs:
+  case AArch64::ORRWrs:
+  case AArch64::ORNWrs:
+  case AArch64::EORWrs:
+  case AArch64::EONWrs: {
+    // if sf == '0' and imm6<5> == '1' then ReservedValue()
+    if (shiftLo >> 5 == 1)
+      return Fail;
+    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
   }
-
-  if (V == 0 && (Opc == 2 || Size == 3)) {
-    DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder);
-  } else if (V == 0) {
-    DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder);
-  } else if (V == 1 && (Opc & 2)) {
-    DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
-  } else {
-    switch (Size) {
-    case 0:
-      DecodeFPR8RegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 1:
-      DecodeFPR16RegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 2:
-      DecodeFPR32RegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 3:
-      DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    }
+  case AArch64::ADDXrs:
+  case AArch64::ADDSXrs:
+  case AArch64::SUBXrs:
+  case AArch64::SUBSXrs:
+    // if shift == '11' then ReservedValue()
+    if (shiftHi == 0x3)
+      return Fail;
+    // Deliberate fallthrough
+  case AArch64::ANDXrs:
+  case AArch64::ANDSXrs:
+  case AArch64::BICXrs:
+  case AArch64::BICSXrs:
+  case AArch64::ORRXrs:
+  case AArch64::ORNXrs:
+  case AArch64::EORXrs:
+  case AArch64::EONXrs:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
   }
 
-  if (Opc != 0 && (V != 1 || Opc != 2)) {
-    // It's a load, the MCInst gets: Rt, Rn_wb, Rn, Imm
-    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-  }
+  Inst.addOperand(MCOperand::CreateImm(shift));
+  return Success;
+}
 
-  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned imm = fieldFromInstruction(insn, 5, 16);
+  unsigned shift = fieldFromInstruction(insn, 21, 2);
+  shift <<= 4;
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::MOVZWi:
+  case AArch64::MOVNWi:
+  case AArch64::MOVKWi:
+    if (shift & (1U << 5))
+      return Fail;
+    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    break;
+  case AArch64::MOVZXi:
+  case AArch64::MOVNXi:
+  case AArch64::MOVKXi:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    break;
+  }
 
-  Inst.addOperand(MCOperand::CreateImm(Imm9));
+  if (Inst.getOpcode() == AArch64::MOVKWi ||
+      Inst.getOpcode() == AArch64::MOVKXi)
+    Inst.addOperand(Inst.getOperand(0));
 
-  // N.b. The official documentation says undpredictable if Rt == Rn, but this
-  // takes place at the architectural rather than encoding level:
-  //
-  //  "STR xzr, [sp], #4" is perfectly valid.
-  if (V == 0 && Rt == Rn && Rn != 31)
-    return MCDisassembler::SoftFail;
-  else
-    return MCDisassembler::Success;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  Inst.addOperand(MCOperand::CreateImm(shift));
+  return Success;
 }
 
-static MCDisassembler *createAArch64Disassembler(const Target &T,
-                                                 const MCSubtargetInfo &STI) {
-  return new AArch64Disassembler(STI, T.createMCRegInfo(""));
-}
-
-extern "C" void LLVMInitializeAArch64Disassembler() {
-  TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget,
-                                         createAArch64Disassembler);
-  TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget,
-                                         createAArch64Disassembler);
-}
-
-template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf>
-static DecodeStatus
-DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount,
-                             uint64_t Address, const void *Decoder) {
-  bool IsLSL = false;
-  if (Ext == A64SE::LSL)
-    IsLSL = true;
-  else if (Ext != A64SE::MSL)
-    return MCDisassembler::Fail;
-
-  // MSL and LSLH accepts encoded shift amount 0 or 1.
-  if ((!IsLSL || (IsLSL && IsHalf)) && ShiftAmount != 0 && ShiftAmount != 1)
-    return MCDisassembler::Fail;
-
-  // LSL accepts encoded shift amount 0, 1, 2 or 3.
-  if (IsLSL && ShiftAmount > 3)
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn, uint64_t Addr,
+                                                  const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned offset = fieldFromInstruction(insn, 10, 12);
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::PRFMui:
+    // Rt is an immediate in prefetch.
+    Inst.addOperand(MCOperand::CreateImm(Rt));
+    break;
+  case AArch64::STRBBui:
+  case AArch64::LDRBBui:
+  case AArch64::LDRSBWui:
+  case AArch64::STRHHui:
+  case AArch64::LDRHHui:
+  case AArch64::LDRSHWui:
+  case AArch64::STRWui:
+  case AArch64::LDRWui:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRSBXui:
+  case AArch64::LDRSHXui:
+  case AArch64::LDRSWui:
+  case AArch64::STRXui:
+  case AArch64::LDRXui:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRQui:
+  case AArch64::STRQui:
+    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRDui:
+  case AArch64::STRDui:
+    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRSui:
+  case AArch64::STRSui:
+    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRHui:
+  case AArch64::STRHui:
+    DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRBui:
+  case AArch64::STRBui:
+    DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  }
+
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4))
+    Inst.addOperand(MCOperand::CreateImm(offset));
+  return Success;
 }
 
-// Decode post-index vector load/store instructions.
-// This is necessary as we need to decode Rm: if Rm == 0b11111, the last
-// operand is an immediate equal the the length of vector list in bytes,
-// or Rm is decoded to a GPR64noxzr register.
-static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Insn,
-                                               uint64_t Address,
-                                               const void *Decoder) {
-  unsigned Rt = fieldFromInstruction(Insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
-  unsigned Rm = fieldFromInstruction(Insn, 16, 5);
-  unsigned Opcode = fieldFromInstruction(Insn, 12, 4);
-  unsigned IsLoad = fieldFromInstruction(Insn, 22, 1);
-  // 0 for 64bit vector list, 1 for 128bit vector list
-  unsigned Is128BitVec = fieldFromInstruction(Insn, 30, 1);
+static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  int64_t offset = fieldFromInstruction(insn, 12, 9);
 
-  unsigned NumVecs;
-  switch (Opcode) {
-  case 0: // ld4/st4
-  case 2: // ld1/st1 with 4 vectors
-    NumVecs = 4; break;
-  case 4: // ld3/st3
-  case 6: // ld1/st1 with 3 vectors
-    NumVecs = 3; break;
-  case 7: // ld1/st1 with 1 vector
-    NumVecs = 1; break;
-  case 8: // ld2/st2
-  case 10: // ld1/st1 with 2 vectors
-    NumVecs = 2; break;
+  // offset is a 9-bit signed immediate, so sign extend it to
+  // fill the unsigned.
+  if (offset & (1 << (9 - 1)))
+    offset |= ~((1LL << 9) - 1);
+
+  // First operand is always the writeback to the address register, if needed.
+  switch (Inst.getOpcode()) {
   default:
-    llvm_unreachable("Invalid opcode for post-index load/store instructions");
+    break;
+  case AArch64::LDRSBWpre:
+  case AArch64::LDRSHWpre:
+  case AArch64::STRBBpre:
+  case AArch64::LDRBBpre:
+  case AArch64::STRHHpre:
+  case AArch64::LDRHHpre:
+  case AArch64::STRWpre:
+  case AArch64::LDRWpre:
+  case AArch64::LDRSBWpost:
+  case AArch64::LDRSHWpost:
+  case AArch64::STRBBpost:
+  case AArch64::LDRBBpost:
+  case AArch64::STRHHpost:
+  case AArch64::LDRHHpost:
+  case AArch64::STRWpost:
+  case AArch64::LDRWpost:
+  case AArch64::LDRSBXpre:
+  case AArch64::LDRSHXpre:
+  case AArch64::STRXpre:
+  case AArch64::LDRSWpre:
+  case AArch64::LDRXpre:
+  case AArch64::LDRSBXpost:
+  case AArch64::LDRSHXpost:
+  case AArch64::STRXpost:
+  case AArch64::LDRSWpost:
+  case AArch64::LDRXpost:
+  case AArch64::LDRQpre:
+  case AArch64::STRQpre:
+  case AArch64::LDRQpost:
+  case AArch64::STRQpost:
+  case AArch64::LDRDpre:
+  case AArch64::STRDpre:
+  case AArch64::LDRDpost:
+  case AArch64::STRDpost:
+  case AArch64::LDRSpre:
+  case AArch64::STRSpre:
+  case AArch64::LDRSpost:
+  case AArch64::STRSpost:
+  case AArch64::LDRHpre:
+  case AArch64::STRHpre:
+  case AArch64::LDRHpost:
+  case AArch64::STRHpost:
+  case AArch64::LDRBpre:
+  case AArch64::STRBpre:
+  case AArch64::LDRBpost:
+  case AArch64::STRBpost:
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    break;
   }
 
-  // Decode vector list of 1/2/3/4 vectors for load instructions.
-  if (IsLoad) {
-    switch (NumVecs) {
-    case 1:
-      Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 2:
-      Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 3:
-      Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 4:
-      Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    }
-  }
-
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::PRFUMi:
+    // Rt is an immediate in prefetch.
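+    // For PRFUM the Rt field encodes the prefetch operation (type, target
+    // cache level and retention policy) rather than naming a register, so it
+    // is passed straight through as an immediate operand.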
+    Inst.addOperand(MCOperand::CreateImm(Rt));
+    break;
+  case AArch64::STURBBi:
+  case AArch64::LDURBBi:
+  case AArch64::LDURSBWi:
+  case AArch64::STURHHi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURSHWi:
+  case AArch64::STURWi:
+  case AArch64::LDURWi:
+  case AArch64::LDTRSBWi:
+  case AArch64::LDTRSHWi:
+  case AArch64::STTRWi:
+  case AArch64::LDTRWi:
+  case AArch64::STTRHi:
+  case AArch64::LDTRHi:
+  case AArch64::LDTRBi:
+  case AArch64::STTRBi:
+  case AArch64::LDRSBWpre:
+  case AArch64::LDRSHWpre:
+  case AArch64::STRBBpre:
+  case AArch64::LDRBBpre:
+  case AArch64::STRHHpre:
+  case AArch64::LDRHHpre:
+  case AArch64::STRWpre:
+  case AArch64::LDRWpre:
+  case AArch64::LDRSBWpost:
+  case AArch64::LDRSHWpost:
+  case AArch64::STRBBpost:
+  case AArch64::LDRBBpost:
+  case AArch64::STRHHpost:
+  case AArch64::LDRHHpost:
+  case AArch64::STRWpost:
+  case AArch64::LDRWpost:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURSBXi:
+  case AArch64::LDURSHXi:
+  case AArch64::LDURSWi:
+  case AArch64::STURXi:
+  case AArch64::LDURXi:
+  case AArch64::LDTRSBXi:
+  case AArch64::LDTRSHXi:
+  case AArch64::LDTRSWi:
+  case AArch64::STTRXi:
+  case AArch64::LDTRXi:
+  case AArch64::LDRSBXpre:
+  case AArch64::LDRSHXpre:
+  case AArch64::STRXpre:
+  case AArch64::LDRSWpre:
+  case AArch64::LDRXpre:
+  case AArch64::LDRSBXpost:
+  case AArch64::LDRSHXpost:
+  case AArch64::STRXpost:
+  case AArch64::LDRSWpost:
+  case AArch64::LDRXpost:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURQi:
+  case AArch64::STURQi:
+  case AArch64::LDRQpre:
+  case AArch64::STRQpre:
+  case AArch64::LDRQpost:
+  case AArch64::STRQpost:
+    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURDi:
+  case AArch64::STURDi:
+  case AArch64::LDRDpre:
+  case AArch64::STRDpre:
+  case AArch64::LDRDpost:
+  case AArch64::STRDpost:
+    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURSi:
+  case AArch64::STURSi:
+  case AArch64::LDRSpre:
+  case AArch64::STRSpre:
+  case AArch64::LDRSpost:
+  case AArch64::STRSpost:
+    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURHi:
+  case AArch64::STURHi:
+  case AArch64::LDRHpre:
+  case AArch64::STRHpre:
+  case AArch64::LDRHpost:
+  case AArch64::STRHpost:
+    DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURBi:
+  case AArch64::STURBi:
+  case AArch64::LDRBpre:
+  case AArch64::STRBpre:
+  case AArch64::LDRBpost:
+  case AArch64::STRBpost:
+    DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
  }
 
-  // Decode write back register, which is equal to Rn.
-  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-
-  if (Rm == 31) // If Rm is 0x11111, add the vector list length in byte
-    Inst.addOperand(MCOperand::CreateImm(NumVecs * (Is128BitVec ? 16 : 8)));
-  else // Decode Rm
-    DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
-
-  // Decode vector list of 1/2/3/4 vectors for load instructions.
-  if (!IsLoad) {
-    switch (NumVecs) {
-    case 1:
-      Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 2:
-      Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 3:
-      Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 4:
-      Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    }
-  }
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  Inst.addOperand(MCOperand::CreateImm(offset));
 
-  return MCDisassembler::Success;
+  bool IsLoad = fieldFromInstruction(insn, 22, 1);
+  bool IsIndexed = fieldFromInstruction(insn, 10, 2) != 0;
+  bool IsFP = fieldFromInstruction(insn, 26, 1);
+
+  // Cannot write back to a transfer register (but xzr != sp).
+  if (IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn)
+    return SoftFail;
+
+  return Success;
 }
 
-// Decode post-index vector load/store lane instructions.
-// This is necessary as we need to decode Rm: if Rm == 0b11111, the last
-// operand is an immediate equal the the length of the changed bytes,
-// or Rm is decoded to a GPR64noxzr register.
-static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
-                                                   uint64_t Address,
+static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn, uint64_t Addr,
                                                    const void *Decoder) {
-  bool Is64bitVec = false;
-  bool IsLoadDup = false;
-  bool IsLoad = false;
-  // The total number of bytes transferred.
-  // TransferBytes = NumVecs * OneLaneBytes
-  unsigned TransferBytes = 0;
-  unsigned NumVecs = 0;
-  unsigned Opc = Inst.getOpcode();
-  switch (Opc) {
-  case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register:
-  case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register:
-  case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register:
-  case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: {
-    switch (Opc) {
-    case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register:
-      TransferBytes = 1; break;
-    case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register:
-      TransferBytes = 2; break;
-    case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register:
-      TransferBytes = 4; break;
-    case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register:
-      TransferBytes = 8; break;
-    }
-    Is64bitVec = true;
-    IsLoadDup = true;
-    NumVecs = 1;
-    break;
-  }
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
+  unsigned Rs = fieldFromInstruction(insn, 16, 5);
 
-  case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register:
-  case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register:
-  case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register:
-  case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: {
-    switch (Opc) {
-    case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register:
-      TransferBytes = 1; break;
-    case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register:
-      TransferBytes = 2; break;
-    case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register:
-      TransferBytes = 4; break;
-    case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register:
-      TransferBytes = 8; break;
-    }
-    IsLoadDup = true;
-    NumVecs = 1;
+  unsigned Opcode = Inst.getOpcode();
+  switch (Opcode) {
+  default:
+    return Fail;
+  case AArch64::STLXRW:
+  case AArch64::STLXRB:
+  case AArch64::STLXRH:
+  case AArch64::STXRW:
+  case AArch64::STXRB:
+  case AArch64::STXRH:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+    // FALLTHROUGH
+  case AArch64::LDARW:
+  case AArch64::LDARB:
+  case AArch64::LDARH:
+  case AArch64::LDAXRW:
+  case AArch64::LDAXRB:
+  case AArch64::LDAXRH:
+  case AArch64::LDXRW:
+  case AArch64::LDXRB:
+  case AArch64::LDXRH:
+  case AArch64::STLRW:
+  case AArch64::STLRB:
+  case AArch64::STLRH:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
     break;
-  }
-
-  case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register:
-  case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register:
-  case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register:
-  case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: {
-    switch (Opc) {
-    case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register:
-      TransferBytes = 2; break;
-    case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register:
-      TransferBytes = 4; break;
-    case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register:
-      TransferBytes = 8; break;
-    case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register:
-      TransferBytes = 16; break;
-    }
-    Is64bitVec = true;
-    IsLoadDup = true;
-    NumVecs = 2;
+  case AArch64::STLXRX:
+  case AArch64::STXRX:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+    // FALLTHROUGH
+  case AArch64::LDARX:
+  case AArch64::LDAXRX:
+  case AArch64::LDXRX:
+  case AArch64::STLRX:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
     break;
-  }
-
-  case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register:
-  case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register:
-  case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register:
-  case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: {
-    switch (Opc) {
-    case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register:
-      TransferBytes = 2; break;
-    case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register:
-      TransferBytes = 4; break;
-    case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register:
-      TransferBytes = 8; break;
-    case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register:
-      TransferBytes = 16; break;
-    }
-    IsLoadDup = true;
-    NumVecs = 2;
+  case AArch64::STLXPW:
+  case AArch64::STXPW:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+    // FALLTHROUGH
+  case AArch64::LDAXPW:
+  case AArch64::LDXPW:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
     break;
-  }
-
-  case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register:
-  case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register:
-  case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
-  case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register: {
-    switch (Opc) {
-    case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register:
-      TransferBytes = 3; break;
-    case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register:
-      TransferBytes = 6; break;
-    case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
-      TransferBytes = 12; break;
-    case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register:
-      TransferBytes = 24; break;
-    }
-    Is64bitVec = true;
-    IsLoadDup = true;
-    NumVecs = 3;
+  case AArch64::STLXPX:
+  case AArch64::STXPX:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+    // FALLTHROUGH
+  case AArch64::LDAXPX:
+  case AArch64::LDXPX:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
     break;
   }
 
-  case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
-  case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_8H_register:
-  case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_4S_register:
-  case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: {
-    switch (Opc) {
-    case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
-      TransferBytes = 3;
break; - case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_8H_register: - TransferBytes = 6; break; - case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_4S_register: - TransferBytes = 12; break; - case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: - TransferBytes = 24; break; - } - IsLoadDup = true; - NumVecs = 3; - break; - } + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register: - case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register: - case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register: - case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: { - switch (Opc) { - case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register: - TransferBytes = 4; break; - case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register: - TransferBytes = 8; break; - case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register: - TransferBytes = 16; break; - case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: - TransferBytes = 32; break; - } - Is64bitVec = true; - IsLoadDup = true; - NumVecs = 4; - break; - } + // You shouldn't load to the same register twice in an instruction... + if ((Opcode == AArch64::LDAXPW || Opcode == AArch64::LDXPW || + Opcode == AArch64::LDAXPX || Opcode == AArch64::LDXPX) && + Rt == Rt2) + return SoftFail; - case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register: - case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_8H_register: - case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_4S_register: - case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: { - switch (Opc) { - case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register: - TransferBytes = 4; break; - case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_8H_register: - TransferBytes = 8; break; - case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_4S_register: - TransferBytes = 16; break; - case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: - TransferBytes = 32; break; - } - IsLoadDup = true; - NumVecs = 4; - break; - } + return Success; +} - case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register: - case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register: - case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register: - case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: { - switch (Opc) { - case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register: - TransferBytes = 1; break; - case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register: - TransferBytes = 2; break; - case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register: - TransferBytes = 4; break; - case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: - TransferBytes = 8; break; - } - IsLoad = true; - NumVecs = 1; - break; - } +static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rt2 = fieldFromInstruction(insn, 10, 5); + int64_t offset = fieldFromInstruction(insn, 15, 7); + bool IsLoad = fieldFromInstruction(insn, 22, 1); - case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register: - case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register: - case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register: - case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: { - switch 
(Opc) { - case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register: - TransferBytes = 2; break; - case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register: - TransferBytes = 4; break; - case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register: - TransferBytes = 8; break; - case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: - TransferBytes = 16; break; - } - IsLoad = true; - NumVecs = 2; - break; - } + // offset is a 7-bit signed immediate, so sign-extend it to fill the + // full-width int64_t value. + if (offset & (1 << (7 - 1))) + offset |= ~((1LL << 7) - 1); - case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register: - case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register: - case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register: - case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: { - switch (Opc) { - case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register: - TransferBytes = 3; break; - case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register: - TransferBytes = 6; break; - case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register: - TransferBytes = 12; break; - case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: - TransferBytes = 24; break; - } - IsLoad = true; - NumVecs = 3; - break; - } + unsigned Opcode = Inst.getOpcode(); + bool NeedsDisjointWritebackTransfer = false; - case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register: - case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register: - case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register: - case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: { - switch (Opc) { - case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register: - TransferBytes = 4; break; - case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register: - TransferBytes = 8; break; - case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register: - TransferBytes = 16; break; - case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: - TransferBytes = 32; break; - } - IsLoad = true; - NumVecs = 4; + // First operand is always writeback of base register. 
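+ // Only the pre- and post-indexed pair opcodes below carry that operand; + // the plain and non-temporal (LDNP/STNP) forms have no writeback.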
+ switch (Opcode) { + default: break; - } - - case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register: - case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register: - case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register: - case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: { - switch (Opc) { - case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register: - TransferBytes = 1; break; - case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register: - TransferBytes = 2; break; - case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register: - TransferBytes = 4; break; - case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: - TransferBytes = 8; break; - } - NumVecs = 1; + case AArch64::LDPXpost: + case AArch64::STPXpost: + case AArch64::LDPSWpost: + case AArch64::LDPXpre: + case AArch64::STPXpre: + case AArch64::LDPSWpre: + case AArch64::LDPWpost: + case AArch64::STPWpost: + case AArch64::LDPWpre: + case AArch64::STPWpre: + case AArch64::LDPQpost: + case AArch64::STPQpost: + case AArch64::LDPQpre: + case AArch64::STPQpre: + case AArch64::LDPDpost: + case AArch64::STPDpost: + case AArch64::LDPDpre: + case AArch64::STPDpre: + case AArch64::LDPSpost: + case AArch64::STPSpost: + case AArch64::LDPSpre: + case AArch64::STPSpre: + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); break; } - case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register: - case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register: - case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register: - case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: { - switch (Opc) { - case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register: - TransferBytes = 2; break; - case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register: - TransferBytes = 4; break; - case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register: - TransferBytes = 8; break; - case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: - TransferBytes = 16; break; - } - NumVecs = 2; + switch (Opcode) { + default: + return Fail; + case AArch64::LDPXpost: + case AArch64::STPXpost: + case AArch64::LDPSWpost: + case AArch64::LDPXpre: + case AArch64::STPXpre: + case AArch64::LDPSWpre: + NeedsDisjointWritebackTransfer = true; + // Fallthrough + case AArch64::LDNPXi: + case AArch64::STNPXi: + case AArch64::LDPXi: + case AArch64::STPXi: + case AArch64::LDPSWi: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); break; - } - - case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register: - case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register: - case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register: - case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: { - switch (Opc) { - case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register: - TransferBytes = 3; break; - case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register: - TransferBytes = 6; break; - case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register: - TransferBytes = 12; break; - case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: - TransferBytes = 24; break; - } - NumVecs = 3; + case AArch64::LDPWpost: + case AArch64::STPWpost: + case AArch64::LDPWpre: + case AArch64::STPWpre: + NeedsDisjointWritebackTransfer = true; + // Fallthrough + case AArch64::LDNPWi: + case AArch64::STNPWi: + case AArch64::LDPWi: + case AArch64::STPWi: + 
DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); break; - } - - case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register: - case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register: - case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register: - case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: { - switch (Opc) { - case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register: - TransferBytes = 4; break; - case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register: - TransferBytes = 8; break; - case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register: - TransferBytes = 16; break; - case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: - TransferBytes = 32; break; - } - NumVecs = 4; + case AArch64::LDNPQi: + case AArch64::STNPQi: + case AArch64::LDPQpost: + case AArch64::STPQpost: + case AArch64::LDPQi: + case AArch64::STPQi: + case AArch64::LDPQpre: + case AArch64::STPQpre: + DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDNPDi: + case AArch64::STNPDi: + case AArch64::LDPDpost: + case AArch64::STPDpost: + case AArch64::LDPDi: + case AArch64::STPDi: + case AArch64::LDPDpre: + case AArch64::STPDpre: + DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDNPSi: + case AArch64::STNPSi: + case AArch64::LDPSpost: + case AArch64::STPSpost: + case AArch64::LDPSi: + case AArch64::STPSi: + case AArch64::LDPSpre: + case AArch64::STPSpre: + DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder); break; } - default: - return MCDisassembler::Fail; - } // End of switch (Opc) + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + Inst.addOperand(MCOperand::CreateImm(offset)); - unsigned Rt = fieldFromInstruction(Insn, 0, 5); - unsigned Rn = fieldFromInstruction(Insn, 5, 5); - unsigned Rm = fieldFromInstruction(Insn, 16, 5); - - // Decode post-index of load duplicate lane - if (IsLoadDup) { - switch (NumVecs) { - case 1: - Is64bitVec ? DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder) - : DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); - break; - case 2: - Is64bitVec ? DecodeDPairRegisterClass(Inst, Rt, Address, Decoder) - : DecodeQPairRegisterClass(Inst, Rt, Address, Decoder); - break; - case 3: - Is64bitVec ? DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder) - : DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder); - break; - case 4: - Is64bitVec ? DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder) - : DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder); - } - - // Decode write back register, which is equal to Rn. - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); - - if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes - Inst.addOperand(MCOperand::CreateImm(TransferBytes)); - else // Decode Rm - DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder); - - return MCDisassembler::Success; - } + // You shouldn't load to the same register twice in an instruction... + if (IsLoad && Rt == Rt2) + return SoftFail; - // Decode post-index of load/store lane - // Loads have a vector list as output. 
- if (IsLoad) { - switch (NumVecs) { - case 1: - DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); - break; - case 2: - DecodeQPairRegisterClass(Inst, Rt, Address, Decoder); - break; - case 3: - DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder); - break; - case 4: - DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder); - } - } + // ... or do any operation that writes-back to a transfer register. But note + // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different. + if (NeedsDisjointWritebackTransfer && Rn != 31 && (Rt == Rn || Rt2 == Rn)) + return SoftFail; + + return Success; +} - // Decode write back register, which is equal to Rn. - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); - DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); +static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rm = fieldFromInstruction(insn, 16, 5); + unsigned extend = fieldFromInstruction(insn, 10, 6); - if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes - Inst.addOperand(MCOperand::CreateImm(TransferBytes)); - else // Decode Rm - DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder); + unsigned shift = extend & 0x7; + if (shift > 4) + return Fail; - // Decode the source vector list. - switch (NumVecs) { - case 1: - DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); - break; - case 2: - DecodeQPairRegisterClass(Inst, Rt, Address, Decoder); + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::ADDWrx: + case AArch64::SUBWrx: + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); break; - case 3: - DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder); + case AArch64::ADDSWrx: + case AArch64::SUBSWrx: + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); break; - case 4: - DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder); - } - - // Decode lane - unsigned Q = fieldFromInstruction(Insn, 30, 1); - unsigned S = fieldFromInstruction(Insn, 10, 3); - unsigned lane = 0; - // Calculate the number of lanes by number of vectors and transferred bytes. - // NumLanes = 16 bytes / bytes of each lane - unsigned NumLanes = 16 / (TransferBytes / NumVecs); - switch (NumLanes) { - case 16: // A vector has 16 lanes, each lane is 1 bytes. 
- lane = (Q << 3) | S; + case AArch64::ADDXrx: + case AArch64::SUBXrx: + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); break; - case 8: - lane = (Q << 2) | (S >> 1); + case AArch64::ADDSXrx: + case AArch64::SUBSXrx: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); break; - case 4: - lane = (Q << 1) | (S >> 2); + case AArch64::ADDXrx64: + case AArch64::SUBXrx64: + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); break; - case 2: - lane = Q; + case AArch64::SUBSXrx64: + case AArch64::ADDSXrx64: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); break; } - Inst.addOperand(MCOperand::CreateImm(lane)); - return MCDisassembler::Success; + Inst.addOperand(MCOperand::CreateImm(extend)); + return Success; } -static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(Insn, 0, 5); - unsigned Rn = fieldFromInstruction(Insn, 5, 5); - unsigned size = fieldFromInstruction(Insn, 22, 2); - unsigned Q = fieldFromInstruction(Insn, 30, 1); +static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Datasize = fieldFromInstruction(insn, 31, 1); + unsigned imm; + + if (Datasize) { + if (Inst.getOpcode() == AArch64::ANDSXri) + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); + imm = fieldFromInstruction(insn, 10, 13); + if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64)) + return Fail; + } else { + if (Inst.getOpcode() == AArch64::ANDSWri) + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); + imm = fieldFromInstruction(insn, 10, 12); + if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 32)) + return Fail; + } + Inst.addOperand(MCOperand::CreateImm(imm)); + return Success; +} - DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder); +static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned cmode = fieldFromInstruction(insn, 12, 4); + unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; + imm |= fieldFromInstruction(insn, 5, 5); - if(Q) - DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder); + if (Inst.getOpcode() == AArch64::MOVID) + DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder); else - DecodeFPR64RegisterClass(Inst, Rn, Address, Decoder); + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + + Inst.addOperand(MCOperand::CreateImm(imm)); - switch (size) { - case 0: - Inst.addOperand(MCOperand::CreateImm(8)); + switch (Inst.getOpcode()) { + default: break; - case 1: - Inst.addOperand(MCOperand::CreateImm(16)); + case AArch64::MOVIv4i16: + case AArch64::MOVIv8i16: + case AArch64::MVNIv4i16: + case AArch64::MVNIv8i16: + case AArch64::MOVIv2i32: 
+ case AArch64::MOVIv4i32: + case AArch64::MVNIv2i32: + case AArch64::MVNIv4i32: + Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); break; - case 2: - Inst.addOperand(MCOperand::CreateImm(32)); + case AArch64::MOVIv2s_msl: + case AArch64::MOVIv4s_msl: + case AArch64::MVNIv2s_msl: + case AArch64::MVNIv4s_msl: + Inst.addOperand(MCOperand::CreateImm(cmode & 1 ? 0x110 : 0x108)); break; - default : - return MCDisassembler::Fail; } - return MCDisassembler::Success; + + return Success; } +static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned cmode = fieldFromInstruction(insn, 12, 4); + unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; + imm |= fieldFromInstruction(insn, 5, 5); + + // Tied operands added twice. + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + + Inst.addOperand(MCOperand::CreateImm(imm)); + Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); + + return Success; +} + +static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + int64_t imm = fieldFromInstruction(insn, 5, 19) << 2; + imm |= fieldFromInstruction(insn, 29, 2); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend the 21-bit immediate. + if (imm & (1 << (21 - 1))) + imm |= ~((1LL << 21) - 1); + + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(imm)); + + return Success; +} + +static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Imm = fieldFromInstruction(insn, 10, 14); + unsigned S = fieldFromInstruction(insn, 29, 1); + unsigned Datasize = fieldFromInstruction(insn, 31, 1); + + unsigned ShifterVal = (Imm >> 12) & 3; + unsigned ImmVal = Imm & 0xFFF; + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + if (ShifterVal != 0 && ShifterVal != 1) + return Fail; + + if (Datasize) { + if (Rd == 31 && !S) + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + } else { + if (Rd == 31 && !S) + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + } + + if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(ImmVal)); + Inst.addOperand(MCOperand::CreateImm(12 * ShifterVal)); + return Success; +} + +static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + int64_t imm = fieldFromInstruction(insn, 0, 26); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend the 26-bit immediate. 
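+ // (Bit 25 is the sign bit: a raw field of 0x3FFFFFF extends to -1, while + // 0x1FFFFFF has bit 25 clear and is left unchanged.)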
+ if (imm & (1 << (26 - 1))) + imm |= ~((1LL << 26) - 1); + + if (!Dis->tryAddingSymbolicOperand(Inst, imm << 2, Addr, true, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(imm)); + + return Success; +} + +static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + uint64_t op1 = fieldFromInstruction(insn, 16, 3); + uint64_t op2 = fieldFromInstruction(insn, 5, 3); + uint64_t crm = fieldFromInstruction(insn, 8, 4); + + uint64_t pstate_field = (op1 << 3) | op2; + + Inst.addOperand(MCOperand::CreateImm(pstate_field)); + Inst.addOperand(MCOperand::CreateImm(crm)); + + bool ValidNamed; + (void)AArch64PState::PStateMapper().toString(pstate_field, ValidNamed); + + return ValidNamed ? Success : Fail; +} + +static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + uint64_t Rt = fieldFromInstruction(insn, 0, 5); + uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5; + bit |= fieldFromInstruction(insn, 19, 5); + int64_t dst = fieldFromInstruction(insn, 5, 14); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend 14-bit immediate. + if (dst & (1 << (14 - 1))) + dst |= ~((1LL << 14) - 1); + + if (fieldFromInstruction(insn, 31, 1) == 0) + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + else + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + Inst.addOperand(MCOperand::CreateImm(bit)); + if (!Dis->tryAddingSymbolicOperand(Inst, dst << 2, Addr, true, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(dst)); + + return Success; +} diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h new file mode 100644 index 0000000..68d4867 --- /dev/null +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -0,0 +1,40 @@ +//===- AArch64Disassembler.h - Disassembler for AArch64 ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64DISASSEMBLER_H +#define AArch64DISASSEMBLER_H + +#include "llvm/MC/MCDisassembler.h" + +namespace llvm { + +class MCInst; +class MemoryObject; +class raw_ostream; + +class AArch64Disassembler : public MCDisassembler { +public: + AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx) + : MCDisassembler(STI, Ctx) {} + + ~AArch64Disassembler() {} + + /// getInstruction - See MCDisassembler. + MCDisassembler::DecodeStatus + getInstruction(MCInst &instr, uint64_t &size, const MemoryObject &region, + uint64_t address, raw_ostream &vStream, + raw_ostream &cStream) const override; +}; + +} // namespace llvm + +#endif diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp new file mode 100644 index 0000000..2466368 --- /dev/null +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp @@ -0,0 +1,221 @@ +//===- AArch64ExternalSymbolizer.cpp - Symbolizer for AArch64 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64ExternalSymbolizer.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-disassembler" + +static MCSymbolRefExpr::VariantKind +getVariant(uint64_t LLVMDisassembler_VariantKind) { + switch (LLVMDisassembler_VariantKind) { + case LLVMDisassembler_VariantKind_None: + return MCSymbolRefExpr::VK_None; + case LLVMDisassembler_VariantKind_ARM64_PAGE: + return MCSymbolRefExpr::VK_PAGE; + case LLVMDisassembler_VariantKind_ARM64_PAGEOFF: + return MCSymbolRefExpr::VK_PAGEOFF; + case LLVMDisassembler_VariantKind_ARM64_GOTPAGE: + return MCSymbolRefExpr::VK_GOTPAGE; + case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF: + return MCSymbolRefExpr::VK_GOTPAGEOFF; + case LLVMDisassembler_VariantKind_ARM64_TLVP: + case LLVMDisassembler_VariantKind_ARM64_TLVOFF: + default: + assert(0 && "bad LLVMDisassembler_VariantKind"); + return MCSymbolRefExpr::VK_None; + } +} + +/// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic +/// operand in place of the immediate Value in the MCInst. The immediate +/// Value has not had any PC adjustment made by the caller. If the instruction +/// is a branch that adds the PC to the immediate Value then isBranch is +/// true, else false. If GetOpInfo is non-null, then it is called to get any +/// symbolic information at the Address for this instruction. If that returns +/// non-zero then the symbolic information it returns is used to create an +/// MCExpr and that is added as an operand to the MCInst. If GetOpInfo() +/// returns zero and isBranch is true then a symbol look up for +/// Address + Value is done and if a symbol is found an MCExpr is created with +/// that, else an MCExpr with Address + Value is created. If GetOpInfo() +/// returns zero and isBranch is false then the Opcode of the MCInst is +/// tested, and for ADRP and other instructions that help to load pointers +/// a symbol look up is done to see if it returns a specific reference type +/// to add to the comment stream. This function returns true if it adds +/// an operand to the MCInst and false otherwise. +bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand( + MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address, + bool IsBranch, uint64_t Offset, uint64_t InstSize) { + // FIXME: This method shares a lot of code with + // MCExternalSymbolizer::tryAddingSymbolicOperand. It may be possible to + // refactor the MCExternalSymbolizer interface to allow more of this + // implementation to be shared. 
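+ // In outline: prefer whatever GetOpInfo reports; otherwise try a symbol + // lookup for branches and for ADRP/ADR/ADDXri/LDRXui/LDRXl, then fold the + // add/subtract symbols and any constant offset into a single MCExpr.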
+ // + struct LLVMOpInfo1 SymbolicOp; + memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); + SymbolicOp.Value = Value; + uint64_t ReferenceType; + const char *ReferenceName; + if (!GetOpInfo || + !GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) { + if (IsBranch) { + ReferenceType = LLVMDisassembler_ReferenceType_In_Branch; + const char *Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, + Address, &ReferenceName); + if (Name) { + SymbolicOp.AddSymbol.Name = Name; + SymbolicOp.AddSymbol.Present = true; + SymbolicOp.Value = 0; + } else { + SymbolicOp.Value = Address + Value; + } + if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub) + CommentStream << "symbol stub for: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Message) + CommentStream << "Objc message: " << ReferenceName; + } else if (MI.getOpcode() == AArch64::ADRP) { + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP; + // otool expects the fully encoded ADRP instruction to be passed in as + // the value here, so reconstruct it: + const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo(); + uint32_t EncodedInst = 0x90000000; + EncodedInst |= (Value & 0x3) << 29; // immlo + EncodedInst |= ((Value >> 2) & 0x7FFFF) << 5; // immhi + EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // reg + SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address, + &ReferenceName); + CommentStream << format("0x%llx", + 0xfffffffffffff000LL & (Address + Value)); + } else if (MI.getOpcode() == AArch64::ADDXri || + MI.getOpcode() == AArch64::LDRXui || + MI.getOpcode() == AArch64::LDRXl || + MI.getOpcode() == AArch64::ADR) { + if (MI.getOpcode() == AArch64::ADDXri) + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri; + else if (MI.getOpcode() == AArch64::LDRXui) + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui; + if (MI.getOpcode() == AArch64::LDRXl) { + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl; + SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, + &ReferenceName); + } else if (MI.getOpcode() == AArch64::ADR) { + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR; + SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, + &ReferenceName); + } else { + const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo(); + // otool expects the fully encoded ADD/LDR instruction to be passed in + // as the value here, so reconstruct it: + unsigned EncodedInst = + MI.getOpcode() == AArch64::ADDXri ? 
0x91000000: 0xF9400000; + EncodedInst |= Value << 10; // imm12 [+ shift:2 for ADD] + EncodedInst |= + MCRI.getEncodingValue(MI.getOperand(1).getReg()) << 5; // Rn + EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // Rd + + SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address, + &ReferenceName); + } + if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr) + CommentStream << "literal pool symbol address: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) + CommentStream << "literal pool for: \"" << ReferenceName << "\""; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref) + CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\""; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Message) + CommentStream << "Objc message: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref) + CommentStream << "Objc message ref: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref) + CommentStream << "Objc selector ref: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref) + CommentStream << "Objc class ref: " << ReferenceName; + // For these instructions, the SymbolLookUp() above is just to get the + // ReferenceType and ReferenceName. We want to make sure not to + // fall through so we don't build an MCExpr to leave the disassembly + // of the immediate values of these instructions to the InstPrinter. + return false; + } else { + return false; + } + } + + const MCExpr *Add = nullptr; + if (SymbolicOp.AddSymbol.Present) { + if (SymbolicOp.AddSymbol.Name) { + StringRef Name(SymbolicOp.AddSymbol.Name); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name); + MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind); + if (Variant != MCSymbolRefExpr::VK_None) + Add = MCSymbolRefExpr::Create(Sym, Variant, Ctx); + else + Add = MCSymbolRefExpr::Create(Sym, Ctx); + } else { + Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, Ctx); + } + } + + const MCExpr *Sub = nullptr; + if (SymbolicOp.SubtractSymbol.Present) { + if (SymbolicOp.SubtractSymbol.Name) { + StringRef Name(SymbolicOp.SubtractSymbol.Name); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name); + Sub = MCSymbolRefExpr::Create(Sym, Ctx); + } else { + Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, Ctx); + } + } + + const MCExpr *Off = nullptr; + if (SymbolicOp.Value != 0) + Off = MCConstantExpr::Create(SymbolicOp.Value, Ctx); + + const MCExpr *Expr; + if (Sub) { + const MCExpr *LHS; + if (Add) + LHS = MCBinaryExpr::CreateSub(Add, Sub, Ctx); + else + LHS = MCUnaryExpr::CreateMinus(Sub, Ctx); + if (Off) + Expr = MCBinaryExpr::CreateAdd(LHS, Off, Ctx); + else + Expr = LHS; + } else if (Add) { + if (Off) + Expr = MCBinaryExpr::CreateAdd(Add, Off, Ctx); + else + Expr = Add; + } else { + if (Off) + Expr = Off; + else + Expr = MCConstantExpr::Create(0, Ctx); + } + + MI.addOperand(MCOperand::CreateExpr(Expr)); + + return true; +} diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h new file mode 100644 index 0000000..171d31c --- /dev/null +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h @@ -0,0 +1,38 @@ +//===- AArch64ExternalSymbolizer.h - Symbolizer for AArch64 -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is 
distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Symbolize AArch64 assembly code during disassembly using callbacks. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64EXTERNALSYMBOLIZER_H +#define AArch64EXTERNALSYMBOLIZER_H + +#include "llvm/MC/MCExternalSymbolizer.h" + +namespace llvm { + +class AArch64ExternalSymbolizer : public MCExternalSymbolizer { +public: + AArch64ExternalSymbolizer(MCContext &Ctx, + std::unique_ptr<MCRelocationInfo> RelInfo, + LLVMOpInfoCallback GetOpInfo, + LLVMSymbolLookupCallback SymbolLookUp, + void *DisInfo) + : MCExternalSymbolizer(Ctx, std::move(RelInfo), GetOpInfo, SymbolLookUp, + DisInfo) {} + + bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream, + int64_t Value, uint64_t Address, bool IsBranch, + uint64_t Offset, uint64_t InstSize) override; +}; + +} // namespace llvm + +#endif diff --git a/lib/Target/AArch64/Disassembler/Android.mk b/lib/Target/AArch64/Disassembler/Android.mk index fcc53ad..b89538d 100644 --- a/lib/Target/AArch64/Disassembler/Android.mk +++ b/lib/Target/AArch64/Disassembler/Android.mk @@ -7,7 +7,8 @@ arm64_disassembler_TBLGEN_TABLES := \ AArch64GenRegisterInfo.inc arm64_disassembler_SRC_FILES := \ - AArch64Disassembler.cpp + AArch64Disassembler.cpp \ + AArch64ExternalSymbolizer.cpp # For the device # ===================================================== diff --git a/lib/Target/AArch64/Disassembler/CMakeLists.txt b/lib/Target/AArch64/Disassembler/CMakeLists.txt index 21baf25..be4ccad 100644 --- a/lib/Target/AArch64/Disassembler/CMakeLists.txt +++ b/lib/Target/AArch64/Disassembler/CMakeLists.txt @@ -1,3 +1,14 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + add_llvm_library(LLVMAArch64Disassembler AArch64Disassembler.cpp + AArch64ExternalSymbolizer.cpp ) +# workaround for hanging compilation on MSVC8, 9 and 10 +#if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) +#set_property( +# SOURCE ARMDisassembler.cpp +# PROPERTY COMPILE_FLAGS "/Od" +# ) +#endif() +add_dependencies(LLVMAArch64Disassembler AArch64CommonTableGen) diff --git a/lib/Target/AArch64/Disassembler/LLVMBuild.txt b/lib/Target/AArch64/Disassembler/LLVMBuild.txt index 05c4ed1..a4224f4 100644 --- a/lib/Target/AArch64/Disassembler/LLVMBuild.txt +++ b/lib/Target/AArch64/Disassembler/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AArch64/Disassembler/LLVMBuild.txt ----------*- Conf -*--===; +;===- ./lib/Target/AArch64/Disassembler/LLVMBuild.txt ------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; diff --git a/lib/Target/AArch64/Disassembler/Makefile b/lib/Target/AArch64/Disassembler/Makefile index 5c86120..741bb81 100644 --- a/lib/Target/AArch64/Disassembler/Makefile +++ b/lib/Target/AArch64/Disassembler/Makefile @@ -10,7 +10,7 @@ LEVEL = ../../../.. LIBRARYNAME = LLVMAArch64Disassembler -# Hack: we need to include 'main' target directory to grab private headers +# Hack: we need to include 'main' arm target directory to grab private headers CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index fd3f009..f484a5b 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -11,529 +11,1306 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "AArch64InstPrinter.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" -#include "llvm/MC/MCExpr.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" - using namespace llvm; +#define DEBUG_TYPE "asm-printer" + #define GET_INSTRUCTION_NAME #define PRINT_ALIAS_INSTR #include "AArch64GenAsmWriter.inc" - -static int64_t unpackSignedImm(int BitWidth, uint64_t Value) { - assert(!(Value & ~((1ULL << BitWidth)-1)) && "immediate not n-bit"); - if (Value & (1ULL << (BitWidth - 1))) - return static_cast(Value) - (1LL << BitWidth); - else - return Value; -} +#define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "AArch64GenAsmWriter1.inc" AArch64InstPrinter::AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) : - MCInstPrinter(MAI, MII, MRI) { + const MCSubtargetInfo &STI) + : MCInstPrinter(MAI, MII, MRI) { // Initialize the set of available features. setAvailableFeatures(STI.getFeatureBits()); } +AArch64AppleInstPrinter::AArch64AppleInstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) + : AArch64InstPrinter(MAI, MII, MRI, STI) {} + void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + // This is for .cfi directives. OS << getRegisterName(RegNo); } -void -AArch64InstPrinter::printOffsetSImm9Operand(const MCInst *MI, - unsigned OpNum, raw_ostream &O) { - const MCOperand &MOImm = MI->getOperand(OpNum); - int32_t Imm = unpackSignedImm(9, MOImm.getImm()); - - O << '#' << Imm; -} - -void -AArch64InstPrinter::printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, unsigned MemSize, - unsigned RmSize) { - unsigned ExtImm = MI->getOperand(OpNum).getImm(); - unsigned OptionHi = ExtImm >> 1; - unsigned S = ExtImm & 1; - bool IsLSL = OptionHi == 1 && RmSize == 64; - - const char *Ext; - switch (OptionHi) { - case 1: - Ext = (RmSize == 32) ? "uxtw" : "lsl"; - break; - case 3: - Ext = (RmSize == 32) ? "sxtw" : "sxtx"; - break; - default: - llvm_unreachable("Incorrect Option on load/store (reg offset)"); - } - O << Ext; +void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot) { + // Check for special encodings and print the canonical alias instead. 
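+ // (SBFM/UBFM print as sxt*/uxt*, lsl/lsr/asr, sbfiz/ubfiz or sbfx/ubfx, + // and BFM prints as bfi or bfxil; see the cases below.)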
- if (S) { - unsigned ShiftAmt = Log2_32(MemSize); - O << " #" << ShiftAmt; - } else if (IsLSL) { - O << " #0"; - } -} + unsigned Opcode = MI->getOpcode(); -void -AArch64InstPrinter::printAddSubImmLSL0Operand(const MCInst *MI, - unsigned OpNum, raw_ostream &O) { - const MCOperand &Imm12Op = MI->getOperand(OpNum); + if (Opcode == AArch64::SYSxt) + if (printSysAlias(MI, O)) { + printAnnotation(O, Annot); + return; + } - if (Imm12Op.isImm()) { - int64_t Imm12 = Imm12Op.getImm(); - assert(Imm12 >= 0 && "Invalid immediate for add/sub imm"); - O << "#" << Imm12; - } else { - assert(Imm12Op.isExpr() && "Unexpected shift operand type"); - O << "#" << *Imm12Op.getExpr(); - } -} + // SBFM/UBFM should print to a nicer aliased form if possible. + if (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri || + Opcode == AArch64::UBFMXri || Opcode == AArch64::UBFMWri) { + const MCOperand &Op0 = MI->getOperand(0); + const MCOperand &Op1 = MI->getOperand(1); + const MCOperand &Op2 = MI->getOperand(2); + const MCOperand &Op3 = MI->getOperand(3); + + bool IsSigned = (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri); + bool Is64Bit = (Opcode == AArch64::SBFMXri || Opcode == AArch64::UBFMXri); + if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) { + const char *AsmMnemonic = nullptr; + + switch (Op3.getImm()) { + default: + break; + case 7: + if (IsSigned) + AsmMnemonic = "sxtb"; + else if (!Is64Bit) + AsmMnemonic = "uxtb"; + break; + case 15: + if (IsSigned) + AsmMnemonic = "sxth"; + else if (!Is64Bit) + AsmMnemonic = "uxth"; + break; + case 31: + // *xtw is only valid for signed 64-bit operations. + if (Is64Bit && IsSigned) + AsmMnemonic = "sxtw"; + break; + } + + if (AsmMnemonic) { + O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) + << ", " << getRegisterName(getWRegFromXReg(Op1.getReg())); + printAnnotation(O, Annot); + return; + } + } -void -AArch64InstPrinter::printAddSubImmLSL12Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { + // All immediate shifts are aliases, implemented using the Bitfield + // instruction. In all cases the immediate shift amount must be in + // the range 0 to (reg.size - 1). + if (Op2.isImm() && Op3.isImm()) { + const char *AsmMnemonic = nullptr; + int shift = 0; + int64_t immr = Op2.getImm(); + int64_t imms = Op3.getImm(); + if (Opcode == AArch64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) { + AsmMnemonic = "lsl"; + shift = 31 - imms; + } else if (Opcode == AArch64::UBFMXri && imms != 0x3f && + ((imms + 1 == immr))) { + AsmMnemonic = "lsl"; + shift = 63 - imms; + } else if (Opcode == AArch64::UBFMWri && imms == 0x1f) { + AsmMnemonic = "lsr"; + shift = immr; + } else if (Opcode == AArch64::UBFMXri && imms == 0x3f) { + AsmMnemonic = "lsr"; + shift = immr; + } else if (Opcode == AArch64::SBFMWri && imms == 0x1f) { + AsmMnemonic = "asr"; + shift = immr; + } else if (Opcode == AArch64::SBFMXri && imms == 0x3f) { + AsmMnemonic = "asr"; + shift = immr; + } + if (AsmMnemonic) { + O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) + << ", " << getRegisterName(Op1.getReg()) << ", #" << shift; + printAnnotation(O, Annot); + return; + } + } - printAddSubImmLSL0Operand(MI, OpNum, O); + // SBFIZ/UBFIZ aliases + if (Op2.getImm() > Op3.getImm()) { + O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t' + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) + << ", #" << (Is64Bit ? 
64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1; + printAnnotation(O, Annot); + return; + } - O << ", lsl #12"; -} + // Otherwise SBFX/UBFX is the preferred form + O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t' + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) + << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1; + printAnnotation(O, Annot); + return; + } -void -AArch64InstPrinter::printBareImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - O << MO.getImm(); -} + if (Opcode == AArch64::BFMXri || Opcode == AArch64::BFMWri) { + const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0 + const MCOperand &Op2 = MI->getOperand(2); + int ImmR = MI->getOperand(3).getImm(); + int ImmS = MI->getOperand(4).getImm(); + + // BFI alias + if (ImmS < ImmR) { + int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32; + int LSB = (BitWidth - ImmR) % BitWidth; + int Width = ImmS + 1; + O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", " + << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width; + printAnnotation(O, Annot); + return; + } -template <unsigned RegWidth> void -AArch64InstPrinter::printBFILSBOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &ImmROp = MI->getOperand(OpNum); - unsigned LSB = ImmROp.getImm() == 0 ? 0 : RegWidth - ImmROp.getImm(); + int LSB = ImmR; + int Width = ImmS - ImmR + 1; + // Otherwise BFXIL is the preferred form + O << "\tbfxil\t" + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg()) + << ", #" << LSB << ", #" << Width; + printAnnotation(O, Annot); + return; + } - O << '#' << LSB; -} + // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift + // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be + // printed. 
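+ // For example, "movz x0, #:gottprel_g1:var" is printed without a trailing + // ", lsl #16".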
+ if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi || + Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) && + MI->getOperand(1).isExpr()) { + if (Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) + O << "\tmovz\t"; + else + O << "\tmovn\t"; -void AArch64InstPrinter::printBFIWidthOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &ImmSOp = MI->getOperand(OpNum); - unsigned Width = ImmSOp.getImm() + 1; + O << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << *MI->getOperand(1).getExpr(); + return; + } - O << '#' << Width; -} + if ((Opcode == AArch64::MOVKXi || Opcode == AArch64::MOVKWi) && + MI->getOperand(2).isExpr()) { + O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << *MI->getOperand(2).getExpr(); + return; + } -void -AArch64InstPrinter::printBFXWidthOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &ImmSOp = MI->getOperand(OpNum); - const MCOperand &ImmROp = MI->getOperand(OpNum - 1); + if (!printAliasInstr(MI, O)) + printInstruction(MI, O); - unsigned ImmR = ImmROp.getImm(); - unsigned ImmS = ImmSOp.getImm(); + printAnnotation(O, Annot); +} - assert(ImmS >= ImmR && "Invalid ImmR, ImmS combination for bitfield extract"); +static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout, + bool &IsTbx) { + switch (Opcode) { + case AArch64::TBXv8i8One: + case AArch64::TBXv8i8Two: + case AArch64::TBXv8i8Three: + case AArch64::TBXv8i8Four: + IsTbx = true; + Layout = ".8b"; + return true; + case AArch64::TBLv8i8One: + case AArch64::TBLv8i8Two: + case AArch64::TBLv8i8Three: + case AArch64::TBLv8i8Four: + IsTbx = false; + Layout = ".8b"; + return true; + case AArch64::TBXv16i8One: + case AArch64::TBXv16i8Two: + case AArch64::TBXv16i8Three: + case AArch64::TBXv16i8Four: + IsTbx = true; + Layout = ".16b"; + return true; + case AArch64::TBLv16i8One: + case AArch64::TBLv16i8Two: + case AArch64::TBLv16i8Three: + case AArch64::TBLv16i8Four: + IsTbx = false; + Layout = ".16b"; + return true; + default: + return false; + } +} - O << '#' << (ImmS - ImmR + 1); +struct LdStNInstrDesc { + unsigned Opcode; + const char *Mnemonic; + const char *Layout; + int ListOperand; + bool HasLane; + int NaturalOffset; +}; + +static LdStNInstrDesc LdStNInstInfo[] = { + { AArch64::LD1i8, "ld1", ".b", 1, true, 0 }, + { AArch64::LD1i16, "ld1", ".h", 1, true, 0 }, + { AArch64::LD1i32, "ld1", ".s", 1, true, 0 }, + { AArch64::LD1i64, "ld1", ".d", 1, true, 0 }, + { AArch64::LD1i8_POST, "ld1", ".b", 2, true, 1 }, + { AArch64::LD1i16_POST, "ld1", ".h", 2, true, 2 }, + { AArch64::LD1i32_POST, "ld1", ".s", 2, true, 4 }, + { AArch64::LD1i64_POST, "ld1", ".d", 2, true, 8 }, + { AArch64::LD1Rv16b, "ld1r", ".16b", 0, false, 0 }, + { AArch64::LD1Rv8h, "ld1r", ".8h", 0, false, 0 }, + { AArch64::LD1Rv4s, "ld1r", ".4s", 0, false, 0 }, + { AArch64::LD1Rv2d, "ld1r", ".2d", 0, false, 0 }, + { AArch64::LD1Rv8b, "ld1r", ".8b", 0, false, 0 }, + { AArch64::LD1Rv4h, "ld1r", ".4h", 0, false, 0 }, + { AArch64::LD1Rv2s, "ld1r", ".2s", 0, false, 0 }, + { AArch64::LD1Rv1d, "ld1r", ".1d", 0, false, 0 }, + { AArch64::LD1Rv16b_POST, "ld1r", ".16b", 1, false, 1 }, + { AArch64::LD1Rv8h_POST, "ld1r", ".8h", 1, false, 2 }, + { AArch64::LD1Rv4s_POST, "ld1r", ".4s", 1, false, 4 }, + { AArch64::LD1Rv2d_POST, "ld1r", ".2d", 1, false, 8 }, + { AArch64::LD1Rv8b_POST, "ld1r", ".8b", 1, false, 1 }, + { AArch64::LD1Rv4h_POST, "ld1r", ".4h", 1, false, 2 }, + { AArch64::LD1Rv2s_POST, "ld1r", ".2s", 1, false, 4 }, + { AArch64::LD1Rv1d_POST, "ld1r", 
".1d", 1, false, 8 }, + { AArch64::LD1Onev16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Onev8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Onev4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Onev2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Onev8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Onev4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Onev2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Onev1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Onev16b_POST, "ld1", ".16b", 1, false, 16 }, + { AArch64::LD1Onev8h_POST, "ld1", ".8h", 1, false, 16 }, + { AArch64::LD1Onev4s_POST, "ld1", ".4s", 1, false, 16 }, + { AArch64::LD1Onev2d_POST, "ld1", ".2d", 1, false, 16 }, + { AArch64::LD1Onev8b_POST, "ld1", ".8b", 1, false, 8 }, + { AArch64::LD1Onev4h_POST, "ld1", ".4h", 1, false, 8 }, + { AArch64::LD1Onev2s_POST, "ld1", ".2s", 1, false, 8 }, + { AArch64::LD1Onev1d_POST, "ld1", ".1d", 1, false, 8 }, + { AArch64::LD1Twov16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Twov8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Twov4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Twov2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Twov8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Twov4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Twov2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Twov1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Twov16b_POST, "ld1", ".16b", 1, false, 32 }, + { AArch64::LD1Twov8h_POST, "ld1", ".8h", 1, false, 32 }, + { AArch64::LD1Twov4s_POST, "ld1", ".4s", 1, false, 32 }, + { AArch64::LD1Twov2d_POST, "ld1", ".2d", 1, false, 32 }, + { AArch64::LD1Twov8b_POST, "ld1", ".8b", 1, false, 16 }, + { AArch64::LD1Twov4h_POST, "ld1", ".4h", 1, false, 16 }, + { AArch64::LD1Twov2s_POST, "ld1", ".2s", 1, false, 16 }, + { AArch64::LD1Twov1d_POST, "ld1", ".1d", 1, false, 16 }, + { AArch64::LD1Threev16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Threev8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Threev4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Threev2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Threev8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Threev4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Threev2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Threev1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Threev16b_POST, "ld1", ".16b", 1, false, 48 }, + { AArch64::LD1Threev8h_POST, "ld1", ".8h", 1, false, 48 }, + { AArch64::LD1Threev4s_POST, "ld1", ".4s", 1, false, 48 }, + { AArch64::LD1Threev2d_POST, "ld1", ".2d", 1, false, 48 }, + { AArch64::LD1Threev8b_POST, "ld1", ".8b", 1, false, 24 }, + { AArch64::LD1Threev4h_POST, "ld1", ".4h", 1, false, 24 }, + { AArch64::LD1Threev2s_POST, "ld1", ".2s", 1, false, 24 }, + { AArch64::LD1Threev1d_POST, "ld1", ".1d", 1, false, 24 }, + { AArch64::LD1Fourv16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Fourv8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Fourv4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Fourv2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Fourv8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Fourv4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Fourv2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Fourv1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Fourv16b_POST, "ld1", ".16b", 1, false, 64 }, + { AArch64::LD1Fourv8h_POST, "ld1", ".8h", 1, false, 64 }, + { AArch64::LD1Fourv4s_POST, "ld1", ".4s", 1, false, 64 }, + { AArch64::LD1Fourv2d_POST, "ld1", ".2d", 1, false, 64 }, + { AArch64::LD1Fourv8b_POST, "ld1", ".8b", 1, false, 32 }, + { AArch64::LD1Fourv4h_POST, "ld1", ".4h", 1, false, 32 }, + { 
AArch64::LD1Fourv2s_POST, "ld1", ".2s", 1, false, 32 }, + { AArch64::LD1Fourv1d_POST, "ld1", ".1d", 1, false, 32 }, + { AArch64::LD2i8, "ld2", ".b", 1, true, 0 }, + { AArch64::LD2i16, "ld2", ".h", 1, true, 0 }, + { AArch64::LD2i32, "ld2", ".s", 1, true, 0 }, + { AArch64::LD2i64, "ld2", ".d", 1, true, 0 }, + { AArch64::LD2i8_POST, "ld2", ".b", 2, true, 2 }, + { AArch64::LD2i16_POST, "ld2", ".h", 2, true, 4 }, + { AArch64::LD2i32_POST, "ld2", ".s", 2, true, 8 }, + { AArch64::LD2i64_POST, "ld2", ".d", 2, true, 16 }, + { AArch64::LD2Rv16b, "ld2r", ".16b", 0, false, 0 }, + { AArch64::LD2Rv8h, "ld2r", ".8h", 0, false, 0 }, + { AArch64::LD2Rv4s, "ld2r", ".4s", 0, false, 0 }, + { AArch64::LD2Rv2d, "ld2r", ".2d", 0, false, 0 }, + { AArch64::LD2Rv8b, "ld2r", ".8b", 0, false, 0 }, + { AArch64::LD2Rv4h, "ld2r", ".4h", 0, false, 0 }, + { AArch64::LD2Rv2s, "ld2r", ".2s", 0, false, 0 }, + { AArch64::LD2Rv1d, "ld2r", ".1d", 0, false, 0 }, + { AArch64::LD2Rv16b_POST, "ld2r", ".16b", 1, false, 2 }, + { AArch64::LD2Rv8h_POST, "ld2r", ".8h", 1, false, 4 }, + { AArch64::LD2Rv4s_POST, "ld2r", ".4s", 1, false, 8 }, + { AArch64::LD2Rv2d_POST, "ld2r", ".2d", 1, false, 16 }, + { AArch64::LD2Rv8b_POST, "ld2r", ".8b", 1, false, 2 }, + { AArch64::LD2Rv4h_POST, "ld2r", ".4h", 1, false, 4 }, + { AArch64::LD2Rv2s_POST, "ld2r", ".2s", 1, false, 8 }, + { AArch64::LD2Rv1d_POST, "ld2r", ".1d", 1, false, 16 }, + { AArch64::LD2Twov16b, "ld2", ".16b", 0, false, 0 }, + { AArch64::LD2Twov8h, "ld2", ".8h", 0, false, 0 }, + { AArch64::LD2Twov4s, "ld2", ".4s", 0, false, 0 }, + { AArch64::LD2Twov2d, "ld2", ".2d", 0, false, 0 }, + { AArch64::LD2Twov8b, "ld2", ".8b", 0, false, 0 }, + { AArch64::LD2Twov4h, "ld2", ".4h", 0, false, 0 }, + { AArch64::LD2Twov2s, "ld2", ".2s", 0, false, 0 }, + { AArch64::LD2Twov16b_POST, "ld2", ".16b", 1, false, 32 }, + { AArch64::LD2Twov8h_POST, "ld2", ".8h", 1, false, 32 }, + { AArch64::LD2Twov4s_POST, "ld2", ".4s", 1, false, 32 }, + { AArch64::LD2Twov2d_POST, "ld2", ".2d", 1, false, 32 }, + { AArch64::LD2Twov8b_POST, "ld2", ".8b", 1, false, 16 }, + { AArch64::LD2Twov4h_POST, "ld2", ".4h", 1, false, 16 }, + { AArch64::LD2Twov2s_POST, "ld2", ".2s", 1, false, 16 }, + { AArch64::LD3i8, "ld3", ".b", 1, true, 0 }, + { AArch64::LD3i16, "ld3", ".h", 1, true, 0 }, + { AArch64::LD3i32, "ld3", ".s", 1, true, 0 }, + { AArch64::LD3i64, "ld3", ".d", 1, true, 0 }, + { AArch64::LD3i8_POST, "ld3", ".b", 2, true, 3 }, + { AArch64::LD3i16_POST, "ld3", ".h", 2, true, 6 }, + { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 }, + { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 }, + { AArch64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 }, + { AArch64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 }, + { AArch64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 }, + { AArch64::LD3Rv2d, "ld3r", ".2d", 0, false, 0 }, + { AArch64::LD3Rv8b, "ld3r", ".8b", 0, false, 0 }, + { AArch64::LD3Rv4h, "ld3r", ".4h", 0, false, 0 }, + { AArch64::LD3Rv2s, "ld3r", ".2s", 0, false, 0 }, + { AArch64::LD3Rv1d, "ld3r", ".1d", 0, false, 0 }, + { AArch64::LD3Rv16b_POST, "ld3r", ".16b", 1, false, 3 }, + { AArch64::LD3Rv8h_POST, "ld3r", ".8h", 1, false, 6 }, + { AArch64::LD3Rv4s_POST, "ld3r", ".4s", 1, false, 12 }, + { AArch64::LD3Rv2d_POST, "ld3r", ".2d", 1, false, 24 }, + { AArch64::LD3Rv8b_POST, "ld3r", ".8b", 1, false, 3 }, + { AArch64::LD3Rv4h_POST, "ld3r", ".4h", 1, false, 6 }, + { AArch64::LD3Rv2s_POST, "ld3r", ".2s", 1, false, 12 }, + { AArch64::LD3Rv1d_POST, "ld3r", ".1d", 1, false, 24 }, + { AArch64::LD3Threev16b, "ld3", ".16b", 0, false, 0 }, + { AArch64::LD3Threev8h, 
"ld3", ".8h", 0, false, 0 }, + { AArch64::LD3Threev4s, "ld3", ".4s", 0, false, 0 }, + { AArch64::LD3Threev2d, "ld3", ".2d", 0, false, 0 }, + { AArch64::LD3Threev8b, "ld3", ".8b", 0, false, 0 }, + { AArch64::LD3Threev4h, "ld3", ".4h", 0, false, 0 }, + { AArch64::LD3Threev2s, "ld3", ".2s", 0, false, 0 }, + { AArch64::LD3Threev16b_POST, "ld3", ".16b", 1, false, 48 }, + { AArch64::LD3Threev8h_POST, "ld3", ".8h", 1, false, 48 }, + { AArch64::LD3Threev4s_POST, "ld3", ".4s", 1, false, 48 }, + { AArch64::LD3Threev2d_POST, "ld3", ".2d", 1, false, 48 }, + { AArch64::LD3Threev8b_POST, "ld3", ".8b", 1, false, 24 }, + { AArch64::LD3Threev4h_POST, "ld3", ".4h", 1, false, 24 }, + { AArch64::LD3Threev2s_POST, "ld3", ".2s", 1, false, 24 }, + { AArch64::LD4i8, "ld4", ".b", 1, true, 0 }, + { AArch64::LD4i16, "ld4", ".h", 1, true, 0 }, + { AArch64::LD4i32, "ld4", ".s", 1, true, 0 }, + { AArch64::LD4i64, "ld4", ".d", 1, true, 0 }, + { AArch64::LD4i8_POST, "ld4", ".b", 2, true, 4 }, + { AArch64::LD4i16_POST, "ld4", ".h", 2, true, 8 }, + { AArch64::LD4i32_POST, "ld4", ".s", 2, true, 16 }, + { AArch64::LD4i64_POST, "ld4", ".d", 2, true, 32 }, + { AArch64::LD4Rv16b, "ld4r", ".16b", 0, false, 0 }, + { AArch64::LD4Rv8h, "ld4r", ".8h", 0, false, 0 }, + { AArch64::LD4Rv4s, "ld4r", ".4s", 0, false, 0 }, + { AArch64::LD4Rv2d, "ld4r", ".2d", 0, false, 0 }, + { AArch64::LD4Rv8b, "ld4r", ".8b", 0, false, 0 }, + { AArch64::LD4Rv4h, "ld4r", ".4h", 0, false, 0 }, + { AArch64::LD4Rv2s, "ld4r", ".2s", 0, false, 0 }, + { AArch64::LD4Rv1d, "ld4r", ".1d", 0, false, 0 }, + { AArch64::LD4Rv16b_POST, "ld4r", ".16b", 1, false, 4 }, + { AArch64::LD4Rv8h_POST, "ld4r", ".8h", 1, false, 8 }, + { AArch64::LD4Rv4s_POST, "ld4r", ".4s", 1, false, 16 }, + { AArch64::LD4Rv2d_POST, "ld4r", ".2d", 1, false, 32 }, + { AArch64::LD4Rv8b_POST, "ld4r", ".8b", 1, false, 4 }, + { AArch64::LD4Rv4h_POST, "ld4r", ".4h", 1, false, 8 }, + { AArch64::LD4Rv2s_POST, "ld4r", ".2s", 1, false, 16 }, + { AArch64::LD4Rv1d_POST, "ld4r", ".1d", 1, false, 32 }, + { AArch64::LD4Fourv16b, "ld4", ".16b", 0, false, 0 }, + { AArch64::LD4Fourv8h, "ld4", ".8h", 0, false, 0 }, + { AArch64::LD4Fourv4s, "ld4", ".4s", 0, false, 0 }, + { AArch64::LD4Fourv2d, "ld4", ".2d", 0, false, 0 }, + { AArch64::LD4Fourv8b, "ld4", ".8b", 0, false, 0 }, + { AArch64::LD4Fourv4h, "ld4", ".4h", 0, false, 0 }, + { AArch64::LD4Fourv2s, "ld4", ".2s", 0, false, 0 }, + { AArch64::LD4Fourv16b_POST, "ld4", ".16b", 1, false, 64 }, + { AArch64::LD4Fourv8h_POST, "ld4", ".8h", 1, false, 64 }, + { AArch64::LD4Fourv4s_POST, "ld4", ".4s", 1, false, 64 }, + { AArch64::LD4Fourv2d_POST, "ld4", ".2d", 1, false, 64 }, + { AArch64::LD4Fourv8b_POST, "ld4", ".8b", 1, false, 32 }, + { AArch64::LD4Fourv4h_POST, "ld4", ".4h", 1, false, 32 }, + { AArch64::LD4Fourv2s_POST, "ld4", ".2s", 1, false, 32 }, + { AArch64::ST1i8, "st1", ".b", 0, true, 0 }, + { AArch64::ST1i16, "st1", ".h", 0, true, 0 }, + { AArch64::ST1i32, "st1", ".s", 0, true, 0 }, + { AArch64::ST1i64, "st1", ".d", 0, true, 0 }, + { AArch64::ST1i8_POST, "st1", ".b", 1, true, 1 }, + { AArch64::ST1i16_POST, "st1", ".h", 1, true, 2 }, + { AArch64::ST1i32_POST, "st1", ".s", 1, true, 4 }, + { AArch64::ST1i64_POST, "st1", ".d", 1, true, 8 }, + { AArch64::ST1Onev16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Onev8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Onev4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Onev2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Onev8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Onev4h, "st1", ".4h", 0, false, 0 }, + { 
AArch64::ST1Onev2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Onev1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Onev16b_POST, "st1", ".16b", 1, false, 16 }, + { AArch64::ST1Onev8h_POST, "st1", ".8h", 1, false, 16 }, + { AArch64::ST1Onev4s_POST, "st1", ".4s", 1, false, 16 }, + { AArch64::ST1Onev2d_POST, "st1", ".2d", 1, false, 16 }, + { AArch64::ST1Onev8b_POST, "st1", ".8b", 1, false, 8 }, + { AArch64::ST1Onev4h_POST, "st1", ".4h", 1, false, 8 }, + { AArch64::ST1Onev2s_POST, "st1", ".2s", 1, false, 8 }, + { AArch64::ST1Onev1d_POST, "st1", ".1d", 1, false, 8 }, + { AArch64::ST1Twov16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Twov8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Twov4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Twov2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Twov8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Twov4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Twov2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Twov1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Twov16b_POST, "st1", ".16b", 1, false, 32 }, + { AArch64::ST1Twov8h_POST, "st1", ".8h", 1, false, 32 }, + { AArch64::ST1Twov4s_POST, "st1", ".4s", 1, false, 32 }, + { AArch64::ST1Twov2d_POST, "st1", ".2d", 1, false, 32 }, + { AArch64::ST1Twov8b_POST, "st1", ".8b", 1, false, 16 }, + { AArch64::ST1Twov4h_POST, "st1", ".4h", 1, false, 16 }, + { AArch64::ST1Twov2s_POST, "st1", ".2s", 1, false, 16 }, + { AArch64::ST1Twov1d_POST, "st1", ".1d", 1, false, 16 }, + { AArch64::ST1Threev16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Threev8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Threev4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Threev2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Threev8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Threev4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Threev2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Threev1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Threev16b_POST, "st1", ".16b", 1, false, 48 }, + { AArch64::ST1Threev8h_POST, "st1", ".8h", 1, false, 48 }, + { AArch64::ST1Threev4s_POST, "st1", ".4s", 1, false, 48 }, + { AArch64::ST1Threev2d_POST, "st1", ".2d", 1, false, 48 }, + { AArch64::ST1Threev8b_POST, "st1", ".8b", 1, false, 24 }, + { AArch64::ST1Threev4h_POST, "st1", ".4h", 1, false, 24 }, + { AArch64::ST1Threev2s_POST, "st1", ".2s", 1, false, 24 }, + { AArch64::ST1Threev1d_POST, "st1", ".1d", 1, false, 24 }, + { AArch64::ST1Fourv16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Fourv8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Fourv4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Fourv2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Fourv8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Fourv4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Fourv2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Fourv1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Fourv16b_POST, "st1", ".16b", 1, false, 64 }, + { AArch64::ST1Fourv8h_POST, "st1", ".8h", 1, false, 64 }, + { AArch64::ST1Fourv4s_POST, "st1", ".4s", 1, false, 64 }, + { AArch64::ST1Fourv2d_POST, "st1", ".2d", 1, false, 64 }, + { AArch64::ST1Fourv8b_POST, "st1", ".8b", 1, false, 32 }, + { AArch64::ST1Fourv4h_POST, "st1", ".4h", 1, false, 32 }, + { AArch64::ST1Fourv2s_POST, "st1", ".2s", 1, false, 32 }, + { AArch64::ST1Fourv1d_POST, "st1", ".1d", 1, false, 32 }, + { AArch64::ST2i8, "st2", ".b", 0, true, 0 }, + { AArch64::ST2i16, "st2", ".h", 0, true, 0 }, + { AArch64::ST2i32, "st2", ".s", 0, true, 0 }, + { AArch64::ST2i64, "st2", ".d", 0, true, 0 }, + { AArch64::ST2i8_POST, "st2", ".b", 1, 
true, 2 }, + { AArch64::ST2i16_POST, "st2", ".h", 1, true, 4 }, + { AArch64::ST2i32_POST, "st2", ".s", 1, true, 8 }, + { AArch64::ST2i64_POST, "st2", ".d", 1, true, 16 }, + { AArch64::ST2Twov16b, "st2", ".16b", 0, false, 0 }, + { AArch64::ST2Twov8h, "st2", ".8h", 0, false, 0 }, + { AArch64::ST2Twov4s, "st2", ".4s", 0, false, 0 }, + { AArch64::ST2Twov2d, "st2", ".2d", 0, false, 0 }, + { AArch64::ST2Twov8b, "st2", ".8b", 0, false, 0 }, + { AArch64::ST2Twov4h, "st2", ".4h", 0, false, 0 }, + { AArch64::ST2Twov2s, "st2", ".2s", 0, false, 0 }, + { AArch64::ST2Twov16b_POST, "st2", ".16b", 1, false, 32 }, + { AArch64::ST2Twov8h_POST, "st2", ".8h", 1, false, 32 }, + { AArch64::ST2Twov4s_POST, "st2", ".4s", 1, false, 32 }, + { AArch64::ST2Twov2d_POST, "st2", ".2d", 1, false, 32 }, + { AArch64::ST2Twov8b_POST, "st2", ".8b", 1, false, 16 }, + { AArch64::ST2Twov4h_POST, "st2", ".4h", 1, false, 16 }, + { AArch64::ST2Twov2s_POST, "st2", ".2s", 1, false, 16 }, + { AArch64::ST3i8, "st3", ".b", 0, true, 0 }, + { AArch64::ST3i16, "st3", ".h", 0, true, 0 }, + { AArch64::ST3i32, "st3", ".s", 0, true, 0 }, + { AArch64::ST3i64, "st3", ".d", 0, true, 0 }, + { AArch64::ST3i8_POST, "st3", ".b", 1, true, 3 }, + { AArch64::ST3i16_POST, "st3", ".h", 1, true, 6 }, + { AArch64::ST3i32_POST, "st3", ".s", 1, true, 12 }, + { AArch64::ST3i64_POST, "st3", ".d", 1, true, 24 }, + { AArch64::ST3Threev16b, "st3", ".16b", 0, false, 0 }, + { AArch64::ST3Threev8h, "st3", ".8h", 0, false, 0 }, + { AArch64::ST3Threev4s, "st3", ".4s", 0, false, 0 }, + { AArch64::ST3Threev2d, "st3", ".2d", 0, false, 0 }, + { AArch64::ST3Threev8b, "st3", ".8b", 0, false, 0 }, + { AArch64::ST3Threev4h, "st3", ".4h", 0, false, 0 }, + { AArch64::ST3Threev2s, "st3", ".2s", 0, false, 0 }, + { AArch64::ST3Threev16b_POST, "st3", ".16b", 1, false, 48 }, + { AArch64::ST3Threev8h_POST, "st3", ".8h", 1, false, 48 }, + { AArch64::ST3Threev4s_POST, "st3", ".4s", 1, false, 48 }, + { AArch64::ST3Threev2d_POST, "st3", ".2d", 1, false, 48 }, + { AArch64::ST3Threev8b_POST, "st3", ".8b", 1, false, 24 }, + { AArch64::ST3Threev4h_POST, "st3", ".4h", 1, false, 24 }, + { AArch64::ST3Threev2s_POST, "st3", ".2s", 1, false, 24 }, + { AArch64::ST4i8, "st4", ".b", 0, true, 0 }, + { AArch64::ST4i16, "st4", ".h", 0, true, 0 }, + { AArch64::ST4i32, "st4", ".s", 0, true, 0 }, + { AArch64::ST4i64, "st4", ".d", 0, true, 0 }, + { AArch64::ST4i8_POST, "st4", ".b", 1, true, 4 }, + { AArch64::ST4i16_POST, "st4", ".h", 1, true, 8 }, + { AArch64::ST4i32_POST, "st4", ".s", 1, true, 16 }, + { AArch64::ST4i64_POST, "st4", ".d", 1, true, 32 }, + { AArch64::ST4Fourv16b, "st4", ".16b", 0, false, 0 }, + { AArch64::ST4Fourv8h, "st4", ".8h", 0, false, 0 }, + { AArch64::ST4Fourv4s, "st4", ".4s", 0, false, 0 }, + { AArch64::ST4Fourv2d, "st4", ".2d", 0, false, 0 }, + { AArch64::ST4Fourv8b, "st4", ".8b", 0, false, 0 }, + { AArch64::ST4Fourv4h, "st4", ".4h", 0, false, 0 }, + { AArch64::ST4Fourv2s, "st4", ".2s", 0, false, 0 }, + { AArch64::ST4Fourv16b_POST, "st4", ".16b", 1, false, 64 }, + { AArch64::ST4Fourv8h_POST, "st4", ".8h", 1, false, 64 }, + { AArch64::ST4Fourv4s_POST, "st4", ".4s", 1, false, 64 }, + { AArch64::ST4Fourv2d_POST, "st4", ".2d", 1, false, 64 }, + { AArch64::ST4Fourv8b_POST, "st4", ".8b", 1, false, 32 }, + { AArch64::ST4Fourv4h_POST, "st4", ".4h", 1, false, 32 }, + { AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 }, +}; + +static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { + unsigned Idx; + for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) + if 
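/* getLdStNInstrDesc is a linear scan of this table, keyed on opcode. A minimal
   usage sketch (hypothetical caller):
     if (LdStNInstrDesc *D = getLdStNInstrDesc(AArch64::LD1Twov16b_POST)) {
       // D->Mnemonic == "ld1", D->NaturalOffset == 32
     }
   A scan over a few hundred entries per printed instruction is cheap enough
   for an assembly printer. */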
(LdStNInstInfo[Idx].Opcode == Opcode) + return &LdStNInstInfo[Idx]; + + return nullptr; } -void -AArch64InstPrinter::printCRxOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &CRx = MI->getOperand(OpNum); +void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot) { + unsigned Opcode = MI->getOpcode(); + StringRef Layout, Mnemonic; - O << 'c' << CRx.getImm(); -} + bool IsTbx; + if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) { + O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t' + << getRegisterName(MI->getOperand(0).getReg(), AArch64::vreg) << ", "; + unsigned ListOpNum = IsTbx ? 2 : 1; + printVectorList(MI, ListOpNum, O, ""); -void -AArch64InstPrinter::printCVTFixedPosOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &ScaleOp = MI->getOperand(OpNum); + O << ", " + << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), AArch64::vreg); + printAnnotation(O, Annot); + return; + } - O << '#' << (64 - ScaleOp.getImm()); -} + if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { + O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; + + // Now onto the operands: first a vector list with possible lane + // specifier. E.g. { v0 }[2] + int OpNum = LdStDesc->ListOperand; + printVectorList(MI, OpNum++, O, ""); + + if (LdStDesc->HasLane) + O << '[' << MI->getOperand(OpNum++).getImm() << ']'; + + // Next the address: [xN] + unsigned AddrReg = MI->getOperand(OpNum++).getReg(); + O << ", [" << getRegisterName(AddrReg) << ']'; + + // Finally, there might be a post-indexed offset. + if (LdStDesc->NaturalOffset != 0) { + unsigned Reg = MI->getOperand(OpNum++).getReg(); + if (Reg != AArch64::XZR) + O << ", " << getRegisterName(Reg); + else { + assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?"); + O << ", #" << LdStDesc->NaturalOffset; + } + } + printAnnotation(O, Annot); + return; + } -void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &o) { - const MCOperand &MOImm8 = MI->getOperand(OpNum); + AArch64InstPrinter::printInst(MI, O, Annot); +} - assert(MOImm8.isImm() - && "Immediate operand required for floating-point immediate inst"); +bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { +#ifndef NDEBUG + unsigned Opcode = MI->getOpcode(); + assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!"); +#endif + + const char *Asm = nullptr; + const MCOperand &Op1 = MI->getOperand(0); + const MCOperand &Cn = MI->getOperand(1); + const MCOperand &Cm = MI->getOperand(2); + const MCOperand &Op2 = MI->getOperand(3); + + unsigned Op1Val = Op1.getImm(); + unsigned CnVal = Cn.getImm(); + unsigned CmVal = Cm.getImm(); + unsigned Op2Val = Op2.getImm(); + + if (CnVal == 7) { + switch (CmVal) { + default: + break; + + // IC aliases + case 1: + if (Op1Val == 0 && Op2Val == 0) + Asm = "ic\tialluis"; + break; + case 5: + if (Op1Val == 0 && Op2Val == 0) + Asm = "ic\tiallu"; + else if (Op1Val == 3 && Op2Val == 1) + Asm = "ic\tivau"; + break; + + // DC aliases + case 4: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tzva"; + break; + case 6: + if (Op1Val == 0 && Op2Val == 1) + Asm = "dc\tivac"; + if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tisw"; + break; + case 10: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tcvac"; + else if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tcsw"; + break; + case 11: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tcvau"; + break; + case 14: + if (Op1Val == 3 && Op2Val == 1) + Asm = 
"dc\tcivac"; + else if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tcisw"; + break; + + // AT aliases + case 8: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e1r"; break; + case 1: Asm = "at\ts1e1w"; break; + case 2: Asm = "at\ts1e0r"; break; + case 3: Asm = "at\ts1e0w"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e2r"; break; + case 1: Asm = "at\ts1e2w"; break; + case 4: Asm = "at\ts12e1r"; break; + case 5: Asm = "at\ts12e1w"; break; + case 6: Asm = "at\ts12e0r"; break; + case 7: Asm = "at\ts12e0w"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e3r"; break; + case 1: Asm = "at\ts1e3w"; break; + } + break; + } + break; + } + } else if (CnVal == 8) { + // TLBI aliases + switch (CmVal) { + default: + break; + case 3: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\tvmalle1is"; break; + case 1: Asm = "tlbi\tvae1is"; break; + case 2: Asm = "tlbi\taside1is"; break; + case 3: Asm = "tlbi\tvaae1is"; break; + case 5: Asm = "tlbi\tvale1is"; break; + case 7: Asm = "tlbi\tvaale1is"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle2is"; break; + case 1: Asm = "tlbi\tvae2is"; break; + case 4: Asm = "tlbi\talle1is"; break; + case 5: Asm = "tlbi\tvale2is"; break; + case 6: Asm = "tlbi\tvmalls12e1is"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle3is"; break; + case 1: Asm = "tlbi\tvae3is"; break; + case 5: Asm = "tlbi\tvale3is"; break; + } + break; + } + break; + case 0: + switch (Op1Val) { + default: + break; + case 4: + switch (Op2Val) { + default: + break; + case 1: Asm = "tlbi\tipas2e1is"; break; + case 5: Asm = "tlbi\tipas2le1is"; break; + } + break; + } + break; + case 4: + switch (Op1Val) { + default: + break; + case 4: + switch (Op2Val) { + default: + break; + case 1: Asm = "tlbi\tipas2e1"; break; + case 5: Asm = "tlbi\tipas2le1"; break; + } + break; + } + break; + case 7: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\tvmalle1"; break; + case 1: Asm = "tlbi\tvae1"; break; + case 2: Asm = "tlbi\taside1"; break; + case 3: Asm = "tlbi\tvaae1"; break; + case 5: Asm = "tlbi\tvale1"; break; + case 7: Asm = "tlbi\tvaale1"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle2"; break; + case 1: Asm = "tlbi\tvae2"; break; + case 4: Asm = "tlbi\talle1"; break; + case 5: Asm = "tlbi\tvale2"; break; + case 6: Asm = "tlbi\tvmalls12e1"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle3"; break; + case 1: Asm = "tlbi\tvae3"; break; + case 5: Asm = "tlbi\tvale3"; break; + } + break; + } + break; + } + } + + if (Asm) { + unsigned Reg = MI->getOperand(4).getReg(); - uint32_t Imm8 = MOImm8.getImm(); - uint32_t Fraction = Imm8 & 0xf; - uint32_t Exponent = (Imm8 >> 4) & 0x7; - uint32_t Negative = (Imm8 >> 7) & 0x1; + O << '\t' << Asm; + if (StringRef(Asm).lower().find("all") == StringRef::npos) + O << ", " << getRegisterName(Reg); + } - float Val = 1.0f + Fraction / 16.0f; + return Asm != nullptr; +} - // That is: - // 000 -> 2^1, 001 -> 2^2, 010 -> 2^3, 011 -> 2^4, - // 100 -> 2^-3, 101 -> 2^-2, 110 -> 2^-1, 111 -> 2^0 - if (Exponent & 0x4) { - Val /= 1 << (7 - Exponent); +void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned 
OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + O << getRegisterName(Reg); + } else if (Op.isImm()) { + O << '#' << Op.getImm(); } else { - Val *= 1 << (Exponent + 1); + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << *Op.getExpr(); } +} - Val = Negative ? -Val : Val; - - o << '#' << format("%.8f", Val); +void AArch64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + O << format("#%#llx", Op.getImm()); } -void AArch64InstPrinter::printFPZeroOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &o) { - o << "#0.0"; +void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, + unsigned Imm, raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + if (Reg == AArch64::XZR) + O << "#" << Imm; + else + O << getRegisterName(Reg); + } else + assert(0 && "unknown operand kind in printPostIncOperand64"); } -void -AArch64InstPrinter::printCondCodeOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); +void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isReg() && "Non-register vreg operand!"); + unsigned Reg = Op.getReg(); + O << getRegisterName(Reg, AArch64::vreg); +} - O << A64CondCodeToString(static_cast(MO.getImm())); +void AArch64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm() && "System instruction C[nm] operands must be immediates!"); + O << "c" << Op.getImm(); } -template void -AArch64InstPrinter::printLabelOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { +void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { const MCOperand &MO = MI->getOperand(OpNum); - - if (!MO.isImm()) { - printOperand(MI, OpNum, O); - return; + if (MO.isImm()) { + unsigned Val = (MO.getImm() & 0xfff); + assert(Val == MO.getImm() && "Add/sub immediate out of range!"); + unsigned Shift = + AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm()); + O << '#' << Val; + if (Shift != 0) + printShifter(MI, OpNum + 1, O); + + if (CommentStream) + *CommentStream << '=' << (Val << Shift) << '\n'; + } else { + assert(MO.isExpr() && "Unexpected operand type!"); + O << *MO.getExpr(); + printShifter(MI, OpNum + 1, O); } +} - // The immediate of LDR (lit) instructions is a signed 19-bit immediate, which - // is multiplied by 4 (because all A64 instructions are 32-bits wide). 
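/* Worked example for the deleted label math below (field_width = 19,
   scale = 4): UImm = 0x7ffff has the sign bit (1 << 18) set, so
   SImm = 4 * ((0x7ffff & ~0x40000) - 0x40000) = 4 * (0x3ffff - 0x40000) = -4,
   printed as "#-4". The new printAlignedLabel keeps only the << 2 scaling and
   seems to leave sign handling to whoever resolved the immediate. */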
- uint64_t UImm = MO.getImm(); - uint64_t Sign = UImm & (1LL << (field_width - 1)); - int64_t SImm = scale * ((UImm & ~Sign) - Sign); - - O << "#" << SImm; +void AArch64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + uint64_t Val = MI->getOperand(OpNum).getImm(); + O << "#0x"; + O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 32)); } -template void -AArch64InstPrinter::printLogicalImmOperand(const MCInst *MI, unsigned OpNum, +void AArch64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - uint64_t Val; - A64Imms::isLogicalImmBits(RegWidth, MO.getImm(), Val); + uint64_t Val = MI->getOperand(OpNum).getImm(); O << "#0x"; - O.write_hex(Val); + O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 64)); } -void -AArch64InstPrinter::printOffsetUImm12Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, int MemSize) { - const MCOperand &MOImm = MI->getOperand(OpNum); +void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNum).getImm(); + // LSL #0 should not be printed. + if (AArch64_AM::getShiftType(Val) == AArch64_AM::LSL && + AArch64_AM::getShiftValue(Val) == 0) + return; + O << ", " << AArch64_AM::getShiftExtendName(AArch64_AM::getShiftType(Val)) + << " #" << AArch64_AM::getShiftValue(Val); +} - if (MOImm.isImm()) { - uint32_t Imm = MOImm.getImm() * MemSize; +void AArch64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << getRegisterName(MI->getOperand(OpNum).getReg()); + printShifter(MI, OpNum + 1, O); +} - O << "#" << Imm; - } else { - O << "#" << *MOImm.getExpr(); +void AArch64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << getRegisterName(MI->getOperand(OpNum).getReg()); + printArithExtend(MI, OpNum + 1, O); +} + +void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNum).getImm(); + AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getArithExtendType(Val); + unsigned ShiftVal = AArch64_AM::getArithShiftValue(Val); + + // If the destination or first source register operand is [W]SP, print + // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at + // all. + if (ExtType == AArch64_AM::UXTW || ExtType == AArch64_AM::UXTX) { + unsigned Dest = MI->getOperand(0).getReg(); + unsigned Src1 = MI->getOperand(1).getReg(); + if ( ((Dest == AArch64::SP || Src1 == AArch64::SP) && + ExtType == AArch64_AM::UXTX) || + ((Dest == AArch64::WSP || Src1 == AArch64::WSP) && + ExtType == AArch64_AM::UXTW) ) { + if (ShiftVal != 0) + O << ", lsl #" << ShiftVal; + return; + } } + O << ", " << AArch64_AM::getShiftExtendName(ExtType); + if (ShiftVal != 0) + O << " #" << ShiftVal; } -void -AArch64InstPrinter::printShiftOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, - A64SE::ShiftExtSpecifiers Shift) { - const MCOperand &MO = MI->getOperand(OpNum); +void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum, + raw_ostream &O, char SrcRegKind, + unsigned Width) { + unsigned SignExtend = MI->getOperand(OpNum).getImm(); + unsigned DoShift = MI->getOperand(OpNum + 1).getImm(); - // LSL #0 is not printed - if (Shift == A64SE::LSL && MO.isImm() && MO.getImm() == 0) - return; + // sxtw, sxtx, uxtw or lsl (== uxtx) + bool IsLSL = !SignExtend && SrcRegKind == 'x'; + if (IsLSL) + O << "lsl"; + else + O << (SignExtend ? 
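/* printMemExtend output, by case: an 'x' index register with no sign
   extension prints "lsl"; otherwise sxt/uxt plus the register kind ('w' or
   'x'). The optional amount is log2 of the access size in bytes, so a 64-bit
   access with DoShift set ends with " #3" (e.g. "uxtw #3"). */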
's' : 'u') << "xt" << SrcRegKind; - switch (Shift) { - case A64SE::LSL: O << "lsl"; break; - case A64SE::LSR: O << "lsr"; break; - case A64SE::ASR: O << "asr"; break; - case A64SE::ROR: O << "ror"; break; - default: llvm_unreachable("Invalid shift specifier in logical instruction"); - } + if (DoShift || IsLSL) + O << " #" << Log2_32(Width / 8); +} - O << " #" << MO.getImm(); +void AArch64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm(); + O << AArch64CC::getCondCodeName(CC); } -void -AArch64InstPrinter::printMoveWideImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &UImm16MO = MI->getOperand(OpNum); - const MCOperand &ShiftMO = MI->getOperand(OpNum + 1); +void AArch64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm(); + O << AArch64CC::getCondCodeName(AArch64CC::getInvertedCondCode(CC)); +} - if (UImm16MO.isImm()) { - O << '#' << UImm16MO.getImm(); +void AArch64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']'; +} - if (ShiftMO.getImm() != 0) - O << ", lsl #" << (ShiftMO.getImm() * 16); +template +void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << '#' << Scale * MI->getOperand(OpNum).getImm(); +} - return; +void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum, + unsigned Scale, raw_ostream &O) { + const MCOperand MO = MI->getOperand(OpNum); + if (MO.isImm()) { + O << "#" << (MO.getImm() * Scale); + } else { + assert(MO.isExpr() && "Unexpected operand type!"); + O << *MO.getExpr(); } - - O << "#" << *UImm16MO.getExpr(); } -void AArch64InstPrinter::printNamedImmOperand(const NamedImmMapper &Mapper, - const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - bool ValidName; - const MCOperand &MO = MI->getOperand(OpNum); - StringRef Name = Mapper.toString(MO.getImm(), ValidName); +void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum, + unsigned Scale, raw_ostream &O) { + const MCOperand MO1 = MI->getOperand(OpNum + 1); + O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); + if (MO1.isImm()) { + O << ", #" << (MO1.getImm() * Scale); + } else { + assert(MO1.isExpr() && "Unexpected operand type!"); + O << ", " << *MO1.getExpr(); + } + O << ']'; +} - if (ValidName) +void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned prfop = MI->getOperand(OpNum).getImm(); + bool Valid; + StringRef Name = AArch64PRFM::PRFMMapper().toString(prfop, Valid); + if (Valid) O << Name; else - O << '#' << MO.getImm(); + O << '#' << prfop; } -void -AArch64InstPrinter::printSysRegOperand(const A64SysReg::SysRegMapper &Mapper, - const MCInst *MI, unsigned OpNum, - raw_ostream &O) { +void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { const MCOperand &MO = MI->getOperand(OpNum); + float FPImm = + MO.isFPImm() ? MO.getFPImm() : AArch64_AM::getFPImmFloat(MO.getImm()); - bool ValidName; - std::string Name = Mapper.toString(MO.getImm(), ValidName); - if (ValidName) { - O << Name; - return; - } + // 8 decimal places are enough to perfectly represent permitted floats. 
+ O << format("#%.8f", FPImm); } +static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) { + while (Stride--) { + switch (Reg) { + default: + assert(0 && "Vector register expected!"); + case AArch64::Q0: Reg = AArch64::Q1; break; + case AArch64::Q1: Reg = AArch64::Q2; break; + case AArch64::Q2: Reg = AArch64::Q3; break; + case AArch64::Q3: Reg = AArch64::Q4; break; + case AArch64::Q4: Reg = AArch64::Q5; break; + case AArch64::Q5: Reg = AArch64::Q6; break; + case AArch64::Q6: Reg = AArch64::Q7; break; + case AArch64::Q7: Reg = AArch64::Q8; break; + case AArch64::Q8: Reg = AArch64::Q9; break; + case AArch64::Q9: Reg = AArch64::Q10; break; + case AArch64::Q10: Reg = AArch64::Q11; break; + case AArch64::Q11: Reg = AArch64::Q12; break; + case AArch64::Q12: Reg = AArch64::Q13; break; + case AArch64::Q13: Reg = AArch64::Q14; break; + case AArch64::Q14: Reg = AArch64::Q15; break; + case AArch64::Q15: Reg = AArch64::Q16; break; + case AArch64::Q16: Reg = AArch64::Q17; break; + case AArch64::Q17: Reg = AArch64::Q18; break; + case AArch64::Q18: Reg = AArch64::Q19; break; + case AArch64::Q19: Reg = AArch64::Q20; break; + case AArch64::Q20: Reg = AArch64::Q21; break; + case AArch64::Q21: Reg = AArch64::Q22; break; + case AArch64::Q22: Reg = AArch64::Q23; break; + case AArch64::Q23: Reg = AArch64::Q24; break; + case AArch64::Q24: Reg = AArch64::Q25; break; + case AArch64::Q25: Reg = AArch64::Q26; break; + case AArch64::Q26: Reg = AArch64::Q27; break; + case AArch64::Q27: Reg = AArch64::Q28; break; + case AArch64::Q28: Reg = AArch64::Q29; break; + case AArch64::Q29: Reg = AArch64::Q30; break; + case AArch64::Q30: Reg = AArch64::Q31; break; + // Vector lists can wrap around. + case AArch64::Q31: + Reg = AArch64::Q0; + break; + } + } + return Reg; +} -void AArch64InstPrinter::printRegExtendOperand(const MCInst *MI, - unsigned OpNum, - raw_ostream &O, - A64SE::ShiftExtSpecifiers Ext) { - // FIXME: In principle TableGen should be able to detect this itself far more - // easily. We will only accumulate more of these hacks. - unsigned Reg0 = MI->getOperand(0).getReg(); - unsigned Reg1 = MI->getOperand(1).getReg(); - - if (isStackReg(Reg0) || isStackReg(Reg1)) { - A64SE::ShiftExtSpecifiers LSLEquiv; - - if (Reg0 == AArch64::XSP || Reg1 == AArch64::XSP) - LSLEquiv = A64SE::UXTX; - else - LSLEquiv = A64SE::UXTW; +void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, + raw_ostream &O, + StringRef LayoutSuffix) { + unsigned Reg = MI->getOperand(OpNum).getReg(); - if (Ext == LSLEquiv) { - O << "lsl #" << MI->getOperand(OpNum).getImm(); - return; - } + O << "{ "; + + // Work out how many registers there are in the list (if there is an actual + // list). + unsigned NumRegs = 1; + if (MRI.getRegClass(AArch64::DDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQRegClassID).contains(Reg)) + NumRegs = 2; + else if (MRI.getRegClass(AArch64::DDDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQQRegClassID).contains(Reg)) + NumRegs = 3; + else if (MRI.getRegClass(AArch64::DDDDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQQQRegClassID).contains(Reg)) + NumRegs = 4; + + // Now forget about the list and find out what the first register is. + if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::dsub0)) + Reg = FirstReg; + else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::qsub0)) + Reg = FirstReg; + + // If it's a D-reg, we need to promote it to the equivalent Q-reg before + // printing (otherwise getRegisterName fails). 
+ if (MRI.getRegClass(AArch64::FPR64RegClassID).contains(Reg)) { + const MCRegisterClass &FPR128RC = + MRI.getRegClass(AArch64::FPR128RegClassID); + Reg = MRI.getMatchingSuperReg(Reg, AArch64::dsub, &FPR128RC); } - switch (Ext) { - case A64SE::UXTB: O << "uxtb"; break; - case A64SE::UXTH: O << "uxth"; break; - case A64SE::UXTW: O << "uxtw"; break; - case A64SE::UXTX: O << "uxtx"; break; - case A64SE::SXTB: O << "sxtb"; break; - case A64SE::SXTH: O << "sxth"; break; - case A64SE::SXTW: O << "sxtw"; break; - case A64SE::SXTX: O << "sxtx"; break; - default: llvm_unreachable("Unexpected shift type for printing"); + for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) { + O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix; + if (i + 1 != NumRegs) + O << ", "; } - const MCOperand &MO = MI->getOperand(OpNum); - if (MO.getImm() != 0) - O << " #" << MO.getImm(); + O << " }"; } -template void -AArch64InstPrinter::printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MOImm = MI->getOperand(OpNum); - int32_t Imm = unpackSignedImm(7, MOImm.getImm()); +void AArch64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { + printVectorList(MI, OpNum, O, ""); +} + +template +void AArch64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + std::string Suffix("."); + if (NumLanes) + Suffix += itostr(NumLanes) + LaneKind; + else + Suffix += LaneKind; - O << "#" << (Imm * MemScale); + printVectorList(MI, OpNum, O, Suffix); } -void AArch64InstPrinter::printVPRRegister(const MCInst *MI, unsigned OpNo, +void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - unsigned Reg = MI->getOperand(OpNo).getReg(); - std::string Name = getRegisterName(Reg); - Name[0] = 'v'; - O << Name; + O << "[" << MI->getOperand(OpNum).getImm() << "]"; } -void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - O << getRegisterName(Reg); - } else if (Op.isImm()) { - O << '#' << Op.getImm(); +void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + + // If the label has already been resolved to an immediate offset (say, when + // we're running the disassembler), just print the immediate. + if (Op.isImm()) { + O << "#" << (Op.getImm() << 2); + return; + } + + // If the branch target is simply an address then print it in hex. + const MCConstantExpr *BranchTarget = + dyn_cast(MI->getOperand(OpNum).getExpr()); + int64_t Address; + if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { + O << "0x"; + O.write_hex(Address); } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - // If a symbolic branch target was added as a constant expression then print - // that address in hex. - const MCConstantExpr *BranchTarget = dyn_cast(Op.getExpr()); - int64_t Address; - if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { - O << "0x"; - O.write_hex(Address); - } - else { - // Otherwise, just print the expression. - O << *Op.getExpr(); - } + // Otherwise, just print the expression. 
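/* Branch immediates are stored pre-divided by 4 (all instructions are 4-byte
   aligned), hence the << 2 here: a resolved offset of 0x100 prints as #1024.
   printAdrpLabel below does the same with << 12 for 4KiB page granularity:
   an immediate of 1 prints as #4096. */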
+ O << *MI->getOperand(OpNum).getExpr(); } } +void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); -void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot) { - if (MI->getOpcode() == AArch64::TLSDESCCALL) { - // This is a special assembler directive which applies an - // R_AARCH64_TLSDESC_CALL to the following (BLR) instruction. It has a fixed - // form outside the normal TableGenerated scheme. - O << "\t.tlsdesccall " << *MI->getOperand(0).getExpr(); - } else if (!printAliasInstr(MI, O)) - printInstruction(MI, O); + // If the label has already been resolved to an immediate offset (say, when + // we're running the disassembler), just print the immediate. + if (Op.isImm()) { + O << "#" << (Op.getImm() << 12); + return; + } - printAnnotation(O, Annot); + // Otherwise, just print the expression. + O << *MI->getOperand(OpNum).getExpr(); } -template -void AArch64InstPrinter::printNeonMovImmShiftOperand(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - - assert(MO.isImm() && - "Immediate operand required for Neon vector immediate inst."); - - bool IsLSL = false; - if (Ext == A64SE::LSL) - IsLSL = true; - else if (Ext != A64SE::MSL) - llvm_unreachable("Invalid shift specifier in movi instruction"); - - int64_t Imm = MO.getImm(); - - // MSL and LSLH accepts encoded shift amount 0 or 1. - if ((!IsLSL || (IsLSL && isHalf)) && Imm != 0 && Imm != 1) - llvm_unreachable("Invalid shift amount in movi instruction"); - - // LSH accepts encoded shift amount 0, 1, 2 or 3. - if (IsLSL && (Imm < 0 || Imm > 3)) - llvm_unreachable("Invalid shift amount in movi instruction"); - - // Print shift amount as multiple of 8 with MSL encoded shift amount - // 0 and 1 printed as 8 and 16. 
- if (!IsLSL) - Imm++; - Imm *= 8; - - // LSL #0 is not printed - if (IsLSL) { - if (Imm == 0) - return; - O << ", lsl"; - } else - O << ", msl"; - - O << " #" << Imm; -} +void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + unsigned Opcode = MI->getOpcode(); -void AArch64InstPrinter::printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &o) { - o << "#0x0"; + bool Valid; + StringRef Name; + if (Opcode == AArch64::ISB) + Name = AArch64ISB::ISBMapper().toString(Val, Valid); + else + Name = AArch64DB::DBarrierMapper().toString(Val, Valid); + if (Valid) + O << Name; + else + O << "#" << Val; } -void AArch64InstPrinter::printUImmHexOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MOUImm = MI->getOperand(OpNum); - - assert(MOUImm.isImm() && - "Immediate operand required for Neon vector immediate inst."); +void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); - unsigned Imm = MOUImm.getImm(); + bool Valid; + auto Mapper = AArch64SysReg::MRSMapper(getAvailableFeatures()); + std::string Name = Mapper.toString(Val, Valid); - O << "#0x"; - O.write_hex(Imm); + if (Valid) + O << StringRef(Name).upper(); } -void AArch64InstPrinter::printUImmBareOperand(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - const MCOperand &MOUImm = MI->getOperand(OpNum); +void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); - assert(MOUImm.isImm() - && "Immediate operand required for Neon vector immediate inst."); + bool Valid; + auto Mapper = AArch64SysReg::MSRMapper(getAvailableFeatures()); + std::string Name = Mapper.toString(Val, Valid); - unsigned Imm = MOUImm.getImm(); - O << Imm; + if (Valid) + O << StringRef(Name).upper(); } -void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - const MCOperand &MOUImm8 = MI->getOperand(OpNum); - - assert(MOUImm8.isImm() && - "Immediate operand required for Neon vector immediate bytemask inst."); +void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); - uint32_t UImm8 = MOUImm8.getImm(); - uint64_t Mask = 0; - - // Replicates 0x00 or 0xff byte in a 64-bit vector - for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { - if ((UImm8 >> ByteNum) & 1) - Mask |= (uint64_t)0xff << (8 * ByteNum); - } - - O << "#0x"; - O.write_hex(Mask); + bool Valid; + StringRef Name = AArch64PState::PStateMapper().toString(Val, Valid); + if (Valid) + O << StringRef(Name.str()).upper(); + else + O << "#" << Val; } -// If Count > 1, there are two valid kinds of vector list: -// (1) {Vn.layout, Vn+1.layout, ... , Vm.layout} -// (2) {Vn.layout - Vm.layout} -// We choose the first kind as output. -template -void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - assert(Count >= 1 && Count <= 4 && "Invalid Number of Vectors"); - - unsigned Reg = MI->getOperand(OpNum).getReg(); - std::string LayoutStr = A64VectorLayoutToString(Layout); - O << "{"; - if (Count > 1) { // Print sub registers separately - bool IsVec64 = (Layout < A64Layout::VL_16B); - unsigned SubRegIdx = IsVec64 ? 
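/* The deleted printNeonUImm64MaskOperand above expanded each set bit of the
   8-bit immediate into a 0xff byte of a 64-bit mask, e.g. UImm8 = 0xa5
   (bits 0, 2, 5 and 7 set) printed as #0xff00ff0000ff00ff. */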
AArch64::dsub_0 : AArch64::qsub_0; - for (unsigned I = 0; I < Count; I++) { - std::string Name = getRegisterName(MRI.getSubReg(Reg, SubRegIdx++)); - Name[0] = 'v'; - O << Name << LayoutStr; - if (I != Count - 1) - O << ", "; - } - } else { // Print the register directly when NumVecs is 1. - std::string Name = getRegisterName(Reg); - Name[0] = 'v'; - O << Name << LayoutStr; - } - O << "}"; +void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned RawVal = MI->getOperand(OpNo).getImm(); + uint64_t Val = AArch64_AM::decodeAdvSIMDModImmType10(RawVal); + O << format("#%#016llx", Val); } diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index 37b7273..fe7666e 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -11,11 +11,11 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64INSTPRINTER_H -#define LLVM_AARCH64INSTPRINTER_H +#ifndef AArch64INSTPRINTER_H +#define AArch64INSTPRINTER_H #include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -28,154 +28,112 @@ public: AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - // Autogenerated by tblgen - void printInstruction(const MCInst *MI, raw_ostream &O); - bool printAliasInstr(const MCInst *MI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - static const char *getInstructionName(unsigned Opcode); + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; + void printRegName(raw_ostream &OS, unsigned RegNo) const override; - void printRegName(raw_ostream &O, unsigned RegNum) const; - - template - void printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printAddrRegExtendOperand(MI, OpNum, O, MemSize, RmSize); + // Autogenerated by tblgen. 
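/* printInstruction/printAliasInstr are now virtual so that
   AArch64AppleInstPrinter (declared below) can substitute the second tblgen
   writer table, AArch64GenAsmWriter1.inc, which this patch adds to the build
   files. Presumably the MC layer instantiates one printer or the other by
   assembly syntax variant; that selection code is outside this hunk. */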
+ virtual void printInstruction(const MCInst *MI, raw_ostream &O); + virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O); + virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); + virtual StringRef getRegName(unsigned RegNo) const { + return getRegisterName(RegNo); } + static const char *getRegisterName(unsigned RegNo, + unsigned AltIdx = AArch64::NoRegAltName); - - void printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, unsigned MemSize, - unsigned RmSize); - - void printAddSubImmLSL0Operand(const MCInst *MI, - unsigned OpNum, raw_ostream &O); - void printAddSubImmLSL12Operand(const MCInst *MI, - unsigned OpNum, raw_ostream &O); - - void printBareImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - template - void printBFILSBOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printBFIWidthOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printBFXWidthOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - - void printCondCodeOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - void printCRxOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - void printCVTFixedPosOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &o); - - void printFPZeroOperand(const MCInst *MI, unsigned OpNum, raw_ostream &o); - - template - void printOffsetUImm12Operand(const MCInst *MI, - unsigned OpNum, raw_ostream &o) { - printOffsetUImm12Operand(MI, OpNum, o, MemScale); +protected: + bool printSysAlias(const MCInst *MI, raw_ostream &O); + // Operand printers + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printHexImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, + raw_ostream &O); + template + void printPostIncOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printPostIncOperand(MI, OpNo, Amount, O); } - void printOffsetUImm12Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &o, int MemScale); - - template - void printLabelOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - template - void printLogicalImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - template - void printNamedImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printNamedImmOperand(SomeNamedImmMapper(), MI, OpNum, O); + void printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSysCROperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printAddSubImm(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printLogicalImm32(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printShifter(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printShiftedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printExtendedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printArithExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O, + char SrcRegKind, unsigned Width); + template + void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O) { + printMemExtend(MI, OpNum, O, SrcRegKind, Width); } - void printNamedImmOperand(const NamedImmMapper &Mapper, - const MCInst *MI, unsigned OpNum, - raw_ostream 
&O); - - void printSysRegOperand(const A64SysReg::SysRegMapper &Mapper, - const MCInst *MI, unsigned OpNum, - raw_ostream &O); + void printCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printInverseCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAlignedLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale, + raw_ostream &O); + void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale, + raw_ostream &O); - void printMRSOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printSysRegOperand(A64SysReg::MRSMapper(), MI, OpNum, O); + template + void printUImm12Offset(const MCInst *MI, unsigned OpNum, raw_ostream &O) { + printUImm12Offset(MI, OpNum, Scale, O); } - void printMSROperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printSysRegOperand(A64SysReg::MSRMapper(), MI, OpNum, O); + template + void printAMIndexedWB(const MCInst *MI, unsigned OpNum, raw_ostream &O) { + printAMIndexedWB(MI, OpNum, BitWidth / 8, O); } - void printShiftOperand(const char *name, const MCInst *MI, - unsigned OpIdx, raw_ostream &O); - - void printLSLOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAMNoIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printLSROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printShiftOperand("lsr", MI, OpNum, O); - } - void printASROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printShiftOperand("asr", MI, OpNum, O); - } - void printROROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printShiftOperand("ror", MI, OpNum, O); - } + template + void printImmScale(const MCInst *MI, unsigned OpNum, raw_ostream &O); - template - void printShiftOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printShiftOperand(MI, OpNum, O, Shift); - } + void printPrefetchOp(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printShiftOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, A64SE::ShiftExtSpecifiers Sh); + void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O, + StringRef LayoutSuffix); - void printMoveWideImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); + /// Print a list of vector registers where the type suffix is implicit + /// (i.e. attached to the instruction rather than the registers). 
+ void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum, + raw_ostream &O); - template void - printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + template + void printTypedVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printOffsetSImm9Operand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - void printPRFMOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - template - void printRegExtendOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printRegExtendOperand(MI, OpNum, O, EXT); - } + void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAdrpLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printBarrierOption(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMSRSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMRSSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printSystemPStateField(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); +}; - void printRegExtendOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O, A64SE::ShiftExtSpecifiers Ext); +class AArch64AppleInstPrinter : public AArch64InstPrinter { +public: + AArch64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - void printVPRRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; - bool isStackReg(unsigned RegNo) { - return RegNo == AArch64::XSP || RegNo == AArch64::WSP; + void printInstruction(const MCInst *MI, raw_ostream &O) override; + bool printAliasInstr(const MCInst *MI, raw_ostream &O) override; + virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); + StringRef getRegName(unsigned RegNo) const override { + return getRegisterName(RegNo); } - - template - void printNeonMovImmShiftOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - void printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printUImmHexOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printUImmBareOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printNeonUImm64MaskOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - template - void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo, + unsigned AltIdx = AArch64::NoRegAltName); }; } diff --git a/lib/Target/AArch64/InstPrinter/Android.mk b/lib/Target/AArch64/InstPrinter/Android.mk index ac9b0df..de6aa89 100644 --- a/lib/Target/AArch64/InstPrinter/Android.mk +++ b/lib/Target/AArch64/InstPrinter/Android.mk @@ -2,6 +2,7 @@ LOCAL_PATH := $(call my-dir) arm64_asm_printer_TBLGEN_TABLES := \ AArch64GenAsmWriter.inc \ + AArch64GenAsmWriter1.inc \ AArch64GenRegisterInfo.inc \ AArch64GenSubtargetInfo.inc \ AArch64GenInstrInfo.inc diff --git a/lib/Target/AArch64/InstPrinter/CMakeLists.txt b/lib/Target/AArch64/InstPrinter/CMakeLists.txt index 3db56e4..363f502 100644 --- a/lib/Target/AArch64/InstPrinter/CMakeLists.txt +++ b/lib/Target/AArch64/InstPrinter/CMakeLists.txt @@ -1,3 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. 
${CMAKE_CURRENT_SOURCE_DIR}/.. ) + add_llvm_library(LLVMAArch64AsmPrinter AArch64InstPrinter.cpp ) + +add_dependencies(LLVMAArch64AsmPrinter AArch64CommonTableGen) diff --git a/lib/Target/AArch64/InstPrinter/LLVMBuild.txt b/lib/Target/AArch64/InstPrinter/LLVMBuild.txt index 4836c7c..a13e842 100644 --- a/lib/Target/AArch64/InstPrinter/LLVMBuild.txt +++ b/lib/Target/AArch64/InstPrinter/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AArch64/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===; +;===- ./lib/Target/AArch64/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; diff --git a/lib/Target/AArch64/InstPrinter/Makefile b/lib/Target/AArch64/InstPrinter/Makefile index 1c36a8d..b17e8d0 100644 --- a/lib/Target/AArch64/InstPrinter/Makefile +++ b/lib/Target/AArch64/InstPrinter/Makefile @@ -9,7 +9,7 @@ LEVEL = ../../../.. LIBRARYNAME = LLVMAArch64AsmPrinter -# Hack: we need to include 'main' target directory to grab private headers +# Hack: we need to include 'main' arm target directory to grab private headers CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt index 4c8f101..642c183 100644 --- a/lib/Target/AArch64/LLVMBuild.txt +++ b/lib/Target/AArch64/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AArch64/LLVMBuild.txt -----------------------*- Conf -*--===; +;===- ./lib/Target/AArch64/LLVMBuild.txt -------------------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -31,5 +31,5 @@ has_jit = 1 type = Library name = AArch64CodeGen parent = AArch64 -required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils AsmPrinter CodeGen Core MC SelectionDAG Support Target +required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h new file mode 100644 index 0000000..8b1e44e --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -0,0 +1,738 @@ +//===- AArch64AddressingModes.h - AArch64 Addressing Modes ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 addressing mode implementation stuff. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H +#define LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include + +namespace llvm { + +/// AArch64_AM - AArch64 Addressing Mode Stuff +namespace AArch64_AM { + +//===----------------------------------------------------------------------===// +// Shifts +// + +enum ShiftExtendType { + InvalidShiftExtend = -1, + LSL = 0, + LSR, + ASR, + ROR, + MSL, + + UXTB, + UXTH, + UXTW, + UXTX, + + SXTB, + SXTH, + SXTW, + SXTX, +}; + +/// getShiftName - Get the string encoding for the shift type. 
+static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) {
+  switch (ST) {
+  default: assert(false && "unhandled shift type!");
+  case AArch64_AM::LSL: return "lsl";
+  case AArch64_AM::LSR: return "lsr";
+  case AArch64_AM::ASR: return "asr";
+  case AArch64_AM::ROR: return "ror";
+  case AArch64_AM::MSL: return "msl";
+  case AArch64_AM::UXTB: return "uxtb";
+  case AArch64_AM::UXTH: return "uxth";
+  case AArch64_AM::UXTW: return "uxtw";
+  case AArch64_AM::UXTX: return "uxtx";
+  case AArch64_AM::SXTB: return "sxtb";
+  case AArch64_AM::SXTH: return "sxth";
+  case AArch64_AM::SXTW: return "sxtw";
+  case AArch64_AM::SXTX: return "sxtx";
+  }
+  return nullptr;
+}
+
+/// getShiftType - Extract the shift type.
+static inline AArch64_AM::ShiftExtendType getShiftType(unsigned Imm) {
+  switch ((Imm >> 6) & 0x7) {
+  default: return AArch64_AM::InvalidShiftExtend;
+  case 0: return AArch64_AM::LSL;
+  case 1: return AArch64_AM::LSR;
+  case 2: return AArch64_AM::ASR;
+  case 3: return AArch64_AM::ROR;
+  case 4: return AArch64_AM::MSL;
+  }
+}
+
+/// getShiftValue - Extract the shift value.
+static inline unsigned getShiftValue(unsigned Imm) {
+  return Imm & 0x3f;
+}
+
+/// getShifterImm - Encode the shift type and amount:
+///   imm:     6-bit shift amount
+///   shifter: 000 ==> lsl
+///            001 ==> lsr
+///            010 ==> asr
+///            011 ==> ror
+///            100 ==> msl
+///   {8-6} = shifter
+///   {5-0} = imm
+static inline unsigned getShifterImm(AArch64_AM::ShiftExtendType ST,
+                                     unsigned Imm) {
+  assert((Imm & 0x3f) == Imm && "Illegal shifted immedate value!");
+  unsigned STEnc = 0;
+  switch (ST) {
+  default: llvm_unreachable("Invalid shift requested");
+  case AArch64_AM::LSL: STEnc = 0; break;
+  case AArch64_AM::LSR: STEnc = 1; break;
+  case AArch64_AM::ASR: STEnc = 2; break;
+  case AArch64_AM::ROR: STEnc = 3; break;
+  case AArch64_AM::MSL: STEnc = 4; break;
+  }
+  return (STEnc << 6) | (Imm & 0x3f);
+}
+
+//===----------------------------------------------------------------------===//
+// Extends
+//
+
+/// getArithShiftValue - get the arithmetic shift value.
+static inline unsigned getArithShiftValue(unsigned Imm) {
+  return Imm & 0x7;
+}
+
+/// getExtendType - Extract the extend type for operands of arithmetic ops.
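/* Round-trip example for the shift helpers above, worked by hand:
   getShifterImm(AArch64_AM::LSR, 12) == (1 << 6) | 12 == 0x4c, and then
   getShiftType(0x4c) == AArch64_AM::LSR and getShiftValue(0x4c) == 12. */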
+static inline AArch64_AM::ShiftExtendType getExtendType(unsigned Imm) { + assert((Imm & 0x7) == Imm && "invalid immediate!"); + switch (Imm) { + default: llvm_unreachable("Compiler bug!"); + case 0: return AArch64_AM::UXTB; + case 1: return AArch64_AM::UXTH; + case 2: return AArch64_AM::UXTW; + case 3: return AArch64_AM::UXTX; + case 4: return AArch64_AM::SXTB; + case 5: return AArch64_AM::SXTH; + case 6: return AArch64_AM::SXTW; + case 7: return AArch64_AM::SXTX; + } +} + +static inline AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm) { + return getExtendType((Imm >> 3) & 0x7); +} + +/// Mapping from extend bits to required operation: +/// shifter: 000 ==> uxtb +/// 001 ==> uxth +/// 010 ==> uxtw +/// 011 ==> uxtx +/// 100 ==> sxtb +/// 101 ==> sxth +/// 110 ==> sxtw +/// 111 ==> sxtx +inline unsigned getExtendEncoding(AArch64_AM::ShiftExtendType ET) { + switch (ET) { + default: llvm_unreachable("Invalid extend type requested"); + case AArch64_AM::UXTB: return 0; break; + case AArch64_AM::UXTH: return 1; break; + case AArch64_AM::UXTW: return 2; break; + case AArch64_AM::UXTX: return 3; break; + case AArch64_AM::SXTB: return 4; break; + case AArch64_AM::SXTH: return 5; break; + case AArch64_AM::SXTW: return 6; break; + case AArch64_AM::SXTX: return 7; break; + } +} + +/// getArithExtendImm - Encode the extend type and shift amount for an +/// arithmetic instruction: +/// imm: 3-bit extend amount +/// {5-3} = shifter +/// {2-0} = imm3 +static inline unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, + unsigned Imm) { + assert((Imm & 0x7) == Imm && "Illegal shifted immediate value!"); + return (getExtendEncoding(ET) << 3) | (Imm & 0x7); +} + +/// getMemDoShift - Extract the "do shift" flag value for load/store +/// instructions. +static inline bool getMemDoShift(unsigned Imm) { + return (Imm & 0x1) != 0; +} + +/// getExtendType - Extract the extend type for the offset operand of +/// loads/stores. +static inline AArch64_AM::ShiftExtendType getMemExtendType(unsigned Imm) { + return getExtendType((Imm >> 1) & 0x7); +} + +/// getExtendImm - Encode the extend type and amount for a load/store inst: +/// doshift: should the offset be scaled by the access size +/// shifter: 000 ==> uxtb +/// 001 ==> uxth +/// 010 ==> uxtw +/// 011 ==> uxtx +/// 100 ==> sxtb +/// 101 ==> sxth +/// 110 ==> sxtw +/// 111 ==> sxtx +/// {3-1} = shifter +/// {0} = doshift +static inline unsigned getMemExtendImm(AArch64_AM::ShiftExtendType ET, + bool DoShift) { + return (getExtendEncoding(ET) << 1) | unsigned(DoShift); +} + +static inline uint64_t ror(uint64_t elt, unsigned size) { + return ((elt & 1) << (size-1)) | (elt >> 1); +} + +/// processLogicalImmediate - Determine if an immediate value can be encoded +/// as the immediate operand of a logical instruction for the given register +/// size. If so, return true with "encoding" set to the encoded value in +/// the form N:immr:imms. +static inline bool processLogicalImmediate(uint64_t imm, unsigned regSize, + uint64_t &encoding) { + if (imm == 0ULL || imm == ~0ULL || + (regSize != 64 && (imm >> regSize != 0 || imm == ~0U))) + return false; + + unsigned size = 2; + uint64_t eltVal = imm; + + // First, determine the element size.
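+ // Illustrative walk-through (not in the original source): for + // imm = 0x00ff00ff00ff00ffULL with regSize = 64, element sizes 2, 4 and 8 + // fail the all-elements-equal test, so the search below settles on + // size = 16 with eltVal = 0x00ff.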
+ while (size < regSize) { + unsigned numElts = regSize / size; + unsigned mask = (1ULL << size) - 1; + uint64_t lowestEltVal = imm & mask; + + bool allMatched = true; + for (unsigned i = 1; i < numElts; ++i) { + uint64_t currEltVal = (imm >> (i*size)) & mask; + if (currEltVal != lowestEltVal) { + allMatched = false; + break; + } + } + + if (allMatched) { + eltVal = lowestEltVal; + break; + } + + size *= 2; + } + + // Second, determine the rotation to make the element be: 0^m 1^n. + for (unsigned i = 0; i < size; ++i) { + eltVal = ror(eltVal, size); + uint32_t clz = countLeadingZeros(eltVal) - (64 - size); + uint32_t cto = CountTrailingOnes_64(eltVal); + + if (clz + cto == size) { + // Encode in immr the number of RORs it would take to get *from* this + // element value to our target value, where i+1 is the number of RORs + // to go the opposite direction. + unsigned immr = size - (i + 1); + + // If size has a 1 in the n'th bit, create a value that has zeroes in + // bits [0, n] and ones above that. + uint64_t nimms = ~(size-1) << 1; + + // Or the CTO value into the low bits, which must be below the Nth bit + // mentioned above. + nimms |= (cto-1); + + // Extract the seventh bit and toggle it to create the N field. + unsigned N = ((nimms >> 6) & 1) ^ 1; + + encoding = (N << 12) | (immr << 6) | (nimms & 0x3f); + return true; + } + } + + return false; +} + +/// isLogicalImmediate - Return true if the immediate is valid for a logical +/// immediate instruction of the given register size. Return false otherwise. +static inline bool isLogicalImmediate(uint64_t imm, unsigned regSize) { + uint64_t encoding; + return processLogicalImmediate(imm, regSize, encoding); +} + +/// encodeLogicalImmediate - Return the encoded immediate value for a logical +/// immediate instruction of the given register size. +static inline uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize) { + uint64_t encoding = 0; + bool res = processLogicalImmediate(imm, regSize, encoding); + assert(res && "invalid logical immediate"); + (void)res; + return encoding; +} + +/// decodeLogicalImmediate - Decode a logical immediate value in the form +/// "N:immr:imms" (where the immr and imms fields are each 6 bits) into the +/// integer value it represents with regSize bits. +static inline uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize) { + // Extract the N, imms, and immr fields. + unsigned N = (val >> 12) & 1; + unsigned immr = (val >> 6) & 0x3f; + unsigned imms = val & 0x3f; + + assert((regSize == 64 || N == 0) && "undefined logical immediate encoding"); + int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f)); + assert(len >= 0 && "undefined logical immediate encoding"); + unsigned size = (1 << len); + unsigned R = immr & (size - 1); + unsigned S = imms & (size - 1); + assert(S != size - 1 && "undefined logical immediate encoding"); + uint64_t pattern = (1ULL << (S + 1)) - 1; + for (unsigned i = 0; i < R; ++i) + pattern = ror(pattern, size); + + // Replicate the pattern to fill the regSize. + while (size != regSize) { + pattern |= (pattern << size); + size *= 2; + } + return pattern; +} + +/// isValidDecodeLogicalImmediate - Check to see if the logical immediate value +/// in the form "N:immr:imms" (where the immr and imms fields are each 6 bits) +/// is a valid encoding for an integer value with regSize bits. +static inline bool isValidDecodeLogicalImmediate(uint64_t val, + unsigned regSize) { + // Extract the N and imms fields needed for checking.
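+ // Example (illustrative): N = 0, imms = 0b110001 gives len = 3, hence an + // element size of 8 and S = 1, which passes every check below.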
+ unsigned N = (val >> 12) & 1; + unsigned imms = val & 0x3f; + + if (regSize == 32 && N != 0) // undefined logical immediate encoding + return false; + int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f)); + if (len < 0) // undefined logical immediate encoding + return false; + unsigned size = (1 << len); + unsigned S = imms & (size - 1); + if (S == size - 1) // undefined logical immediate encoding + return false; + + return true; +} + +//===----------------------------------------------------------------------===// + // Floating-point Immediates +// +static inline float getFPImmFloat(unsigned Imm) { + // We expect an 8-bit binary encoding of a floating-point number here. + union { + uint32_t I; + float F; + } FPUnion; + + uint8_t Sign = (Imm >> 7) & 0x1; + uint8_t Exp = (Imm >> 4) & 0x7; + uint8_t Mantissa = Imm & 0xf; + + // 8-bit FP IEEE Float Encoding + // abcd efgh aBbbbbbc defgh000 00000000 00000000 + // + // where B = NOT(b); + + FPUnion.I = 0; + FPUnion.I |= Sign << 31; + FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30; + FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25; + FPUnion.I |= (Exp & 0x3) << 23; + FPUnion.I |= Mantissa << 19; + return FPUnion.F; +} + +/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit +/// floating-point value. If the value cannot be represented as an 8-bit +/// floating-point value, then return -1. +static inline int getFP32Imm(const APInt &Imm) { + uint32_t Sign = Imm.lshr(31).getZExtValue() & 1; + int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127 + int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0x7ffff) + return -1; + Mantissa >>= 19; + if ((Mantissa & 0xf) != Mantissa) + return -1; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +static inline int getFP32Imm(const APFloat &FPImm) { + return getFP32Imm(FPImm.bitcastToAPInt()); +} + +/// getFP64Imm - Return an 8-bit floating-point version of the 64-bit +/// floating-point value. If the value cannot be represented as an 8-bit +/// floating-point value, then return -1. +static inline int getFP64Imm(const APInt &Imm) { + uint64_t Sign = Imm.lshr(63).getZExtValue() & 1; + int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023 + uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL; + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16.
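+ // Worked example (illustrative): 2.0 has Sign = 0, Exp = 1 and a zero + // Mantissa, so it passes both checks and encodes as imm8 = 0x00, while + // 1.0 (Exp = 0) encodes as 0x70.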
+ if (Mantissa & 0xffffffffffffULL) + return -1; + Mantissa >>= 48; + if ((Mantissa & 0xf) != Mantissa) + return -1; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +static inline int getFP64Imm(const APFloat &FPImm) { + return getFP64Imm(FPImm.bitcastToAPInt()); +} + +//===--------------------------------------------------------------------===// +// AdvSIMD Modified Immediates +//===--------------------------------------------------------------------===// + +// 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh +static inline bool isAdvSIMDModImmType1(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xffffff00ffffff00ULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType1(uint64_t Imm) { + return (Imm & 0xffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType1(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 32) | EncVal; +} + +// 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 +static inline bool isAdvSIMDModImmType2(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xffff00ffffff00ffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType2(uint64_t Imm) { + return (Imm & 0xff00ULL) >> 8; +} + +static inline uint64_t decodeAdvSIMDModImmType2(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 40) | (EncVal << 8); +} + +// 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 +static inline bool isAdvSIMDModImmType3(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xff00ffffff00ffffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType3(uint64_t Imm) { + return (Imm & 0xff0000ULL) >> 16; +} + +static inline uint64_t decodeAdvSIMDModImmType3(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 48) | (EncVal << 16); +} + +// abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 +static inline bool isAdvSIMDModImmType4(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0x00ffffff00ffffffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType4(uint64_t Imm) { + return (Imm & 0xff000000ULL) >> 24; +} + +static inline uint64_t decodeAdvSIMDModImmType4(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 56) | (EncVal << 24); +} + +// 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh +static inline bool isAdvSIMDModImmType5(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + (((Imm & 0x00ff0000ULL) >> 16) == (Imm & 0x000000ffULL)) && + ((Imm & 0xff00ff00ff00ff00ULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType5(uint64_t Imm) { + return (Imm & 0xffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType5(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 48) | (EncVal << 32) | (EncVal << 16) | EncVal; +} + +// abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 +static inline bool isAdvSIMDModImmType6(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + (((Imm & 0xff000000ULL) >> 16) == (Imm & 0x0000ff00ULL)) && + ((Imm & 0x00ff00ff00ff00ffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType6(uint64_t Imm) { + return (Imm & 0xff00ULL) >> 8; +} + +static inline uint64_t decodeAdvSIMDModImmType6(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 56) | (EncVal << 40) | (EncVal << 24) | (EncVal << 8); +} + +// 0x00 0x00 abcdefgh 0xFF 0x00 0x00 abcdefgh 0xFF +static inline bool isAdvSIMDModImmType7(uint64_t 
Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xffff00ffffff00ffULL) == 0x000000ff000000ffULL); +} + +static inline uint8_t encodeAdvSIMDModImmType7(uint64_t Imm) { + return (Imm & 0xff00ULL) >> 8; +} + +static inline uint64_t decodeAdvSIMDModImmType7(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 40) | (EncVal << 8) | 0x000000ff000000ffULL; +} + +// 0x00 abcdefgh 0xFF 0xFF 0x00 abcdefgh 0xFF 0xFF +static inline bool isAdvSIMDModImmType8(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xff00ffffff00ffffULL) == 0x0000ffff0000ffffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType8(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 48) | (EncVal << 16) | 0x0000ffff0000ffffULL; +} + +static inline uint8_t encodeAdvSIMDModImmType8(uint64_t Imm) { + return (Imm & 0x00ff0000ULL) >> 16; +} + +// abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh +static inline bool isAdvSIMDModImmType9(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm >> 48) == (Imm & 0x0000ffffULL)) && + ((Imm >> 56) == (Imm & 0x000000ffULL)); +} + +static inline uint8_t encodeAdvSIMDModImmType9(uint64_t Imm) { + return (Imm & 0xffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType9(uint8_t Imm) { + uint64_t EncVal = Imm; + EncVal |= (EncVal << 8); + EncVal |= (EncVal << 16); + EncVal |= (EncVal << 32); + return EncVal; +} + +// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh +// cmode: 1110, op: 1 +static inline bool isAdvSIMDModImmType10(uint64_t Imm) { + uint64_t ByteA = Imm & 0xff00000000000000ULL; + uint64_t ByteB = Imm & 0x00ff000000000000ULL; + uint64_t ByteC = Imm & 0x0000ff0000000000ULL; + uint64_t ByteD = Imm & 0x000000ff00000000ULL; + uint64_t ByteE = Imm & 0x00000000ff000000ULL; + uint64_t ByteF = Imm & 0x0000000000ff0000ULL; + uint64_t ByteG = Imm & 0x000000000000ff00ULL; + uint64_t ByteH = Imm & 0x00000000000000ffULL; + + return (ByteA == 0ULL || ByteA == 0xff00000000000000ULL) && + (ByteB == 0ULL || ByteB == 0x00ff000000000000ULL) && + (ByteC == 0ULL || ByteC == 0x0000ff0000000000ULL) && + (ByteD == 0ULL || ByteD == 0x000000ff00000000ULL) && + (ByteE == 0ULL || ByteE == 0x00000000ff000000ULL) && + (ByteF == 0ULL || ByteF == 0x0000000000ff0000ULL) && + (ByteG == 0ULL || ByteG == 0x000000000000ff00ULL) && + (ByteH == 0ULL || ByteH == 0x00000000000000ffULL); +} + +static inline uint8_t encodeAdvSIMDModImmType10(uint64_t Imm) { + uint8_t BitA = (Imm & 0xff00000000000000ULL) != 0; + uint8_t BitB = (Imm & 0x00ff000000000000ULL) != 0; + uint8_t BitC = (Imm & 0x0000ff0000000000ULL) != 0; + uint8_t BitD = (Imm & 0x000000ff00000000ULL) != 0; + uint8_t BitE = (Imm & 0x00000000ff000000ULL) != 0; + uint8_t BitF = (Imm & 0x0000000000ff0000ULL) != 0; + uint8_t BitG = (Imm & 0x000000000000ff00ULL) != 0; + uint8_t BitH = (Imm & 0x00000000000000ffULL) != 0; + + uint8_t EncVal = BitA; + EncVal <<= 1; + EncVal |= BitB; + EncVal <<= 1; + EncVal |= BitC; + EncVal <<= 1; + EncVal |= BitD; + EncVal <<= 1; + EncVal |= BitE; + EncVal <<= 1; + EncVal |= BitF; + EncVal <<= 1; + EncVal |= BitG; + EncVal <<= 1; + EncVal |= BitH; + return EncVal; +} + +static inline uint64_t decodeAdvSIMDModImmType10(uint8_t Imm) { + uint64_t EncVal = 0; + if (Imm & 0x80) EncVal |= 0xff00000000000000ULL; + if (Imm & 0x40) EncVal |= 0x00ff000000000000ULL; + if (Imm & 0x20) EncVal |= 0x0000ff0000000000ULL; + if (Imm & 0x10) EncVal |= 0x000000ff00000000ULL; + if (Imm & 0x08) EncVal |= 0x00000000ff000000ULL; + 
if (Imm & 0x04) EncVal |= 0x0000000000ff0000ULL; + if (Imm & 0x02) EncVal |= 0x000000000000ff00ULL; + if (Imm & 0x01) EncVal |= 0x00000000000000ffULL; + return EncVal; +} + +// aBbbbbbc defgh000 0x00 0x00 aBbbbbbc defgh000 0x00 0x00 +static inline bool isAdvSIMDModImmType11(uint64_t Imm) { + uint64_t BString = (Imm & 0x7E000000ULL) >> 25; + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + (BString == 0x1f || BString == 0x20) && + ((Imm & 0x0007ffff0007ffffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType11(uint64_t Imm) { + uint8_t BitA = (Imm & 0x80000000ULL) != 0; + uint8_t BitB = (Imm & 0x20000000ULL) != 0; + uint8_t BitC = (Imm & 0x01000000ULL) != 0; + uint8_t BitD = (Imm & 0x00800000ULL) != 0; + uint8_t BitE = (Imm & 0x00400000ULL) != 0; + uint8_t BitF = (Imm & 0x00200000ULL) != 0; + uint8_t BitG = (Imm & 0x00100000ULL) != 0; + uint8_t BitH = (Imm & 0x00080000ULL) != 0; + + uint8_t EncVal = BitA; + EncVal <<= 1; + EncVal |= BitB; + EncVal <<= 1; + EncVal |= BitC; + EncVal <<= 1; + EncVal |= BitD; + EncVal <<= 1; + EncVal |= BitE; + EncVal <<= 1; + EncVal |= BitF; + EncVal <<= 1; + EncVal |= BitG; + EncVal <<= 1; + EncVal |= BitH; + return EncVal; +} + +static inline uint64_t decodeAdvSIMDModImmType11(uint8_t Imm) { + uint64_t EncVal = 0; + if (Imm & 0x80) EncVal |= 0x80000000ULL; + if (Imm & 0x40) EncVal |= 0x3e000000ULL; + else EncVal |= 0x40000000ULL; + if (Imm & 0x20) EncVal |= 0x01000000ULL; + if (Imm & 0x10) EncVal |= 0x00800000ULL; + if (Imm & 0x08) EncVal |= 0x00400000ULL; + if (Imm & 0x04) EncVal |= 0x00200000ULL; + if (Imm & 0x02) EncVal |= 0x00100000ULL; + if (Imm & 0x01) EncVal |= 0x00080000ULL; + return (EncVal << 32) | EncVal; +} + +// aBbbbbbb bbcdefgh 0x00 0x00 0x00 0x00 0x00 0x00 +static inline bool isAdvSIMDModImmType12(uint64_t Imm) { + uint64_t BString = (Imm & 0x7fc0000000000000ULL) >> 54; + return ((BString == 0xff || BString == 0x100) && + ((Imm & 0x0000ffffffffffffULL) == 0)); +} + +static inline uint8_t encodeAdvSIMDModImmType12(uint64_t Imm) { + uint8_t BitA = (Imm & 0x8000000000000000ULL) != 0; + uint8_t BitB = (Imm & 0x0040000000000000ULL) != 0; + uint8_t BitC = (Imm & 0x0020000000000000ULL) != 0; + uint8_t BitD = (Imm & 0x0010000000000000ULL) != 0; + uint8_t BitE = (Imm & 0x0008000000000000ULL) != 0; + uint8_t BitF = (Imm & 0x0004000000000000ULL) != 0; + uint8_t BitG = (Imm & 0x0002000000000000ULL) != 0; + uint8_t BitH = (Imm & 0x0001000000000000ULL) != 0; + + uint8_t EncVal = BitA; + EncVal <<= 1; + EncVal |= BitB; + EncVal <<= 1; + EncVal |= BitC; + EncVal <<= 1; + EncVal |= BitD; + EncVal <<= 1; + EncVal |= BitE; + EncVal <<= 1; + EncVal |= BitF; + EncVal <<= 1; + EncVal |= BitG; + EncVal <<= 1; + EncVal |= BitH; + return EncVal; +} + +static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) { + uint64_t EncVal = 0; + if (Imm & 0x80) EncVal |= 0x8000000000000000ULL; + if (Imm & 0x40) EncVal |= 0x3fc0000000000000ULL; + else EncVal |= 0x4000000000000000ULL; + if (Imm & 0x20) EncVal |= 0x0020000000000000ULL; + if (Imm & 0x10) EncVal |= 0x0010000000000000ULL; + if (Imm & 0x08) EncVal |= 0x0008000000000000ULL; + if (Imm & 0x04) EncVal |= 0x0004000000000000ULL; + if (Imm & 0x02) EncVal |= 0x0002000000000000ULL; + if (Imm & 0x01) EncVal |= 0x0001000000000000ULL; + return (EncVal << 32) | EncVal; +} + +} // end namespace AArch64_AM + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index f1452ab..d8900d4 100644 --- 
a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -6,168 +6,57 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file contains the AArch64 implementation of the MCAsmBackend class, -// which is principally concerned with relaxation of the various fixup kinds. -// -//===----------------------------------------------------------------------===// +#include "AArch64.h" +#include "AArch64RegisterInfo.h" #include "MCTargetDesc/AArch64FixupKinds.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/ADT/Triple.h" #include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/ELF.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/MachO.h" using namespace llvm; namespace { -class AArch64AsmBackend : public MCAsmBackend { - const MCSubtargetInfo* STI; -public: - AArch64AsmBackend(const Target &T, const StringRef TT) - : MCAsmBackend(), - STI(AArch64_MC::createAArch64MCSubtargetInfo(TT, "", "")) - {} - - - ~AArch64AsmBackend() { - delete STI; - } - - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const; - - virtual void processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved); -}; -} // end anonymous namespace - -void AArch64AsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, - const MCFragment *DF, - const MCValue &Target, - uint64_t &Value, bool &IsResolved) { - // The ADRP instruction adds some multiple of 0x1000 to the current PC & - // ~0xfff. This means that the required offset to reach a symbol can vary by - // up to one step depending on where the ADRP is in memory. For example: - // - // ADRP x0, there - // there: - // - // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and - // we'll need that as an offset. At any other address "there" will be in the - // same page as the ADRP and the instruction should encode 0x0. Assuming the - // section isn't 0x1000-aligned, we therefore need to delegate this decision - // to the linker -- a relocation! 
- if ((uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_prel_page || - (uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_prel_got_page || - (uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_gottprel_page || - (uint32_t)Fixup.getKind() == AArch64::fixup_a64_tlsdesc_adr_page) - IsResolved = false; -} - -static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value); - -namespace { +class AArch64AsmBackend : public MCAsmBackend { + static const unsigned PCRelFlagVal = + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; -class ELFAArch64AsmBackend : public AArch64AsmBackend { - uint8_t OSABI; - bool IsLittle; // Big or little endian public: - ELFAArch64AsmBackend(const Target &T, const StringRef TT, - uint8_t _OSABI, bool isLittle) - : AArch64AsmBackend(T, TT), OSABI(_OSABI), IsLittle(isLittle) { } + AArch64AsmBackend(const Target &T) : MCAsmBackend() {} - bool fixupNeedsRelaxation(const MCFixup &Fixup, - uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const; - - unsigned int getNumFixupKinds() const { + unsigned getNumFixupKinds() const override { return AArch64::NumTargetFixupKinds; } - const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const { + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = { -// This table *must* be in the order that the fixup_* kinds are defined in -// AArch64FixupKinds.h. -// -// Name Offset (bits) Size (bits) Flags -{ "fixup_a64_ld_prel", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_adr_prel", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_adr_prel_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_add_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst8_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst16_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst32_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst64_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst128_lo12", 0, 32, 0 }, -{ "fixup_a64_tstbr", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_condbr", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_uncondbr", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_call", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_movw_uabs_g0", 0, 32, 0 }, -{ "fixup_a64_movw_uabs_g0_nc", 0, 32, 0 }, -{ "fixup_a64_movw_uabs_g1", 0, 32, 0 }, -{ "fixup_a64_movw_uabs_g1_nc", 0, 32, 0 }, -{ "fixup_a64_movw_uabs_g2", 0, 32, 0 }, -{ "fixup_a64_movw_uabs_g2_nc", 0, 32, 0 }, -{ "fixup_a64_movw_uabs_g3", 0, 32, 0 }, -{ "fixup_a64_movw_sabs_g0", 0, 32, 0 }, -{ "fixup_a64_movw_sabs_g1", 0, 32, 0 }, -{ "fixup_a64_movw_sabs_g2", 0, 32, 0 }, -{ "fixup_a64_adr_prel_got_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_ld64_got_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_movw_dtprel_g2", 0, 32, 0 }, -{ "fixup_a64_movw_dtprel_g1", 0, 32, 0 }, -{ "fixup_a64_movw_dtprel_g1_nc", 0, 32, 0 }, -{ "fixup_a64_movw_dtprel_g0", 0, 32, 0 }, -{ "fixup_a64_movw_dtprel_g0_nc", 0, 32, 0 }, -{ "fixup_a64_add_dtprel_hi12", 0, 32, 0 }, -{ "fixup_a64_add_dtprel_lo12", 0, 32, 0 }, -{ "fixup_a64_add_dtprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst8_dtprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst8_dtprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst16_dtprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst16_dtprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst32_dtprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst32_dtprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst64_dtprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst64_dtprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_movw_gottprel_g1", 0, 32, 0 }, -{ 
"fixup_a64_movw_gottprel_g0_nc", 0, 32, 0 }, -{ "fixup_a64_adr_gottprel_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_ld64_gottprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ld_gottprel_prel19", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_movw_tprel_g2", 0, 32, 0 }, -{ "fixup_a64_movw_tprel_g1", 0, 32, 0 }, -{ "fixup_a64_movw_tprel_g1_nc", 0, 32, 0 }, -{ "fixup_a64_movw_tprel_g0", 0, 32, 0 }, -{ "fixup_a64_movw_tprel_g0_nc", 0, 32, 0 }, -{ "fixup_a64_add_tprel_hi12", 0, 32, 0 }, -{ "fixup_a64_add_tprel_lo12", 0, 32, 0 }, -{ "fixup_a64_add_tprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst8_tprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst8_tprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst16_tprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst16_tprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst32_tprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst32_tprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_ldst64_tprel_lo12", 0, 32, 0 }, -{ "fixup_a64_ldst64_tprel_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_tlsdesc_adr_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_a64_tlsdesc_ld64_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_tlsdesc_add_lo12_nc", 0, 32, 0 }, -{ "fixup_a64_tlsdesc_call", 0, 0, 0 } + // This table *must* be in the order that the fixup_* kinds are defined in + // AArch64FixupKinds.h. + // + // Name Offset (bits) Size (bits) Flags + { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, + { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, + { "fixup_aarch64_add_imm12", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 }, + { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal }, + { "fixup_aarch64_movw", 5, 16, 0 }, + { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal }, + { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal }, + { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal }, + { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal }, + { "fixup_aarch64_tlsdesc_call", 0, 0, 0 } }; + if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); @@ -177,417 +66,501 @@ public: } void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const { - unsigned NumBytes = getFixupKindInfo(Fixup.getKind()).TargetSize / 8; - Value = adjustFixupValue(Fixup.getKind(), Value); - if (!Value) return; // Doesn't change encoding. - - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); - - // For each byte of the fragment that the fixup touches, mask in the bits - // from the fixup value. 
- for (unsigned i = 0; i != NumBytes; ++i) { - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); - } - } + uint64_t Value, bool IsPCRel) const override; - bool mayNeedRelaxation(const MCInst&) const { - return false; - } + bool mayNeedRelaxation(const MCInst &Inst) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override; + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override; + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; - void relaxInstruction(const MCInst&, llvm::MCInst&) const { - llvm_unreachable("Cannot relax instructions"); - } + void HandleAssemblerFlag(MCAssemblerFlag Flag) {} - MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createAArch64ELFObjectWriter(OS, OSABI, IsLittle); - } + unsigned getPointerSize() const { return 8; } }; } // end anonymous namespace -bool -ELFAArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, - uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const { - // Correct for now. With all instructions 32-bit only very low-level - // considerations could make you select something which may fail. - return false; -} +/// \brief The number of bytes the fixup may change. +static unsigned getFixupKindNumBytes(unsigned Kind) { + switch (Kind) { + default: + assert(0 && "Unknown fixup kind!"); + case AArch64::fixup_aarch64_tlsdesc_call: + return 0; -bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - // Can't emit NOP with size not multiple of 32-bits - if (Count % 4 != 0) - return false; + case FK_Data_1: + return 1; - uint64_t NumNops = Count / 4; - for (uint64_t i = 0; i != NumNops; ++i) - OW->Write32(0xd503201f); + case FK_Data_2: + case AArch64::fixup_aarch64_movw: + return 2; + + case AArch64::fixup_aarch64_pcrel_branch14: + case AArch64::fixup_aarch64_add_imm12: + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + case AArch64::fixup_aarch64_ldr_pcrel_imm19: + case AArch64::fixup_aarch64_pcrel_branch19: + return 3; + + case AArch64::fixup_aarch64_pcrel_adr_imm21: + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + case FK_Data_4: + return 4; - return true; + case FK_Data_8: + return 8; + } } -static unsigned ADRImmBits(unsigned Value) { +static unsigned AdrImmBits(unsigned Value) { unsigned lo2 = Value & 0x3; - unsigned hi19 = (Value & 0x1fffff) >> 2; - + unsigned hi19 = (Value & 0x1ffffc) >> 2; return (hi19 << 5) | (lo2 << 29); } static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { + int64_t SignedValue = static_cast<int64_t>(Value); switch (Kind) { default: - llvm_unreachable("Unknown fixup kind!"); - case FK_Data_2: - assert((int64_t)Value >= -32768 && - (int64_t)Value <= 65536 && - "Out of range ABS16 fixup"); + assert(false && "Unknown fixup kind!"); + case AArch64::fixup_aarch64_pcrel_adr_imm21: + if (SignedValue > 2097151 || SignedValue < -2097152) + report_fatal_error("fixup value out of range"); + return AdrImmBits(Value & 0x1fffffULL); + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + return AdrImmBits((Value & 0x1fffff000ULL) >> 12); + case AArch64::fixup_aarch64_ldr_pcrel_imm19: + case AArch64::fixup_aarch64_pcrel_branch19: + //
Signed 21-bit immediate + if (SignedValue > 2097151 || SignedValue < -2097152) + report_fatal_error("fixup value out of range"); + // Low two bits are not encoded. + return (Value >> 2) & 0x7ffff; + case AArch64::fixup_aarch64_add_imm12: + case AArch64::fixup_aarch64_ldst_imm12_scale1: + // Unsigned 12-bit immediate + if (Value >= 0x1000) + report_fatal_error("invalid imm12 fixup value"); return Value; - case FK_Data_4: - assert((int64_t)Value >= -(1LL << 31) && - (int64_t)Value <= (1LL << 32) - 1 && - "Out of range ABS32 fixup"); + case AArch64::fixup_aarch64_ldst_imm12_scale2: + // Unsigned 12-bit immediate which gets multiplied by 2 + if (Value & 1 || Value >= 0x2000) + report_fatal_error("invalid imm12 fixup value"); + return Value >> 1; + case AArch64::fixup_aarch64_ldst_imm12_scale4: + // Unsigned 12-bit immediate which gets multiplied by 4 + if (Value & 3 || Value >= 0x4000) + report_fatal_error("invalid imm12 fixup value"); + return Value >> 2; + case AArch64::fixup_aarch64_ldst_imm12_scale8: + // Unsigned 12-bit immediate which gets multiplied by 8 + if (Value & 7 || Value >= 0x8000) + report_fatal_error("invalid imm12 fixup value"); + return Value >> 3; + case AArch64::fixup_aarch64_ldst_imm12_scale16: + // Unsigned 12-bit immediate which gets multiplied by 16 + if (Value & 15 || Value >= 0x10000) + report_fatal_error("invalid imm12 fixup value"); + return Value >> 4; + case AArch64::fixup_aarch64_movw: + report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet"); return Value; + case AArch64::fixup_aarch64_pcrel_branch14: + // Signed 16-bit immediate + if (SignedValue > 32767 || SignedValue < -32768) + report_fatal_error("fixup value out of range"); + // Low two bits are not encoded (4-byte alignment assumed). + if (Value & 0x3) + report_fatal_error("fixup not sufficiently aligned"); + return (Value >> 2) & 0x3fff; + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + // Signed 28-bit immediate + if (SignedValue > 134217727 || SignedValue < -134217728) + report_fatal_error("fixup value out of range"); + // Low two bits are not encoded (4-byte alignment assumed). + if (Value & 0x3) + report_fatal_error("fixup not sufficiently aligned"); + return (Value >> 2) & 0x3ffffff; + case FK_Data_1: + case FK_Data_2: + case FK_Data_4: case FK_Data_8: return Value; + } +} - case AArch64::fixup_a64_ld_gottprel_prel19: - // R_AARCH64_LD_GOTTPREL_PREL19: Set a load-literal immediate to bits 1F - // FFFC of G(TPREL(S+A)) - P; check -2^20 <= X < 2^20. - case AArch64::fixup_a64_ld_prel: - // R_AARCH64_LD_PREL_LO19: Sets a load-literal (immediate) value to bits - // 1F FFFC of S+A-P, checking that -2^20 <= S+A-P < 2^20. - assert((int64_t)Value >= -(1LL << 20) && - (int64_t)Value < (1LL << 20) && "Out of range LDR (lit) fixup"); - return (Value & 0x1ffffc) << 3; - - case AArch64::fixup_a64_adr_prel: - // R_AARCH64_ADR_PREL_LO21: Sets an ADR immediate value to bits 1F FFFF of - // the result of S+A-P, checking that -2^20 <= S+A-P < 2^20. - assert((int64_t)Value >= -(1LL << 20) && - (int64_t)Value < (1LL << 20) && "Out of range ADR fixup"); - return ADRImmBits(Value & 0x1fffff); - - case AArch64::fixup_a64_adr_prel_page: - // R_AARCH64_ADR_PREL_PG_HI21: Sets an ADRP immediate value to bits 1 FFFF - // F000 of the result of the operation, checking that -2^32 <= result < - // 2^32. 
- assert((int64_t)Value >= -(1LL << 32) && - (int64_t)Value < (1LL << 32) && "Out of range ADRP fixup"); - return ADRImmBits((Value & 0x1fffff000ULL) >> 12); - - case AArch64::fixup_a64_add_dtprel_hi12: - // R_AARCH64_TLSLD_ADD_DTPREL_LO12: Set an ADD immediate field to bits - // FF F000 of DTPREL(S+A), check 0 <= X < 2^24. - case AArch64::fixup_a64_add_tprel_hi12: - // R_AARCH64_TLSLD_ADD_TPREL_LO12: Set an ADD immediate field to bits - // FF F000 of TPREL(S+A), check 0 <= X < 2^24. - assert((int64_t)Value >= 0 && - (int64_t)Value < (1LL << 24) && "Out of range ADD fixup"); - return (Value & 0xfff000) >> 2; - - case AArch64::fixup_a64_add_dtprel_lo12: - // R_AARCH64_TLSLD_ADD_DTPREL_LO12: Set an ADD immediate field to bits - // FFF of DTPREL(S+A), check 0 <= X < 2^12. - case AArch64::fixup_a64_add_tprel_lo12: - // R_AARCH64_TLSLD_ADD_TPREL_LO12: Set an ADD immediate field to bits - // FFF of TPREL(S+A), check 0 <= X < 2^12. - assert((int64_t)Value >= 0 && - (int64_t)Value < (1LL << 12) && "Out of range ADD fixup"); - // ... fallthrough to no-checking versions ... - case AArch64::fixup_a64_add_dtprel_lo12_nc: - // R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC: Set an ADD immediate field to bits - // FFF of DTPREL(S+A) with no overflow check. - case AArch64::fixup_a64_add_tprel_lo12_nc: - // R_AARCH64_TLSLD_ADD_TPREL_LO12_NC: Set an ADD immediate field to bits - // FFF of TPREL(S+A) with no overflow check. - case AArch64::fixup_a64_tlsdesc_add_lo12_nc: - // R_AARCH64_TLSDESC_ADD_LO12_NC: Set an ADD immediate field to bits - // FFF of G(TLSDESC(S+A)), with no overflow check. - case AArch64::fixup_a64_add_lo12: - // R_AARCH64_ADD_ABS_LO12_NC: Sets an ADD immediate value to bits FFF of - // S+A, with no overflow check. - return (Value & 0xfff) << 10; - - case AArch64::fixup_a64_ldst8_dtprel_lo12: - // R_AARCH64_TLSLD_LDST8_DTPREL_LO12: Set an LD/ST offset field to bits FFF - // of DTPREL(S+A), check 0 <= X < 2^12. - case AArch64::fixup_a64_ldst8_tprel_lo12: - // R_AARCH64_TLSLE_LDST8_TPREL_LO12: Set an LD/ST offset field to bits FFF - // of DTPREL(S+A), check 0 <= X < 2^12. - assert((int64_t) Value >= 0 && - (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup"); - // ... fallthrough to no-checking versions ... - case AArch64::fixup_a64_ldst8_dtprel_lo12_nc: - // R_AARCH64_TLSLD_LDST8_DTPREL_LO12: Set an LD/ST offset field to bits FFF - // of DTPREL(S+A), with no overflow check. - case AArch64::fixup_a64_ldst8_tprel_lo12_nc: - // R_AARCH64_TLSLD_LDST8_TPREL_LO12: Set an LD/ST offset field to bits FFF - // of TPREL(S+A), with no overflow check. - case AArch64::fixup_a64_ldst8_lo12: - // R_AARCH64_LDST8_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFF - // of S+A, with no overflow check. - return (Value & 0xfff) << 10; - - case AArch64::fixup_a64_ldst16_dtprel_lo12: - // R_AARCH64_TLSLD_LDST16_DTPREL_LO12: Set an LD/ST offset field to bits FFE - // of DTPREL(S+A), check 0 <= X < 2^12. - case AArch64::fixup_a64_ldst16_tprel_lo12: - // R_AARCH64_TLSLE_LDST16_TPREL_LO12: Set an LD/ST offset field to bits FFE - // of DTPREL(S+A), check 0 <= X < 2^12. - assert((int64_t) Value >= 0 && - (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup"); - // ... fallthrough to no-checking versions ... - case AArch64::fixup_a64_ldst16_dtprel_lo12_nc: - // R_AARCH64_TLSLD_LDST16_DTPREL_LO12: Set an LD/ST offset field to bits FFE - // of DTPREL(S+A), with no overflow check. 
- case AArch64::fixup_a64_ldst16_tprel_lo12_nc: - // R_AARCH64_TLSLD_LDST16_TPREL_LO12: Set an LD/ST offset field to bits FFE - // of TPREL(S+A), with no overflow check. - case AArch64::fixup_a64_ldst16_lo12: - // R_AARCH64_LDST16_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFE - // of S+A, with no overflow check. - return (Value & 0xffe) << 9; - - case AArch64::fixup_a64_ldst32_dtprel_lo12: - // R_AARCH64_TLSLD_LDST32_DTPREL_LO12: Set an LD/ST offset field to bits FFC - // of DTPREL(S+A), check 0 <= X < 2^12. - case AArch64::fixup_a64_ldst32_tprel_lo12: - // R_AARCH64_TLSLE_LDST32_TPREL_LO12: Set an LD/ST offset field to bits FFC - // of DTPREL(S+A), check 0 <= X < 2^12. - assert((int64_t) Value >= 0 && - (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup"); - // ... fallthrough to no-checking versions ... - case AArch64::fixup_a64_ldst32_dtprel_lo12_nc: - // R_AARCH64_TLSLD_LDST32_DTPREL_LO12: Set an LD/ST offset field to bits FFC - // of DTPREL(S+A), with no overflow check. - case AArch64::fixup_a64_ldst32_tprel_lo12_nc: - // R_AARCH64_TLSLD_LDST32_TPREL_LO12: Set an LD/ST offset field to bits FFC - // of TPREL(S+A), with no overflow check. - case AArch64::fixup_a64_ldst32_lo12: - // R_AARCH64_LDST32_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFC - // of S+A, with no overflow check. - return (Value & 0xffc) << 8; - - case AArch64::fixup_a64_ldst64_dtprel_lo12: - // R_AARCH64_TLSLD_LDST64_DTPREL_LO12: Set an LD/ST offset field to bits FF8 - // of DTPREL(S+A), check 0 <= X < 2^12. - case AArch64::fixup_a64_ldst64_tprel_lo12: - // R_AARCH64_TLSLE_LDST64_TPREL_LO12: Set an LD/ST offset field to bits FF8 - // of DTPREL(S+A), check 0 <= X < 2^12. - assert((int64_t) Value >= 0 && - (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup"); - // ... fallthrough to no-checking versions ... - case AArch64::fixup_a64_ldst64_dtprel_lo12_nc: - // R_AARCH64_TLSLD_LDST64_DTPREL_LO12: Set an LD/ST offset field to bits FF8 - // of DTPREL(S+A), with no overflow check. - case AArch64::fixup_a64_ldst64_tprel_lo12_nc: - // R_AARCH64_TLSLD_LDST64_TPREL_LO12: Set an LD/ST offset field to bits FF8 - // of TPREL(S+A), with no overflow check. - case AArch64::fixup_a64_ldst64_lo12: - // R_AARCH64_LDST64_ABS_LO12_NC: Sets an LD/ST immediate value to bits FF8 - // of S+A, with no overflow check. - return (Value & 0xff8) << 7; - - case AArch64::fixup_a64_ldst128_lo12: - // R_AARCH64_LDST128_ABS_LO12_NC: Sets an LD/ST immediate value to bits FF0 - // of S+A, with no overflow check. - return (Value & 0xff0) << 6; - - case AArch64::fixup_a64_movw_uabs_g0: - // R_AARCH64_MOVW_UABS_G0: Sets a MOVZ immediate field to bits FFFF of S+A - // with a check that S+A < 2^16 - assert(Value <= 0xffff && "Out of range move wide fixup"); - return (Value & 0xffff) << 5; - - case AArch64::fixup_a64_movw_dtprel_g0_nc: - // R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC: Sets a MOVK immediate field to bits - // FFFF of DTPREL(S+A) with no overflow check. - case AArch64::fixup_a64_movw_gottprel_g0_nc: - // R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC: Sets a MOVK immediate field to bits - // FFFF of G(TPREL(S+A)) - GOT with no overflow check. - case AArch64::fixup_a64_movw_tprel_g0_nc: - // R_AARCH64_TLSLE_MOVW_TPREL_G0_NC: Sets a MOVK immediate field to bits - // FFFF of TPREL(S+A) with no overflow check. - case AArch64::fixup_a64_movw_uabs_g0_nc: - // R_AARCH64_MOVW_UABS_G0_NC: Sets a MOVK immediate field to bits FFFF of - // S+A with no overflow check. 
- return (Value & 0xffff) << 5; - - case AArch64::fixup_a64_movw_uabs_g1: - // R_AARCH64_MOVW_UABS_G1: Sets a MOVZ immediate field to bits FFFF0000 of - // S+A with a check that S+A < 2^32 - assert(Value <= 0xffffffffull && "Out of range move wide fixup"); - return ((Value >> 16) & 0xffff) << 5; - - case AArch64::fixup_a64_movw_dtprel_g1_nc: - // R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC: Set a MOVK immediate field - // to bits FFFF0000 of DTPREL(S+A), with no overflow check. - case AArch64::fixup_a64_movw_tprel_g1_nc: - // R_AARCH64_TLSLD_MOVW_TPREL_G1_NC: Set a MOVK immediate field - // to bits FFFF0000 of TPREL(S+A), with no overflow check. - case AArch64::fixup_a64_movw_uabs_g1_nc: - // R_AARCH64_MOVW_UABS_G1_NC: Sets a MOVK immediate field to bits - // FFFF0000 of S+A with no overflow check. - return ((Value >> 16) & 0xffff) << 5; - - case AArch64::fixup_a64_movw_uabs_g2: - // R_AARCH64_MOVW_UABS_G2: Sets a MOVZ immediate field to bits FFFF 0000 - // 0000 of S+A with a check that S+A < 2^48 - assert(Value <= 0xffffffffffffull && "Out of range move wide fixup"); - return ((Value >> 32) & 0xffff) << 5; - - case AArch64::fixup_a64_movw_uabs_g2_nc: - // R_AARCH64_MOVW_UABS_G2: Sets a MOVK immediate field to bits FFFF 0000 - // 0000 of S+A with no overflow check. - return ((Value >> 32) & 0xffff) << 5; - - case AArch64::fixup_a64_movw_uabs_g3: - // R_AARCH64_MOVW_UABS_G3: Sets a MOVZ immediate field to bits FFFF 0000 - // 0000 0000 of S+A (no overflow check needed) - return ((Value >> 48) & 0xffff) << 5; - - case AArch64::fixup_a64_movw_dtprel_g0: - // R_AARCH64_TLSLD_MOVW_DTPREL_G0: Set a MOV[NZ] immediate field - // to bits FFFF of DTPREL(S+A). - case AArch64::fixup_a64_movw_tprel_g0: - // R_AARCH64_TLSLE_MOVW_TPREL_G0: Set a MOV[NZ] immediate field to - // bits FFFF of TPREL(S+A). - case AArch64::fixup_a64_movw_sabs_g0: { - // R_AARCH64_MOVW_SABS_G0: Sets MOV[NZ] immediate field using bits FFFF of - // S+A (see notes below); check -2^16 <= S+A < 2^16. (notes say that we - // should convert between MOVN and MOVZ to achieve our goals). - int64_t Signed = Value; - assert(Signed >= -(1LL << 16) && Signed < (1LL << 16) - && "Out of range move wide fixup"); - if (Signed >= 0) { - Value = (Value & 0xffff) << 5; - // Bit 30 converts the MOVN encoding into a MOVZ - Value |= 1 << 30; - } else { - // MCCodeEmitter should have encoded a MOVN, which is fine. - Value = (~Value & 0xffff) << 5; - } - return Value; +void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); + if (!Value) + return; // Doesn't change encoding. + MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); + // Apply any target-specific value adjustments. + Value = adjustFixupValue(Fixup.getKind(), Value); + + // Shift the value into position. + Value <<= Info.TargetOffset; + + unsigned Offset = Fixup.getOffset(); + assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + + // For each byte of the fragment that the fixup touches, mask in the + // bits from the fixup value. + for (unsigned i = 0; i != NumBytes; ++i) + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); +} + +bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { + return false; +} + +bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const { + // FIXME: This isn't correct for AArch64. 
Just moving the "generic" logic + // into the targets for now. + // + // Relax if the value is too big for a (signed) i8. + return int64_t(Value) != int64_t(int8_t(Value)); +} + +void AArch64AsmBackend::relaxInstruction(const MCInst &Inst, + MCInst &Res) const { + assert(false && "AArch64AsmBackend::relaxInstruction() unimplemented"); +} + +bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + // If the count is not 4-byte aligned, we must be writing data into the text + // section (otherwise we have unaligned instructions, and thus have far + // bigger problems), so just write zeros instead. + if ((Count & 3) != 0) { + for (uint64_t i = 0, e = (Count & 3); i != e; ++i) + OW->Write8(0); } + + // We are properly aligned, so write NOPs as requested. + Count /= 4; + for (uint64_t i = 0; i != Count; ++i) + OW->Write32(0xd503201f); + return true; +} + +namespace { + +namespace CU { + +/// \brief Compact unwind encoding values. +enum CompactUnwindEncodings { + /// \brief A "frameless" leaf function, where no non-volatile registers are + /// saved. The return remains in LR throughout the function. + UNWIND_AArch64_MODE_FRAMELESS = 0x02000000, + + /// \brief No compact unwind encoding available. Instead the low 23-bits of + /// the compact unwind encoding is the offset of the DWARF FDE in the + /// __eh_frame section. This mode is never used in object files. It is only + /// generated by the linker in final linked images, which have only DWARF info + /// for a function. + UNWIND_AArch64_MODE_DWARF = 0x03000000, + + /// \brief This is a standard arm64 prologue where FP/LR are immediately + /// pushed on the stack, then SP is copied to FP. If there are any + /// non-volatile registers saved, they are copied into the stack frame in pairs + /// in a contiguous range right below the saved FP/LR pair. Any subset of the + /// five X pairs and four D pairs can be saved, but the memory layout must be + /// in register number order. + UNWIND_AArch64_MODE_FRAME = 0x04000000, + + /// \brief Frame register pair encodings.
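+ /// One bit per saved pair; generateCompactUnwindEncoding() below ORs these + /// into the encoding alongside the mode bits.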
+ UNWIND_AArch64_FRAME_X19_X20_PAIR = 0x00000001, + UNWIND_AArch64_FRAME_X21_X22_PAIR = 0x00000002, + UNWIND_AArch64_FRAME_X23_X24_PAIR = 0x00000004, + UNWIND_AArch64_FRAME_X25_X26_PAIR = 0x00000008, + UNWIND_AArch64_FRAME_X27_X28_PAIR = 0x00000010, + UNWIND_AArch64_FRAME_D8_D9_PAIR = 0x00000100, + UNWIND_AArch64_FRAME_D10_D11_PAIR = 0x00000200, + UNWIND_AArch64_FRAME_D12_D13_PAIR = 0x00000400, + UNWIND_AArch64_FRAME_D14_D15_PAIR = 0x00000800 +}; + +} // end CU namespace + +// FIXME: This should be in a separate file. +class DarwinAArch64AsmBackend : public AArch64AsmBackend { + const MCRegisterInfo &MRI; + + /// \brief Encode compact unwind stack adjustment for frameless functions. + /// See UNWIND_AArch64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. + /// The stack size always needs to be 16 byte aligned. + uint32_t encodeStackAdjustment(uint32_t StackSize) const { + return (StackSize / 16) << 12; + } + +public: + DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI) + : AArch64AsmBackend(T), MRI(MRI) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64, + MachO::CPU_SUBTYPE_ARM64_ALL); + } + + bool doesSectionRequireSymbols(const MCSection &Section) const override { + // Any section for which the linker breaks things into atoms needs to + // preserve symbols, including assembler local symbols, to identify + // those atoms. These sections are: + // Sections of type: + // + // S_CSTRING_LITERALS (e.g. __cstring) + // S_LITERAL_POINTERS (e.g. objc selector pointers) + // S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS + // + // Sections named: + // + // __TEXT,__eh_frame + // __TEXT,__ustring + // __DATA,__cfstring + // __DATA,__objc_classrefs + // __DATA,__objc_catlist + // + // FIXME: It would be better if the compiler used actual linker local + // symbols for each of these sections rather than preserving what + // are ostensibly assembler local symbols. + const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section); + return (SMO.getType() == MachO::S_CSTRING_LITERALS || + SMO.getType() == MachO::S_4BYTE_LITERALS || + SMO.getType() == MachO::S_8BYTE_LITERALS || + SMO.getType() == MachO::S_16BYTE_LITERALS || + SMO.getType() == MachO::S_LITERAL_POINTERS || + (SMO.getSegmentName() == "__TEXT" && + (SMO.getSectionName() == "__eh_frame" || + SMO.getSectionName() == "__ustring")) || + (SMO.getSegmentName() == "__DATA" && + (SMO.getSectionName() == "__cfstring" || + SMO.getSectionName() == "__objc_classrefs" || + SMO.getSectionName() == "__objc_catlist"))); } - case AArch64::fixup_a64_movw_dtprel_g2: - // R_AARCH64_TLSLD_MOVW_DTPREL_G2: Set a MOV[NZ] immediate field - // to bits FFFF 0000 0000 of DTPREL(S+A). - case AArch64::fixup_a64_movw_tprel_g2: - // R_AARCH64_TLSLE_MOVW_TPREL_G2: Set a MOV[NZ] immediate field to - // bits FFFF 0000 0000 of TPREL(S+A). - case AArch64::fixup_a64_movw_sabs_g2: { - // R_AARCH64_MOVW_SABS_G2: Sets MOV[NZ] immediate field using bits FFFF 0000 - // 0000 of S+A (see notes below); check -2^48 <= S+A < 2^48. (notes say that - // we should convert between MOVN and MOVZ to achieve our goals).
- int64_t Signed = Value; - assert(Signed >= -(1LL << 48) && Signed < (1LL << 48) - && "Out of range move wide fixup"); - if (Signed >= 0) { - Value = ((Value >> 32) & 0xffff) << 5; - // Bit 30 converts the MOVN encoding into a MOVZ - Value |= 1 << 30; - } else { - Value = ((~Value >> 32) & 0xffff) << 5; + /// \brief Generate the compact unwind encoding from the CFI directives. + uint32_t generateCompactUnwindEncoding( + ArrayRef<MCCFIInstruction> Instrs) const override { + if (Instrs.empty()) + return CU::UNWIND_AArch64_MODE_FRAMELESS; + + bool HasFP = false; + unsigned StackSize = 0; + + uint32_t CompactUnwindEncoding = 0; + for (size_t i = 0, e = Instrs.size(); i != e; ++i) { + const MCCFIInstruction &Inst = Instrs[i]; + + switch (Inst.getOperation()) { + default: + // Cannot handle this directive: bail out. + return CU::UNWIND_AArch64_MODE_DWARF; + case MCCFIInstruction::OpDefCfa: { + // Defines a frame pointer. + assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) == + AArch64::FP && + "Invalid frame pointer!"); + assert(i + 2 < e && "Insufficient CFI instructions to define a frame!"); + + const MCCFIInstruction &LRPush = Instrs[++i]; + assert(LRPush.getOperation() == MCCFIInstruction::OpOffset && + "Link register not pushed!"); + const MCCFIInstruction &FPPush = Instrs[++i]; + assert(FPPush.getOperation() == MCCFIInstruction::OpOffset && + "Frame pointer not pushed!"); + + unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true); + unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true); + + LRReg = getXRegFromWReg(LRReg); + FPReg = getXRegFromWReg(FPReg); + + assert(LRReg == AArch64::LR && FPReg == AArch64::FP && + "Pushing invalid registers for frame!"); + + // Indicate that the function has a frame. + CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAME; + HasFP = true; + break; + } + case MCCFIInstruction::OpDefCfaOffset: { + assert(StackSize == 0 && "We already have the CFA offset!"); + StackSize = std::abs(Inst.getOffset()); + break; + } + case MCCFIInstruction::OpOffset: { + // Registers are saved in pairs. We expect there to be two consecutive + // `.cfi_offset' instructions with the appropriate registers specified. + unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); + if (i + 1 == e) + return CU::UNWIND_AArch64_MODE_DWARF; + + const MCCFIInstruction &Inst2 = Instrs[++i]; + if (Inst2.getOperation() != MCCFIInstruction::OpOffset) + return CU::UNWIND_AArch64_MODE_DWARF; + unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); + + // N.B. The encodings must be in register number order, and the X + // registers before the D registers.
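+ // Worked example (illustrative): a frame that saves x19/x20 and then + // d8/d9 accumulates 0x00000001 | 0x00000100 = 0x00000101 in the pair bits.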
+ + // X19/X20 pair = 0x00000001, + // X21/X22 pair = 0x00000002, + // X23/X24 pair = 0x00000004, + // X25/X26 pair = 0x00000008, + // X27/X28 pair = 0x00000010 + Reg1 = getXRegFromWReg(Reg1); + Reg2 = getXRegFromWReg(Reg2); + + if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 && + (CompactUnwindEncoding & 0xF1E) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X19_X20_PAIR; + else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 && + (CompactUnwindEncoding & 0xF1C) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X21_X22_PAIR; + else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 && + (CompactUnwindEncoding & 0xF18) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X23_X24_PAIR; + else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 && + (CompactUnwindEncoding & 0xF10) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X25_X26_PAIR; + else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 && + (CompactUnwindEncoding & 0xF00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X27_X28_PAIR; + else { + Reg1 = getDRegFromBReg(Reg1); + Reg2 = getDRegFromBReg(Reg2); + + // D8/D9 pair = 0x00000100, + // D10/D11 pair = 0x00000200, + // D12/D13 pair = 0x00000400, + // D14/D15 pair = 0x00000800 + if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 && + (CompactUnwindEncoding & 0xE00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D8_D9_PAIR; + else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 && + (CompactUnwindEncoding & 0xC00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D10_D11_PAIR; + else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 && + (CompactUnwindEncoding & 0x800) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D12_D13_PAIR; + else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D14_D15_PAIR; + else + // A pair was pushed which we cannot handle. + return CU::UNWIND_AArch64_MODE_DWARF; + } + + break; + } + } } - return Value; + + if (!HasFP) { + // With compact unwind info we can only represent stack adjustments of up + // to 65520 bytes. + if (StackSize > 65520) + return CU::UNWIND_AArch64_MODE_DWARF; + + CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAMELESS; + CompactUnwindEncoding |= encodeStackAdjustment(StackSize); + } + + return CompactUnwindEncoding; } +}; - case AArch64::fixup_a64_tstbr: - // R_AARCH64_TSTBR14: Sets the immediate field of a TBZ/TBNZ instruction to - // bits FFFC of S+A-P, checking -2^15 <= S+A-P < 2^15. - assert((int64_t)Value >= -(1LL << 15) && - (int64_t)Value < (1LL << 15) && "Out of range TBZ/TBNZ fixup"); - return (Value & 0xfffc) << (5 - 2); - - case AArch64::fixup_a64_condbr: - // R_AARCH64_CONDBR19: Sets the immediate field of a conditional branch - // instruction to bits 1FFFFC of S+A-P, checking -2^20 <= S+A-P < 2^20. - assert((int64_t)Value >= -(1LL << 20) && - (int64_t)Value < (1LL << 20) && "Out of range B.cond fixup"); - return (Value & 0x1ffffc) << (5 - 2); - - case AArch64::fixup_a64_uncondbr: - // R_AARCH64_JUMP26 same as below (except to a linker, possibly). - case AArch64::fixup_a64_call: - // R_AARCH64_CALL26: Sets a CALL immediate field to bits FFFFFFC of S+A-P, - // checking that -2^27 <= S+A-P < 2^27. 
- assert((int64_t)Value >= -(1LL << 27) && - (int64_t)Value < (1LL << 27) && "Out of range branch fixup"); - return (Value & 0xffffffc) >> 2; - - case AArch64::fixup_a64_adr_gottprel_page: - // R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: Set an ADRP immediate field to bits - // 1FFFFF000 of Page(G(TPREL(S+A))) - Page(P); check -2^32 <= X < 2^32. - case AArch64::fixup_a64_tlsdesc_adr_page: - // R_AARCH64_TLSDESC_ADR_PAGE: Set an ADRP immediate field to bits 1FFFFF000 - // of Page(G(TLSDESC(S+A))) - Page(P); check -2^32 <= X < 2^32. - case AArch64::fixup_a64_adr_prel_got_page: - // R_AARCH64_ADR_GOT_PAGE: Sets the immediate value of an ADRP to bits - // 1FFFFF000 of the operation, checking that -2^32 < Page(G(S))-Page(GOT) < - // 2^32. - assert((int64_t)Value >= -(1LL << 32) && - (int64_t)Value < (1LL << 32) && "Out of range ADRP fixup"); - return ADRImmBits((Value & 0x1fffff000ULL) >> 12); - - case AArch64::fixup_a64_ld64_gottprel_lo12_nc: - // R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: Set an LD offset field to bits FF8 - // of X, with no overflow check. Check that X & 7 == 0. - case AArch64::fixup_a64_tlsdesc_ld64_lo12_nc: - // R_AARCH64_TLSDESC_LD64_LO12_NC: Set an LD offset field to bits FF8 of - // G(TLSDESC(S+A)), with no overflow check. Check that X & 7 == 0. - case AArch64::fixup_a64_ld64_got_lo12_nc: - // R_AARCH64_LD64_GOT_LO12_NC: Sets the LD/ST immediate field to bits FF8 of - // G(S) with no overflow check. Check X & 7 == 0 - assert(((int64_t)Value & 7) == 0 && "Misaligned fixup"); - return (Value & 0xff8) << 7; - - case AArch64::fixup_a64_tlsdesc_call: - // R_AARCH64_TLSDESC_CALL: For relaxation only. - return 0; +} // end anonymous namespace + +namespace { + +class ELFAArch64AsmBackend : public AArch64AsmBackend { +public: + uint8_t OSABI; + bool IsLittleEndian; + + ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian) + : AArch64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian); } + + void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFixup &Fixup, const MCFragment *DF, + const MCValue &Target, uint64_t &Value, + bool &IsResolved) override; + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; +}; + +void ELFAArch64AsmBackend::processFixupValue( + const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup, + const MCFragment *DF, const MCValue &Target, uint64_t &Value, + bool &IsResolved) { + // The ADRP instruction adds some multiple of 0x1000 to the current PC & + // ~0xfff. This means that the required offset to reach a symbol can vary by + // up to one step depending on where the ADRP is in memory. For example: + // + // ADRP x0, there + // there: + // + // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and + // we'll need that as an offset. At any other address "there" will be in the + // same page as the ADRP and the instruction should encode 0x0. Assuming the + // section isn't 0x1000-aligned, we therefore need to delegate this decision + // to the linker -- a relocation! 
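+  // Working through that example: with the ADRP at 0xffc, "there" sits at
+  // 0x1000 and Page(0x1000) - Page(0xffc) = 0x1000 - 0x0 = 0x1000, while the
+  // same ADRP at 0x1000 would need an immediate of 0. Only the linker knows
+  // the final placement, hence the conservative choice below.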
+  if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
+    IsResolved = false;
+}
+
+void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                      unsigned DataSize, uint64_t Value,
+                                      bool IsPCRel) const {
+  // store fixups in .eh_frame section in big endian order
+  if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) {
+    const MCSection *Sec = Fixup.getValue()->FindAssociatedSection();
+    const MCSectionELF *SecELF = static_cast<const MCSectionELF *>(Sec);
+    if (SecELF->getSectionName() == ".eh_frame")
+      Value = ByteSwap_32(unsigned(Value));
+  }
+  AArch64AsmBackend::applyFixup(Fixup, Data, DataSize, Value, IsPCRel);
+}
}

-MCAsmBackend *
-llvm::createAArch64leAsmBackend(const Target &T, const MCRegisterInfo &MRI,
-                                StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
+                                              const MCRegisterInfo &MRI,
+                                              StringRef TT, StringRef CPU) {
   Triple TheTriple(TT);
-  return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS(), /*isLittle*/ true);
+
+  if (TheTriple.isOSDarwin())
+    return new DarwinAArch64AsmBackend(T, MRI);
+
+  assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
+  return new ELFAArch64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/true);
 }

-MCAsmBackend *
-llvm::createAArch64beAsmBackend(const Target &T, const MCRegisterInfo &MRI,
-                                StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T,
+                                              const MCRegisterInfo &MRI,
+                                              StringRef TT, StringRef CPU) {
   Triple TheTriple(TT);
-  return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS(), /*isLittle*/ false);
+
+  assert(TheTriple.isOSBinFormatELF() &&
+         "Big endian is only supported for ELF targets!");
+  return new ELFAArch64AsmBackend(T, TheTriple.getOS(),
+                                  /*IsLittleEndian=*/false);
 }
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index a5fe914..e05191e 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//

 #include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCValue.h"
@@ -35,257 +36,222 @@ private:
 };
 }

-AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian)
-  : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
-                            /*HasRelocationAddend*/ true)
-{}
+AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI,
+                                               bool IsLittleEndian)
+    : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
+                              /*HasRelocationAddend*/ true) {}

-AArch64ELFObjectWriter::~AArch64ELFObjectWriter()
-{}
+AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {}

 unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
-                                              const MCFixup &Fixup,
-                                              bool IsPCRel) const {
-  unsigned Type;
+                                              const MCFixup &Fixup,
+                                              bool IsPCRel) const {
+  AArch64MCExpr::VariantKind RefKind =
+      static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
+  AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind);
+  bool IsNC = AArch64MCExpr::isNotChecked(RefKind);
+
+  assert((!Target.getSymA() ||
+          Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) &&
+         "Should only be expression-level modifiers here");
+
+  assert((!Target.getSymB() ||
+          Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) &&
+         "Should only be expression-level
modifiers here"); + if (IsPCRel) { switch ((unsigned)Fixup.getKind()) { - default: - llvm_unreachable("Unimplemented fixup -> relocation"); - case FK_Data_8: - return ELF::R_AARCH64_PREL64; - case FK_Data_4: - return ELF::R_AARCH64_PREL32; case FK_Data_2: return ELF::R_AARCH64_PREL16; - case AArch64::fixup_a64_ld_prel: - Type = ELF::R_AARCH64_LD_PREL_LO19; - break; - case AArch64::fixup_a64_adr_prel: - Type = ELF::R_AARCH64_ADR_PREL_LO21; - break; - case AArch64::fixup_a64_adr_prel_page: - Type = ELF::R_AARCH64_ADR_PREL_PG_HI21; - break; - case AArch64::fixup_a64_adr_prel_got_page: - Type = ELF::R_AARCH64_ADR_GOT_PAGE; - break; - case AArch64::fixup_a64_tstbr: - Type = ELF::R_AARCH64_TSTBR14; - break; - case AArch64::fixup_a64_condbr: - Type = ELF::R_AARCH64_CONDBR19; - break; - case AArch64::fixup_a64_uncondbr: - Type = ELF::R_AARCH64_JUMP26; - break; - case AArch64::fixup_a64_call: - Type = ELF::R_AARCH64_CALL26; - break; - case AArch64::fixup_a64_adr_gottprel_page: - Type = ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; - break; - case AArch64::fixup_a64_ld_gottprel_prel19: - Type = ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19; - break; - case AArch64::fixup_a64_tlsdesc_adr_page: - Type = ELF::R_AARCH64_TLSDESC_ADR_PAGE; - break; + case FK_Data_4: + return ELF::R_AARCH64_PREL32; + case FK_Data_8: + return ELF::R_AARCH64_PREL64; + case AArch64::fixup_aarch64_pcrel_adr_imm21: + assert(SymLoc == AArch64MCExpr::VK_NONE && "unexpected ADR relocation"); + return ELF::R_AARCH64_ADR_PREL_LO21; + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + if (SymLoc == AArch64MCExpr::VK_ABS && !IsNC) + return ELF::R_AARCH64_ADR_PREL_PG_HI21; + if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC) + return ELF::R_AARCH64_ADR_GOT_PAGE; + if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC) + return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; + if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) + return ELF::R_AARCH64_TLSDESC_ADR_PAGE; + llvm_unreachable("invalid symbol kind for ADRP relocation"); + case AArch64::fixup_aarch64_pcrel_branch26: + return ELF::R_AARCH64_JUMP26; + case AArch64::fixup_aarch64_pcrel_call26: + return ELF::R_AARCH64_CALL26; + case AArch64::fixup_aarch64_ldr_pcrel_imm19: + if (SymLoc == AArch64MCExpr::VK_GOTTPREL) + return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19; + return ELF::R_AARCH64_LD_PREL_LO19; + case AArch64::fixup_aarch64_pcrel_branch14: + return ELF::R_AARCH64_TSTBR14; + case AArch64::fixup_aarch64_pcrel_branch19: + return ELF::R_AARCH64_CONDBR19; + default: + llvm_unreachable("Unsupported pc-relative fixup kind"); } } else { switch ((unsigned)Fixup.getKind()) { - default: - llvm_unreachable("Unimplemented fixup -> relocation"); - case FK_Data_8: - return ELF::R_AARCH64_ABS64; - case FK_Data_4: - return ELF::R_AARCH64_ABS32; case FK_Data_2: return ELF::R_AARCH64_ABS16; - case AArch64::fixup_a64_add_lo12: - Type = ELF::R_AARCH64_ADD_ABS_LO12_NC; - break; - case AArch64::fixup_a64_ld64_got_lo12_nc: - Type = ELF::R_AARCH64_LD64_GOT_LO12_NC; - break; - case AArch64::fixup_a64_ldst8_lo12: - Type = ELF::R_AARCH64_LDST8_ABS_LO12_NC; - break; - case AArch64::fixup_a64_ldst16_lo12: - Type = ELF::R_AARCH64_LDST16_ABS_LO12_NC; - break; - case AArch64::fixup_a64_ldst32_lo12: - Type = ELF::R_AARCH64_LDST32_ABS_LO12_NC; - break; - case AArch64::fixup_a64_ldst64_lo12: - Type = ELF::R_AARCH64_LDST64_ABS_LO12_NC; - break; - case AArch64::fixup_a64_ldst128_lo12: - Type = ELF::R_AARCH64_LDST128_ABS_LO12_NC; - break; - case AArch64::fixup_a64_movw_uabs_g0: - Type = ELF::R_AARCH64_MOVW_UABS_G0; - break; - case 
AArch64::fixup_a64_movw_uabs_g0_nc: - Type = ELF::R_AARCH64_MOVW_UABS_G0_NC; - break; - case AArch64::fixup_a64_movw_uabs_g1: - Type = ELF::R_AARCH64_MOVW_UABS_G1; - break; - case AArch64::fixup_a64_movw_uabs_g1_nc: - Type = ELF::R_AARCH64_MOVW_UABS_G1_NC; - break; - case AArch64::fixup_a64_movw_uabs_g2: - Type = ELF::R_AARCH64_MOVW_UABS_G2; - break; - case AArch64::fixup_a64_movw_uabs_g2_nc: - Type = ELF::R_AARCH64_MOVW_UABS_G2_NC; - break; - case AArch64::fixup_a64_movw_uabs_g3: - Type = ELF::R_AARCH64_MOVW_UABS_G3; - break; - case AArch64::fixup_a64_movw_sabs_g0: - Type = ELF::R_AARCH64_MOVW_SABS_G0; - break; - case AArch64::fixup_a64_movw_sabs_g1: - Type = ELF::R_AARCH64_MOVW_SABS_G1; - break; - case AArch64::fixup_a64_movw_sabs_g2: - Type = ELF::R_AARCH64_MOVW_SABS_G2; - break; + case FK_Data_4: + return ELF::R_AARCH64_ABS32; + case FK_Data_8: + return ELF::R_AARCH64_ABS64; + case AArch64::fixup_aarch64_add_imm12: + if (RefKind == AArch64MCExpr::VK_DTPREL_HI12) + return ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12; + if (RefKind == AArch64MCExpr::VK_TPREL_HI12) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12; + if (RefKind == AArch64MCExpr::VK_DTPREL_LO12_NC) + return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC; + if (RefKind == AArch64MCExpr::VK_DTPREL_LO12) + return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12; + if (RefKind == AArch64MCExpr::VK_TPREL_LO12_NC) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC; + if (RefKind == AArch64MCExpr::VK_TPREL_LO12) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12; + if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12) + return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_ADD_ABS_LO12_NC; - // TLS Local-dynamic block - case AArch64::fixup_a64_movw_dtprel_g2: - Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; - break; - case AArch64::fixup_a64_movw_dtprel_g1: - Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1; - break; - case AArch64::fixup_a64_movw_dtprel_g1_nc: - Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC; - break; - case AArch64::fixup_a64_movw_dtprel_g0: - Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0; - break; - case AArch64::fixup_a64_movw_dtprel_g0_nc: - Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC; - break; - case AArch64::fixup_a64_add_dtprel_hi12: - Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12; - break; - case AArch64::fixup_a64_add_dtprel_lo12: - Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12; - break; - case AArch64::fixup_a64_add_dtprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst8_dtprel_lo12: - Type = ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12; - break; - case AArch64::fixup_a64_ldst8_dtprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst16_dtprel_lo12: - Type = ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12; - break; - case AArch64::fixup_a64_ldst16_dtprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst32_dtprel_lo12: - Type = ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12; - break; - case AArch64::fixup_a64_ldst32_dtprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst64_dtprel_lo12: - Type = ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12; - break; - case AArch64::fixup_a64_ldst64_dtprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC; - break; + report_fatal_error("invalid fixup for add (uimm12) instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale1: + if (SymLoc == AArch64MCExpr::VK_ABS && 
IsNC) + return ELF::R_AARCH64_LDST8_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; - // TLS initial-exec block - case AArch64::fixup_a64_movw_gottprel_g1: - Type = ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; - break; - case AArch64::fixup_a64_movw_gottprel_g0_nc: - Type = ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; - break; - case AArch64::fixup_a64_ld64_gottprel_lo12_nc: - Type = ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; - break; + report_fatal_error("invalid fixup for 8-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale2: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST16_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; - // TLS local-exec block - case AArch64::fixup_a64_movw_tprel_g2: - Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2; - break; - case AArch64::fixup_a64_movw_tprel_g1: - Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1; - break; - case AArch64::fixup_a64_movw_tprel_g1_nc: - Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC; - break; - case AArch64::fixup_a64_movw_tprel_g0: - Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0; - break; - case AArch64::fixup_a64_movw_tprel_g0_nc: - Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC; - break; - case AArch64::fixup_a64_add_tprel_hi12: - Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12; - break; - case AArch64::fixup_a64_add_tprel_lo12: - Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12; - break; - case AArch64::fixup_a64_add_tprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst8_tprel_lo12: - Type = ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12; - break; - case AArch64::fixup_a64_ldst8_tprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst16_tprel_lo12: - Type = ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12; - break; - case AArch64::fixup_a64_ldst16_tprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst32_tprel_lo12: - Type = ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12; - break; - case AArch64::fixup_a64_ldst32_tprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; - break; - case AArch64::fixup_a64_ldst64_tprel_lo12: - Type = ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12; - break; - case AArch64::fixup_a64_ldst64_tprel_lo12_nc: - Type = ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC; - break; + report_fatal_error("invalid fixup for 16-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale4: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST32_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && 
!IsNC) + return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; - // TLS general-dynamic block - case AArch64::fixup_a64_tlsdesc_adr_page: - Type = ELF::R_AARCH64_TLSDESC_ADR_PAGE; - break; - case AArch64::fixup_a64_tlsdesc_ld64_lo12_nc: - Type = ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; - break; - case AArch64::fixup_a64_tlsdesc_add_lo12_nc: - Type = ELF::R_AARCH64_TLSDESC_ADD_LO12_NC; - break; - case AArch64::fixup_a64_tlsdesc_call: - Type = ELF::R_AARCH64_TLSDESC_CALL; - break; + report_fatal_error("invalid fixup for 32-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale8: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST64_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) + return ELF::R_AARCH64_LD64_GOT_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC) + return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC) + return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; + + report_fatal_error("invalid fixup for 64-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale16: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST128_ABS_LO12_NC; + + report_fatal_error("invalid fixup for 128-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_movw: + if (RefKind == AArch64MCExpr::VK_ABS_G3) + return ELF::R_AARCH64_MOVW_UABS_G3; + if (RefKind == AArch64MCExpr::VK_ABS_G2) + return ELF::R_AARCH64_MOVW_UABS_G2; + if (RefKind == AArch64MCExpr::VK_ABS_G2_S) + return ELF::R_AARCH64_MOVW_SABS_G2; + if (RefKind == AArch64MCExpr::VK_ABS_G2_NC) + return ELF::R_AARCH64_MOVW_UABS_G2_NC; + if (RefKind == AArch64MCExpr::VK_ABS_G1) + return ELF::R_AARCH64_MOVW_UABS_G1; + if (RefKind == AArch64MCExpr::VK_ABS_G1_S) + return ELF::R_AARCH64_MOVW_SABS_G1; + if (RefKind == AArch64MCExpr::VK_ABS_G1_NC) + return ELF::R_AARCH64_MOVW_UABS_G1_NC; + if (RefKind == AArch64MCExpr::VK_ABS_G0) + return ELF::R_AARCH64_MOVW_UABS_G0; + if (RefKind == AArch64MCExpr::VK_ABS_G0_S) + return ELF::R_AARCH64_MOVW_SABS_G0; + if (RefKind == AArch64MCExpr::VK_ABS_G0_NC) + return ELF::R_AARCH64_MOVW_UABS_G0_NC; + if (RefKind == AArch64MCExpr::VK_DTPREL_G2) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; + if (RefKind == AArch64MCExpr::VK_DTPREL_G1) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1; + if (RefKind == AArch64MCExpr::VK_DTPREL_G1_NC) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC; + if (RefKind == AArch64MCExpr::VK_DTPREL_G0) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0; + if (RefKind == AArch64MCExpr::VK_DTPREL_G0_NC) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC; + if (RefKind == AArch64MCExpr::VK_TPREL_G2) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2; + if (RefKind == AArch64MCExpr::VK_TPREL_G1) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1; + if (RefKind == AArch64MCExpr::VK_TPREL_G1_NC) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC; + if (RefKind == AArch64MCExpr::VK_TPREL_G0) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0; + if (RefKind == 
AArch64MCExpr::VK_TPREL_G0_NC) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC; + if (RefKind == AArch64MCExpr::VK_GOTTPREL_G1) + return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; + if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC) + return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; + report_fatal_error("invalid fixup for movz/movk instruction"); + return 0; + case AArch64::fixup_aarch64_tlsdesc_call: + return ELF::R_AARCH64_TLSDESC_CALL; + default: + llvm_unreachable("Unknown ELF relocation type"); } } - return Type; + llvm_unreachable("Unimplemented fixup -> relocation"); } MCObjectWriter *llvm::createAArch64ELFObjectWriter(raw_ostream &OS, - uint8_t OSABI, - bool IsLittleEndian) { - MCELFObjectTargetWriter *MOTW = new AArch64ELFObjectWriter(OSABI, IsLittleEndian); - return createELFObjectWriter(MOTW, OS, IsLittleEndian); + uint8_t OSABI, + bool IsLittleEndian) { + MCELFObjectTargetWriter *MOTW = + new AArch64ELFObjectWriter(OSABI, IsLittleEndian); + return createELFObjectWriter(MOTW, OS, IsLittleEndian); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 473b7dd..a79406d 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -56,14 +56,14 @@ namespace { class AArch64ELFStreamer : public MCELFStreamer { public: AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, - MCCodeEmitter *Emitter) + MCCodeEmitter *Emitter) : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0), LastEMS(EMS_None) {} ~AArch64ELFStreamer() {} - virtual void ChangeSection(const MCSection *Section, - const MCExpr *Subsection) { + void ChangeSection(const MCSection *Section, + const MCExpr *Subsection) override { // We have to keep track of the mapping symbol state of any sections we // use. Each one should start off as EMS_None, which is provided as the // default constructor by DenseMap::lookup. @@ -76,7 +76,8 @@ public: /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to add the appropriate mapping symbol if /// necessary. - virtual void EmitInstruction(const MCInst& Inst, const MCSubtargetInfo &STI) { + void EmitInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI) override { EmitA64MappingSymbol(); MCELFStreamer::EmitInstruction(Inst, STI); } @@ -84,7 +85,7 @@ public: /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. - virtual void EmitBytes(StringRef Data) { + void EmitBytes(StringRef Data) override { EmitDataMappingSymbol(); MCELFStreamer::EmitBytes(Data); } @@ -92,7 +93,8 @@ public: /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. 
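+  /// (The "$x" and "$d" mapping symbols let ELF tools such as disassemblers
+  /// tell AArch64 code apart from inline data; see EmitMappingSymbol below.)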
- virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) { + void EmitValueImpl(const MCExpr *Value, unsigned Size, + const SMLoc &Loc) override { EmitDataMappingSymbol(); MCELFStreamer::EmitValueImpl(Value, Size); } @@ -105,13 +107,15 @@ private: }; void EmitDataMappingSymbol() { - if (LastEMS == EMS_Data) return; + if (LastEMS == EMS_Data) + return; EmitMappingSymbol("$d"); LastEMS = EMS_Data; } void EmitA64MappingSymbol() { - if (LastEMS == EMS_A64) return; + if (LastEMS == EMS_A64) + return; EmitMappingSymbol("$x"); LastEMS = EMS_A64; } @@ -120,15 +124,14 @@ private: MCSymbol *Start = getContext().CreateTempSymbol(); EmitLabel(Start); - MCSymbol *Symbol = - getContext().GetOrCreateSymbol(Name + "." + - Twine(MappingSymbolCounter++)); + MCSymbol *Symbol = getContext().GetOrCreateSymbol( + Name + "." + Twine(MappingSymbolCounter++)); MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); MCELF::SetType(SD, ELF::STT_NOTYPE); MCELF::SetBinding(SD, ELF::STB_LOCAL); SD.setExternal(false); - AssignSection(Symbol, getCurrentSection().first); + Symbol->setSection(*getCurrentSection().first); const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext()); Symbol->setVariableValue(Value); @@ -144,16 +147,14 @@ private: } namespace llvm { - MCELFStreamer* createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - bool RelaxAll, bool NoExecStack) { - AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter); - if (RelaxAll) - S->getAssembler().setRelaxAll(true); - if (NoExecStack) - S->getAssembler().setNoExecStack(true); - return S; - } +MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + bool RelaxAll, bool NoExecStack) { + AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter); + if (RelaxAll) + S->getAssembler().setRelaxAll(true); + if (NoExecStack) + S->getAssembler().setNoExecStack(true); + return S; +} } - - diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h index 5a89ca5..bc6973b 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h @@ -18,10 +18,9 @@ namespace llvm { - MCELFStreamer* createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, - MCCodeEmitter *Emitter, - bool RelaxAll, bool NoExecStack); +MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + bool RelaxAll, bool NoExecStack); } #endif // AArch64_ELF_STREAMER_H diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h index eeb122d..bf405fb 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h @@ -1,4 +1,4 @@ -//=- AArch64/AArch64FixupKinds.h - AArch64 Specific Fixup Entries -*- C++ -*-=// +//===-- AArch64FixupKinds.h - AArch64 Specific Fixup Entries ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -6,108 +6,71 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file describes the LLVM fixups applied to MCInsts in the AArch64 -// backend. 
-// -//===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64_AARCH64FIXUPKINDS_H -#define LLVM_AARCH64_AARCH64FIXUPKINDS_H +#ifndef LLVM_AArch64FIXUPKINDS_H +#define LLVM_AArch64FIXUPKINDS_H #include "llvm/MC/MCFixup.h" namespace llvm { - namespace AArch64 { - enum Fixups { - fixup_a64_ld_prel = FirstTargetFixupKind, - fixup_a64_adr_prel, - fixup_a64_adr_prel_page, - - fixup_a64_add_lo12, - - fixup_a64_ldst8_lo12, - fixup_a64_ldst16_lo12, - fixup_a64_ldst32_lo12, - fixup_a64_ldst64_lo12, - fixup_a64_ldst128_lo12, - - fixup_a64_tstbr, - fixup_a64_condbr, - fixup_a64_uncondbr, - fixup_a64_call, - - fixup_a64_movw_uabs_g0, - fixup_a64_movw_uabs_g0_nc, - fixup_a64_movw_uabs_g1, - fixup_a64_movw_uabs_g1_nc, - fixup_a64_movw_uabs_g2, - fixup_a64_movw_uabs_g2_nc, - fixup_a64_movw_uabs_g3, - - fixup_a64_movw_sabs_g0, - fixup_a64_movw_sabs_g1, - fixup_a64_movw_sabs_g2, - - fixup_a64_adr_prel_got_page, - fixup_a64_ld64_got_lo12_nc, - - // Produce offsets relative to the module's dynamic TLS area. - fixup_a64_movw_dtprel_g2, - fixup_a64_movw_dtprel_g1, - fixup_a64_movw_dtprel_g1_nc, - fixup_a64_movw_dtprel_g0, - fixup_a64_movw_dtprel_g0_nc, - fixup_a64_add_dtprel_hi12, - fixup_a64_add_dtprel_lo12, - fixup_a64_add_dtprel_lo12_nc, - fixup_a64_ldst8_dtprel_lo12, - fixup_a64_ldst8_dtprel_lo12_nc, - fixup_a64_ldst16_dtprel_lo12, - fixup_a64_ldst16_dtprel_lo12_nc, - fixup_a64_ldst32_dtprel_lo12, - fixup_a64_ldst32_dtprel_lo12_nc, - fixup_a64_ldst64_dtprel_lo12, - fixup_a64_ldst64_dtprel_lo12_nc, - - // Produce the GOT entry containing a variable's address in TLS's - // initial-exec mode. - fixup_a64_movw_gottprel_g1, - fixup_a64_movw_gottprel_g0_nc, - fixup_a64_adr_gottprel_page, - fixup_a64_ld64_gottprel_lo12_nc, - fixup_a64_ld_gottprel_prel19, - - // Produce offsets relative to the thread pointer: TPIDR_EL0. - fixup_a64_movw_tprel_g2, - fixup_a64_movw_tprel_g1, - fixup_a64_movw_tprel_g1_nc, - fixup_a64_movw_tprel_g0, - fixup_a64_movw_tprel_g0_nc, - fixup_a64_add_tprel_hi12, - fixup_a64_add_tprel_lo12, - fixup_a64_add_tprel_lo12_nc, - fixup_a64_ldst8_tprel_lo12, - fixup_a64_ldst8_tprel_lo12_nc, - fixup_a64_ldst16_tprel_lo12, - fixup_a64_ldst16_tprel_lo12_nc, - fixup_a64_ldst32_tprel_lo12, - fixup_a64_ldst32_tprel_lo12_nc, - fixup_a64_ldst64_tprel_lo12, - fixup_a64_ldst64_tprel_lo12_nc, - - // Produce the special fixups used by the general-dynamic TLS model. - fixup_a64_tlsdesc_adr_page, - fixup_a64_tlsdesc_ld64_lo12_nc, - fixup_a64_tlsdesc_add_lo12_nc, - fixup_a64_tlsdesc_call, - - - // Marker - LastTargetFixupKind, - NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind - }; - } -} +namespace AArch64 { + +enum Fixups { + // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into + // an ADR instruction. + fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind, + + // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into + // an ADRP instruction. + fixup_aarch64_pcrel_adrp_imm21, + + // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions. + // No alignment adjustment. All value bits are encoded. + fixup_aarch64_add_imm12, + + // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and + // store instructions. + fixup_aarch64_ldst_imm12_scale1, + fixup_aarch64_ldst_imm12_scale2, + fixup_aarch64_ldst_imm12_scale4, + fixup_aarch64_ldst_imm12_scale8, + fixup_aarch64_ldst_imm12_scale16, + + // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative + // immediate. 
Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by
+  // pc-relative loads and generates relocations directly when necessary.
+  fixup_aarch64_ldr_pcrel_imm19,
+
+  // FIXME: comment
+  fixup_aarch64_movw,
+
+  // fixup_aarch64_pcrel_branch14 - The high 14 bits of a 16-bit pc-relative
+  // immediate.
+  fixup_aarch64_pcrel_branch14,
+
+  // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative
+  // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is
+  // used by b.cc and generates relocations directly when necessary.
+  fixup_aarch64_pcrel_branch19,
+
+  // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative
+  // immediate.
+  fixup_aarch64_pcrel_branch26,
+
+  // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative
+  // immediate. Distinguished from branch26 only on ELF.
+  fixup_aarch64_pcrel_call26,
+
+  // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF
+  // R_AARCH64_TLSDESC_CALL relocation.
+  fixup_aarch64_tlsdesc_call,
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+
+} // end namespace AArch64
+} // end namespace llvm

 #endif
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index b090a55..dc4a8bf 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -13,26 +13,82 @@

 #include "AArch64MCAsmInfo.h"
 #include "llvm/ADT/Triple.h"
-
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
 using namespace llvm;

-AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo(StringRef TT) {
-  Triple TheTriple(TT);
-  if (TheTriple.getArch() == Triple::aarch64_be)
+enum AsmWriterVariantTy {
+  Default = -1,
+  Generic = 0,
+  Apple = 1
+};
+
+static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
+    "aarch64-neon-syntax", cl::init(Default),
+    cl::desc("Choose style of NEON code to emit from AArch64 backend:"),
+    cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"),
+               clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"),
+               clEnumValEnd));
+
+AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
+  // We prefer NEON instructions to be printed in the short form.
+  AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
+
+  PrivateGlobalPrefix = "L";
+  SeparatorString = "%%";
+  CommentString = ";";
+  PointerSize = CalleeSaveStackSlotSize = 8;
+
+  AlignmentIsInBytes = false;
+  UsesELFSectionDirectiveForBSS = true;
+  SupportsDebugInformation = true;
+  UseDataRegionDirectives = true;
+
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+}
+
+const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
+    const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const {
+  // On Darwin, we can reference dwarf symbols with foo@GOT-., which
+  // is an indirect pc-relative reference. The default implementation
+  // won't reference using the GOT, so we need this target-specific
+  // version.
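+  // In effect the expression built below is "Sym@GOT - Ltmp", where Ltmp is
+  // a temporary label emitted at the current position.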
+ MCContext &Context = Streamer.getContext(); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context); + MCSymbol *PCSym = Context.CreateTempSymbol(); + Streamer.EmitLabel(PCSym); + const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context); + return MCBinaryExpr::CreateSub(Res, PC, Context); +} + +AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) { + Triple T(TT); + if (T.getArch() == Triple::arm64_be || T.getArch() == Triple::aarch64_be) IsLittleEndian = false; + // We prefer NEON instructions to be printed in the short form. + AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant; + PointerSize = 8; // ".comm align is in bytes but .align is pow-2." AlignmentIsInBytes = false; CommentString = "//"; + PrivateGlobalPrefix = ".L"; Code32Directive = ".code\t32"; Data16bitsDirective = "\t.hword\t"; Data32bitsDirective = "\t.word\t"; Data64bitsDirective = "\t.xword\t"; + UseDataRegionDirectives = false; + + WeakRefDirective = "\t.weak\t"; + HasLEB128 = true; SupportsDebugInformation = true; @@ -41,6 +97,3 @@ AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo(StringRef TT) { UseIntegratedAssembler = true; } - -// Pin the vtable to this file. -void AArch64ELFMCAsmInfo::anchor() {} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index 43c0e47..42a031d 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -1,4 +1,4 @@ -//==-- AArch64MCAsmInfo.h - AArch64 asm properties -------------*- C++ -*--===// +//=====-- AArch64MCAsmInfo.h - AArch64 asm properties ---------*- C++ -*--====// // // The LLVM Compiler Infrastructure // @@ -11,17 +11,24 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64TARGETASMINFO_H -#define LLVM_AARCH64TARGETASMINFO_H +#ifndef AArch64TARGETASMINFO_H +#define AArch64TARGETASMINFO_H -#include "llvm/MC/MCAsmInfoELF.h" +#include "llvm/MC/MCAsmInfoDarwin.h" namespace llvm { +class Target; +class StringRef; +class MCStreamer; +struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin { + explicit AArch64MCAsmInfoDarwin(); + const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, + MCStreamer &Streamer) const override; +}; -struct AArch64ELFMCAsmInfo : public MCAsmInfoELF { - explicit AArch64ELFMCAsmInfo(StringRef TT); -private: - virtual void anchor(); +struct AArch64MCAsmInfoELF : public MCAsmInfo { + explicit AArch64MCAsmInfoELF(StringRef TT); }; } // namespace llvm diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index b9a61ef..464a18c 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -1,4 +1,4 @@ -//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code =// +//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code-=// // // The LLVM Compiler Infrastructure // @@ -11,10 +11,9 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mccodeemitter" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" -#include "MCTargetDesc/AArch64MCTargetDesc.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" @@ -22,524 +21,562 @@ #include 
"llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Support/raw_ostream.h" - using namespace llvm; +#define DEBUG_TYPE "mccodeemitter" + +STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); +STATISTIC(MCNumFixups, "Number of MC fixups created."); + namespace { + class AArch64MCCodeEmitter : public MCCodeEmitter { - AArch64MCCodeEmitter(const AArch64MCCodeEmitter &) LLVM_DELETED_FUNCTION; - void operator=(const AArch64MCCodeEmitter &) LLVM_DELETED_FUNCTION; MCContext &Ctx; + AArch64MCCodeEmitter(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT + void operator=(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT public: - AArch64MCCodeEmitter(MCContext &ctx) : Ctx(ctx) {} + AArch64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti, + MCContext &ctx) + : Ctx(ctx) {} ~AArch64MCCodeEmitter() {} - unsigned getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + // getBinaryCodeForInstr - TableGen'erated function for getting the + // binary encoding for an instruction. + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. + unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx, + /// getLdStUImm12OpValue - Return encoding info for 12-bit unsigned immediate + /// attached to a load, store or prfm instruction. If operand requires a + /// relocation, record it and return zero in that part of the encoding. + template + uint32_t getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label + /// target. + uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and + /// the 2-bit shift field. + uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; - template - unsigned getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return getOffsetUImm12OpValue(MI, OpIdx, Fixups, STI, MemSize); - } + /// getCondBranchTargetOpValue - Return the encoded value for a conditional + /// branch target. + uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI, - int MemSize) const; + /// getLoadLiteralOpValue - Return the encoded value for a load-literal + /// pc-relative address. 
+ uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + /// getMemExtendOpValue - Return the encoded value for a reg-extend load/store + /// instruction: bit 0 is whether a shift is present, bit 1 is whether the + /// operation is a sign extend (as opposed to a zero extend). + uint32_t getMemExtendOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getShiftRightImm8(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getShiftRightImm16(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getShiftRightImm32(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getShiftRightImm64(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- + /// branch target. + uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getShiftLeftImm8(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getShiftLeftImm16(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getShiftLeftImm32(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - unsigned getShiftLeftImm64(const MCInst &MI, unsigned Op, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + /// getBranchTargetOpValue - Return the encoded value for an unconditional + /// branch target. + uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - // Labels are handled mostly the same way: a symbol is needed, and - // just gets some fixup attached. - template - unsigned getLabelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + /// getMoveWideImmOpValue - Return the encoded value for the immediate operand + /// of a MOVZ or MOVK instruction. + uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getLoadLitLabelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + /// getVecShifterOpValue - Return the encoded value for the vector shifter. + uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMoveVecShifterOpValue - Return the encoded value for the vector move + /// shifter (MSL). + uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + /// getFixedPointScaleOpValue - Return the encoded value for the + // FP-to-fixed-point scale factor. 
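+  // (i.e. the #fbits operand of fixed-point conversions such as FCVTZS.)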
+ uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, + uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + /// getSIMDShift64OpValue - Return the encoded value for the + // shift-by-immediate AdvSIMD instructions. + uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - unsigned getAddressWithFixup(const MCOperand &MO, - unsigned FixupKind, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - + uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; - // getBinaryCodeForInstr - TableGen'erated function for getting the - // binary encoding for an instruction. - uint64_t getBinaryCodeForInstr(const MCInst &MI, + uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; - /// getMachineOpValue - Return binary encoding of operand. If the machine - /// operand requires relocation, record the relocation and return zero. - unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; - void EmitByte(unsigned char C, raw_ostream &OS) const { - OS << (char)C; - } + void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; } - void EmitInstruction(uint32_t Val, raw_ostream &OS) const { + void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const { // Output the constant in little endian byte order. 
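+    // For example, EmitConstant(0x12345678, 4, OS) emits the bytes
+    // 0x78, 0x56, 0x34, 0x12 in that order.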
- for (unsigned i = 0; i != 4; ++i) { - EmitByte(Val & 0xff, OS); + for (unsigned i = 0; i != Size; ++i) { + EmitByte(Val & 255, OS); Val >>= 8; } } - void EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - template unsigned - fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; - - unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI) const override; unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const; + template unsigned + fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; + unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; }; } // end anonymous namespace -unsigned AArch64MCCodeEmitter::getAddressWithFixup(const MCOperand &MO, - unsigned FixupKind, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (!MO.isExpr()) { - // This can occur for manually decoded or constructed MCInsts, but neither - // the assembly-parser nor instruction selection will currently produce an - // MCInst that's not a symbol reference. - assert(MO.isImm() && "Unexpected address requested"); - return MO.getImm(); - } +MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new AArch64MCCodeEmitter(MCII, STI, Ctx); +} - const MCExpr *Expr = MO.getExpr(); - MCFixupKind Kind = MCFixupKind(FixupKind); - Fixups.push_back(MCFixup::Create(0, Expr, Kind)); +/// getMachineOpValue - Return binary encoding of operand. If the machine +/// operand requires relocation, record the relocation and return zero. 
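+/// (In this backend a register operand yields its encoding index and an
+/// immediate is returned as-is; expression operands are handled by the
+/// dedicated getters, which record a fixup and return zero.)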
+unsigned +AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) + return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); + else { + assert(MO.isImm() && "did not expect relocated expression"); + return static_cast(MO.getImm()); + } + assert(0 && "Unable to encode MCOperand!"); return 0; } -unsigned AArch64MCCodeEmitter:: -getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI, - int MemSize) const { - const MCOperand &ImmOp = MI.getOperand(OpIdx); - if (ImmOp.isImm()) - return ImmOp.getImm(); - - assert(ImmOp.isExpr() && "Unexpected operand type"); - const AArch64MCExpr *Expr = cast(ImmOp.getExpr()); - unsigned FixupKind; - - - switch (Expr->getKind()) { - default: llvm_unreachable("Unexpected operand modifier"); - case AArch64MCExpr::VK_AARCH64_LO12: { - static const unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_lo12, - AArch64::fixup_a64_ldst16_lo12, - AArch64::fixup_a64_ldst32_lo12, - AArch64::fixup_a64_ldst64_lo12, - AArch64::fixup_a64_ldst128_lo12 }; - assert(MemSize <= 16 && "Invalid fixup for operation"); - FixupKind = FixupsBySize[Log2_32(MemSize)]; - break; - } - case AArch64MCExpr::VK_AARCH64_GOT_LO12: - assert(MemSize == 8 && "Invalid fixup for operation"); - FixupKind = AArch64::fixup_a64_ld64_got_lo12_nc; - break; - case AArch64MCExpr::VK_AARCH64_DTPREL_LO12: { - static const unsigned FixupsBySize[] = { - AArch64::fixup_a64_ldst8_dtprel_lo12, - AArch64::fixup_a64_ldst16_dtprel_lo12, - AArch64::fixup_a64_ldst32_dtprel_lo12, - AArch64::fixup_a64_ldst64_dtprel_lo12 - }; - assert(MemSize <= 8 && "Invalid fixup for operation"); - FixupKind = FixupsBySize[Log2_32(MemSize)]; - break; - } - case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC: { - static const unsigned FixupsBySize[] = { - AArch64::fixup_a64_ldst8_dtprel_lo12_nc, - AArch64::fixup_a64_ldst16_dtprel_lo12_nc, - AArch64::fixup_a64_ldst32_dtprel_lo12_nc, - AArch64::fixup_a64_ldst64_dtprel_lo12_nc - }; - assert(MemSize <= 8 && "Invalid fixup for operation"); - FixupKind = FixupsBySize[Log2_32(MemSize)]; - break; - } - case AArch64MCExpr::VK_AARCH64_GOTTPREL_LO12: - assert(MemSize == 8 && "Invalid fixup for operation"); - FixupKind = AArch64::fixup_a64_ld64_gottprel_lo12_nc; - break; - case AArch64MCExpr::VK_AARCH64_TPREL_LO12:{ - static const unsigned FixupsBySize[] = { - AArch64::fixup_a64_ldst8_tprel_lo12, - AArch64::fixup_a64_ldst16_tprel_lo12, - AArch64::fixup_a64_ldst32_tprel_lo12, - AArch64::fixup_a64_ldst64_tprel_lo12 - }; - assert(MemSize <= 8 && "Invalid fixup for operation"); - FixupKind = FixupsBySize[Log2_32(MemSize)]; - break; - } - case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC: { - static const unsigned FixupsBySize[] = { - AArch64::fixup_a64_ldst8_tprel_lo12_nc, - AArch64::fixup_a64_ldst16_tprel_lo12_nc, - AArch64::fixup_a64_ldst32_tprel_lo12_nc, - AArch64::fixup_a64_ldst64_tprel_lo12_nc - }; - assert(MemSize <= 8 && "Invalid fixup for operation"); - FixupKind = FixupsBySize[Log2_32(MemSize)]; - break; - } - case AArch64MCExpr::VK_AARCH64_TLSDESC_LO12: - assert(MemSize == 8 && "Invalid fixup for operation"); - FixupKind = AArch64::fixup_a64_tlsdesc_ld64_lo12_nc; - break; +template uint32_t +AArch64MCCodeEmitter::getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + uint32_t ImmVal = 0; + + if (MO.isImm()) + ImmVal = 
static_cast(MO.getImm()); + else { + assert(MO.isExpr() && "unable to encode load/store imm operand"); + MCFixupKind Kind = MCFixupKind(FixupKind); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); + ++MCNumFixups; } - return getAddressWithFixup(ImmOp, FixupKind, Fixups, STI); + return ImmVal; } -unsigned -AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { +/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label +/// target. +uint32_t +AArch64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. if (MO.isImm()) - return static_cast(MO.getImm()); + return MO.getImm(); + assert(MO.isExpr() && "Unexpected target type!"); + const MCExpr *Expr = MO.getExpr(); - assert(MO.isExpr()); - - unsigned FixupKind = 0; - switch(cast(MO.getExpr())->getKind()) { - default: llvm_unreachable("Invalid expression modifier"); - case AArch64MCExpr::VK_AARCH64_LO12: - FixupKind = AArch64::fixup_a64_add_lo12; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_HI12: - FixupKind = AArch64::fixup_a64_add_dtprel_hi12; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_LO12: - FixupKind = AArch64::fixup_a64_add_dtprel_lo12; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC: - FixupKind = AArch64::fixup_a64_add_dtprel_lo12_nc; break; - case AArch64MCExpr::VK_AARCH64_TPREL_HI12: - FixupKind = AArch64::fixup_a64_add_tprel_hi12; break; - case AArch64MCExpr::VK_AARCH64_TPREL_LO12: - FixupKind = AArch64::fixup_a64_add_tprel_lo12; break; - case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC: - FixupKind = AArch64::fixup_a64_add_tprel_lo12_nc; break; - case AArch64MCExpr::VK_AARCH64_TLSDESC_LO12: - FixupKind = AArch64::fixup_a64_tlsdesc_add_lo12_nc; break; - } + MCFixupKind Kind = MI.getOpcode() == AArch64::ADR + ? MCFixupKind(AArch64::fixup_aarch64_pcrel_adr_imm21) + : MCFixupKind(AArch64::fixup_aarch64_pcrel_adrp_imm21); + Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); - return getAddressWithFixup(MO, FixupKind, Fixups, STI); -} + MCNumFixups += 1; -unsigned -AArch64MCCodeEmitter::getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { + // All of the information is in the fixup. + return 0; +} +/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and +/// the 2-bit shift field. The shift field is stored in bits 13-14 of the +/// return value. +uint32_t +AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + // Suboperands are [imm, shifter]. const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + assert(AArch64_AM::getShiftType(MO1.getImm()) == AArch64_AM::LSL && + "unexpected shift type for add/sub immediate"); + unsigned ShiftVal = AArch64_AM::getShiftValue(MO1.getImm()); + assert((ShiftVal == 0 || ShiftVal == 12) && + "unexpected shift value for add/sub immediate"); if (MO.isImm()) - return static_cast(MO.getImm()); - - assert(MO.isExpr()); + return MO.getImm() | (ShiftVal == 0 ? 
0 : (1 << 12)); + assert(MO.isExpr() && "Unable to encode MCOperand!"); + const MCExpr *Expr = MO.getExpr(); - unsigned Modifier = AArch64MCExpr::VK_AARCH64_None; - if (const AArch64MCExpr *Expr = dyn_cast(MO.getExpr())) - Modifier = Expr->getKind(); + // Encode the 12 bits of the fixup. + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_add_imm12); + Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); - unsigned FixupKind = 0; - switch(Modifier) { - case AArch64MCExpr::VK_AARCH64_None: - FixupKind = AArch64::fixup_a64_adr_prel_page; - break; - case AArch64MCExpr::VK_AARCH64_GOT: - FixupKind = AArch64::fixup_a64_adr_prel_got_page; - break; - case AArch64MCExpr::VK_AARCH64_GOTTPREL: - FixupKind = AArch64::fixup_a64_adr_gottprel_page; - break; - case AArch64MCExpr::VK_AARCH64_TLSDESC: - FixupKind = AArch64::fixup_a64_tlsdesc_adr_page; - break; - default: - llvm_unreachable("Unknown symbol reference kind for ADRP instruction"); - } + ++MCNumFixups; - return getAddressWithFixup(MO, FixupKind, Fixups, STI); + return 0; } -unsigned -AArch64MCCodeEmitter::getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - +/// getCondBranchTargetOpValue - Return the encoded value for a conditional +/// branch target. +uint32_t AArch64MCCodeEmitter::getCondBranchTargetOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Only immediate expected for shift"); - return ((32 - MO.getImm()) & 0x1f) | (31 - MO.getImm()) << 6; -} + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected target type!"); -unsigned -AArch64MCCodeEmitter::getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch19); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Only immediate expected for shift"); + ++MCNumFixups; - return ((64 - MO.getImm()) & 0x3f) | (63 - MO.getImm()) << 6; + // All of the information is in the fixup. + return 0; } -unsigned AArch64MCCodeEmitter::getShiftRightImm8( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return 8 - MI.getOperand(Op).getImm(); -} +/// getLoadLiteralOpValue - Return the encoded value for a load-literal +/// pc-relative address. +uint32_t +AArch64MCCodeEmitter::getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); -unsigned AArch64MCCodeEmitter::getShiftRightImm16( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return 16 - MI.getOperand(Op).getImm(); -} + // If the destination is an immediate, we have nothing to do. 
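+  // (An immediate operand here is assumed to already be the final encoded
+  // imm19 value, so it is returned unchanged.)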
+ if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected target type!"); -unsigned AArch64MCCodeEmitter::getShiftRightImm32( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return 32 - MI.getOperand(Op).getImm(); -} + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_ldr_pcrel_imm19); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); -unsigned AArch64MCCodeEmitter::getShiftRightImm64( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return 64 - MI.getOperand(Op).getImm(); -} + ++MCNumFixups; -unsigned AArch64MCCodeEmitter::getShiftLeftImm8( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return MI.getOperand(Op).getImm() - 8; + // All of the information is in the fixup. + return 0; } -unsigned AArch64MCCodeEmitter::getShiftLeftImm16( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return MI.getOperand(Op).getImm() - 16; +uint32_t +AArch64MCCodeEmitter::getMemExtendOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + unsigned SignExtend = MI.getOperand(OpIdx).getImm(); + unsigned DoShift = MI.getOperand(OpIdx + 1).getImm(); + return (SignExtend << 1) | DoShift; } -unsigned AArch64MCCodeEmitter::getShiftLeftImm32( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return MI.getOperand(Op).getImm() - 32; -} +uint32_t +AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); -unsigned AArch64MCCodeEmitter::getShiftLeftImm64( - const MCInst &MI, unsigned Op, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return MI.getOperand(Op).getImm() - 64; + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected movz/movk immediate"); + + Fixups.push_back(MCFixup::Create( + 0, MO.getExpr(), MCFixupKind(AArch64::fixup_aarch64_movw), MI.getLoc())); + + ++MCNumFixups; + + return 0; } -template unsigned -AArch64MCCodeEmitter::getLabelOpValue(const MCInst &MI, - unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { +/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- +/// branch target. +uint32_t AArch64MCCodeEmitter::getTestBranchTargetOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpIdx); - if (MO.isExpr()) - return getAddressWithFixup(MO, fixupDesired, Fixups, STI); + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected ADR target type!"); - assert(MO.isImm()); - return MO.getImm(); + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch14); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); + + ++MCNumFixups; + + // All of the information is in the fixup. + return 0; } -unsigned -AArch64MCCodeEmitter::getLoadLitLabelOpValue(const MCInst &MI, - unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { +/// getBranchTargetOpValue - Return the encoded value for an unconditional +/// branch target. 
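
The two helpers above pack compound operands into the returned encoding: getAddSubImmOpValue ORs the optional "LSL #12" flag into bit 12 of the value, and getMemExtendOpValue packs the sign-extend and do-shift flags into two bits. A minimal standalone sketch of that arithmetic (plain C++ with invented helper names, not the LLVM API):

#include <cassert>
#include <cstdint>

// Sketch of the compound-operand packing used by the emitter callbacks above.
uint32_t encodeAddSubImm(uint32_t Imm12, unsigned ShiftVal) {
  assert((ShiftVal == 0 || ShiftVal == 12) &&
         "unexpected shift value for add/sub immediate");
  return Imm12 | (ShiftVal == 0 ? 0 : (1 << 12)); // shift flag rides in bit 12
}

uint32_t encodeMemExtend(bool SignExtend, bool DoShift) {
  return (uint32_t(SignExtend) << 1) | uint32_t(DoShift); // 2-bit field
}

For every expression-valued operand, by contrast, the emitter records an MCFixup at offset 0 and returns 0, so the final bits are filled in at relaxation or link time; MCNumFixups only feeds the -stats counters.
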
+uint32_t +AArch64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpIdx); + // If the destination is an immediate, we have nothing to do. if (MO.isImm()) return MO.getImm(); + assert(MO.isExpr() && "Unexpected ADR target type!"); - assert(MO.isExpr()); + MCFixupKind Kind = MI.getOpcode() == AArch64::BL + ? MCFixupKind(AArch64::fixup_aarch64_pcrel_call26) + : MCFixupKind(AArch64::fixup_aarch64_pcrel_branch26); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - unsigned FixupKind; - if (isa(MO.getExpr())) { - assert(dyn_cast(MO.getExpr())->getKind() - == AArch64MCExpr::VK_AARCH64_GOTTPREL - && "Invalid symbol modifier for literal load"); - FixupKind = AArch64::fixup_a64_ld_gottprel_prel19; - } else { - FixupKind = AArch64::fixup_a64_ld_prel; - } + ++MCNumFixups; - return getAddressWithFixup(MO, FixupKind, Fixups, STI); + // All of the information is in the fixup. + return 0; } +/// getVecShifterOpValue - Return the encoded value for the vector shifter: +/// +/// 00 -> 0 +/// 01 -> 8 +/// 10 -> 16 +/// 11 -> 24 +uint32_t +AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); -unsigned -AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, - const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (MO.isReg()) { - return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); - } else if (MO.isImm()) { - return static_cast(MO.getImm()); + switch (MO.getImm()) { + default: + break; + case 0: + return 0; + case 8: + return 1; + case 16: + return 2; + case 24: + return 3; } - llvm_unreachable("Unable to encode MCOperand!"); + assert(false && "Invalid value for vector shift amount!"); return 0; } -unsigned -AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &UImm16MO = MI.getOperand(OpIdx); - const MCOperand &ShiftMO = MI.getOperand(OpIdx + 1); +uint32_t +AArch64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 64 - (MO.getImm()); +} - unsigned Result = static_cast(ShiftMO.getImm()) << 16; +uint32_t AArch64MCCodeEmitter::getSIMDShift64_32OpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 64 - (MO.getImm() | 32); +} - if (UImm16MO.isImm()) { - Result |= UImm16MO.getImm(); - return Result; - } +uint32_t +AArch64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 32 - (MO.getImm() | 16); +} - const AArch64MCExpr *A64E = cast(UImm16MO.getExpr()); - AArch64::Fixups requestedFixup; - switch (A64E->getKind()) { - default: llvm_unreachable("unexpected expression modifier"); - case 
AArch64MCExpr::VK_AARCH64_ABS_G0: - requestedFixup = AArch64::fixup_a64_movw_uabs_g0; break; - case AArch64MCExpr::VK_AARCH64_ABS_G0_NC: - requestedFixup = AArch64::fixup_a64_movw_uabs_g0_nc; break; - case AArch64MCExpr::VK_AARCH64_ABS_G1: - requestedFixup = AArch64::fixup_a64_movw_uabs_g1; break; - case AArch64MCExpr::VK_AARCH64_ABS_G1_NC: - requestedFixup = AArch64::fixup_a64_movw_uabs_g1_nc; break; - case AArch64MCExpr::VK_AARCH64_ABS_G2: - requestedFixup = AArch64::fixup_a64_movw_uabs_g2; break; - case AArch64MCExpr::VK_AARCH64_ABS_G2_NC: - requestedFixup = AArch64::fixup_a64_movw_uabs_g2_nc; break; - case AArch64MCExpr::VK_AARCH64_ABS_G3: - requestedFixup = AArch64::fixup_a64_movw_uabs_g3; break; - case AArch64MCExpr::VK_AARCH64_SABS_G0: - requestedFixup = AArch64::fixup_a64_movw_sabs_g0; break; - case AArch64MCExpr::VK_AARCH64_SABS_G1: - requestedFixup = AArch64::fixup_a64_movw_sabs_g1; break; - case AArch64MCExpr::VK_AARCH64_SABS_G2: - requestedFixup = AArch64::fixup_a64_movw_sabs_g2; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_G2: - requestedFixup = AArch64::fixup_a64_movw_dtprel_g2; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_G1: - requestedFixup = AArch64::fixup_a64_movw_dtprel_g1; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC: - requestedFixup = AArch64::fixup_a64_movw_dtprel_g1_nc; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_G0: - requestedFixup = AArch64::fixup_a64_movw_dtprel_g0; break; - case AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC: - requestedFixup = AArch64::fixup_a64_movw_dtprel_g0_nc; break; - case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1: - requestedFixup = AArch64::fixup_a64_movw_gottprel_g1; break; - case AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC: - requestedFixup = AArch64::fixup_a64_movw_gottprel_g0_nc; break; - case AArch64MCExpr::VK_AARCH64_TPREL_G2: - requestedFixup = AArch64::fixup_a64_movw_tprel_g2; break; - case AArch64MCExpr::VK_AARCH64_TPREL_G1: - requestedFixup = AArch64::fixup_a64_movw_tprel_g1; break; - case AArch64MCExpr::VK_AARCH64_TPREL_G1_NC: - requestedFixup = AArch64::fixup_a64_movw_tprel_g1_nc; break; - case AArch64MCExpr::VK_AARCH64_TPREL_G0: - requestedFixup = AArch64::fixup_a64_movw_tprel_g0; break; - case AArch64MCExpr::VK_AARCH64_TPREL_G0_NC: - requestedFixup = AArch64::fixup_a64_movw_tprel_g0_nc; break; - } +uint32_t +AArch64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 16 - (MO.getImm() | 8); +} - return Result | getAddressWithFixup(UImm16MO, requestedFixup, Fixups, STI); +/// getFixedPointScaleOpValue - Return the encoded value for the +// FP-to-fixed-point scale factor. 
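
getVecShifterOpValue above maps the only legal MOVI byte shifts (0, 8, 16, 24) onto a 2-bit field; the switch is equivalent to a validated divide-by-8. A standalone restatement (invented name, not the LLVM API):

#include <cassert>
#include <cstdint>

// Equivalent of the getVecShifterOpValue switch above: validate, then scale.
uint32_t encodeVecByteShift(uint32_t Amount) {
  assert((Amount == 0 || Amount == 8 || Amount == 16 || Amount == 24) &&
         "Invalid value for vector shift amount!");
  return Amount / 8; // 0 -> 0b00, 8 -> 0b01, 16 -> 0b10, 24 -> 0b11
}
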
+uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 64 - MO.getImm(); } -template unsigned -AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI, - unsigned EncodedValue, +uint32_t +AArch64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { - if (!hasRs) EncodedValue |= 0x001F0000; - if (!hasRt2) EncodedValue |= 0x00007C00; + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 64 - MO.getImm(); +} - return EncodedValue; +uint32_t +AArch64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 32 - MO.getImm(); } -unsigned -AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const { +uint32_t +AArch64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 16 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 8 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 64; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 32; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 16; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 8; +} + +/// getMoveVecShifterOpValue - Return the encoded value for the vector move +/// shifter (MSL). 
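
The getVecShiftR*/getVecShiftL* family above is pure bias arithmetic: the right-shift forms return (element bits - imm) and the left-shift forms return (imm - element bits). A hedged generalization of that pattern (the real emitter keeps one callback per element width; this folds them into a parameterized pair purely for clarity):

#include <cstdint>

// Parameterized restatement of the per-width shift callbacks above.
uint32_t encodeVecShiftRight(uint32_t EltBits, uint32_t Imm) {
  return EltBits - Imm; // e.g. 64-bit elements, shift 3 -> 61
}

uint32_t encodeVecShiftLeft(uint32_t EltBits, uint32_t Imm) {
  return Imm - EltBits; // inverse bias for the left-shift forms
}
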
+uint32_t AArch64MCCodeEmitter::getMoveVecShifterOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && + "Expected an immediate value for the move shift amount!"); + unsigned ShiftVal = AArch64_AM::getShiftValue(MO.getImm()); + assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!"); + return ShiftVal == 8 ? 0 : 1; +} + +unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const { // If one of the signed fixup kinds is applied to a MOVZ instruction, the // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's // job to ensure that any bits possibly affected by this are 0. This means we @@ -552,23 +589,38 @@ AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, const AArch64MCExpr *A64E = cast(UImm16MO.getExpr()); switch (A64E->getKind()) { - case AArch64MCExpr::VK_AARCH64_SABS_G0: - case AArch64MCExpr::VK_AARCH64_SABS_G1: - case AArch64MCExpr::VK_AARCH64_SABS_G2: - case AArch64MCExpr::VK_AARCH64_DTPREL_G2: - case AArch64MCExpr::VK_AARCH64_DTPREL_G1: - case AArch64MCExpr::VK_AARCH64_DTPREL_G0: - case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1: - case AArch64MCExpr::VK_AARCH64_TPREL_G2: - case AArch64MCExpr::VK_AARCH64_TPREL_G1: - case AArch64MCExpr::VK_AARCH64_TPREL_G0: + case AArch64MCExpr::VK_DTPREL_G2: + case AArch64MCExpr::VK_DTPREL_G1: + case AArch64MCExpr::VK_DTPREL_G0: + case AArch64MCExpr::VK_GOTTPREL_G1: + case AArch64MCExpr::VK_TPREL_G2: + case AArch64MCExpr::VK_TPREL_G1: + case AArch64MCExpr::VK_TPREL_G0: return EncodedValue & ~(1u << 30); default: // Nothing to do for an unsigned fixup. return EncodedValue; } - llvm_unreachable("Should have returned by now"); + + return EncodedValue & ~(1u << 30); +} + +void AArch64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + if (MI.getOpcode() == AArch64::TLSDESCCALL) { + // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the + // following (BLR) instruction. It doesn't emit any code itself so it + // doesn't go through the normal TableGenerated channels. + MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call); + Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup)); + return; + } + + uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); + EmitConstant(Binary, 4, OS); + ++MCNumEmitted; // Keep track of the # of mi's emitted. } unsigned @@ -581,32 +633,22 @@ AArch64MCCodeEmitter::fixMulHigh(const MCInst &MI, return EncodedValue; } -MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new AArch64MCCodeEmitter(Ctx); -} - -void AArch64MCCodeEmitter:: -EncodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (MI.getOpcode() == AArch64::TLSDESCCALL) { - // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the - // following (BLR) instruction. It doesn't emit any code itself so it - // doesn't go through the normal TableGenerated channels. 
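
fixMOVZ above pre-clears bit 30, the bit that distinguishes MOVZ from MOVN, whenever one of the signed relocation variants is attached, so that the eventual fixup is free to turn the instruction into a MOVN. A one-function standalone restatement (invented name):

#include <cstdint>

// Restatement of the fixMOVZ post-encoder above: for MOVN-capable (signed)
// relocation variants, bit 30 must leave the encoder as zero.
uint32_t fixMovzForSignedVariant(uint32_t EncodedValue, bool SignedVariant) {
  return SignedVariant ? (EncodedValue & ~(1u << 30)) : EncodedValue;
}
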
- MCFixupKind Fixup = MCFixupKind(AArch64::fixup_a64_tlsdesc_call); - const MCExpr *Expr; - Expr = AArch64MCExpr::CreateTLSDesc(MI.getOperand(0).getExpr(), Ctx); - Fixups.push_back(MCFixup::Create(0, Expr, Fixup)); - return; - } - - uint32_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); +template unsigned +AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI, + unsigned EncodedValue, + const MCSubtargetInfo &STI) const { + if (!hasRs) EncodedValue |= 0x001F0000; + if (!hasRt2) EncodedValue |= 0x00007C00; - EmitInstruction(Binary, OS); + return EncodedValue; } +unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison( + const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const { + // The Rm field of FCMP and friends is unused - it should be assembled + // as 0, but is ignored by the processor. + EncodedValue &= ~(0x1f << 16); + return EncodedValue; +} #include "AArch64GenMCCodeEmitter.inc" diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index c7ccaee..85c3ec7 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -12,74 +12,121 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "aarch64mcexpr" #include "AArch64MCExpr.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELF.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCValue.h" #include "llvm/Object/ELF.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; -const AArch64MCExpr* -AArch64MCExpr::Create(VariantKind Kind, const MCExpr *Expr, - MCContext &Ctx) { - return new (Ctx) AArch64MCExpr(Kind, Expr); +#define DEBUG_TYPE "aarch64symbolrefexpr" + +const AArch64MCExpr *AArch64MCExpr::Create(const MCExpr *Expr, VariantKind Kind, + MCContext &Ctx) { + return new (Ctx) AArch64MCExpr(Expr, Kind); +} + +StringRef AArch64MCExpr::getVariantKindName() const { + switch (static_cast(getKind())) { + case VK_CALL: return ""; + case VK_LO12: return ":lo12:"; + case VK_ABS_G3: return ":abs_g3:"; + case VK_ABS_G2: return ":abs_g2:"; + case VK_ABS_G2_S: return ":abs_g2_s:"; + case VK_ABS_G2_NC: return ":abs_g2_nc:"; + case VK_ABS_G1: return ":abs_g1:"; + case VK_ABS_G1_S: return ":abs_g1_s:"; + case VK_ABS_G1_NC: return ":abs_g1_nc:"; + case VK_ABS_G0: return ":abs_g0:"; + case VK_ABS_G0_S: return ":abs_g0_s:"; + case VK_ABS_G0_NC: return ":abs_g0_nc:"; + case VK_DTPREL_G2: return ":dtprel_g2:"; + case VK_DTPREL_G1: return ":dtprel_g1:"; + case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:"; + case VK_DTPREL_G0: return ":dtprel_g0:"; + case VK_DTPREL_G0_NC: return ":dtprel_g0_nc:"; + case VK_DTPREL_HI12: return ":dtprel_hi12:"; + case VK_DTPREL_LO12: return ":dtprel_lo12:"; + case VK_DTPREL_LO12_NC: return ":dtprel_lo12_nc:"; + case VK_TPREL_G2: return ":tprel_g2:"; + case VK_TPREL_G1: return ":tprel_g1:"; + case VK_TPREL_G1_NC: return ":tprel_g1_nc:"; + case VK_TPREL_G0: return ":tprel_g0:"; + case VK_TPREL_G0_NC: return ":tprel_g0_nc:"; + case VK_TPREL_HI12: return ":tprel_hi12:"; + case VK_TPREL_LO12: return ":tprel_lo12:"; + case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:"; + case VK_TLSDESC_LO12: return ":tlsdesc_lo12:"; + case VK_ABS_PAGE: return ""; + case VK_GOT_PAGE: return ":got:"; + case VK_GOT_LO12: return ":got_lo12:"; + case VK_GOTTPREL_PAGE: return ":gottprel:"; + case VK_GOTTPREL_LO12_NC: return ":gottprel_lo12:"; + case VK_GOTTPREL_G1: return ":gottprel_g1:"; + case 
VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:"; + case VK_TLSDESC: return ""; + case VK_TLSDESC_PAGE: return ":tlsdesc:"; + default: + llvm_unreachable("Invalid ELF symbol kind"); + } } void AArch64MCExpr::PrintImpl(raw_ostream &OS) const { - switch (Kind) { - default: llvm_unreachable("Invalid kind!"); - case VK_AARCH64_GOT: OS << ":got:"; break; - case VK_AARCH64_GOT_LO12: OS << ":got_lo12:"; break; - case VK_AARCH64_LO12: OS << ":lo12:"; break; - case VK_AARCH64_ABS_G0: OS << ":abs_g0:"; break; - case VK_AARCH64_ABS_G0_NC: OS << ":abs_g0_nc:"; break; - case VK_AARCH64_ABS_G1: OS << ":abs_g1:"; break; - case VK_AARCH64_ABS_G1_NC: OS << ":abs_g1_nc:"; break; - case VK_AARCH64_ABS_G2: OS << ":abs_g2:"; break; - case VK_AARCH64_ABS_G2_NC: OS << ":abs_g2_nc:"; break; - case VK_AARCH64_ABS_G3: OS << ":abs_g3:"; break; - case VK_AARCH64_SABS_G0: OS << ":abs_g0_s:"; break; - case VK_AARCH64_SABS_G1: OS << ":abs_g1_s:"; break; - case VK_AARCH64_SABS_G2: OS << ":abs_g2_s:"; break; - case VK_AARCH64_DTPREL_G2: OS << ":dtprel_g2:"; break; - case VK_AARCH64_DTPREL_G1: OS << ":dtprel_g1:"; break; - case VK_AARCH64_DTPREL_G1_NC: OS << ":dtprel_g1_nc:"; break; - case VK_AARCH64_DTPREL_G0: OS << ":dtprel_g0:"; break; - case VK_AARCH64_DTPREL_G0_NC: OS << ":dtprel_g0_nc:"; break; - case VK_AARCH64_DTPREL_HI12: OS << ":dtprel_hi12:"; break; - case VK_AARCH64_DTPREL_LO12: OS << ":dtprel_lo12:"; break; - case VK_AARCH64_DTPREL_LO12_NC: OS << ":dtprel_lo12_nc:"; break; - case VK_AARCH64_GOTTPREL_G1: OS << ":gottprel_g1:"; break; - case VK_AARCH64_GOTTPREL_G0_NC: OS << ":gottprel_g0_nc:"; break; - case VK_AARCH64_GOTTPREL: OS << ":gottprel:"; break; - case VK_AARCH64_GOTTPREL_LO12: OS << ":gottprel_lo12:"; break; - case VK_AARCH64_TPREL_G2: OS << ":tprel_g2:"; break; - case VK_AARCH64_TPREL_G1: OS << ":tprel_g1:"; break; - case VK_AARCH64_TPREL_G1_NC: OS << ":tprel_g1_nc:"; break; - case VK_AARCH64_TPREL_G0: OS << ":tprel_g0:"; break; - case VK_AARCH64_TPREL_G0_NC: OS << ":tprel_g0_nc:"; break; - case VK_AARCH64_TPREL_HI12: OS << ":tprel_hi12:"; break; - case VK_AARCH64_TPREL_LO12: OS << ":tprel_lo12:"; break; - case VK_AARCH64_TPREL_LO12_NC: OS << ":tprel_lo12_nc:"; break; - case VK_AARCH64_TLSDESC: OS << ":tlsdesc:"; break; - case VK_AARCH64_TLSDESC_LO12: OS << ":tlsdesc_lo12:"; break; + if (getKind() != VK_NONE) + OS << getVariantKindName(); + OS << *Expr; +} + +// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps +// that method should be made public? +// FIXME: really do above: now that two backends are using it. 
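
Two more post-encoders appear above: fixLoadStoreExclusive forces absent Rs/Rt2 register fields to read as all-ones, and fixOneOperandFPComparison zeroes the unused Rm field of a one-operand FCMP. A standalone sketch of the masks, with the bit ranges spelled out:

#include <cstdint>

// Masks from the post-encoders above; comments give the field positions.
uint32_t fixLoadStoreExclusiveBits(uint32_t Enc, bool HasRs, bool HasRt2) {
  if (!HasRs)
    Enc |= 0x001F0000; // Rs  (bits 16-20) <- 0b11111
  if (!HasRt2)
    Enc |= 0x00007C00; // Rt2 (bits 10-14) <- 0b11111
  return Enc;
}

uint32_t fixOneOperandFPComparisonBits(uint32_t Enc) {
  return Enc & ~(0x1Fu << 16); // Rm (bits 16-20) <- 0
}
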
+static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { + switch (Value->getKind()) { + case MCExpr::Target: + llvm_unreachable("Can't handle nested target expr!"); + break; + + case MCExpr::Constant: + break; + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast(Value); + AddValueSymbolsImpl(BE->getLHS(), Asm); + AddValueSymbolsImpl(BE->getRHS(), Asm); + break; + } + + case MCExpr::SymbolRef: + Asm->getOrCreateSymbolData(cast(Value)->getSymbol()); + break; + + case MCExpr::Unary: + AddValueSymbolsImpl(cast(Value)->getSubExpr(), Asm); + break; } +} + +void AArch64MCExpr::AddValueSymbols(MCAssembler *Asm) const { + AddValueSymbolsImpl(getSubExpr(), Asm); +} - const MCExpr *Expr = getSubExpr(); - if (Expr->getKind() != MCExpr::SymbolRef) - OS << '('; - Expr->print(OS); - if (Expr->getKind() != MCExpr::SymbolRef) - OS << ')'; +const MCSection *AArch64MCExpr::FindAssociatedSection() const { + llvm_unreachable("FIXME: what goes here?"); } -bool -AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const { - return getSubExpr()->EvaluateAsRelocatable(Res, Layout); +bool AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout) const { + if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout)) + return false; + + Res = + MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind()); + + return true; } static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) { @@ -113,66 +160,15 @@ static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) { } void AArch64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const { - switch (getKind()) { + switch (getSymbolLoc(Kind)) { default: return; - case VK_AARCH64_DTPREL_G2: - case VK_AARCH64_DTPREL_G1: - case VK_AARCH64_DTPREL_G1_NC: - case VK_AARCH64_DTPREL_G0: - case VK_AARCH64_DTPREL_G0_NC: - case VK_AARCH64_DTPREL_HI12: - case VK_AARCH64_DTPREL_LO12: - case VK_AARCH64_DTPREL_LO12_NC: - case VK_AARCH64_GOTTPREL_G1: - case VK_AARCH64_GOTTPREL_G0_NC: - case VK_AARCH64_GOTTPREL: - case VK_AARCH64_GOTTPREL_LO12: - case VK_AARCH64_TPREL_G2: - case VK_AARCH64_TPREL_G1: - case VK_AARCH64_TPREL_G1_NC: - case VK_AARCH64_TPREL_G0: - case VK_AARCH64_TPREL_G0_NC: - case VK_AARCH64_TPREL_HI12: - case VK_AARCH64_TPREL_LO12: - case VK_AARCH64_TPREL_LO12_NC: - case VK_AARCH64_TLSDESC: - case VK_AARCH64_TLSDESC_LO12: + case VK_DTPREL: + case VK_GOTTPREL: + case VK_TPREL: + case VK_TLSDESC: break; } fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm); } - -// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps -// that method should be made public? -// FIXME: really do above: now that two backends are using it. 
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { - switch (Value->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expr!"); - break; - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast(Value); - AddValueSymbolsImpl(BE->getLHS(), Asm); - AddValueSymbolsImpl(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: - Asm->getOrCreateSymbolData(cast(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbolsImpl(cast(Value)->getSubExpr(), Asm); - break; - } -} - -void AArch64MCExpr::AddValueSymbols(MCAssembler *Asm) const { - AddValueSymbolsImpl(getSubExpr(), Asm); -} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index d9798ae..e869ed0 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -1,4 +1,4 @@ -//==- AArch64MCExpr.h - AArch64 specific MC expression classes --*- C++ -*-===// +//=--- AArch64MCExpr.h - AArch64 specific MC expression classes ---*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -12,168 +12,149 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64MCEXPR_H -#define LLVM_AARCH64MCEXPR_H +#ifndef LLVM_AArch64MCEXPR_H +#define LLVM_AArch64MCEXPR_H #include "llvm/MC/MCExpr.h" +#include "llvm/Support/ErrorHandling.h" namespace llvm { class AArch64MCExpr : public MCTargetExpr { public: enum VariantKind { - VK_AARCH64_None, - VK_AARCH64_GOT, // :got: modifier in assembly - VK_AARCH64_GOT_LO12, // :got_lo12: - VK_AARCH64_LO12, // :lo12: - - VK_AARCH64_ABS_G0, // :abs_g0: - VK_AARCH64_ABS_G0_NC, // :abs_g0_nc: - VK_AARCH64_ABS_G1, - VK_AARCH64_ABS_G1_NC, - VK_AARCH64_ABS_G2, - VK_AARCH64_ABS_G2_NC, - VK_AARCH64_ABS_G3, - - VK_AARCH64_SABS_G0, // :abs_g0_s: - VK_AARCH64_SABS_G1, - VK_AARCH64_SABS_G2, - - VK_AARCH64_DTPREL_G2, // :dtprel_g2: - VK_AARCH64_DTPREL_G1, - VK_AARCH64_DTPREL_G1_NC, - VK_AARCH64_DTPREL_G0, - VK_AARCH64_DTPREL_G0_NC, - VK_AARCH64_DTPREL_HI12, - VK_AARCH64_DTPREL_LO12, - VK_AARCH64_DTPREL_LO12_NC, - - VK_AARCH64_GOTTPREL_G1, // :gottprel: - VK_AARCH64_GOTTPREL_G0_NC, - VK_AARCH64_GOTTPREL, - VK_AARCH64_GOTTPREL_LO12, - - VK_AARCH64_TPREL_G2, // :tprel: - VK_AARCH64_TPREL_G1, - VK_AARCH64_TPREL_G1_NC, - VK_AARCH64_TPREL_G0, - VK_AARCH64_TPREL_G0_NC, - VK_AARCH64_TPREL_HI12, - VK_AARCH64_TPREL_LO12, - VK_AARCH64_TPREL_LO12_NC, - - VK_AARCH64_TLSDESC, // :tlsdesc: - VK_AARCH64_TLSDESC_LO12 + VK_NONE = 0x000, + + // Symbol locations specifying (roughly speaking) what calculation should be + // performed to construct the final address for the relocated + // symbol. E.g. direct, via the GOT, ... + VK_ABS = 0x001, + VK_SABS = 0x002, + VK_GOT = 0x003, + VK_DTPREL = 0x004, + VK_GOTTPREL = 0x005, + VK_TPREL = 0x006, + VK_TLSDESC = 0x007, + VK_SymLocBits = 0x00f, + + // Variants specifying which part of the final address calculation is + // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a + // MOVZ/MOVK. + VK_PAGE = 0x010, + VK_PAGEOFF = 0x020, + VK_HI12 = 0x030, + VK_G0 = 0x040, + VK_G1 = 0x050, + VK_G2 = 0x060, + VK_G3 = 0x070, + VK_AddressFragBits = 0x0f0, + + // Whether the final relocation is a checked one (where a linker should + // perform a range-check on the final address) or not. Note that this field + // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12: + // on its own is a non-checked relocation. 
We side with ELF on being + // explicit about this! + VK_NC = 0x100, + + // Convenience definitions for referring to specific textual representations + // of relocation specifiers. Note that this means the "_NC" is sometimes + // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC + // since a user would write ":lo12:"). + VK_CALL = VK_ABS, + VK_ABS_PAGE = VK_ABS | VK_PAGE, + VK_ABS_G3 = VK_ABS | VK_G3, + VK_ABS_G2 = VK_ABS | VK_G2, + VK_ABS_G2_S = VK_SABS | VK_G2, + VK_ABS_G2_NC = VK_ABS | VK_G2 | VK_NC, + VK_ABS_G1 = VK_ABS | VK_G1, + VK_ABS_G1_S = VK_SABS | VK_G1, + VK_ABS_G1_NC = VK_ABS | VK_G1 | VK_NC, + VK_ABS_G0 = VK_ABS | VK_G0, + VK_ABS_G0_S = VK_SABS | VK_G0, + VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC, + VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC, + VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, + VK_GOT_PAGE = VK_GOT | VK_PAGE, + VK_DTPREL_G2 = VK_DTPREL | VK_G2, + VK_DTPREL_G1 = VK_DTPREL | VK_G1, + VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC, + VK_DTPREL_G0 = VK_DTPREL | VK_G0, + VK_DTPREL_G0_NC = VK_DTPREL | VK_G0 | VK_NC, + VK_DTPREL_HI12 = VK_DTPREL | VK_HI12, + VK_DTPREL_LO12 = VK_DTPREL | VK_PAGEOFF, + VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC, + VK_GOTTPREL_PAGE = VK_GOTTPREL | VK_PAGE, + VK_GOTTPREL_LO12_NC = VK_GOTTPREL | VK_PAGEOFF | VK_NC, + VK_GOTTPREL_G1 = VK_GOTTPREL | VK_G1, + VK_GOTTPREL_G0_NC = VK_GOTTPREL | VK_G0 | VK_NC, + VK_TPREL_G2 = VK_TPREL | VK_G2, + VK_TPREL_G1 = VK_TPREL | VK_G1, + VK_TPREL_G1_NC = VK_TPREL | VK_G1 | VK_NC, + VK_TPREL_G0 = VK_TPREL | VK_G0, + VK_TPREL_G0_NC = VK_TPREL | VK_G0 | VK_NC, + VK_TPREL_HI12 = VK_TPREL | VK_HI12, + VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF, + VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC, + VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC, + VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE, + + VK_INVALID = 0xfff }; private: - const VariantKind Kind; const MCExpr *Expr; + const VariantKind Kind; - explicit AArch64MCExpr(VariantKind _Kind, const MCExpr *_Expr) - : Kind(_Kind), Expr(_Expr) {} + explicit AArch64MCExpr(const MCExpr *Expr, VariantKind Kind) + : Expr(Expr), Kind(Kind) {} public: /// @name Construction /// @{ - static const AArch64MCExpr *Create(VariantKind Kind, const MCExpr *Expr, - MCContext &Ctx); - - static const AArch64MCExpr *CreateLo12(const MCExpr *Expr, MCContext &Ctx) { - return Create(VK_AARCH64_LO12, Expr, Ctx); - } - - static const AArch64MCExpr *CreateGOT(const MCExpr *Expr, MCContext &Ctx) { - return Create(VK_AARCH64_GOT, Expr, Ctx); - } - - static const AArch64MCExpr *CreateGOTLo12(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_GOT_LO12, Expr, Ctx); - } - - static const AArch64MCExpr *CreateDTPREL_G1(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_DTPREL_G1, Expr, Ctx); - } - - static const AArch64MCExpr *CreateDTPREL_G0_NC(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_DTPREL_G0_NC, Expr, Ctx); - } - - static const AArch64MCExpr *CreateGOTTPREL(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_GOTTPREL, Expr, Ctx); - } - - static const AArch64MCExpr *CreateGOTTPRELLo12(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_GOTTPREL_LO12, Expr, Ctx); - } - - static const AArch64MCExpr *CreateTLSDesc(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_TLSDESC, Expr, Ctx); - } + static const AArch64MCExpr *Create(const MCExpr *Expr, VariantKind Kind, + MCContext &Ctx); - static const AArch64MCExpr *CreateTLSDescLo12(const MCExpr *Expr, - MCContext &Ctx) { - return 
Create(VK_AARCH64_TLSDESC_LO12, Expr, Ctx); - } + /// @} + /// @name Accessors + /// @{ - static const AArch64MCExpr *CreateTPREL_G1(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_TPREL_G1, Expr, Ctx); - } + /// Get the kind of this expression. + VariantKind getKind() const { return static_cast(Kind); } - static const AArch64MCExpr *CreateTPREL_G0_NC(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_TPREL_G0_NC, Expr, Ctx); - } + /// Get the expression this modifier applies to. + const MCExpr *getSubExpr() const { return Expr; } - static const AArch64MCExpr *CreateABS_G3(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_ABS_G3, Expr, Ctx); - } + /// @} + /// @name VariantKind information extractors. + /// @{ - static const AArch64MCExpr *CreateABS_G2_NC(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_ABS_G2_NC, Expr, Ctx); + static VariantKind getSymbolLoc(VariantKind Kind) { + return static_cast(Kind & VK_SymLocBits); } - static const AArch64MCExpr *CreateABS_G1_NC(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_ABS_G1_NC, Expr, Ctx); + static VariantKind getAddressFrag(VariantKind Kind) { + return static_cast(Kind & VK_AddressFragBits); } - static const AArch64MCExpr *CreateABS_G0_NC(const MCExpr *Expr, - MCContext &Ctx) { - return Create(VK_AARCH64_ABS_G0_NC, Expr, Ctx); - } + static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; } /// @} - /// @name Accessors - /// @{ - /// getOpcode - Get the kind of this expression. - VariantKind getKind() const { return Kind; } + /// Convert the variant kind into an ELF-appropriate modifier + /// (e.g. ":got:", ":lo12:"). + StringRef getVariantKindName() const; - /// getSubExpr - Get the child of this expression. 
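
The redesigned VariantKind above is a small bitfield algebra: the low nibble carries the symbol location, the next nibble the address fragment, and bit 8 the "not checked" flag, so compound kinds are simply OR-ed components that the extractors mask back out. A compilable sketch using a representative subset of the constants declared above:

#include <cassert>
#include <cstdint>

// Constants copied from the VariantKind enum above (subset only).
enum : uint32_t {
  VK_DTPREL = 0x004,
  VK_SymLocBits = 0x00f,
  VK_PAGEOFF = 0x020,
  VK_AddressFragBits = 0x0f0,
  VK_NC = 0x100,
};

int main() {
  uint32_t DTPRelLo12NC = VK_DTPREL | VK_PAGEOFF | VK_NC; // VK_DTPREL_LO12_NC
  assert((DTPRelLo12NC & VK_SymLocBits) == VK_DTPREL);       // getSymbolLoc
  assert((DTPRelLo12NC & VK_AddressFragBits) == VK_PAGEOFF); // getAddressFrag
  assert((DTPRelLo12NC & VK_NC) != 0);                       // isNotChecked
}

This factoring is what lets fixELFSymbolsInTLSFixups switch on getSymbolLoc(Kind) instead of enumerating every individual TLS kind, as the hunk above shows.
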
- const MCExpr *getSubExpr() const { return Expr; } + void PrintImpl(raw_ostream &OS) const override; - /// @} + void AddValueSymbols(MCAssembler *) const override; + + const MCSection *FindAssociatedSection() const override; - void PrintImpl(raw_ostream &OS) const; bool EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const; - void AddValueSymbols(MCAssembler *) const; - const MCSection *FindAssociatedSection() const { - return getSubExpr()->FindAssociatedSection(); - } + const MCAsmLayout *Layout) const override; - void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const; + void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; static bool classof(const MCExpr *E) { return E->getKind() == MCExpr::Target; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 3d19e42..ae698c5 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -1,4 +1,4 @@ -//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions -------------===// +//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -15,9 +15,7 @@ #include "AArch64ELFStreamer.h" #include "AArch64MCAsmInfo.h" #include "InstPrinter/AArch64InstPrinter.h" -#include "llvm/ADT/APInt.h" #include "llvm/MC/MCCodeGenInfo.h" -#include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" @@ -25,8 +23,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" -#define GET_REGINFO_MC_DESC -#include "AArch64GenRegisterInfo.inc" +using namespace llvm; #define GET_INSTRINFO_MC_DESC #include "AArch64GenInstrInfo.inc" @@ -34,26 +31,29 @@ #define GET_SUBTARGETINFO_MC_DESC #include "AArch64GenSubtargetInfo.inc" -using namespace llvm; +#define GET_REGINFO_MC_DESC +#include "AArch64GenRegisterInfo.inc" -MCSubtargetInfo *AArch64_MC::createAArch64MCSubtargetInfo(StringRef TT, - StringRef CPU, - StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitAArch64MCSubtargetInfo(X, TT, CPU, FS); +static MCInstrInfo *createAArch64MCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitAArch64MCInstrInfo(X); return X; } +static MCSubtargetInfo * +createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) { + MCSubtargetInfo *X = new MCSubtargetInfo(); -static MCInstrInfo *createAArch64MCInstrInfo() { - MCInstrInfo *X = new MCInstrInfo(); - InitAArch64MCInstrInfo(X); + if (CPU.empty()) + CPU = "generic"; + + InitAArch64MCSubtargetInfo(X, TT, CPU, FS); return X; } static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) { MCRegisterInfo *X = new MCRegisterInfo(); - InitAArch64MCRegisterInfo(X, AArch64::X30); + InitAArch64MCRegisterInfo(X, AArch64::LR); return X; } @@ -61,9 +61,17 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) { Triple TheTriple(TT); - MCAsmInfo *MAI = new AArch64ELFMCAsmInfo(TT); - unsigned Reg = MRI.getDwarfRegNum(AArch64::XSP, true); - MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0); + MCAsmInfo *MAI; + if (TheTriple.isOSDarwin()) + MAI = new AArch64MCAsmInfoDarwin(); + else { + assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF"); + MAI = new AArch64MCAsmInfoELF(TT); + } + + // Initial state of the frame pointer is SP. 
+ unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true); + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0); MAI->addInitialFrameState(Inst); return MAI; @@ -72,40 +80,35 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { - MCCodeGenInfo *X = new MCCodeGenInfo(); - if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) { - // On ELF platforms the default static relocation model has a smart enough - // linker to cope with referencing external symbols defined in a shared - // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. - RM = Reloc::Static; - } + Triple TheTriple(TT); + assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) && + "Only expect Darwin and ELF targets"); if (CM == CodeModel::Default) CM = CodeModel::Small; - else if (CM == CodeModel::JITDefault) { - // The default MCJIT memory managers make no guarantees about where they can - // find an executable page; JITed code needs to be able to refer to globals - // no matter how far away they are. + // The default MCJIT memory managers make no guarantees about where they can + // find an executable page; JITed code needs to be able to refer to globals + // no matter how far away they are. + else if (CM == CodeModel::JITDefault) CM = CodeModel::Large; - } + else if (CM != CodeModel::Small && CM != CodeModel::Large) + report_fatal_error( + "Only small and large code models are allowed on AArch64"); + + // AArch64 Darwin is always PIC. + if (TheTriple.isOSDarwin()) + RM = Reloc::PIC_; + // On ELF platforms the default static relocation model has a smart enough + // linker to cope with referencing external symbols defined in a shared + // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. + else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) + RM = Reloc::Static; + MCCodeGenInfo *X = new MCCodeGenInfo(); X->InitMCCodeGenInfo(RM, CM, OL); return X; } -static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Ctx, MCAsmBackend &MAB, - raw_ostream &OS, - MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, - bool RelaxAll, - bool NoExecStack) { - Triple TheTriple(TT); - - return createAArch64ELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack); -} - - static MCInstPrinter *createAArch64MCInstPrinter(const Target &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, @@ -114,108 +117,109 @@ static MCInstPrinter *createAArch64MCInstPrinter(const Target &T, const MCSubtargetInfo &STI) { if (SyntaxVariant == 0) return new AArch64InstPrinter(MAI, MII, MRI, STI); - return 0; -} - -namespace { - -class AArch64MCInstrAnalysis : public MCInstrAnalysis { -public: - AArch64MCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {} - - virtual bool isUnconditionalBranch(const MCInst &Inst) const { - if (Inst.getOpcode() == AArch64::Bcc - && Inst.getOperand(0).getImm() == A64CC::AL) - return true; - return MCInstrAnalysis::isUnconditionalBranch(Inst); - } - - virtual bool isConditionalBranch(const MCInst &Inst) const { - if (Inst.getOpcode() == AArch64::Bcc - && Inst.getOperand(0).getImm() == A64CC::AL) - return false; - return MCInstrAnalysis::isConditionalBranch(Inst); - } - - bool evaluateBranch(const MCInst &Inst, uint64_t Addr, - uint64_t Size, uint64_t &Target) const { - unsigned LblOperand = Inst.getOpcode() == AArch64::Bcc ? 1 : 0; - // FIXME: We only handle PCRel branches for now. 
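
createAArch64MCCodeGenInfo above encodes three policies: only the small and large code models are accepted (with JIT defaulting to large so JITed code can reach arbitrarily distant globals), Darwin is unconditionally PIC, and on ELF both Default and DynamicNoPIC decay to Static. A standalone paraphrase, using illustrative enums rather than LLVM's Reloc::Model/CodeModel::Model:

#include <stdexcept>

// Illustrative stand-ins for the relocation and code-model enums.
enum class Reloc { Default, Static, PIC, DynamicNoPIC };
enum class Model { Default, JITDefault, Small, Large, Kernel };

void chooseAArch64Models(bool IsDarwin, Reloc &RM, Model &CM) {
  if (CM == Model::Default)
    CM = Model::Small;
  else if (CM == Model::JITDefault)
    CM = Model::Large; // JITed code may be placed far from its globals
  else if (CM != Model::Small && CM != Model::Large)
    throw std::runtime_error(
        "Only small and large code models are allowed on AArch64");

  if (IsDarwin)
    RM = Reloc::PIC; // AArch64 Darwin is always PIC
  else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC)
    RM = Reloc::Static; // ELF static linking copes with external symbols
}
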
- if (Info->get(Inst.getOpcode()).OpInfo[LblOperand].OperandType - != MCOI::OPERAND_PCREL) - return false; - - int64_t Imm = Inst.getOperand(LblOperand).getImm(); - Target = Addr + Imm; - return true; - } -}; + if (SyntaxVariant == 1) + return new AArch64AppleInstPrinter(MAI, MII, MRI, STI); + return nullptr; } -static MCInstrAnalysis *createAArch64MCInstrAnalysis(const MCInstrInfo *Info) { - return new AArch64MCInstrAnalysis(Info); -} +static MCStreamer *createMCStreamer(const Target &T, StringRef TT, + MCContext &Ctx, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + const MCSubtargetInfo &STI, bool RelaxAll, + bool NoExecStack) { + Triple TheTriple(TT); + if (TheTriple.isOSDarwin()) + return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll, + /*LabelSections*/ true); + return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); +} +// Force static initialization. extern "C" void LLVMInitializeAArch64TargetMC() { // Register the MC asm info. - RegisterMCAsmInfoFn A(TheAArch64leTarget, createAArch64MCAsmInfo); - RegisterMCAsmInfoFn B(TheAArch64beTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn X(TheAArch64leTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn Y(TheAArch64beTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn Z(TheARM64leTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn W(TheARM64beTarget, createAArch64MCAsmInfo); // Register the MC codegen info. TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget, createAArch64MCCodeGenInfo); TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget, createAArch64MCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheARM64leTarget, + createAArch64MCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheARM64beTarget, + createAArch64MCCodeGenInfo); // Register the MC instruction info. TargetRegistry::RegisterMCInstrInfo(TheAArch64leTarget, createAArch64MCInstrInfo); TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget, createAArch64MCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheARM64leTarget, + createAArch64MCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheARM64beTarget, + createAArch64MCInstrInfo); // Register the MC register info. TargetRegistry::RegisterMCRegInfo(TheAArch64leTarget, createAArch64MCRegisterInfo); TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget, createAArch64MCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheARM64leTarget, + createAArch64MCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheARM64beTarget, + createAArch64MCRegisterInfo); // Register the MC subtarget info. - using AArch64_MC::createAArch64MCSubtargetInfo; TargetRegistry::RegisterMCSubtargetInfo(TheAArch64leTarget, createAArch64MCSubtargetInfo); TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget, createAArch64MCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheARM64leTarget, + createAArch64MCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheARM64beTarget, + createAArch64MCSubtargetInfo); - // Register the MC instruction analyzer. - TargetRegistry::RegisterMCInstrAnalysis(TheAArch64leTarget, - createAArch64MCInstrAnalysis); - TargetRegistry::RegisterMCInstrAnalysis(TheAArch64beTarget, - createAArch64MCInstrAnalysis); + // Register the asm backend. 
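
The replacement createMCStreamer above dispatches on object format: Darwin targets get a Mach-O object streamer with label sections enabled, and everything else is assumed to be ELF. A minimal sketch of that dispatch (illustrative types, not the LLVM streamer API):

#include <memory>

// Stand-in for the streamer selection above.
struct ObjectStreamer { bool IsMachO = false; bool LabelSections = false; };

std::unique_ptr<ObjectStreamer> createStreamerFor(bool IsDarwin) {
  auto S = std::make_unique<ObjectStreamer>();
  if (IsDarwin) {
    S->IsMachO = true;
    S->LabelSections = true; // mirrors createMachOStreamer(..., /*LabelSections*/true)
  }
  return S; // otherwise the AArch64 ELF streamer path is taken
}

The registration block that follows then wires each factory up four times, once per target handle: the original AArch64 little/big-endian targets plus the merged ARM64 little/big-endian aliases.
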
+ TargetRegistry::RegisterMCAsmBackend(TheAArch64leTarget, + createAArch64leAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget, + createAArch64beAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheARM64leTarget, + createAArch64leAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheARM64beTarget, + createAArch64beAsmBackend); // Register the MC Code Emitter TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget, createAArch64MCCodeEmitter); TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget, createAArch64MCCodeEmitter); - - // Register the asm backend. - TargetRegistry::RegisterMCAsmBackend(TheAArch64leTarget, - createAArch64leAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget, - createAArch64beAsmBackend); + TargetRegistry::RegisterMCCodeEmitter(TheARM64leTarget, + createAArch64MCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheARM64beTarget, + createAArch64MCCodeEmitter); // Register the object streamer. TargetRegistry::RegisterMCObjectStreamer(TheAArch64leTarget, createMCStreamer); TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget, createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheARM64leTarget, createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheARM64beTarget, createMCStreamer); // Register the MCInstPrinter. TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget, createAArch64MCInstPrinter); TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget, createAArch64MCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheARM64leTarget, + createAArch64MCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheARM64beTarget, + createAArch64MCInstPrinter); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index bd8beaf..d886ea2 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -11,18 +11,19 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64MCTARGETDESC_H -#define LLVM_AARCH64MCTARGETDESC_H +#ifndef AArch64MCTARGETDESC_H +#define AArch64MCTARGETDESC_H #include "llvm/Support/DataTypes.h" +#include namespace llvm { class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCInstrInfo; -class MCObjectWriter; class MCRegisterInfo; +class MCObjectWriter; class MCSubtargetInfo; class StringRef; class Target; @@ -30,28 +31,25 @@ class raw_ostream; extern Target TheAArch64leTarget; extern Target TheAArch64beTarget; - -namespace AArch64_MC { - MCSubtargetInfo *createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS); -} +extern Target TheARM64leTarget; +extern Target TheARM64beTarget; MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx); + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx); +MCAsmBackend *createAArch64leAsmBackend(const Target &T, + const MCRegisterInfo &MRI, StringRef TT, + StringRef CPU); +MCAsmBackend *createAArch64beAsmBackend(const Target &T, + const MCRegisterInfo &MRI, StringRef TT, + StringRef CPU); -MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, - uint8_t OSABI, +MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI, bool IsLittleEndian); -MCAsmBackend *createAArch64leAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); - -MCAsmBackend 
*createAArch64beAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); +MCObjectWriter *createAArch64MachObjectWriter(raw_ostream &OS, uint32_t CPUType, + uint32_t CPUSubtype); } // End llvm namespace diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp new file mode 100644 index 0000000..5c86189 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -0,0 +1,396 @@ +//===-- AArch64MachObjectWriter.cpp - ARM Mach Object Writer --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCValue.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" +using namespace llvm; + +namespace { +class AArch64MachObjectWriter : public MCMachObjectTargetWriter { + bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType, + const MCSymbolRefExpr *Sym, + unsigned &Log2Size, const MCAssembler &Asm); + +public: + AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype) + : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype, + /*UseAggressiveSymbolFolding=*/true) {} + + void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm, + const MCAsmLayout &Layout, const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) override; +}; +} + +bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( + const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym, + unsigned &Log2Size, const MCAssembler &Asm) { + RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED); + Log2Size = ~0U; + + switch ((unsigned)Fixup.getKind()) { + default: + return false; + + case FK_Data_1: + Log2Size = llvm::Log2_32(1); + return true; + case FK_Data_2: + Log2Size = llvm::Log2_32(2); + return true; + case FK_Data_4: + Log2Size = llvm::Log2_32(4); + if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) + RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); + return true; + case FK_Data_8: + Log2Size = llvm::Log2_32(8); + if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) + RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); + return true; + case AArch64::fixup_aarch64_add_imm12: + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + Log2Size = llvm::Log2_32(4); + switch (Sym->getKind()) { + default: + assert(0 && "Unexpected symbol reference variant kind!"); + case MCSymbolRefExpr::VK_PAGEOFF: + RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12); + return true; + case MCSymbolRefExpr::VK_GOTPAGEOFF: + RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12); + return true; + case MCSymbolRefExpr::VK_TLVPPAGEOFF: + RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12); + return true; + } + case 
AArch64::fixup_aarch64_pcrel_adrp_imm21: + Log2Size = llvm::Log2_32(4); + // This encompasses the relocation for the whole 21-bit value. + switch (Sym->getKind()) { + default: + Asm.getContext().FatalError(Fixup.getLoc(), + "ADR/ADRP relocations must be GOT relative"); + case MCSymbolRefExpr::VK_PAGE: + RelocType = unsigned(MachO::ARM64_RELOC_PAGE21); + return true; + case MCSymbolRefExpr::VK_GOTPAGE: + RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21); + return true; + case MCSymbolRefExpr::VK_TLVPPAGE: + RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21); + return true; + } + return true; + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + Log2Size = llvm::Log2_32(4); + RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26); + return true; + } +} + +void AArch64MachObjectWriter::RecordRelocation( + MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) { + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + + // See . + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment); + unsigned Log2Size = 0; + int64_t Value = 0; + unsigned Index = 0; + unsigned IsExtern = 0; + unsigned Type = 0; + unsigned Kind = Fixup.getKind(); + + FixupOffset += Fixup.getOffset(); + + // AArch64 pcrel relocation addends do not include the section offset. + if (IsPCRel) + FixedValue += FixupOffset; + + // ADRP fixups use relocations for the whole symbol value and only + // put the addend in the instruction itself. Clear out any value the + // generic code figured out from the sybmol definition. + if (Kind == AArch64::fixup_aarch64_pcrel_adrp_imm21) + FixedValue = 0; + + // imm19 relocations are for conditional branches, which require + // assembler local symbols. If we got here, that's not what we have, + // so complain loudly. + if (Kind == AArch64::fixup_aarch64_pcrel_branch19) { + Asm.getContext().FatalError(Fixup.getLoc(), + "conditional branch requires assembler-local" + " label. '" + + Target.getSymA()->getSymbol().getName() + + "' is external."); + return; + } + + // 14-bit branch relocations should only target internal labels, and so + // should never get here. + if (Kind == AArch64::fixup_aarch64_pcrel_branch14) { + Asm.getContext().FatalError(Fixup.getLoc(), + "Invalid relocation on conditional branch!"); + return; + } + + if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size, + Asm)) { + Asm.getContext().FatalError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); + return; + } + + Value = Target.getConstant(); + + if (Target.isAbsolute()) { // constant + // FIXME: Should this always be extern? + // SymbolNum of 0 indicates the absolute section. + Type = MachO::ARM64_RELOC_UNSIGNED; + Index = 0; + + if (IsPCRel) { + IsExtern = 1; + Asm.getContext().FatalError(Fixup.getLoc(), + "PC relative absolute relocation!"); + + // FIXME: x86_64 sets the type to a branch reloc here. Should we do + // something similar? + } + } else if (Target.getSymB()) { // A - B + constant + const MCSymbol *A = &Target.getSymA()->getSymbol(); + const MCSymbolData &A_SD = Asm.getSymbolData(*A); + const MCSymbolData *A_Base = Asm.getAtom(&A_SD); + + const MCSymbol *B = &Target.getSymB()->getSymbol(); + const MCSymbolData &B_SD = Asm.getSymbolData(*B); + const MCSymbolData *B_Base = Asm.getAtom(&B_SD); + + // Check for "_foo@got - .", which comes through here as: + // Ltmp0: + // ... 
_foo@got - Ltmp0 + if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT && + Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None && + Layout.getSymbolOffset(&B_SD) == + Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) { + // SymB is the PC, so use a PC-rel pointer-to-GOT relocation. + Index = A_Base->getIndex(); + IsExtern = 1; + Type = MachO::ARM64_RELOC_POINTER_TO_GOT; + IsPCRel = 1; + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); + return; + } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) + // Otherwise, neither symbol can be modified. + Asm.getContext().FatalError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + + // We don't support PCrel relocations of differences. + if (IsPCRel) + Asm.getContext().FatalError(Fixup.getLoc(), + "unsupported pc-relative relocation of " + "difference"); + + // AArch64 always uses external relocations. If there is no symbol to use as + // a base address (a local symbol with no preceding non-local symbol), + // error out. + // + // FIXME: We should probably just synthesize an external symbol and use + // that. + if (!A_Base) + Asm.getContext().FatalError( + Fixup.getLoc(), + "unsupported relocation of local symbol '" + A->getName() + + "'. Must have non-local symbol earlier in section."); + if (!B_Base) + Asm.getContext().FatalError( + Fixup.getLoc(), + "unsupported relocation of local symbol '" + B->getName() + + "'. Must have non-local symbol earlier in section."); + + if (A_Base == B_Base && A_Base) + Asm.getContext().FatalError(Fixup.getLoc(), + "unsupported relocation with identical base"); + + Value += (!A_SD.getFragment() ? 0 + : Writer->getSymbolAddress(&A_SD, Layout)) - + (!A_Base || !A_Base->getFragment() + ? 0 + : Writer->getSymbolAddress(A_Base, Layout)); + Value -= (!B_SD.getFragment() ? 0 + : Writer->getSymbolAddress(&B_SD, Layout)) - + (!B_Base || !B_Base->getFragment() + ? 0 + : Writer->getSymbolAddress(B_Base, Layout)); + + Index = A_Base->getIndex(); + IsExtern = 1; + Type = MachO::ARM64_RELOC_UNSIGNED; + + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); + + Index = B_Base->getIndex(); + IsExtern = 1; + Type = MachO::ARM64_RELOC_SUBTRACTOR; + } else { // A + constant + const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); + const MCSymbolData &SD = Asm.getSymbolData(*Symbol); + const MCSymbolData *Base = Asm.getAtom(&SD); + const MCSectionMachO &Section = static_cast( + Fragment->getParent()->getSection()); + + // If the symbol is a variable and we weren't able to get a Base for it + // (i.e., it's not in the symbol table associated with a section) resolve + // the relocation based its expansion instead. + if (Symbol->isVariable() && !Base) { + // If the evaluation is an absolute value, just use that directly + // to keep things easy. + int64_t Res; + if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + + // FIXME: Will the Target we already have ever have any data in it + // we need to preserve and merge with the new Target? How about + // the FixedValue? 
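
Every relocation record emitted above ends in the same bit packing of Mach-O's relocation_info second word. Factored out as a standalone helper for reference:

#include <cstdint>

// r_word1 layout used throughout RecordRelocation above: 24-bit symbol or
// section index, then pcrel (1 bit), log2 size (2 bits), extern (1 bit),
// and type (4 bits).
uint32_t packRelocWord1(uint32_t Index, bool IsPCRel, uint32_t Log2Size,
                        bool IsExtern, uint32_t Type) {
  return (Index << 0) | (uint32_t(IsPCRel) << 24) | (Log2Size << 25) |
         (uint32_t(IsExtern) << 27) | (Type << 28);
}
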
+ if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout)) + Asm.getContext().FatalError(Fixup.getLoc(), + "unable to resolve variable '" + + Symbol->getName() + "'"); + return RecordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + } + + // Relocations inside debug sections always use local relocations when + // possible. This seems to be done because the debugger doesn't fully + // understand relocation entries and expects to find values that + // have already been fixed up. + if (Symbol->isInSection()) { + if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) + Base = nullptr; + } + + // AArch64 uses external relocations as much as possible. For debug + // sections, and for pointer-sized relocations (.quad), we allow section + // relocations. It's code sections that run into trouble. + if (Base) { + Index = Base->getIndex(); + IsExtern = 1; + + // Add the local offset, if needed. + if (Base != &SD) + Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base); + } else if (Symbol->isInSection()) { + // Pointer-sized relocations can use a local relocation. Otherwise, + // we have to be in a debug info section. + if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3) + Asm.getContext().FatalError( + Fixup.getLoc(), + "unsupported relocation of local symbol '" + Symbol->getName() + + "'. Must have non-local symbol earlier in section."); + // Adjust the relocation to be section-relative. + // The index is the section ordinal (1-based). + const MCSectionData &SymSD = + Asm.getSectionData(SD.getSymbol().getSection()); + Index = SymSD.getOrdinal() + 1; + IsExtern = 0; + Value += Writer->getSymbolAddress(&SD, Layout); + + if (IsPCRel) + Value -= Writer->getFragmentAddress(Fragment, Layout) + + Fixup.getOffset() + (1ULL << Log2Size); + } else { + // Resolve constant variables. + if (SD.getSymbol().isVariable()) { + int64_t Res; + if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + } + Asm.getContext().FatalError(Fixup.getLoc(), + "unsupported relocation of variable '" + + Symbol->getName() + "'"); + } + } + + // If the relocation kind is Branch26, Page21, or Pageoff12, any addend + // is represented via an Addend relocation, not encoded directly into + // the instruction. + if ((Type == MachO::ARM64_RELOC_BRANCH26 || + Type == MachO::ARM64_RELOC_PAGE21 || + Type == MachO::ARM64_RELOC_PAGEOFF12) && + Value) { + assert((Value & 0xff000000) == 0 && "Added relocation out of range!"); + + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); + + // Now set up the Addend relocation. + Type = MachO::ARM64_RELOC_ADDEND; + Index = Value; + IsPCRel = 0; + Log2Size = 2; + IsExtern = 0; + + // Put zero into the instruction itself. The addend is in the relocation. + Value = 0; + } + + // If there's any addend left to handle, encode it in the instruction. 
+ FixedValue = Value; + + // struct relocation_info (8 bytes) + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); +} + +MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_ostream &OS, + uint32_t CPUType, + uint32_t CPUSubtype) { + return createMachObjectWriter( + new AArch64MachObjectWriter(CPUType, CPUSubtype), OS, + /*IsLittleEndian=*/true); +} diff --git a/lib/Target/AArch64/MCTargetDesc/Android.mk b/lib/Target/AArch64/MCTargetDesc/Android.mk index edcf1f2..c0cdb2b 100644 --- a/lib/Target/AArch64/MCTargetDesc/Android.mk +++ b/lib/Target/AArch64/MCTargetDesc/Android.mk @@ -10,6 +10,7 @@ arm64_mc_desc_SRC_FILES := \ AArch64AsmBackend.cpp \ AArch64ELFObjectWriter.cpp \ AArch64ELFStreamer.cpp \ + AArch64MachObjectWriter.cpp \ AArch64MCAsmInfo.cpp \ AArch64MCCodeEmitter.cpp \ AArch64MCExpr.cpp \ diff --git a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt index 54c4465..7d5bced 100644 --- a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt @@ -6,4 +6,9 @@ add_llvm_library(LLVMAArch64Desc AArch64MCCodeEmitter.cpp AArch64MCExpr.cpp AArch64MCTargetDesc.cpp - ) + AArch64MachObjectWriter.cpp +) +add_dependencies(LLVMAArch64Desc AArch64CommonTableGen) + +# Hack: we need to include 'main' target directory to grab private headers +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..) diff --git a/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt b/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt index 37c8035..70cff0b 100644 --- a/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt ----------*- Conf -*--===; +;===- ./lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; diff --git a/lib/Target/AArch64/Makefile b/lib/Target/AArch64/Makefile index 641bb83..f356c58 100644 --- a/lib/Target/AArch64/Makefile +++ b/lib/Target/AArch64/Makefile @@ -12,19 +12,14 @@ LIBRARYNAME = LLVMAArch64CodeGen TARGET = AArch64 # Make sure that tblgen is run, first thing. -BUILT_SOURCES = AArch64GenAsmMatcher.inc \ - AArch64GenAsmWriter.inc \ - AArch64GenCallingConv.inc \ - AArch64GenDAGISel.inc \ - AArch64GenDisassemblerTables.inc \ - AArch64GenInstrInfo.inc \ - AArch64GenMCCodeEmitter.inc \ - AArch64GenMCPseudoLowering.inc \ - AArch64GenRegisterInfo.inc \ - AArch64GenSubtargetInfo.inc +BUILT_SOURCES = AArch64GenRegisterInfo.inc AArch64GenInstrInfo.inc \ + AArch64GenAsmWriter.inc AArch64GenAsmWriter1.inc \ + AArch64GenDAGISel.inc \ + AArch64GenCallingConv.inc AArch64GenAsmMatcher.inc \ + AArch64GenSubtargetInfo.inc AArch64GenMCCodeEmitter.inc \ + AArch64GenFastISel.inc AArch64GenDisassemblerTables.inc \ + AArch64GenMCPseudoLowering.inc -DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc Utils +DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc Utils include $(LEVEL)/Makefile.common - - diff --git a/lib/Target/AArch64/README.txt b/lib/Target/AArch64/README.txt deleted file mode 100644 index 601990f..0000000 --- a/lib/Target/AArch64/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -This file will contain changes that need to be made before AArch64 can become an -officially supported target. Currently a placeholder. 
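A note for readers tracing RecordRelocation above: every record it emits packs the same r_word1 layout (a 24-bit symbol/section index, a pc-rel bit, two bits of log2 fixup size, an extern bit, and a 4-bit type), and a Branch26/Page21/Pageoff12 relocation with a non-zero addend is paired with an ARM64_RELOC_ADDEND record carrying the addend in its index field; in the object file the ADDEND record immediately precedes the record it modifies. The sketch below is illustrative only and not part of this patch; packWord1, the symbol index 7, and the +20 addend are invented for the example, and the type values 2 and 10 are assumed to be the MachO ARM64_RELOC_BRANCH26 and ARM64_RELOC_ADDEND enumerators from the system header.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Illustrative only: packWord1 mirrors the r_word1 expression used
// throughout RecordRelocation above.
static uint32_t packWord1(uint32_t Index, unsigned IsPCRel, unsigned Log2Size,
                          unsigned IsExtern, unsigned Type) {
  return (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
         (IsExtern << 27) | (Type << 28);
}

int main() {
  // Hypothetical "bl _foo+20": the +20 cannot be encoded in the BRANCH26
  // instruction word, so it travels as an ARM64_RELOC_ADDEND record (type
  // 10, addend in the index field) preceding the external, pc-relative
  // ARM64_RELOC_BRANCH26 record (type 2) against, say, symbol number 7.
  uint32_t Addend = packWord1(/*Index=*/20, /*IsPCRel=*/0, /*Log2Size=*/2,
                              /*IsExtern=*/0, /*Type=*/10);
  uint32_t Branch = packWord1(/*Index=*/7, /*IsPCRel=*/1, /*Log2Size=*/2,
                              /*IsExtern=*/1, /*Type=*/2);
  assert((Addend >> 28) == 10 && (Branch >> 28) == 2);
  printf("ADDEND   r_word1 = 0x%08x\n", Addend); // 0xa4000014
  printf("BRANCH26 r_word1 = 0x%08x\n", Branch); // 0x2d000007
  return 0;
}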
diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp index 9281e4e..3a382c1 100644 --- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp +++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp @@ -1,4 +1,4 @@ -//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -------------===// +//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -----------------===// // // The LLVM Compiler Infrastructure // @@ -6,22 +6,26 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file contains the key registration step for the architecture. -// -//===----------------------------------------------------------------------===// -#include "AArch64.h" -#include "llvm/IR/Module.h" +#include "llvm/ADT/Triple.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; -Target llvm::TheAArch64leTarget; -Target llvm::TheAArch64beTarget; +namespace llvm { +Target TheAArch64leTarget; +Target TheAArch64beTarget; +Target TheARM64leTarget; +Target TheARM64beTarget; +} // end namespace llvm extern "C" void LLVMInitializeAArch64TargetInfo() { - RegisterTarget<Triple::aarch64, /*HasJIT=*/true> - X(TheAArch64leTarget, "aarch64", "AArch64 (ARM 64-bit little endian target)"); - RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> - Y(TheAArch64beTarget, "aarch64_be", "AArch64 (ARM 64-bit big endian target)"); + RegisterTarget<Triple::arm64, /*HasJIT=*/true> X(TheARM64leTarget, "arm64", + "AArch64 (little endian)"); + RegisterTarget<Triple::arm64_be, /*HasJIT=*/true> Y(TheARM64beTarget, "arm64_be", + "AArch64 (big endian)"); + + RegisterTarget<Triple::aarch64, /*HasJIT=*/true> Z( + TheAArch64leTarget, "aarch64", "AArch64 (little endian)"); + RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> W( + TheAArch64beTarget, "aarch64_be", "AArch64 (big endian)"); } diff --git a/lib/Target/AArch64/TargetInfo/CMakeLists.txt b/lib/Target/AArch64/TargetInfo/CMakeLists.txt index ee734c6..e236eed 100644 --- a/lib/Target/AArch64/TargetInfo/CMakeLists.txt +++ b/lib/Target/AArch64/TargetInfo/CMakeLists.txt @@ -1,3 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/..
) + add_llvm_library(LLVMAArch64Info AArch64TargetInfo.cpp ) + +add_dependencies(LLVMAArch64Info AArch64CommonTableGen) diff --git a/lib/Target/AArch64/TargetInfo/LLVMBuild.txt b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt index 6429172..93c5407 100644 --- a/lib/Target/AArch64/TargetInfo/LLVMBuild.txt +++ b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AArch64/TargetInfo/LLVMBuild.txt ------------*- Conf -*--===; +;===- ./lib/Target/AArch64/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index 2a97cd6..3c24bb3 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -18,7 +18,7 @@ using namespace llvm; -StringRef NamedImmMapper::toString(uint32_t Value, bool &Valid) const { +StringRef AArch64NamedImmMapper::toString(uint32_t Value, bool &Valid) const { for (unsigned i = 0; i < NumPairs; ++i) { if (Pairs[i].Value == Value) { Valid = true; @@ -30,7 +30,7 @@ StringRef NamedImmMapper::toString(uint32_t Value, bool &Valid) const { return StringRef(); } -uint32_t NamedImmMapper::fromString(StringRef Name, bool &Valid) const { +uint32_t AArch64NamedImmMapper::fromString(StringRef Name, bool &Valid) const { std::string LowerCaseName = Name.lower(); for (unsigned i = 0; i < NumPairs; ++i) { if (Pairs[i].Name == LowerCaseName) { @@ -43,11 +43,11 @@ uint32_t NamedImmMapper::fromString(StringRef Name, bool &Valid) const { return -1; } -bool NamedImmMapper::validImm(uint32_t Value) const { +bool AArch64NamedImmMapper::validImm(uint32_t Value) const { return Value < TooBigImm; } -const NamedImmMapper::Mapping A64AT::ATMapper::ATPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATPairs[] = { {"s1e1r", S1E1R}, {"s1e2r", S1E2R}, {"s1e3r", S1E3R}, @@ -62,10 +62,10 @@ const NamedImmMapper::Mapping A64AT::ATMapper::ATPairs[] = { {"s12e0w", S12E0W}, }; -A64AT::ATMapper::ATMapper() - : NamedImmMapper(ATPairs, 0) {} +AArch64AT::ATMapper::ATMapper() + : AArch64NamedImmMapper(ATPairs, 0) {} -const NamedImmMapper::Mapping A64DB::DBarrierMapper::DBarrierPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierPairs[] = { {"oshld", OSHLD}, {"oshst", OSHST}, {"osh", OSH}, @@ -80,10 +80,10 @@ const NamedImmMapper::Mapping A64DB::DBarrierMapper::DBarrierPairs[] = { {"sy", SY} }; -A64DB::DBarrierMapper::DBarrierMapper() - : NamedImmMapper(DBarrierPairs, 16u) {} +AArch64DB::DBarrierMapper::DBarrierMapper() + : AArch64NamedImmMapper(DBarrierPairs, 16u) {} -const NamedImmMapper::Mapping A64DC::DCMapper::DCPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCPairs[] = { {"zva", ZVA}, {"ivac", IVAC}, {"isw", ISW}, @@ -94,26 +94,26 @@ const NamedImmMapper::Mapping A64DC::DCMapper::DCPairs[] = { {"cisw", CISW} }; -A64DC::DCMapper::DCMapper() - : NamedImmMapper(DCPairs, 0) {} +AArch64DC::DCMapper::DCMapper() + : AArch64NamedImmMapper(DCPairs, 0) {} -const NamedImmMapper::Mapping A64IC::ICMapper::ICPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICPairs[] = { {"ialluis", IALLUIS}, {"iallu", IALLU}, {"ivau", IVAU} }; -A64IC::ICMapper::ICMapper() - : NamedImmMapper(ICPairs, 0) {} +AArch64IC::ICMapper::ICMapper() + : AArch64NamedImmMapper(ICPairs, 0) {} -const NamedImmMapper::Mapping A64ISB::ISBMapper::ISBPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBPairs[] = { {"sy", SY}, }; 
-A64ISB::ISBMapper::ISBMapper() - : NamedImmMapper(ISBPairs, 16) {} +AArch64ISB::ISBMapper::ISBMapper() + : AArch64NamedImmMapper(ISBPairs, 16) {} -const NamedImmMapper::Mapping A64PRFM::PRFMMapper::PRFMPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMPairs[] = { {"pldl1keep", PLDL1KEEP}, {"pldl1strm", PLDL1STRM}, {"pldl2keep", PLDL2KEEP}, @@ -134,19 +134,19 @@ const NamedImmMapper::Mapping A64PRFM::PRFMMapper::PRFMPairs[] = { {"pstl3strm", PSTL3STRM} }; -A64PRFM::PRFMMapper::PRFMMapper() - : NamedImmMapper(PRFMPairs, 32) {} +AArch64PRFM::PRFMMapper::PRFMMapper() + : AArch64NamedImmMapper(PRFMPairs, 32) {} -const NamedImmMapper::Mapping A64PState::PStateMapper::PStatePairs[] = { +const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStatePairs[] = { {"spsel", SPSel}, {"daifset", DAIFSet}, {"daifclr", DAIFClr} }; -A64PState::PStateMapper::PStateMapper() - : NamedImmMapper(PStatePairs, 0) {} +AArch64PState::PStateMapper::PStateMapper() + : AArch64NamedImmMapper(PStatePairs, 0) {} -const NamedImmMapper::Mapping A64SysReg::MRSMapper::MRSPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSPairs[] = { {"mdccsr_el0", MDCCSR_EL0}, {"dbgdtrrx_el0", DBGDTRRX_EL0}, {"mdrar_el1", MDRAR_EL1}, @@ -176,16 +176,16 @@ const NamedImmMapper::Mapping A64SysReg::MRSMapper::MRSPairs[] = { {"id_isar3_el1", ID_ISAR3_EL1}, {"id_isar4_el1", ID_ISAR4_EL1}, {"id_isar5_el1", ID_ISAR5_EL1}, - {"id_aa64pfr0_el1", ID_AA64PFR0_EL1}, - {"id_aa64pfr1_el1", ID_AA64PFR1_EL1}, - {"id_aa64dfr0_el1", ID_AA64DFR0_EL1}, - {"id_aa64dfr1_el1", ID_AA64DFR1_EL1}, - {"id_aa64afr0_el1", ID_AA64AFR0_EL1}, - {"id_aa64afr1_el1", ID_AA64AFR1_EL1}, - {"id_aa64isar0_el1", ID_AA64ISAR0_EL1}, - {"id_aa64isar1_el1", ID_AA64ISAR1_EL1}, - {"id_aa64mmfr0_el1", ID_AA64MMFR0_EL1}, - {"id_aa64mmfr1_el1", ID_AA64MMFR1_EL1}, + {"id_aa64pfr0_el1", ID_A64PFR0_EL1}, + {"id_aa64pfr1_el1", ID_A64PFR1_EL1}, + {"id_aa64dfr0_el1", ID_A64DFR0_EL1}, + {"id_aa64dfr1_el1", ID_A64DFR1_EL1}, + {"id_aa64afr0_el1", ID_A64AFR0_EL1}, + {"id_aa64afr1_el1", ID_A64AFR1_EL1}, + {"id_aa64isar0_el1", ID_A64ISAR0_EL1}, + {"id_aa64isar1_el1", ID_A64ISAR1_EL1}, + {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1}, + {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1}, {"mvfr0_el1", MVFR0_EL1}, {"mvfr1_el1", MVFR1_EL1}, {"mvfr2_el1", MVFR2_EL1}, @@ -245,12 +245,13 @@ const NamedImmMapper::Mapping A64SysReg::MRSMapper::MRSPairs[] = { {"ich_elsr_el2", ICH_ELSR_EL2} }; -A64SysReg::MRSMapper::MRSMapper() { +AArch64SysReg::MRSMapper::MRSMapper(uint64_t FeatureBits) + : SysRegMapper(FeatureBits) { InstPairs = &MRSPairs[0]; NumInstPairs = llvm::array_lengthof(MRSPairs); } -const NamedImmMapper::Mapping A64SysReg::MSRMapper::MSRPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRPairs[] = { {"dbgdtrtx_el0", DBGDTRTX_EL0}, {"oslar_el1", OSLAR_EL1}, {"pmswinc_el0", PMSWINC_EL0}, @@ -268,13 +269,14 @@ const NamedImmMapper::Mapping A64SysReg::MSRMapper::MSRPairs[] = { {"icc_sgi0r_el1", ICC_SGI0R_EL1} }; -A64SysReg::MSRMapper::MSRMapper() { +AArch64SysReg::MSRMapper::MSRMapper(uint64_t FeatureBits) + : SysRegMapper(FeatureBits) { InstPairs = &MSRPairs[0]; NumInstPairs = llvm::array_lengthof(MSRPairs); } -const NamedImmMapper::Mapping A64SysReg::SysRegMapper::SysRegPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegPairs[] = { {"osdtrrx_el1", OSDTRRX_EL1}, {"osdtrtx_el1", OSDTRTX_EL1}, {"teecr32_el1", TEECR32_EL1}, @@ -753,10 +755,16 @@ const NamedImmMapper::Mapping 
A64SysReg::SysRegMapper::SysRegPairs[] = { {"ich_lr15_el2", ICH_LR15_EL2} }; +const AArch64NamedImmMapper::Mapping +AArch64SysReg::SysRegMapper::CycloneSysRegPairs[] = { + {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3} +}; + uint32_t -A64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { - // First search the registers shared by all +AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { std::string NameLower = Name.lower(); + + // First search the registers shared by all for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { if (SysRegPairs[i].Name == NameLower) { Valid = true; @@ -764,6 +772,16 @@ A64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { } } + // Next search for target specific registers + if (FeatureBits & AArch64::ProcCyclone) { + for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { + if (CycloneSysRegPairs[i].Name == NameLower) { + Valid = true; + return CycloneSysRegPairs[i].Value; + } + } + } + // Now try the instruction-specific registers (either read-only or // write-only). for (unsigned i = 0; i < NumInstPairs; ++i) { @@ -796,7 +814,8 @@ A64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { } std::string -A64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { +AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { + // First search the registers shared by all for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { if (SysRegPairs[i].Value == Bits) { Valid = true; @@ -804,6 +823,18 @@ A64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { } } + // Next search for target specific registers + if (FeatureBits & AArch64::ProcCyclone) { + for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { + if (CycloneSysRegPairs[i].Value == Bits) { + Valid = true; + return CycloneSysRegPairs[i].Name; + } + } + } + + // Now try the instruction-specific registers (either read-only or + // write-only). for (unsigned i = 0; i < NumInstPairs; ++i) { if (InstPairs[i].Value == Bits) { Valid = true; @@ -831,7 +862,7 @@ A64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { + "_c" + utostr(CRm) + "_" + utostr(Op2); } -const NamedImmMapper::Mapping A64TLBI::TLBIMapper::TLBIPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIPairs[] = { {"ipas2e1is", IPAS2E1IS}, {"ipas2le1is", IPAS2LE1IS}, {"vmalle1is", VMALLE1IS}, @@ -866,308 +897,5 @@ const NamedImmMapper::Mapping A64TLBI::TLBIMapper::TLBIPairs[] = { {"vaale1", VAALE1} }; -A64TLBI::TLBIMapper::TLBIMapper() - : NamedImmMapper(TLBIPairs, 0) {} - -bool A64Imms::isFPImm(const APFloat &Val, uint32_t &Imm8Bits) { - const fltSemantics &Sem = Val.getSemantics(); - unsigned FracBits = APFloat::semanticsPrecision(Sem) - 1; - - uint32_t ExpMask; - switch (FracBits) { - case 10: // IEEE half-precision - ExpMask = 0x1f; - break; - case 23: // IEEE single-precision - ExpMask = 0xff; - break; - case 52: // IEEE double-precision - ExpMask = 0x7ff; - break; - case 112: // IEEE quad-precision - // No immediates are valid for double precision. 
- return false; - default: - llvm_unreachable("Only half, single and double precision supported"); - } - - uint32_t ExpStart = FracBits; - uint64_t FracMask = (1ULL << FracBits) - 1; - - uint32_t Sign = Val.isNegative(); - - uint64_t Bits= Val.bitcastToAPInt().getLimitedValue(); - uint64_t Fraction = Bits & FracMask; - int32_t Exponent = ((Bits >> ExpStart) & ExpMask); - Exponent -= ExpMask >> 1; - - // S[d] = imm8<7>:NOT(imm8<6>):Replicate(imm8<6>, 5):imm8<5:0>:Zeros(19) - // D[d] = imm8<7>:NOT(imm8<6>):Replicate(imm8<6>, 8):imm8<5:0>:Zeros(48) - // This translates to: only 4 bits of fraction; -3 <= exp <= 4. - uint64_t A64FracStart = FracBits - 4; - uint64_t A64FracMask = 0xf; - - // Are there too many fraction bits? - if (Fraction & ~(A64FracMask << A64FracStart)) - return false; - - if (Exponent < -3 || Exponent > 4) - return false; - - uint32_t PackedFraction = (Fraction >> A64FracStart) & A64FracMask; - uint32_t PackedExp = (Exponent + 7) & 0x7; - - Imm8Bits = (Sign << 7) | (PackedExp << 4) | PackedFraction; - return true; -} - -// Encoding of the immediate for logical (immediate) instructions: -// -// | N | imms | immr | size | R | S | -// |---+--------+--------+------+--------------+--------------| -// | 1 | ssssss | rrrrrr | 64 | UInt(rrrrrr) | UInt(ssssss) | -// | 0 | 0sssss | xrrrrr | 32 | UInt(rrrrr) | UInt(sssss) | -// | 0 | 10ssss | xxrrrr | 16 | UInt(rrrr) | UInt(ssss) | -// | 0 | 110sss | xxxrrr | 8 | UInt(rrr) | UInt(sss) | -// | 0 | 1110ss | xxxxrr | 4 | UInt(rr) | UInt(ss) | -// | 0 | 11110s | xxxxxr | 2 | UInt(r) | UInt(s) | -// | 0 | 11111x | - | | UNALLOCATED | | -// -// Columns 'R', 'S' and 'size' specify a "bitmask immediate" of size bits in -// which the lower S+1 bits are ones and the remaining bits are zero, then -// rotated right by R bits, which is then replicated across the datapath. -// -// + Values of 'N', 'imms' and 'immr' which do not match the above table are -// RESERVED. -// + If all 's' bits in the imms field are set then the instruction is -// RESERVED. -// + The 'x' bits in the 'immr' field are IGNORED. - -bool A64Imms::isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits) { - int RepeatWidth; - int Rotation = 0; - int Num1s = 0; - - // Because there are S+1 ones in the replicated mask, an immediate of all - // zeros is not allowed. Filtering it here is probably more efficient. - if (Imm == 0) return false; - - for (RepeatWidth = RegWidth; RepeatWidth > 1; RepeatWidth /= 2) { - uint64_t RepeatMask = RepeatWidth == 64 ? -1 : (1ULL << RepeatWidth) - 1; - uint64_t ReplicatedMask = Imm & RepeatMask; - - if (ReplicatedMask == 0) continue; - - // First we have to make sure the mask is actually repeated in each slot for - // this width-specifier. - bool IsReplicatedMask = true; - for (unsigned i = RepeatWidth; i < RegWidth; i += RepeatWidth) { - if (((Imm >> i) & RepeatMask) != ReplicatedMask) { - IsReplicatedMask = false; - break; - } - } - if (!IsReplicatedMask) continue; - - // Now we have to work out the amount of rotation needed. The first part of - // this calculation is actually independent of RepeatWidth, but the complex - // case will depend on it. - Rotation = countTrailingZeros(Imm); - if (Rotation == 0) { - // There were no leading zeros, which means it's either in place or there - // are 1s at each end (e.g. 0x8003 needs rotating). - Rotation = RegWidth == 64 ? 
CountLeadingOnes_64(Imm) - : CountLeadingOnes_32(Imm); - Rotation = RepeatWidth - Rotation; - } - - uint64_t ReplicatedOnes = ReplicatedMask; - if (Rotation != 0 && Rotation != 64) - ReplicatedOnes = (ReplicatedMask >> Rotation) - | ((ReplicatedMask << (RepeatWidth - Rotation)) & RepeatMask); - - // Of course, they may not actually be ones, so we have to check that: - if (!isMask_64(ReplicatedOnes)) - continue; - - Num1s = CountTrailingOnes_64(ReplicatedOnes); - - // We know we've got an almost valid encoding (certainly, if this is invalid - // no other parameters would work). - break; - } - - // The encodings which would produce all 1s are RESERVED. - if (RepeatWidth == 1 || Num1s == RepeatWidth) return false; - - uint32_t N = RepeatWidth == 64; - uint32_t ImmR = RepeatWidth - Rotation; - uint32_t ImmS = Num1s - 1; - - switch (RepeatWidth) { - default: break; // No action required for other valid rotations. - case 16: ImmS |= 0x20; break; // 10ssss - case 8: ImmS |= 0x30; break; // 110sss - case 4: ImmS |= 0x38; break; // 1110ss - case 2: ImmS |= 0x3c; break; // 11110s - } - - Bits = ImmS | (ImmR << 6) | (N << 12); - - return true; -} - - -bool A64Imms::isLogicalImmBits(unsigned RegWidth, uint32_t Bits, - uint64_t &Imm) { - uint32_t N = Bits >> 12; - uint32_t ImmR = (Bits >> 6) & 0x3f; - uint32_t ImmS = Bits & 0x3f; - - // N=1 encodes a 64-bit replication and is invalid for the 32-bit - // instructions. - if (RegWidth == 32 && N != 0) return false; - - int Width = 0; - if (N == 1) - Width = 64; - else if ((ImmS & 0x20) == 0) - Width = 32; - else if ((ImmS & 0x10) == 0) - Width = 16; - else if ((ImmS & 0x08) == 0) - Width = 8; - else if ((ImmS & 0x04) == 0) - Width = 4; - else if ((ImmS & 0x02) == 0) - Width = 2; - else { - // ImmS is 0b11111x: UNALLOCATED - return false; - } - - int Num1s = (ImmS & (Width - 1)) + 1; - - // All encodings which would map to -1 (signed) are RESERVED. - if (Num1s == Width) return false; - - int Rotation = (ImmR & (Width - 1)); - uint64_t Mask = (1ULL << Num1s) - 1; - uint64_t WidthMask = Width == 64 ? -1 : (1ULL << Width) - 1; - if (Rotation != 0 && Rotation != 64) - Mask = (Mask >> Rotation) - | ((Mask << (Width - Rotation)) & WidthMask); - - Imm = Mask; - for (unsigned i = 1; i < RegWidth / Width; ++i) { - Mask <<= Width; - Imm |= Mask; - } - - return true; -} - -bool A64Imms::isMOVZImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift) { - // If high bits are set then a 32-bit MOVZ can't possibly work. - if (RegWidth == 32 && (Value & ~0xffffffffULL)) - return false; - - for (int i = 0; i < RegWidth; i += 16) { - // If the value is 0 when we mask out all the bits that could be set with - // the current LSL value then it's representable. - if ((Value & ~(0xffffULL << i)) == 0) { - Shift = i / 16; - UImm16 = (Value >> i) & 0xffff; - return true; - } - } - return false; -} - -bool A64Imms::isMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift) { - // MOVN is defined to set its register to NOT(LSL(imm16, shift)). - - // We have to be a little careful about a 32-bit register: 0xffff_1234 *is* - // representable, but ~0xffff_1234 == 0xffff_ffff_0000_edcb which is not - // a valid input for isMOVZImm. - if (RegWidth == 32 && (Value & ~0xffffffffULL)) - return false; - - uint64_t MOVZEquivalent = RegWidth == 32 ? 
~Value & 0xffffffff : ~Value; - - return isMOVZImm(RegWidth, MOVZEquivalent, UImm16, Shift); -} - -bool A64Imms::isOnlyMOVNImm(int RegWidth, uint64_t Value, - int &UImm16, int &Shift) { - if (isMOVZImm(RegWidth, Value, UImm16, Shift)) - return false; - - return isMOVNImm(RegWidth, Value, UImm16, Shift); -} - -// decodeNeonModShiftImm - Decode a Neon OpCmode value into the -// the shift amount and the shift type (shift zeros or ones in) and -// returns whether the OpCmode value implies a shift operation. -bool A64Imms::decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm, - unsigned &ShiftOnesIn) { - ShiftImm = 0; - ShiftOnesIn = false; - bool HasShift = true; - - if (OpCmode == 0xe) { - // movi byte - HasShift = false; - } else if (OpCmode == 0x1e) { - // movi 64-bit bytemask - HasShift = false; - } else if ((OpCmode & 0xc) == 0x8) { - // shift zeros, per halfword - ShiftImm = ((OpCmode & 0x2) >> 1); - } else if ((OpCmode & 0x8) == 0) { - // shift zeros, per word - ShiftImm = ((OpCmode & 0x6) >> 1); - } else if ((OpCmode & 0xe) == 0xc) { - // shift ones, per word - ShiftOnesIn = true; - ShiftImm = (OpCmode & 0x1); - } else { - // per byte, per bytemask - llvm_unreachable("Unsupported Neon modified immediate"); - } - - return HasShift; -} - -// decodeNeonModImm - Decode a NEON modified immediate and OpCmode values -// into the element value and the element size in bits. -uint64_t A64Imms::decodeNeonModImm(unsigned Val, unsigned OpCmode, - unsigned &EltBits) { - uint64_t DecodedVal = Val; - EltBits = 0; - - if (OpCmode == 0xe) { - // movi byte - EltBits = 8; - } else if (OpCmode == 0x1e) { - // movi 64-bit bytemask - DecodedVal = 0; - for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) { - if ((Val >> ByteNum) & 1) - DecodedVal |= (uint64_t)0xff << (8 * ByteNum); - } - EltBits = 64; - } else if ((OpCmode & 0xc) == 0x8) { - // shift zeros, per halfword - EltBits = 16; - } else if ((OpCmode & 0x8) == 0) { - // shift zeros, per word - EltBits = 32; - } else if ((OpCmode & 0xe) == 0xc) { - // shift ones, per word - EltBits = 32; - } else { - llvm_unreachable("Unsupported Neon modified immediate"); - } - return DecodedVal; -} +AArch64TLBI::TLBIMapper::TLBIMapper() + : AArch64NamedImmMapper(TLBIPairs, 0) {} diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 39b042b..9e4c389 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -1,4 +1,4 @@ -//===-- AArch64BaseInfo.h - Top level definitions for AArch64- --*- C++ -*-===// +//===-- AArch64BaseInfo.h - Top level definitions for AArch64 ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -14,96 +14,271 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64_BASEINFO_H -#define LLVM_AARCH64_BASEINFO_H +#ifndef AArch64BASEINFO_H +#define AArch64BASEINFO_H +// FIXME: Is it easiest to fix this layering violation by moving the .inc +// #includes from AArch64MCTargetDesc.h to here? +#include "MCTargetDesc/AArch64MCTargetDesc.h" // For AArch64::X0 and friends. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" namespace llvm { -// // Enums corresponding to AArch64 condition codes -namespace A64CC { - // The CondCodes constants map directly to the 4-bit encoding of the - // condition field for predicated instructions. 
- enum CondCodes { // Meaning (integer) Meaning (floating-point) - EQ = 0, // Equal Equal - NE, // Not equal Not equal, or unordered - HS, // Unsigned higher or same >, ==, or unordered - LO, // Unsigned lower or same Less than - MI, // Minus, negative Less than - PL, // Plus, positive or zero >, ==, or unordered - VS, // Overflow Unordered - VC, // No overflow Ordered - HI, // Unsigned higher Greater than, or unordered - LS, // Unsigned lower or same Less than or equal - GE, // Greater than or equal Greater than or equal - LT, // Less than Less than, or unordered - GT, // Signed greater than Greater than - LE, // Signed less than or equal <, ==, or unordered - AL, // Always (unconditional) Always (unconditional) - NV, // Always (unconditional) Always (unconditional) - // Note the NV exists purely to disassemble 0b1111. Execution - // is "always". - Invalid - }; +inline static unsigned getWRegFromXReg(unsigned Reg) { + switch (Reg) { + case AArch64::X0: return AArch64::W0; + case AArch64::X1: return AArch64::W1; + case AArch64::X2: return AArch64::W2; + case AArch64::X3: return AArch64::W3; + case AArch64::X4: return AArch64::W4; + case AArch64::X5: return AArch64::W5; + case AArch64::X6: return AArch64::W6; + case AArch64::X7: return AArch64::W7; + case AArch64::X8: return AArch64::W8; + case AArch64::X9: return AArch64::W9; + case AArch64::X10: return AArch64::W10; + case AArch64::X11: return AArch64::W11; + case AArch64::X12: return AArch64::W12; + case AArch64::X13: return AArch64::W13; + case AArch64::X14: return AArch64::W14; + case AArch64::X15: return AArch64::W15; + case AArch64::X16: return AArch64::W16; + case AArch64::X17: return AArch64::W17; + case AArch64::X18: return AArch64::W18; + case AArch64::X19: return AArch64::W19; + case AArch64::X20: return AArch64::W20; + case AArch64::X21: return AArch64::W21; + case AArch64::X22: return AArch64::W22; + case AArch64::X23: return AArch64::W23; + case AArch64::X24: return AArch64::W24; + case AArch64::X25: return AArch64::W25; + case AArch64::X26: return AArch64::W26; + case AArch64::X27: return AArch64::W27; + case AArch64::X28: return AArch64::W28; + case AArch64::FP: return AArch64::W29; + case AArch64::LR: return AArch64::W30; + case AArch64::SP: return AArch64::WSP; + case AArch64::XZR: return AArch64::WZR; + } + // For anything else, return it unchanged. 
+ return Reg; +} -} // namespace A64CC +inline static unsigned getXRegFromWReg(unsigned Reg) { + switch (Reg) { + case AArch64::W0: return AArch64::X0; + case AArch64::W1: return AArch64::X1; + case AArch64::W2: return AArch64::X2; + case AArch64::W3: return AArch64::X3; + case AArch64::W4: return AArch64::X4; + case AArch64::W5: return AArch64::X5; + case AArch64::W6: return AArch64::X6; + case AArch64::W7: return AArch64::X7; + case AArch64::W8: return AArch64::X8; + case AArch64::W9: return AArch64::X9; + case AArch64::W10: return AArch64::X10; + case AArch64::W11: return AArch64::X11; + case AArch64::W12: return AArch64::X12; + case AArch64::W13: return AArch64::X13; + case AArch64::W14: return AArch64::X14; + case AArch64::W15: return AArch64::X15; + case AArch64::W16: return AArch64::X16; + case AArch64::W17: return AArch64::X17; + case AArch64::W18: return AArch64::X18; + case AArch64::W19: return AArch64::X19; + case AArch64::W20: return AArch64::X20; + case AArch64::W21: return AArch64::X21; + case AArch64::W22: return AArch64::X22; + case AArch64::W23: return AArch64::X23; + case AArch64::W24: return AArch64::X24; + case AArch64::W25: return AArch64::X25; + case AArch64::W26: return AArch64::X26; + case AArch64::W27: return AArch64::X27; + case AArch64::W28: return AArch64::X28; + case AArch64::W29: return AArch64::FP; + case AArch64::W30: return AArch64::LR; + case AArch64::WSP: return AArch64::SP; + case AArch64::WZR: return AArch64::XZR; + } + // For anything else, return it unchanged. + return Reg; +} -inline static const char *A64CondCodeToString(A64CC::CondCodes CC) { - switch (CC) { - default: llvm_unreachable("Unknown condition code"); - case A64CC::EQ: return "eq"; - case A64CC::NE: return "ne"; - case A64CC::HS: return "hs"; - case A64CC::LO: return "lo"; - case A64CC::MI: return "mi"; - case A64CC::PL: return "pl"; - case A64CC::VS: return "vs"; - case A64CC::VC: return "vc"; - case A64CC::HI: return "hi"; - case A64CC::LS: return "ls"; - case A64CC::GE: return "ge"; - case A64CC::LT: return "lt"; - case A64CC::GT: return "gt"; - case A64CC::LE: return "le"; - case A64CC::AL: return "al"; - case A64CC::NV: return "nv"; +static inline unsigned getBRegFromDReg(unsigned Reg) { + switch (Reg) { + case AArch64::D0: return AArch64::B0; + case AArch64::D1: return AArch64::B1; + case AArch64::D2: return AArch64::B2; + case AArch64::D3: return AArch64::B3; + case AArch64::D4: return AArch64::B4; + case AArch64::D5: return AArch64::B5; + case AArch64::D6: return AArch64::B6; + case AArch64::D7: return AArch64::B7; + case AArch64::D8: return AArch64::B8; + case AArch64::D9: return AArch64::B9; + case AArch64::D10: return AArch64::B10; + case AArch64::D11: return AArch64::B11; + case AArch64::D12: return AArch64::B12; + case AArch64::D13: return AArch64::B13; + case AArch64::D14: return AArch64::B14; + case AArch64::D15: return AArch64::B15; + case AArch64::D16: return AArch64::B16; + case AArch64::D17: return AArch64::B17; + case AArch64::D18: return AArch64::B18; + case AArch64::D19: return AArch64::B19; + case AArch64::D20: return AArch64::B20; + case AArch64::D21: return AArch64::B21; + case AArch64::D22: return AArch64::B22; + case AArch64::D23: return AArch64::B23; + case AArch64::D24: return AArch64::B24; + case AArch64::D25: return AArch64::B25; + case AArch64::D26: return AArch64::B26; + case AArch64::D27: return AArch64::B27; + case AArch64::D28: return AArch64::B28; + case AArch64::D29: return AArch64::B29; + case AArch64::D30: return AArch64::B30; + case AArch64::D31: 
return AArch64::B31; } + // For anything else, return it unchanged. + return Reg; } -inline static A64CC::CondCodes A64StringToCondCode(StringRef CondStr) { - return StringSwitch(CondStr.lower()) - .Case("eq", A64CC::EQ) - .Case("ne", A64CC::NE) - .Case("ne", A64CC::NE) - .Case("hs", A64CC::HS) - .Case("cs", A64CC::HS) - .Case("lo", A64CC::LO) - .Case("cc", A64CC::LO) - .Case("mi", A64CC::MI) - .Case("pl", A64CC::PL) - .Case("vs", A64CC::VS) - .Case("vc", A64CC::VC) - .Case("hi", A64CC::HI) - .Case("ls", A64CC::LS) - .Case("ge", A64CC::GE) - .Case("lt", A64CC::LT) - .Case("gt", A64CC::GT) - .Case("le", A64CC::LE) - .Case("al", A64CC::AL) - .Case("nv", A64CC::NV) - .Default(A64CC::Invalid); + +static inline unsigned getDRegFromBReg(unsigned Reg) { + switch (Reg) { + case AArch64::B0: return AArch64::D0; + case AArch64::B1: return AArch64::D1; + case AArch64::B2: return AArch64::D2; + case AArch64::B3: return AArch64::D3; + case AArch64::B4: return AArch64::D4; + case AArch64::B5: return AArch64::D5; + case AArch64::B6: return AArch64::D6; + case AArch64::B7: return AArch64::D7; + case AArch64::B8: return AArch64::D8; + case AArch64::B9: return AArch64::D9; + case AArch64::B10: return AArch64::D10; + case AArch64::B11: return AArch64::D11; + case AArch64::B12: return AArch64::D12; + case AArch64::B13: return AArch64::D13; + case AArch64::B14: return AArch64::D14; + case AArch64::B15: return AArch64::D15; + case AArch64::B16: return AArch64::D16; + case AArch64::B17: return AArch64::D17; + case AArch64::B18: return AArch64::D18; + case AArch64::B19: return AArch64::D19; + case AArch64::B20: return AArch64::D20; + case AArch64::B21: return AArch64::D21; + case AArch64::B22: return AArch64::D22; + case AArch64::B23: return AArch64::D23; + case AArch64::B24: return AArch64::D24; + case AArch64::B25: return AArch64::D25; + case AArch64::B26: return AArch64::D26; + case AArch64::B27: return AArch64::D27; + case AArch64::B28: return AArch64::D28; + case AArch64::B29: return AArch64::D29; + case AArch64::B30: return AArch64::D30; + case AArch64::B31: return AArch64::D31; + } + // For anything else, return it unchanged. + return Reg; } -inline static A64CC::CondCodes A64InvertCondCode(A64CC::CondCodes CC) { - // It turns out that the condition codes have been designed so that in order - // to reverse the intent of the condition you only have to invert the low bit: +namespace AArch64CC { + +// The CondCodes constants map directly to the 4-bit encoding of the condition +// field for predicated instructions. +enum CondCode { // Meaning (integer) Meaning (floating-point) + EQ = 0x0, // Equal Equal + NE = 0x1, // Not equal Not equal, or unordered + HS = 0x2, // Unsigned higher or same >, ==, or unordered + LO = 0x3, // Unsigned lower Less than + MI = 0x4, // Minus, negative Less than + PL = 0x5, // Plus, positive or zero >, ==, or unordered + VS = 0x6, // Overflow Unordered + VC = 0x7, // No overflow Not unordered + HI = 0x8, // Unsigned higher Greater than, or unordered + LS = 0x9, // Unsigned lower or same Less than or equal + GE = 0xa, // Greater than or equal Greater than or equal + LT = 0xb, // Less than Less than, or unordered + GT = 0xc, // Greater than Greater than + LE = 0xd, // Less than or equal <, ==, or unordered + AL = 0xe, // Always (unconditional) Always (unconditional) + NV = 0xf, // Always (unconditional) Always (unconditional) + // Note the NV exists purely to disassemble 0b1111. Execution is "always". 
+ Invalid +}; - return static_cast(static_cast(CC) ^ 0x1); +inline static const char *getCondCodeName(CondCode Code) { + switch (Code) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return "eq"; + case NE: return "ne"; + case HS: return "hs"; + case LO: return "lo"; + case MI: return "mi"; + case PL: return "pl"; + case VS: return "vs"; + case VC: return "vc"; + case HI: return "hi"; + case LS: return "ls"; + case GE: return "ge"; + case LT: return "lt"; + case GT: return "gt"; + case LE: return "le"; + case AL: return "al"; + case NV: return "nv"; + } +} + +inline static CondCode getInvertedCondCode(CondCode Code) { + switch (Code) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return NE; + case NE: return EQ; + case HS: return LO; + case LO: return HS; + case MI: return PL; + case PL: return MI; + case VS: return VC; + case VC: return VS; + case HI: return LS; + case LS: return HI; + case GE: return LT; + case LT: return GE; + case GT: return LE; + case LE: return GT; + } } +/// Given a condition code, return NZCV flags that would satisfy that condition. +/// The flag bits are in the format expected by the ccmp instructions. +/// Note that many different flag settings can satisfy a given condition code, +/// this function just returns one of them. +inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) { + // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7. + enum { N = 8, Z = 4, C = 2, V = 1 }; + switch (Code) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return Z; // Z == 1 + case NE: return 0; // Z == 0 + case HS: return C; // C == 1 + case LO: return 0; // C == 0 + case MI: return N; // N == 1 + case PL: return 0; // N == 0 + case VS: return V; // V == 1 + case VC: return 0; // V == 0 + case HI: return C; // C == 1 && Z == 0 + case LS: return 0; // C == 0 || Z == 1 + case GE: return 0; // N == V + case LT: return N; // N != V + case GT: return 0; // Z == 0 && N == V + case LE: return Z; // Z == 1 || N != V + } +} +} // end namespace AArch64CC + /// Instances of this class can perform bidirectional mapping from random /// identifier strings to operand encodings. For example "MSR" takes a named /// system-register which must be encoded somehow and decoded for printing. This @@ -115,14 +290,14 @@ inline static A64CC::CondCodes A64InvertCondCode(A64CC::CondCodes CC) { /// out just how often these instructions are emitted before working on it. It /// might even be optimal to just reorder the tables for the common instructions /// rather than changing the algorithm. 
-struct NamedImmMapper { +struct AArch64NamedImmMapper { struct Mapping { const char *Name; uint32_t Value; }; template<int N> - NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm) + AArch64NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm) : Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {} StringRef toString(uint32_t Value, bool &Valid) const; @@ -138,7 +313,7 @@ protected: uint32_t TooBigImm; }; -namespace A64AT { +namespace AArch64AT { enum ATValues { Invalid = -1, // Op0 Op1 CRn CRm Op2 S1E1R = 0x43c0, // 01 000 0111 1000 000 @@ -155,14 +330,14 @@ namespace A64AT { S12E0W = 0x63c7 // 01 100 0111 1000 111 }; - struct ATMapper : NamedImmMapper { + struct ATMapper : AArch64NamedImmMapper { const static Mapping ATPairs[]; ATMapper(); }; } -namespace A64DB { +namespace AArch64DB { enum DBValues { Invalid = -1, OSHLD = 0x1, @@ -179,14 +354,14 @@ namespace A64DB { SY = 0xf }; - struct DBarrierMapper : NamedImmMapper { + struct DBarrierMapper : AArch64NamedImmMapper { const static Mapping DBarrierPairs[]; DBarrierMapper(); }; } -namespace A64DC { +namespace AArch64DC { enum DCValues { Invalid = -1, // Op1 CRn CRm Op2 ZVA = 0x5ba1, // 01 011 0111 0100 001 @@ -199,7 +374,7 @@ namespace A64DC { CISW = 0x43f2 // 01 000 0111 1110 010 }; - struct DCMapper : NamedImmMapper { + struct DCMapper : AArch64NamedImmMapper { const static Mapping DCPairs[]; DCMapper(); @@ -207,7 +382,7 @@ namespace A64DC { } -namespace A64IC { +namespace AArch64IC { enum ICValues { Invalid = -1, // Op1 CRn CRm Op2 IALLUIS = 0x0388, // 000 0111 0001 000 @@ -216,7 +391,7 @@ namespace A64IC { }; - struct ICMapper : NamedImmMapper { + struct ICMapper : AArch64NamedImmMapper { const static Mapping ICPairs[]; ICMapper(); @@ -227,19 +402,19 @@ namespace A64IC { } } -namespace A64ISB { +namespace AArch64ISB { enum ISBValues { Invalid = -1, SY = 0xf }; - struct ISBMapper : NamedImmMapper { + struct ISBMapper : AArch64NamedImmMapper { const static Mapping ISBPairs[]; ISBMapper(); }; } -namespace A64PRFM { +namespace AArch64PRFM { enum PRFMValues { Invalid = -1, PLDL1KEEP = 0x00, @@ -262,14 +437,14 @@ namespace A64PRFM { PSTL3STRM = 0x15 }; - struct PRFMMapper : NamedImmMapper { + struct PRFMMapper : AArch64NamedImmMapper { const static Mapping PRFMPairs[]; PRFMMapper(); }; } -namespace A64PState { +namespace AArch64PState { enum PStateValues { Invalid = -1, SPSel = 0x05, @@ -277,7 +452,7 @@ namespace A64PState { DAIFClr = 0x1f }; - struct PStateMapper : NamedImmMapper { + struct PStateMapper : AArch64NamedImmMapper { const static Mapping PStatePairs[]; PStateMapper(); @@ -285,7 +460,7 @@ namespace A64PState { } -namespace A64SE { +namespace AArch64SE { enum ShiftExtSpecifiers { Invalid = -1, LSL, @@ -306,7 +481,7 @@ namespace A64SE { }; } -namespace A64Layout { +namespace AArch64Layout { enum VectorLayout { Invalid = -1, VL_8B, @@ -329,43 +504,43 @@ namespace A64Layout { } inline static const char * -A64VectorLayoutToString(A64Layout::VectorLayout Layout) { +AArch64VectorLayoutToString(AArch64Layout::VectorLayout Layout) { switch (Layout) { - case A64Layout::VL_8B: return ".8b"; - case A64Layout::VL_4H: return ".4h"; - case A64Layout::VL_2S: return ".2s"; - case A64Layout::VL_1D: return ".1d"; - case A64Layout::VL_16B: return ".16b"; - case A64Layout::VL_8H: return ".8h"; - case A64Layout::VL_4S: return ".4s"; - case A64Layout::VL_2D: return ".2d"; - case A64Layout::VL_B: return ".b"; - case A64Layout::VL_H: return ".h"; - case A64Layout::VL_S: return ".s"; - case A64Layout::VL_D: return ".d"; + case
AArch64Layout::VL_8B: return ".8b"; + case AArch64Layout::VL_4H: return ".4h"; + case AArch64Layout::VL_2S: return ".2s"; + case AArch64Layout::VL_1D: return ".1d"; + case AArch64Layout::VL_16B: return ".16b"; + case AArch64Layout::VL_8H: return ".8h"; + case AArch64Layout::VL_4S: return ".4s"; + case AArch64Layout::VL_2D: return ".2d"; + case AArch64Layout::VL_B: return ".b"; + case AArch64Layout::VL_H: return ".h"; + case AArch64Layout::VL_S: return ".s"; + case AArch64Layout::VL_D: return ".d"; default: llvm_unreachable("Unknown Vector Layout"); } } -inline static A64Layout::VectorLayout -A64StringToVectorLayout(StringRef LayoutStr) { - return StringSwitch<A64Layout::VectorLayout>(LayoutStr) - .Case(".8b", A64Layout::VL_8B) - .Case(".4h", A64Layout::VL_4H) - .Case(".2s", A64Layout::VL_2S) - .Case(".1d", A64Layout::VL_1D) - .Case(".16b", A64Layout::VL_16B) - .Case(".8h", A64Layout::VL_8H) - .Case(".4s", A64Layout::VL_4S) - .Case(".2d", A64Layout::VL_2D) - .Case(".b", A64Layout::VL_B) - .Case(".h", A64Layout::VL_H) - .Case(".s", A64Layout::VL_S) - .Case(".d", A64Layout::VL_D) - .Default(A64Layout::Invalid); +inline static AArch64Layout::VectorLayout +AArch64StringToVectorLayout(StringRef LayoutStr) { + return StringSwitch<AArch64Layout::VectorLayout>(LayoutStr) + .Case(".8b", AArch64Layout::VL_8B) + .Case(".4h", AArch64Layout::VL_4H) + .Case(".2s", AArch64Layout::VL_2S) + .Case(".1d", AArch64Layout::VL_1D) + .Case(".16b", AArch64Layout::VL_16B) + .Case(".8h", AArch64Layout::VL_8H) + .Case(".4s", AArch64Layout::VL_4S) + .Case(".2d", AArch64Layout::VL_2D) + .Case(".b", AArch64Layout::VL_B) + .Case(".h", AArch64Layout::VL_H) + .Case(".s", AArch64Layout::VL_S) + .Case(".d", AArch64Layout::VL_D) + .Default(AArch64Layout::Invalid); } -namespace A64SysReg { +namespace AArch64SysReg { enum SysRegROValues { MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000 @@ -396,16 +571,16 @@ namespace A64SysReg { ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011 ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100 ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101 - ID_AA64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 - ID_AA64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 - ID_AA64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 - ID_AA64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 - ID_AA64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 - ID_AA64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 - ID_AA64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 - ID_AA64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 - ID_AA64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 - ID_AA64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 + ID_A64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 + ID_A64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 + ID_A64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 + ID_A64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 + ID_A64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 + ID_A64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 + ID_A64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 + ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 + ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 + ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 @@ -960,38 +1135,45 @@ namespace A64SysReg { ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100 ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101 ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110 - ICH_LR15_EL2 = 0xe66f // 11 100 1100 1101 111 + ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111 + }; + + // Cyclone specific system registers +
enum CycloneSysRegValues { + CPM_IOACC_CTL_EL3 = 0xff90 }; - // Note that these do not inherit from NamedImmMapper. This class is + // Note that these do not inherit from AArch64NamedImmMapper. This class is // sufficiently different in its behaviour that I don't believe it's worth - // burdening the common NamedImmMapper with abstractions only needed in + // burdening the common AArch64NamedImmMapper with abstractions only needed in // this one case. struct SysRegMapper { - static const NamedImmMapper::Mapping SysRegPairs[]; + static const AArch64NamedImmMapper::Mapping SysRegPairs[]; + static const AArch64NamedImmMapper::Mapping CycloneSysRegPairs[]; - const NamedImmMapper::Mapping *InstPairs; + const AArch64NamedImmMapper::Mapping *InstPairs; size_t NumInstPairs; + uint64_t FeatureBits; - SysRegMapper() {} + SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { } uint32_t fromString(StringRef Name, bool &Valid) const; std::string toString(uint32_t Bits, bool &Valid) const; }; struct MSRMapper : SysRegMapper { - static const NamedImmMapper::Mapping MSRPairs[]; - MSRMapper(); + static const AArch64NamedImmMapper::Mapping MSRPairs[]; + MSRMapper(uint64_t FeatureBits); }; struct MRSMapper : SysRegMapper { - static const NamedImmMapper::Mapping MRSPairs[]; - MRSMapper(); + static const AArch64NamedImmMapper::Mapping MRSPairs[]; + MRSMapper(uint64_t FeatureBits); }; uint32_t ParseGenericRegister(StringRef Name, bool &Valid); } -namespace A64TLBI { +namespace AArch64TLBI { enum TLBIValues { Invalid = -1, // Op0 Op1 CRn CRm Op2 IPAS2E1IS = 0x6401, // 01 100 1000 0000 001 @@ -1028,7 +1210,7 @@ namespace A64TLBI { VAALE1 = 0x443f // 01 000 1000 0111 111 }; - struct TLBIMapper : NamedImmMapper { + struct TLBIMapper : AArch64NamedImmMapper { const static Mapping TLBIPairs[]; TLBIMapper(); @@ -1051,88 +1233,62 @@ namespace A64TLBI { return true; } } -} +} namespace AArch64II { - + /// Target Operand Flag enum. enum TOF { - //===--------------------------------------------------------------===// + //===------------------------------------------------------------------===// // AArch64 Specific MachineOperand flags. MO_NO_FLAG, - // MO_GOT - Represents a relocation referring to the GOT entry of a given - // symbol. Used in adrp. - MO_GOT, - - // MO_GOT_LO12 - Represents a relocation referring to the low 12 bits of the - // GOT entry of a given symbol. Used in ldr only. - MO_GOT_LO12, - - // MO_DTPREL_* - Represents a relocation referring to the offset from a - // module's dynamic thread pointer. Used in the local-dynamic TLS access - // model. - MO_DTPREL_G1, - MO_DTPREL_G0_NC, - - // MO_GOTTPREL_* - Represents a relocation referring to a GOT entry - // providing the offset of a variable from the thread-pointer. Used in - // initial-exec TLS model where this offset is assigned in the static thread - // block and thus known by the dynamic linker. - MO_GOTTPREL, - MO_GOTTPREL_LO12, - - // MO_TLSDESC_* - Represents a relocation referring to a GOT entry providing - // a TLS descriptor chosen by the dynamic linker. Used for the - // general-dynamic and local-dynamic TLS access models where very littls is - // known at link-time. - MO_TLSDESC, - MO_TLSDESC_LO12, - - // MO_TPREL_* - Represents a relocation referring to the offset of a - // variable from the thread pointer itself. Used in the local-exec TLS - // access model. - MO_TPREL_G1, - MO_TPREL_G0_NC, - - // MO_LO12 - On a symbol operand, this represents a relocation containing - // lower 12 bits of the address. Used in add/sub/ldr/str. 
- MO_LO12, - - // MO_ABS_G* - Represent the 16-bit granules of an absolute reference using - // movz/movk instructions. - MO_ABS_G3, - MO_ABS_G2_NC, - MO_ABS_G1_NC, - MO_ABS_G0_NC + MO_FRAGMENT = 0x7, + + /// MO_PAGE - A symbol operand with this flag represents the pc-relative + /// offset of the 4K page containing the symbol. This is used with the + /// ADRP instruction. + MO_PAGE = 1, + + /// MO_PAGEOFF - A symbol operand with this flag represents the offset of + /// that symbol within a 4K page. This offset is added to the page address + /// to produce the complete address. + MO_PAGEOFF = 2, + + /// MO_G3 - A symbol operand with this flag (granule 3) represents the high + /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G3 = 3, + + /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits + /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G2 = 4, + + /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits + /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G1 = 5, + + /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits + /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G0 = 6, + + /// MO_GOT - This flag indicates that a symbol operand represents the + /// address of the GOT entry for the symbol, rather than the address of + /// the symbol itself. + MO_GOT = 8, + + /// MO_NC - Indicates whether the linker is expected to check the symbol + /// reference for overflow. For example in an ADRP/ADD pair of relocations + /// the ADRP usually does check, but not the ADD. + MO_NC = 0x10, + + /// MO_TLS - Indicates that the operand being accessed is some kind of + /// thread-local symbol. On Darwin, only one type of thread-local access + /// exists (pre linker-relaxation), but on ELF the TLSModel used for the + /// referee will affect interpretation. + MO_TLS = 0x20 }; -} - -class APFloat; - -namespace A64Imms { - bool isFPImm(const APFloat &Val, uint32_t &Imm8Bits); - - inline bool isFPImm(const APFloat &Val) { - uint32_t Imm8; - return isFPImm(Val, Imm8); - } - - bool isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits); - bool isLogicalImmBits(unsigned RegWidth, uint32_t Bits, uint64_t &Imm); - - bool isMOVZImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift); - bool isMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift); - - // We sometimes want to know whether the immediate is representable with a - // MOVN but *not* with a MOVZ (because that would take priority). - bool isOnlyMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift); - - uint64_t decodeNeonModImm(unsigned Val, unsigned OpCmode, unsigned &EltBits); - bool decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm, - unsigned &ShiftOnesIn); - } +} // end namespace AArch64II -} // end namespace llvm; +} // end namespace llvm #endif diff --git a/lib/Target/AArch64/Utils/Android.mk b/lib/Target/AArch64/Utils/Android.mk index b8bf795..3c1d194 100644 --- a/lib/Target/AArch64/Utils/Android.mk +++ b/lib/Target/AArch64/Utils/Android.mk @@ -1,5 +1,10 @@ LOCAL_PATH := $(call my-dir) +arm64_utils_TBLGEN_TABLES := \ + AArch64GenRegisterInfo.inc \ + AArch64GenInstrInfo.inc \ + AArch64GenSubtargetInfo.inc + arm64_utils_SRC_FILES := \ AArch64BaseInfo.cpp @@ -16,7 +21,12 @@ LOCAL_MODULE:= libLLVMARM64Utils LOCAL_MODULE_TAGS := optional +TBLGEN_TD_DIR := $(LOCAL_PATH)/.. 
+TBLGEN_TABLES := $(arm64_utils_TBLGEN_TABLES) + include $(LLVM_DEVICE_BUILD_MK) +include $(LLVM_TBLGEN_RULES_MK) +include $(LLVM_GEN_INTRINSICS_MK) include $(BUILD_STATIC_LIBRARY) endif @@ -32,5 +42,10 @@ LOCAL_MODULE:= libLLVMARM64Utils LOCAL_MODULE_TAGS := optional +TBLGEN_TD_DIR := $(LOCAL_PATH)/.. +TBLGEN_TABLES := $(arm64_utils_TBLGEN_TABLES) + include $(LLVM_HOST_BUILD_MK) +include $(LLVM_TBLGEN_RULES_MK) +include $(LLVM_GEN_INTRINSICS_MK) include $(BUILD_HOST_STATIC_LIBRARY) diff --git a/lib/Target/AArch64/Utils/LLVMBuild.txt b/lib/Target/AArch64/Utils/LLVMBuild.txt index 4acecc9..bcefeb6 100644 --- a/lib/Target/AArch64/Utils/LLVMBuild.txt +++ b/lib/Target/AArch64/Utils/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AArch646/Utils/LLVMBuild.txt ----------------*- Conf -*--===; +;===- ./lib/Target/AArch64/Utils/LLVMBuild.txt ----------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; diff --git a/lib/Target/AArch64/Utils/Makefile b/lib/Target/AArch64/Utils/Makefile index 0f4a645..0b80f82 100644 --- a/lib/Target/AArch64/Utils/Makefile +++ b/lib/Target/AArch64/Utils/Makefile @@ -9,7 +9,8 @@ LEVEL = ../../../.. LIBRARYNAME = LLVMAArch64Utils -# Hack: we need to include 'main' AArch64 target directory to grab private headers -#CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. +# Hack: we need to include 'main' AArch64 target directory to grab private +# headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp index 28ea879..94faf6f 100644 --- a/lib/Target/ARM/A15SDOptimizer.cpp +++ b/lib/Target/ARM/A15SDOptimizer.cpp @@ -24,7 +24,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "a15-sd-optimizer" #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" @@ -39,6 +38,8 @@ using namespace llvm; +#define DEBUG_TYPE "a15-sd-optimizer" + namespace { struct A15SDOptimizer : public MachineFunctionPass { static char ID; @@ -90,7 +91,7 @@ namespace { unsigned createImplicitDef(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, DebugLoc DL); - + // // Various property checkers // @@ -259,7 +260,7 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) { if (DPRMI && SPRMI) { // See if the first operand of this insert_subreg is IMPLICIT_DEF MachineInstr *ECDef = elideCopies(DPRMI); - if (ECDef != 0 && ECDef->isImplicitDef()) { + if (ECDef && ECDef->isImplicitDef()) { // Another corner case - if we're inserting something that is purely // a subreg copy of a DPR, just use that DPR. @@ -348,10 +349,10 @@ MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) { if (!MI->isFullCopy()) return MI; if (!TRI->isVirtualRegister(MI->getOperand(1).getReg())) - return NULL; + return nullptr; MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg()); if (!Def) - return NULL; + return nullptr; return elideCopies(Def); } @@ -435,7 +436,7 @@ A15SDOptimizer::createDupLane(MachineBasicBlock &MBB, Out) .addReg(Reg) .addImm(Lane)); - + return Out; } @@ -601,7 +602,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { // * INSERT_SUBREG: * If the SPR value was originally in another DPR/QPR // lane, and the other lane(s) of the DPR/QPR register // that we are inserting in are undefined, use the - // original DPR/QPR value. + // original DPR/QPR value. // * Otherwise, fall back on the same strategy as COPY.
// // * REG_SEQUENCE: * If all except one of the input operands are @@ -693,7 +694,7 @@ bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) { MI != ME;) { Modified |= runOnInstruction(MI++); } - + } for (std::set::iterator I = DeadInstr.begin(), diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index 4412b45..55df29c 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -49,8 +49,6 @@ FunctionPass *createThumb2SizeReductionPass(); /// \brief Creates an ARM-specific Target Transformation Info pass. ImmutablePass *createARMTargetTransformInfoPass(const ARMBaseTargetMachine *TM); -FunctionPass *createARMAtomicExpandPass(const TargetMachine *TM); - void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, ARMAsmPrinter &AP); diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 0fa865f..55e9fe5 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "ARMAsmPrinter.h" #include "ARM.h" #include "ARMConstantPoolValue.h" @@ -45,6 +44,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ARMBuildAttributes.h" +#include "llvm/Support/COFF.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ELF.h" @@ -55,6 +55,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "asm-printer" + void ARMAsmPrinter::EmitFunctionBodyEnd() { // Make sure to terminate any constant pools that were at the end // of the function. @@ -85,7 +87,7 @@ void ARMAsmPrinter::EmitXXStructor(const Constant *CV) { ? MCSymbolRefExpr::VK_ARM_TARGET1 : MCSymbolRefExpr::VK_None), OutContext); - + OutStreamer.EmitValue(E, Size); } @@ -96,7 +98,28 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { AFI = MF.getInfo(); MCP = MF.getConstantPool(); - return AsmPrinter::runOnMachineFunction(MF); + SetupMachineFunction(MF); + + if (Subtarget->isTargetCOFF()) { + bool Internal = MF.getFunction()->hasInternalLinkage(); + COFF::SymbolStorageClass Scl = Internal ? COFF::IMAGE_SYM_CLASS_STATIC + : COFF::IMAGE_SYM_CLASS_EXTERNAL; + int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT; + + OutStreamer.BeginCOFFSymbolDef(CurrentFnSym); + OutStreamer.EmitCOFFSymbolStorageClass(Scl); + OutStreamer.EmitCOFFSymbolType(Type); + OutStreamer.EndCOFFSymbolDef(); + } + + // Have common code print out the function header with linkage info etc. + EmitFunctionHeader(); + + // Emit the rest of the function body. + EmitFunctionBody(); + + // We didn't modify anything. + return false; } void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, @@ -239,7 +262,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, if (ARM::GPRPairRegClass.contains(RegBegin)) { const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0); - O << ARMInstPrinter::getRegisterName(Reg0) << ", ";; + O << ARMInstPrinter::getRegisterName(Reg0) << ", "; RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1); } O << ARMInstPrinter::getRegisterName(RegBegin); @@ -383,7 +406,7 @@ void ARMAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo, // If either end mode is unknown (EndInfo == NULL) or different than // the start mode, then restore the start mode. 
const bool WasThumb = isThumb(StartInfo); - if (EndInfo == NULL || WasThumb != isThumb(*EndInfo)) { + if (!EndInfo || WasThumb != isThumb(*EndInfo)) { OutStreamer.EmitAssemblerFlag(WasThumb ? MCAF_Code16 : MCAF_Code32); } } @@ -456,6 +479,29 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { emitAttributes(); } +static void +emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel, + MachineModuleInfoImpl::StubValueTy &MCSym) { + // L_foo$stub: + OutStreamer.EmitLabel(StubLabel); + // .indirect_symbol _foo + OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol); + + if (MCSym.getInt()) + // External to current translation unit. + OutStreamer.EmitIntValue(0, 4/*size*/); + else + // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info + // pointers need to be indirect and pc-rel. We accomplish this by + // using NLPs; however, sometimes the types are local to the file. + // We need to fill in the value for the NLP in those cases. + OutStreamer.EmitValue( + MCSymbolRefExpr::Create(MCSym.getPointer(), OutStreamer.getContext()), + 4 /*size*/); +} + void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { if (Subtarget->isTargetMachO()) { @@ -472,27 +518,9 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { // Switch with ".non_lazy_symbol_pointer" directive. OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); EmitAlignment(2); - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - // L_foo$stub: - OutStreamer.EmitLabel(Stubs[i].first); - // .indirect_symbol _foo - MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second; - OutStreamer.EmitSymbolAttribute(MCSym.getPointer(),MCSA_IndirectSymbol); - - if (MCSym.getInt()) - // External to current translation unit. - OutStreamer.EmitIntValue(0, 4/*size*/); - else - // Internal to current translation unit. - // - // When we place the LSDA into the TEXT section, the type info - // pointers need to be indirect and pc-rel. We accomplish this by - // using NLPs; however, sometimes the types are local to the file. - // We need to fill in the value for the NLP in those cases. - OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), - OutContext), - 4/*size*/); - } + + for (auto &Stub : Stubs) + emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second); Stubs.clear(); OutStreamer.AddBlankLine(); @@ -500,17 +528,11 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { Stubs = MMIMacho.GetHiddenGVStubList(); if (!Stubs.empty()) { - OutStreamer.SwitchSection(getObjFileLowering().getDataSection()); + OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); EmitAlignment(2); - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - // L_foo$stub: - OutStreamer.EmitLabel(Stubs[i].first); - // .long _foo - OutStreamer.EmitValue(MCSymbolRefExpr:: - Create(Stubs[i].second.getPointer(), - OutContext), - 4/*size*/); - } + + for (auto &Stub : Stubs) + emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second); Stubs.clear(); OutStreamer.AddBlankLine(); @@ -523,6 +545,28 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { // generates code that does this, it is always safe to set. OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); } + + // Emit a .data.rel section containing any stubs that were created. 
+ if (Subtarget->isTargetELF()) { + const TargetLoweringObjectFileELF &TLOFELF = + static_cast(getObjFileLowering()); + + MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo(); + + // Output stubs for external and common global variables. + MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); + if (!Stubs.empty()) { + OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); + const DataLayout *TD = TM.getDataLayout(); + + for (auto &stub: Stubs) { + OutStreamer.EmitLabel(stub.first); + OutStreamer.EmitSymbolValue(stub.second.getPointer(), + TD->getPointerSize(0)); + } + Stubs.clear(); + } + } } //===----------------------------------------------------------------------===// @@ -575,7 +619,7 @@ void ARMAsmPrinter::emitAttributes() { getArchForCPU(CPUString, Subtarget)); // Tag_CPU_arch_profile must have the default value of 0 when "Architecture - // profile is not applicable (e.g. pre v7, or cross-profile code)". + // profile is not applicable (e.g. pre v7, or cross-profile code)". if (Subtarget->hasV7Ops()) { if (Subtarget->isAClass()) { ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile, @@ -627,6 +671,20 @@ void ARMAsmPrinter::emitAttributes() { ATS.emitFPU(ARM::VFPV2); } + if (TM.getRelocationModel() == Reloc::PIC_) { + // PIC specific attributes. + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RW_data, + ARMBuildAttrs::AddressRWPCRel); + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RO_data, + ARMBuildAttrs::AddressROPCRel); + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_GOT_use, + ARMBuildAttrs::AddressGOT); + } else { + // Allow direct addressing of imported data for all other relocation models. + ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_GOT_use, + ARMBuildAttrs::AddressDirect); + } + // Signal various FP modes. if (!TM.Options.UnsafeFPMath) { ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::Allowed); @@ -723,7 +781,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, MachineModuleInfoImpl::StubValueTy &StubSym = GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym) : MMIMachO.getGVStubEntry(MCSym); - if (StubSym.getPointer() == 0) + if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); return MCSym; @@ -971,7 +1029,8 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { RegList.push_back(SrcReg); break; } - ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD); + if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM) + ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD); } else { // Changes of stack / frame pointer. if (SrcReg == ARM::SP) { @@ -1016,18 +1075,20 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { } } - if (DstReg == FramePtr && FramePtr != ARM::SP) - // Set-up of the frame pointer. Positive values correspond to "add" - // instruction. - ATS.emitSetFP(FramePtr, ARM::SP, -Offset); - else if (DstReg == ARM::SP) { - // Change of SP by an offset. Positive values correspond to "sub" - // instruction. - ATS.emitPad(Offset); - } else { - // Move of SP to a register. Positive values correspond to an "add" - // instruction. - ATS.emitMovSP(DstReg, -Offset); + if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM) { + if (DstReg == FramePtr && FramePtr != ARM::SP) + // Set-up of the frame pointer. Positive values correspond to "add" + // instruction. + ATS.emitSetFP(FramePtr, ARM::SP, -Offset); + else if (DstReg == ARM::SP) { + // Change of SP by an offset. Positive values correspond to "sub" + // instruction. 
+ ATS.emitPad(Offset); + } else { + // Move of SP to a register. Positive values correspond to an "add" + // instruction. + ATS.emitMovSP(DstReg, -Offset); + } } } else if (DstReg == ARM::SP) { MI->dump(); diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h index 46c2626..7c103c6 100644 --- a/lib/Target/ARM/ARMAsmPrinter.h +++ b/lib/Target/ARM/ARMAsmPrinter.h @@ -47,16 +47,17 @@ class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter { bool InConstantPool; public: explicit ARMAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL), InConstantPool(false) { - Subtarget = &TM.getSubtarget<ARMSubtarget>(); - } + : AsmPrinter(TM, Streamer), AFI(nullptr), MCP(nullptr), + InConstantPool(false) { + Subtarget = &TM.getSubtarget<ARMSubtarget>(); + } const char *getPassName() const override { return "ARM Assembly / Object Emitter"; } void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, diff --git a/lib/Target/ARM/ARMAtomicExpandPass.cpp b/lib/Target/ARM/ARMAtomicExpandPass.cpp deleted file mode 100644 index 18e0783..0000000 --- a/lib/Target/ARM/ARMAtomicExpandPass.cpp +++ /dev/null @@ -1,406 +0,0 @@ -//===-- ARMAtomicExpandPass.cpp - Expand atomic instructions --------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass (at IR level) to replace atomic instructions with -// appropriate (intrinsic-based) ldrex/strex loops. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "arm-atomic-expand" -#include "ARM.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" -using namespace llvm; - -namespace { - class ARMAtomicExpandPass : public FunctionPass { - const TargetLowering *TLI; - public: - static char ID; // Pass identification, replacement for typeid - explicit ARMAtomicExpandPass(const TargetMachine *TM = 0) - : FunctionPass(ID), TLI(TM->getTargetLowering()) {} - - bool runOnFunction(Function &F) override; - bool expandAtomicInsts(Function &F); - - bool expandAtomicLoad(LoadInst *LI); - bool expandAtomicStore(StoreInst *LI); - bool expandAtomicRMW(AtomicRMWInst *AI); - bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); - - AtomicOrdering insertLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord); - void insertTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord); - - /// Perform a load-linked operation on Addr, returning a "Value *" with the - /// corresponding pointee type. This may entail some non-trivial operations - /// to truncate or reconstruct illegal types since intrinsics must be legal - Value *loadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord); - - /// Perform a store-conditional operation to Addr. Return the status of the - /// store: 0 if it succeeded, non-zero otherwise.
- Value *storeConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, - AtomicOrdering Ord); - - /// Return true if the given (atomic) instruction should be expanded by this - /// pass. - bool shouldExpandAtomic(Instruction *Inst); - }; -} - -char ARMAtomicExpandPass::ID = 0; - -FunctionPass *llvm::createARMAtomicExpandPass(const TargetMachine *TM) { - return new ARMAtomicExpandPass(TM); -} - -bool ARMAtomicExpandPass::runOnFunction(Function &F) { - SmallVector AtomicInsts; - - // Changing control-flow while iterating through it is a bad idea, so gather a - // list of all atomic instructions before we start. - for (BasicBlock &BB : F) - for (Instruction &Inst : BB) { - if (isa(&Inst) || isa(&Inst) || - (isa(&Inst) && cast(&Inst)->isAtomic()) || - (isa(&Inst) && cast(&Inst)->isAtomic())) - AtomicInsts.push_back(&Inst); - } - - bool MadeChange = false; - for (Instruction *Inst : AtomicInsts) { - if (!shouldExpandAtomic(Inst)) - continue; - - if (AtomicRMWInst *AI = dyn_cast(Inst)) - MadeChange |= expandAtomicRMW(AI); - else if (AtomicCmpXchgInst *CI = dyn_cast(Inst)) - MadeChange |= expandAtomicCmpXchg(CI); - else if (LoadInst *LI = dyn_cast(Inst)) - MadeChange |= expandAtomicLoad(LI); - else if (StoreInst *SI = dyn_cast(Inst)) - MadeChange |= expandAtomicStore(SI); - else - llvm_unreachable("Unknown atomic instruction"); - } - - return MadeChange; -} - -bool ARMAtomicExpandPass::expandAtomicLoad(LoadInst *LI) { - // Load instructions don't actually need a leading fence, even in the - // SequentiallyConsistent case. - AtomicOrdering MemOpOrder = - TLI->getInsertFencesForAtomic() ? Monotonic : LI->getOrdering(); - - // The only 64-bit load guaranteed to be single-copy atomic by the ARM ARM is - // an ldrexd (A3.5.3). - IRBuilder<> Builder(LI); - Value *Val = loadLinked(Builder, LI->getPointerOperand(), MemOpOrder); - - insertTrailingFence(Builder, LI->getOrdering()); - - LI->replaceAllUsesWith(Val); - LI->eraseFromParent(); - - return true; -} - -bool ARMAtomicExpandPass::expandAtomicStore(StoreInst *SI) { - // The only atomic 64-bit store on ARM is an strexd that succeeds, which means - // we need a loop and the entire instruction is essentially an "atomicrmw - // xchg" that ignores the value loaded. - IRBuilder<> Builder(SI); - AtomicRMWInst *AI = - Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), - SI->getValueOperand(), SI->getOrdering()); - SI->eraseFromParent(); - - // Now we have an appropriate swap instruction, lower it as usual. - return expandAtomicRMW(AI); -} - -bool ARMAtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) { - AtomicOrdering Order = AI->getOrdering(); - Value *Addr = AI->getPointerOperand(); - BasicBlock *BB = AI->getParent(); - Function *F = BB->getParent(); - LLVMContext &Ctx = F->getContext(); - - // Given: atomicrmw some_op iN* %addr, iN %incr ordering - // - // The standard expansion we produce is: - // [...] - // fence? - // atomicrmw.start: - // %loaded = @load.linked(%addr) - // %new = some_op iN %loaded, %incr - // %stored = @store_conditional(%new, %addr) - // %try_again = icmp i32 ne %stored, 0 - // br i1 %try_again, label %loop, label %atomicrmw.end - // atomicrmw.end: - // fence? - // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); - BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - - // This grabs the DebugLoc from AI. - IRBuilder<> Builder(AI); - - // The split call above "helpfully" added a branch at the end of BB (to the - // wrong place), but we might want a fence too. 
It's easiest to just remove - // the branch entirely. - std::prev(BB->end())->eraseFromParent(); - Builder.SetInsertPoint(BB); - AtomicOrdering MemOpOrder = insertLeadingFence(Builder, Order); - Builder.CreateBr(LoopBB); - - // Start the main loop block now that we've taken care of the preliminaries. - Builder.SetInsertPoint(LoopBB); - Value *Loaded = loadLinked(Builder, Addr, MemOpOrder); - - Value *NewVal; - switch (AI->getOperation()) { - case AtomicRMWInst::Xchg: - NewVal = AI->getValOperand(); - break; - case AtomicRMWInst::Add: - NewVal = Builder.CreateAdd(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Sub: - NewVal = Builder.CreateSub(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::And: - NewVal = Builder.CreateAnd(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Nand: - NewVal = Builder.CreateAnd(Loaded, Builder.CreateNot(AI->getValOperand()), - "new"); - break; - case AtomicRMWInst::Or: - NewVal = Builder.CreateOr(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Xor: - NewVal = Builder.CreateXor(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Max: - NewVal = Builder.CreateICmpSGT(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Min: - NewVal = Builder.CreateICmpSLE(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::UMax: - NewVal = Builder.CreateICmpUGT(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::UMin: - NewVal = Builder.CreateICmpULE(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - default: - llvm_unreachable("Unknown atomic op"); - } - - Value *StoreSuccess = storeConditional(Builder, NewVal, Addr, MemOpOrder); - Value *TryAgain = Builder.CreateICmpNE( - StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain"); - Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); - - Builder.SetInsertPoint(ExitBB, ExitBB->begin()); - insertTrailingFence(Builder, Order); - - AI->replaceAllUsesWith(Loaded); - AI->eraseFromParent(); - - return true; -} - -bool ARMAtomicExpandPass::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { - AtomicOrdering SuccessOrder = CI->getSuccessOrdering(); - AtomicOrdering FailureOrder = CI->getFailureOrdering(); - Value *Addr = CI->getPointerOperand(); - BasicBlock *BB = CI->getParent(); - Function *F = BB->getParent(); - LLVMContext &Ctx = F->getContext(); - - // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord - // - // The full expansion we produce is: - // [...] - // fence? - // cmpxchg.start: - // %loaded = @load.linked(%addr) - // %should_store = icmp eq %loaded, %desired - // br i1 %should_store, label %cmpxchg.trystore, - // label %cmpxchg.end/%cmpxchg.barrier - // cmpxchg.trystore: - // %stored = @store_conditional(%new, %addr) - // %try_again = icmp i32 ne %stored, 0 - // br i1 %try_again, label %loop, label %cmpxchg.end - // cmpxchg.barrier: - // fence? - // br label %cmpxchg.end - // cmpxchg.end: - // [...] 
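Read as straight-line code, the expansion sketched in the comment above is a load-linked/store-conditional retry loop. A behavioural C++ analogue, with non-atomic stand-ins for the ldrex/strex intrinsics (loadLinked and storeConditional here are illustrative placeholders, not real APIs):

#include <cstdint>

// Non-atomic stand-ins, for illustration only; the real lowering calls the
// llvm.arm.ldrex/strex (or ldaex/stlex) intrinsics.
static uint32_t loadLinked(uint32_t *Addr) { return *Addr; }
static int storeConditional(uint32_t Val, uint32_t *Addr) {
  *Addr = Val;
  return 0; // strex convention: 0 on success, non-zero on failure
}

static uint32_t cmpxchgLoop(uint32_t *Addr, uint32_t Desired, uint32_t New) {
  uint32_t Loaded;
  for (;;) {
    Loaded = loadLinked(Addr);            // cmpxchg.start
    if (Loaded != Desired)                // %should_store is false: fail,
      break;                              // skipping the barrier when the
                                          // failure ordering permits
    if (storeConditional(New, Addr) == 0) // cmpxchg.trystore
      break;                              // store succeeded
    // The reservation was lost to another observer; retry from the load.
  }
  return Loaded;                          // cmpxchg.end
}

int main() {
  uint32_t X = 5;
  return cmpxchgLoop(&X, 5, 9) == 5 && X == 9 ? 0 : 1;
}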
- BasicBlock *ExitBB = BB->splitBasicBlock(CI, "cmpxchg.end"); - auto BarrierBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ExitBB); - auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.barrier", F, BarrierBB); - auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB); - - // This grabs the DebugLoc from CI - IRBuilder<> Builder(CI); - - // The split call above "helpfully" added a branch at the end of BB (to the - // wrong place), but we might want a fence too. It's easiest to just remove - // the branch entirely. - std::prev(BB->end())->eraseFromParent(); - Builder.SetInsertPoint(BB); - AtomicOrdering MemOpOrder = insertLeadingFence(Builder, SuccessOrder); - Builder.CreateBr(LoopBB); - - // Start the main loop block now that we've taken care of the preliminaries. - Builder.SetInsertPoint(LoopBB); - Value *Loaded = loadLinked(Builder, Addr, MemOpOrder); - Value *ShouldStore = - Builder.CreateICmpEQ(Loaded, CI->getCompareOperand(), "should_store"); - - // If the cmpxchg doesn't actually need any ordering when it fails, we can - // jump straight past that fence instruction (if it exists). - BasicBlock *FailureBB = FailureOrder == Monotonic ? ExitBB : BarrierBB; - Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB); - - Builder.SetInsertPoint(TryStoreBB); - Value *StoreSuccess = - storeConditional(Builder, CI->getNewValOperand(), Addr, MemOpOrder); - Value *TryAgain = Builder.CreateICmpNE( - StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success"); - Builder.CreateCondBr(TryAgain, LoopBB, BarrierBB); - - // Finally, make sure later instructions don't get reordered with a fence if - // necessary. - Builder.SetInsertPoint(BarrierBB); - insertTrailingFence(Builder, SuccessOrder); - Builder.CreateBr(ExitBB); - - CI->replaceAllUsesWith(Loaded); - CI->eraseFromParent(); - - return true; -} - -Value *ARMAtomicExpandPass::loadLinked(IRBuilder<> &Builder, Value *Addr, - AtomicOrdering Ord) { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); - bool IsAcquire = - Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent; - - // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd - // intrinsic must return {i32, i32} and we have to recombine them into a - // single i64 here. - if (ValTy->getPrimitiveSizeInBits() == 64) { - Intrinsic::ID Int = - IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; - Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int); - - Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); - Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); - - Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); - Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); - Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); - Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); - return Builder.CreateOr( - Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64"); - } - - Type *Tys[] = { Addr->getType() }; - Intrinsic::ID Int = IsAcquire ?
Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; - Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys); - - return Builder.CreateTruncOrBitCast( - Builder.CreateCall(Ldrex, Addr), - cast(Addr->getType())->getElementType()); -} - -Value *ARMAtomicExpandPass::storeConditional(IRBuilder<> &Builder, Value *Val, - Value *Addr, AtomicOrdering Ord) { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - bool IsRelease = - Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent; - - // Since the intrinsics must have legal type, the i64 intrinsics take two - // parameters: "i32, i32". We must marshal Val into the appropriate form - // before the call. - if (Val->getType()->getPrimitiveSizeInBits() == 64) { - Intrinsic::ID Int = - IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; - Function *Strex = Intrinsic::getDeclaration(M, Int); - Type *Int32Ty = Type::getInt32Ty(M->getContext()); - - Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); - Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); - Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); - return Builder.CreateCall3(Strex, Lo, Hi, Addr); - } - - Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex; - Type *Tys[] = { Addr->getType() }; - Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); - - return Builder.CreateCall2( - Strex, Builder.CreateZExtOrBitCast( - Val, Strex->getFunctionType()->getParamType(0)), - Addr); -} - -AtomicOrdering ARMAtomicExpandPass::insertLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord) { - if (!TLI->getInsertFencesForAtomic()) - return Ord; - - if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent) - Builder.CreateFence(Release); - - // The exclusive operations don't need any barrier if we're adding separate - // fences. - return Monotonic; -} - -void ARMAtomicExpandPass::insertTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord) { - if (!TLI->getInsertFencesForAtomic()) - return; - - if (Ord == Acquire || Ord == AcquireRelease) - Builder.CreateFence(Acquire); - else if (Ord == SequentiallyConsistent) - Builder.CreateFence(SequentiallyConsistent); -} - -bool ARMAtomicExpandPass::shouldExpandAtomic(Instruction *Inst) { - // Loads and stores less than 64-bits are already atomic; ones above that - // are doomed anyway, so defer to the default libcall and blame the OS when - // things go wrong: - if (StoreInst *SI = dyn_cast(Inst)) - return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 64; - else if (LoadInst *LI = dyn_cast(Inst)) - return LI->getType()->getPrimitiveSizeInBits() == 64; - - // For the real atomic operations, we have ldrex/strex up to 64 bits. - return Inst->getType()->getPrimitiveSizeInBits() <= 64; -} diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 47f5bf9..bc266e8 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -37,11 +37,13 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +using namespace llvm; + +#define DEBUG_TYPE "arm-instrinfo" + #define GET_INSTRINFO_CTOR_DTOR #include "ARMGenInstrInfo.inc" -using namespace llvm; - static cl::opt EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden, cl::desc("Enable ARM 2-addr to 3-addr conv")); @@ -125,14 +127,14 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // FIXME: Thumb2 support. 
if (!EnableARM3Addr) - return NULL; + return nullptr; MachineInstr *MI = MBBI; MachineFunction &MF = *MI->getParent()->getParent(); uint64_t TSFlags = MI->getDesc().TSFlags; bool isPre = false; switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) { - default: return NULL; + default: return nullptr; case ARMII::IndexModePre: isPre = true; break; @@ -144,10 +146,10 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // operation. unsigned MemOpc = getUnindexedOpcode(MI->getOpcode()); if (MemOpc == 0) - return NULL; + return nullptr; - MachineInstr *UpdateMI = NULL; - MachineInstr *MemMI = NULL; + MachineInstr *UpdateMI = nullptr; + MachineInstr *MemMI = nullptr; unsigned AddrMode = (TSFlags & ARMII::AddrModeMask); const MCInstrDesc &MCID = MI->getDesc(); unsigned NumOps = MCID.getNumOperands(); @@ -169,7 +171,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (ARM_AM::getSOImmVal(Amt) == -1) // Can't encode it in a so_imm operand. This transformation will // add more than 1 instruction. Abandon! - return NULL; + return nullptr; UpdateMI = BuildMI(MF, MI->getDebugLoc(), get(isSub ? ARM::SUBri : ARM::ADDri), WBReg) .addReg(BaseReg).addImm(Amt) @@ -273,8 +275,8 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, bool AllowModify) const { - TBB = 0; - FBB = 0; + TBB = nullptr; + FBB = nullptr; MachineBasicBlock::iterator I = MBB.end(); if (I == MBB.begin()) @@ -331,7 +333,7 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, I->isReturn())) { // Forget any previous condition branch information - it no longer applies. Cond.clear(); - FBB = 0; + FBB = nullptr; // If we can modify the function, delete everything below this // unconditional branch. @@ -405,7 +407,7 @@ ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, assert((Cond.size() == 2 || Cond.size() == 0) && "ARM branch conditions have two components!"); - if (FBB == 0) { + if (!FBB) { if (Cond.empty()) { // Unconditional branch? if (isThumb) BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).addImm(ARMCC::AL).addReg(0); @@ -535,7 +537,8 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const { return true; } -template<> bool IsCPSRDead(MachineInstr* MI) { +namespace llvm { +template <> bool IsCPSRDead(MachineInstr *MI) { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || MO.isUndef() || MO.isUse()) @@ -548,6 +551,7 @@ template<> bool IsCPSRDead(MachineInstr* MI) { // all definitions of CPSR are dead return true; } +} /// FIXME: Works around a gcc miscompilation with -fstrict-aliasing. LLVM_ATTRIBUTE_NOINLINE @@ -620,7 +624,7 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { MI->getOperand(NumOps - (MI->isPredicable() ? 
3 : 2)); unsigned JTI = JTOP.getIndex(); const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); - assert(MJTI != 0); + assert(MJTI != nullptr); const std::vector &JT = MJTI->getJumpTables(); assert(JTI < JT.size()); // Thumb instructions are 2 byte aligned, but JT entries are 4 byte @@ -1248,7 +1252,7 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { static_cast(MCPE.Val.MachineCPVal); unsigned PCLabelId = AFI->createPICLabelUId(); - ARMConstantPoolValue *NewCPV = 0; + ARMConstantPoolValue *NewCPV = nullptr; // FIXME: The below assumes PIC relocation model and that the function // is Thumb mode (t1 or t2). PCAdjustment would be 8 for ARM mode PIC, and @@ -1659,10 +1663,10 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { ARMCC::CondCodes CC = getInstrPredicate(MI, PredReg); // MOVCC AL can't be inverted. Shouldn't happen. if (CC == ARMCC::AL || PredReg != ARM::CPSR) - return NULL; + return nullptr; MI = TargetInstrInfo::commuteInstruction(MI, NewMI); if (!MI) - return NULL; + return nullptr; // After swapping the MOVCC operands, also invert the condition. MI->getOperand(MI->findFirstPredOperandIdx()) .setImm(ARMCC::getOppositeCondition(CC)); @@ -1678,35 +1682,36 @@ static MachineInstr *canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI, const TargetInstrInfo *TII) { if (!TargetRegisterInfo::isVirtualRegister(Reg)) - return 0; + return nullptr; if (!MRI.hasOneNonDBGUse(Reg)) - return 0; + return nullptr; MachineInstr *MI = MRI.getVRegDef(Reg); if (!MI) - return 0; + return nullptr; // MI is folded into the MOVCC by predicating it. if (!MI->isPredicable()) - return 0; + return nullptr; // Check if MI has any non-dead defs or physreg uses. This also detects // predicated instructions which will be reading CPSR. for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); // Reject frame index operands, PEI can't handle the predicated pseudos. if (MO.isFI() || MO.isCPI() || MO.isJTI()) - return 0; + return nullptr; if (!MO.isReg()) continue; // MI can't have any tied operands, that would conflict with predication. if (MO.isTied()) - return 0; + return nullptr; if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) - return 0; + return nullptr; if (MO.isDef() && !MO.isDead()) - return 0; + return nullptr; } bool DontMoveAcrossStores = true; - if (!MI->isSafeToMove(TII, /* AliasAnalysis = */ 0, DontMoveAcrossStores)) - return 0; + if (!MI->isSafeToMove(TII, /* AliasAnalysis = */ nullptr, + DontMoveAcrossStores)) + return nullptr; return MI; } @@ -1741,14 +1746,14 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI, if (!DefMI) DefMI = canFoldIntoMOVCC(MI->getOperand(1).getReg(), MRI, this); if (!DefMI) - return 0; + return nullptr; // Find new register class to use. MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1); unsigned DestReg = MI->getOperand(0).getReg(); const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg()); if (!MRI.constrainRegClass(DestReg, PreviousClass)) - return 0; + return nullptr; // Create a new predicated version of DefMI. // Rfalse is the first use. @@ -2254,7 +2259,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // Masked compares sometimes use the same register as the corresponding 'and'. 
if (CmpMask != ~0) { if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(MI)) { - MI = 0; + MI = nullptr; for (MachineRegisterInfo::use_instr_iterator UI = MRI->use_instr_begin(SrcReg), UE = MRI->use_instr_end(); UI != UE; ++UI) { @@ -2281,17 +2286,17 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // One is MI, the other is a SUB instruction. // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1). // For CMPri(r1, CmpValue), we are looking for SUBri(r1, CmpValue). - MachineInstr *Sub = NULL; + MachineInstr *Sub = nullptr; if (SrcReg2 != 0) // MI is not a candidate for CMPrr. - MI = NULL; + MI = nullptr; else if (MI->getParent() != CmpInstr->getParent() || CmpValue != 0) { // Conservatively refuse to convert an instruction which isn't in the same // BB as the comparison. // For CMPri, we need to check Sub, thus we can't return here. if (CmpInstr->getOpcode() == ARM::CMPri || CmpInstr->getOpcode() == ARM::t2CMPri) - MI = NULL; + MI = nullptr; else return false; } @@ -3295,7 +3300,7 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, if (Idx == -1) { Dist = 0; - return 0; + return nullptr; } UseIdx = Idx; diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 3ddddcb..4b3e740 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -261,7 +261,7 @@ private: unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr *MI, - unsigned *PredCost = 0) const override; + unsigned *PredCost = nullptr) const override; int getInstrLatency(const InstrItineraryData *ItinData, SDNode *Node) const override; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 8130a2d..a2eee9f 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -44,14 +44,18 @@ using namespace llvm; ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMSubtarget &sti) - : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), STI(sti), - FramePtr((STI.isTargetMachO() || STI.isThumb()) ? ARM::R7 : ARM::R11), - BasePtr(ARM::R6) { + : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), STI(sti), BasePtr(ARM::R6) { + if (STI.isTargetMachO()) + FramePtr = ARM::R7; + else if (STI.isTargetWindows()) + FramePtr = ARM::R11; + else // ARM EABI + FramePtr = STI.isThumb() ? ARM::R7 : ARM::R11; } -const uint16_t* +const MCPhysReg* ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - const uint16_t *RegList = (STI.isTargetIOS() && !STI.isAAPCS_ABI()) + const MCPhysReg *RegList = (STI.isTargetIOS() && !STI.isAAPCS_ABI()) ? CSR_iOS_SaveList : CSR_AAPCS_SaveList; @@ -107,7 +111,7 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(CallingConv::ID CC) const { // should return NULL if (CC == CallingConv::GHC) // This is academic because all GHC calls are (supposed to be) tail calls - return NULL; + return nullptr; return (STI.isTargetIOS() && !STI.isAAPCS_ABI()) ? CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask; } @@ -173,7 +177,7 @@ ARMBaseRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind const TargetRegisterClass * ARMBaseRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { if (RC == &ARM::CCRRegClass) - return 0; // Can't copy CCR registers. + return nullptr; // Can't copy CCR registers.
return RC; } diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index 66b3c82..91df565 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -100,8 +100,8 @@ protected: public: /// Code Generation virtual methods... - const uint16_t * - getCalleeSavedRegs(const MachineFunction *MF = 0) const override; + const MCPhysReg * + getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; const uint32_t *getCallPreservedMask(CallingConv::ID) const override; const uint32_t *getNoPreservedMask() const; @@ -186,7 +186,7 @@ public: void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS = NULL) const override; + RegScavenger *RS = nullptr) const override; }; } // end namespace llvm diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h index 4f94ad2..dc41c1c 100644 --- a/lib/Target/ARM/ARMCallingConv.h +++ b/lib/Target/ARM/ARMCallingConv.h @@ -28,7 +28,7 @@ namespace llvm { static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, CCState &State, bool CanFail) { - static const uint16_t RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; + static const MCPhysReg RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; // Try to get the first register. if (unsigned Reg = State.AllocateReg(RegList, 4)) @@ -71,10 +71,10 @@ static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, CCState &State, bool CanFail) { - static const uint16_t HiRegList[] = { ARM::R0, ARM::R2 }; - static const uint16_t LoRegList[] = { ARM::R1, ARM::R3 }; - static const uint16_t ShadowRegList[] = { ARM::R0, ARM::R1 }; - static const uint16_t GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; + static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 }; + static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 }; + static const MCPhysReg ShadowRegList[] = { ARM::R0, ARM::R1 }; + static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList, 2); if (Reg == 0) { @@ -123,8 +123,8 @@ static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, CCState &State) { - static const uint16_t HiRegList[] = { ARM::R0, ARM::R2 }; - static const uint16_t LoRegList[] = { ARM::R1, ARM::R3 }; + static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 }; + static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 }; unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2); if (Reg == 0) @@ -160,6 +160,105 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, State); } +static const uint16_t SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, + ARM::S4, ARM::S5, ARM::S6, ARM::S7, + ARM::S8, ARM::S9, ARM::S10, ARM::S11, + ARM::S12, ARM::S13, ARM::S14, ARM::S15 }; +static const uint16_t DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; +static const uint16_t QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; + +// Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA +// has InConsecutiveRegs set, and that the last member also has +// InConsecutiveRegsLast set. 
We must process all members of the HA before +// we can allocate it, as we need to know the total number of registers that +// will be needed in order to (attempt to) allocate a contiguous block. +static bool CC_ARM_AAPCS_Custom_HA(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + SmallVectorImpl &PendingHAMembers = State.getPendingLocs(); + // AAPCS HFAs must have 1-4 elements, all of the same type + assert(PendingHAMembers.size() < 8); + if (PendingHAMembers.size() > 0) + assert(PendingHAMembers[0].getLocVT() == LocVT); + + // Add the argument to the list to be allocated once we know the size of the + // HA + PendingHAMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + + if (ArgFlags.isInConsecutiveRegsLast()) { + assert(PendingHAMembers.size() > 0 && PendingHAMembers.size() <= 8 && + "Homogeneous aggregates must have between 1 and 4 members"); + + // Try to allocate a contiguous block of registers, each of the correct + // size to hold one member. + const uint16_t *RegList; + unsigned NumRegs; + switch (LocVT.SimpleTy) { + case MVT::i32: + case MVT::f32: + RegList = SRegList; + NumRegs = 16; + break; + case MVT::f64: + RegList = DRegList; + NumRegs = 8; + break; + case MVT::v2f64: + RegList = QRegList; + NumRegs = 4; + break; + default: + llvm_unreachable("Unexpected member type for HA"); + break; + } + + unsigned RegResult = + State.AllocateRegBlock(RegList, NumRegs, PendingHAMembers.size()); + + if (RegResult) { + for (SmallVectorImpl::iterator It = PendingHAMembers.begin(); + It != PendingHAMembers.end(); ++It) { + It->convertToReg(RegResult); + State.addLoc(*It); + ++RegResult; + } + PendingHAMembers.clear(); + return true; + } + + // Register allocation failed, fall back to the stack + + // Mark all VFP regs as unavailable (AAPCS rule C.2.vfp) + for (unsigned regNo = 0; regNo < 16; ++regNo) + State.AllocateReg(SRegList[regNo]); + + unsigned Size = LocVT.getSizeInBits() / 8; + unsigned Align = Size; + + if (LocVT.SimpleTy == MVT::v2f64 || LocVT.SimpleTy == MVT::i32) { + // Vectors are always aligned to 8 bytes. If we've seen an i32 here + // it's because it's been split from a larger type, also with align 8. + Align = 8; + } + + for (auto It : PendingHAMembers) { + It.convertToMem(State.AllocateStack(Size, Align)); + State.addLoc(It); + + // Only the first member needs to be aligned. 
+ Align = 1; + } + + // All pending members have now been allocated + PendingHAMembers.clear(); + } + + // This will be allocated by the last member of the HA + return true; +} + } // End llvm namespace #endif diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index 7cffd82..526089b 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -174,6 +174,9 @@ def CC_ARM_AAPCS_VFP : CallingConv<[ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType>, CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType>, + // HFAs are passed in a contiguous block of registers, or on the stack + CCIfConsecutiveRegs>, + CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index 7359a11..2fd7edd 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "jit" #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMConstantPoolValue.h" @@ -40,6 +39,8 @@ #endif using namespace llvm; +#define DEBUG_TYPE "jit" + STATISTIC(NumEmitted, "Number of machine instructions emitted"); namespace { @@ -65,10 +66,10 @@ namespace { static char ID; public: ARMCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce) - : MachineFunctionPass(ID), JTI(0), + : MachineFunctionPass(ID), JTI(nullptr), II((const ARMBaseInstrInfo *)tm.getInstrInfo()), TD(tm.getDataLayout()), TM(tm), - MCE(mce), MCPEs(0), MJTEs(0), + MCE(mce), MCPEs(nullptr), MJTEs(nullptr), IsPIC(TM.getRelocationModel() == Reloc::PIC_), IsThumb(false) {} /// getBinaryCodeForInstr - This function, generated by the @@ -373,7 +374,7 @@ bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) { Subtarget = &TM.getSubtarget(); MCPEs = &MF.getConstantPool()->getConstants(); - MJTEs = 0; + MJTEs = nullptr; if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables(); IsPIC = TM.getRelocationModel() == Reloc::PIC_; IsThumb = MF.getInfo()->isThumbFunction(); diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index ba05171..ce264ee 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "arm-cp-islands" #include "ARM.h" #include "ARMMachineFunctionInfo.h" #include "MCTargetDesc/ARMAddressingModes.h" @@ -36,6 +35,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "arm-cp-islands" + STATISTIC(NumCPEs, "Number of constpool entries"); STATISTIC(NumSplit, "Number of uncond branches inserted"); STATISTIC(NumCBrFixed, "Number of cond branches fixed"); @@ -593,7 +594,7 @@ ARMConstantIslands::CPEntry if (CPEs[i].CPEMI == CPEMI) return &CPEs[i]; } - return NULL; + return nullptr; } /// getCPELogAlign - Returns the required alignment of the constant pool entry @@ -1102,7 +1103,7 @@ bool ARMConstantIslands::decrementCPEReferenceCount(unsigned CPI, assert(CPE && "Unexpected!"); if (--CPE->RefCount == 0) { removeDeadCPEMI(CPEMI); - CPE->CPEMI = NULL; + CPE->CPEMI = nullptr; --NumCPEs; return true; } @@ -1135,7 +1136,7 @@ int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) if (CPEs[i].CPEMI == CPEMI) 
continue; // Removing CPEs can leave empty entries, skip - if (CPEs[i].CPEMI == NULL) + if (CPEs[i].CPEMI == nullptr) continue; if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(), U.NegOk)) { @@ -1317,7 +1318,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, ++MI; unsigned CPUIndex = CPUserIndex+1; unsigned NumCPUsers = CPUsers.size(); - MachineInstr *LastIT = 0; + MachineInstr *LastIT = nullptr; for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI); Offset < BaseInsertOffset; Offset += TII->GetInstSizeInBytes(MI), MI = std::next(MI)) { @@ -1491,7 +1492,7 @@ bool ARMConstantIslands::removeUnusedCPEntries() { for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) { if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) { removeDeadCPEMI(CPEs[j].CPEMI); - CPEs[j].CPEMI = NULL; + CPEs[j].CPEMI = nullptr; MadeChange = true; } } @@ -1844,7 +1845,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() { // FIXME: After the tables are shrunk, can we get rid some of the // constantpool tables? MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); - if (MJTI == 0) return false; + if (!MJTI) return false; const std::vector &JT = MJTI->getJumpTables(); for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) { @@ -1970,7 +1971,7 @@ bool ARMConstantIslands::reorderThumb2JumpTables() { bool MadeChange = false; MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); - if (MJTI == 0) return false; + if (!MJTI) return false; const std::vector &JT = MJTI->getJumpTables(); for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) { @@ -2012,7 +2013,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // try to move it; otherwise, create a new block following the jump // table that branches back to the actual target. This is a very simple // heuristic. FIXME: We can definitely improve it. - MachineBasicBlock *TBB = 0, *FBB = 0; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector Cond; SmallVector CondPrior; MachineFunction::iterator BBi = BB; @@ -2032,7 +2033,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // Update numbering to account for the block being moved. MF->RenumberBlocks(); ++NumJTMoved; - return NULL; + return nullptr; } // Create a new MBB for the code after the jump BB. 
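Returning to the CC_ARM_AAPCS_Custom_HA hunk above: the property it implements is that a homogeneous aggregate is allocated all-or-nothing, as one contiguous block of registers of the member type, with a whole-aggregate fall-back to the stack (after which the remaining VFP registers are marked unavailable, per the AAPCS rule C.2.vfp comment). A toy model of the AllocateRegBlock contract over the sixteen S registers (the names and the scenario are assumptions for illustration):

#include <cstdio>

struct HFA { float a, b, c, d; }; // four f32 members: a candidate for S0-S3

// Returns the first register of a free contiguous block, or -1 when the
// aggregate must go to the stack instead.
static int allocateRegBlock(bool (&Taken)[16], unsigned NumMembers) {
  for (unsigned First = 0; First + NumMembers <= 16; ++First) {
    bool Free = true;
    for (unsigned I = 0; I < NumMembers; ++I)
      Free = Free && !Taken[First + I];
    if (!Free)
      continue;
    for (unsigned I = 0; I < NumMembers; ++I)
      Taken[First + I] = true; // claim the whole block at once
    return static_cast<int>(First);
  }
  return -1;
}

int main() {
  bool Taken[16] = {false};
  Taken[0] = true; // pretend S0 already holds an earlier float argument
  int First = allocateRegBlock(Taken, 4);
  std::printf("HFA lands in S%d..S%d\n", First, First + 3); // S1..S4
  return 0;
}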
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index bd4ee44..6045738 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "arm-pseudo" #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" @@ -23,6 +22,7 @@ #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/CommandLine.h" @@ -31,6 +31,8 @@ #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "arm-pseudo" + static cl::opt<bool> VerifyARMPseudo("verify-arm-pseudo-expand", cl::Hidden, cl::desc("Verify machine code after expanding ARM pseudos")); @@ -345,7 +347,7 @@ static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) { std::lower_bound(NEONLdStTable, NEONLdStTable + NumEntries, Opcode); if (I != NEONLdStTable + NumEntries && I->PseudoOpc == Opcode) return I; - return NULL; + return nullptr; } /// GetDSubRegs - Get 4 D subregisters of a Q, QQ, or QQQQ register, @@ -614,6 +616,39 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, MI.eraseFromParent(); } +static bool IsAnAddressOperand(const MachineOperand &MO) { + // This check is overly conservative. Unless we are certain that the machine + // operand is not a symbol reference, we return that it is a symbol reference. + // This is important as the load pair may not be split up on Windows. + switch (MO.getType()) { + case MachineOperand::MO_Register: + case MachineOperand::MO_Immediate: + case MachineOperand::MO_CImmediate: + case MachineOperand::MO_FPImmediate: + return false; + case MachineOperand::MO_MachineBasicBlock: + return true; + case MachineOperand::MO_FrameIndex: + return false; + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_TargetIndex: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_BlockAddress: + return true; + case MachineOperand::MO_RegisterMask: + case MachineOperand::MO_RegisterLiveOut: + return false; + case MachineOperand::MO_Metadata: + case MachineOperand::MO_MCSymbol: + return true; + case MachineOperand::MO_CFIIndex: + return false; + } + llvm_unreachable("unhandled machine operand type"); +} + void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; @@ -624,10 +659,14 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, bool DstIsDead = MI.getOperand(0).isDead(); bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm; const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1); + bool RequiresBundling = STI->isTargetWindows() && IsAnAddressOperand(MO); MachineInstrBuilder LO16, HI16; if (!STI->hasV6T2Ops() && (Opcode == ARM::MOVi32imm || Opcode == ARM::MOVCCi32imm)) { + // FIXME Windows CE supports older ARM CPUs + assert(!STI->isTargetWindows() && "Windows on ARM requires ARMv7+"); + // Expand into a movi + orr.
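In the MOVW/MOVT form of this pseudo (the MO_Immediate case visible further down in this hunk), the constant is split into two independent 16-bit halves, Lo16 and Hi16. A quick standalone check of that arithmetic, with an assumed example value:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Imm = 0xDEADBEEF;            // arbitrary example constant
  uint32_t Lo16 = Imm & 0xffff;         // goes into MOVW (LO16)
  uint32_t Hi16 = (Imm >> 16) & 0xffff; // goes into MOVT (HI16)
  assert(((Hi16 << 16) | Lo16) == Imm); // the pair reassembles the constant
  return 0;
}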
LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg); HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri)) @@ -664,17 +703,29 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) .addReg(DstReg); - if (MO.isImm()) { + switch (MO.getType()) { + case MachineOperand::MO_Immediate: { unsigned Imm = MO.getImm(); unsigned Lo16 = Imm & 0xffff; unsigned Hi16 = (Imm >> 16) & 0xffff; LO16 = LO16.addImm(Lo16); HI16 = HI16.addImm(Hi16); - } else { + break; + } + case MachineOperand::MO_ExternalSymbol: { + const char *ES = MO.getSymbolName(); + unsigned TF = MO.getTargetFlags(); + LO16 = LO16.addExternalSymbol(ES, TF | ARMII::MO_LO16); + HI16 = HI16.addExternalSymbol(ES, TF | ARMII::MO_HI16); + break; + } + default: { const GlobalValue *GV = MO.getGlobal(); unsigned TF = MO.getTargetFlags(); LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16); HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16); + break; + } } LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); @@ -682,6 +733,9 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, LO16.addImm(Pred).addReg(PredReg); HI16.addImm(Pred).addReg(PredReg); + if (RequiresBundling) + finalizeBundle(MBB, &*LO16, &*MBBI); + TransferImpOps(MI, LO16, HI16); MI.eraseFromParent(); } diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index c442444..6f8fb1a 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -166,8 +166,6 @@ class ARMFastISel final : public FastISel { // Utility routines. private: - unsigned constrainOperandRegClass(const MCInstrDesc &II, unsigned OpNum, - unsigned Op); bool isTypeLegal(Type *Ty, MVT &VT); bool isLoadTypeLegal(Type *Ty, MVT &VT); bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, @@ -191,6 +189,8 @@ class ARMFastISel final : public FastISel { unsigned ARMSelectCallOp(bool UseReg); unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT); + const TargetLowering *getTargetLowering() { return TM.getTargetLowering(); } + // Call handling routines. private: CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, @@ -283,23 +283,6 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) { return MIB; } -unsigned ARMFastISel::constrainOperandRegClass(const MCInstrDesc &II, - unsigned Op, unsigned OpNum) { - if (TargetRegisterInfo::isVirtualRegister(Op)) { - const TargetRegisterClass *RegClass = - TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF); - if (!MRI.constrainRegClass(Op, RegClass)) { - // If it's not legal to COPY between the register classes, something - // has gone very wrong before we got here. - unsigned NewOp = createResultReg(RegClass); - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), NewOp).addReg(Op)); - return NewOp; - } - } - return Op; -} - unsigned ARMFastISel::FastEmitInst_r(unsigned MachineInstOpcode, const TargetRegisterClass *RC, unsigned Op0, bool Op0IsKill) { @@ -769,7 +752,7 @@ bool ARMFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) { // Computes the address to get to an object. bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { // Some boilerplate from the X86 FastISel. 
- const User *U = NULL; + const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; if (const Instruction *I = dyn_cast<Instruction>(Obj)) { // Don't walk into other basic blocks unless the object is an alloca from @@ -1400,7 +1383,7 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, const APInt &CIVal = ConstInt->getValue(); Imm = (isZExt) ? (int)CIVal.getZExtValue() : (int)CIVal.getSExtValue(); // For INT_MIN/LONG_MIN (i.e., 0x80000000) we need to use a cmp, rather - // then a cmn, because there is no way to represent 2147483648 as a + // than a cmn, because there is no way to represent 2147483648 as a // signed 32-bit int. if (Imm < 0 && Imm != (int)0x80000000) { isNegativeImm = true; @@ -2182,7 +2165,8 @@ unsigned ARMFastISel::getLibcallReg(const Twine &Name) { if (!LCREVT.isSimple()) return 0; GlobalValue *GV = new GlobalVariable(M, Type::getInt32Ty(*Context), false, - GlobalValue::ExternalLinkage, 0, Name); + GlobalValue::ExternalLinkage, nullptr, + Name); assert(GV->getType() == GVTy && "We miscomputed the type for the global!"); return ARMMaterializeGV(GV, LCREVT.getSimpleVT()); } @@ -2286,7 +2270,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { } bool ARMFastISel::SelectCall(const Instruction *I, - const char *IntrMemName = 0) { + const char *IntrMemName = nullptr) { const CallInst *CI = cast<CallInst>(I); const Value *Callee = CI->getCalledValue(); @@ -3092,6 +3076,6 @@ namespace llvm { TM.Options.NoFramePointerElim = true; return new ARMFastISel(funcInfo, libInfo); } - return 0; + return nullptr; } } diff --git a/lib/Target/ARM/ARMFeatures.h b/lib/Target/ARM/ARMFeatures.h index a30f4cd..e191a3c 100644 --- a/lib/Target/ARM/ARMFeatures.h +++ b/lib/Target/ARM/ARMFeatures.h @@ -1,4 +1,4 @@ -//===-- ARMFeatures.h - Checks for ARM instruction features ------*- C++ -*-===// +//===-- ARMFeatures.h - Checks for ARM instruction features -----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -16,11 +16,11 @@ #include "MCTargetDesc/ARMMCTargetDesc.h" +namespace llvm { + template <typename InstrType> // could be MachineInstr or MCInst bool IsCPSRDead(InstrType *Instr); -namespace llvm { - template <typename InstrType> // could be MachineInstr or MCInst inline bool isV8EligibleForIT(InstrType *Instr) { switch (Instr->getOpcode()) { diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 36ecfca..0caf4bf 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -87,7 +87,7 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { static bool isCSRestore(MachineInstr *MI, const ARMBaseInstrInfo &TII, - const uint16_t *CSRegs) { + const MCPhysReg *CSRegs) { // Integer spill area is handled with "pop". if (isPopOpcode(MI->getOpcode())) { // The first two operands are predicates.
The last two are @@ -142,6 +142,14 @@ static int sizeOfSPAdjustment(const MachineInstr *MI) { return count; } +static bool WindowsRequiresStackProbe(const MachineFunction &MF, + size_t StackSizeInBytes) { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + if (MFI->getStackProtectorIndex() > 0) + return StackSizeInBytes >= 4080; + return StackSizeInBytes >= 4096; +} + void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -149,15 +157,16 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { ARMFunctionInfo *AFI = MF.getInfo(); MachineModuleInfo &MMI = MF.getMMI(); MCContext &Context = MMI.getContext(); + const TargetMachine &TM = MF.getTarget(); const MCRegisterInfo *MRI = Context.getRegisterInfo(); const ARMBaseRegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); + static_cast(TM.getRegisterInfo()); const ARMBaseInstrInfo &TII = - *static_cast(MF.getTarget().getInstrInfo()); + *static_cast(TM.getInstrInfo()); assert(!AFI->isThumb1OnlyFunction() && "This emitPrologue does not support Thumb1!"); bool isARM = !AFI->isThumbFunction(); - unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); + unsigned Align = TM.getFrameLowering()->getStackAlignment(); unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align); unsigned NumBytes = MFI->getStackSize(); const std::vector &CSI = MFI->getCalleeSavedInfo(); @@ -187,7 +196,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { .addCFIIndex(CFIIndex); } - if (!AFI->hasStackFrame()) { + if (!AFI->hasStackFrame() && + (!STI.isTargetWindows() || !WindowsRequiresStackProbe(MF, NumBytes))) { if (NumBytes - ArgRegsSaveSize != 0) { emitSPUpdate(isARM, MBB, MBBI, dl, TII, -(NumBytes - ArgRegsSaveSize), MachineInstr::FrameSetup); @@ -284,6 +294,51 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { } else NumBytes = DPRCSOffset; + if (STI.isTargetWindows() && WindowsRequiresStackProbe(MF, NumBytes)) { + uint32_t NumWords = NumBytes >> 2; + + if (NumWords < 65536) + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), ARM::R4) + .addImm(NumWords) + .setMIFlags(MachineInstr::FrameSetup)); + else + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R4) + .addImm(NumWords) + .setMIFlags(MachineInstr::FrameSetup); + + switch (TM.getCodeModel()) { + case CodeModel::Small: + case CodeModel::Medium: + case CodeModel::Default: + case CodeModel::Kernel: + BuildMI(MBB, MBBI, dl, TII.get(ARM::tBL)) + .addImm((unsigned)ARMCC::AL).addReg(0) + .addExternalSymbol("__chkstk") + .addReg(ARM::R4, RegState::Implicit) + .setMIFlags(MachineInstr::FrameSetup); + break; + case CodeModel::Large: + case CodeModel::JITDefault: + BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R12) + .addExternalSymbol("__chkstk") + .setMIFlags(MachineInstr::FrameSetup); + + BuildMI(MBB, MBBI, dl, TII.get(ARM::tBLXr)) + .addImm((unsigned)ARMCC::AL).addReg(0) + .addReg(ARM::R12, RegState::Kill) + .addReg(ARM::R4, RegState::Implicit) + .setMIFlags(MachineInstr::FrameSetup); + break; + } + + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), + ARM::SP) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::R4, RegState::Kill) + .setMIFlags(MachineInstr::FrameSetup))); + NumBytes = 0; + } + unsigned adjustedGPRCS1Size = GPRCS1Size; if (NumBytes) { // Adjust SP after all the callee-save spills. 
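The WindowsRequiresStackProbe helper introduced in the hunk above captures the Windows convention that a prologue growing the stack by a page or more must call __chkstk to touch each page before moving SP; the threshold drops from 4096 to 4080 bytes when a stack-protector slot is carved out of the first page. A minimal standalone sketch of that decision, with the MachineFunction plumbing replaced by plain parameters (the names below are illustrative, not LLVM API):

    #include <cstddef>

    // Sketch: does a Windows/ARM prologue need a __chkstk probe?
    // A page is 4096 bytes; when a stack-protector slot is present,
    // the guard cookie shrinks the safely-touchable first page to 4080.
    static bool windowsRequiresStackProbe(bool HasStackProtector,
                                          std::size_t StackSizeInBytes) {
      const std::size_t PageSize = 4096;
      const std::size_t ProtectedPageSize = 4080;
      if (HasStackProtector)
        return StackSizeInBytes >= ProtectedPageSize;
      return StackSizeInBytes >= PageSize;
    }
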
@@ -316,10 +371,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock::iterator Pos = ++GPRCS1Push; BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); - for (std::vector::const_iterator I = CSI.begin(), - E = CSI.end(); I != E; ++I) { - unsigned Reg = I->getReg(); - int FI = I->getFrameIdx(); + for (const auto &Entry : CSI) { + unsigned Reg = Entry.getReg(); + int FI = Entry.getFrameIdx(); switch (Reg) { case ARM::R8: case ARM::R9: @@ -382,10 +436,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } - for (std::vector::const_iterator I = CSI.begin(), - E = CSI.end(); I != E; ++I) { - unsigned Reg = I->getReg(); - int FI = I->getFrameIdx(); + for (const auto &Entry : CSI) { + unsigned Reg = Entry.getReg(); + int FI = Entry.getFrameIdx(); switch (Reg) { case ARM::R8: case ARM::R9: @@ -411,7 +464,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { do { MachineBasicBlock::iterator Push = DPRCSPush++; if (!HasFP) { - CFAOffset -= sizeOfSPAdjustment(Push);; + CFAOffset -= sizeOfSPAdjustment(Push); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset)); BuildMI(MBB, DPRCSPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) @@ -419,10 +472,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { } } while (DPRCSPush->getOpcode() == ARM::VSTMDDB_UPD); - for (std::vector::const_iterator I = CSI.begin(), - E = CSI.end(); I != E; ++I) { - unsigned Reg = I->getReg(); - int FI = I->getFrameIdx(); + for (const auto &Entry : CSI) { + unsigned Reg = Entry.getReg(); + int FI = Entry.getFrameIdx(); if ((Reg >= ARM::D0 && Reg <= ARM::D31) && (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())) { unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); @@ -540,7 +592,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize); } else { // Unwind MBBI to point to first LDR / VLDRD. 
- const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); if (MBBI != MBB.begin()) { do { --MBBI; @@ -1205,12 +1257,9 @@ bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, static unsigned GetFunctionSizeInBytes(const MachineFunction &MF, const ARMBaseInstrInfo &TII) { unsigned FnSize = 0; - for (MachineFunction::const_iterator MBBI = MF.begin(), E = MF.end(); - MBBI != E; ++MBBI) { - const MachineBasicBlock &MBB = *MBBI; - for (MachineBasicBlock::const_iterator I = MBB.begin(),E = MBB.end(); - I != E; ++I) - FnSize += TII.GetInstSizeInBytes(I); + for (auto &MBB : MF) { + for (auto &MI : MBB) + FnSize += TII.GetInstSizeInBytes(&MI); } return FnSize; } @@ -1223,21 +1272,21 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, const TargetFrameLowering *TFI) { const ARMFunctionInfo *AFI = MF.getInfo(); unsigned Limit = (1 << 12) - 1; - for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) { - for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); - I != E; ++I) { - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - if (!I->getOperand(i).isFI()) continue; + for (auto &MBB : MF) { + for (auto &MI : MBB) { + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (!MI.getOperand(i).isFI()) + continue; // When using ADDri to get the address of a stack object, 255 is the // largest offset guaranteed to fit in the immediate offset. - if (I->getOpcode() == ARM::ADDri) { + if (MI.getOpcode() == ARM::ADDri) { Limit = std::min(Limit, (1U << 8) - 1); break; } // Otherwise check the addressing mode. - switch (I->getDesc().TSFlags & ARMII::AddrModeMask) { + switch (MI.getDesc().TSFlags & ARMII::AddrModeMask) { case ARMII::AddrMode3: case ARMII::AddrModeT2_i8: Limit = std::min(Limit, (1U << 8) - 1); @@ -1374,7 +1423,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Don't spill FP if the frame can be eliminated. This is determined // by scanning the callee-save registers to see if any is used. - const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); for (unsigned i = 0; CSRegs[i]; ++i) { unsigned Reg = CSRegs[i]; bool Spilled = false; @@ -1486,6 +1535,10 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, if (hasFP(MF)) { MRI.setPhysRegUsed(FramePtr); + auto FPPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(), + FramePtr); + if (FPPos != UnspilledCS1GPRs.end()) + UnspilledCS1GPRs.erase(FPPos); NumGPRSpills++; } @@ -1681,7 +1734,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { if (MF.getFunction()->isVarArg()) report_fatal_error("Segmented stacks do not support vararg functions."); if (!ST->isTargetAndroid() && !ST->isTargetLinux()) - report_fatal_error("Segmented stacks not supported on this platfrom."); + report_fatal_error("Segmented stacks not supported on this platform."); MachineBasicBlock &prologueMBB = MF.front(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -1693,6 +1746,12 @@ void ARMFrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { ARMFunctionInfo *ARMFI = MF.getInfo(); DebugLoc DL; + uint64_t StackSize = MFI->getStackSize(); + + // Do not generate a prologue for functions with a stack of size zero + if (StackSize == 0) + return; + // Use R4 and R5 as scratch registers. // We save R4 and R5 before use and restore them before leaving the function. 
unsigned ScratchReg0 = ARM::R4; @@ -1722,8 +1781,6 @@ void ARMFrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MF.push_front(PrevStackMBB); // The required stack size that is aligned to ARM constant criterion. - uint64_t StackSize = MFI->getStackSize(); - AlignedStackSize = alignToARMConstant(StackSize); // When the frame size is less than 256 we just compare the stack diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h index 524ee36..981d320 100644 --- a/lib/Target/ARM/ARMFrameLowering.h +++ b/lib/Target/ARM/ARMFrameLowering.h @@ -57,7 +57,7 @@ public: void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const override; - void adjustForSegmentedStacks(MachineFunction &MF) const; + void adjustForSegmentedStacks(MachineFunction &MF) const override; private: void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp index 61d4e12..0885c4e 100644 --- a/lib/Target/ARM/ARMHazardRecognizer.cpp +++ b/lib/Target/ARM/ARMHazardRecognizer.cpp @@ -77,7 +77,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { } void ARMHazardRecognizer::Reset() { - LastMI = 0; + LastMI = nullptr; FpMLxStalls = 0; ScoreboardHazardRecognizer::Reset(); } @@ -95,7 +95,7 @@ void ARMHazardRecognizer::EmitInstruction(SUnit *SU) { void ARMHazardRecognizer::AdvanceCycle() { if (FpMLxStalls && --FpMLxStalls == 0) // Stalled for 4 cycles but still can't schedule any other instructions. - LastMI = 0; + LastMI = nullptr; ScoreboardHazardRecognizer::AdvanceCycle(); } diff --git a/lib/Target/ARM/ARMHazardRecognizer.h b/lib/Target/ARM/ARMHazardRecognizer.h index e88cd0d..a8198e2 100644 --- a/lib/Target/ARM/ARMHazardRecognizer.h +++ b/lib/Target/ARM/ARMHazardRecognizer.h @@ -35,7 +35,7 @@ public: ARMHazardRecognizer(const InstrItineraryData *ItinData, const ScheduleDAG *DAG) : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"), - LastMI(0) {} + LastMI(nullptr) {} HazardType getHazardType(SUnit *SU, int Stalls) override; void Reset() override; diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 70e11c5..08d598d 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "arm-isel" #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMTargetMachine.h" @@ -37,6 +36,8 @@ using namespace llvm; +#define DEBUG_TYPE "arm-isel" + static cl::opt DisableShifterOp("disable-shifter-op", cl::Hidden, cl::desc("Disable isel of shifter-op"), @@ -72,6 +73,13 @@ public: Subtarget(&TM.getSubtarget()) { } + bool runOnMachineFunction(MachineFunction &MF) override { + // Reset the subtarget each time through. 
+ Subtarget = &TM.getSubtarget(); + SelectionDAGISel::runOnMachineFunction(MF); + return true; + } + const char *getPassName() const override { return "ARM Instruction Selection"; } @@ -397,7 +405,7 @@ void ARMDAGToDAGISel::PreprocessISelDAG() { N1 = CurDAG->getNode(ISD::SHL, SDLoc(N1), MVT::i32, N1, CurDAG->getConstant(TZ, MVT::i32)); CurDAG->UpdateNodeOperands(N, N0, N1); - } + } } /// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS @@ -1440,7 +1448,7 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) { LoadSDNode *LD = cast(N); ISD::MemIndexedMode AM = LD->getAddressingMode(); if (AM == ISD::UNINDEXED) - return NULL; + return nullptr; EVT LoadedVT = LD->getMemoryVT(); SDValue Offset, AMOpc; @@ -1506,14 +1514,14 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) { } } - return NULL; + return nullptr; } SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) { LoadSDNode *LD = cast(N); ISD::MemIndexedMode AM = LD->getAddressingMode(); if (AM == ISD::UNINDEXED) - return NULL; + return nullptr; EVT LoadedVT = LD->getMemoryVT(); bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; @@ -1540,7 +1548,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) { Opcode = isPre ? ARM::t2LDRB_PRE : ARM::t2LDRB_POST; break; default: - return NULL; + return nullptr; } Match = true; } @@ -1554,7 +1562,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) { MVT::Other, Ops); } - return NULL; + return nullptr; } /// \brief Form a GPRPair pseudo register from a pair of GPR regs. @@ -1699,10 +1707,10 @@ static bool isVSTfixed(unsigned Opc) case ARM::VST1d16wb_fixed : return true; case ARM::VST1d32wb_fixed : return true; case ARM::VST1d64wb_fixed : return true; - case ARM::VST1q8wb_fixed : return true; - case ARM::VST1q16wb_fixed : return true; - case ARM::VST1q32wb_fixed : return true; - case ARM::VST1q64wb_fixed : return true; + case ARM::VST1q8wb_fixed : return true; + case ARM::VST1q16wb_fixed : return true; + case ARM::VST1q32wb_fixed : return true; + case ARM::VST1q64wb_fixed : return true; case ARM::VST1d64TPseudoWB_fixed : return true; case ARM::VST1d64QPseudoWB_fixed : return true; case ARM::VST2d8wb_fixed : return true; @@ -1776,7 +1784,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, SDValue MemAddr, Align; unsigned AddrOpIdx = isUpdating ? 1 : 2; if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) - return NULL; + return nullptr; SDValue Chain = N->getOperand(0); EVT VT = N->getValueType(0); @@ -1895,7 +1903,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1)); if (isUpdating) ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2)); - return NULL; + return nullptr; } SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, @@ -1909,7 +1917,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, unsigned AddrOpIdx = isUpdating ? 1 : 2; unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1) if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) - return NULL; + return nullptr; MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); @@ -2055,7 +2063,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned AddrOpIdx = isUpdating ? 1 : 2; unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 
2 : 1) if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align)) - return NULL; + return nullptr; MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); @@ -2160,7 +2168,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1)); if (isUpdating) ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2)); - return NULL; + return nullptr; } SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, @@ -2171,7 +2179,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, SDValue MemAddr, Align; if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align)) - return NULL; + return nullptr; MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); @@ -2243,7 +2251,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1)); if (isUpdating) ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2)); - return NULL; + return nullptr; } SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, @@ -2282,7 +2290,7 @@ SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned) { if (!Subtarget->hasV6T2Ops()) - return NULL; + return nullptr; unsigned Opc = isSigned ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX) @@ -2295,7 +2303,7 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, // The immediate is a mask of the low bits iff imm & (imm+1) == 0 if (And_imm & (And_imm + 1)) - return NULL; + return nullptr; unsigned Srl_imm = 0; if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, @@ -2315,7 +2323,7 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, SDValue Ops[] = { N->getOperand(0).getOperand(0), CurDAG->getTargetConstant(LSB, MVT::i32), getAL(CurDAG), Reg0, Reg0 }; - return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); } // ARM models shift instructions as MOVsi with shifter operand. @@ -2325,17 +2333,17 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, MVT::i32); SDValue Ops[] = { N->getOperand(0).getOperand(0), ShOpc, getAL(CurDAG), Reg0, Reg0 }; - return CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops, 5); + return CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops); } SDValue Ops[] = { N->getOperand(0).getOperand(0), CurDAG->getTargetConstant(LSB, MVT::i32), CurDAG->getTargetConstant(Width, MVT::i32), - getAL(CurDAG), Reg0 }; - return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); + getAL(CurDAG), Reg0 }; + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); } } - return NULL; + return nullptr; } // Otherwise, we're looking for a shift of a shift @@ -2349,16 +2357,16 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N, unsigned Width = 32 - Srl_imm - 1; int LSB = Srl_imm - Shl_imm; if (LSB < 0) - return NULL; + return nullptr; SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); SDValue Ops[] = { N->getOperand(0).getOperand(0), CurDAG->getTargetConstant(LSB, MVT::i32), CurDAG->getTargetConstant(Width, MVT::i32), getAL(CurDAG), Reg0 }; - return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); } } - return NULL; + return nullptr; } /// Target-specific DAG combining for ISD::XOR. 
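Several hunks in this file drop the trailing operand count from calls such as SelectNodeTo(N, Opc, MVT::i32, Ops, 5). This tracks the LLVM 3.5-era migration of SelectionDAG entry points to ArrayRef operand lists, where the array's length is deduced at the call site, so a count argument that could silently disagree with the array is no longer possible. A rough sketch of the idea, using simplified stand-in types rather than the real SelectionDAG signatures:

    #include <cstddef>

    // Sketch: an ArrayRef-like view knows its own length, so APIs taking
    // (pointer, count) pairs can be replaced by a single argument that a
    // C array converts to implicitly.
    template <typename T> struct ArrayRefSketch {
      const T *Data;
      std::size_t Length;
      template <std::size_t N>
      ArrayRefSketch(const T (&Arr)[N]) : Data(Arr), Length(N) {}
    };

    static int sum(ArrayRefSketch<int> Ops) {
      int Total = 0;
      for (std::size_t I = 0; I != Ops.Length; ++I)
        Total += Ops.Data[I];
      return Total;
    }

    // int Ops[] = {1, 2, 3};
    // sum(Ops);  // length 3 deduced from the array type; no count to get wrong
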
@@ -2377,10 +2385,10 @@ SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){ EVT VT = N->getValueType(0); if (Subtarget->isThumb1Only()) - return NULL; + return nullptr; if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA) - return NULL; + return nullptr; SDValue ADDSrc0 = XORSrc0.getOperand(0); SDValue ADDSrc1 = XORSrc0.getOperand(1); @@ -2391,13 +2399,13 @@ SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){ unsigned Size = XType.getSizeInBits() - 1; if (ADDSrc1 == XORSrc1 && ADDSrc0 == SRASrc0 && - XType.isInteger() && SRAConstant != NULL && + XType.isInteger() && SRAConstant != nullptr && Size == SRAConstant->getZExtValue()) { unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS; return CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0); } - return NULL; + return nullptr; } SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) { @@ -2414,7 +2422,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { if (N->isMachineOpcode()) { N->setNodeId(-1); - return NULL; // Already selected. + return nullptr; // Already selected. } switch (N->getOpcode()) { @@ -2478,7 +2486,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { Ops); } ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0)); - return NULL; + return nullptr; } // Other cases are autogenerated. @@ -2492,14 +2500,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { if (Subtarget->isThumb1Only()) { SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, Ops, 4); + return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, Ops); } else { unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ? ARM::t2ADDri : ARM::ADDri); SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32), getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), CurDAG->getRegister(0, MVT::i32) }; - return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5); + return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops); } } case ISD::SRL: @@ -2526,10 +2534,10 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); if (Subtarget->isThumb()) { SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; - return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops, 6); + return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops); } else { SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; - return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops, 7); + return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops); } } if (isPowerOf2_32(RHSV+1)) { // 2^n-1? 
@@ -2542,10 +2550,10 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); if (Subtarget->isThumb()) { SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; - return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 6); + return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops); } else { SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 }; - return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops, 7); + return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops); } } } @@ -2660,7 +2668,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } } case ISD::LOAD: { - SDNode *ResNode = 0; + SDNode *ResNode = nullptr; if (Subtarget->isThumb() && Subtarget->hasThumb2()) ResNode = SelectT2IndexedLoad(N); else @@ -2707,13 +2715,13 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } ReplaceUses(SDValue(N, 0), SDValue(Chain.getNode(), Chain.getResNo())); - return NULL; + return nullptr; } case ARMISD::VZIP: { unsigned Opc = 0; EVT VT = N->getValueType(0); switch (VT.getSimpleVT().SimpleTy) { - default: return NULL; + default: return nullptr; case MVT::v8i8: Opc = ARM::VZIPd8; break; case MVT::v4i16: Opc = ARM::VZIPd16; break; case MVT::v2f32: @@ -2733,7 +2741,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { unsigned Opc = 0; EVT VT = N->getValueType(0); switch (VT.getSimpleVT().SimpleTy) { - default: return NULL; + default: return nullptr; case MVT::v8i8: Opc = ARM::VUZPd8; break; case MVT::v4i16: Opc = ARM::VUZPd16; break; case MVT::v2f32: @@ -2753,7 +2761,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { unsigned Opc = 0; EVT VT = N->getValueType(0); switch (VT.getSimpleVT().SimpleTy) { - default: return NULL; + default: return nullptr; case MVT::v8i8: Opc = ARM::VTRNd8; break; case MVT::v4i16: Opc = ARM::VTRNd16; break; case MVT::v2f32: @@ -2834,7 +2842,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD1q16wb_fixed, ARM::VLD1q32wb_fixed, ARM::VLD1q64wb_fixed }; - return SelectVLD(N, true, 1, DOpcodes, QOpcodes, 0); + return SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr); } case ARMISD::VLD2_UPD: { @@ -2845,7 +2853,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed, ARM::VLD2q16PseudoWB_fixed, ARM::VLD2q32PseudoWB_fixed }; - return SelectVLD(N, true, 2, DOpcodes, QOpcodes, 0); + return SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr); } case ARMISD::VLD3_UPD: { @@ -2912,7 +2920,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST1q16wb_fixed, ARM::VST1q32wb_fixed, ARM::VST1q64wb_fixed }; - return SelectVST(N, true, 1, DOpcodes, QOpcodes, 0); + return SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr); } case ARMISD::VST2_UPD: { @@ -2923,7 +2931,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed, ARM::VST2q16PseudoWB_fixed, ARM::VST2q32PseudoWB_fixed }; - return SelectVST(N, true, 2, DOpcodes, QOpcodes, 0); + return SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr); } case ARMISD::VST3_UPD: { @@ -3047,7 +3055,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ReplaceUses(SDValue(N, 1), Result); } ReplaceUses(SDValue(N, 2), OutChain); - return NULL; + return nullptr; } case Intrinsic::arm_stlexd: case Intrinsic::arm_strexd: { @@ -3093,7 +3101,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD1d32, ARM::VLD1d64 }; static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16, ARM::VLD1q32, ARM::VLD1q64}; - return SelectVLD(N, false, 1, DOpcodes, QOpcodes, 0); + return SelectVLD(N, 
false, 1, DOpcodes, QOpcodes, nullptr); } case Intrinsic::arm_neon_vld2: { @@ -3101,7 +3109,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VLD2d32, ARM::VLD1q64 }; static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo, ARM::VLD2q32Pseudo }; - return SelectVLD(N, false, 2, DOpcodes, QOpcodes, 0); + return SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr); } case Intrinsic::arm_neon_vld3: { @@ -3164,7 +3172,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST1d32, ARM::VST1d64 }; static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16, ARM::VST1q32, ARM::VST1q64 }; - return SelectVST(N, false, 1, DOpcodes, QOpcodes, 0); + return SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr); } case Intrinsic::arm_neon_vst2: { @@ -3172,7 +3180,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { ARM::VST2d32, ARM::VST1q64 }; static uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo, ARM::VST2q32Pseudo }; - return SelectVST(N, false, 2, DOpcodes, QOpcodes, 0); + return SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr); } case Intrinsic::arm_neon_vst3: { @@ -3306,7 +3314,8 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){ // them into a GPRPair. SDLoc dl(N); - SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1) : SDValue(0,0); + SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1) + : SDValue(nullptr,0); SmallVector OpChanged; // Glue node will be appended late. @@ -3388,7 +3397,7 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){ // Update the original glue user. std::vector Ops(GU->op_begin(), GU->op_end()-1); Ops.push_back(T1.getValue(1)); - CurDAG->UpdateNodeOperands(GU, &Ops[0], Ops.size()); + CurDAG->UpdateNodeOperands(GU, Ops); GU = T1.getNode(); } else { @@ -3435,11 +3444,10 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){ if (Glue.getNode()) AsmNodeOperands.push_back(Glue); if (!Changed) - return NULL; + return nullptr; SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N), - CurDAG->getVTList(MVT::Other, MVT::Glue), &AsmNodeOperands[0], - AsmNodeOperands.size()); + CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands); New->setNodeId(-1); return New.getNode(); } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 2ebad8e..00d07e8 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "arm-isel" #include "ARMISelLowering.h" #include "ARMCallingConv.h" #include "ARMConstantPoolValue.h" @@ -37,18 +36,22 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Type.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include using namespace llvm; +#define DEBUG_TYPE "arm-isel" + STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); @@ -79,7 +82,7 @@ namespace { } // The APCS parameter registers. 
-static const uint16_t GPRArgRegs[] = { +static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; @@ -155,7 +158,8 @@ void ARMTargetLowering::addQRTypeForNEON(MVT VT) { static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) { if (TM.getSubtarget().isTargetMachO()) return new TargetLoweringObjectFileMachO(); - + if (TM.getSubtarget().isTargetWindows()) + return new TargetLoweringObjectFileCOFF(); return new ARMElfTargetObjectFile(); } @@ -170,7 +174,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) if (Subtarget->isTargetMachO()) { // Uses VFP for Thumb libfuncs if available. if (Subtarget->isThumb() && Subtarget->hasVFP2() && - Subtarget->hasARMOps()) { + Subtarget->hasARMOps() && !TM.Options.UseSoftFloat) { // Single-precision floating-point arithmetic. setLibcallName(RTLIB::ADD_F32, "__addsf3vfp"); setLibcallName(RTLIB::SUB_F32, "__subsf3vfp"); @@ -246,173 +250,134 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } // These libcalls are not available in 32-bit. - setLibcallName(RTLIB::SHL_I128, 0); - setLibcallName(RTLIB::SRL_I128, 0); - setLibcallName(RTLIB::SRA_I128, 0); + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetMachO() && !Subtarget->isTargetWindows()) { - // Double-precision floating-point arithmetic helper functions - // RTABI chapter 4.1.2, Table 2 - setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd"); - setLibcallName(RTLIB::DIV_F64, "__aeabi_ddiv"); - setLibcallName(RTLIB::MUL_F64, "__aeabi_dmul"); - setLibcallName(RTLIB::SUB_F64, "__aeabi_dsub"); - setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::ARM_AAPCS); - - // Double-precision floating-point comparison helper functions - // RTABI chapter 4.1.2, Table 3 - setLibcallName(RTLIB::OEQ_F64, "__aeabi_dcmpeq"); - setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE); - setLibcallName(RTLIB::UNE_F64, "__aeabi_dcmpeq"); - setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETEQ); - setLibcallName(RTLIB::OLT_F64, "__aeabi_dcmplt"); - setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE); - setLibcallName(RTLIB::OLE_F64, "__aeabi_dcmple"); - setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE); - setLibcallName(RTLIB::OGE_F64, "__aeabi_dcmpge"); - setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE); - setLibcallName(RTLIB::OGT_F64, "__aeabi_dcmpgt"); - setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE); - setLibcallName(RTLIB::UO_F64, "__aeabi_dcmpun"); - setCmpLibcallCC(RTLIB::UO_F64, ISD::SETNE); - setLibcallName(RTLIB::O_F64, "__aeabi_dcmpun"); - setCmpLibcallCC(RTLIB::O_F64, ISD::SETEQ); - setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UO_F64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::O_F64, CallingConv::ARM_AAPCS); - - // Single-precision floating-point arithmetic helper functions - // RTABI chapter 4.1.2, Table 4 - setLibcallName(RTLIB::ADD_F32, "__aeabi_fadd"); - setLibcallName(RTLIB::DIV_F32, "__aeabi_fdiv"); - 
setLibcallName(RTLIB::MUL_F32, "__aeabi_fmul"); - setLibcallName(RTLIB::SUB_F32, "__aeabi_fsub"); - setLibcallCallingConv(RTLIB::ADD_F32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::DIV_F32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::MUL_F32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SUB_F32, CallingConv::ARM_AAPCS); - - // Single-precision floating-point comparison helper functions - // RTABI chapter 4.1.2, Table 5 - setLibcallName(RTLIB::OEQ_F32, "__aeabi_fcmpeq"); - setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE); - setLibcallName(RTLIB::UNE_F32, "__aeabi_fcmpeq"); - setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETEQ); - setLibcallName(RTLIB::OLT_F32, "__aeabi_fcmplt"); - setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE); - setLibcallName(RTLIB::OLE_F32, "__aeabi_fcmple"); - setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE); - setLibcallName(RTLIB::OGE_F32, "__aeabi_fcmpge"); - setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE); - setLibcallName(RTLIB::OGT_F32, "__aeabi_fcmpgt"); - setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE); - setLibcallName(RTLIB::UO_F32, "__aeabi_fcmpun"); - setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE); - setLibcallName(RTLIB::O_F32, "__aeabi_fcmpun"); - setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ); - setLibcallCallingConv(RTLIB::OEQ_F32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UNE_F32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::OLT_F32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::OLE_F32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::OGE_F32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::OGT_F32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UO_F32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::O_F32, CallingConv::ARM_AAPCS); - - // Floating-point to integer conversions. - // RTABI chapter 4.1.2, Table 6 - setLibcallName(RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz"); - setLibcallName(RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz"); - setLibcallName(RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz"); - setLibcallName(RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz"); - setLibcallName(RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz"); - setLibcallName(RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz"); - setLibcallName(RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz"); - setLibcallName(RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz"); - setLibcallCallingConv(RTLIB::FPTOSINT_F64_I32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::FPTOUINT_F64_I32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::FPTOSINT_F64_I64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::FPTOSINT_F32_I32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::FPTOUINT_F32_I32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::FPTOSINT_F32_I64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::ARM_AAPCS); - - // Conversions between floating types. - // RTABI chapter 4.1.2, Table 7 - setLibcallName(RTLIB::FPROUND_F64_F32, "__aeabi_d2f"); - setLibcallName(RTLIB::FPEXT_F32_F64, "__aeabi_f2d"); - setLibcallCallingConv(RTLIB::FPROUND_F64_F32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::FPEXT_F32_F64, CallingConv::ARM_AAPCS); - - // Integer to floating-point conversions. 
- // RTABI chapter 4.1.2, Table 8 - setLibcallName(RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d"); - setLibcallName(RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d"); - setLibcallName(RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d"); - setLibcallName(RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d"); - setLibcallName(RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f"); - setLibcallName(RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f"); - setLibcallName(RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f"); - setLibcallName(RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f"); - setLibcallCallingConv(RTLIB::SINTTOFP_I32_F64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UINTTOFP_I32_F64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SINTTOFP_I64_F64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UINTTOFP_I64_F64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SINTTOFP_I32_F32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UINTTOFP_I32_F32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SINTTOFP_I64_F32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UINTTOFP_I64_F32, CallingConv::ARM_AAPCS); - - // Long long helper functions - // RTABI chapter 4.2, Table 9 - setLibcallName(RTLIB::MUL_I64, "__aeabi_lmul"); - setLibcallName(RTLIB::SHL_I64, "__aeabi_llsl"); - setLibcallName(RTLIB::SRL_I64, "__aeabi_llsr"); - setLibcallName(RTLIB::SRA_I64, "__aeabi_lasr"); - setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SHL_I64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SRL_I64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SRA_I64, CallingConv::ARM_AAPCS); - - // Integer division functions - // RTABI chapter 4.3.1 - setLibcallName(RTLIB::SDIV_I8, "__aeabi_idiv"); - setLibcallName(RTLIB::SDIV_I16, "__aeabi_idiv"); - setLibcallName(RTLIB::SDIV_I32, "__aeabi_idiv"); - setLibcallName(RTLIB::SDIV_I64, "__aeabi_ldivmod"); - setLibcallName(RTLIB::UDIV_I8, "__aeabi_uidiv"); - setLibcallName(RTLIB::UDIV_I16, "__aeabi_uidiv"); - setLibcallName(RTLIB::UDIV_I32, "__aeabi_uidiv"); - setLibcallName(RTLIB::UDIV_I64, "__aeabi_uldivmod"); - setLibcallCallingConv(RTLIB::SDIV_I8, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SDIV_I16, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SDIV_I32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS); - - // Memory operations - // RTABI chapter 4.3.4 - setLibcallName(RTLIB::MEMCPY, "__aeabi_memcpy"); - setLibcallName(RTLIB::MEMMOVE, "__aeabi_memmove"); - setLibcallName(RTLIB::MEMSET, "__aeabi_memset"); - setLibcallCallingConv(RTLIB::MEMCPY, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::MEMMOVE, CallingConv::ARM_AAPCS); - setLibcallCallingConv(RTLIB::MEMSET, CallingConv::ARM_AAPCS); + static const struct { + const RTLIB::Libcall Op; + const char * const Name; + const CallingConv::ID CC; + const ISD::CondCode Cond; + } LibraryCalls[] = { + // Double-precision floating-point arithmetic helper functions + // RTABI chapter 4.1.2, Table 2 + { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, 
ISD::SETCC_INVALID }, + { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + + // Double-precision floating-point comparison helper functions + // RTABI chapter 4.1.2, Table 3 + { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, + { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, + + // Single-precision floating-point arithmetic helper functions + // RTABI chapter 4.1.2, Table 4 + { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + + // Single-precision floating-point comparison helper functions + // RTABI chapter 4.1.2, Table 5 + { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, + { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, + { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, + + // Floating-point to integer conversions. + // RTABI chapter 4.1.2, Table 6 + { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + + // Conversions between floating types. + // RTABI chapter 4.1.2, Table 7 + { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + + // Integer to floating-point conversions. 
+ // RTABI chapter 4.1.2, Table 8 + { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + + // Long long helper functions + // RTABI chapter 4.2, Table 9 + { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + + // Integer division functions + // RTABI chapter 4.3.1 + { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + + // Memory operations + // RTABI chapter 4.3.4 + { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + if (LC.Cond != ISD::SETCC_INVALID) + setCmpLibcallCC(LC.Op, LC.Cond); + } + } + + if (Subtarget->isTargetWindows()) { + static const struct { + const RTLIB::Libcall Op; + const char * const Name; + const CallingConv::ID CC; + } LibraryCalls[] = { + { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + } } // Use divmod compiler-rt calls for iOS 5.0 and later. 
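The table-driven rewrite above folds a long run of repeated setLibcallName / setLibcallCallingConv / setCmpLibcallCC calls into one static array plus a loop: each RTABI helper becomes a single row, and comparison helpers are distinguished by the ISD::SETCC_INVALID sentinel, meaning "no condition-code override". The shape of the pattern, reduced to a self-contained sketch with stand-in enums and setters (not the LLVM definitions):

    #include <cstdio>

    namespace sketch {
    enum Libcall { ADD_F64, OEQ_F64 };
    enum CallConv { ARM_AAPCS };
    enum CondCode { SETCC_INVALID, SETNE };

    // Stand-ins for the TargetLowering setters.
    void setLibcallName(Libcall, const char *Name) { std::puts(Name); }
    void setLibcallCallingConv(Libcall, CallConv) {}
    void setCmpLibcallCC(Libcall, CondCode) {}

    void registerAEABILibcalls() {
      static const struct {
        Libcall Op;
        const char *Name;
        CallConv CC;
        CondCode Cond; // SETCC_INVALID marks a non-comparison helper
      } LibraryCalls[] = {
          {ADD_F64, "__aeabi_dadd", ARM_AAPCS, SETCC_INVALID},
          {OEQ_F64, "__aeabi_dcmpeq", ARM_AAPCS, SETNE},
      };
      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
        // Only comparison helpers carry a condition-code override.
        if (LC.Cond != SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
    } // namespace sketch

Adding or auditing an RTABI entry is then a one-row change, which is the design payoff of the refactor in this hunk.
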
@@ -444,6 +409,13 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); + + setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); + + setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); } setOperationAction(ISD::ConstantFP, MVT::f32, Custom); @@ -631,6 +603,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } } + setOperationAction(ISD::SADDO, MVT::i32, Custom); + setOperationAction(ISD::UADDO, MVT::i32, Custom); + setOperationAction(ISD::SSUBO, MVT::i32, Custom); + setOperationAction(ISD::USUBO, MVT::i32, Custom); + // i64 operation support. setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::MULHU, MVT::i32, Expand); @@ -850,7 +827,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand); } } - + // Combine sin / cos into one node or libcall if possible. if (Subtarget->hasSinCos()) { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); @@ -913,7 +890,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // and extractions. std::pair ARMTargetLowering::findRepresentativeClass(MVT VT) const{ - const TargetRegisterClass *RRC = 0; + const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: @@ -950,7 +927,7 @@ ARMTargetLowering::findRepresentativeClass(MVT VT) const{ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { - default: return 0; + default: return nullptr; case ARMISD::Wrapper: return "ARMISD::Wrapper"; case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC"; case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; @@ -1204,40 +1181,58 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, #include "ARMGenCallingConv.inc" -/// CCAssignFnForNode - Selects the correct CCAssignFn for a the -/// given CallingConvention value. -CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, - bool Return, - bool isVarArg) const { +/// getEffectiveCallingConv - Get the effective calling convention, taking into +/// account presence of floating point hardware and calling convention +/// limitations, such as support for variadic functions. +CallingConv::ID +ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, + bool isVarArg) const { switch (CC) { default: llvm_unreachable("Unsupported calling convention"); - case CallingConv::Fast: - if (Subtarget->hasVFP2() && !isVarArg) { - if (!Subtarget->isAAPCS_ABI()) - return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); - // For AAPCS ABI targets, just use VFP variant of the calling convention. - return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); - } - // Fallthrough - case CallingConv::C: { - // Use target triple & subtarget features to do actual dispatch. + case CallingConv::ARM_AAPCS: + case CallingConv::ARM_APCS: + case CallingConv::GHC: + return CC; + case CallingConv::ARM_AAPCS_VFP: + return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; + case CallingConv::C: if (!Subtarget->isAAPCS_ABI()) - return (Return ? 
RetCC_ARM_APCS : CC_ARM_APCS); + return CallingConv::ARM_APCS; else if (Subtarget->hasVFP2() && getTargetMachine().Options.FloatABIType == FloatABI::Hard && !isVarArg) - return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); - return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); + return CallingConv::ARM_AAPCS_VFP; + else + return CallingConv::ARM_AAPCS; + case CallingConv::Fast: + if (!Subtarget->isAAPCS_ABI()) { + if (Subtarget->hasVFP2() && !isVarArg) + return CallingConv::Fast; + return CallingConv::ARM_APCS; + } else if (Subtarget->hasVFP2() && !isVarArg) + return CallingConv::ARM_AAPCS_VFP; + else + return CallingConv::ARM_AAPCS; } - case CallingConv::ARM_AAPCS_VFP: - if (!isVarArg) - return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); - // Fallthrough - case CallingConv::ARM_AAPCS: - return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); +} + +/// CCAssignFnForNode - Selects the correct CCAssignFn for the given +/// CallingConvention. +CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, + bool Return, + bool isVarArg) const { + switch (getEffectiveCallingConv(CC, isVarArg)) { + default: + llvm_unreachable("Unsupported calling convention"); case CallingConv::ARM_APCS: return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); + case CallingConv::ARM_AAPCS: + return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); + case CallingConv::ARM_AAPCS_VFP: + return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); + case CallingConv::Fast: + return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); case CallingConv::GHC: return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); } @@ -1286,6 +1281,8 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, InFlag); Chain = Hi.getValue(1); InFlag = Hi.getValue(2); + if (!Subtarget->isLittle()) + std::swap (Lo, Hi); Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); if (VA.getLocVT() == MVT::v2f64) { @@ -1301,6 +1298,8 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); Chain = Hi.getValue(1); InFlag = Hi.getValue(2); + if (!Subtarget->isLittle()) + std::swap (Lo, Hi); Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, DAG.getConstant(1, MVT::i32)); @@ -1351,16 +1350,17 @@ void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); - RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd)); + unsigned id = Subtarget->isLittle() ? 
0 : 1; + RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); if (NextVA.isRegLoc()) - RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1))); + RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); else { assert(NextVA.isMemLoc()); - if (StackPtr.getNode() == 0) + if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy()); - MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1), + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), dl, DAG, NextVA, Flags)); } @@ -1398,6 +1398,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); + if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall()) + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); // We don't support GuaranteedTailCallOpt for ARM, only automatically // detected sibcalls. if (isTailCall) { @@ -1542,7 +1545,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, - Ops, array_lengthof(Ops))); + Ops)); } } else if (!isSibCall) { assert(VA.isMemLoc()); @@ -1553,8 +1556,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. @@ -1741,10 +1743,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); if (isTailCall) - return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); + return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); // Returns a chain and a flag for retval copy to use. - Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size()); + Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), @@ -2049,8 +2051,7 @@ static SDValue LowerInterruptReturn(SmallVectorImpl &RetOps, RetOps.insert(RetOps.begin() + 1, DAG.getConstant(LROffset, MVT::i32, false)); - return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, - RetOps.data(), RetOps.size()); + return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); } SDValue @@ -2074,6 +2075,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain, SDValue Flag; SmallVector RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + bool isLittleEndian = Subtarget->isLittle(); // Copy the result values into the output registers. for (unsigned i = 0, realRVLocIdx = 0; @@ -2100,12 +2102,15 @@ ARMTargetLowering::LowerReturn(SDValue Chain, SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Half); - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + HalfGPRs.getValue(isLittleEndian ? 
0 : 1), + Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - HalfGPRs.getValue(1), Flag); + HalfGPRs.getValue(isLittleEndian ? 1 : 0), + Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc @@ -2117,12 +2122,15 @@ ARMTargetLowering::LowerReturn(SDValue Chain, // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is // available. SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, - DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1); - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag); + DAG.getVTList(MVT::i32, MVT::i32), Arg); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + fmrrd.getValue(isLittleEndian ? 0 : 1), + Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1), + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + fmrrd.getValue(isLittleEndian ? 1 : 0), Flag); } else Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); @@ -2151,8 +2159,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain, return LowerInterruptReturn(RetOps, dl, DAG); } - return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, - RetOps.data(), RetOps.size()); + return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); } bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { @@ -2314,13 +2321,13 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, Entry.Node = Argument; Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); Args.push_back(Entry); + // FIXME: is there useful debug info available here? - TargetLowering::CallLoweringInfo CLI(Chain, - (Type *) Type::getInt32Ty(*DAG.getContext()), - false, false, false, false, - 0, CallingConv::C, /*isTailCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed=*/true, - DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()), + DAG.getExternalSymbol("__tls_get_addr", PtrVT), &Args, 0); + std::pair CallResult = LowerCallTo(CLI); return CallResult.first; } @@ -2466,6 +2473,23 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, return Result; } +SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); + assert(Subtarget->useMovt() && "Windows on ARM expects to use movw/movt"); + + const GlobalValue *GV = cast(Op)->getGlobal(); + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + + ++NumMovwMovt; + + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes. 
+ return DAG.getNode(ARMISD::Wrapper, DL, PtrVT, + DAG.getTargetGlobalAddress(GV, DL, PtrVT)); +} + SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && @@ -2654,7 +2678,8 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); } - + if (!Subtarget->isLittle()) + std::swap (ArgValue, ArgValue2); return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); } @@ -2803,8 +2828,7 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, AFI->setArgRegsSaveSize(ArgRegsSaveSize + AFI->getArgRegsSaveSize()); if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOps[0], MemOps.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return FrameIndex; } else { if (ArgSize == 0) { @@ -2834,8 +2858,9 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, // If there is no regs to be stored, just point address after last // argument passed via stack. int FrameIndex = - StoreByValRegs(CCInfo, DAG, dl, Chain, 0, CCInfo.getInRegsParamsCount(), - 0, ArgOffset, 0, ForceMutable, 0, TotalArgRegsSaveSize); + StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, + CCInfo.getInRegsParamsCount(), 0, ArgOffset, 0, ForceMutable, + 0, TotalArgRegsSaveSize); AFI->setVarArgsFrameIndex(FrameIndex); } @@ -3166,11 +3191,96 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); } +std::pair +ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, + SDValue &ARMcc) const { + assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); + + SDValue Value, OverflowCmp; + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + + // FIXME: We are currently always generating CMPs because we don't support + // generating CMN through the backend. This is not as good as the natural + // CMP case because it causes a register dependency and cannot be folded + // later. + + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown overflow instruction!"); + case ISD::SADDO: + ARMcc = DAG.getConstant(ARMCC::VC, MVT::i32); + Value = DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(), LHS, RHS); + OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, Value, LHS); + break; + case ISD::UADDO: + ARMcc = DAG.getConstant(ARMCC::HS, MVT::i32); + Value = DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(), LHS, RHS); + OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, Value, LHS); + break; + case ISD::SSUBO: + ARMcc = DAG.getConstant(ARMCC::VC, MVT::i32); + Value = DAG.getNode(ISD::SUB, SDLoc(Op), Op.getValueType(), LHS, RHS); + OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, LHS, RHS); + break; + case ISD::USUBO: + ARMcc = DAG.getConstant(ARMCC::HS, MVT::i32); + Value = DAG.getNode(ISD::SUB, SDLoc(Op), Op.getValueType(), LHS, RHS); + OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, LHS, RHS); + break; + } // switch (...) + + return std::make_pair(Value, OverflowCmp); +} + + +SDValue +ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { + // Let legalize expand this if it isn't a legal type yet. 
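  // Only legal (i32) results are handled here; wider or vector overflow ops
  // are broken up by type legalization first. These nodes typically originate
  // from the overflow intrinsics, e.g.
  //   %r = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
  // The lowering computes the arithmetic result with a plain ADD/SUB, derives
  // the flags from an explicit ARMISD::CMP (see the CMN FIXME above), and
  // materializes Overflow as 0/1 with an ARMISD::CMOV between the two
  // constants, where VC/HS encode the "no overflow" condition.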
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) + return SDValue(); + + SDValue Value, OverflowCmp; + SDValue ARMcc; + std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + // We use 0 and 1 as false and true values. + SDValue TVal = DAG.getConstant(1, MVT::i32); + SDValue FVal = DAG.getConstant(0, MVT::i32); + EVT VT = Op.getValueType(); + + SDValue Overflow = DAG.getNode(ARMISD::CMOV, SDLoc(Op), VT, TVal, FVal, + ARMcc, CCR, OverflowCmp); + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow); +} + + SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue SelectTrue = Op.getOperand(1); SDValue SelectFalse = Op.getOperand(2); SDLoc dl(Op); + unsigned Opc = Cond.getOpcode(); + + if (Cond.getResNo() == 1 && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO)) { + if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) + return SDValue(); + + SDValue Value, OverflowCmp; + SDValue ARMcc; + std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + EVT VT = Op.getValueType(); + + return DAG.getNode(ARMISD::CMOV, SDLoc(Op), VT, SelectTrue, SelectFalse, + ARMcc, CCR, OverflowCmp); + + } // Convert: // @@ -3472,7 +3582,7 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { ARMcc = DAG.getConstant(CondCode, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; - return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops, 7); + return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); } return SDValue(); @@ -3512,11 +3622,11 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; - SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5); + SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); if (CondCode2 != ARMCC::AL) { ARMcc = DAG.getConstant(CondCode2, MVT::i32); SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; - Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5); + Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); } return Res; } @@ -3713,7 +3823,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // Bitcast operand 1 to i32. if (SrcVT == MVT::f64) Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), - &Tmp1, 1).getValue(1); + Tmp1).getValue(1); Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); // Or in the signbit with integer operations. @@ -3729,7 +3839,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // f64: Or the high part with signbit and then combine two parts. 
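  // A sketch of the f64 path: VMOVRRD splits the double into (lo, hi) i32
  // halves, the sign bit extracted into Tmp1 above is OR'd into the cleared
  // high word, and VMOVDRR rebuilds the double, roughly
  //   hi' = (hi & 0x7fffffff) | signbit;  result = VMOVDRR(lo, hi')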
Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), - &Tmp0, 1); + Tmp0); SDValue Lo = Tmp0.getValue(0); SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); @@ -3761,14 +3871,16 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ } SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + const ARMBaseRegisterInfo &ARI = + *static_cast(RegInfo); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); MFI->setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetMachO()) - ? ARM::R7 : ARM::R11; + unsigned FrameReg = ARI.getFrameRegister(MF); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, @@ -3777,6 +3889,18 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { return FrameAddr; } +// FIXME? Maybe this could be a TableGen attribute on some registers and +// this table could be generated automatically from RegInfo. +unsigned ARMTargetLowering::getRegisterByName(const char* RegName, + EVT VT) const { + unsigned Reg = StringSwitch(RegName) + .Case("sp", ARM::SP) + .Default(0); + if (Reg) + return Reg; + report_fatal_error("Invalid register name global variable"); +} + /// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 @@ -3806,8 +3930,15 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { // Turn f64->i64 into VMOVRRD. if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { - SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, - DAG.getVTList(MVT::i32, MVT::i32), &Op, 1); + SDValue Cvt; + if (TLI.isBigEndian() && SrcVT.isVector() && + SrcVT.getVectorNumElements() > 1) + Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), + DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); + else + Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, + DAG.getVTList(MVT::i32, MVT::i32), Op); // Merge the pieces into a single i64 value. return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); } @@ -3863,7 +3994,7 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, CCR, Cmp); SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two @@ -3897,7 +4028,7 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, CCR, Cmp); SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, @@ -4102,7 +4233,7 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and // captures the result into a carry flag. unsigned Opc = N->getOpcode() == ISD::SRL ? 
ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; - Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), &Hi, 1); + Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); // The low part is an ARMISD::RRX operand, which shifts the carry in. Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); @@ -4859,7 +4990,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, Ops.push_back(N); Ops.push_back(Op.getOperand(I)); Ops.push_back(DAG.getConstant(I, MVT::i32)); - N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, &Ops[0], 3); + N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); } } return N; @@ -4870,7 +5001,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); - SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts); + SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); Val = LowerBUILD_VECTOR(Val, DAG, ST); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); @@ -4906,7 +5037,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); - SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, Val); } @@ -5213,12 +5344,10 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, if (V2.getNode()->getOpcode() == ISD::UNDEF) return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, - &VTBLMask[0], 8)); + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask)); return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, - &VTBLMask[0], 8)); + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask)); } static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, @@ -5371,7 +5500,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(ShuffleMask[i] & (NumElts-1), MVT::i32))); } - SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts); + SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, Val); } @@ -5608,7 +5737,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32)); } return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), - MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts); + MVT::getVectorVT(TruncVT, NumElts), Ops); } static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { @@ -5946,12 +6075,12 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { ? 
"__sincos_stret" : "__sincosf_stret"; SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); - TargetLowering:: - CallLoweringInfo CLI(DAG.getEntryNode(), Type::getVoidTy(*DAG.getContext()), - false, false, false, false, 0, - CallingConv::C, /*isTaillCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed*/false, - Callee, Args, DAG, dl); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), Callee, + &Args, 0) + .setDiscardResult(); + std::pair CallResult = LowerCallTo(CLI); SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, @@ -5998,8 +6127,7 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N, }; Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, - DAG.getVTList(MVT::i32, MVT::Other), &Ops[0], - array_lengthof(Ops)); + DAG.getVTList(MVT::i32, MVT::Other), Ops); OutChain = Cycles32.getValue(1); } else { // Intrinsic is defined to return 0 on unsupported platforms. Technically @@ -6022,8 +6150,15 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalAddress: - return Subtarget->isTargetMachO() ? LowerGlobalAddressDarwin(Op, DAG) : - LowerGlobalAddressELF(Op, DAG); + switch (Subtarget->getTargetTriple().getObjectFormat()) { + default: llvm_unreachable("unknown object format"); + case Triple::COFF: + return LowerGlobalAddressWindows(Op, DAG); + case Triple::ELF: + return LowerGlobalAddressELF(Op, DAG); + case Triple::MachO: + return LowerGlobalAddressDarwin(Op, DAG); + } case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); @@ -6068,6 +6203,11 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADDE: case ISD::SUBC: case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); + case ISD::SADDO: + case ISD::UADDO: + case ISD::SSUBO: + case ISD::USUBO: + return LowerXALUO(Op, DAG); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); @@ -6558,7 +6698,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { } // N.B. the order the invoke BBs are processed in doesn't matter here. - const uint16_t *SavedRegs = RI.getCalleeSavedRegs(MF); + const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); SmallVector MBBLPads; for (SmallPtrSet::iterator I = InvokeBBs.begin(), E = InvokeBBs.end(); I != E; ++I) { @@ -6755,8 +6895,8 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned UnitSize = 0; - const TargetRegisterClass *TRC = 0; - const TargetRegisterClass *VecTRC = 0; + const TargetRegisterClass *TRC = nullptr; + const TargetRegisterClass *VecTRC = nullptr; bool IsThumb1 = Subtarget->isThumb1Only(); bool IsThumb2 = Subtarget->isThumb2(); @@ -6790,7 +6930,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, ? (const TargetRegisterClass *)&ARM::DPairRegClass : UnitSize == 8 ? 
(const TargetRegisterClass *)&ARM::DPRRegClass - : 0; + : nullptr; unsigned BytesLeft = SizeVal % UnitSize; unsigned LoopSize = SizeVal - BytesLeft; @@ -7520,8 +7660,7 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1, llvm_unreachable("Invalid vector element type for padd optimization."); } - SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), - widenType, &Ops[0], Ops.size()); + SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), widenType, Ops); unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE; return DAG.getNode(ExtOp, SDLoc(N), VT, tmp); } @@ -7581,7 +7720,7 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, // Look for the glued ADDE. SDNode* AddeNode = AddcNode->getGluedUser(); - if (AddeNode == NULL) + if (!AddeNode) return SDValue(); // Make sure it is really an ADDE. @@ -7616,9 +7755,9 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, // Figure out the high and low input values to the MLAL node. SDValue* HiMul = &MULOp; - SDValue* HiAdd = NULL; - SDValue* LoMul = NULL; - SDValue* LowAdd = NULL; + SDValue* HiAdd = nullptr; + SDValue* LoMul = nullptr; + SDValue* LowAdd = nullptr; if (IsLeftOperandMUL) HiAdd = &AddeOp1; @@ -7635,7 +7774,7 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, LowAdd = &AddcOp0; } - if (LoMul == NULL) + if (!LoMul) return SDValue(); if (LoMul->getNode() != HiMul->getNode()) @@ -7652,8 +7791,7 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, Ops.push_back(*HiAdd); SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcNode), - DAG.getVTList(MVT::i32, MVT::i32), - &Ops[0], Ops.size()); + DAG.getVTList(MVT::i32, MVT::i32), Ops); // Replace the ADDs' nodes uses by the MLA node's values. SDValue HiMLALResult(MLALNode.getNode(), 1); @@ -8290,8 +8428,7 @@ static SDValue PerformSTORECombine(SDNode *N, Increment); Chains.push_back(Ch); } - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0], - Chains.size()); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } if (!ISD::isNormalStore(St)) @@ -8302,16 +8439,18 @@ static SDValue PerformSTORECombine(SDNode *N, if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && StVal.getNode()->hasOneUse()) { SelectionDAG &DAG = DCI.DAG; + bool isBigEndian = DAG.getTargetLoweringInfo().isBigEndian(); SDLoc DL(St); SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore(St->getChain(), DL, - StVal.getNode()->getOperand(0), BasePtr, - St->getPointerInfo(), St->isVolatile(), + StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ), + BasePtr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, MVT::i32)); - return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1), + return DAG.getStore(NewST1.getValue(0), DL, + StVal.getNode()->getOperand(isBigEndian ? 0 : 1), OffsetPtr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), std::min(4U, St->getAlignment() / 2)); @@ -8387,7 +8526,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, DCI.AddToWorklist(V.getNode()); } EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops.data(), NumElts); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, BV); } @@ -8470,7 +8609,7 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // Fold obvious case. 
V = V.getOperand(0); else { - V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); + V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); // Make the DAGCombiner fold the bitcasts. DCI.AddToWorklist(V.getNode()); } @@ -8666,7 +8805,7 @@ static SDValue CombineBaseUpdate(SDNode *N, Tys[n] = VecTy; Tys[n++] = MVT::i32; Tys[n] = MVT::Other; - SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2); + SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs+2)); SmallVector Ops; Ops.push_back(N->getOperand(0)); // incoming chain Ops.push_back(N->getOperand(AddrOpIdx)); @@ -8676,8 +8815,7 @@ static SDValue CombineBaseUpdate(SDNode *N, } MemIntrinsicSDNode *MemInt = cast(N); SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, - Ops.data(), Ops.size(), - MemInt->getMemoryVT(), + Ops, MemInt->getMemoryVT(), MemInt->getMemOperand()); // Update the uses. @@ -8746,11 +8884,11 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { for (n = 0; n < NumVecs; ++n) Tys[n] = VT; Tys[n] = MVT::Other; - SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1); + SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs+1)); SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; MemIntrinsicSDNode *VLDMemInt = cast(VLD); SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, - Ops, 2, VLDMemInt->getMemoryVT(), + Ops, VLDMemInt->getMemoryVT(), VLDMemInt->getMemOperand()); // Update the uses. @@ -9348,7 +9486,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { if (Res.getNode()) { APInt KnownZero, KnownOne; - DAG.ComputeMaskedBits(SDValue(N,0), KnownZero, KnownOne); + DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne); // Capture demanded bits information that would be otherwise lost. if (KnownZero == 0xfffffffe) Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, @@ -9935,11 +10073,11 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, return true; } -void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) const { +void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { unsigned BitWidth = KnownOne.getBitWidth(); KnownZero = KnownOne = APInt(BitWidth, 0); switch (Op.getOpcode()) { @@ -9955,11 +10093,11 @@ void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, break; case ARMISD::CMOV: { // Bits are known zero/one if known on the LHS and RHS. - DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); if (KnownZero == 0 && KnownOne == 0) return; APInt KnownZeroRHS, KnownOneRHS; - DAG.ComputeMaskedBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1); + DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1); KnownZero &= KnownZeroRHS; KnownOne &= KnownOneRHS; return; @@ -10053,7 +10191,7 @@ ARMTargetLowering::getSingleConstraintMatchWeight( Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. - if (CallOperandVal == NULL) + if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. 
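// Both of these hooks back the GCC-style inline-asm register constraints; a
// usage sketch (illustrative operands, see the GCC ARM machine-constraint
// documentation for the full set):
//   int out;
//   __asm__("rev %0, %1" : "=l"(out) : "l"(in));
// where "l" restricts both operands to the low registers r0-r7.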
@@ -10132,7 +10270,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector&Ops, SelectionDAG &DAG) const { - SDValue Result(0, 0); + SDValue Result; // Currently only support length 1 constraints. if (Constraint.length() != 1) return; @@ -10331,13 +10469,12 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { Type *RetTy = (Type*)StructType::get(Ty, Ty, NULL); SDLoc dl(Op); - TargetLowering:: - CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, true, - 0, getLibcallCallingConv(LC), /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Callee, Args, DAG, dl); - std::pair CallInfo = LowerCallTo(CLI); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(InChain) + .setCallee(getLibcallCallingConv(LC), RetTy, Callee, &Args, 0) + .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); + std::pair CallInfo = LowerCallTo(CLI); return CallInfo.first; } @@ -10494,3 +10631,160 @@ bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return false; return true; } + +bool ARMTargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { + // Loads and stores less than 64-bits are already atomic; ones above that + // are doomed anyway, so defer to the default libcall and blame the OS when + // things go wrong: + if (StoreInst *SI = dyn_cast(Inst)) + return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 64; + else if (LoadInst *LI = dyn_cast(Inst)) + return LI->getType()->getPrimitiveSizeInBits() == 64; + + // For the real atomic operations, we have ldrex/strex up to 64 bits. + return Inst->getType()->getPrimitiveSizeInBits() <= 64; +} + +Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Type *ValTy = cast(Addr->getType())->getElementType(); + bool IsAcquire = + Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent; + + // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd + // intrinsic must return {i32, i32} and we have to recombine them into a + // single i64 here. + if (ValTy->getPrimitiveSizeInBits() == 64) { + Intrinsic::ID Int = + IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; + Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int); + + Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); + Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); + + Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); + Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); + if (!Subtarget->isLittle()) + std::swap (Lo, Hi); + Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); + Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); + return Builder.CreateOr( + Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64"); + } + + Type *Tys[] = { Addr->getType() }; + Intrinsic::ID Int = IsAcquire ? 
Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; + Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys); + + return Builder.CreateTruncOrBitCast( + Builder.CreateCall(Ldrex, Addr), + cast(Addr->getType())->getElementType()); +} + +Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, + Value *Addr, + AtomicOrdering Ord) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + bool IsRelease = + Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent; + + // Since the intrinsics must have legal type, the i64 intrinsics take two + // parameters: "i32, i32". We must marshal Val into the appropriate form + // before the call. + if (Val->getType()->getPrimitiveSizeInBits() == 64) { + Intrinsic::ID Int = + IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; + Function *Strex = Intrinsic::getDeclaration(M, Int); + Type *Int32Ty = Type::getInt32Ty(M->getContext()); + + Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); + Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); + if (!Subtarget->isLittle()) + std::swap (Lo, Hi); + Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); + return Builder.CreateCall3(Strex, Lo, Hi, Addr); + } + + Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex; + Type *Tys[] = { Addr->getType() }; + Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); + + return Builder.CreateCall2( + Strex, Builder.CreateZExtOrBitCast( + Val, Strex->getFunctionType()->getParamType(0)), + Addr); +} + +enum HABaseType { + HA_UNKNOWN = 0, + HA_FLOAT, + HA_DOUBLE, + HA_VECT64, + HA_VECT128 +}; + +static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, + uint64_t &Members) { + if (const StructType *ST = dyn_cast(Ty)) { + for (unsigned i = 0; i < ST->getNumElements(); ++i) { + uint64_t SubMembers = 0; + if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) + return false; + Members += SubMembers; + } + } else if (const ArrayType *AT = dyn_cast(Ty)) { + uint64_t SubMembers = 0; + if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) + return false; + Members += SubMembers * AT->getNumElements(); + } else if (Ty->isFloatTy()) { + if (Base != HA_UNKNOWN && Base != HA_FLOAT) + return false; + Members = 1; + Base = HA_FLOAT; + } else if (Ty->isDoubleTy()) { + if (Base != HA_UNKNOWN && Base != HA_DOUBLE) + return false; + Members = 1; + Base = HA_DOUBLE; + } else if (const VectorType *VT = dyn_cast(Ty)) { + Members = 1; + switch (Base) { + case HA_FLOAT: + case HA_DOUBLE: + return false; + case HA_VECT64: + return VT->getBitWidth() == 64; + case HA_VECT128: + return VT->getBitWidth() == 128; + case HA_UNKNOWN: + switch (VT->getBitWidth()) { + case 64: + Base = HA_VECT64; + return true; + case 128: + Base = HA_VECT128; + return true; + default: + return false; + } + } + } + + return (Members > 0 && Members <= 4); +} + +/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate. 
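+/// For example, under AAPCS-VFP a parameter of type
+///   struct Vec2 { float x, y; };
+/// is a homogeneous aggregate (Base == HA_FLOAT, Members == 2) and must be
+/// passed in two consecutive S registers, while an aggregate with more than
+/// four members fails the check above and is passed by the ordinary rules.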
+bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { + if (getEffectiveCallingConv(CallConv, isVarArg) != + CallingConv::ARM_AAPCS_VFP) + return false; + + HABaseType Base = HA_UNKNOWN; + uint64_t Members = 0; + bool result = isHomogeneousAggregate(Ty, Base, Members); + DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump(); dbgs() << "\n"); + return result; +} diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index f33e6db..c15305c 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -313,10 +313,10 @@ namespace llvm { SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override; - void computeMaskedBitsForTargetNode(const SDValue Op, APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) const override; + void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const override; bool ExpandInlineAsm(CallInst *CI) const override; @@ -384,6 +384,18 @@ namespace llvm { bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; + /// \brief Returns true if an argument of type Ty needs to be passed in a + /// contiguous block of registers in calling convention CallConv. + bool functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; + + Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const override; + Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, + Value *Addr, AtomicOrdering Ord) const override; + + bool shouldExpandAtomicInIR(Instruction *Inst) const override; + protected: std::pair findRepresentativeClass(MVT VT) const override; @@ -404,6 +416,7 @@ namespace llvm { void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT); void addDRTypeForNEON(MVT VT); void addQRTypeForNEON(MVT VT); + std::pair getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const; typedef SmallVector, 8> RegsToPassVector; void PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, @@ -417,6 +430,8 @@ namespace llvm { SDValue &Root, SelectionDAG &DAG, SDLoc dl) const; + CallingConv::ID getEffectiveCallingConv(CallingConv::ID CC, + bool isVarArg) const; CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const; SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, @@ -430,6 +445,7 @@ namespace llvm { SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) const; @@ -438,6 +454,7 @@ namespace llvm { TLSModel::Model model) const; SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; @@ -454,6 +471,8 @@ namespace llvm { SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG 
&DAG) const; + unsigned getRegisterByName(const char* RegName, EVT VT) const override; + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be /// expanded to FMAs when this method returns true, otherwise fmuladd is @@ -567,7 +586,6 @@ namespace llvm { OtherModImm }; - namespace ARM { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index aafff98..59e9260 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -2029,7 +2029,7 @@ class N2V op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16, // Same as N2V but not predicated. class N2Vnp op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6, dag oops, dag iops, InstrItinClass itin, string OpcodeStr, - string Dt, ValueType ResTy, ValueType OpTy, list pattern> + string Dt, list pattern> : NeonInp { bits<5> Vd; @@ -2138,8 +2138,7 @@ class N3V op21_20, bits<4> op11_8, bit op6, bit op4, class N3Vnp op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4, dag oops, dag iops,Format f, InstrItinClass itin, - string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, - SDPatternOperator IntOp, bit Commutable, list pattern> + string OpcodeStr, string Dt, list pattern> : NeonInp { bits<5> Vd; diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 75a109e..718d5da 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -991,6 +991,81 @@ def addrmode6oneL32 : Operand, let EncoderMethod = "getAddrMode6OneLane32AddressOpValue"; } +// Base class for addrmode6 with specific alignment restrictions. +class AddrMode6Align : Operand, + ComplexPattern{ + let PrintMethod = "printAddrMode6Operand"; + let MIOperandInfo = (ops GPR:$addr, i32imm:$align); + let EncoderMethod = "getAddrMode6AddressOpValue"; + let DecoderMethod = "DecodeAddrMode6Operand"; +} + +// Special version of addrmode6 to handle no allowed alignment encoding for +// VLD/VST instructions and checking the alignment is not specified. +def AddrMode6AlignNoneAsmOperand : AsmOperandClass { + let Name = "AlignedMemoryNone"; + let DiagnosticType = "AlignedMemoryRequiresNone"; +} +def addrmode6alignNone : AddrMode6Align { + // The alignment specifier can only be omitted. + let ParserMatchClass = AddrMode6AlignNoneAsmOperand; +} + +// Special version of addrmode6 to handle 16-bit alignment encoding for +// VLD/VST instructions and checking the alignment value. +def AddrMode6Align16AsmOperand : AsmOperandClass { + let Name = "AlignedMemory16"; + let DiagnosticType = "AlignedMemoryRequires16"; +} +def addrmode6align16 : AddrMode6Align { + // The alignment specifier can only be 16 or omitted. + let ParserMatchClass = AddrMode6Align16AsmOperand; +} + +// Special version of addrmode6 to handle 32-bit alignment encoding for +// VLD/VST instructions and checking the alignment value. +def AddrMode6Align32AsmOperand : AsmOperandClass { + let Name = "AlignedMemory32"; + let DiagnosticType = "AlignedMemoryRequires32"; +} +def addrmode6align32 : AddrMode6Align { + // The alignment specifier can only be 32 or omitted. + let ParserMatchClass = AddrMode6Align32AsmOperand; +} + +// Special version of addrmode6 to handle 64-bit alignment encoding for +// VLD/VST instructions and checking the alignment value. 
+def AddrMode6Align64AsmOperand : AsmOperandClass { + let Name = "AlignedMemory64"; + let DiagnosticType = "AlignedMemoryRequires64"; +} +def addrmode6align64 : AddrMode6Align { + // The alignment specifier can only be 64 or omitted. + let ParserMatchClass = AddrMode6Align64AsmOperand; +} + +// Special version of addrmode6 to handle 64-bit or 128-bit alignment encoding +// for VLD/VST instructions and checking the alignment value. +def AddrMode6Align64or128AsmOperand : AsmOperandClass { + let Name = "AlignedMemory64or128"; + let DiagnosticType = "AlignedMemoryRequires64or128"; +} +def addrmode6align64or128 : AddrMode6Align { + // The alignment specifier can only be 64, 128 or omitted. + let ParserMatchClass = AddrMode6Align64or128AsmOperand; +} + +// Special version of addrmode6 to handle 64-bit, 128-bit or 256-bit alignment +// encoding for VLD/VST instructions and checking the alignment value. +def AddrMode6Align64or128or256AsmOperand : AsmOperandClass { + let Name = "AlignedMemory64or128or256"; + let DiagnosticType = "AlignedMemoryRequires64or128or256"; +} +def addrmode6align64or128or256 : AddrMode6Align { + // The alignment specifier can only be 64, 128, 256 or omitted. + let ParserMatchClass = AddrMode6Align64or128or256AsmOperand; +} + // Special version of addrmode6 to handle alignment encoding for VLD-dup // instructions, specifically VLD4-dup. def addrmode6dup : Operand, @@ -1003,6 +1078,69 @@ def addrmode6dup : Operand, let ParserMatchClass = AddrMode6AsmOperand; } +// Base class for addrmode6dup with specific alignment restrictions. +class AddrMode6DupAlign : Operand, + ComplexPattern{ + let PrintMethod = "printAddrMode6Operand"; + let MIOperandInfo = (ops GPR:$addr, i32imm); + let EncoderMethod = "getAddrMode6DupAddressOpValue"; +} + +// Special version of addrmode6 to handle no allowed alignment encoding for +// VLD-dup instruction and checking the alignment is not specified. +def AddrMode6dupAlignNoneAsmOperand : AsmOperandClass { + let Name = "DupAlignedMemoryNone"; + let DiagnosticType = "DupAlignedMemoryRequiresNone"; +} +def addrmode6dupalignNone : AddrMode6DupAlign { + // The alignment specifier can only be omitted. + let ParserMatchClass = AddrMode6dupAlignNoneAsmOperand; +} + +// Special version of addrmode6 to handle 16-bit alignment encoding for VLD-dup +// instruction and checking the alignment value. +def AddrMode6dupAlign16AsmOperand : AsmOperandClass { + let Name = "DupAlignedMemory16"; + let DiagnosticType = "DupAlignedMemoryRequires16"; +} +def addrmode6dupalign16 : AddrMode6DupAlign { + // The alignment specifier can only be 16 or omitted. + let ParserMatchClass = AddrMode6dupAlign16AsmOperand; +} + +// Special version of addrmode6 to handle 32-bit alignment encoding for VLD-dup +// instruction and checking the alignment value. +def AddrMode6dupAlign32AsmOperand : AsmOperandClass { + let Name = "DupAlignedMemory32"; + let DiagnosticType = "DupAlignedMemoryRequires32"; +} +def addrmode6dupalign32 : AddrMode6DupAlign { + // The alignment specifier can only be 32 or omitted. + let ParserMatchClass = AddrMode6dupAlign32AsmOperand; +} + +// Special version of addrmode6 to handle 64-bit alignment encoding for VLD +// instructions and checking the alignment value. +def AddrMode6dupAlign64AsmOperand : AsmOperandClass { + let Name = "DupAlignedMemory64"; + let DiagnosticType = "DupAlignedMemoryRequires64"; +} +def addrmode6dupalign64 : AddrMode6DupAlign { + // The alignment specifier can only be 64 or omitted. 
+ let ParserMatchClass = AddrMode6dupAlign64AsmOperand; +} + +// Special version of addrmode6 to handle 64-bit or 128-bit alignment encoding +// for VLD instructions and checking the alignment value. +def AddrMode6dupAlign64or128AsmOperand : AsmOperandClass { + let Name = "DupAlignedMemory64or128"; + let DiagnosticType = "DupAlignedMemoryRequires64or128"; +} +def addrmode6dupalign64or128 : AddrMode6DupAlign { + // The alignment specifier can only be 64, 128 or omitted. + let ParserMatchClass = AddrMode6dupAlign64or128AsmOperand; +} + // addrmodepc := pc + reg // def addrmodepc : Operand, @@ -1689,7 +1827,8 @@ PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary, } def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary, - "hint", "\t$imm", []>, Requires<[IsARM, HasV6]> { + "hint", "\t$imm", [(int_arm_hint imm0_239:$imm)]>, + Requires<[IsARM, HasV6]> { bits<8> imm; let Inst{27-8} = 0b00110010000011110000; let Inst{7-0} = imm; @@ -1702,8 +1841,6 @@ def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6T2]>; def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6T2]>; def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>; -def : Pat<(int_arm_sevl), (HINT 5)>; - def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> { bits<4> Rd; @@ -1830,6 +1967,18 @@ def DBG : AI<(outs), (ins imm0_15:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt", let Inst{3-0} = opt; } +// A8.8.247 UDF - Undefined (Encoding A1) +def UDF : AInoP<(outs), (ins imm0_65535:$imm16), MiscFrm, NoItinerary, + "udf", "\t$imm16", [(int_arm_undefined imm0_65535:$imm16)]> { + bits<16> imm16; + let Inst{31-28} = 0b1110; // AL + let Inst{27-25} = 0b011; + let Inst{24-20} = 0b11111; + let Inst{19-8} = imm16{15-4}; + let Inst{7-4} = 0b1111; + let Inst{3-0} = imm16{3-0}; +} + /* * A5.4 Permanently UNDEFINED instructions. 
* @@ -2282,12 +2431,6 @@ let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode3:$addr), LdMiscFrm, IIC_iLoad_d_r, "ldrd", "\t$Rt, $Rt2, $addr", []>, Requires<[IsARM, HasV5TE]>; - - // GNU Assembler extension (compatibility) - let isAsmParserOnly = 1 in - def LDRD_PAIR : AI3ld<0b1101, 0, (outs GPRPairOp:$Rt), (ins addrmode3:$addr), - LdMiscFrm, IIC_iLoad_d_r, "ldrd", "\t$Rt, $addr", []>, - Requires<[IsARM, HasV5TE]>; } def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr), @@ -2557,14 +2700,6 @@ let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { Requires<[IsARM, HasV5TE]> { let Inst{21} = 0; } - - // GNU Assembler extension (compatibility) - let isAsmParserOnly = 1 in - def STRD_PAIR : AI3str<0b1111, (outs), (ins GPRPairOp:$Rt, addrmode3:$addr), - StMiscFrm, IIC_iStore_d_r, "strd", "\t$Rt, $addr", []>, - Requires<[IsARM, HasV5TE]> { - let Inst{21} = 0; - } } // Indexed stores @@ -3999,6 +4134,11 @@ def REV16 : AMiscA1I<0b01101011, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm), Requires<[IsARM, HasV6]>, Sched<[WriteALU]>; +def : ARMV6Pat<(srl (bswap (extloadi16 addrmode3:$addr)), (i32 16)), + (REV16 (LDRH addrmode3:$addr))>; +def : ARMV6Pat<(truncstorei16 (srl (bswap GPR:$Rn), (i32 16)), addrmode3:$addr), + (STRH (REV16 GPR:$Rn), addrmode3:$addr)>; + let AddedComplexity = 5 in def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm), IIC_iUNAr, "revsh", "\t$Rd, $Rm", @@ -4816,7 +4956,7 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */, [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, imm:$CRm, imm:$opc2)]>, Requires<[PreV8]>; -def : ARMInstAlias<"mcr2$ $cop, $opc1, $Rt, $CRn, $CRm", +def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm", (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, 0)>; def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */, @@ -4824,7 +4964,7 @@ def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */, (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), []>, Requires<[PreV8]>; -def : ARMInstAlias<"mrc2$ $cop, $opc1, $Rt, $CRn, $CRm", +def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm", (MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, 0)>; diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 0d46c49..b32b5d2 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -39,6 +39,49 @@ def nImmVMOVI32 : Operand { let PrintMethod = "printNEONModImmOperand"; let ParserMatchClass = nImmVMOVI32AsmOperand; } + +def nImmVMOVI16AsmOperandByteReplicate : + AsmOperandClass { + let Name = "NEONi16vmovByteReplicate"; + let PredicateMethod = "isNEONi16ByteReplicate"; + let RenderMethod = "addNEONvmovByteReplicateOperands"; +} +def nImmVMOVI32AsmOperandByteReplicate : + AsmOperandClass { + let Name = "NEONi32vmovByteReplicate"; + let PredicateMethod = "isNEONi32ByteReplicate"; + let RenderMethod = "addNEONvmovByteReplicateOperands"; +} +def nImmVMVNI16AsmOperandByteReplicate : + AsmOperandClass { + let Name = "NEONi16invByteReplicate"; + let PredicateMethod = "isNEONi16ByteReplicate"; + let RenderMethod = "addNEONinvByteReplicateOperands"; +} +def nImmVMVNI32AsmOperandByteReplicate : + AsmOperandClass { + let Name = "NEONi32invByteReplicate"; + let PredicateMethod = "isNEONi32ByteReplicate"; + let RenderMethod = "addNEONinvByteReplicateOperands"; +} + +def 
nImmVMOVI16ByteReplicate : Operand { + let PrintMethod = "printNEONModImmOperand"; + let ParserMatchClass = nImmVMOVI16AsmOperandByteReplicate; +} +def nImmVMOVI32ByteReplicate : Operand { + let PrintMethod = "printNEONModImmOperand"; + let ParserMatchClass = nImmVMOVI32AsmOperandByteReplicate; +} +def nImmVMVNI16ByteReplicate : Operand { + let PrintMethod = "printNEONModImmOperand"; + let ParserMatchClass = nImmVMVNI16AsmOperandByteReplicate; +} +def nImmVMVNI32ByteReplicate : Operand { + let PrintMethod = "printNEONModImmOperand"; + let ParserMatchClass = nImmVMVNI32AsmOperandByteReplicate; +} + def nImmVMOVI32NegAsmOperand : AsmOperandClass { let Name = "NEONi32vmovNeg"; } def nImmVMOVI32Neg : Operand { let PrintMethod = "printNEONModImmOperand"; @@ -617,37 +660,37 @@ class VLDQQQQWBPseudo let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { // VLD1 : Vector Load (multiple single elements) -class VLD1D op7_4, string Dt> +class VLD1D op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd), - (ins addrmode6:$Rn), IIC_VLD1, + (ins AddrMode:$Rn), IIC_VLD1, "vld1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } -class VLD1Q op7_4, string Dt> +class VLD1Q op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd), - (ins addrmode6:$Rn), IIC_VLD1x2, + (ins AddrMode:$Rn), IIC_VLD1x2, "vld1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } -def VLD1d8 : VLD1D<{0,0,0,?}, "8">; -def VLD1d16 : VLD1D<{0,1,0,?}, "16">; -def VLD1d32 : VLD1D<{1,0,0,?}, "32">; -def VLD1d64 : VLD1D<{1,1,0,?}, "64">; +def VLD1d8 : VLD1D<{0,0,0,?}, "8", addrmode6align64>; +def VLD1d16 : VLD1D<{0,1,0,?}, "16", addrmode6align64>; +def VLD1d32 : VLD1D<{1,0,0,?}, "32", addrmode6align64>; +def VLD1d64 : VLD1D<{1,1,0,?}, "64", addrmode6align64>; -def VLD1q8 : VLD1Q<{0,0,?,?}, "8">; -def VLD1q16 : VLD1Q<{0,1,?,?}, "16">; -def VLD1q32 : VLD1Q<{1,0,?,?}, "32">; -def VLD1q64 : VLD1Q<{1,1,?,?}, "64">; +def VLD1q8 : VLD1Q<{0,0,?,?}, "8", addrmode6align64or128>; +def VLD1q16 : VLD1Q<{0,1,?,?}, "16", addrmode6align64or128>; +def VLD1q32 : VLD1Q<{1,0,?,?}, "32", addrmode6align64or128>; +def VLD1q64 : VLD1Q<{1,1,?,?}, "64", addrmode6align64or128>; // ...with address register writeback: -multiclass VLD1DWB op7_4, string Dt> { +multiclass VLD1DWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10, 0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb), - (ins addrmode6:$Rn), IIC_VLD1u, + (ins AddrMode:$Rn), IIC_VLD1u, "vld1", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. @@ -655,16 +698,16 @@ multiclass VLD1DWB op7_4, string Dt> { let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb), - (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1u, + (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1u, "vld1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } } -multiclass VLD1QWB op7_4, string Dt> { +multiclass VLD1QWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb), - (ins addrmode6:$Rn), IIC_VLD1x2u, + (ins AddrMode:$Rn), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
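// With the alignment-restricted address operands threaded through above, the
// assembler can diagnose bad alignment specifiers instead of silently
// mis-encoding them, e.g. (illustrative inputs):
//   vld1.8 {d0}, [r0]        @ accepted: alignment omitted
//   vld1.8 {d0}, [r0:64]     @ accepted: one D register allows :64
//   vld1.8 {d0}, [r0:16]     @ rejected via AlignedMemoryRequires64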
@@ -672,7 +715,7 @@ multiclass VLD1QWB op7_4, string Dt> { let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb), - (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u, + (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{5-4} = Rn{5-4}; @@ -680,27 +723,27 @@ multiclass VLD1QWB op7_4, string Dt> { } } -defm VLD1d8wb : VLD1DWB<{0,0,0,?}, "8">; -defm VLD1d16wb : VLD1DWB<{0,1,0,?}, "16">; -defm VLD1d32wb : VLD1DWB<{1,0,0,?}, "32">; -defm VLD1d64wb : VLD1DWB<{1,1,0,?}, "64">; -defm VLD1q8wb : VLD1QWB<{0,0,?,?}, "8">; -defm VLD1q16wb : VLD1QWB<{0,1,?,?}, "16">; -defm VLD1q32wb : VLD1QWB<{1,0,?,?}, "32">; -defm VLD1q64wb : VLD1QWB<{1,1,?,?}, "64">; +defm VLD1d8wb : VLD1DWB<{0,0,0,?}, "8", addrmode6align64>; +defm VLD1d16wb : VLD1DWB<{0,1,0,?}, "16", addrmode6align64>; +defm VLD1d32wb : VLD1DWB<{1,0,0,?}, "32", addrmode6align64>; +defm VLD1d64wb : VLD1DWB<{1,1,0,?}, "64", addrmode6align64>; +defm VLD1q8wb : VLD1QWB<{0,0,?,?}, "8", addrmode6align64or128>; +defm VLD1q16wb : VLD1QWB<{0,1,?,?}, "16", addrmode6align64or128>; +defm VLD1q32wb : VLD1QWB<{1,0,?,?}, "32", addrmode6align64or128>; +defm VLD1q64wb : VLD1QWB<{1,1,?,?}, "64", addrmode6align64or128>; // ...with 3 registers -class VLD1D3 op7_4, string Dt> +class VLD1D3 op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd), - (ins addrmode6:$Rn), IIC_VLD1x3, "vld1", Dt, + (ins AddrMode:$Rn), IIC_VLD1x3, "vld1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } -multiclass VLD1D3WB op7_4, string Dt> { +multiclass VLD1D3WB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb), - (ins addrmode6:$Rn), IIC_VLD1x2u, + (ins AddrMode:$Rn), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
@@ -708,7 +751,7 @@ multiclass VLD1D3WB op7_4, string Dt> { let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb), - (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u, + (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{4} = Rn{4}; @@ -716,32 +759,32 @@ multiclass VLD1D3WB op7_4, string Dt> { } } -def VLD1d8T : VLD1D3<{0,0,0,?}, "8">; -def VLD1d16T : VLD1D3<{0,1,0,?}, "16">; -def VLD1d32T : VLD1D3<{1,0,0,?}, "32">; -def VLD1d64T : VLD1D3<{1,1,0,?}, "64">; +def VLD1d8T : VLD1D3<{0,0,0,?}, "8", addrmode6align64>; +def VLD1d16T : VLD1D3<{0,1,0,?}, "16", addrmode6align64>; +def VLD1d32T : VLD1D3<{1,0,0,?}, "32", addrmode6align64>; +def VLD1d64T : VLD1D3<{1,1,0,?}, "64", addrmode6align64>; -defm VLD1d8Twb : VLD1D3WB<{0,0,0,?}, "8">; -defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16">; -defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32">; -defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64">; +defm VLD1d8Twb : VLD1D3WB<{0,0,0,?}, "8", addrmode6align64>; +defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16", addrmode6align64>; +defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>; +defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>; def VLD1d64TPseudo : VLDQQPseudo; def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo; def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo; // ...with 4 registers -class VLD1D4 op7_4, string Dt> +class VLD1D4 op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd), - (ins addrmode6:$Rn), IIC_VLD1x4, "vld1", Dt, + (ins AddrMode:$Rn), IIC_VLD1x4, "vld1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } -multiclass VLD1D4WB op7_4, string Dt> { +multiclass VLD1D4WB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb), - (ins addrmode6:$Rn), IIC_VLD1x2u, + (ins AddrMode:$Rn), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
@@ -749,7 +792,7 @@ multiclass VLD1D4WB op7_4, string Dt> { let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb), - (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u, + (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{5-4} = Rn{5-4}; @@ -757,15 +800,15 @@ multiclass VLD1D4WB op7_4, string Dt> { } } -def VLD1d8Q : VLD1D4<{0,0,?,?}, "8">; -def VLD1d16Q : VLD1D4<{0,1,?,?}, "16">; -def VLD1d32Q : VLD1D4<{1,0,?,?}, "32">; -def VLD1d64Q : VLD1D4<{1,1,?,?}, "64">; +def VLD1d8Q : VLD1D4<{0,0,?,?}, "8", addrmode6align64or128or256>; +def VLD1d16Q : VLD1D4<{0,1,?,?}, "16", addrmode6align64or128or256>; +def VLD1d32Q : VLD1D4<{1,0,?,?}, "32", addrmode6align64or128or256>; +def VLD1d64Q : VLD1D4<{1,1,?,?}, "64", addrmode6align64or128or256>; -defm VLD1d8Qwb : VLD1D4WB<{0,0,?,?}, "8">; -defm VLD1d16Qwb : VLD1D4WB<{0,1,?,?}, "16">; -defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32">; -defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64">; +defm VLD1d8Qwb : VLD1D4WB<{0,0,?,?}, "8", addrmode6align64or128or256>; +defm VLD1d16Qwb : VLD1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>; +defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>; +defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>; def VLD1d64QPseudo : VLDQQPseudo; def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo; @@ -773,22 +816,28 @@ def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo; // VLD2 : Vector Load (multiple 2-element structures) class VLD2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, - InstrItinClass itin> + InstrItinClass itin, Operand AddrMode> : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd), - (ins addrmode6:$Rn), itin, + (ins AddrMode:$Rn), itin, "vld2", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; } -def VLD2d8 : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2>; -def VLD2d16 : VLD2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2>; -def VLD2d32 : VLD2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2>; +def VLD2d8 : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2, + addrmode6align64or128>; +def VLD2d16 : VLD2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2, + addrmode6align64or128>; +def VLD2d32 : VLD2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2, + addrmode6align64or128>; -def VLD2q8 : VLD2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2>; -def VLD2q16 : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2>; -def VLD2q32 : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2>; +def VLD2q8 : VLD2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2, + addrmode6align64or128or256>; +def VLD2q16 : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2, + addrmode6align64or128or256>; +def VLD2q32 : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2, + addrmode6align64or128or256>; def VLD2q8Pseudo : VLDQQPseudo; def VLD2q16Pseudo : VLDQQPseudo; @@ -796,9 +845,9 @@ def VLD2q32Pseudo : VLDQQPseudo; // ...with address register writeback: multiclass VLD2WB op11_8, bits<4> op7_4, string Dt, - RegisterOperand VdTy, InstrItinClass itin> { + RegisterOperand VdTy, InstrItinClass itin, Operand AddrMode> { def _fixed : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb), - (ins addrmode6:$Rn), itin, + (ins AddrMode:$Rn), itin, "vld2", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
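// The alignments each form admits track its total transfer size: the
// one-register VLD1 moves 64 bits (addrmode6align64), the two-register forms
// 128 bits (addrmode6align64or128), and the four-register forms up to 256
// bits (addrmode6align64or128or256), while the three-register forms stay at
// :64 only, matching the operands chosen in the defs above.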
@@ -806,7 +855,7 @@ multiclass VLD2WB op11_8, bits<4> op7_4, string Dt, let DecoderMethod = "DecodeVLDST2Instruction"; } def _register : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb), - (ins addrmode6:$Rn, rGPR:$Rm), itin, + (ins AddrMode:$Rn, rGPR:$Rm), itin, "vld2", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{5-4} = Rn{5-4}; @@ -814,13 +863,19 @@ multiclass VLD2WB op11_8, bits<4> op7_4, string Dt, } } -defm VLD2d8wb : VLD2WB<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2u>; -defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2u>; -defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2u>; +defm VLD2d8wb : VLD2WB<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2u, + addrmode6align64or128>; +defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2u, + addrmode6align64or128>; +defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2u, + addrmode6align64or128>; -defm VLD2q8wb : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u>; -defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u>; -defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u>; +defm VLD2q8wb : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u, + addrmode6align64or128or256>; +defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u, + addrmode6align64or128or256>; +defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u, + addrmode6align64or128or256>; def VLD2q8PseudoWB_fixed : VLDQQWBfixedPseudo; def VLD2q16PseudoWB_fixed : VLDQQWBfixedPseudo; @@ -830,12 +885,18 @@ def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo; def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo; // ...with double-spaced registers -def VLD2b8 : VLD2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2>; -def VLD2b16 : VLD2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2>; -def VLD2b32 : VLD2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2>; -defm VLD2b8wb : VLD2WB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2u>; -defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2u>; -defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2u>; +def VLD2b8 : VLD2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2, + addrmode6align64or128>; +def VLD2b16 : VLD2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2, + addrmode6align64or128>; +def VLD2b32 : VLD2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2, + addrmode6align64or128>; +defm VLD2b8wb : VLD2WB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2u, + addrmode6align64or128>; +defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2u, + addrmode6align64or128>; +defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2u, + addrmode6align64or128>; // VLD3 : Vector Load (multiple 3-element structures) class VLD3D op11_8, bits<4> op7_4, string Dt> @@ -1293,47 +1354,55 @@ def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo; } // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 // VLD1DUP : Vector Load (single element to all lanes) -class VLD1DUP op7_4, string Dt, ValueType Ty, PatFrag LoadOp> +class VLD1DUP op7_4, string Dt, ValueType Ty, PatFrag LoadOp, + Operand AddrMode> : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListOneDAllLanes:$Vd), - (ins addrmode6dup:$Rn), + (ins AddrMode:$Rn), IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "", [(set VecListOneDAllLanes:$Vd, - (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$Rn)))))]> { + (Ty (NEONvdup 
(i32 (LoadOp AddrMode:$Rn)))))]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; } -def VLD1DUPd8 : VLD1DUP<{0,0,0,?}, "8", v8i8, extloadi8>; -def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16>; -def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load>; +def VLD1DUPd8 : VLD1DUP<{0,0,0,?}, "8", v8i8, extloadi8, + addrmode6dupalignNone>; +def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16, + addrmode6dupalign16>; +def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load, + addrmode6dupalign32>; def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), (VLD1DUPd32 addrmode6:$addr)>; -class VLD1QDUP op7_4, string Dt, ValueType Ty, PatFrag LoadOp> +class VLD1QDUP op7_4, string Dt, ValueType Ty, PatFrag LoadOp, + Operand AddrMode> : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListDPairAllLanes:$Vd), - (ins addrmode6dup:$Rn), IIC_VLD1dup, + (ins AddrMode:$Rn), IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "", [(set VecListDPairAllLanes:$Vd, - (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$Rn)))))]> { + (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; } -def VLD1DUPq8 : VLD1QDUP<{0,0,1,0}, "8", v16i8, extloadi8>; -def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16", v8i16, extloadi16>; -def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32", v4i32, load>; +def VLD1DUPq8 : VLD1QDUP<{0,0,1,0}, "8", v16i8, extloadi8, + addrmode6dupalignNone>; +def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16", v8i16, extloadi16, + addrmode6dupalign16>; +def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32", v4i32, load, + addrmode6dupalign32>; def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), (VLD1DUPq32 addrmode6:$addr)>; let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { // ...with address register writeback: -multiclass VLD1DUPWB op7_4, string Dt> { +multiclass VLD1DUPWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListOneDAllLanes:$Vd, GPR:$wb), - (ins addrmode6dup:$Rn), IIC_VLD1dupu, + (ins AddrMode:$Rn), IIC_VLD1dupu, "vld1", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. @@ -1342,17 +1411,17 @@ multiclass VLD1DUPWB op7_4, string Dt> { } def _register : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListOneDAllLanes:$Vd, GPR:$wb), - (ins addrmode6dup:$Rn, rGPR:$Rm), IIC_VLD1dupu, + (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1dupu, "vld1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; } } -multiclass VLD1QDUPWB op7_4, string Dt> { +multiclass VLD1QDUPWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListDPairAllLanes:$Vd, GPR:$wb), - (ins addrmode6dup:$Rn), IIC_VLD1dupu, + (ins AddrMode:$Rn), IIC_VLD1dupu, "vld1", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
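[Annotation] For the all-lanes loads just parameterized, the chosen operand ties the permitted alignment to the element size: vld1.8 {d0[]} takes no alignment suffix, while the 16- and 32-bit forms allow :16/:32. A sketch of that mapping, assuming a plain C++ model rather than the actual operand predicates:

#include <cassert>
#include <cstdint>

// Hypothetical model (not LLVM API) of the addrmode6dupalign* selection
// above: a VLD1 all-lanes load of an N-bit element may only be annotated
// ":N"; 8-bit duplicates accept no alignment suffix (addrmode6dupalignNone).
static uint32_t maxVld1DupAlignment(uint32_t ElemBits) {
  return ElemBits == 8 ? 0 : ElemBits;
}

int main() {
  assert(maxVld1DupAlignment(8) == 0);    // vld1.8  {d0[]}, [r0]
  assert(maxVld1DupAlignment(16) == 16);  // vld1.16 {d0[]}, [r0:16]
  assert(maxVld1DupAlignment(32) == 32);  // vld1.32 {d0[]}, [r0:32]
  return 0;
}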
@@ -1361,7 +1430,7 @@ multiclass VLD1QDUPWB op7_4, string Dt> { } def _register : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListDPairAllLanes:$Vd, GPR:$wb), - (ins addrmode6dup:$Rn, rGPR:$Rm), IIC_VLD1dupu, + (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1dupu, "vld1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{4} = Rn{4}; @@ -1369,38 +1438,47 @@ multiclass VLD1QDUPWB op7_4, string Dt> { } } -defm VLD1DUPd8wb : VLD1DUPWB<{0,0,0,0}, "8">; -defm VLD1DUPd16wb : VLD1DUPWB<{0,1,0,?}, "16">; -defm VLD1DUPd32wb : VLD1DUPWB<{1,0,0,?}, "32">; +defm VLD1DUPd8wb : VLD1DUPWB<{0,0,0,0}, "8", addrmode6dupalignNone>; +defm VLD1DUPd16wb : VLD1DUPWB<{0,1,0,?}, "16", addrmode6dupalign16>; +defm VLD1DUPd32wb : VLD1DUPWB<{1,0,0,?}, "32", addrmode6dupalign32>; -defm VLD1DUPq8wb : VLD1QDUPWB<{0,0,1,0}, "8">; -defm VLD1DUPq16wb : VLD1QDUPWB<{0,1,1,?}, "16">; -defm VLD1DUPq32wb : VLD1QDUPWB<{1,0,1,?}, "32">; +defm VLD1DUPq8wb : VLD1QDUPWB<{0,0,1,0}, "8", addrmode6dupalignNone>; +defm VLD1DUPq16wb : VLD1QDUPWB<{0,1,1,?}, "16", addrmode6dupalign16>; +defm VLD1DUPq32wb : VLD1QDUPWB<{1,0,1,?}, "32", addrmode6dupalign32>; // VLD2DUP : Vector Load (single 2-element structure to all lanes) -class VLD2DUP op7_4, string Dt, RegisterOperand VdTy> +class VLD2DUP op7_4, string Dt, RegisterOperand VdTy, Operand AddrMode> : NLdSt<1, 0b10, 0b1101, op7_4, (outs VdTy:$Vd), - (ins addrmode6dup:$Rn), IIC_VLD2dup, + (ins AddrMode:$Rn), IIC_VLD2dup, "vld2", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD2DupInstruction"; } -def VLD2DUPd8 : VLD2DUP<{0,0,0,?}, "8", VecListDPairAllLanes>; -def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16", VecListDPairAllLanes>; -def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32", VecListDPairAllLanes>; +def VLD2DUPd8 : VLD2DUP<{0,0,0,?}, "8", VecListDPairAllLanes, + addrmode6dupalign16>; +def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16", VecListDPairAllLanes, + addrmode6dupalign32>; +def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32", VecListDPairAllLanes, + addrmode6dupalign64>; +// HACK this one, VLD2DUPd8x2 must be changed at the same time with VLD2b8 or +// "vld2.8 {d0[], d2[]}, [r4:32]" will become "vld2.8 {d0, d2}, [r4:32]". // ...with double-spaced registers -def VLD2DUPd8x2 : VLD2DUP<{0,0,1,?}, "8", VecListDPairSpacedAllLanes>; -def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes>; -def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes>; +def VLD2DUPd8x2 : VLD2DUP<{0,0,1,?}, "8", VecListDPairSpacedAllLanes, + addrmode6dupalign16>; +def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes, + addrmode6dupalign32>; +def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes, + addrmode6dupalign64>; // ...with address register writeback: -multiclass VLD2DUPWB op7_4, string Dt, RegisterOperand VdTy> { +multiclass VLD2DUPWB op7_4, string Dt, RegisterOperand VdTy, + Operand AddrMode> { def _fixed : NLdSt<1, 0b10, 0b1101, op7_4, (outs VdTy:$Vd, GPR:$wb), - (ins addrmode6dup:$Rn), IIC_VLD2dupu, + (ins AddrMode:$Rn), IIC_VLD2dupu, "vld2", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
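[Annotation] The VLD2DUP choices above follow the same rule with twice the data: the permitted alignment is the total transfer size, two elements times the element width, hence dupalign16/32/64. The in-line HACK note is why the pairing matters: the matcher distinguishes "vld2.8 {d0[], d2[]}, [r4:32]" from "vld2.8 {d0, d2}, [r4:32]" only through these operand classes, so VLD2DUPd8x2 and VLD2b8 must change together. A hedged sketch of the rule (illustrative C++, not compiler code):

#include <cassert>

// Hypothetical model: for a VLD2 all-lanes load, the ":<align>" value equals
// the total transfer size in bits, i.e. 2 * element-size
// (addrmode6dupalign16/32/64 above).
static unsigned vld2DupAlignment(unsigned ElemBits) {
  return 2 * ElemBits;
}

int main() {
  assert(vld2DupAlignment(8) == 16);   // vld2.8  {d0[], d1[]}, [r0:16]
  assert(vld2DupAlignment(16) == 32);  // vld2.16 {d0[], d1[]}, [r0:32]
  assert(vld2DupAlignment(32) == 64);  // vld2.32 {d0[], d1[]}, [r0:64]
  return 0;
}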
@@ -1409,7 +1487,7 @@ multiclass VLD2DUPWB op7_4, string Dt, RegisterOperand VdTy> { } def _register : NLdSt<1, 0b10, 0b1101, op7_4, (outs VdTy:$Vd, GPR:$wb), - (ins addrmode6dup:$Rn, rGPR:$Rm), IIC_VLD2dupu, + (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD2dupu, "vld2", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{4} = Rn{4}; @@ -1417,13 +1495,19 @@ multiclass VLD2DUPWB op7_4, string Dt, RegisterOperand VdTy> { } } -defm VLD2DUPd8wb : VLD2DUPWB<{0,0,0,0}, "8", VecListDPairAllLanes>; -defm VLD2DUPd16wb : VLD2DUPWB<{0,1,0,?}, "16", VecListDPairAllLanes>; -defm VLD2DUPd32wb : VLD2DUPWB<{1,0,0,?}, "32", VecListDPairAllLanes>; +defm VLD2DUPd8wb : VLD2DUPWB<{0,0,0,0}, "8", VecListDPairAllLanes, + addrmode6dupalign16>; +defm VLD2DUPd16wb : VLD2DUPWB<{0,1,0,?}, "16", VecListDPairAllLanes, + addrmode6dupalign32>; +defm VLD2DUPd32wb : VLD2DUPWB<{1,0,0,?}, "32", VecListDPairAllLanes, + addrmode6dupalign64>; -defm VLD2DUPd8x2wb : VLD2DUPWB<{0,0,1,0}, "8", VecListDPairSpacedAllLanes>; -defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes>; -defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes>; +defm VLD2DUPd8x2wb : VLD2DUPWB<{0,0,1,0}, "8", VecListDPairSpacedAllLanes, + addrmode6dupalign16>; +defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes, + addrmode6dupalign32>; +defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes, + addrmode6dupalign64>; // VLD3DUP : Vector Load (single 3-element structure to all lanes) class VLD3DUP op7_4, string Dt> @@ -1449,22 +1533,22 @@ def VLD3DUPq16 : VLD3DUP<{0,1,1,?}, "16">; def VLD3DUPq32 : VLD3DUP<{1,0,1,?}, "32">; // ...with address register writeback: -class VLD3DUPWB op7_4, string Dt> +class VLD3DUPWB op7_4, string Dt, Operand AddrMode> : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), - (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD3dupu, + (ins AddrMode:$Rn, am6offset:$Rm), IIC_VLD3dupu, "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> { let Inst{4} = 0; let DecoderMethod = "DecodeVLD3DupInstruction"; } -def VLD3DUPd8_UPD : VLD3DUPWB<{0,0,0,0}, "8">; -def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16">; -def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32">; +def VLD3DUPd8_UPD : VLD3DUPWB<{0,0,0,0}, "8", addrmode6dupalign64>; +def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16", addrmode6dupalign64>; +def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32", addrmode6dupalign64>; -def VLD3DUPq8_UPD : VLD3DUPWB<{0,0,1,0}, "8">; -def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16">; -def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32">; +def VLD3DUPq8_UPD : VLD3DUPWB<{0,0,1,0}, "8", addrmode6dupalign64>; +def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16", addrmode6dupalign64>; +def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32", addrmode6dupalign64>; def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo; def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo; @@ -1560,35 +1644,35 @@ class VSTQQQQWBPseudo "$addr.addr = $wb">; // VST1 : Vector Store (multiple single elements) -class VST1D op7_4, string Dt> - : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$Rn, VecListOneD:$Vd), +class VST1D op7_4, string Dt, Operand AddrMode> + : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins AddrMode:$Rn, VecListOneD:$Vd), IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } -class VST1Q op7_4, string Dt> - : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$Rn, VecListDPair:$Vd), +class VST1Q op7_4, string Dt, Operand AddrMode> 
+ : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins AddrMode:$Rn, VecListDPair:$Vd), IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } -def VST1d8 : VST1D<{0,0,0,?}, "8">; -def VST1d16 : VST1D<{0,1,0,?}, "16">; -def VST1d32 : VST1D<{1,0,0,?}, "32">; -def VST1d64 : VST1D<{1,1,0,?}, "64">; +def VST1d8 : VST1D<{0,0,0,?}, "8", addrmode6align64>; +def VST1d16 : VST1D<{0,1,0,?}, "16", addrmode6align64>; +def VST1d32 : VST1D<{1,0,0,?}, "32", addrmode6align64>; +def VST1d64 : VST1D<{1,1,0,?}, "64", addrmode6align64>; -def VST1q8 : VST1Q<{0,0,?,?}, "8">; -def VST1q16 : VST1Q<{0,1,?,?}, "16">; -def VST1q32 : VST1Q<{1,0,?,?}, "32">; -def VST1q64 : VST1Q<{1,1,?,?}, "64">; +def VST1q8 : VST1Q<{0,0,?,?}, "8", addrmode6align64or128>; +def VST1q16 : VST1Q<{0,1,?,?}, "16", addrmode6align64or128>; +def VST1q32 : VST1Q<{1,0,?,?}, "32", addrmode6align64or128>; +def VST1q64 : VST1Q<{1,1,?,?}, "64", addrmode6align64or128>; // ...with address register writeback: -multiclass VST1DWB op7_4, string Dt> { +multiclass VST1DWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00, 0b0111,op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, VecListOneD:$Vd), IIC_VLD1u, + (ins AddrMode:$Rn, VecListOneD:$Vd), IIC_VLD1u, "vst1", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. @@ -1596,7 +1680,7 @@ multiclass VST1DWB op7_4, string Dt> { let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b00,0b0111,op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, rGPR:$Rm, VecListOneD:$Vd), + (ins AddrMode:$Rn, rGPR:$Rm, VecListOneD:$Vd), IIC_VLD1u, "vst1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { @@ -1604,9 +1688,9 @@ multiclass VST1DWB op7_4, string Dt> { let DecoderMethod = "DecodeVLDST1Instruction"; } } -multiclass VST1QWB op7_4, string Dt> { +multiclass VST1QWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, VecListDPair:$Vd), IIC_VLD1x2u, + (ins AddrMode:$Rn, VecListDPair:$Vd), IIC_VLD1x2u, "vst1", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
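[Annotation] Throughout these multiclasses the _fixed/_register split mirrors the VLDn/VSTn Rm field, which the "let Rm = ..." lines pin down: Rm=0b1111 means no writeback, Rm=0b1101 means post-increment by the transfer size (the "!" syntax), and any other Rm post-increments by that register. A compact sketch of the decode rule; the enum and function names are illustrative, not LLVM names:

#include <cassert>
#include <cstdint>

// Hypothetical decode of the VLDn/VSTn Rm field (names are illustrative).
enum class WriteBack { None, Fixed, Register };

static WriteBack decodeRm(uint8_t Rm) {
  if (Rm == 0b1111) return WriteBack::None;      // "vst1 ..., [Rn]"
  if (Rm == 0b1101) return WriteBack::Fixed;     // "vst1 ..., [Rn]!"
  return WriteBack::Register;                    // "vst1 ..., [Rn], Rm"
}

int main() {
  assert(decodeRm(0b1111) == WriteBack::None);
  assert(decodeRm(0b1101) == WriteBack::Fixed);
  assert(decodeRm(0b0010) == WriteBack::Register);
  return 0;
}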
@@ -1614,7 +1698,7 @@ multiclass VST1QWB op7_4, string Dt> { let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, rGPR:$Rm, VecListDPair:$Vd), + (ins AddrMode:$Rn, rGPR:$Rm, VecListDPair:$Vd), IIC_VLD1x2u, "vst1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { @@ -1623,28 +1707,28 @@ multiclass VST1QWB op7_4, string Dt> { } } -defm VST1d8wb : VST1DWB<{0,0,0,?}, "8">; -defm VST1d16wb : VST1DWB<{0,1,0,?}, "16">; -defm VST1d32wb : VST1DWB<{1,0,0,?}, "32">; -defm VST1d64wb : VST1DWB<{1,1,0,?}, "64">; +defm VST1d8wb : VST1DWB<{0,0,0,?}, "8", addrmode6align64>; +defm VST1d16wb : VST1DWB<{0,1,0,?}, "16", addrmode6align64>; +defm VST1d32wb : VST1DWB<{1,0,0,?}, "32", addrmode6align64>; +defm VST1d64wb : VST1DWB<{1,1,0,?}, "64", addrmode6align64>; -defm VST1q8wb : VST1QWB<{0,0,?,?}, "8">; -defm VST1q16wb : VST1QWB<{0,1,?,?}, "16">; -defm VST1q32wb : VST1QWB<{1,0,?,?}, "32">; -defm VST1q64wb : VST1QWB<{1,1,?,?}, "64">; +defm VST1q8wb : VST1QWB<{0,0,?,?}, "8", addrmode6align64or128>; +defm VST1q16wb : VST1QWB<{0,1,?,?}, "16", addrmode6align64or128>; +defm VST1q32wb : VST1QWB<{1,0,?,?}, "32", addrmode6align64or128>; +defm VST1q64wb : VST1QWB<{1,1,?,?}, "64", addrmode6align64or128>; // ...with 3 registers -class VST1D3 op7_4, string Dt> +class VST1D3 op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b00, 0b0110, op7_4, (outs), - (ins addrmode6:$Rn, VecListThreeD:$Vd), + (ins AddrMode:$Rn, VecListThreeD:$Vd), IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } -multiclass VST1D3WB op7_4, string Dt> { +multiclass VST1D3WB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u, + (ins AddrMode:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u, "vst1", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
@@ -1652,7 +1736,7 @@ multiclass VST1D3WB op7_4, string Dt> { let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, rGPR:$Rm, VecListThreeD:$Vd), + (ins AddrMode:$Rn, rGPR:$Rm, VecListThreeD:$Vd), IIC_VLD1x3u, "vst1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { @@ -1661,33 +1745,33 @@ multiclass VST1D3WB op7_4, string Dt> { } } -def VST1d8T : VST1D3<{0,0,0,?}, "8">; -def VST1d16T : VST1D3<{0,1,0,?}, "16">; -def VST1d32T : VST1D3<{1,0,0,?}, "32">; -def VST1d64T : VST1D3<{1,1,0,?}, "64">; +def VST1d8T : VST1D3<{0,0,0,?}, "8", addrmode6align64>; +def VST1d16T : VST1D3<{0,1,0,?}, "16", addrmode6align64>; +def VST1d32T : VST1D3<{1,0,0,?}, "32", addrmode6align64>; +def VST1d64T : VST1D3<{1,1,0,?}, "64", addrmode6align64>; -defm VST1d8Twb : VST1D3WB<{0,0,0,?}, "8">; -defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16">; -defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32">; -defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64">; +defm VST1d8Twb : VST1D3WB<{0,0,0,?}, "8", addrmode6align64>; +defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>; +defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>; +defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>; def VST1d64TPseudo : VSTQQPseudo; def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo; def VST1d64TPseudoWB_register : VSTQQWBPseudo; // ...with 4 registers -class VST1D4 op7_4, string Dt> +class VST1D4 op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b00, 0b0010, op7_4, (outs), - (ins addrmode6:$Rn, VecListFourD:$Vd), + (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VST1x4, "vst1", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } -multiclass VST1D4WB op7_4, string Dt> { +multiclass VST1D4WB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, VecListFourD:$Vd), IIC_VLD1x4u, + (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1x4u, "vst1", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
@@ -1695,7 +1779,7 @@ multiclass VST1D4WB op7_4, string Dt> { let DecoderMethod = "DecodeVLDST1Instruction"; } def _register : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd), + (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd), IIC_VLD1x4u, "vst1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { @@ -1704,15 +1788,15 @@ multiclass VST1D4WB op7_4, string Dt> { } } -def VST1d8Q : VST1D4<{0,0,?,?}, "8">; -def VST1d16Q : VST1D4<{0,1,?,?}, "16">; -def VST1d32Q : VST1D4<{1,0,?,?}, "32">; -def VST1d64Q : VST1D4<{1,1,?,?}, "64">; +def VST1d8Q : VST1D4<{0,0,?,?}, "8", addrmode6align64or128or256>; +def VST1d16Q : VST1D4<{0,1,?,?}, "16", addrmode6align64or128or256>; +def VST1d32Q : VST1D4<{1,0,?,?}, "32", addrmode6align64or128or256>; +def VST1d64Q : VST1D4<{1,1,?,?}, "64", addrmode6align64or128or256>; -defm VST1d8Qwb : VST1D4WB<{0,0,?,?}, "8">; -defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16">; -defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32">; -defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64">; +defm VST1d8Qwb : VST1D4WB<{0,0,?,?}, "8", addrmode6align64or128or256>; +defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>; +defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>; +defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>; def VST1d64QPseudo : VSTQQPseudo; def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo; @@ -1720,21 +1804,27 @@ def VST1d64QPseudoWB_register : VSTQQWBPseudo; // VST2 : Vector Store (multiple 2-element structures) class VST2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, - InstrItinClass itin> - : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, VdTy:$Vd), + InstrItinClass itin, Operand AddrMode> + : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins AddrMode:$Rn, VdTy:$Vd), itin, "vst2", Dt, "$Vd, $Rn", "", []> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; } -def VST2d8 : VST2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VST2>; -def VST2d16 : VST2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VST2>; -def VST2d32 : VST2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VST2>; +def VST2d8 : VST2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VST2, + addrmode6align64or128>; +def VST2d16 : VST2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VST2, + addrmode6align64or128>; +def VST2d32 : VST2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VST2, + addrmode6align64or128>; -def VST2q8 : VST2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VST2x2>; -def VST2q16 : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VST2x2>; -def VST2q32 : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2>; +def VST2q8 : VST2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VST2x2, + addrmode6align64or128or256>; +def VST2q16 : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VST2x2, + addrmode6align64or128or256>; +def VST2q32 : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2, + addrmode6align64or128or256>; def VST2q8Pseudo : VSTQQPseudo; def VST2q16Pseudo : VSTQQPseudo; @@ -1742,9 +1832,9 @@ def VST2q32Pseudo : VSTQQPseudo; // ...with address register writeback: multiclass VST2DWB op11_8, bits<4> op7_4, string Dt, - RegisterOperand VdTy> { + RegisterOperand VdTy, Operand AddrMode> { def _fixed : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, VdTy:$Vd), IIC_VLD1u, + (ins AddrMode:$Rn, VdTy:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
@@ -1752,16 +1842,16 @@ multiclass VST2DWB op11_8, bits<4> op7_4, string Dt, let DecoderMethod = "DecodeVLDST2Instruction"; } def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u, + (ins AddrMode:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; } } -multiclass VST2QWB op7_4, string Dt> { +multiclass VST2QWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, VecListFourD:$Vd), IIC_VLD1u, + (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. @@ -1769,7 +1859,7 @@ multiclass VST2QWB op7_4, string Dt> { let DecoderMethod = "DecodeVLDST2Instruction"; } def _register : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd), + (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { @@ -1778,13 +1868,16 @@ multiclass VST2QWB op7_4, string Dt> { } } -defm VST2d8wb : VST2DWB<0b1000, {0,0,?,?}, "8", VecListDPair>; -defm VST2d16wb : VST2DWB<0b1000, {0,1,?,?}, "16", VecListDPair>; -defm VST2d32wb : VST2DWB<0b1000, {1,0,?,?}, "32", VecListDPair>; +defm VST2d8wb : VST2DWB<0b1000, {0,0,?,?}, "8", VecListDPair, + addrmode6align64or128>; +defm VST2d16wb : VST2DWB<0b1000, {0,1,?,?}, "16", VecListDPair, + addrmode6align64or128>; +defm VST2d32wb : VST2DWB<0b1000, {1,0,?,?}, "32", VecListDPair, + addrmode6align64or128>; -defm VST2q8wb : VST2QWB<{0,0,?,?}, "8">; -defm VST2q16wb : VST2QWB<{0,1,?,?}, "16">; -defm VST2q32wb : VST2QWB<{1,0,?,?}, "32">; +defm VST2q8wb : VST2QWB<{0,0,?,?}, "8", addrmode6align64or128or256>; +defm VST2q16wb : VST2QWB<{0,1,?,?}, "16", addrmode6align64or128or256>; +defm VST2q32wb : VST2QWB<{1,0,?,?}, "32", addrmode6align64or128or256>; def VST2q8PseudoWB_fixed : VSTQQWBfixedPseudo; def VST2q16PseudoWB_fixed : VSTQQWBfixedPseudo; @@ -1794,12 +1887,18 @@ def VST2q16PseudoWB_register : VSTQQWBregisterPseudo; def VST2q32PseudoWB_register : VSTQQWBregisterPseudo; // ...with double-spaced registers -def VST2b8 : VST2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VST2>; -def VST2b16 : VST2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VST2>; -def VST2b32 : VST2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VST2>; -defm VST2b8wb : VST2DWB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced>; -defm VST2b16wb : VST2DWB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced>; -defm VST2b32wb : VST2DWB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced>; +def VST2b8 : VST2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VST2, + addrmode6align64or128>; +def VST2b16 : VST2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VST2, + addrmode6align64or128>; +def VST2b32 : VST2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VST2, + addrmode6align64or128>; +defm VST2b8wb : VST2DWB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, + addrmode6align64or128>; +defm VST2b16wb : VST2DWB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, + addrmode6align64or128>; +defm VST2b32wb : VST2DWB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, + addrmode6align64or128>; // VST3 : Vector Store (multiple 3-element structures) class VST3D op11_8, bits<4> op7_4, string Dt> @@ -2267,9 +2366,9 @@ def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)), def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr), 
(VST1q64 addrmode6:$addr, QPR:$value)>; def : Pat<(v2f64 (word_alignedload addrmode6:$addr)), - (VLD1q32 addrmode6:$addr)>; + (VLD1q32 addrmode6:$addr)>, Requires<[IsLE]>; def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr), - (VST1q32 addrmode6:$addr, QPR:$value)>; + (VST1q32 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>; def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)), (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>; def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr), @@ -2357,14 +2456,14 @@ class N2VDIntnp op17_16, bits<3> op10_8, bit op7, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2Vnp<0b10, op17_16, op10_8, op7, 0, (outs DPR:$Vd), (ins DPR:$Vm), - itin, OpcodeStr, Dt, ResTy, OpTy, + itin, OpcodeStr, Dt, [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; class N2VQIntnp op17_16, bits<3> op10_8, bit op7, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2Vnp<0b10, op17_16, op10_8, op7, 1, (outs QPR:$Vd), (ins QPR:$Vm), - itin, OpcodeStr, Dt, ResTy, OpTy, + itin, OpcodeStr, Dt, [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; // Similar to NV2VQIntnp with some more encoding bits exposed (crypto). @@ -2372,7 +2471,7 @@ class N2VQIntXnp op19_18, bits<2> op17_16, bits<3> op10_8, bit op6, bit op7, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2Vnp; // Same as N2VQIntXnp but with Vd as a src register. @@ -2381,7 +2480,7 @@ class N2VQIntX2np op19_18, bits<2> op17_16, bits<3> op10_8, bit op6, ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> : N2Vnp { let Constraints = "$src = $Vd"; } @@ -2555,7 +2654,6 @@ class N3VDIntnp op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, SDPatternOperator IntOp, bit Commutable> : N3Vnp; class N3VDIntSL op21_20, bits<4> op11_8, InstrItinClass itin, @@ -2609,7 +2707,6 @@ class N3VQIntnp op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, SDPatternOperator IntOp, bit Commutable> : N3Vnp; // Same as N3VQIntnp but with Vd as a src register. @@ -2618,8 +2715,8 @@ class N3VQInt3np op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, string Dt, ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable> : N3Vnp { let Constraints = "$src = $Vd"; @@ -2939,7 +3036,6 @@ class N3VLIntnp op27_23, bits<2> op21_20, bits<4> op11_8, bit op6, SDPatternOperator IntOp, bit Commutable> : N3Vnp; class N3VLIntSL op21_20, bits<4> op11_8, InstrItinClass itin, @@ -5245,6 +5341,35 @@ def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd), [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>; } // isReMaterializable +// Add support for bytes replication feature, so it could be GAS compatible. +// E.g. instructions below: +// "vmov.i32 d0, 0xffffffff" +// "vmov.i32 d0, 0xabababab" +// "vmov.i16 d0, 0xabab" +// are incorrect, but we could deal with such cases. +// For last two instructions, for example, it should emit: +// "vmov.i8 d0, 0xab" +def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm", + (VMOVv8i8 DPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>; +def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm", + (VMOVv8i8 DPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>; +def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm", + (VMOVv16i8 QPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>; +def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm", + (VMOVv16i8 QPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>; + +// Also add same support for VMVN instructions. 
So instruction: +// "vmvn.i32 d0, 0xabababab" +// actually means: +// "vmov.i8 d0, 0x54" +def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm", + (VMOVv8i8 DPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>; +def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm", + (VMOVv8i8 DPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>; +def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm", + (VMOVv16i8 QPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>; +def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm", + (VMOVv16i8 QPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>; // On some CPUs the two instructions "vmov.i32 dD, #0" and "vmov.i32 qD, #0" // require zero cycles to execute so they should be used wherever possible for @@ -5617,22 +5742,22 @@ def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", v4f32, v4i32, int_arm_neon_vcvtfxu2fp>; } -def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0", +def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0", (VCVTf2sd DPR:$Dd, DPR:$Dm, pred:$p)>; -def : NEONInstAlias<"vcvt${p}.u32.f32 $Dd, $Dm, #0", +def : NEONInstAlias<"vcvt${p}.u32.f32 $Dd, $Dm, #0", (VCVTf2ud DPR:$Dd, DPR:$Dm, pred:$p)>; -def : NEONInstAlias<"vcvt${p}.f32.s32 $Dd, $Dm, #0", +def : NEONInstAlias<"vcvt${p}.f32.s32 $Dd, $Dm, #0", (VCVTs2fd DPR:$Dd, DPR:$Dm, pred:$p)>; -def : NEONInstAlias<"vcvt${p}.f32.u32 $Dd, $Dm, #0", +def : NEONInstAlias<"vcvt${p}.f32.u32 $Dd, $Dm, #0", (VCVTu2fd DPR:$Dd, DPR:$Dm, pred:$p)>; -def : NEONInstAlias<"vcvt${p}.s32.f32 $Qd, $Qm, #0", +def : NEONInstAlias<"vcvt${p}.s32.f32 $Qd, $Qm, #0", (VCVTf2sq QPR:$Qd, QPR:$Qm, pred:$p)>; -def : NEONInstAlias<"vcvt${p}.u32.f32 $Qd, $Qm, #0", +def : NEONInstAlias<"vcvt${p}.u32.f32 $Qd, $Qm, #0", (VCVTf2uq QPR:$Qd, QPR:$Qm, pred:$p)>; -def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0", +def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0", (VCVTs2fq QPR:$Qd, QPR:$Qm, pred:$p)>; -def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0", +def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0", (VCVTu2fq QPR:$Qd, QPR:$Qm, pred:$p)>; @@ -6051,67 +6176,145 @@ def : Pat<(f32 (bitconvert GPR:$a)), //===----------------------------------------------------------------------===// // bit_convert -def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>; -def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>; -def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>; +} def : Pat<(v1i64 (bitconvert (f64 DPR:$src))), (v1i64 DPR:$src)>; -def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>; -def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>; -def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>; -def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>; -def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>; +} def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>; -def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>; -def : Pat<(v4i16 (bitconvert (v2i32 
DPR:$src))), (v4i16 DPR:$src)>; -def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>; -def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>; -def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>; -def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (v8i8 DPR:$src)>; -def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>; -def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>; -def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>; -def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>; +} def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>; -def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>; -def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>; -def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>; -def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>; -def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>; -def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>; +} def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>; -def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>; -def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>; +} -def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>; -def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>; -def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>; +} def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>; -def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; -def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>; -def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>; -def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>; -def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v4i32 (bitconvert 
(v2i64 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>; +} def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>; -def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; -def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>; -def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>; -def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>; -def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>; -def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>; -def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>; -def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>; -def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>; -def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>; -def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>; +} def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>; -def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>; -def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>; -def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>; +} def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>; -def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>; -def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; -def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; -def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; +} + +let Predicates = [IsBE] in { + // 64 bit conversions + def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>; 
+ def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v1i64 DPR:$src))), (VREV64d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (VREV32d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (VREV16d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>; + def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>; + def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>; + + // 128 bit conversions + def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>; +} // Fold extracting an element out of a v2i32 into a vfp register. 
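[Annotation] The big-endian block above implements bitconvert as an explicit lane reversal: on BE targets the in-register lane order and the in-memory byte order disagree, so reinterpreting between element widths needs a VREV whose group size is the wider of the two element types (see docs/BigEndianNEON.rst, added by this patch, for the full argument). A small simulation, assuming plain C++ rather than codegen, of the VREV64d16 used for the v1i64/v4i16 cases:

#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

// Simulation (illustrative C++, not compiler code) of VREV64.16 on one
// d-register: reverse the 16-bit lanes inside each 64-bit group.  This is
// the shuffle the v1i64 <-> v4i16 bitconvert patterns above emit on BE.
template <typename Lane, std::size_t N>
std::array<Lane, N> vrev64(std::array<Lane, N> V) {
  const std::size_t PerGroup = 8 / sizeof(Lane); // lanes per 64-bit group
  for (std::size_t G = 0; G < N; G += PerGroup)
    std::reverse(V.begin() + G, V.begin() + G + PerGroup);
  return V;
}

int main() {
  std::array<uint16_t, 4> D = {0, 1, 2, 3};
  assert((vrev64(D) == std::array<uint16_t, 4>{3, 2, 1, 0}));
  return 0;
}

The same naming scheme explains every pattern in the block: VREV32d16 for v2i32 <-> v4i16, VREV16q8 for v8i16 <-> v16i8, and so on.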
def : Pat<(f32 (bitconvert (i32 (extractelt (v2i32 DPR:$src), imm:$lane)))), @@ -6120,7 +6323,7 @@ def : Pat<(f32 (bitconvert (i32 (extractelt (v2i32 DPR:$src), imm:$lane)))), // Vector lengthening move with load, matching extending loads. // extload, zextload and sextload for a standard lengthening load. Example: -// Lengthen_Single<"8", "i16", "8"> = +// Lengthen_Single<"8", "i16", "8"> = // Pat<(v8i16 (extloadvi8 addrmode6:$addr)) // (VMOVLuv8i16 (VLD1d8 addrmode6:$addr, // (f64 (IMPLICIT_DEF)), (i32 0)))>; @@ -6147,7 +6350,7 @@ multiclass Lengthen_Single { // half the lanes available. Example: // Lengthen_HalfSingle<"4", "i16", "8", "i16", "i8"> = // Pat<(v4i16 (extloadvi8 addrmode6oneL32:$addr)), -// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr, +// (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr, // (f64 (IMPLICIT_DEF)), (i32 0))), // dsub_0)>; multiclass Lengthen_HalfSingle; // Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64 def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)), (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 - (VLD1LNd16 addrmode6:$addr, + (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>; def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)), (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16 @@ -6311,379 +6514,442 @@ defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm", // VLD1 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VLD1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr", - (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VLD1LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr", - (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr, + pred:$p)>; def VLD1LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr", - (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VLD1LNdWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr!", - (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VLD1LNdWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr!", - (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr, + pred:$p)>; def VLD1LNdWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr!", - (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VLD1LNdWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr, $Rm", - (ins VecListOneDByteIndexed:$list, addrmode6:$addr, + (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; def VLD1LNdWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr, $Rm", - (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, + (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr, rGPR:$Rm, pred:$p)>; def VLD1LNdWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr, $Rm", - (ins VecListOneDWordIndexed:$list, addrmode6:$addr, + (ins 
VecListOneDWordIndexed:$list, addrmode6align32:$addr, rGPR:$Rm, pred:$p)>; // VST1 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VST1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr", - (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VST1LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr", - (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr, + pred:$p)>; def VST1LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr", - (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VST1LNdWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr!", - (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VST1LNdWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr!", - (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr, + pred:$p)>; def VST1LNdWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr!", - (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VST1LNdWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr, $Rm", - (ins VecListOneDByteIndexed:$list, addrmode6:$addr, + (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; def VST1LNdWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr, $Rm", - (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, + (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr, rGPR:$Rm, pred:$p)>; def VST1LNdWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr, $Rm", - (ins VecListOneDWordIndexed:$list, addrmode6:$addr, + (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr, rGPR:$Rm, pred:$p)>; // VLD2 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. 
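[Annotation] As the comments here say, the lane index inside the list syntax is why these are AsmPseudoInsts: an InstAlias maps text to operands one-to-one, but "{d0[1]}" carries a lane number that must be folded into encoding bits by custom matcher code. A toy illustration (hypothetical helper, not the LLVM parser) of pulling that index out of the operand text:

#include <cassert>
#include <cstdio>

// Hypothetical sketch (not the LLVM parser): the lane number in "{d0[1]}"
// selects instruction encoding bits, which a purely textual alias cannot
// compute.  Returns the lane index, or -1 for an all-register list.
static int parseLaneIndex(const char *Operand) {
  int Reg, Lane;
  return std::sscanf(Operand, "{d%d[%d]}", &Reg, &Lane) == 2 ? Lane : -1;
}

int main() {
  assert(parseLaneIndex("{d0[1]}") == 1);  // lane-indexed: needs pseudo-inst
  assert(parseLaneIndex("{d0}") == -1);    // plain list: a normal alias works
  return 0;
}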
def VLD2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr", - (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr, + pred:$p)>; def VLD2LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr", - (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VLD2LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr", - (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, pred:$p)>; def VLD2LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr", - (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VLD2LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr", - (ins VecListTwoQWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VLD2LNdWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr!", - (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr, + pred:$p)>; def VLD2LNdWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr!", - (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VLD2LNdWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr!", - (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VLD2LNqWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr!", - (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VLD2LNqWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr!", - (ins VecListTwoQWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VLD2LNdWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr, $Rm", - (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, + (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr, rGPR:$Rm, pred:$p)>; def VLD2LNdWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr, $Rm", - (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, + (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr, rGPR:$Rm, pred:$p)>; def VLD2LNdWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr, $Rm", - (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, + (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VLD2LNqWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr, $Rm", - (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr, + (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr, rGPR:$Rm, pred:$p)>; def VLD2LNqWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr, $Rm", - (ins VecListTwoQWordIndexed:$list, addrmode6:$addr, + (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; // VST2 single-lane pseudo-instructions. 
These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VST2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr", - (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr, + pred:$p)>; def VST2LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr", - (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VST2LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr", - (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VST2LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr", - (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VST2LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr", - (ins VecListTwoQWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VST2LNdWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr!", - (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr, + pred:$p)>; def VST2LNdWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr!", - (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VST2LNdWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr!", - (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VST2LNqWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr!", - (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VST2LNqWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr!", - (ins VecListTwoQWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VST2LNdWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr, $Rm", - (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, + (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr, rGPR:$Rm, pred:$p)>; def VST2LNdWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16","$list, $addr, $Rm", - (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, + (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr, rGPR:$Rm, pred:$p)>; def VST2LNdWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr, $Rm", - (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, + (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VST2LNqWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16","$list, $addr, $Rm", - (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr, + (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr, rGPR:$Rm, pred:$p)>; def VST2LNqWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr, $Rm", - (ins VecListTwoQWordIndexed:$list, addrmode6:$addr, + (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; // VLD3 all-lanes 
pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", - (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; def VLD3DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", - (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; def VLD3DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", - (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; def VLD3DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", - (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; def VLD3DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", - (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; def VLD3DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", - (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; def VLD3DUPdWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!", - (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; def VLD3DUPdWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!", - (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; def VLD3DUPdWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!", - (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; def VLD3DUPqWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!", - (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; def VLD3DUPqWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!", - (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; def VLD3DUPqWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!", - (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, + pred:$p)>; def VLD3DUPdWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm", - (ins VecListThreeDAllLanes:$list, addrmode6:$addr, + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, rGPR:$Rm, pred:$p)>; def VLD3DUPdWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm", - (ins VecListThreeDAllLanes:$list, addrmode6:$addr, + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, rGPR:$Rm, pred:$p)>; def VLD3DUPdWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm", - (ins VecListThreeDAllLanes:$list, addrmode6:$addr, + (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr, 
rGPR:$Rm, pred:$p)>; def VLD3DUPqWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm", - (ins VecListThreeQAllLanes:$list, addrmode6:$addr, + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, rGPR:$Rm, pred:$p)>; def VLD3DUPqWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm", - (ins VecListThreeQAllLanes:$list, addrmode6:$addr, + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, rGPR:$Rm, pred:$p)>; def VLD3DUPqWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm", - (ins VecListThreeQAllLanes:$list, addrmode6:$addr, + (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr, rGPR:$Rm, pred:$p)>; // VLD3 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VLD3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", - (ins VecListThreeDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VLD3LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", - (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VLD3LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", - (ins VecListThreeDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VLD3LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", - (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VLD3LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", - (ins VecListThreeQWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VLD3LNdWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!", - (ins VecListThreeDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VLD3LNdWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!", - (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VLD3LNdWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!", - (ins VecListThreeDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VLD3LNqWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!", - (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VLD3LNqWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!", - (ins VecListThreeQWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VLD3LNdWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm", - (ins VecListThreeDByteIndexed:$list, addrmode6:$addr, + (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; def VLD3LNdWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm", - (ins 
VecListThreeDHWordIndexed:$list, addrmode6:$addr, - rGPR:$Rm, pred:$p)>; + (ins VecListThreeDHWordIndexed:$list, + addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; def VLD3LNdWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm", - (ins VecListThreeDWordIndexed:$list, addrmode6:$addr, + (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; def VLD3LNqWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm", - (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr, - rGPR:$Rm, pred:$p)>; + (ins VecListThreeQHWordIndexed:$list, + addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; def VLD3LNqWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm", - (ins VecListThreeQWordIndexed:$list, addrmode6:$addr, + (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; // VLD3 multiple structure pseudo-instructions. These need special handling for // the vector operands that the normal instructions don't yet model. // FIXME: Remove these when the register classes and instructions are updated. def VLD3dAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", - (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; def VLD3dAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", - (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; def VLD3dAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", - (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; def VLD3qAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr", - (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; def VLD3qAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr", - (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; def VLD3qAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr", - (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; def VLD3dWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!", - (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; def VLD3dWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!", - (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; def VLD3dWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!", - (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; def VLD3qWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!", - (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; def VLD3qWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!", - (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; def VLD3qWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!", - (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQ:$list, addrmode6align64:$addr, 
pred:$p)>; def VLD3dWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm", - (ins VecListThreeD:$list, addrmode6:$addr, + (ins VecListThreeD:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VLD3dWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm", - (ins VecListThreeD:$list, addrmode6:$addr, + (ins VecListThreeD:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VLD3dWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm", - (ins VecListThreeD:$list, addrmode6:$addr, + (ins VecListThreeD:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VLD3qWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm", - (ins VecListThreeQ:$list, addrmode6:$addr, + (ins VecListThreeQ:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VLD3qWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm", - (ins VecListThreeQ:$list, addrmode6:$addr, + (ins VecListThreeQ:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VLD3qWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm", - (ins VecListThreeQ:$list, addrmode6:$addr, + (ins VecListThreeQ:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; // VST3 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VST3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr", - (ins VecListThreeDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VST3LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr", - (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VST3LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr", - (ins VecListThreeDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VST3LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr", - (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VST3LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr", - (ins VecListThreeQWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VST3LNdWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!", - (ins VecListThreeDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VST3LNdWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!", - (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VST3LNdWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!", - (ins VecListThreeDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VST3LNqWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!", - (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VST3LNqWB_fixed_Asm_32 : 
NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!", - (ins VecListThreeQWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr, + pred:$p)>; def VST3LNdWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm", - (ins VecListThreeDByteIndexed:$list, addrmode6:$addr, + (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; def VST3LNdWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm", - (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr, - rGPR:$Rm, pred:$p)>; + (ins VecListThreeDHWordIndexed:$list, + addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; def VST3LNdWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm", - (ins VecListThreeDWordIndexed:$list, addrmode6:$addr, + (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; def VST3LNqWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm", - (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr, - rGPR:$Rm, pred:$p)>; + (ins VecListThreeQHWordIndexed:$list, + addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; def VST3LNqWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm", - (ins VecListThreeQWordIndexed:$list, addrmode6:$addr, + (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>; @@ -6691,168 +6957,190 @@ def VST3LNqWB_register_Asm_32 : // the vector operands that the normal instructions don't yet model. // FIXME: Remove these when the register classes and instructions are updated. def VST3dAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr", - (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; def VST3dAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr", - (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; def VST3dAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr", - (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; def VST3qAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr", - (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; def VST3qAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr", - (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; def VST3qAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr", - (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; def VST3dWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!", - (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; def VST3dWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!", - (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; def VST3dWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!", - (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>; def VST3qWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, 
$addr!", - (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; def VST3qWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!", - (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; def VST3qWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!", - (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>; def VST3dWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm", - (ins VecListThreeD:$list, addrmode6:$addr, + (ins VecListThreeD:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VST3dWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm", - (ins VecListThreeD:$list, addrmode6:$addr, + (ins VecListThreeD:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VST3dWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm", - (ins VecListThreeD:$list, addrmode6:$addr, + (ins VecListThreeD:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VST3qWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm", - (ins VecListThreeQ:$list, addrmode6:$addr, + (ins VecListThreeQ:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VST3qWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm", - (ins VecListThreeQ:$list, addrmode6:$addr, + (ins VecListThreeQ:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VST3qWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm", - (ins VecListThreeQ:$list, addrmode6:$addr, + (ins VecListThreeQ:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; // VLD4 all-lanes pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. 
def VLD4DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", - (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr, + pred:$p)>; def VLD4DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", - (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr, + pred:$p)>; def VLD4DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", - (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDAllLanes:$list, addrmode6dupalign64or128:$addr, + pred:$p)>; def VLD4DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", - (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr, + pred:$p)>; def VLD4DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", - (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr, + pred:$p)>; def VLD4DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", - (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQAllLanes:$list, addrmode6dupalign64or128:$addr, + pred:$p)>; def VLD4DUPdWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!", - (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr, + pred:$p)>; def VLD4DUPdWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!", - (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr, + pred:$p)>; def VLD4DUPdWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!", - (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDAllLanes:$list, addrmode6dupalign64or128:$addr, + pred:$p)>; def VLD4DUPqWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!", - (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr, + pred:$p)>; def VLD4DUPqWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!", - (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr, + pred:$p)>; def VLD4DUPqWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!", - (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQAllLanes:$list, addrmode6dupalign64or128:$addr, + pred:$p)>; def VLD4DUPdWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm", - (ins VecListFourDAllLanes:$list, addrmode6:$addr, + (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr, rGPR:$Rm, pred:$p)>; def VLD4DUPdWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm", - (ins VecListFourDAllLanes:$list, addrmode6:$addr, + (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr, rGPR:$Rm, pred:$p)>; def VLD4DUPdWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm", - (ins VecListFourDAllLanes:$list, addrmode6:$addr, - rGPR:$Rm, pred:$p)>; + (ins VecListFourDAllLanes:$list, + addrmode6dupalign64or128:$addr, rGPR:$Rm, pred:$p)>; def VLD4DUPqWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm", - (ins 
VecListFourQAllLanes:$list, addrmode6:$addr, + (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr, rGPR:$Rm, pred:$p)>; def VLD4DUPqWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm", - (ins VecListFourQAllLanes:$list, addrmode6:$addr, + (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr, rGPR:$Rm, pred:$p)>; def VLD4DUPqWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm", - (ins VecListFourQAllLanes:$list, addrmode6:$addr, - rGPR:$Rm, pred:$p)>; + (ins VecListFourQAllLanes:$list, + addrmode6dupalign64or128:$addr, rGPR:$Rm, pred:$p)>; // VLD4 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VLD4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", - (ins VecListFourDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VLD4LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", - (ins VecListFourDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VLD4LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", - (ins VecListFourDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; def VLD4LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", - (ins VecListFourQHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VLD4LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", - (ins VecListFourQWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; def VLD4LNdWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!", - (ins VecListFourDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VLD4LNdWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!", - (ins VecListFourDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VLD4LNdWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!", - (ins VecListFourDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; def VLD4LNqWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!", - (ins VecListFourQHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VLD4LNqWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!", - (ins VecListFourQWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; def VLD4LNdWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm", - (ins VecListFourDByteIndexed:$list, addrmode6:$addr, + (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr, rGPR:$Rm, pred:$p)>; def VLD4LNdWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm", - (ins VecListFourDHWordIndexed:$list, addrmode6:$addr, + (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def 
VLD4LNdWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm", - (ins VecListFourDWordIndexed:$list, addrmode6:$addr, - rGPR:$Rm, pred:$p)>; + (ins VecListFourDWordIndexed:$list, + addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>; def VLD4LNqWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm", - (ins VecListFourQHWordIndexed:$list, addrmode6:$addr, + (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VLD4LNqWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm", - (ins VecListFourQWordIndexed:$list, addrmode6:$addr, - rGPR:$Rm, pred:$p)>; + (ins VecListFourQWordIndexed:$list, + addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>; @@ -6860,168 +7148,202 @@ def VLD4LNqWB_register_Asm_32 : // the vector operands that the normal instructions don't yet model. // FIXME: Remove these when the register classes and instructions are updated. def VLD4dAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", - (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VLD4dAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", - (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VLD4dAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", - (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VLD4qAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr", - (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VLD4qAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr", - (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VLD4qAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr", - (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VLD4dWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!", - (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VLD4dWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!", - (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VLD4dWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!", - (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VLD4qWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!", - (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VLD4qWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!", - (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VLD4qWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!", - (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VLD4dWB_register_Asm_8 : 
NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm", - (ins VecListFourD:$list, addrmode6:$addr, + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, rGPR:$Rm, pred:$p)>; def VLD4dWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm", - (ins VecListFourD:$list, addrmode6:$addr, + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, rGPR:$Rm, pred:$p)>; def VLD4dWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm", - (ins VecListFourD:$list, addrmode6:$addr, + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, rGPR:$Rm, pred:$p)>; def VLD4qWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm", - (ins VecListFourQ:$list, addrmode6:$addr, + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, rGPR:$Rm, pred:$p)>; def VLD4qWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm", - (ins VecListFourQ:$list, addrmode6:$addr, + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, rGPR:$Rm, pred:$p)>; def VLD4qWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm", - (ins VecListFourQ:$list, addrmode6:$addr, + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, rGPR:$Rm, pred:$p)>; // VST4 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VST4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr", - (ins VecListFourDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VST4LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr", - (ins VecListFourDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VST4LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr", - (ins VecListFourDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; def VST4LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr", - (ins VecListFourQHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VST4LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr", - (ins VecListFourQWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; def VST4LNdWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!", - (ins VecListFourDByteIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr, + pred:$p)>; def VST4LNdWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!", - (ins VecListFourDHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VST4LNdWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!", - (ins VecListFourDWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; def VST4LNqWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!", - (ins VecListFourQHWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr, + pred:$p)>; def VST4LNqWB_fixed_Asm_32 : 
NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!", - (ins VecListFourQWordIndexed:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr, + pred:$p)>; def VST4LNdWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm", - (ins VecListFourDByteIndexed:$list, addrmode6:$addr, + (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr, rGPR:$Rm, pred:$p)>; def VST4LNdWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm", - (ins VecListFourDHWordIndexed:$list, addrmode6:$addr, + (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VST4LNdWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm", - (ins VecListFourDWordIndexed:$list, addrmode6:$addr, - rGPR:$Rm, pred:$p)>; + (ins VecListFourDWordIndexed:$list, + addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>; def VST4LNqWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm", - (ins VecListFourQHWordIndexed:$list, addrmode6:$addr, + (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr, rGPR:$Rm, pred:$p)>; def VST4LNqWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm", - (ins VecListFourQWordIndexed:$list, addrmode6:$addr, - rGPR:$Rm, pred:$p)>; + (ins VecListFourQWordIndexed:$list, + addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>; // VST4 multiple structure pseudo-instructions. These need special handling for // the vector operands that the normal instructions don't yet model. // FIXME: Remove these when the register classes and instructions are updated. def VST4dAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr", - (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VST4dAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr", - (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VST4dAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr", - (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VST4qAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr", - (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VST4qAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr", - (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VST4qAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr", - (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VST4dWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!", - (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VST4dWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!", - (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VST4dWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!", - (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourD:$list, 
addrmode6align64or128or256:$addr, + pred:$p)>; def VST4qWB_fixed_Asm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!", - (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VST4qWB_fixed_Asm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!", - (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VST4qWB_fixed_Asm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!", - (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>; + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, + pred:$p)>; def VST4dWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm", - (ins VecListFourD:$list, addrmode6:$addr, + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, rGPR:$Rm, pred:$p)>; def VST4dWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm", - (ins VecListFourD:$list, addrmode6:$addr, + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, rGPR:$Rm, pred:$p)>; def VST4dWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm", - (ins VecListFourD:$list, addrmode6:$addr, + (ins VecListFourD:$list, addrmode6align64or128or256:$addr, rGPR:$Rm, pred:$p)>; def VST4qWB_register_Asm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm", - (ins VecListFourQ:$list, addrmode6:$addr, + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, rGPR:$Rm, pred:$p)>; def VST4qWB_register_Asm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm", - (ins VecListFourQ:$list, addrmode6:$addr, + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, rGPR:$Rm, pred:$p)>; def VST4qWB_register_Asm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm", - (ins VecListFourQ:$list, addrmode6:$addr, + (ins VecListFourQ:$list, addrmode6align64or128or256:$addr, rGPR:$Rm, pred:$p)>; // VMOV/VMVN takes an optional datatype suffix diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 754295f..e17f73a 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -269,7 +269,8 @@ class T1SystemEncoding opc> let Inst{7-0} = opc; } -def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm", []>, +def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm", + [(int_arm_hint imm0_15:$imm)]>, T1SystemEncoding<0x00>, Requires<[IsThumb, HasV6M]> { bits<4> imm; @@ -288,7 +289,6 @@ def : tHintAlias<"sev$p", (tHINT 4, pred:$p)>; // A8.6.157 def : tInstAlias<"sevl$p", (tHINT 5, pred:$p)> { let Predicates = [IsThumb2, HasV8]; } -def : T2Pat<(int_arm_sevl), (tHINT 5)>; // The imm operand $val can be used by a debugger to store more information // about the breakpoint. 
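With the tHINT change above, the hint instruction itself now matches [(int_arm_hint imm0_15:$imm)], so the intrinsic lowers through a single pattern and the dedicated T2Pat for int_arm_sevl (hint #5) becomes redundant and is deleted. A small reference table of the architected hint immediates, as a C++ sketch (the function name is ours):

    #include <cstdio>

    const char *hintMnemonic(unsigned imm) {
      switch (imm) {
      case 0:  return "nop";
      case 1:  return "yield";
      case 2:  return "wfe";
      case 3:  return "wfi";
      case 4:  return "sev";
      case 5:  return "sevl";  // the case the removed T2Pat used to handle
      default: return "hint";  // other values stay as plain HINT #imm
      }
    }

    int main() { std::printf("%s\n", hintMnemonic(5)); } // sevl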
@@ -1193,6 +1193,15 @@ def tTST : // A8.6.230 [(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>, Sched<[WriteALU]>; +// A8.8.247 UDF - Undefined (Encoding T1) +def tUDF : TI<(outs), (ins imm0_255:$imm8), IIC_Br, "udf\t$imm8", + [(int_arm_undefined imm0_255:$imm8)]>, Encoding16 { + bits<8> imm8; + let Inst{15-12} = 0b1101; + let Inst{11-8} = 0b1110; + let Inst{7-0} = imm8; +} + // Zero-extend byte def tUXTB : // A8.6.262 T1pIMiscEncode<{0,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm), @@ -1308,6 +1317,18 @@ def : T1Pat<(addc tGPR:$lhs, imm8_255_neg:$rhs), def : T1Pat<(subc tGPR:$lhs, tGPR:$rhs), (tSUBrr tGPR:$lhs, tGPR:$rhs)>; +// Bswap 16 with load/store +def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rrs2:$addr)), (i32 16)), + (tREV16 (tLDRHr t_addrmode_rrs2:$addr))>; +def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)), + (tREV16 (tLDRHi t_addrmode_is2:$addr))>; +def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)), + t_addrmode_rrs2:$addr), + (tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rrs2:$addr)>; +def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)), + t_addrmode_is2:$addr), + (tSTRHi(tREV16 tGPR:$Rn), t_addrmode_is2:$addr)>; + // ConstantPool def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 387bd60..c30d6ab 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -1445,7 +1445,7 @@ defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si, // Store doubleword let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs), - (ins GPR:$Rt, GPR:$Rt2, t2addrmode_imm8s4:$addr), + (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr), IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>; // Indexed stores @@ -1676,7 +1676,7 @@ defm t2PLI : T2Ipl<0, 1, "pli">, Requires<[IsThumb2,HasV7]>; // pci variant is very similar to i12, but supports negative offsets // from the PC. 
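The new T1Pat entries above teach Thumb1 to select a halfword load plus tREV16 for 16-bit byte swaps (and tREV16 plus tSTRH for the store direction). As a reference for what the DAG shape (srl (bswap (extloadi16 addr)), 16) computes, a C++ sketch using the GCC/Clang bswap builtin; the function name is ours:

    #include <cstdint>
    #include <cstring>

    uint16_t loadBswap16(const void *p) {
      uint16_t v;
      std::memcpy(&v, p, sizeof v);           // extloadi16
      uint32_t w = __builtin_bswap32(v);      // bswap of the extended value
      return static_cast<uint16_t>(w >> 16);  // srl ..., 16
    }
    // 0xABCD in memory (little-endian bytes CD AB) comes back as 0xCDAB,
    // exactly what tREV16 produces on the zero-extended result of tLDRH.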
Only PLD and PLI have pci variants (not PLDW) class T2Iplpci inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr), - IIC_Preload, opc, "\t$addr", + IIC_Preload, opc, "\t$addr", [(ARMPreload (ARMWrapper tconstpool:$addr), (i32 0), (i32 inst))]>, Sched<[WritePreLd]> { let Inst{31-25} = 0b1111100; @@ -1918,7 +1918,7 @@ def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi, let DecoderMethod = "DecodeT2MOVTWInstruction"; } -def : t2InstAlias<"mov${p} $Rd, $imm", +def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p)>; def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd), @@ -2407,6 +2407,19 @@ def t2UBFX: T2TwoRegBitFI< let Inst{15} = 0; } +// A8.8.247 UDF - Undefined (Encoding T2) +def t2UDF : T2XI<(outs), (ins imm0_65535:$imm16), IIC_Br, "udf.w\t$imm16", + [(int_arm_undefined imm0_65535:$imm16)]> { + bits<16> imm16; + let Inst{31-29} = 0b111; + let Inst{28-27} = 0b10; + let Inst{26-20} = 0b1111111; + let Inst{19-16} = imm16{15-12}; + let Inst{15} = 0b1; + let Inst{14-12} = 0b010; + let Inst{11-0} = imm16{11-0}; +} + // A8.6.18 BFI - Bitfield insert (Encoding T1) let Constraints = "$src = $Rd" in { def t2BFI : T2TwoRegBitFI<(outs rGPR:$Rd), @@ -3495,8 +3508,8 @@ def t2B : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br, let Inst{25-16} = target{20-11}; let Inst{10-0} = target{10-0}; let DecoderMethod = "DecodeT2BInstruction"; - let AsmMatchConverter = "cvtThumbBranches"; -} + let AsmMatchConverter = "cvtThumbBranches"; +} let isNotDuplicable = 1, isIndirectBranch = 1 in { def t2BR_JT : t2PseudoInst<(outs), @@ -3671,7 +3684,8 @@ def : t2InstAlias<"cps.w $mode", (t2CPS1p imm0_31:$mode), 0>; // A6.3.4 Branches and miscellaneous control // Table A6-14 Change Processor State, and hint instructions -def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm",[]> { +def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm", + [(int_arm_hint imm0_239:$imm)]> { bits<8> imm; let Inst{31-3} = 0b11110011101011111000000000000; let Inst{7-0} = imm; @@ -3698,7 +3712,7 @@ def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> { // Secure Monitor Call is a system instruction. // Option = Inst{19-16} -def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", +def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", []>, Requires<[IsThumb2, HasTrustZone]> { let Inst{31-27} = 0b11110; let Inst{26-20} = 0b1111111; @@ -4278,7 +4292,7 @@ def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm", // Aliases for ADD without the ".w" optional width specifier. 
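The tUDF and t2UDF definitions pin down the permanently-undefined encodings from A8.8.247. A C++ sketch of the resulting bit patterns, derived directly from the let Inst{...} assignments above (helper names are ours):

    #include <cassert>
    #include <cstdint>

    uint16_t encodeUdfT1(uint8_t imm8) {
      // Inst{15-12}=0b1101, Inst{11-8}=0b1110, Inst{7-0}=imm8  ->  0xDExx
      return 0xDE00 | imm8;
    }

    uint32_t encodeUdfT2(uint16_t imm16) {
      // Inst{31-20}=0b111101111111, Inst{19-16}=imm16{15-12},
      // Inst{15-12}=0b1010, Inst{11-0}=imm16{11-0}
      return 0xF7F0A000u | (uint32_t(imm16 & 0xF000) << 4) | (imm16 & 0x0FFFu);
    }

    int main() {
      assert(encodeUdfT1(0x00) == 0xDE00);
      assert(encodeUdfT2(0x0000) == 0xF7F0A000u);
      return 0;
    }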
def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm", - (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, + (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; def : t2InstAlias<"add${p} $Rd, $Rn, $imm", (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>; diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp index 73c6eb7..8821c2d 100644 --- a/lib/Target/ARM/ARMJITInfo.cpp +++ b/lib/Target/ARM/ARMJITInfo.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "jit" #include "ARMJITInfo.h" #include "ARMConstantPoolValue.h" #include "ARMRelocations.h" @@ -25,6 +24,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "jit" + void ARMJITInfo::replaceMachineCodeForFunction(void *Old, void *New) { report_fatal_error("ARMJITInfo::replaceMachineCodeForFunction"); } @@ -319,13 +320,13 @@ void ARMJITInfo::relocate(void *Function, MachineRelocation *MR, break; } case ARM::reloc_arm_movw: { - ResultPtr = ResultPtr & 0xFFFF; + ResultPtr = ResultPtr & 0xFFFF; *((intptr_t*)RelocPos) |= ResultPtr & 0xFFF; *((intptr_t*)RelocPos) |= ((ResultPtr >> 12) & 0xF) << 16; break; } case ARM::reloc_arm_movt: { - ResultPtr = (ResultPtr >> 16) & 0xFFFF; + ResultPtr = (ResultPtr >> 16) & 0xFFFF; *((intptr_t*)RelocPos) |= ResultPtr & 0xFFF; *((intptr_t*)RelocPos) |= ((ResultPtr >> 12) & 0xF) << 16; break; diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 48e0bd7..ee7df54 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -12,13 +12,14 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "arm-ldst-opt" #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" +#include "ARMISelLowering.h" #include "ARMMachineFunctionInfo.h" #include "ARMSubtarget.h" #include "MCTargetDesc/ARMAddressingModes.h" +#include "Thumb1RegisterInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -42,6 +43,8 @@ #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "arm-ldst-opt" + STATISTIC(NumLDMGened , "Number of ldm instructions generated"); STATISTIC(NumSTMGened , "Number of stm instructions generated"); STATISTIC(NumVLDMGened, "Number of vldm instructions generated"); @@ -65,9 +68,10 @@ namespace { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; const ARMSubtarget *STI; + const TargetLowering *TL; ARMFunctionInfo *AFI; RegScavenger *RS; - bool isThumb2; + bool isThumb1, isThumb2; bool runOnMachineFunction(MachineFunction &Fn) override; @@ -93,7 +97,10 @@ namespace { void findUsesOfImpDef(SmallVectorImpl &UsesOfImpDefs, const MemOpQueue &MemOps, unsigned DefReg, unsigned RangeBegin, unsigned RangeEnd); - + void UpdateBaseRegUses(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc dl, unsigned Base, unsigned WordOffset, + ARMCC::CondCodes Pred, unsigned PredReg); bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, int Offset, unsigned Base, bool BaseKill, int Opcode, ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch, @@ -119,7 +126,6 @@ namespace { ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch, MemOpQueue &MemOps, SmallVectorImpl &Merges); - void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps); bool FixInvalidRegPairOp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI); @@ 
-159,6 +165,21 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { case ARM_AM::db: return ARM::STMDB; case ARM_AM::ib: return ARM::STMIB; } + case ARM::tLDRi: + // tLDMIA is writeback-only - unless the base register is in the input + // reglist. + ++NumLDMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::tLDMIA; + } + case ARM::tSTRi: + // There is no non-writeback tSTMIA either. + ++NumSTMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::tSTMIA_UPD; + } case ARM::t2LDRi8: case ARM::t2LDRi12: ++NumLDMGened; @@ -217,6 +238,9 @@ AMSubMode getLoadStoreMultipleSubMode(int Opcode) { case ARM::LDMIA_UPD: case ARM::STMIA: case ARM::STMIA_UPD: + case ARM::tLDMIA: + case ARM::tLDMIA_UPD: + case ARM::tSTMIA_UPD: case ARM::t2LDMIA_RET: case ARM::t2LDMIA: case ARM::t2LDMIA_UPD: @@ -263,12 +287,20 @@ AMSubMode getLoadStoreMultipleSubMode(int Opcode) { } // end namespace ARM_AM } // end namespace llvm +static bool isT1i32Load(unsigned Opc) { + return Opc == ARM::tLDRi; +} + static bool isT2i32Load(unsigned Opc) { return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8; } static bool isi32Load(unsigned Opc) { - return Opc == ARM::LDRi12 || isT2i32Load(Opc); + return Opc == ARM::LDRi12 || isT1i32Load(Opc) || isT2i32Load(Opc) ; +} + +static bool isT1i32Store(unsigned Opc) { + return Opc == ARM::tSTRi; } static bool isT2i32Store(unsigned Opc) { @@ -276,7 +308,102 @@ static bool isT2i32Store(unsigned Opc) { } static bool isi32Store(unsigned Opc) { - return Opc == ARM::STRi12 || isT2i32Store(Opc); + return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc); +} + +static unsigned getImmScale(unsigned Opc) { + switch (Opc) { + default: llvm_unreachable("Unhandled opcode!"); + case ARM::tLDRi: + case ARM::tSTRi: + return 1; + case ARM::tLDRHi: + case ARM::tSTRHi: + return 2; + case ARM::tLDRBi: + case ARM::tSTRBi: + return 4; + } +} + +/// Update future uses of the base register with the offset introduced +/// due to writeback. This function only works on Thumb1. +void +ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc dl, unsigned Base, + unsigned WordOffset, + ARMCC::CondCodes Pred, unsigned PredReg) { + assert(isThumb1 && "Can only update base register uses for Thumb1!"); + + // Start updating any instructions with immediate offsets. Insert a sub before + // the first non-updateable instruction (if any). + for (; MBBI != MBB.end(); ++MBBI) { + if (MBBI->readsRegister(Base)) { + unsigned Opc = MBBI->getOpcode(); + int Offset; + bool InsertSub = false; + + if (Opc == ARM::tLDRi || Opc == ARM::tSTRi || + Opc == ARM::tLDRHi || Opc == ARM::tSTRHi || + Opc == ARM::tLDRBi || Opc == ARM::tSTRBi) { + // Loads and stores with immediate offsets can be updated, but only if + // the new offset isn't negative. + // The MachineOperand containing the offset immediate is the last one + // before predicates. + MachineOperand &MO = + MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3); + // The offsets are scaled by 1, 2 or 4 depending on the Opcode + Offset = MO.getImm() - WordOffset * getImmScale(Opc); + if (Offset >= 0) + MO.setImm(Offset); + else + InsertSub = true; + + } else if (Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) { + // SUB/ADD using this register. Merge it with the update. + // If the merged offset is too large, insert a new sub instead. 
+ MachineOperand &MO = + MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3); + Offset = (Opc == ARM::tSUBi8) ? + MO.getImm() + WordOffset * 4 : + MO.getImm() - WordOffset * 4 ; + if (TL->isLegalAddImmediate(Offset)) { + MO.setImm(Offset); + // The base register has now been reset, so exit early. + return; + } else { + InsertSub = true; + } + + } else { + // Can't update the instruction. + InsertSub = true; + } + + if (InsertSub) { + // An instruction above couldn't be updated, so insert a sub. + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base)) + .addReg(Base, getKillRegState(true)).addImm(WordOffset * 4) + .addImm(Pred).addReg(PredReg); + return; + } + } + + if (MBBI->killsRegister(Base)) + // Register got killed. Stop updating. + return; + } + + // The end of the block was reached. This means register liveness escapes the + // block, and it's necessary to insert a sub before the last instruction. + if (MBB.succ_size() > 0) + // But only insert the SUB if there is actually a successor block. + // FIXME: Check more carefully if register is live at this point, e.g. by + // also examining the successor block's register liveness information. + AddDefaultT1CC(BuildMI(MBB, --MBBI, dl, TII->get(ARM::tSUBi8), Base)) + .addReg(Base, getKillRegState(true)).addImm(WordOffset * 4) + .addImm(Pred).addReg(PredReg); } /// MergeOps - Create and insert a LDM or STM with Base as base register and @@ -296,18 +423,19 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, return false; ARM_AM::AMSubMode Mode = ARM_AM::ia; - // VFP and Thumb2 do not support IB or DA modes. + // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA. bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode); - bool haveIBAndDA = isNotVFP && !isThumb2; - if (Offset == 4 && haveIBAndDA) + bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1; + + if (Offset == 4 && haveIBAndDA) { Mode = ARM_AM::ib; - else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) + } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) { Mode = ARM_AM::da; - else if (Offset == -4 * (int)NumRegs && isNotVFP) + } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) { // VLDM/VSTM do not support DB mode without also updating the base reg. Mode = ARM_AM::db; - else if (Offset != 0) { - // Check if this is a supported opcode before we insert instructions to + } else if (Offset != 0) { + // Check if this is a supported opcode before inserting instructions to // calculate a new base register. if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false; @@ -318,41 +446,98 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, return false; unsigned NewBase; - if (isi32Load(Opcode)) + if (isi32Load(Opcode)) { // If it is a load, then just use one of the destination register to // use as the new base. NewBase = Regs[NumRegs-1].first; - else { + } else { // Use the scratch register to use as a new base. NewBase = Scratch; if (NewBase == 0) return false; } - int BaseOpc = !isThumb2 ? ARM::ADDri : ARM::t2ADDri; + + int BaseOpc = + isThumb2 ? ARM::t2ADDri : + isThumb1 ? ARM::tADDi8 : ARM::ADDri; + if (Offset < 0) { - BaseOpc = !isThumb2 ? ARM::SUBri : ARM::t2SUBri; + BaseOpc = + isThumb2 ? ARM::t2SUBri : + isThumb1 ? ARM::tSUBi8 : ARM::SUBri; Offset = - Offset; } - int ImmedOffset = isThumb2 - ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset); - if (ImmedOffset == -1) - // FIXME: Try t2ADDri12 or t2SUBri12? - return false; // Probably not worth it then. 
- - BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase) - .addReg(Base, getKillRegState(BaseKill)).addImm(Offset) - .addImm(Pred).addReg(PredReg).addReg(0); + + if (!TL->isLegalAddImmediate(Offset)) + // FIXME: Try add with register operand? + return false; // Probably not worth it then. + + if (isThumb1) { + if (Base != NewBase) { + // Need to insert a MOV to the new base first. + // FIXME: If the immediate fits in 3 bits, use ADD instead. + BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase) + .addReg(Base, getKillRegState(BaseKill)) + .addImm(Pred).addReg(PredReg); + } + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)) + .addReg(NewBase, getKillRegState(true)).addImm(Offset) + .addImm(Pred).addReg(PredReg); + } else { + BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase) + .addReg(Base, getKillRegState(BaseKill)).addImm(Offset) + .addImm(Pred).addReg(PredReg).addReg(0); + } + Base = NewBase; - BaseKill = true; // New base is always killed right its use. + BaseKill = true; // New base is always killed straight away. } bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD); + + // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with + // base register writeback. Opcode = getLoadStoreMultipleOpcode(Opcode, Mode); if (!Opcode) return false; - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)) - .addReg(Base, getKillRegState(BaseKill)) - .addImm(Pred).addReg(PredReg); + + bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback. + + // Exception: If the base register is in the input reglist, Thumb1 LDM is + // non-writeback. Check for this. + if (Opcode == ARM::tLDRi && isThumb1) + for (unsigned I = 0; I < NumRegs; ++I) + if (Base == Regs[I].first) { + Writeback = false; + break; + } + + MachineInstrBuilder MIB; + + if (Writeback) { + if (Opcode == ARM::tLDMIA) + // Update tLDMIA with writeback if necessary. + Opcode = ARM::tLDMIA_UPD; + + // The base isn't dead after a merged instruction with writeback. Update + // future uses of the base with the added offset (if possible), or reset + // the base register as necessary. + if (!BaseKill) + UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg); + + MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)); + + // Thumb1: we might need to set base writeback when building the MI. + MIB.addReg(Base, getDefRegState(true)) + .addReg(Base, getKillRegState(BaseKill)); + } else { + // No writeback, simply build the MachineInstr. + MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)); + MIB.addReg(Base, getKillRegState(BaseKill)); + } + + MIB.addImm(Pred).addReg(PredReg); + for (unsigned i = 0; i != NumRegs; ++i) MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef) | getKillRegState(Regs[i].second)); @@ -492,7 +677,7 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB, // affected uses. 
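The writeback rule being implemented above, stated on its own: Thumb1 LDM/STM always write the base register back, with the single exception of an LDM whose register list includes the base (the base is then reloaded from memory instead). A C++ sketch of that decision (names are ours):

    #include <initializer_list>

    bool thumb1LdmWritesBack(unsigned base,
                             std::initializer_list<unsigned> regs) {
      for (unsigned r : regs)
        if (r == base)
          return false;  // base is in the reglist: emit tLDMIA, no writeback
      return true;       // otherwise the merged load becomes tLDMIA_UPD
    }

    int main() {
      return thumb1LdmWritesBack(1, {2, 3}) ? 0 : 1;  // r1 not listed: writeback
    }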
for (SmallVectorImpl::iterator I = UsesOfImpDefs.begin(), E = UsesOfImpDefs.end(); - I != E; ++I) + I != E; ++I) (*I)->setIsUndef(); for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { @@ -589,7 +774,6 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1; MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset, Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges); - return; } static bool definesCPSR(MachineInstr *MI) { @@ -616,6 +800,7 @@ static bool isMatchingDecrement(MachineInstr *MI, unsigned Base, bool CheckCPSRDef = false; switch (MI->getOpcode()) { default: return false; + case ARM::tSUBi8: case ARM::t2SUBri: case ARM::SUBri: CheckCPSRDef = true; @@ -628,10 +813,11 @@ static bool isMatchingDecrement(MachineInstr *MI, unsigned Base, if (Bytes == 0 || (Limit && Bytes >= Limit)) return false; - unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME + unsigned Scale = (MI->getOpcode() == ARM::tSUBspi || + MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME if (!(MI->getOperand(0).getReg() == Base && MI->getOperand(1).getReg() == Base && - (MI->getOperand(2).getImm()*Scale) == Bytes && + (MI->getOperand(2).getImm() * Scale) == Bytes && getInstrPredicate(MI, MyPredReg) == Pred && MyPredReg == PredReg)) return false; @@ -649,6 +835,7 @@ static bool isMatchingIncrement(MachineInstr *MI, unsigned Base, bool CheckCPSRDef = false; switch (MI->getOpcode()) { default: return false; + case ARM::tADDi8: case ARM::t2ADDri: case ARM::ADDri: CheckCPSRDef = true; @@ -661,10 +848,11 @@ static bool isMatchingIncrement(MachineInstr *MI, unsigned Base, // Make sure the offset fits in 8 bits. return false; - unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME + unsigned Scale = (MI->getOpcode() == ARM::tADDspi || + MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME if (!(MI->getOperand(0).getReg() == Base && MI->getOperand(1).getReg() == Base && - (MI->getOperand(2).getImm()*Scale) == Bytes && + (MI->getOperand(2).getImm() * Scale) == Bytes && getInstrPredicate(MI, MyPredReg) == Pred && MyPredReg == PredReg)) return false; @@ -677,6 +865,8 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) { default: return 0; case ARM::LDRi12: case ARM::STRi12: + case ARM::tLDRi: + case ARM::tSTRi: case ARM::t2LDRi8: case ARM::t2LDRi12: case ARM::t2STRi8: @@ -695,6 +885,9 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) { case ARM::STMDA: case ARM::STMDB: case ARM::STMIB: + case ARM::tLDMIA: + case ARM::tLDMIA_UPD: + case ARM::tSTMIA_UPD: case ARM::t2LDMIA: case ARM::t2LDMDB: case ARM::t2STMIA: @@ -791,6 +984,9 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool &Advance, MachineBasicBlock::iterator &I) { + // Thumb1 is already using updating loads/stores. + if (isThumb1) return false; + MachineInstr *MI = MBBI; unsigned Base = MI->getOperand(0).getReg(); bool BaseKill = MI->getOperand(0).isKill(); @@ -927,6 +1123,10 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, const TargetInstrInfo *TII, bool &Advance, MachineBasicBlock::iterator &I) { + // Thumb1 doesn't have updating LDR/STR. + // FIXME: Use LDM/STM with single register instead. 
+ if (isThumb1) return false; + MachineInstr *MI = MBBI; unsigned Base = MI->getOperand(1).getReg(); bool BaseKill = MI->getOperand(1).isKill(); @@ -1002,7 +1202,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, return false; if (isAM5) { - // VLDM[SD}_UPD, VSTM[SD]_UPD + // VLDM[SD]_UPD, VSTM[SD]_UPD // (There are no base-updating versions of VLDR/VSTR instructions, but the // updating load/store-multiple instructions can be used with only one // register.) @@ -1100,6 +1300,8 @@ static bool isMemoryOp(const MachineInstr *MI) { return MI->getOperand(1).isReg(); case ARM::LDRi12: case ARM::STRi12: + case ARM::tLDRi: + case ARM::tSTRi: case ARM::t2LDRi8: case ARM::t2LDRi12: case ARM::t2STRi8: @@ -1137,6 +1339,10 @@ static int getMemoryOpOffset(const MachineInstr *MI) { Opcode == ARM::LDRi12 || Opcode == ARM::STRi12) return OffField; + // Thumb1 immediate offsets are scaled by 4 + if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi) + return OffField * 4; + int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField) : ARM_AM::getAM5Offset(OffField) * 4; if (isAM3) { @@ -1408,16 +1614,20 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { if (MBBI == E) // Reach the end of the block, try merging the memory instructions. TryMerge = true; - } else { TryMerge = true; + } if (TryMerge) { if (NumMemOps > 1) { // Try to find a free register to use as a new base in case it's needed. // First advance to the instruction just before the start of the chain. AdvanceRS(MBB, MemOps); + // Find a scratch register. - unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass); + unsigned Scratch = + RS->FindUnusedReg(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass); + // Process the load / store instructions. RS->forward(std::prev(MBBI)); @@ -1483,6 +1693,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { /// => /// ldmfd sp!, {..., pc} bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { + // Thumb1 LDM doesn't allow high registers. + if (isThumb1) return false; if (MBB.empty()) return false; MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); @@ -1513,12 +1725,14 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { const TargetMachine &TM = Fn.getTarget(); + TL = TM.getTargetLowering(); AFI = Fn.getInfo<ARMFunctionInfo>(); TII = TM.getInstrInfo(); TRI = TM.getRegisterInfo(); STI = &TM.getSubtarget<ARMSubtarget>(); RS = new RegScavenger(); isThumb2 = AFI->isThumb2Function(); + isThumb1 = AFI->isThumbFunction() && !isThumb2; bool Modified = false; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; @@ -1666,11 +1880,11 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD unsigned Scale = 1; unsigned Opcode = Op0->getOpcode(); - if (Opcode == ARM::LDRi12) + if (Opcode == ARM::LDRi12) { NewOpc = ARM::LDRD; - else if (Opcode == ARM::STRi12) + } else if (Opcode == ARM::STRi12) { NewOpc = ARM::STRD; - else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) { + } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) { NewOpc = ARM::t2LDRDi8; Scale = 4; isT2 = true; @@ -1678,8 +1892,9 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, NewOpc = ARM::t2STRDi8; Scale = 4; isT2 = true; - } else + } else { return false; + } // Make sure the base address satisfies i64 ld / st alignment requirement.
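The new getMemoryOpOffset case above leans on an ARMv6-M encoding detail: the immediate field of tLDRi/tSTRi counts 4-byte words rather than bytes. As a sketch (assumed field semantics, helper name ours):

// Sketch: convert a Thumb1 LDR/STR immediate field to a byte offset.
static int thumb1ByteOffset(unsigned OffField) {
  return static_cast<int>(OffField) * 4; // the field counts words
}
// thumb1ByteOffset(3) == 12, matching "return OffField * 4;" above.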
// At the moment, we ignore the memoryoperand's value. @@ -1746,8 +1961,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, while (Ops.size() > 1) { unsigned FirstLoc = ~0U; unsigned LastLoc = 0; - MachineInstr *FirstOp = 0; - MachineInstr *LastOp = 0; + MachineInstr *FirstOp = nullptr; + MachineInstr *LastOp = nullptr; int LastOffset = 0; unsigned LastOpcode = 0; unsigned LastBytes = 0; diff --git a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp index 20619fa..2a49255 100644 --- a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp +++ b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp @@ -8,8 +8,6 @@ // //===------------------------------------------------------------------------------------------===// -#define DEBUG_TYPE "double barriers" - #include "ARM.h" #include "ARMMachineFunctionInfo.h" #include "ARMInstrInfo.h" @@ -17,6 +15,8 @@ #include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; +#define DEBUG_TYPE "double barriers" + STATISTIC(NumDMBsRemoved, "Number of DMBs removed"); namespace { @@ -25,9 +25,9 @@ public: static char ID; ARMOptimizeBarriersPass() : MachineFunctionPass(ID) {} - virtual bool runOnMachineFunction(MachineFunction &Fn); + bool runOnMachineFunction(MachineFunction &Fn) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "optimise barriers pass"; } diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index 7f0fe05..b290e7f 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -116,13 +116,13 @@ def D15 : ARMReg<15, "d15", [S30, S31]>, DwarfRegNum<[271]>; } // VFP3 defines 16 additional double registers -def D16 : ARMFReg<16, "d16">, DwarfRegNum<[272]>; +def D16 : ARMFReg<16, "d16">, DwarfRegNum<[272]>; def D17 : ARMFReg<17, "d17">, DwarfRegNum<[273]>; def D18 : ARMFReg<18, "d18">, DwarfRegNum<[274]>; def D19 : ARMFReg<19, "d19">, DwarfRegNum<[275]>; def D20 : ARMFReg<20, "d20">, DwarfRegNum<[276]>; def D21 : ARMFReg<21, "d21">, DwarfRegNum<[277]>; -def D22 : ARMFReg<22, "d22">, DwarfRegNum<[278]>; +def D22 : ARMFReg<22, "d22">, DwarfRegNum<[278]>; def D23 : ARMFReg<23, "d23">, DwarfRegNum<[279]>; def D24 : ARMFReg<24, "d24">, DwarfRegNum<[280]>; def D25 : ARMFReg<25, "d25">, DwarfRegNum<[281]>; @@ -158,11 +158,11 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>; // Current Program Status Register. // We model fpscr with two registers: FPSCR models the control bits and will be // reserved. FPSCR_NZCV models the flag bits and will be unreserved. APSR_NZCV -// models the APSR when it's accessed by some special instructions. In such cases +// models the APSR when it's accessed by some special instructions. In such cases // it has the same encoding as PC. 
def CPSR : ARMReg<0, "cpsr">; def APSR : ARMReg<1, "apsr">; -def APSR_NZCV : ARMReg<15, "apsr_nzcv">; +def APSR_NZCV : ARMReg<15, "apsr_nzcv">; def SPSR : ARMReg<2, "spsr">; def FPSCR : ARMReg<3, "fpscr">; def FPSCR_NZCV : ARMReg<3, "fpscr_nzcv"> { diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td index 0ace9bc..57d0bfb 100644 --- a/lib/Target/ARM/ARMScheduleV6.td +++ b/lib/Target/ARM/ARMScheduleV6.td @@ -93,7 +93,7 @@ def ARMV6Itineraries : ProcessorItineraries< InstrItinData], [5, 1, 1, 2]>, InstrItinData], [6, 1, 1]>, InstrItinData], [6, 1, 1, 2]>, - + // Integer load pipeline // // Immediate offset @@ -181,7 +181,7 @@ def ARMV6Itineraries : ProcessorItineraries< // // Store multiple + update InstrItinData], [2]>, - + // Branch // // no delay slots, so the latency of a branch is unimportant diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp index ba3cf4d..008ad64 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -11,12 +11,13 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "arm-selectiondag-info" #include "ARMTargetMachine.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/DerivedTypes.h" using namespace llvm; +#define DEBUG_TYPE "arm-selectiondag-info" + ARMSelectionDAGInfo::ARMSelectionDAGInfo(const TargetMachine &TM) : TargetSelectionDAGInfo(TM), Subtarget(&TM.getSubtarget()) { @@ -52,9 +53,10 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, EVT VT = MVT::i32; unsigned VTSize = 4; unsigned i = 0; - const unsigned MAX_LOADS_IN_LDM = 6; - SDValue TFOps[MAX_LOADS_IN_LDM]; - SDValue Loads[MAX_LOADS_IN_LDM]; + // Emit a maximum of 4 loads in Thumb1 since we have fewer registers + const unsigned MAX_LOADS_IN_LDM = Subtarget->isThumb1Only() ? 
4 : 6; + SDValue TFOps[6]; + SDValue Loads[6]; uint64_t SrcOff = 0, DstOff = 0; // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the @@ -71,7 +73,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, TFOps[i] = Loads[i].getValue(1); SrcOff += VTSize; } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + makeArrayRef(TFOps, i)); for (i = 0; i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { @@ -82,7 +85,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, isVolatile, false, 0); DstOff += VTSize; } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + makeArrayRef(TFOps, i)); EmittedNumMemOps += i; } @@ -112,7 +116,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SrcOff += VTSize; BytesLeft -= VTSize; } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + makeArrayRef(TFOps, i)); i = 0; BytesLeft = BytesLeftSave; @@ -133,7 +138,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, DstOff += VTSize; BytesLeft -= VTSize; } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + makeArrayRef(TFOps, i)); } // Adjust parameters for memset, EABI uses format (ptr, size, value), @@ -146,7 +152,8 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, unsigned Align, bool isVolatile, MachinePointerInfo DstPtrInfo) const { // Use default for non-AAPCS (or MachO) subtargets - if (!Subtarget->isAAPCS_ABI() || Subtarget->isTargetMachO()) + if (!Subtarget->isAAPCS_ABI() || Subtarget->isTargetMachO() || + Subtarget->isTargetWindows()) return SDValue(); const ARMTargetLowering &TLI = @@ -179,22 +186,14 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, Args.push_back(Entry); // Emit __eabi_memset call - TargetLowering::CallLoweringInfo CLI(Chain, - Type::getVoidTy(*DAG.getContext()), // return type - false, // return sign ext - false, // return zero ext - false, // is var arg - false, // is in regs - 0, // number of fixed arguments - TLI.getLibcallCallingConv(RTLIB::MEMSET), // call conv - false, // is tail call - false, // does not return - false, // is return val used - DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET), - TLI.getPointerTy()), // callee - Args, DAG, dl); - std::pair CallResult = - TLI.LowerCallTo(CLI); - + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMSET), + Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET), + TLI.getPointerTy()), &Args, 0) + .setDiscardResult(); + + std::pair CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 73e2018..5b204f6 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -21,12 +21,14 @@ #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetOptions.h" +using namespace llvm; + +#define DEBUG_TYPE "arm-subtarget" + #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "ARMGenSubtargetInfo.inc" -using namespace llvm; - static cl::opt ReserveR9("arm-reserve-r9", cl::Hidden, cl::desc("Reserve R9, making it unavailable as GPR")); diff 
--git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 3855419..38536b2 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -31,7 +31,7 @@ class TargetOptions; class ARMSubtarget : public ARMGenSubtargetInfo { protected: enum ARMProcFamilyEnum { - Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15, + Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15, CortexR5, Swift, CortexA53, CortexA57, Krait }; enum ARMProcClassEnum { @@ -242,9 +242,7 @@ protected: /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. unsigned getMaxInlineSizeThreshold() const { - // FIXME: For now, we don't lower memcpy's to loads / stores for Thumb1. - // Change this once Thumb1 ldmia / stmia support is added. - return isThumb1Only() ? 0 : 64; + return 64; } /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. @@ -396,7 +394,7 @@ public: bool isLittle() const { return IsLittle; } unsigned getMispredictionPenalty() const; - + /// This function returns true if the target has sincos() routine in its /// compiler runtime or math libraries. bool hasSinCos() const; diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 4ae539a..8876227 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -228,7 +228,7 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) { bool ARMPassConfig::addPreISel() { const ARMSubtarget *Subtarget = &getARMSubtarget(); if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) - addPass(createARMAtomicExpandPass(TM)); + addPass(createAtomicExpandLoadLinkedPass(TM)); if (TM->getOptLevel() != CodeGenOpt::None) addPass(createGlobalMergePass(TM)); @@ -247,8 +247,7 @@ bool ARMPassConfig::addInstSelector() { } bool ARMPassConfig::addPreRegAlloc() { - // FIXME: temporarily disabling load / store optimization pass for Thumb1. - if (getOptLevel() != CodeGenOpt::None && !getARMSubtarget().isThumb1Only()) + if (getOptLevel() != CodeGenOpt::None) addPass(createARMLoadStoreOptimizationPass(true)); if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA9()) addPass(createMLxExpansionPass()); @@ -262,12 +261,10 @@ bool ARMPassConfig::addPreRegAlloc() { } bool ARMPassConfig::addPreSched2() { - // FIXME: temporarily disabling load / store optimization pass for Thumb1. if (getOptLevel() != CodeGenOpt::None) { - if (!getARMSubtarget().isThumb1Only()) { - addPass(createARMLoadStoreOptimizationPass()); - printAndVerify("After ARM load / store optimizer"); - } + addPass(createARMLoadStoreOptimizationPass()); + printAndVerify("After ARM load / store optimizer"); + if (getARMSubtarget().hasNEON()) addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass)); } diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index 0c80a95..664c992 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -23,7 +23,6 @@ #include "Thumb1FrameLowering.h" #include "Thumb1InstrInfo.h" #include "Thumb2InstrInfo.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Target/TargetMachine.h" @@ -102,7 +101,7 @@ class ARMTargetMachine : public ARMBaseTargetMachine { /// ARMLETargetMachine - ARM little endian target machine. 
/// class ARMLETargetMachine : public ARMTargetMachine { - virtual void anchor(); + void anchor() override; public: ARMLETargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, @@ -113,7 +112,7 @@ public: /// ARMBETargetMachine - ARM big endian target machine. /// class ARMBETargetMachine : public ARMTargetMachine { - virtual void anchor(); + void anchor() override; public: ARMBETargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, @@ -128,12 +127,12 @@ public: class ThumbTargetMachine : public ARMBaseTargetMachine { virtual void anchor(); // Either Thumb1InstrInfo or Thumb2InstrInfo. - OwningPtr InstrInfo; + std::unique_ptr InstrInfo; const DataLayout DL; // Calculates type size & alignment ARMTargetLowering TLInfo; ARMSelectionDAGInfo TSInfo; // Either Thumb1FrameLowering or ARMFrameLowering. - OwningPtr FrameLowering; + std::unique_ptr FrameLowering; public: ThumbTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, @@ -169,7 +168,7 @@ public: /// ThumbLETargetMachine - Thumb little endian target machine. /// class ThumbLETargetMachine : public ThumbTargetMachine { - virtual void anchor(); + void anchor() override; public: ThumbLETargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, @@ -180,10 +179,10 @@ public: /// ThumbBETargetMachine - Thumb big endian target machine. /// class ThumbBETargetMachine : public ThumbTargetMachine { - virtual void anchor(); + void anchor() override; public: - ThumbBETargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, + ThumbBETargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp index 3379f85..48238bf 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -11,6 +11,7 @@ #include "ARMSubtarget.h" #include "llvm/ADT/StringExtras.h" #include "llvm/IR/Mangler.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionELF.h" @@ -31,7 +32,7 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, InitializeELF(isAAPCS_ABI); if (isAAPCS_ABI) { - LSDASection = NULL; + LSDASection = nullptr; } AttributesSection = @@ -45,6 +46,10 @@ const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference( const GlobalValue *GV, unsigned Encoding, Mangler &Mang, const TargetMachine &TM, MachineModuleInfo *MMI, MCStreamer &Streamer) const { + if (TM.getMCAsmInfo()->getExceptionHandlingType() != ExceptionHandling::ARM) + return TargetLoweringObjectFileELF::getTTypeGlobalReference( + GV, Encoding, Mang, TM, MMI, Streamer); + assert(Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only"); return MCSymbolRefExpr::Create(TM.getSymbol(GV, Mang), diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h index 5f8d612..c926421 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.h +++ b/lib/Target/ARM/ARMTargetObjectFile.h @@ -23,7 +23,7 @@ protected: public: ARMElfTargetObjectFile() : TargetLoweringObjectFileELF(), - AttributesSection(NULL) + AttributesSection(nullptr) {} void Initialize(MCContext &Ctx, const TargetMachine &TM) override; diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp 
b/lib/Target/ARM/ARMTargetTransformInfo.cpp index d3b43cd..57df7da 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -14,7 +14,6 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "armtti" #include "ARM.h" #include "ARMTargetMachine.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -23,8 +22,10 @@ #include "llvm/Target/TargetLowering.h" using namespace llvm; +#define DEBUG_TYPE "armtti" + // Declare the pass initialization routine locally as target-specific passes -// don't havve a target-wide initialization entry point, and so we rely on the +// don't have a target-wide initialization entry point, and so we rely on the // pass constructor initialization. namespace llvm { void initializeARMTTIPass(PassRegistry &); @@ -42,7 +43,7 @@ class ARMTTI final : public ImmutablePass, public TargetTransformInfo { unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; public: - ARMTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { + ARMTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { llvm_unreachable("This pass cannot be directly constructed"); } diff --git a/lib/Target/ARM/Android.mk b/lib/Target/ARM/Android.mk index 4be95aa..095955b 100644 --- a/lib/Target/ARM/Android.mk +++ b/lib/Target/ARM/Android.mk @@ -17,7 +17,6 @@ arm_codegen_TBLGEN_TABLES := \ arm_codegen_SRC_FILES := \ A15SDOptimizer.cpp \ ARMAsmPrinter.cpp \ - ARMAtomicExpandPass.cpp \ ARMBaseInstrInfo.cpp \ ARMBaseRegisterInfo.cpp \ ARMCodeEmitter.cpp \ diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 9c57a24..5cdf394 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -13,7 +13,6 @@ #include "MCTargetDesc/ARMArchName.h" #include "MCTargetDesc/ARMBaseInfo.h" #include "MCTargetDesc/ARMMCExpr.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" @@ -23,9 +22,7 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler.h" -#include "llvm/MC/MCELF.h" #include "llvm/MC/MCELFStreamer.h" -#include "llvm/MC/MCELFSymbolFlags.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" @@ -345,7 +342,8 @@ public: }; ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, - const MCInstrInfo &MII) + const MCInstrInfo &MII, + const MCTargetOptions &Options) : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(MII), UC(_Parser) { MCAsmParserExtension::Initialize(_Parser); @@ -416,7 +414,7 @@ class ARMOperand : public MCParsedAsmOperand { k_Token } Kind; - SMLoc StartLoc, EndLoc; + SMLoc StartLoc, EndLoc, AlignmentLoc; SmallVector Registers; struct CCOp { @@ -633,6 +631,12 @@ public: /// operand. SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + /// getAlignmentLoc - Get the location of the Alignment token of this operand. 
+ SMLoc getAlignmentLoc() const { + assert(Kind == k_Memory && "Invalid access!"); + return AlignmentLoc; + } + ARMCC::CondCodes getCondCode() const { assert(Kind == k_CondCode && "Invalid access!"); return CC.Val; @@ -1089,12 +1093,12 @@ public: bool isPostIdxReg() const { return Kind == k_PostIndexRegister && PostIdxReg.ShiftTy ==ARM_AM::no_shift; } - bool isMemNoOffset(bool alignOK = false) const { + bool isMemNoOffset(bool alignOK = false, unsigned Alignment = 0) const { if (!isMem()) return false; // No offset of any kind. - return Memory.OffsetRegNum == 0 && Memory.OffsetImm == 0 && - (alignOK || Memory.Alignment == 0); + return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr && + (alignOK || Memory.Alignment == Alignment); } bool isMemPCRelImm12() const { if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0) @@ -1110,6 +1114,65 @@ public: bool isAlignedMemory() const { return isMemNoOffset(true); } + bool isAlignedMemoryNone() const { + return isMemNoOffset(false, 0); + } + bool isDupAlignedMemoryNone() const { + return isMemNoOffset(false, 0); + } + bool isAlignedMemory16() const { + if (isMemNoOffset(false, 2)) // alignment in bytes for 16-bits is 2. + return true; + return isMemNoOffset(false, 0); + } + bool isDupAlignedMemory16() const { + if (isMemNoOffset(false, 2)) // alignment in bytes for 16-bits is 2. + return true; + return isMemNoOffset(false, 0); + } + bool isAlignedMemory32() const { + if (isMemNoOffset(false, 4)) // alignment in bytes for 32-bits is 4. + return true; + return isMemNoOffset(false, 0); + } + bool isDupAlignedMemory32() const { + if (isMemNoOffset(false, 4)) // alignment in bytes for 32-bits is 4. + return true; + return isMemNoOffset(false, 0); + } + bool isAlignedMemory64() const { + if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8. + return true; + return isMemNoOffset(false, 0); + } + bool isDupAlignedMemory64() const { + if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8. + return true; + return isMemNoOffset(false, 0); + } + bool isAlignedMemory64or128() const { + if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8. + return true; + if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16. + return true; + return isMemNoOffset(false, 0); + } + bool isDupAlignedMemory64or128() const { + if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8. + return true; + if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16. + return true; + return isMemNoOffset(false, 0); + } + bool isAlignedMemory64or128or256() const { + if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8. + return true; + if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16. + return true; + if (isMemNoOffset(false, 32)) // alignment in bytes for 256-bits is 32. + return true; + return isMemNoOffset(false, 0); + } bool isAddrMode2() const { if (!isMem() || Memory.Alignment != 0) return false; // Check for register offset. @@ -1545,7 +1608,10 @@ public: } bool isNEONi16splat() const { - if (!isImm()) return false; + if (isNEONByteReplicate(2)) + return false; // Leave that for bytes replication and forbid by default. + if (!isImm()) + return false; const MCConstantExpr *CE = dyn_cast(getImm()); // Must be a constant. if (!CE) return false; @@ -1555,7 +1621,10 @@ public: } bool isNEONi32splat() const { - if (!isImm()) return false; + if (isNEONByteReplicate(4)) + return false; // Leave that for bytes replication and forbid by default. 
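Each of the isAlignedMemory*/isDupAlignedMemory* predicates above follows the same shape: accept the operand when the alignment is omitted, or when it is one of a fixed set of byte alignments. Condensed into one hypothetical helper:

#include <initializer_list>

// Sketch of the shared pattern: alignment 0 (omitted) always matches,
// otherwise it must equal one of the allowed byte alignments.
static bool alignmentMatches(unsigned Align,
                             std::initializer_list<unsigned> Allowed) {
  if (Align == 0)
    return true;
  for (unsigned A : Allowed)
    if (Align == A)
      return true;
  return false;
}
// alignmentMatches(Memory.Alignment, {8, 16}) mirrors isAlignedMemory64or128().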
+ if (!isImm()) + return false; const MCConstantExpr *CE = dyn_cast(getImm()); // Must be a constant. if (!CE) return false; @@ -1567,11 +1636,36 @@ public: (Value >= 0x01000000 && Value <= 0xff000000); } + bool isNEONByteReplicate(unsigned NumBytes) const { + if (!isImm()) + return false; + const MCConstantExpr *CE = dyn_cast(getImm()); + // Must be a constant. + if (!CE) + return false; + int64_t Value = CE->getValue(); + if (!Value) + return false; // Don't bother with zero. + + unsigned char B = Value & 0xff; + for (unsigned i = 1; i < NumBytes; ++i) { + Value >>= 8; + if ((Value & 0xff) != B) + return false; + } + return true; + } + bool isNEONi16ByteReplicate() const { return isNEONByteReplicate(2); } + bool isNEONi32ByteReplicate() const { return isNEONByteReplicate(4); } bool isNEONi32vmov() const { - if (!isImm()) return false; + if (isNEONByteReplicate(4)) + return false; // Let it to be classified as byte-replicate case. + if (!isImm()) + return false; const MCConstantExpr *CE = dyn_cast(getImm()); // Must be a constant. - if (!CE) return false; + if (!CE) + return false; int64_t Value = CE->getValue(); // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X, // for VMOV/VMVN only, 00Xf or 0Xff are also accepted. @@ -1612,7 +1706,7 @@ public: void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediates when possible. Null MCExpr = 0. - if (Expr == 0) + if (!Expr) Inst.addOperand(MCOperand::CreateImm(0)); else if (const MCConstantExpr *CE = dyn_cast(Expr)) Inst.addOperand(MCOperand::CreateImm(CE->getValue())); @@ -1926,6 +2020,50 @@ public: Inst.addOperand(MCOperand::CreateImm(Memory.Alignment)); } + void addDupAlignedMemoryNoneOperands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addAlignedMemoryNoneOperands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addAlignedMemory16Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addDupAlignedMemory16Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addAlignedMemory32Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addDupAlignedMemory32Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addAlignedMemory64Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addDupAlignedMemory64Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addAlignedMemory64or128Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addDupAlignedMemory64or128Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + + void addAlignedMemory64or128or256Operands(MCInst &Inst, unsigned N) const { + addAlignedMemoryOperands(Inst, N); + } + void addAddrMode2Operands(MCInst &Inst, unsigned N) const { assert(N == 3 && "Invalid number of operands!"); int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0; @@ -2275,6 +2413,19 @@ public: Inst.addOperand(MCOperand::CreateImm(Value)); } + void addNEONinvByteReplicateOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The immediate encodes the type of constant as well as the value. 
+ const MCConstantExpr *CE = dyn_cast(getImm()); + unsigned Value = CE->getValue(); + assert((Inst.getOpcode() == ARM::VMOVv8i8 || + Inst.getOpcode() == ARM::VMOVv16i8) && + "All vmvn instructions that wants to replicate non-zero byte " + "always must be replaced with VMOVv8i8 or VMOVv16i8."); + unsigned B = ((~Value) & 0xff); + B |= 0xe00; // cmode = 0b1110 + Inst.addOperand(MCOperand::CreateImm(B)); + } void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. @@ -2289,6 +2440,19 @@ public: Inst.addOperand(MCOperand::CreateImm(Value)); } + void addNEONvmovByteReplicateOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The immediate encodes the type of constant as well as the value. + const MCConstantExpr *CE = dyn_cast(getImm()); + unsigned Value = CE->getValue(); + assert((Inst.getOpcode() == ARM::VMOVv8i8 || + Inst.getOpcode() == ARM::VMOVv16i8) && + "All instructions that wants to replicate non-zero byte " + "always must be replaced with VMOVv8i8 or VMOVv16i8."); + unsigned B = Value & 0xff; + B |= 0xe00; // cmode = 0b1110 + Inst.addOperand(MCOperand::CreateImm(B)); + } void addNEONi32vmovNegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate encodes the type of constant as well as the value. @@ -2523,7 +2687,8 @@ public: unsigned ShiftImm, unsigned Alignment, bool isNegative, - SMLoc S, SMLoc E) { + SMLoc S, SMLoc E, + SMLoc AlignmentLoc = SMLoc()) { ARMOperand *Op = new ARMOperand(k_Memory); Op->Memory.BaseRegNum = BaseRegNum; Op->Memory.OffsetImm = OffsetImm; @@ -2534,6 +2699,7 @@ public: Op->Memory.isNegative = isNegative; Op->StartLoc = S; Op->EndLoc = E; + Op->AlignmentLoc = AlignmentLoc; return Op; } @@ -2806,7 +2972,7 @@ int ARMAsmParser::tryParseShiftRegister( // The source register for the shift has already been added to the // operand list, so we need to pop it off and combine it into the shifted // register operand instead. - OwningPtr PrevOp((ARMOperand*)Operands.pop_back_val()); + std::unique_ptr PrevOp((ARMOperand*)Operands.pop_back_val()); if (!PrevOp->isReg()) return Error(PrevOp->getStartLoc(), "shift must be of a register"); int SrcReg = PrevOp->getReg(); @@ -2825,7 +2991,7 @@ int ARMAsmParser::tryParseShiftRegister( Parser.getTok().is(AsmToken::Dollar)) { Parser.Lex(); // Eat hash. SMLoc ImmLoc = Parser.getTok().getLoc(); - const MCExpr *ShiftExpr = 0; + const MCExpr *ShiftExpr = nullptr; if (getParser().parseExpression(ShiftExpr, EndLoc)) { Error(ImmLoc, "invalid immediate shift value"); return -1; @@ -2855,12 +3021,12 @@ int ARMAsmParser::tryParseShiftRegister( EndLoc = Parser.getTok().getEndLoc(); ShiftReg = tryParseRegister(); if (ShiftReg == -1) { - Error (L, "expected immediate or register in shift operand"); + Error(L, "expected immediate or register in shift operand"); return -1; } } else { - Error (Parser.getTok().getLoc(), - "expected immediate or register in shift operand"); + Error(Parser.getTok().getLoc(), + "expected immediate or register in shift operand"); return -1; } } @@ -4323,8 +4489,9 @@ parseMemory(SmallVectorImpl &Operands) { E = Tok.getEndLoc(); Parser.Lex(); // Eat right bracket token. 
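Two pieces of the byte-replication support above are worth restating side by side: the recognizer walks the value byte by byte, and the operand writer packs the byte together with VMOV's cmode field. Both sketches below are standalone restatements under the same logic, not patch code:

#include <cstdint>

// Restates isNEONByteReplicate: true iff the low NumBytes bytes of Value
// are all the same non-zero byte (0x41414141 passes for NumBytes == 4,
// 0x41414100 does not).
static bool isByteReplicated(int64_t Value, unsigned NumBytes) {
  if (!Value)
    return false; // zero is handled by other immediate forms
  unsigned char B = Value & 0xff;
  for (unsigned i = 1; i < NumBytes; ++i) {
    Value >>= 8;
    if ((Value & 0xff) != B)
      return false;
  }
  return true;
}

// Restates the operand packing: the low 8 bits carry the byte and 0xe00
// sets cmode = 0b1110 in bits [11:8]; the VMVN form complements the byte
// first, as in addNEONinvByteReplicateOperands above.
static unsigned encodeByteReplicateImm(uint8_t Byte, bool Inverted) {
  unsigned B = Inverted ? static_cast<uint8_t>(~Byte) : Byte;
  return B | 0xe00; // encodeByteReplicateImm(0x41, false) == 0xe41
}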
- Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, 0, ARM_AM::no_shift, - 0, 0, false, S, E)); + Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, 0, + ARM_AM::no_shift, 0, 0, false, + S, E)); // If there's a pre-indexing writeback marker, '!', just add it as a token // operand. It's rather odd, but syntactically valid. @@ -4346,6 +4513,7 @@ parseMemory(SmallVectorImpl &Operands) { if (Parser.getTok().is(AsmToken::Colon)) { Parser.Lex(); // Eat the ':'. E = Parser.getTok().getLoc(); + SMLoc AlignmentLoc = Tok.getLoc(); const MCExpr *Expr; if (getParser().parseExpression(Expr)) @@ -4378,9 +4546,9 @@ parseMemory(SmallVectorImpl &Operands) { // Don't worry about range checking the value here. That's handled by // the is*() predicates. - Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, 0, + Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, 0, ARM_AM::no_shift, 0, Align, - false, S, E)); + false, S, E, AlignmentLoc)); // If there's a pre-indexing writeback marker, '!', just add it as a token // operand. @@ -4471,7 +4639,7 @@ parseMemory(SmallVectorImpl &Operands) { E = Parser.getTok().getEndLoc(); Parser.Lex(); // Eat right bracket token. - Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, OffsetRegNum, + Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, OffsetRegNum, ShiftType, ShiftImm, 0, isNegative, S, E)); @@ -4926,8 +5094,9 @@ getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst, if (Mnemonic == "bkpt" || Mnemonic == "cbnz" || Mnemonic == "setend" || Mnemonic == "cps" || Mnemonic == "it" || Mnemonic == "cbz" || - Mnemonic == "trap" || Mnemonic == "hlt" || Mnemonic.startswith("crc32") || - Mnemonic.startswith("cps") || Mnemonic.startswith("vsel") || + Mnemonic == "trap" || Mnemonic == "hlt" || Mnemonic == "udf" || + Mnemonic.startswith("crc32") || Mnemonic.startswith("cps") || + Mnemonic.startswith("vsel") || Mnemonic == "vmaxnm" || Mnemonic == "vminnm" || Mnemonic == "vcvta" || Mnemonic == "vcvtn" || Mnemonic == "vcvtp" || Mnemonic == "vcvtm" || Mnemonic == "vrinta" || Mnemonic == "vrintn" || Mnemonic == "vrintp" || @@ -5404,21 +5573,24 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, } // GNU Assembler extension (compatibility) - if ((Mnemonic == "ldrd" || Mnemonic == "strd") && !isThumb() && - Operands.size() == 4) { - ARMOperand *Op = static_cast(Operands[2]); - assert(Op->isReg() && "expected register argument"); + if ((Mnemonic == "ldrd" || Mnemonic == "strd")) { + ARMOperand *Op2 = static_cast(Operands[2]); + ARMOperand *Op3 = static_cast(Operands[3]); + if (Op3->isMem()) { + assert(Op2->isReg() && "expected register argument"); - unsigned SuperReg = MRI->getMatchingSuperReg( - Op->getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID)); + unsigned SuperReg = MRI->getMatchingSuperReg( + Op2->getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID)); - assert(SuperReg && "expected register pair"); + assert(SuperReg && "expected register pair"); - unsigned PairedReg = MRI->getSubReg(SuperReg, ARM::gsub_1); + unsigned PairedReg = MRI->getSubReg(SuperReg, ARM::gsub_1); - Operands.insert(Operands.begin() + 3, - ARMOperand::CreateReg(PairedReg, Op->getStartLoc(), - Op->getEndLoc())); + Operands.insert(Operands.begin() + 3, + ARMOperand::CreateReg(PairedReg, + Op2->getStartLoc(), + Op2->getEndLoc())); + } } // FIXME: As said above, this is all a pretty gross hack. 
This instruction @@ -5748,6 +5920,30 @@ validateInstruction(MCInst &Inst, return Error(Operands[Op]->getStartLoc(), "branch target out of range"); break; } + case ARM::MOVi16: + case ARM::t2MOVi16: + case ARM::t2MOVTi16: + { + // We want to avoid misleadingly allowing something like "mov r0, <symbol>" + // especially when we turn it into a movw and the expression <symbol> does + // not have a :lower16: or :upper16 as part of the expression. We don't + // want the behavior of silently truncating, which can be unexpected and + // lead to bugs that are difficult to find since this is an easy mistake + // to make. + int i = (Operands[3]->isImm()) ? 3 : 4; + ARMOperand *Op = static_cast<ARMOperand*>(Operands[i]); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm()); + if (CE) break; + const MCExpr *E = dyn_cast<MCExpr>(Op->getImm()); + if (!E) break; + const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E); + if (!ARM16Expr || (ARM16Expr->getKind() != ARMMCExpr::VK_ARM_HI16 && + ARM16Expr->getKind() != ARMMCExpr::VK_ARM_LO16)) { + return Error(Op->getStartLoc(), + "immediate expression for mov requires :lower16: or :upper16"); + break; + } + } } return false; @@ -5898,7 +6094,7 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) { case ARM::VLD3DUPdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD; case ARM::VLD3DUPdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3DUPd32_UPD; case ARM::VLD3DUPqWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3DUPq8_UPD; - case ARM::VLD3DUPqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3DUPq16_UPD; + case ARM::VLD3DUPqWB_fixed_Asm_16: Spacing = 2; return ARM::VLD3DUPq16_UPD; case ARM::VLD3DUPqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3DUPq32_UPD; case ARM::VLD3DUPdWB_register_Asm_8: Spacing = 1; return ARM::VLD3DUPd8_UPD; case ARM::VLD3DUPdWB_register_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD; @@ -7860,9 +8056,11 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) { return Match_Success; } -template<> inline bool IsCPSRDead<MCInst>(MCInst* Instr) { +namespace llvm { +template <> inline bool IsCPSRDead<MCInst>(MCInst *Instr) { return true; // In an assembly source, no need to second-guess } +} static const char *getSubtargetFeatureName(unsigned Val); bool ARMAsmParser:: @@ -7965,6 +8163,42 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; return Error(ErrorLoc, "immediate operand must be in the range [0,239]"); } + case Match_AlignedMemoryRequiresNone: + case Match_DupAlignedMemoryRequiresNone: + case Match_AlignedMemoryRequires16: + case Match_DupAlignedMemoryRequires16: + case Match_AlignedMemoryRequires32: + case Match_DupAlignedMemoryRequires32: + case Match_AlignedMemoryRequires64: + case Match_DupAlignedMemoryRequires64: + case Match_AlignedMemoryRequires64or128: + case Match_DupAlignedMemoryRequires64or128: + case Match_AlignedMemoryRequires64or128or256: + { + SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getAlignmentLoc(); + if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + switch (MatchResult) { + default: + llvm_unreachable("Missing Match_Aligned type"); + case Match_AlignedMemoryRequiresNone: + case Match_DupAlignedMemoryRequiresNone: + return Error(ErrorLoc, "alignment must be omitted"); + case Match_AlignedMemoryRequires16: + case Match_DupAlignedMemoryRequires16: + return Error(ErrorLoc, "alignment must be 16 or omitted"); + case Match_AlignedMemoryRequires32: + case Match_DupAlignedMemoryRequires32: + return Error(ErrorLoc, "alignment must be 32 or omitted"); + case Match_AlignedMemoryRequires64: + case
Match_DupAlignedMemoryRequires64: + return Error(ErrorLoc, "alignment must be 64 or omitted"); + case Match_AlignedMemoryRequires64or128: + case Match_DupAlignedMemoryRequires64or128: + return Error(ErrorLoc, "alignment must be 64, 128 or omitted"); + case Match_AlignedMemoryRequires64or128or256: + return Error(ErrorLoc, "alignment must be 64, 128, 256 or omitted"); + } + } } llvm_unreachable("Implement any new match types added!"); @@ -7972,6 +8206,10 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, /// parseDirective parses the arm specific directives bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { + const MCObjectFileInfo::Environment Format = + getContext().getObjectFileInfo()->getObjectFileType(); + bool IsMachO = Format == MCObjectFileInfo::IsMachO; + StringRef IDVal = DirectiveID.getIdentifier(); if (IDVal == ".word") return parseLiteralValues(4, DirectiveID.getLoc()); @@ -7989,16 +8227,6 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { return parseDirectiveSyntax(DirectiveID.getLoc()); else if (IDVal == ".unreq") return parseDirectiveUnreq(DirectiveID.getLoc()); - else if (IDVal == ".arch") - return parseDirectiveArch(DirectiveID.getLoc()); - else if (IDVal == ".eabi_attribute") - return parseDirectiveEabiAttr(DirectiveID.getLoc()); - else if (IDVal == ".cpu") - return parseDirectiveCPU(DirectiveID.getLoc()); - else if (IDVal == ".fpu") - return parseDirectiveFPU(DirectiveID.getLoc()); - else if (IDVal == ".fnstart") - return parseDirectiveFnStart(DirectiveID.getLoc()); else if (IDVal == ".fnend") return parseDirectiveFnEnd(DirectiveID.getLoc()); else if (IDVal == ".cantunwind") @@ -8015,12 +8243,6 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { return parseDirectiveRegSave(DirectiveID.getLoc(), false); else if (IDVal == ".vsave") return parseDirectiveRegSave(DirectiveID.getLoc(), true); - else if (IDVal == ".inst") - return parseDirectiveInst(DirectiveID.getLoc()); - else if (IDVal == ".inst.n") - return parseDirectiveInst(DirectiveID.getLoc(), 'n'); - else if (IDVal == ".inst.w") - return parseDirectiveInst(DirectiveID.getLoc(), 'w'); else if (IDVal == ".ltorg" || IDVal == ".pool") return parseDirectiveLtorg(DirectiveID.getLoc()); else if (IDVal == ".even") @@ -8029,18 +8251,38 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { return parseDirectivePersonalityIndex(DirectiveID.getLoc()); else if (IDVal == ".unwind_raw") return parseDirectiveUnwindRaw(DirectiveID.getLoc()); - else if (IDVal == ".tlsdescseq") - return parseDirectiveTLSDescSeq(DirectiveID.getLoc()); else if (IDVal == ".movsp") return parseDirectiveMovSP(DirectiveID.getLoc()); - else if (IDVal == ".object_arch") - return parseDirectiveObjectArch(DirectiveID.getLoc()); else if (IDVal == ".arch_extension") return parseDirectiveArchExtension(DirectiveID.getLoc()); else if (IDVal == ".align") return parseDirectiveAlign(DirectiveID.getLoc()); else if (IDVal == ".thumb_set") return parseDirectiveThumbSet(DirectiveID.getLoc()); + + if (!IsMachO) { + if (IDVal == ".arch") + return parseDirectiveArch(DirectiveID.getLoc()); + else if (IDVal == ".cpu") + return parseDirectiveCPU(DirectiveID.getLoc()); + else if (IDVal == ".eabi_attribute") + return parseDirectiveEabiAttr(DirectiveID.getLoc()); + else if (IDVal == ".fpu") + return parseDirectiveFPU(DirectiveID.getLoc()); + else if (IDVal == ".fnstart") + return parseDirectiveFnStart(DirectiveID.getLoc()); + else if (IDVal == ".inst") + return parseDirectiveInst(DirectiveID.getLoc()); + else if (IDVal == ".inst.n") + return 
parseDirectiveInst(DirectiveID.getLoc(), 'n'); + else if (IDVal == ".inst.w") + return parseDirectiveInst(DirectiveID.getLoc(), 'w'); + else if (IDVal == ".object_arch") + return parseDirectiveObjectArch(DirectiveID.getLoc()); + else if (IDVal == ".tlsdescseq") + return parseDirectiveTLSDescSeq(DirectiveID.getLoc()); + } + return true; } @@ -8121,32 +8363,6 @@ void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) { if (NextSymbolIsThumb) { getParser().getStreamer().EmitThumbFunc(Symbol); NextSymbolIsThumb = false; - return; - } - - if (!isThumb()) - return; - - const MCObjectFileInfo::Environment Format = - getContext().getObjectFileInfo()->getObjectFileType(); - switch (Format) { - case MCObjectFileInfo::IsCOFF: { - const MCSymbolData &SD = - getParser().getStreamer().getOrCreateSymbolData(Symbol); - char Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT; - if (SD.getFlags() & (Type << COFF::SF_TypeShift)) - getParser().getStreamer().EmitThumbFunc(Symbol); - break; - } - case MCObjectFileInfo::IsELF: { - const MCSymbolData &SD = - getParser().getStreamer().getOrCreateSymbolData(Symbol); - if (MCELF::GetType(SD) & (ELF::STT_FUNC << ELF_STT_Shift)) - getParser().getStreamer().EmitThumbFunc(Symbol); - break; - } - case MCObjectFileInfo::IsMachO: - break; } } @@ -8303,14 +8519,6 @@ bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) { /// parseDirectiveArch /// ::= .arch token bool ARMAsmParser::parseDirectiveArch(SMLoc L) { - const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo(); - bool isMachO = MAI->hasSubsectionsViaSymbols(); - if (isMachO) { - Error(L, ".arch directive not valid for Mach-O"); - Parser.eatToEndOfStatement(); - return false; - } - StringRef Arch = getParser().parseStringToEndOfStatement().trim(); unsigned ID = StringSwitch(Arch) @@ -8334,14 +8542,6 @@ bool ARMAsmParser::parseDirectiveArch(SMLoc L) { /// ::= .eabi_attribute int, int [, "str"] /// ::= .eabi_attribute Tag_name, int [, "str"] bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) { - const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo(); - bool isMachO = MAI->hasSubsectionsViaSymbols(); - if (isMachO) { - Error(L, ".eabi_attribute directive not valid for Mach-O"); - Parser.eatToEndOfStatement(); - return false; - } - int64_t Tag; SMLoc TagLoc; TagLoc = Parser.getTok().getLoc(); @@ -8447,14 +8647,6 @@ bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) { /// parseDirectiveCPU /// ::= .cpu str bool ARMAsmParser::parseDirectiveCPU(SMLoc L) { - const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo(); - bool isMachO = MAI->hasSubsectionsViaSymbols(); - if (isMachO) { - Error(L, ".cpu directive not valid for Mach-O"); - Parser.eatToEndOfStatement(); - return false; - } - StringRef CPU = getParser().parseStringToEndOfStatement().trim(); getTargetStreamer().emitTextAttribute(ARMBuildAttrs::CPU_name, CPU); return false; @@ -8463,14 +8655,6 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) { /// parseDirectiveFPU /// ::= .fpu str bool ARMAsmParser::parseDirectiveFPU(SMLoc L) { - const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo(); - bool isMachO = MAI->hasSubsectionsViaSymbols(); - if (isMachO) { - Error(L, ".fpu directive not valid for Mach-O"); - Parser.eatToEndOfStatement(); - return false; - } - StringRef FPU = getParser().parseStringToEndOfStatement().trim(); unsigned ID = StringSwitch(FPU) @@ -8490,14 +8674,6 @@ bool ARMAsmParser::parseDirectiveFPU(SMLoc L) { /// parseDirectiveFnStart /// ::= .fnstart bool 
ARMAsmParser::parseDirectiveFnStart(SMLoc L) { - const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo(); - bool isMachO = MAI->hasSubsectionsViaSymbols(); - if (isMachO) { - Error(L, ".fnstart directive not valid for Mach-O"); - Parser.eatToEndOfStatement(); - return false; - } - if (UC.hasFnStart()) { Error(L, ".fnstart starts before the end of previous one"); UC.emitFnStartLocNotes(); @@ -8777,14 +8953,6 @@ bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) { /// ::= .inst.n opcode [, ...] /// ::= .inst.w opcode [, ...] bool ARMAsmParser::parseDirectiveInst(SMLoc Loc, char Suffix) { - const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo(); - bool isMachO = MAI->hasSubsectionsViaSymbols(); - if (isMachO) { - Error(Loc, ".inst directive not valid for Mach-O"); - Parser.eatToEndOfStatement(); - return false; - } - int Width; if (isThumb()) { @@ -9033,14 +9201,6 @@ bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) { /// parseDirectiveTLSDescSeq /// ::= .tlsdescseq tls-variable bool ARMAsmParser::parseDirectiveTLSDescSeq(SMLoc L) { - const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo(); - bool isMachO = MAI->hasSubsectionsViaSymbols(); - if (isMachO) { - Error(L, ".tlsdescseq directive not valid for Mach-O"); - Parser.eatToEndOfStatement(); - return false; - } - if (getLexer().isNot(AsmToken::Identifier)) { TokError("expected variable after '.tlsdescseq' directive"); Parser.eatToEndOfStatement(); @@ -9128,14 +9288,6 @@ bool ARMAsmParser::parseDirectiveMovSP(SMLoc L) { /// parseDirectiveObjectArch /// ::= .object_arch name bool ARMAsmParser::parseDirectiveObjectArch(SMLoc L) { - const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo(); - bool isMachO = MAI->hasSubsectionsViaSymbols(); - if (isMachO) { - Error(L, ".object_arch directive not valid for Mach-O"); - Parser.eatToEndOfStatement(); - return false; - } - if (getLexer().isNot(AsmToken::Identifier)) { Error(getLexer().getLoc(), "unexpected token"); Parser.eatToEndOfStatement(); @@ -9221,36 +9373,7 @@ bool ARMAsmParser::parseDirectiveThumbSet(SMLoc L) { Lex(); MCSymbol *Alias = getContext().GetOrCreateSymbol(Name); - if (const MCSymbolRefExpr *SRE = dyn_cast(Value)) { - MCSymbol *Sym = getContext().LookupSymbol(SRE->getSymbol().getName()); - if (!Sym->isDefined()) { - getStreamer().EmitSymbolAttribute(Sym, MCSA_Global); - getStreamer().EmitAssignment(Alias, Value); - return false; - } - - const MCObjectFileInfo::Environment Format = - getContext().getObjectFileInfo()->getObjectFileType(); - switch (Format) { - case MCObjectFileInfo::IsCOFF: { - char Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT; - getStreamer().EmitCOFFSymbolType(Type); - // .set values are always local in COFF - getStreamer().EmitSymbolAttribute(Alias, MCSA_Local); - break; - } - case MCObjectFileInfo::IsELF: - getStreamer().EmitSymbolAttribute(Alias, MCSA_ELF_TypeFunction); - break; - case MCObjectFileInfo::IsMachO: - break; - } - } - - // FIXME: set the function as being a thumb function via the assembler - getStreamer().EmitThumbFunc(Alias); - getStreamer().EmitAssignment(Alias, Value); - + getTargetStreamer().emitThumbSet(Alias, Value); return false; } @@ -9365,8 +9488,8 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, int64_t Value; if (!SOExpr->EvaluateAsAbsolute(Value)) return Match_Success; - assert((Value >= INT32_MIN && Value <= INT32_MAX) && - "expression value must be representiable in 32 bits"); + 
assert((Value >= INT32_MIN && Value <= UINT32_MAX) && + "expression value must be representable in 32 bits"); } break; case MCK_GPRPair: diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt index 8e14883..9b5fa75 100644 --- a/lib/Target/ARM/CMakeLists.txt +++ b/lib/Target/ARM/CMakeLists.txt @@ -17,7 +17,6 @@ add_public_tablegen_target(ARMCommonTableGen) add_llvm_target(ARMCodeGen A15SDOptimizer.cpp ARMAsmPrinter.cpp - ARMAtomicExpandPass.cpp ARMBaseInstrInfo.cpp ARMBaseRegisterInfo.cpp ARMCodeEmitter.cpp diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 9e40381..4d4038d 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "arm-disassembler" - #include "llvm/MC/MCDisassembler.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "MCTargetDesc/ARMBaseInfo.h" @@ -29,6 +27,8 @@ using namespace llvm; +#define DEBUG_TYPE "arm-disassembler" + typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { @@ -90,8 +90,8 @@ class ARMDisassembler : public MCDisassembler { public: /// Constructor - Initializes the disassembler. /// - ARMDisassembler(const MCSubtargetInfo &STI) : - MCDisassembler(STI) { + ARMDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : + MCDisassembler(STI, Ctx) { } ~ARMDisassembler() { @@ -109,8 +109,8 @@ class ThumbDisassembler : public MCDisassembler { public: /// Constructor - Initializes the disassembler. /// - ThumbDisassembler(const MCSubtargetInfo &STI) : - MCDisassembler(STI) { + ThumbDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : + MCDisassembler(STI, Ctx) { } ~ThumbDisassembler() { @@ -400,12 +400,16 @@ static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); #include "ARMGenDisassemblerTables.inc" -static MCDisassembler *createARMDisassembler(const Target &T, const MCSubtargetInfo &STI) { - return new ARMDisassembler(STI); +static MCDisassembler *createARMDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new ARMDisassembler(STI, Ctx); } -static MCDisassembler *createThumbDisassembler(const Target &T, const MCSubtargetInfo &STI) { - return new ThumbDisassembler(STI); +static MCDisassembler *createThumbDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new ThumbDisassembler(STI, Ctx); } DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index da3fe01..e4b785d 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "ARMInstPrinter.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "MCTargetDesc/ARMBaseInfo.h" @@ -23,6 +22,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "asm-printer" + #include "ARMGenAsmWriter.inc" /// translateShiftImm - Convert shift immediate from 0-31 to 1-32 for printing. 
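The DEBUG_TYPE moves in the files above (the barriers pass, ARMSelectionDAGInfo, ARMSubtarget, ARMTargetTransformInfo, the disassembler and the instruction printer) all converge on one layout, sketched below; the apparent intent is that no header included by the file can observe, or collide with, the file-local macro:

// Post-patch layout used across the ARM target files (sketch):
#include "llvm/Support/Debug.h" // all headers first, DEBUG_TYPE undefined
using namespace llvm;

#define DEBUG_TYPE "asm-printer" // file-local, defined after all includes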
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 1db517f..7acd9cc 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -306,8 +306,36 @@ bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { return true; } +static uint32_t swapHalfWords(uint32_t Value, bool IsLittleEndian) { + if (IsLittleEndian) { + // Note that the halfwords are stored high first and low second in thumb; + // so we need to swap the fixup value here to map properly. + uint32_t Swapped = (Value & 0xFFFF0000) >> 16; + Swapped |= (Value & 0x0000FFFF) << 16; + return Swapped; + } + else + return Value; +} + +static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf, + bool IsLittleEndian) { + uint32_t Value; + + if (IsLittleEndian) { + Value = (SecondHalf & 0xFFFF) << 16; + Value |= (FirstHalf & 0xFFFF); + } else { + Value = (SecondHalf & 0xFFFF); + Value |= (FirstHalf & 0xFFFF) << 16; + } + + return Value; +} + static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, - bool IsPCRel, MCContext *Ctx) { + bool IsPCRel, MCContext *Ctx, + bool IsLittleEndian) { unsigned Kind = Fixup.getKind(); switch (Kind) { default: @@ -316,6 +344,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case FK_Data_2: case FK_Data_4: return Value; + case FK_SecRel_2: + return Value; + case FK_SecRel_4: + return Value; case ARM::fixup_arm_movt_hi16: if (!IsPCRel) Value >>= 16; @@ -342,9 +374,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, // inst{14-12} = Mid3; // inst{7-0} = Lo8; Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8); - uint64_t swapped = (Value & 0xFFFF0000) >> 16; - swapped |= (Value & 0x0000FFFF) << 16; - return swapped; + return swapHalfWords(Value, IsLittleEndian); } case ARM::fixup_arm_ldst_pcrel_12: // ARM PC-relative values are offset by 8. @@ -364,11 +394,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, // Same addressing mode as fixup_arm_pcrel_10, // but with 16-bit halfwords swapped. 
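A quick sanity check for the two helpers introduced above (a sketch that assumes it sits in the same translation unit as the static functions): a 32-bit Thumb instruction is stored as two halfwords with the first halfword lowest in memory, so on a little-endian host a fixup value computed with the first halfword in its top 16 bits must be swapped before it is written out.

#include <cassert>
#include <cstdint>

int main() {
  // First halfword 0xAAAA, second halfword 0xBBBB:
  assert(swapHalfWords(0xAAAABBBBu, /*IsLittleEndian=*/true) == 0xBBBBAAAAu);
  assert(joinHalfWords(0xAAAAu, 0xBBBBu, /*IsLittleEndian=*/true) ==
         0xBBBBAAAAu);
  return 0;
}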
- if (Kind == ARM::fixup_t2_ldst_pcrel_12) { - uint64_t swapped = (Value & 0xFFFF0000) >> 16; - swapped |= (Value & 0x0000FFFF) << 16; - return swapped; - } + if (Kind == ARM::fixup_t2_ldst_pcrel_12) + return swapHalfWords(Value, IsLittleEndian); return Value; } @@ -401,9 +428,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, out |= (Value & 0x700) << 4; out |= (Value & 0x0FF); - uint64_t swapped = (out & 0xFFFF0000) >> 16; - swapped |= (out & 0x0000FFFF) << 16; - return swapped; + return swapHalfWords(out, IsLittleEndian); } case ARM::fixup_arm_condbranch: @@ -434,9 +459,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, out |= (Value & 0x1FF800) << 5; // imm6 field out |= (Value & 0x0007FF); // imm11 field - uint64_t swapped = (out & 0xFFFF0000) >> 16; - swapped |= (out & 0x0000FFFF) << 16; - return swapped; + return swapHalfWords(out, IsLittleEndian); } case ARM::fixup_t2_condbranch: { Value = Value - 4; @@ -449,9 +472,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, out |= (Value & 0x1F800) << 5; // imm6 field out |= (Value & 0x007FF); // imm11 field - uint32_t swapped = (out & 0xFFFF0000) >> 16; - swapped |= (out & 0x0000FFFF) << 16; - return swapped; + return swapHalfWords(out, IsLittleEndian); } case ARM::fixup_arm_thumb_bl: { // The value doesn't encode the low bit (always zero) and is offset by @@ -475,13 +496,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, uint32_t imm10Bits = (offset & 0x1FF800) >> 11; uint32_t imm11Bits = (offset & 0x000007FF); - uint32_t Binary = 0; - uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits); - uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) | + uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits); + uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) | (uint16_t)imm11Bits); - Binary |= secondHalf << 16; - Binary |= firstHalf; - return Binary; + return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian); } case ARM::fixup_arm_thumb_blx: { // The value doesn't encode the low two bits (always zero) and is offset by @@ -508,13 +526,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, uint32_t imm10HBits = (offset & 0xFFC00) >> 10; uint32_t imm10LBits = (offset & 0x3FF); - uint32_t Binary = 0; - uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits); - uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) | + uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits); + uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) | ((uint16_t)imm10LBits) << 1); - Binary |= secondHalf << 16; - Binary |= firstHalf; - return Binary; + return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian); } case ARM::fixup_arm_thumb_cp: // Offset by 4, and don't encode the low two bits. Two bytes of that @@ -566,11 +581,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, // Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords // swapped. - if (Kind == ARM::fixup_t2_pcrel_10) { - uint32_t swapped = (Value & 0xFFFF0000) >> 16; - swapped |= (Value & 0x0000FFFF) << 16; - return swapped; - } + if (Kind == ARM::fixup_t2_pcrel_10) + return swapHalfWords(Value, IsLittleEndian); return Value; } @@ -603,7 +615,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, // the offset when the destination has the same MCFragment. 
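For the Thumb BL/BLX cases above, the visible packing step can be isolated as follows (a sketch; signBit, J1Bit, J2Bit and the imm fields are assumed to be derived from the branch offset as in the full source, and joinHalfWords is the helper added earlier in this file):

#include <cstdint>

// Packs the already-extracted BL fields into the two instruction halves,
// mirroring the hunk above.
static uint32_t packThumbBL(uint32_t signBit, uint32_t J1Bit, uint32_t J2Bit,
                            uint32_t imm10Bits, uint32_t imm11Bits,
                            bool IsLittleEndian) {
  uint32_t FirstHalf = ((uint16_t)signBit << 10) | (uint16_t)imm10Bits;
  uint32_t SecondHalf = ((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
                        (uint16_t)imm11Bits;
  return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
}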
if (A && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { const MCSymbol &Sym = A->getSymbol().AliasedSymbol(); - MCSymbolData &SymData = Asm.getSymbolData(Sym); + const MCSymbolData &SymData = Asm.getSymbolData(Sym); IsResolved = (SymData.getFragment() == DF); } // We must always generate a relocation for BL/BLX instructions if we have @@ -618,7 +630,8 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, // Try to get the encoded value for the fixup as-if we're mapping it into // the instruction. This allows adjustFixupValue() to issue a diagnostic // if the value is invalid. - (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext()); + (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext(), + IsLittleEndian); } /// getFixupKindNumBytes - The number of bytes the fixup may change. @@ -662,6 +675,11 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case ARM::fixup_t2_movt_hi16: case ARM::fixup_t2_movw_lo16: return 4; + + case FK_SecRel_2: + return 2; + case FK_SecRel_4: + return 4; } } @@ -716,7 +734,7 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - Value = adjustFixupValue(Fixup, Value, IsPCRel, nullptr); + Value = adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian); if (!Value) return; // Doesn't change encoding. unsigned Offset = Fixup.getOffset(); @@ -724,8 +742,11 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, // Used to point to big endian bytes. unsigned FullSizeBytes; - if (!IsLittleEndian) + if (!IsLittleEndian) { FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind()); + assert((Offset + FullSizeBytes) <= DataSize && "Invalid fixup size!"); + assert(NumBytes <= FullSizeBytes && "Invalid fixup size!"); + } // For each byte of the fragment that the fixup touches, mask in the bits from // the fixup value. The Value has been "split up" into the appropriate @@ -737,6 +758,15 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, } namespace { +// FIXME: This should be in a separate file. +class ARMWinCOFFAsmBackend : public ARMAsmBackend { +public: + ARMWinCOFFAsmBackend(const Target &T, const StringRef &Triple) + : ARMAsmBackend(T, Triple, true) { } + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + return createARMWinCOFFObjectWriter(OS, /*Is64Bit=*/false); + } +}; // FIXME: This should be in a separate file. // ELF is an ELF of course... @@ -777,7 +807,9 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T, bool isLittle) { Triple TheTriple(TT); - if (TheTriple.isOSBinFormatMachO()) { + switch (TheTriple.getObjectFormat()) { + default: llvm_unreachable("unsupported object format"); + case Triple::MachO: { MachO::CPUSubTypeARM CS = StringSwitch(TheTriple.getArchName()) .Cases("armv4t", "thumbv4t", MachO::CPU_SUBTYPE_ARM_V4T) @@ -792,15 +824,14 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T, return new DarwinARMAsmBackend(T, TT, CS); } - -#if 0 - // FIXME: Introduce yet another checker but assert(0).
- if (TheTriple.isOSBinFormatCOFF()) - assert(0 && "Windows not supported on ARM"); -#endif - - uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS()); - return new ELFARMAsmBackend(T, TT, OSABI, isLittle); + case Triple::COFF: + assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported"); + return new ARMWinCOFFAsmBackend(T, TT); + case Triple::ELF: + assert(TheTriple.isOSBinFormatELF() && "using ELF for non-ELF target"); + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS()); + return new ELFARMAsmBackend(T, TT, OSABI, isLittle); + } } MCAsmBackend *llvm::createARMLEAsmBackend(const Target &T, diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index a4661b1..1c84263 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -74,7 +74,7 @@ unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target, unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - MCSymbolRefExpr::VariantKind Modifier = Fixup.getAccessVariant(); + MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); unsigned Type = 0; if (IsPCRel) { @@ -91,6 +91,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case MCSymbolRefExpr::VK_GOTTPOFF: Type = ELF::R_ARM_TLS_IE32; break; + case MCSymbolRefExpr::VK_GOTPCREL: + Type = ELF::R_ARM_GOT_PREL; + break; } break; case ARM::fixup_arm_blx: @@ -167,6 +170,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case MCSymbolRefExpr::VK_GOTOFF: Type = ELF::R_ARM_GOTOFF32; break; + case MCSymbolRefExpr::VK_GOTPCREL: + Type = ELF::R_ARM_GOT_PREL; + break; case MCSymbolRefExpr::VK_ARM_TARGET1: Type = ELF::R_ARM_TARGET1; break; diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 5a01d26..a4d13ed 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -30,6 +30,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSection.h" @@ -62,7 +63,7 @@ static const char *GetFPUName(unsigned ID) { #define ARM_FPU_NAME(NAME, ID) case ARM::ID: return NAME; #include "ARMFPUName.def" } - return NULL; + return nullptr; } static const char *GetArchName(unsigned ID) { @@ -75,7 +76,7 @@ static const char *GetArchName(unsigned ID) { #define ARM_ARCH_ALIAS(NAME, ID) /* empty */ #include "ARMArchName.def" } - return NULL; + return nullptr; } static const char *GetArchDefaultCPUName(unsigned ID) { @@ -88,7 +89,7 @@ static const char *GetArchDefaultCPUName(unsigned ID) { #define ARM_ARCH_ALIAS(NAME, ID) /* empty */ #include "ARMArchName.def" } - return NULL; + return nullptr; } static unsigned GetArchDefaultCPUArch(unsigned ID) { @@ -139,6 +140,7 @@ class ARMTargetAsmStreamer : public ARMTargetStreamer { void finishAttributeSection() override; void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override; + void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override; public: ARMTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS, @@ -260,6 +262,10 @@ ARMTargetAsmStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) { OS << "\t.tlsdescseq\t" << S->getSymbol().getName(); } +void 
ARMTargetAsmStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) { + OS << "\t.thumb_set\t" << *Symbol << ", " << *Value << '\n'; +} + void ARMTargetAsmStreamer::emitInst(uint32_t Inst, char Suffix) { OS << "\t.inst"; if (Suffix) @@ -310,7 +316,7 @@ private: for (size_t i = 0; i < Contents.size(); ++i) if (Contents[i].Tag == Attribute) return &Contents[i]; - return 0; + return nullptr; } void setAttributeItem(unsigned Attribute, unsigned Value, @@ -406,8 +412,10 @@ private: void emitFPU(unsigned FPU) override; void emitInst(uint32_t Inst, char Suffix = '\0') override; void finishAttributeSection() override; + void emitLabel(MCSymbol *Symbol) override; void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override; + void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override; size_t calculateContentSize() const; @@ -415,7 +423,7 @@ public: ARMTargetELFStreamer(MCStreamer &S) : ARMTargetStreamer(S), CurrentVendor("aeabi"), FPU(ARM::INVALID_FPU), Arch(ARM::INVALID_ARCH), EmittedArch(ARM::INVALID_ARCH), - AttributeSection(0) {} + AttributeSection(nullptr) {} }; /// Extend the generic ELFStreamer class so that it can emit mapping symbols at @@ -531,7 +539,8 @@ public: /// This is one of the functions used to emit data into an ELF section, so the /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if /// necessary. - void EmitValueImpl(const MCExpr *Value, unsigned Size) override { + void EmitValueImpl(const MCExpr *Value, unsigned Size, + const SMLoc &Loc) override { EmitDataMappingSymbol(); MCELFStreamer::EmitValueImpl(Value, Size); } @@ -600,12 +609,8 @@ private: } void EmitThumbFunc(MCSymbol *Func) override { - // FIXME: Anything needed here to flag the function as thumb? - getAssembler().setIsThumbFunc(Func); - - MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Func); - SD.setFlags(SD.getFlags() | ELF_Other_ThumbFunc); + EmitSymbolAttribute(Func, MCSA_ELF_TypeFunction); } // Helper functions for ARM exception handling directives @@ -980,10 +985,35 @@ void ARMTargetELFStreamer::finishAttributeSection() { Contents.clear(); FPU = ARM::INVALID_FPU; } + +void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) { + ARMELFStreamer &Streamer = getStreamer(); + if (!Streamer.IsThumb) + return; + + const MCSymbolData &SD = Streamer.getOrCreateSymbolData(Symbol); + if (MCELF::GetType(SD) & (ELF::STT_FUNC << ELF_STT_Shift)) + Streamer.EmitThumbFunc(Symbol); +} + void ARMTargetELFStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) { getStreamer().EmitFixup(S, FK_Data_4); } + +void ARMTargetELFStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) { + if (const MCSymbolRefExpr *SRE = dyn_cast(Value)) { + const MCSymbol &Sym = SRE->getSymbol(); + if (!Sym.isDefined()) { + getStreamer().EmitAssignment(Symbol, Value); + return; + } + } + + getStreamer().EmitThumbFunc(Symbol); + getStreamer().EmitAssignment(Symbol, Value); +} + void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) { getStreamer().emitInst(Inst, Suffix); } @@ -1012,7 +1042,7 @@ inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix, } // Get .ARM.extab or .ARM.exidx section - const MCSectionELF *EHSection = NULL; + const MCSectionELF *EHSection = nullptr; if (const MCSymbol *Group = FnSection.getGroup()) { EHSection = getContext().getELFSection( EHSecName, Type, Flags | ELF::SHF_GROUP, Kind, @@ -1049,9 +1079,9 @@ void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) { } void ARMELFStreamer::Reset() { - ExTab = NULL; - FnStart = NULL; - 
Personality = NULL; + ExTab = nullptr; + FnStart = nullptr; + Personality = nullptr; PersonalityIndex = ARM::EHABI::NUM_PERSONALITY_INDEX; FPReg = ARM::SP; FPOffset = 0; @@ -1065,7 +1095,7 @@ void ARMELFStreamer::Reset() { } void ARMELFStreamer::emitFnStart() { - assert(FnStart == 0); + assert(FnStart == nullptr); FnStart = getContext().CreateTempSymbol(); EmitLabel(FnStart); } @@ -1104,11 +1134,14 @@ void ARMELFStreamer::emitFnEnd() { // the second word of exception index table entry. The size of the unwind // opcodes should always be 4 bytes. assert(PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0 && - "Compact model must use __aeabi_cpp_unwind_pr0 as personality"); + "Compact model must use __aeabi_unwind_cpp_pr0 as personality"); assert(Opcodes.size() == 4u && - "Unwind opcode size for __aeabi_cpp_unwind_pr0 must be equal to 4"); - EmitBytes(StringRef(reinterpret_cast(Opcodes.data()), - Opcodes.size())); + "Unwind opcode size for __aeabi_unwind_cpp_pr0 must be equal to 4"); + uint64_t Intval = Opcodes[0] | + Opcodes[1] << 8 | + Opcodes[2] << 16 | + Opcodes[3] << 24; + EmitIntValue(Intval, Opcodes.size()); } // Switch to the section containing FnStart @@ -1180,8 +1213,15 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) { } // Emit unwind opcodes - EmitBytes(StringRef(reinterpret_cast(Opcodes.data()), - Opcodes.size())); + assert((Opcodes.size() % 4) == 0 && + "Unwind opcode size for __aeabi_cpp_unwind_pr0 must be multiple of 4"); + for (unsigned I = 0; I != Opcodes.size(); I += 4) { + uint64_t Intval = Opcodes[I] | + Opcodes[I + 1] << 8 | + Opcodes[I + 2] << 16 | + Opcodes[I + 3] << 24; + EmitIntValue(Intval, 4); + } // According to ARM EHABI section 9.2, if the __aeabi_unwind_cpp_pr1() or // __aeabi_unwind_cpp_pr2() is used, then the handler data must be emitted @@ -1283,13 +1323,11 @@ void ARMELFStreamer::emitUnwindRaw(int64_t Offset, namespace llvm { MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useCFI, - bool useDwarfDirectory, + bool isVerboseAsm, bool useDwarfDirectory, MCInstPrinter *InstPrint, MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst) { - MCStreamer *S = - llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory, - InstPrint, CE, TAB, ShowInst); + MCStreamer *S = llvm::createAsmStreamer( + Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst); new ARMTargetAsmStreamer(*S, OS, *InstPrint, isVerboseAsm); return S; } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp index b7f96e0..7a19208 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp @@ -25,7 +25,7 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(StringRef TT) { (TheTriple.getArch() == Triple::thumbeb)) IsLittleEndian = false; - Data64bitsDirective = 0; + Data64bitsDirective = nullptr; CommentString = "@"; Code16Directive = ".code\t16"; Code32Directive = ".code\t32"; @@ -50,7 +50,7 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo(StringRef TT) { // ".comm align is in bytes but .align is pow-2." 
AlignmentIsInBytes = false; - Data64bitsDirective = 0; + Data64bitsDirective = nullptr; CommentString = "@"; Code16Directive = ".code\t16"; Code32Directive = ".code\t32"; @@ -59,7 +59,14 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo(StringRef TT) { SupportsDebugInformation = true; // Exceptions handling - ExceptionsType = ExceptionHandling::ARM; + switch (TheTriple.getOS()) { + case Triple::NetBSD: + ExceptionsType = ExceptionHandling::DwarfCFI; + break; + default: + ExceptionsType = ExceptionHandling::ARM; + break; + } // foo(plt) instead of foo@plt UseParensForSymbolVariant = true; @@ -89,6 +96,7 @@ void ARMCOFFMCAsmInfoGNU::anchor() { } ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() { AlignmentIsInBytes = false; + HasSingleParameterDotFile = true; CommentString = "@"; Code16Directive = ".code\t16"; diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h index beaf6a4..51cfa0a 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h @@ -35,13 +35,13 @@ namespace llvm { }; class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { - void anchor(); + void anchor() override; public: explicit ARMCOFFMCAsmInfoMicrosoft(); }; class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF { - void anchor(); + void anchor() override; public: explicit ARMCOFFMCAsmInfoGNU(); }; diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index 5564e0a..5b51a52 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mccodeemitter" #include "MCTargetDesc/ARMMCTargetDesc.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "MCTargetDesc/ARMBaseInfo.h" @@ -31,6 +30,8 @@ using namespace llvm; +#define DEBUG_TYPE "mccodeemitter" + STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created."); @@ -1036,16 +1037,17 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx, : ARM::fixup_arm_movw_lo16); break; } + Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc())); return 0; } // If the expression doesn't have :upper16: or :lower16: on it, - // it's just a plain immediate expression, and those evaluate to + // it's just a plain immediate expression, previously those evaluated to // the lower 16 bits of the expression regardless of whether - // we have a movt or a movw. - Kind = MCFixupKind(isThumb2(STI) ? ARM::fixup_t2_movw_lo16 : ARM::fixup_arm_movw_lo16); - Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc())); + // we have a movt or a movw, but that led to misleading results. + // This is now disallowed in the AsmParser in validateInstruction(), + // so this should never happen.
+ assert(0 && "expression without :upper16: or :lower16:"); return 0; } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp index fc8505b..87ea875 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp @@ -7,12 +7,13 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "armmcexpr" #include "ARMMCExpr.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" using namespace llvm; +#define DEBUG_TYPE "armmcexpr" + const ARMMCExpr* ARMMCExpr::Create(VariantKind Kind, const MCExpr *Expr, MCContext &Ctx) { diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 949a3d5..04d63a7 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" @@ -106,9 +107,11 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) { unsigned SubVer = TT[Idx]; if (SubVer == '8') { if (NoCPU) - // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, FeatureMP, - // FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, FeatureT2XtPk, FeatureCrypto, FeatureCRC - ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm,+trustzone,+t2xtpk,+crypto,+crc"; + // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, + // FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, + // FeatureT2XtPk, FeatureCrypto, FeatureCRC + ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm," + "+trustzone,+t2xtpk,+crypto,+crc"; else // Use CPU to figure out the exact features ARMArchFeature = "+v8"; @@ -245,7 +248,7 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) { } unsigned Reg = MRI.getDwarfRegNum(ARM::SP, true); - MAI->addInitialFrameState(MCCFIInstruction::createDefCfa(0, Reg, 0)); + MAI->addInitialFrameState(MCCFIInstruction::createDefCfa(nullptr, Reg, 0)); return MAI; } @@ -273,18 +276,20 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, bool NoExecStack) { Triple TheTriple(TT); - if (TheTriple.isOSBinFormatMachO()) { + switch (TheTriple.getObjectFormat()) { + default: llvm_unreachable("unsupported object format"); + case Triple::MachO: { MCStreamer *S = createMachOStreamer(Ctx, MAB, OS, Emitter, false); new ARMTargetStreamer(*S); return S; } - - if (TheTriple.isOSWindows()) { - llvm_unreachable("ARM does not support Windows COFF format"); + case Triple::COFF: + assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported"); + return createARMWinCOFFStreamer(Ctx, MAB, *Emitter, OS); + case Triple::ELF: + return createARMELFStreamer(Ctx, MAB, OS, Emitter, false, NoExecStack, + TheTriple.getArch() == Triple::thumb); } - - return createARMELFStreamer(Ctx, MAB, OS, Emitter, false, NoExecStack, - TheTriple.getArch() == Triple::thumb); } static MCInstPrinter *createARMMCInstPrinter(const Target &T, @@ -295,7 +300,7 @@ static MCInstPrinter *createARMMCInstPrinter(const Target &T, const MCSubtargetInfo &STI) { if (SyntaxVariant == 0) return new ARMInstPrinter(MAI, MII, MRI, STI); - return 0; + return nullptr; } static MCRelocationInfo *createARMMCRelocationInfo(StringRef TT, diff 
--git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index e81876f..8853a8c 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -47,8 +47,7 @@ namespace ARM_MC { } MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useCFI, - bool useDwarfDirectory, + bool isVerboseAsm, bool useDwarfDirectory, MCInstPrinter *InstPrint, MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst); @@ -78,6 +77,11 @@ MCAsmBackend *createThumbLEAsmBackend(const Target &T, const MCRegisterInfo &MRI MCAsmBackend *createThumbBEAsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef TT, StringRef CPU); +/// createARMWinCOFFStreamer - Construct a PE/COFF machine code streamer which +/// will generate a PE/COFF object file. +MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, + MCCodeEmitter &Emitter, raw_ostream &OS); + /// createARMELFObjectWriter - Construct an ARM ELF object writer. MCObjectWriter *createARMELFObjectWriter(raw_ostream &OS, uint8_t OSABI, @@ -89,6 +93,8 @@ MCObjectWriter *createARMMachObjectWriter(raw_ostream &OS, uint32_t CPUType, uint32_t CPUSubtype); +/// createARMWinCOFFObjectWriter - Construct an ARM PE/COFF object writer. +MCObjectWriter *createARMWinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit); /// createARMMachORelocationInfo - Construct ARM Mach-O relocation info. MCRelocationInfo *createARMMachORelocationInfo(MCContext &Ctx); diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 3bf5cf1..ecfa4e5 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -156,7 +156,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, // See .
const MCSymbol *A = &Target.getSymA()->getSymbol(); - MCSymbolData *A_SD = &Asm.getSymbolData(*A); + const MCSymbolData *A_SD = &Asm.getSymbolData(*A); if (!A_SD->getFragment()) Asm.getContext().FatalError(Fixup.getLoc(), @@ -272,7 +272,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, uint32_t Value2 = 0; if (const MCSymbolRefExpr *B = Target.getSymB()) { - MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol()); + const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol()); if (!B_SD->getFragment()) Asm.getContext().FatalError(Fixup.getLoc(), @@ -378,7 +378,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer, } // Get the symbol data, if any. - MCSymbolData *SD = 0; + const MCSymbolData *SD = nullptr; if (Target.getSymA()) SD = &Asm.getSymbolData(Target.getSymA()->getSymbol()); diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index fdc0ed7..e3cfb05 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -109,7 +109,7 @@ ConstantPool * AssemblerConstantPools::getConstantPool(const MCSection *Section) { ConstantPoolMapTy::iterator CP = ConstantPools.find(Section); if (CP == ConstantPools.end()) - return 0; + return nullptr; return &CP->second; } @@ -246,3 +246,7 @@ void ARMTargetStreamer::AnnotateTLSDescriptorSequence( const MCSymbolRefExpr *SRE) { llvm_unreachable("unimplemented"); } + +void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) { + llvm_unreachable("unimplemented"); +} diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp new file mode 100644 index 0000000..d31f1f4 --- /dev/null +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -0,0 +1,82 @@ +//===-- ARMWinCOFFObjectWriter.cpp - ARM Windows COFF Object Writer -- C++ -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/ARMFixupKinds.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCValue.h" +#include "llvm/MC/MCWinCOFFObjectWriter.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +namespace { +class ARMWinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { +public: + ARMWinCOFFObjectWriter(bool Is64Bit) + : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARMNT) { + assert(!Is64Bit && "AArch64 support not yet implemented"); + } + virtual ~ARMWinCOFFObjectWriter() { } + + unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsCrossSection) const override; + + bool recordRelocation(const MCFixup &) const override; +}; + +unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsCrossSection) const { + assert(getMachine() == COFF::IMAGE_FILE_MACHINE_ARMNT && + "AArch64 support not yet implemented"); + + MCSymbolRefExpr::VariantKind Modifier = + Target.isAbsolute() ? 
MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); + + switch (static_cast(Fixup.getKind())) { + default: llvm_unreachable("unsupported relocation type"); + case FK_Data_4: + switch (Modifier) { + case MCSymbolRefExpr::VK_COFF_IMGREL32: + return COFF::IMAGE_REL_ARM_ADDR32NB; + case MCSymbolRefExpr::VK_SECREL: + return COFF::IMAGE_REL_ARM_SECREL; + default: + return COFF::IMAGE_REL_ARM_ADDR32; + } + case FK_SecRel_2: + return COFF::IMAGE_REL_ARM_SECTION; + case FK_SecRel_4: + return COFF::IMAGE_REL_ARM_SECREL; + case ARM::fixup_t2_condbranch: + return COFF::IMAGE_REL_ARM_BRANCH20T; + case ARM::fixup_t2_uncondbranch: + return COFF::IMAGE_REL_ARM_BRANCH24T; + case ARM::fixup_arm_thumb_bl: + case ARM::fixup_arm_thumb_blx: + return COFF::IMAGE_REL_ARM_BLX23T; + case ARM::fixup_t2_movw_lo16: + case ARM::fixup_t2_movt_hi16: + return COFF::IMAGE_REL_ARM_MOV32T; + } +} + +bool ARMWinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { + return static_cast(Fixup.getKind()) != ARM::fixup_t2_movt_hi16; +} +} + +namespace llvm { +MCObjectWriter *createARMWinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit) { + MCWinCOFFObjectTargetWriter *MOTW = new ARMWinCOFFObjectWriter(Is64Bit); + return createWinCOFFObjectWriter(MOTW, OS); +} +} + diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp new file mode 100644 index 0000000..b344ced --- /dev/null +++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -0,0 +1,46 @@ +//===-- ARMWinCOFFStreamer.cpp - ARM Target WinCOFF Streamer ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "ARMMCTargetDesc.h" +#include "llvm/MC/MCWinCOFFStreamer.h" + +using namespace llvm; + +namespace { +class ARMWinCOFFStreamer : public MCWinCOFFStreamer { +public: + ARMWinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter &CE, + raw_ostream &OS) + : MCWinCOFFStreamer(C, AB, CE, OS) { } + + void EmitAssemblerFlag(MCAssemblerFlag Flag) override; + void EmitThumbFunc(MCSymbol *Symbol) override; +}; + +void ARMWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { + switch (Flag) { + default: llvm_unreachable("not implemented"); + case MCAF_SyntaxUnified: + case MCAF_Code16: + break; + } +} + +void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) { + getAssembler().setIsThumbFunc(Symbol); +} +} + +namespace llvm { +MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, + MCCodeEmitter &Emitter, raw_ostream &OS) { + return new ARMWinCOFFStreamer(Context, MAB, Emitter, OS); +} +} + diff --git a/lib/Target/ARM/MCTargetDesc/Android.mk b/lib/Target/ARM/MCTargetDesc/Android.mk index 074d29e..a5827f7 100644 --- a/lib/Target/ARM/MCTargetDesc/Android.mk +++ b/lib/Target/ARM/MCTargetDesc/Android.mk @@ -17,7 +17,9 @@ arm_mc_desc_SRC_FILES := \ ARMMachObjectWriter.cpp \ ARMMachORelocationInfo.cpp \ ARMTargetStreamer.cpp \ - ARMUnwindOpAsm.cpp + ARMUnwindOpAsm.cpp \ + ARMWinCOFFObjectWriter.cpp \ + ARMWinCOFFStreamer.cpp \ # For the host # ===================================================== diff --git a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt index 06812d4..9582e8c 100644 --- a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt @@ -1,14 +1,16 @@ 
add_llvm_library(LLVMARMDesc ARMAsmBackend.cpp ARMELFObjectWriter.cpp + ARMELFObjectWriter.cpp ARMELFStreamer.cpp + ARMMachObjectWriter.cpp + ARMMachORelocationInfo.cpp ARMMCAsmInfo.cpp ARMMCCodeEmitter.cpp ARMMCExpr.cpp ARMMCTargetDesc.cpp - ARMMachObjectWriter.cpp - ARMELFObjectWriter.cpp ARMTargetStreamer.cpp ARMUnwindOpAsm.cpp - ARMMachORelocationInfo.cpp + ARMWinCOFFObjectWriter.cpp + ARMWinCOFFStreamer.cpp ) diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp index 80af859..f6d24e9 100644 --- a/lib/Target/ARM/MLxExpansionPass.cpp +++ b/lib/Target/ARM/MLxExpansionPass.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mlx-expansion" #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMSubtarget.h" @@ -28,6 +27,8 @@ #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "mlx-expansion" + static cl::opt ForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden); static cl::opt @@ -73,7 +74,7 @@ namespace { } void MLxExpansion::clearStack() { - std::fill(LastMIs, LastMIs + 4, (MachineInstr*)0); + std::fill(LastMIs, LastMIs + 4, nullptr); MIIdx = 0; } @@ -88,7 +89,7 @@ MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const { // real definition MI. This is important for _sfp instructions. unsigned Reg = MI->getOperand(1).getReg(); if (TargetRegisterInfo::isPhysicalRegister(Reg)) - return 0; + return nullptr; MachineBasicBlock *MBB = MI->getParent(); MachineInstr *DefMI = MRI->getVRegDef(Reg); @@ -352,7 +353,7 @@ bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) { if (Domain == ARMII::DomainGeneral) { if (++Skip == 2) // Assume dual issues of non-VFP / NEON instructions. - pushStack(0); + pushStack(nullptr); } else { Skip = 0; diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt index a64707e..f4d9be3 100644 --- a/lib/Target/ARM/README-Thumb.txt +++ b/lib/Target/ARM/README-Thumb.txt @@ -215,10 +215,6 @@ etc. Almost all Thumb instructions clobber condition code. //===---------------------------------------------------------------------===// -Add ldmia, stmia support. - -//===---------------------------------------------------------------------===// - Thumb load / store address mode offsets are scaled. The values kept in the instruction operands are pre-scale values. This probably ought to be changed to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions. 
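To make the README note about scaled offsets concrete: a Thumb1 SP-relative load keeps the pre-scale value in its operand, and the hardware multiplies the 8-bit immediate by 4. The helper below is a hypothetical illustration, not backend code; the 1020-byte ceiling is the architectural imm8 * 4 limit.

  #include <cassert>
  #include <cstdint>

  // Hypothetical encoder for the scaled-offset rule described above: the
  // instruction field holds ByteOffset / 4, not ByteOffset itself.
  static uint32_t encodeThumb1SpOffset(uint32_t ByteOffset) {
    assert(ByteOffset % 4 == 0 && "offset must be word-aligned");
    assert(ByteOffset <= 1020 && "imm8 scaled by 4 tops out at 1020");
    return ByteOffset / 4;
  }

  int main() {
    assert(encodeThumb1SpOffset(16) == 4);     // 16 bytes -> imm8 of 4
    assert(encodeThumb1SpOffset(1020) == 255); // largest encodable offset
  }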
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 2224652..be29dc5 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -293,7 +293,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { AFI->setShouldRestoreSPFromFP(true); } -static bool isCSRestore(MachineInstr *MI, const uint16_t *CSRegs) { +static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) { if (MI->getOpcode() == ARM::tLDRspi && MI->getOperand(1).isFI() && isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs)) @@ -328,7 +328,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, int NumBytes = (int)MFI->getStackSize(); assert((unsigned)NumBytes >= ArgRegsSaveSize && "ArgRegsSaveSize is included in NumBytes"); - const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(); unsigned FramePtr = RegInfo->getFrameRegister(MF); if (!AFI->hasStackFrame()) { diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h index 93e2b5a..0c0abbe 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.h +++ b/lib/Target/ARM/Thumb1RegisterInfo.h @@ -56,7 +56,7 @@ public: unsigned Reg) const override; void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS = NULL) const override; + RegScavenger *RS = nullptr) const override; }; } diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp index 406dbe0..edb9ff3 100644 --- a/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "thumb2-it" #include "ARM.h" #include "ARMMachineFunctionInfo.h" #include "Thumb2InstrInfo.h" @@ -19,6 +18,8 @@ #include "llvm/CodeGen/MachineInstrBundle.h" using namespace llvm; +#define DEBUG_TYPE "thumb2-it" + STATISTIC(NumITs, "Number of IT blocks inserted"); STATISTIC(NumMovedInsts, "Number of predicated instructions moved"); diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index 04b83fb..6267ecf 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "t2-reduce-size" #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMSubtarget.h" @@ -25,6 +24,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "t2-reduce-size" + STATISTIC(NumNarrows, "Number of 32-bit instrs reduced to 16-bit ones"); STATISTIC(Num2Addrs, "Number of 32-bit instrs reduced to 2addr 16-bit ones"); STATISTIC(NumLdSts, "Number of 32-bit load / store reduced to 16-bit ones"); @@ -915,15 +916,14 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { // Yes, CPSR could be livein. bool LiveCPSR = MBB.isLiveIn(ARM::CPSR); - MachineInstr *BundleMI = 0; + MachineInstr *BundleMI = nullptr; - CPSRDef = 0; + CPSRDef = nullptr; HighLatencyCPSR = false; // Check predecessors for the latest CPSRDef. - for (MachineBasicBlock::pred_iterator - I = MBB.pred_begin(), E = MBB.pred_end(); I != E; ++I) { - const MBBInfo &PInfo = BlockInfo[(*I)->getNumber()]; + for (auto *Pred : MBB.predecessors()) { + const MBBInfo &PInfo = BlockInfo[Pred->getNumber()]; if (!PInfo.Visited) { // Since blocks are visited in RPO, this must be a back-edge. 
continue; @@ -984,7 +984,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR); if (MI->isCall()) { // Calls don't really set CPSR. - CPSRDef = 0; + CPSRDef = nullptr; HighLatencyCPSR = false; IsSelfLoop = false; } else if (DefCPSR) { diff --git a/lib/Target/ARM64/ARM64.h b/lib/Target/ARM64/ARM64.h deleted file mode 100644 index f2c5e60..0000000 --- a/lib/Target/ARM64/ARM64.h +++ /dev/null @@ -1,48 +0,0 @@ -//===-- ARM64.h - Top-level interface for ARM64 representation --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the entry points for global functions defined in the LLVM -// ARM64 back-end. -// -//===----------------------------------------------------------------------===// - -#ifndef TARGET_ARM64_H -#define TARGET_ARM64_H - -#include "MCTargetDesc/ARM64BaseInfo.h" -#include "MCTargetDesc/ARM64MCTargetDesc.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Support/DataTypes.h" - -namespace llvm { - -class ARM64TargetMachine; -class FunctionPass; -class MachineFunctionPass; - -FunctionPass *createARM64DeadRegisterDefinitions(); -FunctionPass *createARM64ConditionalCompares(); -FunctionPass *createARM64AdvSIMDScalar(); -FunctionPass *createARM64BranchRelaxation(); -FunctionPass *createARM64ISelDag(ARM64TargetMachine &TM, - CodeGenOpt::Level OptLevel); -FunctionPass *createARM64StorePairSuppressPass(); -FunctionPass *createARM64ExpandPseudoPass(); -FunctionPass *createARM64LoadStoreOptimizationPass(); -ModulePass *createARM64PromoteConstantPass(); -FunctionPass *createARM64AddressTypePromotionPass(); -/// \brief Creates an ARM-specific Target Transformation Info pass. -ImmutablePass *createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM); - -FunctionPass *createARM64CleanupLocalDynamicTLSPass(); - -FunctionPass *createARM64CollectLOHPass(); -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64.td b/lib/Target/ARM64/ARM64.td deleted file mode 100644 index 3eef8b2..0000000 --- a/lib/Target/ARM64/ARM64.td +++ /dev/null @@ -1,95 +0,0 @@ -//===- ARM64.td - Describe the ARM64 Target Machine --------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Target-independent interfaces which we are implementing -//===----------------------------------------------------------------------===// - -include "llvm/Target/Target.td" - -//===----------------------------------------------------------------------===// -// ARM64 Subtarget features. -// - -/// Cyclone has register move instructions which are "free". -def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", - "Has zereo-cycle register moves">; - -/// Cyclone has instructions which zero registers for "free". 
-def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", - "Has zero-cycle zeroing instructions">; - -//===----------------------------------------------------------------------===// -// Register File Description -//===----------------------------------------------------------------------===// - -include "ARM64RegisterInfo.td" -include "ARM64CallingConvention.td" - -//===----------------------------------------------------------------------===// -// Instruction Descriptions -//===----------------------------------------------------------------------===// - -include "ARM64Schedule.td" -include "ARM64InstrInfo.td" - -def ARM64InstrInfo : InstrInfo; - -//===----------------------------------------------------------------------===// -// ARM64 Processors supported. -// -include "ARM64SchedCyclone.td" - -def : ProcessorModel<"arm64-generic", NoSchedModel, []>; - -def : ProcessorModel<"cyclone", CycloneModel, [FeatureZCRegMove, FeatureZCZeroing]>; - -//===----------------------------------------------------------------------===// -// Assembly parser -//===----------------------------------------------------------------------===// - -def GenericAsmParserVariant : AsmParserVariant { - int Variant = 0; - string Name = "generic"; -} - -def AppleAsmParserVariant : AsmParserVariant { - int Variant = 1; - string Name = "apple-neon"; -} - -//===----------------------------------------------------------------------===// -// Assembly printer -//===----------------------------------------------------------------------===// -// ARM64 Uses the MC printer for asm output, so make sure the TableGen -// AsmWriter bits get associated with the correct class. -def GenericAsmWriter : AsmWriter { - string AsmWriterClassName = "InstPrinter"; - int Variant = 0; - bit isMCAsmWriter = 1; -} - -def AppleAsmWriter : AsmWriter { - let AsmWriterClassName = "AppleInstPrinter"; - int Variant = 1; - int isMCAsmWriter = 1; -} - -//===----------------------------------------------------------------------===// -// Target Declaration -//===----------------------------------------------------------------------===// - -def ARM64 : Target { - let InstructionSet = ARM64InstrInfo; - let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant]; - let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter]; -} diff --git a/lib/Target/ARM64/ARM64AddressTypePromotion.cpp b/lib/Target/ARM64/ARM64AddressTypePromotion.cpp deleted file mode 100644 index 72fa6af..0000000 --- a/lib/Target/ARM64/ARM64AddressTypePromotion.cpp +++ /dev/null @@ -1,496 +0,0 @@ - -//===-- ARM64AddressTypePromotion.cpp --- Promote type for addr accesses -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass tries to promote the computations use to obtained a sign extended -// value used into memory accesses. -// E.g. -// a = add nsw i32 b, 3 -// d = sext i32 a to i64 -// e = getelementptr ..., i64 d -// -// => -// f = sext i32 b to i64 -// a = add nsw i64 f, 3 -// e = getelementptr ..., i64 a -// -// This is legal to do so if the computations are markers with either nsw or nuw -// markers. -// Moreover, the current heuristic is simple: it does not create new sext -// operations, i.e., it gives up when a sext would have forked (e.g., if -// a = add i32 b, c, two sexts are required to promote the computation). 
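The deleted pass's header above hinges on those nsw/nuw markers; a self-contained sketch of why, with example values chosen here and two's-complement narrowing assumed:

  #include <cassert>
  #include <cstdint>

  // sext(add i32 b, 3) == add i64 (sext b), 3 only when the 32-bit add
  // cannot wrap, which is exactly what the nsw marker guarantees.
  static int64_t sextOfAdd(int32_t B) {
    // Model the wrapping 32-bit add, then sign-extend the result.
    int32_t Wrapped = static_cast<int32_t>(static_cast<uint32_t>(B) + 3u);
    return static_cast<int64_t>(Wrapped);
  }

  static int64_t addOfSext(int32_t B) {
    return static_cast<int64_t>(B) + 3; // sign-extend first, add in 64 bits
  }

  int main() {
    assert(sextOfAdd(41) == addOfSext(41));               // no wrap: equal
    assert(sextOfAdd(INT32_MAX) != addOfSext(INT32_MAX)); // wrap: they diverge
  }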
-// -// FIXME: This pass may be useful for other targets too. -// ===---------------------------------------------------------------------===// - -#define DEBUG_TYPE "arm64-type-promotion" -#include "ARM64.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" - -using namespace llvm; - -static cl::opt -EnableAddressTypePromotion("arm64-type-promotion", cl::Hidden, - cl::desc("Enable the type promotion pass"), - cl::init(true)); -static cl::opt -EnableMerge("arm64-type-promotion-merge", cl::Hidden, - cl::desc("Enable merging of redundant sexts when one is dominating" - " the other."), - cl::init(true)); - -//===----------------------------------------------------------------------===// -// ARM64AddressTypePromotion -//===----------------------------------------------------------------------===// - -namespace llvm { -void initializeARM64AddressTypePromotionPass(PassRegistry &); -} - -namespace { -class ARM64AddressTypePromotion : public FunctionPass { - -public: - static char ID; - ARM64AddressTypePromotion() - : FunctionPass(ID), Func(NULL), ConsideredSExtType(NULL) { - initializeARM64AddressTypePromotionPass(*PassRegistry::getPassRegistry()); - } - - virtual const char *getPassName() const { - return "ARM64 Address Type Promotion"; - } - - /// Iterate over the functions and promote the computation of interesting - // sext instructions. - bool runOnFunction(Function &F); - -private: - /// The current function. - Function *Func; - /// Filter out all sexts that does not have this type. - /// Currently initialized with Int64Ty. - Type *ConsideredSExtType; - - // This transformation requires dominator info. - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - FunctionPass::getAnalysisUsage(AU); - } - - typedef SmallPtrSet SetOfInstructions; - typedef SmallVector Instructions; - typedef DenseMap ValueToInsts; - - /// Check if it is profitable to move a sext through this instruction. - /// Currently, we consider it is profitable if: - /// - Inst is used only once (no need to insert truncate). - /// - Inst has only one operand that will require a sext operation (we do - /// do not create new sext operation). - bool shouldGetThrough(const Instruction *Inst); - - /// Check if it is possible and legal to move a sext through this - /// instruction. - /// Current heuristic considers that we can get through: - /// - Arithmetic operation marked with the nsw or nuw flag. - /// - Other sext operation. - /// - Truncate operation if it was just dropping sign extended bits. - bool canGetThrough(const Instruction *Inst); - - /// Move sext operations through safe to sext instructions. - bool propagateSignExtension(Instructions &SExtInsts); - - /// Is this sext should be considered for code motion. - /// We look for sext with ConsideredSExtType and uses in at least one - // GetElementPtrInst. - bool shouldConsiderSExt(const Instruction *SExt) const; - - /// Collect all interesting sext operations, i.e., the ones with the right - /// type and used in memory accesses. 
- /// More precisely, a sext instruction is considered as interesting if it - /// is used in a "complex" getelementptr or it exits at least another - /// sext instruction that sign extended the same initial value. - /// A getelementptr is considered as "complex" if it has more than 2 - // operands. - void analyzeSExtension(Instructions &SExtInsts); - - /// Merge redundant sign extension operations in common dominator. - void mergeSExts(ValueToInsts &ValToSExtendedUses, - SetOfInstructions &ToRemove); -}; -} // end anonymous namespace. - -char ARM64AddressTypePromotion::ID = 0; - -INITIALIZE_PASS_BEGIN(ARM64AddressTypePromotion, "arm64-type-promotion", - "ARM64 Type Promotion Pass", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(ARM64AddressTypePromotion, "arm64-type-promotion", - "ARM64 Type Promotion Pass", false, false) - -FunctionPass *llvm::createARM64AddressTypePromotionPass() { - return new ARM64AddressTypePromotion(); -} - -bool ARM64AddressTypePromotion::canGetThrough(const Instruction *Inst) { - if (isa(Inst)) - return true; - - const BinaryOperator *BinOp = dyn_cast(Inst); - if (BinOp && isa(BinOp) && - (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap())) - return true; - - // sext(trunc(sext)) --> sext - if (isa(Inst) && isa(Inst->getOperand(0))) { - const Instruction *Opnd = cast(Inst->getOperand(0)); - // Check that the truncate just drop sign extended bits. - if (Inst->getType()->getIntegerBitWidth() >= - Opnd->getOperand(0)->getType()->getIntegerBitWidth() && - Inst->getOperand(0)->getType()->getIntegerBitWidth() <= - ConsideredSExtType->getIntegerBitWidth()) - return true; - } - - return false; -} - -bool ARM64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) { - // If the type of the sext is the same as the considered one, this sext - // will become useless. - // Otherwise, we will have to do something to preserve the original value, - // unless it is used once. - if (isa(Inst) && - (Inst->getType() == ConsideredSExtType || Inst->hasOneUse())) - return true; - - // If the Inst is used more that once, we may need to insert truncate - // operations and we don't do that at the moment. - if (!Inst->hasOneUse()) - return false; - - // This truncate is used only once, thus if we can get thourgh, it will become - // useless. - if (isa(Inst)) - return true; - - // If both operands are not constant, a new sext will be created here. - // Current heuristic is: each step should be profitable. - // Therefore we don't allow to increase the number of sext even if it may - // be profitable later on. - if (isa(Inst) && isa(Inst->getOperand(1))) - return true; - - return false; -} - -static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) { - if (isa(Inst) && OpIdx == 0) - return false; - return true; -} - -bool -ARM64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const { - if (SExt->getType() != ConsideredSExtType) - return false; - - for (Value::const_use_iterator UseIt = SExt->use_begin(), - EndUseIt = SExt->use_end(); - UseIt != EndUseIt; ++UseIt) { - if (isa(*UseIt)) - return true; - } - - return false; -} - -// Input: -// - SExtInsts contains all the sext instructions that are use direclty in -// GetElementPtrInst, i.e., access to memory. -// Algorithm: -// - For each sext operation in SExtInsts: -// Let var be the operand of sext. 
-// while it is profitable (see shouldGetThrough), legal, and safe -// (see canGetThrough) to move sext through var's definition: -// * promote the type of var's definition. -// * fold var into sext uses. -// * move sext above var's definition. -// * update sext operand to use the operand of var that should be sign -// extended (by construction there is only one). -// -// E.g., -// a = ... i32 c, 3 -// b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a' -// ... -// = b -// => Yes, update the code -// b = sext i32 c to i64 -// a = ... i64 b, 3 -// ... -// = a -// Iterate on 'c'. -bool -ARM64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { - DEBUG(dbgs() << "*** Propagate Sign Extension ***\n"); - - bool LocalChange = false; - SetOfInstructions ToRemove; - ValueToInsts ValToSExtendedUses; - while (!SExtInsts.empty()) { - // Get through simple chain. - Instruction *SExt = SExtInsts.pop_back_val(); - - DEBUG(dbgs() << "Consider:\n" << *SExt << '\n'); - - // If this SExt has already been merged continue. - if (SExt->use_empty() && ToRemove.count(SExt)) { - DEBUG(dbgs() << "No uses => marked as delete\n"); - continue; - } - - // Now try to get through the chain of definitions. - while (isa(SExt->getOperand(0))) { - Instruction *Inst = dyn_cast(SExt->getOperand(0)); - DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n'); - if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) { - // We cannot get through something that is not an Instruction - // or not safe to SExt. - DEBUG(dbgs() << "Cannot get through\n"); - break; - } - - LocalChange = true; - // If this is a sign extend, it becomes useless. - if (isa(Inst) || isa(Inst)) { - DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n"); - // We cannot use replaceAllUsesWith here because we may trigger some - // assertion on the type as all involved sext operation may have not - // been moved yet. - while (!Inst->use_empty()) { - Value::use_iterator UseIt = Inst->use_begin(); - Instruction *UseInst = dyn_cast(*UseIt); - assert(UseInst && "Use of sext is not an Instruction!"); - UseInst->setOperand(UseIt->getOperandNo(), SExt); - } - ToRemove.insert(Inst); - SExt->setOperand(0, Inst->getOperand(0)); - SExt->moveBefore(Inst); - continue; - } - - // Get through the Instruction: - // 1. Update its type. - // 2. Replace the uses of SExt by Inst. - // 3. Sign extend each operand that needs to be sign extended. - - // Step #1. - Inst->mutateType(SExt->getType()); - // Step #2. - SExt->replaceAllUsesWith(Inst); - // Step #3. - Instruction *SExtForOpnd = SExt; - - DEBUG(dbgs() << "Propagate SExt to operands\n"); - for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx; - ++OpIdx) { - DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n'); - if (Inst->getOperand(OpIdx)->getType() == SExt->getType() || - !shouldSExtOperand(Inst, OpIdx)) { - DEBUG(dbgs() << "No need to propagate\n"); - continue; - } - // Check if we can statically sign extend the operand. - Value *Opnd = Inst->getOperand(OpIdx); - if (const ConstantInt *Cst = dyn_cast(Opnd)) { - DEBUG(dbgs() << "Statically sign extend\n"); - Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(), - Cst->getSExtValue())); - continue; - } - // UndefValue are typed, so we have to statically sign extend them. - if (isa(Opnd)) { - DEBUG(dbgs() << "Statically sign extend\n"); - Inst->setOperand(OpIdx, UndefValue::get(SExt->getType())); - continue; - } - - // Otherwise we have to explicity sign extend it. 
- assert(SExtForOpnd && - "Only one operand should have been sign extended"); - - SExtForOpnd->setOperand(0, Opnd); - - DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n"); - // Move the sign extension before the insertion point. - SExtForOpnd->moveBefore(Inst); - Inst->setOperand(OpIdx, SExtForOpnd); - // If more sext are required, new instructions will have to be created. - SExtForOpnd = NULL; - } - if (SExtForOpnd == SExt) { - DEBUG(dbgs() << "Sign extension is useless now\n"); - ToRemove.insert(SExt); - break; - } - } - - // If the use is already of the right type, connect its uses to its argument - // and delete it. - // This can happen for an Instruction which all uses are sign extended. - if (!ToRemove.count(SExt) && - SExt->getType() == SExt->getOperand(0)->getType()) { - DEBUG(dbgs() << "Sign extension is useless, attach its use to " - "its argument\n"); - SExt->replaceAllUsesWith(SExt->getOperand(0)); - ToRemove.insert(SExt); - } else - ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt); - } - - if (EnableMerge) - mergeSExts(ValToSExtendedUses, ToRemove); - - // Remove all instructions marked as ToRemove. - for (Instruction *I: ToRemove) - I->eraseFromParent(); - return LocalChange; -} - -void ARM64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses, - SetOfInstructions &ToRemove) { - DominatorTree &DT = getAnalysis().getDomTree(); - - for (auto &Entry: ValToSExtendedUses) { - Instructions &Insts = Entry.second; - Instructions CurPts; - for (Instruction *Inst : Insts) { - if (ToRemove.count(Inst)) - continue; - bool inserted = false; - for (auto Pt : CurPts) { - if (DT.dominates(Inst, Pt)) { - DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n" - << *Inst << '\n'); - (Pt)->replaceAllUsesWith(Inst); - ToRemove.insert(Pt); - Pt = Inst; - inserted = true; - break; - } - if (!DT.dominates(Pt, Inst)) - // Give up if we need to merge in a common dominator as the - // expermients show it is not profitable. - continue; - - DEBUG(dbgs() << "Replace all uses of:\n" << *Inst << "\nwith:\n" - << *Pt << '\n'); - Inst->replaceAllUsesWith(Pt); - ToRemove.insert(Inst); - inserted = true; - break; - } - if (!inserted) - CurPts.push_back(Inst); - } - } -} - -void ARM64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { - DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n"); - - DenseMap SeenChains; - - for (auto &BB : *Func) { - for (auto &II: BB) { - Instruction *SExt = &II; - - // Collect all sext operation per type. - if (!isa(SExt) || !shouldConsiderSExt(SExt)) - continue; - - DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n'); - - // Cases where we actually perform the optimization: - // 1. SExt is used in a getelementptr with more than 2 operand => - // likely we can merge some computation if they are done on 64 bits. - // 2. The beginning of the SExt chain is SExt several time. => - // code sharing is possible. - - bool insert = false; - // #1. - for (Value::use_iterator UseIt = SExt->use_begin(), - EndUseIt = SExt->use_end(); - UseIt != EndUseIt; ++UseIt) { - const Instruction *Inst = dyn_cast(*UseIt); - if (Inst && Inst->getNumOperands() > 2) { - DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst - << '\n'); - insert = true; - break; - } - } - - // #2. - // Check the head of the chain. 
- Instruction *Inst = SExt; - Value *Last; - do { - int OpdIdx = 0; - const BinaryOperator *BinOp = dyn_cast(Inst); - if (BinOp && isa(BinOp->getOperand(0))) - OpdIdx = 1; - Last = Inst->getOperand(OpdIdx); - Inst = dyn_cast(Last); - } while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst)); - - DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n'); - DenseMap::iterator AlreadySeen = - SeenChains.find(Last); - if (insert || AlreadySeen != SeenChains.end()) { - DEBUG(dbgs() << "Insert\n"); - SExtInsts.push_back(SExt); - if (AlreadySeen != SeenChains.end() && AlreadySeen->second != NULL) { - DEBUG(dbgs() << "Insert chain member\n"); - SExtInsts.push_back(AlreadySeen->second); - SeenChains[Last] = NULL; - } - } else { - DEBUG(dbgs() << "Record its chain membership\n"); - SeenChains[Last] = SExt; - } - } - } -} - -bool ARM64AddressTypePromotion::runOnFunction(Function &F) { - if (!EnableAddressTypePromotion || F.isDeclaration()) - return false; - Func = &F; - ConsideredSExtType = Type::getInt64Ty(Func->getContext()); - - DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n'); - - Instructions SExtInsts; - analyzeSExtension(SExtInsts); - return propagateSignExtension(SExtInsts); -} diff --git a/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp b/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp deleted file mode 100644 index 83f8cda..0000000 --- a/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp +++ /dev/null @@ -1,392 +0,0 @@ -//===-- ARM64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// When profitable, replace GPR targeting i64 instructions with their -// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined -// as minimizing the number of cross-class register copies. -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// TODO: Graph based predicate heuristics. -// Walking the instruction list linearly will get many, perhaps most, of -// the cases, but to do a truly throrough job of this, we need a more -// wholistic approach. -// -// This optimization is very similar in spirit to the register allocator's -// spill placement, only here we're determining where to place cross-class -// register copies rather than spills. As such, a similar approach is -// called for. -// -// We want to build up a set of graphs of all instructions which are candidates -// for transformation along with instructions which generate their inputs and -// consume their outputs. For each edge in the graph, we assign a weight -// based on whether there is a copy required there (weight zero if not) and -// the block frequency of the block containing the defining or using -// instruction, whichever is less. Our optimization is then a graph problem -// to minimize the total weight of all the graphs, then transform instructions -// and add or remove copy instructions as called for to implement the -// solution. 
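A toy rendering of the weighting rule in that TODO; the struct and the numbers are invented for illustration, not taken from the pass:

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  // An edge costs nothing unless a cross-class copy is required there; a
  // required copy is charged at the cheaper of the defining block and the
  // using block, per the heuristic sketched above.
  struct Edge {
    bool NeedsCopy;
    uint64_t DefBlockFreq;
    uint64_t UseBlockFreq;
  };

  static uint64_t edgeWeight(const Edge &E) {
    return E.NeedsCopy ? std::min(E.DefBlockFreq, E.UseBlockFreq) : 0;
  }

  int main() {
    assert(edgeWeight({false, 100, 10}) == 0); // no copy, no cost
    assert(edgeWeight({true, 100, 10}) == 10); // copy charged to colder block
  }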
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "arm64-simd-scalar"
-#include "ARM64.h"
-#include "ARM64InstrInfo.h"
-#include "ARM64RegisterInfo.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-static cl::opt<bool>
-AdvSIMDScalar("arm64-simd-scalar",
-              cl::desc("enable use of AdvSIMD scalar integer instructions"),
-              cl::init(false), cl::Hidden);
-// Allow forcing all i64 operations with equivalent SIMD instructions to use
-// them. For stress-testing the transformation function.
-static cl::opt<bool>
-TransformAll("arm64-simd-scalar-force-all",
-             cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
-             cl::init(false), cl::Hidden);
-
-STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
-STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
-STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");
-
-namespace {
-class ARM64AdvSIMDScalar : public MachineFunctionPass {
-  MachineRegisterInfo *MRI;
-  const ARM64InstrInfo *TII;
-
-private:
-  // isProfitableToTransform - Predicate function to determine whether an
-  // instruction should be transformed to its equivalent AdvSIMD scalar
-  // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
-  bool isProfitableToTransform(const MachineInstr *MI) const;
-
-  // transformInstruction - Perform the transformation of an instruction
-  // to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
-  // to be the correct register class, minimizing cross-class copies.
-  void transformInstruction(MachineInstr *MI);
-
-  // processMachineBasicBlock - Main optimization loop.
-  bool processMachineBasicBlock(MachineBasicBlock *MBB);
-
-public:
-  static char ID; // Pass identification, replacement for typeid.
-  explicit ARM64AdvSIMDScalar() : MachineFunctionPass(ID) {}
-
-  virtual bool runOnMachineFunction(MachineFunction &F);
-
-  const char *getPassName() const {
-    return "AdvSIMD scalar operation optimization";
-  }
-
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-};
-char ARM64AdvSIMDScalar::ID = 0;
-} // end anonymous namespace
-
-static bool isGPR64(unsigned Reg, unsigned SubReg,
-                    const MachineRegisterInfo *MRI) {
-  if (SubReg)
-    return false;
-  if (TargetRegisterInfo::isVirtualRegister(Reg))
-    return MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::GPR64RegClass);
-  return ARM64::GPR64RegClass.contains(Reg);
-}
-
-static bool isFPR64(unsigned Reg, unsigned SubReg,
-                    const MachineRegisterInfo *MRI) {
-  if (TargetRegisterInfo::isVirtualRegister(Reg))
-    return (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::FPR64RegClass) &&
-            SubReg == 0) ||
-           (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::FPR128RegClass) &&
-            SubReg == ARM64::dsub);
-  // Physical register references just check the register class directly.
-  return (ARM64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
-         (ARM64::FPR128RegClass.contains(Reg) && SubReg == ARM64::dsub);
-}
-
-// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
-// copy instruction. Return zero_reg if the instruction is not a copy.
-static unsigned getSrcFromCopy(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - unsigned &SubReg) { - SubReg = 0; - // The "FMOV Xd, Dn" instruction is the typical form. - if (MI->getOpcode() == ARM64::FMOVDXr || MI->getOpcode() == ARM64::FMOVXDr) - return MI->getOperand(1).getReg(); - // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see - // these at this stage, but it's easy to check for. - if (MI->getOpcode() == ARM64::UMOVvi64 && MI->getOperand(2).getImm() == 0) { - SubReg = ARM64::dsub; - return MI->getOperand(1).getReg(); - } - // Or just a plain COPY instruction. This can be directly to/from FPR64, - // or it can be a dsub subreg reference to an FPR128. - if (MI->getOpcode() == ARM64::COPY) { - if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), - MRI) && - isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) - return MI->getOperand(1).getReg(); - if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), - MRI) && - isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), - MRI)) { - SubReg = ARM64::dsub; - return MI->getOperand(1).getReg(); - } - } - - // Otherwise, this is some other kind of instruction. - return 0; -} - -// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent -// that we're considering transforming to, return that AdvSIMD opcode. For all -// others, return the original opcode. -static int getTransformOpcode(unsigned Opc) { - switch (Opc) { - default: - break; - // FIXME: Lots more possibilities. - case ARM64::ADDXrr: - return ARM64::ADDv1i64; - case ARM64::SUBXrr: - return ARM64::SUBv1i64; - } - // No AdvSIMD equivalent, so just return the original opcode. - return Opc; -} - -static bool isTransformable(const MachineInstr *MI) { - int Opc = MI->getOpcode(); - return Opc != getTransformOpcode(Opc); -} - -// isProfitableToTransform - Predicate function to determine whether an -// instruction should be transformed to its equivalent AdvSIMD scalar -// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. -bool ARM64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { - // If this instruction isn't eligible to be transformed (no SIMD equivalent), - // early exit since that's the common case. - if (!isTransformable(MI)) - return false; - - // Count the number of copies we'll need to add and approximate the number - // of copies that a transform will enable us to remove. - unsigned NumNewCopies = 3; - unsigned NumRemovableCopies = 0; - - unsigned OrigSrc0 = MI->getOperand(1).getReg(); - unsigned OrigSrc1 = MI->getOperand(2).getReg(); - unsigned Src0 = 0, SubReg0; - unsigned Src1 = 0, SubReg1; - if (!MRI->def_empty(OrigSrc0)) { - MachineRegisterInfo::def_instr_iterator Def = - MRI->def_instr_begin(OrigSrc0); - assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); - // If the source was from a copy, we don't need to insert a new copy. - if (Src0) - --NumNewCopies; - // If there are no other users of the original source, we can delete - // that instruction. 
-    if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0))
-      ++NumRemovableCopies;
-  }
-  if (!MRI->def_empty(OrigSrc1)) {
-    MachineRegisterInfo::def_instr_iterator Def =
-        MRI->def_instr_begin(OrigSrc1);
-    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
-    Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
-    if (Src1)
-      --NumNewCopies;
-    // If there are no other users of the original source, we can delete
-    // that instruction.
-    if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1))
-      ++NumRemovableCopies;
-  }
-
-  // If any of the uses of the original instructions is a cross class copy,
-  // that's a copy that will be removable if we transform. Likewise, if
-  // any of the uses is a transformable instruction, it's likely the transforms
-  // will chain, enabling us to save a copy there, too. This is an aggressive
-  // heuristic that approximates the graph based cost analysis described above.
-  unsigned Dst = MI->getOperand(0).getReg();
-  bool AllUsesAreCopies = true;
-  for (MachineRegisterInfo::use_instr_nodbg_iterator
-           Use = MRI->use_instr_nodbg_begin(Dst),
-           E = MRI->use_instr_nodbg_end();
-       Use != E; ++Use) {
-    unsigned SubReg;
-    if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use))
-      ++NumRemovableCopies;
-    // If the use is an INSERT_SUBREG, that's still something that can
-    // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
-    // preferable to have it use the FPR64 in most cases, as if the source
-    // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
-    // Ditto for a lane insert.
-    else if (Use->getOpcode() == ARM64::INSERT_SUBREG ||
-             Use->getOpcode() == ARM64::INSvi64gpr)
-      ;
-    else
-      AllUsesAreCopies = false;
-  }
-  // If all of the uses of the original destination register are copies to
-  // FPR64, then we won't end up having a new copy back to GPR64 either.
-  if (AllUsesAreCopies)
-    --NumNewCopies;
-
-  // If a transform will not increase the number of cross-class copies
-  // required, return true.
-  if (NumNewCopies <= NumRemovableCopies)
-    return true;
-
-  // Finally, even if we otherwise wouldn't transform, check if we're forcing
-  // transformation of everything.
-  return TransformAll;
-}
-
-static MachineInstr *insertCopy(const ARM64InstrInfo *TII, MachineInstr *MI,
-                                unsigned Dst, unsigned Src, bool IsKill) {
-  MachineInstrBuilder MIB =
-      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(ARM64::COPY),
-              Dst)
-          .addReg(Src, getKillRegState(IsKill));
-  DEBUG(dbgs() << "    adding copy: " << *MIB);
-  ++NumCopiesInserted;
-  return MIB;
-}
-
-// transformInstruction - Perform the transformation of an instruction
-// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
-// to be the correct register class, minimizing cross-class copies.
-void ARM64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
-  DEBUG(dbgs() << "Scalar transform: " << *MI);
-
-  MachineBasicBlock *MBB = MI->getParent();
-  int OldOpc = MI->getOpcode();
-  int NewOpc = getTransformOpcode(OldOpc);
-  assert(OldOpc != NewOpc && "transform an instruction to itself?!");
-
-  // Check if we need a copy for the source registers.
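// The profitability condition from isProfitableToTransform above, restated as
// a standalone sketch (the helper name is hypothetical, not from this patch):
// a transform is accepted when it does not increase the number of cross-class
// copies, starting from a worst case of three new copies (two inputs plus the
// result), or when transformation of everything is being forced.
static bool netCopyCountAcceptable(unsigned NumNewCopies,
                                   unsigned NumRemovableCopies,
                                   bool ForceAll) {
  return NumNewCopies <= NumRemovableCopies || ForceAll;
}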
-  unsigned OrigSrc0 = MI->getOperand(1).getReg();
-  unsigned OrigSrc1 = MI->getOperand(2).getReg();
-  unsigned Src0 = 0, SubReg0;
-  unsigned Src1 = 0, SubReg1;
-  if (!MRI->def_empty(OrigSrc0)) {
-    MachineRegisterInfo::def_instr_iterator Def =
-        MRI->def_instr_begin(OrigSrc0);
-    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
-    Src0 = getSrcFromCopy(&*Def, MRI, SubReg0);
-    // If there are no other users of the original source, we can delete
-    // that instruction.
-    if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) {
-      assert(Src0 && "Can't delete copy w/o a valid original source!");
-      Def->eraseFromParent();
-      ++NumCopiesDeleted;
-    }
-  }
-  if (!MRI->def_empty(OrigSrc1)) {
-    MachineRegisterInfo::def_instr_iterator Def =
-        MRI->def_instr_begin(OrigSrc1);
-    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
-    Src1 = getSrcFromCopy(&*Def, MRI, SubReg1);
-    // If there are no other users of the original source, we can delete
-    // that instruction.
-    if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) {
-      assert(Src1 && "Can't delete copy w/o a valid original source!");
-      Def->eraseFromParent();
-      ++NumCopiesDeleted;
-    }
-  }
-  // If we weren't able to reference the original source directly, create a
-  // copy.
-  if (!Src0) {
-    SubReg0 = 0;
-    Src0 = MRI->createVirtualRegister(&ARM64::FPR64RegClass);
-    insertCopy(TII, MI, Src0, OrigSrc0, true);
-  }
-  if (!Src1) {
-    SubReg1 = 0;
-    Src1 = MRI->createVirtualRegister(&ARM64::FPR64RegClass);
-    insertCopy(TII, MI, Src1, OrigSrc1, true);
-  }
-
-  // Create a vreg for the destination.
-  // FIXME: No need to do this if the ultimate user expects an FPR64.
-  // Check for that and avoid the copy if possible.
-  unsigned Dst = MRI->createVirtualRegister(&ARM64::FPR64RegClass);
-
-  // For now, all of the new instructions have the same simple three-register
-  // form, so no need to special case based on what instruction we're
-  // building.
-  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst)
-      .addReg(Src0, getKillRegState(true), SubReg0)
-      .addReg(Src1, getKillRegState(true), SubReg1);
-
-  // Now copy the result back out to a GPR.
-  // FIXME: Try to avoid this if all uses could actually just use the FPR64
-  // directly.
-  insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true);
-
-  // Erase the old instruction.
-  MI->eraseFromParent();
-
-  ++NumScalarInsnsUsed;
-}
-
-// processMachineBasicBlock - Main optimization loop.
-bool ARM64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
-  bool Changed = false;
-  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
-    MachineInstr *MI = I;
-    ++I;
-    if (isProfitableToTransform(MI)) {
-      transformInstruction(MI);
-      Changed = true;
-    }
-  }
-  return Changed;
-}
-
-// runOnMachineFunction - Pass entry point from PassManager.
-bool ARM64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
-  // Early exit if pass disabled.
-  if (!AdvSIMDScalar)
-    return false;
-
-  bool Changed = false;
-  DEBUG(dbgs() << "***** ARM64AdvSIMDScalar *****\n");
-
-  const TargetMachine &TM = mf.getTarget();
-  MRI = &mf.getRegInfo();
-  TII = static_cast<const ARM64InstrInfo *>(TM.getInstrInfo());
-
-  // Just check things on a one-block-at-a-time basis.
-  for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
-    if (processMachineBasicBlock(I))
-      Changed = true;
-  return Changed;
-}
-
-// createARM64AdvSIMDScalar - Factory function used by ARM64TargetMachine
-// to add the pass to the PassManager.
-FunctionPass *llvm::createARM64AdvSIMDScalar() { - return new ARM64AdvSIMDScalar(); -} diff --git a/lib/Target/ARM64/ARM64AsmPrinter.cpp b/lib/Target/ARM64/ARM64AsmPrinter.cpp deleted file mode 100644 index d0aa6af..0000000 --- a/lib/Target/ARM64/ARM64AsmPrinter.cpp +++ /dev/null @@ -1,563 +0,0 @@ -//===-- ARM64AsmPrinter.cpp - ARM64 LLVM assembly writer ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a printer that converts from our internal representation -// of machine-dependent LLVM code to the ARM64 assembly language. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "asm-printer" -#include "ARM64.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64MCInstLower.h" -#include "ARM64RegisterInfo.h" -#include "InstPrinter/ARM64InstPrinter.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/StackMaps.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstBuilder.h" -#include "llvm/MC/MCLinkerOptimizationHint.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/TargetRegistry.h" -using namespace llvm; - -namespace { - -class ARM64AsmPrinter : public AsmPrinter { - ARM64MCInstLower MCInstLowering; - StackMaps SM; - -public: - ARM64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), MCInstLowering(OutContext, *Mang, *this), - SM(*this), ARM64FI(NULL), LOHLabelCounter(0) {} - - virtual const char *getPassName() const { return "ARM64 Assembly Printer"; } - - /// \brief Wrapper for MCInstLowering.lowerOperand() for the - /// tblgen'erated pseudo lowering. - bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { - return MCInstLowering.lowerOperand(MO, MCOp); - } - - void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); - void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); - /// \brief tblgen'erated driver function for lowering simple MI->MC - /// pseudo instructions. 
- bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, - const MachineInstr *MI); - - void EmitInstruction(const MachineInstr *MI); - - void getAnalysisUsage(AnalysisUsage &AU) const { - AsmPrinter::getAnalysisUsage(AU); - AU.setPreservesAll(); - } - - bool runOnMachineFunction(MachineFunction &F) { - ARM64FI = F.getInfo(); - return AsmPrinter::runOnMachineFunction(F); - } - -private: - MachineLocation getDebugValueLocation(const MachineInstr *MI) const; - void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O); - bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); - bool printAsmRegInClass(const MachineOperand &MO, - const TargetRegisterClass *RC, bool isVector, - raw_ostream &O); - - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O); - bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O); - - void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); - - void EmitFunctionBodyEnd(); - - MCSymbol *GetCPISymbol(unsigned CPID) const; - void EmitEndOfAsmFile(Module &M); - ARM64FunctionInfo *ARM64FI; - - /// \brief Emit the LOHs contained in ARM64FI. - void EmitLOHs(); - - typedef std::map MInstToMCSymbol; - MInstToMCSymbol LOHInstToLabel; - unsigned LOHLabelCounter; -}; - -} // end of anonymous namespace - -//===----------------------------------------------------------------------===// - -void ARM64AsmPrinter::EmitEndOfAsmFile(Module &M) { - // Funny Darwin hack: This flag tells the linker that no global symbols - // contain code that falls through to other global symbols (e.g. the obvious - // implementation of multiple entry points). If this doesn't occur, the - // linker can safely perform dead code stripping. Since LLVM never - // generates code that does this, it is always safe to set. - OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); - SM.serializeToStackMapSection(); -} - -MachineLocation -ARM64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { - MachineLocation Location; - assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); - // Frame address. Currently handles register +- offset only. - if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) - Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); - else { - DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); - } - return Location; -} - -void ARM64AsmPrinter::EmitLOHs() { - SmallVector MCArgs; - - for (const auto &D : ARM64FI->getLOHContainer()) { - for (const MachineInstr *MI : D.getArgs()) { - MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI); - assert(LabelIt != LOHInstToLabel.end() && - "Label hasn't been inserted for LOH related instruction"); - MCArgs.push_back(LabelIt->second); - } - OutStreamer.EmitLOHDirective(D.getKind(), MCArgs); - MCArgs.clear(); - } -} - -void ARM64AsmPrinter::EmitFunctionBodyEnd() { - if (!ARM64FI->getLOHRelated().empty()) - EmitLOHs(); -} - -/// GetCPISymbol - Return the symbol for the specified constant pool entry. -MCSymbol *ARM64AsmPrinter::GetCPISymbol(unsigned CPID) const { - // Darwin uses a linker-private symbol name for constant-pools (to - // avoid addends on the relocation?), ELF has no such concept and - // uses a normal private symbol. 
- if (getDataLayout().getLinkerPrivateGlobalPrefix()[0]) - return OutContext.GetOrCreateSymbol( - Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" + - Twine(getFunctionNumber()) + "_" + Twine(CPID)); - - return OutContext.GetOrCreateSymbol( - Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" + - Twine(getFunctionNumber()) + "_" + Twine(CPID)); -} - -void ARM64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum, - raw_ostream &O) { - const MachineOperand &MO = MI->getOperand(OpNum); - switch (MO.getType()) { - default: - assert(0 && ""); - case MachineOperand::MO_Register: { - unsigned Reg = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); - assert(!MO.getSubReg() && "Subregs should be eliminated!"); - O << ARM64InstPrinter::getRegisterName(Reg); - break; - } - case MachineOperand::MO_Immediate: { - int64_t Imm = MO.getImm(); - O << '#' << Imm; - break; - } - } -} - -bool ARM64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, - raw_ostream &O) { - unsigned Reg = MO.getReg(); - switch (Mode) { - default: - return true; // Unknown mode. - case 'w': - Reg = getWRegFromXReg(Reg); - break; - case 'x': - Reg = getXRegFromWReg(Reg); - break; - } - - O << ARM64InstPrinter::getRegisterName(Reg); - return false; -} - -// Prints the register in MO using class RC using the offset in the -// new register class. This should not be used for cross class -// printing. -bool ARM64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, - const TargetRegisterClass *RC, - bool isVector, raw_ostream &O) { - assert(MO.isReg() && "Should only get here with a register!"); - const ARM64RegisterInfo *RI = - static_cast(TM.getRegisterInfo()); - unsigned Reg = MO.getReg(); - unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); - assert(RI->regsOverlap(RegToPrint, Reg)); - O << ARM64InstPrinter::getRegisterName( - RegToPrint, isVector ? ARM64::vreg : ARM64::NoRegAltName); - return false; -} - -bool ARM64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, - const char *ExtraCode, raw_ostream &O) { - const MachineOperand &MO = MI->getOperand(OpNum); - // Does this asm operand have a single letter operand modifier? - if (ExtraCode && ExtraCode[0]) { - if (ExtraCode[1] != 0) - return true; // Unknown modifier. - - switch (ExtraCode[0]) { - default: - return true; // Unknown modifier. - case 'w': // Print W register - case 'x': // Print X register - if (MO.isReg()) - return printAsmMRegister(MO, ExtraCode[0], O); - if (MO.isImm() && MO.getImm() == 0) { - unsigned Reg = ExtraCode[0] == 'w' ? ARM64::WZR : ARM64::XZR; - O << ARM64InstPrinter::getRegisterName(Reg); - return false; - } - printOperand(MI, OpNum, O); - return false; - case 'b': // Print B register. - case 'h': // Print H register. - case 's': // Print S register. - case 'd': // Print D register. - case 'q': // Print Q register. - if (MO.isReg()) { - const TargetRegisterClass *RC; - switch (ExtraCode[0]) { - case 'b': - RC = &ARM64::FPR8RegClass; - break; - case 'h': - RC = &ARM64::FPR16RegClass; - break; - case 's': - RC = &ARM64::FPR32RegClass; - break; - case 'd': - RC = &ARM64::FPR64RegClass; - break; - case 'q': - RC = &ARM64::FPR128RegClass; - break; - default: - return true; - } - return printAsmRegInClass(MO, RC, false /* vector */, O); - } - printOperand(MI, OpNum, O); - return false; - } - } - - // According to ARM, we should emit x and v registers unless we have a - // modifier. 
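// A hypothetical C-level illustration of the operand modifiers handled above
// (not taken from this patch; uses Clang/GCC extended asm on AArch64). With
// no modifier a GPR operand prints in its x-form; 'w' selects the 32-bit GPR
// form and 's' a single-precision FPR, matching the cases handled earlier.
static long asmModifierDemo(long l, float f) {
  __asm__("add %0, %0, %0" : "+r"(l));     // no modifier: prints an x register
  __asm__("add %w0, %w0, %w0" : "+r"(l));  // 'w' modifier: prints a w register
  __asm__("fadd %s0, %s0, %s0" : "+w"(f)); // 's' modifier: prints an s register
  return l + (long)f;
}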
- if (MO.isReg()) { - unsigned Reg = MO.getReg(); - - // If this is a w or x register, print an x register. - if (ARM64::GPR32allRegClass.contains(Reg) || - ARM64::GPR64allRegClass.contains(Reg)) - return printAsmMRegister(MO, 'x', O); - - // If this is a b, h, s, d, or q register, print it as a v register. - return printAsmRegInClass(MO, &ARM64::FPR128RegClass, true /* vector */, O); - } - - printOperand(MI, OpNum, O); - return false; -} - -bool ARM64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, - unsigned OpNum, unsigned AsmVariant, - const char *ExtraCode, - raw_ostream &O) { - if (ExtraCode && ExtraCode[0]) - return true; // Unknown modifier. - - const MachineOperand &MO = MI->getOperand(OpNum); - assert(MO.isReg() && "unexpected inline asm memory operand"); - O << "[" << ARM64InstPrinter::getRegisterName(MO.getReg()) << "]"; - return false; -} - -void ARM64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, - raw_ostream &OS) { - unsigned NOps = MI->getNumOperands(); - assert(NOps == 4); - OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; - // cast away const; DIetc do not take const operands for some reason. - DIVariable V(const_cast(MI->getOperand(NOps - 1).getMetadata())); - OS << V.getName(); - OS << " <- "; - // Frame address. Currently handles register +- offset only. - assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); - OS << '['; - printOperand(MI, 0, OS); - OS << '+'; - printOperand(MI, 1, OS); - OS << ']'; - OS << "+"; - printOperand(MI, NOps - 2, OS); -} - -void ARM64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { - unsigned NumNOPBytes = MI.getOperand(1).getImm(); - - SM.recordStackMap(MI); - // Emit padding. - assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); - for (unsigned i = 0; i < NumNOPBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::HINT).addImm(0)); -} - -// Lower a patchpoint of the form: -// [], , , , -void ARM64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { - SM.recordPatchPoint(MI); - - PatchPointOpers Opers(&MI); - - int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); - unsigned EncodedBytes = 0; - if (CallTarget) { - assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && - "High 16 bits of call target should be zero."); - unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); - EncodedBytes = 16; - // Materialize the jump address: - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVZWi) - .addReg(ScratchReg) - .addImm((CallTarget >> 32) & 0xFFFF) - .addImm(32)); - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVKWi) - .addReg(ScratchReg) - .addReg(ScratchReg) - .addImm((CallTarget >> 16) & 0xFFFF) - .addImm(16)); - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVKWi) - .addReg(ScratchReg) - .addReg(ScratchReg) - .addImm(CallTarget & 0xFFFF) - .addImm(0)); - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::BLR).addReg(ScratchReg)); - } - // Emit padding. 
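// A quick check of the padding arithmetic asserted below, with illustrative
// numbers (not from this patch): a patchpoint that requests 32 bytes and has
// a non-null target emits the 16-byte MOVZ/MOVK/MOVK/BLR sequence above and
// fills the remainder with 4-byte NOPs.
static_assert((32 - 16) % 4 == 0, "remaining patchpoint bytes must be NOPs");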
- unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); - assert(NumBytes >= EncodedBytes && - "Patchpoint can't request size less than the length of a call."); - assert((NumBytes - EncodedBytes) % 4 == 0 && - "Invalid number of NOP bytes requested!"); - for (unsigned i = EncodedBytes; i < NumBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::HINT).addImm(0)); -} - -// Simple pseudo-instructions have their lowering (with expansion to real -// instructions) auto-generated. -#include "ARM64GenMCPseudoLowering.inc" - -static unsigned getRealIndexedOpcode(unsigned Opc) { - switch (Opc) { - case ARM64::LDRXpre_isel: return ARM64::LDRXpre; - case ARM64::LDRWpre_isel: return ARM64::LDRWpre; - case ARM64::LDRDpre_isel: return ARM64::LDRDpre; - case ARM64::LDRSpre_isel: return ARM64::LDRSpre; - case ARM64::LDRBBpre_isel: return ARM64::LDRBBpre; - case ARM64::LDRHHpre_isel: return ARM64::LDRHHpre; - case ARM64::LDRSBWpre_isel: return ARM64::LDRSBWpre; - case ARM64::LDRSBXpre_isel: return ARM64::LDRSBXpre; - case ARM64::LDRSHWpre_isel: return ARM64::LDRSHWpre; - case ARM64::LDRSHXpre_isel: return ARM64::LDRSHXpre; - case ARM64::LDRSWpre_isel: return ARM64::LDRSWpre; - - case ARM64::LDRDpost_isel: return ARM64::LDRDpost; - case ARM64::LDRSpost_isel: return ARM64::LDRSpost; - case ARM64::LDRXpost_isel: return ARM64::LDRXpost; - case ARM64::LDRWpost_isel: return ARM64::LDRWpost; - case ARM64::LDRHHpost_isel: return ARM64::LDRHHpost; - case ARM64::LDRBBpost_isel: return ARM64::LDRBBpost; - case ARM64::LDRSWpost_isel: return ARM64::LDRSWpost; - case ARM64::LDRSHWpost_isel: return ARM64::LDRSHWpost; - case ARM64::LDRSHXpost_isel: return ARM64::LDRSHXpost; - case ARM64::LDRSBWpost_isel: return ARM64::LDRSBWpost; - case ARM64::LDRSBXpost_isel: return ARM64::LDRSBXpost; - - case ARM64::STRXpre_isel: return ARM64::STRXpre; - case ARM64::STRWpre_isel: return ARM64::STRWpre; - case ARM64::STRHHpre_isel: return ARM64::STRHHpre; - case ARM64::STRBBpre_isel: return ARM64::STRBBpre; - case ARM64::STRDpre_isel: return ARM64::STRDpre; - case ARM64::STRSpre_isel: return ARM64::STRSpre; - } - llvm_unreachable("Unexpected pre-indexed opcode!"); -} - -void ARM64AsmPrinter::EmitInstruction(const MachineInstr *MI) { - // Do any auto-generated pseudo lowerings. - if (emitPseudoExpansionLowering(OutStreamer, MI)) - return; - - if (ARM64FI->getLOHRelated().count(MI)) { - // Generate a label for LOH related instruction - MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++); - // Associate the instruction with the label - LOHInstToLabel[MI] = LOHLabel; - OutStreamer.EmitLabel(LOHLabel); - } - - // Do any manual lowerings. - switch (MI->getOpcode()) { - default: - break; - case ARM64::DBG_VALUE: { - if (isVerbose() && OutStreamer.hasRawTextSupport()) { - SmallString<128> TmpStr; - raw_svector_ostream OS(TmpStr); - PrintDebugValueComment(MI, OS); - OutStreamer.EmitRawText(StringRef(OS.str())); - } - return; - } - // Indexed loads and stores use a pseudo to handle complex operand - // tricks and writeback to the base register. We strip off the writeback - // operand and switch the opcode here. Post-indexed stores were handled by the - // tablegen'erated pseudos above. (The complex operand <--> simple - // operand isel is beyond tablegen's ability, so we do these manually). 
-  case ARM64::LDRHHpre_isel:
-  case ARM64::LDRBBpre_isel:
-  case ARM64::LDRXpre_isel:
-  case ARM64::LDRWpre_isel:
-  case ARM64::LDRDpre_isel:
-  case ARM64::LDRSpre_isel:
-  case ARM64::LDRSBWpre_isel:
-  case ARM64::LDRSBXpre_isel:
-  case ARM64::LDRSHWpre_isel:
-  case ARM64::LDRSHXpre_isel:
-  case ARM64::LDRSWpre_isel:
-  case ARM64::LDRDpost_isel:
-  case ARM64::LDRSpost_isel:
-  case ARM64::LDRXpost_isel:
-  case ARM64::LDRWpost_isel:
-  case ARM64::LDRHHpost_isel:
-  case ARM64::LDRBBpost_isel:
-  case ARM64::LDRSWpost_isel:
-  case ARM64::LDRSHWpost_isel:
-  case ARM64::LDRSHXpost_isel:
-  case ARM64::LDRSBWpost_isel:
-  case ARM64::LDRSBXpost_isel: {
-    MCInst TmpInst;
-    // For loads, the writeback operand to be skipped is the second.
-    TmpInst.setOpcode(getRealIndexedOpcode(MI->getOpcode()));
-    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
-    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(2).getReg()));
-    TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm()));
-    EmitToStreamer(OutStreamer, TmpInst);
-    return;
-  }
-  case ARM64::STRXpre_isel:
-  case ARM64::STRWpre_isel:
-  case ARM64::STRHHpre_isel:
-  case ARM64::STRBBpre_isel:
-  case ARM64::STRDpre_isel:
-  case ARM64::STRSpre_isel: {
-    MCInst TmpInst;
-    // For stores, the writeback operand to be skipped is the first.
-    TmpInst.setOpcode(getRealIndexedOpcode(MI->getOpcode()));
-    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
-    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(2).getReg()));
-    TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm()));
-    EmitToStreamer(OutStreamer, TmpInst);
-    return;
-  }
-
-  // Tail calls use pseudo instructions so they have the proper code-gen
-  // attributes (isCall, isReturn, etc.). We lower them to the real
-  // instruction here.
-  case ARM64::TCRETURNri: {
-    MCInst TmpInst;
-    TmpInst.setOpcode(ARM64::BR);
-    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
-    EmitToStreamer(OutStreamer, TmpInst);
-    return;
-  }
-  case ARM64::TCRETURNdi: {
-    MCOperand Dest;
-    MCInstLowering.lowerOperand(MI->getOperand(0), Dest);
-    MCInst TmpInst;
-    TmpInst.setOpcode(ARM64::B);
-    TmpInst.addOperand(Dest);
-    EmitToStreamer(OutStreamer, TmpInst);
-    return;
-  }
-  case ARM64::TLSDESC_BLR: {
-    MCOperand Callee, Sym;
-    MCInstLowering.lowerOperand(MI->getOperand(0), Callee);
-    MCInstLowering.lowerOperand(MI->getOperand(1), Sym);
-
-    // First emit a relocation-annotation. This expands to no code, but
-    // requests that the following instruction get an R_AARCH64_TLSDESC_CALL
-    // relocation.
-    MCInst TLSDescCall;
-    TLSDescCall.setOpcode(ARM64::TLSDESCCALL);
-    TLSDescCall.addOperand(Sym);
-    EmitToStreamer(OutStreamer, TLSDescCall);
-
-    // Other than that it's just a normal indirect call to the function loaded
-    // from the descriptor.
-    MCInst BLR;
-    BLR.setOpcode(ARM64::BLR);
-    BLR.addOperand(Callee);
-    EmitToStreamer(OutStreamer, BLR);
-
-    return;
-  }
-
-  case TargetOpcode::STACKMAP:
-    return LowerSTACKMAP(OutStreamer, SM, *MI);
-
-  case TargetOpcode::PATCHPOINT:
-    return LowerPATCHPOINT(OutStreamer, SM, *MI);
-  }
-
-  // Finally, do the automated lowerings for everything else.
-  MCInst TmpInst;
-  MCInstLowering.Lower(MI, TmpInst);
-  EmitToStreamer(OutStreamer, TmpInst);
-}
-
-// Force static initialization.
-extern "C" void LLVMInitializeARM64AsmPrinter() { - RegisterAsmPrinter X(TheARM64Target); -} diff --git a/lib/Target/ARM64/ARM64BranchRelaxation.cpp b/lib/Target/ARM64/ARM64BranchRelaxation.cpp deleted file mode 100644 index a9bbef5..0000000 --- a/lib/Target/ARM64/ARM64BranchRelaxation.cpp +++ /dev/null @@ -1,505 +0,0 @@ -//===-- ARM64BranchRelaxation.cpp - ARM64 branch relaxation ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "arm64-branch-relax" -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/CommandLine.h" -using namespace llvm; - -static cl::opt -BranchRelaxation("arm64-branch-relax", cl::Hidden, cl::init(true), - cl::desc("Relax out of range conditional branches")); - -static cl::opt -TBZDisplacementBits("arm64-tbz-offset-bits", cl::Hidden, cl::init(14), - cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); - -static cl::opt -CBZDisplacementBits("arm64-cbz-offset-bits", cl::Hidden, cl::init(19), - cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); - -static cl::opt -BCCDisplacementBits("arm64-bcc-offset-bits", cl::Hidden, cl::init(19), - cl::desc("Restrict range of Bcc instructions (DEBUG)")); - -STATISTIC(NumSplit, "Number of basic blocks split"); -STATISTIC(NumRelaxed, "Number of conditional branches relaxed"); - -namespace { -class ARM64BranchRelaxation : public MachineFunctionPass { - /// BasicBlockInfo - Information about the offset and size of a single - /// basic block. - struct BasicBlockInfo { - /// Offset - Distance from the beginning of the function to the beginning - /// of this basic block. - /// - /// The offset is always aligned as required by the basic block. - unsigned Offset; - - /// Size - Size of the basic block in bytes. If the block contains - /// inline assembly, this is a worst case estimate. - /// - /// The size does not include any alignment padding whether from the - /// beginning of the block, or from an aligned jump table at the end. - unsigned Size; - - BasicBlockInfo() : Offset(0), Size(0) {} - - /// Compute the offset immediately following this block. If LogAlign is - /// specified, return the offset the successor block will get if it has - /// this alignment. 
- unsigned postOffset(unsigned LogAlign = 0) const { - unsigned PO = Offset + Size; - unsigned Align = 1 << LogAlign; - return (PO + Align - 1) / Align * Align; - } - }; - - SmallVector BlockInfo; - - MachineFunction *MF; - const ARM64InstrInfo *TII; - - bool relaxBranchInstructions(); - void scanFunction(); - MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); - void adjustBlockOffsets(MachineBasicBlock *BB); - bool isBlockInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); - bool fixupConditionalBranch(MachineInstr *MI); - void computeBlockSize(MachineBasicBlock *MBB); - unsigned getInstrOffset(MachineInstr *MI) const; - void dumpBBs(); - void verify(); - -public: - static char ID; - ARM64BranchRelaxation() : MachineFunctionPass(ID) {} - - virtual bool runOnMachineFunction(MachineFunction &MF); - - virtual const char *getPassName() const { - return "ARM64 branch relaxation pass"; - } -}; -char ARM64BranchRelaxation::ID = 0; -} - -/// verify - check BBOffsets, BBSizes, alignment of islands -void ARM64BranchRelaxation::verify() { -#ifndef NDEBUG - unsigned PrevNum = MF->begin()->getNumber(); - for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); MBBI != E; - ++MBBI) { - MachineBasicBlock *MBB = MBBI; - unsigned Align = MBB->getAlignment(); - unsigned Num = MBB->getNumber(); - assert(BlockInfo[Num].Offset % (1u << Align) == 0); - assert(!Num || BlockInfo[PrevNum].postOffset() <= BlockInfo[Num].Offset); - PrevNum = Num; - } -#endif -} - -/// print block size and offset information - debugging -void ARM64BranchRelaxation::dumpBBs() { - for (auto &MBB: *MF) { - const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()]; - dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset) - << format("size=%#x\n", BBI.Size); - } -} - -/// BBHasFallthrough - Return true if the specified basic block can fallthrough -/// into the block immediately after it. -static bool BBHasFallthrough(MachineBasicBlock *MBB) { - // Get the next machine basic block in the function. - MachineFunction::iterator MBBI = MBB; - // Can't fall off end of function. - if (std::next(MBBI) == MBB->getParent()->end()) - return false; - - MachineBasicBlock *NextBB = std::next(MBBI); - for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), - E = MBB->succ_end(); - I != E; ++I) - if (*I == NextBB) - return true; - - return false; -} - -/// scanFunction - Do the initial scan of the function, building up -/// information about each block. -void ARM64BranchRelaxation::scanFunction() { - BlockInfo.clear(); - BlockInfo.resize(MF->getNumBlockIDs()); - - // First thing, compute the size of all basic blocks, and see if the function - // has any inline assembly in it. If so, we have to be conservative about - // alignment assumptions, as we don't know for sure the size of any - // instructions in the inline assembly. - for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - computeBlockSize(I); - - // Compute block offsets and known bits. - adjustBlockOffsets(MF->begin()); -} - -/// computeBlockSize - Compute the size for MBB. -/// This function updates BlockInfo directly. -void ARM64BranchRelaxation::computeBlockSize(MachineBasicBlock *MBB) { - unsigned Size = 0; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; - ++I) - Size += TII->GetInstSizeInBytes(I); - BlockInfo[MBB->getNumber()].Size = Size; -} - -/// getInstrOffset - Return the current offset of the specified machine -/// instruction from the start of the function. 
This offset changes as stuff is -/// moved around inside the function. -unsigned ARM64BranchRelaxation::getInstrOffset(MachineInstr *MI) const { - MachineBasicBlock *MBB = MI->getParent(); - - // The offset is composed of two things: the sum of the sizes of all MBB's - // before this instruction's block, and the offset from the start of the block - // it is in. - unsigned Offset = BlockInfo[MBB->getNumber()].Offset; - - // Sum instructions before MI in MBB. - for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { - assert(I != MBB->end() && "Didn't find MI in its own basic block?"); - Offset += TII->GetInstSizeInBytes(I); - } - return Offset; -} - -void ARM64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock *Start) { - unsigned PrevNum = Start->getNumber(); - MachineFunction::iterator MBBI = Start, E = MF->end(); - for (++MBBI; MBBI != E; ++MBBI) { - MachineBasicBlock *MBB = MBBI; - unsigned Num = MBB->getNumber(); - if (!Num) // block zero is never changed from offset zero. - continue; - // Get the offset and known bits at the end of the layout predecessor. - // Include the alignment of the current block. - unsigned LogAlign = MBBI->getAlignment(); - BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(LogAlign); - PrevNum = Num; - } -} - -/// Split the basic block containing MI into two blocks, which are joined by -/// an unconditional branch. Update data structures and renumber blocks to -/// account for this change and returns the newly created block. -/// NOTE: Successor list of the original BB is out of date after this function, -/// and must be updated by the caller! Other transforms follow using this -/// utility function, so no point updating now rather than waiting. -MachineBasicBlock * -ARM64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { - MachineBasicBlock *OrigBB = MI->getParent(); - - // Create a new MBB for the code after the OrigBB. - MachineBasicBlock *NewBB = - MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; - ++MBBI; - MF->insert(MBBI, NewBB); - - // Splice the instructions starting with MI over to NewBB. - NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); - - // Add an unconditional branch from OrigBB to NewBB. - // Note the new unconditional branch is not being recorded. - // There doesn't seem to be meaningful DebugInfo available; this doesn't - // correspond to anything in the source. - BuildMI(OrigBB, DebugLoc(), TII->get(ARM64::B)).addMBB(NewBB); - - // Insert an entry into BlockInfo to align it properly with the block numbers. - BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); - - // Figure out how large the OrigBB is. As the first half of the original - // block, it cannot contain a tablejump. The size includes - // the new jump we added. (It should be possible to do this without - // recounting everything, but it's very confusing, and this is rarely - // executed.) - computeBlockSize(OrigBB); - - // Figure out how large the NewMBB is. As the second half of the original - // block, it may contain a tablejump. - computeBlockSize(NewBB); - - // All BBOffsets following these blocks must be modified. - adjustBlockOffsets(OrigBB); - - ++NumSplit; - - return NewBB; -} - -/// isBlockInRange - Returns true if the distance between specific MI and -/// specific BB can fit in MI's displacement field. 
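// The reach implied by the displacement widths above, mirroring the MaxOffs
// computation in isBlockInRange() below: a 19-bit conditional branch
// (CB[N]Z, Bcc) spans ((1 << 18) - 1) << 2 bytes, roughly 1MiB, in each
// direction.
static_assert((((1 << (19 - 1)) - 1) << 2) == 1048572, "19-bit branch reach");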
-bool ARM64BranchRelaxation::isBlockInRange(MachineInstr *MI, - MachineBasicBlock *DestBB, - unsigned Bits) { - unsigned MaxOffs = ((1 << (Bits - 1)) - 1) << 2; - unsigned BrOffset = getInstrOffset(MI); - unsigned DestOffset = BlockInfo[DestBB->getNumber()].Offset; - - DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber() - << " from BB#" << MI->getParent()->getNumber() - << " max delta=" << MaxOffs << " from " << getInstrOffset(MI) - << " to " << DestOffset << " offset " - << int(DestOffset - BrOffset) << "\t" << *MI); - - // Branch before the Dest. - if (BrOffset <= DestOffset) - return (DestOffset - BrOffset <= MaxOffs); - return (BrOffset - DestOffset <= MaxOffs); -} - -static bool isConditionalBranch(unsigned Opc) { - switch (Opc) { - default: - return false; - case ARM64::TBZ: - case ARM64::TBNZ: - case ARM64::CBZW: - case ARM64::CBNZW: - case ARM64::CBZX: - case ARM64::CBNZX: - case ARM64::Bcc: - return true; - } -} - -static MachineBasicBlock *getDestBlock(MachineInstr *MI) { - switch (MI->getOpcode()) { - default: - assert(0 && "unexpected opcode!"); - case ARM64::TBZ: - case ARM64::TBNZ: - return MI->getOperand(2).getMBB(); - case ARM64::CBZW: - case ARM64::CBNZW: - case ARM64::CBZX: - case ARM64::CBNZX: - case ARM64::Bcc: - return MI->getOperand(1).getMBB(); - } -} - -static unsigned getOppositeConditionOpcode(unsigned Opc) { - switch (Opc) { - default: - assert(0 && "unexpected opcode!"); - case ARM64::TBNZ: return ARM64::TBZ; - case ARM64::TBZ: return ARM64::TBNZ; - case ARM64::CBNZW: return ARM64::CBZW; - case ARM64::CBNZX: return ARM64::CBZX; - case ARM64::CBZW: return ARM64::CBNZW; - case ARM64::CBZX: return ARM64::CBNZX; - case ARM64::Bcc: return ARM64::Bcc; // Condition is an operand for Bcc. - } -} - -static unsigned getBranchDisplacementBits(unsigned Opc) { - switch (Opc) { - default: - assert(0 && "unexpected opcode!"); - case ARM64::TBNZ: - case ARM64::TBZ: - return TBZDisplacementBits; - case ARM64::CBNZW: - case ARM64::CBZW: - case ARM64::CBNZX: - case ARM64::CBZX: - return CBZDisplacementBits; - case ARM64::Bcc: - return BCCDisplacementBits; - } -} - -static inline void invertBccCondition(MachineInstr *MI) { - assert(MI->getOpcode() == ARM64::Bcc && "Unexpected opcode!"); - ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(0).getImm(); - CC = ARM64CC::getInvertedCondCode(CC); - MI->getOperand(0).setImm((int64_t)CC); -} - -/// fixupConditionalBranch - Fix up a conditional branch whose destination is -/// too far away to fit in its displacement field. It is converted to an inverse -/// conditional branch + an unconditional branch to the destination. -bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { - MachineBasicBlock *DestBB = getDestBlock(MI); - - // Add an unconditional branch to the destination and invert the branch - // condition to jump over it: - // tbz L1 - // => - // tbnz L2 - // b L1 - // L2: - - // If the branch is at the end of its MBB and that has a fall-through block, - // direct the updated conditional branch to the fall-through block. Otherwise, - // split the MBB before the next instruction. - MachineBasicBlock *MBB = MI->getParent(); - MachineInstr *BMI = &MBB->back(); - bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); - - if (BMI != MI) { - if (std::next(MachineBasicBlock::iterator(MI)) == - std::prev(MBB->getLastNonDebugInstr()) && - BMI->getOpcode() == ARM64::B) { - // Last MI in the BB is an unconditional branch. 
Can we simply invert the - // condition and swap destinations: - // beq L1 - // b L2 - // => - // bne L2 - // b L1 - MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); - if (isBlockInRange(MI, NewDest, - getBranchDisplacementBits(MI->getOpcode()))) { - DEBUG(dbgs() << " Invert condition and swap its destination with " - << *BMI); - BMI->getOperand(0).setMBB(DestBB); - unsigned OpNum = - (MI->getOpcode() == ARM64::TBZ || MI->getOpcode() == ARM64::TBNZ) - ? 2 - : 1; - MI->getOperand(OpNum).setMBB(NewDest); - MI->setDesc(TII->get(getOppositeConditionOpcode(MI->getOpcode()))); - if (MI->getOpcode() == ARM64::Bcc) - invertBccCondition(MI); - return true; - } - } - } - - if (NeedSplit) { - // Analyze the branch so we know how to update the successor lists. - MachineBasicBlock *TBB, *FBB; - SmallVector Cond; - TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false); - - MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI); - // No need for the branch to the next block. We're adding an unconditional - // branch to the destination. - int delta = TII->GetInstSizeInBytes(&MBB->back()); - BlockInfo[MBB->getNumber()].Size -= delta; - MBB->back().eraseFromParent(); - // BlockInfo[SplitBB].Offset is wrong temporarily, fixed below - - // Update the successor lists according to the transformation to follow. - // Do it here since if there's no split, no update is needed. - MBB->replaceSuccessor(FBB, NewBB); - NewBB->addSuccessor(FBB); - } - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); - - DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() - << ", invert condition and change dest. to BB#" - << NextBB->getNumber() << "\n"); - - // Insert a new conditional branch and a new unconditional branch. - MachineInstrBuilder MIB = BuildMI( - MBB, DebugLoc(), TII->get(getOppositeConditionOpcode(MI->getOpcode()))) - .addOperand(MI->getOperand(0)); - if (MI->getOpcode() == ARM64::TBZ || MI->getOpcode() == ARM64::TBNZ) - MIB.addOperand(MI->getOperand(1)); - if (MI->getOpcode() == ARM64::Bcc) - invertBccCondition(MIB); - MIB.addMBB(NextBB); - BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); - BuildMI(MBB, DebugLoc(), TII->get(ARM64::B)).addMBB(DestBB); - BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); - - // Remove the old conditional branch. It may or may not still be in MBB. - BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI); - MI->eraseFromParent(); - - // Finally, keep the block offsets up to date. - adjustBlockOffsets(MBB); - return true; -} - -bool ARM64BranchRelaxation::relaxBranchInstructions() { - bool Changed = false; - // Relaxing branches involves creating new basic blocks, so re-eval - // end() for termination. - for (auto &MBB : *MF) { - MachineInstr *MI = MBB.getFirstTerminator(); - if (isConditionalBranch(MI->getOpcode()) && - !isBlockInRange(MI, getDestBlock(MI), - getBranchDisplacementBits(MI->getOpcode()))) { - fixupConditionalBranch(MI); - ++NumRelaxed; - Changed = true; - } - } - return Changed; -} - -bool ARM64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) { - MF = &mf; - - // If the pass is disabled, just bail early. - if (!BranchRelaxation) - return false; - - DEBUG(dbgs() << "***** ARM64BranchRelaxation *****\n"); - - TII = (const ARM64InstrInfo *)MF->getTarget().getInstrInfo(); - - // Renumber all of the machine basic blocks in the function, guaranteeing that - // the numbers agree with the position of the block in the function. 
- MF->RenumberBlocks(); - - // Do the initial scan of the function, building up information about the - // sizes of each block. - scanFunction(); - - DEBUG(dbgs() << " Basic blocks before relaxation\n"); - DEBUG(dumpBBs()); - - bool MadeChange = false; - while (relaxBranchInstructions()) - MadeChange = true; - - // After a while, this might be made debug-only, but it is not expensive. - verify(); - - DEBUG(dbgs() << " Basic blocks after relaxation\n"); - DEBUG(dbgs() << '\n'; dumpBBs()); - - BlockInfo.clear(); - - return MadeChange; -} - -/// createARM64BranchRelaxation - returns an instance of the constpool -/// island pass. -FunctionPass *llvm::createARM64BranchRelaxation() { - return new ARM64BranchRelaxation(); -} diff --git a/lib/Target/ARM64/ARM64CallingConv.h b/lib/Target/ARM64/ARM64CallingConv.h deleted file mode 100644 index 0128236..0000000 --- a/lib/Target/ARM64/ARM64CallingConv.h +++ /dev/null @@ -1,94 +0,0 @@ -//=== ARM64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the custom routines for the ARM64 Calling Convention that -// aren't done by tablegen. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64CALLINGCONV_H -#define ARM64CALLINGCONV_H - -#include "ARM64InstrInfo.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/Target/TargetInstrInfo.h" - -namespace llvm { - -/// CC_ARM64_Custom_i1i8i16_Reg - customized handling of passing i1/i8/i16 via -/// register. Here, ValVT can be i1/i8/i16 or i32 depending on whether the -/// argument is already promoted and LocVT is i1/i8/i16. We only promote the -/// argument to i32 if we are sure this argument will be passed in register. -static bool CC_ARM64_Custom_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, - CCState &State, - bool IsWebKitJS = false) { - static const uint16_t RegList1[] = { ARM64::W0, ARM64::W1, ARM64::W2, - ARM64::W3, ARM64::W4, ARM64::W5, - ARM64::W6, ARM64::W7 }; - static const uint16_t RegList2[] = { ARM64::X0, ARM64::X1, ARM64::X2, - ARM64::X3, ARM64::X4, ARM64::X5, - ARM64::X6, ARM64::X7 }; - static const uint16_t WebKitRegList1[] = { ARM64::W0 }; - static const uint16_t WebKitRegList2[] = { ARM64::X0 }; - - const uint16_t *List1 = IsWebKitJS ? WebKitRegList1 : RegList1; - const uint16_t *List2 = IsWebKitJS ? WebKitRegList2 : RegList2; - - if (unsigned Reg = State.AllocateReg(List1, List2, 8)) { - // Customized extra section for handling i1/i8/i16: - // We need to promote the argument to i32 if it is not done already. - if (ValVT != MVT::i32) { - if (ArgFlags.isSExt()) - LocInfo = CCValAssign::SExt; - else if (ArgFlags.isZExt()) - LocInfo = CCValAssign::ZExt; - else - LocInfo = CCValAssign::AExt; - ValVT = MVT::i32; - } - // Set LocVT to i32 as well if passing via register. - LocVT = MVT::i32; - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return true; - } - return false; -} - -/// CC_ARM64_WebKit_JS_i1i8i16_Reg - customized handling of passing i1/i8/i16 -/// via register. This behaves the same as CC_ARM64_Custom_i1i8i16_Reg, but only -/// uses the first register. 
-static bool CC_ARM64_WebKit_JS_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, - CCState &State) { - return CC_ARM64_Custom_i1i8i16_Reg(ValNo, ValVT, LocVT, LocInfo, ArgFlags, - State, true); -} - -/// CC_ARM64_Custom_i1i8i16_Stack: customized handling of passing i1/i8/i16 on -/// stack. Here, ValVT can be i1/i8/i16 or i32 depending on whether the argument -/// is already promoted and LocVT is i1/i8/i16. If ValVT is already promoted, -/// it will be truncated back to i1/i8/i16. -static bool CC_ARM64_Custom_i1i8i16_Stack(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, - CCState &State) { - unsigned Space = ((LocVT == MVT::i1 || LocVT == MVT::i8) ? 1 : 2); - unsigned Offset12 = State.AllocateStack(Space, Space); - ValVT = LocVT; - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset12, LocVT, LocInfo)); - return true; -} - -} // End llvm namespace - -#endif diff --git a/lib/Target/ARM64/ARM64CallingConvention.td b/lib/Target/ARM64/ARM64CallingConvention.td deleted file mode 100644 index 9ac888f..0000000 --- a/lib/Target/ARM64/ARM64CallingConvention.td +++ /dev/null @@ -1,210 +0,0 @@ -//===- ARM64CallingConv.td - Calling Conventions for ARM64 -*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This describes the calling conventions for ARM64 architecture. -// -//===----------------------------------------------------------------------===// - -/// CCIfAlign - Match of the original alignment of the arg -class CCIfAlign : - CCIf; - -//===----------------------------------------------------------------------===// -// ARM AAPCS64 Calling Convention -//===----------------------------------------------------------------------===// - -def CC_ARM64_AAPCS : CallingConv<[ - CCIfType<[v2f32], CCBitConvertToType>, - CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, - - // An SRet is passed in X8, not X0 like a normal pointer parameter. - CCIfSRet>>, - - // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, - // up to eight each of GPR and FPR. - CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Reg">>, - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, - // i128 is split to two i64s, we can't fit half to register X7. - CCIfType<[i64], CCIfSplit>>, - - // i128 is split to two i64s, and its stack alignment is 16 bytes. - CCIfType<[i64], CCIfSplit>>, - - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], - CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], - CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - - // If more than will fit in registers, pass them on the stack instead. 
- CCIfType<[i1, i8, i16], CCAssignToStack<8, 8>>, - CCIfType<[i32, f32], CCAssignToStack<8, 8>>, - CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], - CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> -]>; - -def RetCC_ARM64_AAPCS : CallingConv<[ - CCIfType<[v2f32], CCBitConvertToType>, - CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, - - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], - CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], - CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> -]>; - - -// Darwin uses a calling convention which differs in only two ways -// from the standard one at this level: -// + i128s (i.e. split i64s) don't need even registers. -// + Stack slots are sized as needed rather than being at least 64-bit. -def CC_ARM64_DarwinPCS : CallingConv<[ - CCIfType<[v2f32], CCBitConvertToType>, - CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, - - // An SRet is passed in X8, not X0 like a normal pointer parameter. - CCIfSRet>>, - - // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, - // up to eight each of GPR and FPR. - CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Reg">>, - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, - // i128 is split to two i64s, we can't fit half to register X7. - CCIfType<[i64], - CCIfSplit>>, - // i128 is split to two i64s, and its stack alignment is 16 bytes. - CCIfType<[i64], CCIfSplit>>, - - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], - CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], - CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - - // If more than will fit in registers, pass them on the stack instead. - CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Stack">>, - CCIfType<[i32, f32], CCAssignToStack<4, 4>>, - CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], - CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> -]>; - -def CC_ARM64_DarwinPCS_VarArg : CallingConv<[ - CCIfType<[v2f32], CCBitConvertToType>, - CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, - - // Handle all scalar types as either i64 or f64. - CCIfType<[i8, i16, i32], CCPromoteToType>, - CCIfType<[f32], CCPromoteToType>, - - // Everything is on the stack. - // i128 is split to two i64s, and its stack alignment is 16 bytes. 
- CCIfType<[i64], CCIfSplit>>, - CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> -]>; - -// The WebKit_JS calling convention only passes the first argument (the callee) -// in register and the remaining arguments on stack. We allow 32bit stack slots, -// so that WebKit can write partial values in the stack and define the other -// 32bit quantity as undef. -def CC_ARM64_WebKit_JS : CallingConv<[ - // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0). - CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_WebKit_JS_i1i8i16_Reg">>, - CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>, - CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>, - - // Pass the remaining arguments on the stack instead. - CCIfType<[i1, i8, i16], CCAssignToStack<4, 4>>, - CCIfType<[i32, f32], CCAssignToStack<4, 4>>, - CCIfType<[i64, f64], CCAssignToStack<8, 8>> -]>; - -def RetCC_ARM64_WebKit_JS : CallingConv<[ - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> -]>; - -// FIXME: LR is only callee-saved in the sense that *we* preserve it and are -// presumably a callee to someone. External functions may not do so, but this -// is currently safe since BL has LR as an implicit-def and what happens after a -// tail call doesn't matter. -// -// It would be better to model its preservation semantics properly (create a -// vreg on entry, use it in RET & tail call generation; make that vreg def if we -// end up saving LR as part of a call frame). Watch this space... -def CSR_ARM64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, - X23, X24, X25, X26, X27, X28, - D8, D9, D10, D11, - D12, D13, D14, D15)>; - -// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since -// 'this' and the pointer return value are both passed in X0 in these cases, -// this can be partially modelled by treating X0 as a callee-saved register; -// only the resulting RegMask is used; the SaveList is ignored -// -// (For generic ARM 64-bit ABI code, clang will not generate constructors or -// destructors with 'this' returns, so this RegMask will not be used in that -// case) -def CSR_ARM64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_ARM64_AAPCS, X0)>; - -// The function used by Darwin to obtain the address of a thread-local variable -// guarantees more than a normal AAPCS function. x16 and x17 are used on the -// fast path for calculation, but other registers except X0 (argument/return) -// and LR (it is a call, after all) are preserved. -def CSR_ARM64_TLS_Darwin - : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), - FP, - (sequence "Q%u", 0, 31))>; - -// The ELF stub used for TLS-descriptor access saves every feasible -// register. Only X0 and LR are clobbered. 
-def CSR_ARM64_TLS_ELF - : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP, - (sequence "Q%u", 0, 31))>; - -def CSR_ARM64_AllRegs - : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP, - (sequence "X%u", 0, 28), FP, LR, SP, - (sequence "B%u", 0, 31), (sequence "H%u", 0, 31), - (sequence "S%u", 0, 31), (sequence "D%u", 0, 31), - (sequence "Q%u", 0, 31))>; - diff --git a/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp b/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp deleted file mode 100644 index e3f8248..0000000 --- a/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp +++ /dev/null @@ -1,147 +0,0 @@ -//===-- ARM64CleanupLocalDynamicTLSPass.cpp -----------------------*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Local-dynamic access to thread-local variables proceeds in three stages. -// -// 1. The offset of this Module's thread-local area from TPIDR_EL0 is calculated -// in much the same way as a general-dynamic TLS-descriptor access against -// the special symbol _TLS_MODULE_BASE. -// 2. The variable's offset from _TLS_MODULE_BASE_ is calculated using -// instructions with "dtprel" modifiers. -// 3. These two are added, together with TPIDR_EL0, to obtain the variable's -// true address. -// -// This is only better than general-dynamic access to the variable if two or -// more of the first stage TLS-descriptor calculations can be combined. This -// pass looks through a function and performs such combinations. -// -//===----------------------------------------------------------------------===// -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64TargetMachine.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -using namespace llvm; - -namespace { -struct LDTLSCleanup : public MachineFunctionPass { - static char ID; - LDTLSCleanup() : MachineFunctionPass(ID) {} - - virtual bool runOnMachineFunction(MachineFunction &MF) { - ARM64FunctionInfo *AFI = MF.getInfo(); - if (AFI->getNumLocalDynamicTLSAccesses() < 2) { - // No point folding accesses if there isn't at least two. - return false; - } - - MachineDominatorTree *DT = &getAnalysis(); - return VisitNode(DT->getRootNode(), 0); - } - - // Visit the dominator subtree rooted at Node in pre-order. - // If TLSBaseAddrReg is non-null, then use that to replace any - // TLS_base_addr instructions. Otherwise, create the register - // when the first such instruction is seen, and then use it - // as we encounter more instructions. - bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) { - MachineBasicBlock *BB = Node->getBlock(); - bool Changed = false; - - // Traverse the current block. - for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; - ++I) { - switch (I->getOpcode()) { - case ARM64::TLSDESC_BLR: - // Make sure it's a local dynamic access. 
- if (!I->getOperand(1).isSymbol() || - strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_")) - break; - - if (TLSBaseAddrReg) - I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg); - else - I = setRegister(I, &TLSBaseAddrReg); - Changed = true; - break; - default: - break; - } - } - - // Visit the children of this block in the dominator tree. - for (MachineDomTreeNode *N : *Node) { - Changed |= VisitNode(N, TLSBaseAddrReg); - } - - return Changed; - } - - // Replace the TLS_base_addr instruction I with a copy from - // TLSBaseAddrReg, returning the new instruction. - MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I, - unsigned TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); - const ARM64TargetMachine *TM = - static_cast(&MF->getTarget()); - const ARM64InstrInfo *TII = TM->getInstrInfo(); - - // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the - // code sequence assumes the address will be. - MachineInstr *Copy = - BuildMI(*I->getParent(), I, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), ARM64::X0).addReg(TLSBaseAddrReg); - - // Erase the TLS_base_addr instruction. - I->eraseFromParent(); - - return Copy; - } - - // Create a virtal register in *TLSBaseAddrReg, and populate it by - // inserting a copy instruction after I. Returns the new instruction. - MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); - const ARM64TargetMachine *TM = - static_cast(&MF->getTarget()); - const ARM64InstrInfo *TII = TM->getInstrInfo(); - - // Create a virtual register for the TLS base address. - MachineRegisterInfo &RegInfo = MF->getRegInfo(); - *TLSBaseAddrReg = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass); - - // Insert a copy from X0 to TLSBaseAddrReg for later. - MachineInstr *Next = I->getNextNode(); - MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - *TLSBaseAddrReg).addReg(ARM64::X0); - - return Copy; - } - - virtual const char *getPassName() const { - return "Local Dynamic TLS Access Clean-up"; - } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - AU.addRequired(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; -} - -char LDTLSCleanup::ID = 0; -FunctionPass *llvm::createARM64CleanupLocalDynamicTLSPass() { - return new LDTLSCleanup(); -} diff --git a/lib/Target/ARM64/ARM64CollectLOH.cpp b/lib/Target/ARM64/ARM64CollectLOH.cpp deleted file mode 100644 index f52778f..0000000 --- a/lib/Target/ARM64/ARM64CollectLOH.cpp +++ /dev/null @@ -1,1157 +0,0 @@ -//===-------------- ARM64CollectLOH.cpp - ARM64 collect LOH pass --*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass that collect the Linker Optimization Hint (LOH). -// This pass should be run at the very end of the compilation flow, just before -// assembly printer. -// To be useful for the linker, the LOH must be printed into the assembly file. -// -// A LOH describes a sequence of instructions that may be optimized by the -// linker. -// This same sequence cannot be optimized by the compiler because some of -// the information will be known at link time. 
-// For instance, consider the following sequence: -// L1: adrp xA, sym@PAGE -// L2: add xB, xA, sym@PAGEOFF -// L3: ldr xC, [xB, #imm] -// This sequence can be turned into: -// A literal load if sym@PAGE + sym@PAGEOFF + #imm - address(L3) is < 1MB: -// L3: ldr xC, sym+#imm -// It may also be turned into either the following more efficient -// code sequences: -// - If sym@PAGEOFF + #imm fits the encoding space of L3. -// L1: adrp xA, sym@PAGE -// L3: ldr xC, [xB, sym@PAGEOFF + #imm] -// - If sym@PAGE + sym@PAGEOFF - address(L1) < 1MB: -// L1: adr xA, sym -// L3: ldr xC, [xB, #imm] -// -// To be valid a LOH must meet all the requirements needed by all the related -// possible linker transformations. -// For instance, using the running example, the constraints to emit -// ".loh AdrpAddLdr" are: -// - L1, L2, and L3 instructions are of the expected type, i.e., -// respectively ADRP, ADD (immediate), and LD. -// - The result of L1 is used only by L2. -// - The register argument (xA) used in the ADD instruction is defined -// only by L1. -// - The result of L2 is used only by L3. -// - The base address (xB) in L3 is defined only L2. -// - The ADRP in L1 and the ADD in L2 must reference the same symbol using -// @PAGE/@PAGEOFF with no additional constants -// -// Currently supported LOHs are: -// * So called non-ADRP-related: -// - .loh AdrpAddLdr L1, L2, L3: -// L1: adrp xA, sym@PAGE -// L2: add xB, xA, sym@PAGEOFF -// L3: ldr xC, [xB, #imm] -// - .loh AdrpLdrGotLdr L1, L2, L3: -// L1: adrp xA, sym@GOTPAGE -// L2: ldr xB, [xA, sym@GOTPAGEOFF] -// L3: ldr xC, [xB, #imm] -// - .loh AdrpLdr L1, L3: -// L1: adrp xA, sym@PAGE -// L3: ldr xC, [xA, sym@PAGEOFF] -// - .loh AdrpAddStr L1, L2, L3: -// L1: adrp xA, sym@PAGE -// L2: add xB, xA, sym@PAGEOFF -// L3: str xC, [xB, #imm] -// - .loh AdrpLdrGotStr L1, L2, L3: -// L1: adrp xA, sym@GOTPAGE -// L2: ldr xB, [xA, sym@GOTPAGEOFF] -// L3: str xC, [xB, #imm] -// - .loh AdrpAdd L1, L2: -// L1: adrp xA, sym@PAGE -// L2: add xB, xA, sym@PAGEOFF -// For all these LOHs, L1, L2, L3 form a simple chain: -// L1 result is used only by L2 and L2 result by L3. -// L3 LOH-related argument is defined only by L2 and L2 LOH-related argument -// by L1. -// All these LOHs aim at using more efficient load/store patterns by folding -// some instructions used to compute the address directly into the load/store. -// -// * So called ADRP-related: -// - .loh AdrpAdrp L2, L1: -// L2: ADRP xA, sym1@PAGE -// L1: ADRP xA, sym2@PAGE -// L2 dominates L1 and xA is not redifined between L2 and L1 -// This LOH aims at getting rid of redundant ADRP instructions. -// -// The overall design for emitting the LOHs is: -// 1. ARM64CollectLOH (this pass) records the LOHs in the ARM64FunctionInfo. -// 2. ARM64AsmPrinter reads the LOHs from ARM64FunctionInfo and it: -// 1. Associates them a label. -// 2. Emits them in a MCStreamer (EmitLOHDirective). -// - The MCMachOStreamer records them into the MCAssembler. -// - The MCAsmStreamer prints them. -// - Other MCStreamers ignore them. -// 3. Closes the MCStreamer: -// - The MachObjectWriter gets them from the MCAssembler and writes -// them in the object file. -// - Other ObjectWriters ignore them. 
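The chain constraints listed above boil down to a single-def/single-use walk over the use/def maps the pass builds. As a rough, self-contained illustration (toy integer instruction ids, not the pass's real MachineInstr-based structures), a check for an AdrpAddLdr-style simple chain might look like:

    #include <map>
    #include <set>

    using InstrId = int;
    using InstrSet = std::set<InstrId>;
    using InstrMap = std::map<InstrId, InstrSet>;

    // A chain L1 -> L2 -> L3 is "simple" when each link has exactly one
    // reaching definition and each definition has exactly one reachable
    // use, mirroring the ".loh AdrpAddLdr" requirements listed above.
    static bool isSimpleChain(InstrId L1, InstrId L2, InstrId L3,
                              const InstrMap &UseToDefs,
                              const InstrMap &DefToUses) {
      auto only = [](const InstrMap &M, InstrId Key, InstrId Val) {
        auto It = M.find(Key);
        return It != M.end() && It->second.size() == 1 &&
               *It->second.begin() == Val;
      };
      return only(UseToDefs, L2, L1) && only(DefToUses, L1, L2) &&
             only(UseToDefs, L3, L2) && only(DefToUses, L2, L3);
    }

    int main() {
      // L1: adrp, L2: add, L3: ldr -- a well-formed chain.
      InstrMap UseToDefs{{2, {1}}, {3, {2}}};
      InstrMap DefToUses{{1, {2}}, {2, {3}}};
      return isSimpleChain(1, 2, 3, UseToDefs, DefToUses) ? 0 : 1;
    }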
-//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "arm64-collect-loh" -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" -using namespace llvm; - -static cl::opt -PreCollectRegister("arm64-collect-loh-pre-collect-register", cl::Hidden, - cl::desc("Restrict analysis to registers invovled" - " in LOHs"), - cl::init(true)); - -static cl::opt -BasicBlockScopeOnly("arm64-collect-loh-bb-only", cl::Hidden, - cl::desc("Restrict analysis at basic block scope"), - cl::init(true)); - -STATISTIC(NumADRPSimpleCandidate, - "Number of simplifiable ADRP dominate by another"); -STATISTIC(NumADRPComplexCandidate2, - "Number of simplifiable ADRP reachable by 2 defs"); -STATISTIC(NumADRPComplexCandidate3, - "Number of simplifiable ADRP reachable by 3 defs"); -STATISTIC(NumADRPComplexCandidateOther, - "Number of simplifiable ADRP reachable by 4 or more defs"); -STATISTIC(NumADDToSTRWithImm, - "Number of simplifiable STR with imm reachable by ADD"); -STATISTIC(NumLDRToSTRWithImm, - "Number of simplifiable STR with imm reachable by LDR"); -STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD"); -STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR"); -STATISTIC(NumADDToLDRWithImm, - "Number of simplifiable LDR with imm reachable by ADD"); -STATISTIC(NumLDRToLDRWithImm, - "Number of simplifiable LDR with imm reachable by LDR"); -STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD"); -STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR"); -STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP"); -STATISTIC(NumCplxLvl1, "Number of complex case of level 1"); -STATISTIC(NumTooCplxLvl1, "Number of too complex case of level 1"); -STATISTIC(NumCplxLvl2, "Number of complex case of level 2"); -STATISTIC(NumTooCplxLvl2, "Number of too complex case of level 2"); -STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD"); -STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD"); - -namespace llvm { -void initializeARM64CollectLOHPass(PassRegistry &); -} - -namespace { -struct ARM64CollectLOH : public MachineFunctionPass { - static char ID; - ARM64CollectLOH() : MachineFunctionPass(ID) { - initializeARM64CollectLOHPass(*PassRegistry::getPassRegistry()); - } - - virtual bool runOnMachineFunction(MachineFunction &Fn); - - virtual const char *getPassName() const { - return "ARM64 Collect Linker Optimization Hint (LOH)"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); - } - -private: -}; - -/// A set of MachineInstruction. 
-typedef SetVector<const MachineInstr *> SetOfMachineInstr;
-/// Map a basic block to a set of instructions per register.
-/// This is used to represent the exposed uses of a basic block
-/// per register.
-typedef MapVector<const MachineBasicBlock *, SetOfMachineInstr *>
-BlockToSetOfInstrsPerColor;
-/// Map a basic block to an instruction per register.
-/// This is used to represent the live-out definitions of a basic block
-/// per register.
-typedef MapVector<const MachineBasicBlock *, const MachineInstr **>
-BlockToInstrPerColor;
-/// Map an instruction to a set of instructions. Used to represent the
-/// mapping def to reachable uses or use to definitions.
-typedef MapVector<const MachineInstr *, SetOfMachineInstr> InstrToInstrs;
-/// Map a basic block to a BitVector.
-/// This is used to record the kill registers per basic block.
-typedef MapVector<const MachineBasicBlock *, BitVector> BlockToRegSet;
-
-/// Map a register to a dense id.
-typedef DenseMap<unsigned, unsigned> MapRegToId;
-/// Map a dense id to a register. Used for debug purposes.
-typedef SmallVector<unsigned, 32> MapIdToReg;
-} // end anonymous namespace.
-
-char ARM64CollectLOH::ID = 0;
-
-INITIALIZE_PASS_BEGIN(ARM64CollectLOH, "arm64-collect-loh",
-                      "ARM64 Collect Linker Optimization Hint (LOH)", false,
-                      false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(ARM64CollectLOH, "arm64-collect-loh",
-                    "ARM64 Collect Linker Optimization Hint (LOH)", false,
-                    false)
-
-/// Given a couple (MBB, reg) get the corresponding set of instructions from
-/// the given "sets".
-/// If this couple does not reference any set, an empty set is added to "sets"
-/// for this couple and returned.
-/// \param nbRegs is used internally to allocate some memory. It must be
-/// consistent with the way sets is used.
-static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets,
-                                 const MachineBasicBlock *MBB, unsigned reg,
-                                 unsigned nbRegs) {
-  SetOfMachineInstr *result;
-  BlockToSetOfInstrsPerColor::iterator it = sets.find(MBB);
-  if (it != sets.end()) {
-    result = it->second;
-  } else {
-    result = sets[MBB] = new SetOfMachineInstr[nbRegs];
-  }
-
-  return result[reg];
-}
-
-/// Given a couple (reg, MI) get the corresponding set of instructions from
-/// the given "sets".
-/// This is used to get the uses recorded in sets of a definition identified by
-/// MI and reg, i.e., MI defines reg.
-/// If the couple does not reference anything, an empty set is added to
-/// "sets[reg]".
-/// \pre set[reg] is valid.
-static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg,
-                                  const MachineInstr *MI) {
-  return sets[reg][MI];
-}
-
-/// Same as getUses but does not modify the input map: sets.
-/// \return NULL if the couple (reg, MI) is not in sets.
-static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg,
-                                        const MachineInstr *MI) {
-  InstrToInstrs::const_iterator Res = sets[reg].find(MI);
-  if (Res != sets[reg].end())
-    return &(Res->second);
-  return NULL;
-}
-
-/// Initialize the reaching definition algorithm:
-/// For each basic block BB in MF, record:
-/// - its kill set.
-/// - its reachable uses (uses that are exposed to BB's predecessors).
-/// - its generated definitions.
-/// \param DummyOp if not NULL, specifies a Dummy Operation to be added to
-/// the list of uses of exposed definitions.
-/// \param ADRPMode specifies to only consider ADRP instructions for generated
-/// definitions. It also considers definitions of ADRP instructions as uses and
-/// ignores other uses. The ADRPMode is used to collect the information for
-/// LOHs that involve ADRP operations only.
-static void initReachingDef(MachineFunction *MF, - InstrToInstrs *ColorOpToReachedUses, - BlockToInstrPerColor &Gen, BlockToRegSet &Kill, - BlockToSetOfInstrsPerColor &ReachableUses, - const MapRegToId &RegToId, - const MachineInstr *DummyOp, bool ADRPMode) { - const TargetMachine &TM = MF->getTarget(); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); - - unsigned NbReg = RegToId.size(); - - for (MachineFunction::const_iterator IMBB = MF->begin(), IMBBEnd = MF->end(); - IMBB != IMBBEnd; ++IMBB) { - const MachineBasicBlock *MBB = &(*IMBB); - const MachineInstr **&BBGen = Gen[MBB]; - BBGen = new const MachineInstr *[NbReg]; - memset(BBGen, 0, sizeof(const MachineInstr *) * NbReg); - - BitVector &BBKillSet = Kill[MBB]; - BBKillSet.resize(NbReg); - for (MachineBasicBlock::const_iterator II = MBB->begin(), IEnd = MBB->end(); - II != IEnd; ++II) { - bool IsADRP = II->getOpcode() == ARM64::ADRP; - - // Process uses first. - if (IsADRP || !ADRPMode) - for (MachineInstr::const_mop_iterator IO = II->operands_begin(), - IOEnd = II->operands_end(); - IO != IOEnd; ++IO) { - // Treat ADRP def as use, as the goal of the analysis is to find - // ADRP defs reached by other ADRP defs. - if (!IO->isReg() || (!ADRPMode && !IO->isUse()) || - (ADRPMode && (!IsADRP || !IO->isDef()))) - continue; - unsigned CurReg = IO->getReg(); - MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); - if (ItCurRegId == RegToId.end()) - continue; - CurReg = ItCurRegId->second; - - // if CurReg has not been defined, this use is reachable. - if (!BBGen[CurReg] && !BBKillSet.test(CurReg)) - getSet(ReachableUses, MBB, CurReg, NbReg).insert(&(*II)); - // current basic block definition for this color, if any, is in Gen. - if (BBGen[CurReg]) - getUses(ColorOpToReachedUses, CurReg, BBGen[CurReg]).insert(&(*II)); - } - - // Process clobbers. - for (MachineInstr::const_mop_iterator IO = II->operands_begin(), - IOEnd = II->operands_end(); - IO != IOEnd; ++IO) { - if (!IO->isRegMask()) - continue; - // Clobbers kill the related colors. - const uint32_t *PreservedRegs = IO->getRegMask(); - - // Set generated regs. - for (const auto Entry : RegToId) { - unsigned Reg = Entry.second; - // Use the global register ID when querying APIs external to this - // pass. - if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) { - // Do not register clobbered definition for no ADRP. - // This definition is not used anyway (otherwise register - // allocation is wrong). - BBGen[Reg] = ADRPMode ? II : NULL; - BBKillSet.set(Reg); - } - } - } - - // Process defs - for (MachineInstr::const_mop_iterator IO = II->operands_begin(), - IOEnd = II->operands_end(); - IO != IOEnd; ++IO) { - if (!IO->isReg() || !IO->isDef()) - continue; - unsigned CurReg = IO->getReg(); - MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); - if (ItCurRegId == RegToId.end()) - continue; - - for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) { - MapRegToId::const_iterator ItRegId = RegToId.find(*AI); - assert(ItRegId != RegToId.end() && - "Sub-register of an " - "involved register, not recorded as involved!"); - BBKillSet.set(ItRegId->second); - BBGen[ItRegId->second] = &(*II); - } - BBGen[ItCurRegId->second] = &(*II); - } - } - - // If we restrict our analysis to basic block scope, conservatively add a - // dummy - // use for each generated value. 
- if (!ADRPMode && DummyOp && !MBB->succ_empty()) - for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) - if (BBGen[CurReg]) - getUses(ColorOpToReachedUses, CurReg, BBGen[CurReg]).insert(DummyOp); - } -} - -/// Reaching def core algorithm: -/// while an Out has changed -/// for each bb -/// for each color -/// In[bb][color] = U Out[bb.predecessors][color] -/// insert reachableUses[bb][color] in each in[bb][color] -/// op.reachedUses -/// -/// Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) -static void reachingDefAlgorithm(MachineFunction *MF, - InstrToInstrs *ColorOpToReachedUses, - BlockToSetOfInstrsPerColor &In, - BlockToSetOfInstrsPerColor &Out, - BlockToInstrPerColor &Gen, BlockToRegSet &Kill, - BlockToSetOfInstrsPerColor &ReachableUses, - unsigned NbReg) { - bool HasChanged; - do { - HasChanged = false; - for (MachineFunction::const_iterator IMBB = MF->begin(), - IMBBEnd = MF->end(); - IMBB != IMBBEnd; ++IMBB) { - const MachineBasicBlock *MBB = &(*IMBB); - unsigned CurReg; - for (CurReg = 0; CurReg < NbReg; ++CurReg) { - SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg); - SetOfMachineInstr &BBReachableUses = - getSet(ReachableUses, MBB, CurReg, NbReg); - SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg); - unsigned Size = BBOutSet.size(); - // In[bb][color] = U Out[bb.predecessors][color] - for (MachineBasicBlock::const_pred_iterator - PredMBB = MBB->pred_begin(), - EndPredMBB = MBB->pred_end(); - PredMBB != EndPredMBB; ++PredMBB) { - SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg); - BBInSet.insert(PredOutSet.begin(), PredOutSet.end()); - } - // insert reachableUses[bb][color] in each in[bb][color] op.reachedses - for (const MachineInstr *MI: BBInSet) { - SetOfMachineInstr &OpReachedUses = - getUses(ColorOpToReachedUses, CurReg, MI); - OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end()); - } - // Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) - if (!Kill[MBB].test(CurReg)) - BBOutSet.insert(BBInSet.begin(), BBInSet.end()); - if (Gen[MBB][CurReg]) - BBOutSet.insert(Gen[MBB][CurReg]); - HasChanged |= BBOutSet.size() != Size; - } - } - } while (HasChanged); -} - -/// Release all memory dynamically allocated during the reaching -/// definition algorithm. -static void finitReachingDef(BlockToSetOfInstrsPerColor &In, - BlockToSetOfInstrsPerColor &Out, - BlockToInstrPerColor &Gen, - BlockToSetOfInstrsPerColor &ReachableUses) { - for (BlockToSetOfInstrsPerColor::const_iterator IT = Out.begin(), - End = Out.end(); - IT != End; ++IT) - delete[] IT->second; - for (BlockToSetOfInstrsPerColor::const_iterator IT = In.begin(), - End = In.end(); - IT != End; ++IT) - delete[] IT->second; - for (BlockToSetOfInstrsPerColor::const_iterator IT = ReachableUses.begin(), - End = ReachableUses.end(); - IT != End; ++IT) - delete[] IT->second; - for (BlockToInstrPerColor::const_iterator IT = Gen.begin(), End = Gen.end(); - IT != End; ++IT) - delete[] IT->second; -} - -/// Reaching definiton algorithm. -/// \param MF function on which the algorithm will operate. -/// \param[out] ColorOpToReachedUses will contain the result of the reaching -/// def algorithm. -/// \param ADRPMode specify whether the reaching def algorithm should be tuned -/// for ADRP optimization. \see initReachingDef for more details. -/// \param DummyOp if not NULL, the algorithm will work at -/// basic block scope and will set for every exposed defintion a use to -/// @p DummyOp. -/// \pre ColorOpToReachedUses is an array of at least number of registers of -/// InstrToInstrs. 
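The In/Out equations that reachingDefAlgorithm above iterates are the textbook reaching-definitions fixpoint. A minimal standalone sketch of the same iteration over plain bitsets (toy CFG with one bit per definition; the pass's per-color instruction sets are considerably richer) could read:

    #include <bitset>
    #include <vector>

    constexpr int NumDefs = 4;
    using DefSet = std::bitset<NumDefs>;

    struct Block {
      std::vector<int> Preds;
      DefSet Gen, Kill, In, Out;
    };

    // Iterate In[b] = union of Out[p] over preds(b) and
    // Out[b] = Gen[b] | (In[b] & ~Kill[b]) until a fixpoint is reached.
    static void reachingDefs(std::vector<Block> &CFG) {
      bool Changed = true;
      while (Changed) {
        Changed = false;
        for (Block &B : CFG) {
          DefSet In;
          for (int P : B.Preds)
            In |= CFG[P].Out;
          DefSet Out = B.Gen | (In & ~B.Kill);
          if (In != B.In || Out != B.Out) {
            B.In = In;
            B.Out = Out;
            Changed = true;
          }
        }
      }
    }

    int main() {
      // Block 0 generates d0; block 1 kills d0 and generates d1.
      std::vector<Block> CFG(2);
      CFG[0].Gen = DefSet("0001");
      CFG[1].Preds = {0};
      CFG[1].Gen = DefSet("0010");
      CFG[1].Kill = DefSet("0001");
      reachingDefs(CFG);
      // Out[1] is "0010": d0 was killed, d1 survives.
      return CFG[1].Out == DefSet("0010") ? 0 : 1;
    }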
-static void reachingDef(MachineFunction *MF, - InstrToInstrs *ColorOpToReachedUses, - const MapRegToId &RegToId, bool ADRPMode = false, - const MachineInstr *DummyOp = NULL) { - // structures: - // For each basic block. - // Out: a set per color of definitions that reach the - // out boundary of this block. - // In: Same as Out but for in boundary. - // Gen: generated color in this block (one operation per color). - // Kill: register set of killed color in this block. - // ReachableUses: a set per color of uses (operation) reachable - // for "In" definitions. - BlockToSetOfInstrsPerColor Out, In, ReachableUses; - BlockToInstrPerColor Gen; - BlockToRegSet Kill; - - // Initialize Gen, kill and reachableUses. - initReachingDef(MF, ColorOpToReachedUses, Gen, Kill, ReachableUses, RegToId, - DummyOp, ADRPMode); - - // Algo. - if (!DummyOp) - reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill, - ReachableUses, RegToId.size()); - - // finit. - finitReachingDef(In, Out, Gen, ReachableUses); -} - -#ifndef NDEBUG -/// print the result of the reaching definition algorithm. -static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses, - unsigned NbReg, const TargetRegisterInfo *TRI, - const MapIdToReg &IdToReg) { - unsigned CurReg; - for (CurReg = 0; CurReg < NbReg; ++CurReg) { - if (ColorOpToReachedUses[CurReg].empty()) - continue; - DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n"); - - InstrToInstrs::const_iterator DefsIt = ColorOpToReachedUses[CurReg].begin(); - InstrToInstrs::const_iterator DefsItEnd = - ColorOpToReachedUses[CurReg].end(); - for (; DefsIt != DefsItEnd; ++DefsIt) { - DEBUG(dbgs() << "Def:\n"); - DEBUG(DefsIt->first->print(dbgs())); - DEBUG(dbgs() << "Reachable uses:\n"); - for (SetOfMachineInstr::const_iterator UsesIt = DefsIt->second.begin(), - UsesItEnd = DefsIt->second.end(); - UsesIt != UsesItEnd; ++UsesIt) { - DEBUG((*UsesIt)->print(dbgs())); - } - } - } -} -#endif // NDEBUG - -/// Answer the following question: Can Def be one of the definition -/// involved in a part of a LOH? -static bool canDefBePartOfLOH(const MachineInstr *Def) { - unsigned Opc = Def->getOpcode(); - // Accept ADRP, ADDLow and LOADGot. - switch (Opc) { - default: - return false; - case ARM64::ADRP: - return true; - case ARM64::ADDXri: - // Check immediate to see if the immediate is an address. - switch (Def->getOperand(2).getType()) { - default: - return false; - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_JumpTableIndex: - case MachineOperand::MO_ConstantPoolIndex: - case MachineOperand::MO_BlockAddress: - return true; - } - case ARM64::LDRXui: - // Check immediate to see if the immediate is an address. - switch (Def->getOperand(2).getType()) { - default: - return false; - case MachineOperand::MO_GlobalAddress: - return true; - } - } - // Unreachable. - return false; -} - -/// Check whether the given instruction can the end of a LOH chain involving a -/// store. -static bool isCandidateStore(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { - default: - return false; - case ARM64::STRBui: - case ARM64::STRHui: - case ARM64::STRWui: - case ARM64::STRXui: - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - // In case we have str xA, [xA, #imm], this is two different uses - // of xA and we cannot fold, otherwise the xA stored may be wrong, - // even if #imm == 0. 
-    if (Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg())
-      return true;
-  }
-  return false;
-}
-
-/// Given the result of a reaching definition algorithm in ColorOpToReachedUses,
-/// build the use-to-defs information and filter out obvious non-LOH candidates.
-/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions.
-/// In non-ADRPMode, non-LOH candidates are "uses" with several definitions,
-/// i.e., no simple chain.
-/// \param ADRPMode -- \see initReachingDef.
-static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs,
-                              const InstrToInstrs *ColorOpToReachedUses,
-                              const MapRegToId &RegToId,
-                              bool ADRPMode = false) {
-
-  SetOfMachineInstr NotCandidate;
-  unsigned NbReg = RegToId.size();
-  MapRegToId::const_iterator EndIt = RegToId.end();
-  for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) {
-    // If this color is never defined, continue.
-    if (ColorOpToReachedUses[CurReg].empty())
-      continue;
-
-    InstrToInstrs::const_iterator DefsIt = ColorOpToReachedUses[CurReg].begin();
-    InstrToInstrs::const_iterator DefsItEnd =
-        ColorOpToReachedUses[CurReg].end();
-    for (; DefsIt != DefsItEnd; ++DefsIt) {
-      for (SetOfMachineInstr::const_iterator UsesIt = DefsIt->second.begin(),
-                                             UsesItEnd = DefsIt->second.end();
-           UsesIt != UsesItEnd; ++UsesIt) {
-        const MachineInstr *Def = DefsIt->first;
-        MapRegToId::const_iterator It;
-        // If all the reaching defs are not ADRP, this use will not be
-        // simplifiable.
-        if ((ADRPMode && Def->getOpcode() != ARM64::ADRP) ||
-            (!ADRPMode && !canDefBePartOfLOH(Def)) ||
-            (!ADRPMode && isCandidateStore(*UsesIt) &&
-             // Stores are LOH candidates iff the end of the chain is used as
-             // base.
-             ((It = RegToId.find((*UsesIt)->getOperand(1).getReg())) == EndIt ||
-              It->second != CurReg))) {
-          NotCandidate.insert(*UsesIt);
-          continue;
-        }
-        // Do not consider self reaching as a simplifiable case for ADRP.
-        if (!ADRPMode || *UsesIt != DefsIt->first) {
-          UseToReachingDefs[*UsesIt].insert(DefsIt->first);
-          // If UsesIt has several reaching definitions, it is not a
-          // candidate for simplification in non-ADRPMode.
-          if (!ADRPMode && UseToReachingDefs[*UsesIt].size() > 1)
-            NotCandidate.insert(*UsesIt);
-        }
-      }
-    }
-  }
-  for (const MachineInstr *Elem : NotCandidate) {
-    DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n");
-    // It would have been better if we could just remove the entry
-    // from the map. Because of that, we have to filter the garbage
-    // (second.empty) in the subsequent analysis.
-    UseToReachingDefs[Elem].clear();
-  }
-}
-
-/// Based on the use-to-defs information (in ADRPMode), compute the
-/// ADRP-related LOH opportunities.
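Before reading computeADRP below, the inversion-and-filter step performed by reachedUsesToDefs above can be pictured with a small sketch (toy integer ids again, not the pass's types): invert the def-to-uses map, then empty any use reached by several definitions, since such a use cannot sit on a simple chain:

    #include <map>
    #include <set>

    using InstrId = int;
    using InstrSet = std::set<InstrId>;
    using InstrMap = std::map<InstrId, InstrSet>;

    // Invert DefToUses into UseToDefs; uses with more than one reaching
    // definition are cleared, mirroring the "garbage" entries that the
    // pass filters out in the subsequent analysis.
    static InstrMap invertAndFilter(const InstrMap &DefToUses) {
      InstrMap UseToDefs;
      for (const auto &Entry : DefToUses)
        for (InstrId Use : Entry.second)
          UseToDefs[Use].insert(Entry.first);
      for (auto &Entry : UseToDefs)
        if (Entry.second.size() > 1)
          Entry.second.clear();
      return UseToDefs;
    }

    int main() {
      // Use 3 is reached by defs 1 and 2 -> dropped; use 4 keeps def 2.
      InstrMap UseToDefs = invertAndFilter({{1, {3}}, {2, {3, 4}}});
      return UseToDefs[3].empty() && UseToDefs[4].count(2) ? 0 : 1;
    }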
-static void computeADRP(const InstrToInstrs &UseToDefs,
-                        ARM64FunctionInfo &ARM64FI,
-                        const MachineDominatorTree *MDT) {
-  DEBUG(dbgs() << "*** Compute LOH for ADRP\n");
-  for (const auto &Entry : UseToDefs) {
-    unsigned Size = Entry.second.size();
-    if (Size == 0)
-      continue;
-    if (Size == 1) {
-      const MachineInstr *L2 = *Entry.second.begin();
-      const MachineInstr *L1 = Entry.first;
-      if (!MDT->dominates(L2, L1)) {
-        DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1
-                     << '\n');
-        continue;
-      }
-      DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n');
-      SmallVector<const MachineInstr *, 2> Args;
-      Args.push_back(L2);
-      Args.push_back(L1);
-      ARM64FI.addLOHDirective(MCLOH_AdrpAdrp, Args);
-      ++NumADRPSimpleCandidate;
-    }
-#ifdef DEBUG
-    else if (Size == 2)
-      ++NumADRPComplexCandidate2;
-    else if (Size == 3)
-      ++NumADRPComplexCandidate3;
-    else
-      ++NumADRPComplexCandidateOther;
-#endif
-    // If Size < 1, the use should have been removed from the candidates.
-    assert(Size >= 1 && "No reaching defs for that use!");
-  }
-}
-
-/// Check whether the given instruction can be the end of a LOH chain
-/// involving a load.
-static bool isCandidateLoad(const MachineInstr *Instr) {
-  switch (Instr->getOpcode()) {
-  default:
-    return false;
-  case ARM64::LDRSBWui:
-  case ARM64::LDRSBXui:
-  case ARM64::LDRSHWui:
-  case ARM64::LDRSHXui:
-  case ARM64::LDRSWui:
-  case ARM64::LDRBui:
-  case ARM64::LDRHui:
-  case ARM64::LDRWui:
-  case ARM64::LDRXui:
-  case ARM64::LDRSui:
-  case ARM64::LDRDui:
-  case ARM64::LDRQui:
-    if (Instr->getOperand(2).getTargetFlags() & ARM64II::MO_GOT)
-      return false;
-    return true;
-  }
-  // Unreachable.
-  return false;
-}
-
-/// Check whether the given instruction can load a literal.
-static bool supportLoadFromLiteral(const MachineInstr *Instr) {
-  switch (Instr->getOpcode()) {
-  default:
-    return false;
-  case ARM64::LDRSWui:
-  case ARM64::LDRWui:
-  case ARM64::LDRXui:
-  case ARM64::LDRSui:
-  case ARM64::LDRDui:
-  case ARM64::LDRQui:
-    return true;
-  }
-  // Unreachable.
-  return false;
-}
-
-/// Check whether the given instruction is a LOH candidate.
-/// \param UseToDefs is used to check that Instr is at the end of a supported
-/// LOH chain.
-/// \pre UseToDefs contains only one def per use, i.e., obvious non-candidates
-/// have already been filtered out.
-static bool isCandidate(const MachineInstr *Instr,
-                        const InstrToInstrs &UseToDefs,
-                        const MachineDominatorTree *MDT) {
-  if (!isCandidateLoad(Instr) && !isCandidateStore(Instr))
-    return false;
-
-  const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin();
-  if (Def->getOpcode() != ARM64::ADRP) {
-    // At this point, Def is ADDXri or LDRXui of the right type of
-    // symbol, because we filtered out the uses that were not defined
-    // by these kinds of instructions (+ ADRP).
-
-    // Check if this forms a simple chain: each intermediate node must
-    // dominate the next one.
-    if (!MDT->dominates(Def, Instr))
-      return false;
-    // Move one node up in the simple chain.
-    if (UseToDefs.find(Def) == UseToDefs.end()
-        // The map may contain garbage we have to ignore.
-        ||
-        UseToDefs.find(Def)->second.empty())
-      return false;
-    Instr = Def;
-    Def = *UseToDefs.find(Def)->second.begin();
-  }
-  // Check if we reached the top of the simple chain:
-  // - top is ADRP.
-  // - check the simple chain property: each intermediate node must
-  //   dominate the next one.
- if (Def->getOpcode() == ARM64::ADRP) - return MDT->dominates(Def, Instr); - return false; -} - -static bool registerADRCandidate(const MachineInstr *Use, - const InstrToInstrs &UseToDefs, - const InstrToInstrs *DefsPerColorToUses, - ARM64FunctionInfo &ARM64FI, - SetOfMachineInstr *InvolvedInLOHs, - const MapRegToId &RegToId) { - // Look for opportunities to turn ADRP -> ADD or - // ADRP -> LDR GOTPAGEOFF into ADR. - // If ADRP has more than one use. Give up. - if (Use->getOpcode() != ARM64::ADDXri && - (Use->getOpcode() != ARM64::LDRXui || - !(Use->getOperand(2).getTargetFlags() & ARM64II::MO_GOT))) - return false; - InstrToInstrs::const_iterator It = UseToDefs.find(Use); - // The map may contain garbage that we need to ignore. - if (It == UseToDefs.end() || It->second.empty()) - return false; - const MachineInstr *Def = *It->second.begin(); - if (Def->getOpcode() != ARM64::ADRP) - return false; - // Check the number of users of ADRP. - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def->getOperand(0).getReg())->second, Def); - if (Users->size() > 1) { - ++NumADRComplexCandidate; - return false; - } - ++NumADRSimpleCandidate; - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Def)) && - "ADRP already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Use)) && - "ADD already involved in LOH."); - DEBUG(dbgs() << "Record AdrpAdd\n" << *Def << '\n' << *Use << '\n'); - - SmallVector Args; - Args.push_back(Def); - Args.push_back(Use); - - ARM64FI.addLOHDirective(Use->getOpcode() == ARM64::ADDXri ? MCLOH_AdrpAdd - : MCLOH_AdrpLdrGot, - Args); - return true; -} - -/// Based on the use to defs information (in non-ADRPMode), compute the -/// opportunities of LOH non-ADRP-related -static void computeOthers(const InstrToInstrs &UseToDefs, - const InstrToInstrs *DefsPerColorToUses, - ARM64FunctionInfo &ARM64FI, const MapRegToId &RegToId, - const MachineDominatorTree *MDT) { - SetOfMachineInstr *InvolvedInLOHs = NULL; -#ifdef DEBUG - SetOfMachineInstr InvolvedInLOHsStorage; - InvolvedInLOHs = &InvolvedInLOHsStorage; -#endif // DEBUG - DEBUG(dbgs() << "*** Compute LOH for Others\n"); - // ADRP -> ADD/LDR -> LDR/STR pattern. - // Fall back to ADRP -> ADD pattern if we fail to catch the bigger pattern. - - // FIXME: When the statistics are not important, - // This initial filtering loop can be merged into the next loop. - // Currently, we didn't do it to have the same code for both DEBUG and - // NDEBUG builds. Indeed, the iterator of the second loop would need - // to be changed. - SetOfMachineInstr PotentialCandidates; - SetOfMachineInstr PotentialADROpportunities; - for (InstrToInstrs::const_iterator UseIt = UseToDefs.begin(), - EndUseIt = UseToDefs.end(); - UseIt != EndUseIt; ++UseIt) { - // If no definition is available, this is a non candidate. - if (UseIt->second.empty()) - continue; - // Keep only instructions that are load or store and at the end of - // a ADRP -> ADD/LDR/Nothing chain. - // We already filtered out the no-chain cases. - if (!isCandidate(UseIt->first, UseToDefs, MDT)) { - PotentialADROpportunities.insert(UseIt->first); - continue; - } - PotentialCandidates.insert(UseIt->first); - } - - // Make the following distinctions for statistics as the linker does - // know how to decode instructions: - // - ADD/LDR/Nothing make there different patterns. - // - LDR/STR make two different patterns. - // Hence, 6 - 1 base patterns. 
- // (because ADRP-> Nothing -> STR is not simplifiable) - - // The linker is only able to have a simple semantic, i.e., if pattern A - // do B. - // However, we want to see the opportunity we may miss if we were able to - // catch more complex cases. - - // PotentialCandidates are result of a chain ADRP -> ADD/LDR -> - // A potential candidate becomes a candidate, if its current immediate - // operand is zero and all nodes of the chain have respectively only one user - SetOfMachineInstr::const_iterator CandidateIt, EndCandidateIt; -#ifdef DEBUG - SetOfMachineInstr DefsOfPotentialCandidates; -#endif - for (CandidateIt = PotentialCandidates.begin(), - EndCandidateIt = PotentialCandidates.end(); - CandidateIt != EndCandidateIt; ++CandidateIt) { - const MachineInstr *Candidate = *CandidateIt; - // Get the definition of the candidate i.e., ADD or LDR. - const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin(); - // Record the elements of the chain. - const MachineInstr *L1 = Def; - const MachineInstr *L2 = NULL; - unsigned ImmediateDefOpc = Def->getOpcode(); - if (Def->getOpcode() != ARM64::ADRP) { - // Check the number of users of this node. - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def->getOperand(0).getReg())->second, Def); - if (Users->size() > 1) { -#ifdef DEBUG - // if all the uses of this def are in potential candidate, this is - // a complex candidate of level 2. - SetOfMachineInstr::const_iterator UseIt = Users->begin(); - SetOfMachineInstr::const_iterator EndUseIt = Users->end(); - for (; UseIt != EndUseIt; ++UseIt) { - if (!PotentialCandidates.count(*UseIt)) { - ++NumTooCplxLvl2; - break; - } - } - if (UseIt == EndUseIt) - ++NumCplxLvl2; -#endif // DEBUG - PotentialADROpportunities.insert(Def); - continue; - } - L2 = Def; - Def = *UseToDefs.find(Def)->second.begin(); - L1 = Def; - } // else the element in the middle of the chain is nothing, thus - // Def already contains the first element of the chain. - - // Check the number of users of the first node in the chain, i.e., ADRP - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def->getOperand(0).getReg())->second, Def); - if (Users->size() > 1) { -#ifdef DEBUG - // if all the uses of this def are in the defs of the potential candidate, - // this is a complex candidate of level 1 - if (DefsOfPotentialCandidates.empty()) { - // lazy init - DefsOfPotentialCandidates = PotentialCandidates; - for (const MachineInstr *Candidate : PotentialCandidates) { - if (!UseToDefs.find(Candidate)->second.empty()) - DefsOfPotentialCandidates.insert( - *UseToDefs.find(Candidate)->second.begin()); - } - } - bool Found = false; - for (auto &Use: *Users) { - if (!DefsOfPotentialCandidates.count(Use)) { - ++NumTooCplxLvl1; - Found = true; - break; - } - } - if (!Found) - ++NumCplxLvl1; -#endif // DEBUG - continue; - } - - bool IsL2Add = (ImmediateDefOpc == ARM64::ADDXri); - // If the chain is three instructions long and ldr is the second element, - // then this ldr must load form GOT, otherwise this is not a correct chain. - if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != ARM64II::MO_GOT) - continue; - SmallVector Args; - MCLOHType Kind; - if (isCandidateLoad(Candidate)) { - if (L2 == NULL) { - // At this point, the candidate LOH indicates that the ldr instruction - // may use a direct access to the symbol. There is not such encoding - // for loads of byte and half. 
- if (!supportLoadFromLiteral(Candidate)) - continue; - - DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate - << '\n'); - Kind = MCLOH_AdrpLdr; - Args.push_back(L1); - Args.push_back(Candidate); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); - ++NumADRPToLDR; - } else { - DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") - << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate - << '\n'); - - Kind = IsL2Add ? MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr; - Args.push_back(L1); - Args.push_back(L2); - Args.push_back(Candidate); - - PotentialADROpportunities.remove(L2); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && - "L2 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); -#ifdef DEBUG - // get the immediate of the load - if (Candidate->getOperand(2).getImm() == 0) - if (ImmediateDefOpc == ARM64::ADDXri) - ++NumADDToLDR; - else - ++NumLDRToLDR; - else if (ImmediateDefOpc == ARM64::ADDXri) - ++NumADDToLDRWithImm; - else - ++NumLDRToLDRWithImm; -#endif // DEBUG - } - } else { - if (ImmediateDefOpc == ARM64::ADRP) - continue; - else { - - DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") - << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate - << '\n'); - - Kind = IsL2Add ? MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr; - Args.push_back(L1); - Args.push_back(L2); - Args.push_back(Candidate); - - PotentialADROpportunities.remove(L2); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && - "L2 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); -#ifdef DEBUG - // get the immediate of the store - if (Candidate->getOperand(2).getImm() == 0) - if (ImmediateDefOpc == ARM64::ADDXri) - ++NumADDToSTR; - else - ++NumLDRToSTR; - else if (ImmediateDefOpc == ARM64::ADDXri) - ++NumADDToSTRWithImm; - else - ++NumLDRToSTRWithImm; -#endif // DEBUG - } - } - ARM64FI.addLOHDirective(Kind, Args); - } - - // Now, we grabbed all the big patterns, check ADR opportunities. - for (const MachineInstr *Candidate: PotentialADROpportunities) - registerADRCandidate(Candidate, UseToDefs, DefsPerColorToUses, ARM64FI, - InvolvedInLOHs, RegToId); -} - -/// Look for every register defined by potential LOHs candidates. -/// Map these registers with dense id in @p RegToId and vice-versa in -/// @p IdToReg. @p IdToReg is populated only in DEBUG mode. 
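The dense-id mapping that collectInvolvedReg (below) builds can be sketched in isolation as follows (hypothetical register names; the real pass also walks MCRegAliasIterator so that every alias of an involved register receives an id):

    #include <map>
    #include <string>
    #include <vector>

    int main() {
      // Assign each newly seen register the next dense index so that
      // per-register data can live in plain arrays indexed by id
      // instead of maps keyed by register number.
      std::map<std::string, unsigned> RegToId;
      std::vector<std::string> IdToReg; // reverse map, debug only
      for (const char *Reg : {"x8", "x9", "x8", "x20"}) {
        if (RegToId.emplace(Reg, IdToReg.size()).second)
          IdToReg.push_back(Reg);
      }
      // x8 -> 0, x9 -> 1, x20 -> 2; the duplicate "x8" is ignored.
      return RegToId.size() == 3 ? 0 : 1;
    }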
-static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId, - MapIdToReg &IdToReg, - const TargetRegisterInfo *TRI) { - unsigned CurRegId = 0; - if (!PreCollectRegister) { - unsigned NbReg = TRI->getNumRegs(); - for (; CurRegId < NbReg; ++CurRegId) { - RegToId[CurRegId] = CurRegId; - DEBUG(IdToReg.push_back(CurRegId)); - DEBUG(assert(IdToReg[CurRegId] == CurRegId && "Reg index mismatches")); - } - return; - } - - DEBUG(dbgs() << "** Collect Involved Register\n"); - for (MachineFunction::const_iterator IMBB = MF.begin(), IMBBEnd = MF.end(); - IMBB != IMBBEnd; ++IMBB) - for (MachineBasicBlock::const_iterator II = IMBB->begin(), - IEnd = IMBB->end(); - II != IEnd; ++II) { - - if (!canDefBePartOfLOH(II)) - continue; - - // Process defs - for (MachineInstr::const_mop_iterator IO = II->operands_begin(), - IOEnd = II->operands_end(); - IO != IOEnd; ++IO) { - if (!IO->isReg() || !IO->isDef()) - continue; - unsigned CurReg = IO->getReg(); - for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) - if (RegToId.find(*AI) == RegToId.end()) { - DEBUG(IdToReg.push_back(*AI); - assert(IdToReg[CurRegId] == *AI && - "Reg index mismatches insertion index.")); - RegToId[*AI] = CurRegId++; - DEBUG(dbgs() << "Register: " << PrintReg(*AI, TRI) << '\n'); - } - } - } -} - -bool ARM64CollectLOH::runOnMachineFunction(MachineFunction &Fn) { - const TargetMachine &TM = Fn.getTarget(); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); - const MachineDominatorTree *MDT = &getAnalysis(); - - MapRegToId RegToId; - MapIdToReg IdToReg; - ARM64FunctionInfo *ARM64FI = Fn.getInfo(); - assert(ARM64FI && "No MachineFunctionInfo for this function!"); - - DEBUG(dbgs() << "Looking for LOH in " << Fn.getName() << '\n'); - - collectInvolvedReg(Fn, RegToId, IdToReg, TRI); - if (RegToId.empty()) - return false; - - MachineInstr *DummyOp = NULL; - if (BasicBlockScopeOnly) { - const ARM64InstrInfo *TII = - static_cast(TM.getInstrInfo()); - // For local analysis, create a dummy operation to record uses that are not - // local. - DummyOp = Fn.CreateMachineInstr(TII->get(ARM64::COPY), DebugLoc()); - } - - unsigned NbReg = RegToId.size(); - bool Modified = false; - - // Start with ADRP. - InstrToInstrs *ColorOpToReachedUses = new InstrToInstrs[NbReg]; - - // Compute the reaching def in ADRP mode, meaning ADRP definitions - // are first considered as uses. - reachingDef(&Fn, ColorOpToReachedUses, RegToId, true, DummyOp); - DEBUG(dbgs() << "ADRP reaching defs\n"); - DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); - - // Translate the definition to uses map into a use to definitions map to ease - // statistic computation. - InstrToInstrs ADRPToReachingDefs; - reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true); - - // Compute LOH for ADRP. - computeADRP(ADRPToReachingDefs, *ARM64FI, MDT); - delete[] ColorOpToReachedUses; - - // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern. - ColorOpToReachedUses = new InstrToInstrs[NbReg]; - - // first perform a regular reaching def analysis. - reachingDef(&Fn, ColorOpToReachedUses, RegToId, false, DummyOp); - DEBUG(dbgs() << "All reaching defs\n"); - DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); - - // Turn that into a use to defs to ease statistic computation. - InstrToInstrs UsesToReachingDefs; - reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false); - - // Compute other than AdrpAdrp LOH. 
- computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *ARM64FI, RegToId, - MDT); - delete[] ColorOpToReachedUses; - - if (BasicBlockScopeOnly) - Fn.DeleteMachineInstr(DummyOp); - - return Modified; -} - -/// createARM64CollectLOHPass - returns an instance of the Statistic for -/// linker optimization pass. -FunctionPass *llvm::createARM64CollectLOHPass() { - return new ARM64CollectLOH(); -} diff --git a/lib/Target/ARM64/ARM64ConditionalCompares.cpp b/lib/Target/ARM64/ARM64ConditionalCompares.cpp deleted file mode 100644 index b495afa..0000000 --- a/lib/Target/ARM64/ARM64ConditionalCompares.cpp +++ /dev/null @@ -1,918 +0,0 @@ -//===-- ARM64ConditionalCompares.cpp --- CCMP formation for ARM64 ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64ConditionalCompares pass which reduces -// branching and code size by using the conditional compare instructions CCMP, -// CCMN, and FCMP. -// -// The CFG transformations for forming conditional compares are very similar to -// if-conversion, and this pass should run immediately before the early -// if-conversion pass. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "arm64-ccmp" -#include "ARM64.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SparseSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineTraceMetrics.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" - -using namespace llvm; - -// Absolute maximum number of instructions allowed per speculated block. -// This bypasses all other heuristics, so it should be set fairly high. -static cl::opt BlockInstrLimit( - "arm64-ccmp-limit", cl::init(30), cl::Hidden, - cl::desc("Maximum number of instructions per speculated block.")); - -// Stress testing mode - disable heuristics. 
-static cl::opt Stress("arm64-stress-ccmp", cl::Hidden, - cl::desc("Turn all knobs to 11")); - -STATISTIC(NumConsidered, "Number of ccmps considered"); -STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)"); -STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)"); -STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)"); -STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)"); -STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)"); -STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)"); -STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)"); -STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)"); -STATISTIC(NumMultCPSRUses, "Number of ccmps rejected (CPSR used)"); -STATISTIC(NumUnknCPSRDefs, "Number of ccmps rejected (CPSR def unknown)"); - -STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)"); - -STATISTIC(NumConverted, "Number of ccmp instructions created"); -STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted"); - -//===----------------------------------------------------------------------===// -// SSACCmpConv -//===----------------------------------------------------------------------===// -// -// The SSACCmpConv class performs ccmp-conversion on SSA form machine code -// after determining if it is possible. The class contains no heuristics; -// external code should be used to determine when ccmp-conversion is a good -// idea. -// -// CCmp-formation works on a CFG representing chained conditions, typically -// from C's short-circuit || and && operators: -// -// From: Head To: Head -// / | CmpBB -// / | / | -// | CmpBB / | -// | / | Tail | -// | / | | | -// Tail | | | -// | | | | -// ... ... ... ... -// -// The Head block is terminated by a br.cond instruction, and the CmpBB block -// contains compare + br.cond. Tail must be a successor of both. -// -// The cmp-conversion turns the compare instruction in CmpBB into a conditional -// compare, and merges CmpBB into Head, speculatively executing its -// instructions. The ARM64 conditional compare instructions have an immediate -// operand that specifies the NZCV flag values when the condition is false and -// the compare isn't executed. This makes it possible to chain compares with -// different condition codes. -// -// Example: -// -// if (a == 5 || b == 17) -// foo(); -// -// Head: -// cmp w0, #5 -// b.eq Tail -// CmpBB: -// cmp w1, #17 -// b.eq Tail -// ... -// Tail: -// bl _foo -// -// Becomes: -// -// Head: -// cmp w0, #5 -// ccmp w1, #17, 4, ne ; 4 = nZcv -// b.eq Tail -// ... -// Tail: -// bl _foo -// -// The ccmp condition code is the one that would cause the Head terminator to -// branch to CmpBB. -// -// FIXME: It should also be possible to speculate a block on the critical edge -// between Head and Tail, just like if-converting a diamond. -// -// FIXME: Handle PHIs in Tail by turning them into selects (if-conversion). - -namespace { -class SSACCmpConv { - MachineFunction *MF; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - MachineRegisterInfo *MRI; - -public: - /// The first block containing a conditional branch, dominating everything - /// else. - MachineBasicBlock *Head; - - /// The block containing cmp+br.cond with a sucessor shared with Head. - MachineBasicBlock *CmpBB; - - /// The common successor for Head and CmpBB. - MachineBasicBlock *Tail; - - /// The compare instruction in CmpBB that can be converted to a ccmp. 
- MachineInstr *CmpMI; - -private: - /// The branch condition in Head as determined by AnalyzeBranch. - SmallVector HeadCond; - - /// The condition code that makes Head branch to CmpBB. - ARM64CC::CondCode HeadCmpBBCC; - - /// The branch condition in CmpBB. - SmallVector CmpBBCond; - - /// The condition code that makes CmpBB branch to Tail. - ARM64CC::CondCode CmpBBTailCC; - - /// Check if the Tail PHIs are trivially convertible. - bool trivialTailPHIs(); - - /// Remove CmpBB from the Tail PHIs. - void updateTailPHIs(); - - /// Check if an operand defining DstReg is dead. - bool isDeadDef(unsigned DstReg); - - /// Find the compare instruction in MBB that controls the conditional branch. - /// Return NULL if a convertible instruction can't be found. - MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB); - - /// Return true if all non-terminator instructions in MBB can be safely - /// speculated. - bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI); - -public: - /// runOnMachineFunction - Initialize per-function data structures. - void runOnMachineFunction(MachineFunction &MF) { - this->MF = &MF; - TII = MF.getTarget().getInstrInfo(); - TRI = MF.getTarget().getRegisterInfo(); - MRI = &MF.getRegInfo(); - } - - /// If the sub-CFG headed by MBB can be cmp-converted, initialize the - /// internal state, and return true. - bool canConvert(MachineBasicBlock *MBB); - - /// Cmo-convert the last block passed to canConvertCmp(), assuming - /// it is possible. Add any erased blocks to RemovedBlocks. - void convert(SmallVectorImpl &RemovedBlocks); - - /// Return the expected code size delta if the conversion into a - /// conditional compare is performed. - int expectedCodeSizeDelta() const; -}; -} // end anonymous namespace - -// Check that all PHIs in Tail are selecting the same value from Head and CmpBB. -// This means that no if-conversion is required when merging CmpBB into Head. -bool SSACCmpConv::trivialTailPHIs() { - for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end(); - I != E && I->isPHI(); ++I) { - unsigned HeadReg = 0, CmpBBReg = 0; - // PHI operands come in (VReg, MBB) pairs. - for (unsigned oi = 1, oe = I->getNumOperands(); oi != oe; oi += 2) { - MachineBasicBlock *MBB = I->getOperand(oi + 1).getMBB(); - unsigned Reg = I->getOperand(oi).getReg(); - if (MBB == Head) { - assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands"); - HeadReg = Reg; - } - if (MBB == CmpBB) { - assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands"); - CmpBBReg = Reg; - } - } - if (HeadReg != CmpBBReg) - return false; - } - return true; -} - -// Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply -// removing the CmpBB operands. The Head operands will be identical. -void SSACCmpConv::updateTailPHIs() { - for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end(); - I != E && I->isPHI(); ++I) { - // I is a PHI. It can have multiple entries for CmpBB. - for (unsigned oi = I->getNumOperands(); oi > 2; oi -= 2) { - // PHI operands are (Reg, MBB) at (oi-2, oi-1). - if (I->getOperand(oi - 1).getMBB() == CmpBB) { - I->RemoveOperand(oi - 1); - I->RemoveOperand(oi - 2); - } - } - } -} - -// This pass runs before the ARM64DeadRegisterDefinitions pass, so compares are -// still writing virtual registers without any uses. -bool SSACCmpConv::isDeadDef(unsigned DstReg) { - // Writes to the zero register are dead. 
-  if (DstReg == ARM64::WZR || DstReg == ARM64::XZR)
-    return true;
-  if (!TargetRegisterInfo::isVirtualRegister(DstReg))
-    return false;
-  // A virtual register def without any uses will be marked dead later, and
-  // eventually replaced by the zero register.
-  return MRI->use_nodbg_empty(DstReg);
-}
-
-// Parse a condition code returned by AnalyzeBranch, and compute the CondCode
-// corresponding to TBB.
-// Return true if the condition was recognized, false for branch types that
-// cannot be converted (e.g. tbz / tbnz).
-static bool parseCond(ArrayRef Cond, ARM64CC::CondCode &CC) {
-  // A normal br.cond simply has the condition code.
-  if (Cond[0].getImm() != -1) {
-    assert(Cond.size() == 1 && "Unknown Cond array format");
-    CC = (ARM64CC::CondCode)(int)Cond[0].getImm();
-    return true;
-  }
-  // For tbz and cbz instructions, the opcode is next.
-  switch (Cond[1].getImm()) {
-  default:
-    // This includes tbz / tbnz branches which can't be converted to
-    // ccmp + br.cond.
-    return false;
-  case ARM64::CBZW:
-  case ARM64::CBZX:
-    assert(Cond.size() == 3 && "Unknown Cond array format");
-    CC = ARM64CC::EQ;
-    return true;
-  case ARM64::CBNZW:
-  case ARM64::CBNZX:
-    assert(Cond.size() == 3 && "Unknown Cond array format");
-    CC = ARM64CC::NE;
-    return true;
-  }
-}
-
-MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
-  MachineBasicBlock::iterator I = MBB->getFirstTerminator();
-  if (I == MBB->end())
-    return 0;
-  // The terminator must be controlled by the flags.
-  if (!I->readsRegister(ARM64::CPSR)) {
-    switch (I->getOpcode()) {
-    case ARM64::CBZW:
-    case ARM64::CBZX:
-    case ARM64::CBNZW:
-    case ARM64::CBNZX:
-      // These can be converted into a ccmp against #0.
-      return I;
-    }
-    ++NumCmpTermRejs;
-    DEBUG(dbgs() << "Flags not used by terminator: " << *I);
-    return 0;
-  }
-
-  // Now find the instruction controlling the terminator.
-  for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
-    --I;
-    assert(!I->isTerminator() && "Spurious terminator");
-    switch (I->getOpcode()) {
-    // cmp is an alias for subs with a dead destination register.
-    case ARM64::SUBSWri:
-    case ARM64::SUBSXri:
-    // cmn is an alias for adds with a dead destination register.
-    case ARM64::ADDSWri:
-    case ARM64::ADDSXri:
-      // Check that the immediate operand is within range, ccmp wants a uimm5.
-      // Rd = SUBSri Rn, imm, shift
-      if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) {
-        DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I);
-        ++NumImmRangeRejs;
-        return 0;
-      }
-    // Fall through.
-    case ARM64::SUBSWrr:
-    case ARM64::SUBSXrr:
-    case ARM64::ADDSWrr:
-    case ARM64::ADDSXrr:
-      if (isDeadDef(I->getOperand(0).getReg()))
-        return I;
-      DEBUG(dbgs() << "Can't convert compare with live destination: " << *I);
-      ++NumLiveDstRejs;
-      return 0;
-    case ARM64::FCMPSrr:
-    case ARM64::FCMPDrr:
-    case ARM64::FCMPESrr:
-    case ARM64::FCMPEDrr:
-      return I;
-    }
-
-    // Check for flag reads and clobbers.
-    MIOperands::PhysRegInfo PRI =
-        MIOperands(I).analyzePhysReg(ARM64::CPSR, TRI);
-
-    if (PRI.Reads) {
-      // The ccmp doesn't produce exactly the same flags as the original
-      // compare, so reject the transform if there are uses of the flags
-      // besides the terminators.
-      DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I);
-      ++NumMultCPSRUses;
-      return 0;
-    }
-
-    if (PRI.Clobbers) {
-      DEBUG(dbgs() << "Not convertible compare: " << *I);
-      ++NumUnknCPSRDefs;
-      return 0;
-    }
-  }
-  DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n');
-  return 0;
-}
-
-/// Determine if all the instructions in MBB can safely
-/// be speculated. The terminators are not considered.
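/// (Speculated instructions will execute on paths that previously bypassed
/// CmpBB, so they must be free of side effects and cheap; see the
/// BlockInstrLimit check below.)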
-///
-/// Only CmpMI is allowed to clobber the flags.
-///
-bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
-                                     const MachineInstr *CmpMI) {
-  // Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to
-  // get right.
-  if (!MBB->livein_empty()) {
-    DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n");
-    return false;
-  }
-
-  unsigned InstrCount = 0;
-
-  // Check all instructions, except the terminators. It is assumed that
-  // terminators never have side effects or define any used register values.
-  for (MachineBasicBlock::iterator I = MBB->begin(),
-                                   E = MBB->getFirstTerminator();
-       I != E; ++I) {
-    if (I->isDebugValue())
-      continue;
-
-    if (++InstrCount > BlockInstrLimit && !Stress) {
-      DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than "
-                   << BlockInstrLimit << " instructions.\n");
-      return false;
-    }
-
-    // There shouldn't normally be any phis in a single-predecessor block.
-    if (I->isPHI()) {
-      DEBUG(dbgs() << "Can't hoist: " << *I);
-      return false;
-    }
-
-    // Don't speculate loads. Note that it may be possible and desirable to
-    // speculate GOT or constant pool loads that are guaranteed not to trap,
-    // but we don't support that for now.
-    if (I->mayLoad()) {
-      DEBUG(dbgs() << "Won't speculate load: " << *I);
-      return false;
-    }
-
-    // We never speculate stores, so an AA pointer isn't necessary.
-    bool DontMoveAcrossStore = true;
-    if (!I->isSafeToMove(TII, 0, DontMoveAcrossStore)) {
-      DEBUG(dbgs() << "Can't speculate: " << *I);
-      return false;
-    }
-
-    // Only CmpMI is allowed to clobber the flags.
-    if (&*I != CmpMI && I->modifiesRegister(ARM64::CPSR, TRI)) {
-      DEBUG(dbgs() << "Clobbers flags: " << *I);
-      return false;
-    }
-  }
-  return true;
-}
-
-/// Analyze the sub-cfg rooted in MBB, and return true if it is a potential
-/// candidate for cmp-conversion. Fill out the internal state.
-///
-bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
-  Head = MBB;
-  Tail = CmpBB = 0;
-
-  if (Head->succ_size() != 2)
-    return false;
-  MachineBasicBlock *Succ0 = Head->succ_begin()[0];
-  MachineBasicBlock *Succ1 = Head->succ_begin()[1];
-
-  // CmpBB can only have a single predecessor. Tail is allowed many.
-  if (Succ0->pred_size() != 1)
-    std::swap(Succ0, Succ1);
-
-  // Succ0 is our candidate for CmpBB.
-  if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2)
-    return false;
-
-  CmpBB = Succ0;
-  Tail = Succ1;
-
-  if (!CmpBB->isSuccessor(Tail))
-    return false;
-
-  // The CFG topology checks out.
-  DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() << " -> BB#"
-               << CmpBB->getNumber() << " -> BB#" << Tail->getNumber() << '\n');
-  ++NumConsidered;
-
-  // Tail is allowed to have many predecessors, but we can't handle PHIs yet.
-  //
-  // FIXME: Real PHIs could be if-converted as long as the CmpBB values are
-  // defined before the CmpBB cmp clobbers the flags. Alternatively, it should
-  // always be safe to sink the ccmp down to immediately before the CmpBB
-  // terminators.
-  if (!trivialTailPHIs()) {
-    DEBUG(dbgs() << "Can't handle phis in Tail.\n");
-    ++NumPhiRejs;
-    return false;
-  }
-
-  if (!Tail->livein_empty()) {
-    DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n");
-    ++NumPhysRejs;
-    return false;
-  }
-
-  // CmpBB should never have PHIs since Head is its only predecessor.
-  // FIXME: Clean them up if it happens.
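  // (Such a PHI would be degenerate, merging a single incoming value from
  // Head, and should have been simplified away before this pass runs.)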
- if (!CmpBB->empty() && CmpBB->front().isPHI()) { - DEBUG(dbgs() << "Can't handle phis in CmpBB.\n"); - ++NumPhi2Rejs; - return false; - } - - if (!CmpBB->livein_empty()) { - DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n"); - ++NumPhysRejs; - return false; - } - - // The branch we're looking to eliminate must be analyzable. - HeadCond.clear(); - MachineBasicBlock *TBB = 0, *FBB = 0; - if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) { - DEBUG(dbgs() << "Head branch not analyzable.\n"); - ++NumHeadBranchRejs; - return false; - } - - // This is weird, probably some sort of degenerate CFG, or an edge to a - // landing pad. - if (!TBB || HeadCond.empty()) { - DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n"); - ++NumHeadBranchRejs; - return false; - } - - if (!parseCond(HeadCond, HeadCmpBBCC)) { - DEBUG(dbgs() << "Unsupported branch type on Head\n"); - ++NumHeadBranchRejs; - return false; - } - - // Make sure the branch direction is right. - if (TBB != CmpBB) { - assert(TBB == Tail && "Unexpected TBB"); - HeadCmpBBCC = ARM64CC::getInvertedCondCode(HeadCmpBBCC); - } - - CmpBBCond.clear(); - TBB = FBB = 0; - if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) { - DEBUG(dbgs() << "CmpBB branch not analyzable.\n"); - ++NumCmpBranchRejs; - return false; - } - - if (!TBB || CmpBBCond.empty()) { - DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n"); - ++NumCmpBranchRejs; - return false; - } - - if (!parseCond(CmpBBCond, CmpBBTailCC)) { - DEBUG(dbgs() << "Unsupported branch type on CmpBB\n"); - ++NumCmpBranchRejs; - return false; - } - - if (TBB != Tail) - CmpBBTailCC = ARM64CC::getInvertedCondCode(CmpBBTailCC); - - DEBUG(dbgs() << "Head->CmpBB on " << ARM64CC::getCondCodeName(HeadCmpBBCC) - << ", CmpBB->Tail on " << ARM64CC::getCondCodeName(CmpBBTailCC) - << '\n'); - - CmpMI = findConvertibleCompare(CmpBB); - if (!CmpMI) - return false; - - if (!canSpeculateInstrs(CmpBB, CmpMI)) { - ++NumSpeculateRejs; - return false; - } - return true; -} - -void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { - DEBUG(dbgs() << "Merging BB#" << CmpBB->getNumber() << " into BB#" - << Head->getNumber() << ":\n" << *CmpBB); - - // All CmpBB instructions are moved into Head, and CmpBB is deleted. - // Update the CFG first. - updateTailPHIs(); - Head->removeSuccessor(CmpBB); - CmpBB->removeSuccessor(Tail); - Head->transferSuccessorsAndUpdatePHIs(CmpBB); - DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc(); - TII->RemoveBranch(*Head); - - // If the Head terminator was one of the cbz / tbz branches with built-in - // compare, we need to insert an explicit compare instruction in its place. - if (HeadCond[0].getImm() == -1) { - ++NumCompBranches; - unsigned Opc = 0; - switch (HeadCond[1].getImm()) { - case ARM64::CBZW: - case ARM64::CBNZW: - Opc = ARM64::SUBSWri; - break; - case ARM64::CBZX: - case ARM64::CBNZX: - Opc = ARM64::SUBSXri; - break; - default: - llvm_unreachable("Cannot convert Head branch"); - } - const MCInstrDesc &MCID = TII->get(Opc); - // Create a dummy virtual register for the SUBS def. - unsigned DestReg = - MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF)); - // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. - BuildMI(*Head, Head->end(), TermDL, MCID) - .addReg(DestReg, RegState::Define | RegState::Dead) - .addOperand(HeadCond[2]) - .addImm(0) - .addImm(0); - // SUBS uses the GPR*sp register classes. 
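    // (The ADDS/SUBS immediate forms accept SP as the Rn operand, so the
    // matching register class includes SP and Rn must be constrained to it.)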
- MRI->constrainRegClass(HeadCond[2].getReg(), - TII->getRegClass(MCID, 1, TRI, *MF)); - } - - Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end()); - - // Now replace CmpMI with a ccmp instruction that also considers the incoming - // flags. - unsigned Opc = 0; - unsigned FirstOp = 1; // First CmpMI operand to copy. - bool isZBranch = false; // CmpMI is a cbz/cbnz instruction. - switch (CmpMI->getOpcode()) { - default: - llvm_unreachable("Unknown compare opcode"); - case ARM64::SUBSWri: Opc = ARM64::CCMPWi; break; - case ARM64::SUBSWrr: Opc = ARM64::CCMPWr; break; - case ARM64::SUBSXri: Opc = ARM64::CCMPXi; break; - case ARM64::SUBSXrr: Opc = ARM64::CCMPXr; break; - case ARM64::ADDSWri: Opc = ARM64::CCMNWi; break; - case ARM64::ADDSWrr: Opc = ARM64::CCMNWr; break; - case ARM64::ADDSXri: Opc = ARM64::CCMNXi; break; - case ARM64::ADDSXrr: Opc = ARM64::CCMNXr; break; - case ARM64::FCMPSrr: Opc = ARM64::FCCMPSrr; FirstOp = 0; break; - case ARM64::FCMPDrr: Opc = ARM64::FCCMPDrr; FirstOp = 0; break; - case ARM64::FCMPESrr: Opc = ARM64::FCCMPESrr; FirstOp = 0; break; - case ARM64::FCMPEDrr: Opc = ARM64::FCCMPEDrr; FirstOp = 0; break; - case ARM64::CBZW: - case ARM64::CBNZW: - Opc = ARM64::CCMPWi; - FirstOp = 0; - isZBranch = true; - break; - case ARM64::CBZX: - case ARM64::CBNZX: - Opc = ARM64::CCMPXi; - FirstOp = 0; - isZBranch = true; - break; - } - - // The ccmp instruction should set the flags according to the comparison when - // Head would have branched to CmpBB. - // The NZCV immediate operand should provide flags for the case where Head - // would have branched to Tail. These flags should cause the new Head - // terminator to branch to tail. - unsigned NZCV = ARM64CC::getNZCVToSatisfyCondCode(CmpBBTailCC); - const MCInstrDesc &MCID = TII->get(Opc); - MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(), - TII->getRegClass(MCID, 0, TRI, *MF)); - if (CmpMI->getOperand(FirstOp + 1).isReg()) - MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(), - TII->getRegClass(MCID, 1, TRI, *MF)); - MachineInstrBuilder MIB = - BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID) - .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn - if (isZBranch) - MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0 - else - MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate - MIB.addImm(NZCV).addImm(HeadCmpBBCC); - - // If CmpMI was a terminator, we need a new conditional branch to replace it. - // This now becomes a Head terminator. - if (isZBranch) { - bool isNZ = CmpMI->getOpcode() == ARM64::CBNZW || - CmpMI->getOpcode() == ARM64::CBNZX; - BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(ARM64::Bcc)) - .addImm(isNZ ? ARM64CC::NE : ARM64CC::EQ) - .addOperand(CmpMI->getOperand(1)); // Branch target. - } - CmpMI->eraseFromParent(); - Head->updateTerminator(); - - RemovedBlocks.push_back(CmpBB); - CmpBB->eraseFromParent(); - DEBUG(dbgs() << "Result:\n" << *Head); - ++NumConverted; -} - -int SSACCmpConv::expectedCodeSizeDelta() const { - int delta = 0; - // If the Head terminator was one of the cbz / tbz branches with built-in - // compare, we need to insert an explicit compare instruction in its place - // plus a branch instruction. 
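  // Illustrative walk-through: with Head ending in "cbz w0, Tail" and CmpBB
  // containing "cmp w1, #17; b.eq Tail", the conversion emits
  // "subs wzr, w0, #0" in place of the cbz and folds CmpBB's compare into a
  // ccmp: one instruction is added and one branch is saved, a net delta of 0.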
-  if (HeadCond[0].getImm() == -1) {
-    switch (HeadCond[1].getImm()) {
-    case ARM64::CBZW:
-    case ARM64::CBNZW:
-    case ARM64::CBZX:
-    case ARM64::CBNZX:
-      // Therefore delta += 1
-      delta = 1;
-      break;
-    default:
-      llvm_unreachable("Cannot convert Head branch");
-    }
-  }
-  // If the Cmp terminator was one of the cbz / tbz branches with
-  // built-in compare, it will be turned into a compare instruction
-  // in Head, but we do not save any instruction.
-  // Otherwise, we save the branch instruction.
-  switch (CmpMI->getOpcode()) {
-  default:
-    --delta;
-    break;
-  case ARM64::CBZW:
-  case ARM64::CBNZW:
-  case ARM64::CBZX:
-  case ARM64::CBNZX:
-    break;
-  }
-  return delta;
-}
-
-//===----------------------------------------------------------------------===//
-// ARM64ConditionalCompares Pass
-//===----------------------------------------------------------------------===//
-
-namespace {
-class ARM64ConditionalCompares : public MachineFunctionPass {
-  const TargetInstrInfo *TII;
-  const TargetRegisterInfo *TRI;
-  const MCSchedModel *SchedModel;
-  // Whether the function being processed has the MinSize (Oz) attribute.
-  bool MinSize;
-  MachineRegisterInfo *MRI;
-  MachineDominatorTree *DomTree;
-  MachineLoopInfo *Loops;
-  MachineTraceMetrics *Traces;
-  MachineTraceMetrics::Ensemble *MinInstr;
-  SSACCmpConv CmpConv;
-
-public:
-  static char ID;
-  ARM64ConditionalCompares() : MachineFunctionPass(ID) {}
-  void getAnalysisUsage(AnalysisUsage &AU) const;
-  bool runOnMachineFunction(MachineFunction &MF);
-  const char *getPassName() const { return "ARM64 Conditional Compares"; }
-
-private:
-  bool tryConvert(MachineBasicBlock *);
-  void updateDomTree(ArrayRef Removed);
-  void updateLoops(ArrayRef Removed);
-  void invalidateTraces();
-  bool shouldConvert();
-};
-} // end anonymous namespace
-
-char ARM64ConditionalCompares::ID = 0;
-
-namespace llvm {
-void initializeARM64ConditionalComparesPass(PassRegistry &);
-}
-
-INITIALIZE_PASS_BEGIN(ARM64ConditionalCompares, "arm64-ccmp", "ARM64 CCMP Pass",
-                      false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
-INITIALIZE_PASS_END(ARM64ConditionalCompares, "arm64-ccmp", "ARM64 CCMP Pass",
-                    false, false)
-
-FunctionPass *llvm::createARM64ConditionalCompares() {
-  return new ARM64ConditionalCompares();
-}
-
-void ARM64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired();
-  AU.addRequired();
-  AU.addPreserved();
-  AU.addRequired();
-  AU.addPreserved();
-  AU.addRequired();
-  AU.addPreserved();
-  MachineFunctionPass::getAnalysisUsage(AU);
-}
-
-/// Update the dominator tree after if-conversion erased some blocks.
-void
-ARM64ConditionalCompares::updateDomTree(ArrayRef Removed) {
-  // convert() removes CmpBB which was previously dominated by Head.
-  // CmpBB children should be transferred to Head.
-  MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head);
-  for (unsigned i = 0, e = Removed.size(); i != e; ++i) {
-    MachineDomTreeNode *Node = DomTree->getNode(Removed[i]);
-    assert(Node != HeadNode && "Cannot erase the head node");
-    assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head");
-    while (Node->getNumChildren())
-      DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
-    DomTree->eraseNode(Removed[i]);
-  }
-}
-
-/// Update LoopInfo after if-conversion.
-void -ARM64ConditionalCompares::updateLoops(ArrayRef Removed) { - if (!Loops) - return; - for (unsigned i = 0, e = Removed.size(); i != e; ++i) - Loops->removeBlock(Removed[i]); -} - -/// Invalidate MachineTraceMetrics before if-conversion. -void ARM64ConditionalCompares::invalidateTraces() { - Traces->invalidate(CmpConv.Head); - Traces->invalidate(CmpConv.CmpBB); -} - -/// Apply cost model and heuristics to the if-conversion in IfConv. -/// Return true if the conversion is a good idea. -/// -bool ARM64ConditionalCompares::shouldConvert() { - // Stress testing mode disables all cost considerations. - if (Stress) - return true; - if (!MinInstr) - MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); - - // Head dominates CmpBB, so it is always included in its trace. - MachineTraceMetrics::Trace Trace = MinInstr->getTrace(CmpConv.CmpBB); - - // If code size is the main concern - if (MinSize) { - int CodeSizeDelta = CmpConv.expectedCodeSizeDelta(); - DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n'); - // If we are minimizing the code size, do the conversion whatever - // the cost is. - if (CodeSizeDelta < 0) - return true; - if (CodeSizeDelta > 0) { - DEBUG(dbgs() << "Code size is increasing, give up on this one.\n"); - return false; - } - // CodeSizeDelta == 0, continue with the regular heuristics - } - - // Heuristic: The compare conversion delays the execution of the branch - // instruction because we must wait for the inputs to the second compare as - // well. The branch has no dependent instructions, but delaying it increases - // the cost of a misprediction. - // - // Set a limit on the delay we will accept. - unsigned DelayLimit = SchedModel->MispredictPenalty * 3 / 4; - - // Instruction depths can be computed for all trace instructions above CmpBB. - unsigned HeadDepth = - Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth; - unsigned CmpBBDepth = - Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth; - DEBUG(dbgs() << "Head depth: " << HeadDepth - << "\nCmpBB depth: " << CmpBBDepth << '\n'); - if (CmpBBDepth > HeadDepth + DelayLimit) { - DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit - << " cycles.\n"); - return false; - } - - // Check the resource depth at the bottom of CmpBB - these instructions will - // be speculated. - unsigned ResDepth = Trace.getResourceDepth(true); - DEBUG(dbgs() << "Resources: " << ResDepth << '\n'); - - // Heuristic: The speculatively executed instructions must all be able to - // merge into the Head block. The Head critical path should dominate the - // resource cost of the speculated instructions. 
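  // Illustrative numbers: with MispredictPenalty = 16, the DelayLimit above
  // is 12 cycles; and a ResDepth of 8 against a HeadDepth of 5 fails the
  // check below, rejecting the conversion.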
- if (ResDepth > HeadDepth) { - DEBUG(dbgs() << "Too many instructions to speculate.\n"); - return false; - } - return true; -} - -bool ARM64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { - bool Changed = false; - while (CmpConv.canConvert(MBB) && shouldConvert()) { - invalidateTraces(); - SmallVector RemovedBlocks; - CmpConv.convert(RemovedBlocks); - Changed = true; - updateDomTree(RemovedBlocks); - updateLoops(RemovedBlocks); - } - return Changed; -} - -bool ARM64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { - DEBUG(dbgs() << "********** ARM64 Conditional Compares **********\n" - << "********** Function: " << MF.getName() << '\n'); - TII = MF.getTarget().getInstrInfo(); - TRI = MF.getTarget().getRegisterInfo(); - SchedModel = - MF.getTarget().getSubtarget().getSchedModel(); - MRI = &MF.getRegInfo(); - DomTree = &getAnalysis(); - Loops = getAnalysisIfAvailable(); - Traces = &getAnalysis(); - MinInstr = 0; - MinSize = MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::MinSize); - - bool Changed = false; - CmpConv.runOnMachineFunction(MF); - - // Visit blocks in dominator tree pre-order. The pre-order enables multiple - // cmp-conversions from the same head block. - // Note that updateDomTree() modifies the children of the DomTree node - // currently being visited. The df_iterator supports that, it doesn't look at - // child_begin() / child_end() until after a node has been visited. - for (df_iterator I = df_begin(DomTree), - E = df_end(DomTree); - I != E; ++I) - if (tryConvert(I->getBlock())) - Changed = true; - - return Changed; -} diff --git a/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp b/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp deleted file mode 100644 index 3e410e5..0000000 --- a/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp +++ /dev/null @@ -1,104 +0,0 @@ -//===-- ARM64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// When allowed by the instruction, replace a dead definition of a GPR with -// the zero register. This makes the code a bit friendlier towards the -// hardware's register renamer. -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "arm64-dead-defs" -#include "ARM64.h" -#include "ARM64RegisterInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced"); - -namespace { -class ARM64DeadRegisterDefinitions : public MachineFunctionPass { -private: - bool processMachineBasicBlock(MachineBasicBlock *MBB); - -public: - static char ID; // Pass identification, replacement for typeid. 
- explicit ARM64DeadRegisterDefinitions() : MachineFunctionPass(ID) {} - - virtual bool runOnMachineFunction(MachineFunction &F); - - const char *getPassName() const { return "Dead register definitions"; } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; -char ARM64DeadRegisterDefinitions::ID = 0; -} // end anonymous namespace - -bool -ARM64DeadRegisterDefinitions::processMachineBasicBlock(MachineBasicBlock *MBB) { - bool Changed = false; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; - ++I) { - MachineInstr *MI = I; - for (int i = 0, e = MI->getDesc().getNumDefs(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (MO.isReg() && MO.isDead() && MO.isDef()) { - assert(!MO.isImplicit() && "Unexpected implicit def!"); - DEBUG(dbgs() << " Dead def operand #" << i << " in:\n "; - MI->print(dbgs())); - // Be careful not to change the register if it's a tied operand. - if (MI->isRegTiedToUseOperand(i)) { - DEBUG(dbgs() << " Ignoring, def is tied operand.\n"); - continue; - } - // Make sure the instruction take a register class that contains - // the zero register and replace it if so. - unsigned NewReg; - switch (MI->getDesc().OpInfo[i].RegClass) { - default: - DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); - continue; - case ARM64::GPR32RegClassID: - NewReg = ARM64::WZR; - break; - case ARM64::GPR64RegClassID: - NewReg = ARM64::XZR; - break; - } - DEBUG(dbgs() << " Replacing with zero register. New:\n "); - MO.setReg(NewReg); - DEBUG(MI->print(dbgs())); - ++NumDeadDefsReplaced; - } - } - } - return Changed; -} - -// Scan the function for instructions that have a dead definition of a -// register. Replace that register with the zero register when possible. -bool ARM64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &mf) { - MachineFunction *MF = &mf; - bool Changed = false; - DEBUG(dbgs() << "***** ARM64DeadRegisterDefinitions *****\n"); - - for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - if (processMachineBasicBlock(I)) - Changed = true; - return Changed; -} - -FunctionPass *llvm::createARM64DeadRegisterDefinitions() { - return new ARM64DeadRegisterDefinitions(); -} diff --git a/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp b/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp deleted file mode 100644 index e082baf..0000000 --- a/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp +++ /dev/null @@ -1,737 +0,0 @@ -//===-- ARM64ExpandPseudoInsts.cpp - Expand pseudo instructions ---*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass that expands pseudo instructions into target -// instructions to allow proper scheduling and other late optimizations. This -// pass should be run after register allocation but before the post-regalloc -// scheduling pass. 
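// Pseudos handled here include MOVi32imm / MOVi64imm materialization, LOADgot
// (ADRP + LDR), the MOVaddr family (ADRP + ADD), and rewriting plain *rr
// arithmetic/logical instructions to their *rs shifted-register forms.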
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "ARM64InstrInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Support/MathExtras.h" -using namespace llvm; - -namespace { -class ARM64ExpandPseudo : public MachineFunctionPass { -public: - static char ID; - ARM64ExpandPseudo() : MachineFunctionPass(ID) {} - - const ARM64InstrInfo *TII; - - virtual bool runOnMachineFunction(MachineFunction &Fn); - - virtual const char *getPassName() const { - return "ARM64 pseudo instruction expansion pass"; - } - -private: - bool expandMBB(MachineBasicBlock &MBB); - bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); - bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned BitSize); -}; -char ARM64ExpandPseudo::ID = 0; -} - -/// \brief Transfer implicit operands on the pseudo instruction to the -/// instructions created from the expansion. -static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, - MachineInstrBuilder &DefMI) { - const MCInstrDesc &Desc = OldMI.getDesc(); - for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e; - ++i) { - const MachineOperand &MO = OldMI.getOperand(i); - assert(MO.isReg() && MO.getReg()); - if (MO.isUse()) - UseMI.addOperand(MO); - else - DefMI.addOperand(MO); - } -} - -/// \brief Helper function which extracts the specified 16-bit chunk from a -/// 64-bit value. -static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) { - assert(ChunkIdx < 4 && "Out of range chunk index specified!"); - - return (Imm >> (ChunkIdx * 16)) & 0xFFFF; -} - -/// \brief Helper function which replicates a 16-bit chunk within a 64-bit -/// value. Indices correspond to element numbers in a v4i16. -static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) { - assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!"); - const unsigned ShiftAmt = ToIdx * 16; - - // Replicate the source chunk to the destination position. - const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt; - // Clear the destination chunk. - Imm &= ~(0xFFFFLL << ShiftAmt); - // Insert the replicated chunk. - return Imm | Chunk; -} - -/// \brief Helper function which tries to materialize a 64-bit value with an -/// ORR + MOVK instruction sequence. -static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const ARM64InstrInfo *TII, unsigned ChunkIdx) { - assert(ChunkIdx < 4 && "Out of range chunk index specified!"); - const unsigned ShiftAmt = ChunkIdx * 16; - - uint64_t Encoding; - if (ARM64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) { - // Create the ORR-immediate instruction. - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri)) - .addOperand(MI.getOperand(0)) - .addReg(ARM64::XZR) - .addImm(Encoding); - - // Create the MOVK instruction. 
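    // Illustrative values: for UImm = 0xffff000012340000 and OrrImm =
    // 0xffff0000ffff0000 (chunk 3 replicated into chunk 1), the ORR above
    // materializes OrrImm and this MOVK patches chunk 1 back:
    // "movk x0, #0x1234, lsl #16".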
- const unsigned Imm16 = getChunk(UImm, ChunkIdx); - const unsigned DstReg = MI.getOperand(0).getReg(); - const bool DstIsDead = MI.getOperand(0).isDead(); - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt)); - - transferImpOps(MI, MIB, MIB1); - MI.eraseFromParent(); - return true; - } - - return false; -} - -/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width -/// can be materialized with an ORR instruction. -static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) { - Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk; - - return ARM64_AM::processLogicalImmediate(Chunk, 64, Encoding); -} - -/// \brief Check for identical 16-bit chunks within the constant and if so -/// materialize them with a single ORR instruction. The remaining one or two -/// 16-bit chunks will be materialized with MOVK instructions. -/// -/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order -/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with -/// an ORR instruction. -/// -static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const ARM64InstrInfo *TII) { - typedef DenseMap CountMap; - CountMap Counts; - - // Scan the constant and count how often every chunk occurs. - for (unsigned Idx = 0; Idx < 4; ++Idx) - ++Counts[getChunk(UImm, Idx)]; - - // Traverse the chunks to find one which occurs more than once. - for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end(); - Chunk != End; ++Chunk) { - const uint64_t ChunkVal = Chunk->first; - const unsigned Count = Chunk->second; - - uint64_t Encoding = 0; - - // We are looking for chunks which have two or three instances and can be - // materialized with an ORR instruction. - if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding)) - continue; - - const bool CountThree = Count == 3; - // Create the ORR-immediate instruction. - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri)) - .addOperand(MI.getOperand(0)) - .addReg(ARM64::XZR) - .addImm(Encoding); - - const unsigned DstReg = MI.getOperand(0).getReg(); - const bool DstIsDead = MI.getOperand(0).isDead(); - - unsigned ShiftAmt = 0; - uint64_t Imm16 = 0; - // Find the first chunk not materialized with the ORR instruction. - for (; ShiftAmt < 64; ShiftAmt += 16) { - Imm16 = (UImm >> ShiftAmt) & 0xFFFF; - - if (Imm16 != ChunkVal) - break; - } - - // Create the first MOVK instruction. - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, - RegState::Define | getDeadRegState(DstIsDead && CountThree)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt)); - - // In case we have three instances the whole constant is now materialized - // and we can exit. - if (CountThree) { - transferImpOps(MI, MIB, MIB1); - MI.eraseFromParent(); - return true; - } - - // Find the remaining chunk which needs to be materialized. - for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) { - Imm16 = (UImm >> ShiftAmt) & 0xFFFF; - - if (Imm16 != ChunkVal) - break; - } - - // Create the second MOVK instruction. 
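    // Illustrative values: for UImm = 0x55551234555500ff the ORR materialized
    // the repeated chunk as 0x5555555555555555, the first MOVK patched chunk 0
    // to 0x00ff, and this second MOVK patches chunk 2 to 0x1234.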
- MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt)); - - transferImpOps(MI, MIB, MIB2); - MI.eraseFromParent(); - return true; - } - - return false; -} - -/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern -/// starts a contiguous sequence of ones if we look at the bits from the LSB -/// towards the MSB. -static bool isStartChunk(uint64_t Chunk) { - if (Chunk == 0 || Chunk == UINT64_MAX) - return false; - - return (CountLeadingOnes_64(Chunk) + countTrailingZeros(Chunk)) == 64; -} - -/// \brief Check whether this chunk matches the pattern '0...1...' This pattern -/// ends a contiguous sequence of ones if we look at the bits from the LSB -/// towards the MSB. -static bool isEndChunk(uint64_t Chunk) { - if (Chunk == 0 || Chunk == UINT64_MAX) - return false; - - return (countLeadingZeros(Chunk) + CountTrailingOnes_64(Chunk)) == 64; -} - -/// \brief Clear or set all bits in the chunk at the given index. -static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) { - const uint64_t Mask = 0xFFFF; - - if (Clear) - // Clear chunk in the immediate. - Imm &= ~(Mask << (Idx * 16)); - else - // Set all bits in the immediate for the particular chunk. - Imm |= Mask << (Idx * 16); - - return Imm; -} - -/// \brief Check whether the constant contains a sequence of contiguous ones, -/// which might be interrupted by one or two chunks. If so, materialize the -/// sequence of contiguous ones with an ORR instruction. -/// Materialize the chunks which are either interrupting the sequence or outside -/// of the sequence with a MOVK instruction. -/// -/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk -/// which ends the sequence (0...1...). Then we are looking for constants which -/// contain at least one S and E chunk. -/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|. -/// -/// We are also looking for constants like |S|A|B|E| where the contiguous -/// sequence of ones wraps around the MSB into the LSB. -/// -static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const ARM64InstrInfo *TII) { - const int NotSet = -1; - const uint64_t Mask = 0xFFFF; - - int StartIdx = NotSet; - int EndIdx = NotSet; - // Try to find the chunks which start/end a contiguous sequence of ones. - for (int Idx = 0; Idx < 4; ++Idx) { - int64_t Chunk = getChunk(UImm, Idx); - // Sign extend the 16-bit chunk to 64-bit. - Chunk = (Chunk << 48) >> 48; - - if (isStartChunk(Chunk)) - StartIdx = Idx; - else if (isEndChunk(Chunk)) - EndIdx = Idx; - } - - // Early exit in case we can't find a start/end chunk. - if (StartIdx == NotSet || EndIdx == NotSet) - return false; - - // Outside of the contiguous sequence of ones everything needs to be zero. - uint64_t Outside = 0; - // Chunks between the start and end chunk need to have all their bits set. - uint64_t Inside = Mask; - - // If our contiguous sequence of ones wraps around from the MSB into the LSB, - // just swap indices and pretend we are materializing a contiguous sequence - // of zeros surrounded by a contiguous sequence of ones. 
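  // Illustrative value: in 0xfff0123456781fff the run of ones wraps around
  // (start chunk 0xfff0 at index 3, end chunk 0x1fff at index 0), so the swap
  // below exchanges the roles: Outside becomes all-ones, Inside all-zeros.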
- if (StartIdx > EndIdx) { - std::swap(StartIdx, EndIdx); - std::swap(Outside, Inside); - } - - uint64_t OrrImm = UImm; - int FirstMovkIdx = NotSet; - int SecondMovkIdx = NotSet; - - // Find out which chunks we need to patch up to obtain a contiguous sequence - // of ones. - for (int Idx = 0; Idx < 4; ++Idx) { - const uint64_t Chunk = getChunk(UImm, Idx); - - // Check whether we are looking at a chunk which is not part of the - // contiguous sequence of ones. - if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) { - OrrImm = updateImm(OrrImm, Idx, Outside == 0); - - // Remember the index we need to patch. - if (FirstMovkIdx == NotSet) - FirstMovkIdx = Idx; - else - SecondMovkIdx = Idx; - - // Check whether we are looking a chunk which is part of the contiguous - // sequence of ones. - } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) { - OrrImm = updateImm(OrrImm, Idx, Inside != Mask); - - // Remember the index we need to patch. - if (FirstMovkIdx == NotSet) - FirstMovkIdx = Idx; - else - SecondMovkIdx = Idx; - } - } - assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!"); - - // Create the ORR-immediate instruction. - uint64_t Encoding = 0; - ARM64_AM::processLogicalImmediate(OrrImm, 64, Encoding); - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri)) - .addOperand(MI.getOperand(0)) - .addReg(ARM64::XZR) - .addImm(Encoding); - - const unsigned DstReg = MI.getOperand(0).getReg(); - const bool DstIsDead = MI.getOperand(0).isDead(); - - const bool SingleMovk = SecondMovkIdx == NotSet; - // Create the first MOVK instruction. - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, - RegState::Define | getDeadRegState(DstIsDead && SingleMovk)) - .addReg(DstReg) - .addImm(getChunk(UImm, FirstMovkIdx)) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, FirstMovkIdx * 16)); - - // Early exit in case we only need to emit a single MOVK instruction. - if (SingleMovk) { - transferImpOps(MI, MIB, MIB1); - MI.eraseFromParent(); - return true; - } - - // Create the second MOVK instruction. - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg) - .addImm(getChunk(UImm, SecondMovkIdx)) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, SecondMovkIdx * 16)); - - transferImpOps(MI, MIB, MIB2); - MI.eraseFromParent(); - return true; -} - -/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more -/// real move-immediate instructions to synthesize the immediate. -bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned BitSize) { - MachineInstr &MI = *MBBI; - uint64_t Imm = MI.getOperand(1).getImm(); - const unsigned Mask = 0xFFFF; - - // Try a MOVI instruction (aka ORR-immediate with the zero register). - uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); - uint64_t Encoding; - if (ARM64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { - unsigned Opc = (BitSize == 32 ? ARM64::ORRWri : ARM64::ORRXri); - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) - .addOperand(MI.getOperand(0)) - .addReg(BitSize == 32 ? ARM64::WZR : ARM64::XZR) - .addImm(Encoding); - transferImpOps(MI, MIB, MIB); - MI.eraseFromParent(); - return true; - } - - // Scan the immediate and count the number of 16-bit chunks which are either - // all ones or all zeros. 
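  // E.g. 0xffff000000001234 yields OneChunks = 1 and ZeroChunks = 2.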
- unsigned OneChunks = 0; - unsigned ZeroChunks = 0; - for (unsigned Shift = 0; Shift < BitSize; Shift += 16) { - const unsigned Chunk = (Imm >> Shift) & Mask; - if (Chunk == Mask) - OneChunks++; - else if (Chunk == 0) - ZeroChunks++; - } - - // Since we can't materialize the constant with a single ORR instruction, - // let's see whether we can materialize 3/4 of the constant with an ORR - // instruction and use an additional MOVK instruction to materialize the - // remaining 1/4. - // - // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|. - // - // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR, - // we would create the following instruction sequence: - // - // ORR x0, xzr, |A|X|A|X| - // MOVK x0, |B|, LSL #16 - // - // Only look at 64-bit constants which can't be materialized with a single - // instruction e.g. which have less than either three all zero or all one - // chunks. - // - // Ignore 32-bit constants here, they always can be materialized with a - // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized - // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair. - // Thus we fall back to the default code below which in the best case creates - // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one). - // - if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) { - // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2 - // identical? - if (getChunk(UImm, 0) == getChunk(UImm, 2)) { - // See if we can come up with a constant which can be materialized with - // ORR-immediate by replicating element 3 into element 1. - uint64_t OrrImm = replicateChunk(UImm, 3, 1); - if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1)) - return true; - - // See if we can come up with a constant which can be materialized with - // ORR-immediate by replicating element 1 into element 3. - OrrImm = replicateChunk(UImm, 1, 3); - if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3)) - return true; - - // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3 - // identical? - } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) { - // See if we can come up with a constant which can be materialized with - // ORR-immediate by replicating element 2 into element 0. - uint64_t OrrImm = replicateChunk(UImm, 2, 0); - if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0)) - return true; - - // See if we can come up with a constant which can be materialized with - // ORR-immediate by replicating element 1 into element 3. - OrrImm = replicateChunk(UImm, 0, 2); - if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2)) - return true; - } - } - - // Check for identical 16-bit chunks within the constant and if so materialize - // them with a single ORR instruction. The remaining one or two 16-bit chunks - // will be materialized with MOVK instructions. - if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII)) - return true; - - // Check whether the constant contains a sequence of contiguous ones, which - // might be interrupted by one or two chunks. If so, materialize the sequence - // of contiguous ones with an ORR instruction. Materialize the chunks which - // are either interrupting the sequence or outside of the sequence with a - // MOVK instruction. 
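  // Illustrative value: 0x00ffffff1234f000 is a run of ones from bit 12 to
  // bit 55 interrupted at chunk 1; the ORR materializes 0x00fffffffffff000
  // and a single "movk x0, #0x1234, lsl #16" restores the interrupted chunk.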
- if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII)) - return true; - - // Use a MOVZ or MOVN instruction to set the high bits, followed by one or - // more MOVK instructions to insert additional 16-bit portions into the - // lower bits. - bool isNeg = false; - - // Use MOVN to materialize the high bits if we have more all one chunks - // than all zero chunks. - if (OneChunks > ZeroChunks) { - isNeg = true; - Imm = ~Imm; - } - - unsigned FirstOpc; - if (BitSize == 32) { - Imm &= (1LL << 32) - 1; - FirstOpc = (isNeg ? ARM64::MOVNWi : ARM64::MOVZWi); - } else { - FirstOpc = (isNeg ? ARM64::MOVNXi : ARM64::MOVZXi); - } - unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN - unsigned LastShift = 0; // LSL amount for last MOVK - if (Imm != 0) { - unsigned LZ = countLeadingZeros(Imm); - unsigned TZ = countTrailingZeros(Imm); - Shift = ((63 - LZ) / 16) * 16; - LastShift = (TZ / 16) * 16; - } - unsigned Imm16 = (Imm >> Shift) & Mask; - unsigned DstReg = MI.getOperand(0).getReg(); - bool DstIsDead = MI.getOperand(0).isDead(); - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc)) - .addReg(DstReg, RegState::Define | - getDeadRegState(DstIsDead && Shift == LastShift)) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, Shift)); - - // If a MOVN was used for the high bits of a negative value, flip the rest - // of the bits back for use with MOVK. - if (isNeg) - Imm = ~Imm; - - if (Shift == LastShift) { - transferImpOps(MI, MIB1, MIB1); - MI.eraseFromParent(); - return true; - } - - MachineInstrBuilder MIB2; - unsigned Opc = (BitSize == 32 ? ARM64::MOVKWi : ARM64::MOVKXi); - while (Shift != LastShift) { - Shift -= 16; - Imm16 = (Imm >> Shift) & Mask; - if (Imm16 == (isNeg ? Mask : 0)) - continue; // This 16-bit portion is already set correctly. - MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(DstIsDead && Shift == LastShift)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, Shift)); - } - - transferImpOps(MI, MIB1, MIB2); - MI.eraseFromParent(); - return true; -} - -/// \brief If MBBI references a pseudo instruction that should be expanded here, -/// do the expansion and return true. Otherwise return false. 
-bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) { - MachineInstr &MI = *MBBI; - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - default: - break; - - case ARM64::ADDWrr: - case ARM64::SUBWrr: - case ARM64::ADDXrr: - case ARM64::SUBXrr: - case ARM64::ADDSWrr: - case ARM64::SUBSWrr: - case ARM64::ADDSXrr: - case ARM64::SUBSXrr: - case ARM64::ANDWrr: - case ARM64::ANDXrr: - case ARM64::BICWrr: - case ARM64::BICXrr: - case ARM64::EONWrr: - case ARM64::EONXrr: - case ARM64::EORWrr: - case ARM64::EORXrr: - case ARM64::ORNWrr: - case ARM64::ORNXrr: - case ARM64::ORRWrr: - case ARM64::ORRXrr: { - unsigned Opcode; - switch (MI.getOpcode()) { - default: - return false; - case ARM64::ADDWrr: Opcode = ARM64::ADDWrs; break; - case ARM64::SUBWrr: Opcode = ARM64::SUBWrs; break; - case ARM64::ADDXrr: Opcode = ARM64::ADDXrs; break; - case ARM64::SUBXrr: Opcode = ARM64::SUBXrs; break; - case ARM64::ADDSWrr: Opcode = ARM64::ADDSWrs; break; - case ARM64::SUBSWrr: Opcode = ARM64::SUBSWrs; break; - case ARM64::ADDSXrr: Opcode = ARM64::ADDSXrs; break; - case ARM64::SUBSXrr: Opcode = ARM64::SUBSXrs; break; - case ARM64::ANDWrr: Opcode = ARM64::ANDWrs; break; - case ARM64::ANDXrr: Opcode = ARM64::ANDXrs; break; - case ARM64::BICWrr: Opcode = ARM64::BICWrs; break; - case ARM64::BICXrr: Opcode = ARM64::BICXrs; break; - case ARM64::EONWrr: Opcode = ARM64::EONWrs; break; - case ARM64::EONXrr: Opcode = ARM64::EONXrs; break; - case ARM64::EORWrr: Opcode = ARM64::EORWrs; break; - case ARM64::EORXrr: Opcode = ARM64::EORXrs; break; - case ARM64::ORNWrr: Opcode = ARM64::ORNWrs; break; - case ARM64::ORNXrr: Opcode = ARM64::ORNXrs; break; - case ARM64::ORRWrr: Opcode = ARM64::ORRWrs; break; - case ARM64::ORRXrr: Opcode = ARM64::ORRXrs; break; - } - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode), - MI.getOperand(0).getReg()) - .addOperand(MI.getOperand(1)) - .addOperand(MI.getOperand(2)) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - transferImpOps(MI, MIB1, MIB1); - MI.eraseFromParent(); - return true; - } - - case ARM64::FCVTSHpseudo: { - MachineOperand Src = MI.getOperand(1); - Src.setImplicit(); - unsigned SrcH = TII->getRegisterInfo().getSubReg(Src.getReg(), ARM64::hsub); - auto MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::FCVTSHr)) - .addOperand(MI.getOperand(0)) - .addReg(SrcH, RegState::Undef) - .addOperand(Src); - transferImpOps(MI, MIB, MIB); - MI.eraseFromParent(); - return true; - } - case ARM64::LOADgot: { - // Expand into ADRP + LDR. 
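    // Roughly "adrp xD, :got:sym" followed by "ldr xD, [xD, :got_lo12:sym]".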
- unsigned DstReg = MI.getOperand(0).getReg(); - const MachineOperand &MO1 = MI.getOperand(1); - unsigned Flags = MO1.getTargetFlags(); - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADRP), DstReg); - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::LDRXui)) - .addOperand(MI.getOperand(0)) - .addReg(DstReg); - - if (MO1.isGlobal()) { - MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | ARM64II::MO_PAGE); - MIB2.addGlobalAddress(MO1.getGlobal(), 0, - Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - } else if (MO1.isSymbol()) { - MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | ARM64II::MO_PAGE); - MIB2.addExternalSymbol(MO1.getSymbolName(), - Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - } else { - assert(MO1.isCPI() && - "Only expect globals, externalsymbols, or constant pools"); - MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), - Flags | ARM64II::MO_PAGE); - MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), - Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - } - - transferImpOps(MI, MIB1, MIB2); - MI.eraseFromParent(); - return true; - } - - case ARM64::MOVaddr: - case ARM64::MOVaddrJT: - case ARM64::MOVaddrCP: - case ARM64::MOVaddrBA: - case ARM64::MOVaddrTLS: - case ARM64::MOVaddrEXT: { - // Expand into ADRP + ADD. - unsigned DstReg = MI.getOperand(0).getReg(); - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADRP), DstReg) - .addOperand(MI.getOperand(1)); - - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADDXri)) - .addOperand(MI.getOperand(0)) - .addReg(DstReg) - .addOperand(MI.getOperand(2)) - .addImm(0); - - transferImpOps(MI, MIB1, MIB2); - MI.eraseFromParent(); - return true; - } - - case ARM64::MOVi32imm: - return expandMOVImm(MBB, MBBI, 32); - case ARM64::MOVi64imm: - return expandMOVImm(MBB, MBBI, 64); - case ARM64::RET_ReallyLR: - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::RET)) - .addReg(ARM64::LR); - MI.eraseFromParent(); - return true; - } - return false; -} - -/// \brief Iterate over the instructions in basic block MBB and expand any -/// pseudo instructions. Return true if anything was modified. -bool ARM64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { - bool Modified = false; - - MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - while (MBBI != E) { - MachineBasicBlock::iterator NMBBI = std::next(MBBI); - Modified |= expandMI(MBB, MBBI); - MBBI = NMBBI; - } - - return Modified; -} - -bool ARM64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast(MF.getTarget().getInstrInfo()); - - bool Modified = false; - for (auto &MBB : MF) - Modified |= expandMBB(MBB); - return Modified; -} - -/// \brief Returns an instance of the pseudo instruction expansion pass. -FunctionPass *llvm::createARM64ExpandPseudoPass() { - return new ARM64ExpandPseudo(); -} diff --git a/lib/Target/ARM64/ARM64FastISel.cpp b/lib/Target/ARM64/ARM64FastISel.cpp deleted file mode 100644 index 51b0f76..0000000 --- a/lib/Target/ARM64/ARM64FastISel.cpp +++ /dev/null @@ -1,1929 +0,0 @@ -//===-- ARM6464FastISel.cpp - ARM64 FastISel implementation ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the ARM64-specific support for the FastISel class. 
Some -// of the target-specific code is generated by tablegen in the file -// ARM64GenFastISel.inc, which is #included here. -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64TargetMachine.h" -#include "ARM64Subtarget.h" -#include "ARM64CallingConv.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/FastISel.h" -#include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/GlobalAlias.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Operator.h" -#include "llvm/Support/CommandLine.h" -using namespace llvm; - -namespace { - -class ARM64FastISel : public FastISel { - - class Address { - public: - typedef enum { - RegBase, - FrameIndexBase - } BaseKind; - - private: - BaseKind Kind; - union { - unsigned Reg; - int FI; - } Base; - int64_t Offset; - - public: - Address() : Kind(RegBase), Offset(0) { Base.Reg = 0; } - void setKind(BaseKind K) { Kind = K; } - BaseKind getKind() const { return Kind; } - bool isRegBase() const { return Kind == RegBase; } - bool isFIBase() const { return Kind == FrameIndexBase; } - void setReg(unsigned Reg) { - assert(isRegBase() && "Invalid base register access!"); - Base.Reg = Reg; - } - unsigned getReg() const { - assert(isRegBase() && "Invalid base register access!"); - return Base.Reg; - } - void setFI(unsigned FI) { - assert(isFIBase() && "Invalid base frame index access!"); - Base.FI = FI; - } - unsigned getFI() const { - assert(isFIBase() && "Invalid base frame index access!"); - return Base.FI; - } - void setOffset(int64_t O) { Offset = O; } - int64_t getOffset() { return Offset; } - - bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); } - }; - - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when generating code for different targets. - const ARM64Subtarget *Subtarget; - LLVMContext *Context; - -private: - // Selection routines. - bool SelectLoad(const Instruction *I); - bool SelectStore(const Instruction *I); - bool SelectBranch(const Instruction *I); - bool SelectIndirectBr(const Instruction *I); - bool SelectCmp(const Instruction *I); - bool SelectSelect(const Instruction *I); - bool SelectFPExt(const Instruction *I); - bool SelectFPTrunc(const Instruction *I); - bool SelectFPToInt(const Instruction *I, bool Signed); - bool SelectIntToFP(const Instruction *I, bool Signed); - bool SelectRem(const Instruction *I, unsigned ISDOpcode); - bool SelectCall(const Instruction *I, const char *IntrMemName); - bool SelectIntrinsicCall(const IntrinsicInst &I); - bool SelectRet(const Instruction *I); - bool SelectTrunc(const Instruction *I); - bool SelectIntExt(const Instruction *I); - bool SelectMul(const Instruction *I); - - // Utility helper routines. 
- bool isTypeLegal(Type *Ty, MVT &VT); - bool isLoadStoreTypeLegal(Type *Ty, MVT &VT); - bool ComputeAddress(const Value *Obj, Address &Addr); - bool SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor, - bool UseUnscaled); - void AddLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, - unsigned Flags, bool UseUnscaled); - bool IsMemCpySmall(uint64_t Len, unsigned Alignment); - bool TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, - unsigned Alignment); - // Emit functions. - bool EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt); - bool EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, - bool UseUnscaled = false); - bool EmitStore(MVT VT, unsigned SrcReg, Address Addr, - bool UseUnscaled = false); - unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); - unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt); - - unsigned ARM64MaterializeFP(const ConstantFP *CFP, MVT VT); - unsigned ARM64MaterializeGV(const GlobalValue *GV); - - // Call handling routines. -private: - CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const; - bool ProcessCallArgs(SmallVectorImpl &Args, - SmallVectorImpl &ArgRegs, - SmallVectorImpl &ArgVTs, - SmallVectorImpl &ArgFlags, - SmallVectorImpl &RegArgs, CallingConv::ID CC, - unsigned &NumBytes); - bool FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, - const Instruction *I, CallingConv::ID CC, unsigned &NumBytes); - -public: - // Backend specific FastISel code. - virtual unsigned TargetMaterializeAlloca(const AllocaInst *AI); - virtual unsigned TargetMaterializeConstant(const Constant *C); - - explicit ARM64FastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) - : FastISel(funcInfo, libInfo) { - Subtarget = &TM.getSubtarget(); - Context = &funcInfo.Fn->getContext(); - } - - virtual bool TargetSelectInstruction(const Instruction *I); - -#include "ARM64GenFastISel.inc" -}; - -} // end anonymous namespace - -#include "ARM64GenCallingConv.inc" - -CCAssignFn *ARM64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { - if (CC == CallingConv::WebKit_JS) - return CC_ARM64_WebKit_JS; - return Subtarget->isTargetDarwin() ? CC_ARM64_DarwinPCS : CC_ARM64_AAPCS; -} - -unsigned ARM64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) { - assert(TLI.getValueType(AI->getType(), true) == MVT::i64 && - "Alloca should always return a pointer."); - - // Don't handle dynamic allocas. - if (!FuncInfo.StaticAllocaMap.count(AI)) - return 0; - - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); - - if (SI != FuncInfo.StaticAllocaMap.end()) { - unsigned ResultReg = createResultReg(&ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADDXri), - ResultReg) - .addFrameIndex(SI->second) - .addImm(0) - .addImm(0); - return ResultReg; - } - - return 0; -} - -unsigned ARM64FastISel::ARM64MaterializeFP(const ConstantFP *CFP, MVT VT) { - const APFloat Val = CFP->getValueAPF(); - bool is64bit = (VT == MVT::f64); - - // This checks to see if we can use FMOV instructions to materialize - // a constant, otherwise we have to materialize via the constant pool. 
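  // E.g. +2.0 fits the 8-bit FMOV immediate encoding ("fmov d0, #2.0"),
  // while a value such as 0.1 does not and takes the constant-pool path below.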
- if (TLI.isFPImmLegal(Val, VT)) { - int Imm; - unsigned Opc; - if (is64bit) { - Imm = ARM64_AM::getFP64Imm(Val); - Opc = ARM64::FMOVDi; - } else { - Imm = ARM64_AM::getFP32Imm(Val); - Opc = ARM64::FMOVSi; - } - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addImm(Imm); - return ResultReg; - } - - // Materialize via constant pool. MachineConstantPool wants an explicit - // alignment. - unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); - if (Align == 0) - Align = DL.getTypeAllocSize(CFP->getType()); - - unsigned Idx = MCP.getConstantPoolIndex(cast(CFP), Align); - unsigned ADRPReg = createResultReg(&ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP), - ADRPReg).addConstantPoolIndex(Idx, 0, ARM64II::MO_PAGE); - - unsigned Opc = is64bit ? ARM64::LDRDui : ARM64::LDRSui; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(ADRPReg) - .addConstantPoolIndex(Idx, 0, ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - return ResultReg; -} - -unsigned ARM64FastISel::ARM64MaterializeGV(const GlobalValue *GV) { - // We can't handle thread-local variables quickly yet. Unfortunately we have - // to peer through any aliases to find out if that rule applies. - const GlobalValue *TLSGV = GV; - if (const GlobalAlias *GA = dyn_cast(GV)) - TLSGV = GA->getAliasedGlobal(); - - if (const GlobalVariable *GVar = dyn_cast(TLSGV)) - if (GVar->isThreadLocal()) - return 0; - - unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); - - EVT DestEVT = TLI.getValueType(GV->getType(), true); - if (!DestEVT.isSimple()) - return 0; - MVT DestVT = DestEVT.getSimpleVT(); - - unsigned ADRPReg = createResultReg(&ARM64::GPR64RegClass); - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - - if (OpFlags & ARM64II::MO_GOT) { - // ADRP + LDRX - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP), - ADRPReg) - .addGlobalAddress(GV, 0, ARM64II::MO_GOT | ARM64II::MO_PAGE); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::LDRXui), - ResultReg) - .addReg(ADRPReg) - .addGlobalAddress(GV, 0, ARM64II::MO_GOT | ARM64II::MO_PAGEOFF | - ARM64II::MO_NC); - } else { - // ADRP + ADDX - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP), - ADRPReg).addGlobalAddress(GV, 0, ARM64II::MO_PAGE); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADDXri), - ResultReg) - .addReg(ADRPReg) - .addGlobalAddress(GV, 0, ARM64II::MO_PAGEOFF | ARM64II::MO_NC) - .addImm(0); - } - return ResultReg; -} - -unsigned ARM64FastISel::TargetMaterializeConstant(const Constant *C) { - EVT CEVT = TLI.getValueType(C->getType(), true); - - // Only handle simple types. - if (!CEVT.isSimple()) - return 0; - MVT VT = CEVT.getSimpleVT(); - - // FIXME: Handle ConstantInt. - if (const ConstantFP *CFP = dyn_cast(C)) - return ARM64MaterializeFP(CFP, VT); - else if (const GlobalValue *GV = dyn_cast(C)) - return ARM64MaterializeGV(GV); - - return 0; -} - -// Computes the address to get to an object. -bool ARM64FastISel::ComputeAddress(const Value *Obj, Address &Addr) { - const User *U = NULL; - unsigned Opcode = Instruction::UserOp1; - if (const Instruction *I = dyn_cast(Obj)) { - // Don't walk into other basic blocks unless the object is an alloca from - // another block, otherwise it may not have a virtual register assigned. 
-    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
-        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
-      Opcode = I->getOpcode();
-      U = I;
-    }
-  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
-    Opcode = C->getOpcode();
-    U = C;
-  }
-
-  if (const PointerType *Ty = dyn_cast<PointerType>(Obj->getType()))
-    if (Ty->getAddressSpace() > 255)
-      // Fast instruction selection doesn't support the special
-      // address spaces.
-      return false;
-
-  switch (Opcode) {
-  default:
-    break;
-  case Instruction::BitCast: {
-    // Look through bitcasts.
-    return ComputeAddress(U->getOperand(0), Addr);
-  }
-  case Instruction::IntToPtr: {
-    // Look past no-op inttoptrs.
-    if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
-      return ComputeAddress(U->getOperand(0), Addr);
-    break;
-  }
-  case Instruction::PtrToInt: {
-    // Look past no-op ptrtoints.
-    if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
-      return ComputeAddress(U->getOperand(0), Addr);
-    break;
-  }
-  case Instruction::GetElementPtr: {
-    Address SavedAddr = Addr;
-    uint64_t TmpOffset = Addr.getOffset();
-
-    // Iterate through the GEP folding the constants into offsets where
-    // we can.
-    gep_type_iterator GTI = gep_type_begin(U);
-    for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e;
-         ++i, ++GTI) {
-      const Value *Op = *i;
-      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
-        const StructLayout *SL = DL.getStructLayout(STy);
-        unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
-        TmpOffset += SL->getElementOffset(Idx);
-      } else {
-        uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
-        for (;;) {
-          if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
-            // Constant-offset addressing.
-            TmpOffset += CI->getSExtValue() * S;
-            break;
-          }
-          if (canFoldAddIntoGEP(U, Op)) {
-            // A compatible add with a constant operand. Fold the constant.
-            ConstantInt *CI =
-                cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
-            TmpOffset += CI->getSExtValue() * S;
-            // Iterate on the other operand.
-            Op = cast<AddOperator>(Op)->getOperand(0);
-            continue;
-          }
-          // Unsupported
-          goto unsupported_gep;
-        }
-      }
-    }
-
-    // Try to grab the base operand now.
-    Addr.setOffset(TmpOffset);
-    if (ComputeAddress(U->getOperand(0), Addr))
-      return true;
-
-    // We failed, restore everything and try the other options.
-    Addr = SavedAddr;
-
-  unsupported_gep:
-    break;
-  }
-  case Instruction::Alloca: {
-    const AllocaInst *AI = cast<AllocaInst>(Obj);
-    DenseMap<const AllocaInst *, int>::iterator SI =
-        FuncInfo.StaticAllocaMap.find(AI);
-    if (SI != FuncInfo.StaticAllocaMap.end()) {
-      Addr.setKind(Address::FrameIndexBase);
-      Addr.setFI(SI->second);
-      return true;
-    }
-    break;
-  }
-  }
-
-  // Try to get this in a register if nothing else has worked.
-  if (!Addr.isValid())
-    Addr.setReg(getRegForValue(Obj));
-  return Addr.isValid();
-}
-
-bool ARM64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
-  EVT evt = TLI.getValueType(Ty, true);
-
-  // Only handle simple types.
-  if (evt == MVT::Other || !evt.isSimple())
-    return false;
-  VT = evt.getSimpleVT();
-
-  // Handle all legal types, i.e. a register that will directly hold this
-  // value.
-  return TLI.isTypeLegal(VT);
-}
-
-bool ARM64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) {
-  if (isTypeLegal(Ty, VT))
-    return true;
-
-  // If this is a type that can be sign- or zero-extended to a basic operation,
-  // go ahead and accept it now. For stores, this reflects truncation.
- if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16) - return true; - - return false; -} - -bool ARM64FastISel::SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor, - bool UseUnscaled) { - bool needsLowering = false; - int64_t Offset = Addr.getOffset(); - switch (VT.SimpleTy) { - default: - return false; - case MVT::i1: - case MVT::i8: - case MVT::i16: - case MVT::i32: - case MVT::i64: - case MVT::f32: - case MVT::f64: - if (!UseUnscaled) - // Using scaled, 12-bit, unsigned immediate offsets. - needsLowering = ((Offset & 0xfff) != Offset); - else - // Using unscaled, 9-bit, signed immediate offsets. - needsLowering = (Offset > 256 || Offset < -256); - break; - } - - // FIXME: If this is a stack pointer and the offset needs to be simplified - // then put the alloca address into a register, set the base type back to - // register and continue. This should almost never happen. - if (needsLowering && Addr.getKind() == Address::FrameIndexBase) { - return false; - } - - // Since the offset is too large for the load/store instruction get the - // reg+offset into a register. - if (needsLowering) { - uint64_t UnscaledOffset = Addr.getOffset() * ScaleFactor; - unsigned ResultReg = FastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), false, - UnscaledOffset, MVT::i64); - if (ResultReg == 0) - return false; - Addr.setReg(ResultReg); - Addr.setOffset(0); - } - return true; -} - -void ARM64FastISel::AddLoadStoreOperands(Address &Addr, - const MachineInstrBuilder &MIB, - unsigned Flags, bool UseUnscaled) { - int64_t Offset = Addr.getOffset(); - // Frame base works a bit differently. Handle it separately. - if (Addr.getKind() == Address::FrameIndexBase) { - int FI = Addr.getFI(); - // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size - // and alignment should be based on the VT. - MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI, Offset), Flags, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); - // Now add the rest of the operands. - MIB.addFrameIndex(FI).addImm(Offset).addMemOperand(MMO); - } else { - // Now add the rest of the operands. - MIB.addReg(Addr.getReg()); - MIB.addImm(Offset); - } -} - -bool ARM64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, - bool UseUnscaled) { - // Negative offsets require unscaled, 9-bit, signed immediate offsets. - // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. - if (!UseUnscaled && Addr.getOffset() < 0) - UseUnscaled = true; - - unsigned Opc; - const TargetRegisterClass *RC; - bool VTIsi1 = false; - int64_t ScaleFactor = 0; - switch (VT.SimpleTy) { - default: - return false; - case MVT::i1: - VTIsi1 = true; - // Intentional fall-through. - case MVT::i8: - Opc = UseUnscaled ? ARM64::LDURBBi : ARM64::LDRBBui; - RC = &ARM64::GPR32RegClass; - ScaleFactor = 1; - break; - case MVT::i16: - Opc = UseUnscaled ? ARM64::LDURHHi : ARM64::LDRHHui; - RC = &ARM64::GPR32RegClass; - ScaleFactor = 2; - break; - case MVT::i32: - Opc = UseUnscaled ? ARM64::LDURWi : ARM64::LDRWui; - RC = &ARM64::GPR32RegClass; - ScaleFactor = 4; - break; - case MVT::i64: - Opc = UseUnscaled ? ARM64::LDURXi : ARM64::LDRXui; - RC = &ARM64::GPR64RegClass; - ScaleFactor = 8; - break; - case MVT::f32: - Opc = UseUnscaled ? ARM64::LDURSi : ARM64::LDRSui; - RC = TLI.getRegClassFor(VT); - ScaleFactor = 4; - break; - case MVT::f64: - Opc = UseUnscaled ? ARM64::LDURDi : ARM64::LDRDui; - RC = TLI.getRegClassFor(VT); - ScaleFactor = 8; - break; - } - // Scale the offset. 
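The two addressing forms juggled above differ only in how the immediate is encoded: the LDR/STR "ui" variants take a scaled, 12-bit unsigned offset (a multiple of the access size), while the LDUR/STUR variants take an unscaled, 9-bit signed byte offset. A minimal sketch of the two encodability tests (plain C++; function names invented here; note the architectural unscaled range is -256..255, whereas the SimplifyAddress check above also lets 256 through):

    #include <cstdint>

    // Scaled form: offset must be a non-negative multiple of the access
    // size whose scaled value fits in the 12-bit unsigned field.
    bool fitsScaledUImm12(int64_t Offset, int64_t ScaleFactor) {
      if (Offset < 0 || Offset % ScaleFactor != 0)
        return false;
      return (Offset / ScaleFactor) <= 0xfff;
    }

    // Unscaled form: any byte offset in the signed 9-bit range.
    bool fitsUnscaledSImm9(int64_t Offset) {
      return Offset >= -256 && Offset <= 255;
    }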
- if (!UseUnscaled) { - int64_t Offset = Addr.getOffset(); - if (Offset & (ScaleFactor - 1)) - // Retry using an unscaled, 9-bit, signed immediate offset. - return EmitLoad(VT, ResultReg, Addr, /*UseUnscaled*/ true); - - Addr.setOffset(Offset / ScaleFactor); - } - - // Simplify this down to something we can handle. - if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled)) - return false; - - // Create the base instruction, then add the operands. - ResultReg = createResultReg(RC); - MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(Opc), ResultReg); - AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, UseUnscaled); - - // Loading an i1 requires special handling. - if (VTIsi1) { - unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) - .addReg(ResultReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - ResultReg = ANDReg; - } - return true; -} - -bool ARM64FastISel::SelectLoad(const Instruction *I) { - MVT VT; - // Verify we have a legal type before going any further. Currently, we handle - // simple types that will directly fit in a register (i32/f32/i64/f64) or - // those that can be sign or zero-extended to a basic operation (i1/i8/i16). - if (!isLoadStoreTypeLegal(I->getType(), VT) || cast(I)->isAtomic()) - return false; - - // See if we can handle this address. - Address Addr; - if (!ComputeAddress(I->getOperand(0), Addr)) - return false; - - unsigned ResultReg; - if (!EmitLoad(VT, ResultReg, Addr)) - return false; - - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr, - bool UseUnscaled) { - // Negative offsets require unscaled, 9-bit, signed immediate offsets. - // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. - if (!UseUnscaled && Addr.getOffset() < 0) - UseUnscaled = true; - - unsigned StrOpc; - bool VTIsi1 = false; - int64_t ScaleFactor = 0; - // Using scaled, 12-bit, unsigned immediate offsets. - switch (VT.SimpleTy) { - default: - return false; - case MVT::i1: - VTIsi1 = true; - case MVT::i8: - StrOpc = UseUnscaled ? ARM64::STURBBi : ARM64::STRBBui; - ScaleFactor = 1; - break; - case MVT::i16: - StrOpc = UseUnscaled ? ARM64::STURHHi : ARM64::STRHHui; - ScaleFactor = 2; - break; - case MVT::i32: - StrOpc = UseUnscaled ? ARM64::STURWi : ARM64::STRWui; - ScaleFactor = 4; - break; - case MVT::i64: - StrOpc = UseUnscaled ? ARM64::STURXi : ARM64::STRXui; - ScaleFactor = 8; - break; - case MVT::f32: - StrOpc = UseUnscaled ? ARM64::STURSi : ARM64::STRSui; - ScaleFactor = 4; - break; - case MVT::f64: - StrOpc = UseUnscaled ? ARM64::STURDi : ARM64::STRDui; - ScaleFactor = 8; - break; - } - // Scale the offset. - if (!UseUnscaled) { - int64_t Offset = Addr.getOffset(); - if (Offset & (ScaleFactor - 1)) - // Retry using an unscaled, 9-bit, signed immediate offset. - return EmitStore(VT, SrcReg, Addr, /*UseUnscaled*/ true); - - Addr.setOffset(Offset / ScaleFactor); - } - - // Simplify this down to something we can handle. - if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled)) - return false; - - // Storing an i1 requires special handling. 
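The special handling mentioned above is the same trick used on the load path: an i1 occupies a 32-bit register whose upper 31 bits are not guaranteed to be zero, so the value is masked down to bit 0 first. ANDWri with the logical-immediate encoding of 1 computes exactly this (a one-line sketch in plain C++; the helper name is invented):

    #include <cstdint>

    // What ANDWri Wd, Ws, #0x1 computes: keep bit 0, clear the rest.
    uint32_t maskToI1(uint32_t Reg) { return Reg & 0x1u; }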
- if (VTIsi1) { - unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) - .addReg(SrcReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - SrcReg = ANDReg; - } - // Create the base instruction, then add the operands. - MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(StrOpc)).addReg(SrcReg); - AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, UseUnscaled); - return true; -} - -bool ARM64FastISel::SelectStore(const Instruction *I) { - MVT VT; - Value *Op0 = I->getOperand(0); - // Verify we have a legal type before going any further. Currently, we handle - // simple types that will directly fit in a register (i32/f32/i64/f64) or - // those that can be sign or zero-extended to a basic operation (i1/i8/i16). - if (!isLoadStoreTypeLegal(Op0->getType(), VT) || - cast(I)->isAtomic()) - return false; - - // Get the value to be stored into a register. - unsigned SrcReg = getRegForValue(Op0); - if (SrcReg == 0) - return false; - - // See if we can handle this address. - Address Addr; - if (!ComputeAddress(I->getOperand(1), Addr)) - return false; - - if (!EmitStore(VT, SrcReg, Addr)) - return false; - return true; -} - -static ARM64CC::CondCode getCompareCC(CmpInst::Predicate Pred) { - switch (Pred) { - case CmpInst::FCMP_ONE: - case CmpInst::FCMP_UEQ: - default: - // AL is our "false" for now. The other two need more compares. - return ARM64CC::AL; - case CmpInst::ICMP_EQ: - case CmpInst::FCMP_OEQ: - return ARM64CC::EQ; - case CmpInst::ICMP_SGT: - case CmpInst::FCMP_OGT: - return ARM64CC::GT; - case CmpInst::ICMP_SGE: - case CmpInst::FCMP_OGE: - return ARM64CC::GE; - case CmpInst::ICMP_UGT: - case CmpInst::FCMP_UGT: - return ARM64CC::HI; - case CmpInst::FCMP_OLT: - return ARM64CC::MI; - case CmpInst::ICMP_ULE: - case CmpInst::FCMP_OLE: - return ARM64CC::LS; - case CmpInst::FCMP_ORD: - return ARM64CC::VC; - case CmpInst::FCMP_UNO: - return ARM64CC::VS; - case CmpInst::FCMP_UGE: - return ARM64CC::PL; - case CmpInst::ICMP_SLT: - case CmpInst::FCMP_ULT: - return ARM64CC::LT; - case CmpInst::ICMP_SLE: - case CmpInst::FCMP_ULE: - return ARM64CC::LE; - case CmpInst::FCMP_UNE: - case CmpInst::ICMP_NE: - return ARM64CC::NE; - case CmpInst::ICMP_UGE: - return ARM64CC::CS; - case CmpInst::ICMP_ULT: - return ARM64CC::CC; - } -} - -bool ARM64FastISel::SelectBranch(const Instruction *I) { - const BranchInst *BI = cast(I); - MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; - MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; - - if (const CmpInst *CI = dyn_cast(BI->getCondition())) { - if (CI->hasOneUse() && (CI->getParent() == I->getParent())) { - // We may not handle every CC for now. - ARM64CC::CondCode CC = getCompareCC(CI->getPredicate()); - if (CC == ARM64CC::AL) - return false; - - // Emit the cmp. - if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) - return false; - - // Emit the branch. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc)) - .addImm(CC) - .addMBB(TBB); - FuncInfo.MBB->addSuccessor(TBB); - - FastEmitBranch(FBB, DbgLoc); - return true; - } - } else if (TruncInst *TI = dyn_cast(BI->getCondition())) { - MVT SrcVT; - if (TI->hasOneUse() && TI->getParent() == I->getParent() && - (isLoadStoreTypeLegal(TI->getOperand(0)->getType(), SrcVT))) { - unsigned CondReg = getRegForValue(TI->getOperand(0)); - if (CondReg == 0) - return false; - - // Issue an extract_subreg to get the lower 32-bits. 
-      if (SrcVT == MVT::i64)
-        CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true,
-                                             ARM64::sub_32);
-
-      unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass);
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri),
-              ANDReg)
-          .addReg(CondReg)
-          .addImm(ARM64_AM::encodeLogicalImmediate(1, 32));
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri))
-          .addReg(ANDReg)
-          .addReg(ANDReg)
-          .addImm(0)
-          .addImm(0);
-
-      unsigned CC = ARM64CC::NE;
-      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
-        std::swap(TBB, FBB);
-        CC = ARM64CC::EQ;
-      }
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc))
-          .addImm(CC)
-          .addMBB(TBB);
-      FuncInfo.MBB->addSuccessor(TBB);
-      FastEmitBranch(FBB, DbgLoc);
-      return true;
-    }
-  } else if (const ConstantInt *CI =
-                 dyn_cast<ConstantInt>(BI->getCondition())) {
-    uint64_t Imm = CI->getZExtValue();
-    MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::B))
-        .addMBB(Target);
-    FuncInfo.MBB->addSuccessor(Target);
-    return true;
-  }
-
-  unsigned CondReg = getRegForValue(BI->getCondition());
-  if (CondReg == 0)
-    return false;
-
-  // We've been divorced from our compare! Our block was split, and
-  // now our compare lives in a predecessor block. We mustn't
-  // re-compare here, as the children of the compare aren't guaranteed
-  // live across the block boundary (we *could* check for this).
-  // Regardless, the compare has been done in the predecessor block,
-  // and it left a value for us in a virtual register. Ergo, we test
-  // the one-bit value left in the virtual register.
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri),
-          ARM64::WZR)
-      .addReg(CondReg)
-      .addImm(0)
-      .addImm(0);
-
-  unsigned CC = ARM64CC::NE;
-  if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
-    std::swap(TBB, FBB);
-    CC = ARM64CC::EQ;
-  }
-
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc))
-      .addImm(CC)
-      .addMBB(TBB);
-  FuncInfo.MBB->addSuccessor(TBB);
-  FastEmitBranch(FBB, DbgLoc);
-  return true;
-}
-
-bool ARM64FastISel::SelectIndirectBr(const Instruction *I) {
-  const IndirectBrInst *BI = cast<IndirectBrInst>(I);
-  unsigned AddrReg = getRegForValue(BI->getOperand(0));
-  if (AddrReg == 0)
-    return false;
-
-  // Emit the indirect branch.
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BR))
-      .addReg(AddrReg);
-
-  // Make sure the CFG is up-to-date.
-  for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i)
-    FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]);
-
-  return true;
-}
-
-bool ARM64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) {
-  Type *Ty = Src1Value->getType();
-  EVT SrcEVT = TLI.getValueType(Ty, true);
-  if (!SrcEVT.isSimple())
-    return false;
-  MVT SrcVT = SrcEVT.getSimpleVT();
-
-  // Check to see if the 2nd operand is a constant that we can encode directly
-  // in the compare.
-  uint64_t Imm;
-  bool UseImm = false;
-  bool isNegativeImm = false;
-  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) {
-    if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
-        SrcVT == MVT::i8 || SrcVT == MVT::i1) {
-      const APInt &CIVal = ConstInt->getValue();
-
-      Imm = (isZExt) ? CIVal.getZExtValue() : CIVal.getSExtValue();
-      if (CIVal.isNegative()) {
-        isNegativeImm = true;
-        Imm = -Imm;
-      }
-      // FIXME: We can handle more immediates using shifts.
- UseImm = ((Imm & 0xfff) == Imm); - } - } else if (const ConstantFP *ConstFP = dyn_cast(Src2Value)) { - if (SrcVT == MVT::f32 || SrcVT == MVT::f64) - if (ConstFP->isZero() && !ConstFP->isNegative()) - UseImm = true; - } - - unsigned ZReg; - unsigned CmpOpc; - bool isICmp = true; - bool needsExt = false; - switch (SrcVT.SimpleTy) { - default: - return false; - case MVT::i1: - case MVT::i8: - case MVT::i16: - needsExt = true; - // Intentional fall-through. - case MVT::i32: - ZReg = ARM64::WZR; - if (UseImm) - CmpOpc = isNegativeImm ? ARM64::ADDSWri : ARM64::SUBSWri; - else - CmpOpc = ARM64::SUBSWrr; - break; - case MVT::i64: - ZReg = ARM64::XZR; - if (UseImm) - CmpOpc = isNegativeImm ? ARM64::ADDSXri : ARM64::SUBSXri; - else - CmpOpc = ARM64::SUBSXrr; - break; - case MVT::f32: - isICmp = false; - CmpOpc = UseImm ? ARM64::FCMPSri : ARM64::FCMPSrr; - break; - case MVT::f64: - isICmp = false; - CmpOpc = UseImm ? ARM64::FCMPDri : ARM64::FCMPDrr; - break; - } - - unsigned SrcReg1 = getRegForValue(Src1Value); - if (SrcReg1 == 0) - return false; - - unsigned SrcReg2; - if (!UseImm) { - SrcReg2 = getRegForValue(Src2Value); - if (SrcReg2 == 0) - return false; - } - - // We have i1, i8, or i16, we need to either zero extend or sign extend. - if (needsExt) { - SrcReg1 = EmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt); - if (SrcReg1 == 0) - return false; - if (!UseImm) { - SrcReg2 = EmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt); - if (SrcReg2 == 0) - return false; - } - } - - if (isICmp) { - if (UseImm) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(ZReg) - .addReg(SrcReg1) - .addImm(Imm) - .addImm(0); - else - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(ZReg) - .addReg(SrcReg1) - .addReg(SrcReg2); - } else { - if (UseImm) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(SrcReg1); - else - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(SrcReg1) - .addReg(SrcReg2); - } - return true; -} - -bool ARM64FastISel::SelectCmp(const Instruction *I) { - const CmpInst *CI = cast(I); - - // We may not handle every CC for now. - ARM64CC::CondCode CC = getCompareCC(CI->getPredicate()); - if (CC == ARM64CC::AL) - return false; - - // Emit the cmp. - if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) - return false; - - // Now set a register based on the comparison. 
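Two details of the immediate handling above are worth spelling out: the compare immediate field is 12 bits unsigned, and a negative constant is handled by negating it and flipping SUBS to ADDS, since Rn - (-C) and Rn + C set the same flags. A standalone sketch of the decision (plain C++; struct and function names invented here):

    #include <cstdint>

    struct CmpImmChoice {
      bool UseImm;   // constant fits the 12-bit field
      bool UseAdds;  // constant was negative: emit ADDS instead of SUBS
      uint64_t Imm;  // magnitude actually encoded
    };

    CmpImmChoice classifyCompareImm(int64_t C) {
      // Magnitude computed in unsigned arithmetic; INT64_MIN simply fails
      // the 12-bit fit test below.
      uint64_t Mag = C < 0 ? -static_cast<uint64_t>(C)
                           : static_cast<uint64_t>(C);
      bool Fits = (Mag & 0xfff) == Mag;
      return {Fits, Fits && C < 0, Fits ? Mag : 0};
    }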
- ARM64CC::CondCode invertedCC = getInvertedCondCode(CC); - unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::CSINCWr), - ResultReg) - .addReg(ARM64::WZR) - .addReg(ARM64::WZR) - .addImm(invertedCC); - - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectSelect(const Instruction *I) { - const SelectInst *SI = cast(I); - - EVT DestEVT = TLI.getValueType(SI->getType(), true); - if (!DestEVT.isSimple()) - return false; - - MVT DestVT = DestEVT.getSimpleVT(); - if (DestVT != MVT::i32 && DestVT != MVT::i64 && DestVT != MVT::f32 && - DestVT != MVT::f64) - return false; - - unsigned CondReg = getRegForValue(SI->getCondition()); - if (CondReg == 0) - return false; - unsigned TrueReg = getRegForValue(SI->getTrueValue()); - if (TrueReg == 0) - return false; - unsigned FalseReg = getRegForValue(SI->getFalseValue()); - if (FalseReg == 0) - return false; - - unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) - .addReg(CondReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri)) - .addReg(ANDReg) - .addReg(ANDReg) - .addImm(0) - .addImm(0); - - unsigned SelectOpc; - switch (DestVT.SimpleTy) { - default: - return false; - case MVT::i32: - SelectOpc = ARM64::CSELWr; - break; - case MVT::i64: - SelectOpc = ARM64::CSELXr; - break; - case MVT::f32: - SelectOpc = ARM64::FCSELSrrr; - break; - case MVT::f64: - SelectOpc = ARM64::FCSELDrrr; - break; - } - - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SelectOpc), - ResultReg) - .addReg(TrueReg) - .addReg(FalseReg) - .addImm(ARM64CC::NE); - - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectFPExt(const Instruction *I) { - Value *V = I->getOperand(0); - if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy()) - return false; - - unsigned Op = getRegForValue(V); - if (Op == 0) - return false; - - unsigned ResultReg = createResultReg(&ARM64::FPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::FCVTDSr), - ResultReg).addReg(Op); - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectFPTrunc(const Instruction *I) { - Value *V = I->getOperand(0); - if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy()) - return false; - - unsigned Op = getRegForValue(V); - if (Op == 0) - return false; - - unsigned ResultReg = createResultReg(&ARM64::FPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::FCVTSDr), - ResultReg).addReg(Op); - UpdateValueMap(I, ResultReg); - return true; -} - -// FPToUI and FPToSI -bool ARM64FastISel::SelectFPToInt(const Instruction *I, bool Signed) { - MVT DestVT; - if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) - return false; - - unsigned SrcReg = getRegForValue(I->getOperand(0)); - if (SrcReg == 0) - return false; - - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); - - unsigned Opc; - if (SrcVT == MVT::f64) { - if (Signed) - Opc = (DestVT == MVT::i32) ? ARM64::FCVTZSUWDr : ARM64::FCVTZSUXDr; - else - Opc = (DestVT == MVT::i32) ? ARM64::FCVTZUUWDr : ARM64::FCVTZUUXDr; - } else { - if (Signed) - Opc = (DestVT == MVT::i32) ? ARM64::FCVTZSUWSr : ARM64::FCVTZSUXSr; - else - Opc = (DestVT == MVT::i32) ? 
ARM64::FCVTZUUWSr : ARM64::FCVTZUUXSr; - } - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(SrcReg); - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectIntToFP(const Instruction *I, bool Signed) { - MVT DestVT; - if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) - return false; - - unsigned SrcReg = getRegForValue(I->getOperand(0)); - if (SrcReg == 0) - return false; - - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); - - // Handle sign-extension. - if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) { - SrcReg = - EmitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed); - if (SrcReg == 0) - return false; - } - - unsigned Opc; - if (SrcVT == MVT::i64) { - if (Signed) - Opc = (DestVT == MVT::f32) ? ARM64::SCVTFUXSri : ARM64::SCVTFUXDri; - else - Opc = (DestVT == MVT::f32) ? ARM64::UCVTFUXSri : ARM64::UCVTFUXDri; - } else { - if (Signed) - Opc = (DestVT == MVT::f32) ? ARM64::SCVTFUWSri : ARM64::SCVTFUWDri; - else - Opc = (DestVT == MVT::f32) ? ARM64::UCVTFUWSri : ARM64::UCVTFUWDri; - } - - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(SrcReg); - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::ProcessCallArgs(SmallVectorImpl &Args, - SmallVectorImpl &ArgRegs, - SmallVectorImpl &ArgVTs, - SmallVectorImpl &ArgFlags, - SmallVectorImpl &RegArgs, - CallingConv::ID CC, unsigned &NumBytes) { - SmallVector ArgLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context); - CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC)); - - // Get a count of how many bytes are to be pushed on the stack. - NumBytes = CCInfo.getNextStackOffset(); - - // Issue CALLSEQ_START - unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes); - - // Process the args. - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - unsigned Arg = ArgRegs[VA.getValNo()]; - MVT ArgVT = ArgVTs[VA.getValNo()]; - - // Handle arg promotion: SExt, ZExt, AExt. - switch (VA.getLocInfo()) { - case CCValAssign::Full: - break; - case CCValAssign::SExt: { - MVT DestVT = VA.getLocVT(); - MVT SrcVT = ArgVT; - Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false); - if (Arg == 0) - return false; - ArgVT = DestVT; - break; - } - case CCValAssign::AExt: - // Intentional fall-through. - case CCValAssign::ZExt: { - MVT DestVT = VA.getLocVT(); - MVT SrcVT = ArgVT; - Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true); - if (Arg == 0) - return false; - ArgVT = DestVT; - break; - } - default: - llvm_unreachable("Unknown arg promotion!"); - } - - // Now copy/store arg to correct locations. - if (VA.isRegLoc() && !VA.needsCustom()) { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg); - RegArgs.push_back(VA.getLocReg()); - } else if (VA.needsCustom()) { - // FIXME: Handle custom args. - return false; - } else { - assert(VA.isMemLoc() && "Assuming store on stack."); - - // Need to store on the stack. 
- Address Addr; - Addr.setKind(Address::RegBase); - Addr.setReg(ARM64::SP); - Addr.setOffset(VA.getLocMemOffset()); - - if (!EmitStore(ArgVT, Arg, Addr)) - return false; - } - } - return true; -} - -bool ARM64FastISel::FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, - const Instruction *I, CallingConv::ID CC, - unsigned &NumBytes) { - // Issue CALLSEQ_END - unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) - .addImm(NumBytes) - .addImm(0); - - // Now the return value. - if (RetVT != MVT::isVoid) { - SmallVector RVLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC)); - - // Only handle a single return value. - if (RVLocs.size() != 1) - return false; - - // Copy all of the result registers out of their specified physreg. - MVT CopyVT = RVLocs[0].getValVT(); - unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), - ResultReg).addReg(RVLocs[0].getLocReg()); - UsedRegs.push_back(RVLocs[0].getLocReg()); - - // Finally update the result. - UpdateValueMap(I, ResultReg); - } - - return true; -} - -bool ARM64FastISel::SelectCall(const Instruction *I, - const char *IntrMemName = 0) { - const CallInst *CI = cast(I); - const Value *Callee = CI->getCalledValue(); - - // Don't handle inline asm or intrinsics. - if (isa(Callee)) - return false; - - // Only handle global variable Callees. - const GlobalValue *GV = dyn_cast(Callee); - if (!GV) - return false; - - // Check the calling convention. - ImmutableCallSite CS(CI); - CallingConv::ID CC = CS.getCallingConv(); - - // Let SDISel handle vararg functions. - PointerType *PT = cast(CS.getCalledValue()->getType()); - FunctionType *FTy = cast(PT->getElementType()); - if (FTy->isVarArg()) - return false; - - // Handle *simple* calls for now. - MVT RetVT; - Type *RetTy = I->getType(); - if (RetTy->isVoidTy()) - RetVT = MVT::isVoid; - else if (!isTypeLegal(RetTy, RetVT)) - return false; - - // Set up the argument vectors. - SmallVector Args; - SmallVector ArgRegs; - SmallVector ArgVTs; - SmallVector ArgFlags; - Args.reserve(CS.arg_size()); - ArgRegs.reserve(CS.arg_size()); - ArgVTs.reserve(CS.arg_size()); - ArgFlags.reserve(CS.arg_size()); - - for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); - i != e; ++i) { - // If we're lowering a memory intrinsic instead of a regular call, skip the - // last two arguments, which shouldn't be passed to the underlying function. - if (IntrMemName && e - i <= 2) - break; - - unsigned Arg = getRegForValue(*i); - if (Arg == 0) - return false; - - ISD::ArgFlagsTy Flags; - unsigned AttrInd = i - CS.arg_begin() + 1; - if (CS.paramHasAttr(AttrInd, Attribute::SExt)) - Flags.setSExt(); - if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) - Flags.setZExt(); - - // FIXME: Only handle *easy* calls for now. - if (CS.paramHasAttr(AttrInd, Attribute::InReg) || - CS.paramHasAttr(AttrInd, Attribute::StructRet) || - CS.paramHasAttr(AttrInd, Attribute::Nest) || - CS.paramHasAttr(AttrInd, Attribute::ByVal)) - return false; - - MVT ArgVT; - Type *ArgTy = (*i)->getType(); - if (!isTypeLegal(ArgTy, ArgVT) && - !(ArgVT == MVT::i1 || ArgVT == MVT::i8 || ArgVT == MVT::i16)) - return false; - - // We don't handle vector parameters yet. 
- if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64) - return false; - - unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - Flags.setOrigAlign(OriginalAlignment); - - Args.push_back(*i); - ArgRegs.push_back(Arg); - ArgVTs.push_back(ArgVT); - ArgFlags.push_back(Flags); - } - - // Handle the arguments now that we've gotten them. - SmallVector RegArgs; - unsigned NumBytes; - if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) - return false; - - // Issue the call. - MachineInstrBuilder MIB; - MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BL)); - if (!IntrMemName) - MIB.addGlobalAddress(GV, 0, 0); - else - MIB.addExternalSymbol(IntrMemName, 0); - - // Add implicit physical register uses to the call. - for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i], RegState::Implicit); - - // Add a register mask with the call-preserved registers. - // Proper defs for return values will be added by setPhysRegsDeadExcept(). - MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv())); - - // Finish off the call including any return values. - SmallVector UsedRegs; - if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) - return false; - - // Set all unused physreg defs as dead. - static_cast(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); - - return true; -} - -bool ARM64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) { - if (Alignment) - return Len / Alignment <= 4; - else - return Len < 32; -} - -bool ARM64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, - unsigned Alignment) { - // Make sure we don't bloat code by inlining very large memcpy's. - if (!IsMemCpySmall(Len, Alignment)) - return false; - - int64_t UnscaledOffset = 0; - Address OrigDest = Dest; - Address OrigSrc = Src; - - while (Len) { - MVT VT; - if (!Alignment || Alignment >= 8) { - if (Len >= 8) - VT = MVT::i64; - else if (Len >= 4) - VT = MVT::i32; - else if (Len >= 2) - VT = MVT::i16; - else { - VT = MVT::i8; - } - } else { - // Bound based on alignment. - if (Len >= 4 && Alignment == 4) - VT = MVT::i32; - else if (Len >= 2 && Alignment == 2) - VT = MVT::i16; - else { - VT = MVT::i8; - } - } - - bool RV; - unsigned ResultReg; - RV = EmitLoad(VT, ResultReg, Src); - assert(RV == true && "Should be able to handle this load."); - RV = EmitStore(VT, ResultReg, Dest); - assert(RV == true && "Should be able to handle this store."); - (void)RV; - - int64_t Size = VT.getSizeInBits() / 8; - Len -= Size; - UnscaledOffset += Size; - - // We need to recompute the unscaled offset for each iteration. - Dest.setOffset(OrigDest.getOffset() + UnscaledOffset); - Src.setOffset(OrigSrc.getOffset() + UnscaledOffset); - } - - return true; -} - -bool ARM64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) { - // FIXME: Handle more intrinsics. - switch (I.getIntrinsicID()) { - default: - return false; - case Intrinsic::memcpy: - case Intrinsic::memmove: { - const MemTransferInst &MTI = cast(I); - // Don't handle volatile. - if (MTI.isVolatile()) - return false; - - // Disable inlining for memmove before calls to ComputeAddress. Otherwise, - // we would emit dead code because we don't currently handle memmoves. - bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy); - if (isa(MTI.getLength()) && isMemCpy) { - // Small memcpy's are common enough that we want to do them without a call - // if possible. 
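The inlining threshold implemented by IsMemCpySmall above comes down to: at most four naturally aligned transfers when the alignment is known, or under 32 bytes when it is not. A direct sketch (plain C++, free function invented here):

    #include <cstdint>

    bool isMemCpySmall(uint64_t Len, unsigned Alignment) {
      // Known alignment: inline up to 4 loads/stores of that width.
      // Unknown alignment: inline only copies under 32 bytes.
      return Alignment ? (Len / Alignment <= 4) : (Len < 32);
    }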
- uint64_t Len = cast(MTI.getLength())->getZExtValue(); - unsigned Alignment = MTI.getAlignment(); - if (IsMemCpySmall(Len, Alignment)) { - Address Dest, Src; - if (!ComputeAddress(MTI.getRawDest(), Dest) || - !ComputeAddress(MTI.getRawSource(), Src)) - return false; - if (TryEmitSmallMemCpy(Dest, Src, Len, Alignment)) - return true; - } - } - - if (!MTI.getLength()->getType()->isIntegerTy(64)) - return false; - - if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255) - // Fast instruction selection doesn't support the special - // address spaces. - return false; - - const char *IntrMemName = isa(I) ? "memcpy" : "memmove"; - return SelectCall(&I, IntrMemName); - } - case Intrinsic::memset: { - const MemSetInst &MSI = cast(I); - // Don't handle volatile. - if (MSI.isVolatile()) - return false; - - if (!MSI.getLength()->getType()->isIntegerTy(64)) - return false; - - if (MSI.getDestAddressSpace() > 255) - // Fast instruction selection doesn't support the special - // address spaces. - return false; - - return SelectCall(&I, "memset"); - } - case Intrinsic::trap: { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BRK)) - .addImm(1); - return true; - } - } - return false; -} - -bool ARM64FastISel::SelectRet(const Instruction *I) { - const ReturnInst *Ret = cast(I); - const Function &F = *I->getParent()->getParent(); - - if (!FuncInfo.CanLowerReturn) - return false; - - if (F.isVarArg()) - return false; - - // Build a list of return value registers. - SmallVector RetRegs; - - if (Ret->getNumOperands() > 0) { - CallingConv::ID CC = F.getCallingConv(); - SmallVector Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); - - // Analyze operands of the call, assigning locations to each operand. - SmallVector ValLocs; - CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, - I->getContext()); - CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - CCInfo.AnalyzeReturn(Outs, RetCC); - - // Only handle a single return value for now. - if (ValLocs.size() != 1) - return false; - - CCValAssign &VA = ValLocs[0]; - const Value *RV = Ret->getOperand(0); - - // Don't bother handling odd stuff for now. - if (VA.getLocInfo() != CCValAssign::Full) - return false; - // Only handle register returns for now. - if (!VA.isRegLoc()) - return false; - unsigned Reg = getRegForValue(RV); - if (Reg == 0) - return false; - - unsigned SrcReg = Reg + VA.getValNo(); - unsigned DestReg = VA.getLocReg(); - // Avoid a cross-class copy. This is very unlikely. - if (!MRI.getRegClass(SrcReg)->contains(DestReg)) - return false; - - EVT RVEVT = TLI.getValueType(RV->getType()); - if (!RVEVT.isSimple()) - return false; - MVT RVVT = RVEVT.getSimpleVT(); - MVT DestVT = VA.getValVT(); - // Special handling for extended integers. - if (RVVT != DestVT) { - if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16) - return false; - - if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) - return false; - - bool isZExt = Outs[0].Flags.isZExt(); - SrcReg = EmitIntExt(RVVT, SrcReg, DestVT, isZExt); - if (SrcReg == 0) - return false; - } - - // Make the copy. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg); - - // Add register to return instruction. 
- RetRegs.push_back(VA.getLocReg()); - } - - MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(ARM64::RET_ReallyLR)); - for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) - MIB.addReg(RetRegs[i], RegState::Implicit); - return true; -} - -bool ARM64FastISel::SelectTrunc(const Instruction *I) { - Type *DestTy = I->getType(); - Value *Op = I->getOperand(0); - Type *SrcTy = Op->getType(); - - EVT SrcEVT = TLI.getValueType(SrcTy, true); - EVT DestEVT = TLI.getValueType(DestTy, true); - if (!SrcEVT.isSimple()) - return false; - if (!DestEVT.isSimple()) - return false; - - MVT SrcVT = SrcEVT.getSimpleVT(); - MVT DestVT = DestEVT.getSimpleVT(); - - if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && - SrcVT != MVT::i8) - return false; - if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8 && - DestVT != MVT::i1) - return false; - - unsigned SrcReg = getRegForValue(Op); - if (!SrcReg) - return false; - - // If we're truncating from i64 to a smaller non-legal type then generate an - // AND. Otherwise, we know the high bits are undefined and a truncate doesn't - // generate any code. - if (SrcVT == MVT::i64) { - uint64_t Mask = 0; - switch (DestVT.SimpleTy) { - default: - // Trunc i64 to i32 is handled by the target-independent fast-isel. - return false; - case MVT::i1: - Mask = 0x1; - break; - case MVT::i8: - Mask = 0xff; - break; - case MVT::i16: - Mask = 0xffff; - break; - } - // Issue an extract_subreg to get the lower 32-bits. - unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true, - ARM64::sub_32); - // Create the AND instruction which performs the actual truncation. - unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) - .addReg(Reg32) - .addImm(ARM64_AM::encodeLogicalImmediate(Mask, 32)); - SrcReg = ANDReg; - } - - UpdateValueMap(I, SrcReg); - return true; -} - -unsigned ARM64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { - assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 || - DestVT == MVT::i64) && - "Unexpected value type."); - // Handle i8 and i16 as i32. - if (DestVT == MVT::i8 || DestVT == MVT::i16) - DestVT = MVT::i32; - - if (isZExt) { - unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ResultReg) - .addReg(SrcReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - - if (DestVT == MVT::i64) { - // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the - // upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd. - unsigned Reg64 = MRI.createVirtualRegister(&ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(ARM64::SUBREG_TO_REG), Reg64) - .addImm(0) - .addReg(ResultReg) - .addImm(ARM64::sub_32); - ResultReg = Reg64; - } - return ResultReg; - } else { - if (DestVT == MVT::i64) { - // FIXME: We're SExt i1 to i64. 
-      return 0;
-    }
-    unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SBFMWri),
-            ResultReg)
-        .addReg(SrcReg)
-        .addImm(0)
-        .addImm(0);
-    return ResultReg;
-  }
-}
-
-unsigned ARM64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
-                                   bool isZExt) {
-  assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?");
-  unsigned Opc;
-  unsigned Imm = 0;
-
-  switch (SrcVT.SimpleTy) {
-  default:
-    return 0;
-  case MVT::i1:
-    return Emiti1Ext(SrcReg, DestVT, isZExt);
-  case MVT::i8:
-    if (DestVT == MVT::i64)
-      Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri;
-    else
-      Opc = isZExt ? ARM64::UBFMWri : ARM64::SBFMWri;
-    Imm = 7;
-    break;
-  case MVT::i16:
-    if (DestVT == MVT::i64)
-      Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri;
-    else
-      Opc = isZExt ? ARM64::UBFMWri : ARM64::SBFMWri;
-    Imm = 15;
-    break;
-  case MVT::i32:
-    assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?");
-    Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri;
-    Imm = 31;
-    break;
-  }
-
-  // Handle i8 and i16 as i32.
-  if (DestVT == MVT::i8 || DestVT == MVT::i16)
-    DestVT = MVT::i32;
-
-  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
-      .addReg(SrcReg)
-      .addImm(0)
-      .addImm(Imm);
-
-  return ResultReg;
-}
-
-bool ARM64FastISel::SelectIntExt(const Instruction *I) {
-  // On ARM64, in general, integer casts don't involve legal types; this code
-  // handles promotable integers. The high bits for a type smaller than
-  // the register size are assumed to be undefined.
-  Type *DestTy = I->getType();
-  Value *Src = I->getOperand(0);
-  Type *SrcTy = Src->getType();
-
-  bool isZExt = isa<ZExtInst>(I);
-  unsigned SrcReg = getRegForValue(Src);
-  if (!SrcReg)
-    return false;
-
-  EVT SrcEVT = TLI.getValueType(SrcTy, true);
-  EVT DestEVT = TLI.getValueType(DestTy, true);
-  if (!SrcEVT.isSimple())
-    return false;
-  if (!DestEVT.isSimple())
-    return false;
-
-  MVT SrcVT = SrcEVT.getSimpleVT();
-  MVT DestVT = DestEVT.getSimpleVT();
-  unsigned ResultReg = EmitIntExt(SrcVT, SrcReg, DestVT, isZExt);
-  if (ResultReg == 0)
-    return false;
-  UpdateValueMap(I, ResultReg);
-  return true;
-}
-
-bool ARM64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) {
-  EVT DestEVT = TLI.getValueType(I->getType(), true);
-  if (!DestEVT.isSimple())
-    return false;
-
-  MVT DestVT = DestEVT.getSimpleVT();
-  if (DestVT != MVT::i64 && DestVT != MVT::i32)
-    return false;
-
-  unsigned DivOpc;
-  bool is64bit = (DestVT == MVT::i64);
-  switch (ISDOpcode) {
-  default:
-    return false;
-  case ISD::SREM:
-    DivOpc = is64bit ? ARM64::SDIVXr : ARM64::SDIVWr;
-    break;
-  case ISD::UREM:
-    DivOpc = is64bit ? ARM64::UDIVXr : ARM64::UDIVWr;
-    break;
-  }
-  unsigned MSubOpc = is64bit ? ARM64::MSUBXrrr : ARM64::MSUBWrrr;
-  unsigned Src0Reg = getRegForValue(I->getOperand(0));
-  if (!Src0Reg)
-    return false;
-
-  unsigned Src1Reg = getRegForValue(I->getOperand(1));
-  if (!Src1Reg)
-    return false;
-
-  unsigned QuotReg = createResultReg(TLI.getRegClassFor(DestVT));
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), QuotReg)
-      .addReg(Src0Reg)
-      .addReg(Src1Reg);
-  // The remainder is computed as numerator - (quotient * denominator) using
-  // the MSUB instruction.
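Spelled out as plain arithmetic, the SDIV/MSUB pair being emitted here computes (an illustrative sketch; the helper name is invented):

    #include <cstdint>

    int64_t srem(int64_t Num, int64_t Den) {
      int64_t Quot = Num / Den; // SDIV
      return Num - Quot * Den;  // MSUB: Rd = Ra - Rn * Rm
    }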
-  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MSubOpc), ResultReg)
-      .addReg(QuotReg)
-      .addReg(Src1Reg)
-      .addReg(Src0Reg);
-  UpdateValueMap(I, ResultReg);
-  return true;
-}
-
-bool ARM64FastISel::SelectMul(const Instruction *I) {
-  EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true);
-  if (!SrcEVT.isSimple())
-    return false;
-  MVT SrcVT = SrcEVT.getSimpleVT();
-
-  // Must be simple value type. Don't handle vectors.
-  if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 &&
-      SrcVT != MVT::i8)
-    return false;
-
-  unsigned Opc;
-  unsigned ZReg;
-  switch (SrcVT.SimpleTy) {
-  default:
-    return false;
-  case MVT::i8:
-  case MVT::i16:
-  case MVT::i32:
-    ZReg = ARM64::WZR;
-    Opc = ARM64::MADDWrrr;
-    break;
-  case MVT::i64:
-    ZReg = ARM64::XZR;
-    Opc = ARM64::MADDXrrr;
-    break;
-  }
-
-  unsigned Src0Reg = getRegForValue(I->getOperand(0));
-  if (!Src0Reg)
-    return false;
-
-  unsigned Src1Reg = getRegForValue(I->getOperand(1));
-  if (!Src1Reg)
-    return false;
-
-  // Create the base instruction, then add the operands.
-  unsigned ResultReg = createResultReg(TLI.getRegClassFor(SrcVT));
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
-      .addReg(Src0Reg)
-      .addReg(Src1Reg)
-      .addReg(ZReg);
-  UpdateValueMap(I, ResultReg);
-  return true;
-}
-
-bool ARM64FastISel::TargetSelectInstruction(const Instruction *I) {
-  switch (I->getOpcode()) {
-  default:
-    break;
-  case Instruction::Load:
-    return SelectLoad(I);
-  case Instruction::Store:
-    return SelectStore(I);
-  case Instruction::Br:
-    return SelectBranch(I);
-  case Instruction::IndirectBr:
-    return SelectIndirectBr(I);
-  case Instruction::FCmp:
-  case Instruction::ICmp:
-    return SelectCmp(I);
-  case Instruction::Select:
-    return SelectSelect(I);
-  case Instruction::FPExt:
-    return SelectFPExt(I);
-  case Instruction::FPTrunc:
-    return SelectFPTrunc(I);
-  case Instruction::FPToSI:
-    return SelectFPToInt(I, /*Signed=*/true);
-  case Instruction::FPToUI:
-    return SelectFPToInt(I, /*Signed=*/false);
-  case Instruction::SIToFP:
-    return SelectIntToFP(I, /*Signed=*/true);
-  case Instruction::UIToFP:
-    return SelectIntToFP(I, /*Signed=*/false);
-  case Instruction::SRem:
-    return SelectRem(I, ISD::SREM);
-  case Instruction::URem:
-    return SelectRem(I, ISD::UREM);
-  case Instruction::Call:
-    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
-      return SelectIntrinsicCall(*II);
-    return SelectCall(I);
-  case Instruction::Ret:
-    return SelectRet(I);
-  case Instruction::Trunc:
-    return SelectTrunc(I);
-  case Instruction::ZExt:
-  case Instruction::SExt:
-    return SelectIntExt(I);
-  case Instruction::Mul:
-    // FIXME: This really should be handled by the target-independent selector.
-    return SelectMul(I);
-  }
-  return false;
-  // Silence warnings.
-  (void)&CC_ARM64_DarwinPCS_VarArg;
-}
-
-namespace llvm {
-llvm::FastISel *ARM64::createFastISel(FunctionLoweringInfo &funcInfo,
-                                      const TargetLibraryInfo *libInfo) {
-  return new ARM64FastISel(funcInfo, libInfo);
-}
-}
diff --git a/lib/Target/ARM64/ARM64FrameLowering.cpp b/lib/Target/ARM64/ARM64FrameLowering.cpp
deleted file mode 100644
index 798986c..0000000
--- a/lib/Target/ARM64/ARM64FrameLowering.cpp
+++ /dev/null
@@ -1,816 +0,0 @@
-//===- ARM64FrameLowering.cpp - ARM64 Frame Lowering -----------*- C++ -*-====//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of TargetFrameLowering class. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "frame-info" -#include "ARM64FrameLowering.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64Subtarget.h" -#include "ARM64TargetMachine.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -static cl::opt EnableRedZone("arm64-redzone", - cl::desc("enable use of redzone on ARM64"), - cl::init(false), cl::Hidden); - -STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); - -static unsigned estimateStackSize(MachineFunction &MF) { - const MachineFrameInfo *FFI = MF.getFrameInfo(); - int Offset = 0; - for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { - int FixedOff = -FFI->getObjectOffset(i); - if (FixedOff > Offset) - Offset = FixedOff; - } - for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) { - if (FFI->isDeadObjectIndex(i)) - continue; - Offset += FFI->getObjectSize(i); - unsigned Align = FFI->getObjectAlignment(i); - // Adjust to alignment boundary - Offset = (Offset + Align - 1) / Align * Align; - } - // This does not include the 16 bytes used for fp and lr. - return (unsigned)Offset; -} - -bool ARM64FrameLowering::canUseRedZone(const MachineFunction &MF) const { - if (!EnableRedZone) - return false; - // Don't use the red zone if the function explicitly asks us not to. - // This is typically used for kernel code. - if (MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::NoRedZone)) - return false; - - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARM64FunctionInfo *AFI = MF.getInfo(); - unsigned NumBytes = AFI->getLocalStackSize(); - - // Note: currently hasFP() is always true for hasCalls(), but that's an - // implementation detail of the current code, not a strict requirement, - // so stay safe here and check both. - if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128) - return false; - return true; -} - -/// hasFP - Return true if the specified function should have a dedicated frame -/// pointer register. -bool ARM64FrameLowering::hasFP(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - -#ifndef NDEBUG - const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); - assert(!RegInfo->needsStackRealignment(MF) && - "No stack realignment on ARM64!"); -#endif - - return (MFI->hasCalls() || MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken()); -} - -/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is -/// not required, we reserve argument space for call sites in the function -/// immediately on entry to the current function. This eliminates the need for -/// add/sub sp brackets around call sites. Returns true if the call frame is -/// included as part of the stack frame. 
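Both estimateStackSize above and the ADJCALLSTACK handling below round a byte count up to the next alignment boundary with the same integer idiom (a small sketch; the helper name is invented):

    // Round Value up to the next multiple of Align (Align > 0).
    unsigned alignTo(unsigned Value, unsigned Align) {
      return (Value + Align - 1) / Align * Align;
    }
    // alignTo(20, 16) == 32; alignTo(32, 16) == 32.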
-bool ARM64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { - return !MF.getFrameInfo()->hasVarSizedObjects(); -} - -void ARM64FrameLowering::eliminateCallFramePseudoInstr( - MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - const ARM64InstrInfo *TII = - static_cast(MF.getTarget().getInstrInfo()); - if (!TFI->hasReservedCallFrame(MF)) { - // If we have alloca, convert as follows: - // ADJCALLSTACKDOWN -> sub, sp, sp, amount - // ADJCALLSTACKUP -> add, sp, sp, amount - MachineInstr *Old = I; - DebugLoc DL = Old->getDebugLoc(); - unsigned Amount = Old->getOperand(0).getImm(); - if (Amount != 0) { - // We need to keep the stack aligned properly. To do this, we round the - // amount of space needed for the outgoing arguments up to the next - // alignment boundary. - unsigned Align = TFI->getStackAlignment(); - Amount = (Amount + Align - 1) / Align * Align; - - // Replace the pseudo instruction with a new instruction... - unsigned Opc = Old->getOpcode(); - if (Opc == ARM64::ADJCALLSTACKDOWN) { - emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -Amount, TII); - } else { - assert(Opc == ARM64::ADJCALLSTACKUP && "expected ADJCALLSTACKUP"); - emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII); - } - } - } - MBB.erase(I); -} - -void -ARM64FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const { - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - MachineModuleInfo &MMI = MF.getMMI(); - const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const ARM64InstrInfo *TII = TM.getInstrInfo(); - DebugLoc DL = MBB.findDebugLoc(MBBI); - - // Add callee saved registers to move list. - const std::vector &CSI = MFI->getCalleeSavedInfo(); - if (CSI.empty()) - return; - - const DataLayout *TD = MF.getTarget().getDataLayout(); - bool HasFP = hasFP(MF); - - // Calculate amount of bytes used for return address storing. - int stackGrowth = -TD->getPointerSize(0); - - // Calculate offsets. - int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth; - unsigned TotalSkipped = 0; - for (const auto &Info : CSI) { - unsigned Reg = Info.getReg(); - int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) - - getOffsetOfLocalArea() + saveAreaOffset; - - // Don't output a new CFI directive if we're re-saving the frame pointer or - // link register. This happens when the PrologEpilogInserter has inserted an - // extra "STP" of the frame pointer and link register -- the "emitPrologue" - // method automatically generates the directives when frame pointers are - // used. If we generate CFI directives for the extra "STP"s, the linker will - // lose track of the correct values for the frame pointer and link register. - if (HasFP && (FramePtr == Reg || Reg == ARM64::LR)) { - TotalSkipped += stackGrowth; - continue; - } - - unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - nullptr, DwarfReg, Offset - TotalSkipped)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } -} - -void ARM64FrameLowering::emitPrologue(MachineFunction &MF) const { - MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. 
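The prologue logic that follows consults canUseRedZone, defined earlier; reduced to its inputs, the decision is roughly (sketch, plain C++, names invented): a leaf function with no frame pointer whose locals fit in the 128-byte red zone below SP may skip the SP adjustment entirely.

    bool redZoneEligible(bool HasCalls, bool HasFP, bool NoRedZoneAttr,
                         unsigned LocalStackSize) {
      if (NoRedZoneAttr) // e.g. kernel code opts out via noredzone
        return false;
      return !HasCalls && !HasFP && LocalStackSize <= 128;
    }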
-  MachineBasicBlock::iterator MBBI = MBB.begin();
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const Function *Fn = MF.getFunction();
-  const ARM64RegisterInfo *RegInfo = TM.getRegisterInfo();
-  const ARM64InstrInfo *TII = TM.getInstrInfo();
-  MachineModuleInfo &MMI = MF.getMMI();
-  ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
-  bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
-  bool HasFP = hasFP(MF);
-  DebugLoc DL = MBB.findDebugLoc(MBBI);
-
-  int NumBytes = (int)MFI->getStackSize();
-  if (!AFI->hasStackFrame()) {
-    assert(!HasFP && "unexpected function without stack frame but with FP");
-
-    // All of the stack allocation is for locals.
-    AFI->setLocalStackSize(NumBytes);
-
-    // Label used to tie together the PROLOG_LABEL and the MachineMoves.
-    MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
-
-    // REDZONE: If the stack size is less than 128 bytes, we don't need
-    // to actually allocate.
-    if (NumBytes && !canUseRedZone(MF)) {
-      emitFrameOffset(MBB, MBBI, DL, ARM64::SP, ARM64::SP, -NumBytes, TII,
-                      MachineInstr::FrameSetup);
-
-      // Encode the stack size of the leaf function.
-      unsigned CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
-      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
-    } else if (NumBytes) {
-      ++NumRedZoneFunctions;
-    }
-
-    return;
-  }
-
-  // Only set up FP if we actually need to.
-  int FPOffset = 0;
-  if (HasFP) {
-    // The first instruction must a) allocate the stack and b) have an
-    // immediate that is a multiple of -2.
-    assert((MBBI->getOpcode() == ARM64::STPXpre ||
-            MBBI->getOpcode() == ARM64::STPDpre) &&
-           MBBI->getOperand(2).getReg() == ARM64::SP &&
-           MBBI->getOperand(3).getImm() < 0 &&
-           (MBBI->getOperand(3).getImm() & 1) == 0);
-
-    // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space
-    // required for the callee-saved register area, we get the frame pointer
-    // by adding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8.
-    FPOffset = -(MBBI->getOperand(3).getImm() + 2) * 8;
-    assert(FPOffset >= 0 && "Bad Framepointer Offset");
-  }
-
-  // Move past the saves of the callee-saved registers.
-  while (MBBI->getOpcode() == ARM64::STPXi ||
-         MBBI->getOpcode() == ARM64::STPDi ||
-         MBBI->getOpcode() == ARM64::STPXpre ||
-         MBBI->getOpcode() == ARM64::STPDpre) {
-    ++MBBI;
-    NumBytes -= 16;
-  }
-  assert(NumBytes >= 0 && "Negative stack allocation size!?");
-  if (HasFP) {
-    // Issue sub fp, sp, FPOffset or
-    // mov fp, sp when FPOffset is zero.
-    // Note: All stores of callee-saved registers are marked as "FrameSetup".
-    // This code marks the instruction(s) that set the FP also.
-    emitFrameOffset(MBB, MBBI, DL, ARM64::FP, ARM64::SP, FPOffset, TII,
-                    MachineInstr::FrameSetup);
-  }
-
-  // All of the remaining stack allocations are for locals.
-  AFI->setLocalStackSize(NumBytes);
-
-  // Allocate space for the rest of the frame.
-  if (NumBytes) {
-    // If we're a leaf function, try using the red zone.
-    if (!canUseRedZone(MF))
-      emitFrameOffset(MBB, MBBI, DL, ARM64::SP, ARM64::SP, -NumBytes, TII,
-                      MachineInstr::FrameSetup);
-  }
-
-  // If we need a base pointer, set it up here. It's whatever the value of the
-  // stack pointer is at this point. Any variable size objects will be allocated
-  // after this, so we can still use the base pointer to reference locals.
-  //
-  // FIXME: Clarify FrameSetup flags here.
-  // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
-  // needed.
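One computation above is worth checking with concrete numbers: recovering FPOffset from the pre-decrement STP immediate, which counts 8-byte slots and is negative, with fp/lr occupying the two slots nearest the incoming SP (hence the "+ 2"). A sketch (the helper name is invented):

    int fpOffsetFromStpImm(int StpImm) { // StpImm is negative and even
      return -(StpImm + 2) * 8;
    }
    // e.g. a first store of stp ..., [sp, #-4*8]! (StpImm == -4) saves a
    // 32-byte callee-save area and gives fpOffsetFromStpImm(-4) == 16.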
- // - if (RegInfo->hasBasePointer(MF)) - TII->copyPhysReg(MBB, MBBI, DL, ARM64::X19, ARM64::SP, false); - - if (needsFrameMoves) { - const DataLayout *TD = MF.getTarget().getDataLayout(); - const int StackGrowth = -TD->getPointerSize(0); - unsigned FramePtr = RegInfo->getFrameRegister(MF); - - // An example of the prologue: - // - // .globl __foo - // .align 2 - // __foo: - // Ltmp0: - // .cfi_startproc - // .cfi_personality 155, ___gxx_personality_v0 - // Leh_func_begin: - // .cfi_lsda 16, Lexception33 - // - // stp xa,bx, [sp, -#offset]! - // ... - // stp x28, x27, [sp, #offset-32] - // stp fp, lr, [sp, #offset-16] - // add fp, sp, #offset - 16 - // sub sp, sp, #1360 - // - // The Stack: - // +-------------------------------------------+ - // 10000 | ........ | ........ | ........ | ........ | - // 10004 | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // 10008 | ........ | ........ | ........ | ........ | - // 1000c | ........ | ........ | ........ | ........ | - // +===========================================+ - // 10010 | X28 Register | - // 10014 | X28 Register | - // +-------------------------------------------+ - // 10018 | X27 Register | - // 1001c | X27 Register | - // +===========================================+ - // 10020 | Frame Pointer | - // 10024 | Frame Pointer | - // +-------------------------------------------+ - // 10028 | Link Register | - // 1002c | Link Register | - // +===========================================+ - // 10030 | ........ | ........ | ........ | ........ | - // 10034 | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // 10038 | ........ | ........ | ........ | ........ | - // 1003c | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // - // [sp] = 10030 :: >>initial value<< - // sp = 10020 :: stp fp, lr, [sp, #-16]! - // fp = sp == 10020 :: mov fp, sp - // [sp] == 10020 :: stp x28, x27, [sp, #-16]! - // sp == 10010 :: >>final value<< - // - // The frame pointer (w29) points to address 10020. If we use an offset of - // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 - // for w27, and -32 for w28: - // - // Ltmp1: - // .cfi_def_cfa w29, 16 - // Ltmp2: - // .cfi_offset w30, -8 - // Ltmp3: - // .cfi_offset w29, -16 - // Ltmp4: - // .cfi_offset w27, -24 - // Ltmp5: - // .cfi_offset w28, -32 - - if (HasFP) { - // Define the current CFA rule to use the provided FP. - unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // Record the location of the stored LR - unsigned LR = RegInfo->getDwarfRegNum(ARM64::LR, true); - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // Record the location of the stored FP - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } else { - // Encode the stack size of the leaf function. 
-      unsigned CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize()));
-      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
-    }
-
-    // Now emit the moves for whatever callee saved regs we have.
-    emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr);
-  }
-}
-
-static bool isCalleeSavedRegister(unsigned Reg, const uint16_t *CSRegs) {
-  for (unsigned i = 0; CSRegs[i]; ++i)
-    if (Reg == CSRegs[i])
-      return true;
-  return false;
-}
-
-static bool isCSRestore(MachineInstr *MI, const uint16_t *CSRegs) {
-  if (MI->getOpcode() == ARM64::LDPXpost ||
-      MI->getOpcode() == ARM64::LDPDpost || MI->getOpcode() == ARM64::LDPXi ||
-      MI->getOpcode() == ARM64::LDPDi) {
-    if (!isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) ||
-        !isCalleeSavedRegister(MI->getOperand(1).getReg(), CSRegs) ||
-        MI->getOperand(2).getReg() != ARM64::SP)
-      return false;
-    return true;
-  }
-
-  return false;
-}
-
-void ARM64FrameLowering::emitEpilogue(MachineFunction &MF,
-                                      MachineBasicBlock &MBB) const {
-  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
-  assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  const ARM64InstrInfo *TII =
-      static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
-  const ARM64RegisterInfo *RegInfo =
-      static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
-  DebugLoc DL = MBBI->getDebugLoc();
-
-  int NumBytes = MFI->getStackSize();
-  unsigned NumRestores = 0;
-  // Move past the restores of the callee-saved registers.
-  MachineBasicBlock::iterator LastPopI = MBBI;
-  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
-  if (LastPopI != MBB.begin()) {
-    do {
-      ++NumRestores;
-      --LastPopI;
-    } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs));
-    if (!isCSRestore(LastPopI, CSRegs)) {
-      ++LastPopI;
-      --NumRestores;
-    }
-  }
-  NumBytes -= NumRestores * 16;
-  assert(NumBytes >= 0 && "Negative stack allocation size!?");
-
-  if (!hasFP(MF)) {
-    // If this was a redzone leaf function, we don't need to restore the
-    // stack pointer.
-    if (!canUseRedZone(MF))
-      emitFrameOffset(MBB, LastPopI, DL, ARM64::SP, ARM64::SP, NumBytes, TII);
-    return;
-  }
-
-  // Restore the original stack pointer.
-  // FIXME: Rather than doing the math here, we should instead just use
-  // non-post-indexed loads for the restores if we aren't actually going to
-  // be able to save any instructions.
-  if (NumBytes || MFI->hasVarSizedObjects())
-    emitFrameOffset(MBB, LastPopI, DL, ARM64::SP, ARM64::FP,
-                    -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags);
-}
-
-/// getFrameIndexOffset - Returns the displacement from the frame register to
-/// the stack frame of the specified index.
-int ARM64FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
-                                            int FI) const {
-  unsigned FrameReg;
-  return getFrameIndexReference(MF, FI, FrameReg);
-}
-
-/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
-/// debug info. It's the same as what we use for resolving the code-gen
-/// references for now. FIXME: This can go wrong when references are
-/// SP-relative and simple call frames aren't used.
-int ARM64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
-                                               int FI,
-                                               unsigned &FrameReg) const {
-  return resolveFrameIndexReference(MF, FI, FrameReg);
-}
-
-int ARM64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
-                                                   int FI, unsigned &FrameReg,
-                                                   bool PreferFP) const {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const ARM64RegisterInfo *RegInfo =
-      static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
-  const ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
-  int FPOffset = MFI->getObjectOffset(FI) + 16;
-  int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
-  bool isFixed = MFI->isFixedObjectIndex(FI);
-
-  // Use frame pointer to reference fixed objects. Use it for locals if
-  // there are VLAs (and thus the SP isn't reliable as a base).
-  // Make sure useFPForScavengingIndex() does the right thing for the emergency
-  // spill slot.
-  bool UseFP = false;
-  if (AFI->hasStackFrame()) {
-    // Note: Keeping the following as multiple 'if' statements rather than
-    // merging to a single expression for readability.
-    //
-    // Argument access should always use the FP.
-    if (isFixed) {
-      UseFP = hasFP(MF);
-    } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF)) {
-      // Use SP or FP, whichever gives us the best chance of the offset
-      // being in range for direct access. If the FPOffset is positive,
-      // that'll always be best, as the SP will be even further away.
-      // If the FPOffset is negative, we have to keep in mind that the
-      // available offset range for negative offsets is smaller than for
-      // positive ones. If we have variable sized objects, we're stuck with
-      // using the FP regardless, though, as the SP offset is unknown
-      // and we don't have a base pointer available. If an offset is
-      // available via the FP and the SP, use whichever is closest.
-      if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 ||
-          (FPOffset >= -256 && Offset > -FPOffset))
-        UseFP = true;
-    }
-  }
-
-  if (UseFP) {
-    FrameReg = RegInfo->getFrameRegister(MF);
-    return FPOffset;
-  }
-
-  // Use the base pointer if we have one.
-  if (RegInfo->hasBasePointer(MF))
-    FrameReg = RegInfo->getBaseRegister();
-  else {
-    FrameReg = ARM64::SP;
-    // If we're using the red zone for this function, the SP won't actually
-    // be adjusted, so the offsets will be negative. They're also all
-    // within range of the signed 9-bit immediate instructions.
-    if (canUseRedZone(MF))
-      Offset -= AFI->getLocalStackSize();
-  }
-
-  return Offset;
-}
-
-static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
-  if (Reg != ARM64::LR)
-    return getKillRegState(true);
-
-  // LR may be referred to later by an @llvm.returnaddress intrinsic.
-  bool LRLiveIn = MF.getRegInfo().isLiveIn(ARM64::LR);
-  bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken());
-  return getKillRegState(LRKill);
-}
-
-bool ARM64FrameLowering::spillCalleeSavedRegisters(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
-    const std::vector<CalleeSavedInfo> &CSI,
-    const TargetRegisterInfo *TRI) const {
-  MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-  unsigned Count = CSI.size();
-  DebugLoc DL;
-  assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
-
-  if (MI != MBB.end())
-    DL = MI->getDebugLoc();
-
-  for (unsigned i = 0; i < Count; i += 2) {
-    unsigned idx = Count - i - 2;
-    unsigned Reg1 = CSI[idx].getReg();
-    unsigned Reg2 = CSI[idx + 1].getReg();
-    // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
-    // list to come in sorted by frame index so that we can issue the store
-    // pair instructions directly. Assert if we see anything otherwise.
-    //
-    // The order of the registers in the list is controlled by
-    // getCalleeSavedRegs(), so they will always be in-order, as well.
-    assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() &&
-           "Out of order callee saved regs!");
-    unsigned StrOpc;
-    assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
-    assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
-    // Issue sequence of non-sp increment and pi sp spills for cs regs. The
-    // first spill is a pre-increment that allocates the stack.
-    // For example:
-    //    stp x22, x21, [sp, #-48]!   // addImm(-6)
-    //    stp x20, x19, [sp, #16]     // addImm(+2)
-    //    stp fp, lr, [sp, #32]       // addImm(+4)
-    // Rationale: This sequence saves uop updates compared to a sequence of
-    // pre-increment spills like stp xi,xj,[sp,#-16]!
-    // Note: Similar rationale and sequence for restores in epilog.
-    if (ARM64::GPR64RegClass.contains(Reg1)) {
-      assert(ARM64::GPR64RegClass.contains(Reg2) &&
-             "Expected GPR64 callee-saved register pair!");
-      // For first spill use pre-increment store.
-      if (i == 0)
-        StrOpc = ARM64::STPXpre;
-      else
-        StrOpc = ARM64::STPXi;
-    } else if (ARM64::FPR64RegClass.contains(Reg1)) {
-      assert(ARM64::FPR64RegClass.contains(Reg2) &&
-             "Expected FPR64 callee-saved register pair!");
-      // For first spill use pre-increment store.
-      if (i == 0)
-        StrOpc = ARM64::STPDpre;
-      else
-        StrOpc = ARM64::STPDi;
-    } else
-      llvm_unreachable("Unexpected callee saved register!");
-    DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", "
-                 << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx()
-                 << ", " << CSI[idx + 1].getFrameIdx() << ")\n");
-    // Compute offset: i = 0 => offset = -Count;
-    // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc.
-    const int Offset = (i == 0) ? -Count : i;
-    assert((Offset >= -64 && Offset <= 63) &&
-           "Offset out of bounds for STP immediate");
-    BuildMI(MBB, MI, DL, TII.get(StrOpc))
-        .addReg(Reg2, getPrologueDeath(MF, Reg2))
-        .addReg(Reg1, getPrologueDeath(MF, Reg1))
-        .addReg(ARM64::SP)
-        .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit
-        .setMIFlag(MachineInstr::FrameSetup);
-  }
-  return true;
-}
-
-bool ARM64FrameLowering::restoreCalleeSavedRegisters(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
-    const std::vector<CalleeSavedInfo> &CSI,
-    const TargetRegisterInfo *TRI) const {
-  MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-  unsigned Count = CSI.size();
-  DebugLoc DL;
-  assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
-
-  if (MI != MBB.end())
-    DL = MI->getDebugLoc();
-
-  for (unsigned i = 0; i < Count; i += 2) {
-    unsigned Reg1 = CSI[i].getReg();
-    unsigned Reg2 = CSI[i + 1].getReg();
-    // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
-    // list to come in sorted by frame index so that we can issue the store
-    // pair instructions directly. Assert if we see anything otherwise.
-    assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() &&
-           "Out of order callee saved regs!");
-    // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only
-    // the last load is sp-pi post-increment and de-allocates the stack:
-    // For example:
-    //    ldp fp, lr, [sp, #32]       // addImm(+4)
-    //    ldp x20, x19, [sp, #16]     // addImm(+2)
-    //    ldp x22, x21, [sp], #48     // addImm(+6)
-    // Note: see comment in spillCalleeSavedRegisters()
-    unsigned LdrOpc;
-
-    assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
-    assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
-    if (ARM64::GPR64RegClass.contains(Reg1)) {
-      assert(ARM64::GPR64RegClass.contains(Reg2) &&
-             "Expected GPR64 callee-saved register pair!");
-      if (i == Count - 2)
-        LdrOpc = ARM64::LDPXpost;
-      else
-        LdrOpc = ARM64::LDPXi;
-    } else if (ARM64::FPR64RegClass.contains(Reg1)) {
-      assert(ARM64::FPR64RegClass.contains(Reg2) &&
-             "Expected FPR64 callee-saved register pair!");
-      if (i == Count - 2)
-        LdrOpc = ARM64::LDPDpost;
-      else
-        LdrOpc = ARM64::LDPDi;
-    } else
-      llvm_unreachable("Unexpected callee saved register!");
-    DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", "
-                 << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx()
-                 << ", " << CSI[i + 1].getFrameIdx() << ")\n");
-
-    // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4;
-    // etc.
-    const int Offset = (i == Count - 2) ? Count : Count - i - 2;
-    assert((Offset >= -64 && Offset <= 63) &&
-           "Offset out of bounds for LDP immediate");
-    BuildMI(MBB, MI, DL, TII.get(LdrOpc))
-        .addReg(Reg2, getDefRegState(true))
-        .addReg(Reg1, getDefRegState(true))
-        .addReg(ARM64::SP)
-        .addImm(Offset); // [sp], #offset * 8  or [sp, #offset * 8]
-                         // where the factor * 8 is implicit
-  }
-  return true;
-}
-
-void ARM64FrameLowering::processFunctionBeforeCalleeSavedScan(
-    MachineFunction &MF, RegScavenger *RS) const {
-  const ARM64RegisterInfo *RegInfo =
-      static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
-  ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
-  MachineRegisterInfo *MRI = &MF.getRegInfo();
-  SmallVector<unsigned, 4> UnspilledCSGPRs;
-  SmallVector<unsigned, 4> UnspilledCSFPRs;
-
-  // The frame record needs to be created by saving the appropriate registers
-  if (hasFP(MF)) {
-    MRI->setPhysRegUsed(ARM64::FP);
-    MRI->setPhysRegUsed(ARM64::LR);
-  }
-
-  // Spill the BasePtr if it's used. Do this first thing so that the
-  // getCalleeSavedRegs() below will get the right answer.
-  if (RegInfo->hasBasePointer(MF))
-    MRI->setPhysRegUsed(RegInfo->getBaseRegister());
-
-  // If any callee-saved registers are used, the frame cannot be eliminated.
-  unsigned NumGPRSpilled = 0;
-  unsigned NumFPRSpilled = 0;
-  bool ExtraCSSpill = false;
-  bool CanEliminateFrame = true;
-  DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:");
-  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
-
-  // Check pairs of consecutive callee-saved registers.
-  for (unsigned i = 0; CSRegs[i]; i += 2) {
-    assert(CSRegs[i + 1] && "Odd number of callee-saved registers!");
-
-    const unsigned OddReg = CSRegs[i];
-    const unsigned EvenReg = CSRegs[i + 1];
-    assert((ARM64::GPR64RegClass.contains(OddReg) &&
-            ARM64::GPR64RegClass.contains(EvenReg)) ^
-               (ARM64::FPR64RegClass.contains(OddReg) &&
-                ARM64::FPR64RegClass.contains(EvenReg)) &&
-           "Register class mismatch!");
-
-    const bool OddRegUsed = MRI->isPhysRegUsed(OddReg);
-    const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg);
-
-    // Early exit if none of the registers in the register pair is actually
-    // used.
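// A consistency check of the spill/restore offset formulas above for three
// callee-saved pairs (Count == 6; offsets are STP/LDP immediates in 8-byte
// units). Helper names are ours; the expected values mirror the comments.
#include <cassert>

static int spillOffset(unsigned i, unsigned Count) {
  return (i == 0) ? -(int)Count : (int)i; // first store pre-decrements sp
}
static int restoreOffset(unsigned i, unsigned Count) {
  return (i == Count - 2) ? (int)Count            // last load post-increments sp
                          : (int)(Count - i - 2);
}

int main() {
  // stp x22,x21,[sp,#-48]!  stp x20,x19,[sp,#16]  stp fp,lr,[sp,#32]
  assert(spillOffset(0, 6) == -6 && spillOffset(2, 6) == 2 && spillOffset(4, 6) == 4);
  // ldp fp,lr,[sp,#32]      ldp x20,x19,[sp,#16]  ldp x22,x21,[sp],#48
  assert(restoreOffset(0, 6) == 4 && restoreOffset(2, 6) == 2 && restoreOffset(4, 6) == 6);
  return 0;
}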
- if (!OddRegUsed && !EvenRegUsed) { - if (ARM64::GPR64RegClass.contains(OddReg)) { - UnspilledCSGPRs.push_back(OddReg); - UnspilledCSGPRs.push_back(EvenReg); - } else { - UnspilledCSFPRs.push_back(OddReg); - UnspilledCSFPRs.push_back(EvenReg); - } - continue; - } - - unsigned Reg = ARM64::NoRegister; - // If only one of the registers of the register pair is used, make sure to - // mark the other one as used as well. - if (OddRegUsed ^ EvenRegUsed) { - // Find out which register is the additional spill. - Reg = OddRegUsed ? EvenReg : OddReg; - MRI->setPhysRegUsed(Reg); - } - - DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); - DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo)); - - assert(((OddReg == ARM64::LR && EvenReg == ARM64::FP) || - (RegInfo->getEncodingValue(OddReg) + 1 == - RegInfo->getEncodingValue(EvenReg))) && - "Register pair of non-adjacent registers!"); - if (ARM64::GPR64RegClass.contains(OddReg)) { - NumGPRSpilled += 2; - // If it's not a reserved register, we can use it in lieu of an - // emergency spill slot for the register scavenger. - // FIXME: It would be better to instead keep looking and choose another - // unspilled register that isn't reserved, if there is one. - if (Reg != ARM64::NoRegister && !RegInfo->isReservedReg(MF, Reg)) - ExtraCSSpill = true; - } else - NumFPRSpilled += 2; - - CanEliminateFrame = false; - } - - // FIXME: Set BigStack if any stack slot references may be out of range. - // For now, just conservatively guestimate based on unscaled indexing - // range. We'll end up allocating an unnecessary spill slot a lot, but - // realistically that's not a big deal at this stage of the game. - // The CSR spill slots have not been allocated yet, so estimateStackSize - // won't include them. - MachineFrameInfo *MFI = MF.getFrameInfo(); - unsigned CFSize = estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled); - DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); - bool BigStack = (CFSize >= 256); - if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) - AFI->setHasStackFrame(true); - - // Estimate if we might need to scavenge a register at some point in order - // to materialize a stack offset. If so, either spill one additional - // callee-saved register or reserve a special spill slot to facilitate - // register scavenging. If we already spilled an extra callee-saved register - // above to keep the number of spills even, we don't need to do anything else - // here. - if (BigStack && !ExtraCSSpill) { - - // If we're adding a register to spill here, we have to add two of them - // to keep the number of regs to spill even. - assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!"); - unsigned Count = 0; - while (!UnspilledCSGPRs.empty() && Count < 2) { - unsigned Reg = UnspilledCSGPRs.back(); - UnspilledCSGPRs.pop_back(); - DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) - << " to get a scratch register.\n"); - MRI->setPhysRegUsed(Reg); - ExtraCSSpill = true; - ++Count; - } - - // If we didn't find an extra callee-saved register to spill, create - // an emergency spill slot. 
- if (!ExtraCSSpill) { - const TargetRegisterClass *RC = &ARM64::GPR64RegClass; - int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false); - RS->addScavengingFrameIndex(FI); - DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI - << " as the emergency spill slot.\n"); - } - } -} diff --git a/lib/Target/ARM64/ARM64FrameLowering.h b/lib/Target/ARM64/ARM64FrameLowering.h deleted file mode 100644 index 02edcdb..0000000 --- a/lib/Target/ARM64/ARM64FrameLowering.h +++ /dev/null @@ -1,75 +0,0 @@ -//===-- ARM64FrameLowering.h - TargetFrameLowering for ARM64 ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64_FRAMELOWERING_H -#define ARM64_FRAMELOWERING_H - -#include "llvm/Target/TargetFrameLowering.h" - -namespace llvm { - -class ARM64Subtarget; -class ARM64TargetMachine; - -class ARM64FrameLowering : public TargetFrameLowering { - const ARM64TargetMachine &TM; - -public: - explicit ARM64FrameLowering(const ARM64TargetMachine &TM, - const ARM64Subtarget &STI) - : TargetFrameLowering(StackGrowsDown, 16, 0, 16, - false /*StackRealignable*/), - TM(TM) {} - - void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const; - - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - - /// emitProlog/emitEpilog - These methods insert prolog and epilog code into - /// the function. - void emitPrologue(MachineFunction &MF) const; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; - - int getFrameIndexOffset(const MachineFunction &MF, int FI) const; - int getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const; - int resolveFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg, - bool PreferFP = false) const; - bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const; - - bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const; - - /// \brief Can this function use the red zone for local allocations. - bool canUseRedZone(const MachineFunction &MF) const; - - bool hasFP(const MachineFunction &MF) const; - bool hasReservedCallFrame(const MachineFunction &MF) const; - - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const; -}; - -} // End llvm namespace - -#endif diff --git a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp deleted file mode 100644 index 2e234c9..0000000 --- a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp +++ /dev/null @@ -1,2381 +0,0 @@ -//===-- ARM64ISelDAGToDAG.cpp - A dag to dag inst selector for ARM64 ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines an instruction selector for the ARM64 target. 
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "arm64-isel" -#include "ARM64TargetMachine.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/Function.h" // To access function attributes. -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -//===--------------------------------------------------------------------===// -/// ARM64DAGToDAGISel - ARM64 specific code to select ARM64 machine -/// instructions for SelectionDAG operations. -/// -namespace { - -class ARM64DAGToDAGISel : public SelectionDAGISel { - ARM64TargetMachine &TM; - - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when generating code for different targets. - const ARM64Subtarget *Subtarget; - - bool ForCodeSize; - -public: - explicit ARM64DAGToDAGISel(ARM64TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), TM(tm), - Subtarget(&TM.getSubtarget()), ForCodeSize(false) {} - - virtual const char *getPassName() const { - return "ARM64 Instruction Selection"; - } - - virtual bool runOnMachineFunction(MachineFunction &MF) { - AttributeSet FnAttrs = MF.getFunction()->getAttributes(); - ForCodeSize = - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize) || - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); - return SelectionDAGISel::runOnMachineFunction(MF); - } - - SDNode *Select(SDNode *Node); - - /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for - /// inline asm expressions. 
- virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, - std::vector &OutOps); - - SDNode *SelectMLAV64LaneV128(SDNode *N); - SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N); - bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); - bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift); - bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift); - bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { - return SelectShiftedRegister(N, false, Reg, Shift); - } - bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { - return SelectShiftedRegister(N, true, Reg, Shift); - } - bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 1, Base, OffImm); - } - bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 2, Base, OffImm); - } - bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 4, Base, OffImm); - } - bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 8, Base, OffImm); - } - bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 16, Base, OffImm); - } - bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 1, Base, OffImm); - } - bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 2, Base, OffImm); - } - bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 4, Base, OffImm); - } - bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 8, Base, OffImm); - } - bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 16, Base, OffImm); - } - - bool SelectAddrModeRO8(SDValue N, SDValue &Base, SDValue &Offset, - SDValue &Imm) { - return SelectAddrModeRO(N, 1, Base, Offset, Imm); - } - bool SelectAddrModeRO16(SDValue N, SDValue &Base, SDValue &Offset, - SDValue &Imm) { - return SelectAddrModeRO(N, 2, Base, Offset, Imm); - } - bool SelectAddrModeRO32(SDValue N, SDValue &Base, SDValue &Offset, - SDValue &Imm) { - return SelectAddrModeRO(N, 4, Base, Offset, Imm); - } - bool SelectAddrModeRO64(SDValue N, SDValue &Base, SDValue &Offset, - SDValue &Imm) { - return SelectAddrModeRO(N, 8, Base, Offset, Imm); - } - bool SelectAddrModeRO128(SDValue N, SDValue &Base, SDValue &Offset, - SDValue &Imm) { - return SelectAddrModeRO(N, 16, Base, Offset, Imm); - } - bool SelectAddrModeNoIndex(SDValue N, SDValue &Val); - - /// Form sequences of consecutive 64/128-bit registers for use in NEON - /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have - /// between 1 and 4 elements. If it contains a single element that is returned - /// unchanged; otherwise a REG_SEQUENCE value is returned. - SDValue createDTuple(ArrayRef Vecs); - SDValue createQTuple(ArrayRef Vecs); - - /// Generic helper for the createDTuple/createQTuple - /// functions. Those should almost always be called instead. 
- SDValue createTuple(ArrayRef Vecs, unsigned RegClassIDs[], - unsigned SubRegs[]); - - SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); - - SDNode *SelectIndexedLoad(SDNode *N, bool &Done); - - SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, - unsigned SubRegIdx); - SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - - SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - - SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node); - SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node); - - SDNode *SelectAtomic(SDNode *Node, unsigned Op8, unsigned Op16, unsigned Op32, - unsigned Op64); - - SDNode *SelectBitfieldExtractOp(SDNode *N); - SDNode *SelectBitfieldInsertOp(SDNode *N); - - SDNode *SelectLIBM(SDNode *N); - -// Include the pieces autogenerated from the target description. -#include "ARM64GenDAGISel.inc" - -private: - bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg, - SDValue &Shift); - bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, - SDValue &OffImm); - bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base, - SDValue &OffImm); - bool SelectAddrModeRO(SDValue N, unsigned Size, SDValue &Base, - SDValue &Offset, SDValue &Imm); - bool isWorthFolding(SDValue V) const; - bool SelectExtendedSHL(SDValue N, unsigned Size, SDValue &Offset, - SDValue &Imm); -}; -} // end anonymous namespace - -/// isIntImmediate - This method tests to see if the node is a constant -/// operand. If so Imm will receive the 32-bit value. -static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { - if (const ConstantSDNode *C = dyn_cast(N)) { - Imm = C->getZExtValue(); - return true; - } - return false; -} - -// isIntImmediate - This method tests to see if a constant operand. -// If so Imm will receive the value. -static bool isIntImmediate(SDValue N, uint64_t &Imm) { - return isIntImmediate(N.getNode(), Imm); -} - -// isOpcWithIntImmediate - This method tests to see if the node is a specific -// opcode and that it has a immediate integer right operand. -// If so Imm will receive the 32 bit value. -static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, - uint64_t &Imm) { - return N->getOpcode() == Opc && - isIntImmediate(N->getOperand(1).getNode(), Imm); -} - -bool ARM64DAGToDAGISel::SelectAddrModeNoIndex(SDValue N, SDValue &Val) { - EVT ValTy = N.getValueType(); - if (ValTy != MVT::i64) - return false; - Val = N; - return true; -} - -bool ARM64DAGToDAGISel::SelectInlineAsmMemoryOperand( - const SDValue &Op, char ConstraintCode, std::vector &OutOps) { - assert(ConstraintCode == 'm' && "unexpected asm memory constraint"); - // Require the address to be in a register. That is safe for all ARM64 - // variants and it is hard to do anything much smarter without knowing - // how the operand is used. - OutOps.push_back(Op); - return false; -} - -/// SelectArithImmed - Select an immediate value that can be represented as -/// a 12-bit value shifted left by either 0 or 12. If so, return true with -/// Val set to the 12-bit value and Shift set to the shifter operand. 
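// The SelectArithImmed routine below implements the ARM64 add/sub immediate
// rule described in its comment: a 12-bit payload, optionally shifted left by
// 12. A self-contained sketch of that rule (helper name and values are ours):
#include <cassert>
#include <cstdint>

static bool isArithImmed(uint64_t Immed, uint64_t &Val, unsigned &Shift) {
  if (Immed >> 12 == 0) {                         // fits in the low 12 bits
    Val = Immed; Shift = 0; return true;
  }
  if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { // low 12 clear, fits shifted
    Val = Immed >> 12; Shift = 12; return true;
  }
  return false;                                   // must be materialized instead
}

int main() {
  uint64_t V; unsigned S;
  assert(isArithImmed(0xabc, V, S) && V == 0xabc && S == 0);
  assert(isArithImmed(0x123000, V, S) && V == 0x123 && S == 12);
  assert(!isArithImmed(0x1001, V, S)); // straddles the shift boundary
  return 0;
}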
-bool ARM64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
-                                         SDValue &Shift) {
-  // This function is called from the addsub_shifted_imm ComplexPattern,
-  // which lists [imm] as the list of opcodes it's interested in, however
-  // we still need to check whether the operand is actually an immediate
-  // here because the ComplexPattern opcode list is only used in
-  // root-level opcode matching.
-  if (!isa<ConstantSDNode>(N.getNode()))
-    return false;
-
-  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
-  unsigned ShiftAmt;
-
-  if (Immed >> 12 == 0) {
-    ShiftAmt = 0;
-  } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
-    ShiftAmt = 12;
-    Immed = Immed >> 12;
-  } else
-    return false;
-
-  unsigned ShVal = ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt);
-  Val = CurDAG->getTargetConstant(Immed, MVT::i32);
-  Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
-  return true;
-}
-
-/// SelectNegArithImmed - As above, but negates the value before trying to
-/// select it.
-bool ARM64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
-                                            SDValue &Shift) {
-  // This function is called from the addsub_shifted_imm ComplexPattern,
-  // which lists [imm] as the list of opcodes it's interested in, however
-  // we still need to check whether the operand is actually an immediate
-  // here because the ComplexPattern opcode list is only used in
-  // root-level opcode matching.
-  if (!isa<ConstantSDNode>(N.getNode()))
-    return false;
-
-  // The immediate operand must be a 24-bit zero-extended immediate.
-  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
-
-  // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
-  // have the opposite effect on the C flag, so this pattern mustn't match
-  // under those circumstances.
-  if (Immed == 0)
-    return false;
-
-  if (N.getValueType() == MVT::i32)
-    Immed = ~((uint32_t)Immed) + 1;
-  else
-    Immed = ~Immed + 1ULL;
-  if (Immed & 0xFFFFFFFFFF000000ULL)
-    return false;
-
-  Immed &= 0xFFFFFFULL;
-  return SelectArithImmed(CurDAG->getConstant(Immed, MVT::i32), Val, Shift);
-}
-
-/// getShiftTypeForNode - Translate a shift node to the corresponding
-/// ShiftType value.
-static ARM64_AM::ShiftType getShiftTypeForNode(SDValue N) {
-  switch (N.getOpcode()) {
-  default:
-    return ARM64_AM::InvalidShift;
-  case ISD::SHL:
-    return ARM64_AM::LSL;
-  case ISD::SRL:
-    return ARM64_AM::LSR;
-  case ISD::SRA:
-    return ARM64_AM::ASR;
-  case ISD::ROTR:
-    return ARM64_AM::ROR;
-  }
-}
-
-/// \brief Determine whether it is worth to fold V into an extended register.
-bool ARM64DAGToDAGISel::isWorthFolding(SDValue V) const {
-  // It hurts if a value is used at least twice, unless we are optimizing
-  // for code size.
-  if (ForCodeSize || V.hasOneUse())
-    return true;
-  return false;
-}
-
-/// SelectShiftedRegister - Select a "shifted register" operand. If the value
-/// is not shifted, set the Shift operand to default of "LSL 0". The logical
-/// instructions allow the shifted register to be rotated, but the arithmetic
-/// instructions do not. The AllowROR parameter specifies whether ROR is
-/// supported.
-bool ARM64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
-                                              SDValue &Reg, SDValue &Shift) {
-  ARM64_AM::ShiftType ShType = getShiftTypeForNode(N);
-  if (ShType == ARM64_AM::InvalidShift)
-    return false;
-  if (!AllowROR && ShType == ARM64_AM::ROR)
-    return false;
-
-  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-    unsigned BitSize = N.getValueType().getSizeInBits();
-    unsigned Val = RHS->getZExtValue() & (BitSize - 1);
-    unsigned ShVal = ARM64_AM::getShifterImm(ShType, Val);
-
-    Reg = N.getOperand(0);
-    Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
-    return isWorthFolding(N);
-  }
-
-  return false;
-}
-
-/// getExtendTypeForNode - Translate an extend node to the corresponding
-/// ExtendType value.
-static ARM64_AM::ExtendType getExtendTypeForNode(SDValue N,
-                                                 bool IsLoadStore = false) {
-  if (N.getOpcode() == ISD::SIGN_EXTEND ||
-      N.getOpcode() == ISD::SIGN_EXTEND_INREG) {
-    EVT SrcVT;
-    if (N.getOpcode() == ISD::SIGN_EXTEND_INREG)
-      SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
-    else
-      SrcVT = N.getOperand(0).getValueType();
-
-    if (!IsLoadStore && SrcVT == MVT::i8)
-      return ARM64_AM::SXTB;
-    else if (!IsLoadStore && SrcVT == MVT::i16)
-      return ARM64_AM::SXTH;
-    else if (SrcVT == MVT::i32)
-      return ARM64_AM::SXTW;
-    else if (SrcVT == MVT::i64)
-      return ARM64_AM::SXTX;
-
-    return ARM64_AM::InvalidExtend;
-  } else if (N.getOpcode() == ISD::ZERO_EXTEND ||
-             N.getOpcode() == ISD::ANY_EXTEND) {
-    EVT SrcVT = N.getOperand(0).getValueType();
-    if (!IsLoadStore && SrcVT == MVT::i8)
-      return ARM64_AM::UXTB;
-    else if (!IsLoadStore && SrcVT == MVT::i16)
-      return ARM64_AM::UXTH;
-    else if (SrcVT == MVT::i32)
-      return ARM64_AM::UXTW;
-    else if (SrcVT == MVT::i64)
-      return ARM64_AM::UXTX;
-
-    return ARM64_AM::InvalidExtend;
-  } else if (N.getOpcode() == ISD::AND) {
-    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
-    if (!CSD)
-      return ARM64_AM::InvalidExtend;
-    uint64_t AndMask = CSD->getZExtValue();
-
-    switch (AndMask) {
-    default:
-      return ARM64_AM::InvalidExtend;
-    case 0xFF:
-      return !IsLoadStore ? ARM64_AM::UXTB : ARM64_AM::InvalidExtend;
-    case 0xFFFF:
-      return !IsLoadStore ? ARM64_AM::UXTH : ARM64_AM::InvalidExtend;
-    case 0xFFFFFFFF:
-      return ARM64_AM::UXTW;
-    }
-  }
-
-  return ARM64_AM::InvalidExtend;
-}
-
-// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts.
-static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
-  if (DL->getOpcode() != ARM64ISD::DUPLANE16 &&
-      DL->getOpcode() != ARM64ISD::DUPLANE32)
-    return false;
-
-  SDValue SV = DL->getOperand(0);
-  if (SV.getOpcode() != ISD::INSERT_SUBVECTOR)
-    return false;
-
-  SDValue EV = SV.getOperand(1);
-  if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR)
-    return false;
-
-  ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
-  ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode());
-  LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue();
-  LaneOp = EV.getOperand(0);
-
-  return true;
-}
-
-// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is
-// a high lane extract.
-static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
-                             SDValue &LaneOp, int &LaneIdx) {
-
-  if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
-    std::swap(Op0, Op1);
-    if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx))
-      return false;
-  }
-  StdOp = Op1;
-  return true;
-}
-
-/// SelectMLAV64LaneV128 - ARM64 supports 64-bit vector MLAs (v4i16 and v2i32)
-/// where one multiplicand is a lane in the upper half of a 128-bit vector.
-/// Recognize and select this so that we don't emit unnecessary lane extracts. -SDNode *ARM64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - SDValue MLAOp1; // Will hold ordinary multiplicand for MLA. - SDValue MLAOp2; // Will hold lane-accessed multiplicand for MLA. - int LaneIdx = -1; // Will hold the lane index. - - if (Op1.getOpcode() != ISD::MUL || - !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, - LaneIdx)) { - std::swap(Op0, Op1); - if (Op1.getOpcode() != ISD::MUL || - !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, - LaneIdx)) - return 0; - } - - SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); - - SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal }; - - unsigned MLAOpc = ~0U; - - switch (N->getSimpleValueType(0).SimpleTy) { - default: - llvm_unreachable("Unrecognized MLA."); - case MVT::v4i16: - MLAOpc = ARM64::MLAv4i16_indexed; - break; - case MVT::v2i32: - MLAOpc = ARM64::MLAv2i32_indexed; - break; - } - - return CurDAG->getMachineNode(MLAOpc, SDLoc(N), N->getValueType(0), Ops); -} - -SDNode *ARM64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { - SDValue SMULLOp0; - SDValue SMULLOp1; - int LaneIdx; - - if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1, - LaneIdx)) - return 0; - - SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); - - SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal }; - - unsigned SMULLOpc = ~0U; - - if (IntNo == Intrinsic::arm64_neon_smull) { - switch (N->getSimpleValueType(0).SimpleTy) { - default: - llvm_unreachable("Unrecognized SMULL."); - case MVT::v4i32: - SMULLOpc = ARM64::SMULLv4i16_indexed; - break; - case MVT::v2i64: - SMULLOpc = ARM64::SMULLv2i32_indexed; - break; - } - } else if (IntNo == Intrinsic::arm64_neon_umull) { - switch (N->getSimpleValueType(0).SimpleTy) { - default: - llvm_unreachable("Unrecognized SMULL."); - case MVT::v4i32: - SMULLOpc = ARM64::UMULLv4i16_indexed; - break; - case MVT::v2i64: - SMULLOpc = ARM64::UMULLv2i32_indexed; - break; - } - } else - llvm_unreachable("Unrecognized intrinsic."); - - return CurDAG->getMachineNode(SMULLOpc, SDLoc(N), N->getValueType(0), Ops); -} - -/// SelectArithExtendedRegister - Select a "extended register" operand. This -/// operand folds in an extend followed by an optional left shift. -bool ARM64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, - SDValue &Shift) { - unsigned ShiftVal = 0; - ARM64_AM::ExtendType Ext; - - if (N.getOpcode() == ISD::SHL) { - ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); - if (!CSD) - return false; - ShiftVal = CSD->getZExtValue(); - if ((ShiftVal & 0x3) != ShiftVal) - return false; - - Ext = getExtendTypeForNode(N.getOperand(0)); - if (Ext == ARM64_AM::InvalidExtend) - return false; - - Reg = N.getOperand(0).getOperand(0); - } else { - Ext = getExtendTypeForNode(N); - if (Ext == ARM64_AM::InvalidExtend) - return false; - - Reg = N.getOperand(0); - } - - // ARM64 mandates that the RHS of the operation must use the smallest - // register classs that could contain the size being extended from. Thus, - // if we're folding a (sext i8), we need the RHS to be a GPR32, even though - // there might not be an actual 32-bit value in the program. We can - // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here. 
- if (Reg.getValueType() == MVT::i64 && Ext != ARM64_AM::UXTX && - Ext != ARM64_AM::SXTX) { - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - MachineSDNode *Node = CurDAG->getMachineNode( - TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32, Reg, SubReg); - Reg = SDValue(Node, 0); - } - - Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), MVT::i32); - return isWorthFolding(N); -} - -/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit -/// immediate" address. The "Size" argument is the size in bytes of the memory -/// reference, which determines the scale. -bool ARM64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, - SDValue &Base, SDValue &OffImm) { - const TargetLowering *TLI = getTargetLowering(); - if (N.getOpcode() == ISD::FrameIndex) { - int FI = cast(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - OffImm = CurDAG->getTargetConstant(0, MVT::i64); - return true; - } - - if (N.getOpcode() == ARM64ISD::ADDlow) { - GlobalAddressSDNode *GAN = - dyn_cast(N.getOperand(1).getNode()); - Base = N.getOperand(0); - OffImm = N.getOperand(1); - if (!GAN) - return true; - - const GlobalValue *GV = GAN->getGlobal(); - unsigned Alignment = GV->getAlignment(); - const DataLayout *DL = TLI->getDataLayout(); - if (Alignment == 0 && !Subtarget->isTargetDarwin()) - Alignment = DL->getABITypeAlignment(GV->getType()->getElementType()); - - if (Alignment >= Size) - return true; - } - - if (CurDAG->isBaseWithConstantOffset(N)) { - if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { - int64_t RHSC = (int64_t)RHS->getZExtValue(); - unsigned Scale = Log2_32(Size); - if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - } - OffImm = CurDAG->getTargetConstant(RHSC >> Scale, MVT::i64); - return true; - } - } - } - - // Before falling back to our general case, check if the unscaled - // instructions can handle this. If so, that's preferable. - if (SelectAddrModeUnscaled(N, Size, Base, OffImm)) - return false; - - // Base only. The address will be materialized into a register before - // the memory is accessed. - // add x0, Xbase, #offset - // ldr x0, [x0] - Base = N; - OffImm = CurDAG->getTargetConstant(0, MVT::i64); - return true; -} - -/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit -/// immediate" address. This should only match when there is an offset that -/// is not valid for a scaled immediate addressing mode. The "Size" argument -/// is the size in bytes of the memory reference, which is needed here to know -/// what is valid for a scaled immediate. -bool ARM64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size, - SDValue &Base, SDValue &OffImm) { - if (!CurDAG->isBaseWithConstantOffset(N)) - return false; - if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { - int64_t RHSC = RHS->getSExtValue(); - // If the offset is valid as a scaled immediate, don't match here. 
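// The scaled/unscaled split applied by SelectAddrModeIndexed above and
// SelectAddrModeUnscaled here, as a standalone sketch. Size is the access
// size in bytes, so 0x1000 << Log2_32(Size) == 4096 * Size; helper names and
// the asm mnemonics in the comments are illustrative.
#include <cassert>
#include <cstdint>

static bool fitsScaled(int64_t Off, unsigned Size) {
  // Non-negative multiple of the access size, below 4096 * Size.
  return (Off & (Size - 1)) == 0 && Off >= 0 && Off < int64_t(0x1000) * Size;
}
static bool fitsUnscaled(int64_t Off) {
  // Signed 9-bit immediate fallback (ldur/stur-style addressing).
  return Off >= -256 && Off < 256;
}

int main() {
  assert(fitsScaled(32760, 8));                   // ldr x0, [x1, #32760]
  assert(!fitsScaled(-8, 8) && fitsUnscaled(-8)); // negative: unscaled only
  assert(!fitsScaled(12, 8) && fitsUnscaled(12)); // misaligned for 8-byte scale
  return 0;
}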
- if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && - RHSC < (0x1000 << Log2_32(Size))) - return false; - if (RHSC >= -256 && RHSC < 256) { - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast(Base)->getIndex(); - const TargetLowering *TLI = getTargetLowering(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - } - OffImm = CurDAG->getTargetConstant(RHSC, MVT::i64); - return true; - } - } - return false; -} - -static SDValue Widen(SelectionDAG *CurDAG, SDValue N) { - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - SDValue ImpDef = SDValue( - CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SDLoc(N), MVT::i64), - 0); - MachineSDNode *Node = CurDAG->getMachineNode( - TargetOpcode::INSERT_SUBREG, SDLoc(N), MVT::i64, ImpDef, N, SubReg); - return SDValue(Node, 0); -} - -static SDValue WidenIfNeeded(SelectionDAG *CurDAG, SDValue N) { - if (N.getValueType() == MVT::i32) { - return Widen(CurDAG, N); - } - - return N; -} - -/// \brief Check if the given SHL node (\p N), can be used to form an -/// extended register for an addressing mode. -bool ARM64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size, - SDValue &Offset, SDValue &Imm) { - assert(N.getOpcode() == ISD::SHL && "Invalid opcode."); - ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); - if (CSD && (CSD->getZExtValue() & 0x7) == CSD->getZExtValue()) { - - ARM64_AM::ExtendType Ext = getExtendTypeForNode(N.getOperand(0), true); - if (Ext == ARM64_AM::InvalidExtend) { - Ext = ARM64_AM::UXTX; - Offset = WidenIfNeeded(CurDAG, N.getOperand(0)); - } else { - Offset = WidenIfNeeded(CurDAG, N.getOperand(0).getOperand(0)); - } - - unsigned LegalShiftVal = Log2_32(Size); - unsigned ShiftVal = CSD->getZExtValue(); - - if (ShiftVal != 0 && ShiftVal != LegalShiftVal) - return false; - - Imm = CurDAG->getTargetConstant( - ARM64_AM::getMemExtendImm(Ext, ShiftVal != 0), MVT::i32); - if (isWorthFolding(N)) - return true; - } - return false; -} - -bool ARM64DAGToDAGISel::SelectAddrModeRO(SDValue N, unsigned Size, - SDValue &Base, SDValue &Offset, - SDValue &Imm) { - if (N.getOpcode() != ISD::ADD) - return false; - SDValue LHS = N.getOperand(0); - SDValue RHS = N.getOperand(1); - - // We don't want to match immediate adds here, because they are better lowered - // to the register-immediate addressing modes. - if (isa(LHS) || isa(RHS)) - return false; - - // Check if this particular node is reused in any non-memory related - // operation. If yes, do not try to fold this node into the address - // computation, since the computation will be kept. - const SDNode *Node = N.getNode(); - for (SDNode::use_iterator UI = Node->use_begin(), UE = Node->use_end(); - UI != UE; ++UI) { - if (!isa(*UI)) - return false; - } - - // Remember if it is worth folding N when it produces extended register. - bool IsExtendedRegisterWorthFolding = isWorthFolding(N); - - // Try to match a shifted extend on the RHS. - if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && - SelectExtendedSHL(RHS, Size, Offset, Imm)) { - Base = LHS; - return true; - } - - // Try to match a shifted extend on the LHS. - if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && - SelectExtendedSHL(LHS, Size, Offset, Imm)) { - Base = RHS; - return true; - } - - ARM64_AM::ExtendType Ext = ARM64_AM::UXTX; - // Try to match an unshifted extend on the LHS. 
- if (IsExtendedRegisterWorthFolding && - (Ext = getExtendTypeForNode(LHS, true)) != ARM64_AM::InvalidExtend) { - Base = RHS; - Offset = WidenIfNeeded(CurDAG, LHS.getOperand(0)); - Imm = CurDAG->getTargetConstant(ARM64_AM::getMemExtendImm(Ext, false), - MVT::i32); - if (isWorthFolding(LHS)) - return true; - } - - // Try to match an unshifted extend on the RHS. - if (IsExtendedRegisterWorthFolding && - (Ext = getExtendTypeForNode(RHS, true)) != ARM64_AM::InvalidExtend) { - Base = LHS; - Offset = WidenIfNeeded(CurDAG, RHS.getOperand(0)); - Imm = CurDAG->getTargetConstant(ARM64_AM::getMemExtendImm(Ext, false), - MVT::i32); - if (isWorthFolding(RHS)) - return true; - } - - // Match any non-shifted, non-extend, non-immediate add expression. - Base = LHS; - Offset = WidenIfNeeded(CurDAG, RHS); - Ext = ARM64_AM::UXTX; - Imm = CurDAG->getTargetConstant(ARM64_AM::getMemExtendImm(Ext, false), - MVT::i32); - // Reg1 + Reg2 is free: no check needed. - return true; -} - -SDValue ARM64DAGToDAGISel::createDTuple(ArrayRef Regs) { - static unsigned RegClassIDs[] = { ARM64::DDRegClassID, ARM64::DDDRegClassID, - ARM64::DDDDRegClassID }; - static unsigned SubRegs[] = { ARM64::dsub0, ARM64::dsub1, - ARM64::dsub2, ARM64::dsub3 }; - - return createTuple(Regs, RegClassIDs, SubRegs); -} - -SDValue ARM64DAGToDAGISel::createQTuple(ArrayRef Regs) { - static unsigned RegClassIDs[] = { ARM64::QQRegClassID, ARM64::QQQRegClassID, - ARM64::QQQQRegClassID }; - static unsigned SubRegs[] = { ARM64::qsub0, ARM64::qsub1, - ARM64::qsub2, ARM64::qsub3 }; - - return createTuple(Regs, RegClassIDs, SubRegs); -} - -SDValue ARM64DAGToDAGISel::createTuple(ArrayRef Regs, - unsigned RegClassIDs[], - unsigned SubRegs[]) { - // There's no special register-class for a vector-list of 1 element: it's just - // a vector. - if (Regs.size() == 1) - return Regs[0]; - - assert(Regs.size() >= 2 && Regs.size() <= 4); - - SDLoc DL(Regs[0].getNode()); - - SmallVector Ops; - - // First operand of REG_SEQUENCE is the desired RegClass. - Ops.push_back( - CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32)); - - // Then we get pairs of source & subregister-position for the components. - for (unsigned i = 0; i < Regs.size(); ++i) { - Ops.push_back(Regs[i]); - Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32)); - } - - SDNode *N = - CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops); - return SDValue(N, 0); -} - -SDNode *ARM64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, - unsigned Opc, bool isExt) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - - unsigned ExtOff = isExt; - - // Form a REG_SEQUENCE to force register allocation. - unsigned Vec0Off = ExtOff + 1; - SmallVector Regs(N->op_begin() + Vec0Off, - N->op_begin() + Vec0Off + NumVecs); - SDValue RegSeq = createQTuple(Regs); - - SmallVector Ops; - if (isExt) - Ops.push_back(N->getOperand(1)); - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(NumVecs + ExtOff + 1)); - return CurDAG->getMachineNode(Opc, dl, VT, Ops); -} - -SDNode *ARM64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { - LoadSDNode *LD = cast(N); - if (LD->isUnindexed()) - return NULL; - EVT VT = LD->getMemoryVT(); - EVT DstVT = N->getValueType(0); - ISD::MemIndexedMode AM = LD->getAddressingMode(); - bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC; - - // We're not doing validity checking here. That was done when checking - // if we should mark the load as indexed or not. We're just selecting - // the right instruction. 
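// The *pre_isel/*post_isel opcodes chosen below differ only in when the base
// register write-back happens relative to the memory access. A minimal C++
// analogy (ours, not LLVM API; stepping in elements rather than bytes):
#include <cassert>
#include <cstdint>

static uint64_t loadPreIndexed(const uint64_t *&Base, int64_t Off) {
  Base += Off;        // write-back first, like "ldr x0, [x1, #8]!"
  return *Base;
}
static uint64_t loadPostIndexed(const uint64_t *&Base, int64_t Off) {
  uint64_t V = *Base; // access uses the old base, like "ldr x0, [x1], #8"
  Base += Off;        // write-back afterwards
  return V;
}

int main() {
  const uint64_t A[3] = {10, 20, 30};
  const uint64_t *P = A;
  assert(loadPostIndexed(P, 1) == 10 && P == A + 1); // read old slot, then bump
  assert(loadPreIndexed(P, 1) == 30 && P == A + 2);  // bump, then read new slot
  return 0;
}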
- unsigned Opcode = 0; - - ISD::LoadExtType ExtType = LD->getExtensionType(); - bool InsertTo64 = false; - if (VT == MVT::i64) - Opcode = IsPre ? ARM64::LDRXpre_isel : ARM64::LDRXpost_isel; - else if (VT == MVT::i32) { - if (ExtType == ISD::NON_EXTLOAD) - Opcode = IsPre ? ARM64::LDRWpre_isel : ARM64::LDRWpost_isel; - else if (ExtType == ISD::SEXTLOAD) - Opcode = IsPre ? ARM64::LDRSWpre_isel : ARM64::LDRSWpost_isel; - else { - Opcode = IsPre ? ARM64::LDRWpre_isel : ARM64::LDRWpost_isel; - InsertTo64 = true; - // The result of the load is only i32. It's the subreg_to_reg that makes - // it into an i64. - DstVT = MVT::i32; - } - } else if (VT == MVT::i16) { - if (ExtType == ISD::SEXTLOAD) { - if (DstVT == MVT::i64) - Opcode = IsPre ? ARM64::LDRSHXpre_isel : ARM64::LDRSHXpost_isel; - else - Opcode = IsPre ? ARM64::LDRSHWpre_isel : ARM64::LDRSHWpost_isel; - } else { - Opcode = IsPre ? ARM64::LDRHHpre_isel : ARM64::LDRHHpost_isel; - InsertTo64 = DstVT == MVT::i64; - // The result of the load is only i32. It's the subreg_to_reg that makes - // it into an i64. - DstVT = MVT::i32; - } - } else if (VT == MVT::i8) { - if (ExtType == ISD::SEXTLOAD) { - if (DstVT == MVT::i64) - Opcode = IsPre ? ARM64::LDRSBXpre_isel : ARM64::LDRSBXpost_isel; - else - Opcode = IsPre ? ARM64::LDRSBWpre_isel : ARM64::LDRSBWpost_isel; - } else { - Opcode = IsPre ? ARM64::LDRBBpre_isel : ARM64::LDRBBpost_isel; - InsertTo64 = DstVT == MVT::i64; - // The result of the load is only i32. It's the subreg_to_reg that makes - // it into an i64. - DstVT = MVT::i32; - } - } else if (VT == MVT::f32) { - Opcode = IsPre ? ARM64::LDRSpre_isel : ARM64::LDRSpost_isel; - } else if (VT == MVT::f64) { - Opcode = IsPre ? ARM64::LDRDpre_isel : ARM64::LDRDpost_isel; - } else - return NULL; - SDValue Chain = LD->getChain(); - SDValue Base = LD->getBasePtr(); - ConstantSDNode *OffsetOp = cast(LD->getOffset()); - int OffsetVal = (int)OffsetOp->getZExtValue(); - SDValue Offset = CurDAG->getTargetConstant(OffsetVal, MVT::i64); - SDValue Ops[] = { Base, Offset, Chain }; - SDNode *Res = CurDAG->getMachineNode(Opcode, SDLoc(N), DstVT, MVT::i64, - MVT::Other, Ops); - // Either way, we're replacing the node, so tell the caller that. 
- Done = true; - if (InsertTo64) { - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - SDNode *Sub = CurDAG->getMachineNode( - ARM64::SUBREG_TO_REG, SDLoc(N), MVT::i64, - CurDAG->getTargetConstant(0, MVT::i64), SDValue(Res, 0), SubReg); - ReplaceUses(SDValue(N, 0), SDValue(Sub, 0)); - ReplaceUses(SDValue(N, 1), SDValue(Res, 1)); - ReplaceUses(SDValue(N, 2), SDValue(Res, 2)); - return 0; - } - return Res; -} - -SDNode *ARM64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, - unsigned SubRegIdx) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - SDValue Chain = N->getOperand(0); - - SmallVector Ops; - Ops.push_back(N->getOperand(2)); // Mem operand; - Ops.push_back(Chain); - - std::vector ResTys; - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); - - SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - SDValue SuperReg = SDValue(Ld, 0); - - // MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - // MemOp[0] = cast(N)->getMemOperand(); - // cast(Ld)->setMemRefs(MemOp, MemOp + 1); - - switch (NumVecs) { - case 4: - ReplaceUses(SDValue(N, 3), CurDAG->getTargetExtractSubreg(SubRegIdx + 3, dl, - VT, SuperReg)); - // FALLTHROUGH - case 3: - ReplaceUses(SDValue(N, 2), CurDAG->getTargetExtractSubreg(SubRegIdx + 2, dl, - VT, SuperReg)); - // FALLTHROUGH - case 2: - ReplaceUses(SDValue(N, 1), CurDAG->getTargetExtractSubreg(SubRegIdx + 1, dl, - VT, SuperReg)); - ReplaceUses(SDValue(N, 0), - CurDAG->getTargetExtractSubreg(SubRegIdx, dl, VT, SuperReg)); - break; - case 1: - ReplaceUses(SDValue(N, 0), SuperReg); - break; - } - - ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - - return 0; -} - -SDNode *ARM64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getOperand(2)->getValueType(0); - - // Form a REG_SEQUENCE to force register allocation. - bool Is128Bit = VT.getSizeInBits() == 128; - SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); - SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(NumVecs + 2)); - Ops.push_back(N->getOperand(0)); - SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); - - return St; -} - -/// WidenVector - Given a value in the V64 register class, produce the -/// equivalent value in the V128 register class. -class WidenVector { - SelectionDAG &DAG; - -public: - WidenVector(SelectionDAG &DAG) : DAG(DAG) {} - - SDValue operator()(SDValue V64Reg) { - EVT VT = V64Reg.getValueType(); - unsigned NarrowSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); - SDLoc DL(V64Reg); - - SDValue Undef = - SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0); - return DAG.getTargetInsertSubreg(ARM64::dsub, DL, WideTy, Undef, V64Reg); - } -}; - -/// NarrowVector - Given a value in the V128 register class, produce the -/// equivalent value in the V64 register class. 
-static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
-  EVT VT = V128Reg.getValueType();
-  unsigned WideSize = VT.getVectorNumElements();
-  MVT EltTy = VT.getVectorElementType().getSimpleVT();
-  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
-
-  return DAG.getTargetExtractSubreg(ARM64::dsub, SDLoc(V128Reg), NarrowTy,
-                                    V128Reg);
-}
-
-SDNode *ARM64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
-                                          unsigned Opc) {
-  SDLoc dl(N);
-  EVT VT = N->getValueType(0);
-  bool Narrow = VT.getSizeInBits() == 64;
-
-  // Form a REG_SEQUENCE to force register allocation.
-  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
-
-  if (Narrow)
-    std::transform(Regs.begin(), Regs.end(), Regs.begin(),
-                   WidenVector(*CurDAG));
-
-  SDValue RegSeq = createQTuple(Regs);
-
-  std::vector<EVT> ResTys;
-  ResTys.push_back(MVT::Untyped);
-  ResTys.push_back(MVT::Other);
-
-  unsigned LaneNo =
-      cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
-
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
-  Ops.push_back(N->getOperand(NumVecs + 3));
-  Ops.push_back(N->getOperand(0));
-  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
-  SDValue SuperReg = SDValue(Ld, 0);
-
-  EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
-  switch (NumVecs) {
-  case 4: {
-    SDValue NV3 =
-        CurDAG->getTargetExtractSubreg(ARM64::qsub3, dl, WideVT, SuperReg);
-    if (Narrow)
-      ReplaceUses(SDValue(N, 3), NarrowVector(NV3, *CurDAG));
-    else
-      ReplaceUses(SDValue(N, 3), NV3);
-  }
-  // FALLTHROUGH
-  case 3: {
-    SDValue NV2 =
-        CurDAG->getTargetExtractSubreg(ARM64::qsub2, dl, WideVT, SuperReg);
-    if (Narrow)
-      ReplaceUses(SDValue(N, 2), NarrowVector(NV2, *CurDAG));
-    else
-      ReplaceUses(SDValue(N, 2), NV2);
-  }
-  // FALLTHROUGH
-  case 2: {
-    SDValue NV1 =
-        CurDAG->getTargetExtractSubreg(ARM64::qsub1, dl, WideVT, SuperReg);
-    SDValue NV0 =
-        CurDAG->getTargetExtractSubreg(ARM64::qsub0, dl, WideVT, SuperReg);
-    if (Narrow) {
-      ReplaceUses(SDValue(N, 1), NarrowVector(NV1, *CurDAG));
-      ReplaceUses(SDValue(N, 0), NarrowVector(NV0, *CurDAG));
-    } else {
-      ReplaceUses(SDValue(N, 1), NV1);
-      ReplaceUses(SDValue(N, 0), NV0);
-    }
-    break;
-  }
-  }
-
-  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
-
-  return Ld;
-}
-
-SDNode *ARM64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
-                                           unsigned Opc) {
-  SDLoc dl(N);
-  EVT VT = N->getOperand(2)->getValueType(0);
-  bool Narrow = VT.getSizeInBits() == 64;
-
-  // Form a REG_SEQUENCE to force register allocation.
-  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
-
-  if (Narrow)
-    std::transform(Regs.begin(), Regs.end(), Regs.begin(),
-                   WidenVector(*CurDAG));
-
-  SDValue RegSeq = createQTuple(Regs);
-
-  unsigned LaneNo =
-      cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
-
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
-  Ops.push_back(N->getOperand(NumVecs + 3));
-  Ops.push_back(N->getOperand(0));
-  SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
-
-  // Transfer memoperands.
-  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
-  cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
-
-  return St;
-}
-
-SDNode *ARM64DAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8,
-                                        unsigned Op16, unsigned Op32,
-                                        unsigned Op64) {
-  // Mostly direct translation to the given operations, except that we preserve
-  // the AtomicOrdering for use later on.
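// Aside, before the atomic selection continues: what the LD2 "single
// structure" form selected by SelectLoadLane above computes, modelled on
// scalars. Two consecutive elements from memory land in the same lane of two
// different vectors. Illustrative sketch only:
#include <cassert>
#include <cstdint>
int ld2_lane_model() {
  uint32_t mem[2] = {7, 9};
  uint32_t v0[4] = {0}, v1[4] = {0};
  unsigned Lane = 2;     // the LaneNo operand passed through above
  v0[Lane] = mem[0];     // ld2 { v0.s, v1.s }[2], [x0]
  v1[Lane] = mem[1];
  assert(v0[2] == 7 && v1[2] == 9);
  return 0;
}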
-  AtomicSDNode *AN = cast<AtomicSDNode>(Node);
-  EVT VT = AN->getMemoryVT();
-
-  unsigned Op;
-  if (VT == MVT::i8)
-    Op = Op8;
-  else if (VT == MVT::i16)
-    Op = Op16;
-  else if (VT == MVT::i32)
-    Op = Op32;
-  else if (VT == MVT::i64)
-    Op = Op64;
-  else
-    llvm_unreachable("Unexpected atomic operation");
-
-  SmallVector<SDValue, 4> Ops;
-  for (unsigned i = 1; i < AN->getNumOperands(); ++i)
-    Ops.push_back(AN->getOperand(i));
-
-  Ops.push_back(CurDAG->getTargetConstant(AN->getOrdering(), MVT::i32));
-  Ops.push_back(AN->getOperand(0)); // Chain moves to the end
-
-  return CurDAG->SelectNodeTo(Node, Op, AN->getValueType(0), MVT::Other,
-                              &Ops[0], Ops.size());
-}
-
-static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
-                                       unsigned &Opc, SDValue &Opd0,
-                                       unsigned &LSB, unsigned &MSB,
-                                       unsigned NumberOfIgnoredLowBits,
-                                       bool BiggerPattern) {
-  assert(N->getOpcode() == ISD::AND &&
-         "N must be an AND operation to call this function");
-
-  EVT VT = N->getValueType(0);
-
-  // Here we could test the type of VT and return false when it does not
-  // match, but since that check is done prior to this call in the current
-  // context, we turn it into an assert to avoid redundant code.
-  assert((VT == MVT::i32 || VT == MVT::i64) &&
-         "Type checking must have been done before calling this function");
-
-  // FIXME: simplify-demanded-bits in DAGCombine will probably have
-  // changed the AND node to a 32-bit mask operation. We'll have to
-  // undo that as part of the transform here if we want to catch all
-  // the opportunities.
-  // Currently the NumberOfIgnoredLowBits argument helps to recover
-  // from these situations when matching the bigger pattern (bitfield insert).
-
-  // For unsigned extracts, check for a shift right and mask.
-  uint64_t And_imm = 0;
-  if (!isOpcWithIntImmediate(N, ISD::AND, And_imm))
-    return false;
-
-  const SDNode *Op0 = N->getOperand(0).getNode();
-
-  // Because of simplify-demanded-bits in DAGCombine, the mask may have been
-  // simplified. Try to undo that.
-  And_imm |= (1 << NumberOfIgnoredLowBits) - 1;
-
-  // The immediate is a mask of the low bits iff imm & (imm+1) == 0.
-  if (And_imm & (And_imm + 1))
-    return false;
-
-  bool ClampMSB = false;
-  uint64_t Srl_imm = 0;
-  // Handle the SRL + ANY_EXTEND case.
-  if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
-      isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) {
-    // Extend the incoming operand of the SRL to 64-bit.
-    Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
-    // Make sure to clamp the MSB so that we preserve the semantics of the
-    // original operations.
-    ClampMSB = true;
-  } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) {
-    Opd0 = Op0->getOperand(0);
-  } else if (BiggerPattern) {
-    // Let's pretend a 0 shift right has been performed.
-    // The resulting code will be at least as good as the original one
-    // plus it may expose more opportunities for bitfield insert patterns.
-    // FIXME: Currently we limit this to the bigger pattern, because
-    // some optimizations expect AND and not UBFM.
-    Opd0 = N->getOperand(0);
-  } else
-    return false;
-
-  assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) &&
-         "bad amount in shift node!");
-
-  LSB = Srl_imm;
-  MSB = Srl_imm + (VT == MVT::i32 ?
-                       CountTrailingOnes_32(And_imm)
-                     : CountTrailingOnes_64(And_imm)) -
-        1;
-  if (ClampMSB)
-    // Since we're moving the extend before the right shift operation, we need
-    // to clamp the MSB to make sure we don't shift in undefined bits instead
-    // of the zeros which would get shifted in with the original right shift
-    // operation.
-    MSB = MSB > 31 ? 31 : MSB;
-
-  Opc = VT == MVT::i32 ? ARM64::UBFMWri : ARM64::UBFMXri;
-  return true;
-}
-
-static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
-                                     unsigned &LSB, unsigned &MSB) {
-  // We are looking for the following pattern, which basically extracts a
-  // single bit from the source value and places it in the LSB of the
-  // destination value; all other bits of the destination value are set to
-  // zero:
-  //
-  // Value2 = AND Value, MaskImm
-  // SRL Value2, ShiftImm
-  //
-  // with MaskImm >> ShiftImm == 1.
-  //
-  // This gets selected into a single UBFM:
-  //
-  // UBFM Value, ShiftImm, ShiftImm
-  //
-
-  if (N->getOpcode() != ISD::SRL)
-    return false;
-
-  uint64_t And_mask = 0;
-  if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask))
-    return false;
-
-  Opd0 = N->getOperand(0).getOperand(0);
-
-  uint64_t Srl_imm = 0;
-  if (!isIntImmediate(N->getOperand(1), Srl_imm))
-    return false;
-
-  // Check whether we really have a one-bit extract here.
-  if (And_mask >> Srl_imm == 0x1) {
-    if (N->getValueType(0) == MVT::i32)
-      Opc = ARM64::UBFMWri;
-    else
-      Opc = ARM64::UBFMXri;
-
-    LSB = MSB = Srl_imm;
-
-    return true;
-  }
-
-  return false;
-}
-
-static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
-                                       unsigned &LSB, unsigned &MSB,
-                                       bool BiggerPattern) {
-  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
-         "N must be a SHR/SRA operation to call this function");
-
-  EVT VT = N->getValueType(0);
-
-  // Here we could test the type of VT and return false when it does not
-  // match, but since that check is done prior to this call in the current
-  // context, we turn it into an assert to avoid redundant code.
-  assert((VT == MVT::i32 || VT == MVT::i64) &&
-         "Type checking must have been done before calling this function");
-
-  // Check for AND + SRL doing a one-bit extract.
-  if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB))
-    return true;
-
-  // We're looking for a shift of a shift.
-  uint64_t Shl_imm = 0;
-  if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
-    Opd0 = N->getOperand(0).getOperand(0);
-  } else if (BiggerPattern) {
-    // Let's pretend a 0 shift left has been performed.
-    // FIXME: Currently we limit this to the bigger pattern case,
-    // because some optimizations expect AND and not UBFM.
-    Opd0 = N->getOperand(0);
-  } else
-    return false;
-
-  assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!");
-  uint64_t Srl_imm = 0;
-  if (!isIntImmediate(N->getOperand(1), Srl_imm))
-    return false;
-
-  assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
-         "bad amount in shift node!");
-  // Note: The width operand is encoded as width-1.
-  unsigned Width = VT.getSizeInBits() - Srl_imm - 1;
-  int sLSB = Srl_imm - Shl_imm;
-  if (sLSB < 0)
-    return false;
-  LSB = sLSB;
-  MSB = LSB + Width;
-  // SRA requires a signed extraction.
-  if (VT == MVT::i32)
-    Opc = N->getOpcode() == ISD::SRA ? ARM64::SBFMWri : ARM64::UBFMWri;
-  else
-    Opc = N->getOpcode() == ISD::SRA ?
-          ARM64::SBFMXri : ARM64::UBFMXri;
-  return true;
-}
-
-static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
-                                SDValue &Opd0, unsigned &LSB, unsigned &MSB,
-                                unsigned NumberOfIgnoredLowBits = 0,
-                                bool BiggerPattern = false) {
-  if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
-    return false;
-
-  switch (N->getOpcode()) {
-  default:
-    if (!N->isMachineOpcode())
-      return false;
-    break;
-  case ISD::AND:
-    return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB,
-                                      NumberOfIgnoredLowBits, BiggerPattern);
-  case ISD::SRL:
-  case ISD::SRA:
-    return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern);
-  }
-
-  unsigned NOpc = N->getMachineOpcode();
-  switch (NOpc) {
-  default:
-    return false;
-  case ARM64::SBFMWri:
-  case ARM64::UBFMWri:
-  case ARM64::SBFMXri:
-  case ARM64::UBFMXri:
-    Opc = NOpc;
-    Opd0 = N->getOperand(0);
-    LSB = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
-    MSB = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
-    return true;
-  }
-  // Unreachable
-  return false;
-}
-
-SDNode *ARM64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
-  unsigned Opc, LSB, MSB;
-  SDValue Opd0;
-  if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB))
-    return NULL;
-
-  EVT VT = N->getValueType(0);
-  SDValue Ops[] = { Opd0, CurDAG->getTargetConstant(LSB, VT),
-                    CurDAG->getTargetConstant(MSB, VT) };
-  return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 3);
-}
-
-// Is the mask an i32 or i64 binary sequence 1..10..0 with
-// CountTrailingZeros(mask) == ExpectedTrailingZeros?
-static bool isHighMask(uint64_t Mask, unsigned ExpectedTrailingZeros,
-                       unsigned NumberOfIgnoredHighBits, EVT VT) {
-  assert((VT == MVT::i32 || VT == MVT::i64) &&
-         "i32 or i64 mask type expected!");
-
-  uint64_t ExpectedMask;
-  if (VT == MVT::i32) {
-    uint32_t ExpectedMaski32 = ~0 << ExpectedTrailingZeros;
-    ExpectedMask = ExpectedMaski32;
-    if (NumberOfIgnoredHighBits) {
-      uint32_t highMask = ~0 << (32 - NumberOfIgnoredHighBits);
-      Mask |= highMask;
-    }
-  } else {
-    ExpectedMask = ((uint64_t) ~0) << ExpectedTrailingZeros;
-    if (NumberOfIgnoredHighBits)
-      Mask |= ((uint64_t) ~0) << (64 - NumberOfIgnoredHighBits);
-  }
-
-  return Mask == ExpectedMask;
-}
-
-// Look for bits that will be useful for later uses.
-// A bit is considered useless as soon as it is dropped and never used
-// before it has been dropped.
-// E.g., looking for the useful bits of x:
-// 1. y = x & 0x7
-// 2. z = y >> 2
-// After #1, the useful bits of x are 0x7, and they live through y.
-// After #2, the useful bits of x are 0x4.
-// However, if x is used by an unpredictable instruction, then all of its bits
-// are useful.
-// E.g.
-// 1. y = x & 0x7
-// 2. z = y >> 2
-// 3.
-//    str x, [@x]
-static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
-
-static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
-                                              unsigned Depth) {
-  uint64_t Imm =
-      cast<ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
-  Imm = ARM64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
-  UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
-  getUsefulBits(Op, UsefulBits, Depth + 1);
-}
-
-static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
-                                             uint64_t Imm, uint64_t MSB,
-                                             unsigned Depth) {
-  // Inherit the bit width from UsefulBits.
-  APInt OpUsefulBits(UsefulBits);
-  OpUsefulBits = 1;
-
-  if (MSB >= Imm) {
-    OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
-    --OpUsefulBits;
-    // The interesting part will be in the lower part of the result.
-    getUsefulBits(Op, OpUsefulBits, Depth + 1);
-    // The interesting part was starting at Imm in the argument.
-    OpUsefulBits = OpUsefulBits.shl(Imm);
-  } else {
-    OpUsefulBits = OpUsefulBits.shl(MSB + 1);
-    --OpUsefulBits;
-    // The interesting part will be shifted in the result.
-    OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm);
-    getUsefulBits(Op, OpUsefulBits, Depth + 1);
-    // The interesting part was at zero in the argument.
-    OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm);
-  }
-
-  UsefulBits &= OpUsefulBits;
-}
-
-static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
-                                  unsigned Depth) {
-  uint64_t Imm =
-      cast<ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
-  uint64_t MSB =
-      cast<ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
-
-  getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
-}
-
-static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
-                                              unsigned Depth) {
-  uint64_t ShiftTypeAndValue =
-      cast<ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
-  APInt Mask(UsefulBits);
-  Mask.clearAllBits();
-  Mask.flipAllBits();
-
-  if (ARM64_AM::getShiftType(ShiftTypeAndValue) == ARM64_AM::LSL) {
-    // Shift Left
-    uint64_t ShiftAmt = ARM64_AM::getShiftValue(ShiftTypeAndValue);
-    Mask = Mask.shl(ShiftAmt);
-    getUsefulBits(Op, Mask, Depth + 1);
-    Mask = Mask.lshr(ShiftAmt);
-  } else if (ARM64_AM::getShiftType(ShiftTypeAndValue) == ARM64_AM::LSR) {
-    // Shift Right
-    // We do not handle ARM64_AM::ASR, because the sign will change the
-    // number of useful bits.
-    uint64_t ShiftAmt = ARM64_AM::getShiftValue(ShiftTypeAndValue);
-    Mask = Mask.lshr(ShiftAmt);
-    getUsefulBits(Op, Mask, Depth + 1);
-    Mask = Mask.shl(ShiftAmt);
-  } else
-    return;
-
-  UsefulBits &= Mask;
-}
-
-static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
-                                 unsigned Depth) {
-  uint64_t Imm =
-      cast<ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
-  uint64_t MSB =
-      cast<ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
-
-  if (Op.getOperand(1) == Orig)
-    return getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
-
-  APInt OpUsefulBits(UsefulBits);
-  OpUsefulBits = 1;
-
-  if (MSB >= Imm) {
-    OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
-    --OpUsefulBits;
-    UsefulBits &= ~OpUsefulBits;
-    getUsefulBits(Op, UsefulBits, Depth + 1);
-  } else {
-    OpUsefulBits = OpUsefulBits.shl(MSB + 1);
-    --OpUsefulBits;
-    UsefulBits = ~(OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm));
-    getUsefulBits(Op, UsefulBits, Depth + 1);
-  }
-}
-
-static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
-                                SDValue Orig, unsigned Depth) {
-
-  // Users of this node should have already been instruction selected.
-  // FIXME: Can we turn that into an assert?
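// Aside on the UBFM-based matchers above: (x >> LSB) & ((1 << Width) - 1) is
// a UBFX, which is the alias UBFM x, LSB, LSB + Width - 1 -- exactly the LSB
// and MSB immediates isBitfieldExtractOpFromAnd/Shr compute. Scalar check
// (illustrative values, not from this patch):
#include <cassert>
#include <cstdint>
int ubfx_model() {
  uint32_t x = 0xDEADBEEF;
  unsigned LSB = 8, Width = 12;                // so MSB = LSB + Width - 1 = 19
  uint32_t field = (x >> LSB) & ((1u << Width) - 1);
  assert(field == 0xDBE);                      // bits 19:8 of 0xDEADBEEF
  return 0;
}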
-  if (!UserNode->isMachineOpcode())
-    return;
-
-  switch (UserNode->getMachineOpcode()) {
-  default:
-    return;
-  case ARM64::ANDSWri:
-  case ARM64::ANDSXri:
-  case ARM64::ANDWri:
-  case ARM64::ANDXri:
-    // We increment Depth only when we call getUsefulBits.
-    return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits,
-                                             Depth);
-  case ARM64::UBFMWri:
-  case ARM64::UBFMXri:
-    return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth);
-
-  case ARM64::ORRWrs:
-  case ARM64::ORRXrs:
-    if (UserNode->getOperand(1) != Orig)
-      return;
-    return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
-                                             Depth);
-  case ARM64::BFMWri:
-  case ARM64::BFMXri:
-    return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
-  }
-}
-
-static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
-  if (Depth >= 6)
-    return;
-  // Initialize UsefulBits.
-  if (!Depth) {
-    unsigned Bitwidth = Op.getValueType().getScalarType().getSizeInBits();
-    // At the beginning, assume every produced bit is useful.
-    UsefulBits = APInt(Bitwidth, 0);
-    UsefulBits.flipAllBits();
-  }
-  APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0);
-
-  for (SDNode::use_iterator UseIt = Op.getNode()->use_begin(),
-                            UseEnd = Op.getNode()->use_end();
-       UseIt != UseEnd; ++UseIt) {
-    // A use cannot produce useful bits.
-    APInt UsefulBitsForUse = APInt(UsefulBits);
-    getUsefulBitsForUse(*UseIt, UsefulBitsForUse, Op, Depth);
-    UsersUsefulBits |= UsefulBitsForUse;
-  }
-  // UsefulBits contains the produced bits that are meaningful for the
-  // current definition, thus a user cannot make a bit meaningful at
-  // this point.
-  UsefulBits &= UsersUsefulBits;
-}
-
-// Given an OR operation, check if we have the following pattern:
-// ubfm c, b, imm, imm2 (or something that does the same job, see
-// isBitfieldExtractOp)
-// d = e & mask2 ; where mask2 is a binary sequence of 1..10..0 and
-// countTrailingZeros(mask2) == imm2 - imm + 1
-// f = d | c
-// If yes, the given reference arguments will be updated so that one can
-// replace the OR instruction with:
-// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
-static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0,
-                                     SDValue &Opd1, unsigned &LSB,
-                                     unsigned &MSB, SelectionDAG *CurDAG) {
-  assert(N->getOpcode() == ISD::OR && "Expect an OR operation");
-
-  // Set Opc.
-  EVT VT = N->getValueType(0);
-  if (VT == MVT::i32)
-    Opc = ARM64::BFMWri;
-  else if (VT == MVT::i64)
-    Opc = ARM64::BFMXri;
-  else
-    return false;
-
-  // Because of simplify-demanded-bits in DAGCombine, involved masks may not
-  // have the expected shape. Try to undo that.
-  APInt UsefulBits;
-  getUsefulBits(SDValue(N, 0), UsefulBits);
-
-  unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
-  unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
-
-  // OR is commutative, so check both operand orders. (Does LLVM provide a
-  // way to do that directly, e.g., via a code matcher?)
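// Aside: scalar form of the pattern isBitfieldInsertOpFromOr describes above.
// ORing a bitfield extracted from one value into a second value whose
// corresponding bits are known zero is a single BFM. Sketch with illustrative
// constants (not from this patch):
#include <cassert>
#include <cstdint>
int bfm_model() {
  uint32_t src = 0xABCD1234, dst = 0xFFFF0000;
  uint32_t c = (src >> 16) & 0xFFFF; // c = ubfx src, 16, 16  (the UBFM half)
  uint32_t d = dst & ~0xFFFFu;       // low 16 bits known zero (the AND half)
  uint32_t f = d | c;                // the OR: foldable into one BFM/BFXIL
  assert(f == 0xFFFFABCD);
  return 0;
}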
-  SDValue OrOpd1Val = N->getOperand(1);
-  SDNode *OrOpd0 = N->getOperand(0).getNode();
-  SDNode *OrOpd1 = N->getOperand(1).getNode();
-  for (int i = 0; i < 2;
-       ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) {
-    unsigned BFXOpc;
-    // Set the Opd1, LSB and MSB arguments by looking for
-    // c = ubfm b, imm, imm2
-    if (!isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Opd1, LSB, MSB,
-                             NumberOfIgnoredLowBits, true))
-      continue;
-
-    // Check that the returned opcode is compatible with the pattern,
-    // i.e., the same type and zero extended (U and not S).
-    if ((BFXOpc != ARM64::UBFMXri && VT == MVT::i64) ||
-        (BFXOpc != ARM64::UBFMWri && VT == MVT::i32))
-      continue;
-
-    // Compute the width of the bitfield insertion.
-    int sMSB = MSB - LSB + 1;
-    // FIXME: This constraint is to catch bitfield insertion only; we may
-    // want to widen the pattern if we want to grab the general bitfield
-    // move case.
-    if (sMSB <= 0)
-      continue;
-
-    // Check the second part of the pattern.
-    EVT VT = OrOpd1->getValueType(0);
-    if (VT != MVT::i32 && VT != MVT::i64)
-      continue;
-
-    // Compute the known-zero bits for the candidate first operand. This
-    // allows us to catch more general cases than just looking for an AND
-    // with an immediate. Indeed, simplify-demanded-bits may have removed
-    // the AND instruction because it proved to be useless.
-    APInt KnownZero, KnownOne;
-    CurDAG->ComputeMaskedBits(OrOpd1Val, KnownZero, KnownOne);
-
-    // Check if there is enough room for the second operand to appear
-    // in the first one.
-    if (KnownZero.countTrailingOnes() < (unsigned)sMSB)
-      continue;
-
-    // Set the first operand.
-    uint64_t Imm;
-    if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
-        isHighMask(Imm, sMSB, NumberOfIgnoredHighBits, VT))
-      // In that case, we can eliminate the AND.
-      Opd0 = OrOpd1->getOperand(0);
-    else
-      // Maybe the AND has been removed by simplify-demanded-bits,
-      // or it is useful because it discards more bits.
-      Opd0 = OrOpd1Val;
-
-    // Both parts match.
-    return true;
-  }
-
-  return false;
-}
-
-SDNode *ARM64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
-  if (N->getOpcode() != ISD::OR)
-    return NULL;
-
-  unsigned Opc;
-  unsigned LSB, MSB;
-  SDValue Opd0, Opd1;
-
-  if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG))
-    return NULL;
-
-  EVT VT = N->getValueType(0);
-  SDValue Ops[] = { Opd0,
-                    Opd1,
-                    CurDAG->getTargetConstant(LSB, VT),
-                    CurDAG->getTargetConstant(MSB, VT) };
-  return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 4);
-}
-
-SDNode *ARM64DAGToDAGISel::SelectLIBM(SDNode *N) {
-  EVT VT = N->getValueType(0);
-  unsigned Variant;
-  unsigned Opc;
-  unsigned FRINTXOpcs[] = { ARM64::FRINTXSr, ARM64::FRINTXDr };
-
-  if (VT == MVT::f32) {
-    Variant = 0;
-  } else if (VT == MVT::f64) {
-    Variant = 1;
-  } else
-    return 0; // Unrecognized argument type. Fall back on default codegen.
-
-  // Pick the FRINTX variant needed to set the flags.
-  unsigned FRINTXOpc = FRINTXOpcs[Variant];
-
-  switch (N->getOpcode()) {
-  default:
-    return 0; // Unrecognized libm ISD node. Fall back on default codegen.
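// Aside: the cases below map one-to-one onto ARM64 round instructions --
// FCEIL -> FRINTP (toward +inf), FFLOOR -> FRINTM (toward -inf),
// FTRUNC -> FRINTZ (toward zero), FROUND -> FRINTA (ties away from zero).
// A quick standalone check of the libm semantics being matched:
#include <cassert>
#include <cmath>
int frint_semantics_check() {
  assert(std::ceil(-1.5) == -1.0);  // FRINTP
  assert(std::floor(-1.5) == -2.0); // FRINTM
  assert(std::trunc(-1.5) == -1.0); // FRINTZ
  assert(std::round(-1.5) == -2.0); // FRINTA: ties away from zero
  return 0;
}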
-  case ISD::FCEIL: {
-    unsigned FRINTPOpcs[] = { ARM64::FRINTPSr, ARM64::FRINTPDr };
-    Opc = FRINTPOpcs[Variant];
-    break;
-  }
-  case ISD::FFLOOR: {
-    unsigned FRINTMOpcs[] = { ARM64::FRINTMSr, ARM64::FRINTMDr };
-    Opc = FRINTMOpcs[Variant];
-    break;
-  }
-  case ISD::FTRUNC: {
-    unsigned FRINTZOpcs[] = { ARM64::FRINTZSr, ARM64::FRINTZDr };
-    Opc = FRINTZOpcs[Variant];
-    break;
-  }
-  case ISD::FROUND: {
-    unsigned FRINTAOpcs[] = { ARM64::FRINTASr, ARM64::FRINTADr };
-    Opc = FRINTAOpcs[Variant];
-    break;
-  }
-  }
-
-  SDLoc dl(N);
-  SDValue In = N->getOperand(0);
-  SmallVector<SDValue, 2> Ops;
-  Ops.push_back(In);
-
-  if (!TM.Options.UnsafeFPMath) {
-    SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In);
-    Ops.push_back(SDValue(FRINTX, 1));
-  }
-
-  return CurDAG->getMachineNode(Opc, dl, VT, Ops);
-}
-
-SDNode *ARM64DAGToDAGISel::Select(SDNode *Node) {
-  // Dump information about the Node being selected.
-  DEBUG(errs() << "Selecting: ");
-  DEBUG(Node->dump(CurDAG));
-  DEBUG(errs() << "\n");
-
-  // If we have a custom node, we already have selected!
-  if (Node->isMachineOpcode()) {
-    DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
-    Node->setNodeId(-1);
-    return NULL;
-  }
-
-  // A few custom selection cases.
-  SDNode *ResNode = 0;
-  EVT VT = Node->getValueType(0);
-
-  switch (Node->getOpcode()) {
-  default:
-    break;
-
-  case ISD::ADD:
-    if (SDNode *I = SelectMLAV64LaneV128(Node))
-      return I;
-    break;
-
-  case ISD::ATOMIC_LOAD_ADD:
-    return SelectAtomic(Node, ARM64::ATOMIC_LOAD_ADD_I8,
-                        ARM64::ATOMIC_LOAD_ADD_I16, ARM64::ATOMIC_LOAD_ADD_I32,
-                        ARM64::ATOMIC_LOAD_ADD_I64);
-  case ISD::ATOMIC_LOAD_SUB:
-    return SelectAtomic(Node, ARM64::ATOMIC_LOAD_SUB_I8,
-                        ARM64::ATOMIC_LOAD_SUB_I16, ARM64::ATOMIC_LOAD_SUB_I32,
-                        ARM64::ATOMIC_LOAD_SUB_I64);
-  case ISD::ATOMIC_LOAD_AND:
-    return SelectAtomic(Node, ARM64::ATOMIC_LOAD_AND_I8,
-                        ARM64::ATOMIC_LOAD_AND_I16, ARM64::ATOMIC_LOAD_AND_I32,
-                        ARM64::ATOMIC_LOAD_AND_I64);
-  case ISD::ATOMIC_LOAD_OR:
-    return SelectAtomic(Node, ARM64::ATOMIC_LOAD_OR_I8,
-                        ARM64::ATOMIC_LOAD_OR_I16, ARM64::ATOMIC_LOAD_OR_I32,
-                        ARM64::ATOMIC_LOAD_OR_I64);
-  case ISD::ATOMIC_LOAD_XOR:
-    return SelectAtomic(Node, ARM64::ATOMIC_LOAD_XOR_I8,
-                        ARM64::ATOMIC_LOAD_XOR_I16, ARM64::ATOMIC_LOAD_XOR_I32,
-                        ARM64::ATOMIC_LOAD_XOR_I64);
-  case ISD::ATOMIC_LOAD_NAND:
-    return SelectAtomic(
-        Node, ARM64::ATOMIC_LOAD_NAND_I8, ARM64::ATOMIC_LOAD_NAND_I16,
-        ARM64::ATOMIC_LOAD_NAND_I32, ARM64::ATOMIC_LOAD_NAND_I64);
-  case ISD::ATOMIC_LOAD_MIN:
-    return SelectAtomic(Node, ARM64::ATOMIC_LOAD_MIN_I8,
-                        ARM64::ATOMIC_LOAD_MIN_I16, ARM64::ATOMIC_LOAD_MIN_I32,
-                        ARM64::ATOMIC_LOAD_MIN_I64);
-  case ISD::ATOMIC_LOAD_MAX:
-    return SelectAtomic(Node, ARM64::ATOMIC_LOAD_MAX_I8,
-                        ARM64::ATOMIC_LOAD_MAX_I16, ARM64::ATOMIC_LOAD_MAX_I32,
-                        ARM64::ATOMIC_LOAD_MAX_I64);
-  case ISD::ATOMIC_LOAD_UMIN:
-    return SelectAtomic(
-        Node, ARM64::ATOMIC_LOAD_UMIN_I8, ARM64::ATOMIC_LOAD_UMIN_I16,
-        ARM64::ATOMIC_LOAD_UMIN_I32, ARM64::ATOMIC_LOAD_UMIN_I64);
-  case ISD::ATOMIC_LOAD_UMAX:
-    return SelectAtomic(
-        Node, ARM64::ATOMIC_LOAD_UMAX_I8, ARM64::ATOMIC_LOAD_UMAX_I16,
-        ARM64::ATOMIC_LOAD_UMAX_I32, ARM64::ATOMIC_LOAD_UMAX_I64);
-  case ISD::ATOMIC_SWAP:
-    return SelectAtomic(Node, ARM64::ATOMIC_SWAP_I8, ARM64::ATOMIC_SWAP_I16,
-                        ARM64::ATOMIC_SWAP_I32, ARM64::ATOMIC_SWAP_I64);
-  case ISD::ATOMIC_CMP_SWAP:
-    return SelectAtomic(Node, ARM64::ATOMIC_CMP_SWAP_I8,
-                        ARM64::ATOMIC_CMP_SWAP_I16, ARM64::ATOMIC_CMP_SWAP_I32,
-                        ARM64::ATOMIC_CMP_SWAP_I64);
-
-  case ISD::LOAD: {
-    // Try to select as an
-    // indexed load. Fall through to normal processing if we can't.
-    bool Done = false;
-    SDNode *I = SelectIndexedLoad(Node, Done);
-    if (Done)
-      return I;
-    break;
-  }
-
-  case ISD::SRL:
-  case ISD::AND:
-  case ISD::SRA:
-    if (SDNode *I = SelectBitfieldExtractOp(Node))
-      return I;
-    break;
-
-  case ISD::OR:
-    if (SDNode *I = SelectBitfieldInsertOp(Node))
-      return I;
-    break;
-
-  case ISD::EXTRACT_VECTOR_ELT: {
-    // Extracting lane zero is a special case where we can just use a plain
-    // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for
-    // the rest of the compiler, especially the register allocator and copy
-    // propagation, to reason about, so is preferred when it's possible to
-    // use it.
-    ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1));
-    // Bail and use the default Select() for non-zero lanes.
-    if (LaneNode->getZExtValue() != 0)
-      break;
-    // If the element type is not the same as the result type, likewise
-    // bail and use the default Select(), as there's more to do than just
-    // a cross-class COPY. This catches extracts of i8 and i16 elements
-    // since they will need an explicit zext.
-    if (VT != Node->getOperand(0).getValueType().getVectorElementType())
-      break;
-    unsigned SubReg;
-    switch (Node->getOperand(0)
-                .getValueType()
-                .getVectorElementType()
-                .getSizeInBits()) {
-    default:
-      assert(0 && "Unexpected vector element type!");
-    case 64:
-      SubReg = ARM64::dsub;
-      break;
-    case 32:
-      SubReg = ARM64::ssub;
-      break;
-    case 16: // FALLTHROUGH
-    case 8:
-      llvm_unreachable("unexpected zext-requiring extract element!");
-    }
-    SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT,
-                                                     Node->getOperand(0));
-    DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
-    DEBUG(Extract->dumpr(CurDAG));
-    DEBUG(dbgs() << "\n");
-    return Extract.getNode();
-  }
-  case ISD::Constant: {
-    // Materialize zero constants as copies from WZR/XZR. This allows
-    // the coalescer to propagate these into other instructions.
-    ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
-    if (ConstNode->isNullValue()) {
-      if (VT == MVT::i32)
-        return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
-                                      ARM64::WZR, MVT::i32).getNode();
-      else if (VT == MVT::i64)
-        return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
-                                      ARM64::XZR, MVT::i64).getNode();
-    }
-    break;
-  }
-
-  case ISD::FrameIndex: {
-    // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
-    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
-    unsigned Shifter = ARM64_AM::getShifterImm(ARM64_AM::LSL, 0);
-    const TargetLowering *TLI = getTargetLowering();
-    SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
-    SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
-                      CurDAG->getTargetConstant(Shifter, MVT::i32) };
-    return CurDAG->SelectNodeTo(Node, ARM64::ADDXri, MVT::i64, Ops, 3);
-  }
-  case ISD::INTRINSIC_W_CHAIN: {
-    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
-    switch (IntNo) {
-    default:
-      break;
-    case Intrinsic::arm64_ldxp: {
-      SDValue MemAddr = Node->getOperand(2);
-      SDLoc DL(Node);
-      SDValue Chain = Node->getOperand(0);
-
-      SDNode *Ld = CurDAG->getMachineNode(ARM64::LDXPX, DL, MVT::i64, MVT::i64,
-                                          MVT::Other, MemAddr, Chain);
-
-      // Transfer memoperands.
-      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-      MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
-      cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
-      return Ld;
-    }
-    case Intrinsic::arm64_stxp: {
-      SDLoc DL(Node);
-      SDValue Chain = Node->getOperand(0);
-      SDValue ValLo = Node->getOperand(2);
-      SDValue ValHi = Node->getOperand(3);
-      SDValue MemAddr = Node->getOperand(4);
-
-      // Place arguments in the right order.
-      SmallVector<SDValue, 4> Ops;
-      Ops.push_back(ValLo);
-      Ops.push_back(ValHi);
-      Ops.push_back(MemAddr);
-      Ops.push_back(Chain);
-
-      SDNode *St =
-          CurDAG->getMachineNode(ARM64::STXPX, DL, MVT::i32, MVT::Other, Ops);
-      // Transfer memoperands.
-      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-      MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
-      cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
-
-      return St;
-    }
-    case Intrinsic::arm64_neon_ld1x2:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 2, ARM64::LD1Twov8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 2, ARM64::LD1Twov16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 2, ARM64::LD1Twov4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 2, ARM64::LD1Twov8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 2, ARM64::LD1Twov2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 2, ARM64::LD1Twov4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 2, ARM64::LD1Twov1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 2, ARM64::LD1Twov2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld1x3:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 3, ARM64::LD1Threev8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 3, ARM64::LD1Threev16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 3, ARM64::LD1Threev4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 3, ARM64::LD1Threev8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 3, ARM64::LD1Threev2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 3, ARM64::LD1Threev4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 3, ARM64::LD1Threev1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 3, ARM64::LD1Threev2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld1x4:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld2:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 2, ARM64::LD2Twov8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return
-            SelectLoad(Node, 2, ARM64::LD2Twov16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 2, ARM64::LD2Twov4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 2, ARM64::LD2Twov8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 2, ARM64::LD2Twov2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 2, ARM64::LD2Twov4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 2, ARM64::LD1Twov1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 2, ARM64::LD2Twov2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld3:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 3, ARM64::LD3Threev8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 3, ARM64::LD3Threev16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 3, ARM64::LD3Threev4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 3, ARM64::LD3Threev8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 3, ARM64::LD3Threev2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 3, ARM64::LD3Threev4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 3, ARM64::LD1Threev1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 3, ARM64::LD3Threev2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld4:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 4, ARM64::LD4Fourv8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 4, ARM64::LD4Fourv16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 4, ARM64::LD4Fourv4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 4, ARM64::LD4Fourv8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 4, ARM64::LD4Fourv2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 4, ARM64::LD4Fourv4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 4, ARM64::LD4Fourv2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld2r:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 2, ARM64::LD2Rv8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 2, ARM64::LD2Rv16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 2, ARM64::LD2Rv4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 2, ARM64::LD2Rv8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 2, ARM64::LD2Rv2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 2, ARM64::LD2Rv4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 2, ARM64::LD2Rv1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 2, ARM64::LD2Rv2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld3r:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 3, ARM64::LD3Rv8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 3, ARM64::LD3Rv16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 3, ARM64::LD3Rv4h, ARM64::dsub0);
-      else if (VT ==
-               MVT::v8i16)
-        return SelectLoad(Node, 3, ARM64::LD3Rv8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 3, ARM64::LD3Rv2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 3, ARM64::LD3Rv4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 3, ARM64::LD3Rv1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 3, ARM64::LD3Rv2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld4r:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 4, ARM64::LD4Rv8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 4, ARM64::LD4Rv16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 4, ARM64::LD4Rv4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 4, ARM64::LD4Rv8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 4, ARM64::LD4Rv2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 4, ARM64::LD4Rv4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 4, ARM64::LD4Rv1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 4, ARM64::LD4Rv2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld2lane:
-      if (VT == MVT::v16i8 || VT == MVT::v8i8)
-        return SelectLoadLane(Node, 2, ARM64::LD2i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
-        return SelectLoadLane(Node, 2, ARM64::LD2i16);
-      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-               VT == MVT::v2f32)
-        return SelectLoadLane(Node, 2, ARM64::LD2i32);
-      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-               VT == MVT::v1f64)
-        return SelectLoadLane(Node, 2, ARM64::LD2i64);
-      break;
-    case Intrinsic::arm64_neon_ld3lane:
-      if (VT == MVT::v16i8 || VT == MVT::v8i8)
-        return SelectLoadLane(Node, 3, ARM64::LD3i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
-        return SelectLoadLane(Node, 3, ARM64::LD3i16);
-      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-               VT == MVT::v2f32)
-        return SelectLoadLane(Node, 3, ARM64::LD3i32);
-      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-               VT == MVT::v1f64)
-        return SelectLoadLane(Node, 3, ARM64::LD3i64);
-      break;
-    case Intrinsic::arm64_neon_ld4lane:
-      if (VT == MVT::v16i8 || VT == MVT::v8i8)
-        return SelectLoadLane(Node, 4, ARM64::LD4i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
-        return SelectLoadLane(Node, 4, ARM64::LD4i16);
-      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-               VT == MVT::v2f32)
-        return SelectLoadLane(Node, 4, ARM64::LD4i32);
-      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-               VT == MVT::v1f64)
-        return SelectLoadLane(Node, 4, ARM64::LD4i64);
-      break;
-    }
-  } break;
-  case ISD::INTRINSIC_WO_CHAIN: {
-    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
-    switch (IntNo) {
-    default:
-      break;
-    case Intrinsic::arm64_neon_tbl2:
-      return SelectTable(Node, 2, VT == MVT::v8i8 ? ARM64::TBLv8i8Two
-                                                  : ARM64::TBLv16i8Two,
-                         false);
-    case Intrinsic::arm64_neon_tbl3:
-      return SelectTable(Node, 3, VT == MVT::v8i8 ? ARM64::TBLv8i8Three
-                                                  : ARM64::TBLv16i8Three,
-                         false);
-    case Intrinsic::arm64_neon_tbl4:
-      return SelectTable(Node, 4, VT == MVT::v8i8 ? ARM64::TBLv8i8Four
-                                                  : ARM64::TBLv16i8Four,
-                         false);
-    case Intrinsic::arm64_neon_tbx2:
-      return SelectTable(Node, 2, VT == MVT::v8i8 ?
-                                      ARM64::TBXv8i8Two
-                                                  : ARM64::TBXv16i8Two,
-                         true);
-    case Intrinsic::arm64_neon_tbx3:
-      return SelectTable(Node, 3, VT == MVT::v8i8 ? ARM64::TBXv8i8Three
-                                                  : ARM64::TBXv16i8Three,
-                         true);
-    case Intrinsic::arm64_neon_tbx4:
-      return SelectTable(Node, 4, VT == MVT::v8i8 ? ARM64::TBXv8i8Four
-                                                  : ARM64::TBXv16i8Four,
-                         true);
-    case Intrinsic::arm64_neon_smull:
-    case Intrinsic::arm64_neon_umull:
-      if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node))
-        return N;
-      break;
-    }
-    break;
-  }
-  case ISD::INTRINSIC_VOID: {
-    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
-    if (Node->getNumOperands() >= 3)
-      VT = Node->getOperand(2)->getValueType(0);
-    switch (IntNo) {
-    default:
-      break;
-    case Intrinsic::arm64_neon_st1x2: {
-      if (VT == MVT::v8i8)
-        return SelectStore(Node, 2, ARM64::ST1Twov8b);
-      else if (VT == MVT::v16i8)
-        return SelectStore(Node, 2, ARM64::ST1Twov16b);
-      else if (VT == MVT::v4i16)
-        return SelectStore(Node, 2, ARM64::ST1Twov4h);
-      else if (VT == MVT::v8i16)
-        return SelectStore(Node, 2, ARM64::ST1Twov8h);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectStore(Node, 2, ARM64::ST1Twov2s);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectStore(Node, 2, ARM64::ST1Twov4s);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectStore(Node, 2, ARM64::ST1Twov2d);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectStore(Node, 2, ARM64::ST1Twov1d);
-      break;
-    }
-    case Intrinsic::arm64_neon_st1x3: {
-      if (VT == MVT::v8i8)
-        return SelectStore(Node, 3, ARM64::ST1Threev8b);
-      else if (VT == MVT::v16i8)
-        return SelectStore(Node, 3, ARM64::ST1Threev16b);
-      else if (VT == MVT::v4i16)
-        return SelectStore(Node, 3, ARM64::ST1Threev4h);
-      else if (VT == MVT::v8i16)
-        return SelectStore(Node, 3, ARM64::ST1Threev8h);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectStore(Node, 3, ARM64::ST1Threev2s);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectStore(Node, 3, ARM64::ST1Threev4s);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectStore(Node, 3, ARM64::ST1Threev2d);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectStore(Node, 3, ARM64::ST1Threev1d);
-      break;
-    }
-    case Intrinsic::arm64_neon_st1x4: {
-      if (VT == MVT::v8i8)
-        return SelectStore(Node, 4, ARM64::ST1Fourv8b);
-      else if (VT == MVT::v16i8)
-        return SelectStore(Node, 4, ARM64::ST1Fourv16b);
-      else if (VT == MVT::v4i16)
-        return SelectStore(Node, 4, ARM64::ST1Fourv4h);
-      else if (VT == MVT::v8i16)
-        return SelectStore(Node, 4, ARM64::ST1Fourv8h);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectStore(Node, 4, ARM64::ST1Fourv2s);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectStore(Node, 4, ARM64::ST1Fourv4s);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectStore(Node, 4, ARM64::ST1Fourv2d);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectStore(Node, 4, ARM64::ST1Fourv1d);
-      break;
-    }
-    case Intrinsic::arm64_neon_st2: {
-      if (VT == MVT::v8i8)
-        return SelectStore(Node, 2, ARM64::ST2Twov8b);
-      else if (VT == MVT::v16i8)
-        return SelectStore(Node, 2, ARM64::ST2Twov16b);
-      else if (VT == MVT::v4i16)
-        return SelectStore(Node, 2, ARM64::ST2Twov4h);
-      else if (VT == MVT::v8i16)
-        return SelectStore(Node, 2, ARM64::ST2Twov8h);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectStore(Node, 2, ARM64::ST2Twov2s);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectStore(Node, 2, ARM64::ST2Twov4s);
-      else if (VT == MVT::v2i64 || VT ==
-               MVT::v2f64)
-        return SelectStore(Node, 2, ARM64::ST2Twov2d);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectStore(Node, 2, ARM64::ST1Twov1d);
-      break;
-    }
-    case Intrinsic::arm64_neon_st3: {
-      if (VT == MVT::v8i8)
-        return SelectStore(Node, 3, ARM64::ST3Threev8b);
-      else if (VT == MVT::v16i8)
-        return SelectStore(Node, 3, ARM64::ST3Threev16b);
-      else if (VT == MVT::v4i16)
-        return SelectStore(Node, 3, ARM64::ST3Threev4h);
-      else if (VT == MVT::v8i16)
-        return SelectStore(Node, 3, ARM64::ST3Threev8h);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectStore(Node, 3, ARM64::ST3Threev2s);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectStore(Node, 3, ARM64::ST3Threev4s);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectStore(Node, 3, ARM64::ST3Threev2d);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectStore(Node, 3, ARM64::ST1Threev1d);
-      break;
-    }
-    case Intrinsic::arm64_neon_st4: {
-      if (VT == MVT::v8i8)
-        return SelectStore(Node, 4, ARM64::ST4Fourv8b);
-      else if (VT == MVT::v16i8)
-        return SelectStore(Node, 4, ARM64::ST4Fourv16b);
-      else if (VT == MVT::v4i16)
-        return SelectStore(Node, 4, ARM64::ST4Fourv4h);
-      else if (VT == MVT::v8i16)
-        return SelectStore(Node, 4, ARM64::ST4Fourv8h);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectStore(Node, 4, ARM64::ST4Fourv2s);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectStore(Node, 4, ARM64::ST4Fourv4s);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectStore(Node, 4, ARM64::ST4Fourv2d);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectStore(Node, 4, ARM64::ST1Fourv1d);
-      break;
-    }
-    case Intrinsic::arm64_neon_st2lane: {
-      if (VT == MVT::v16i8 || VT == MVT::v8i8)
-        return SelectStoreLane(Node, 2, ARM64::ST2i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
-        return SelectStoreLane(Node, 2, ARM64::ST2i16);
-      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-               VT == MVT::v2f32)
-        return SelectStoreLane(Node, 2, ARM64::ST2i32);
-      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-               VT == MVT::v1f64)
-        return SelectStoreLane(Node, 2, ARM64::ST2i64);
-      break;
-    }
-    case Intrinsic::arm64_neon_st3lane: {
-      if (VT == MVT::v16i8 || VT == MVT::v8i8)
-        return SelectStoreLane(Node, 3, ARM64::ST3i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
-        return SelectStoreLane(Node, 3, ARM64::ST3i16);
-      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-               VT == MVT::v2f32)
-        return SelectStoreLane(Node, 3, ARM64::ST3i32);
-      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-               VT == MVT::v1f64)
-        return SelectStoreLane(Node, 3, ARM64::ST3i64);
-      break;
-    }
-    case Intrinsic::arm64_neon_st4lane: {
-      if (VT == MVT::v16i8 || VT == MVT::v8i8)
-        return SelectStoreLane(Node, 4, ARM64::ST4i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
-        return SelectStoreLane(Node, 4, ARM64::ST4i16);
-      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-               VT == MVT::v2f32)
-        return SelectStoreLane(Node, 4, ARM64::ST4i32);
-      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-               VT == MVT::v1f64)
-        return SelectStoreLane(Node, 4, ARM64::ST4i64);
-      break;
-    }
-    }
-  }
-
-  case ISD::FCEIL:
-  case ISD::FFLOOR:
-  case ISD::FTRUNC:
-  case ISD::FROUND:
-    if (SDNode *I = SelectLIBM(Node))
-      return I;
-    break;
-  }
-
-  // Select the default instruction.
-  ResNode = SelectCode(Node);
-
-  DEBUG(errs() << "=> ");
-  if (ResNode == NULL || ResNode ==
-      Node)
-    DEBUG(Node->dump(CurDAG));
-  else
-    DEBUG(ResNode->dump(CurDAG));
-  DEBUG(errs() << "\n");
-
-  return ResNode;
-}
-
-/// createARM64ISelDag - This pass converts a legalized DAG into a
-/// ARM64-specific DAG, ready for instruction scheduling.
-FunctionPass *llvm::createARM64ISelDag(ARM64TargetMachine &TM,
-                                       CodeGenOpt::Level OptLevel) {
-  return new ARM64DAGToDAGISel(TM, OptLevel);
-}
diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp
deleted file mode 100644
index 641f591..0000000
--- a/lib/Target/ARM64/ARM64ISelLowering.cpp
+++ /dev/null
@@ -1,7551 +0,0 @@
-//===-- ARM64ISelLowering.cpp - ARM64 DAG Lowering Implementation --------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the ARM64TargetLowering class.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "arm64-lower"
-
-#include "ARM64ISelLowering.h"
-#include "ARM64PerfectShuffle.h"
-#include "ARM64Subtarget.h"
-#include "ARM64CallingConv.h"
-#include "ARM64MachineFunctionInfo.h"
-#include "ARM64TargetMachine.h"
-#include "ARM64TargetObjectFile.h"
-#include "MCTargetDesc/ARM64AddressingModes.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetOptions.h"
-using namespace llvm;
-
-STATISTIC(NumTailCalls, "Number of tail calls");
-STATISTIC(NumShiftInserts, "Number of vector shift inserts");
-
-// This option should go away when tail calls fully work.
-static cl::opt<bool>
-EnableARM64TailCalls("arm64-tail-calls", cl::Hidden,
-                     cl::desc("Generate ARM64 tail calls (TEMPORARY OPTION)."),
-                     cl::init(true));
-
-static cl::opt<bool>
-StrictAlign("arm64-strict-align", cl::Hidden,
-            cl::desc("Disallow all unaligned memory accesses"));
-
-// Placeholder until extr generation is tested fully.
-static cl::opt<bool>
-EnableARM64ExtrGeneration("arm64-extr-generation", cl::Hidden,
-                          cl::desc("Allow ARM64 (or (shift)(shift))->extract"),
-                          cl::init(true));
-
-static cl::opt<bool>
-EnableARM64SlrGeneration("arm64-shift-insert-generation", cl::Hidden,
-                         cl::desc("Allow ARM64 SLI/SRI formation"),
-                         cl::init(false));
-
-//===----------------------------------------------------------------------===//
-// ARM64 Lowering public interface.
-//===----------------------------------------------------------------------===//
-static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
-  if (TM.getSubtarget<ARM64Subtarget>().isTargetDarwin())
-    return new ARM64_MachoTargetObjectFile();
-
-  return new ARM64_ELFTargetObjectFile();
-}
-
-ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM)
-    : TargetLowering(TM, createTLOF(TM)) {
-  Subtarget = &TM.getSubtarget<ARM64Subtarget>();
-
-  // ARM64 doesn't have comparisons which set GPRs or setcc instructions, so
-  // we have to make something up. Arbitrarily, choose ZeroOrOne.
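// Aside: the practical difference between the two boolean conventions chosen
// here (scalar just below, vector a few lines further down). Scalar predicates
// are materialized as 0/1, which is what CSET produces, while NEON compares
// write all-ones/all-zeros per lane. Scalar model, illustrative only:
#include <cassert>
#include <cstdint>
int boolean_contents_model() {
  int a = 3, b = 4;
  uint32_t scalar = (a < b) ? 1u : 0u; // ZeroOrOneBooleanContent (cset)
  uint32_t lane = (a < b) ? ~0u : 0u;  // ZeroOrNegativeOneBooleanContent (cmgt)
  assert(scalar == 1u && lane == 0xFFFFFFFFu);
  return 0;
}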
-  setBooleanContents(ZeroOrOneBooleanContent);
-  // When comparing vectors the result sets the different elements in the
-  // vector to all-one or all-zero.
-  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
-
-  // Set up the register classes.
-  addRegisterClass(MVT::i32, &ARM64::GPR32allRegClass);
-  addRegisterClass(MVT::i64, &ARM64::GPR64allRegClass);
-  addRegisterClass(MVT::f32, &ARM64::FPR32RegClass);
-  addRegisterClass(MVT::f64, &ARM64::FPR64RegClass);
-  addRegisterClass(MVT::f128, &ARM64::FPR128RegClass);
-  addRegisterClass(MVT::v16i8, &ARM64::FPR8RegClass);
-  addRegisterClass(MVT::v8i16, &ARM64::FPR16RegClass);
-
-  // Someone set us up the NEON.
-  addDRTypeForNEON(MVT::v2f32);
-  addDRTypeForNEON(MVT::v8i8);
-  addDRTypeForNEON(MVT::v4i16);
-  addDRTypeForNEON(MVT::v2i32);
-  addDRTypeForNEON(MVT::v1i64);
-  addDRTypeForNEON(MVT::v1f64);
-
-  addQRTypeForNEON(MVT::v4f32);
-  addQRTypeForNEON(MVT::v2f64);
-  addQRTypeForNEON(MVT::v16i8);
-  addQRTypeForNEON(MVT::v8i16);
-  addQRTypeForNEON(MVT::v4i32);
-  addQRTypeForNEON(MVT::v2i64);
-
-  // Compute derived properties from the register classes.
-  computeRegisterProperties();
-
-  // Provide all sorts of operation actions.
-  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
-  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
-  setOperationAction(ISD::SETCC, MVT::i32, Custom);
-  setOperationAction(ISD::SETCC, MVT::i64, Custom);
-  setOperationAction(ISD::SETCC, MVT::f32, Custom);
-  setOperationAction(ISD::SETCC, MVT::f64, Custom);
-  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
-  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
-  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
-  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
-  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
-  setOperationAction(ISD::SELECT, MVT::i32, Custom);
-  setOperationAction(ISD::SELECT, MVT::i64, Custom);
-  setOperationAction(ISD::SELECT, MVT::f32, Custom);
-  setOperationAction(ISD::SELECT, MVT::f64, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
-  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
-  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
-
-  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
-  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
-  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
-
-  setOperationAction(ISD::FREM, MVT::f32, Expand);
-  setOperationAction(ISD::FREM, MVT::f64, Expand);
-  setOperationAction(ISD::FREM, MVT::f80, Expand);
-
-  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
-  // silliness like this:
-  setOperationAction(ISD::FABS, MVT::v1f64, Expand);
-  setOperationAction(ISD::FADD, MVT::v1f64, Expand);
-  setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
-  setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
-  setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
-  setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
-  setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
-  setOperationAction(ISD::FMA, MVT::v1f64, Expand);
-  setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
-  setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
-  setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
-  setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
-  setOperationAction(ISD::FREM, MVT::v1f64, Expand);
-  setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
-  setOperationAction(ISD::FRINT,
-                     MVT::v1f64, Expand);
-  setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
-  setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
-  setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
-  setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
-  setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
-  setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
-  setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
-  setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
-  setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
-  setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
-
-  setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
-  setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
-  setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
-  setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
-  setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
-
-  // Custom lowering hooks are needed for XOR
-  // to fold it into CSINC/CSINV.
-  setOperationAction(ISD::XOR, MVT::i32, Custom);
-  setOperationAction(ISD::XOR, MVT::i64, Custom);
-
-  // Virtually no operation on f128 is legal, but LLVM can't expand them when
-  // there's a valid register class, so we need custom operations in most
-  // cases.
-  setOperationAction(ISD::FABS, MVT::f128, Expand);
-  setOperationAction(ISD::FADD, MVT::f128, Custom);
-  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
-  setOperationAction(ISD::FCOS, MVT::f128, Expand);
-  setOperationAction(ISD::FDIV, MVT::f128, Custom);
-  setOperationAction(ISD::FMA, MVT::f128, Expand);
-  setOperationAction(ISD::FMUL, MVT::f128, Custom);
-  setOperationAction(ISD::FNEG, MVT::f128, Expand);
-  setOperationAction(ISD::FPOW, MVT::f128, Expand);
-  setOperationAction(ISD::FREM, MVT::f128, Expand);
-  setOperationAction(ISD::FRINT, MVT::f128, Expand);
-  setOperationAction(ISD::FSIN, MVT::f128, Expand);
-  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
-  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
-  setOperationAction(ISD::FSUB, MVT::f128, Custom);
-  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
-  setOperationAction(ISD::SETCC, MVT::f128, Custom);
-  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
-  setOperationAction(ISD::SELECT, MVT::f128, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
-  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
-
-  // Lowering for many of the conversions is actually specified by the
-  // non-f128 type. The LowerXXX function will be trivial when f128 isn't
-  // involved.
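// Aside: what "specified by the non-f128 type" means in practice. The hooks
// installed below for ISD::FP_TO_SINT are keyed on the integer result type,
// so the same Custom lowering sees both f64 -> i32 (a single fcvtzs) and
// f128 -> i32, which has no instruction and is emitted as a compiler-rt
// libcall. Illustrative user-level code that triggers the latter path:
int f128_to_int(long double x) { // long double is IEEE binary128 on AArch64
  return static_cast<int>(x);    // lowers to a libcall such as __fixtfsi
}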
-  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
-  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
-  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
-  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
-  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
-  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
-  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
-  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
-  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
-  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
-  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
-  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
-  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
-  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
-
-  // 128-bit atomics.
-  setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i128, Custom);
-  // These are surprisingly difficult. The only single-copy atomic 128-bit
-  // instruction on AArch64 is stxp (when it succeeds). So a store can safely
-  // become a simple swap, but a load can only be determined to have been
-  // atomic if storing the same value back succeeds.
-  setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Expand);
-
-  // Variable arguments.
-  setOperationAction(ISD::VASTART, MVT::Other, Custom);
-  setOperationAction(ISD::VAARG, MVT::Other, Custom);
-  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
-  setOperationAction(ISD::VAEND, MVT::Other, Expand);
-
-  // Variable-sized objects.
-  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
-  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
-  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
-
-  // Exception handling.
-  // FIXME: These are guesses. Has this been defined yet?
-  setExceptionPointerRegister(ARM64::X0);
-  setExceptionSelectorRegister(ARM64::X1);
-
-  // Constant pool entries.
-  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
-
-  // BlockAddress.
-  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
-
-  // Add/Sub overflow ops with MVT::Glue are lowered to CPSR dependences.
-  setOperationAction(ISD::ADDC, MVT::i32, Custom);
-  setOperationAction(ISD::ADDE, MVT::i32, Custom);
-  setOperationAction(ISD::SUBC, MVT::i32, Custom);
-  setOperationAction(ISD::SUBE, MVT::i32, Custom);
-  setOperationAction(ISD::ADDC, MVT::i64, Custom);
-  setOperationAction(ISD::ADDE, MVT::i64, Custom);
-  setOperationAction(ISD::SUBC, MVT::i64, Custom);
-  setOperationAction(ISD::SUBE, MVT::i64, Custom);
-
-  // ARM64 lacks both left-rotate and popcount instructions.
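// Aside: what marking ISD::ROTL as Expand (next lines) does. The legalizer
// rewrites the rotate in terms of shifts and OR, which the backend can then
// match as ROR with a complemented amount, since rotl(x, n) == rotr(x, 32 - n).
// Scalar model of the expansion (illustrative):
#include <cassert>
#include <cstdint>
uint32_t rotl_expanded(uint32_t x, unsigned n) {
  n &= 31;                                   // rotate amount modulo the width
  return n ? (x << n) | (x >> (32 - n)) : x; // shifts + OR, matchable as ROR
}
int rotl_check() {
  assert(rotl_expanded(0x80000001u, 1) == 0x3u);
  return 0;
}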
-
-  // Variable arguments.
-  setOperationAction(ISD::VASTART, MVT::Other, Custom);
-  setOperationAction(ISD::VAARG, MVT::Other, Custom);
-  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
-  setOperationAction(ISD::VAEND, MVT::Other, Expand);
-
-  // Variable-sized objects.
-  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
-  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
-  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
-
-  // Exception handling.
-  // FIXME: These are guesses. Has this been defined yet?
-  setExceptionPointerRegister(ARM64::X0);
-  setExceptionSelectorRegister(ARM64::X1);
-
-  // Constant pool entries
-  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
-
-  // BlockAddress
-  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
-
-  // Add/Sub overflow ops with MVT::Glue are lowered to CPSR dependences.
-  setOperationAction(ISD::ADDC, MVT::i32, Custom);
-  setOperationAction(ISD::ADDE, MVT::i32, Custom);
-  setOperationAction(ISD::SUBC, MVT::i32, Custom);
-  setOperationAction(ISD::SUBE, MVT::i32, Custom);
-  setOperationAction(ISD::ADDC, MVT::i64, Custom);
-  setOperationAction(ISD::ADDE, MVT::i64, Custom);
-  setOperationAction(ISD::SUBC, MVT::i64, Custom);
-  setOperationAction(ISD::SUBE, MVT::i64, Custom);
-
-  // ARM64 lacks both left-rotate and popcount instructions.
-  setOperationAction(ISD::ROTL, MVT::i32, Expand);
-  setOperationAction(ISD::ROTL, MVT::i64, Expand);
-
-  // ARM64 doesn't have direct vector ->f32 conversion instructions for
-  // elements smaller than i32, so promote the input to i32 first.
-  setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
-  setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
-  setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
-  setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
-  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
-  setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
-  setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
-  setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
-  setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
-
-  // ARM64 doesn't have {U|S}MUL_LOHI.
-  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
-  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
-
-  // ARM64 doesn't have MUL.2d:
-  setOperationAction(ISD::MUL, MVT::v2i64, Expand);
-
-  // Expand the undefined-at-zero variants of cttz/ctlz to their defined-at-zero
-  // counterparts, which ARM64 supports directly.
-  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
-  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
-  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
-  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
-
-  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
-  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
-
-  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
-  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
-  setOperationAction(ISD::SREM, MVT::i32, Expand);
-  setOperationAction(ISD::SREM, MVT::i64, Expand);
-  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
-  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
-  setOperationAction(ISD::UREM, MVT::i32, Expand);
-  setOperationAction(ISD::UREM, MVT::i64, Expand);
-
-  // Custom lower Add/Sub/Mul with overflow.
-  setOperationAction(ISD::SADDO, MVT::i32, Custom);
-  setOperationAction(ISD::SADDO, MVT::i64, Custom);
-  setOperationAction(ISD::UADDO, MVT::i32, Custom);
-  setOperationAction(ISD::UADDO, MVT::i64, Custom);
-  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
-  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
-  setOperationAction(ISD::USUBO, MVT::i32, Custom);
-  setOperationAction(ISD::USUBO, MVT::i64, Custom);
-  setOperationAction(ISD::SMULO, MVT::i32, Custom);
-  setOperationAction(ISD::SMULO, MVT::i64, Custom);
-  setOperationAction(ISD::UMULO, MVT::i32, Custom);
-  setOperationAction(ISD::UMULO, MVT::i64, Custom);
-
-  setOperationAction(ISD::FSIN, MVT::f32, Expand);
-  setOperationAction(ISD::FSIN, MVT::f64, Expand);
-  setOperationAction(ISD::FCOS, MVT::f32, Expand);
-  setOperationAction(ISD::FCOS, MVT::f64, Expand);
-  setOperationAction(ISD::FPOW, MVT::f32, Expand);
-  setOperationAction(ISD::FPOW, MVT::f64, Expand);
-  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
-  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
-
-  // ARM64 has implementations of a lot of rounding-like FP operations.
-  static MVT RoundingTypes[] = { MVT::f32,   MVT::f64,  MVT::v2f32,
-                                 MVT::v4f32, MVT::v2f64 };
-  for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
-    MVT Ty = RoundingTypes[I];
-    setOperationAction(ISD::FFLOOR, Ty, Legal);
-    setOperationAction(ISD::FNEARBYINT, Ty, Legal);
-    setOperationAction(ISD::FCEIL, Ty, Legal);
-    setOperationAction(ISD::FRINT, Ty, Legal);
-    setOperationAction(ISD::FTRUNC, Ty, Legal);
-    setOperationAction(ISD::FROUND, Ty, Legal);
-  }
-
-  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
-
-  if (Subtarget->isTargetMachO()) {
-    // For iOS, we don't want the normal expansion of a libcall to
-    // sincos. We want to issue a libcall to __sincos_stret to avoid memory
-    // traffic.
-    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
-    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
-  } else {
-    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
-    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
-  }
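A sketch of the Darwin-only FSINCOS fast path, assuming the usual __sincos_stret convention of returning both results in registers; the struct layout here is an illustrative stand-in, and the real symbol is provided by the system math library:

    struct SinCosResult { double Sin, Cos; }; // assumed return layout
    extern "C" SinCosResult __sincos_stret(double);

    // One call yields both values in registers with no output pointers,
    // avoiding the stack traffic of separate sin() and cos() libcalls.
    double phase(double X) {
      SinCosResult R = __sincos_stret(X);
      return R.Sin + R.Cos;
    }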
-
-  // ARM64 does not have floating-point extending loads, i1 sign-extending load,
-  // floating-point truncating stores, or v2i32->v2i16 truncating store.
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
-  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
-  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
-  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
-  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
-  // Indexed loads and stores are supported.
-  for (unsigned im = (unsigned)ISD::PRE_INC;
-       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
-    setIndexedLoadAction(im, MVT::i8, Legal);
-    setIndexedLoadAction(im, MVT::i16, Legal);
-    setIndexedLoadAction(im, MVT::i32, Legal);
-    setIndexedLoadAction(im, MVT::i64, Legal);
-    setIndexedLoadAction(im, MVT::f64, Legal);
-    setIndexedLoadAction(im, MVT::f32, Legal);
-    setIndexedStoreAction(im, MVT::i8, Legal);
-    setIndexedStoreAction(im, MVT::i16, Legal);
-    setIndexedStoreAction(im, MVT::i32, Legal);
-    setIndexedStoreAction(im, MVT::i64, Legal);
-    setIndexedStoreAction(im, MVT::f64, Legal);
-    setIndexedStoreAction(im, MVT::f32, Legal);
-  }
-
-  // Likewise, narrowing and extending vector loads/stores aren't handled
-  // directly.
-  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
-
-    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
-                       Expand);
-
-    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
-      setTruncStoreAction((MVT::SimpleValueType)VT,
-                          (MVT::SimpleValueType)InnerVT, Expand);
-    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
-    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
-    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
-  }
-
-  // Trap.
-  setOperationAction(ISD::TRAP, MVT::Other, Legal);
-  setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
-
-  // We combine OR nodes for bitfield operations.
-  setTargetDAGCombine(ISD::OR);
-
-  // Vector add and sub nodes may conceal a high-half opportunity.
-  // Also, try to fold ADD into CSINC/CSINV.
-  setTargetDAGCombine(ISD::ADD);
-  setTargetDAGCombine(ISD::SUB);
-
-  setTargetDAGCombine(ISD::XOR);
-  setTargetDAGCombine(ISD::SINT_TO_FP);
-  setTargetDAGCombine(ISD::UINT_TO_FP);
-
-  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
-
-  setTargetDAGCombine(ISD::ANY_EXTEND);
-  setTargetDAGCombine(ISD::ZERO_EXTEND);
-  setTargetDAGCombine(ISD::SIGN_EXTEND);
-  setTargetDAGCombine(ISD::BITCAST);
-  setTargetDAGCombine(ISD::CONCAT_VECTORS);
-  setTargetDAGCombine(ISD::STORE);
-
-  setTargetDAGCombine(ISD::MUL);
-
-  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
-  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
-  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
-
-  setStackPointerRegisterToSaveRestore(ARM64::SP);
-
-  setSchedulingPreference(Sched::Hybrid);
-
-  // Enable TBZ/TBNZ
-  MaskAndBranchFoldingIsLegal = true;
-
-  setMinFunctionAlignment(2);
-
-  RequireStrictAlign = StrictAlign;
-}
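The MaxStoresPerMemset/Memcpy/Memmove caps above bound how aggressively small, known-size block operations are inlined as discrete stores rather than libcalls. A small illustration (the size and struct are illustrative):

    #include <cstring>

    struct Blob { char Bytes[32]; };

    // With MaxStoresPerMemcpy = 4, a fixed 32-byte copy like this can be
    // emitted as a handful of wide load/store pairs instead of a call to
    // memcpy; larger or unknown sizes fall back to the library routine.
    void copy(Blob &Dst, const Blob &Src) {
      std::memcpy(&Dst, &Src, sizeof(Blob));
    }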
-
-void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
-  if (VT == MVT::v2f32) {
-    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
-    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
-
-    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
-    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
-  } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
-    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
-    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
-
-    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
-    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
-  }
-
-  // Mark vector float intrinsics as expand.
-  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
-    setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
-  }
-
-  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SCALAR_TO_VECTOR, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
-
-  setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
-  setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
-
-  setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
-
-  setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
-
-  setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
-}
-
-void ARM64TargetLowering::addDRTypeForNEON(MVT VT) {
-  addRegisterClass(VT, &ARM64::FPR64RegClass);
-  addTypeForNEON(VT, MVT::v2i32);
-}
-
-void ARM64TargetLowering::addQRTypeForNEON(MVT VT) {
-  addRegisterClass(VT, &ARM64::FPR128RegClass);
-  addTypeForNEON(VT, MVT::v4i32);
-}
-
-EVT ARM64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
-  if (!VT.isVector())
-    return MVT::i32;
-  return VT.changeVectorElementTypeToInteger();
-}
-
-/// computeMaskedBitsForTargetNode - Determine which of the bits specified in
-/// Mask are known to be either zero or one and return them in the
-/// KnownZero/KnownOne bitsets.
-void ARM64TargetLowering::computeMaskedBitsForTargetNode(
-    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
-    const SelectionDAG &DAG, unsigned Depth) const {
-  switch (Op.getOpcode()) {
-  default:
-    break;
-  case ARM64ISD::CSEL: {
-    APInt KnownZero2, KnownOne2;
-    DAG.ComputeMaskedBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
-    DAG.ComputeMaskedBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
-    KnownZero &= KnownZero2;
-    KnownOne &= KnownOne2;
-    break;
-  }
-  case ISD::INTRINSIC_W_CHAIN:
-    break;
-  case ISD::INTRINSIC_WO_CHAIN:
-  case ISD::INTRINSIC_VOID: {
-    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-    switch (IntNo) {
-    default:
-      break;
-    case Intrinsic::arm64_neon_umaxv:
-    case Intrinsic::arm64_neon_uminv: {
-      // Figure out the datatype of the vector operand. The UMINV instruction
-      // will zero extend the result, so we can mark as known zero all the
-      // bits larger than the element datatype. 32-bit or larger doesn't need
-      // this as those are legal types and will be handled by isel directly.
-      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
-      unsigned BitWidth = KnownZero.getBitWidth();
-      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
-        assert(BitWidth >= 8 && "Unexpected width!");
-        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
-        KnownZero |= Mask;
-      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
-        assert(BitWidth >= 16 && "Unexpected width!");
-        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
-        KnownZero |= Mask;
-      }
-      break;
-    } break;
-    }
-  }
-  }
-}
-
-MVT ARM64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
-  return MVT::i64;
-}
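The umaxv/uminv masking above in plain integer form: once the instruction zero-extends an i8 element result, every bit above bit 7 is known zero, which is exactly what the APInt::getHighBitsSet mask records. A standalone sketch:

    #include <cassert>
    #include <cstdint>

    int main() {
      // v16i8 case: BitWidth = 32, element width = 8.
      uint32_t KnownZero = 0xFFFFFF00u; // the high (32 - 8) bits are known zero
      uint32_t UMaxvResult = 0xABu;     // any zero-extended i8 result
      assert((UMaxvResult & KnownZero) == 0);
      return 0;
    }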
-
-unsigned ARM64TargetLowering::getMaximalGlobalOffset() const {
-  // FIXME: On ARM64, this depends on the type.
-  // Basically, the addressable offsets are 0 to 4095 * Ty.getSizeInBytes(),
-  // and the offset has to be a multiple of the related size in bytes.
-  return 4095;
-}
-
-FastISel *
-ARM64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
-                                    const TargetLibraryInfo *libInfo) const {
-  return ARM64::createFastISel(funcInfo, libInfo);
-}
-
-const char *ARM64TargetLowering::getTargetNodeName(unsigned Opcode) const {
-  switch (Opcode) {
-  default:
-    return 0;
-  case ARM64ISD::CALL: return "ARM64ISD::CALL";
-  case ARM64ISD::ADRP: return "ARM64ISD::ADRP";
-  case ARM64ISD::ADDlow: return "ARM64ISD::ADDlow";
-  case ARM64ISD::LOADgot: return "ARM64ISD::LOADgot";
-  case ARM64ISD::RET_FLAG: return "ARM64ISD::RET_FLAG";
-  case ARM64ISD::BRCOND: return "ARM64ISD::BRCOND";
-  case ARM64ISD::CSEL: return "ARM64ISD::CSEL";
-  case ARM64ISD::FCSEL: return "ARM64ISD::FCSEL";
-  case ARM64ISD::CSINV: return "ARM64ISD::CSINV";
-  case ARM64ISD::CSNEG: return "ARM64ISD::CSNEG";
-  case ARM64ISD::CSINC: return "ARM64ISD::CSINC";
-  case ARM64ISD::THREAD_POINTER: return "ARM64ISD::THREAD_POINTER";
-  case ARM64ISD::TLSDESC_CALL: return "ARM64ISD::TLSDESC_CALL";
-  case ARM64ISD::ADC: return "ARM64ISD::ADC";
-  case ARM64ISD::SBC: return "ARM64ISD::SBC";
-  case ARM64ISD::ADDS: return "ARM64ISD::ADDS";
-  case ARM64ISD::SUBS: return "ARM64ISD::SUBS";
-  case ARM64ISD::ADCS: return "ARM64ISD::ADCS";
-  case ARM64ISD::SBCS: return "ARM64ISD::SBCS";
-  case ARM64ISD::ANDS: return "ARM64ISD::ANDS";
-  case ARM64ISD::FCMP: return "ARM64ISD::FCMP";
-  case ARM64ISD::FMIN: return "ARM64ISD::FMIN";
-  case ARM64ISD::FMAX: return "ARM64ISD::FMAX";
-  case ARM64ISD::DUP: return "ARM64ISD::DUP";
-  case ARM64ISD::DUPLANE8: return "ARM64ISD::DUPLANE8";
-  case ARM64ISD::DUPLANE16: return "ARM64ISD::DUPLANE16";
-  case ARM64ISD::DUPLANE32: return "ARM64ISD::DUPLANE32";
-  case ARM64ISD::DUPLANE64: return "ARM64ISD::DUPLANE64";
-  case ARM64ISD::MOVI: return "ARM64ISD::MOVI";
-  case ARM64ISD::MOVIshift: return "ARM64ISD::MOVIshift";
-  case ARM64ISD::MOVIedit: return "ARM64ISD::MOVIedit";
-  case ARM64ISD::MOVImsl: return "ARM64ISD::MOVImsl";
-  case ARM64ISD::FMOV: return "ARM64ISD::FMOV";
-  case ARM64ISD::MVNIshift: return "ARM64ISD::MVNIshift";
-  case ARM64ISD::MVNImsl: return "ARM64ISD::MVNImsl";
-  case ARM64ISD::BICi: return "ARM64ISD::BICi";
-  case ARM64ISD::ORRi: return "ARM64ISD::ORRi";
-  case ARM64ISD::NEG: return "ARM64ISD::NEG";
-  case ARM64ISD::EXTR: return "ARM64ISD::EXTR";
-  case ARM64ISD::ZIP1: return "ARM64ISD::ZIP1";
-  case ARM64ISD::ZIP2: return "ARM64ISD::ZIP2";
-  case ARM64ISD::UZP1: return "ARM64ISD::UZP1";
-  case ARM64ISD::UZP2: return "ARM64ISD::UZP2";
-  case ARM64ISD::TRN1: return "ARM64ISD::TRN1";
-  case ARM64ISD::TRN2: return "ARM64ISD::TRN2";
-  case ARM64ISD::REV16: return "ARM64ISD::REV16";
-  case ARM64ISD::REV32: return "ARM64ISD::REV32";
-  case ARM64ISD::REV64: return "ARM64ISD::REV64";
-  case ARM64ISD::EXT: return "ARM64ISD::EXT";
-  case ARM64ISD::VSHL: return "ARM64ISD::VSHL";
-  case ARM64ISD::VLSHR: return "ARM64ISD::VLSHR";
-  case ARM64ISD::VASHR: return "ARM64ISD::VASHR";
-  case ARM64ISD::CMEQ: return "ARM64ISD::CMEQ";
-  case ARM64ISD::CMGE: return "ARM64ISD::CMGE";
-  case ARM64ISD::CMGT: return "ARM64ISD::CMGT";
-  case ARM64ISD::CMHI: return "ARM64ISD::CMHI";
-  case ARM64ISD::CMHS: return "ARM64ISD::CMHS";
-  case ARM64ISD::FCMEQ: return "ARM64ISD::FCMEQ";
-  case ARM64ISD::FCMGE: return "ARM64ISD::FCMGE";
-  case ARM64ISD::FCMGT: return "ARM64ISD::FCMGT";
-  case ARM64ISD::CMEQz: return "ARM64ISD::CMEQz";
-  case ARM64ISD::CMGEz: return "ARM64ISD::CMGEz";
-  case ARM64ISD::CMGTz: return "ARM64ISD::CMGTz";
-  case ARM64ISD::CMLEz: return "ARM64ISD::CMLEz";
-  case ARM64ISD::CMLTz: return "ARM64ISD::CMLTz";
-  case ARM64ISD::FCMEQz: return "ARM64ISD::FCMEQz";
-  case ARM64ISD::FCMGEz: return "ARM64ISD::FCMGEz";
-  case ARM64ISD::FCMGTz: return "ARM64ISD::FCMGTz";
-  case ARM64ISD::FCMLEz: return "ARM64ISD::FCMLEz";
-  case ARM64ISD::FCMLTz: return "ARM64ISD::FCMLTz";
-  case ARM64ISD::NOT: return "ARM64ISD::NOT";
-  case ARM64ISD::BIT: return "ARM64ISD::BIT";
-  case ARM64ISD::CBZ: return "ARM64ISD::CBZ";
-  case ARM64ISD::CBNZ: return "ARM64ISD::CBNZ";
-  case ARM64ISD::TBZ: return "ARM64ISD::TBZ";
-  case ARM64ISD::TBNZ: return "ARM64ISD::TBNZ";
-  case ARM64ISD::TC_RETURN: return "ARM64ISD::TC_RETURN";
-  case ARM64ISD::SITOF: return "ARM64ISD::SITOF";
-  case ARM64ISD::UITOF: return "ARM64ISD::UITOF";
-  case ARM64ISD::SQSHL_I: return "ARM64ISD::SQSHL_I";
-  case ARM64ISD::UQSHL_I: return "ARM64ISD::UQSHL_I";
-  case ARM64ISD::SRSHR_I: return "ARM64ISD::SRSHR_I";
-  case ARM64ISD::URSHR_I: return "ARM64ISD::URSHR_I";
-  case ARM64ISD::SQSHLU_I: return "ARM64ISD::SQSHLU_I";
-  case ARM64ISD::WrapperLarge: return "ARM64ISD::WrapperLarge";
-  }
-}
-
-static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
-                                  unsigned &LdrOpc, unsigned &StrOpc) {
-  static unsigned LoadBares[] = { ARM64::LDXRB, ARM64::LDXRH, ARM64::LDXRW,
-                                  ARM64::LDXRX, ARM64::LDXPX };
-  static unsigned LoadAcqs[] = { ARM64::LDAXRB, ARM64::LDAXRH, ARM64::LDAXRW,
-                                 ARM64::LDAXRX, ARM64::LDAXPX };
-  static unsigned StoreBares[] = { ARM64::STXRB, ARM64::STXRH, ARM64::STXRW,
-                                   ARM64::STXRX, ARM64::STXPX };
-  static unsigned StoreRels[] = { ARM64::STLXRB, ARM64::STLXRH, ARM64::STLXRW,
-                                  ARM64::STLXRX, ARM64::STLXPX };
-
-  unsigned *LoadOps, *StoreOps;
-  if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
-    LoadOps = LoadAcqs;
-  else
-    LoadOps = LoadBares;
-
-  if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
-    StoreOps = StoreRels;
-  else
-    StoreOps = StoreBares;
-
-  assert(isPowerOf2_32(Size) && Size <= 16 &&
-         "unsupported size for atomic binary op!");
-
-  LdrOpc = LoadOps[Log2_32(Size)];
-  StrOpc = StoreOps[Log2_32(Size)];
-}
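The four opcode tables above are indexed by log2 of the access size, so sizes 1, 2, 4, 8 and 16 bytes select slots 0 through 4, with the exclusive-pair opcodes in the last slot. The same indexing in a standalone sketch (the opcode names are placeholder strings, not real ARM64:: enum values):

    #include <cassert>
    #include <cstdio>
    #include <initializer_list>

    int main() {
      const char *LoadBares[] = {"LDXRB", "LDXRH", "LDXRW", "LDXRX", "LDXPX"};
      for (unsigned Size : {1u, 2u, 4u, 8u, 16u}) {
        assert((Size & (Size - 1)) == 0 && Size <= 16);
        unsigned Slot = 0;
        for (unsigned S = Size; S >>= 1;)
          ++Slot; // equivalent of Log2_32
        std::printf("size %2u -> %s\n", Size, LoadBares[Slot]);
      }
      return 0;
    }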
-
-MachineBasicBlock *ARM64TargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
-                                                          MachineBasicBlock *BB,
-                                                          unsigned Size) const {
-  unsigned dest = MI->getOperand(0).getReg();
-  unsigned ptr = MI->getOperand(1).getReg();
-  unsigned oldval = MI->getOperand(2).getReg();
-  unsigned newval = MI->getOperand(3).getReg();
-  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
-  unsigned scratch = BB->getParent()->getRegInfo().createVirtualRegister(
-      &ARM64::GPR32RegClass);
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc dl = MI->getDebugLoc();
-
-  // FIXME: We currently always generate a seq_cst operation; we should
-  // be able to relax this in some cases.
-  unsigned ldrOpc, strOpc;
-  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
-
-  MachineFunction *MF = BB->getParent();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator It = BB;
-  ++It; // insert the new blocks after the current block
-
-  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, loop1MBB);
-  MF->insert(It, loop2MBB);
-  MF->insert(It, exitMBB);
-
-  // Transfer the remainder of BB and its successor edges to exitMBB.
-  exitMBB->splice(exitMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  // thisMBB:
-  //   ...
-  //   fallthrough --> loop1MBB
-  BB->addSuccessor(loop1MBB);
-
-  // loop1MBB:
-  //   ldrex dest, [ptr]
-  //   cmp dest, oldval
-  //   bne exitMBB
-  BB = loop1MBB;
-  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
-  BuildMI(BB, dl, TII->get(Size == 8 ? ARM64::SUBSXrr : ARM64::SUBSWrr))
-      .addReg(Size == 8 ? ARM64::XZR : ARM64::WZR, RegState::Define)
-      .addReg(dest)
-      .addReg(oldval);
-  BuildMI(BB, dl, TII->get(ARM64::Bcc)).addImm(ARM64CC::NE).addMBB(exitMBB);
-  BB->addSuccessor(loop2MBB);
-  BB->addSuccessor(exitMBB);
-
-  // loop2MBB:
-  //   strex scratch, newval, [ptr]
-  //   cmp scratch, #0
-  //   bne loop1MBB
-  BB = loop2MBB;
-  BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval).addReg(ptr);
-  BuildMI(BB, dl, TII->get(ARM64::CBNZW)).addReg(scratch).addMBB(loop1MBB);
-  BB->addSuccessor(loop1MBB);
-  BB->addSuccessor(exitMBB);
-
-  // exitMBB:
-  //   ...
-  BB = exitMBB;
-
-  MI->eraseFromParent(); // The instruction is gone now.
-
-  return BB;
-}
-
-MachineBasicBlock *
-ARM64TargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
-                                      unsigned Size, unsigned BinOpcode) const {
-  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction *MF = BB->getParent();
-  MachineFunction::iterator It = BB;
-  ++It;
-
-  unsigned dest = MI->getOperand(0).getReg();
-  unsigned ptr = MI->getOperand(1).getReg();
-  unsigned incr = MI->getOperand(2).getReg();
-  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
-  DebugLoc dl = MI->getDebugLoc();
-
-  unsigned ldrOpc, strOpc;
-  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
-
-  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, loopMBB);
-  MF->insert(It, exitMBB);
-
-  // Transfer the remainder of BB and its successor edges to exitMBB.
-  exitMBB->splice(exitMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  MachineRegisterInfo &RegInfo = MF->getRegInfo();
-  unsigned scratch = RegInfo.createVirtualRegister(&ARM64::GPR32RegClass);
-  unsigned scratch2 =
-      (!BinOpcode)
-          ? incr
-          : RegInfo.createVirtualRegister(Size == 8 ? &ARM64::GPR64RegClass
-                                                    : &ARM64::GPR32RegClass);
-
-  // thisMBB:
-  //   ...
-  //   fallthrough --> loopMBB
-  BB->addSuccessor(loopMBB);
-
-  // loopMBB:
-  //   ldxr dest, ptr
-  //   <binop> scratch2, dest, incr
-  //   stxr scratch, scratch2, ptr
-  //   cbnz scratch, loopMBB
-  //   fallthrough --> exitMBB
-  BB = loopMBB;
-  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
-  if (BinOpcode) {
-    // operand order needs to go the other way for NAND
-    if (BinOpcode == ARM64::BICWrr || BinOpcode == ARM64::BICXrr)
-      BuildMI(BB, dl, TII->get(BinOpcode), scratch2).addReg(incr).addReg(dest);
-    else
-      BuildMI(BB, dl, TII->get(BinOpcode), scratch2).addReg(dest).addReg(incr);
-  }
-
-  BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
-  BuildMI(BB, dl, TII->get(ARM64::CBNZW)).addReg(scratch).addMBB(loopMBB);
-
-  BB->addSuccessor(loopMBB);
-  BB->addSuccessor(exitMBB);
-
-  // exitMBB:
-  //   ...
-  BB = exitMBB;
-
-  MI->eraseFromParent(); // The instruction is gone now.
-
-  return BB;
-}
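At the source level, these two expansions service ordinary word-sized atomics. A minimal illustration of operations that become the exclusive-load/store loops built above:

    #include <atomic>
    #include <cstdint>

    std::atomic<uint64_t> Counter{0};

    // fetch_add -> an ldxr / add / stxr / cbnz retry loop (EmitAtomicBinary;
    // exchange is the BinOpcode==0 case, a bare swap loop).
    uint64_t bump() { return Counter.fetch_add(1); }

    // compare_exchange -> the two-block ldxr / cmp / stxr structure of
    // EmitAtomicCmpSwap, with an early exit when the compare fails.
    bool try_set(uint64_t Expected, uint64_t Desired) {
      return Counter.compare_exchange_strong(Expected, Desired);
    }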
-
-MachineBasicBlock *ARM64TargetLowering::EmitAtomicBinary128(
-    MachineInstr *MI, MachineBasicBlock *BB, unsigned BinOpcodeLo,
-    unsigned BinOpcodeHi) const {
-  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction *MF = BB->getParent();
-  MachineFunction::iterator It = BB;
-  ++It;
-
-  unsigned DestLo = MI->getOperand(0).getReg();
-  unsigned DestHi = MI->getOperand(1).getReg();
-  unsigned Ptr = MI->getOperand(2).getReg();
-  unsigned IncrLo = MI->getOperand(3).getReg();
-  unsigned IncrHi = MI->getOperand(4).getReg();
-  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(5).getImm());
-  DebugLoc DL = MI->getDebugLoc();
-
-  unsigned LdrOpc, StrOpc;
-  getExclusiveOperation(16, Ord, LdrOpc, StrOpc);
-
-  MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, LoopMBB);
-  MF->insert(It, ExitMBB);
-
-  // Transfer the remainder of BB and its successor edges to exitMBB.
-  ExitMBB->splice(ExitMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  ExitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  MachineRegisterInfo &RegInfo = MF->getRegInfo();
-  unsigned ScratchRes = RegInfo.createVirtualRegister(&ARM64::GPR32RegClass);
-  unsigned ScratchLo = IncrLo, ScratchHi = IncrHi;
-  if (BinOpcodeLo) {
-    assert(BinOpcodeHi && "Expect neither or both opcodes to be defined");
-    ScratchLo = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
-    ScratchHi = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
-  }
-
-  // ThisMBB:
-  //   ...
-  //   fallthrough --> LoopMBB
-  BB->addSuccessor(LoopMBB);
-
-  // LoopMBB:
-  //   ldxp DestLo, DestHi, Ptr
-  //   <binoplo> ScratchLo, DestLo, IncrLo
-  //   <binophi> ScratchHi, DestHi, IncrHi
-  //   stxp ScratchRes, ScratchLo, ScratchHi, ptr
-  //   cbnz ScratchRes, LoopMBB
-  //   fallthrough --> ExitMBB
-  BB = LoopMBB;
-  BuildMI(BB, DL, TII->get(LdrOpc), DestLo)
-      .addReg(DestHi, RegState::Define)
-      .addReg(Ptr);
-  if (BinOpcodeLo) {
-    // operand order needs to go the other way for NAND
-    if (BinOpcodeLo == ARM64::BICXrr) {
-      std::swap(IncrLo, DestLo);
-      std::swap(IncrHi, DestHi);
-    }
-
-    BuildMI(BB, DL, TII->get(BinOpcodeLo), ScratchLo).addReg(DestLo).addReg(
-        IncrLo);
-    BuildMI(BB, DL, TII->get(BinOpcodeHi), ScratchHi).addReg(DestHi).addReg(
-        IncrHi);
-  }
-
-  BuildMI(BB, DL, TII->get(StrOpc), ScratchRes)
-      .addReg(ScratchLo)
-      .addReg(ScratchHi)
-      .addReg(Ptr);
-  BuildMI(BB, DL, TII->get(ARM64::CBNZW)).addReg(ScratchRes).addMBB(LoopMBB);
-
-  BB->addSuccessor(LoopMBB);
-  BB->addSuccessor(ExitMBB);
-
-  // ExitMBB:
-  //   ...
-  BB = ExitMBB;
-
-  MI->eraseFromParent(); // The instruction is gone now.
-
-  return BB;
-}
-
-MachineBasicBlock *
-ARM64TargetLowering::EmitAtomicCmpSwap128(MachineInstr *MI,
-                                          MachineBasicBlock *BB) const {
-  unsigned DestLo = MI->getOperand(0).getReg();
-  unsigned DestHi = MI->getOperand(1).getReg();
-  unsigned Ptr = MI->getOperand(2).getReg();
-  unsigned OldValLo = MI->getOperand(3).getReg();
-  unsigned OldValHi = MI->getOperand(4).getReg();
-  unsigned NewValLo = MI->getOperand(5).getReg();
-  unsigned NewValHi = MI->getOperand(6).getReg();
-  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(7).getImm());
-  unsigned ScratchRes = BB->getParent()->getRegInfo().createVirtualRegister(
-      &ARM64::GPR32RegClass);
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc DL = MI->getDebugLoc();
-
-  unsigned LdrOpc, StrOpc;
-  getExclusiveOperation(16, Ord, LdrOpc, StrOpc);
-
-  MachineFunction *MF = BB->getParent();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator It = BB;
-  ++It; // insert the new blocks after the current block
-
-  MachineBasicBlock *Loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *Loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, Loop1MBB);
-  MF->insert(It, Loop2MBB);
-  MF->insert(It, ExitMBB);
-
-  // Transfer the remainder of BB and its successor edges to exitMBB.
-  ExitMBB->splice(ExitMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  ExitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  // ThisMBB:
-  //   ...
-  //   fallthrough --> Loop1MBB
-  BB->addSuccessor(Loop1MBB);
-
-  // Loop1MBB:
-  //   ldxp DestLo, DestHi, [Ptr]
-  //   cmp DestLo, OldValLo
-  //   sbc xzr, DestHi, OldValHi
-  //   bne ExitMBB
-  BB = Loop1MBB;
-  BuildMI(BB, DL, TII->get(LdrOpc), DestLo)
-      .addReg(DestHi, RegState::Define)
-      .addReg(Ptr);
-  BuildMI(BB, DL, TII->get(ARM64::SUBSXrr), ARM64::XZR).addReg(DestLo).addReg(
-      OldValLo);
-  BuildMI(BB, DL, TII->get(ARM64::SBCXr), ARM64::XZR).addReg(DestHi).addReg(
-      OldValHi);
-
-  BuildMI(BB, DL, TII->get(ARM64::Bcc)).addImm(ARM64CC::NE).addMBB(ExitMBB);
-  BB->addSuccessor(Loop2MBB);
-  BB->addSuccessor(ExitMBB);
-
-  // Loop2MBB:
-  //   stxp ScratchRes, NewValLo, NewValHi, [Ptr]
-  //   cbnz ScratchRes, Loop1MBB
-  BB = Loop2MBB;
-  BuildMI(BB, DL, TII->get(StrOpc), ScratchRes)
-      .addReg(NewValLo)
-      .addReg(NewValHi)
-      .addReg(Ptr);
-  BuildMI(BB, DL, TII->get(ARM64::CBNZW)).addReg(ScratchRes).addMBB(Loop1MBB);
-  BB->addSuccessor(Loop1MBB);
-  BB->addSuccessor(ExitMBB);
-
-  // ExitMBB:
-  //   ...
-  BB = ExitMBB;
-
-  MI->eraseFromParent(); // The instruction is gone now.
-
-  return BB;
-}
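The 16-byte variant in source form, again assuming a toolchain where std::atomic<__int128> is lock-free:

    #include <atomic>

    std::atomic<__int128> Wide;

    // Becomes the ldxp / cmp+sbc / stxp sequence built by
    // EmitAtomicCmpSwap128, comparing both halves before the store.
    bool try_set_wide(__int128 Expected, __int128 Desired) {
      return Wide.compare_exchange_strong(Expected, Desired);
    }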
-
-MachineBasicBlock *ARM64TargetLowering::EmitAtomicMinMax128(
-    MachineInstr *MI, MachineBasicBlock *BB, unsigned CondCode) const {
-  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction *MF = BB->getParent();
-  MachineFunction::iterator It = BB;
-  ++It;
-
-  unsigned DestLo = MI->getOperand(0).getReg();
-  unsigned DestHi = MI->getOperand(1).getReg();
-  unsigned Ptr = MI->getOperand(2).getReg();
-  unsigned IncrLo = MI->getOperand(3).getReg();
-  unsigned IncrHi = MI->getOperand(4).getReg();
-  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(5).getImm());
-  DebugLoc DL = MI->getDebugLoc();
-
-  unsigned LdrOpc, StrOpc;
-  getExclusiveOperation(16, Ord, LdrOpc, StrOpc);
-
-  MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, LoopMBB);
-  MF->insert(It, ExitMBB);
-
-  // Transfer the remainder of BB and its successor edges to exitMBB.
-  ExitMBB->splice(ExitMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  ExitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  MachineRegisterInfo &RegInfo = MF->getRegInfo();
-  unsigned ScratchRes = RegInfo.createVirtualRegister(&ARM64::GPR32RegClass);
-  unsigned ScratchLo = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
-  unsigned ScratchHi = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
-
-  // ThisMBB:
-  //   ...
-  //   fallthrough --> LoopMBB
-  BB->addSuccessor(LoopMBB);
-
-  // LoopMBB:
-  //   ldxp DestLo, DestHi, Ptr
-  //   cmp DestLo, IncrLo
-  //   sbc xzr, DestHi, IncrHi
-  //   csel ScratchLo, DestLo, IncrLo, <cond>
-  //   csel ScratchHi, DestHi, IncrHi, <cond>
-  //   stxp ScratchRes, ScratchLo, ScratchHi, ptr
-  //   cbnz ScratchRes, LoopMBB
-  //   fallthrough --> ExitMBB
-  BB = LoopMBB;
-  BuildMI(BB, DL, TII->get(LdrOpc), DestLo)
-      .addReg(DestHi, RegState::Define)
-      .addReg(Ptr);
-
-  BuildMI(BB, DL, TII->get(ARM64::SUBSXrr), ARM64::XZR).addReg(DestLo).addReg(
-      IncrLo);
-  BuildMI(BB, DL, TII->get(ARM64::SBCXr), ARM64::XZR).addReg(DestHi).addReg(
-      IncrHi);
-
-  BuildMI(BB, DL, TII->get(ARM64::CSELXr), ScratchLo)
-      .addReg(DestLo)
-      .addReg(IncrLo)
-      .addImm(CondCode);
-  BuildMI(BB, DL, TII->get(ARM64::CSELXr), ScratchHi)
-      .addReg(DestHi)
-      .addReg(IncrHi)
-      .addImm(CondCode);
-
-  BuildMI(BB, DL, TII->get(StrOpc), ScratchRes)
-      .addReg(ScratchLo)
-      .addReg(ScratchHi)
-      .addReg(Ptr);
-  BuildMI(BB, DL, TII->get(ARM64::CBNZW)).addReg(ScratchRes).addMBB(LoopMBB);
-
-  BB->addSuccessor(LoopMBB);
-  BB->addSuccessor(ExitMBB);
-
-  // ExitMBB:
-  //   ...
-  BB = ExitMBB;
-
-  MI->eraseFromParent(); // The instruction is gone now.
-
-  return BB;
-}
-
-MachineBasicBlock *
-ARM64TargetLowering::EmitF128CSEL(MachineInstr *MI,
-                                  MachineBasicBlock *MBB) const {
-  // We materialise the F128CSEL pseudo-instruction as some control flow and a
-  // phi node:
-
-  // OrigBB:
-  //     [... previous instrs leading to comparison ...]
-  //     b.ne TrueBB
-  //     b EndBB
-  // TrueBB:
-  //     ; Fallthrough
-  // EndBB:
-  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
-
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  MachineFunction *MF = MBB->getParent();
-  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
-  DebugLoc DL = MI->getDebugLoc();
-  MachineFunction::iterator It = MBB;
-  ++It;
-
-  unsigned DestReg = MI->getOperand(0).getReg();
-  unsigned IfTrueReg = MI->getOperand(1).getReg();
-  unsigned IfFalseReg = MI->getOperand(2).getReg();
-  unsigned CondCode = MI->getOperand(3).getImm();
-  bool CPSRKilled = MI->getOperand(4).isKill();
-
-  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, TrueBB);
-  MF->insert(It, EndBB);
-
-  // Transfer rest of current basic-block to EndBB
-  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
-                MBB->end());
-  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
-
-  BuildMI(MBB, DL, TII->get(ARM64::Bcc)).addImm(CondCode).addMBB(TrueBB);
-  BuildMI(MBB, DL, TII->get(ARM64::B)).addMBB(EndBB);
-  MBB->addSuccessor(TrueBB);
-  MBB->addSuccessor(EndBB);
-
-  // TrueBB falls through to the end.
-  TrueBB->addSuccessor(EndBB);
-
-  if (!CPSRKilled) {
-    TrueBB->addLiveIn(ARM64::CPSR);
-    EndBB->addLiveIn(ARM64::CPSR);
-  }
-
-  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(ARM64::PHI), DestReg)
-      .addReg(IfTrueReg)
-      .addMBB(TrueBB)
-      .addReg(IfFalseReg)
-      .addMBB(MBB);
-
-  MI->eraseFromParent();
-  return EndBB;
-}
-
-MachineBasicBlock *
-ARM64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
-                                                 MachineBasicBlock *BB) const {
-  switch (MI->getOpcode()) {
-  default:
-#ifndef NDEBUG
-    MI->dump();
-#endif
-    assert(0 && "Unexpected instruction for custom inserter!");
-    break;
-
-  case ARM64::ATOMIC_LOAD_ADD_I8:
-    return EmitAtomicBinary(MI, BB, 1, ARM64::ADDWrr);
-  case ARM64::ATOMIC_LOAD_ADD_I16:
-    return EmitAtomicBinary(MI, BB, 2, ARM64::ADDWrr);
-  case ARM64::ATOMIC_LOAD_ADD_I32:
-    return EmitAtomicBinary(MI, BB, 4, ARM64::ADDWrr);
-  case ARM64::ATOMIC_LOAD_ADD_I64:
-    return EmitAtomicBinary(MI, BB, 8, ARM64::ADDXrr);
-  case ARM64::ATOMIC_LOAD_ADD_I128:
-    return EmitAtomicBinary128(MI, BB, ARM64::ADDSXrr, ARM64::ADCXr);
-
-  case ARM64::ATOMIC_LOAD_AND_I8:
-    return EmitAtomicBinary(MI, BB, 1, ARM64::ANDWrr);
-  case ARM64::ATOMIC_LOAD_AND_I16:
-    return EmitAtomicBinary(MI, BB, 2, ARM64::ANDWrr);
-  case ARM64::ATOMIC_LOAD_AND_I32:
-    return EmitAtomicBinary(MI, BB, 4, ARM64::ANDWrr);
-  case ARM64::ATOMIC_LOAD_AND_I64:
-    return EmitAtomicBinary(MI, BB, 8, ARM64::ANDXrr);
-  case ARM64::ATOMIC_LOAD_AND_I128:
-    return EmitAtomicBinary128(MI, BB, ARM64::ANDXrr, ARM64::ANDXrr);
-
-  case ARM64::ATOMIC_LOAD_OR_I8:
-    return EmitAtomicBinary(MI, BB, 1, ARM64::ORRWrr);
-  case ARM64::ATOMIC_LOAD_OR_I16:
-    return EmitAtomicBinary(MI, BB, 2, ARM64::ORRWrr);
-  case ARM64::ATOMIC_LOAD_OR_I32:
-    return EmitAtomicBinary(MI, BB, 4, ARM64::ORRWrr);
-  case ARM64::ATOMIC_LOAD_OR_I64:
-    return EmitAtomicBinary(MI, BB, 8, ARM64::ORRXrr);
-  case ARM64::ATOMIC_LOAD_OR_I128:
-    return EmitAtomicBinary128(MI, BB, ARM64::ORRXrr, ARM64::ORRXrr);
-
-  case ARM64::ATOMIC_LOAD_XOR_I8:
-    return EmitAtomicBinary(MI, BB, 1, ARM64::EORWrr);
-  case ARM64::ATOMIC_LOAD_XOR_I16:
-    return EmitAtomicBinary(MI, BB, 2, ARM64::EORWrr);
-  case ARM64::ATOMIC_LOAD_XOR_I32:
-    return EmitAtomicBinary(MI, BB, 4, ARM64::EORWrr);
-  case ARM64::ATOMIC_LOAD_XOR_I64:
-    return EmitAtomicBinary(MI, BB, 8, ARM64::EORXrr);
-  case ARM64::ATOMIC_LOAD_XOR_I128:
-    return EmitAtomicBinary128(MI, BB, ARM64::EORXrr, ARM64::EORXrr);
-
-  case ARM64::ATOMIC_LOAD_NAND_I8:
-    return EmitAtomicBinary(MI, BB, 1, ARM64::BICWrr);
-  case ARM64::ATOMIC_LOAD_NAND_I16:
-    return EmitAtomicBinary(MI, BB, 2, ARM64::BICWrr);
-  case ARM64::ATOMIC_LOAD_NAND_I32:
-    return EmitAtomicBinary(MI, BB, 4, ARM64::BICWrr);
-  case ARM64::ATOMIC_LOAD_NAND_I64:
-    return EmitAtomicBinary(MI, BB, 8, ARM64::BICXrr);
-  case ARM64::ATOMIC_LOAD_NAND_I128:
-    return EmitAtomicBinary128(MI, BB, ARM64::BICXrr, ARM64::BICXrr);
-
-  case ARM64::ATOMIC_LOAD_SUB_I8:
-    return EmitAtomicBinary(MI, BB, 1, ARM64::SUBWrr);
-  case ARM64::ATOMIC_LOAD_SUB_I16:
-    return EmitAtomicBinary(MI, BB, 2, ARM64::SUBWrr);
-  case ARM64::ATOMIC_LOAD_SUB_I32:
-    return EmitAtomicBinary(MI, BB, 4, ARM64::SUBWrr);
-  case ARM64::ATOMIC_LOAD_SUB_I64:
-    return EmitAtomicBinary(MI, BB, 8, ARM64::SUBXrr);
-  case ARM64::ATOMIC_LOAD_SUB_I128:
-    return EmitAtomicBinary128(MI, BB, ARM64::SUBSXrr, ARM64::SBCXr);
-
-  case ARM64::ATOMIC_LOAD_MIN_I128:
-    return EmitAtomicMinMax128(MI, BB, ARM64CC::LT);
-
-  case ARM64::ATOMIC_LOAD_MAX_I128:
-    return EmitAtomicMinMax128(MI, BB, ARM64CC::GT);
-
-  case ARM64::ATOMIC_LOAD_UMIN_I128:
-    return EmitAtomicMinMax128(MI, BB, ARM64CC::CC);
-
-  case ARM64::ATOMIC_LOAD_UMAX_I128:
-    return EmitAtomicMinMax128(MI, BB, ARM64CC::HI);
-
-  case ARM64::ATOMIC_SWAP_I8:
-    return EmitAtomicBinary(MI, BB, 1, 0);
-  case ARM64::ATOMIC_SWAP_I16:
-    return EmitAtomicBinary(MI, BB, 2, 0);
-  case ARM64::ATOMIC_SWAP_I32:
-    return EmitAtomicBinary(MI, BB, 4, 0);
-  case ARM64::ATOMIC_SWAP_I64:
-    return EmitAtomicBinary(MI, BB, 8, 0);
-  case ARM64::ATOMIC_SWAP_I128:
-    return EmitAtomicBinary128(MI, BB, 0, 0);
-
-  case ARM64::ATOMIC_CMP_SWAP_I8:
-    return EmitAtomicCmpSwap(MI, BB, 1);
-  case ARM64::ATOMIC_CMP_SWAP_I16:
-    return EmitAtomicCmpSwap(MI, BB, 2);
-  case ARM64::ATOMIC_CMP_SWAP_I32:
-    return EmitAtomicCmpSwap(MI, BB, 4);
-  case ARM64::ATOMIC_CMP_SWAP_I64:
-    return EmitAtomicCmpSwap(MI, BB, 8);
-  case ARM64::ATOMIC_CMP_SWAP_I128:
-    return EmitAtomicCmpSwap128(MI, BB);
-
-  case ARM64::F128CSEL:
-    return EmitF128CSEL(MI, BB);
-
-  case TargetOpcode::STACKMAP:
-  case TargetOpcode::PATCHPOINT:
-    return emitPatchPoint(MI, BB);
-  }
-  llvm_unreachable("Unexpected instruction for custom inserter!");
-}
-
-//===----------------------------------------------------------------------===//
-// ARM64 Lowering private implementation.
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Lowering Code
-//===----------------------------------------------------------------------===//
-
-/// changeIntCCToARM64CC - Convert a DAG integer condition code to an ARM64 CC.
-static ARM64CC::CondCode changeIntCCToARM64CC(ISD::CondCode CC) {
-  switch (CC) {
-  default:
-    llvm_unreachable("Unknown condition code!");
-  case ISD::SETNE:
-    return ARM64CC::NE;
-  case ISD::SETEQ:
-    return ARM64CC::EQ;
-  case ISD::SETGT:
-    return ARM64CC::GT;
-  case ISD::SETGE:
-    return ARM64CC::GE;
-  case ISD::SETLT:
-    return ARM64CC::LT;
-  case ISD::SETLE:
-    return ARM64CC::LE;
-  case ISD::SETUGT:
-    return ARM64CC::HI;
-  case ISD::SETUGE:
-    return ARM64CC::CS;
-  case ISD::SETULT:
-    return ARM64CC::CC;
-  case ISD::SETULE:
-    return ARM64CC::LS;
-  }
-}
-
-/// changeFPCCToARM64CC - Convert a DAG fp condition code to an ARM64 CC.
-static void changeFPCCToARM64CC(ISD::CondCode CC, ARM64CC::CondCode &CondCode,
-                                ARM64CC::CondCode &CondCode2) {
-  CondCode2 = ARM64CC::AL;
-  switch (CC) {
-  default:
-    llvm_unreachable("Unknown FP condition!");
-  case ISD::SETEQ:
-  case ISD::SETOEQ:
-    CondCode = ARM64CC::EQ;
-    break;
-  case ISD::SETGT:
-  case ISD::SETOGT:
-    CondCode = ARM64CC::GT;
-    break;
-  case ISD::SETGE:
-  case ISD::SETOGE:
-    CondCode = ARM64CC::GE;
-    break;
-  case ISD::SETOLT:
-    CondCode = ARM64CC::MI;
-    break;
-  case ISD::SETOLE:
-    CondCode = ARM64CC::LS;
-    break;
-  case ISD::SETONE:
-    CondCode = ARM64CC::MI;
-    CondCode2 = ARM64CC::GT;
-    break;
-  case ISD::SETO:
-    CondCode = ARM64CC::VC;
-    break;
-  case ISD::SETUO:
-    CondCode = ARM64CC::VS;
-    break;
-  case ISD::SETUEQ:
-    CondCode = ARM64CC::EQ;
-    CondCode2 = ARM64CC::VS;
-    break;
-  case ISD::SETUGT:
-    CondCode = ARM64CC::HI;
-    break;
-  case ISD::SETUGE:
-    CondCode = ARM64CC::PL;
-    break;
-  case ISD::SETLT:
-  case ISD::SETULT:
-    CondCode = ARM64CC::LT;
-    break;
-  case ISD::SETLE:
-  case ISD::SETULE:
-    CondCode = ARM64CC::LE;
-    break;
-  case ISD::SETNE:
-  case ISD::SETUNE:
-    CondCode = ARM64CC::NE;
-    break;
-  }
-}
-
-static bool isLegalArithImmed(uint64_t C) {
-  // Matches ARM64DAGToDAGISel::SelectArithImmed().
-  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
-}
-
-static SDValue emitComparison(SDValue LHS, SDValue RHS, SDLoc dl,
-                              SelectionDAG &DAG) {
-  EVT VT = LHS.getValueType();
-
-  if (VT.isFloatingPoint())
-    return DAG.getNode(ARM64ISD::FCMP, dl, VT, LHS, RHS);
-
-  // The CMP instruction is just an alias for SUBS, and representing it as
-  // SUBS means that it's possible to get CSE with subtract operations.
-  // A later phase can perform the optimization of setting the destination
-  // register to WZR/XZR if it ends up being unused.
-  return DAG.getNode(ARM64ISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
-      .getValue(1);
-}
-
-static SDValue getARM64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
-                           SDValue &ARM64cc, SelectionDAG &DAG, SDLoc dl) {
-  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
-    EVT VT = RHS.getValueType();
-    uint64_t C = RHSC->getZExtValue();
-    if (!isLegalArithImmed(C)) {
-      // Constant does not fit, try adjusting it by one?
-      switch (CC) {
-      default:
-        break;
-      case ISD::SETLT:
-      case ISD::SETGE:
-        if ((VT == MVT::i32 && C != 0x80000000 &&
-             isLegalArithImmed((uint32_t)(C - 1))) ||
-            (VT == MVT::i64 && C != 0x80000000ULL &&
-             isLegalArithImmed(C - 1ULL))) {
-          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
-          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
-          RHS = DAG.getConstant(C, VT);
-        }
-        break;
-      case ISD::SETULT:
-      case ISD::SETUGE:
-        if ((VT == MVT::i32 && C != 0 &&
-             isLegalArithImmed((uint32_t)(C - 1))) ||
-            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
-          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
-          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
-          RHS = DAG.getConstant(C, VT);
-        }
-        break;
-      case ISD::SETLE:
-      case ISD::SETGT:
-        if ((VT == MVT::i32 && C != 0x7fffffff &&
-             isLegalArithImmed((uint32_t)(C + 1))) ||
-            (VT == MVT::i64 && C != 0x7ffffffffffffffULL &&
-             isLegalArithImmed(C + 1ULL))) {
-          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
-          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
-          RHS = DAG.getConstant(C, VT);
-        }
-        break;
-      case ISD::SETULE:
-      case ISD::SETUGT:
-        if ((VT == MVT::i32 && C != 0xffffffff &&
-             isLegalArithImmed((uint32_t)(C + 1))) ||
-            (VT == MVT::i64 && C != 0xfffffffffffffffULL &&
-             isLegalArithImmed(C + 1ULL))) {
-          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
-          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
-          RHS = DAG.getConstant(C, VT);
-        }
-        break;
-      }
-    }
-  }
-
-  SDValue Cmp = emitComparison(LHS, RHS, dl, DAG);
-  ARM64CC::CondCode ARM64CC = changeIntCCToARM64CC(CC);
-  ARM64cc = DAG.getConstant(ARM64CC, MVT::i32);
-  return Cmp;
-}
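The adjust-by-one rewrites above rely on the fact that, for example, x < C and x <= C-1 select the same values whenever C-1 does not wrap. A standalone sketch of the legality test and one such rewrite (the helper name is illustrative):

    #include <cassert>
    #include <cstdint>

    // Mirrors isLegalArithImmed: a 12-bit immediate, optionally shifted by 12.
    static bool legalArithImmed(uint64_t C) {
      return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
    }

    int main() {
      uint64_t C = 0x1001; // not encodable as an ALU immediate
      assert(!legalArithImmed(C) && legalArithImmed(C - 1));
      // So "x < 0x1001" can be rewritten as "x <= 0x1000" (SETLT -> SETLE),
      // and the adjusted constant fits in a single cmp instruction.
      uint64_t X = 0x800;
      assert((X < C) == (X <= C - 1));
      return 0;
    }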
-
-static std::pair<SDValue, SDValue>
-getARM64XALUOOp(ARM64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
-  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
-         "Unsupported value type");
-  SDValue Value, Overflow;
-  SDLoc DL(Op);
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  unsigned Opc = 0;
-  switch (Op.getOpcode()) {
-  default:
-    llvm_unreachable("Unknown overflow instruction!");
-  case ISD::SADDO:
-    Opc = ARM64ISD::ADDS;
-    CC = ARM64CC::VS;
-    break;
-  case ISD::UADDO:
-    Opc = ARM64ISD::ADDS;
-    CC = ARM64CC::CS;
-    break;
-  case ISD::SSUBO:
-    Opc = ARM64ISD::SUBS;
-    CC = ARM64CC::VS;
-    break;
-  case ISD::USUBO:
-    Opc = ARM64ISD::SUBS;
-    CC = ARM64CC::CC;
-    break;
-  // Multiply needs a little bit extra work.
-  case ISD::SMULO:
-  case ISD::UMULO: {
-    CC = ARM64CC::NE;
-    bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false;
-    if (Op.getValueType() == MVT::i32) {
-      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
-      // For a 32 bit multiply with overflow check we want the instruction
-      // selector to generate a widening multiply (SMADDL/UMADDL). For that we
-      // need to generate the following pattern:
-      // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
-      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
-      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
-      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
-      SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
-                                DAG.getConstant(0, MVT::i64));
-      // On ARM64 the upper 32 bits are always zero extended for a 32 bit
-      // operation. We need to clear out the upper 32 bits, because we used a
-      // widening multiply that wrote all 64 bits. In the end this should be a
-      // noop.
-      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
-      if (IsSigned) {
-        // The signed overflow check requires more than just a simple check for
-        // any bit set in the upper 32 bits of the result. These bits could be
-        // just the sign bits of a negative number. To perform the overflow
-        // check we have to arithmetic shift right the 32nd bit of the result by
-        // 31 bits. Then we compare the result to the upper 32 bits.
-        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
-                                        DAG.getConstant(32, MVT::i64));
-        UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
-        SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
-                                        DAG.getConstant(31, MVT::i64));
-        // It is important that LowerBits is last, otherwise the arithmetic
-        // shift will not be folded into the compare (SUBS).
-        SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
-        Overflow = DAG.getNode(ARM64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
-                       .getValue(1);
-      } else {
-        // The overflow check for unsigned multiply is easy. We only need to
-        // check if any of the upper 32 bits are set. This can be done with a
-        // CMP (shifted register). For that we need to generate the following
-        // pattern:
-        // (i64 ARM64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
-        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
-                                        DAG.getConstant(32, MVT::i64));
-        SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
-        Overflow =
-            DAG.getNode(ARM64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
-                        UpperBits).getValue(1);
-      }
-      break;
-    }
-    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
-    // For the 64 bit multiply
-    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
-    if (IsSigned) {
-      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
-      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
-                                      DAG.getConstant(63, MVT::i64));
-      // It is important that LowerBits is last, otherwise the arithmetic
-      // shift will not be folded into the compare (SUBS).
-      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
-      Overflow = DAG.getNode(ARM64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
-                     .getValue(1);
-    } else {
-      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
-      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
-      Overflow =
-          DAG.getNode(ARM64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
-                      UpperBits).getValue(1);
-    }
-    break;
-  }
-  } // switch (...)
-
-  if (Opc) {
-    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
-
-    // Emit the ARM64 operation with overflow check.
-    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
-    Overflow = Value.getValue(1);
-  }
-  return std::make_pair(Value, Overflow);
-}
-
-SDValue ARM64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
-                                           RTLIB::Libcall Call) const {
-  SmallVector<SDValue, 2> Ops;
-  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
-    Ops.push_back(Op.getOperand(i));
-
-  return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
-                     SDLoc(Op)).first;
-}
-
-static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
-  SDValue Sel = Op.getOperand(0);
-  SDValue Other = Op.getOperand(1);
-
-  // If neither operand is a SELECT_CC, give up.
-  if (Sel.getOpcode() != ISD::SELECT_CC)
-    std::swap(Sel, Other);
-  if (Sel.getOpcode() != ISD::SELECT_CC)
-    return Op;
-
-  // The folding we want to perform is:
-  // (xor x, (select_cc a, b, cc, 0, -1) )
-  //   -->
-  // (csel x, (xor x, -1), cc ...)
-  //
-  // The latter will get matched to a CSINV instruction.
-
-  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
-  SDValue LHS = Sel.getOperand(0);
-  SDValue RHS = Sel.getOperand(1);
-  SDValue TVal = Sel.getOperand(2);
-  SDValue FVal = Sel.getOperand(3);
-  SDLoc dl(Sel);
-
-  // FIXME: This could be generalized to non-integer comparisons.
-  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
-    return Op;
-
-  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
-  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
-
-  // If the values aren't constants, this isn't the pattern we're looking for.
-  if (!CFVal || !CTVal)
-    return Op;
-
-  // We can commute the SELECT_CC by inverting the condition. This
-  // might be needed to make this fit into a CSINV pattern.
-  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
-    std::swap(TVal, FVal);
-    std::swap(CTVal, CFVal);
-    CC = ISD::getSetCCInverse(CC, true);
-  }
-
-  // If the constants line up, perform the transform!
-  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
-    SDValue CCVal;
-    SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
-
-    FVal = Other;
-    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
-                       DAG.getConstant(-1ULL, Other.getValueType()));
-
-    return DAG.getNode(ARM64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
-                       CCVal, Cmp);
-  }
-
-  return Op;
-}
-
-static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
-  EVT VT = Op.getValueType();
-
-  // Let legalize expand this if it isn't a legal type yet.
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
-    return SDValue();
-
-  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
-
-  unsigned Opc;
-  bool ExtraOp = false;
-  switch (Op.getOpcode()) {
-  default:
-    assert(0 && "Invalid code");
-  case ISD::ADDC:
-    Opc = ARM64ISD::ADDS;
-    break;
-  case ISD::SUBC:
-    Opc = ARM64ISD::SUBS;
-    break;
-  case ISD::ADDE:
-    Opc = ARM64ISD::ADCS;
-    ExtraOp = true;
-    break;
-  case ISD::SUBE:
-    Opc = ARM64ISD::SBCS;
-    ExtraOp = true;
-    break;
-  }
-
-  if (!ExtraOp)
-    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
-  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
-                     Op.getOperand(2));
-}
-
-static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
-  // Let legalize expand this if it isn't a legal type yet.
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
-    return SDValue();
-
-  ARM64CC::CondCode CC;
-  // The actual operation that sets the overflow or carry flag.
-  SDValue Value, Overflow;
-  std::tie(Value, Overflow) = getARM64XALUOOp(CC, Op, DAG);
-
-  // We use 0 and 1 as false and true values.
-  SDValue TVal = DAG.getConstant(1, MVT::i32);
-  SDValue FVal = DAG.getConstant(0, MVT::i32);
-
-  // We use an inverted condition, because the conditional select is inverted
-  // too. This will allow it to be selected to a single instruction:
-  // CSINC Wd, WZR, WZR, invert(cond).
-  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32);
-  Overflow = DAG.getNode(ARM64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal, CCVal,
-                         Overflow);
-
-  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
-  return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
-}
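LowerXALUO and getARM64XALUOOp are what ultimately serve overflow-checked arithmetic at the source level. A sketch, assuming a compiler that provides the __builtin_*_overflow family (newer Clang/GCC):

    #include <cstdint>

    // Each call maps to an ISD::UADDO / ISD::SMULO style node whose overflow
    // flag is materialised as 0/1 via the inverted CSEL described above.
    bool checked_add(uint64_t A, uint64_t B, uint64_t &Out) {
      return __builtin_add_overflow(A, B, &Out); // UADDO
    }
    bool checked_mul(int32_t A, int32_t B, int32_t &Out) {
      return __builtin_mul_overflow(A, B, &Out); // SMULO, widening multiply
    }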
-
-// Prefetch operands are:
-// 1: Address to prefetch
-// 2: bool isWrite
-// 3: int locality (0 = no locality ... 3 = extreme locality)
-// 4: bool isDataCache
-static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
-  SDLoc DL(Op);
-  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
-  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
-  // The data thing is not used.
-  // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
-
-  bool IsStream = !Locality;
-  // When the locality number is set
-  if (Locality) {
-    // The front-end should have filtered out the out-of-range values
-    assert(Locality <= 3 && "Prefetch locality out-of-range");
-    // The locality degree is the opposite of the cache speed.
-    // Put the number the other way around.
-    // The encoding starts at 0 for level 1
-    Locality = 3 - Locality;
-  }
-
-  // Build the mask value encoding the expected behavior.
-  unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
-                   (Locality << 1) |   // Cache level bits
-                   (unsigned)IsStream; // Stream bit
-  return DAG.getNode(ARM64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
-                     DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1));
-}
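The PrfOp value packs the three prefetch properties into a single hint immediate: bit 4 selects load vs store, the next bits the cache level, bit 0 streaming. A quick standalone check of the arithmetic (the helper name is illustrative; values mirror the comments above):

    #include <cassert>

    static unsigned encodePrfOp(bool IsWrite, unsigned Locality) {
      bool IsStream = (Locality == 0);              // locality 0 -> streaming
      unsigned Level = Locality ? 3 - Locality : 0; // invert: 3 -> L1 (0)
      return (IsWrite << 4) | (Level << 1) | (unsigned)IsStream;
    }

    int main() {
      assert(encodePrfOp(false, 3) == 0x00); // read, L1, keep
      assert(encodePrfOp(true, 1) == 0x14);  // write, L3, keep
      assert(encodePrfOp(false, 0) == 0x01); // read, streaming
      return 0;
    }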
-
-SDValue ARM64TargetLowering::LowerFP_EXTEND(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
-
-  RTLIB::Libcall LC;
-  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
-
-  return LowerF128Call(Op, DAG, LC);
-}
-
-SDValue ARM64TargetLowering::LowerFP_ROUND(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  if (Op.getOperand(0).getValueType() != MVT::f128) {
-    // It's legal except when f128 is involved
-    return Op;
-  }
-
-  RTLIB::Libcall LC;
-  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
-
-  // FP_ROUND node has a second operand indicating whether it is known to be
-  // precise. That doesn't take part in the LibCall so we can't directly use
-  // LowerF128Call.
-  SDValue SrcVal = Op.getOperand(0);
-  return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
-                     /*isSigned*/ false, SDLoc(Op)).first;
-}
-
-static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
-  // Warning: We maintain cost tables in ARM64TargetTransformInfo.cpp.
-  // Any additional optimization in this function should be recorded
-  // in the cost tables.
-  EVT InVT = Op.getOperand(0).getValueType();
-  EVT VT = Op.getValueType();
-
-  // FP_TO_XINT conversions from the same type are legal.
-  if (VT.getSizeInBits() == InVT.getSizeInBits())
-    return Op;
-
-  if (InVT == MVT::v2f64) {
-    SDLoc dl(Op);
-    SDValue Cv = DAG.getNode(Op.getOpcode(), dl, MVT::v2i64, Op.getOperand(0));
-    return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
-  }
-
-  // Type changing conversions are illegal.
-  return SDValue();
-}
-
-SDValue ARM64TargetLowering::LowerFP_TO_INT(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  if (Op.getOperand(0).getValueType().isVector())
-    return LowerVectorFP_TO_INT(Op, DAG);
-
-  if (Op.getOperand(0).getValueType() != MVT::f128) {
-    // It's legal except when f128 is involved
-    return Op;
-  }
-
-  RTLIB::Libcall LC;
-  if (Op.getOpcode() == ISD::FP_TO_SINT)
-    LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
-  else
-    LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
-
-  SmallVector<SDValue, 2> Ops;
-  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
-    Ops.push_back(Op.getOperand(i));
-
-  return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
-                     SDLoc(Op)).first;
-}
-
-static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
-  // Warning: We maintain cost tables in ARM64TargetTransformInfo.cpp.
-  // Any additional optimization in this function should be recorded
-  // in the cost tables.
-  EVT VT = Op.getValueType();
-  SDLoc dl(Op);
-  SDValue In = Op.getOperand(0);
-  EVT InVT = In.getValueType();
-
-  // v2i32 to v2f32 is legal.
-  if (VT == MVT::v2f32 && InVT == MVT::v2i32)
-    return Op;
-
-  // This function only handles v2f64 outputs.
-  if (VT == MVT::v2f64) {
-    // Extend the input argument to a v2i64 that we can feed into the
-    // floating point conversion. Zero or sign extend based on whether
-    // we're doing a signed or unsigned float conversion.
-    unsigned Opc =
-        Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
-    assert(Op.getNumOperands() == 1 && "FP conversions take one argument");
-    SDValue Promoted = DAG.getNode(Opc, dl, MVT::v2i64, Op.getOperand(0));
-    return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted);
-  }
-
-  // Scalarize v2i64 to v2f32 conversions.
-  std::vector<SDValue> BuildVectorOps;
-  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
-    SDValue Sclr = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, In,
-                               DAG.getConstant(i, MVT::i64));
-    Sclr = DAG.getNode(Op->getOpcode(), dl, MVT::f32, Sclr);
-    BuildVectorOps.push_back(Sclr);
-  }
-
-  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &BuildVectorOps[0],
-                     BuildVectorOps.size());
-}
-
-SDValue ARM64TargetLowering::LowerINT_TO_FP(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  if (Op.getValueType().isVector())
-    return LowerVectorINT_TO_FP(Op, DAG);
-
-  // i128 conversions are libcalls.
-  if (Op.getOperand(0).getValueType() == MVT::i128)
-    return SDValue();
-
-  // Other conversions are legal, unless it's to the completely software-based
-  // fp128.
-  if (Op.getValueType() != MVT::f128)
-    return Op;
-
-  RTLIB::Libcall LC;
-  if (Op.getOpcode() == ISD::SINT_TO_FP)
-    LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
-  else
-    LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
-
-  return LowerF128Call(Op, DAG, LC);
-}
-
-SDValue ARM64TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
-  // For iOS, we want to call an alternative entry point: __sincos_stret,
-  // which returns the values in two S / D registers.
-  SDLoc dl(Op);
-  SDValue Arg = Op.getOperand(0);
-  EVT ArgVT = Arg.getValueType();
-  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
-
-  ArgListTy Args;
-  ArgListEntry Entry;
-
-  Entry.Node = Arg;
-  Entry.Ty = ArgTy;
-  Entry.isSExt = false;
-  Entry.isZExt = false;
-  Args.push_back(Entry);
-
-  const char *LibcallName =
-      (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
"__sincos_stret" : "__sincosf_stret"; - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); - - StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); - TargetLowering::CallLoweringInfo CLI( - DAG.getEntryNode(), RetTy, false, false, false, false, 0, - CallingConv::Fast, /*isTaillCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed*/ true, Callee, Args, DAG, dl); - std::pair CallResult = LowerCallTo(CLI); - return CallResult.first; -} - -SDValue ARM64TargetLowering::LowerOperation(SDValue Op, - SelectionDAG &DAG) const { - switch (Op.getOpcode()) { - default: - llvm_unreachable("unimplemented operand"); - return SDValue(); - case ISD::GlobalAddress: - return LowerGlobalAddress(Op, DAG); - case ISD::GlobalTLSAddress: - return LowerGlobalTLSAddress(Op, DAG); - case ISD::SETCC: - return LowerSETCC(Op, DAG); - case ISD::BR_CC: - return LowerBR_CC(Op, DAG); - case ISD::SELECT: - return LowerSELECT(Op, DAG); - case ISD::SELECT_CC: - return LowerSELECT_CC(Op, DAG); - case ISD::JumpTable: - return LowerJumpTable(Op, DAG); - case ISD::ConstantPool: - return LowerConstantPool(Op, DAG); - case ISD::BlockAddress: - return LowerBlockAddress(Op, DAG); - case ISD::VASTART: - return LowerVASTART(Op, DAG); - case ISD::VACOPY: - return LowerVACOPY(Op, DAG); - case ISD::VAARG: - return LowerVAARG(Op, DAG); - case ISD::ADDC: - case ISD::ADDE: - case ISD::SUBC: - case ISD::SUBE: - return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); - case ISD::SADDO: - case ISD::UADDO: - case ISD::SSUBO: - case ISD::USUBO: - case ISD::SMULO: - case ISD::UMULO: - return LowerXALUO(Op, DAG); - case ISD::FADD: - return LowerF128Call(Op, DAG, RTLIB::ADD_F128); - case ISD::FSUB: - return LowerF128Call(Op, DAG, RTLIB::SUB_F128); - case ISD::FMUL: - return LowerF128Call(Op, DAG, RTLIB::MUL_F128); - case ISD::FDIV: - return LowerF128Call(Op, DAG, RTLIB::DIV_F128); - case ISD::FP_ROUND: - return LowerFP_ROUND(Op, DAG); - case ISD::FP_EXTEND: - return LowerFP_EXTEND(Op, DAG); - case ISD::FRAMEADDR: - return LowerFRAMEADDR(Op, DAG); - case ISD::RETURNADDR: - return LowerRETURNADDR(Op, DAG); - case ISD::INSERT_VECTOR_ELT: - return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: - return LowerEXTRACT_VECTOR_ELT(Op, DAG); - case ISD::SCALAR_TO_VECTOR: - return LowerSCALAR_TO_VECTOR(Op, DAG); - case ISD::BUILD_VECTOR: - return LowerBUILD_VECTOR(Op, DAG); - case ISD::VECTOR_SHUFFLE: - return LowerVECTOR_SHUFFLE(Op, DAG); - case ISD::EXTRACT_SUBVECTOR: - return LowerEXTRACT_SUBVECTOR(Op, DAG); - case ISD::SRA: - case ISD::SRL: - case ISD::SHL: - return LowerVectorSRA_SRL_SHL(Op, DAG); - case ISD::SHL_PARTS: - return LowerShiftLeftParts(Op, DAG); - case ISD::SRL_PARTS: - case ISD::SRA_PARTS: - return LowerShiftRightParts(Op, DAG); - case ISD::CTPOP: - return LowerCTPOP(Op, DAG); - case ISD::FCOPYSIGN: - return LowerFCOPYSIGN(Op, DAG); - case ISD::AND: - return LowerVectorAND(Op, DAG); - case ISD::OR: - return LowerVectorOR(Op, DAG); - case ISD::XOR: - return LowerXOR(Op, DAG); - case ISD::PREFETCH: - return LowerPREFETCH(Op, DAG); - case ISD::SINT_TO_FP: - case ISD::UINT_TO_FP: - return LowerINT_TO_FP(Op, DAG); - case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: - return LowerFP_TO_INT(Op, DAG); - case ISD::FSINCOS: - return LowerFSINCOS(Op, DAG); - } -} - -/// getFunctionAlignment - Return the Log2 alignment of this function. 
-unsigned ARM64TargetLowering::getFunctionAlignment(const Function *F) const {
-  return 2;
-}
-
-//===----------------------------------------------------------------------===//
-// Calling Convention Implementation
-//===----------------------------------------------------------------------===//
-
-#include "ARM64GenCallingConv.inc"
-
-/// Selects the correct CCAssignFn for the given CallingConvention
-/// value.
-CCAssignFn *ARM64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
-                                                   bool IsVarArg) const {
-  switch (CC) {
-  default:
-    llvm_unreachable("Unsupported calling convention.");
-  case CallingConv::WebKit_JS:
-    return CC_ARM64_WebKit_JS;
-  case CallingConv::C:
-  case CallingConv::Fast:
-    if (!Subtarget->isTargetDarwin())
-      return CC_ARM64_AAPCS;
-    return IsVarArg ? CC_ARM64_DarwinPCS_VarArg : CC_ARM64_DarwinPCS;
-  }
-}
-
-SDValue ARM64TargetLowering::LowerFormalArguments(
-    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
-    SmallVectorImpl<SDValue> &InVals) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-
-  // Assign locations to all of the incoming arguments.
-  SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
-
-  // At this point, Ins[].VT may already be promoted to i32. To correctly
-  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
-  // i8 to CC_ARM64_AAPCS with i32 being ValVT and i8 being LocVT.
-  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
-  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
-  // LocVT.
-  unsigned NumArgs = Ins.size();
-  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
-  unsigned CurArgIdx = 0;
-  for (unsigned i = 0; i != NumArgs; ++i) {
-    MVT ValVT = Ins[i].VT;
-    std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Ins[i].OrigArgIndex;
-
-    // Get type of the original argument.
-    EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
-    MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
-    // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
-    MVT LocVT = ValVT;
-    if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
-      LocVT = MVT::i8;
-    else if (ActualMVT == MVT::i16)
-      LocVT = MVT::i16;
-
-    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
-    bool Res =
-        AssignFn(i, ValVT, LocVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
-    assert(!Res && "Call operand has unhandled type");
-    (void)Res;
-  }
-
-  SmallVector<SDValue, 16> ArgValues;
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
-
-    // Arguments stored in registers.
- if (VA.isRegLoc()) { - EVT RegVT = VA.getLocVT(); - - SDValue ArgValue; - const TargetRegisterClass *RC; - - if (RegVT == MVT::i32) - RC = &ARM64::GPR32RegClass; - else if (RegVT == MVT::i64) - RC = &ARM64::GPR64RegClass; - else if (RegVT == MVT::f32) - RC = &ARM64::FPR32RegClass; - else if (RegVT == MVT::f64 || RegVT == MVT::v1i64 || - RegVT == MVT::v1f64 || RegVT == MVT::v2i32 || - RegVT == MVT::v4i16 || RegVT == MVT::v8i8) - RC = &ARM64::FPR64RegClass; - else if (RegVT == MVT::v2i64 || RegVT == MVT::v4i32 || - RegVT == MVT::v8i16 || RegVT == MVT::v16i8) - RC = &ARM64::FPR128RegClass; - else - llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); - - // Transform the arguments in physical registers into virtual ones. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); - ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); - - // If this is an 8, 16 or 32-bit value, it is really passed promoted - // to 64 bits. Insert an assert[sz]ext to capture this, then - // truncate to the right size. - switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::BCvt: - ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); - break; - case CCValAssign::SExt: - ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue, - DAG.getValueType(VA.getValVT())); - ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue); - break; - case CCValAssign::ZExt: - ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue, - DAG.getValueType(VA.getValVT())); - ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue); - break; - } - - InVals.push_back(ArgValue); - - } else { // VA.isRegLoc() - assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); - unsigned ArgOffset = VA.getLocMemOffset(); - unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8; - int FI = MFI->CreateFixedObject(ArgSize, ArgOffset, true); - - // Create load nodes to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), false, - false, false, 0)); - } - } - - // varargs - if (isVarArg) { - if (!Subtarget->isTargetDarwin()) { - // The AAPCS variadic function ABI is identical to the non-variadic - // one. As a result there may be more arguments in registers and we should - // save them for future reference. - saveVarArgRegisters(CCInfo, DAG, DL, Chain); - } - - ARM64FunctionInfo *AFI = MF.getInfo(); - // This will point to the next argument passed via stack. - unsigned StackOffset = CCInfo.getNextStackOffset(); - // We currently pass all varargs at 8-byte alignment. 
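
An aside on the round-up that follows: "(x + 7) & ~7" is the usual
power-of-two align-up idiom. A minimal standalone sketch of the same
arithmetic:

    #include <cassert>
    #include <cstdint>

    // Round Offset up to the next multiple of 8, as the code below does.
    static uint64_t alignTo8(uint64_t Offset) {
      return (Offset + 7) & ~uint64_t(7);
    }

    int main() {
      assert(alignTo8(0) == 0);
      assert(alignTo8(13) == 16); // 13 bytes of named args -> next slot at 16
      assert(alignTo8(16) == 16); // already-aligned offsets are unchanged
    }
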
- StackOffset = ((StackOffset + 7) & ~7); - AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); - } - - return Chain; -} - -void ARM64TargetLowering::saveVarArgRegisters(CCState &CCInfo, - SelectionDAG &DAG, SDLoc DL, - SDValue &Chain) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - ARM64FunctionInfo *FuncInfo = MF.getInfo(); - - SmallVector MemOps; - - static const uint16_t GPRArgRegs[] = { ARM64::X0, ARM64::X1, ARM64::X2, - ARM64::X3, ARM64::X4, ARM64::X5, - ARM64::X6, ARM64::X7 }; - static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); - unsigned FirstVariadicGPR = - CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs); - - static const uint16_t FPRArgRegs[] = { ARM64::Q0, ARM64::Q1, ARM64::Q2, - ARM64::Q3, ARM64::Q4, ARM64::Q5, - ARM64::Q6, ARM64::Q7 }; - static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); - unsigned FirstVariadicFPR = - CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs); - - unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); - int GPRIdx = 0; - if (GPRSaveSize != 0) { - GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); - - SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); - - for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &ARM64::GPR64RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 8), false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(8, getPointerTy())); - } - } - - unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); - int FPRIdx = 0; - if (FPRSaveSize != 0) { - FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); - - SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); - - for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &ARM64::FPR128RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::v2i64); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 16), false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(16, getPointerTy())); - } - } - - FuncInfo->setVarArgsGPRIndex(GPRIdx); - FuncInfo->setVarArgsGPRSize(GPRSaveSize); - FuncInfo->setVarArgsFPRIndex(FPRIdx); - FuncInfo->setVarArgsFPRSize(FPRSaveSize); - - if (!MemOps.empty()) { - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], - MemOps.size()); - } -} - -/// LowerCallResult - Lower the result values of a call into the -/// appropriate copies out of appropriate physical registers. -SDValue ARM64TargetLowering::LowerCallResult( - SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals, bool isThisReturn, - SDValue ThisVal) const { - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - // Assign locations to each value returned by this call. - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, RetCC); - - // Copy all of the result registers out of their specified physreg. 
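
The loop below follows the usual SelectionDAG discipline for reading call
results: each CopyFromReg yields (value, chain, glue), and the next copy must
consume the fresh chain and glue so the register reads stay pinned right
after the call. A toy model of that threading (the types and function are
stand-ins, not LLVM API):

    #include <cassert>

    struct Copy { int Value; int Chain; int Glue; };

    // Pretend each copy produces new chain and glue tokens.
    static Copy copyFromReg(int Chain, int Glue, int Reg) {
      return Copy{Reg * 10, Chain + 1, Glue + 1};
    }

    int main() {
      int Chain = 0, Glue = 0;
      for (int Reg = 1; Reg <= 3; ++Reg) {
        Copy C = copyFromReg(Chain, Glue, Reg);
        Chain = C.Chain; // thread the new chain into the next copy
        Glue = C.Glue;   // glue the next copy directly after this one
      }
      assert(Chain == 3 && Glue == 3);
    }
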
- for (unsigned i = 0; i != RVLocs.size(); ++i) { - CCValAssign VA = RVLocs[i]; - - // Pass 'this' value directly from the argument to return value, to avoid - // reg unit interference - if (i == 0 && isThisReturn) { - assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && - "unexpected return calling convention register assignment"); - InVals.push_back(ThisVal); - continue; - } - - SDValue Val = - DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); - Chain = Val.getValue(1); - InFlag = Val.getValue(2); - - switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::BCvt: - Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); - break; - } - - InVals.push_back(Val); - } - - return Chain; -} - -bool ARM64TargetLowering::isEligibleForTailCallOptimization( - SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, SelectionDAG &DAG) const { - // Look for obvious safe cases to perform tail call optimization that do not - // require ABI changes. This is what gcc calls sibcall. - - // Do not sibcall optimize vararg calls unless the call site is not passing - // any arguments. - if (isVarArg && !Outs.empty()) - return false; - - // Also avoid sibcall optimization if either caller or callee uses struct - // return semantics. - if (isCalleeStructRet || isCallerStructRet) - return false; - - // Note that currently ARM64 "C" calling convention and "Fast" calling - // convention are compatible. If/when that ever changes, we'll need to - // add checks here to make sure any interactions are OK. - - // If the callee takes no arguments then go on to check the results of the - // call. - if (!Outs.empty()) { - // Check if stack adjustment is needed. For now, do not do this if any - // argument is passed on the stack. - SmallVector ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - CCAssignFn *AssignFn = CCAssignFnForCall(CalleeCC, /*IsVarArg=*/false); - CCInfo.AnalyzeCallOperands(Outs, AssignFn); - if (CCInfo.getNextStackOffset()) { - // Check if the arguments are already laid out in the right way as - // the caller's fixed stack objects. - for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; - ++i, ++realArgIdx) { - CCValAssign &VA = ArgLocs[i]; - if (VA.getLocInfo() == CCValAssign::Indirect) - return false; - if (VA.needsCustom()) { - // Just don't handle anything that needs custom adjustments for now. - // If need be, we can revisit later, but we shouldn't ever end up - // here. - return false; - } else if (!VA.isRegLoc()) { - // Likewise, don't try to handle stack based arguments for the - // time being. - return false; - } - } - } - } - - return true; -} -/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, -/// and add input and output parameter nodes. -SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const { - SelectionDAG &DAG = CLI.DAG; - SDLoc &DL = CLI.DL; - SmallVector &Outs = CLI.Outs; - SmallVector &OutVals = CLI.OutVals; - SmallVector &Ins = CLI.Ins; - SDValue Chain = CLI.Chain; - SDValue Callee = CLI.Callee; - bool &IsTailCall = CLI.IsTailCall; - CallingConv::ID CallConv = CLI.CallConv; - bool IsVarArg = CLI.IsVarArg; - - MachineFunction &MF = DAG.getMachineFunction(); - bool IsStructRet = (Outs.empty()) ? 
false : Outs[0].Flags.isSRet(); - bool IsThisReturn = false; - - // If tail calls are explicitly disabled, make sure not to use them. - if (!EnableARM64TailCalls) - IsTailCall = false; - - if (IsTailCall) { - // Check if it's really possible to do a tail call. - IsTailCall = isEligibleForTailCallOptimization( - Callee, CallConv, IsVarArg, IsStructRet, - MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); - // We don't support GuaranteedTailCallOpt, only automatically - // detected sibcalls. - // FIXME: Re-evaluate. Is this true? Should it be true? - if (IsTailCall) - ++NumTailCalls; - } - - // Analyze operands of the call, assigning locations to each operand. - SmallVector ArgLocs; - CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - - if (IsVarArg) { - // Handle fixed and variable vector arguments differently. - // Variable vector arguments always go into memory. - unsigned NumArgs = Outs.size(); - - for (unsigned i = 0; i != NumArgs; ++i) { - MVT ArgVT = Outs[i].VT; - ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, - /*IsVarArg=*/ !Outs[i].IsFixed); - bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; - } - } else { - // At this point, Outs[].VT may already be promoted to i32. To correctly - // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and - // i8 to CC_ARM64_AAPCS with i32 being ValVT and i8 being LocVT. - // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here - // we use a special version of AnalyzeCallOperands to pass in ValVT and - // LocVT. - unsigned NumArgs = Outs.size(); - for (unsigned i = 0; i != NumArgs; ++i) { - MVT ValVT = Outs[i].VT; - // Get type of the original argument. - EVT ActualVT = getValueType(CLI.Args[Outs[i].OrigArgIndex].Ty, - /*AllowUnknown*/ true); - MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; - ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. - MVT LocVT = ValVT; - if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) - LocVT = MVT::i8; - else if (ActualMVT == MVT::i16) - LocVT = MVT::i16; - - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); - bool Res = AssignFn(i, ValVT, LocVT, CCValAssign::Full, ArgFlags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; - } - } - - // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); - - // Adjust the stack pointer for the new arguments... - // These operations are automatically eliminated by the prolog/epilog pass - if (!IsTailCall) - Chain = - DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL); - - SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, ARM64::SP, getPointerTy()); - - SmallVector, 8> RegsToPass; - SmallVector MemOpChains; - - // Walk the register/memloc assignments, inserting copies/loads. - for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; - ++i, ++realArgIdx) { - CCValAssign &VA = ArgLocs[i]; - SDValue Arg = OutVals[realArgIdx]; - ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; - - // Promote the value if needed. 
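
The ValVT/LocVT split described above boils down to a small narrowing rule:
the promoted i32 stays as ValVT while the original i1/i8/i16 width is
reinstated as LocVT. A standalone sketch with plain bit-widths standing in
for MVTs:

    #include <cassert>

    // Location width in bits, given the original argument width.
    static unsigned locWidth(unsigned ActualBits, unsigned ValBits) {
      if (ActualBits == 1 || ActualBits == 8)
        return 8;        // i1/i8 -> i8
      if (ActualBits == 16)
        return 16;       // i16 -> i16
      return ValBits;    // otherwise keep the (possibly promoted) type
    }

    int main() {
      assert(locWidth(1, 32) == 8); // an i1 travels as i8 on the stack
      assert(locWidth(8, 32) == 8);
      assert(locWidth(16, 32) == 16);
      assert(locWidth(64, 64) == 64);
    }
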
- switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::SExt: - Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::ZExt: - Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::AExt: - Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::FPExt: - Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); - break; - } - - if (VA.isRegLoc()) { - if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { - assert(VA.getLocVT() == MVT::i64 && - "unexpected calling convention register assignment"); - assert(!Ins.empty() && Ins[0].VT == MVT::i64 && - "unexpected use of 'returned'"); - IsThisReturn = true; - } - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - } else { - assert(VA.isMemLoc()); - // There's no reason we can't support stack args w/ tailcall, but - // we currently don't, so assert if we see one. - assert(!IsTailCall && "stack argument with tail call!?"); - unsigned LocMemOffset = VA.getLocMemOffset(); - SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); - PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); - - // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already - // promoted to a legal register type i32, we should truncate Arg back to - // i1/i8/i16. - if (Arg.getValueType().isSimple() && - Arg.getValueType().getSimpleVT() == MVT::i32 && - (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 || - VA.getLocVT() == MVT::i16)) - Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg); - - SDValue Store = DAG.getStore(Chain, DL, Arg, PtrOff, - MachinePointerInfo::getStack(LocMemOffset), - false, false, 0); - MemOpChains.push_back(Store); - } - } - - if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOpChains[0], - MemOpChains.size()); - - // Build a sequence of copy-to-reg nodes chained together with token chain - // and flag operands which copy the outgoing args into the appropriate regs. - SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - - // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every - // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol - // node so that legalize doesn't hack it. 
- if (getTargetMachine().getCodeModel() == CodeModel::Large && - Subtarget->isTargetMachO()) { - if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - const GlobalValue *GV = G->getGlobal(); - bool InternalLinkage = GV->hasInternalLinkage(); - if (InternalLinkage) - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); - else { - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, - ARM64II::MO_GOT); - Callee = DAG.getNode(ARM64ISD::LOADgot, DL, getPointerTy(), Callee); - } - } else if (ExternalSymbolSDNode *S = - dyn_cast(Callee)) { - const char *Sym = S->getSymbol(); - Callee = - DAG.getTargetExternalSymbol(Sym, getPointerTy(), ARM64II::MO_GOT); - Callee = DAG.getNode(ARM64ISD::LOADgot, DL, getPointerTy(), Callee); - } - } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - const GlobalValue *GV = G->getGlobal(); - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); - } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { - const char *Sym = S->getSymbol(); - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0); - } - - std::vector Ops; - Ops.push_back(Chain); - Ops.push_back(Callee); - - // Add argument registers to the end of the list so that they are known live - // into the call. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); - - // Add a register mask operand representing the call-preserved registers. - const uint32_t *Mask; - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const ARM64RegisterInfo *ARI = static_cast(TRI); - if (IsThisReturn) { - // For 'this' returns, use the X0-preserving mask if applicable - Mask = ARI->getThisReturnPreservedMask(CallConv); - if (!Mask) { - IsThisReturn = false; - Mask = ARI->getCallPreservedMask(CallConv); - } - } else - Mask = ARI->getCallPreservedMask(CallConv); - - assert(Mask && "Missing call preserved mask for calling convention"); - Ops.push_back(DAG.getRegisterMask(Mask)); - - if (InFlag.getNode()) - Ops.push_back(InFlag); - - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - - // If we're doing a tall call, use a TC_RETURN here rather than an - // actual call instruction. - if (IsTailCall) - return DAG.getNode(ARM64ISD::TC_RETURN, DL, NodeTys, &Ops[0], Ops.size()); - - // Returns a chain and a flag for retval copy to use. - Chain = DAG.getNode(ARM64ISD::CALL, DL, NodeTys, &Ops[0], Ops.size()); - InFlag = Chain.getValue(1); - - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag, DL); - if (!Ins.empty()) - InFlag = Chain.getValue(1); - - // Handle result values, copying them out of physregs into vregs that we - // return. - return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, - InVals, IsThisReturn, - IsThisReturn ? OutVals[0] : SDValue()); -} - -bool ARM64TargetLowering::CanLowerReturn( - CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, - const SmallVectorImpl &Outs, LLVMContext &Context) const { - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? 
RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context); - return CCInfo.CheckReturn(Outs, RetCC); -} - -SDValue -ARM64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc DL, SelectionDAG &DAG) const { - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeReturn(Outs, RetCC); - - // Copy the result values into the output registers. - SDValue Flag; - SmallVector RetOps(1, Chain); - for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); - ++i, ++realRVLocIdx) { - CCValAssign &VA = RVLocs[i]; - assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue Arg = OutVals[realRVLocIdx]; - - switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); - break; - } - - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); - Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); - } - - RetOps[0] = Chain; // Update chain. - - // Add the flag if we have it. - if (Flag.getNode()) - RetOps.push_back(Flag); - - return DAG.getNode(ARM64ISD::RET_FLAG, DL, MVT::Other, &RetOps[0], - RetOps.size()); -} - -//===----------------------------------------------------------------------===// -// Other Lowering Code -//===----------------------------------------------------------------------===// - -SDValue ARM64TargetLowering::LowerGlobalAddress(SDValue Op, - SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - const GlobalValue *GV = cast(Op)->getGlobal(); - unsigned char OpFlags = - Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); - - assert(cast(Op)->getOffset() == 0 && - "unexpected offset in global node"); - - // This also catched the large code model case for Darwin. - if ((OpFlags & ARM64II::MO_GOT) != 0) { - SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); - // FIXME: Once remat is capable of dealing with instructions with register - // operands, expand this into two nodes instead of using a wrapper node. - return DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, GotAddr); - } - - if (getTargetMachine().getCodeModel() == CodeModel::Large) { - const unsigned char MO_NC = ARM64II::MO_NC; - return DAG.getNode( - ARM64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G3), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G2 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G1 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G0 | MO_NC)); - } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and - // the only correct model on Darwin. 
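
The ADRP/ADD pair built below splits an address into a 4 KiB page base
(MO_PAGE) and the offset within that page (MO_PAGEOFF). A standalone sketch
of the arithmetic, using an arbitrary example address:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t Sym = 0x1234567;               // hypothetical symbol address
      uint64_t Page = Sym & ~uint64_t(0xfff); // what ADRP materializes
      uint64_t Lo12 = Sym & 0xfff;            // the :lo12: ADD immediate
      assert(Page + Lo12 == Sym);
    }
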
- SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - OpFlags | ARM64II::MO_PAGE); - unsigned char LoFlags = OpFlags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC; - SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); - - SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - } -} - -/// \brief Convert a TLS address reference into the correct sequence of loads -/// and calls to compute the variable's address (for Darwin, currently) and -/// return an SDValue containing the final node. - -/// Darwin only has one TLS scheme which must be capable of dealing with the -/// fully general situation, in the worst case. This means: -/// + "extern __thread" declaration. -/// + Defined in a possibly unknown dynamic library. -/// -/// The general system is that each __thread variable has a [3 x i64] descriptor -/// which contains information used by the runtime to calculate the address. The -/// only part of this the compiler needs to know about is the first xword, which -/// contains a function pointer that must be called with the address of the -/// entire descriptor in "x0". -/// -/// Since this descriptor may be in a different unit, in general even the -/// descriptor must be accessed via an indirect load. The "ideal" code sequence -/// is: -/// adrp x0, _var@TLVPPAGE -/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor -/// ldr x1, [x0] ; x1 contains 1st entry of descriptor, -/// ; the function pointer -/// blr x1 ; Uses descriptor address in x0 -/// ; Address of _var is now in x0. -/// -/// If the address of _var's descriptor *is* known to the linker, then it can -/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for -/// a slight efficiency gain. -SDValue -ARM64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); - - SDLoc DL(Op); - MVT PtrVT = getPointerTy(); - const GlobalValue *GV = cast(Op)->getGlobal(); - - SDValue TLVPAddr = - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS); - SDValue DescAddr = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, TLVPAddr); - - // The first entry in the descriptor is a function pointer that we must call - // to obtain the address of the variable. - SDValue Chain = DAG.getEntryNode(); - SDValue FuncTLVGet = - DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), - false, true, true, 8); - Chain = FuncTLVGet.getValue(1); - - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setAdjustsStack(true); - - // TLS calls preserve all registers except those that absolutely must be - // trashed: X0 (it takes an argument), LR (it's a call) and CPSR (let's not be - // silly). - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const ARM64RegisterInfo *ARI = static_cast(TRI); - const uint32_t *Mask = ARI->getTLSCallPreservedMask(); - - // Finally, we can make the call. This is just a degenerate version of a - // normal ARM64 call node: x0 takes the address of the descriptor, and returns - // the address of the variable in this thread. 
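
Written out as a C++ type, the descriptor described above looks roughly like
the struct below. Only the first field is the compiler's concern; the field
names are illustrative guesses at the runtime's layout, not a public API:

    #include <cstdint>

    struct TLVDescriptor {
      void *(*Thunk)(TLVDescriptor *); // entry 0: called with x0 = &descriptor
      uint64_t Key;                    // entries 1-2: private to the runtime
      uint64_t Offset;
    };
    static_assert(sizeof(TLVDescriptor) == 24, "[3 x i64] on LP64");
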
-  Chain = DAG.getCopyToReg(Chain, DL, ARM64::X0, DescAddr, SDValue());
-  Chain = DAG.getNode(ARM64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
-                      Chain, FuncTLVGet, DAG.getRegister(ARM64::X0, MVT::i64),
-                      DAG.getRegisterMask(Mask), Chain.getValue(1));
-  return DAG.getCopyFromReg(Chain, DL, ARM64::X0, PtrVT, Chain.getValue(1));
-}
-
-/// When accessing thread-local variables under either the general-dynamic or
-/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
-/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
-/// is a function pointer to carry out the resolution. This function takes the
-/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All
-/// other registers (except LR, CPSR) are preserved.
-///
-/// Thus, the ideal call sequence on AArch64 is:
-///
-///     adrp x0, :tlsdesc:thread_var
-///     ldr x8, [x0, :tlsdesc_lo12:thread_var]
-///     add x0, x0, :tlsdesc_lo12:thread_var
-///     .tlsdesccall thread_var
-///     blr x8
-///     (TPIDR_EL0 offset now in x0).
-///
-/// The ".tlsdesccall" directive instructs the assembler to insert a particular
-/// relocation to help the linker relax this sequence if it turns out to be too
-/// conservative.
-///
-/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this
-/// is harmless.
-SDValue ARM64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
-                                                 SDValue DescAddr, SDLoc DL,
-                                                 SelectionDAG &DAG) const {
-  EVT PtrVT = getPointerTy();
-
-  // The function we need to call is simply the first entry in the GOT for this
-  // descriptor, load it in preparation.
-  SDValue Func = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, SymAddr);
-
-  // TLS calls preserve all registers except those that absolutely must be
-  // trashed: X0 (it takes an argument), LR (it's a call) and CPSR (let's not be
-  // silly).
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-  const ARM64RegisterInfo *ARI = static_cast<const ARM64RegisterInfo *>(TRI);
-  const uint32_t *Mask = ARI->getTLSCallPreservedMask();
-
-  // The function takes only one argument: the address of the descriptor itself
-  // in X0.
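
The net effect of the sequence above can be sketched as ordinary C++ with
hypothetical helpers; neither function below is a real API, they stand in for
the "blr x8" through the resolver and for "mrs xN, TPIDR_EL0":

    #include <cstdint>

    extern "C" uint64_t tlsdesc_resolve(const void *Desc); // x0 in, x0 out
    uint64_t read_tpidr_el0();                             // thread pointer

    // Address of a general-dynamic TLS variable, given its GOT descriptor.
    void *addrOfTLSVar(const void *Desc) {
      uint64_t TPOff = tlsdesc_resolve(Desc);    // offset from TPIDR_EL0
      return (void *)(read_tpidr_el0() + TPOff); // thread pointer + offset
    }
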
- SDValue Glue, Chain; - Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM64::X0, DescAddr, Glue); - Glue = Chain.getValue(1); - - // We're now ready to populate the argument list, as with a normal call: - SmallVector Ops; - Ops.push_back(Chain); - Ops.push_back(Func); - Ops.push_back(SymAddr); - Ops.push_back(DAG.getRegister(ARM64::X0, PtrVT)); - Ops.push_back(DAG.getRegisterMask(Mask)); - Ops.push_back(Glue); - - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(ARM64ISD::TLSDESC_CALL, DL, NodeTys, &Ops[0], Ops.size()); - Glue = Chain.getValue(1); - - return DAG.getCopyFromReg(Chain, DL, ARM64::X0, PtrVT, Glue); -} - -SDValue ARM64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetELF() && "This function expects an ELF target"); - assert(getTargetMachine().getCodeModel() == CodeModel::Small && - "ELF TLS only supported in small memory model"); - const GlobalAddressSDNode *GA = cast(Op); - - TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); - - SDValue TPOff; - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - const GlobalValue *GV = GA->getGlobal(); - - SDValue ThreadBase = DAG.getNode(ARM64ISD::THREAD_POINTER, DL, PtrVT); - - if (Model == TLSModel::LocalExec) { - SDValue HiVar = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_G1); - SDValue LoVar = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_G0 | ARM64II::MO_NC); - - TPOff = SDValue(DAG.getMachineNode(ARM64::MOVZXi, DL, PtrVT, HiVar, - DAG.getTargetConstant(16, MVT::i32)), - 0); - TPOff = SDValue(DAG.getMachineNode(ARM64::MOVKXi, DL, PtrVT, TPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), - 0); - } else if (Model == TLSModel::InitialExec) { - TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS); - TPOff = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, TPOff); - } else if (Model == TLSModel::LocalDynamic) { - // Local-dynamic accesses proceed in two phases. A general-dynamic TLS - // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate - // the beginning of the module's TLS region, followed by a DTPREL offset - // calculation. - - // These accesses will need deduplicating if there's more than one. - ARM64FunctionInfo *MFI = - DAG.getMachineFunction().getInfo(); - MFI->incNumLocalDynamicTLSAccesses(); - - // Accesses used in this sequence go via the TLS descriptor which lives in - // the GOT. Prepare an address we can use to handle this. - SDValue HiDesc = DAG.getTargetExternalSymbol( - "_TLS_MODULE_BASE_", PtrVT, ARM64II::MO_TLS | ARM64II::MO_PAGE); - SDValue LoDesc = DAG.getTargetExternalSymbol( - "_TLS_MODULE_BASE_", PtrVT, - ARM64II::MO_TLS | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - - // First argument to the descriptor call is the address of the descriptor - // itself. - SDValue DescAddr = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, HiDesc); - DescAddr = DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); - - // The call needs a relocation too for linker relaxation. It doesn't make - // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of - // the address. - SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, - ARM64II::MO_TLS); - - // Now we can calculate the offset from TPIDR_EL0 to this module's - // thread-local area. 
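
Local-dynamic access therefore composes two offsets on top of the thread
pointer; schematically, with the same hypothetical helpers as in the sketch
above:

    #include <cstdint>

    extern "C" uint64_t tlsdesc_resolve(const void *Desc);
    uint64_t read_tpidr_el0();

    // &var = TP + offset(_TLS_MODULE_BASE_) + var's DTPREL offset within the
    // module's TLS block (the MOVZ/MOVK pair below materializes the latter).
    void *addrOfLocalDynamicVar(const void *ModuleDesc, uint64_t DTPOff) {
      return (void *)(read_tpidr_el0() + tlsdesc_resolve(ModuleDesc) + DTPOff);
    }
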
- TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); - - // Now use :dtprel_whatever: operations to calculate this variable's offset - // in its thread-storage area. - SDValue HiVar = DAG.getTargetGlobalAddress( - GV, DL, MVT::i64, 0, ARM64II::MO_TLS | ARM64II::MO_G1); - SDValue LoVar = DAG.getTargetGlobalAddress( - GV, DL, MVT::i64, 0, ARM64II::MO_TLS | ARM64II::MO_G0 | ARM64II::MO_NC); - - SDValue DTPOff = - SDValue(DAG.getMachineNode(ARM64::MOVZXi, DL, PtrVT, HiVar, - DAG.getTargetConstant(16, MVT::i32)), - 0); - DTPOff = SDValue(DAG.getMachineNode(ARM64::MOVKXi, DL, PtrVT, DTPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), - 0); - - TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff); - } else if (Model == TLSModel::GeneralDynamic) { - // Accesses used in this sequence go via the TLS descriptor which lives in - // the GOT. Prepare an address we can use to handle this. - SDValue HiDesc = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_PAGE); - SDValue LoDesc = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, - ARM64II::MO_TLS | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - - // First argument to the descriptor call is the address of the descriptor - // itself. - SDValue DescAddr = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, HiDesc); - DescAddr = DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); - - // The call needs a relocation too for linker relaxation. It doesn't make - // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of - // the address. - SDValue SymAddr = - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS); - - // Finally we can make a call to calculate the offset from tpidr_el0. - TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); - } else - llvm_unreachable("Unsupported ELF TLS access model"); - - return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); -} - -SDValue ARM64TargetLowering::LowerGlobalTLSAddress(SDValue Op, - SelectionDAG &DAG) const { - if (Subtarget->isTargetDarwin()) - return LowerDarwinGlobalTLSAddress(Op, DAG); - else if (Subtarget->isTargetELF()) - return LowerELFGlobalTLSAddress(Op, DAG); - - llvm_unreachable("Unexpected platform trying to use TLS"); -} -SDValue ARM64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - ISD::CondCode CC = cast(Op.getOperand(1))->get(); - SDValue LHS = Op.getOperand(2); - SDValue RHS = Op.getOperand(3); - SDValue Dest = Op.getOperand(4); - SDLoc dl(Op); - - // Handle f128 first, since lowering it will result in comparing the return - // value of a libcall against zero, which is just what the rest of LowerBR_CC - // is expecting to deal with. - if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - - // If softenSetCCOperands returned a scalar, we need to compare the result - // against zero to select between true and false values. - if (RHS.getNode() == 0) { - RHS = DAG.getConstant(0, LHS.getValueType()); - CC = ISD::SETNE; - } - } - - // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch - // instruction. - unsigned Opc = LHS.getOpcode(); - if (LHS.getResNo() == 1 && isa(RHS) && - cast(RHS)->isOne() && - (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || - Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { - assert((CC == ISD::SETEQ || CC == ISD::SETNE) && - "Unexpected condition code."); - // Only lower legal XALUO ops. 
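
For reference, the predicate these overflow intrinsics compute can be
restated portably; the helper below is illustrative only (the real lowering
reads the flag produced by the node that getARM64XALUOOp builds):

    #include <cassert>
    #include <cstdint>
    #include <limits>

    // Signed 32-bit add with overflow flag, as llvm.sadd.with.overflow.i32
    // defines it.
    static bool saddOverflow(int32_t A, int32_t B, int32_t &Out) {
      Out = (int32_t)((uint32_t)A + (uint32_t)B); // wrapping add
      return ((A ^ Out) & (B ^ Out)) < 0;         // Out's sign differs from both
    }

    int main() {
      int32_t R;
      assert(!saddOverflow(1, 2, R) && R == 3);
      assert(saddOverflow(std::numeric_limits<int32_t>::max(), 1, R));
    }
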
- if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) - return SDValue(); - - // The actual operation with overflow check. - ARM64CC::CondCode OFCC; - SDValue Value, Overflow; - std::tie(Value, Overflow) = getARM64XALUOOp(OFCC, LHS.getValue(0), DAG); - - if (CC == ISD::SETNE) - OFCC = getInvertedCondCode(OFCC); - SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); - - return DAG.getNode(ARM64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest, - CCVal, Overflow); - } - - if (LHS.getValueType().isInteger()) { - assert((LHS.getValueType() == RHS.getValueType()) && - (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); - - // If the RHS of the comparison is zero, we can potentially fold this - // to a specialized branch. - const ConstantSDNode *RHSC = dyn_cast(RHS); - if (RHSC && RHSC->getZExtValue() == 0) { - if (CC == ISD::SETEQ) { - // See if we can use a TBZ to fold in an AND as well. - // TBZ has a smaller branch displacement than CBZ. If the offset is - // out of bounds, a late MI-layer pass rewrites branches. - // 403.gcc is an example that hits this case. - if (LHS.getOpcode() == ISD::AND && - isa(LHS.getOperand(1)) && - isPowerOf2_64(LHS.getConstantOperandVal(1))) { - SDValue Test = LHS.getOperand(0); - uint64_t Mask = LHS.getConstantOperandVal(1); - - // TBZ only operates on i64's, but the ext should be free. - if (Test.getValueType() == MVT::i32) - Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); - - return DAG.getNode(ARM64ISD::TBZ, dl, MVT::Other, Chain, Test, - DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); - } - - return DAG.getNode(ARM64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); - } else if (CC == ISD::SETNE) { - // See if we can use a TBZ to fold in an AND as well. - // TBZ has a smaller branch displacement than CBZ. If the offset is - // out of bounds, a late MI-layer pass rewrites branches. - // 403.gcc is an example that hits this case. - if (LHS.getOpcode() == ISD::AND && - isa(LHS.getOperand(1)) && - isPowerOf2_64(LHS.getConstantOperandVal(1))) { - SDValue Test = LHS.getOperand(0); - uint64_t Mask = LHS.getConstantOperandVal(1); - - // TBNZ only operates on i64's, but the ext should be free. - if (Test.getValueType() == MVT::i32) - Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); - - return DAG.getNode(ARM64ISD::TBNZ, dl, MVT::Other, Chain, Test, - DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); - } - - return DAG.getNode(ARM64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); - } - } - - SDValue CCVal; - SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl); - return DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, - Cmp); - } - - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two branches to implement. 
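
A concrete case of that unclean mapping: "unordered or equal" (SETUEQ) has no
single ARM64 condition code and is implemented as EQ plus VS, which is
exactly the two-branch shape emitted below. A portable restatement of the
predicate:

    #include <cassert>
    #include <cmath>

    // SETUEQ: true when the operands compare equal OR are unordered.
    static bool setueq(double A, double B) {
      return (A == B) || std::isnan(A) || std::isnan(B); // EQ || VS
    }

    int main() {
      assert(setueq(1.0, 1.0));  // EQ leg
      assert(setueq(NAN, 1.0));  // VS leg: this is why a second branch exists
      assert(!setueq(1.0, 2.0));
    }
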
- SDValue Cmp = emitComparison(LHS, RHS, dl, DAG); - ARM64CC::CondCode CC1, CC2; - changeFPCCToARM64CC(CC, CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - SDValue BR1 = - DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); - if (CC2 != ARM64CC::AL) { - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); - return DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, - Cmp); - } - - return BR1; -} - -SDValue ARM64TargetLowering::LowerFCOPYSIGN(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - - SDValue In1 = Op.getOperand(0); - SDValue In2 = Op.getOperand(1); - EVT SrcVT = In2.getValueType(); - if (SrcVT != VT) { - if (SrcVT == MVT::f32 && VT == MVT::f64) - In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); - else if (SrcVT == MVT::f64 && VT == MVT::f32) - In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0)); - else - // FIXME: Src type is different, bail out for now. Can VT really be a - // vector type? - return SDValue(); - } - - EVT VecVT; - EVT EltVT; - SDValue EltMask, VecVal1, VecVal2; - if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { - EltVT = MVT::i32; - VecVT = MVT::v4i32; - EltMask = DAG.getConstant(0x80000000ULL, EltVT); - - if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(ARM64::ssub, DL, VecVT, - DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(ARM64::ssub, DL, VecVT, - DAG.getUNDEF(VecVT), In2); - } else { - VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); - VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); - } - } else if (VT == MVT::f64 || VT == MVT::v2f64) { - EltVT = MVT::i64; - VecVT = MVT::v2i64; - - // We want to materialize a mask with the the high bit set, but the AdvSIMD - // immediate moves cannot materialize that in a single instruction for - // 64-bit elements. Instead, materialize zero and then negate it. - EltMask = DAG.getConstant(0, EltVT); - - if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(ARM64::dsub, DL, VecVT, - DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(ARM64::dsub, DL, VecVT, - DAG.getUNDEF(VecVT), In2); - } else { - VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); - VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); - } - } else { - llvm_unreachable("Invalid type for copysign!"); - } - - std::vector BuildVectorOps; - for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i) - BuildVectorOps.push_back(EltMask); - - SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, - &BuildVectorOps[0], BuildVectorOps.size()); - - // If we couldn't materialize the mask above, then the mask vector will be - // the zero vector, and we need to negate it here. 
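
The negation trick works because FNEG of +0.0 flips only the sign bit: every
lane of the zero vector becomes -0.0, whose representation is exactly the
0x8000000000000000 mask that the immediate moves cannot encode. A standalone
check:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      double NegZero = -0.0;
      uint64_t Bits;
      std::memcpy(&Bits, &NegZero, sizeof Bits);
      assert(Bits == 0x8000000000000000ULL); // only the sign bit is set
    }
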
- if (VT == MVT::f64 || VT == MVT::v2f64) { - BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); - BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); - BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); - } - - SDValue Sel = - DAG.getNode(ARM64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); - - if (VT == MVT::f32) - return DAG.getTargetExtractSubreg(ARM64::ssub, DL, VT, Sel); - else if (VT == MVT::f64) - return DAG.getTargetExtractSubreg(ARM64::dsub, DL, VT, Sel); - else - return DAG.getNode(ISD::BITCAST, DL, VT, Sel); -} - -SDValue ARM64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { - if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) - return SDValue(); - - // While there is no integer popcount instruction, it can - // be more efficiently lowered to the following sequence that uses - // AdvSIMD registers/instructions as long as the copies to/from - // the AdvSIMD registers are cheap. - // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd - // CNT V0.8B, V0.8B // 8xbyte pop-counts - // ADDV B0, V0.8B // sum 8xbyte pop-counts - // UMOV X0, V0.B[0] // copy byte result back to integer reg - SDValue Val = Op.getOperand(0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8); - - SDValue VecVal; - if (VT == MVT::i32) { - VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val); - VecVal = - DAG.getTargetInsertSubreg(ARM64::ssub, DL, MVT::v8i8, ZeroVec, VecVal); - } else { - VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); - } - - SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal); - SDValue UaddLV = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, - DAG.getConstant(Intrinsic::arm64_neon_uaddlv, MVT::i32), CtPop); - - if (VT == MVT::i64) - UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); - return UaddLV; -} - -SDValue ARM64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - - if (Op.getValueType().isVector()) - return LowerVSETCC(Op, DAG); - - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - ISD::CondCode CC = cast(Op.getOperand(2))->get(); - SDLoc dl(Op); - - // We chose ZeroOrOneBooleanContents, so use zero and one. - EVT VT = Op.getValueType(); - SDValue TVal = DAG.getConstant(1, VT); - SDValue FVal = DAG.getConstant(0, VT); - - // Handle f128 first, since one possible outcome is a normal integer - // comparison which gets picked up by the next if statement. - if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - - // If softenSetCCOperands returned a scalar, use it. - if (RHS.getNode() == 0) { - assert(LHS.getValueType() == Op.getValueType() && - "Unexpected setcc expansion!"); - return LHS; - } - } - - if (LHS.getValueType().isInteger()) { - SDValue CCVal; - SDValue Cmp = - getARM64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); - - // Note that we inverted the condition above, so we reverse the order of - // the true and false operands here. This will allow the setcc to be - // matched to a single CSINC instruction. - return DAG.getNode(ARM64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); - } - - // Now we know we're dealing with FP values. - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - - // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead - // and do the comparison. 
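
The single-CSINC matching mentioned in LowerSETCC above relies on the
instruction's semantics: CSINC Wd, Wn, Wm, cond yields Wn when cond holds and
Wm + 1 otherwise, so with both sources WZR and the inverted condition it
materializes cc ? 1 : 0. A schematic model:

    #include <cassert>

    // CSINC Wd, Wn, Wm, cond  =>  Wd = cond ? Wn : Wm + 1.
    static unsigned csinc(unsigned N, unsigned M, bool Cond) {
      return Cond ? N : M + 1;
    }

    int main() {
      bool CC = true;                // the original setcc condition
      assert(csinc(0, 0, !CC) == 1); // cc true  -> 1
      CC = false;
      assert(csinc(0, 0, !CC) == 0); // cc false -> 0
    }
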
- SDValue Cmp = emitComparison(LHS, RHS, dl, DAG); - - ARM64CC::CondCode CC1, CC2; - changeFPCCToARM64CC(CC, CC1, CC2); - if (CC2 == ARM64CC::AL) { - changeFPCCToARM64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - - // Note that we inverted the condition above, so we reverse the order of - // the true and false operands here. This will allow the setcc to be - // matched to a single CSINC instruction. - return DAG.getNode(ARM64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); - } else { - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two CSELs to implement. As is in this case, - // we emit the first CSEL and then emit a second using the output of the - // first as the RHS. We're effectively OR'ing the two CC's together. - - // FIXME: It would be nice if we could match the two CSELs to two CSINCs. - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - SDValue CS1 = DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); - - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); - return DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); - } -} - -/// A SELECT_CC operation is really some kind of max or min if both values being -/// compared are, in some sense, equal to the results in either case. However, -/// it is permissible to compare f32 values and produce directly extended f64 -/// values. -/// -/// Extending the comparison operands would also be allowed, but is less likely -/// to happen in practice since their use is right here. Note that truncate -/// operations would *not* be semantically equivalent. -static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { - if (Cmp == Result) - return true; - - ConstantFPSDNode *CCmp = dyn_cast(Cmp); - ConstantFPSDNode *CResult = dyn_cast(Result); - if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && - Result.getValueType() == MVT::f64) { - bool Lossy; - APFloat CmpVal = CCmp->getValueAPF(); - CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); - return CResult->getValueAPF().bitwiseIsEqual(CmpVal); - } - - return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; -} - -SDValue ARM64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - SDValue CC = Op->getOperand(0); - SDValue TVal = Op->getOperand(1); - SDValue FVal = Op->getOperand(2); - SDLoc DL(Op); - - unsigned Opc = CC.getOpcode(); - // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select - // instruction. - if (CC.getResNo() == 1 && - (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || - Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { - // Only lower legal XALUO ops. 
- if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0))) - return SDValue(); - - ARM64CC::CondCode OFCC; - SDValue Value, Overflow; - std::tie(Value, Overflow) = getARM64XALUOOp(OFCC, CC.getValue(0), DAG); - SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); - - return DAG.getNode(ARM64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal, - Overflow); - } - - if (CC.getOpcode() == ISD::SETCC) - return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal, - cast(CC.getOperand(2))->get()); - else - return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal, - FVal, ISD::SETNE); -} - -SDValue ARM64TargetLowering::LowerSELECT_CC(SDValue Op, - SelectionDAG &DAG) const { - ISD::CondCode CC = cast(Op.getOperand(4))->get(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue TVal = Op.getOperand(2); - SDValue FVal = Op.getOperand(3); - SDLoc dl(Op); - - // Handle f128 first, because it will result in a comparison of some RTLIB - // call result against zero. - if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - - // If softenSetCCOperands returned a scalar, we need to compare the result - // against zero to select between true and false values. - if (RHS.getNode() == 0) { - RHS = DAG.getConstant(0, LHS.getValueType()); - CC = ISD::SETNE; - } - } - - // Handle integers first. - if (LHS.getValueType().isInteger()) { - assert((LHS.getValueType() == RHS.getValueType()) && - (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); - - unsigned Opcode = ARM64ISD::CSEL; - - // If both the TVal and the FVal are constants, see if we can swap them in - // order to for a CSINV or CSINC out of them. - ConstantSDNode *CFVal = dyn_cast(FVal); - ConstantSDNode *CTVal = dyn_cast(TVal); - - if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } else if (TVal.getOpcode() == ISD::XOR) { - // If TVal is a NOT we want to swap TVal and FVal so that we can match - // with a CSINV rather than a CSEL. - ConstantSDNode *CVal = dyn_cast(TVal.getOperand(1)); - - if (CVal && CVal->isAllOnesValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - } else if (TVal.getOpcode() == ISD::SUB) { - // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so - // that we can match with a CSNEG rather than a CSEL. - ConstantSDNode *CVal = dyn_cast(TVal.getOperand(0)); - - if (CVal && CVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - } else if (CTVal && CFVal) { - const int64_t TrueVal = CTVal->getSExtValue(); - const int64_t FalseVal = CFVal->getSExtValue(); - bool Swap = false; - - // If both TVal and FVal are constants, see if FVal is the - // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC - // instead of a CSEL in that case. - if (TrueVal == ~FalseVal) { - Opcode = ARM64ISD::CSINV; - } else if (TrueVal == -FalseVal) { - Opcode = ARM64ISD::CSNEG; - } else if (TVal.getValueType() == MVT::i32) { - // If our operands are only 32-bit wide, make sure we use 32-bit - // arithmetic for the check whether we can use CSINC. 
This ensures that - // the addition in the check will wrap around properly in case there is - // an overflow (which would not be the case if we do the check with - // 64-bit arithmetic). - const uint32_t TrueVal32 = CTVal->getZExtValue(); - const uint32_t FalseVal32 = CFVal->getZExtValue(); - - if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { - Opcode = ARM64ISD::CSINC; - - if (TrueVal32 > FalseVal32) { - Swap = true; - } - } - // 64-bit check whether we can use CSINC. - } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { - Opcode = ARM64ISD::CSINC; - - if (TrueVal > FalseVal) { - Swap = true; - } - } - - // Swap TVal and FVal if necessary. - if (Swap) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - - if (Opcode != ARM64ISD::CSEL) { - // Drop FVal since we can get its value by simply inverting/negating - // TVal. - FVal = TVal; - } - } - - SDValue CCVal; - SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl); - - EVT VT = Op.getValueType(); - return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); - } - - // Now we know we're dealing with FP values. - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - assert(LHS.getValueType() == RHS.getValueType()); - EVT VT = Op.getValueType(); - - // Try to match this select into a max/min operation, which have dedicated - // opcode in the instruction set. - // NOTE: This is not correct in the presence of NaNs, so we only enable this - // in no-NaNs mode. - if (getTargetMachine().Options.NoNaNsFPMath) { - if (selectCCOpsAreFMaxCompatible(LHS, FVal) && - selectCCOpsAreFMaxCompatible(RHS, TVal)) { - CC = ISD::getSetCCSwappedOperands(CC); - std::swap(TVal, FVal); - } - - if (selectCCOpsAreFMaxCompatible(LHS, TVal) && - selectCCOpsAreFMaxCompatible(RHS, FVal)) { - switch (CC) { - default: - break; - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETUGT: - case ISD::SETUGE: - case ISD::SETOGT: - case ISD::SETOGE: - return DAG.getNode(ARM64ISD::FMAX, dl, VT, TVal, FVal); - break; - case ISD::SETLT: - case ISD::SETLE: - case ISD::SETULT: - case ISD::SETULE: - case ISD::SETOLT: - case ISD::SETOLE: - return DAG.getNode(ARM64ISD::FMIN, dl, VT, TVal, FVal); - break; - } - } - } - - // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead - // and do the comparison. - SDValue Cmp = emitComparison(LHS, RHS, dl, DAG); - - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two CSELs to implement. - ARM64CC::CondCode CC1, CC2; - changeFPCCToARM64CC(CC, CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - SDValue CS1 = DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); - - // If we need a second CSEL, emit it, using the output of the first as the - // RHS. We're effectively OR'ing the two CC's together. - if (CC2 != ARM64CC::AL) { - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); - return DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); - } - - // Otherwise, return the output of the first CSEL. - return CS1; -} - -SDValue ARM64TargetLowering::LowerJumpTable(SDValue Op, - SelectionDAG &DAG) const { - // Jump table entries as PC relative offsets. No additional tweaking - // is necessary here. Just get the address of the jump table. 
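
Before moving on to jump tables: the constant-pair logic of LowerSELECT_CC
above (leaving aside its operand-swapping refinements) can be summarized as a
standalone helper; the enum and function are illustrative, not LLVM API:

    #include <cassert>
    #include <cstdint>

    enum class CondSel { CSEL, CSINV, CSNEG, CSINC };

    static CondSel classify(int64_t T, int64_t F) {
      if (T == ~F) return CondSel::CSINV;  // invert instead of loading F
      if (T == -F) return CondSel::CSNEG;  // negate instead of loading F
      if (T == F + 1 || T + 1 == F)
        return CondSel::CSINC;             // increment instead of loading F
      return CondSel::CSEL;                // genuinely needs both values
    }

    int main() {
      assert(classify(0, -1) == CondSel::CSINV);
      assert(classify(7, -7) == CondSel::CSNEG);
      assert(classify(5, 4) == CondSel::CSINC);
      assert(classify(3, 9) == CondSel::CSEL);
    }
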
- JumpTableSDNode *JT = cast(Op); - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - - SDValue Hi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_PAGE); - SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo); -} - -SDValue ARM64TargetLowering::LowerConstantPool(SDValue Op, - SelectionDAG &DAG) const { - ConstantPoolSDNode *CP = cast(Op); - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - - if (getTargetMachine().getCodeModel() == CodeModel::Large) { - // Use the GOT for the large code model on iOS. - if (Subtarget->isTargetMachO()) { - SDValue GotAddr = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - ARM64II::MO_GOT); - return DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, GotAddr); - } - - const unsigned char MO_NC = ARM64II::MO_NC; - return DAG.getNode( - ARM64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), ARM64II::MO_G3), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), ARM64II::MO_G2 | MO_NC), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), ARM64II::MO_G1 | MO_NC), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), ARM64II::MO_G0 | MO_NC)); - } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on - // ELF, the only valid one on Darwin. - SDValue Hi = - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), ARM64II::MO_PAGE); - SDValue Lo = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - - SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - } -} - -SDValue ARM64TargetLowering::LowerBlockAddress(SDValue Op, - SelectionDAG &DAG) const { - const BlockAddress *BA = cast(Op)->getBlockAddress(); - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - if (getTargetMachine().getCodeModel() == CodeModel::Large && - !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = ARM64II::MO_NC; - return DAG.getNode( - ARM64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G3), - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G2 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G1 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G0 | MO_NC)); - } else { - SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_PAGE); - SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_PAGEOFF | - ARM64II::MO_NC); - SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - } -} - -SDValue ARM64TargetLowering::LowerDarwin_VASTART(SDValue Op, - SelectionDAG &DAG) const { - ARM64FunctionInfo *FuncInfo = - DAG.getMachineFunction().getInfo(); - - SDLoc DL(Op); - SDValue FR = - DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); - const Value *SV = cast(Op.getOperand(2))->getValue(); - return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), - MachinePointerInfo(SV), false, false, 0); -} - -SDValue ARM64TargetLowering::LowerAAPCS_VASTART(SDValue Op, - SelectionDAG &DAG) const { - // The layout of the va_list struct is 
specified in the AArch64 Procedure Call - // Standard, section B.3. - MachineFunction &MF = DAG.getMachineFunction(); - ARM64FunctionInfo *FuncInfo = MF.getInfo(); - SDLoc DL(Op); - - SDValue Chain = Op.getOperand(0); - SDValue VAList = Op.getOperand(1); - const Value *SV = cast(Op.getOperand(2))->getValue(); - SmallVector MemOps; - - // void *__stack at offset 0 - SDValue Stack = - DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); - MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, - MachinePointerInfo(SV), false, false, 8)); - - // void *__gr_top at offset 8 - int GPRSize = FuncInfo->getVarArgsGPRSize(); - if (GPRSize > 0) { - SDValue GRTop, GRTopAddr; - - GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(8, getPointerTy())); - - GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); - GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, - DAG.getConstant(GPRSize, getPointerTy())); - - MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, - MachinePointerInfo(SV, 8), false, false, 8)); - } - - // void *__vr_top at offset 16 - int FPRSize = FuncInfo->getVarArgsFPRSize(); - if (FPRSize > 0) { - SDValue VRTop, VRTopAddr; - VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(16, getPointerTy())); - - VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); - VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, - DAG.getConstant(FPRSize, getPointerTy())); - - MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, - MachinePointerInfo(SV, 16), false, false, 8)); - } - - // int __gr_offs at offset 24 - SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(24, getPointerTy())); - MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), - GROffsAddr, MachinePointerInfo(SV, 24), false, - false, 4)); - - // int __vr_offs at offset 28 - SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(28, getPointerTy())); - MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), - VROffsAddr, MachinePointerInfo(SV, 28), false, - false, 4)); - - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], - MemOps.size()); -} - -SDValue ARM64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { - return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) - : LowerAAPCS_VASTART(Op, DAG); -} - -SDValue ARM64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { - // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single - // pointer. - unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; - const Value *DestSV = cast(Op.getOperand(3))->getValue(); - const Value *SrcSV = cast(Op.getOperand(4))->getValue(); - - return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1), - Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32), - 8, false, false, MachinePointerInfo(DestSV), - MachinePointerInfo(SrcSV)); -} - -SDValue ARM64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetDarwin() && - "automatic va_arg instruction only works on Darwin"); - - const Value *V = cast(Op.getOperand(2))->getValue(); - EVT VT = Op.getValueType(); - SDLoc DL(Op); - SDValue Chain = Op.getOperand(0); - SDValue Addr = Op.getOperand(1); - unsigned Align = Op.getConstantOperandVal(3); - - SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr, - MachinePointerInfo(V), false, false, false, 0); - Chain = VAList.getValue(1); - - if (Align > 8) { - assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); - VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(Align - 1, getPointerTy())); - VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, - DAG.getConstant(-(int64_t)Align, getPointerTy())); - } - - Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); - uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); - - // Scalar integer and FP values smaller than 64 bits are implicitly extended - // up to 64 bits. At the very least, we have to increase the striding of the - // vaargs list to match this, and for FP values we need to introduce - // FP_ROUND nodes as well. - if (VT.isInteger() && !VT.isVector()) - ArgSize = 8; - bool NeedFPTrunc = false; - if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { - ArgSize = 8; - NeedFPTrunc = true; - } - - // Increment the pointer, VAList, to the next vaarg - SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(ArgSize, getPointerTy())); - // Store the incremented VAList to the legalized pointer - SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), - false, false, 0); - - // Load the actual argument out of the pointer VAList - if (NeedFPTrunc) { - // Load the value as an f64. - SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, - MachinePointerInfo(), false, false, false, 0); - // Round the value down to an f32. - SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), - DAG.getIntPtrConstant(1)); - SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; - // Merge the rounded value with the chain output of the load. 
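// (Sketch of the f32 case handled above, scalar model only: every Darwin
// vararg slot is 8 bytes, so a float travels as an f64 and is rounded back
// after the load; the trailing '1' operand of the FP_ROUND records that the
// truncation is exact because the double holds a promoted float.)
float loadFloatVarArg(const double *Slot) { return static_cast<float>(*Slot); }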
- return DAG.getMergeValues(Ops, 2, DL); - } - - return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, - false, false, 0); -} - -SDValue ARM64TargetLowering::LowerFRAMEADDR(SDValue Op, - SelectionDAG &DAG) const { - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setFrameAddressIsTaken(true); - - EVT VT = Op.getValueType(); - SDLoc DL(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, ARM64::FP, VT); - while (Depth--) - FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, - MachinePointerInfo(), false, false, false, 0); - return FrameAddr; -} - -SDValue ARM64TargetLowering::LowerRETURNADDR(SDValue Op, - SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - MFI->setReturnAddressIsTaken(true); - - EVT VT = Op.getValueType(); - SDLoc DL(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - if (Depth) { - SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - SDValue Offset = DAG.getConstant(8, getPointerTy()); - return DAG.getLoad(VT, DL, DAG.getEntryNode(), - DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), - MachinePointerInfo(), false, false, false, 0); - } - - // Return LR, which contains the return address. Mark it an implicit live-in. - unsigned Reg = MF.addLiveIn(ARM64::LR, &ARM64::GPR64RegClass); - return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); -} - -/// LowerShiftRightParts - Lower SRA_PARTS, which returns two -/// i64 values and take a 2 x i64 value to shift plus a shift amount. -SDValue ARM64TargetLowering::LowerShiftRightParts(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getNumOperands() == 3 && "Not a double-shift!"); - EVT VT = Op.getValueType(); - unsigned VTBits = VT.getSizeInBits(); - SDLoc dl(Op); - SDValue ShOpLo = Op.getOperand(0); - SDValue ShOpHi = Op.getOperand(1); - SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; - unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; - - assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); - - SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, - DAG.getConstant(VTBits, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); - SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, - DAG.getConstant(VTBits, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - - SDValue Cmp = - emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), dl, DAG); - SDValue CCVal = DAG.getConstant(ARM64CC::GE, MVT::i32); - - SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); - SDValue Lo = - DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); - - // ARM64 shifts larger than the register width are wrapped rather than - // clamped, so we can't just emit "hi >> x". - SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); - SDValue TrueValHi = Opc == ISD::SRA - ? DAG.getNode(Opc, dl, VT, ShOpHi, - DAG.getConstant(VTBits - 1, MVT::i64)) - : DAG.getConstant(0, VT); - SDValue Hi = - DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); - - SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, 2, dl); -} - -/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two -/// i64 values and take a 2 x i64 value to shift plus a shift amount. 
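// A rough scalar model (illustrative, not from this patch) of the SRL_PARTS
// lowering above for a 128-bit value split into {Lo, Hi}; LowerShiftLeftParts
// below mirrors it for left shifts. The DAG version computes both arms and
// selects with CSEL on "GE" (ExtraShAmt = ShAmt - 64 >= 0), since AArch64
// register shifts wrap rather than clamp. Valid for 0 < Sh < 128.
#include <cstdint>
struct Parts { uint64_t Lo, Hi; };
Parts lshr128(Parts V, unsigned Sh) {
  Parts R;
  if (Sh >= 64) {                                // ExtraShAmt >= 0 arm
    R.Lo = V.Hi >> (Sh - 64);
    R.Hi = 0;                                    // SRA would use V.Hi >> 63
  } else {
    R.Lo = (V.Lo >> Sh) | (V.Hi << (64 - Sh));   // fuse the two halves
    R.Hi = V.Hi >> Sh;
  }
  return R;
}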
-SDValue ARM64TargetLowering::LowerShiftLeftParts(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getNumOperands() == 3 && "Not a double-shift!"); - EVT VT = Op.getValueType(); - unsigned VTBits = VT.getSizeInBits(); - SDLoc dl(Op); - SDValue ShOpLo = Op.getOperand(0); - SDValue ShOpHi = Op.getOperand(1); - SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; - - assert(Op.getOpcode() == ISD::SHL_PARTS); - SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, - DAG.getConstant(VTBits, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); - SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, - DAG.getConstant(VTBits, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); - SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - - SDValue Cmp = - emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), dl, DAG); - SDValue CCVal = DAG.getConstant(ARM64CC::GE, MVT::i32); - SDValue Hi = DAG.getNode(ARM64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); - - // ARM64 shifts of larger than register sizes are wrapped rather than clamped, - // so we can't just emit "lo << a" if a is too big. - SDValue TrueValLo = DAG.getConstant(0, VT); - SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - SDValue Lo = - DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); - - SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, 2, dl); -} - -bool -ARM64TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { - // The ARM64 target doesn't support folding offsets into global addresses. - return false; -} - -bool ARM64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. - // FIXME: We should be able to handle f128 as well with a clever lowering. - if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) - return true; - - if (VT == MVT::f64) - return ARM64_AM::getFP64Imm(Imm) != -1; - else if (VT == MVT::f32) - return ARM64_AM::getFP32Imm(Imm) != -1; - return false; -} - -//===----------------------------------------------------------------------===// -// ARM64 Optimization Hooks -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// ARM64 Inline Assembly Support -//===----------------------------------------------------------------------===// - -// Table of Constraints -// TODO: This is the current set of constraints supported by ARM for the -// compiler, not all of them may make sense, e.g. S may be difficult to support. 
-// -// r - A general register -// w - An FP/SIMD register of some size in the range v0-v31 -// x - An FP/SIMD register of some size in the range v0-v15 -// I - Constant that can be used with an ADD instruction -// J - Constant that can be used with a SUB instruction -// K - Constant that can be used with a 32-bit logical instruction -// L - Constant that can be used with a 64-bit logical instruction -// M - Constant that can be used as a 32-bit MOV immediate -// N - Constant that can be used as a 64-bit MOV immediate -// Q - A memory reference with base register and no offset -// S - A symbolic address -// Y - Floating point constant zero -// Z - Integer constant zero -// -// Note that general register operands will be output using their 64-bit x -// register name, whatever the size of the variable, unless the asm operand -// is prefixed by the %w modifier. Floating-point and SIMD register operands -// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or -// %q modifier. - -/// getConstraintType - Given a constraint letter, return the type of -/// constraint it is for this target. -ARM64TargetLowering::ConstraintType -ARM64TargetLowering::getConstraintType(const std::string &Constraint) const { - if (Constraint.size() == 1) { - switch (Constraint[0]) { - default: - break; - case 'z': - return C_Other; - case 'x': - case 'w': - return C_RegisterClass; - // An address with a single base register. Due to the way we - // currently handle addresses it is the same as 'r'. - case 'Q': - return C_Memory; - } - } - return TargetLowering::getConstraintType(Constraint); -} - -/// Examine constraint type and operand type and determine a weight value. -/// This object must already have been set up with the operand type -/// and the current alternative constraint selected. -TargetLowering::ConstraintWeight -ARM64TargetLowering::getSingleConstraintMatchWeight( - AsmOperandInfo &info, const char *constraint) const { - ConstraintWeight weight = CW_Invalid; - Value *CallOperandVal = info.CallOperandVal; - // If we don't have a value, we can't do a match, - // but allow it at the lowest weight. - if (CallOperandVal == NULL) - return CW_Default; - Type *type = CallOperandVal->getType(); - // Look at the constraint type. - switch (*constraint) { - default: - weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); - break; - case 'x': - case 'w': - if (type->isFloatingPointTy() || type->isVectorTy()) - weight = CW_Register; - break; - case 'z': - weight = CW_Constant; - break; - } - return weight; -} - -std::pair -ARM64TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const { - if (Constraint.size() == 1) { - switch (Constraint[0]) { - case 'r': - if (VT.getSizeInBits() == 64) - return std::make_pair(0U, &ARM64::GPR64commonRegClass); - return std::make_pair(0U, &ARM64::GPR32commonRegClass); - case 'w': - if (VT == MVT::f32) - return std::make_pair(0U, &ARM64::FPR32RegClass); - if (VT.getSizeInBits() == 64) - return std::make_pair(0U, &ARM64::FPR64RegClass); - if (VT.getSizeInBits() == 128) - return std::make_pair(0U, &ARM64::FPR128RegClass); - break; - // The instructions that this constraint is designed for can - // only take 128-bit registers so just use that regclass. 
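// (Likely rationale, not stated in the source: several by-element SIMD forms
// encode their index register in only four bits and so can address only
// v0-v15; FPR128_lo is that restricted class, while plain 'w' carries no
// such restriction.)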
- case 'x': - if (VT.getSizeInBits() == 128) - return std::make_pair(0U, &ARM64::FPR128_loRegClass); - break; - } - } - if (StringRef("{cc}").equals_lower(Constraint)) - return std::make_pair(unsigned(ARM64::CPSR), &ARM64::CCRRegClass); - - // Use the default implementation in TargetLowering to convert the register - // constraint into a member of a register class. - std::pair Res; - Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); - - // Not found as a standard register? - if (Res.second == 0) { - unsigned Size = Constraint.size(); - if ((Size == 4 || Size == 5) && Constraint[0] == '{' && - tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { - const std::string Reg = - std::string(&Constraint[2], &Constraint[Size - 1]); - int RegNo = atoi(Reg.c_str()); - if (RegNo >= 0 && RegNo <= 31) { - // v0 - v31 are aliases of q0 - q31. - // By default we'll emit v0-v31 for this unless there's a modifier where - // we'll emit the correct register as well. - Res.first = ARM64::FPR128RegClass.getRegister(RegNo); - Res.second = &ARM64::FPR128RegClass; - } - } - } - - return Res; -} - -/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops -/// vector. If it is invalid, don't add anything to Ops. -void ARM64TargetLowering::LowerAsmOperandForConstraint( - SDValue Op, std::string &Constraint, std::vector &Ops, - SelectionDAG &DAG) const { - SDValue Result(0, 0); - - // Currently only support length 1 constraints. - if (Constraint.length() != 1) - return; - - char ConstraintLetter = Constraint[0]; - switch (ConstraintLetter) { - default: - break; - - // This set of constraints deal with valid constants for various instructions. - // Validate and return a target constant for them if we can. - case 'z': { - // 'z' maps to xzr or wzr so it needs an input of 0. - ConstantSDNode *C = dyn_cast(Op); - if (!C || C->getZExtValue() != 0) - return; - - if (Op.getValueType() == MVT::i64) - Result = DAG.getRegister(ARM64::XZR, MVT::i64); - else - Result = DAG.getRegister(ARM64::WZR, MVT::i32); - break; - } - - case 'I': - case 'J': - case 'K': - case 'L': - case 'M': - case 'N': - ConstantSDNode *C = dyn_cast(Op); - if (!C) - return; - - // Grab the value and do some validation. - uint64_t CVal = C->getZExtValue(); - switch (ConstraintLetter) { - // The I constraint applies only to simple ADD or SUB immediate operands: - // i.e. 0 to 4095 with optional shift by 12 - // The J constraint applies only to ADD or SUB immediates that would be - // valid when negated, i.e. if [an add pattern] were to be output as a SUB - // instruction [or vice versa], in other words -1 to -4095 with optional - // left shift by 12. - case 'I': - if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) - break; - return; - case 'J': { - uint64_t NVal = -C->getSExtValue(); - if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) - break; - return; - } - // The K and L constraints apply *only* to logical immediates, including - // what used to be the MOVI alias for ORR (though the MOVI alias has now - // been removed and MOV should be used). So these constraints have to - // distinguish between bit patterns that are valid 32-bit or 64-bit - // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but - // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice - // versa. 
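// (Concrete instance of the bimm32/bimm64 distinction described above,
// illustrative only: a 32-bit bitmask immediate is defined by its pattern
// replicated across all 64 bits, so the K-valid 0xaaaaaaaa corresponds to
// the replication below, not to its zero-extension.)
#include <cstdint>
static_assert((UINT64_C(0xAAAAAAAA) << 32 | UINT64_C(0xAAAAAAAA)) ==
              UINT64_C(0xAAAAAAAAAAAAAAAA),
              "bimm32 pattern is the 64-bit replication");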
- case 'K': - if (ARM64_AM::isLogicalImmediate(CVal, 32)) - break; - return; - case 'L': - if (ARM64_AM::isLogicalImmediate(CVal, 64)) - break; - return; - // The M and N constraints are a superset of K and L respectively, for use - // with the MOV (immediate) alias. As well as the logical immediates they - // also match 32 or 64-bit immediates that can be loaded either using a - // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca - // (M) or 64-bit 0x1234000000000000 (N) etc. - // As a note some of this code is liberally stolen from the asm parser. - case 'M': { - if (!isUInt<32>(CVal)) - return; - if (ARM64_AM::isLogicalImmediate(CVal, 32)) - break; - if ((CVal & 0xFFFF) == CVal) - break; - if ((CVal & 0xFFFF0000ULL) == CVal) - break; - uint64_t NCVal = ~(uint32_t)CVal; - if ((NCVal & 0xFFFFULL) == NCVal) - break; - if ((NCVal & 0xFFFF0000ULL) == NCVal) - break; - return; - } - case 'N': { - if (ARM64_AM::isLogicalImmediate(CVal, 64)) - break; - if ((CVal & 0xFFFFULL) == CVal) - break; - if ((CVal & 0xFFFF0000ULL) == CVal) - break; - if ((CVal & 0xFFFF00000000ULL) == CVal) - break; - if ((CVal & 0xFFFF000000000000ULL) == CVal) - break; - uint64_t NCVal = ~CVal; - if ((NCVal & 0xFFFFULL) == NCVal) - break; - if ((NCVal & 0xFFFF0000ULL) == NCVal) - break; - if ((NCVal & 0xFFFF00000000ULL) == NCVal) - break; - if ((NCVal & 0xFFFF000000000000ULL) == NCVal) - break; - return; - } - default: - return; - } - - // All assembler immediates are 64-bit integers. - Result = DAG.getTargetConstant(CVal, MVT::i64); - break; - } - - if (Result.getNode()) { - Ops.push_back(Result); - return; - } - - return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); -} - -//===----------------------------------------------------------------------===// -// ARM64 Advanced SIMD Support -//===----------------------------------------------------------------------===// - -/// WidenVector - Given a value in the V64 register class, produce the -/// equivalent value in the V128 register class. -static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { - EVT VT = V64Reg.getValueType(); - unsigned NarrowSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); - SDLoc DL(V64Reg); - - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), - V64Reg, DAG.getConstant(0, MVT::i32)); -} - -/// getExtFactor - Determine the adjustment factor for the position when -/// generating an "extract from vector registers" instruction. -static unsigned getExtFactor(SDValue &V) { - EVT EltType = V.getValueType().getVectorElementType(); - return EltType.getSizeInBits() / 8; -} - -/// NarrowVector - Given a value in the V128 register class, produce the -/// equivalent value in the V64 register class. -static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { - EVT VT = V128Reg.getValueType(); - unsigned WideSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); - SDLoc DL(V128Reg); - - return DAG.getTargetExtractSubreg(ARM64::dsub, DL, NarrowTy, V128Reg); -} - -// Gather data to see if the operation can be modelled as a -// shuffle in combination with VEXTs. 
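// An illustrative model (assumed 4-lane i32 case, not part of the patch) of
// the rewrite this routine aims for: a BUILD_VECTOR whose lanes all come
// from at most two sources becomes one shuffle over concat(A, B), with mask
// indices 0-3 picking from A and 4-7 from B (undef lanes not modelled).
#include <array>
std::array<int, 4> applyShuffleMask(const std::array<int, 4> &A,
                                    const std::array<int, 4> &B,
                                    const std::array<int, 4> &Mask) {
  std::array<int, 4> R{};
  for (int i = 0; i < 4; ++i)
    R[i] = Mask[i] < 4 ? A[Mask[i]] : B[Mask[i] - 4];  // lane from A or B
  return R;
}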
-SDValue ARM64TargetLowering::ReconstructShuffle(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - EVT VT = Op.getValueType(); - unsigned NumElts = VT.getVectorNumElements(); - - SmallVector SourceVecs; - SmallVector MinElts; - SmallVector MaxElts; - - for (unsigned i = 0; i < NumElts; ++i) { - SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) - continue; - else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { - // A shuffle can only come from building a vector from various - // elements of other vectors. - return SDValue(); - } - - // Record this extraction against the appropriate vector if possible... - SDValue SourceVec = V.getOperand(0); - unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); - bool FoundSource = false; - for (unsigned j = 0; j < SourceVecs.size(); ++j) { - if (SourceVecs[j] == SourceVec) { - if (MinElts[j] > EltNo) - MinElts[j] = EltNo; - if (MaxElts[j] < EltNo) - MaxElts[j] = EltNo; - FoundSource = true; - break; - } - } - - // Or record a new source if not... - if (!FoundSource) { - SourceVecs.push_back(SourceVec); - MinElts.push_back(EltNo); - MaxElts.push_back(EltNo); - } - } - - // Currently only do something sane when at most two source vectors - // are involved. - if (SourceVecs.size() > 2) - return SDValue(); - - SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; - int VEXTOffsets[2] = { 0, 0 }; - - // This loop extracts the usage patterns of the source vectors - // and prepares appropriate SDValues for a shuffle if possible. - for (unsigned i = 0; i < SourceVecs.size(); ++i) { - if (SourceVecs[i].getValueType() == VT) { - // No VEXT necessary - ShuffleSrcs[i] = SourceVecs[i]; - VEXTOffsets[i] = 0; - continue; - } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { - // It probably isn't worth padding out a smaller vector just to - // break it down again in a shuffle. - return SDValue(); - } - - // Don't attempt to extract subvectors from BUILD_VECTOR sources - // that expand or trunc the original value. - // TODO: We can try to bitcast and ANY_EXTEND the result but - // we need to consider the cost of vector ANY_EXTEND, and the - // legality of all the types. - if (SourceVecs[i].getValueType().getVectorElementType() != - VT.getVectorElementType()) - return SDValue(); - - // Since only 64-bit and 128-bit vectors are legal on ARM and - // we've eliminated the other cases...
- assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts && - "unexpected vector sizes in ReconstructShuffle"); - - if (MaxElts[i] - MinElts[i] >= NumElts) { - // Span too large for a VEXT to cope - return SDValue(); - } - - if (MinElts[i] >= NumElts) { - // The extraction can just take the second half - VEXTOffsets[i] = NumElts; - ShuffleSrcs[i] = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], - DAG.getIntPtrConstant(NumElts)); - } else if (MaxElts[i] < NumElts) { - // The extraction can just take the first half - VEXTOffsets[i] = 0; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], DAG.getIntPtrConstant(0)); - } else { - // An actual VEXT is needed - VEXTOffsets[i] = MinElts[i]; - SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], DAG.getIntPtrConstant(0)); - SDValue VEXTSrc2 = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], - DAG.getIntPtrConstant(NumElts)); - unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1); - ShuffleSrcs[i] = DAG.getNode(ARM64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2, - DAG.getConstant(Imm, MVT::i32)); - } - } - - SmallVector Mask; - - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) { - Mask.push_back(-1); - continue; - } - - SDValue ExtractVec = Entry.getOperand(0); - int ExtractElt = - cast(Op.getOperand(i).getOperand(1))->getSExtValue(); - if (ExtractVec == SourceVecs[0]) { - Mask.push_back(ExtractElt - VEXTOffsets[0]); - } else { - Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); - } - } - - // Final check before we try to produce nonsense... - if (isShuffleMaskLegal(Mask, VT)) - return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], - &Mask[0]); - - return SDValue(); -} - -// check if an EXT instruction can handle the shuffle mask when the -// vector sources of the shuffle are the same. -static bool isSingletonEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { - unsigned NumElts = VT.getVectorNumElements(); - - // Assume that the first shuffle index is not UNDEF. Fail if it is. - if (M[0] < 0) - return false; - - Imm = M[0]; - - // If this is a VEXT shuffle, the immediate value is the index of the first - // element. The other shuffle indices must be the successive elements after - // the first one. - unsigned ExpectedElt = Imm; - for (unsigned i = 1; i < NumElts; ++i) { - // Increment the expected index. If it wraps around, just follow it - // back to index zero and keep going. - ++ExpectedElt; - if (ExpectedElt == NumElts) - ExpectedElt = 0; - - if (M[i] < 0) - continue; // ignore UNDEF indices - if (ExpectedElt != static_cast(M[i])) - return false; - } - - return true; -} - -// check if an EXT instruction can handle the shuffle mask when the -// vector sources of the shuffle are different. -static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, - unsigned &Imm) { - unsigned NumElts = VT.getVectorNumElements(); - ReverseEXT = false; - - // Assume that the first shuffle index is not UNDEF. Fail if it is. - if (M[0] < 0) - return false; - - Imm = M[0]; - - // If this is a VEXT shuffle, the immediate value is the index of the first - // element. The other shuffle indices must be the successive elements after - // the first one. - unsigned ExpectedElt = Imm; - for (unsigned i = 1; i < NumElts; ++i) { - // Increment the expected index. If it wraps around, it may still be - // a VEXT but the source vectors must be swapped. 
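// (Worked example: with two 4-lane sources the combined lane space is 0-7,
// so the mask <6,7,0,1> starts at 6 and wraps past 7; it still matches as an
// EXT, but with the sources swapped and the immediate adjusted below to
// 6 - 4 = 2.)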
- ExpectedElt += 1; - if (ExpectedElt == NumElts * 2) { - ExpectedElt = 0; - ReverseEXT = true; - } - - if (M[i] < 0) - continue; // ignore UNDEF indices - if (ExpectedElt != static_cast(M[i])) - return false; - } - - // Adjust the index value if the source operands will be swapped. - if (ReverseEXT) - Imm -= NumElts; - - return true; -} - -/// isREVMask - Check if a vector shuffle corresponds to a REV -/// instruction with the specified blocksize. (The order of the elements -/// within each block of the vector is reversed.) -static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { - assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && - "Only possible block sizes for REV are: 16, 32, 64"); - - unsigned EltSz = VT.getVectorElementType().getSizeInBits(); - if (EltSz == 64) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - unsigned BlockElts = M[0] + 1; - // If the first shuffle index is UNDEF, be optimistic. - if (M[0] < 0) - BlockElts = BlockSize / EltSz; - - if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) - return false; - - for (unsigned i = 0; i < NumElts; ++i) { - if (M[i] < 0) - continue; // ignore UNDEF indices - if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) - return false; - } - - return true; -} - -static bool isZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != Idx) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) - return false; - Idx += 1; - } - - return true; -} - -static bool isUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i != NumElts; ++i) { - if (M[i] < 0) - continue; // ignore UNDEF indices - if ((unsigned)M[i] != 2 * i + WhichResult) - return false; - } - - return true; -} - -static bool isTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) - return false; - } - return true; -} - -/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of -/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". -/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. -static bool isZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != Idx) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) - return false; - Idx += 1; - } - - return true; -} - -/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of -/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". -/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, -static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned Half = VT.getVectorNumElements() / 2; - WhichResult = (M[0] == 0 ? 
0 : 1); - for (unsigned j = 0; j != 2; ++j) { - unsigned Idx = WhichResult; - for (unsigned i = 0; i != Half; ++i) { - int MIdx = M[i + j * Half]; - if (MIdx >= 0 && (unsigned)MIdx != Idx) - return false; - Idx += 2; - } - } - - return true; -} - -/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of -/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". -/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. -static bool isTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) - return false; - } - return true; -} - -/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit -/// the specified operations to build the shuffle. -static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, - SDValue RHS, SelectionDAG &DAG, - SDLoc dl) { - unsigned OpNum = (PFEntry >> 26) & 0x0F; - unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); - unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); - - enum { - OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> - OP_VREV, - OP_VDUP0, - OP_VDUP1, - OP_VDUP2, - OP_VDUP3, - OP_VEXT1, - OP_VEXT2, - OP_VEXT3, - OP_VUZPL, // VUZP, left result - OP_VUZPR, // VUZP, right result - OP_VZIPL, // VZIP, left result - OP_VZIPR, // VZIP, right result - OP_VTRNL, // VTRN, left result - OP_VTRNR // VTRN, right result - }; - - if (OpNum == OP_COPY) { - if (LHSID == (1 * 9 + 2) * 9 + 3) - return LHS; - assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); - return RHS; - } - - SDValue OpLHS, OpRHS; - OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); - OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); - EVT VT = OpLHS.getValueType(); - - switch (OpNum) { - default: - llvm_unreachable("Unknown shuffle opcode!"); - case OP_VREV: - // VREV divides the vector in half and swaps within the half. 
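// (Illustration: on <4 x i32>, the REV64 chosen below swaps lanes within
// each 64-bit half, <a,b,c,d> -> <b,a,d,c>; REV32 and REV16 do the same at
// 32- and 16-bit block granularity for narrower elements.)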
- if (VT.getVectorElementType() == MVT::i32 || - VT.getVectorElementType() == MVT::f32) - return DAG.getNode(ARM64ISD::REV64, dl, VT, OpLHS); - // vrev <4 x i16> -> REV32 - if (VT.getVectorElementType() == MVT::i16) - return DAG.getNode(ARM64ISD::REV32, dl, VT, OpLHS); - // vrev <4 x i8> -> REV16 - assert(VT.getVectorElementType() == MVT::i8); - return DAG.getNode(ARM64ISD::REV16, dl, VT, OpLHS); - case OP_VDUP0: - case OP_VDUP1: - case OP_VDUP2: - case OP_VDUP3: { - EVT EltTy = VT.getVectorElementType(); - unsigned Opcode; - if (EltTy == MVT::i8) - Opcode = ARM64ISD::DUPLANE8; - else if (EltTy == MVT::i16) - Opcode = ARM64ISD::DUPLANE16; - else if (EltTy == MVT::i32 || EltTy == MVT::f32) - Opcode = ARM64ISD::DUPLANE32; - else if (EltTy == MVT::i64 || EltTy == MVT::f64) - Opcode = ARM64ISD::DUPLANE64; - else - llvm_unreachable("Invalid vector element type?"); - - if (VT.getSizeInBits() == 64) - OpLHS = WidenVector(OpLHS, DAG); - SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64); - return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); - } - case OP_VEXT1: - case OP_VEXT2: - case OP_VEXT3: { - unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); - return DAG.getNode(ARM64ISD::EXT, dl, VT, OpLHS, OpRHS, - DAG.getConstant(Imm, MVT::i32)); - } - case OP_VUZPL: - return DAG.getNode(ARM64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VUZPR: - return DAG.getNode(ARM64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VZIPL: - return DAG.getNode(ARM64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VZIPR: - return DAG.getNode(ARM64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VTRNL: - return DAG.getNode(ARM64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VTRNR: - return DAG.getNode(ARM64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - } -} - -static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, - SelectionDAG &DAG) { - // Check to see if we can use the TBL instruction. - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - SDLoc DL(Op); - - EVT EltVT = Op.getValueType().getVectorElementType(); - unsigned BytesPerElt = EltVT.getSizeInBits() / 8; - - SmallVector TBLMask; - for (int Val : ShuffleMask) { - for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { - unsigned Offset = Byte + Val * BytesPerElt; - TBLMask.push_back(DAG.getConstant(Offset, MVT::i32)); - } - } - - MVT IndexVT = MVT::v8i8; - unsigned IndexLen = 8; - if (Op.getValueType().getSizeInBits() == 128) { - IndexVT = MVT::v16i8; - IndexLen = 16; - } - - SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); - SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); - - SDValue Shuffle; - if (V2.getNode()->getOpcode() == ISD::UNDEF) { - if (IndexLen == 8) - V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); - Shuffle = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::arm64_neon_tbl1, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, &TBLMask[0], IndexLen)); - } else { - if (IndexLen == 8) { - V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); - Shuffle = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::arm64_neon_tbl1, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, &TBLMask[0], IndexLen)); - } else { - // FIXME: We cannot, for the moment, emit a TBL2 instruction because we - // cannot currently represent the register constraints on the input - // table registers. 
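// (Byte-level sketch of TBL, illustrative helper only: each destination byte
// selects a table byte by index, and out-of-range indices read as zero; the
// 64-bit paths above concatenate copies to form the full 16-byte table
// register TBL expects.)
#include <cstdint>
uint8_t tbl1Byte(const uint8_t Table[16], uint8_t Index) {
  return Index < 16 ? Table[Index] : 0;  // out-of-range selects 0, never traps
}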
- // Shuffle = DAG.getNode(ARM64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, - // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - // &TBLMask[0], IndexLen)); - Shuffle = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::arm64_neon_tbl2, MVT::i32), V1Cst, V2Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, &TBLMask[0], IndexLen)); - } - } - return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); -} - -static unsigned getDUPLANEOp(EVT EltType) { - if (EltType == MVT::i8) - return ARM64ISD::DUPLANE8; - if (EltType == MVT::i16) - return ARM64ISD::DUPLANE16; - if (EltType == MVT::i32 || EltType == MVT::f32) - return ARM64ISD::DUPLANE32; - if (EltType == MVT::i64 || EltType == MVT::f64) - return ARM64ISD::DUPLANE64; - - llvm_unreachable("Invalid vector element type?"); -} - -SDValue ARM64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - ShuffleVectorSDNode *SVN = cast(Op.getNode()); - - // Convert shuffles that are directly supported on NEON to target-specific - // DAG nodes, instead of keeping them as shuffles and matching them again - // during code selection. This is more efficient and avoids the possibility - // of inconsistencies between legalization and selection. - ArrayRef ShuffleMask = SVN->getMask(); - - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], - V1.getValueType().getSimpleVT())) { - int Lane = SVN->getSplatIndex(); - // If this is an undef splat, generate it via "just" vdup, if possible. - if (Lane == -1) - Lane = 0; - - if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) - return DAG.getNode(ARM64ISD::DUP, dl, V1.getValueType(), - V1.getOperand(0)); - // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- - // constant. If so, we can just reference the lane's definition directly. - if (V1.getOpcode() == ISD::BUILD_VECTOR && - !isa(V1.getOperand(Lane))) - return DAG.getNode(ARM64ISD::DUP, dl, VT, V1.getOperand(Lane)); - - // Otherwise, duplicate from the lane of the input vector. - unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); - - // SelectionDAGBuilder may have "helpfully" already extracted or concatenated - // to make a vector of the same size as this SHUFFLE. We can ignore the - // extract entirely, and canonicalise the concat using WidenVector.
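// (Worked example: a dup of lane 5 of concat(A, B) with 4-lane halves is
// just a dup of lane 1 of B widened back to a full register, and an
// extract_subvector offset merely shifts the lane number; the folds below
// implement exactly these two cases.)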
- if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { - Lane += cast(V1.getOperand(1))->getZExtValue(); - V1 = V1.getOperand(0); - } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { - unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; - Lane -= Idx * VT.getVectorNumElements() / 2; - V1 = WidenVector(V1.getOperand(Idx), DAG); - } else if (VT.getSizeInBits() == 64) - V1 = WidenVector(V1, DAG); - - return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64)); - } - - if (isREVMask(ShuffleMask, VT, 64)) - return DAG.getNode(ARM64ISD::REV64, dl, V1.getValueType(), V1, V2); - if (isREVMask(ShuffleMask, VT, 32)) - return DAG.getNode(ARM64ISD::REV32, dl, V1.getValueType(), V1, V2); - if (isREVMask(ShuffleMask, VT, 16)) - return DAG.getNode(ARM64ISD::REV16, dl, V1.getValueType(), V1, V2); - - bool ReverseEXT = false; - unsigned Imm; - if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { - if (ReverseEXT) - std::swap(V1, V2); - Imm *= getExtFactor(V1); - return DAG.getNode(ARM64ISD::EXT, dl, V1.getValueType(), V1, V2, - DAG.getConstant(Imm, MVT::i32)); - } else if (V2->getOpcode() == ISD::UNDEF && - isSingletonEXTMask(ShuffleMask, VT, Imm)) { - Imm *= getExtFactor(V1); - return DAG.getNode(ARM64ISD::EXT, dl, V1.getValueType(), V1, V1, - DAG.getConstant(Imm, MVT::i32)); - } - - unsigned WhichResult; - if (isZIPMask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::ZIP1 : ARM64ISD::ZIP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); - } - if (isUZPMask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::UZP1 : ARM64ISD::UZP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); - } - if (isTRNMask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::TRN1 : ARM64ISD::TRN2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); - } - - if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::ZIP1 : ARM64ISD::ZIP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); - } - if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::UZP1 : ARM64ISD::UZP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); - } - if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::TRN1 : ARM64ISD::TRN2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); - } - - // If the shuffle is not directly supported and it has 4 elements, use - // the PerfectShuffle-generated table to synthesize it from other shuffles. - unsigned NumElts = VT.getVectorNumElements(); - if (NumElts == 4) { - unsigned PFIndexes[4]; - for (unsigned i = 0; i != 4; ++i) { - if (ShuffleMask[i] < 0) - PFIndexes[i] = 8; - else - PFIndexes[i] = ShuffleMask[i]; - } - - // Compute the index in the perfect shuffle table. 
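// (Each PFIndex is 0-8, with 8 standing for an undef lane, so the four
// indices pack into a base-9 number; hypothetical example values:)
static_assert(0 * 9 * 9 * 9 + 1 * 9 * 9 + 2 * 9 + 3 == 102,
              "mask <0,1,2,3> packs to 102 in base 9");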
- unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + - PFIndexes[2] * 9 + PFIndexes[3]; - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - if (Cost <= 4) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); - } - - return GenerateTBL(Op, ShuffleMask, DAG); -} - -static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, - APInt &UndefBits) { - EVT VT = BVN->getValueType(0); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; - - for (unsigned i = 0; i < NumSplats; ++i) { - CnstBits <<= SplatBitSize; - UndefBits <<= SplatBitSize; - CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); - UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); - } - - return true; - } - - return false; -} - -SDValue ARM64TargetLowering::LowerVectorAND(SDValue Op, - SelectionDAG &DAG) const { - BuildVectorSDNode *BVN = - dyn_cast(Op.getOperand(1).getNode()); - SDValue LHS = Op.getOperand(0); - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - if (!BVN) - return Op; - - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - if (resolveBuildVector(BVN, CnstBits, UndefBits)) { - // We only have BIC vector immediate instruction, which is and-not. - CnstBits = ~CnstBits; - - // We make use of a little bit of goto ickiness in order to avoid having to - // duplicate the immediate matching logic for the undef toggled case. - bool SecondTry = false; - AttemptModImm: - - if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { - CnstBits = CnstBits.zextOrTrunc(64); - uint64_t CnstVal = CnstBits.getZExtValue(); - - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - } - - if (SecondTry) - goto FailedModImm; - SecondTry = true; - CnstBits = ~UndefBits; - goto AttemptModImm; - } - -// We can always fall back to a non-immediate AND. -FailedModImm: - return Op; -} - -// Specialized code to quickly find if PotentialBVec is a BuildVector that -// consists of only the same constant int value, returned in reference arg -// ConstVal -static bool isAllConstantBuildVector(const SDValue &PotentialBVec, - uint64_t &ConstVal) { - BuildVectorSDNode *Bvec = dyn_cast(PotentialBVec); - if (!Bvec) - return false; - ConstantSDNode *FirstElt = dyn_cast(Bvec->getOperand(0)); - if (!FirstElt) - return false; - EVT VT = Bvec->getValueType(0); - unsigned NumElts = VT.getVectorNumElements(); - for (unsigned i = 1; i < NumElts; ++i) - if (dyn_cast(Bvec->getOperand(i)) != FirstElt) - return false; - ConstVal = FirstElt->getZExtValue(); - return true; -} - -static unsigned getIntrinsicID(const SDNode *N) { - unsigned Opcode = N->getOpcode(); - switch (Opcode) { - default: - return Intrinsic::not_intrinsic; - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IID = cast(N->getOperand(0))->getZExtValue(); - if (IID < Intrinsic::num_intrinsics) - return IID; - return Intrinsic::not_intrinsic; - } - } -} - -// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), -// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a -// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. -// Also, logical shift right -> sri, with the same structure. -static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - if (!VT.isVector()) - return SDValue(); - - SDLoc DL(N); - - // Is the first op an AND? - const SDValue And = N->getOperand(0); - if (And.getOpcode() != ISD::AND) - return SDValue(); - - // Is the second op an shl or lshr? - SDValue Shift = N->getOperand(1); - // This will have been turned into: ARM64ISD::VSHL vector, #shift - // or ARM64ISD::VLSHR vector, #shift - unsigned ShiftOpc = Shift.getOpcode(); - if ((ShiftOpc != ARM64ISD::VSHL && ShiftOpc != ARM64ISD::VLSHR)) - return SDValue(); - bool IsShiftRight = ShiftOpc == ARM64ISD::VLSHR; - - // Is the shift amount constant? - ConstantSDNode *C2node = dyn_cast(Shift.getOperand(1)); - if (!C2node) - return SDValue(); - - // Is the and mask vector all constant? - uint64_t C1; - if (!isAllConstantBuildVector(And.getOperand(1), C1)) - return SDValue(); - - // Is C1 == ~C2, taking into account how much one can shift elements of a - // particular size? - uint64_t C2 = C2node->getZExtValue(); - unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits(); - if (C2 > ElemSizeInBits) - return SDValue(); - unsigned ElemMask = (1 << ElemSizeInBits) - 1; - if ((C1 & ElemMask) != (~C2 & ElemMask)) - return SDValue(); - - SDValue X = And.getOperand(0); - SDValue Y = Shift.getOperand(0); - - unsigned Intrin = - IsShiftRight ? 
Intrinsic::arm64_neon_vsri : Intrinsic::arm64_neon_vsli; - SDValue ResultSLI = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1)); - - DEBUG(dbgs() << "arm64-lower: transformed: \n"); - DEBUG(N->dump(&DAG)); - DEBUG(dbgs() << "into: \n"); - DEBUG(ResultSLI->dump(&DAG)); - - ++NumShiftInserts; - return ResultSLI; -} - -SDValue ARM64TargetLowering::LowerVectorOR(SDValue Op, - SelectionDAG &DAG) const { - // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) - if (EnableARM64SlrGeneration) { - SDValue Res = tryLowerToSLI(Op.getNode(), DAG); - if (Res.getNode()) - return Res; - } - - BuildVectorSDNode *BVN = - dyn_cast(Op.getOperand(0).getNode()); - SDValue LHS = Op.getOperand(1); - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - // OR commutes, so try swapping the operands. - if (!BVN) { - LHS = Op.getOperand(0); - BVN = dyn_cast(Op.getOperand(1).getNode()); - } - if (!BVN) - return Op; - - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - if (resolveBuildVector(BVN, CnstBits, UndefBits)) { - // We make use of a little bit of goto ickiness in order to avoid having to - // duplicate the immediate matching logic for the undef toggled case. - bool SecondTry = false; - AttemptModImm: - - if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { - CnstBits = CnstBits.zextOrTrunc(64); - uint64_t CnstVal = CnstBits.getZExtValue(); - - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - } - - if (SecondTry) - goto FailedModImm; - SecondTry = true; - CnstBits = UndefBits; - goto AttemptModImm; - } - -// We can always fall back to a non-immediate OR. -FailedModImm: - return Op; -} - -SDValue ARM64TargetLowering::LowerBUILD_VECTOR(SDValue Op, - SelectionDAG &DAG) const { - BuildVectorSDNode *BVN = cast(Op.getNode()); - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - if (resolveBuildVector(BVN, CnstBits, UndefBits)) { - // We make use of a little bit of goto ickiness in order to avoid having to - // duplicate the immediate matching logic for the undef toggled case. - bool SecondTry = false; - AttemptModImm: - - if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { - CnstBits = CnstBits.zextOrTrunc(64); - uint64_t CnstVal = CnstBits.getZExtValue(); - - // Certain magic vector constants (used to express things like NOT - // and NEG) are passed through unmodified. This allows codegen patterns - // for these operations to match. Special-purpose patterns will lower - // these immediates to MOVIs if it proves necessary. - if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) - return Op; - - // The many faces of MOVI... - if (ARM64_AM::isAdvSIMDModImmType10(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType10(CnstVal); - if (VT.getSizeInBits() == 128) { - SDValue Mov = DAG.getNode(ARM64ISD::MOVIedit, dl, MVT::v2i64, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - // Support the V64 version via subregister insertion. - SDValue Mov = DAG.getNode(ARM64ISD::MOVIedit, dl, MVT::f64, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
-
-SDValue ARM64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
-  SDLoc dl(Op);
-  EVT VT = Op.getValueType();
-
-  APInt CnstBits(VT.getSizeInBits(), 0);
-  APInt UndefBits(VT.getSizeInBits(), 0);
-  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
-    // We make use of a little bit of goto ickiness in order to avoid having to
-    // duplicate the immediate matching logic for the undef toggled case.
-    bool SecondTry = false;
-  AttemptModImm:
-
-    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
-      CnstBits = CnstBits.zextOrTrunc(64);
-      uint64_t CnstVal = CnstBits.getZExtValue();
-
-      // Certain magic vector constants (used to express things like NOT
-      // and NEG) are passed through unmodified. This allows codegen patterns
-      // for these operations to match. Special-purpose patterns will lower
-      // these immediates to MOVIs if it proves necessary.
-      if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
-        return Op;
-
-      // The many faces of MOVI...
-      if (ARM64_AM::isAdvSIMDModImmType10(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType10(CnstVal);
-        if (VT.getSizeInBits() == 128) {
-          SDValue Mov = DAG.getNode(ARM64ISD::MOVIedit, dl, MVT::v2i64,
-                                    DAG.getConstant(CnstVal, MVT::i32));
-          return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-        }
-
-        // Support the V64 version via subregister insertion.
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVIedit, dl, MVT::f64,
-                                  DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType7(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType7(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVImsl, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(264, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType8(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType8(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVImsl, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(272, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType9(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType9(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVI, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      // The few faces of FMOV...
-      if (ARM64_AM::isAdvSIMDModImmType11(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType11(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
-        SDValue Mov = DAG.getNode(ARM64ISD::FMOV, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType12(CnstVal) &&
-          VT.getSizeInBits() == 128) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType12(CnstVal);
-        SDValue Mov = DAG.getNode(ARM64ISD::FMOV, dl, MVT::v2f64,
-                                  DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      // The many faces of MVNI...
-      CnstVal = ~CnstVal;
-      if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType7(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType7(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNImsl, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(264, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType8(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType8(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNImsl, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(272, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-    }
-
-    if (SecondTry)
-      goto FailedModImm;
-    SecondTry = true;
-    CnstBits = UndefBits;
-    goto AttemptModImm;
-  }
-FailedModImm:
-
-  // Scan through the operands to find some interesting properties we can
-  // exploit:
-  //   1) If only one value is used, we can use a DUP, or
-  //   2) if only the low element is not undef, we can just insert that, or
-  //   3) if only one constant value is used (w/ some non-constant lanes),
-  //      we can splat the constant value into the whole vector then fill
-  //      in the non-constant lanes.
-  //   4) FIXME: If different constant values are used, but we can intelligently
-  //             select the values we'll be overwriting for the non-constant
-  //             lanes such that we can directly materialize the vector
-  //             some other way (MOVI, e.g.), we can be sneaky.
-  unsigned NumElts = VT.getVectorNumElements();
-  bool isOnlyLowElement = true;
-  bool usesOnlyOneValue = true;
-  bool usesOnlyOneConstantValue = true;
-  bool isConstant = true;
-  unsigned NumConstantLanes = 0;
-  SDValue Value;
-  SDValue ConstantValue;
-  for (unsigned i = 0; i < NumElts; ++i) {
-    SDValue V = Op.getOperand(i);
-    if (V.getOpcode() == ISD::UNDEF)
-      continue;
-    if (i > 0)
-      isOnlyLowElement = false;
-    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
-      isConstant = false;
-
-    if (isa<ConstantSDNode>(V)) {
-      ++NumConstantLanes;
-      if (!ConstantValue.getNode())
-        ConstantValue = V;
-      else if (ConstantValue != V)
-        usesOnlyOneConstantValue = false;
-    }
-
-    if (!Value.getNode())
-      Value = V;
-    else if (V != Value)
-      usesOnlyOneValue = false;
-  }
-
-  if (!Value.getNode())
-    return DAG.getUNDEF(VT);
-
-  if (isOnlyLowElement)
-    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
-
-  // Use DUP for non-constant splats. For f32 constant splats, reduce to
-  // i32 and try again.
-  if (usesOnlyOneValue) {
-    if (!isConstant) {
-      if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-          Value.getValueType() != VT)
-        return DAG.getNode(ARM64ISD::DUP, dl, VT, Value);
-
-      // This is actually a DUPLANExx operation, which keeps everything vectory.
-
-      // DUPLANE works on 128-bit vectors, widen it if necessary.
-      SDValue Lane = Value.getOperand(1);
-      Value = Value.getOperand(0);
-      if (Value.getValueType().getSizeInBits() == 64)
-        Value = WidenVector(Value, DAG);
-
-      unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
-      return DAG.getNode(Opcode, dl, VT, Value, Lane);
-    }
-
-    if (VT.getVectorElementType().isFloatingPoint()) {
-      SmallVector<SDValue, 8> Ops;
-      MVT NewType =
-          (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
-      for (unsigned i = 0; i < NumElts; ++i)
-        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
-      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
-      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts);
-      Val = LowerBUILD_VECTOR(Val, DAG);
-      if (Val.getNode())
-        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
-    }
-  }
-
-  // If there was only one constant value used and for more than one lane,
-  // start by splatting that value, then replace the non-constant lanes. This
-  // is better than the default, which will perform a separate initialization
-  // for each lane.
-  if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
-    SDValue Val = DAG.getNode(ARM64ISD::DUP, dl, VT, ConstantValue);
-    // Now insert the non-constant lanes.
-    for (unsigned i = 0; i < NumElts; ++i) {
-      SDValue V = Op.getOperand(i);
-      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
-      if (!isa<ConstantSDNode>(V)) {
-        // Note that type legalization likely mucked about with the VT of the
-        // source operand, so we may have to convert it here before inserting.
-        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
-      }
-    }
-    return Val;
-  }
-
-  // If all elements are constants and the case above didn't get hit, fall back
-  // to the default expansion, which will generate a load from the constant
-  // pool.
-  if (isConstant)
-    return SDValue();
-
-  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
-  if (NumElts >= 4) {
-    SDValue shuffle = ReconstructShuffle(Op, DAG);
-    if (shuffle != SDValue())
-      return shuffle;
-  }
-
-  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
-  // know the default expansion would otherwise fall back on something even
-  // worse. For a vector with one or two non-undef values, that's
-  // scalar_to_vector for the elements followed by a shuffle (provided the
-  // shuffle is valid for the target) and materialization element by element
-  // on the stack followed by a load for everything else.
-  if (!isConstant && !usesOnlyOneValue) {
-    SDValue Vec = DAG.getUNDEF(VT);
-    SDValue Op0 = Op.getOperand(0);
-    unsigned ElemSize = VT.getVectorElementType().getSizeInBits();
-    unsigned i = 0;
-    // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
-    // a) Avoid a RMW dependency on the full vector register, and
-    // b) Allow the register coalescer to fold away the copy if the
-    //    value is already in an S or D register.
-    if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) {
-      unsigned SubIdx = ElemSize == 32 ? ARM64::ssub : ARM64::dsub;
-      MachineSDNode *N =
-          DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
-                             DAG.getTargetConstant(SubIdx, MVT::i32));
-      Vec = SDValue(N, 0);
-      ++i;
-    }
-    for (; i < NumElts; ++i) {
-      SDValue V = Op.getOperand(i);
-      if (V.getOpcode() == ISD::UNDEF)
-        continue;
-      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
-      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
-    }
-    return Vec;
-  }
-
-  // Just use the default expansion. We failed to find a better alternative.
-  return SDValue();
-}
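The constant-splat heuristic above (case 3 of the comment block in LowerBUILD_VECTOR) is easy to see on plain arrays: splat the dominant constant once, then patch only the divergent lanes. This toy version, with illustrative names that are not part of the backend, mirrors that decision:

#include <array>
#include <cstddef>

template <size_t N>
std::array<int, N> buildVector(const std::array<int, N> &Lanes, int SplatVal) {
  std::array<int, N> V;
  V.fill(SplatVal);             // one DUP-like splat of the common constant
  for (size_t i = 0; i != N; ++i)
    if (Lanes[i] != SplatVal)   // only the divergent lanes need an insert
      V[i] = Lanes[i];
  return V;
}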
-
-SDValue ARM64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
-                                                    SelectionDAG &DAG) const {
-  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
-
-  // Check for non-constant lane.
-  if (!isa<ConstantSDNode>(Op.getOperand(2)))
-    return SDValue();
-
-  EVT VT = Op.getOperand(0).getValueType();
-
-  // Insertion/extraction are legal for V128 types.
-  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
-    return Op;
-
-  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
-    return SDValue();
-
-  // For V64 types, we perform insertion by expanding the value
-  // to a V128 type and performing the insertion on that.
-  SDLoc DL(Op);
-  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
-  EVT WideTy = WideVec.getValueType();
-
-  SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
-                             Op.getOperand(1), Op.getOperand(2));
-  // Re-narrow the resultant vector.
-  return NarrowVector(Node, DAG);
-}
-
-SDValue ARM64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
-                                                     SelectionDAG &DAG) const {
-  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
-
-  // Check for non-constant lane.
-  if (!isa<ConstantSDNode>(Op.getOperand(1)))
-    return SDValue();
-
-  EVT VT = Op.getOperand(0).getValueType();
-
-  // Insertion/extraction are legal for V128 types.
-  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
-    return Op;
-
-  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
-    return SDValue();
-
-  // For V64 types, we perform extraction by expanding the value
-  // to a V128 type and performing the extraction on that.
-  SDLoc DL(Op);
-  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
-  EVT WideTy = WideVec.getValueType();
-
-  EVT ExtrTy = WideTy.getVectorElementType();
-  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
-    ExtrTy = MVT::i32;
-
-  // For extractions, we just return the result directly.
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
-                     Op.getOperand(1));
-}
-
-SDValue ARM64TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
-                                                   SelectionDAG &DAG) const {
-  assert(Op.getOpcode() == ISD::SCALAR_TO_VECTOR && "Unknown opcode!");
-  // Some AdvSIMD intrinsics leave their results in the scalar B/H/S/D
-  // registers. The default lowering will copy those to a GPR then back
-  // to a vector register. Instead, just recognize those cases and reference
-  // the vector register they're already a subreg of.
-  SDValue Op0 = Op->getOperand(0);
-  if (Op0->getOpcode() != ISD::INTRINSIC_WO_CHAIN)
-    return Op;
-  unsigned IID = getIntrinsicID(Op0.getNode());
-  // The below list of intrinsics isn't exhaustive. Add cases as-needed.
-  // FIXME: Even better would be if there were an attribute on the node
-  // that we could query and set in the intrinsics definition or something.
-  unsigned SubIdx;
-  switch (IID) {
-  default:
-    // Early exit if this isn't one of the intrinsics we handle.
-    return Op;
-  case Intrinsic::arm64_neon_uaddv:
-  case Intrinsic::arm64_neon_saddv:
-  case Intrinsic::arm64_neon_uaddlv:
-  case Intrinsic::arm64_neon_saddlv:
-    switch (Op0.getValueType().getSizeInBits()) {
-    default:
-      llvm_unreachable("Illegal result size from ARM64 vector intrinsic!");
-    case 8:
-      SubIdx = ARM64::bsub;
-      break;
-    case 16:
-      SubIdx = ARM64::hsub;
-      break;
-    case 32:
-      SubIdx = ARM64::ssub;
-      break;
-    case 64:
-      SubIdx = ARM64::dsub;
-      break;
-    }
-  }
-  MachineSDNode *N =
-      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(Op),
-                         Op.getValueType(), DAG.getUNDEF(Op.getValueType()),
-                         Op0, DAG.getTargetConstant(SubIdx, MVT::i32));
-  return SDValue(N, 0);
-}
-
-SDValue ARM64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
-                                                    SelectionDAG &DAG) const {
-  EVT VT = Op.getOperand(0).getValueType();
-  SDLoc dl(Op);
-  // Just in case...
-  if (!VT.isVector())
-    return SDValue();
-
-  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
-  if (!Cst)
-    return SDValue();
-  unsigned Val = Cst->getZExtValue();
-
-  unsigned Size = Op.getValueType().getSizeInBits();
-  if (Val == 0) {
-    switch (Size) {
-    case 8:
-      return DAG.getTargetExtractSubreg(ARM64::bsub, dl, Op.getValueType(),
-                                        Op.getOperand(0));
-    case 16:
-      return DAG.getTargetExtractSubreg(ARM64::hsub, dl, Op.getValueType(),
-                                        Op.getOperand(0));
-    case 32:
-      return DAG.getTargetExtractSubreg(ARM64::ssub, dl, Op.getValueType(),
-                                        Op.getOperand(0));
-    case 64:
-      return DAG.getTargetExtractSubreg(ARM64::dsub, dl, Op.getValueType(),
-                                        Op.getOperand(0));
-    default:
-      llvm_unreachable("Unexpected vector type in extract_subvector!");
-    }
-  }
-  // If this is extracting the upper 64-bits of a 128-bit vector, we match
-  // that directly.
-  if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
-    return Op;
-
-  return SDValue();
-}
-
-bool ARM64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
-                                             EVT VT) const {
-  if (VT.getVectorNumElements() == 4 &&
-      (VT.is128BitVector() || VT.is64BitVector())) {
-    unsigned PFIndexes[4];
-    for (unsigned i = 0; i != 4; ++i) {
-      if (M[i] < 0)
-        PFIndexes[i] = 8;
-      else
-        PFIndexes[i] = M[i];
-    }
-
-    // Compute the index in the perfect shuffle table.
-    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
-                            PFIndexes[2] * 9 + PFIndexes[3];
-    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
-    unsigned Cost = (PFEntry >> 30);
-
-    if (Cost <= 4)
-      return true;
-  }
-
-  bool ReverseVEXT;
-  unsigned Imm, WhichResult;
-
-  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
-          isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
-          isEXTMask(M, VT, ReverseVEXT, Imm) ||
-          // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
-          isTRNMask(M, VT, WhichResult) || isUZPMask(M, VT, WhichResult) ||
-          isZIPMask(M, VT, WhichResult) ||
-          isTRN_v_undef_Mask(M, VT, WhichResult) ||
-          isUZP_v_undef_Mask(M, VT, WhichResult) ||
-          isZIP_v_undef_Mask(M, VT, WhichResult));
-}
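A small, self-contained illustration of the perfect-shuffle lookup above: four mask entries (with negative/undef entries mapped to the sentinel 8) are packed into a base-9 index into PerfectShuffleTable, whose top two bits hold the cost. The table itself lives in the backend; this sketch only shows the index and cost arithmetic.

static unsigned perfectShuffleCost(const int M[4], const unsigned *Table) {
  unsigned PFIndexes[4];
  for (unsigned i = 0; i != 4; ++i)
    PFIndexes[i] = M[i] < 0 ? 8 : static_cast<unsigned>(M[i]);
  // Base-9 packing: each of the four entries is a digit in 0..8.
  unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
                          PFIndexes[2] * 9 + PFIndexes[3];
  unsigned PFEntry = Table[PFTableIndex];
  return PFEntry >> 30; // cost field in the top two bits
}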
-
-/// getVShiftImm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift operation, where all the elements of the
-/// build_vector must have the same constant integer value.
-static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
-  // Ignore bit_converts.
-  while (Op.getOpcode() == ISD::BITCAST)
-    Op = Op.getOperand(0);
-  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
-                                    HasAnyUndefs, ElementBits) ||
-      SplatBitSize > ElementBits)
-    return false;
-  Cnt = SplatBits.getSExtValue();
-  return true;
-}
-
-/// isVShiftLImm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift left operation. That value must be in the range:
-///   0 <= Value < ElementBits for a left shift; or
-///   0 <= Value <= ElementBits for a long left shift.
-static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
-  assert(VT.isVector() && "vector shift count is not a vector type");
-  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
-  if (!getVShiftImm(Op, ElementBits, Cnt))
-    return false;
-  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
-}
-
-/// isVShiftRImm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift right operation. For a shift opcode, the shift
-/// count is positive, but for an intrinsic the shift count must be negative.
-/// The absolute value must be in the range:
-///   1 <= |Value| <= ElementBits for a right shift; or
-///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
-static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
-                         int64_t &Cnt) {
-  assert(VT.isVector() && "vector shift count is not a vector type");
-  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
-  if (!getVShiftImm(Op, ElementBits, Cnt))
-    return false;
-  if (isIntrinsic)
-    Cnt = -Cnt;
-  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
-}
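The legal immediate ranges above, restated numerically for 16-bit lanes (ElementBits == 16): a plain left shift allows 0..15, a "long" left shift allows 0..16, and right shifts allow 1..16 (1..8 for the narrowing forms). A minimal sketch of the two predicates, with the assertions serving as a usage example:

#include <cassert>
#include <cstdint>

static bool isLegalLeftShift(int64_t Cnt, unsigned Bits, bool IsLong) {
  return Cnt >= 0 && (IsLong ? Cnt - 1 : Cnt) < (int64_t)Bits;
}
static bool isLegalRightShift(int64_t Cnt, unsigned Bits, bool IsNarrow) {
  return Cnt >= 1 && Cnt <= (int64_t)(IsNarrow ? Bits / 2 : Bits);
}

int main() {
  assert(isLegalLeftShift(15, 16, false) && !isLegalLeftShift(16, 16, false));
  assert(isLegalLeftShift(16, 16, true));  // long left shift admits Bits itself
  assert(isLegalRightShift(16, 16, false) && !isLegalRightShift(0, 16, false));
  assert(isLegalRightShift(8, 16, true) && !isLegalRightShift(9, 16, true));
}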
-
-SDValue ARM64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
-                                                    SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
-  int64_t Cnt;
-
-  if (!Op.getOperand(1).getValueType().isVector())
-    return Op;
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
-
-  switch (Op.getOpcode()) {
-  default:
-    llvm_unreachable("unexpected shift opcode");
-
-  case ISD::SHL:
-    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
-      return DAG.getNode(ARM64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0),
-                         DAG.getConstant(Cnt, MVT::i32));
-    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                       DAG.getConstant(Intrinsic::arm64_neon_ushl, MVT::i32),
-                       Op.getOperand(0), Op.getOperand(1));
-  case ISD::SRA:
-  case ISD::SRL:
-    // Right shift immediate
-    if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) &&
-        Cnt < EltSize) {
-      unsigned Opc =
-          (Op.getOpcode() == ISD::SRA) ? ARM64ISD::VASHR : ARM64ISD::VLSHR;
-      return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0),
-                         DAG.getConstant(Cnt, MVT::i32));
-    }
-
-    // Right shift register. Note, there is not a shift right register
-    // instruction, but the shift left register instruction takes a signed
-    // value, where negative numbers specify a right shift.
-    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::arm64_neon_sshl
-                                                : Intrinsic::arm64_neon_ushl;
-    // Negate the shift amount.
-    SDValue NegShift = DAG.getNode(ARM64ISD::NEG, DL, VT, Op.getOperand(1));
-    SDValue NegShiftLeft =
-        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                    DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift);
-    return NegShiftLeft;
-  }
-
-  return SDValue();
-}
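Why a NEG feeds the left-shift intrinsic above: AdvSIMD has no register-operand right shift, but SSHL/USHL shift right when the signed per-lane shift amount is negative. A scalar model of that contract, for a single signed 8-bit lane:

#include <cassert>
#include <cstdint>

// Per-lane behavior of SSHL on one int8 lane (simplified model).
static int8_t sshl(int8_t X, int8_t Amt) {
  return Amt >= 0 ? (int8_t)(X << Amt) : (int8_t)(X >> -Amt);
}

int main() {
  assert(sshl(-64, -3) == -8); // x >> 3 expressed as sshl(x, -3)
  assert(sshl(3, 2) == 12);    // ordinary left shift for non-negative amounts
}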
-
-static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
-                                    ARM64CC::CondCode CC, bool NoNans, EVT VT,
-                                    SDLoc dl, SelectionDAG &DAG) {
-  EVT SrcVT = LHS.getValueType();
-
-  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
-  APInt CnstBits(VT.getSizeInBits(), 0);
-  APInt UndefBits(VT.getSizeInBits(), 0);
-  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
-  bool IsZero = IsCnst && (CnstBits == 0);
-
-  if (SrcVT.getVectorElementType().isFloatingPoint()) {
-    switch (CC) {
-    default:
-      return SDValue();
-    case ARM64CC::NE: {
-      SDValue Fcmeq;
-      if (IsZero)
-        Fcmeq = DAG.getNode(ARM64ISD::FCMEQz, dl, VT, LHS);
-      else
-        Fcmeq = DAG.getNode(ARM64ISD::FCMEQ, dl, VT, LHS, RHS);
-      return DAG.getNode(ARM64ISD::NOT, dl, VT, Fcmeq);
-    }
-    case ARM64CC::EQ:
-      if (IsZero)
-        return DAG.getNode(ARM64ISD::FCMEQz, dl, VT, LHS);
-      return DAG.getNode(ARM64ISD::FCMEQ, dl, VT, LHS, RHS);
-    case ARM64CC::GE:
-      if (IsZero)
-        return DAG.getNode(ARM64ISD::FCMGEz, dl, VT, LHS);
-      return DAG.getNode(ARM64ISD::FCMGE, dl, VT, LHS, RHS);
-    case ARM64CC::GT:
-      if (IsZero)
-        return DAG.getNode(ARM64ISD::FCMGTz, dl, VT, LHS);
-      return DAG.getNode(ARM64ISD::FCMGT, dl, VT, LHS, RHS);
-    case ARM64CC::LS:
-      if (IsZero)
-        return DAG.getNode(ARM64ISD::FCMLEz, dl, VT, LHS);
-      return DAG.getNode(ARM64ISD::FCMGE, dl, VT, RHS, LHS);
-    case ARM64CC::LT:
-      if (!NoNans)
-        return SDValue();
-      // If we ignore NaNs then we can use the MI implementation.
-      // Fallthrough.
-    case ARM64CC::MI:
-      if (IsZero)
-        return DAG.getNode(ARM64ISD::FCMLTz, dl, VT, LHS);
-      return DAG.getNode(ARM64ISD::FCMGT, dl, VT, RHS, LHS);
-    }
-  }
-
-  switch (CC) {
-  default:
-    return SDValue();
-  case ARM64CC::NE: {
-    SDValue Cmeq;
-    if (IsZero)
-      Cmeq = DAG.getNode(ARM64ISD::CMEQz, dl, VT, LHS);
-    else
-      Cmeq = DAG.getNode(ARM64ISD::CMEQ, dl, VT, LHS, RHS);
-    return DAG.getNode(ARM64ISD::NOT, dl, VT, Cmeq);
-  }
-  case ARM64CC::EQ:
-    if (IsZero)
-      return DAG.getNode(ARM64ISD::CMEQz, dl, VT, LHS);
-    return DAG.getNode(ARM64ISD::CMEQ, dl, VT, LHS, RHS);
-  case ARM64CC::GE:
-    if (IsZero)
-      return DAG.getNode(ARM64ISD::CMGEz, dl, VT, LHS);
-    return DAG.getNode(ARM64ISD::CMGE, dl, VT, LHS, RHS);
-  case ARM64CC::GT:
-    if (IsZero)
-      return DAG.getNode(ARM64ISD::CMGTz, dl, VT, LHS);
-    return DAG.getNode(ARM64ISD::CMGT, dl, VT, LHS, RHS);
-  case ARM64CC::LE:
-    if (IsZero)
-      return DAG.getNode(ARM64ISD::CMLEz, dl, VT, LHS);
-    return DAG.getNode(ARM64ISD::CMGE, dl, VT, RHS, LHS);
-  case ARM64CC::LS:
-    return DAG.getNode(ARM64ISD::CMHS, dl, VT, RHS, LHS);
-  case ARM64CC::CC:
-    return DAG.getNode(ARM64ISD::CMHI, dl, VT, RHS, LHS);
-  case ARM64CC::LT:
-    if (IsZero)
-      return DAG.getNode(ARM64ISD::CMLTz, dl, VT, LHS);
-    return DAG.getNode(ARM64ISD::CMGT, dl, VT, RHS, LHS);
-  case ARM64CC::HI:
-    return DAG.getNode(ARM64ISD::CMHI, dl, VT, LHS, RHS);
-  case ARM64CC::CS:
-    return DAG.getNode(ARM64ISD::CMHS, dl, VT, LHS, RHS);
-  }
-}
-
-SDValue ARM64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
-  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  SDLoc dl(Op);
-
-  if (LHS.getValueType().getVectorElementType().isInteger()) {
-    assert(LHS.getValueType() == RHS.getValueType());
-    ARM64CC::CondCode ARM64CC = changeIntCCToARM64CC(CC);
-    return EmitVectorComparison(LHS, RHS, ARM64CC, false, Op.getValueType(), dl,
-                                DAG);
-  }
-
-  assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
-         LHS.getValueType().getVectorElementType() == MVT::f64);
-
-  // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally
-  // clean. Some of them require two branches to implement.
-  ARM64CC::CondCode CC1, CC2;
-  changeFPCCToARM64CC(CC, CC1, CC2);
-
-  bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
-  SDValue Cmp1 =
-      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG);
-  if (!Cmp1.getNode())
-    return SDValue();
-
-  if (CC2 != ARM64CC::AL) {
-    SDValue Cmp2 =
-        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG);
-    if (!Cmp2.getNode())
-      return SDValue();
-
-    return DAG.getNode(ISD::OR, dl, Cmp1.getValueType(), Cmp1, Cmp2);
-  }
-
-  return Cmp1;
-}
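The CC2 != AL path above exists because some IEEE predicates have no single AArch64 vector compare: for example "ordered not-equal" decomposes into two compares whose results are ORed, which is exactly what the code emits. A scalar model of that decomposition:

#include <cassert>
#include <limits>

// SETONE (ordered, not equal) decomposed into two ordered compares.
static bool one(float A, float B) { return (A > B) || (A < B); }

int main() {
  assert(one(1.0f, 2.0f) && one(2.0f, 1.0f));
  assert(!one(1.0f, 1.0f));
  float NaN = std::numeric_limits<float>::quiet_NaN();
  assert(!one(NaN, 1.0f)); // unordered: both component compares are false
}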
-
-/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
-/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
-/// specified in the intrinsic calls.
-bool ARM64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
-                                             const CallInst &I,
-                                             unsigned Intrinsic) const {
-  switch (Intrinsic) {
-  case Intrinsic::arm64_neon_ld2:
-  case Intrinsic::arm64_neon_ld3:
-  case Intrinsic::arm64_neon_ld4:
-  case Intrinsic::arm64_neon_ld2lane:
-  case Intrinsic::arm64_neon_ld3lane:
-  case Intrinsic::arm64_neon_ld4lane:
-  case Intrinsic::arm64_neon_ld2r:
-  case Intrinsic::arm64_neon_ld3r:
-  case Intrinsic::arm64_neon_ld4r: {
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    // Conservatively set memVT to the entire set of vectors loaded.
-    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
-    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
-    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
-    Info.offset = 0;
-    Info.align = 0;
-    Info.vol = false; // volatile loads with NEON intrinsics not supported
-    Info.readMem = true;
-    Info.writeMem = false;
-    return true;
-  }
-  case Intrinsic::arm64_neon_st2:
-  case Intrinsic::arm64_neon_st3:
-  case Intrinsic::arm64_neon_st4:
-  case Intrinsic::arm64_neon_st2lane:
-  case Intrinsic::arm64_neon_st3lane:
-  case Intrinsic::arm64_neon_st4lane: {
-    Info.opc = ISD::INTRINSIC_VOID;
-    // Conservatively set memVT to the entire set of vectors stored.
-    unsigned NumElts = 0;
-    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
-      Type *ArgTy = I.getArgOperand(ArgI)->getType();
-      if (!ArgTy->isVectorTy())
-        break;
-      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
-    }
-    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
-    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
-    Info.offset = 0;
-    Info.align = 0;
-    Info.vol = false; // volatile stores with NEON intrinsics not supported
-    Info.readMem = false;
-    Info.writeMem = true;
-    return true;
-  }
-  case Intrinsic::arm64_ldxr: {
-    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    Info.memVT = MVT::getVT(PtrTy->getElementType());
-    Info.ptrVal = I.getArgOperand(0);
-    Info.offset = 0;
-    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
-    Info.vol = true;
-    Info.readMem = true;
-    Info.writeMem = false;
-    return true;
-  }
-  case Intrinsic::arm64_stxr: {
-    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    Info.memVT = MVT::getVT(PtrTy->getElementType());
-    Info.ptrVal = I.getArgOperand(1);
-    Info.offset = 0;
-    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
-    Info.vol = true;
-    Info.readMem = false;
-    Info.writeMem = true;
-    return true;
-  }
-  case Intrinsic::arm64_ldxp: {
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    Info.memVT = MVT::i128;
-    Info.ptrVal = I.getArgOperand(0);
-    Info.offset = 0;
-    Info.align = 16;
-    Info.vol = true;
-    Info.readMem = true;
-    Info.writeMem = false;
-    return true;
-  }
-  case Intrinsic::arm64_stxp: {
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    Info.memVT = MVT::i128;
-    Info.ptrVal = I.getArgOperand(2);
-    Info.offset = 0;
-    Info.align = 16;
-    Info.vol = true;
-    Info.readMem = false;
-    Info.writeMem = true;
-    return true;
-  }
-  default:
-    break;
-  }
-
-  return false;
-}
-
-// Truncations from 64-bit GPR to 32-bit GPR are free.
-bool ARM64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
-  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
-    return false;
-  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
-  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
-  if (NumBits1 <= NumBits2)
-    return false;
-  return true;
-}
-bool ARM64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
-  if (!VT1.isInteger() || !VT2.isInteger())
-    return false;
-  unsigned NumBits1 = VT1.getSizeInBits();
-  unsigned NumBits2 = VT2.getSizeInBits();
-  if (NumBits1 <= NumBits2)
-    return false;
-  return true;
-}
-
-// All 32-bit GPR operations implicitly zero the high-half of the corresponding
-// 64-bit GPR.
-bool ARM64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
-  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
-    return false;
-  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
-  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
-  if (NumBits1 == 32 && NumBits2 == 64)
-    return true;
-  return false;
-}
-bool ARM64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
-  if (!VT1.isInteger() || !VT2.isInteger())
-    return false;
-  unsigned NumBits1 = VT1.getSizeInBits();
-  unsigned NumBits2 = VT2.getSizeInBits();
-  if (NumBits1 == 32 && NumBits2 == 64)
-    return true;
-  return false;
-}
-
-bool ARM64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
-  EVT VT1 = Val.getValueType();
-  if (isZExtFree(VT1, VT2)) {
-    return true;
-  }
-
-  if (Val.getOpcode() != ISD::LOAD)
-    return false;
-
-  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
-  return (VT1.isSimple() && VT1.isInteger() && VT2.isSimple() &&
-          VT2.isInteger() && VT1.getSizeInBits() <= 32);
-}
-
-bool ARM64TargetLowering::hasPairedLoad(Type *LoadedType,
-                                        unsigned &RequiredAligment) const {
-  if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy())
-    return false;
-  // Cyclone supports unaligned accesses.
-  RequiredAligment = 0;
-  unsigned NumBits = LoadedType->getPrimitiveSizeInBits();
-  return NumBits == 32 || NumBits == 64;
-}
-
-bool ARM64TargetLowering::hasPairedLoad(EVT LoadedType,
-                                        unsigned &RequiredAligment) const {
-  if (!LoadedType.isSimple() ||
-      (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
-    return false;
-  // Cyclone supports unaligned accesses.
-  RequiredAligment = 0;
-  unsigned NumBits = LoadedType.getSizeInBits();
-  return NumBits == 32 || NumBits == 64;
-}
-
-static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
-                       unsigned AlignCheck) {
-  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
-          (DstAlign == 0 || DstAlign % AlignCheck == 0));
-}
-
-EVT ARM64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
-                                             unsigned SrcAlign, bool IsMemset,
-                                             bool ZeroMemset, bool MemcpyStrSrc,
-                                             MachineFunction &MF) const {
-  // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
-  // instruction to materialize the v2i64 zero and one store (with restrictive
-  // addressing mode). Just do two i64 stores of zero-registers.
-  bool Fast;
-  const Function *F = MF.getFunction();
-  if (!IsMemset && Size >= 16 &&
-      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::NoImplicitFloat) &&
-      (memOpAlign(SrcAlign, DstAlign, 16) ||
-       (allowsUnalignedMemoryAccesses(MVT::v2i64, 0, &Fast) && Fast)))
-    return MVT::v2i64;
-
-  return Size >= 8 ? MVT::i64 : MVT::i32;
-}
-
-// 12-bit optionally shifted immediates are legal for adds.
-bool ARM64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
-  if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
-    return true;
-  return false;
-}
-
-// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
-// immediates is the same as for an add or a sub.
-bool ARM64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
-  if (Immed < 0)
-    Immed *= -1;
-  return isLegalAddImmediate(Immed);
-}
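The ADD/SUB immediate test above in plain terms: a legal immediate is a 12-bit value, optionally shifted left by 12. A quick check mirroring the expression, with a few sample values:

#include <cassert>
#include <cstdint>

static bool isLegalAddImm(int64_t Immed) {
  return (Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && (Immed >> 24) == 0);
}

int main() {
  assert(isLegalAddImm(4095));                 // fits in 12 bits
  assert(isLegalAddImm(int64_t(0xabc) << 12)); // 12 bits with LSL #12
  assert(!isLegalAddImm(4097));                // needs two instructions
  assert(!isLegalAddImm(-1)); // negatives are folded through SUB first
}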
-
-/// isLegalAddressingMode - Return true if the addressing mode represented
-/// by AM is legal for this target, for a load/store of the specified type.
-bool ARM64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                                Type *Ty) const {
-  // ARM64 has five basic addressing modes:
-  //  reg
-  //  reg + 9-bit signed offset
-  //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
-  //  reg1 + reg2
-  //  reg + SIZE_IN_BYTES * reg
-
-  // No global is ever allowed as a base.
-  if (AM.BaseGV)
-    return false;
-
-  // No reg+reg+imm addressing.
-  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
-    return false;
-
-  // Check reg + imm case:
-  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
-  uint64_t NumBytes = 0;
-  if (Ty->isSized()) {
-    uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty);
-    NumBytes = NumBits / 8;
-    if (!isPowerOf2_64(NumBits))
-      NumBytes = 0;
-  }
-
-  if (!AM.Scale) {
-    int64_t Offset = AM.BaseOffs;
-
-    // 9-bit signed offset
-    if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
-      return true;
-
-    // 12-bit unsigned offset
-    unsigned shift = Log2_64(NumBytes);
-    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
-        // Must be a multiple of NumBytes (NumBytes is a power of 2)
-        (Offset >> shift) << shift == Offset)
-      return true;
-    return false;
-  }
-
-  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
-
-  if (!AM.Scale || AM.Scale == 1 ||
-      (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
-    return true;
-  return false;
-}
-
-int ARM64TargetLowering::getScalingFactorCost(const AddrMode &AM,
-                                              Type *Ty) const {
-  // Scaling factors are not free at all.
-  // Operands                     | Rt Latency
-  // -------------------------------------------
-  // Rt, [Xn, Xm]                 | 4
-  // -------------------------------------------
-  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
-  // Rt, [Xn, Wm, <extend> #imm]  |
-  if (isLegalAddressingMode(AM, Ty))
-    // Scale represents reg2 * scale, thus account for 1 if
-    // it is not equal to 0 or 1.
-    return AM.Scale != 0 && AM.Scale != 1;
-  return -1;
-}
-
-bool ARM64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
-  VT = VT.getScalarType();
-
-  if (!VT.isSimple())
-    return false;
-
-  switch (VT.getSimpleVT().SimpleTy) {
-  case MVT::f32:
-  case MVT::f64:
-    return true;
-  default:
-    break;
-  }
-
-  return false;
-}
-
-const uint16_t *
-ARM64TargetLowering::getScratchRegisters(CallingConv::ID) const {
-  // LR is a callee-save register, but we must treat it as clobbered by any call
-  // site. Hence we include LR in the scratch registers, which are in turn added
-  // as implicit-defs for stackmaps and patchpoints.
-  static const uint16_t ScratchRegs[] = {
-    ARM64::X16, ARM64::X17, ARM64::LR, 0
-  };
-  return ScratchRegs;
-}
-
-bool ARM64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
-                                                            Type *Ty) const {
-  assert(Ty->isIntegerTy());
-
-  unsigned BitSize = Ty->getPrimitiveSizeInBits();
-  if (BitSize == 0)
-    return false;
-
-  int64_t Val = Imm.getSExtValue();
-  if (Val == 0 || ARM64_AM::isLogicalImmediate(Val, BitSize))
-    return true;
-
-  if ((int64_t)Val < 0)
-    Val = ~Val;
-  if (BitSize == 32)
-    Val &= (1LL << 32) - 1;
-
-  unsigned LZ = countLeadingZeros((uint64_t)Val);
-  unsigned Shift = (63 - LZ) / 16;
-  // MOVZ is free so return true for one or fewer MOVK.
-  return (Shift < 3) ? true : false;
-}
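The MOVZ/MOVK heuristic above, spelled out: after folding negatives through inversion, the number of move instructions needed is bounded by the index of the highest non-zero 16-bit chunk; three or fewer total moves is considered cheaper than a constant-pool load. A sketch of the same count (using the GCC/Clang __builtin_clzll intrinsic in place of countLeadingZeros):

#include <cassert>
#include <cstdint>

static unsigned movChunks(uint64_t V) { // 1 MOVZ + N MOVKs, V != 0
  unsigned Hi = 63 - __builtin_clzll(V); // bit index of the top set bit
  return Hi / 16 + 1;                    // chunks 0..Hi/16 may need setting
}

int main() {
  assert(movChunks(0x000000000000ffffULL) == 1); // a single MOVZ
  assert(movChunks(0x0000000012340000ULL) == 2); // MOVZ + one MOVK at most
  assert(movChunks(0x1234000000005678ULL) == 4); // would exceed the limit
}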
-
-// Generate SUBS and CSEL for integer abs.
-static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
-  EVT VT = N->getValueType(0);
-
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  SDLoc DL(N);
-
-  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
-  // and change it to SUB and CSEL.
-  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
-      N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
-      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
-    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
-      if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
-        SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
-                                  N0.getOperand(0));
-        // Generate SUBS & CSEL.
-        SDValue Cmp =
-            DAG.getNode(ARM64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
-                        N0.getOperand(0), DAG.getConstant(0, VT));
-        return DAG.getNode(ARM64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
-                           DAG.getConstant(ARM64CC::PL, MVT::i32),
-                           SDValue(Cmp.getNode(), 1));
-      }
-  return SDValue();
-}
-
-// performXorCombine - Attempts to handle integer ABS.
-static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
-                                 TargetLowering::DAGCombinerInfo &DCI,
-                                 const ARM64Subtarget *Subtarget) {
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  return performIntegerAbsCombine(N, DAG);
-}
-
-static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
-                                 TargetLowering::DAGCombinerInfo &DCI,
-                                 const ARM64Subtarget *Subtarget) {
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  // Multiplication of a power of two plus/minus one can be done more
-  // cheaply as a shift+add/sub. For now, this is true unilaterally. If
-  // future CPUs have a cheaper MADD instruction, this may need to be
-  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
-  // 64-bit is 5 cycles, so this is always a win.
-  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
-    APInt Value = C->getAPIntValue();
-    EVT VT = N->getValueType(0);
-    APInt VP1 = Value + 1;
-    if (VP1.isPowerOf2()) {
-      // Multiplying by one less than a power of two, replace with a shift
-      // and a subtract.
-      SDValue ShiftedVal =
-          DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
-                      DAG.getConstant(VP1.logBase2(), MVT::i64));
-      return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
-    }
-    APInt VM1 = Value - 1;
-    if (VM1.isPowerOf2()) {
-      // Multiplying by one more than a power of two, replace with a shift
-      // and an add.
-      SDValue ShiftedVal =
-          DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
-                      DAG.getConstant(VM1.logBase2(), MVT::i64));
-      return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
-    }
-  }
-  return SDValue();
-}
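The mul strength reduction above on concrete numbers: for a constant C where C+1 or C-1 is a power of two, x*C becomes one shift and one add or sub. A minimal sketch (using __builtin_ctzll as a stand-in for logBase2):

#include <cassert>
#include <cstdint>

static bool isPow2(uint64_t V) { return V && !(V & (V - 1)); }

static int64_t mulByConst(int64_t X, uint64_t C) {
  if (isPow2(C + 1))                       // e.g. C == 7: (x << 3) - x
    return (X << __builtin_ctzll(C + 1)) - X;
  if (isPow2(C - 1))                       // e.g. C == 9: (x << 3) + x
    return (X << __builtin_ctzll(C - 1)) + X;
  return X * (int64_t)C;                   // otherwise keep the MUL/MADD
}

int main() {
  assert(mulByConst(5, 7) == 35 && mulByConst(5, 9) == 45);
  assert(mulByConst(-3, 15) == -45);       // 15+1 is a power of two
}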
-
-static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
-  EVT VT = N->getValueType(0);
-  if (VT != MVT::f32 && VT != MVT::f64)
-    return SDValue();
-  // Only optimize when the source and destination types have the same width.
-  if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
-    return SDValue();
-
-  // If the result of an integer load is only used by an integer-to-float
-  // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
-  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
-  SDValue N0 = N->getOperand(0);
-  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
-      // Do not change the width of a volatile load.
-      !cast<LoadSDNode>(N0)->isVolatile()) {
-    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
-    SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
-                               LN0->getPointerInfo(), LN0->isVolatile(),
-                               LN0->isNonTemporal(), LN0->isInvariant(),
-                               LN0->getAlignment());
-
-    // Make sure successors of the original load stay after it by updating them
-    // to use the new Chain.
-    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
-
-    unsigned Opcode =
-        (N->getOpcode() == ISD::SINT_TO_FP) ? ARM64ISD::SITOF : ARM64ISD::UITOF;
-    return DAG.getNode(Opcode, SDLoc(N), VT, Load);
-  }
-
-  return SDValue();
-}
-
-/// An EXTR instruction is made up of two shifts, ORed together. This helper
-/// searches for and classifies those shifts.
-static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
-                         bool &FromHi) {
-  if (N.getOpcode() == ISD::SHL)
-    FromHi = false;
-  else if (N.getOpcode() == ISD::SRL)
-    FromHi = true;
-  else
-    return false;
-
-  if (!isa<ConstantSDNode>(N.getOperand(1)))
-    return false;
-
-  ShiftAmount = N->getConstantOperandVal(1);
-  Src = N->getOperand(0);
-  return true;
-}
-
-/// EXTR instruction extracts a contiguous chunk of bits from two existing
-/// registers viewed as a high/low pair. This function looks for the pattern:
-/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
-/// EXTR. Can't quite be done in TableGen because the two immediates aren't
-/// independent.
-static SDValue tryCombineToEXTR(SDNode *N,
-                                TargetLowering::DAGCombinerInfo &DCI) {
-  SelectionDAG &DAG = DCI.DAG;
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
-
-  assert(N->getOpcode() == ISD::OR && "Unexpected root");
-
-  if (VT != MVT::i32 && VT != MVT::i64)
-    return SDValue();
-
-  SDValue LHS;
-  uint32_t ShiftLHS = 0;
-  bool LHSFromHi = 0;
-  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
-    return SDValue();
-
-  SDValue RHS;
-  uint32_t ShiftRHS = 0;
-  bool RHSFromHi = 0;
-  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
-    return SDValue();
-
-  // If they're both trying to come from the high part of the register, they're
-  // not really an EXTR.
-  if (LHSFromHi == RHSFromHi)
-    return SDValue();
-
-  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
-    return SDValue();
-
-  if (LHSFromHi) {
-    std::swap(LHS, RHS);
-    std::swap(ShiftLHS, ShiftRHS);
-  }
-
-  return DAG.getNode(ARM64ISD::EXTR, DL, VT, LHS, RHS,
-                     DAG.getConstant(ShiftRHS, MVT::i64));
-}
-
-static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
-                                const ARM64Subtarget *Subtarget) {
-  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
-  if (!EnableARM64ExtrGeneration)
-    return SDValue();
-  SelectionDAG &DAG = DCI.DAG;
-  EVT VT = N->getValueType(0);
-
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
-    return SDValue();
-
-  SDValue Res = tryCombineToEXTR(N, DCI);
-  if (Res.getNode())
-    return Res;
-
-  return SDValue();
-}
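What the EXTR combine recognizes, as plain integer math: ORing a left shift of one value with a complementary right shift of another is exactly the double-register extract semantics, and using the same value on both sides gives a rotate. A sketch for 64 bits:

#include <cassert>
#include <cstdint>

// EXTR-style extract: bits starting at Lsb of the Hi:Lo pair, 0 < Lsb < 64.
static uint64_t extr64(uint64_t Hi, uint64_t Lo, unsigned Lsb) {
  return (Hi << (64 - Lsb)) | (Lo >> Lsb);
}

int main() {
  uint64_t A = 0x1122334455667788ULL, B = 0x99aabbccddeeff00ULL;
  unsigned N = 24; // matches (or (shl A, #24), (srl B, #64-24))
  assert(((A << N) | (B >> (64 - N))) == extr64(A, B, 64 - N));
  assert(extr64(A, A, 8) == ((A >> 8) | (A << 56))); // rotate right by 8
}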
-
-static SDValue performBitcastCombine(SDNode *N,
-                                     TargetLowering::DAGCombinerInfo &DCI,
-                                     SelectionDAG &DAG) {
-  // Wait 'til after everything is legalized to try this. That way we have
-  // legal vector types and such.
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  // Remove extraneous bitcasts around an extract_subvector.
-  // For example,
-  //    (v4i16 (bitconvert
-  //             (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
-  // becomes
-  //    (extract_subvector ((v8i16 ...), (i64 4)))
-
-  // Only interested in 64-bit vectors as the ultimate result.
-  EVT VT = N->getValueType(0);
-  if (!VT.isVector())
-    return SDValue();
-  if (VT.getSimpleVT().getSizeInBits() != 64)
-    return SDValue();
-  // Is the operand an extract_subvector starting at the beginning or halfway
-  // point of the vector? A low half may also come through as an
-  // EXTRACT_SUBREG, so look for that, too.
-  SDValue Op0 = N->getOperand(0);
-  if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
-      !(Op0->isMachineOpcode() &&
-        Op0->getMachineOpcode() == ARM64::EXTRACT_SUBREG))
-    return SDValue();
-  uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
-  if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
-    if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
-      return SDValue();
-  } else if (Op0->getMachineOpcode() == ARM64::EXTRACT_SUBREG) {
-    if (idx != ARM64::dsub)
-      return SDValue();
-    // The dsub reference is equivalent to a lane zero subvector reference.
-    idx = 0;
-  }
-  // Look through the bitcast of the input to the extract.
-  if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
-    return SDValue();
-  SDValue Source = Op0->getOperand(0)->getOperand(0);
-  // If the source type has twice the number of elements as our destination
-  // type, we know this is an extract of the high or low half of the vector.
-  EVT SVT = Source->getValueType(0);
-  if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
-    return SDValue();
-
-  DEBUG(dbgs() << "arm64-lower: bitcast extract_subvector simplification\n");
-
-  // Create the simplified form to just extract the low or high half of the
-  // vector directly rather than bothering with the bitcasts.
-  SDLoc dl(N);
-  unsigned NumElements = VT.getVectorNumElements();
-  if (idx) {
-    SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64);
-    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
-  } else {
-    SDValue SubReg = DAG.getTargetConstant(ARM64::dsub, MVT::i32);
-    return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
-                                      Source, SubReg),
-                   0);
-  }
-}
-
-static SDValue performConcatVectorsCombine(SDNode *N,
-                                           TargetLowering::DAGCombinerInfo &DCI,
-                                           SelectionDAG &DAG) {
-  // Wait 'til after everything is legalized to try this. That way we have
-  // legal vector types and such.
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  SDLoc dl(N);
-  EVT VT = N->getValueType(0);
-
-  // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
-  // splat. The indexed instructions are going to be expecting a DUPLANE64, so
-  // canonicalise to that.
-  if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) {
-    assert(VT.getVectorElementType().getSizeInBits() == 64);
-    return DAG.getNode(ARM64ISD::DUPLANE64, dl, VT,
-                       WidenVector(N->getOperand(0), DAG),
-                       DAG.getConstant(0, MVT::i64));
-  }
-
-  // Canonicalise concat_vectors so that the right-hand vector has as few
-  // bit-casts as possible before its real operation. The primary matching
-  // destination for these operations will be the narrowing "2" instructions,
-  // which depend on the operation being performed on this right-hand vector.
-  // For example,
-  //    (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
-  // becomes
-  //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
-
-  SDValue Op1 = N->getOperand(1);
-  if (Op1->getOpcode() != ISD::BITCAST)
-    return SDValue();
-  SDValue RHS = Op1->getOperand(0);
-  MVT RHSTy = RHS.getValueType().getSimpleVT();
-  // If the RHS is not a vector, this is not the pattern we're looking for.
-  if (!RHSTy.isVector())
-    return SDValue();
-
-  DEBUG(dbgs() << "arm64-lower: concat_vectors bitcast simplification\n");
-
-  MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
-                                  RHSTy.getVectorNumElements() * 2);
-  return DAG.getNode(
-      ISD::BITCAST, dl, VT,
-      DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
-                  DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS));
-}
-
-static SDValue tryCombineFixedPointConvert(SDNode *N,
-                                           TargetLowering::DAGCombinerInfo &DCI,
-                                           SelectionDAG &DAG) {
-  // Wait 'til after everything is legalized to try this. That way we have
-  // legal vector types and such.
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-  // Transform a scalar conversion of a value from a lane extract into a
-  // lane extract of a vector conversion. E.g., from foo1 to foo2:
-  // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
-  // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
-  //
-  // The second form interacts better with instruction selection and the
-  // register allocator to avoid cross-class register copies that aren't
-  // coalescable due to a lane reference.
-
-  // Check the operand and see if it originates from a lane extract.
-  SDValue Op1 = N->getOperand(1);
-  if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
-    // Yep, no additional predication needed. Perform the transform.
-    SDValue IID = N->getOperand(0);
-    SDValue Shift = N->getOperand(2);
-    SDValue Vec = Op1.getOperand(0);
-    SDValue Lane = Op1.getOperand(1);
-    EVT ResTy = N->getValueType(0);
-    EVT VecResTy;
-    SDLoc DL(N);
-
-    // The vector width should be 128 bits by the time we get here, even
-    // if it started as 64 bits (the extract_vector handling will have
-    // done so).
-    assert(Vec.getValueType().getSizeInBits() == 128 &&
-           "unexpected vector size on extract_vector_elt!");
-    if (Vec.getValueType() == MVT::v4i32)
-      VecResTy = MVT::v4f32;
-    else if (Vec.getValueType() == MVT::v2i64)
-      VecResTy = MVT::v2f64;
-    else
-      assert(0 && "unexpected vector type!");
-
-    SDValue Convert =
-        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
-    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
-  }
-  return SDValue();
-}
-
-// AArch64 high-vector "long" operations are formed by performing the non-high
-// version on an extract_subvector of each operand which gets the high half:
-//
-//  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
-//
-// However, there are cases which don't have an extract_high explicitly, but
-// have another operation that can be made compatible with one for free. For
-// example:
-//
-//  (dupv64 scalar) --> (extract_high (dup128 scalar))
-//
-// This routine does the actual conversion of such DUPs, once outer routines
-// have determined that everything else is in order.
-static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
-  // We can handle most types of duplicate, but the lane ones have an extra
-  // operand saying *which* lane, so we need to know.
-  bool IsDUPLANE;
-  switch (N.getOpcode()) {
-  case ARM64ISD::DUP:
-    IsDUPLANE = false;
-    break;
-  case ARM64ISD::DUPLANE8:
-  case ARM64ISD::DUPLANE16:
-  case ARM64ISD::DUPLANE32:
-  case ARM64ISD::DUPLANE64:
-    IsDUPLANE = true;
-    break;
-  default:
-    return SDValue();
-  }
-
-  MVT NarrowTy = N.getSimpleValueType();
-  if (!NarrowTy.is64BitVector())
-    return SDValue();
-
-  MVT ElementTy = NarrowTy.getVectorElementType();
-  unsigned NumElems = NarrowTy.getVectorNumElements();
-  MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2);
-
-  SDValue NewDUP;
-  if (IsDUPLANE)
-    NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0),
-                         N.getOperand(1));
-  else
-    NewDUP = DAG.getNode(ARM64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0));
-
-  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy,
-                     NewDUP, DAG.getConstant(NumElems, MVT::i64));
-}
-
-static bool isEssentiallyExtractSubvector(SDValue N) {
-  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
-    return true;
-
-  return N.getOpcode() == ISD::BITCAST &&
-         N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
-}
-
-/// \brief Helper structure to keep track of ISD::SET_CC operands.
-struct GenericSetCCInfo {
-  const SDValue *Opnd0;
-  const SDValue *Opnd1;
-  ISD::CondCode CC;
-};
-
-/// \brief Helper structure to keep track of a SET_CC lowered into ARM64 code.
-struct ARM64SetCCInfo {
-  const SDValue *Cmp;
-  ARM64CC::CondCode CC;
-};
-
-/// \brief Helper structure to keep track of SetCC information.
-union SetCCInfo {
-  GenericSetCCInfo Generic;
-  ARM64SetCCInfo ARM64;
-};
-
-/// \brief Helper structure to be able to read SetCC information.
-/// If set to true, IsARM64 field, Info is a ARM64SetCCInfo, otherwise Info is
-/// a GenericSetCCInfo.
-struct SetCCInfoAndKind {
-  SetCCInfo Info;
-  bool IsARM64;
-};
-
-/// \brief Check whether or not \p Op is a SET_CC operation, either a generic
-/// or an ARM64 lowered one.
-/// \p SetCCInfo is filled accordingly.
-/// \post SetCCInfo is meaningful only when this function returns true.
-/// \return True when Op is a kind of SET_CC operation.
-static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
-  // If this is a setcc, this is straightforward.
-  if (Op.getOpcode() == ISD::SETCC) {
-    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
-    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
-    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
-    SetCCInfo.IsARM64 = false;
-    return true;
-  }
-  // Otherwise, check if this is a matching csel instruction.
-  // In other words:
-  // - csel 1, 0, cc
-  // - csel 0, 1, !cc
-  if (Op.getOpcode() != ARM64ISD::CSEL)
-    return false;
-  // Set the information about the operands.
-  // TODO: we want the operands of the Cmp not the csel
-  SetCCInfo.Info.ARM64.Cmp = &Op.getOperand(3);
-  SetCCInfo.IsARM64 = true;
-  SetCCInfo.Info.ARM64.CC = static_cast<ARM64CC::CondCode>(
-      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
-
-  // Check that the operands matches the constraints:
-  // (1) Both operands must be constants.
-  // (2) One must be 1 and the other must be 0.
-  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
-  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
-
-  // Check (1).
-  if (!TValue || !FValue)
-    return false;
-
-  // Check (2).
-  if (!TValue->isOne()) {
-    // Update the comparison when we are interested in !cc.
-    std::swap(TValue, FValue);
-    SetCCInfo.Info.ARM64.CC =
-        ARM64CC::getInvertedCondCode(SetCCInfo.Info.ARM64.CC);
-  }
-  return TValue->isOne() && FValue->isNullValue();
-}
-
-// The folding we want to perform is:
-// (add x, (setcc cc ...) )
-//   -->
-// (csel x, (add x, 1), !cc ...)
-//
-// The latter will get matched to a CSINC instruction.
-static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
-  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
-  SDValue LHS = Op->getOperand(0);
-  SDValue RHS = Op->getOperand(1);
-  SetCCInfoAndKind InfoAndKind;
-
-  // If neither operand is a SET_CC, give up.
-  if (!isSetCC(LHS, InfoAndKind)) {
-    std::swap(LHS, RHS);
-    if (!isSetCC(LHS, InfoAndKind))
-      return SDValue();
-  }
-
-  // FIXME: This could be generalized to work for FP comparisons.
-  EVT CmpVT = InfoAndKind.IsARM64
-                  ? InfoAndKind.Info.ARM64.Cmp->getOperand(0).getValueType()
-                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
-  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
-    return SDValue();
-
-  SDValue CCVal;
-  SDValue Cmp;
-  SDLoc dl(Op);
-  if (InfoAndKind.IsARM64) {
-    CCVal = DAG.getConstant(
-        ARM64CC::getInvertedCondCode(InfoAndKind.Info.ARM64.CC), MVT::i32);
-    Cmp = *InfoAndKind.Info.ARM64.Cmp;
-  } else
-    Cmp = getARM64Cmp(*InfoAndKind.Info.Generic.Opnd0,
-                      *InfoAndKind.Info.Generic.Opnd1,
-                      ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
-                      CCVal, DAG, dl);
-
-  EVT VT = Op->getValueType(0);
-  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT));
-  return DAG.getNode(ARM64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
-}
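The scalar identity behind the CSINC folding above: adding a boolean comparison result to x is the same as selecting between x and x+1, which is what a single CSINC provides. A small equivalence check:

#include <cassert>

static int addSetcc(int X, int A, int B) {
  return X + (A < B);         // (add x, (setcc lt a, b))
}
static int csinc(int X, int A, int B) {
  return (A < B) ? X + 1 : X; // (csel (add x, 1), x, cc), i.e. CSINC
}

int main() {
  for (int A = -2; A <= 2; ++A)
    for (int B = -2; B <= 2; ++B)
      assert(addSetcc(7, A, B) == csinc(7, A, B));
}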
-
-// The basic add/sub long vector instructions have variants with "2" on the end
-// which act on the high-half of their inputs. They are normally matched by
-// patterns like:
-//
-// (add (zeroext (extract_high LHS)),
-//      (zeroext (extract_high RHS)))
-// -> uaddl2 vD, vN, vM
-//
-// However, if one of the extracts is something like a duplicate, this
-// instruction can still be used profitably. This function puts the DAG into a
-// more appropriate form for those patterns to trigger.
-static SDValue performAddSubLongCombine(SDNode *N,
-                                        TargetLowering::DAGCombinerInfo &DCI,
-                                        SelectionDAG &DAG) {
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  MVT VT = N->getSimpleValueType(0);
-  if (!VT.is128BitVector()) {
-    if (N->getOpcode() == ISD::ADD)
-      return performSetccAddFolding(N, DAG);
-    return SDValue();
-  }
-
-  // Make sure both branches are extended in the same way.
-  SDValue LHS = N->getOperand(0);
-  SDValue RHS = N->getOperand(1);
-  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
-       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
-      LHS.getOpcode() != RHS.getOpcode())
-    return SDValue();
-
-  unsigned ExtType = LHS.getOpcode();
-
-  // It's not worth doing if at least one of the inputs isn't already an
-  // extract, but we don't know which it'll be so we have to try both.
-  if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
-    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
-    if (!RHS.getNode())
-      return SDValue();
-
-    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
-  } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
-    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
-    if (!LHS.getNode())
-      return SDValue();
-
-    LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
-  }
-
-  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
-}
-
-// Massage DAGs which we can use the high-half "long" operations on into
-// something isel will recognize better. E.g.
-//
-// (arm64_neon_umull (extract_high vec) (dupv64 scalar)) -->
-//   (arm64_neon_umull (extract_high (v2i64 vec)))
-//                     (extract_high (v2i64 (dup128 scalar)))))
-//
-static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
-                                       TargetLowering::DAGCombinerInfo &DCI,
-                                       SelectionDAG &DAG) {
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  SDValue LHS = N->getOperand(1);
-  SDValue RHS = N->getOperand(2);
-  assert(LHS.getValueType().is64BitVector() &&
-         RHS.getValueType().is64BitVector() &&
-         "unexpected shape for long operation");
-
-  // Either node could be a DUP, but it's not worth doing both of them (you'd
-  // just as well use the non-high version) so look for a corresponding extract
-  // operation on the other "wing".
-  if (isEssentiallyExtractSubvector(LHS)) {
-    RHS = tryExtendDUPToExtractHigh(RHS, DAG);
-    if (!RHS.getNode())
-      return SDValue();
-  } else if (isEssentiallyExtractSubvector(RHS)) {
-    LHS = tryExtendDUPToExtractHigh(LHS, DAG);
-    if (!LHS.getNode())
-      return SDValue();
-  }
-
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
-                     N->getOperand(0), LHS, RHS);
-}
-
-static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
-  MVT ElemTy = N->getSimpleValueType(0).getScalarType();
-  unsigned ElemBits = ElemTy.getSizeInBits();
-
-  int64_t ShiftAmount;
-  if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
-    APInt SplatValue, SplatUndef;
-    unsigned SplatBitSize;
-    bool HasAnyUndefs;
-    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
-                              HasAnyUndefs, ElemBits) ||
-        SplatBitSize != ElemBits)
-      return SDValue();
-
-    ShiftAmount = SplatValue.getSExtValue();
-  } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
-    ShiftAmount = CVN->getSExtValue();
-  } else
-    return SDValue();
-
-  unsigned Opcode;
-  bool IsRightShift;
-  switch (IID) {
-  default:
-    llvm_unreachable("Unknown shift intrinsic");
-  case Intrinsic::arm64_neon_sqshl:
-    Opcode = ARM64ISD::SQSHL_I;
-    IsRightShift = false;
-    break;
-  case Intrinsic::arm64_neon_uqshl:
-    Opcode = ARM64ISD::UQSHL_I;
-    IsRightShift = false;
-    break;
-  case Intrinsic::arm64_neon_srshl:
-    Opcode = ARM64ISD::SRSHR_I;
-    IsRightShift = true;
-    break;
-  case Intrinsic::arm64_neon_urshl:
-    Opcode = ARM64ISD::URSHR_I;
-    IsRightShift = true;
-    break;
-  case Intrinsic::arm64_neon_sqshlu:
-    Opcode = ARM64ISD::SQSHLU_I;
-    IsRightShift = false;
-    break;
-  }
-
-  if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits)
-    return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
-                       DAG.getConstant(-ShiftAmount, MVT::i32));
-  else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount <= ElemBits)
-    return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
-                       DAG.getConstant(ShiftAmount, MVT::i32));
-
-  return SDValue();
-}
-
-// The CRC32[BH] instructions ignore the high bits of their data operand. Since
-// the intrinsics must be legal and take an i32, this means there's almost
-// certainly going to be a zext in the DAG which we can eliminate.
-static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { - SDValue AndN = N->getOperand(2); - if (AndN.getOpcode() != ISD::AND) - return SDValue(); - - ConstantSDNode *CMask = dyn_cast(AndN.getOperand(1)); - if (!CMask || CMask->getZExtValue() != Mask) - return SDValue(); - - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, - N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); -} - -static SDValue performIntrinsicCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARM64Subtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; - unsigned IID = getIntrinsicID(N); - switch (IID) { - default: - break; - case Intrinsic::arm64_neon_vcvtfxs2fp: - case Intrinsic::arm64_neon_vcvtfxu2fp: - return tryCombineFixedPointConvert(N, DCI, DAG); - break; - case Intrinsic::arm64_neon_fmax: - return DAG.getNode(ARM64ISD::FMAX, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2)); - case Intrinsic::arm64_neon_fmin: - return DAG.getNode(ARM64ISD::FMIN, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2)); - case Intrinsic::arm64_neon_smull: - case Intrinsic::arm64_neon_umull: - case Intrinsic::arm64_neon_pmull: - case Intrinsic::arm64_neon_sqdmull: - return tryCombineLongOpWithDup(IID, N, DCI, DAG); - case Intrinsic::arm64_neon_sqshl: - case Intrinsic::arm64_neon_uqshl: - case Intrinsic::arm64_neon_sqshlu: - case Intrinsic::arm64_neon_srshl: - case Intrinsic::arm64_neon_urshl: - return tryCombineShiftImm(IID, N, DAG); - case Intrinsic::arm64_crc32b: - case Intrinsic::arm64_crc32cb: - return tryCombineCRC32(0xff, N, DAG); - case Intrinsic::arm64_crc32h: - case Intrinsic::arm64_crc32ch: - return tryCombineCRC32(0xffff, N, DAG); - } - return SDValue(); -} - -static SDValue performExtendCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then - // we can convert that DUP into another extract_high (of a bigger DUP), which - // helps the backend to decide that an sabdl2 would be useful, saving a real - // extract_high operation. - if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && - N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { - SDNode *ABDNode = N->getOperand(0).getNode(); - unsigned IID = getIntrinsicID(ABDNode); - if (IID == Intrinsic::arm64_neon_sabd || - IID == Intrinsic::arm64_neon_uabd) { - SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); - if (!NewABD.getNode()) - return SDValue(); - - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), - NewABD); - } - } - - // This is effectively a custom type legalization for ARM64. - // - // Type legalization will split an extend of a small, legal, type to a larger - // illegal type by first splitting the destination type, often creating - // illegal source types, which then get legalized in isel-confusing ways, - // leading to really terrible codegen. E.g., - // %result = v8i32 sext v8i8 %value - // becomes - // %losrc = extract_subreg %value, ... - // %hisrc = extract_subreg %value, ... - // %lo = v4i32 sext v4i8 %losrc - // %hi = v4i32 sext v4i8 %hisrc - // Things go rapidly downhill from there. - // - // For ARM64, the [sz]ext vector instructions can only go up one element - // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 - // take two instructions. 
- // - // This implies that the most efficient way to do the extend from v8i8 - // to two v4i32 values is to first extend the v8i8 to v8i16, then do - // the normal splitting to happen for the v8i16->v8i32. - - // This is pre-legalization to catch some cases where the default - // type legalization will create ill-tempered code. - if (!DCI.isBeforeLegalizeOps()) - return SDValue(); - - // We're only interested in cleaning things up for non-legal vector types - // here. If both the source and destination are legal, things will just - // work naturally without any fiddling. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT ResVT = N->getValueType(0); - if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) - return SDValue(); - // If the vector type isn't a simple VT, it's beyond the scope of what - // we're worried about here. Let legalization do its thing and hope for - // the best. - if (!ResVT.isSimple()) - return SDValue(); - - SDValue Src = N->getOperand(0); - MVT SrcVT = Src->getValueType(0).getSimpleVT(); - // If the source VT is a 64-bit vector, we can play games and get the - // better results we want. - if (SrcVT.getSizeInBits() != 64) - return SDValue(); - - unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); - unsigned ElementCount = SrcVT.getVectorNumElements(); - SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); - SDLoc DL(N); - Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); - - // Now split the rest of the operation into two halves, each with a 64 - // bit source. - EVT LoVT, HiVT; - SDValue Lo, Hi; - unsigned NumElements = ResVT.getVectorNumElements(); - assert(!(NumElements & 1) && "Splitting vector, but not in half!"); - LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), - ResVT.getVectorElementType(), NumElements / 2); - - EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), - LoVT.getVectorNumElements()); - Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getIntPtrConstant(0)); - Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getIntPtrConstant(InNVT.getVectorNumElements())); - Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); - Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); - - // Now combine the parts back together so we still have a single result - // like the combiner expects. - return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); -} - -/// Replace a splat of a scalar to a vector store by scalar stores of the scalar -/// value. The load store optimizer pass will merge them to store pair stores. -/// This has better performance than a splat of the scalar followed by a split -/// vector store. Even if the stores are not merged it is four stores vs a dup, -/// followed by an ext.b and two stores. -static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { - SDValue StVal = St->getValue(); - EVT VT = StVal.getValueType(); - - // Don't replace floating point stores, they possibly won't be transformed to - // stp because of the store pair suppress pass. - if (VT.isFloatingPoint()) - return SDValue(); - - // Check for insert vector elements. - if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) - return SDValue(); - - // We can express a splat as store pair(s) for 2 or 4 elements. - unsigned NumVecElts = VT.getVectorNumElements(); - if (NumVecElts != 4 && NumVecElts != 2) - return SDValue(); - SDValue SplatVal = StVal.getOperand(1); - unsigned RemainInsertElts = NumVecElts - 1; - - // Check that this is a splat. 
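// [Editor's note: standalone model of the check performed below, not part of
// the original patch. Walking the insert_vector_elt chain amounts to
// verifying that every lane holds the same value:]
static bool isSplatModel(const int *Lanes, unsigned NumLanes) {
  for (unsigned I = 1; I < NumLanes; ++I)
    if (Lanes[I] != Lanes[0])
      return false; // one differing lane means this is not a splat
  return true;
}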
-  while (--RemainInsertElts) {
-    SDValue NextInsertElt = StVal.getOperand(0);
-    if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
-      return SDValue();
-    if (NextInsertElt.getOperand(1) != SplatVal)
-      return SDValue();
-    StVal = NextInsertElt;
-  }
-  unsigned OrigAlignment = St->getAlignment();
-  unsigned EltOffset = NumVecElts == 4 ? 4 : 8;
-  unsigned Alignment = std::min(OrigAlignment, EltOffset);
-
-  // Create scalar stores. This is at least as good as the code sequence for a
-  // split unaligned store which is a dup.s, ext.b, and two stores.
-  // Most of the time the three stores should be replaced by store pair
-  // instructions (stp).
-  SDLoc DL(St);
-  SDValue BasePtr = St->getBasePtr();
-  SDValue NewST1 =
-      DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
-                   St->isVolatile(), St->isNonTemporal(), St->getAlignment());
-
-  unsigned Offset = EltOffset;
-  while (--NumVecElts) {
-    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
-                                    DAG.getConstant(Offset, MVT::i64));
-    NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
-                          St->getPointerInfo(), St->isVolatile(),
-                          St->isNonTemporal(), Alignment);
-    Offset += EltOffset;
-  }
-  return NewST1;
-}
-
-static SDValue performSTORECombine(SDNode *N,
-                                   TargetLowering::DAGCombinerInfo &DCI,
-                                   SelectionDAG &DAG,
-                                   const ARM64Subtarget *Subtarget) {
-  if (!DCI.isBeforeLegalize())
-    return SDValue();
-
-  StoreSDNode *S = cast<StoreSDNode>(N);
-  if (S->isVolatile())
-    return SDValue();
-
-  // Cyclone has bad performance on unaligned 16B stores when crossing line and
-  // page boundaries. We want to split such stores.
-  if (!Subtarget->isCyclone())
-    return SDValue();
-
-  // Don't split at Oz.
-  MachineFunction &MF = DAG.getMachineFunction();
-  bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute(
-      AttributeSet::FunctionIndex, Attribute::MinSize);
-  if (IsMinSize)
-    return SDValue();
-
-  SDValue StVal = S->getValue();
-  EVT VT = StVal.getValueType();
-
-  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
-  // those up regresses performance on micro-benchmarks and olden/bh.
-  if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
-    return SDValue();
-
-  // Split unaligned 16B stores. They are terrible for performance.
-  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
-  // extensions can use this to mark that it does not want splitting to happen
-  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
-  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
-  if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
-      S->getAlignment() <= 2)
-    return SDValue();
-
-  // If we get a splat of a scalar, convert this vector store to a store of
-  // scalars. They will be merged into store pairs thereby removing two
-  // instructions.
-  SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S);
-  if (ReplacedSplat != SDValue())
-    return ReplacedSplat;
-
-  SDLoc DL(S);
-  unsigned NumElts = VT.getVectorNumElements() / 2;
-  // Split VT into two (see the editorial sketch below).
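// [Editor's note: minimal model of the transformation emitted below, not part
// of the original patch - one 16-byte vector store becomes two 8-byte stores
// at offsets 0 and 8, which the load/store optimizer can later pair:]
#include <cstring>
static void split16ByteStoreModel(const unsigned char Src[16],
                                  unsigned char *Dst) {
  std::memcpy(Dst, Src, 8);         // first half, offset 0
  std::memcpy(Dst + 8, Src + 8, 8); // second half, offset 8
}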
- EVT HalfVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); - SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getIntPtrConstant(0)); - SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getIntPtrConstant(NumElts)); - SDValue BasePtr = S->getBasePtr(); - SDValue NewST1 = - DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), - S->isVolatile(), S->isNonTemporal(), S->getAlignment()); - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(8, MVT::i64)); - return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, - S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), - S->getAlignment()); -} - -// Optimize compare with zero and branch. -static SDValue performBRCONDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - SDValue Chain = N->getOperand(0); - SDValue Dest = N->getOperand(1); - SDValue CCVal = N->getOperand(2); - SDValue Cmp = N->getOperand(3); - - assert(isa(CCVal) && "Expected a ConstantSDNode here!"); - unsigned CC = cast(CCVal)->getZExtValue(); - if (CC != ARM64CC::EQ && CC != ARM64CC::NE) - return SDValue(); - - unsigned CmpOpc = Cmp.getOpcode(); - if (CmpOpc != ARM64ISD::ADDS && CmpOpc != ARM64ISD::SUBS) - return SDValue(); - - // Only attempt folding if there is only one use of the flag and no use of the - // value. - if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) - return SDValue(); - - SDValue LHS = Cmp.getOperand(0); - SDValue RHS = Cmp.getOperand(1); - - assert(LHS.getValueType() == RHS.getValueType() && - "Expected the value type to be the same for both operands!"); - if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) - return SDValue(); - - if (isa(LHS) && cast(LHS)->isNullValue()) - std::swap(LHS, RHS); - - if (!isa(RHS) || !cast(RHS)->isNullValue()) - return SDValue(); - - if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || - LHS.getOpcode() == ISD::SRL) - return SDValue(); - - // Fold the compare into the branch instruction. - SDValue BR; - if (CC == ARM64CC::EQ) - BR = DAG.getNode(ARM64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); - else - BR = DAG.getNode(ARM64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); - - // Do not add new nodes to DAG combiner worklist. 
-  DCI.CombineTo(N, BR, false);
-
-  return SDValue();
-}
-
-SDValue ARM64TargetLowering::PerformDAGCombine(SDNode *N,
-                                               DAGCombinerInfo &DCI) const {
-  SelectionDAG &DAG = DCI.DAG;
-  switch (N->getOpcode()) {
-  default:
-    break;
-  case ISD::ADD:
-  case ISD::SUB:
-    return performAddSubLongCombine(N, DCI, DAG);
-  case ISD::XOR:
-    return performXorCombine(N, DAG, DCI, Subtarget);
-  case ISD::MUL:
-    return performMulCombine(N, DAG, DCI, Subtarget);
-  case ISD::SINT_TO_FP:
-  case ISD::UINT_TO_FP:
-    return performIntToFpCombine(N, DAG);
-  case ISD::OR:
-    return performORCombine(N, DCI, Subtarget);
-  case ISD::INTRINSIC_WO_CHAIN:
-    return performIntrinsicCombine(N, DCI, Subtarget);
-  case ISD::ANY_EXTEND:
-  case ISD::ZERO_EXTEND:
-  case ISD::SIGN_EXTEND:
-    return performExtendCombine(N, DCI, DAG);
-  case ISD::BITCAST:
-    return performBitcastCombine(N, DCI, DAG);
-  case ISD::CONCAT_VECTORS:
-    return performConcatVectorsCombine(N, DCI, DAG);
-  case ISD::STORE:
-    return performSTORECombine(N, DCI, DAG, Subtarget);
-  case ARM64ISD::BRCOND:
-    return performBRCONDCombine(N, DCI, DAG);
-  }
-  return SDValue();
-}
-
-// Check if the return value is used as only a return value, as otherwise
-// we can't perform a tail-call. In particular, we need to check for
-// target ISD nodes that are returns and any other "odd" constructs
-// that the generic analysis code won't necessarily catch.
-bool ARM64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
-  if (N->getNumValues() != 1)
-    return false;
-  if (!N->hasNUsesOfValue(1, 0))
-    return false;
-
-  SDValue TCChain = Chain;
-  SDNode *Copy = *N->use_begin();
-  if (Copy->getOpcode() == ISD::CopyToReg) {
-    // If the copy has a glue operand, we conservatively assume it isn't safe to
-    // perform a tail call.
-    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
-        MVT::Glue)
-      return false;
-    TCChain = Copy->getOperand(0);
-  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
-    return false;
-
-  bool HasRet = false;
-  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
-       UI != UE; ++UI) {
-    if (UI->getOpcode() != ARM64ISD::RET_FLAG)
-      return false;
-    HasRet = true;
-  }
-
-  if (!HasRet)
-    return false;
-
-  Chain = TCChain;
-  return true;
-}
-
-// Return whether an instruction can potentially be optimized to a tail
-// call. This will cause the optimizers to attempt to move, or duplicate,
-// return instructions to help enable tail call optimizations for this
-// instruction.
-bool ARM64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
-  if (!EnableARM64TailCalls)
-    return false;
-
-  if (!CI->isTailCall())
-    return false;
-
-  return true;
-}
-
-bool ARM64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
-                                                 SDValue &Offset,
-                                                 ISD::MemIndexedMode &AM,
-                                                 bool &IsInc,
-                                                 SelectionDAG &DAG) const {
-  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
-    return false;
-
-  Base = Op->getOperand(0);
-  // All of the indexed addressing mode instructions take a signed
-  // 9-bit immediate offset.
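// [Editor's note: for reference, the architectural range is a signed 9-bit
// immediate, i.e. [-256, 255]; the guard below is marginally conservative in
// that it also rejects -256. A hypothetical standalone check:]
static inline bool isSigned9BitOffsetModel(long long Off) {
  return Off >= -256 && Off <= 255; // full signed 9-bit range
}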
- if (ConstantSDNode *RHS = dyn_cast(Op->getOperand(1))) { - int64_t RHSC = (int64_t)RHS->getZExtValue(); - if (RHSC >= 256 || RHSC <= -256) - return false; - IsInc = (Op->getOpcode() == ISD::ADD); - Offset = Op->getOperand(1); - return true; - } - return false; -} - -bool ARM64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, - SDValue &Offset, - ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const { - EVT VT; - SDValue Ptr; - if (LoadSDNode *LD = dyn_cast(N)) { - VT = LD->getMemoryVT(); - Ptr = LD->getBasePtr(); - } else if (StoreSDNode *ST = dyn_cast(N)) { - VT = ST->getMemoryVT(); - Ptr = ST->getBasePtr(); - } else - return false; - - bool IsInc; - if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) - return false; - AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; - return true; -} - -bool ARM64TargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, - SDValue &Base, - SDValue &Offset, - ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const { - EVT VT; - SDValue Ptr; - if (LoadSDNode *LD = dyn_cast(N)) { - VT = LD->getMemoryVT(); - Ptr = LD->getBasePtr(); - } else if (StoreSDNode *ST = dyn_cast(N)) { - VT = ST->getMemoryVT(); - Ptr = ST->getBasePtr(); - } else - return false; - - bool IsInc; - if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) - return false; - // Post-indexing updates the base, so it's not a valid transform - // if that's not the same as the load's pointer. - if (Ptr != Base) - return false; - AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; - return true; -} - -/// The only 128-bit atomic operation is an stxp that succeeds. In particular -/// neither ldp nor ldxp are atomic. So the canonical sequence for an atomic -/// load is: -/// loop: -/// ldxp x0, x1, [x8] -/// stxp w2, x0, x1, [x8] -/// cbnz w2, loop -/// If the stxp succeeds then the ldxp managed to get both halves without an -/// intervening stxp from a different thread and the read was atomic. -static void ReplaceATOMIC_LOAD_128(SDNode *N, SmallVectorImpl &Results, - SelectionDAG &DAG) { - SDLoc DL(N); - AtomicSDNode *AN = cast(N); - EVT VT = AN->getMemoryVT(); - SDValue Zero = DAG.getConstant(0, VT); - - // FIXME: Really want ATOMIC_LOAD_NOP but that doesn't fit into the existing - // scheme very well. Given the complexity of what we're already generating, an - // extra couple of ORRs probably won't make much difference. 
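// [Editor's note: sketch of the identity the code below relies on, not part
// of the original patch: x | 0 == x, so an atomic OR with zero reads the
// location atomically without modifying it. A 64-bit C++ model using the
// GCC/Clang builtin (the code below does the equivalent at 128 bits):]
static unsigned long long atomicLoadViaOrModel(unsigned long long *P) {
  // Returns the old value; stores back *P | 0, which equals *P.
  return __atomic_fetch_or(P, 0ULL, __ATOMIC_SEQ_CST);
}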
- SDValue Result = DAG.getAtomic(ISD::ATOMIC_LOAD_OR, DL, AN->getMemoryVT(), - N->getOperand(0), N->getOperand(1), Zero, - AN->getMemOperand(), AN->getOrdering(), - AN->getSynchScope()); - - Results.push_back(Result.getValue(0)); // Value - Results.push_back(Result.getValue(1)); // Chain -} - -static void ReplaceATOMIC_OP_128(SDNode *N, SmallVectorImpl &Results, - SelectionDAG &DAG, unsigned NewOp) { - SDLoc DL(N); - AtomicOrdering Ordering = cast(N)->getOrdering(); - assert(N->getValueType(0) == MVT::i128 && - "Only know how to expand i128 atomics"); - - SmallVector Ops; - Ops.push_back(N->getOperand(1)); // Ptr - // Low part of Val1 - Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, - N->getOperand(2), DAG.getIntPtrConstant(0))); - // High part of Val1 - Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, - N->getOperand(2), DAG.getIntPtrConstant(1))); - if (NewOp == ARM64::ATOMIC_CMP_SWAP_I128) { - // Low part of Val2 - Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, - N->getOperand(3), DAG.getIntPtrConstant(0))); - // High part of Val2 - Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, - N->getOperand(3), DAG.getIntPtrConstant(1))); - } - - Ops.push_back(DAG.getTargetConstant(Ordering, MVT::i32)); - Ops.push_back(N->getOperand(0)); // Chain - - SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other); - SDNode *Result = DAG.getMachineNode(NewOp, DL, Tys, Ops); - SDValue OpsF[] = { SDValue(Result, 0), SDValue(Result, 1) }; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, OpsF, 2)); - Results.push_back(SDValue(Result, 2)); -} - -void ARM64TargetLowering::ReplaceNodeResults(SDNode *N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const { - switch (N->getOpcode()) { - default: - llvm_unreachable("Don't know how to custom expand this"); - case ISD::ATOMIC_LOAD: - ReplaceATOMIC_LOAD_128(N, Results, DAG); - return; - case ISD::ATOMIC_LOAD_ADD: - ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_ADD_I128); - return; - case ISD::ATOMIC_LOAD_SUB: - ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_SUB_I128); - return; - case ISD::ATOMIC_LOAD_AND: - ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_AND_I128); - return; - case ISD::ATOMIC_LOAD_OR: - ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_OR_I128); - return; - case ISD::ATOMIC_LOAD_XOR: - ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_XOR_I128); - return; - case ISD::ATOMIC_LOAD_NAND: - ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_NAND_I128); - return; - case ISD::ATOMIC_SWAP: - ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_SWAP_I128); - return; - case ISD::ATOMIC_LOAD_MIN: - ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_MIN_I128); - return; - case ISD::ATOMIC_LOAD_MAX: - ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_MAX_I128); - return; - case ISD::ATOMIC_LOAD_UMIN: - ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_UMIN_I128); - return; - case ISD::ATOMIC_LOAD_UMAX: - ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_UMAX_I128); - return; - case ISD::ATOMIC_CMP_SWAP: - ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_CMP_SWAP_I128); - return; - case ISD::FP_TO_UINT: - case ISD::FP_TO_SINT: - assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); - // Let normal code take care of it by not adding anything to Results. 
- return; - } -} diff --git a/lib/Target/ARM64/ARM64ISelLowering.h b/lib/Target/ARM64/ARM64ISelLowering.h deleted file mode 100644 index a4664ac..0000000 --- a/lib/Target/ARM64/ARM64ISelLowering.h +++ /dev/null @@ -1,422 +0,0 @@ -//==-- ARM64ISelLowering.h - ARM64 DAG Lowering Interface --------*- C++ -*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the interfaces that ARM64 uses to lower LLVM code into a -// selection DAG. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64_ISELLOWERING_H -#define LLVM_TARGET_ARM64_ISELLOWERING_H - -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/Target/TargetLowering.h" - -namespace llvm { - -namespace ARM64ISD { - -enum { - FIRST_NUMBER = ISD::BUILTIN_OP_END, - WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. - CALL, // Function call. - - // Almost the same as a normal call node, except that a TLSDesc relocation is - // needed so the linker can relax it correctly if possible. - TLSDESC_CALL, - ADRP, // Page address of a TargetGlobalAddress operand. - ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. - LOADgot, // Load from automatically generated descriptor (e.g. Global - // Offset Table, TLS record). - RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand. - BRCOND, // Conditional branch instruction; "b.cond". - CSEL, - FCSEL, // Conditional move instruction. - CSINV, // Conditional select invert. - CSNEG, // Conditional select negate. - CSINC, // Conditional select increment. - - // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on - // ELF. - THREAD_POINTER, - ADC, - SBC, // adc, sbc instructions - - // Arithmetic instructions which write flags. - ADDS, - SUBS, - ADCS, - SBCS, - ANDS, - - // Floating point comparison - FCMP, - - // Floating point max and min instructions. - FMAX, - FMIN, - - // Scalar extract - EXTR, - - // Scalar-to-vector duplication - DUP, - DUPLANE8, - DUPLANE16, - DUPLANE32, - DUPLANE64, - - // Vector immedate moves - MOVI, - MOVIshift, - MOVIedit, - MOVImsl, - FMOV, - MVNIshift, - MVNImsl, - - // Vector immediate ops - BICi, - ORRi, - - // Vector arithmetic negation - NEG, - - // Vector shuffles - ZIP1, - ZIP2, - UZP1, - UZP2, - TRN1, - TRN2, - REV16, - REV32, - REV64, - EXT, - - // Vector shift by scalar - VSHL, - VLSHR, - VASHR, - - // Vector shift by scalar (again) - SQSHL_I, - UQSHL_I, - SQSHLU_I, - SRSHR_I, - URSHR_I, - - // Vector comparisons - CMEQ, - CMGE, - CMGT, - CMHI, - CMHS, - FCMEQ, - FCMGE, - FCMGT, - - // Vector zero comparisons - CMEQz, - CMGEz, - CMGTz, - CMLEz, - CMLTz, - FCMEQz, - FCMGEz, - FCMGTz, - FCMLEz, - FCMLTz, - - // Vector bitwise negation - NOT, - - // Vector bitwise selection - BIT, - - // Compare-and-branch - CBZ, - CBNZ, - TBZ, - TBNZ, - - // Tail calls - TC_RETURN, - - // Custom prefetch handling - PREFETCH, - - // {s|u}int to FP within a FP register. 
-  SITOF,
-  UITOF
-};
-
-} // end namespace ARM64ISD
-
-class ARM64Subtarget;
-class ARM64TargetMachine;
-
-class ARM64TargetLowering : public TargetLowering {
-  bool RequireStrictAlign;
-
-public:
-  explicit ARM64TargetLowering(ARM64TargetMachine &TM);
-
-  /// Selects the correct CCAssignFn for the given CallingConvention
-  /// value.
-  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
-
-  /// computeMaskedBitsForTargetNode - Determine which of the bits specified in
-  /// Mask are known to be either zero or one and return them in the
-  /// KnownZero/KnownOne bitsets.
-  void computeMaskedBitsForTargetNode(const SDValue Op, APInt &KnownZero,
-                                      APInt &KnownOne, const SelectionDAG &DAG,
-                                      unsigned Depth = 0) const;
-
-  MVT getScalarShiftAmountTy(EVT LHSTy) const override;
-
-  /// allowsUnalignedMemoryAccesses - Returns true if the target allows
-  /// unaligned memory accesses of the specified type.
-  bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
-                                     bool *Fast = 0) const override {
-    if (RequireStrictAlign)
-      return false;
-    // FIXME: True for Cyclone, but not necessarily others.
-    if (Fast)
-      *Fast = true;
-    return true;
-  }
-
-  /// LowerOperation - Provide custom lowering hooks for some operations.
-  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
-
-  const char *getTargetNodeName(unsigned Opcode) const override;
-
-  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
-
-  /// getFunctionAlignment - Return the Log2 alignment of this function.
-  unsigned getFunctionAlignment(const Function *F) const;
-
-  /// getMaximalGlobalOffset - Returns the maximal possible offset which can
-  /// be used for loads / stores from the global.
-  unsigned getMaximalGlobalOffset() const override;
-
-  /// Returns true if a cast between SrcAS and DestAS is a noop.
-  bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
-    // Addrspacecasts are always noops.
-    return true;
-  }
-
-  /// createFastISel - This method returns a target specific FastISel object,
-  /// or null if the target does not support "fast" ISel.
-  FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
-                           const TargetLibraryInfo *libInfo) const override;
-
-  bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
-
-  bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
-
-  /// isShuffleMaskLegal - Return true if the given shuffle mask can be
-  /// codegen'd directly, or if it should be stack expanded.
- bool isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const override; - - /// getSetCCResultType - Return the ISD::SETCC ValueType - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; - - SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; - - MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, - unsigned Size, unsigned BinOpcode) const; - MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *BB, - unsigned Size) const; - MachineBasicBlock *EmitAtomicBinary128(MachineInstr *MI, - MachineBasicBlock *BB, - unsigned BinOpcodeLo, - unsigned BinOpcodeHi) const; - MachineBasicBlock *EmitAtomicCmpSwap128(MachineInstr *MI, - MachineBasicBlock *BB) const; - MachineBasicBlock *EmitAtomicMinMax128(MachineInstr *MI, - MachineBasicBlock *BB, - unsigned CondCode) const; - MachineBasicBlock *EmitF128CSEL(MachineInstr *MI, - MachineBasicBlock *BB) const; - - MachineBasicBlock * - EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *MBB) const override; - - bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, - unsigned Intrinsic) const override; - - bool isTruncateFree(Type *Ty1, Type *Ty2) const override; - bool isTruncateFree(EVT VT1, EVT VT2) const override; - - bool isZExtFree(Type *Ty1, Type *Ty2) const override; - bool isZExtFree(EVT VT1, EVT VT2) const override; - bool isZExtFree(SDValue Val, EVT VT2) const override; - - bool hasPairedLoad(Type *LoadedType, - unsigned &RequiredAligment) const override; - bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; - - bool isLegalAddImmediate(int64_t) const override; - bool isLegalICmpImmediate(int64_t) const override; - - EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const override; - - /// isLegalAddressingMode - Return true if the addressing mode represented - /// by AM is legal for this target, for a load/store of the specified type. - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; - - /// \brief Return the cost of the scaling factor used in the addressing - /// mode represented by AM for this target, for a load/store - /// of the specified type. - /// If the AM is supported, the return value must be >= 0. - /// If the AM is not supported, it returns a negative value. - int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override; - - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster - /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be - /// expanded to FMAs when this method returns true, otherwise fmuladd is - /// expanded to fmul + fadd. - bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; - - const uint16_t *getScratchRegisters(CallingConv::ID CC) const override; - - bool shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const override; - -private: - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when generating code for different targets. 
- const ARM64Subtarget *Subtarget; - - void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT); - void addDRTypeForNEON(MVT VT); - void addQRTypeForNEON(MVT VT); - - SDValue - LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, - SelectionDAG &DAG, - SmallVectorImpl &InVals) const override; - - SDValue LowerCall(CallLoweringInfo & /*CLI*/, - SmallVectorImpl &InVals) const override; - - SDValue LowerCallResult(SDValue Chain, SDValue InFlag, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, - SelectionDAG &DAG, SmallVectorImpl &InVals, - bool isThisReturn, SDValue ThisVal) const; - - bool isEligibleForTailCallOptimization( - SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, SelectionDAG &DAG) const; - - void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, - SDValue &Chain) const; - - bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, - bool isVarArg, - const SmallVectorImpl &Outs, - LLVMContext &Context) const override; - - SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, SDLoc DL, - SelectionDAG &DAG) const override; - - SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL, - SelectionDAG &DAG) const; - SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const; - SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG 
&DAG) const;
-  SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
-
-  ConstraintType getConstraintType(const std::string &Constraint) const;
-
-  /// Examine constraint string and operand type and determine a weight value.
-  /// The operand object must already have been set up with the operand type.
-  ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info,
-                                                  const char *constraint) const;
-
-  std::pair<unsigned, const TargetRegisterClass *>
-  getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
-  void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
-                                    std::vector<SDValue> &Ops,
-                                    SelectionDAG &DAG) const;
-
-  bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
-  bool mayBeEmittedAsTailCall(CallInst *CI) const;
-  bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
-                              ISD::MemIndexedMode &AM, bool &IsInc,
-                              SelectionDAG &DAG) const;
-  bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
-                                 ISD::MemIndexedMode &AM,
-                                 SelectionDAG &DAG) const;
-  bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
-                                  SDValue &Offset, ISD::MemIndexedMode &AM,
-                                  SelectionDAG &DAG) const;
-
-  void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
-                          SelectionDAG &DAG) const;
-};
-
-namespace ARM64 {
-FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
-                         const TargetLibraryInfo *libInfo);
-} // end namespace ARM64
-
-} // end namespace llvm
-
-#endif // LLVM_TARGET_ARM64_ISELLOWERING_H
diff --git a/lib/Target/ARM64/ARM64InstrAtomics.td b/lib/Target/ARM64/ARM64InstrAtomics.td
deleted file mode 100644
index 0d36e06..0000000
--- a/lib/Target/ARM64/ARM64InstrAtomics.td
+++ /dev/null
@@ -1,293 +0,0 @@
-//===- ARM64InstrAtomics.td - ARM64 Atomic codegen support -*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// ARM64 Atomic operand code-gen constructs.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------
-// Atomic fences
-//===----------------------------------
-def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>;
-def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
-
-//===----------------------------------
-// Atomic loads
-//===----------------------------------
-
-// When they're actually atomic, only one addressing mode (GPR64sp) is
-// supported, but when they're relaxed and anything can be used, all the
-// standard modes would be valid and may give efficiency gains.
-
-// An atomic load operation that actually needs acquire semantics.
-class acquiring_load<PatFrag base>
-  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
-  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
-  assert(Ordering != AcquireRelease && "unexpected load ordering");
-  return Ordering == Acquire || Ordering == SequentiallyConsistent;
-}]>;
-
-// An atomic load operation that does not need either acquire or release
-// semantics.
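// [Editor's note: illustrative C++ model, not part of the original patch.
// In C++11 terms, the two PatFrag classes here split loads by memory order:
// acquire/seq_cst loads must select LDAR*, while monotonic (relaxed) and
// unordered loads may use any plain LDR form:]
#include <atomic>
static int loadAcquireModel(std::atomic<int> &X) {
  return X.load(std::memory_order_acquire); // covered by acquiring_load
}
static int loadRelaxedModel(std::atomic<int> &X) {
  return X.load(std::memory_order_relaxed); // covered by relaxed_load
}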
-class relaxed_load - : PatFrag<(ops node:$ptr), (base node:$ptr), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - return Ordering == Monotonic || Ordering == Unordered; -}]>; - -// 8-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; -def : Pat<(relaxed_load ro_indexed8:$addr), - (LDRBBro ro_indexed8:$addr)>; -def : Pat<(relaxed_load am_indexed8:$addr), - (LDRBBui am_indexed8:$addr)>; -def : Pat<(relaxed_load am_unscaled8:$addr), - (LDURBBi am_unscaled8:$addr)>; - -// 16-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; -def : Pat<(relaxed_load ro_indexed16:$addr), - (LDRHHro ro_indexed16:$addr)>; -def : Pat<(relaxed_load am_indexed16:$addr), - (LDRHHui am_indexed16:$addr)>; -def : Pat<(relaxed_load am_unscaled16:$addr), - (LDURHHi am_unscaled16:$addr)>; - -// 32-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>; -def : Pat<(relaxed_load ro_indexed32:$addr), - (LDRWro ro_indexed32:$addr)>; -def : Pat<(relaxed_load am_indexed32:$addr), - (LDRWui am_indexed32:$addr)>; -def : Pat<(relaxed_load am_unscaled32:$addr), - (LDURWi am_unscaled32:$addr)>; - -// 64-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>; -def : Pat<(relaxed_load ro_indexed64:$addr), - (LDRXro ro_indexed64:$addr)>; -def : Pat<(relaxed_load am_indexed64:$addr), - (LDRXui am_indexed64:$addr)>; -def : Pat<(relaxed_load am_unscaled64:$addr), - (LDURXi am_unscaled64:$addr)>; - -//===---------------------------------- -// Atomic stores -//===---------------------------------- - -// When they're actually atomic, only one addressing mode (GPR64sp) is -// supported, but when they're relaxed and anything can be used, all the -// standard modes would be valid and may give efficiency gains. - -// A store operation that actually needs release semantics. -class releasing_store - : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - assert(Ordering != AcquireRelease && "unexpected store ordering"); - return Ordering == Release || Ordering == SequentiallyConsistent; -}]>; - -// An atomic store operation that doesn't actually need to be atomic on ARM64. 
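// [Editor's note: the store-side mirror of the load model above, not part of
// the original patch - release/seq_cst stores must select STLR*, while
// relaxed ones may use plain STR forms:]
#include <atomic>
static void storeReleaseModel(std::atomic<int> &X, int V) {
  X.store(V, std::memory_order_release); // covered by releasing_store
}
static void storeRelaxedModel(std::atomic<int> &X, int V) {
  X.store(V, std::memory_order_relaxed); // covered by relaxed_store
}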
-class relaxed_store - : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - return Ordering == Monotonic || Ordering == Unordered; -}]>; - -// 8-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), - (STLRB GPR32:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store ro_indexed8:$ptr, GPR32:$val), - (STRBBro GPR32:$val, ro_indexed8:$ptr)>; -def : Pat<(relaxed_store am_indexed8:$ptr, GPR32:$val), - (STRBBui GPR32:$val, am_indexed8:$ptr)>; -def : Pat<(relaxed_store am_unscaled8:$ptr, GPR32:$val), - (STURBBi GPR32:$val, am_unscaled8:$ptr)>; - -// 16-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), - (STLRH GPR32:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store ro_indexed16:$ptr, GPR32:$val), - (STRHHro GPR32:$val, ro_indexed16:$ptr)>; -def : Pat<(relaxed_store am_indexed16:$ptr, GPR32:$val), - (STRHHui GPR32:$val, am_indexed16:$ptr)>; -def : Pat<(relaxed_store am_unscaled16:$ptr, GPR32:$val), - (STURHHi GPR32:$val, am_unscaled16:$ptr)>; - -// 32-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), - (STLRW GPR32:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store ro_indexed32:$ptr, GPR32:$val), - (STRWro GPR32:$val, ro_indexed32:$ptr)>; -def : Pat<(relaxed_store am_indexed32:$ptr, GPR32:$val), - (STRWui GPR32:$val, am_indexed32:$ptr)>; -def : Pat<(relaxed_store am_unscaled32:$ptr, GPR32:$val), - (STURWi GPR32:$val, am_unscaled32:$ptr)>; - -// 64-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR64:$val), - (STLRX GPR64:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store ro_indexed64:$ptr, GPR64:$val), - (STRXro GPR64:$val, ro_indexed64:$ptr)>; -def : Pat<(relaxed_store am_indexed64:$ptr, GPR64:$val), - (STRXui GPR64:$val, am_indexed64:$ptr)>; -def : Pat<(relaxed_store am_unscaled64:$ptr, GPR64:$val), - (STURXi GPR64:$val, am_unscaled64:$ptr)>; - -//===---------------------------------- -// Atomic read-modify-write operations -//===---------------------------------- - -// More complicated operations need lots of C++ support, so we just create -// skeletons here for the C++ code to refer to. 
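// [Editor's note: sketch of what each pseudo below eventually becomes, not
// part of the original patch. The custom inserter expands, e.g.,
// ATOMIC_LOAD_ADD into an ldxr/add/stxr retry loop; the observable behaviour
// matches this C++ model:]
#include <atomic>
static long fetchAddModel(std::atomic<long> &V, long Incr) {
  long Old = V.load(std::memory_order_relaxed);
  while (!V.compare_exchange_weak(Old, Old + Incr)) // ldxr ... add ... stxr
    ;                                               // cbnz: retry on failure
  return Old;
}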
- -let usesCustomInserter = 1, hasCtrlDep = 1, mayLoad = 1, mayStore = 1 in { -multiclass AtomicSizes { - def _I8 : Pseudo<(outs GPR32:$dst), - (ins GPR64sp:$ptr, GPR32:$incr, i32imm:$ordering), []>; - def _I16 : Pseudo<(outs GPR32:$dst), - (ins GPR64sp:$ptr, GPR32:$incr, i32imm:$ordering), []>; - def _I32 : Pseudo<(outs GPR32:$dst), - (ins GPR64sp:$ptr, GPR32:$incr, i32imm:$ordering), []>; - def _I64 : Pseudo<(outs GPR64:$dst), - (ins GPR64sp:$ptr, GPR64:$incr, i32imm:$ordering), []>; - def _I128 : Pseudo<(outs GPR64:$dstlo, GPR64:$dsthi), - (ins GPR64sp:$ptr, GPR64:$incrlo, GPR64:$incrhi, - i32imm:$ordering), []>; -} -} - -defm ATOMIC_LOAD_ADD : AtomicSizes; -defm ATOMIC_LOAD_SUB : AtomicSizes; -defm ATOMIC_LOAD_AND : AtomicSizes; -defm ATOMIC_LOAD_OR : AtomicSizes; -defm ATOMIC_LOAD_XOR : AtomicSizes; -defm ATOMIC_LOAD_NAND : AtomicSizes; -defm ATOMIC_SWAP : AtomicSizes; -let Defs = [CPSR] in { - // These operations need a CMP to calculate the correct value - defm ATOMIC_LOAD_MIN : AtomicSizes; - defm ATOMIC_LOAD_MAX : AtomicSizes; - defm ATOMIC_LOAD_UMIN : AtomicSizes; - defm ATOMIC_LOAD_UMAX : AtomicSizes; -} - -class AtomicCmpSwap - : Pseudo<(outs GPRData:$dst), - (ins GPR64sp:$ptr, GPRData:$old, GPRData:$new, - i32imm:$ordering), []> { - let usesCustomInserter = 1; - let hasCtrlDep = 1; - let mayLoad = 1; - let mayStore = 1; - let Defs = [CPSR]; -} - -def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap; -def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap; -def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap; -def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap; - -def ATOMIC_CMP_SWAP_I128 - : Pseudo<(outs GPR64:$dstlo, GPR64:$dsthi), - (ins GPR64sp:$ptr, GPR64:$oldlo, GPR64:$oldhi, - GPR64:$newlo, GPR64:$newhi, i32imm:$ordering), []> { - let usesCustomInserter = 1; - let hasCtrlDep = 1; - let mayLoad = 1; - let mayStore = 1; - let Defs = [CPSR]; -} - -//===---------------------------------- -// Low-level exclusive operations -//===---------------------------------- - -// Load-exclusives. - -def ldxr_1 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def ldxr_2 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def ldxr_4 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def ldxr_8 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i64; -}]>; - -def : Pat<(ldxr_1 am_noindex:$addr), - (SUBREG_TO_REG (i64 0), (LDXRB am_noindex:$addr), sub_32)>; -def : Pat<(ldxr_2 am_noindex:$addr), - (SUBREG_TO_REG (i64 0), (LDXRH am_noindex:$addr), sub_32)>; -def : Pat<(ldxr_4 am_noindex:$addr), - (SUBREG_TO_REG (i64 0), (LDXRW am_noindex:$addr), sub_32)>; -def : Pat<(ldxr_8 am_noindex:$addr), (LDXRX am_noindex:$addr)>; - -def : Pat<(and (ldxr_1 am_noindex:$addr), 0xff), - (SUBREG_TO_REG (i64 0), (LDXRB am_noindex:$addr), sub_32)>; -def : Pat<(and (ldxr_2 am_noindex:$addr), 0xffff), - (SUBREG_TO_REG (i64 0), (LDXRH am_noindex:$addr), sub_32)>; -def : Pat<(and (ldxr_4 am_noindex:$addr), 0xffffffff), - (SUBREG_TO_REG (i64 0), (LDXRW am_noindex:$addr), sub_32)>; - -// Store-exclusives. 
- -def stxr_1 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def stxr_2 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def stxr_4 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def stxr_8 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i64; -}]>; - -def : Pat<(stxr_1 GPR64:$val, am_noindex:$addr), - (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stxr_2 GPR64:$val, am_noindex:$addr), - (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stxr_4 GPR64:$val, am_noindex:$addr), - (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stxr_8 GPR64:$val, am_noindex:$addr), - (STXRX GPR64:$val, am_noindex:$addr)>; - -def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), am_noindex:$addr), - (STXRB GPR32:$val, am_noindex:$addr)>; -def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), am_noindex:$addr), - (STXRH GPR32:$val, am_noindex:$addr)>; -def : Pat<(stxr_4 (zext GPR32:$val), am_noindex:$addr), - (STXRW GPR32:$val, am_noindex:$addr)>; - -def : Pat<(stxr_1 (and GPR64:$val, 0xff), am_noindex:$addr), - (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stxr_2 (and GPR64:$val, 0xffff), am_noindex:$addr), - (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; -def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), am_noindex:$addr), - (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>; - - -// And clear exclusive. - -def : Pat<(int_arm64_clrex), (CLREX 0xf)>; diff --git a/lib/Target/ARM64/ARM64InstrFormats.td b/lib/Target/ARM64/ARM64InstrFormats.td deleted file mode 100644 index 38406f8..0000000 --- a/lib/Target/ARM64/ARM64InstrFormats.td +++ /dev/null @@ -1,8193 +0,0 @@ -//===- ARM64InstrFormats.td - ARM64 Instruction Formats ------*- tblgen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Describe ARM64 instructions format here -// - -// Format specifies the encoding used by the instruction. This is part of the -// ad-hoc solution used to emit machine instruction encodings by our machine -// code emitter. -class Format val> { - bits<2> Value = val; -} - -def PseudoFrm : Format<0>; -def NormalFrm : Format<1>; // Do we need any others? - -// ARM64 Instruction Format -class ARM64Inst : Instruction { - field bits<32> Inst; // Instruction encoding. - // Mask of bits that cause an encoding to be UNPREDICTABLE. - // If a bit is set, then if the corresponding bit in the - // target encoding differs from its value in the "Inst" field, - // the instruction is UNPREDICTABLE (SoftFail in abstract parlance). - field bits<32> Unpredictable = 0; - // SoftFail is the generic name for this field, but we alias it so - // as to make it more obvious what it means in ARM-land. 
- field bits<32> SoftFail = Unpredictable; - let Namespace = "ARM64"; - Format F = f; - bits<2> Form = F.Value; - let Pattern = []; - let Constraints = cstr; -} - -// Pseudo instructions (don't have encoding information) -class Pseudo pattern, string cstr = ""> - : ARM64Inst { - dag OutOperandList = oops; - dag InOperandList = iops; - let Pattern = pattern; - let isCodeGenOnly = 1; -} - -// Real instructions (have encoding information) -class EncodedI pattern> : ARM64Inst { - let Pattern = pattern; - let Size = 4; -} - -// Normal instructions -class I pattern> - : EncodedI { - dag OutOperandList = oops; - dag InOperandList = iops; - let AsmString = !strconcat(asm, operands); -} - -class TriOpFrag : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; -class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; -class UnOpFrag : PatFrag<(ops node:$LHS), res>; - -// Helper fragment for an extract of the high portion of a 128-bit vector. -def extract_high_v16i8 : - UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>; -def extract_high_v8i16 : - UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>; -def extract_high_v4i32 : - UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>; -def extract_high_v2i64 : - UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>; - -//===----------------------------------------------------------------------===// -// Asm Operand Classes. -// - -// Shifter operand for arithmetic shifted encodings. -def ShifterOperand : AsmOperandClass { - let Name = "Shifter"; -} - -// Shifter operand for mov immediate encodings. -def MovImm32ShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "MovImm32Shifter"; -} -def MovImm64ShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "MovImm64Shifter"; -} - -// Shifter operand for arithmetic register shifted encodings. -def ArithmeticShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "ArithmeticShifter"; -} - -// Shifter operand for arithmetic shifted encodings for ADD/SUB instructions. -def AddSubShifterOperand : AsmOperandClass { - let SuperClasses = [ArithmeticShifterOperand]; - let Name = "AddSubShifter"; -} - -// Shifter operand for logical vector 128/64-bit shifted encodings. -def LogicalVecShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "LogicalVecShifter"; -} -def LogicalVecHalfWordShifterOperand : AsmOperandClass { - let SuperClasses = [LogicalVecShifterOperand]; - let Name = "LogicalVecHalfWordShifter"; -} - -// The "MSL" shifter on the vector MOVI instruction. -def MoveVecShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "MoveVecShifter"; -} - -// Extend operand for arithmetic encodings. -def ExtendOperand : AsmOperandClass { let Name = "Extend"; } -def ExtendOperand64 : AsmOperandClass { - let SuperClasses = [ExtendOperand]; - let Name = "Extend64"; -} -// 'extend' that's a lsl of a 64-bit register. -def ExtendOperandLSL64 : AsmOperandClass { - let SuperClasses = [ExtendOperand]; - let Name = "ExtendLSL64"; -} - -// 8-bit floating-point immediate encodings. -def FPImmOperand : AsmOperandClass { - let Name = "FPImm"; - let ParserMethod = "tryParseFPImm"; -} - -// 8-bit immediate for AdvSIMD where 64-bit values of the form: -// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh -// are encoded as the eight bit value 'abcdefgh'. 
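// [Editor's note: decoding sketch for the format described above, not part of
// the original patch - each of the eight immediate bits selects an all-ones
// or all-zeros byte of the 64-bit value:]
#include <cstdint>
static uint64_t expandImmType10Model(uint8_t Imm) {
  uint64_t V = 0;
  for (int I = 0; I < 8; ++I)
    if (Imm & (1u << I))
      V |= 0xffull << (8 * I); // bit I set -> byte I becomes 0xff
  return V;
}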
-def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; } - - -//===----------------------------------------------------------------------===// -// Operand Definitions. -// - -// ADR[P] instruction labels. -def AdrpOperand : AsmOperandClass { - let Name = "AdrpLabel"; - let ParserMethod = "tryParseAdrpLabel"; -} -def adrplabel : Operand { - let EncoderMethod = "getAdrLabelOpValue"; - let PrintMethod = "printAdrpLabel"; - let ParserMatchClass = AdrpOperand; -} - -def AdrOperand : AsmOperandClass { - let Name = "AdrLabel"; - let ParserMethod = "tryParseAdrLabel"; -} -def adrlabel : Operand { - let EncoderMethod = "getAdrLabelOpValue"; - let ParserMatchClass = AdrOperand; -} - -// simm9 predicate - True if the immediate is in the range [-256, 255]. -def SImm9Operand : AsmOperandClass { - let Name = "SImm9"; - let DiagnosticType = "InvalidMemoryIndexedSImm9"; -} -def simm9 : Operand, ImmLeaf= -256 && Imm < 256; }]> { - let ParserMatchClass = SImm9Operand; -} - -// simm7s4 predicate - True if the immediate is a multiple of 4 in the range -// [-256, 252]. -def SImm7s4Operand : AsmOperandClass { - let Name = "SImm7s4"; - let DiagnosticType = "InvalidMemoryIndexed32SImm7"; -} -def simm7s4 : Operand { - let ParserMatchClass = SImm7s4Operand; - let PrintMethod = "printImmScale4"; -} - -// simm7s8 predicate - True if the immediate is a multiple of 8 in the range -// [-512, 504]. -def SImm7s8Operand : AsmOperandClass { - let Name = "SImm7s8"; - let DiagnosticType = "InvalidMemoryIndexed64SImm7"; -} -def simm7s8 : Operand { - let ParserMatchClass = SImm7s8Operand; - let PrintMethod = "printImmScale8"; -} - -// simm7s16 predicate - True if the immediate is a multiple of 16 in the range -// [-1024, 1008]. -def SImm7s16Operand : AsmOperandClass { - let Name = "SImm7s16"; - let DiagnosticType = "InvalidMemoryIndexed64SImm7"; -} -def simm7s16 : Operand { - let ParserMatchClass = SImm7s16Operand; - let PrintMethod = "printImmScale16"; -} - -// imm0_65535 predicate - True if the immediate is in the range [0,65535]. 
-def Imm0_65535Operand : AsmOperandClass { let Name = "Imm0_65535"; } -def imm0_65535 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_65535Operand; -} - -def Imm1_8Operand : AsmOperandClass { - let Name = "Imm1_8"; - let DiagnosticType = "InvalidImm1_8"; -} -def Imm1_16Operand : AsmOperandClass { - let Name = "Imm1_16"; - let DiagnosticType = "InvalidImm1_16"; -} -def Imm1_32Operand : AsmOperandClass { - let Name = "Imm1_32"; - let DiagnosticType = "InvalidImm1_32"; -} -def Imm1_64Operand : AsmOperandClass { - let Name = "Imm1_64"; - let DiagnosticType = "InvalidImm1_64"; -} - -def MovZSymbolG3AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG3"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g3 : Operand { - let ParserMatchClass = MovZSymbolG3AsmOperand; -} - -def MovZSymbolG2AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG2"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g2 : Operand { - let ParserMatchClass = MovZSymbolG2AsmOperand; -} - -def MovZSymbolG1AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG1"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g1 : Operand { - let ParserMatchClass = MovZSymbolG1AsmOperand; -} - -def MovZSymbolG0AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG0"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g0 : Operand { - let ParserMatchClass = MovZSymbolG0AsmOperand; -} - -def MovKSymbolG2AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG2"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g2 : Operand { - let ParserMatchClass = MovKSymbolG2AsmOperand; -} - -def MovKSymbolG1AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG1"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g1 : Operand { - let ParserMatchClass = MovKSymbolG1AsmOperand; -} - -def MovKSymbolG0AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG0"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g0 : Operand { - let ParserMatchClass = MovKSymbolG0AsmOperand; -} - -def fixedpoint32 : Operand { - let EncoderMethod = "getFixedPointScaleOpValue"; - let DecoderMethod = "DecodeFixedPointScaleImm"; - let ParserMatchClass = Imm1_32Operand; -} -def fixedpoint64 : Operand { - let EncoderMethod = "getFixedPointScaleOpValue"; - let DecoderMethod = "DecodeFixedPointScaleImm"; - let ParserMatchClass = Imm1_64Operand; -} - -def vecshiftR8 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); -}]> { - let EncoderMethod = "getVecShiftR8OpValue"; - let DecoderMethod = "DecodeVecShiftR8Imm"; - let ParserMatchClass = Imm1_8Operand; -} -def vecshiftR16 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); -}]> { - let EncoderMethod = "getVecShiftR16OpValue"; - let DecoderMethod = "DecodeVecShiftR16Imm"; - let ParserMatchClass = Imm1_16Operand; -} -def vecshiftR16Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); -}]> { - let EncoderMethod = "getVecShiftR16OpValue"; - let DecoderMethod = "DecodeVecShiftR16ImmNarrow"; - let ParserMatchClass = Imm1_8Operand; -} -def vecshiftR32 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); -}]> { - let EncoderMethod = "getVecShiftR32OpValue"; - let DecoderMethod = "DecodeVecShiftR32Imm"; - let ParserMatchClass = Imm1_32Operand; -} -def vecshiftR32Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); -}]> { - let EncoderMethod = "getVecShiftR32OpValue"; - let DecoderMethod = "DecodeVecShiftR32ImmNarrow"; - let ParserMatchClass = Imm1_16Operand; -} -def vecshiftR64 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 65); -}]> { - let EncoderMethod = 
"getVecShiftR64OpValue"; - let DecoderMethod = "DecodeVecShiftR64Imm"; - let ParserMatchClass = Imm1_64Operand; -} -def vecshiftR64Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); -}]> { - let EncoderMethod = "getVecShiftR64OpValue"; - let DecoderMethod = "DecodeVecShiftR64ImmNarrow"; - let ParserMatchClass = Imm1_32Operand; -} - -def Imm0_7Operand : AsmOperandClass { let Name = "Imm0_7"; } -def Imm0_15Operand : AsmOperandClass { let Name = "Imm0_15"; } -def Imm0_31Operand : AsmOperandClass { let Name = "Imm0_31"; } -def Imm0_63Operand : AsmOperandClass { let Name = "Imm0_63"; } - -def vecshiftL8 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL8OpValue"; - let DecoderMethod = "DecodeVecShiftL8Imm"; - let ParserMatchClass = Imm0_7Operand; -} -def vecshiftL16 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL16OpValue"; - let DecoderMethod = "DecodeVecShiftL16Imm"; - let ParserMatchClass = Imm0_15Operand; -} -def vecshiftL32 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL32OpValue"; - let DecoderMethod = "DecodeVecShiftL32Imm"; - let ParserMatchClass = Imm0_31Operand; -} -def vecshiftL64 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL64OpValue"; - let DecoderMethod = "DecodeVecShiftL64Imm"; - let ParserMatchClass = Imm0_63Operand; -} - - -// Crazy immediate formats used by 32-bit and 64-bit logical immediate -// instructions for splatting repeating bit patterns across the immediate. -def logical_imm32_XFORM : SDNodeXFormgetZExtValue(), 32); - return CurDAG->getTargetConstant(enc, MVT::i32); -}]>; -def logical_imm64_XFORM : SDNodeXFormgetZExtValue(), 64); - return CurDAG->getTargetConstant(enc, MVT::i32); -}]>; - -def LogicalImm32Operand : AsmOperandClass { let Name = "LogicalImm32"; } -def LogicalImm64Operand : AsmOperandClass { let Name = "LogicalImm64"; } -def logical_imm32 : Operand, PatLeaf<(imm), [{ - return ARM64_AM::isLogicalImmediate(N->getZExtValue(), 32); -}], logical_imm32_XFORM> { - let PrintMethod = "printLogicalImm32"; - let ParserMatchClass = LogicalImm32Operand; -} -def logical_imm64 : Operand, PatLeaf<(imm), [{ - return ARM64_AM::isLogicalImmediate(N->getZExtValue(), 64); -}], logical_imm64_XFORM> { - let PrintMethod = "printLogicalImm64"; - let ParserMatchClass = LogicalImm64Operand; -} - -// imm0_255 predicate - True if the immediate is in the range [0,255]. -def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; } -def imm0_255 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_255Operand; -} - -// imm0_127 predicate - True if the immediate is in the range [0,127] -def Imm0_127Operand : AsmOperandClass { let Name = "Imm0_127"; } -def imm0_127 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_127Operand; -} - -// NOTE: These imm0_N operands have to be of type i64 because i64 is the size -// for all shift-amounts. 
-
-// imm0_63 predicate - True if the immediate is in the range [0,63]
-def imm0_63 : Operand<i64>, ImmLeaf<i64, [{
-  return ((uint64_t)Imm) < 64;
-}]> {
-  let ParserMatchClass = Imm0_63Operand;
-}
-
-// imm0_31 predicate - True if the immediate is in the range [0,31]
-def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
-  return ((uint64_t)Imm) < 32;
-}]> {
-  let ParserMatchClass = Imm0_31Operand;
-}
-
-// imm0_15 predicate - True if the immediate is in the range [0,15]
-def imm0_15 : Operand<i64>, ImmLeaf<i64, [{
-  return ((uint64_t)Imm) < 16;
-}]> {
-  let ParserMatchClass = Imm0_15Operand;
-}
-
-// imm0_7 predicate - True if the immediate is in the range [0,7]
-def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
-  return ((uint64_t)Imm) < 8;
-}]> {
-  let ParserMatchClass = Imm0_7Operand;
-}
-
-// An arithmetic shifter operand:
-// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
-// {5-0} - imm6
-def arith_shift : Operand<i32> {
-  let PrintMethod = "printShifter";
-  let ParserMatchClass = ArithmeticShifterOperand;
-}
-
-class arith_shifted_reg<ValueType Ty, RegisterClass regclass>
-    : Operand<Ty>,
-      ComplexPattern<Ty, 2, "SelectArithShiftedRegister", []> {
-  let PrintMethod = "printShiftedRegister";
-  let MIOperandInfo = (ops regclass, arith_shift);
-}
-
-def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32>;
-def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64>;
-
-// A logical shifter operand:
-// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror
-// {5-0} - imm6
-def logical_shift : Operand<i32> {
-  let PrintMethod = "printShifter";
-  let ParserMatchClass = ShifterOperand;
-}
-
-class logical_shifted_reg<ValueType Ty, RegisterClass regclass>
-    : Operand<Ty>,
-      ComplexPattern<Ty, 2, "SelectLogicalShiftedRegister", []> {
-  let PrintMethod = "printShiftedRegister";
-  let MIOperandInfo = (ops regclass, logical_shift);
-}
-
-def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32>;
-def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64>;
-
-// A logical vector shifter operand:
-// {7-6} - shift type: 00 = lsl
-// {5-0} - imm6: #0, #8, #16, or #24
-def logical_vec_shift : Operand<i32> {
-  let PrintMethod = "printShifter";
-  let EncoderMethod = "getVecShifterOpValue";
-  let ParserMatchClass = LogicalVecShifterOperand;
-}
-
-// A logical vector half-word shifter operand:
-// {7-6} - shift type: 00 = lsl
-// {5-0} - imm6: #0 or #8
-def logical_vec_hw_shift : Operand<i32> {
-  let PrintMethod = "printShifter";
-  let EncoderMethod = "getVecShifterOpValue";
-  let ParserMatchClass = LogicalVecHalfWordShifterOperand;
-}
-
-// A vector move shifter operand:
-// {0} - imm1: #8 or #16
-def move_vec_shift : Operand<i32> {
-  let PrintMethod = "printShifter";
-  let EncoderMethod = "getMoveVecShifterOpValue";
-  let ParserMatchClass = MoveVecShifterOperand;
-}
-
-// An ADD/SUB immediate shifter operand:
-// {7-6} - shift type: 00 = lsl
-// {5-0} - imm6: #0 or #12
-def addsub_shift : Operand<i32> {
-  let ParserMatchClass = AddSubShifterOperand;
-}
-
-class addsub_shifted_imm<ValueType Ty>
-    : Operand<Ty>, ComplexPattern<Ty, 2, "SelectArithImmed", [imm]> {
-  let PrintMethod = "printAddSubImm";
-  let EncoderMethod = "getAddSubImmOpValue";
-  let MIOperandInfo = (ops i32imm, addsub_shift);
-}
-
-def addsub_shifted_imm32 : addsub_shifted_imm<i32>;
-def addsub_shifted_imm64 : addsub_shifted_imm<i64>;
-
-class neg_addsub_shifted_imm<ValueType Ty>
-    : Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> {
-  let PrintMethod = "printAddSubImm";
-  let EncoderMethod = "getAddSubImmOpValue";
-  let MIOperandInfo = (ops i32imm, addsub_shift);
-}
-
-def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>;
-def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>;
-
-// An extend operand:
-// {5-3} - extend type
-// {2-0} - imm3
-def arith_extend : Operand<i32> {
-  let PrintMethod = "printExtend";
-  let ParserMatchClass = ExtendOperand;
-}
-def arith_extend64 : Operand<i32> {
-  let PrintMethod = "printExtend";
-  let ParserMatchClass = ExtendOperand64;
-}
-
-// 'extend' that's a lsl of a 64-bit register.
-def arith_extendlsl64 : Operand<i32> {
-  let PrintMethod = "printExtend";
-  let ParserMatchClass = ExtendOperandLSL64;
-}
-
-class arith_extended_reg32<ValueType Ty> : Operand<Ty>,
-                    ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
-  let PrintMethod = "printExtendedRegister";
-  let MIOperandInfo = (ops GPR32, arith_extend);
-}
-
-class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>,
-                    ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
-  let PrintMethod = "printExtendedRegister";
-  let MIOperandInfo = (ops GPR32, arith_extend64);
-}
-
-// Floating-point immediate.
-def fpimm32 : Operand<f32>,
-              PatLeaf<(f32 fpimm), [{
-      return ARM64_AM::getFP32Imm(N->getValueAPF()) != -1;
-    }], SDNodeXForm<fpimm, [{
-      APFloat InVal = N->getValueAPF();
-      uint32_t enc = ARM64_AM::getFP32Imm(InVal);
-      return CurDAG->getTargetConstant(enc, MVT::i32);
-    }]>> {
-  let ParserMatchClass = FPImmOperand;
-  let PrintMethod = "printFPImmOperand";
-}
-def fpimm64 : Operand<f64>,
-              PatLeaf<(f64 fpimm), [{
-      return ARM64_AM::getFP64Imm(N->getValueAPF()) != -1;
-    }], SDNodeXForm<fpimm, [{
-      APFloat InVal = N->getValueAPF();
-      uint32_t enc = ARM64_AM::getFP64Imm(InVal);
-      return CurDAG->getTargetConstant(enc, MVT::i32);
-    }]>> {
-  let ParserMatchClass = FPImmOperand;
-  let PrintMethod = "printFPImmOperand";
-}
-
-def fpimm8 : Operand<i32> {
-  let ParserMatchClass = FPImmOperand;
-  let PrintMethod = "printFPImmOperand";
-}
-
-def fpimm0 : PatLeaf<(fpimm), [{
-  return N->isExactlyValue(+0.0);
-}]>;
-
-// 8-bit immediate for AdvSIMD where 64-bit values of the form:
-// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
-// are encoded as the eight bit value 'abcdefgh'.
-def simdimmtype10 : Operand<i32>,
-                    PatLeaf<(f64 fpimm), [{
-      return ARM64_AM::isAdvSIMDModImmType10(N->getValueAPF()
-                                               .bitcastToAPInt()
-                                               .getZExtValue());
-    }], SDNodeXForm<fpimm, [{
-      APFloat InVal = N->getValueAPF();
-      uint32_t enc = ARM64_AM::encodeAdvSIMDModImmType10(N->getValueAPF()
-                                                           .bitcastToAPInt()
-                                                           .getZExtValue());
-      return CurDAG->getTargetConstant(enc, MVT::i32);
-    }]>> {
-  let ParserMatchClass = SIMDImmType10Operand;
-  let PrintMethod = "printSIMDType10Operand";
-}
-
-
-//---
-// System management
-//---
-
-// Base encoding for system instruction operands.
-let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
-class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands>
-    : I<oops, iops, asm, operands, "", []> {
-  let Inst{31-22} = 0b1101010100;
-  let Inst{21} = L;
-}
-
-// System instructions which do not have an Rt register.
-class SimpleSystemI<bit L, dag iops, string asm, string operands>
-    : BaseSystemI<L, (outs), iops, asm, operands> {
-  let Inst{4-0} = 0b11111;
-}
-
-// System instructions which have an Rt register.
-class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
-    : BaseSystemI<L, oops, iops, asm, operands>,
-      Sched<[WriteSys]> {
-  bits<5> Rt;
-  let Inst{4-0} = Rt;
-}
-
-// Hint instructions that take both a CRm and a 3-bit immediate.
-class HintI<string mnemonic>
-    : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">,
-      Sched<[WriteHint]> {
-  bits <7> imm;
-  let Inst{20-12} = 0b000110010;
-  let Inst{11-5} = imm;
-}
-
-// System instructions taking a single literal operand which encodes into
-// CRm. op2 differentiates the opcodes.
-def BarrierAsmOperand : AsmOperandClass {
-  let Name = "Barrier";
-  let ParserMethod = "tryParseBarrierOperand";
-}
-def barrier_op : Operand<i32> {
-  let PrintMethod = "printBarrierOption";
-  let ParserMatchClass = BarrierAsmOperand;
-}
-class CRmSystemI<Operand crmtype, bits<3> opc, string asm>
-    : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">,
-      Sched<[WriteBarrier]> {
-  bits<4> CRm;
-  let Inst{20-12} = 0b000110011;
-  let Inst{11-8} = CRm;
-  let Inst{7-5} = opc;
-}
-
-// MRS/MSR system instructions.
-def SystemRegisterOperand : AsmOperandClass {
-  let Name = "SystemRegister";
-  let ParserMethod = "tryParseSystemRegister";
-}
-// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate.
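Mechanizing that comment as a sketch: with field widths of 1 (constant), 1
(low op0 bit), 3 (op1), 4 (CRn), 4 (CRm), and 3 (op2), the sixteen bits pack
as below. The layout is inferred from the comment, so treat it as
illustrative rather than as LLVM's encoder.

    #include <cstdint>

    // Hypothetical packing of the 16-bit system-register immediate.
    static unsigned packSysRegSketch(unsigned op0, unsigned op1, unsigned crn,
                                     unsigned crm, unsigned op2) {
      return (1u << 15) | ((op0 & 1) << 14) | ((op1 & 7) << 11) |
             ((crn & 15) << 7) | ((crm & 15) << 3) | (op2 & 7);
    }

The sysreg_op operand that carries this immediate follows.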
-def sysreg_op : Operand { - let ParserMatchClass = SystemRegisterOperand; - let DecoderMethod = "DecodeSystemRegister"; - let PrintMethod = "printSystemRegister"; -} - -class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins sysreg_op:$systemreg), - "mrs", "\t$Rt, $systemreg"> { - bits<15> systemreg; - let Inst{20} = 1; - let Inst{19-5} = systemreg; -} - -// FIXME: Some of these def CPSR, others don't. Best way to model that? -// Explicitly modeling each of the system register as a register class -// would do it, but feels like overkill at this point. -class MSRI : RtSystemI<0, (outs), (ins sysreg_op:$systemreg, GPR64:$Rt), - "msr", "\t$systemreg, $Rt"> { - bits<15> systemreg; - let Inst{20} = 1; - let Inst{19-5} = systemreg; -} - -def SystemCPSRFieldOperand : AsmOperandClass { - let Name = "SystemCPSRField"; - let ParserMethod = "tryParseCPSRField"; -} -def cpsrfield_op : Operand { - let ParserMatchClass = SystemCPSRFieldOperand; - let PrintMethod = "printSystemCPSRField"; -} - -let Defs = [CPSR] in -class MSRcpsrI : SimpleSystemI<0, (ins cpsrfield_op:$cpsr_field, imm0_15:$imm), - "msr", "\t$cpsr_field, $imm">, - Sched<[WriteSys]> { - bits<6> cpsrfield; - bits<4> imm; - let Inst{20-19} = 0b00; - let Inst{18-16} = cpsrfield{5-3}; - let Inst{15-12} = 0b0100; - let Inst{11-8} = imm; - let Inst{7-5} = cpsrfield{2-0}; - - let DecoderMethod = "DecodeSystemCPSRInstruction"; -} - -// SYS and SYSL generic system instructions. -def SysCRAsmOperand : AsmOperandClass { - let Name = "SysCR"; - let ParserMethod = "tryParseSysCROperand"; -} - -def sys_cr_op : Operand { - let PrintMethod = "printSysCROperand"; - let ParserMatchClass = SysCRAsmOperand; -} - -class SystemI - : SimpleSystemI, - Sched<[WriteSys]> { - bits<3> op1; - bits<4> Cn; - bits<4> Cm; - bits<3> op2; - let Inst{20-19} = 0b01; - let Inst{18-16} = op1; - let Inst{15-12} = Cn; - let Inst{11-8} = Cm; - let Inst{7-5} = op2; -} - -class SystemXtI - : RtSystemI { - bits<3> op1; - bits<4> Cn; - bits<4> Cm; - bits<3> op2; - let Inst{20-19} = 0b01; - let Inst{18-16} = op1; - let Inst{15-12} = Cn; - let Inst{11-8} = Cm; - let Inst{7-5} = op2; -} - -class SystemLXtI - : RtSystemI { - bits<3> op1; - bits<4> Cn; - bits<4> Cm; - bits<3> op2; - let Inst{20-19} = 0b01; - let Inst{18-16} = op1; - let Inst{15-12} = Cn; - let Inst{11-8} = Cm; - let Inst{7-5} = op2; -} - - -// Branch (register) instructions: -// -// case opc of -// 0001 blr -// 0000 br -// 0101 dret -// 0100 eret -// 0010 ret -// otherwise UNDEFINED -class BaseBranchReg opc, dag oops, dag iops, string asm, - string operands, list pattern> - : I, Sched<[WriteBrReg]> { - let Inst{31-25} = 0b1101011; - let Inst{24-21} = opc; - let Inst{20-16} = 0b11111; - let Inst{15-10} = 0b000000; - let Inst{4-0} = 0b00000; -} - -class BranchReg opc, string asm, list pattern> - : BaseBranchReg { - bits<5> Rn; - let Inst{9-5} = Rn; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in -class SpecialReturn opc, string asm> - : BaseBranchReg { - let Inst{9-5} = 0b11111; -} - -//--- -// Conditional branch instruction. -//--- -// Branch condition code. -// 4-bit immediate. Pretty-printed as . -def dotCcode : Operand { - let PrintMethod = "printDotCondCode"; -} - -// Conditional branch target. 19-bit immediate. The low two bits of the target -// offset are implied zero and so are not part of the immediate. 
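Before the operand definitions this comment introduces, the arithmetic behind
"low two bits are implied zero": 19 stored bits at a 4-byte scale give a reach
of roughly +/-1 MiB. A minimal sketch of the range check (illustrative, not
LLVM's fixup logic):

    #include <cstdint>

    // 19 signed bits of word offset => byte range [-2^20, 2^20 - 4].
    static bool fitsCondBranch19(int64_t offsetBytes) {
      if (offsetBytes & 3)               // must be 4-byte aligned
        return false;
      int64_t words = offsetBytes >> 2;  // only offset/4 is encoded
      return words >= -(1 << 18) && words < (1 << 18);
    }

The BranchTarget19Operand and am_brcond definitions follow.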
-def BranchTarget19Operand : AsmOperandClass { - let Name = "BranchTarget19"; -} -def am_brcond : Operand { - let EncoderMethod = "getCondBranchTargetOpValue"; - let DecoderMethod = "DecodeCondBranchTarget"; - let PrintMethod = "printAlignedBranchTarget"; - let ParserMatchClass = BranchTarget19Operand; -} - -class BranchCond : I<(outs), (ins dotCcode:$cond, am_brcond:$target), - "b", "$cond\t$target", "", - [(ARM64brcond bb:$target, imm:$cond, CPSR)]>, - Sched<[WriteBr]> { - let isBranch = 1; - let isTerminator = 1; - let Uses = [CPSR]; - - bits<4> cond; - bits<19> target; - let Inst{31-24} = 0b01010100; - let Inst{23-5} = target; - let Inst{4} = 0; - let Inst{3-0} = cond; -} - -//--- -// Compare-and-branch instructions. -//--- -class BaseCmpBranch - : I<(outs), (ins regtype:$Rt, am_brcond:$target), - asm, "\t$Rt, $target", "", - [(node regtype:$Rt, bb:$target)]>, - Sched<[WriteBr]> { - let isBranch = 1; - let isTerminator = 1; - - bits<5> Rt; - bits<19> target; - let Inst{30-25} = 0b011010; - let Inst{24} = op; - let Inst{23-5} = target; - let Inst{4-0} = Rt; -} - -multiclass CmpBranch { - def W : BaseCmpBranch { - let Inst{31} = 0; - } - def X : BaseCmpBranch { - let Inst{31} = 1; - } -} - -//--- -// Test-bit-and-branch instructions. -//--- -// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of -// the target offset are implied zero and so are not part of the immediate. -def BranchTarget14Operand : AsmOperandClass { - let Name = "BranchTarget14"; -} -def am_tbrcond : Operand { - let EncoderMethod = "getTestBranchTargetOpValue"; - let PrintMethod = "printAlignedBranchTarget"; - let ParserMatchClass = BranchTarget14Operand; -} - -class TestBranch - : I<(outs), (ins GPR64:$Rt, imm0_63:$bit_off, am_tbrcond:$target), - asm, "\t$Rt, $bit_off, $target", "", - [(node GPR64:$Rt, imm0_63:$bit_off, bb:$target)]>, - Sched<[WriteBr]> { - let isBranch = 1; - let isTerminator = 1; - - bits<5> Rt; - bits<6> bit_off; - bits<14> target; - - let Inst{31} = bit_off{5}; - let Inst{30-25} = 0b011011; - let Inst{24} = op; - let Inst{23-19} = bit_off{4-0}; - let Inst{18-5} = target; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodeTestAndBranch"; -} - -//--- -// Unconditional branch (immediate) instructions. -//--- -def BranchTarget26Operand : AsmOperandClass { - let Name = "BranchTarget26"; -} -def am_b_target : Operand { - let EncoderMethod = "getBranchTargetOpValue"; - let PrintMethod = "printAlignedBranchTarget"; - let ParserMatchClass = BranchTarget26Operand; -} -def am_bl_target : Operand { - let EncoderMethod = "getBranchTargetOpValue"; - let PrintMethod = "printAlignedBranchTarget"; - let ParserMatchClass = BranchTarget26Operand; -} - -class BImm pattern> - : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> { - bits<26> addr; - let Inst{31} = op; - let Inst{30-26} = 0b00101; - let Inst{25-0} = addr; - - let DecoderMethod = "DecodeUnconditionalBranch"; -} - -class BranchImm pattern> - : BImm; -class CallImm pattern> - : BImm; - -//--- -// Basic one-operand data processing instructions. 
-//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseOneOperandData opc, RegisterClass regtype, string asm, - SDPatternOperator node> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", - [(set regtype:$Rd, (node regtype:$Rn))]>, - Sched<[WriteI]> { - bits<5> Rd; - bits<5> Rn; - - let Inst{30-13} = 0b101101011000000000; - let Inst{12-10} = opc; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass OneOperandData opc, string asm, - SDPatternOperator node = null_frag> { - def Wr : BaseOneOperandData { - let Inst{31} = 0; - } - - def Xr : BaseOneOperandData { - let Inst{31} = 1; - } -} - -class OneWRegData opc, string asm, SDPatternOperator node> - : BaseOneOperandData { - let Inst{31} = 0; -} - -class OneXRegData opc, string asm, SDPatternOperator node> - : BaseOneOperandData { - let Inst{31} = 1; -} - -//--- -// Basic two-operand data processing instructions. -//--- -class BaseBaseAddSubCarry pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", pattern>, - Sched<[WriteI]> { - let Uses = [CPSR]; - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{30} = isSub; - let Inst{28-21} = 0b11010000; - let Inst{20-16} = Rm; - let Inst{15-10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseAddSubCarry - : BaseBaseAddSubCarry; - -class BaseAddSubCarrySetFlags - : BaseBaseAddSubCarry { - let Defs = [CPSR]; -} - -multiclass AddSubCarry { - def Wr : BaseAddSubCarry { - let Inst{31} = 0; - let Inst{29} = 0; - } - def Xr : BaseAddSubCarry { - let Inst{31} = 1; - let Inst{29} = 0; - } - - // Sets flags. - def SWr : BaseAddSubCarrySetFlags { - let Inst{31} = 0; - let Inst{29} = 1; - } - def SXr : BaseAddSubCarrySetFlags { - let Inst{31} = 1; - let Inst{29} = 1; - } -} - -class BaseTwoOperand opc, RegisterClass regtype, string asm, - SDPatternOperator OpNode> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{30-21} = 0b0011010110; - let Inst{20-16} = Rm; - let Inst{15-14} = 0b00; - let Inst{13-10} = opc; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseDiv - : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> { - let Inst{10} = isSigned; -} - -multiclass Div { - def Wr : BaseDiv, - Sched<[WriteID32]> { - let Inst{31} = 0; - } - def Xr : BaseDiv, - Sched<[WriteID64]> { - let Inst{31} = 1; - } -} - -class BaseShift shift_type, RegisterClass regtype, string asm, - SDPatternOperator OpNode = null_frag> - : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>, - Sched<[WriteIS]> { - let Inst{11-10} = shift_type; -} - -multiclass Shift shift_type, string asm, SDNode OpNode> { - def Wr : BaseShift { - let Inst{31} = 0; - } - - def Xr : BaseShift { - let Inst{31} = 1; - } - - def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)), - (!cast(NAME # "Wr") GPR32:$Rn, - (EXTRACT_SUBREG i64:$Rm, sub_32))>; - - def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))), - (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; - - def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))), - (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; - - def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))), - (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; -} - -class ShiftAlias - : InstAlias; - -class BaseMulAccum opc, RegisterClass multype, - RegisterClass addtype, string asm, - list pattern> - : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, 
addtype:$Ra), - asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<5> Ra; - let Inst{30-24} = 0b0011011; - let Inst{23-21} = opc; - let Inst{20-16} = Rm; - let Inst{15} = isSub; - let Inst{14-10} = Ra; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass MulAccum { - def Wrrr : BaseMulAccum, - Sched<[WriteIM32]> { - let Inst{31} = 0; - } - - def Xrrr : BaseMulAccum, - Sched<[WriteIM64]> { - let Inst{31} = 1; - } -} - -class WideMulAccum opc, string asm, - SDNode AccNode, SDNode ExtNode> - : BaseMulAccum, - Sched<[WriteIM32]> { - let Inst{31} = 1; -} - -class MulHi opc, string asm, SDNode OpNode> - : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>, - Sched<[WriteIM64]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-24} = 0b10011011; - let Inst{23-21} = opc; - let Inst{20-16} = Rm; - let Inst{15-10} = 0b011111; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class MulAccumWAlias - : InstAlias; -class MulAccumXAlias - : InstAlias; -class WideMulAccumAlias - : InstAlias; - -class BaseCRC32 sz, bit C, RegisterClass StreamReg, - SDPatternOperator OpNode, string asm> - : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>, - Sched<[WriteISReg]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - - let Inst{31} = sf; - let Inst{30-21} = 0b0011010110; - let Inst{20-16} = Rm; - let Inst{15-13} = 0b010; - let Inst{12} = C; - let Inst{11-10} = sz; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -//--- -// Address generation. -//--- - -class ADRI pattern> - : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "", - pattern>, - Sched<[WriteI]> { - bits<5> Xd; - bits<21> label; - let Inst{31} = page; - let Inst{30-29} = label{1-0}; - let Inst{28-24} = 0b10000; - let Inst{23-5} = label{20-2}; - let Inst{4-0} = Xd; - - let DecoderMethod = "DecodeAdrInstruction"; -} - -//--- -// Move immediate. 
-//--- - -def movimm32_imm : Operand { - let ParserMatchClass = Imm0_65535Operand; - let EncoderMethod = "getMoveWideImmOpValue"; -} -def movimm32_shift : Operand { - let PrintMethod = "printShifter"; - let ParserMatchClass = MovImm32ShifterOperand; -} -def movimm64_shift : Operand { - let PrintMethod = "printShifter"; - let ParserMatchClass = MovImm64ShifterOperand; -} -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseMoveImmediate opc, RegisterClass regtype, Operand shifter, - string asm> - : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift), - asm, "\t$Rd, $imm$shift", "", []>, - Sched<[WriteImm]> { - bits<5> Rd; - bits<16> imm; - bits<6> shift; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100101; - let Inst{22-21} = shift{5-4}; - let Inst{20-5} = imm; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeMoveImmInstruction"; -} - -multiclass MoveImmediate opc, string asm> { - def Wi : BaseMoveImmediate { - let Inst{31} = 0; - } - - def Xi : BaseMoveImmediate { - let Inst{31} = 1; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseInsertImmediate opc, RegisterClass regtype, Operand shifter, - string asm> - : I<(outs regtype:$Rd), - (ins regtype:$src, movimm32_imm:$imm, shifter:$shift), - asm, "\t$Rd, $imm$shift", "$src = $Rd", []>, - Sched<[WriteI]> { - bits<5> Rd; - bits<16> imm; - bits<6> shift; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100101; - let Inst{22-21} = shift{5-4}; - let Inst{20-5} = imm; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeMoveImmInstruction"; -} - -multiclass InsertImmediate opc, string asm> { - def Wi : BaseInsertImmediate { - let Inst{31} = 0; - } - - def Xi : BaseInsertImmediate { - let Inst{31} = 1; - } -} - -//--- -// Add/Subtract -//--- - -class BaseAddSubImm - : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm), - asm, "\t$Rd, $Rn, $imm", "", - [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>, - Sched<[WriteI]> { - bits<5> Rd; - bits<5> Rn; - bits<14> imm; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b10001; - let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12 - let Inst{21-10} = imm{11-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - let DecoderMethod = "DecodeBaseAddSubImm"; -} - -class BaseAddSubRegPseudo - : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, - Sched<[WriteI]>; - -class BaseAddSubSReg - : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>, - Sched<[WriteISReg]> { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. 
- bits<5> dst; - bits<5> src1; - bits<5> src2; - bits<8> shift; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b01011; - let Inst{23-22} = shift{7-6}; - let Inst{21} = 0; - let Inst{20-16} = src2; - let Inst{15-10} = shift{5-0}; - let Inst{9-5} = src1; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeThreeAddrSRegInstruction"; -} - -class BaseAddSubEReg - : I<(outs dstRegtype:$R1), - (ins src1Regtype:$R2, src2Regtype:$R3), - asm, "\t$R1, $R2, $R3", "", - [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>, - Sched<[WriteIEReg]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<6> ext; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b01011; - let Inst{23-21} = 0b001; - let Inst{20-16} = Rm; - let Inst{15-13} = ext{5-3}; - let Inst{12-10} = ext{2-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeAddSubERegInstruction"; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseAddSubEReg64 - : I<(outs dstRegtype:$Rd), - (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext), - asm, "\t$Rd, $Rn, $Rm$ext", "", []>, - Sched<[WriteIEReg]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<6> ext; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b01011; - let Inst{23-21} = 0b001; - let Inst{20-16} = Rm; - let Inst{15} = ext{5}; - let Inst{12-10} = ext{2-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeAddSubERegInstruction"; -} - -// Aliases for register+register add/subtract. -class AddSubRegAlias - : InstAlias; - -multiclass AddSub { - let hasSideEffects = 0 in { - // Add/Subtract immediate - def Wri : BaseAddSubImm { - let Inst{31} = 0; - } - def Xri : BaseAddSubImm { - let Inst{31} = 1; - } - - // Add/Subtract register - Only used for CodeGen - def Wrr : BaseAddSubRegPseudo; - def Xrr : BaseAddSubRegPseudo; - - // Add/Subtract shifted register - def Wrs : BaseAddSubSReg { - let Inst{31} = 0; - } - def Xrs : BaseAddSubSReg { - let Inst{31} = 1; - } - } - - // Add/Subtract extended register - let AddedComplexity = 1, hasSideEffects = 0 in { - def Wrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 0; - } - def Xrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 1; - } - } - - def Xrx64 : BaseAddSubEReg64 { - // UXTX and SXTX only. - let Inst{14-13} = 0b11; - let Inst{31} = 1; - } - - // Register/register aliases with no shift when SP is not used. - def : AddSubRegAlias(NAME#"Wrs"), - GPR32, GPR32, GPR32, 0>; - def : AddSubRegAlias(NAME#"Xrs"), - GPR64, GPR64, GPR64, 0>; - - // Register/register aliases with no shift when either the destination or - // first source register is SP. This relies on the shifted register aliases - // above matching first in the case when SP is not used. 
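The bare constants 16 and 24 in the aliases just below are pre-packed
arith_extend values. Using the {5-3} = extend type, {2-0} = imm3 layout
documented earlier, and the architectural option values UXTW = 0b010 and
UXTX = 0b011, the arithmetic works out as in this sketch:

    // UXTW #0 -> (0b010 << 3) | 0 = 16, UXTX #0 -> (0b011 << 3) | 0 = 24,
    // matching the literals passed to AddSubRegAlias below.
    enum ExtendKindSketch { UXTB, UXTH, UXTW, UXTX, SXTB, SXTH, SXTW, SXTX };

    static unsigned encodeExtendSketch(ExtendKindSketch kind, unsigned imm3) {
      return (static_cast<unsigned>(kind) << 3) | (imm3 & 7);
    }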
- def : AddSubRegAlias(NAME#"Wrx"), - GPR32sp, GPR32sp, GPR32, 16>; // UXTW #0 - def : AddSubRegAlias(NAME#"Xrx64"), - GPR64sp, GPR64sp, GPR64, 24>; // UXTX #0 -} - -multiclass AddSubS { - let isCompare = 1, Defs = [CPSR] in { - // Add/Subtract immediate - def Wri : BaseAddSubImm { - let Inst{31} = 0; - } - def Xri : BaseAddSubImm { - let Inst{31} = 1; - } - - // Add/Subtract register - def Wrr : BaseAddSubRegPseudo; - def Xrr : BaseAddSubRegPseudo; - - // Add/Subtract shifted register - def Wrs : BaseAddSubSReg { - let Inst{31} = 0; - } - def Xrs : BaseAddSubSReg { - let Inst{31} = 1; - } - - // Add/Subtract extended register - let AddedComplexity = 1 in { - def Wrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 0; - } - def Xrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 1; - } - } - - def Xrx64 : BaseAddSubEReg64 { - // UXTX and SXTX only. - let Inst{14-13} = 0b11; - let Inst{31} = 1; - } - } // Defs = [CPSR] - - // Register/register aliases with no shift when SP is not used. - def : AddSubRegAlias(NAME#"Wrs"), - GPR32, GPR32, GPR32, 0>; - def : AddSubRegAlias(NAME#"Xrs"), - GPR64, GPR64, GPR64, 0>; - - // Register/register aliases with no shift when the first source register - // is SP. This relies on the shifted register aliases above matching first - // in the case when SP is not used. - def : AddSubRegAlias(NAME#"Wrx"), - GPR32, GPR32sp, GPR32, 16>; // UXTW #0 - def : AddSubRegAlias(NAME#"Xrx64"), - GPR64, GPR64sp, GPR64, 24>; // UXTX #0 -} - -//--- -// Extract -//--- -def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, - SDTCisPtrTy<3>]>; -def ARM64Extr : SDNode<"ARM64ISD::EXTR", SDTA64EXTR>; - -class BaseExtractImm patterns> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm), - asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>, - Sched<[WriteExtr, ReadExtrHi]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<6> imm; - - let Inst{30-23} = 0b00100111; - let Inst{21} = 0; - let Inst{20-16} = Rm; - let Inst{15-10} = imm; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass ExtractImm { - def Wrri : BaseExtractImm { - let Inst{31} = 0; - let Inst{22} = 0; - } - def Xrri : BaseExtractImm { - - let Inst{31} = 1; - let Inst{22} = 1; - } -} - -//--- -// Bitfield -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseBitfieldImm opc, - RegisterClass regtype, Operand imm_type, string asm> - : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms), - asm, "\t$Rd, $Rn, $immr, $imms", "", []>, - Sched<[WriteIS]> { - bits<5> Rd; - bits<5> Rn; - bits<6> immr; - bits<6> imms; - - let Inst{30-29} = opc; - let Inst{28-23} = 0b100110; - let Inst{21-16} = immr; - let Inst{15-10} = imms; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass BitfieldImm opc, string asm> { - def Wri : BaseBitfieldImm { - let Inst{31} = 0; - let Inst{22} = 0; - } - def Xri : BaseBitfieldImm { - let Inst{31} = 1; - let Inst{22} = 1; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseBitfieldImmWith2RegArgs opc, - RegisterClass regtype, Operand imm_type, string asm> - : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr, - imm_type:$imms), - asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>, - Sched<[WriteIS]> { - bits<5> Rd; - bits<5> Rn; - bits<6> immr; - bits<6> imms; - - let Inst{30-29} = opc; - let Inst{28-23} = 0b100110; - let Inst{21-16} = immr; - let Inst{15-10} = imms; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass BitfieldImmWith2RegArgs opc, 
string asm> { - def Wri : BaseBitfieldImmWith2RegArgs { - let Inst{31} = 0; - let Inst{22} = 0; - } - def Xri : BaseBitfieldImmWith2RegArgs { - let Inst{31} = 1; - let Inst{22} = 1; - } -} - -//--- -// Logical -//--- - -// Logical (immediate) -class BaseLogicalImm opc, RegisterClass dregtype, - RegisterClass sregtype, Operand imm_type, string asm, - list pattern> - : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm), - asm, "\t$Rd, $Rn, $imm", "", pattern>, - Sched<[WriteI]> { - bits<5> Rd; - bits<5> Rn; - bits<13> imm; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100100; - let Inst{22} = imm{12}; - let Inst{21-16} = imm{11-6}; - let Inst{15-10} = imm{5-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeLogicalImmInstruction"; -} - -// Logical (shifted register) -class BaseLogicalSReg opc, bit N, RegisterClass regtype, - logical_shifted_reg shifted_regtype, string asm, - list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", pattern>, - Sched<[WriteISReg]> { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> src1; - bits<5> src2; - bits<8> shift; - let Inst{30-29} = opc; - let Inst{28-24} = 0b01010; - let Inst{23-22} = shift{7-6}; - let Inst{21} = N; - let Inst{20-16} = src2; - let Inst{15-10} = shift{5-0}; - let Inst{9-5} = src1; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeThreeAddrSRegInstruction"; -} - -// Aliases for register+register logical instructions. -class LogicalRegAlias - : InstAlias; - -let AddedComplexity = 6 in -multiclass LogicalImm opc, string mnemonic, SDNode OpNode> { - def Wri : BaseLogicalImm { - let Inst{31} = 0; - let Inst{22} = 0; // 64-bit version has an additional bit of immediate. - } - def Xri : BaseLogicalImm { - let Inst{31} = 1; - } -} - -multiclass LogicalImmS opc, string mnemonic, SDNode OpNode> { - let isCompare = 1, Defs = [CPSR] in { - def Wri : BaseLogicalImm { - let Inst{31} = 0; - let Inst{22} = 0; // 64-bit version has an additional bit of immediate. - } - def Xri : BaseLogicalImm { - let Inst{31} = 1; - } - } // end Defs = [CPSR] -} - -class BaseLogicalRegPseudo - : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, - Sched<[WriteI]>; - -// Split from LogicalImm as not all instructions have both. -multiclass LogicalReg opc, bit N, string mnemonic, - SDPatternOperator OpNode> { - def Wrr : BaseLogicalRegPseudo; - def Xrr : BaseLogicalRegPseudo; - - def Wrs : BaseLogicalSReg { - let Inst{31} = 0; - } - def Xrs : BaseLogicalSReg { - let Inst{31} = 1; - } - - def : LogicalRegAlias(NAME#"Wrs"), GPR32>; - def : LogicalRegAlias(NAME#"Xrs"), GPR64>; -} - -// Split from LogicalReg to allow setting CPSR Defs -multiclass LogicalRegS opc, bit N, string mnemonic> { - let Defs = [CPSR], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def Wrs : BaseLogicalSReg{ - let Inst{31} = 0; - } - def Xrs : BaseLogicalSReg{ - let Inst{31} = 1; - } - } // Defs = [CPSR] - - def : LogicalRegAlias(NAME#"Wrs"), GPR32>; - def : LogicalRegAlias(NAME#"Xrs"), GPR64>; -} - -//--- -// Conditionally set flags -//--- - -// Condition code. -// 4-bit immediate. 
Pretty-printed as -def ccode : Operand { - let PrintMethod = "printCondCode"; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsImm - : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $imm, $nzcv, $cond", "", []>, - Sched<[WriteI]> { - let Uses = [CPSR]; - let Defs = [CPSR]; - - bits<5> Rn; - bits<5> imm; - bits<4> nzcv; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b111010010; - let Inst{20-16} = imm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4} = 0b0; - let Inst{3-0} = nzcv; -} - -multiclass CondSetFlagsImm { - def Wi : BaseCondSetFlagsImm { - let Inst{31} = 0; - } - def Xi : BaseCondSetFlagsImm { - let Inst{31} = 1; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsReg - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, - Sched<[WriteI]> { - let Uses = [CPSR]; - let Defs = [CPSR]; - - bits<5> Rn; - bits<5> Rm; - bits<4> nzcv; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b111010010; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4} = 0b0; - let Inst{3-0} = nzcv; -} - -multiclass CondSetFlagsReg { - def Wr : BaseCondSetFlagsReg { - let Inst{31} = 0; - } - def Xr : BaseCondSetFlagsReg { - let Inst{31} = 1; - } -} - -//--- -// Conditional select -//--- - -class BaseCondSelect op2, RegisterClass regtype, string asm> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), - asm, "\t$Rd, $Rn, $Rm, $cond", "", - [(set regtype:$Rd, - (ARM64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), CPSR))]>, - Sched<[WriteI]> { - let Uses = [CPSR]; - - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b011010100; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = op2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass CondSelect op2, string asm> { - def Wr : BaseCondSelect { - let Inst{31} = 0; - } - def Xr : BaseCondSelect { - let Inst{31} = 1; - } -} - -class BaseCondSelectOp op2, RegisterClass regtype, string asm, - PatFrag frag> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), - asm, "\t$Rd, $Rn, $Rm, $cond", "", - [(set regtype:$Rd, - (ARM64csel regtype:$Rn, (frag regtype:$Rm), - (i32 imm:$cond), CPSR))]>, - Sched<[WriteI]> { - let Uses = [CPSR]; - - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b011010100; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = op2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass CondSelectOp op2, string asm, PatFrag frag> { - def Wr : BaseCondSelectOp { - let Inst{31} = 0; - } - def Xr : BaseCondSelectOp { - let Inst{31} = 1; - } -} - -//--- -// Special Mask Value -//--- -def maski8_or_more : Operand, - ImmLeaf { -} -def maski16_or_more : Operand, - ImmLeaf { -} - - -//--- -// Load/store -//--- - -// (unsigned immediate) -// Indexed for 8-bit registers. offset is in range [0,4095]. 
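The unsigned-immediate operand classes that follow all share one scheme: the
byte offset is stored as offset/size in a 12-bit field, which is where the
ranges [0,4095], [0,8190] by 2, up to [0,65520] by 16 come from. A minimal
sketch of the encodability check (illustrative, not the LLVM helper):

    #include <cstdint>

    // offset must be size-aligned and offset/size must fit in 12 bits.
    static bool fitsScaledUImm12(uint64_t offsetBytes, unsigned sizeBytes) {
      return offsetBytes % sizeBytes == 0 && offsetBytes / sizeBytes < 4096;
    }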
-def MemoryIndexed8Operand : AsmOperandClass { - let Name = "MemoryIndexed8"; - let DiagnosticType = "InvalidMemoryIndexed8"; -} -def am_indexed8 : Operand, - ComplexPattern { - let PrintMethod = "printAMIndexed8"; - let EncoderMethod - = "getAMIndexed8OpValue"; - let ParserMatchClass = MemoryIndexed8Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} - -// Indexed for 16-bit registers. offset is multiple of 2 in range [0,8190], -// stored as immval/2 (the 12-bit literal that encodes directly into the insn). -def MemoryIndexed16Operand : AsmOperandClass { - let Name = "MemoryIndexed16"; - let DiagnosticType = "InvalidMemoryIndexed16"; -} -def am_indexed16 : Operand, - ComplexPattern { - let PrintMethod = "printAMIndexed16"; - let EncoderMethod - = "getAMIndexed8OpValue"; - let ParserMatchClass = MemoryIndexed16Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} - -// Indexed for 32-bit registers. offset is multiple of 4 in range [0,16380], -// stored as immval/4 (the 12-bit literal that encodes directly into the insn). -def MemoryIndexed32Operand : AsmOperandClass { - let Name = "MemoryIndexed32"; - let DiagnosticType = "InvalidMemoryIndexed32"; -} -def am_indexed32 : Operand, - ComplexPattern { - let PrintMethod = "printAMIndexed32"; - let EncoderMethod - = "getAMIndexed8OpValue"; - let ParserMatchClass = MemoryIndexed32Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} - -// Indexed for 64-bit registers. offset is multiple of 8 in range [0,32760], -// stored as immval/8 (the 12-bit literal that encodes directly into the insn). -def MemoryIndexed64Operand : AsmOperandClass { - let Name = "MemoryIndexed64"; - let DiagnosticType = "InvalidMemoryIndexed64"; -} -def am_indexed64 : Operand, - ComplexPattern { - let PrintMethod = "printAMIndexed64"; - let EncoderMethod - = "getAMIndexed8OpValue"; - let ParserMatchClass = MemoryIndexed64Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} - -// Indexed for 128-bit registers. offset is multiple of 16 in range [0,65520], -// stored as immval/16 (the 12-bit literal that encodes directly into the insn). -def MemoryIndexed128Operand : AsmOperandClass { - let Name = "MemoryIndexed128"; - let DiagnosticType = "InvalidMemoryIndexed128"; -} -def am_indexed128 : Operand, - ComplexPattern { - let PrintMethod = "printAMIndexed128"; - let EncoderMethod - = "getAMIndexed8OpValue"; - let ParserMatchClass = MemoryIndexed128Operand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} - -// No offset. 
-def MemoryNoIndexOperand : AsmOperandClass { let Name = "MemoryNoIndex"; } -def am_noindex : Operand, - ComplexPattern { - let PrintMethod = "printAMNoIndex"; - let ParserMatchClass = MemoryNoIndexOperand; - let MIOperandInfo = (ops GPR64sp:$base); -} - -class BaseLoadStoreUI sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, list pattern> - : I { - bits<5> dst; - - bits<17> addr; - bits<5> base = addr{4-0}; - bits<12> offset = addr{16-5}; - - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b01; - let Inst{23-22} = opc; - let Inst{21-10} = offset; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeUnsignedLdStInstruction"; -} - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class LoadUI sz, bit V, bits<2> opc, RegisterClass regtype, - Operand indextype, string asm, list pattern> - : BaseLoadStoreUI, - Sched<[WriteLD]>; - -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -class StoreUI sz, bit V, bits<2> opc, RegisterClass regtype, - Operand indextype, string asm, list pattern> - : BaseLoadStoreUI, - Sched<[WriteST]>; - -def PrefetchOperand : AsmOperandClass { - let Name = "Prefetch"; - let ParserMethod = "tryParsePrefetch"; -} -def prfop : Operand { - let PrintMethod = "printPrefetchOp"; - let ParserMatchClass = PrefetchOperand; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class PrefetchUI sz, bit V, bits<2> opc, string asm, list pat> - : BaseLoadStoreUI, - Sched<[WriteLD]>; - -//--- -// Load literal -//--- - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class LoadLiteral opc, bit V, RegisterClass regtype, string asm> - : I<(outs regtype:$Rt), (ins am_brcond:$label), - asm, "\t$Rt, $label", "", []>, - Sched<[WriteLD]> { - bits<5> Rt; - bits<19> label; - let Inst{31-30} = opc; - let Inst{29-27} = 0b011; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-5} = label; - let Inst{4-0} = Rt; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class PrefetchLiteral opc, bit V, string asm, list pat> - : I<(outs), (ins prfop:$Rt, am_brcond:$label), - asm, "\t$Rt, $label", "", pat>, - Sched<[WriteLD]> { - bits<5> Rt; - bits<19> label; - let Inst{31-30} = opc; - let Inst{29-27} = 0b011; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-5} = label; - let Inst{4-0} = Rt; -} - -//--- -// Load/store register offset -//--- - -class MemROAsmOperand : AsmOperandClass { - let Name = "MemoryRegisterOffset"#sz; -} - -def MemROAsmOperand8 : MemROAsmOperand<8>; -def MemROAsmOperand16 : MemROAsmOperand<16>; -def MemROAsmOperand32 : MemROAsmOperand<32>; -def MemROAsmOperand64 : MemROAsmOperand<64>; -def MemROAsmOperand128 : MemROAsmOperand<128>; - -class ro_indexed : Operand { // ComplexPattern<...> - let PrintMethod = "printMemoryRegOffset"#sz; - let MIOperandInfo = (ops GPR64sp:$base, GPR64:$offset, i32imm:$extend); -} - -def ro_indexed8 : ro_indexed<8>, ComplexPattern { - let ParserMatchClass = MemROAsmOperand8; -} - -def ro_indexed16 : ro_indexed<16>, ComplexPattern { - let ParserMatchClass = MemROAsmOperand16; -} - -def ro_indexed32 : ro_indexed<32>, ComplexPattern { - let ParserMatchClass = MemROAsmOperand32; -} - -def ro_indexed64 : ro_indexed<64>, ComplexPattern { - let ParserMatchClass = MemROAsmOperand64; -} - -def ro_indexed128 : ro_indexed<128>, ComplexPattern { - let ParserMatchClass = MemROAsmOperand128; -} - -class LoadStore8RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - // The operands are in order to 
match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> base; - bits<5> offset; - bits<4> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = offset; - let Inst{15-13} = extend{3-1}; - - let Inst{12} = extend{0}; - let Inst{11-10} = 0b10; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeRegOffsetLdStInstruction"; -} - -class Load8RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore8RO, - Sched<[WriteLDIdx, ReadAdrBase]>; - -class Store8RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore8RO, - Sched<[WriteSTIdx, ReadAdrBase]>; - -class LoadStore16RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> base; - bits<5> offset; - bits<4> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = offset; - let Inst{15-13} = extend{3-1}; - - let Inst{12} = extend{0}; - let Inst{11-10} = 0b10; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeRegOffsetLdStInstruction"; -} - -class Load16RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore16RO, - Sched<[WriteLDIdx, ReadAdrBase]>; - -class Store16RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore16RO, - Sched<[WriteSTIdx, ReadAdrBase]>; - -class LoadStore32RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> base; - bits<5> offset; - bits<4> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = offset; - let Inst{15-13} = extend{3-1}; - - let Inst{12} = extend{0}; - let Inst{11-10} = 0b10; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeRegOffsetLdStInstruction"; -} - -class Load32RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore32RO, - Sched<[WriteLDIdx, ReadAdrBase]>; - -class Store32RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore32RO, - Sched<[WriteSTIdx, ReadAdrBase]>; - -class LoadStore64RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. 
- bits<5> dst; - bits<5> base; - bits<5> offset; - bits<4> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = offset; - let Inst{15-13} = extend{3-1}; - - let Inst{12} = extend{0}; - let Inst{11-10} = 0b10; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeRegOffsetLdStInstruction"; -} - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class Load64RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore64RO, - Sched<[WriteLDIdx, ReadAdrBase]>; - -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -class Store64RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore64RO, - Sched<[WriteSTIdx, ReadAdrBase]>; - - -class LoadStore128RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> base; - bits<5> offset; - bits<4> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = offset; - let Inst{15-13} = extend{3-1}; - - let Inst{12} = extend{0}; - let Inst{11-10} = 0b10; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeRegOffsetLdStInstruction"; -} - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class Load128RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore128RO, - Sched<[WriteLDIdx, ReadAdrBase]>; - -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -class Store128RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pat> - : LoadStore128RO, - Sched<[WriteSTIdx, ReadAdrBase]>; - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class PrefetchRO sz, bit V, bits<2> opc, string asm, list pat> - : I<(outs), (ins prfop:$Rt, ro_indexed64:$addr), asm, - "\t$Rt, $addr", "", pat>, - Sched<[WriteLD]> { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. 
- bits<5> dst; - bits<5> base; - bits<5> offset; - bits<4> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = offset; - let Inst{15-13} = extend{3-1}; - - let Inst{12} = extend{0}; - let Inst{11-10} = 0b10; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeRegOffsetLdStInstruction"; -} - -//--- -// Load/store unscaled immediate -//--- - -def MemoryUnscaledOperand : AsmOperandClass { - let Name = "MemoryUnscaled"; - let DiagnosticType = "InvalidMemoryIndexedSImm9"; -} -class am_unscaled_operand : Operand { - let PrintMethod = "printAMUnscaled"; - let ParserMatchClass = MemoryUnscaledOperand; - let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset); -} -def am_unscaled : am_unscaled_operand; -def am_unscaled8 : am_unscaled_operand, - ComplexPattern; -def am_unscaled16 : am_unscaled_operand, - ComplexPattern; -def am_unscaled32 : am_unscaled_operand, - ComplexPattern; -def am_unscaled64 : am_unscaled_operand, - ComplexPattern; -def am_unscaled128 : am_unscaled_operand, - ComplexPattern; - -class BaseLoadStoreUnscale sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, list pattern> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> base; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b00; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -let AddedComplexity = 1 in // try this before LoadUI -class LoadUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, - Operand amtype, string asm, list pattern> - : BaseLoadStoreUnscale, - Sched<[WriteLD]>; - -let AddedComplexity = 1 in // try this before StoreUI -class StoreUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, - Operand amtype, string asm, list pattern> - : BaseLoadStoreUnscale, - Sched<[WriteST]>; - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class PrefetchUnscaled sz, bit V, bits<2> opc, string asm, list pat> - : BaseLoadStoreUnscale, - Sched<[WriteLD]>; - -//--- -// Load/store unscaled immediate, unprivileged -//--- - -class BaseLoadStoreUnprivileged sz, bit V, bits<2> opc, - dag oops, dag iops, string asm> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. 
- bits<5> dst; - bits<5> base; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b10; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in { -class LoadUnprivileged sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStoreUnprivileged, - Sched<[WriteLD]>; -} - -let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in { -class StoreUnprivileged sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStoreUnprivileged, - Sched<[WriteST]>; -} - -//--- -// Load/store pre-indexed -//--- - -class BaseLoadStorePreIdx sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, string cstr> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. - bits<5> dst; - bits<5> base; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0; - let Inst{23-22} = opc; - let Inst{21} = 0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b11; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -// FIXME: Modeling the write-back of these instructions for isel is tricky. -// we need the complex addressing mode for the memory reference, but -// we also need the write-back specified as a tied operand to the -// base register. That combination does not play nicely with -// the asm matcher and friends. -class LoadPreIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStorePreIdx, - Sched<[WriteLD, WriteAdr]>; - -let mayStore = 1, mayLoad = 0 in -class StorePreIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStorePreIdx, - Sched<[WriteAdr, WriteST]>; -} // hasSideEffects = 0 - -// ISel pseudo-instructions which have the tied operands. When the MC lowering -// logic finally gets smart enough to strip off tied operands that are just -// for isel convenience, we can get rid of these pseudos and just reference -// the real instructions directly. -// -// Ironically, also because of the writeback operands, we can't put the -// matcher pattern directly on the instruction, but need to define it -// separately. -// -// Loads aren't matched with patterns here at all, but rather in C++ -// custom lowering. 
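The tied-operand pseudos defined next exist because a pre- or post-indexed
access produces two results: the transferred value and the updated base. In
C-level terms (a sketch of the semantics only, not of LLVM code):

    #include <cstdint>

    // ldr x0, [x1, #off]!  -- pre-index: write back first, then access.
    static uint64_t loadPreIdxSketch(uintptr_t &base, int64_t off) {
      base += off;
      return *reinterpret_cast<const uint64_t *>(base);
    }

    // ldr x0, [x1], #off   -- post-index: access first, then write back.
    static uint64_t loadPostIdxSketch(uintptr_t &base, int64_t off) {
      uint64_t v = *reinterpret_cast<const uint64_t *>(base);
      base += off;
      return v;
    }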
-let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in { -class LoadPreIdxPseudo - : Pseudo<(outs regtype:$Rt, GPR64sp:$wback), - (ins am_noindex:$addr, simm9:$offset), [], - "$addr.base = $wback,@earlyclobber $wback">, - Sched<[WriteLD, WriteAdr]>; -class LoadPostIdxPseudo - : Pseudo<(outs regtype:$Rt, GPR64sp:$wback), - (ins am_noindex:$addr, simm9:$offset), [], - "$addr.base = $wback,@earlyclobber $wback">, - Sched<[WriteLD, WriteI]>; -} -multiclass StorePreIdxPseudo { - let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in - def _isel: Pseudo<(outs GPR64sp:$wback), - (ins regtype:$Rt, am_noindex:$addr, simm9:$offset), [], - "$addr.base = $wback,@earlyclobber $wback">, - Sched<[WriteAdr, WriteST]>; - - def : Pat<(OpNode (Ty regtype:$Rt), am_noindex:$addr, simm9:$offset), - (!cast(NAME#_isel) regtype:$Rt, am_noindex:$addr, - simm9:$offset)>; -} - -//--- -// Load/store post-indexed -//--- - -// (pre-index) load/stores. -class BaseLoadStorePostIdx sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, string cstr> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. - bits<5> dst; - bits<5> base; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0b0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b01; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -// FIXME: Modeling the write-back of these instructions for isel is tricky. -// we need the complex addressing mode for the memory reference, but -// we also need the write-back specified as a tied operand to the -// base register. That combination does not play nicely with -// the asm matcher and friends. -class LoadPostIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStorePostIdx, - Sched<[WriteLD, WriteI]>; - -let mayStore = 1, mayLoad = 0 in -class StorePostIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStorePostIdx, - Sched<[WriteAdr, WriteST, ReadAdrBase]>; -} // hasSideEffects = 0 - -// ISel pseudo-instructions which have the tied operands. When the MC lowering -// logic finally gets smart enough to strip off tied operands that are just -// for isel convenience, we can get rid of these pseudos and just reference -// the real instructions directly. -// -// Ironically, also because of the writeback operands, we can't put the -// matcher pattern directly on the instruction, but need to define it -// separately. -multiclass StorePostIdxPseudo { - let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in - def _isel: Pseudo<(outs GPR64sp:$wback), - (ins regtype:$Rt, am_noindex:$addr, simm9:$idx), [], - "$addr.base = $wback,@earlyclobber $wback">, - PseudoInstExpansion<(Insn regtype:$Rt, am_noindex:$addr, simm9:$idx)>, - Sched<[WriteAdr, WriteST, ReadAdrBase]>; - - def : Pat<(OpNode (Ty regtype:$Rt), am_noindex:$addr, simm9:$idx), - (!cast(NAME#_isel) regtype:$Rt, am_noindex:$addr, - simm9:$idx)>; -} - -//--- -// Load/store pair -//--- - -// (indexed, offset) - -class BaseLoadStorePairOffset opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. 
Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> dst2; - bits<5> base; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b010; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = dst2; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPairOffset opc, bit V, RegisterClass regtype, - Operand indextype, string asm> - : BaseLoadStorePairOffset, - Sched<[WriteLD, WriteLDHi]>; - -let mayLoad = 0, mayStore = 1 in -class StorePairOffset opc, bit V, RegisterClass regtype, - Operand indextype, string asm> - : BaseLoadStorePairOffset, - Sched<[WriteSTP]>; -} // hasSideEffects = 0 - -// (pre-indexed) - -def MemoryIndexed32SImm7 : AsmOperandClass { - let Name = "MemoryIndexed32SImm7"; - let DiagnosticType = "InvalidMemoryIndexed32SImm7"; -} -def am_indexed32simm7 : Operand { // ComplexPattern<...> - let PrintMethod = "printAMIndexed32"; - let ParserMatchClass = MemoryIndexed32SImm7; - let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset); -} - -def MemoryIndexed64SImm7 : AsmOperandClass { - let Name = "MemoryIndexed64SImm7"; - let DiagnosticType = "InvalidMemoryIndexed64SImm7"; -} -def am_indexed64simm7 : Operand { // ComplexPattern<...> - let PrintMethod = "printAMIndexed64"; - let ParserMatchClass = MemoryIndexed64SImm7; - let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset); -} - -def MemoryIndexed128SImm7 : AsmOperandClass { - let Name = "MemoryIndexed128SImm7"; - let DiagnosticType = "InvalidMemoryIndexed128SImm7"; -} -def am_indexed128simm7 : Operand { // ComplexPattern<...> - let PrintMethod = "printAMIndexed128"; - let ParserMatchClass = MemoryIndexed128SImm7; - let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset); -} - -class BaseLoadStorePairPreIdx opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> dst2; - bits<5> base; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b011; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = dst2; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPairPreIdx opc, bit V, RegisterClass regtype, - Operand addrmode, string asm> - : BaseLoadStorePairPreIdx, - Sched<[WriteLD, WriteLDHi, WriteAdr]>; - -let mayStore = 1, mayLoad = 0 in -class StorePairPreIdx opc, bit V, RegisterClass regtype, - Operand addrmode, string asm> - : BaseLoadStorePairPreIdx, - Sched<[WriteAdr, WriteSTP]>; -} // hasSideEffects = 0 - -// (post-indexed) - -class BaseLoadStorePairPostIdx opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. 
- bits<5> dst; - bits<5> dst2; - bits<5> base; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b001; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = dst2; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPairPostIdx opc, bit V, RegisterClass regtype, - Operand idxtype, string asm> - : BaseLoadStorePairPostIdx, - Sched<[WriteLD, WriteLDHi, WriteAdr]>; - -let mayStore = 1, mayLoad = 0 in -class StorePairPostIdx opc, bit V, RegisterClass regtype, - Operand idxtype, string asm> - : BaseLoadStorePairPostIdx, - Sched<[WriteAdr, WriteSTP]>; -} // hasSideEffects = 0 - -// (no-allocate) - -class BaseLoadStorePairNoAlloc opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> dst2; - bits<5> base; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b000; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = dst2; - let Inst{9-5} = base; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPairNoAlloc opc, bit V, RegisterClass regtype, - Operand indextype, string asm> - : BaseLoadStorePairNoAlloc, - Sched<[WriteLD, WriteLDHi]>; - -let mayStore = 1, mayLoad = 0 in -class StorePairNoAlloc opc, bit V, RegisterClass regtype, - Operand indextype, string asm> - : BaseLoadStorePairNoAlloc, - Sched<[WriteSTP]>; -} // hasSideEffects = 0 - -//--- -// Load/store exclusive -//--- - -// True exclusive operations write to and/or read from the system's exclusive -// monitors, which as far as a compiler is concerned can be modelled as a -// random shared memory address. Hence LoadExclusive mayStore. -let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in -class BaseLoadStoreExclusive sz, bit o2, bit L, bit o1, bit o0, - dag oops, dag iops, string asm, string operands> - : I { - let Inst{31-30} = sz; - let Inst{29-24} = 0b001000; - let Inst{23} = o2; - let Inst{22} = L; - let Inst{21} = o1; - let Inst{15} = o0; - - let DecoderMethod = "DecodeExclusiveLdStInstruction"; -} - -// Neither Rs nor Rt2 operands. 
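Before the simple exclusive formats that follow, here is a hedged sketch of the canonical LDXR/STXR retry loop (AArch64-only, GCC/Clang inline-asm syntax; the function name is invented). The STXR status result reflects exactly the monitor state that, as the comment above says, a compiler can only model as a random shared memory address.

#include <cstdint>

// Atomically add 'delta' to *addr with a load-exclusive/store-exclusive
// pair. stxr writes 0 on success and 1 if the exclusive monitor was
// lost, in which case the update must be retried.
uint64_t exclusive_fetch_add(uint64_t *addr, uint64_t delta) {
  uint64_t old, sum;
  uint32_t status;
  do {
    asm volatile("ldxr %0, [%3]\n\t"
                 "add  %1, %0, %4\n\t"
                 "stxr %w2, %1, [%3]"
                 : "=&r"(old), "=&r"(sum), "=&r"(status)
                 : "r"(addr), "r"(delta)
                 : "memory");
  } while (status != 0); // nonzero status: monitor lost, retry
  return old;
}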
-class LoadStoreExclusiveSimple sz, bit o2, bit L, bit o1, bit o0, - dag oops, dag iops, string asm, string operands> - : BaseLoadStoreExclusive { - bits<5> reg; - bits<5> base; - let Inst{20-16} = 0b11111; - let Inst{14-10} = 0b11111; - let Inst{9-5} = base; - let Inst{4-0} = reg; -} - -// Simple load acquires don't set the exclusive monitor -let mayLoad = 1, mayStore = 0 in -class LoadAcquire sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : LoadStoreExclusiveSimple, - Sched<[WriteLD]>; - -class LoadExclusive sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : LoadStoreExclusiveSimple, - Sched<[WriteLD]>; - -class LoadExclusivePair sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : BaseLoadStoreExclusive, - Sched<[WriteLD, WriteLDHi]> { - bits<5> dst1; - bits<5> dst2; - bits<5> base; - let Inst{20-16} = 0b11111; - let Inst{14-10} = dst2; - let Inst{9-5} = base; - let Inst{4-0} = dst1; -} - -// Simple store release operations do not check the exclusive monitor. -let mayLoad = 0, mayStore = 1 in -class StoreRelease sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : LoadStoreExclusiveSimple, - Sched<[WriteST]>; - -let mayLoad = 1, mayStore = 1 in -class StoreExclusive sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : BaseLoadStoreExclusive, - Sched<[WriteSTX]> { - bits<5> status; - bits<5> reg; - bits<5> base; - let Inst{20-16} = status; - let Inst{14-10} = 0b11111; - let Inst{9-5} = base; - let Inst{4-0} = reg; - - let Constraints = "@earlyclobber $Ws"; -} - -class StoreExclusivePair sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : BaseLoadStoreExclusive, - Sched<[WriteSTX]> { - bits<5> status; - bits<5> dst1; - bits<5> dst2; - bits<5> base; - let Inst{20-16} = status; - let Inst{14-10} = dst2; - let Inst{9-5} = base; - let Inst{4-0} = dst1; - - let Constraints = "@earlyclobber $Ws"; -} - -//--- -// Exception generation -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class ExceptionGeneration op1, bits<2> ll, string asm> - : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>, - Sched<[WriteSys]> { - bits<16> imm; - let Inst{31-24} = 0b11010100; - let Inst{23-21} = op1; - let Inst{20-5} = imm; - let Inst{4-2} = 0b000; - let Inst{1-0} = ll; -} - -//--- -// Floating point to integer conversion -//--- - -class BaseFPToIntegerUnscaled type, bits<2> rmode, bits<3> opcode, - RegisterClass srcType, RegisterClass dstType, - string asm, list pattern> - : I<(outs dstType:$Rd), (ins srcType:$Rn), - asm, "\t$Rd, $Rn", "", pattern>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30} = 0; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseFPToInteger type, bits<2> rmode, bits<3> opcode, - RegisterClass srcType, RegisterClass dstType, - Operand immType, string asm> - : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), - asm, "\t$Rd, $Rn, $scale", "", []>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - bits<6> scale; - let Inst{30} = 0; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = scale; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass FPToInteger rmode, bits<3> opcode, string asm, 
SDPatternOperator OpN> { - // Unscaled single-precision to 32-bit - def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm, - [(set GPR32:$Rd, (OpN FPR32:$Rn))]> { - let Inst{31} = 0; // 32-bit GPR flag - } - - // Unscaled single-precision to 64-bit - def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm, - [(set GPR64:$Rd, (OpN FPR32:$Rn))]> { - let Inst{31} = 1; // 64-bit GPR flag - } - - // Unscaled double-precision to 32-bit - def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm, - [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> { - let Inst{31} = 0; // 32-bit GPR flag - } - - // Unscaled double-precision to 64-bit - def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm, - [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> { - let Inst{31} = 1; // 64-bit GPR flag - } - - // Scaled single-precision to 32-bit - def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32, - fixedpoint32, asm> { - let Inst{31} = 0; // 32-bit GPR flag - } - - // Scaled single-precision to 64-bit - def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64, - fixedpoint64, asm> { - let Inst{31} = 1; // 64-bit GPR flag - } - - // Scaled double-precision to 32-bit - def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32, - fixedpoint32, asm> { - let Inst{31} = 0; // 32-bit GPR flag - } - - // Scaled double-precision to 64-bit - def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64, - fixedpoint64, asm> { - let Inst{31} = 1; // 64-bit GPR flag - } -} - -//--- -// Integer to floating point conversion -//--- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseIntegerToFP - : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), - asm, "\t$Rd, $Rn, $scale", "", []>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - bits<6> scale; - let Inst{30-23} = 0b00111100; - let Inst{21-17} = 0b00001; - let Inst{16} = isUnsigned; - let Inst{15-10} = scale; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseIntegerToFPUnscaled - : I<(outs dstType:$Rd), (ins srcType:$Rn), - asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - bits<6> scale; - let Inst{30-23} = 0b00111100; - let Inst{21-17} = 0b10001; - let Inst{16} = isUnsigned; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass IntegerToFP { - // Unscaled - def UWSri: BaseIntegerToFPUnscaled { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def UWDri: BaseIntegerToFPUnscaled { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - def UXSri: BaseIntegerToFPUnscaled { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def UXDri: BaseIntegerToFPUnscaled { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - // Scaled - def SWSri: BaseIntegerToFP { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def SWDri: BaseIntegerToFP { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - def SXSri: BaseIntegerToFP { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def SXDri: BaseIntegerToFP { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } -} - -//--- -// Unscaled integer <-> floating point conversion (i.e. 
FMOV) -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseUnscaledConversion rmode, bits<3> opcode, - RegisterClass srcType, RegisterClass dstType, - string asm> - : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", - // We use COPY_TO_REGCLASS for these bitconvert operations. - // copyPhysReg() expands the resultant COPY instructions after - // regalloc is done. This gives greater freedom for the allocator - // and related passes (coalescing, copy propagation, et al.) to - // be more effective. - [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>, - Sched<[WriteFCopy]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-23} = 0b00111100; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseUnscaledConversionToHigh rmode, bits<3> opcode, - RegisterClass srcType, RegisterOperand dstType, string asm> - : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd[1], $Rn", "", []>, - Sched<[WriteFCopy]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-23} = 0b00111101; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseUnscaledConversionFromHigh rmode, bits<3> opcode, - RegisterOperand srcType, RegisterClass dstType, string asm> - : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn[1]", "", []>, - Sched<[WriteFCopy]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-23} = 0b00111101; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - - - -multiclass UnscaledConversion { - def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128, - asm#".d"> { - let Inst{31} = 1; - let Inst{22} = 0; - } - - def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64, - asm#".d"> { - let Inst{31} = 1; - let Inst{22} = 0; - } - - def : InstAlias(NAME#XDHighr) V128:$Vd, GPR64:$Rn), 0>; - def : InstAlias(NAME#DXHighr) GPR64:$Rd, V128:$Vn), 0>; -} - -//--- -// Floating point conversion -//--- - -class BaseFPConversion type, bits<2> opcode, RegisterClass dstType, - RegisterClass srcType, string asm, list pattern> - : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-24} = 0b00011110; - let Inst{23-22} = type; - let Inst{21-17} = 0b10001; - let Inst{16-15} = opcode; - let Inst{14-10} = 0b10000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass FPConversion { - // Double-precision to Half-precision - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in - def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, []>; - - // Double-precision
to Single-precision - def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm, - [(set FPR32:$Rd, (fround FPR64:$Rn))]>; - - // Half-precision to Double-precision - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in - def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, []>; - - // Half-precision to Single-precision - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in - def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, []>; - - // Single-precision to Double-precision - def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, - [(set FPR64:$Rd, (fextend FPR32:$Rn))]>; - - // Single-precision to Half-precision - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in - def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, []>; -} - -//--- -// Single operand floating point data processing -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSingleOperandFPData opcode, RegisterClass regtype, - ValueType vt, string asm, SDPatternOperator node> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", - [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>, - Sched<[WriteF]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-23} = 0b000111100; - let Inst{21-19} = 0b100; - let Inst{18-15} = opcode; - let Inst{14-10} = 0b10000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SingleOperandFPData opcode, string asm, - SDPatternOperator node = null_frag> { - def Sr : BaseSingleOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Dr : BaseSingleOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - -//--- -// Two operand floating point data processing -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseTwoOperandFPData opcode, RegisterClass regtype, - string asm, list pat> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", pat>, - Sched<[WriteF]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass TwoOperandFPData opcode, string asm, - SDPatternOperator node = null_frag> { - def Srr : BaseTwoOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Drr : BaseTwoOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - -multiclass TwoOperandFPDataNeg opcode, string asm, SDNode node> { - def Srr : BaseTwoOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Drr : BaseTwoOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - - -//--- -// Three operand floating point data processing -//--- - -class BaseThreeOperandFPData pat> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra), - asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>, - Sched<[WriteFMul]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<5> Ra; - let Inst{31-23} = 0b000111110; - let Inst{21} = isNegated; - let Inst{20-16} = Rm; - let Inst{15} = isSub; - let Inst{14-10} = Ra; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass ThreeOperandFPData { - def Srrr : BaseThreeOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Drrr : BaseThreeOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - -//--- -// Floating point data comparisons -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseOneOperandFPComparison pat> - : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>, - Sched<[WriteFCmp]> { - bits<5> Rn; - let 
Inst{31-23} = 0b000111100; - let Inst{21} = 1; - - let Inst{20-16} = 0b00000; - let Inst{15-10} = 0b001000; - let Inst{9-5} = Rn; - let Inst{4} = signalAllNans; - let Inst{3-0} = 0b1000; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseTwoOperandFPComparison pat> - : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>, - Sched<[WriteFCmp]> { - bits<5> Rm; - bits<5> Rn; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-10} = 0b001000; - let Inst{9-5} = Rn; - let Inst{4} = signalAllNans; - let Inst{3-0} = 0b0000; -} - -multiclass FPComparison { - let Defs = [CPSR] in { - def Srr : BaseTwoOperandFPComparison { - let Inst{22} = 0; - } - - def Sri : BaseOneOperandFPComparison { - let Inst{22} = 0; - } - - def Drr : BaseTwoOperandFPComparison { - let Inst{22} = 1; - } - - def Dri : BaseOneOperandFPComparison { - let Inst{22} = 1; - } - } // Defs = [CPSR] -} - -//--- -// Floating point conditional comparisons -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseFPCondComparison - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, - Sched<[WriteFCmp]> { - bits<5> Rn; - bits<5> Rm; - bits<4> nzcv; - bits<4> cond; - - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b01; - let Inst{9-5} = Rn; - let Inst{4} = signalAllNans; - let Inst{3-0} = nzcv; -} - -multiclass FPCondComparison { - let Defs = [CPSR], Uses = [CPSR] in { - def Srr : BaseFPCondComparison { - let Inst{22} = 0; - } - - def Drr : BaseFPCondComparison { - let Inst{22} = 1; - } - } // Defs = [CPSR], Uses = [CPSR] -} - -//--- -// Floating point conditional select -//--- - -class BaseFPCondSelect - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), - asm, "\t$Rd, $Rn, $Rm, $cond", "", - [(set regtype:$Rd, - (ARM64csel (vt regtype:$Rn), regtype:$Rm, - (i32 imm:$cond), CPSR))]>, - Sched<[WriteF]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<4> cond; - - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b11; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass FPCondSelect { - let Uses = [CPSR] in { - def Srrr : BaseFPCondSelect { - let Inst{22} = 0; - } - - def Drrr : BaseFPCondSelect { - let Inst{22} = 1; - } - } // Uses = [CPSR] -} - -//--- -// Floating move immediate -//--- - -class BaseFPMoveImmediate - : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "", - [(set regtype:$Rd, fpimmtype:$imm)]>, - Sched<[WriteFImm]> { - bits<5> Rd; - bits<8> imm; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-13} = imm; - let Inst{12-5} = 0b10000000; - let Inst{4-0} = Rd; -} - -multiclass FPMoveImmediate { - def Si : BaseFPMoveImmediate { - let Inst{22} = 0; - } - - def Di : BaseFPMoveImmediate { - let Inst{22} = 1; - } -} - -//---------------------------------------------------------------------------- -// AdvSIMD -//---------------------------------------------------------------------------- - -def VectorIndexBOperand : AsmOperandClass { let Name = "VectorIndexB"; } -def VectorIndexHOperand : AsmOperandClass { let Name = "VectorIndexH"; } -def VectorIndexSOperand : AsmOperandClass { let Name = "VectorIndexS"; } -def VectorIndexDOperand : AsmOperandClass { let Name = "VectorIndexD"; } -def VectorIndexB : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexBOperand; - let 
PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexH : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexHOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexS : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexSOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexD : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexDOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} - -//---------------------------------------------------------------------------- -// AdvSIMD three register vector instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVector size, bits<5> opcode, - RegisterOperand regtype, string asm, string kind, - list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # - "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-11} = opcode; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVectorTied size, bits<5> opcode, - RegisterOperand regtype, string asm, string kind, - list pattern> - : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # - "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-11} = opcode; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// All operand sizes distinguished in the encoding. -multiclass SIMDThreeSameVector opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, - asm, ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, - asm, ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, - asm, ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, - asm, ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; - def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128, - asm, ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; -} - -// As above, but D sized elements unsupported. 
-multiclass SIMDThreeSameVectorBHS opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, - asm, ".8b", - [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, - asm, ".16b", - [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, - asm, ".4h", - [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, - asm, ".8h", - [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, - asm, ".2s", - [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, - asm, ".4s", - [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; -} - -multiclass SIMDThreeSameVectorBHSTied opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64, - asm, ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128, - asm, ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64, - asm, ".4h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128, - asm, ".8h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; -} - -// As above, but only B sized elements supported. -multiclass SIMDThreeSameVectorB opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, - asm, ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, - asm, ".16b", - [(set (v16i8 V128:$Rd), - (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; -} - -// As above, but only S and D sized floating point elements supported. 
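Before the FP variant that the comment above introduces, note that the tied multiclasses just defined differ only in dataflow: $Rd is both read and written ($Rd = $dst), the shape of an accumulating op such as MLA. A sketch of the same idea in the .4h arrangement, with multiply-accumulate standing in for OpNode (the function name is invented):

#include <cstdint>

// Tied three-same op: the destination doubles as the accumulator input.
void tied_mla_v4i16(uint16_t rd[4], const uint16_t rn[4],
                    const uint16_t rm[4]) {
  for (int lane = 0; lane < 4; ++lane)
    rd[lane] = static_cast<uint16_t>(rd[lane] + rn[lane] * rm[lane]);
}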
-multiclass SIMDThreeSameVectorFP opc, - string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; -} - -multiclass SIMDThreeSameVectorFPCmp opc, - string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; -} - -multiclass SIMDThreeSameVectorFPTied opc, - string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64, - asm, ".2s", - [(set (v2f32 V64:$dst), - (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128, - asm, ".4s", - [(set (v4f32 V128:$dst), - (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128, - asm, ".2d", - [(set (v2f64 V128:$dst), - (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; -} - -// As above, but D and B sized elements unsupported. -multiclass SIMDThreeSameVectorHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, - asm, ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, - asm, ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; -} - -// Logical three vector ops share opcode bits, and only use B sized elements. 
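The reason the logical multiclass below defines only .8b/.16b instructions and then remaps every other element type onto them with patterns is that bitwise operations cannot observe lane boundaries: the same 64 (or 128) result bits fall out however the register is sliced. In scalar form (illustrative only):

#include <cstdint>

// A 64-bit bitwise OR is simultaneously a v8i8, v4i16, v2i32 and v1i64
// OR, which is exactly what the v4i16/v2i32/v1i64 -> v8i8 patterns
// below express.
uint64_t orr_64(uint64_t rn, uint64_t rm) { return rn | rm; }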
-multiclass SIMDLogicalThreeVector size, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64, - asm, ".8b", - [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128, - asm, ".16b", - [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; - - def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)), - (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; - def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)), - (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; - def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)), - (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; - - def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)), - (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; - def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)), - (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; - def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)), - (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; -} - -multiclass SIMDLogicalThreeVectorTied size, - string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64, - asm, ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128, - asm, ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), - (v16i8 V128:$Rm)))]>; - - def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS), - (v4i16 V64:$RHS))), - (!cast(NAME#"v8i8") - V64:$LHS, V64:$MHS, V64:$RHS)>; - def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS), - (v2i32 V64:$RHS))), - (!cast(NAME#"v8i8") - V64:$LHS, V64:$MHS, V64:$RHS)>; - def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS), - (v1i64 V64:$RHS))), - (!cast(NAME#"v8i8") - V64:$LHS, V64:$MHS, V64:$RHS)>; - - def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS), - (v8i16 V128:$RHS))), - (!cast(NAME#"v16i8") - V128:$LHS, V128:$MHS, V128:$RHS)>; - def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS), - (v4i32 V128:$RHS))), - (!cast(NAME#"v16i8") - V128:$LHS, V128:$MHS, V128:$RHS)>; - def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS), - (v2i64 V128:$RHS))), - (!cast(NAME#"v16i8") - V128:$LHS, V128:$MHS, V128:$RHS)>; -} - - -//---------------------------------------------------------------------------- -// AdvSIMD two register vector instructions. 
-//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoSameVector size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, - "{\t$Rd" # dstkind # ", $Rn" # srckind # - "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoSameVectorTied size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list pattern> - : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, - "{\t$Rd" # dstkind # ", $Rn" # srckind # - "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// Supports B, H, and S element sizes. -multiclass SIMDTwoVectorBHS opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; -} - -class BaseSIMDVectorLShiftLongBySize size, - RegisterOperand regtype, string asm, string dstkind, - string srckind, string amount> - : I<(outs V128:$Rd), (ins regtype:$Rn), asm, - "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount # - "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-24} = 0b101110; - let Inst{23-22} = size; - let Inst{21-10} = 0b100001001110; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDVectorLShiftLongBySizeBHS { - let neverHasSideEffects = 1 in { - def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64, - "shll", ".8h", ".8b", "8">; - def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128, - "shll2", ".8h", ".16b", "8">; - def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64, - "shll", ".4s", ".4h", "16">; - def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128, - "shll2", ".4s", ".8h", "16">; - def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64, - "shll", ".2d", ".2s", "32">; - def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128, - "shll2", ".2d", ".4s", "32">; - } -} - -// Supports all element sizes. 
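Before the long two-vector classes that the comment above introduces, a quick scalar sketch of the SHLL forms just defined: the shift amount is fixed at the source element width and the result lanes are twice as wide (names invented for the sketch):

#include <cstdint>

// shll .8h, .8b, #8: widen each byte to 16 bits, then shift left by 8.
void shll_v8i8(const uint8_t rn[8], uint16_t rd[8]) {
  for (int lane = 0; lane < 8; ++lane)
    rd[lane] = static_cast<uint16_t>(rn[lane]) << 8;
}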
-multiclass SIMDLongTwoVector opc, string asm, - SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, - asm, ".4h", ".8b", - [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, - asm, ".8h", ".16b", - [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, - asm, ".2s", ".4h", - [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, - asm, ".4s", ".8h", - [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, - asm, ".1d", ".2s", - [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, - asm, ".2d", ".4s", - [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; -} - -multiclass SIMDLongTwoVectorTied opc, string asm, - SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, - asm, ".4h", ".8b", - [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), - (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, - asm, ".8h", ".16b", - [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), - (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, - asm, ".2s", ".4h", - [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), - (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, - asm, ".4s", ".8h", - [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), - (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, - asm, ".1d", ".2s", - [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), - (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, - asm, ".2d", ".4s", - [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), - (v4i32 V128:$Rn)))]>; -} - -// Supports all element sizes, except 1xD. 
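Before the BHSD-tied classes that the comment above introduces, note the shape of the long two-vector patterns just defined: the lane count halves while the lane width doubles, so adjacent source lanes pair up into one destination lane (UADDLP/SADDLP have this shape, and the tied forms accumulate on top of it as UADALP/SADALP do). A scalar sketch of the .8b to .4h case:

#include <cstdint>

// Pairwise add-long: each 16-bit result lane is the sum of two adjacent
// 8-bit source lanes, so no lane can overflow.
void uaddlp_v8i8(const uint8_t rn[8], uint16_t rd[4]) {
  for (int lane = 0; lane < 4; ++lane)
    rd[lane] = static_cast<uint16_t>(rn[2 * lane]) + rn[2 * lane + 1];
}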
-multiclass SIMDTwoVectorBHSDTied opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, - asm, ".4h", ".4h", - [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, - asm, ".8h", ".8h", - [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128, - asm, ".2d", ".2d", - [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; -} - -multiclass SIMDTwoVectorBHSD opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; -} - - -// Supports only B element sizes. -multiclass SIMDTwoVectorB size, bits<5> opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - -} - -// Supports only B and H element sizes. -multiclass SIMDTwoVectorBH opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; -} - -// Supports only S and D element sizes, uses high bit of the size field -// as an extra opcode bit. 
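The {S,0}/{S,1} notation used below means Inst{23-22} is no longer a pure element-size field for these FP forms: the low bit picks single (.2s/.4s) versus double (.2d) and the high bit S is borrowed as an extra opcode bit. A tiny illustrative helper, not LLVM code:

#include <cstdint>

// Pack the borrowed opcode bit S and the single/double selector into the
// two bits that land in Inst{23-22}.
uint32_t fp_size_field(bool S, bool is_double) {
  return (static_cast<uint32_t>(S) << 1) | static_cast<uint32_t>(is_double);
}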
-multiclass SIMDTwoVectorFP opc, string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; -} - -// Supports only S element size. -multiclass SIMDTwoVectorS opc, string asm, - SDPatternOperator OpNode> { - def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; -} - - -multiclass SIMDTwoVectorFPToInt opc, string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; -} - -multiclass SIMDTwoVectorIntToFP opc, string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; -} - - -class BaseSIMDMixedTwoVector size, bits<5> opcode, - RegisterOperand inreg, RegisterOperand outreg, - string asm, string outkind, string inkind, - list pattern> - : I<(outs outreg:$Rd), (ins inreg:$Rn), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind # - "|" # outkind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseSIMDMixedTwoVectorTied size, bits<5> opcode, - RegisterOperand inreg, RegisterOperand outreg, - string asm, string outkind, string inkind, - list pattern> - : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind # - "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDMixedTwoVector opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64, - asm, ".8b", ".8h", - [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128, - asm#"2", ".16b", ".8h", []>; - def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64, - asm, ".4h", ".4s", - [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v8i16 : 
BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128, - asm#"2", ".8h", ".4s", []>; - def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64, - asm, ".2s", ".2d", - [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>; - def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128, - asm#"2", ".4s", ".2d", []>; - - def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))), - (!cast(NAME # "v16i8") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; - def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))), - (!cast(NAME # "v8i16") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; - def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))), - (!cast(NAME # "v4i32") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -} - -class BaseSIMDCmpTwoVector size, bits<5> opcode, - RegisterOperand regtype, string asm, string kind, - ValueType dty, ValueType sty, SDNode OpNode> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", #0" # - "|" # kind # "\t$Rd, $Rn, #0}", "", - [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// Comparisons support all element sizes, except 1xD. -multiclass SIMDCmpTwoVector opc, string asm, - SDNode OpNode> { - def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64, - asm, ".8b", - v8i8, v8i8, OpNode>; - def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128, - asm, ".16b", - v16i8, v16i8, OpNode>; - def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64, - asm, ".4h", - v4i16, v4i16, OpNode>; - def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128, - asm, ".8h", - v8i16, v8i16, OpNode>; - def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64, - asm, ".2s", - v2i32, v2i32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128, - asm, ".4s", - v4i32, v4i32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128, - asm, ".2d", - v2i64, v2i64, OpNode>; -} - -// FP Comparisons support only S and D element sizes. 
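A vector FP comparison writes an integer lane mask, which is why the patterns below give the results integer types (v2i32 for .2s inputs, and so on): each lane becomes all-ones when the predicate holds and all-zeros otherwise. A scalar sketch of a compare-against-zero in the FCMEQ style (function name invented):

#include <cstdint>

// fcmeq-with-zero model for a .2s arrangement: true lanes become
// 0xFFFFFFFF, false lanes become 0.
void fcmeq_zero_v2f32(const float rn[2], uint32_t rd[2]) {
  for (int lane = 0; lane < 2; ++lane)
    rd[lane] = (rn[lane] == 0.0f) ? 0xFFFFFFFFu : 0u;
}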
-multiclass SIMDFPCmpTwoVector opc, - string asm, SDNode OpNode> { - def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64, - asm, ".2s", - v2i32, v2f32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128, - asm, ".4s", - v4i32, v4f32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128, - asm, ".2d", - v2i64, v2f64, OpNode>; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDFPCvtTwoVector size, bits<5> opcode, - RegisterOperand outtype, RegisterOperand intype, - string asm, string VdTy, string VnTy, - list pattern> - : I<(outs outtype:$Rd), (ins intype:$Rn), asm, - !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseSIMDFPCvtTwoVectorTied size, bits<5> opcode, - RegisterOperand outtype, RegisterOperand intype, - string asm, string VdTy, string VnTy, - list pattern> - : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm, - !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDFPWidenTwoVector opc, string asm> { - def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64, - asm, ".4s", ".4h", []>; - def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128, - asm#"2", ".4s", ".8h", []>; - def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64, - asm, ".2d", ".2s", []>; - def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128, - asm#"2", ".2d", ".4s", []>; -} - -multiclass SIMDFPNarrowTwoVector opc, string asm> { - def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128, - asm, ".4h", ".4s", []>; - def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128, - asm#"2", ".8h", ".4s", []>; - def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, - asm, ".2s", ".2d", []>; - def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, - asm#"2", ".4s", ".2d", []>; -} - -multiclass SIMDFPInexactCvtTwoVector opc, string asm, - Intrinsic OpNode> { - def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, - asm, ".2s", ".2d", - [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>; - def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, - asm#"2", ".4s", ".2d", []>; - - def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))), - (!cast(NAME # "v4f32") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD three register different-size vector instructions. 
-//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDDifferentThreeVector size, bits<4> opcode, - RegisterOperand outtype, RegisterOperand intype1, - RegisterOperand intype2, string asm, - string outkind, string inkind1, string inkind2, - list pattern> - : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # - "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = size{0}; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size{2-1}; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = opcode; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDDifferentThreeVectorTied size, bits<4> opcode, - RegisterOperand outtype, RegisterOperand intype1, - RegisterOperand intype2, string asm, - string outkind, string inkind1, string inkind2, - list pattern> - : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # - "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = size{0}; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size{2-1}; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = opcode; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// FIXME: TableGen doesn't know how to deal with expanded types that also -// change the element count (in this case, placing the results in -// the high elements of the result register rather than the low -// elements). Until that's fixed, we can't code-gen those. -multiclass SIMDNarrowThreeVectorBHS opc, string asm, - Intrinsic IntOp> { - def v8i16_v8i8 : BaseSIMDDifferentThreeVector; - def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied; - def v4i32_v4i16 : BaseSIMDDifferentThreeVector; - def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v2i64_v2i32 : BaseSIMDDifferentThreeVector; - def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied; - - - // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in - // a version attached to an instruction. 
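Concretely, the '2' variants leave the low 64 bits of the destination untouched and deposit the narrowed results in the high half, which the concat_vectors patterns below spell out via INSERT_SUBREG. A scalar sketch with an ADDHN2-style op (add, keep the high half of each sum) standing in for IntOp:

#include <cstdint>

// Narrow-high: lanes 0..3 of rd are preserved; the narrowed sums fill
// lanes 4..7.
void addhn2_v4i32(uint16_t rd[8], const uint32_t rn[4],
                  const uint32_t rm[4]) {
  for (int lane = 0; lane < 4; ++lane)
    rd[4 + lane] = static_cast<uint16_t>((rn[lane] + rm[lane]) >> 16);
}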
-  def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn),
-                                                   (v8i16 V128:$Rm))),
-            (!cast<Instruction>(NAME # "v8i16_v16i8")
-                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                V128:$Rn, V128:$Rm)>;
-  def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn),
-                                                    (v4i32 V128:$Rm))),
-            (!cast<Instruction>(NAME # "v4i32_v8i16")
-                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                V128:$Rn, V128:$Rm)>;
-  def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn),
-                                                    (v2i64 V128:$Rm))),
-            (!cast<Instruction>(NAME # "v2i64_v4i32")
-                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                V128:$Rn, V128:$Rm)>;
-}
-
-multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
-                                      Intrinsic IntOp> {
-  def v8i8  : BaseSIMDDifferentThreeVector;
-  def v16i8 : BaseSIMDDifferentThreeVector;
-  def v1i64 : BaseSIMDDifferentThreeVector;
-  def v2i64 : BaseSIMDDifferentThreeVector;
-
-  def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)),
-                          (v8i8 (extract_high_v16i8 V128:$Rm)))),
-            (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
-}
-
-multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
-                                 SDPatternOperator OpNode> {
-  def v4i16_v4i32 : BaseSIMDDifferentThreeVector;
-  def v8i16_v4i32 : BaseSIMDDifferentThreeVector;
-  def v2i32_v2i64 : BaseSIMDDifferentThreeVector;
-  def v4i32_v2i64 : BaseSIMDDifferentThreeVector;
-}
-
-multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
-                                      SDPatternOperator OpNode = null_frag> {
-  def v8i8_v8i16  : BaseSIMDDifferentThreeVector;
-  def v16i8_v8i16 : BaseSIMDDifferentThreeVector;
-  def v4i16_v4i32 : BaseSIMDDifferentThreeVector;
-  def v8i16_v4i32 : BaseSIMDDifferentThreeVector;
-  def v2i32_v2i64 : BaseSIMDDifferentThreeVector;
-  def v4i32_v2i64 : BaseSIMDDifferentThreeVector;
-}
-
-multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
-                                          string asm,
-                                          SDPatternOperator OpNode> {
-  def v8i8_v8i16  : BaseSIMDDifferentThreeVectorTied;
-  def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied;
-  def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied;
-  def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied;
-  def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied;
-  def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied;
-}
-
-multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
-                                  SDPatternOperator OpNode = null_frag> {
-  def v8i8_v8i16  : BaseSIMDDifferentThreeVector;
-  def v16i8_v8i16 : BaseSIMDDifferentThreeVector;
-  def v4i16_v4i32 : BaseSIMDDifferentThreeVector;
-  def v8i16_v4i32 : BaseSIMDDifferentThreeVector;
-  def v2i32_v2i64 : BaseSIMDDifferentThreeVector;
-  def v4i32_v2i64 : BaseSIMDDifferentThreeVector;
-}
-
-multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc,
-                                      string asm,
-                                      SDPatternOperator OpNode> {
-  def v8i8_v8i16  : BaseSIMDDifferentThreeVectorTied;
-  def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied;
-  def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied;
-  def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied;
-  def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied;
-  def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied;
-}
-
-multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm,
-                                           SDPatternOperator Accum> {
-  def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied;
-  def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied;
-  def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied;
-  def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied;
-}
-
-multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm,
-                                  SDPatternOperator OpNode> {
-  def v8i8_v8i16  : BaseSIMDDifferentThreeVector;
-  def v16i8_v8i16 : BaseSIMDDifferentThreeVector;
-  def v4i16_v4i32 : BaseSIMDDifferentThreeVector;
-  def v8i16_v4i32 : BaseSIMDDifferentThreeVector;
-  def v2i32_v2i64 : BaseSIMDDifferentThreeVector;
-  def v4i32_v2i64 : BaseSIMDDifferentThreeVector;
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD bitwise extract from vector
-//----------------------------------------------------------------------------
-
-class BaseSIMDBitwiseExtract<bit size, RegisterOperand regtype, ValueType vty,
-                             string asm, string kind>
-  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm,
-      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" #
-      "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "",
-      [(set (vty regtype:$Rd),
-            (ARM64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  bits<4> imm;
-  let Inst{31}    = 0;
-  let Inst{30}    = size;
-  let Inst{29-21} = 0b101110000;
-  let Inst{20-16} = Rm;
-  let Inst{15}    = 0;
-  let Inst{14-11} = imm;
-  let Inst{10}    = 0;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-
-multiclass SIMDBitwiseExtract<string asm> {
-  def v8i8  : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b">;
-  def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">;
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD zip vector
-//----------------------------------------------------------------------------
-
-class BaseSIMDZipVector<bits<3> size, bits<3> opc, RegisterOperand regtype,
-                        string asm, string kind, SDNode OpNode, ValueType valty>
-  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
-      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
-      "|" # kind # "\t$Rd, $Rn, $Rm}", "",
-      [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{31}    = 0;
-  let Inst{30}    = size{0};
-  let Inst{29-24} = 0b001110;
-  let Inst{23-22} = size{2-1};
-  let Inst{21}    = 0;
-  let Inst{20-16} = Rm;
-  let Inst{15}    = 0;
-  let Inst{14-12} = opc;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass SIMDZipVector<bits<3> opc, string asm,
-                         SDNode OpNode> {
-  def v8i8  : BaseSIMDZipVector<0b000, opc, V64,
-                                asm, ".8b", OpNode, v8i8>;
-  def v16i8 : BaseSIMDZipVector<0b001, opc, V128,
-                                asm, ".16b", OpNode, v16i8>;
-  def v4i16 : BaseSIMDZipVector<0b010, opc, V64,
-                                asm, ".4h", OpNode, v4i16>;
-  def v8i16 : BaseSIMDZipVector<0b011, opc, V128,
-                                asm, ".8h", OpNode, v8i16>;
-  def v2i32 : BaseSIMDZipVector<0b100, opc, V64,
-                                asm, ".2s", OpNode, v2i32>;
-  def v4i32 : BaseSIMDZipVector<0b101, opc, V128,
-                                asm, ".4s", OpNode, v4i32>;
-  def v2i64 : BaseSIMDZipVector<0b111, opc, V128,
-                                asm, ".2d", OpNode, v2i64>;
-
-  def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
-            (!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
-  def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),
-            (!cast<Instruction>(NAME#"v4i32") V128:$Rn, V128:$Rm)>;
-  def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)),
-            (!cast<Instruction>(NAME#"v2i64") V128:$Rn, V128:$Rm)>;
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD three register scalar instructions
-//----------------------------------------------------------------------------
-
-let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
-class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode,
-                          RegisterClass regtype, string asm,
-                          list<dag> pattern>
-  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
-      "\t$Rd, $Rn, $Rm", "", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{31-30} = 0b01;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = size;
-  let Inst{21}    = 1;
-  let Inst{20-16} = Rm;
-  let Inst{15-11} = opcode;
-  let Inst{10}    = 1;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
-                            SDPatternOperator OpNode> {
-  def v1i64 : BaseSIMDThreeScalar;
-}
-
-multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
-                               SDPatternOperator OpNode> {
-  def v1i64 : BaseSIMDThreeScalar;
-  def v1i32 : BaseSIMDThreeScalar;
-  def v1i16 : BaseSIMDThreeScalar;
-  def v1i8  : BaseSIMDThreeScalar;
-
-  def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
-            (!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
-  def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
-            (!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
-}
-
-multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
-                             SDPatternOperator OpNode> {
-  def v1i32 : BaseSIMDThreeScalar;
-  def v1i16 : BaseSIMDThreeScalar;
-}
-
-multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm,
-                             SDPatternOperator OpNode = null_frag> {
-  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
-    def #NAME#64 : BaseSIMDThreeScalar;
-    def #NAME#32 : BaseSIMDThreeScalar;
-  }
-
-  def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-            (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
-}
-
-multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<5> opc, string asm,
-                                SDPatternOperator OpNode = null_frag> {
-  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
-    def #NAME#64 : BaseSIMDThreeScalar;
-    def #NAME#32 : BaseSIMDThreeScalar;
-  }
-
-  def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-            (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
-}
-
-class BaseSIMDThreeScalarMixed<bit U, bits<2> size, bits<5> opcode,
-        dag oops, dag iops, string asm, string cstr, list<dag> pat>
-  : I<oops, iops, asm, "\t$Rd, $Rn, $Rm", cstr, pat>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{31-30} = 0b01;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = size;
-  let Inst{21}    = 1;
-  let Inst{20-16} = Rm;
-  let Inst{15-11} = opcode;
-  let Inst{10}    = 0;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm,
-                                  SDPatternOperator OpNode = null_frag> {
-  def i16 : BaseSIMDThreeScalarMixed;
-  def i32 : BaseSIMDThreeScalarMixed;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm,
-                                      SDPatternOperator OpNode = null_frag> {
-  def i16 : BaseSIMDThreeScalarMixed;
-  def i32 : BaseSIMDThreeScalarMixed;
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD two register scalar instructions
-//----------------------------------------------------------------------------
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode,
-                        RegisterClass regtype, RegisterClass regtype2,
-                        string asm, list<dag> pat>
-  : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm,
-      "\t$Rd, $Rn", "", pat>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31-30} = 0b01;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode,
-                            RegisterClass regtype, RegisterClass regtype2,
-                            string asm, list<dag> pat>
-  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm,
-      "\t$Rd, $Rn", "$Rd = $dst", pat>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31-30} = 0b01;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10000;
-  let Inst{16-12} = opcode;
-  let
Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDCmpTwoScalar size, bits<5> opcode, - RegisterClass regtype, string asm> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, - "\t$Rd, $Rn, #0", "", []>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SIMDInexactCvtTwoScalar opcode, string asm> - : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "", - [(set (f32 FPR32:$Rd), (int_arm64_sisd_fcvtxn (f64 FPR64:$Rn)))]>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-17} = 0b011111100110000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDCmpTwoScalarD opc, string asm, - SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar; - - def : Pat<(v1i64 (OpNode FPR64:$Rn)), - (!cast(NAME # v1i64rz) FPR64:$Rn)>; -} - -multiclass SIMDCmpTwoScalarSD opc, string asm, - SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar; - def v1i32rz : BaseSIMDCmpTwoScalar; - - def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), - (!cast(NAME # v1i64rz) FPR64:$Rn)>; -} - -multiclass SIMDTwoScalarD opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v1i64 : BaseSIMDTwoScalar; - - def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), - (!cast(NAME # "v1i64") FPR64:$Rn)>; -} - -multiclass SIMDTwoScalarSD opc, string asm> { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; -} - -multiclass SIMDTwoScalarCVTSD opc, string asm, - SDPatternOperator OpNode> { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; -} - -multiclass SIMDTwoScalarBHSD opc, string asm, - SDPatternOperator OpNode = null_frag> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; - def v1i16 : BaseSIMDTwoScalar; - def v1i8 : BaseSIMDTwoScalar; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), - (!cast(NAME # v1i64) FPR64:$Rn)>; -} - -multiclass SIMDTwoScalarBHSDTied opc, string asm, - Intrinsic OpNode> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v1i64 : BaseSIMDTwoScalarTied; - def v1i32 : BaseSIMDTwoScalarTied; - def v1i16 : BaseSIMDTwoScalarTied; - def v1i8 : BaseSIMDTwoScalarTied; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))), - (!cast(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>; -} - - - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDTwoScalarMixedBHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v1i32 : BaseSIMDTwoScalar; - def v1i16 : BaseSIMDTwoScalar; - def v1i8 : BaseSIMDTwoScalar; -} - -//---------------------------------------------------------------------------- -// AdvSIMD scalar pairwise instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDPairwiseScalar size, bits<5> opcode, - RegisterOperand regtype, RegisterOperand vectype, - string asm, string kind> - : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, - "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let 
Inst{21-17} = 0b11000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDPairwiseScalarD opc, string asm> { - def v2i64p : BaseSIMDPairwiseScalar; -} - -multiclass SIMDPairwiseScalarSD opc, string asm> { - def v2i32p : BaseSIMDPairwiseScalar; - def v2i64p : BaseSIMDPairwiseScalar; -} - -//---------------------------------------------------------------------------- -// AdvSIMD across lanes instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDAcrossLanes size, bits<5> opcode, - RegisterClass regtype, RegisterOperand vectype, - string asm, string kind, list pattern> - : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, - "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b11000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDAcrossLanesBHS opcode, - string asm> { - def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8, V64, - asm, ".8b", []>; - def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128, - asm, ".16b", []>; - def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64, - asm, ".4h", []>; - def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128, - asm, ".8h", []>; - def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128, - asm, ".4s", []>; -} - -multiclass SIMDAcrossLanesHSD opcode, string asm> { - def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64, - asm, ".8b", []>; - def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128, - asm, ".16b", []>; - def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64, - asm, ".4h", []>; - def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128, - asm, ".8h", []>; - def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128, - asm, ".4s", []>; -} - -multiclass SIMDAcrossLanesS opcode, bit sz1, string asm, - Intrinsic intOp> { - def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, - asm, ".4s", - [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD INS/DUP instructions -//---------------------------------------------------------------------------- - -// FIXME: There has got to be a better way to factor these. ugh. 
- -class BaseSIMDInsDup pattern> - : I, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = op; - let Inst{28-21} = 0b01110000; - let Inst{15} = 0; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SIMDDupFromMain imm5, string size, ValueType vectype, - RegisterOperand vecreg, RegisterClass regtype> - : BaseSIMDInsDup { - let Inst{20-16} = imm5; - let Inst{14-11} = 0b0001; -} - -class SIMDDupFromElement - : BaseSIMDInsDup { - let Inst{14-11} = 0b0000; -} - -class SIMDDup64FromElement - : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128, - VectorIndexD, i64, ARM64duplane64> { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; -} - -class SIMDDup32FromElement - : SIMDDupFromElement { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; -} - -class SIMDDup16FromElement - : SIMDDupFromElement { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; -} - -class SIMDDup8FromElement - : SIMDDupFromElement { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; -} - -class BaseSIMDMov imm4, RegisterClass regtype, - Operand idxtype, string asm, list pattern> - : BaseSIMDInsDup { - let Inst{14-11} = imm4; -} - -class SIMDSMov - : BaseSIMDMov; -class SIMDUMov - : BaseSIMDMov; - -class SIMDMovAlias - : InstAlias; - -multiclass SMov { - def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } -} - -multiclass UMov { - def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } - def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - } - def : SIMDMovAlias<"mov", ".s", - !cast(NAME#"vi32"), - GPR32, VectorIndexS>; - def : SIMDMovAlias<"mov", ".d", - !cast(NAME#"vi64"), - GPR64, VectorIndexD>; -} - -class SIMDInsFromMain - : BaseSIMDInsDup<1, 0, (outs V128:$dst), - (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins", - "{\t$Rd" # size # "$idx, $Rn" # - "|" # size # "\t$Rd$idx, $Rn}", - "$Rd = $dst", - [(set V128:$dst, - (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> { - let Inst{14-11} = 0b0011; -} - -class SIMDInsFromElement - : BaseSIMDInsDup<1, 1, (outs V128:$dst), - (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins", - "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" # - "|" # size # "\t$Rd$idx, $Rn$idx2}", - "$Rd = $dst", - [(set V128:$dst, - (vector_insert - (vectype V128:$Rd), - (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)), - idxtype:$idx))]>; - -class SIMDInsMainMovAlias - : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # - "|" # size #"\t$dst$idx, $src}", - (inst V128:$dst, 
idxtype:$idx, regtype:$src)>; -class SIMDInsElementMovAlias - : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # - # "|" # size #" $dst$idx, $src$idx2}", - (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; - - -multiclass SIMDIns { - def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } - def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - } - - def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> { - bits<4> idx; - bits<4> idx2; - let Inst{20-17} = idx; - let Inst{16} = 1; - let Inst{14-11} = idx2; - } - def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> { - bits<3> idx; - bits<3> idx2; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - let Inst{14-12} = idx2; - let Inst{11} = 0; - } - def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> { - bits<2> idx; - bits<2> idx2; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - let Inst{14-13} = idx2; - let Inst{12-11} = 0; - } - def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> { - bits<1> idx; - bits<1> idx2; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - let Inst{14} = idx2; - let Inst{13-11} = 0; - } - - // For all forms of the INS instruction, the "mov" mnemonic is the - // preferred alias. Why they didn't just call the instruction "mov" in - // the first place is a very good question indeed... 
- def : SIMDInsMainMovAlias<".b", !cast(NAME#"vi8gpr"), - GPR32, VectorIndexB>; - def : SIMDInsMainMovAlias<".h", !cast(NAME#"vi16gpr"), - GPR32, VectorIndexH>; - def : SIMDInsMainMovAlias<".s", !cast(NAME#"vi32gpr"), - GPR32, VectorIndexS>; - def : SIMDInsMainMovAlias<".d", !cast(NAME#"vi64gpr"), - GPR64, VectorIndexD>; - - def : SIMDInsElementMovAlias<".b", !cast(NAME#"vi8lane"), - VectorIndexB>; - def : SIMDInsElementMovAlias<".h", !cast(NAME#"vi16lane"), - VectorIndexH>; - def : SIMDInsElementMovAlias<".s", !cast(NAME#"vi32lane"), - VectorIndexS>; - def : SIMDInsElementMovAlias<".d", !cast(NAME#"vi64lane"), - VectorIndexD>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD TBL/TBX -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDTableLookup len, bit op, RegisterOperand vectype, - RegisterOperand listtype, string asm, string kind> - : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm, - "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>, - Sched<[WriteV]> { - bits<5> Vd; - bits<5> Vn; - bits<5> Vm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-21} = 0b001110000; - let Inst{20-16} = Vm; - let Inst{15} = 0; - let Inst{14-13} = len; - let Inst{12} = op; - let Inst{11-10} = 0b00; - let Inst{9-5} = Vn; - let Inst{4-0} = Vd; -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDTableLookupTied len, bit op, RegisterOperand vectype, - RegisterOperand listtype, string asm, string kind> - : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm, - "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>, - Sched<[WriteV]> { - bits<5> Vd; - bits<5> Vn; - bits<5> Vm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-21} = 0b001110000; - let Inst{20-16} = Vm; - let Inst{15} = 0; - let Inst{14-13} = len; - let Inst{12} = op; - let Inst{11-10} = 0b00; - let Inst{9-5} = Vn; - let Inst{4-0} = Vd; -} - -class SIMDTableLookupAlias - : InstAlias; - -multiclass SIMDTableLookup { - def v8i8One : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b, - asm, ".8b">; - def v8i8Two : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b, - asm, ".8b">; - def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b, - asm, ".8b">; - def v8i8Four : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b, - asm, ".8b">; - def v16i8One : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b, - asm, ".16b">; - def v16i8Two : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b, - asm, ".16b">; - def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b, - asm, ".16b">; - def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b, - asm, ".16b">; - - def : SIMDTableLookupAlias(NAME#"v8i8One"), - V64, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v8i8Two"), - V64, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v8i8Three"), - V64, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v8i8Four"), - V64, VecListFour128>; - def : SIMDTableLookupAlias(NAME#"v16i8One"), - V128, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v16i8Two"), - V128, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v16i8Three"), - V128, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v16i8Four"), - V128, VecListFour128>; -} - -multiclass SIMDTableLookupTied { - def v8i8One : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b, - asm, ".8b">; - def v8i8Two : BaseSIMDTableLookupTied<0, 0b01, op, V64, 
VecListTwo16b, - asm, ".8b">; - def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b, - asm, ".8b">; - def v8i8Four : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b, - asm, ".8b">; - def v16i8One : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b, - asm, ".16b">; - def v16i8Two : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b, - asm, ".16b">; - def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b, - asm, ".16b">; - def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b, - asm, ".16b">; - - def : SIMDTableLookupAlias(NAME#"v8i8One"), - V64, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v8i8Two"), - V64, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v8i8Three"), - V64, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v8i8Four"), - V64, VecListFour128>; - def : SIMDTableLookupAlias(NAME#"v16i8One"), - V128, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v16i8Two"), - V128, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v16i8Three"), - V128, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v16i8Four"), - V128, VecListFour128>; -} - - -//---------------------------------------------------------------------------- -// AdvSIMD scalar CPY -//---------------------------------------------------------------------------- -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDScalarCPY - : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov", - "{\t$dst, $src" # kind # "$idx" # - "|\t$dst, $src$idx}", "", []>, - Sched<[WriteV]> { - bits<5> dst; - bits<5> src; - let Inst{31-21} = 0b01011110000; - let Inst{15-10} = 0b000001; - let Inst{9-5} = src; - let Inst{4-0} = dst; -} - -class SIMDScalarCPYAlias - : InstAlias; - - -multiclass SIMDScalarCPY { - def i8 : BaseSIMDScalarCPY { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def i16 : BaseSIMDScalarCPY { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def i32 : BaseSIMDScalarCPY { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } - def i64 : BaseSIMDScalarCPY { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - } - - // 'DUP' mnemonic aliases. 
- def : SIMDScalarCPYAlias<"dup", ".b", - !cast(NAME#"i8"), - FPR8, V128, VectorIndexB>; - def : SIMDScalarCPYAlias<"dup", ".h", - !cast(NAME#"i16"), - FPR16, V128, VectorIndexH>; - def : SIMDScalarCPYAlias<"dup", ".s", - !cast(NAME#"i32"), - FPR32, V128, VectorIndexS>; - def : SIMDScalarCPYAlias<"dup", ".d", - !cast(NAME#"i64"), - FPR64, V128, VectorIndexD>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD modified immediate instructions -//---------------------------------------------------------------------------- - -class BaseSIMDModifiedImm pattern> - : I, - Sched<[WriteV]> { - bits<5> Rd; - bits<8> imm8; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = op; - let Inst{28-19} = 0b0111100000; - let Inst{18-16} = imm8{7-5}; - let Inst{11-10} = 0b01; - let Inst{9-5} = imm8{4-0}; - let Inst{4-0} = Rd; -} - -class BaseSIMDModifiedImmVector pattern> - : BaseSIMDModifiedImm { - let DecoderMethod = "DecodeModImmInstruction"; -} - -class BaseSIMDModifiedImmVectorTied pattern> - : BaseSIMDModifiedImm { - let DecoderMethod = "DecodeModImmTiedInstruction"; -} - -class BaseSIMDModifiedImmVectorShift b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVector { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14-13} = shift; - let Inst{12} = b15_b12{0}; -} - -class BaseSIMDModifiedImmVectorShiftTied b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVectorTied { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14-13} = shift; - let Inst{12} = b15_b12{0}; -} - - -class BaseSIMDModifiedImmVectorShiftHalf b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVector { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14} = 0; - let Inst{13} = shift{0}; - let Inst{12} = b15_b12{0}; -} - -class BaseSIMDModifiedImmVectorShiftHalfTied b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVectorTied { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14} = 0; - let Inst{13} = shift{0}; - let Inst{12} = b15_b12{0}; -} - -multiclass SIMDModifiedImmVectorShift hw_cmode, bits<2> w_cmode, - string asm> { - def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64, - asm, ".4h", []>; - def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128, - asm, ".8h", []>; - - def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64, - asm, ".2s", []>; - def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128, - asm, ".4s", []>; -} - -multiclass SIMDModifiedImmVectorShiftTied hw_cmode, - bits<2> w_cmode, string asm, - SDNode OpNode> { - def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64, - asm, ".4h", - [(set (v4i16 V64:$dst), (OpNode V64:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; - def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128, - asm, ".8h", - [(set (v8i16 V128:$dst), (OpNode V128:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; - - def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64, - asm, ".2s", - [(set (v2i32 V64:$dst), (OpNode V64:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; - def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128, - asm, ".4s", - [(set (v4i32 V128:$dst), (OpNode V128:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; -} - -class SIMDModifiedImmMoveMSL cmode, - RegisterOperand vectype, string asm, - string kind, list 
pattern> - : BaseSIMDModifiedImmVector { - bits<1> shift; - let Inst{15-13} = cmode{3-1}; - let Inst{12} = shift; -} - -class SIMDModifiedImmVectorNoShift cmode, - RegisterOperand vectype, - Operand imm_type, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVector { - let Inst{15-12} = cmode; -} - -class SIMDModifiedImmScalarNoShift cmode, string asm, - list pattern> - : BaseSIMDModifiedImm { - let Inst{15-12} = cmode; - let DecoderMethod = "DecodeModImmInstruction"; -} - -//---------------------------------------------------------------------------- -// AdvSIMD indexed element -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDIndexed size, bits<4> opc, - RegisterOperand dst_reg, RegisterOperand lhs_reg, - RegisterOperand rhs_reg, Operand vec_idx, string asm, - string apple_kind, string dst_kind, string lhs_kind, - string rhs_kind, list pattern> - : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), - asm, - "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # - "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28} = Scalar; - let Inst{27-24} = 0b1111; - let Inst{23-22} = size; - // Bit 21 must be set by the derived class. - let Inst{20-16} = Rm; - let Inst{15-12} = opc; - // Bit 11 must be set by the derived class. - let Inst{10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDIndexedTied size, bits<4> opc, - RegisterOperand dst_reg, RegisterOperand lhs_reg, - RegisterOperand rhs_reg, Operand vec_idx, string asm, - string apple_kind, string dst_kind, string lhs_kind, - string rhs_kind, list pattern> - : I<(outs dst_reg:$dst), - (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm, - "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # - "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28} = Scalar; - let Inst{27-24} = 0b1111; - let Inst{23-22} = size; - // Bit 21 must be set by the derived class. - let Inst{20-16} = Rm; - let Inst{15-12} = opc; - // Bit 11 must be set by the derived class. 
- let Inst{10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDFPIndexedSD opc, string asm, - SDPatternOperator OpNode> { - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2f32 V64:$Rd), - (OpNode (v2f32 V64:$Rn), - (v2f32 (ARM64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4f32 V128:$Rd), - (OpNode (v4f32 V128:$Rn), - (v4f32 (ARM64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc, - V128, V128, - V128, VectorIndexD, - asm, ".2d", ".2d", ".2d", ".d", - [(set (v2f64 V128:$Rd), - (OpNode (v2f64 V128:$Rn), - (v2f64 (ARM64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } - - def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, - FPR32Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", - [(set (f32 FPR32Op:$Rd), - (OpNode (f32 FPR32Op:$Rn), - (f32 (vector_extract (v4f32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc, - FPR64Op, FPR64Op, V128, VectorIndexD, - asm, ".d", "", "", ".d", - [(set (f64 FPR64Op:$Rd), - (OpNode (f64 FPR64Op:$Rn), - (f64 (vector_extract (v2f64 V128:$Rm), - VectorIndexD:$idx))))]> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } -} - -multiclass SIMDFPIndexedSDTiedPatterns { - // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64duplane32 (v4f32 V128:$Rm), - VectorIndexS:$idx))), - (!cast(INST # v2i32_indexed) - V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64dup (f32 FPR32Op:$Rm)))), - (!cast(INST # "v2i32_indexed") V64:$Rd, V64:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - - // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar. - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64duplane32 (v4f32 V128:$Rm), - VectorIndexS:$idx))), - (!cast(INST # "v4i32_indexed") - V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64dup (f32 FPR32Op:$Rm)))), - (!cast(INST # "v4i32_indexed") V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar. 
- def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64duplane64 (v2f64 V128:$Rm), - VectorIndexD:$idx))), - (!cast(INST # "v2i64_indexed") - V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64dup (f64 FPR64Op:$Rm)))), - (!cast(INST # "v2i64_indexed") V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; - - // 2 variants for 32-bit scalar version: extract from .2s or from .4s - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))), - (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, - V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))), - (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; - - // 1 variant for 64-bit scalar version: extract from .1d or from .2d - def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), - (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))), - (!cast(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn, - V128:$Rm, VectorIndexD:$idx)>; -} - -multiclass SIMDFPIndexedSDTied opc, string asm> { - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc, - V128, V128, - V128, VectorIndexD, - asm, ".2d", ".2d", ".2d", ".d", []> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } - - - def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, - FPR32Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc, - FPR64Op, FPR64Op, V128, VectorIndexD, - asm, ".d", "", "", ".d", []> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } -} - -multiclass SIMDIndexedHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64, - V128_lo, VectorIndexH, - asm, ".4h", ".4h", ".4h", ".h", - [(set (v4i16 V64:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm, ".8h", ".8h", ".8h", ".h", - [(set (v8i16 V128:$Rd), - (OpNode (v8i16 V128:$Rn), - (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2i32 V64:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4i32 V128:$Rd), - (OpNode (v4i32 
V128:$Rn), - (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, - FPR16Op, FPR16Op, V128_lo, VectorIndexH, - asm, ".h", "", "", ".h", []> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, - FPR32Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", - [(set (i32 FPR32Op:$Rd), - (OpNode FPR32Op:$Rn, - (i32 (vector_extract (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDVectorIndexedHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, - V64, V64, - V128_lo, VectorIndexH, - asm, ".4h", ".4h", ".4h", ".h", - [(set (v4i16 V64:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm, ".8h", ".8h", ".8h", ".h", - [(set (v8i16 V128:$Rd), - (OpNode (v8i16 V128:$Rn), - (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2i32 V64:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4i32 V128:$Rd), - (OpNode (v4i32 V128:$Rn), - (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDVectorIndexedHSTied opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64, - V128_lo, VectorIndexH, - asm, ".4h", ".4h", ".4h", ".h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm, ".8h", ".8h", ".8h", ".h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), - (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), - (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), 
VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDIndexedLongSD opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { - - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, - FPR32Op, FPR16Op, V128_lo, VectorIndexH, - asm, ".h", "", "", ".h", []> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, - FPR64Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDIndexedLongSQDMLXSDTied opc, string asm, - SDPatternOperator Accum> { - def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$dst), - (Accum (v4i32 V128:$Rd), - (v4i32 (int_arm64_neon_sqdmull - (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an - // intermediate EXTRACT_SUBREG would be untyped. 
- def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), - (i32 (vector_extract (v4i32 - (int_arm64_neon_sqdmull (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx)))), - (i64 0))))), - (EXTRACT_SUBREG - (!cast(NAME # v4i16_indexed) - (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn, - V128_lo:$Rm, VectorIndexH:$idx), - ssub)>; - - def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$dst), - (Accum (v4i32 V128:$Rd), - (v4i32 (int_arm64_neon_sqdmull - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 - (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$dst), - (Accum (v2i64 V128:$Rd), - (v2i64 (int_arm64_neon_sqdmull - (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - [(set (v2i64 V128:$dst), - (Accum (v2i64 V128:$Rd), - (v2i64 (int_arm64_neon_sqdmull - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 - (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, - FPR32Op, FPR16Op, V128_lo, VectorIndexH, - asm, ".h", "", "", ".h", []> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - - def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, - FPR64Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", - [(set (i64 FPR64Op:$dst), - (Accum (i64 FPR64Op:$Rd), - (i64 (int_arm64_neon_sqdmulls_scalar - (i32 FPR32Op:$Rn), - (i32 (vector_extract (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { - - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDVectorIndexedLongSD opc, string asm, - SDPatternOperator OpNode> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { - - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - 
[(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - } -} - -multiclass SIMDVectorIndexedLongSDTied opc, string asm, - SDPatternOperator OpNode> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - } -} - -//---------------------------------------------------------------------------- -// AdvSIMD scalar shift by immediate -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDScalarShift opc, bits<7> fixed_imm, - RegisterClass regtype1, RegisterClass regtype2, - Operand immtype, string asm, list pattern> - : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm), - asm, "\t$Rd, $Rn, $imm", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<7> imm; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-23} = 0b111110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDScalarShiftTied opc, bits<7> fixed_imm, - RegisterClass regtype1, RegisterClass regtype2, - Operand immtype, string asm, list pattern> - : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm), - asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<7> imm; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-23} = 0b111110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - - -multiclass SIMDScalarRShiftSD opc, string asm> { - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } -} - -multiclass SIMDScalarRShiftD opc, string asm, - SDPatternOperator OpNode> { - def d : BaseSIMDScalarShift { - let 
Inst{21-16} = imm{5-0}; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), - (!cast(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>; -} - -multiclass SIMDScalarRShiftDTied opc, string asm, - SDPatternOperator OpNode = null_frag> { - def d : BaseSIMDScalarShiftTied { - let Inst{21-16} = imm{5-0}; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), - (i32 vecshiftR64:$imm))), - (!cast(NAME # "d") FPR64:$Rd, FPR64:$Rn, - vecshiftR64:$imm)>; -} - -multiclass SIMDScalarLShiftD opc, string asm, - SDPatternOperator OpNode> { - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -multiclass SIMDScalarLShiftDTied opc, string asm> { - def d : BaseSIMDScalarShiftTied { - let Inst{21-16} = imm{5-0}; - } -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -multiclass SIMDScalarRShiftBHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def b : BaseSIMDScalarShift { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } -} - -multiclass SIMDScalarLShiftBHSD opc, string asm, - SDPatternOperator OpNode> { - def b : BaseSIMDScalarShift { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } -} - -multiclass SIMDScalarRShiftBHSD opc, string asm> { - def b : BaseSIMDScalarShift { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } -} - -//---------------------------------------------------------------------------- -// AdvSIMD vector x indexed element -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDVectorShift opc, bits<7> fixed_imm, - RegisterOperand dst_reg, RegisterOperand src_reg, - Operand immtype, - string asm, string dst_kind, string src_kind, - list pattern> - : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm), - asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # - "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-23} = 0b011110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDVectorShiftTied opc, bits<7> fixed_imm, - RegisterOperand vectype1, RegisterOperand vectype2, - Operand immtype, - string asm, string dst_kind, string src_kind, - list pattern> - : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm), - asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # - "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-23} = 0b011110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDVectorRShiftSD opc, string asm, - Intrinsic OpNode> { - def v2i32_shift : BaseSIMDVectorShift<0, U, 
-                                      opc, {0,1,?,?,?,?,?},
-                                      V64, V64, vecshiftR32,
-                                      asm, ".2s", ".2s",
-      [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
-                                        V128, V128, vecshiftR32,
-                                        asm, ".4s", ".4s",
-      [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
-                                        V128, V128, vecshiftR64,
-                                        asm, ".2d", ".2d",
-      [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> {
-    bits<6> imm;
-    let Inst{21-16} = imm;
-  }
-}
-
-multiclass SIMDVectorRShiftSDToFP<bit U, bits<5> opc, string asm,
-                                  Intrinsic OpNode> {
-  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
-                                        V64, V64, vecshiftR32,
-                                        asm, ".2s", ".2s",
-      [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
-                                        V128, V128, vecshiftR32,
-                                        asm, ".4s", ".4s",
-      [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
-                                        V128, V128, vecshiftR64,
-                                        asm, ".2d", ".2d",
-      [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> {
-    bits<6> imm;
-    let Inst{21-16} = imm;
-  }
-}
-
-multiclass SIMDVectorRShiftNarrowBHS<bit U, bits<5> opc, string asm,
-                                     SDPatternOperator OpNode> {
-  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
-                                       V64, V128, vecshiftR16Narrow,
-                                       asm, ".8b", ".8h",
-      [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> {
-    bits<3> imm;
-    let Inst{18-16} = imm;
-  }
-
-  def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
-                                            V128, V128, vecshiftR16Narrow,
-                                            asm#"2", ".16b", ".8h", []> {
-    bits<3> imm;
-    let Inst{18-16} = imm;
-    let hasSideEffects = 0;
-  }
-
-  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
-                                        V64, V128, vecshiftR32Narrow,
-                                        asm, ".4h", ".4s",
-      [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> {
-    bits<4> imm;
-    let Inst{19-16} = imm;
-  }
-
-  def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
-                                            V128, V128, vecshiftR32Narrow,
-                                            asm#"2", ".8h", ".4s", []> {
-    bits<4> imm;
-    let Inst{19-16} = imm;
-    let hasSideEffects = 0;
-  }
-
-  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
-                                        V64, V128, vecshiftR64Narrow,
-                                        asm, ".2s", ".2d",
-      [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
-                                            V128, V128, vecshiftR64Narrow,
-                                            asm#"2", ".4s", ".2d", []> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-    let hasSideEffects = 0;
-  }
-
-  // TableGen doesn't like patterns w/ INSERT_SUBREG on the instructions
-  // themselves, so put them here instead.
-
-  // Patterns involving what's effectively an insert high and a normal
-  // intrinsic, represented by CONCAT_VECTORS.
- def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn), - vecshiftR16Narrow:$imm)), - (!cast(NAME # "v16i8_shift") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR16Narrow:$imm)>; - def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), - vecshiftR32Narrow:$imm)), - (!cast(NAME # "v8i16_shift") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR32Narrow:$imm)>; - def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), - vecshiftR64Narrow:$imm)), - (!cast(NAME # "v4i32_shift") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR64Narrow:$imm)>; -} - -multiclass SIMDVectorLShiftBHSD opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftL8, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftL8, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftL16, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftL16, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftL32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftL32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftL64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), - (i32 vecshiftL64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorRShiftBHSD opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftR8, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftR8, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftR16, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftR16, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - 
V64, V64, vecshiftR32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftR64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), - (i32 vecshiftR64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDVectorRShiftBHSDTied opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftR8, asm, ".8b", ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftR8, asm, ".16b", ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftR16, asm, ".4h", ".4h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftR16, asm, ".8h", ".8h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftR32, asm, ".2s", ".2s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR32, asm, ".4s", ".4s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftR64, - asm, ".2d", ".2d", [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), - (i32 vecshiftR64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorLShiftBHSDTied opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftL8, - asm, ".8b", ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftL8, - asm, ".16b", ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftL16, - asm, ".4h", ".4h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = 
imm; - } - - def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftL16, - asm, ".8h", ".8h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftL32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftL32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftL64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), - (i32 vecshiftL64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorLShiftLongBHSD opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V128, V64, vecshiftL8, asm, ".8h", ".8b", - [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftL8, - asm#"2", ".8h", ".16b", - [(set (v8i16 V128:$Rd), - (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V128, V64, vecshiftL16, asm, ".4s", ".4h", - [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftL16, - asm#"2", ".4s", ".8h", - [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> { - - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V128, V64, vecshiftL32, asm, ".2d", ".2s", - [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftL32, - asm#"2", ".2d", ".4s", - [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> { - bits<5> imm; - let Inst{20-16} = imm; - } -} - - -//--- -// Vector load/store -//--- -// SIMD ldX/stX no-index memory references don't allow the optional -// ", #0" constant and handle post-indexing explicitly, so we use -// a more specialized parse method for them. Otherwise, it's the same as -// the general am_noindex handling. 
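A side note on the comment above: the distinction can be shown with a tiny stand-alone recognizer. This is a hypothetical toy, not the tryParseNoIndexMemory routine named below; it only illustrates that the SIMD form takes a bare "[xN]" and leaves any post-index syntax to be parsed separately:

#include <cassert>
#include <string>

// Toy model of the SIMD no-index memory operand: a bare "[xN]" only.
// The general am_noindex handling would also tolerate "[xN, #0]"; here
// any comma inside the brackets (offset or inline post-index) is rejected
// so post-indexing can be handled explicitly by the instruction forms.
static bool isSIMDNoIndexMem(const std::string &Op) {
  if (Op.size() < 3 || Op.front() != '[' || Op.back() != ']')
    return false;
  return Op.find(',') == std::string::npos;
}

int main() {
  assert(isSIMDNoIndexMem("[x1]"));        // plain base register: accepted
  assert(!isSIMDNoIndexMem("[x1, #0]"));   // optional ", #0": rejected
  assert(!isSIMDNoIndexMem("[x1, #16]"));  // real offset: rejected
  return 0;
}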
-def MemorySIMDNoIndexOperand : AsmOperandClass { - let Name = "MemorySIMDNoIndex"; - let ParserMethod = "tryParseNoIndexMemory"; -} -def am_simdnoindex : Operand, - ComplexPattern { - let PrintMethod = "printAMNoIndex"; - let ParserMatchClass = MemorySIMDNoIndexOperand; - let MIOperandInfo = (ops GPR64sp:$base); - let DecoderMethod = "DecodeGPR64spRegisterClass"; -} - -class BaseSIMDLdSt opcode, bits<2> size, - string asm, dag oops, dag iops, list pattern> - : I { - bits<5> Vt; - bits<5> vaddr; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-23} = 0b0011000; - let Inst{22} = L; - let Inst{21-16} = 0b000000; - let Inst{15-12} = opcode; - let Inst{11-10} = size; - let Inst{9-5} = vaddr; - let Inst{4-0} = Vt; -} - -class BaseSIMDLdStPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : I { - bits<5> Vt; - bits<5> vaddr; - bits<5> Xm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-23} = 0b0011001; - let Inst{22} = L; - let Inst{21} = 0; - let Inst{20-16} = Xm; - let Inst{15-12} = opcode; - let Inst{11-10} = size; - let Inst{9-5} = vaddr; - let Inst{4-0} = Vt; - let DecoderMethod = "DecodeSIMDLdStPost"; -} - -// The immediate form of AdvSIMD post-indexed addressing is encoded with -// register post-index addressing from the zero register. -multiclass SIMDLdStAliases { - // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16" - // "ld1\t$Vt, $vaddr, #16" - // may get mapped to - // (LD1Twov8b_POST VecListTwo8b:$Vt, am_simdnoindex:$vaddr, XZR) - def : InstAlias(NAME # Count # "v" # layout # "_POST") - !cast("VecList" # Count # layout):$Vt, - am_simdnoindex:$vaddr, XZR), 1>; - - // E.g. "ld1.8b { v0, v1 }, [x1], #16" - // "ld1.8b\t$Vt, $vaddr, #16" - // may get mapped to - // (LD1Twov8b_POST VecListTwo64:$Vt, am_simdnoindex:$vaddr, XZR) - def : InstAlias(NAME # Count # "v" # layout # "_POST") - !cast("VecList" # Count # Size):$Vt, - am_simdnoindex:$vaddr, XZR), 0>; - - // E.g. "ld1.8b { v0, v1 }, [x1]" - // "ld1\t$Vt, $vaddr" - // may get mapped to - // (LD1Twov8b VecListTwo64:$Vt, am_simdnoindex:$vaddr) - def : InstAlias(NAME # Count # "v" # layout) - !cast("VecList" # Count # Size):$Vt, - am_simdnoindex:$vaddr), 0>; - - // E.g. 
"ld1.8b { v0, v1 }, [x1], x2" - // "ld1\t$Vt, $vaddr, $Xm" - // may get mapped to - // (LD1Twov8b_POST VecListTwo64:$Vt, am_simdnoindex:$vaddr, GPR64pi8:$Xm) - def : InstAlias(NAME # Count # "v" # layout # "_POST") - !cast("VecList" # Count # Size):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset):$Xm), 0>; -} - -multiclass BaseSIMDLdN opcode> { - let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { - def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm, - (outs !cast(veclist # "16b"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm, - (outs !cast(veclist # "8h"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm, - (outs !cast(veclist # "4s"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm, - (outs !cast(veclist # "2d"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm, - (outs !cast(veclist # "8b"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm, - (outs !cast(veclist # "4h"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm, - (outs !cast(veclist # "2s"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - - - def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm, - (outs !cast(veclist # "16b"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm, - (outs !cast(veclist # "8h"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm, - (outs !cast(veclist # "4s"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm, - (outs !cast(veclist # "2d"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm, - (outs !cast(veclist # "8b"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm, - (outs !cast(veclist # "4h"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm, - (outs !cast(veclist # "2s"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; -} - -// Only ld1/st1 has a v1d version. 
-multiclass BaseSIMDStN opcode> { - let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in { - def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs), - (ins !cast(veclist # "16b"):$Vt, - am_simdnoindex:$vaddr), []>; - def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs), - (ins !cast(veclist # "8h"):$Vt, - am_simdnoindex:$vaddr), []>; - def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs), - (ins !cast(veclist # "4s"):$Vt, - am_simdnoindex:$vaddr), []>; - def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs), - (ins !cast(veclist # "2d"):$Vt, - am_simdnoindex:$vaddr), []>; - def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs), - (ins !cast(veclist # "8b"):$Vt, - am_simdnoindex:$vaddr), []>; - def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs), - (ins !cast(veclist # "4h"):$Vt, - am_simdnoindex:$vaddr), []>; - def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs), - (ins !cast(veclist # "2s"):$Vt, - am_simdnoindex:$vaddr), []>; - - def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm, (outs), - (ins !cast(veclist # "16b"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm, (outs), - (ins !cast(veclist # "8h"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm, (outs), - (ins !cast(veclist # "4s"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm, (outs), - (ins !cast(veclist # "2d"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset128):$Xm)>; - def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm, (outs), - (ins !cast(veclist # "8b"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm, (outs), - (ins !cast(veclist # "4h"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm, (outs), - (ins !cast(veclist # "2s"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; -} - -multiclass BaseSIMDLd1 opcode> - : BaseSIMDLdN { - - // LD1 instructions have extra "1d" variants. - let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { - def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm, - (outs !cast(veclist # "1d"):$Vt), - (ins am_simdnoindex:$vaddr), []>; - - def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm, - (outs !cast(veclist # "1d"):$Vt), - (ins am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; -} - -multiclass BaseSIMDSt1 opcode> - : BaseSIMDStN { - - // ST1 instructions have extra "1d" variants. 
- let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { - def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs), - (ins !cast(veclist # "1d"):$Vt, - am_simdnoindex:$vaddr), []>; - - def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm, (outs), - (ins !cast(veclist # "1d"):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; -} - -multiclass SIMDLd1Multiple { - defm One : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8, 0b0111>; - defm Two : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; - defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>; - defm Four : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>; -} - -multiclass SIMDSt1Multiple { - defm One : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8, 0b0111>; - defm Two : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; - defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>; - defm Four : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>; -} - -multiclass SIMDLd2Multiple { - defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; -} - -multiclass SIMDSt2Multiple { - defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; -} - -multiclass SIMDLd3Multiple { - defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>; -} - -multiclass SIMDSt3Multiple { - defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>; -} - -multiclass SIMDLd4Multiple { - defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>; -} - -multiclass SIMDSt4Multiple { - defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>; -} - -//--- -// AdvSIMD Load/store single-element -//--- - -class BaseSIMDLdStSingle opcode, - string asm, string operands, dag oops, dag iops, - list pattern> - : I { - bits<5> Vt; - bits<5> vaddr; - let Inst{31} = 0; - let Inst{29-24} = 0b001101; - let Inst{22} = L; - let Inst{21} = R; - let Inst{15-13} = opcode; - let Inst{9-5} = vaddr; - let Inst{4-0} = Vt; - let DecoderMethod = "DecodeSIMDLdStSingle"; -} - -class BaseSIMDLdStSingleTied opcode, - string asm, string operands, dag oops, dag iops, - list pattern> - : I { - bits<5> Vt; - bits<5> vaddr; - let Inst{31} = 0; - let Inst{29-24} = 0b001101; - let Inst{22} = L; - let Inst{21} = R; - let Inst{15-13} = opcode; - let Inst{9-5} = vaddr; - let Inst{4-0} = Vt; - let DecoderMethod = "DecodeSIMDLdStSingleTied"; -} - - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDLdR opcode, bit S, bits<2> size, string asm, - Operand listtype> - : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, $vaddr", - (outs listtype:$Vt), (ins am_simdnoindex:$vaddr), []> { - let Inst{30} = Q; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = S; - let Inst{11-10} = size; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDLdRPost opcode, bit S, bits<2> size, - string asm, Operand listtype, Operand GPR64pi> - : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, $vaddr, $Xm", - (outs listtype:$Vt), - (ins am_simdnoindex:$vaddr, GPR64pi:$Xm), []> { - bits<5> Xm; - let Inst{30} = Q; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = S; - let Inst{11-10} = size; -} - -multiclass SIMDLdrAliases { - // E.g. "ld1r { v0.8b }, [x1], #1" - // "ld1r.8b\t$Vt, $vaddr, #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne8b:$Vt, am_simdnoindex:$vaddr, XZR) - def : InstAlias(NAME # "v" # layout # "_POST") - !cast("VecList" # Count # layout):$Vt, - am_simdnoindex:$vaddr, XZR), 1>; - - // E.g. 
"ld1r.8b { v0 }, [x1], #1" - // "ld1r.8b\t$Vt, $vaddr, #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, XZR) - def : InstAlias(NAME # "v" # layout # "_POST") - !cast("VecList" # Count # Size):$Vt, - am_simdnoindex:$vaddr, XZR), 0>; - - // E.g. "ld1r.8b { v0 }, [x1]" - // "ld1r.8b\t$Vt, $vaddr" - // may get mapped to - // (LD1Rv8b VecListOne64:$Vt, am_simdnoindex:$vaddr) - def : InstAlias(NAME # "v" # layout) - !cast("VecList" # Count # Size):$Vt, - am_simdnoindex:$vaddr), 0>; - - // E.g. "ld1r.8b { v0 }, [x1], x2" - // "ld1r.8b\t$Vt, $vaddr, $Xm" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, GPR64pi1:$Xm) - def : InstAlias(NAME # "v" # layout # "_POST") - !cast("VecList" # Count # Size):$Vt, - am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset):$Xm), 0>; -} - -multiclass SIMDLdR opcode, bit S, string asm, string Count, - int Offset1, int Offset2, int Offset4, int Offset8> { - def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm, - !cast("VecList" # Count # "8b")>; - def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm, - !cast("VecList" # Count #"16b")>; - def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm, - !cast("VecList" # Count #"4h")>; - def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm, - !cast("VecList" # Count #"8h")>; - def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm, - !cast("VecList" # Count #"2s")>; - def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm, - !cast("VecList" # Count #"4s")>; - def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm, - !cast("VecList" # Count #"1d")>; - def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm, - !cast("VecList" # Count #"2d")>; - - def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm, - !cast("VecList" # Count # "8b"), - !cast("GPR64pi" # Offset1)>; - def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm, - !cast("VecList" # Count # "16b"), - !cast("GPR64pi" # Offset1)>; - def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm, - !cast("VecList" # Count # "4h"), - !cast("GPR64pi" # Offset2)>; - def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm, - !cast("VecList" # Count # "8h"), - !cast("GPR64pi" # Offset2)>; - def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm, - !cast("VecList" # Count # "2s"), - !cast("GPR64pi" # Offset4)>; - def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm, - !cast("VecList" # Count # "4s"), - !cast("GPR64pi" # Offset4)>; - def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm, - !cast("VecList" # Count # "1d"), - !cast("GPR64pi" # Offset8)>; - def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm, - !cast("VecList" # Count # "2d"), - !cast("GPR64pi" # Offset8)>; - - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; -} - -class SIMDLdStSingleB opcode, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size fields. - bits<4> idx; - let Inst{30} = idx{3}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} -class SIMDLdStSingleBTied opcode, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size fields. 
- bits<4> idx; - let Inst{30} = idx{3}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} -class SIMDLdStSingleBPost opcode, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size fields. - bits<4> idx; - bits<5> Xm; - let Inst{30} = idx{3}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} -class SIMDLdStSingleBTiedPost opcode, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size fields. - bits<4> idx; - bits<5> Xm; - let Inst{30} = idx{3}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} - -class SIMDLdStSingleH opcode, bit size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - let Inst{30} = idx{2}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} -class SIMDLdStSingleHTied opcode, bit size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - let Inst{30} = idx{2}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} - -class SIMDLdStSingleHPost opcode, bit size, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - bits<5> Xm; - let Inst{30} = idx{2}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} -class SIMDLdStSingleHTiedPost opcode, bit size, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - bits<5> Xm; - let Inst{30} = idx{2}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} -class SIMDLdStSingleS opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q:S fields. - bits<2> idx; - let Inst{30} = idx{1}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleSTied opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S fields. - bits<2> idx; - let Inst{30} = idx{1}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleSPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q:S fields. - bits<2> idx; - bits<5> Xm; - let Inst{30} = idx{1}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleSTiedPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S fields. - bits<2> idx; - bits<5> Xm; - let Inst{30} = idx{1}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleD opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q field. 
- bits<1> idx; - let Inst{30} = idx; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = 0; - let Inst{11-10} = size; -} -class SIMDLdStSingleDTied opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q field. - bits<1> idx; - let Inst{30} = idx; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = 0; - let Inst{11-10} = size; -} -class SIMDLdStSingleDPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q field. - bits<1> idx; - bits<5> Xm; - let Inst{30} = idx; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = 0; - let Inst{11-10} = size; -} -class SIMDLdStSingleDTiedPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q field. - bits<1> idx; - bits<5> Xm; - let Inst{30} = idx; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = 0; - let Inst{11-10} = size; -} - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleBTied opcode, string asm, - RegisterOperand listtype, - RegisterOperand GPR64pi> { - def i8 : SIMDLdStSingleBTied<1, R, opcode, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexB:$idx, - am_simdnoindex:$vaddr), []>; - - def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexB:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleHTied opcode, bit size, string asm, - RegisterOperand listtype, - RegisterOperand GPR64pi> { - def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexH:$idx, - am_simdnoindex:$vaddr), []>; - - def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexH:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleSTied opcode, bits<2> size,string asm, - RegisterOperand listtype, - RegisterOperand GPR64pi> { - def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexS:$idx, - am_simdnoindex:$vaddr), []>; - - def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexS:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleDTied opcode, bits<2> size, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexD:$idx, - am_simdnoindex:$vaddr), []>; - - def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexD:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleB opcode, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i8 : SIMDLdStSingleB<0, R, opcode, asm, - (outs), (ins listtype:$Vt, VectorIndexB:$idx, - am_simdnoindex:$vaddr), []>; - - def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm, - (outs), (ins listtype:$Vt, VectorIndexB:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleH opcode, bit size, string asm, - RegisterOperand listtype, RegisterOperand 
GPR64pi> { - def i16 : SIMDLdStSingleH<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexH:$idx, - am_simdnoindex:$vaddr), []>; - - def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexH:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleS opcode, bits<2> size,string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i32 : SIMDLdStSingleS<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexS:$idx, - am_simdnoindex:$vaddr), []>; - - def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexS:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleD opcode, bits<2> size, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i64 : SIMDLdStSingleD<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexD:$idx, - am_simdnoindex:$vaddr), []>; - - def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexD:$idx, - am_simdnoindex:$vaddr, GPR64pi:$Xm)>; -} - -multiclass SIMDLdStSingleAliases { - // E.g. "ld1 { v0.8b }[0], [x1], #1" - // "ld1\t$Vt, $vaddr, #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne8b:$Vt, am_simdnoindex:$vaddr, XZR) - def : InstAlias(NAME # Type # "_POST") - !cast("VecList" # Count # layout):$Vt, - idxtype:$idx, am_simdnoindex:$vaddr, XZR), 1>; - - // E.g. "ld1.8b { v0 }[0], [x1], #1" - // "ld1.8b\t$Vt, $vaddr, #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, XZR) - def : InstAlias(NAME # Type # "_POST") - !cast("VecList" # Count # "128"):$Vt, - idxtype:$idx, am_simdnoindex:$vaddr, XZR), 0>; - - // E.g. "ld1.8b { v0 }[0], [x1]" - // "ld1.8b\t$Vt, $vaddr" - // may get mapped to - // (LD1Rv8b VecListOne64:$Vt, am_simdnoindex:$vaddr) - def : InstAlias(NAME # Type) - !cast("VecList" # Count # "128"):$Vt, - idxtype:$idx, am_simdnoindex:$vaddr), 0>; - - // E.g. 
"ld1.8b { v0 }[0], [x1], x2" - // "ld1.8b\t$Vt, $vaddr, $Xm" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, GPR64pi1:$Xm) - def : InstAlias(NAME # Type # "_POST") - !cast("VecList" # Count # "128"):$Vt, - idxtype:$idx, am_simdnoindex:$vaddr, - !cast("GPR64pi" # Offset):$Xm), 0>; -} - -multiclass SIMDLdSt1SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} - -multiclass SIMDLdSt2SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} - -multiclass SIMDLdSt3SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} - -multiclass SIMDLdSt4SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} - -//---------------------------------------------------------------------------- -// Crypto extensions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class AESBase opc, string asm, dag outs, dag ins, string cstr, - list pat> - : I, - Sched<[WriteV]>{ - bits<5> Rd; - bits<5> Rn; - let Inst{31-16} = 0b0100111000101000; - let Inst{15-12} = opc; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class AESInst opc, string asm, Intrinsic OpNode> - : AESBase; - -class AESTiedInst opc, string asm, Intrinsic OpNode> - : AESBase; - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class SHA3OpTiedInst opc, string asm, string dst_lhs_kind, - dag oops, dag iops, list pat> - : I, - Sched<[WriteV]>{ - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-21} = 0b01011110000; - let Inst{20-16} = Rm; - let Inst{15} = 0; - let Inst{14-12} = opc; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SHATiedInstQSV opc, string asm, Intrinsic OpNode> - : SHA3OpTiedInst; - -class SHATiedInstVVV opc, string asm, Intrinsic OpNode> - : SHA3OpTiedInst; - -class SHATiedInstQQV opc, string asm, Intrinsic OpNode> - : SHA3OpTiedInst; - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class SHA2OpInst opc, string asm, string kind, - string cstr, dag oops, dag iops, - list pat> - : I, - Sched<[WriteV]>{ - bits<5> Rd; - bits<5> Rn; - let Inst{31-16} = 0b0101111000101000; - let Inst{15-12} = opc; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SHATiedInstVV opc, string asm, Intrinsic OpNode> - : SHA2OpInst; - -class SHAInstSS opc, string asm, Intrinsic OpNode> - : SHA2OpInst; - -// Allow the size specifier tokens to be upper case, not just lower. 
-def : TokenAlias<".8B", ".8b">; -def : TokenAlias<".4H", ".4h">; -def : TokenAlias<".2S", ".2s">; -def : TokenAlias<".1D", ".1d">; -def : TokenAlias<".16B", ".16b">; -def : TokenAlias<".8H", ".8h">; -def : TokenAlias<".4S", ".4s">; -def : TokenAlias<".2D", ".2d">; -def : TokenAlias<".B", ".b">; -def : TokenAlias<".H", ".h">; -def : TokenAlias<".S", ".s">; -def : TokenAlias<".D", ".d">; diff --git a/lib/Target/ARM64/ARM64InstrInfo.cpp b/lib/Target/ARM64/ARM64InstrInfo.cpp deleted file mode 100644 index 8f11757..0000000 --- a/lib/Target/ARM64/ARM64InstrInfo.cpp +++ /dev/null @@ -1,1864 +0,0 @@ -//===- ARM64InstrInfo.cpp - ARM64 Instruction Information -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the TargetInstrInfo class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64InstrInfo.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/MC/MCInst.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" - -#define GET_INSTRINFO_CTOR_DTOR -#include "ARM64GenInstrInfo.inc" - -using namespace llvm; - -ARM64InstrInfo::ARM64InstrInfo(const ARM64Subtarget &STI) - : ARM64GenInstrInfo(ARM64::ADJCALLSTACKDOWN, ARM64::ADJCALLSTACKUP), - RI(this, &STI), Subtarget(STI) {} - -/// GetInstSize - Return the number of bytes of code the specified -/// instruction may be. This returns the maximum number of bytes. -unsigned ARM64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { - const MCInstrDesc &Desc = MI->getDesc(); - - switch (Desc.getOpcode()) { - default: - // Anything not explicitly designated otherwise is a nomal 4-byte insn. - return 4; - case TargetOpcode::DBG_VALUE: - case TargetOpcode::EH_LABEL: - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - return 0; - } - - llvm_unreachable("GetInstSizeInBytes()- Unable to determin insn size"); -} - -static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, - SmallVectorImpl &Cond) { - // Block ends with fall-through condbranch. - switch (LastInst->getOpcode()) { - default: - llvm_unreachable("Unknown branch instruction?"); - case ARM64::Bcc: - Target = LastInst->getOperand(1).getMBB(); - Cond.push_back(LastInst->getOperand(0)); - break; - case ARM64::CBZW: - case ARM64::CBZX: - case ARM64::CBNZW: - case ARM64::CBNZX: - Target = LastInst->getOperand(1).getMBB(); - Cond.push_back(MachineOperand::CreateImm(-1)); - Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); - Cond.push_back(LastInst->getOperand(0)); - break; - case ARM64::TBZ: - case ARM64::TBNZ: - Target = LastInst->getOperand(2).getMBB(); - Cond.push_back(MachineOperand::CreateImm(-1)); - Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); - Cond.push_back(LastInst->getOperand(0)); - Cond.push_back(LastInst->getOperand(1)); - } -} - -// Branch analysis. 
-bool ARM64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify) const { - // If the block has no terminators, it just falls into the block after it. - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) - return false; - --I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return false; - --I; - } - if (!isUnpredicatedTerminator(I)) - return false; - - // Get the last instruction in the block. - MachineInstr *LastInst = I; - - // If there is only one terminator instruction, process it. - unsigned LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - if (isUncondBranchOpcode(LastOpc)) { - TBB = LastInst->getOperand(0).getMBB(); - return false; - } - if (isCondBranchOpcode(LastOpc)) { - // Block ends with fall-through condbranch. - parseCondBranch(LastInst, TBB, Cond); - return false; - } - return true; // Can't handle indirect branch. - } - - // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; - unsigned SecondLastOpc = SecondLastInst->getOpcode(); - - // If AllowModify is true and the block ends with two or more unconditional - // branches, delete all but the first unconditional branch. - if (AllowModify && isUncondBranchOpcode(LastOpc)) { - while (isUncondBranchOpcode(SecondLastOpc)) { - LastInst->eraseFromParent(); - LastInst = SecondLastInst; - LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - // Return now the only terminator is an unconditional branch. - TBB = LastInst->getOperand(0).getMBB(); - return false; - } else { - SecondLastInst = I; - SecondLastOpc = SecondLastInst->getOpcode(); - } - } - } - - // If there are three terminators, we don't know what sort of block this is. - if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) - return true; - - // If the block ends with a B and a Bcc, handle it. - if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { - parseCondBranch(SecondLastInst, TBB, Cond); - FBB = LastInst->getOperand(0).getMBB(); - return false; - } - - // If the block ends with two unconditional branches, handle it. The second - // one is not executed, so remove it. - if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { - TBB = SecondLastInst->getOperand(0).getMBB(); - I = LastInst; - if (AllowModify) - I->eraseFromParent(); - return false; - } - - // ...likewise if it ends with an indirect branch followed by an unconditional - // branch. - if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { - I = LastInst; - if (AllowModify) - I->eraseFromParent(); - return true; - } - - // Otherwise, can't handle this. 
- return true; -} - -bool ARM64InstrInfo::ReverseBranchCondition( - SmallVectorImpl &Cond) const { - if (Cond[0].getImm() != -1) { - // Regular Bcc - ARM64CC::CondCode CC = (ARM64CC::CondCode)(int)Cond[0].getImm(); - Cond[0].setImm(ARM64CC::getInvertedCondCode(CC)); - } else { - // Folded compare-and-branch - switch (Cond[1].getImm()) { - default: - llvm_unreachable("Unknown conditional branch!"); - case ARM64::CBZW: - Cond[1].setImm(ARM64::CBNZW); - break; - case ARM64::CBNZW: - Cond[1].setImm(ARM64::CBZW); - break; - case ARM64::CBZX: - Cond[1].setImm(ARM64::CBNZX); - break; - case ARM64::CBNZX: - Cond[1].setImm(ARM64::CBZX); - break; - case ARM64::TBZ: - Cond[1].setImm(ARM64::TBNZ); - break; - case ARM64::TBNZ: - Cond[1].setImm(ARM64::TBZ); - break; - } - } - - return false; -} - -unsigned ARM64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) - return 0; - --I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return 0; - --I; - } - if (!isUncondBranchOpcode(I->getOpcode()) && - !isCondBranchOpcode(I->getOpcode())) - return 0; - - // Remove the branch. - I->eraseFromParent(); - - I = MBB.end(); - - if (I == MBB.begin()) - return 1; - --I; - if (!isCondBranchOpcode(I->getOpcode())) - return 1; - - // Remove the branch. - I->eraseFromParent(); - return 2; -} - -void ARM64InstrInfo::instantiateCondBranch( - MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, - const SmallVectorImpl &Cond) const { - if (Cond[0].getImm() != -1) { - // Regular Bcc - BuildMI(&MBB, DL, get(ARM64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); - } else { - // Folded compare-and-branch - const MachineInstrBuilder MIB = - BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg()); - if (Cond.size() > 3) - MIB.addImm(Cond[3].getImm()); - MIB.addMBB(TBB); - } -} - -unsigned ARM64InstrInfo::InsertBranch( - MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl &Cond, DebugLoc DL) const { - // Shouldn't be a fall through. - assert(TBB && "InsertBranch must not be told to insert a fallthrough"); - - if (FBB == 0) { - if (Cond.empty()) // Unconditional branch? - BuildMI(&MBB, DL, get(ARM64::B)).addMBB(TBB); - else - instantiateCondBranch(MBB, DL, TBB, Cond); - return 1; - } - - // Two-way conditional branch. - instantiateCondBranch(MBB, DL, TBB, Cond); - BuildMI(&MBB, DL, get(ARM64::B)).addMBB(FBB); - return 2; -} - -// Find the original register that VReg is copied from. -static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { - while (TargetRegisterInfo::isVirtualRegister(VReg)) { - const MachineInstr *DefMI = MRI.getVRegDef(VReg); - if (!DefMI->isFullCopy()) - return VReg; - VReg = DefMI->getOperand(1).getReg(); - } - return VReg; -} - -// Determine if VReg is defined by an instruction that can be folded into a -// csel instruction. If so, return the folded opcode, and the replacement -// register. -static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, - unsigned *NewVReg = 0) { - VReg = removeCopies(MRI, VReg); - if (!TargetRegisterInfo::isVirtualRegister(VReg)) - return 0; - - bool Is64Bit = ARM64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); - const MachineInstr *DefMI = MRI.getVRegDef(VReg); - unsigned Opc = 0; - unsigned SrcOpNum = 0; - switch (DefMI->getOpcode()) { - case ARM64::ADDSXri: - case ARM64::ADDSWri: - // if CPSR is used, do not fold. 
- if (DefMI->findRegisterDefOperandIdx(ARM64::CPSR, true) == -1) - return 0; - // fall-through to ADDXri and ADDWri. - case ARM64::ADDXri: - case ARM64::ADDWri: - // add x, 1 -> csinc. - if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || - DefMI->getOperand(3).getImm() != 0) - return 0; - SrcOpNum = 1; - Opc = Is64Bit ? ARM64::CSINCXr : ARM64::CSINCWr; - break; - - case ARM64::ORNXrr: - case ARM64::ORNWrr: { - // not x -> csinv, represented as orn dst, xzr, src. - unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); - if (ZReg != ARM64::XZR && ZReg != ARM64::WZR) - return 0; - SrcOpNum = 2; - Opc = Is64Bit ? ARM64::CSINVXr : ARM64::CSINVWr; - break; - } - - case ARM64::SUBSXrr: - case ARM64::SUBSWrr: - // if CPSR is used, do not fold. - if (DefMI->findRegisterDefOperandIdx(ARM64::CPSR, true) == -1) - return 0; - // fall-through to SUBXrr and SUBWrr. - case ARM64::SUBXrr: - case ARM64::SUBWrr: { - // neg x -> csneg, represented as sub dst, xzr, src. - unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); - if (ZReg != ARM64::XZR && ZReg != ARM64::WZR) - return 0; - SrcOpNum = 2; - Opc = Is64Bit ? ARM64::CSNEGXr : ARM64::CSNEGWr; - break; - } - default: - return 0; - } - assert(Opc && SrcOpNum && "Missing parameters"); - - if (NewVReg) - *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); - return Opc; -} - -bool ARM64InstrInfo::canInsertSelect( - const MachineBasicBlock &MBB, const SmallVectorImpl &Cond, - unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, - int &FalseCycles) const { - // Check register classes. - const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - const TargetRegisterClass *RC = - RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); - if (!RC) - return false; - - // Expanding cbz/tbz requires an extra cycle of latency on the condition. - unsigned ExtraCondLat = Cond.size() != 1; - - // GPRs are handled by csel. - // FIXME: Fold in x+1, -x, and ~x when applicable. - if (ARM64::GPR64allRegClass.hasSubClassEq(RC) || - ARM64::GPR32allRegClass.hasSubClassEq(RC)) { - // Single-cycle csel, csinc, csinv, and csneg. - CondCycles = 1 + ExtraCondLat; - TrueCycles = FalseCycles = 1; - if (canFoldIntoCSel(MRI, TrueReg)) - TrueCycles = 0; - else if (canFoldIntoCSel(MRI, FalseReg)) - FalseCycles = 0; - return true; - } - - // Scalar floating point is handled by fcsel. - // FIXME: Form fabs, fmin, and fmax when applicable. - if (ARM64::FPR64RegClass.hasSubClassEq(RC) || - ARM64::FPR32RegClass.hasSubClassEq(RC)) { - CondCycles = 5 + ExtraCondLat; - TrueCycles = FalseCycles = 2; - return true; - } - - // Can't do vectors. - return false; -} - -void ARM64InstrInfo::insertSelect(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, - const SmallVectorImpl &Cond, - unsigned TrueReg, unsigned FalseReg) const { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - // Parse the condition code, see parseCondBranch() above. - ARM64CC::CondCode CC; - switch (Cond.size()) { - default: - llvm_unreachable("Unknown condition opcode in Cond"); - case 1: // b.cc - CC = ARM64CC::CondCode(Cond[0].getImm()); - break; - case 3: { // cbz/cbnz - // We must insert a compare against 0. 
- bool Is64Bit; - switch (Cond[1].getImm()) { - default: - llvm_unreachable("Unknown branch opcode in Cond"); - case ARM64::CBZW: - Is64Bit = 0; - CC = ARM64CC::EQ; - break; - case ARM64::CBZX: - Is64Bit = 1; - CC = ARM64CC::EQ; - break; - case ARM64::CBNZW: - Is64Bit = 0; - CC = ARM64CC::NE; - break; - case ARM64::CBNZX: - Is64Bit = 1; - CC = ARM64CC::NE; - break; - } - unsigned SrcReg = Cond[2].getReg(); - if (Is64Bit) { - // cmp reg, #0 is actually subs xzr, reg, #0. - MRI.constrainRegClass(SrcReg, &ARM64::GPR64spRegClass); - BuildMI(MBB, I, DL, get(ARM64::SUBSXri), ARM64::XZR) - .addReg(SrcReg) - .addImm(0) - .addImm(0); - } else { - MRI.constrainRegClass(SrcReg, &ARM64::GPR32spRegClass); - BuildMI(MBB, I, DL, get(ARM64::SUBSWri), ARM64::WZR) - .addReg(SrcReg) - .addImm(0) - .addImm(0); - } - break; - } - case 4: { // tbz/tbnz - // We must insert a tst instruction. - switch (Cond[1].getImm()) { - default: - llvm_unreachable("Unknown branch opcode in Cond"); - case ARM64::TBZ: - CC = ARM64CC::EQ; - break; - case ARM64::TBNZ: - CC = ARM64CC::NE; - break; - } - // cmp reg, #foo is actually ands xzr, reg, #1< 64 bit extension case, these instructions can do - // much more. - if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) - return false; - // This is a signed or unsigned 32 -> 64 bit extension. - SrcReg = MI.getOperand(1).getReg(); - DstReg = MI.getOperand(0).getReg(); - SubIdx = ARM64::sub_32; - return true; - } -} - -/// analyzeCompare - For a comparison instruction, return the source registers -/// in SrcReg and SrcReg2, and the value it compares against in CmpValue. -/// Return true if the comparison instruction can be analyzed. -bool ARM64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, - int &CmpValue) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::SUBSWrr: - case ARM64::SUBSWrs: - case ARM64::SUBSWrx: - case ARM64::SUBSXrr: - case ARM64::SUBSXrs: - case ARM64::SUBSXrx: - case ARM64::ADDSWrr: - case ARM64::ADDSWrs: - case ARM64::ADDSWrx: - case ARM64::ADDSXrr: - case ARM64::ADDSXrs: - case ARM64::ADDSXrx: - // Replace SUBSWrr with SUBWrr if CPSR is not used. - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = MI->getOperand(2).getReg(); - CmpMask = ~0; - CmpValue = 0; - return true; - case ARM64::SUBSWri: - case ARM64::ADDSWri: - case ARM64::ANDSWri: - case ARM64::SUBSXri: - case ARM64::ADDSXri: - case ARM64::ANDSXri: - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = 0; - CmpMask = ~0; - CmpValue = MI->getOperand(2).getImm(); - return true; - } - - return false; -} - -static bool UpdateOperandRegClass(MachineInstr *Instr) { - MachineBasicBlock *MBB = Instr->getParent(); - assert(MBB && "Can't get MachineBasicBlock here"); - MachineFunction *MF = MBB->getParent(); - assert(MF && "Can't get MachineFunction here"); - const TargetMachine *TM = &MF->getTarget(); - const TargetInstrInfo *TII = TM->getInstrInfo(); - const TargetRegisterInfo *TRI = TM->getRegisterInfo(); - MachineRegisterInfo *MRI = &MF->getRegInfo(); - - for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; - ++OpIdx) { - MachineOperand &MO = Instr->getOperand(OpIdx); - const TargetRegisterClass *OpRegCstraints = - Instr->getRegClassConstraint(OpIdx, TII, TRI); - - // If there's no constraint, there's nothing to do. - if (!OpRegCstraints) - continue; - // If the operand is a frame index, there's nothing to do here. - // A frame index operand will resolve correctly during PEI. 
- if (MO.isFI()) - continue; - - assert(MO.isReg() && - "Operand has register constraints without being a register!"); - - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - if (!OpRegCstraints->contains(Reg)) - return false; - } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && - !MRI->constrainRegClass(Reg, OpRegCstraints)) - return false; - } - - return true; -} - -/// optimizeCompareInstr - Convert the instruction supplying the argument to the -/// comparison into one that sets the zero bit in the flags register. -bool ARM64InstrInfo::optimizeCompareInstr( - MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, - int CmpValue, const MachineRegisterInfo *MRI) const { - - // Replace SUBSWrr with SUBWrr if CPSR is not used. - int Cmp_CPSR = CmpInstr->findRegisterDefOperandIdx(ARM64::CPSR, true); - if (Cmp_CPSR != -1) { - unsigned NewOpc; - switch (CmpInstr->getOpcode()) { - default: - return false; - case ARM64::ADDSWrr: NewOpc = ARM64::ADDWrr; break; - case ARM64::ADDSWri: NewOpc = ARM64::ADDWri; break; - case ARM64::ADDSWrs: NewOpc = ARM64::ADDWrs; break; - case ARM64::ADDSWrx: NewOpc = ARM64::ADDWrx; break; - case ARM64::ADDSXrr: NewOpc = ARM64::ADDXrr; break; - case ARM64::ADDSXri: NewOpc = ARM64::ADDXri; break; - case ARM64::ADDSXrs: NewOpc = ARM64::ADDXrs; break; - case ARM64::ADDSXrx: NewOpc = ARM64::ADDXrx; break; - case ARM64::SUBSWrr: NewOpc = ARM64::SUBWrr; break; - case ARM64::SUBSWri: NewOpc = ARM64::SUBWri; break; - case ARM64::SUBSWrs: NewOpc = ARM64::SUBWrs; break; - case ARM64::SUBSWrx: NewOpc = ARM64::SUBWrx; break; - case ARM64::SUBSXrr: NewOpc = ARM64::SUBXrr; break; - case ARM64::SUBSXri: NewOpc = ARM64::SUBXri; break; - case ARM64::SUBSXrs: NewOpc = ARM64::SUBXrs; break; - case ARM64::SUBSXrx: NewOpc = ARM64::SUBXrx; break; - } - - const MCInstrDesc &MCID = get(NewOpc); - CmpInstr->setDesc(MCID); - CmpInstr->RemoveOperand(Cmp_CPSR); - bool succeeded = UpdateOperandRegClass(CmpInstr); - (void)succeeded; - assert(succeeded && "Some operands reg class are incompatible!"); - return true; - } - - // Continue only if we have a "ri" where immediate is zero. - if (CmpValue != 0 || SrcReg2 != 0) - return false; - - // CmpInstr is a Compare instruction if destination register is not used. - if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) - return false; - - // Get the unique definition of SrcReg. - MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); - if (!MI) - return false; - - // We iterate backward, starting from the instruction before CmpInstr and - // stop when reaching the definition of the source register or done with the - // basic block, to check whether CPSR is used or modified in between. - MachineBasicBlock::iterator I = CmpInstr, E = MI, - B = CmpInstr->getParent()->begin(); - - // Early exit if CmpInstr is at the beginning of the BB. - if (I == B) - return false; - - // Check whether the definition of SrcReg is in the same basic block as - // Compare. If not, we can't optimize away the Compare. - if (MI->getParent() != CmpInstr->getParent()) - return false; - - // Check that CPSR isn't set between the comparison instruction and the one we - // want to change. - const TargetRegisterInfo *TRI = &getRegisterInfo(); - for (--I; I != E; --I) { - const MachineInstr &Instr = *I; - - if (Instr.modifiesRegister(ARM64::CPSR, TRI) || - Instr.readsRegister(ARM64::CPSR, TRI)) - // This instruction modifies or uses CPSR after the one we want to - // change. We can't do this transformation. 
- return false; - if (I == B) - // The 'and' is below the comparison instruction. - return false; - } - - unsigned NewOpc = MI->getOpcode(); - switch (MI->getOpcode()) { - default: - return false; - case ARM64::ADDSWrr: - case ARM64::ADDSWri: - case ARM64::ADDSXrr: - case ARM64::ADDSXri: - case ARM64::SUBSWrr: - case ARM64::SUBSWri: - case ARM64::SUBSXrr: - case ARM64::SUBSXri: - break; - case ARM64::ADDWrr: NewOpc = ARM64::ADDSWrr; break; - case ARM64::ADDWri: NewOpc = ARM64::ADDSWri; break; - case ARM64::ADDXrr: NewOpc = ARM64::ADDSXrr; break; - case ARM64::ADDXri: NewOpc = ARM64::ADDSXri; break; - case ARM64::ADCWr: NewOpc = ARM64::ADCSWr; break; - case ARM64::ADCXr: NewOpc = ARM64::ADCSXr; break; - case ARM64::SUBWrr: NewOpc = ARM64::SUBSWrr; break; - case ARM64::SUBWri: NewOpc = ARM64::SUBSWri; break; - case ARM64::SUBXrr: NewOpc = ARM64::SUBSXrr; break; - case ARM64::SUBXri: NewOpc = ARM64::SUBSXri; break; - case ARM64::SBCWr: NewOpc = ARM64::SBCSWr; break; - case ARM64::SBCXr: NewOpc = ARM64::SBCSXr; break; - case ARM64::ANDWri: NewOpc = ARM64::ANDSWri; break; - case ARM64::ANDXri: NewOpc = ARM64::ANDSXri; break; - } - - // Scan forward for the use of CPSR. - // When checking against MI: if it's a conditional code requires - // checking of V bit, then this is not safe to do. - // It is safe to remove CmpInstr if CPSR is redefined or killed. - // If we are done with the basic block, we need to check whether CPSR is - // live-out. - bool IsSafe = false; - for (MachineBasicBlock::iterator I = CmpInstr, - E = CmpInstr->getParent()->end(); - !IsSafe && ++I != E;) { - const MachineInstr &Instr = *I; - for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO; - ++IO) { - const MachineOperand &MO = Instr.getOperand(IO); - if (MO.isRegMask() && MO.clobbersPhysReg(ARM64::CPSR)) { - IsSafe = true; - break; - } - if (!MO.isReg() || MO.getReg() != ARM64::CPSR) - continue; - if (MO.isDef()) { - IsSafe = true; - break; - } - - // Decode the condition code. - unsigned Opc = Instr.getOpcode(); - ARM64CC::CondCode CC; - switch (Opc) { - default: - return false; - case ARM64::Bcc: - CC = (ARM64CC::CondCode)Instr.getOperand(IO - 2).getImm(); - break; - case ARM64::CSINVWr: - case ARM64::CSINVXr: - case ARM64::CSINCWr: - case ARM64::CSINCXr: - case ARM64::CSELWr: - case ARM64::CSELXr: - case ARM64::CSNEGWr: - case ARM64::CSNEGXr: - CC = (ARM64CC::CondCode)Instr.getOperand(IO - 1).getImm(); - break; - } - - // It is not safe to remove Compare instruction if Overflow(V) is used. - switch (CC) { - default: - // CPSR can be used multiple times, we should continue. - break; - case ARM64CC::VS: - case ARM64CC::VC: - case ARM64CC::GE: - case ARM64CC::LT: - case ARM64CC::GT: - case ARM64CC::LE: - return false; - } - } - } - - // If CPSR is not killed nor re-defined, we should check whether it is - // live-out. If it is live-out, do not optimize. - if (!IsSafe) { - MachineBasicBlock *MBB = CmpInstr->getParent(); - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); - SI != SE; ++SI) - if ((*SI)->isLiveIn(ARM64::CPSR)) - return false; - } - - // Update the instruction to set CPSR. - MI->setDesc(get(NewOpc)); - CmpInstr->eraseFromParent(); - bool succeeded = UpdateOperandRegClass(MI); - (void)succeeded; - assert(succeeded && "Some operands reg class are incompatible!"); - MI->addRegisterDefined(ARM64::CPSR, TRI); - return true; -} - -// Return true if this instruction simply sets its single destination register -// to zero. 
This is equivalent to a register rename of the zero-register. -bool ARM64InstrInfo::isGPRZero(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::MOVZWi: - case ARM64::MOVZXi: // movz Rd, #0 (LSL #0) - if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 3 && - MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands"); - return true; - } - break; - case ARM64::ANDWri: // and Rd, Rzr, #imm - return MI->getOperand(1).getReg() == ARM64::WZR; - case ARM64::ANDXri: - return MI->getOperand(1).getReg() == ARM64::XZR; - case TargetOpcode::COPY: - return MI->getOperand(1).getReg() == ARM64::WZR; - } - return false; -} - -// Return true if this instruction simply renames a general register without -// modifying bits. -bool ARM64InstrInfo::isGPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case TargetOpcode::COPY: { - // GPR32 copies will by lowered to ORRXrs - unsigned DstReg = MI->getOperand(0).getReg(); - return (ARM64::GPR32RegClass.contains(DstReg) || - ARM64::GPR64RegClass.contains(DstReg)); - } - case ARM64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) - if (MI->getOperand(1).getReg() == ARM64::XZR) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands"); - return true; - } - case ARM64::ADDXri: // add Xd, Xn, #0 (LSL #0) - if (MI->getOperand(2).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands"); - return true; - } - } - return false; -} - -// Return true if this instruction simply renames a general register without -// modifying bits. -bool ARM64InstrInfo::isFPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case TargetOpcode::COPY: { - // FPR64 copies will by lowered to ORR.16b - unsigned DstReg = MI->getOperand(0).getReg(); - return (ARM64::FPR64RegClass.contains(DstReg) || - ARM64::FPR128RegClass.contains(DstReg)); - } - case ARM64::ORRv16i8: - if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { - assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() && - "invalid ORRv16i8 operands"); - return true; - } - } - return false; -} - -unsigned ARM64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::LDRWui: - case ARM64::LDRXui: - case ARM64::LDRBui: - case ARM64::LDRHui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } - - return 0; -} - -unsigned ARM64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::STRWui: - case ARM64::STRXui: - case ARM64::STRBui: - case ARM64::STRHui: - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } - return 0; -} - -/// Return true if this is load/store scales or extends its register offset. 
- -/// Return true if this load/store scales or extends its register offset. -/// This refers to scaling a dynamic index as opposed to scaled immediates. -/// MI should be a memory op that allows scaled addressing. -bool ARM64InstrInfo::isScaledAddr(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::LDRBBro: - case ARM64::LDRBro: - case ARM64::LDRDro: - case ARM64::LDRHHro: - case ARM64::LDRHro: - case ARM64::LDRQro: - case ARM64::LDRSBWro: - case ARM64::LDRSBXro: - case ARM64::LDRSHWro: - case ARM64::LDRSHXro: - case ARM64::LDRSWro: - case ARM64::LDRSro: - case ARM64::LDRWro: - case ARM64::LDRXro: - case ARM64::STRBBro: - case ARM64::STRBro: - case ARM64::STRDro: - case ARM64::STRHHro: - case ARM64::STRHro: - case ARM64::STRQro: - case ARM64::STRSro: - case ARM64::STRWro: - case ARM64::STRXro: - unsigned Val = MI->getOperand(3).getImm(); - ARM64_AM::ExtendType ExtType = ARM64_AM::getMemExtendType(Val); - return (ExtType != ARM64_AM::UXTX) || ARM64_AM::getMemDoShift(Val); - } - return false; -} - -/// Check all MachineMemOperands for a hint to suppress pairing. -bool ARM64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const { - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - for (MachineInstr::mmo_iterator MM = MI->memoperands_begin(), - E = MI->memoperands_end(); - MM != E; ++MM) { - - if ((*MM)->getFlags() & - (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) { - return true; - } - } - return false; -} - -/// Set a flag on the first MachineMemOperand to suppress pairing. -void ARM64InstrInfo::suppressLdStPair(MachineInstr *MI) const { - if (MI->memoperands_empty()) - return; - - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - (*MI->memoperands_begin()) - ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit); -} - -bool ARM64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const { - switch (LdSt->getOpcode()) { - default: - return false; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - case ARM64::STRXui: - case ARM64::STRWui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - case ARM64::LDRXui: - case ARM64::LDRWui: - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) - return false; - BaseReg = LdSt->getOperand(1).getReg(); - MachineFunction &MF = *LdSt->getParent()->getParent(); - unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize(); - Offset = LdSt->getOperand(2).getImm() * Width; - return true; - }; -} - -/// Detect opportunities for ldp/stp formation. -/// -/// Only called for LdSt for which getLdStBaseRegImmOfs returns true. -bool ARM64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { - // Only cluster up to a single pair. - if (NumLoads > 1) - return false; - if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) - return false; - // getLdStBaseRegImmOfs guarantees that operand 2 is an immediate. - unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); - // Allow 6 bits of positive range. - if (Ofs1 > 64) - return false; - // The caller should already have ordered First/SecondLdSt by offset. - unsigned Ofs2 = SecondLdSt->getOperand(2).getImm(); - return Ofs1 + 1 == Ofs2; -}
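Stripped of the MachineInstr plumbing, the clustering rule above reduces to a pure predicate on the two scaled offsets. A standalone sketch (wouldCluster is a hypothetical helper, not part of this patch):

    // Two same-opcode loads become an ldp/stp candidate only when the
    // caller has sorted them by offset, the first offset stays in the
    // small positive range, and the second is exactly the next slot.
    static bool wouldCluster(unsigned Opc1, unsigned Ofs1,
                             unsigned Opc2, unsigned Ofs2, unsigned NumLoads) {
      if (NumLoads > 1)   // only cluster up to a single pair
        return false;
      if (Opc1 != Opc2)   // same access kind and width required
        return false;
      if (Ofs1 > 64)      // keep the pair's immediate encodable
        return false;
      return Ofs1 + 1 == Ofs2;
    }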
- -bool ARM64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const { - // Cyclone can fuse CMN, CMP followed by Bcc. - - // FIXME: B0 can also fuse: - // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ. - if (Second->getOpcode() != ARM64::Bcc) - return false; - switch (First->getOpcode()) { - default: - return false; - case ARM64::SUBSWri: - case ARM64::ADDSWri: - case ARM64::ANDSWri: - case ARM64::SUBSXri: - case ARM64::ADDSXri: - case ARM64::ANDSXri: - return true; - } -} - -MachineInstr *ARM64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, - int FrameIx, - uint64_t Offset, - const MDNode *MDPtr, - DebugLoc DL) const { - MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM64::DBG_VALUE)) - .addFrameIndex(FrameIx) - .addImm(0) - .addImm(Offset) - .addMetadata(MDPtr); - return &*MIB; -} - -static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, - unsigned Reg, unsigned SubIdx, - unsigned State, - const TargetRegisterInfo *TRI) { - if (!SubIdx) - return MIB.addReg(Reg, State); - - if (TargetRegisterInfo::isPhysicalRegister(Reg)) - return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); - return MIB.addReg(Reg, State, SubIdx); -} - -static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, - unsigned NumRegs) { - // We really want the positive remainder mod 32 here; that happens to be - // easily obtainable with a mask. - return ((DestReg - SrcReg) & 0x1f) < NumRegs; -} - -void ARM64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, - unsigned SrcReg, bool KillSrc, - unsigned Opcode, - llvm::ArrayRef<unsigned> Indices) const { - const TargetRegisterInfo *TRI = &getRegisterInfo(); - uint16_t DestEncoding = TRI->getEncodingValue(DestReg); - uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); - unsigned NumRegs = Indices.size(); - - int SubReg = 0, End = NumRegs, Incr = 1; - if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { - SubReg = NumRegs - 1; - End = -1; - Incr = -1; - } - - for (; SubReg != End; SubReg += Incr) { - const MachineInstrBuilder &MIB = BuildMI(MBB, I, DL, get(Opcode)); - AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); - AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); - AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); - } -} - -void ARM64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { - if (ARM64::GPR32spRegClass.contains(DestReg) && - (ARM64::GPR32spRegClass.contains(SrcReg) || SrcReg == ARM64::WZR)) { - const TargetRegisterInfo *TRI = &getRegisterInfo(); - - if (DestReg == ARM64::WSP || SrcReg == ARM64::WSP) { - // If either operand is WSP, expand to ADD #0. - if (Subtarget.hasZeroCycleRegMove()) { - // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. - unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - // This instruction is reading and writing X registers. This may upset - // the register scavenger and machine verifier, so we need to indicate - // that we are reading an undefined value from SrcRegX, but a proper - // value from SrcReg.
- BuildMI(MBB, I, DL, get(ARM64::ADDXri), DestRegX) - .addReg(SrcRegX, RegState::Undef) - .addImm(0) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)) - .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); - } else { - BuildMI(MBB, I, DL, get(ARM64::ADDWri), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addImm(0) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } - } else if (SrcReg == ARM64::WZR && Subtarget.hasZeroCycleZeroing()) { - BuildMI(MBB, I, DL, get(ARM64::MOVZWi), DestReg).addImm(0).addImm( - ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } else { - if (Subtarget.hasZeroCycleRegMove()) { - // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. - unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - // This instruction is reading and writing X registers. This may upset - // the register scavenger and machine verifier, so we need to indicate - // that we are reading an undefined value from SrcRegX, but a proper - // value from SrcReg. - BuildMI(MBB, I, DL, get(ARM64::ORRXrr), DestRegX) - .addReg(ARM64::XZR) - .addReg(SrcRegX, RegState::Undef) - .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); - } else { - // Otherwise, expand to ORR WZR. - BuildMI(MBB, I, DL, get(ARM64::ORRWrr), DestReg) - .addReg(ARM64::WZR) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - } - return; - } - - if (ARM64::GPR64spRegClass.contains(DestReg) && - (ARM64::GPR64spRegClass.contains(SrcReg) || SrcReg == ARM64::XZR)) { - if (DestReg == ARM64::SP || SrcReg == ARM64::SP) { - // If either operand is SP, expand to ADD #0. - BuildMI(MBB, I, DL, get(ARM64::ADDXri), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addImm(0) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } else if (SrcReg == ARM64::XZR && Subtarget.hasZeroCycleZeroing()) { - BuildMI(MBB, I, DL, get(ARM64::MOVZXi), DestReg).addImm(0).addImm( - ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } else { - // Otherwise, expand to ORR XZR. - BuildMI(MBB, I, DL, get(ARM64::ORRXrr), DestReg) - .addReg(ARM64::XZR) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - // Copy a DDDD register quad by copying the individual sub-registers. - if (ARM64::DDDDRegClass.contains(DestReg) && - ARM64::DDDDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1, - ARM64::dsub2, ARM64::dsub3 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8, - Indices); - return; - } - - // Copy a DDD register triple by copying the individual sub-registers. - if (ARM64::DDDRegClass.contains(DestReg) && - ARM64::DDDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1, - ARM64::dsub2 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8, - Indices); - return; - } - - // Copy a DD register pair by copying the individual sub-registers. - if (ARM64::DDRegClass.contains(DestReg) && - ARM64::DDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8, - Indices); - return; - } - - // Copy a QQQQ register quad by copying the individual sub-registers. 
- if (ARM64::QQQQRegClass.contains(DestReg) && - ARM64::QQQQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1, - ARM64::qsub2, ARM64::qsub3 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8, - Indices); - return; - } - - // Copy a QQQ register triple by copying the individual sub-registers. - if (ARM64::QQQRegClass.contains(DestReg) && - ARM64::QQQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1, - ARM64::qsub2 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8, - Indices); - return; - } - - // Copy a QQ register pair by copying the individual sub-registers. - if (ARM64::QQRegClass.contains(DestReg) && - ARM64::QQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8, - Indices); - return; - } - - if (ARM64::FPR128RegClass.contains(DestReg) && - ARM64::FPR128RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - return; - } - - if (ARM64::FPR64RegClass.contains(DestReg) && - ARM64::FPR64RegClass.contains(SrcReg)) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::dsub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::dsub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - return; - } - - if (ARM64::FPR32RegClass.contains(DestReg) && - ARM64::FPR32RegClass.contains(SrcReg)) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::ssub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::ssub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - return; - } - - if (ARM64::FPR16RegClass.contains(DestReg) && - ARM64::FPR16RegClass.contains(SrcReg)) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::hsub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::hsub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - return; - } - - if (ARM64::FPR8RegClass.contains(DestReg) && - ARM64::FPR8RegClass.contains(SrcReg)) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::bsub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::bsub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - return; - } - - // Copies between GPR64 and FPR64. - if (ARM64::FPR64RegClass.contains(DestReg) && - ARM64::GPR64RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVXDr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - if (ARM64::GPR64RegClass.contains(DestReg) && - ARM64::FPR64RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVDXr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - // Copies between GPR32 and FPR32. 
- if (ARM64::FPR32RegClass.contains(DestReg) && - ARM64::GPR32RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVWSr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - if (ARM64::GPR32RegClass.contains(DestReg) && - ARM64::FPR32RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVSWr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - - assert(0 && "unimplemented reg-to-reg copy"); -} - -void ARM64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, int FI, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - DebugLoc DL; - if (MBBI != MBB.end()) - DL = MBBI->getDebugLoc(); - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FI); - - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); - unsigned Opc = 0; - bool Offset = true; - switch (RC->getSize()) { - case 1: - if (ARM64::FPR8RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRBui; - break; - case 2: - if (ARM64::FPR16RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRHui; - break; - case 4: - if (ARM64::GPR32allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::STRWui; - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) - MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR32RegClass); - else - assert(SrcReg != ARM64::WSP); - } else if (ARM64::FPR32RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRSui; - break; - case 8: - if (ARM64::GPR64allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::STRXui; - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) - MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR64RegClass); - else - assert(SrcReg != ARM64::SP); - } else if (ARM64::FPR64RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRDui; - break; - case 16: - if (ARM64::FPR128RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRQui; - else if (ARM64::DDRegClass.hasSubClassEq(RC)) - Opc = ARM64::ST1Twov1d, Offset = false; - break; - case 24: - if (ARM64::DDDRegClass.hasSubClassEq(RC)) - Opc = ARM64::ST1Threev1d, Offset = false; - break; - case 32: - if (ARM64::DDDDRegClass.hasSubClassEq(RC)) - Opc = ARM64::ST1Fourv1d, Offset = false; - else if (ARM64::QQRegClass.hasSubClassEq(RC)) - Opc = ARM64::ST1Twov2d, Offset = false; - break; - case 48: - if (ARM64::QQQRegClass.hasSubClassEq(RC)) - Opc = ARM64::ST1Threev2d, Offset = false; - break; - case 64: - if (ARM64::QQQQRegClass.hasSubClassEq(RC)) - Opc = ARM64::ST1Fourv2d, Offset = false; - break; - } - assert(Opc && "Unknown register class"); - - const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI); - - if (Offset) - MI.addImm(0); - MI.addMemOperand(MMO); -} - -void ARM64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FI, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - DebugLoc DL; - if (MBBI != MBB.end()) - DL = MBBI->getDebugLoc(); - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FI); - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); - - unsigned Opc = 0; - bool Offset = 
true; - switch (RC->getSize()) { - case 1: - if (ARM64::FPR8RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRBui; - break; - case 2: - if (ARM64::FPR16RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRHui; - break; - case 4: - if (ARM64::GPR32allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::LDRWui; - if (TargetRegisterInfo::isVirtualRegister(DestReg)) - MF.getRegInfo().constrainRegClass(DestReg, &ARM64::GPR32RegClass); - else - assert(DestReg != ARM64::WSP); - } else if (ARM64::FPR32RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRSui; - break; - case 8: - if (ARM64::GPR64allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::LDRXui; - if (TargetRegisterInfo::isVirtualRegister(DestReg)) - MF.getRegInfo().constrainRegClass(DestReg, &ARM64::GPR64RegClass); - else - assert(DestReg != ARM64::SP); - } else if (ARM64::FPR64RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRDui; - break; - case 16: - if (ARM64::FPR128RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRQui; - else if (ARM64::DDRegClass.hasSubClassEq(RC)) - Opc = ARM64::LD1Twov1d, Offset = false; - break; - case 24: - if (ARM64::DDDRegClass.hasSubClassEq(RC)) - Opc = ARM64::LD1Threev1d, Offset = false; - break; - case 32: - if (ARM64::DDDDRegClass.hasSubClassEq(RC)) - Opc = ARM64::LD1Fourv1d, Offset = false; - else if (ARM64::QQRegClass.hasSubClassEq(RC)) - Opc = ARM64::LD1Twov2d, Offset = false; - break; - case 48: - if (ARM64::QQQRegClass.hasSubClassEq(RC)) - Opc = ARM64::LD1Threev2d, Offset = false; - break; - case 64: - if (ARM64::QQQQRegClass.hasSubClassEq(RC)) - Opc = ARM64::LD1Fourv2d, Offset = false; - break; - } - assert(Opc && "Unknown register class"); - - const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) - .addReg(DestReg, getDefRegState(true)) - .addFrameIndex(FI); - if (Offset) - MI.addImm(0); - MI.addMemOperand(MMO); -} - -void llvm::emitFrameOffset(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, int Offset, - const ARM64InstrInfo *TII, MachineInstr::MIFlag Flag, - bool SetCPSR) { - if (DestReg == SrcReg && Offset == 0) - return; - - bool isSub = Offset < 0; - if (isSub) - Offset = -Offset; - - // FIXME: If the offset won't fit in 24-bits, compute the offset into a - // scratch register. If DestReg is a virtual register, use it as the - // scratch register; otherwise, create a new virtual register (to be - // replaced by the scavenger at the end of PEI). That case can be optimized - // slightly if DestReg is SP which is always 16-byte aligned, so the scratch - // register can be loaded with offset%8 and the add/sub can use an extending - // instruction with LSL#3. - // Currently the function handles any offsets but generates a poor sequence - // of code. - // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); - - unsigned Opc; - if (SetCPSR) - Opc = isSub ? ARM64::SUBSXri : ARM64::ADDSXri; - else - Opc = isSub ? 
ARM64::SUBXri : ARM64::ADDXri; - const unsigned MaxEncoding = 0xfff; - const unsigned ShiftSize = 12; - const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; - while (((unsigned)Offset) >= (1 << ShiftSize)) { - unsigned ThisVal; - if (((unsigned)Offset) > MaxEncodableValue) { - ThisVal = MaxEncodableValue; - } else { - ThisVal = Offset & MaxEncodableValue; - } - assert((ThisVal >> ShiftSize) <= MaxEncoding && - "Encoding cannot handle value that big"); - BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) - .addReg(SrcReg) - .addImm(ThisVal >> ShiftSize) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftSize)) - .setMIFlag(Flag); - - SrcReg = DestReg; - Offset -= ThisVal; - if (Offset == 0) - return; - } - BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) - .addReg(SrcReg) - .addImm(Offset) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)) - .setMIFlag(Flag); -} - -MachineInstr * -ARM64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const { - // This is a bit of a hack. Consider this instruction: - // - // %vreg0 = COPY %SP; GPR64all:%vreg0 - // - // We explicitly chose GPR64all for the virtual register so such a copy might - // be eliminated by RegisterCoalescer. However, that may not be possible, and - // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all - // register class, TargetInstrInfo::foldMemoryOperand() is going to try. - // - // To prevent that, we are going to constrain the %vreg0 register class here. - // - // - // - if (MI->isCopy()) { - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); - if (SrcReg == ARM64::SP && TargetRegisterInfo::isVirtualRegister(DstReg)) { - MF.getRegInfo().constrainRegClass(DstReg, &ARM64::GPR64RegClass); - return 0; - } - if (DstReg == ARM64::SP && TargetRegisterInfo::isVirtualRegister(SrcReg)) { - MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR64RegClass); - return 0; - } - } - - // Cannot fold. - return 0; -} - -int llvm::isARM64FrameOffsetLegal(const MachineInstr &MI, int &Offset, - bool *OutUseUnscaledOp, - unsigned *OutUnscaledOp, - int *EmittableOffset) { - int Scale = 1; - bool IsSigned = false; - // The ImmIdx should be changed case by case if it is not 2. - unsigned ImmIdx = 2; - unsigned UnscaledOp = 0; - // Set output values in case of early exit. - if (EmittableOffset) - *EmittableOffset = 0; - if (OutUseUnscaledOp) - *OutUseUnscaledOp = false; - if (OutUnscaledOp) - *OutUnscaledOp = 0; - switch (MI.getOpcode()) { - default: - assert(0 && "unhandled opcode in rewriteARM64FrameIndex"); - // Vector spills/fills can't take an immediate offset.
- case ARM64::LD1Twov2d: - case ARM64::LD1Threev2d: - case ARM64::LD1Fourv2d: - case ARM64::LD1Twov1d: - case ARM64::LD1Threev1d: - case ARM64::LD1Fourv1d: - case ARM64::ST1Twov2d: - case ARM64::ST1Threev2d: - case ARM64::ST1Fourv2d: - case ARM64::ST1Twov1d: - case ARM64::ST1Threev1d: - case ARM64::ST1Fourv1d: - return ARM64FrameOffsetCannotUpdate; - case ARM64::PRFMui: - Scale = 8; - UnscaledOp = ARM64::PRFUMi; - break; - case ARM64::LDRXui: - Scale = 8; - UnscaledOp = ARM64::LDURXi; - break; - case ARM64::LDRWui: - Scale = 4; - UnscaledOp = ARM64::LDURWi; - break; - case ARM64::LDRBui: - Scale = 1; - UnscaledOp = ARM64::LDURBi; - break; - case ARM64::LDRHui: - Scale = 2; - UnscaledOp = ARM64::LDURHi; - break; - case ARM64::LDRSui: - Scale = 4; - UnscaledOp = ARM64::LDURSi; - break; - case ARM64::LDRDui: - Scale = 8; - UnscaledOp = ARM64::LDURDi; - break; - case ARM64::LDRQui: - Scale = 16; - UnscaledOp = ARM64::LDURQi; - break; - case ARM64::LDRBBui: - Scale = 1; - UnscaledOp = ARM64::LDURBBi; - break; - case ARM64::LDRHHui: - Scale = 2; - UnscaledOp = ARM64::LDURHHi; - break; - case ARM64::LDRSBXui: - Scale = 1; - UnscaledOp = ARM64::LDURSBXi; - break; - case ARM64::LDRSBWui: - Scale = 1; - UnscaledOp = ARM64::LDURSBWi; - break; - case ARM64::LDRSHXui: - Scale = 2; - UnscaledOp = ARM64::LDURSHXi; - break; - case ARM64::LDRSHWui: - Scale = 2; - UnscaledOp = ARM64::LDURSHWi; - break; - case ARM64::LDRSWui: - Scale = 4; - UnscaledOp = ARM64::LDURSWi; - break; - - case ARM64::STRXui: - Scale = 8; - UnscaledOp = ARM64::STURXi; - break; - case ARM64::STRWui: - Scale = 4; - UnscaledOp = ARM64::STURWi; - break; - case ARM64::STRBui: - Scale = 1; - UnscaledOp = ARM64::STURBi; - break; - case ARM64::STRHui: - Scale = 2; - UnscaledOp = ARM64::STURHi; - break; - case ARM64::STRSui: - Scale = 4; - UnscaledOp = ARM64::STURSi; - break; - case ARM64::STRDui: - Scale = 8; - UnscaledOp = ARM64::STURDi; - break; - case ARM64::STRQui: - Scale = 16; - UnscaledOp = ARM64::STURQi; - break; - case ARM64::STRBBui: - Scale = 1; - UnscaledOp = ARM64::STURBBi; - break; - case ARM64::STRHHui: - Scale = 2; - UnscaledOp = ARM64::STURHHi; - break; - - case ARM64::LDPXi: - case ARM64::LDPDi: - case ARM64::STPXi: - case ARM64::STPDi: - IsSigned = true; - Scale = 8; - break; - case ARM64::LDPQi: - case ARM64::STPQi: - IsSigned = true; - Scale = 16; - break; - case ARM64::LDPWi: - case ARM64::LDPSi: - case ARM64::STPWi: - case ARM64::STPSi: - IsSigned = true; - Scale = 4; - break; - - case ARM64::LDURXi: - case ARM64::LDURWi: - case ARM64::LDURBi: - case ARM64::LDURHi: - case ARM64::LDURSi: - case ARM64::LDURDi: - case ARM64::LDURQi: - case ARM64::LDURHHi: - case ARM64::LDURBBi: - case ARM64::LDURSBXi: - case ARM64::LDURSBWi: - case ARM64::LDURSHXi: - case ARM64::LDURSHWi: - case ARM64::LDURSWi: - case ARM64::STURXi: - case ARM64::STURWi: - case ARM64::STURBi: - case ARM64::STURHi: - case ARM64::STURSi: - case ARM64::STURDi: - case ARM64::STURQi: - case ARM64::STURBBi: - case ARM64::STURHHi: - Scale = 1; - break; - } - - Offset += MI.getOperand(ImmIdx).getImm() * Scale; - - bool useUnscaledOp = false; - // If the offset doesn't match the scale, we rewrite the instruction to - // use the unscaled instruction instead. Likewise, if we have a negative - // offset (and have an unscaled op to use). 
- if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0)) - useUnscaledOp = true; - - // Use an unscaled addressing mode if the instruction has a negative offset - // (or if the instruction is already using an unscaled addressing mode). - unsigned MaskBits; - if (IsSigned) { - // ldp/stp instructions. - MaskBits = 7; - Offset /= Scale; - } else if (UnscaledOp == 0 || useUnscaledOp) { - MaskBits = 9; - IsSigned = true; - Scale = 1; - } else { - MaskBits = 12; - IsSigned = false; - Offset /= Scale; - } - - // Attempt to fold address computation. - int MaxOff = (1 << (MaskBits - IsSigned)) - 1; - int MinOff = (IsSigned ? (-MaxOff - 1) : 0); - if (Offset >= MinOff && Offset <= MaxOff) { - if (EmittableOffset) - *EmittableOffset = Offset; - Offset = 0; - } else { - int NewOff = Offset < 0 ? MinOff : MaxOff; - if (EmittableOffset) - *EmittableOffset = NewOff; - Offset = (Offset - NewOff) * Scale; - } - if (OutUseUnscaledOp) - *OutUseUnscaledOp = useUnscaledOp; - if (OutUnscaledOp) - *OutUnscaledOp = UnscaledOp; - return ARM64FrameOffsetCanUpdate | - (Offset == 0 ? ARM64FrameOffsetIsLegal : 0); -} - -bool llvm::rewriteARM64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, - const ARM64InstrInfo *TII) { - unsigned Opcode = MI.getOpcode(); - unsigned ImmIdx = FrameRegIdx + 1; - - if (Opcode == ARM64::ADDSXri || Opcode == ARM64::ADDXri) { - Offset += MI.getOperand(ImmIdx).getImm(); - emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), - MI.getOperand(0).getReg(), FrameReg, Offset, TII, - MachineInstr::NoFlags, (Opcode == ARM64::ADDSXri)); - MI.eraseFromParent(); - Offset = 0; - return true; - } - - int NewOffset; - unsigned UnscaledOp; - bool UseUnscaledOp; - int Status = isARM64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, &UnscaledOp, - &NewOffset); - if (Status & ARM64FrameOffsetCanUpdate) { - if (Status & ARM64FrameOffsetIsLegal) - // Replace the FrameIndex with FrameReg. - MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); - if (UseUnscaledOp) - MI.setDesc(TII->get(UnscaledOp)); - - MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); - return Offset == 0; - } - - return false; -} - -void ARM64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { - NopInst.setOpcode(ARM64::HINT); - NopInst.addOperand(MCOperand::CreateImm(0)); -} diff --git a/lib/Target/ARM64/ARM64InstrInfo.h b/lib/Target/ARM64/ARM64InstrInfo.h deleted file mode 100644 index 2591ca0..0000000 --- a/lib/Target/ARM64/ARM64InstrInfo.h +++ /dev/null @@ -1,219 +0,0 @@ -//===- ARM64InstrInfo.h - ARM64 Instruction Information ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the TargetInstrInfo class. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64INSTRINFO_H -#define LLVM_TARGET_ARM64INSTRINFO_H - -#include "ARM64.h" -#include "ARM64RegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" - -#define GET_INSTRINFO_HEADER -#include "ARM64GenInstrInfo.inc" - -namespace llvm { - -class ARM64Subtarget; -class ARM64TargetMachine; - -class ARM64InstrInfo : public ARM64GenInstrInfo { - // Reserve bits in the MachineMemOperand target hint flags, starting at 1. - // They will be shifted into MOTargetHintStart when accessed. 
- enum TargetMemOperandFlags { - MOSuppressPair = 1 - }; - - const ARM64RegisterInfo RI; - const ARM64Subtarget &Subtarget; - -public: - explicit ARM64InstrInfo(const ARM64Subtarget &STI); - - /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As - /// such, whenever a client has an instance of instruction info, it should - /// always be able to get register info as well (through this method). - const ARM64RegisterInfo &getRegisterInfo() const { return RI; } - - unsigned GetInstSizeInBytes(const MachineInstr *MI) const; - - bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const override; - - unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const override; - unsigned isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const override; - - /// \brief Does this instruction set its full destination register to zero? - bool isGPRZero(const MachineInstr *MI) const; - - /// \brief Does this instruction rename a GPR without modifying bits? - bool isGPRCopy(const MachineInstr *MI) const; - - /// \brief Does this instruction rename an FPR without modifying bits? - bool isFPRCopy(const MachineInstr *MI) const; - - /// Return true if this load/store scales or extends its register offset. - /// This refers to scaling a dynamic index as opposed to scaled immediates. - /// MI should be a memory op that allows scaled addressing. - bool isScaledAddr(const MachineInstr *MI) const; - - /// Return true if pairing the given load or store is hinted to be - /// unprofitable. - bool isLdStPairSuppressed(const MachineInstr *MI) const; - - /// Hint that pairing the given load or store is unprofitable. - void suppressLdStPair(MachineInstr *MI) const; - - bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const override; - - bool enableClusterLoads() const override { return true; } - - bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, - unsigned NumLoads) const override; - - bool shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const override; - - MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, - uint64_t Offset, const MDNode *MDPtr, - DebugLoc DL) const; - void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, - bool KillSrc, unsigned Opcode, - llvm::ArrayRef<unsigned> Indices) const; - void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, - bool KillSrc) const override; - - void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, unsigned SrcReg, - bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - - void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, unsigned DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - - MachineInstr * - foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const override; - - bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify = false) const override; - unsigned RemoveBranch(MachineBasicBlock &MBB) const override; - unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, -
MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, - DebugLoc DL) const override; - bool - ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; - bool canInsertSelect(const MachineBasicBlock &, - const SmallVectorImpl<MachineOperand> &Cond, unsigned, - unsigned, int &, int &, int &) const override; - void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - DebugLoc DL, unsigned DstReg, - const SmallVectorImpl<MachineOperand> &Cond, - unsigned TrueReg, unsigned FalseReg) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; - - /// analyzeCompare - For a comparison instruction, return the source registers - /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. - /// Return true if the comparison instruction can be analyzed. - bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, - int &CmpValue) const override; - /// optimizeCompareInstr - Convert the instruction supplying the argument to - /// the comparison into one that sets the zero bit in the flags register. - bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, - unsigned SrcReg2, int CmpMask, int CmpValue, - const MachineRegisterInfo *MRI) const override; - -private: - void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, - MachineBasicBlock *TBB, - const SmallVectorImpl<MachineOperand> &Cond) const; -}; - -/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg -/// plus Offset. This is intended to be used from within the prolog/epilog -/// insertion (PEI) pass, where a virtual scratch register may be allocated -/// if necessary, to be replaced by the scavenger at the end of PEI. -void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset, - const ARM64InstrInfo *TII, - MachineInstr::MIFlag = MachineInstr::NoFlags, - bool SetCPSR = false); - -/// rewriteARM64FrameIndex - Rewrite MI to access 'Offset' bytes from the -/// FP. Return false if the offset could not be handled directly in MI, and -/// return the left-over portion by reference. -bool rewriteARM64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, - const ARM64InstrInfo *TII); - -/// \brief Use to report the frame offset status in isARM64FrameOffsetLegal. -enum ARM64FrameOffsetStatus { - ARM64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply. - ARM64FrameOffsetIsLegal = 0x1, ///< Offset is legal. - ARM64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly. -}; -
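ARM64FrameOffsetCanUpdate and ARM64FrameOffsetIsLegal are independent bits, so a caller has three meaningful outcomes to decode. A minimal sketch of that decoding, following the same pattern rewriteARM64FrameIndex uses in the .cpp file (applyFrameStatus is a hypothetical name, not part of this patch):

    void applyFrameStatus(int Status) {
      if (!(Status & ARM64FrameOffsetCanUpdate))
        return;                    // offset cannot be applied at all
      if (Status & ARM64FrameOffsetIsLegal) {
        // the whole offset fits the encoding; rewrite the operand in place
      } else {
        // partial fold: a residual offset remains for the caller to emit
      }
    }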
-/// \brief Check if the @p Offset is a valid frame offset for @p MI. -/// The returned value reports the validity of the frame offset for @p MI. -/// It uses the values defined by ARM64FrameOffsetStatus for that. -/// If result == ARM64FrameOffsetCannotUpdate, @p MI cannot be updated to -/// use an offset. -/// If result & ARM64FrameOffsetIsLegal, @p Offset can completely be -/// rewritten in @p MI. -/// If result & ARM64FrameOffsetCanUpdate, @p Offset contains the -/// amount that is off the limit of the legal offset. -/// If set, @p OutUseUnscaledOp will contain whether @p MI should be -/// turned into an unscaled operator, whose opcode is in @p OutUnscaledOp. -/// If set, @p EmittableOffset contains the amount that can be set in @p MI -/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that -/// is a legal offset. -int isARM64FrameOffsetLegal(const MachineInstr &MI, int &Offset, - bool *OutUseUnscaledOp = NULL, - unsigned *OutUnscaledOp = NULL, - int *EmittableOffset = NULL); - -static inline bool isUncondBranchOpcode(int Opc) { return Opc == ARM64::B; } - -static inline bool isCondBranchOpcode(int Opc) { - switch (Opc) { - case ARM64::Bcc: - case ARM64::CBZW: - case ARM64::CBZX: - case ARM64::CBNZW: - case ARM64::CBNZX: - case ARM64::TBZ: - case ARM64::TBNZ: - return true; - default: - return false; - } -} - -static inline bool isIndirectBranchOpcode(int Opc) { return Opc == ARM64::BR; } - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64InstrInfo.td b/lib/Target/ARM64/ARM64InstrInfo.td deleted file mode 100644 index 2fe1720..0000000 --- a/lib/Target/ARM64/ARM64InstrInfo.td +++ /dev/null @@ -1,4458 +0,0 @@ -//===- ARM64InstrInfo.td - Describe the ARM64 Instructions -*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// ARM64 Instruction definitions. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// ARM64-specific DAG Nodes. -// - -// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS -def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2, - [SDTCisSameAs<0, 2>, - SDTCisSameAs<0, 3>, - SDTCisInt<0>, SDTCisVT<1, i32>]>; - -// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS -def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3, - [SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisInt<0>, - SDTCisVT<3, i32>]>; - -// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS -def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, - [SDTCisSameAs<0, 2>, - SDTCisSameAs<0, 3>, - SDTCisInt<0>, - SDTCisVT<1, i32>, - SDTCisVT<4, i32>]>; - -def SDT_ARM64Brcond : SDTypeProfile<0, 3, - [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>, - SDTCisVT<2, i32>]>; -def SDT_ARM64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; -def SDT_ARM64tbz : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisVT<1, i64>, - SDTCisVT<2, OtherVT>]>; - - -def SDT_ARM64CSel : SDTypeProfile<1, 4, - [SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisInt<3>, - SDTCisVT<4, i32>]>; -def SDT_ARM64FCmp : SDTypeProfile<0, 2, - [SDTCisFP<0>, - SDTCisSameAs<0, 1>]>; -def SDT_ARM64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; -def SDT_ARM64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; -def SDT_ARM64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>, - SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>]>; -def SDT_ARM64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>; -def SDT_ARM64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; -def SDT_ARM64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisInt<2>, SDTCisInt<3>]>; -def SDT_ARM64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; -def SDT_ARM64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisInt<3>]>; -def SDT_ARM64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; - -def SDT_ARM64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; -def SDT_ARM64fcmpz : SDTypeProfile<1, 1, []>; -def SDT_ARM64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>; -def SDT_ARM64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, -
SDTCisSameAs<0,2>]>; -def SDT_ARM64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, - SDTCisSameAs<0,3>]>; -def SDT_ARM64TCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; -def SDT_ARM64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>; - -def SDT_ARM64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>; - -def SDT_ARM64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, - SDTCisPtrTy<1>]>; -def SDT_ARM64WrapperLarge : SDTypeProfile<1, 4, - [SDTCisVT<0, i64>, SDTCisVT<1, i32>, - SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, - SDTCisSameAs<1, 4>]>; - - -// Node definitions. -def ARM64adrp : SDNode<"ARM64ISD::ADRP", SDTIntUnaryOp, []>; -def ARM64addlow : SDNode<"ARM64ISD::ADDlow", SDTIntBinOp, []>; -def ARM64LOADgot : SDNode<"ARM64ISD::LOADgot", SDTIntUnaryOp>; -def ARM64callseq_start : SDNode<"ISD::CALLSEQ_START", - SDCallSeqStart<[ SDTCisVT<0, i32> ]>, - [SDNPHasChain, SDNPOutGlue]>; -def ARM64callseq_end : SDNode<"ISD::CALLSEQ_END", - SDCallSeqEnd<[ SDTCisVT<0, i32>, - SDTCisVT<1, i32> ]>, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -def ARM64call : SDNode<"ARM64ISD::CALL", - SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, - SDNPVariadic]>; -def ARM64brcond : SDNode<"ARM64ISD::BRCOND", SDT_ARM64Brcond, - [SDNPHasChain]>; -def ARM64cbz : SDNode<"ARM64ISD::CBZ", SDT_ARM64cbz, - [SDNPHasChain]>; -def ARM64cbnz : SDNode<"ARM64ISD::CBNZ", SDT_ARM64cbz, - [SDNPHasChain]>; -def ARM64tbz : SDNode<"ARM64ISD::TBZ", SDT_ARM64tbz, - [SDNPHasChain]>; -def ARM64tbnz : SDNode<"ARM64ISD::TBNZ", SDT_ARM64tbz, - [SDNPHasChain]>; - - -def ARM64csel : SDNode<"ARM64ISD::CSEL", SDT_ARM64CSel>; -def ARM64csinv : SDNode<"ARM64ISD::CSINV", SDT_ARM64CSel>; -def ARM64csneg : SDNode<"ARM64ISD::CSNEG", SDT_ARM64CSel>; -def ARM64csinc : SDNode<"ARM64ISD::CSINC", SDT_ARM64CSel>; -def ARM64retflag : SDNode<"ARM64ISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def ARM64adc : SDNode<"ARM64ISD::ADC", SDTBinaryArithWithFlagsIn >; -def ARM64sbc : SDNode<"ARM64ISD::SBC", SDTBinaryArithWithFlagsIn>; -def ARM64add_flag : SDNode<"ARM64ISD::ADDS", SDTBinaryArithWithFlagsOut, - [SDNPCommutative]>; -def ARM64sub_flag : SDNode<"ARM64ISD::SUBS", SDTBinaryArithWithFlagsOut>; -def ARM64and_flag : SDNode<"ARM64ISD::ANDS", SDTBinaryArithWithFlagsOut>; -def ARM64adc_flag : SDNode<"ARM64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; -def ARM64sbc_flag : SDNode<"ARM64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; - -def ARM64threadpointer : SDNode<"ARM64ISD::THREAD_POINTER", SDTPtrLeaf>; - -def ARM64fcmp : SDNode<"ARM64ISD::FCMP", SDT_ARM64FCmp>; - -def ARM64fmax : SDNode<"ARM64ISD::FMAX", SDTFPBinOp>; -def ARM64fmin : SDNode<"ARM64ISD::FMIN", SDTFPBinOp>; - -def ARM64dup : SDNode<"ARM64ISD::DUP", SDT_ARM64Dup>; -def ARM64duplane8 : SDNode<"ARM64ISD::DUPLANE8", SDT_ARM64DupLane>; -def ARM64duplane16 : SDNode<"ARM64ISD::DUPLANE16", SDT_ARM64DupLane>; -def ARM64duplane32 : SDNode<"ARM64ISD::DUPLANE32", SDT_ARM64DupLane>; -def ARM64duplane64 : SDNode<"ARM64ISD::DUPLANE64", SDT_ARM64DupLane>; - -def ARM64zip1 : SDNode<"ARM64ISD::ZIP1", SDT_ARM64Zip>; -def ARM64zip2 : SDNode<"ARM64ISD::ZIP2", SDT_ARM64Zip>; -def ARM64uzp1 : SDNode<"ARM64ISD::UZP1", SDT_ARM64Zip>; -def ARM64uzp2 : SDNode<"ARM64ISD::UZP2", SDT_ARM64Zip>; -def ARM64trn1 : SDNode<"ARM64ISD::TRN1", SDT_ARM64Zip>; -def ARM64trn2 : SDNode<"ARM64ISD::TRN2", SDT_ARM64Zip>; - -def ARM64movi_edit : SDNode<"ARM64ISD::MOVIedit", SDT_ARM64MOVIedit>; -def ARM64movi_shift : 
SDNode<"ARM64ISD::MOVIshift", SDT_ARM64MOVIshift>; -def ARM64movi_msl : SDNode<"ARM64ISD::MOVImsl", SDT_ARM64MOVIshift>; -def ARM64mvni_shift : SDNode<"ARM64ISD::MVNIshift", SDT_ARM64MOVIshift>; -def ARM64mvni_msl : SDNode<"ARM64ISD::MVNImsl", SDT_ARM64MOVIshift>; -def ARM64movi : SDNode<"ARM64ISD::MOVI", SDT_ARM64MOVIedit>; -def ARM64fmov : SDNode<"ARM64ISD::FMOV", SDT_ARM64MOVIedit>; - -def ARM64rev16 : SDNode<"ARM64ISD::REV16", SDT_ARM64UnaryVec>; -def ARM64rev32 : SDNode<"ARM64ISD::REV32", SDT_ARM64UnaryVec>; -def ARM64rev64 : SDNode<"ARM64ISD::REV64", SDT_ARM64UnaryVec>; -def ARM64ext : SDNode<"ARM64ISD::EXT", SDT_ARM64ExtVec>; - -def ARM64vashr : SDNode<"ARM64ISD::VASHR", SDT_ARM64vshift>; -def ARM64vlshr : SDNode<"ARM64ISD::VLSHR", SDT_ARM64vshift>; -def ARM64vshl : SDNode<"ARM64ISD::VSHL", SDT_ARM64vshift>; -def ARM64sqshli : SDNode<"ARM64ISD::SQSHL_I", SDT_ARM64vshift>; -def ARM64uqshli : SDNode<"ARM64ISD::UQSHL_I", SDT_ARM64vshift>; -def ARM64sqshlui : SDNode<"ARM64ISD::SQSHLU_I", SDT_ARM64vshift>; -def ARM64srshri : SDNode<"ARM64ISD::SRSHR_I", SDT_ARM64vshift>; -def ARM64urshri : SDNode<"ARM64ISD::URSHR_I", SDT_ARM64vshift>; - -def ARM64not: SDNode<"ARM64ISD::NOT", SDT_ARM64unvec>; -def ARM64bit: SDNode<"ARM64ISD::BIT", SDT_ARM64trivec>; - -def ARM64cmeq: SDNode<"ARM64ISD::CMEQ", SDT_ARM64binvec>; -def ARM64cmge: SDNode<"ARM64ISD::CMGE", SDT_ARM64binvec>; -def ARM64cmgt: SDNode<"ARM64ISD::CMGT", SDT_ARM64binvec>; -def ARM64cmhi: SDNode<"ARM64ISD::CMHI", SDT_ARM64binvec>; -def ARM64cmhs: SDNode<"ARM64ISD::CMHS", SDT_ARM64binvec>; - -def ARM64fcmeq: SDNode<"ARM64ISD::FCMEQ", SDT_ARM64fcmp>; -def ARM64fcmge: SDNode<"ARM64ISD::FCMGE", SDT_ARM64fcmp>; -def ARM64fcmgt: SDNode<"ARM64ISD::FCMGT", SDT_ARM64fcmp>; - -def ARM64cmeqz: SDNode<"ARM64ISD::CMEQz", SDT_ARM64unvec>; -def ARM64cmgez: SDNode<"ARM64ISD::CMGEz", SDT_ARM64unvec>; -def ARM64cmgtz: SDNode<"ARM64ISD::CMGTz", SDT_ARM64unvec>; -def ARM64cmlez: SDNode<"ARM64ISD::CMLEz", SDT_ARM64unvec>; -def ARM64cmltz: SDNode<"ARM64ISD::CMLTz", SDT_ARM64unvec>; -def ARM64cmtst : PatFrag<(ops node:$LHS, node:$RHS), - (ARM64not (ARM64cmeqz (and node:$LHS, node:$RHS)))>; - -def ARM64fcmeqz: SDNode<"ARM64ISD::FCMEQz", SDT_ARM64fcmpz>; -def ARM64fcmgez: SDNode<"ARM64ISD::FCMGEz", SDT_ARM64fcmpz>; -def ARM64fcmgtz: SDNode<"ARM64ISD::FCMGTz", SDT_ARM64fcmpz>; -def ARM64fcmlez: SDNode<"ARM64ISD::FCMLEz", SDT_ARM64fcmpz>; -def ARM64fcmltz: SDNode<"ARM64ISD::FCMLTz", SDT_ARM64fcmpz>; - -def ARM64bici: SDNode<"ARM64ISD::BICi", SDT_ARM64vecimm>; -def ARM64orri: SDNode<"ARM64ISD::ORRi", SDT_ARM64vecimm>; - -def ARM64neg : SDNode<"ARM64ISD::NEG", SDT_ARM64unvec>; - -def ARM64tcret: SDNode<"ARM64ISD::TC_RETURN", SDT_ARM64TCRET, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; - -def ARM64Prefetch : SDNode<"ARM64ISD::PREFETCH", SDT_ARM64PREFETCH, - [SDNPHasChain, SDNPSideEffect]>; - -def ARM64sitof: SDNode<"ARM64ISD::SITOF", SDT_ARM64ITOF>; -def ARM64uitof: SDNode<"ARM64ISD::UITOF", SDT_ARM64ITOF>; - -def ARM64tlsdesc_call : SDNode<"ARM64ISD::TLSDESC_CALL", SDT_ARM64TLSDescCall, - [SDNPInGlue, SDNPOutGlue, SDNPHasChain, - SDNPVariadic]>; - -def ARM64WrapperLarge : SDNode<"ARM64ISD::WrapperLarge", SDT_ARM64WrapperLarge>; - - -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// - -// ARM64 Instruction Predicate Definitions. 
-// -def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; -def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; -def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; -def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; -def ForCodeSize : Predicate<"ForCodeSize">; -def NotForCodeSize : Predicate<"!ForCodeSize">; - -include "ARM64InstrFormats.td" - -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Miscellaneous instructions. -//===----------------------------------------------------------------------===// - -let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - [(ARM64callseq_start timm:$amt)]>; -def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), - [(ARM64callseq_end timm:$amt1, timm:$amt2)]>; -} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 - -let isReMaterializable = 1, isCodeGenOnly = 1 in { -// FIXME: The following pseudo instructions are only needed because remat -// cannot handle multiple instructions. When that changes, they can be -// removed, along with the ARM64Wrapper node. - -let AddedComplexity = 10 in -def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr), - [(set GPR64:$dst, (ARM64LOADgot tglobaladdr:$addr))]>, - Sched<[WriteLDAdr]>; - -// The MOVaddr instruction should match only when the add is not folded -// into a load or store address. -def MOVaddr - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tglobaladdr:$hi), - tglobaladdr:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrJT - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tjumptable:$hi), - tjumptable:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrCP - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tconstpool:$hi), - tconstpool:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrBA - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tblockaddress:$hi), - tblockaddress:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrTLS - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tglobaltlsaddr:$hi), - tglobaltlsaddr:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrEXT - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp texternalsym:$hi), - texternalsym:$low))]>, - Sched<[WriteAdrAdr]>; - -} // isReMaterializable, isCodeGenOnly - -def : Pat<(ARM64LOADgot tglobaltlsaddr:$addr), - (LOADgot tglobaltlsaddr:$addr)>; - -def : Pat<(ARM64LOADgot texternalsym:$addr), - (LOADgot texternalsym:$addr)>; - -def : Pat<(ARM64LOADgot tconstpool:$addr), - (LOADgot tconstpool:$addr)>; - -//===----------------------------------------------------------------------===// -// System instructions. -//===----------------------------------------------------------------------===// - -def HINT : HintI<"hint">; -def : InstAlias<"nop", (HINT 0b000)>; -def : InstAlias<"yield",(HINT 0b001)>; -def : InstAlias<"wfe", (HINT 0b010)>; -def : InstAlias<"wfi", (HINT 0b011)>; -def : InstAlias<"sev", (HINT 0b100)>; -def : InstAlias<"sevl", (HINT 0b101)>; - - // As far as LLVM is concerned this writes to the system's exclusive monitors. 
-let mayLoad = 1, mayStore = 1 in -def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">; - -def DMB : CRmSystemI<barrier_op, 0b101, "dmb">; -def DSB : CRmSystemI<barrier_op, 0b100, "dsb">; -def ISB : CRmSystemI<barrier_op, 0b110, "isb">; -def : InstAlias<"clrex", (CLREX 0xf)>; -def : InstAlias<"isb", (ISB 0xf)>; - -def MRS : MRSI; -def MSR : MSRI; -def MSRcpsr: MSRcpsrI; - -// The thread pointer (on Linux, at least, where this has been implemented) is -// TPIDR_EL0. -def : Pat<(ARM64threadpointer), (MRS 0xde82)>; - -// Generic system instructions -def SYS : SystemI<0, "sys">; -def SYSxt : SystemXtI<0, "sys">; -def SYSLxt : SystemLXtI<1, "sysl">; - -//===----------------------------------------------------------------------===// -// Move immediate instructions. -//===----------------------------------------------------------------------===// - -defm MOVK : InsertImmediate<0b11, "movk">; -defm MOVN : MoveImmediate<0b00, "movn">; - -let PostEncoderMethod = "fixMOVZ" in -defm MOVZ : MoveImmediate<0b10, "movz">; - -def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>; - -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>; - -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g2:$sym, 32)>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>; - -let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1, - isAsCheapAsAMove = 1 in { -// FIXME: The following pseudo instructions are only needed because remat -// cannot handle multiple instructions. When that changes, we can select -// directly to the real instructions and get rid of these pseudos.
- -def MOVi32imm - : Pseudo<(outs GPR32:$dst), (ins i32imm:$src), - [(set GPR32:$dst, imm:$src)]>, - Sched<[WriteImm]>; -def MOVi64imm - : Pseudo<(outs GPR64:$dst), (ins i64imm:$src), - [(set GPR64:$dst, imm:$src)]>, - Sched<[WriteImm]>; -} // isReMaterializable, isCodeGenOnly - -def : Pat<(ARM64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, - tglobaladdr:$g1, tglobaladdr:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48), - tglobaladdr:$g2, 32), - tglobaladdr:$g1, 16), - tglobaladdr:$g0, 0)>; - -def : Pat<(ARM64WrapperLarge tblockaddress:$g3, tblockaddress:$g2, - tblockaddress:$g1, tblockaddress:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48), - tblockaddress:$g2, 32), - tblockaddress:$g1, 16), - tblockaddress:$g0, 0)>; - -def : Pat<(ARM64WrapperLarge tconstpool:$g3, tconstpool:$g2, - tconstpool:$g1, tconstpool:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48), - tconstpool:$g2, 32), - tconstpool:$g1, 16), - tconstpool:$g0, 0)>; - - -//===----------------------------------------------------------------------===// -// Arithmetic instructions. -//===----------------------------------------------------------------------===// - -// Add/subtract with carry. -defm ADC : AddSubCarry<0, "adc", "adcs", ARM64adc, ARM64adc_flag>; -defm SBC : AddSubCarry<1, "sbc", "sbcs", ARM64sbc, ARM64sbc_flag>; - -def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>; -def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>; -def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>; -def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>; - -// Add/subtract -defm ADD : AddSub<0, "add", add>; -defm SUB : AddSub<1, "sub">; - -defm ADDS : AddSubS<0, "adds", ARM64add_flag>; -defm SUBS : AddSubS<1, "subs", ARM64sub_flag>; - -// Use SUBS instead of SUB to enable CSE between SUBS and SUB. -def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm), - (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>; -def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm), - (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>; -def : Pat<(sub GPR32:$Rn, GPR32:$Rm), - (SUBSWrr GPR32:$Rn, GPR32:$Rm)>; -def : Pat<(sub GPR64:$Rn, GPR64:$Rm), - (SUBSXrr GPR64:$Rn, GPR64:$Rm)>; -def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), - (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>; -def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), - (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; -def : Pat<(sub GPR32sp:$R2, arith_extended_reg32:$R3), - (SUBSWrx GPR32sp:$R2, arith_extended_reg32:$R3)>; -def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64:$R3), - (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64:$R3)>; - -// Because of the immediate format for add/sub-imm instructions, the -// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). -// These patterns capture that transformation. 
-let AddedComplexity = 1 in { -def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -} - -def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0)>; -def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0)>; -def : InstAlias<"neg $dst, $src, $shift", - (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift:$shift)>; -def : InstAlias<"neg $dst, $src, $shift", - (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift:$shift)>; - -// Because of the immediate format for add/sub-imm instructions, the -// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). -// These patterns capture that transformation. -let AddedComplexity = 1 in { -def : Pat<(ARM64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(ARM64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -def : Pat<(ARM64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(ARM64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -} - -def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0)>; -def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0)>; -def : InstAlias<"negs $dst, $src, $shift", - (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift:$shift)>; -def : InstAlias<"negs $dst, $src, $shift", - (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift:$shift)>; - -// Unsigned/Signed divide -defm UDIV : Div<0, "udiv", udiv>; -defm SDIV : Div<1, "sdiv", sdiv>; -let isCodeGenOnly = 1 in { -defm UDIV_Int : Div<0, "udiv", int_arm64_udiv>; -defm SDIV_Int : Div<1, "sdiv", int_arm64_sdiv>; -} - -// Variable shift -defm ASRV : Shift<0b10, "asrv", sra>; -defm LSLV : Shift<0b00, "lslv", shl>; -defm LSRV : Shift<0b01, "lsrv", srl>; -defm RORV : Shift<0b11, "rorv", rotr>; - -def : ShiftAlias<"asr", ASRVWr, GPR32>; -def : ShiftAlias<"asr", ASRVXr, GPR64>; -def : ShiftAlias<"lsl", LSLVWr, GPR32>; -def : ShiftAlias<"lsl", LSLVXr, GPR64>; -def : ShiftAlias<"lsr", LSRVWr, GPR32>; -def : ShiftAlias<"lsr", LSRVXr, GPR64>; -def : ShiftAlias<"ror", RORVWr, GPR32>; -def : ShiftAlias<"ror", RORVXr, GPR64>; - -// Multiply-add -let AddedComplexity = 7 in { -defm MADD : MulAccum<0, "madd", add>; -defm MSUB : MulAccum<1, "msub", sub>; - -def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)), - (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; -def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)), - (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; - -def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))), - (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; -def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))), - (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; -} // AddedComplexity = 7 - -let AddedComplexity = 5 in { -def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>; -def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>; -def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>; -def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>; - -def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))), - (SMADDLrrr 
GPR32:$Rn, GPR32:$Rm, XZR)>; -def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))), - (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; - -def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))), - (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; -def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))), - (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; -} // AddedComplexity = 5 - -def : MulAccumWAlias<"mul", MADDWrrr>; -def : MulAccumXAlias<"mul", MADDXrrr>; -def : MulAccumWAlias<"mneg", MSUBWrrr>; -def : MulAccumXAlias<"mneg", MSUBXrrr>; -def : WideMulAccumAlias<"smull", SMADDLrrr>; -def : WideMulAccumAlias<"smnegl", SMSUBLrrr>; -def : WideMulAccumAlias<"umull", UMADDLrrr>; -def : WideMulAccumAlias<"umnegl", UMSUBLrrr>; - -// Multiply-high -def SMULHrr : MulHi<0b010, "smulh", mulhs>; -def UMULHrr : MulHi<0b110, "umulh", mulhu>; - -// CRC32 -def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_arm64_crc32b, "crc32b">; -def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_arm64_crc32h, "crc32h">; -def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_arm64_crc32w, "crc32w">; -def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_arm64_crc32x, "crc32x">; - -def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_arm64_crc32cb, "crc32cb">; -def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_arm64_crc32ch, "crc32ch">; -def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_arm64_crc32cw, "crc32cw">; -def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_arm64_crc32cx, "crc32cx">; - - -//===----------------------------------------------------------------------===// -// Logical instructions. -//===----------------------------------------------------------------------===// - -// (immediate) -defm ANDS : LogicalImmS<0b11, "ands", ARM64and_flag>; -defm AND : LogicalImm<0b00, "and", and>; -defm EOR : LogicalImm<0b10, "eor", xor>; -defm ORR : LogicalImm<0b01, "orr", or>; - -def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR, - logical_imm32:$imm)>; -def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR, - logical_imm64:$imm)>; - - -// (register) -defm ANDS : LogicalRegS<0b11, 0, "ands">; -defm BICS : LogicalRegS<0b11, 1, "bics">; -defm AND : LogicalReg<0b00, 0, "and", and>; -defm BIC : LogicalReg<0b00, 1, "bic", - BinOpFrag<(and node:$LHS, (not node:$RHS))>>; -defm EON : LogicalReg<0b10, 1, "eon", - BinOpFrag<(xor node:$LHS, (not node:$RHS))>>; -defm EOR : LogicalReg<0b10, 0, "eor", xor>; -defm ORN : LogicalReg<0b01, 1, "orn", - BinOpFrag<(or node:$LHS, (not node:$RHS))>>; -defm ORR : LogicalReg<0b01, 0, "orr", or>; - -def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0)>; -def : InstAlias<"mov $dst, $src", - (ADDWri GPR32sp:$dst, GPR32sp:$src, 0, 0)>; -def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0)>; -def : InstAlias<"mov $dst, $src", - (ADDXri GPR64sp:$dst, GPR64sp:$src, 0, 0)>; - -def : InstAlias<"tst $src1, $src2", - (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2)>; -def : InstAlias<"tst $src1, $src2", - (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2)>; - -def : InstAlias<"tst $src1, $src2", - (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0)>; -def : InstAlias<"tst $src1, $src2", - (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0)>; - -def : InstAlias<"tst $src1, $src2, $sh", - (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift:$sh)>; -def : InstAlias<"tst $src1, $src2, $sh", - (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift:$sh)>; - -def : InstAlias<"mvn $Wd, $Wm", - (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0)>; -def : InstAlias<"mvn $Xd, $Xm", - (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0)>; - 
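The mov, tst, and mvn aliases above all lean on the zero register: each is a base logical instruction with WZR/XZR wired into one operand. The extra ADDri form of mov exists because register number 31 means SP rather than the zero register in the add-immediate encoding, so moves involving the stack pointer cannot use ORR. A rough C++ sketch of the three identities (illustrative only; the real semantics live in the instruction definitions):

    #include <cassert>
    #include <cstdint>

    // mov Wd, Wm  ->  orr  Wd, wzr, Wm      (0 | x == x)
    // mvn Wd, Wm  ->  orn  Wd, wzr, Wm      (0 | ~x == ~x)
    // tst Wn, Wm  ->  ands wzr, Wn, Wm      (result dropped, NZCV kept)
    uint32_t mov_alias(uint32_t m) { return 0u | m; }
    uint32_t mvn_alias(uint32_t m) { return 0u | ~m; }
    bool tst_z_flag(uint32_t n, uint32_t m) { return (n & m) == 0; }

    int main() {
      assert(mov_alias(0x1234u) == 0x1234u);
      assert(mvn_alias(0u) == 0xffffffffu);
      assert(tst_z_flag(0xf0u, 0x0fu));   // no common bits -> Z set
    }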
-def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>;
-def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;
-
-
-//===----------------------------------------------------------------------===//
-// One operand data processing instructions.
-//===----------------------------------------------------------------------===//
-
-defm CLS  : OneOperandData<0b101, "cls">;
-defm CLZ  : OneOperandData<0b100, "clz", ctlz>;
-defm RBIT : OneOperandData<0b000, "rbit">;
-def REV16Wr : OneWRegData<0b001, "rev16",
-                          UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
-def REV16Xr : OneXRegData<0b001, "rev16",
-                          UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
-
-def : Pat<(cttz GPR32:$Rn),
-          (CLZWr (RBITWr GPR32:$Rn))>;
-def : Pat<(cttz GPR64:$Rn),
-          (CLZXr (RBITXr GPR64:$Rn))>;
-
-// Unlike the other one operand instructions, the instructions with the "rev"
-// mnemonic do *not* just differ in the size bit, but actually use different
-// opcode bits for the different sizes.
-def REVWr   : OneWRegData<0b010, "rev", bswap>;
-def REVXr   : OneXRegData<0b011, "rev", bswap>;
-def REV32Xr : OneXRegData<0b010, "rev32",
-                          UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>;
-
-//===----------------------------------------------------------------------===//
-// Bitfield immediate extraction instruction.
-//===----------------------------------------------------------------------===//
-let neverHasSideEffects = 1 in
-defm EXTR : ExtractImm<"extr">;
-def : InstAlias<"ror $dst, $src, $shift",
-                (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
-def : InstAlias<"ror $dst, $src, $shift",
-                (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>;
-
-def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)),
-          (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>;
-def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
-          (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>;
-
-//===----------------------------------------------------------------------===//
-// Other bitfield immediate instructions.
-//===----------------------------------------------------------------------===//
-let neverHasSideEffects = 1 in {
-defm BFM  : BitfieldImmWith2RegArgs<0b01, "bfm">;
-defm SBFM : BitfieldImm<0b00, "sbfm">;
-defm UBFM : BitfieldImm<0b10, "ubfm">;
-}
-
-def i32shift_a : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = (32 - N->getZExtValue()) & 0x1f;
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-def i32shift_b : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = 31 - N->getZExtValue();
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-// min(7, 31 - shift_amt)
-def i32shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = 31 - N->getZExtValue();
-  enc = enc > 7 ? 7 : enc;
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-// min(15, 31 - shift_amt)
-def i32shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = 31 - N->getZExtValue();
-  enc = enc > 15 ? 15 : enc;
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-def i64shift_a : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = (64 - N->getZExtValue()) & 0x3f;
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-def i64shift_b : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = 63 - N->getZExtValue();
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-// min(7, 63 - shift_amt)
-def i64shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = 63 - N->getZExtValue();
-  enc = enc > 7 ? 7 : enc;
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-// min(15, 63 - shift_amt)
-def i64shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = 63 - N->getZExtValue();
-  enc = enc > 15 ? 15 : enc;
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-// min(31, 63 - shift_amt)
-def i64shift_sext_i32 : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = 63 - N->getZExtValue();
-  enc = enc > 31 ? 31 : enc;
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)),
-          (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
-                              (i64 (i32shift_b imm0_31:$imm)))>;
-def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)),
-          (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
-                              (i64 (i64shift_b imm0_63:$imm)))>;
-
-let AddedComplexity = 10 in {
-def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)),
-          (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
-def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)),
-          (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
-}
-
-def : InstAlias<"asr $dst, $src, $shift",
-                (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
-def : InstAlias<"asr $dst, $src, $shift",
-                (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
-def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
-def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
-def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
-def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
-def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
-
-def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)),
-          (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
-def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)),
-          (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
-
-def : InstAlias<"lsr $dst, $src, $shift",
-                (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
-def : InstAlias<"lsr $dst, $src, $shift",
-                (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
-def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
-def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
-def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
-def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
-def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
-
-//===----------------------------------------------------------------------===//
-// Conditionally set flags instructions.
-//===----------------------------------------------------------------------===//
-defm CCMN : CondSetFlagsImm<0, "ccmn">;
-defm CCMP : CondSetFlagsImm<1, "ccmp">;
-
-defm CCMN : CondSetFlagsReg<0, "ccmn">;
-defm CCMP : CondSetFlagsReg<1, "ccmp">;
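To see what the i32shift_a/i32shift_b transforms earlier in this block compute: an immediate lsl by s is selected as UBFM with immr = (32 - s) & 31 and imms = 31 - s, which moves the low 32 - s bits of the source up by s. A self-checking C++ sketch of that equivalence, using my own reconstruction of the imms < immr case of UBFM rather than LLVM's code:

    #include <cassert>
    #include <cstdint>

    // UBFM Wd, Wn, #immr, #imms with imms < immr: take bits [imms:0] of Wn
    // and place them so the field starts at bit (32 - immr).
    static uint32_t ubfm32(uint32_t rn, unsigned immr, unsigned imms) {
      uint32_t field = rn & ((1u << (imms + 1)) - 1);
      return field << ((32 - immr) & 31);
    }

    int main() {
      for (unsigned s = 1; s < 32; ++s) {
        unsigned immr = (32 - s) & 0x1f;   // i32shift_a
        unsigned imms = 31 - s;            // i32shift_b
        for (uint32_t x : {0u, 1u, 0xdeadbeefu, ~0u})
          assert(ubfm32(x, immr, imms) == x << s);   // lsl == ubfm
      }
    }

The sra and srl patterns above need no transform at all: SBFM/UBFM with imms fixed at 31 (or 63) are exactly asr and lsr.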
-
-//===----------------------------------------------------------------------===//
-// Conditional select instructions.
-//===----------------------------------------------------------------------===//
-defm CSEL : CondSelect<0, 0b00, "csel">;
-
-def inc : PatFrag<(ops node:$in), (add node:$in, 1)>;
-defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>;
-defm CSINV : CondSelectOp<1, 0b00, "csinv", not>;
-defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>;
-
-def : Pat<(ARM64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), CPSR),
-          (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
-def : Pat<(ARM64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), CPSR),
-          (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
-def : Pat<(ARM64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), CPSR),
-          (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
-def : Pat<(ARM64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), CPSR),
-          (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
-def : Pat<(ARM64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), CPSR),
-          (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
-def : Pat<(ARM64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), CPSR),
-          (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
-
-def : Pat<(ARM64csel (i32 0), (i32 1), (i32 imm:$cc), CPSR),
-          (CSINCWr WZR, WZR, (i32 imm:$cc))>;
-def : Pat<(ARM64csel (i64 0), (i64 1), (i32 imm:$cc), CPSR),
-          (CSINCXr XZR, XZR, (i32 imm:$cc))>;
-def : Pat<(ARM64csel (i32 0), (i32 -1), (i32 imm:$cc), CPSR),
-          (CSINVWr WZR, WZR, (i32 imm:$cc))>;
-def : Pat<(ARM64csel (i64 0), (i64 -1), (i32 imm:$cc), CPSR),
-          (CSINVXr XZR, XZR, (i32 imm:$cc))>;
-
-// The inverse of the condition code from the alias instruction is what is used
-// in the aliased instruction. The parser already inverts the condition code
-// for these aliases.
-// FIXME: Is this the correct way to handle these aliases?
-def : InstAlias<"cset $dst, $cc", (CSINCWr GPR32:$dst, WZR, WZR, ccode:$cc)>;
-def : InstAlias<"cset $dst, $cc", (CSINCXr GPR64:$dst, XZR, XZR, ccode:$cc)>;
-
-def : InstAlias<"csetm $dst, $cc", (CSINVWr GPR32:$dst, WZR, WZR, ccode:$cc)>;
-def : InstAlias<"csetm $dst, $cc", (CSINVXr GPR64:$dst, XZR, XZR, ccode:$cc)>;
-
-def : InstAlias<"cinc $dst, $src, $cc",
-                (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, ccode:$cc)>;
-def : InstAlias<"cinc $dst, $src, $cc",
-                (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, ccode:$cc)>;
-
-def : InstAlias<"cinv $dst, $src, $cc",
-                (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, ccode:$cc)>;
-def : InstAlias<"cinv $dst, $src, $cc",
-                (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, ccode:$cc)>;
-
-def : InstAlias<"cneg $dst, $src, $cc",
-                (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, ccode:$cc)>;
-def : InstAlias<"cneg $dst, $src, $cc",
-                (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, ccode:$cc)>;
-
-//===----------------------------------------------------------------------===//
-// PC-relative instructions.
-//===----------------------------------------------------------------------===//
-let isReMaterializable = 1 in {
-let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in {
-def ADR : ADRI<0, "adr", adrlabel, []>;
-} // neverHasSideEffects = 1
-
-def ADRP : ADRI<1, "adrp", adrplabel,
-                [(set GPR64:$Xd, (ARM64adrp tglobaladdr:$label))]>;
-} // isReMaterializable = 1
-
-// page address of a constant pool entry, block address
-def : Pat<(ARM64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>;
-def : Pat<(ARM64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>;
-
-//===----------------------------------------------------------------------===//
-// Unconditional branch (register) instructions.
-//===----------------------------------------------------------------------===// - -let isReturn = 1, isTerminator = 1, isBarrier = 1 in { -def RET : BranchReg<0b0010, "ret", []>; -def DRPS : SpecialReturn<0b0101, "drps">; -def ERET : SpecialReturn<0b0100, "eret">; -} // isReturn = 1, isTerminator = 1, isBarrier = 1 - -// Default to the LR register. -def : InstAlias<"ret", (RET LR)>; - -let isCall = 1, Defs = [LR], Uses = [SP] in { -def BLR : BranchReg<0b0001, "blr", [(ARM64call GPR64:$Rn)]>; -} // isCall - -let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { -def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; -} // isBranch, isTerminator, isBarrier, isIndirectBranch - -// Create a separate pseudo-instruction for codegen to use so that we don't -// flag lr as used in every function. It'll be restored before the RET by the -// epilogue if it's legitimately used. -def RET_ReallyLR : Pseudo<(outs), (ins), [(ARM64retflag)]> { - let isTerminator = 1; - let isBarrier = 1; - let isReturn = 1; -} - -// This is a directive-like pseudo-instruction. The purpose is to insert an -// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction -// (which in the usual case is a BLR). -let hasSideEffects = 1 in -def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> { - let AsmString = ".tlsdesccall $sym"; -} - -// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It -// gets expanded to two MCInsts during lowering. -let isCall = 1, Defs = [LR] in -def TLSDESC_BLR - : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym), - [(ARM64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>; - -def : Pat<(ARM64tlsdesc_call GPR64:$dest, texternalsym:$sym), - (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>; -//===----------------------------------------------------------------------===// -// Conditional branch (immediate) instruction. -//===----------------------------------------------------------------------===// -def Bcc : BranchCond; - -//===----------------------------------------------------------------------===// -// Compare-and-branch instructions. -//===----------------------------------------------------------------------===// -defm CBZ : CmpBranch<0, "cbz", ARM64cbz>; -defm CBNZ : CmpBranch<1, "cbnz", ARM64cbnz>; - -//===----------------------------------------------------------------------===// -// Test-bit-and-branch instructions. -//===----------------------------------------------------------------------===// -def TBZ : TestBranch<0, "tbz", ARM64tbz>; -def TBNZ : TestBranch<1, "tbnz", ARM64tbnz>; - -//===----------------------------------------------------------------------===// -// Unconditional branch (immediate) instructions. -//===----------------------------------------------------------------------===// -let isBranch = 1, isTerminator = 1, isBarrier = 1 in { -def B : BranchImm<0, "b", [(br bb:$addr)]>; -} // isBranch, isTerminator, isBarrier - -let isCall = 1, Defs = [LR], Uses = [SP] in { -def BL : CallImm<1, "bl", [(ARM64call tglobaladdr:$addr)]>; -} // isCall -def : Pat<(ARM64call texternalsym:$func), (BL texternalsym:$func)>; - -//===----------------------------------------------------------------------===// -// Exception generation instructions. 
-//===----------------------------------------------------------------------===// -def BRK : ExceptionGeneration<0b001, 0b00, "brk">; -def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">; -def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">; -def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">; -def HLT : ExceptionGeneration<0b010, 0b00, "hlt">; -def HVC : ExceptionGeneration<0b000, 0b10, "hvc">; -def SMC : ExceptionGeneration<0b000, 0b11, "smc">; -def SVC : ExceptionGeneration<0b000, 0b01, "svc">; - -// DCPSn defaults to an immediate operand of zero if unspecified. -def : InstAlias<"dcps1", (DCPS1 0)>; -def : InstAlias<"dcps2", (DCPS2 0)>; -def : InstAlias<"dcps3", (DCPS3 0)>; - -//===----------------------------------------------------------------------===// -// Load instructions. -//===----------------------------------------------------------------------===// - -// Pair (indexed, offset) -def LDPWi : LoadPairOffset<0b00, 0, GPR32, am_indexed32simm7, "ldp">; -def LDPXi : LoadPairOffset<0b10, 0, GPR64, am_indexed64simm7, "ldp">; -def LDPSi : LoadPairOffset<0b00, 1, FPR32, am_indexed32simm7, "ldp">; -def LDPDi : LoadPairOffset<0b01, 1, FPR64, am_indexed64simm7, "ldp">; -def LDPQi : LoadPairOffset<0b10, 1, FPR128, am_indexed128simm7, "ldp">; - -def LDPSWi : LoadPairOffset<0b01, 0, GPR64, am_indexed32simm7, "ldpsw">; - -// Pair (pre-indexed) -def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, am_indexed32simm7, "ldp">; -def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, am_indexed64simm7, "ldp">; -def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, am_indexed32simm7, "ldp">; -def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, am_indexed64simm7, "ldp">; -def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, am_indexed128simm7, "ldp">; - -def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, am_indexed32simm7, "ldpsw">; - -// Pair (post-indexed) -def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">; -def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">; -def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">; -def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">; -def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">; - -def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; - - -// Pair (no allocate) -def LDNPWi : LoadPairNoAlloc<0b00, 0, GPR32, am_indexed32simm7, "ldnp">; -def LDNPXi : LoadPairNoAlloc<0b10, 0, GPR64, am_indexed64simm7, "ldnp">; -def LDNPSi : LoadPairNoAlloc<0b00, 1, FPR32, am_indexed32simm7, "ldnp">; -def LDNPDi : LoadPairNoAlloc<0b01, 1, FPR64, am_indexed64simm7, "ldnp">; -def LDNPQi : LoadPairNoAlloc<0b10, 1, FPR128, am_indexed128simm7, "ldnp">; - -//--- -// (register offset) -//--- - -let AddedComplexity = 10 in { -// Integer -def LDRBBro : Load8RO<0b00, 0, 0b01, GPR32, "ldrb", - [(set GPR32:$Rt, (zextloadi8 ro_indexed8:$addr))]>; -def LDRHHro : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", - [(set GPR32:$Rt, (zextloadi16 ro_indexed16:$addr))]>; -def LDRWro : Load32RO<0b10, 0, 0b01, GPR32, "ldr", - [(set GPR32:$Rt, (load ro_indexed32:$addr))]>; -def LDRXro : Load64RO<0b11, 0, 0b01, GPR64, "ldr", - [(set GPR64:$Rt, (load ro_indexed64:$addr))]>; - -// Floating-point -def LDRBro : Load8RO<0b00, 1, 0b01, FPR8, "ldr", - [(set FPR8:$Rt, (load ro_indexed8:$addr))]>; -def LDRHro : Load16RO<0b01, 1, 0b01, FPR16, "ldr", - [(set FPR16:$Rt, (load ro_indexed16:$addr))]>; -def LDRSro : Load32RO<0b10, 1, 0b01, FPR32, "ldr", - [(set (f32 FPR32:$Rt), (load ro_indexed32:$addr))]>; -def LDRDro : Load64RO<0b11, 1, 0b01, FPR64, "ldr", - [(set (f64 FPR64:$Rt), (load 
ro_indexed64:$addr))]>; -def LDRQro : Load128RO<0b00, 1, 0b11, FPR128, "ldr", []> { - let mayLoad = 1; -} - -// For regular load, we do not have any alignment requirement. -// Thus, it is safe to directly map the vector loads with interesting -// addressing modes. -// FIXME: We could do the same for bitconvert to floating point vectors. -def : Pat <(v8i8 (scalar_to_vector (i32 (extloadi8 ro_indexed8:$addr)))), - (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), - (LDRBro ro_indexed8:$addr), bsub)>; -def : Pat <(v16i8 (scalar_to_vector (i32 (extloadi8 ro_indexed8:$addr)))), - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (LDRBro ro_indexed8:$addr), bsub)>; -def : Pat <(v4i16 (scalar_to_vector (i32 (extloadi16 ro_indexed16:$addr)))), - (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), - (LDRHro ro_indexed16:$addr), hsub)>; -def : Pat <(v8i16 (scalar_to_vector (i32 (extloadi16 ro_indexed16:$addr)))), - (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), - (LDRHro ro_indexed16:$addr), hsub)>; -def : Pat <(v2i32 (scalar_to_vector (i32 (load ro_indexed32:$addr)))), - (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), - (LDRSro ro_indexed32:$addr), ssub)>; -def : Pat <(v4i32 (scalar_to_vector (i32 (load ro_indexed32:$addr)))), - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), - (LDRSro ro_indexed32:$addr), ssub)>; -def : Pat <(v1i64 (scalar_to_vector (i64 (load ro_indexed64:$addr)))), - (LDRDro ro_indexed64:$addr)>; -def : Pat <(v2i64 (scalar_to_vector (i64 (load ro_indexed64:$addr)))), - (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), - (LDRDro ro_indexed64:$addr), dsub)>; - -// Match all load 64 bits width whose type is compatible with FPR64 -def : Pat<(v2f32 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>; -def : Pat<(v1f64 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>; -def : Pat<(v8i8 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>; -def : Pat<(v4i16 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>; -def : Pat<(v2i32 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>; -def : Pat<(v1i64 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>; - -// Match all load 128 bits width whose type is compatible with FPR128 -def : Pat<(v4f32 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>; -def : Pat<(v2f64 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>; -def : Pat<(v16i8 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>; -def : Pat<(v8i16 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>; -def : Pat<(v4i32 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>; -def : Pat<(v2i64 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>; -def : Pat<(f128 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>; - -// Load sign-extended half-word -def LDRSHWro : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", - [(set GPR32:$Rt, (sextloadi16 ro_indexed16:$addr))]>; -def LDRSHXro : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", - [(set GPR64:$Rt, (sextloadi16 ro_indexed16:$addr))]>; - -// Load sign-extended byte -def LDRSBWro : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", - [(set GPR32:$Rt, (sextloadi8 ro_indexed8:$addr))]>; -def LDRSBXro : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", - [(set GPR64:$Rt, (sextloadi8 ro_indexed8:$addr))]>; - -// Load sign-extended word -def LDRSWro : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", - [(set GPR64:$Rt, (sextloadi32 ro_indexed32:$addr))]>; - -// Pre-fetch. 
-def PRFMro : PrefetchRO<0b11, 0, 0b10, "prfm", - [(ARM64Prefetch imm:$Rt, ro_indexed64:$addr)]>; - -// zextload -> i64 -def : Pat<(i64 (zextloadi8 ro_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>; -def : Pat<(i64 (zextloadi16 ro_indexed16:$addr)), - (SUBREG_TO_REG (i64 0), (LDRHHro ro_indexed16:$addr), sub_32)>; - -// zextloadi1 -> zextloadi8 -def : Pat<(i32 (zextloadi1 ro_indexed8:$addr)), (LDRBBro ro_indexed8:$addr)>; -def : Pat<(i64 (zextloadi1 ro_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>; - -// extload -> zextload -def : Pat<(i32 (extloadi16 ro_indexed16:$addr)), (LDRHHro ro_indexed16:$addr)>; -def : Pat<(i32 (extloadi8 ro_indexed8:$addr)), (LDRBBro ro_indexed8:$addr)>; -def : Pat<(i32 (extloadi1 ro_indexed8:$addr)), (LDRBBro ro_indexed8:$addr)>; -def : Pat<(i64 (extloadi32 ro_indexed32:$addr)), - (SUBREG_TO_REG (i64 0), (LDRWro ro_indexed32:$addr), sub_32)>; -def : Pat<(i64 (extloadi16 ro_indexed16:$addr)), - (SUBREG_TO_REG (i64 0), (LDRHHro ro_indexed16:$addr), sub_32)>; -def : Pat<(i64 (extloadi8 ro_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>; -def : Pat<(i64 (extloadi1 ro_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>; - -} // AddedComplexity = 10 - -//--- -// (unsigned immediate) -//--- -def LDRXui : LoadUI<0b11, 0, 0b01, GPR64, am_indexed64, "ldr", - [(set GPR64:$Rt, (load am_indexed64:$addr))]>; -def LDRWui : LoadUI<0b10, 0, 0b01, GPR32, am_indexed32, "ldr", - [(set GPR32:$Rt, (load am_indexed32:$addr))]>; -def LDRBui : LoadUI<0b00, 1, 0b01, FPR8, am_indexed8, "ldr", - [(set FPR8:$Rt, (load am_indexed8:$addr))]>; -def LDRHui : LoadUI<0b01, 1, 0b01, FPR16, am_indexed16, "ldr", - [(set FPR16:$Rt, (load am_indexed16:$addr))]>; -def LDRSui : LoadUI<0b10, 1, 0b01, FPR32, am_indexed32, "ldr", - [(set (f32 FPR32:$Rt), (load am_indexed32:$addr))]>; -def LDRDui : LoadUI<0b11, 1, 0b01, FPR64, am_indexed64, "ldr", - [(set (f64 FPR64:$Rt), (load am_indexed64:$addr))]>; -def LDRQui : LoadUI<0b00, 1, 0b11, FPR128, am_indexed128, "ldr", - [(set (f128 FPR128:$Rt), (load am_indexed128:$addr))]>; - -// For regular load, we do not have any alignment requirement. -// Thus, it is safe to directly map the vector loads with interesting -// addressing modes. -// FIXME: We could do the same for bitconvert to floating point vectors. 
-def : Pat <(v8i8 (scalar_to_vector (i32 (extloadi8 am_indexed8:$addr)))), - (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), - (LDRBui am_indexed8:$addr), bsub)>; -def : Pat <(v16i8 (scalar_to_vector (i32 (extloadi8 am_indexed8:$addr)))), - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (LDRBui am_indexed8:$addr), bsub)>; -def : Pat <(v4i16 (scalar_to_vector (i32 (extloadi16 am_indexed16:$addr)))), - (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), - (LDRHui am_indexed16:$addr), hsub)>; -def : Pat <(v8i16 (scalar_to_vector (i32 (extloadi16 am_indexed16:$addr)))), - (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), - (LDRHui am_indexed16:$addr), hsub)>; -def : Pat <(v2i32 (scalar_to_vector (i32 (load am_indexed32:$addr)))), - (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), - (LDRSui am_indexed32:$addr), ssub)>; -def : Pat <(v4i32 (scalar_to_vector (i32 (load am_indexed32:$addr)))), - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), - (LDRSui am_indexed32:$addr), ssub)>; -def : Pat <(v1i64 (scalar_to_vector (i64 (load am_indexed64:$addr)))), - (LDRDui am_indexed64:$addr)>; -def : Pat <(v2i64 (scalar_to_vector (i64 (load am_indexed64:$addr)))), - (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), - (LDRDui am_indexed64:$addr), dsub)>; - -// Match all load 64 bits width whose type is compatible with FPR64 -def : Pat<(v2f32 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>; -def : Pat<(v1f64 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>; -def : Pat<(v8i8 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>; -def : Pat<(v4i16 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>; -def : Pat<(v2i32 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>; -def : Pat<(v1i64 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>; - -// Match all load 128 bits width whose type is compatible with FPR128 -def : Pat<(v4f32 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>; -def : Pat<(v2f64 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>; -def : Pat<(v16i8 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>; -def : Pat<(v8i16 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>; -def : Pat<(v4i32 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>; -def : Pat<(v2i64 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>; -def : Pat<(f128 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>; - -def LDRHHui : LoadUI<0b01, 0, 0b01, GPR32, am_indexed16, "ldrh", - [(set GPR32:$Rt, (zextloadi16 am_indexed16:$addr))]>; -def LDRBBui : LoadUI<0b00, 0, 0b01, GPR32, am_indexed8, "ldrb", - [(set GPR32:$Rt, (zextloadi8 am_indexed8:$addr))]>; -// zextload -> i64 -def : Pat<(i64 (zextloadi8 am_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>; -def : Pat<(i64 (zextloadi16 am_indexed16:$addr)), - (SUBREG_TO_REG (i64 0), (LDRHHui am_indexed16:$addr), sub_32)>; - -// zextloadi1 -> zextloadi8 -def : Pat<(i32 (zextloadi1 am_indexed8:$addr)), (LDRBBui am_indexed8:$addr)>; -def : Pat<(i64 (zextloadi1 am_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>; - -// extload -> zextload -def : Pat<(i32 (extloadi16 am_indexed16:$addr)), (LDRHHui am_indexed16:$addr)>; -def : Pat<(i32 (extloadi8 am_indexed8:$addr)), (LDRBBui am_indexed8:$addr)>; -def : Pat<(i32 (extloadi1 am_indexed8:$addr)), (LDRBBui am_indexed8:$addr)>; -def : Pat<(i64 (extloadi32 am_indexed32:$addr)), - (SUBREG_TO_REG (i64 0), (LDRWui am_indexed32:$addr), sub_32)>; -def : Pat<(i64 (extloadi16 am_indexed16:$addr)), - (SUBREG_TO_REG (i64 0), (LDRHHui am_indexed16:$addr), sub_32)>; -def : 
Pat<(i64 (extloadi8 am_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>; -def : Pat<(i64 (extloadi1 am_indexed8:$addr)), - (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>; - -// load sign-extended half-word -def LDRSHWui : LoadUI<0b01, 0, 0b11, GPR32, am_indexed16, "ldrsh", - [(set GPR32:$Rt, (sextloadi16 am_indexed16:$addr))]>; -def LDRSHXui : LoadUI<0b01, 0, 0b10, GPR64, am_indexed16, "ldrsh", - [(set GPR64:$Rt, (sextloadi16 am_indexed16:$addr))]>; - -// load sign-extended byte -def LDRSBWui : LoadUI<0b00, 0, 0b11, GPR32, am_indexed8, "ldrsb", - [(set GPR32:$Rt, (sextloadi8 am_indexed8:$addr))]>; -def LDRSBXui : LoadUI<0b00, 0, 0b10, GPR64, am_indexed8, "ldrsb", - [(set GPR64:$Rt, (sextloadi8 am_indexed8:$addr))]>; - -// load sign-extended word -def LDRSWui : LoadUI<0b10, 0, 0b10, GPR64, am_indexed32, "ldrsw", - [(set GPR64:$Rt, (sextloadi32 am_indexed32:$addr))]>; - -// load zero-extended word -def : Pat<(i64 (zextloadi32 am_indexed32:$addr)), - (SUBREG_TO_REG (i64 0), (LDRWui am_indexed32:$addr), sub_32)>; - -// Pre-fetch. -def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm", - [(ARM64Prefetch imm:$Rt, am_indexed64:$addr)]>; - -//--- -// (literal) -def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">; -def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">; -def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">; -def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">; -def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">; - -// load sign-extended word -def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">; - -// prefetch -def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>; -// [(ARM64Prefetch imm:$Rt, tglobaladdr:$label)]>; - -//--- -// (unscaled immediate) -def LDURXi : LoadUnscaled<0b11, 0, 0b01, GPR64, am_unscaled64, "ldur", - [(set GPR64:$Rt, (load am_unscaled64:$addr))]>; -def LDURWi : LoadUnscaled<0b10, 0, 0b01, GPR32, am_unscaled32, "ldur", - [(set GPR32:$Rt, (load am_unscaled32:$addr))]>; -def LDURBi : LoadUnscaled<0b00, 1, 0b01, FPR8, am_unscaled8, "ldur", - [(set FPR8:$Rt, (load am_unscaled8:$addr))]>; -def LDURHi : LoadUnscaled<0b01, 1, 0b01, FPR16, am_unscaled16, "ldur", - [(set FPR16:$Rt, (load am_unscaled16:$addr))]>; -def LDURSi : LoadUnscaled<0b10, 1, 0b01, FPR32, am_unscaled32, "ldur", - [(set (f32 FPR32:$Rt), (load am_unscaled32:$addr))]>; -def LDURDi : LoadUnscaled<0b11, 1, 0b01, FPR64, am_unscaled64, "ldur", - [(set (f64 FPR64:$Rt), (load am_unscaled64:$addr))]>; -def LDURQi : LoadUnscaled<0b00, 1, 0b11, FPR128, am_unscaled128, "ldur", - [(set (v2f64 FPR128:$Rt), (load am_unscaled128:$addr))]>; - -def LDURHHi - : LoadUnscaled<0b01, 0, 0b01, GPR32, am_unscaled16, "ldurh", - [(set GPR32:$Rt, (zextloadi16 am_unscaled16:$addr))]>; -def LDURBBi - : LoadUnscaled<0b00, 0, 0b01, GPR32, am_unscaled8, "ldurb", - [(set GPR32:$Rt, (zextloadi8 am_unscaled8:$addr))]>; - -// Match all load 64 bits width whose type is compatible with FPR64 -def : Pat<(v2f32 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>; -def : Pat<(v1f64 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>; -def : Pat<(v8i8 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>; -def : Pat<(v4i16 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>; -def : Pat<(v2i32 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>; -def : Pat<(v1i64 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>; - -// Match all load 128 bits width whose type is compatible with FPR128 -def : Pat<(v4f32 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>; -def : Pat<(v2f64 (load 
am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
-def : Pat<(v16i8 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
-def : Pat<(v8i16 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
-def : Pat<(v4i32 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
-def : Pat<(v2i64 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
-def : Pat<(f128 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
-
-// anyext -> zext
-def : Pat<(i32 (extloadi16 am_unscaled16:$addr)), (LDURHHi am_unscaled16:$addr)>;
-def : Pat<(i32 (extloadi8 am_unscaled8:$addr)), (LDURBBi am_unscaled8:$addr)>;
-def : Pat<(i32 (extloadi1 am_unscaled8:$addr)), (LDURBBi am_unscaled8:$addr)>;
-def : Pat<(i64 (extloadi32 am_unscaled32:$addr)),
-          (SUBREG_TO_REG (i64 0), (LDURWi am_unscaled32:$addr), sub_32)>;
-def : Pat<(i64 (extloadi16 am_unscaled16:$addr)),
-          (SUBREG_TO_REG (i64 0), (LDURHHi am_unscaled16:$addr), sub_32)>;
-def : Pat<(i64 (extloadi8 am_unscaled8:$addr)),
-          (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
-def : Pat<(i64 (extloadi1 am_unscaled8:$addr)),
-          (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
-// unscaled zext
-def : Pat<(i32 (zextloadi16 am_unscaled16:$addr)),
-          (LDURHHi am_unscaled16:$addr)>;
-def : Pat<(i32 (zextloadi8 am_unscaled8:$addr)),
-          (LDURBBi am_unscaled8:$addr)>;
-def : Pat<(i32 (zextloadi1 am_unscaled8:$addr)),
-          (LDURBBi am_unscaled8:$addr)>;
-def : Pat<(i64 (zextloadi32 am_unscaled32:$addr)),
-          (SUBREG_TO_REG (i64 0), (LDURWi am_unscaled32:$addr), sub_32)>;
-def : Pat<(i64 (zextloadi16 am_unscaled16:$addr)),
-          (SUBREG_TO_REG (i64 0), (LDURHHi am_unscaled16:$addr), sub_32)>;
-def : Pat<(i64 (zextloadi8 am_unscaled8:$addr)),
-          (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
-def : Pat<(i64 (zextloadi1 am_unscaled8:$addr)),
-          (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
-
-
-//---
-// LDR mnemonics fall back to LDUR for negative or unaligned offsets.
-
-// Define new assembler match classes as we want to only match these when
-// they don't otherwise match the scaled addressing mode for LDR/STR. Don't
-// associate a DiagnosticType either, as we want the diagnostic for the
-// canonical form (the scaled operand) to take precedence.
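For concreteness: the scaled LDR form encodes an unsigned 12-bit immediate that is implicitly multiplied by the access size, while LDUR encodes a signed 9-bit byte offset, so offsets that are negative or not a multiple of the access size only have an unscaled encoding. The fall-back operand classes defined next let the plain ldr mnemonic pick up those cases. A hedged C++ sketch of the split (an illustrative classifier, not the actual AsmParser logic; the STR-to-STUR aliases later in this section rely on the same rule for stores):

    #include <cstdint>

    enum class Encoding { Scaled, Unscaled, NoMatch };

    // size is the access width in bytes (1, 2, 4, 8, or 16).
    Encoding classifyLoadOffset(int64_t off, unsigned size) {
      if (off >= 0 && off % size == 0 && off / size < 4096)
        return Encoding::Scaled;     // LDR  Rt, [Rn, #off]  (uimm12 * size)
      if (off >= -256 && off <= 255)
        return Encoding::Unscaled;   // LDUR Rt, [Rn, #off]  (simm9)
      return Encoding::NoMatch;      // needs the address materialized first
    }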
-def MemoryUnscaledFB8Operand : AsmOperandClass {
-  let Name = "MemoryUnscaledFB8";
-  let RenderMethod = "addMemoryUnscaledOperands";
-}
-def MemoryUnscaledFB16Operand : AsmOperandClass {
-  let Name = "MemoryUnscaledFB16";
-  let RenderMethod = "addMemoryUnscaledOperands";
-}
-def MemoryUnscaledFB32Operand : AsmOperandClass {
-  let Name = "MemoryUnscaledFB32";
-  let RenderMethod = "addMemoryUnscaledOperands";
-}
-def MemoryUnscaledFB64Operand : AsmOperandClass {
-  let Name = "MemoryUnscaledFB64";
-  let RenderMethod = "addMemoryUnscaledOperands";
-}
-def MemoryUnscaledFB128Operand : AsmOperandClass {
-  let Name = "MemoryUnscaledFB128";
-  let RenderMethod = "addMemoryUnscaledOperands";
-}
-def am_unscaled_fb8 : Operand<i64> {
-  let ParserMatchClass = MemoryUnscaledFB8Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-def am_unscaled_fb16 : Operand<i64> {
-  let ParserMatchClass = MemoryUnscaledFB16Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-def am_unscaled_fb32 : Operand<i64> {
-  let ParserMatchClass = MemoryUnscaledFB32Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-def am_unscaled_fb64 : Operand<i64> {
-  let ParserMatchClass = MemoryUnscaledFB64Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-def am_unscaled_fb128 : Operand<i64> {
-  let ParserMatchClass = MemoryUnscaledFB128Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-def : InstAlias<"ldr $Rt, $addr", (LDURXi GPR64:$Rt, am_unscaled_fb64:$addr)>;
-def : InstAlias<"ldr $Rt, $addr", (LDURWi GPR32:$Rt, am_unscaled_fb32:$addr)>;
-def : InstAlias<"ldr $Rt, $addr", (LDURBi FPR8:$Rt, am_unscaled_fb8:$addr)>;
-def : InstAlias<"ldr $Rt, $addr", (LDURHi FPR16:$Rt, am_unscaled_fb16:$addr)>;
-def : InstAlias<"ldr $Rt, $addr", (LDURSi FPR32:$Rt, am_unscaled_fb32:$addr)>;
-def : InstAlias<"ldr $Rt, $addr", (LDURDi FPR64:$Rt, am_unscaled_fb64:$addr)>;
-def : InstAlias<"ldr $Rt, $addr", (LDURQi FPR128:$Rt, am_unscaled_fb128:$addr)>;
-
-// zextload -> i64
-def : Pat<(i64 (zextloadi8 am_unscaled8:$addr)),
-          (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
-def : Pat<(i64 (zextloadi16 am_unscaled16:$addr)),
-          (SUBREG_TO_REG (i64 0), (LDURHHi am_unscaled16:$addr), sub_32)>;
-
-// load sign-extended half-word
-def LDURSHWi
-    : LoadUnscaled<0b01, 0, 0b11, GPR32, am_unscaled16, "ldursh",
-                   [(set GPR32:$Rt, (sextloadi16 am_unscaled16:$addr))]>;
-def LDURSHXi
-    : LoadUnscaled<0b01, 0, 0b10, GPR64, am_unscaled16, "ldursh",
-                   [(set GPR64:$Rt, (sextloadi16 am_unscaled16:$addr))]>;
-
-// load sign-extended byte
-def LDURSBWi
-    : LoadUnscaled<0b00, 0, 0b11, GPR32, am_unscaled8, "ldursb",
-                   [(set GPR32:$Rt, (sextloadi8 am_unscaled8:$addr))]>;
-def LDURSBXi
-    : LoadUnscaled<0b00, 0, 0b10, GPR64, am_unscaled8, "ldursb",
-                   [(set GPR64:$Rt, (sextloadi8 am_unscaled8:$addr))]>;
-
-// load sign-extended word
-def LDURSWi
-    : LoadUnscaled<0b10, 0, 0b10, GPR64, am_unscaled32, "ldursw",
-                   [(set GPR64:$Rt, (sextloadi32 am_unscaled32:$addr))]>;
-
-// zero and sign extending aliases from generic LDR* mnemonics to LDUR*.
-def : InstAlias<"ldrb $Rt, $addr", (LDURBBi GPR32:$Rt, am_unscaled_fb8:$addr)>; -def : InstAlias<"ldrh $Rt, $addr", (LDURHHi GPR32:$Rt, am_unscaled_fb16:$addr)>; -def : InstAlias<"ldrsb $Rt, $addr", (LDURSBWi GPR32:$Rt, am_unscaled_fb8:$addr)>; -def : InstAlias<"ldrsb $Rt, $addr", (LDURSBXi GPR64:$Rt, am_unscaled_fb8:$addr)>; -def : InstAlias<"ldrsh $Rt, $addr", (LDURSHWi GPR32:$Rt, am_unscaled_fb16:$addr)>; -def : InstAlias<"ldrsh $Rt, $addr", (LDURSHXi GPR64:$Rt, am_unscaled_fb16:$addr)>; -def : InstAlias<"ldrsw $Rt, $addr", (LDURSWi GPR64:$Rt, am_unscaled_fb32:$addr)>; - -// Pre-fetch. -def PRFUMi : PrefetchUnscaled<0b11, 0, 0b10, "prfum", - [(ARM64Prefetch imm:$Rt, am_unscaled64:$addr)]>; - -//--- -// (unscaled immediate, unprivileged) -def LDTRXi : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">; -def LDTRWi : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">; - -def LDTRHi : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">; -def LDTRBi : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">; - -// load sign-extended half-word -def LDTRSHWi : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">; -def LDTRSHXi : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">; - -// load sign-extended byte -def LDTRSBWi : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">; -def LDTRSBXi : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">; - -// load sign-extended word -def LDTRSWi : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">; - -//--- -// (immediate pre-indexed) -def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">; -def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">; -def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">; -def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">; -def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">; -def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">; -def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">; - -// load sign-extended half-word -def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">; -def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">; - -// load sign-extended byte -def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">; -def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">; - -// load zero-extended byte -def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">; -def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">; - -// load sign-extended word -def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">; - -// ISel pseudos and patterns. See expanded comment on LoadPreIdxPseudo. 
-def LDRDpre_isel : LoadPreIdxPseudo; -def LDRSpre_isel : LoadPreIdxPseudo; -def LDRXpre_isel : LoadPreIdxPseudo; -def LDRWpre_isel : LoadPreIdxPseudo; -def LDRHHpre_isel : LoadPreIdxPseudo; -def LDRBBpre_isel : LoadPreIdxPseudo; - -def LDRSWpre_isel : LoadPreIdxPseudo; -def LDRSHWpre_isel : LoadPreIdxPseudo; -def LDRSHXpre_isel : LoadPreIdxPseudo; -def LDRSBWpre_isel : LoadPreIdxPseudo; -def LDRSBXpre_isel : LoadPreIdxPseudo; - -//--- -// (immediate post-indexed) -def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">; -def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">; -def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">; -def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">; -def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">; -def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">; -def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">; - -// load sign-extended half-word -def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">; -def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">; - -// load sign-extended byte -def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">; -def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">; - -// load zero-extended byte -def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">; -def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">; - -// load sign-extended word -def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">; - -// ISel pseudos and patterns. See expanded comment on LoadPostIdxPseudo. -def LDRDpost_isel : LoadPostIdxPseudo; -def LDRSpost_isel : LoadPostIdxPseudo; -def LDRXpost_isel : LoadPostIdxPseudo; -def LDRWpost_isel : LoadPostIdxPseudo; -def LDRHHpost_isel : LoadPostIdxPseudo; -def LDRBBpost_isel : LoadPostIdxPseudo; - -def LDRSWpost_isel : LoadPostIdxPseudo; -def LDRSHWpost_isel : LoadPostIdxPseudo; -def LDRSHXpost_isel : LoadPostIdxPseudo; -def LDRSBWpost_isel : LoadPostIdxPseudo; -def LDRSBXpost_isel : LoadPostIdxPseudo; - -//===----------------------------------------------------------------------===// -// Store instructions. -//===----------------------------------------------------------------------===// - -// Pair (indexed, offset) -// FIXME: Use dedicated range-checked addressing mode operand here. 
-def STPWi : StorePairOffset<0b00, 0, GPR32, am_indexed32simm7, "stp">;
-def STPXi : StorePairOffset<0b10, 0, GPR64, am_indexed64simm7, "stp">;
-def STPSi : StorePairOffset<0b00, 1, FPR32, am_indexed32simm7, "stp">;
-def STPDi : StorePairOffset<0b01, 1, FPR64, am_indexed64simm7, "stp">;
-def STPQi : StorePairOffset<0b10, 1, FPR128, am_indexed128simm7, "stp">;
-
-// Pair (pre-indexed)
-def STPWpre : StorePairPreIdx<0b00, 0, GPR32, am_indexed32simm7, "stp">;
-def STPXpre : StorePairPreIdx<0b10, 0, GPR64, am_indexed64simm7, "stp">;
-def STPSpre : StorePairPreIdx<0b00, 1, FPR32, am_indexed32simm7, "stp">;
-def STPDpre : StorePairPreIdx<0b01, 1, FPR64, am_indexed64simm7, "stp">;
-def STPQpre : StorePairPreIdx<0b10, 1, FPR128, am_indexed128simm7, "stp">;
-
-// Pair (post-indexed)
-def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">;
-def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">;
-def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">;
-def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">;
-def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">;
-
-// Pair (no allocate)
-def STNPWi : StorePairNoAlloc<0b00, 0, GPR32, am_indexed32simm7, "stnp">;
-def STNPXi : StorePairNoAlloc<0b10, 0, GPR64, am_indexed64simm7, "stnp">;
-def STNPSi : StorePairNoAlloc<0b00, 1, FPR32, am_indexed32simm7, "stnp">;
-def STNPDi : StorePairNoAlloc<0b01, 1, FPR64, am_indexed64simm7, "stnp">;
-def STNPQi : StorePairNoAlloc<0b10, 1, FPR128, am_indexed128simm7, "stnp">;
-
-//---
-// (Register offset)
-
-let AddedComplexity = 10 in {
-
-// Integer
-def STRHHro : Store16RO<0b01, 0, 0b00, GPR32, "strh",
-                        [(truncstorei16 GPR32:$Rt, ro_indexed16:$addr)]>;
-def STRBBro : Store8RO<0b00, 0, 0b00, GPR32, "strb",
-                       [(truncstorei8 GPR32:$Rt, ro_indexed8:$addr)]>;
-def STRWro : Store32RO<0b10, 0, 0b00, GPR32, "str",
-                       [(store GPR32:$Rt, ro_indexed32:$addr)]>;
-def STRXro : Store64RO<0b11, 0, 0b00, GPR64, "str",
-                       [(store GPR64:$Rt, ro_indexed64:$addr)]>;
-
-// truncstore i64
-def : Pat<(truncstorei8 GPR64:$Rt, ro_indexed8:$addr),
-          (STRBBro (EXTRACT_SUBREG GPR64:$Rt, sub_32), ro_indexed8:$addr)>;
-def : Pat<(truncstorei16 GPR64:$Rt, ro_indexed16:$addr),
-          (STRHHro (EXTRACT_SUBREG GPR64:$Rt, sub_32), ro_indexed16:$addr)>;
-def : Pat<(truncstorei32 GPR64:$Rt, ro_indexed32:$addr),
-          (STRWro (EXTRACT_SUBREG GPR64:$Rt, sub_32), ro_indexed32:$addr)>;
-
-
-// Floating-point
-def STRBro : Store8RO<0b00, 1, 0b00, FPR8, "str",
-                      [(store FPR8:$Rt, ro_indexed8:$addr)]>;
-def STRHro : Store16RO<0b01, 1, 0b00, FPR16, "str",
-                       [(store FPR16:$Rt, ro_indexed16:$addr)]>;
-def STRSro : Store32RO<0b10, 1, 0b00, FPR32, "str",
-                       [(store (f32 FPR32:$Rt), ro_indexed32:$addr)]>;
-def STRDro : Store64RO<0b11, 1, 0b00, FPR64, "str",
-                       [(store (f64 FPR64:$Rt), ro_indexed64:$addr)]>;
-def STRQro : Store128RO<0b00, 1, 0b10, FPR128, "str", []> {
-  let mayStore = 1;
-}
-
-// Match all store 64 bits width whose type is compatible with FPR64
-def : Pat<(store (v2f32 FPR64:$Rn), ro_indexed64:$addr),
-          (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
-def : Pat<(store (v1f64 FPR64:$Rn), ro_indexed64:$addr),
-          (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
-def : Pat<(store (v8i8 FPR64:$Rn), ro_indexed64:$addr),
-          (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
-def : Pat<(store (v4i16 FPR64:$Rn), ro_indexed64:$addr),
-          (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
-def : Pat<(store (v2i32 FPR64:$Rn), ro_indexed64:$addr),
-          (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
-def : Pat<(store (v1i64 FPR64:$Rn), ro_indexed64:$addr),
-          (STRDro
FPR64:$Rn, ro_indexed64:$addr)>; - -// Match all store 128 bits width whose type is compatible with FPR128 -def : Pat<(store (v4f32 FPR128:$Rn), ro_indexed128:$addr), - (STRQro FPR128:$Rn, ro_indexed128:$addr)>; -def : Pat<(store (v2f64 FPR128:$Rn), ro_indexed128:$addr), - (STRQro FPR128:$Rn, ro_indexed128:$addr)>; -def : Pat<(store (v16i8 FPR128:$Rn), ro_indexed128:$addr), - (STRQro FPR128:$Rn, ro_indexed128:$addr)>; -def : Pat<(store (v8i16 FPR128:$Rn), ro_indexed128:$addr), - (STRQro FPR128:$Rn, ro_indexed128:$addr)>; -def : Pat<(store (v4i32 FPR128:$Rn), ro_indexed128:$addr), - (STRQro FPR128:$Rn, ro_indexed128:$addr)>; -def : Pat<(store (v2i64 FPR128:$Rn), ro_indexed128:$addr), - (STRQro FPR128:$Rn, ro_indexed128:$addr)>; -def : Pat<(store (f128 FPR128:$Rn), ro_indexed128:$addr), - (STRQro FPR128:$Rn, ro_indexed128:$addr)>; - -//--- -// (unsigned immediate) -def STRXui : StoreUI<0b11, 0, 0b00, GPR64, am_indexed64, "str", - [(store GPR64:$Rt, am_indexed64:$addr)]>; -def STRWui : StoreUI<0b10, 0, 0b00, GPR32, am_indexed32, "str", - [(store GPR32:$Rt, am_indexed32:$addr)]>; -def STRBui : StoreUI<0b00, 1, 0b00, FPR8, am_indexed8, "str", - [(store FPR8:$Rt, am_indexed8:$addr)]>; -def STRHui : StoreUI<0b01, 1, 0b00, FPR16, am_indexed16, "str", - [(store FPR16:$Rt, am_indexed16:$addr)]>; -def STRSui : StoreUI<0b10, 1, 0b00, FPR32, am_indexed32, "str", - [(store (f32 FPR32:$Rt), am_indexed32:$addr)]>; -def STRDui : StoreUI<0b11, 1, 0b00, FPR64, am_indexed64, "str", - [(store (f64 FPR64:$Rt), am_indexed64:$addr)]>; -def STRQui : StoreUI<0b00, 1, 0b10, FPR128, am_indexed128, "str", []> { - let mayStore = 1; -} - -// Match all store 64 bits width whose type is compatible with FPR64 -def : Pat<(store (v2f32 FPR64:$Rn), am_indexed64:$addr), - (STRDui FPR64:$Rn, am_indexed64:$addr)>; -def : Pat<(store (v1f64 FPR64:$Rn), am_indexed64:$addr), - (STRDui FPR64:$Rn, am_indexed64:$addr)>; -def : Pat<(store (v8i8 FPR64:$Rn), am_indexed64:$addr), - (STRDui FPR64:$Rn, am_indexed64:$addr)>; -def : Pat<(store (v4i16 FPR64:$Rn), am_indexed64:$addr), - (STRDui FPR64:$Rn, am_indexed64:$addr)>; -def : Pat<(store (v2i32 FPR64:$Rn), am_indexed64:$addr), - (STRDui FPR64:$Rn, am_indexed64:$addr)>; -def : Pat<(store (v1i64 FPR64:$Rn), am_indexed64:$addr), - (STRDui FPR64:$Rn, am_indexed64:$addr)>; - -// Match all store 128 bits width whose type is compatible with FPR128 -def : Pat<(store (v4f32 FPR128:$Rn), am_indexed128:$addr), - (STRQui FPR128:$Rn, am_indexed128:$addr)>; -def : Pat<(store (v2f64 FPR128:$Rn), am_indexed128:$addr), - (STRQui FPR128:$Rn, am_indexed128:$addr)>; -def : Pat<(store (v16i8 FPR128:$Rn), am_indexed128:$addr), - (STRQui FPR128:$Rn, am_indexed128:$addr)>; -def : Pat<(store (v8i16 FPR128:$Rn), am_indexed128:$addr), - (STRQui FPR128:$Rn, am_indexed128:$addr)>; -def : Pat<(store (v4i32 FPR128:$Rn), am_indexed128:$addr), - (STRQui FPR128:$Rn, am_indexed128:$addr)>; -def : Pat<(store (v2i64 FPR128:$Rn), am_indexed128:$addr), - (STRQui FPR128:$Rn, am_indexed128:$addr)>; -def : Pat<(store (f128 FPR128:$Rn), am_indexed128:$addr), - (STRQui FPR128:$Rn, am_indexed128:$addr)>; - -def STRHHui : StoreUI<0b01, 0, 0b00, GPR32, am_indexed16, "strh", - [(truncstorei16 GPR32:$Rt, am_indexed16:$addr)]>; -def STRBBui : StoreUI<0b00, 0, 0b00, GPR32, am_indexed8, "strb", - [(truncstorei8 GPR32:$Rt, am_indexed8:$addr)]>; - -// truncstore i64 -def : Pat<(truncstorei32 GPR64:$Rt, am_indexed32:$addr), - (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_indexed32:$addr)>; -def : Pat<(truncstorei16 GPR64:$Rt, 
am_indexed16:$addr), - (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_indexed16:$addr)>; -def : Pat<(truncstorei8 GPR64:$Rt, am_indexed8:$addr), - (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_indexed8:$addr)>; - -} // AddedComplexity = 10 - -//--- -// (unscaled immediate) -def STURXi : StoreUnscaled<0b11, 0, 0b00, GPR64, am_unscaled64, "stur", - [(store GPR64:$Rt, am_unscaled64:$addr)]>; -def STURWi : StoreUnscaled<0b10, 0, 0b00, GPR32, am_unscaled32, "stur", - [(store GPR32:$Rt, am_unscaled32:$addr)]>; -def STURBi : StoreUnscaled<0b00, 1, 0b00, FPR8, am_unscaled8, "stur", - [(store FPR8:$Rt, am_unscaled8:$addr)]>; -def STURHi : StoreUnscaled<0b01, 1, 0b00, FPR16, am_unscaled16, "stur", - [(store FPR16:$Rt, am_unscaled16:$addr)]>; -def STURSi : StoreUnscaled<0b10, 1, 0b00, FPR32, am_unscaled32, "stur", - [(store (f32 FPR32:$Rt), am_unscaled32:$addr)]>; -def STURDi : StoreUnscaled<0b11, 1, 0b00, FPR64, am_unscaled64, "stur", - [(store (f64 FPR64:$Rt), am_unscaled64:$addr)]>; -def STURQi : StoreUnscaled<0b00, 1, 0b10, FPR128, am_unscaled128, "stur", - [(store (v2f64 FPR128:$Rt), am_unscaled128:$addr)]>; -def STURHHi : StoreUnscaled<0b01, 0, 0b00, GPR32, am_unscaled16, "sturh", - [(truncstorei16 GPR32:$Rt, am_unscaled16:$addr)]>; -def STURBBi : StoreUnscaled<0b00, 0, 0b00, GPR32, am_unscaled8, "sturb", - [(truncstorei8 GPR32:$Rt, am_unscaled8:$addr)]>; - -// Match all store 64 bits width whose type is compatible with FPR64 -def : Pat<(store (v2f32 FPR64:$Rn), am_unscaled64:$addr), - (STURDi FPR64:$Rn, am_unscaled64:$addr)>; -def : Pat<(store (v1f64 FPR64:$Rn), am_unscaled64:$addr), - (STURDi FPR64:$Rn, am_unscaled64:$addr)>; -def : Pat<(store (v8i8 FPR64:$Rn), am_unscaled64:$addr), - (STURDi FPR64:$Rn, am_unscaled64:$addr)>; -def : Pat<(store (v4i16 FPR64:$Rn), am_unscaled64:$addr), - (STURDi FPR64:$Rn, am_unscaled64:$addr)>; -def : Pat<(store (v2i32 FPR64:$Rn), am_unscaled64:$addr), - (STURDi FPR64:$Rn, am_unscaled64:$addr)>; -def : Pat<(store (v1i64 FPR64:$Rn), am_unscaled64:$addr), - (STURDi FPR64:$Rn, am_unscaled64:$addr)>; - -// Match all store 128 bits width whose type is compatible with FPR128 -def : Pat<(store (v4f32 FPR128:$Rn), am_unscaled128:$addr), - (STURQi FPR128:$Rn, am_unscaled128:$addr)>; -def : Pat<(store (v2f64 FPR128:$Rn), am_unscaled128:$addr), - (STURQi FPR128:$Rn, am_unscaled128:$addr)>; -def : Pat<(store (v16i8 FPR128:$Rn), am_unscaled128:$addr), - (STURQi FPR128:$Rn, am_unscaled128:$addr)>; -def : Pat<(store (v8i16 FPR128:$Rn), am_unscaled128:$addr), - (STURQi FPR128:$Rn, am_unscaled128:$addr)>; -def : Pat<(store (v4i32 FPR128:$Rn), am_unscaled128:$addr), - (STURQi FPR128:$Rn, am_unscaled128:$addr)>; -def : Pat<(store (v2i64 FPR128:$Rn), am_unscaled128:$addr), - (STURQi FPR128:$Rn, am_unscaled128:$addr)>; -def : Pat<(store (f128 FPR128:$Rn), am_unscaled128:$addr), - (STURQi FPR128:$Rn, am_unscaled128:$addr)>; - -// unscaled i64 truncating stores -def : Pat<(truncstorei32 GPR64:$Rt, am_unscaled32:$addr), - (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_unscaled32:$addr)>; -def : Pat<(truncstorei16 GPR64:$Rt, am_unscaled16:$addr), - (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_unscaled16:$addr)>; -def : Pat<(truncstorei8 GPR64:$Rt, am_unscaled8:$addr), - (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_unscaled8:$addr)>; - -//--- -// STR mnemonics fall back to STUR for negative or unaligned offsets. 
-def : InstAlias<"str $Rt, $addr", (STURXi GPR64:$Rt, am_unscaled_fb64:$addr)>; -def : InstAlias<"str $Rt, $addr", (STURWi GPR32:$Rt, am_unscaled_fb32:$addr)>; -def : InstAlias<"str $Rt, $addr", (STURBi FPR8:$Rt, am_unscaled_fb8:$addr)>; -def : InstAlias<"str $Rt, $addr", (STURHi FPR16:$Rt, am_unscaled_fb16:$addr)>; -def : InstAlias<"str $Rt, $addr", (STURSi FPR32:$Rt, am_unscaled_fb32:$addr)>; -def : InstAlias<"str $Rt, $addr", (STURDi FPR64:$Rt, am_unscaled_fb64:$addr)>; -def : InstAlias<"str $Rt, $addr", (STURQi FPR128:$Rt, am_unscaled_fb128:$addr)>; - -def : InstAlias<"strb $Rt, $addr", (STURBBi GPR32:$Rt, am_unscaled_fb8:$addr)>; -def : InstAlias<"strh $Rt, $addr", (STURHHi GPR32:$Rt, am_unscaled_fb16:$addr)>; - -//--- -// (unscaled immediate, unprivileged) -def STTRWi : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">; -def STTRXi : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">; - -def STTRHi : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">; -def STTRBi : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">; - -//--- -// (immediate pre-indexed) -def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str">; -def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str">; -def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str">; -def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str">; -def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str">; -def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str">; -def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str">; - -def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb">; -def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh">; - -// ISel pseudos and patterns. See expanded comment on StorePreIdxPseudo. -defm STRDpre : StorePreIdxPseudo; -defm STRSpre : StorePreIdxPseudo; -defm STRXpre : StorePreIdxPseudo; -defm STRWpre : StorePreIdxPseudo; -defm STRHHpre : StorePreIdxPseudo; -defm STRBBpre : StorePreIdxPseudo; -// truncstore i64 -def : Pat<(pre_truncsti32 GPR64:$Rt, am_noindex:$addr, simm9:$off), - (STRWpre_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr, - simm9:$off)>; -def : Pat<(pre_truncsti16 GPR64:$Rt, am_noindex:$addr, simm9:$off), - (STRHHpre_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr, - simm9:$off)>; -def : Pat<(pre_truncsti8 GPR64:$Rt, am_noindex:$addr, simm9:$off), - (STRBBpre_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr, - simm9:$off)>; - -//--- -// (immediate post-indexed) -def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str">; -def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str">; -def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str">; -def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str">; -def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str">; -def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str">; -def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str">; - -def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb">; -def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh">; - -// ISel pseudos and patterns. See expanded comment on StorePostIdxPseudo. 
-defm STRDpost : StorePostIdxPseudo; -defm STRSpost : StorePostIdxPseudo; -defm STRXpost : StorePostIdxPseudo; -defm STRWpost : StorePostIdxPseudo; -defm STRHHpost : StorePostIdxPseudo; -defm STRBBpost : StorePostIdxPseudo; -// truncstore i64 -def : Pat<(post_truncsti32 GPR64:$Rt, am_noindex:$addr, simm9:$off), - (STRWpost_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr, - simm9:$off)>; -def : Pat<(post_truncsti16 GPR64:$Rt, am_noindex:$addr, simm9:$off), - (STRHHpost_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr, - simm9:$off)>; -def : Pat<(post_truncsti8 GPR64:$Rt, am_noindex:$addr, simm9:$off), - (STRBBpost_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr, - simm9:$off)>; - - -//===----------------------------------------------------------------------===// -// Load/store exclusive instructions. -//===----------------------------------------------------------------------===// - -def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">; -def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">; -def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">; -def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">; - -def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">; -def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">; -def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">; -def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">; - -def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">; -def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">; -def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">; -def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">; - -def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">; -def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">; -def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">; -def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">; - -def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">; -def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">; -def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">; -def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">; - -def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">; -def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">; -def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">; -def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">; - -def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">; -def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">; - -def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">; -def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">; - -def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">; -def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">; - -def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">; -def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">; - -//===----------------------------------------------------------------------===// -// Scaled floating point to integer conversion instructions. 
-//===----------------------------------------------------------------------===//
-
-defm FCVTAS : FPToInteger<0b00, 0b100, "fcvtas", int_arm64_neon_fcvtas>;
-defm FCVTAU : FPToInteger<0b00, 0b101, "fcvtau", int_arm64_neon_fcvtau>;
-defm FCVTMS : FPToInteger<0b10, 0b000, "fcvtms", int_arm64_neon_fcvtms>;
-defm FCVTMU : FPToInteger<0b10, 0b001, "fcvtmu", int_arm64_neon_fcvtmu>;
-defm FCVTNS : FPToInteger<0b00, 0b000, "fcvtns", int_arm64_neon_fcvtns>;
-defm FCVTNU : FPToInteger<0b00, 0b001, "fcvtnu", int_arm64_neon_fcvtnu>;
-defm FCVTPS : FPToInteger<0b01, 0b000, "fcvtps", int_arm64_neon_fcvtps>;
-defm FCVTPU : FPToInteger<0b01, 0b001, "fcvtpu", int_arm64_neon_fcvtpu>;
-defm FCVTZS : FPToInteger<0b11, 0b000, "fcvtzs", fp_to_sint>;
-defm FCVTZU : FPToInteger<0b11, 0b001, "fcvtzu", fp_to_uint>;
-let isCodeGenOnly = 1 in {
-defm FCVTZS_Int : FPToInteger<0b11, 0b000, "fcvtzs", int_arm64_neon_fcvtzs>;
-defm FCVTZU_Int : FPToInteger<0b11, 0b001, "fcvtzu", int_arm64_neon_fcvtzu>;
-}
-
-//===----------------------------------------------------------------------===//
-// Scaled integer to floating point conversion instructions.
-//===----------------------------------------------------------------------===//
-
-defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
-defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
-
-//===----------------------------------------------------------------------===//
-// Unscaled integer to floating point conversion instruction.
-//===----------------------------------------------------------------------===//
-
-defm FMOV : UnscaledConversion<"fmov">;
-
-def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>;
-def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>;
-
-def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
-
-def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-
-def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), (COPY_TO_REGCLASS GPR32:$Xn,
-                                               FPR32)>;
-def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))), (COPY_TO_REGCLASS FPR32:$Xn,
-                                               GPR32)>;
-def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))), (COPY_TO_REGCLASS GPR64:$Xn,
-                                               FPR64)>;
-def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), (COPY_TO_REGCLASS FPR64:$Xn,
-                                               GPR64)>;
-
-//===----------------------------------------------------------------------===//
-// Floating point conversion instruction.
-//===----------------------------------------------------------------------===//
-
-defm FCVT : FPConversion<"fcvt">;
-
-def : Pat<(f32_to_f16 FPR32:$Rn),
-          (i32 (COPY_TO_REGCLASS
-                   (f32 (SUBREG_TO_REG (i32 0), (FCVTHSr FPR32:$Rn), hsub)),
-                   GPR32))>;
-
-def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn),
-                          [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>;
-
-//===----------------------------------------------------------------------===//
-// Floating point single operand instructions.
-//===----------------------------------------------------------------------===//
-
-defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>;
-defm FMOV : SingleOperandFPData<0b0000, "fmov">;
-defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>;
-defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>;
-defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
-defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
-defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_arm64_neon_frintn>;
-defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;
-
-def : Pat<(v1f64 (int_arm64_neon_frintn (v1f64 FPR64:$Rn))),
-          (FRINTNDr FPR64:$Rn)>;
-
-// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior
-// in the C spec. Setting hasSideEffects ensures it is not DCE'd.
-//
-// TODO: We should really model the FPSR flags correctly. This is really ugly.
-let hasSideEffects = 1 in {
-defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
-}
-
-defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;
-
-let SchedRW = [WriteFDiv] in {
-defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>;
-}
-
-//===----------------------------------------------------------------------===//
-// Floating point two operand instructions.
-//===----------------------------------------------------------------------===//
-
-defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>;
-let SchedRW = [WriteFDiv] in {
-defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>;
-}
-defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_arm64_neon_fmaxnm>;
-defm FMAX : TwoOperandFPData<0b0100, "fmax", ARM64fmax>;
-defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_arm64_neon_fminnm>;
-defm FMIN : TwoOperandFPData<0b0101, "fmin", ARM64fmin>;
-let SchedRW = [WriteFMul] in {
-defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>;
-defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
-}
-defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>;
-
-def : Pat<(v1f64 (ARM64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (ARM64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FMINDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (int_arm64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (int_arm64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;
-
-//===----------------------------------------------------------------------===//
-// Floating point three operand instructions.
-//===----------------------------------------------------------------------===//
-
-defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>;
-defm FMSUB : ThreeOperandFPData<0, 1, "fmsub",
-     TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >;
-defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd",
-     TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >;
-defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
-     TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;
-
-// The following def pats catch the case where the LHS of an FMA is negated.
-// The TriOpFrag above catches the case where the middle operand is negated.
-def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Rd)),
-          (FMSUBSrrr FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
-
-def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Rd)),
-          (FMSUBDrrr FPR64:$Rd, FPR64:$Rn, FPR64:$Rm)>;
-
-//===----------------------------------------------------------------------===//
-// Floating point comparison instructions.
-//===----------------------------------------------------------------------===//
-
-defm FCMPE : FPComparison<1, "fcmpe">;
-defm FCMP : FPComparison<0, "fcmp", ARM64fcmp>;
-
-//===----------------------------------------------------------------------===//
-// Floating point conditional comparison instructions.
-//===----------------------------------------------------------------------===//
-
-defm FCCMPE : FPCondComparison<1, "fccmpe">;
-defm FCCMP : FPCondComparison<0, "fccmp">;
-
-//===----------------------------------------------------------------------===//
-// Floating point conditional select instruction.
-//===----------------------------------------------------------------------===//
-
-defm FCSEL : FPCondSelect<"fcsel">;
-
-// CSEL instructions providing f128 types need to be handled by a
-// pseudo-instruction since the eventual code will need to introduce basic
-// blocks and control flow.
-def F128CSEL : Pseudo<(outs FPR128:$Rd),
-                      (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond),
-                      [(set (f128 FPR128:$Rd),
-                            (ARM64csel FPR128:$Rn, FPR128:$Rm,
-                                       (i32 imm:$cond), CPSR))]> {
-  let Uses = [CPSR];
-  let usesCustomInserter = 1;
-}
-
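(Editor's note - a hedged sketch of the IR that reaches the F128CSEL pseudo above; function name hypothetical, LLVM 3.5-era syntax. FCSEL has no f128 form, so the custom inserter expands the select into compare-and-branch basic blocks, which is why a pseudo is required here.)

    define fp128 @select_f128(i1 %c, fp128 %a, fp128 %b) {
      ; no f128 conditional select instruction: expanded via new basic blocks
      %r = select i1 %c, fp128 %a, fp128 %b
      ret fp128 %r
    }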
-
-//===----------------------------------------------------------------------===//
-// Floating point immediate move.
-//===----------------------------------------------------------------------===//
-
-let isReMaterializable = 1 in {
-defm FMOV : FPMoveImmediate<"fmov">;
-}
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD two vector instructions.
-//===----------------------------------------------------------------------===//
-
-defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_arm64_neon_abs>;
-defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_arm64_neon_cls>;
-defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
-defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", ARM64cmeqz>;
-defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", ARM64cmgez>;
-defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", ARM64cmgtz>;
-defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", ARM64cmlez>;
-defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", ARM64cmltz>;
-defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>;
-defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;
-
-defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", ARM64fcmeqz>;
-defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", ARM64fcmgez>;
-defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", ARM64fcmgtz>;
-defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", ARM64fcmlez>;
-defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", ARM64fcmltz>;
-defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_arm64_neon_fcvtas>;
-defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_arm64_neon_fcvtau>;
-defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">;
-def : Pat<(v4f32 (int_arm64_neon_vcvthf2fp (v4i16 V64:$Rn))),
-          (FCVTLv4i16 V64:$Rn)>;
-def : Pat<(v4f32 (int_arm64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
-                                                              (i64 4)))),
-          (FCVTLv8i16 V128:$Rn)>;
-def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
-def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
-                                                    (i64 2))))),
-          (FCVTLv4i32 V128:$Rn)>;
-
-defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_arm64_neon_fcvtms>;
-defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_arm64_neon_fcvtmu>;
-defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_arm64_neon_fcvtns>;
-defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_arm64_neon_fcvtnu>;
-defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">;
-def : Pat<(v4i16 (int_arm64_neon_vcvtfp2hf (v4f32 V128:$Rn))),
-          (FCVTNv4i16 V128:$Rn)>;
-def : Pat<(concat_vectors V64:$Rd,
-                          (v4i16 (int_arm64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
-          (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
-def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
-def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))),
-          (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
-defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_arm64_neon_fcvtps>;
-defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_arm64_neon_fcvtpu>;
-defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
-                                        int_arm64_neon_fcvtxn>;
-defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
-defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;
-let isCodeGenOnly = 1 in {
-defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs",
-                                       int_arm64_neon_fcvtzs>;
-defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu",
-                                       int_arm64_neon_fcvtzu>;
-}
-defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
-defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_arm64_neon_frecpe>;
-defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>;
-defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
-defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
-defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn",
-                              int_arm64_neon_frintn>;
-defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>;
-defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>;
-defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>;
-defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_arm64_neon_frsqrte>;
-defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>;
-defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg",
-                             UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
-defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>;
-// Aliases for MVN -> NOT.
-def : InstAlias<"mvn.8b $Vd, $Vn", (NOTv8i8 V64:$Vd, V64:$Vn)>;
-def : InstAlias<"mvn.16b $Vd, $Vn", (NOTv16i8 V128:$Vd, V128:$Vn)>;
-def : InstAlias<"mvn $Vd.8b, $Vn.8b", (NOTv8i8 V64:$Vd, V64:$Vn)>;
-def : InstAlias<"mvn $Vd.16b, $Vn.16b", (NOTv16i8 V128:$Vd, V128:$Vn)>;
-
-def : Pat<(ARM64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>;
-def : Pat<(ARM64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>;
-def : Pat<(ARM64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>;
-def : Pat<(ARM64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>;
-def : Pat<(ARM64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>;
-def : Pat<(ARM64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>;
-def : Pat<(ARM64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>;
-
-def : Pat<(ARM64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
-def : Pat<(ARM64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-def : Pat<(ARM64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
-def : Pat<(ARM64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-def : Pat<(ARM64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
-def : Pat<(ARM64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-def : Pat<(ARM64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-
-def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
-def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
-def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-
-defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_arm64_neon_rbit>;
-defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", ARM64rev16>;
-defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", ARM64rev32>;
-defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", ARM64rev64>;
-defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
-       BinOpFrag<(add node:$LHS, (int_arm64_neon_saddlp node:$RHS))> >;
-defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_arm64_neon_saddlp>;
-defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
-defm SHLL : SIMDVectorLShiftLongBySizeBHS;
-defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_arm64_neon_sqabs>;
-defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_arm64_neon_sqneg>;
-defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_arm64_neon_sqxtn>;
-defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_arm64_neon_sqxtun>;
-defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_arm64_neon_suqadd>;
-defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
-       BinOpFrag<(add node:$LHS, (int_arm64_neon_uaddlp node:$RHS))> >;
-defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp",
-                                int_arm64_neon_uaddlp>;
-defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;
-defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_arm64_neon_uqxtn>;
-defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_arm64_neon_urecpe>;
-defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_arm64_neon_ursqrte>;
-defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_arm64_neon_usqadd>;
-defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
-
-def : Pat<(v2f32 (ARM64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
-def : Pat<(v4f32 (ARM64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
-
-// Patterns for vector long shift (by element width). These need to match all
-// three of zext, sext and anyext so it's easier to pull the patterns out of the
-// definition.
-multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
-  def : Pat<(ARM64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
-            (SHLLv8i8 V64:$Rn)>;
-  def : Pat<(ARM64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
-            (SHLLv16i8 V128:$Rn)>;
-  def : Pat<(ARM64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
-            (SHLLv4i16 V64:$Rn)>;
-  def : Pat<(ARM64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
-            (SHLLv8i16 V128:$Rn)>;
-  def : Pat<(ARM64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
-            (SHLLv2i32 V64:$Rn)>;
-  def : Pat<(ARM64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
-            (SHLLv4i32 V128:$Rn)>;
-}
-
-defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
-defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
-defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD three vector instructions.
-//===----------------------------------------------------------------------===//
-
-defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
-defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_arm64_neon_addp>;
-defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", ARM64cmeq>;
-defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", ARM64cmge>;
-defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", ARM64cmgt>;
-defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", ARM64cmhi>;
-defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", ARM64cmhs>;
-defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", ARM64cmtst>;
-defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_arm64_neon_fabd>;
-defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_arm64_neon_facge>;
-defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_arm64_neon_facgt>;
-defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_arm64_neon_addp>;
-defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>;
-defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", ARM64fcmeq>;
-defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", ARM64fcmge>;
-defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", ARM64fcmgt>;
-defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>;
-defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_arm64_neon_fmaxnmp>;
-defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_arm64_neon_fmaxnm>;
-defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_arm64_neon_fmaxp>;
-defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", ARM64fmax>;
-defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_arm64_neon_fminnmp>;
-defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_arm64_neon_fminnm>;
-defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_arm64_neon_fminp>;
-defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", ARM64fmin>;
-
-// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
-// instruction expects the addend first, while the fma intrinsic puts it last.
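(Editor's note - a sketch of the reordering described in the NOTE above, for the FMLA/FMLS definitions that follow; function name hypothetical. llvm.fma takes the addend as its last operand, while the tied instruction wants it first, in $Rd.)

    declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
    define <2 x float> @mla(<2 x float> %acc, <2 x float> %a, <2 x float> %b) {
      ; fma(%a, %b, %acc) -> "fmla v0.2s, v1.2s, v2.2s", %acc tied to the result
      %r = call <2 x float> @llvm.fma.v2f32(<2 x float> %a, <2 x float> %b,
                                            <2 x float> %acc)
      ret <2 x float> %r
    }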
-defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla",
-                TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
-defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls",
-                TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
-
-// The following def pats catch the case where the LHS of an FMA is negated.
-// The TriOpFrag above catches the case where the middle operand is negated.
-def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)),
-          (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>;
-
-def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
-          (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>;
-
-def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
-          (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>;
-
-defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_arm64_neon_fmulx>;
-defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>;
-defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_arm64_neon_frecps>;
-defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_arm64_neon_frsqrts>;
-defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>;
-defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
-      TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
-defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
-      TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;
-defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
-defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_arm64_neon_pmul>;
-defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
-      TriOpFrag<(add node:$LHS, (int_arm64_neon_sabd node:$MHS, node:$RHS))> >;
-defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_arm64_neon_sabd>;
-defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_arm64_neon_shadd>;
-defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_arm64_neon_shsub>;
-defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_arm64_neon_smaxp>;
-defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_arm64_neon_smax>;
-defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_arm64_neon_sminp>;
-defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_arm64_neon_smin>;
-defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_arm64_neon_sqadd>;
-defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_arm64_neon_sqdmulh>;
-defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_arm64_neon_sqrdmulh>;
-defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_arm64_neon_sqrshl>;
-defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_arm64_neon_sqshl>;
-defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_arm64_neon_sqsub>;
-defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_arm64_neon_srhadd>;
-defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_arm64_neon_srshl>;
-defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_arm64_neon_sshl>;
-defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
-defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
-      TriOpFrag<(add node:$LHS, (int_arm64_neon_uabd node:$MHS, node:$RHS))> >;
-defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_arm64_neon_uabd>;
-defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_arm64_neon_uhadd>;
-defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_arm64_neon_uhsub>;
-defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_arm64_neon_umaxp>;
-defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_arm64_neon_umax>;
-defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_arm64_neon_uminp>;
-defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_arm64_neon_umin>;
-defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_arm64_neon_uqadd>;
-defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_arm64_neon_uqrshl>;
-defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_arm64_neon_uqshl>;
-defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_arm64_neon_uqsub>;
-defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_arm64_neon_urhadd>;
-defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_arm64_neon_urshl>;
-defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_arm64_neon_ushl>;
-
-defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
-defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
-                                  BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
-defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
-defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", ARM64bit>;
-defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
-    TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
-defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
-defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
-                                  BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
-defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
-
-// FIXME: the .16b and .8b variants should be emitted by the
-// AsmWriter. TableGen's AsmWriter-generator doesn't deal with variant syntaxes
-// in aliases yet though.
-def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
-                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
-def : InstAlias<"{mov\t$dst.8h, $src.8h|mov.8h\t$dst, $src}",
-                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
-def : InstAlias<"{mov\t$dst.4s, $src.4s|mov.4s\t$dst, $src}",
-                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
-def : InstAlias<"{mov\t$dst.2d, $src.2d|mov.2d\t$dst, $src}",
-                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
-
-def : InstAlias<"{mov\t$dst.8b, $src.8b|mov.8b\t$dst, $src}",
-                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
-def : InstAlias<"{mov\t$dst.4h, $src.4h|mov.4h\t$dst, $src}",
-                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
-def : InstAlias<"{mov\t$dst.2s, $src.2s|mov.2s\t$dst, $src}",
-                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
-def : InstAlias<"{mov\t$dst.1d, $src.1d|mov.1d\t$dst, $src}",
-                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
-
-def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" #
-                "|cmls.8b\t$dst, $src1, $src2}",
-                (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" #
-                "|cmls.16b\t$dst, $src1, $src2}",
-                (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" #
-                "|cmls.4h\t$dst, $src1, $src2}",
-                (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" #
-                "|cmls.8h\t$dst, $src1, $src2}",
-                (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" #
-                "|cmls.2s\t$dst, $src1, $src2}",
-                (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" #
-                "|cmls.4s\t$dst, $src1, $src2}",
-                (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" #
-                "|cmls.2d\t$dst, $src1, $src2}",
-                (CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" #
-                "|cmlo.8b\t$dst, $src1, $src2}",
-                (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" #
-                "|cmlo.16b\t$dst, $src1, $src2}",
-                (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" #
-                "|cmlo.4h\t$dst, $src1, $src2}",
-                (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" #
-                "|cmlo.8h\t$dst, $src1, $src2}",
-                (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" #
-                "|cmlo.2s\t$dst, $src1, $src2}",
-                (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" #
-                "|cmlo.4s\t$dst, $src1, $src2}",
-                (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" #
-                "|cmlo.2d\t$dst, $src1, $src2}",
-                (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" #
-                "|cmle.8b\t$dst, $src1, $src2}",
-                (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" #
-                "|cmle.16b\t$dst, $src1, $src2}",
-                (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" #
-                "|cmle.4h\t$dst, $src1, $src2}",
-                (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" #
-                "|cmle.8h\t$dst, $src1, $src2}",
-                (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" #
-                "|cmle.2s\t$dst, $src1, $src2}",
-                (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" #
-                "|cmle.4s\t$dst, $src1, $src2}",
-                (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" #
-                "|cmle.2d\t$dst, $src1, $src2}",
-                (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" #
-                "|cmlt.8b\t$dst, $src1, $src2}",
-                (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" #
-                "|cmlt.16b\t$dst, $src1, $src2}",
-                (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" #
-                "|cmlt.4h\t$dst, $src1, $src2}",
-                (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" #
-                "|cmlt.8h\t$dst, $src1, $src2}",
-                (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" #
-                "|cmlt.2s\t$dst, $src1, $src2}",
-                (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" #
-                "|cmlt.4s\t$dst, $src1, $src2}",
-                (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" #
-                "|cmlt.2d\t$dst, $src1, $src2}",
-                (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" #
-                "|fcmle.2s\t$dst, $src1, $src2}",
-                (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" #
-                "|fcmle.4s\t$dst, $src1, $src2}",
-                (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" #
-                "|fcmle.2d\t$dst, $src1, $src2}",
-                (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" #
-                "|fcmlt.2s\t$dst, $src1, $src2}",
-                (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" #
-                "|fcmlt.4s\t$dst, $src1, $src2}",
-                (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" #
-                "|fcmlt.2d\t$dst, $src1, $src2}",
-                (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" #
-                "|facle.2s\t$dst, $src1, $src2}",
-                (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" #
-                "|facle.4s\t$dst, $src1, $src2}",
-                (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" #
-                "|facle.2d\t$dst, $src1, $src2}",
-                (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" #
-                "|faclt.2s\t$dst, $src1, $src2}",
-                (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" #
-                "|faclt.4s\t$dst, $src1, $src2}",
-                (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" #
-                "|faclt.2d\t$dst, $src1, $src2}",
-                (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD three scalar instructions.
-//===----------------------------------------------------------------------===//
-
-defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>;
-defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", ARM64cmeq>;
-defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", ARM64cmge>;
-defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", ARM64cmgt>;
-defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", ARM64cmhi>;
-defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", ARM64cmhs>;
-defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", ARM64cmtst>;
-defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_arm64_sisd_fabd>;
-def : Pat<(v1f64 (int_arm64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FABD64 FPR64:$Rn, FPR64:$Rm)>;
-defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge",
-                                  int_arm64_neon_facge>;
-defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt",
-                                  int_arm64_neon_facgt>;
-defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", ARM64fcmeq>;
-defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", ARM64fcmge>;
-defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", ARM64fcmgt>;
-defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_arm64_neon_fmulx>;
-defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_arm64_neon_frecps>;
-defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_arm64_neon_frsqrts>;
-defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_arm64_neon_sqadd>;
-defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_arm64_neon_sqdmulh>;
-defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_arm64_neon_sqrdmulh>;
-defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_arm64_neon_sqrshl>;
-defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_arm64_neon_sqshl>;
-defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_arm64_neon_sqsub>;
-defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_arm64_neon_srshl>;
-defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_arm64_neon_sshl>;
-defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>;
-defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_arm64_neon_uqadd>;
-defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_arm64_neon_uqrshl>;
-defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_arm64_neon_uqshl>;
-defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_arm64_neon_uqsub>;
-defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_arm64_neon_urshl>;
-defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_arm64_neon_ushl>;
-
-def : InstAlias<"cmls $dst, $src1, $src2",
-                (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-def : InstAlias<"cmle $dst, $src1, $src2",
-                (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-def : InstAlias<"cmlo $dst, $src1, $src2",
-                (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-def : InstAlias<"cmlt $dst, $src1, $src2",
-                (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-def : InstAlias<"fcmle $dst, $src1, $src2",
-                (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1)>;
-def : InstAlias<"fcmle $dst, $src1, $src2",
-                (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-def : InstAlias<"fcmlt $dst, $src1, $src2",
-                (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1)>;
-def : InstAlias<"fcmlt $dst, $src1, $src2",
-                (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-def : InstAlias<"facle $dst, $src1, $src2",
-                (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1)>;
-def : InstAlias<"facle $dst, $src1, $src2",
-                (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-def : InstAlias<"faclt $dst, $src1, $src2",
-                (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1)>;
-def : InstAlias<"faclt $dst, $src1, $src2",
-                (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD three scalar instructions (mixed operands).
-//===----------------------------------------------------------------------===//
-defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
-                                      int_arm64_neon_sqdmulls_scalar>;
-defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
-defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;
-
-def : Pat<(i64 (int_arm64_neon_sqadd (i64 FPR64:$Rd),
-                   (i64 (int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
-                                                        (i32 FPR32:$Rm))))),
-          (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
-def : Pat<(i64 (int_arm64_neon_sqsub (i64 FPR64:$Rd),
-                   (i64 (int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
-                                                        (i32 FPR32:$Rm))))),
-          (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD two scalar instructions.
-//===----------------------------------------------------------------------===//
-
-defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_arm64_neon_abs>;
-defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", ARM64cmeqz>;
-defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", ARM64cmgez>;
-defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", ARM64cmgtz>;
-defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", ARM64cmlez>;
-defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", ARM64cmltz>;
-defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", ARM64fcmeqz>;
-defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", ARM64fcmgez>;
-defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", ARM64fcmgtz>;
-defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", ARM64fcmlez>;
-defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", ARM64fcmltz>;
-defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">;
-defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">;
-defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">;
-defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">;
-defm FCVTNS : SIMDTwoScalarSD< 0, 0, 0b11010, "fcvtns">;
-defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">;
-defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">;
-defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">;
-def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
-defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">;
-defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">;
-defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">;
-defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">;
-defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">;
-defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg",
-                           UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
-defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", ARM64sitof>;
-defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_arm64_neon_sqabs>;
-defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_arm64_neon_sqneg>;
-defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_arm64_neon_scalar_sqxtn>;
-defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_arm64_neon_scalar_sqxtun>;
-defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd",
-                                     int_arm64_neon_suqadd>;
-defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", ARM64uitof>;
-defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_arm64_neon_scalar_uqxtn>;
-defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
-                                     int_arm64_neon_usqadd>;
-
-def : Pat<(v1i64 (int_arm64_neon_fcvtas (v1f64 FPR64:$Rn))),
-          (FCVTASv1i64 FPR64:$Rn)>;
-def : Pat<(v1i64 (int_arm64_neon_fcvtau (v1f64 FPR64:$Rn))),
-          (FCVTAUv1i64 FPR64:$Rn)>;
-def : Pat<(v1i64 (int_arm64_neon_fcvtms (v1f64 FPR64:$Rn))),
-          (FCVTMSv1i64 FPR64:$Rn)>;
-def : Pat<(v1i64 (int_arm64_neon_fcvtmu (v1f64 FPR64:$Rn))),
-          (FCVTMUv1i64 FPR64:$Rn)>;
-def : Pat<(v1i64 (int_arm64_neon_fcvtns (v1f64 FPR64:$Rn))),
-          (FCVTNSv1i64 FPR64:$Rn)>;
-def : Pat<(v1i64 (int_arm64_neon_fcvtnu (v1f64 FPR64:$Rn))),
-          (FCVTNUv1i64 FPR64:$Rn)>;
-def : Pat<(v1i64 (int_arm64_neon_fcvtps (v1f64 FPR64:$Rn))),
-          (FCVTPSv1i64 FPR64:$Rn)>;
-def : Pat<(v1i64 (int_arm64_neon_fcvtpu (v1f64 FPR64:$Rn))),
-          (FCVTPUv1i64 FPR64:$Rn)>;
-
-def : Pat<(f32 (int_arm64_neon_frecpe (f32 FPR32:$Rn))),
-          (FRECPEv1i32 FPR32:$Rn)>;
-def : Pat<(f64 (int_arm64_neon_frecpe (f64 FPR64:$Rn))),
-          (FRECPEv1i64 FPR64:$Rn)>;
-def : Pat<(v1f64 (int_arm64_neon_frecpe (v1f64 FPR64:$Rn))),
-          (FRECPEv1i64 FPR64:$Rn)>;
-
-def : Pat<(f32 (int_arm64_neon_frecpx (f32 FPR32:$Rn))),
-          (FRECPXv1i32 FPR32:$Rn)>;
-def : Pat<(f64 (int_arm64_neon_frecpx (f64 FPR64:$Rn))),
-          (FRECPXv1i64 FPR64:$Rn)>;
-
-def : Pat<(f32 (int_arm64_neon_frsqrte (f32 FPR32:$Rn))),
-          (FRSQRTEv1i32 FPR32:$Rn)>;
-def : Pat<(f64 (int_arm64_neon_frsqrte (f64 FPR64:$Rn))),
-          (FRSQRTEv1i64 FPR64:$Rn)>;
-def : Pat<(v1f64 (int_arm64_neon_frsqrte (v1f64 FPR64:$Rn))),
-          (FRSQRTEv1i64 FPR64:$Rn)>;
-
-// If an integer is about to be converted to a floating point value,
-// just load it on the floating point unit.
-// Here are the patterns for 8 and 16-bits to float.
-// 8-bits -> float.
-def : Pat <(f32 (uint_to_fp (i32 (zextloadi8 ro_indexed8:$addr)))),
-           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
-                          (LDRBro ro_indexed8:$addr), bsub))>;
-def : Pat <(f32 (uint_to_fp (i32 (zextloadi8 am_indexed8:$addr)))),
-           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
-                          (LDRBui am_indexed8:$addr), bsub))>;
-def : Pat <(f32 (uint_to_fp (i32 (zextloadi8 am_unscaled8:$addr)))),
-           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
-                          (LDURBi am_unscaled8:$addr), bsub))>;
-// 16-bits -> float.
-def : Pat <(f32 (uint_to_fp (i32 (zextloadi16 ro_indexed16:$addr)))),
-           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
-                          (LDRHro ro_indexed16:$addr), hsub))>;
-def : Pat <(f32 (uint_to_fp (i32 (zextloadi16 am_indexed16:$addr)))),
-           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
-                          (LDRHui am_indexed16:$addr), hsub))>;
-def : Pat <(f32 (uint_to_fp (i32 (zextloadi16 am_unscaled16:$addr)))),
-           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
-                          (LDURHi am_unscaled16:$addr), hsub))>;
-// 32-bits are handled in target specific dag combine:
-// performIntToFpCombine.
-// 64-bits integer to 32-bits floating point, not possible with
-// UCVTF on floating point registers (both source and destination
-// must have the same size).
-
-// Here are the patterns for 8, 16, 32, and 64-bits to double.
-// 8-bits -> double.
-def : Pat <(f64 (uint_to_fp (i32 (zextloadi8 ro_indexed8:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDRBro ro_indexed8:$addr), bsub))>;
-def : Pat <(f64 (uint_to_fp (i32 (zextloadi8 am_indexed8:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDRBui am_indexed8:$addr), bsub))>;
-def : Pat <(f64 (uint_to_fp (i32 (zextloadi8 am_unscaled8:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDURBi am_unscaled8:$addr), bsub))>;
-// 16-bits -> double.
-def : Pat <(f64 (uint_to_fp (i32 (zextloadi16 ro_indexed16:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDRHro ro_indexed16:$addr), hsub))>;
-def : Pat <(f64 (uint_to_fp (i32 (zextloadi16 am_indexed16:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDRHui am_indexed16:$addr), hsub))>;
-def : Pat <(f64 (uint_to_fp (i32 (zextloadi16 am_unscaled16:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDURHi am_unscaled16:$addr), hsub))>;
-// 32-bits -> double.
-def : Pat <(f64 (uint_to_fp (i32 (load ro_indexed32:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDRSro ro_indexed32:$addr), ssub))>;
-def : Pat <(f64 (uint_to_fp (i32 (load am_indexed32:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDRSui am_indexed32:$addr), ssub))>;
-def : Pat <(f64 (uint_to_fp (i32 (load am_unscaled32:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDURSi am_unscaled32:$addr), ssub))>;
-// 64-bits -> double are handled in target specific dag combine:
-// performIntToFpCombine.
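(Editor's note - a sketch of the shape the patterns above match; function name hypothetical, LLVM 3.5-era syntax. When a small zero-extending load feeds only a uint_to_fp, the byte is loaded straight into an FP register so the conversion never touches a GPR.)

    define float @byte_to_float(i8* %p) {
      %b = load i8* %p             ; "ldr b0, [x0]" - load lands in an FP register
      %w = zext i8 %b to i32
      %f = uitofp i32 %w to float  ; "ucvtf s0, s0" - no GPR->FPR copy needed
      ret float %f
    }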
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD three different-sized vector instructions.
-//===----------------------------------------------------------------------===//
-
-defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_arm64_neon_addhn>;
-defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_arm64_neon_subhn>;
-defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_arm64_neon_raddhn>;
-defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_arm64_neon_rsubhn>;
-defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_arm64_neon_pmull>;
-defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
-                                            int_arm64_neon_sabd>;
-defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
-                                        int_arm64_neon_sabd>;
-defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
-                 BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
-defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
-                 BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
-defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
-    TriOpFrag<(add node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>;
-defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
-    TriOpFrag<(sub node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>;
-defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_arm64_neon_smull>;
-defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
-                                               int_arm64_neon_sqadd>;
-defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
-                                               int_arm64_neon_sqsub>;
-defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
-                                     int_arm64_neon_sqdmull>;
-defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
-                 BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
-defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
-                 BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
-defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
-                                            int_arm64_neon_uabd>;
-defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
-                                        int_arm64_neon_uabd>;
-defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
-                 BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
-defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
-                 BinOpFrag<(add node:$LHS, (zext node:$RHS))>>;
-defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
-    TriOpFrag<(add node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>;
-defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
-    TriOpFrag<(sub node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>;
-defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_arm64_neon_umull>;
-defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
-                 BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
-defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
-                 BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
-
-// Patterns for 64-bit pmull
-def : Pat<(int_arm64_neon_pmull64 V64:$Rn, V64:$Rm),
-          (PMULLv1i64 V64:$Rn, V64:$Rm)>;
-def : Pat<(int_arm64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)),
-                                  (vector_extract (v2i64 V128:$Rm), (i64 1))),
-          (PMULLv2i64 V128:$Rn, V128:$Rm)>;
-
-// CodeGen patterns for addhn and subhn instructions, which can actually be
-// written in LLVM IR without too much difficulty.
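(Editor's note - for example, the IR form that the ADDHN patterns below match; function name hypothetical.)

    define <8 x i8> @addhn(<8 x i16> %a, <8 x i16> %b) {
      %sum = add <8 x i16> %a, %b
      %hi  = lshr <8 x i16> %sum,
                  <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
      ; add, keep the high halves, narrow -> "addhn v0.8b, v0.8h, v1.8h"
      %n   = trunc <8 x i16> %hi to <8 x i8>
      ret <8 x i8> %n
    }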
-
-// ADDHN
-def : Pat<(v8i8 (trunc (v8i16 (ARM64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))),
-          (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
-def : Pat<(v4i16 (trunc (v4i32 (ARM64vlshr (add V128:$Rn, V128:$Rm),
-                                           (i32 16))))),
-          (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
-def : Pat<(v2i32 (trunc (v2i64 (ARM64vlshr (add V128:$Rn, V128:$Rm),
-                                           (i32 32))))),
-          (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
-def : Pat<(concat_vectors (v8i8 V64:$Rd),
-                          (trunc (v8i16 (ARM64vlshr (add V128:$Rn, V128:$Rm),
-                                                    (i32 8))))),
-          (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
-                            V128:$Rn, V128:$Rm)>;
-def : Pat<(concat_vectors (v4i16 V64:$Rd),
-                          (trunc (v4i32 (ARM64vlshr (add V128:$Rn, V128:$Rm),
-                                                    (i32 16))))),
-          (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
-                            V128:$Rn, V128:$Rm)>;
-def : Pat<(concat_vectors (v2i32 V64:$Rd),
-                          (trunc (v2i64 (ARM64vlshr (add V128:$Rn, V128:$Rm),
-                                                    (i32 32))))),
-          (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
-                            V128:$Rn, V128:$Rm)>;
-
-// SUBHN
-def : Pat<(v8i8 (trunc (v8i16 (ARM64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))),
-          (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
-def : Pat<(v4i16 (trunc (v4i32 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
-                                           (i32 16))))),
-          (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
-def : Pat<(v2i32 (trunc (v2i64 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
-                                           (i32 32))))),
-          (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
-def : Pat<(concat_vectors (v8i8 V64:$Rd),
-                          (trunc (v8i16 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
-                                                    (i32 8))))),
-          (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
-                            V128:$Rn, V128:$Rm)>;
-def : Pat<(concat_vectors (v4i16 V64:$Rd),
-                          (trunc (v4i32 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
-                                                    (i32 16))))),
-          (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
-                            V128:$Rn, V128:$Rm)>;
-def : Pat<(concat_vectors (v2i32 V64:$Rd),
-                          (trunc (v2i64 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
-                                                    (i32 32))))),
-          (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
-                            V128:$Rn, V128:$Rm)>;
-
-//----------------------------------------------------------------------------
-// AdvSIMD bitwise extract from vector instruction.
-//----------------------------------------------------------------------------
-
-defm EXT : SIMDBitwiseExtract<"ext">;
-
-def : Pat<(v4i16 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v8i16 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v2i32 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v2f32 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v4i32 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v4f32 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v2i64 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v2f64 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-
-// We use EXT to handle extract_subvector to copy the upper 64-bits of a
-// 128-bit vector.
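(Editor's note - a sketch of the extract_subvector case handled below; function name hypothetical. Taking the high 64-bit half of a 128-bit vector becomes an EXT of the register with itself by 8 bytes, read back through the d sub-register.)

    define <8 x i8> @high_half(<16 x i8> %v) {
      ; "ext v0.16b, v0.16b, v0.16b, #8", result read from d0
      %h = shufflevector <16 x i8> %v, <16 x i8> undef,
             <8 x i32> <i32 8, i32 9, i32 10, i32 11,
                        i32 12, i32 13, i32 14, i32 15>
      ret <8 x i8> %h
    }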
-def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-
-
-//----------------------------------------------------------------------------
-// AdvSIMD zip vector
-//----------------------------------------------------------------------------
-
-defm TRN1 : SIMDZipVector<0b010, "trn1", ARM64trn1>;
-defm TRN2 : SIMDZipVector<0b110, "trn2", ARM64trn2>;
-defm UZP1 : SIMDZipVector<0b001, "uzp1", ARM64uzp1>;
-defm UZP2 : SIMDZipVector<0b101, "uzp2", ARM64uzp2>;
-defm ZIP1 : SIMDZipVector<0b011, "zip1", ARM64zip1>;
-defm ZIP2 : SIMDZipVector<0b111, "zip2", ARM64zip2>;
-
-//----------------------------------------------------------------------------
-// AdvSIMD TBL/TBX instructions
-//----------------------------------------------------------------------------
-
-defm TBL : SIMDTableLookup< 0, "tbl">;
-defm TBX : SIMDTableLookupTied<1, "tbx">;
-
-def : Pat<(v8i8 (int_arm64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
-          (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>;
-def : Pat<(v16i8 (int_arm64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
-          (TBLv16i8One V128:$Ri, V128:$Rn)>;
-
-def : Pat<(v8i8 (int_arm64_neon_tbx1 (v8i8 V64:$Rd),
-                                     (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
-          (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>;
-def : Pat<(v16i8 (int_arm64_neon_tbx1 (v16i8 V128:$Rd),
-                                      (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
-          (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>;
-
-
-//----------------------------------------------------------------------------
-// AdvSIMD scalar CPY instruction
-//----------------------------------------------------------------------------
-
-defm CPY : SIMDScalarCPY<"cpy">;
-
-//----------------------------------------------------------------------------
-// AdvSIMD scalar pairwise instructions
-//----------------------------------------------------------------------------
-
-defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">;
-defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">;
-defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">;
-defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">;
-defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">;
-defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">;
-def : Pat<(i64 (int_arm64_neon_saddv (v2i64 V128:$Rn))),
-          (ADDPv2i64p V128:$Rn)>;
-def : Pat<(i64 (int_arm64_neon_uaddv (v2i64 V128:$Rn))),
-          (ADDPv2i64p V128:$Rn)>;
-def : Pat<(f32 (int_arm64_neon_faddv (v2f32 V64:$Rn))),
-          (FADDPv2i32p V64:$Rn)>;
-def : Pat<(f32 (int_arm64_neon_faddv (v4f32 V128:$Rn))),
-          (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>;
-def : Pat<(f64 (int_arm64_neon_faddv (v2f64 V128:$Rn))),
-          (FADDPv2i64p V128:$Rn)>;
-def : Pat<(f32 (int_arm64_neon_fmaxnmv (v2f32 V64:$Rn))),
-          (FMAXNMPv2i32p V64:$Rn)>;
-def : Pat<(f64 (int_arm64_neon_fmaxnmv (v2f64 V128:$Rn))),
-          (FMAXNMPv2i64p V128:$Rn)>;
-def : Pat<(f32 (int_arm64_neon_fmaxv (v2f32 V64:$Rn))),
-          (FMAXPv2i32p V64:$Rn)>;
-def : Pat<(f64 (int_arm64_neon_fmaxv (v2f64 V128:$Rn))),
-          (FMAXPv2i64p V128:$Rn)>;
-def : Pat<(f32 (int_arm64_neon_fminnmv (v2f32 V64:$Rn))),
-          (FMINNMPv2i32p V64:$Rn)>;
-def : Pat<(f64 (int_arm64_neon_fminnmv (v2f64 V128:$Rn))),
-          (FMINNMPv2i64p V128:$Rn)>;
-def : Pat<(f32 (int_arm64_neon_fminv (v2f32 V64:$Rn))),
-          (FMINPv2i32p V64:$Rn)>;
-def : Pat<(f64 (int_arm64_neon_fminv (v2f64 V128:$Rn))),
-          (FMINPv2i64p V128:$Rn)>;
-
-//----------------------------------------------------------------------------
-// AdvSIMD INS/DUP instructions
-//----------------------------------------------------------------------------
-
-def DUPv8i8gpr : SIMDDupFromMain<0, 0b00001, ".8b", v8i8, V64, GPR32>;
-def DUPv16i8gpr : SIMDDupFromMain<1, 0b00001, ".16b", v16i8, V128, GPR32>;
-def DUPv4i16gpr : SIMDDupFromMain<0, 0b00010, ".4h", v4i16, V64, GPR32>;
-def DUPv8i16gpr : SIMDDupFromMain<1, 0b00010, ".8h", v8i16, V128, GPR32>;
-def DUPv2i32gpr : SIMDDupFromMain<0, 0b00100, ".2s", v2i32, V64, GPR32>;
-def DUPv4i32gpr : SIMDDupFromMain<1, 0b00100, ".4s", v4i32, V128, GPR32>;
-def DUPv2i64gpr : SIMDDupFromMain<1, 0b01000, ".2d", v2i64, V128, GPR64>;
-
-def DUPv2i64lane : SIMDDup64FromElement;
-def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>;
-def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>;
-def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>;
-def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>;
-def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
-def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;
-
-def : Pat<(v2f32 (ARM64dup (f32 FPR32:$Rn))),
-          (v2f32 (DUPv2i32lane
-            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
-            (i64 0)))>;
-def : Pat<(v4f32 (ARM64dup (f32 FPR32:$Rn))),
-          (v4f32 (DUPv4i32lane
-            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
-            (i64 0)))>;
-def : Pat<(v2f64 (ARM64dup (f64 FPR64:$Rn))),
-          (v2f64 (DUPv2i64lane
-            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub),
-            (i64 0)))>;
-
-def : Pat<(v2f32 (ARM64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
-          (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
-def : Pat<(v4f32 (ARM64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
-          (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>;
-def : Pat<(v2f64 (ARM64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
-          (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
-
-defm SMOV : SMov;
-defm UMOV : UMov;
-
-def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
-          (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
-          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
-          (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
-
-// Extracting i8 or i16 elements will have the zero-extend transformed to
-// an 'and' mask by type legalization since neither i8 nor i16 are legal types
-// for ARM64. Match these patterns here since UMOV already zeroes out the high
-// bits of the destination register.
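(Editor's note - a sketch of the legalized shape described above; function name hypothetical. The zext of an extracted i8 becomes (and (vector_extract ...), 0xff), which the patterns below fold into a single UMOV.)

    define i32 @lane3(<16 x i8> %v) {
      %e = extractelement <16 x i8> %v, i32 3
      %z = zext i8 %e to i32   ; legalized to an 'and' mask -> "umov w0, v0.b[3]"
      ret i32 %z
    }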
-def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx),
-               (i32 0xff)),
-          (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>;
-def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),
-               (i32 0xffff)),
-          (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>;
-
-defm INS : SIMDIns;
-
-def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
-          (INSvi8gpr (v16i8 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn)>;
-def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
-          (EXTRACT_SUBREG
-            (INSvi8gpr (v16i8 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn), dsub)>;
-
-def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
-          (INSvi16gpr (v8i16 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn)>;
-def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
-          (EXTRACT_SUBREG
-            (INSvi16gpr (v8i16 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn), dsub)>;
-
-def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
-          (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
-                                (i32 FPR32:$Rn), ssub))>;
-def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
-          (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
-                                (i32 FPR32:$Rn), ssub))>;
-def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
-          (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
-                                (i64 FPR64:$Rn), dsub))>;
-
-def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
-          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
-def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
-          (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
-def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
-          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
-
-def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
-                                (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
-          (EXTRACT_SUBREG
-            (INSvi32lane
-              (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)),
-              VectorIndexS:$imm,
-              (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
-              (i64 0)),
-            dsub)>;
-def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn),
-                                (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
-          (INSvi32lane
-            V128:$Rn, VectorIndexS:$imm,
-            (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
-            (i64 0))>;
-def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn),
-                                (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))),
-          (INSvi64lane
-            V128:$Rn, VectorIndexD:$imm,
-            (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)),
-            (i64 0))>;
-
-// Copy an element at a constant index in one vector into a constant indexed
-// element of another.
-// FIXME refactor to a shared class/def parameterized on vector type, vector
-// index type and INS extension
-def : Pat<(v16i8 (int_arm64_neon_vcopy_lane
-                   (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs),
-                   VectorIndexB:$idx2)),
-          (v16i8 (INSvi8lane
-                   V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2)
-          )>;
-def : Pat<(v8i16 (int_arm64_neon_vcopy_lane
-                   (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs),
-                   VectorIndexH:$idx2)),
-          (v8i16 (INSvi16lane
-                   V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2)
-          )>;
-def : Pat<(v4i32 (int_arm64_neon_vcopy_lane
-                   (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs),
-                   VectorIndexS:$idx2)),
-          (v4i32 (INSvi32lane
-                   V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2)
-          )>;
-def : Pat<(v2i64 (int_arm64_neon_vcopy_lane
-                   (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs),
-                   VectorIndexD:$idx2)),
-          (v2i64 (INSvi64lane
-                   V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2)
-          )>;
-
-// Floating point vector extractions are codegen'd as either a sequence of
-// subregister extractions, possibly fed by an INS if the lane number is
-// anything other than zero.
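(Editor's note - a sketch of the two cases described above; function names hypothetical.)

    define float @lane0(<4 x float> %v) {
      %e = extractelement <4 x float> %v, i32 0  ; s0 already is lane 0: free
      ret float %e
    }
    define float @lane2(<4 x float> %v) {
      %e = extractelement <4 x float> %v, i32 2  ; "ins v0.s[0], v0.s[2]", read s0
      ret float %e
    }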
-def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
-          (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
-def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
-          (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
-def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
-          (f64 (EXTRACT_SUBREG
-            (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0,
-                         V128:$Rn, VectorIndexD:$idx),
-            dsub))>;
-def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
-          (f32 (EXTRACT_SUBREG
-            (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0,
-                         V128:$Rn, VectorIndexS:$idx),
-            ssub))>;
-
-// All concat_vectors operations are canonicalised to act on i64 vectors for
-// ARM64. In the general case we need an instruction, which may just as well be
-// INS.
-class ConcatPat<ValueType DstTy, ValueType SrcTy>
-  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
-        (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
-                     (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;
-
-def : ConcatPat<v2i64, v1i64>;
-def : ConcatPat<v2f64, v1f64>;
-def : ConcatPat<v4i32, v2i32>;
-def : ConcatPat<v4f32, v2f32>;
-def : ConcatPat<v8i16, v4i16>;
-def : ConcatPat<v16i8, v8i8>;
-
-// If the high lanes are undef, though, we can just ignore them:
-class ConcatUndefPat<ValueType DstTy, ValueType SrcTy>
-  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
-        (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;
-
-def : ConcatUndefPat<v2i64, v1i64>;
-def : ConcatUndefPat<v2f64, v1f64>;
-def : ConcatUndefPat<v4i32, v2i32>;
-def : ConcatUndefPat<v4f32, v2f32>;
-def : ConcatUndefPat<v8i16, v4i16>;
-def : ConcatUndefPat<v16i8, v8i8>;
-
-//----------------------------------------------------------------------------
-// AdvSIMD across lanes instructions
-//----------------------------------------------------------------------------
-
-defm ADDV    : SIMDAcrossLanesBHS<0, 0b11011, "addv">;
-defm SMAXV   : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">;
-defm SMINV   : SIMDAcrossLanesBHS<0, 0b11010, "sminv">;
-defm UMAXV   : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
-defm UMINV   : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
-defm SADDLV  : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
-defm UADDLV  : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
-defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_arm64_neon_fmaxnmv>;
-defm FMAXV   : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_arm64_neon_fmaxv>;
-defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_arm64_neon_fminnmv>;
-defm FMINV   : SIMDAcrossLanesS<0b01111, 1, "fminv", int_arm64_neon_fminv>;
-
-multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc,
-                                          SDPatternOperator intOp> {
-// If there is a sign extension after this intrinsic, consume it, as smov
-// already performed it
-  def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)),
-        (i32 (SMOVvi8to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
-          (i64 0)))>;
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
-        (i32 (SMOVvi8to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
-          (i64 0)))>;
-// If there is a sign extension after this intrinsic, consume it, as smov
-// already performed it
-def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)),
-      (i32 (SMOVvi8to32
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
-        (i64 0)))>;
-def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
-      (i32 (SMOVvi8to32
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
-        (i64 0)))>;
-// If there is a sign extension after this intrinsic, consume it, as smov
-// already performed it
-def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)),
-      (i32 (SMOVvi16to32
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
-        (i64 0)))>;
-def : Pat<(i32 (intOp
-          (v4i16 V64:$Rn))),
-      (i32 (SMOVvi16to32
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
-        (i64 0)))>;
-// If there is a sign extension after this intrinsic, consume it, as smov
-// already performed it
-def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)),
-      (i32 (SMOVvi16to32
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
-        (i64 0)))>;
-def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
-      (i32 (SMOVvi16to32
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
-        (i64 0)))>;
-
-def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
-      (i32 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
-        ssub))>;
-}
-
-multiclass SIMDAcrossLanesUnsignedIntrinsic<string baseOpc,
-                                            SDPatternOperator intOp> {
-// If there is a masking operation keeping only what has been actually
-// generated, consume it.
-  def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
-          ssub))>;
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
-          ssub))>;
-// If there is a masking operation keeping only what has been actually
-// generated, consume it.
-def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)),
-      (i32 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
-        ssub))>;
-def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
-      (i32 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
-        ssub))>;
-
-// If there is a masking operation keeping only what has been actually
-// generated, consume it.
-def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)),
-      (i32 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
-        ssub))>;
-def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
-      (i32 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
-        ssub))>;
-// If there is a masking operation keeping only what has been actually
-// generated, consume it.
-def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)),
-      (i32 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
-        ssub))>;
-def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
-      (i32 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
-        ssub))>;
-
-def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
-      (i32 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
-        ssub))>;
-
-}
-
-multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc,
-                                              SDPatternOperator intOp> {
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
-        (i32 (SMOVvi16to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
-          (i64 0)))>;
-def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
-      (i32 (SMOVvi16to32
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
-        (i64 0)))>;
-
-def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
-      (i32 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
-        ssub))>;
-def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
-      (i32 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
-        ssub))>;
-
-def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
-      (i64 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
-        dsub))>;
-}
-
-multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
-                                                SDPatternOperator intOp> {
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
-          ssub))>;
-def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
-      (i32 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
-        ssub))>;
-
-def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
-      (i32 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
-        ssub))>;
-def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
-      (i32 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
-        ssub))>;
-
-def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
-      (i64 (EXTRACT_SUBREG
-        (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-          (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
-        dsub))>;
-}
-
-defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", int_arm64_neon_saddv>;
-// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
-def : Pat<(i32 (int_arm64_neon_saddv (v2i32 V64:$Rn))),
-          (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>;
-
-defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", int_arm64_neon_uaddv>;
-// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
-def : Pat<(i32 (int_arm64_neon_uaddv (v2i32 V64:$Rn))),
-          (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>;
-
-defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_arm64_neon_smaxv>;
-def : Pat<(i32 (int_arm64_neon_smaxv (v2i32 V64:$Rn))),
-          (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>;
-
-defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_arm64_neon_sminv>;
-def : Pat<(i32 (int_arm64_neon_sminv (v2i32 V64:$Rn))),
-          (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>;
-
-defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_arm64_neon_umaxv>;
-def : Pat<(i32 (int_arm64_neon_umaxv (v2i32 V64:$Rn))),
-          (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>;
-
-defm :
SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_arm64_neon_uminv>; -def : Pat<(i32 (int_arm64_neon_uminv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_arm64_neon_saddlv>; -defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_arm64_neon_uaddlv>; - -// The vaddlv_s32 intrinsic gets mapped to SADDLP. -def : Pat<(i64 (int_arm64_neon_saddlv (v2i32 V64:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (SADDLPv2i32_v1i64 V64:$Rn), dsub), - dsub))>; -// The vaddlv_u32 intrinsic gets mapped to UADDLP. -def : Pat<(i64 (int_arm64_neon_uaddlv (v2i32 V64:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (UADDLPv2i32_v1i64 V64:$Rn), dsub), - dsub))>; - -//------------------------------------------------------------------------------ -// AdvSIMD modified immediate instructions -//------------------------------------------------------------------------------ - -// AdvSIMD BIC -defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", ARM64bici>; -// AdvSIMD ORR -defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", ARM64orri>; - - -// AdvSIMD FMOV -def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, - "fmov", ".2d", - [(set (v2f64 V128:$Rd), (ARM64fmov imm0_255:$imm8))]>; -def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, - "fmov", ".2s", - [(set (v2f32 V64:$Rd), (ARM64fmov imm0_255:$imm8))]>; -def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, - "fmov", ".4s", - [(set (v4f32 V128:$Rd), (ARM64fmov imm0_255:$imm8))]>; - -// AdvSIMD MOVI - -// EDIT byte mask: scalar -let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi", - [(set FPR64:$Rd, simdimmtype10:$imm8)]>; -// The movi_edit node has the immediate value already encoded, so we use -// a plain imm0_255 here. -def : Pat<(f64 (ARM64movi_edit imm0_255:$shift)), - (MOVID imm0_255:$shift)>; - -def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>; - -def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; - -// EDIT byte mask: 2d - -// The movi_edit node has the immediate value already encoded, so we use -// a plain imm0_255 in the pattern -let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, - simdimmtype10, - "movi", ".2d", - [(set (v2i64 V128:$Rd), (ARM64movi_edit imm0_255:$imm8))]>; - - -// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. -// Complexity is added to break a tie with a plain MOVI. 
-let AddedComplexity = 1 in { -def : Pat<(f32 fpimm0), - (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, - Requires<[HasZCZ]>; -def : Pat<(f64 fpimm0), - (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, - Requires<[HasZCZ]>; -} - -def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; -def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; -def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; -def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>; - -def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>; -def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>; -def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>; -def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>; - -// EDIT per word & halfword: 2s, 4h, 4s, & 8h -defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">; -def : Pat<(v2i32 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv2i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i32 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv4i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i16 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv4i16 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v8i16 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv8i16 imm0_255:$imm8, imm:$shift)>; - -// EDIT per word: 2s & 4s with MSL shifter -def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s", - [(set (v2i32 V64:$Rd), - (ARM64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; -def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", - [(set (v4i32 V128:$Rd), - (ARM64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; - -// Per byte: 8b & 16b -def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, - "movi", ".8b", - [(set (v8i8 V64:$Rd), (ARM64movi imm0_255:$imm8))]>; -def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, - "movi", ".16b", - [(set (v16i8 V128:$Rd), (ARM64movi imm0_255:$imm8))]>; - -// AdvSIMD MVNI - -// EDIT per word & halfword: 2s, 4h, 4s, & 8h -defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">; -def : Pat<(v2i32 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv2i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i32 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv4i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i16 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv4i16 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v8i16 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv8i16 imm0_255:$imm8, imm:$shift)>; - -// EDIT per word: 2s & 4s with MSL shifter -def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s", - [(set (v2i32 V64:$Rd), - (ARM64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; -def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", - [(set (v4i32 V128:$Rd), - (ARM64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; - -//---------------------------------------------------------------------------- -// AdvSIMD indexed element -//---------------------------------------------------------------------------- - -let neverHasSideEffects = 1 in { - defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">; - defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">; -} - -// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the -// instruction expects the addend first, while the intrinsic expects it last. 
- -// On the other hand, there are quite a few valid combinatorial options due to -// the commutativity of multiplication and the fact that (-x) * y = x * (-y). -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", - TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; - -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; - -multiclass FMLSIndexedAfterNegPatterns { - // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit - // and DUP scalar. - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64duplane32 (v4f32 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (v2f32 (ARM64duplane32 - (v4f32 (insert_subvector undef, - (v2f32 (fneg V64:$Rm)), - (i32 0))), - VectorIndexS:$idx)))), - (FMLSv2i32_indexed V64:$Rd, V64:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), - VectorIndexS:$idx)>; - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64dup (f32 (fneg FPR32Op:$Rm))))), - (FMLSv2i32_indexed V64:$Rd, V64:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit - // and DUP scalar. - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64duplane32 (v4f32 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm, - VectorIndexS:$idx)>; - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (v4f32 (ARM64duplane32 - (v4f32 (insert_subvector undef, - (v2f32 (fneg V64:$Rm)), - (i32 0))), - VectorIndexS:$idx)))), - (FMLSv4i32_indexed V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), - VectorIndexS:$idx)>; - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64dup (f32 (fneg FPR32Op:$Rm))))), - (FMLSv4i32_indexed V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar - // (DUPLANE from 64-bit would be trivial). 
- def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64duplane64 (v2f64 (fneg V128:$Rm)), - VectorIndexD:$idx))), - (FMLSv2i64_indexed - V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64dup (f64 (fneg FPR64Op:$Rm))))), - (FMLSv2i64_indexed V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; - - // 2 variants for 32-bit scalar version: extract from .2s or from .4s - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v4f32 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, - V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 (fneg V64:$Rm)), - VectorIndexS:$idx))), - (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; - - // 1 variant for 64-bit scalar version: extract from .1d or from .2d - def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), - (vector_extract (v2f64 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn, - V128:$Rm, VectorIndexS:$idx)>; -} - -defm : FMLSIndexedAfterNegPatterns< - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; -defm : FMLSIndexedAfterNegPatterns< - TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; - -defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_arm64_neon_fmulx>; -defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>; - -def : Pat<(v2f32 (fmul V64:$Rn, (ARM64dup (f32 FPR32:$Rm)))), - (FMULv2i32_indexed V64:$Rn, - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), - (i64 0))>; -def : Pat<(v4f32 (fmul V128:$Rn, (ARM64dup (f32 FPR32:$Rm)))), - (FMULv4i32_indexed V128:$Rn, - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), - (i64 0))>; -def : Pat<(v2f64 (fmul V128:$Rn, (ARM64dup (f64 FPR64:$Rm)))), - (FMULv2i64_indexed V128:$Rn, - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub), - (i64 0))>; - -defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_arm64_neon_sqdmulh>; -defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_arm64_neon_sqrdmulh>; -defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", - TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>; -defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", - TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>; -defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; -defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", - TriOpFrag<(add node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>; -defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl", - TriOpFrag<(sub node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>; -defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", - int_arm64_neon_smull>; -defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", - int_arm64_neon_sqadd>; -defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", - int_arm64_neon_sqsub>; -defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_arm64_neon_sqdmull>; -defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", - TriOpFrag<(add node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>; -defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl", - TriOpFrag<(sub node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>; -defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", - int_arm64_neon_umull>; - -// A scalar sqdmull with the second operand being a vector lane can be -// handled directly with the indexed instruction 
encoding. -def : Pat<(int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn), - (vector_extract (v4i32 V128:$Vm), - VectorIndexS:$idx)), - (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>; - -//---------------------------------------------------------------------------- -// AdvSIMD scalar shift instructions -//---------------------------------------------------------------------------- -defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; -defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; -defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; -defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; -// Codegen patterns for the above. We don't put these directly on the -// instructions because TableGen's type inference can't handle the truth. -// Having the same base pattern for fp <--> int totally freaks it out. -def : Pat<(int_arm64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm), - (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(int_arm64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm), - (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(i64 (int_arm64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)), - (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(i64 (int_arm64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)), - (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1i64 (int_arm64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn), - vecshiftR64:$imm)), - (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1i64 (int_arm64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn), - vecshiftR64:$imm)), - (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(int_arm64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm), - (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(int_arm64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm), - (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(f64 (int_arm64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), - (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(f64 (int_arm64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), - (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1f64 (int_arm64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn), - vecshiftR64:$imm)), - (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1f64 (int_arm64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn), - vecshiftR64:$imm)), - (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; - -defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", ARM64vshl>; -defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">; -defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn", - int_arm64_neon_sqrshrn>; -defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun", - int_arm64_neon_sqrshrun>; -defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", ARM64sqshlui>; -defm SQSHL : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", ARM64sqshli>; -defm SQSHRN : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn", - int_arm64_neon_sqshrn>; -defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun", - int_arm64_neon_sqshrun>; -defm SRI : SIMDScalarRShiftDTied< 1, 0b01000, "sri">; -defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", ARM64srshri>; -defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra", - TriOpFrag<(add node:$LHS, - (ARM64srshri node:$MHS, node:$RHS))>>; -defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", ARM64vashr>; -defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra", - TriOpFrag<(add node:$LHS, - (ARM64vashr node:$MHS, node:$RHS))>>; -defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn", - int_arm64_neon_uqrshrn>; -defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", ARM64uqshli>; -defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn", - 
int_arm64_neon_uqshrn>; -defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", ARM64urshri>; -defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra", - TriOpFrag<(add node:$LHS, - (ARM64urshri node:$MHS, node:$RHS))>>; -defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", ARM64vlshr>; -defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", - TriOpFrag<(add node:$LHS, - (ARM64vlshr node:$MHS, node:$RHS))>>; - -//---------------------------------------------------------------------------- -// AdvSIMD vector shift instructions -//---------------------------------------------------------------------------- -defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_arm64_neon_vcvtfp2fxs>; -defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_arm64_neon_vcvtfp2fxu>; -defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", - int_arm64_neon_vcvtfxs2fp>; -defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", - int_arm64_neon_rshrn>; -defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", ARM64vshl>; -defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn", - BinOpFrag<(trunc (ARM64vashr node:$LHS, node:$RHS))>>; -defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_arm64_neon_vsli>; -def : Pat<(v1i64 (int_arm64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), - (i32 vecshiftL64:$imm))), - (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; -defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", - int_arm64_neon_sqrshrn>; -defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun", - int_arm64_neon_sqrshrun>; -defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", ARM64sqshlui>; -defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", ARM64sqshli>; -defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", - int_arm64_neon_sqshrn>; -defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", - int_arm64_neon_sqshrun>; -defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_arm64_neon_vsri>; -def : Pat<(v1i64 (int_arm64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), - (i32 vecshiftR64:$imm))), - (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; -defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", ARM64srshri>; -defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra", - TriOpFrag<(add node:$LHS, - (ARM64srshri node:$MHS, node:$RHS))> >; -defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", - BinOpFrag<(ARM64vshl (sext node:$LHS), node:$RHS)>>; - -defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", ARM64vashr>; -defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", - TriOpFrag<(add node:$LHS, (ARM64vashr node:$MHS, node:$RHS))>>; -defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", - int_arm64_neon_vcvtfxu2fp>; -defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", - int_arm64_neon_uqrshrn>; -defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", ARM64uqshli>; -defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn", - int_arm64_neon_uqshrn>; -defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", ARM64urshri>; -defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra", - TriOpFrag<(add node:$LHS, - (ARM64urshri node:$MHS, node:$RHS))> >; -defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll", - BinOpFrag<(ARM64vshl (zext node:$LHS), node:$RHS)>>; -defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", ARM64vlshr>; -defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra", - TriOpFrag<(add node:$LHS, (ARM64vlshr node:$MHS, node:$RHS))> >; - -// SHRN patterns for when a logical right shift was used instead of arithmetic -// (the 
-// immediate guarantees no sign bits actually end up in the result so it
-// doesn't matter).
-def : Pat<(v8i8 (trunc (ARM64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))),
-          (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>;
-def : Pat<(v4i16 (trunc (ARM64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))),
-          (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>;
-def : Pat<(v2i32 (trunc (ARM64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))),
-          (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>;
-
-def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd),
-                                 (trunc (ARM64vlshr (v8i16 V128:$Rn),
-                                                    vecshiftR16Narrow:$imm)))),
-          (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                           V128:$Rn, vecshiftR16Narrow:$imm)>;
-def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd),
-                                 (trunc (ARM64vlshr (v4i32 V128:$Rn),
-                                                    vecshiftR32Narrow:$imm)))),
-          (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                           V128:$Rn, vecshiftR32Narrow:$imm)>;
-def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd),
-                                 (trunc (ARM64vlshr (v2i64 V128:$Rn),
-                                                    vecshiftR64Narrow:$imm)))),
-          (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                           V128:$Rn, vecshiftR32Narrow:$imm)>;
-
-// Vector sign and zero extensions are implemented with SSHLL and USHLL.
-// Anyexts are implemented as zexts.
-def : Pat<(v8i16 (sext   (v8i8 V64:$Rn))),  (SSHLLv8i8_shift  V64:$Rn, (i32 0))>;
-def : Pat<(v8i16 (zext   (v8i8 V64:$Rn))),  (USHLLv8i8_shift  V64:$Rn, (i32 0))>;
-def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))),  (USHLLv8i8_shift  V64:$Rn, (i32 0))>;
-def : Pat<(v4i32 (sext   (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>;
-def : Pat<(v4i32 (zext   (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
-def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
-def : Pat<(v2i64 (sext   (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
-def : Pat<(v2i64 (zext   (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
-def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
-// Also match an extend from the upper half of a 128-bit source register.
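(Illustration: the upper-half extend mentioned above, as a C++ sketch with the
ACLE intrinsics; the function name is invented.)

  #include <arm_neon.h>
  // Widening the high half of a Q register selects sshll2/ushll2 with a zero
  // shift directly, rather than an extract followed by a separate extend.
  int16x8_t widen_high(int8x16_t v) { return vmovl_high_s8(v); }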
-def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), - (USHLLv16i8_shift V128:$Rn, (i32 0))>; -def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), - (USHLLv16i8_shift V128:$Rn, (i32 0))>; -def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), - (SSHLLv16i8_shift V128:$Rn, (i32 0))>; -def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), - (USHLLv8i16_shift V128:$Rn, (i32 0))>; -def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), - (USHLLv8i16_shift V128:$Rn, (i32 0))>; -def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), - (SSHLLv8i16_shift V128:$Rn, (i32 0))>; -def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), - (USHLLv4i32_shift V128:$Rn, (i32 0))>; -def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), - (USHLLv4i32_shift V128:$Rn, (i32 0))>; -def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), - (SSHLLv4i32_shift V128:$Rn, (i32 0))>; - -// Vector shift sxtl aliases -def : InstAlias<"sxtl.8h $dst, $src1", - (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl $dst.8h, $src1.8b", - (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl.4s $dst, $src1", - (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl $dst.4s, $src1.4h", - (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl.2d $dst, $src1", - (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl $dst.2d, $src1.2s", - (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; - -// Vector shift sxtl2 aliases -def : InstAlias<"sxtl2.8h $dst, $src1", - (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2 $dst.8h, $src1.16b", - (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2.4s $dst, $src1", - (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2 $dst.4s, $src1.8h", - (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2.2d $dst, $src1", - (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2 $dst.2d, $src1.4s", - (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; - -// Vector shift uxtl aliases -def : InstAlias<"uxtl.8h $dst, $src1", - (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl $dst.8h, $src1.8b", - (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl.4s $dst, $src1", - (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl $dst.4s, $src1.4h", - (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl.2d $dst, $src1", - (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl $dst.2d, $src1.2s", - (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; - -// Vector shift uxtl2 aliases -def : InstAlias<"uxtl2.8h $dst, $src1", - (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2 $dst.8h, $src1.16b", - (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2.4s $dst, $src1", - (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2 $dst.4s, $src1.8h", - (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2.2d $dst, $src1", - (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2 $dst.2d, $src1.4s", - (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; - -// If an integer is about to be converted to a floating point value, -// just load it on the floating point unit. -// These patterns are more complex because floating point loads do not -// support sign extension. 
-// The sign extension has to be explicitly added and is only supported for -// one step: byte-to-half, half-to-word, word-to-doubleword. -// SCVTF GPR -> FPR is 9 cycles. -// SCVTF FPR -> FPR is 4 cyclces. -// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles. -// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR -// and still being faster. -// However, this is not good for code size. -// 8-bits -> float. 2 sizes step-up. -def : Pat <(f32 (sint_to_fp (i32 (sextloadi8 ro_indexed8:$addr)))), - (SCVTFv1i32 (f32 (EXTRACT_SUBREG - (SSHLLv4i16_shift - (f64 - (EXTRACT_SUBREG - (SSHLLv8i8_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRBro ro_indexed8:$addr), - bsub), - 0), - dsub)), - 0), - ssub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f32 (sint_to_fp (i32 (sextloadi8 am_indexed8:$addr)))), - (SCVTFv1i32 (f32 (EXTRACT_SUBREG - (SSHLLv4i16_shift - (f64 - (EXTRACT_SUBREG - (SSHLLv8i8_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRBui am_indexed8:$addr), - bsub), - 0), - dsub)), - 0), - ssub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f32 (sint_to_fp (i32 (sextloadi8 am_unscaled8:$addr)))), - (SCVTFv1i32 (f32 (EXTRACT_SUBREG - (SSHLLv4i16_shift - (f64 - (EXTRACT_SUBREG - (SSHLLv8i8_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURBi am_unscaled8:$addr), - bsub), - 0), - dsub)), - 0), - ssub)))>, Requires<[NotForCodeSize]>; -// 16-bits -> float. 1 size step-up. -def : Pat <(f32 (sint_to_fp (i32 (sextloadi16 ro_indexed16:$addr)))), - (SCVTFv1i32 (f32 (EXTRACT_SUBREG - (SSHLLv4i16_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRHro ro_indexed16:$addr), - hsub), - 0), - ssub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f32 (sint_to_fp (i32 (sextloadi16 am_indexed16:$addr)))), - (SCVTFv1i32 (f32 (EXTRACT_SUBREG - (SSHLLv4i16_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRHui am_indexed16:$addr), - hsub), - 0), - ssub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f32 (sint_to_fp (i32 (sextloadi16 am_unscaled16:$addr)))), - (SCVTFv1i32 (f32 (EXTRACT_SUBREG - (SSHLLv4i16_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURHi am_unscaled16:$addr), - hsub), - 0), - ssub)))>, Requires<[NotForCodeSize]>; -// 32-bits to 32-bits are handled in target specific dag combine: -// performIntToFpCombine. -// 64-bits integer to 32-bits floating point, not possible with -// SCVTF on floating point registers (both source and destination -// must have the same size). - -// Here are the patterns for 8, 16, 32, and 64-bits to double. -// 8-bits -> double. 3 size step-up: give up. -// 16-bits -> double. 2 size step. -def : Pat <(f64 (sint_to_fp (i32 (sextloadi16 ro_indexed16:$addr)))), - (SCVTFv1i64 (f64 (EXTRACT_SUBREG - (SSHLLv2i32_shift - (f64 - (EXTRACT_SUBREG - (SSHLLv4i16_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRHro ro_indexed16:$addr), - hsub), - 0), - dsub)), - 0), - dsub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f64 (sint_to_fp (i32 (sextloadi16 am_indexed16:$addr)))), - (SCVTFv1i64 (f64 (EXTRACT_SUBREG - (SSHLLv2i32_shift - (f64 - (EXTRACT_SUBREG - (SSHLLv4i16_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRHui am_indexed16:$addr), - hsub), - 0), - dsub)), - 0), - dsub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f64 (sint_to_fp (i32 (sextloadi16 am_unscaled16:$addr)))), - (SCVTFv1i64 (f64 (EXTRACT_SUBREG - (SSHLLv2i32_shift - (f64 - (EXTRACT_SUBREG - (SSHLLv4i16_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURHi am_unscaled16:$addr), - hsub), - 0), - dsub)), - 0), - dsub)))>, Requires<[NotForCodeSize]>; -// 32-bits -> double. 1 size step-up. 
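(Illustration: a C++ sketch of the conversions these load-to-FPR patterns
target; the cycle counts are the ones quoted in the comment above.)

  // For (double)*p, the load can go straight to an FPR, be widened with
  // sshll, and use the cheaper 4-cycle FPR->FPR scvtf, instead of a GPR
  // load feeding the 9-cycle GPR->FPR scvtf.
  double conv(const int *p) { return (double)*p; }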
-def : Pat <(f64 (sint_to_fp (i32 (load ro_indexed32:$addr)))), - (SCVTFv1i64 (f64 (EXTRACT_SUBREG - (SSHLLv2i32_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRSro ro_indexed32:$addr), - ssub), - 0), - dsub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f64 (sint_to_fp (i32 (load am_indexed32:$addr)))), - (SCVTFv1i64 (f64 (EXTRACT_SUBREG - (SSHLLv2i32_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRSui am_indexed32:$addr), - ssub), - 0), - dsub)))>, Requires<[NotForCodeSize]>; -def : Pat <(f64 (sint_to_fp (i32 (load am_unscaled32:$addr)))), - (SCVTFv1i64 (f64 (EXTRACT_SUBREG - (SSHLLv2i32_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURSi am_unscaled32:$addr), - ssub), - 0), - dsub)))>, Requires<[NotForCodeSize]>; -// 64-bits -> double are handled in target specific dag combine: -// performIntToFpCombine. - - -//---------------------------------------------------------------------------- -// AdvSIMD Load-Store Structure -//---------------------------------------------------------------------------- -defm LD1 : SIMDLd1Multiple<"ld1">; -defm LD2 : SIMDLd2Multiple<"ld2">; -defm LD3 : SIMDLd3Multiple<"ld3">; -defm LD4 : SIMDLd4Multiple<"ld4">; - -defm ST1 : SIMDSt1Multiple<"st1">; -defm ST2 : SIMDSt2Multiple<"st2">; -defm ST3 : SIMDSt3Multiple<"st3">; -defm ST4 : SIMDSt4Multiple<"st4">; - -class Ld1Pat - : Pat<(ty (load am_simdnoindex:$vaddr)), (INST am_simdnoindex:$vaddr)>; - -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; - -class St1Pat - : Pat<(store ty:$Vt, am_simdnoindex:$vaddr), - (INST ty:$Vt, am_simdnoindex:$vaddr)>; - -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; - -//--- -// Single-element -//--- - -defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>; -defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>; -defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>; -defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>; -let mayLoad = 1, neverHasSideEffects = 1 in { -defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>; -defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>; -defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>; -defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>; -defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>; -defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>; -defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>; -defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>; -defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>; -defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>; -defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>; -defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>; -defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>; -defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>; -defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, GPR64pi16>; -defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>; -} - -def : Pat<(v8i8 (ARM64dup (i32 (extloadi8 am_simdnoindex:$vaddr)))), - (LD1Rv8b am_simdnoindex:$vaddr)>; -def : Pat<(v16i8 (ARM64dup (i32 (extloadi8 am_simdnoindex:$vaddr)))), - (LD1Rv16b am_simdnoindex:$vaddr)>; -def : Pat<(v4i16 (ARM64dup 
(i32 (extloadi16 am_simdnoindex:$vaddr)))), - (LD1Rv4h am_simdnoindex:$vaddr)>; -def : Pat<(v8i16 (ARM64dup (i32 (extloadi16 am_simdnoindex:$vaddr)))), - (LD1Rv8h am_simdnoindex:$vaddr)>; -def : Pat<(v2i32 (ARM64dup (i32 (load am_simdnoindex:$vaddr)))), - (LD1Rv2s am_simdnoindex:$vaddr)>; -def : Pat<(v4i32 (ARM64dup (i32 (load am_simdnoindex:$vaddr)))), - (LD1Rv4s am_simdnoindex:$vaddr)>; -def : Pat<(v2i64 (ARM64dup (i64 (load am_simdnoindex:$vaddr)))), - (LD1Rv2d am_simdnoindex:$vaddr)>; -def : Pat<(v1i64 (ARM64dup (i64 (load am_simdnoindex:$vaddr)))), - (LD1Rv1d am_simdnoindex:$vaddr)>; -// Grab the floating point version too -def : Pat<(v2f32 (ARM64dup (f32 (load am_simdnoindex:$vaddr)))), - (LD1Rv2s am_simdnoindex:$vaddr)>; -def : Pat<(v4f32 (ARM64dup (f32 (load am_simdnoindex:$vaddr)))), - (LD1Rv4s am_simdnoindex:$vaddr)>; -def : Pat<(v2f64 (ARM64dup (f64 (load am_simdnoindex:$vaddr)))), - (LD1Rv2d am_simdnoindex:$vaddr)>; -def : Pat<(v1f64 (ARM64dup (f64 (load am_simdnoindex:$vaddr)))), - (LD1Rv1d am_simdnoindex:$vaddr)>; - -class Ld1Lane128Pat - : Pat<(vector_insert (VTy VecListOne128:$Rd), - (STy (scalar_load am_simdnoindex:$vaddr)), VecIndex:$idx), - (LD1 VecListOne128:$Rd, VecIndex:$idx, am_simdnoindex:$vaddr)>; - -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; - -class Ld1Lane64Pat - : Pat<(vector_insert (VTy VecListOne64:$Rd), - (STy (scalar_load am_simdnoindex:$vaddr)), VecIndex:$idx), - (EXTRACT_SUBREG - (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub), - VecIndex:$idx, am_simdnoindex:$vaddr), - dsub)>; - -def : Ld1Lane64Pat; -def : Ld1Lane64Pat; -def : Ld1Lane64Pat; -def : Ld1Lane64Pat; - - -defm LD1 : SIMDLdSt1SingleAliases<"ld1">; -defm LD2 : SIMDLdSt2SingleAliases<"ld2">; -defm LD3 : SIMDLdSt3SingleAliases<"ld3">; -defm LD4 : SIMDLdSt4SingleAliases<"ld4">; - -// Stores -defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>; -defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>; -defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>; -defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>; - -let AddedComplexity = 8 in -class St1Lane128Pat - : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), - am_simdnoindex:$vaddr), - (ST1 VecListOne128:$Vt, VecIndex:$idx, am_simdnoindex:$vaddr)>; - -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; - -let AddedComplexity = 8 in -class St1Lane64Pat - : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), - am_simdnoindex:$vaddr), - (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), - VecIndex:$idx, am_simdnoindex:$vaddr)>; - -def : St1Lane64Pat; -def : St1Lane64Pat; -def : St1Lane64Pat; -def : St1Lane64Pat; - -let mayStore = 1, neverHasSideEffects = 1 in { -defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>; -defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>; -defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>; -defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>; -defm ST3 : SIMDStSingleB<0, 0b001, "st3", VecListThreeb, GPR64pi3>; -defm ST3 : SIMDStSingleH<0, 0b011, 0, "st3", VecListThreeh, GPR64pi6>; -defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>; -defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>; -defm ST4 : SIMDStSingleB<1, 
0b001, "st4", VecListFourb, GPR64pi4>; -defm ST4 : SIMDStSingleH<1, 0b011, 0, "st4", VecListFourh, GPR64pi8>; -defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>; -defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>; -} - -defm ST1 : SIMDLdSt1SingleAliases<"st1">; -defm ST2 : SIMDLdSt2SingleAliases<"st2">; -defm ST3 : SIMDLdSt3SingleAliases<"st3">; -defm ST4 : SIMDLdSt4SingleAliases<"st4">; - -//---------------------------------------------------------------------------- -// Crypto extensions -//---------------------------------------------------------------------------- - -def AESErr : AESTiedInst<0b0100, "aese", int_arm64_crypto_aese>; -def AESDrr : AESTiedInst<0b0101, "aesd", int_arm64_crypto_aesd>; -def AESMCrr : AESInst< 0b0110, "aesmc", int_arm64_crypto_aesmc>; -def AESIMCrr : AESInst< 0b0111, "aesimc", int_arm64_crypto_aesimc>; - -def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_arm64_crypto_sha1c>; -def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_arm64_crypto_sha1p>; -def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_arm64_crypto_sha1m>; -def SHA1SU0rrr : SHATiedInstVVV<0b011, "sha1su0", int_arm64_crypto_sha1su0>; -def SHA256Hrrr : SHATiedInstQQV<0b100, "sha256h", int_arm64_crypto_sha256h>; -def SHA256H2rrr : SHATiedInstQQV<0b101, "sha256h2",int_arm64_crypto_sha256h2>; -def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_arm64_crypto_sha256su1>; - -def SHA1Hrr : SHAInstSS< 0b0000, "sha1h", int_arm64_crypto_sha1h>; -def SHA1SU1rr : SHATiedInstVV<0b0001, "sha1su1", int_arm64_crypto_sha1su1>; -def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_arm64_crypto_sha256su0>; - -//---------------------------------------------------------------------------- -// Compiler-pseudos -//---------------------------------------------------------------------------- -// FIXME: Like for X86, these should go in their own separate .td file. - -// Any instruction that defines a 32-bit result leaves the high half of the -// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may -// be copying from a truncate. But any other 32-bit operation will zero-extend -// up to 64 bits. -// FIXME: X86 also checks for CMOV here. Do we need something similar? -def def32 : PatLeaf<(i32 GPR32:$src), [{ - return N->getOpcode() != ISD::TRUNCATE && - N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && - N->getOpcode() != ISD::CopyFromReg; -}]>; - -// In the case of a 32-bit def that is known to implicitly zero-extend, -// we can use a SUBREG_TO_REG. -def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>; - -// For an anyext, we don't care what the high bits are, so we can perform an -// INSERT_SUBREF into an IMPLICIT_DEF. -def : Pat<(i64 (anyext GPR32:$src)), - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>; - -// When we need to explicitly zero-extend, we use an unsigned bitfield move -// instruction (UBFM) on the enclosing super-reg. -def : Pat<(i64 (zext GPR32:$src)), - (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; - -// To sign extend, we use a signed bitfield move instruction (SBFM) on the -// containing super-reg. 
-def : Pat<(i64 (sext GPR32:$src)), - (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i8)), (SBFMXri GPR64:$src, 0, 7)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i1)), (SBFMXri GPR64:$src, 0, 0)>; -def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>; -def : Pat<(i32 (sext_inreg GPR32:$src, i8)), (SBFMWri GPR32:$src, 0, 7)>; -def : Pat<(i32 (sext_inreg GPR32:$src, i1)), (SBFMWri GPR32:$src, 0, 0)>; - -def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)), - (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), - (i64 (i32shift_sext_i8 imm0_31:$imm)))>; -def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)), - (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_sext_i8 imm0_63:$imm)))>; - -def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)), - (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), - (i64 (i32shift_sext_i16 imm0_31:$imm)))>; -def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)), - (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_sext_i16 imm0_63:$imm)))>; - -def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)), - (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), - (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_sext_i32 imm0_63:$imm)))>; - -// sra patterns have an AddedComplexity of 10, so make sure we have a higher -// AddedComplexity for the following patterns since we want to match sext + sra -// patterns before we attempt to match a single sra node. -let AddedComplexity = 20 in { -// We support all sext + sra combinations which preserve at least one bit of the -// original value which is to be sign extended. E.g. we support shifts up to -// bitwidth-1 bits. -def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)), - (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>; -def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)), - (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>; - -def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)), - (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>; -def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)), - (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>; - -def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)), - (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), - (i64 imm0_31:$imm), 31)>; -} // AddedComplexity = 20 - -// To truncate, we can simply extract from a subregister. -def : Pat<(i32 (trunc GPR64sp:$src)), - (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>; - -// __builtin_trap() uses the BRK instruction on ARM64. -def : Pat<(trap), (BRK 1)>; - -// Conversions within AdvSIMD types in the same register size are free. 
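(Illustration: what "free" means here, as a C++ sketch with the ACLE
intrinsics; the function name is invented.)

  #include <arm_neon.h>
  // A bitcast between same-width vector types stays in the same FPR, so
  // vreinterpret compiles away to nothing.
  int8x8_t as_bytes(int32x2_t v) { return vreinterpret_s8_s32(v); }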
- -def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>; - -def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; - -def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>; - -def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; - -def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; - -def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; - -def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; - - -def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; - -def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; -def : 
Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; - -def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; - -def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; - -def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; - -def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; - -def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; - -def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; -def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; -def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; -def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; - -// A 64-bit subvector insert to the first 128-bit vector position -// is a subregister copy that needs no instruction. 
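(Illustration: the free low-half placement described above, as a C++ sketch
with the ACLE intrinsics; the function name is invented.)

  #include <arm_neon.h>
  // Putting 'lo' into lanes [0,1] of the 128-bit result is a plain
  // subregister copy; only placing 'hi' into the upper half may cost an
  // instruction (an ins).
  int32x4_t combine(int32x2_t lo, int32x2_t hi) { return vcombine_s32(lo, hi); }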
-def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; - -// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64 -// or v2f32. -def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)), - (vector_extract (v2i64 FPR128:$Rn), (i64 1)))), - (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>; -def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), - (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), - (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>; - // vector_extract on 64-bit vectors gets promoted to a 128 bit vector, - // so we match on v4f32 here, not v2f32. This will also catch adding - // the low two lanes of a true v4f32 vector. -def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), - (vector_extract (v4f32 FPR128:$Rn), (i64 1))), - (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; - -// Scalar 64-bit shifts in FPR64 registers. -def : Pat<(i64 (int_arm64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(i64 (int_arm64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(i64 (int_arm64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(i64 (int_arm64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; - -// Tail call return handling. These are all compiler pseudo-instructions, -// so no encoding information or anything like that. -let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { - def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst), []>; - def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst), []>; -} - -def : Pat<(ARM64tcret tcGPR64:$dst), (TCRETURNri tcGPR64:$dst)>; -def : Pat<(ARM64tcret (i64 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>; -def : Pat<(ARM64tcret (i64 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>; - -include "ARM64InstrAtomics.td" diff --git a/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp b/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp deleted file mode 100644 index c0031a4..0000000 --- a/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp +++ /dev/null @@ -1,947 +0,0 @@ -//===-- ARM64LoadStoreOptimizer.cpp - ARM64 load/store opt. pass --*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass that performs load / store related peephole -// optimizations. This pass should be run after register allocation. 
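For orientation before the deleted implementation below, this is a minimal, self-contained sketch of the pairing peephole the pass performs: two adjacent accesses that share a base register and touch consecutive slots become one pair instruction. The MemOp struct and tryPair helper are illustrative names invented for this sketch, not APIs from the patch.

#include <cstdio>
#include <optional>
#include <utility>

// Simplified stand-in for a register-offset load/store MachineInstr.
struct MemOp {
  unsigned Reg;     // transfer register (Rt)
  unsigned BaseReg; // address base register (Rn)
  int Offset;       // scaled immediate offset, in register-size units
};

// Two accesses can pair when they share a base register, touch adjacent
// slots, and the lower offset fits the 7-bit signed field of ldp/stp.
static std::optional<std::pair<MemOp, MemOp>> tryPair(MemOp A, MemOp B) {
  if (A.BaseReg != B.BaseReg)
    return std::nullopt;
  if (A.Offset + 1 != B.Offset && B.Offset + 1 != A.Offset)
    return std::nullopt;
  MemOp Lo = A.Offset < B.Offset ? A : B;
  MemOp Hi = A.Offset < B.Offset ? B : A;
  if (Lo.Offset < -64 || Lo.Offset > 63)
    return std::nullopt;
  return std::make_pair(Lo, Hi);
}

int main() {
  // ldr x0, [x2] ; ldr x1, [x2, #8]  ==>  ldp x0, x1, [x2]
  MemOp L0 = {0, 2, 0}, L1 = {1, 2, 1};
  if (auto P = tryPair(L0, L1))
    std::printf("ldp x%u, x%u, [x%u, #%d]\n", P->first.Reg, P->second.Reg,
                P->first.BaseReg, P->first.Offset * 8); // X regs: 8-byte units
  return 0;
}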
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "arm64-ldst-opt" -#include "ARM64InstrInfo.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" -using namespace llvm; - -/// ARM64LoadStoreOpt - Post-register allocation pass to combine -/// load / store instructions to form ldp / stp instructions. - -STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); -STATISTIC(NumPostFolded, "Number of post-index updates folded"); -STATISTIC(NumPreFolded, "Number of pre-index updates folded"); -STATISTIC(NumUnscaledPairCreated, - "Number of load/store pairs generated from unscaled ops"); - -static cl::opt<bool> DoLoadStoreOpt("arm64-load-store-opt", cl::init(true), - cl::Hidden); -static cl::opt<unsigned> ScanLimit("arm64-load-store-scan-limit", cl::init(20), - cl::Hidden); - -// Placeholder while testing unscaled load/store combining -static cl::opt<bool> -EnableARM64UnscaledMemOp("arm64-unscaled-mem-op", cl::Hidden, - cl::desc("Allow ARM64 unscaled load/store combining"), - cl::init(true)); - -namespace { -struct ARM64LoadStoreOpt : public MachineFunctionPass { - static char ID; - ARM64LoadStoreOpt() : MachineFunctionPass(ID) {} - - const ARM64InstrInfo *TII; - const TargetRegisterInfo *TRI; - - // Scan the instructions looking for a load/store that can be combined - // with the current instruction into a load/store pair. - // Return the matching instruction if one is found, else MBB->end(). - // If a matching instruction is found, mergeForward is set to true if the - // merge is to remove the first instruction and replace the second with - // a pair-wise insn, and false if the reverse is true. - MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, - bool &mergeForward, - unsigned Limit); - // Merge the two instructions indicated into a single pair-wise instruction. - // If mergeForward is true, erase the first instruction and fold its - // operation into the second. If false, the reverse. Return the instruction - // following the first instruction (which may change during processing). - MachineBasicBlock::iterator - mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, bool mergeForward); - - // Scan the instruction list to find a base register update that can - // be combined with the current instruction (a load or store) using - // pre- or post-indexed addressing with writeback. Scan forwards. - MachineBasicBlock::iterator - findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, - int Value); - - // Scan the instruction list to find a base register update that can - // be combined with the current instruction (a load or store) using - // pre- or post-indexed addressing with writeback. Scan backwards. - MachineBasicBlock::iterator - findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); - - // Merge a pre-index base register update into a ld/st instruction.
- MachineBasicBlock::iterator - mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); - - // Merge a post-index base register update into a ld/st instruction. - MachineBasicBlock::iterator - mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); - - bool optimizeBlock(MachineBasicBlock &MBB); - - virtual bool runOnMachineFunction(MachineFunction &Fn); - - virtual const char *getPassName() const { - return "ARM64 load / store optimization pass"; - } - -private: - int getMemSize(MachineInstr *MemMI); -}; -char ARM64LoadStoreOpt::ID = 0; -} - -static bool isUnscaledLdst(unsigned Opc) { - switch (Opc) { - default: - return false; - case ARM64::STURSi: - return true; - case ARM64::STURDi: - return true; - case ARM64::STURQi: - return true; - case ARM64::STURWi: - return true; - case ARM64::STURXi: - return true; - case ARM64::LDURSi: - return true; - case ARM64::LDURDi: - return true; - case ARM64::LDURQi: - return true; - case ARM64::LDURWi: - return true; - case ARM64::LDURXi: - return true; - } -} - -// Size in bytes of the data moved by a load or store. -int ARM64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { - switch (MemMI->getOpcode()) { - default: - llvm_unreachable("Opcode has unknown size!"); - case ARM64::STRSui: - case ARM64::STURSi: - return 4; - case ARM64::STRDui: - case ARM64::STURDi: - return 8; - case ARM64::STRQui: - case ARM64::STURQi: - return 16; - case ARM64::STRWui: - case ARM64::STURWi: - return 4; - case ARM64::STRXui: - case ARM64::STURXi: - return 8; - case ARM64::LDRSui: - case ARM64::LDURSi: - return 4; - case ARM64::LDRDui: - case ARM64::LDURDi: - return 8; - case ARM64::LDRQui: - case ARM64::LDURQi: - return 16; - case ARM64::LDRWui: - case ARM64::LDURWi: - return 4; - case ARM64::LDRXui: - case ARM64::LDURXi: - return 8; - } -} - -static unsigned getMatchingPairOpcode(unsigned Opc) { - switch (Opc) { - default: - llvm_unreachable("Opcode has no pairwise equivalent!"); - case ARM64::STRSui: - case ARM64::STURSi: - return ARM64::STPSi; - case ARM64::STRDui: - case ARM64::STURDi: - return ARM64::STPDi; - case ARM64::STRQui: - case ARM64::STURQi: - return ARM64::STPQi; - case ARM64::STRWui: - case ARM64::STURWi: - return ARM64::STPWi; - case ARM64::STRXui: - case ARM64::STURXi: - return ARM64::STPXi; - case ARM64::LDRSui: - case ARM64::LDURSi: - return ARM64::LDPSi; - case ARM64::LDRDui: - case ARM64::LDURDi: - return ARM64::LDPDi; - case ARM64::LDRQui: - case ARM64::LDURQi: - return ARM64::LDPQi; - case ARM64::LDRWui: - case ARM64::LDURWi: - return ARM64::LDPWi; - case ARM64::LDRXui: - case ARM64::LDURXi: - return ARM64::LDPXi; - } -} - -static unsigned getPreIndexedOpcode(unsigned Opc) { - switch (Opc) { - default: - llvm_unreachable("Opcode has no pre-indexed equivalent!"); - case ARM64::STRSui: return ARM64::STRSpre; - case ARM64::STRDui: return ARM64::STRDpre; - case ARM64::STRQui: return ARM64::STRQpre; - case ARM64::STRWui: return ARM64::STRWpre; - case ARM64::STRXui: return ARM64::STRXpre; - case ARM64::LDRSui: return ARM64::LDRSpre; - case ARM64::LDRDui: return ARM64::LDRDpre; - case ARM64::LDRQui: return ARM64::LDRQpre; - case ARM64::LDRWui: return ARM64::LDRWpre; - case ARM64::LDRXui: return ARM64::LDRXpre; - } -} - -static unsigned getPostIndexedOpcode(unsigned Opc) { - switch (Opc) { - default: - llvm_unreachable("Opcode has no post-indexed equivalent!"); - case ARM64::STRSui: - return ARM64::STRSpost; - case ARM64::STRDui: - return ARM64::STRDpost; - case
ARM64::STRQui: - return ARM64::STRQpost; - case ARM64::STRWui: - return ARM64::STRWpost; - case ARM64::STRXui: - return ARM64::STRXpost; - case ARM64::LDRSui: - return ARM64::LDRSpost; - case ARM64::LDRDui: - return ARM64::LDRDpost; - case ARM64::LDRQui: - return ARM64::LDRQpost; - case ARM64::LDRWui: - return ARM64::LDRWpost; - case ARM64::LDRXui: - return ARM64::LDRXpost; - } -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - bool mergeForward) { - MachineBasicBlock::iterator NextI = I; - ++NextI; - // If NextI is the second of the two instructions to be merged, we need - // to skip one further. Either way, the merge will invalidate the iterator, - // and we don't need to scan the new instruction, as it's a pairwise - // instruction, which we're not considering for further action anyway. - if (NextI == Paired) - ++NextI; - - bool IsUnscaled = isUnscaledLdst(I->getOpcode()); - int OffsetStride = IsUnscaled && EnableARM64UnscaledMemOp ? getMemSize(I) : 1; - - unsigned NewOpc = getMatchingPairOpcode(I->getOpcode()); - // Insert our new paired instruction after whichever of the paired - // instructions mergeForward indicates. - MachineBasicBlock::iterator InsertionPoint = mergeForward ? Paired : I; - // mergeForward also determines which instruction's base register operand - // we copy, so we get flags compatible with the input code. - MachineOperand &BaseRegOp = - mergeForward ? Paired->getOperand(1) : I->getOperand(1); - - // Which register is Rt and which is Rt2 depends on the offset order. - MachineInstr *RtMI, *Rt2MI; - if (I->getOperand(2).getImm() == - Paired->getOperand(2).getImm() + OffsetStride) { - RtMI = Paired; - Rt2MI = I; - } else { - RtMI = I; - Rt2MI = Paired; - } - // Handle the unscaled case. - int OffsetImm = RtMI->getOperand(2).getImm(); - if (IsUnscaled && EnableARM64UnscaledMemOp) - OffsetImm /= OffsetStride; - - // Construct the new instruction. - MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint, - I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(RtMI->getOperand(0)) - .addOperand(Rt2MI->getOperand(0)) - .addOperand(BaseRegOp) - .addImm(OffsetImm); - (void)MIB; - - // FIXME: Do we need/want to copy the mem operands from the source - // instructions? Probably. What uses them after this? - - DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n "); - DEBUG(I->print(dbgs())); - DEBUG(dbgs() << " "); - DEBUG(Paired->print(dbgs())); - DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); - - // Erase the old instructions. - I->eraseFromParent(); - Paired->eraseFromParent(); - - return NextI; -} - -/// trackRegDefsUses - Remember what registers the specified instruction uses -/// and modifies.
-static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, - BitVector &UsedRegs, - const TargetRegisterInfo *TRI) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (MO.isRegMask()) - ModifiedRegs.setBitsNotInMask(MO.getRegMask()); - - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (MO.isDef()) { - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - ModifiedRegs.set(*AI); - } else { - assert(MO.isUse() && "Reg operand not a def and not a use?!?"); - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - UsedRegs.set(*AI); - } - } -} - -static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { - if (!IsUnscaled && (Offset > 63 || Offset < -64)) - return false; - if (IsUnscaled) { - // Convert the byte-offset used by unscaled instructions into an "element" - // offset used by the scaled pair load/store instructions. - int elemOffset = Offset / OffsetStride; - if (elemOffset > 63 || elemOffset < -64) - return false; - } - return true; -} - -// Do alignment, specialized to power of 2 and for signed ints, -// avoiding having to do a C-style cast from uint64_t to int when -// using RoundUpToAlignment from include/llvm/Support/MathExtras.h. -// FIXME: Move this function to include/llvm/Support/MathExtras.h? -static int alignTo(int Num, int PowOf2) { - return (Num + PowOf2 - 1) & ~(PowOf2 - 1); -} - -/// findMatchingInsn - Scan the instructions looking for a load/store that can -/// be combined with the current instruction into a load/store pair. -MachineBasicBlock::iterator -ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - bool &mergeForward, unsigned Limit) { - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineBasicBlock::iterator MBBI = I; - MachineInstr *FirstMI = I; - ++MBBI; - - int Opc = FirstMI->getOpcode(); - bool mayLoad = FirstMI->mayLoad(); - bool IsUnscaled = isUnscaledLdst(Opc); - unsigned Reg = FirstMI->getOperand(0).getReg(); - unsigned BaseReg = FirstMI->getOperand(1).getReg(); - int Offset = FirstMI->getOperand(2).getImm(); - - // Early exit if the first instruction modifies the base register. - // e.g., ldr x0, [x0] - // Early exit if the offset is not possible to match. (6 bits of positive - // range, plus allow an extra one in case we find a later insn that matches - // with Offset-1.) - if (FirstMI->modifiesRegister(BaseReg, TRI)) - return E; - int OffsetStride = - IsUnscaled && EnableARM64UnscaledMemOp ? getMemSize(FirstMI) : 1; - if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) - return E; - - // Track which registers have been modified and used between the first insn - // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); - for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; - - // Now that we know this is a real instruction, count it. - ++Count; - - if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) { - // If we've found another instruction with the same opcode, check to see - // if the base and offset are compatible with our starting instruction. - // These instructions all have scaled immediate operands, so we just - // check for +1/-1.
Make sure to check the new instruction offset is - // actually an immediate and not a symbolic reference destined for - // a relocation. - // - // Pairwise instructions have a 7-bit signed offset field. Single insns - // have a 12-bit unsigned offset field. To be a valid combine, the - // final offset must be in range. - unsigned MIBaseReg = MI->getOperand(1).getReg(); - int MIOffset = MI->getOperand(2).getImm(); - if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || - (Offset + OffsetStride == MIOffset))) { - int MinOffset = Offset < MIOffset ? Offset : MIOffset; - // If this is a volatile load/store that otherwise matched, stop looking - // as something is going on that we don't have enough information to - // safely transform. Similarly, stop if we see a hint to avoid pairs. - if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) - return E; - // If the resultant immediate offset of merging these instructions - // is out of range for a pairwise instruction, bail and keep looking. - bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode()); - if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - continue; - } - // If the alignment requirements of the paired (scaled) instruction - // can't express the offset of the unscaled input, bail and keep - // looking. - if (IsUnscaled && EnableARM64UnscaledMemOp && - (alignTo(MinOffset, OffsetStride) != MinOffset)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - continue; - } - // If the destination register of the loads is the same register, bail - // and keep looking. A load-pair instruction with both destination - // registers the same is UNPREDICTABLE and will result in an exception. - if (mayLoad && Reg == MI->getOperand(0).getReg()) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - continue; - } - - // If the Rt of the second instruction was not modified or used between - // the two instructions, we can combine the second into the first. - if (!ModifiedRegs[MI->getOperand(0).getReg()] && - !UsedRegs[MI->getOperand(0).getReg()]) { - mergeForward = false; - return MBBI; - } - - // Likewise, if the Rt of the first instruction is not modified or used - // between the two instructions, we can combine the first into the - // second. - if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] && - !UsedRegs[FirstMI->getOperand(0).getReg()]) { - mergeForward = true; - return MBBI; - } - // Unable to combine these instructions due to interference in between. - // Keep looking. - } - } - - // If the instruction wasn't a matching load or store, but does (or can) - // modify memory, stop searching, as we don't have alias analysis or - // anything like that to tell us whether the access is tromping on the - // locations we care about. The big one we want to catch is calls. - // - // FIXME: Theoretically, we can do better than that for SP and FP based - // references since we can effectively know where those are touching. It's - // unclear if it's worth the extra code, though. Most paired instructions - // will be sequential, perhaps with a few intervening non-memory related - // instructions. - if (MI->mayStore() || MI->isCall()) - return E; - // Likewise, if we're matching a store instruction, we don't want to - // move across a load, as it may be reading the same location. - if (FirstMI->mayStore() && MI->mayLoad()) - return E; - - // Update modified / uses register lists. 
- trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - - // Otherwise, if the base register is modified, we have no match, so - // return early. - if (ModifiedRegs[BaseReg]) - return E; - } - return E; -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update) { - assert((Update->getOpcode() == ARM64::ADDXri || - Update->getOpcode() == ARM64::SUBXri) && - "Unexpected base register update instruction to merge!"); - MachineBasicBlock::iterator NextI = I; - // Return the instruction following the merged instruction, which is - // the instruction following our unmerged load. Unless that's the add/sub - // instruction we're merging, in which case it's the one after that. - if (++NextI == Update) - ++NextI; - - int Value = Update->getOperand(2).getImm(); - assert(ARM64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into pre-indexed load / store"); - if (Update->getOpcode() == ARM64::SUBXri) - Value = -Value; - - unsigned NewOpc = getPreIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addImm(Value); - (void)MIB; - - DEBUG(dbgs() << "Creating pre-indexed load/store."); - DEBUG(dbgs() << " Replacing instructions:\n "); - DEBUG(I->print(dbgs())); - DEBUG(dbgs() << " "); - DEBUG(Update->print(dbgs())); - DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); - - // Erase the old instructions for the block. - I->eraseFromParent(); - Update->eraseFromParent(); - - return NextI; -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update) { - assert((Update->getOpcode() == ARM64::ADDXri || - Update->getOpcode() == ARM64::SUBXri) && - "Unexpected base register update instruction to merge!"); - MachineBasicBlock::iterator NextI = I; - // Return the instruction following the merged instruction, which is - // the instruction following our unmerged load. Unless that's the add/sub - // instruction we're merging, in which case it's the one after that. - if (++NextI == Update) - ++NextI; - - int Value = Update->getOperand(2).getImm(); - assert(ARM64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into post-indexed load / store"); - if (Update->getOpcode() == ARM64::SUBXri) - Value = -Value; - - unsigned NewOpc = getPostIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addImm(Value); - (void)MIB; - - DEBUG(dbgs() << "Creating post-indexed load/store."); - DEBUG(dbgs() << " Replacing instructions:\n "); - DEBUG(I->print(dbgs())); - DEBUG(dbgs() << " "); - DEBUG(Update->print(dbgs())); - DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); - - // Erase the old instructions for the block. - I->eraseFromParent(); - Update->eraseFromParent(); - - return NextI; -} - -static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, - int Offset) { - switch (MI->getOpcode()) { - default: - break; - case ARM64::SUBXri: - // Negate the offset for a SUB instruction. 
- Offset *= -1; - // FALLTHROUGH - case ARM64::ADDXri: - // Make sure it's a vanilla immediate operand, not a relocation or - // anything else we can't handle. - if (!MI->getOperand(2).isImm()) - break; - // Watch out for 1 << 12 shifted value. - if (ARM64_AM::getShiftValue(MI->getOperand(3).getImm())) - break; - // If the instruction has the base register as source and dest and the - // immediate will fit in a signed 9-bit integer, then we have a match. - if (MI->getOperand(0).getReg() == BaseReg && - MI->getOperand(1).getReg() == BaseReg && - MI->getOperand(2).getImm() <= 255 && - MI->getOperand(2).getImm() >= -256) { - // If we have a non-zero Offset, we check that it matches the amount - // we're adding to the register. - if (!Offset || Offset == MI->getOperand(2).getImm()) - return true; - } - break; - } - return false; -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, - unsigned Limit, int Value) { - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr *MemMI = I; - MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - - unsigned DestReg = MemMI->getOperand(0).getReg(); - unsigned BaseReg = MemMI->getOperand(1).getReg(); - int Offset = MemMI->getOperand(2).getImm() * - TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); - - // If the base register overlaps the destination register, we can't - // merge the update. - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; - - // Scan forward looking for post-index opportunities. - // Updating instructions can't be formed if the memory insn already - // has an offset other than the value we're looking for. - if (Offset != Value) - return E; - - // Track which registers have been modified and used between the first insn - // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); - ++MBBI; - for (unsigned Count = 0; MBBI != E; ++MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; - - // Now that we know this is a real instruction, count it. - ++Count; - - // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, Value)) - return MBBI; - - // Update the status of what the instruction clobbered and used. - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - - // Otherwise, if the base register is used or modified, we have no match, so - // return early. - if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) - return E; - } - return E; -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, - unsigned Limit) { - MachineBasicBlock::iterator B = I->getParent()->begin(); - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr *MemMI = I; - MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - - unsigned DestReg = MemMI->getOperand(0).getReg(); - unsigned BaseReg = MemMI->getOperand(1).getReg(); - int Offset = MemMI->getOperand(2).getImm(); - unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); - - // If the load/store is the first instruction in the block, there's obviously - // not any matching update. Ditto if the memory offset isn't zero. 
- if (MBBI == B || Offset != 0) - return E; - // If the base register overlaps the destination register, we can't - // merge the update. - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; - - // Track which registers have been modified and used between the first insn - // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); - --MBBI; - for (unsigned Count = 0; MBBI != B; --MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; - - // Now that we know this is a real instruction, count it. - ++Count; - - // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, RegSize)) - return MBBI; - - // Update the status of what the instruction clobbered and used. - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - - // Otherwise, if the base register is used or modified, we have no match, so - // return early. - if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) - return E; - } - return E; -} - -bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { - bool Modified = false; - // Two transformations to do here: - // 1) Find loads and stores that can be merged into a single load or store - // pair instruction. - // e.g., - // ldr x0, [x2] - // ldr x1, [x2, #8] - // ; becomes - // ldp x0, x1, [x2] - // 2) Find base register updates that can be merged into the load or store - // as a base-reg writeback. - // e.g., - // ldr x0, [x2] - // add x2, x2, #4 - // ; becomes - // ldr x0, [x2], #4 - - for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { - default: - // Just move on to the next instruction. - ++MBBI; - break; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - case ARM64::STRXui: - case ARM64::STRWui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - case ARM64::LDRXui: - case ARM64::LDRWui: - // Do the unscaled versions as well. - case ARM64::STURSi: - case ARM64::STURDi: - case ARM64::STURQi: - case ARM64::STURWi: - case ARM64::STURXi: - case ARM64::LDURSi: - case ARM64::LDURDi: - case ARM64::LDURQi: - case ARM64::LDURWi: - case ARM64::LDURXi: { - // If this is a volatile load/store, don't mess with it. - if (MI->hasOrderedMemoryRef()) { - ++MBBI; - break; - } - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!MI->getOperand(2).isImm()) { - ++MBBI; - break; - } - // Check if this load/store has a hint to avoid pair formation. - // MachineMemOperands hints are set by the ARM64StorePairSuppress pass. - if (TII->isLdStPairSuppressed(MI)) { - ++MBBI; - break; - } - // Look ahead up to ScanLimit instructions for a pairable instruction. - bool mergeForward = false; - MachineBasicBlock::iterator Paired = - findMatchingInsn(MBBI, mergeForward, ScanLimit); - if (Paired != E) { - // Merge the loads into a pair. Keeping the iterator straight is a - // pain, so we let the merge routine tell us what the next instruction - // is after it's done mucking about. - MBBI = mergePairedInsns(MBBI, Paired, mergeForward); - - Modified = true; - ++NumPairCreated; - if (isUnscaledLdst(MI->getOpcode())) - ++NumUnscaledPairCreated; - break; - } - ++MBBI; - break; - } - // FIXME: Do the other instructions.
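The writeback folding performed by the second loop below reduces to a small predicate. This is an illustrative sketch under assumed names (Mem, Add, and canFoldPostIndex are invented for the example, not the pass's real data structures):

// Simplified models of "ldr/str Rt, [Rn, #Offset]" and "add/sub Rn, Rn, #Imm".
struct Mem { unsigned BaseReg; int Offset; };
struct Add { unsigned DstReg, SrcReg; int Imm; };

// A trailing base-register update folds into a post-indexed access when it
// reads and writes the same base the memory op used, the memory op itself
// has a zero offset, and the increment fits the 9-bit signed writeback field.
static bool canFoldPostIndex(const Mem &M, const Add &U) {
  return U.DstReg == M.BaseReg && U.SrcReg == M.BaseReg && M.Offset == 0 &&
         U.Imm >= -256 && U.Imm <= 255;
}
// Example: ldr x0, [x2] ; add x2, x2, #4  ==>  ldr x0, [x2], #4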
- } - } - - for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - MBBI != E;) { - MachineInstr *MI = MBBI; - // Do update merging. It's simpler to keep this separate from the above - // switch, though not strictly necessary. - int Opc = MI->getOpcode(); - switch (Opc) { - default: - // Just move on to the next instruction. - ++MBBI; - break; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - case ARM64::STRXui: - case ARM64::STRWui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - case ARM64::LDRXui: - case ARM64::LDRWui: - // Do the unscaled versions as well. - case ARM64::STURSi: - case ARM64::STURDi: - case ARM64::STURQi: - case ARM64::STURWi: - case ARM64::STURXi: - case ARM64::LDURSi: - case ARM64::LDURDi: - case ARM64::LDURQi: - case ARM64::LDURWi: - case ARM64::LDURXi: { - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!MI->getOperand(2).isImm()) { - ++MBBI; - break; - } - // Look ahead up to ScanLimit instructions for a mergeable instruction. - MachineBasicBlock::iterator Update = - findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergePostIdxUpdateInsn(MBBI, Update); - Modified = true; - ++NumPostFolded; - break; - } - // Don't know how to handle pre/post-index versions, so move to the next - // instruction. - if (isUnscaledLdst(Opc)) { - ++MBBI; - break; - } - - // Look back to try to find a pre-index instruction. For example, - // add x0, x0, #8 - // ldr x1, [x0] - // merged into: - // ldr x1, [x0, #8]! - Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); - Modified = true; - ++NumPreFolded; - break; - } - - // Look forward to try to find a base update that can be merged as a - // pre-indexed instruction. For example, - // ldr x1, [x0, #64] - // add x0, x0, #64 - // merged into: - // ldr x1, [x0, #64]! - - // The immediate in the load/store is scaled by the size of the register - // being loaded. The immediate in the add we're looking for, - // however, is not, so adjust here. - int Value = MI->getOperand(2).getImm() * - TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent())) - ->getSize(); - Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); - Modified = true; - ++NumPreFolded; - break; - } - - // Nothing found. Just move to the next instruction. - ++MBBI; - break; - } - // FIXME: Do the other instructions. - } - } - - return Modified; -} - -bool ARM64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { - // Early exit if pass disabled. - if (!DoLoadStoreOpt) - return false; - - const TargetMachine &TM = Fn.getTarget(); - TII = static_cast<const ARM64InstrInfo *>(TM.getInstrInfo()); - TRI = TM.getRegisterInfo(); - - bool Modified = false; - for (auto &MBB : Fn) - Modified |= optimizeBlock(MBB); - - return Modified; -} - -// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep -// loads and stores near one another? - -/// createARM64LoadStoreOptimizationPass - returns an instance of the load / store -/// optimization pass.
-FunctionPass *llvm::createARM64LoadStoreOptimizationPass() { - return new ARM64LoadStoreOpt(); -} diff --git a/lib/Target/ARM64/ARM64MCInstLower.cpp b/lib/Target/ARM64/ARM64MCInstLower.cpp deleted file mode 100644 index 01dc229..0000000 --- a/lib/Target/ARM64/ARM64MCInstLower.cpp +++ /dev/null @@ -1,201 +0,0 @@ -//===-- ARM64MCInstLower.cpp - Convert ARM64 MachineInstr to an MCInst---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains code to lower ARM64 MachineInstrs to their corresponding -// MCInst records. -// -//===----------------------------------------------------------------------===// - -#include "ARM64MCInstLower.h" -#include "MCTargetDesc/ARM64BaseInfo.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/IR/Mangler.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/Support/CodeGen.h" -#include "llvm/Target/TargetMachine.h" -using namespace llvm; - -ARM64MCInstLower::ARM64MCInstLower(MCContext &ctx, Mangler &mang, - AsmPrinter &printer) - : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {} - -MCSymbol * -ARM64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { - return Printer.getSymbol(MO.getGlobal()); -} - -MCSymbol * -ARM64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const { - return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); -} - -MCOperand ARM64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO, - MCSymbol *Sym) const { - // FIXME: We would like an efficient form for this, so we don't have to do a - // lot of extra uniquing. 
- MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; - if ((MO.getTargetFlags() & ARM64II::MO_GOT) != 0) { - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_GOTPAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == - ARM64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF; - else - assert(0 && "Unexpected target flags with MO_GOT on GV operand"); - } else if ((MO.getTargetFlags() & ARM64II::MO_TLS) != 0) { - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_TLVPPAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == - ARM64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF; - else - llvm_unreachable("Unexpected target flags with MO_TLS on GV operand"); - } else { - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_PAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == - ARM64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_PAGEOFF; - } - const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); - if (!MO.isJTI() && MO.getOffset()) - Expr = MCBinaryExpr::CreateAdd( - Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); - return MCOperand::CreateExpr(Expr); -} - -MCOperand ARM64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, - MCSymbol *Sym) const { - uint32_t RefFlags = 0; - - if (MO.getTargetFlags() & ARM64II::MO_GOT) - RefFlags |= ARM64MCExpr::VK_GOT; - else if (MO.getTargetFlags() & ARM64II::MO_TLS) { - TLSModel::Model Model; - if (MO.isGlobal()) { - const GlobalValue *GV = MO.getGlobal(); - Model = Printer.TM.getTLSModel(GV); - } else { - assert(MO.isSymbol() && - StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" && - "unexpected external TLS symbol"); - Model = TLSModel::GeneralDynamic; - } - switch (Model) { - case TLSModel::InitialExec: - RefFlags |= ARM64MCExpr::VK_GOTTPREL; - break; - case TLSModel::LocalExec: - RefFlags |= ARM64MCExpr::VK_TPREL; - break; - case TLSModel::LocalDynamic: - RefFlags |= ARM64MCExpr::VK_DTPREL; - break; - case TLSModel::GeneralDynamic: - RefFlags |= ARM64MCExpr::VK_TLSDESC; - break; - } - } else { - // No modifier means this is a generic reference, classified as absolute for - // the cases where it matters (:abs_g0: etc). 
- RefFlags |= ARM64MCExpr::VK_ABS; - } - - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefFlags |= ARM64MCExpr::VK_PAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGEOFF) - RefFlags |= ARM64MCExpr::VK_PAGEOFF; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G3) - RefFlags |= ARM64MCExpr::VK_G3; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G2) - RefFlags |= ARM64MCExpr::VK_G2; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G1) - RefFlags |= ARM64MCExpr::VK_G1; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G0) - RefFlags |= ARM64MCExpr::VK_G0; - - if (MO.getTargetFlags() & ARM64II::MO_NC) - RefFlags |= ARM64MCExpr::VK_NC; - - const MCExpr *Expr = - MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx); - if (!MO.isJTI() && MO.getOffset()) - Expr = MCBinaryExpr::CreateAdd( - Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); - - ARM64MCExpr::VariantKind RefKind; - RefKind = static_cast<ARM64MCExpr::VariantKind>(RefFlags); - Expr = ARM64MCExpr::Create(Expr, RefKind, Ctx); - - return MCOperand::CreateExpr(Expr); -} - -MCOperand ARM64MCInstLower::LowerSymbolOperand(const MachineOperand &MO, - MCSymbol *Sym) const { - if (TargetTriple.isOSDarwin()) - return lowerSymbolOperandDarwin(MO, Sym); - - assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target"); - return lowerSymbolOperandELF(MO, Sym); -} - -bool ARM64MCInstLower::lowerOperand(const MachineOperand &MO, - MCOperand &MCOp) const { - switch (MO.getType()) { - default: - assert(0 && "unknown operand type"); - case MachineOperand::MO_Register: - // Ignore all implicit register operands. - if (MO.isImplicit()) - return false; - MCOp = MCOperand::CreateReg(MO.getReg()); - break; - case MachineOperand::MO_RegisterMask: - // Regmasks are like implicit defs. - return false; - case MachineOperand::MO_Immediate: - MCOp = MCOperand::CreateImm(MO.getImm()); - break; - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::CreateExpr( - MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx)); - break; - case MachineOperand::MO_GlobalAddress: - MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); - break; - case MachineOperand::MO_ExternalSymbol: - MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); - break; - case MachineOperand::MO_JumpTableIndex: - MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex())); - break; - case MachineOperand::MO_ConstantPoolIndex: - MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex())); - break; - case MachineOperand::MO_BlockAddress: - MCOp = LowerSymbolOperand( - MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress())); - break; - } - return true; -} - -void ARM64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { - OutMI.setOpcode(MI->getOpcode()); - - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MCOperand MCOp; - if (lowerOperand(MI->getOperand(i), MCOp)) - OutMI.addOperand(MCOp); - } -} diff --git a/lib/Target/ARM64/ARM64MCInstLower.h b/lib/Target/ARM64/ARM64MCInstLower.h deleted file mode 100644 index 7e3a2c8..0000000 --- a/lib/Target/ARM64/ARM64MCInstLower.h +++ /dev/null @@ -1,52 +0,0 @@ -//===-- ARM64MCInstLower.h - Lower MachineInstr to MCInst ----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details.
-// -//===----------------------------------------------------------------------===// - -#ifndef ARM64_MCINSTLOWER_H -#define ARM64_MCINSTLOWER_H - -#include "llvm/ADT/Triple.h" -#include "llvm/Support/Compiler.h" - -namespace llvm { -class AsmPrinter; -class MCAsmInfo; -class MCContext; -class MCInst; -class MCOperand; -class MCSymbol; -class MachineInstr; -class MachineModuleInfoMachO; -class MachineOperand; -class Mangler; - -/// ARM64MCInstLower - This class is used to lower a MachineInstr -/// into an MCInst. -class LLVM_LIBRARY_VISIBILITY ARM64MCInstLower { - MCContext &Ctx; - AsmPrinter &Printer; - Triple TargetTriple; - -public: - ARM64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer); - - bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; - void Lower(const MachineInstr *MI, MCInst &OutMI) const; - - MCOperand lowerSymbolOperandDarwin(const MachineOperand &MO, - MCSymbol *Sym) const; - MCOperand lowerSymbolOperandELF(const MachineOperand &MO, - MCSymbol *Sym) const; - MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; - - MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; - MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; -}; -} - -#endif diff --git a/lib/Target/ARM64/ARM64MachineFunctionInfo.h b/lib/Target/ARM64/ARM64MachineFunctionInfo.h deleted file mode 100644 index 02bf7cf..0000000 --- a/lib/Target/ARM64/ARM64MachineFunctionInfo.h +++ /dev/null @@ -1,139 +0,0 @@ -//===- ARM64MachineFunctionInfo.h - ARM64 machine function info -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares ARM64-specific per-machine-function information. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64MACHINEFUNCTIONINFO_H -#define ARM64MACHINEFUNCTIONINFO_H - -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/MC/MCLinkerOptimizationHint.h" - -namespace llvm { - -/// ARM64FunctionInfo - This class is derived from MachineFunctionInfo and -/// contains private ARM64-specific information for each MachineFunction. -class ARM64FunctionInfo : public MachineFunctionInfo { - - /// HasStackFrame - True if this function has a stack frame. Set by - /// processFunctionBeforeCalleeSavedScan(). - bool HasStackFrame; - - /// \brief Size of the stack frame, not including callee-saved registers. - unsigned LocalStackSize; - - /// \brief Number of TLS accesses using the special (combinable) - /// _TLS_MODULE_BASE_ symbol. - unsigned NumLocalDynamicTLSAccesses; - - /// \brief FrameIndex for start of varargs area for arguments passed on the - /// stack. - int VarArgsStackIndex; - - /// \brief FrameIndex for start of varargs area for arguments passed in - /// general purpose registers. - int VarArgsGPRIndex; - - /// \brief Size of the varargs area for arguments passed in general purpose - /// registers. - unsigned VarArgsGPRSize; - - /// \brief FrameIndex for start of varargs area for arguments passed in - /// floating-point registers. - int VarArgsFPRIndex; - - /// \brief Size of the varargs area for arguments passed in floating-point - /// registers.
- unsigned VarArgsFPRSize; - -public: - ARM64FunctionInfo() - : HasStackFrame(false), NumLocalDynamicTLSAccesses(0), - VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0), - VarArgsFPRIndex(0), VarArgsFPRSize(0) {} - - explicit ARM64FunctionInfo(MachineFunction &MF) - : HasStackFrame(false), NumLocalDynamicTLSAccesses(0), - VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0), - VarArgsFPRIndex(0), VarArgsFPRSize(0) { - (void)MF; - } - - bool hasStackFrame() const { return HasStackFrame; } - void setHasStackFrame(bool s) { HasStackFrame = s; } - - void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } - unsigned getLocalStackSize() const { return LocalStackSize; } - - void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } - unsigned getNumLocalDynamicTLSAccesses() const { - return NumLocalDynamicTLSAccesses; - } - - int getVarArgsStackIndex() const { return VarArgsStackIndex; } - void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; } - - int getVarArgsGPRIndex() const { return VarArgsGPRIndex; } - void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; } - - unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; } - void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; } - - int getVarArgsFPRIndex() const { return VarArgsFPRIndex; } - void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; } - - unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; } - void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; } - - typedef SmallPtrSet<const MachineInstr *, 16> SetOfInstructions; - - const SetOfInstructions &getLOHRelated() const { return LOHRelated; } - - // Shortcuts for LOH related types. - class MILOHDirective { - MCLOHType Kind; - - /// Arguments of this directive. Order matters. - SmallVector<const MachineInstr *, 3> Args; - - public: - typedef SmallVectorImpl<const MachineInstr *> LOHArgs; - - MILOHDirective(MCLOHType Kind, const LOHArgs &Args) - : Kind(Kind), Args(Args.begin(), Args.end()) { - assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!"); - } - - MCLOHType getKind() const { return Kind; } - const LOHArgs &getArgs() const { return Args; } - }; - - typedef MILOHDirective::LOHArgs MILOHArgs; - typedef SmallVector<MILOHDirective, 32> MILOHContainer; - - const MILOHContainer &getLOHContainer() const { return LOHContainerSet; } - - /// Add a LOH directive of this @p Kind with these @p Args. - void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) { - LOHContainerSet.push_back(MILOHDirective(Kind, Args)); - LOHRelated.insert(Args.begin(), Args.end()); - } - -private: - // Hold the lists of LOHs. - MILOHContainer LOHContainerSet; - SetOfInstructions LOHRelated; -}; -} // End llvm namespace - -#endif // ARM64MACHINEFUNCTIONINFO_H diff --git a/lib/Target/ARM64/ARM64PerfectShuffle.h b/lib/Target/ARM64/ARM64PerfectShuffle.h deleted file mode 100644 index 6759236..0000000 --- a/lib/Target/ARM64/ARM64PerfectShuffle.h +++ /dev/null @@ -1,6586 +0,0 @@ -//===-- ARM64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file, which was autogenerated by llvm-PerfectShuffle, contains data -// for the optimal way to build a perfect shuffle using AdvSIMD instructions.
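For context on how a lowering pass consumes a table like the one deleted below: each of the four result lanes takes one of nine values (source lanes 0-7, or 8 for an undef lane), so a 4-lane shuffle mask maps to a base-9 index in [0, 6561). This is a minimal sketch, with perfectShuffleIndex as an assumed helper name; the packed layout of each table entry (a cost plus operand and opcode fields) is defined by the llvm-PerfectShuffle generator and is not reproduced here.

// Map a 4-lane shuffle mask to its index in a 9^4-entry perfect shuffle table.
static unsigned perfectShuffleIndex(const int Mask[4]) {
  unsigned Index = 0;
  for (int i = 0; i < 4; ++i) {
    unsigned Lane = Mask[i] < 0 ? 8u : static_cast<unsigned>(Mask[i]); // <0 = undef
    Index = Index * 9 + Lane;
  }
  return Index; // e.g. the <0,0,0,0> mask maps to index 0
}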
-// -//===----------------------------------------------------------------------===// - -// 31 entries have cost 0 -// 242 entries have cost 1 -// 1447 entries have cost 2 -// 3602 entries have cost 3 -// 1237 entries have cost 4 -// 2 entries have cost 5 - -// This table is 6561*4 = 26244 bytes in size. -static const unsigned PerfectShuffleTable[6561+1] = { - 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS - 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS - 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> - 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> - 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS - 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> - 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> - 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> - 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS - 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> - 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS - 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS - 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> - 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> - 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> - 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> - 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> - 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS - 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> - 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> - 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS - 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> - 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> - 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> - 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> - 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> - 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS - 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> - 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> - 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> - 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> - 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> - 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> - 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> - 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> - 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> - 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> - 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS - 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS - 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> - 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> - 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS - 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS - 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> - 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS - 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> - 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> - 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> - 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> - 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> - 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> - 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> - 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> - 2720039396U, 
// <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> - 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> - 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS - 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> - 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> - 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS - 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> - 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> - 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> - 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> - 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> - 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0> - 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> - 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> - 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> - 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> - 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> - 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> - 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> - 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS - 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS - 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS - 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> - 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS - 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS - 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS - 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> - 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS - 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> - 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS - 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> - 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> - 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> - 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> - 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> - 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> - 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS - 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> - 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> - 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> - 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> - 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS - 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> - 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> - 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> - 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS - 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS - 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> - 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> - 835584U, // <0,1,2,3>: Cost 0 copy LHS - 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS - 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> - 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> - 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> - 835584U, // <0,1,2,u>: Cost 0 copy LHS - 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> - 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> - 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> - 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> - 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS - 
2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> - 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> - 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> - 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> - 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS - 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> - 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> - 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> - 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS - 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS - 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS - 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4> - 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS - 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> - 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> - 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> - 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> - 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> - 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> - 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> - 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> - 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> - 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS - 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> - 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> - 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> - 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS - 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> - 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> - 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> - 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> - 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> - 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> - 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> - 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> - 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> - 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> - 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> - 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> - 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> - 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS - 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS - 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS - 835584U, // <0,1,u,3>: Cost 0 copy LHS - 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS - 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS - 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS - 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> - 835584U, // <0,1,u,u>: Cost 0 copy LHS - 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> - 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS - 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS - 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0> - 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> - 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> - 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> - 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> - 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS - 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> - 
2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> - 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> - 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> - 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS - 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> - 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> - 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> - 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> - 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> - 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> - 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> - 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> - 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS - 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> - 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> - 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> - 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS - 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> - 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> - 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> - 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> - 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> - 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> - 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> - 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> - 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> - 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS - 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3> - 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> - 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> - 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS - 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS - 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS - 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> - 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS - 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> - 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> - 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> - 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> - 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> - 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> - 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> - 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS - 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS - 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> - 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> - 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> - 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> - 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> - 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> - 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> - 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> - 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> - 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> - 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> - 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> - 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> - 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> - 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> - 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, 
<7,6,0,2> - 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> - 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> - 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> - 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS - 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS - 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> - 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS - 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS - 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS - 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS - 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS - 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0> - 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> - 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> - 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> - 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS - 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> - 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> - 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> - 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS - 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> - 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> - 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> - 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> - 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> - 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> - 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> - 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> - 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> - 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS - 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> - 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> - 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> - 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS - 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6> - 2598154746U, // <0,3,2,6>: Cost 3 vext1 , <6,2,7,3> - 2598155258U, // <0,3,2,7>: Cost 3 vext1 , <7,0,1,2> - 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS - 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> - 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> - 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> - 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3> - 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> - 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> - 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> - 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> - 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> - 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> - 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> - 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> - 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> - 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> - 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> - 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS - 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> - 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> - 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS - 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> - 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> - 3769600578U, // 
<0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> - 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> - 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> - 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> - 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0> - 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> - 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> - 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> - 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7> - 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> - 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> - 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6> - 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> - 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> - 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> - 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> - 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> - 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> - 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> - 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> - 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> - 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> - 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> - 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> - 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS - 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> - 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> - 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> - 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS - 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> - 2598203898U, // <0,3,u,6>: Cost 3 vext1 , <6,2,7,3> - 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> - 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS - 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> - 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS - 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> - 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> - 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> - 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> - 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS - 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0> - 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS - 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS - 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> - 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> - 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> - 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS - 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS - 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS - 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> - 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS - 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS - 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> - 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> - 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> - 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS - 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS - 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS - 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> - 1946996040U, // <0,4,2,u>: 
Cost 2 vtrnl LHS, RHS - 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> - 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> - 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> - 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3> - 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> - 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> - 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS - 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> - 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> - 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> - 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> - 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> - 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> - 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> - 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS - 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS - 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> - 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS - 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS - 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> - 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> - 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> - 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS - 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> - 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS - 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> - 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS - 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> - 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> - 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> - 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> - 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> - 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> - 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> - 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> - 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> - 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS - 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> - 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> - 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4> - 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS - 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> - 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> - 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> - 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> - 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS - 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS - 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS - 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> - 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS - 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS - 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS - 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> - 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS - 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> - 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS - 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> - 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> - 
3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS - 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> - 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> - 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS - 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS - 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS - 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> - 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> - 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> - 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> - 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> - 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> - 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> - 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS - 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS - 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> - 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> - 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> - 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS - 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> - 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> - 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS - 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS - 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> - 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> - 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> - 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> - 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> - 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> - 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> - 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> - 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> - 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> - 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> - 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> - 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> - 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> - 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS - 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> - 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> - 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS - 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> - 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> - 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> - 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> - 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> - 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> - 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> - 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> - 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> - 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS - 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> - 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> - 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> - 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS - 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0> - 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> - 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> - 1645047915U, // <0,5,6,u>: Cost 2 
vext3 <5,6,u,0>, <5,6,u,0> - 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS - 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> - 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> - 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2> - 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS - 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> - 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> - 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> - 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS - 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS - 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS - 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> - 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> - 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> - 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS - 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> - 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> - 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> - 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS - 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS - 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> - 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> - 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> - 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> - 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> - 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS - 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS - 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS - 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> - 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3> - 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> - 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS - 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> - 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> - 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS - 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS - 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS - 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> - 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> - 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> - 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2> - 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> - 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> - 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS - 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS - 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> - 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> - 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> - 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> - 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> - 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> - 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> - 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> - 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> - 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS - 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2> - 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> - 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> - 
2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS - 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS - 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> - 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS - 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS - 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> - 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> - 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> - 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> - 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> - 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> - 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> - 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS - 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> - 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> - 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> - 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> - 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> - 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> - 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> - 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> - 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> - 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> - 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> - 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> - 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> - 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> - 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> - 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> - 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> - 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> - 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> - 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS - 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS - 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> - 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> - 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> - 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS - 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> - 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS - 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS - 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> - 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS - 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> - 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> - 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> - 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> - 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> - 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> - 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS - 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> - 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> - 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> - 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> - 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> - 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> - 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> - 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> - 2256999480U, // <0,7,1,u>: 
Cost 3 vrev <7,0,u,1> - 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS - 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> - 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> - 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> - 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS - 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> - 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> - 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> - 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> - 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> - 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> - 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> - 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> - 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> - 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> - 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> - 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> - 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> - 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS - 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> - 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> - 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> - 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> - 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS - 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> - 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> - 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS - 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> - 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> - 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> - 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> - 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> - 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> - 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> - 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> - 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> - 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> - 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> - 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> - 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> - 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS - 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> - 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> - 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> - 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> - 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> - 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> - 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> - 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> - 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS - 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> - 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> - 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> - 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> - 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> - 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> - 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> - 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> - 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> - 
2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS - 2669066421U, // <0,7,u,6>: Cost 3 vext2 , - 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> - 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> - 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS - 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS - 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS - 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, - 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS - 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, - 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS - 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> - 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS - 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> - 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS - 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS - 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> - 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS - 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS - 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, - 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS - 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS - 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS - 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> - 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS - 835584U, // <0,u,2,3>: Cost 0 copy LHS - 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS - 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, - 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS - 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> - 835584U, // <0,u,2,u>: Cost 0 copy LHS - 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> - 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> - 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> - 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> - 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> - 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> - 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> - 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> - 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> - 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS - 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS - 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS - 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> - 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS - 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS - 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS - 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, - 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS - 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS - 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> - 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, - 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, - 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS - 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> - 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS - 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS - 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS - 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS - 2262496983U, // <0,u,6,1>: Cost 3 vrev - 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> - 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, - 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS - 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> - 
2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> - 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> - 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> - 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS - 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0> - 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> - 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> - 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS - 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, - 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> - 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> - 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS - 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS - 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS - 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS - 835584U, // <0,u,u,3>: Cost 0 copy LHS - 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS - 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS - 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS - 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> - 835584U, // <0,u,u,u>: Cost 0 copy LHS - 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> - 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> - 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> - 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> - 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> - 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> - 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> - 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> - 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> - 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS - 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> - 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS - 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> - 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS - 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> - 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> - 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> - 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS - 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> - 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> - 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0> - 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> - 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> - 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> - 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> - 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> - 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> - 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> - 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> - 67944550U, // <1,0,3,2>: Cost 1 vrev LHS - 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> - 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS - 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> - 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> - 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> - 68386972U, // <1,0,3,u>: Cost 1 vrev LHS - 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> - 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> - 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, 
<0,4,2,6> - 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> - 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1> - 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS - 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> - 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4> - 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS - 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> - 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS - 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS - 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> - 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> - 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> - 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> - 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS - 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS - 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> - 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> - 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> - 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> - 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> - 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> - 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> - 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> - 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> - 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> - 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> - 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> - 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> - 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> - 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> - 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> - 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> - 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> - 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> - 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> - 67985515U, // <1,0,u,2>: Cost 1 vrev LHS - 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> - 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> - 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS - 2669082807U, // <1,0,u,6>: Cost 3 vext2 , - 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u> - 68427937U, // <1,0,u,u>: Cost 1 vrev LHS - 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> - 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS - 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> - 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> - 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> - 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> - 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> - 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> - 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> - 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS - 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> - 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> - 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS - 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> - 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> - 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, 
<7,1,1,1> - 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS - 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2> - 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> - 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> - 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1> - 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS - 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> - 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> - 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> - 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> - 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> - 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> - 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> - 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS - 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> - 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> - 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> - 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> - 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS - 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS - 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> - 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> - 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> - 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS - 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS - 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS - 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> - 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS - 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> - 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> - 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> - 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> - 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> - 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> - 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> - 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS - 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> - 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> - 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> - 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> - 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7> - 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> - 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> - 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> - 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> - 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> - 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> - 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> - 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> - 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS - 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> - 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> - 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> - 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> - 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> - 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS - 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, - 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS - 
1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS - 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS - 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, - 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS - 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS - 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0> - 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS - 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> - 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> - 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> - 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> - 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> - 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> - 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS - 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> - 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> - 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> - 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS - 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS - 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> - 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> - 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> - 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS - 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> - 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> - 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> - 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> - 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> - 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> - 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> - 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> - 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> - 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS - 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> - 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS - 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> - 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> - 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS - 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2> - 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> - 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> - 2598759198U, // <1,2,4,3>: Cost 3 vext1 , <3,u,1,2> - 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> - 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS - 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> - 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> - 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS - 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS - 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> - 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> - 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS - 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS - 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> - 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> - 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> - 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS - 2700289945U, // <1,2,6,0>: Cost 3 vext3 
<2,6,0,1>, <2,6,0,1> - 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> - 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> - 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> - 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> - 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7> - 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> - 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> - 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7> - 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> - 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> - 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> - 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> - 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> - 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> - 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> - 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> - 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> - 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS - 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> - 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS - 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS - 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> - 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS - 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> - 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS - 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> - 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> - 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> - 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> - 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> - 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> - 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS - 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> - 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> - 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> - 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS - 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS - 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7> - 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7> - 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1> - 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS - 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> - 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3> - 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> - 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> - 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> - 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5> - 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> - 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3> - 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> - 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS - 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3> - 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2> - 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3> - 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS - 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> - 2598826490U, // 
<1,3,3,6>: Cost 3 vext1 , <6,2,7,3> - 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7> - 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS - 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS - 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3> - 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3> - 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4> - 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS - 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS - 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6> - 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4> - 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS - 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS - 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7> - 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5> - 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5> - 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS - 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5> - 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4> - 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS - 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS - 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0> - 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1> - 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3> - 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7> - 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3> - 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, - 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3> - 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> - 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> - 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> - 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3> - 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7> - 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3> - 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6> - 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5> - 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1> - 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7> - 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1> - 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS - 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u> - 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2> - 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS - 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS - 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS - 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6> - 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS - 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS - 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4> - 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS - 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4> - 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> - 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5> - 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1> - 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2> - 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1> - 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1> - 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2> - 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4> - 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0> - 3692422134U, // 
<1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6> - 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1> - 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS - 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS - 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0> - 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS - 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2> - 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3> - 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2> - 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1> - 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4> - 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS - 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7> - 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2> - 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS - 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS - 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4> - 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3> - 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3> - 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6> - 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5> - 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6> - 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3> - 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u> - 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1> - 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4> - 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4> - 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4> - 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4> - 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS - 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6> - 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1> - 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS - 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS - 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5> - 2598913774U, // <1,4,5,2>: Cost 3 vext1 , <2,3,u,1> - 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2> - 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS - 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS - 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS - 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS - 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS - 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS - 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1> - 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2> - 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2> - 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS - 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7> - 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7> - 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> - 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS - 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4> - 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1> - 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4> - 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4> - 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS - 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0> - 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1> - 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2> - 
2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4> - 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS - 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS - 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u> - 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1> - 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, - 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1> - 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS - 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> - 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS - 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0> - 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS - 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5> - 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4> - 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> - 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5> - 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1> - 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1> - 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS - 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2> - 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1> - 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0> - 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7> - 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5> - 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1> - 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7> - 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS - 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3> - 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1> - 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5> - 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2> - 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1> - 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5> - 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7> - 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7> - 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS - 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1> - 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2> - 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7> - 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2> - 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3> - 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6> - 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5> - 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6> - 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS - 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS - 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1> - 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> - 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3> - 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4> - 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4> - 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS - 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS - 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6> - 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS - 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS - 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5> - 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5> - 4029301675U, // 
<1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3> - 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS - 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5> - 2667794530U, // <1,5,5,6>: Cost 3 vext2 , <5,6,7,0> - 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7> - 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS - 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1> - 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5> - 2667794938U, // <1,5,6,2>: Cost 3 vext2 , <6,2,7,3> - 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4> - 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6> - 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6> - 2667795256U, // <1,5,6,6>: Cost 3 vext2 , <6,6,6,6> - 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0> - 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0> - 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS - 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1> - 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1> - 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7> - 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS - 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7> - 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0> - 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1> - 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS - 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, - 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS - 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, - 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, - 1594054682U, // <1,5,u,4>: Cost 2 vext2 , - 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS - 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, - 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS - 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS - 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0> - 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS - 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6> - 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1> - 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5> - 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6> - 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6> - 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS - 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS - 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2> - 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1> - 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0> - 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3> - 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6> - 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7> - 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1> - 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS - 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS - 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1> - 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0> - 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2> - 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1> - 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS - 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7> - 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7> - 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS - 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS - 
2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS - 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1> - 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3> - 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3> - 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS - 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, - 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3> - 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7> - 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u> - 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1> - 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0> - 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4> - 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4> - 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5> - 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS - 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS - 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS - 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS - 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2> - 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5> - 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5> - 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5> - 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6> - 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5> - 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0> - 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS - 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS - 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> - 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6> - 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7> - 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0> - 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS - 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7> - 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6> - 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7> - 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7> - 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> - 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> - 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3> - 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7> - 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5> - 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> - 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7> - 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS - 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> - 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> - 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS - 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2> - 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, - 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5> - 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS - 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, - 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7> - 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1> - 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> - 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS - 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1> - 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1> - 2726466580U, // <1,7,0,4>: Cost 3 vext3 
<7,0,4,1>, <7,0,4,1> - 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0> - 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> - 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7> - 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS - 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS - 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1> - 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0> - 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1> - 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS - 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> - 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> - 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1> - 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7> - 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS - 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7> - 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2> - 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1> - 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS - 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7> - 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> - 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2> - 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS - 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS - 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7> - 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2> - 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3> - 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS - 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3> - 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> - 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2> - 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS - 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1> - 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1> - 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4> - 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4> - 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4> - 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS - 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0> - 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6> - 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS - 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS - 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7> - 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2> - 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5> - 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS - 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6> - 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> - 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS - 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS - 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> - 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> - 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7> - 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6> - 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS - 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6> - 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6> - 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0> - 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0> - 
2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> - 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1> - 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1> - 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0> - 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS - 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7> - 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7> - 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7> - 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1> - 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS - 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS - 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2> - 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u> - 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS - 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> - 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> - 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2> - 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS - 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u> - 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS - 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2> - 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u> - 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u> - 1658771190U, // <1,u,0,5>: Cost 2 vext3 , - 2736789248U, // <1,u,0,6>: Cost 3 vext3 , - 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1> - 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS - 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS - 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS - 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS - 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS - 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7> - 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS - 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS - 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS - 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> - 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS - 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> - 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> - 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> - 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS - 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> - 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS - 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> - 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS - 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 115726126U, // <1,u,3,2>: Cost 1 vrev LHS - 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS - 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS - 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3> - 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS - 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS - 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1> - 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0> - 2263217967U, // <1,u,4,2>: Cost 3 vrev - 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4> - 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS - 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS - 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6> - 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, - 
1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS - 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS - 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7> - 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS - 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS - 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS - 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS - 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS - 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS - 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS - 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u> - 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, - 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3> - 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, - 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4> - 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, - 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u> - 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> - 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> - 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> - 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1> - 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1> - 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7> - 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS - 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u> - 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, - 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7> - 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> - 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS - 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS - 115767091U, // <1,u,u,2>: Cost 1 vrev LHS - 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS - 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS - 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS - 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS - 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS - 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS - 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0> - 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1> - 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2> - 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0> - 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS - 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5> - 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0> - 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7> - 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2> - 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1> - 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0> - 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS - 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS - 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS - 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7> - 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> - 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2> - 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS - 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS - 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2> - 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0> - 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2> - 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS - 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, 
<5,1,7,3> - 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2> - 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2> - 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS - 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> - 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1> - 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2> - 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3> - 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS - 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5> - 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6> - 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7> - 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u> - 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS - 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> - 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6> - 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2> - 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS - 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS - 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS - 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5> - 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6> - 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS - 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5> - 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7> - 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5> - 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5> - 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5> - 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0> - 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS - 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5> - 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0> - 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS - 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7> - 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6> - 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6> - 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0> - 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6> - 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7> - 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6> - 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> - 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3> - 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1> - 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0> - 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6> - 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2> - 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0> - 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7> - 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> - 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS - 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1> - 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS - 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2> - 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS - 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS - 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS - 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS - 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS - 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS - 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS - 3759440619U, // <2,1,0,2>: Cost 4 vext3 
<0,2,0,2>, <1,0,2,0> - 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> - 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS - 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2> - 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0> - 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1> - 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2> - 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS - 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1> - 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0> - 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3> - 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS - 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7> - 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1> - 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2> - 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3> - 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS - 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1> - 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2> - 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0> - 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS - 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3> - 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7> - 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0> - 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0> - 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS - 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1> - 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2> - 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3> - 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS - 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5> - 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> - 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7> - 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u> - 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5> - 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6> - 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5> - 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4> - 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS - 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS - 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS - 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6> - 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4> - 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS - 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7> - 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1> - 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7> - 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS - 3740913668U, // <2,1,5,5>: Cost 4 vext2 , <5,5,5,5> - 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5> - 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS - 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7> - 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS - 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2> - 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7> - 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS - 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS - 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5> - 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6> - 3731624800U, // <2,1,6,7>: Cost 4 vext2 
<6,7,2,1>, <6,7,2,1> - 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS - 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2> - 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1> - 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0> - 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7> - 3740915046U, // <2,1,7,4>: Cost 4 vext2 , <7,4,5,6> - 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7> - 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1> - 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u> - 2669827714U, // <2,1,7,u>: Cost 3 vext2 , <7,u,1,2> - 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS - 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1> - 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2> - 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u> - 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS - 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5> - 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> - 2669828370U, // <2,1,u,7>: Cost 3 vext2 , - 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u> - 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2> - 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS - 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0> - 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2> - 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS - 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7> - 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6> - 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2> - 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2> - 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2> - 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1> - 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0> - 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS - 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS - 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7> - 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3> - 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1> - 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS - 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS - 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2> - 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS - 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3> - 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS - 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7> - 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7> - 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> - 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS - 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1> - 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0> - 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2> - 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS - 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5> - 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5> - 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6> - 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3> - 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS - 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS - 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2> - 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5> - 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5> - 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS 
- 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS - 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS - 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4> - 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS - 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2> - 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3> - 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7> - 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS - 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5> - 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5> - 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0> - 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS - 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS - 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6> - 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3> - 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3> - 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7> - 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS - 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7> - 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6> - 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2> - 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7> - 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1> - 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5> - 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2> - 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS - 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6> - 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7> - 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7> - 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7> - 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1> - 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS - 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS - 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS - 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS - 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS - 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS - 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6> - 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS - 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS - 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> - 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS - 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> - 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3> - 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> - 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7> - 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> - 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0> - 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS - 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> - 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> - 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> - 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> - 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS - 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> - 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7> - 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0> - 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> - 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2> - 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1> - 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, 
<2,2,2,2> - 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> - 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5> - 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7> - 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1> - 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> - 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> - 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3> - 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3> - 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3> - 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> - 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7> - 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3> - 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> - 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2> - 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS - 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4> - 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> - 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2> - 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS - 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS - 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> - 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4> - 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS - 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS - 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3> - 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4> - 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6> - 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> - 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> - 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> - 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7> - 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7> - 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> - 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6> - 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3> - 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5> - 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> - 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7> - 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> - 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> - 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1> - 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2> - 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3> - 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3> - 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1> - 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6> - 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2> - 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1> - 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> - 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2> - 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, - 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS - 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, - 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, - 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, - 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS - 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, - 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, - 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS - 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4> - 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS - 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, 
<0,2,2,4> - 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4> - 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2> - 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1> - 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> - 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1> - 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> - 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2> - 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1> - 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4> - 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS - 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4> - 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7> - 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3> - 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1> - 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS - 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4> - 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3> - 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4> - 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5> - 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4> - 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS - 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS - 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0> - 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS - 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4> - 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1> - 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4> - 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3> - 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4> - 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> - 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> - 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7> - 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> - 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2> - 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1> - 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2> - 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3> - 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4> - 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS - 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS - 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7> - 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS - 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5> - 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2> - 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5> - 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5> - 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS - 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5> - 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS - 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS - 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS - 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS - 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2> - 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4> - 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2> - 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS - 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS - 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6> - 
2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1> - 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS - 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> - 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2> - 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4> - 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4> - 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6> - 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0> - 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1> - 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7> - 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> - 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS - 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS - 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2> - 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2> - 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS - 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5> - 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS - 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS - 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS - 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0> - 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS - 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2> - 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5> - 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5> - 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5> - 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7> - 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS - 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS - 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2> - 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1> - 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0> - 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7> - 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5> - 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7> - 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2> - 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3> - 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5> - 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2> - 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1> - 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2> - 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1> - 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5> - 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1> - 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7> - 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS - 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS - 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS - 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3> - 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5> - 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3> - 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6> - 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3> - 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> - 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7> - 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u> - 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS - 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4> - 
2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5> - 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4> - 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS - 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS - 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6> - 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS - 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS - 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS - 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7> - 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5> - 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5> - 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS - 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5> - 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6> - 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7> - 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7> - 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS - 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6> - 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3> - 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6> - 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5> - 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7> - 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6> - 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS - 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS - 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS - 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2> - 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7> - 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2> - 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS - 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7> - 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2> - 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS - 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS - 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS - 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS - 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u> - 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u> - 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, - 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS - 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> - 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7> - 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS - 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0> - 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS - 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2> - 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0> - 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6> - 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6> - 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1> - 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2> - 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS - 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2> - 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1> - 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0> - 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS - 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6> - 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7> - 3693839585U, // <2,6,1,6>: Cost 4 vext2 
<0,4,2,6>, <1,6,3,7> - 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2> - 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS - 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1> - 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3> - 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2> - 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1> - 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6> - 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7> - 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6> - 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS - 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1> - 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2> - 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1> - 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7> - 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3> - 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6> - 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5> - 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> - 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS - 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS - 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2> - 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3> - 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0> - 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4> - 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4> - 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS - 2667875700U, // <2,6,4,6>: Cost 3 vext2 , <4,6,4,6> - 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS - 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS - 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2> - 2667876048U, // <2,6,5,1>: Cost 3 vext2 , <5,1,7,3> - 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7> - 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5> - 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5> - 2667876356U, // <2,6,5,5>: Cost 3 vext2 , <5,5,5,5> - 2667876450U, // <2,6,5,6>: Cost 3 vext2 , <5,6,7,0> - 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS - 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS - 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS - 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1> - 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6> - 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3> - 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS - 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5> - 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6> - 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS - 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS - 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1> - 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2> - 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2> - 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0> - 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5> - 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7> - 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2> - 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0> - 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1> - 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, - 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS - 2620102536U, // <2,6,u,2>: Cost 3 
vext2 <0,4,2,6>, - 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS - 1594136612U, // <2,6,u,4>: Cost 2 vext2 , - 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS - 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, - 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS - 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS - 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2> - 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2> - 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2> - 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0> - 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2> - 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7> - 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7> - 2599760953U, // <2,7,0,7>: Cost 3 vext1 , <7,0,u,2> - 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2> - 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> - 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1> - 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0> - 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7> - 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5> - 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7> - 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7> - 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0> - 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> - 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2> - 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7> - 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2> - 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1> - 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6> - 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7> - 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7> - 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1> - 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7> - 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS - 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7> - 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2> - 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3> - 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS - 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3> - 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3> - 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2> - 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS - 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6> - 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4> - 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7> - 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4> - 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4> - 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS - 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u> - 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0> - 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS - 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2> - 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7> - 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7> - 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5> - 2599800118U, // <2,7,5,4>: Cost 3 vext1 , RHS - 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5> - 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7> - 2599802214U, // <2,7,5,7>: Cost 3 
vext1 , <7,4,5,6> - 2599802670U, // <2,7,5,u>: Cost 3 vext1 , LHS - 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS - 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7> - 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3> - 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6> - 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS - 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6> - 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6> - 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0> - 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS - 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1> - 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2> - 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7> - 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7> - 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS - 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5> - 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7> - 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7> - 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1> - 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS - 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2> - 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2> - 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u> - 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS - 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS - 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u> - 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2> - 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS - 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0> - 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS - 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> - 1658631909U, // <2,u,0,3>: Cost 2 vext3 , - 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> - 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7> - 1658853120U, // <2,u,0,6>: Cost 2 vext3 , - 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS - 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS - 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> - 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1> - 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0> - 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> - 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5> - 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> - 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7> - 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, - 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3> - 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS - 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1> - 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS - 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1> - 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS - 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> - 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS - 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS - 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> - 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1> - 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3> - 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS - 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> - 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5> - 1514172931U, // 
<2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3> - 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS - 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS - 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS - 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> - 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5> - 1190213513U, // <2,u,4,3>: Cost 2 vrev - 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS - 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS - 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6> - 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS - 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS - 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> - 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> - 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5> - 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, - 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> - 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> - 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS - 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7> - 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS - 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS - 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2> - 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> - 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, - 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS - 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS - 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6> - 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1> - 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS - 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> - 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u> - 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7> - 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1> - 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> - 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2> - 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7> - 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> - 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> - 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS - 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS - 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS - 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS - 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS - 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS - 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS - 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS - 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS - 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0> - 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> - 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> - 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1> - 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> - 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7> - 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1> - 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0> - 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> - 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS - 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0> - 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS - 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3> - 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS - 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7> - 
2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7> - 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1> - 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS - 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> - 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> - 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0> - 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> - 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> - 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7> - 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> - 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7> - 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> - 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2> - 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> - 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1> - 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3> - 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> - 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7> - 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7> - 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1> - 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3> - 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4> - 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> - 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> - 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1> - 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6> - 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS - 2666573172U, // <3,0,4,6>: Cost 3 vext2 , <4,6,4,6> - 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4> - 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> - 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7> - 2666573520U, // <3,0,5,1>: Cost 3 vext2 , <5,1,7,3> - 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS - 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6> - 2666573766U, // <3,0,5,4>: Cost 3 vext2 , <5,4,7,6> - 2666573828U, // <3,0,5,5>: Cost 3 vext2 , <5,5,5,5> - 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7> - 2666573992U, // <3,0,5,7>: Cost 3 vext2 , <5,7,5,7> - 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS - 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7> - 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7> - 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7> - 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2> - 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1> - 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0> - 2666574648U, // <3,0,6,6>: Cost 3 vext2 , <6,6,6,6> - 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0> - 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7> - 2666574842U, // <3,0,7,0>: Cost 3 vext2 , <7,0,1,2> - 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7> - 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0> - 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7> - 2666575206U, // <3,0,7,4>: Cost 3 vext2 , <7,4,5,6> - 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7> - 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3> - 2666575468U, // <3,0,7,7>: Cost 3 vext2 , <7,7,7,7> - 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0> - 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> - 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> - 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS - 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> - 
1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> - 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS - 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, - 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u> - 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS - 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS - 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS - 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1> - 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> - 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS - 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1> - 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6> - 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0> - 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> - 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> - 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> - 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1> - 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> - 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> - 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> - 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5> - 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3> - 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> - 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1> - 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> - 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2> - 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0> - 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS - 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> - 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7> - 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0> - 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0> - 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS - 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> - 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> - 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1> - 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS - 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> - 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> - 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3> - 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> - 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS - 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5> - 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5> - 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5> - 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS - 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS - 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS - 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4> - 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5> - 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> - 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> - 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> - 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> - 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> - 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7> - 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0> - 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS - 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> - 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1> - 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> - 
2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7> - 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> - 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5> - 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> - 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7> - 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0> - 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7> - 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS - 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1> - 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2> - 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS - 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS - 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7> - 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7> - 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7> - 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS - 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS - 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> - 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> - 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0> - 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS - 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> - 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> - 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS - 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3> - 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> - 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS - 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0> - 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> - 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> - 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7> - 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> - 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0> - 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS - 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> - 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> - 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0> - 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> - 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS - 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> - 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> - 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1> - 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> - 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> - 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> - 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> - 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> - 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> - 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> - 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6> - 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3> - 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3> - 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> - 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> - 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2> - 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3> - 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> - 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> - 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2> - 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> - 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> - 2558279782U, // <3,2,4,0>: Cost 3 vext1 
<1,3,2,4>, LHS - 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4> - 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4> - 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> - 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS - 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS - 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0> - 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4> - 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS - 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS - 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> - 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> - 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> - 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> - 2665263108U, // <3,2,5,5>: Cost 3 vext2 , <5,5,5,5> - 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> - 2665263272U, // <3,2,5,7>: Cost 3 vext2 , <5,7,5,7> - 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> - 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> - 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> - 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> - 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> - 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> - 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> - 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> - 2665263950U, // <3,2,6,7>: Cost 3 vext2 , <6,7,0,1> - 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> - 2665264122U, // <3,2,7,0>: Cost 3 vext2 , <7,0,1,2> - 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3> - 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2> - 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS - 2665264486U, // <3,2,7,4>: Cost 3 vext2 , <7,4,5,6> - 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7> - 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7> - 2665264748U, // <3,2,7,7>: Cost 3 vext2 , <7,7,7,7> - 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS - 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> - 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS - 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2> - 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> - 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> - 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS - 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0> - 2665265408U, // <3,2,u,7>: Cost 3 vext2 , - 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> - 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0> - 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2> - 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0> - 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2> - 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> - 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2> - 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2> - 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7> - 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> - 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> - 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> - 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3> - 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1> - 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS - 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> - 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3> - 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3> - 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3> - 2564309094U, // 
<3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS - 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3> - 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3> - 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0> - 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS - 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> - 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> - 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3> - 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> - 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS - 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3> - 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3> - 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS - 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS - 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5> - 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7> - 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3> - 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS - 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS - 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> - 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4> - 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6> - 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4> - 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> - 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS - 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7> - 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> - 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS - 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> - 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> - 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5> - 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS - 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5> - 2665934946U, // <3,3,5,6>: Cost 3 vext2 , <5,6,7,0> - 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS - 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS - 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> - 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7> - 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> - 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7> - 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7> - 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7> - 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6> - 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3> - 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3> - 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS - 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> - 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7> - 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3> - 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS - 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7> - 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3> - 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7> - 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS - 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS - 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> - 1592858504U, // <3,3,u,2>: Cost 2 vext2 , - 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS - 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS - 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> - 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3> - 
2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS - 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS - 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0> - 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS - 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> - 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4> - 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5> - 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1> - 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> - 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0> - 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS - 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> - 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> - 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> - 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3> - 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS - 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0> - 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> - 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4> - 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> - 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS - 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> - 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> - 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1> - 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3> - 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3> - 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0> - 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4> - 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0> - 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> - 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> - 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> - 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3> - 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1> - 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS - 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS - 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1> - 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2> - 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS - 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4> - 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> - 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> - 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4> - 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS - 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> - 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4> - 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS - 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS - 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> - 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> - 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2> - 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS - 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7> - 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS - 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5> - 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS - 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1> - 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6> - 2666607098U, // <3,4,6,2>: Cost 3 vext2 , <6,2,7,3> - 2558446082U, // <3,4,6,3>: 
Cost 3 vext1 <1,3,4,6>, <3,4,5,6> - 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6> - 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> - 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7> - 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4> - 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> - 2666607610U, // <3,4,7,0>: Cost 3 vext2 , <7,0,1,2> - 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5> - 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> - 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7> - 2666607974U, // <3,4,7,4>: Cost 3 vext2 , <7,4,5,6> - 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0> - 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0> - 2666608236U, // <3,4,7,7>: Cost 3 vext2 , <7,7,7,7> - 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4> - 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS - 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS - 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> - 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, - 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS - 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS - 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS - 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u> - 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS - 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0> - 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS - 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5> - 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4> - 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> - 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> - 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1> - 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0> - 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS - 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS - 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1> - 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5> - 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5> - 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> - 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3> - 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7> - 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3> - 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3> - 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3> - 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> - 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2> - 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5> - 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5> - 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> - 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7> - 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3> - 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5> - 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2> - 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5> - 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4> - 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3> - 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> - 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5> - 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6> - 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS - 3095072055U, // <3,5,3,u>: Cost 3 vtrnr 
<1,3,1,3>, RHS - 2600304742U, // <3,5,4,0>: Cost 3 vext1 , LHS - 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5> - 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4> - 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0> - 2600308022U, // <3,5,4,4>: Cost 3 vext1 , RHS - 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS - 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS - 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> - 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6> - 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS - 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3> - 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> - 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5> - 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS - 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5> - 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6> - 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7> - 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7> - 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1> - 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7> - 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6> - 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4> - 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5> - 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7> - 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7> - 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0> - 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0> - 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS - 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> - 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> - 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> - 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS - 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> - 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0> - 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7> - 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS - 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS - 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> - 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2> - 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2> - 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS - 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> - 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS - 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3> - 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS - 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS - 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> - 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4> - 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4> - 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> - 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> - 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> - 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS - 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2> - 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3> - 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1> - 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6> - 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1> - 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> - 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3> - 2721321376U, // <3,6,1,6>: Cost 3 
vext3 <6,1,6,3>, <6,1,6,3> - 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3> - 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6> - 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS - 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> - 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> - 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0> - 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> - 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6> - 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3> - 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3> - 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3> - 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2> - 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3> - 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3> - 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3> - 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6> - 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6> - 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2> - 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS - 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS - 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6> - 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> - 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5> - 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6> - 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> - 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> - 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0> - 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS - 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6> - 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS - 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2> - 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> - 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> - 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5> - 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6> - 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6> - 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5> - 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS - 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1> - 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> - 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3> - 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6> - 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> - 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> - 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> - 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> - 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> - 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> - 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7> - 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3> - 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> - 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1> - 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2> - 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS - 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1> - 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> - 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> - 2558609339U, // <3,6,u,2>: Cost 3 
vext1 <1,3,6,u>, <2,6,3,u> - 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6> - 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> - 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> - 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3> - 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1> - 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0> - 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS - 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2> - 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0> - 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5> - 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> - 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> - 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> - 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS - 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> - 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1> - 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0> - 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7> - 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS - 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> - 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> - 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> - 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7> - 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS - 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3> - 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2> - 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1> - 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS - 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7> - 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> - 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> - 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> - 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> - 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3> - 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> - 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3> - 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> - 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7> - 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7> - 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7> - 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2> - 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS - 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> - 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> - 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7> - 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS - 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS - 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4> - 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6> - 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS - 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2> - 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3> - 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3> - 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> - 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5> - 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5> - 2732971457U, // <3,7,5,6>: Cost 3 vext3 
LHS, <7,5,6,7> - 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS - 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS - 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> - 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> - 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> - 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> - 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> - 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> - 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> - 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> - 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> - 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS - 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> - 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> - 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> - 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS - 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> - 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> - 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> - 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, - 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS - 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, - 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS - 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, - 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS - 1595545808U, // <3,7,u,6>: Cost 2 vext2 , - 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS - 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> - 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, - 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, - 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, - 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, - 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, - 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, - 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS - 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, - 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> - 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> - 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS - 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, - 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS - 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> - 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> - 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, - 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS - 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> - 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> - 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> - 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, - 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> - 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, - 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> - 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, - 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, - 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, - 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> - 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, - 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS - 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, - 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, - 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, - 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS - 336380006U, // 
<3,u,3,u>: Cost 1 vdup3 LHS - 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS - 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, - 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, - 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, - 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> - 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, - 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, - 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, - 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, - 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS - 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, - 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> - 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, - 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS - 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> - 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS - 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, - 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS - 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, - 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> - 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, - 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, - 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> - 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, - 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, - 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, - 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS - 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> - 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> - 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS - 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS - 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> - 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, - 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS - 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, - 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, - 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS - 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS - 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, - 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, - 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS - 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, - 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS - 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> - 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> - 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> - 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> - 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> - 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> - 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> - 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> - 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> - 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS - 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> - 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> - 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS - 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> - 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> - 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> - 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> - 3759530159U, // <4,0,2,1>: Cost 4 
vext3 <0,2,1,4>, <0,2,1,4> - 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> - 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> - 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> - 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> - 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> - 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2> - 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> - 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS - 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> - 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4> - 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> - 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS - 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> - 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> - 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> - 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> - 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS - 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> - 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> - 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> - 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS - 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS - 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> - 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> - 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS - 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS - 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS - 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> - 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> - 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS - 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> - 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> - 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5> - 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS - 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS - 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> - 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS - 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> - 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS - 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> - 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> - 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> - 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS - 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2> - 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS - 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS - 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0> - 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> - 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> - 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> - 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> - 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS - 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS - 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS - 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> - 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS - 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS - 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> - 
2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> - 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> - 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS - 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> - 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> - 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> - 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> - 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> - 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> - 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS - 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> - 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> - 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> - 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> - 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> - 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> - 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> - 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> - 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> - 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS - 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> - 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> - 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> - 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS - 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> - 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> - 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> - 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> - 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS - 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> - 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> - 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4> - 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS - 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> - 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> - 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> - 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> - 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> - 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> - 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS - 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> - 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS - 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> - 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0> - 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS - 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS - 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> - 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5> - 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> - 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS - 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> - 2600686074U, // <4,1,5,6>: Cost 3 vext1 , <6,2,7,3> - 2600686586U, // <4,1,5,7>: Cost 3 vext1 , <7,0,1,2> - 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS - 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS - 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> - 4028205206U, // 
<4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> - 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS - 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS - 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> - 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> - 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> - 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS - 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> - 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> - 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> - 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> - 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> - 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> - 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> - 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> - 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> - 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS - 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> - 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> - 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> - 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS - 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS - 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 2600686586U, // <4,1,u,7>: Cost 3 vext1 , <7,0,1,2> - 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS - 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> - 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS - 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> - 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> - 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> - 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> - 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> - 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> - 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS - 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> - 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> - 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> - 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS - 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> - 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> - 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> - 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> - 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> - 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> - 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> - 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2> - 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> - 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> - 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> - 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> - 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> - 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> - 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> - 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> - 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> - 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> - 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5> - 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> - 
[Diff removal lines: a long run of auto-generated vector-shuffle cost-table entries, one per line, each of the form "-  NNNNNNNNNNU, // <a,b,c,d>: Cost N <op> <args>", covering shuffle indices <4,2,3,6> through <5,u,6,6>.]
2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS - 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, - 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS - 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> - 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> - 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS - 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS - 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> - 118708378U, // <5,u,7,6>: Cost 1 vrev RHS - 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS - 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS - 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS - 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS - 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS - 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS - 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS - 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS - 118716571U, // <5,u,u,6>: Cost 1 vrev RHS - 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS - 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS - 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> - 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> - 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> - 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> - 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> - 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> - 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> - 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> - 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> - 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS - 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> - 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS - 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> - 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS - 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> - 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> - 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> - 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS - 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> - 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> - 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6> - 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> - 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> - 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> - 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6> - 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> - 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> - 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> - 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> - 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> - 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> - 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> - 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> - 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> - 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> - 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> - 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> - 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> - 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> - 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> - 2687353198U, // 
<6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> - 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS - 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> - 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> - 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> - 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS - 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6> - 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> - 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> - 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> - 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> - 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> - 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS - 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS - 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> - 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS - 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS - 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> - 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> - 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> - 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> - 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> - 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS - 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS - 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0> - 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> - 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> - 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS - 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> - 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> - 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> - 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS - 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> - 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> - 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS - 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> - 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> - 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS - 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> - 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS - 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS - 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS - 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS - 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> - 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> - 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS - 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> - 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> - 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> - 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> - 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> - 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> - 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> - 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> - 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> - 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> - 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> - 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> - 2686026613U, // <6,1,1,u>: Cost 3 
vext3 <0,2,4,6>, <1,1,u,3> - 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS - 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> - 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> - 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> - 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS - 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3> - 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> - 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> - 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> - 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS - 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> - 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> - 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> - 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> - 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> - 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> - 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> - 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> - 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> - 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> - 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> - 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> - 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS - 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6> - 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> - 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> - 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> - 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> - 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> - 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6> - 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> - 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> - 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> - 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> - 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS - 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> - 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS - 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> - 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> - 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS - 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS - 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> - 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> - 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> - 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS - 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS - 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> - 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> - 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS - 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS - 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> - 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> - 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> - 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS - 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS - 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> - 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> - 2686027132U, // <6,1,u,3>: Cost 3 
vext3 <0,2,4,6>, <1,u,3,0> - 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> - 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> - 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> - 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> - 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> - 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0> - 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS - 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> - 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> - 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> - 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> - 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> - 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> - 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS - 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> - 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> - 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> - 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS - 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> - 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> - 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> - 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> - 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> - 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> - 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> - 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> - 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> - 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> - 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> - 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> - 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> - 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> - 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> - 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> - 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> - 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> - 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5> - 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> - 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> - 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> - 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1> - 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> - 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> - 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> - 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> - 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> - 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS - 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> - 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> - 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> - 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> - 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> - 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> - 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> - 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> - 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> - 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, 
<5,6,7,0> - 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS - 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> - 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> - 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> - 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> - 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> - 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> - 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> - 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> - 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> - 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> - 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS - 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> - 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> - 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS - 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS - 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> - 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> - 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> - 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS - 1591744256U, // <6,2,u,0>: Cost 2 vext2 , - 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS - 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> - 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS - 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5> - 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS - 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> - 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS - 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS - 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> - 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> - 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> - 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> - 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> - 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> - 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> - 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> - 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> - 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3> - 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1> - 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3> - 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1> - 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> - 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3> - 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0> - 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3> - 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> - 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4> - 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> - 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2> - 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0> - 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6> - 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7> - 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7> - 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6> - 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> - 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1> - 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3> - 2636958054U, // 
<6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> - 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3> - 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> - 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5> - 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7> - 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7> - 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5> - 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS - 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3> - 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> - 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> - 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS - 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> - 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6> - 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4> - 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> - 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS - 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7> - 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5> - 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5> - 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> - 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7> - 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6> - 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0> - 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6> - 2602164326U, // <6,3,6,0>: Cost 3 vext1 , LHS - 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3> - 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6> - 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1> - 2602167524U, // <6,3,6,4>: Cost 3 vext1 , <4,4,6,6> - 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7> - 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6> - 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7> - 2602170158U, // <6,3,6,u>: Cost 3 vext1 , LHS - 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS - 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> - 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> - 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2> - 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS - 2602176208U, // <6,3,7,5>: Cost 3 vext1 , <5,1,7,3> - 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> - 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7> - 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS - 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS - 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2> - 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> - 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2> - 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS - 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> - 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0> - 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7> - 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS - 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0> - 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS - 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6> - 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1> - 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> - 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> - 2687208348U, // <6,4,0,6>: Cost 3 vext3 
<0,4,2,6>, <4,0,6,2> - 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0> - 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS - 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> - 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1> - 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0> - 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS - 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS - 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0> - 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3> - 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1> - 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1> - 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4> - 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3> - 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2> - 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1> - 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> - 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS - 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0> - 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0> - 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4> - 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2> - 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3> - 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6> - 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3> - 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> - 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6> - 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5> - 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7> - 2668817222U, // <6,4,3,u>: Cost 3 vext2 , <3,u,5,6> - 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS - 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4> - 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> - 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4> - 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4> - 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS - 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> - 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4> - 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> - 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS - 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3> - 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> - 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6> - 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS - 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5> - 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS - 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS - 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS - 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS - 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> - 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> - 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> - 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS - 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3> - 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3> - 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2> - 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS - 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS - 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> - 3640157902U, // <6,4,7,2>: Cost 4 
vext1 <2,6,4,7>, <2,3,4,5> - 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7> - 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS - 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5> - 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6> - 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7> - 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS - 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS - 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS - 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u> - 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6> - 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6> - 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS - 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS - 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS - 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS - 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0> - 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS - 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6> - 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2> - 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1> - 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5> - 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7> - 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> - 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0> - 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1> - 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5> - 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0> - 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS - 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6> - 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7> - 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4> - 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3> - 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3> - 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS - 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5> - 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2> - 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5> - 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS - 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6> - 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> - 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS - 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5> - 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2> - 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5> - 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3> - 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3> - 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> - 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6> - 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0> - 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3> - 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3> - 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS - 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5> - 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5> - 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> - 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS - 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS - 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0> - 1643696070U, // <6,5,4,7>: Cost 2 vext3 
<5,4,7,6>, <5,4,7,6> - 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> - 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS - 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2> - 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2> - 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2> - 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS - 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> - 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> - 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> - 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> - 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> - 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> - 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> - 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> - 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> - 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> - 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> - 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> - 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> - 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS - 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> - 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> - 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2> - 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS - 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> - 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> - 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS - 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS - 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS - 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> - 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> - 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, - 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS - 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS - 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> - 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u> - 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> - 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS - 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS - 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> - 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> - 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> - 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3> - 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> - 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS - 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS - 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> - 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> - 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> - 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS - 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> - 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> - 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> - 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> - 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS - 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> - 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> - 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> - 2646935206U, // <6,6,2,3>: Cost 
3 vext2 <4,u,6,6>, <2,3,0,1> - 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> - 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> - 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> - 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> - 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> - 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> - 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> - 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> - 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> - 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> - 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> - 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> - 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS - 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> - 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS - 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> - 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> - 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> - 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> - 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS - 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> - 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> - 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> - 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS - 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> - 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> - 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> - 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> - 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> - 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> - 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS - 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS - 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS - 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> - 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> - 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> - 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS - 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> - 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS - 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> - 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS - 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS - 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> - 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> - 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> - 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS - 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> - 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> - 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS - 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS - 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS - 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS - 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> - 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS - 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS - 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS - 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS - 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS - 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS - 
1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> - 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS - 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> - 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> - 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> - 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> - 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> - 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> - 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS - 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> - 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> - 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> - 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> - 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> - 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> - 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> - 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> - 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> - 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> - 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> - 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> - 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> - 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> - 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> - 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> - 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> - 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> - 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> - 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> - 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> - 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> - 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> - 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> - 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> - 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> - 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> - 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> - 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> - 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> - 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> - 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> - 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS - 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> - 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS - 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> - 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> - 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> - 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> - 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> - 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> - 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> - 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> - 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> - 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2> - 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3> - 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> - 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> - 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6> - 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7> - 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> - 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> - 1573204895U, // 
<6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> - 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> - 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7> - 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> - 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7> - 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> - 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7> - 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7> - 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> - 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2> - 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, - 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS - 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, - 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, - 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, - 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS - 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, - 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, - 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS - 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> - 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS - 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> - 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, - 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> - 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> - 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7> - 1193130221U, // <6,u,0,7>: Cost 2 vrev - 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS - 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> - 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> - 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS - 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3> - 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS - 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> - 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7> - 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, - 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS - 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS - 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3> - 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> - 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1> - 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS - 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u> - 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> - 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, - 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1> - 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> - 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3> - 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3> - 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> - 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> - 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, - 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, - 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1> - 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> - 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> - 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3> - 1661245476U, // <6,u,4,2>: Cost 2 vext3 , - 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, - 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> - 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS - 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 1661614161U, // <6,u,4,7>: Cost 2 vext3 , - 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS - 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS - 
1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> - 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> - 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, - 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> - 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> - 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS - 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7> - 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS - 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS - 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2> - 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> - 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, - 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS - 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5> - 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS - 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1> - 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS - 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS - 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7> - 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7> - 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS - 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS - 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5> - 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7> - 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS - 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS - 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS - 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS - 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS - 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, - 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS - 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS - 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS - 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS - 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS - 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0> - 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1> - 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2> - 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0> - 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1> - 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6> - 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0> - 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7> - 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1> - 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS - 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5> - 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS - 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7> - 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS - 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7> - 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1> - 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0> - 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS - 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2> - 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5> - 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0> - 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1> - 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6> - 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> - 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7> - 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7> - 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2> - 2651605142U, // <7,0,3,0>: Cost 3 
vext2 <5,6,7,0>, <3,0,1,2> - 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0> - 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0> - 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3> - 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6> - 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0> - 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0> - 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7> - 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0> - 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4> - 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5> - 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6> - 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4> - 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6> - 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS - 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6> - 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5> - 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5> - 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS - 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3> - 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> - 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0> - 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6> - 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5> - 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0> - 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7> - 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0> - 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7> - 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> - 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7> - 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0> - 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7> - 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7> - 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6> - 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1> - 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7> - 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2> - 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS - 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS - 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7> - 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6> - 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7> - 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0> - 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7> - 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2> - 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2> - 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1> - 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS - 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u> - 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6> - 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS - 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u> - 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, - 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS - 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS - 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS - 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS - 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2> - 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS - 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, 
<1,0,5,1> - 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0> - 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1> - 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2> - 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1> - 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1> - 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6> - 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3> - 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5> - 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7> - 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1> - 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5> - 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3> - 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1> - 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3> - 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2> - 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0> - 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5> - 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3> - 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2> - 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0> - 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0> - 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0> - 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3> - 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0> - 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7> - 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5> - 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7> - 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7> - 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3> - 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7> - 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5> - 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5> - 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5> - 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5> - 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5> - 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS - 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1> - 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0> - 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS - 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS - 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7> - 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6> - 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7> - 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS - 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7> - 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1> - 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1> - 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7> - 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7> - 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7> - 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> - 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7> - 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7> - 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7> - 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7> - 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1> - 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7> - 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2> - 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1> - 
3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3> - 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS - 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS - 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7> - 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0> - 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7> - 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS - 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS - 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3> - 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0> - 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7> - 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS - 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7> - 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7> - 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS - 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7> - 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2> - 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2> - 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0> - 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0> - 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6> - 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7> - 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1> - 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2> - 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7> - 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3> - 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0> - 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0> - 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1> - 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> - 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0> - 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> - 2602718850U, // <7,2,1,7>: Cost 3 vext1 , <7,u,1,2> - 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1> - 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1> - 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> - 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> - 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5> - 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> - 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6> - 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5> - 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> - 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> - 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> - 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6> - 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7> - 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> - 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> - 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6> - 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0> - 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1> - 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6> - 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> - 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> - 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5> - 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> - 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> - 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0> - 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0> - 2699601733U, // <7,2,4,u>: 
Cost 3 vext3 <2,4,u,7>, <2,4,u,7> - 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> - 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3> - 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> - 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> - 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> - 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7> - 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7> - 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0> - 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7> - 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS - 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> - 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> - 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> - 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS - 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> - 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> - 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7> - 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> - 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1> - 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0> - 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5> - 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS - 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS - 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7> - 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6> - 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7> - 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS - 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> - 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> - 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> - 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> - 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> - 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7> - 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0> - 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> - 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> - 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> - 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> - 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2> - 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> - 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> - 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> - 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0> - 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> - 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> - 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> - 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3> - 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5> - 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> - 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> - 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1> - 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> - 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5> - 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> - 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> - 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> - 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0> - 2712062294U, // <7,3,2,4>: Cost 
3 vext3 RHS, <3,2,4,5> - 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> - 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> - 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> - 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0> - 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1> - 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> - 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> - 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> - 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> - 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> - 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3> - 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7> - 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> - 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> - 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> - 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4> - 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> - 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5> - 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> - 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6> - 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4> - 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> - 2602819686U, // <7,3,5,0>: Cost 3 vext1 , LHS - 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3> - 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> - 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> - 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5> - 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> - 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0> - 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0> - 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3> - 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7> - 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3> - 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> - 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> - 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> - 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7> - 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> - 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7> - 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> - 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> - 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> - 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6> - 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7> - 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> - 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7> - 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> - 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7> - 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1> - 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> - 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> - 1593153452U, // <7,3,u,2>: Cost 2 vext2 , - 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3> - 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> - 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> - 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3> - 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0> - 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> - 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0> - 
1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS - 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> - 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1> - 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5> - 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> - 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> - 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> - 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1> - 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> - 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> - 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3> - 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> - 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3> - 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0> - 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3> - 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3> - 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0> - 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1> - 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3> - 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> - 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1> - 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7> - 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> - 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0> - 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5> - 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0> - 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2> - 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5> - 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1> - 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> - 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6> - 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> - 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5> - 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7> - 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4> - 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2> - 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1> - 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4> - 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3> - 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> - 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> - 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> - 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7> - 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5> - 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS - 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7> - 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3> - 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5> - 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS - 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7> - 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS - 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7> - 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS - 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1> - 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5> - 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3> - 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4> - 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> - 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> - 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7> 
- 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> - 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> - 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> - 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2> - 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5> - 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6> - 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1> - 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> - 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1> - 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7> - 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7> - 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS - 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS - 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3> - 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u> - 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> - 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> - 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS - 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5> - 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS - 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS - 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS - 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS - 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0> - 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> - 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1> - 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1> - 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2> - 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS - 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> - 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1> - 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0> - 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7> - 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> - 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3> - 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1> - 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> - 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> - 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1> - 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0> - 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7> - 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4> - 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7> - 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> - 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3> - 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3> - 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4> - 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS - 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5> - 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5> - 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3> - 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS - 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5> - 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5> - 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0> - 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0> - 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS - 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7> - 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5> - 3646843491U, // <7,5,4,3>: Cost 4 vext1 
<3,7,5,4>, <3,5,7,4> - 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS - 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS - 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5> - 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> - 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> - 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> - 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3> - 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3> - 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3> - 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4> - 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> - 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> - 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> - 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> - 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1> - 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7> - 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2> - 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4> - 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5> - 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> - 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> - 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0> - 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0> - 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS - 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3> - 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2> - 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7> - 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS - 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> - 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> - 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1> - 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> - 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS - 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS - 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0> - 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u> - 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS - 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> - 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7> - 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0> - 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3> - 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> - 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS - 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4> - 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0> - 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> - 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0> - 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0> - 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2> - 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS - 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> - 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> - 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0> - 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3> - 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS - 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> - 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> - 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3> - 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> - 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> - 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, 
<6,2,1,3> - 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2> - 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1> - 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> - 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> - 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7> - 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> - 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> - 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> - 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3> - 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3> - 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> - 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> - 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7> - 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6> - 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0> - 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0> - 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> - 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> - 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5> - 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> - 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6> - 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS - 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0> - 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5> - 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS - 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS - 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> - 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> - 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4> - 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> - 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> - 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0> - 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7> - 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> - 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> - 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> - 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3> - 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4> - 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4> - 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> - 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> - 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> - 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> - 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> - 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> - 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2> - 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> - 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> - 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> - 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6> - 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7> - 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> - 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> - 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS - 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2> - 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0> - 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> - 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS - 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6> - 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> - 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> - 
2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0> - 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> - 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0> - 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0> - 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> - 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> - 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0> - 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2> - 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> - 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> - 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1> - 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3> - 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5> - 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS - 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> - 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> - 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1> - 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3> - 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5> - 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0> - 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2> - 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0> - 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5> - 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3> - 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> - 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3> - 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3> - 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> - 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3> - 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6> - 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7> - 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> - 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3> - 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> - 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7> - 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7> - 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> - 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7> - 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3> - 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4> - 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4> - 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> - 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4> - 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6> - 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> - 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS - 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> - 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3> - 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> - 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> - 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> - 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7> - 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5> - 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7> - 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS - 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> - 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> - 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7> - 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS - 2656973553U, // 
<7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7> - 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7> - 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0> - 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> - 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS - 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2> - 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2> - 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2> - 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS - 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7> - 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7> - 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS - 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS - 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS - 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> - 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3> - 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0> - 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS - 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> - 1595840756U, // <7,7,u,6>: Cost 2 vext2 , - 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS - 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS - 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> - 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, - 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, - 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, - 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, - 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, - 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, - 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS - 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, - 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS - 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1> - 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS - 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, - 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS - 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, - 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> - 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, - 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS - 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2> - 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, - 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2> - 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, - 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6> - 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, - 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, - 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, - 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, - 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, - 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3> - 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, - 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3> - 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, - 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, - 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, - 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, - 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, - 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, - 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, - 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, - 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, - 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> - 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, - 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, - 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, - 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, - 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, 
LHS - 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> - 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, - 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, - 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> - 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> - 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS - 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, - 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS - 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, - 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, - 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> - 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, - 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6> - 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, - 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> - 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, - 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, - 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, - 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, - 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, - 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7> - 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, - 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7> - 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, - 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS - 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS - 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, - 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, - 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS - 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, - 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, - 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, - 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS - 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS - 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS - 135053414U, // : Cost 1 vdup0 LHS - 1611489290U, // : Cost 2 vext3 LHS, <0,0,1,1> - 1611489300U, // : Cost 2 vext3 LHS, <0,0,2,2> - 2568054923U, // : Cost 3 vext1 <3,0,0,0>, <3,0,0,0> - 1481706806U, // : Cost 2 vext1 <0,u,0,0>, RHS - 2555449040U, // : Cost 3 vext1 <0,u,0,0>, <5,1,7,3> - 2591282078U, // : Cost 3 vext1 <6,u,0,0>, <6,u,0,0> - 2591945711U, // : Cost 3 vext1 <7,0,0,0>, <7,0,0,0> - 135053414U, // : Cost 1 vdup0 LHS - 1493655654U, // : Cost 2 vext1 <2,u,0,1>, LHS - 1860550758U, // : Cost 2 vzipl LHS, LHS - 537747563U, // : Cost 1 vext3 LHS, LHS - 2625135576U, // : Cost 3 vext2 <1,2,u,0>, <1,3,1,3> - 1493658934U, // : Cost 2 vext1 <2,u,0,1>, RHS - 2625135760U, // : Cost 3 vext2 <1,2,u,0>, <1,5,3,7> - 1517548447U, // : Cost 2 vext1 <6,u,0,1>, <6,u,0,1> - 2591290362U, // : Cost 3 vext1 <6,u,0,1>, <7,0,1,2> - 537747612U, // : Cost 1 vext3 LHS, LHS - 1611489444U, // : Cost 2 vext3 LHS, <0,2,0,2> - 2685231276U, // : Cost 3 vext3 LHS, <0,2,1,1> - 1994768486U, // : Cost 2 vtrnl LHS, LHS - 2685231294U, // : Cost 3 vext3 LHS, <0,2,3,1> - 1611489484U, // : Cost 2 vext3 LHS, <0,2,4,6> - 2712068310U, // : Cost 3 vext3 RHS, <0,2,5,7> - 2625136570U, // : Cost 3 vext2 <1,2,u,0>, <2,6,3,7> - 2591962097U, // : Cost 3 vext1 <7,0,0,2>, <7,0,0,2> - 1611489516U, // : Cost 2 vext3 LHS, <0,2,u,2> - 2954067968U, // : Cost 3 vzipr LHS, <0,0,0,0> - 2685231356U, // : Cost 3 vext3 LHS, <0,3,1,0> - 72589981U, // : Cost 1 vrev LHS - 2625137052U, // : Cost 3 vext2 <1,2,u,0>, <3,3,3,3> - 2625137154U, // : Cost 3 vext2 <1,2,u,0>, <3,4,5,6> - 2639071848U, // : Cost 3 vext2 <3,5,u,0>, <3,5,u,0> - 2639735481U, // : Cost 3 vext2 <3,6,u,0>, <3,6,u,0> - 2597279354U, // : Cost 3 vext1 <7,u,0,3>, <7,u,0,3> - 73032403U, // : Cost 1 vrev LHS - 2687074636U, // : 
Cost 3 vext3 <0,4,0,u>, <0,4,0,u> - 1611489618U, // : Cost 2 vext3 LHS, <0,4,1,5> - 1611489628U, // : Cost 2 vext3 LHS, <0,4,2,6> - 3629222038U, // : Cost 4 vext1 <0,u,0,4>, <3,0,1,2> - 2555481398U, // : Cost 3 vext1 <0,u,0,4>, RHS - 1551396150U, // : Cost 2 vext2 <1,2,u,0>, RHS - 2651680116U, // : Cost 3 vext2 <5,6,u,0>, <4,6,4,6> - 2646150600U, // : Cost 3 vext2 <4,7,5,0>, <4,7,5,0> - 1611932050U, // : Cost 2 vext3 LHS, <0,4,u,6> - 2561458278U, // : Cost 3 vext1 <1,u,0,5>, LHS - 1863532646U, // : Cost 2 vzipl RHS, LHS - 2712068526U, // : Cost 3 vext3 RHS, <0,5,2,7> - 2649689976U, // : Cost 3 vext2 <5,3,u,0>, <5,3,u,0> - 2220237489U, // : Cost 3 vrev <0,u,4,5> - 2651680772U, // : Cost 3 vext2 <5,6,u,0>, <5,5,5,5> - 1577939051U, // : Cost 2 vext2 <5,6,u,0>, <5,6,u,0> - 2830077238U, // : Cost 3 vuzpr <1,u,3,0>, RHS - 1579266317U, // : Cost 2 vext2 <5,u,u,0>, <5,u,u,0> - 2555494502U, // : Cost 3 vext1 <0,u,0,6>, LHS - 2712068598U, // : Cost 3 vext3 RHS, <0,6,1,7> - 1997750374U, // : Cost 2 vtrnl RHS, LHS - 2655662673U, // : Cost 3 vext2 <6,3,u,0>, <6,3,u,0> - 2555497782U, // : Cost 3 vext1 <0,u,0,6>, RHS - 2651681459U, // : Cost 3 vext2 <5,6,u,0>, <6,5,0,u> - 2651681592U, // : Cost 3 vext2 <5,6,u,0>, <6,6,6,6> - 2651681614U, // : Cost 3 vext2 <5,6,u,0>, <6,7,0,1> - 1997750428U, // : Cost 2 vtrnl RHS, LHS - 2567446630U, // : Cost 3 vext1 <2,u,0,7>, LHS - 2567447446U, // : Cost 3 vext1 <2,u,0,7>, <1,2,3,0> - 2567448641U, // : Cost 3 vext1 <2,u,0,7>, <2,u,0,7> - 2573421338U, // : Cost 3 vext1 <3,u,0,7>, <3,u,0,7> - 2567449910U, // : Cost 3 vext1 <2,u,0,7>, RHS - 2651682242U, // : Cost 3 vext2 <5,6,u,0>, <7,5,6,u> - 2591339429U, // : Cost 3 vext1 <6,u,0,7>, <6,u,0,7> - 2651682412U, // : Cost 3 vext2 <5,6,u,0>, <7,7,7,7> - 2567452462U, // : Cost 3 vext1 <2,u,0,7>, LHS - 135053414U, // : Cost 1 vdup0 LHS - 1611489938U, // : Cost 2 vext3 LHS, <0,u,1,1> - 537748125U, // : Cost 1 vext3 LHS, LHS - 2685674148U, // : Cost 3 vext3 LHS, <0,u,3,1> - 1611932338U, // : Cost 2 vext3 LHS, <0,u,4,6> - 1551399066U, // : Cost 2 vext2 <1,2,u,0>, RHS - 1517605798U, // : Cost 2 vext1 <6,u,0,u>, <6,u,0,u> - 2830077481U, // : Cost 3 vuzpr <1,u,3,0>, RHS - 537748179U, // : Cost 1 vext3 LHS, LHS - 1544101961U, // : Cost 2 vext2 <0,0,u,1>, <0,0,u,1> - 1558036582U, // : Cost 2 vext2 <2,3,u,1>, LHS - 2619171051U, // : Cost 3 vext2 <0,2,u,1>, <0,2,u,1> - 1611490038U, // : Cost 2 vext3 LHS, <1,0,3,2> - 2555522358U, // : Cost 3 vext1 <0,u,1,0>, RHS - 2712068871U, // : Cost 3 vext3 RHS, <1,0,5,1> - 2591355815U, // : Cost 3 vext1 <6,u,1,0>, <6,u,1,0> - 2597328512U, // : Cost 3 vext1 <7,u,1,0>, <7,u,1,0> - 1611490083U, // : Cost 2 vext3 LHS, <1,0,u,2> - 1481785446U, // : Cost 2 vext1 <0,u,1,1>, LHS - 202162278U, // : Cost 1 vdup1 LHS - 2555528808U, // : Cost 3 vext1 <0,u,1,1>, <2,2,2,2> - 1611490120U, // : Cost 2 vext3 LHS, <1,1,3,3> - 1481788726U, // : Cost 2 vext1 <0,u,1,1>, RHS - 2689876828U, // : Cost 3 vext3 LHS, <1,1,5,5> - 2591364008U, // : Cost 3 vext1 <6,u,1,1>, <6,u,1,1> - 2592691274U, // : Cost 3 vext1 <7,1,1,1>, <7,1,1,1> - 202162278U, // : Cost 1 vdup1 LHS - 1499709542U, // : Cost 2 vext1 <3,u,1,2>, LHS - 2689876871U, // : Cost 3 vext3 LHS, <1,2,1,3> - 2631116445U, // : Cost 3 vext2 <2,2,u,1>, <2,2,u,1> - 835584U, // : Cost 0 copy LHS - 1499712822U, // : Cost 2 vext1 <3,u,1,2>, RHS - 2689876907U, // : Cost 3 vext3 LHS, <1,2,5,3> - 2631780282U, // : Cost 3 vext2 <2,3,u,1>, <2,6,3,7> - 1523603074U, // : Cost 2 vext1 <7,u,1,2>, <7,u,1,2> - 835584U, // : Cost 0 copy LHS - 1487773798U, // : Cost 2 vext1 <1,u,1,3>, LHS - 
1611490264U, // : Cost 2 vext3 LHS, <1,3,1,3> - 2685232094U, // : Cost 3 vext3 LHS, <1,3,2,0> - 2018746470U, // : Cost 2 vtrnr LHS, LHS - 1487777078U, // : Cost 2 vext1 <1,u,1,3>, RHS - 1611490304U, // : Cost 2 vext3 LHS, <1,3,5,7> - 2685674505U, // : Cost 3 vext3 LHS, <1,3,6,7> - 2640407307U, // : Cost 3 vext2 <3,7,u,1>, <3,7,u,1> - 1611490327U, // : Cost 2 vext3 LHS, <1,3,u,3> - 1567992749U, // : Cost 2 vext2 <4,0,u,1>, <4,0,u,1> - 2693121070U, // : Cost 3 vext3 <1,4,1,u>, <1,4,1,u> - 2693194807U, // : Cost 3 vext3 <1,4,2,u>, <1,4,2,u> - 1152386432U, // : Cost 2 vrev <1,u,3,4> - 2555555126U, // : Cost 3 vext1 <0,u,1,4>, RHS - 1558039862U, // : Cost 2 vext2 <2,3,u,1>, RHS - 2645716371U, // : Cost 3 vext2 <4,6,u,1>, <4,6,u,1> - 2597361284U, // : Cost 3 vext1 <7,u,1,4>, <7,u,1,4> - 1152755117U, // : Cost 2 vrev <1,u,u,4> - 1481818214U, // : Cost 2 vext1 <0,u,1,5>, LHS - 2555560694U, // : Cost 3 vext1 <0,u,1,5>, <1,0,3,2> - 2555561576U, // : Cost 3 vext1 <0,u,1,5>, <2,2,2,2> - 1611490448U, // : Cost 2 vext3 LHS, <1,5,3,7> - 1481821494U, // : Cost 2 vext1 <0,u,1,5>, RHS - 2651025435U, // : Cost 3 vext2 <5,5,u,1>, <5,5,u,1> - 2651689068U, // : Cost 3 vext2 <5,6,u,1>, <5,6,u,1> - 2823966006U, // : Cost 3 vuzpr <0,u,1,1>, RHS - 1611932861U, // : Cost 2 vext3 LHS, <1,5,u,7> - 2555568230U, // : Cost 3 vext1 <0,u,1,6>, LHS - 2689877199U, // : Cost 3 vext3 LHS, <1,6,1,7> - 2712069336U, // : Cost 3 vext3 RHS, <1,6,2,7> - 2685232353U, // : Cost 3 vext3 LHS, <1,6,3,7> - 2555571510U, // : Cost 3 vext1 <0,u,1,6>, RHS - 2689877235U, // : Cost 3 vext3 LHS, <1,6,5,7> - 2657661765U, // : Cost 3 vext2 <6,6,u,1>, <6,6,u,1> - 1584583574U, // : Cost 2 vext2 <6,7,u,1>, <6,7,u,1> - 1585247207U, // : Cost 2 vext2 <6,u,u,1>, <6,u,u,1> - 2561548390U, // : Cost 3 vext1 <1,u,1,7>, LHS - 2561549681U, // : Cost 3 vext1 <1,u,1,7>, <1,u,1,7> - 2573493926U, // : Cost 3 vext1 <3,u,1,7>, <2,3,0,1> - 2042962022U, // : Cost 2 vtrnr RHS, LHS - 2561551670U, // : Cost 3 vext1 <1,u,1,7>, RHS - 2226300309U, // : Cost 3 vrev <1,u,5,7> - 2658325990U, // : Cost 3 vext2 <6,7,u,1>, <7,6,1,u> - 2658326124U, // : Cost 3 vext2 <6,7,u,1>, <7,7,7,7> - 2042962027U, // : Cost 2 vtrnr RHS, LHS - 1481842790U, // : Cost 2 vext1 <0,u,1,u>, LHS - 202162278U, // : Cost 1 vdup1 LHS - 2685674867U, // : Cost 3 vext3 LHS, <1,u,2,0> - 835584U, // : Cost 0 copy LHS - 1481846070U, // : Cost 2 vext1 <0,u,1,u>, RHS - 1611933077U, // : Cost 2 vext3 LHS, <1,u,5,7> - 2685674910U, // : Cost 3 vext3 LHS, <1,u,6,7> - 1523652232U, // : Cost 2 vext1 <7,u,1,u>, <7,u,1,u> - 835584U, // : Cost 0 copy LHS - 1544110154U, // : Cost 2 vext2 <0,0,u,2>, <0,0,u,2> - 1545437286U, // : Cost 2 vext2 <0,2,u,2>, LHS - 1545437420U, // : Cost 2 vext2 <0,2,u,2>, <0,2,u,2> - 2685232589U, // : Cost 3 vext3 LHS, <2,0,3,0> - 2619179346U, // : Cost 3 vext2 <0,2,u,2>, <0,4,1,5> - 2712069606U, // : Cost 3 vext3 RHS, <2,0,5,7> - 2689877484U, // : Cost 3 vext3 LHS, <2,0,6,4> - 2659656273U, // : Cost 3 vext2 <7,0,u,2>, <0,7,2,u> - 1545437853U, // : Cost 2 vext2 <0,2,u,2>, LHS - 1550082851U, // : Cost 2 vext2 <1,0,u,2>, <1,0,u,2> - 2619179828U, // : Cost 3 vext2 <0,2,u,2>, <1,1,1,1> - 2619179926U, // : Cost 3 vext2 <0,2,u,2>, <1,2,3,0> - 2685232671U, // : Cost 3 vext3 LHS, <2,1,3,1> - 2555604278U, // : Cost 3 vext1 <0,u,2,1>, RHS - 2619180176U, // : Cost 3 vext2 <0,2,u,2>, <1,5,3,7> - 2689877564U, // : Cost 3 vext3 LHS, <2,1,6,3> - 2602718850U, // : Cost 3 vext1 , <7,u,1,2> - 1158703235U, // : Cost 2 vrev <2,u,u,1> - 1481867366U, // : Cost 2 vext1 <0,u,2,2>, LHS - 2555609846U, // : Cost 3 vext1 
<0,u,2,2>, <1,0,3,2> - 269271142U, // : Cost 1 vdup2 LHS - 1611490930U, // : Cost 2 vext3 LHS, <2,2,3,3> - 1481870646U, // : Cost 2 vext1 <0,u,2,2>, RHS - 2689877640U, // : Cost 3 vext3 LHS, <2,2,5,7> - 2619180986U, // : Cost 3 vext2 <0,2,u,2>, <2,6,3,7> - 2593436837U, // : Cost 3 vext1 <7,2,2,2>, <7,2,2,2> - 269271142U, // : Cost 1 vdup2 LHS - 408134301U, // : Cost 1 vext1 LHS, LHS - 1481876214U, // : Cost 2 vext1 LHS, <1,0,3,2> - 1481877096U, // : Cost 2 vext1 LHS, <2,2,2,2> - 1880326246U, // : Cost 2 vzipr LHS, LHS - 408137014U, // : Cost 1 vext1 LHS, RHS - 1529654992U, // : Cost 2 vext1 LHS, <5,1,7,3> - 1529655802U, // : Cost 2 vext1 LHS, <6,2,7,3> - 1529656314U, // : Cost 2 vext1 LHS, <7,0,1,2> - 408139566U, // : Cost 1 vext1 LHS, LHS - 1567853468U, // : Cost 2 vext2 <4,0,6,2>, <4,0,6,2> - 2561598362U, // : Cost 3 vext1 <1,u,2,4>, <1,2,3,4> - 2555627214U, // : Cost 3 vext1 <0,u,2,4>, <2,3,4,5> - 2685232918U, // : Cost 3 vext3 LHS, <2,4,3,5> - 2555628854U, // : Cost 3 vext1 <0,u,2,4>, RHS - 1545440566U, // : Cost 2 vext2 <0,2,u,2>, RHS - 1571982740U, // : Cost 2 vext2 <4,6,u,2>, <4,6,u,2> - 2592125957U, // : Cost 3 vext1 <7,0,2,4>, <7,0,2,4> - 1545440809U, // : Cost 2 vext2 <0,2,u,2>, RHS - 2555633766U, // : Cost 3 vext1 <0,u,2,5>, LHS - 2561606550U, // : Cost 3 vext1 <1,u,2,5>, <1,2,3,0> - 2689877856U, // : Cost 3 vext3 LHS, <2,5,2,7> - 2685233000U, // : Cost 3 vext3 LHS, <2,5,3,6> - 1158441059U, // : Cost 2 vrev <2,u,4,5> - 2645725188U, // : Cost 3 vext2 <4,6,u,2>, <5,5,5,5> - 2689877892U, // : Cost 3 vext3 LHS, <2,5,6,7> - 2823900470U, // : Cost 3 vuzpr <0,u,0,2>, RHS - 1158736007U, // : Cost 2 vrev <2,u,u,5> - 1481900134U, // : Cost 2 vext1 <0,u,2,6>, LHS - 2555642614U, // : Cost 3 vext1 <0,u,2,6>, <1,0,3,2> - 2555643496U, // : Cost 3 vext1 <0,u,2,6>, <2,2,2,2> - 1611491258U, // : Cost 2 vext3 LHS, <2,6,3,7> - 1481903414U, // : Cost 2 vext1 <0,u,2,6>, RHS - 2689877964U, // : Cost 3 vext3 LHS, <2,6,5,7> - 2689877973U, // : Cost 3 vext3 LHS, <2,6,6,7> - 2645726030U, // : Cost 3 vext2 <4,6,u,2>, <6,7,0,1> - 1611933671U, // : Cost 2 vext3 LHS, <2,6,u,7> - 1585919033U, // : Cost 2 vext2 <7,0,u,2>, <7,0,u,2> - 2573566710U, // : Cost 3 vext1 <3,u,2,7>, <1,0,3,2> - 2567596115U, // : Cost 3 vext1 <2,u,2,7>, <2,u,2,7> - 1906901094U, // : Cost 2 vzipr RHS, LHS - 2555653430U, // : Cost 3 vext1 <0,u,2,7>, RHS - 2800080230U, // : Cost 3 vuzpl LHS, <7,4,5,6> - 2980643164U, // : Cost 3 vzipr RHS, <0,4,2,6> - 2645726828U, // : Cost 3 vext2 <4,6,u,2>, <7,7,7,7> - 1906901099U, // : Cost 2 vzipr RHS, LHS - 408175266U, // : Cost 1 vext1 LHS, LHS - 1545443118U, // : Cost 2 vext2 <0,2,u,2>, LHS - 269271142U, // : Cost 1 vdup2 LHS - 1611491416U, // : Cost 2 vext3 LHS, <2,u,3,3> - 408177974U, // : Cost 1 vext1 LHS, RHS - 1545443482U, // : Cost 2 vext2 <0,2,u,2>, RHS - 1726339226U, // : Cost 2 vuzpl LHS, RHS - 1529697274U, // : Cost 2 vext1 LHS, <7,0,1,2> - 408180526U, // : Cost 1 vext1 LHS, LHS - 1544781824U, // : Cost 2 vext2 LHS, <0,0,0,0> - 471040156U, // : Cost 1 vext2 LHS, LHS - 1544781988U, // : Cost 2 vext2 LHS, <0,2,0,2> - 2618523900U, // : Cost 3 vext2 LHS, <0,3,1,0> - 1544782162U, // : Cost 2 vext2 LHS, <0,4,1,5> - 2238188352U, // : Cost 3 vrev <3,u,5,0> - 2623169023U, // : Cost 3 vext2 LHS, <0,6,2,7> - 2238335826U, // : Cost 3 vrev <3,u,7,0> - 471040669U, // : Cost 1 vext2 LHS, LHS - 1544782582U, // : Cost 2 vext2 LHS, <1,0,3,2> - 1544782644U, // : Cost 2 vext2 LHS, <1,1,1,1> - 1544782742U, // : Cost 2 vext2 LHS, <1,2,3,0> - 1544782808U, // : Cost 2 vext2 LHS, <1,3,1,3> - 2618524733U, // : Cost 
3 vext2 LHS, <1,4,3,5> - 1544782992U, // : Cost 2 vext2 LHS, <1,5,3,7> - 2618524897U, // : Cost 3 vext2 LHS, <1,6,3,7> - 2703517987U, // : Cost 3 vext3 <3,1,7,u>, <3,1,7,u> - 1544783213U, // : Cost 2 vext2 LHS, <1,u,1,3> - 1529716838U, // : Cost 2 vext1 , LHS - 1164167966U, // : Cost 2 vrev <3,u,1,2> - 1544783464U, // : Cost 2 vext2 LHS, <2,2,2,2> - 1544783526U, // : Cost 2 vext2 LHS, <2,3,0,1> - 1529720118U, // : Cost 2 vext1 , RHS - 2618525544U, // : Cost 3 vext2 LHS, <2,5,3,6> - 1544783802U, // : Cost 2 vext2 LHS, <2,6,3,7> - 2704181620U, // : Cost 3 vext3 <3,2,7,u>, <3,2,7,u> - 1544783931U, // : Cost 2 vext2 LHS, <2,u,0,1> - 1544784022U, // : Cost 2 vext2 LHS, <3,0,1,2> - 1487922559U, // : Cost 2 vext1 <1,u,3,3>, <1,u,3,3> - 1493895256U, // : Cost 2 vext1 <2,u,3,3>, <2,u,3,3> - 336380006U, // : Cost 1 vdup3 LHS - 1544784386U, // : Cost 2 vext2 LHS, <3,4,5,6> - 2824054478U, // : Cost 3 vuzpr LHS, <2,3,4,5> - 2238286668U, // : Cost 3 vrev <3,u,6,3> - 2954069136U, // : Cost 3 vzipr LHS, <1,5,3,7> - 336380006U, // : Cost 1 vdup3 LHS - 1487929446U, // : Cost 2 vext1 <1,u,3,4>, LHS - 1487930752U, // : Cost 2 vext1 <1,u,3,4>, <1,u,3,4> - 2623171644U, // : Cost 3 vext2 LHS, <4,2,6,0> - 2561673366U, // : Cost 3 vext1 <1,u,3,4>, <3,0,1,2> - 1487932726U, // : Cost 2 vext1 <1,u,3,4>, RHS - 471043382U, // : Cost 1 vext2 LHS, RHS - 1592561012U, // : Cost 2 vext2 LHS, <4,6,4,6> - 2238368598U, // : Cost 3 vrev <3,u,7,4> - 471043625U, // : Cost 1 vext2 LHS, RHS - 2555707494U, // : Cost 3 vext1 <0,u,3,5>, LHS - 1574645465U, // : Cost 2 vext2 <5,1,u,3>, <5,1,u,3> - 2567653106U, // : Cost 3 vext1 <2,u,3,5>, <2,3,u,5> - 2555709954U, // : Cost 3 vext1 <0,u,3,5>, <3,4,5,6> - 1592561606U, // : Cost 2 vext2 LHS, <5,4,7,6> - 1592561668U, // : Cost 2 vext2 LHS, <5,5,5,5> - 1592561762U, // : Cost 2 vext2 LHS, <5,6,7,0> - 1750314294U, // : Cost 2 vuzpr LHS, RHS - 1750314295U, // : Cost 2 vuzpr LHS, RHS - 2623172897U, // : Cost 3 vext2 LHS, <6,0,1,2> - 2561688962U, // : Cost 3 vext1 <1,u,3,6>, <1,u,3,6> - 1581281795U, // : Cost 2 vext2 <6,2,u,3>, <6,2,u,3> - 2706541204U, // : Cost 3 vext3 <3,6,3,u>, <3,6,3,u> - 2623173261U, // : Cost 3 vext2 LHS, <6,4,5,6> - 1164495686U, // : Cost 2 vrev <3,u,5,6> - 1592562488U, // : Cost 2 vext2 LHS, <6,6,6,6> - 1592562510U, // : Cost 2 vext2 LHS, <6,7,0,1> - 1164716897U, // : Cost 2 vrev <3,u,u,6> - 1487954022U, // : Cost 2 vext1 <1,u,3,7>, LHS - 1487955331U, // : Cost 2 vext1 <1,u,3,7>, <1,u,3,7> - 1493928028U, // : Cost 2 vext1 <2,u,3,7>, <2,u,3,7> - 2561697942U, // : Cost 3 vext1 <1,u,3,7>, <3,0,1,2> - 1487957302U, // : Cost 2 vext1 <1,u,3,7>, RHS - 2707352311U, // : Cost 3 vext3 <3,7,5,u>, <3,7,5,u> - 2655024623U, // : Cost 3 vext2 <6,2,u,3>, <7,6,2,u> - 1592563308U, // : Cost 2 vext2 LHS, <7,7,7,7> - 1487959854U, // : Cost 2 vext1 <1,u,3,7>, LHS - 1544787667U, // : Cost 2 vext2 LHS, - 471045934U, // : Cost 1 vext2 LHS, LHS - 1549432709U, // : Cost 2 vext2 LHS, - 336380006U, // : Cost 1 vdup3 LHS - 1544788031U, // : Cost 2 vext2 LHS, - 471046298U, // : Cost 1 vext2 LHS, RHS - 1549433040U, // : Cost 2 vext2 LHS, - 1750314537U, // : Cost 2 vuzpr LHS, RHS - 471046501U, // : Cost 1 vext2 LHS, LHS - 2625167360U, // : Cost 3 vext2 <1,2,u,4>, <0,0,0,0> - 1551425638U, // : Cost 2 vext2 <1,2,u,4>, LHS - 2619195630U, // : Cost 3 vext2 <0,2,u,4>, <0,2,u,4> - 2619343104U, // : Cost 3 vext2 <0,3,1,4>, <0,3,1,4> - 2625167698U, // : Cost 3 vext2 <1,2,u,4>, <0,4,1,5> - 1638329234U, // : Cost 2 vext3 RHS, <4,0,5,1> - 1638329244U, // : Cost 2 vext3 RHS, <4,0,6,2> - 3787803556U, // : Cost 4 
vext3 RHS, <4,0,7,1> - 1551426205U, // : Cost 2 vext2 <1,2,u,4>, LHS - 2555748454U, // : Cost 3 vext1 <0,u,4,1>, LHS - 2625168180U, // : Cost 3 vext2 <1,2,u,4>, <1,1,1,1> - 1551426503U, // : Cost 2 vext2 <1,2,u,4>, <1,2,u,4> - 2625168344U, // : Cost 3 vext2 <1,2,u,4>, <1,3,1,3> - 2555751734U, // : Cost 3 vext1 <0,u,4,1>, RHS - 1860554038U, // : Cost 2 vzipl LHS, RHS - 2689879022U, // : Cost 3 vext3 LHS, <4,1,6,3> - 2592248852U, // : Cost 3 vext1 <7,0,4,1>, <7,0,4,1> - 1555408301U, // : Cost 2 vext2 <1,u,u,4>, <1,u,u,4> - 2555756646U, // : Cost 3 vext1 <0,u,4,2>, LHS - 2625168943U, // : Cost 3 vext2 <1,2,u,4>, <2,1,4,u> - 2625169000U, // : Cost 3 vext2 <1,2,u,4>, <2,2,2,2> - 2619197134U, // : Cost 3 vext2 <0,2,u,4>, <2,3,4,5> - 2555759926U, // : Cost 3 vext1 <0,u,4,2>, RHS - 2712071222U, // : Cost 3 vext3 RHS, <4,2,5,3> - 1994771766U, // : Cost 2 vtrnl LHS, RHS - 2592257045U, // : Cost 3 vext1 <7,0,4,2>, <7,0,4,2> - 1994771784U, // : Cost 2 vtrnl LHS, RHS - 2625169558U, // : Cost 3 vext2 <1,2,u,4>, <3,0,1,2> - 2567709594U, // : Cost 3 vext1 <2,u,4,3>, <1,2,3,4> - 2567710817U, // : Cost 3 vext1 <2,u,4,3>, <2,u,4,3> - 2625169820U, // : Cost 3 vext2 <1,2,u,4>, <3,3,3,3> - 2625169922U, // : Cost 3 vext2 <1,2,u,4>, <3,4,5,6> - 2954069710U, // : Cost 3 vzipr LHS, <2,3,4,5> - 2954068172U, // : Cost 3 vzipr LHS, <0,2,4,6> - 3903849472U, // : Cost 4 vuzpr <1,u,3,4>, <1,3,5,7> - 2954068174U, // : Cost 3 vzipr LHS, <0,2,4,u> - 1505919078U, // : Cost 2 vext1 <4,u,4,4>, LHS - 2567717831U, // : Cost 3 vext1 <2,u,4,4>, <1,2,u,4> - 2567719010U, // : Cost 3 vext1 <2,u,4,4>, <2,u,4,4> - 2570373542U, // : Cost 3 vext1 <3,3,4,4>, <3,3,4,4> - 161926454U, // : Cost 1 vdup0 RHS - 1551428918U, // : Cost 2 vext2 <1,2,u,4>, RHS - 1638329572U, // : Cost 2 vext3 RHS, <4,4,6,6> - 2594927963U, // : Cost 3 vext1 <7,4,4,4>, <7,4,4,4> - 161926454U, // : Cost 1 vdup0 RHS - 1493983334U, // : Cost 2 vext1 <2,u,4,5>, LHS - 2689879301U, // : Cost 3 vext3 LHS, <4,5,1,3> - 1493985379U, // : Cost 2 vext1 <2,u,4,5>, <2,u,4,5> - 2567727254U, // : Cost 3 vext1 <2,u,4,5>, <3,0,1,2> - 1493986614U, // : Cost 2 vext1 <2,u,4,5>, RHS - 1863535926U, // : Cost 2 vzipl RHS, RHS - 537750838U, // : Cost 1 vext3 LHS, RHS - 2830110006U, // : Cost 3 vuzpr <1,u,3,4>, RHS - 537750856U, // : Cost 1 vext3 LHS, RHS - 1482047590U, // : Cost 2 vext1 <0,u,4,6>, LHS - 2555790070U, // : Cost 3 vext1 <0,u,4,6>, <1,0,3,2> - 2555790952U, // : Cost 3 vext1 <0,u,4,6>, <2,2,2,2> - 2555791510U, // : Cost 3 vext1 <0,u,4,6>, <3,0,1,2> - 1482050870U, // : Cost 2 vext1 <0,u,4,6>, RHS - 2689879422U, // : Cost 3 vext3 LHS, <4,6,5,7> - 1997753654U, // : Cost 2 vtrnl RHS, RHS - 2712071562U, // : Cost 3 vext3 RHS, <4,6,7,1> - 1482053422U, // : Cost 2 vext1 <0,u,4,6>, LHS - 2567741542U, // : Cost 3 vext1 <2,u,4,7>, LHS - 2567742362U, // : Cost 3 vext1 <2,u,4,7>, <1,2,3,4> - 2567743589U, // : Cost 3 vext1 <2,u,4,7>, <2,u,4,7> - 2573716286U, // : Cost 3 vext1 <3,u,4,7>, <3,u,4,7> - 2567744822U, // : Cost 3 vext1 <2,u,4,7>, RHS - 2712071624U, // : Cost 3 vext3 RHS, <4,7,5,0> - 96808489U, // : Cost 1 vrev RHS - 2651715180U, // : Cost 3 vext2 <5,6,u,4>, <7,7,7,7> - 96955963U, // : Cost 1 vrev RHS - 1482063974U, // : Cost 2 vext1 <0,u,4,u>, LHS - 1551431470U, // : Cost 2 vext2 <1,2,u,4>, LHS - 1494009958U, // : Cost 2 vext1 <2,u,4,u>, <2,u,4,u> - 2555807894U, // : Cost 3 vext1 <0,u,4,u>, <3,0,1,2> - 161926454U, // : Cost 1 vdup0 RHS - 1551431834U, // : Cost 2 vext2 <1,2,u,4>, RHS - 537751081U, // : Cost 1 vext3 LHS, RHS - 2830110249U, // : Cost 3 vuzpr <1,u,3,4>, RHS - 
537751099U, // : Cost 1 vext3 LHS, RHS - 2631811072U, // : Cost 3 vext2 <2,3,u,5>, <0,0,0,0> - 1558069350U, // : Cost 2 vext2 <2,3,u,5>, LHS - 2619203823U, // : Cost 3 vext2 <0,2,u,5>, <0,2,u,5> - 2619867456U, // : Cost 3 vext2 <0,3,u,5>, <0,3,u,5> - 1546273106U, // : Cost 2 vext2 <0,4,1,5>, <0,4,1,5> - 2733010539U, // : Cost 3 vext3 LHS, <5,0,5,1> - 2597622682U, // : Cost 3 vext1 <7,u,5,0>, <6,7,u,5> - 1176539396U, // : Cost 2 vrev <5,u,7,0> - 1558069917U, // : Cost 2 vext2 <2,3,u,5>, LHS - 1505968230U, // : Cost 2 vext1 <4,u,5,1>, LHS - 2624512887U, // : Cost 3 vext2 <1,1,u,5>, <1,1,u,5> - 2631811990U, // : Cost 3 vext2 <2,3,u,5>, <1,2,3,0> - 2618541056U, // : Cost 3 vext2 <0,1,u,5>, <1,3,5,7> - 1505971510U, // : Cost 2 vext1 <4,u,5,1>, RHS - 2627167419U, // : Cost 3 vext2 <1,5,u,5>, <1,5,u,5> - 2579714554U, // : Cost 3 vext1 <4,u,5,1>, <6,2,7,3> - 1638330064U, // : Cost 2 vext3 RHS, <5,1,7,3> - 1638477529U, // : Cost 2 vext3 RHS, <5,1,u,3> - 2561802342U, // : Cost 3 vext1 <1,u,5,2>, LHS - 2561803264U, // : Cost 3 vext1 <1,u,5,2>, <1,3,5,7> - 2631149217U, // : Cost 3 vext2 <2,2,u,5>, <2,2,u,5> - 1558071026U, // : Cost 2 vext2 <2,3,u,5>, <2,3,u,5> - 2561805622U, // : Cost 3 vext1 <1,u,5,2>, RHS - 2714062607U, // : Cost 3 vext3 RHS, <5,2,5,3> - 2631813050U, // : Cost 3 vext2 <2,3,u,5>, <2,6,3,7> - 3092335926U, // : Cost 3 vtrnr <0,u,0,2>, RHS - 1561389191U, // : Cost 2 vext2 <2,u,u,5>, <2,u,u,5> - 2561810534U, // : Cost 3 vext1 <1,u,5,3>, LHS - 2561811857U, // : Cost 3 vext1 <1,u,5,3>, <1,u,5,3> - 2631813474U, // : Cost 3 vext2 <2,3,u,5>, <3,2,5,u> - 2631813532U, // : Cost 3 vext2 <2,3,u,5>, <3,3,3,3> - 2619869698U, // : Cost 3 vext2 <0,3,u,5>, <3,4,5,6> - 3001847002U, // : Cost 3 vzipr LHS, <4,4,5,5> - 2954070530U, // : Cost 3 vzipr LHS, <3,4,5,6> - 2018749750U, // : Cost 2 vtrnr LHS, RHS - 2018749751U, // : Cost 2 vtrnr LHS, RHS - 2573762662U, // : Cost 3 vext1 <3,u,5,4>, LHS - 2620017634U, // : Cost 3 vext2 <0,4,1,5>, <4,1,5,0> - 2573764338U, // : Cost 3 vext1 <3,u,5,4>, <2,3,u,5> - 2573765444U, // : Cost 3 vext1 <3,u,5,4>, <3,u,5,4> - 1570680053U, // : Cost 2 vext2 <4,4,u,5>, <4,4,u,5> - 1558072630U, // : Cost 2 vext2 <2,3,u,5>, RHS - 2645749143U, // : Cost 3 vext2 <4,6,u,5>, <4,6,u,5> - 1638330310U, // : Cost 2 vext3 RHS, <5,4,7,6> - 1558072873U, // : Cost 2 vext2 <2,3,u,5>, RHS - 1506000998U, // : Cost 2 vext1 <4,u,5,5>, LHS - 2561827984U, // : Cost 3 vext1 <1,u,5,5>, <1,5,3,7> - 2579744360U, // : Cost 3 vext1 <4,u,5,5>, <2,2,2,2> - 2579744918U, // : Cost 3 vext1 <4,u,5,5>, <3,0,1,2> - 1506004278U, // : Cost 2 vext1 <4,u,5,5>, RHS - 229035318U, // : Cost 1 vdup1 RHS - 2712072206U, // : Cost 3 vext3 RHS, <5,5,6,6> - 1638330392U, // : Cost 2 vext3 RHS, <5,5,7,7> - 229035318U, // : Cost 1 vdup1 RHS - 1500037222U, // : Cost 2 vext1 <3,u,5,6>, LHS - 2561836436U, // : Cost 3 vext1 <1,u,5,6>, <1,u,5,6> - 2567809133U, // : Cost 3 vext1 <2,u,5,6>, <2,u,5,6> - 1500040006U, // : Cost 2 vext1 <3,u,5,6>, <3,u,5,6> - 1500040502U, // : Cost 2 vext1 <3,u,5,6>, RHS - 2714062935U, // : Cost 3 vext3 RHS, <5,6,5,7> - 2712072288U, // : Cost 3 vext3 RHS, <5,6,6,7> - 27705344U, // : Cost 0 copy RHS - 27705344U, // : Cost 0 copy RHS - 1488101478U, // : Cost 2 vext1 <1,u,5,7>, LHS - 1488102805U, // : Cost 2 vext1 <1,u,5,7>, <1,u,5,7> - 2561844840U, // : Cost 3 vext1 <1,u,5,7>, <2,2,2,2> - 2561845398U, // : Cost 3 vext1 <1,u,5,7>, <3,0,1,2> - 1488104758U, // : Cost 2 vext1 <1,u,5,7>, RHS - 1638330536U, // : Cost 2 vext3 RHS, <5,7,5,7> - 2712072362U, // : Cost 3 vext3 RHS, <5,7,6,0> - 2042965302U, // : Cost 2 
vtrnr RHS, RHS - 1488107310U, // : Cost 2 vext1 <1,u,5,7>, LHS - 1488109670U, // : Cost 2 vext1 <1,u,5,u>, LHS - 1488110998U, // : Cost 2 vext1 <1,u,5,u>, <1,u,5,u> - 2561853032U, // : Cost 3 vext1 <1,u,5,u>, <2,2,2,2> - 1500056392U, // : Cost 2 vext1 <3,u,5,u>, <3,u,5,u> - 1488112950U, // : Cost 2 vext1 <1,u,5,u>, RHS - 229035318U, // : Cost 1 vdup1 RHS - 2954111490U, // : Cost 3 vzipr LHS, <3,4,5,6> - 27705344U, // : Cost 0 copy RHS - 27705344U, // : Cost 0 copy RHS - 2619211776U, // : Cost 3 vext2 <0,2,u,6>, <0,0,0,0> - 1545470054U, // : Cost 2 vext2 <0,2,u,6>, LHS - 1545470192U, // : Cost 2 vext2 <0,2,u,6>, <0,2,u,6> - 2255958969U, // : Cost 3 vrev <6,u,3,0> - 1546797458U, // : Cost 2 vext2 <0,4,u,6>, <0,4,u,6> - 2720624971U, // : Cost 3 vext3 <6,0,5,u>, <6,0,5,u> - 2256180180U, // : Cost 3 vrev <6,u,6,0> - 2960682294U, // : Cost 3 vzipr <1,2,u,0>, RHS - 1545470621U, // : Cost 2 vext2 <0,2,u,6>, LHS - 1182004127U, // : Cost 2 vrev <6,u,0,1> - 2619212596U, // : Cost 3 vext2 <0,2,u,6>, <1,1,1,1> - 2619212694U, // : Cost 3 vext2 <0,2,u,6>, <1,2,3,0> - 2619212760U, // : Cost 3 vext2 <0,2,u,6>, <1,3,1,3> - 2626511979U, // : Cost 3 vext2 <1,4,u,6>, <1,4,u,6> - 2619212944U, // : Cost 3 vext2 <0,2,u,6>, <1,5,3,7> - 2714063264U, // : Cost 3 vext3 RHS, <6,1,6,3> - 2967326006U, // : Cost 3 vzipr <2,3,u,1>, RHS - 1182594023U, // : Cost 2 vrev <6,u,u,1> - 1506050150U, // : Cost 2 vext1 <4,u,6,2>, LHS - 2579792630U, // : Cost 3 vext1 <4,u,6,2>, <1,0,3,2> - 2619213416U, // : Cost 3 vext2 <0,2,u,6>, <2,2,2,2> - 2619213478U, // : Cost 3 vext2 <0,2,u,6>, <2,3,0,1> - 1506053430U, // : Cost 2 vext1 <4,u,6,2>, RHS - 2633148309U, // : Cost 3 vext2 <2,5,u,6>, <2,5,u,6> - 2619213754U, // : Cost 3 vext2 <0,2,u,6>, <2,6,3,7> - 1638330874U, // : Cost 2 vext3 RHS, <6,2,7,3> - 1638478339U, // : Cost 2 vext3 RHS, <6,2,u,3> - 2619213974U, // : Cost 3 vext2 <0,2,u,6>, <3,0,1,2> - 2255836074U, // : Cost 3 vrev <6,u,1,3> - 2255909811U, // : Cost 3 vrev <6,u,2,3> - 2619214236U, // : Cost 3 vext2 <0,2,u,6>, <3,3,3,3> - 1564715549U, // : Cost 2 vext2 <3,4,u,6>, <3,4,u,6> - 2639121006U, // : Cost 3 vext2 <3,5,u,6>, <3,5,u,6> - 3001847012U, // : Cost 3 vzipr LHS, <4,4,6,6> - 1880329526U, // : Cost 2 vzipr LHS, RHS - 1880329527U, // : Cost 2 vzipr LHS, RHS - 2567864422U, // : Cost 3 vext1 <2,u,6,4>, LHS - 2733011558U, // : Cost 3 vext3 LHS, <6,4,1,3> - 2567866484U, // : Cost 3 vext1 <2,u,6,4>, <2,u,6,4> - 2638458005U, // : Cost 3 vext2 <3,4,u,6>, <4,3,6,u> - 1570540772U, // : Cost 2 vext2 <4,4,6,6>, <4,4,6,6> - 1545473334U, // : Cost 2 vext2 <0,2,u,6>, RHS - 1572015512U, // : Cost 2 vext2 <4,6,u,6>, <4,6,u,6> - 2960715062U, // : Cost 3 vzipr <1,2,u,4>, RHS - 1545473577U, // : Cost 2 vext2 <0,2,u,6>, RHS - 2567872614U, // : Cost 3 vext1 <2,u,6,5>, LHS - 2645757648U, // : Cost 3 vext2 <4,6,u,6>, <5,1,7,3> - 2567874490U, // : Cost 3 vext1 <2,u,6,5>, <2,6,3,7> - 2576501250U, // : Cost 3 vext1 <4,3,6,5>, <3,4,5,6> - 1576660943U, // : Cost 2 vext2 <5,4,u,6>, <5,4,u,6> - 2645757956U, // : Cost 3 vext2 <4,6,u,6>, <5,5,5,5> - 2645758050U, // : Cost 3 vext2 <4,6,u,6>, <5,6,7,0> - 2824080694U, // : Cost 3 vuzpr <0,u,2,6>, RHS - 1182626795U, // : Cost 2 vrev <6,u,u,5> - 1506082918U, // : Cost 2 vext1 <4,u,6,6>, LHS - 2579825398U, // : Cost 3 vext1 <4,u,6,6>, <1,0,3,2> - 2645758458U, // : Cost 3 vext2 <4,6,u,6>, <6,2,7,3> - 2579826838U, // : Cost 3 vext1 <4,u,6,6>, <3,0,1,2> - 1506086198U, // : Cost 2 vext1 <4,u,6,6>, RHS - 2579828432U, // : Cost 3 vext1 <4,u,6,6>, <5,1,7,3> - 296144182U, // : Cost 1 vdup2 RHS - 1638331202U, // : Cost 2 
vext3 RHS, <6,6,7,7> - 296144182U, // : Cost 1 vdup2 RHS - 432349286U, // : Cost 1 vext1 RHS, LHS - 1506091766U, // : Cost 2 vext1 RHS, <1,0,3,2> - 1506092648U, // : Cost 2 vext1 RHS, <2,2,2,2> - 1506093206U, // : Cost 2 vext1 RHS, <3,0,1,2> - 432352809U, // : Cost 1 vext1 RHS, RHS - 1506094800U, // : Cost 2 vext1 RHS, <5,1,7,3> - 1506095610U, // : Cost 2 vext1 RHS, <6,2,7,3> - 1906904374U, // : Cost 2 vzipr RHS, RHS - 432355118U, // : Cost 1 vext1 RHS, LHS - 432357478U, // : Cost 1 vext1 RHS, LHS - 1545475886U, // : Cost 2 vext2 <0,2,u,6>, LHS - 1506100840U, // : Cost 2 vext1 RHS, <2,2,2,2> - 1506101398U, // : Cost 2 vext1 RHS, <3,0,1,2> - 432361002U, // : Cost 1 vext1 RHS, RHS - 1545476250U, // : Cost 2 vext2 <0,2,u,6>, RHS - 296144182U, // : Cost 1 vdup2 RHS - 1880370486U, // : Cost 2 vzipr LHS, RHS - 432363310U, // : Cost 1 vext1 RHS, LHS - 1571356672U, // : Cost 2 vext2 RHS, <0,0,0,0> - 497614950U, // : Cost 1 vext2 RHS, LHS - 1571356836U, // : Cost 2 vext2 RHS, <0,2,0,2> - 2573880146U, // : Cost 3 vext1 <3,u,7,0>, <3,u,7,0> - 1571357010U, // : Cost 2 vext2 RHS, <0,4,1,5> - 1512083716U, // : Cost 2 vext1 <5,u,7,0>, <5,u,7,0> - 2621874741U, // : Cost 3 vext2 <0,6,u,7>, <0,6,u,7> - 2585826298U, // : Cost 3 vext1 <5,u,7,0>, <7,0,1,2> - 497615517U, // : Cost 1 vext2 RHS, LHS - 1571357430U, // : Cost 2 vext2 RHS, <1,0,3,2> - 1571357492U, // : Cost 2 vext2 RHS, <1,1,1,1> - 1571357590U, // : Cost 2 vext2 RHS, <1,2,3,0> - 1552114715U, // : Cost 2 vext2 <1,3,u,7>, <1,3,u,7> - 2573888822U, // : Cost 3 vext1 <3,u,7,1>, RHS - 1553441981U, // : Cost 2 vext2 <1,5,u,7>, <1,5,u,7> - 2627847438U, // : Cost 3 vext2 <1,6,u,7>, <1,6,u,7> - 2727408775U, // : Cost 3 vext3 <7,1,7,u>, <7,1,7,u> - 1555432880U, // : Cost 2 vext2 <1,u,u,7>, <1,u,u,7> - 2629838337U, // : Cost 3 vext2 <2,0,u,7>, <2,0,u,7> - 1188058754U, // : Cost 2 vrev <7,u,1,2> - 1571358312U, // : Cost 2 vext2 RHS, <2,2,2,2> - 1571358374U, // : Cost 2 vext2 RHS, <2,3,0,1> - 2632492869U, // : Cost 3 vext2 <2,4,u,7>, <2,4,u,7> - 2633156502U, // : Cost 3 vext2 <2,5,u,7>, <2,5,u,7> - 1560078311U, // : Cost 2 vext2 <2,6,u,7>, <2,6,u,7> - 2728072408U, // : Cost 3 vext3 <7,2,7,u>, <7,2,7,u> - 1561405577U, // : Cost 2 vext2 <2,u,u,7>, <2,u,u,7> - 1571358870U, // : Cost 2 vext2 RHS, <3,0,1,2> - 2627184913U, // : Cost 3 vext2 <1,5,u,7>, <3,1,5,u> - 2633820523U, // : Cost 3 vext2 <2,6,u,7>, <3,2,6,u> - 1571359132U, // : Cost 2 vext2 RHS, <3,3,3,3> - 1571359234U, // : Cost 2 vext2 RHS, <3,4,5,6> - 1512108295U, // : Cost 2 vext1 <5,u,7,3>, <5,u,7,3> - 1518080992U, // : Cost 2 vext1 <6,u,7,3>, <6,u,7,3> - 2640456465U, // : Cost 3 vext2 <3,7,u,7>, <3,7,u,7> - 1571359518U, // : Cost 2 vext2 RHS, <3,u,1,2> - 1571359634U, // : Cost 2 vext2 RHS, <4,0,5,1> - 2573911067U, // : Cost 3 vext1 <3,u,7,4>, <1,3,u,7> - 2645101622U, // : Cost 3 vext2 RHS, <4,2,5,3> - 2573912918U, // : Cost 3 vext1 <3,u,7,4>, <3,u,7,4> - 1571359952U, // : Cost 2 vext2 RHS, <4,4,4,4> - 497618248U, // : Cost 1 vext2 RHS, RHS - 1571360116U, // : Cost 2 vext2 RHS, <4,6,4,6> - 2645102024U, // : Cost 3 vext2 RHS, <4,7,5,0> - 497618473U, // : Cost 1 vext2 RHS, RHS - 2645102152U, // : Cost 3 vext2 RHS, <5,0,1,2> - 1571360464U, // : Cost 2 vext2 RHS, <5,1,7,3> - 2645102334U, // : Cost 3 vext2 RHS, <5,2,3,4> - 2645102447U, // : Cost 3 vext2 RHS, <5,3,7,0> - 1571360710U, // : Cost 2 vext2 RHS, <5,4,7,6> - 1571360772U, // : Cost 2 vext2 RHS, <5,5,5,5> - 1571360866U, // : Cost 2 vext2 RHS, <5,6,7,0> - 1571360936U, // : Cost 2 vext2 RHS, <5,7,5,7> - 1571361017U, // : Cost 2 vext2 RHS, <5,u,5,7> - 
1530044518U, // : Cost 2 vext1 , LHS - 2645103016U, // : Cost 3 vext2 RHS, <6,1,7,2> - 1571361274U, // : Cost 2 vext2 RHS, <6,2,7,3> - 2645103154U, // : Cost 3 vext2 RHS, <6,3,4,5> - 1530047798U, // : Cost 2 vext1 , RHS - 1188386474U, // : Cost 2 vrev <7,u,5,6> - 1571361592U, // : Cost 2 vext2 RHS, <6,6,6,6> - 1571361614U, // : Cost 2 vext2 RHS, <6,7,0,1> - 1571361695U, // : Cost 2 vext2 RHS, <6,u,0,1> - 1571361786U, // : Cost 2 vext2 RHS, <7,0,1,2> - 2573935616U, // : Cost 3 vext1 <3,u,7,7>, <1,3,5,7> - 2645103781U, // : Cost 3 vext2 RHS, <7,2,2,2> - 2573937497U, // : Cost 3 vext1 <3,u,7,7>, <3,u,7,7> - 1571362150U, // : Cost 2 vext2 RHS, <7,4,5,6> - 1512141067U, // : Cost 2 vext1 <5,u,7,7>, <5,u,7,7> - 1518113764U, // : Cost 2 vext1 <6,u,7,7>, <6,u,7,7> - 363253046U, // : Cost 1 vdup3 RHS - 363253046U, // : Cost 1 vdup3 RHS - 1571362515U, // : Cost 2 vext2 RHS, - 497620782U, // : Cost 1 vext2 RHS, LHS - 1571362693U, // : Cost 2 vext2 RHS, - 1571362748U, // : Cost 2 vext2 RHS, - 1571362879U, // : Cost 2 vext2 RHS, - 497621146U, // : Cost 1 vext2 RHS, RHS - 1571363024U, // : Cost 2 vext2 RHS, - 363253046U, // : Cost 1 vdup3 RHS - 497621349U, // : Cost 1 vext2 RHS, LHS - 135053414U, // : Cost 1 vdup0 LHS - 471081121U, // : Cost 1 vext2 LHS, LHS - 1544822948U, // : Cost 2 vext2 LHS, <0,2,0,2> - 1616140005U, // : Cost 2 vext3 LHS, - 1544823122U, // : Cost 2 vext2 LHS, <0,4,1,5> - 1512157453U, // : Cost 2 vext1 <5,u,u,0>, <5,u,u,0> - 1662220032U, // : Cost 2 vext3 RHS, - 1194457487U, // : Cost 2 vrev - 471081629U, // : Cost 1 vext2 LHS, LHS - 1544823542U, // : Cost 2 vext2 LHS, <1,0,3,2> - 202162278U, // : Cost 1 vdup1 LHS - 537753390U, // : Cost 1 vext3 LHS, LHS - 1544823768U, // : Cost 2 vext2 LHS, <1,3,1,3> - 1494248758U, // : Cost 2 vext1 <2,u,u,1>, RHS - 1544823952U, // : Cost 2 vext2 LHS, <1,5,3,7> - 1518138343U, // : Cost 2 vext1 <6,u,u,1>, <6,u,u,1> - 1640322907U, // : Cost 2 vext3 RHS, - 537753444U, // : Cost 1 vext3 LHS, LHS - 1482309734U, // : Cost 2 vext1 <0,u,u,2>, LHS - 1194031451U, // : Cost 2 vrev - 269271142U, // : Cost 1 vdup2 LHS - 835584U, // : Cost 0 copy LHS - 1482313014U, // : Cost 2 vext1 <0,u,u,2>, RHS - 2618566504U, // : Cost 3 vext2 LHS, <2,5,3,6> - 1544824762U, // : Cost 2 vext2 LHS, <2,6,3,7> - 1638479788U, // : Cost 2 vext3 RHS, - 835584U, // : Cost 0 copy LHS - 408576723U, // : Cost 1 vext1 LHS, LHS - 1482318582U, // : Cost 2 vext1 LHS, <1,0,3,2> - 120371557U, // : Cost 1 vrev LHS - 336380006U, // : Cost 1 vdup3 LHS - 408579382U, // : Cost 1 vext1 LHS, RHS - 1616140271U, // : Cost 2 vext3 LHS, - 1530098170U, // : Cost 2 vext1 LHS, <6,2,7,3> - 1880329544U, // : Cost 2 vzipr LHS, RHS - 408581934U, // : Cost 1 vext1 LHS, LHS - 1488298086U, // : Cost 2 vext1 <1,u,u,4>, LHS - 1488299437U, // : Cost 2 vext1 <1,u,u,4>, <1,u,u,4> - 1659271204U, // : Cost 2 vext3 LHS, - 1194195311U, // : Cost 2 vrev - 161926454U, // : Cost 1 vdup0 RHS - 471084342U, // : Cost 1 vext2 LHS, RHS - 1571368308U, // : Cost 2 vext2 RHS, <4,6,4,6> - 1640323153U, // : Cost 2 vext3 RHS, - 471084585U, // : Cost 1 vext2 LHS, RHS - 1494278246U, // : Cost 2 vext1 <2,u,u,5>, LHS - 1571368656U, // : Cost 2 vext2 RHS, <5,1,7,3> - 1494280327U, // : Cost 2 vext1 <2,u,u,5>, <2,u,u,5> - 1616140415U, // : Cost 2 vext3 LHS, - 1494281526U, // : Cost 2 vext1 <2,u,u,5>, RHS - 229035318U, // : Cost 1 vdup1 RHS - 537753754U, // : Cost 1 vext3 LHS, RHS - 1750355254U, // : Cost 2 vuzpr LHS, RHS - 537753772U, // : Cost 1 vext3 LHS, RHS - 1482342502U, // : Cost 2 vext1 <0,u,u,6>, LHS - 2556084982U, // : Cost 3 vext1 
<0,u,u,6>, <1,0,3,2> - 1571369466U, // : Cost 2 vext2 RHS, <6,2,7,3> - 1611938000U, // : Cost 2 vext3 LHS, - 1482345782U, // : Cost 2 vext1 <0,u,u,6>, RHS - 1194359171U, // : Cost 2 vrev - 296144182U, // : Cost 1 vdup2 RHS - 27705344U, // : Cost 0 copy RHS - 27705344U, // : Cost 0 copy RHS - 432496742U, // : Cost 1 vext1 RHS, LHS - 1488324016U, // : Cost 2 vext1 <1,u,u,7>, <1,u,u,7> - 1494296713U, // : Cost 2 vext1 <2,u,u,7>, <2,u,u,7> - 1906901148U, // : Cost 2 vzipr RHS, LHS - 432500283U, // : Cost 1 vext1 RHS, RHS - 1506242256U, // : Cost 2 vext1 RHS, <5,1,7,3> - 120699277U, // : Cost 1 vrev RHS - 363253046U, // : Cost 1 vdup3 RHS - 432502574U, // : Cost 1 vext1 RHS, LHS - 408617688U, // : Cost 1 vext1 LHS, LHS - 471086894U, // : Cost 1 vext2 LHS, LHS - 537753957U, // : Cost 1 vext3 LHS, LHS - 835584U, // : Cost 0 copy LHS - 408620342U, // : Cost 1 vext1 LHS, RHS - 471087258U, // : Cost 1 vext2 LHS, RHS - 537753997U, // : Cost 1 vext3 LHS, RHS - 27705344U, // : Cost 0 copy RHS - 835584U, // : Cost 0 copy LHS - 0 -}; diff --git a/lib/Target/ARM64/ARM64PromoteConstant.cpp b/lib/Target/ARM64/ARM64PromoteConstant.cpp deleted file mode 100644 index 9fbaedb..0000000 --- a/lib/Target/ARM64/ARM64PromoteConstant.cpp +++ /dev/null @@ -1,585 +0,0 @@ - -//===-- ARM64PromoteConstant.cpp --- Promote constant to global for ARM64 -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64PromoteConstant pass which promotes constant -// to global variables when this is likely to be more efficient. -// Currently only types related to constant vector (i.e., constant vector, array -// of constant vectors, constant structure with a constant vector field, etc.) -// are promoted to global variables. -// Indeed, constant vector are likely to be lowered in target constant pool -// during instruction selection. -// Therefore, the access will remain the same (memory load), but the structures -// types are not split into different constant pool accesses for each field. -// The bonus side effect is that created globals may be merged by the global -// merge pass. -// -// FIXME: This pass may be useful for other targets too. -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "arm64-promote-const" -#include "ARM64.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" - -using namespace llvm; - -// Stress testing mode - disable heuristics. 
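For readers skimming the deleted pass: its core rewrite is small and worth seeing in isolation. Below is a minimal sketch against the LLVM C++ API this file already uses (editor's illustration, not patch content; the free function and its name are invented, and the real pass shares one global across a whole module):

  #include "llvm/IR/GlobalVariable.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  // Materialize Cst once as an internal constant global, then replace one
  // inline use with a load issued at the chosen insertion point.
  static GlobalVariable *promoteOnce(Module &M, Constant *Cst,
                                     Instruction *InsertPt, Use &U) {
    auto *GV = new GlobalVariable(M, Cst->getType(), /*isConstant=*/true,
                                  GlobalValue::InternalLinkage, Cst,
                                  "_PromotedConst");
    IRBuilder<> Builder(InsertPt);
    U.set(Builder.CreateLoad(GV)); // one memory load serves this use
    return GV;
  }

The pass proper amortizes further: it computes a minimal set of dominating insertion points per function so several uses share one load. The Stress option declared next disables those profitability heuristics for testing: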
-static cl::opt Stress("arm64-stress-promote-const", cl::Hidden, - cl::desc("Promote all vector constants")); - -STATISTIC(NumPromoted, "Number of promoted constants"); -STATISTIC(NumPromotedUses, "Number of promoted constants uses"); - -//===----------------------------------------------------------------------===// -// ARM64PromoteConstant -//===----------------------------------------------------------------------===// - -namespace { -/// Promotes interesting constant into global variables. -/// The motivating example is: -/// static const uint16_t TableA[32] = { -/// 41944, 40330, 38837, 37450, 36158, 34953, 33826, 32768, -/// 31776, 30841, 29960, 29128, 28340, 27595, 26887, 26215, -/// 25576, 24967, 24386, 23832, 23302, 22796, 22311, 21846, -/// 21400, 20972, 20561, 20165, 19785, 19419, 19066, 18725, -/// }; -/// -/// uint8x16x4_t LoadStatic(void) { -/// uint8x16x4_t ret; -/// ret.val[0] = vld1q_u16(TableA + 0); -/// ret.val[1] = vld1q_u16(TableA + 8); -/// ret.val[2] = vld1q_u16(TableA + 16); -/// ret.val[3] = vld1q_u16(TableA + 24); -/// return ret; -/// } -/// -/// The constants in that example are folded into the uses. Thus, 4 different -/// constants are created. -/// As their type is vector the cheapest way to create them is to load them -/// for the memory. -/// Therefore the final assembly final has 4 different load. -/// With this pass enabled, only one load is issued for the constants. -class ARM64PromoteConstant : public ModulePass { - -public: - static char ID; - ARM64PromoteConstant() : ModulePass(ID) {} - - virtual const char *getPassName() const { return "ARM64 Promote Constant"; } - - /// Iterate over the functions and promote the interesting constants into - /// global variables with module scope. - bool runOnModule(Module &M) { - DEBUG(dbgs() << getPassName() << '\n'); - bool Changed = false; - for (auto &MF: M) { - Changed |= runOnFunction(MF); - } - return Changed; - } - -private: - /// Look for interesting constants used within the given function. - /// Promote them into global variables, load these global variables within - /// the related function, so that the number of inserted load is minimal. - bool runOnFunction(Function &F); - - // This transformation requires dominator info - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - } - - /// Type to store a list of User - typedef SmallVector Users; - /// Map an insertion point to all the uses it dominates. - typedef DenseMap InsertionPoints; - /// Map a function to the required insertion point of load for a - /// global variable - typedef DenseMap InsertionPointsPerFunc; - - /// Find the closest point that dominates the given Use. - Instruction *findInsertionPoint(Value::user_iterator &Use); - - /// Check if the given insertion point is dominated by an existing - /// insertion point. - /// If true, the given use is added to the list of dominated uses for - /// the related existing point. 
-  /// \param NewPt the insertion point to be checked
-  /// \param UseIt the use to be added into the list of dominated uses
-  /// \param InsertPts existing insertion points
-  /// \pre NewPt and all instructions in InsertPts belong to the same function
-  /// \return true if one of the insertion points in InsertPts dominates NewPt,
-  ///         false otherwise
-  bool isDominated(Instruction *NewPt, Value::user_iterator &UseIt,
-                   InsertionPoints &InsertPts);
-
-  /// Check if the given insertion point can be merged with an existing
-  /// insertion point in a common dominator.
-  /// If true, the given use is added to the list of the created insertion
-  /// point.
-  /// \param NewPt the insertion point to be checked
-  /// \param UseIt the use to be added into the list of dominated uses
-  /// \param InsertPts existing insertion points
-  /// \pre NewPt and all instructions in InsertPts belong to the same function
-  /// \pre isDominated returns false for the exact same parameters.
-  /// \return true if there exists an insertion point in InsertPts that could
-  ///         have been merged with NewPt in a common dominator,
-  ///         false otherwise
-  bool tryAndMerge(Instruction *NewPt, Value::user_iterator &UseIt,
-                   InsertionPoints &InsertPts);
-
-  /// Compute the minimal insertion points to dominate all the interesting
-  /// uses of a value.
-  /// Insertion points are grouped per function, and each insertion point
-  /// contains a list of all the uses it dominates within the related function.
-  /// \param Val constant to be examined
-  /// \param[out] InsPtsPerFunc output storage of the analysis
-  void computeInsertionPoints(Constant *Val,
-                              InsertionPointsPerFunc &InsPtsPerFunc);
-
-  /// Insert a definition of a new global variable at each point contained in
-  /// InsPtsPerFunc and update the related uses (also contained in
-  /// InsPtsPerFunc).
-  bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc);
-
-  /// Compute the minimal insertion points to dominate all the interesting
-  /// uses of Val and insert a definition of a new global variable
-  /// at these points.
-  /// Also update the uses of Val accordingly.
-  /// Currently a use of Val is considered interesting if:
-  /// - Val is not UndefValue
-  /// - Val is not zeroinitialized
-  /// - Replacing Val by a load of a global variable is valid.
-  /// \see shouldConvert for more details
-  bool computeAndInsertDefinitions(Constant *Val);
-
-  /// Promote the given constant into a global variable if it is expected to
-  /// be profitable.
-  /// \return true if Cst has been promoted
-  bool promoteConstant(Constant *Cst);
-
-  /// Transfer the list of dominated uses of IPI to NewPt in InsertPts.
-  /// Append UseIt to this list and delete the entry of IPI in InsertPts.
-  static void appendAndTransferDominatedUses(Instruction *NewPt,
-                                             Value::user_iterator &UseIt,
-                                             InsertionPoints::iterator &IPI,
-                                             InsertionPoints &InsertPts) {
-    // Record the dominated use.
-    IPI->second.push_back(UseIt);
-    // Transfer the dominated uses of IPI to NewPt.
-    // Inserting into the DenseMap may invalidate the existing iterator.
-    // Keep a copy of the key to find the iterator to erase.
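The same pitfall, reduced to a standalone snippet with std::unordered_map, which, like DenseMap, may invalidate iterators when an insertion triggers a rehash (editor's illustration, not patch content):

  #include <unordered_map>
  #include <vector>
  using Uses = std::vector<int>;

  void transferEntry(std::unordered_map<int, Uses> &Map,
                     std::unordered_map<int, Uses>::iterator It, int NewKey) {
    int OldKey = It->first;          // save the key before inserting
    Map.emplace(NewKey, It->second); // insertion may invalidate It
    Map.erase(Map.find(OldKey));     // re-find by the saved key, then erase
  }

The deleted lines below apply exactly this save/insert/re-find/erase sequence: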
-    Instruction *OldInstr = IPI->first;
-    InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second));
-    // Erase IPI.
-    IPI = InsertPts.find(OldInstr);
-    InsertPts.erase(IPI);
-  }
-};
-} // end anonymous namespace
-
-char ARM64PromoteConstant::ID = 0;
-
-namespace llvm {
-void initializeARM64PromoteConstantPass(PassRegistry &);
-}
-
-INITIALIZE_PASS_BEGIN(ARM64PromoteConstant, "arm64-promote-const",
-                      "ARM64 Promote Constant Pass", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(ARM64PromoteConstant, "arm64-promote-const",
-                    "ARM64 Promote Constant Pass", false, false)
-
-ModulePass *llvm::createARM64PromoteConstantPass() {
-  return new ARM64PromoteConstant();
-}
-
-/// Check if the given type uses a vector type.
-static bool isConstantUsingVectorTy(const Type *CstTy) {
-  if (CstTy->isVectorTy())
-    return true;
-  if (CstTy->isStructTy()) {
-    for (unsigned EltIdx = 0, EndEltIdx = CstTy->getStructNumElements();
-         EltIdx < EndEltIdx; ++EltIdx)
-      if (isConstantUsingVectorTy(CstTy->getStructElementType(EltIdx)))
-        return true;
-  } else if (CstTy->isArrayTy())
-    return isConstantUsingVectorTy(CstTy->getArrayElementType());
-  return false;
-}
-
-/// Check if the given use (Instruction + OpIdx) of Cst should be converted
-/// into a load of a global variable initialized with Cst.
-/// A use should be converted if it is legal to do so.
-/// For instance, it is not legal to turn the mask operand of a shuffle vector
-/// into a load of a global variable.
-static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
-                             unsigned OpIdx) {
-  // shufflevector instruction expects a const for the mask argument, i.e., the
-  // third argument. Do not promote this use in that case.
-  if (isa<const ShuffleVectorInst>(Instr) && OpIdx == 2)
-    return false;
-
-  // extractvalue instruction expects a const idx.
-  if (isa<const ExtractValueInst>(Instr) && OpIdx > 0)
-    return false;
-
-  // insertvalue instruction expects a const idx.
-  if (isa<const InsertValueInst>(Instr) && OpIdx > 1)
-    return false;
-
-  if (isa<const AllocaInst>(Instr) && OpIdx > 0)
-    return false;
-
-  // Alignment argument must be constant.
-  if (isa<const LoadInst>(Instr) && OpIdx > 0)
-    return false;
-
-  // Alignment argument must be constant.
-  if (isa<const StoreInst>(Instr) && OpIdx > 1)
-    return false;
-
-  // Index must be constant.
-  if (isa<const GetElementPtrInst>(Instr) && OpIdx > 0)
-    return false;
-
-  // Personality function and filters must be constant.
-  // Give up on that instruction.
-  if (isa<const LandingPadInst>(Instr))
-    return false;
-
-  // switch instruction expects constants to compare to.
-  if (isa<const SwitchInst>(Instr))
-    return false;
-
-  // Expected address must be a constant.
-  if (isa<const IndirectBrInst>(Instr))
-    return false;
-
-  // Do not mess with intrinsics.
-  if (isa<const IntrinsicInst>(Instr))
-    return false;
-
-  // Do not mess with inline asm.
-  const CallInst *CI = dyn_cast<const CallInst>(Instr);
-  if (CI && isa<const InlineAsm>(CI->getCalledValue()))
-    return false;
-
-  return true;
-}
-
-/// Check if the given Cst should be converted into
-/// a load of a global variable initialized with Cst.
-/// A constant should be converted if it is likely that the materialization of
-/// the constant will be tricky. Thus, we give up on zero or undef values.
-///
-/// \todo Currently, accept only vector related types.
-/// Also we give up on all simple vector types to keep the existing
-/// behavior. Otherwise, we should push here all the checks of the lowering of
-/// BUILD_VECTOR. By giving up, we lose the potential benefit of merging
-/// constants via global merge and the fact that the same constant is stored
-/// only once with this method (versus one copy per function that uses the
-/// constant with the regular approach, even for float).
-/// Again, the simplest solution would be to promote every
-/// constant and rematerialize them when they are actually cheap to create.
-static bool shouldConvert(const Constant *Cst) {
-  if (isa<const UndefValue>(Cst))
-    return false;
-
-  // FIXME: In some cases, it may be interesting to promote in memory
-  // a zero initialized constant.
-  // E.g., when the type of Cst requires more instructions than the
-  // adrp/add/load sequence or when this sequence can be shared by several
-  // instances of Cst.
-  // Ideally, we could promote this into a global and rematerialize the
-  // constant when it was a bad idea.
-  if (Cst->isZeroValue())
-    return false;
-
-  if (Stress)
-    return true;
-
-  // FIXME: see function \todo
-  if (Cst->getType()->isVectorTy())
-    return false;
-  return isConstantUsingVectorTy(Cst->getType());
-}
-
-Instruction *
-ARM64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) {
-  // If this user is a phi, the insertion point is in the related
-  // incoming basic block.
-  PHINode *PhiInst = dyn_cast<PHINode>(*Use);
-  Instruction *InsertionPoint;
-  if (PhiInst)
-    InsertionPoint =
-        PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator();
-  else
-    InsertionPoint = dyn_cast<Instruction>(*Use);
-  assert(InsertionPoint && "User is not an instruction!");
-  return InsertionPoint;
-}
-
-bool ARM64PromoteConstant::isDominated(Instruction *NewPt,
-                                       Value::user_iterator &UseIt,
-                                       InsertionPoints &InsertPts) {
-
-  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
-      *NewPt->getParent()->getParent()).getDomTree();
-
-  // Traverse all the existing insertion points and check if one is dominating
-  // NewPt.
-  for (InsertionPoints::iterator IPI = InsertPts.begin(),
-                                 EndIPI = InsertPts.end();
-       IPI != EndIPI; ++IPI) {
-    if (NewPt == IPI->first || DT.dominates(IPI->first, NewPt) ||
-        // When IPI->first is a terminator instruction, DT may think that
-        // the result is defined on the edge.
-        // Here we are testing the insertion point, not the definition.
-        (IPI->first->getParent() != NewPt->getParent() &&
-         DT.dominates(IPI->first->getParent(), NewPt->getParent()))) {
-      // No need to insert this point.
-      // Record the dominated use.
-      DEBUG(dbgs() << "Insertion point dominated by:\n");
-      DEBUG(IPI->first->print(dbgs()));
-      DEBUG(dbgs() << '\n');
-      IPI->second.push_back(UseIt);
-      return true;
-    }
-  }
-  return false;
-}
-
-bool ARM64PromoteConstant::tryAndMerge(Instruction *NewPt,
-                                       Value::user_iterator &UseIt,
-                                       InsertionPoints &InsertPts) {
-  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
-      *NewPt->getParent()->getParent()).getDomTree();
-  BasicBlock *NewBB = NewPt->getParent();
-
-  // Traverse all the existing insertion points and check if one is dominated
-  // by NewPt and thus useless, or if one can be combined with NewPt into a
-  // common dominator.
-  for (InsertionPoints::iterator IPI = InsertPts.begin(),
-                                 EndIPI = InsertPts.end();
-       IPI != EndIPI; ++IPI) {
-    BasicBlock *CurBB = IPI->first->getParent();
-    if (NewBB == CurBB) {
-      // Instructions are in the same block.
-      // By construction, NewPt is dominating the other.
-      // Indeed, isDominated returned false with the exact same arguments.
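The terminator caveat in isDominated above condenses to a two-level predicate: exact instruction dominance first, then block dominance when the two points sit in different blocks. A sketch over the same DominatorTree API (editor's illustration, not patch content):

  #include "llvm/IR/Dominators.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  static bool dominatesInsertionPoint(DominatorTree &DT, Instruction *Existing,
                                      Instruction *NewPt) {
    if (Existing == NewPt || DT.dominates(Existing, NewPt))
      return true;
    // For a terminator, instruction-level dominance is about the value
    // defined on the outgoing edge; for insertion points, block-level
    // dominance is the right question.
    return Existing->getParent() != NewPt->getParent() &&
           DT.dominates(Existing->getParent(), NewPt->getParent());
  }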
- DEBUG(dbgs() << "Merge insertion point with:\n"); - DEBUG(IPI->first->print(dbgs())); - DEBUG(dbgs() << "\nat considered insertion point.\n"); - appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts); - return true; - } - - // Look for a common dominator - BasicBlock *CommonDominator = DT.findNearestCommonDominator(NewBB, CurBB); - // If none exists, we cannot merge these two points - if (!CommonDominator) - continue; - - if (CommonDominator != NewBB) { - // By construction, the CommonDominator cannot be CurBB - assert(CommonDominator != CurBB && - "Instruction has not been rejected during isDominated check!"); - // Take the last instruction of the CommonDominator as insertion point - NewPt = CommonDominator->getTerminator(); - } - // else, CommonDominator is the block of NewBB, hence NewBB is the last - // possible insertion point in that block - DEBUG(dbgs() << "Merge insertion point with:\n"); - DEBUG(IPI->first->print(dbgs())); - DEBUG(dbgs() << '\n'); - DEBUG(NewPt->print(dbgs())); - DEBUG(dbgs() << '\n'); - appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts); - return true; - } - return false; -} - -void ARM64PromoteConstant::computeInsertionPoints( - Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) { - DEBUG(dbgs() << "** Compute insertion points **\n"); - for (Value::user_iterator UseIt = Val->user_begin(), - EndUseIt = Val->user_end(); - UseIt != EndUseIt; ++UseIt) { - // If the user is not an Instruction, we cannot modify it - if (!isa(*UseIt)) - continue; - - // Filter out uses that should not be converted - if (!shouldConvertUse(Val, cast(*UseIt), UseIt.getOperandNo())) - continue; - - DEBUG(dbgs() << "Considered use, opidx " << UseIt.getOperandNo() << ":\n"); - DEBUG((*UseIt)->print(dbgs())); - DEBUG(dbgs() << '\n'); - - Instruction *InsertionPoint = findInsertionPoint(UseIt); - - DEBUG(dbgs() << "Considered insertion point:\n"); - DEBUG(InsertionPoint->print(dbgs())); - DEBUG(dbgs() << '\n'); - - // Check if the current insertion point is useless, i.e., it is dominated - // by another one. - InsertionPoints &InsertPts = - InsPtsPerFunc[InsertionPoint->getParent()->getParent()]; - if (isDominated(InsertionPoint, UseIt, InsertPts)) - continue; - // This insertion point is useful, check if we can merge some insertion - // point in a common dominator or if NewPt dominates an existing one. 
- if (tryAndMerge(InsertionPoint, UseIt, InsertPts)) - continue; - - DEBUG(dbgs() << "Keep considered insertion point\n"); - - // It is definitely useful by its own - InsertPts[InsertionPoint].push_back(UseIt); - } -} - -bool -ARM64PromoteConstant::insertDefinitions(Constant *Cst, - InsertionPointsPerFunc &InsPtsPerFunc) { - // We will create one global variable per Module - DenseMap ModuleToMergedGV; - bool HasChanged = false; - - // Traverse all insertion points in all the function - for (InsertionPointsPerFunc::iterator FctToInstPtsIt = InsPtsPerFunc.begin(), - EndIt = InsPtsPerFunc.end(); - FctToInstPtsIt != EndIt; ++FctToInstPtsIt) { - InsertionPoints &InsertPts = FctToInstPtsIt->second; -// Do more check for debug purposes -#ifndef NDEBUG - DominatorTree &DT = getAnalysis( - *FctToInstPtsIt->first).getDomTree(); -#endif - GlobalVariable *PromotedGV; - assert(!InsertPts.empty() && "Empty uses does not need a definition"); - - Module *M = FctToInstPtsIt->first->getParent(); - DenseMap::iterator MapIt = - ModuleToMergedGV.find(M); - if (MapIt == ModuleToMergedGV.end()) { - PromotedGV = new GlobalVariable( - *M, Cst->getType(), true, GlobalValue::InternalLinkage, 0, - "_PromotedConst", 0, GlobalVariable::NotThreadLocal); - PromotedGV->setInitializer(Cst); - ModuleToMergedGV[M] = PromotedGV; - DEBUG(dbgs() << "Global replacement: "); - DEBUG(PromotedGV->print(dbgs())); - DEBUG(dbgs() << '\n'); - ++NumPromoted; - HasChanged = true; - } else { - PromotedGV = MapIt->second; - } - - for (InsertionPoints::iterator IPI = InsertPts.begin(), - EndIPI = InsertPts.end(); - IPI != EndIPI; ++IPI) { - // Create the load of the global variable - IRBuilder<> Builder(IPI->first->getParent(), IPI->first); - LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV); - DEBUG(dbgs() << "**********\n"); - DEBUG(dbgs() << "New def: "); - DEBUG(LoadedCst->print(dbgs())); - DEBUG(dbgs() << '\n'); - - // Update the dominated uses - Users &DominatedUsers = IPI->second; - for (Users::iterator UseIt = DominatedUsers.begin(), - EndIt = DominatedUsers.end(); - UseIt != EndIt; ++UseIt) { -#ifndef NDEBUG - assert((DT.dominates(LoadedCst, cast(**UseIt)) || - (isa(**UseIt) && - DT.dominates(LoadedCst, findInsertionPoint(*UseIt)))) && - "Inserted definition does not dominate all its uses!"); -#endif - DEBUG(dbgs() << "Use to update " << UseIt->getOperandNo() << ":"); - DEBUG((*UseIt)->print(dbgs())); - DEBUG(dbgs() << '\n'); - (*UseIt)->setOperand(UseIt->getOperandNo(), LoadedCst); - ++NumPromotedUses; - } - } - } - return HasChanged; -} - -bool ARM64PromoteConstant::computeAndInsertDefinitions(Constant *Val) { - InsertionPointsPerFunc InsertPtsPerFunc; - computeInsertionPoints(Val, InsertPtsPerFunc); - return insertDefinitions(Val, InsertPtsPerFunc); -} - -bool ARM64PromoteConstant::promoteConstant(Constant *Cst) { - assert(Cst && "Given variable is not a valid constant."); - - if (!shouldConvert(Cst)) - return false; - - DEBUG(dbgs() << "******************************\n"); - DEBUG(dbgs() << "Candidate constant: "); - DEBUG(Cst->print(dbgs())); - DEBUG(dbgs() << '\n'); - - return computeAndInsertDefinitions(Cst); -} - -bool ARM64PromoteConstant::runOnFunction(Function &F) { - // Look for instructions using constant vector - // Promote that constant to a global variable. 
- // Create as few load of this variable as possible and update the uses - // accordingly - bool LocalChange = false; - SmallSet AlreadyChecked; - - for (auto &MBB : F) { - for (auto &MI: MBB) { - // Traverse the operand, looking for constant vectors - // Replace them by a load of a global variable of type constant vector - for (unsigned OpIdx = 0, EndOpIdx = MI.getNumOperands(); - OpIdx != EndOpIdx; ++OpIdx) { - Constant *Cst = dyn_cast(MI.getOperand(OpIdx)); - // There is no point is promoting global value, they are already global. - // Do not promote constant expression, as they may require some code - // expansion. - if (Cst && !isa(Cst) && !isa(Cst) && - AlreadyChecked.insert(Cst)) - LocalChange |= promoteConstant(Cst); - } - } - } - return LocalChange; -} diff --git a/lib/Target/ARM64/ARM64RegisterInfo.cpp b/lib/Target/ARM64/ARM64RegisterInfo.cpp deleted file mode 100644 index 4c7fc8a..0000000 --- a/lib/Target/ARM64/ARM64RegisterInfo.cpp +++ /dev/null @@ -1,400 +0,0 @@ -//===- ARM64RegisterInfo.cpp - ARM64 Register Information -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the TargetRegisterInfo class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64RegisterInfo.h" -#include "ARM64FrameLowering.h" -#include "ARM64InstrInfo.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" -#include "llvm/Target/TargetOptions.h" - -#define GET_REGINFO_TARGET_DESC -#include "ARM64GenRegisterInfo.inc" - -using namespace llvm; - -ARM64RegisterInfo::ARM64RegisterInfo(const ARM64InstrInfo *tii, - const ARM64Subtarget *sti) - : ARM64GenRegisterInfo(ARM64::LR), TII(tii), STI(sti) {} - -const uint16_t * -ARM64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - assert(MF && "Invalid MachineFunction pointer."); - if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) - return CSR_ARM64_AllRegs_SaveList; - else - return CSR_ARM64_AAPCS_SaveList; -} - -const uint32_t * -ARM64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { - if (CC == CallingConv::AnyReg) - return CSR_ARM64_AllRegs_RegMask; - else - return CSR_ARM64_AAPCS_RegMask; -} - -const uint32_t *ARM64RegisterInfo::getTLSCallPreservedMask() const { - if (STI->isTargetDarwin()) - return CSR_ARM64_TLS_Darwin_RegMask; - - assert(STI->isTargetELF() && "only expect Darwin or ELF TLS"); - return CSR_ARM64_TLS_ELF_RegMask; -} - -const uint32_t * -ARM64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const { - // This should return a register mask that is the same as that returned by - // getCallPreservedMask but that additionally preserves the register used for - // the first i64 argument (which must also be the register used to return a - // single i64 return value) - // - // In case that the calling convention does not use the same register for - // both, the function should return NULL 
(does not currently apply) - return CSR_ARM64_AAPCS_ThisReturn_RegMask; -} - -BitVector ARM64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - // FIXME: avoid re-calculating this everytime. - BitVector Reserved(getNumRegs()); - Reserved.set(ARM64::SP); - Reserved.set(ARM64::XZR); - Reserved.set(ARM64::WSP); - Reserved.set(ARM64::WZR); - - if (TFI->hasFP(MF) || STI->isTargetDarwin()) { - Reserved.set(ARM64::FP); - Reserved.set(ARM64::W29); - } - - if (STI->isTargetDarwin()) { - Reserved.set(ARM64::X18); // Platform register - Reserved.set(ARM64::W18); - } - - if (hasBasePointer(MF)) { - Reserved.set(ARM64::X19); - Reserved.set(ARM64::W19); - } - - return Reserved; -} - -bool ARM64RegisterInfo::isReservedReg(const MachineFunction &MF, - unsigned Reg) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - switch (Reg) { - default: - break; - case ARM64::SP: - case ARM64::XZR: - case ARM64::WSP: - case ARM64::WZR: - return true; - case ARM64::X18: - case ARM64::W18: - return STI->isTargetDarwin(); - case ARM64::FP: - case ARM64::W29: - return TFI->hasFP(MF) || STI->isTargetDarwin(); - case ARM64::W19: - case ARM64::X19: - return hasBasePointer(MF); - } - - return false; -} - -const TargetRegisterClass * -ARM64RegisterInfo::getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const { - return &ARM64::GPR64RegClass; -} - -const TargetRegisterClass * -ARM64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { - if (RC == &ARM64::CCRRegClass) - return NULL; // Can't copy CPSR. - return RC; -} - -unsigned ARM64RegisterInfo::getBaseRegister() const { return ARM64::X19; } - -bool ARM64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - - // In the presence of variable sized objects, if the fixed stack size is - // large enough that referencing from the FP won't result in things being - // in range relatively often, we can use a base pointer to allow access - // from the other direction like the SP normally works. - if (MFI->hasVarSizedObjects()) { - // Conservatively estimate whether the negative offset from the frame - // pointer will be sufficient to reach. If a function has a smallish - // frame, it's less likely to have lots of spills and callee saved - // space, so it's all more likely to be within range of the frame pointer. - // If it's wrong, we'll materialize the constant and still get to the - // object; it's just suboptimal. Negative offsets use the unscaled - // load/store instructions, which have a 9-bit signed immediate. - if (MFI->getLocalFrameSize() < 256) - return false; - return true; - } - - return false; -} - -unsigned ARM64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - return TFI->hasFP(MF) ? 
ARM64::FP : ARM64::SP; -} - -bool -ARM64RegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const { - return true; -} - -bool ARM64RegisterInfo::requiresVirtualBaseRegisters(const MachineFunction &MF) - const { - return true; -} - -bool -ARM64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - // ARM64FrameLowering::resolveFrameIndexReference() can always fall back - // to the stack pointer, so only put the emergency spill slot next to the - // FP when there's no better way to access it (SP or base pointer). - return MFI->hasVarSizedObjects() && !hasBasePointer(MF); -} - -bool ARM64RegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) - const { - return true; -} - -bool ARM64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - // Only consider eliminating leaf frames. - if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) && - MFI->adjustsStack())) - return true; - return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); -} - -/// needsFrameBaseReg - Returns true if the instruction's frame index -/// reference would be better served by a base register other than FP -/// or SP. Used by LocalStackFrameAllocation to determine which frame index -/// references it should create new base registers for. -bool ARM64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, - int64_t Offset) const { - for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i) - assert(i < MI->getNumOperands() && - "Instr doesn't have FrameIndex operand!"); - - // It's the load/store FI references that cause issues, as it can be difficult - // to materialize the offset if it won't fit in the literal field. Estimate - // based on the size of the local frame and some conservative assumptions - // about the rest of the stack frame (note, this is pre-regalloc, so - // we don't know everything for certain yet) whether this offset is likely - // to be out of range of the immediate. Return true if so. - - // We only generate virtual base registers for loads and stores, so - // return false for everything else. - if (!MI->mayLoad() && !MI->mayStore()) - return false; - - // Without a virtual base register, if the function has variable sized - // objects, all fixed-size local references will be via the frame pointer, - // Approximate the offset and see if it's legal for the instruction. - // Note that the incoming offset is based on the SP value at function entry, - // so it'll be negative. - MachineFunction &MF = *MI->getParent()->getParent(); - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Estimate an offset from the frame pointer. - // Conservatively assume all GPR callee-saved registers get pushed. - // FP, LR, X19-X28, D8-D15. 64-bits each. - int64_t FPOffset = Offset - 16 * 20; - // Estimate an offset from the stack pointer. - // The incoming offset is relating to the SP at the start of the function, - // but when we access the local it'll be relative to the SP after local - // allocation, so adjust our SP-relative offset by that allocation size. - Offset += MFI->getLocalFrameSize(); - // Assume that we'll have at least some spill slots allocated. - // FIXME: This is a total SWAG number. We should run some statistics - // and pick a real one. - Offset += 128; // 128 bytes of spill slots - - // If there is a frame pointer, try using it. 
- // The FP is only available if there is no dynamic realignment. We - // don't know for sure yet whether we'll need that, so we guess based - // on whether there are any local variables that would trigger it. - if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset)) - return false; - - // If we can reference via the stack pointer or base pointer, try that. - // FIXME: This (and the code that resolves the references) can be improved - // to only disallow SP relative references in the live range of - // the VLA(s). In practice, it's unclear how much difference that - // would make, but it may be worth doing. - if (isFrameOffsetLegal(MI, Offset)) - return false; - - // The offset likely isn't legal; we want to allocate a virtual base register. - return true; -} - -bool ARM64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, - int64_t Offset) const { - assert(Offset <= INT_MAX && "Offset too big to fit in int."); - assert(MI && "Unable to get the legal offset for nil instruction."); - int SaveOffset = Offset; - return isARM64FrameOffsetLegal(*MI, SaveOffset) & ARM64FrameOffsetIsLegal; -} - -/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx -/// at the beginning of the basic block. -void ARM64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, - unsigned BaseReg, - int FrameIdx, - int64_t Offset) const { - MachineBasicBlock::iterator Ins = MBB->begin(); - DebugLoc DL; // Defaults to "unknown" - if (Ins != MBB->end()) - DL = Ins->getDebugLoc(); - - const MCInstrDesc &MCID = TII->get(ARM64::ADDXri); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - const MachineFunction &MF = *MBB->getParent(); - MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF)); - unsigned Shifter = ARM64_AM::getShifterImm(ARM64_AM::LSL, 0); - - BuildMI(*MBB, Ins, DL, MCID, BaseReg) - .addFrameIndex(FrameIdx) - .addImm(Offset) - .addImm(Shifter); -} - -void ARM64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, - int64_t Offset) const { - int Off = Offset; // ARM doesn't need the general 64-bit offsets - unsigned i = 0; - - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); - } - bool Done = rewriteARM64FrameIndex(MI, i, BaseReg, Off, TII); - assert(Done && "Unable to resolve frame index!"); - (void)Done; -} - -void ARM64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const { - assert(SPAdj == 0 && "Unexpected"); - - MachineInstr &MI = *II; - MachineBasicBlock &MBB = *MI.getParent(); - MachineFunction &MF = *MBB.getParent(); - const ARM64FrameLowering *TFI = static_cast( - MF.getTarget().getFrameLowering()); - - int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); - unsigned FrameReg; - int Offset; - - // Special handling of dbg_value, stackmap and patchpoint instructions. 
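To make the estimates above concrete (editor's arithmetic and toy model, not patch content): the unscaled load/store forms take a 9-bit signed immediate, reaching [-256, +255] bytes, while the scaled forms have a 12-bit unsigned field multiplied by the access size. With an invented incoming SP-relative offset of -40 and a 200-byte local area, needsFrameBaseReg's two estimates come out as FPOffset = -40 - 16*20 = -360, which fits neither form, and Offset = -40 + 200 + 128 = 288, which misses the unscaled range but still fits the scaled form for a 4-byte access (288 = 4 * 72). That is why the real legality test, isARM64FrameOffsetLegal, is per instruction rather than a single range check:

  #include <cstdint>

  // Toy model of "does this frame offset fit the instruction?".
  bool frameOffsetFits(int64_t Off, unsigned AccessSize) {
    bool ScaledOK = Off >= 0 && Off % AccessSize == 0 &&
                    Off / AccessSize <= 4095;   // 12-bit unsigned, scaled
    bool UnscaledOK = Off >= -256 && Off <= 255; // 9-bit signed, unscaled
    return ScaledOK || UnscaledOK;
  }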
- if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || - MI.getOpcode() == TargetOpcode::PATCHPOINT) { - Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, - /*PreferFP=*/true); - Offset += MI.getOperand(FIOperandNum + 1).getImm(); - MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); - MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); - return; - } - - // Modify MI as necessary to handle as much of 'Offset' as possible - Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg); - if (rewriteARM64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII)) - return; - - assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) && - "Emergency spill slot is out of reach"); - - // If we get here, the immediate doesn't fit into the instruction. We folded - // as much as possible above. Handle the rest, providing a register that is - // SP+LargeImm. - unsigned ScratchReg = - MF.getRegInfo().createVirtualRegister(&ARM64::GPR64RegClass); - emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); - MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); -} - -namespace llvm { - -unsigned ARM64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - switch (RC->getID()) { - default: - return 0; - case ARM64::GPR32RegClassID: - case ARM64::GPR32spRegClassID: - case ARM64::GPR32allRegClassID: - case ARM64::GPR64spRegClassID: - case ARM64::GPR64allRegClassID: - case ARM64::GPR64RegClassID: - case ARM64::GPR32commonRegClassID: - case ARM64::GPR64commonRegClassID: - return 32 - 1 // XZR/SP - - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP - - STI->isTargetDarwin() // X18 reserved as platform register - - hasBasePointer(MF); // X19 - case ARM64::FPR8RegClassID: - case ARM64::FPR16RegClassID: - case ARM64::FPR32RegClassID: - case ARM64::FPR64RegClassID: - case ARM64::FPR128RegClassID: - return 32; - - case ARM64::DDRegClassID: - case ARM64::DDDRegClassID: - case ARM64::DDDDRegClassID: - case ARM64::QQRegClassID: - case ARM64::QQQRegClassID: - case ARM64::QQQQRegClassID: - return 32; - - case ARM64::FPR128_loRegClassID: - return 16; - } -} - -} // namespace llvm diff --git a/lib/Target/ARM64/ARM64RegisterInfo.h b/lib/Target/ARM64/ARM64RegisterInfo.h deleted file mode 100644 index 31d9242..0000000 --- a/lib/Target/ARM64/ARM64RegisterInfo.h +++ /dev/null @@ -1,101 +0,0 @@ -//===- ARM64RegisterInfo.h - ARM64 Register Information Impl ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the MRegisterInfo class. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64REGISTERINFO_H -#define LLVM_TARGET_ARM64REGISTERINFO_H - -#define GET_REGINFO_HEADER -#include "ARM64GenRegisterInfo.inc" - -namespace llvm { - -class ARM64InstrInfo; -class ARM64Subtarget; -class MachineFunction; -class RegScavenger; -class TargetRegisterClass; - -struct ARM64RegisterInfo : public ARM64GenRegisterInfo { -private: - const ARM64InstrInfo *TII; - const ARM64Subtarget *STI; - -public: - ARM64RegisterInfo(const ARM64InstrInfo *tii, const ARM64Subtarget *sti); - - bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; - - /// Code Generation virtual methods... - const uint16_t * - getCalleeSavedRegs(const MachineFunction *MF = 0) const override; - const uint32_t *getCallPreservedMask(CallingConv::ID) const override; - - unsigned getCSRFirstUseCost() const { - // The cost will be compared against BlockFrequency where entry has the - // value of 1 << 14. A value of 5 will choose to spill or split really - // cold path instead of using a callee-saved register. - return 5; - } - - // Calls involved in thread-local variable lookup save more registers than - // normal calls, so they need a different mask to represent this. - const uint32_t *getTLSCallPreservedMask() const; - - /// getThisReturnPreservedMask - Returns a call preserved mask specific to the - /// case that 'returned' is on an i64 first argument if the calling convention - /// is one that can (partially) model this attribute with a preserved mask - /// (i.e. it is a calling convention that uses the same register for the first - /// i64 argument and an i64 return value) - /// - /// Should return NULL in the case that the calling convention does not have - /// this property - const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const; - - BitVector getReservedRegs(const MachineFunction &MF) const override; - const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, - unsigned Kind = 0) const override; - const TargetRegisterClass * - getCrossCopyRegClass(const TargetRegisterClass *RC) const override; - - bool requiresRegisterScavenging(const MachineFunction &MF) const override; - bool useFPForScavengingIndex(const MachineFunction &MF) const override; - bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; - - bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; - bool isFrameOffsetLegal(const MachineInstr *MI, - int64_t Offset) const override; - void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg, - int FrameIdx, - int64_t Offset) const override; - void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, - int64_t Offset) const override; - void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS = NULL) const override; - bool cannotEliminateFrame(const MachineFunction &MF) const; - - bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override; - bool hasBasePointer(const MachineFunction &MF) const; - unsigned getBaseRegister() const; - - // Debug information queries. 
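A worked number for getCSRFirstUseCost above (editor's arithmetic, using the scale the comment states): with the entry block pinned at 1 << 14 = 16384, a first-use cost of 5 loses to any block hotter than 5/16384, about 0.03% of entry frequency, so only genuinely cold paths avoid the callee-saved register:

  #include <cassert>
  int main() {
    const unsigned EntryFreq = 1u << 14; // 16384, per the comment above
    const unsigned CSRCost = 5;          // getCSRFirstUseCost()
    assert(EntryFreq / 100 > CSRCost);   // a block at 1% of entry: 163 > 5
  }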
- unsigned getFrameRegister(const MachineFunction &MF) const override; - - unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const override; -}; - -} // end namespace llvm - -#endif // LLVM_TARGET_ARM64REGISTERINFO_H diff --git a/lib/Target/ARM64/ARM64RegisterInfo.td b/lib/Target/ARM64/ARM64RegisterInfo.td deleted file mode 100644 index 96001c5..0000000 --- a/lib/Target/ARM64/ARM64RegisterInfo.td +++ /dev/null @@ -1,561 +0,0 @@ -//===- ARM64RegisterInfo.td - Describe the ARM64 Regisers --*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - - -class ARM64Reg enc, string n, list subregs = [], - list altNames = []> - : Register { - let HWEncoding = enc; - let Namespace = "ARM64"; - let SubRegs = subregs; -} - -let Namespace = "ARM64" in { - def sub_32 : SubRegIndex<32>; - - def bsub : SubRegIndex<8>; - def hsub : SubRegIndex<16>; - def ssub : SubRegIndex<32>; - def dsub : SubRegIndex<32>; - def qhisub : SubRegIndex<64>; - def qsub : SubRegIndex<64>; - // Note: Code depends on these having consecutive numbers - def dsub0 : SubRegIndex<64>; - def dsub1 : SubRegIndex<64>; - def dsub2 : SubRegIndex<64>; - def dsub3 : SubRegIndex<64>; - // Note: Code depends on these having consecutive numbers - def qsub0 : SubRegIndex<128>; - def qsub1 : SubRegIndex<128>; - def qsub2 : SubRegIndex<128>; - def qsub3 : SubRegIndex<128>; -} - -let Namespace = "ARM64" in { - def vreg : RegAltNameIndex; - def vlist1 : RegAltNameIndex; -} - -//===----------------------------------------------------------------------===// -// Registers -//===----------------------------------------------------------------------===// -def W0 : ARM64Reg<0, "w0" >, DwarfRegNum<[0]>; -def W1 : ARM64Reg<1, "w1" >, DwarfRegNum<[1]>; -def W2 : ARM64Reg<2, "w2" >, DwarfRegNum<[2]>; -def W3 : ARM64Reg<3, "w3" >, DwarfRegNum<[3]>; -def W4 : ARM64Reg<4, "w4" >, DwarfRegNum<[4]>; -def W5 : ARM64Reg<5, "w5" >, DwarfRegNum<[5]>; -def W6 : ARM64Reg<6, "w6" >, DwarfRegNum<[6]>; -def W7 : ARM64Reg<7, "w7" >, DwarfRegNum<[7]>; -def W8 : ARM64Reg<8, "w8" >, DwarfRegNum<[8]>; -def W9 : ARM64Reg<9, "w9" >, DwarfRegNum<[9]>; -def W10 : ARM64Reg<10, "w10">, DwarfRegNum<[10]>; -def W11 : ARM64Reg<11, "w11">, DwarfRegNum<[11]>; -def W12 : ARM64Reg<12, "w12">, DwarfRegNum<[12]>; -def W13 : ARM64Reg<13, "w13">, DwarfRegNum<[13]>; -def W14 : ARM64Reg<14, "w14">, DwarfRegNum<[14]>; -def W15 : ARM64Reg<15, "w15">, DwarfRegNum<[15]>; -def W16 : ARM64Reg<16, "w16">, DwarfRegNum<[16]>; -def W17 : ARM64Reg<17, "w17">, DwarfRegNum<[17]>; -def W18 : ARM64Reg<18, "w18">, DwarfRegNum<[18]>; -def W19 : ARM64Reg<19, "w19">, DwarfRegNum<[19]>; -def W20 : ARM64Reg<20, "w20">, DwarfRegNum<[20]>; -def W21 : ARM64Reg<21, "w21">, DwarfRegNum<[21]>; -def W22 : ARM64Reg<22, "w22">, DwarfRegNum<[22]>; -def W23 : ARM64Reg<23, "w23">, DwarfRegNum<[23]>; -def W24 : ARM64Reg<24, "w24">, DwarfRegNum<[24]>; -def W25 : ARM64Reg<25, "w25">, DwarfRegNum<[25]>; -def W26 : ARM64Reg<26, "w26">, DwarfRegNum<[26]>; -def W27 : ARM64Reg<27, "w27">, DwarfRegNum<[27]>; -def W28 : ARM64Reg<28, "w28">, DwarfRegNum<[28]>; -def W29 : ARM64Reg<29, "w29">, DwarfRegNum<[29]>; -def W30 : ARM64Reg<30, "w30">, DwarfRegNum<[30]>; -def WSP : ARM64Reg<31, "wsp">, DwarfRegNum<[31]>; 
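The X registers defined just below declare their W counterpart as a sub_32 sub-register and reuse its DWARF number via DwarfRegAlias. The model matches the architecture: a 32-bit write to Wn zero-extends into Xn. As a plain C++ picture (editor's illustration, not patch content):

  #include <cstdint>

  struct XRegModel {
    uint64_t X;                                // the Xn view
    uint32_t w() const { return uint32_t(X); } // the Wn view (low half)
    void setW(uint32_t V) { X = V; }           // a W-write zero-extends
  };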
-def WZR : ARM64Reg<31, "wzr">, DwarfRegAlias; - -let SubRegIndices = [sub_32] in { -def X0 : ARM64Reg<0, "x0", [W0]>, DwarfRegAlias; -def X1 : ARM64Reg<1, "x1", [W1]>, DwarfRegAlias; -def X2 : ARM64Reg<2, "x2", [W2]>, DwarfRegAlias; -def X3 : ARM64Reg<3, "x3", [W3]>, DwarfRegAlias; -def X4 : ARM64Reg<4, "x4", [W4]>, DwarfRegAlias; -def X5 : ARM64Reg<5, "x5", [W5]>, DwarfRegAlias; -def X6 : ARM64Reg<6, "x6", [W6]>, DwarfRegAlias; -def X7 : ARM64Reg<7, "x7", [W7]>, DwarfRegAlias; -def X8 : ARM64Reg<8, "x8", [W8]>, DwarfRegAlias; -def X9 : ARM64Reg<9, "x9", [W9]>, DwarfRegAlias; -def X10 : ARM64Reg<10, "x10", [W10]>, DwarfRegAlias; -def X11 : ARM64Reg<11, "x11", [W11]>, DwarfRegAlias; -def X12 : ARM64Reg<12, "x12", [W12]>, DwarfRegAlias; -def X13 : ARM64Reg<13, "x13", [W13]>, DwarfRegAlias; -def X14 : ARM64Reg<14, "x14", [W14]>, DwarfRegAlias; -def X15 : ARM64Reg<15, "x15", [W15]>, DwarfRegAlias; -def X16 : ARM64Reg<16, "x16", [W16]>, DwarfRegAlias; -def X17 : ARM64Reg<17, "x17", [W17]>, DwarfRegAlias; -def X18 : ARM64Reg<18, "x18", [W18]>, DwarfRegAlias; -def X19 : ARM64Reg<19, "x19", [W19]>, DwarfRegAlias; -def X20 : ARM64Reg<20, "x20", [W20]>, DwarfRegAlias; -def X21 : ARM64Reg<21, "x21", [W21]>, DwarfRegAlias; -def X22 : ARM64Reg<22, "x22", [W22]>, DwarfRegAlias; -def X23 : ARM64Reg<23, "x23", [W23]>, DwarfRegAlias; -def X24 : ARM64Reg<24, "x24", [W24]>, DwarfRegAlias; -def X25 : ARM64Reg<25, "x25", [W25]>, DwarfRegAlias; -def X26 : ARM64Reg<26, "x26", [W26]>, DwarfRegAlias; -def X27 : ARM64Reg<27, "x27", [W27]>, DwarfRegAlias; -def X28 : ARM64Reg<28, "x28", [W28]>, DwarfRegAlias; -def FP : ARM64Reg<29, "fp", [W29]>, DwarfRegAlias; -def LR : ARM64Reg<30, "lr", [W30]>, DwarfRegAlias; -def SP : ARM64Reg<31, "sp", [WSP]>, DwarfRegAlias; -def XZR : ARM64Reg<31, "xzr", [WZR]>, DwarfRegAlias; -} - -// Condition code register. -def CPSR : ARM64Reg<0, "cpsr">; - -// GPR register classes with the intersections of GPR32/GPR32sp and -// GPR64/GPR64sp for use by the coalescer. -def GPR32common : RegisterClass<"ARM64", [i32], 32, (sequence "W%u", 0, 30)> { - let AltOrders = [(rotl GPR32common, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -def GPR64common : RegisterClass<"ARM64", [i64], 64, - (add (sequence "X%u", 0, 28), FP, LR)> { - let AltOrders = [(rotl GPR64common, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -// GPR register classes which exclude SP/WSP. -def GPR32 : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WZR)> { - let AltOrders = [(rotl GPR32, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -def GPR64 : RegisterClass<"ARM64", [i64], 64, (add GPR64common, XZR)> { - let AltOrders = [(rotl GPR64, 8)]; - let AltOrderSelect = [{ return 1; }]; -} - -// GPR register classes which include SP/WSP. -def GPR32sp : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WSP)> { - let AltOrders = [(rotl GPR32sp, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -def GPR64sp : RegisterClass<"ARM64", [i64], 64, (add GPR64common, SP)> { - let AltOrders = [(rotl GPR64sp, 8)]; - let AltOrderSelect = [{ return 1; }]; -} - -// GPR register classes which include WZR/XZR AND SP/WSP. This is not a -// constraint used by any instructions, it is used as a common super-class. -def GPR32all : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WZR, WSP)>; -def GPR64all : RegisterClass<"ARM64", [i64], 64, (add GPR64common, XZR, SP)>; - -// For tail calls, we can't use callee-saved registers, as they are restored -// to the saved value before the tail call, which would clobber a call address. 
-// This is for indirect tail calls to store the address of the destination. -def tcGPR64 : RegisterClass<"ARM64", [i64], 64, (sub GPR64common, X19, X20, X21, - X22, X23, X24, X25, X26, - X27, X28)>; - -// GPR register classes for post increment ammount of vector load/store that -// has alternate printing when Rm=31 and prints a constant immediate value -// equal to the total number of bytes transferred. -def GPR64pi1 : RegisterOperand; -def GPR64pi2 : RegisterOperand; -def GPR64pi3 : RegisterOperand; -def GPR64pi4 : RegisterOperand; -def GPR64pi6 : RegisterOperand; -def GPR64pi8 : RegisterOperand; -def GPR64pi12 : RegisterOperand; -def GPR64pi16 : RegisterOperand; -def GPR64pi24 : RegisterOperand; -def GPR64pi32 : RegisterOperand; -def GPR64pi48 : RegisterOperand; -def GPR64pi64 : RegisterOperand; - -// Condition code regclass. -def CCR : RegisterClass<"ARM64", [i32], 32, (add CPSR)> { - let CopyCost = -1; // Don't allow copying of status registers. - - // CCR is not allocatable. - let isAllocatable = 0; -} - -//===----------------------------------------------------------------------===// -// Floating Point Scalar Registers -//===----------------------------------------------------------------------===// - -def B0 : ARM64Reg<0, "b0">, DwarfRegNum<[64]>; -def B1 : ARM64Reg<1, "b1">, DwarfRegNum<[65]>; -def B2 : ARM64Reg<2, "b2">, DwarfRegNum<[66]>; -def B3 : ARM64Reg<3, "b3">, DwarfRegNum<[67]>; -def B4 : ARM64Reg<4, "b4">, DwarfRegNum<[68]>; -def B5 : ARM64Reg<5, "b5">, DwarfRegNum<[69]>; -def B6 : ARM64Reg<6, "b6">, DwarfRegNum<[70]>; -def B7 : ARM64Reg<7, "b7">, DwarfRegNum<[71]>; -def B8 : ARM64Reg<8, "b8">, DwarfRegNum<[72]>; -def B9 : ARM64Reg<9, "b9">, DwarfRegNum<[73]>; -def B10 : ARM64Reg<10, "b10">, DwarfRegNum<[74]>; -def B11 : ARM64Reg<11, "b11">, DwarfRegNum<[75]>; -def B12 : ARM64Reg<12, "b12">, DwarfRegNum<[76]>; -def B13 : ARM64Reg<13, "b13">, DwarfRegNum<[77]>; -def B14 : ARM64Reg<14, "b14">, DwarfRegNum<[78]>; -def B15 : ARM64Reg<15, "b15">, DwarfRegNum<[79]>; -def B16 : ARM64Reg<16, "b16">, DwarfRegNum<[80]>; -def B17 : ARM64Reg<17, "b17">, DwarfRegNum<[81]>; -def B18 : ARM64Reg<18, "b18">, DwarfRegNum<[82]>; -def B19 : ARM64Reg<19, "b19">, DwarfRegNum<[83]>; -def B20 : ARM64Reg<20, "b20">, DwarfRegNum<[84]>; -def B21 : ARM64Reg<21, "b21">, DwarfRegNum<[85]>; -def B22 : ARM64Reg<22, "b22">, DwarfRegNum<[86]>; -def B23 : ARM64Reg<23, "b23">, DwarfRegNum<[87]>; -def B24 : ARM64Reg<24, "b24">, DwarfRegNum<[88]>; -def B25 : ARM64Reg<25, "b25">, DwarfRegNum<[89]>; -def B26 : ARM64Reg<26, "b26">, DwarfRegNum<[90]>; -def B27 : ARM64Reg<27, "b27">, DwarfRegNum<[91]>; -def B28 : ARM64Reg<28, "b28">, DwarfRegNum<[92]>; -def B29 : ARM64Reg<29, "b29">, DwarfRegNum<[93]>; -def B30 : ARM64Reg<30, "b30">, DwarfRegNum<[94]>; -def B31 : ARM64Reg<31, "b31">, DwarfRegNum<[95]>; - -let SubRegIndices = [bsub] in { -def H0 : ARM64Reg<0, "h0", [B0]>, DwarfRegAlias; -def H1 : ARM64Reg<1, "h1", [B1]>, DwarfRegAlias; -def H2 : ARM64Reg<2, "h2", [B2]>, DwarfRegAlias; -def H3 : ARM64Reg<3, "h3", [B3]>, DwarfRegAlias; -def H4 : ARM64Reg<4, "h4", [B4]>, DwarfRegAlias; -def H5 : ARM64Reg<5, "h5", [B5]>, DwarfRegAlias; -def H6 : ARM64Reg<6, "h6", [B6]>, DwarfRegAlias; -def H7 : ARM64Reg<7, "h7", [B7]>, DwarfRegAlias; -def H8 : ARM64Reg<8, "h8", [B8]>, DwarfRegAlias; -def H9 : ARM64Reg<9, "h9", [B9]>, DwarfRegAlias; -def H10 : ARM64Reg<10, "h10", [B10]>, DwarfRegAlias; -def H11 : ARM64Reg<11, "h11", [B11]>, DwarfRegAlias; -def H12 : ARM64Reg<12, "h12", [B12]>, DwarfRegAlias; -def H13 : ARM64Reg<13, 
"h13", [B13]>, DwarfRegAlias; -def H14 : ARM64Reg<14, "h14", [B14]>, DwarfRegAlias; -def H15 : ARM64Reg<15, "h15", [B15]>, DwarfRegAlias; -def H16 : ARM64Reg<16, "h16", [B16]>, DwarfRegAlias; -def H17 : ARM64Reg<17, "h17", [B17]>, DwarfRegAlias; -def H18 : ARM64Reg<18, "h18", [B18]>, DwarfRegAlias; -def H19 : ARM64Reg<19, "h19", [B19]>, DwarfRegAlias; -def H20 : ARM64Reg<20, "h20", [B20]>, DwarfRegAlias; -def H21 : ARM64Reg<21, "h21", [B21]>, DwarfRegAlias; -def H22 : ARM64Reg<22, "h22", [B22]>, DwarfRegAlias; -def H23 : ARM64Reg<23, "h23", [B23]>, DwarfRegAlias; -def H24 : ARM64Reg<24, "h24", [B24]>, DwarfRegAlias; -def H25 : ARM64Reg<25, "h25", [B25]>, DwarfRegAlias; -def H26 : ARM64Reg<26, "h26", [B26]>, DwarfRegAlias; -def H27 : ARM64Reg<27, "h27", [B27]>, DwarfRegAlias; -def H28 : ARM64Reg<28, "h28", [B28]>, DwarfRegAlias; -def H29 : ARM64Reg<29, "h29", [B29]>, DwarfRegAlias; -def H30 : ARM64Reg<30, "h30", [B30]>, DwarfRegAlias; -def H31 : ARM64Reg<31, "h31", [B31]>, DwarfRegAlias; -} - -let SubRegIndices = [hsub] in { -def S0 : ARM64Reg<0, "s0", [H0]>, DwarfRegAlias; -def S1 : ARM64Reg<1, "s1", [H1]>, DwarfRegAlias; -def S2 : ARM64Reg<2, "s2", [H2]>, DwarfRegAlias; -def S3 : ARM64Reg<3, "s3", [H3]>, DwarfRegAlias; -def S4 : ARM64Reg<4, "s4", [H4]>, DwarfRegAlias; -def S5 : ARM64Reg<5, "s5", [H5]>, DwarfRegAlias; -def S6 : ARM64Reg<6, "s6", [H6]>, DwarfRegAlias; -def S7 : ARM64Reg<7, "s7", [H7]>, DwarfRegAlias; -def S8 : ARM64Reg<8, "s8", [H8]>, DwarfRegAlias; -def S9 : ARM64Reg<9, "s9", [H9]>, DwarfRegAlias; -def S10 : ARM64Reg<10, "s10", [H10]>, DwarfRegAlias; -def S11 : ARM64Reg<11, "s11", [H11]>, DwarfRegAlias; -def S12 : ARM64Reg<12, "s12", [H12]>, DwarfRegAlias; -def S13 : ARM64Reg<13, "s13", [H13]>, DwarfRegAlias; -def S14 : ARM64Reg<14, "s14", [H14]>, DwarfRegAlias; -def S15 : ARM64Reg<15, "s15", [H15]>, DwarfRegAlias; -def S16 : ARM64Reg<16, "s16", [H16]>, DwarfRegAlias; -def S17 : ARM64Reg<17, "s17", [H17]>, DwarfRegAlias; -def S18 : ARM64Reg<18, "s18", [H18]>, DwarfRegAlias; -def S19 : ARM64Reg<19, "s19", [H19]>, DwarfRegAlias; -def S20 : ARM64Reg<20, "s20", [H20]>, DwarfRegAlias; -def S21 : ARM64Reg<21, "s21", [H21]>, DwarfRegAlias; -def S22 : ARM64Reg<22, "s22", [H22]>, DwarfRegAlias; -def S23 : ARM64Reg<23, "s23", [H23]>, DwarfRegAlias; -def S24 : ARM64Reg<24, "s24", [H24]>, DwarfRegAlias; -def S25 : ARM64Reg<25, "s25", [H25]>, DwarfRegAlias; -def S26 : ARM64Reg<26, "s26", [H26]>, DwarfRegAlias; -def S27 : ARM64Reg<27, "s27", [H27]>, DwarfRegAlias; -def S28 : ARM64Reg<28, "s28", [H28]>, DwarfRegAlias; -def S29 : ARM64Reg<29, "s29", [H29]>, DwarfRegAlias; -def S30 : ARM64Reg<30, "s30", [H30]>, DwarfRegAlias; -def S31 : ARM64Reg<31, "s31", [H31]>, DwarfRegAlias; -} - -let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in { -def D0 : ARM64Reg<0, "d0", [S0], ["v0", ""]>, DwarfRegAlias; -def D1 : ARM64Reg<1, "d1", [S1], ["v1", ""]>, DwarfRegAlias; -def D2 : ARM64Reg<2, "d2", [S2], ["v2", ""]>, DwarfRegAlias; -def D3 : ARM64Reg<3, "d3", [S3], ["v3", ""]>, DwarfRegAlias; -def D4 : ARM64Reg<4, "d4", [S4], ["v4", ""]>, DwarfRegAlias; -def D5 : ARM64Reg<5, "d5", [S5], ["v5", ""]>, DwarfRegAlias; -def D6 : ARM64Reg<6, "d6", [S6], ["v6", ""]>, DwarfRegAlias; -def D7 : ARM64Reg<7, "d7", [S7], ["v7", ""]>, DwarfRegAlias; -def D8 : ARM64Reg<8, "d8", [S8], ["v8", ""]>, DwarfRegAlias; -def D9 : ARM64Reg<9, "d9", [S9], ["v9", ""]>, DwarfRegAlias; -def D10 : ARM64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias; -def D11 : ARM64Reg<11, "d11", [S11], ["v11", ""]>, 
DwarfRegAlias; -def D12 : ARM64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias; -def D13 : ARM64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias; -def D14 : ARM64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias; -def D15 : ARM64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias; -def D16 : ARM64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias; -def D17 : ARM64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias; -def D18 : ARM64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias; -def D19 : ARM64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias; -def D20 : ARM64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias; -def D21 : ARM64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias; -def D22 : ARM64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias; -def D23 : ARM64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias; -def D24 : ARM64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias; -def D25 : ARM64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias; -def D26 : ARM64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias; -def D27 : ARM64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias; -def D28 : ARM64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias; -def D29 : ARM64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias; -def D30 : ARM64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias; -def D31 : ARM64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias; -} - -let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in { -def Q0 : ARM64Reg<0, "q0", [D0], ["v0", ""]>, DwarfRegAlias; -def Q1 : ARM64Reg<1, "q1", [D1], ["v1", ""]>, DwarfRegAlias; -def Q2 : ARM64Reg<2, "q2", [D2], ["v2", ""]>, DwarfRegAlias; -def Q3 : ARM64Reg<3, "q3", [D3], ["v3", ""]>, DwarfRegAlias; -def Q4 : ARM64Reg<4, "q4", [D4], ["v4", ""]>, DwarfRegAlias; -def Q5 : ARM64Reg<5, "q5", [D5], ["v5", ""]>, DwarfRegAlias; -def Q6 : ARM64Reg<6, "q6", [D6], ["v6", ""]>, DwarfRegAlias; -def Q7 : ARM64Reg<7, "q7", [D7], ["v7", ""]>, DwarfRegAlias; -def Q8 : ARM64Reg<8, "q8", [D8], ["v8", ""]>, DwarfRegAlias; -def Q9 : ARM64Reg<9, "q9", [D9], ["v9", ""]>, DwarfRegAlias; -def Q10 : ARM64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias; -def Q11 : ARM64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias; -def Q12 : ARM64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias; -def Q13 : ARM64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias; -def Q14 : ARM64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias; -def Q15 : ARM64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias; -def Q16 : ARM64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias; -def Q17 : ARM64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias; -def Q18 : ARM64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias; -def Q19 : ARM64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias; -def Q20 : ARM64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias; -def Q21 : ARM64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias; -def Q22 : ARM64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias; -def Q23 : ARM64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias; -def Q24 : ARM64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias; -def Q25 : ARM64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias; -def Q26 : ARM64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias; -def Q27 : ARM64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias; -def Q28 : ARM64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias; -def Q29 : ARM64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias; -def Q30 : ARM64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias; -def Q31 : ARM64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias; -} - -def FPR8 : 
RegisterClass<"ARM64", [untyped], 8, (sequence "B%u", 0, 31)> { - let Size = 8; -} -def FPR16 : RegisterClass<"ARM64", [untyped], 16, (sequence "H%u", 0, 31)> { - let Size = 16; -} -def FPR32 : RegisterClass<"ARM64", [f32, i32], 32,(sequence "S%u", 0, 31)>; -def FPR64 : RegisterClass<"ARM64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, - v1i64], - 64, (sequence "D%u", 0, 31)>; -// We don't (yet) have an f128 legal type, so don't use that here. We -// normalize 128-bit vectors to v2f64 for arg passing and such, so use -// that here. -def FPR128 : RegisterClass<"ARM64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128], - 128, (sequence "Q%u", 0, 31)>; - -// The lower 16 vector registers. Some instructions can only take registers -// in this range. -def FPR128_lo : RegisterClass<"ARM64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, (trunc FPR128, 16)>; - -// Pairs, triples, and quads of 64-bit vector registers. -def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>; -def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2], - [(rotl FPR64, 0), (rotl FPR64, 1), - (rotl FPR64, 2)]>; -def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3], - [(rotl FPR64, 0), (rotl FPR64, 1), - (rotl FPR64, 2), (rotl FPR64, 3)]>; -def DD : RegisterClass<"ARM64", [untyped], 64, (add DSeqPairs)> { - let Size = 128; -} -def DDD : RegisterClass<"ARM64", [untyped], 64, (add DSeqTriples)> { - let Size = 196; -} -def DDDD : RegisterClass<"ARM64", [untyped], 64, (add DSeqQuads)> { - let Size = 256; -} - -// Pairs, triples, and quads of 128-bit vector registers. -def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>; -def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2], - [(rotl FPR128, 0), (rotl FPR128, 1), - (rotl FPR128, 2)]>; -def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3], - [(rotl FPR128, 0), (rotl FPR128, 1), - (rotl FPR128, 2), (rotl FPR128, 3)]>; -def QQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqPairs)> { - let Size = 256; -} -def QQQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqTriples)> { - let Size = 384; -} -def QQQQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqQuads)> { - let Size = 512; -} - - -// Vector operand versions of the FP registers. Alternate name printing and -// assmebler matching. -def VectorRegAsmOperand : AsmOperandClass { let Name = "VectorReg"; } -let ParserMatchClass = VectorRegAsmOperand in { -def V64 : RegisterOperand; -def V128 : RegisterOperand; -def V128_lo : RegisterOperand; -} - -class TypedVecListAsmOperand - : AsmOperandClass { - let Name = "TypedVectorList" # count # "_" # lanes # kind; - - let PredicateMethod - = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>"; - let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">"; -} - -class TypedVecListRegOperand - : RegisterOperand">; - -multiclass VectorList { - // With implicit types (probably on instruction instead). E.g. 
{ v0, v1 } - def _64AsmOperand : AsmOperandClass { - let Name = NAME # "64"; - let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; - let RenderMethod = "addVectorList64Operands<" # count # ">"; - } - - def "64" : RegisterOperand { - let ParserMatchClass = !cast(NAME # "_64AsmOperand"); - } - - def _128AsmOperand : AsmOperandClass { - let Name = NAME # "128"; - let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; - let RenderMethod = "addVectorList128Operands<" # count # ">"; - } - - def "128" : RegisterOperand { - let ParserMatchClass = !cast(NAME # "_128AsmOperand"); - } - - // 64-bit register lists with explicit type. - - // { v0.8b, v1.8b } - def _8bAsmOperand : TypedVecListAsmOperand; - def "8b" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_8bAsmOperand"); - } - - // { v0.4h, v1.4h } - def _4hAsmOperand : TypedVecListAsmOperand; - def "4h" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_4hAsmOperand"); - } - - // { v0.2s, v1.2s } - def _2sAsmOperand : TypedVecListAsmOperand; - def "2s" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_2sAsmOperand"); - } - - // { v0.1d, v1.1d } - def _1dAsmOperand : TypedVecListAsmOperand; - def "1d" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_1dAsmOperand"); - } - - // 128-bit register lists with explicit type - - // { v0.16b, v1.16b } - def _16bAsmOperand : TypedVecListAsmOperand; - def "16b" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_16bAsmOperand"); - } - - // { v0.8h, v1.8h } - def _8hAsmOperand : TypedVecListAsmOperand; - def "8h" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_8hAsmOperand"); - } - - // { v0.4s, v1.4s } - def _4sAsmOperand : TypedVecListAsmOperand; - def "4s" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_4sAsmOperand"); - } - - // { v0.2d, v1.2d } - def _2dAsmOperand : TypedVecListAsmOperand; - def "2d" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_2dAsmOperand"); - } - - // { v0.b, v1.b } - def _bAsmOperand : TypedVecListAsmOperand; - def "b" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_bAsmOperand"); - } - - // { v0.h, v1.h } - def _hAsmOperand : TypedVecListAsmOperand; - def "h" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_hAsmOperand"); - } - - // { v0.s, v1.s } - def _sAsmOperand : TypedVecListAsmOperand; - def "s" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_sAsmOperand"); - } - - // { v0.d, v1.d } - def _dAsmOperand : TypedVecListAsmOperand; - def "d" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_dAsmOperand"); - } - - -} - -defm VecListOne : VectorList<1, FPR64, FPR128>; -defm VecListTwo : VectorList<2, DD, QQ>; -defm VecListThree : VectorList<3, DDD, QQQ>; -defm VecListFour : VectorList<4, DDDD, QQQQ>; - - -// Register operand versions of the scalar FP registers. -def FPR16Op : RegisterOperand; -def FPR32Op : RegisterOperand; -def FPR64Op : RegisterOperand; -def FPR128Op : RegisterOperand; diff --git a/lib/Target/ARM64/ARM64SchedCyclone.td b/lib/Target/ARM64/ARM64SchedCyclone.td deleted file mode 100644 index 65c68b3..0000000 --- a/lib/Target/ARM64/ARM64SchedCyclone.td +++ /dev/null @@ -1,852 +0,0 @@ -//=- ARMSchedCyclone.td - ARM64 Cyclone Scheduling Defs ------*- tablegen -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file defines the machine model for ARM64 Cyclone to support -// instruction scheduling and other instruction cost heuristics. -// -//===----------------------------------------------------------------------===// - -def CycloneModel : SchedMachineModel { - let IssueWidth = 6; // 6 micro-ops are dispatched per cycle. - let MicroOpBufferSize = 192; // Based on the reorder buffer. - let LoadLatency = 4; // Optimistic load latency. - let MispredictPenalty = 16; // 14-19 cycles are typical. -} - -//===----------------------------------------------------------------------===// -// Define each kind of processor resource and number available on Cyclone. - -// 4 integer pipes -def CyUnitI : ProcResource<4> { - let BufferSize = 48; -} - -// 2 branch units: I[0..1] -def CyUnitB : ProcResource<2> { - let Super = CyUnitI; - let BufferSize = 24; -} - -// 1 indirect-branch unit: I[0] -def CyUnitBR : ProcResource<1> { - let Super = CyUnitB; -} - -// 2 shifter pipes: I[2..3] -// When an instruction consumes a CyUnitIS, it also consumes a CyUnitI -def CyUnitIS : ProcResource<2> { - let Super = CyUnitI; - let BufferSize = 24; -} - -// 1 mul pipe: I[0] -def CyUnitIM : ProcResource<1> { - let Super = CyUnitBR; - let BufferSize = 32; -} - -// 1 div pipe: I[1] -def CyUnitID : ProcResource<1> { - let Super = CyUnitB; - let BufferSize = 16; -} - -// 1 integer division unit. This is driven by the ID pipe, but only -// consumes the pipe for one cycle at issue and another cycle at writeback. -def CyUnitIntDiv : ProcResource<1>; - -// 2 ld/st pipes. -def CyUnitLS : ProcResource<2> { - let BufferSize = 28; -} - -// 3 fp/vector pipes. -def CyUnitV : ProcResource<3> { - let BufferSize = 48; -} -// 2 fp/vector arithmetic and multiply pipes: V[0-1] -def CyUnitVM : ProcResource<2> { - let Super = CyUnitV; - let BufferSize = 32; -} -// 1 fp/vector division/sqrt pipe: V[2] -def CyUnitVD : ProcResource<1> { - let Super = CyUnitV; - let BufferSize = 16; -} -// 1 fp compare pipe: V[0] -def CyUnitVC : ProcResource<1> { - let Super = CyUnitVM; - let BufferSize = 16; -} - -// 2 fp division/square-root units. These are driven by the VD pipe, -// but only consume the pipe for one cycle at issue and a cycle at writeback. -def CyUnitFloatDiv : ProcResource<2>; - -//===----------------------------------------------------------------------===// -// Define scheduler read/write resources and latency on Cyclone. -// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1. - -let SchedModel = CycloneModel in { - -//--- -// 7.8.1. Moves -//--- - -// A single nop micro-op (uX). -def WriteX : SchedWriteRes<[]> { let Latency = 0; } - -// Move zero is a register rename (to machine register zero). -// The move is replaced by a single nop micro-op. -// MOVZ Rd, #0 -// AND Rd, Rzr, #imm -def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>; -def WriteImmZ : SchedWriteVariant<[ - SchedVar, - SchedVar]>; -def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>; - -// Move GPR is a register rename and single nop micro-op. -// ORR Xd, XZR, Xm -// ADD Xd, Xn, #0 -def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>; -def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>; -def WriteMov : SchedWriteVariant<[ - SchedVar, - SchedVar, - SchedVar]>; -def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>; - -// Move non-zero immediate is an integer ALU op. -// MOVN,MOVZ,MOVK -def : WriteRes; - -//--- -// 7.8.2-7.8.5. 
Arithmetic and Logical, Comparison, Conditional, -// Shifts and Bitfield Operations -//--- - -// ADR,ADRP -// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri -// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr -// ADC(S),SBC(S) -// Aliases: CMN, CMP, TST -// -// Conditional operations. -// CCMNi,CCMPi,CCMNr,CCMPr, -// CSEL,CSINC,CSINV,CSNEG -// -// Bit counting and reversal operations. -// CLS,CLZ,RBIT,REV,REV16,REV32 -def : WriteRes; - -// ADD with shifted register operand is a single micro-op that -// consumes a shift pipeline for two cycles. -// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs -// EXAMPLE: ADDrs Xn, Xm LSL #imm -def : WriteRes { - let Latency = 2; - let ResourceCycles = [2]; -} - -// ADD with extended register operand is the same as shifted reg operand. -// ADD(S)re,SUB(S)re -// EXAMPLE: ADDXre Xn, Xm, UXTB #1 -def : WriteRes { - let Latency = 2; - let ResourceCycles = [2]; -} - -// Variable shift and bitfield operations. -// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM -def : WriteRes; - -// EXTR Shifts a pair of registers and requires two micro-ops. -// The second micro-op is delayed, as modeled by ReadExtrHi. -// EXTR Xn, Xm, #imm -def : WriteRes { - let Latency = 2; - let NumMicroOps = 2; -} - -// EXTR's first register read is delayed by one cycle, effectively -// shortening its writer's latency. -// EXTR Xn, Xm, #imm -def : ReadAdvance; - -//--- -// 7.8.6. Multiplies -//--- - -// MUL/MNEG are aliases for MADD/MSUB. -// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL -def : WriteRes { - let Latency = 4; -} -// MADDX,MSUBX,SMULH,UMULH -def : WriteRes { - let Latency = 5; -} - -//--- -// 7.8.7. Divide -//--- - -// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient. -// The ID pipe is consumed for 2 cycles: issue and writeback. -// SDIVW,UDIVW -def : WriteRes { - let Latency = 10; - let ResourceCycles = [2, 10]; -} -// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient. -// The ID pipe is consumed for 2 cycles: issue and writeback. -// SDIVX,UDIVX -def : WriteRes { - let Latency = 13; - let ResourceCycles = [2, 13]; -} - -//--- -// 7.8.8,7.8.10. Load/Store, single element -//--- - -// Integer loads take 4 cycles and use one LS unit for one cycle. -def : WriteRes { - let Latency = 4; -} - -// Store-load forwarding is 4 cycles. -// -// Note: The store-exclusive sequence incorporates this -// latency. However, general heuristics should not model the -// dependence between a store and subsequent may-alias load because -// hardware speculation works. -def : WriteRes { - let Latency = 4; -} - -// Load from base address plus an optionally scaled register offset. -// Rt latency is latency WriteIS + WriteLD. -// EXAMPLE: LDR Xn, Xm [, lsl 3] -def CyWriteLDIdx : SchedWriteVariant<[ - SchedVar, // Load from scaled register. - SchedVar]>; // Load from register offset. -def : SchedAlias; // Map ARM64->Cyclone type. - -// EXAMPLE: STR Xn, Xm [, lsl 3] -def CyWriteSTIdx : SchedWriteVariant<[ - SchedVar, // Store to scaled register. - SchedVar]>; // Store to register offset. -def : SchedAlias; // Map ARM64->Cyclone type. - -// Read the (unshifted) base register Xn in the second micro-op one cycle later. -// EXAMPLE: LDR Xn, Xm [, lsl 3] -def ReadBaseRS : SchedReadAdvance<1>; -def CyReadAdrBase : SchedReadVariant<[ - SchedVar, // Read base reg after shifting offset. - SchedVar]>; // Read base reg with no shift. -def : SchedAlias; // Map ARM64->Cyclone type. - -//--- -// 7.8.9,7.8.11. 
Load/Store, paired -//--- - -// Address pre/post increment is a simple ALU op with one cycle latency. -def : WriteRes; - -// LDP high register write is fused with the load, but a nop micro-op remains. -def : WriteRes { - let Latency = 4; -} - -// STP is a vector op and store, except for QQ, which is just two stores. -def : SchedAlias; -def : InstRW<[WriteST, WriteST], (instrs STPQi)>; - -//--- -// 7.8.13. Branches -//--- - -// Branches take a single micro-op. -// The misprediction penalty is defined as a SchedMachineModel property. -def : WriteRes {let Latency = 0;} -def : WriteRes {let Latency = 0;} - -//--- -// 7.8.14. Never-issued Instructions, Barrier and Hint Operations -//--- - -// NOP,SEV,SEVL,WFE,WFI,YIELD -def : WriteRes {let Latency = 0;} -// ISB -def : InstRW<[WriteI], (instrs ISB)>; -// SLREX,DMB,DSB -def : WriteRes; - -// System instructions get an invalid latency because the latency of -// other operations across them is meaningless. -def : WriteRes {let Latency = -1;} - -//===----------------------------------------------------------------------===// -// 7.9 Vector Unit Instructions - -// Simple vector operations take 2 cycles. -def : WriteRes {let Latency = 2;} - -// Define some longer latency vector op types for Cyclone. -def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;} -def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;} -def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;} -def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;} - -// Simple floating-point operations take 2 cycles. -def : WriteRes {let Latency = 2;} - -//--- -// 7.9.1 Vector Moves -//--- - -// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently -// generates expensive int-float conversion instead: -// FMOVDi Dd, #0.0 -// FMOVv2f64ns Vd.2d, #0.0 - -// FMOVSi,FMOVDi -def : WriteRes {let Latency = 2;} - -// MOVI,MVNI are WriteV -// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV - -// Move FPR is a register rename and single nop micro-op. -// ORR.16b Vd,Vn,Vn -// COPY is handled above in the WriteMov Variant. -def WriteVMov : SchedWriteVariant<[ - SchedVar, - SchedVar]>; -def : InstRW<[WriteVMov], (instrs ORRv16i8)>; - -// FMOVSr,FMOVDr are WriteF. - -// MOV V,V is a WriteV. - -// CPY D,V[x] is a WriteV - -// INS V[x],V[y] is a WriteV. - -// FMOVWSr,FMOVXDr,FMOVXDHighr -def : SchedAlias; - -// FMOVSWr,FMOVDXr -def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>; - -// INS V[x],R -def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>; -def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>; - -// SMOV,UMOV R,V[x] -def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>; -def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>; - -// DUP V,R -def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>; - -// DUP V,V[x] is a WriteV. - -//--- -// 7.9.2 Integer Arithmetic, Logical, and Comparisons -//--- - -// BIC,ORR V,#imm are WriteV - -def : InstRW<[CyWriteV3], (instregex "ABSv")>; - -// MVN,NEG,NOT are WriteV - -def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>; - -// ADDP is a WriteV. -def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;} -def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>; - -def : InstRW<[CyWriteV3], - (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>; - -def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>; - -// ADD,SUB are WriteV - -// Forward declare. -def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;} - -// Add/Diff and accumulate uses the vector multiply unit. 
-def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;} -def CyReadVAccum : SchedReadAdvance<1, - [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>; - -def : InstRW<[CyWriteVAccum, CyReadVAccum], - (instregex "SADALP","UADALP")>; - -def : InstRW<[CyWriteVAccum, CyReadVAccum], - (instregex "SABAv","UABAv","SABALv","UABALv")>; - -def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>; - -def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>; - -def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>; - -// WriteV includes: -// AND,BIC,CMTST,EOR,ORN,ORR -// ADDP -// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD -// SADDL,SSUBL,UADDL,USUBL -// SADDW,SSUBW,UADDW,USUBW - -def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv", - "CMLEv","CMLTv", - "CMHIv","CMHSv")>; - -def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv", - "SMAXPv","SMINPv","UMAXPv","UMINPv")>; - -def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv", - "SABDLv","UABDLv")>; - -//--- -// 7.9.3 Floating Point Arithmetic and Comparisons -//--- - -// FABS,FNEG are WriteF - -def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>; -def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>; - -def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i", - "FMINPv2i","FMINNMPv2i")>; - -def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>; - -def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32, - FSUBSrr,FSUBv2f32,FSUBv4f32, - FADDPv2f32,FADDPv4f32, - FABD32,FABDv2f32,FABDv4f32)>; -def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64, - FSUBDrr,FSUBv2f64, - FADDPv2f64, - FABD64,FABDv2f64)>; - -def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>; - -def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT", - "FMAXS","FMAXD","FMAXv", - "FMINS","FMIND","FMINv", - "FMAXNMS","FMAXNMD","FMAXNMv", - "FMINNMS","FMINNMD","FMINNMv", - "FMAXPv2f","FMAXPv4f", - "FMINPv2f","FMINPv4f", - "FMAXNMPv2f","FMAXNMPv4f", - "FMINNMPv2f","FMINNMPv4f")>; - -// FCMP,FCMPE,FCCMP,FCCMPE -def : WriteRes {let Latency = 4;} - -// FCSEL is a WriteF. - -//--- -// 7.9.4 Shifts and Bitfield Operations -//--- - -// SHL is a WriteV - -def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;} -def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>; - -def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;} -def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>; - -// Shift and accumulate uses the vector multiply unit. -def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;} -def CyReadVShiftAcc : SchedReadAdvance<1, - [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>; -def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc], - (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>; - -// SSHL,USHL are WriteV. - -def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>; - -// SQSHL,SQSHLU,UQSHL are WriteV. - -def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>; - -// WriteV includes: -// SHLL,SSHLL,USHLL -// SLI,SRI -// BIF,BIT,BSL -// EXT -// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN -// XTN2 - -def : InstRW<[CyWriteV4], - (instregex "RSHRNv","SHRNv", - "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv", - "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>; - -//--- -// 7.9.5 Multiplication -//--- - -def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;} -def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv", - "SQDMULLv","SQDMULHv","SQRDMULHv")>; - -// FMUL,FMULX,FNMUL default to WriteFMul. 
-def : WriteRes { let Latency = 4;} - -def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;} -def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed, - FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>; - -def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>; -def : InstRW<[CyWriteVMul, CyReadVMulAcc], - (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL", - "SQDMLAL","SQDMLSL")>; - -def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;} -def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;} -def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>; -def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>; - -def : InstRW<[CyWriteSMul, CyReadSMul], - (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr, - FMLAv2f32,FMLAv4f32, - FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>; -def : InstRW<[CyWriteDMul, CyReadDMul], - (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr, - FMLAv2f64,FMLAv2i64_indexed, - FMLSv2f64,FMLSv2i64_indexed)>; - -def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; } -def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>; - -//--- -// 7.9.6 Divide and Square Root -//--- - -// FDIV,FSQRT -// TODO: Add 64-bit variant with 19 cycle latency. -// TODO: Specialize FSQRT for longer latency. -def : WriteRes { - let Latency = 17; - let ResourceCycles = [2, 17]; -} - -def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>; - -def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; } -def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>; - -def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; } -def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; } -def : InstRW<[WriteFRECPS], (instregex "FRECPSv")>; -def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>; - -//--- -// 7.9.7 Integer-FP Conversions -//--- - -// FCVT lengthen f16/s32 -def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>; - -// FCVT,FCVTN,FCVTXN -// SCVTF,UCVTF V,V -// FRINT(AIMNPXZ) V,V -def : WriteRes {let Latency = 4;} - -// SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles. -def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>; -def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>; - -// FCVT Rd, S/D = V6+LD4: 10 cycles -def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>; -def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>; - -// FCVTL is a WriteV - -//--- -// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup -//--- - -def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;} -def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr, - AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr, - SHA1SU0rrr)>; - -def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;} -def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>; - -def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;} -def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr, - SHA256Hrrr,SHA256H2rrr)>; - -// TRN,UZP,ZUP are WriteV. - -// TBL,TBX are WriteV. - -//--- -// 7.9.11-7.9.14 Load/Store, single element and paired -//--- - -// Loading into the vector unit takes 5 cycles vs 4 for integer loads. -def : WriteRes { - let Latency = 5; -} - -// Store-load forwarding is 4 cycles. -def : WriteRes { - let Latency = 4; -} - -// WriteVLDPair/VSTPair sequences are expanded by the target description. - -//--- -// 7.9.15 Load, element operations -//--- - -// Only the first WriteVLD and WriteAdr for writeback matches def operands. 
-// Subsequent WriteVLDs consume resources. Since all loaded values have the -// same latency, this is acceptable. - -// Vd is read 5 cycles after issuing the vector load. -def : ReadAdvance; - -def : InstRW<[WriteVLD], - (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD, WriteAdr], - (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; - -// Register writes from the load's high half are fused micro-ops. -def : InstRW<[WriteVLD], - (instregex "LD1Twov(8b|4h|2s|1d)$")>; -def : InstRW<[WriteVLD, WriteAdr], - (instregex "LD1Twov(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVLD, WriteVLD], - (instregex "LD1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD], - (instregex "LD1Twov(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLD, WriteVLD], - (instregex "LD1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD], - (instregex "LD1Threev(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVLD, WriteVLD, WriteVLD], - (instregex "LD1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD], - (instregex "LD1Threev(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLD, WriteVLD], - (instregex "LD1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD], - (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD], - (instregex "LD1Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD], - (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD], - (instregex "LD1i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr], - (instregex "LD1i(8|16|32)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD], (instrs LD1i64)>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>; - -def : InstRW<[WriteVLDShuffle], - (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr], - (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[WriteVLDShuffle, WriteV], - (instregex "LD2Twov(8b|4h|2s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], - (instregex "LD2Twov(8b|4h|2s)_POST$")>; -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle], - (instregex "LD2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle], - (instregex "LD2Twov(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], - (instregex "LD2i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], - (instregex "LD2i(8|16|32)_POST")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], - (instregex "LD2i64$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], - (instregex "LD2i64_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteV], - (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], - (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], - (instregex "LD3Threev(8b|4h|2s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], - (instregex "LD3Threev(8b|4h|2s)_POST")>; -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle], - (instregex "LD3Threev(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle], - (instregex "LD3Threev(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV], - (instregex "LD3i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV], - (instregex "LD3i(8|16|32)_POST")>; - -def : 
InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV], - (instregex "LD3i64$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV], - (instregex "LD3i64_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteV, WriteV], - (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV], - (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], - (instrs LD3Rv1d,LD3Rv2d)>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], - (instrs LD3Rv1d_POST,LD3Rv2d_POST)>; - -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], - (instregex "LD4Fourv(8b|4h|2s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV], - (instregex "LD4Fourv(8b|4h|2s)_POST")>; -def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle, - WriteVLDPairShuffle, WriteVLDPairShuffle], - (instregex "LD4Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle, - WriteVLDPairShuffle, WriteVLDPairShuffle], - (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV], - (instregex "LD4i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV], - (instregex "LD4i(8|16|32)_POST")>; - - -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV], - (instrs LD4i64)>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV], - (instrs LD4i64_POST)>; - -def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV], - (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV], - (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], - (instrs LD4Rv1d,LD4Rv2d)>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV], - (instrs LD4Rv1d_POST,LD4Rv2d_POST)>; - -//--- -// 7.9.16 Store, element operations -//--- - -// Only the WriteAdr for writeback matches a def operand. -// Subsequent WriteVSTs only consume resources.
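The distinction drawn in the comment above, where only writes mapped to def operands carry latency while the remaining writes just occupy pipeline resources, is visible from the C++ side through the TargetSchedule API that this same patch uses in ARM64StorePairSuppress.cpp further down. Below is a minimal sketch of tallying those per-resource cycles for an opcode; the helper name resourceCyclesForOpcode is ours, everything else follows the LLVM 3.5 API as exercised elsewhere in this patch.

    #include "llvm/CodeGen/TargetSchedule.h"
    #include "llvm/MC/MCSchedule.h"
    #include "llvm/Target/TargetInstrInfo.h"
    using namespace llvm;

    static unsigned resourceCyclesForOpcode(const TargetSchedModel &SchedModel,
                                            const TargetInstrInfo *TII,
                                            unsigned Opcode) {
      // Look up the static scheduling class for this opcode, the same way
      // ARM64StorePairSuppress does below for ARM64::STPDi.
      unsigned SCIdx = TII->get(Opcode).getSchedClass();
      const MCSchedClassDesc *SCDesc =
          SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
      if (!SCDesc->isValid() || SCDesc->isVariant())
        return 0; // Variant classes need a concrete MachineInstr to resolve.
      // Sum the cycles this class consumes on each processor resource. This
      // is the quantity that "only consume resources" refers to; it is
      // independent of the latency assigned to the def operands.
      unsigned Cycles = 0;
      for (TargetSchedModel::ProcResIter
               PI = SchedModel.getWriteProcResBegin(SCDesc),
               PE = SchedModel.getWriteProcResEnd(SCDesc);
           PI != PE; ++PI)
        Cycles += PI->Cycles;
      return Cycles;
    }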
- -def : InstRW<[WriteVST], - (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVST], - (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle], - (instregex "ST1Twov(8b|4h|2s|1d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], - (instregex "ST1Twov(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVST, WriteVST], - (instregex "ST1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVST, WriteVST], - (instregex "ST1Twov(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle, WriteVST], - (instregex "ST1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST], - (instregex "ST1Threev(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVST, WriteVST, WriteVST], - (instregex "ST1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST], - (instregex "ST1Threev(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], - (instregex "ST1Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST], - (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle], (instregex "ST1i(8|16|32)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>; - -def : InstRW<[WriteVSTShuffle], (instrs ST1i64)>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>; - -def : InstRW<[WriteVSTShuffle], - (instregex "ST2Twov(8b|4h|2s)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], - (instregex "ST2Twov(8b|4h|2s)_POST")>; -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST2Twov(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle], (instregex "ST2i(8|16|32)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>; -def : InstRW<[WriteVSTShuffle], (instrs ST2i64)>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>; - -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST3Threev(8b|4h|2s)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST3Threev(8b|4h|2s)_POST")>; -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST3Threev(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST3Threev(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle], (instregex "ST3i(8|16|32)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>; - -def :InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64)>; -def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>; - -def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle], - (instregex "ST4Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle], - (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle, - WriteVSTPairShuffle, WriteVSTPairShuffle], - (instregex "ST4Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle, - WriteVSTPairShuffle, WriteVSTPairShuffle], - (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTPairShuffle], (instregex "ST4i(8|16|32)$")>; -def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex 
"ST4i(8|16|32)_POST")>; - -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>; - -} // SchedModel = CycloneModel diff --git a/lib/Target/ARM64/ARM64Schedule.td b/lib/Target/ARM64/ARM64Schedule.td deleted file mode 100644 index 52f9262..0000000 --- a/lib/Target/ARM64/ARM64Schedule.td +++ /dev/null @@ -1,92 +0,0 @@ -//===-- ARMSchedule.td - ARM Scheduling Definitions --------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -// Define TII for use in SchedVariant Predicates. -// const MachineInstr *MI and const TargetSchedModel *SchedModel -// are defined by default. -def : PredicateProlog<[{ - const ARM64InstrInfo *TII = - static_cast(SchedModel->getInstrInfo()); - (void)TII; -}]>; - -// ARM64 Scheduler Definitions - -def WriteImm : SchedWrite; // MOVN, MOVZ -// TODO: Provide variants for MOV32/64imm Pseudos that dynamically -// select the correct sequence of WriteImms. - -def WriteI : SchedWrite; // ALU -def WriteISReg : SchedWrite; // ALU of Shifted-Reg -def WriteIEReg : SchedWrite; // ALU of Extended-Reg -def WriteExtr : SchedWrite; // EXTR shifts a reg pair -def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair -def WriteIS : SchedWrite; // Shift/Scale -def WriteID32 : SchedWrite; // 32-bit Divide -def WriteID64 : SchedWrite; // 64-bit Divide -def WriteIM32 : SchedWrite; // 32-bit Multiply -def WriteIM64 : SchedWrite; // 64-bit Multiply -def WriteBr : SchedWrite; // Branch -def WriteBrReg : SchedWrite; // Indirect Branch - -def WriteLD : SchedWrite; // Load from base addr plus immediate offset -def WriteST : SchedWrite; // Store to base addr plus immediate offset -def WriteSTP : SchedWrite; // Store a register pair. -def WriteAdr : SchedWrite; // Address pre/post increment. - -def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled). -def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled). -def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST. - -// ScaledIdxPred is true if a WriteLDIdx operand will be -// scaled. Subtargets can use this to dynamically select resources and -// latency for WriteLDIdx and ReadAdrBase. -def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>; - -// Serialized two-level address load. -// EXAMPLE: LOADGot -def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>; - -// Serialized two-level address lookup. -// EXAMPLE: MOVaddr... -def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>; - -// The second register of a load-pair. -// LDP,LDPSW,LDNP,LDXP,LDAXP -def WriteLDHi : SchedWrite; - -// Store-exclusive is a store followed by a dependent load. -def WriteSTX : WriteSequence<[WriteST, WriteLD]>; - -def WriteSys : SchedWrite; // Long, variable latency system ops. -def WriteBarrier : SchedWrite; // Memory barrier. -def WriteHint : SchedWrite; // Hint instruction. - -def WriteF : SchedWrite; // General floating-point ops. -def WriteFCmp : SchedWrite; // Floating-point compare. -def WriteFCvt : SchedWrite; // Float conversion. -def WriteFCopy : SchedWrite; // Float-int register copy. -def WriteFImm : SchedWrite; // Floating-point immediate. -def WriteFMul : SchedWrite; // Floating-point multiply. -def WriteFDiv : SchedWrite; // Floating-point division. 
- -def WriteV : SchedWrite; // Vector ops. -def WriteVLD : SchedWrite; // Vector loads. -def WriteVST : SchedWrite; // Vector stores. - -// Read the unwritten lanes of the VLD's destination registers. -def ReadVLD : SchedRead; - -// Sequential vector load and shuffle. -def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>; -def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>; - -// Store a shuffled vector. -def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>; -def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>; diff --git a/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp b/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp deleted file mode 100644 index 79d507f..0000000 --- a/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp +++ /dev/null @@ -1,57 +0,0 @@ -//===-- ARM64SelectionDAGInfo.cpp - ARM64 SelectionDAG Info ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64SelectionDAGInfo class. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "arm64-selectiondag-info" -#include "ARM64TargetMachine.h" -using namespace llvm; - -ARM64SelectionDAGInfo::ARM64SelectionDAGInfo(const TargetMachine &TM) - : TargetSelectionDAGInfo(TM), - Subtarget(&TM.getSubtarget()) {} - -ARM64SelectionDAGInfo::~ARM64SelectionDAGInfo() {} - -SDValue ARM64SelectionDAGInfo::EmitTargetCodeForMemset( - SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, - MachinePointerInfo DstPtrInfo) const { - // Check to see if there is a specialized entry-point for memory zeroing. - ConstantSDNode *V = dyn_cast(Src); - ConstantSDNode *SizeValue = dyn_cast(Size); - const char *bzeroEntry = - (V && V->isNullValue()) ? Subtarget->getBZeroEntry() : 0; - // For small size (< 256), it is not beneficial to use bzero - // instead of memset. - if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { - const ARM64TargetLowering &TLI = *static_cast( - DAG.getTarget().getTargetLowering()); - - EVT IntPtr = TLI.getPointerTy(); - Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = Dst; - Entry.Ty = IntPtrTy; - Args.push_back(Entry); - Entry.Node = Size; - Args.push_back(Entry); - TargetLowering::CallLoweringInfo CLI( - Chain, Type::getVoidTy(*DAG.getContext()), false, false, false, false, - 0, CallingConv::C, /*isTailCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed=*/false, - DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); - std::pair CallResult = TLI.LowerCallTo(CLI); - return CallResult.second; - } - return SDValue(); -} diff --git a/lib/Target/ARM64/ARM64SelectionDAGInfo.h b/lib/Target/ARM64/ARM64SelectionDAGInfo.h deleted file mode 100644 index 770775f..0000000 --- a/lib/Target/ARM64/ARM64SelectionDAGInfo.h +++ /dev/null @@ -1,37 +0,0 @@ -//===-- ARM64SelectionDAGInfo.h - ARM64 SelectionDAG Info -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file defines the ARM64 subclass for TargetSelectionDAGInfo. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64SELECTIONDAGINFO_H -#define ARM64SELECTIONDAGINFO_H - -#include "llvm/Target/TargetSelectionDAGInfo.h" - -namespace llvm { - -class ARM64SelectionDAGInfo : public TargetSelectionDAGInfo { - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when generating code for different targets. - const ARM64Subtarget *Subtarget; - -public: - explicit ARM64SelectionDAGInfo(const TargetMachine &TM); - ~ARM64SelectionDAGInfo(); - - SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, - SDValue Dst, SDValue Src, SDValue Size, - unsigned Align, bool isVolatile, - MachinePointerInfo DstPtrInfo) const override; -}; -} - -#endif diff --git a/lib/Target/ARM64/ARM64StorePairSuppress.cpp b/lib/Target/ARM64/ARM64StorePairSuppress.cpp deleted file mode 100644 index 6521d13..0000000 --- a/lib/Target/ARM64/ARM64StorePairSuppress.cpp +++ /dev/null @@ -1,167 +0,0 @@ -//===---- ARM64StorePairSuppress.cpp --- Suppress store pair formation ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass identifies floating point stores that should not be combined into -// store pairs. Later we may do the same for floating point loads. -// ===---------------------------------------------------------------------===// - -#define DEBUG_TYPE "arm64-stp-suppress" -#include "ARM64InstrInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineTraceMetrics.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetSchedule.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { -class ARM64StorePairSuppress : public MachineFunctionPass { - const ARM64InstrInfo *TII; - const TargetRegisterInfo *TRI; - const MachineRegisterInfo *MRI; - MachineFunction *MF; - TargetSchedModel SchedModel; - MachineTraceMetrics *Traces; - MachineTraceMetrics::Ensemble *MinInstr; - -public: - static char ID; - ARM64StorePairSuppress() : MachineFunctionPass(ID) {} - - virtual const char *getPassName() const override { - return "ARM64 Store Pair Suppression"; - } - - bool runOnMachineFunction(MachineFunction &F) override; - -private: - bool shouldAddSTPToBlock(const MachineBasicBlock *BB); - - bool isNarrowFPStore(const MachineInstr &MI); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; -char ARM64StorePairSuppress::ID = 0; -} // anonymous - -FunctionPass *llvm::createARM64StorePairSuppressPass() { - return new ARM64StorePairSuppress(); -} - -/// Return true if an STP can be added to this block without increasing the -/// critical resource height. STP is good to form in Ld/St limited blocks and -/// bad to form in floating-point limited blocks. This is true independent of the -/// critical path.
If the critical path is longer than the resource height, the -/// extra vector ops can limit physreg renaming. Otherwise, it could simply -/// oversaturate the vector units. -bool ARM64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) { - if (!MinInstr) - MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); - - MachineTraceMetrics::Trace BBTrace = MinInstr->getTrace(BB); - unsigned ResLength = BBTrace.getResourceLength(); - - // Get the machine model's scheduling class for STPQi. - // Bypass TargetSchedule's SchedClass resolution since we only have an opcode. - unsigned SCIdx = TII->get(ARM64::STPDi).getSchedClass(); - const MCSchedClassDesc *SCDesc = - SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); - - // If a subtarget does not define resources for STPQi, bail here. - if (SCDesc->isValid() && !SCDesc->isVariant()) { - unsigned ResLenWithSTP = BBTrace.getResourceLength( - ArrayRef(), SCDesc); - if (ResLenWithSTP > ResLength) { - DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber() - << " resources " << ResLength << " -> " << ResLenWithSTP - << "\n"); - return false; - } - } - return true; -} - -/// Return true if this is a floating-point store smaller than the V reg. On -/// cyclone, these require a vector shuffle before storing a pair. -/// Ideally we would call getMatchingPairOpcode() and have the machine model -/// tell us if it's profitable with no cpu knowledge here. -/// -/// FIXME: We plan to develop a decent Target abstraction for simple loads and -/// stores. Until then use a nasty switch similar to ARM64LoadStoreOptimizer. -bool ARM64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - return false; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STURSi: - case ARM64::STURDi: - return true; - } -} - -bool ARM64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) { - MF = &mf; - TII = static_cast(MF->getTarget().getInstrInfo()); - TRI = MF->getTarget().getRegisterInfo(); - MRI = &MF->getRegInfo(); - const TargetSubtargetInfo &ST = - MF->getTarget().getSubtarget(); - SchedModel.init(*ST.getSchedModel(), &ST, TII); - - Traces = &getAnalysis(); - MinInstr = 0; - - DEBUG(dbgs() << "*** " << getPassName() << ": " << MF->getName() << '\n'); - - if (!SchedModel.hasInstrSchedModel()) { - DEBUG(dbgs() << " Skipping pass: no machine model present.\n"); - return false; - } - - // Check for a sequence of stores to the same base address. We don't need to - // precisely determine whether a store pair can be formed. But we do want to - // filter out most situations where we can't form store pairs to avoid - // computing trace metrics in those cases. - for (auto &MBB: *MF) { - bool SuppressSTP = false; - unsigned PrevBaseReg = 0; - for (auto &MI: MBB) { - if (!isNarrowFPStore(MI)) - continue; - unsigned BaseReg; - unsigned Offset; - if (TII->getLdStBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) { - if (PrevBaseReg == BaseReg) { - // If this block can take STPs, skip ahead to the next block. - if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) - break; - // Otherwise, continue unpairing the stores in this block. - DEBUG(dbgs() << "Unpairing store " << MI << "\n"); - SuppressSTP = true; - TII->suppressLdStPair(&MI); - } - PrevBaseReg = BaseReg; - } else - PrevBaseReg = 0; - } - } - // This pass just sets some internal MachineMemOperand flags. It can't really - // invalidate anything. 
- return false; -} diff --git a/lib/Target/ARM64/ARM64Subtarget.cpp b/lib/Target/ARM64/ARM64Subtarget.cpp deleted file mode 100644 index 14b5444..0000000 --- a/lib/Target/ARM64/ARM64Subtarget.cpp +++ /dev/null @@ -1,100 +0,0 @@ -//===-- ARM64Subtarget.cpp - ARM64 Subtarget Information --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64 specific subclass of TargetSubtarget. -// -//===----------------------------------------------------------------------===// - -#include "ARM64InstrInfo.h" -#include "ARM64Subtarget.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/Support/TargetRegistry.h" - -#define GET_SUBTARGETINFO_CTOR -#define GET_SUBTARGETINFO_TARGET_DESC -#include "ARM64GenSubtargetInfo.inc" - -using namespace llvm; - -ARM64Subtarget::ARM64Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS) - : ARM64GenSubtargetInfo(TT, CPU, FS), HasZeroCycleRegMove(false), - HasZeroCycleZeroing(false), CPUString(CPU), TargetTriple(TT) { - // Determine default and user-specified characteristics - - if (CPUString.empty()) - // We default to Cyclone for now. - CPUString = "cyclone"; - - ParseSubtargetFeatures(CPUString, FS); -} - -/// ClassifyGlobalReference - Find the target operand flags that describe -/// how a global value should be referenced for the current subtarget. -unsigned char -ARM64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, - const TargetMachine &TM) const { - - // Determine whether this is a reference to a definition or a declaration. - // Materializable GVs (in JIT lazy compilation mode) do not require an extra - // load from stub. - bool isDecl = GV->hasAvailableExternallyLinkage(); - if (GV->isDeclaration() && !GV->isMaterializable()) - isDecl = true; - - // MachO large model always goes via a GOT, simply to get a single 8-byte - // absolute relocation on all global addresses. - if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) - return ARM64II::MO_GOT; - - // The small code mode's direct accesses use ADRP, which cannot necessarily - // produce the value 0 (if the code is above 4GB). Therefore they must use the - // GOT. - if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl) - return ARM64II::MO_GOT; - - // If symbol visibility is hidden, the extra load is not needed if - // the symbol is definitely defined in the current translation unit. - - // The handling of non-hidden symbols in PIC mode is rather target-dependent: - // + On MachO, if the symbol is defined in this module the GOT can be - // skipped. - // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually - // defined could end up in unexpected places. Use a GOT. - if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) { - if (isTargetMachO()) - return (isDecl || GV->isWeakForLinker()) ? ARM64II::MO_GOT - : ARM64II::MO_NO_FLAG; - else - return ARM64II::MO_GOT; - } - - return ARM64II::MO_NO_FLAG; -} - -/// This function returns the name of a function which has an interface -/// like the non-standard bzero function, if such a function exists on -/// the current subtarget and it is considered prefereable over -/// memset with zero passed as the second argument. 
Otherwise it -/// returns null. -const char *ARM64Subtarget::getBZeroEntry() const { - // At the moment, always prefer bzero. - return "bzero"; -} - -void ARM64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - MachineInstr *begin, MachineInstr *end, - unsigned NumRegionInstrs) const { - // An LNT run (at least on Cyclone) showed reasonably significant gains for - // bi-directional scheduling, e.g. on 253.perlbmk. - Policy.OnlyTopDown = false; - Policy.OnlyBottomUp = false; -} diff --git a/lib/Target/ARM64/ARM64Subtarget.h b/lib/Target/ARM64/ARM64Subtarget.h deleted file mode 100644 index 1cbd79e..0000000 --- a/lib/Target/ARM64/ARM64Subtarget.h +++ /dev/null @@ -1,87 +0,0 @@ -//=====---- ARM64Subtarget.h - Define Subtarget for the ARM64 -*- C++ -*--====// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the ARM64 specific subclass of TargetSubtarget. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64SUBTARGET_H -#define ARM64SUBTARGET_H - -#include "llvm/Target/TargetSubtargetInfo.h" -#include "ARM64RegisterInfo.h" -#include <string> - -#define GET_SUBTARGETINFO_HEADER -#include "ARM64GenSubtargetInfo.inc" - -namespace llvm { -class GlobalValue; -class StringRef; - -class ARM64Subtarget : public ARM64GenSubtargetInfo { -protected: - // HasZeroCycleRegMove - Has zero-cycle register mov instructions. - bool HasZeroCycleRegMove; - - // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. - bool HasZeroCycleZeroing; - - /// CPUString - String name of used CPU. - std::string CPUString; - - /// TargetTriple - What processor and OS we're targeting. - Triple TargetTriple; - -public: - /// This constructor initializes the data members to match that - /// of the specified triple. - ARM64Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS); - - bool enableMachineScheduler() const override { return true; } - - bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } - - bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } - - bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } - - bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } - - bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } - - bool isCyclone() const { return CPUString == "cyclone"; } - - /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size - /// that still makes it profitable to inline the call. - unsigned getMaxInlineSizeThreshold() const { return 64; } - - /// ParseSubtargetFeatures - Parses the features string, setting specified - /// subtarget options. The definition of this function is auto-generated by tblgen. - void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - - /// ClassifyGlobalReference - Find the target operand flags that describe - /// how a global value should be referenced for the current subtarget. - unsigned char ClassifyGlobalReference(const GlobalValue *GV, - const TargetMachine &TM) const; - - /// This function returns the name of a function which has an interface - /// like the non-standard bzero function, if such a function exists on - /// the current subtarget and it is considered preferable over - /// memset with zero passed as the second argument. Otherwise it - /// returns null.
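The classification above reduces to a small decision table. A condensed sketch, with hypothetical boolean parameters standing in for the GlobalValue and TargetMachine queries the real routine performs:

  // Direct vs. GOT-indirect reference, following the rules described above.
  enum RefKind { Direct, ViaGOT };
  static RefKind classifySketch(bool MachOLargeModel, bool SmallModelWeakDecl,
                                bool PICDefaultVisibility, bool IsMachO,
                                bool DeclOrWeak) {
    if (MachOLargeModel) return ViaGOT;        // one 8-byte absolute relocation
    if (SmallModelWeakDecl) return ViaGOT;     // ADRP cannot materialize 0
    if (PICDefaultVisibility)                  // non-hidden symbol under PIC
      return (IsMachO && !DeclOrWeak) ? Direct // defined here: GOT skippable
                                      : ViaGOT; // ELF copy relocs force the GOT
    return Direct;
  }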
- const char *getBZeroEntry() const; - - void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, - MachineInstr *end, unsigned NumRegionInstrs) const; -}; -} // End llvm namespace - -#endif // ARM64SUBTARGET_H diff --git a/lib/Target/ARM64/ARM64TargetMachine.cpp b/lib/Target/ARM64/ARM64TargetMachine.cpp deleted file mode 100644 index 101dc25..0000000 --- a/lib/Target/ARM64/ARM64TargetMachine.cpp +++ /dev/null @@ -1,157 +0,0 @@ -//===-- ARM64TargetMachine.cpp - Define TargetMachine for ARM64 -----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64TargetMachine.h" -#include "llvm/PassManager.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/Transforms/Scalar.h" -using namespace llvm; - -static cl::opt<bool> EnableCCMP("arm64-ccmp", - cl::desc("Enable the CCMP formation pass"), - cl::init(true)); - -static cl::opt<bool> EnableStPairSuppress("arm64-stp-suppress", cl::Hidden, - cl::desc("Suppress STP for ARM64"), - cl::init(true)); - -static cl::opt<bool> -EnablePromoteConstant("arm64-promote-const", cl::Hidden, - cl::desc("Enable the promote constant pass"), - cl::init(true)); - -static cl::opt<bool> -EnableCollectLOH("arm64-collect-loh", cl::Hidden, - cl::desc("Enable the pass that emits the linker" - " optimization hints (LOH)"), - cl::init(true)); - -extern "C" void LLVMInitializeARM64Target() { - // Register the target. - RegisterTargetMachine<ARM64TargetMachine> X(TheARM64Target); -} - -/// TargetMachine ctor - Create an ARM64 architecture model. -/// -ARM64TargetMachine::ARM64TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS), - DL(Subtarget.isTargetMachO() ? "e-m:o-i64:64-i128:128-n32:64-S128" - : "e-m:e-i64:64-i128:128-n32:64-S128"), - InstrInfo(Subtarget), TLInfo(*this), FrameLowering(*this, Subtarget), - TSInfo(*this) { - initAsmInfo(); -} - -namespace { -/// ARM64 Code Generator Pass Configuration Options. -class ARM64PassConfig : public TargetPassConfig { -public: - ARM64PassConfig(ARM64TargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} - - ARM64TargetMachine &getARM64TargetMachine() const { - return getTM<ARM64TargetMachine>(); - } - - virtual bool addPreISel(); - virtual bool addInstSelector(); - virtual bool addILPOpts(); - virtual bool addPreRegAlloc(); - virtual bool addPostRegAlloc(); - virtual bool addPreSched2(); - virtual bool addPreEmitPass(); -}; -} // namespace - -void ARM64TargetMachine::addAnalysisPasses(PassManagerBase &PM) { - // Add first the target-independent BasicTTI pass, then our ARM64 pass. This - // allows the ARM64 pass to delegate to the target independent layer when - // appropriate.
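Each of the four toggles above is an ordinary cl::opt. A minimal sketch of the pattern, with a hypothetical flag name; such a flag defaults on, stays out of plain -help when marked Hidden, and can be flipped on the llc command line (e.g. llc -arm64-example=false) without rebuilding:

  #include "llvm/Support/CommandLine.h"
  static llvm::cl::opt<bool>
      EnableExample("arm64-example", llvm::cl::Hidden,
                    llvm::cl::desc("Enable the example pass"),
                    llvm::cl::init(true));
  // Later, in a pass-config hook: if (EnableExample) addPass(...);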
- PM.add(createBasicTargetTransformInfoPass(this)); - PM.add(createARM64TargetTransformInfoPass(this)); -} - -TargetPassConfig *ARM64TargetMachine::createPassConfig(PassManagerBase &PM) { - return new ARM64PassConfig(this, PM); -} - -// Pass Pipeline Configuration -bool ARM64PassConfig::addPreISel() { - // Run promote constant before global merge, so that the promoted constants - // get a chance to be merged. - if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant) - addPass(createARM64PromoteConstantPass()); - if (TM->getOptLevel() != CodeGenOpt::None) - addPass(createGlobalMergePass(TM)); - if (TM->getOptLevel() != CodeGenOpt::None) - addPass(createARM64AddressTypePromotionPass()); - return false; -} - -bool ARM64PassConfig::addInstSelector() { - addPass(createARM64ISelDag(getARM64TargetMachine(), getOptLevel())); - - // For ELF, clean up any local-dynamic TLS accesses (i.e. combine as many - // references to _TLS_MODULE_BASE_ as possible). - if (TM->getSubtarget<ARM64Subtarget>().isTargetELF() && - getOptLevel() != CodeGenOpt::None) - addPass(createARM64CleanupLocalDynamicTLSPass()); - - return false; -} - -bool ARM64PassConfig::addILPOpts() { - if (EnableCCMP) - addPass(createARM64ConditionalCompares()); - addPass(&EarlyIfConverterID); - if (EnableStPairSuppress) - addPass(createARM64StorePairSuppressPass()); - return true; -} - -bool ARM64PassConfig::addPreRegAlloc() { - // Use AdvSIMD scalar instructions whenever profitable. - addPass(createARM64AdvSIMDScalar()); - return true; -} - -bool ARM64PassConfig::addPostRegAlloc() { - // Change dead register definitions to refer to the zero register. - addPass(createARM64DeadRegisterDefinitions()); - return true; -} - -bool ARM64PassConfig::addPreSched2() { - // Expand some pseudo instructions to allow proper scheduling. - addPass(createARM64ExpandPseudoPass()); - // Use load/store pair instructions when possible. - addPass(createARM64LoadStoreOptimizationPass()); - return true; -} - -bool ARM64PassConfig::addPreEmitPass() { - // Relax conditional branch instructions if they're otherwise out of - // range of their destination. - addPass(createARM64BranchRelaxation()); - if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH) - addPass(createARM64CollectLOHPass()); - return true; -} diff --git a/lib/Target/ARM64/ARM64TargetMachine.h b/lib/Target/ARM64/ARM64TargetMachine.h deleted file mode 100644 index 8274550..0000000 --- a/lib/Target/ARM64/ARM64TargetMachine.h +++ /dev/null @@ -1,69 +0,0 @@ -//===-- ARM64TargetMachine.h - Define TargetMachine for ARM64 ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the ARM64 specific subclass of TargetMachine.
-// -//===----------------------------------------------------------------------===// - -#ifndef ARM64TARGETMACHINE_H -#define ARM64TARGETMACHINE_H - -#include "ARM64InstrInfo.h" -#include "ARM64ISelLowering.h" -#include "ARM64Subtarget.h" -#include "ARM64FrameLowering.h" -#include "ARM64SelectionDAGInfo.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/MC/MCStreamer.h" - -namespace llvm { - -class ARM64TargetMachine : public LLVMTargetMachine { -protected: - ARM64Subtarget Subtarget; - -private: - const DataLayout DL; - ARM64InstrInfo InstrInfo; - ARM64TargetLowering TLInfo; - ARM64FrameLowering FrameLowering; - ARM64SelectionDAGInfo TSInfo; - -public: - ARM64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL); - - const ARM64Subtarget *getSubtargetImpl() const override { return &Subtarget; } - const ARM64TargetLowering *getTargetLowering() const override { - return &TLInfo; - } - const DataLayout *getDataLayout() const override { return &DL; } - const ARM64FrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - const ARM64InstrInfo *getInstrInfo() const override { return &InstrInfo; } - const ARM64RegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); - } - const ARM64SelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; - } - - // Pass Pipeline Configuration - TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - - /// \brief Register ARM64 analysis passes with a pass manager. - void addAnalysisPasses(PassManagerBase &PM) override; -}; - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64TargetObjectFile.cpp b/lib/Target/ARM64/ARM64TargetObjectFile.cpp deleted file mode 100644 index cde01e5..0000000 --- a/lib/Target/ARM64/ARM64TargetObjectFile.cpp +++ /dev/null @@ -1,52 +0,0 @@ -//===-- ARM64TargetObjectFile.cpp - ARM64 Object Info ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "ARM64TargetObjectFile.h" -#include "ARM64TargetMachine.h" -#include "llvm/IR/Mangler.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/Dwarf.h" -using namespace llvm; -using namespace dwarf; - -void ARM64_ELFTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { - TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); -} - -const MCExpr *ARM64_MachoTargetObjectFile::getTTypeGlobalReference( - const GlobalValue *GV, unsigned Encoding, Mangler &Mang, - const TargetMachine &TM, MachineModuleInfo *MMI, - MCStreamer &Streamer) const { - // On Darwin, we can reference dwarf symbols with foo@GOT-., which - // is an indirect pc-relative reference. The default implementation - // won't reference using the GOT, so we need this target-specific - // version. 
- if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { - const MCSymbol *Sym = TM.getSymbol(GV, Mang); - const MCExpr *Res = - MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); - MCSymbol *PCSym = getContext().CreateTempSymbol(); - Streamer.EmitLabel(PCSym); - const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext()); - return MCBinaryExpr::CreateSub(Res, PC, getContext()); - } - - return TargetLoweringObjectFileMachO::getTTypeGlobalReference( - GV, Encoding, Mang, TM, MMI, Streamer); -} - -MCSymbol *ARM64_MachoTargetObjectFile::getCFIPersonalitySymbol( - const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, - MachineModuleInfo *MMI) const { - return TM.getSymbol(GV, Mang); -} diff --git a/lib/Target/ARM64/ARM64TargetObjectFile.h b/lib/Target/ARM64/ARM64TargetObjectFile.h deleted file mode 100644 index 62446f9..0000000 --- a/lib/Target/ARM64/ARM64TargetObjectFile.h +++ /dev/null @@ -1,40 +0,0 @@ -//===-- ARM64TargetObjectFile.h - ARM64 Object Info -*- C++ -------------*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64_TARGETOBJECTFILE_H -#define LLVM_TARGET_ARM64_TARGETOBJECTFILE_H - -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/Target/TargetLoweringObjectFile.h" - -namespace llvm { -class ARM64TargetMachine; - -/// This implementation is used for AArch64 ELF targets (Linux in particular). -class ARM64_ELFTargetObjectFile : public TargetLoweringObjectFileELF { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; -}; - -/// ARM64_MachoTargetObjectFile - This TLOF implementation is used for Darwin. -class ARM64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { -public: - const MCExpr *getTTypeGlobalReference(const GlobalValue *GV, - unsigned Encoding, Mangler &Mang, - const TargetMachine &TM, - MachineModuleInfo *MMI, - MCStreamer &Streamer) const override; - - MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, - const TargetMachine &TM, - MachineModuleInfo *MMI) const override; -}; - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64TargetTransformInfo.cpp b/lib/Target/ARM64/ARM64TargetTransformInfo.cpp deleted file mode 100644 index 9b598d7..0000000 --- a/lib/Target/ARM64/ARM64TargetTransformInfo.cpp +++ /dev/null @@ -1,326 +0,0 @@ -//===-- ARM64TargetTransformInfo.cpp - ARM64 specific TTI pass ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file implements a TargetTransformInfo analysis pass specific to the -/// ARM64 target machine. It uses the target's detailed information to provide -/// more precise answers to certain TTI queries, while letting the target -/// independent and default TTI implementations handle the rest. 
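The delegation the banner describes is essentially a chain of responsibility: the target layer answers the queries it knows precisely and forwards the rest. A minimal sketch, deliberately not the LLVM classes:

  struct GenericTTISketch {
    virtual ~GenericTTISketch() {}
    virtual unsigned getCastCost(unsigned Opcode) const { return 1; } // safe default
  };
  struct TargetTTISketch : GenericTTISketch {
    unsigned lookupTable(unsigned) const { return 0; } // hypothetical cost table
    unsigned getCastCost(unsigned Opcode) const override {
      if (unsigned Cost = lookupTable(Opcode))          // precise target answer
        return Cost;
      return GenericTTISketch::getCastCost(Opcode);     // else fall back
    }
  };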
-/// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "arm64tti" -#include "ARM64.h" -#include "ARM64TargetMachine.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/CostTable.h" -#include "llvm/Target/TargetLowering.h" -using namespace llvm; - -// Declare the pass initialization routine locally as target-specific passes -// don't have a target-wide initialization entry point, and so we rely on the -// pass constructor initialization. -namespace llvm { -void initializeARM64TTIPass(PassRegistry &); -} - -namespace { - -class ARM64TTI final : public ImmutablePass, public TargetTransformInfo { - const ARM64TargetMachine *TM; - const ARM64Subtarget *ST; - const ARM64TargetLowering *TLI; - - /// Estimate the overhead of scalarizing an instruction. Insert and Extract - /// are set if the result needs to be inserted and/or extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; - -public: - ARM64TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { - llvm_unreachable("This pass cannot be directly constructed"); - } - - ARM64TTI(const ARM64TargetMachine *TM) - : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), - TLI(TM->getTargetLowering()) { - initializeARM64TTIPass(*PassRegistry::getPassRegistry()); - } - - void initializePass() override { pushTTIStack(this); } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - TargetTransformInfo::getAnalysisUsage(AU); - } - - /// Pass identification. - static char ID; - - /// Provide necessary pointer adjustments for the two base classes. - void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &TargetTransformInfo::ID) - return (TargetTransformInfo *)this; - return this; - } - - /// \name Scalar TTI Implementations - /// @{ - - unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; - PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override; - - /// @} - - /// \name Vector TTI Implementations - /// @{ - - unsigned getNumberOfRegisters(bool Vector) const override { - if (Vector) - return 32; - - return 31; - } - - unsigned getRegisterBitWidth(bool Vector) const override { - if (Vector) - return 128; - - return 64; - } - - unsigned getMaximumUnrollFactor() const override { return 2; } - - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const - override; - - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const - override; - - unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info = OK_AnyValue, - OperandValueKind Opd2Info = OK_AnyValue) const - override; - - unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override; - - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const - override; - - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) const override; - /// @} -}; - -} // end anonymous namespace - -INITIALIZE_AG_PASS(ARM64TTI, TargetTransformInfo, "arm64tti", - "ARM64 Target Transform Info", true, true, false) -char ARM64TTI::ID = 0; - -ImmutablePass * -llvm::createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM) { - return new ARM64TTI(TM); -} - -unsigned ARM64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const { - assert(Ty->isIntegerTy()); - - unsigned BitSize = Ty->getPrimitiveSizeInBits(); - if (BitSize == 0) - return ~0U; - 
int64_t Val = Imm.getSExtValue(); - if (Val == 0 || ARM64_AM::isLogicalImmediate(Val, BitSize)) - return 1; - - if ((int64_t)Val < 0) - Val = ~Val; - if (BitSize == 32) - Val &= (1LL << 32) - 1; - - unsigned LZ = countLeadingZeros((uint64_t)Val); - unsigned Shift = (63 - LZ) / 16; - // MOVZ is free, so the cost is 1 when at most one MOVK is needed. - return (Shift == 0) ? 1 : Shift; -} - -ARM64TTI::PopcntSupportKind ARM64TTI::getPopcntSupport(unsigned TyWidth) const { - assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - if (TyWidth == 32 || TyWidth == 64) - return PSK_FastHardware; - // TODO: ARM64TargetLowering::LowerCTPOP() supports 128-bit popcount. - return PSK_Software; -} - -unsigned ARM64TTI::getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) const { - int ISD = TLI->InstructionOpcodeToISD(Opcode); - assert(ISD && "Invalid opcode"); - - EVT SrcTy = TLI->getValueType(Src); - EVT DstTy = TLI->getValueType(Dst); - - if (!SrcTy.isSimple() || !DstTy.isSimple()) - return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); - - static const TypeConversionCostTblEntry<MVT::SimpleValueType> ConversionTbl[] = { - // LowerVectorINT_TO_FP: - { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, - { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, - // LowerVectorFP_TO_INT - { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 }, - { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 }, - }; - - int Idx = ConvertCostTableLookup( - ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); - if (Idx != -1) - return ConversionTbl[Idx].Cost; - - return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); -} - -unsigned ARM64TTI::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) const { - assert(Val->isVectorTy() && "This must be a vector type"); - - if (Index != -1U) { - // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val); - - // This type is legalized to a scalar type. - if (!LT.second.isVector()) - return 0; - - // The type may be split. Normalize the index to the new type. - unsigned Width = LT.second.getVectorNumElements(); - Index = Index % Width; - - // The element at index zero is already inside the vector. - if (Index == 0) - return 0; - } - - // All other insert/extracts cost this much. - return 2; -} - -unsigned ARM64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info, - OperandValueKind Opd2Info) const { - // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); - - int ISD = TLI->InstructionOpcodeToISD(Opcode); - - switch (ISD) { - default: - return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info, - Opd2Info); - case ISD::ADD: - case ISD::MUL: - case ISD::XOR: - case ISD::OR: - case ISD::AND: - // These nodes are marked as 'custom' for combining purposes only. - // We know that they are legal. See LowerAdd in ISelLowering.
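The 16-bit-chunk scan in getIntImmCost above can be mirrored in isolation; a sketch with a compiler builtin in place of countLeadingZeros (the chunk count is a pessimistic upper bound, as the examples show):

  #include <cstdint>
  static unsigned movImmCostSketch(int64_t Val, unsigned BitSize) {
    if (Val < 0) Val = ~Val;                 // negatives materialize via MOVN
    uint64_t V = BitSize == 32 ? (uint64_t)Val & 0xffffffffu : (uint64_t)Val;
    if (V == 0) return 1;                    // a bare MOVZ
    unsigned Shift = (63 - (unsigned)__builtin_clzll(V)) / 16;
    return Shift == 0 ? 1 : Shift;           // MOVZ is free; count the MOVKs
  }
  // 0x1234 -> 1 (one MOVZ); 0x1111222233334444 -> 3 (MOVZ plus three MOVKs);
  // 0x0001000000000000 -> 3 as well, even though one shifted MOVZ suffices,
  // which is the pessimism the heuristic accepts.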
- return 1 * LT.first; - } -} - -unsigned ARM64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { - // Address computations in vectorized code with non-consecutive addresses will - // likely result in more instructions compared to scalar code where the - // computation can more often be merged into the index mode. The resulting - // extra micro-ops can significantly decrease throughput. - unsigned NumVectorInstToHideOverhead = 10; - - if (Ty->isVectorTy() && IsComplex) - return NumVectorInstToHideOverhead; - - // In many cases the address computation is not merged into the instruction - // addressing mode. - return 1; -} - -unsigned ARM64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) const { - - int ISD = TLI->InstructionOpcodeToISD(Opcode); - // We don't lower vector selects wider than the register width well. - if (ValTy->isVectorTy() && ISD == ISD::SELECT) { - // We would need this many instructions to hide the scalarization happening. - unsigned AmortizationCost = 20; - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - VectorSelectTbl[] = { - { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost }, - { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost }, - { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost }, - { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, - { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, - { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } - }; - - EVT SelCondTy = TLI->getValueType(CondTy); - EVT SelValTy = TLI->getValueType(ValTy); - if (SelCondTy.isSimple() && SelValTy.isSimple()) { - int Idx = - ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(), - SelValTy.getSimpleVT()); - if (Idx != -1) - return VectorSelectTbl[Idx].Cost; - } - } - return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); -} - -unsigned ARM64TTI::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) const { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); - - if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 && - Src->getVectorElementType()->isIntegerTy(64)) { - // Unaligned stores are extremely inefficient. We don't split - // unaligned v2i64 stores because of the negative impact that has been - // observed in practice on inlined memcpy code. - // We make v2i64 stores expensive so that we will only vectorize if there - // are 6 other instructions getting vectorized. - unsigned AmortizationCost = 6; - - return LT.first * 2 * AmortizationCost; - } - - if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) && - Src->getVectorNumElements() < 8) { - // We scalarize the loads/stores because there is no v.4b register and we - // have to promote the elements to v.4h. - unsigned NumVecElts = Src->getVectorNumElements(); - unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; - // We generate 2 instructions per vector element. - return NumVectorizableInstsToAmortize * NumVecElts * 2; - } - - return LT.first; -} diff --git a/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp b/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp deleted file mode 100644 index 38a61d8..0000000 --- a/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp +++ /dev/null @@ -1,4832 +0,0 @@ -//===-- ARM64AsmParser.cpp - Parse ARM64 assembly to MCInst instructions --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License.
See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "MCTargetDesc/ARM64BaseInfo.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "llvm/MC/MCParser/MCAsmLexer.h" -#include "llvm/MC/MCParser/MCAsmParser.h" -#include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCTargetAsmParser.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" -#include <cstdio> -using namespace llvm; - -namespace { - -class ARM64Operand; - -class ARM64AsmParser : public MCTargetAsmParser { -public: - typedef SmallVectorImpl<MCParsedAsmOperand *> OperandVector; - -private: - StringRef Mnemonic; ///< Instruction mnemonic. - MCSubtargetInfo &STI; - MCAsmParser &Parser; - - MCAsmParser &getParser() const { return Parser; } - MCAsmLexer &getLexer() const { return Parser.getLexer(); } - - SMLoc getLoc() const { return Parser.getTok().getLoc(); } - - bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); - unsigned parseCondCodeString(StringRef Cond); - bool parseCondCode(OperandVector &Operands, bool invertCondCode); - int tryParseRegister(); - int tryMatchVectorRegister(StringRef &Kind); - bool parseOptionalShift(OperandVector &Operands); - bool parseOptionalExtend(OperandVector &Operands); - bool parseRegister(OperandVector &Operands); - bool parseMemory(OperandVector &Operands); - bool parseSymbolicImmVal(const MCExpr *&ImmVal); - bool parseVectorList(OperandVector &Operands); - bool parseOperand(OperandVector &Operands, bool isCondCode, - bool invertCondCode); - - void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } - bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } - bool showMatchError(SMLoc Loc, unsigned ErrCode); - - bool parseDirectiveWord(unsigned Size, SMLoc L); - bool parseDirectiveTLSDescCall(SMLoc L); - - bool parseDirectiveLOH(StringRef LOH, SMLoc L); - - bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc); - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, MCStreamer &Out, - unsigned &ErrorInfo, bool MatchingInlineAsm); -/// @name Auto-generated Match Functions -/// { - -#define GET_ASSEMBLER_HEADER -#include "ARM64GenAsmMatcher.inc" - - /// } - - OperandMatchResultTy tryParseNoIndexMemory(OperandVector &Operands); - OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands); - OperandMatchResultTy tryParseSystemRegister(OperandVector &Operands); - OperandMatchResultTy tryParseCPSRField(OperandVector &Operands); - OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); - OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); - OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); - OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); - OperandMatchResultTy tryParseFPImm(OperandVector &Operands); - bool tryParseVectorRegister(OperandVector &Operands); - -public: - enum ARM64MatchResultTy { - Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY, -#define
GET_OPERAND_DIAGNOSTIC_TYPES -#include "ARM64GenAsmMatcher.inc" - }; - ARM64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, - const MCInstrInfo &MII) - : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { - MCAsmParserExtension::Initialize(_Parser); - } - - virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, OperandVector &Operands); - virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); - virtual bool ParseDirective(AsmToken DirectiveID); - unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, unsigned Kind); - - static bool classifySymbolRef(const MCExpr *Expr, - ARM64MCExpr::VariantKind &ELFRefKind, - MCSymbolRefExpr::VariantKind &DarwinRefKind, - const MCConstantExpr *&Addend); -}; -} // end anonymous namespace - -namespace { - -/// ARM64Operand - Instances of this class represent a parsed operand of an -/// ARM64 machine instruction. -class ARM64Operand : public MCParsedAsmOperand { -public: - enum MemIdxKindTy { - ImmediateOffset, // pre-indexed, no writeback - RegisterOffset // register offset, with optional extend - }; - -private: - enum KindTy { - k_Immediate, - k_Memory, - k_Register, - k_VectorList, - k_VectorIndex, - k_Token, - k_SysCR, - k_Prefetch, - k_Shifter, - k_Extend, - k_FPImm, - k_Barrier, - k_SystemRegister, - k_CPSRField - } Kind; - - SMLoc StartLoc, EndLoc, OffsetLoc; - - struct TokOp { - const char *Data; - unsigned Length; - bool IsSuffix; // Is the operand actually a suffix on the mnemonic. - }; - - struct RegOp { - unsigned RegNum; - bool isVector; - }; - - struct VectorListOp { - unsigned RegNum; - unsigned Count; - unsigned NumElements; - unsigned ElementKind; - }; - - struct VectorIndexOp { - unsigned Val; - }; - - struct ImmOp { - const MCExpr *Val; - }; - - struct FPImmOp { - unsigned Val; // Encoded 8-bit representation. - }; - - struct BarrierOp { - unsigned Val; // Not the enum since not all values have names. - }; - - struct SystemRegisterOp { - // 16-bit immediate, usually from the ARM64SYS::SystemRegister enum, - // but not limited to those values. - uint16_t Val; - }; - - struct CPSRFieldOp { - ARM64SYS::CPSRField Field; - }; - - struct SysCRImmOp { - unsigned Val; - }; - - struct PrefetchOp { - unsigned Val; - }; - - struct ShifterOp { - unsigned Val; - }; - - struct ExtendOp { - unsigned Val; - }; - - // This is for all forms of ARM64 address expressions - struct MemOp { - unsigned BaseRegNum, OffsetRegNum; - ARM64_AM::ExtendType ExtType; - unsigned ShiftVal; - bool ExplicitShift; - const MCExpr *OffsetImm; - MemIdxKindTy Mode; - }; - - union { - struct TokOp Tok; - struct RegOp Reg; - struct VectorListOp VectorList; - struct VectorIndexOp VectorIndex; - struct ImmOp Imm; - struct FPImmOp FPImm; - struct BarrierOp Barrier; - struct SystemRegisterOp SystemRegister; - struct CPSRFieldOp CPSRField; - struct SysCRImmOp SysCRImm; - struct PrefetchOp Prefetch; - struct ShifterOp Shifter; - struct ExtendOp Extend; - struct MemOp Mem; - }; - - // Keep the MCContext around as the MCExprs may need to be manipulated during - // the add<>Operands() calls. - MCContext &Ctx;
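The operand class above is a classic kind-tagged union; a minimal sketch of the pattern with a hypothetical two-kind operand, showing why every accessor asserts the discriminant before touching the union:

  #include <cassert>
  #include <cstdint>
  struct OperandSketch {
    enum Kind { Imm, Reg } K;
    union { int64_t ImmVal; unsigned RegNo; };
    int64_t imm() const { assert(K == Imm && "Invalid access!"); return ImmVal; }
    unsigned reg() const { assert(K == Reg && "Invalid access!"); return RegNo; }
  };
  // A mistyped access fails fast in asserts builds instead of silently
  // reinterpreting the other member's bytes.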
- ARM64Operand(KindTy K, MCContext &_Ctx) - : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {} - -public: - ARM64Operand(const ARM64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) { - Kind = o.Kind; - StartLoc = o.StartLoc; - EndLoc = o.EndLoc; - switch (Kind) { - case k_Token: - Tok = o.Tok; - break; - case k_Immediate: - Imm = o.Imm; - break; - case k_FPImm: - FPImm = o.FPImm; - break; - case k_Barrier: - Barrier = o.Barrier; - break; - case k_SystemRegister: - SystemRegister = o.SystemRegister; - break; - case k_CPSRField: - CPSRField = o.CPSRField; - break; - case k_Register: - Reg = o.Reg; - break; - case k_VectorList: - VectorList = o.VectorList; - break; - case k_VectorIndex: - VectorIndex = o.VectorIndex; - break; - case k_SysCR: - SysCRImm = o.SysCRImm; - break; - case k_Prefetch: - Prefetch = o.Prefetch; - break; - case k_Memory: - Mem = o.Mem; - break; - case k_Shifter: - Shifter = o.Shifter; - break; - case k_Extend: - Extend = o.Extend; - break; - } - } - - /// getStartLoc - Get the location of the first token of this operand. - SMLoc getStartLoc() const { return StartLoc; } - /// getEndLoc - Get the location of the last token of this operand. - SMLoc getEndLoc() const { return EndLoc; } - /// getOffsetLoc - Get the location of the offset of this memory operand. - SMLoc getOffsetLoc() const { return OffsetLoc; } - - StringRef getToken() const { - assert(Kind == k_Token && "Invalid access!"); - return StringRef(Tok.Data, Tok.Length); - } - - bool isTokenSuffix() const { - assert(Kind == k_Token && "Invalid access!"); - return Tok.IsSuffix; - } - - const MCExpr *getImm() const { - assert(Kind == k_Immediate && "Invalid access!"); - return Imm.Val; - } - - unsigned getFPImm() const { - assert(Kind == k_FPImm && "Invalid access!"); - return FPImm.Val; - } - - unsigned getBarrier() const { - assert(Kind == k_Barrier && "Invalid access!"); - return Barrier.Val; - } - - uint16_t getSystemRegister() const { - assert(Kind == k_SystemRegister && "Invalid access!"); - return SystemRegister.Val; - } - - ARM64SYS::CPSRField getCPSRField() const { - assert(Kind == k_CPSRField && "Invalid access!"); - return CPSRField.Field; - } - - unsigned getReg() const { - assert(Kind == k_Register && "Invalid access!"); - return Reg.RegNum; - } - - unsigned getVectorListStart() const { - assert(Kind == k_VectorList && "Invalid access!"); - return VectorList.RegNum; - } - - unsigned getVectorListCount() const { - assert(Kind == k_VectorList && "Invalid access!"); - return VectorList.Count; - } - - unsigned getVectorIndex() const { - assert(Kind == k_VectorIndex && "Invalid access!"); - return VectorIndex.Val; - } - - unsigned getSysCR() const { - assert(Kind == k_SysCR && "Invalid access!"); - return SysCRImm.Val; - } - - unsigned getPrefetch() const { - assert(Kind == k_Prefetch && "Invalid access!"); - return Prefetch.Val; - } - - unsigned getShifter() const { - assert(Kind == k_Shifter && "Invalid access!"); - return Shifter.Val; - } - - unsigned getExtend() const { - assert(Kind == k_Extend && "Invalid access!"); - return Extend.Val; - } - - bool isImm() const { return Kind == k_Immediate; } - bool isSImm9() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -256 && Val < 256); - } - bool isSImm7s4() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -256
&& Val <= 252 && (Val & 3) == 0); - } - bool isSImm7s8() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -512 && Val <= 504 && (Val & 7) == 0); - } - bool isSImm7s16() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0); - } - bool isImm0_7() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 8); - } - bool isImm1_8() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val > 0 && Val < 9); - } - bool isImm0_15() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 16); - } - bool isImm1_16() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val > 0 && Val < 17); - } - bool isImm0_31() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 32); - } - bool isImm1_31() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 32); - } - bool isImm1_32() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 33); - } - bool isImm0_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 64); - } - bool isImm1_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 64); - } - bool isImm1_64() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 65); - } - bool isImm0_127() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 128); - } - bool isImm0_255() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 256); - } - bool isImm0_65535() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 65536); - } - bool isLogicalImm32() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - return ARM64_AM::isLogicalImmediate(MCE->getValue(), 32); - } - bool isLogicalImm64() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - return
ARM64_AM::isLogicalImmediate(MCE->getValue(), 64); - } - bool isSIMDImmType10() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - return ARM64_AM::isAdvSIMDModImmType10(MCE->getValue()); - } - bool isBranchTarget26() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2)); - } - bool isBranchTarget19() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2)); - } - bool isBranchTarget14() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2)); - } - - bool isMovWSymbol(ArrayRef<ARM64MCExpr::VariantKind> AllowedModifiers) const { - if (!isImm()) - return false; - - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - const MCConstantExpr *Addend; - if (!ARM64AsmParser::classifySymbolRef(getImm(), ELFRefKind, DarwinRefKind, - Addend)) { - return false; - } - if (DarwinRefKind != MCSymbolRefExpr::VK_None) - return false; - - for (unsigned i = 0; i != AllowedModifiers.size(); ++i) { - if (ELFRefKind == AllowedModifiers[i]) - return Addend == 0; - } - - return false; - } - - bool isMovZSymbolG3() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G3 }; - return isMovWSymbol(Variants); - } - - bool isMovZSymbolG2() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G2, - ARM64MCExpr::VK_TPREL_G2, - ARM64MCExpr::VK_DTPREL_G2 }; - return isMovWSymbol(Variants); - } - - bool isMovZSymbolG1() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G1, - ARM64MCExpr::VK_GOTTPREL_G1, - ARM64MCExpr::VK_TPREL_G1, - ARM64MCExpr::VK_DTPREL_G1, }; - return isMovWSymbol(Variants); - } - - bool isMovZSymbolG0() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G0, - ARM64MCExpr::VK_TPREL_G0, - ARM64MCExpr::VK_DTPREL_G0 }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG2() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G2_NC }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG1() const { - static ARM64MCExpr::VariantKind Variants[] = { - ARM64MCExpr::VK_ABS_G1_NC, ARM64MCExpr::VK_TPREL_G1_NC, - ARM64MCExpr::VK_DTPREL_G1_NC - }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG0() const { - static ARM64MCExpr::VariantKind Variants[] = { - ARM64MCExpr::VK_ABS_G0_NC, ARM64MCExpr::VK_GOTTPREL_G0_NC, - ARM64MCExpr::VK_TPREL_G0_NC, ARM64MCExpr::VK_DTPREL_G0_NC - }; - return isMovWSymbol(Variants); - } - - bool isFPImm() const { return Kind == k_FPImm; } - bool isBarrier() const { return Kind == k_Barrier; } - bool isSystemRegister() const { - if (Kind == k_SystemRegister) - return true; - // SPSel is legal for both the system register and the CPSR-field - // variants of MSR, so special case that. Fugly.
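Returning to the branch-target predicates above: the three variants differ only in field width. A sketch of the 26-bit rule (hypothetical helper), which is why the bound works out to roughly +/-128 MiB of word-aligned displacement:

  #include <cstdint>
  static bool fitsBranchTarget26(int64_t Off) {
    return (Off & 0x3) == 0 &&                 // word aligned
           Off >= -(0x2000000LL << 2) &&       // -2^25 words
           Off <= (0x1ffffffLL << 2);          // +2^25 - 1 words
  }
  // The 19- and 14-bit forms shrink the bounds to (0x40000 << 2) and
  // (0x2000 << 2) in exactly the same shape.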
- return (Kind == k_CPSRField && getCPSRField() == ARM64SYS::cpsr_SPSel); - } - bool isSystemCPSRField() const { return Kind == k_CPSRField; } - bool isReg() const { return Kind == k_Register && !Reg.isVector; } - bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } - - /// Is this a vector list with the type implicit (presumably attached to the - /// instruction itself)? - template <unsigned NumRegs> bool isImplicitlyTypedVectorList() const { - return Kind == k_VectorList && VectorList.Count == NumRegs && - !VectorList.ElementKind; - } - - template <unsigned NumRegs, unsigned NumElements, char ElementKind> - bool isTypedVectorList() const { - if (Kind != k_VectorList) - return false; - if (VectorList.Count != NumRegs) - return false; - if (VectorList.ElementKind != ElementKind) - return false; - return VectorList.NumElements == NumElements; - } - - bool isVectorIndexB() const { - return Kind == k_VectorIndex && VectorIndex.Val < 16; - } - bool isVectorIndexH() const { - return Kind == k_VectorIndex && VectorIndex.Val < 8; - } - bool isVectorIndexS() const { - return Kind == k_VectorIndex && VectorIndex.Val < 4; - } - bool isVectorIndexD() const { - return Kind == k_VectorIndex && VectorIndex.Val < 2; - } - bool isToken() const { return Kind == k_Token; } - bool isTokenEqual(StringRef Str) const { - return Kind == k_Token && getToken() == Str; - } - bool isMem() const { return Kind == k_Memory; } - bool isSysCR() const { return Kind == k_SysCR; } - bool isPrefetch() const { return Kind == k_Prefetch; } - bool isShifter() const { return Kind == k_Shifter; } - bool isExtend() const { - // lsl is an alias for UXTX but will be parsed as a k_Shifter operand. - if (isShifter()) { - ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val); - return ST == ARM64_AM::LSL; - } - return Kind == k_Extend; - } - bool isExtend64() const { - if (Kind != k_Extend) - return false; - // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class). - ARM64_AM::ExtendType ET = ARM64_AM::getArithExtendType(Extend.Val); - return ET != ARM64_AM::UXTX && ET != ARM64_AM::SXTX; - } - bool isExtendLSL64() const { - // lsl is an alias for UXTX but will be parsed as a k_Shifter operand. - if (isShifter()) { - ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val); - return ST == ARM64_AM::LSL; - } - if (Kind != k_Extend) - return false; - ARM64_AM::ExtendType ET = ARM64_AM::getArithExtendType(Extend.Val); - return ET == ARM64_AM::UXTX || ET == ARM64_AM::SXTX; - } - - bool isArithmeticShifter() const { - if (!isShifter()) - return false; - - // An arithmetic shifter is LSL, LSR, or ASR. - ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val); - return ST == ARM64_AM::LSL || ST == ARM64_AM::LSR || ST == ARM64_AM::ASR; - } - - bool isMovImm32Shifter() const { - if (!isShifter()) - return false; - - // A 32-bit MOVi shifter is LSL of 0 or 16. - ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val); - if (ST != ARM64_AM::LSL) - return false; - uint64_t Val = ARM64_AM::getShiftValue(Shifter.Val); - return (Val == 0 || Val == 16); - } - - bool isMovImm64Shifter() const { - if (!isShifter()) - return false; - - // A 64-bit MOVi shifter is LSL of 0, 16, 32, or 48. - ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val); - if (ST != ARM64_AM::LSL) - return false; - uint64_t Val = ARM64_AM::getShiftValue(Shifter.Val); - return (Val == 0 || Val == 16 || Val == 32 || Val == 48); - } - - bool isAddSubShifter() const { - if (!isShifter()) - return false; - - // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'.
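The restriction behind the MOVi shifter predicates above is that a MOV-immediate is one 16-bit chunk placed at an allowed shift. A hypothetical standalone check:

  #include <cstdint>
  static bool isMovZImmSketch(uint64_t V, unsigned RegSize /* 32 or 64 */) {
    for (unsigned S = 0; S < RegSize; S += 16)
      if ((V & ~(0xffffULL << S)) == 0)
        return true;                 // all bits outside one shifted chunk are 0
    return false;
  }
  // 0x11110000 -> true (chunk at lsl #16); 0x00110011 -> false (two chunks,
  // so a MOVK sequence is needed instead).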
- unsigned Val = Shifter.Val; - return ARM64_AM::getShiftType(Val) == ARM64_AM::LSL && - (ARM64_AM::getShiftValue(Val) == 0 || - ARM64_AM::getShiftValue(Val) == 12); - } - - bool isLogicalVecShifter() const { - if (!isShifter()) - return false; - - // A logical vector shifter is a left shift by 0, 8, 16, or 24. - unsigned Val = Shifter.Val; - unsigned Shift = ARM64_AM::getShiftValue(Val); - return ARM64_AM::getShiftType(Val) == ARM64_AM::LSL && - (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24); - } - - bool isLogicalVecHalfWordShifter() const { - if (!isLogicalVecShifter()) - return false; - - // A logical vector half-word shifter is a left shift by 0 or 8. - unsigned Val = Shifter.Val; - unsigned Shift = ARM64_AM::getShiftValue(Val); - return ARM64_AM::getShiftType(Val) == ARM64_AM::LSL && - (Shift == 0 || Shift == 8); - } - - bool isMoveVecShifter() const { - if (!isShifter()) - return false; - - // A move vector shifter (MSL) is a left shift by 8 or 16. - unsigned Val = Shifter.Val; - unsigned Shift = ARM64_AM::getShiftValue(Val); - return ARM64_AM::getShiftType(Val) == ARM64_AM::MSL && - (Shift == 8 || Shift == 16); - } - - bool isMemoryRegisterOffset8() const { - return isMem() && Mem.Mode == RegisterOffset && Mem.ShiftVal == 0; - } - - bool isMemoryRegisterOffset16() const { - return isMem() && Mem.Mode == RegisterOffset && - (Mem.ShiftVal == 0 || Mem.ShiftVal == 1); - } - - bool isMemoryRegisterOffset32() const { - return isMem() && Mem.Mode == RegisterOffset && - (Mem.ShiftVal == 0 || Mem.ShiftVal == 2); - } - - bool isMemoryRegisterOffset64() const { - return isMem() && Mem.Mode == RegisterOffset && - (Mem.ShiftVal == 0 || Mem.ShiftVal == 3); - } - - bool isMemoryRegisterOffset128() const { - return isMem() && Mem.Mode == RegisterOffset && - (Mem.ShiftVal == 0 || Mem.ShiftVal == 4); - } - - bool isMemoryUnscaled() const { - if (!isMem()) - return false; - if (Mem.Mode != ImmediateOffset) - return false; - if (!Mem.OffsetImm) - return true; - // Make sure the immediate value is valid. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm); - if (!CE) - return false; - // The offset must fit in a signed 9-bit unscaled immediate. - int64_t Value = CE->getValue(); - return (Value >= -256 && Value < 256); - } - // Fallback unscaled operands are for aliases of LDR/STR that fall back - // to LDUR/STUR when the offset is not legal for the former but is for - // the latter. As such, in addition to checking for being a legal unscaled - // address, also check that it is not a legal scaled address. This avoids - // ambiguity in the matcher. - bool isMemoryUnscaledFB8() const { - return isMemoryUnscaled() && !isMemoryIndexed8(); - } - bool isMemoryUnscaledFB16() const { - return isMemoryUnscaled() && !isMemoryIndexed16(); - } - bool isMemoryUnscaledFB32() const { - return isMemoryUnscaled() && !isMemoryIndexed32(); - } - bool isMemoryUnscaledFB64() const { - return isMemoryUnscaled() && !isMemoryIndexed64(); - } - bool isMemoryUnscaledFB128() const { - return isMemoryUnscaled() && !isMemoryIndexed128(); - } - bool isMemoryIndexed(unsigned Scale) const { - if (!isMem()) - return false; - if (Mem.Mode != ImmediateOffset) - return false; - if (!Mem.OffsetImm) - return true; - // Make sure the immediate value is valid. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm); - - if (CE) { - // The offset must be a positive multiple of the scale and in range of - // encoding with a 12-bit immediate.
- int64_t Value = CE->getValue(); - return (Value >= 0 && (Value % Scale) == 0 && Value <= (4095 * Scale)); - } - - // If it's not a constant, check for some expressions we know. - const MCExpr *Expr = Mem.OffsetImm; - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - const MCConstantExpr *Addend; - if (!ARM64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, - Addend)) { - // If we don't understand the expression, assume the best and - // let the fixup and relocation code deal with it. - return true; - } - - if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || - ELFRefKind == ARM64MCExpr::VK_LO12 || - ELFRefKind == ARM64MCExpr::VK_GOT_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_GOTTPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12) { - // Note that we don't range-check the addend. It's adjusted modulo page - // size when converted, so there is no "out of range" condition when using - // @pageoff. - int64_t Value = Addend ? Addend->getValue() : 0; - return Value >= 0 && (Value % Scale) == 0; - } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) { - // @gotpageoff/@tlvppageoff can only be used directly, not with an addend. - return Addend == 0; - } - - return false; - } - bool isMemoryIndexed128() const { return isMemoryIndexed(16); } - bool isMemoryIndexed64() const { return isMemoryIndexed(8); } - bool isMemoryIndexed32() const { return isMemoryIndexed(4); } - bool isMemoryIndexed16() const { return isMemoryIndexed(2); } - bool isMemoryIndexed8() const { return isMemoryIndexed(1); } - bool isMemoryNoIndex() const { - if (!isMem()) - return false; - if (Mem.Mode != ImmediateOffset) - return false; - if (!Mem.OffsetImm) - return true; - - // Make sure the immediate value is valid. Only zero is allowed. 
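The scaled/unscaled split described above determines when an LDR/STR alias falls back to LDUR/STUR. A sketch with hypothetical helpers:

  #include <cstdint>
  static bool isScaledOffset(int64_t V, unsigned Scale) {
    return V >= 0 && V % (int64_t)Scale == 0 && V <= 4095 * (int64_t)Scale;
  }
  static bool isUnscaledOffset(int64_t V) { return V >= -256 && V < 256; }
  static bool useUnscaledForm(int64_t V, unsigned Scale) {
    return isUnscaledOffset(V) && !isScaledOffset(V, Scale);
  }
  // "ldr x0, [x1, #1]": 1 is not a multiple of 8 but fits 9 signed bits, so
  // the matcher selects the LDUR form; #8 stays with the scaled LDR.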
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm); - if (!CE || CE->getValue() != 0) - return false; - return true; - } - bool isMemorySIMDNoIndex() const { - if (!isMem()) - return false; - if (Mem.Mode != ImmediateOffset) - return false; - return Mem.OffsetImm == 0; - } - bool isMemoryIndexedSImm9() const { - if (!isMem() || Mem.Mode != ImmediateOffset) - return false; - if (!Mem.OffsetImm) - return true; - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm); - assert(CE && "Non-constant pre-indexed offset!"); - int64_t Value = CE->getValue(); - return Value >= -256 && Value <= 255; - } - bool isMemoryIndexed32SImm7() const { - if (!isMem() || Mem.Mode != ImmediateOffset) - return false; - if (!Mem.OffsetImm) - return true; - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm); - assert(CE && "Non-constant pre-indexed offset!"); - int64_t Value = CE->getValue(); - return ((Value % 4) == 0) && Value >= -256 && Value <= 252; - } - bool isMemoryIndexed64SImm7() const { - if (!isMem() || Mem.Mode != ImmediateOffset) - return false; - if (!Mem.OffsetImm) - return true; - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm); - assert(CE && "Non-constant pre-indexed offset!"); - int64_t Value = CE->getValue(); - return ((Value % 8) == 0) && Value >= -512 && Value <= 504; - } - bool isMemoryIndexed128SImm7() const { - if (!isMem() || Mem.Mode != ImmediateOffset) - return false; - if (!Mem.OffsetImm) - return true; - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm); - assert(CE && "Non-constant pre-indexed offset!"); - int64_t Value = CE->getValue(); - return ((Value % 16) == 0) && Value >= -1024 && Value <= 1008; - } - - bool isAdrpLabel() const { - // Validation was handled during parsing, so we just sanity check that - // something didn't go haywire. - return isImm(); - } - - bool isAdrLabel() const { - // Validation was handled during parsing, so we just sanity check that - // something didn't go haywire. - return isImm(); - } - - void addExpr(MCInst &Inst, const MCExpr *Expr) const { - // Add as immediates when possible. Null MCExpr = 0.
- if (Expr == 0) - Inst.addOperand(MCOperand::CreateImm(0)); - else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) - Inst.addOperand(MCOperand::CreateImm(CE->getValue())); - else - Inst.addOperand(MCOperand::CreateExpr(Expr)); - } - - void addRegOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(getReg())); - } - - void addVectorRegOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(getReg())); - } - - template <unsigned NumRegs> - void addVectorList64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { ARM64::D0, ARM64::D0_D1, - ARM64::D0_D1_D2, ARM64::D0_D1_D2_D3 }; - unsigned FirstReg = FirstRegs[NumRegs - 1]; - - Inst.addOperand( - MCOperand::CreateReg(FirstReg + getVectorListStart() - ARM64::Q0)); - } - - template <unsigned NumRegs> - void addVectorList128Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { ARM64::Q0, ARM64::Q0_Q1, - ARM64::Q0_Q1_Q2, ARM64::Q0_Q1_Q2_Q3 }; - unsigned FirstReg = FirstRegs[NumRegs - 1]; - - Inst.addOperand( - MCOperand::CreateReg(FirstReg + getVectorListStart() - ARM64::Q0)); - } - - void addVectorIndexBOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexHOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexSOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexDOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - // If this is a pageoff symrefexpr with an addend, adjust the addend - // to be only the page-offset portion. Otherwise, just add the expr - // as-is.
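The addSImm7s* methods below store the byte offset divided by the access size, since the hardware field counts register-sized units. A sketch of that scaling:

  #include <cstdint>
  static int encodePairImm7(int64_t ByteOff, unsigned Scale) {
    // assumes ByteOff is Scale-aligned and in range, as the predicates checked
    return (int)(ByteOff / (int64_t)Scale);
  }
  // encodePairImm7(-512, 8) == -64 and encodePairImm7(504, 8) == 63, matching
  // the [-512, 504] range the 64-bit pair predicates accept.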
-    addExpr(Inst, getImm());
-  }
-
-  void addAdrpLabelOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-
-  void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-
-  void addSImm9Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addSImm7s4Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4));
-  }
-
-  void addSImm7s8Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8));
-  }
-
-  void addSImm7s16Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16));
-  }
-
-  void addImm0_7Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm1_8Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm0_15Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm1_16Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm0_31Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm1_31Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm1_32Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm0_63Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm1_63Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm1_64Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm0_127Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm0_255Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm0_65535Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addLogicalImm32Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid logical immediate operand!");
-    uint64_t encoding = ARM64_AM::encodeLogicalImmediate(MCE->getValue(), 32);
-    Inst.addOperand(MCOperand::CreateImm(encoding));
-  }
-
-  void addLogicalImm64Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid logical immediate operand!");
-    uint64_t encoding = ARM64_AM::encodeLogicalImmediate(MCE->getValue(), 64);
-    Inst.addOperand(MCOperand::CreateImm(encoding));
-  }
-
-  void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid immediate operand!");
-    uint64_t encoding = ARM64_AM::encodeAdvSIMDModImmType10(MCE->getValue());
-    Inst.addOperand(MCOperand::CreateImm(encoding));
-  }
-
-  void addBranchTarget26Operands(MCInst &Inst, unsigned N) const {
-    // Branch operands don't encode the low bits, so shift them off
-    // here. If it's a label, however, just put it on directly as there's
-    // not enough information now to do anything.
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE) {
-      addExpr(Inst, getImm());
-      return;
-    }
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
-  }
-
-  void addBranchTarget19Operands(MCInst &Inst, unsigned N) const {
-    // Branch operands don't encode the low bits, so shift them off
-    // here. If it's a label, however, just put it on directly as there's
-    // not enough information now to do anything.
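addBranchTarget26Operands and its 19- and 14-bit siblings drop the low two bits of a constant target before storing it, because A64 instructions are 4-byte aligned and the branch fields count words, not bytes. A standalone sketch of that transformation (helper names invented for illustration):

    #include <cassert>
    #include <cstdint>

    // A constant branch offset is stored pre-shifted, as in the adders above.
    static int64_t encodeBranchOffset(int64_t ByteOffset) {
      assert((ByteOffset & 3) == 0 && "A64 branch targets are word-aligned");
      return ByteOffset >> 2;
    }

    static int64_t decodeBranchOffset(int64_t Field) { return Field << 2; }

Unresolved labels skip this shift and go in as bare expressions; the fixup that eventually resolves them applies the same scaling.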
- assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) { - addExpr(Inst, getImm()); - return; - } - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); - } - - void addBranchTarget14Operands(MCInst &Inst, unsigned N) const { - // Branch operands don't encode the low bits, so shift them off - // here. If it's a label, however, just put it on directly as there's - // not enough information now to do anything. - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) { - addExpr(Inst, getImm()); - return; - } - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); - } - - void addFPImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getFPImm())); - } - - void addBarrierOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getBarrier())); - } - - void addSystemRegisterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - if (Kind == k_SystemRegister) - Inst.addOperand(MCOperand::CreateImm(getSystemRegister())); - else { - assert(Kind == k_CPSRField && getCPSRField() == ARM64SYS::cpsr_SPSel); - Inst.addOperand(MCOperand::CreateImm(ARM64SYS::SPSel)); - } - } - - void addSystemCPSRFieldOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getCPSRField())); - } - - void addSysCROperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getSysCR())); - } - - void addPrefetchOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getPrefetch())); - } - - void addShifterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getShifter())); - } - - void addArithmeticShifterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getShifter())); - } - - void addMovImm32ShifterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getShifter())); - } - - void addMovImm64ShifterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getShifter())); - } - - void addAddSubShifterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getShifter())); - } - - void addLogicalVecShifterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getShifter())); - } - - void addLogicalVecHalfWordShifterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getShifter())); - } - - void addMoveVecShifterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getShifter())); - } - - void addExtendOperands(MCInst &Inst, unsigned N) const { - assert(N 
== 1 && "Invalid number of operands!"); - // lsl is an alias for UXTX but will be a parsed as a k_Shifter operand. - if (isShifter()) { - assert(ARM64_AM::getShiftType(getShifter()) == ARM64_AM::LSL); - unsigned imm = getArithExtendImm(ARM64_AM::UXTX, - ARM64_AM::getShiftValue(getShifter())); - Inst.addOperand(MCOperand::CreateImm(imm)); - } else - Inst.addOperand(MCOperand::CreateImm(getExtend())); - } - - void addExtend64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getExtend())); - } - - void addExtendLSL64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - // lsl is an alias for UXTX but will be a parsed as a k_Shifter operand. - if (isShifter()) { - assert(ARM64_AM::getShiftType(getShifter()) == ARM64_AM::LSL); - unsigned imm = getArithExtendImm(ARM64_AM::UXTX, - ARM64_AM::getShiftValue(getShifter())); - Inst.addOperand(MCOperand::CreateImm(imm)); - } else - Inst.addOperand(MCOperand::CreateImm(getExtend())); - } - - void addMemoryRegisterOffsetOperands(MCInst &Inst, unsigned N, bool DoShift) { - assert(N == 3 && "Invalid number of operands!"); - - Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum)); - Inst.addOperand(MCOperand::CreateReg(Mem.OffsetRegNum)); - unsigned ExtendImm = ARM64_AM::getMemExtendImm(Mem.ExtType, DoShift); - Inst.addOperand(MCOperand::CreateImm(ExtendImm)); - } - - void addMemoryRegisterOffset8Operands(MCInst &Inst, unsigned N) { - addMemoryRegisterOffsetOperands(Inst, N, Mem.ExplicitShift); - } - - void addMemoryRegisterOffset16Operands(MCInst &Inst, unsigned N) { - addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 1); - } - - void addMemoryRegisterOffset32Operands(MCInst &Inst, unsigned N) { - addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 2); - } - - void addMemoryRegisterOffset64Operands(MCInst &Inst, unsigned N) { - addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 3); - } - - void addMemoryRegisterOffset128Operands(MCInst &Inst, unsigned N) { - addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 4); - } - - void addMemoryIndexedOperands(MCInst &Inst, unsigned N, - unsigned Scale) const { - // Add the base register operand. - Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum)); - - if (!Mem.OffsetImm) { - // There isn't an offset. - Inst.addOperand(MCOperand::CreateImm(0)); - return; - } - - // Add the offset operand. - if (const MCConstantExpr *CE = dyn_cast(Mem.OffsetImm)) { - assert(CE->getValue() % Scale == 0 && - "Offset operand must be multiple of the scale!"); - - // The MCInst offset operand doesn't include the low bits (like the - // instruction encoding). - Inst.addOperand(MCOperand::CreateImm(CE->getValue() / Scale)); - } - - // If this is a pageoff symrefexpr with an addend, the linker will - // do the scaling of the addend. - // - // Otherwise we don't know what this is, so just add the scaling divide to - // the expression and let the MC fixup evaluation code deal with it. 
- const MCExpr *Expr = Mem.OffsetImm; - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - const MCConstantExpr *Addend; - if (Scale > 1 && - (!ARM64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, - Addend) || - (Addend != 0 && DarwinRefKind != MCSymbolRefExpr::VK_PAGEOFF))) { - Expr = MCBinaryExpr::CreateDiv(Expr, MCConstantExpr::Create(Scale, Ctx), - Ctx); - } - - Inst.addOperand(MCOperand::CreateExpr(Expr)); - } - - void addMemoryUnscaledOperands(MCInst &Inst, unsigned N) const { - assert(N == 2 && isMemoryUnscaled() && "Invalid number of operands!"); - // Add the base register operand. - Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum)); - - // Add the offset operand. - if (!Mem.OffsetImm) - Inst.addOperand(MCOperand::CreateImm(0)); - else { - // Only constant offsets supported. - const MCConstantExpr *CE = cast(Mem.OffsetImm); - Inst.addOperand(MCOperand::CreateImm(CE->getValue())); - } - } - - void addMemoryIndexed128Operands(MCInst &Inst, unsigned N) const { - assert(N == 2 && isMemoryIndexed128() && "Invalid number of operands!"); - addMemoryIndexedOperands(Inst, N, 16); - } - - void addMemoryIndexed64Operands(MCInst &Inst, unsigned N) const { - assert(N == 2 && isMemoryIndexed64() && "Invalid number of operands!"); - addMemoryIndexedOperands(Inst, N, 8); - } - - void addMemoryIndexed32Operands(MCInst &Inst, unsigned N) const { - assert(N == 2 && isMemoryIndexed32() && "Invalid number of operands!"); - addMemoryIndexedOperands(Inst, N, 4); - } - - void addMemoryIndexed16Operands(MCInst &Inst, unsigned N) const { - assert(N == 2 && isMemoryIndexed16() && "Invalid number of operands!"); - addMemoryIndexedOperands(Inst, N, 2); - } - - void addMemoryIndexed8Operands(MCInst &Inst, unsigned N) const { - assert(N == 2 && isMemoryIndexed8() && "Invalid number of operands!"); - addMemoryIndexedOperands(Inst, N, 1); - } - - void addMemoryNoIndexOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && isMemoryNoIndex() && "Invalid number of operands!"); - // Add the base register operand (the offset is always zero, so ignore it). - Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum)); - } - - void addMemorySIMDNoIndexOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && isMemorySIMDNoIndex() && "Invalid number of operands!"); - // Add the base register operand (the offset is always zero, so ignore it). - Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum)); - } - - void addMemoryWritebackIndexedOperands(MCInst &Inst, unsigned N, - unsigned Scale) const { - assert(N == 2 && "Invalid number of operands!"); - - // Add the base register operand. - Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum)); - - // Add the offset operand. 
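Both the indexed and the writeback adders store Offset / Scale in the MCInst, because the instruction field counts elements of the access size rather than bytes; only offsets the is* predicates already vetted reach this division. A standalone sketch (name invented):

    #include <cassert>
    #include <cstdint>

    // Mirrors the constant-offset path of addMemoryIndexedOperands and
    // addMemoryWritebackIndexedOperands: the operand holds element counts.
    static int64_t encodeScaledOffset(int64_t ByteOffset, int64_t Scale) {
      assert(ByteOffset % Scale == 0 &&
             "Offset operand must be a multiple of the scale!");
      return ByteOffset / Scale;
    }

For symbolic offsets the code instead folds a divide-by-scale into the expression and leaves the arithmetic to MC fixup evaluation, as the comment above notes.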
- int64_t Offset = 0; - if (Mem.OffsetImm) { - const MCConstantExpr *CE = dyn_cast(Mem.OffsetImm); - assert(CE && "Non-constant indexed offset operand!"); - Offset = CE->getValue(); - } - - if (Scale != 1) { - assert(Offset % Scale == 0 && - "Offset operand must be a multiple of the scale!"); - Offset /= Scale; - } - - Inst.addOperand(MCOperand::CreateImm(Offset)); - } - - void addMemoryIndexedSImm9Operands(MCInst &Inst, unsigned N) const { - addMemoryWritebackIndexedOperands(Inst, N, 1); - } - - void addMemoryIndexed32SImm7Operands(MCInst &Inst, unsigned N) const { - addMemoryWritebackIndexedOperands(Inst, N, 4); - } - - void addMemoryIndexed64SImm7Operands(MCInst &Inst, unsigned N) const { - addMemoryWritebackIndexedOperands(Inst, N, 8); - } - - void addMemoryIndexed128SImm7Operands(MCInst &Inst, unsigned N) const { - addMemoryWritebackIndexedOperands(Inst, N, 16); - } - - virtual void print(raw_ostream &OS) const; - - static ARM64Operand *CreateToken(StringRef Str, bool IsSuffix, SMLoc S, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Token, Ctx); - Op->Tok.Data = Str.data(); - Op->Tok.Length = Str.size(); - Op->Tok.IsSuffix = IsSuffix; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateReg(unsigned RegNum, bool isVector, SMLoc S, - SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Register, Ctx); - Op->Reg.RegNum = RegNum; - Op->Reg.isVector = isVector; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateVectorList(unsigned RegNum, unsigned Count, - unsigned NumElements, char ElementKind, - SMLoc S, SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_VectorList, Ctx); - Op->VectorList.RegNum = RegNum; - Op->VectorList.Count = Count; - Op->VectorList.NumElements = NumElements; - Op->VectorList.ElementKind = ElementKind; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_VectorIndex, Ctx); - Op->VectorIndex.Val = Idx; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Immediate, Ctx); - Op->Imm.Val = Val; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_FPImm, Ctx); - Op->FPImm.Val = Val; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateBarrier(unsigned Val, SMLoc S, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Barrier, Ctx); - Op->Barrier.Val = Val; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateSystemRegister(uint16_t Val, SMLoc S, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_SystemRegister, Ctx); - Op->SystemRegister.Val = Val; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateCPSRField(ARM64SYS::CPSRField Field, SMLoc S, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_CPSRField, Ctx); - Op->CPSRField.Field = Field; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateMem(unsigned BaseRegNum, const MCExpr *Off, - SMLoc S, SMLoc E, SMLoc OffsetLoc, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Memory, Ctx); - Op->Mem.BaseRegNum = BaseRegNum; - Op->Mem.OffsetRegNum = 0; - 
Op->Mem.OffsetImm = Off; - Op->Mem.ExtType = ARM64_AM::UXTX; - Op->Mem.ShiftVal = 0; - Op->Mem.ExplicitShift = false; - Op->Mem.Mode = ImmediateOffset; - Op->OffsetLoc = OffsetLoc; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateRegOffsetMem(unsigned BaseReg, unsigned OffsetReg, - ARM64_AM::ExtendType ExtType, - unsigned ShiftVal, bool ExplicitShift, - SMLoc S, SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Memory, Ctx); - Op->Mem.BaseRegNum = BaseReg; - Op->Mem.OffsetRegNum = OffsetReg; - Op->Mem.OffsetImm = 0; - Op->Mem.ExtType = ExtType; - Op->Mem.ShiftVal = ShiftVal; - Op->Mem.ExplicitShift = ExplicitShift; - Op->Mem.Mode = RegisterOffset; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateSysCR(unsigned Val, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_SysCR, Ctx); - Op->SysCRImm.Val = Val; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreatePrefetch(unsigned Val, SMLoc S, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Prefetch, Ctx); - Op->Prefetch.Val = Val; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateShifter(ARM64_AM::ShiftType ShOp, unsigned Val, - SMLoc S, SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Shifter, Ctx); - Op->Shifter.Val = ARM64_AM::getShifterImm(ShOp, Val); - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateExtend(ARM64_AM::ExtendType ExtOp, unsigned Val, - SMLoc S, SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Extend, Ctx); - Op->Extend.Val = ARM64_AM::getArithExtendImm(ExtOp, Val); - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } -}; - -} // end anonymous namespace. 
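Every Create* factory above follows the same recipe: allocate the tagged-union operand, fill in exactly the fields of the active alternative, and stamp the source range. A stripped-down illustration of the pattern, independent of the LLVM types (everything here is invented for the example):

    // A miniature tagged-union operand in the style of ARM64Operand.
    struct MiniOperand {
      enum KindTy { Reg, Imm } Kind;
      unsigned Start, End; // source locations, simplified to offsets
      union {
        unsigned RegNum;
        long long ImmVal;
      };

      static MiniOperand createReg(unsigned RegNum, unsigned S, unsigned E) {
        MiniOperand Op;
        Op.Kind = Reg;
        Op.RegNum = RegNum;
        Op.Start = S;
        Op.End = E;
        return Op;
      }
    };

The real class heap-allocates with new and carries an MCContext, but the key property is the same: one factory per kind assigns every field of that kind, which is what makes the union safe to read back through the getters.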
- -void ARM64Operand::print(raw_ostream &OS) const { - switch (Kind) { - case k_FPImm: - OS << ""; - break; - case k_Barrier: { - const char *Name = - ARM64SYS::getBarrierOptName((ARM64SYS::BarrierOption)getBarrier()); - OS << ""; - break; - } - case k_SystemRegister: { - const char *Name = ARM64SYS::getSystemRegisterName( - (ARM64SYS::SystemRegister)getSystemRegister()); - OS << ""; - break; - } - case k_CPSRField: { - const char *Name = ARM64SYS::getCPSRFieldName(getCPSRField()); - OS << ""; - break; - } - case k_Immediate: - getImm()->print(OS); - break; - case k_Memory: - OS << ""; - break; - case k_Register: - OS << ""; - break; - case k_VectorList: { - OS << ""; - break; - } - case k_VectorIndex: - OS << ""; - break; - case k_Token: - OS << "'" << getToken() << "'"; - break; - case k_SysCR: - OS << "c" << getSysCR(); - break; - case k_Prefetch: - OS << ""; - break; - case k_Shifter: { - unsigned Val = getShifter(); - OS << "<" << ARM64_AM::getShiftName(ARM64_AM::getShiftType(Val)) << " #" - << ARM64_AM::getShiftValue(Val) << ">"; - break; - } - case k_Extend: { - unsigned Val = getExtend(); - OS << "<" << ARM64_AM::getExtendName(ARM64_AM::getArithExtendType(Val)) - << " #" << ARM64_AM::getArithShiftValue(Val) << ">"; - break; - } - } -} - -/// @name Auto-generated Match Functions -/// { - -static unsigned MatchRegisterName(StringRef Name); - -/// } - -static unsigned matchVectorRegName(StringRef Name) { - return StringSwitch(Name) - .Case("v0", ARM64::Q0) - .Case("v1", ARM64::Q1) - .Case("v2", ARM64::Q2) - .Case("v3", ARM64::Q3) - .Case("v4", ARM64::Q4) - .Case("v5", ARM64::Q5) - .Case("v6", ARM64::Q6) - .Case("v7", ARM64::Q7) - .Case("v8", ARM64::Q8) - .Case("v9", ARM64::Q9) - .Case("v10", ARM64::Q10) - .Case("v11", ARM64::Q11) - .Case("v12", ARM64::Q12) - .Case("v13", ARM64::Q13) - .Case("v14", ARM64::Q14) - .Case("v15", ARM64::Q15) - .Case("v16", ARM64::Q16) - .Case("v17", ARM64::Q17) - .Case("v18", ARM64::Q18) - .Case("v19", ARM64::Q19) - .Case("v20", ARM64::Q20) - .Case("v21", ARM64::Q21) - .Case("v22", ARM64::Q22) - .Case("v23", ARM64::Q23) - .Case("v24", ARM64::Q24) - .Case("v25", ARM64::Q25) - .Case("v26", ARM64::Q26) - .Case("v27", ARM64::Q27) - .Case("v28", ARM64::Q28) - .Case("v29", ARM64::Q29) - .Case("v30", ARM64::Q30) - .Case("v31", ARM64::Q31) - .Default(0); -} - -static bool isValidVectorKind(StringRef Name) { - return StringSwitch(Name.lower()) - .Case(".8b", true) - .Case(".16b", true) - .Case(".4h", true) - .Case(".8h", true) - .Case(".2s", true) - .Case(".4s", true) - .Case(".1d", true) - .Case(".2d", true) - .Case(".1q", true) - // Accept the width neutral ones, too, for verbose syntax. If those - // aren't used in the right places, the token operand won't match so - // all will work out. 
- .Case(".b", true) - .Case(".h", true) - .Case(".s", true) - .Case(".d", true) - .Default(false); -} - -static void parseValidVectorKind(StringRef Name, unsigned &NumElements, - char &ElementKind) { - assert(isValidVectorKind(Name)); - - ElementKind = Name.lower()[Name.size() - 1]; - NumElements = 0; - - if (Name.size() == 2) - return; - - // Parse the lane count - Name = Name.drop_front(); - while (isdigit(Name.front())) { - NumElements = 10 * NumElements + (Name.front() - '0'); - Name = Name.drop_front(); - } -} - -bool ARM64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, - SMLoc &EndLoc) { - StartLoc = getLoc(); - RegNo = tryParseRegister(); - EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1); - return (RegNo == (unsigned)-1); -} - -/// tryParseRegister - Try to parse a register name. The token must be an -/// Identifier when called, and if it is a register name the token is eaten and -/// the register is added to the operand list. -int ARM64AsmParser::tryParseRegister() { - const AsmToken &Tok = Parser.getTok(); - assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); - - std::string lowerCase = Tok.getString().lower(); - unsigned RegNum = MatchRegisterName(lowerCase); - // Also handle a few aliases of registers. - if (RegNum == 0) - RegNum = StringSwitch(lowerCase) - .Case("x29", ARM64::FP) - .Case("x30", ARM64::LR) - .Case("x31", ARM64::XZR) - .Case("w31", ARM64::WZR) - .Default(0); - - if (RegNum == 0) - return -1; - - Parser.Lex(); // Eat identifier token. - return RegNum; -} - -/// tryMatchVectorRegister - Try to parse a vector register name with optional -/// kind specifier. If it is a register specifier, eat the token and return it. -int ARM64AsmParser::tryMatchVectorRegister(StringRef &Kind) { - if (Parser.getTok().isNot(AsmToken::Identifier)) { - TokError("vector register expected"); - return -1; - } - - StringRef Name = Parser.getTok().getString(); - // If there is a kind specifier, it's separated from the register name by - // a '.'. - size_t Start = 0, Next = Name.find('.'); - StringRef Head = Name.slice(Start, Next); - unsigned RegNum = matchVectorRegName(Head); - if (RegNum) { - if (Next != StringRef::npos) { - Kind = Name.slice(Next, StringRef::npos); - if (!isValidVectorKind(Kind)) { - TokError("invalid vector kind qualifier"); - return -1; - } - } - Parser.Lex(); // Eat the register token. - return RegNum; - } - return -1; -} - -static int MatchSysCRName(StringRef Name) { - // Use the same layout as the tablegen'erated register name matcher. Ugly, - // but efficient. - switch (Name.size()) { - default: - break; - case 2: - if (Name[0] != 'c' && Name[0] != 'C') - return -1; - switch (Name[1]) { - default: - return -1; - case '0': - return 0; - case '1': - return 1; - case '2': - return 2; - case '3': - return 3; - case '4': - return 4; - case '5': - return 5; - case '6': - return 6; - case '7': - return 7; - case '8': - return 8; - case '9': - return 9; - } - break; - case 3: - if ((Name[0] != 'c' && Name[0] != 'C') || Name[1] != '1') - return -1; - switch (Name[2]) { - default: - return -1; - case '0': - return 10; - case '1': - return 11; - case '2': - return 12; - case '3': - return 13; - case '4': - return 14; - case '5': - return 15; - } - break; - } - - llvm_unreachable("Unhandled SysCR operand string!"); - return -1; -} - -/// tryParseSysCROperand - Try to parse a system instruction CR operand name. 
-ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseSysCROperand(OperandVector &Operands) { - SMLoc S = getLoc(); - const AsmToken &Tok = Parser.getTok(); - if (Tok.isNot(AsmToken::Identifier)) - return MatchOperand_NoMatch; - - int Num = MatchSysCRName(Tok.getString()); - if (Num == -1) - return MatchOperand_NoMatch; - - Parser.Lex(); // Eat identifier token. - Operands.push_back(ARM64Operand::CreateSysCR(Num, S, getLoc(), getContext())); - return MatchOperand_Success; -} - -/// tryParsePrefetch - Try to parse a prefetch operand. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParsePrefetch(OperandVector &Operands) { - SMLoc S = getLoc(); - const AsmToken &Tok = Parser.getTok(); - // Either an identifier for named values or a 5-bit immediate. - if (Tok.is(AsmToken::Hash)) { - Parser.Lex(); // Eat hash token. - const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return MatchOperand_ParseFail; - - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - TokError("immediate value expected for prefetch operand"); - return MatchOperand_ParseFail; - } - unsigned prfop = MCE->getValue(); - if (prfop > 31) { - TokError("prefetch operand out of range, [0,31] expected"); - return MatchOperand_ParseFail; - } - - Operands.push_back(ARM64Operand::CreatePrefetch(prfop, S, getContext())); - return MatchOperand_Success; - } - - if (Tok.isNot(AsmToken::Identifier)) { - TokError("pre-fetch hint expected"); - return MatchOperand_ParseFail; - } - - unsigned prfop = StringSwitch(Tok.getString()) - .Case("pldl1keep", ARM64_AM::PLDL1KEEP) - .Case("pldl1strm", ARM64_AM::PLDL1STRM) - .Case("pldl2keep", ARM64_AM::PLDL2KEEP) - .Case("pldl2strm", ARM64_AM::PLDL2STRM) - .Case("pldl3keep", ARM64_AM::PLDL3KEEP) - .Case("pldl3strm", ARM64_AM::PLDL3STRM) - .Case("pstl1keep", ARM64_AM::PSTL1KEEP) - .Case("pstl1strm", ARM64_AM::PSTL1STRM) - .Case("pstl2keep", ARM64_AM::PSTL2KEEP) - .Case("pstl2strm", ARM64_AM::PSTL2STRM) - .Case("pstl3keep", ARM64_AM::PSTL3KEEP) - .Case("pstl3strm", ARM64_AM::PSTL3STRM) - .Default(0xff); - if (prfop == 0xff) { - TokError("pre-fetch hint expected"); - return MatchOperand_ParseFail; - } - - Parser.Lex(); // Eat identifier token. - Operands.push_back(ARM64Operand::CreatePrefetch(prfop, S, getContext())); - return MatchOperand_Success; -} - -/// tryParseAdrpLabel - Parse and validate a source label for the ADRP -/// instruction. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { - SMLoc S = getLoc(); - const MCExpr *Expr; - if (parseSymbolicImmVal(Expr)) - return MatchOperand_ParseFail; - - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - const MCConstantExpr *Addend; - if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { - Error(S, "modified label reference + constant expected"); - return MatchOperand_ParseFail; - } - - if (DarwinRefKind == MCSymbolRefExpr::VK_None && - ELFRefKind == ARM64MCExpr::VK_INVALID) { - // No modifier was specified at all; this is the syntax for an ELF basic - // ADRP relocation (unfortunately). 
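The numeric form accepted by tryParsePrefetch is the raw 5-bit prfop field. Architecturally it decomposes into a type, a target cache level, and a retention policy, which is where names like pldl1keep come from. A standalone sketch of the composition (helper invented; field layout per the ARMv8 ARM):

    // prfop = type:2 (0 PLD, 1 PLI, 2 PST) | level:2 (0 L1, 1 L2, 2 L3)
    //         | policy:1 (0 KEEP, 1 STRM), packed high bits to low.
    static unsigned makePrfOp(unsigned Type, unsigned Level, unsigned Policy) {
      return (Type << 3) | (Level << 1) | Policy;
    }

makePrfOp(0, 0, 0) is 0 (pldl1keep) and makePrfOp(2, 0, 0) is 16 (pstl1keep), consistent with the architectural encodings behind the named table above.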
- Expr = ARM64MCExpr::Create(Expr, ARM64MCExpr::VK_ABS_PAGE, getContext()); - } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) && - Addend != 0) { - Error(S, "gotpage label reference not allowed an addend"); - return MatchOperand_ParseFail; - } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE && - DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE && - DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE && - ELFRefKind != ARM64MCExpr::VK_GOT_PAGE && - ELFRefKind != ARM64MCExpr::VK_GOTTPREL_PAGE && - ELFRefKind != ARM64MCExpr::VK_TLSDESC_PAGE) { - // The operand must be an @page or @gotpage qualified symbolref. - Error(S, "page or gotpage label reference expected"); - return MatchOperand_ParseFail; - } - - // We have a label reference possibly with addend. The addend is a raw value - // here. The linker will adjust it to only reference the page. - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext())); - - return MatchOperand_Success; -} - -/// tryParseAdrLabel - Parse and validate a source label for the ADR -/// instruction. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseAdrLabel(OperandVector &Operands) { - SMLoc S = getLoc(); - const MCExpr *Expr; - if (getParser().parseExpression(Expr)) - return MatchOperand_ParseFail; - - // The operand must be an un-qualified assembler local symbolref. - // FIXME: wrong for ELF. - if (const MCSymbolRefExpr *SRE = dyn_cast(Expr)) { - // FIXME: Should reference the MachineAsmInfo to get the private prefix. - bool isTemporary = SRE->getSymbol().getName().startswith("L"); - if (!isTemporary || SRE->getKind() != MCSymbolRefExpr::VK_None) { - Error(S, "unqualified, assembler-local label name expected"); - return MatchOperand_ParseFail; - } - } - - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext())); - - return MatchOperand_Success; -} - -/// tryParseFPImm - A floating point immediate expression operand. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseFPImm(OperandVector &Operands) { - SMLoc S = getLoc(); - - if (Parser.getTok().isNot(AsmToken::Hash)) - return MatchOperand_NoMatch; - Parser.Lex(); // Eat the '#'. - - // Handle negation, as that still comes through as a separate token. - bool isNegative = false; - if (Parser.getTok().is(AsmToken::Minus)) { - isNegative = true; - Parser.Lex(); - } - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Real)) { - APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); - uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - // If we had a '-' in front, toggle the sign bit. - IntVal ^= (uint64_t)isNegative << 63; - int Val = ARM64_AM::getFP64Imm(APInt(64, IntVal)); - Parser.Lex(); // Eat the token. - // Check for out of range values. As an exception, we let Zero through, - // as we handle that special case in post-processing before matching in - // order to use the zero register for it. 
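ADRP materializes the 4 KiB page of a symbol, so what ends up in the instruction is a page delta, not a byte offset; as the comment in tryParseAdrpLabel says, the parser keeps the raw addend and leaves the page math to the linker or fixup. The arithmetic itself is just masking, sketched standalone:

    #include <cstdint>

    static uint64_t pageOf(uint64_t Addr) { return Addr & ~UINT64_C(0xFFF); }

    // The 21-bit ADRP immediate is the signed delta between the target's
    // page and the instruction's own page, in units of 4 KiB.
    static int64_t adrpImm(uint64_t PC, uint64_t Target) {
      return (int64_t)(pageOf(Target) - pageOf(PC)) >> 12;
    }

The @pageoff counterpart is the complementary low 12 bits, Target & 0xFFF.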
- if (Val == -1 && !RealVal.isZero()) { - TokError("floating point value out of range"); - return MatchOperand_ParseFail; - } - Operands.push_back(ARM64Operand::CreateFPImm(Val, S, getContext())); - return MatchOperand_Success; - } - if (Tok.is(AsmToken::Integer)) { - int64_t Val; - if (!isNegative && Tok.getString().startswith("0x")) { - Val = Tok.getIntVal(); - if (Val > 255 || Val < 0) { - TokError("encoded floating point value out of range"); - return MatchOperand_ParseFail; - } - } else { - APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); - uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - // If we had a '-' in front, toggle the sign bit. - IntVal ^= (uint64_t)isNegative << 63; - Val = ARM64_AM::getFP64Imm(APInt(64, IntVal)); - } - Parser.Lex(); // Eat the token. - Operands.push_back(ARM64Operand::CreateFPImm(Val, S, getContext())); - return MatchOperand_Success; - } - - TokError("invalid floating point immediate"); - return MatchOperand_ParseFail; -} - -/// parseCondCodeString - Parse a Condition Code string. -unsigned ARM64AsmParser::parseCondCodeString(StringRef Cond) { - unsigned CC = StringSwitch(Cond) - .Case("eq", ARM64CC::EQ) - .Case("ne", ARM64CC::NE) - .Case("cs", ARM64CC::CS) - .Case("hs", ARM64CC::CS) - .Case("cc", ARM64CC::CC) - .Case("lo", ARM64CC::CC) - .Case("mi", ARM64CC::MI) - .Case("pl", ARM64CC::PL) - .Case("vs", ARM64CC::VS) - .Case("vc", ARM64CC::VC) - .Case("hi", ARM64CC::HI) - .Case("ls", ARM64CC::LS) - .Case("ge", ARM64CC::GE) - .Case("lt", ARM64CC::LT) - .Case("gt", ARM64CC::GT) - .Case("le", ARM64CC::LE) - .Case("al", ARM64CC::AL) - // Upper case works too. Not mixed case, though. - .Case("EQ", ARM64CC::EQ) - .Case("NE", ARM64CC::NE) - .Case("CS", ARM64CC::CS) - .Case("HS", ARM64CC::CS) - .Case("CC", ARM64CC::CC) - .Case("LO", ARM64CC::CC) - .Case("MI", ARM64CC::MI) - .Case("PL", ARM64CC::PL) - .Case("VS", ARM64CC::VS) - .Case("VC", ARM64CC::VC) - .Case("HI", ARM64CC::HI) - .Case("LS", ARM64CC::LS) - .Case("GE", ARM64CC::GE) - .Case("LT", ARM64CC::LT) - .Case("GT", ARM64CC::GT) - .Case("LE", ARM64CC::LE) - .Case("AL", ARM64CC::AL) - .Default(~0U); - return CC; -} - -/// parseCondCode - Parse a Condition Code operand. -bool ARM64AsmParser::parseCondCode(OperandVector &Operands, - bool invertCondCode) { - SMLoc S = getLoc(); - const AsmToken &Tok = Parser.getTok(); - assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); - - StringRef Cond = Tok.getString(); - unsigned CC = parseCondCodeString(Cond); - if (CC == ~0U) - return TokError("invalid condition code"); - Parser.Lex(); // Eat identifier token. - - if (invertCondCode) - CC = ARM64CC::getInvertedCondCode(ARM64CC::CondCode(CC)); - - const MCExpr *CCExpr = MCConstantExpr::Create(CC, getContext()); - Operands.push_back( - ARM64Operand::CreateImm(CCExpr, S, getLoc(), getContext())); - return false; -} - -/// ParseOptionalShift - Some operands take an optional shift argument. Parse -/// them if present. 
-bool ARM64AsmParser::parseOptionalShift(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - ARM64_AM::ShiftType ShOp = StringSwitch(Tok.getString()) - .Case("lsl", ARM64_AM::LSL) - .Case("lsr", ARM64_AM::LSR) - .Case("asr", ARM64_AM::ASR) - .Case("ror", ARM64_AM::ROR) - .Case("msl", ARM64_AM::MSL) - .Case("LSL", ARM64_AM::LSL) - .Case("LSR", ARM64_AM::LSR) - .Case("ASR", ARM64_AM::ASR) - .Case("ROR", ARM64_AM::ROR) - .Case("MSL", ARM64_AM::MSL) - .Default(ARM64_AM::InvalidShift); - if (ShOp == ARM64_AM::InvalidShift) - return true; - - SMLoc S = Tok.getLoc(); - Parser.Lex(); - - // We expect a number here. - if (getLexer().isNot(AsmToken::Hash)) - return TokError("immediate value expected for shifter operand"); - Parser.Lex(); // Eat the '#'. - - SMLoc ExprLoc = getLoc(); - const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return true; - - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) - return TokError("immediate value expected for shifter operand"); - - if ((MCE->getValue() & 0x3f) != MCE->getValue()) - return Error(ExprLoc, "immediate value too large for shifter operand"); - - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back( - ARM64Operand::CreateShifter(ShOp, MCE->getValue(), S, E, getContext())); - return false; -} - -/// parseOptionalExtend - Some operands take an optional extend argument. Parse -/// them if present. -bool ARM64AsmParser::parseOptionalExtend(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - ARM64_AM::ExtendType ExtOp = - StringSwitch(Tok.getString()) - .Case("uxtb", ARM64_AM::UXTB) - .Case("uxth", ARM64_AM::UXTH) - .Case("uxtw", ARM64_AM::UXTW) - .Case("uxtx", ARM64_AM::UXTX) - .Case("lsl", ARM64_AM::UXTX) // Alias for UXTX - .Case("sxtb", ARM64_AM::SXTB) - .Case("sxth", ARM64_AM::SXTH) - .Case("sxtw", ARM64_AM::SXTW) - .Case("sxtx", ARM64_AM::SXTX) - .Case("UXTB", ARM64_AM::UXTB) - .Case("UXTH", ARM64_AM::UXTH) - .Case("UXTW", ARM64_AM::UXTW) - .Case("UXTX", ARM64_AM::UXTX) - .Case("LSL", ARM64_AM::UXTX) // Alias for UXTX - .Case("SXTB", ARM64_AM::SXTB) - .Case("SXTH", ARM64_AM::SXTH) - .Case("SXTW", ARM64_AM::SXTW) - .Case("SXTX", ARM64_AM::SXTX) - .Default(ARM64_AM::InvalidExtend); - if (ExtOp == ARM64_AM::InvalidExtend) - return true; - - SMLoc S = Tok.getLoc(); - Parser.Lex(); - - if (getLexer().is(AsmToken::EndOfStatement) || - getLexer().is(AsmToken::Comma)) { - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back( - ARM64Operand::CreateExtend(ExtOp, 0, S, E, getContext())); - return false; - } - - if (getLexer().isNot(AsmToken::Hash)) { - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back( - ARM64Operand::CreateExtend(ExtOp, 0, S, E, getContext())); - return false; - } - - Parser.Lex(); // Eat the '#'. - - const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return true; - - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) - return TokError("immediate value expected for extend operand"); - - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back( - ARM64Operand::CreateExtend(ExtOp, MCE->getValue(), S, E, getContext())); - return false; -} - -/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for -/// the SYS instruction. Parse them specially so that we create a SYS MCInst. 
-bool ARM64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
-                                   OperandVector &Operands) {
-  if (Name.find('.') != StringRef::npos)
-    return TokError("invalid operand");
-
-  Mnemonic = Name;
-  Operands.push_back(
-      ARM64Operand::CreateToken("sys", false, NameLoc, getContext()));
-
-  const AsmToken &Tok = Parser.getTok();
-  StringRef Op = Tok.getString();
-  SMLoc S = Tok.getLoc();
-
-  const MCExpr *Expr = 0;
-
-#define SYS_ALIAS(op1, Cn, Cm, op2)                                    \
-  do {                                                                 \
-    Expr = MCConstantExpr::Create(op1, getContext());                  \
-    Operands.push_back(                                                \
-        ARM64Operand::CreateImm(Expr, S, getLoc(), getContext()));     \
-    Operands.push_back(                                                \
-        ARM64Operand::CreateSysCR(Cn, S, getLoc(), getContext()));     \
-    Operands.push_back(                                                \
-        ARM64Operand::CreateSysCR(Cm, S, getLoc(), getContext()));     \
-    Expr = MCConstantExpr::Create(op2, getContext());                  \
-    Operands.push_back(                                                \
-        ARM64Operand::CreateImm(Expr, S, getLoc(), getContext()));     \
-  } while (0)
-
-  if (Mnemonic == "ic") {
-    if (!Op.compare_lower("ialluis")) {
-      // SYS #0, C7, C1, #0
-      SYS_ALIAS(0, 7, 1, 0);
-    } else if (!Op.compare_lower("iallu")) {
-      // SYS #0, C7, C5, #0
-      SYS_ALIAS(0, 7, 5, 0);
-    } else if (!Op.compare_lower("ivau")) {
-      // SYS #3, C7, C5, #1
-      SYS_ALIAS(3, 7, 5, 1);
-    } else {
-      return TokError("invalid operand for IC instruction");
-    }
-  } else if (Mnemonic == "dc") {
-    if (!Op.compare_lower("zva")) {
-      // SYS #3, C7, C4, #1
-      SYS_ALIAS(3, 7, 4, 1);
-    } else if (!Op.compare_lower("ivac")) {
-      // SYS #0, C7, C6, #1
-      SYS_ALIAS(0, 7, 6, 1);
-    } else if (!Op.compare_lower("isw")) {
-      // SYS #0, C7, C6, #2
-      SYS_ALIAS(0, 7, 6, 2);
-    } else if (!Op.compare_lower("cvac")) {
-      // SYS #3, C7, C10, #1
-      SYS_ALIAS(3, 7, 10, 1);
-    } else if (!Op.compare_lower("csw")) {
-      // SYS #0, C7, C10, #2
-      SYS_ALIAS(0, 7, 10, 2);
-    } else if (!Op.compare_lower("cvau")) {
-      // SYS #3, C7, C11, #1
-      SYS_ALIAS(3, 7, 11, 1);
-    } else if (!Op.compare_lower("civac")) {
-      // SYS #3, C7, C14, #1
-      SYS_ALIAS(3, 7, 14, 1);
-    } else if (!Op.compare_lower("cisw")) {
-      // SYS #0, C7, C14, #2
-      SYS_ALIAS(0, 7, 14, 2);
-    } else {
-      return TokError("invalid operand for DC instruction");
-    }
-  } else if (Mnemonic == "at") {
-    if (!Op.compare_lower("s1e1r")) {
-      // SYS #0, C7, C8, #0
-      SYS_ALIAS(0, 7, 8, 0);
-    } else if (!Op.compare_lower("s1e2r")) {
-      // SYS #4, C7, C8, #0
-      SYS_ALIAS(4, 7, 8, 0);
-    } else if (!Op.compare_lower("s1e3r")) {
-      // SYS #6, C7, C8, #0
-      SYS_ALIAS(6, 7, 8, 0);
-    } else if (!Op.compare_lower("s1e1w")) {
-      // SYS #0, C7, C8, #1
-      SYS_ALIAS(0, 7, 8, 1);
-    } else if (!Op.compare_lower("s1e2w")) {
-      // SYS #4, C7, C8, #1
-      SYS_ALIAS(4, 7, 8, 1);
-    } else if (!Op.compare_lower("s1e3w")) {
-      // SYS #6, C7, C8, #1
-      SYS_ALIAS(6, 7, 8, 1);
-    } else if (!Op.compare_lower("s1e0r")) {
-      // SYS #0, C7, C8, #2
-      SYS_ALIAS(0, 7, 8, 2);
-    } else if (!Op.compare_lower("s1e0w")) {
-      // SYS #0, C7, C8, #3
-      SYS_ALIAS(0, 7, 8, 3);
-    } else if (!Op.compare_lower("s12e1r")) {
-      // SYS #4, C7, C8, #4
-      SYS_ALIAS(4, 7, 8, 4);
-    } else if (!Op.compare_lower("s12e1w")) {
-      // SYS #4, C7, C8, #5
-      SYS_ALIAS(4, 7, 8, 5);
-    } else if (!Op.compare_lower("s12e0r")) {
-      // SYS #4, C7, C8, #6
-      SYS_ALIAS(4, 7, 8, 6);
-    } else if (!Op.compare_lower("s12e0w")) {
-      // SYS #4, C7, C8, #7
-      SYS_ALIAS(4, 7, 8, 7);
-    } else {
-      return TokError("invalid operand for AT instruction");
-    }
-  } else if (Mnemonic == "tlbi") {
-    if (!Op.compare_lower("vmalle1is")) {
-      // SYS #0, C8, C3, #0
-      SYS_ALIAS(0, 8, 3, 0);
-    } else if (!Op.compare_lower("alle2is")) {
-      // SYS #4, C8, C3, #0
-      SYS_ALIAS(4,
8, 3, 0); - } else if (!Op.compare_lower("alle3is")) { - // SYS #6, C8, C3, #0 - SYS_ALIAS(6, 8, 3, 0); - } else if (!Op.compare_lower("vae1is")) { - // SYS #0, C8, C3, #1 - SYS_ALIAS(0, 8, 3, 1); - } else if (!Op.compare_lower("vae2is")) { - // SYS #4, C8, C3, #1 - SYS_ALIAS(4, 8, 3, 1); - } else if (!Op.compare_lower("vae3is")) { - // SYS #6, C8, C3, #1 - SYS_ALIAS(6, 8, 3, 1); - } else if (!Op.compare_lower("aside1is")) { - // SYS #0, C8, C3, #2 - SYS_ALIAS(0, 8, 3, 2); - } else if (!Op.compare_lower("vaae1is")) { - // SYS #0, C8, C3, #3 - SYS_ALIAS(0, 8, 3, 3); - } else if (!Op.compare_lower("alle1is")) { - // SYS #4, C8, C3, #4 - SYS_ALIAS(4, 8, 3, 4); - } else if (!Op.compare_lower("vale1is")) { - // SYS #0, C8, C3, #5 - SYS_ALIAS(0, 8, 3, 5); - } else if (!Op.compare_lower("vaale1is")) { - // SYS #0, C8, C3, #7 - SYS_ALIAS(0, 8, 3, 7); - } else if (!Op.compare_lower("vmalle1")) { - // SYS #0, C8, C7, #0 - SYS_ALIAS(0, 8, 7, 0); - } else if (!Op.compare_lower("alle2")) { - // SYS #4, C8, C7, #0 - SYS_ALIAS(4, 8, 7, 0); - } else if (!Op.compare_lower("vale2is")) { - // SYS #4, C8, C3, #5 - SYS_ALIAS(4, 8, 3, 5); - } else if (!Op.compare_lower("vale3is")) { - // SYS #6, C8, C3, #5 - SYS_ALIAS(6, 8, 3, 5); - } else if (!Op.compare_lower("alle3")) { - // SYS #6, C8, C7, #0 - SYS_ALIAS(6, 8, 7, 0); - } else if (!Op.compare_lower("vae1")) { - // SYS #0, C8, C7, #1 - SYS_ALIAS(0, 8, 7, 1); - } else if (!Op.compare_lower("vae2")) { - // SYS #4, C8, C7, #1 - SYS_ALIAS(4, 8, 7, 1); - } else if (!Op.compare_lower("vae3")) { - // SYS #6, C8, C7, #1 - SYS_ALIAS(6, 8, 7, 1); - } else if (!Op.compare_lower("aside1")) { - // SYS #0, C8, C7, #2 - SYS_ALIAS(0, 8, 7, 2); - } else if (!Op.compare_lower("vaae1")) { - // SYS #0, C8, C7, #3 - SYS_ALIAS(0, 8, 7, 3); - } else if (!Op.compare_lower("alle1")) { - // SYS #4, C8, C7, #4 - SYS_ALIAS(4, 8, 7, 4); - } else if (!Op.compare_lower("vale1")) { - // SYS #0, C8, C7, #5 - SYS_ALIAS(0, 8, 7, 5); - } else if (!Op.compare_lower("vale2")) { - // SYS #4, C8, C7, #5 - SYS_ALIAS(4, 8, 7, 5); - } else if (!Op.compare_lower("vale3")) { - // SYS #6, C8, C7, #5 - SYS_ALIAS(6, 8, 7, 5); - } else if (!Op.compare_lower("vaale1")) { - // SYS #0, C8, C7, #7 - SYS_ALIAS(0, 8, 7, 7); - } else if (!Op.compare_lower("ipas2e1")) { - // SYS #4, C8, C4, #1 - SYS_ALIAS(4, 8, 4, 1); - } else if (!Op.compare_lower("ipas2le1")) { - // SYS #4, C8, C4, #5 - SYS_ALIAS(4, 8, 4, 5); - } else if (!Op.compare_lower("vmalls12e1")) { - // SYS #4, C8, C7, #6 - SYS_ALIAS(4, 8, 7, 6); - } else if (!Op.compare_lower("vmalls12e1is")) { - // SYS #4, C8, C3, #6 - SYS_ALIAS(4, 8, 3, 6); - } else { - return TokError("invalid operand for TLBI instruction"); - } - } - -#undef SYS_ALIAS - - Parser.Lex(); // Eat operand. - - // Check for the optional register operand. - if (getLexer().is(AsmToken::Comma)) { - Parser.Lex(); // Eat comma. - - if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands)) - return TokError("expected register operand"); - } - - if (getLexer().isNot(AsmToken::EndOfStatement)) { - Parser.eatToEndOfStatement(); - return TokError("unexpected token in argument list"); - } - - Parser.Lex(); // Consume the EndOfStatement - return false; -} - -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - - // Can be either a #imm style literal or an option name - if (Tok.is(AsmToken::Hash)) { - // Immediate operand. 
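Every branch of parseSysAlias reduces an IC/DC/AT/TLBI operand name to a fixed (op1, Cn, Cm, op2) tuple for the underlying SYS instruction. The same mapping can be written as data instead of control flow; a standalone sketch with a few of the DC entries from above (struct and lookup are illustrative only, not this parser's code):

    #include <cstring>

    struct SysAliasEntry {
      const char *Name;
      unsigned Op1, Cn, Cm, Op2;
    };

    static const SysAliasEntry DCAliases[] = {
        {"zva", 3, 7, 4, 1},    {"ivac", 0, 7, 6, 1},  {"isw", 0, 7, 6, 2},
        {"cvac", 3, 7, 10, 1},  {"csw", 0, 7, 10, 2},  {"cvau", 3, 7, 11, 1},
        {"civac", 3, 7, 14, 1}, {"cisw", 0, 7, 14, 2},
    };

    static const SysAliasEntry *lookupDCAlias(const char *Name) {
      for (const SysAliasEntry &E : DCAliases)
        if (std::strcmp(E.Name, Name) == 0) // case folding omitted here
          return &E;
      return nullptr;
    }

A table keeps each name and its encoding side by side, which the long if/else chain makes easy to get out of sync, as the stale comments in the dc ivac and at s1e0r branches showed.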
- Parser.Lex(); // Eat the '#' - const MCExpr *ImmVal; - SMLoc ExprLoc = getLoc(); - if (getParser().parseExpression(ImmVal)) - return MatchOperand_ParseFail; - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - Error(ExprLoc, "immediate value expected for barrier operand"); - return MatchOperand_ParseFail; - } - if (MCE->getValue() < 0 || MCE->getValue() > 15) { - Error(ExprLoc, "barrier operand out of range"); - return MatchOperand_ParseFail; - } - Operands.push_back( - ARM64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext())); - return MatchOperand_Success; - } - - if (Tok.isNot(AsmToken::Identifier)) { - TokError("invalid operand for instruction"); - return MatchOperand_ParseFail; - } - - unsigned Opt = StringSwitch(Tok.getString()) - .Case("oshld", ARM64SYS::OSHLD) - .Case("oshst", ARM64SYS::OSHST) - .Case("osh", ARM64SYS::OSH) - .Case("nshld", ARM64SYS::NSHLD) - .Case("nshst", ARM64SYS::NSHST) - .Case("nsh", ARM64SYS::NSH) - .Case("ishld", ARM64SYS::ISHLD) - .Case("ishst", ARM64SYS::ISHST) - .Case("ish", ARM64SYS::ISH) - .Case("ld", ARM64SYS::LD) - .Case("st", ARM64SYS::ST) - .Case("sy", ARM64SYS::SY) - .Default(ARM64SYS::InvalidBarrier); - if (Opt == ARM64SYS::InvalidBarrier) { - TokError("invalid barrier option name"); - return MatchOperand_ParseFail; - } - - // The only valid named option for ISB is 'sy' - if (Mnemonic == "isb" && Opt != ARM64SYS::SY) { - TokError("'sy' or #imm operand expected"); - return MatchOperand_ParseFail; - } - - Operands.push_back(ARM64Operand::CreateBarrier(Opt, getLoc(), getContext())); - Parser.Lex(); // Consume the option - - return MatchOperand_Success; -} - -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseSystemRegister(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - - // It can be specified as a symbolic name. 
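The barrier immediate is the 4-bit CRm field of DMB/DSB, which is why tryParseBarrierOperand rejects anything outside [0, 15]; the named options are mnemonics for particular values of that field. A standalone sketch of the architectural name-to-value mapping (helper invented):

    #include <string>

    // DMB/DSB option values per the ARMv8 ARM; "sy" (0b1111) is the
    // full-system barrier that a bare DMB/DSB also assembles to.
    static int barrierOptionValue(const std::string &Name) {
      if (Name == "oshld") return 1;
      if (Name == "oshst") return 2;
      if (Name == "osh")   return 3;
      if (Name == "nshld") return 5;
      if (Name == "nshst") return 6;
      if (Name == "nsh")   return 7;
      if (Name == "ishld") return 9;
      if (Name == "ishst") return 10;
      if (Name == "ish")   return 11;
      if (Name == "ld")    return 13;
      if (Name == "st")    return 14;
      if (Name == "sy")    return 15;
      return -1; // out-of-range marker, in the spirit of InvalidBarrier
    }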
- if (Tok.isNot(AsmToken::Identifier)) - return MatchOperand_NoMatch; - - auto ID = Tok.getString().lower(); - ARM64SYS::SystemRegister Reg = - StringSwitch(ID) - .Case("spsr_el1", ARM64SYS::SPSR_svc) - .Case("spsr_svc", ARM64SYS::SPSR_svc) - .Case("elr_el1", ARM64SYS::ELR_EL1) - .Case("sp_el0", ARM64SYS::SP_EL0) - .Case("spsel", ARM64SYS::SPSel) - .Case("daif", ARM64SYS::DAIF) - .Case("currentel", ARM64SYS::CurrentEL) - .Case("nzcv", ARM64SYS::NZCV) - .Case("fpcr", ARM64SYS::FPCR) - .Case("fpsr", ARM64SYS::FPSR) - .Case("dspsr", ARM64SYS::DSPSR) - .Case("dlr", ARM64SYS::DLR) - .Case("spsr_el2", ARM64SYS::SPSR_hyp) - .Case("spsr_hyp", ARM64SYS::SPSR_hyp) - .Case("elr_el2", ARM64SYS::ELR_EL2) - .Case("sp_el1", ARM64SYS::SP_EL1) - .Case("spsr_irq", ARM64SYS::SPSR_irq) - .Case("spsr_abt", ARM64SYS::SPSR_abt) - .Case("spsr_und", ARM64SYS::SPSR_und) - .Case("spsr_fiq", ARM64SYS::SPSR_fiq) - .Case("spsr_el3", ARM64SYS::SPSR_EL3) - .Case("elr_el3", ARM64SYS::ELR_EL3) - .Case("sp_el2", ARM64SYS::SP_EL2) - .Case("midr_el1", ARM64SYS::MIDR_EL1) - .Case("ctr_el0", ARM64SYS::CTR_EL0) - .Case("mpidr_el1", ARM64SYS::MPIDR_EL1) - .Case("ecoidr_el1", ARM64SYS::ECOIDR_EL1) - .Case("dczid_el0", ARM64SYS::DCZID_EL0) - .Case("mvfr0_el1", ARM64SYS::MVFR0_EL1) - .Case("mvfr1_el1", ARM64SYS::MVFR1_EL1) - .Case("id_aa64pfr0_el1", ARM64SYS::ID_AA64PFR0_EL1) - .Case("id_aa64pfr1_el1", ARM64SYS::ID_AA64PFR1_EL1) - .Case("id_aa64dfr0_el1", ARM64SYS::ID_AA64DFR0_EL1) - .Case("id_aa64dfr1_el1", ARM64SYS::ID_AA64DFR1_EL1) - .Case("id_aa64isar0_el1", ARM64SYS::ID_AA64ISAR0_EL1) - .Case("id_aa64isar1_el1", ARM64SYS::ID_AA64ISAR1_EL1) - .Case("id_aa64mmfr0_el1", ARM64SYS::ID_AA64MMFR0_EL1) - .Case("id_aa64mmfr1_el1", ARM64SYS::ID_AA64MMFR1_EL1) - .Case("ccsidr_el1", ARM64SYS::CCSIDR_EL1) - .Case("clidr_el1", ARM64SYS::CLIDR_EL1) - .Case("aidr_el1", ARM64SYS::AIDR_EL1) - .Case("csselr_el1", ARM64SYS::CSSELR_EL1) - .Case("vpidr_el2", ARM64SYS::VPIDR_EL2) - .Case("vmpidr_el2", ARM64SYS::VMPIDR_EL2) - .Case("sctlr_el1", ARM64SYS::SCTLR_EL1) - .Case("sctlr_el2", ARM64SYS::SCTLR_EL2) - .Case("sctlr_el3", ARM64SYS::SCTLR_EL3) - .Case("actlr_el1", ARM64SYS::ACTLR_EL1) - .Case("actlr_el2", ARM64SYS::ACTLR_EL2) - .Case("actlr_el3", ARM64SYS::ACTLR_EL3) - .Case("cpacr_el1", ARM64SYS::CPACR_EL1) - .Case("cptr_el2", ARM64SYS::CPTR_EL2) - .Case("cptr_el3", ARM64SYS::CPTR_EL3) - .Case("scr_el3", ARM64SYS::SCR_EL3) - .Case("hcr_el2", ARM64SYS::HCR_EL2) - .Case("mdcr_el2", ARM64SYS::MDCR_EL2) - .Case("mdcr_el3", ARM64SYS::MDCR_EL3) - .Case("hstr_el2", ARM64SYS::HSTR_EL2) - .Case("hacr_el2", ARM64SYS::HACR_EL2) - .Case("ttbr0_el1", ARM64SYS::TTBR0_EL1) - .Case("ttbr1_el1", ARM64SYS::TTBR1_EL1) - .Case("ttbr0_el2", ARM64SYS::TTBR0_EL2) - .Case("ttbr0_el3", ARM64SYS::TTBR0_EL3) - .Case("vttbr_el2", ARM64SYS::VTTBR_EL2) - .Case("tcr_el1", ARM64SYS::TCR_EL1) - .Case("tcr_el2", ARM64SYS::TCR_EL2) - .Case("tcr_el3", ARM64SYS::TCR_EL3) - .Case("vtcr_el2", ARM64SYS::VTCR_EL2) - .Case("adfsr_el1", ARM64SYS::ADFSR_EL1) - .Case("aifsr_el1", ARM64SYS::AIFSR_EL1) - .Case("adfsr_el2", ARM64SYS::ADFSR_EL2) - .Case("aifsr_el2", ARM64SYS::AIFSR_EL2) - .Case("adfsr_el3", ARM64SYS::ADFSR_EL3) - .Case("aifsr_el3", ARM64SYS::AIFSR_EL3) - .Case("esr_el1", ARM64SYS::ESR_EL1) - .Case("esr_el2", ARM64SYS::ESR_EL2) - .Case("esr_el3", ARM64SYS::ESR_EL3) - .Case("far_el1", ARM64SYS::FAR_EL1) - .Case("far_el2", ARM64SYS::FAR_EL2) - .Case("far_el3", ARM64SYS::FAR_EL3) - .Case("hpfar_el2", ARM64SYS::HPFAR_EL2) - .Case("par_el1", ARM64SYS::PAR_EL1) - .Case("mair_el1", 
ARM64SYS::MAIR_EL1) - .Case("mair_el2", ARM64SYS::MAIR_EL2) - .Case("mair_el3", ARM64SYS::MAIR_EL3) - .Case("amair_el1", ARM64SYS::AMAIR_EL1) - .Case("amair_el2", ARM64SYS::AMAIR_EL2) - .Case("amair_el3", ARM64SYS::AMAIR_EL3) - .Case("vbar_el1", ARM64SYS::VBAR_EL1) - .Case("vbar_el2", ARM64SYS::VBAR_EL2) - .Case("vbar_el3", ARM64SYS::VBAR_EL3) - .Case("rvbar_el1", ARM64SYS::RVBAR_EL1) - .Case("rvbar_el2", ARM64SYS::RVBAR_EL2) - .Case("rvbar_el3", ARM64SYS::RVBAR_EL3) - .Case("isr_el1", ARM64SYS::ISR_EL1) - .Case("contextidr_el1", ARM64SYS::CONTEXTIDR_EL1) - .Case("tpidr_el0", ARM64SYS::TPIDR_EL0) - .Case("tpidrro_el0", ARM64SYS::TPIDRRO_EL0) - .Case("tpidr_el1", ARM64SYS::TPIDR_EL1) - .Case("tpidr_el2", ARM64SYS::TPIDR_EL2) - .Case("tpidr_el3", ARM64SYS::TPIDR_EL3) - .Case("teecr32_el1", ARM64SYS::TEECR32_EL1) - .Case("cntfrq_el0", ARM64SYS::CNTFRQ_EL0) - .Case("cntpct_el0", ARM64SYS::CNTPCT_EL0) - .Case("cntvct_el0", ARM64SYS::CNTVCT_EL0) - .Case("cntvoff_el2", ARM64SYS::CNTVOFF_EL2) - .Case("cntkctl_el1", ARM64SYS::CNTKCTL_EL1) - .Case("cnthctl_el2", ARM64SYS::CNTHCTL_EL2) - .Case("cntp_tval_el0", ARM64SYS::CNTP_TVAL_EL0) - .Case("cntp_ctl_el0", ARM64SYS::CNTP_CTL_EL0) - .Case("cntp_cval_el0", ARM64SYS::CNTP_CVAL_EL0) - .Case("cntv_tval_el0", ARM64SYS::CNTV_TVAL_EL0) - .Case("cntv_ctl_el0", ARM64SYS::CNTV_CTL_EL0) - .Case("cntv_cval_el0", ARM64SYS::CNTV_CVAL_EL0) - .Case("cnthp_tval_el2", ARM64SYS::CNTHP_TVAL_EL2) - .Case("cnthp_ctl_el2", ARM64SYS::CNTHP_CTL_EL2) - .Case("cnthp_cval_el2", ARM64SYS::CNTHP_CVAL_EL2) - .Case("cntps_tval_el1", ARM64SYS::CNTPS_TVAL_EL1) - .Case("cntps_ctl_el1", ARM64SYS::CNTPS_CTL_EL1) - .Case("cntps_cval_el1", ARM64SYS::CNTPS_CVAL_EL1) - .Case("dacr32_el2", ARM64SYS::DACR32_EL2) - .Case("ifsr32_el2", ARM64SYS::IFSR32_EL2) - .Case("teehbr32_el1", ARM64SYS::TEEHBR32_EL1) - .Case("sder32_el3", ARM64SYS::SDER32_EL3) - .Case("fpexc32_el2", ARM64SYS::FPEXC32_EL2) - .Case("current_el", ARM64SYS::CurrentEL) - .Case("pmevcntr0_el0", ARM64SYS::PMEVCNTR0_EL0) - .Case("pmevcntr1_el0", ARM64SYS::PMEVCNTR1_EL0) - .Case("pmevcntr2_el0", ARM64SYS::PMEVCNTR2_EL0) - .Case("pmevcntr3_el0", ARM64SYS::PMEVCNTR3_EL0) - .Case("pmevcntr4_el0", ARM64SYS::PMEVCNTR4_EL0) - .Case("pmevcntr5_el0", ARM64SYS::PMEVCNTR5_EL0) - .Case("pmevcntr6_el0", ARM64SYS::PMEVCNTR6_EL0) - .Case("pmevcntr7_el0", ARM64SYS::PMEVCNTR7_EL0) - .Case("pmevcntr8_el0", ARM64SYS::PMEVCNTR8_EL0) - .Case("pmevcntr9_el0", ARM64SYS::PMEVCNTR9_EL0) - .Case("pmevcntr10_el0", ARM64SYS::PMEVCNTR10_EL0) - .Case("pmevcntr11_el0", ARM64SYS::PMEVCNTR11_EL0) - .Case("pmevcntr12_el0", ARM64SYS::PMEVCNTR12_EL0) - .Case("pmevcntr13_el0", ARM64SYS::PMEVCNTR13_EL0) - .Case("pmevcntr14_el0", ARM64SYS::PMEVCNTR14_EL0) - .Case("pmevcntr15_el0", ARM64SYS::PMEVCNTR15_EL0) - .Case("pmevcntr16_el0", ARM64SYS::PMEVCNTR16_EL0) - .Case("pmevcntr17_el0", ARM64SYS::PMEVCNTR17_EL0) - .Case("pmevcntr18_el0", ARM64SYS::PMEVCNTR18_EL0) - .Case("pmevcntr19_el0", ARM64SYS::PMEVCNTR19_EL0) - .Case("pmevcntr20_el0", ARM64SYS::PMEVCNTR20_EL0) - .Case("pmevcntr21_el0", ARM64SYS::PMEVCNTR21_EL0) - .Case("pmevcntr22_el0", ARM64SYS::PMEVCNTR22_EL0) - .Case("pmevcntr23_el0", ARM64SYS::PMEVCNTR23_EL0) - .Case("pmevcntr24_el0", ARM64SYS::PMEVCNTR24_EL0) - .Case("pmevcntr25_el0", ARM64SYS::PMEVCNTR25_EL0) - .Case("pmevcntr26_el0", ARM64SYS::PMEVCNTR26_EL0) - .Case("pmevcntr27_el0", ARM64SYS::PMEVCNTR27_EL0) - .Case("pmevcntr28_el0", ARM64SYS::PMEVCNTR28_EL0) - .Case("pmevcntr29_el0", ARM64SYS::PMEVCNTR29_EL0) - .Case("pmevcntr30_el0", 
ARM64SYS::PMEVCNTR30_EL0) - .Case("pmevtyper0_el0", ARM64SYS::PMEVTYPER0_EL0) - .Case("pmevtyper1_el0", ARM64SYS::PMEVTYPER1_EL0) - .Case("pmevtyper2_el0", ARM64SYS::PMEVTYPER2_EL0) - .Case("pmevtyper3_el0", ARM64SYS::PMEVTYPER3_EL0) - .Case("pmevtyper4_el0", ARM64SYS::PMEVTYPER4_EL0) - .Case("pmevtyper5_el0", ARM64SYS::PMEVTYPER5_EL0) - .Case("pmevtyper6_el0", ARM64SYS::PMEVTYPER6_EL0) - .Case("pmevtyper7_el0", ARM64SYS::PMEVTYPER7_EL0) - .Case("pmevtyper8_el0", ARM64SYS::PMEVTYPER8_EL0) - .Case("pmevtyper9_el0", ARM64SYS::PMEVTYPER9_EL0) - .Case("pmevtyper10_el0", ARM64SYS::PMEVTYPER10_EL0) - .Case("pmevtyper11_el0", ARM64SYS::PMEVTYPER11_EL0) - .Case("pmevtyper12_el0", ARM64SYS::PMEVTYPER12_EL0) - .Case("pmevtyper13_el0", ARM64SYS::PMEVTYPER13_EL0) - .Case("pmevtyper14_el0", ARM64SYS::PMEVTYPER14_EL0) - .Case("pmevtyper15_el0", ARM64SYS::PMEVTYPER15_EL0) - .Case("pmevtyper16_el0", ARM64SYS::PMEVTYPER16_EL0) - .Case("pmevtyper17_el0", ARM64SYS::PMEVTYPER17_EL0) - .Case("pmevtyper18_el0", ARM64SYS::PMEVTYPER18_EL0) - .Case("pmevtyper19_el0", ARM64SYS::PMEVTYPER19_EL0) - .Case("pmevtyper20_el0", ARM64SYS::PMEVTYPER20_EL0) - .Case("pmevtyper21_el0", ARM64SYS::PMEVTYPER21_EL0) - .Case("pmevtyper22_el0", ARM64SYS::PMEVTYPER22_EL0) - .Case("pmevtyper23_el0", ARM64SYS::PMEVTYPER23_EL0) - .Case("pmevtyper24_el0", ARM64SYS::PMEVTYPER24_EL0) - .Case("pmevtyper25_el0", ARM64SYS::PMEVTYPER25_EL0) - .Case("pmevtyper26_el0", ARM64SYS::PMEVTYPER26_EL0) - .Case("pmevtyper27_el0", ARM64SYS::PMEVTYPER27_EL0) - .Case("pmevtyper28_el0", ARM64SYS::PMEVTYPER28_EL0) - .Case("pmevtyper29_el0", ARM64SYS::PMEVTYPER29_EL0) - .Case("pmevtyper30_el0", ARM64SYS::PMEVTYPER30_EL0) - .Case("pmccfiltr_el0", ARM64SYS::PMCCFILTR_EL0) - .Case("rmr_el3", ARM64SYS::RMR_EL3) - .Case("rmr_el2", ARM64SYS::RMR_EL2) - .Case("rmr_el1", ARM64SYS::RMR_EL1) - .Case("cpm_ioacc_ctl_el3", ARM64SYS::CPM_IOACC_CTL_EL3) - .Case("mdccsr_el0", ARM64SYS::MDCCSR_EL0) - .Case("mdccint_el1", ARM64SYS::MDCCINT_EL1) - .Case("dbgdtr_el0", ARM64SYS::DBGDTR_EL0) - .Case("dbgdtrrx_el0", ARM64SYS::DBGDTRRX_EL0) - .Case("dbgdtrtx_el0", ARM64SYS::DBGDTRTX_EL0) - .Case("dbgvcr32_el2", ARM64SYS::DBGVCR32_EL2) - .Case("osdtrrx_el1", ARM64SYS::OSDTRRX_EL1) - .Case("mdscr_el1", ARM64SYS::MDSCR_EL1) - .Case("osdtrtx_el1", ARM64SYS::OSDTRTX_EL1) - .Case("oseccr_el11", ARM64SYS::OSECCR_EL11) - .Case("dbgbvr0_el1", ARM64SYS::DBGBVR0_EL1) - .Case("dbgbvr1_el1", ARM64SYS::DBGBVR1_EL1) - .Case("dbgbvr2_el1", ARM64SYS::DBGBVR2_EL1) - .Case("dbgbvr3_el1", ARM64SYS::DBGBVR3_EL1) - .Case("dbgbvr4_el1", ARM64SYS::DBGBVR4_EL1) - .Case("dbgbvr5_el1", ARM64SYS::DBGBVR5_EL1) - .Case("dbgbvr6_el1", ARM64SYS::DBGBVR6_EL1) - .Case("dbgbvr7_el1", ARM64SYS::DBGBVR7_EL1) - .Case("dbgbvr8_el1", ARM64SYS::DBGBVR8_EL1) - .Case("dbgbvr9_el1", ARM64SYS::DBGBVR9_EL1) - .Case("dbgbvr10_el1", ARM64SYS::DBGBVR10_EL1) - .Case("dbgbvr11_el1", ARM64SYS::DBGBVR11_EL1) - .Case("dbgbvr12_el1", ARM64SYS::DBGBVR12_EL1) - .Case("dbgbvr13_el1", ARM64SYS::DBGBVR13_EL1) - .Case("dbgbvr14_el1", ARM64SYS::DBGBVR14_EL1) - .Case("dbgbvr15_el1", ARM64SYS::DBGBVR15_EL1) - .Case("dbgbcr0_el1", ARM64SYS::DBGBCR0_EL1) - .Case("dbgbcr1_el1", ARM64SYS::DBGBCR1_EL1) - .Case("dbgbcr2_el1", ARM64SYS::DBGBCR2_EL1) - .Case("dbgbcr3_el1", ARM64SYS::DBGBCR3_EL1) - .Case("dbgbcr4_el1", ARM64SYS::DBGBCR4_EL1) - .Case("dbgbcr5_el1", ARM64SYS::DBGBCR5_EL1) - .Case("dbgbcr6_el1", ARM64SYS::DBGBCR6_EL1) - .Case("dbgbcr7_el1", ARM64SYS::DBGBCR7_EL1) - .Case("dbgbcr8_el1", ARM64SYS::DBGBCR8_EL1) - .Case("dbgbcr9_el1", 
ARM64SYS::DBGBCR9_EL1) - .Case("dbgbcr10_el1", ARM64SYS::DBGBCR10_EL1) - .Case("dbgbcr11_el1", ARM64SYS::DBGBCR11_EL1) - .Case("dbgbcr12_el1", ARM64SYS::DBGBCR12_EL1) - .Case("dbgbcr13_el1", ARM64SYS::DBGBCR13_EL1) - .Case("dbgbcr14_el1", ARM64SYS::DBGBCR14_EL1) - .Case("dbgbcr15_el1", ARM64SYS::DBGBCR15_EL1) - .Case("dbgwvr0_el1", ARM64SYS::DBGWVR0_EL1) - .Case("dbgwvr1_el1", ARM64SYS::DBGWVR1_EL1) - .Case("dbgwvr2_el1", ARM64SYS::DBGWVR2_EL1) - .Case("dbgwvr3_el1", ARM64SYS::DBGWVR3_EL1) - .Case("dbgwvr4_el1", ARM64SYS::DBGWVR4_EL1) - .Case("dbgwvr5_el1", ARM64SYS::DBGWVR5_EL1) - .Case("dbgwvr6_el1", ARM64SYS::DBGWVR6_EL1) - .Case("dbgwvr7_el1", ARM64SYS::DBGWVR7_EL1) - .Case("dbgwvr8_el1", ARM64SYS::DBGWVR8_EL1) - .Case("dbgwvr9_el1", ARM64SYS::DBGWVR9_EL1) - .Case("dbgwvr10_el1", ARM64SYS::DBGWVR10_EL1) - .Case("dbgwvr11_el1", ARM64SYS::DBGWVR11_EL1) - .Case("dbgwvr12_el1", ARM64SYS::DBGWVR12_EL1) - .Case("dbgwvr13_el1", ARM64SYS::DBGWVR13_EL1) - .Case("dbgwvr14_el1", ARM64SYS::DBGWVR14_EL1) - .Case("dbgwvr15_el1", ARM64SYS::DBGWVR15_EL1) - .Case("dbgwcr0_el1", ARM64SYS::DBGWCR0_EL1) - .Case("dbgwcr1_el1", ARM64SYS::DBGWCR1_EL1) - .Case("dbgwcr2_el1", ARM64SYS::DBGWCR2_EL1) - .Case("dbgwcr3_el1", ARM64SYS::DBGWCR3_EL1) - .Case("dbgwcr4_el1", ARM64SYS::DBGWCR4_EL1) - .Case("dbgwcr5_el1", ARM64SYS::DBGWCR5_EL1) - .Case("dbgwcr6_el1", ARM64SYS::DBGWCR6_EL1) - .Case("dbgwcr7_el1", ARM64SYS::DBGWCR7_EL1) - .Case("dbgwcr8_el1", ARM64SYS::DBGWCR8_EL1) - .Case("dbgwcr9_el1", ARM64SYS::DBGWCR9_EL1) - .Case("dbgwcr10_el1", ARM64SYS::DBGWCR10_EL1) - .Case("dbgwcr11_el1", ARM64SYS::DBGWCR11_EL1) - .Case("dbgwcr12_el1", ARM64SYS::DBGWCR12_EL1) - .Case("dbgwcr13_el1", ARM64SYS::DBGWCR13_EL1) - .Case("dbgwcr14_el1", ARM64SYS::DBGWCR14_EL1) - .Case("dbgwcr15_el1", ARM64SYS::DBGWCR15_EL1) - .Case("mdrar_el1", ARM64SYS::MDRAR_EL1) - .Case("oslar_el1", ARM64SYS::OSLAR_EL1) - .Case("oslsr_el1", ARM64SYS::OSLSR_EL1) - .Case("osdlr_el1", ARM64SYS::OSDLR_EL1) - .Case("dbgprcr_el1", ARM64SYS::DBGPRCR_EL1) - .Case("dbgclaimset_el1", ARM64SYS::DBGCLAIMSET_EL1) - .Case("dbgclaimclr_el1", ARM64SYS::DBGCLAIMCLR_EL1) - .Case("dbgauthstatus_el1", ARM64SYS::DBGAUTHSTATUS_EL1) - .Case("dbgdevid2", ARM64SYS::DBGDEVID2) - .Case("dbgdevid1", ARM64SYS::DBGDEVID1) - .Case("dbgdevid0", ARM64SYS::DBGDEVID0) - .Case("id_pfr0_el1", ARM64SYS::ID_PFR0_EL1) - .Case("id_pfr1_el1", ARM64SYS::ID_PFR1_EL1) - .Case("id_dfr0_el1", ARM64SYS::ID_DFR0_EL1) - .Case("id_afr0_el1", ARM64SYS::ID_AFR0_EL1) - .Case("id_isar0_el1", ARM64SYS::ID_ISAR0_EL1) - .Case("id_isar1_el1", ARM64SYS::ID_ISAR1_EL1) - .Case("id_isar2_el1", ARM64SYS::ID_ISAR2_EL1) - .Case("id_isar3_el1", ARM64SYS::ID_ISAR3_EL1) - .Case("id_isar4_el1", ARM64SYS::ID_ISAR4_EL1) - .Case("id_isar5_el1", ARM64SYS::ID_ISAR5_EL1) - .Case("afsr1_el1", ARM64SYS::AFSR1_EL1) - .Case("afsr0_el1", ARM64SYS::AFSR0_EL1) - .Case("revidr_el1", ARM64SYS::REVIDR_EL1) - .Default(ARM64SYS::InvalidSystemReg); - if (Reg != ARM64SYS::InvalidSystemReg) { - // We matched a reg name, so create the operand. - Operands.push_back( - ARM64Operand::CreateSystemRegister(Reg, getLoc(), getContext())); - Parser.Lex(); // Consume the register name. - return MatchOperand_Success; - } - - // Or we may have an identifier that encodes the sub-operands. - // For example, s3_2_c15_c0_0. 
- unsigned op0, op1, CRn, CRm, op2; - std::string Desc = ID; - if (std::sscanf(Desc.c_str(), "s%u_%u_c%u_c%u_%u", &op0, &op1, &CRn, &CRm, - &op2) != 5) - return MatchOperand_NoMatch; - if ((op0 != 2 && op0 != 3) || op1 > 7 || CRn > 15 || CRm > 15 || op2 > 7) - return MatchOperand_NoMatch; - - unsigned Val = op0 << 14 | op1 << 11 | CRn << 7 | CRm << 3 | op2; - Operands.push_back( - ARM64Operand::CreateSystemRegister(Val, getLoc(), getContext())); - Parser.Lex(); // Consume the register name. - - return MatchOperand_Success; -} - -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseCPSRField(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - - if (Tok.isNot(AsmToken::Identifier)) - return MatchOperand_NoMatch; - - ARM64SYS::CPSRField Field = - StringSwitch<ARM64SYS::CPSRField>(Tok.getString().lower()) - .Case("spsel", ARM64SYS::cpsr_SPSel) - .Case("daifset", ARM64SYS::cpsr_DAIFSet) - .Case("daifclr", ARM64SYS::cpsr_DAIFClr) - .Default(ARM64SYS::InvalidCPSRField); - if (Field == ARM64SYS::InvalidCPSRField) - return MatchOperand_NoMatch; - Operands.push_back( - ARM64Operand::CreateCPSRField(Field, getLoc(), getContext())); - Parser.Lex(); // Consume the register name. - - return MatchOperand_Success; -} - -/// tryParseVectorRegister - Parse a vector register operand. -bool ARM64AsmParser::tryParseVectorRegister(OperandVector &Operands) { - if (Parser.getTok().isNot(AsmToken::Identifier)) - return true; - - SMLoc S = getLoc(); - // Check for a vector register specifier first. - StringRef Kind; - int64_t Reg = tryMatchVectorRegister(Kind); - if (Reg == -1) - return true; - Operands.push_back( - ARM64Operand::CreateReg(Reg, true, S, getLoc(), getContext())); - // If there was an explicit qualifier, that goes on as a literal text - // operand. - if (!Kind.empty()) - Operands.push_back(ARM64Operand::CreateToken(Kind, false, S, getContext())); - - // If there is an index specifier following the register, parse that too. - if (Parser.getTok().is(AsmToken::LBrac)) { - SMLoc SIdx = getLoc(); - Parser.Lex(); // Eat left bracket token. - - const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); - if (!MCE) { - TokError("immediate value expected for vector index"); - return false; - } - - SMLoc E = getLoc(); - if (Parser.getTok().isNot(AsmToken::RBrac)) { - Error(E, "']' expected"); - return false; - } - - Parser.Lex(); // Eat right bracket token. - - Operands.push_back(ARM64Operand::CreateVectorIndex(MCE->getValue(), SIdx, E, - getContext())); - } - - return false; -} - -/// parseRegister - Parse a non-vector register operand. -bool ARM64AsmParser::parseRegister(OperandVector &Operands) { - SMLoc S = getLoc(); - // Try for a vector register. - if (!tryParseVectorRegister(Operands)) - return false; - - // Try for a scalar register. - int64_t Reg = tryParseRegister(); - if (Reg == -1) - return true; - Operands.push_back( - ARM64Operand::CreateReg(Reg, false, S, getLoc(), getContext())); - - // A small number of instructions (FMOVXDhighr, for example) have "[1]" - // as a string token in the instruction itself. 
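The generic system-register form parsed a few lines up packs its five sub-operands into a single 16-bit immediate: op0 in bits [15:14], op1 in [13:11], CRn in [10:7], CRm in [6:3], and op2 in [2:0]. A minimal standalone sketch of that packing (the helper name encodeSysReg is hypothetical, not part of the parser):

    #include <cstdio>

    // Mirrors "op0 << 14 | op1 << 11 | CRn << 7 | CRm << 3 | op2" above.
    static unsigned encodeSysReg(unsigned op0, unsigned op1, unsigned CRn,
                                 unsigned CRm, unsigned op2) {
      return op0 << 14 | op1 << 11 | CRn << 7 | CRm << 3 | op2;
    }

    int main() {
      // "s3_2_c15_c0_0": op0=3, op1=2, CRn=15, CRm=0, op2=0
      // = 0xC000 | 0x1000 | 0x780 = 0xD780.
      std::printf("0x%X\n", encodeSysReg(3, 2, 15, 0, 0));
    }

So an operand such as s3_2_c15_c0_0 names a system register directly by its raw encoding when no symbolic name exists for it.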
- if (getLexer().getKind() == AsmToken::LBrac) { - SMLoc LBracS = getLoc(); - Parser.Lex(); - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Integer)) { - SMLoc IntS = getLoc(); - int64_t Val = Tok.getIntVal(); - if (Val == 1) { - Parser.Lex(); - if (getLexer().getKind() == AsmToken::RBrac) { - SMLoc RBracS = getLoc(); - Parser.Lex(); - Operands.push_back( - ARM64Operand::CreateToken("[", false, LBracS, getContext())); - Operands.push_back( - ARM64Operand::CreateToken("1", false, IntS, getContext())); - Operands.push_back( - ARM64Operand::CreateToken("]", false, RBracS, getContext())); - return false; - } - } - } - } - - return false; -} - -/// tryParseNoIndexMemory - Custom parser method for memory operands that -/// do not allow base register writeback modes, -/// or those that handle writeback separately from -/// the memory operand (like the AdvSIMD ldX/stX -/// instructions). -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseNoIndexMemory(OperandVector &Operands) { - if (Parser.getTok().isNot(AsmToken::LBrac)) - return MatchOperand_NoMatch; - SMLoc S = getLoc(); - Parser.Lex(); // Eat left bracket token. - - const AsmToken &BaseRegTok = Parser.getTok(); - if (BaseRegTok.isNot(AsmToken::Identifier)) { - Error(BaseRegTok.getLoc(), "register expected"); - return MatchOperand_ParseFail; - } - - int64_t Reg = tryParseRegister(); - if (Reg == -1) { - Error(BaseRegTok.getLoc(), "register expected"); - return MatchOperand_ParseFail; - } - - SMLoc E = getLoc(); - if (Parser.getTok().isNot(AsmToken::RBrac)) { - Error(E, "']' expected"); - return MatchOperand_ParseFail; - } - - Parser.Lex(); // Eat right bracket token. - - Operands.push_back(ARM64Operand::CreateMem(Reg, 0, S, E, E, getContext())); - return MatchOperand_Success; -} - -/// parseMemory - Parse a memory operand for a basic load/store instruction. -bool ARM64AsmParser::parseMemory(OperandVector &Operands) { - assert(Parser.getTok().is(AsmToken::LBrac) && "Token is not a Left Bracket"); - SMLoc S = getLoc(); - Parser.Lex(); // Eat left bracket token. - - const AsmToken &BaseRegTok = Parser.getTok(); - if (BaseRegTok.isNot(AsmToken::Identifier)) - return Error(BaseRegTok.getLoc(), "register expected"); - - int64_t Reg = tryParseRegister(); - if (Reg == -1) - return Error(BaseRegTok.getLoc(), "register expected"); - - // If there is an offset expression, parse it. - const MCExpr *OffsetExpr = 0; - SMLoc OffsetLoc; - if (Parser.getTok().is(AsmToken::Comma)) { - Parser.Lex(); // Eat the comma. - OffsetLoc = getLoc(); - - // Register offset - const AsmToken &OffsetRegTok = Parser.getTok(); - int Reg2 = OffsetRegTok.is(AsmToken::Identifier) ? tryParseRegister() : -1; - if (Reg2 != -1) { - // Default shift is LSL, with an omitted shift. We use the third bit of - // the extend value to indicate presence/omission of the immediate offset. - ARM64_AM::ExtendType ExtOp = ARM64_AM::UXTX; - int64_t ShiftVal = 0; - bool ExplicitShift = false; - - if (Parser.getTok().is(AsmToken::Comma)) { - // Embedded extend operand. 
- Parser.Lex(); // Eat the comma - - SMLoc ExtLoc = getLoc(); - const AsmToken &Tok = Parser.getTok(); - ExtOp = StringSwitch<ARM64_AM::ExtendType>(Tok.getString()) - .Case("uxtw", ARM64_AM::UXTW) - .Case("lsl", ARM64_AM::UXTX) // Alias for UXTX - .Case("sxtw", ARM64_AM::SXTW) - .Case("sxtx", ARM64_AM::SXTX) - .Case("UXTW", ARM64_AM::UXTW) - .Case("LSL", ARM64_AM::UXTX) // Alias for UXTX - .Case("SXTW", ARM64_AM::SXTW) - .Case("SXTX", ARM64_AM::SXTX) - .Default(ARM64_AM::InvalidExtend); - if (ExtOp == ARM64_AM::InvalidExtend) - return Error(ExtLoc, "expected valid extend operation"); - - Parser.Lex(); // Eat the extend op. - - if (getLexer().is(AsmToken::RBrac)) { - // No immediate operand. - if (ExtOp == ARM64_AM::UXTX) - return Error(ExtLoc, "LSL extend requires immediate operand"); - } else if (getLexer().is(AsmToken::Hash)) { - // Immediate operand. - Parser.Lex(); // Eat the '#' - const MCExpr *ImmVal; - SMLoc ExprLoc = getLoc(); - if (getParser().parseExpression(ImmVal)) - return true; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); - if (!MCE) - return TokError("immediate value expected for extend operand"); - - ExplicitShift = true; - ShiftVal = MCE->getValue(); - if (ShiftVal < 0 || ShiftVal > 4) - return Error(ExprLoc, "immediate operand out of range"); - } else - return Error(getLoc(), "expected immediate operand"); - } - - if (Parser.getTok().isNot(AsmToken::RBrac)) - return Error(getLoc(), "']' expected"); - - Parser.Lex(); // Eat right bracket token. - - SMLoc E = getLoc(); - Operands.push_back(ARM64Operand::CreateRegOffsetMem( - Reg, Reg2, ExtOp, ShiftVal, ExplicitShift, S, E, getContext())); - return false; - - // Immediate expressions. - } else if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); // Eat hash token. - - if (parseSymbolicImmVal(OffsetExpr)) - return true; - } else { - // FIXME: We really should make sure that we're dealing with an LDR/STR - // instruction that can legally have a symbolic expression here. - // Symbol reference. - if (Parser.getTok().isNot(AsmToken::Identifier) && - Parser.getTok().isNot(AsmToken::String)) - return Error(getLoc(), "identifier or immediate expression expected"); - if (getParser().parseExpression(OffsetExpr)) - return true; - // If this is a plain ref, make sure a legal variant kind was specified. - // Otherwise, it's a more complicated expression and we have to just - // assume it's OK and let the relocation stuff puke if it's not. - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - const MCConstantExpr *Addend; - if (classifySymbolRef(OffsetExpr, ELFRefKind, DarwinRefKind, Addend) && - Addend == 0) { - assert(ELFRefKind == ARM64MCExpr::VK_INVALID && - "ELF symbol modifiers not supported here yet"); - - switch (DarwinRefKind) { - default: - return Error(getLoc(), "expected @pageoff or @gotpageoff modifier"); - case MCSymbolRefExpr::VK_GOTPAGEOFF: - case MCSymbolRefExpr::VK_PAGEOFF: - case MCSymbolRefExpr::VK_TLVPPAGEOFF: - // These are what we're expecting. - break; - } - } - } - } - - SMLoc E = getLoc(); - if (Parser.getTok().isNot(AsmToken::RBrac)) - return Error(E, "']' expected"); - - Parser.Lex(); // Eat right bracket token. - - // Create the memory operand. - Operands.push_back( - ARM64Operand::CreateMem(Reg, OffsetExpr, S, E, OffsetLoc, getContext())); - - // Check for a '!', indicating pre-indexed addressing with writeback. - if (Parser.getTok().is(AsmToken::Exclaim)) { - // There needs to have been an immediate or wback doesn't make sense. 
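The register-offset memory form parsed above accepts an optional extend (uxtw, lsl, sxtw, sxtx) whose immediate shift is limited to [0, 4], and rejects a bare lsl with no amount. A condensed sketch of that acceptance rule (the enum and helper below are hypothetical stand-ins for the ARM64_AM values used above):

    enum Extend { UXTW, UXTX /* "lsl" is an alias */, SXTW, SXTX, Invalid };

    // Accepts forms such as [x1, w2, uxtw #3] or [x1, x2]; rejects
    // [x1, x2, lsl] (LSL needs an explicit amount) and shifts above #4.
    static bool isValidMemExtend(Extend E, bool HasImm, long Shift) {
      if (E == Invalid) return false;
      if (E == UXTX && !HasImm) return false;
      return !HasImm || (Shift >= 0 && Shift <= 4);
    }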
- if (!OffsetExpr) - return Error(E, "missing offset for pre-indexed addressing"); - // Pre-indexed with writeback must have a constant expression for the - // offset. FIXME: Theoretically, we'd like to allow fixups so long - // as they don't require a relocation. - if (!isa<MCConstantExpr>(OffsetExpr)) - return Error(OffsetLoc, "constant immediate expression expected"); - - // Create the Token operand for the '!'. - Operands.push_back(ARM64Operand::CreateToken( - "!", false, Parser.getTok().getLoc(), getContext())); - Parser.Lex(); // Eat the '!' token. - } - - return false; -} - -bool ARM64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { - bool HasELFModifier = false; - ARM64MCExpr::VariantKind RefKind; - - if (Parser.getTok().is(AsmToken::Colon)) { - Parser.Lex(); // Eat ':' - HasELFModifier = true; - - if (Parser.getTok().isNot(AsmToken::Identifier)) { - Error(Parser.getTok().getLoc(), - "expect relocation specifier in operand after ':'"); - return true; - } - - std::string LowerCase = Parser.getTok().getIdentifier().lower(); - RefKind = StringSwitch<ARM64MCExpr::VariantKind>(LowerCase) - .Case("lo12", ARM64MCExpr::VK_LO12) - .Case("abs_g3", ARM64MCExpr::VK_ABS_G3) - .Case("abs_g2", ARM64MCExpr::VK_ABS_G2) - .Case("abs_g2_nc", ARM64MCExpr::VK_ABS_G2_NC) - .Case("abs_g1", ARM64MCExpr::VK_ABS_G1) - .Case("abs_g1_nc", ARM64MCExpr::VK_ABS_G1_NC) - .Case("abs_g0", ARM64MCExpr::VK_ABS_G0) - .Case("abs_g0_nc", ARM64MCExpr::VK_ABS_G0_NC) - .Case("dtprel_g2", ARM64MCExpr::VK_DTPREL_G2) - .Case("dtprel_g1", ARM64MCExpr::VK_DTPREL_G1) - .Case("dtprel_g1_nc", ARM64MCExpr::VK_DTPREL_G1_NC) - .Case("dtprel_g0", ARM64MCExpr::VK_DTPREL_G0) - .Case("dtprel_g0_nc", ARM64MCExpr::VK_DTPREL_G0_NC) - .Case("dtprel_lo12", ARM64MCExpr::VK_DTPREL_LO12) - .Case("dtprel_lo12_nc", ARM64MCExpr::VK_DTPREL_LO12_NC) - .Case("tprel_g2", ARM64MCExpr::VK_TPREL_G2) - .Case("tprel_g1", ARM64MCExpr::VK_TPREL_G1) - .Case("tprel_g1_nc", ARM64MCExpr::VK_TPREL_G1_NC) - .Case("tprel_g0", ARM64MCExpr::VK_TPREL_G0) - .Case("tprel_g0_nc", ARM64MCExpr::VK_TPREL_G0_NC) - .Case("tprel_lo12", ARM64MCExpr::VK_TPREL_LO12) - .Case("tprel_lo12_nc", ARM64MCExpr::VK_TPREL_LO12_NC) - .Case("tlsdesc_lo12", ARM64MCExpr::VK_TLSDESC_LO12) - .Case("got", ARM64MCExpr::VK_GOT_PAGE) - .Case("got_lo12", ARM64MCExpr::VK_GOT_LO12) - .Case("gottprel", ARM64MCExpr::VK_GOTTPREL_PAGE) - .Case("gottprel_lo12", ARM64MCExpr::VK_GOTTPREL_LO12_NC) - .Case("gottprel_g1", ARM64MCExpr::VK_GOTTPREL_G1) - .Case("gottprel_g0_nc", ARM64MCExpr::VK_GOTTPREL_G0_NC) - .Case("tlsdesc", ARM64MCExpr::VK_TLSDESC_PAGE) - .Default(ARM64MCExpr::VK_INVALID); - - if (RefKind == ARM64MCExpr::VK_INVALID) { - Error(Parser.getTok().getLoc(), - "expect relocation specifier in operand after ':'"); - return true; - } - - Parser.Lex(); // Eat identifier - - if (Parser.getTok().isNot(AsmToken::Colon)) { - Error(Parser.getTok().getLoc(), "expect ':' after relocation specifier"); - return true; - } - Parser.Lex(); // Eat ':' - } - - if (getParser().parseExpression(ImmVal)) - return true; - - if (HasELFModifier) - ImmVal = ARM64MCExpr::Create(ImmVal, RefKind, getContext()); - - return false; -} - -/// parseVectorList - Parse a vector list operand for AdvSIMD instructions. -bool ARM64AsmParser::parseVectorList(OperandVector &Operands) { - assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Brace"); - SMLoc S = getLoc(); - Parser.Lex(); // Eat left brace token. 
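parseSymbolicImmVal above turns a ":modifier:expression" operand such as add x0, x0, :lo12:var into a plain expression wrapped in an ARM64MCExpr that carries the relocation kind. A short sketch of that wrapping, using the same Create calls that appear in this file (Ctx and Sym are assumed to already be in scope):

    // :lo12:var -> symbol reference tagged with VK_LO12, resolved later by
    // the relocation to the low 12 bits of var's address.
    const MCExpr *Ref = MCSymbolRefExpr::Create(Sym, Ctx);
    const MCExpr *Imm = ARM64MCExpr::Create(Ref, ARM64MCExpr::VK_LO12, Ctx);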
- StringRef Kind; - int64_t FirstReg = tryMatchVectorRegister(Kind); - if (FirstReg == -1) - return Error(getLoc(), "vector register expected"); - int64_t PrevReg = FirstReg; - unsigned Count = 1; - while (Parser.getTok().isNot(AsmToken::RCurly)) { - if (Parser.getTok().is(AsmToken::EndOfStatement)) - return Error(getLoc(), "'}' expected"); - - if (Parser.getTok().isNot(AsmToken::Comma)) - return Error(getLoc(), "',' expected"); - Parser.Lex(); // Eat the comma token. - - SMLoc Loc = getLoc(); - StringRef NextKind; - int64_t Reg = tryMatchVectorRegister(NextKind); - if (Reg == -1) - return Error(Loc, "vector register expected"); - // The register size suffix must match on all registers in the list. - if (Kind != NextKind) - return Error(Loc, "mismatched register size suffix"); - - // Registers must be incremental (with wraparound at 31). - if (getContext().getRegisterInfo()->getEncodingValue(Reg) != - (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32) - return Error(Loc, "registers must be sequential"); - - PrevReg = Reg; - ++Count; - } - Parser.Lex(); // Eat the '}' token. - - unsigned NumElements = 0; - char ElementKind = 0; - if (!Kind.empty()) - parseValidVectorKind(Kind, NumElements, ElementKind); - - Operands.push_back(ARM64Operand::CreateVectorList( - FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext())); - - // If there is an index specifier following the list, parse that too. - if (Parser.getTok().is(AsmToken::LBrac)) { - SMLoc SIdx = getLoc(); - Parser.Lex(); // Eat left bracket token. - - const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); - if (!MCE) { - TokError("immediate value expected for vector index"); - return false; - } - - SMLoc E = getLoc(); - if (Parser.getTok().isNot(AsmToken::RBrac)) { - Error(E, "']' expected"); - return false; - } - - Parser.Lex(); // Eat right bracket token. - - Operands.push_back(ARM64Operand::CreateVectorIndex(MCE->getValue(), SIdx, E, - getContext())); - } - return false; -} - -/// parseOperand - Parse an ARM64 instruction operand. For now this parses the -/// operand regardless of the mnemonic. -bool ARM64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, - bool invertCondCode) { - // Check if the current operand has a custom associated parser, if so, try to - // custom parse the operand, or fall back to the general approach. - OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); - if (ResTy == MatchOperand_Success) - return false; - // If there wasn't a custom match, try the generic matcher below. Otherwise, - // there was a match, but an error occurred, in which case, just return that - // the operand parsing failed. - if (ResTy == MatchOperand_ParseFail) - return true; - - // Nothing custom, so do general case parsing. - SMLoc S, E; - switch (getLexer().getKind()) { - default: { - SMLoc S = getLoc(); - const MCExpr *Expr; - if (parseSymbolicImmVal(Expr)) - return Error(S, "invalid operand"); - - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext())); - return false; - } - case AsmToken::LBrac: - return parseMemory(Operands); - case AsmToken::LCurly: - return parseVectorList(Operands); - case AsmToken::Identifier: { - // If we're expecting a Condition Code operand, then just parse that. - if (isCondCode) - return parseCondCode(Operands, invertCondCode); - - // If it's a register name, parse it. 
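The vector-list loop above requires consecutive register encodings with wraparound at 31, so a list such as { v31.4s, v0.4s } is accepted while { v0.4s, v2.4s } is rejected. The check reduces to the following (a minimal sketch; PrevEnc and Enc are the 0-31 encodings obtained from MCRegisterInfo as above):

    // True when Enc immediately follows PrevEnc in register order, modulo 32.
    static bool isSequential(unsigned PrevEnc, unsigned Enc) {
      return Enc == (PrevEnc + 1) % 32;
    }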
- if (!parseRegister(Operands)) - return false; - - // This could be an optional "shift" operand. - if (!parseOptionalShift(Operands)) - return false; - - // Or maybe it could be an optional "extend" operand. - if (!parseOptionalExtend(Operands)) - return false; - - // This was not a register so parse other operands that start with an - // identifier (like labels) as expressions and create them as immediates. - const MCExpr *IdVal; - S = getLoc(); - if (getParser().parseExpression(IdVal)) - return true; - - E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(IdVal, S, E, getContext())); - return false; - } - case AsmToken::Hash: { - // #42 -> immediate. - S = getLoc(); - Parser.Lex(); - - // The only Real that should come through here is a literal #0.0 for - // the fcmp[e] r, #0.0 instructions. They expect raw token operands, - // so convert the value. - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Real)) { - APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); - uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - if (IntVal != 0 || (Mnemonic != "fcmp" && Mnemonic != "fcmpe")) - return TokError("unexpected floating point literal"); - Parser.Lex(); // Eat the token. - - Operands.push_back( - ARM64Operand::CreateToken("#0", false, S, getContext())); - Operands.push_back( - ARM64Operand::CreateToken(".0", false, S, getContext())); - return false; - } - - const MCExpr *ImmVal; - if (parseSymbolicImmVal(ImmVal)) - return true; - - E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(ImmVal, S, E, getContext())); - return false; - } - } -} - -/// ParseInstruction - Parse an ARM64 instruction mnemonic followed by its -/// operands. -bool ARM64AsmParser::ParseInstruction(ParseInstructionInfo &Info, - StringRef Name, SMLoc NameLoc, - OperandVector &Operands) { - // Create the leading tokens for the mnemonic, split by '.' characters. - size_t Start = 0, Next = Name.find('.'); - StringRef Head = Name.slice(Start, Next); - - // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction. - if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi") - return parseSysAlias(Head, NameLoc, Operands); - - Operands.push_back( - ARM64Operand::CreateToken(Head, false, NameLoc, getContext())); - Mnemonic = Head; - - // Handle condition codes for a branch mnemonic - if (Head == "b" && Next != StringRef::npos) { - Start = Next; - Next = Name.find('.', Start + 1); - Head = Name.slice(Start + 1, Next); - - SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + - (Head.data() - Name.data())); - unsigned CC = parseCondCodeString(Head); - if (CC == ~0U) - return Error(SuffixLoc, "invalid condition code"); - const MCExpr *CCExpr = MCConstantExpr::Create(CC, getContext()); - Operands.push_back( - ARM64Operand::CreateImm(CCExpr, NameLoc, NameLoc, getContext())); - } - - // Add the remaining tokens in the mnemonic. - while (Next != StringRef::npos) { - Start = Next; - Next = Name.find('.', Start + 1); - Head = Name.slice(Start, Next); - SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + - (Head.data() - Name.data()) + 1); - Operands.push_back( - ARM64Operand::CreateToken(Head, true, SuffixLoc, getContext())); - } - - // Conditional compare instructions have a Condition Code operand, which needs - // to be parsed and an immediate operand created. 
- bool condCodeFourthOperand = - (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" || - Head == "fccmpe" || Head == "fcsel" || Head == "csel" || - Head == "csinc" || Head == "csinv" || Head == "csneg"); - - // These instructions are aliases to some of the conditional select - // instructions. However, the condition code is inverted in the aliased - // instruction. - // - // FIXME: Is this the correct way to handle these? Or should the parser - // generate the aliased instructions directly? - bool condCodeSecondOperand = (Head == "cset" || Head == "csetm"); - bool condCodeThirdOperand = - (Head == "cinc" || Head == "cinv" || Head == "cneg"); - - // Read the remaining operands. - if (getLexer().isNot(AsmToken::EndOfStatement)) { - // Read the first operand. - if (parseOperand(Operands, false, false)) { - Parser.eatToEndOfStatement(); - return true; - } - - unsigned N = 2; - while (getLexer().is(AsmToken::Comma)) { - Parser.Lex(); // Eat the comma. - - // Parse and remember the operand. - if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) || - (N == 3 && condCodeThirdOperand) || - (N == 2 && condCodeSecondOperand), - condCodeSecondOperand || condCodeThirdOperand)) { - Parser.eatToEndOfStatement(); - return true; - } - - ++N; - } - } - - if (getLexer().isNot(AsmToken::EndOfStatement)) { - SMLoc Loc = Parser.getTok().getLoc(); - Parser.eatToEndOfStatement(); - return Error(Loc, "unexpected token in argument list"); - } - - Parser.Lex(); // Consume the EndOfStatement - return false; -} - -/// isFPR32Register - Check if a register is in the FPR32 register class. -/// (The parser does not have the target register info to check the register -/// class directly.) -static bool isFPR32Register(unsigned Reg) { - using namespace ARM64; - switch (Reg) { - default: - break; - case S0: case S1: case S2: case S3: case S4: case S5: case S6: - case S7: case S8: case S9: case S10: case S11: case S12: case S13: - case S14: case S15: case S16: case S17: case S18: case S19: case S20: - case S21: case S22: case S23: case S24: case S25: case S26: case S27: - case S28: case S29: case S30: case S31: - return true; - } - return false; -} - -/// isGPR32Register - Check if a register is in the GPR32sp register class. -/// (The parser does not have the target register info to check the register -/// class directly.) -static bool isGPR32Register(unsigned Reg) { - using namespace ARM64; - switch (Reg) { - default: - break; - case W0: case W1: case W2: case W3: case W4: case W5: case W6: - case W7: case W8: case W9: case W10: case W11: case W12: case W13: - case W14: case W15: case W16: case W17: case W18: case W19: case W20: - case W21: case W22: case W23: case W24: case W25: case W26: case W27: - case W28: case W29: case W30: case WSP: - return true; - } - return false; -} - -static bool isGPR64Reg(unsigned Reg) { - using namespace ARM64; - switch (Reg) { - case X0: case X1: case X2: case X3: case X4: case X5: case X6: - case X7: case X8: case X9: case X10: case X11: case X12: case X13: - case X14: case X15: case X16: case X17: case X18: case X19: case X20: - case X21: case X22: case X23: case X24: case X25: case X26: case X27: - case X28: case FP: case LR: case SP: case XZR: - return true; - default: - return false; - } -} - - -// FIXME: This entire function is a giant hack to provide us with decent -// operand range validation/diagnostics until TableGen/MC can be extended -// to support autogeneration of this kind of validation. 
-bool ARM64AsmParser::validateInstruction(MCInst &Inst, - SmallVectorImpl<SMLoc> &Loc) { - const MCRegisterInfo *RI = getContext().getRegisterInfo(); - // Check for indexed addressing modes w/ the base register being the - // same as a destination/source register or pair load where - // the Rt == Rt2. All of those are undefined behaviour. - switch (Inst.getOpcode()) { - case ARM64::LDPSWpre: - case ARM64::LDPWpost: - case ARM64::LDPWpre: - case ARM64::LDPXpost: - case ARM64::LDPXpre: { - unsigned Rt = Inst.getOperand(0).getReg(); - unsigned Rt2 = Inst.getOperand(1).getReg(); - unsigned Rn = Inst.getOperand(2).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable LDP instruction, writeback base " - "is also a destination"); - if (RI->isSubRegisterEq(Rn, Rt2)) - return Error(Loc[1], "unpredictable LDP instruction, writeback base " - "is also a destination"); - // FALLTHROUGH - } - case ARM64::LDPDpost: - case ARM64::LDPDpre: - case ARM64::LDPQpost: - case ARM64::LDPQpre: - case ARM64::LDPSpost: - case ARM64::LDPSpre: - case ARM64::LDPSWpost: - case ARM64::LDPDi: - case ARM64::LDPQi: - case ARM64::LDPSi: - case ARM64::LDPSWi: - case ARM64::LDPWi: - case ARM64::LDPXi: { - unsigned Rt = Inst.getOperand(0).getReg(); - unsigned Rt2 = Inst.getOperand(1).getReg(); - if (Rt == Rt2) - return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); - break; - } - case ARM64::STPDpost: - case ARM64::STPDpre: - case ARM64::STPQpost: - case ARM64::STPQpre: - case ARM64::STPSpost: - case ARM64::STPSpre: - case ARM64::STPWpost: - case ARM64::STPWpre: - case ARM64::STPXpost: - case ARM64::STPXpre: { - unsigned Rt = Inst.getOperand(0).getReg(); - unsigned Rt2 = Inst.getOperand(1).getReg(); - unsigned Rn = Inst.getOperand(2).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable STP instruction, writeback base " - "is also a source"); - if (RI->isSubRegisterEq(Rn, Rt2)) - return Error(Loc[1], "unpredictable STP instruction, writeback base " - "is also a source"); - break; - } - case ARM64::LDRBBpre: - case ARM64::LDRBpre: - case ARM64::LDRHHpre: - case ARM64::LDRHpre: - case ARM64::LDRSBWpre: - case ARM64::LDRSBXpre: - case ARM64::LDRSHWpre: - case ARM64::LDRSHXpre: - case ARM64::LDRSWpre: - case ARM64::LDRWpre: - case ARM64::LDRXpre: - case ARM64::LDRBBpost: - case ARM64::LDRBpost: - case ARM64::LDRHHpost: - case ARM64::LDRHpost: - case ARM64::LDRSBWpost: - case ARM64::LDRSBXpost: - case ARM64::LDRSHWpost: - case ARM64::LDRSHXpost: - case ARM64::LDRSWpost: - case ARM64::LDRWpost: - case ARM64::LDRXpost: { - unsigned Rt = Inst.getOperand(0).getReg(); - unsigned Rn = Inst.getOperand(1).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable LDR instruction, writeback base " - "is also a source"); - break; - } - case ARM64::STRBBpost: - case ARM64::STRBpost: - case ARM64::STRHHpost: - case ARM64::STRHpost: - case ARM64::STRWpost: - case ARM64::STRXpost: - case ARM64::STRBBpre: - case ARM64::STRBpre: - case ARM64::STRHHpre: - case ARM64::STRHpre: - case ARM64::STRWpre: - case ARM64::STRXpre: { - unsigned Rt = Inst.getOperand(0).getReg(); - unsigned Rn = Inst.getOperand(1).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable STR instruction, writeback base " - "is also a source"); - break; - } - } - - // Now check immediate ranges. Separate from the above as there is overlap - // in the instructions being checked and this keeps the nested conditionals - // to a minimum. 
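Before the immediate-range checks that follow, note what the hazard checks above reject: any pre/post-indexed load or store whose writeback base overlaps a data register, and any paired load with identical data registers. A condensed sketch of the rule (equality stands in for MCRegisterInfo::isSubRegisterEq, which also catches W/X sub-register overlap; the helper name is hypothetical):

    // ldp x0, x1, [x0], #16  -> rejected: writeback base is a destination
    // ldp x2, x2, [x1]       -> rejected: Rt2 == Rt
    static bool isUnpredictableLoadPair(unsigned Rt, unsigned Rt2, unsigned Rn,
                                        bool Writeback) {
      return Rt == Rt2 || (Writeback && (Rn == Rt || Rn == Rt2));
    }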
- switch (Inst.getOpcode()) { - case ARM64::ANDWrs: - case ARM64::ANDSWrs: - case ARM64::EORWrs: - case ARM64::ORRWrs: { - if (!Inst.getOperand(3).isImm()) - return Error(Loc[3], "immediate value expected"); - int64_t shifter = Inst.getOperand(3).getImm(); - ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(shifter); - if (ST == ARM64_AM::LSL && shifter > 31) - return Error(Loc[3], "shift value out of range"); - return false; - } - case ARM64::ADDSWri: - case ARM64::ADDSXri: - case ARM64::ADDWri: - case ARM64::ADDXri: - case ARM64::SUBSWri: - case ARM64::SUBSXri: - case ARM64::SUBWri: - case ARM64::SUBXri: { - if (!Inst.getOperand(3).isImm()) - return Error(Loc[3], "immediate value expected"); - int64_t shifter = Inst.getOperand(3).getImm(); - if (shifter != 0 && shifter != 12) - return Error(Loc[3], "shift value out of range"); - // The imm12 operand can be an expression. Validate that it's legit. - // FIXME: We really, really want to allow arbitrary expressions here - // and resolve the value and validate the result at fixup time, but - // that's hard as we have long since lost any source information we - // need to generate good diagnostics by that point. - if (Inst.getOpcode() == ARM64::ADDXri && Inst.getOperand(2).isExpr()) { - const MCExpr *Expr = Inst.getOperand(2).getExpr(); - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - const MCConstantExpr *Addend; - if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { - return Error(Loc[2], "invalid immediate expression"); - } - - if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF || - ELFRefKind == ARM64MCExpr::VK_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12) { - // Note that we don't range-check the addend. It's adjusted - // modulo page size when converted, so there is no "out of range" - // condition when using @pageoff. Any validity checking for the value - // was done in the is*() predicate function. - return false; - } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF) { - // @gotpageoff can only be used directly, not with an addend. - return Addend != 0; - } - - // Otherwise, we're not sure, so don't allow it for now. - return Error(Loc[2], "invalid immediate expression"); - } - - // If it's anything but an immediate, it's not legit. 
- if (!Inst.getOperand(2).isImm()) - return Error(Loc[2], "invalid immediate expression"); - int64_t imm = Inst.getOperand(2).getImm(); - if (imm > 4095 || imm < 0) - return Error(Loc[2], "immediate value out of range"); - return false; - } - case ARM64::LDRBpre: - case ARM64::LDRHpre: - case ARM64::LDRSBWpre: - case ARM64::LDRSBXpre: - case ARM64::LDRSHWpre: - case ARM64::LDRSHXpre: - case ARM64::LDRWpre: - case ARM64::LDRXpre: - case ARM64::LDRSpre: - case ARM64::LDRDpre: - case ARM64::LDRQpre: - case ARM64::STRBpre: - case ARM64::STRHpre: - case ARM64::STRWpre: - case ARM64::STRXpre: - case ARM64::STRSpre: - case ARM64::STRDpre: - case ARM64::STRQpre: - case ARM64::LDRBpost: - case ARM64::LDRHpost: - case ARM64::LDRSBWpost: - case ARM64::LDRSBXpost: - case ARM64::LDRSHWpost: - case ARM64::LDRSHXpost: - case ARM64::LDRWpost: - case ARM64::LDRXpost: - case ARM64::LDRSpost: - case ARM64::LDRDpost: - case ARM64::LDRQpost: - case ARM64::STRBpost: - case ARM64::STRHpost: - case ARM64::STRWpost: - case ARM64::STRXpost: - case ARM64::STRSpost: - case ARM64::STRDpost: - case ARM64::STRQpost: - case ARM64::LDTRXi: - case ARM64::LDTRWi: - case ARM64::LDTRHi: - case ARM64::LDTRBi: - case ARM64::LDTRSHWi: - case ARM64::LDTRSHXi: - case ARM64::LDTRSBWi: - case ARM64::LDTRSBXi: - case ARM64::LDTRSWi: - case ARM64::STTRWi: - case ARM64::STTRXi: - case ARM64::STTRHi: - case ARM64::STTRBi: - case ARM64::LDURWi: - case ARM64::LDURXi: - case ARM64::LDURSi: - case ARM64::LDURDi: - case ARM64::LDURQi: - case ARM64::LDURHi: - case ARM64::LDURBi: - case ARM64::LDURSHWi: - case ARM64::LDURSHXi: - case ARM64::LDURSBWi: - case ARM64::LDURSBXi: - case ARM64::LDURSWi: - case ARM64::PRFUMi: - case ARM64::STURWi: - case ARM64::STURXi: - case ARM64::STURSi: - case ARM64::STURDi: - case ARM64::STURQi: - case ARM64::STURHi: - case ARM64::STURBi: { - // FIXME: Should accept expressions and error in fixup evaluation - // if out of range. - if (!Inst.getOperand(2).isImm()) - return Error(Loc[1], "immediate value expected"); - int64_t offset = Inst.getOperand(2).getImm(); - if (offset > 255 || offset < -256) - return Error(Loc[1], "offset value out of range"); - return false; - } - case ARM64::LDRSro: - case ARM64::LDRWro: - case ARM64::LDRSWro: - case ARM64::STRWro: - case ARM64::STRSro: { - // FIXME: Should accept expressions and error in fixup evaluation - // if out of range. - if (!Inst.getOperand(3).isImm()) - return Error(Loc[1], "immediate value expected"); - int64_t shift = Inst.getOperand(3).getImm(); - ARM64_AM::ExtendType type = ARM64_AM::getMemExtendType(shift); - if (type != ARM64_AM::UXTW && type != ARM64_AM::UXTX && - type != ARM64_AM::SXTW && type != ARM64_AM::SXTX) - return Error(Loc[1], "shift type invalid"); - return false; - } - case ARM64::LDRDro: - case ARM64::LDRQro: - case ARM64::LDRXro: - case ARM64::PRFMro: - case ARM64::STRXro: - case ARM64::STRDro: - case ARM64::STRQro: { - // FIXME: Should accept expressions and error in fixup evaluation - // if out of range. 
- if (!Inst.getOperand(3).isImm()) - return Error(Loc[1], "immediate value expected"); - int64_t shift = Inst.getOperand(3).getImm(); - ARM64_AM::ExtendType type = ARM64_AM::getMemExtendType(shift); - if (type != ARM64_AM::UXTW && type != ARM64_AM::UXTX && - type != ARM64_AM::SXTW && type != ARM64_AM::SXTX) - return Error(Loc[1], "shift type invalid"); - return false; - } - case ARM64::LDRHro: - case ARM64::LDRHHro: - case ARM64::LDRSHWro: - case ARM64::LDRSHXro: - case ARM64::STRHro: - case ARM64::STRHHro: { - // FIXME: Should accept expressions and error in fixup evaluation - // if out of range. - if (!Inst.getOperand(3).isImm()) - return Error(Loc[1], "immediate value expected"); - int64_t shift = Inst.getOperand(3).getImm(); - ARM64_AM::ExtendType type = ARM64_AM::getMemExtendType(shift); - if (type != ARM64_AM::UXTW && type != ARM64_AM::UXTX && - type != ARM64_AM::SXTW && type != ARM64_AM::SXTX) - return Error(Loc[1], "shift type invalid"); - return false; - } - case ARM64::LDRBro: - case ARM64::LDRBBro: - case ARM64::LDRSBWro: - case ARM64::LDRSBXro: - case ARM64::STRBro: - case ARM64::STRBBro: { - // FIXME: Should accept expressions and error in fixup evaluation - // if out of range. - if (!Inst.getOperand(3).isImm()) - return Error(Loc[1], "immediate value expected"); - int64_t shift = Inst.getOperand(3).getImm(); - ARM64_AM::ExtendType type = ARM64_AM::getMemExtendType(shift); - if (type != ARM64_AM::UXTW && type != ARM64_AM::UXTX && - type != ARM64_AM::SXTW && type != ARM64_AM::SXTX) - return Error(Loc[1], "shift type invalid"); - return false; - } - case ARM64::LDPWi: - case ARM64::LDPXi: - case ARM64::LDPSi: - case ARM64::LDPDi: - case ARM64::LDPQi: - case ARM64::LDPSWi: - case ARM64::STPWi: - case ARM64::STPXi: - case ARM64::STPSi: - case ARM64::STPDi: - case ARM64::STPQi: - case ARM64::LDPWpre: - case ARM64::LDPXpre: - case ARM64::LDPSpre: - case ARM64::LDPDpre: - case ARM64::LDPQpre: - case ARM64::LDPSWpre: - case ARM64::STPWpre: - case ARM64::STPXpre: - case ARM64::STPSpre: - case ARM64::STPDpre: - case ARM64::STPQpre: - case ARM64::LDPWpost: - case ARM64::LDPXpost: - case ARM64::LDPSpost: - case ARM64::LDPDpost: - case ARM64::LDPQpost: - case ARM64::LDPSWpost: - case ARM64::STPWpost: - case ARM64::STPXpost: - case ARM64::STPSpost: - case ARM64::STPDpost: - case ARM64::STPQpost: - case ARM64::LDNPWi: - case ARM64::LDNPXi: - case ARM64::LDNPSi: - case ARM64::LDNPDi: - case ARM64::LDNPQi: - case ARM64::STNPWi: - case ARM64::STNPXi: - case ARM64::STNPSi: - case ARM64::STNPDi: - case ARM64::STNPQi: { - // FIXME: Should accept expressions and error in fixup evaluation - // if out of range. 
- if (!Inst.getOperand(3).isImm()) - return Error(Loc[2], "immediate value expected"); - int64_t offset = Inst.getOperand(3).getImm(); - if (offset > 63 || offset < -64) - return Error(Loc[2], "offset value out of range"); - return false; - } - default: - return false; - } -} - -static void rewriteMOV(ARM64AsmParser::OperandVector &Operands, - StringRef mnemonic, uint64_t imm, unsigned shift, - MCContext &Context) { - ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[0]); - ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]); - Operands[0] = - ARM64Operand::CreateToken(mnemonic, false, Op->getStartLoc(), Context); - - const MCExpr *NewImm = MCConstantExpr::Create(imm >> shift, Context); - Operands[2] = ARM64Operand::CreateImm(NewImm, Op2->getStartLoc(), - Op2->getEndLoc(), Context); - - Operands.push_back(ARM64Operand::CreateShifter( - ARM64_AM::LSL, shift, Op2->getStartLoc(), Op2->getEndLoc(), Context)); - delete Op2; - delete Op; -} - -bool ARM64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { - switch (ErrCode) { - case Match_MissingFeature: - return Error(Loc, - "instruction requires a CPU feature not currently enabled"); - case Match_InvalidOperand: - return Error(Loc, "invalid operand for instruction"); - case Match_InvalidSuffix: - return Error(Loc, "invalid type suffix for instruction"); - case Match_InvalidMemoryIndexedSImm9: - return Error(Loc, "index must be an integer in range [-256,255]."); - case Match_InvalidMemoryIndexed32SImm7: - return Error(Loc, "index must be a multiple of 4 in range [-256,252]."); - case Match_InvalidMemoryIndexed64SImm7: - return Error(Loc, "index must be a multiple of 8 in range [-512,504]."); - case Match_InvalidMemoryIndexed128SImm7: - return Error(Loc, "index must be a multiple of 16 in range [-1024,1008]."); - case Match_InvalidMemoryIndexed8: - return Error(Loc, "index must be an integer in range [0,4095]."); - case Match_InvalidMemoryIndexed16: - return Error(Loc, "index must be a multiple of 2 in range [0,8190]."); - case Match_InvalidMemoryIndexed32: - return Error(Loc, "index must be a multiple of 4 in range [0,16380]."); - case Match_InvalidMemoryIndexed64: - return Error(Loc, "index must be a multiple of 8 in range [0,32760]."); - case Match_InvalidMemoryIndexed128: - return Error(Loc, "index must be a multiple of 16 in range [0,65520]."); - case Match_InvalidImm1_8: - return Error(Loc, "immediate must be an integer in range [1,8]."); - case Match_InvalidImm1_16: - return Error(Loc, "immediate must be an integer in range [1,16]."); - case Match_InvalidImm1_32: - return Error(Loc, "immediate must be an integer in range [1,32]."); - case Match_InvalidImm1_64: - return Error(Loc, "immediate must be an integer in range [1,64]."); - case Match_MnemonicFail: - return Error(Loc, "unrecognized instruction mnemonic"); - default: - assert(0 && "unexpected error code!"); - return Error(Loc, "invalid instruction format"); - } -} - -bool ARM64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, - MCStreamer &Out, - unsigned &ErrorInfo, - bool MatchingInlineAsm) { - assert(!Operands.empty() && "Unexpected empty operand list!"); - ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[0]); - assert(Op->isToken() && "Leading operand should always be a mnemonic!"); - - StringRef Tok = Op->getToken(); - // Translate CMN/CMP pseudos to ADDS/SUBS with zero register destination. - // This needs to be done before the special handling of ADD/SUB immediates. - if (Tok == "cmp" || Tok == "cmn") { - // Replace the opcode with either ADDS or SUBS. 
- const char *Repl = StringSwitch<const char *>(Tok) - .Case("cmp", "subs") - .Case("cmn", "adds") - .Default(0); - assert(Repl && "Unknown compare instruction"); - delete Operands[0]; - Operands[0] = ARM64Operand::CreateToken(Repl, false, IDLoc, getContext()); - - // Insert WZR or XZR as destination operand. - ARM64Operand *RegOp = static_cast<ARM64Operand *>(Operands[1]); - unsigned ZeroReg; - if (RegOp->isReg() && - (isGPR32Register(RegOp->getReg()) || RegOp->getReg() == ARM64::WZR)) - ZeroReg = ARM64::WZR; - else - ZeroReg = ARM64::XZR; - Operands.insert( - Operands.begin() + 1, - ARM64Operand::CreateReg(ZeroReg, false, IDLoc, IDLoc, getContext())); - // Update since we modified it above. - ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[0]); - Tok = Op->getToken(); - } - - unsigned NumOperands = Operands.size(); - - if (Tok == "mov" && NumOperands == 3) { - // The MOV mnemonic is aliased to movn/movz, depending on the value of - // the immediate being instantiated. - // FIXME: Catching this here is a total hack, and we should use tblgen - // support to implement this instead as soon as it is available. - - ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]); - if (Op2->isImm()) { - if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op2->getImm())) { - uint64_t Val = CE->getValue(); - uint64_t NVal = ~Val; - - // If this is a 32-bit register and the value has none of the upper - // 32 bits set, clear the complemented upper 32 bits so the logic below - // works for 32-bit registers too. - ARM64Operand *Op1 = static_cast<ARM64Operand *>(Operands[1]); - if (Op1->isReg() && isGPR32Register(Op1->getReg()) && - (Val & 0xFFFFFFFFULL) == Val) - NVal &= 0x00000000FFFFFFFFULL; - - // MOVZ Rd, imm << 0 - if ((Val & 0xFFFF) == Val) - rewriteMOV(Operands, "movz", Val, 0, getContext()); - - // MOVZ Rd, imm << 16 - else if ((Val & 0xFFFF0000ULL) == Val) - rewriteMOV(Operands, "movz", Val, 16, getContext()); - - // MOVZ Rd, imm << 32 - else if ((Val & 0xFFFF00000000ULL) == Val) - rewriteMOV(Operands, "movz", Val, 32, getContext()); - - // MOVZ Rd, imm << 48 - else if ((Val & 0xFFFF000000000000ULL) == Val) - rewriteMOV(Operands, "movz", Val, 48, getContext()); - - // MOVN Rd, ~(imm << 0) - else if ((NVal & 0xFFFFULL) == NVal) - rewriteMOV(Operands, "movn", NVal, 0, getContext()); - - // MOVN Rd, ~(imm << 16) - else if ((NVal & 0xFFFF0000ULL) == NVal) - rewriteMOV(Operands, "movn", NVal, 16, getContext()); - - // MOVN Rd, ~(imm << 32) - else if ((NVal & 0xFFFF00000000ULL) == NVal) - rewriteMOV(Operands, "movn", NVal, 32, getContext()); - - // MOVN Rd, ~(imm << 48) - else if ((NVal & 0xFFFF000000000000ULL) == NVal) - rewriteMOV(Operands, "movn", NVal, 48, getContext()); - } - } - } else if (NumOperands == 4) { - if (Tok == "add" || Tok == "adds" || Tok == "sub" || Tok == "subs") { - // Handle the uimm24 immediate form, where the shift is not specified. 
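The MOV rewriting above picks movz when the value occupies a single 16-bit chunk and movn when its complement does, recording the chunk position as the LSL shift passed to rewriteMOV. A standalone sketch of the same selection (selectMovChunk is a hypothetical name):

    // Returns the LSL amount for a one-instruction mov alias, or -1 if the
    // value spans more than one 16-bit chunk. UseMovn reports whether the
    // complemented value matched. E.g. 0x20000 -> movz Rd, #2, lsl #16.
    static int selectMovChunk(unsigned long long Val, bool &UseMovn) {
      for (int Pass = 0; Pass < 2; ++Pass, Val = ~Val)
        for (int Shift = 0; Shift < 64; Shift += 16)
          if ((Val & (0xFFFFULL << Shift)) == Val) {
            UseMovn = Pass == 1;
            return Shift;
          }
      return -1;
    }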
- ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]); - if (Op3->isImm()) { - if (const MCConstantExpr *CE = - dyn_cast<MCConstantExpr>(Op3->getImm())) { - uint64_t Val = CE->getValue(); - if (Val >= (1 << 24)) { - Error(IDLoc, "immediate value is too large"); - return true; - } - if (Val < (1 << 12)) { - Operands.push_back(ARM64Operand::CreateShifter( - ARM64_AM::LSL, 0, IDLoc, IDLoc, getContext())); - } else if ((Val & 0xfff) == 0) { - delete Operands[3]; - CE = MCConstantExpr::Create(Val >> 12, getContext()); - Operands[3] = - ARM64Operand::CreateImm(CE, IDLoc, IDLoc, getContext()); - Operands.push_back(ARM64Operand::CreateShifter( - ARM64_AM::LSL, 12, IDLoc, IDLoc, getContext())); - } else { - Error(IDLoc, "immediate value is too large"); - return true; - } - } else { - Operands.push_back(ARM64Operand::CreateShifter( - ARM64_AM::LSL, 0, IDLoc, IDLoc, getContext())); - } - } - - // FIXME: Horrible hack to handle the LSL -> UBFM alias. - } else if (NumOperands == 4 && Tok == "lsl") { - ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]); - ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]); - if (Op2->isReg() && Op3->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm()); - if (Op3CE) { - uint64_t Op3Val = Op3CE->getValue(); - uint64_t NewOp3Val = 0; - uint64_t NewOp4Val = 0; - if (isGPR32Register(Op2->getReg()) || Op2->getReg() == ARM64::WZR) { - NewOp3Val = (32 - Op3Val) & 0x1f; - NewOp4Val = 31 - Op3Val; - } else { - NewOp3Val = (64 - Op3Val) & 0x3f; - NewOp4Val = 63 - Op3Val; - } - - const MCExpr *NewOp3 = - MCConstantExpr::Create(NewOp3Val, getContext()); - const MCExpr *NewOp4 = - MCConstantExpr::Create(NewOp4Val, getContext()); - - Operands[0] = ARM64Operand::CreateToken( - "ubfm", false, Op->getStartLoc(), getContext()); - Operands[3] = ARM64Operand::CreateImm(NewOp3, Op3->getStartLoc(), - Op3->getEndLoc(), getContext()); - Operands.push_back(ARM64Operand::CreateImm( - NewOp4, Op3->getStartLoc(), Op3->getEndLoc(), getContext())); - delete Op3; - delete Op; - } - } - - // FIXME: Horrible hack to handle the optional LSL shift for vector - // instructions. - } else if (NumOperands == 4 && (Tok == "bic" || Tok == "orr")) { - ARM64Operand *Op1 = static_cast<ARM64Operand *>(Operands[1]); - ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]); - ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]); - if ((Op1->isToken() && Op2->isVectorReg() && Op3->isImm()) || - (Op1->isVectorReg() && Op2->isToken() && Op3->isImm())) - Operands.push_back(ARM64Operand::CreateShifter(ARM64_AM::LSL, 0, IDLoc, - IDLoc, getContext())); - } else if (NumOperands == 4 && (Tok == "movi" || Tok == "mvni")) { - ARM64Operand *Op1 = static_cast<ARM64Operand *>(Operands[1]); - ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]); - ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]); - if ((Op1->isToken() && Op2->isVectorReg() && Op3->isImm()) || - (Op1->isVectorReg() && Op2->isToken() && Op3->isImm())) { - StringRef Suffix = Op1->isToken() ? Op1->getToken() : Op2->getToken(); - // Canonicalize on lower-case for ease of comparison. - std::string CanonicalSuffix = Suffix.lower(); - if (Tok != "movi" || - (CanonicalSuffix != ".1d" && CanonicalSuffix != ".2d" && - CanonicalSuffix != ".8b" && CanonicalSuffix != ".16b")) - Operands.push_back(ARM64Operand::CreateShifter( - ARM64_AM::LSL, 0, IDLoc, IDLoc, getContext())); - } - } - } else if (NumOperands == 5) { - // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and - // UBFIZ -> UBFM aliases. 
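The LSL alias handled above becomes UBFM with immr = (regsize - shift) mod regsize and imms = regsize - 1 - shift, exactly the NewOp3Val/NewOp4Val computation in the 32-bit branch. A worked instance (the helper name is illustrative only):

    // lsl w0, w1, #4  ==>  ubfm w0, w1, #28, #27
    // since (32 - 4) & 0x1f == 28 and 31 - 4 == 27.
    static void lslToUbfm32(unsigned Shift, unsigned &Immr, unsigned &Imms) {
      Immr = (32 - Shift) & 0x1f;
      Imms = 31 - Shift;
    }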
- if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") { - ARM64Operand *Op1 = static_cast(Operands[1]); - ARM64Operand *Op3 = static_cast(Operands[3]); - ARM64Operand *Op4 = static_cast(Operands[4]); - - if (Op1->isReg() && Op3->isImm() && Op4->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast(Op3->getImm()); - const MCConstantExpr *Op4CE = dyn_cast(Op4->getImm()); - - if (Op3CE && Op4CE) { - uint64_t Op3Val = Op3CE->getValue(); - uint64_t Op4Val = Op4CE->getValue(); - - uint64_t NewOp3Val = 0; - if (isGPR32Register(Op1->getReg())) - NewOp3Val = (32 - Op3Val) & 0x1f; - else - NewOp3Val = (64 - Op3Val) & 0x3f; - - uint64_t NewOp4Val = Op4Val - 1; - - const MCExpr *NewOp3 = - MCConstantExpr::Create(NewOp3Val, getContext()); - const MCExpr *NewOp4 = - MCConstantExpr::Create(NewOp4Val, getContext()); - Operands[3] = ARM64Operand::CreateImm(NewOp3, Op3->getStartLoc(), - Op3->getEndLoc(), getContext()); - Operands[4] = ARM64Operand::CreateImm(NewOp4, Op4->getStartLoc(), - Op4->getEndLoc(), getContext()); - if (Tok == "bfi") - Operands[0] = ARM64Operand::CreateToken( - "bfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "sbfiz") - Operands[0] = ARM64Operand::CreateToken( - "sbfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "ubfiz") - Operands[0] = ARM64Operand::CreateToken( - "ubfm", false, Op->getStartLoc(), getContext()); - else - llvm_unreachable("No valid mnemonic for alias?"); - - delete Op; - delete Op3; - delete Op4; - } - } - - // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and - // UBFX -> UBFM aliases. - } else if (NumOperands == 5 && - (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) { - ARM64Operand *Op1 = static_cast(Operands[1]); - ARM64Operand *Op3 = static_cast(Operands[3]); - ARM64Operand *Op4 = static_cast(Operands[4]); - - if (Op1->isReg() && Op3->isImm() && Op4->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast(Op3->getImm()); - const MCConstantExpr *Op4CE = dyn_cast(Op4->getImm()); - - if (Op3CE && Op4CE) { - uint64_t Op3Val = Op3CE->getValue(); - uint64_t Op4Val = Op4CE->getValue(); - uint64_t NewOp4Val = Op3Val + Op4Val - 1; - - if (NewOp4Val >= Op3Val) { - const MCExpr *NewOp4 = - MCConstantExpr::Create(NewOp4Val, getContext()); - Operands[4] = ARM64Operand::CreateImm( - NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext()); - if (Tok == "bfxil") - Operands[0] = ARM64Operand::CreateToken( - "bfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "sbfx") - Operands[0] = ARM64Operand::CreateToken( - "sbfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "ubfx") - Operands[0] = ARM64Operand::CreateToken( - "ubfm", false, Op->getStartLoc(), getContext()); - else - llvm_unreachable("No valid mnemonic for alias?"); - - delete Op; - delete Op4; - } - } - } - } - } - // FIXME: Horrible hack for tbz and tbnz with Wn register operand. - // InstAlias can't quite handle this since the reg classes aren't - // subclasses. - if (NumOperands == 4 && (Tok == "tbz" || Tok == "tbnz")) { - ARM64Operand *Op = static_cast(Operands[2]); - if (Op->isImm()) { - if (const MCConstantExpr *OpCE = dyn_cast(Op->getImm())) { - if (OpCE->getValue() < 32) { - // The source register can be Wn here, but the matcher expects a - // GPR64. Twiddle it here if necessary. 
- ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[1]); - if (Op->isReg()) { - unsigned Reg = getXRegFromWReg(Op->getReg()); - Operands[1] = ARM64Operand::CreateReg( - Reg, false, Op->getStartLoc(), Op->getEndLoc(), getContext()); - delete Op; - } - } - } - } - // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands. - // InstAlias can't quite handle this since the reg classes aren't - // subclasses. - if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) { - // The source register can be Wn here, but the matcher expects a - // GPR64. Twiddle it here if necessary. - ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[2]); - if (Op->isReg()) { - unsigned Reg = getXRegFromWReg(Op->getReg()); - Operands[2] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete Op; - } - } - // FIXME: Likewise for [su]xt[bh] with an Xd dst operand - else if (NumOperands == 3 && - (Tok == "sxtb" || Tok == "uxtb" || Tok == "sxth" || Tok == "uxth")) { - ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[1]); - if (Op->isReg() && isGPR64Reg(Op->getReg())) { - // The source register can be Wn here, but the matcher expects a - // GPR64. Twiddle it here if necessary. - ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[2]); - if (Op->isReg()) { - unsigned Reg = getXRegFromWReg(Op->getReg()); - Operands[2] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete Op; - } - } - } - - // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR. - if (NumOperands == 3 && Tok == "fmov") { - ARM64Operand *RegOp = static_cast<ARM64Operand *>(Operands[1]); - ARM64Operand *ImmOp = static_cast<ARM64Operand *>(Operands[2]); - if (RegOp->isReg() && ImmOp->isFPImm() && - ImmOp->getFPImm() == (unsigned)-1) { - unsigned zreg = - isFPR32Register(RegOp->getReg()) ? ARM64::WZR : ARM64::XZR; - Operands[2] = ARM64Operand::CreateReg(zreg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete ImmOp; - } - } - - // FIXME: Horrible hack to handle the literal .d[1] vector index on - // FMOV instructions. The index isn't an actual instruction operand - // but rather syntactic sugar. It really should be part of the mnemonic, - // not the operand, but whatever. - if ((NumOperands == 5) && Tok == "fmov") { - // If the last operand is a vector index of '1', then replace it with - // a '[' '1' ']' token sequence, which is what the matcher - // (annoyingly) expects for a literal vector index operand. - ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[NumOperands - 1]); - if (Op->isVectorIndexD() && Op->getVectorIndex() == 1) { - SMLoc Loc = Op->getStartLoc(); - Operands.pop_back(); - Operands.push_back( - ARM64Operand::CreateToken("[", false, Loc, getContext())); - Operands.push_back( - ARM64Operand::CreateToken("1", false, Loc, getContext())); - Operands.push_back( - ARM64Operand::CreateToken("]", false, Loc, getContext())); - } else if (Op->isReg()) { - // Similarly, check the destination operand for the GPR->High-lane - // variant. 
- unsigned OpNo = NumOperands - 2; - ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[OpNo]); - if (Op->isVectorIndexD() && Op->getVectorIndex() == 1) { - SMLoc Loc = Op->getStartLoc(); - Operands[OpNo] = - ARM64Operand::CreateToken("[", false, Loc, getContext()); - Operands.insert( - Operands.begin() + OpNo + 1, - ARM64Operand::CreateToken("1", false, Loc, getContext())); - Operands.insert( - Operands.begin() + OpNo + 2, - ARM64Operand::CreateToken("]", false, Loc, getContext())); - } - } - } - - MCInst Inst; - // First try to match against the secondary set of tables containing the - // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2"). - unsigned MatchResult = - MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1); - - // If that fails, try against the alternate table containing long-form NEON: - // "fadd v0.2s, v1.2s, v2.2s" - if (MatchResult != Match_Success) - MatchResult = - MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0); - - switch (MatchResult) { - case Match_Success: { - // Perform range checking and other semantic validations - SmallVector<SMLoc, 8> OperandLocs; - NumOperands = Operands.size(); - for (unsigned i = 1; i < NumOperands; ++i) - OperandLocs.push_back(Operands[i]->getStartLoc()); - if (validateInstruction(Inst, OperandLocs)) - return true; - - Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); - return false; - } - case Match_MissingFeature: - case Match_MnemonicFail: - return showMatchError(IDLoc, MatchResult); - case Match_InvalidOperand: { - SMLoc ErrorLoc = IDLoc; - if (ErrorInfo != ~0U) { - if (ErrorInfo >= Operands.size()) - return Error(IDLoc, "too few operands for instruction"); - - ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc(); - if (ErrorLoc == SMLoc()) - ErrorLoc = IDLoc; - } - // If the match failed on a suffix token operand, tweak the diagnostic - // accordingly. - if (((ARM64Operand *)Operands[ErrorInfo])->isToken() && - ((ARM64Operand *)Operands[ErrorInfo])->isTokenSuffix()) - MatchResult = Match_InvalidSuffix; - - return showMatchError(ErrorLoc, MatchResult); - } - case Match_InvalidMemoryIndexedSImm9: { - // If there is not a '!' after the memory operand that failed, we really - // want the diagnostic for the non-pre-indexed instruction variant instead. - // Be careful to check for the post-indexed variant as well, which also - // uses this match diagnostic. Also exclude the explicitly unscaled - // mnemonics, as they want the unscaled diagnostic as well. - if (Operands.size() == ErrorInfo + 1 && - !((ARM64Operand *)Operands[ErrorInfo])->isImm() && - !Tok.startswith("stur") && !Tok.startswith("ldur")) { - // whether we want an Indexed64 or Indexed32 diagnostic depends on - // the register class of the previous operand. Default to 64 in case - // we see something unexpected. - MatchResult = Match_InvalidMemoryIndexed64; - if (ErrorInfo) { - ARM64Operand *PrevOp = (ARM64Operand *)Operands[ErrorInfo - 1]; - if (PrevOp->isReg() && ARM64MCRegisterClasses[ARM64::GPR32RegClassID] - .contains(PrevOp->getReg())) - MatchResult = Match_InvalidMemoryIndexed32; - } - } - SMLoc ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc(); - if (ErrorLoc == SMLoc()) - ErrorLoc = IDLoc; - return showMatchError(ErrorLoc, MatchResult); - } - case Match_InvalidMemoryIndexed32: - case Match_InvalidMemoryIndexed64: - case Match_InvalidMemoryIndexed128: - // If there is a '!' after the memory operand that failed, we really - // want the diagnostic for the pre-indexed instruction variant instead. 
- if (Operands.size() > ErrorInfo + 1 && - ((ARM64Operand *)Operands[ErrorInfo + 1])->isTokenEqual("!")) - MatchResult = Match_InvalidMemoryIndexedSImm9; - // FALL THROUGH - case Match_InvalidMemoryIndexed8: - case Match_InvalidMemoryIndexed16: - case Match_InvalidMemoryIndexed32SImm7: - case Match_InvalidMemoryIndexed64SImm7: - case Match_InvalidMemoryIndexed128SImm7: - case Match_InvalidImm1_8: - case Match_InvalidImm1_16: - case Match_InvalidImm1_32: - case Match_InvalidImm1_64: { - // Any time we get here, there's nothing fancy to do. Just get the - // operand SMLoc and display the diagnostic. - SMLoc ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc(); - // If it's a memory operand, the error is with the offset immediate, - // so get that location instead. - if (((ARM64Operand *)Operands[ErrorInfo])->isMem()) - ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getOffsetLoc(); - if (ErrorLoc == SMLoc()) - ErrorLoc = IDLoc; - return showMatchError(ErrorLoc, MatchResult); - } - } - - llvm_unreachable("Implement any new match types added!"); - return true; -} - -/// ParseDirective parses the arm specific directives -bool ARM64AsmParser::ParseDirective(AsmToken DirectiveID) { - StringRef IDVal = DirectiveID.getIdentifier(); - SMLoc Loc = DirectiveID.getLoc(); - if (IDVal == ".hword") - return parseDirectiveWord(2, Loc); - if (IDVal == ".word") - return parseDirectiveWord(4, Loc); - if (IDVal == ".xword") - return parseDirectiveWord(8, Loc); - if (IDVal == ".tlsdesccall") - return parseDirectiveTLSDescCall(Loc); - - return parseDirectiveLOH(IDVal, Loc); -} - -/// parseDirectiveWord -/// ::= .word [ expression (, expression)* ] -bool ARM64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { - if (getLexer().isNot(AsmToken::EndOfStatement)) { - for (;;) { - const MCExpr *Value; - if (getParser().parseExpression(Value)) - return true; - - getParser().getStreamer().EmitValue(Value, Size); - - if (getLexer().is(AsmToken::EndOfStatement)) - break; - - // FIXME: Improve diagnostic. - if (getLexer().isNot(AsmToken::Comma)) - return Error(L, "unexpected token in directive"); - Parser.Lex(); - } - } - - Parser.Lex(); - return false; -} - -// parseDirectiveTLSDescCall: -// ::= .tlsdesccall symbol -bool ARM64AsmParser::parseDirectiveTLSDescCall(SMLoc L) { - StringRef Name; - if (getParser().parseIdentifier(Name)) - return Error(L, "expected symbol after directive"); - - MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); - const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext()); - Expr = ARM64MCExpr::Create(Expr, ARM64MCExpr::VK_TLSDESC, getContext()); - - MCInst Inst; - Inst.setOpcode(ARM64::TLSDESCCALL); - Inst.addOperand(MCOperand::CreateExpr(Expr)); - - getParser().getStreamer().EmitInstruction(Inst, STI); - return false; -} - -/// ::= .loh label1, ..., labelN -/// The number of arguments depends on the loh identifier. -bool ARM64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { - if (IDVal != MCLOHDirectiveName()) - return true; - MCLOHType Kind; - if (getParser().getTok().isNot(AsmToken::Identifier)) { - if (getParser().getTok().isNot(AsmToken::Integer)) - return TokError("expected an identifier or a number in directive"); - // We successfully get a numeric value for the identifier. - // Check if it is valid. - int64_t Id = getParser().getTok().getIntVal(); - Kind = (MCLOHType)Id; - // Check that Id does not overflow MCLOHType. 
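// (Annotation, not part of the original patch: the cast-and-compare that
// follows catches values that truncate. For example, an input Id of
// 0x100000003 becomes Kind == 3 after the cast to MCLOHType, so the
// Id != Kind comparison rejects it.)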
- if (!isValidMCLOHType(Kind) || Id != Kind) - return TokError("invalid numeric identifier in directive"); - } else { - StringRef Name = getTok().getIdentifier(); - // We successfully parse an identifier. - // Check if it is a recognized one. - int Id = MCLOHNameToId(Name); - - if (Id == -1) - return TokError("invalid identifier in directive"); - Kind = (MCLOHType)Id; - } - // Consume the identifier. - Lex(); - // Get the number of arguments of this LOH. - int NbArgs = MCLOHIdToNbArgs(Kind); - - assert(NbArgs != -1 && "Invalid number of arguments"); - - SmallVector<MCSymbol *, 3> Args; - for (int Idx = 0; Idx < NbArgs; ++Idx) { - StringRef Name; - if (getParser().parseIdentifier(Name)) - return TokError("expected identifier in directive"); - Args.push_back(getContext().GetOrCreateSymbol(Name)); - - if (Idx + 1 == NbArgs) - break; - if (getLexer().isNot(AsmToken::Comma)) - return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); - Lex(); - } - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); - - getStreamer().EmitLOHDirective((MCLOHType)Kind, Args); - return false; -} - -bool -ARM64AsmParser::classifySymbolRef(const MCExpr *Expr, - ARM64MCExpr::VariantKind &ELFRefKind, - MCSymbolRefExpr::VariantKind &DarwinRefKind, - const MCConstantExpr *&Addend) { - ELFRefKind = ARM64MCExpr::VK_INVALID; - DarwinRefKind = MCSymbolRefExpr::VK_None; - - if (const ARM64MCExpr *AE = dyn_cast<ARM64MCExpr>(Expr)) { - ELFRefKind = AE->getKind(); - Expr = AE->getSubExpr(); - } - - const MCSymbolRefExpr *SE = dyn_cast<MCSymbolRefExpr>(Expr); - if (SE) { - // It's a simple symbol reference with no addend. - DarwinRefKind = SE->getKind(); - Addend = 0; - return true; - } - - const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr); - if (!BE) - return false; - - SE = dyn_cast<MCSymbolRefExpr>(BE->getLHS()); - if (!SE) - return false; - DarwinRefKind = SE->getKind(); - - if (BE->getOpcode() != MCBinaryExpr::Add) - return false; - - // See if the addend is a constant, otherwise there's more going - // on here than we can deal with. - Addend = dyn_cast<MCConstantExpr>(BE->getRHS()); - if (!Addend) - return false; - - // It's some symbol reference + a constant addend, but really - // shouldn't use both Darwin and ELF syntax. - return ELFRefKind == ARM64MCExpr::VK_INVALID || - DarwinRefKind == MCSymbolRefExpr::VK_None; -} - -/// Force static initialization. -extern "C" void LLVMInitializeARM64AsmParser() { - RegisterMCAsmParser<ARM64AsmParser> X(TheARM64Target); -} - -#define GET_REGISTER_MATCHER -#define GET_MATCHER_IMPLEMENTATION -#include "ARM64GenAsmMatcher.inc" - -// Define this matcher function after the auto-generated include so we -// have the match class enum definitions. -unsigned ARM64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, - unsigned Kind) { - ARM64Operand *Op = static_cast<ARM64Operand *>(AsmOp); - // If the kind is a token for a literal immediate, check if our asm - // operand matches. This is for InstAliases which have a fixed-value - // immediate in the syntax.
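// (Annotation, not part of the original patch: the MCK__35_<N> enumerators in
// the switch that follows are TableGen's mangled match-class names for
// literal '#N' tokens; 35 is the ASCII code for '#'. So MCK__35_8, for
// instance, matches the fixed "#8" written in an InstAlias.)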
- int64_t ExpectedVal; - switch (Kind) { - default: - return Match_InvalidOperand; - case MCK__35_0: - ExpectedVal = 0; - break; - case MCK__35_1: - ExpectedVal = 1; - break; - case MCK__35_12: - ExpectedVal = 12; - break; - case MCK__35_16: - ExpectedVal = 16; - break; - case MCK__35_2: - ExpectedVal = 2; - break; - case MCK__35_24: - ExpectedVal = 24; - break; - case MCK__35_3: - ExpectedVal = 3; - break; - case MCK__35_32: - ExpectedVal = 32; - break; - case MCK__35_4: - ExpectedVal = 4; - break; - case MCK__35_48: - ExpectedVal = 48; - break; - case MCK__35_6: - ExpectedVal = 6; - break; - case MCK__35_64: - ExpectedVal = 64; - break; - case MCK__35_8: - ExpectedVal = 8; - break; - } - if (!Op->isImm()) - return Match_InvalidOperand; - const MCConstantExpr *CE = dyn_cast(Op->getImm()); - if (!CE) - return Match_InvalidOperand; - if (CE->getValue() == ExpectedVal) - return Match_Success; - return Match_InvalidOperand; -} diff --git a/lib/Target/ARM64/AsmParser/CMakeLists.txt b/lib/Target/ARM64/AsmParser/CMakeLists.txt deleted file mode 100644 index 826158b..0000000 --- a/lib/Target/ARM64/AsmParser/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64AsmParser - ARM64AsmParser.cpp - ) - diff --git a/lib/Target/ARM64/AsmParser/LLVMBuild.txt b/lib/Target/ARM64/AsmParser/LLVMBuild.txt deleted file mode 100644 index 2c8fafe..0000000 --- a/lib/Target/ARM64/AsmParser/LLVMBuild.txt +++ /dev/null @@ -1,24 +0,0 @@ -;===- ./lib/Target/ARM64/AsmParser/LLVMBuild.txt ---------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64AsmParser -parent = ARM64 -required_libraries = ARM64Desc ARM64Info MC MCParser Support -add_to_library_groups = ARM64 - diff --git a/lib/Target/ARM64/AsmParser/Makefile b/lib/Target/ARM64/AsmParser/Makefile deleted file mode 100644 index d25c47f..0000000 --- a/lib/Target/ARM64/AsmParser/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64AsmParser - -# Hack: we need to include 'main' ARM target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/CMakeLists.txt b/lib/Target/ARM64/CMakeLists.txt deleted file mode 100644 index 6de861c..0000000 --- a/lib/Target/ARM64/CMakeLists.txt +++ /dev/null @@ -1,50 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS ARM64.td) - -tablegen(LLVM ARM64GenRegisterInfo.inc -gen-register-info) -tablegen(LLVM ARM64GenInstrInfo.inc -gen-instr-info) -tablegen(LLVM ARM64GenMCCodeEmitter.inc -gen-emitter -mc-emitter) -tablegen(LLVM ARM64GenMCPseudoLowering.inc -gen-pseudo-lowering) -tablegen(LLVM ARM64GenAsmWriter.inc -gen-asm-writer) -tablegen(LLVM ARM64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) -tablegen(LLVM ARM64GenAsmMatcher.inc -gen-asm-matcher) -tablegen(LLVM ARM64GenDAGISel.inc -gen-dag-isel) -tablegen(LLVM ARM64GenFastISel.inc -gen-fast-isel) -tablegen(LLVM ARM64GenCallingConv.inc -gen-callingconv) -tablegen(LLVM ARM64GenSubtargetInfo.inc -gen-subtarget) -tablegen(LLVM ARM64GenDisassemblerTables.inc -gen-disassembler) -add_public_tablegen_target(ARM64CommonTableGen) - -add_llvm_target(ARM64CodeGen - ARM64AddressTypePromotion.cpp - ARM64AdvSIMDScalarPass.cpp - ARM64AsmPrinter.cpp - ARM64BranchRelaxation.cpp - ARM64CleanupLocalDynamicTLSPass.cpp - ARM64CollectLOH.cpp - ARM64ConditionalCompares.cpp - ARM64DeadRegisterDefinitionsPass.cpp - ARM64ExpandPseudoInsts.cpp - ARM64FastISel.cpp - ARM64FrameLowering.cpp - ARM64ISelDAGToDAG.cpp - ARM64ISelLowering.cpp - ARM64InstrInfo.cpp - ARM64LoadStoreOptimizer.cpp - ARM64MCInstLower.cpp - ARM64PromoteConstant.cpp - ARM64RegisterInfo.cpp - ARM64SelectionDAGInfo.cpp - ARM64StorePairSuppress.cpp - ARM64Subtarget.cpp - ARM64TargetMachine.cpp - ARM64TargetObjectFile.cpp - ARM64TargetTransformInfo.cpp -) - -add_dependencies(LLVMARM64CodeGen intrinsics_gen) - -add_subdirectory(TargetInfo) -add_subdirectory(AsmParser) -add_subdirectory(Disassembler) -add_subdirectory(InstPrinter) -add_subdirectory(MCTargetDesc) diff --git a/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp b/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp deleted file mode 100644 index 44c501f..0000000 --- a/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp +++ /dev/null @@ -1,2142 +0,0 @@ -//===- ARM64Disassembler.cpp - Disassembler for ARM64 -----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "arm64-disassembler" - -#include "ARM64Disassembler.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64BaseInfo.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCFixedLenDisassembler.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MemoryObject.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" - -// Pull DecodeStatus and its enum values into the global namespace. -typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; - -// Forward declare these because the autogenerated code will reference them. -// Definitions are further down. 
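// (Annotation, not part of the original patch: ARM64GenDisassemblerTables.inc
// emits generated decode logic that invokes these callbacks by name whenever
// a table entry requests a custom operand or instruction decode, which is
// why the declarations must precede the #include further down.)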
-static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); - -static DecodeStatus DecodeFixedPointScaleImm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeCondBranchTarget(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeSystemRegister(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeRegOffsetLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static 
DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeSystemCPSRInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeSIMDLdStPost(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeSIMDLdStSingle(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeSIMDLdStSingleTied(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder); - -static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); - -#include "ARM64GenDisassemblerTables.inc" -#include "ARM64GenInstrInfo.inc" - -using namespace llvm; - -#define Success llvm::MCDisassembler::Success -#define Fail llvm::MCDisassembler::Fail - -static MCDisassembler *createARM64Disassembler(const Target &T, - const MCSubtargetInfo &STI) { - return new ARM64Disassembler(STI); -} - -DecodeStatus ARM64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, - const MemoryObject &Region, - uint64_t Address, - raw_ostream &os, - raw_ostream &cs) const { - CommentStream = &cs; - - uint8_t bytes[4]; - - Size = 0; - // We want to read exactly 4 bytes of data. - if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1) - return Fail; - Size = 4; - - // Encoded as a little-endian 32-bit word in the stream. - uint32_t insn = - (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0); - - // Calling the auto-generated decoder function.
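// (Annotation, not part of the original patch: a worked example of the
// little-endian reassembly above — the in-memory bytes 1f 20 03 d5 produce
// insn == 0xd503201f, the A64 NOP encoding, which the generated table call
// below then decodes.)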
- DecodeStatus result = - decodeInstruction(DecoderTable32, MI, insn, Address, this, STI); - if (!result) - return Fail; - - return Success; -} - -static MCSymbolRefExpr::VariantKind -getVariant(uint64_t LLVMDisassembler_VariantKind) { - switch (LLVMDisassembler_VariantKind) { - case LLVMDisassembler_VariantKind_None: - return MCSymbolRefExpr::VK_None; - case LLVMDisassembler_VariantKind_ARM64_PAGE: - return MCSymbolRefExpr::VK_PAGE; - case LLVMDisassembler_VariantKind_ARM64_PAGEOFF: - return MCSymbolRefExpr::VK_PAGEOFF; - case LLVMDisassembler_VariantKind_ARM64_GOTPAGE: - return MCSymbolRefExpr::VK_GOTPAGE; - case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF: - return MCSymbolRefExpr::VK_GOTPAGEOFF; - case LLVMDisassembler_VariantKind_ARM64_TLVP: - case LLVMDisassembler_VariantKind_ARM64_TLVOFF: - default: - assert(0 && "bad LLVMDisassembler_VariantKind"); - return MCSymbolRefExpr::VK_None; - } -} - -/// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic -/// operand in place of the immediate Value in the MCInst. The immediate -/// Value has not had any PC adjustment made by the caller. If the instruction -/// is a branch that adds the PC to the immediate Value then isBranch is -/// true, else false. If the getOpInfo() function was set as part of the -/// setupForSymbolicDisassembly() call then that function is called to get any -/// symbolic information at the Address for this instruction. If that returns -/// non-zero then the symbolic information it returns is used to create an -/// MCExpr and that is added as an operand to the MCInst. If getOpInfo() -/// returns zero and isBranch is true then a symbol lookup for -/// Address + Value is done and if a symbol is found an MCExpr is created with -/// that, else an MCExpr with Address + Value is created. If getOpInfo() -/// returns zero and isBranch is false then the Opcode of the MCInst is -/// tested, and for ADRP and other instructions that help to load pointers, -/// a symbol lookup is done to see if it returns a specific reference type -/// to add to the comment stream. This function returns true if it adds -/// an operand to the MCInst and false otherwise.
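// (Annotation, not part of the original patch: a minimal sketch of the
// "symbol found" path described above, using a hypothetical symbol name;
// the complete logic is the function that follows.)
//   MCContext *Ctx = getMCContext();
//   MCSymbol *Sym = Ctx->GetOrCreateSymbol(StringRef("_target"));
//   const MCExpr *E = MCSymbolRefExpr::Create(Sym, *Ctx);
//   MI.addOperand(MCOperand::CreateExpr(E));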
-bool ARM64Disassembler::tryAddingSymbolicOperand(uint64_t Address, int Value, - bool isBranch, - uint64_t InstSize, MCInst &MI, - uint32_t insn) const { - LLVMOpInfoCallback getOpInfo = getLLVMOpInfoCallback(); - - struct LLVMOpInfo1 SymbolicOp; - memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); - SymbolicOp.Value = Value; - void *DisInfo = getDisInfoBlock(); - uint64_t ReferenceType; - const char *ReferenceName; - const char *Name; - LLVMSymbolLookupCallback SymbolLookUp = getLLVMSymbolLookupCallback(); - if (!getOpInfo || - !getOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) { - if (isBranch) { - if (SymbolLookUp) { - ReferenceType = LLVMDisassembler_ReferenceType_In_Branch; - Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, - &ReferenceName); - if (Name) { - SymbolicOp.AddSymbol.Name = Name; - SymbolicOp.AddSymbol.Present = Success; - SymbolicOp.Value = 0; - } else { - SymbolicOp.Value = Address + Value; - } - if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub) - (*CommentStream) << "symbol stub for: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Message) - (*CommentStream) << "Objc message: " << ReferenceName; - } else { - return false; - } - } else if (MI.getOpcode() == ARM64::ADRP) { - if (SymbolLookUp) { - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP; - Name = SymbolLookUp(DisInfo, insn, &ReferenceType, Address, - &ReferenceName); - (*CommentStream) << format("0x%llx", - 0xfffffffffffff000LL & (Address + Value)); - } else { - return false; - } - } else if (MI.getOpcode() == ARM64::ADDXri || - MI.getOpcode() == ARM64::LDRXui || - MI.getOpcode() == ARM64::LDRXl || MI.getOpcode() == ARM64::ADR) { - if (SymbolLookUp) { - if (MI.getOpcode() == ARM64::ADDXri) - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri; - else if (MI.getOpcode() == ARM64::LDRXui) - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui; - if (MI.getOpcode() == ARM64::LDRXl) { - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl; - Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, - &ReferenceName); - } else if (MI.getOpcode() == ARM64::ADR) { - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR; - Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, - &ReferenceName); - } else { - Name = SymbolLookUp(DisInfo, insn, &ReferenceType, Address, - &ReferenceName); - } - if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr) - (*CommentStream) << "literal pool symbol address: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) - (*CommentStream) << "literal pool for: \"" << ReferenceName << "\""; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref) - (*CommentStream) << "Objc cfstring ref: @\"" << ReferenceName << "\""; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Message) - (*CommentStream) << "Objc message: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref) - (*CommentStream) << "Objc message ref: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref) - (*CommentStream) << "Objc selector ref: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref) - (*CommentStream) << "Objc class ref: " << ReferenceName; - // For these instructions, the SymbolLookUp() 
above is just to get the - // ReferenceType and ReferenceName. We want to make sure not to - // fall through so we don't build an MCExpr to leave the disassembly - // of the immediate values of these instructions to the InstPrinter. - return false; - } else { - return false; - } - } else { - return false; - } - } - - MCContext *Ctx = getMCContext(); - const MCExpr *Add = NULL; - if (SymbolicOp.AddSymbol.Present) { - if (SymbolicOp.AddSymbol.Name) { - StringRef Name(SymbolicOp.AddSymbol.Name); - MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name); - MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind); - if (Variant != MCSymbolRefExpr::VK_None) - Add = MCSymbolRefExpr::Create(Sym, Variant, *Ctx); - else - Add = MCSymbolRefExpr::Create(Sym, *Ctx); - } else { - Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx); - } - } - - const MCExpr *Sub = NULL; - if (SymbolicOp.SubtractSymbol.Present) { - if (SymbolicOp.SubtractSymbol.Name) { - StringRef Name(SymbolicOp.SubtractSymbol.Name); - MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name); - Sub = MCSymbolRefExpr::Create(Sym, *Ctx); - } else { - Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx); - } - } - - const MCExpr *Off = NULL; - if (SymbolicOp.Value != 0) - Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx); - - const MCExpr *Expr; - if (Sub) { - const MCExpr *LHS; - if (Add) - LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx); - else - LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx); - if (Off != 0) - Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx); - else - Expr = LHS; - } else if (Add) { - if (Off != 0) - Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx); - else - Expr = Add; - } else { - if (Off != 0) - Expr = Off; - else - Expr = MCConstantExpr::Create(0, *Ctx); - } - - MI.addOperand(MCOperand::CreateExpr(Expr)); - - return true; -} - -extern "C" void LLVMInitializeARM64Disassembler() { - TargetRegistry::RegisterMCDisassembler(TheARM64Target, - createARM64Disassembler); -} - -static const unsigned FPR128DecoderTable[] = { - ARM64::Q0, ARM64::Q1, ARM64::Q2, ARM64::Q3, ARM64::Q4, ARM64::Q5, - ARM64::Q6, ARM64::Q7, ARM64::Q8, ARM64::Q9, ARM64::Q10, ARM64::Q11, - ARM64::Q12, ARM64::Q13, ARM64::Q14, ARM64::Q15, ARM64::Q16, ARM64::Q17, - ARM64::Q18, ARM64::Q19, ARM64::Q20, ARM64::Q21, ARM64::Q22, ARM64::Q23, - ARM64::Q24, ARM64::Q25, ARM64::Q26, ARM64::Q27, ARM64::Q28, ARM64::Q29, - ARM64::Q30, ARM64::Q31 -}; - -static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR128DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 15) - return Fail; - return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder); -} - -static const unsigned FPR64DecoderTable[] = { - ARM64::D0, ARM64::D1, ARM64::D2, ARM64::D3, ARM64::D4, ARM64::D5, - ARM64::D6, ARM64::D7, ARM64::D8, ARM64::D9, ARM64::D10, ARM64::D11, - ARM64::D12, ARM64::D13, ARM64::D14, ARM64::D15, ARM64::D16, ARM64::D17, - ARM64::D18, ARM64::D19, ARM64::D20, ARM64::D21, ARM64::D22, ARM64::D23, - ARM64::D24, ARM64::D25, ARM64::D26, ARM64::D27, ARM64::D28, ARM64::D29, - ARM64::D30, ARM64::D31 -}; - -static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = 
FPR64DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned FPR32DecoderTable[] = { - ARM64::S0, ARM64::S1, ARM64::S2, ARM64::S3, ARM64::S4, ARM64::S5, - ARM64::S6, ARM64::S7, ARM64::S8, ARM64::S9, ARM64::S10, ARM64::S11, - ARM64::S12, ARM64::S13, ARM64::S14, ARM64::S15, ARM64::S16, ARM64::S17, - ARM64::S18, ARM64::S19, ARM64::S20, ARM64::S21, ARM64::S22, ARM64::S23, - ARM64::S24, ARM64::S25, ARM64::S26, ARM64::S27, ARM64::S28, ARM64::S29, - ARM64::S30, ARM64::S31 -}; - -static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR32DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned FPR16DecoderTable[] = { - ARM64::H0, ARM64::H1, ARM64::H2, ARM64::H3, ARM64::H4, ARM64::H5, - ARM64::H6, ARM64::H7, ARM64::H8, ARM64::H9, ARM64::H10, ARM64::H11, - ARM64::H12, ARM64::H13, ARM64::H14, ARM64::H15, ARM64::H16, ARM64::H17, - ARM64::H18, ARM64::H19, ARM64::H20, ARM64::H21, ARM64::H22, ARM64::H23, - ARM64::H24, ARM64::H25, ARM64::H26, ARM64::H27, ARM64::H28, ARM64::H29, - ARM64::H30, ARM64::H31 -}; - -static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR16DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned FPR8DecoderTable[] = { - ARM64::B0, ARM64::B1, ARM64::B2, ARM64::B3, ARM64::B4, ARM64::B5, - ARM64::B6, ARM64::B7, ARM64::B8, ARM64::B9, ARM64::B10, ARM64::B11, - ARM64::B12, ARM64::B13, ARM64::B14, ARM64::B15, ARM64::B16, ARM64::B17, - ARM64::B18, ARM64::B19, ARM64::B20, ARM64::B21, ARM64::B22, ARM64::B23, - ARM64::B24, ARM64::B25, ARM64::B26, ARM64::B27, ARM64::B28, ARM64::B29, - ARM64::B30, ARM64::B31 -}; - -static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR8DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned GPR64DecoderTable[] = { - ARM64::X0, ARM64::X1, ARM64::X2, ARM64::X3, ARM64::X4, ARM64::X5, - ARM64::X6, ARM64::X7, ARM64::X8, ARM64::X9, ARM64::X10, ARM64::X11, - ARM64::X12, ARM64::X13, ARM64::X14, ARM64::X15, ARM64::X16, ARM64::X17, - ARM64::X18, ARM64::X19, ARM64::X20, ARM64::X21, ARM64::X22, ARM64::X23, - ARM64::X24, ARM64::X25, ARM64::X26, ARM64::X27, ARM64::X28, ARM64::FP, - ARM64::LR, ARM64::XZR -}; - -static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = GPR64DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = GPR64DecoderTable[RegNo]; - if (Register == ARM64::XZR) - Register = ARM64::SP; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned GPR32DecoderTable[] = { - ARM64::W0, ARM64::W1, ARM64::W2, ARM64::W3, ARM64::W4, ARM64::W5, - ARM64::W6, ARM64::W7, ARM64::W8, ARM64::W9, ARM64::W10, ARM64::W11, - ARM64::W12, ARM64::W13, ARM64::W14, ARM64::W15, ARM64::W16, ARM64::W17, - ARM64::W18, 
ARM64::W19, ARM64::W20, ARM64::W21, ARM64::W22, ARM64::W23, - ARM64::W24, ARM64::W25, ARM64::W26, ARM64::W27, ARM64::W28, ARM64::W29, - ARM64::W30, ARM64::WZR -}; - -static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = GPR32DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = GPR32DecoderTable[RegNo]; - if (Register == ARM64::WZR) - Register = ARM64::WSP; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned VectorDecoderTable[] = { - ARM64::Q0, ARM64::Q1, ARM64::Q2, ARM64::Q3, ARM64::Q4, ARM64::Q5, - ARM64::Q6, ARM64::Q7, ARM64::Q8, ARM64::Q9, ARM64::Q10, ARM64::Q11, - ARM64::Q12, ARM64::Q13, ARM64::Q14, ARM64::Q15, ARM64::Q16, ARM64::Q17, - ARM64::Q18, ARM64::Q19, ARM64::Q20, ARM64::Q21, ARM64::Q22, ARM64::Q23, - ARM64::Q24, ARM64::Q25, ARM64::Q26, ARM64::Q27, ARM64::Q28, ARM64::Q29, - ARM64::Q30, ARM64::Q31 -}; - -static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = VectorDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned QQDecoderTable[] = { - ARM64::Q0_Q1, ARM64::Q1_Q2, ARM64::Q2_Q3, ARM64::Q3_Q4, - ARM64::Q4_Q5, ARM64::Q5_Q6, ARM64::Q6_Q7, ARM64::Q7_Q8, - ARM64::Q8_Q9, ARM64::Q9_Q10, ARM64::Q10_Q11, ARM64::Q11_Q12, - ARM64::Q12_Q13, ARM64::Q13_Q14, ARM64::Q14_Q15, ARM64::Q15_Q16, - ARM64::Q16_Q17, ARM64::Q17_Q18, ARM64::Q18_Q19, ARM64::Q19_Q20, - ARM64::Q20_Q21, ARM64::Q21_Q22, ARM64::Q22_Q23, ARM64::Q23_Q24, - ARM64::Q24_Q25, ARM64::Q25_Q26, ARM64::Q26_Q27, ARM64::Q27_Q28, - ARM64::Q28_Q29, ARM64::Q29_Q30, ARM64::Q30_Q31, ARM64::Q31_Q0 -}; - -static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = QQDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned QQQDecoderTable[] = { - ARM64::Q0_Q1_Q2, ARM64::Q1_Q2_Q3, ARM64::Q2_Q3_Q4, - ARM64::Q3_Q4_Q5, ARM64::Q4_Q5_Q6, ARM64::Q5_Q6_Q7, - ARM64::Q6_Q7_Q8, ARM64::Q7_Q8_Q9, ARM64::Q8_Q9_Q10, - ARM64::Q9_Q10_Q11, ARM64::Q10_Q11_Q12, ARM64::Q11_Q12_Q13, - ARM64::Q12_Q13_Q14, ARM64::Q13_Q14_Q15, ARM64::Q14_Q15_Q16, - ARM64::Q15_Q16_Q17, ARM64::Q16_Q17_Q18, ARM64::Q17_Q18_Q19, - ARM64::Q18_Q19_Q20, ARM64::Q19_Q20_Q21, ARM64::Q20_Q21_Q22, - ARM64::Q21_Q22_Q23, ARM64::Q22_Q23_Q24, ARM64::Q23_Q24_Q25, - ARM64::Q24_Q25_Q26, ARM64::Q25_Q26_Q27, ARM64::Q26_Q27_Q28, - ARM64::Q27_Q28_Q29, ARM64::Q28_Q29_Q30, ARM64::Q29_Q30_Q31, - ARM64::Q30_Q31_Q0, ARM64::Q31_Q0_Q1 -}; - -static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = QQQDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned QQQQDecoderTable[] = { - ARM64::Q0_Q1_Q2_Q3, ARM64::Q1_Q2_Q3_Q4, ARM64::Q2_Q3_Q4_Q5, - ARM64::Q3_Q4_Q5_Q6, ARM64::Q4_Q5_Q6_Q7, ARM64::Q5_Q6_Q7_Q8, - ARM64::Q6_Q7_Q8_Q9, ARM64::Q7_Q8_Q9_Q10, ARM64::Q8_Q9_Q10_Q11, - ARM64::Q9_Q10_Q11_Q12, ARM64::Q10_Q11_Q12_Q13, 
ARM64::Q11_Q12_Q13_Q14, - ARM64::Q12_Q13_Q14_Q15, ARM64::Q13_Q14_Q15_Q16, ARM64::Q14_Q15_Q16_Q17, - ARM64::Q15_Q16_Q17_Q18, ARM64::Q16_Q17_Q18_Q19, ARM64::Q17_Q18_Q19_Q20, - ARM64::Q18_Q19_Q20_Q21, ARM64::Q19_Q20_Q21_Q22, ARM64::Q20_Q21_Q22_Q23, - ARM64::Q21_Q22_Q23_Q24, ARM64::Q22_Q23_Q24_Q25, ARM64::Q23_Q24_Q25_Q26, - ARM64::Q24_Q25_Q26_Q27, ARM64::Q25_Q26_Q27_Q28, ARM64::Q26_Q27_Q28_Q29, - ARM64::Q27_Q28_Q29_Q30, ARM64::Q28_Q29_Q30_Q31, ARM64::Q29_Q30_Q31_Q0, - ARM64::Q30_Q31_Q0_Q1, ARM64::Q31_Q0_Q1_Q2 -}; - -static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = QQQQDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned DDDecoderTable[] = { - ARM64::D0_D1, ARM64::D1_D2, ARM64::D2_D3, ARM64::D3_D4, - ARM64::D4_D5, ARM64::D5_D6, ARM64::D6_D7, ARM64::D7_D8, - ARM64::D8_D9, ARM64::D9_D10, ARM64::D10_D11, ARM64::D11_D12, - ARM64::D12_D13, ARM64::D13_D14, ARM64::D14_D15, ARM64::D15_D16, - ARM64::D16_D17, ARM64::D17_D18, ARM64::D18_D19, ARM64::D19_D20, - ARM64::D20_D21, ARM64::D21_D22, ARM64::D22_D23, ARM64::D23_D24, - ARM64::D24_D25, ARM64::D25_D26, ARM64::D26_D27, ARM64::D27_D28, - ARM64::D28_D29, ARM64::D29_D30, ARM64::D30_D31, ARM64::D31_D0 -}; - -static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = DDDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned DDDDecoderTable[] = { - ARM64::D0_D1_D2, ARM64::D1_D2_D3, ARM64::D2_D3_D4, - ARM64::D3_D4_D5, ARM64::D4_D5_D6, ARM64::D5_D6_D7, - ARM64::D6_D7_D8, ARM64::D7_D8_D9, ARM64::D8_D9_D10, - ARM64::D9_D10_D11, ARM64::D10_D11_D12, ARM64::D11_D12_D13, - ARM64::D12_D13_D14, ARM64::D13_D14_D15, ARM64::D14_D15_D16, - ARM64::D15_D16_D17, ARM64::D16_D17_D18, ARM64::D17_D18_D19, - ARM64::D18_D19_D20, ARM64::D19_D20_D21, ARM64::D20_D21_D22, - ARM64::D21_D22_D23, ARM64::D22_D23_D24, ARM64::D23_D24_D25, - ARM64::D24_D25_D26, ARM64::D25_D26_D27, ARM64::D26_D27_D28, - ARM64::D27_D28_D29, ARM64::D28_D29_D30, ARM64::D29_D30_D31, - ARM64::D30_D31_D0, ARM64::D31_D0_D1 -}; - -static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = DDDDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned DDDDDecoderTable[] = { - ARM64::D0_D1_D2_D3, ARM64::D1_D2_D3_D4, ARM64::D2_D3_D4_D5, - ARM64::D3_D4_D5_D6, ARM64::D4_D5_D6_D7, ARM64::D5_D6_D7_D8, - ARM64::D6_D7_D8_D9, ARM64::D7_D8_D9_D10, ARM64::D8_D9_D10_D11, - ARM64::D9_D10_D11_D12, ARM64::D10_D11_D12_D13, ARM64::D11_D12_D13_D14, - ARM64::D12_D13_D14_D15, ARM64::D13_D14_D15_D16, ARM64::D14_D15_D16_D17, - ARM64::D15_D16_D17_D18, ARM64::D16_D17_D18_D19, ARM64::D17_D18_D19_D20, - ARM64::D18_D19_D20_D21, ARM64::D19_D20_D21_D22, ARM64::D20_D21_D22_D23, - ARM64::D21_D22_D23_D24, ARM64::D22_D23_D24_D25, ARM64::D23_D24_D25_D26, - ARM64::D24_D25_D26_D27, ARM64::D25_D26_D27_D28, ARM64::D26_D27_D28_D29, - ARM64::D27_D28_D29_D30, ARM64::D28_D29_D30_D31, ARM64::D29_D30_D31_D0, - ARM64::D30_D31_D0_D1, ARM64::D31_D0_D1_D2 -}; - -static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = 
DDDDDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeFixedPointScaleImm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm(64 - Imm)); - return Success; -} - -static DecodeStatus DecodeCondBranchTarget(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - int64_t ImmVal = Imm; - const ARM64Disassembler *Dis = - static_cast(Decoder); - - // Sign-extend 19-bit immediate. - if (ImmVal & (1 << (19 - 1))) - ImmVal |= ~((1LL << 19) - 1); - - if (!Dis->tryAddingSymbolicOperand(Addr, ImmVal << 2, - Inst.getOpcode() != ARM64::LDRXl, 4, Inst)) - Inst.addOperand(MCOperand::CreateImm(ImmVal)); - return Success; -} - -static DecodeStatus DecodeSystemRegister(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm(Imm | 0x8000)); - return Success; -} - -static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm, - unsigned Add) { - Inst.addOperand(MCOperand::CreateImm(Add - Imm)); - return Success; -} - -static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm, - unsigned Add) { - Inst.addOperand(MCOperand::CreateImm((Imm + Add) & (Add - 1))); - return Success; -} - -static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 64); -} - -static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm | 0x20, 64); -} - -static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 32); -} - -static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm | 0x10, 32); -} - -static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 16); -} - -static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm | 0x8, 16); -} - -static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 8); -} - -static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 64); -} - -static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 32); -} - -static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 16); -} - -static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 8); -} - -static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rm = fieldFromInstruction(insn, 16, 5); - unsigned shiftHi = fieldFromInstruction(insn, 22, 2); - unsigned shiftLo = 
fieldFromInstruction(insn, 10, 6); - unsigned shift = (shiftHi << 6) | shiftLo; - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::ANDWrs: - case ARM64::ANDSWrs: - case ARM64::BICWrs: - case ARM64::BICSWrs: - case ARM64::ORRWrs: - case ARM64::ORNWrs: - case ARM64::EORWrs: - case ARM64::EONWrs: - case ARM64::ADDWrs: - case ARM64::ADDSWrs: - case ARM64::SUBWrs: - case ARM64::SUBSWrs: { - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - } - case ARM64::ANDXrs: - case ARM64::ANDSXrs: - case ARM64::BICXrs: - case ARM64::BICSXrs: - case ARM64::ORRXrs: - case ARM64::ORNXrs: - case ARM64::EORXrs: - case ARM64::EONXrs: - case ARM64::ADDXrs: - case ARM64::ADDSXrs: - case ARM64::SUBXrs: - case ARM64::SUBSXrs: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - break; - } - - Inst.addOperand(MCOperand::CreateImm(shift)); - return Success; -} - -static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned imm = fieldFromInstruction(insn, 5, 16); - unsigned shift = fieldFromInstruction(insn, 21, 2); - shift <<= 4; - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::MOVZWi: - case ARM64::MOVNWi: - case ARM64::MOVKWi: - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - break; - case ARM64::MOVZXi: - case ARM64::MOVNXi: - case ARM64::MOVKXi: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - break; - } - - if (Inst.getOpcode() == ARM64::MOVKWi || Inst.getOpcode() == ARM64::MOVKXi) - Inst.addOperand(Inst.getOperand(0)); - - Inst.addOperand(MCOperand::CreateImm(imm)); - Inst.addOperand(MCOperand::CreateImm(shift)); - return Success; -} - -static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned offset = fieldFromInstruction(insn, 10, 12); - const ARM64Disassembler *Dis = - static_cast(Decoder); - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::PRFMui: - // Rt is an immediate in prefetch. 
- Inst.addOperand(MCOperand::CreateImm(Rt)); - break; - case ARM64::STRBBui: - case ARM64::LDRBBui: - case ARM64::LDRSBWui: - case ARM64::STRHHui: - case ARM64::LDRHHui: - case ARM64::LDRSHWui: - case ARM64::STRWui: - case ARM64::LDRWui: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRSBXui: - case ARM64::LDRSHXui: - case ARM64::LDRSWui: - case ARM64::STRXui: - case ARM64::LDRXui: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRQui: - case ARM64::STRQui: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRDui: - case ARM64::STRDui: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRSui: - case ARM64::STRSui: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRHui: - case ARM64::STRHui: - DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRBui: - case ARM64::STRBui: - DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - if (!Dis->tryAddingSymbolicOperand(Addr, offset, Fail, 4, Inst, insn)) - Inst.addOperand(MCOperand::CreateImm(offset)); - return Success; -} - -static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - int64_t offset = fieldFromInstruction(insn, 12, 9); - - // offset is a 9-bit signed immediate, so sign extend it to - // fill the unsigned. - if (offset & (1 << (9 - 1))) - offset |= ~((1LL << 9) - 1); - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::PRFUMi: - // Rt is an immediate in prefetch. - Inst.addOperand(MCOperand::CreateImm(Rt)); - break; - case ARM64::STURBBi: - case ARM64::LDURBBi: - case ARM64::LDURSBWi: - case ARM64::STURHHi: - case ARM64::LDURHHi: - case ARM64::LDURSHWi: - case ARM64::STURWi: - case ARM64::LDURWi: - case ARM64::LDTRSBWi: - case ARM64::LDTRSHWi: - case ARM64::STTRWi: - case ARM64::LDTRWi: - case ARM64::STTRHi: - case ARM64::LDTRHi: - case ARM64::LDTRBi: - case ARM64::STTRBi: - case ARM64::LDRSBWpre: - case ARM64::LDRSHWpre: - case ARM64::STRBBpre: - case ARM64::LDRBBpre: - case ARM64::STRHHpre: - case ARM64::LDRHHpre: - case ARM64::STRWpre: - case ARM64::LDRWpre: - case ARM64::LDRSBWpost: - case ARM64::LDRSHWpost: - case ARM64::STRBBpost: - case ARM64::LDRBBpost: - case ARM64::STRHHpost: - case ARM64::LDRHHpost: - case ARM64::STRWpost: - case ARM64::LDRWpost: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURSBXi: - case ARM64::LDURSHXi: - case ARM64::LDURSWi: - case ARM64::STURXi: - case ARM64::LDURXi: - case ARM64::LDTRSBXi: - case ARM64::LDTRSHXi: - case ARM64::LDTRSWi: - case ARM64::STTRXi: - case ARM64::LDTRXi: - case ARM64::LDRSBXpre: - case ARM64::LDRSHXpre: - case ARM64::STRXpre: - case ARM64::LDRSWpre: - case ARM64::LDRXpre: - case ARM64::LDRSBXpost: - case ARM64::LDRSHXpost: - case ARM64::STRXpost: - case ARM64::LDRSWpost: - case ARM64::LDRXpost: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURQi: - case ARM64::STURQi: - case ARM64::LDRQpre: - case ARM64::STRQpre: - case ARM64::LDRQpost: - case ARM64::STRQpost: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURDi: - case ARM64::STURDi: - case ARM64::LDRDpre: - case ARM64::STRDpre: - case ARM64::LDRDpost: - case ARM64::STRDpost: - DecodeFPR64RegisterClass(Inst, Rt, Addr, 
Decoder); - break; - case ARM64::LDURSi: - case ARM64::STURSi: - case ARM64::LDRSpre: - case ARM64::STRSpre: - case ARM64::LDRSpost: - case ARM64::STRSpost: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURHi: - case ARM64::STURHi: - case ARM64::LDRHpre: - case ARM64::STRHpre: - case ARM64::LDRHpost: - case ARM64::STRHpost: - DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURBi: - case ARM64::STURBi: - case ARM64::LDRBpre: - case ARM64::STRBpre: - case ARM64::LDRBpost: - case ARM64::STRBpost: - DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - Inst.addOperand(MCOperand::CreateImm(offset)); - return Success; -} - -static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rt2 = fieldFromInstruction(insn, 10, 5); - unsigned Rs = fieldFromInstruction(insn, 16, 5); - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::STLXRW: - case ARM64::STLXRB: - case ARM64::STLXRH: - case ARM64::STXRW: - case ARM64::STXRB: - case ARM64::STXRH: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDARW: - case ARM64::LDARB: - case ARM64::LDARH: - case ARM64::LDAXRW: - case ARM64::LDAXRB: - case ARM64::LDAXRH: - case ARM64::LDXRW: - case ARM64::LDXRB: - case ARM64::LDXRH: - case ARM64::STLRW: - case ARM64::STLRB: - case ARM64::STLRH: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::STLXRX: - case ARM64::STXRX: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDARX: - case ARM64::LDAXRX: - case ARM64::LDXRX: - case ARM64::STLRX: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::STLXPW: - case ARM64::STXPW: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDAXPW: - case ARM64::LDXPW: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::STLXPX: - case ARM64::STXPX: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDAXPX: - case ARM64::LDXPX: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - return Success; -} - -static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rt2 = fieldFromInstruction(insn, 10, 5); - int64_t offset = fieldFromInstruction(insn, 15, 7); - - // offset is a 7-bit signed immediate, so sign extend it to - // fill the unsigned. 
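// (Annotation, not part of the original patch: a worked example of the same
// sign-extension idiom — for the 7-bit field here, a raw value of 0x7f has
// bit 6 set, and OR-ing with ~0x7f yields -1; 0x3f has bit 6 clear and stays
// +63. The 9-bit variant earlier works the same way with bit 8.)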
- if (offset & (1 << (7 - 1))) - offset |= ~((1LL << 7) - 1); - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::LDNPXi: - case ARM64::STNPXi: - case ARM64::LDPXpost: - case ARM64::STPXpost: - case ARM64::LDPSWpost: - case ARM64::LDPXi: - case ARM64::STPXi: - case ARM64::LDPSWi: - case ARM64::LDPXpre: - case ARM64::STPXpre: - case ARM64::LDPSWpre: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDNPWi: - case ARM64::STNPWi: - case ARM64::LDPWpost: - case ARM64::STPWpost: - case ARM64::LDPWi: - case ARM64::STPWi: - case ARM64::LDPWpre: - case ARM64::STPWpre: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDNPQi: - case ARM64::STNPQi: - case ARM64::LDPQpost: - case ARM64::STPQpost: - case ARM64::LDPQi: - case ARM64::STPQi: - case ARM64::LDPQpre: - case ARM64::STPQpre: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDNPDi: - case ARM64::STNPDi: - case ARM64::LDPDpost: - case ARM64::STPDpost: - case ARM64::LDPDi: - case ARM64::STPDi: - case ARM64::LDPDpre: - case ARM64::STPDpre: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDNPSi: - case ARM64::STNPSi: - case ARM64::LDPSpost: - case ARM64::STPSpost: - case ARM64::LDPSi: - case ARM64::STPSi: - case ARM64::LDPSpre: - case ARM64::STPSpre: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - Inst.addOperand(MCOperand::CreateImm(offset)); - return Success; -} - -static DecodeStatus DecodeRegOffsetLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rm = fieldFromInstruction(insn, 16, 5); - unsigned extendHi = fieldFromInstruction(insn, 13, 3); - unsigned extendLo = fieldFromInstruction(insn, 12, 1); - unsigned extend = 0; - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::LDRSWro: - extend = (extendHi << 1) | extendLo; - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRXro: - case ARM64::STRXro: - extend = (extendHi << 1) | extendLo; - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRWro: - case ARM64::STRWro: - extend = (extendHi << 1) | extendLo; - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRQro: - case ARM64::STRQro: - extend = (extendHi << 1) | extendLo; - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRDro: - case ARM64::STRDro: - extend = (extendHi << 1) | extendLo; - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRSro: - case ARM64::STRSro: - extend = (extendHi << 1) | extendLo; - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRHro: - extend = (extendHi << 1) | extendLo; - DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRBro: - extend = (extendHi << 1) | extendLo; - DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRBBro: - case ARM64::STRBBro: - case ARM64::LDRSBWro: - extend = (extendHi << 1) | extendLo; - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case 
ARM64::LDRHHro: - case ARM64::STRHHro: - case ARM64::LDRSHWro: - extend = (extendHi << 1) | extendLo; - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRSHXro: - extend = (extendHi << 1) | extendLo; - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRSBXro: - extend = (extendHi << 1) | extendLo; - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::PRFMro: - extend = (extendHi << 1) | extendLo; - Inst.addOperand(MCOperand::CreateImm(Rt)); - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - - if (extendHi == 0x3) - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - else - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - - Inst.addOperand(MCOperand::CreateImm(extend)); - return Success; -} - -static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rm = fieldFromInstruction(insn, 16, 5); - unsigned extend = fieldFromInstruction(insn, 10, 6); - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::ADDWrx: - case ARM64::SUBWrx: - DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDSWrx: - case ARM64::SUBSWrx: - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDXrx: - case ARM64::SUBXrx: - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDSXrx: - case ARM64::SUBSXrx: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDXrx64: - case ARM64::ADDSXrx64: - case ARM64::SUBXrx64: - case ARM64::SUBSXrx64: - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - break; - } - - Inst.addOperand(MCOperand::CreateImm(extend)); - return Success; -} - -static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Datasize = fieldFromInstruction(insn, 31, 1); - unsigned imm; - - if (Datasize) { - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); - imm = fieldFromInstruction(insn, 10, 13); - if (!ARM64_AM::isValidDecodeLogicalImmediate(imm, 64)) - return Fail; - } else { - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); - imm = fieldFromInstruction(insn, 10, 12); - if (!ARM64_AM::isValidDecodeLogicalImmediate(imm, 32)) - return Fail; - } - Inst.addOperand(MCOperand::CreateImm(imm)); - return Success; -} - -static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned cmode = fieldFromInstruction(insn, 12, 4); - unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; - imm |= 
fieldFromInstruction(insn, 5, 5); - - if (Inst.getOpcode() == ARM64::MOVID) - DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); - - Inst.addOperand(MCOperand::CreateImm(imm)); - - switch (Inst.getOpcode()) { - default: - break; - case ARM64::MOVIv4i16: - case ARM64::MOVIv8i16: - case ARM64::MVNIv4i16: - case ARM64::MVNIv8i16: - case ARM64::MOVIv2i32: - case ARM64::MOVIv4i32: - case ARM64::MVNIv2i32: - case ARM64::MVNIv4i32: - Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); - break; - case ARM64::MOVIv2s_msl: - case ARM64::MOVIv4s_msl: - case ARM64::MVNIv2s_msl: - case ARM64::MVNIv4s_msl: - Inst.addOperand(MCOperand::CreateImm(cmode & 1 ? 0x110 : 0x108)); - break; - } - - return Success; -} - -static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned cmode = fieldFromInstruction(insn, 12, 4); - unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; - imm |= fieldFromInstruction(insn, 5, 5); - - // Tied operands added twice. - DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); - DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); - - Inst.addOperand(MCOperand::CreateImm(imm)); - Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); - - return Success; -} - -static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - int64_t imm = fieldFromInstruction(insn, 5, 19) << 2; - imm |= fieldFromInstruction(insn, 29, 2); - const ARM64Disassembler *Dis = - static_cast<const ARM64Disassembler *>(Decoder); - - // Sign-extend the 21-bit immediate. - if (imm & (1 << (21 - 1))) - imm |= ~((1LL << 21) - 1); - - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - if (!Dis->tryAddingSymbolicOperand(Addr, imm, Fail, 4, Inst, insn)) - Inst.addOperand(MCOperand::CreateImm(imm)); - - return Success; -} - -static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Imm = fieldFromInstruction(insn, 10, 14); - unsigned S = fieldFromInstruction(insn, 29, 1); - unsigned Datasize = fieldFromInstruction(insn, 31, 1); - - unsigned ShifterVal = (Imm >> 12) & 3; - unsigned ImmVal = Imm & 0xFFF; - const ARM64Disassembler *Dis = - static_cast<const ARM64Disassembler *>(Decoder); - - if (ShifterVal != 0 && ShifterVal != 1) - return Fail; - - if (Datasize) { - if (Rd == 31 && !S) - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - } else { - if (Rd == 31 && !S) - DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); - } - - if (!Dis->tryAddingSymbolicOperand(Addr, ImmVal, Fail, 4, Inst, insn)) - Inst.addOperand(MCOperand::CreateImm(ImmVal)); - Inst.addOperand(MCOperand::CreateImm(12 * ShifterVal)); - return Success; -} - -static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - int64_t imm = fieldFromInstruction(insn, 0, 26); - const ARM64Disassembler *Dis = - static_cast<const ARM64Disassembler *>(Decoder); - - // Sign-extend the 26-bit immediate.
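// The test below is the standard two's-complement sign extension of an
// N-bit field held in a wider integer: when the field's top bit (bit N-1)
// is set, every higher bit is filled with ones. The same idiom recurs for
// the 7-, 14- and 21-bit immediates in this file. A generic, self-contained
// form of the idiom (the helper name SignExtend is illustrative only, not
// part of this file):
//
//   static inline int64_t SignExtend(uint64_t Field, unsigned N) {
//     if (Field & (1ULL << (N - 1)))  // sign bit of the N-bit field set?
//       Field |= ~((1ULL << N) - 1);  // propagate it through the upper bits
//     return (int64_t)Field;
//   }
//
// With N == 26, SignExtend(0x3FFFFFF, 26) yields -1, matching the
// open-coded test that follows.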
- if (imm & (1 << (26 - 1))) - imm |= ~((1LL << 26) - 1); - - if (!Dis->tryAddingSymbolicOperand(Addr, imm << 2, true, 4, Inst)) - Inst.addOperand(MCOperand::CreateImm(imm)); - - return Success; -} - -static DecodeStatus DecodeSystemCPSRInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - uint64_t op1 = fieldFromInstruction(insn, 16, 3); - uint64_t op2 = fieldFromInstruction(insn, 5, 3); - uint64_t crm = fieldFromInstruction(insn, 8, 4); - - Inst.addOperand(MCOperand::CreateImm((op1 << 3) | op2)); - Inst.addOperand(MCOperand::CreateImm(crm)); - - return Success; -} - -static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { - uint64_t Rt = fieldFromInstruction(insn, 0, 5); - uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5; - bit |= fieldFromInstruction(insn, 19, 5); - int64_t dst = fieldFromInstruction(insn, 5, 14); - const ARM64Disassembler *Dis = - static_cast<const ARM64Disassembler *>(Decoder); - - // Sign-extend 14-bit immediate. - if (dst & (1 << (14 - 1))) - dst |= ~((1LL << 14) - 1); - - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - Inst.addOperand(MCOperand::CreateImm(bit)); - if (!Dis->tryAddingSymbolicOperand(Addr, dst << 2, true, 4, Inst)) - Inst.addOperand(MCOperand::CreateImm(dst)); - - return Success; -} - -static DecodeStatus DecodeSIMDLdStPost(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { - uint64_t Rd = fieldFromInstruction(insn, 0, 5); - uint64_t Rn = fieldFromInstruction(insn, 5, 5); - uint64_t Rm = fieldFromInstruction(insn, 16, 5); - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::ST1Onev8b_POST: - case ARM64::ST1Onev4h_POST: - case ARM64::ST1Onev2s_POST: - case ARM64::ST1Onev1d_POST: - case ARM64::LD1Onev8b_POST: - case ARM64::LD1Onev4h_POST: - case ARM64::LD1Onev2s_POST: - case ARM64::LD1Onev1d_POST: - DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder); - break; - case ARM64::ST1Onev16b_POST: - case ARM64::ST1Onev8h_POST: - case ARM64::ST1Onev4s_POST: - case ARM64::ST1Onev2d_POST: - case ARM64::LD1Onev16b_POST: - case ARM64::LD1Onev8h_POST: - case ARM64::LD1Onev4s_POST: - case ARM64::LD1Onev2d_POST: - DecodeFPR128RegisterClass(Inst, Rd, Addr, Decoder); - break; - case ARM64::ST1Twov8b_POST: - case ARM64::ST1Twov4h_POST: - case ARM64::ST1Twov2s_POST: - case ARM64::ST1Twov1d_POST: - case ARM64::ST2Twov8b_POST: - case ARM64::ST2Twov4h_POST: - case ARM64::ST2Twov2s_POST: - case ARM64::LD1Twov8b_POST: - case ARM64::LD1Twov4h_POST: - case ARM64::LD1Twov2s_POST: - case ARM64::LD1Twov1d_POST: - case ARM64::LD2Twov8b_POST: - case ARM64::LD2Twov4h_POST: - case ARM64::LD2Twov2s_POST: - DecodeDDRegisterClass(Inst, Rd, Addr, Decoder); - break; - case ARM64::ST1Threev8b_POST: - case ARM64::ST1Threev4h_POST: - case ARM64::ST1Threev2s_POST: - case ARM64::ST1Threev1d_POST: - case ARM64::ST3Threev8b_POST: - case ARM64::ST3Threev4h_POST: - case ARM64::ST3Threev2s_POST: - case ARM64::LD1Threev8b_POST: - case ARM64::LD1Threev4h_POST: - case ARM64::LD1Threev2s_POST: - case ARM64::LD1Threev1d_POST: - case ARM64::LD3Threev8b_POST: - case ARM64::LD3Threev4h_POST: - case ARM64::LD3Threev2s_POST: - DecodeDDDRegisterClass(Inst, Rd, Addr, Decoder); - break; - case ARM64::ST1Fourv8b_POST: - case ARM64::ST1Fourv4h_POST: - case ARM64::ST1Fourv2s_POST: - case ARM64::ST1Fourv1d_POST: - case ARM64::ST4Fourv8b_POST: - case ARM64::ST4Fourv4h_POST: - case ARM64::ST4Fourv2s_POST: - case ARM64::LD1Fourv8b_POST: - case ARM64::LD1Fourv4h_POST: - case ARM64::LD1Fourv2s_POST: - 
case ARM64::LD1Fourv1d_POST: - case ARM64::LD4Fourv8b_POST: - case ARM64::LD4Fourv4h_POST: - case ARM64::LD4Fourv2s_POST: - DecodeDDDDRegisterClass(Inst, Rd, Addr, Decoder); - break; - case ARM64::ST1Twov16b_POST: - case ARM64::ST1Twov8h_POST: - case ARM64::ST1Twov4s_POST: - case ARM64::ST1Twov2d_POST: - case ARM64::ST2Twov16b_POST: - case ARM64::ST2Twov8h_POST: - case ARM64::ST2Twov4s_POST: - case ARM64::ST2Twov2d_POST: - case ARM64::LD1Twov16b_POST: - case ARM64::LD1Twov8h_POST: - case ARM64::LD1Twov4s_POST: - case ARM64::LD1Twov2d_POST: - case ARM64::LD2Twov16b_POST: - case ARM64::LD2Twov8h_POST: - case ARM64::LD2Twov4s_POST: - case ARM64::LD2Twov2d_POST: - DecodeQQRegisterClass(Inst, Rd, Addr, Decoder); - break; - case ARM64::ST1Threev16b_POST: - case ARM64::ST1Threev8h_POST: - case ARM64::ST1Threev4s_POST: - case ARM64::ST1Threev2d_POST: - case ARM64::ST3Threev16b_POST: - case ARM64::ST3Threev8h_POST: - case ARM64::ST3Threev4s_POST: - case ARM64::ST3Threev2d_POST: - case ARM64::LD1Threev16b_POST: - case ARM64::LD1Threev8h_POST: - case ARM64::LD1Threev4s_POST: - case ARM64::LD1Threev2d_POST: - case ARM64::LD3Threev16b_POST: - case ARM64::LD3Threev8h_POST: - case ARM64::LD3Threev4s_POST: - case ARM64::LD3Threev2d_POST: - DecodeQQQRegisterClass(Inst, Rd, Addr, Decoder); - break; - case ARM64::ST1Fourv16b_POST: - case ARM64::ST1Fourv8h_POST: - case ARM64::ST1Fourv4s_POST: - case ARM64::ST1Fourv2d_POST: - case ARM64::ST4Fourv16b_POST: - case ARM64::ST4Fourv8h_POST: - case ARM64::ST4Fourv4s_POST: - case ARM64::ST4Fourv2d_POST: - case ARM64::LD1Fourv16b_POST: - case ARM64::LD1Fourv8h_POST: - case ARM64::LD1Fourv4s_POST: - case ARM64::LD1Fourv2d_POST: - case ARM64::LD4Fourv16b_POST: - case ARM64::LD4Fourv8h_POST: - case ARM64::LD4Fourv4s_POST: - case ARM64::LD4Fourv2d_POST: - DecodeQQQQRegisterClass(Inst, Rd, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - return Success; -} - -static DecodeStatus DecodeSIMDLdStSingle(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { - uint64_t Rt = fieldFromInstruction(insn, 0, 5); - uint64_t Rn = fieldFromInstruction(insn, 5, 5); - uint64_t Rm = fieldFromInstruction(insn, 16, 5); - uint64_t size = fieldFromInstruction(insn, 10, 2); - uint64_t S = fieldFromInstruction(insn, 12, 1); - uint64_t Q = fieldFromInstruction(insn, 30, 1); - uint64_t index = 0; - - switch (Inst.getOpcode()) { - case ARM64::ST1i8: - case ARM64::ST1i8_POST: - case ARM64::ST2i8: - case ARM64::ST2i8_POST: - case ARM64::ST3i8_POST: - case ARM64::ST3i8: - case ARM64::ST4i8_POST: - case ARM64::ST4i8: - index = (Q << 3) | (S << 2) | size; - break; - case ARM64::ST1i16: - case ARM64::ST1i16_POST: - case ARM64::ST2i16: - case ARM64::ST2i16_POST: - case ARM64::ST3i16_POST: - case ARM64::ST3i16: - case ARM64::ST4i16_POST: - case ARM64::ST4i16: - index = (Q << 2) | (S << 1) | (size >> 1); - break; - case ARM64::ST1i32: - case ARM64::ST1i32_POST: - case ARM64::ST2i32: - case ARM64::ST2i32_POST: - case ARM64::ST3i32_POST: - case ARM64::ST3i32: - case ARM64::ST4i32_POST: - case ARM64::ST4i32: - index = (Q << 1) | S; - break; - case ARM64::ST1i64: - case ARM64::ST1i64_POST: - case ARM64::ST2i64: - case ARM64::ST2i64_POST: - case ARM64::ST3i64_POST: - case ARM64::ST3i64: - case ARM64::ST4i64_POST: - case ARM64::ST4i64: - index = Q; - break; - } - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::LD1Rv8b: - case ARM64::LD1Rv8b_POST: - case ARM64::LD1Rv4h: - case 
ARM64::LD1Rv4h_POST: - case ARM64::LD1Rv2s: - case ARM64::LD1Rv2s_POST: - case ARM64::LD1Rv1d: - case ARM64::LD1Rv1d_POST: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LD1Rv16b: - case ARM64::LD1Rv16b_POST: - case ARM64::LD1Rv8h: - case ARM64::LD1Rv8h_POST: - case ARM64::LD1Rv4s: - case ARM64::LD1Rv4s_POST: - case ARM64::LD1Rv2d: - case ARM64::LD1Rv2d_POST: - case ARM64::ST1i8: - case ARM64::ST1i8_POST: - case ARM64::ST1i16: - case ARM64::ST1i16_POST: - case ARM64::ST1i32: - case ARM64::ST1i32_POST: - case ARM64::ST1i64: - case ARM64::ST1i64_POST: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LD2Rv16b: - case ARM64::LD2Rv16b_POST: - case ARM64::LD2Rv8h: - case ARM64::LD2Rv8h_POST: - case ARM64::LD2Rv4s: - case ARM64::LD2Rv4s_POST: - case ARM64::LD2Rv2d: - case ARM64::LD2Rv2d_POST: - case ARM64::ST2i8: - case ARM64::ST2i8_POST: - case ARM64::ST2i16: - case ARM64::ST2i16_POST: - case ARM64::ST2i32: - case ARM64::ST2i32_POST: - case ARM64::ST2i64: - case ARM64::ST2i64_POST: - DecodeQQRegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LD2Rv8b: - case ARM64::LD2Rv8b_POST: - case ARM64::LD2Rv4h: - case ARM64::LD2Rv4h_POST: - case ARM64::LD2Rv2s: - case ARM64::LD2Rv2s_POST: - case ARM64::LD2Rv1d: - case ARM64::LD2Rv1d_POST: - DecodeDDRegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LD3Rv8b: - case ARM64::LD3Rv8b_POST: - case ARM64::LD3Rv4h: - case ARM64::LD3Rv4h_POST: - case ARM64::LD3Rv2s: - case ARM64::LD3Rv2s_POST: - case ARM64::LD3Rv1d: - case ARM64::LD3Rv1d_POST: - DecodeDDDRegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LD3Rv16b: - case ARM64::LD3Rv16b_POST: - case ARM64::LD3Rv8h: - case ARM64::LD3Rv8h_POST: - case ARM64::LD3Rv4s: - case ARM64::LD3Rv4s_POST: - case ARM64::LD3Rv2d: - case ARM64::LD3Rv2d_POST: - case ARM64::ST3i8: - case ARM64::ST3i8_POST: - case ARM64::ST3i16: - case ARM64::ST3i16_POST: - case ARM64::ST3i32: - case ARM64::ST3i32_POST: - case ARM64::ST3i64: - case ARM64::ST3i64_POST: - DecodeQQQRegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LD4Rv8b: - case ARM64::LD4Rv8b_POST: - case ARM64::LD4Rv4h: - case ARM64::LD4Rv4h_POST: - case ARM64::LD4Rv2s: - case ARM64::LD4Rv2s_POST: - case ARM64::LD4Rv1d: - case ARM64::LD4Rv1d_POST: - DecodeDDDDRegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LD4Rv16b: - case ARM64::LD4Rv16b_POST: - case ARM64::LD4Rv8h: - case ARM64::LD4Rv8h_POST: - case ARM64::LD4Rv4s: - case ARM64::LD4Rv4s_POST: - case ARM64::LD4Rv2d: - case ARM64::LD4Rv2d_POST: - case ARM64::ST4i8: - case ARM64::ST4i8_POST: - case ARM64::ST4i16: - case ARM64::ST4i16_POST: - case ARM64::ST4i32: - case ARM64::ST4i32_POST: - case ARM64::ST4i64: - case ARM64::ST4i64_POST: - DecodeQQQQRegisterClass(Inst, Rt, Addr, Decoder); - break; - } - - switch (Inst.getOpcode()) { - case ARM64::LD1Rv8b: - case ARM64::LD1Rv8b_POST: - case ARM64::LD1Rv16b: - case ARM64::LD1Rv16b_POST: - case ARM64::LD1Rv4h: - case ARM64::LD1Rv4h_POST: - case ARM64::LD1Rv8h: - case ARM64::LD1Rv8h_POST: - case ARM64::LD1Rv4s: - case ARM64::LD1Rv4s_POST: - case ARM64::LD1Rv2s: - case ARM64::LD1Rv2s_POST: - case ARM64::LD1Rv1d: - case ARM64::LD1Rv1d_POST: - case ARM64::LD1Rv2d: - case ARM64::LD1Rv2d_POST: - case ARM64::LD2Rv8b: - case ARM64::LD2Rv8b_POST: - case ARM64::LD2Rv16b: - case ARM64::LD2Rv16b_POST: - case ARM64::LD2Rv4h: - case ARM64::LD2Rv4h_POST: - case ARM64::LD2Rv8h: - case ARM64::LD2Rv8h_POST: - case ARM64::LD2Rv2s: - case ARM64::LD2Rv2s_POST: - case ARM64::LD2Rv4s: - case 
ARM64::LD2Rv4s_POST: - case ARM64::LD2Rv2d: - case ARM64::LD2Rv2d_POST: - case ARM64::LD2Rv1d: - case ARM64::LD2Rv1d_POST: - case ARM64::LD3Rv8b: - case ARM64::LD3Rv8b_POST: - case ARM64::LD3Rv16b: - case ARM64::LD3Rv16b_POST: - case ARM64::LD3Rv4h: - case ARM64::LD3Rv4h_POST: - case ARM64::LD3Rv8h: - case ARM64::LD3Rv8h_POST: - case ARM64::LD3Rv2s: - case ARM64::LD3Rv2s_POST: - case ARM64::LD3Rv4s: - case ARM64::LD3Rv4s_POST: - case ARM64::LD3Rv2d: - case ARM64::LD3Rv2d_POST: - case ARM64::LD3Rv1d: - case ARM64::LD3Rv1d_POST: - case ARM64::LD4Rv8b: - case ARM64::LD4Rv8b_POST: - case ARM64::LD4Rv16b: - case ARM64::LD4Rv16b_POST: - case ARM64::LD4Rv4h: - case ARM64::LD4Rv4h_POST: - case ARM64::LD4Rv8h: - case ARM64::LD4Rv8h_POST: - case ARM64::LD4Rv2s: - case ARM64::LD4Rv2s_POST: - case ARM64::LD4Rv4s: - case ARM64::LD4Rv4s_POST: - case ARM64::LD4Rv2d: - case ARM64::LD4Rv2d_POST: - case ARM64::LD4Rv1d: - case ARM64::LD4Rv1d_POST: - break; - default: - Inst.addOperand(MCOperand::CreateImm(index)); - } - - DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); - - switch (Inst.getOpcode()) { - case ARM64::ST1i8_POST: - case ARM64::ST1i16_POST: - case ARM64::ST1i32_POST: - case ARM64::ST1i64_POST: - case ARM64::LD1Rv8b_POST: - case ARM64::LD1Rv16b_POST: - case ARM64::LD1Rv4h_POST: - case ARM64::LD1Rv8h_POST: - case ARM64::LD1Rv2s_POST: - case ARM64::LD1Rv4s_POST: - case ARM64::LD1Rv1d_POST: - case ARM64::LD1Rv2d_POST: - case ARM64::ST2i8_POST: - case ARM64::ST2i16_POST: - case ARM64::ST2i32_POST: - case ARM64::ST2i64_POST: - case ARM64::LD2Rv8b_POST: - case ARM64::LD2Rv16b_POST: - case ARM64::LD2Rv4h_POST: - case ARM64::LD2Rv8h_POST: - case ARM64::LD2Rv2s_POST: - case ARM64::LD2Rv4s_POST: - case ARM64::LD2Rv2d_POST: - case ARM64::LD2Rv1d_POST: - case ARM64::ST3i8_POST: - case ARM64::ST3i16_POST: - case ARM64::ST3i32_POST: - case ARM64::ST3i64_POST: - case ARM64::LD3Rv8b_POST: - case ARM64::LD3Rv16b_POST: - case ARM64::LD3Rv4h_POST: - case ARM64::LD3Rv8h_POST: - case ARM64::LD3Rv2s_POST: - case ARM64::LD3Rv4s_POST: - case ARM64::LD3Rv2d_POST: - case ARM64::LD3Rv1d_POST: - case ARM64::ST4i8_POST: - case ARM64::ST4i16_POST: - case ARM64::ST4i32_POST: - case ARM64::ST4i64_POST: - case ARM64::LD4Rv8b_POST: - case ARM64::LD4Rv16b_POST: - case ARM64::LD4Rv4h_POST: - case ARM64::LD4Rv8h_POST: - case ARM64::LD4Rv2s_POST: - case ARM64::LD4Rv4s_POST: - case ARM64::LD4Rv2d_POST: - case ARM64::LD4Rv1d_POST: - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - break; - } - return Success; -} - -static DecodeStatus DecodeSIMDLdStSingleTied(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - uint64_t Rt = fieldFromInstruction(insn, 0, 5); - uint64_t Rn = fieldFromInstruction(insn, 5, 5); - uint64_t Rm = fieldFromInstruction(insn, 16, 5); - uint64_t size = fieldFromInstruction(insn, 10, 2); - uint64_t S = fieldFromInstruction(insn, 12, 1); - uint64_t Q = fieldFromInstruction(insn, 30, 1); - uint64_t index = 0; - - switch (Inst.getOpcode()) { - case ARM64::LD1i8: - case ARM64::LD1i8_POST: - case ARM64::LD2i8: - case ARM64::LD2i8_POST: - case ARM64::LD3i8_POST: - case ARM64::LD3i8: - case ARM64::LD4i8_POST: - case ARM64::LD4i8: - index = (Q << 3) | (S << 2) | size; - break; - case ARM64::LD1i16: - case ARM64::LD1i16_POST: - case ARM64::LD2i16: - case ARM64::LD2i16_POST: - case ARM64::LD3i16_POST: - case ARM64::LD3i16: - case ARM64::LD4i16_POST: - case ARM64::LD4i16: - index = (Q << 2) | (S << 1) | (size >> 1); - break; - case ARM64::LD1i32: - case ARM64::LD1i32_POST: - case 
ARM64::LD2i32: - case ARM64::LD2i32_POST: - case ARM64::LD3i32_POST: - case ARM64::LD3i32: - case ARM64::LD4i32_POST: - case ARM64::LD4i32: - index = (Q << 1) | S; - break; - case ARM64::LD1i64: - case ARM64::LD1i64_POST: - case ARM64::LD2i64: - case ARM64::LD2i64_POST: - case ARM64::LD3i64_POST: - case ARM64::LD3i64: - case ARM64::LD4i64_POST: - case ARM64::LD4i64: - index = Q; - break; - } - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::LD1i8: - case ARM64::LD1i8_POST: - case ARM64::LD1i16: - case ARM64::LD1i16_POST: - case ARM64::LD1i32: - case ARM64::LD1i32_POST: - case ARM64::LD1i64: - case ARM64::LD1i64_POST: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LD2i8: - case ARM64::LD2i8_POST: - case ARM64::LD2i16: - case ARM64::LD2i16_POST: - case ARM64::LD2i32: - case ARM64::LD2i32_POST: - case ARM64::LD2i64: - case ARM64::LD2i64_POST: - DecodeQQRegisterClass(Inst, Rt, Addr, Decoder); - DecodeQQRegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LD3i8: - case ARM64::LD3i8_POST: - case ARM64::LD3i16: - case ARM64::LD3i16_POST: - case ARM64::LD3i32: - case ARM64::LD3i32_POST: - case ARM64::LD3i64: - case ARM64::LD3i64_POST: - DecodeQQQRegisterClass(Inst, Rt, Addr, Decoder); - DecodeQQQRegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LD4i8: - case ARM64::LD4i8_POST: - case ARM64::LD4i16: - case ARM64::LD4i16_POST: - case ARM64::LD4i32: - case ARM64::LD4i32_POST: - case ARM64::LD4i64: - case ARM64::LD4i64_POST: - DecodeQQQQRegisterClass(Inst, Rt, Addr, Decoder); - DecodeQQQQRegisterClass(Inst, Rt, Addr, Decoder); - break; - } - - Inst.addOperand(MCOperand::CreateImm(index)); - DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); - - switch (Inst.getOpcode()) { - case ARM64::LD1i8_POST: - case ARM64::LD1i16_POST: - case ARM64::LD1i32_POST: - case ARM64::LD1i64_POST: - case ARM64::LD2i8_POST: - case ARM64::LD2i16_POST: - case ARM64::LD2i32_POST: - case ARM64::LD2i64_POST: - case ARM64::LD3i8_POST: - case ARM64::LD3i16_POST: - case ARM64::LD3i32_POST: - case ARM64::LD3i64_POST: - case ARM64::LD4i8_POST: - case ARM64::LD4i16_POST: - case ARM64::LD4i32_POST: - case ARM64::LD4i64_POST: - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - break; - } - return Success; -} diff --git a/lib/Target/ARM64/Disassembler/ARM64Disassembler.h b/lib/Target/ARM64/Disassembler/ARM64Disassembler.h deleted file mode 100644 index 35efc8d..0000000 --- a/lib/Target/ARM64/Disassembler/ARM64Disassembler.h +++ /dev/null @@ -1,54 +0,0 @@ -//===- ARM64Disassembler.h - Disassembler for ARM64 -------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64DISASSEMBLER_H -#define ARM64DISASSEMBLER_H - -#include "llvm/MC/MCDisassembler.h" - -namespace llvm { - -class MCInst; -class MemoryObject; -class raw_ostream; - -class ARM64Disassembler : public MCDisassembler { -public: - ARM64Disassembler(const MCSubtargetInfo &STI) : MCDisassembler(STI) {} - - ~ARM64Disassembler() {} - - /// getInstruction - See MCDisassembler. 
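/// (ARM64 instructions are a fixed four bytes wide, so an implementation of
/// this hook is expected to read a single little-endian 32-bit word from
/// region at address, report the number of bytes consumed through size, and
/// dispatch the word to the TableGen-erated decoder tables; this is the
/// general MCDisassembler contract rather than a quote of the deleted body.)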
- MCDisassembler::DecodeStatus getInstruction(MCInst &instr, uint64_t &size, - const MemoryObject &region, - uint64_t address, - raw_ostream &vStream, - raw_ostream &cStream) const; - - /// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic - /// operand in place of the immediate Value in the MCInst. The immediate - /// Value has not had any PC adjustment made by the caller. If the instruction - /// adds the PC to the immediate Value then InstsAddsAddressToValue is true, - /// else false. If the getOpInfo() function was set as part of the - /// setupForSymbolicDisassembly() call then that function is called to get any - /// symbolic information at the Address for this instruction. If that returns - /// non-zero then the symbolic information it returns is used to create an - /// MCExpr and that is added as an operand to the MCInst. This function - /// returns true if it adds an operand to the MCInst and false otherwise. - bool tryAddingSymbolicOperand(uint64_t Address, int Value, - bool InstsAddsAddressToValue, uint64_t InstSize, - MCInst &MI, uint32_t insn = 0) const; -}; - -} // namespace llvm - -#endif diff --git a/lib/Target/ARM64/Disassembler/CMakeLists.txt b/lib/Target/ARM64/Disassembler/CMakeLists.txt deleted file mode 100644 index ad998c2..0000000 --- a/lib/Target/ARM64/Disassembler/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64Disassembler - ARM64Disassembler.cpp - ) -# workaround for hanging compilation on MSVC8, 9 and 10 -#if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) -#set_property( -# SOURCE ARMDisassembler.cpp -# PROPERTY COMPILE_FLAGS "/Od" -# ) -#endif() -add_dependencies(LLVMARM64Disassembler ARM64CommonTableGen) diff --git a/lib/Target/ARM64/Disassembler/LLVMBuild.txt b/lib/Target/ARM64/Disassembler/LLVMBuild.txt deleted file mode 100644 index 5935ee6..0000000 --- a/lib/Target/ARM64/Disassembler/LLVMBuild.txt +++ /dev/null @@ -1,24 +0,0 @@ -;===- ./lib/Target/ARM64/Disassembler/LLVMBuild.txt ------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Disassembler -parent = ARM64 -required_libraries = ARM64Desc ARM64Info MC Support -add_to_library_groups = ARM64 - diff --git a/lib/Target/ARM64/Disassembler/Makefile b/lib/Target/ARM64/Disassembler/Makefile deleted file mode 100644 index 479d00c..0000000 --- a/lib/Target/ARM64/Disassembler/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/ARM64/Disassembler/Makefile --------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Disassembler - -# Hack: we need to include 'main' arm target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/..
-I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp b/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp deleted file mode 100644 index bb90707..0000000 --- a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp +++ /dev/null @@ -1,1428 +0,0 @@ -//===-- ARM64InstPrinter.cpp - Convert ARM64 MCInst to assembly syntax ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an ARM64 MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "asm-printer" -#include "ARM64InstPrinter.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "MCTargetDesc/ARM64BaseInfo.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define GET_INSTRUCTION_NAME -#define PRINT_ALIAS_INSTR -#include "ARM64GenAsmWriter.inc" -#define GET_INSTRUCTION_NAME -#define PRINT_ALIAS_INSTR -#include "ARM64GenAsmWriter1.inc" - -ARM64InstPrinter::ARM64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) - : MCInstPrinter(MAI, MII, MRI) { - // Initialize the set of available features. - setAvailableFeatures(STI.getFeatureBits()); -} - -ARM64AppleInstPrinter::ARM64AppleInstPrinter(const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) - : ARM64InstPrinter(MAI, MII, MRI, STI) {} - -void ARM64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - // This is for .cfi directives. - OS << getRegisterName(RegNo); -} - -void ARM64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot) { - // Check for special encodings and print the canonical alias instead. - - unsigned Opcode = MI->getOpcode(); - - if (Opcode == ARM64::SYS || Opcode == ARM64::SYSxt) - if (printSysAlias(MI, O)) { - printAnnotation(O, Annot); - return; - } - - // TBZ/TBNZ should print the register operand as a Wreg if the bit - // number is < 32. - if ((Opcode == ARM64::TBNZ || Opcode == ARM64::TBZ) && - MI->getOperand(1).getImm() < 32) { - MCInst newMI = *MI; - unsigned Reg = MI->getOperand(0).getReg(); - newMI.getOperand(0).setReg(getWRegFromXReg(Reg)); - printInstruction(&newMI, O); - printAnnotation(O, Annot); - return; - } - - // SBFM/UBFM should print to a nicer aliased form if possible. - if (Opcode == ARM64::SBFMXri || Opcode == ARM64::SBFMWri || - Opcode == ARM64::UBFMXri || Opcode == ARM64::UBFMWri) { - const MCOperand &Op0 = MI->getOperand(0); - const MCOperand &Op1 = MI->getOperand(1); - const MCOperand &Op2 = MI->getOperand(2); - const MCOperand &Op3 = MI->getOperand(3); - - if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) { - bool IsSigned = (Opcode == ARM64::SBFMXri || Opcode == ARM64::SBFMWri); - const char *AsmMnemonic = 0; - - switch (Op3.getImm()) { - default: - break; - case 7: - AsmMnemonic = IsSigned ? "sxtb" : "uxtb"; - break; - case 15: - AsmMnemonic = IsSigned ? "sxth" : "uxth"; - break; - case 31: - AsmMnemonic = IsSigned ? 
"sxtw" : "uxtw"; - break; - } - - if (AsmMnemonic) { - O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) - << ", " << getRegisterName(Op1.getReg()); - printAnnotation(O, Annot); - return; - } - } - - // All immediate shifts are aliases, implemented using the Bitfield - // instruction. In all cases the immediate shift amount shift must be in - // the range 0 to (reg.size -1). - if (Op2.isImm() && Op3.isImm()) { - const char *AsmMnemonic = 0; - int shift = 0; - int64_t immr = Op2.getImm(); - int64_t imms = Op3.getImm(); - if (Opcode == ARM64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) { - AsmMnemonic = "lsl"; - shift = 31 - imms; - } else if (Opcode == ARM64::UBFMXri && imms != 0x3f && - ((imms + 1 == immr))) { - AsmMnemonic = "lsl"; - shift = 63 - imms; - } else if (Opcode == ARM64::UBFMWri && imms == 0x1f) { - AsmMnemonic = "lsr"; - shift = immr; - } else if (Opcode == ARM64::UBFMXri && imms == 0x3f) { - AsmMnemonic = "lsr"; - shift = immr; - } else if (Opcode == ARM64::SBFMWri && imms == 0x1f) { - AsmMnemonic = "asr"; - shift = immr; - } else if (Opcode == ARM64::SBFMXri && imms == 0x3f) { - AsmMnemonic = "asr"; - shift = immr; - } - if (AsmMnemonic) { - O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) - << ", " << getRegisterName(Op1.getReg()) << ", #" << shift; - printAnnotation(O, Annot); - return; - } - } - } - - // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift - // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be - // printed. - if ((Opcode == ARM64::MOVZXi || Opcode == ARM64::MOVZWi || - Opcode == ARM64::MOVNXi || Opcode == ARM64::MOVNWi) && - MI->getOperand(1).isExpr()) { - if (Opcode == ARM64::MOVZXi || Opcode == ARM64::MOVZWi) - O << "\tmovz\t"; - else - O << "\tmovn\t"; - - O << getRegisterName(MI->getOperand(0).getReg()) << ", #" - << *MI->getOperand(1).getExpr(); - return; - } - - if ((Opcode == ARM64::MOVKXi || Opcode == ARM64::MOVKWi) && - MI->getOperand(2).isExpr()) { - O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" - << *MI->getOperand(2).getExpr(); - return; - } - - // ANDS WZR, Wn, #imm ==> TST Wn, #imm - // ANDS XZR, Xn, #imm ==> TST Xn, #imm - if (Opcode == ARM64::ANDSWri && MI->getOperand(0).getReg() == ARM64::WZR) { - O << "\ttst\t" << getRegisterName(MI->getOperand(1).getReg()) << ", "; - printLogicalImm32(MI, 2, O); - return; - } - if (Opcode == ARM64::ANDSXri && MI->getOperand(0).getReg() == ARM64::XZR) { - O << "\ttst\t" << getRegisterName(MI->getOperand(1).getReg()) << ", "; - printLogicalImm64(MI, 2, O); - return; - } - // ANDS WZR, Wn, Wm{, lshift #imm} ==> TST Wn{, lshift #imm} - // ANDS XZR, Xn, Xm{, lshift #imm} ==> TST Xn{, lshift #imm} - if ((Opcode == ARM64::ANDSWrs && MI->getOperand(0).getReg() == ARM64::WZR) || - (Opcode == ARM64::ANDSXrs && MI->getOperand(0).getReg() == ARM64::XZR)) { - O << "\ttst\t" << getRegisterName(MI->getOperand(1).getReg()) << ", "; - printShiftedRegister(MI, 2, O); - return; - } - - // SUBS WZR, Wn, #imm ==> CMP Wn, #imm - // SUBS XZR, Xn, #imm ==> CMP Xn, #imm - if ((Opcode == ARM64::SUBSWri && MI->getOperand(0).getReg() == ARM64::WZR) || - (Opcode == ARM64::SUBSXri && MI->getOperand(0).getReg() == ARM64::XZR)) { - O << "\tcmp\t" << getRegisterName(MI->getOperand(1).getReg()) << ", "; - printAddSubImm(MI, 2, O); - return; - } - // SUBS WZR, Wn, Wm{, lshift #imm} ==> CMP Wn, Wm{, lshift #imm} - // SUBS XZR, Xn, Xm{, lshift #imm} ==> CMP Xn, Xm{, lshift #imm} - if ((Opcode == ARM64::SUBSWrs && MI->getOperand(0).getReg() == 
ARM64::WZR) || - (Opcode == ARM64::SUBSXrs && MI->getOperand(0).getReg() == ARM64::XZR)) { - O << "\tcmp\t" << getRegisterName(MI->getOperand(1).getReg()) << ", "; - printShiftedRegister(MI, 2, O); - return; - } - // SUBS XZR, Xn, Wm, uxtb #imm ==> CMP Xn, uxtb #imm - // SUBS WZR, Wn, Xm, uxtb #imm ==> CMP Wn, uxtb #imm - if ((Opcode == ARM64::SUBSXrx && MI->getOperand(0).getReg() == ARM64::XZR) || - (Opcode == ARM64::SUBSWrx && MI->getOperand(0).getReg() == ARM64::WZR)) { - O << "\tcmp\t" << getRegisterName(MI->getOperand(1).getReg()) << ", "; - printExtendedRegister(MI, 2, O); - return; - } - // SUBS XZR, Xn, Xm, uxtx #imm ==> CMP Xn, uxtb #imm - if (Opcode == ARM64::SUBSXrx64 && MI->getOperand(0).getReg() == ARM64::XZR) { - O << "\tcmp\t" << getRegisterName(MI->getOperand(1).getReg()) << ", " - << getRegisterName(MI->getOperand(2).getReg()); - printExtend(MI, 3, O); - return; - } - - // ADDS WZR, Wn, #imm ==> CMN Wn, #imm - // ADDS XZR, Xn, #imm ==> CMN Xn, #imm - if ((Opcode == ARM64::ADDSWri && MI->getOperand(0).getReg() == ARM64::WZR) || - (Opcode == ARM64::ADDSXri && MI->getOperand(0).getReg() == ARM64::XZR)) { - O << "\tcmn\t" << getRegisterName(MI->getOperand(1).getReg()) << ", "; - printAddSubImm(MI, 2, O); - return; - } - // ADDS WZR, Wn, Wm{, lshift #imm} ==> CMN Wn, Wm{, lshift #imm} - // ADDS XZR, Xn, Xm{, lshift #imm} ==> CMN Xn, Xm{, lshift #imm} - if ((Opcode == ARM64::ADDSWrs && MI->getOperand(0).getReg() == ARM64::WZR) || - (Opcode == ARM64::ADDSXrs && MI->getOperand(0).getReg() == ARM64::XZR)) { - O << "\tcmn\t" << getRegisterName(MI->getOperand(1).getReg()) << ", "; - printShiftedRegister(MI, 2, O); - return; - } - // ADDS XZR, Xn, Wm, uxtb #imm ==> CMN Xn, uxtb #imm - if (Opcode == ARM64::ADDSXrx && MI->getOperand(0).getReg() == ARM64::XZR) { - O << "\tcmn\t" << getRegisterName(MI->getOperand(1).getReg()) << ", "; - printExtendedRegister(MI, 2, O); - return; - } - // ADDS XZR, Xn, Xm, uxtx #imm ==> CMN Xn, uxtb #imm - if (Opcode == ARM64::ADDSXrx64 && MI->getOperand(0).getReg() == ARM64::XZR) { - O << "\tcmn\t" << getRegisterName(MI->getOperand(1).getReg()) << ", " - << getRegisterName(MI->getOperand(2).getReg()); - printExtend(MI, 3, O); - return; - } - - if (!printAliasInstr(MI, O)) - printInstruction(MI, O); - - printAnnotation(O, Annot); -} - -static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout, - bool &IsTbx) { - switch (Opcode) { - case ARM64::TBXv8i8One: - case ARM64::TBXv8i8Two: - case ARM64::TBXv8i8Three: - case ARM64::TBXv8i8Four: - IsTbx = true; - Layout = ".8b"; - return true; - case ARM64::TBLv8i8One: - case ARM64::TBLv8i8Two: - case ARM64::TBLv8i8Three: - case ARM64::TBLv8i8Four: - IsTbx = false; - Layout = ".8b"; - return true; - case ARM64::TBXv16i8One: - case ARM64::TBXv16i8Two: - case ARM64::TBXv16i8Three: - case ARM64::TBXv16i8Four: - IsTbx = true; - Layout = ".16b"; - return true; - case ARM64::TBLv16i8One: - case ARM64::TBLv16i8Two: - case ARM64::TBLv16i8Three: - case ARM64::TBLv16i8Four: - IsTbx = false; - Layout = ".16b"; - return true; - default: - return false; - } -} - -struct LdStNInstrDesc { - unsigned Opcode; - const char *Mnemonic; - const char *Layout; - int LaneOperand; - int NaturalOffset; -}; - -static LdStNInstrDesc LdStNInstInfo[] = { - { ARM64::LD1i8, "ld1", ".b", 2, 0 }, - { ARM64::LD1i16, "ld1", ".h", 2, 0 }, - { ARM64::LD1i32, "ld1", ".s", 2, 0 }, - { ARM64::LD1i64, "ld1", ".d", 2, 0 }, - { ARM64::LD1i8_POST, "ld1", ".b", 2, 1 }, - { ARM64::LD1i16_POST, "ld1", ".h", 2, 2 }, - { ARM64::LD1i32_POST, "ld1", ".s", 2, 
4 }, - { ARM64::LD1i64_POST, "ld1", ".d", 2, 8 }, - { ARM64::LD1Rv16b, "ld1r", ".16b", 0, 0 }, - { ARM64::LD1Rv8h, "ld1r", ".8h", 0, 0 }, - { ARM64::LD1Rv4s, "ld1r", ".4s", 0, 0 }, - { ARM64::LD1Rv2d, "ld1r", ".2d", 0, 0 }, - { ARM64::LD1Rv8b, "ld1r", ".8b", 0, 0 }, - { ARM64::LD1Rv4h, "ld1r", ".4h", 0, 0 }, - { ARM64::LD1Rv2s, "ld1r", ".2s", 0, 0 }, - { ARM64::LD1Rv1d, "ld1r", ".1d", 0, 0 }, - { ARM64::LD1Rv16b_POST, "ld1r", ".16b", 0, 1 }, - { ARM64::LD1Rv8h_POST, "ld1r", ".8h", 0, 2 }, - { ARM64::LD1Rv4s_POST, "ld1r", ".4s", 0, 4 }, - { ARM64::LD1Rv2d_POST, "ld1r", ".2d", 0, 8 }, - { ARM64::LD1Rv8b_POST, "ld1r", ".8b", 0, 1 }, - { ARM64::LD1Rv4h_POST, "ld1r", ".4h", 0, 2 }, - { ARM64::LD1Rv2s_POST, "ld1r", ".2s", 0, 4 }, - { ARM64::LD1Rv1d_POST, "ld1r", ".1d", 0, 8 }, - { ARM64::LD1Onev16b, "ld1", ".16b", 0, 0 }, - { ARM64::LD1Onev8h, "ld1", ".8h", 0, 0 }, - { ARM64::LD1Onev4s, "ld1", ".4s", 0, 0 }, - { ARM64::LD1Onev2d, "ld1", ".2d", 0, 0 }, - { ARM64::LD1Onev8b, "ld1", ".8b", 0, 0 }, - { ARM64::LD1Onev4h, "ld1", ".4h", 0, 0 }, - { ARM64::LD1Onev2s, "ld1", ".2s", 0, 0 }, - { ARM64::LD1Onev1d, "ld1", ".1d", 0, 0 }, - { ARM64::LD1Onev16b_POST, "ld1", ".16b", 0, 16 }, - { ARM64::LD1Onev8h_POST, "ld1", ".8h", 0, 16 }, - { ARM64::LD1Onev4s_POST, "ld1", ".4s", 0, 16 }, - { ARM64::LD1Onev2d_POST, "ld1", ".2d", 0, 16 }, - { ARM64::LD1Onev8b_POST, "ld1", ".8b", 0, 8 }, - { ARM64::LD1Onev4h_POST, "ld1", ".4h", 0, 8 }, - { ARM64::LD1Onev2s_POST, "ld1", ".2s", 0, 8 }, - { ARM64::LD1Onev1d_POST, "ld1", ".1d", 0, 8 }, - { ARM64::LD1Twov16b, "ld1", ".16b", 0, 0 }, - { ARM64::LD1Twov8h, "ld1", ".8h", 0, 0 }, - { ARM64::LD1Twov4s, "ld1", ".4s", 0, 0 }, - { ARM64::LD1Twov2d, "ld1", ".2d", 0, 0 }, - { ARM64::LD1Twov8b, "ld1", ".8b", 0, 0 }, - { ARM64::LD1Twov4h, "ld1", ".4h", 0, 0 }, - { ARM64::LD1Twov2s, "ld1", ".2s", 0, 0 }, - { ARM64::LD1Twov1d, "ld1", ".1d", 0, 0 }, - { ARM64::LD1Twov16b_POST, "ld1", ".16b", 0, 32 }, - { ARM64::LD1Twov8h_POST, "ld1", ".8h", 0, 32 }, - { ARM64::LD1Twov4s_POST, "ld1", ".4s", 0, 32 }, - { ARM64::LD1Twov2d_POST, "ld1", ".2d", 0, 32 }, - { ARM64::LD1Twov8b_POST, "ld1", ".8b", 0, 16 }, - { ARM64::LD1Twov4h_POST, "ld1", ".4h", 0, 16 }, - { ARM64::LD1Twov2s_POST, "ld1", ".2s", 0, 16 }, - { ARM64::LD1Twov1d_POST, "ld1", ".1d", 0, 16 }, - { ARM64::LD1Threev16b, "ld1", ".16b", 0, 0 }, - { ARM64::LD1Threev8h, "ld1", ".8h", 0, 0 }, - { ARM64::LD1Threev4s, "ld1", ".4s", 0, 0 }, - { ARM64::LD1Threev2d, "ld1", ".2d", 0, 0 }, - { ARM64::LD1Threev8b, "ld1", ".8b", 0, 0 }, - { ARM64::LD1Threev4h, "ld1", ".4h", 0, 0 }, - { ARM64::LD1Threev2s, "ld1", ".2s", 0, 0 }, - { ARM64::LD1Threev1d, "ld1", ".1d", 0, 0 }, - { ARM64::LD1Threev16b_POST, "ld1", ".16b", 0, 48 }, - { ARM64::LD1Threev8h_POST, "ld1", ".8h", 0, 48 }, - { ARM64::LD1Threev4s_POST, "ld1", ".4s", 0, 48 }, - { ARM64::LD1Threev2d_POST, "ld1", ".2d", 0, 48 }, - { ARM64::LD1Threev8b_POST, "ld1", ".8b", 0, 24 }, - { ARM64::LD1Threev4h_POST, "ld1", ".4h", 0, 24 }, - { ARM64::LD1Threev2s_POST, "ld1", ".2s", 0, 24 }, - { ARM64::LD1Threev1d_POST, "ld1", ".1d", 0, 24 }, - { ARM64::LD1Fourv16b, "ld1", ".16b", 0, 0 }, - { ARM64::LD1Fourv8h, "ld1", ".8h", 0, 0 }, - { ARM64::LD1Fourv4s, "ld1", ".4s", 0, 0 }, - { ARM64::LD1Fourv2d, "ld1", ".2d", 0, 0 }, - { ARM64::LD1Fourv8b, "ld1", ".8b", 0, 0 }, - { ARM64::LD1Fourv4h, "ld1", ".4h", 0, 0 }, - { ARM64::LD1Fourv2s, "ld1", ".2s", 0, 0 }, - { ARM64::LD1Fourv1d, "ld1", ".1d", 0, 0 }, - { ARM64::LD1Fourv16b_POST, "ld1", ".16b", 0, 64 }, - { ARM64::LD1Fourv8h_POST, "ld1", ".8h", 0, 64 }, - { 
ARM64::LD1Fourv4s_POST, "ld1", ".4s", 0, 64 }, - { ARM64::LD1Fourv2d_POST, "ld1", ".2d", 0, 64 }, - { ARM64::LD1Fourv8b_POST, "ld1", ".8b", 0, 32 }, - { ARM64::LD1Fourv4h_POST, "ld1", ".4h", 0, 32 }, - { ARM64::LD1Fourv2s_POST, "ld1", ".2s", 0, 32 }, - { ARM64::LD1Fourv1d_POST, "ld1", ".1d", 0, 32 }, - { ARM64::LD2i8, "ld2", ".b", 2, 0 }, - { ARM64::LD2i16, "ld2", ".h", 2, 0 }, - { ARM64::LD2i32, "ld2", ".s", 2, 0 }, - { ARM64::LD2i64, "ld2", ".d", 2, 0 }, - { ARM64::LD2i8_POST, "ld2", ".b", 2, 2 }, - { ARM64::LD2i16_POST, "ld2", ".h", 2, 4 }, - { ARM64::LD2i32_POST, "ld2", ".s", 2, 8 }, - { ARM64::LD2i64_POST, "ld2", ".d", 2, 16 }, - { ARM64::LD2Rv16b, "ld2r", ".16b", 0, 0 }, - { ARM64::LD2Rv8h, "ld2r", ".8h", 0, 0 }, - { ARM64::LD2Rv4s, "ld2r", ".4s", 0, 0 }, - { ARM64::LD2Rv2d, "ld2r", ".2d", 0, 0 }, - { ARM64::LD2Rv8b, "ld2r", ".8b", 0, 0 }, - { ARM64::LD2Rv4h, "ld2r", ".4h", 0, 0 }, - { ARM64::LD2Rv2s, "ld2r", ".2s", 0, 0 }, - { ARM64::LD2Rv1d, "ld2r", ".1d", 0, 0 }, - { ARM64::LD2Rv16b_POST, "ld2r", ".16b", 0, 2 }, - { ARM64::LD2Rv8h_POST, "ld2r", ".8h", 0, 4 }, - { ARM64::LD2Rv4s_POST, "ld2r", ".4s", 0, 8 }, - { ARM64::LD2Rv2d_POST, "ld2r", ".2d", 0, 16 }, - { ARM64::LD2Rv8b_POST, "ld2r", ".8b", 0, 2 }, - { ARM64::LD2Rv4h_POST, "ld2r", ".4h", 0, 4 }, - { ARM64::LD2Rv2s_POST, "ld2r", ".2s", 0, 8 }, - { ARM64::LD2Rv1d_POST, "ld2r", ".1d", 0, 16 }, - { ARM64::LD2Twov16b, "ld2", ".16b", 0, 0 }, - { ARM64::LD2Twov8h, "ld2", ".8h", 0, 0 }, - { ARM64::LD2Twov4s, "ld2", ".4s", 0, 0 }, - { ARM64::LD2Twov2d, "ld2", ".2d", 0, 0 }, - { ARM64::LD2Twov8b, "ld2", ".8b", 0, 0 }, - { ARM64::LD2Twov4h, "ld2", ".4h", 0, 0 }, - { ARM64::LD2Twov2s, "ld2", ".2s", 0, 0 }, - { ARM64::LD2Twov16b_POST, "ld2", ".16b", 0, 32 }, - { ARM64::LD2Twov8h_POST, "ld2", ".8h", 0, 32 }, - { ARM64::LD2Twov4s_POST, "ld2", ".4s", 0, 32 }, - { ARM64::LD2Twov2d_POST, "ld2", ".2d", 0, 32 }, - { ARM64::LD2Twov8b_POST, "ld2", ".8b", 0, 16 }, - { ARM64::LD2Twov4h_POST, "ld2", ".4h", 0, 16 }, - { ARM64::LD2Twov2s_POST, "ld2", ".2s", 0, 16 }, - { ARM64::LD3i8, "ld3", ".b", 2, 0 }, - { ARM64::LD3i16, "ld3", ".h", 2, 0 }, - { ARM64::LD3i32, "ld3", ".s", 2, 0 }, - { ARM64::LD3i64, "ld3", ".d", 2, 0 }, - { ARM64::LD3i8_POST, "ld3", ".b", 2, 3 }, - { ARM64::LD3i16_POST, "ld3", ".h", 2, 6 }, - { ARM64::LD3i32_POST, "ld3", ".s", 2, 12 }, - { ARM64::LD3i64_POST, "ld3", ".d", 2, 24 }, - { ARM64::LD3Rv16b, "ld3r", ".16b", 0, 0 }, - { ARM64::LD3Rv8h, "ld3r", ".8h", 0, 0 }, - { ARM64::LD3Rv4s, "ld3r", ".4s", 0, 0 }, - { ARM64::LD3Rv2d, "ld3r", ".2d", 0, 0 }, - { ARM64::LD3Rv8b, "ld3r", ".8b", 0, 0 }, - { ARM64::LD3Rv4h, "ld3r", ".4h", 0, 0 }, - { ARM64::LD3Rv2s, "ld3r", ".2s", 0, 0 }, - { ARM64::LD3Rv1d, "ld3r", ".1d", 0, 0 }, - { ARM64::LD3Rv16b_POST, "ld3r", ".16b", 0, 3 }, - { ARM64::LD3Rv8h_POST, "ld3r", ".8h", 0, 6 }, - { ARM64::LD3Rv4s_POST, "ld3r", ".4s", 0, 12 }, - { ARM64::LD3Rv2d_POST, "ld3r", ".2d", 0, 24 }, - { ARM64::LD3Rv8b_POST, "ld3r", ".8b", 0, 3 }, - { ARM64::LD3Rv4h_POST, "ld3r", ".4h", 0, 6 }, - { ARM64::LD3Rv2s_POST, "ld3r", ".2s", 0, 12 }, - { ARM64::LD3Rv1d_POST, "ld3r", ".1d", 0, 24 }, - { ARM64::LD3Threev16b, "ld3", ".16b", 0, 0 }, - { ARM64::LD3Threev8h, "ld3", ".8h", 0, 0 }, - { ARM64::LD3Threev4s, "ld3", ".4s", 0, 0 }, - { ARM64::LD3Threev2d, "ld3", ".2d", 0, 0 }, - { ARM64::LD3Threev8b, "ld3", ".8b", 0, 0 }, - { ARM64::LD3Threev4h, "ld3", ".4h", 0, 0 }, - { ARM64::LD3Threev2s, "ld3", ".2s", 0, 0 }, - { ARM64::LD3Threev16b_POST, "ld3", ".16b", 0, 48 }, - { ARM64::LD3Threev8h_POST, "ld3", ".8h", 0, 48 }, - { 
ARM64::LD3Threev4s_POST, "ld3", ".4s", 0, 48 }, - { ARM64::LD3Threev2d_POST, "ld3", ".2d", 0, 48 }, - { ARM64::LD3Threev8b_POST, "ld3", ".8b", 0, 24 }, - { ARM64::LD3Threev4h_POST, "ld3", ".4h", 0, 24 }, - { ARM64::LD3Threev2s_POST, "ld3", ".2s", 0, 24 }, - { ARM64::LD4i8, "ld4", ".b", 2, 0 }, - { ARM64::LD4i16, "ld4", ".h", 2, 0 }, - { ARM64::LD4i32, "ld4", ".s", 2, 0 }, - { ARM64::LD4i64, "ld4", ".d", 2, 0 }, - { ARM64::LD4i8_POST, "ld4", ".b", 2, 4 }, - { ARM64::LD4i16_POST, "ld4", ".h", 2, 8 }, - { ARM64::LD4i32_POST, "ld4", ".s", 2, 16 }, - { ARM64::LD4i64_POST, "ld4", ".d", 2, 32 }, - { ARM64::LD4Rv16b, "ld4r", ".16b", 0, 0 }, - { ARM64::LD4Rv8h, "ld4r", ".8h", 0, 0 }, - { ARM64::LD4Rv4s, "ld4r", ".4s", 0, 0 }, - { ARM64::LD4Rv2d, "ld4r", ".2d", 0, 0 }, - { ARM64::LD4Rv8b, "ld4r", ".8b", 0, 0 }, - { ARM64::LD4Rv4h, "ld4r", ".4h", 0, 0 }, - { ARM64::LD4Rv2s, "ld4r", ".2s", 0, 0 }, - { ARM64::LD4Rv1d, "ld4r", ".1d", 0, 0 }, - { ARM64::LD4Rv16b_POST, "ld4r", ".16b", 0, 4 }, - { ARM64::LD4Rv8h_POST, "ld4r", ".8h", 0, 8 }, - { ARM64::LD4Rv4s_POST, "ld4r", ".4s", 0, 16 }, - { ARM64::LD4Rv2d_POST, "ld4r", ".2d", 0, 32 }, - { ARM64::LD4Rv8b_POST, "ld4r", ".8b", 0, 4 }, - { ARM64::LD4Rv4h_POST, "ld4r", ".4h", 0, 8 }, - { ARM64::LD4Rv2s_POST, "ld4r", ".2s", 0, 16 }, - { ARM64::LD4Rv1d_POST, "ld4r", ".1d", 0, 32 }, - { ARM64::LD4Fourv16b, "ld4", ".16b", 0, 0 }, - { ARM64::LD4Fourv8h, "ld4", ".8h", 0, 0 }, - { ARM64::LD4Fourv4s, "ld4", ".4s", 0, 0 }, - { ARM64::LD4Fourv2d, "ld4", ".2d", 0, 0 }, - { ARM64::LD4Fourv8b, "ld4", ".8b", 0, 0 }, - { ARM64::LD4Fourv4h, "ld4", ".4h", 0, 0 }, - { ARM64::LD4Fourv2s, "ld4", ".2s", 0, 0 }, - { ARM64::LD4Fourv16b_POST, "ld4", ".16b", 0, 64 }, - { ARM64::LD4Fourv8h_POST, "ld4", ".8h", 0, 64 }, - { ARM64::LD4Fourv4s_POST, "ld4", ".4s", 0, 64 }, - { ARM64::LD4Fourv2d_POST, "ld4", ".2d", 0, 64 }, - { ARM64::LD4Fourv8b_POST, "ld4", ".8b", 0, 32 }, - { ARM64::LD4Fourv4h_POST, "ld4", ".4h", 0, 32 }, - { ARM64::LD4Fourv2s_POST, "ld4", ".2s", 0, 32 }, - { ARM64::ST1i8, "st1", ".b", 1, 0 }, - { ARM64::ST1i16, "st1", ".h", 1, 0 }, - { ARM64::ST1i32, "st1", ".s", 1, 0 }, - { ARM64::ST1i64, "st1", ".d", 1, 0 }, - { ARM64::ST1i8_POST, "st1", ".b", 1, 1 }, - { ARM64::ST1i16_POST, "st1", ".h", 1, 2 }, - { ARM64::ST1i32_POST, "st1", ".s", 1, 4 }, - { ARM64::ST1i64_POST, "st1", ".d", 1, 8 }, - { ARM64::ST1Onev16b, "st1", ".16b", 0, 0 }, - { ARM64::ST1Onev8h, "st1", ".8h", 0, 0 }, - { ARM64::ST1Onev4s, "st1", ".4s", 0, 0 }, - { ARM64::ST1Onev2d, "st1", ".2d", 0, 0 }, - { ARM64::ST1Onev8b, "st1", ".8b", 0, 0 }, - { ARM64::ST1Onev4h, "st1", ".4h", 0, 0 }, - { ARM64::ST1Onev2s, "st1", ".2s", 0, 0 }, - { ARM64::ST1Onev1d, "st1", ".1d", 0, 0 }, - { ARM64::ST1Onev16b_POST, "st1", ".16b", 0, 16 }, - { ARM64::ST1Onev8h_POST, "st1", ".8h", 0, 16 }, - { ARM64::ST1Onev4s_POST, "st1", ".4s", 0, 16 }, - { ARM64::ST1Onev2d_POST, "st1", ".2d", 0, 16 }, - { ARM64::ST1Onev8b_POST, "st1", ".8b", 0, 8 }, - { ARM64::ST1Onev4h_POST, "st1", ".4h", 0, 8 }, - { ARM64::ST1Onev2s_POST, "st1", ".2s", 0, 8 }, - { ARM64::ST1Onev1d_POST, "st1", ".1d", 0, 8 }, - { ARM64::ST1Twov16b, "st1", ".16b", 0, 0 }, - { ARM64::ST1Twov8h, "st1", ".8h", 0, 0 }, - { ARM64::ST1Twov4s, "st1", ".4s", 0, 0 }, - { ARM64::ST1Twov2d, "st1", ".2d", 0, 0 }, - { ARM64::ST1Twov8b, "st1", ".8b", 0, 0 }, - { ARM64::ST1Twov4h, "st1", ".4h", 0, 0 }, - { ARM64::ST1Twov2s, "st1", ".2s", 0, 0 }, - { ARM64::ST1Twov1d, "st1", ".1d", 0, 0 }, - { ARM64::ST1Twov16b_POST, "st1", ".16b", 0, 32 }, - { ARM64::ST1Twov8h_POST, "st1", ".8h", 0, 32 }, 
- { ARM64::ST1Twov4s_POST, "st1", ".4s", 0, 32 }, - { ARM64::ST1Twov2d_POST, "st1", ".2d", 0, 32 }, - { ARM64::ST1Twov8b_POST, "st1", ".8b", 0, 16 }, - { ARM64::ST1Twov4h_POST, "st1", ".4h", 0, 16 }, - { ARM64::ST1Twov2s_POST, "st1", ".2s", 0, 16 }, - { ARM64::ST1Twov1d_POST, "st1", ".1d", 0, 16 }, - { ARM64::ST1Threev16b, "st1", ".16b", 0, 0 }, - { ARM64::ST1Threev8h, "st1", ".8h", 0, 0 }, - { ARM64::ST1Threev4s, "st1", ".4s", 0, 0 }, - { ARM64::ST1Threev2d, "st1", ".2d", 0, 0 }, - { ARM64::ST1Threev8b, "st1", ".8b", 0, 0 }, - { ARM64::ST1Threev4h, "st1", ".4h", 0, 0 }, - { ARM64::ST1Threev2s, "st1", ".2s", 0, 0 }, - { ARM64::ST1Threev1d, "st1", ".1d", 0, 0 }, - { ARM64::ST1Threev16b_POST, "st1", ".16b", 0, 48 }, - { ARM64::ST1Threev8h_POST, "st1", ".8h", 0, 48 }, - { ARM64::ST1Threev4s_POST, "st1", ".4s", 0, 48 }, - { ARM64::ST1Threev2d_POST, "st1", ".2d", 0, 48 }, - { ARM64::ST1Threev8b_POST, "st1", ".8b", 0, 24 }, - { ARM64::ST1Threev4h_POST, "st1", ".4h", 0, 24 }, - { ARM64::ST1Threev2s_POST, "st1", ".2s", 0, 24 }, - { ARM64::ST1Threev1d_POST, "st1", ".1d", 0, 24 }, - { ARM64::ST1Fourv16b, "st1", ".16b", 0, 0 }, - { ARM64::ST1Fourv8h, "st1", ".8h", 0, 0 }, - { ARM64::ST1Fourv4s, "st1", ".4s", 0, 0 }, - { ARM64::ST1Fourv2d, "st1", ".2d", 0, 0 }, - { ARM64::ST1Fourv8b, "st1", ".8b", 0, 0 }, - { ARM64::ST1Fourv4h, "st1", ".4h", 0, 0 }, - { ARM64::ST1Fourv2s, "st1", ".2s", 0, 0 }, - { ARM64::ST1Fourv1d, "st1", ".1d", 0, 0 }, - { ARM64::ST1Fourv16b_POST, "st1", ".16b", 0, 64 }, - { ARM64::ST1Fourv8h_POST, "st1", ".8h", 0, 64 }, - { ARM64::ST1Fourv4s_POST, "st1", ".4s", 0, 64 }, - { ARM64::ST1Fourv2d_POST, "st1", ".2d", 0, 64 }, - { ARM64::ST1Fourv8b_POST, "st1", ".8b", 0, 32 }, - { ARM64::ST1Fourv4h_POST, "st1", ".4h", 0, 32 }, - { ARM64::ST1Fourv2s_POST, "st1", ".2s", 0, 32 }, - { ARM64::ST1Fourv1d_POST, "st1", ".1d", 0, 32 }, - { ARM64::ST2i8, "st2", ".b", 1, 0 }, - { ARM64::ST2i16, "st2", ".h", 1, 0 }, - { ARM64::ST2i32, "st2", ".s", 1, 0 }, - { ARM64::ST2i64, "st2", ".d", 1, 0 }, - { ARM64::ST2i8_POST, "st2", ".b", 1, 2 }, - { ARM64::ST2i16_POST, "st2", ".h", 1, 4 }, - { ARM64::ST2i32_POST, "st2", ".s", 1, 8 }, - { ARM64::ST2i64_POST, "st2", ".d", 1, 16 }, - { ARM64::ST2Twov16b, "st2", ".16b", 0, 0 }, - { ARM64::ST2Twov8h, "st2", ".8h", 0, 0 }, - { ARM64::ST2Twov4s, "st2", ".4s", 0, 0 }, - { ARM64::ST2Twov2d, "st2", ".2d", 0, 0 }, - { ARM64::ST2Twov8b, "st2", ".8b", 0, 0 }, - { ARM64::ST2Twov4h, "st2", ".4h", 0, 0 }, - { ARM64::ST2Twov2s, "st2", ".2s", 0, 0 }, - { ARM64::ST2Twov16b_POST, "st2", ".16b", 0, 32 }, - { ARM64::ST2Twov8h_POST, "st2", ".8h", 0, 32 }, - { ARM64::ST2Twov4s_POST, "st2", ".4s", 0, 32 }, - { ARM64::ST2Twov2d_POST, "st2", ".2d", 0, 32 }, - { ARM64::ST2Twov8b_POST, "st2", ".8b", 0, 16 }, - { ARM64::ST2Twov4h_POST, "st2", ".4h", 0, 16 }, - { ARM64::ST2Twov2s_POST, "st2", ".2s", 0, 16 }, - { ARM64::ST3i8, "st3", ".b", 1, 0 }, - { ARM64::ST3i16, "st3", ".h", 1, 0 }, - { ARM64::ST3i32, "st3", ".s", 1, 0 }, - { ARM64::ST3i64, "st3", ".d", 1, 0 }, - { ARM64::ST3i8_POST, "st3", ".b", 1, 3 }, - { ARM64::ST3i16_POST, "st3", ".h", 1, 6 }, - { ARM64::ST3i32_POST, "st3", ".s", 1, 12 }, - { ARM64::ST3i64_POST, "st3", ".d", 1, 24 }, - { ARM64::ST3Threev16b, "st3", ".16b", 0, 0 }, - { ARM64::ST3Threev8h, "st3", ".8h", 0, 0 }, - { ARM64::ST3Threev4s, "st3", ".4s", 0, 0 }, - { ARM64::ST3Threev2d, "st3", ".2d", 0, 0 }, - { ARM64::ST3Threev8b, "st3", ".8b", 0, 0 }, - { ARM64::ST3Threev4h, "st3", ".4h", 0, 0 }, - { ARM64::ST3Threev2s, "st3", ".2s", 0, 0 }, - { ARM64::ST3Threev16b_POST, 
"st3", ".16b", 0, 48 }, - { ARM64::ST3Threev8h_POST, "st3", ".8h", 0, 48 }, - { ARM64::ST3Threev4s_POST, "st3", ".4s", 0, 48 }, - { ARM64::ST3Threev2d_POST, "st3", ".2d", 0, 48 }, - { ARM64::ST3Threev8b_POST, "st3", ".8b", 0, 24 }, - { ARM64::ST3Threev4h_POST, "st3", ".4h", 0, 24 }, - { ARM64::ST3Threev2s_POST, "st3", ".2s", 0, 24 }, - { ARM64::ST4i8, "st4", ".b", 1, 0 }, - { ARM64::ST4i16, "st4", ".h", 1, 0 }, - { ARM64::ST4i32, "st4", ".s", 1, 0 }, - { ARM64::ST4i64, "st4", ".d", 1, 0 }, - { ARM64::ST4i8_POST, "st4", ".b", 1, 4 }, - { ARM64::ST4i16_POST, "st4", ".h", 1, 8 }, - { ARM64::ST4i32_POST, "st4", ".s", 1, 16 }, - { ARM64::ST4i64_POST, "st4", ".d", 1, 32 }, - { ARM64::ST4Fourv16b, "st4", ".16b", 0, 0 }, - { ARM64::ST4Fourv8h, "st4", ".8h", 0, 0 }, - { ARM64::ST4Fourv4s, "st4", ".4s", 0, 0 }, - { ARM64::ST4Fourv2d, "st4", ".2d", 0, 0 }, - { ARM64::ST4Fourv8b, "st4", ".8b", 0, 0 }, - { ARM64::ST4Fourv4h, "st4", ".4h", 0, 0 }, - { ARM64::ST4Fourv2s, "st4", ".2s", 0, 0 }, - { ARM64::ST4Fourv16b_POST, "st4", ".16b", 0, 64 }, - { ARM64::ST4Fourv8h_POST, "st4", ".8h", 0, 64 }, - { ARM64::ST4Fourv4s_POST, "st4", ".4s", 0, 64 }, - { ARM64::ST4Fourv2d_POST, "st4", ".2d", 0, 64 }, - { ARM64::ST4Fourv8b_POST, "st4", ".8b", 0, 32 }, - { ARM64::ST4Fourv4h_POST, "st4", ".4h", 0, 32 }, - { ARM64::ST4Fourv2s_POST, "st4", ".2s", 0, 32 }, -}; - -static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { - unsigned Idx; - for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) - if (LdStNInstInfo[Idx].Opcode == Opcode) - return &LdStNInstInfo[Idx]; - - return 0; -} - -void ARM64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot) { - unsigned Opcode = MI->getOpcode(); - StringRef Layout, Mnemonic; - - bool IsTbx; - if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) { - O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t' - << getRegisterName(MI->getOperand(0).getReg(), ARM64::vreg) << ", "; - - unsigned ListOpNum = IsTbx ? 2 : 1; - printVectorList(MI, ListOpNum, O, ""); - - O << ", " - << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), ARM64::vreg); - printAnnotation(O, Annot); - return; - } - - if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { - O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; - - // Now onto the operands: first a vector list with possible lane - // specifier. E.g. { v0 }[2] - printVectorList(MI, 0, O, ""); - - if (LdStDesc->LaneOperand != 0) - O << '[' << MI->getOperand(LdStDesc->LaneOperand).getImm() << ']'; - - // Next the address: [xN] - unsigned AddrOpNum = LdStDesc->LaneOperand + 1; - unsigned AddrReg = MI->getOperand(AddrOpNum).getReg(); - O << ", [" << getRegisterName(AddrReg) << ']'; - - // Finally, there might be a post-indexed offset. 
- if (LdStDesc->NaturalOffset != 0) { - unsigned Reg = MI->getOperand(AddrOpNum + 1).getReg(); - if (Reg != ARM64::XZR) - O << ", " << getRegisterName(Reg); - else { - assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?"); - O << ", #" << LdStDesc->NaturalOffset; - } - } - - printAnnotation(O, Annot); - return; - } - - ARM64InstPrinter::printInst(MI, O, Annot); -} - -bool ARM64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { -#ifndef NDEBUG - unsigned Opcode = MI->getOpcode(); - assert((Opcode == ARM64::SYS || Opcode == ARM64::SYSxt) && - "Invalid opcode for SYS alias!"); -#endif - - const char *Asm = 0; - const MCOperand &Op1 = MI->getOperand(0); - const MCOperand &Cn = MI->getOperand(1); - const MCOperand &Cm = MI->getOperand(2); - const MCOperand &Op2 = MI->getOperand(3); - - unsigned Op1Val = Op1.getImm(); - unsigned CnVal = Cn.getImm(); - unsigned CmVal = Cm.getImm(); - unsigned Op2Val = Op2.getImm(); - - if (CnVal == 7) { - switch (CmVal) { - default: - break; - - // IC aliases - case 1: - if (Op1Val == 0 && Op2Val == 0) - Asm = "ic\tialluis"; - break; - case 5: - if (Op1Val == 0 && Op2Val == 0) - Asm = "ic\tiallu"; - else if (Op1Val == 3 && Op2Val == 1) - Asm = "ic\tivau"; - break; - - // DC aliases - case 4: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tzva"; - break; - case 6: - if (Op1Val == 0 && Op2Val == 1) - Asm = "dc\tivac"; - if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tisw"; - break; - case 10: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcvac"; - else if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tcsw"; - break; - case 11: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcvau"; - break; - case 14: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcivac"; - else if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tcisw"; - break; - - // AT aliases - case 8: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e1r"; break; - case 1: Asm = "at\ts1e1w"; break; - case 2: Asm = "at\ts1e0r"; break; - case 3: Asm = "at\ts1e0w"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e2r"; break; - case 1: Asm = "at\ts1e2w"; break; - case 4: Asm = "at\ts12e1r"; break; - case 5: Asm = "at\ts12e1w"; break; - case 6: Asm = "at\ts12e0r"; break; - case 7: Asm = "at\ts12e0w"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e3r"; break; - case 1: Asm = "at\ts1e3w"; break; - } - break; - } - break; - } - } else if (CnVal == 8) { - // TLBI aliases - switch (CmVal) { - default: - break; - case 3: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\tvmalle1is"; break; - case 1: Asm = "tlbi\tvae1is"; break; - case 2: Asm = "tlbi\taside1is"; break; - case 3: Asm = "tlbi\tvaae1is"; break; - case 5: Asm = "tlbi\tvale1is"; break; - case 7: Asm = "tlbi\tvaale1is"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle2is"; break; - case 1: Asm = "tlbi\tvae2is"; break; - case 4: Asm = "tlbi\talle1is"; break; - case 5: Asm = "tlbi\tvale2is"; break; - case 6: Asm = "tlbi\tvmalls12e1is"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle3is"; break; - case 1: Asm = "tlbi\tvae3is"; break; - case 5: Asm = "tlbi\tvale3is"; break; - } - break; - } - break; - case 4: - switch (Op1Val) { - default: - break; - case 4: - switch (Op2Val) { - default: - break; - case 1: Asm = "tlbi\tipas2e1"; break; - case 
5: Asm = "tlbi\tipas2le1"; break; - } - break; - } - break; - case 7: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\tvmalle1"; break; - case 1: Asm = "tlbi\tvae1"; break; - case 2: Asm = "tlbi\taside1"; break; - case 3: Asm = "tlbi\tvaae1"; break; - case 5: Asm = "tlbi\tvale1"; break; - case 7: Asm = "tlbi\tvaale1"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle2"; break; - case 1: Asm = "tlbi\tvae2"; break; - case 4: Asm = "tlbi\talle1"; break; - case 5: Asm = "tlbi\tvale2"; break; - case 6: Asm = "tlbi\tvmalls12e1"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle3"; break; - case 1: Asm = "tlbi\tvae3"; break; - case 5: Asm = "tlbi\tvale3"; break; - } - break; - } - break; - } - } - - if (Asm) { - O << '\t' << Asm; - if (MI->getNumOperands() == 5) - O << ", " << getRegisterName(MI->getOperand(4).getReg()); - } - - return Asm != 0; -} - -void ARM64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - O << getRegisterName(Reg); - } else if (Op.isImm()) { - O << '#' << Op.getImm(); - } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - O << *Op.getExpr(); - } -} - -void ARM64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, - unsigned Imm, raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - if (Reg == ARM64::XZR) - O << "#" << Imm; - else - O << getRegisterName(Reg); - } else - assert(0 && "unknown operand kind in printPostIncOperand64"); -} - -void ARM64InstPrinter::printPostIncOperand1(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printPostIncOperand(MI, OpNo, 1, O); -} - -void ARM64InstPrinter::printPostIncOperand2(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printPostIncOperand(MI, OpNo, 2, O); -} - -void ARM64InstPrinter::printPostIncOperand3(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printPostIncOperand(MI, OpNo, 3, O); -} - -void ARM64InstPrinter::printPostIncOperand4(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printPostIncOperand(MI, OpNo, 4, O); -} - -void ARM64InstPrinter::printPostIncOperand6(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printPostIncOperand(MI, OpNo, 6, O); -} - -void ARM64InstPrinter::printPostIncOperand8(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printPostIncOperand(MI, OpNo, 8, O); -} - -void ARM64InstPrinter::printPostIncOperand12(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printPostIncOperand(MI, OpNo, 12, O); -} - -void ARM64InstPrinter::printPostIncOperand16(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printPostIncOperand(MI, OpNo, 16, O); -} - -void ARM64InstPrinter::printPostIncOperand24(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printPostIncOperand(MI, OpNo, 24, O); -} - -void ARM64InstPrinter::printPostIncOperand32(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printPostIncOperand(MI, OpNo, 32, O); -} - -void ARM64InstPrinter::printPostIncOperand48(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printPostIncOperand(MI, OpNo, 48, O); -} - -void ARM64InstPrinter::printPostIncOperand64(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printPostIncOperand(MI, OpNo, 64, O); -} - -void ARM64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo, - 
raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isReg() && "Non-register vreg operand!"); - unsigned Reg = Op.getReg(); - O << getRegisterName(Reg, ARM64::vreg); -} - -void ARM64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isImm() && "System instruction C[nm] operands must be immediates!"); - O << "c" << Op.getImm(); -} - -void ARM64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - if (MO.isImm()) { - unsigned Val = (MO.getImm() & 0xfff); - assert(Val == MO.getImm() && "Add/sub immediate out of range!"); - unsigned Shift = - ARM64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm()); - O << '#' << (Val << Shift); - // Distinguish "0, lsl #12" from "0, lsl #0". - if (Val == 0 && Shift != 0) - printShifter(MI, OpNum + 1, O); - } else { - assert(MO.isExpr() && "Unexpected operand type!"); - O << *MO.getExpr(); - printShifter(MI, OpNum + 1, O); - } -} - -void ARM64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - uint64_t Val = MI->getOperand(OpNum).getImm(); - O << "#0x"; - O.write_hex(ARM64_AM::decodeLogicalImmediate(Val, 32)); -} - -void ARM64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - uint64_t Val = MI->getOperand(OpNum).getImm(); - O << "#0x"; - O.write_hex(ARM64_AM::decodeLogicalImmediate(Val, 64)); -} - -void ARM64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNum).getImm(); - // LSL #0 should not be printed. - if (ARM64_AM::getShiftType(Val) == ARM64_AM::LSL && - ARM64_AM::getShiftValue(Val) == 0) - return; - O << ", " << ARM64_AM::getShiftName(ARM64_AM::getShiftType(Val)) << " #" - << ARM64_AM::getShiftValue(Val); -} - -void ARM64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << getRegisterName(MI->getOperand(OpNum).getReg()); - printShifter(MI, OpNum + 1, O); -} - -void ARM64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << getRegisterName(MI->getOperand(OpNum).getReg()); - printExtend(MI, OpNum + 1, O); -} - -void ARM64InstPrinter::printExtend(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNum).getImm(); - ARM64_AM::ExtendType ExtType = ARM64_AM::getArithExtendType(Val); - unsigned ShiftVal = ARM64_AM::getArithShiftValue(Val); - - // If the destination or first source register operand is [W]SP, print - // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at - // all. - if (ExtType == ARM64_AM::UXTW || ExtType == ARM64_AM::UXTX) { - unsigned Dest = MI->getOperand(0).getReg(); - unsigned Src1 = MI->getOperand(1).getReg(); - if (Dest == ARM64::SP || Dest == ARM64::WSP || Src1 == ARM64::SP || - Src1 == ARM64::WSP) { - if (ShiftVal != 0) - O << ", lsl #" << ShiftVal; - return; - } - } - O << ", " << ARM64_AM::getExtendName(ExtType); - if (ShiftVal != 0) - O << " #" << ShiftVal; -} - -void ARM64InstPrinter::printDotCondCode(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(OpNum).getImm(); - if (CC != ARM64CC::AL) - O << '.' 
<< ARM64CC::getCondCodeName(CC); -} - -void ARM64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(OpNum).getImm(); - O << ARM64CC::getCondCodeName(CC); -} - -void ARM64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']'; -} - -void ARM64InstPrinter::printImmScale4(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << '#' << 4 * MI->getOperand(OpNum).getImm(); -} - -void ARM64InstPrinter::printImmScale8(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << '#' << 8 * MI->getOperand(OpNum).getImm(); -} - -void ARM64InstPrinter::printImmScale16(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << '#' << 16 * MI->getOperand(OpNum).getImm(); -} - -void ARM64InstPrinter::printAMIndexed(const MCInst *MI, unsigned OpNum, - unsigned Scale, raw_ostream &O) { - const MCOperand MO1 = MI->getOperand(OpNum + 1); - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); - if (MO1.isImm()) { - if (MO1.getImm() != 0) - O << ", #" << (MO1.getImm() * Scale); - } else { - assert(MO1.isExpr() && "Unexpected operand type!"); - O << ", " << *MO1.getExpr(); - } - O << ']'; -} - -void ARM64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned prfop = MI->getOperand(OpNum).getImm(); - if (ARM64_AM::isNamedPrefetchOp(prfop)) - O << ARM64_AM::getPrefetchOpName((ARM64_AM::PrefetchOp)prfop); - else - O << '#' << prfop; -} - -void ARM64InstPrinter::printMemoryPostIndexed32(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']' << ", #" - << 4 * MI->getOperand(OpNum + 1).getImm(); -} - -void ARM64InstPrinter::printMemoryPostIndexed64(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']' << ", #" - << 8 * MI->getOperand(OpNum + 1).getImm(); -} - -void ARM64InstPrinter::printMemoryPostIndexed128(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']' << ", #" - << 16 * MI->getOperand(OpNum + 1).getImm(); -} - -void ARM64InstPrinter::printMemoryPostIndexed(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']' << ", #" - << MI->getOperand(OpNum + 1).getImm(); -} - -void ARM64InstPrinter::printMemoryRegOffset(const MCInst *MI, unsigned OpNum, - raw_ostream &O, int LegalShiftAmt) { - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ", " - << getRegisterName(MI->getOperand(OpNum + 1).getReg()); - - unsigned Val = MI->getOperand(OpNum + 2).getImm(); - ARM64_AM::ExtendType ExtType = ARM64_AM::getMemExtendType(Val); - bool DoShift = ARM64_AM::getMemDoShift(Val); - - if (ExtType == ARM64_AM::UXTX) { - if (DoShift) - O << ", lsl"; - } else - O << ", " << ARM64_AM::getExtendName(ExtType); - - if (DoShift) - O << " #" << LegalShiftAmt; - - O << "]"; -} - -void ARM64InstPrinter::printMemoryRegOffset8(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printMemoryRegOffset(MI, OpNum, O, 0); -} - -void ARM64InstPrinter::printMemoryRegOffset16(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printMemoryRegOffset(MI, OpNum, O, 1); -} - -void ARM64InstPrinter::printMemoryRegOffset32(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printMemoryRegOffset(MI, OpNum, O, 2); -} - -void 
ARM64InstPrinter::printMemoryRegOffset64(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printMemoryRegOffset(MI, OpNum, O, 3); -} - -void ARM64InstPrinter::printMemoryRegOffset128(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - printMemoryRegOffset(MI, OpNum, O, 4); -} - -void ARM64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - O << '#'; - if (MO.isFPImm()) - // FIXME: Should this ever happen? - O << MO.getFPImm(); - else - O << ARM64_AM::getFPImmFloat(MO.getImm()); -} - -static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) { - while (Stride--) { - switch (Reg) { - default: - assert(0 && "Vector register expected!"); - case ARM64::Q0: Reg = ARM64::Q1; break; - case ARM64::Q1: Reg = ARM64::Q2; break; - case ARM64::Q2: Reg = ARM64::Q3; break; - case ARM64::Q3: Reg = ARM64::Q4; break; - case ARM64::Q4: Reg = ARM64::Q5; break; - case ARM64::Q5: Reg = ARM64::Q6; break; - case ARM64::Q6: Reg = ARM64::Q7; break; - case ARM64::Q7: Reg = ARM64::Q8; break; - case ARM64::Q8: Reg = ARM64::Q9; break; - case ARM64::Q9: Reg = ARM64::Q10; break; - case ARM64::Q10: Reg = ARM64::Q11; break; - case ARM64::Q11: Reg = ARM64::Q12; break; - case ARM64::Q12: Reg = ARM64::Q13; break; - case ARM64::Q13: Reg = ARM64::Q14; break; - case ARM64::Q14: Reg = ARM64::Q15; break; - case ARM64::Q15: Reg = ARM64::Q16; break; - case ARM64::Q16: Reg = ARM64::Q17; break; - case ARM64::Q17: Reg = ARM64::Q18; break; - case ARM64::Q18: Reg = ARM64::Q19; break; - case ARM64::Q19: Reg = ARM64::Q20; break; - case ARM64::Q20: Reg = ARM64::Q21; break; - case ARM64::Q21: Reg = ARM64::Q22; break; - case ARM64::Q22: Reg = ARM64::Q23; break; - case ARM64::Q23: Reg = ARM64::Q24; break; - case ARM64::Q24: Reg = ARM64::Q25; break; - case ARM64::Q25: Reg = ARM64::Q26; break; - case ARM64::Q26: Reg = ARM64::Q27; break; - case ARM64::Q27: Reg = ARM64::Q28; break; - case ARM64::Q28: Reg = ARM64::Q29; break; - case ARM64::Q29: Reg = ARM64::Q30; break; - case ARM64::Q30: Reg = ARM64::Q31; break; - // Vector lists can wrap around. - case ARM64::Q31: - Reg = ARM64::Q0; - break; - } - } - return Reg; -} - -void ARM64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, - raw_ostream &O, StringRef LayoutSuffix) { - unsigned Reg = MI->getOperand(OpNum).getReg(); - - O << "{ "; - - // Work out how many registers there are in the list (if there is an actual - // list). - unsigned NumRegs = 1; - if (MRI.getRegClass(ARM64::DDRegClassID).contains(Reg) || - MRI.getRegClass(ARM64::QQRegClassID).contains(Reg)) - NumRegs = 2; - else if (MRI.getRegClass(ARM64::DDDRegClassID).contains(Reg) || - MRI.getRegClass(ARM64::QQQRegClassID).contains(Reg)) - NumRegs = 3; - else if (MRI.getRegClass(ARM64::DDDDRegClassID).contains(Reg) || - MRI.getRegClass(ARM64::QQQQRegClassID).contains(Reg)) - NumRegs = 4; - - // Now forget about the list and find out what the first register is. - if (unsigned FirstReg = MRI.getSubReg(Reg, ARM64::dsub0)) - Reg = FirstReg; - else if (unsigned FirstReg = MRI.getSubReg(Reg, ARM64::qsub0)) - Reg = FirstReg; - - // If it's a D-reg, we need to promote it to the equivalent Q-reg before - // printing (otherwise getRegisterName fails). 
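(Aside, not part of the deleted file: the getNextVectorRegister/printVectorList pair being assembled above walks consecutive Q registers and wraps from Q31 back to Q0, printing forms such as "{ v0.4s, v1.4s }". A toy model of that output format, using plain indices instead of MC register enums; the real code needs the explicit switch precisely because enum adjacency is not guaranteed.)

#include <cstdio>

// Print a wrapped list of NumRegs vector registers starting at index Start,
// e.g. printVList(30, 3, ".4s") emits "{ v30.4s, v31.4s, v0.4s }".
static void printVList(unsigned Start, unsigned NumRegs, const char *Suffix) {
  std::printf("{ ");
  for (unsigned i = 0; i != NumRegs; ++i) {
    std::printf("v%u%s", (Start + i) % 32, Suffix); // lists wrap at v31
    if (i + 1 != NumRegs)
      std::printf(", ");
  }
  std::printf(" }");
}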
- if (MRI.getRegClass(ARM64::FPR64RegClassID).contains(Reg)) { - const MCRegisterClass &FPR128RC = MRI.getRegClass(ARM64::FPR128RegClassID); - Reg = MRI.getMatchingSuperReg(Reg, ARM64::dsub, &FPR128RC); - } - - for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) { - O << getRegisterName(Reg, ARM64::vreg) << LayoutSuffix; - if (i + 1 != NumRegs) - O << ", "; - } - - O << " }"; -} - -void ARM64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - printVectorList(MI, OpNum, O, ""); -} - -template -void ARM64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - std::string Suffix("."); - if (NumLanes) - Suffix += itostr(NumLanes) + LaneKind; - else - Suffix += LaneKind; - - printVectorList(MI, OpNum, O, Suffix); -} - -void ARM64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "[" << MI->getOperand(OpNum).getImm() << "]"; -} - -void ARM64InstPrinter::printAlignedBranchTarget(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - - // If the label has already been resolved to an immediate offset (say, when - // we're running the disassembler), just print the immediate. - if (Op.isImm()) { - O << "#" << (Op.getImm() << 2); - return; - } - - // If the branch target is simply an address then print it in hex. - const MCConstantExpr *BranchTarget = - dyn_cast(MI->getOperand(OpNum).getExpr()); - int64_t Address; - if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { - O << "0x"; - O.write_hex(Address); - } else { - // Otherwise, just print the expression. - O << *MI->getOperand(OpNum).getExpr(); - } -} - -void ARM64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - - // If the label has already been resolved to an immediate offset (say, when - // we're running the disassembler), just print the immediate. - if (Op.isImm()) { - O << "#" << (Op.getImm() << 12); - return; - } - - // Otherwise, just print the expression. 
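(Aside, not part of the deleted file: both label printers above scale the resolved immediate before printing, branch offsets by 4 since instructions are word aligned, and ADRP offsets by 4 KiB since it addresses pages. A sketch of the address arithmetic those printed immediates imply; PC and the Imm parameters are hypothetical stand-ins.)

#include <cstdint>

// Branch targets drop the low two bits, so the printed value is Imm * 4.
static int64_t branchByteOffset(int64_t Imm19) { return Imm19 * 4; }

// ADRP forms a 4 KiB-aligned address: the page containing PC plus Imm pages.
static uint64_t adrpTarget(uint64_t PC, int64_t Imm21) {
  return (PC & ~UINT64_C(0xfff)) + uint64_t(Imm21) * 4096;
}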
- O << *MI->getOperand(OpNum).getExpr(); -} - -void ARM64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - const char *Name = ARM64SYS::getBarrierOptName((ARM64SYS::BarrierOption)Val); - if (Name) - O << Name; - else - O << "#" << Val; -} - -void ARM64InstPrinter::printSystemRegister(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - const char *Name = - ARM64SYS::getSystemRegisterName((ARM64SYS::SystemRegister)Val); - if (Name) { - O << Name; - return; - } - - unsigned Op0 = 2 | ((Val >> 14) & 1); - unsigned Op1 = (Val >> 11) & 7; - unsigned CRn = (Val >> 7) & 0xf; - unsigned CRm = (Val >> 3) & 0xf; - unsigned Op2 = Val & 7; - - O << 'S' << Op0 << '_' << Op1 << "_C" << CRn << "_C" << CRm << '_' << Op2; -} - -void ARM64InstPrinter::printSystemCPSRField(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - const char *Name = ARM64SYS::getCPSRFieldName((ARM64SYS::CPSRField)Val); - O << Name; -} - -void ARM64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned RawVal = MI->getOperand(OpNo).getImm(); - uint64_t Val = ARM64_AM::decodeAdvSIMDModImmType10(RawVal); - O << format("#%#016llx", Val); -} diff --git a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h b/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h deleted file mode 100644 index ff66ff0..0000000 --- a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h +++ /dev/null @@ -1,157 +0,0 @@ -//===-- ARM64InstPrinter.h - Convert ARM64 MCInst to assembly syntax ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an ARM64 MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64INSTPRINTER_H -#define ARM64INSTPRINTER_H - -#include "MCTargetDesc/ARM64MCTargetDesc.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" - -namespace llvm { - -class MCOperand; - -class ARM64InstPrinter : public MCInstPrinter { -public: - ARM64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - - virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); - virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; - - // Autogenerated by tblgen. 
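(Aside, not part of the deleted file: when printSystemRegister has no name for a register it falls back to the generic S<op0>_<op1>_C<CRn>_C<CRm>_<op2> spelling, unpacking the fields from the printer's packed operand value. A sketch of the inverse packing as that code reads it; the packing is internal to this operand representation.)

#include <cstdint>

// Op0 is carried as a single bit because only S2_... and S3_... registers
// exist; the remaining fields sit at the offsets unpacked above.
static unsigned packSysReg(unsigned Op0, unsigned Op1, unsigned CRn,
                           unsigned CRm, unsigned Op2) {
  return ((Op0 & 1) << 14) | ((Op1 & 7) << 11) | ((CRn & 0xf) << 7) |
         ((CRm & 0xf) << 3) | (Op2 & 7);
}
// packSysReg(3, 3, 4, 2, 1) round-trips to the printed name "S3_3_C4_C2_1".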
- virtual void printInstruction(const MCInst *MI, raw_ostream &O); - virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O); - virtual StringRef getRegName(unsigned RegNo) const { - return getRegisterName(RegNo); - } - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = ARM64::NoRegAltName); - -protected: - bool printSysAlias(const MCInst *MI, raw_ostream &O); - // Operand printers - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, - raw_ostream &O); - void printPostIncOperand1(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand2(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand3(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand4(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand6(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand8(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand12(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand16(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand24(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand32(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand48(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand64(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSysCROperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printAddSubImm(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printLogicalImm32(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printShifter(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printShiftedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printExtendedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printDotCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printAlignedBranchTarget(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - void printAMIndexed(const MCInst *MI, unsigned OpNum, unsigned Scale, - raw_ostream &O); - void printAMIndexed128(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printAMIndexed(MI, OpNum, 16, O); - } - - void printAMIndexed64(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printAMIndexed(MI, OpNum, 8, O); - } - - void printAMIndexed32(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printAMIndexed(MI, OpNum, 4, O); - } - - void printAMIndexed16(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printAMIndexed(MI, OpNum, 2, O); - } - - void printAMIndexed8(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printAMIndexed(MI, OpNum, 1, O); - } - void printAMUnscaled(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printAMIndexed(MI, OpNum, 1, O); - } - void printAMNoIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printImmScale4(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printImmScale8(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printImmScale16(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printPrefetchOp(const MCInst *MI, unsigned 
OpNum, raw_ostream &O); - void printMemoryPostIndexed(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMemoryPostIndexed32(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - void printMemoryPostIndexed64(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - void printMemoryPostIndexed128(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - void printMemoryRegOffset(const MCInst *MI, unsigned OpNum, raw_ostream &O, - int LegalShiftAmt); - void printMemoryRegOffset8(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMemoryRegOffset16(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMemoryRegOffset32(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMemoryRegOffset64(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMemoryRegOffset128(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O, - StringRef LayoutSuffix); - - /// Print a list of vector registers where the type suffix is implicit - /// (i.e. attached to the instruction rather than the registers). - void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - template - void printTypedVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printAdrpLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printBarrierOption(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printSystemCPSRField(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); -}; - -class ARM64AppleInstPrinter : public ARM64InstPrinter { -public: - ARM64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - - virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); - - virtual void printInstruction(const MCInst *MI, raw_ostream &O); - virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O); - virtual StringRef getRegName(unsigned RegNo) const { - return getRegisterName(RegNo); - } - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = ARM64::NoRegAltName); -}; -} - -#endif diff --git a/lib/Target/ARM64/InstPrinter/CMakeLists.txt b/lib/Target/ARM64/InstPrinter/CMakeLists.txt deleted file mode 100644 index b8ee12c..0000000 --- a/lib/Target/ARM64/InstPrinter/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64AsmPrinter - ARM64InstPrinter.cpp - ) - -add_dependencies(LLVMARM64AsmPrinter ARM64CommonTableGen) diff --git a/lib/Target/ARM64/InstPrinter/LLVMBuild.txt b/lib/Target/ARM64/InstPrinter/LLVMBuild.txt deleted file mode 100644 index 2ec83d2..0000000 --- a/lib/Target/ARM64/InstPrinter/LLVMBuild.txt +++ /dev/null @@ -1,24 +0,0 @@ -;===- ./lib/Target/ARM64/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. 
-; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64AsmPrinter -parent = ARM64 -required_libraries = MC Support -add_to_library_groups = ARM64 - diff --git a/lib/Target/ARM64/InstPrinter/Makefile b/lib/Target/ARM64/InstPrinter/Makefile deleted file mode 100644 index a59efb0..0000000 --- a/lib/Target/ARM64/InstPrinter/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM64/AsmPrinter/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64AsmPrinter - -# Hack: we need to include 'main' arm target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/LLVMBuild.txt b/lib/Target/ARM64/LLVMBuild.txt deleted file mode 100644 index 45b0628..0000000 --- a/lib/Target/ARM64/LLVMBuild.txt +++ /dev/null @@ -1,36 +0,0 @@ -;===- ./lib/Target/ARM64/LLVMBuild.txt -------------------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[common] -subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo - -[component_0] -type = TargetGroup -name = ARM64 -parent = Target -has_asmparser = 1 -has_asmprinter = 1 -has_disassembler = 1 -has_jit = 1 - -[component_1] -type = Library -name = ARM64CodeGen -parent = ARM64 -required_libraries = ARM64AsmPrinter ARM64Desc ARM64Info Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target -add_to_library_groups = ARM64 - diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h b/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h deleted file mode 100644 index 7717743..0000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h +++ /dev/null @@ -1,758 +0,0 @@ -//===- ARM64AddressingModes.h - ARM64 Addressing Modes ----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 addressing mode implementation stuff. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64_ARM64ADDRESSINGMODES_H -#define LLVM_TARGET_ARM64_ARM64ADDRESSINGMODES_H - -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/APInt.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include <cassert> - -namespace llvm { - -/// ARM64_AM - ARM64 Addressing Mode Stuff -namespace ARM64_AM { - -//===----------------------------------------------------------------------===// -// Shifts -// - -enum ShiftType { - InvalidShift = -1, - LSL = 0, - LSR = 1, - ASR = 2, - ROR = 3, - MSL = 4 -}; - -/// getShiftName - Get the string encoding for the shift type. -static inline const char *getShiftName(ARM64_AM::ShiftType ST) { - switch (ST) { - default: assert(false && "unhandled shift type!"); - case ARM64_AM::LSL: return "lsl"; - case ARM64_AM::LSR: return "lsr"; - case ARM64_AM::ASR: return "asr"; - case ARM64_AM::ROR: return "ror"; - case ARM64_AM::MSL: return "msl"; - } - return 0; -} - -/// getShiftType - Extract the shift type. -static inline ARM64_AM::ShiftType getShiftType(unsigned Imm) { - return ARM64_AM::ShiftType((Imm >> 6) & 0x7); -} - -/// getShiftValue - Extract the shift value. -static inline unsigned getShiftValue(unsigned Imm) { - return Imm & 0x3f; -} - -/// getShifterImm - Encode the shift type and amount: -/// imm: 6-bit shift amount -/// shifter: 000 ==> lsl -/// 001 ==> lsr -/// 010 ==> asr -/// 011 ==> ror -/// 100 ==> msl -/// {8-6} = shifter -/// {5-0} = imm -static inline unsigned getShifterImm(ARM64_AM::ShiftType ST, unsigned Imm) { - assert((Imm & 0x3f) == Imm && "Illegal shifted immediate value!"); - return (unsigned(ST) << 6) | (Imm & 0x3f); -} - -//===----------------------------------------------------------------------===// -// Extends -// - -enum ExtendType { - InvalidExtend = -1, - UXTB = 0, - UXTH = 1, - UXTW = 2, - UXTX = 3, - SXTB = 4, - SXTH = 5, - SXTW = 6, - SXTX = 7 -}; - -/// getExtendName - Get the string encoding for the extend type. -static inline const char *getExtendName(ARM64_AM::ExtendType ET) { - switch (ET) { - default: assert(false && "unhandled extend type!"); - case ARM64_AM::UXTB: return "uxtb"; - case ARM64_AM::UXTH: return "uxth"; - case ARM64_AM::UXTW: return "uxtw"; - case ARM64_AM::UXTX: return "uxtx"; - case ARM64_AM::SXTB: return "sxtb"; - case ARM64_AM::SXTH: return "sxth"; - case ARM64_AM::SXTW: return "sxtw"; - case ARM64_AM::SXTX: return "sxtx"; - } - return 0; -} - -/// getArithShiftValue - get the arithmetic shift value. -static inline unsigned getArithShiftValue(unsigned Imm) { - return Imm & 0x7; -} - -/// getExtendType - Extract the extend type for operands of arithmetic ops. -static inline ARM64_AM::ExtendType getArithExtendType(unsigned Imm) { - return ARM64_AM::ExtendType((Imm >> 3) & 0x7); -} - -/// getArithExtendImm - Encode the extend type and shift amount for an -/// arithmetic instruction: -/// imm: 3-bit extend amount -/// shifter: 000 ==> uxtb -/// 001 ==> uxth -/// 010 ==> uxtw -/// 011 ==> uxtx -/// 100 ==> sxtb -/// 101 ==> sxth -/// 110 ==> sxtw -/// 111 ==> sxtx -/// {5-3} = shifter -/// {2-0} = imm3 -static inline unsigned getArithExtendImm(ARM64_AM::ExtendType ET, - unsigned Imm) { - assert((Imm & 0x7) == Imm && "Illegal shifted immediate value!"); - return (unsigned(ET) << 3) | (Imm & 0x7); -} - -/// getMemDoShift - Extract the "do shift" flag value for load/store -/// instructions.
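(Aside, not part of the deleted file: the shift and extend helpers above are simple pack/unpack pairs. A usage sketch, assuming this header is included, showing that getShifterImm round-trips through its accessors.)

#include <cassert>

static void shifterImmRoundTrip() {
  // {8-6} = shifter (000 = lsl), {5-0} = amount.
  unsigned Imm = llvm::ARM64_AM::getShifterImm(llvm::ARM64_AM::LSL, 12);
  assert(llvm::ARM64_AM::getShiftType(Imm) == llvm::ARM64_AM::LSL);
  assert(llvm::ARM64_AM::getShiftValue(Imm) == 12);
}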
-static inline bool getMemDoShift(unsigned Imm) { - return (Imm & 0x1) != 0; -} - -/// getExtendType - Extract the extend type for the offset operand of -/// loads/stores. -static inline ARM64_AM::ExtendType getMemExtendType(unsigned Imm) { - return ARM64_AM::ExtendType((Imm >> 1) & 0x7); -} - -/// getExtendImm - Encode the extend type and amount for a load/store inst: -/// doshift: should the offset be scaled by the access size -/// shifter: 000 ==> uxtb -/// 001 ==> uxth -/// 010 ==> uxtw -/// 011 ==> uxtx -/// 100 ==> sxtb -/// 101 ==> sxth -/// 110 ==> sxtw -/// 111 ==> sxtx -/// {3-1} = shifter -/// {0} = doshift -static inline unsigned getMemExtendImm(ARM64_AM::ExtendType ET, bool DoShift) { - return (unsigned(ET) << 1) | unsigned(DoShift); -} - -//===----------------------------------------------------------------------===// -// Prefetch -// - -/// Pre-fetch operator names. -/// The enum values match the encoding values: -/// prfop<4:3> 00=preload data, 10=prepare for store -/// prfop<2:1> 00=target L1 cache, 01=target L2 cache, 10=target L3 cache, -/// prfop<0> 0=non-streaming (temporal), 1=streaming (non-temporal) -enum PrefetchOp { - InvalidPrefetchOp = -1, - PLDL1KEEP = 0x00, - PLDL1STRM = 0x01, - PLDL2KEEP = 0x02, - PLDL2STRM = 0x03, - PLDL3KEEP = 0x04, - PLDL3STRM = 0x05, - PSTL1KEEP = 0x10, - PSTL1STRM = 0x11, - PSTL2KEEP = 0x12, - PSTL2STRM = 0x13, - PSTL3KEEP = 0x14, - PSTL3STRM = 0x15 -}; - -/// isNamedPrefetchOp - Check if the prefetch-op 5-bit value has a name. -static inline bool isNamedPrefetchOp(unsigned prfop) { - switch (prfop) { - default: return false; - case ARM64_AM::PLDL1KEEP: case ARM64_AM::PLDL1STRM: case ARM64_AM::PLDL2KEEP: - case ARM64_AM::PLDL2STRM: case ARM64_AM::PLDL3KEEP: case ARM64_AM::PLDL3STRM: - case ARM64_AM::PSTL1KEEP: case ARM64_AM::PSTL1STRM: case ARM64_AM::PSTL2KEEP: - case ARM64_AM::PSTL2STRM: case ARM64_AM::PSTL3KEEP: case ARM64_AM::PSTL3STRM: - return true; - } -} - - -/// getPrefetchOpName - Get the string encoding for the prefetch operator. -static inline const char *getPrefetchOpName(ARM64_AM::PrefetchOp prfop) { - switch (prfop) { - default: assert(false && "unhandled prefetch-op type!"); - case ARM64_AM::PLDL1KEEP: return "pldl1keep"; - case ARM64_AM::PLDL1STRM: return "pldl1strm"; - case ARM64_AM::PLDL2KEEP: return "pldl2keep"; - case ARM64_AM::PLDL2STRM: return "pldl2strm"; - case ARM64_AM::PLDL3KEEP: return "pldl3keep"; - case ARM64_AM::PLDL3STRM: return "pldl3strm"; - case ARM64_AM::PSTL1KEEP: return "pstl1keep"; - case ARM64_AM::PSTL1STRM: return "pstl1strm"; - case ARM64_AM::PSTL2KEEP: return "pstl2keep"; - case ARM64_AM::PSTL2STRM: return "pstl2strm"; - case ARM64_AM::PSTL3KEEP: return "pstl3keep"; - case ARM64_AM::PSTL3STRM: return "pstl3strm"; - } - return 0; -} - -static inline uint64_t ror(uint64_t elt, unsigned size) { - return ((elt & 1) << (size-1)) | (elt >> 1); -} - -/// processLogicalImmediate - Determine if an immediate value can be encoded -/// as the immediate operand of a logical instruction for the given register -/// size. If so, return true with "encoding" set to the encoded value in -/// the form N:immr:imms. -static inline bool processLogicalImmediate(uint64_t imm, unsigned regSize, - uint64_t &encoding) { - if (imm == 0ULL || imm == ~0ULL || - (regSize != 64 && (imm >> regSize != 0 || imm == ~0U))) - return false; - - unsigned size = 2; - uint64_t eltVal = imm; - - // First, determine the element size. 
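(Aside, not part of the deleted file: the prefetch-op values above are direct field encodings rather than an arbitrary enumeration. For example PSTL2STRM, 0x13, decomposes exactly as the enum's comment describes; a sketch of the field arithmetic.)

#include <cassert>

static void prfopFields() {
  unsigned prfop = 0x13;             // PSTL2STRM
  assert(((prfop >> 3) & 0x3) == 2); // prfop<4:3>: 10 = prepare for store
  assert(((prfop >> 1) & 0x3) == 1); // prfop<2:1>: 01 = target L2 cache
  assert((prfop & 0x1) == 1);        // prfop<0>:   1 = streaming
}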
- while (size < regSize) { - unsigned numElts = regSize / size; - unsigned mask = (1ULL << size) - 1; - uint64_t lowestEltVal = imm & mask; - - bool allMatched = true; - for (unsigned i = 1; i < numElts; ++i) { - uint64_t currEltVal = (imm >> (i*size)) & mask; - if (currEltVal != lowestEltVal) { - allMatched = false; - break; - } - } - - if (allMatched) { - eltVal = lowestEltVal; - break; - } - - size *= 2; - } - - // Second, determine the rotation to make the element be: 0^m 1^n. - for (unsigned i = 0; i < size; ++i) { - eltVal = ror(eltVal, size); - uint32_t clz = countLeadingZeros(eltVal) - (64 - size); - uint32_t cto = CountTrailingOnes_64(eltVal); - - if (clz + cto == size) { - // Encode in immr the number of RORs it would take to get *from* this - // element value to our target value, where i+1 is the number of RORs - // to go the opposite direction. - unsigned immr = size - (i + 1); - - // If size has a 1 in the n'th bit, create a value that has zeroes in - // bits [0, n] and ones above that. - uint64_t nimms = ~(size-1) << 1; - - // Or the CTO value into the low bits, which must be below the Nth bit - // bit mentioned above. - nimms |= (cto-1); - - // Extract the seventh bit and toggle it to create the N field. - unsigned N = ((nimms >> 6) & 1) ^ 1; - - encoding = (N << 12) | (immr << 6) | (nimms & 0x3f); - return true; - } - } - - return false; -} - -/// isLogicalImmediate - Return true if the immediate is valid for a logical -/// immediate instruction of the given register size. Return false otherwise. -static inline bool isLogicalImmediate(uint64_t imm, unsigned regSize) { - uint64_t encoding; - return processLogicalImmediate(imm, regSize, encoding); -} - -/// encodeLogicalImmediate - Return the encoded immediate value for a logical -/// immediate instruction of the given register size. -static inline uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize) { - uint64_t encoding = 0; - bool res = processLogicalImmediate(imm, regSize, encoding); - assert(res && "invalid logical immediate"); - (void)res; - return encoding; -} - -/// decodeLogicalImmediate - Decode a logical immediate value in the form -/// "N:immr:imms" (where the immr and imms fields are each 6 bits) into the -/// integer value it represents with regSize bits. -static inline uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize) { - // Extract the N, imms, and immr fields. - unsigned N = (val >> 12) & 1; - unsigned immr = (val >> 6) & 0x3f; - unsigned imms = val & 0x3f; - - assert((regSize == 64 || N == 0) && "undefined logical immediate encoding"); - int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f)); - assert(len >= 0 && "undefined logical immediate encoding"); - unsigned size = (1 << len); - unsigned R = immr & (size - 1); - unsigned S = imms & (size - 1); - assert(S != size - 1 && "undefined logical immediate encoding"); - uint64_t pattern = (1ULL << (S + 1)) - 1; - for (unsigned i = 0; i < R; ++i) - pattern = ror(pattern, size); - - // Replicate the pattern to fill the regSize. - while (size != regSize) { - pattern |= (pattern << size); - size *= 2; - } - return pattern; -} - -/// isValidDecodeLogicalImmediate - Check to see if the logical immediate value -/// in the form "N:immr:imms" (where the immr and imms fields are each 6 bits) -/// is a valid encoding for an integer value with regSize bits. -static inline bool isValidDecodeLogicalImmediate(uint64_t val, - unsigned regSize) { - // Extract the N and imms fields needed for checking. 
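(Aside, not part of the deleted file: a worked instance of the logical-immediate machinery above, assuming the helpers are in scope. 0x00ff00ff00ff00ff repeats a 16-bit element of eight ones, so it is encodable, and encode/decode round-trip.)

#include <cassert>
#include <cstdint>

static void logicalImmRoundTrip() {
  uint64_t Imm = 0x00ff00ff00ff00ffULL; // element 0x00ff, element size 16
  assert(llvm::ARM64_AM::isLogicalImmediate(Imm, 64));
  uint64_t Enc = llvm::ARM64_AM::encodeLogicalImmediate(Imm, 64);
  assert(llvm::ARM64_AM::decodeLogicalImmediate(Enc, 64) == Imm);
}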
- unsigned N = (val >> 12) & 1; - unsigned imms = val & 0x3f; - - if (regSize == 32 && N != 0) // undefined logical immediate encoding - return false; - int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f)); - if (len < 0) // undefined logical immediate encoding - return false; - unsigned size = (1 << len); - unsigned S = imms & (size - 1); - if (S == size - 1) // undefined logical immediate encoding - return false; - - return true; -} - -//===----------------------------------------------------------------------===// -// Floating-point Immediates -// -static inline float getFPImmFloat(unsigned Imm) { - // We expect an 8-bit binary encoding of a floating-point number here. - union { - uint32_t I; - float F; - } FPUnion; - - uint8_t Sign = (Imm >> 7) & 0x1; - uint8_t Exp = (Imm >> 4) & 0x7; - uint8_t Mantissa = Imm & 0xf; - - // 8-bit FP IEEE Float Encoding - // abcd efgh aBbbbbbc defgh000 00000000 00000000 - // - // where B = NOT(b); - - FPUnion.I = 0; - FPUnion.I |= Sign << 31; - FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30; - FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25; - FPUnion.I |= (Exp & 0x3) << 23; - FPUnion.I |= Mantissa << 19; - return FPUnion.F; -} - -/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit -/// floating-point value. If the value cannot be represented as an 8-bit -/// floating-point value, then return -1. -static inline int getFP32Imm(const APInt &Imm) { - uint32_t Sign = Imm.lshr(31).getZExtValue() & 1; - int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127 - int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits - - // We can handle 4 bits of mantissa. - // mantissa = (16+UInt(e:f:g:h))/16. - if (Mantissa & 0x7ffff) - return -1; - Mantissa >>= 19; - if ((Mantissa & 0xf) != Mantissa) - return -1; - - // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 - if (Exp < -3 || Exp > 4) - return -1; - Exp = ((Exp+3) & 0x7) ^ 4; - - return ((int)Sign << 7) | (Exp << 4) | Mantissa; -} - -static inline int getFP32Imm(const APFloat &FPImm) { - return getFP32Imm(FPImm.bitcastToAPInt()); -} - -/// getFP64Imm - Return an 8-bit floating-point version of the 64-bit -/// floating-point value. If the value cannot be represented as an 8-bit -/// floating-point value, then return -1. -static inline int getFP64Imm(const APInt &Imm) { - uint64_t Sign = Imm.lshr(63).getZExtValue() & 1; - int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023 - uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL; - - // We can handle 4 bits of mantissa. - // mantissa = (16+UInt(e:f:g:h))/16.
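(Aside, not part of the deleted file: a worked instance of the 8-bit FP encoding above. 1.0f has sign 0, exponent 0 and a zero 4-bit mantissa, which the exponent bias trick maps to the encoding 0x70, and both directions agree. A sketch, assuming the helpers and llvm/ADT/APFloat.h are available.)

#include <cassert>
#include "llvm/ADT/APFloat.h"

static void fpImm8RoundTrip() {
  // sign=0, exp field=((0+3)&7)^4=7, mantissa=0  =>  (7 << 4) == 0x70.
  assert(llvm::ARM64_AM::getFP32Imm(llvm::APFloat(1.0f)) == 0x70);
  assert(llvm::ARM64_AM::getFPImmFloat(0x70) == 1.0f);
}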
- if (Mantissa & 0xffffffffffffULL) - return -1; - Mantissa >>= 48; - if ((Mantissa & 0xf) != Mantissa) - return -1; - - // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 - if (Exp < -3 || Exp > 4) - return -1; - Exp = ((Exp+3) & 0x7) ^ 4; - - return ((int)Sign << 7) | (Exp << 4) | Mantissa; -} - -static inline int getFP64Imm(const APFloat &FPImm) { - return getFP64Imm(FPImm.bitcastToAPInt()); -} - -//===--------------------------------------------------------------------===// -// AdvSIMD Modified Immediates -//===--------------------------------------------------------------------===// - -// 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh -static inline bool isAdvSIMDModImmType1(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xffffff00ffffff00ULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType1(uint64_t Imm) { - return (Imm & 0xffULL); -} - -static inline uint64_t decodeAdvSIMDModImmType1(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 32) | EncVal; -} - -// 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 -static inline bool isAdvSIMDModImmType2(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xffff00ffffff00ffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType2(uint64_t Imm) { - return (Imm & 0xff00ULL) >> 8; -} - -static inline uint64_t decodeAdvSIMDModImmType2(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 40) | (EncVal << 8); -} - -// 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 -static inline bool isAdvSIMDModImmType3(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xff00ffffff00ffffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType3(uint64_t Imm) { - return (Imm & 0xff0000ULL) >> 16; -} - -static inline uint64_t decodeAdvSIMDModImmType3(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 48) | (EncVal << 16); -} - -// abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 -static inline bool isAdvSIMDModImmType4(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0x00ffffff00ffffffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType4(uint64_t Imm) { - return (Imm & 0xff000000ULL) >> 24; -} - -static inline uint64_t decodeAdvSIMDModImmType4(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 56) | (EncVal << 24); -} - -// 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh -static inline bool isAdvSIMDModImmType5(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - (((Imm & 0x00ff0000ULL) >> 16) == (Imm & 0x000000ffULL)) && - ((Imm & 0xff00ff00ff00ff00ULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType5(uint64_t Imm) { - return (Imm & 0xffULL); -} - -static inline uint64_t decodeAdvSIMDModImmType5(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 48) | (EncVal << 32) | (EncVal << 16) | EncVal; -} - -// abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 -static inline bool isAdvSIMDModImmType6(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - (((Imm & 0xff000000ULL) >> 16) == (Imm & 0x0000ff00ULL)) && - ((Imm & 0x00ff00ff00ff00ffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType6(uint64_t Imm) { - return (Imm & 0xff00ULL) >> 8; -} - -static inline uint64_t decodeAdvSIMDModImmType6(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 56) | (EncVal << 40) | (EncVal << 24) | (EncVal << 8); -} - -// 0x00 0x00 abcdefgh 0xFF 0x00 0x00 abcdefgh 0xFF -static inline bool isAdvSIMDModImmType7(uint64_t 
Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xffff00ffffff00ffULL) == 0x000000ff000000ffULL); -} - -static inline uint8_t encodeAdvSIMDModImmType7(uint64_t Imm) { - return (Imm & 0xff00ULL) >> 8; -} - -static inline uint64_t decodeAdvSIMDModImmType7(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 40) | (EncVal << 8) | 0x000000ff000000ffULL; -} - -// 0x00 abcdefgh 0xFF 0xFF 0x00 abcdefgh 0xFF 0xFF -static inline bool isAdvSIMDModImmType8(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xff00ffffff00ffffULL) == 0x0000ffff0000ffffULL); -} - -static inline uint64_t decodeAdvSIMDModImmType8(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 48) | (EncVal << 16) | 0x0000ffff0000ffffULL; -} - -static inline uint8_t encodeAdvSIMDModImmType8(uint64_t Imm) { - return (Imm & 0x00ff0000ULL) >> 16; -} - -// abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh -static inline bool isAdvSIMDModImmType9(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm >> 48) == (Imm & 0x0000ffffULL)) && - ((Imm >> 56) == (Imm & 0x000000ffULL)); -} - -static inline uint8_t encodeAdvSIMDModImmType9(uint64_t Imm) { - return (Imm & 0xffULL); -} - -static inline uint64_t decodeAdvSIMDModImmType9(uint8_t Imm) { - uint64_t EncVal = Imm; - EncVal |= (EncVal << 8); - EncVal |= (EncVal << 16); - EncVal |= (EncVal << 32); - return EncVal; -} - -// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh -// cmode: 1110, op: 1 -static inline bool isAdvSIMDModImmType10(uint64_t Imm) { - uint64_t ByteA = Imm & 0xff00000000000000ULL; - uint64_t ByteB = Imm & 0x00ff000000000000ULL; - uint64_t ByteC = Imm & 0x0000ff0000000000ULL; - uint64_t ByteD = Imm & 0x000000ff00000000ULL; - uint64_t ByteE = Imm & 0x00000000ff000000ULL; - uint64_t ByteF = Imm & 0x0000000000ff0000ULL; - uint64_t ByteG = Imm & 0x000000000000ff00ULL; - uint64_t ByteH = Imm & 0x00000000000000ffULL; - - return (ByteA == 0ULL || ByteA == 0xff00000000000000ULL) && - (ByteB == 0ULL || ByteB == 0x00ff000000000000ULL) && - (ByteC == 0ULL || ByteC == 0x0000ff0000000000ULL) && - (ByteD == 0ULL || ByteD == 0x000000ff00000000ULL) && - (ByteE == 0ULL || ByteE == 0x00000000ff000000ULL) && - (ByteF == 0ULL || ByteF == 0x0000000000ff0000ULL) && - (ByteG == 0ULL || ByteG == 0x000000000000ff00ULL) && - (ByteH == 0ULL || ByteH == 0x00000000000000ffULL); -} - -static inline uint8_t encodeAdvSIMDModImmType10(uint64_t Imm) { - uint8_t BitA = (Imm & 0xff00000000000000ULL) != 0; - uint8_t BitB = (Imm & 0x00ff000000000000ULL) != 0; - uint8_t BitC = (Imm & 0x0000ff0000000000ULL) != 0; - uint8_t BitD = (Imm & 0x000000ff00000000ULL) != 0; - uint8_t BitE = (Imm & 0x00000000ff000000ULL) != 0; - uint8_t BitF = (Imm & 0x0000000000ff0000ULL) != 0; - uint8_t BitG = (Imm & 0x000000000000ff00ULL) != 0; - uint8_t BitH = (Imm & 0x00000000000000ffULL) != 0; - - uint8_t EncVal = BitA; - EncVal <<= 1; - EncVal |= BitB; - EncVal <<= 1; - EncVal |= BitC; - EncVal <<= 1; - EncVal |= BitD; - EncVal <<= 1; - EncVal |= BitE; - EncVal <<= 1; - EncVal |= BitF; - EncVal <<= 1; - EncVal |= BitG; - EncVal <<= 1; - EncVal |= BitH; - return EncVal; -} - -static inline uint64_t decodeAdvSIMDModImmType10(uint8_t Imm) { - uint64_t EncVal = 0; - if (Imm & 0x80) EncVal |= 0xff00000000000000ULL; - if (Imm & 0x40) EncVal |= 0x00ff000000000000ULL; - if (Imm & 0x20) EncVal |= 0x0000ff0000000000ULL; - if (Imm & 0x10) EncVal |= 0x000000ff00000000ULL; - if (Imm & 0x08) EncVal |= 0x00000000ff000000ULL; - 
if (Imm & 0x04) EncVal |= 0x0000000000ff0000ULL; - if (Imm & 0x02) EncVal |= 0x000000000000ff00ULL; - if (Imm & 0x01) EncVal |= 0x00000000000000ffULL; - return EncVal; -} - -// aBbbbbbc defgh000 0x00 0x00 aBbbbbbc defgh000 0x00 0x00 -static inline bool isAdvSIMDModImmType11(uint64_t Imm) { - uint64_t BString = (Imm & 0x7E000000ULL) >> 25; - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - (BString == 0x1f || BString == 0x20) && - ((Imm & 0x0007ffff0007ffffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType11(uint64_t Imm) { - uint8_t BitA = (Imm & 0x80000000ULL) != 0; - uint8_t BitB = (Imm & 0x20000000ULL) != 0; - uint8_t BitC = (Imm & 0x01000000ULL) != 0; - uint8_t BitD = (Imm & 0x00800000ULL) != 0; - uint8_t BitE = (Imm & 0x00400000ULL) != 0; - uint8_t BitF = (Imm & 0x00200000ULL) != 0; - uint8_t BitG = (Imm & 0x00100000ULL) != 0; - uint8_t BitH = (Imm & 0x00080000ULL) != 0; - - uint8_t EncVal = BitA; - EncVal <<= 1; - EncVal |= BitB; - EncVal <<= 1; - EncVal |= BitC; - EncVal <<= 1; - EncVal |= BitD; - EncVal <<= 1; - EncVal |= BitE; - EncVal <<= 1; - EncVal |= BitF; - EncVal <<= 1; - EncVal |= BitG; - EncVal <<= 1; - EncVal |= BitH; - return EncVal; -} - -static inline uint64_t decodeAdvSIMDModImmType11(uint8_t Imm) { - uint64_t EncVal = 0; - if (Imm & 0x80) EncVal |= 0x80000000ULL; - if (Imm & 0x40) EncVal |= 0x3e000000ULL; - else EncVal |= 0x40000000ULL; - if (Imm & 0x20) EncVal |= 0x01000000ULL; - if (Imm & 0x10) EncVal |= 0x00800000ULL; - if (Imm & 0x08) EncVal |= 0x00400000ULL; - if (Imm & 0x04) EncVal |= 0x00200000ULL; - if (Imm & 0x02) EncVal |= 0x00100000ULL; - if (Imm & 0x01) EncVal |= 0x00080000ULL; - return (EncVal << 32) | EncVal; -} - -// aBbbbbbb bbcdefgh 0x00 0x00 0x00 0x00 0x00 0x00 -static inline bool isAdvSIMDModImmType12(uint64_t Imm) { - uint64_t BString = (Imm & 0x7fc0000000000000ULL) >> 54; - return ((BString == 0xff || BString == 0x100) && - ((Imm & 0x0000ffffffffffffULL) == 0)); -} - -static inline uint8_t encodeAdvSIMDModImmType12(uint64_t Imm) { - uint8_t BitA = (Imm & 0x8000000000000000ULL) != 0; - uint8_t BitB = (Imm & 0x0040000000000000ULL) != 0; - uint8_t BitC = (Imm & 0x0020000000000000ULL) != 0; - uint8_t BitD = (Imm & 0x0010000000000000ULL) != 0; - uint8_t BitE = (Imm & 0x0008000000000000ULL) != 0; - uint8_t BitF = (Imm & 0x0004000000000000ULL) != 0; - uint8_t BitG = (Imm & 0x0002000000000000ULL) != 0; - uint8_t BitH = (Imm & 0x0001000000000000ULL) != 0; - - uint8_t EncVal = BitA; - EncVal <<= 1; - EncVal |= BitB; - EncVal <<= 1; - EncVal |= BitC; - EncVal <<= 1; - EncVal |= BitD; - EncVal <<= 1; - EncVal |= BitE; - EncVal <<= 1; - EncVal |= BitF; - EncVal <<= 1; - EncVal |= BitG; - EncVal <<= 1; - EncVal |= BitH; - return EncVal; -} - -static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) { - uint64_t EncVal = 0; - if (Imm & 0x80) EncVal |= 0x8000000000000000ULL; - if (Imm & 0x40) EncVal |= 0x3fc0000000000000ULL; - else EncVal |= 0x4000000000000000ULL; - if (Imm & 0x20) EncVal |= 0x0020000000000000ULL; - if (Imm & 0x10) EncVal |= 0x0010000000000000ULL; - if (Imm & 0x08) EncVal |= 0x0008000000000000ULL; - if (Imm & 0x04) EncVal |= 0x0004000000000000ULL; - if (Imm & 0x02) EncVal |= 0x0002000000000000ULL; - if (Imm & 0x01) EncVal |= 0x0001000000000000ULL; - return (EncVal << 32) | EncVal; -} - -} // end namespace ARM64_AM - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp deleted file mode 100644 index 26813e2..0000000 --- 
a/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp +++ /dev/null @@ -1,533 +0,0 @@ -//===-- ARM64AsmBackend.cpp - ARM64 Assembler Backend ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64RegisterInfo.h" -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "llvm/ADT/Triple.h" -#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCDirectives.h" -#include "llvm/MC/MCFixupKindInfo.h" -#include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCSectionMachO.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachO.h" -using namespace llvm; - -namespace { - -class ARM64AsmBackend : public MCAsmBackend { - static const unsigned PCRelFlagVal = - MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; - -public: - ARM64AsmBackend(const Target &T) : MCAsmBackend() {} - - unsigned getNumFixupKinds() const { return ARM64::NumTargetFixupKinds; } - - const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const { - const static MCFixupKindInfo Infos[ARM64::NumTargetFixupKinds] = { - // This table *must* be in the order that the fixup_* kinds are defined in - // ARM64FixupKinds.h. - // - // Name Offset (bits) Size (bits) Flags - { "fixup_arm64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, - { "fixup_arm64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, - { "fixup_arm64_add_imm12", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale1", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale2", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale4", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale8", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale16", 10, 12, 0 }, - { "fixup_arm64_movw", 5, 16, 0 }, - { "fixup_arm64_pcrel_branch14", 5, 14, PCRelFlagVal }, - { "fixup_arm64_pcrel_imm19", 5, 19, PCRelFlagVal }, - { "fixup_arm64_pcrel_branch26", 0, 26, PCRelFlagVal }, - { "fixup_arm64_pcrel_call26", 0, 26, PCRelFlagVal }, - { "fixup_arm64_tlsdesc_call", 0, 0, 0 } - }; - - if (Kind < FirstTargetFixupKind) - return MCAsmBackend::getFixupKindInfo(Kind); - - assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && - "Invalid kind!"); - return Infos[Kind - FirstTargetFixupKind]; - } - - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const; - - bool mayNeedRelaxation(const MCInst &Inst) const; - bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const; - void relaxInstruction(const MCInst &Inst, MCInst &Res) const; - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const; - - void HandleAssemblerFlag(MCAssemblerFlag Flag) {} - - unsigned getPointerSize() const { return 8; } -}; - -} // end anonymous namespace - -/// \brief The number of bytes the fixup may change. 
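(Aside, not part of the deleted file: the fixup table above records, for each kind, where the field starts, its TargetOffset, and how wide it is; getFixupKindNumBytes below is just the byte count that span covers once shifted into place. For instance fixup_arm64_pcrel_branch14 occupies bits [18:5], i.e. three bytes. The movw entry is the one outlier, and adjustFixupValue below rejects it as unresolvable anyway. The arithmetic:)

// Bytes touched by a field of SizeBits starting at bit OffsetBits.
static unsigned bytesTouched(unsigned OffsetBits, unsigned SizeBits) {
  return (OffsetBits + SizeBits + 7) / 8; // (5 + 14 + 7) / 8 == 3
}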
-static unsigned getFixupKindNumBytes(unsigned Kind) { - switch (Kind) { - default: - assert(0 && "Unknown fixup kind!"); - - case ARM64::fixup_arm64_tlsdesc_call: - return 0; - - case FK_Data_1: - return 1; - - case FK_Data_2: - case ARM64::fixup_arm64_movw: - return 2; - - case ARM64::fixup_arm64_pcrel_branch14: - case ARM64::fixup_arm64_add_imm12: - case ARM64::fixup_arm64_ldst_imm12_scale1: - case ARM64::fixup_arm64_ldst_imm12_scale2: - case ARM64::fixup_arm64_ldst_imm12_scale4: - case ARM64::fixup_arm64_ldst_imm12_scale8: - case ARM64::fixup_arm64_ldst_imm12_scale16: - case ARM64::fixup_arm64_pcrel_imm19: - return 3; - - case ARM64::fixup_arm64_pcrel_adr_imm21: - case ARM64::fixup_arm64_pcrel_adrp_imm21: - case ARM64::fixup_arm64_pcrel_branch26: - case ARM64::fixup_arm64_pcrel_call26: - case FK_Data_4: - return 4; - - case FK_Data_8: - return 8; - } -} - -static unsigned AdrImmBits(unsigned Value) { - unsigned lo2 = Value & 0x3; - unsigned hi19 = (Value & 0x1ffffc) >> 2; - return (hi19 << 5) | (lo2 << 29); -} - -static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { - int64_t SignedValue = static_cast(Value); - switch (Kind) { - default: - assert(false && "Unknown fixup kind!"); - case ARM64::fixup_arm64_pcrel_adr_imm21: - if (SignedValue > 2097151 || SignedValue < -2097152) - report_fatal_error("fixup value out of range"); - return AdrImmBits(Value & 0x1fffffULL); - case ARM64::fixup_arm64_pcrel_adrp_imm21: - return AdrImmBits((Value & 0x1fffff000ULL) >> 12); - case ARM64::fixup_arm64_pcrel_imm19: - // Signed 21-bit immediate - if (SignedValue > 2097151 || SignedValue < -2097152) - report_fatal_error("fixup value out of range"); - // Low two bits are not encoded. - return (Value >> 2) & 0x7ffff; - case ARM64::fixup_arm64_add_imm12: - case ARM64::fixup_arm64_ldst_imm12_scale1: - // Unsigned 12-bit immediate - if (Value >= 0x1000) - report_fatal_error("invalid imm12 fixup value"); - return Value; - case ARM64::fixup_arm64_ldst_imm12_scale2: - // Unsigned 12-bit immediate which gets multiplied by 2 - if (Value & 1 || Value >= 0x2000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 1; - case ARM64::fixup_arm64_ldst_imm12_scale4: - // Unsigned 12-bit immediate which gets multiplied by 4 - if (Value & 3 || Value >= 0x4000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 2; - case ARM64::fixup_arm64_ldst_imm12_scale8: - // Unsigned 12-bit immediate which gets multiplied by 8 - if (Value & 7 || Value >= 0x8000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 3; - case ARM64::fixup_arm64_ldst_imm12_scale16: - // Unsigned 12-bit immediate which gets multiplied by 16 - if (Value & 15 || Value >= 0x10000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 4; - case ARM64::fixup_arm64_movw: - report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet"); - return Value; - case ARM64::fixup_arm64_pcrel_branch14: - // Signed 16-bit immediate - if (SignedValue > 32767 || SignedValue < -32768) - report_fatal_error("fixup value out of range"); - // Low two bits are not encoded (4-byte alignment assumed). - if (Value & 0x3) - report_fatal_error("fixup not sufficiently aligned"); - return (Value >> 2) & 0x3fff; - case ARM64::fixup_arm64_pcrel_branch26: - case ARM64::fixup_arm64_pcrel_call26: - // Signed 28-bit immediate - if (SignedValue > 134217727 || SignedValue < -134217728) - report_fatal_error("fixup value out of range"); - // Low two bits are not encoded (4-byte alignment assumed). 
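(Aside, not part of the deleted file: AdrImmBits above scatters the 21-bit ADR/ADRP immediate into its two instruction fields, the low two bits into bits [30:29] and the upper 19 into bits [23:5]. A sketch of the inverse gather, for reference.)

#include <cstdint>

static uint32_t adrImmGather(uint32_t InsnFields) {
  uint32_t lo2 = (InsnFields >> 29) & 0x3;
  uint32_t hi19 = (InsnFields >> 5) & 0x7ffff;
  return (hi19 << 2) | lo2; // recovers the original 21-bit immediate
}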
- if (Value & 0x3) - report_fatal_error("fixup not sufficiently aligned"); - return (Value >> 2) & 0x3ffffff; - case FK_Data_1: - case FK_Data_2: - case FK_Data_4: - case FK_Data_8: - return Value; - } -} - -void ARM64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { - unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - if (!Value) - return; // Doesn't change encoding. - MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); - // Apply any target-specific value adjustments. - Value = adjustFixupValue(Fixup.getKind(), Value); - - // Shift the value into position. - Value <<= Info.TargetOffset; - - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); - - // For each byte of the fragment that the fixup touches, mask in the - // bits from the fixup value. - for (unsigned i = 0; i != NumBytes; ++i) - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); -} - -bool ARM64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { - return false; -} - -bool ARM64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const { - // FIXME: This isn't correct for ARM64. Just moving the "generic" logic - // into the targets for now. - // - // Relax if the value is too big for a (signed) i8. - return int64_t(Value) != int64_t(int8_t(Value)); -} - -void ARM64AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { - assert(false && "ARM64AsmBackend::relaxInstruction() unimplemented"); -} - -bool ARM64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - // If the count is not 4-byte aligned, we must be writing data into the text - // section (otherwise we have unaligned instructions, and thus have far - // bigger problems), so just write zeros instead. - if ((Count & 3) != 0) { - for (uint64_t i = 0, e = (Count & 3); i != e; ++i) - OW->Write8(0); - } - - // We are properly aligned, so write NOPs as requested. - Count /= 4; - for (uint64_t i = 0; i != Count; ++i) - OW->Write32(0xd503201f); - return true; -} - -namespace { - -namespace CU { - -/// \brief Compact unwind encoding values. -enum CompactUnwindEncodings { - /// \brief A "frameless" leaf function, where no non-volatile registers are - /// saved. The return remains in LR throughout the function. - UNWIND_ARM64_MODE_FRAMELESS = 0x02000000, - - /// \brief No compact unwind encoding available. Instead the low 23 bits of - /// the compact unwind encoding are the offset of the DWARF FDE in the - /// __eh_frame section. This mode is never used in object files. It is only - /// generated by the linker in final linked images, which have only DWARF info - /// for a function. - UNWIND_ARM64_MODE_DWARF = 0x03000000, - - /// \brief This is a standard arm64 prologue where FP/LR are immediately - /// pushed on the stack, then SP is copied to FP. If there are any - /// non-volatile registers saved, they are copied into the stack frame in pairs - /// in a contiguous range right below the saved FP/LR pair. Any subset of the - /// five X pairs and four D pairs can be saved, but the memory layout must be - /// in register number order. - UNWIND_ARM64_MODE_FRAME = 0x04000000, - - /// \brief Frame register pair encodings.
-  UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001,
-  UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002,
-  UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004,
-  UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008,
-  UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010,
-  UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100,
-  UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200,
-  UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400,
-  UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800
-};
-
-} // end CU namespace
-
-// FIXME: This should be in a separate file.
-class DarwinARM64AsmBackend : public ARM64AsmBackend {
-  const MCRegisterInfo &MRI;
-
-  /// \brief Encode compact unwind stack adjustment for frameless functions.
-  /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
-  /// The stack size always needs to be 16 byte aligned.
-  uint32_t encodeStackAdjustment(uint32_t StackSize) const {
-    return (StackSize / 16) << 12;
-  }
-
-public:
-  DarwinARM64AsmBackend(const Target &T, const MCRegisterInfo &MRI)
-      : ARM64AsmBackend(T), MRI(MRI) {}
-
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createARM64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64,
-                                       MachO::CPU_SUBTYPE_ARM64_ALL);
-  }
-
-  virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
-    // Any section for which the linker breaks things into atoms needs to
-    // preserve symbols, including assembler local symbols, to identify
-    // those atoms. These sections are:
-    // Sections of type:
-    //
-    //   S_CSTRING_LITERALS (e.g. __cstring)
-    //   S_LITERAL_POINTERS (e.g. objc selector pointers)
-    //   S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS
-    //
-    // Sections named:
-    //
-    //   __TEXT,__eh_frame
-    //   __TEXT,__ustring
-    //   __DATA,__cfstring
-    //   __DATA,__objc_classrefs
-    //   __DATA,__objc_catlist
-    //
-    // FIXME: It would be better if the compiler used actual linker local
-    // symbols for each of these sections rather than preserving what
-    // are ostensibly assembler local symbols.
-    const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section);
-    return (SMO.getType() == MachO::S_CSTRING_LITERALS ||
-            SMO.getType() == MachO::S_4BYTE_LITERALS ||
-            SMO.getType() == MachO::S_8BYTE_LITERALS ||
-            SMO.getType() == MachO::S_16BYTE_LITERALS ||
-            SMO.getType() == MachO::S_LITERAL_POINTERS ||
-            (SMO.getSegmentName() == "__TEXT" &&
-             (SMO.getSectionName() == "__eh_frame" ||
-              SMO.getSectionName() == "__ustring")) ||
-            (SMO.getSegmentName() == "__DATA" &&
-             (SMO.getSectionName() == "__cfstring" ||
-              SMO.getSectionName() == "__objc_classrefs" ||
-              SMO.getSectionName() == "__objc_catlist")));
-  }
-
-  /// \brief Generate the compact unwind encoding from the CFI directives.
-  virtual uint32_t
-  generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const
-      override {
-    if (Instrs.empty())
-      return CU::UNWIND_ARM64_MODE_FRAMELESS;
-
-    bool HasFP = false;
-    unsigned StackSize = 0;
-
-    uint32_t CompactUnwindEncoding = 0;
-    for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
-      const MCCFIInstruction &Inst = Instrs[i];
-
-      switch (Inst.getOperation()) {
-      default:
-        // Cannot handle this directive: bail out.
-        return CU::UNWIND_ARM64_MODE_DWARF;
-      case MCCFIInstruction::OpDefCfa: {
-        // Defines a frame pointer.
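// Editor's note (not part of the original patch): the code below assumes the
// standard three-directive prologue emitted for a framed function, e.g.
// (illustrative directive text):
//
//   .cfi_def_cfa w29, 16    // handled here (OpDefCfa)
//   .cfi_offset w30, -8     // LR push, consumed as Instrs[i + 1]
//   .cfi_offset w29, -16    // FP push, consumed as Instrs[i + 2]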
- assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) == - ARM64::FP && - "Invalid frame pointer!"); - assert(i + 2 < e && "Insufficient CFI instructions to define a frame!"); - - const MCCFIInstruction &LRPush = Instrs[++i]; - assert(LRPush.getOperation() == MCCFIInstruction::OpOffset && - "Link register not pushed!"); - const MCCFIInstruction &FPPush = Instrs[++i]; - assert(FPPush.getOperation() == MCCFIInstruction::OpOffset && - "Frame pointer not pushed!"); - - unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true); - unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true); - - LRReg = getXRegFromWReg(LRReg); - FPReg = getXRegFromWReg(FPReg); - - assert(LRReg == ARM64::LR && FPReg == ARM64::FP && - "Pushing invalid registers for frame!"); - - // Indicate that the function has a frame. - CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME; - HasFP = true; - break; - } - case MCCFIInstruction::OpDefCfaOffset: { - assert(StackSize == 0 && "We already have the CFA offset!"); - StackSize = std::abs(Inst.getOffset()); - break; - } - case MCCFIInstruction::OpOffset: { - // Registers are saved in pairs. We expect there to be two consecutive - // `.cfi_offset' instructions with the appropriate registers specified. - unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); - if (i + 1 == e) - return CU::UNWIND_ARM64_MODE_DWARF; - - const MCCFIInstruction &Inst2 = Instrs[++i]; - if (Inst2.getOperation() != MCCFIInstruction::OpOffset) - return CU::UNWIND_ARM64_MODE_DWARF; - unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); - - // N.B. The encodings must be in register number order, and the X - // registers before the D registers. - - // X19/X20 pair = 0x00000001, - // X21/X22 pair = 0x00000002, - // X23/X24 pair = 0x00000004, - // X25/X26 pair = 0x00000008, - // X27/X28 pair = 0x00000010 - Reg1 = getXRegFromWReg(Reg1); - Reg2 = getXRegFromWReg(Reg2); - - if (Reg1 == ARM64::X19 && Reg2 == ARM64::X20 && - (CompactUnwindEncoding & 0xF1E) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X19_X20_PAIR; - else if (Reg1 == ARM64::X21 && Reg2 == ARM64::X22 && - (CompactUnwindEncoding & 0xF1C) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X21_X22_PAIR; - else if (Reg1 == ARM64::X23 && Reg2 == ARM64::X24 && - (CompactUnwindEncoding & 0xF18) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X23_X24_PAIR; - else if (Reg1 == ARM64::X25 && Reg2 == ARM64::X26 && - (CompactUnwindEncoding & 0xF10) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X25_X26_PAIR; - else if (Reg1 == ARM64::X27 && Reg2 == ARM64::X28 && - (CompactUnwindEncoding & 0xF00) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X27_X28_PAIR; - else { - Reg1 = getDRegFromBReg(Reg1); - Reg2 = getDRegFromBReg(Reg2); - - // D8/D9 pair = 0x00000100, - // D10/D11 pair = 0x00000200, - // D12/D13 pair = 0x00000400, - // D14/D15 pair = 0x00000800 - if (Reg1 == ARM64::D8 && Reg2 == ARM64::D9 && - (CompactUnwindEncoding & 0xE00) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D8_D9_PAIR; - else if (Reg1 == ARM64::D10 && Reg2 == ARM64::D11 && - (CompactUnwindEncoding & 0xC00) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D10_D11_PAIR; - else if (Reg1 == ARM64::D12 && Reg2 == ARM64::D13 && - (CompactUnwindEncoding & 0x800) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D12_D13_PAIR; - else if (Reg1 == ARM64::D14 && Reg2 == ARM64::D15) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D14_D15_PAIR; - else - // A pair was pushed which we cannot handle. 
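// Editor's note (not part of the original patch): the mask tests above
// enforce the "register number order" rule. For example, the X21/X22 pair is
// only accepted while (CompactUnwindEncoding & 0xF1C) == 0, i.e. no
// higher-numbered X pair and no D pair has been recorded yet (the lower
// X19/X20 bit is allowed). A prologue that saved X23/X24 before X21/X22
// therefore falls through to the DWARF fallback below.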
- return CU::UNWIND_ARM64_MODE_DWARF; - } - - break; - } - } - } - - if (!HasFP) { - // With compact unwind info we can only represent stack adjustments of up - // to 65520 bytes. - if (StackSize > 65520) - return CU::UNWIND_ARM64_MODE_DWARF; - - CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAMELESS; - CompactUnwindEncoding |= encodeStackAdjustment(StackSize); - } - - return CompactUnwindEncoding; - } -}; - -} // end anonymous namespace - -namespace { - -class ELFARM64AsmBackend : public ARM64AsmBackend { -public: - uint8_t OSABI; - - ELFARM64AsmBackend(const Target &T, uint8_t OSABI) - : ARM64AsmBackend(T), OSABI(OSABI) {} - - MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createARM64ELFObjectWriter(OS, OSABI); - } - - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; -}; - -void ELFARM64AsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, - const MCFragment *DF, - const MCValue &Target, - uint64_t &Value, bool &IsResolved) { - // The ADRP instruction adds some multiple of 0x1000 to the current PC & - // ~0xfff. This means that the required offset to reach a symbol can vary by - // up to one step depending on where the ADRP is in memory. For example: - // - // ADRP x0, there - // there: - // - // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and - // we'll need that as an offset. At any other address "there" will be in the - // same page as the ADRP and the instruction should encode 0x0. Assuming the - // section isn't 0x1000-aligned, we therefore need to delegate this decision - // to the linker -- a relocation! - if ((uint32_t)Fixup.getKind() == ARM64::fixup_arm64_pcrel_adrp_imm21) - IsResolved = false; -} -} - -MCAsmBackend *llvm::createARM64AsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { - Triple TheTriple(TT); - - if (TheTriple.isOSDarwin()) - return new DarwinARM64AsmBackend(T, MRI); - - assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target"); - return new ELFARM64AsmBackend(T, TheTriple.getOS()); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64BaseInfo.h b/lib/Target/ARM64/MCTargetDesc/ARM64BaseInfo.h deleted file mode 100644 index d3c2cf7..0000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64BaseInfo.h +++ /dev/null @@ -1,998 +0,0 @@ -//===-- ARM64BaseInfo.h - Top level definitions for ARM64 -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains small standalone helper functions and enum definitions for -// the ARM64 target useful for the compiler back-end and the MC libraries. -// As such, it deliberately does not include references to LLVM core -// code gen types, passes, etc.. 
-// -//===----------------------------------------------------------------------===// - -#ifndef ARM64BASEINFO_H -#define ARM64BASEINFO_H - -#include "ARM64MCTargetDesc.h" -#include "llvm/Support/ErrorHandling.h" - -namespace llvm { - -inline static unsigned getWRegFromXReg(unsigned Reg) { - switch (Reg) { - case ARM64::X0: return ARM64::W0; - case ARM64::X1: return ARM64::W1; - case ARM64::X2: return ARM64::W2; - case ARM64::X3: return ARM64::W3; - case ARM64::X4: return ARM64::W4; - case ARM64::X5: return ARM64::W5; - case ARM64::X6: return ARM64::W6; - case ARM64::X7: return ARM64::W7; - case ARM64::X8: return ARM64::W8; - case ARM64::X9: return ARM64::W9; - case ARM64::X10: return ARM64::W10; - case ARM64::X11: return ARM64::W11; - case ARM64::X12: return ARM64::W12; - case ARM64::X13: return ARM64::W13; - case ARM64::X14: return ARM64::W14; - case ARM64::X15: return ARM64::W15; - case ARM64::X16: return ARM64::W16; - case ARM64::X17: return ARM64::W17; - case ARM64::X18: return ARM64::W18; - case ARM64::X19: return ARM64::W19; - case ARM64::X20: return ARM64::W20; - case ARM64::X21: return ARM64::W21; - case ARM64::X22: return ARM64::W22; - case ARM64::X23: return ARM64::W23; - case ARM64::X24: return ARM64::W24; - case ARM64::X25: return ARM64::W25; - case ARM64::X26: return ARM64::W26; - case ARM64::X27: return ARM64::W27; - case ARM64::X28: return ARM64::W28; - case ARM64::FP: return ARM64::W29; - case ARM64::LR: return ARM64::W30; - case ARM64::SP: return ARM64::WSP; - case ARM64::XZR: return ARM64::WZR; - } - // For anything else, return it unchanged. - return Reg; -} - -inline static unsigned getXRegFromWReg(unsigned Reg) { - switch (Reg) { - case ARM64::W0: return ARM64::X0; - case ARM64::W1: return ARM64::X1; - case ARM64::W2: return ARM64::X2; - case ARM64::W3: return ARM64::X3; - case ARM64::W4: return ARM64::X4; - case ARM64::W5: return ARM64::X5; - case ARM64::W6: return ARM64::X6; - case ARM64::W7: return ARM64::X7; - case ARM64::W8: return ARM64::X8; - case ARM64::W9: return ARM64::X9; - case ARM64::W10: return ARM64::X10; - case ARM64::W11: return ARM64::X11; - case ARM64::W12: return ARM64::X12; - case ARM64::W13: return ARM64::X13; - case ARM64::W14: return ARM64::X14; - case ARM64::W15: return ARM64::X15; - case ARM64::W16: return ARM64::X16; - case ARM64::W17: return ARM64::X17; - case ARM64::W18: return ARM64::X18; - case ARM64::W19: return ARM64::X19; - case ARM64::W20: return ARM64::X20; - case ARM64::W21: return ARM64::X21; - case ARM64::W22: return ARM64::X22; - case ARM64::W23: return ARM64::X23; - case ARM64::W24: return ARM64::X24; - case ARM64::W25: return ARM64::X25; - case ARM64::W26: return ARM64::X26; - case ARM64::W27: return ARM64::X27; - case ARM64::W28: return ARM64::X28; - case ARM64::W29: return ARM64::FP; - case ARM64::W30: return ARM64::LR; - case ARM64::WSP: return ARM64::SP; - case ARM64::WZR: return ARM64::XZR; - } - // For anything else, return it unchanged. 
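// Editor's note (not part of the original patch): these mapping helpers are
// keyed by LLVM's register enumerators rather than architectural numbers,
// e.g.:
//
//   getXRegFromWReg(ARM64::W29) == ARM64::FP;   // w29 is the 32-bit half of fp
//   getWRegFromXReg(ARM64::XZR) == ARM64::WZR;  // the zero register maps both ways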
- return Reg; -} - -static inline unsigned getBRegFromDReg(unsigned Reg) { - switch (Reg) { - case ARM64::D0: return ARM64::B0; - case ARM64::D1: return ARM64::B1; - case ARM64::D2: return ARM64::B2; - case ARM64::D3: return ARM64::B3; - case ARM64::D4: return ARM64::B4; - case ARM64::D5: return ARM64::B5; - case ARM64::D6: return ARM64::B6; - case ARM64::D7: return ARM64::B7; - case ARM64::D8: return ARM64::B8; - case ARM64::D9: return ARM64::B9; - case ARM64::D10: return ARM64::B10; - case ARM64::D11: return ARM64::B11; - case ARM64::D12: return ARM64::B12; - case ARM64::D13: return ARM64::B13; - case ARM64::D14: return ARM64::B14; - case ARM64::D15: return ARM64::B15; - case ARM64::D16: return ARM64::B16; - case ARM64::D17: return ARM64::B17; - case ARM64::D18: return ARM64::B18; - case ARM64::D19: return ARM64::B19; - case ARM64::D20: return ARM64::B20; - case ARM64::D21: return ARM64::B21; - case ARM64::D22: return ARM64::B22; - case ARM64::D23: return ARM64::B23; - case ARM64::D24: return ARM64::B24; - case ARM64::D25: return ARM64::B25; - case ARM64::D26: return ARM64::B26; - case ARM64::D27: return ARM64::B27; - case ARM64::D28: return ARM64::B28; - case ARM64::D29: return ARM64::B29; - case ARM64::D30: return ARM64::B30; - case ARM64::D31: return ARM64::B31; - } - // For anything else, return it unchanged. - return Reg; -} - - -static inline unsigned getDRegFromBReg(unsigned Reg) { - switch (Reg) { - case ARM64::B0: return ARM64::D0; - case ARM64::B1: return ARM64::D1; - case ARM64::B2: return ARM64::D2; - case ARM64::B3: return ARM64::D3; - case ARM64::B4: return ARM64::D4; - case ARM64::B5: return ARM64::D5; - case ARM64::B6: return ARM64::D6; - case ARM64::B7: return ARM64::D7; - case ARM64::B8: return ARM64::D8; - case ARM64::B9: return ARM64::D9; - case ARM64::B10: return ARM64::D10; - case ARM64::B11: return ARM64::D11; - case ARM64::B12: return ARM64::D12; - case ARM64::B13: return ARM64::D13; - case ARM64::B14: return ARM64::D14; - case ARM64::B15: return ARM64::D15; - case ARM64::B16: return ARM64::D16; - case ARM64::B17: return ARM64::D17; - case ARM64::B18: return ARM64::D18; - case ARM64::B19: return ARM64::D19; - case ARM64::B20: return ARM64::D20; - case ARM64::B21: return ARM64::D21; - case ARM64::B22: return ARM64::D22; - case ARM64::B23: return ARM64::D23; - case ARM64::B24: return ARM64::D24; - case ARM64::B25: return ARM64::D25; - case ARM64::B26: return ARM64::D26; - case ARM64::B27: return ARM64::D27; - case ARM64::B28: return ARM64::D28; - case ARM64::B29: return ARM64::D29; - case ARM64::B30: return ARM64::D30; - case ARM64::B31: return ARM64::D31; - } - // For anything else, return it unchanged. - return Reg; -} - -namespace ARM64CC { - -// The CondCodes constants map directly to the 4-bit encoding of the condition -// field for predicated instructions. 
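// Editor's note (not part of the original patch): in this 4-bit encoding
// every condition except AL sits next to its inverse, so the table-driven
// getInvertedCondCode below is equivalent to flipping the low bit:
//
//   static_cast<ARM64CC::CondCode>(ARM64CC::EQ ^ 0x1) == ARM64CC::NE;  // 0x0 -> 0x1
//   static_cast<ARM64CC::CondCode>(ARM64CC::GE ^ 0x1) == ARM64CC::LT;  // 0xa -> 0xb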
-enum CondCode { // Meaning (integer) Meaning (floating-point) - EQ = 0x0, // Equal Equal - NE = 0x1, // Not equal Not equal, or unordered - CS = 0x2, // Carry set >, ==, or unordered - CC = 0x3, // Carry clear Less than - MI = 0x4, // Minus, negative Less than - PL = 0x5, // Plus, positive or zero >, ==, or unordered - VS = 0x6, // Overflow Unordered - VC = 0x7, // No overflow Not unordered - HI = 0x8, // Unsigned higher Greater than, or unordered - LS = 0x9, // Unsigned lower or same Less than or equal - GE = 0xa, // Greater than or equal Greater than or equal - LT = 0xb, // Less than Less than, or unordered - GT = 0xc, // Greater than Greater than - LE = 0xd, // Less than or equal <, ==, or unordered - AL = 0xe // Always (unconditional) Always (unconditional) -}; - -inline static const char *getCondCodeName(CondCode Code) { - // cond<0> is ignored when cond<3:1> = 111, where 1110 is 0xe (aka AL). - if ((Code & AL) == AL) - Code = AL; - switch (Code) { - case EQ: return "eq"; - case NE: return "ne"; - case CS: return "cs"; - case CC: return "cc"; - case MI: return "mi"; - case PL: return "pl"; - case VS: return "vs"; - case VC: return "vc"; - case HI: return "hi"; - case LS: return "ls"; - case GE: return "ge"; - case LT: return "lt"; - case GT: return "gt"; - case LE: return "le"; - case AL: return "al"; - } - llvm_unreachable("Unknown condition code"); -} - -inline static CondCode getInvertedCondCode(CondCode Code) { - switch (Code) { - default: llvm_unreachable("Unknown condition code"); - case EQ: return NE; - case NE: return EQ; - case CS: return CC; - case CC: return CS; - case MI: return PL; - case PL: return MI; - case VS: return VC; - case VC: return VS; - case HI: return LS; - case LS: return HI; - case GE: return LT; - case LT: return GE; - case GT: return LE; - case LE: return GT; - } -} - -/// Given a condition code, return NZCV flags that would satisfy that condition. -/// The flag bits are in the format expected by the ccmp instructions. -/// Note that many different flag settings can satisfy a given condition code, -/// this function just returns one of them. -inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) { - // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7. 
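// Editor's note (not part of the original patch): this helper exists for
// conditional-compare chains, where a CCMP whose predicate fails must deposit
// flags that decide the rest of the chain. An illustrative lowering of
// "a == 0 && b == 1":
//
//   cmp  x0, #0
//   ccmp x1, #1, #0, eq   // #0 == getNZCVToSatisfyCondCode(NE): if eq fails,
//                         // force Z == 0 so the final b.eq is not taken
//   b.eq matched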
- enum { N = 8, Z = 4, C = 2, V = 1 }; - switch (Code) { - default: llvm_unreachable("Unknown condition code"); - case EQ: return Z; // Z == 1 - case NE: return 0; // Z == 0 - case CS: return C; // C == 1 - case CC: return 0; // C == 0 - case MI: return N; // N == 1 - case PL: return 0; // N == 0 - case VS: return V; // V == 1 - case VC: return 0; // V == 0 - case HI: return C; // C == 1 && Z == 0 - case LS: return 0; // C == 0 || Z == 1 - case GE: return 0; // N == V - case LT: return N; // N != V - case GT: return 0; // Z == 0 && N == V - case LE: return Z; // Z == 1 || N != V - } -} -} // end namespace ARM64CC - -namespace ARM64SYS { -enum BarrierOption { - InvalidBarrier = 0xff, - OSHLD = 0x1, - OSHST = 0x2, - OSH = 0x3, - NSHLD = 0x5, - NSHST = 0x6, - NSH = 0x7, - ISHLD = 0x9, - ISHST = 0xa, - ISH = 0xb, - LD = 0xd, - ST = 0xe, - SY = 0xf -}; - -inline static const char *getBarrierOptName(BarrierOption Opt) { - switch (Opt) { - default: return NULL; - case 0x1: return "oshld"; - case 0x2: return "oshst"; - case 0x3: return "osh"; - case 0x5: return "nshld"; - case 0x6: return "nshst"; - case 0x7: return "nsh"; - case 0x9: return "ishld"; - case 0xa: return "ishst"; - case 0xb: return "ish"; - case 0xd: return "ld"; - case 0xe: return "st"; - case 0xf: return "sy"; - } -} - -#define A64_SYSREG_ENC(op0,CRn,op2,CRm,op1) ((op0) << 14 | (op1) << 11 | \ - (CRn) << 7 | (CRm) << 3 | (op2)) -enum SystemRegister { - InvalidSystemReg = 0, - // Table in section 3.10.3 - SPSR_EL1 = 0xc200, - SPSR_svc = SPSR_EL1, - ELR_EL1 = 0xc201, - SP_EL0 = 0xc208, - SPSel = 0xc210, - CurrentEL = 0xc212, - DAIF = 0xda11, - NZCV = 0xda10, - FPCR = 0xda20, - FPSR = 0xda21, - DSPSR = 0xda28, - DLR = 0xda29, - SPSR_EL2 = 0xe200, - SPSR_hyp = SPSR_EL2, - ELR_EL2 = 0xe201, - SP_EL1 = 0xe208, - SPSR_irq = 0xe218, - SPSR_abt = 0xe219, - SPSR_und = 0xe21a, - SPSR_fiq = 0xe21b, - SPSR_EL3 = 0xf200, - ELR_EL3 = 0xf201, - SP_EL2 = 0xf208, - - - // Table in section 3.10.8 - MIDR_EL1 = 0xc000, - CTR_EL0 = 0xd801, - MPIDR_EL1 = 0xc005, - ECOIDR_EL1 = 0xc006, - DCZID_EL0 = 0xd807, - MVFR0_EL1 = 0xc018, - MVFR1_EL1 = 0xc019, - ID_AA64PFR0_EL1 = 0xc020, - ID_AA64PFR1_EL1 = 0xc021, - ID_AA64DFR0_EL1 = 0xc028, - ID_AA64DFR1_EL1 = 0xc029, - ID_AA64ISAR0_EL1 = 0xc030, - ID_AA64ISAR1_EL1 = 0xc031, - ID_AA64MMFR0_EL1 = 0xc038, - ID_AA64MMFR1_EL1 = 0xc039, - CCSIDR_EL1 = 0xc800, - CLIDR_EL1 = 0xc801, - AIDR_EL1 = 0xc807, - CSSELR_EL1 = 0xd000, - VPIDR_EL2 = 0xe000, - VMPIDR_EL2 = 0xe005, - SCTLR_EL1 = 0xc080, - SCTLR_EL2 = 0xe080, - SCTLR_EL3 = 0xf080, - ACTLR_EL1 = 0xc081, - ACTLR_EL2 = 0xe081, - ACTLR_EL3 = 0xf081, - CPACR_EL1 = 0xc082, - CPTR_EL2 = 0xe08a, - CPTR_EL3 = 0xf08a, - SCR_EL3 = 0xf088, - HCR_EL2 = 0xe088, - MDCR_EL2 = 0xe089, - MDCR_EL3 = 0xf099, - HSTR_EL2 = 0xe08b, - HACR_EL2 = 0xe08f, - TTBR0_EL1 = 0xc100, - TTBR1_EL1 = 0xc101, - TTBR0_EL2 = 0xe100, - TTBR0_EL3 = 0xf100, - VTTBR_EL2 = 0xe108, - TCR_EL1 = 0xc102, - TCR_EL2 = 0xe102, - TCR_EL3 = 0xf102, - VTCR_EL2 = 0xe10a, - ADFSR_EL1 = 0xc288, - AIFSR_EL1 = 0xc289, - ADFSR_EL2 = 0xe288, - AIFSR_EL2 = 0xe289, - ADFSR_EL3 = 0xf288, - AIFSR_EL3 = 0xf289, - ESR_EL1 = 0xc290, - ESR_EL2 = 0xe290, - ESR_EL3 = 0xf290, - FAR_EL1 = 0xc300, - FAR_EL2 = 0xe300, - FAR_EL3 = 0xf300, - HPFAR_EL2 = 0xe304, - PAR_EL1 = 0xc3a0, - MAIR_EL1 = 0xc510, - MAIR_EL2 = 0xe510, - MAIR_EL3 = 0xf510, - AMAIR_EL1 = 0xc518, - AMAIR_EL2 = 0xe518, - AMAIR_EL3 = 0xf518, - VBAR_EL1 = 0xc600, - VBAR_EL2 = 0xe600, - VBAR_EL3 = 0xf600, - RVBAR_EL1 = 0xc601, - RVBAR_EL2 = 0xe601, - RVBAR_EL3 = 0xf601, - 
ISR_EL1 = 0xc608, - CONTEXTIDR_EL1 = 0xc681, - TPIDR_EL0 = 0xde82, - TPIDRRO_EL0 = 0xde83, - TPIDR_EL1 = 0xc684, - TPIDR_EL2 = 0xe682, - TPIDR_EL3 = 0xf682, - TEECR32_EL1 = 0x9000, - CNTFRQ_EL0 = 0xdf00, - CNTPCT_EL0 = 0xdf01, - CNTVCT_EL0 = 0xdf02, - CNTVOFF_EL2 = 0xe703, - CNTKCTL_EL1 = 0xc708, - CNTHCTL_EL2 = 0xe708, - CNTP_TVAL_EL0 = 0xdf10, - CNTP_CTL_EL0 = 0xdf11, - CNTP_CVAL_EL0 = 0xdf12, - CNTV_TVAL_EL0 = 0xdf18, - CNTV_CTL_EL0 = 0xdf19, - CNTV_CVAL_EL0 = 0xdf1a, - CNTHP_TVAL_EL2 = 0xe710, - CNTHP_CTL_EL2 = 0xe711, - CNTHP_CVAL_EL2 = 0xe712, - CNTPS_TVAL_EL1 = 0xff10, - CNTPS_CTL_EL1 = 0xff11, - CNTPS_CVAL_EL1= 0xff12, - - PMEVCNTR0_EL0 = 0xdf40, - PMEVCNTR1_EL0 = 0xdf41, - PMEVCNTR2_EL0 = 0xdf42, - PMEVCNTR3_EL0 = 0xdf43, - PMEVCNTR4_EL0 = 0xdf44, - PMEVCNTR5_EL0 = 0xdf45, - PMEVCNTR6_EL0 = 0xdf46, - PMEVCNTR7_EL0 = 0xdf47, - PMEVCNTR8_EL0 = 0xdf48, - PMEVCNTR9_EL0 = 0xdf49, - PMEVCNTR10_EL0 = 0xdf4a, - PMEVCNTR11_EL0 = 0xdf4b, - PMEVCNTR12_EL0 = 0xdf4c, - PMEVCNTR13_EL0 = 0xdf4d, - PMEVCNTR14_EL0 = 0xdf4e, - PMEVCNTR15_EL0 = 0xdf4f, - PMEVCNTR16_EL0 = 0xdf50, - PMEVCNTR17_EL0 = 0xdf51, - PMEVCNTR18_EL0 = 0xdf52, - PMEVCNTR19_EL0 = 0xdf53, - PMEVCNTR20_EL0 = 0xdf54, - PMEVCNTR21_EL0 = 0xdf55, - PMEVCNTR22_EL0 = 0xdf56, - PMEVCNTR23_EL0 = 0xdf57, - PMEVCNTR24_EL0 = 0xdf58, - PMEVCNTR25_EL0 = 0xdf59, - PMEVCNTR26_EL0 = 0xdf5a, - PMEVCNTR27_EL0 = 0xdf5b, - PMEVCNTR28_EL0 = 0xdf5c, - PMEVCNTR29_EL0 = 0xdf5d, - PMEVCNTR30_EL0 = 0xdf5e, - - PMEVTYPER0_EL0 = 0xdf60, - PMEVTYPER1_EL0 = 0xdf61, - PMEVTYPER2_EL0 = 0xdf62, - PMEVTYPER3_EL0 = 0xdf63, - PMEVTYPER4_EL0 = 0xdf64, - PMEVTYPER5_EL0 = 0xdf65, - PMEVTYPER6_EL0 = 0xdf66, - PMEVTYPER7_EL0 = 0xdf67, - PMEVTYPER8_EL0 = 0xdf68, - PMEVTYPER9_EL0 = 0xdf69, - PMEVTYPER10_EL0 = 0xdf6a, - PMEVTYPER11_EL0 = 0xdf6b, - PMEVTYPER12_EL0 = 0xdf6c, - PMEVTYPER13_EL0 = 0xdf6d, - PMEVTYPER14_EL0 = 0xdf6e, - PMEVTYPER15_EL0 = 0xdf6f, - PMEVTYPER16_EL0 = 0xdf70, - PMEVTYPER17_EL0 = 0xdf71, - PMEVTYPER18_EL0 = 0xdf72, - PMEVTYPER19_EL0 = 0xdf73, - PMEVTYPER20_EL0 = 0xdf74, - PMEVTYPER21_EL0 = 0xdf75, - PMEVTYPER22_EL0 = 0xdf76, - PMEVTYPER23_EL0 = 0xdf77, - PMEVTYPER24_EL0 = 0xdf78, - PMEVTYPER25_EL0 = 0xdf79, - PMEVTYPER26_EL0 = 0xdf7a, - PMEVTYPER27_EL0 = 0xdf7b, - PMEVTYPER28_EL0 = 0xdf7c, - PMEVTYPER29_EL0 = 0xdf7d, - PMEVTYPER30_EL0 = 0xdf7e, - - PMCCFILTR_EL0 = 0xdf7f, - - RMR_EL3 = 0xf602, - RMR_EL2 = 0xd602, - RMR_EL1 = 0xce02, - - // Debug Architecture 5.3, Table 17. 
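// Editor's note (not part of the original patch): the debug registers below
// use the A64_SYSREG_ENC macro defined earlier. Note its argument order is
// (op0, CRn, op2, CRm, op1) while the packed layout is
// op0<15:14> | op1<13:11> | CRn<10:7> | CRm<6:3> | op2<2:0>. For example:
//
//   A64_SYSREG_ENC(2, 0, 0, 1, 3)   // MDCCSR_EL0: op0=2, op1=3, CRn=0, CRm=1,
//                                   // op2=0 -> 0x8000 | 0x1800 | 0x8 == 0x9808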
- MDCCSR_EL0 = A64_SYSREG_ENC(2, 0, 0, 1, 3), - MDCCINT_EL1 = A64_SYSREG_ENC(2, 0, 0, 2, 0), - DBGDTR_EL0 = A64_SYSREG_ENC(2, 0, 0, 4, 3), - DBGDTRRX_EL0 = A64_SYSREG_ENC(2, 0, 0, 5, 3), - DBGDTRTX_EL0 = DBGDTRRX_EL0, - DBGVCR32_EL2 = A64_SYSREG_ENC(2, 0, 0, 7, 4), - OSDTRRX_EL1 = A64_SYSREG_ENC(2, 0, 2, 0, 0), - MDSCR_EL1 = A64_SYSREG_ENC(2, 0, 2, 2, 0), - OSDTRTX_EL1 = A64_SYSREG_ENC(2, 0, 2, 3, 0), - OSECCR_EL11 = A64_SYSREG_ENC(2, 0, 2, 6, 0), - - DBGBVR0_EL1 = A64_SYSREG_ENC(2, 0, 4, 0, 0), - DBGBVR1_EL1 = A64_SYSREG_ENC(2, 0, 4, 1, 0), - DBGBVR2_EL1 = A64_SYSREG_ENC(2, 0, 4, 2, 0), - DBGBVR3_EL1 = A64_SYSREG_ENC(2, 0, 4, 3, 0), - DBGBVR4_EL1 = A64_SYSREG_ENC(2, 0, 4, 4, 0), - DBGBVR5_EL1 = A64_SYSREG_ENC(2, 0, 4, 5, 0), - DBGBVR6_EL1 = A64_SYSREG_ENC(2, 0, 4, 6, 0), - DBGBVR7_EL1 = A64_SYSREG_ENC(2, 0, 4, 7, 0), - DBGBVR8_EL1 = A64_SYSREG_ENC(2, 0, 4, 8, 0), - DBGBVR9_EL1 = A64_SYSREG_ENC(2, 0, 4, 9, 0), - DBGBVR10_EL1 = A64_SYSREG_ENC(2, 0, 4, 10, 0), - DBGBVR11_EL1 = A64_SYSREG_ENC(2, 0, 4, 11, 0), - DBGBVR12_EL1 = A64_SYSREG_ENC(2, 0, 4, 12, 0), - DBGBVR13_EL1 = A64_SYSREG_ENC(2, 0, 4, 13, 0), - DBGBVR14_EL1 = A64_SYSREG_ENC(2, 0, 4, 14, 0), - DBGBVR15_EL1 = A64_SYSREG_ENC(2, 0, 4, 15, 0), - - DBGBCR0_EL1 = A64_SYSREG_ENC(2, 0, 5, 0, 0), - DBGBCR1_EL1 = A64_SYSREG_ENC(2, 0, 5, 1, 0), - DBGBCR2_EL1 = A64_SYSREG_ENC(2, 0, 5, 2, 0), - DBGBCR3_EL1 = A64_SYSREG_ENC(2, 0, 5, 3, 0), - DBGBCR4_EL1 = A64_SYSREG_ENC(2, 0, 5, 4, 0), - DBGBCR5_EL1 = A64_SYSREG_ENC(2, 0, 5, 5, 0), - DBGBCR6_EL1 = A64_SYSREG_ENC(2, 0, 5, 6, 0), - DBGBCR7_EL1 = A64_SYSREG_ENC(2, 0, 5, 7, 0), - DBGBCR8_EL1 = A64_SYSREG_ENC(2, 0, 5, 8, 0), - DBGBCR9_EL1 = A64_SYSREG_ENC(2, 0, 5, 9, 0), - DBGBCR10_EL1 = A64_SYSREG_ENC(2, 0, 5, 10, 0), - DBGBCR11_EL1 = A64_SYSREG_ENC(2, 0, 5, 11, 0), - DBGBCR12_EL1 = A64_SYSREG_ENC(2, 0, 5, 12, 0), - DBGBCR13_EL1 = A64_SYSREG_ENC(2, 0, 5, 13, 0), - DBGBCR14_EL1 = A64_SYSREG_ENC(2, 0, 5, 14, 0), - DBGBCR15_EL1 = A64_SYSREG_ENC(2, 0, 5, 15, 0), - - DBGWVR0_EL1 = A64_SYSREG_ENC(2, 0, 6, 0, 0), - DBGWVR1_EL1 = A64_SYSREG_ENC(2, 0, 6, 1, 0), - DBGWVR2_EL1 = A64_SYSREG_ENC(2, 0, 6, 2, 0), - DBGWVR3_EL1 = A64_SYSREG_ENC(2, 0, 6, 3, 0), - DBGWVR4_EL1 = A64_SYSREG_ENC(2, 0, 6, 4, 0), - DBGWVR5_EL1 = A64_SYSREG_ENC(2, 0, 6, 5, 0), - DBGWVR6_EL1 = A64_SYSREG_ENC(2, 0, 6, 6, 0), - DBGWVR7_EL1 = A64_SYSREG_ENC(2, 0, 6, 7, 0), - DBGWVR8_EL1 = A64_SYSREG_ENC(2, 0, 6, 8, 0), - DBGWVR9_EL1 = A64_SYSREG_ENC(2, 0, 6, 9, 0), - DBGWVR10_EL1 = A64_SYSREG_ENC(2, 0, 6, 10, 0), - DBGWVR11_EL1 = A64_SYSREG_ENC(2, 0, 6, 11, 0), - DBGWVR12_EL1 = A64_SYSREG_ENC(2, 0, 6, 12, 0), - DBGWVR13_EL1 = A64_SYSREG_ENC(2, 0, 6, 13, 0), - DBGWVR14_EL1 = A64_SYSREG_ENC(2, 0, 6, 14, 0), - DBGWVR15_EL1 = A64_SYSREG_ENC(2, 0, 6, 15, 0), - - DBGWCR0_EL1 = A64_SYSREG_ENC(2, 0, 7, 0, 0), - DBGWCR1_EL1 = A64_SYSREG_ENC(2, 0, 7, 1, 0), - DBGWCR2_EL1 = A64_SYSREG_ENC(2, 0, 7, 2, 0), - DBGWCR3_EL1 = A64_SYSREG_ENC(2, 0, 7, 3, 0), - DBGWCR4_EL1 = A64_SYSREG_ENC(2, 0, 7, 4, 0), - DBGWCR5_EL1 = A64_SYSREG_ENC(2, 0, 7, 5, 0), - DBGWCR6_EL1 = A64_SYSREG_ENC(2, 0, 7, 6, 0), - DBGWCR7_EL1 = A64_SYSREG_ENC(2, 0, 7, 7, 0), - DBGWCR8_EL1 = A64_SYSREG_ENC(2, 0, 7, 8, 0), - DBGWCR9_EL1 = A64_SYSREG_ENC(2, 0, 7, 9, 0), - DBGWCR10_EL1 = A64_SYSREG_ENC(2, 0, 7, 10, 0), - DBGWCR11_EL1 = A64_SYSREG_ENC(2, 0, 7, 11, 0), - DBGWCR12_EL1 = A64_SYSREG_ENC(2, 0, 7, 12, 0), - DBGWCR13_EL1 = A64_SYSREG_ENC(2, 0, 7, 13, 0), - DBGWCR14_EL1 = A64_SYSREG_ENC(2, 0, 7, 14, 0), - DBGWCR15_EL1 = A64_SYSREG_ENC(2, 0, 7, 15, 0), - - MDRAR_EL1 = A64_SYSREG_ENC(2, 1, 
0, 0, 0), - OSLAR_EL1 = A64_SYSREG_ENC(2, 1, 4, 0, 0), - OSLSR_EL1 = A64_SYSREG_ENC(2, 1, 4, 1, 0), - OSDLR_EL1 = A64_SYSREG_ENC(2, 1, 4, 3, 0), - DBGPRCR_EL1 = A64_SYSREG_ENC(2, 1, 4, 4, 0), - - DBGCLAIMSET_EL1 = A64_SYSREG_ENC(2, 7, 6, 8, 0), - DBGCLAIMCLR_EL1 = A64_SYSREG_ENC(2, 7, 6, 9, 0), - DBGAUTHSTATUS_EL1 = A64_SYSREG_ENC(2, 7, 6, 14, 0), - - DBGDEVID2 = A64_SYSREG_ENC(2, 7, 7, 0, 0), - DBGDEVID1 = A64_SYSREG_ENC(2, 7, 7, 1, 0), - DBGDEVID0 = A64_SYSREG_ENC(2, 7, 7, 2, 0), - - // The following registers are defined to allow access from AArch64 to - // registers which are only used in the AArch32 architecture. - DACR32_EL2 = 0xe180, - IFSR32_EL2 = 0xe281, - TEEHBR32_EL1 = 0x9080, - SDER32_EL3 = 0xf089, - FPEXC32_EL2 = 0xe298, - - // Cyclone specific system registers - CPM_IOACC_CTL_EL3 = 0xff90, - - // Architectural system registers - ID_PFR0_EL1 = 0xc008, - ID_PFR1_EL1 = 0xc009, - ID_DFR0_EL1 = 0xc00a, - ID_AFR0_EL1 = 0xc00b, - ID_ISAR0_EL1 = 0xc010, - ID_ISAR1_EL1 = 0xc011, - ID_ISAR2_EL1 = 0xc012, - ID_ISAR3_EL1 = 0xc013, - ID_ISAR4_EL1 = 0xc014, - ID_ISAR5_EL1 = 0xc015, - AFSR1_EL1 = 0xc289, // note same as old AIFSR_EL1 - AFSR0_EL1 = 0xc288, // note same as old ADFSR_EL1 - REVIDR_EL1 = 0xc006 // note same as old ECOIDR_EL1 - -}; -#undef A64_SYSREG_ENC - -static inline const char *getSystemRegisterName(SystemRegister Reg) { - switch(Reg) { - default: return NULL; // Caller is responsible for handling invalid value. - case SPSR_EL1: return "SPSR_EL1"; - case ELR_EL1: return "ELR_EL1"; - case SP_EL0: return "SP_EL0"; - case SPSel: return "SPSel"; - case DAIF: return "DAIF"; - case CurrentEL: return "CurrentEL"; - case NZCV: return "NZCV"; - case FPCR: return "FPCR"; - case FPSR: return "FPSR"; - case DSPSR: return "DSPSR"; - case DLR: return "DLR"; - case SPSR_EL2: return "SPSR_EL2"; - case ELR_EL2: return "ELR_EL2"; - case SP_EL1: return "SP_EL1"; - case SPSR_irq: return "SPSR_irq"; - case SPSR_abt: return "SPSR_abt"; - case SPSR_und: return "SPSR_und"; - case SPSR_fiq: return "SPSR_fiq"; - case SPSR_EL3: return "SPSR_EL3"; - case ELR_EL3: return "ELR_EL3"; - case SP_EL2: return "SP_EL2"; - case MIDR_EL1: return "MIDR_EL1"; - case CTR_EL0: return "CTR_EL0"; - case MPIDR_EL1: return "MPIDR_EL1"; - case DCZID_EL0: return "DCZID_EL0"; - case MVFR0_EL1: return "MVFR0_EL1"; - case MVFR1_EL1: return "MVFR1_EL1"; - case ID_AA64PFR0_EL1: return "ID_AA64PFR0_EL1"; - case ID_AA64PFR1_EL1: return "ID_AA64PFR1_EL1"; - case ID_AA64DFR0_EL1: return "ID_AA64DFR0_EL1"; - case ID_AA64DFR1_EL1: return "ID_AA64DFR1_EL1"; - case ID_AA64ISAR0_EL1: return "ID_AA64ISAR0_EL1"; - case ID_AA64ISAR1_EL1: return "ID_AA64ISAR1_EL1"; - case ID_AA64MMFR0_EL1: return "ID_AA64MMFR0_EL1"; - case ID_AA64MMFR1_EL1: return "ID_AA64MMFR1_EL1"; - case CCSIDR_EL1: return "CCSIDR_EL1"; - case CLIDR_EL1: return "CLIDR_EL1"; - case AIDR_EL1: return "AIDR_EL1"; - case CSSELR_EL1: return "CSSELR_EL1"; - case VPIDR_EL2: return "VPIDR_EL2"; - case VMPIDR_EL2: return "VMPIDR_EL2"; - case SCTLR_EL1: return "SCTLR_EL1"; - case SCTLR_EL2: return "SCTLR_EL2"; - case SCTLR_EL3: return "SCTLR_EL3"; - case ACTLR_EL1: return "ACTLR_EL1"; - case ACTLR_EL2: return "ACTLR_EL2"; - case ACTLR_EL3: return "ACTLR_EL3"; - case CPACR_EL1: return "CPACR_EL1"; - case CPTR_EL2: return "CPTR_EL2"; - case CPTR_EL3: return "CPTR_EL3"; - case SCR_EL3: return "SCR_EL3"; - case HCR_EL2: return "HCR_EL2"; - case MDCR_EL2: return "MDCR_EL2"; - case MDCR_EL3: return "MDCR_EL3"; - case HSTR_EL2: return "HSTR_EL2"; - case HACR_EL2: return "HACR_EL2"; - 
case TTBR0_EL1: return "TTBR0_EL1"; - case TTBR1_EL1: return "TTBR1_EL1"; - case TTBR0_EL2: return "TTBR0_EL2"; - case TTBR0_EL3: return "TTBR0_EL3"; - case VTTBR_EL2: return "VTTBR_EL2"; - case TCR_EL1: return "TCR_EL1"; - case TCR_EL2: return "TCR_EL2"; - case TCR_EL3: return "TCR_EL3"; - case VTCR_EL2: return "VTCR_EL2"; - case ADFSR_EL2: return "ADFSR_EL2"; - case AIFSR_EL2: return "AIFSR_EL2"; - case ADFSR_EL3: return "ADFSR_EL3"; - case AIFSR_EL3: return "AIFSR_EL3"; - case ESR_EL1: return "ESR_EL1"; - case ESR_EL2: return "ESR_EL2"; - case ESR_EL3: return "ESR_EL3"; - case FAR_EL1: return "FAR_EL1"; - case FAR_EL2: return "FAR_EL2"; - case FAR_EL3: return "FAR_EL3"; - case HPFAR_EL2: return "HPFAR_EL2"; - case PAR_EL1: return "PAR_EL1"; - case MAIR_EL1: return "MAIR_EL1"; - case MAIR_EL2: return "MAIR_EL2"; - case MAIR_EL3: return "MAIR_EL3"; - case AMAIR_EL1: return "AMAIR_EL1"; - case AMAIR_EL2: return "AMAIR_EL2"; - case AMAIR_EL3: return "AMAIR_EL3"; - case VBAR_EL1: return "VBAR_EL1"; - case VBAR_EL2: return "VBAR_EL2"; - case VBAR_EL3: return "VBAR_EL3"; - case RVBAR_EL1: return "RVBAR_EL1"; - case RVBAR_EL2: return "RVBAR_EL2"; - case RVBAR_EL3: return "RVBAR_EL3"; - case ISR_EL1: return "ISR_EL1"; - case CONTEXTIDR_EL1: return "CONTEXTIDR_EL1"; - case TPIDR_EL0: return "TPIDR_EL0"; - case TPIDRRO_EL0: return "TPIDRRO_EL0"; - case TPIDR_EL1: return "TPIDR_EL1"; - case TPIDR_EL2: return "TPIDR_EL2"; - case TPIDR_EL3: return "TPIDR_EL3"; - case TEECR32_EL1: return "TEECR32_EL1"; - case CNTFRQ_EL0: return "CNTFRQ_EL0"; - case CNTPCT_EL0: return "CNTPCT_EL0"; - case CNTVCT_EL0: return "CNTVCT_EL0"; - case CNTVOFF_EL2: return "CNTVOFF_EL2"; - case CNTKCTL_EL1: return "CNTKCTL_EL1"; - case CNTHCTL_EL2: return "CNTHCTL_EL2"; - case CNTP_TVAL_EL0: return "CNTP_TVAL_EL0"; - case CNTP_CTL_EL0: return "CNTP_CTL_EL0"; - case CNTP_CVAL_EL0: return "CNTP_CVAL_EL0"; - case CNTV_TVAL_EL0: return "CNTV_TVAL_EL0"; - case CNTV_CTL_EL0: return "CNTV_CTL_EL0"; - case CNTV_CVAL_EL0: return "CNTV_CVAL_EL0"; - case CNTHP_TVAL_EL2: return "CNTHP_TVAL_EL2"; - case CNTHP_CTL_EL2: return "CNTHP_CTL_EL2"; - case CNTHP_CVAL_EL2: return "CNTHP_CVAL_EL2"; - case CNTPS_TVAL_EL1: return "CNTPS_TVAL_EL1"; - case CNTPS_CTL_EL1: return "CNTPS_CTL_EL1"; - case CNTPS_CVAL_EL1: return "CNTPS_CVAL_EL1"; - case DACR32_EL2: return "DACR32_EL2"; - case IFSR32_EL2: return "IFSR32_EL2"; - case TEEHBR32_EL1: return "TEEHBR32_EL1"; - case SDER32_EL3: return "SDER32_EL3"; - case FPEXC32_EL2: return "FPEXC32_EL2"; - case PMEVCNTR0_EL0: return "PMEVCNTR0_EL0"; - case PMEVCNTR1_EL0: return "PMEVCNTR1_EL0"; - case PMEVCNTR2_EL0: return "PMEVCNTR2_EL0"; - case PMEVCNTR3_EL0: return "PMEVCNTR3_EL0"; - case PMEVCNTR4_EL0: return "PMEVCNTR4_EL0"; - case PMEVCNTR5_EL0: return "PMEVCNTR5_EL0"; - case PMEVCNTR6_EL0: return "PMEVCNTR6_EL0"; - case PMEVCNTR7_EL0: return "PMEVCNTR7_EL0"; - case PMEVCNTR8_EL0: return "PMEVCNTR8_EL0"; - case PMEVCNTR9_EL0: return "PMEVCNTR9_EL0"; - case PMEVCNTR10_EL0: return "PMEVCNTR10_EL0"; - case PMEVCNTR11_EL0: return "PMEVCNTR11_EL0"; - case PMEVCNTR12_EL0: return "PMEVCNTR12_EL0"; - case PMEVCNTR13_EL0: return "PMEVCNTR13_EL0"; - case PMEVCNTR14_EL0: return "PMEVCNTR14_EL0"; - case PMEVCNTR15_EL0: return "PMEVCNTR15_EL0"; - case PMEVCNTR16_EL0: return "PMEVCNTR16_EL0"; - case PMEVCNTR17_EL0: return "PMEVCNTR17_EL0"; - case PMEVCNTR18_EL0: return "PMEVCNTR18_EL0"; - case PMEVCNTR19_EL0: return "PMEVCNTR19_EL0"; - case PMEVCNTR20_EL0: return "PMEVCNTR20_EL0"; - case PMEVCNTR21_EL0: return 
"PMEVCNTR21_EL0"; - case PMEVCNTR22_EL0: return "PMEVCNTR22_EL0"; - case PMEVCNTR23_EL0: return "PMEVCNTR23_EL0"; - case PMEVCNTR24_EL0: return "PMEVCNTR24_EL0"; - case PMEVCNTR25_EL0: return "PMEVCNTR25_EL0"; - case PMEVCNTR26_EL0: return "PMEVCNTR26_EL0"; - case PMEVCNTR27_EL0: return "PMEVCNTR27_EL0"; - case PMEVCNTR28_EL0: return "PMEVCNTR28_EL0"; - case PMEVCNTR29_EL0: return "PMEVCNTR29_EL0"; - case PMEVCNTR30_EL0: return "PMEVCNTR30_EL0"; - case PMEVTYPER0_EL0: return "PMEVTYPER0_EL0"; - case PMEVTYPER1_EL0: return "PMEVTYPER1_EL0"; - case PMEVTYPER2_EL0: return "PMEVTYPER2_EL0"; - case PMEVTYPER3_EL0: return "PMEVTYPER3_EL0"; - case PMEVTYPER4_EL0: return "PMEVTYPER4_EL0"; - case PMEVTYPER5_EL0: return "PMEVTYPER5_EL0"; - case PMEVTYPER6_EL0: return "PMEVTYPER6_EL0"; - case PMEVTYPER7_EL0: return "PMEVTYPER7_EL0"; - case PMEVTYPER8_EL0: return "PMEVTYPER8_EL0"; - case PMEVTYPER9_EL0: return "PMEVTYPER9_EL0"; - case PMEVTYPER10_EL0: return "PMEVTYPER10_EL0"; - case PMEVTYPER11_EL0: return "PMEVTYPER11_EL0"; - case PMEVTYPER12_EL0: return "PMEVTYPER12_EL0"; - case PMEVTYPER13_EL0: return "PMEVTYPER13_EL0"; - case PMEVTYPER14_EL0: return "PMEVTYPER14_EL0"; - case PMEVTYPER15_EL0: return "PMEVTYPER15_EL0"; - case PMEVTYPER16_EL0: return "PMEVTYPER16_EL0"; - case PMEVTYPER17_EL0: return "PMEVTYPER17_EL0"; - case PMEVTYPER18_EL0: return "PMEVTYPER18_EL0"; - case PMEVTYPER19_EL0: return "PMEVTYPER19_EL0"; - case PMEVTYPER20_EL0: return "PMEVTYPER20_EL0"; - case PMEVTYPER21_EL0: return "PMEVTYPER21_EL0"; - case PMEVTYPER22_EL0: return "PMEVTYPER22_EL0"; - case PMEVTYPER23_EL0: return "PMEVTYPER23_EL0"; - case PMEVTYPER24_EL0: return "PMEVTYPER24_EL0"; - case PMEVTYPER25_EL0: return "PMEVTYPER25_EL0"; - case PMEVTYPER26_EL0: return "PMEVTYPER26_EL0"; - case PMEVTYPER27_EL0: return "PMEVTYPER27_EL0"; - case PMEVTYPER28_EL0: return "PMEVTYPER28_EL0"; - case PMEVTYPER29_EL0: return "PMEVTYPER29_EL0"; - case PMEVTYPER30_EL0: return "PMEVTYPER30_EL0"; - case PMCCFILTR_EL0: return "PMCCFILTR_EL0"; - case RMR_EL3: return "RMR_EL3"; - case RMR_EL2: return "RMR_EL2"; - case RMR_EL1: return "RMR_EL1"; - case CPM_IOACC_CTL_EL3: return "CPM_IOACC_CTL_EL3"; - case MDCCSR_EL0: return "MDCCSR_EL0"; - case MDCCINT_EL1: return "MDCCINT_EL1"; - case DBGDTR_EL0: return "DBGDTR_EL0"; - case DBGDTRRX_EL0: return "DBGDTRRX_EL0"; - case DBGVCR32_EL2: return "DBGVCR32_EL2"; - case OSDTRRX_EL1: return "OSDTRRX_EL1"; - case MDSCR_EL1: return "MDSCR_EL1"; - case OSDTRTX_EL1: return "OSDTRTX_EL1"; - case OSECCR_EL11: return "OSECCR_EL11"; - case DBGBVR0_EL1: return "DBGBVR0_EL1"; - case DBGBVR1_EL1: return "DBGBVR1_EL1"; - case DBGBVR2_EL1: return "DBGBVR2_EL1"; - case DBGBVR3_EL1: return "DBGBVR3_EL1"; - case DBGBVR4_EL1: return "DBGBVR4_EL1"; - case DBGBVR5_EL1: return "DBGBVR5_EL1"; - case DBGBVR6_EL1: return "DBGBVR6_EL1"; - case DBGBVR7_EL1: return "DBGBVR7_EL1"; - case DBGBVR8_EL1: return "DBGBVR8_EL1"; - case DBGBVR9_EL1: return "DBGBVR9_EL1"; - case DBGBVR10_EL1: return "DBGBVR10_EL1"; - case DBGBVR11_EL1: return "DBGBVR11_EL1"; - case DBGBVR12_EL1: return "DBGBVR12_EL1"; - case DBGBVR13_EL1: return "DBGBVR13_EL1"; - case DBGBVR14_EL1: return "DBGBVR14_EL1"; - case DBGBVR15_EL1: return "DBGBVR15_EL1"; - case DBGBCR0_EL1: return "DBGBCR0_EL1"; - case DBGBCR1_EL1: return "DBGBCR1_EL1"; - case DBGBCR2_EL1: return "DBGBCR2_EL1"; - case DBGBCR3_EL1: return "DBGBCR3_EL1"; - case DBGBCR4_EL1: return "DBGBCR4_EL1"; - case DBGBCR5_EL1: return "DBGBCR5_EL1"; - case DBGBCR6_EL1: return "DBGBCR6_EL1"; - case DBGBCR7_EL1: 
return "DBGBCR7_EL1"; - case DBGBCR8_EL1: return "DBGBCR8_EL1"; - case DBGBCR9_EL1: return "DBGBCR9_EL1"; - case DBGBCR10_EL1: return "DBGBCR10_EL1"; - case DBGBCR11_EL1: return "DBGBCR11_EL1"; - case DBGBCR12_EL1: return "DBGBCR12_EL1"; - case DBGBCR13_EL1: return "DBGBCR13_EL1"; - case DBGBCR14_EL1: return "DBGBCR14_EL1"; - case DBGBCR15_EL1: return "DBGBCR15_EL1"; - case DBGWVR0_EL1: return "DBGWVR0_EL1"; - case DBGWVR1_EL1: return "DBGWVR1_EL1"; - case DBGWVR2_EL1: return "DBGWVR2_EL1"; - case DBGWVR3_EL1: return "DBGWVR3_EL1"; - case DBGWVR4_EL1: return "DBGWVR4_EL1"; - case DBGWVR5_EL1: return "DBGWVR5_EL1"; - case DBGWVR6_EL1: return "DBGWVR6_EL1"; - case DBGWVR7_EL1: return "DBGWVR7_EL1"; - case DBGWVR8_EL1: return "DBGWVR8_EL1"; - case DBGWVR9_EL1: return "DBGWVR9_EL1"; - case DBGWVR10_EL1: return "DBGWVR10_EL1"; - case DBGWVR11_EL1: return "DBGWVR11_EL1"; - case DBGWVR12_EL1: return "DBGWVR12_EL1"; - case DBGWVR13_EL1: return "DBGWVR13_EL1"; - case DBGWVR14_EL1: return "DBGWVR14_EL1"; - case DBGWVR15_EL1: return "DBGWVR15_EL1"; - case DBGWCR0_EL1: return "DBGWCR0_EL1"; - case DBGWCR1_EL1: return "DBGWCR1_EL1"; - case DBGWCR2_EL1: return "DBGWCR2_EL1"; - case DBGWCR3_EL1: return "DBGWCR3_EL1"; - case DBGWCR4_EL1: return "DBGWCR4_EL1"; - case DBGWCR5_EL1: return "DBGWCR5_EL1"; - case DBGWCR6_EL1: return "DBGWCR6_EL1"; - case DBGWCR7_EL1: return "DBGWCR7_EL1"; - case DBGWCR8_EL1: return "DBGWCR8_EL1"; - case DBGWCR9_EL1: return "DBGWCR9_EL1"; - case DBGWCR10_EL1: return "DBGWCR10_EL1"; - case DBGWCR11_EL1: return "DBGWCR11_EL1"; - case DBGWCR12_EL1: return "DBGWCR12_EL1"; - case DBGWCR13_EL1: return "DBGWCR13_EL1"; - case DBGWCR14_EL1: return "DBGWCR14_EL1"; - case DBGWCR15_EL1: return "DBGWCR15_EL1"; - case MDRAR_EL1: return "MDRAR_EL1"; - case OSLAR_EL1: return "OSLAR_EL1"; - case OSLSR_EL1: return "OSLSR_EL1"; - case OSDLR_EL1: return "OSDLR_EL1"; - case DBGPRCR_EL1: return "DBGPRCR_EL1"; - case DBGCLAIMSET_EL1: return "DBGCLAIMSET_EL1"; - case DBGCLAIMCLR_EL1: return "DBGCLAIMCLR_EL1"; - case DBGAUTHSTATUS_EL1: return "DBGAUTHSTATUS_EL1"; - case DBGDEVID2: return "DBGDEVID2"; - case DBGDEVID1: return "DBGDEVID1"; - case DBGDEVID0: return "DBGDEVID0"; - case ID_PFR0_EL1: return "ID_PFR0_EL1"; - case ID_PFR1_EL1: return "ID_PFR1_EL1"; - case ID_DFR0_EL1: return "ID_DFR0_EL1"; - case ID_AFR0_EL1: return "ID_AFR0_EL1"; - case ID_ISAR0_EL1: return "ID_ISAR0_EL1"; - case ID_ISAR1_EL1: return "ID_ISAR1_EL1"; - case ID_ISAR2_EL1: return "ID_ISAR2_EL1"; - case ID_ISAR3_EL1: return "ID_ISAR3_EL1"; - case ID_ISAR4_EL1: return "ID_ISAR4_EL1"; - case ID_ISAR5_EL1: return "ID_ISAR5_EL1"; - case AFSR1_EL1: return "AFSR1_EL1"; - case AFSR0_EL1: return "AFSR0_EL1"; - case REVIDR_EL1: return "REVIDR_EL1"; - } -} - -enum CPSRField { - InvalidCPSRField = 0xff, - cpsr_SPSel = 0x5, - cpsr_DAIFSet = 0x1e, - cpsr_DAIFClr = 0x1f -}; - -static inline const char *getCPSRFieldName(CPSRField Val) { - switch(Val) { - default: assert(0 && "Invalid system register value!"); - case cpsr_SPSel: return "SPSel"; - case cpsr_DAIFSet: return "DAIFSet"; - case cpsr_DAIFClr: return "DAIFClr"; - } -} - -} // end namespace ARM64SYS - -namespace ARM64II { - /// Target Operand Flag enum. - enum TOF { - //===------------------------------------------------------------------===// - // ARM64 Specific MachineOperand flags. - - MO_NO_FLAG, - - MO_FRAGMENT = 0x7, - - /// MO_PAGE - A symbol operand with this flag represents the pc-relative - /// offset of the 4K page containing the symbol. 
This is used with the - /// ADRP instruction. - MO_PAGE = 1, - - /// MO_PAGEOFF - A symbol operand with this flag represents the offset of - /// that symbol within a 4K page. This offset is added to the page address - /// to produce the complete address. - MO_PAGEOFF = 2, - - /// MO_G3 - A symbol operand with this flag (granule 3) represents the high - /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G3 = 3, - - /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits - /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G2 = 4, - - /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits - /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G1 = 5, - - /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits - /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G0 = 6, - - /// MO_GOT - This flag indicates that a symbol operand represents the - /// address of the GOT entry for the symbol, rather than the address of - /// the symbol itself. - MO_GOT = 8, - - /// MO_NC - Indicates whether the linker is expected to check the symbol - /// reference for overflow. For example in an ADRP/ADD pair of relocations - /// the ADRP usually does check, but not the ADD. - MO_NC = 0x10, - - /// MO_TLS - Indicates that the operand being accessed is some kind of - /// thread-local symbol. On Darwin, only one type of thread-local access - /// exists (pre linker-relaxation), but on ELF the TLSModel used for the - /// referee will affect interpretation. - MO_TLS = 0x20 - }; -} // end namespace ARM64II - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp deleted file mode 100644 index 1a132a1..0000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp +++ /dev/null @@ -1,237 +0,0 @@ -//===-- ARM64ELFObjectWriter.cpp - ARM64 ELF Writer -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file handles ELF-specific object emission, converting LLVM's internal -// fixups into the appropriate relocations. 
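// Editor's note (not part of the original patch): as an illustration of the
// flag/fixup pipeline described above, a GOT-indirect load such as
//
//   adrp x0, :got:var             // MO_GOT | MO_PAGE
//   ldr  x0, [x0, :got_lo12:var]  // MO_GOT | MO_PAGEOFF | MO_NC
//
// becomes fixup_arm64_pcrel_adrp_imm21 and fixup_arm64_ldst_imm12_scale8,
// which GetRelocType below maps to R_AARCH64_ADR_GOT_PAGE and
// R_AARCH64_LD64_GOT_LO12_NC respectively.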
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/ARM64FixupKinds.h"
-#include "MCTargetDesc/ARM64MCExpr.h"
-#include "MCTargetDesc/ARM64MCTargetDesc.h"
-#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/ErrorHandling.h"
-
-using namespace llvm;
-
-namespace {
-class ARM64ELFObjectWriter : public MCELFObjectTargetWriter {
-public:
-  ARM64ELFObjectWriter(uint8_t OSABI);
-
-  virtual ~ARM64ELFObjectWriter();
-
-protected:
-  unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
-                        bool IsPCRel) const override;
-
-private:
-};
-}
-
-ARM64ELFObjectWriter::ARM64ELFObjectWriter(uint8_t OSABI)
-    : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
-                              /*HasRelocationAddend*/ true) {}
-
-ARM64ELFObjectWriter::~ARM64ELFObjectWriter() {}
-
-unsigned ARM64ELFObjectWriter::GetRelocType(const MCValue &Target,
-                                            const MCFixup &Fixup,
-                                            bool IsPCRel) const {
-  ARM64MCExpr::VariantKind RefKind =
-      static_cast<ARM64MCExpr::VariantKind>(Target.getRefKind());
-  ARM64MCExpr::VariantKind SymLoc = ARM64MCExpr::getSymbolLoc(RefKind);
-  bool IsNC = ARM64MCExpr::isNotChecked(RefKind);
-
-  assert((!Target.getSymA() ||
-          Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) &&
-         "Should only be expression-level modifiers here");
-
-  assert((!Target.getSymB() ||
-          Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) &&
-         "Should only be expression-level modifiers here");
-
-  if (IsPCRel) {
-    switch ((unsigned)Fixup.getKind()) {
-    case FK_Data_2:
-      return ELF::R_AARCH64_PREL16;
-    case FK_Data_4:
-      return ELF::R_AARCH64_PREL32;
-    case FK_Data_8:
-      return ELF::R_AARCH64_PREL64;
-    case ARM64::fixup_arm64_pcrel_adr_imm21:
-      llvm_unreachable("No ELF relocations supported for ADR at the moment");
-    case ARM64::fixup_arm64_pcrel_adrp_imm21:
-      if (SymLoc == ARM64MCExpr::VK_ABS && !IsNC)
-        return ELF::R_AARCH64_ADR_PREL_PG_HI21;
-      if (SymLoc == ARM64MCExpr::VK_GOT && !IsNC)
-        return ELF::R_AARCH64_ADR_GOT_PAGE;
-      if (SymLoc == ARM64MCExpr::VK_GOTTPREL && !IsNC)
-        return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
-      if (SymLoc == ARM64MCExpr::VK_TLSDESC && !IsNC)
-        return ELF::R_AARCH64_TLSDESC_ADR_PAGE;
-      llvm_unreachable("invalid symbol kind for ADRP relocation");
-    case ARM64::fixup_arm64_pcrel_branch26:
-      return ELF::R_AARCH64_JUMP26;
-    case ARM64::fixup_arm64_pcrel_call26:
-      return ELF::R_AARCH64_CALL26;
-    case ARM64::fixup_arm64_pcrel_imm19:
-      return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19;
-    default:
-      llvm_unreachable("Unsupported pc-relative fixup kind");
-    }
-  } else {
-    switch ((unsigned)Fixup.getKind()) {
-    case FK_Data_2:
-      return ELF::R_AARCH64_ABS16;
-    case FK_Data_4:
-      return ELF::R_AARCH64_ABS32;
-    case FK_Data_8:
-      return ELF::R_AARCH64_ABS64;
-    case ARM64::fixup_arm64_add_imm12:
-      if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
-        return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
-        return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12;
-      if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
-        return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
-        return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12;
-      if (SymLoc == ARM64MCExpr::VK_TLSDESC && IsNC)
-        return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
-        return ELF::R_AARCH64_ADD_ABS_LO12_NC;
-
-      report_fatal_error("invalid fixup for add (uimm12) instruction");
-      return 0;
-    case ARM64::fixup_arm64_ldst_imm12_scale1:
-      if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
return ELF::R_AARCH64_LDST8_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; - - report_fatal_error("invalid fixup for 8-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale2: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST16_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; - - report_fatal_error("invalid fixup for 16-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale4: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST32_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; - - report_fatal_error("invalid fixup for 32-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale8: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST64_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_GOT && IsNC) - return ELF::R_AARCH64_LD64_GOT_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_GOTTPREL && IsNC) - return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TLSDESC && IsNC) - return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; - - report_fatal_error("invalid fixup for 64-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale16: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST128_ABS_LO12_NC; - - report_fatal_error("invalid fixup for 128-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_movw: - if (RefKind == ARM64MCExpr::VK_ABS_G3) - return ELF::R_AARCH64_MOVW_UABS_G3; - if (RefKind == ARM64MCExpr::VK_ABS_G2) - return ELF::R_AARCH64_MOVW_UABS_G2; - if (RefKind == ARM64MCExpr::VK_ABS_G2_NC) - return ELF::R_AARCH64_MOVW_UABS_G2_NC; - if (RefKind == ARM64MCExpr::VK_ABS_G1) - return ELF::R_AARCH64_MOVW_UABS_G1; - if (RefKind == ARM64MCExpr::VK_ABS_G1_NC) - return ELF::R_AARCH64_MOVW_UABS_G1_NC; - if (RefKind == ARM64MCExpr::VK_ABS_G0) - return ELF::R_AARCH64_MOVW_UABS_G0; - if (RefKind == ARM64MCExpr::VK_ABS_G0_NC) - return ELF::R_AARCH64_MOVW_UABS_G0_NC; - if (RefKind == ARM64MCExpr::VK_DTPREL_G2) - return 
ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; - if (RefKind == ARM64MCExpr::VK_DTPREL_G1) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1; - if (RefKind == ARM64MCExpr::VK_DTPREL_G1_NC) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC; - if (RefKind == ARM64MCExpr::VK_DTPREL_G0) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0; - if (RefKind == ARM64MCExpr::VK_DTPREL_G0_NC) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC; - if (RefKind == ARM64MCExpr::VK_TPREL_G2) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2; - if (RefKind == ARM64MCExpr::VK_TPREL_G1) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1; - if (RefKind == ARM64MCExpr::VK_TPREL_G1_NC) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC; - if (RefKind == ARM64MCExpr::VK_TPREL_G0) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0; - if (RefKind == ARM64MCExpr::VK_TPREL_G0_NC) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC; - if (RefKind == ARM64MCExpr::VK_GOTTPREL_G1) - return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; - if (RefKind == ARM64MCExpr::VK_GOTTPREL_G0_NC) - return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; - report_fatal_error("invalid fixup for movz/movk instruction"); - return 0; - case ARM64::fixup_arm64_tlsdesc_call: - return ELF::R_AARCH64_TLSDESC_CALL; - default: - llvm_unreachable("Unknown ELF relocation type"); - } - } - - llvm_unreachable("Unimplemented fixup -> relocation"); -} - -MCObjectWriter *llvm::createARM64ELFObjectWriter(raw_ostream &OS, - uint8_t OSABI) { - MCELFObjectTargetWriter *MOTW = new ARM64ELFObjectWriter(OSABI); - return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp deleted file mode 100644 index 97a3493..0000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp +++ /dev/null @@ -1,158 +0,0 @@ -//===- lib/MC/ARM64ELFStreamer.cpp - ELF Object Output for ARM64 ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file assembles .s files and emits AArch64 ELF .o object files. Different -// from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit -// regions of data and code. -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCELFStreamer.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Twine.h" -#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCELF.h" -#include "llvm/MC/MCELFStreamer.h" -#include "llvm/MC/MCELFSymbolFlags.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCObjectStreamer.h" -#include "llvm/MC/MCSection.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ELF.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { - -/// Extend the generic ELFStreamer class so that it can emit mapping symbols at -/// the appropriate points in the object files. 
These symbols are defined in the -/// AArch64 ELF ABI: -/// infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf -/// -/// In brief: $x or $d should be emitted at the start of each contiguous region -/// of A64 code or data in a section. In practice, this emission does not rely -/// on explicit assembler directives but on inherent properties of the -/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an -/// instruction). -/// -/// As a result this system is orthogonal to the DataRegion infrastructure used -/// by MachO. Beware! -class ARM64ELFStreamer : public MCELFStreamer { -public: - ARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, - MCCodeEmitter *Emitter) - : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0), - LastEMS(EMS_None) {} - - ~ARM64ELFStreamer() {} - - virtual void ChangeSection(const MCSection *Section, - const MCExpr *Subsection) { - // We have to keep track of the mapping symbol state of any sections we - // use. Each one should start off as EMS_None, which is provided as the - // default constructor by DenseMap::lookup. - LastMappingSymbols[getPreviousSection().first] = LastEMS; - LastEMS = LastMappingSymbols.lookup(Section); - - MCELFStreamer::ChangeSection(Section, Subsection); - } - - /// This function is the one used to emit instruction data into the ELF - /// streamer. We override it to add the appropriate mapping symbol if - /// necessary. - virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) { - EmitA64MappingSymbol(); - MCELFStreamer::EmitInstruction(Inst, STI); - } - - /// This is one of the functions used to emit data into an ELF section, so the - /// ARM64 streamer overrides it to add the appropriate mapping symbol ($d) - /// if necessary. - virtual void EmitBytes(StringRef Data) { - EmitDataMappingSymbol(); - MCELFStreamer::EmitBytes(Data); - } - - /// This is one of the functions used to emit data into an ELF section, so the - /// ARM64 streamer overrides it to add the appropriate mapping symbol ($d) - /// if necessary. - virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) { - EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size); - } - -private: - enum ElfMappingSymbol { - EMS_None, - EMS_A64, - EMS_Data - }; - - void EmitDataMappingSymbol() { - if (LastEMS == EMS_Data) - return; - EmitMappingSymbol("$d"); - LastEMS = EMS_Data; - } - - void EmitA64MappingSymbol() { - if (LastEMS == EMS_A64) - return; - EmitMappingSymbol("$x"); - LastEMS = EMS_A64; - } - - void EmitMappingSymbol(StringRef Name) { - MCSymbol *Start = getContext().CreateTempSymbol(); - EmitLabel(Start); - - MCSymbol *Symbol = getContext().GetOrCreateSymbol( - Name + "." 
+ Twine(MappingSymbolCounter++)); - - MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); - MCELF::SetType(SD, ELF::STT_NOTYPE); - MCELF::SetBinding(SD, ELF::STB_LOCAL); - SD.setExternal(false); - Symbol->setSection(*getCurrentSection().first); - - const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext()); - Symbol->setVariableValue(Value); - } - - int64_t MappingSymbolCounter; - - DenseMap LastMappingSymbols; - ElfMappingSymbol LastEMS; - - /// @} -}; -} - -namespace llvm { -MCELFStreamer *createARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - bool RelaxAll, bool NoExecStack) { - ARM64ELFStreamer *S = new ARM64ELFStreamer(Context, TAB, OS, Emitter); - if (RelaxAll) - S->getAssembler().setRelaxAll(true); - if (NoExecStack) - S->getAssembler().setNoExecStack(true); - return S; -} -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h b/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h deleted file mode 100644 index 72dadbc..0000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h +++ /dev/null @@ -1,26 +0,0 @@ -//===-- ARM64ELFStreamer.h - ELF Streamer for ARM64 -------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements ELF streamer information for the ARM64 backend. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_AARCH64_ELF_STREAMER_H -#define LLVM_AARCH64_ELF_STREAMER_H - -#include "llvm/MC/MCELFStreamer.h" - -namespace llvm { - -MCELFStreamer *createARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - bool RelaxAll, bool NoExecStack); -} - -#endif // ARM64_ELF_STREAMER_H diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h b/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h deleted file mode 100644 index 02eb91f..0000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h +++ /dev/null @@ -1,72 +0,0 @@ -//===-- ARM64FixupKinds.h - ARM64 Specific Fixup Entries --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ARM64FIXUPKINDS_H -#define LLVM_ARM64FIXUPKINDS_H - -#include "llvm/MC/MCFixup.h" - -namespace llvm { -namespace ARM64 { - -enum Fixups { - // fixup_arm64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into - // an ADR instruction. - fixup_arm64_pcrel_adr_imm21 = FirstTargetFixupKind, - - // fixup_arm64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into - // an ADRP instruction. - fixup_arm64_pcrel_adrp_imm21, - - // fixup_arm64_imm12 - 12-bit fixup for add/sub instructions. - // No alignment adjustment. All value bits are encoded. - fixup_arm64_add_imm12, - - // fixup_arm64_ldst_imm12_* - unsigned 12-bit fixups for load and - // store instructions. - fixup_arm64_ldst_imm12_scale1, - fixup_arm64_ldst_imm12_scale2, - fixup_arm64_ldst_imm12_scale4, - fixup_arm64_ldst_imm12_scale8, - fixup_arm64_ldst_imm12_scale16, - - // FIXME: comment - fixup_arm64_movw, - - // fixup_arm64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative - // immediate. 
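[Aside: the streamer deleted above emits $x/$d lazily, producing a mapping symbol only when the region kind changes, and saves/restores per-section state in ChangeSection. A self-contained sketch of that state machine, with plain standard containers standing in for the MC classes:]

    #include <iostream>
    #include <map>
    #include <string>

    // Three-state tracking, mirroring ElfMappingSymbol in the streamer.
    enum ElfMappingSymbol { EMS_None, EMS_A64, EMS_Data };

    class MappingTracker {
      std::map<std::string, ElfMappingSymbol> LastMappingSymbols; // per section
      ElfMappingSymbol LastEMS = EMS_None;
      int Counter = 0;

    public:
      void changeSection(const std::string &Old, const std::string &New) {
        LastMappingSymbols[Old] = LastEMS; // save state of the old section
        LastEMS = LastMappingSymbols[New]; // restore; defaults to EMS_None
      }
      void emitInstruction() { // e.g. "add x0, x0, x0"
        if (LastEMS != EMS_A64) {
          std::cout << "$x." << Counter++ << ":\n";
          LastEMS = EMS_A64;
        }
      }
      void emitData() { // e.g. ".byte"
        if (LastEMS != EMS_Data) {
          std::cout << "$d." << Counter++ << ":\n";
          LastEMS = EMS_Data;
        }
      }
    };

[std::map's default-constructed value is 0 (EMS_None), which plays the same role as the DenseMap::lookup default noted in the comment above.]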
- fixup_arm64_pcrel_branch14, - - // fixup_arm64_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative - // immediate. Same encoding as fixup_arm64_pcrel_adrhi, except this - // is not used as part of a lo/hi pair and thus generates relocations - // directly when necessary. - fixup_arm64_pcrel_imm19, - - // fixup_arm64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative - // immediate. - fixup_arm64_pcrel_branch26, - - // fixup_arm64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative - // immediate. Distinguished from branch26 only on ELF. - fixup_arm64_pcrel_call26, - - // fixup_arm64_tlsdesc_call - zero-space placeholder for the ELF - // R_AARCH64_TLSDESC_CALL relocation. - fixup_arm64_tlsdesc_call, - - // Marker - LastTargetFixupKind, - NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind -}; - -} // end namespace ARM64 -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp deleted file mode 100644 index 97e0d3c..0000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp +++ /dev/null @@ -1,92 +0,0 @@ -//===-- ARM64MCAsmInfo.cpp - ARM64 asm properties -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the declarations of the ARM64MCAsmInfo properties. -// -//===----------------------------------------------------------------------===// - -#include "ARM64MCAsmInfo.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/CommandLine.h" -using namespace llvm; - -enum AsmWriterVariantTy { - Default = -1, - Generic = 0, - Apple = 1 -}; - -static cl::opt AsmWriterVariant( - "arm64-neon-syntax", cl::init(Default), - cl::desc("Choose style of NEON code to emit from ARM64 backend:"), - cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"), - clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"), - clEnumValEnd)); - -ARM64MCAsmInfoDarwin::ARM64MCAsmInfoDarwin() { - // We prefer NEON instructions to be printed in the short form. - AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant; - - PrivateGlobalPrefix = "L"; - SeparatorString = "%%"; - CommentString = ";"; - PointerSize = CalleeSaveStackSlotSize = 8; - - AlignmentIsInBytes = false; - UsesELFSectionDirectiveForBSS = true; - SupportsDebugInformation = true; - UseDataRegionDirectives = true; - - ExceptionsType = ExceptionHandling::DwarfCFI; -} - -const MCExpr *ARM64MCAsmInfoDarwin::getExprForPersonalitySymbol( - const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const { - // On Darwin, we can reference dwarf symbols with foo@GOT-., which - // is an indirect pc-relative reference. The default implementation - // won't reference using the GOT, so we need this target-specific - // version. - MCContext &Context = Streamer.getContext(); - const MCExpr *Res = - MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context); - MCSymbol *PCSym = Context.CreateTempSymbol(); - Streamer.EmitLabel(PCSym); - const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context); - return MCBinaryExpr::CreateSub(Res, PC, Context); -} - -ARM64MCAsmInfoELF::ARM64MCAsmInfoELF() { - // We prefer NEON instructions to be printed in the short form. 
- AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant; - - PointerSize = 8; - - // ".comm align is in bytes but .align is pow-2." - AlignmentIsInBytes = false; - - CommentString = "//"; - PrivateGlobalPrefix = ".L"; - Code32Directive = ".code\t32"; - - Data16bitsDirective = "\t.hword\t"; - Data32bitsDirective = "\t.word\t"; - Data64bitsDirective = "\t.xword\t"; - - UseDataRegionDirectives = false; - - WeakRefDirective = "\t.weak\t"; - - HasLEB128 = true; - SupportsDebugInformation = true; - - // Exceptions handling - ExceptionsType = ExceptionHandling::DwarfCFI; -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h deleted file mode 100644 index f2d33a7..0000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h +++ /dev/null @@ -1,36 +0,0 @@ -//=====-- ARM64MCAsmInfo.h - ARM64 asm properties -----------*- C++ -*--====// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the declaration of the ARM64MCAsmInfo class. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64TARGETASMINFO_H -#define ARM64TARGETASMINFO_H - -#include "llvm/MC/MCAsmInfoDarwin.h" - -namespace llvm { -class Target; -class StringRef; -class MCStreamer; -struct ARM64MCAsmInfoDarwin : public MCAsmInfoDarwin { - explicit ARM64MCAsmInfoDarwin(); - virtual const MCExpr *getExprForPersonalitySymbol(const MCSymbol *Sym, - unsigned Encoding, - MCStreamer &Streamer) const; -}; - -struct ARM64MCAsmInfoELF : public MCAsmInfo { - explicit ARM64MCAsmInfoELF(); -}; - -} // namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp deleted file mode 100644 index 19559f8..0000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp +++ /dev/null @@ -1,563 +0,0 @@ -//===-- ARM64/ARM64MCCodeEmitter.cpp - Convert ARM64 code to machine code -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64MCCodeEmitter class. 
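[Aside: the ELF AsmInfo above routes 2-, 4- and 8-byte data through .hword/.word/.xword. A small sketch of the size-to-directive choice an emitter makes with those strings; the emit helper is hypothetical, the directive spellings are the ones configured above:]

    #include <cstdint>
    #include <iostream>
    #include <string>

    // Directive strings as configured in ARM64MCAsmInfoELF above.
    std::string directiveForSize(unsigned Bytes) {
      switch (Bytes) {
      case 2: return "\t.hword\t";
      case 4: return "\t.word\t";
      case 8: return "\t.xword\t";
      default: return "\t.byte\t";
      }
    }

    void emitData(uint64_t Value, unsigned Bytes) {
      std::cout << directiveForSize(Bytes) << Value << "\n";
    }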
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "mccodeemitter" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "MCTargetDesc/ARM64BaseInfo.h" -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); -STATISTIC(MCNumFixups, "Number of MC fixups created."); - -namespace { - -class ARM64MCCodeEmitter : public MCCodeEmitter { - MCContext &Ctx; - - ARM64MCCodeEmitter(const ARM64MCCodeEmitter &); // DO NOT IMPLEMENT - void operator=(const ARM64MCCodeEmitter &); // DO NOT IMPLEMENT -public: - ARM64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti, - MCContext &ctx) - : Ctx(ctx) {} - - ~ARM64MCCodeEmitter() {} - - // getBinaryCodeForInstr - TableGen'erated function for getting the - // binary encoding for an instruction. - uint64_t getBinaryCodeForInstr(const MCInst &MI, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMachineOpValue - Return binary encoding of operand. If the machine - /// operand requires relocation, record the relocation and return zero. - unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getAMIndexed8OpValue - Return encoding info for base register - /// and 12-bit unsigned immediate attached to a load, store or prfm - /// instruction. If operand requires a relocation, record it and - /// return zero in that part of the encoding. - template - uint32_t getAMIndexed8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label - /// target. - uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and - /// the 2-bit shift field. - uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getCondBranchTargetOpValue - Return the encoded value for a conditional - /// branch target. - uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- - /// branch target. - uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getBranchTargetOpValue - Return the encoded value for an unconditional - /// branch target. - uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMoveWideImmOpValue - Return the encoded value for the immediate operand - /// of a MOVZ or MOVK instruction. - uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getVecShifterOpValue - Return the encoded value for the vector shifter. 
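[Aside: nearly every get*OpValue declaration above follows one pattern: if the operand is already an immediate, encode it directly; if it is a relocatable expression, record a fixup and encode zero so the fixup can fill the field in later. A reduced sketch with stand-in operand and fixup types:]

    #include <cstdint>
    #include <vector>

    // Stand-ins for MCOperand/MCFixup, enough to show the recurring pattern.
    struct Operand { bool IsImm; int64_t Imm; /* else: a relocatable expr */ };
    struct Fixup { unsigned Kind; };

    uint32_t encodeOperand(const Operand &MO, unsigned FixupKind,
                           std::vector<Fixup> &Fixups) {
      if (MO.IsImm) // value known at encode time
        return static_cast<uint32_t>(MO.Imm);
      Fixups.push_back({FixupKind}); // else leave 0; the fixup carries the info
      return 0;
    }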
- uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMoveVecShifterOpValue - Return the encoded value for the vector move - /// shifter (MSL). - uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getFixedPointScaleOpValue - Return the encoded value for the - // FP-to-fixed-point scale factor. - uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getSIMDShift64OpValue - Return the encoded value for the - // shift-by-immediate AdvSIMD instructions. - uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; - - void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; } - - void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const { - // Output the constant in little endian byte order. - for (unsigned i = 0; i != Size; ++i) { - EmitByte(Val & 255, OS); - Val >>= 8; - } - } - - void EncodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; -}; - -} // end anonymous namespace - -MCCodeEmitter *llvm::createARM64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new ARM64MCCodeEmitter(MCII, STI, Ctx); -} - -/// getMachineOpValue - Return binary encoding of operand. If the machine -/// operand requires relocation, record the relocation and return zero. 
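[Aside: EmitConstant above serializes each 32-bit instruction word least-significant byte first, matching the little-endian object files this backend writes. The same loop in standalone form:]

    #include <cstdint>
    #include <ostream>

    // Least significant byte first, as in ARM64MCCodeEmitter::EmitConstant.
    void emitConstant(uint64_t Val, unsigned Size, std::ostream &OS) {
      for (unsigned i = 0; i != Size; ++i) {
        OS.put(static_cast<char>(Val & 0xff));
        Val >>= 8;
      }
    }
    // e.g. emitConstant(0xd503201f, 4, OS) - the AArch64 NOP encoding -
    // writes the byte sequence 1f 20 03 d5.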
-unsigned -ARM64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (MO.isReg()) - return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); - else { - assert(MO.isImm() && "did not expect relocated expression"); - return static_cast(MO.getImm()); - } - - assert(0 && "Unable to encode MCOperand!"); - return 0; -} - -template -uint32_t -ARM64MCCodeEmitter::getAMIndexed8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - unsigned BaseReg = MI.getOperand(OpIdx).getReg(); - BaseReg = Ctx.getRegisterInfo()->getEncodingValue(BaseReg); - - const MCOperand &MO = MI.getOperand(OpIdx + 1); - uint32_t ImmVal = 0; - - if (MO.isImm()) - ImmVal = static_cast(MO.getImm()); - else { - assert(MO.isExpr() && "unable to encode load/store imm operand"); - MCFixupKind Kind = MCFixupKind(FixupKind); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - ++MCNumFixups; - } - - return BaseReg | (ImmVal << 5); -} - -/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label -/// target. -uint32_t -ARM64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected ADR target type!"); - const MCExpr *Expr = MO.getExpr(); - - MCFixupKind Kind = MI.getOpcode() == ARM64::ADR - ? MCFixupKind(ARM64::fixup_arm64_pcrel_adr_imm21) - : MCFixupKind(ARM64::fixup_arm64_pcrel_adrp_imm21); - Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); - - MCNumFixups += 1; - - // All of the information is in the fixup. - return 0; -} - -/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and -/// the 2-bit shift field. The shift field is stored in bits 13-14 of the -/// return value. -uint32_t -ARM64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - // Suboperands are [imm, shifter]. - const MCOperand &MO = MI.getOperand(OpIdx); - const MCOperand &MO1 = MI.getOperand(OpIdx + 1); - assert(ARM64_AM::getShiftType(MO1.getImm()) == ARM64_AM::LSL && - "unexpected shift type for add/sub immediate"); - unsigned ShiftVal = ARM64_AM::getShiftValue(MO1.getImm()); - assert((ShiftVal == 0 || ShiftVal == 12) && - "unexpected shift value for add/sub immediate"); - if (MO.isImm()) - return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << 12)); - assert(MO.isExpr() && "Unable to encode MCOperand!"); - const MCExpr *Expr = MO.getExpr(); - assert(ShiftVal == 0 && "shift not allowed on add/sub immediate with fixup"); - - // Encode the 12 bits of the fixup. - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_add_imm12); - Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); - - ++MCNumFixups; - - return 0; -} - -/// getCondBranchTargetOpValue - Return the encoded value for a conditional -/// branch target. -uint32_t ARM64MCCodeEmitter::getCondBranchTargetOpValue( - const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. 
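[Aside: getAddSubImmOpValue above packs the 12-bit immediate together with the LSL #0/#12 selector bit. A tiny sketch of that packing, with the field layout taken from the code above:]

    #include <cassert>
    #include <cstdint>

    // Bit 12 selects the shifted (LSL #12) form of the add/sub immediate.
    uint32_t encodeAddSubImm(uint32_t Imm12, unsigned ShiftVal) {
      assert(Imm12 < (1u << 12) && "immediate does not fit in 12 bits");
      assert((ShiftVal == 0 || ShiftVal == 12) && "unexpected shift value");
      return Imm12 | (ShiftVal == 0 ? 0 : (1u << 12));
    }
    // e.g. encodeAddSubImm(0x123, 12) == 0x1123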
- if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected target type!"); - - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_imm19); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. - return 0; -} - -uint32_t -ARM64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected movz/movk immediate"); - - Fixups.push_back(MCFixup::Create( - 0, MO.getExpr(), MCFixupKind(ARM64::fixup_arm64_movw), MI.getLoc())); - - ++MCNumFixups; - - return 0; -} - -/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- -/// branch target. -uint32_t ARM64MCCodeEmitter::getTestBranchTargetOpValue( - const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected ADR target type!"); - - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_branch14); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. - return 0; -} - -/// getBranchTargetOpValue - Return the encoded value for an unconditional -/// branch target. -uint32_t -ARM64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected ADR target type!"); - - MCFixupKind Kind = MI.getOpcode() == ARM64::BL - ? MCFixupKind(ARM64::fixup_arm64_pcrel_call26) - : MCFixupKind(ARM64::fixup_arm64_pcrel_branch26); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. 
- return 0; -} - -/// getVecShifterOpValue - Return the encoded value for the vector shifter: -/// -/// 00 -> 0 -/// 01 -> 8 -/// 10 -> 16 -/// 11 -> 24 -uint32_t -ARM64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - - switch (MO.getImm()) { - default: - break; - case 0: - return 0; - case 8: - return 1; - case 16: - return 2; - case 24: - return 3; - } - - assert(false && "Invalid value for vector shift amount!"); - return 0; -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 64 - (MO.getImm()); -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 64 - (MO.getImm() | 32); -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 32 - (MO.getImm() | 16); -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 16 - (MO.getImm() | 8); -} - -/// getFixedPointScaleOpValue - Return the encoded value for the -// FP-to-fixed-point scale factor. 
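[Aside: getVecShifterOpValue above maps the byte-granular shift amounts 0/8/16/24 onto the 2-bit field values 0-3, which is simply division by eight:]

    #include <cassert>
    #include <cstdint>

    // 0 -> 0, 8 -> 1, 16 -> 2, 24 -> 3, as in the switch above.
    uint32_t encodeVecShifter(uint64_t Imm) {
      assert(Imm % 8 == 0 && Imm <= 24 && "invalid vector shift amount");
      return static_cast<uint32_t>(Imm / 8);
    }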
-uint32_t ARM64MCCodeEmitter::getFixedPointScaleOpValue( - const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 64 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 64 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 32 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 16 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 8 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 64; -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 32; -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 16; -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 8; -} - -/// getMoveVecShifterOpValue - Return the encoded value for the vector move -/// shifter (MSL). -uint32_t -ARM64MCCodeEmitter::getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && - "Expected an immediate value for the move shift amount!"); - unsigned ShiftVal = ARM64_AM::getShiftValue(MO.getImm()); - assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!"); - return ShiftVal == 8 ? 
0 : 1; -} - -unsigned ARM64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const { - // If one of the signed fixup kinds is applied to a MOVZ instruction, the - // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's - // job to ensure that any bits possibly affected by this are 0. This means we - // must zero out bit 30 (essentially emitting a MOVN). - MCOperand UImm16MO = MI.getOperand(1); - - // Nothing to do if there's no fixup. - if (UImm16MO.isImm()) - return EncodedValue; - - return EncodedValue & ~(1u << 30); -} - -void ARM64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (MI.getOpcode() == ARM64::TLSDESCCALL) { - // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the - // following (BLR) instruction. It doesn't emit any code itself so it - // doesn't go through the normal TableGenerated channels. - MCFixupKind Fixup = MCFixupKind(ARM64::fixup_arm64_tlsdesc_call); - Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup)); - return; - } - - uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); - EmitConstant(Binary, 4, OS); - ++MCNumEmitted; // Keep track of the # of mi's emitted. -} - -#include "ARM64GenMCCodeEmitter.inc" diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp deleted file mode 100644 index d4ab140..0000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp +++ /dev/null @@ -1,168 +0,0 @@ -//===-- ARM64MCExpr.cpp - ARM64 specific MC expression classes --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of the assembly expression modifiers -// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...). 
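[Aside: fixMOVZ above handles the case where a signed TLS fixup may later turn a MOVZ into a MOVN; the emitter pre-clears bit 30 so the fixup can rewrite the instruction either way. Reduced to its essentials:]

    #include <cstdint>

    // With a pending fixup, bit 30 must be zero so the final value can
    // become either a MOVZ or a MOVN; a plain immediate needs no change.
    uint32_t clearMovzOpcBit(uint32_t EncodedValue, bool HasFixup) {
      if (!HasFixup)
        return EncodedValue;
      return EncodedValue & ~(1u << 30);
    }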
-// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "aarch64symbolrefexpr" -#include "ARM64MCExpr.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCELF.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Object/ELF.h" -#include "llvm/Support/ErrorHandling.h" - -using namespace llvm; - -const ARM64MCExpr *ARM64MCExpr::Create(const MCExpr *Expr, VariantKind Kind, - MCContext &Ctx) { - return new (Ctx) ARM64MCExpr(Expr, Kind); -} - -StringRef ARM64MCExpr::getVariantKindName() const { - switch (static_cast(getKind())) { - case VK_CALL: return ""; - case VK_LO12: return ":lo12:"; - case VK_ABS_G3: return ":abs_g3:"; - case VK_ABS_G2: return ":abs_g2:"; - case VK_ABS_G2_NC: return ":abs_g2_nc:"; - case VK_ABS_G1: return ":abs_g1:"; - case VK_ABS_G1_NC: return ":abs_g1_nc:"; - case VK_ABS_G0: return ":abs_g0:"; - case VK_ABS_G0_NC: return ":abs_g0_nc:"; - case VK_DTPREL_G2: return ":dtprel_g2:"; - case VK_DTPREL_G1: return ":dtprel_g1:"; - case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:"; - case VK_DTPREL_G0: return ":dtprel_g0:"; - case VK_DTPREL_G0_NC: return ":dtprel_g0_nc:"; - case VK_DTPREL_LO12: return ":dtprel_lo12:"; - case VK_DTPREL_LO12_NC: return ":dtprel_lo12_nc:"; - case VK_TPREL_G2: return ":tprel_g2:"; - case VK_TPREL_G1: return ":tprel_g1:"; - case VK_TPREL_G1_NC: return ":tprel_g1_nc:"; - case VK_TPREL_G0: return ":tprel_g0:"; - case VK_TPREL_G0_NC: return ":tprel_g0_nc:"; - case VK_TPREL_LO12: return ":tprel_lo12:"; - case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:"; - case VK_TLSDESC_LO12: return ":tlsdesc_lo12:"; - case VK_ABS_PAGE: return ""; - case VK_GOT_PAGE: return ":got:"; - case VK_GOT_LO12: return ":got_lo12:"; - case VK_GOTTPREL_PAGE: return ":gottprel:"; - case VK_GOTTPREL_LO12_NC: return ":gottprel_lo12:"; - case VK_GOTTPREL_G1: return ":gottprel_g1:"; - case VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:"; - case VK_TLSDESC: return ""; - case VK_TLSDESC_PAGE: return ":tlsdesc:"; - default: - llvm_unreachable("Invalid ELF symbol kind"); - } -} - -void ARM64MCExpr::PrintImpl(raw_ostream &OS) const { - if (getKind() != VK_NONE) - OS << getVariantKindName(); - OS << *Expr; -} - -// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps -// that method should be made public? -// FIXME: really do above: now that two backends are using it. 
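[Aside: getVariantKindName above is a straight kind-to-spelling mapping. The same idea as a lookup table, with a handful of the spellings from the switch and stand-in kind values:]

    #include <map>
    #include <string>

    // A few of the modifier spellings from getVariantKindName above.
    enum VariantKind { VK_LO12, VK_GOT_PAGE, VK_GOT_LO12, VK_TPREL_G1 };

    std::string variantKindName(VariantKind K) {
      static const std::map<VariantKind, std::string> Names = {
          {VK_LO12, ":lo12:"},
          {VK_GOT_PAGE, ":got:"},
          {VK_GOT_LO12, ":got_lo12:"},
          {VK_TPREL_G1, ":tprel_g1:"},
      };
      auto It = Names.find(K);
      return It == Names.end() ? "" : It->second;
    }

[PrintImpl above then writes this spelling immediately before the wrapped expression, which is how ":lo12:var" appears in assembly.]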
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { - switch (Value->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expr!"); - break; - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast(Value); - AddValueSymbolsImpl(BE->getLHS(), Asm); - AddValueSymbolsImpl(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: - Asm->getOrCreateSymbolData(cast(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbolsImpl(cast(Value)->getSubExpr(), Asm); - break; - } -} - -void ARM64MCExpr::AddValueSymbols(MCAssembler *Asm) const { - AddValueSymbolsImpl(getSubExpr(), Asm); -} - -const MCSection *ARM64MCExpr::FindAssociatedSection() const { - llvm_unreachable("FIXME: what goes here?"); -} - -bool ARM64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const { - if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout)) - return false; - - Res = - MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind()); - - return true; -} - -static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) { - switch (Expr->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expression"); - break; - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast(Expr); - fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm); - fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: { - // We're known to be under a TLS fixup, so any symbol should be - // modified. There should be only one. - const MCSymbolRefExpr &SymRef = *cast(Expr); - MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol()); - MCELF::SetType(SD, ELF::STT_TLS); - break; - } - - case MCExpr::Unary: - fixELFSymbolsInTLSFixupsImpl(cast(Expr)->getSubExpr(), Asm); - break; - } -} - -void ARM64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const { - switch (getSymbolLoc(Kind)) { - default: - return; - case VK_DTPREL: - case VK_GOTTPREL: - case VK_TPREL: - case VK_TLSDESC: - break; - } - - fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h deleted file mode 100644 index a33fe43..0000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h +++ /dev/null @@ -1,162 +0,0 @@ -//=---- ARM64MCExpr.h - ARM64 specific MC expression classes ------*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file describes ARM64-specific MCExprs, used for modifiers like -// ":lo12:" or ":gottprel_g1:". -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ARM64MCEXPR_H -#define LLVM_ARM64MCEXPR_H - -#include "llvm/MC/MCExpr.h" -#include "llvm/Support/ErrorHandling.h" - -namespace llvm { - -class ARM64MCExpr : public MCTargetExpr { -public: - enum VariantKind { - VK_NONE = 0x000, - - // Symbol locations specifying (roughly speaking) what calculation should be - // performed to construct the final address for the relocated - // symbol. E.g. direct, via the GOT, ... 
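[Aside: AddValueSymbolsImpl and fixELFSymbolsInTLSFixupsImpl above share one shape: a recursive walk over an expression tree that acts on symbol references and recurses through unary and binary nodes. A stripped-down sketch of that walk over a toy tree:]

    #include <memory>
    #include <vector>

    // A toy expression tree mirroring the MCExpr shapes walked above.
    struct Expr {
      enum Kind { Constant, SymbolRef, Unary, Binary } K;
      std::vector<std::unique_ptr<Expr>> Children; // 0, 1 or 2 sub-exprs
      bool IsTLS = false;                          // only used by SymbolRef
    };

    // Mark every symbol reference under a TLS fixup, the analogue of
    // MCELF::SetType(SD, ELF::STT_TLS) in the code above.
    void markTLSSymbols(Expr &E) {
      if (E.K == Expr::SymbolRef) {
        E.IsTLS = true;
        return;
      }
      for (auto &Child : E.Children) // Unary: 1 child, Binary: 2
        markTLSSymbols(*Child);
    }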
- VK_ABS = 0x001, - VK_SABS = 0x002, - VK_GOT = 0x003, - VK_DTPREL = 0x004, - VK_GOTTPREL = 0x005, - VK_TPREL = 0x006, - VK_TLSDESC = 0x007, - VK_SymLocBits = 0x00f, - - // Variants specifying which part of the final address calculation is - // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a - // MOVZ/MOVK. - VK_PAGE = 0x010, - VK_PAGEOFF = 0x020, - VK_G0 = 0x030, - VK_G1 = 0x040, - VK_G2 = 0x050, - VK_G3 = 0x060, - VK_AddressFragBits = 0x0f0, - - // Whether the final relocation is a checked one (where a linker should - // perform a range-check on the final address) or not. Note that this field - // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12: - // on its own is a non-checked relocation. We side with ELF on being - // explicit about this! - VK_NC = 0x100, - - // Convenience definitions for referring to specific textual representations - // of relocation specifiers. Note that this means the "_NC" is sometimes - // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC - // since a user would write ":lo12:"). - VK_CALL = VK_ABS, - VK_ABS_PAGE = VK_ABS | VK_PAGE, - VK_ABS_G3 = VK_ABS | VK_G3, - VK_ABS_G2 = VK_ABS | VK_G2, - VK_ABS_G2_NC = VK_ABS | VK_G2 | VK_NC, - VK_ABS_G1 = VK_ABS | VK_G1, - VK_ABS_G1_NC = VK_ABS | VK_G1 | VK_NC, - VK_ABS_G0 = VK_ABS | VK_G0, - VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC, - VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC, - VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, - VK_GOT_PAGE = VK_GOT | VK_PAGE, - VK_DTPREL_G2 = VK_DTPREL | VK_G2, - VK_DTPREL_G1 = VK_DTPREL | VK_G1, - VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC, - VK_DTPREL_G0 = VK_DTPREL | VK_G0, - VK_DTPREL_G0_NC = VK_DTPREL | VK_G0 | VK_NC, - VK_DTPREL_LO12 = VK_DTPREL | VK_PAGEOFF, - VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC, - VK_GOTTPREL_PAGE = VK_GOTTPREL | VK_PAGE, - VK_GOTTPREL_LO12_NC = VK_GOTTPREL | VK_PAGEOFF | VK_NC, - VK_GOTTPREL_G1 = VK_GOTTPREL | VK_G1, - VK_GOTTPREL_G0_NC = VK_GOTTPREL | VK_G0 | VK_NC, - VK_TPREL_G2 = VK_TPREL | VK_G2, - VK_TPREL_G1 = VK_TPREL | VK_G1, - VK_TPREL_G1_NC = VK_TPREL | VK_G1 | VK_NC, - VK_TPREL_G0 = VK_TPREL | VK_G0, - VK_TPREL_G0_NC = VK_TPREL | VK_G0 | VK_NC, - VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF, - VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC, - VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC, - VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE, - - VK_INVALID = 0xfff - }; - -private: - const MCExpr *Expr; - const VariantKind Kind; - - explicit ARM64MCExpr(const MCExpr *Expr, VariantKind Kind) - : Expr(Expr), Kind(Kind) {} - -public: - /// @name Construction - /// @{ - - static const ARM64MCExpr *Create(const MCExpr *Expr, VariantKind Kind, - MCContext &Ctx); - - /// @} - /// @name Accessors - /// @{ - - /// Get the kind of this expression. - VariantKind getKind() const { return static_cast(Kind); } - - /// Get the expression this modifier applies to. - const MCExpr *getSubExpr() const { return Expr; } - - /// @} - /// @name VariantKind information extractors. - /// @{ - - static VariantKind getSymbolLoc(VariantKind Kind) { - return static_cast(Kind & VK_SymLocBits); - } - - static VariantKind getAddressFrag(VariantKind Kind) { - return static_cast(Kind & VK_AddressFragBits); - } - - static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; } - - /// @} - - /// Convert the variant kind into an ELF-appropriate modifier - /// (e.g. ":got:", ":lo12:"). 
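[Aside: the VariantKind values above form a small bitfield: the low nibble carries the symbol location, the next nibble the address fragment, and bit 8 the not-checked flag, so the composed kinds decompose with plain masks. A compilable check using the constants from the enum above:]

    #include <cassert>

    enum : unsigned {
      VK_TPREL = 0x006, VK_SymLocBits = 0x00f,
      VK_G1 = 0x040, VK_AddressFragBits = 0x0f0,
      VK_NC = 0x100,
    };

    int main() {
      unsigned Kind = VK_TPREL | VK_G1 | VK_NC;     // VK_TPREL_G1_NC = 0x146
      assert((Kind & VK_SymLocBits) == VK_TPREL);   // getSymbolLoc
      assert((Kind & VK_AddressFragBits) == VK_G1); // getAddressFrag
      assert(Kind & VK_NC);                         // isNotChecked
    }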
- StringRef getVariantKindName() const; - - void PrintImpl(raw_ostream &OS) const; - - void AddValueSymbols(MCAssembler *) const; - - const MCSection *FindAssociatedSection() const; - - bool EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const; - - void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const; - - static bool classof(const MCExpr *E) { - return E->getKind() == MCExpr::Target; - } - - static bool classof(const ARM64MCExpr *) { return true; } - -}; -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp deleted file mode 100644 index 8d54412..0000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp +++ /dev/null @@ -1,167 +0,0 @@ -//===-- ARM64MCTargetDesc.cpp - ARM64 Target Descriptions -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides ARM64 specific target descriptions. -// -//===----------------------------------------------------------------------===// - -#include "ARM64MCTargetDesc.h" -#include "ARM64ELFStreamer.h" -#include "ARM64MCAsmInfo.h" -#include "InstPrinter/ARM64InstPrinter.h" -#include "llvm/MC/MCCodeGenInfo.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" - -#define GET_INSTRINFO_MC_DESC -#include "ARM64GenInstrInfo.inc" - -#define GET_SUBTARGETINFO_MC_DESC -#include "ARM64GenSubtargetInfo.inc" - -#define GET_REGINFO_MC_DESC -#include "ARM64GenRegisterInfo.inc" - -using namespace llvm; - -static MCInstrInfo *createARM64MCInstrInfo() { - MCInstrInfo *X = new MCInstrInfo(); - InitARM64MCInstrInfo(X); - return X; -} - -static MCSubtargetInfo *createARM64MCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - InitARM64MCSubtargetInfo(X, TT, CPU, FS); - return X; -} - -static MCRegisterInfo *createARM64MCRegisterInfo(StringRef Triple) { - MCRegisterInfo *X = new MCRegisterInfo(); - InitARM64MCRegisterInfo(X, ARM64::LR); - return X; -} - -static MCAsmInfo *createARM64MCAsmInfo(const MCRegisterInfo &MRI, - StringRef TT) { - Triple TheTriple(TT); - - MCAsmInfo *MAI; - if (TheTriple.isOSDarwin()) - MAI = new ARM64MCAsmInfoDarwin(); - else { - assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF"); - MAI = new ARM64MCAsmInfoELF(); - } - - // Initial state of the frame pointer is SP. - unsigned Reg = MRI.getDwarfRegNum(ARM64::SP, true); - MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0); - MAI->addInitialFrameState(Inst); - - return MAI; -} - -static MCCodeGenInfo *createARM64MCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { - Triple TheTriple(TT); - assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) && - "Only expect Darwin and ELF targets"); - - if (CM == CodeModel::Default) - CM = CodeModel::Small; - // The default MCJIT memory managers make no guarantees about where they can - // find an executable page; JITed code needs to be able to refer to globals - // no matter how far away they are. 
- else if (CM == CodeModel::JITDefault) - CM = CodeModel::Large; - else if (CM != CodeModel::Small && CM != CodeModel::Large) - report_fatal_error("Only small and large code models are allowed on ARM64"); - - // ARM64 Darwin is always PIC. - if (TheTriple.isOSDarwin()) - RM = Reloc::PIC_; - // On ELF platforms the default static relocation model has a smart enough - // linker to cope with referencing external symbols defined in a shared - // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. - else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) - RM = Reloc::Static; - - MCCodeGenInfo *X = new MCCodeGenInfo(); - X->InitMCCodeGenInfo(RM, CM, OL); - return X; -} - -static MCInstPrinter *createARM64MCInstPrinter(const Target &T, - unsigned SyntaxVariant, - const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) { - if (SyntaxVariant == 0) - return new ARM64InstPrinter(MAI, MII, MRI, STI); - if (SyntaxVariant == 1) - return new ARM64AppleInstPrinter(MAI, MII, MRI, STI); - - return 0; -} - -static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Ctx, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, bool RelaxAll, - bool NoExecStack) { - Triple TheTriple(TT); - - if (TheTriple.isOSDarwin()) - return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll, - /*LabelSections*/ true); - - return createARM64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); -} - -// Force static initialization. -extern "C" void LLVMInitializeARM64TargetMC() { - // Register the MC asm info. - RegisterMCAsmInfoFn X(TheARM64Target, createARM64MCAsmInfo); - - // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(TheARM64Target, - createARM64MCCodeGenInfo); - - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(TheARM64Target, createARM64MCInstrInfo); - - // Register the MC register info. - TargetRegistry::RegisterMCRegInfo(TheARM64Target, createARM64MCRegisterInfo); - - // Register the MC subtarget info. - TargetRegistry::RegisterMCSubtargetInfo(TheARM64Target, - createARM64MCSubtargetInfo); - - // Register the asm backend. - TargetRegistry::RegisterMCAsmBackend(TheARM64Target, createARM64AsmBackend); - - // Register the MC Code Emitter - TargetRegistry::RegisterMCCodeEmitter(TheARM64Target, - createARM64MCCodeEmitter); - - // Register the object streamer. - TargetRegistry::RegisterMCObjectStreamer(TheARM64Target, createMCStreamer); - - // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(TheARM64Target, - createARM64MCInstPrinter); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h deleted file mode 100644 index 0db2b22..0000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h +++ /dev/null @@ -1,62 +0,0 @@ -//===-- ARM64MCTargetDesc.h - ARM64 Target Descriptions ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides ARM64 specific target descriptions. 
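[Aside: createMCStreamer above reduces to one object-format decision: Darwin triples get the Mach-O streamer, everything else the ARM64 ELF streamer. A toy version of that split; the substring test is a crude stand-in for Triple::isOSDarwin():]

    #include <cassert>
    #include <string>

    enum StreamerKind { MachO, ELF };

    StreamerKind pickStreamer(const std::string &Triple) {
      bool IsDarwin = Triple.find("darwin") != std::string::npos ||
                      Triple.find("ios") != std::string::npos;
      return IsDarwin ? MachO : ELF;
    }

    int main() {
      assert(pickStreamer("arm64-apple-darwin") == MachO);
      assert(pickStreamer("arm64-linux-gnu") == ELF);
    }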
-// -//===----------------------------------------------------------------------===// - -#ifndef ARM64MCTARGETDESC_H -#define ARM64MCTARGETDESC_H - -#include "llvm/Support/DataTypes.h" -#include - -namespace llvm { -class MCAsmBackend; -class MCCodeEmitter; -class MCContext; -class MCInstrInfo; -class MCRegisterInfo; -class MCObjectWriter; -class MCSubtargetInfo; -class StringRef; -class Target; -class raw_ostream; - -extern Target TheARM64Target; - -MCCodeEmitter *createARM64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx); -MCAsmBackend *createARM64AsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); - -MCObjectWriter *createARM64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI); - -MCObjectWriter *createARM64MachObjectWriter(raw_ostream &OS, uint32_t CPUType, - uint32_t CPUSubtype); - -} // End llvm namespace - -// Defines symbolic names for ARM64 registers. This defines a mapping from -// register name to register number. -// -#define GET_REGINFO_ENUM -#include "ARM64GenRegisterInfo.inc" - -// Defines symbolic names for the ARM64 instructions. -// -#define GET_INSTRINFO_ENUM -#include "ARM64GenInstrInfo.inc" - -#define GET_SUBTARGETINFO_ENUM -#include "ARM64GenSubtargetInfo.inc" - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp deleted file mode 100644 index 1733dc5..0000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp +++ /dev/null @@ -1,396 +0,0 @@ -//===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "MCTargetDesc/ARM64MCTargetDesc.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCAsmLayout.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCFixup.h" -#include "llvm/MC/MCMachObjectWriter.h" -#include "llvm/MC/MCSectionMachO.h" -#include "llvm/MC/MCValue.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachO.h" -using namespace llvm; - -namespace { -class ARM64MachObjectWriter : public MCMachObjectTargetWriter { - bool getARM64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType, - const MCSymbolRefExpr *Sym, - unsigned &Log2Size, const MCAssembler &Asm); - -public: - ARM64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype) - : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype, - /*UseAggressiveSymbolFolding=*/true) {} - - void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm, - const MCAsmLayout &Layout, const MCFragment *Fragment, - const MCFixup &Fixup, MCValue Target, - uint64_t &FixedValue); -}; -} - -bool ARM64MachObjectWriter::getARM64FixupKindMachOInfo( - const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym, - unsigned &Log2Size, const MCAssembler &Asm) { - RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED); - Log2Size = ~0U; - - switch ((unsigned)Fixup.getKind()) { - default: - return false; - - case FK_Data_1: - Log2Size = llvm::Log2_32(1); - return true; - case FK_Data_2: - Log2Size = llvm::Log2_32(2); - return true; - case FK_Data_4: - Log2Size = llvm::Log2_32(4); - if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) - RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); - return true; - case FK_Data_8: - Log2Size = llvm::Log2_32(8); - if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) - RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); - return true; - case ARM64::fixup_arm64_add_imm12: - case ARM64::fixup_arm64_ldst_imm12_scale1: - case ARM64::fixup_arm64_ldst_imm12_scale2: - case ARM64::fixup_arm64_ldst_imm12_scale4: - case ARM64::fixup_arm64_ldst_imm12_scale8: - case ARM64::fixup_arm64_ldst_imm12_scale16: - Log2Size = llvm::Log2_32(4); - switch (Sym->getKind()) { - default: - assert(0 && "Unexpected symbol reference variant kind!"); - case MCSymbolRefExpr::VK_PAGEOFF: - RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12); - return true; - case MCSymbolRefExpr::VK_GOTPAGEOFF: - RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12); - return true; - case MCSymbolRefExpr::VK_TLVPPAGEOFF: - RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12); - return true; - } - case ARM64::fixup_arm64_pcrel_adrp_imm21: - Log2Size = llvm::Log2_32(4); - // This encompasses the relocation for the whole 21-bit value. 
- switch (Sym->getKind()) { - default: - Asm.getContext().FatalError(Fixup.getLoc(), - "ADR/ADRP relocations must be GOT relative"); - case MCSymbolRefExpr::VK_PAGE: - RelocType = unsigned(MachO::ARM64_RELOC_PAGE21); - return true; - case MCSymbolRefExpr::VK_GOTPAGE: - RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21); - return true; - case MCSymbolRefExpr::VK_TLVPPAGE: - RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21); - return true; - } - return true; - case ARM64::fixup_arm64_pcrel_branch26: - case ARM64::fixup_arm64_pcrel_call26: - Log2Size = llvm::Log2_32(4); - RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26); - return true; - } -} - -void ARM64MachObjectWriter::RecordRelocation( - MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, - uint64_t &FixedValue) { - unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); - - // See . - uint32_t FixupOffset = Layout.getFragmentOffset(Fragment); - unsigned Log2Size = 0; - int64_t Value = 0; - unsigned Index = 0; - unsigned IsExtern = 0; - unsigned Type = 0; - unsigned Kind = Fixup.getKind(); - - FixupOffset += Fixup.getOffset(); - - // ARM64 pcrel relocation addends do not include the section offset. - if (IsPCRel) - FixedValue += FixupOffset; - - // ADRP fixups use relocations for the whole symbol value and only - // put the addend in the instruction itself. Clear out any value the - // generic code figured out from the sybmol definition. - if (Kind == ARM64::fixup_arm64_pcrel_adrp_imm21 || - Kind == ARM64::fixup_arm64_pcrel_imm19) - FixedValue = 0; - - // imm19 relocations are for conditional branches, which require - // assembler local symbols. If we got here, that's not what we have, - // so complain loudly. - if (Kind == ARM64::fixup_arm64_pcrel_imm19) { - Asm.getContext().FatalError(Fixup.getLoc(), - "conditional branch requires assembler-local" - " label. '" + - Target.getSymA()->getSymbol().getName() + - "' is external."); - return; - } - - // 14-bit branch relocations should only target internal labels, and so - // should never get here. - if (Kind == ARM64::fixup_arm64_pcrel_branch14) { - Asm.getContext().FatalError(Fixup.getLoc(), - "Invalid relocation on conditional branch!"); - return; - } - - if (!getARM64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size, - Asm)) { - Asm.getContext().FatalError(Fixup.getLoc(), "unknown ARM64 fixup kind!"); - return; - } - - Value = Target.getConstant(); - - if (Target.isAbsolute()) { // constant - // FIXME: Should this always be extern? - // SymbolNum of 0 indicates the absolute section. - Type = MachO::ARM64_RELOC_UNSIGNED; - Index = 0; - - if (IsPCRel) { - IsExtern = 1; - Asm.getContext().FatalError(Fixup.getLoc(), - "PC relative absolute relocation!"); - - // FIXME: x86_64 sets the type to a branch reloc here. Should we do - // something similar? - } - } else if (Target.getSymB()) { // A - B + constant - const MCSymbol *A = &Target.getSymA()->getSymbol(); - MCSymbolData &A_SD = Asm.getSymbolData(*A); - const MCSymbolData *A_Base = Asm.getAtom(&A_SD); - - const MCSymbol *B = &Target.getSymB()->getSymbol(); - MCSymbolData &B_SD = Asm.getSymbolData(*B); - const MCSymbolData *B_Base = Asm.getAtom(&B_SD); - - // Check for "_foo@got - .", which comes through here as: - // Ltmp0: - // ... 
_foo@got - Ltmp0 - if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT && - Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None && - Layout.getSymbolOffset(&B_SD) == - Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) { - // SymB is the PC, so use a PC-rel pointer-to-GOT relocation. - Index = A_Base->getIndex(); - IsExtern = 1; - Type = MachO::ARM64_RELOC_POINTER_TO_GOT; - IsPCRel = 1; - MachO::any_relocation_info MRE; - MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); - return; - } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) - // Otherwise, neither symbol can be modified. - Asm.getContext().FatalError(Fixup.getLoc(), - "unsupported relocation of modified symbol"); - - // We don't support PCrel relocations of differences. - if (IsPCRel) - Asm.getContext().FatalError(Fixup.getLoc(), - "unsupported pc-relative relocation of " - "difference"); - - // ARM64 always uses external relocations. If there is no symbol to use as - // a base address (a local symbol with no preceeding non-local symbol), - // error out. - // - // FIXME: We should probably just synthesize an external symbol and use - // that. - if (!A_Base) - Asm.getContext().FatalError( - Fixup.getLoc(), - "unsupported relocation of local symbol '" + A->getName() + - "'. Must have non-local symbol earlier in section."); - if (!B_Base) - Asm.getContext().FatalError( - Fixup.getLoc(), - "unsupported relocation of local symbol '" + B->getName() + - "'. Must have non-local symbol earlier in section."); - - if (A_Base == B_Base && A_Base) - Asm.getContext().FatalError(Fixup.getLoc(), - "unsupported relocation with identical base"); - - Value += (A_SD.getFragment() == NULL ? 0 : Writer->getSymbolAddress( - &A_SD, Layout)) - - (A_Base == NULL || A_Base->getFragment() == NULL - ? 0 - : Writer->getSymbolAddress(A_Base, Layout)); - Value -= (B_SD.getFragment() == NULL ? 0 : Writer->getSymbolAddress( - &B_SD, Layout)) - - (B_Base == NULL || B_Base->getFragment() == NULL - ? 0 - : Writer->getSymbolAddress(B_Base, Layout)); - - Index = A_Base->getIndex(); - IsExtern = 1; - Type = MachO::ARM64_RELOC_UNSIGNED; - - MachO::any_relocation_info MRE; - MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); - - Index = B_Base->getIndex(); - IsExtern = 1; - Type = MachO::ARM64_RELOC_SUBTRACTOR; - } else { // A + constant - const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); - MCSymbolData &SD = Asm.getSymbolData(*Symbol); - const MCSymbolData *Base = Asm.getAtom(&SD); - const MCSectionMachO &Section = static_cast( - Fragment->getParent()->getSection()); - - // If the symbol is a variable and we weren't able to get a Base for it - // (i.e., it's not in the symbol table associated with a section) resolve - // the relocation based its expansion instead. - if (Symbol->isVariable() && !Base) { - // If the evaluation is an absolute value, just use that directly - // to keep things easy. - int64_t Res; - if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute( - Res, Layout, Writer->getSectionAddressMap())) { - FixedValue = Res; - return; - } - - // FIXME: Will the Target we already have ever have any data in it - // we need to preserve and merge with the new Target? How about - // the FixedValue? 
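[Aside: every relocation emitted above packs r_word1 the same way: the symbol index in bits 0-23, then the pcrel, length, extern and type fields. A small helper showing that packing:]

    #include <cstdint>

    // r_word1 layout used throughout RecordRelocation above:
    // index (24 bits), pcrel (1), log2size (2), extern (1), type (4).
    uint32_t packRelocWord1(uint32_t Index, bool IsPCRel, uint32_t Log2Size,
                            bool IsExtern, uint32_t Type) {
      return (Index << 0) | (uint32_t(IsPCRel) << 24) | (Log2Size << 25) |
             (uint32_t(IsExtern) << 27) | (Type << 28);
    }

[An A - B difference is emitted as a pair of such entries sharing one r_word0 offset: an ARM64_RELOC_UNSIGNED entry for A and an ARM64_RELOC_SUBTRACTOR entry for B, as in the SymB branch above.]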
- if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout)) - Asm.getContext().FatalError(Fixup.getLoc(), - "unable to resolve variable '" + - Symbol->getName() + "'"); - return RecordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, - FixedValue); - } - - // Relocations inside debug sections always use local relocations when - // possible. This seems to be done because the debugger doesn't fully - // understand relocation entries and expects to find values that - // have already been fixed up. - if (Symbol->isInSection()) { - if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) - Base = 0; - } - - // ARM64 uses external relocations as much as possible. For debug sections, - // and for pointer-sized relocations (.quad), we allow section relocations. - // It's code sections that run into trouble. - if (Base) { - Index = Base->getIndex(); - IsExtern = 1; - - // Add the local offset, if needed. - if (Base != &SD) - Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base); - } else if (Symbol->isInSection()) { - // Pointer-sized relocations can use a local relocation. Otherwise, - // we have to be in a debug info section. - if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3) - Asm.getContext().FatalError( - Fixup.getLoc(), - "unsupported relocation of local symbol '" + Symbol->getName() + - "'. Must have non-local symbol earlier in section."); - // Adjust the relocation to be section-relative. - // The index is the section ordinal (1-based). - const MCSectionData &SymSD = - Asm.getSectionData(SD.getSymbol().getSection()); - Index = SymSD.getOrdinal() + 1; - IsExtern = 0; - Value += Writer->getSymbolAddress(&SD, Layout); - - if (IsPCRel) - Value -= Writer->getFragmentAddress(Fragment, Layout) + - Fixup.getOffset() + (1ULL << Log2Size); - } else { - // Resolve constant variables. - if (SD.getSymbol().isVariable()) { - int64_t Res; - if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute( - Res, Layout, Writer->getSectionAddressMap())) { - FixedValue = Res; - return; - } - } - Asm.getContext().FatalError(Fixup.getLoc(), - "unsupported relocation of variable '" + - Symbol->getName() + "'"); - } - } - - // If the relocation kind is Branch26, Page21, or Pageoff12, any addend - // is represented via an Addend relocation, not encoded directly into - // the instruction. - if ((Type == MachO::ARM64_RELOC_BRANCH26 || - Type == MachO::ARM64_RELOC_PAGE21 || - Type == MachO::ARM64_RELOC_PAGEOFF12) && - Value) { - assert((Value & 0xff000000) == 0 && "Added relocation out of range!"); - - MachO::any_relocation_info MRE; - MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); - - // Now set up the Addend relocation. - Type = MachO::ARM64_RELOC_ADDEND; - Index = Value; - IsPCRel = 0; - Log2Size = 2; - IsExtern = 0; - - // Put zero into the instruction itself. The addend is in the relocation. - Value = 0; - } - - // If there's any addend left to handle, encode it in the instruction. 
- FixedValue = Value; - - // struct relocation_info (8 bytes) - MachO::any_relocation_info MRE; - MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); -} - -MCObjectWriter *llvm::createARM64MachObjectWriter(raw_ostream &OS, - uint32_t CPUType, - uint32_t CPUSubtype) { - return createMachObjectWriter(new ARM64MachObjectWriter(CPUType, CPUSubtype), - OS, /*IsLittleEndian=*/true); -} diff --git a/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt deleted file mode 100644 index f8665bc..0000000 --- a/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -add_llvm_library(LLVMARM64Desc - ARM64AsmBackend.cpp - ARM64ELFObjectWriter.cpp - ARM64ELFStreamer.cpp - ARM64MCAsmInfo.cpp - ARM64MCCodeEmitter.cpp - ARM64MCExpr.cpp - ARM64MCTargetDesc.cpp - ARM64MachObjectWriter.cpp -) -add_dependencies(LLVMARM64Desc ARM64CommonTableGen) - -# Hack: we need to include 'main' target directory to grab private headers -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..) diff --git a/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt b/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt deleted file mode 100644 index e4c74d2..0000000 --- a/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt +++ /dev/null @@ -1,24 +0,0 @@ -;===- ./lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Desc -parent = ARM64 -required_libraries = ARM64AsmPrinter ARM64Info MC Support -add_to_library_groups = ARM64 - diff --git a/lib/Target/ARM64/MCTargetDesc/Makefile b/lib/Target/ARM64/MCTargetDesc/Makefile deleted file mode 100644 index 013cc63..0000000 --- a/lib/Target/ARM64/MCTargetDesc/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/ARM64/TargetDesc/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Desc - -# Hack: we need to include 'main' target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/Makefile b/lib/Target/ARM64/Makefile deleted file mode 100644 index 5f0f307..0000000 --- a/lib/Target/ARM64/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -##===- lib/Target/ARM64/Makefile ---------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. 
-LIBRARYNAME = LLVMARM64CodeGen -TARGET = ARM64 - -# Make sure that tblgen is run, first thing. -BUILT_SOURCES = ARM64GenRegisterInfo.inc ARM64GenInstrInfo.inc \ - ARM64GenAsmWriter.inc ARM64GenAsmWriter1.inc \ - ARM64GenDAGISel.inc \ - ARM64GenCallingConv.inc ARM64GenAsmMatcher.inc \ - ARM64GenSubtargetInfo.inc ARM64GenMCCodeEmitter.inc \ - ARM64GenFastISel.inc ARM64GenDisassemblerTables.inc \ - ARM64GenMCPseudoLowering.inc - -DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp b/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp deleted file mode 100644 index dec09ed..0000000 --- a/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- ARM64TargetInfo.cpp - ARM64 Target Implementation -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/Triple.h" -#include "llvm/Support/TargetRegistry.h" -using namespace llvm; - -namespace llvm { -Target TheARM64Target; -} // end namespace llvm - -extern "C" void LLVMInitializeARM64TargetInfo() { - RegisterTarget X(TheARM64Target, "arm64", - "ARM64"); -} diff --git a/lib/Target/ARM64/TargetInfo/CMakeLists.txt b/lib/Target/ARM64/TargetInfo/CMakeLists.txt deleted file mode 100644 index a0142c4..0000000 --- a/lib/Target/ARM64/TargetInfo/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64Info - ARM64TargetInfo.cpp - ) - -add_dependencies(LLVMARM64Info ARM64CommonTableGen) diff --git a/lib/Target/ARM64/TargetInfo/LLVMBuild.txt b/lib/Target/ARM64/TargetInfo/LLVMBuild.txt deleted file mode 100644 index 5bea694..0000000 --- a/lib/Target/ARM64/TargetInfo/LLVMBuild.txt +++ /dev/null @@ -1,24 +0,0 @@ -;===- ./lib/Target/ARM64/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Info -parent = ARM64 -required_libraries = MC Support -add_to_library_groups = ARM64 - diff --git a/lib/Target/ARM64/TargetInfo/Makefile b/lib/Target/ARM64/TargetInfo/Makefile deleted file mode 100644 index 2d5a1a0..0000000 --- a/lib/Target/ARM64/TargetInfo/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM64/TargetInfo/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Info - -# Hack: we need to include 'main' target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index afd1f51..15b574d 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -108,9 +108,9 @@ namespace { explicit CppWriter(formatted_raw_ostream &o) : ModulePass(ID), Out(o), uniqueNum(0), is_inline(false), indent_level(0){} - virtual const char *getPassName() const { return "C++ backend"; } + const char *getPassName() const override { return "C++ backend"; } - bool runOnModule(Module &M); + bool runOnModule(Module &M) override; void printProgram(const std::string& fname, const std::string& modName ); void printModule(const std::string& fname, const std::string& modName ); @@ -396,7 +396,7 @@ std::string CppWriter::getCppName(Type* Ty) { return I->second; // Okay, let's build a new name for this type. Start with a prefix - const char* prefix = 0; + const char* prefix = nullptr; switch (Ty->getTypeID()) { case Type::FunctionTyID: prefix = "FuncTy_"; break; case Type::StructTyID: prefix = "StructTy_"; break; @@ -1690,9 +1690,8 @@ void CppWriter::printFunctionUses(const Function* F) { // Print the function declarations for any functions encountered nl(Out) << "// Function Declarations"; nl(Out); - for (SmallPtrSet::iterator I = gvs.begin(), E = gvs.end(); - I != E; ++I) { - if (Function* Fun = dyn_cast(*I)) { + for (auto *GV : gvs) { + if (Function *Fun = dyn_cast(GV)) { if (!is_inline || Fun != F) printFunctionHead(Fun); } @@ -1700,17 +1699,15 @@ void CppWriter::printFunctionUses(const Function* F) { // Print the global variable declarations for any variables encountered nl(Out) << "// Global Variable Declarations"; nl(Out); - for (SmallPtrSet::iterator I = gvs.begin(), E = gvs.end(); - I != E; ++I) { - if (GlobalVariable* F = dyn_cast(*I)) + for (auto *GV : gvs) { + if (GlobalVariable *F = dyn_cast(GV)) printVariableHead(F); } // Print the constants found nl(Out) << "// Constant Definitions"; nl(Out); - for (SmallPtrSet::iterator I = consts.begin(), - E = consts.end(); I != E; ++I) { - printConstant(*I); + for (const auto *C : consts) { + printConstant(C); } // Process the global variables definitions now that all the constants have @@ -1718,10 +1715,9 @@ void CppWriter::printFunctionUses(const Function* F) { // initializers. 
if (GenerationType != GenFunction) { nl(Out) << "// Global Variable Definitions"; nl(Out); - for (SmallPtrSet::iterator I = gvs.begin(), E = gvs.end(); - I != E; ++I) { - if (GlobalVariable* GV = dyn_cast(*I)) - printVariableBody(GV); + for (const auto &GV : gvs) { + if (GlobalVariable *Var = dyn_cast(GV)) + printVariableBody(Var); } } } diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h index 477e788..673ade7 100644 --- a/lib/Target/CppBackend/CPPTargetMachine.h +++ b/lib/Target/CppBackend/CPPTargetMachine.h @@ -28,14 +28,12 @@ struct CPPTargetMachine : public TargetMachine { CodeGenOpt::Level OL) : TargetMachine(T, TT, CPU, FS, Options) {} - virtual bool addPassesToEmitFile(PassManagerBase &PM, - formatted_raw_ostream &Out, - CodeGenFileType FileType, - bool DisableVerify, - AnalysisID StartAfter, - AnalysisID StopAfter); - - virtual const DataLayout *getDataLayout() const { return 0; } + bool addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out, + CodeGenFileType FileType, bool DisableVerify, + AnalysisID StartAfter, + AnalysisID StopAfter) override; + + const DataLayout *getDataLayout() const override { return nullptr; } }; extern Target TheCppBackendTarget; diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td index c1b6d45..5f4a6c6 100644 --- a/lib/Target/Hexagon/Hexagon.td +++ b/lib/Target/Hexagon/Hexagon.td @@ -200,8 +200,6 @@ class Proc Features> : ProcessorModel; -def : Proc<"hexagonv2", HexagonModel, [ArchV2]>; -def : Proc<"hexagonv3", HexagonModel, [ArchV2, ArchV3]>; def : Proc<"hexagonv4", HexagonModelV4, [ArchV2, ArchV3, ArchV4]>; def : Proc<"hexagonv5", HexagonModelV4, [ArchV2, ArchV3, ArchV4, ArchV5]>; diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index a588274..2e011bd 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "Hexagon.h" #include "HexagonAsmPrinter.h" #include "HexagonMachineFunctionInfo.h" @@ -56,6 +55,8 @@ using namespace llvm; +#define DEBUG_TYPE "asm-printer" + static cl::opt AlignCalls( "hexagon-align-calls", cl::Hidden, cl::init(true), cl::desc("Insert falign after call instruction for Hexagon target")); @@ -224,7 +225,7 @@ static MCInstPrinter *createHexagonMCInstPrinter(const Target &T, if (SyntaxVariant == 0) return(new HexagonInstPrinter(MAI, MII, MRI)); else - return NULL; + return nullptr; } extern "C" void LLVMInitializeHexagonAsmPrinter() { diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.h b/lib/Target/Hexagon/HexagonAsmPrinter.h index a186dc9..7fe8c57 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.h +++ b/lib/Target/Hexagon/HexagonAsmPrinter.h @@ -30,21 +30,22 @@ namespace llvm { Subtarget = &TM.getSubtarget(); } - virtual const char *getPassName() const { + const char *getPassName() const override { return "Hexagon Assembly Printer"; } - bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const; + bool isBlockOnlyReachableByFallthrough( + const MachineBasicBlock *MBB) const override; - virtual void EmitInstruction(const MachineInstr *MI); + void EmitInstruction(const MachineInstr *MI) override; void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS); + raw_ostream 
&OS) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS); + raw_ostream &OS) override; static const char *getRegisterName(unsigned RegNo); }; diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp index 8597f11..de340e0 100644 --- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp +++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexagon_cfg" #include "Hexagon.h" #include "HexagonMachineFunctionInfo.h" #include "HexagonSubtarget.h" @@ -26,6 +25,8 @@ using namespace llvm; +#define DEBUG_TYPE "hexagon_cfg" + namespace llvm { void initializeHexagonCFGOptimizerPass(PassRegistry&); } @@ -48,10 +49,10 @@ private: initializeHexagonCFGOptimizerPass(*PassRegistry::getPassRegistry()); } - const char *getPassName() const { + const char *getPassName() const override { return "Hexagon CFG Optimizer"; } - bool runOnMachineFunction(MachineFunction &Fn); + bool runOnMachineFunction(MachineFunction &Fn) override; }; @@ -146,8 +147,8 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { MachineBasicBlock::succ_iterator SI = MBB->succ_begin(); MachineBasicBlock* FirstSucc = *SI; MachineBasicBlock* SecondSucc = *(++SI); - MachineBasicBlock* LayoutSucc = NULL; - MachineBasicBlock* JumpAroundTarget = NULL; + MachineBasicBlock* LayoutSucc = nullptr; + MachineBasicBlock* JumpAroundTarget = nullptr; if (MBB->isLayoutSuccessor(FirstSucc)) { LayoutSucc = FirstSucc; @@ -161,7 +162,7 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { // The target of the unconditional branch must be JumpAroundTarget. // TODO: If not, we should not invert the unconditional branch. - MachineBasicBlock* CondBranchTarget = NULL; + MachineBasicBlock* CondBranchTarget = nullptr; if ((MI->getOpcode() == Hexagon::JMP_t) || (MI->getOpcode() == Hexagon::JMP_f)) { CondBranchTarget = MI->getOperand(1).getMBB(); @@ -239,7 +240,7 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { static void initializePassOnce(PassRegistry &Registry) { PassInfo *PI = new PassInfo("Hexagon CFG Optimizer", "hexagon-cfg", - &HexagonCFGOptimizer::ID, 0, false, false); + &HexagonCFGOptimizer::ID, nullptr, false, false); Registry.registerPass(*PI, true); } diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp index 60c933b..aeff680 100644 --- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -11,8 +11,6 @@ // to move them together. If we can move them next to each other we do so and // replace them with a combine instruction. 
//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexagon-copy-combine" - #include "llvm/PassSupport.h" #include "Hexagon.h" #include "HexagonInstrInfo.h" @@ -36,6 +34,8 @@ using namespace llvm; +#define DEBUG_TYPE "hexagon-copy-combine" + static cl::opt IsCombinesDisabled("disable-merge-into-combines", cl::Hidden, cl::ZeroOrMore, @@ -68,15 +68,15 @@ public: initializeHexagonCopyToCombinePass(*PassRegistry::getPassRegistry()); } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); } - const char *getPassName() const { + const char *getPassName() const override { return "Hexagon Copy-To-Combine Pass"; } - virtual bool runOnMachineFunction(MachineFunction &Fn); + bool runOnMachineFunction(MachineFunction &Fn) override; private: MachineInstr *findPairable(MachineInstr *I1, bool &DoInsertAtI1); @@ -262,7 +262,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1, unsigned KilledOperand = 0; if (I2->killsRegister(I2UseReg)) KilledOperand = I2UseReg; - MachineInstr *KillingInstr = 0; + MachineInstr *KillingInstr = nullptr; for (; I != End; ++I) { // If the intervening instruction I: @@ -306,7 +306,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1, // Track killed operands. If we move across an instruction that kills our // operand, we need to update the kill information on the moved I1. It kills // the operand now. - MachineInstr *KillingInstr = 0; + MachineInstr *KillingInstr = nullptr; unsigned KilledOperand = 0; while(++I != End) { @@ -333,7 +333,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1, // Check for an exact kill (registers match). if (I1UseReg && I->killsRegister(I1UseReg)) { - assert(KillingInstr == 0 && "Should only see one killing instruction"); + assert(!KillingInstr && "Should only see one killing instruction"); KilledOperand = I1UseReg; KillingInstr = &*I; } @@ -506,7 +506,7 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr *I1, // Not safe. Stop searching. 
break; } - return 0; + return nullptr; } void HexagonCopyToCombine::combine(MachineInstr *I1, MachineInstr *I2, diff --git a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp index 8a5991f..3dafe80 100644 --- a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp +++ b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp @@ -60,10 +60,10 @@ class HexagonExpandPredSpillCode : public MachineFunctionPass { initializeHexagonExpandPredSpillCodePass(Registry); } - const char *getPassName() const { + const char *getPassName() const override { return "Hexagon Expand Predicate Spill Code"; } - bool runOnMachineFunction(MachineFunction &Fn); + bool runOnMachineFunction(MachineFunction &Fn) override; }; @@ -187,7 +187,7 @@ static void initializePassOnce(PassRegistry &Registry) { const char *Name = "Hexagon Expand Predicate Spill Code"; PassInfo *PI = new PassInfo(Name, "hexagon-spill-pred", &HexagonExpandPredSpillCode::ID, - 0, false, false); + nullptr, false, false); Registry.registerPass(*PI, true); } diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp index a79264b..d41939a 100644 --- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp +++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp @@ -40,11 +40,13 @@ namespace { initializeHexagonFixupHwLoopsPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { return "Hexagon Hardware Loop Fixup"; } + const char *getPassName() const override { + return "Hexagon Hardware Loop Fixup"; + } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 0ea13d4..d551ca9 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -246,7 +246,7 @@ HexagonFrameLowering::spillCalleeSavedRegisters( // unsigned SuperReg = uniqueSuperReg(Reg, TRI); bool CanUseDblStore = false; - const TargetRegisterClass* SuperRegClass = 0; + const TargetRegisterClass* SuperRegClass = nullptr; if (ContiguousRegs && (i < CSI.size()-1)) { unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI); @@ -300,7 +300,7 @@ bool HexagonFrameLowering::restoreCalleeSavedRegisters( // Check if we can use a double-word load. // unsigned SuperReg = uniqueSuperReg(Reg, TRI); - const TargetRegisterClass* SuperRegClass = 0; + const TargetRegisterClass* SuperRegClass = nullptr; bool CanUseDblLoad = false; if (ContiguousRegs && (i < CSI.size()-1)) { unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI); diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h index a62c76a..446af16 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/lib/Target/Hexagon/HexagonFrameLowering.h @@ -28,25 +28,25 @@ public: /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. 
- void emitPrologue(MachineFunction &MF) const; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; - virtual bool - spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const; - - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - - virtual bool + void emitPrologue(MachineFunction &MF) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const override; + + void + eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, - const TargetRegisterInfo *TRI) const; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const; - bool hasFP(const MachineFunction &MF) const; + const TargetRegisterInfo *TRI) const override; + int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + bool hasFP(const MachineFunction &MF) const override; bool hasTailCall(MachineBasicBlock &MBB) const; }; diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 936fb11..7f76421 100644 --- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -26,7 +26,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hwloops" #include "llvm/ADT/SmallSet.h" #include "Hexagon.h" #include "HexagonTargetMachine.h" @@ -47,6 +46,8 @@ using namespace llvm; +#define DEBUG_TYPE "hwloops" + #ifndef NDEBUG static cl::opt HWLoopLimit("max-hwloop", cl::Hidden, cl::init(-1)); #endif @@ -77,11 +78,11 @@ namespace { initializeHexagonHardwareLoopsPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { return "Hexagon Hardware Loops"; } + const char *getPassName() const override { return "Hexagon Hardware Loops"; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); @@ -264,8 +265,8 @@ namespace { return Contents.ImmVal; } - void print(raw_ostream &OS, const TargetMachine *TM = 0) const { - const TargetRegisterInfo *TRI = TM ? TM->getRegisterInfo() : 0; + void print(raw_ostream &OS, const TargetMachine *TM = nullptr) const { + const TargetRegisterInfo *TRI = TM ? TM->getRegisterInfo() : nullptr; if (isReg()) { OS << PrintReg(Contents.R.Reg, TRI, Contents.R.Sub); } if (isImm()) { OS << Contents.ImmVal; } } @@ -369,7 +370,7 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L, } // for (instr) SmallVector Cond; - MachineBasicBlock *TB = 0, *FB = 0; + MachineBasicBlock *TB = nullptr, *FB = nullptr; bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false); if (NotAnalyzed) return false; @@ -434,37 +435,37 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, "Loop must have more than one incoming edge!"); MachineBasicBlock *Backedge = *PI++; if (PI == TopMBB->pred_end()) // dead loop? 
- return 0; + return nullptr; MachineBasicBlock *Incoming = *PI++; if (PI != TopMBB->pred_end()) // multiple backedges? - return 0; + return nullptr; // Make sure there is one incoming and one backedge and determine which // is which. if (L->contains(Incoming)) { if (L->contains(Backedge)) - return 0; + return nullptr; std::swap(Incoming, Backedge); } else if (!L->contains(Backedge)) - return 0; + return nullptr; // Look for the cmp instruction to determine if we can get a useful trip // count. The trip count can be either a register or an immediate. The // location of the value depends upon the type (reg or imm). MachineBasicBlock *Latch = L->getLoopLatch(); if (!Latch) - return 0; + return nullptr; unsigned IVReg = 0; int64_t IVBump = 0; MachineInstr *IVOp; bool FoundIV = findInductionRegister(L, IVReg, IVBump, IVOp); if (!FoundIV) - return 0; + return nullptr; MachineBasicBlock *Preheader = L->getLoopPreheader(); - MachineOperand *InitialValue = 0; + MachineOperand *InitialValue = nullptr; MachineInstr *IV_Phi = MRI->getVRegDef(IVReg); for (unsigned i = 1, n = IV_Phi->getNumOperands(); i < n; i += 2) { MachineBasicBlock *MBB = IV_Phi->getOperand(i+1).getMBB(); @@ -474,13 +475,13 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, IVReg = IV_Phi->getOperand(i).getReg(); // Want IV reg after bump. } if (!InitialValue) - return 0; + return nullptr; SmallVector Cond; - MachineBasicBlock *TB = 0, *FB = 0; + MachineBasicBlock *TB = nullptr, *FB = nullptr; bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false); if (NotAnalyzed) - return 0; + return nullptr; MachineBasicBlock *Header = L->getHeader(); // TB must be non-null. If FB is also non-null, one of them must be @@ -489,7 +490,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, assert (TB && "Latch block without a branch?"); assert ((!FB || TB == Header || FB == Header) && "Branches not to header?"); if (!TB || (FB && TB != Header && FB != Header)) - return 0; + return nullptr; // Branches of form "if (!P) ..." cause HexagonInstrInfo::AnalyzeBranch // to put imm(0), followed by P in the vector Cond. @@ -505,7 +506,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, bool AnalyzedCmp = TII->analyzeCompare(CondI, CmpReg1, CmpReg2, Mask, ImmValue); if (!AnalyzedCmp) - return 0; + return nullptr; // The comparison operator type determines how we compute the loop // trip count. @@ -521,7 +522,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, bool isSwapped = false; const MachineOperand &Op1 = CondI->getOperand(1); const MachineOperand &Op2 = CondI->getOperand(2); - const MachineOperand *EndValue = 0; + const MachineOperand *EndValue = nullptr; if (Op1.isReg()) { if (Op2.isImm() || Op1.getReg() == IVReg) @@ -533,7 +534,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, } if (!EndValue) - return 0; + return nullptr; switch (CondOpc) { case Hexagon::CMPEQri: @@ -552,7 +553,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, case Hexagon::CMPbEQri_V4: case Hexagon::CMPhEQri_V4: { if (IVBump != 1) - return 0; + return nullptr; int64_t InitV, EndV; // Since the comparisons are "ri", the EndValue should be an @@ -562,26 +563,26 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, // Allow InitialValue to be a register defined with an immediate. 
if (InitialValue->isReg()) { if (!defWithImmediate(InitialValue->getReg())) - return 0; + return nullptr; InitV = getImmediate(*InitialValue); } else { assert(InitialValue->isImm()); InitV = InitialValue->getImm(); } if (InitV >= EndV) - return 0; + return nullptr; if (CondOpc == Hexagon::CMPbEQri_V4) { if (!isInt<8>(InitV) || !isInt<8>(EndV)) - return 0; + return nullptr; } else { // Hexagon::CMPhEQri_V4 if (!isInt<16>(InitV) || !isInt<16>(EndV)) - return 0; + return nullptr; } Cmp = !Negated ? Comparison::EQ : Comparison::NE; break; } default: - return 0; + return nullptr; } if (isSwapped) @@ -591,14 +592,14 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, unsigned R = InitialValue->getReg(); MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent(); if (!MDT->properlyDominates(DefBB, Header)) - return 0; + return nullptr; OldInsts.push_back(MRI->getVRegDef(R)); } if (EndValue->isReg()) { unsigned R = EndValue->getReg(); MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent(); if (!MDT->properlyDominates(DefBB, Header)) - return 0; + return nullptr; } return computeCount(L, InitialValue, EndValue, IVReg, IVBump, Cmp); @@ -616,7 +617,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, Comparison::Kind Cmp) const { // Cannot handle comparison EQ, i.e. while (A == B). if (Cmp == Comparison::EQ) - return 0; + return nullptr; // Check if either the start or end values are an assignment of an immediate. // If so, use the immediate value rather than the register. @@ -642,11 +643,11 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, // If loop executes while iv is "less" with the iv value going down, then // the iv must wrap. if (CmpLess && IVBump < 0) - return 0; + return nullptr; // If loop executes while iv is "greater" with the iv value going up, then // the iv must wrap. if (CmpGreater && IVBump > 0) - return 0; + return nullptr; if (Start->isImm() && End->isImm()) { // Both, start and end are immediates. @@ -654,15 +655,15 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, int64_t EndV = End->getImm(); int64_t Dist = EndV - StartV; if (Dist == 0) - return 0; + return nullptr; bool Exact = (Dist % IVBump) == 0; if (Cmp == Comparison::NE) { if (!Exact) - return 0; + return nullptr; if ((Dist < 0) ^ (IVBump < 0)) - return 0; + return nullptr; } // For comparisons that include the final value (i.e. include equality @@ -683,7 +684,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, uint64_t Count = Dist1; if (Count > 0xFFFFFFFFULL) - return 0; + return nullptr; return new CountValue(CountValue::CV_Immediate, Count); } @@ -695,7 +696,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, // If the induction variable bump is not a power of 2, quit. // Othwerise we'd need a general integer division. if (!isPowerOf2_64(abs64(IVBump))) - return 0; + return nullptr; MachineBasicBlock *PH = Loop->getLoopPreheader(); assert (PH && "Should have a preheader by now"); @@ -766,7 +767,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, // Hardware loops cannot handle 64-bit registers. If it's a double // register, it has to have a subregister. if (!SR && RC == &Hexagon::DoubleRegsRegClass) - return 0; + return nullptr; const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass; // Compute DistR (register with the distance between Start and End). 
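// Illustrative sketch, not part of the patch: the Comparison::NE case of
// computeCount above is fully visible across the surrounding hunks. The
// distance must be an exact multiple of the bump, the induction variable
// must move toward the end value, and the result must fit the 0xFFFFFFFF
// cap the code applies to Count. A minimal standalone model of that
// arithmetic follows; the helper name and the bool/out-parameter shape are
// hypothetical, and this patch itself only changes `return 0` to
// `return nullptr` in these paths.
#include <cassert>
#include <cstdint>

static bool neTripCount(int64_t StartV, int64_t EndV, int64_t IVBump,
                        uint64_t &Count) {
  int64_t Dist = EndV - StartV;
  if (Dist == 0 || IVBump == 0)
    return false;                  // empty trip count or malformed induction
  if ((Dist < 0) != (IVBump < 0))
    return false;                  // IV moves away from the end value
  if (Dist % IVBump != 0)
    return false;                  // the "Exact" test: A != B would be skipped
  Count = static_cast<uint64_t>(Dist / IVBump);
  return Count <= 0xFFFFFFFFULL;   // same cap the hunk applies to Count
}

int main() {
  uint64_t Count = 0;
  assert(neTripCount(0, 40, 4, Count) && Count == 10); // i = 0; i != 40; i += 4
  assert(!neTripCount(0, 41, 4, Count));               // bump never hits 41
  return 0;
}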
@@ -1013,7 +1014,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) { MachineBasicBlock *LastMBB = L->getExitingBlock(); // Don't generate hw loop if the loop has more than one exit. - if (LastMBB == 0) + if (!LastMBB) return false; MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator(); @@ -1035,7 +1036,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) { SmallVector OldInsts; // Are we able to determine the trip count for the loop? CountValue *TripCount = getLoopTripCount(L, OldInsts); - if (TripCount == 0) + if (!TripCount) return false; // Is the trip count available in the preheader? @@ -1127,7 +1128,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) { if (LastI != LastMBB->end()) LastI = LastMBB->erase(LastI); SmallVector Cond; - TII->InsertBranch(*LastMBB, BranchTarget, 0, Cond, LastIDL); + TII->InsertBranch(*LastMBB, BranchTarget, nullptr, Cond, LastIDL); } } else { // Conditional branch to loop start; just delete it. @@ -1196,7 +1197,7 @@ MachineInstr *HexagonHardwareLoops::defWithImmediate(unsigned R) { case Hexagon::CONST64_Int_Real: return DI; } - return 0; + return nullptr; } @@ -1291,7 +1292,7 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) { if (IndRegs.empty()) return false; - MachineBasicBlock *TB = 0, *FB = 0; + MachineBasicBlock *TB = nullptr, *FB = nullptr; SmallVector Cond; // AnalyzeBranch returns true if it fails to analyze branch. bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false); @@ -1322,7 +1323,7 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) { return false; SmallSet CmpRegs; - MachineOperand *CmpImmOp = 0; + MachineOperand *CmpImmOp = nullptr; // Go over all operands to the compare and look for immediate and register // operands. Assume that if the compare has a single register use and a @@ -1420,7 +1421,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( DebugLoc DL; if (!Latch || Header->hasAddressTaken()) - return 0; + return nullptr; typedef MachineBasicBlock::instr_iterator instr_iterator; @@ -1429,17 +1430,17 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( typedef std::vector MBBVector; MBBVector Preds(Header->pred_begin(), Header->pred_end()); SmallVector Tmp1; - MachineBasicBlock *TB = 0, *FB = 0; + MachineBasicBlock *TB = nullptr, *FB = nullptr; if (TII->AnalyzeBranch(*Latch, TB, FB, Tmp1, false)) - return 0; + return nullptr; for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) { MachineBasicBlock *PB = *I; if (PB != Latch) { bool NotAnalyzed = TII->AnalyzeBranch(*PB, TB, FB, Tmp1, false); if (NotAnalyzed) - return 0; + return nullptr; } } @@ -1515,7 +1516,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( SmallVector Tmp2; SmallVector EmptyCond; - TB = FB = 0; + TB = FB = nullptr; for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) { MachineBasicBlock *PB = *I; @@ -1525,22 +1526,22 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( (void)NotAnalyzed; // suppress compiler warning assert (!NotAnalyzed && "Should be analyzable!"); if (TB != Header && (Tmp2.empty() || FB != Header)) - TII->InsertBranch(*PB, NewPH, 0, EmptyCond, DL); + TII->InsertBranch(*PB, NewPH, nullptr, EmptyCond, DL); PB->ReplaceUsesOfBlockWith(Header, NewPH); } } // It can happen that the latch block will fall through into the header. // Insert an unconditional branch to the header. 
- TB = FB = 0; + TB = FB = nullptr; bool LatchNotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Tmp2, false); (void)LatchNotAnalyzed; // suppress compiler warning assert (!LatchNotAnalyzed && "Should be analyzable!"); if (!TB && !FB) - TII->InsertBranch(*Latch, Header, 0, EmptyCond, DL); + TII->InsertBranch(*Latch, Header, nullptr, EmptyCond, DL); // Finally, the branch from the preheader to the header. - TII->InsertBranch(*NewPH, Header, 0, EmptyCond, DL); + TII->InsertBranch(*NewPH, Header, nullptr, EmptyCond, DL); NewPH->addSuccessor(Header); return NewPH; diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index ed8c786..dabe650 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexagon-isel" #include "Hexagon.h" #include "HexagonISelLowering.h" #include "HexagonTargetMachine.h" @@ -23,6 +22,8 @@ #include "llvm/Support/Debug.h" using namespace llvm; +#define DEBUG_TYPE "hexagon-isel" + static cl::opt MaxNumOfUsesForConstExtenders("ga-max-num-uses-for-constant-extenders", @@ -61,7 +62,7 @@ public: } bool hasNumUsesBelowThresGA(SDNode *N) const; - SDNode *Select(SDNode *N); + SDNode *Select(SDNode *N) override; // Complex Pattern Selectors. inline bool foldGlobalAddress(SDValue &N, SDValue &R); @@ -78,15 +79,15 @@ public: bool SelectADDRriU6_1(SDValue& N, SDValue &R1, SDValue &R2); bool SelectADDRriU6_2(SDValue& N, SDValue &R1, SDValue &R2); - virtual const char *getPassName() const { + const char *getPassName() const override { return "Hexagon DAG->DAG Pattern Instruction Selection"; } /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. - virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, - std::vector &OutOps); + bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector &OutOps) override; bool SelectAddr(SDNode *Op, SDValue Addr, SDValue &Base, SDValue &Offset); SDNode *SelectLoad(SDNode *N); @@ -186,7 +187,7 @@ FunctionPass *llvm::createHexagonISelDag(HexagonTargetMachine &TM, static void initializePassOnce(PassRegistry &Registry) { const char *Name = "Hexagon DAG->DAG Pattern Instruction Selection"; PassInfo *PI = new PassInfo(Name, "hexagon-isel", - &SelectionDAGISel::ID, 0, false, false); + &SelectionDAGISel::ID, nullptr, false, false); Registry.registerPass(*PI, true); } @@ -1238,7 +1239,7 @@ SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) { SDNode *PdRs = CurDAG->getMachineNode(Hexagon::TFR_PdRs, dl, MVT::i1, SDValue(Arg, 0)); Ops.push_back(SDValue(PdRs,0)); - } else if (RC == NULL && (dyn_cast(Arg) != NULL)) { + } else if (!RC && (dyn_cast(Arg) != nullptr)) { // This is immediate operand. Lower it here making sure that we DO have // const SDNode for immediate value. int32_t Val = cast(Arg)->getSExtValue(); @@ -1346,7 +1347,7 @@ SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) { SDNode *HexagonDAGToDAGISel::Select(SDNode *N) { if (N->isMachineOpcode()) { N->setNodeId(-1); - return NULL; // Already selected. + return nullptr; // Already selected. 
} diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 92b794d..b8e5d24 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -39,6 +39,8 @@ using namespace llvm; +#define DEBUG_TYPE "hexagon-lowering" + static cl::opt EmitJumpTables("hexagon-emit-jump-tables", cl::init(true), cl::Hidden, cl::desc("Control jump table emission on Hexagon target")); @@ -135,7 +137,7 @@ CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT, State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); return false; } - llvm_unreachable(0); + llvm_unreachable(nullptr); } @@ -182,7 +184,7 @@ static bool CC_Hexagon32(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - static const uint16_t RegList[] = { + static const MCPhysReg RegList[] = { Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, Hexagon::R5 }; @@ -205,10 +207,10 @@ static bool CC_Hexagon64(unsigned ValNo, MVT ValVT, return false; } - static const uint16_t RegList1[] = { + static const MCPhysReg RegList1[] = { Hexagon::D1, Hexagon::D2 }; - static const uint16_t RegList2[] = { + static const MCPhysReg RegList2[] = { Hexagon::R1, Hexagon::R3 }; if (unsigned Reg = State.AllocateReg(RegList1, RegList2, 2)) { @@ -346,8 +348,7 @@ HexagonTargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other, - &RetOps[0], RetOps.size()); + return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other, RetOps); } @@ -410,7 +411,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, int NumNamedVarArgParams = -1; if (GlobalAddressSDNode *GA = dyn_cast(Callee)) { - const Function* CalleeFn = NULL; + const Function* CalleeFn = nullptr; Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, MVT::i32); if ((CalleeFn = dyn_cast(GA->getGlobal()))) { @@ -520,8 +521,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Transform all store nodes into one single node because all store // nodes are independent of each other. if (!MemOpChains.empty()) { - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains[0], - MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); } if (!isTailCall) @@ -595,9 +595,9 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (isTailCall) - return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); + return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, Ops); - Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); + Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops); InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. @@ -817,7 +817,7 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Sub); SDValue Ops[2] = { ArgAdjust, CopyChain }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } SDValue @@ -916,8 +916,7 @@ const { } if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOps[0], - MemOps.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); if (isVarArg) { // This will point to the next argument passed via stack. 
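// Illustrative sketch, not part of the patch: several hunks above shrink
// call sites such as getNode(..., &Ops[0], Ops.size()) and
// getMergeValues(Ops, 2, dl) down to getNode(..., Ops) and
// getMergeValues(Ops, dl), that is, the SelectionDAG entry points now take
// an ArrayRef-style (pointer, length) view that converts implicitly from a
// contiguous container. The tiny model below uses hypothetical names to show
// why this is a pure call-site change with no copy of the operand vector.
#include <cassert>
#include <cstddef>
#include <vector>

template <typename T> class ArrayRefModel {
  const T *Data = nullptr;
  std::size_t Length = 0;

public:
  // Implicit conversion from any contiguous container is what lets callers
  // pass the container itself instead of an explicit pointer/size pair.
  template <typename C>
  ArrayRefModel(const C &Cont) : Data(Cont.data()), Length(Cont.size()) {}
  std::size_t size() const { return Length; }
  const T &operator[](std::size_t I) const { return Data[I]; }
};

static long sum(ArrayRefModel<int> Vals) { // new-style signature
  long S = 0;
  for (std::size_t I = 0; I != Vals.size(); ++I)
    S += Vals[I];
  return S;
}

int main() {
  std::vector<int> Ops{1, 2, 3};
  assert(sum(Ops) == 6); // old style would have been sum(&Ops[0], Ops.size())
  return 0;
}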
@@ -1480,7 +1479,7 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { - default: return 0; + default: return nullptr; case HexagonISD::CONST32: return "HexagonISD::CONST32"; case HexagonISD::CONST32_GP: return "HexagonISD::CONST32_GP"; case HexagonISD::CONST32_Int_Real: return "HexagonISD::CONST32_Int_Real"; diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 73da226..4f27c27 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -92,14 +92,14 @@ namespace llvm { const SmallVectorImpl &Ins, SelectionDAG& DAG) const; - virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const; - virtual bool isTruncateFree(EVT VT1, EVT VT2) const; + bool isTruncateFree(Type *Ty1, Type *Ty2) const override; + bool isTruncateFree(EVT VT1, EVT VT2) const override; - virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const; + bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - virtual const char *getTargetNodeName(unsigned Opcode) const; + const char *getTargetNodeName(unsigned Opcode) const override; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const; @@ -109,12 +109,12 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const; + SmallVectorImpl &InVals) const override; SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const; + SmallVectorImpl &InVals) const override; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, @@ -133,46 +133,45 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, - SDLoc dl, SelectionDAG &DAG) const; + SDLoc dl, SelectionDAG &DAG) const override; - virtual MachineBasicBlock - *EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *BB) const; + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const override; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; - virtual EVT getSetCCResultType(LLVMContext &C, EVT VT) const { + EVT getSetCCResultType(LLVMContext &C, EVT VT) const override { if (!VT.isVector()) return MVT::i1; else return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements()); } - virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, - SDValue &Base, SDValue &Offset, - ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const; + bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, + SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; std::pair getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const; + MVT VT) const override; // Intrinsics - virtual SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, - SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; /// 
isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. /// The type may be VoidTy, in which case only return true if the addressing /// mode is legal for a load/store of any legal type. /// TODO: Handle pre/postinc as well. - virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const; - virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can /// compare a register against the immediate without having to materialize /// the immediate into a register. - virtual bool isLegalICmpImmediate(int64_t Imm) const; + bool isLegalICmpImmediate(int64_t Imm) const override; }; } // end namespace llvm diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td index d25bfa8..1057343 100644 --- a/lib/Target/Hexagon/HexagonInstrFormats.td +++ b/lib/Target/Hexagon/HexagonInstrFormats.td @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Hexagon Intruction Flags + +// Hexagon Instruction Flags + // // *** Must match HexagonBaseInfo.h *** //===----------------------------------------------------------------------===// @@ -68,7 +68,7 @@ def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd) //===----------------------------------------------------------------------===// -// Intruction Class Declaration + +// Instruction Class Declaration + //===----------------------------------------------------------------------===// class OpcodeHexagon { @@ -104,54 +104,72 @@ class InstHexagon pattern, // Solo instructions, i.e., those that cannot be in a packet with others. bits<1> isSolo = 0; let TSFlags{5} = isSolo; + // Packed only with A or X-type instructions. + bits<1> isSoloAX = 0; + let TSFlags{6} = isSoloAX; + // Only A-type instruction in first slot or nothing. + bits<1> isSoloAin1 = 0; + let TSFlags{7} = isSoloAin1; // Predicated instructions. bits<1> isPredicated = 0; - let TSFlags{6} = isPredicated; + let TSFlags{8} = isPredicated; bits<1> isPredicatedFalse = 0; - let TSFlags{7} = isPredicatedFalse; + let TSFlags{9} = isPredicatedFalse; bits<1> isPredicatedNew = 0; - let TSFlags{8} = isPredicatedNew; + let TSFlags{10} = isPredicatedNew; + bits<1> isPredicateLate = 0; + let TSFlags{11} = isPredicateLate; // Late predicate producer insn. // New-value insn helper fields. bits<1> isNewValue = 0; - let TSFlags{9} = isNewValue; // New-value consumer insn. + let TSFlags{12} = isNewValue; // New-value consumer insn. bits<1> hasNewValue = 0; - let TSFlags{10} = hasNewValue; // New-value producer insn. + let TSFlags{13} = hasNewValue; // New-value producer insn. bits<3> opNewValue = 0; - let TSFlags{13-11} = opNewValue; // New-value produced operand. - bits<2> opNewBits = 0; - let TSFlags{15-14} = opNewBits; // New-value opcode bits location: 0, 8, 16. + let TSFlags{16-14} = opNewValue; // New-value produced operand. bits<1> isNVStorable = 0; - let TSFlags{16} = isNVStorable; // Store that can become new-value store. + let TSFlags{17} = isNVStorable; // Store that can become new-value store. 
bits<1> isNVStore = 0; - let TSFlags{17} = isNVStore; // New-value store insn. + let TSFlags{18} = isNVStore; // New-value store insn. + bits<1> isCVLoadable = 0; + let TSFlags{19} = isCVLoadable; // Load that can become cur-value load. + bits<1> isCVLoad = 0; + let TSFlags{20} = isCVLoad; // Cur-value load insn. // Immediate extender helper fields. bits<1> isExtendable = 0; - let TSFlags{18} = isExtendable; // Insn may be extended. + let TSFlags{21} = isExtendable; // Insn may be extended. bits<1> isExtended = 0; - let TSFlags{19} = isExtended; // Insn must be extended. + let TSFlags{22} = isExtended; // Insn must be extended. bits<3> opExtendable = 0; - let TSFlags{22-20} = opExtendable; // Which operand may be extended. + let TSFlags{25-23} = opExtendable; // Which operand may be extended. bits<1> isExtentSigned = 0; - let TSFlags{23} = isExtentSigned; // Signed or unsigned range. + let TSFlags{26} = isExtentSigned; // Signed or unsigned range. bits<5> opExtentBits = 0; - let TSFlags{28-24} = opExtentBits; //Number of bits of range before extending. + let TSFlags{31-27} = opExtentBits; //Number of bits of range before extending. + bits<2> opExtentAlign = 0; + let TSFlags{33-32} = opExtentAlign; // Alignment exponent before extending. // If an instruction is valid on a subtarget (v2-v5), set the corresponding // bit from validSubTargets. v2 is the least significant bit. // By default, instruction is valid on all subtargets. SubTarget validSubTargets = HasV2SubT; - let TSFlags{32-29} = validSubTargets.Value; + let TSFlags{37-34} = validSubTargets.Value; // Addressing mode for load/store instructions. AddrModeType addrMode = NoAddrMode; - let TSFlags{35-33} = addrMode.Value; + let TSFlags{42-40} = addrMode.Value; // Memory access size for mem access instructions (load/store) MemAccessSize accessSize = NoMemAccess; - let TSFlags{38-36} = accessSize.Value; + let TSFlags{45-43} = accessSize.Value; + + bits<1> isTaken = 0; + let TSFlags {47} = isTaken; // Branch prediction. + + bits<1> isFP = 0; + let TSFlags {48} = isFP; // Floating-point. // Fields used for relation models. string BaseOpcode = ""; @@ -173,14 +191,14 @@ class InstHexagon pattern, } //===----------------------------------------------------------------------===// -// Intruction Classes Definitions + +// Instruction Classes Definitions + //===----------------------------------------------------------------------===// // LD Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class LDInst pattern = [], - string cstr = ""> - : InstHexagon; + string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01> + : InstHexagon; let mayLoad = 1 in class LDInst2 pattern = [], @@ -199,16 +217,16 @@ class LDInstPost pattern = [], let mayLoad = 1 in class LD0Inst pattern = [], - string cstr = ""> - : LDInst; + string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0> + : InstHexagon; // ST Instruction Class in V2/V3 can take SLOT0 only. // ST Instruction Class in V4 can take SLOT0 & SLOT1. // Definition of the instruction class CHANGED from V2/V3 to V4. let mayStore = 1 in class STInst pattern = [], - string cstr = ""> - : InstHexagon; + string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01> + : InstHexagon; class STInst2 pattern = [], string cstr = ""> @@ -216,39 +234,39 @@ class STInst2 pattern = [], let mayStore = 1 in class ST0Inst pattern = [], - string cstr = ""> - : InstHexagon; + string cstr = "", InstrItinClass itin = ST_tc_ld_SLOT0> + : InstHexagon; // ST Instruction Class in V2/V3 can take SLOT0 only. 
// ST Instruction Class in V4 can take SLOT0 & SLOT1. // Definition of the instruction class CHANGED from V2/V3 to V4. class STInstPost pattern = [], - string cstr = ""> - : STInst; + string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01> + : STInst; // SYSTEM Instruction Class in V4 can take SLOT0 only // In V2/V3 we used ST for this but in v4 ST can take SLOT0 or SLOT1. class SYSInst pattern = [], - string cstr = ""> - : InstHexagon; + string cstr = "", InstrItinClass itin = ST_tc_3stall_SLOT0> + : InstHexagon; // ALU32 Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class ALU32Inst pattern = [], - string cstr = ""> - : InstHexagon; + string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123> + : InstHexagon; // ALU64 Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4. class ALU64Inst pattern = [], - string cstr = ""> - : InstHexagon; + string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23> + : InstHexagon; class ALU64_acc pattern = [], - string cstr = ""> - : ALU64Inst; + string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23> + : ALU64Inst; // M Instruction Class in V2/V3. @@ -256,55 +274,55 @@ class ALU64_acc pattern = [], // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4. class MInst pattern = [], - string cstr = ""> - : InstHexagon; + string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23> + : InstHexagon; // M Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4. class MInst_acc pattern = [], - string cstr = ""> - : MInst; + string cstr = "", InstrItinClass itin = M_tc_2_SLOT23> + : MInst; // S Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4. class SInst pattern = [], - string cstr = ""> - : InstHexagon; + string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23> + : InstHexagon; // S Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4. class SInst_acc pattern = [], - string cstr = ""> - : SInst; + string cstr = "", InstrItinClass itin = S_3op_tc_1_SLOT23> + : SInst; // J Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class JInst pattern = [], - string cstr = ""> - : InstHexagon; + string cstr = "", InstrItinClass itin = J_tc_2early_SLOT23> + : InstHexagon; // JR Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. class JRInst pattern = [], - string cstr = ""> - : InstHexagon; + string cstr = "", InstrItinClass itin = J_tc_2early_SLOT2> + : InstHexagon; // CR Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. 
class CRInst pattern = [], - string cstr = ""> - : InstHexagon; + string cstr = "", InstrItinClass itin = CR_tc_2early_SLOT3> + : InstHexagon; let isCodeGenOnly = 1, isPseudo = 1 in class Endloop pattern = [], - string cstr = ""> - : InstHexagon; + string cstr = "", InstrItinClass itin = J_tc_2early_SLOT0123> + : InstHexagon; let isCodeGenOnly = 1, isPseudo = 1 in class Pseudo pattern = [], @@ -317,39 +335,40 @@ class PseudoM pattern = [], : InstHexagon; //===----------------------------------------------------------------------===// -// Intruction Classes Definitions - +// Instruction Classes Definitions - //===----------------------------------------------------------------------===// // // ALU32 patterns //. -class ALU32_rr pattern, - string cstr = ""> - : ALU32Inst; +class ALU32_rr pattern = [], + string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123> + : ALU32Inst; -class ALU32_ir pattern, - string cstr = ""> - : ALU32Inst; +class ALU32_ir pattern = [], + string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123> + : ALU32Inst; -class ALU32_ri pattern, - string cstr = ""> - : ALU32Inst; +class ALU32_ri pattern = [], + string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123> + : ALU32Inst; + +class ALU32_ii pattern = [], + string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123> + : ALU32Inst; -class ALU32_ii pattern, - string cstr = ""> - : ALU32Inst; // // ALU64 patterns. // -class ALU64_rr pattern, - string cstr = ""> - : ALU64Inst; +class ALU64_rr pattern = [], + string cstr = "", InstrItinClass itin = ALU64_tc_1_SLOT23> + : ALU64Inst; -class ALU64_ri pattern, - string cstr = ""> - : ALU64Inst; +class ALU64_ri pattern = [], + string cstr = "", InstrItinClass itin = ALU64_tc_1_SLOT23> + : ALU64Inst; // Post increment ST Instruction. class STInstPI pattern = [], diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV4.td index 9fda0da..d92f97b 100644 --- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td +++ b/lib/Target/Hexagon/HexagonInstrFormatsV4.td @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// //----------------------------------------------------------------------------// -// Hexagon Intruction Flags + +// Hexagon Instruction Flags // // *** Must match BaseInfo.h *** //----------------------------------------------------------------------------// @@ -22,30 +22,30 @@ def TypeNV : IType<10>; def TypePREFIX : IType<30>; //----------------------------------------------------------------------------// -// Intruction Classes Definitions + +// Instruction Classes Definitions //----------------------------------------------------------------------------// // // NV type instructions. // class NVInst pattern = [], - string cstr = ""> - : InstHexagon; + string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0> + : InstHexagon; class NVInst_V4 pattern = [], - string cstr = ""> - : NVInst; + string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0> + : NVInst; // Definition of Post increment new value store. class NVInstPost_V4 pattern = [], - string cstr = ""> - : NVInst; + string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0> + : NVInst; // Post increment ST Instruction. let mayStore = 1 in class NVInstPI_V4 pattern = [], - string cstr = ""> - : NVInst; + string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0> + : NVInst; // New-value conditional branch. 
class NCJInst pattern = [],
@@ -54,13 +54,14 @@ class NCJInst pattern = [],

let mayLoad = 1, mayStore = 1 in
class MEMInst pattern = [],
-             string cstr = "">
-  : InstHexagon;
+             string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0>
+  : InstHexagon;

class MEMInst_V4 pattern = [],
-                string cstr = "">
-  : MEMInst;
+                string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0>
+  : MEMInst;

let isCodeGenOnly = 1 in
class EXTENDERInst pattern = []>
-  : InstHexagon;
+  : InstHexagon;
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 21a12de..ea6367a 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -26,13 +26,16 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-instrinfo"
+
 #define GET_INSTRINFO_CTOR_DTOR
 #define GET_INSTRMAP_INFO
 #include "HexagonGenInstrInfo.inc"
 #include "HexagonGenDFAPacketizer.inc"

-using namespace llvm;
-
 ///
 /// Constants for Hexagon instructions.
 ///
@@ -135,7 +138,7 @@ HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
     regPos = 1;
   }

-  if (FBB == 0) {
+  if (!FBB) {
     if (Cond.empty()) {
       // Due to a bug in TailMerging/CFG Optimization, we need to add a
       // special case handling of a predicated jump followed by an
@@ -151,7 +154,7 @@ HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
       if (NewTBB == NextBB) {
         ReverseBranchCondition(Cond);
         RemoveBranch(MBB);
-        return InsertBranch(MBB, TBB, 0, Cond, DL);
+        return InsertBranch(MBB, TBB, nullptr, Cond, DL);
       }
     }
     BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
@@ -174,8 +177,8 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                                      MachineBasicBlock *&FBB,
                                      SmallVectorImpl &Cond,
                                      bool AllowModify) const {
-  TBB = NULL;
-  FBB = NULL;
+  TBB = nullptr;
+  FBB = nullptr;

   // If the block has no terminators, it just falls into the block after it.
   MachineBasicBlock::instr_iterator I = MBB.instr_end();
@@ -224,7 +227,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
   // Get the last instruction in the block.
   MachineInstr *LastInst = I;
-  MachineInstr *SecondLastInst = NULL;
+  MachineInstr *SecondLastInst = nullptr;
   // Find one more terminator if present.
   do {
     if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(I)) {
@@ -557,7 +560,7 @@ MachineInstr *HexagonInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                                   const SmallVectorImpl &Ops,
                                                       int FI) const {
   // Hexagon_TODO: Implement.
-  return(0);
+  return nullptr;
 }

 unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 5da23cb..6b032c9 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -40,124 +40,121 @@ public:
  /// such, whenever a client has an instance of instruction info, it should
  /// always be able to get register info as well (through this method).
  ///
-  virtual const HexagonRegisterInfo &getRegisterInfo() const { return RI; }
+  const HexagonRegisterInfo &getRegisterInfo() const { return RI; }
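The NULL-to-nullptr rewrites in HexagonInstrInfo.cpp above are part of a tree-wide C++11 cleanup. nullptr is preferable because it has its own type (std::nullptr_t) and never competes with integer overloads; a self-contained illustration (generic code, not from the patch):

#include <iostream>

static void f(int)   { std::cout << "f(int)\n"; }
static void f(int *) { std::cout << "f(int*)\n"; }

int main() {
  f(0);       // literal 0 is an int first, so this picks f(int)
  f(nullptr); // std::nullptr_t converts only to pointers: picks f(int*)
}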
  /// isLoadFromStackSlot - If the specified machine instruction is a direct
  /// load from a stack slot, return the virtual or physical register number of
  /// the destination along with the FrameIndex of the loaded stack slot.  If
  /// not, return 0.  This predicate must return 0 if the instruction has
  /// any side effects other than loading from the stack slot.
-  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
-                                       int &FrameIndex) const;
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;

  /// isStoreToStackSlot - If the specified machine instruction is a direct
  /// store to a stack slot, return the virtual or physical register number of
  /// the source reg along with the FrameIndex of the loaded stack slot.  If
  /// not, return 0.  This predicate must return 0 if the instruction has
  /// any side effects other than storing to the stack slot.
-  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
-                                      int &FrameIndex) const;
-
-
-  virtual bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
-                             MachineBasicBlock *&FBB,
-                             SmallVectorImpl &Cond,
-                             bool AllowModify) const;
-
-  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
-
-  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-                                MachineBasicBlock *FBB,
-                                const SmallVectorImpl &Cond,
-                                DebugLoc DL) const;
-
-  virtual bool analyzeCompare(const MachineInstr *MI,
-                              unsigned &SrcReg, unsigned &SrcReg2,
-                              int &Mask, int &Value) const;
-
-  virtual void copyPhysReg(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator I, DebugLoc DL,
-                           unsigned DestReg, unsigned SrcReg,
-                           bool KillSrc) const;
-
-  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   unsigned SrcReg, bool isKill, int FrameIndex,
-                                   const TargetRegisterClass *RC,
-                                   const TargetRegisterInfo *TRI) const;
-
-  virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
-                              SmallVectorImpl &Addr,
-                              const TargetRegisterClass *RC,
-                              SmallVectorImpl &NewMIs) const;
-
-  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator MBBI,
-                                    unsigned DestReg, int FrameIndex,
-                                    const TargetRegisterClass *RC,
-                                    const TargetRegisterInfo *TRI) const;
-
-  virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
-                               SmallVectorImpl &Addr,
-                               const TargetRegisterClass *RC,
-                               SmallVectorImpl &NewMIs) const;
-
-  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
-                                              MachineInstr* MI,
-                                              const SmallVectorImpl &Ops,
-                                              int FrameIndex) const;
-
-  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
-                                              MachineInstr* MI,
-                                              const SmallVectorImpl &Ops,
-                                              MachineInstr* LoadMI) const {
-    return 0;
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl &Cond,
+                     bool AllowModify) const override;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl &Cond,
+                        DebugLoc DL) const override;
+
+  bool analyzeCompare(const MachineInstr *MI,
+                      unsigned &SrcReg, unsigned &SrcReg2,
+                      int &Mask, int &Value) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+                      SmallVectorImpl &Addr,
+                      const TargetRegisterClass *RC,
+                      SmallVectorImpl &NewMIs) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                       SmallVectorImpl &Addr,
+                       const TargetRegisterClass *RC,
+                       SmallVectorImpl &NewMIs) const;
+
+  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr* MI,
+                                      const SmallVectorImpl &Ops,
+                                      int FrameIndex) const override;
+
+  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr* MI,
+                                      const SmallVectorImpl &Ops,
+                                      MachineInstr* LoadMI) const override {
+    return nullptr;
  }

  unsigned createVR(MachineFunction* MF, MVT VT) const;

-  virtual bool isBranch(const MachineInstr *MI) const;
-  virtual bool isPredicable(MachineInstr *MI) const;
-  virtual bool
-  PredicateInstruction(MachineInstr *MI,
-                       const SmallVectorImpl &Cond) const;
-
-  virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
-                                   unsigned ExtraPredCycles,
-                                   const BranchProbability &Probability) const;
-
-  virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
-                                   unsigned NumTCycles, unsigned ExtraTCycles,
-                                   MachineBasicBlock &FMBB,
-                                   unsigned NumFCycles, unsigned ExtraFCycles,
-                                   const BranchProbability &Probability) const;
-
-  virtual bool isPredicated(const MachineInstr *MI) const;
-  virtual bool isPredicated(unsigned Opcode) const;
-  virtual bool isPredicatedTrue(const MachineInstr *MI) const;
-  virtual bool isPredicatedTrue(unsigned Opcode) const;
-  virtual bool isPredicatedNew(const MachineInstr *MI) const;
-  virtual bool isPredicatedNew(unsigned Opcode) const;
-  virtual bool DefinesPredicate(MachineInstr *MI,
-                                std::vector &Pred) const;
-  virtual bool
-  SubsumesPredicate(const SmallVectorImpl &Pred1,
-                    const SmallVectorImpl &Pred2) const;
-
-  virtual bool
-  ReverseBranchCondition(SmallVectorImpl &Cond) const;
-
-  virtual bool
-  isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumCycles,
-                            const BranchProbability &Probability) const;
-
-  virtual DFAPacketizer*
+  bool isBranch(const MachineInstr *MI) const;
+  bool isPredicable(MachineInstr *MI) const override;
+  bool PredicateInstruction(MachineInstr *MI,
+                            const SmallVectorImpl &Cond) const override;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                           unsigned ExtraPredCycles,
+                           const BranchProbability &Probability) const override;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                           unsigned NumTCycles, unsigned ExtraTCycles,
+                           MachineBasicBlock &FMBB,
+                           unsigned NumFCycles, unsigned ExtraFCycles,
+                           const BranchProbability &Probability) const override;
+
+  bool isPredicated(const MachineInstr *MI) const override;
+  bool isPredicated(unsigned Opcode) const;
+  bool isPredicatedTrue(const MachineInstr *MI) const;
+  bool isPredicatedTrue(unsigned Opcode) const;
+  bool isPredicatedNew(const MachineInstr *MI) const;
+  bool isPredicatedNew(unsigned Opcode) const;
+  bool DefinesPredicate(MachineInstr *MI,
+                        std::vector &Pred) const override;
+  bool SubsumesPredicate(const SmallVectorImpl &Pred1,
+                         const SmallVectorImpl &Pred2) const override;
+
+  bool
+  ReverseBranchCondition(SmallVectorImpl &Cond) const override;
+
+  bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                                 const BranchProbability &Probability) const override;
+
+  DFAPacketizer*
  CreateTargetScheduleState(const TargetMachine *TM,
-                            const ScheduleDAG *DAG) const;
+                            const ScheduleDAG *DAG) const override;
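The blanket virtual-to-override conversion in this header is more than style: override makes the compiler reject any declaration that no longer matches a base-class virtual, so interface drift surfaces at compile time instead of becoming a silently hidden method. A minimal sketch with hypothetical types (not the real TargetInstrInfo interface):

struct InstrInfoBase {
  virtual unsigned removeBranch(int &block) const { return 0; }
  virtual ~InstrInfoBase() {}
};

struct TargetInfo : InstrInfoBase {
  unsigned removeBranch(int &block) const override { return 1; } // verified

  // Without 'override', changing a parameter type would quietly introduce a
  // brand-new method that hides the base one. With it, the mismatch is an
  // error:
  // unsigned removeBranch(long &block) const override; // does not override
};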
-  virtual bool isSchedulingBoundary(const MachineInstr *MI,
-                                    const MachineBasicBlock *MBB,
-                                    const MachineFunction &MF) const;
+  bool isSchedulingBoundary(const MachineInstr *MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;

  bool isValidOffset(const int Opcode, const int Offset) const;
  bool isValidAutoIncImm(const EVT VT, const int Offset) const;
  bool isMemOp(const MachineInstr *MI) const;
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td
index c96aaca..4dcf101 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -768,12 +768,13 @@ class T_JMP JumpList = []>
 let InputType = "imm", isExtendable = 1, opExtendable = 1, isExtentSigned = 1,
     Defs = [PC], isPredicated = 1, opExtentBits = 17 in
-class T_JMP_c :
+class T_JMP_c :
   JInst<(outs ), (ins PredRegs:$src, brtarget:$dst),
         !if(PredNot, "if (!$src", "if ($src")#
        !if(isPredNew, ".new) ", ") ")#"jump"#
-        !if(isPredNew, !if(isTaken, ":t ", ":nt "), " ")#"$dst"> {
+        !if(isPredNew, !if(isTak, ":t ", ":nt "), " ")#"$dst"> {
+  let isTaken = isTak;
   let isBrTaken = !if(isPredNew, !if(isTaken, "true", "false"), "");
   let isPredicatedFalse = PredNot;
   let isPredicatedNew = isPredNew;
@@ -784,7 +785,7 @@ class T_JMP_c :
   let Inst{27-24} = 0b1100;
   let Inst{21} = PredNot;
-  let Inst{12} = !if(isPredNew, isTaken, zero);
+  let Inst{12} = !if(isPredNew, isTak, zero);
   let Inst{11} = isPredNew;
   let Inst{9-8} = src;
   let Inst{23-22} = dst{16-15};
@@ -806,12 +807,13 @@ class T_JMPr
 }

 let Defs = [PC], isPredicated = 1, InputType = "reg" in
-class T_JMPr_c :
+class T_JMPr_c :
   JRInst <(outs ), (ins PredRegs:$src, IntRegs:$dst),
           !if(PredNot, "if (!$src", "if ($src")#
           !if(isPredNew, ".new) ", ") ")#"jumpr"#
-          !if(isPredNew, !if(isTaken, ":t ", ":nt "), " ")#"$dst"> {
+          !if(isPredNew, !if(isTak, ":t ", ":nt "), " ")#"$dst"> {
+  let isTaken = isTak;
   let isBrTaken = !if(isPredNew, !if(isTaken, "true", "false"), "");
   let isPredicatedFalse = PredNot;
   let isPredicatedNew = isPredNew;
@@ -823,7 +825,7 @@ class T_JMPr_c :
   let Inst{27-22} = 0b001101;
   let Inst{21} = PredNot;
   let Inst{20-16} = dst;
-  let Inst{12} = !if(isPredNew, isTaken, zero);
+  let Inst{12} = !if(isPredNew, isTak, zero);
   let Inst{11} = isPredNew;
   let Inst{9-8} = src;
   let Predicates = !if(isPredNew, [HasV3T], [HasV2T]);
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index a95fb80..db5b7ea 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -1004,13 +1004,13 @@ defm POST_STwri: ST_PostInc_nv <"memw", "STriw", IntRegs, s4_2Imm>, AddrModeRel;
 let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11 in
 class NVJrr_template majOp, bit NvOpNum,
-                     bit isNegCond, bit isTaken>
+                     bit isNegCond, bit isTak>
   : NVInst_V4<(outs),
     (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
     "if ("#!if(isNegCond, "!","")#mnemonic#
     "($src1"#!if(!eq(NvOpNum, 0),".new, ",", ")#
     "$src2"#!if(!eq(NvOpNum, 1),".new))","))")#" jump:"
-    #!if(isTaken, "t","nt")#" $offset",
+    #!if(isTak, "t","nt")#" $offset",
     []>, Requires<[HasV4T]> {

   bits<5> src1;
@@ -1019,6 +1019,7 @@ class NVJrr_template majOp, bit NvOpNum,
   bits<5> RegOp; // Non-New-Value Operand
   bits<11> offset;

+  let isTaken = isTak;
   let isBrTaken = !if(isTaken, "true", "false");
   let isPredicatedFalse = isNegCond;
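The isTaken-to-isTak renaming above exists so the template parameter no longer shares a name with the instruction flag it needs to set: after the rename, "let isTaken = isTak;" can copy the parameter into the field, whereas the old spelling would have resolved back to the parameter itself. C++ constructors have the same readability hazard when a parameter shadows a member inside the body; an illustrative sketch, not code from the patch:

struct BranchHint {
  bool isTaken; // the stored flag, as in the TableGen instruction classes

  // A parameter also spelled 'isTaken' would shadow the member everywhere in
  // the constructor body; a distinct name keeps every mention unambiguous.
  explicit BranchHint(bool isTak) : isTaken(isTak) {}
};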
@@ -1030,7 +1031,7 @@ class NVJrr_template majOp, bit NvOpNum,
   let Inst{25-23} = majOp;
   let Inst{22} = isNegCond;
   let Inst{18-16} = Ns;
-  let Inst{13} = isTaken;
+  let Inst{13} = isTak;
   let Inst{12-8} = RegOp;
   let Inst{21-20} = offset{10-9};
   let Inst{7-1} = offset{8-2};
@@ -1078,13 +1079,14 @@ let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
 let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11 in
 class NVJri_template majOp, bit isNegCond,
-                     bit isTaken>
+                     bit isTak>
   : NVInst_V4<(outs),
     (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset),
     "if ("#!if(isNegCond, "!","")#mnemonic#"($src1.new, #$src2)) jump:"
-    #!if(isTaken, "t","nt")#" $offset",
+    #!if(isTak, "t","nt")#" $offset",
     []>, Requires<[HasV4T]> {

+  let isTaken = isTak;
   let isPredicatedFalse = isNegCond;
   let isBrTaken = !if(isTaken, "true", "false");
@@ -1097,7 +1099,7 @@ class NVJri_template majOp, bit isNegCond,
   let Inst{25-23} = majOp;
   let Inst{22} = isNegCond;
   let Inst{18-16} = src1;
-  let Inst{13} = isTaken;
+  let Inst{13} = isTak;
   let Inst{12-8} = src2;
   let Inst{21-20} = offset{10-9};
   let Inst{7-1} = offset{8-2};
@@ -1135,14 +1137,15 @@ let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
 let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 11 in
 class NVJ_ConstImm_template majOp, string ImmVal,
-                            bit isNegCond, bit isTaken>
+                            bit isNegCond, bit isTak>
   : NVInst_V4<(outs),
     (ins IntRegs:$src1, brtarget:$offset),
     "if ("#!if(isNegCond, "!","")#mnemonic
     #"($src1.new, #"#ImmVal#")) jump:"
-    #!if(isTaken, "t","nt")#" $offset",
+    #!if(isTak, "t","nt")#" $offset",
     []>, Requires<[HasV4T]> {

+  let isTaken = isTak;
   let isPredicatedFalse = isNegCond;
   let isBrTaken = !if(isTaken, "true", "false");
@@ -1153,7 +1156,7 @@ class NVJ_ConstImm_template majOp, string ImmVal,
   let Inst{25-23} = majOp;
   let Inst{22} = isNegCond;
   let Inst{18-16} = src1;
-  let Inst{13} = isTaken;
+  let Inst{13} = isTak;
   let Inst{21-20} = offset{10-9};
   let Inst{7-1} = offset{8-2};
 }
@@ -2019,9 +2022,10 @@ multiclass MemOpi_bitPats ;
+  def : Pat <(stOp (OpNode (ldOp (addrPred IntRegs:$addr, extPred:$offset)),
+                   immPred:$bitend),
+              (addrPred (i32 IntRegs:$addr), extPred:$offset)),
+             (MI IntRegs:$addr, extPred:$offset, (xformFunc immPred:$bitend))>;
 }

 multiclass MemOpi_bitExtType {
@@ -2065,9 +2069,10 @@ multiclass MemOpr_Pats {
   let AddedComplexity = 141 in
   // mem[bhw](Rs+#0) [+-&|]= Rt
-  def : Pat <(stOp (OpNode (ldOp addrPred:$addr), (i32 IntRegs:$addend)),
-              addrPred:$addr),
-             (MI IntRegs:$addr, #0, (i32 IntRegs:$addend) )>;
+  def : Pat <(stOp (OpNode (ldOp (addrPred IntRegs:$addr, extPred:$offset)),
+                   (i32 IntRegs:$addend)),
+              (addrPred (i32 IntRegs:$addr), extPred:$offset)),
+             (MI IntRegs:$addr, extPred:$offset, (i32 IntRegs:$addend) )>;

   // mem[bhw](Rs+#U6:[012]) [+-&|]= Rt
   let AddedComplexity = 150 in
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 51318ff..7dd6e95 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -12,17 +12,17 @@
 //
 //===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "misched"
-
 #include "HexagonMachineScheduler.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/IR/Function.h"

 using namespace llvm;

+#define DEBUG_TYPE "misched"
+
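This hunk is one of many in the patch that move #define DEBUG_TYPE below the #include block and the using-directive. Headers can carry their own DEBUG_TYPE, so a macro defined before the includes could leak into them; after the move, each .cpp defines its tag once every header is in. A minimal sketch of how the tag is consumed, in the LLVM 3.5 style where the DEBUG macro compiles away outside asserts builds:

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "misched" // after includes: no header sees this definition

static void notePick(unsigned Idx) {
  // Printed only under -debug or -debug-only=misched in an asserts build.
  DEBUG(dbgs() << "picked candidate #" << Idx << '\n');
}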
 /// Platform specific modifications to DAG.
 void VLIWMachineScheduler::postprocessDAG() {
-  SUnit* LastSequentialCall = NULL;
+  SUnit* LastSequentialCall = nullptr;
   // Currently we only catch the situation when compare gets scheduled
   // before preceding call.
   for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
@@ -398,13 +398,13 @@ SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() {
   for (unsigned i = 0; Available.empty(); ++i) {
     assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
            "permanent hazard"); (void)i;
-    ResourceModel->reserveResources(0);
+    ResourceModel->reserveResources(nullptr);
     bumpCycle();
     releasePending();
   }
   if (Available.size() == 1)
     return *Available.begin();
-  return NULL;
+  return nullptr;
 }

 #ifndef NDEBUG
@@ -424,7 +424,7 @@ void ConvergingVLIWScheduler::traceCandidate(const char *Label,
 /// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
 /// of SU, return it, otherwise return null.
 static SUnit *getSingleUnscheduledPred(SUnit *SU) {
-  SUnit *OnlyAvailablePred = 0;
+  SUnit *OnlyAvailablePred = nullptr;
   for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
        I != E; ++I) {
     SUnit &Pred = *I->getSUnit();
@@ -432,7 +432,7 @@ static SUnit *getSingleUnscheduledPred(SUnit *SU) {
       // We found an available, but not scheduled, predecessor.  If it's the
       // only one we have found, keep track of it... otherwise give up.
       if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
-        return 0;
+        return nullptr;
       OnlyAvailablePred = &Pred;
     }
   }
@@ -442,7 +442,7 @@ static SUnit *getSingleUnscheduledPred(SUnit *SU) {
 /// getSingleUnscheduledSucc - If there is exactly one unscheduled successor
 /// of SU, return it, otherwise return null.
 static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
-  SUnit *OnlyAvailableSucc = 0;
+  SUnit *OnlyAvailableSucc = nullptr;
   for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
        I != E; ++I) {
     SUnit &Succ = *I->getSUnit();
@@ -450,7 +450,7 @@ static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
       // We found an available, but not scheduled, successor.  If it's the
       // only one we have found, keep track of it... otherwise give up.
       if (OnlyAvailableSucc && OnlyAvailableSucc != &Succ)
-        return 0;
+        return nullptr;
       OnlyAvailableSucc = &Succ;
     }
   }
@@ -639,7 +639,7 @@ SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
   if (DAG->top() == DAG->bottom()) {
     assert(Top.Available.empty() && Top.Pending.empty() &&
            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
-    return NULL;
+    return nullptr;
   }
   SUnit *SU;
   if (llvm::ForceTopDown) {
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index 300f1c7..99100a1 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -14,7 +14,6 @@
 #ifndef HEXAGONASMPRINTER_H
 #define HEXAGONASMPRINTER_H

-#include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/PriorityQueue.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
@@ -57,7 +56,7 @@ class VLIWResourceModel {
 public:
 VLIWResourceModel(const TargetMachine &TM, const TargetSchedModel *SM) :
     SchedModel(SM), TotalPackets(0) {
-  ResourcesModel = TM.getInstrInfo()->CreateTargetScheduleState(&TM,NULL);
+  ResourcesModel = TM.getInstrInfo()->CreateTargetScheduleState(&TM, nullptr);

   // This hard requirement could be relaxed,
   // but for now do not let it proceed.
@@ -94,8 +93,9 @@ VLIWResourceModel(const TargetMachine &TM, const TargetSchedModel *SM) :
 /// top-level schedule() driver.
 class VLIWMachineScheduler : public ScheduleDAGMILive {
 public:
-  VLIWMachineScheduler(MachineSchedContext *C, MachineSchedStrategy *S):
-    ScheduleDAGMILive(C, S) {}
+  VLIWMachineScheduler(MachineSchedContext *C,
+                       std::unique_ptr S)
+      : ScheduleDAGMILive(C, std::move(S)) {}

   /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
   /// time to do some work.
@@ -120,7 +120,7 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
     // Best scheduling cost.
     int SCost;

-    SchedCandidate(): SU(NULL), SCost(0) {}
+    SchedCandidate(): SU(nullptr), SCost(0) {}
   };
   /// Represent the type of SchedCandidate found within a single queue.
   enum CandResult {
@@ -153,9 +153,9 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
     /// Pending queues extend the ready queues with the same ID and the
     /// PendingFlag set.
     VLIWSchedBoundary(unsigned ID, const Twine &Name):
-      DAG(0), SchedModel(0), Available(ID, Name+".A"),
+      DAG(nullptr), SchedModel(nullptr), Available(ID, Name+".A"),
       Pending(ID << ConvergingVLIWScheduler::LogMaxQID, Name+".P"),
-      CheckPending(false), HazardRec(0), ResourceModel(0),
+      CheckPending(false), HazardRec(nullptr), ResourceModel(nullptr),
       CurrCycle(0), IssueCount(0),
       MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}

@@ -203,8 +203,9 @@ public:
     LogMaxQID = 2
   };

-  ConvergingVLIWScheduler():
-    DAG(0), SchedModel(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+  ConvergingVLIWScheduler()
+      : DAG(nullptr), SchedModel(nullptr), Top(TopQID, "TopQ"),
+        Bot(BotQID, "BotQ") {}

   virtual void initialize(ScheduleDAGMI *dag) override;
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index 3e238bf..b7c03a7 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -21,7 +21,6 @@
 //
 //
 //===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "hexagon-nvj"
 #include "llvm/PassSupport.h"
 #include "Hexagon.h"
 #include "HexagonInstrInfo.h"
@@ -47,6 +46,8 @@
 #include

 using namespace llvm;

+#define DEBUG_TYPE "hexagon-nvj"
+
 STATISTIC(NumNVJGenerated, "Number of New Value Jump Instructions created");

 static cl::opt
@@ -74,16 +75,16 @@ namespace {
       initializeHexagonNewValueJumpPass(*PassRegistry::getPassRegistry());
     }

-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired();
      MachineFunctionPass::getAnalysisUsage(AU);
     }

-    const char *getPassName() const {
+    const char *getPassName() const override {
       return "Hexagon NewValueJump";
     }

-    virtual bool runOnMachineFunction(MachineFunction &Fn);
+    bool runOnMachineFunction(MachineFunction &Fn) override;
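The VLIWMachineScheduler constructor above now takes its strategy as a std::unique_ptr, so the transfer of ownership is spelled out in the signature rather than implied by a raw pointer. A reduced sketch of the idiom with generic names (std::make_unique is C++14; the 3.5 tree uses its own llvm::make_unique equivalent):

#include <memory>
#include <utility>

struct Strategy { virtual ~Strategy() {} };

class Scheduler {
  std::unique_ptr<Strategy> S; // the scheduler now owns the strategy

public:
  explicit Scheduler(std::unique_ptr<Strategy> InS) : S(std::move(InS)) {}
};

int main() {
  auto St = std::make_unique<Strategy>();
  Scheduler Sched(std::move(St)); // caller must visibly hand ownership over
}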
  private:
    /// \brief A handle to the branch probability pass.
@@ -393,8 +394,8 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
       bool MO2IsKill = false;
       MachineBasicBlock::iterator jmpPos;
       MachineBasicBlock::iterator cmpPos;
-      MachineInstr *cmpInstr = NULL, *jmpInstr = NULL;
-      MachineBasicBlock *jmpTarget = NULL;
+      MachineInstr *cmpInstr = nullptr, *jmpInstr = nullptr;
+      MachineBasicBlock *jmpTarget = nullptr;
       bool afterRA = false;
       bool isSecondOpReg = false;
       bool isSecondOpNewified = false;
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 5490ecd..48b6159 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -35,7 +35,6 @@
 //===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "hexagon-peephole"
 #include "Hexagon.h"
 #include "HexagonTargetMachine.h"
 #include "llvm/ADT/DenseMap.h"
@@ -57,6 +56,8 @@

 using namespace llvm;

+#define DEBUG_TYPE "hexagon-peephole"
+
 static cl::opt
 DisableHexagonPeephole("disable-hexagon-peephole", cl::Hidden, cl::ZeroOrMore,
                        cl::init(false), cl::desc("Disable Peephole Optimization"));
@@ -89,13 +90,13 @@ namespace {
       initializeHexagonPeepholePass(*PassRegistry::getPassRegistry());
     }

-    bool runOnMachineFunction(MachineFunction &MF);
+    bool runOnMachineFunction(MachineFunction &MF) override;

-    const char *getPassName() const {
+    const char *getPassName() const override {
       return "Hexagon optimize redundant zero and size extends";
     }

-    void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       MachineFunctionPass::getAnalysisUsage(AU);
     }
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 9a20dfd..fb466d3 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -43,13 +43,12 @@ HexagonRegisterInfo::HexagonRegisterInfo(HexagonSubtarget &st)
     Subtarget(st) {
 }

-const uint16_t* HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction
-                                                        *MF)
-  const {
-  static const uint16_t CalleeSavedRegsV2[] = {
+const MCPhysReg *
+HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  static const MCPhysReg CalleeSavedRegsV2[] = {
     Hexagon::R24,   Hexagon::R25,   Hexagon::R26,   Hexagon::R27, 0
   };
-  static const uint16_t CalleeSavedRegsV3[] = {
+  static const MCPhysReg CalleeSavedRegsV3[] = {
     Hexagon::R16,   Hexagon::R17,   Hexagon::R18,   Hexagon::R19,
     Hexagon::R20,   Hexagon::R21,   Hexagon::R22,   Hexagon::R23,
     Hexagon::R24,   Hexagon::R25,   Hexagon::R26,   Hexagon::R27, 0
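getCalleeSavedRegs now returns const MCPhysReg*, LLVM's dedicated physical-register type (a uint16_t typedef at this point in history), and the arrays above keep their trailing 0 sentinel so callers can walk them without a separate length. The DoesModifyCalleeSavedReg loop later in this patch iterates exactly this way; reduced to a standalone sketch:

#include <cstdint>
#include <iostream>

typedef uint16_t PhysReg; // stand-in for llvm::MCPhysReg

// Real register numbers are non-zero; 0 marks the end of the list.
static const PhysReg CalleeSaved[] = {24, 25, 26, 27, 0};

int main() {
  for (const PhysReg *CSR = CalleeSaved; *CSR; ++CSR)
    std::cout << "callee-saved r" << *CSR << '\n';
}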
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 89af7c3..648b4af 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -48,16 +48,17 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo {
   HexagonRegisterInfo(HexagonSubtarget &st);

   /// Code Generation virtual methods...
-  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;

-  const TargetRegisterClass* const* getCalleeSavedRegClasses(
-                                     const MachineFunction *MF = 0) const;
+  const TargetRegisterClass* const*
+  getCalleeSavedRegClasses(const MachineFunction *MF = nullptr) const;

-  BitVector getReservedRegs(const MachineFunction &MF) const;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;

   void eliminateFrameIndex(MachineBasicBlock::iterator II,
                            int SPAdj, unsigned FIOperandNum,
-                           RegScavenger *RS = NULL) const;
+                           RegScavenger *RS = nullptr) const override;

   /// determineFrameLayout - Determine the size of the frame and maximum call
   /// frame size.
@@ -65,17 +66,17 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo {

   /// requiresRegisterScavenging - returns true since we may need scavenging for
   /// a temporary register when generating hardware loop instructions.
-  bool requiresRegisterScavenging(const MachineFunction &MF) const {
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override {
     return true;
   }

-  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
     return true;
   }

   // Debug information queries.
   unsigned getRARegister() const;
-  unsigned getFrameRegister(const MachineFunction &MF) const;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
   unsigned getFrameRegister() const;
   unsigned getStackRegister() const;
 };
diff --git a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
index cadcb32..2b459a4 100644
--- a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
+++ b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
@@ -33,13 +33,13 @@ namespace {
     HexagonRemoveExtendArgs() : FunctionPass(ID) {
       initializeHexagonRemoveExtendArgsPass(*PassRegistry::getPassRegistry());
     }
-    virtual bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;

-    const char *getPassName() const {
+    const char *getPassName() const override {
       return "Remove sign extends";
     }

-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired();
       AU.addPreserved();
       AU.addPreserved("stack-protector");
diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td
index c2cfbb9..528cafc 100644
--- a/lib/Target/Hexagon/HexagonSchedule.td
+++ b/lib/Target/Hexagon/HexagonSchedule.td
@@ -7,57 +7,6 @@
 //
 //===----------------------------------------------------------------------===//

-// Functional Units
-def LSUNIT    : FuncUnit; // SLOT0
-def LUNIT     : FuncUnit; // SLOT1
-def MUNIT     : FuncUnit; // SLOT2
-def SUNIT     : FuncUnit; // SLOT3
-def LOOPUNIT  : FuncUnit;
-
-// Itinerary classes
-def ALU32     : InstrItinClass;
-def ALU64     : InstrItinClass;
-def CR        : InstrItinClass;
-def J         : InstrItinClass;
-def JR        : InstrItinClass;
-def LD        : InstrItinClass;
-def LD0       : InstrItinClass;
-def M         : InstrItinClass;
-def ST        : InstrItinClass;
-def ST0       : InstrItinClass;
-def S         : InstrItinClass;
-def SYS       : InstrItinClass;
-def ENDLOOP   : InstrItinClass;
-def PSEUDO    : InstrItinClass;
-def PSEUDOM   : InstrItinClass;
-
-def HexagonItineraries :
-  ProcessorItineraries<[LSUNIT, LUNIT, MUNIT, SUNIT, LOOPUNIT], [], [
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData,
-                  InstrStage<1, [MUNIT, SUNIT]>]>
-  ]>;
-
-def HexagonModel : SchedMachineModel {
-  // Max issue per cycle == bundle width.
-  let IssueWidth = 4;
-  let Itineraries = HexagonItineraries;
-  let LoadLatency = 1;
-}
-
 //===----------------------------------------------------------------------===//
 // V4 Machine Info +
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td
index ef72cf4..a7d2d47 100644
--- a/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -34,29 +34,158 @@ def SLOT3       : FuncUnit;
 def SLOT_ENDLOOP: FuncUnit;

 // Itinerary classes.
-def NV_V4   : InstrItinClass;
-def MEM_V4  : InstrItinClass;
+def PSEUDO      : InstrItinClass;
+def PSEUDOM     : InstrItinClass;
 // ALU64/M/S Instruction classes of V2 are collectively known as XTYPE in V4.
+def DUPLEX      : InstrItinClass;
 def PREFIX      : InstrItinClass;
+def COMPOUND    : InstrItinClass;
+
+def ALU32_2op_tc_1_SLOT0123      : InstrItinClass;
+def ALU32_2op_tc_2early_SLOT0123 : InstrItinClass;
+def ALU32_3op_tc_2early_SLOT0123 : InstrItinClass;
+def ALU32_3op_tc_1_SLOT0123      : InstrItinClass;
+def ALU32_3op_tc_2_SLOT0123      : InstrItinClass;
+def ALU32_ADDI_tc_1_SLOT0123     : InstrItinClass;
+def ALU64_tc_1_SLOT23            : InstrItinClass;
+def ALU64_tc_1or2_SLOT23         : InstrItinClass;
+def ALU64_tc_2_SLOT23            : InstrItinClass;
+def ALU64_tc_2early_SLOT23       : InstrItinClass;
+def ALU64_tc_3x_SLOT23           : InstrItinClass;
+def CR_tc_2_SLOT3                : InstrItinClass;
+def CR_tc_2early_SLOT23          : InstrItinClass;
+def CR_tc_2early_SLOT3           : InstrItinClass;
+def CR_tc_3x_SLOT23              : InstrItinClass;
+def CR_tc_3x_SLOT3               : InstrItinClass;
+def J_tc_2early_SLOT23           : InstrItinClass;
+def J_tc_2early_SLOT2            : InstrItinClass;
+def LD_tc_ld_SLOT01              : InstrItinClass;
+def LD_tc_ld_SLOT0               : InstrItinClass;
+def LD_tc_3or4stall_SLOT0        : InstrItinClass;
+def M_tc_1_SLOT23                : InstrItinClass;
+def M_tc_1or2_SLOT23             : InstrItinClass;
+def M_tc_2_SLOT23                : InstrItinClass;
+def M_tc_3_SLOT23                : InstrItinClass;
+def M_tc_3x_SLOT23               : InstrItinClass;
+def M_tc_3or4x_SLOT23            : InstrItinClass;
+def ST_tc_st_SLOT01              : InstrItinClass;
+def ST_tc_st_SLOT0               : InstrItinClass;
+def ST_tc_ld_SLOT0               : InstrItinClass;
+def ST_tc_3stall_SLOT0           : InstrItinClass;
+def S_2op_tc_1_SLOT23            : InstrItinClass;
+def S_2op_tc_2_SLOT23            : InstrItinClass;
+def S_2op_tc_2early_SLOT23       : InstrItinClass;
+def S_2op_tc_3or4x_SLOT23        : InstrItinClass;
+def S_3op_tc_1_SLOT23            : InstrItinClass;
+def S_3op_tc_1or2_SLOT23         : InstrItinClass;
+def S_3op_tc_2_SLOT23            : InstrItinClass;
+def S_3op_tc_2early_SLOT23       : InstrItinClass;
+def S_3op_tc_3_SLOT23            : InstrItinClass;
+def S_3op_tc_3x_SLOT23           : InstrItinClass;
+def NCJ_tc_3or4stall_SLOT0       : InstrItinClass;
+def V2LDST_tc_ld_SLOT01          : InstrItinClass;
+def V2LDST_tc_st_SLOT0           : InstrItinClass;
+def V2LDST_tc_st_SLOT01          : InstrItinClass;
+def V4LDST_tc_ld_SLOT01          : InstrItinClass;
+def V4LDST_tc_st_SLOT0           : InstrItinClass;
+def V4LDST_tc_st_SLOT01          : InstrItinClass;
+def J_tc_2early_SLOT0123         : InstrItinClass;
+def EXTENDER_tc_1_SLOT0123       : InstrItinClass;
+
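Each new itinerary class name encodes three things: the functional-unit family, a timing tier (tc_1, tc_2early, tc_3x, and so on), and the packet slots the instruction may occupy. A toy C++ rendering of that last dimension as slot masks (purely illustrative; the real mapping lives in the InstrItinData entries that follow):

#include <cstdint>

enum Slot : uint8_t { SLOT0 = 1, SLOT1 = 2, SLOT2 = 4, SLOT3 = 8 };

enum class Itin { ALU32_2op_tc_1_SLOT0123, ALU64_tc_2_SLOT23, ST_tc_st_SLOT01 };

// C++14 constexpr: the suffix of each name spells out this mask.
constexpr uint8_t slotMask(Itin I) {
  switch (I) {
  case Itin::ALU32_2op_tc_1_SLOT0123: return SLOT0 | SLOT1 | SLOT2 | SLOT3;
  case Itin::ALU64_tc_2_SLOT23:       return SLOT2 | SLOT3;
  case Itin::ST_tc_st_SLOT01:         return SLOT0 | SLOT1;
  }
  return 0;
}

static_assert(slotMask(Itin::ST_tc_st_SLOT01) == (SLOT0 | SLOT1),
              "stores may only issue in slots 0 and 1");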
 def HexagonItinerariesV4 :
       ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
-    InstrItinData]>,
+    // ALU32
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+
+    // ALU64
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+
+    // CR -> System
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+
+    // Jump (conditional/unconditional/return etc)
+    // CR
+    InstrItinData]>,
+    InstrItinData]>,
+    // J
+    InstrItinData]>,
+    // JR
+    InstrItinData]>,
+
+    //Load
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+
+    // M
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+
+    // Store
+    // ST
+    InstrItinData]>,
+    // ST0
+    InstrItinData]>,
+    InstrItinData]>,
+
+    // S
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+
+    // SYS
+    InstrItinData]>,
+
+    // New Value Compare Jump
+    InstrItinData]>,
+
+    // Mem ops - MEM_V4
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+    InstrItinData]>,
+
+    InstrItinData]>,
+
+    // ENDLOOP
+    InstrItinData]>,
+
+    // Extender/PREFIX
+    InstrItinData]>,
+
+    InstrItinData]>,
     InstrItinData]>,
     InstrItinData,
                   InstrStage<1, [SLOT2, SLOT3]>]>
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index c37bf9f..9e1e0fd 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -11,10 +11,11 @@
 //
 //===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "hexagon-selectiondag-info"
 #include "HexagonTargetMachine.h"
 using namespace llvm;

+#define DEBUG_TYPE "hexagon-selectiondag-info"
+
 bool llvm::flag_aligned_memcpy;

 HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const HexagonTargetMachine
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
index 31f278a..8ba6108 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
@@ -25,14 +25,13 @@ public:
   explicit HexagonSelectionDAGInfo(const HexagonTargetMachine &TM);
   ~HexagonSelectionDAGInfo();

-  virtual
   SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                                   SDValue Chain,
                                   SDValue Dst, SDValue Src,
                                   SDValue Size, unsigned Align,
                                   bool isVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo,
-                                  MachinePointerInfo SrcPtrInfo) const;
+                                  MachinePointerInfo SrcPtrInfo) const override;
 };

 }
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index 5303f44..247207f 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -17,11 +17,10 @@
 //
 //===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "xfer"
-
-#include "HexagonTargetMachine.h"
 #include "HexagonMachineFunctionInfo.h"
 #include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonTargetObjectFile.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LatencyPriorityQueue.h"
 #include "llvm/CodeGen/MachineDominators.h"
@@ -44,21 +43,22 @@

 using namespace llvm;

+#define DEBUG_TYPE "xfer"
+
 namespace {

 class HexagonSplitConst32AndConst64 : public MachineFunctionPass {
-    const HexagonTargetMachine& QTM;
-    const HexagonSubtarget &QST;
+    const HexagonTargetMachine &QTM;

  public:
    static char ID;
-    HexagonSplitConst32AndConst64(const HexagonTargetMachine& TM)
-      : MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {}
+    HexagonSplitConst32AndConst64(const HexagonTargetMachine &TM)
+        : MachineFunctionPass(ID), QTM(TM) {}

-    const char *getPassName() const {
+    const char *getPassName() const override {
      return "Hexagon Split Const32s and Const64s";
    }
-    bool runOnMachineFunction(MachineFunction &Fn);
+    bool runOnMachineFunction(MachineFunction &Fn) override;
 };

@@ -67,6 +67,12 @@ char HexagonSplitConst32AndConst64::ID = 0;

 bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {

+  const HexagonTargetObjectFile &TLOF =
+      (const HexagonTargetObjectFile &)
+      QTM.getTargetLowering()->getObjFileLowering();
+  if (TLOF.IsSmallDataEnabled())
+    return true;
+
   const TargetInstrInfo *TII = QTM.getInstrInfo();

   // Loop over all of the basic blocks
diff --git a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
index 8608e08..9601090 100644
--- a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
+++ b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
@@ -26,7 +26,6 @@
 //
 //===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "xfer"
 #include "Hexagon.h"
 #include "HexagonMachineFunctionInfo.h"
 #include "HexagonSubtarget.h"
@@ -49,6 +48,8 @@

 using namespace llvm;

+#define DEBUG_TYPE "xfer"
+
 namespace llvm {
   void initializeHexagonSplitTFRCondSetsPass(PassRegistry&);
 }
@@ -67,10 +68,10 @@ class HexagonSplitTFRCondSets : public MachineFunctionPass {
       initializeHexagonSplitTFRCondSetsPass(*PassRegistry::getPassRegistry());
     }

-    const char *getPassName() const {
+    const char *getPassName() const override {
       return "Hexagon Split TFRCondSets";
     }
-    bool runOnMachineFunction(MachineFunction &Fn);
+    bool runOnMachineFunction(MachineFunction &Fn) override;
 };

@@ -221,7 +222,8 @@ bool HexagonSplitTFRCondSets::runOnMachineFunction(MachineFunction &Fn) {
 static void initializePassOnce(PassRegistry &Registry) {
   const char *Name = "Hexagon Split TFRCondSets";
   PassInfo *PI = new PassInfo(Name, "hexagon-split-tfr",
-                              &HexagonSplitTFRCondSets::ID, 0, false, false);
+                              &HexagonSplitTFRCondSets::ID, nullptr, false,
+                              false);
   Registry.registerPass(*PI, true);
 }
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index fca6707..70c87fa 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -18,6 +18,8 @@
 #include "llvm/Support/ErrorHandling.h"
 using namespace llvm;

+#define DEBUG_TYPE "hexagon-subtarget"
+
 #define GET_SUBTARGETINFO_CTOR
 #define GET_SUBTARGETINFO_TARGET_DESC
 #include "HexagonGenSubtargetInfo.inc"
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 9ce1fb8..b923764 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -52,7 +52,7 @@ extern "C" void LLVMInitializeHexagonTarget() {
 }

 static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) {
-  return new VLIWMachineScheduler(C, new ConvergingVLIWScheduler());
+  return new VLIWMachineScheduler(C, make_unique());
 }

 static MachineSchedRegistry
@@ -79,20 +79,6 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT,
   initAsmInfo();
 }
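createVLIWMachineSched above now builds the strategy with make_unique (llvm::make_unique in this era, since std::make_unique only arrives with C++14). Beyond brevity, the factory removes every raw 'new' from the call site and keeps the allocation owned from its very first expression; sketched generically:

#include <memory>

struct Strategy { virtual ~Strategy() {} };
struct ConvergingStrategy : Strategy {};

// The unique_ptr return type documents that the caller receives ownership.
static std::unique_ptr<Strategy> makeDefaultStrategy() {
  return std::make_unique<ConvergingStrategy>();
}

int main() {
  auto S = makeDefaultStrategy(); // freed automatically at end of scope
}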
-// addPassesForOptimizations - Allow the backend (target) to add Target
-// Independent Optimization passes to the Pass Manager.
-bool HexagonTargetMachine::addPassesForOptimizations(PassManagerBase &PM) {
-  if (getOptLevel() != CodeGenOpt::None) {
-    PM.add(createConstantPropagationPass());
-    PM.add(createLoopSimplifyPass());
-    PM.add(createDeadCodeEliminationPass());
-    PM.add(createConstantPropagationPass());
-    PM.add(createLoopUnrollPass());
-    PM.add(createLoopStrengthReducePass());
-  }
-  return true;
-}
-
 namespace {
 /// Hexagon Code Generator Pass Configuration Options.
 class HexagonPassConfig : public TargetPassConfig {
@@ -113,16 +99,16 @@ public:
     return getTM();
   }

-  virtual ScheduleDAGInstrs *
-  createMachineScheduler(MachineSchedContext *C) const {
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override {
     return createVLIWMachineSched(C);
   }

-  virtual bool addInstSelector();
-  virtual bool addPreRegAlloc();
-  virtual bool addPostRegAlloc();
-  virtual bool addPreSched2();
-  virtual bool addPreEmitPass();
+  bool addInstSelector() override;
+  bool addPreRegAlloc() override;
+  bool addPostRegAlloc() override;
+  bool addPreSched2() override;
+  bool addPreEmitPass() override;
 };
 } // namespace
@@ -164,16 +150,12 @@ bool HexagonPassConfig::addPostRegAlloc() {
 bool HexagonPassConfig::addPreSched2() {
   const HexagonTargetMachine &TM = getHexagonTargetMachine();
-  const HexagonTargetObjectFile &TLOF =
-    (const HexagonTargetObjectFile &)getTargetLowering()->getObjFileLowering();

   addPass(createHexagonCopyToCombine());
   if (getOptLevel() != CodeGenOpt::None)
     addPass(&IfConverterID);

-  if (!TLOF.IsSmallDataEnabled()) {
-    addPass(createHexagonSplitConst32AndConst64(TM));
-    printAndVerify("After hexagon split const32/64 pass");
-  }
+  addPass(createHexagonSplitConst32AndConst64(TM));
+  printAndVerify("After hexagon split const32/64 pass");
   return true;
 }
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h
index cf8f9aa..70b835e 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -41,39 +41,37 @@ public:
                        Reloc::Model RM, CodeModel::Model CM,
                        CodeGenOpt::Level OL);

-  virtual const HexagonInstrInfo *getInstrInfo() const {
+  const HexagonInstrInfo *getInstrInfo() const override {
     return &InstrInfo;
   }
-  virtual const HexagonSubtarget *getSubtargetImpl() const {
+  const HexagonSubtarget *getSubtargetImpl() const override {
     return &Subtarget;
   }
-  virtual const HexagonRegisterInfo *getRegisterInfo() const {
+  const HexagonRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
-  virtual const InstrItineraryData* getInstrItineraryData() const {
+  const InstrItineraryData* getInstrItineraryData() const override {
     return InstrItins;
   }
-  virtual const HexagonTargetLowering* getTargetLowering() const {
+  const HexagonTargetLowering* getTargetLowering() const override {
     return &TLInfo;
   }
-  virtual const HexagonFrameLowering* getFrameLowering() const {
+  const HexagonFrameLowering* getFrameLowering() const override {
     return &FrameLowering;
   }
-  virtual const HexagonSelectionDAGInfo* getSelectionDAGInfo() const {
+  const HexagonSelectionDAGInfo* getSelectionDAGInfo() const override {
     return &TSInfo;
   }
-  virtual const DataLayout *getDataLayout() const { return &DL; }
+  const DataLayout *getDataLayout() const override { return &DL; }
   static unsigned getModuleMatchQuality(const Module &M);

-  // Pass Pipeline Configuration.
-  virtual bool addPassesForOptimizations(PassManagerBase &PM);
-  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 };

 extern bool flag_aligned_memcpy;
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 976ff2b..87ce960 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -16,7 +16,6 @@
 // prune the dependence.
 //
 //===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "packets"
 #include "llvm/CodeGen/DFAPacketizer.h"
 #include "Hexagon.h"
 #include "HexagonMachineFunctionInfo.h"
@@ -51,6 +50,8 @@

 using namespace llvm;

+#define DEBUG_TYPE "packets"
+
 static cl::opt
   PacketizeVolatiles("hexagon-packetize-volatiles", cl::ZeroOrMore, cl::Hidden,
                      cl::init(true), cl::desc("Allow non-solo packetization of volatile memory references"));
@@ -69,7 +70,7 @@ namespace {
       initializeHexagonPacketizerPass(*PassRegistry::getPassRegistry());
     }

-    void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       AU.addRequired();
       AU.addRequired();
@@ -79,11 +80,11 @@ namespace {
       MachineFunctionPass::getAnalysisUsage(AU);
     }

-    const char *getPassName() const {
+    const char *getPassName() const override {
       return "Hexagon Packetizer";
     }

-    bool runOnMachineFunction(MachineFunction &Fn);
+    bool runOnMachineFunction(MachineFunction &Fn) override;
   };

   char HexagonPacketizer::ID = 0;
@@ -121,24 +122,25 @@ namespace {
                 const MachineBranchProbabilityInfo *MBPI);

     // initPacketizerState - initialize some internal flags.
-    void initPacketizerState();
+    void initPacketizerState() override;

     // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
-    bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB);
+    bool ignorePseudoInstruction(MachineInstr *MI,
+                                 MachineBasicBlock *MBB) override;

     // isSoloInstruction - return true if instruction MI can not be packetized
     // with any other instruction, which means that MI itself is a packet.
-    bool isSoloInstruction(MachineInstr *MI);
+    bool isSoloInstruction(MachineInstr *MI) override;

     // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
     // together.
-    bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ);
+    bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override;

     // isLegalToPruneDependencies - Is it legal to prune dependence between SUI
     // and SUJ.
-    bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ);
+    bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override;

-    MachineBasicBlock::iterator addToPacket(MachineInstr *MI);
+    MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override;
   private:
     bool IsCallDependent(MachineInstr* MI, SDep::Kind DepType, unsigned DepReg);
     bool PromoteToDotNew(MachineInstr* MI, SDep::Kind DepType,
@@ -390,7 +392,7 @@ static bool IsLoopN(MachineInstr *MI) {
 /// callee-saved register.
 static bool DoesModifyCalleeSavedReg(MachineInstr *MI,
                                      const TargetRegisterInfo *TRI) {
-  for (const uint16_t *CSR = TRI->getCalleeSavedRegs(); *CSR; ++CSR) {
+  for (const MCPhysReg *CSR = TRI->getCalleeSavedRegs(); *CSR; ++CSR) {
     unsigned CalleeSavedReg = *CSR;
     if (MI->modifiesRegister(CalleeSavedReg, TRI))
       return true;
@@ -603,7 +605,7 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
   // evaluate identically
   unsigned predRegNumSrc = 0;
   unsigned predRegNumDst = 0;
-  const TargetRegisterClass* predRegClass = NULL;
+  const TargetRegisterClass* predRegClass = nullptr;

   // Get predicate register used in the source instruction
   for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) {
@@ -1172,7 +1174,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
       // of that (IsCallDependent) function. Bug 6216 is opened for this.
       //
       unsigned DepReg = 0;
-      const TargetRegisterClass* RC = NULL;
+      const TargetRegisterClass* RC = nullptr;
       if (DepType == SDep::Data) {
         DepReg = SUJ->Succs[i].getReg();
         RC = QRI->getMinimalPhysRegClass(DepReg);
diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
index 33667f4..9942a60 100644
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "asm-printer"
 #include "HexagonAsmPrinter.h"
 #include "Hexagon.h"
 #include "HexagonInstPrinter.h"
@@ -24,6 +23,8 @@

 using namespace llvm;

+#define DEBUG_TYPE "asm-printer"
+
 #define GET_INSTRUCTION_NAME
 #include "HexagonGenAsmWriter.inc"
diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
index d0cef68..09e3f88 100644
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
+++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
@@ -27,7 +27,7 @@ namespace llvm {
                       const MCRegisterInfo &MRI)
       : MCInstPrinter(MAI, MII, MRI), MII(MII) {}

-    virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+    void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
     void printInst(const HexagonMCInst *MI, raw_ostream &O, StringRef Annot);
     virtual StringRef getOpcodeName(unsigned Opcode) const;
     void printInstruction(const MCInst *MI, raw_ostream &O);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index 8519cf3..f8be77c 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -87,70 +87,82 @@ namespace HexagonII {
     // Solo instructions.
     SoloPos  = 5,
     SoloMask = 0x1,
+    // Packed only with A or X-type instructions.
+    SoloAXPos  = 6,
+    SoloAXMask = 0x1,
+    // Only A-type instruction in first slot or nothing.
+    SoloAin1Pos  = 7,
+    SoloAin1Mask = 0x1,

     // Predicated instructions.
-    PredicatedPos  = 6,
+    PredicatedPos  = 8,
     PredicatedMask = 0x1,
-    PredicatedFalsePos  = 7,
+    PredicatedFalsePos  = 9,
     PredicatedFalseMask = 0x1,
-    PredicatedNewPos  = 8,
+    PredicatedNewPos  = 10,
     PredicatedNewMask = 0x1,
+    PredicateLatePos  = 11,
+    PredicateLateMask = 0x1,

     // New-Value consumer instructions.
-    NewValuePos  = 9,
+    NewValuePos  = 12,
     NewValueMask = 0x1,
-
     // New-Value producer instructions.
-    hasNewValuePos  = 10,
+    hasNewValuePos  = 13,
     hasNewValueMask = 0x1,
-
     // Which operand consumes or produces a new value.
-    NewValueOpPos  = 11,
+    NewValueOpPos  = 14,
     NewValueOpMask = 0x7,
-
-    // Which bits encode the new value.
-    NewValueBitsPos  = 14,
-    NewValueBitsMask = 0x3,
-
     // Stores that can become new-value stores.
-    mayNVStorePos  = 16,
+    mayNVStorePos  = 17,
     mayNVStoreMask = 0x1,
-
     // New-value store instructions.
-    NVStorePos  = 17,
+    NVStorePos  = 18,
     NVStoreMask = 0x1,
+    // Loads that can become current-value loads.
+    mayCVLoadPos  = 19,
+    mayCVLoadMask = 0x1,
+    // Current-value load instructions.
+    CVLoadPos  = 20,
+    CVLoadMask = 0x1,

     // Extendable insns.
-    ExtendablePos  = 18,
+    ExtendablePos  = 21,
     ExtendableMask = 0x1,
-
     // Insns must be extended.
-    ExtendedPos  = 19,
+    ExtendedPos  = 22,
     ExtendedMask = 0x1,
-
     // Which operand may be extended.
-    ExtendableOpPos  = 20,
+    ExtendableOpPos  = 23,
     ExtendableOpMask = 0x7,
-
     // Signed or unsigned range.
-    ExtentSignedPos  = 23,
+    ExtentSignedPos  = 26,
     ExtentSignedMask = 0x1,
-
     // Number of bits of range before extending operand.
-    ExtentBitsPos  = 24,
+    ExtentBitsPos  = 27,
     ExtentBitsMask = 0x1f,
+    // Alignment power-of-two before extending operand.
+    ExtentAlignPos  = 32,
+    ExtentAlignMask = 0x3,

     // Valid subtargets
-    validSubTargetPos  = 29,
+    validSubTargetPos  = 34,
     validSubTargetMask = 0xf,

     // Addressing mode for load/store instructions.
-    AddrModePos  = 33,
+    AddrModePos  = 40,
     AddrModeMask = 0x7,
+    // Access size for load/store instructions.
+    MemAccessSizePos = 43,
+    MemAccesSizeMask = 0x7,
+
+    // Branch predicted taken.
+    TakenPos = 47,
+    TakenMask = 0x1,

-    // Access size of memory access instructions (load/store).
-    MemAccessSizePos = 36,
-    MemAccesSizeMask = 0x7
+    // Floating-point instructions.
+    FPPos  = 48,
+    FPMask = 0x1
   };

   // *** The code above must match HexagonInstrFormat*.td *** //
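Every Pos/Mask pair in this enum describes one bit-field packed into an instruction's TSFlags word; the renumbering above inserts new fields and pushes later ones upward, with FPPos now at bit 48, which is why the flag carrier must be 64 bits wide. The extraction pattern the Pos/Mask idiom implies, in a standalone sketch (field names shortened, values illustrative):

#include <cstdint>

enum : uint64_t {
  PredicatedPosT = 8,  PredicatedMaskT = 0x1, // 1-bit flag at bit 8
  AddrModePosT   = 40, AddrModeMaskT   = 0x7, // 3-bit field at bits 40-42
};

constexpr uint64_t getField(uint64_t TSFlags, uint64_t Pos, uint64_t Mask) {
  return (TSFlags >> Pos) & Mask; // shift the field to bit 0, mask the rest
}

static_assert(getField(uint64_t(0x5) << 40, AddrModePosT, AddrModeMaskT) == 0x5,
              "a 3-bit addressing mode round-trips through bits 40-42");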
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index f1a65c3..141e514 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -21,7 +21,7 @@ void HexagonMCAsmInfo::anchor() {}
 HexagonMCAsmInfo::HexagonMCAsmInfo(StringRef TT) {
   Data16bitsDirective = "\t.half\t";
   Data32bitsDirective = "\t.word\t";
-  Data64bitsDirective = 0;  // .xword is only supported by V9.
+  Data64bitsDirective = nullptr;  // .xword is only supported by V9.
   ZeroDirective = "\t.skip\t";
   CommentString = "//";
   HasLEB128 = true;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
index bd8cb76..953d804 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -19,7 +19,7 @@ namespace llvm {
 class HexagonMCAsmInfo : public MCAsmInfoELF {
-  virtual void anchor();
+  void anchor() override;
 public:
   explicit HexagonMCAsmInfo(StringRef TT);
 };
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h
index 3ca71f0..3c52d45 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h
@@ -31,7 +31,7 @@ namespace llvm {

   public:
     explicit HexagonMCInst():
-      MCInst(), MCID(0), packetStart(0), packetEnd(0) {};
+      MCInst(), MCID(nullptr), packetStart(0), packetEnd(0) {};
     HexagonMCInst(const MCInstrDesc& mcid):
       MCInst(), MCID(&mcid), packetStart(0), packetEnd(0) {};
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 7f103d8..581674d 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -23,6 +23,8 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"

+using namespace llvm;
+
 #define GET_INSTRINFO_MC_DESC
 #include "HexagonGenInstrInfo.inc"
@@ -32,8 +34,6 @@
 #define GET_REGINFO_MC_DESC
 #include "HexagonGenRegisterInfo.inc"

-using namespace llvm;
-
 static MCInstrInfo *createHexagonMCInstrInfo() {
   MCInstrInfo *X = new MCInstrInfo();
   InitHexagonMCInstrInfo(X);
@@ -60,7 +60,7 @@ static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI,
   // VirtualFP = (R30 + #0).
   MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
-                                              0, Hexagon::R30, 0);
+                                              nullptr, Hexagon::R30, 0);
   MAI->addInitialFrameState(Inst);

   return MAI;
diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt
index 13abaf8..1b0837c 100644
--- a/lib/Target/LLVMBuild.txt
+++ b/lib/Target/LLVMBuild.txt
@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;

 [common]
-subdirectories = AArch64 ARM ARM64 CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore
+subdirectories = ARM AArch64 CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore

 ; This is a special group whose required libraries are extended (by llvm-build)
 ; with the best execution engine (the native JIT, if available, or the
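The anchor() methods gaining override above follow an LLVM convention: each polymorphic class declares one deliberately out-of-line virtual, so its vtable is emitted in a single object file instead of weakly in every translation unit that includes the header. The shape of the idiom, as a generic sketch:

// In the header: the "key function" is declared but not defined.
class SomeAsmInfo {
  virtual void anchor(); // intentionally out of line
public:
  virtual ~SomeAsmInfo() {}
};

// In exactly one .cpp file: defining it here pins the vtable to this file.
void SomeAsmInfo::anchor() {}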
#include "MSP430GenAsmWriter.inc" @@ -44,7 +45,7 @@ void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo, void MSP430InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, const char *Modifier) { - assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported"); + assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported"); const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { O << getRegisterName(Op.getReg()); diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h index d32eb3a..5afbd20 100644 --- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h +++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h @@ -25,17 +25,17 @@ namespace llvm { const MCRegisterInfo &MRI) : MCInstPrinter(MAI, MII, MRI) {} - virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; // Autogenerated by tblgen. void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printPCRelImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printSrcMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printCCOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); }; diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h index a7e0e58..ef805bb 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h @@ -20,7 +20,7 @@ namespace llvm { class StringRef; class MSP430MCAsmInfo : public MCAsmInfoELF { - virtual void anchor(); + void anchor() override; public: explicit MSP430MCAsmInfo(StringRef TT); }; diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp index 530e6aa..72adb45 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp @@ -20,6 +20,8 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + #define GET_INSTRINFO_MC_DESC #include "MSP430GenInstrInfo.inc" @@ -29,8 +31,6 @@ #define GET_REGINFO_MC_DESC #include "MSP430GenRegisterInfo.inc" -using namespace llvm; - static MCInstrInfo *createMSP430MCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitMSP430MCInstrInfo(X); @@ -66,7 +66,7 @@ static MCInstPrinter *createMSP430MCInstPrinter(const Target &T, const MCSubtargetInfo &STI) { if (SyntaxVariant == 0) return new MSP430InstPrinter(MAI, MII, MRI); - return 0; + return nullptr; } extern "C" void LLVMInitializeMSP430TargetMC() { diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp index 91065d8..22a973e 100644 --- a/lib/Target/MSP430/MSP430AsmPrinter.cpp +++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "MSP430.h" #include "InstPrinter/MSP430InstPrinter.h" #include "MSP430InstrInfo.h" @@ -35,27 +34,29 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "asm-printer" + namespace { class MSP430AsmPrinter : public 
AsmPrinter { public: MSP430AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) : AsmPrinter(TM, Streamer) {} - virtual const char *getPassName() const { + const char *getPassName() const override { return "MSP430 Assembly Printer"; } void printOperand(const MachineInstr *MI, int OpNum, - raw_ostream &O, const char* Modifier = 0); + raw_ostream &O, const char* Modifier = nullptr); void printSrcMemOperand(const MachineInstr *MI, int OpNum, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O); + raw_ostream &O) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, - const char *ExtraCode, raw_ostream &O); - void EmitInstruction(const MachineInstr *MI); + const char *ExtraCode, raw_ostream &O) override; + void EmitInstruction(const MachineInstr *MI) override; }; } // end of anonymous namespace diff --git a/lib/Target/MSP430/MSP430BranchSelector.cpp b/lib/Target/MSP430/MSP430BranchSelector.cpp index f128427..a96930a 100644 --- a/lib/Target/MSP430/MSP430BranchSelector.cpp +++ b/lib/Target/MSP430/MSP430BranchSelector.cpp @@ -15,7 +15,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "msp430-branch-select" #include "MSP430.h" #include "MSP430InstrInfo.h" #include "llvm/ADT/Statistic.h" @@ -25,6 +24,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "msp430-branch-select" + STATISTIC(NumExpanded, "Number of branches expanded to long format"); namespace { @@ -35,9 +36,9 @@ namespace { /// BlockSizes - The sizes of the basic blocks in the function. std::vector<unsigned> BlockSizes; - virtual bool runOnMachineFunction(MachineFunction &Fn); + bool runOnMachineFunction(MachineFunction &Fn) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "MSP430 Branch Selector"; } }; diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp index ce078a3..82c8b29 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -242,7 +242,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // alignment boundary. Amount = (Amount+StackAlign-1)/StackAlign*StackAlign; - MachineInstr *New = 0; + MachineInstr *New = nullptr; if (Old->getOpcode() == TII.getCallFrameSetupOpcode()) { New = BuildMI(MF, Old->getDebugLoc(), TII.get(MSP430::SUB16ri), MSP430::SPW) diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h index 8370714..d464dd9 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.h +++ b/lib/Target/MSP430/MSP430FrameLowering.h @@ -32,26 +32,26 @@ public: /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function.
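The eliminateCallFramePseudoInstr hunk above keeps the stock round-up-to-alignment idiom, (Amount + StackAlign - 1) / StackAlign * StackAlign. A self-contained C++ sketch of that arithmetic, with illustrative values rather than real MSP430 stack parameters:

#include <cassert>
#include <cstdint>

// Round Amount up to the next multiple of StackAlign (StackAlign > 0).
// With Amount = 6 and StackAlign = 4: (6 + 3) / 4 * 4 = 8.
static uint64_t roundUpToAlignment(uint64_t Amount, uint64_t StackAlign) {
  return (Amount + StackAlign - 1) / StackAlign * StackAlign;
}

int main() {
  assert(roundUpToAlignment(6, 4) == 8);
  assert(roundUpToAlignment(8, 4) == 8);  // already aligned: unchanged
  assert(roundUpToAlignment(0, 4) == 0);
  return 0;
}

Adding StackAlign - 1 before the truncating division is what turns round-down into round-up; the final multiply restores byte units.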
- void emitPrologue(MachineFunction &MF) const; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + void emitPrologue(MachineFunction &MF) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo *TRI) const override; bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const; + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const override; - bool hasFP(const MachineFunction &MF) const; - bool hasReservedCallFrame(const MachineFunction &MF) const; + bool hasFP(const MachineFunction &MF) const override; + bool hasReservedCallFrame(const MachineFunction &MF) const override; void processFunctionBeforeFrameFinalized(MachineFunction &MF, - RegScavenger *RS = NULL) const; + RegScavenger *RS = nullptr) const override; }; } // End llvm namespace diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp index 4152829..a9b9035 100644 --- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp +++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp @@ -31,6 +31,8 @@ #include "llvm/Target/TargetLowering.h" using namespace llvm; +#define DEBUG_TYPE "msp430-isel" + namespace { struct MSP430ISelAddressMode { enum { @@ -52,17 +54,17 @@ namespace { unsigned Align; // CP alignment. MSP430ISelAddressMode() - : BaseType(RegBase), Disp(0), GV(0), CP(0), BlockAddr(0), - ES(0), JT(-1), Align(0) { + : BaseType(RegBase), Disp(0), GV(nullptr), CP(nullptr), + BlockAddr(nullptr), ES(nullptr), JT(-1), Align(0) { } bool hasSymbolicDisplacement() const { - return GV != 0 || CP != 0 || ES != 0 || JT != -1; + return GV != nullptr || CP != nullptr || ES != nullptr || JT != -1; } void dump() { errs() << "MSP430ISelAddressMode " << this << '\n'; - if (BaseType == RegBase && Base.Reg.getNode() != 0) { + if (BaseType == RegBase && Base.Reg.getNode() != nullptr) { errs() << "Base.Reg "; Base.Reg.getNode()->dump(); } else if (BaseType == FrameIndexBase) { @@ -99,7 +101,7 @@ namespace { Lowering(*TM.getTargetLowering()), Subtarget(*TM.getSubtargetImpl()) { } - virtual const char *getPassName() const { + const char *getPassName() const override { return "MSP430 DAG->DAG Pattern Instruction Selection"; } @@ -107,15 +109,14 @@ namespace { bool MatchWrapper(SDValue N, MSP430ISelAddressMode &AM); bool MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM); - virtual bool - SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, - std::vector &OutOps); + bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, + std::vector &OutOps) override; // Include the pieces autogenerated from the target description. 
#include "MSP430GenDAGISel.inc" private: - SDNode *Select(SDNode *N); + SDNode *Select(SDNode *N) override; SDNode *SelectIndexedLoad(SDNode *Op); SDNode *SelectIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2, unsigned Opc8, unsigned Opc16); @@ -199,7 +200,7 @@ bool MSP430DAGToDAGISel::MatchAddress(SDValue N, MSP430ISelAddressMode &AM) { case ISD::FrameIndex: if (AM.BaseType == MSP430ISelAddressMode::RegBase - && AM.Base.Reg.getNode() == 0) { + && AM.Base.Reg.getNode() == nullptr) { AM.BaseType = MSP430ISelAddressMode::FrameIndexBase; AM.Base.FrameIndex = cast(N)->getIndex(); return false; @@ -228,7 +229,7 @@ bool MSP430DAGToDAGISel::MatchAddress(SDValue N, MSP430ISelAddressMode &AM) { // Start with the LHS as an addr mode. if (!MatchAddress(N.getOperand(0), AM) && // Address could not have picked a GV address for the displacement. - AM.GV == NULL && + AM.GV == nullptr && // Check to see if the LHS & C is zero. CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getAPIntValue())) { AM.Disp += Offset; @@ -330,7 +331,7 @@ static bool isValidIndexedLoad(const LoadSDNode *LD) { SDNode *MSP430DAGToDAGISel::SelectIndexedLoad(SDNode *N) { LoadSDNode *LD = cast(N); if (!isValidIndexedLoad(LD)) - return NULL; + return nullptr; MVT VT = LD->getMemoryVT().getSimpleVT(); @@ -343,7 +344,7 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedLoad(SDNode *N) { Opcode = MSP430::MOV16rm_POST; break; default: - return NULL; + return nullptr; } return CurDAG->getMachineNode(Opcode, SDLoc(N), @@ -359,7 +360,7 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedBinOp(SDNode *Op, IsLegalToFold(N1, Op, Op, OptLevel)) { LoadSDNode *LD = cast(N1); if (!isValidIndexedLoad(LD)) - return NULL; + return nullptr; MVT VT = LD->getMemoryVT().getSimpleVT(); unsigned Opc = (VT == MVT::i16 ? Opc16 : Opc8); @@ -367,9 +368,7 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedBinOp(SDNode *Op, MemRefs0[0] = cast(N1)->getMemOperand(); SDValue Ops0[] = { N2, LD->getBasePtr(), LD->getChain() }; SDNode *ResNode = - CurDAG->SelectNodeTo(Op, Opc, - VT, MVT::i16, MVT::Other, - Ops0, 3); + CurDAG->SelectNodeTo(Op, Opc, VT, MVT::i16, MVT::Other, Ops0); cast(ResNode)->setMemRefs(MemRefs0, MemRefs0 + 1); // Transfer chain. ReplaceUses(SDValue(N1.getNode(), 2), SDValue(ResNode, 2)); @@ -378,7 +377,7 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedBinOp(SDNode *Op, return ResNode; } - return NULL; + return nullptr; } @@ -396,7 +395,7 @@ SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) { Node->dump(CurDAG); errs() << "\n"); Node->setNodeId(-1); - return NULL; + return nullptr; } // Few custom selection stuff. 
@@ -484,7 +483,7 @@ SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) { SDNode *ResNode = SelectCode(Node); DEBUG(errs() << "=> "); - if (ResNode == NULL || ResNode == Node) + if (ResNode == nullptr || ResNode == Node) DEBUG(Node->dump(CurDAG)); else DEBUG(ResNode->dump(CurDAG)); diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index fe163d4..c5901bc 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -11,8 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "msp430-lower" - #include "MSP430ISelLowering.h" #include "MSP430.h" #include "MSP430MachineFunctionInfo.h" @@ -38,6 +36,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "msp430-lower" + typedef enum { NoHWMult, HWMultIntr, @@ -284,7 +284,7 @@ template static void AnalyzeArguments(CCState &State, SmallVectorImpl &ArgLocs, const SmallVectorImpl &Args) { - static const uint16_t RegList[] = { + static const MCPhysReg RegList[] = { MSP430::R15W, MSP430::R14W, MSP430::R13W, MSP430::R12W }; static const unsigned NbRegs = array_lengthof(RegList); @@ -462,7 +462,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain, errs() << "LowerFormalArguments Unhandled argument type: " << RegVT.getSimpleVT().SimpleTy << "\n"; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } case MVT::i16: unsigned VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass); @@ -568,7 +568,7 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(Opc, dl, MVT::Other, &RetOps[0], RetOps.size()); + return DAG.getNode(Opc, dl, MVT::Other, RetOps); } /// LowerCCCCallTo - functions arguments are copied from virtual regs to @@ -629,7 +629,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, } else { assert(VA.isMemLoc()); - if (StackPtr.getNode() == 0) + if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SPW, getPointerTy()); SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), @@ -659,8 +659,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Transform all store nodes into one single node because all store nodes are // independent of each other. if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token chain and // flag operands which copy the outgoing args into registers. The InFlag in @@ -695,7 +694,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, if (InFlag.getNode()) Ops.push_back(InFlag); - Chain = DAG.getNode(MSP430ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); + Chain = DAG.getNode(MSP430ISD::CALL, dl, NodeTys, Ops); InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. 
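The &RetOps[0], RetOps.size() to RetOps rewrites in this file track the 3.5-era change of SelectionDAG entry points such as getNode to ArrayRef parameters, which accept a container directly. A reduced sketch of the calling pattern; sum is a stand-in function, not an LLVM API:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Stand-in for an API that used to take (const int *Ptr, unsigned Count).
static int sum(ArrayRef<int> Vals) {
  int S = 0;
  for (int V : Vals)
    S += V;
  return S;
}

void demo() {
  SmallVector<int, 4> Ops;
  Ops.push_back(1);
  Ops.push_back(2);
  // Old style: sum(&Ops[0], Ops.size()) -- undefined behavior if Ops is empty.
  // New style: the container converts to ArrayRef implicitly, and an empty
  // vector is handled for free.
  (void)sum(Ops);
}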
@@ -986,7 +985,7 @@ SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { Ops.push_back(Zero); Ops.push_back(TargetCC); Ops.push_back(Flag); - return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, &Ops[0], Ops.size()); + return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, Ops); } } @@ -1009,7 +1008,7 @@ SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op, Ops.push_back(TargetCC); Ops.push_back(Flag); - return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, &Ops[0], Ops.size()); + return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, Ops); } SDValue MSP430TargetLowering::LowerSIGN_EXTEND(SDValue Op, @@ -1148,7 +1147,7 @@ bool MSP430TargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { - default: return NULL; + default: return nullptr; case MSP430ISD::RET_FLAG: return "MSP430ISD::RET_FLAG"; case MSP430ISD::RETI_FLAG: return "MSP430ISD::RETI_FLAG"; case MSP430ISD::RRA: return "MSP430ISD::RRA"; diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h index 85a861e..3ced61d 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.h +++ b/lib/Target/MSP430/MSP430ISelLowering.h @@ -73,14 +73,14 @@ namespace llvm { public: explicit MSP430TargetLowering(MSP430TargetMachine &TM); - virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i8; } + MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; } /// LowerOperation - Provide custom lowering hooks for some operations. - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; /// getTargetNodeName - This method returns the name of a target specific /// DAG node. - virtual const char *getTargetNodeName(unsigned Opcode) const; + const char *getTargetNodeName(unsigned Opcode) const override; SDValue LowerShifts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; @@ -97,15 +97,16 @@ namespace llvm { SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const; TargetLowering::ConstraintType - getConstraintType(const std::string &Constraint) const; + getConstraintType(const std::string &Constraint) const override; std::pair - getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const; + getRegForInlineAsmConstraint(const std::string &Constraint, + MVT VT) const override; /// isTruncateFree - Return true if it's free to truncate a value of type /// Ty1 to type Ty2. e.g. On msp430 it's free to truncate a i16 value in /// register R15W to i8 by referencing its sub-register R15B. - virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const; - virtual bool isTruncateFree(EVT VT1, EVT VT2) const; + bool isTruncateFree(Type *Ty1, Type *Ty2) const override; + bool isTruncateFree(EVT VT1, EVT VT2) const override; /// isZExtFree - Return true if any actual instruction that defines a value /// of type Ty1 implicit zero-extends the value to Ty2 in the result @@ -115,12 +116,12 @@ namespace llvm { /// necessarily apply to truncate instructions. e.g. on msp430, all /// instructions that define 8-bit values implicit zero-extend the result /// out to 16 bits. 
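Nearly every hunk in MSP430ISelLowering.h applies the same mechanical rewrite: drop virtual, append override. The payoff is compile-time detection of signature drift, sketched here with made-up class names:

struct Base {
  virtual const char *getTargetNodeName(unsigned Opcode) const { return nullptr; }
  virtual ~Base() {}
};

struct Derived : Base {
  // Compiles: the signature matches the base declaration exactly.
  const char *getTargetNodeName(unsigned Opcode) const override { return "node"; }

  // A near-miss would silently *hide* the base method instead of overriding
  // it; with 'override' the compiler rejects it outright, e.g.:
  //   const char *getTargetNodeName(unsigned) override;  // error: missing const
};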
- virtual bool isZExtFree(Type *Ty1, Type *Ty2) const; - virtual bool isZExtFree(EVT VT1, EVT VT2) const; - virtual bool isZExtFree(SDValue Val, EVT VT2) const; + bool isZExtFree(Type *Ty1, Type *Ty2) const override; + bool isZExtFree(EVT VT1, EVT VT2) const override; + bool isZExtFree(SDValue Val, EVT VT2) const override; MachineBasicBlock* EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *BB) const; + MachineBasicBlock *BB) const override; MachineBasicBlock* EmitShiftInstr(MachineInstr *MI, MachineBasicBlock *BB) const; @@ -148,28 +149,27 @@ namespace llvm { SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const; - virtual SDValue + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const; - virtual SDValue + SmallVectorImpl &InVals) const override; + SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const; - - virtual SDValue - LowerReturn(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc dl, SelectionDAG &DAG) const; - - virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, - SDValue &Base, - SDValue &Offset, - ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const; + SmallVectorImpl &InVals) const override; + + SDValue LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + SDLoc dl, SelectionDAG &DAG) const override; + + bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, + SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; const MSP430Subtarget &Subtarget; const DataLayout *TD; diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp index 38f73b9..0c04ddb 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -22,11 +22,11 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + #define GET_INSTRINFO_CTOR_DTOR #include "MSP430GenInstrInfo.inc" -using namespace llvm; - // Pin the vtable to this file. void MSP430InstrInfo::anchor() {} @@ -208,11 +208,11 @@ bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, while (std::next(I) != MBB.end()) std::next(I)->eraseFromParent(); Cond.clear(); - FBB = 0; + FBB = nullptr; // Delete the JMP if it's equivalent to a fall-through. if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { - TBB = 0; + TBB = nullptr; I->eraseFromParent(); I = MBB.end(); continue; diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h index ad2b8cc..1ffcebb 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.h +++ b/lib/Target/MSP430/MSP430InstrInfo.h @@ -50,40 +50,41 @@ public: /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). 
/// - virtual const TargetRegisterInfo &getRegisterInfo() const { return RI; } + const TargetRegisterInfo &getRegisterInfo() const { return RI; } void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, - bool KillSrc) const; - - virtual void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, - int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; - virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + bool KillSrc) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, + int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; unsigned GetInstSizeInBytes(const MachineInstr *MI) const; // Branch folding goodness - bool ReverseBranchCondition(SmallVectorImpl &Cond) const; - bool isUnpredicatedTerminator(const MachineInstr *MI) const; + bool + ReverseBranchCondition(SmallVectorImpl &Cond) const override; + bool isUnpredicatedTerminator(const MachineInstr *MI) const override; bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, - bool AllowModify) const; + bool AllowModify) const override; - unsigned RemoveBranch(MachineBasicBlock &MBB) const; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl &Cond, - DebugLoc DL) const; + DebugLoc DL) const override; }; diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index f64017e..341fb64 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -11,8 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "msp430-reg-info" - #include "MSP430RegisterInfo.h" #include "MSP430.h" #include "MSP430MachineFunctionInfo.h" @@ -26,38 +24,40 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +using namespace llvm; + +#define DEBUG_TYPE "msp430-reg-info" + #define GET_REGINFO_TARGET_DESC #include "MSP430GenRegisterInfo.inc" -using namespace llvm; - // FIXME: Provide proper call frame setup / destroy opcodes. 
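The MSP430RegisterInfo.cpp hunk above shows the other cleanup this rebase applies everywhere: #define DEBUG_TYPE moves from the top of the file to just after the last #include and the using directive, so no header is ever compiled with the file-local macro in scope. DEBUG_TYPE feeds the DEBUG and STATISTIC machinery, as in this sketch with an invented pass name:

#include "llvm/ADT/Statistic.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Defined only after every #include, so headers never see it.
#define DEBUG_TYPE "example-pass"

STATISTIC(NumVisited, "Number of nodes visited");  // keyed by DEBUG_TYPE

static void visitOne() {
  ++NumVisited;
  DEBUG(dbgs() << "visible under -debug-only=" DEBUG_TYPE "\n");
}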
MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm) : MSP430GenRegisterInfo(MSP430::PCW), TM(tm) { StackAlign = TM.getFrameLowering()->getStackAlignment(); } -const uint16_t* +const MCPhysReg* MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const TargetFrameLowering *TFI = MF->getTarget().getFrameLowering(); const Function* F = MF->getFunction(); - static const uint16_t CalleeSavedRegs[] = { + static const MCPhysReg CalleeSavedRegs[] = { MSP430::FPW, MSP430::R5W, MSP430::R6W, MSP430::R7W, MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W, 0 }; - static const uint16_t CalleeSavedRegsFP[] = { + static const MCPhysReg CalleeSavedRegsFP[] = { MSP430::R5W, MSP430::R6W, MSP430::R7W, MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W, 0 }; - static const uint16_t CalleeSavedRegsIntr[] = { + static const MCPhysReg CalleeSavedRegsIntr[] = { MSP430::FPW, MSP430::R5W, MSP430::R6W, MSP430::R7W, MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W, MSP430::R12W, MSP430::R13W, MSP430::R14W, MSP430::R15W, 0 }; - static const uint16_t CalleeSavedRegsIntrFP[] = { + static const MCPhysReg CalleeSavedRegsIntrFP[] = { MSP430::R5W, MSP430::R6W, MSP430::R7W, MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W, MSP430::R12W, MSP430::R13W, MSP430::R14W, MSP430::R15W, diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h index 78047cc..a607528 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.h +++ b/lib/Target/MSP430/MSP430RegisterInfo.h @@ -35,18 +35,20 @@ public: MSP430RegisterInfo(MSP430TargetMachine &tm); /// Code Generation virtual methods... - const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const; + const MCPhysReg * + getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; - BitVector getReservedRegs(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; const TargetRegisterClass* - getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const; + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override; void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS = NULL) const; + RegScavenger *RS = nullptr) const override; // Debug information queries. 
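The uint16_t-to-MCPhysReg switch in the callee-saved tables above changes documentation, not layout: at this point in LLVM's history MCPhysReg is a typedef for uint16_t in llvm/MC/MCRegisterInfo.h, so the arrays stay bit-identical while their declarations say what they hold. A sketch with placeholder register numbers (not real MSP430 encodings):

#include <cstdint>

typedef uint16_t MCPhysReg;  // mirrors llvm/MC/MCRegisterInfo.h circa 3.5

// A zero-terminated callee-saved list, in the style of the tables above.
static const MCPhysReg CalleeSaved[] = { 4, 5, 6, 7, 0 };

static unsigned countCalleeSaved() {
  unsigned N = 0;
  for (const MCPhysReg *R = CalleeSaved; *R; ++R)
    ++N;
  return N;  // 4 for the list above
}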
- unsigned getFrameRegister(const MachineFunction &MF) const; + unsigned getFrameRegister(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp index 24f45fa..c700383 100644 --- a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp +++ b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "msp430-selectiondag-info" #include "MSP430TargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "msp430-selectiondag-info" + MSP430SelectionDAGInfo::MSP430SelectionDAGInfo(const MSP430TargetMachine &TM) : TargetSelectionDAGInfo(TM) { } diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp index edeaf34..68ad091 100644 --- a/lib/Target/MSP430/MSP430Subtarget.cpp +++ b/lib/Target/MSP430/MSP430Subtarget.cpp @@ -15,12 +15,14 @@ #include "MSP430.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +#define DEBUG_TYPE "msp430-subtarget" + #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "MSP430GenSubtargetInfo.inc" -using namespace llvm; - void MSP430Subtarget::anchor() { } MSP430Subtarget::MSP430Subtarget(const std::string &TT, diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index 98a6003..50be2be 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -51,8 +51,8 @@ public: return getTM(); } - virtual bool addInstSelector(); - virtual bool addPreEmitPass(); + bool addInstSelector() override; + bool addPreEmitPass() override; }; } // namespace diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h index be695a2..ea5d407 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.h +++ b/lib/Target/MSP430/MSP430TargetMachine.h @@ -43,25 +43,25 @@ public: Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); - virtual const TargetFrameLowering *getFrameLowering() const { + const TargetFrameLowering *getFrameLowering() const override { return &FrameLowering; } - virtual const MSP430InstrInfo *getInstrInfo() const { return &InstrInfo; } - virtual const DataLayout *getDataLayout() const { return &DL;} - virtual const MSP430Subtarget *getSubtargetImpl() const { return &Subtarget; } + const MSP430InstrInfo *getInstrInfo() const override { return &InstrInfo; } + const DataLayout *getDataLayout() const override { return &DL;} + const MSP430Subtarget *getSubtargetImpl() const override { return &Subtarget; } - virtual const TargetRegisterInfo *getRegisterInfo() const { + const TargetRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); } - virtual const MSP430TargetLowering *getTargetLowering() const { + const MSP430TargetLowering *getTargetLowering() const override { return &TLInfo; } - virtual const MSP430SelectionDAGInfo* getSelectionDAGInfo() const { + const MSP430SelectionDAGInfo* getSelectionDAGInfo() const override { return &TSInfo; } - virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; }; // MSP430TargetMachine. 
} // end namespace llvm diff --git a/lib/Target/Mips/Android.mk b/lib/Target/Mips/Android.mk index 74b8a3b..4e8831c 100644 --- a/lib/Target/Mips/Android.mk +++ b/lib/Target/Mips/Android.mk @@ -24,6 +24,7 @@ mips_codegen_SRC_FILES := \ MipsCodeEmitter.cpp \ MipsConstantIslandPass.cpp \ MipsDelaySlotFiller.cpp \ + MipsFastISel.cpp \ MipsFrameLowering.cpp \ MipsInstrInfo.cpp \ MipsISelDAGToDAG.cpp \ diff --git a/lib/Target/Mips/AsmParser/LLVMBuild.txt b/lib/Target/Mips/AsmParser/LLVMBuild.txt index e7ca243..dd8e3cf 100644 --- a/lib/Target/Mips/AsmParser/LLVMBuild.txt +++ b/lib/Target/Mips/AsmParser/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = MipsAsmParser parent = Mips -required_libraries = MC MCParser Support MipsDesc MipsInfo +required_libraries = MC MCParser MipsDesc MipsInfo Support add_to_library_groups = Mips diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 911a119..86fd386 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -29,6 +29,8 @@ using namespace llvm; +#define DEBUG_TYPE "mips-asm-parser" + namespace llvm { class MCInstrInfo; } @@ -73,10 +75,10 @@ class MipsAsmParser : public MCTargetAsmParser { bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SmallVectorImpl &Operands, MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm); + bool MatchingInlineAsm) override; /// Parse a register as used in CFI directives - bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; bool ParseParenSuffix(StringRef Name, SmallVectorImpl &Operands); @@ -84,11 +86,11 @@ class MipsAsmParser : public MCTargetAsmParser { bool ParseBracketSuffix(StringRef Name, SmallVectorImpl &Operands); - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, - SmallVectorImpl &Operands); + bool + ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, + SmallVectorImpl &Operands) override; - bool ParseDirective(AsmToken DirectiveID); + bool ParseDirective(AsmToken DirectiveID) override; MipsAsmParser::OperandMatchResultTy parseMemOperand(SmallVectorImpl &Operands); @@ -135,6 +137,7 @@ class MipsAsmParser : public MCTargetAsmParser { SmallVectorImpl &Instructions, bool isLoad, bool isImmOpnd); bool reportParseError(StringRef ErrorMsg); + bool reportParseError(SMLoc Loc, StringRef ErrorMsg); bool parseMemOffset(const MCExpr *&Res, bool isParenExpr); bool parseRelocOperand(const MCExpr *&Res); @@ -143,7 +146,9 @@ class MipsAsmParser : public MCTargetAsmParser { bool isEvaluated(const MCExpr *Expr); bool parseSetFeature(uint64_t Feature); + bool parseDirectiveCPLoad(SMLoc Loc); bool parseDirectiveCPSetup(); + bool parseDirectiveNaN(); bool parseDirectiveSet(); bool parseDirectiveOption(); @@ -212,21 +217,22 @@ class MipsAsmParser : public MCTargetAsmParser { void setFeatureBits(unsigned Feature, StringRef FeatureString) { if (!(STI.getFeatureBits() & Feature)) { - setAvailableFeatures(ComputeAvailableFeatures( - STI.ToggleFeature(FeatureString))); + setAvailableFeatures( + ComputeAvailableFeatures(STI.ToggleFeature(FeatureString))); } } void clearFeatureBits(unsigned Feature, StringRef FeatureString) { if (STI.getFeatureBits() & Feature) { - setAvailableFeatures(ComputeAvailableFeatures( - STI.ToggleFeature(FeatureString))); + setAvailableFeatures( + ComputeAvailableFeatures(STI.ToggleFeature(FeatureString))); } } public: 
MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, - const MCInstrInfo &MII) + const MCInstrInfo &MII, + const MCTargetOptions &Options) : MCTargetAsmParser(), STI(sti), Parser(parser) { // Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); @@ -266,11 +272,12 @@ public: /// context). RegKind_CCR = 128, /// CCR RegKind_HWRegs = 256, /// HWRegs + RegKind_COP3 = 512, /// COP3 /// Potentially any (e.g. $1) RegKind_Numeric = RegKind_GPR | RegKind_FGR | RegKind_FCC | RegKind_MSA128 | RegKind_MSACtrl | RegKind_COP2 | RegKind_ACC | - RegKind_CCR | RegKind_HWRegs + RegKind_CCR | RegKind_HWRegs | RegKind_COP3 }; private: @@ -422,6 +429,14 @@ private: return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); } + /// Coerce the register to COP3 and return the real register for the + /// current target. + unsigned getCOP3Reg() const { + assert(isRegIdx() && (RegIdx.Kind & RegKind_COP3) && "Invalid access!"); + unsigned ClassID = Mips::COP3RegClassID; + return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index); + } + /// Coerce the register to ACC64DSP and return the real register for the /// current target. unsigned getACC64DSPReg() const { @@ -465,7 +480,7 @@ private: public: void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediate when possible. Null MCExpr = 0. - if (Expr == 0) + if (!Expr) Inst.addOperand(MCOperand::CreateImm(0)); else if (const MCConstantExpr *CE = dyn_cast(Expr)) Inst.addOperand(MCOperand::CreateImm(CE->getValue())); @@ -533,6 +548,11 @@ public: Inst.addOperand(MCOperand::CreateReg(getCOP2Reg())); } + void addCOP3AsmRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getCOP3Reg())); + } + void addACC64DSPAsmRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::CreateReg(getACC64DSPReg())); @@ -573,7 +593,7 @@ public: addExpr(Inst, Expr); } - bool isReg() const { + bool isReg() const override { // As a special case until we sort out the definition of div/divu, pretend // that $0/$zero are k_PhysRegister so that MCK_ZERO works correctly. if (isGPRAsmReg() && RegIdx.Index == 0) @@ -582,16 +602,16 @@ public: return Kind == k_PhysRegister; } bool isRegIdx() const { return Kind == k_RegisterIndex; } - bool isImm() const { return Kind == k_Immediate; } + bool isImm() const override { return Kind == k_Immediate; } bool isConstantImm() const { return isImm() && dyn_cast(getImm()); } - bool isToken() const { + bool isToken() const override { // Note: It's not possible to pretend that other operand kinds are tokens. // The matcher emitter checks tokens first. return Kind == k_Token; } - bool isMem() const { return Kind == k_Memory; } + bool isMem() const override { return Kind == k_Memory; } bool isInvNum() const { return Kind == k_Immediate; } bool isLSAImm() const { if (!isConstantImm()) @@ -605,7 +625,7 @@ public: return StringRef(Tok.Data, Tok.Length); } - unsigned getReg() const { + unsigned getReg() const override { // As a special case until we sort out the definition of div/divu, pretend // that $0/$zero are k_PhysRegister so that MCK_ZERO works correctly. 
if (Kind == k_RegisterIndex && RegIdx.Index == 0 && @@ -744,6 +764,9 @@ public: bool isCOP2AsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_COP2 && RegIdx.Index <= 31; } + bool isCOP3AsmReg() const { + return isRegIdx() && RegIdx.Kind & RegKind_COP3 && RegIdx.Index <= 31; + } bool isMSA128AsmReg() const { return isRegIdx() && RegIdx.Kind & RegKind_MSA128 && RegIdx.Index <= 31; } @@ -752,11 +775,25 @@ public: } /// getStartLoc - Get the location of the first token of this operand. - SMLoc getStartLoc() const { return StartLoc; } + SMLoc getStartLoc() const override { return StartLoc; } /// getEndLoc - Get the location of the last token of this operand. - SMLoc getEndLoc() const { return EndLoc; } + SMLoc getEndLoc() const override { return EndLoc; } - virtual void print(raw_ostream &OS) const { + virtual ~MipsOperand() { + switch (Kind) { + case k_Immediate: + break; + case k_Memory: + delete Mem.Base; + break; + case k_PhysRegister: + case k_RegisterIndex: + case k_Token: + break; + } + } + + void print(raw_ostream &OS) const override { switch (Kind) { case k_Immediate: OS << "Imm<"; @@ -906,10 +943,6 @@ bool MipsAsmParser::needsExpansion(MCInst &Inst) { case Mips::LoadImm32Reg: case Mips::LoadAddr32Imm: case Mips::LoadAddr32Reg: - case Mips::SUBi: - case Mips::SUBiu: - case Mips::DSUBi: - case Mips::DSUBiu: return true; default: return false; @@ -925,30 +958,6 @@ void MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc, return expandLoadAddressImm(Inst, IDLoc, Instructions); case Mips::LoadAddr32Reg: return expandLoadAddressReg(Inst, IDLoc, Instructions); - case Mips::SUBi: - Instructions.push_back(MCInstBuilder(Mips::ADDi) - .addReg(Inst.getOperand(0).getReg()) - .addReg(Inst.getOperand(1).getReg()) - .addImm(-Inst.getOperand(2).getImm())); - return; - case Mips::SUBiu: - Instructions.push_back(MCInstBuilder(Mips::ADDiu) - .addReg(Inst.getOperand(0).getReg()) - .addReg(Inst.getOperand(1).getReg()) - .addImm(-Inst.getOperand(2).getImm())); - return; - case Mips::DSUBi: - Instructions.push_back(MCInstBuilder(Mips::DADDi) - .addReg(Inst.getOperand(0).getReg()) - .addReg(Inst.getOperand(1).getReg()) - .addImm(-Inst.getOperand(2).getImm())); - return; - case Mips::DSUBiu: - Instructions.push_back(MCInstBuilder(Mips::DADDiu) - .addReg(Inst.getOperand(0).getReg()) - .addReg(Inst.getOperand(1).getReg()) - .addImm(-Inst.getOperand(2).getImm())); - return; } } @@ -1586,6 +1595,8 @@ bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, RegNo = isGP64() ? Operand.getGPR64Reg() : Operand.getGPR32Reg(); } + delete &Operand; + return (RegNo == (unsigned)-1); } @@ -1624,7 +1635,7 @@ bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) { MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( SmallVectorImpl &Operands) { DEBUG(dbgs() << "parseMemOperand\n"); - const MCExpr *IdVal = 0; + const MCExpr *IdVal = nullptr; SMLoc S; bool isParenExpr = false; MipsAsmParser::OperandMatchResultTy Res = MatchOperand_NoMatch; @@ -1654,6 +1665,7 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); // Zero register assumed, add a memory operand with ZERO as its base. + // "Base" will be managed by k_Memory. 
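The destructor added to MipsOperand above is the substantive part of that hunk: a k_Memory operand owns the heap-allocated base-register operand it wraps, so teardown must dispatch on the kind, and the new "will be managed by k_Memory" comments in this hunk record the ownership transfer. A stripped-down model of the scheme, with names simplified from the patch:

class Operand {
public:
  enum Kind { k_Token, k_Register, k_Memory } TheKind;

  struct MemOp {
    Operand *Base;  // owned only when TheKind == k_Memory
  } Mem;

  explicit Operand(Kind K) : TheKind(K) { Mem.Base = nullptr; }

  ~Operand() {
    // Only the memory form owns a nested operand; the other kinds hold
    // non-owning data and need no cleanup.
    if (TheKind == k_Memory)
      delete Mem.Base;
  }
};

void demo() {
  Operand *M = new Operand(Operand::k_Memory);
  M->Mem.Base = new Operand(Operand::k_Register);
  delete M;  // also frees the nested base operand
}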
MipsOperand *Base = MipsOperand::CreateGPRReg( 0, getContext().getRegisterInfo(), S, E, *this); Operands.push_back(MipsOperand::CreateMem(Base, IdVal, S, E, *this)); @@ -1679,12 +1691,13 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( Parser.Lex(); // Eat the ')' token. - if (IdVal == 0) + if (!IdVal) IdVal = MCConstantExpr::Create(0, getContext()); // Replace the register operand with the memory operand. MipsOperand *op = static_cast(Operands.back()); // Remove the register from the operands. + // "op" will be managed by k_Memory. Operands.pop_back(); // Add the memory operand. if (const MCBinaryExpr *BE = dyn_cast(IdVal)) { @@ -1969,9 +1982,11 @@ MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) { .Case("call_lo", MCSymbolRefExpr::VK_Mips_CALL_LO16) .Case("higher", MCSymbolRefExpr::VK_Mips_HIGHER) .Case("highest", MCSymbolRefExpr::VK_Mips_HIGHEST) + .Case("pcrel_hi", MCSymbolRefExpr::VK_Mips_PCREL_HI16) + .Case("pcrel_lo", MCSymbolRefExpr::VK_Mips_PCREL_LO16) .Default(MCSymbolRefExpr::VK_None); - assert (VK != MCSymbolRefExpr::VK_None); + assert(VK != MCSymbolRefExpr::VK_None); return VK; } @@ -2089,6 +2104,10 @@ bool MipsAsmParser::reportParseError(StringRef ErrorMsg) { return Error(Loc, ErrorMsg); } +bool MipsAsmParser::reportParseError(SMLoc Loc, StringRef ErrorMsg) { + return Error(Loc, ErrorMsg); +} + bool MipsAsmParser::parseSetNoAtDirective() { // Line should look like: ".set noat". // set at reg to 0. @@ -2248,29 +2267,30 @@ bool MipsAsmParser::parseSetFeature(uint64_t Feature) { if (getLexer().isNot(AsmToken::EndOfStatement)) return reportParseError("unexpected token in .set directive"); - switch(Feature) { - default: llvm_unreachable("Unimplemented feature"); - case Mips::FeatureDSP: - setFeatureBits(Mips::FeatureDSP, "dsp"); - getTargetStreamer().emitDirectiveSetDsp(); + switch (Feature) { + default: + llvm_unreachable("Unimplemented feature"); + case Mips::FeatureDSP: + setFeatureBits(Mips::FeatureDSP, "dsp"); + getTargetStreamer().emitDirectiveSetDsp(); break; - case Mips::FeatureMicroMips: - getTargetStreamer().emitDirectiveSetMicroMips(); + case Mips::FeatureMicroMips: + getTargetStreamer().emitDirectiveSetMicroMips(); break; - case Mips::FeatureMips16: - getTargetStreamer().emitDirectiveSetMips16(); + case Mips::FeatureMips16: + getTargetStreamer().emitDirectiveSetMips16(); break; - case Mips::FeatureMips32r2: - setFeatureBits(Mips::FeatureMips32r2, "mips32r2"); - getTargetStreamer().emitDirectiveSetMips32R2(); + case Mips::FeatureMips32r2: + setFeatureBits(Mips::FeatureMips32r2, "mips32r2"); + getTargetStreamer().emitDirectiveSetMips32R2(); break; - case Mips::FeatureMips64: - setFeatureBits(Mips::FeatureMips64, "mips64"); - getTargetStreamer().emitDirectiveSetMips64(); + case Mips::FeatureMips64: + setFeatureBits(Mips::FeatureMips64, "mips64"); + getTargetStreamer().emitDirectiveSetMips64(); break; - case Mips::FeatureMips64r2: - setFeatureBits(Mips::FeatureMips64r2, "mips64r2"); - getTargetStreamer().emitDirectiveSetMips64R2(); + case Mips::FeatureMips64r2: + setFeatureBits(Mips::FeatureMips64r2, "mips64r2"); + getTargetStreamer().emitDirectiveSetMips64R2(); break; } return false; @@ -2302,10 +2322,34 @@ bool MipsAsmParser::eatComma(StringRef ErrorStr) { return Error(Loc, ErrorStr); } - Parser.Lex(); // Eat the comma. + Parser.Lex(); // Eat the comma. 
return true; } +bool MipsAsmParser::parseDirectiveCPLoad(SMLoc Loc) { + if (Options.isReorder()) + Warning(Loc, ".cpload in reorder section"); + + // FIXME: Warn if cpload is used in Mips16 mode. + + SmallVector Reg; + OperandMatchResultTy ResTy = ParseAnyRegister(Reg); + if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) { + reportParseError("expected register containing function address"); + return false; + } + + MipsOperand *RegOpnd = static_cast(Reg[0]); + if (!RegOpnd->isGPRAsmReg()) { + reportParseError(RegOpnd->getStartLoc(), "invalid register"); + return false; + } + + getTargetStreamer().emitDirectiveCpload(RegOpnd->getGPR32Reg()); + delete RegOpnd; + return false; +} + bool MipsAsmParser::parseDirectiveCPSetup() { unsigned FuncReg; unsigned Save; @@ -2336,60 +2380,28 @@ bool MipsAsmParser::parseDirectiveCPSetup() { if (Parser.parseIdentifier(Name)) reportParseError("expected identifier"); MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); - unsigned GPReg = getGPR(matchCPURegisterName("gp")); - // FIXME: The code below this point should be in the TargetStreamers. - // Only N32 and N64 emit anything for .cpsetup - // FIXME: We should only emit something for PIC mode too. - if (!isN32() && !isN64()) - return false; + getTargetStreamer().emitDirectiveCpsetup(FuncReg, Save, *Sym, SaveIsReg); + return false; +} - MCStreamer &TS = getStreamer(); - MCInst Inst; - // Either store the old $gp in a register or on the stack - if (SaveIsReg) { - // move $save, $gpreg - Inst.setOpcode(Mips::DADDu); - Inst.addOperand(MCOperand::CreateReg(Save)); - Inst.addOperand(MCOperand::CreateReg(GPReg)); - Inst.addOperand(MCOperand::CreateReg(getGPR(0))); - } else { - // sd $gpreg, offset($sp) - Inst.setOpcode(Mips::SD); - Inst.addOperand(MCOperand::CreateReg(GPReg)); - Inst.addOperand(MCOperand::CreateReg(getGPR(matchCPURegisterName("sp")))); - Inst.addOperand(MCOperand::CreateImm(Save)); - } - TS.EmitInstruction(Inst, STI); - Inst.clear(); - - const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::Create( - Sym->getName(), MCSymbolRefExpr::VK_Mips_GPOFF_HI, - getContext()); - const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::Create( - Sym->getName(), MCSymbolRefExpr::VK_Mips_GPOFF_LO, - getContext()); - // lui $gp, %hi(%neg(%gp_rel(funcSym))) - Inst.setOpcode(Mips::LUi); - Inst.addOperand(MCOperand::CreateReg(GPReg)); - Inst.addOperand(MCOperand::CreateExpr(HiExpr)); - TS.EmitInstruction(Inst, STI); - Inst.clear(); - - // addiu $gp, $gp, %lo(%neg(%gp_rel(funcSym))) - Inst.setOpcode(Mips::ADDiu); - Inst.addOperand(MCOperand::CreateReg(GPReg)); - Inst.addOperand(MCOperand::CreateReg(GPReg)); - Inst.addOperand(MCOperand::CreateExpr(LoExpr)); - TS.EmitInstruction(Inst, STI); - Inst.clear(); - - // daddu $gp, $gp, $funcreg - Inst.setOpcode(Mips::DADDu); - Inst.addOperand(MCOperand::CreateReg(GPReg)); - Inst.addOperand(MCOperand::CreateReg(GPReg)); - Inst.addOperand(MCOperand::CreateReg(FuncReg)); - TS.EmitInstruction(Inst, STI); +bool MipsAsmParser::parseDirectiveNaN() { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + const AsmToken &Tok = Parser.getTok(); + + if (Tok.getString() == "2008") { + Parser.Lex(); + getTargetStreamer().emitDirectiveNaN2008(); + return false; + } else if (Tok.getString() == "legacy") { + Parser.Lex(); + getTargetStreamer().emitDirectiveNaNLegacy(); + return false; + } + } + // If we don't recognize the option passed to the .nan + // directive (e.g. no option or unknown option), emit an error. 
+ reportParseError("invalid option in .nan directive"); return false; } @@ -2419,15 +2431,15 @@ bool MipsAsmParser::parseDirectiveSet() { Parser.eatToEndOfStatement(); return false; } else if (Tok.getString() == "micromips") { - return parseSetFeature(Mips::FeatureMicroMips); + return parseSetFeature(Mips::FeatureMicroMips); } else if (Tok.getString() == "mips32r2") { - return parseSetFeature(Mips::FeatureMips32r2); + return parseSetFeature(Mips::FeatureMips32r2); } else if (Tok.getString() == "mips64") { - return parseSetFeature(Mips::FeatureMips64); + return parseSetFeature(Mips::FeatureMips64); } else if (Tok.getString() == "mips64r2") { - return parseSetFeature(Mips::FeatureMips64r2); + return parseSetFeature(Mips::FeatureMips64r2); } else if (Tok.getString() == "dsp") { - return parseSetFeature(Mips::FeatureDSP); + return parseSetFeature(Mips::FeatureDSP); } else { // It is just an identifier, look for an assignment. parseSetAssignment(); @@ -2537,6 +2549,8 @@ bool MipsAsmParser::parseDirectiveOption() { bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); + if (IDVal == ".cpload") + return parseDirectiveCPLoad(DirectiveID.getLoc()); if (IDVal == ".dword") { parseDataDirective(8, DirectiveID.getLoc()); return false; @@ -2576,6 +2590,9 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { return false; } + if (IDVal == ".nan") + return parseDirectiveNaN(); + if (IDVal == ".gpword") { parseDirectiveGpWord(); return false; diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt index c304ee3..bf67d71 100644 --- a/lib/Target/Mips/CMakeLists.txt +++ b/lib/Target/Mips/CMakeLists.txt @@ -7,6 +7,7 @@ tablegen(LLVM MipsGenCodeEmitter.inc -gen-emitter) tablegen(LLVM MipsGenMCCodeEmitter.inc -gen-emitter -mc-emitter) tablegen(LLVM MipsGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM MipsGenDAGISel.inc -gen-dag-isel) +tablegen(LLVM MipsGenFastISel.inc -gen-fast-isel) tablegen(LLVM MipsGenCallingConv.inc -gen-callingconv) tablegen(LLVM MipsGenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM MipsGenAsmMatcher.inc -gen-asm-matcher) @@ -26,6 +27,7 @@ add_llvm_target(MipsCodeGen MipsCodeEmitter.cpp MipsConstantIslandPass.cpp MipsDelaySlotFiller.cpp + MipsFastISel.cpp MipsJITInfo.cpp MipsInstrInfo.cpp MipsISelDAGToDAG.cpp diff --git a/lib/Target/Mips/Disassembler/LLVMBuild.txt b/lib/Target/Mips/Disassembler/LLVMBuild.txt index 7101c06..bb70fd3 100644 --- a/lib/Target/Mips/Disassembler/LLVMBuild.txt +++ b/lib/Target/Mips/Disassembler/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = MipsDisassembler parent = Mips -required_libraries = MC Support MipsInfo +required_libraries = MC MipsInfo Support add_to_library_groups = Mips diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index fc3b922..95670aa 100644 --- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -14,6 +14,7 @@ #include "Mips.h" #include "MipsRegisterInfo.h" #include "MipsSubtarget.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler.h" #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" @@ -24,6 +25,8 @@ using namespace llvm; +#define DEBUG_TYPE "mips-disassembler" + typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { @@ -33,19 +36,16 @@ class MipsDisassemblerBase : public MCDisassembler { public: /// Constructor - Initializes the disassembler. 
/// - MipsDisassemblerBase(const MCSubtargetInfo &STI, const MCRegisterInfo *Info, + MipsDisassemblerBase(const MCSubtargetInfo &STI, MCContext &Ctx, bool bigEndian) : - MCDisassembler(STI), RegInfo(Info), + MCDisassembler(STI, Ctx), IsN64(STI.getFeatureBits() & Mips::FeatureN64), isBigEndian(bigEndian) {} virtual ~MipsDisassemblerBase() {} - const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); } - bool isN64() const { return IsN64; } private: - OwningPtr RegInfo; bool IsN64; protected: bool isBigEndian; @@ -57,19 +57,23 @@ class MipsDisassembler : public MipsDisassemblerBase { public: /// Constructor - Initializes the disassembler. /// - MipsDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info, + MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool bigEndian) : - MipsDisassemblerBase(STI, Info, bigEndian) { + MipsDisassemblerBase(STI, Ctx, bigEndian) { IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips; } + bool isMips32r6() const { + return STI.getFeatureBits() & Mips::FeatureMips32r6; + } + /// getInstruction - See MCDisassembler. - virtual DecodeStatus getInstruction(MCInst &instr, - uint64_t &size, - const MemoryObject ®ion, - uint64_t address, - raw_ostream &vStream, - raw_ostream &cStream) const; + DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject ®ion, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const override; }; @@ -78,17 +82,17 @@ class Mips64Disassembler : public MipsDisassemblerBase { public: /// Constructor - Initializes the disassembler. /// - Mips64Disassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info, + Mips64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool bigEndian) : - MipsDisassemblerBase(STI, Info, bigEndian) {} + MipsDisassemblerBase(STI, Ctx, bigEndian) {} /// getInstruction - See MCDisassembler. - virtual DecodeStatus getInstruction(MCInst &instr, - uint64_t &size, - const MemoryObject ®ion, - uint64_t address, - raw_ostream &vStream, - raw_ostream &cStream) const; + DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject ®ion, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const override; }; } // end anonymous namespace @@ -195,6 +199,11 @@ static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeBranchTarget(MCInst &Inst, unsigned Offset, uint64_t Address, @@ -205,6 +214,16 @@ static DecodeStatus DecodeJumpTarget(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeBranchTarget21(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeBranchTarget26(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder); + // DecodeBranchTargetMM - Decode microMIPS branch offset, which is // shifted left by 1 bit. static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, @@ -263,11 +282,40 @@ static DecodeStatus DecodeExtSize(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); + /// INSVE_[BHWD] have an implicit operand that the generated decoder doesn't /// handle. 
template static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address, const void *Decoder); + +template +static DecodeStatus +DecodeAddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, + const void *Decoder); + +template +static DecodeStatus +DecodeDaddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, + const void *Decoder); + +template +static DecodeStatus +DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, + const void *Decoder); + +template +static DecodeStatus +DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, + const void *Decoder); + +template +static DecodeStatus +DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, uint64_t Address, + const void *Decoder); + namespace llvm { extern Target TheMipselTarget, TheMipsTarget, TheMips64Target, TheMips64elTarget; @@ -275,26 +323,30 @@ extern Target TheMipselTarget, TheMipsTarget, TheMips64Target, static MCDisassembler *createMipsDisassembler( const Target &T, - const MCSubtargetInfo &STI) { - return new MipsDisassembler(STI, T.createMCRegInfo(""), true); + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new MipsDisassembler(STI, Ctx, true); } static MCDisassembler *createMipselDisassembler( const Target &T, - const MCSubtargetInfo &STI) { - return new MipsDisassembler(STI, T.createMCRegInfo(""), false); + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new MipsDisassembler(STI, Ctx, false); } static MCDisassembler *createMips64Disassembler( const Target &T, - const MCSubtargetInfo &STI) { - return new Mips64Disassembler(STI, T.createMCRegInfo(""), true); + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new Mips64Disassembler(STI, Ctx, true); } static MCDisassembler *createMips64elDisassembler( const Target &T, - const MCSubtargetInfo &STI) { - return new Mips64Disassembler(STI, T.createMCRegInfo(""), false); + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new Mips64Disassembler(STI, Ctx, false); } extern "C" void LLVMInitializeMipsDisassembler() { @@ -311,6 +363,12 @@ extern "C" void LLVMInitializeMipsDisassembler() { #include "MipsGenDisassemblerTables.inc" +static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) { + const MipsDisassemblerBase *Dis = static_cast(D); + const MCRegisterInfo *RegInfo = Dis->getContext().getRegisterInfo(); + return *(RegInfo->getRegClass(RC).begin() + RegNo); +} + template static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address, const void *Decoder) { @@ -357,6 +415,202 @@ static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address, return MCDisassembler::Success; } +template +static DecodeStatus DecodeAddiGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const void *Decoder) { + // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled + // (otherwise we would have matched the ADDI instruction from the earlier + // ISA's instead). 
+ // + // We have: + // 0b001000 sssss ttttt iiiiiiiiiiiiiiii + // BOVC if rs >= rt + // BEQZALC if rs == 0 && rt != 0 + // BEQC if rs < rt && rs != 0 + + InsnType Rs = fieldFromInstruction(insn, 21, 5); + InsnType Rt = fieldFromInstruction(insn, 16, 5); + InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2; + bool HasRs = false; + + if (Rs >= Rt) { + MI.setOpcode(Mips::BOVC); + HasRs = true; + } else if (Rs != 0 && Rs < Rt) { + MI.setOpcode(Mips::BEQC); + HasRs = true; + } else + MI.setOpcode(Mips::BEQZALC); + + if (HasRs) + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rs))); + + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rt))); + MI.addOperand(MCOperand::CreateImm(Imm)); + + return MCDisassembler::Success; +} + +template +static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const void *Decoder) { + // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled + // (otherwise we would have matched the ADDI instruction from the earlier + // ISA's instead). + // + // We have: + // 0b011000 sssss ttttt iiiiiiiiiiiiiiii + // BNVC if rs >= rt + // BNEZALC if rs == 0 && rt != 0 + // BNEC if rs < rt && rs != 0 + + InsnType Rs = fieldFromInstruction(insn, 21, 5); + InsnType Rt = fieldFromInstruction(insn, 16, 5); + InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2; + bool HasRs = false; + + if (Rs >= Rt) { + MI.setOpcode(Mips::BNVC); + HasRs = true; + } else if (Rs != 0 && Rs < Rt) { + MI.setOpcode(Mips::BNEC); + HasRs = true; + } else + MI.setOpcode(Mips::BNEZALC); + + if (HasRs) + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rs))); + + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rt))); + MI.addOperand(MCOperand::CreateImm(Imm)); + + return MCDisassembler::Success; +} + +template +static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const void *Decoder) { + // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled + // (otherwise we would have matched the BLEZL instruction from the earlier + // ISA's instead). + // + // We have: + // 0b010110 sssss ttttt iiiiiiiiiiiiiiii + // Invalid if rs == 0 + // BLEZC if rs == 0 && rt != 0 + // BGEZC if rs == rt && rt != 0 + // BGEC if rs != rt && rs != 0 && rt != 0 + + InsnType Rs = fieldFromInstruction(insn, 21, 5); + InsnType Rt = fieldFromInstruction(insn, 16, 5); + InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2; + + if (Rt == 0) + return MCDisassembler::Fail; + else if (Rs == 0) + MI.setOpcode(Mips::BLEZC); + else if (Rs == Rt) + MI.setOpcode(Mips::BGEZC); + else + return MCDisassembler::Fail; // FIXME: BGEC is not implemented yet. + + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rt))); + + MI.addOperand(MCOperand::CreateImm(Imm)); + + return MCDisassembler::Success; +} + +template +static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const void *Decoder) { + // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled + // (otherwise we would have matched the BGTZL instruction from the earlier + // ISA's instead). 
+ // + // We have: + // 0b010111 sssss ttttt iiiiiiiiiiiiiiii + // Invalid if rs == 0 + // BGTZC if rs == 0 && rt != 0 + // BLTZC if rs == rt && rt != 0 + // BLTC if rs != rt && rs != 0 && rt != 0 + + InsnType Rs = fieldFromInstruction(insn, 21, 5); + InsnType Rt = fieldFromInstruction(insn, 16, 5); + InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2; + + if (Rt == 0) + return MCDisassembler::Fail; + else if (Rs == 0) + MI.setOpcode(Mips::BGTZC); + else if (Rs == Rt) + MI.setOpcode(Mips::BLTZC); + else + return MCDisassembler::Fail; // FIXME: BLTC is not implemented yet. + + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rt))); + + MI.addOperand(MCOperand::CreateImm(Imm)); + + return MCDisassembler::Success; +} + +template +static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, + uint64_t Address, + const void *Decoder) { + // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled + // (otherwise we would have matched the BGTZ instruction from the earlier + // ISA's instead). + // + // We have: + // 0b000111 sssss ttttt iiiiiiiiiiiiiiii + // BGTZ if rt == 0 + // BGTZALC if rs == 0 && rt != 0 + // BLTZALC if rs != 0 && rs == rt + // BLTUC if rs != 0 && rs != rt + + InsnType Rs = fieldFromInstruction(insn, 21, 5); + InsnType Rt = fieldFromInstruction(insn, 16, 5); + InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2; + bool HasRs = false; + bool HasRt = false; + + if (Rt == 0) { + MI.setOpcode(Mips::BGTZ); + HasRs = true; + } else if (Rs == 0) { + MI.setOpcode(Mips::BGTZALC); + HasRt = true; + } else if (Rs == Rt) { + MI.setOpcode(Mips::BLTZALC); + HasRs = true; + } else + return MCDisassembler::Fail; // BLTUC not implemented yet + + if (HasRs) + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rs))); + + if (HasRt) + MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID, + Rt))); + + MI.addOperand(MCOperand::CreateImm(Imm)); + + return MCDisassembler::Success; +} + /// readInstruction - read four bytes from the MemoryObject /// and return 32 bit word sorted according to the given endianess static DecodeStatus readInstruction32(const MemoryObject ®ion, @@ -426,6 +680,15 @@ MipsDisassembler::getInstruction(MCInst &instr, return MCDisassembler::Fail; } + if (isMips32r6()) { + Result = decodeInstruction(DecoderTableMips32r6_64r632, instr, Insn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + } + // Calling the auto-generated decoder function. 
Result = decodeInstruction(DecoderTableMips32, instr, Insn, Address, this, STI); @@ -469,11 +732,6 @@ Mips64Disassembler::getInstruction(MCInst &instr, return MCDisassembler::Fail; } -static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) { - const MipsDisassemblerBase *Dis = static_cast(D); - return *(Dis->getRegInfo()->getRegClass(RC).begin() + RegNo); -} - static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -828,12 +1086,23 @@ static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + unsigned Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Reg)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeBranchTarget(MCInst &Inst, unsigned Offset, uint64_t Address, const void *Decoder) { - unsigned BranchOffset = Offset & 0xffff; - BranchOffset = SignExtend32<18>(BranchOffset << 2) + 4; + int32_t BranchOffset = (SignExtend32<16>(Offset) << 2) + 4; Inst.addOperand(MCOperand::CreateImm(BranchOffset)); return MCDisassembler::Success; } @@ -848,12 +1117,31 @@ static DecodeStatus DecodeJumpTarget(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeBranchTarget21(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder) { + int32_t BranchOffset = SignExtend32<21>(Offset) << 2; + + Inst.addOperand(MCOperand::CreateImm(BranchOffset)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeBranchTarget26(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder) { + int32_t BranchOffset = SignExtend32<26>(Offset) << 2; + + Inst.addOperand(MCOperand::CreateImm(BranchOffset)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, unsigned Offset, uint64_t Address, const void *Decoder) { - unsigned BranchOffset = Offset & 0xffff; - BranchOffset = SignExtend32<18>(BranchOffset << 1); + int32_t BranchOffset = SignExtend32<16>(Offset) << 1; Inst.addOperand(MCOperand::CreateImm(BranchOffset)); return MCDisassembler::Success; } @@ -903,3 +1191,9 @@ static DecodeStatus DecodeExtSize(MCInst &Inst, Inst.addOperand(MCOperand::CreateImm(SignExtend32<16>(Size))); return MCDisassembler::Success; } + +static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(SignExtend32<19>(Insn) << 2)); + return MCDisassembler::Success; +} diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp index c8f08f1..8c79751 100644 --- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp +++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "MipsInstPrinter.h" #include "MCTargetDesc/MipsMCExpr.h" #include "MipsInstrInfo.h" @@ -24,6 +23,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "asm-printer" + #define PRINT_ALIAS_INSTR #include "MipsGenAsmWriter.inc" @@ -165,6 +166,8 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) { case MCSymbolRefExpr::VK_Mips_GOT_LO16: OS << "%got_lo("; break; case MCSymbolRefExpr::VK_Mips_CALL_HI16: OS << "%call_hi("; break; case 
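The rewritten DecodeBranchTarget and the new 21/26-bit variants differ in two ways: the field width, and whether a +4 bias is applied (the classic 16-bit form is relative to the instruction after the branch, while the new R6 forms carry no bias). A small sketch under that reading, with signExtend32 standing in for llvm::SignExtend32:

#include <cstdint>
#include <cstdio>

// Stand-in for llvm::SignExtend32<B>.
template <unsigned B> static int32_t signExtend32(uint32_t X) {
  return (int32_t)(X << (32 - B)) >> (32 - B);
}

int main() {
  // 16-bit form: a field of 0xffff means -1 instructions; scaling by 4
  // and biasing by +4 lands back on the branch's own address.
  printf("PC16: %d\n", signExtend32<16>(0xffffu) * 4 + 4); // 0

  // The new 21- and 26-bit R6 targets are scaled but carry no bias.
  printf("PC21: %d\n", signExtend32<21>(0x1ffffeu) * 4);   // -8
  printf("PC26: %d\n", signExtend32<26>(0x3fffffcu) * 4);  // -16
  return 0;
}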
MCSymbolRefExpr::VK_Mips_CALL_LO16: OS << "%call_lo("; break; + case MCSymbolRefExpr::VK_Mips_PCREL_HI16: OS << "%pcrel_hi("; break; + case MCSymbolRefExpr::VK_Mips_PCREL_LO16: OS << "%pcrel_lo("; break; } OS << SRE->getSymbol(); diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h index 2b745f0..550a0f1 100644 --- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h +++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h @@ -85,10 +85,12 @@ public: void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); - virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; - virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; bool printAliasInstr(const MCInst *MI, raw_ostream &OS); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); private: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 0f99ecc..5375a00 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -31,7 +31,7 @@ using namespace llvm; // Prepare value for the target space for it static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, - MCContext *Ctx = NULL) { + MCContext *Ctx = nullptr) { unsigned Kind = Fixup.getKind(); @@ -56,6 +56,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case Mips::fixup_MICROMIPS_GOT_PAGE: case Mips::fixup_MICROMIPS_GOT_OFST: case Mips::fixup_MICROMIPS_GOT_DISP: + case Mips::fixup_MIPS_PCLO16: break; case Mips::fixup_Mips_PC16: // So far we are only using this type for branches. @@ -80,6 +81,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case Mips::fixup_Mips_GOT_HI16: case Mips::fixup_Mips_CALL_HI16: case Mips::fixup_MICROMIPS_HI16: + case Mips::fixup_MIPS_PCHI16: // Get the 2nd 16-bits. Also add 1 if bit 15 is 1. Value = ((Value + 0x8000) >> 16) & 0xffff; break; @@ -102,6 +104,22 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, if (!isIntN(16, Value) && Ctx) Ctx->FatalError(Fixup.getLoc(), "out of range PC16 fixup"); break; + case Mips::fixup_MIPS_PC21_S2: + Value -= 4; + // Forcing a signed division because Value can be negative. + Value = (int64_t) Value / 4; + // We now check if Value can be encoded as a 21-bit signed immediate. + if (!isIntN(21, Value) && Ctx) + Ctx->FatalError(Fixup.getLoc(), "out of range PC21 fixup"); + break; + case Mips::fixup_MIPS_PC26_S2: + Value -= 4; + // Forcing a signed division because Value can be negative. + Value = (int64_t) Value / 4; + // We now check if Value can be encoded as a 26-bit signed immediate. + if (!isIntN(26, Value) && Ctx) + Ctx->FatalError(Fixup.getLoc(), "out of range PC26 fixup"); + break; } return Value; @@ -189,7 +207,7 @@ void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, const MCFixupKindInfo &MipsAsmBackend:: getFixupKindInfo(MCFixupKind Kind) const { - const static MCFixupKindInfo Infos[Mips::NumTargetFixupKinds] = { + const static MCFixupKindInfo LittleEndianInfos[Mips::NumTargetFixupKinds] = { // This table *must* be in same the order of fixup_* kinds in // MipsFixupKinds.h. 
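The new fixup_MIPS_PC21_S2 and fixup_MIPS_PC26_S2 cases subtract 4 because the branch is relative to the following instruction, and they force the scaling through a signed division so that backward branches survive it. A worked example of the PC21 case in isolation; isIntN here is a stand-in mirroring llvm::isIntN:

#include <cstdint>
#include <cstdio>

// Stand-in for llvm::isIntN: does Value fit in N signed bits?
static bool isIntN(unsigned N, int64_t Value) {
  return Value >= -(INT64_C(1) << (N - 1)) && Value < (INT64_C(1) << (N - 1));
}

// The fixup_MIPS_PC21_S2 adjustment on its own: the raw value is
// (target - fixup address), so it is negative for a backward branch,
// which is why the division must be done on a signed type.
static int64_t adjustPC21(uint64_t Value, bool &InRange) {
  Value -= 4;                         // relative to the next instruction
  int64_t Scaled = (int64_t)Value / 4;
  InRange = isIntN(21, Scaled);
  return Scaled;
}

int main() {
  bool Ok;
  // Backward branch: the target is 0x100 bytes before the fixup.
  int64_t Enc = adjustPC21((uint64_t)-0x100, Ok);
  printf("field = %lld, in range = %d\n", (long long)Enc, Ok);
  return 0;
}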
// @@ -229,6 +247,10 @@ getFixupKindInfo(MCFixupKind Kind) const { { "fixup_Mips_GOT_LO16", 0, 16, 0 }, { "fixup_Mips_CALL_HI16", 0, 16, 0 }, { "fixup_Mips_CALL_LO16", 0, 16, 0 }, + { "fixup_MIPS_PC21_S2", 0, 21, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PC26_S2", 0, 26, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PCHI16", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PCLO16", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_MICROMIPS_26_S1", 0, 26, 0 }, { "fixup_MICROMIPS_HI16", 0, 16, 0 }, { "fixup_MICROMIPS_LO16", 0, 16, 0 }, @@ -246,12 +268,76 @@ getFixupKindInfo(MCFixupKind Kind) const { { "fixup_MICROMIPS_TLS_TPREL_LO16", 0, 16, 0 } }; + const static MCFixupKindInfo BigEndianInfos[Mips::NumTargetFixupKinds] = { + // This table *must* be in same the order of fixup_* kinds in + // MipsFixupKinds.h. + // + // name offset bits flags + { "fixup_Mips_16", 16, 16, 0 }, + { "fixup_Mips_32", 0, 32, 0 }, + { "fixup_Mips_REL32", 0, 32, 0 }, + { "fixup_Mips_26", 6, 26, 0 }, + { "fixup_Mips_HI16", 16, 16, 0 }, + { "fixup_Mips_LO16", 16, 16, 0 }, + { "fixup_Mips_GPREL16", 16, 16, 0 }, + { "fixup_Mips_LITERAL", 16, 16, 0 }, + { "fixup_Mips_GOT_Global", 16, 16, 0 }, + { "fixup_Mips_GOT_Local", 16, 16, 0 }, + { "fixup_Mips_PC16", 16, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_Mips_CALL16", 16, 16, 0 }, + { "fixup_Mips_GPREL32", 0, 32, 0 }, + { "fixup_Mips_SHIFT5", 21, 5, 0 }, + { "fixup_Mips_SHIFT6", 21, 5, 0 }, + { "fixup_Mips_64", 0, 64, 0 }, + { "fixup_Mips_TLSGD", 16, 16, 0 }, + { "fixup_Mips_GOTTPREL", 16, 16, 0 }, + { "fixup_Mips_TPREL_HI", 16, 16, 0 }, + { "fixup_Mips_TPREL_LO", 16, 16, 0 }, + { "fixup_Mips_TLSLDM", 16, 16, 0 }, + { "fixup_Mips_DTPREL_HI", 16, 16, 0 }, + { "fixup_Mips_DTPREL_LO", 16, 16, 0 }, + { "fixup_Mips_Branch_PCRel",16, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_Mips_GPOFF_HI", 16, 16, 0 }, + { "fixup_Mips_GPOFF_LO", 16, 16, 0 }, + { "fixup_Mips_GOT_PAGE", 16, 16, 0 }, + { "fixup_Mips_GOT_OFST", 16, 16, 0 }, + { "fixup_Mips_GOT_DISP", 16, 16, 0 }, + { "fixup_Mips_HIGHER", 16, 16, 0 }, + { "fixup_Mips_HIGHEST", 16, 16, 0 }, + { "fixup_Mips_GOT_HI16", 16, 16, 0 }, + { "fixup_Mips_GOT_LO16", 16, 16, 0 }, + { "fixup_Mips_CALL_HI16", 16, 16, 0 }, + { "fixup_Mips_CALL_LO16", 16, 16, 0 }, + { "fixup_MIPS_PC21_S2", 11, 21, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PC26_S2", 6, 26, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PCHI16", 16, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MIPS_PCLO16", 16, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MICROMIPS_26_S1", 6, 26, 0 }, + { "fixup_MICROMIPS_HI16", 16, 16, 0 }, + { "fixup_MICROMIPS_LO16", 16, 16, 0 }, + { "fixup_MICROMIPS_GOT16", 16, 16, 0 }, + { "fixup_MICROMIPS_PC16_S1",16, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_MICROMIPS_CALL16", 16, 16, 0 }, + { "fixup_MICROMIPS_GOT_DISP", 16, 16, 0 }, + { "fixup_MICROMIPS_GOT_PAGE", 16, 16, 0 }, + { "fixup_MICROMIPS_GOT_OFST", 16, 16, 0 }, + { "fixup_MICROMIPS_TLS_GD", 16, 16, 0 }, + { "fixup_MICROMIPS_TLS_LDM", 16, 16, 0 }, + { "fixup_MICROMIPS_TLS_DTPREL_HI16", 16, 16, 0 }, + { "fixup_MICROMIPS_TLS_DTPREL_LO16", 16, 16, 0 }, + { "fixup_MICROMIPS_TLS_TPREL_HI16", 16, 16, 0 }, + { "fixup_MICROMIPS_TLS_TPREL_LO16", 16, 16, 0 } + }; + if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && "Invalid kind!"); - return Infos[Kind - FirstTargetFixupKind]; + + if (IsLittle) + return LittleEndianInfos[Kind - FirstTargetFixupKind]; + return BigEndianInfos[Kind - 
FirstTargetFixupKind]; } /// WriteNopData - Write an (optimal) nop sequence of Count bytes diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index cc5207a..bc695e6 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -37,14 +37,14 @@ public: : MCAsmBackend(), OSType(_OSType), IsLittle(_isLittle), Is64Bit(_is64Bit) {} - MCObjectWriter *createObjectWriter(raw_ostream &OS) const; + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override; void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const; + uint64_t Value, bool IsPCRel) const override; - const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const; + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; - unsigned getNumFixupKinds() const { + unsigned getNumFixupKinds() const override { return Mips::NumTargetFixupKinds; } @@ -55,7 +55,7 @@ public: /// relaxation. /// /// \param Inst - The instruction to test. - bool mayNeedRelaxation(const MCInst &Inst) const { + bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } @@ -63,7 +63,7 @@ public: /// fixup requires the associated instruction to be relaxed. bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const { + const MCAsmLayout &Layout) const override { // FIXME. assert(0 && "RelaxInstruction() unimplemented"); return false; @@ -75,16 +75,16 @@ public: /// \param Inst - The instruction to relax, which may be the same /// as the output. /// \param [out] Res On return, the relaxed instruction. - void relaxInstruction(const MCInst &Inst, MCInst &Res) const {} + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {} /// @} - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const; + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup, const MCFragment *DF, const MCValue &Target, uint64_t &Value, - bool &IsResolved); + bool &IsResolved) override; }; // class MipsAsmBackend diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 794978b..74c12ff 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -193,6 +193,18 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, case Mips::fixup_MICROMIPS_TLS_TPREL_LO16: Type = ELF::R_MICROMIPS_TLS_TPREL_LO16; break; + case Mips::fixup_MIPS_PC21_S2: + Type = ELF::R_MIPS_PC21_S2; + break; + case Mips::fixup_MIPS_PC26_S2: + Type = ELF::R_MIPS_PC26_S2; + break; + case Mips::fixup_MIPS_PCHI16: + Type = ELF::R_MIPS_PCHI16; + break; + case Mips::fixup_MIPS_PCLO16: + Type = ELF::R_MIPS_PCLO16; + break; } return Type; } diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h index dc6192c..3079004 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h +++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h @@ -128,6 +128,18 @@ namespace Mips { // resulting in - R_MIPS_CALL_LO16 fixup_Mips_CALL_LO16, + // resulting in - R_MIPS_PC21_S2 + fixup_MIPS_PC21_S2, + + // resulting in - R_MIPS_PC26_S2 + fixup_MIPS_PC26_S2, + + // resulting in - R_MIPS_PCHI16 + fixup_MIPS_PCHI16, + + // resulting in - R_MIPS_PCLO16 + fixup_MIPS_PCLO16, + // 
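The reason the new big-endian table moves every 16-bit immediate fixup to bit offset 16 (and fixup_Mips_26 to offset 6) is that MCFixupKindInfo offsets count bits from the start of the emitted byte stream: in a big-endian stream the high byte of the word comes first, so a field occupying the low half of the instruction starts two bytes in. A small illustration; the lui encoding is just a convenient example word:

#include <cstdint>
#include <cstdio>

int main() {
  // The same instruction word, as the assembler's byte stream sees it.
  // A 16-bit immediate in the low half of the word starts at bit offset
  // 0 of a little-endian stream but bit offset 16 of a big-endian one,
  // which is exactly the difference between the two tables above.
  uint32_t Word = 0x3c1c0042; // lui $gp, 0x42 : imm16 = 0x0042
  uint8_t LE[4] = {uint8_t(Word), uint8_t(Word >> 8),
                   uint8_t(Word >> 16), uint8_t(Word >> 24)};
  uint8_t BE[4] = {uint8_t(Word >> 24), uint8_t(Word >> 16),
                   uint8_t(Word >> 8), uint8_t(Word)};
  printf("LE bytes: %02x %02x %02x %02x (imm16 first)\n",
         LE[0], LE[1], LE[2], LE[3]);
  printf("BE bytes: %02x %02x %02x %02x (imm16 last)\n",
         BE[0], BE[1], BE[2], BE[3]);
  return 0;
}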
resulting in - R_MICROMIPS_26_S1 fixup_MICROMIPS_26_S1, diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h index 1000113..37ba0c4 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h @@ -20,7 +20,7 @@ namespace llvm { class StringRef; class MipsMCAsmInfo : public MCAsmInfoELF { - virtual void anchor(); + void anchor() override; public: explicit MipsMCAsmInfo(StringRef TT); }; diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index edd2146..85e0bf1 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -12,8 +12,6 @@ //===----------------------------------------------------------------------===// // -#define DEBUG_TYPE "mccodeemitter" - #include "MipsMCCodeEmitter.h" #include "MCTargetDesc/MipsFixupKinds.h" #include "MCTargetDesc/MipsMCExpr.h" @@ -28,6 +26,8 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/raw_ostream.h" +#define DEBUG_TYPE "mccodeemitter" + #define GET_INSTRMAP_INFO #include "MipsGenInstrInfo.inc" #undef GET_INSTRMAP_INFO @@ -242,6 +242,69 @@ getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo, return 0; } +/// getBranchTarget21OpValue - Return binary encoding of the branch +/// target operand. If the machine operand requires relocation, +/// record the relocation and return zero. +unsigned MipsMCCodeEmitter:: +getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + + const MCOperand &MO = MI.getOperand(OpNo); + + // If the destination is an immediate, divide by 4. + if (MO.isImm()) return MO.getImm() >> 2; + + assert(MO.isExpr() && + "getBranchTarget21OpValue expects only expressions or immediates"); + + const MCExpr *Expr = MO.getExpr(); + Fixups.push_back(MCFixup::Create(0, Expr, + MCFixupKind(Mips::fixup_MIPS_PC21_S2))); + return 0; +} + +/// getBranchTarget26OpValue - Return binary encoding of the branch +/// target operand. If the machine operand requires relocation, +/// record the relocation and return zero. +unsigned MipsMCCodeEmitter:: +getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + + const MCOperand &MO = MI.getOperand(OpNo); + + // If the destination is an immediate, divide by 4. + if (MO.isImm()) return MO.getImm() >> 2; + + assert(MO.isExpr() && + "getBranchTarget26OpValue expects only expressions or immediates"); + + const MCExpr *Expr = MO.getExpr(); + Fixups.push_back(MCFixup::Create(0, Expr, + MCFixupKind(Mips::fixup_MIPS_PC26_S2))); + return 0; +} + +/// getJumpOffset16OpValue - Return binary encoding of the jump +/// target operand. If the machine operand requires relocation, +/// record the relocation and return zero. +unsigned MipsMCCodeEmitter:: +getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + + const MCOperand &MO = MI.getOperand(OpNo); + + if (MO.isImm()) return MO.getImm(); + + assert(MO.isExpr() && + "getJumpOffset16OpValue expects only expressions or an immediate"); + + // TODO: Push fixup. + return 0; +} + /// getJumpTargetOpValue - Return binary encoding of the jump /// target operand. If the machine operand requires relocation, /// record the relocation and return zero. 
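getBranchTarget21OpValue and getBranchTarget26OpValue above both follow the standard MC encoder shape: a known immediate is scaled down and encoded directly, while a symbolic operand records a PC-relative fixup and encodes zero for the assembler to patch later. A stripped-down model of that shape; the struct and enum are stand-ins for MCOperand and the target fixup kinds:

#include <cstdint>
#include <vector>

// Stand-ins for MCOperand and the target fixup kinds.
struct OperandModel { bool IsImm; int64_t Imm; };
enum FixupKind { fixup_MIPS_PC26_S2 };

static unsigned encodeBranchTarget26(const OperandModel &MO,
                                     std::vector<FixupKind> &Fixups) {
  if (MO.IsImm)
    return (unsigned)(MO.Imm >> 2); // scale the byte offset to words
  // Symbolic target: defer to the assembler via a fixup, encode 0.
  Fixups.push_back(fixup_MIPS_PC26_S2);
  return 0;
}

int main() {
  std::vector<FixupKind> Fixups;
  OperandModel Imm{true, 64};  // direct immediate: encodes 64 >> 2 = 16
  OperandModel Sym{false, 0};  // symbolic: records a fixup, encodes 0
  unsigned A = encodeBranchTarget26(Imm, Fixups);
  unsigned B = encodeBranchTarget26(Sym, Fixups);
  return (A == 16 && B == 0 && Fixups.size() == 1) ? 0 : 1;
}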
@@ -417,6 +480,12 @@ getExprOpValue(const MCExpr *Expr,SmallVectorImpl &Fixups, case MCSymbolRefExpr::VK_Mips_CALL_LO16: FixupKind = Mips::fixup_Mips_CALL_LO16; break; + case MCSymbolRefExpr::VK_Mips_PCREL_HI16: + FixupKind = Mips::fixup_MIPS_PCHI16; + break; + case MCSymbolRefExpr::VK_Mips_PCREL_LO16: + FixupKind = Mips::fixup_MIPS_PCLO16; + break; } // switch Fixups.push_back(MCFixup::Create(0, Expr, MCFixupKind(FixupKind))); @@ -548,5 +617,15 @@ MipsMCCodeEmitter::getLSAImmEncoding(const MCInst &MI, unsigned OpNo, return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI) - 1; } -#include "MipsGenMCCodeEmitter.inc" +unsigned +MipsMCCodeEmitter::getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + assert(MI.getOperand(OpNo).isImm()); + // The immediate is encoded as 'immediate << 2'. + unsigned Res = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI); + assert((Res & 3) == 0); + return Res >> 2; +} +#include "MipsGenMCCodeEmitter.inc" diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h index 49a2490..3f7daab 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h @@ -52,7 +52,7 @@ public: void EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI) const override; // getBinaryCodeForInstr - TableGen'erated function for getting the // binary encoding for an instruction. @@ -88,6 +88,27 @@ public: SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + // getBranchTarget21OpValue - Return binary encoding of the branch + // offset operand. If the machine operand requires relocation, + // record the relocation and return zero. + unsigned getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + // getBranchTarget26OpValue - Return binary encoding of the branch + // offset operand. If the machine operand requires relocation, + // record the relocation and return zero. + unsigned getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + // getJumpOffset16OpValue - Return binary encoding of the jump + // offset operand. If the machine operand requires relocation, + // record the relocation and return zero. + unsigned getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + // getMachineOpValue - Return binary encoding of operand. If the machin // operand requires relocation, record the relocation and return zero. 
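getSimm19Lsl2Encoding and the disassembler's DecodeSimm19Lsl2 are exact inverses on 4-byte-aligned immediates, which is what the (Res & 3) == 0 assertion enforces. A quick round-trip check under that assumption, with signExtend32 again standing in for llvm::SignExtend32:

#include <cassert>
#include <cstdint>

// Stand-in for llvm::SignExtend32<B>.
template <unsigned B> static int32_t signExtend32(uint32_t X) {
  return (int32_t)(X << (32 - B)) >> (32 - B);
}

int main() {
  int32_t Imm = -1024;                      // must be a multiple of 4
  assert((Imm & 3) == 0);                   // the encoder's assertion
  uint32_t Field = (uint32_t)(Imm >> 2) & 0x7ffff; // 19-bit field
  assert(signExtend32<19>(Field) * 4 == Imm);      // the decoder's view
  return 0;
}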
unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, @@ -116,6 +137,10 @@ public: SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + unsigned getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getExprOpValue(const MCExpr *Expr, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp index c7ba12d..21ccc3c 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mipsmcexpr" #include "MipsMCExpr.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" @@ -15,6 +14,8 @@ using namespace llvm; +#define DEBUG_TYPE "mipsmcexpr" + bool MipsMCExpr::isSupportedBinaryExpr(MCSymbolRefExpr::VariantKind VK, const MCBinaryExpr *BE) { switch (VK) { diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h index 722bba7..8d7aacd 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h @@ -46,16 +46,16 @@ public: /// getSubExpr - Get the child of this expression. const MCExpr *getSubExpr() const { return Expr; } - void PrintImpl(raw_ostream &OS) const; + void PrintImpl(raw_ostream &OS) const override; bool EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const; - void AddValueSymbols(MCAssembler *) const; - const MCSection *FindAssociatedSection() const { + const MCAsmLayout *Layout) const override; + void AddValueSymbols(MCAssembler *) const override; + const MCSection *FindAssociatedSection() const override { return getSubExpr()->FindAssociatedSection(); } // There are no TLS MipsMCExprs at the moment. - void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {} + void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {} static bool classof(const MCExpr *E) { return E->getKind() == MCExpr::Target; diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h index 6992d06..01d5363 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h @@ -18,7 +18,7 @@ namespace llvm { static const unsigned MIPS_NACL_BUNDLE_ALIGN = 4u; bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx, - bool *IsStore = NULL); + bool *IsStore = nullptr); bool baseRegNeedsLoadStoreMask(unsigned Reg); // This function creates an MCELFStreamer for Mips NaCl. diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index eecca68..660e5a7 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -30,6 +30,8 @@ #include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + #define GET_INSTRINFO_MC_DESC #include "MipsGenInstrInfo.inc" @@ -39,8 +41,6 @@ #define GET_REGINFO_MC_DESC #include "MipsGenRegisterInfo.inc" -using namespace llvm; - /// Select the Mips CPU for the given triple and cpu name. 
/// FIXME: Merge with the copy in MipsSubtarget.cpp static inline StringRef selectMipsCPU(StringRef TT, StringRef CPU) { @@ -79,7 +79,7 @@ static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) { MCAsmInfo *MAI = new MipsMCAsmInfo(TT); unsigned SP = MRI.getDwarfRegNum(Mips::SP, true); - MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, SP, 0); + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, SP, 0); MAI->addInitialFrameState(Inst); return MAI; @@ -124,12 +124,11 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, static MCStreamer * createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useCFI, bool useDwarfDirectory, + bool isVerboseAsm, bool useDwarfDirectory, MCInstPrinter *InstPrint, MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst) { - MCStreamer *S = - llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory, - InstPrint, CE, TAB, ShowInst); + MCStreamer *S = llvm::createAsmStreamer( + Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst); new MipsTargetAsmStreamer(*S, OS); return S; } diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp index 639a058..cd6be73 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp @@ -17,8 +17,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mips-mc-nacl" - #include "Mips.h" #include "MipsELFStreamer.h" #include "MipsMCNaCl.h" @@ -26,6 +24,8 @@ using namespace llvm; +#define DEBUG_TYPE "mips-mc-nacl" + namespace { const unsigned IndirectBranchMaskReg = Mips::T6; @@ -120,7 +120,8 @@ private: public: /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to mask dangerous instructions. - virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) { + void EmitInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI) override { // Sandbox indirect jumps. 
if (isIndirectJump(Inst)) { if (PendingCall) diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index fb6aff2..a8fa272 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -85,6 +85,13 @@ void MipsTargetAsmStreamer::emitDirectiveEnt(const MCSymbol &Symbol) { } void MipsTargetAsmStreamer::emitDirectiveAbiCalls() { OS << "\t.abicalls\n"; } + +void MipsTargetAsmStreamer::emitDirectiveNaN2008() { OS << "\t.nan\t2008\n"; } + +void MipsTargetAsmStreamer::emitDirectiveNaNLegacy() { + OS << "\t.nan\tlegacy\n"; +} + void MipsTargetAsmStreamer::emitDirectiveOptionPic0() { OS << "\t.option\tpic0\n"; } @@ -137,6 +144,29 @@ void MipsTargetAsmStreamer::emitFMask(unsigned FPUBitmask, OS << "," << FPUTopSavedRegOff << '\n'; } +void MipsTargetAsmStreamer::emitDirectiveCpload(unsigned RegNo) { + OS << "\t.cpload\t$" + << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n"; +} + +void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo, + int RegOrOffset, + const MCSymbol &Sym, + bool IsReg) { + OS << "\t.cpsetup\t$" + << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << ", "; + + if (IsReg) + OS << "$" + << StringRef(MipsInstPrinter::getRegisterName(RegOrOffset)).lower(); + else + OS << RegOrOffset; + + OS << ", "; + + OS << Sym.getName() << "\n"; +} + // This part is for ELF object output. MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI) @@ -180,6 +210,10 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S, EFlags |= ELF::EF_MIPS_ABI_O32; } + // Other options. + if (Features & Mips::FeatureNaN2008) + EFlags |= ELF::EF_MIPS_NAN2008; + MCA.setELFHeaderEFlags(EFlags); } @@ -325,6 +359,21 @@ void MipsTargetELFStreamer::emitDirectiveAbiCalls() { Flags |= ELF::EF_MIPS_CPIC | ELF::EF_MIPS_PIC; MCA.setELFHeaderEFlags(Flags); } + +void MipsTargetELFStreamer::emitDirectiveNaN2008() { + MCAssembler &MCA = getStreamer().getAssembler(); + unsigned Flags = MCA.getELFHeaderEFlags(); + Flags |= ELF::EF_MIPS_NAN2008; + MCA.setELFHeaderEFlags(Flags); +} + +void MipsTargetELFStreamer::emitDirectiveNaNLegacy() { + MCAssembler &MCA = getStreamer().getAssembler(); + unsigned Flags = MCA.getELFHeaderEFlags(); + Flags &= ~ELF::EF_MIPS_NAN2008; + MCA.setELFHeaderEFlags(Flags); +} + void MipsTargetELFStreamer::emitDirectiveOptionPic0() { MCAssembler &MCA = getStreamer().getAssembler(); unsigned Flags = MCA.getELFHeaderEFlags(); @@ -376,3 +425,107 @@ void MipsTargetELFStreamer::emitDirectiveSetMips64R2() { void MipsTargetELFStreamer::emitDirectiveSetDsp() { // No action required for ELF output. } + +void MipsTargetELFStreamer::emitDirectiveCpload(unsigned RegNo) { + // .cpload $reg + // This directive expands to: + // lui $gp, %hi(_gp_disp) + // addui $gp, $gp, %lo(_gp_disp) + // addu $gp, $gp, $reg + // when support for position independent code is enabled. + if (!Pic || (isN32() || isN64())) + return; + + // There's a GNU extension controlled by -mno-shared that allows + // locally-binding symbols to be accessed using absolute addresses. + // This is currently not supported. 
When supported -mno-shared makes + // .cpload expand to: + // lui $gp, %hi(__gnu_local_gp) + // addiu $gp, $gp, %lo(__gnu_local_gp) + + StringRef SymName("_gp_disp"); + MCAssembler &MCA = getStreamer().getAssembler(); + MCSymbol *GP_Disp = MCA.getContext().GetOrCreateSymbol(SymName); + MCA.getOrCreateSymbolData(*GP_Disp); + + MCInst TmpInst; + TmpInst.setOpcode(Mips::LUi); + TmpInst.addOperand(MCOperand::CreateReg(Mips::GP)); + const MCSymbolRefExpr *HiSym = MCSymbolRefExpr::Create( + "_gp_disp", MCSymbolRefExpr::VK_Mips_ABS_HI, MCA.getContext()); + TmpInst.addOperand(MCOperand::CreateExpr(HiSym)); + getStreamer().EmitInstruction(TmpInst, STI); + + TmpInst.clear(); + + TmpInst.setOpcode(Mips::ADDiu); + TmpInst.addOperand(MCOperand::CreateReg(Mips::GP)); + TmpInst.addOperand(MCOperand::CreateReg(Mips::GP)); + const MCSymbolRefExpr *LoSym = MCSymbolRefExpr::Create( + "_gp_disp", MCSymbolRefExpr::VK_Mips_ABS_LO, MCA.getContext()); + TmpInst.addOperand(MCOperand::CreateExpr(LoSym)); + getStreamer().EmitInstruction(TmpInst, STI); + + TmpInst.clear(); + + TmpInst.setOpcode(Mips::ADDu); + TmpInst.addOperand(MCOperand::CreateReg(Mips::GP)); + TmpInst.addOperand(MCOperand::CreateReg(Mips::GP)); + TmpInst.addOperand(MCOperand::CreateReg(RegNo)); + getStreamer().EmitInstruction(TmpInst, STI); +} + +void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, + int RegOrOffset, + const MCSymbol &Sym, + bool IsReg) { + // Only N32 and N64 emit anything for .cpsetup iff PIC is set. + if (!Pic || !(isN32() || isN64())) + return; + + MCAssembler &MCA = getStreamer().getAssembler(); + MCInst Inst; + + // Either store the old $gp in a register or on the stack + if (IsReg) { + // move $save, $gpreg + Inst.setOpcode(Mips::DADDu); + Inst.addOperand(MCOperand::CreateReg(RegOrOffset)); + Inst.addOperand(MCOperand::CreateReg(Mips::GP)); + Inst.addOperand(MCOperand::CreateReg(Mips::ZERO)); + } else { + // sd $gpreg, offset($sp) + Inst.setOpcode(Mips::SD); + Inst.addOperand(MCOperand::CreateReg(Mips::GP)); + Inst.addOperand(MCOperand::CreateReg(Mips::SP)); + Inst.addOperand(MCOperand::CreateImm(RegOrOffset)); + } + getStreamer().EmitInstruction(Inst, STI); + Inst.clear(); + + const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::Create( + Sym.getName(), MCSymbolRefExpr::VK_Mips_GPOFF_HI, MCA.getContext()); + const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::Create( + Sym.getName(), MCSymbolRefExpr::VK_Mips_GPOFF_LO, MCA.getContext()); + // lui $gp, %hi(%neg(%gp_rel(funcSym))) + Inst.setOpcode(Mips::LUi); + Inst.addOperand(MCOperand::CreateReg(Mips::GP)); + Inst.addOperand(MCOperand::CreateExpr(HiExpr)); + getStreamer().EmitInstruction(Inst, STI); + Inst.clear(); + + // addiu $gp, $gp, %lo(%neg(%gp_rel(funcSym))) + Inst.setOpcode(Mips::ADDiu); + Inst.addOperand(MCOperand::CreateReg(Mips::GP)); + Inst.addOperand(MCOperand::CreateReg(Mips::GP)); + Inst.addOperand(MCOperand::CreateExpr(LoExpr)); + getStreamer().EmitInstruction(Inst, STI); + Inst.clear(); + + // daddu $gp, $gp, $funcreg + Inst.setOpcode(Mips::DADDu); + Inst.addOperand(MCOperand::CreateReg(Mips::GP)); + Inst.addOperand(MCOperand::CreateReg(Mips::GP)); + Inst.addOperand(MCOperand::CreateReg(RegNo)); + getStreamer().EmitInstruction(Inst, STI); +} diff --git a/lib/Target/Mips/Makefile b/lib/Target/Mips/Makefile index bcf951e..41efa47 100644 --- a/lib/Target/Mips/Makefile +++ b/lib/Target/Mips/Makefile @@ -13,7 +13,7 @@ TARGET = Mips # Make sure that tblgen is run, first thing. 
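The .cpload expansion above leans on the usual MIPS %hi/%lo split: %hi carries a +0x8000 rounding (the same adjustment the HI16 fixup kinds receive in adjustFixupValue earlier in this patch), so that adding the sign-extended %lo back via addiu reconstructs the full 32-bit value. A self-contained check of that identity:

#include <cassert>
#include <cstdint>

// %hi(x): high half plus rounding, as in adjustFixupValue's HI16 case.
static uint32_t hi(uint32_t X) { return ((X + 0x8000) >> 16) & 0xffff; }
// %lo(x): the low half, sign-extended when addiu adds it back.
static int32_t lo(uint32_t X) { return (int16_t)(X & 0xffff); }

int main() {
  uint32_t Addr = 0x12348ab0; // low half >= 0x8000 forces the round-up
  assert(hi(Addr) == 0x1235 && lo(Addr) == -0x7550);
  assert((hi(Addr) << 16) + lo(Addr) == Addr);
  return 0;
}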
BUILT_SOURCES = MipsGenRegisterInfo.inc MipsGenInstrInfo.inc \ - MipsGenAsmWriter.inc MipsGenCodeEmitter.inc \ + MipsGenAsmWriter.inc MipsGenFastISel.inc MipsGenCodeEmitter.inc \ MipsGenDAGISel.inc MipsGenCallingConv.inc \ MipsGenSubtargetInfo.inc MipsGenMCCodeEmitter.inc \ MipsGenDisassemblerTables.inc \ diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td index 91d447a..d95f9b0 100644 --- a/lib/Target/Mips/MicroMipsInstrFPU.td +++ b/lib/Target/Mips/MicroMipsInstrFPU.td @@ -28,9 +28,9 @@ def LWXC1_MM : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, def SWXC1_MM : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM_MM<0x88>; def LUXC1_MM : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, - LWXC1_FM_MM<0x148>; + LWXC1_FM_MM<0x148>, INSN_MIPS5_32R2; def SUXC1_MM : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, - SWXC1_FM_MM<0x188>; + SWXC1_FM_MM<0x188>, INSN_MIPS5_32R2; def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM_MM<0>; @@ -70,9 +70,9 @@ def FSQRT_MM : MMRel, ABSS_FT<"sqrt.d", AFGR64Opnd, AFGR64Opnd, II_SQRT_D, fsqrt>, ROUND_W_FM_MM<1, 0x28>; def CVT_L_S_MM : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>, - ROUND_W_FM_MM<0, 0x4>; + ROUND_W_FM_MM<0, 0x4>, INSN_MIPS3_32R2; def CVT_L_D64_MM : MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>, - ROUND_W_FM_MM<1, 0x4>; + ROUND_W_FM_MM<1, 0x4>, INSN_MIPS3_32R2; def FABS_S_MM : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>, ABS_FM_MM<0, 0xd>; @@ -95,7 +95,7 @@ def FNEG_MM : MMRel, ABSS_FT<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>, ABS_FM_MM<1, 0x2d>; def FMOV_D32_MM : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>, - ABS_FM_MM<1, 0x1>, Requires<[NotFP64bit, HasStdEnc]>; + ABS_FM_MM<1, 0x1>, AdditionalRequires<[NotFP64bit]>; def MOVZ_I_S_MM : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>, CMov_I_F_FM_MM<0x78, 0>; @@ -124,9 +124,9 @@ def MFC1_MM : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, def MTC1_MM : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1, bitconvert>, MFC1_FM_MM<0xa0>; def MFHC1_MM : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, II_MFHC1>, - MFC1_FM_MM<3>; + MFC1_FM_MM<3>, ISA_MIPS32R2; def MTHC1_MM : MMRel, MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, II_MTHC1>, - MFC1_FM_MM<7>; + MFC1_FM_MM<7>, ISA_MIPS32R2; def MADD_S_MM : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>, MADDS_FM_MM<0x1>; diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td index 3f13e83..9904bc6 100644 --- a/lib/Target/Mips/MicroMipsInstrInfo.td +++ b/lib/Target/Mips/MicroMipsInstrInfo.td @@ -218,15 +218,20 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { def MSUBU_MM : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM_MM<0x3ec>; /// Count Leading - def CLZ_MM : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM_MM<0x16c>; - def CLO_MM : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM_MM<0x12c>; + def CLZ_MM : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM_MM<0x16c>, + ISA_MIPS32; + def CLO_MM : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM_MM<0x12c>, + ISA_MIPS32; /// Sign Ext In Register Instructions. 
- def SEB_MM : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>, SEB_FM_MM<0x0ac>; - def SEH_MM : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>, SEB_FM_MM<0x0ec>; + def SEB_MM : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>, + SEB_FM_MM<0x0ac>, ISA_MIPS32R2; + def SEH_MM : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>, + SEB_FM_MM<0x0ec>, ISA_MIPS32R2; /// Word Swap Bytes Within Halfwords - def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM_MM<0x1ec>; + def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM_MM<0x1ec>, + ISA_MIPS32R2; def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, EXT_FM_MM<0x2c>; @@ -268,8 +273,10 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { def WAIT_MM : WaitMM<"wait">, WAIT_FM_MM; def ERET_MM : MMRel, ER_FT<"eret">, ER_FM_MM<0x3cd>; def DERET_MM : MMRel, ER_FT<"deret">, ER_FM_MM<0x38d>; - def EI_MM : MMRel, DEI_FT<"ei", GPR32Opnd>, EI_FM_MM<0x15d>; - def DI_MM : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM_MM<0x11d>; + def EI_MM : MMRel, DEI_FT<"ei", GPR32Opnd>, EI_FM_MM<0x15d>, + ISA_MIPS32R2; + def DI_MM : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM_MM<0x11d>, + ISA_MIPS32R2; /// Trap Instructions def TEQ_MM : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM_MM<0x0>; @@ -296,5 +303,5 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { //===----------------------------------------------------------------------===// let Predicates = [InMicroMips] in { - def : InstAlias<"wait", (WAIT_MM 0x0), 1>; + def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>; } diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index 10a4699..ea16331 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -15,6 +15,33 @@ include "llvm/Target/Target.td" +// The overall idea of the PredicateControl class is to chop the Predicates list +// into subsets that are usually overridden independently. This allows +// subclasses to partially override the predicates of their superclasses without +// having to re-add all the existing predicates. 
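The PredicateControl comment above describes the scheme in prose; a C++ model of what its !listconcat amounts to may make it concrete. Each subset can be overridden on its own, and the final Predicates list is just the concatenation, so an instruction class that sets InsnPredicates never clobbers the encoding or register-width predicates a superclass established:

#include <string>
#include <vector>

// C++ model of PredicateControl's !listconcat (names mirror the .td).
struct PredicateControl {
  std::vector<std::string> EncodingPredicates, GPRPredicates,
      FGRPredicates, InsnPredicates, AdditionalPredicates;
  std::vector<std::string> predicates() const {
    std::vector<std::string> All;
    for (const auto *L : {&EncodingPredicates, &GPRPredicates,
                          &FGRPredicates, &InsnPredicates,
                          &AdditionalPredicates})
      All.insert(All.end(), L->begin(), L->end());
    return All;
  }
};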
+class PredicateControl { + // Predicates for the encoding scheme in use such as HasStdEnc + list EncodingPredicates = []; + // Predicates for the GPR size such as IsGP64bit + list GPRPredicates = []; + // Predicates for the FGR size and layout such as IsFP64bit + list FGRPredicates = []; + // Predicates for the instruction group membership such as ISA's and ASE's + list InsnPredicates = []; + // Predicates for anything else + list AdditionalPredicates = []; + list Predicates = !listconcat(EncodingPredicates, + GPRPredicates, + FGRPredicates, + InsnPredicates, + AdditionalPredicates); +} + +// Like Requires<> but for the AdditionalPredicates list +class AdditionalRequires preds> { + list AdditionalPredicates = preds; +} + //===----------------------------------------------------------------------===// // Register File, Calling Conv, Instruction Descriptions //===----------------------------------------------------------------------===// @@ -34,6 +61,8 @@ def FeatureGP64Bit : SubtargetFeature<"gp64", "IsGP64bit", "true", "General Purpose Registers are 64-bit wide.">; def FeatureFP64Bit : SubtargetFeature<"fp64", "IsFP64bit", "true", "Support 64-bit FP registers.">; +def FeatureNaN2008 : SubtargetFeature<"nan2008", "IsNaN2008bit", "true", + "IEEE 754-2008 NaN encoding.">; def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat", "true", "Only supports single precision float">; def FeatureO32 : SubtargetFeature<"o32", "MipsABI", "O32", @@ -46,33 +75,62 @@ def FeatureEABI : SubtargetFeature<"eabi", "MipsABI", "EABI", "Enable eabi ABI">; def FeatureVFPU : SubtargetFeature<"vfpu", "HasVFPU", "true", "Enable vector FPU instructions.">; -def FeatureSEInReg : SubtargetFeature<"seinreg", "HasSEInReg", "true", - "Enable 'signext in register' instructions.">; -def FeatureCondMov : SubtargetFeature<"condmov", "HasCondMov", "true", - "Enable 'conditional move' instructions.">; -def FeatureSwap : SubtargetFeature<"swap", "HasSwap", "true", - "Enable 'byte/half swap' instructions.">; -def FeatureBitCount : SubtargetFeature<"bitcount", "HasBitCount", "true", - "Enable 'count leading bits' instructions.">; -def FeatureFPIdx : SubtargetFeature<"FPIdx", "HasFPIdx", "true", - "Enable 'FP indexed load/store' instructions.">; +def FeatureMips1 : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1", + "Mips I ISA Support [highly experimental]">; +def FeatureMips2 : SubtargetFeature<"mips2", "MipsArchVersion", "Mips2", + "Mips II ISA Support [highly experimental]", + [FeatureMips1]>; +def FeatureMips3_32 : SubtargetFeature<"mips3_32", "HasMips3_32", "true", + "Subset of MIPS-III that is also in MIPS32 " + "[highly experimental]">; +def FeatureMips3_32r2 : SubtargetFeature<"mips3_32r2", "HasMips3_32r2", "true", + "Subset of MIPS-III that is also in MIPS32r2 " + "[highly experimental]">; +def FeatureMips3 : SubtargetFeature<"mips3", "MipsArchVersion", "Mips3", + "MIPS III ISA Support [highly experimental]", + [FeatureMips2, FeatureMips3_32, + FeatureMips3_32r2, FeatureGP64Bit, + FeatureFP64Bit]>; +def FeatureMips4_32 : SubtargetFeature<"mips4_32", "HasMips4_32", "true", + "Subset of MIPS-IV that is also in MIPS32 " + "[highly experimental]">; +def FeatureMips4_32r2 : SubtargetFeature<"mips4_32r2", "HasMips4_32r2", "true", + "Subset of MIPS-IV that is also in MIPS32r2 " + "[highly experimental]">; +def FeatureMips4 : SubtargetFeature<"mips4", "MipsArchVersion", + "Mips4", "MIPS IV ISA Support", + [FeatureMips3, FeatureMips4_32, + FeatureMips4_32r2]>; +def FeatureMips5_32r2 : 
SubtargetFeature<"mips5_32r2", "HasMips5_32r2", "true", + "Subset of MIPS-V that is also in MIPS32r2 " + "[highly experimental]">; +def FeatureMips5 : SubtargetFeature<"mips5", "MipsArchVersion", "Mips5", + "MIPS V ISA Support [highly experimental]", + [FeatureMips4, FeatureMips5_32r2]>; def FeatureMips32 : SubtargetFeature<"mips32", "MipsArchVersion", "Mips32", "Mips32 ISA Support", - [FeatureCondMov, FeatureBitCount]>; + [FeatureMips2, FeatureMips3_32, + FeatureMips4_32]>; def FeatureMips32r2 : SubtargetFeature<"mips32r2", "MipsArchVersion", "Mips32r2", "Mips32r2 ISA Support", - [FeatureMips32, FeatureSEInReg, FeatureSwap, - FeatureFPIdx]>; -def FeatureMips4 : SubtargetFeature<"mips4", "MipsArchVersion", - "Mips4", "MIPS IV ISA Support", - [FeatureGP64Bit, FeatureFP64Bit, - FeatureCondMov]>; + [FeatureMips3_32r2, FeatureMips4_32r2, + FeatureMips5_32r2, FeatureMips32]>; +def FeatureMips32r6 : SubtargetFeature<"mips32r6", "MipsArchVersion", + "Mips32r6", + "Mips32r6 ISA Support [experimental]", + [FeatureMips32r2, FeatureFP64Bit, + FeatureNaN2008]>; def FeatureMips64 : SubtargetFeature<"mips64", "MipsArchVersion", "Mips64", "Mips64 ISA Support", - [FeatureMips4, FeatureMips32, FeatureFPIdx]>; + [FeatureMips5, FeatureMips32]>; def FeatureMips64r2 : SubtargetFeature<"mips64r2", "MipsArchVersion", "Mips64r2", "Mips64r2 ISA Support", [FeatureMips64, FeatureMips32r2]>; +def FeatureMips64r6 : SubtargetFeature<"mips64r6", "MipsArchVersion", + "Mips64r6", + "Mips64r6 ISA Support [experimental]", + [FeatureMips32r6, FeatureMips64r2, + FeatureNaN2008]>; def FeatureMips16 : SubtargetFeature<"mips16", "InMips16Mode", "true", "Mips16 mode">; @@ -97,11 +155,18 @@ def FeatureCnMips : SubtargetFeature<"cnmips", "HasCnMips", class Proc Features> : Processor; +def : Proc<"mips1", [FeatureMips1, FeatureO32]>; +def : Proc<"mips2", [FeatureMips2, FeatureO32]>; def : Proc<"mips32", [FeatureMips32, FeatureO32]>; def : Proc<"mips32r2", [FeatureMips32r2, FeatureO32]>; +def : Proc<"mips32r6", [FeatureMips32r6, FeatureO32]>; + +def : Proc<"mips3", [FeatureMips3, FeatureN64]>; def : Proc<"mips4", [FeatureMips4, FeatureN64]>; +def : Proc<"mips5", [FeatureMips5, FeatureN64]>; def : Proc<"mips64", [FeatureMips64, FeatureN64]>; def : Proc<"mips64r2", [FeatureMips64r2, FeatureN64]>; +def : Proc<"mips64r6", [FeatureMips64r6, FeatureN64]>; def : Proc<"mips16", [FeatureMips16, FeatureO32]>; def : Proc<"octeon", [FeatureMips64r2, FeatureN64, FeatureCnMips]>; diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp index 028b049..c01d03a 100644 --- a/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/lib/Target/Mips/Mips16FrameLowering.cpp @@ -71,7 +71,7 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const { } if (hasFP(MF)) BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0) - .addReg(Mips::SP); + .addReg(Mips::SP).setMIFlag(MachineInstr::FrameSetup); } diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h index 8ce2ced..3f7829d 100644 --- a/lib/Target/Mips/Mips16FrameLowering.h +++ b/lib/Target/Mips/Mips16FrameLowering.h @@ -24,27 +24,27 @@ public: /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. 
- void emitPrologue(MachineFunction &MF) const; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + void emitPrologue(MachineFunction &MF) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo *TRI) const override; bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const; + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const override; - bool hasReservedCallFrame(const MachineFunction &MF) const; + bool hasReservedCallFrame(const MachineFunction &MF) const override; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const; + RegScavenger *RS) const override; }; } // End llvm namespace diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp index d321e21..14055d6 100644 --- a/lib/Target/Mips/Mips16HardFloat.cpp +++ b/lib/Target/Mips/Mips16HardFloat.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mips16-hard-float" #include "Mips16HardFloat.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" @@ -20,6 +19,8 @@ #include #include +#define DEBUG_TYPE "mips16-hard-float" + static void inlineAsmOut (LLVMContext &C, StringRef AsmString, BasicBlock *BB ) { std::vector AsmArgTypes; @@ -354,9 +355,8 @@ static const char *IntrinsicInline[] = }; static bool isIntrinsicInline(Function *F) { - return std::binary_search( - IntrinsicInline, array_endof(IntrinsicInline), - F->getName()); + return std::binary_search(std::begin(IntrinsicInline), + std::end(IntrinsicInline), F->getName()); } // // Returns of float, double and complex need to be handled with a helper @@ -407,11 +407,11 @@ static bool fixupFPReturnAndCall CallInst::Create(F, Params, "", &Inst ); } else if (const CallInst *CI = dyn_cast(I)) { const Value* V = CI->getCalledValue(); - const Type* T = 0; + const Type* T = nullptr; if (V) T = V->getType(); - const PointerType *PFT=0; + const PointerType *PFT=nullptr; if (T) PFT = dyn_cast(T); - const FunctionType *FT=0; + const FunctionType *FT=nullptr; if (PFT) FT = dyn_cast(PFT->getElementType()); Function *F_ = CI->getCalledFunction(); if (FT && needsFPReturnHelper(*FT) && diff --git a/lib/Target/Mips/Mips16HardFloat.h b/lib/Target/Mips/Mips16HardFloat.h index b7f712a..826887e 100644 --- a/lib/Target/Mips/Mips16HardFloat.h +++ b/lib/Target/Mips/Mips16HardFloat.h @@ -34,11 +34,11 @@ public: TM(TM_), Subtarget(TM.getSubtarget()) { } - virtual const char *getPassName() const { + const char *getPassName() const override { return "MIPS16 Hard Float Pass"; } - virtual bool runOnModule(Module &M); + bool runOnModule(Module &M) override; protected: /// Keep a pointer to the MipsSubtarget around so that we can make the right diff --git a/lib/Target/Mips/Mips16HardFloatInfo.cpp b/lib/Target/Mips/Mips16HardFloatInfo.cpp index d8b685e..2eb6e5d 100644 --- a/lib/Target/Mips/Mips16HardFloatInfo.cpp +++ b/lib/Target/Mips/Mips16HardFloatInfo.cpp @@ -30,7 +30,7 @@ const FuncNameSignature 
PredefinedFuncs[] = { { "__fixunssfsi", { FSig, NoFPRet } }, { "__fixunssfdi", { FSig, NoFPRet } }, { "__floatundisf", { NoSig, FRet } }, - { 0, { NoSig, NoFPRet } } + { nullptr, { NoSig, NoFPRet } } }; // just do a search for now. there are very few of these special cases. @@ -44,7 +44,7 @@ extern FuncSignature const *findFuncSignature(const char *name) { return &PredefinedFuncs[i].Signature; i++; } - return 0; + return nullptr; } } } diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp index 9e36546..4e86a27 100644 --- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp +++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mips-isel" #include "Mips16ISelDAGToDAG.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "Mips.h" @@ -35,6 +34,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "mips-isel" + bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { if (!Subtarget.inMips16Mode()) return false; @@ -44,7 +45,7 @@ bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { std::pair Mips16DAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, SDLoc DL, EVT Ty, bool HasLo, bool HasHi) { - SDNode *Lo = 0, *Hi = 0; + SDNode *Lo = nullptr, *Hi = nullptr; SDNode *Mul = CurDAG->getMachineNode(Opc, DL, MVT::Glue, N->getOperand(0), N->getOperand(1)); SDValue InFlag = SDValue(Mul, 0); @@ -224,10 +225,12 @@ bool Mips16DAGToDAGISel::selectAddr16( // If an indexed floating point load/store can be emitted, return false. const LSBaseSDNode *LS = dyn_cast(Parent); - if (LS && - (LS->getMemoryVT() == MVT::f32 || LS->getMemoryVT() == MVT::f64) && - Subtarget.hasFPIdx()) - return false; + if (LS) { + if (LS->getMemoryVT() == MVT::f32 && Subtarget.hasMips4_32r2()) + return false; + if (LS->getMemoryVT() == MVT::f64 && Subtarget.hasMips4_32r2()) + return false; + } } Base = Addr; Offset = CurDAG->getTargetConstant(0, ValTy); @@ -297,7 +300,7 @@ std::pair Mips16DAGToDAGISel::selectNode(SDNode *Node) { if (!SDValue(Node, 1).use_empty()) ReplaceUses(SDValue(Node, 1), SDValue(LoHi.second, 0)); - return std::make_pair(true, (SDNode*)NULL); + return std::make_pair(true, nullptr); } case ISD::MULHS: @@ -308,7 +311,7 @@ std::pair Mips16DAGToDAGISel::selectNode(SDNode *Node) { } } - return std::make_pair(false, (SDNode*)NULL); + return std::make_pair(false, nullptr); } FunctionPass *llvm::createMips16ISelDag(MipsTargetMachine &TM) { diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.h b/lib/Target/Mips/Mips16ISelDAGToDAG.h index 49dc6e5..e653b39 100644 --- a/lib/Target/Mips/Mips16ISelDAGToDAG.h +++ b/lib/Target/Mips/Mips16ISelDAGToDAG.h @@ -28,16 +28,16 @@ private: SDValue getMips16SPAliasReg(); - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; void getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg); - virtual bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base, - SDValue &Offset, SDValue &Alias); + bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Offset, SDValue &Alias) override; - virtual std::pair selectNode(SDNode *Node); + std::pair selectNode(SDNode *Node) override; - virtual void processFunctionAfterISel(MachineFunction &MF); + void processFunctionAfterISel(MachineFunction &MF) override; // Insert instructions to initialize the global base register in the // first MBB of the function. 
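A large share of this patch is mechanical virtual-to-override conversion, and it is not just style: override turns a silently hidden virtual into a compile error, which matters when base-class hooks change shape (as createMCAsmStreamer's dropped useCFI parameter does above). A minimal illustration with made-up class names:

struct AsmBackendBase {
  // Imagine this hook once took (int Value, bool UseCFI) and the
  // parameter was later dropped, as happened to the streamer callback.
  virtual void applyFixup(int Value) const {}
  virtual ~AsmBackendBase() {}
};

struct MipsLikeBackend : AsmBackendBase {
  // Without override, the stale two-argument signature would still
  // compile and silently declare a brand new virtual nothing calls:
  //   virtual void applyFixup(int Value, bool UseCFI) const;
  // Marked override, the mismatch is rejected at compile time.
  void applyFixup(int Value) const override {}
};

int main() { return 0; }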
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp index 5c6f302..9102450 100644 --- a/lib/Target/Mips/Mips16ISelLowering.cpp +++ b/lib/Target/Mips/Mips16ISelLowering.cpp @@ -10,7 +10,6 @@ // Subclass of MipsTargetLowering specialized for mips16. // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mips-lower" #include "Mips16ISelLowering.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "MipsRegisterInfo.h" @@ -23,6 +22,8 @@ using namespace llvm; +#define DEBUG_TYPE "mips-lower" + static cl::opt DontExpandCondPseudos16( "mips16-dont-expand-cond-pseudo", cl::init(false), @@ -353,7 +354,7 @@ unsigned int Mips16TargetLowering::getMips16HelperFunctionStubNumber #define T P "0" , T1 #define P P_ static char const * vMips16Helper[MAX_STUB_NUMBER+1] = - {0, T1 }; + {nullptr, T1 }; #undef P #define P P_ "sf_" static char const * sfMips16Helper[MAX_STUB_NUMBER+1] = @@ -430,7 +431,7 @@ getOpndList(SmallVectorImpl &Ops, SelectionDAG &DAG = CLI.DAG; MachineFunction &MF = DAG.getMachineFunction(); MipsFunctionInfo *FuncInfo = MF.getInfo(); - const char* Mips16HelperFunction = 0; + const char* Mips16HelperFunction = nullptr; bool NeedMips16Helper = false; if (Subtarget->inMips16HardFloat()) { @@ -443,8 +444,8 @@ getOpndList(SmallVectorImpl &Ops, if (ExternalSymbolSDNode *S = dyn_cast(CLI.Callee)) { Mips16Libcall Find = { RTLIB::UNKNOWN_LIBCALL, S->getSymbol() }; - if (std::binary_search(HardFloatLibCalls, array_endof(HardFloatLibCalls), - Find)) + if (std::binary_search(std::begin(HardFloatLibCalls), + std::end(HardFloatLibCalls), Find)) LookupHelper = false; else { const char *Symbol = S->getSymbol(); @@ -471,13 +472,12 @@ getOpndList(SmallVectorImpl &Ops, FuncInfo->setSaveS2(); } // one more look at list of intrinsics - if (std::binary_search(Mips16IntrinsicHelper, - array_endof(Mips16IntrinsicHelper), - IntrinsicFind)) { - const Mips16IntrinsicHelperType *h =(std::find(Mips16IntrinsicHelper, - array_endof(Mips16IntrinsicHelper), - IntrinsicFind)); - Mips16HelperFunction = h->Helper; + const Mips16IntrinsicHelperType *Helper = + std::lower_bound(std::begin(Mips16IntrinsicHelper), + std::end(Mips16IntrinsicHelper), IntrinsicFind); + if (Helper != std::end(Mips16IntrinsicHelper) && + *Helper == IntrinsicFind) { + Mips16HelperFunction = Helper->Helper; NeedMips16Helper = true; LookupHelper = false; } @@ -488,13 +488,13 @@ getOpndList(SmallVectorImpl &Ops, Mips16Libcall Find = { RTLIB::UNKNOWN_LIBCALL, G->getGlobal()->getName().data() }; - if (std::binary_search(HardFloatLibCalls, array_endof(HardFloatLibCalls), - Find)) + if (std::binary_search(std::begin(HardFloatLibCalls), + std::end(HardFloatLibCalls), Find)) LookupHelper = false; } - if (LookupHelper) Mips16HelperFunction = - getMips16HelperFunction(CLI.RetTy, CLI.Args, NeedMips16Helper); - + if (LookupHelper) + Mips16HelperFunction = + getMips16HelperFunction(CLI.RetTy, CLI.getArgs(), NeedMips16Helper); } SDValue JumpTarget = Callee; diff --git a/lib/Target/Mips/Mips16ISelLowering.h b/lib/Target/Mips/Mips16ISelLowering.h index 618ec90..df88333 100644 --- a/lib/Target/Mips/Mips16ISelLowering.h +++ b/lib/Target/Mips/Mips16ISelLowering.h @@ -21,17 +21,17 @@ namespace llvm { public: explicit Mips16TargetLowering(MipsTargetMachine &TM); - virtual bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace, - bool *Fast) const; + bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace, + bool *Fast) const override; - virtual MachineBasicBlock * - 
EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const; + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const override; private: - virtual bool - isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo, - unsigned NextStackOffset, - const MipsFunctionInfo& FI) const; + bool isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo, + unsigned NextStackOffset, + const MipsFunctionInfo& FI) const override; void setMips16HardFloatLibCalls(); @@ -41,11 +41,12 @@ namespace llvm { const char *getMips16HelperFunction (Type* RetTy, ArgListTy &Args, bool &needHelper) const; - virtual void + void getOpndList(SmallVectorImpl &Ops, std::deque< std::pair > &RegsToPass, bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage, - CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const; + CallLoweringInfo &CLI, SDValue Callee, + SDValue Chain) const override; MachineBasicBlock *emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const; diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp index 43c2fbd..79607de 100644 --- a/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/lib/Target/Mips/Mips16InstrInfo.cpp @@ -29,6 +29,7 @@ using namespace llvm; +#define DEBUG_TYPE "mips16-instrinfo" Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm) : MipsInstrInfo(tm, Mips::Bimm16), diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h index e93925c..0dc0046 100644 --- a/lib/Target/Mips/Mips16InstrInfo.h +++ b/lib/Target/Mips/Mips16InstrInfo.h @@ -25,46 +25,46 @@ class Mips16InstrInfo : public MipsInstrInfo { public: explicit Mips16InstrInfo(MipsTargetMachine &TM); - virtual const MipsRegisterInfo &getRegisterInfo() const; + const MipsRegisterInfo &getRegisterInfo() const override; /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has /// any side effects other than loading from the stack slot. - virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const; + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; /// isStoreToStackSlot - If the specified machine instruction is a direct /// store to a stack slot, return the virtual or physical register number of /// the source reg along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has /// any side effects other than storing to the stack slot. 
- virtual unsigned isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const; + unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; - virtual void storeRegToStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, - int64_t Offset) const; + void storeRegToStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI, + int64_t Offset) const override; - virtual void loadRegFromStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, - int64_t Offset) const; + void loadRegFromStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI, + int64_t Offset) const override; - virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - virtual unsigned getOppositeBranchOpc(unsigned Opc) const; + unsigned getOppositeBranchOpc(unsigned Opc) const override; // Adjust SP by FrameSize bytes. Save RA, S0, S1 void makeFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB, @@ -104,9 +104,9 @@ public: (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const; unsigned getInlineAsmLength(const char *Str, - const MCAsmInfo &MAI) const; + const MCAsmInfo &MAI) const override; private: - virtual unsigned getAnalyzableBrOpc(unsigned Opc) const; + unsigned getAnalyzableBrOpc(unsigned Opc) const override; void ExpandRetRA16(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned Opc) const; diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp index 3a50ed9..dbee774 100644 --- a/lib/Target/Mips/Mips16RegisterInfo.cpp +++ b/lib/Target/Mips/Mips16RegisterInfo.cpp @@ -39,6 +39,8 @@ using namespace llvm; +#define DEBUG_TYPE "mips16-registerinfo" + Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST) : MipsRegisterInfo(ST) {} diff --git a/lib/Target/Mips/Mips16RegisterInfo.h b/lib/Target/Mips/Mips16RegisterInfo.h index 13e82a3..f59f1a7 100644 --- a/lib/Target/Mips/Mips16RegisterInfo.h +++ b/lib/Target/Mips/Mips16RegisterInfo.h @@ -23,24 +23,24 @@ class Mips16RegisterInfo : public MipsRegisterInfo { public: Mips16RegisterInfo(const MipsSubtarget &Subtarget); - bool requiresRegisterScavenging(const MachineFunction &MF) const; + bool requiresRegisterScavenging(const MachineFunction &MF) const override; - bool requiresFrameIndexScavenging(const MachineFunction &MF) const; + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; - bool useFPForScavengingIndex(const MachineFunction &MF) const; + bool useFPForScavengingIndex(const MachineFunction &MF) const override; bool saveScavengerRegister(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock::iterator &UseMI, const TargetRegisterClass *RC, - unsigned Reg) 
const; + unsigned Reg) const override; - virtual const TargetRegisterClass *intRegClass(unsigned Size) const; + const TargetRegisterClass *intRegClass(unsigned Size) const override; private: - virtual void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo, - int FrameIndex, uint64_t StackSize, - int64_t SPOffset) const; + void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo, + int FrameIndex, uint64_t StackSize, + int64_t SPOffset) const override; }; } // end namespace llvm diff --git a/lib/Target/Mips/Mips32r6InstrFormats.td b/lib/Target/Mips/Mips32r6InstrFormats.td new file mode 100644 index 0000000..a3f9df5 --- /dev/null +++ b/lib/Target/Mips/Mips32r6InstrFormats.td @@ -0,0 +1,386 @@ +//=- Mips32r6InstrFormats.td - Mips32r6 Instruction Formats -*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes Mips32r6 instruction formats. +// +//===----------------------------------------------------------------------===// + +class MipsR6Inst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, + PredicateControl { + let DecoderNamespace = "Mips32r6_64r6"; + let EncodingPredicates = [HasStdEnc]; +} + +//===----------------------------------------------------------------------===// +// +// Field Values +// +//===----------------------------------------------------------------------===// + +class OPGROUP<bits<6> Val> { + bits<6> Value = Val; +} +def OPGROUP_COP1 : OPGROUP<0b010001>; +def OPGROUP_COP2 : OPGROUP<0b010010>; +def OPGROUP_ADDI : OPGROUP<0b001000>; +def OPGROUP_AUI : OPGROUP<0b001111>; +def OPGROUP_BLEZ : OPGROUP<0b000110>; +def OPGROUP_BGTZ : OPGROUP<0b000111>; +def OPGROUP_BLEZL : OPGROUP<0b010110>; +def OPGROUP_BGTZL : OPGROUP<0b010111>; +def OPGROUP_DADDI : OPGROUP<0b011000>; +def OPGROUP_DAUI : OPGROUP<0b011101>; +def OPGROUP_PCREL : OPGROUP<0b111011>; +def OPGROUP_REGIMM : OPGROUP<0b000001>; +def OPGROUP_SPECIAL : OPGROUP<0b000000>; +def OPGROUP_SPECIAL3 : OPGROUP<0b011111>; + +class OPCODE2<bits<2> Val> { + bits<2> Value = Val; +} +def OPCODE2_ADDIUPC : OPCODE2<0b00>; +def OPCODE2_LWPC : OPCODE2<0b01>; +def OPCODE2_LWUPC : OPCODE2<0b10>; + +class OPCODE5<bits<5> Val> { + bits<5> Value = Val; +} +def OPCODE5_ALUIPC : OPCODE5<0b11111>; +def OPCODE5_AUIPC : OPCODE5<0b11110>; +def OPCODE5_DAHI : OPCODE5<0b00110>; +def OPCODE5_DATI : OPCODE5<0b11110>; +def OPCODE5_BC1EQZ : OPCODE5<0b01001>; +def OPCODE5_BC1NEZ : OPCODE5<0b01101>; +def OPCODE5_BC2EQZ : OPCODE5<0b01001>; +def OPCODE5_BC2NEZ : OPCODE5<0b01101>; + +class OPCODE6<bits<6> Val> { + bits<6> Value = Val; +} +def OPCODE6_ALIGN : OPCODE6<0b100000>; +def OPCODE6_DALIGN : OPCODE6<0b100100>; +def OPCODE6_BITSWAP : OPCODE6<0b100000>; +def OPCODE6_DBITSWAP : OPCODE6<0b100100>; + +class FIELD_FMT<bits<5> Val> { + bits<5> Value = Val; +} +def FIELD_FMT_S : FIELD_FMT<0b10000>; +def FIELD_FMT_D : FIELD_FMT<0b10001>; + +class FIELD_CMP_COND<bits<5> Val> { + bits<5> Value = Val; +} +def FIELD_CMP_COND_F : FIELD_CMP_COND<0b00000>; +def FIELD_CMP_COND_UN : FIELD_CMP_COND<0b00001>; +def FIELD_CMP_COND_EQ : FIELD_CMP_COND<0b00010>; +def FIELD_CMP_COND_UEQ : FIELD_CMP_COND<0b00011>; +def FIELD_CMP_COND_OLT : FIELD_CMP_COND<0b00100>; +def FIELD_CMP_COND_ULT : FIELD_CMP_COND<0b00101>; +def FIELD_CMP_COND_OLE : FIELD_CMP_COND<0b00110>; +def FIELD_CMP_COND_ULE : FIELD_CMP_COND<0b00111>; +def FIELD_CMP_COND_SF : FIELD_CMP_COND<0b01000>;
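The OPGROUP/OPCODEn/FIELD_* wrappers above are just named, fixed-width bit patterns that the format classes which follow splice into Inst{31-0}. A rough C++ analogue (not LLVM's encoder API) of what an AUI-shaped format ends up emitting:

#include <cstdint>

// Pack opgroup | rs | rt | imm16 the way AUI_FM lays out Inst{31-0}.
constexpr uint32_t encodeAUI(uint32_t opgroup, uint32_t rs, uint32_t rt,
                             uint32_t imm16) {
  return (opgroup & 0x3F) << 26 | (rs & 0x1F) << 21 | (rt & 0x1F) << 16 |
         (imm16 & 0xFFFF);
}

// OPGROUP_AUI is 0b001111 above; example register/immediate values.
static_assert(encodeAUI(0b001111, 1, 2, 0x1234) == 0x3C221234,
              "6/5/5/16-bit fields pack into one 32-bit word");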
+def FIELD_CMP_COND_NGLE : FIELD_CMP_COND<0b01001>; +def FIELD_CMP_COND_SEQ : FIELD_CMP_COND<0b01010>; +def FIELD_CMP_COND_NGL : FIELD_CMP_COND<0b01011>; +def FIELD_CMP_COND_LT : FIELD_CMP_COND<0b01100>; +def FIELD_CMP_COND_NGE : FIELD_CMP_COND<0b01101>; +def FIELD_CMP_COND_LE : FIELD_CMP_COND<0b01110>; +def FIELD_CMP_COND_NGT : FIELD_CMP_COND<0b01111>; + +class FIELD_CMP_FORMAT<bits<5> Val> { + bits<5> Value = Val; +} +def FIELD_CMP_FORMAT_S : FIELD_CMP_FORMAT<0b10100>; +def FIELD_CMP_FORMAT_D : FIELD_CMP_FORMAT<0b10101>; + +//===----------------------------------------------------------------------===// +// +// Disambiguators +// +//===----------------------------------------------------------------------===// +// +// Some encodings are ambiguous except by comparing field values. + +class DecodeDisambiguates<string Name> { + string DecoderMethod = !strconcat("Decode", Name); +} + +class DecodeDisambiguatedBy<string Name> : DecodeDisambiguates<Name> { + string DecoderNamespace = "Mips32r6_64r6_Ambiguous"; +} + +//===----------------------------------------------------------------------===// +// +// Encoding Formats +// +//===----------------------------------------------------------------------===// + +class AUI_FM : MipsR6Inst { + bits<5> rs; + bits<5> rt; + bits<16> imm; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_AUI.Value; + let Inst{25-21} = rs; + let Inst{20-16} = rt; + let Inst{15-0} = imm; +} + +class DAUI_FM : AUI_FM { + let Inst{31-26} = OPGROUP_DAUI.Value; +} + +class COP1_2R_FM<bits<6> funct, FIELD_FMT Format> : MipsR6Inst { + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_COP1.Value; + let Inst{25-21} = Format.Value; + let Inst{20-16} = 0b00000; + let Inst{15-11} = fs; + let Inst{10-6} = fd; + let Inst{5-0} = funct; +} + +class COP1_3R_FM<bits<6> funct, FIELD_FMT Format> : MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_COP1.Value; + let Inst{25-21} = Format.Value; + let Inst{20-16} = ft; + let Inst{15-11} = fs; + let Inst{10-6} = fd; + let Inst{5-0} = funct; +} + +class COP1_BCCZ_FM<OPCODE5 Operation> : MipsR6Inst { + bits<5> ft; + bits<16> offset; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_COP1.Value; + let Inst{25-21} = Operation.Value; + let Inst{20-16} = ft; + let Inst{15-0} = offset; +} + +class COP2_BCCZ_FM<OPCODE5 Operation> : MipsR6Inst { + bits<5> ct; + bits<16> offset; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_COP2.Value; + let Inst{25-21} = Operation.Value; + let Inst{20-16} = ct; + let Inst{15-0} = offset; +} + +class PCREL16_FM<OPCODE5 Operation> : MipsR6Inst { + bits<5> rs; + bits<16> imm; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_PCREL.Value; + let Inst{25-21} = rs; + let Inst{20-16} = Operation.Value; + let Inst{15-0} = imm; +} + +class PCREL19_FM<OPCODE2 Operation> : MipsR6Inst { + bits<5> rs; + bits<19> imm; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_PCREL.Value; + let Inst{25-21} = rs; + let Inst{20-19} = Operation.Value; + let Inst{18-0} = imm; +} + +class SPECIAL3_2R_FM<OPCODE6 Operation> : MipsR6Inst { + bits<5> rd; + bits<5> rt; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL3.Value; + let Inst{25-21} = 0b00000; + let Inst{20-16} = rt; + let Inst{15-11} = rd; + let Inst{10-6} = 0b00000; + let Inst{5-0} = Operation.Value; +} + +class SPECIAL_3R_FM<bits<5> mulop, bits<6> funct> : MipsR6Inst { + bits<5> rd; + bits<5> rs; + bits<5> rt; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL.Value; + let Inst{25-21} = rs; + let Inst{20-16} = rt; + let Inst{15-11} = rd; + let Inst{10-6} = mulop; + let Inst{5-0} = funct; +} + +// This class is ambiguous with other branches: +// BEQC/BNEC
require that rs > rt +class CMP_BRANCH_2R_OFF16_FM : MipsR6Inst { + bits<5> rs; + bits<5> rt; + bits<16> offset; + + bits<32> Inst; + + let Inst{31-26} = funct.Value; + let Inst{25-21} = rs; + let Inst{20-16} = rt; + let Inst{15-0} = offset; +} + +// This class is ambiguous with other branches: +// BLEZC/BGEZC/BEQZALC/BNEZALC/BGTZALC require that rs == 0 && rt != 0 +// The '1R_RT' in the name means 1 register in the rt field. +class CMP_BRANCH_1R_RT_OFF16_FM : MipsR6Inst { + bits<5> rt; + bits<16> offset; + + bits<32> Inst; + + let Inst{31-26} = funct.Value; + let Inst{25-21} = 0b00000; + let Inst{20-16} = rt; + let Inst{15-0} = offset; +} + +// This class is ambiguous with other branches: +// BLTZC/BGTZC/BLTZALC/BGEZALC require that rs == rt && rt != 0 +// The '1R_BOTH' in the name means 1 register in both the rs and rt fields. +class CMP_BRANCH_1R_BOTH_OFF16_FM : MipsR6Inst { + bits<5> rt; + bits<16> offset; + + bits<32> Inst; + + let Inst{31-26} = funct.Value; + let Inst{25-21} = rt; + let Inst{20-16} = rt; + let Inst{15-0} = offset; +} + +class CMP_BRANCH_OFF21_FM funct> : MipsR6Inst { + bits<5> rs; // rs != 0 + bits<21> offset; + + bits<32> Inst; + + let Inst{31-26} = funct; + let Inst{25-21} = rs; + let Inst{20-0} = offset; +} + +class JMP_IDX_COMPACT_FM funct> : MipsR6Inst { + bits<5> rt; + bits<16> offset; + + bits<32> Inst; + + let Inst{31-26} = funct; + let Inst{25-21} = 0b000000; + let Inst{20-16} = rt; + let Inst{15-0} = offset; +} + +class BRANCH_OFF26_FM funct> : MipsR6Inst { + bits<32> Inst; + bits<26> offset; + + let Inst{31-26} = funct; + let Inst{25-0} = offset; +} + +class SPECIAL3_ALIGN_FM : MipsR6Inst { + bits<5> rd; + bits<5> rs; + bits<5> rt; + bits<2> bp; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL3.Value; + let Inst{25-21} = rs; + let Inst{20-16} = rt; + let Inst{15-11} = rd; + let Inst{10-8} = 0b010; + let Inst{7-6} = bp; + let Inst{5-0} = Operation.Value; +} + +class SPECIAL3_DALIGN_FM : MipsR6Inst { + bits<5> rd; + bits<5> rs; + bits<5> rt; + bits<3> bp; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL3.Value; + let Inst{25-21} = rs; + let Inst{20-16} = rt; + let Inst{15-11} = rd; + let Inst{10-9} = 0b01; + let Inst{8-6} = bp; + let Inst{5-0} = Operation.Value; +} + +class REGIMM_FM : MipsR6Inst { + bits<5> rs; + bits<16> imm; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_REGIMM.Value; + let Inst{25-21} = rs; + let Inst{20-16} = Operation.Value; + let Inst{15-0} = imm; +} + +class COP1_CMP_CONDN_FM : MipsR6Inst { + bits<5> fd; + bits<5> fs; + bits<5> ft; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_COP1.Value; + let Inst{25-21} = Format.Value; + let Inst{20-16} = ft; + let Inst{15-11} = fs; + let Inst{10-6} = fd; + let Inst{5} = 0; + let Inst{4-0} = Cond.Value; +} + diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td new file mode 100644 index 0000000..ffaf965 --- /dev/null +++ b/lib/Target/Mips/Mips32r6InstrInfo.td @@ -0,0 +1,583 @@ +//=- Mips32r6InstrInfo.td - Mips32r6 Instruction Information -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes Mips32r6 instructions. 
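The three CMP_BRANCH_* formats above share opcode space and are told apart only by the register-field constraints quoted in their comments: rs > rt for the two-register forms, rs == 0 && rt != 0 for the 1R_RT forms, and rs == rt && rt != 0 for the 1R_BOTH forms. A sketch, not the real disassembler, of how a decoder classifies such a word by those constraints:

#include <cstdint>

enum class BranchShape { TwoReg, OneRegRT, OneRegBoth, Invalid };

BranchShape classifyCompactBranch(uint32_t Insn) {
  uint32_t Rs = (Insn >> 21) & 0x1F; // Inst{25-21}
  uint32_t Rt = (Insn >> 16) & 0x1F; // Inst{20-16}
  if (Rs == 0 && Rt != 0)
    return BranchShape::OneRegRT;   // the 1R_RT group
  if (Rs == Rt && Rt != 0)
    return BranchShape::OneRegBoth; // the 1R_BOTH group
  if (Rs > Rt)
    return BranchShape::TwoReg;     // the 2R group (BEQC/BNEC)
  return BranchShape::Invalid;      // left undecoded in this sketch
}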
+// +//===----------------------------------------------------------------------===// + +include "Mips32r6InstrFormats.td" + +// Notes about removals/changes from MIPS32r6: +// Unclear: ssnop +// Reencoded: cache, pref +// Reencoded: clo, clz +// Reencoded: jr -> jalr +// Reencoded: jr.hb -> jalr.hb +// Reencoded: ldc2 +// Reencoded: ll, sc +// Reencoded: lwc2 +// Reencoded: sdbbp +// Reencoded: sdc2 +// Reencoded: swc2 +// Removed: bc1any2, bc1any4 +// Removed: bc2[ft] +// Removed: bc2f, bc2t +// Removed: bgezal +// Removed: bltzal +// Removed: c.cond.fmt, bc1[ft] +// Removed: div, divu +// Removed: jalx +// Removed: ldxc1 +// Removed: luxc1 +// Removed: lwxc1 +// Removed: madd.[ds], nmadd.[ds], nmsub.[ds], sub.[ds] +// Removed: mfhi, mflo, mthi, mtlo, madd, maddu, msub, msubu, mul +// Removed: movf, movt +// Removed: movf.fmt, movt.fmt, movn.fmt, movz.fmt +// Removed: movn, movz +// Removed: mult, multu +// Removed: prefx +// Removed: sdxc1 +// Removed: suxc1 +// Removed: swxc1 +// Rencoded: [ls][wd]c2 + +def brtarget21 : Operand { + let EncoderMethod = "getBranchTarget21OpValue"; + let OperandType = "OPERAND_PCREL"; + let DecoderMethod = "DecodeBranchTarget21"; + let ParserMatchClass = MipsJumpTargetAsmOperand; +} + +def brtarget26 : Operand { + let EncoderMethod = "getBranchTarget26OpValue"; + let OperandType = "OPERAND_PCREL"; + let DecoderMethod = "DecodeBranchTarget26"; + let ParserMatchClass = MipsJumpTargetAsmOperand; +} + +def jmpoffset16 : Operand { + let EncoderMethod = "getJumpOffset16OpValue"; + let ParserMatchClass = MipsJumpTargetAsmOperand; +} + +def calloffset16 : Operand { + let EncoderMethod = "getJumpOffset16OpValue"; + let ParserMatchClass = MipsJumpTargetAsmOperand; +} + +//===----------------------------------------------------------------------===// +// +// Instruction Encodings +// +//===----------------------------------------------------------------------===// + +class ADDIUPC_ENC : PCREL19_FM; +class ALIGN_ENC : SPECIAL3_ALIGN_FM; +class ALUIPC_ENC : PCREL16_FM; +class AUI_ENC : AUI_FM; +class AUIPC_ENC : PCREL16_FM; + +class BALC_ENC : BRANCH_OFF26_FM<0b111010>; +class BC_ENC : BRANCH_OFF26_FM<0b110010>; +class BEQC_ENC : CMP_BRANCH_2R_OFF16_FM, + DecodeDisambiguates<"AddiGroupBranch">; +class BEQZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM, + DecodeDisambiguatedBy<"DaddiGroupBranch">; +class BNEC_ENC : CMP_BRANCH_2R_OFF16_FM, + DecodeDisambiguates<"DaddiGroupBranch">; +class BNEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM, + DecodeDisambiguatedBy<"DaddiGroupBranch">; + +class BLTZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM, + DecodeDisambiguates<"BgtzlGroupBranch">; +class BGEZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM, + DecodeDisambiguates<"BlezlGroupBranch">; +class BGTZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM, + DecodeDisambiguatedBy<"BgtzGroupBranch">; + +class BLEZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM, + DecodeDisambiguatedBy<"BlezlGroupBranch">; +class BLTZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM, + DecodeDisambiguates<"BgtzGroupBranch">; +class BGTZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM, + DecodeDisambiguatedBy<"BgtzlGroupBranch">; + +class BEQZC_ENC : CMP_BRANCH_OFF21_FM<0b110110>; +class BGEZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM; +class BNEZC_ENC : CMP_BRANCH_OFF21_FM<0b111110>; + +class BC1EQZ_ENC : COP1_BCCZ_FM; +class BC1NEZ_ENC : COP1_BCCZ_FM; +class BC2EQZ_ENC : COP2_BCCZ_FM; +class BC2NEZ_ENC : COP2_BCCZ_FM; + +class JIALC_ENC : JMP_IDX_COMPACT_FM<0b111110>; +class JIC_ENC : JMP_IDX_COMPACT_FM<0b110110>; + +class BITSWAP_ENC : SPECIAL3_2R_FM; +class BLEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM; 
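The brtarget21/brtarget26 operands above carry PC-relative offsets: the stored field is sign-extended and scaled by 4, and for the R6 compact branches the base is conventionally the instruction after the branch, so a 26-bit field reaches roughly +/-128 MiB. A hedged sketch of the decode step (not the DecodeBranchTarget26 hook itself):

#include <cstdint>

int64_t decodeBranchTarget26(uint32_t Insn, int64_t PC) {
  int64_t Off = Insn & 0x03FFFFFF; // Inst{25-0}
  if (Off & 0x02000000)            // sign bit of the 26-bit field
    Off -= 0x04000000;             // sign-extend
  return PC + 4 + Off * 4;         // word-scaled, relative to PC + 4
}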
+class BNVC_ENC : CMP_BRANCH_2R_OFF16_FM, + DecodeDisambiguatedBy<"DaddiGroupBranch">; +class BOVC_ENC : CMP_BRANCH_2R_OFF16_FM, + DecodeDisambiguatedBy<"AddiGroupBranch">; +class DIV_ENC : SPECIAL_3R_FM<0b00010, 0b011010>; +class DIVU_ENC : SPECIAL_3R_FM<0b00010, 0b011011>; +class MOD_ENC : SPECIAL_3R_FM<0b00011, 0b011010>; +class MODU_ENC : SPECIAL_3R_FM<0b00011, 0b011011>; +class MUH_ENC : SPECIAL_3R_FM<0b00011, 0b011000>; +class MUHU_ENC : SPECIAL_3R_FM<0b00011, 0b011001>; +class MUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b011000>; +class MULU_ENC : SPECIAL_3R_FM<0b00010, 0b011001>; + +class MADDF_S_ENC : COP1_3R_FM<0b011000, FIELD_FMT_S>; +class MADDF_D_ENC : COP1_3R_FM<0b011000, FIELD_FMT_D>; +class MSUBF_S_ENC : COP1_3R_FM<0b011001, FIELD_FMT_S>; +class MSUBF_D_ENC : COP1_3R_FM<0b011001, FIELD_FMT_D>; + +class SEL_D_ENC : COP1_3R_FM<0b010000, FIELD_FMT_D>; +class SEL_S_ENC : COP1_3R_FM<0b010000, FIELD_FMT_S>; + +class SELEQZ_ENC : SPECIAL_3R_FM<0b00000, 0b110101>; +class SELNEZ_ENC : SPECIAL_3R_FM<0b00000, 0b110111>; + +class LWPC_ENC : PCREL19_FM; +class LWUPC_ENC : PCREL19_FM; + +class MAX_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>; +class MAX_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>; +class MIN_S_ENC : COP1_3R_FM<0b011100, FIELD_FMT_S>; +class MIN_D_ENC : COP1_3R_FM<0b011100, FIELD_FMT_D>; + +class MAXA_S_ENC : COP1_3R_FM<0b011111, FIELD_FMT_S>; +class MAXA_D_ENC : COP1_3R_FM<0b011111, FIELD_FMT_D>; +class MINA_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>; +class MINA_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>; + +class SELEQZ_S_ENC : COP1_3R_FM<0b010100, FIELD_FMT_S>; +class SELEQZ_D_ENC : COP1_3R_FM<0b010100, FIELD_FMT_D>; +class SELNEZ_S_ENC : COP1_3R_FM<0b010111, FIELD_FMT_S>; +class SELNEZ_D_ENC : COP1_3R_FM<0b010111, FIELD_FMT_D>; + +class RINT_S_ENC : COP1_2R_FM<0b011010, FIELD_FMT_S>; +class RINT_D_ENC : COP1_2R_FM<0b011010, FIELD_FMT_D>; +class CLASS_S_ENC : COP1_2R_FM<0b011011, FIELD_FMT_S>; +class CLASS_D_ENC : COP1_2R_FM<0b011011, FIELD_FMT_D>; + +class CMP_CONDN_DESC_BASE { + dag OutOperandList = (outs FGROpnd:$fd); + dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft); + string AsmString = !strconcat("cmp.", CondStr, ".", Typestr, "\t$fd, $fs, $ft"); + list Pattern = []; +} + +//===----------------------------------------------------------------------===// +// +// Instruction Multiclasses +// +//===----------------------------------------------------------------------===// + +multiclass CMP_CC_M { + def CMP_F_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"f", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_UN_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_EQ_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_OLT_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"olt", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_ULT_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_OLE_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"ole", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_ULE_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_SF_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"sf", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_NGLE_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"ngle", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM, + 
CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_NGL_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"ngl", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_LT_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_NGE_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"nge", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_LE_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd>, + ISA_MIPS32R6; + def CMP_NGT_#NAME : COP1_CMP_CONDN_FM, + CMP_CONDN_DESC_BASE<"ngt", Typestr, FGROpnd>, + ISA_MIPS32R6; +} + +//===----------------------------------------------------------------------===// +// +// Instruction Descriptions +// +//===----------------------------------------------------------------------===// + +class PCREL19_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$rs); + dag InOperandList = (ins simm19_lsl2:$imm); + string AsmString = !strconcat(instr_asm, "\t$rs, $imm"); + list Pattern = []; +} + +class ADDIUPC_DESC : PCREL19_DESC_BASE<"addiupc", GPR32Opnd>; +class LWPC_DESC: PCREL19_DESC_BASE<"lwpc", GPR32Opnd>; +class LWUPC_DESC: PCREL19_DESC_BASE<"lwupc", GPR32Opnd>; + +class ALIGN_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$rd); + dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$bp); + string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $bp"); + list Pattern = []; +} + +class ALIGN_DESC : ALIGN_DESC_BASE<"align", GPR32Opnd, uimm2>; + +class ALUIPC_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$rs); + dag InOperandList = (ins simm16:$imm); + string AsmString = !strconcat(instr_asm, "\t$rs, $imm"); + list Pattern = []; +} + +class ALUIPC_DESC : ALUIPC_DESC_BASE<"aluipc", GPR32Opnd>; +class AUIPC_DESC : ALUIPC_DESC_BASE<"auipc", GPR32Opnd>; + +class AUI_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$rs); + dag InOperandList = (ins GPROpnd:$rt, simm16:$imm); + string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $imm"); + list Pattern = []; +} + +class AUI_DESC : AUI_DESC_BASE<"aui", GPR32Opnd>; + +class BRANCH_DESC_BASE { + bit isBranch = 1; + bit isTerminator = 1; + bit hasDelaySlot = 0; +} + +class BC_DESC_BASE : BRANCH_DESC_BASE { + dag InOperandList = (ins opnd:$offset); + dag OutOperandList = (outs); + string AsmString = !strconcat(instr_asm, "\t$offset"); + bit isBarrier = 1; +} + +class CMP_BC_DESC_BASE : BRANCH_DESC_BASE { + dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, opnd:$offset); + dag OutOperandList = (outs); + string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $offset"); + list Defs = [AT]; +} + +class CMP_CBR_EQNE_Z_DESC_BASE : BRANCH_DESC_BASE { + dag InOperandList = (ins GPROpnd:$rs, opnd:$offset); + dag OutOperandList = (outs); + string AsmString = !strconcat(instr_asm, "\t$rs, $offset"); + list Defs = [AT]; +} + +class CMP_CBR_RT_Z_DESC_BASE : BRANCH_DESC_BASE { + dag InOperandList = (ins GPROpnd:$rt, opnd:$offset); + dag OutOperandList = (outs); + string AsmString = !strconcat(instr_asm, "\t$rt, $offset"); + list Defs = [AT]; +} + +class BALC_DESC : BC_DESC_BASE<"balc", brtarget26> { + bit isCall = 1; + list Defs = [RA]; +} + +class BC_DESC : BC_DESC_BASE<"bc", brtarget26>; +class BEQC_DESC : CMP_BC_DESC_BASE<"beqc", brtarget, GPR32Opnd>; +class BNEC_DESC : CMP_BC_DESC_BASE<"bnec", brtarget, GPR32Opnd>; + +class BLTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzc", brtarget, GPR32Opnd>; +class BGEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezc", brtarget, GPR32Opnd>; + +class BLEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"blezc", brtarget, GPR32Opnd>; +class BGTZC_DESC : 
CMP_CBR_RT_Z_DESC_BASE<"bgtzc", brtarget, GPR32Opnd>; + +class BEQZC_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"beqzc", brtarget21, GPR32Opnd>; +class BNEZC_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"bnezc", brtarget21, GPR32Opnd>; + +class COP1_BCCZ_DESC_BASE : BRANCH_DESC_BASE { + dag InOperandList = (ins FGR64Opnd:$ft, brtarget:$offset); + dag OutOperandList = (outs); + string AsmString = instr_asm; + bit hasDelaySlot = 1; +} + +class BC1EQZ_DESC : COP1_BCCZ_DESC_BASE<"bc1eqz $ft, $offset">; +class BC1NEZ_DESC : COP1_BCCZ_DESC_BASE<"bc1nez $ft, $offset">; + +class COP2_BCCZ_DESC_BASE : BRANCH_DESC_BASE { + dag InOperandList = (ins COP2Opnd:$ct, brtarget:$offset); + dag OutOperandList = (outs); + string AsmString = instr_asm; + bit hasDelaySlot = 1; +} + +class BC2EQZ_DESC : COP2_BCCZ_DESC_BASE<"bc2eqz $ct, $offset">; +class BC2NEZ_DESC : COP2_BCCZ_DESC_BASE<"bc2nez $ct, $offset">; + +class BOVC_DESC : CMP_BC_DESC_BASE<"bovc", brtarget, GPR32Opnd>; +class BNVC_DESC : CMP_BC_DESC_BASE<"bnvc", brtarget, GPR32Opnd>; + +class JMP_IDX_COMPACT_DESC_BASE { + dag InOperandList = (ins GPROpnd:$rt, opnd:$offset); + string AsmString = !strconcat(opstr, "\t$rt, $offset"); + list Pattern = []; + bit isTerminator = 1; + bit hasDelaySlot = 0; + string DecoderMethod = "DecodeSimm16"; +} + +class JIALC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jialc", calloffset16, + GPR32Opnd> { + bit isCall = 1; + list Defs = [RA]; +} + +class JIC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, GPR32Opnd> { + bit isBarrier = 1; + list Defs = [AT]; +} + +class BITSWAP_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$rd); + dag InOperandList = (ins GPROpnd:$rt); + string AsmString = !strconcat(instr_asm, "\t$rd, $rt"); + list Pattern = []; +} + +class BITSWAP_DESC : BITSWAP_DESC_BASE<"bitswap", GPR32Opnd>; + +class DIVMOD_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$rd); + dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt); + string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); + list Pattern = []; +} + +class DIV_DESC : DIVMOD_DESC_BASE<"div", GPR32Opnd>; +class DIVU_DESC : DIVMOD_DESC_BASE<"divu", GPR32Opnd>; +class MOD_DESC : DIVMOD_DESC_BASE<"mod", GPR32Opnd>; +class MODU_DESC : DIVMOD_DESC_BASE<"modu", GPR32Opnd>; + +class BEQZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"beqzalc", brtarget, GPR32Opnd> { + list Defs = [RA]; +} + +class BGEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezalc", brtarget, GPR32Opnd> { + list Defs = [RA]; +} + +class BGTZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgtzalc", brtarget, GPR32Opnd> { + list Defs = [RA]; +} + +class BLEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"blezalc", brtarget, GPR32Opnd> { + list Defs = [RA]; +} + +class BLTZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzalc", brtarget, GPR32Opnd> { + list Defs = [RA]; +} + +class BNEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bnezalc", brtarget, GPR32Opnd> { + list Defs = [RA]; +} +class MUL_R6_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$rd); + dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt); + string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); + list Pattern = []; +} + +class MUH_DESC : MUL_R6_DESC_BASE<"muh", GPR32Opnd>; +class MUHU_DESC : MUL_R6_DESC_BASE<"muhu", GPR32Opnd>; +class MUL_R6_DESC : MUL_R6_DESC_BASE<"mul", GPR32Opnd>; +class MULU_DESC : MUL_R6_DESC_BASE<"mulu", GPR32Opnd>; + +class COP1_4R_DESC_BASE { + dag OutOperandList = (outs FGROpnd:$fd); + dag InOperandList = (ins FGROpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft); + string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft"); + list Pattern = []; + string Constraints = "$fd_in = $fd"; +} + +class 
SEL_D_DESC : COP1_4R_DESC_BASE<"sel.d", FGR64Opnd>; +class SEL_S_DESC : COP1_4R_DESC_BASE<"sel.s", FGR32Opnd>; + +class SELEQNE_Z_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$rd); + dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt); + string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); + list Pattern = []; +} + +class SELEQZ_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR32Opnd>; +class SELNEZ_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR32Opnd>; + +class MADDF_S_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd>; +class MADDF_D_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd>; +class MSUBF_S_DESC : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd>; +class MSUBF_D_DESC : COP1_4R_DESC_BASE<"msubf.d", FGR64Opnd>; + +class MAX_MIN_DESC_BASE { + dag OutOperandList = (outs FGROpnd:$fd); + dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft); + string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft"); + list Pattern = []; +} + +class MAX_S_DESC : MAX_MIN_DESC_BASE<"max.s", FGR32Opnd>; +class MAX_D_DESC : MAX_MIN_DESC_BASE<"max.d", FGR64Opnd>; +class MIN_S_DESC : MAX_MIN_DESC_BASE<"min.s", FGR32Opnd>; +class MIN_D_DESC : MAX_MIN_DESC_BASE<"min.d", FGR64Opnd>; + +class MAXA_S_DESC : MAX_MIN_DESC_BASE<"maxa.s", FGR32Opnd>; +class MAXA_D_DESC : MAX_MIN_DESC_BASE<"maxa.d", FGR64Opnd>; +class MINA_S_DESC : MAX_MIN_DESC_BASE<"mina.s", FGR32Opnd>; +class MINA_D_DESC : MAX_MIN_DESC_BASE<"mina.d", FGR64Opnd>; + +class SELEQNEZ_DESC_BASE { + dag OutOperandList = (outs FGROpnd:$fd); + dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft); + string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft"); + list Pattern = []; +} + +class SELEQZ_S_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd>; +class SELEQZ_D_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd>; +class SELNEZ_S_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>; +class SELNEZ_D_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>; + +class CLASS_RINT_DESC_BASE { + dag OutOperandList = (outs FGROpnd:$fd); + dag InOperandList = (ins FGROpnd:$fs); + string AsmString = !strconcat(instr_asm, "\t$fd, $fs"); + list Pattern = []; +} + +class RINT_S_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd>; +class RINT_D_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>; +class CLASS_S_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>; +class CLASS_D_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd>; + +//===----------------------------------------------------------------------===// +// +// Instruction Definitions +// +//===----------------------------------------------------------------------===// + +def ADDIUPC : ADDIUPC_ENC, ADDIUPC_DESC, ISA_MIPS32R6; +def ALIGN : ALIGN_ENC, ALIGN_DESC, ISA_MIPS32R6; +def ALUIPC : ALUIPC_ENC, ALUIPC_DESC, ISA_MIPS32R6; +def AUI : AUI_ENC, AUI_DESC, ISA_MIPS32R6; +def AUIPC : AUIPC_ENC, AUIPC_DESC, ISA_MIPS32R6; +def BALC : BALC_ENC, BALC_DESC, ISA_MIPS32R6; +def BC1EQZ : BC1EQZ_ENC, BC1EQZ_DESC, ISA_MIPS32R6; +def BC1NEZ : BC1NEZ_ENC, BC1NEZ_DESC, ISA_MIPS32R6; +def BC2EQZ : BC2EQZ_ENC, BC2EQZ_DESC, ISA_MIPS32R6; +def BC2NEZ : BC2NEZ_ENC, BC2NEZ_DESC, ISA_MIPS32R6; +def BC : BC_ENC, BC_DESC, ISA_MIPS32R6; +def BEQC : BEQC_ENC, BEQC_DESC, ISA_MIPS32R6; +def BEQZALC : BEQZALC_ENC, BEQZALC_DESC, ISA_MIPS32R6; +def BEQZC : BEQZC_ENC, BEQZC_DESC, ISA_MIPS32R6; +def BGEC; // Also aliased to blec with operands swapped +def BGEUC; // Also aliased to bleuc with operands swapped +def BGEZALC : BGEZALC_ENC, BGEZALC_DESC, ISA_MIPS32R6; +def BGEZC : BGEZC_ENC, BGEZC_DESC, ISA_MIPS32R6; +def BGTZALC : BGTZALC_ENC, BGTZALC_DESC, ISA_MIPS32R6; +def BGTZC : 
BGTZC_ENC, BGTZC_DESC, ISA_MIPS32R6; +def BITSWAP : BITSWAP_ENC, BITSWAP_DESC, ISA_MIPS32R6; +def BLEZALC : BLEZALC_ENC, BLEZALC_DESC, ISA_MIPS32R6; +def BLEZC : BLEZC_ENC, BLEZC_DESC, ISA_MIPS32R6; +def BLTC; // Also aliased to bgtc with operands swapped +def BLTUC; // Also aliased to bgtuc with operands swapped +def BLTZALC : BLTZALC_ENC, BLTZALC_DESC, ISA_MIPS32R6; +def BLTZC : BLTZC_ENC, BLTZC_DESC, ISA_MIPS32R6; +def BNEC : BNEC_ENC, BNEC_DESC, ISA_MIPS32R6; +def BNEZALC : BNEZALC_ENC, BNEZALC_DESC, ISA_MIPS32R6; +def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6; +def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6; +def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6; +def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6; +def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6; +defm S : CMP_CC_M; +defm D : CMP_CC_M; +def DIV : DIV_ENC, DIV_DESC, ISA_MIPS32R6; +def DIVU : DIVU_ENC, DIVU_DESC, ISA_MIPS32R6; +def JIALC : JIALC_ENC, JIALC_DESC, ISA_MIPS32R6; +def JIC : JIC_ENC, JIC_DESC, ISA_MIPS32R6; +// def LSA; // See MSA +def LWPC : LWPC_ENC, LWPC_DESC, ISA_MIPS32R6; +def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6; +def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6; +def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6; +def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6; +def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6; +def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6; +def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6; +def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6; +def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6; +def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6; +def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6; +def MOD : MOD_ENC, MOD_DESC, ISA_MIPS32R6; +def MODU : MODU_ENC, MODU_DESC, ISA_MIPS32R6; +def MSUBF_S : MSUBF_S_ENC, MSUBF_S_DESC, ISA_MIPS32R6; +def MSUBF_D : MSUBF_D_ENC, MSUBF_D_DESC, ISA_MIPS32R6; +def MUH : MUH_ENC, MUH_DESC, ISA_MIPS32R6; +def MUHU : MUHU_ENC, MUHU_DESC, ISA_MIPS32R6; +def MUL_R6 : MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6; +def MULU : MULU_ENC, MULU_DESC, ISA_MIPS32R6; +def NAL; // BAL with rd=0 +def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6; +def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6; +def SELEQZ : SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6; +def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6; +def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6; +def SELNEZ : SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6; +def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6; +def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6; +def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6; +def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6; diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 7115d11..924b325 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -20,6 +20,9 @@ def uimm16_64 : Operand { let PrintMethod = "printUnsignedImm"; } +// Signed Operand +def simm10_64 : Operand; + // Transformation Function - get Imm - 32. def Subtract32 : SDNodeXFormgetZExtValue() - 32); @@ -28,6 +31,11 @@ def Subtract32 : SDNodeXForm; +// Node immediate fits as 10-bit sign extended on target immediate. +// e.g. 
seqi, snei +def immSExt10_64 : PatLeaf<(i64 imm), + [{ return isInt<10>(N->getSExtValue()); }]>; + //===----------------------------------------------------------------------===// // Instructions specific format //===----------------------------------------------------------------------===// @@ -53,10 +61,11 @@ let isPseudo = 1, isCodeGenOnly = 1 in { //===----------------------------------------------------------------------===// let DecoderNamespace = "Mips64" in { /// Arithmetic Instructions (ALU Immediate) -def DADDi : ArithLogicI<"daddi", simm16_64, GPR64Opnd>, ADDI_FM<0x18>; +def DADDi : ArithLogicI<"daddi", simm16_64, GPR64Opnd>, ADDI_FM<0x18>, + ISA_MIPS3; def DADDiu : ArithLogicI<"daddiu", simm16_64, GPR64Opnd, II_DADDIU, immSExt16, add>, - ADDI_FM<0x19>, IsAsCheapAsAMove; + ADDI_FM<0x19>, IsAsCheapAsAMove, ISA_MIPS3; let isCodeGenOnly = 1 in { def SLTi64 : SetCC_I<"slti", setlt, simm16_64, immSExt16, GPR64Opnd>, @@ -73,12 +82,14 @@ def LUi64 : LoadUpper<"lui", GPR64Opnd, uimm16_64>, LUI_FM; } /// Arithmetic Instructions (3-Operand, R-Type) -def DADD : ArithLogicR<"dadd", GPR64Opnd, 1, II_DADD>, ADD_FM<0, 0x2c>; -def DADDu : ArithLogicR<"daddu", GPR64Opnd, 1, II_DADDU, add>, - ADD_FM<0, 0x2d>; -def DSUBu : ArithLogicR<"dsubu", GPR64Opnd, 0, II_DSUBU, sub>, - ADD_FM<0, 0x2f>; -def DSUB : ArithLogicR<"dsub", GPR64Opnd, 0, II_DSUB, sub>, ADD_FM<0, 0x2e>; +def DADD : ArithLogicR<"dadd", GPR64Opnd, 1, II_DADD>, ADD_FM<0, 0x2c>, + ISA_MIPS3; +def DADDu : ArithLogicR<"daddu", GPR64Opnd, 1, II_DADDU, add>, ADD_FM<0, 0x2d>, + ISA_MIPS3; +def DSUBu : ArithLogicR<"dsubu", GPR64Opnd, 0, II_DSUBU, sub>, ADD_FM<0, 0x2f>, + ISA_MIPS3; +def DSUB : ArithLogicR<"dsub", GPR64Opnd, 0, II_DSUB>, ADD_FM<0, 0x2e>, + ISA_MIPS3; let isCodeGenOnly = 1 in { def SLT64 : SetCC_R<"slt", setlt, GPR64Opnd>, ADD_FM<0, 0x2a>; @@ -91,33 +102,32 @@ def NOR64 : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>; /// Shift Instructions def DSLL : shift_rotate_imm<"dsll", uimm6, GPR64Opnd, II_DSLL, shl, immZExt6>, - SRA_FM<0x38, 0>; + SRA_FM<0x38, 0>, ISA_MIPS3; def DSRL : shift_rotate_imm<"dsrl", uimm6, GPR64Opnd, II_DSRL, srl, immZExt6>, - SRA_FM<0x3a, 0>; + SRA_FM<0x3a, 0>, ISA_MIPS3; def DSRA : shift_rotate_imm<"dsra", uimm6, GPR64Opnd, II_DSRA, sra, immZExt6>, - SRA_FM<0x3b, 0>; + SRA_FM<0x3b, 0>, ISA_MIPS3; def DSLLV : shift_rotate_reg<"dsllv", GPR64Opnd, II_DSLLV, shl>, - SRLV_FM<0x14, 0>; + SRLV_FM<0x14, 0>, ISA_MIPS3; def DSRLV : shift_rotate_reg<"dsrlv", GPR64Opnd, II_DSRLV, srl>, - SRLV_FM<0x16, 0>; + SRLV_FM<0x16, 0>, ISA_MIPS3; def DSRAV : shift_rotate_reg<"dsrav", GPR64Opnd, II_DSRAV, sra>, - SRLV_FM<0x17, 0>; + SRLV_FM<0x17, 0>, ISA_MIPS3; def DSLL32 : shift_rotate_imm<"dsll32", uimm5, GPR64Opnd, II_DSLL32>, - SRA_FM<0x3c, 0>; + SRA_FM<0x3c, 0>, ISA_MIPS3; def DSRL32 : shift_rotate_imm<"dsrl32", uimm5, GPR64Opnd, II_DSRL32>, - SRA_FM<0x3e, 0>; + SRA_FM<0x3e, 0>, ISA_MIPS3; def DSRA32 : shift_rotate_imm<"dsra32", uimm5, GPR64Opnd, II_DSRA32>, - SRA_FM<0x3f, 0>; + SRA_FM<0x3f, 0>, ISA_MIPS3; // Rotate Instructions -let Predicates = [HasMips64r2, HasStdEnc] in { - def DROTR : shift_rotate_imm<"drotr", uimm6, GPR64Opnd, II_DROTR, rotr, - immZExt6>, SRA_FM<0x3a, 1>; - def DROTRV : shift_rotate_reg<"drotrv", GPR64Opnd, II_DROTRV, rotr>, - SRLV_FM<0x16, 1>; - def DROTR32 : shift_rotate_imm<"drotr32", uimm5, GPR64Opnd, II_DROTR32>, - SRA_FM<0x3e, 1>; -} +def DROTR : shift_rotate_imm<"drotr", uimm6, GPR64Opnd, II_DROTR, rotr, + immZExt6>, + SRA_FM<0x3a, 1>, ISA_MIPS64R2; +def DROTRV : shift_rotate_reg<"drotrv", 
GPR64Opnd, II_DROTRV, rotr>, + SRLV_FM<0x16, 1>, ISA_MIPS64R2; +def DROTR32 : shift_rotate_imm<"drotr32", uimm5, GPR64Opnd, II_DROTR32>, + SRA_FM<0x3e, 1>, ISA_MIPS64R2; /// Load and Store Instructions /// aligned @@ -132,9 +142,9 @@ def SH64 : Store<"sh", GPR64Opnd, truncstorei16, II_SH>, LW_FM<0x29>; def SW64 : Store<"sw", GPR64Opnd, truncstorei32, II_SW>, LW_FM<0x2b>; } -def LWu : Load<"lwu", GPR64Opnd, zextloadi32, II_LWU>, LW_FM<0x27>; -def LD : Load<"ld", GPR64Opnd, load, II_LD>, LW_FM<0x37>; -def SD : Store<"sd", GPR64Opnd, store, II_SD>, LW_FM<0x3f>; +def LWu : Load<"lwu", GPR64Opnd, zextloadi32, II_LWU>, LW_FM<0x27>, ISA_MIPS3; +def LD : Load<"ld", GPR64Opnd, load, II_LD>, LW_FM<0x37>, ISA_MIPS3; +def SD : Store<"sd", GPR64Opnd, store, II_SD>, LW_FM<0x3f>, ISA_MIPS3; /// load/store left/right let isCodeGenOnly = 1 in { @@ -144,14 +154,18 @@ def SWL64 : StoreLeftRight<"swl", MipsSWL, GPR64Opnd, II_SWL>, LW_FM<0x2a>; def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, II_SWR>, LW_FM<0x2e>; } -def LDL : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, II_LDL>, LW_FM<0x1a>; -def LDR : LoadLeftRight<"ldr", MipsLDR, GPR64Opnd, II_LDR>, LW_FM<0x1b>; -def SDL : StoreLeftRight<"sdl", MipsSDL, GPR64Opnd, II_SDL>, LW_FM<0x2c>; -def SDR : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, II_SDR>, LW_FM<0x2d>; +def LDL : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, II_LDL>, LW_FM<0x1a>, + ISA_MIPS3_NOT_32R6_64R6; +def LDR : LoadLeftRight<"ldr", MipsLDR, GPR64Opnd, II_LDR>, LW_FM<0x1b>, + ISA_MIPS3_NOT_32R6_64R6; +def SDL : StoreLeftRight<"sdl", MipsSDL, GPR64Opnd, II_SDL>, LW_FM<0x2c>, + ISA_MIPS3_NOT_32R6_64R6; +def SDR : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, II_SDR>, LW_FM<0x2d>, + ISA_MIPS3_NOT_32R6_64R6; /// Load-linked, Store-conditional -def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>; -def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>; +def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>, ISA_MIPS3; +def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3; /// Jump and Branch Instructions let isCodeGenOnly = 1 in { @@ -169,17 +183,17 @@ def TAILCALL64_R : TailCallReg; /// Multiply and Divide Instructions. def DMULT : Mult<"dmult", II_DMULT, GPR64Opnd, [HI0_64, LO0_64]>, - MULT_FM<0, 0x1c>; + MULT_FM<0, 0x1c>, ISA_MIPS3; def DMULTu : Mult<"dmultu", II_DMULTU, GPR64Opnd, [HI0_64, LO0_64]>, - MULT_FM<0, 0x1d>; + MULT_FM<0, 0x1d>, ISA_MIPS3; def PseudoDMULT : MultDivPseudo; def PseudoDMULTu : MultDivPseudo; def DSDIV : Div<"ddiv", II_DDIV, GPR64Opnd, [HI0_64, LO0_64]>, - MULT_FM<0, 0x1e>; + MULT_FM<0, 0x1e>, ISA_MIPS3; def DUDIV : Div<"ddivu", II_DDIVU, GPR64Opnd, [HI0_64, LO0_64]>, - MULT_FM<0, 0x1f>; + MULT_FM<0, 0x1f>, ISA_MIPS3; def PseudoDSDIV : MultDivPseudo; def PseudoDUDIV : MultDivPseudo; def PseudoMTLOHI64 : PseudoMTLOHI; /// Sign Ext In Register Instructions. 
-def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd, II_SEB>, SEB_FM<0x10, 0x20>; -def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd, II_SEH>, SEB_FM<0x18, 0x20>; +def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd, II_SEB>, SEB_FM<0x10, 0x20>, + ISA_MIPS32R2; +def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd, II_SEH>, SEB_FM<0x18, 0x20>, + ISA_MIPS32R2; } /// Count Leading -def DCLZ : CountLeading0<"dclz", GPR64Opnd>, CLO_FM<0x24>; -def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>; +def DCLZ : CountLeading0<"dclz", GPR64Opnd>, CLO_FM<0x24>, ISA_MIPS64; +def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>, ISA_MIPS64; /// Double Word Swap Bytes/HalfWords -def DSBH : SubwordSwap<"dsbh", GPR64Opnd>, SEB_FM<2, 0x24>; -def DSHD : SubwordSwap<"dshd", GPR64Opnd>, SEB_FM<5, 0x24>; +def DSBH : SubwordSwap<"dsbh", GPR64Opnd>, SEB_FM<2, 0x24>, ISA_MIPS64R2; +def DSHD : SubwordSwap<"dshd", GPR64Opnd>, SEB_FM<5, 0x24>, ISA_MIPS64R2; def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>; @@ -229,8 +245,19 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in { "sll\t$rd, $rt, 0", [], II_SLL>; } +// We need the following pseudo instruction to avoid offset calculation for +// long branches. See the comment in file MipsLongBranch.cpp for detailed +// explanation. + +// Expands to: daddiu $dst, $src, %PART($tgt - $baltgt) +// where %PART may be %hi or %lo, depending on the relocation kind +// that $tgt is annotated with. +def LONG_BRANCH_DADDiu : PseudoSE<(outs GPR64Opnd:$dst), + (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>; + // Cavium Octeon cmMIPS instructions -let Predicates = [HasCnMips] in { +let EncodingPredicates = [], // FIXME: The lack of HasStdEnc is probably a bug + AdditionalPredicates = [HasCnMips] in { class Count1s: InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"), @@ -254,6 +281,14 @@ class SetCC64_R : let TwoOperandAliasConstraint = "$rd = $rs"; } +class SetCC64_I: + InstSE<(outs GPR64Opnd:$rt), (ins GPR64Opnd:$rs, simm10_64:$imm10), + !strconcat(opstr, "\t$rt, $rs, $imm10"), + [(set GPR64Opnd:$rt, (cond_op GPR64Opnd:$rs, immSExt10_64:$imm10))], + II_SEQI_SNEI, FrmI, opstr> { + let TwoOperandAliasConstraint = "$rt = $rs"; +} + // Unsigned Byte Add let Pattern = [(set GPR64Opnd:$rd, (and (add GPR64Opnd:$rs, GPR64Opnd:$rt), 255))] in @@ -287,7 +322,25 @@ def DPOP : Count1s<"dpop", GPR64Opnd>, POP_FM<0x2d>; // Set on equal/not equal def SEQ : SetCC64_R<"seq", seteq>, SEQ_FM<0x2a>; +def SEQi : SetCC64_I<"seqi", seteq>, SEQI_FM<0x2e>; def SNE : SetCC64_R<"sne", setne>, SEQ_FM<0x2b>; +def SNEi : SetCC64_I<"snei", setne>, SEQI_FM<0x2f>; + +// 192-bit x 64-bit Unsigned Multiply and Add +let Defs = [P0, P1, P2] in +def V3MULU: ArithLogicR<"v3mulu", GPR64Opnd, 0, II_DMUL>, + ADD_FM<0x1c, 0x11>; + +// 64-bit Unsigned Multiply and Add Move +let Defs = [MPL0, P0, P1, P2] in +def VMM0 : ArithLogicR<"vmm0", GPR64Opnd, 0, II_DMUL>, + ADD_FM<0x1c, 0x10>; + +// 64-bit Unsigned Multiply and Add +let Defs = [MPL1, MPL2, P0, P1, P2] in +def VMULU : ArithLogicR<"vmulu", GPR64Opnd, 0, II_DMUL>, + ADD_FM<0x1c, 0x0f>; + } } @@ -297,12 +350,10 @@ def SNE : SetCC64_R<"sne", setne>, SEQ_FM<0x2b>; //===----------------------------------------------------------------------===// // extended loads -let Predicates = [HasStdEnc] in { - def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>; - def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>; - def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>; - def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 
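The immSExt10_64 leaf above, feeding the simm10_64 operand of SetCC64_I, gates seqi/snei on immediates that survive 10-bit sign extension; LLVM's isInt<10> performs the test. A standalone equivalent of that check:

#include <cstdint>

template <unsigned N> constexpr bool fitsSignedBits(int64_t X) {
  // Representable as an N-bit two's-complement immediate?
  return X >= -(INT64_C(1) << (N - 1)) && X < (INT64_C(1) << (N - 1));
}

static_assert(fitsSignedBits<10>(-512) && fitsSignedBits<10>(511),
              "10-bit immediates span [-512, 511]");
static_assert(!fitsSignedBits<10>(512), "512 needs 11 bits");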
addr:$src)>; -} +def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>; +def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>; +def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>; +def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>; // hi/lo relocs def : MipsPat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>; @@ -355,8 +406,7 @@ defm : SetgeImmPats; // truncate def : MipsPat<(i32 (trunc GPR64:$src)), - (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>, - Requires<[HasStdEnc]>; + (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>; // 32-to-64-bit extension def : MipsPat<(i64 (anyext GPR32:$src)), (SLL64_32 GPR32:$src)>; @@ -373,64 +423,59 @@ def : MipsPat<(bswap GPR64:$rt), (DSHD (DSBH GPR64:$rt))>; //===----------------------------------------------------------------------===// // Instruction aliases //===----------------------------------------------------------------------===// -def : InstAlias<"move $dst, $src", - (DADDu GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>, - Requires<[HasMips64]>; -def : InstAlias<"daddu $rs, $rt, $imm", - (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm), - 0>; -def : InstAlias<"dadd $rs, $rt, $imm", - (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm), - 0>; -def : InstAlias<"daddu $rs, $imm", - (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm), - 0>; -def : InstAlias<"dadd $rs, $imm", - (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm), - 0>; -def : InstAlias<"add $rs, $imm", - (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), - 0>; -def : InstAlias<"addu $rs, $imm", - (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), - 0>; -let isPseudo=1, usesCustomInserter=1, isCodeGenOnly=1 in { -def SUBi : MipsInst<(outs GPR32Opnd: $rt), (ins GPR32Opnd: $rs, simm16: $imm), - "sub\t$rt, $rs, $imm", [], II_DSUB, Pseudo>; -def SUBiu : MipsInst<(outs GPR32Opnd: $rt), (ins GPR32Opnd: $rs, simm16: $imm), - "subu\t$rt, $rs, $imm", [], II_DSUB, Pseudo>; -def DSUBi : MipsInst<(outs GPR64Opnd: $rt), (ins GPR64Opnd: $rs, simm16_64: $imm), - "ssub\t$rt, $rs, $imm", [], II_DSUB, Pseudo>; -def DSUBiu : MipsInst<(outs GPR64Opnd: $rt), (ins GPR64Opnd: $rs, simm16_64: $imm), - "ssubu\t$rt, $rs, $imm", [], II_DSUB, Pseudo>; -} -def : InstAlias<"dsubu $rt, $rs, $imm", - (DSUBiu GPR64Opnd:$rt, GPR64Opnd:$rs, simm16_64: $imm), - 0>; -def : InstAlias<"sub $rs, $imm", - (SUBi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), - 0>; -def : InstAlias<"subu $rs, $imm", - (SUBiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), - 0>; -def : InstAlias<"dsub $rs, $imm", - (DSUBi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm), - 0>; -def : InstAlias<"dsubu $rs, $imm", - (DSUBiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm), - 0>; +def : MipsInstAlias<"move $dst, $src", + (DADDu GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>, + GPR_64; +def : MipsInstAlias<"daddu $rs, $rt, $imm", + (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm), + 0>; +def : MipsInstAlias<"dadd $rs, $rt, $imm", + (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm), + 0>; +def : MipsInstAlias<"daddu $rs, $imm", + (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm), + 0>; +def : MipsInstAlias<"dadd $rs, $imm", + (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm), + 0>; +def : MipsInstAlias<"add $rs, $imm", + (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), + 0>; +def : MipsInstAlias<"addu $rs, $imm", + (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), + 0>; +def : MipsInstAlias<"dsll $rd, $rt, $rs", + (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>, + ISA_MIPS3; +def : 
MipsInstAlias<"dsubu $rt, $rs, $imm", + (DADDiu GPR64Opnd:$rt, GPR64Opnd:$rs, + InvertedImOperand64:$imm), 0>; +def : MipsInstAlias<"dsub $rs, $imm", + (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, + InvertedImOperand64:$imm), + 0>; +def : MipsInstAlias<"dsubu $rs, $imm", + (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, + InvertedImOperand64:$imm), + 0>; +def : MipsInstAlias<"dsra $rd, $rt, $rs", + (DSRAV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>, + ISA_MIPS3; +def : MipsInstAlias<"dsrl $rd, $rt, $rs", + (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>, + ISA_MIPS3; /// Move between CPU and coprocessor registers let DecoderNamespace = "Mips64", Predicates = [HasMips64] in { def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd>, MFC3OP_FM<0x10, 1>; -def DMTC0 : MFC3OP<"dmtc0", GPR64Opnd>, MFC3OP_FM<0x10, 5>; -def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd>, MFC3OP_FM<0x12, 1>; -def DMTC2 : MFC3OP<"dmtc2", GPR64Opnd>, MFC3OP_FM<0x12, 5>; +def DMTC0 : MFC3OP<"dmtc0", GPR64Opnd>, MFC3OP_FM<0x10, 5>, ISA_MIPS3; +def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd>, MFC3OP_FM<0x12, 1>, ISA_MIPS3; +def DMTC2 : MFC3OP<"dmtc2", GPR64Opnd>, MFC3OP_FM<0x12, 5>, ISA_MIPS3; } // Two operand (implicit 0 selector) versions: -def : InstAlias<"dmfc0 $rt, $rd", (DMFC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>; -def : InstAlias<"dmtc0 $rt, $rd", (DMTC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>; -def : InstAlias<"dmfc2 $rt, $rd", (DMFC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>; -def : InstAlias<"dmtc2 $rt, $rd", (DMTC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>; +def : MipsInstAlias<"dmfc0 $rt, $rd", (DMFC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>; +def : MipsInstAlias<"dmtc0 $rt, $rd", (DMTC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>; +def : MipsInstAlias<"dmfc2 $rt, $rd", (DMFC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>; +def : MipsInstAlias<"dmtc2 $rt, $rd", (DMTC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>; diff --git a/lib/Target/Mips/Mips64r6InstrInfo.td b/lib/Target/Mips/Mips64r6InstrInfo.td new file mode 100644 index 0000000..f971218 --- /dev/null +++ b/lib/Target/Mips/Mips64r6InstrInfo.td @@ -0,0 +1,88 @@ +//=- Mips64r6InstrInfo.td - Mips64r6 Instruction Information -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes Mips64r6 instructions. 
+// +//===----------------------------------------------------------------------===// + +// Notes about removals/changes from MIPS32r6: +// Reencoded: dclo, dclz +// Reencoded: lld, scd +// Removed: daddi +// Removed: ddiv, ddivu, dmult, dmultu +// Removed: div, divu + +//===----------------------------------------------------------------------===// +// +// Instruction Encodings +// +//===----------------------------------------------------------------------===// + +class DALIGN_ENC : SPECIAL3_DALIGN_FM; +class DAUI_ENC : DAUI_FM; +class DAHI_ENC : REGIMM_FM; +class DATI_ENC : REGIMM_FM; +class DBITSWAP_ENC : SPECIAL3_2R_FM; +class DDIV_ENC : SPECIAL_3R_FM<0b00010, 0b011110>; +class DDIVU_ENC : SPECIAL_3R_FM<0b00010, 0b011111>; +class DMOD_ENC : SPECIAL_3R_FM<0b00011, 0b011110>; +class DMODU_ENC : SPECIAL_3R_FM<0b00011, 0b011111>; +class DMUH_ENC : SPECIAL_3R_FM<0b00011, 0b111000>; +class DMUHU_ENC : SPECIAL_3R_FM<0b00011, 0b111001>; +class DMUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b111000>; +class DMULU_ENC : SPECIAL_3R_FM<0b00010, 0b111001>; + +//===----------------------------------------------------------------------===// +// +// Instruction Descriptions +// +//===----------------------------------------------------------------------===// + +class AHI_ATI_DESC_BASE { + dag OutOperandList = (outs GPROpnd:$rs); + dag InOperandList = (ins GPROpnd:$rt, simm16:$imm); + string AsmString = !strconcat(instr_asm, "\t$rt, $imm"); + string Constraints = "$rs = $rt"; +} + +class DALIGN_DESC : ALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3>; +class DAHI_DESC : AHI_ATI_DESC_BASE<"dahi", GPR64Opnd>; +class DATI_DESC : AHI_ATI_DESC_BASE<"dati", GPR64Opnd>; +class DAUI_DESC : AUI_DESC_BASE<"daui", GPR64Opnd>; +class DBITSWAP_DESC : BITSWAP_DESC_BASE<"dbitswap", GPR64Opnd>; +class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd>; +class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd>; +class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd>; +class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd>; +class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd>; +class DMUHU_DESC : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd>; +class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd>; +class DMULU_DESC : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd>; + +//===----------------------------------------------------------------------===// +// +// Instruction Definitions +// +//===----------------------------------------------------------------------===// + +def DAHI : DAHI_ENC, DAHI_DESC, ISA_MIPS64R6; +def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6; +def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6; +def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6; +def DBITSWAP : DBITSWAP_ENC, DBITSWAP_DESC, ISA_MIPS64R6; +def DDIV : DDIV_ENC, DDIV_DESC, ISA_MIPS64R6; +def DDIVU : DDIVU_ENC, DDIVU_DESC, ISA_MIPS64R6; +// def DLSA; // See MSA +def DMOD : DMOD_ENC, DMOD_DESC, ISA_MIPS64R6; +def DMODU : DMODU_ENC, DMODU_DESC, ISA_MIPS64R6; +def DMUH: DMUH_ENC, DMUH_DESC, ISA_MIPS64R6; +def DMUHU: DMUHU_ENC, DMUHU_DESC, ISA_MIPS64R6; +def DMUL_R6: DMUL_R6_ENC, DMUL_R6_DESC, ISA_MIPS64R6; +def DMULU: DMULU_ENC, DMULU_DESC, ISA_MIPS64R6; +def LDPC; diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index d5df855..6df90aa 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mips-asm-printer" #include "InstPrinter/MipsInstPrinter.h" #include "MCTargetDesc/MipsBaseInfo.h" #include 
"MCTargetDesc/MipsMCNaCl.h" @@ -52,6 +51,8 @@ using namespace llvm; +#define DEBUG_TYPE "mips-asm-printer" + MipsTargetStreamer &MipsAsmPrinter::getTargetStreamer() { return static_cast(*OutStreamer.getTargetStreamer()); } @@ -147,7 +148,8 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { // removing another test for this situation downstream in the // callchain. // - if (I->isPseudo() && !Subtarget->inMips16Mode()) + if (I->isPseudo() && !Subtarget->inMips16Mode() + && !isLongBranchPseudo(I->getOpcode())) llvm_unreachable("Pseudo opcode found in EmitInstruction()"); MCInst TmpInst0; @@ -285,9 +287,8 @@ void MipsAsmPrinter::EmitFunctionEntryLabel() { if (Subtarget->inMicroMipsMode()) TS.emitDirectiveSetMicroMips(); - // leave out until FSF available gas has micromips changes - // else - // TS.emitDirectiveSetNoMicroMips(); + else + TS.emitDirectiveSetNoMicroMips(); if (Subtarget->inMips16Mode()) TS.emitDirectiveSetMips16(); @@ -621,16 +622,29 @@ printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O, void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { // TODO: Need to add -mabicalls and -mno-abicalls flags. // Currently we assume that -mabicalls is the default. - getTargetStreamer().emitDirectiveAbiCalls(); - Reloc::Model RM = Subtarget->getRelocationModel(); - if (RM == Reloc::Static && !Subtarget->hasMips64()) - getTargetStreamer().emitDirectiveOptionPic0(); + bool IsABICalls = true; + if (IsABICalls) { + getTargetStreamer().emitDirectiveAbiCalls(); + Reloc::Model RM = Subtarget->getRelocationModel(); + // FIXME: This condition should be a lot more complicated that it is here. + // Ideally it should test for properties of the ABI and not the ABI + // itself. + // For the moment, I'm only correcting enough to make MIPS-IV work. + if (RM == Reloc::Static && !Subtarget->isABI_N64()) + getTargetStreamer().emitDirectiveOptionPic0(); + } // Tell the assembler which ABI we are using std::string SectionName = std::string(".mdebug.") + getCurrentABIString(); OutStreamer.SwitchSection(OutContext.getELFSection( SectionName, ELF::SHT_PROGBITS, 0, SectionKind::getDataRel())); + // NaN: At the moment we only support: + // 1. .nan legacy (default) + // 2. .nan 2008 + Subtarget->isNaN2008() ? getTargetStreamer().emitDirectiveNaN2008() + : getTargetStreamer().emitDirectiveNaNLegacy(); + // TODO: handle O64 ABI if (Subtarget->isABI_EABI()) { @@ -824,7 +838,7 @@ void MipsAsmPrinter::EmitFPCallStub( const MCSectionELF *M = OutContext.getELFSection( ".mips16.call.fp." + std::string(Symbol), ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_EXECINSTR, SectionKind::getText()); - OutStreamer.SwitchSection(M, 0); + OutStreamer.SwitchSection(M, nullptr); // // .align 2 // @@ -941,6 +955,12 @@ void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) { } } +bool MipsAsmPrinter::isLongBranchPseudo(int Opcode) const { + return (Opcode == Mips::LONG_BRANCH_LUi + || Opcode == Mips::LONG_BRANCH_ADDiu + || Opcode == Mips::LONG_BRANCH_DADDiu); +} + // Force static initialization. 
extern "C" void LLVMInitializeMipsAsmPrinter() { RegisterAsmPrinter X(TheMipsTarget); diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h index 3e9093e..e82b145 100644 --- a/lib/Target/Mips/MipsAsmPrinter.h +++ b/lib/Target/Mips/MipsAsmPrinter.h @@ -75,6 +75,8 @@ private: void NaClAlignIndirectJumpTargets(MachineFunction &MF); + bool isLongBranchPseudo(int Opcode) const; + public: const MipsSubtarget *Subtarget; @@ -82,18 +84,18 @@ public: MipsMCInstLower MCInstLowering; explicit MipsAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), MCP(0), InConstantPool(false), + : AsmPrinter(TM, Streamer), MCP(nullptr), InConstantPool(false), MCInstLowering(*this) { Subtarget = &TM.getSubtarget(); } - virtual const char *getPassName() const { + const char *getPassName() const override { return "Mips Assembly Printer"; } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - virtual void EmitConstantPool() override { + void EmitConstantPool() override { bool UsingConstantPools = (Subtarget->inMips16Mode() && Subtarget->useConstantIslands()); if (!UsingConstantPools) @@ -101,30 +103,30 @@ public: // we emit constant pools customly! } - void EmitInstruction(const MachineInstr *MI); + void EmitInstruction(const MachineInstr *MI) override; void printSavedRegsBitmask(); void emitFrameDirective(); const char *getCurrentABIString() const; - virtual void EmitFunctionEntryLabel(); - virtual void EmitFunctionBodyStart(); - virtual void EmitFunctionBodyEnd(); - virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock* - MBB) const; + void EmitFunctionEntryLabel() override; + void EmitFunctionBodyStart() override; + void EmitFunctionBodyEnd() override; + bool isBlockOnlyReachableByFallthrough( + const MachineBasicBlock* MBB) const override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O); + raw_ostream &O) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O); + raw_ostream &O) override; void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O); void printUnsignedImm(const MachineInstr *MI, int opNum, raw_ostream &O); void printUnsignedImm8(const MachineInstr *MI, int opNum, raw_ostream &O); void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O); void printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O); void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O, - const char *Modifier = 0); - void EmitStartOfAsmFile(Module &M); - void EmitEndOfAsmFile(Module &M); + const char *Modifier = nullptr); + void EmitStartOfAsmFile(Module &M) override; + void EmitEndOfAsmFile(Module &M) override; void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); }; } diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td index 615310f..c83d880 100644 --- a/lib/Target/Mips/MipsCallingConv.td +++ b/lib/Target/Mips/MipsCallingConv.td @@ -245,8 +245,8 @@ def CSR_O32 : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP, def CSR_O32_FP64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 20), RA, FP, (sequence "S%u", 7, 0))>; -def CSR_N32 : CalleeSavedRegs<(add D31_64, D29_64, D27_64, D25_64, D24_64, - D23_64, D22_64, D21_64, RA_64, FP_64, GP_64, +def CSR_N32 : CalleeSavedRegs<(add D20_64, D22_64, D24_64, D26_64, D28_64, + D30_64, RA_64, FP_64, GP_64, 
(sequence "S%u_64", 7, 0))>; def CSR_N64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 24), RA_64, FP_64, diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp index ea49086..13fa546 100644 --- a/lib/Target/Mips/MipsCodeEmitter.cpp +++ b/lib/Target/Mips/MipsCodeEmitter.cpp @@ -12,7 +12,6 @@ // //===---------------------------------------------------------------------===// -#define DEBUG_TYPE "jit" #include "Mips.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "MipsInstrInfo.h" @@ -41,6 +40,8 @@ using namespace llvm; +#define DEBUG_TYPE "jit" + STATISTIC(NumEmitted, "Number of machine instructions emitted"); namespace { @@ -56,7 +57,7 @@ class MipsCodeEmitter : public MachineFunctionPass { const std::vector *MJTEs; bool IsPIC; - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired (); MachineFunctionPass::getAnalysisUsage(AU); } @@ -65,13 +66,13 @@ class MipsCodeEmitter : public MachineFunctionPass { public: MipsCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce) - : MachineFunctionPass(ID), JTI(0), II(0), TD(0), - TM(tm), MCE(mce), MCPEs(0), MJTEs(0), + : MachineFunctionPass(ID), JTI(nullptr), II(nullptr), TD(nullptr), + TM(tm), MCE(mce), MCPEs(nullptr), MJTEs(nullptr), IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} - bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "Mips Machine Code Emitter"; } @@ -109,6 +110,12 @@ private: unsigned getBranchTargetOpValueMM(const MachineInstr &MI, unsigned OpNo) const; + unsigned getBranchTarget21OpValue(const MachineInstr &MI, + unsigned OpNo) const; + unsigned getBranchTarget26OpValue(const MachineInstr &MI, + unsigned OpNo) const; + unsigned getJumpOffset16OpValue(const MachineInstr &MI, unsigned OpNo) const; + unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned OpNo) const; unsigned getMemEncoding(const MachineInstr &MI, unsigned OpNo) const; unsigned getMemEncodingMMImm12(const MachineInstr &MI, unsigned OpNo) const; @@ -116,6 +123,7 @@ private: unsigned getSizeExtEncoding(const MachineInstr &MI, unsigned OpNo) const; unsigned getSizeInsEncoding(const MachineInstr &MI, unsigned OpNo) const; unsigned getLSAImmEncoding(const MachineInstr &MI, unsigned OpNo) const; + unsigned getSimm19Lsl2Encoding(const MachineInstr &MI, unsigned OpNo) const; /// Expand pseudo instructions with accumulator register operands. 
void expandACCInstr(MachineBasicBlock::instr_iterator MI, @@ -138,7 +146,7 @@ bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) { TD = Target.getDataLayout(); Subtarget = &TM.getSubtarget (); MCPEs = &MF.getConstantPool()->getConstants(); - MJTEs = 0; + MJTEs = nullptr; if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables(); JTI->Initialize(MF, IsPIC, Subtarget->isLittle()); MCE.setModuleInfo(&getAnalysis ()); @@ -201,6 +209,24 @@ unsigned MipsCodeEmitter::getBranchTargetOpValueMM(const MachineInstr &MI, return 0; } +unsigned MipsCodeEmitter::getBranchTarget21OpValue(const MachineInstr &MI, + unsigned OpNo) const { + llvm_unreachable("Unimplemented function."); + return 0; +} + +unsigned MipsCodeEmitter::getBranchTarget26OpValue(const MachineInstr &MI, + unsigned OpNo) const { + llvm_unreachable("Unimplemented function."); + return 0; +} + +unsigned MipsCodeEmitter::getJumpOffset16OpValue(const MachineInstr &MI, + unsigned OpNo) const { + llvm_unreachable("Unimplemented function."); + return 0; +} + unsigned MipsCodeEmitter::getBranchTargetOpValue(const MachineInstr &MI, unsigned OpNo) const { MachineOperand MO = MI.getOperand(OpNo); @@ -247,6 +273,12 @@ unsigned MipsCodeEmitter::getLSAImmEncoding(const MachineInstr &MI, return 0; } +unsigned MipsCodeEmitter::getSimm19Lsl2Encoding(const MachineInstr &MI, + unsigned OpNo) const { + llvm_unreachable("Unimplemented function."); + return 0; +} + /// getMachineOpValue - Return binary encoding of operand. If the machine /// operand requires relocation, record the relocation and return zero. unsigned MipsCodeEmitter::getMachineOpValue(const MachineInstr &MI, diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td index 567eef9..7177f65 100644 --- a/lib/Target/Mips/MipsCondMov.td +++ b/lib/Target/Mips/MipsCondMov.td @@ -104,9 +104,9 @@ multiclass MovnPats, - ADD_FM<0, 0xa>; + ADD_FM<0, 0xa>, INSN_MIPS4_32; -let Predicates = [HasStdEnc], isCodeGenOnly = 1 in { +let isCodeGenOnly = 1 in { def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>, ADD_FM<0, 0xa>; def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>, @@ -116,9 +116,9 @@ let Predicates = [HasStdEnc], isCodeGenOnly = 1 in { } def MOVN_I_I : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>, - ADD_FM<0, 0xb>; + ADD_FM<0, 0xb>, INSN_MIPS4_32; -let Predicates = [HasStdEnc], isCodeGenOnly = 1 in { +let isCodeGenOnly = 1 in { def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>, ADD_FM<0, 0xb>; def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>, @@ -128,118 +128,112 @@ let Predicates = [HasStdEnc], isCodeGenOnly = 1 in { } def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>, - CMov_I_F_FM<18, 16>; + CMov_I_F_FM<18, 16>, INSN_MIPS4_32; let isCodeGenOnly = 1 in def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, II_MOVZ_S>, - CMov_I_F_FM<18, 16>, Requires<[HasMips64, HasStdEnc]>; + CMov_I_F_FM<18, 16>, AdditionalRequires<[HasMips64]>; def MOVN_I_S : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, II_MOVN_S>, - CMov_I_F_FM<19, 16>; + CMov_I_F_FM<19, 16>, INSN_MIPS4_32; let isCodeGenOnly = 1 in def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, II_MOVN_S>, - CMov_I_F_FM<19, 16>, Requires<[HasMips64, HasStdEnc]>; + CMov_I_F_FM<19, 16>, AdditionalRequires<[IsGP64bit]>; -let Predicates = [NotFP64bit, HasStdEnc] in { - def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd, - II_MOVZ_D>, CMov_I_F_FM<18, 17>; - def 
MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd, - II_MOVN_D>, CMov_I_F_FM<19, 17>; -} +def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd, + II_MOVZ_D>, CMov_I_F_FM<18, 17>, + INSN_MIPS4_32, FGR_32; +def MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd, + II_MOVN_D>, CMov_I_F_FM<19, 17>, + INSN_MIPS4_32, FGR_32; -let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in { +let DecoderNamespace = "Mips64" in { def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, II_MOVZ_D>, - CMov_I_F_FM<18, 17>; + CMov_I_F_FM<18, 17>, INSN_MIPS4_32, FGR_64; def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, II_MOVN_D>, - CMov_I_F_FM<19, 17>; + CMov_I_F_FM<19, 17>, INSN_MIPS4_32, FGR_64; let isCodeGenOnly = 1 in { def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd, - II_MOVZ_D>, CMov_I_F_FM<18, 17>; + II_MOVZ_D>, CMov_I_F_FM<18, 17>, FGR_64; def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd, - II_MOVN_D>, CMov_I_F_FM<19, 17>; + II_MOVN_D>, CMov_I_F_FM<19, 17>, FGR_64; } } def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>, - CMov_F_I_FM<1>; + CMov_F_I_FM<1>, INSN_MIPS4_32; let isCodeGenOnly = 1 in def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, II_MOVT, MipsCMovFP_T>, - CMov_F_I_FM<1>, Requires<[HasMips64, HasStdEnc]>; + CMov_F_I_FM<1>, AdditionalRequires<[IsGP64bit]>; def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>, - CMov_F_I_FM<0>; + CMov_F_I_FM<0>, INSN_MIPS4_32; let isCodeGenOnly = 1 in def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, II_MOVF, MipsCMovFP_F>, - CMov_F_I_FM<0>, Requires<[HasMips64, HasStdEnc]>; + CMov_F_I_FM<0>, AdditionalRequires<[IsGP64bit]>; def MOVT_S : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S, MipsCMovFP_T>, - CMov_F_F_FM<16, 1>; + CMov_F_F_FM<16, 1>, INSN_MIPS4_32; def MOVF_S : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S, MipsCMovFP_F>, - CMov_F_F_FM<16, 0>; + CMov_F_F_FM<16, 0>, INSN_MIPS4_32; -let Predicates = [NotFP64bit, HasStdEnc] in { - def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D, - MipsCMovFP_T>, CMov_F_F_FM<17, 1>; - def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D, - MipsCMovFP_F>, CMov_F_F_FM<17, 0>; -} +def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D, + MipsCMovFP_T>, CMov_F_F_FM<17, 1>, + INSN_MIPS4_32, FGR_32; +def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D, + MipsCMovFP_F>, CMov_F_F_FM<17, 0>, + INSN_MIPS4_32, FGR_32; -let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in { +let DecoderNamespace = "Mips64" in { def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, II_MOVT_D, MipsCMovFP_T>, - CMov_F_F_FM<17, 1>; + CMov_F_F_FM<17, 1>, INSN_MIPS4_32, FGR_64; def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, II_MOVF_D, MipsCMovFP_F>, - CMov_F_F_FM<17, 0>; + CMov_F_F_FM<17, 0>, INSN_MIPS4_32, FGR_64; } // Instantiation of conditional move patterns. 
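// Note: the "let Predicates = [...] in" wrappers removed in this file are
// replaced by adjective classes (INSN_MIPS4_32, GPR_64, FGR_32, FGR_64) and
// by AdditionalRequires<[...]>, which append to an instruction's predicate
// list rather than overwriting it, so the per-instruction predicates compose
// with the standard-encoding predicate that the old lists used to restate.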
defm : MovzPats0; defm : MovzPats1; defm : MovzPats2; -let Predicates = [HasMips64, HasStdEnc] in { - defm : MovzPats0; - defm : MovzPats0; - defm : MovzPats0; - defm : MovzPats1; - defm : MovzPats1; - defm : MovzPats1; - defm : MovzPats2; - defm : MovzPats2; - defm : MovzPats2; -} + +defm : MovzPats0, GPR_64; +defm : MovzPats0, + GPR_64; +defm : MovzPats0, + GPR_64; +defm : MovzPats1, GPR_64; +defm : MovzPats1, GPR_64; +defm : MovzPats1, GPR_64; +defm : MovzPats2, GPR_64; +defm : MovzPats2, GPR_64; +defm : MovzPats2, GPR_64; defm : MovnPats; -let Predicates = [HasMips64, HasStdEnc] in { - defm : MovnPats; - defm : MovnPats; - defm : MovnPats; -} + +defm : MovnPats, GPR_64; +defm : MovnPats, GPR_64; +defm : MovnPats, GPR_64; defm : MovzPats0; defm : MovzPats1; defm : MovnPats; -let Predicates = [HasMips64, HasStdEnc] in { - defm : MovzPats0; - defm : MovzPats1; - defm : MovnPats; -} -let Predicates = [NotFP64bit, HasStdEnc] in { - defm : MovzPats0; - defm : MovzPats1; - defm : MovnPats; -} -let Predicates = [IsFP64bit, HasStdEnc] in { - defm : MovzPats0; - defm : MovzPats0; - defm : MovzPats1; - defm : MovzPats1; - defm : MovnPats; - defm : MovnPats; -} +defm : MovzPats0, + GPR_64; +defm : MovzPats1, GPR_64; +defm : MovnPats, GPR_64; + +defm : MovzPats0, FGR_32; +defm : MovzPats1, FGR_32; +defm : MovnPats, FGR_32; + +defm : MovzPats0, FGR_64; +defm : MovzPats0, + FGR_64; +defm : MovzPats1, FGR_64; +defm : MovzPats1, FGR_64; +defm : MovnPats, FGR_64; +defm : MovnPats, FGR_64; diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp index e5642ba..a37062f 100644 --- a/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -21,8 +21,6 @@ // // -#define DEBUG_TYPE "mips-constant-islands" - #include "Mips.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "Mips16InstrInfo.h" @@ -47,6 +45,8 @@ using namespace llvm; +#define DEBUG_TYPE "mips-constant-islands" + STATISTIC(NumCPEs, "Number of constpool entries"); STATISTIC(NumSplit, "Number of uncond branches inserted"); STATISTIC(NumCBrFixed, "Number of cond branches fixed"); @@ -368,14 +368,14 @@ namespace { : MachineFunctionPass(ID), TM(tm), IsPIC(TM.getRelocationModel() == Reloc::PIC_), ABI(TM.getSubtarget().getTargetABI()), - STI(&TM.getSubtarget()), MF(0), MCP(0), + STI(&TM.getSubtarget()), MF(nullptr), MCP(nullptr), PrescannedForConstants(false){} - virtual const char *getPassName() const { + const char *getPassName() const override { return "Mips Constant Islands"; } - bool runOnMachineFunction(MachineFunction &F); + bool runOnMachineFunction(MachineFunction &F) override; void doInitialPlacement(std::vector &CPEMIs); CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI); @@ -628,7 +628,7 @@ MipsConstantIslands::CPEntry if (CPEs[i].CPEMI == CPEMI) return &CPEs[i]; } - return NULL; + return nullptr; } /// getCPELogAlign - Returns the required alignment of the constant pool entry @@ -1065,7 +1065,7 @@ bool MipsConstantIslands::decrementCPEReferenceCount(unsigned CPI, assert(CPE && "Unexpected!"); if (--CPE->RefCount == 0) { removeDeadCPEMI(CPEMI); - CPE->CPEMI = NULL; + CPE->CPEMI = nullptr; --NumCPEs; return true; } @@ -1098,7 +1098,7 @@ int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) if (CPEs[i].CPEMI == CPEMI) continue; // Removing CPEs can leave empty entries, skip - if (CPEs[i].CPEMI == NULL) + if (CPEs[i].CPEMI == nullptr) continue; if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(), U.NegOk)) 
{ @@ -1154,7 +1154,7 @@ int MipsConstantIslands::findLongFormInRangeCPEntry if (CPEs[i].CPEMI == CPEMI) continue; // Removing CPEs can leave empty entries, skip - if (CPEs[i].CPEMI == NULL) + if (CPEs[i].CPEMI == nullptr) continue; if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getLongFormMaxDisp(), U.NegOk)) { @@ -1486,7 +1486,7 @@ bool MipsConstantIslands::removeUnusedCPEntries() { for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) { if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) { removeDeadCPEMI(CPEs[j].CPEMI); - CPEs[j].CPEMI = NULL; + CPEs[j].CPEMI = nullptr; MadeChange = true; } } diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index eef9f38..d6c7cac 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -11,8 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "delay-slot-filler" - #include "MCTargetDesc/MipsMCNaCl.h" #include "Mips.h" #include "MipsInstrInfo.h" @@ -33,6 +31,8 @@ using namespace llvm; +#define DEBUG_TYPE "delay-slot-filler" + STATISTIC(FilledSlots, "Number of delay slots filled"); STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that" " are not NOP."); @@ -124,7 +124,7 @@ namespace { public: NoMemInstr() : InspectMemInstr(true) {} private: - virtual bool hasHazard_(const MachineInstr &MI) { return true; } + bool hasHazard_(const MachineInstr &MI) override { return true; } }; /// This subclass accepts loads from stacks and constant loads. @@ -132,7 +132,7 @@ namespace { public: LoadFromStackOrConst() : InspectMemInstr(false) {} private: - virtual bool hasHazard_(const MachineInstr &MI); + bool hasHazard_(const MachineInstr &MI) override; }; /// This subclass uses memory dependence information to determine whether a @@ -142,19 +142,21 @@ namespace { MemDefsUses(const MachineFrameInfo *MFI); private: - virtual bool hasHazard_(const MachineInstr &MI); + typedef PointerUnion ValueType; + + bool hasHazard_(const MachineInstr &MI) override; /// Update Defs and Uses. Return true if there exist dependences that /// disqualify the delay slot candidate between V and values in Uses and /// Defs. - bool updateDefsUses(const Value *V, bool MayStore); + bool updateDefsUses(ValueType V, bool MayStore); /// Get the list of underlying objects of MI's memory operand. bool getUnderlyingObjects(const MachineInstr &MI, - SmallVectorImpl &Objects) const; + SmallVectorImpl &Objects) const; const MachineFrameInfo *MFI; - SmallPtrSet Uses, Defs; + SmallPtrSet Uses, Defs; /// Flags indicating whether loads or stores with no underlying objects have /// been seen. 
@@ -166,11 +168,11 @@ namespace { Filler(TargetMachine &tm) : MachineFunctionPass(ID), TM(tm) { } - virtual const char *getPassName() const { + const char *getPassName() const override { return "Mips Delay Slot Filler"; } - bool runOnMachineFunction(MachineFunction &F) { + bool runOnMachineFunction(MachineFunction &F) override { bool Changed = false; for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) @@ -178,7 +180,7 @@ namespace { return Changed; } - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -399,16 +401,15 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) { if (MI.mayStore()) return true; - if (!MI.hasOneMemOperand() || !(*MI.memoperands_begin())->getValue()) + if (!MI.hasOneMemOperand() || !(*MI.memoperands_begin())->getPseudoValue()) return true; - const Value *V = (*MI.memoperands_begin())->getValue(); - - if (isa(V)) - return false; - - if (const PseudoSourceValue *PSV = dyn_cast(V)) - return !PSV->isConstant(0) && V != PseudoSourceValue::getStack(); + if (const PseudoSourceValue *PSV = + (*MI.memoperands_begin())->getPseudoValue()) { + if (isa(PSV)) + return false; + return !PSV->isConstant(nullptr) && PSV != PseudoSourceValue::getStack(); + } return true; } @@ -419,11 +420,11 @@ MemDefsUses::MemDefsUses(const MachineFrameInfo *MFI_) bool MemDefsUses::hasHazard_(const MachineInstr &MI) { bool HasHazard = false; - SmallVector Objs; + SmallVector Objs; // Check underlying object list. if (getUnderlyingObjects(MI, Objs)) { - for (SmallVectorImpl::const_iterator I = Objs.begin(); + for (SmallVectorImpl::const_iterator I = Objs.begin(); I != Objs.end(); ++I) HasHazard |= updateDefsUses(*I, MI.mayStore()); @@ -440,7 +441,7 @@ bool MemDefsUses::hasHazard_(const MachineInstr &MI) { return HasHazard; } -bool MemDefsUses::updateDefsUses(const Value *V, bool MayStore) { +bool MemDefsUses::updateDefsUses(ValueType V, bool MayStore) { if (MayStore) return !Defs.insert(V) || Uses.count(V) || SeenNoObjStore || SeenNoObjLoad; @@ -450,10 +451,20 @@ bool MemDefsUses::updateDefsUses(const Value *V, bool MayStore) { bool MemDefsUses:: getUnderlyingObjects(const MachineInstr &MI, - SmallVectorImpl &Objects) const { - if (!MI.hasOneMemOperand() || !(*MI.memoperands_begin())->getValue()) + SmallVectorImpl &Objects) const { + if (!MI.hasOneMemOperand() || + (!(*MI.memoperands_begin())->getValue() && + !(*MI.memoperands_begin())->getPseudoValue())) return false; + if (const PseudoSourceValue *PSV = + (*MI.memoperands_begin())->getPseudoValue()) { + if (!PSV->isAliased(MFI)) + return false; + Objects.push_back(PSV); + return true; + } + const Value *V = (*MI.memoperands_begin())->getValue(); SmallVector Objs; @@ -461,10 +472,7 @@ getUnderlyingObjects(const MachineInstr &MI, for (SmallVectorImpl::iterator I = Objs.begin(), E = Objs.end(); I != E; ++I) { - if (const PseudoSourceValue *PSV = dyn_cast(*I)) { - if (PSV->isAliased(MFI)) - return false; - } else if (!isIdentifiedObject(V)) + if (!isIdentifiedObject(V)) return false; Objects.push_back(*I); @@ -602,7 +610,7 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const { RegDefsUses RegDU(TM); bool HasMultipleSuccs = false; BB2BrMap BrMap; - OwningPtr IM; + std::unique_ptr IM; Iter Filler; // Iterate over SuccBB's predecessor list. 
@@ -636,7 +644,7 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
 
 MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
   if (B.succ_empty())
-    return NULL;
+    return nullptr;
 
   // Select the successor with the largest edge weight.
   auto &Prob = getAnalysis<MachineBranchProbabilityInfo>();
@@ -645,14 +653,14 @@ MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
                        const MachineBasicBlock *Dst1) {
       return Prob.getEdgeWeight(&B, Dst0) < Prob.getEdgeWeight(&B, Dst1);
     });
-  return S->isLandingPad() ? NULL : S;
+  return S->isLandingPad() ? nullptr : S;
 }
 
 std::pair<MipsInstrInfo::BranchType, MachineInstr *>
 Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
   const MipsInstrInfo *TII =
       static_cast<const MipsInstrInfo *>(TM.getInstrInfo());
-  MachineBasicBlock *TrueBB = 0, *FalseBB = 0;
+  MachineBasicBlock *TrueBB = nullptr, *FalseBB = nullptr;
   SmallVector<MachineInstr *, 2> BranchInstrs;
   SmallVector<MachineOperand, 2> Cond;
@@ -660,11 +668,11 @@ Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
     TII->AnalyzeBranch(MBB, TrueBB, FalseBB, Cond, false, BranchInstrs);
 
   if ((R == MipsInstrInfo::BT_None) || (R == MipsInstrInfo::BT_NoBranch))
-    return std::make_pair(R, (MachineInstr*)NULL);
+    return std::make_pair(R, nullptr);
 
   if (R != MipsInstrInfo::BT_CondUncond) {
     if (!hasUnoccupiedSlot(BranchInstrs[0]))
-      return std::make_pair(MipsInstrInfo::BT_None, (MachineInstr*)NULL);
+      return std::make_pair(MipsInstrInfo::BT_None, nullptr);
 
     assert(((R != MipsInstrInfo::BT_Uncond) || (TrueBB == &Dst)));
 
@@ -681,7 +689,7 @@ Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
   if (hasUnoccupiedSlot(BranchInstrs[1]) && (FalseBB == &Dst))
     return std::make_pair(MipsInstrInfo::BT_Uncond, BranchInstrs[1]);
 
-  return std::make_pair(MipsInstrInfo::BT_None, (MachineInstr*)NULL);
+  return std::make_pair(MipsInstrInfo::BT_None, nullptr);
 }
 
 bool Filler::examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
new file mode 100644
index 0000000..268a0ed
--- /dev/null
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -0,0 +1,283 @@
+//===-- MipsFastISel.cpp - Mips FastISel implementation ------------------===//
+
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "MipsRegisterInfo.h"
+#include "MipsISelLowering.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+
+using namespace llvm;
+
+namespace {
+
+// All possible address modes.
+typedef struct Address {
+  enum { RegBase, FrameIndexBase } BaseType;
+
+  union {
+    unsigned Reg;
+    int FI;
+  } Base;
+
+  int64_t Offset;
+
+  // Innocuous defaults for our address.
+  Address() : BaseType(RegBase), Offset(0) { Base.Reg = 0; }
+} Address;
+
+class MipsFastISel final : public FastISel {
+
+  /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const MipsSubtarget *Subtarget;
+  Module &M;
+  const TargetMachine &TM;
+  const TargetInstrInfo &TII;
+  const TargetLowering &TLI;
+  MipsFunctionInfo *MFI;
+
+  // Convenience variables to avoid some queries.
+  LLVMContext *Context;
+
+  bool TargetSupported;
+
+public:
+  explicit MipsFastISel(FunctionLoweringInfo &funcInfo,
+                        const TargetLibraryInfo *libInfo)
+      : FastISel(funcInfo, libInfo),
+        M(const_cast<Module &>(*funcInfo.Fn->getParent())),
+        TM(funcInfo.MF->getTarget()), TII(*TM.getInstrInfo()),
+        TLI(*TM.getTargetLowering()) {
+    Subtarget = &TM.getSubtarget<MipsSubtarget>();
+    MFI = funcInfo.MF->getInfo<MipsFunctionInfo>();
+    Context = &funcInfo.Fn->getContext();
+    TargetSupported = ((Subtarget->getRelocationModel() == Reloc::PIC_) &&
+                       (Subtarget->hasMips32r2() && (Subtarget->isABI_O32())));
+  }
+
+  bool TargetSelectInstruction(const Instruction *I) override;
+  unsigned TargetMaterializeConstant(const Constant *C) override;
+
+  bool ComputeAddress(const Value *Obj, Address &Addr);
+
+private:
+  bool EmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                 unsigned Alignment = 0);
+  bool SelectRet(const Instruction *I);
+  bool SelectStore(const Instruction *I);
+
+  bool isTypeLegal(Type *Ty, MVT &VT);
+  bool isLoadTypeLegal(Type *Ty, MVT &VT);
+
+  unsigned MaterializeFP(const ConstantFP *CFP, MVT VT);
+  unsigned MaterializeGV(const GlobalValue *GV, MVT VT);
+  unsigned MaterializeInt(const Constant *C, MVT VT);
+  unsigned Materialize32BitInt(int64_t Imm, const TargetRegisterClass *RC);
+};
+
+bool MipsFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT evt = TLI.getValueType(Ty, true);
+  // Only handle simple types.
+  if (evt == MVT::Other || !evt.isSimple())
+    return false;
+  VT = evt.getSimpleVT();
+
+  // Handle all legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+bool MipsFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT))
+    return true;
+  // We will extend this in a later patch:
+  // If this is a type that can be sign or zero-extended to a basic operation
+  // go ahead and accept it now.
+  return false;
+}
+
+bool MipsFastISel::ComputeAddress(const Value *Obj, Address &Addr) {
+  // This construct looks a bit awkward but it is how other ports handle this
+  // and as this function is more fully completed, these cases which
+  // return false will have additional code in them.
+  //
+  if (isa<Instruction>(Obj))
+    return false;
+  else if (isa<ConstantExpr>(Obj))
+    return false;
+  Addr.Base.Reg = getRegForValue(Obj);
+  return Addr.Base.Reg != 0;
+}
+
+// Materialize a constant into a register, and return the register
+// number (or zero if we failed to handle it).
+unsigned MipsFastISel::TargetMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple())
+    return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return MaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return MaterializeGV(GV, VT);
+  else if (isa<ConstantInt>(C))
+    return MaterializeInt(C, VT);
+
+  return 0;
+}
+
+bool MipsFastISel::EmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                             unsigned Alignment) {
+  //
+  // more cases will be handled here in following patches.
+  //
+  if (VT != MVT::i32)
+    return false;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::SW))
+      .addReg(SrcReg)
+      .addReg(Addr.Base.Reg)
+      .addImm(Addr.Offset);
+  return true;
+}
+
+bool MipsFastISel::SelectStore(const Instruction *I) {
+  Value *Op0 = I->getOperand(0);
+  unsigned SrcReg = 0;
+
+  // Atomic stores need special handling.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
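+  // (Only MVT::i32 makes it through these checks at present: isLoadTypeLegal
+  // above defers to isTypeLegal, and EmitStore rejects any other VT, so
+  // sub-word and 64-bit stores still fall back to SelectionDAG.)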
+  MVT VT;
+  if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0)
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ComputeAddress(I->getOperand(1), Addr))
+    return false;
+
+  if (!EmitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlignment()))
+    return false;
+  return true;
+}
+
+bool MipsFastISel::SelectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+  if (Ret->getNumOperands() > 0) {
+    return false;
+  }
+  unsigned RetOpc = Mips::RetRA;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(RetOpc));
+  return true;
+}
+
+bool MipsFastISel::TargetSelectInstruction(const Instruction *I) {
+  if (!TargetSupported)
+    return false;
+  switch (I->getOpcode()) {
+  default:
+    break;
+  case Instruction::Store:
+    return SelectStore(I);
+  case Instruction::Ret:
+    return SelectRet(I);
+  }
+  return false;
+}
+}
+
+unsigned MipsFastISel::MaterializeFP(const ConstantFP *CFP, MVT VT) {
+  return 0;
+}
+
+unsigned MipsFastISel::MaterializeGV(const GlobalValue *GV, MVT VT) {
+  // For now 32-bit only.
+  if (VT != MVT::i32)
+    return 0;
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  unsigned DestReg = createResultReg(RC);
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  bool IsThreadLocal = GVar && GVar->isThreadLocal();
+  // TLS not supported at this time.
+  if (IsThreadLocal)
+    return 0;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LW), DestReg)
+      .addReg(MFI->getGlobalBaseReg())
+      .addGlobalAddress(GV, 0, MipsII::MO_GOT);
+  return DestReg;
+}
+unsigned MipsFastISel::MaterializeInt(const Constant *C, MVT VT) {
+  if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1)
+    return 0;
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  const ConstantInt *CI = cast<ConstantInt>(C);
+  int64_t Imm;
+  if (CI->isNegative())
+    Imm = CI->getSExtValue();
+  else
+    Imm = CI->getZExtValue();
+  return Materialize32BitInt(Imm, RC);
+}
+
+unsigned MipsFastISel::Materialize32BitInt(int64_t Imm,
+                                           const TargetRegisterClass *RC) {
+  unsigned ResultReg = createResultReg(RC);
+
+  if (isInt<16>(Imm)) {
+    unsigned Opc = Mips::ADDiu;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+        .addReg(Mips::ZERO)
+        .addImm(Imm);
+    return ResultReg;
+  } else if (isUInt<16>(Imm)) {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::ORi),
+            ResultReg)
+        .addReg(Mips::ZERO)
+        .addImm(Imm);
+    return ResultReg;
+  }
+  unsigned Lo = Imm & 0xFFFF;
+  unsigned Hi = (Imm >> 16) & 0xFFFF;
+  if (Lo) {
+    // Both Lo and Hi have nonzero bits.
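+    // (Illustrative: Imm = 0x12345678 gives Hi = 0x1234 and Lo = 0x5678 and
+    // emits "lui TmpReg, 0x1234" followed by "ori ResultReg, TmpReg, 0x5678";
+    // when Lo is zero, the single "lui" in the else branch below suffices.)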
+ unsigned TmpReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LUi), + TmpReg).addImm(Hi); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::ORi), + ResultReg) + .addReg(TmpReg) + .addImm(Lo); + + } else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LUi), + ResultReg).addImm(Hi); + } + return ResultReg; +} + +namespace llvm { +FastISel *Mips::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) { + return new MipsFastISel(funcInfo, libInfo); +} +} diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp index eb9d49f..8ba35fa 100644 --- a/lib/Target/Mips/MipsFrameLowering.cpp +++ b/lib/Target/Mips/MipsFrameLowering.cpp @@ -110,7 +110,7 @@ uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const { Offset = std::max(Offset, -MFI->getObjectOffset(I)); // Conservatively assume all callee-saved registers will be saved. - for (const uint16_t *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) { + for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) { unsigned Size = TRI.getMinimalPhysRegClass(*R)->getSize(); Offset = RoundUpToAlignment(Offset + Size, Size); } diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h index 6a5f79d..e10a3a5 100644 --- a/lib/Target/Mips/MipsFrameLowering.h +++ b/lib/Target/Mips/MipsFrameLowering.h @@ -32,7 +32,7 @@ public: static const MipsFrameLowering *create(MipsTargetMachine &TM, const MipsSubtarget &ST); - bool hasFP(const MachineFunction &MF) const; + bool hasFP(const MachineFunction &MF) const override; protected: uint64_t estimateStackSize(const MachineFunction &MF) const; diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 941aeac..90cff63 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mips-isel" #include "MipsISelDAGToDAG.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "Mips.h" @@ -36,6 +35,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "mips-isel" + //===----------------------------------------------------------------------===// // Instruction Selector Implementation //===----------------------------------------------------------------------===// @@ -182,7 +183,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { if (Node->isMachineOpcode()) { DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); Node->setNodeId(-1); - return NULL; + return nullptr; } // See if subclasses can handle this node. 
@@ -201,8 +202,9 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { #ifndef NDEBUG case ISD::LOAD: case ISD::STORE: - assert(cast(Node)->getMemoryVT().getSizeInBits() / 8 <= - cast(Node)->getAlignment() && + assert((Subtarget.systemSupportsUnalignedAccess() || + cast(Node)->getMemoryVT().getSizeInBits() / 8 <= + cast(Node)->getAlignment()) && "Unexpected unaligned loads/stores."); break; #endif @@ -212,7 +214,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) { SDNode *ResNode = SelectCode(Node); DEBUG(errs() << "=> "); - if (ResNode == NULL || ResNode == Node) + if (ResNode == nullptr || ResNode == Node) DEBUG(Node->dump(CurDAG)); else DEBUG(ResNode->dump(CurDAG)); diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h index 4546182..13becb6 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.h +++ b/lib/Target/Mips/MipsISelDAGToDAG.h @@ -35,11 +35,11 @@ public: : SelectionDAGISel(TM), Subtarget(TM.getSubtarget()) {} // Pass Name - virtual const char *getPassName() const { + const char *getPassName() const override { return "MIPS DAG->DAG Pattern Instruction Selection"; } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; protected: SDNode *getGlobalBaseReg(); @@ -110,7 +110,7 @@ private: /// starting at bit zero. virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const; - virtual SDNode *Select(SDNode *N); + SDNode *Select(SDNode *N) override; virtual std::pair selectNode(SDNode *Node) = 0; @@ -121,9 +121,9 @@ private: virtual void processFunctionAfterISel(MachineFunction &MF) = 0; - virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, - std::vector &OutOps); + bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector &OutOps) override; }; /// createMipsISelDag - This pass converts a legalized DAG into a diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index abf36da..bfe5ea1 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -11,7 +11,6 @@ // selection DAG. 
// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mips-lower" #include "MipsISelLowering.h" #include "InstPrinter/MipsInstPrinter.h" #include "MCTargetDesc/MipsBaseInfo.h" @@ -39,6 +38,8 @@ using namespace llvm; +#define DEBUG_TYPE "mips-lower" + STATISTIC(NumTailCalls, "Number of tail calls"); static cl::opt @@ -50,16 +51,21 @@ NoZeroDivCheck("mno-check-zero-division", cl::Hidden, cl::desc("MIPS: Don't trap on integer division by zero."), cl::init(false)); -static const uint16_t O32IntRegs[4] = { +cl::opt +EnableMipsFastISel("mips-fast-isel", cl::Hidden, + cl::desc("Allow mips-fast-isel to be used"), + cl::init(false)); + +static const MCPhysReg O32IntRegs[4] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 }; -static const uint16_t Mips64IntRegs[8] = { +static const MCPhysReg Mips64IntRegs[8] = { Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64, Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64 }; -static const uint16_t Mips64DPRegs[8] = { +static const MCPhysReg Mips64DPRegs[8] = { Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64, Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64 }; @@ -198,7 +204,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { case MipsISD::PCKEV: return "MipsISD::PCKEV"; case MipsISD::PCKOD: return "MipsISD::PCKOD"; case MipsISD::INSVE: return "MipsISD::INSVE"; - default: return NULL; + default: return nullptr; } } @@ -245,12 +251,7 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); - if (!TM.Options.NoNaNsFPMath) { - setOperationAction(ISD::FABS, MVT::f32, Custom); - setOperationAction(ISD::FABS, MVT::f64, Custom); - } - - if (hasMips64()) { + if (isGP64bit()) { setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::BlockAddress, MVT::i64, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); @@ -262,14 +263,14 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); } - if (!hasMips64()) { + if (!isGP64bit()) { setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } setOperationAction(ISD::ADD, MVT::i32, Custom); - if (hasMips64()) + if (isGP64bit()) setOperationAction(ISD::ADD, MVT::i64, Custom); setOperationAction(ISD::SDIV, MVT::i32, Expand); @@ -334,11 +335,6 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::FREM, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); - if (!TM.Options.NoNaNsFPMath) { - setOperationAction(ISD::FNEG, MVT::f32, Expand); - setOperationAction(ISD::FNEG, MVT::f64, Expand); - } - setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::Other, Expand); @@ -356,22 +352,23 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setInsertFencesForAtomic(true); - if (!Subtarget->hasSEInReg()) { + if (!Subtarget->hasMips32r2()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); } - if (!Subtarget->hasBitCount()) { + // MIPS16 lacks MIPS32's clz and clo instructions. 
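+  // (With CTLZ marked Expand, legalization substitutes a generic software
+  // sequence: roughly, smear the leading set bit with x |= x >> 1,
+  // x |= x >> 2, ... x |= x >> 16, then count the ones in ~x.)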
+ if (!Subtarget->hasMips32() || Subtarget->inMips16Mode()) setOperationAction(ISD::CTLZ, MVT::i32, Expand); + if (!Subtarget->hasMips64()) setOperationAction(ISD::CTLZ, MVT::i64, Expand); - } - if (!Subtarget->hasSwap()) { + if (!Subtarget->hasMips32r2()) setOperationAction(ISD::BSWAP, MVT::i32, Expand); + if (!Subtarget->hasMips64r2()) setOperationAction(ISD::BSWAP, MVT::i64, Expand); - } - if (hasMips64()) { + if (isGP64bit()) { setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::i32, Custom); @@ -387,7 +384,7 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM) setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::ADD); - setMinFunctionAlignment(hasMips64() ? 3 : 2); + setMinFunctionAlignment(isGP64bit() ? 3 : 2); setStackPointerRegisterToSaveRestore(isN64() ? Mips::SP_64 : Mips::SP); @@ -406,6 +403,15 @@ const MipsTargetLowering *MipsTargetLowering::create(MipsTargetMachine &TM) { return llvm::createMipsSETargetLowering(TM); } +// Create a fast isel object. +FastISel * +MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const { + if (!EnableMipsFastISel) + return TargetLowering::createFastISel(funcInfo, libInfo); + return Mips::createFastISel(funcInfo, libInfo); +} + EVT MipsTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) return MVT::i32; @@ -779,7 +785,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::SETCC: return lowerSETCC(Op, DAG); case ISD::VASTART: return lowerVASTART(Op, DAG); case ISD::FCOPYSIGN: return lowerFCOPYSIGN(Op, DAG); - case ISD::FABS: return lowerFABS(Op, DAG); case ISD::FRAMEADDR: return lowerFRAMEADDR(Op, DAG); case ISD::RETURNADDR: return lowerRETURNADDR(Op, DAG); case ISD::EH_RETURN: return lowerEH_RETURN(Op, DAG); @@ -1506,7 +1511,7 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op, SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0, MipsII::MO_GPREL); SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, DL, - DAG.getVTList(MVT::i32), &GA, 1); + DAG.getVTList(MVT::i32), GA); SDValue GPReg = DAG.getRegister(Mips::GP, MVT::i32); return DAG.getNode(ISD::ADD, DL, MVT::i32, GPReg, GPRelNode); } @@ -1572,11 +1577,9 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const Entry.Ty = PtrTy; Args.push_back(Entry); - TargetLowering::CallLoweringInfo CLI(DAG.getEntryNode(), PtrTy, - false, false, false, false, 0, CallingConv::C, - /*IsTailCall=*/false, /*doesNotRet=*/false, - /*isReturnValueUsed=*/true, - TlsGetAddr, Args, DAG, DL); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL).setChain(DAG.getEntryNode()) + .setCallee(CallingConv::C, PtrTy, TlsGetAddr, &Args, 0); std::pair CallResult = LowerCallTo(CLI); SDValue Ret = CallResult.first; @@ -1765,71 +1768,12 @@ static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, SDValue MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { - if (Subtarget->hasMips64()) + if (Subtarget->isGP64bit()) return lowerFCOPYSIGN64(Op, DAG, Subtarget->hasExtractInsert()); return lowerFCOPYSIGN32(Op, DAG, Subtarget->hasExtractInsert()); } -static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, - bool HasExtractInsert) { - SDValue Res, Const1 = DAG.getConstant(1, MVT::i32); - SDLoc DL(Op); - - // If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it - // to i32. - SDValue X = (Op.getValueType() == MVT::f32) ? 
- DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(0)) : - DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0), - Const1); - - // Clear MSB. - if (HasExtractInsert) - Res = DAG.getNode(MipsISD::Ins, DL, MVT::i32, - DAG.getRegister(Mips::ZERO, MVT::i32), - DAG.getConstant(31, MVT::i32), Const1, X); - else { - SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i32, X, Const1); - Res = DAG.getNode(ISD::SRL, DL, MVT::i32, SllX, Const1); - } - - if (Op.getValueType() == MVT::f32) - return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Res); - - SDValue LowX = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, - Op.getOperand(0), DAG.getConstant(0, MVT::i32)); - return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res); -} - -static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, - bool HasExtractInsert) { - SDValue Res, Const1 = DAG.getConstant(1, MVT::i32); - SDLoc DL(Op); - - // Bitcast to integer node. - SDValue X = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(0)); - - // Clear MSB. - if (HasExtractInsert) - Res = DAG.getNode(MipsISD::Ins, DL, MVT::i64, - DAG.getRegister(Mips::ZERO_64, MVT::i64), - DAG.getConstant(63, MVT::i32), Const1, X); - else { - SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i64, X, Const1); - Res = DAG.getNode(ISD::SRL, DL, MVT::i64, SllX, Const1); - } - - return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Res); -} - -SDValue -MipsTargetLowering::lowerFABS(SDValue Op, SelectionDAG &DAG) const { - if (Subtarget->hasMips64() && (Op.getValueType() == MVT::f64)) - return lowerFABS64(Op, DAG, Subtarget->hasExtractInsert()); - - return lowerFABS32(Op, DAG, Subtarget->hasExtractInsert()); -} - SDValue MipsTargetLowering:: lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // check the depth @@ -1931,7 +1875,7 @@ SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op, Hi = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, ShiftLeftLo, Or); SDValue Ops[2] = {Lo, Hi}; - return DAG.getMergeValues(Ops, 2, DL); + return DAG.getMergeValues(Ops, DL); } SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, @@ -1972,7 +1916,7 @@ SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, ShiftRightHi); SDValue Ops[2] = {Lo, Hi}; - return DAG.getMergeValues(Ops, 2, DL); + return DAG.getMergeValues(Ops, DL); } static SDValue createLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD, @@ -1988,7 +1932,7 @@ static SDValue createLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD, DAG.getConstant(Offset, BasePtrVT)); SDValue Ops[] = { Chain, Ptr, Src }; - return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, 3, MemVT, + return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, MemVT, LD->getMemOperand()); } @@ -1997,6 +1941,9 @@ SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const { LoadSDNode *LD = cast(Op); EVT MemVT = LD->getMemoryVT(); + if (Subtarget->systemSupportsUnalignedAccess()) + return Op; + // Return if load is aligned or if MemVT is neither i32 nor i64. 
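   // (Unaligned i32/i64 loads that reach the code below are expanded into
   // MipsISD::LWL/LWR, or LDL/LDR, pairs; e.g. a little-endian unaligned i32
   // load at address A becomes roughly "lwr $t, 0(A); lwl $t, 3(A)".
   // Subtargets where systemSupportsUnalignedAccess() holds can use plain
   // lw/ld, which is why they bail out early above.)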
if ((LD->getAlignment() >= MemVT.getSizeInBits() / 8) || ((MemVT != MVT::i32) && (MemVT != MVT::i64))) @@ -2051,7 +1998,7 @@ SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue SLL = DAG.getNode(ISD::SHL, DL, MVT::i64, LWR, Const32); SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i64, SLL, Const32); SDValue Ops[] = { SRL, LWR.getValue(1) }; - return DAG.getMergeValues(Ops, 2, DL); + return DAG.getMergeValues(Ops, DL); } static SDValue createStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD, @@ -2066,7 +2013,7 @@ static SDValue createStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD, DAG.getConstant(Offset, BasePtrVT)); SDValue Ops[] = { Chain, Value, Ptr }; - return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, 3, MemVT, + return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, MemVT, SD->getMemOperand()); } @@ -2120,7 +2067,8 @@ SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const { EVT MemVT = SD->getMemoryVT(); // Lower unaligned integer stores. - if ((SD->getAlignment() < MemVT.getSizeInBits() / 8) && + if (!Subtarget->systemSupportsUnalignedAccess() && + (SD->getAlignment() < MemVT.getSizeInBits() / 8) && ((MemVT == MVT::i32) || (MemVT == MVT::i64))) return lowerUnalignedIntStore(SD, DAG, Subtarget->isLittle()); @@ -2177,12 +2125,12 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op, static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, const uint16_t *F64Regs) { + CCState &State, const MCPhysReg *F64Regs) { static const unsigned IntRegsSize = 4, FloatRegsSize = 2; - static const uint16_t IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 }; - static const uint16_t F32Regs[] = { Mips::F12, Mips::F14 }; + static const MCPhysReg IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 }; + static const MCPhysReg F32Regs[] = { Mips::F12, Mips::F14 }; // Do not process byval args here. if (ArgFlags.isByVal()) @@ -2254,7 +2202,7 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT, static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - static const uint16_t F64Regs[] = { Mips::D6, Mips::D7 }; + static const MCPhysReg F64Regs[] = { Mips::D6, Mips::D7 }; return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs); } @@ -2262,7 +2210,7 @@ static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - static const uint16_t F64Regs[] = { Mips::D12_64, Mips::D14_64 }; + static const MCPhysReg F64Regs[] = { Mips::D12_64, Mips::D14_64 }; return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs); } @@ -2383,7 +2331,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MipsCCInfo.analyzeCallOperands(Outs, IsVarArg, Subtarget->mipsSEUsesSoftFloat(), - Callee.getNode(), CLI.Args); + Callee.getNode(), CLI.getArgs()); // Get a count of how many bytes are to be pushed on the stack. 
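   // (On O32 this count also includes the 16-byte argument area the ABI
   // reserves for $a0-$a3 even when those arguments are passed in registers;
   // see MipsCC::reservedArgArea() later in this file.)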
unsigned NextStackOffset = CCInfo.getNextStackOffset(); @@ -2394,6 +2342,10 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, isEligibleForTailCallOptimization(MipsCCInfo, NextStackOffset, *MF.getInfo()); + if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); + if (IsTailCall) ++NumTailCalls; @@ -2489,8 +2441,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Transform all store nodes into one single node because all store // nodes are independent of each other. if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol @@ -2544,9 +2495,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CLI, Callee, Chain); if (IsTailCall) - return DAG.getNode(MipsISD::TailCall, DL, MVT::Other, &Ops[0], Ops.size()); + return DAG.getNode(MipsISD::TailCall, DL, MVT::Other, Ops); - Chain = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, &Ops[0], Ops.size()); + Chain = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, Ops); SDValue InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. @@ -2713,18 +2664,21 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, } } - // The mips ABIs for returning structs by value requires that we copy - // the sret argument into $v0 for the return. Save the argument into - // a virtual register so that we can access it from the return points. - if (DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { - unsigned Reg = MipsFI->getSRetReturnReg(); - if (!Reg) { - Reg = MF.getRegInfo().createVirtualRegister( - getRegClassFor(isN64() ? MVT::i64 : MVT::i32)); - MipsFI->setSRetReturnReg(Reg); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + // The mips ABIs for returning structs by value requires that we copy + // the sret argument into $v0 for the return. Save the argument into + // a virtual register so that we can access it from the return points. + if (Ins[i].Flags.isSRet()) { + unsigned Reg = MipsFI->getSRetReturnReg(); + if (!Reg) { + Reg = MF.getRegInfo().createVirtualRegister( + getRegClassFor(isN64() ? MVT::i64 : MVT::i32)); + MipsFI->setSRetReturnReg(Reg); + } + SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[i]); + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain); + break; } - SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[0]); - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain); } if (IsVarArg) @@ -2734,8 +2688,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // the size of Ins and InVals. 
This only happens when on varg functions if (!OutChains.empty()) { OutChains.push_back(Chain); - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - &OutChains[0], OutChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains); } return Chain; @@ -2820,7 +2773,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain, RetOps.push_back(Flag); // Return on Mips is always a "jr $ra" - return DAG.getNode(MipsISD::Ret, DL, MVT::Other, &RetOps[0], RetOps.size()); + return DAG.getNode(MipsISD::Ret, DL, MVT::Other, RetOps); } //===----------------------------------------------------------------------===// @@ -2870,7 +2823,7 @@ MipsTargetLowering::getSingleConstraintMatchWeight( Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. - if (CallOperandVal == NULL) + if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. @@ -2948,12 +2901,12 @@ parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const { std::pair R = parsePhysicalReg(C, Prefix, Reg); if (!R.first) - return std::make_pair((unsigned)0, (const TargetRegisterClass*)0); + return std::make_pair(0U, nullptr); if ((Prefix == "hi" || Prefix == "lo")) { // Parse hi/lo. // No numeric characters follow "hi" or "lo". if (R.second) - return std::make_pair((unsigned)0, (const TargetRegisterClass*)0); + return std::make_pair(0U, nullptr); RC = TRI->getRegClass(Prefix == "hi" ? Mips::HI32RegClassID : Mips::LO32RegClassID); @@ -2963,7 +2916,7 @@ parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const { // No numeric characters follow the name. if (R.second) - return std::make_pair((unsigned)0, (const TargetRegisterClass *)0); + return std::make_pair(0U, nullptr); Reg = StringSwitch(Prefix) .Case("$msair", Mips::MSAIR) @@ -2977,14 +2930,14 @@ parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const { .Default(0); if (!Reg) - return std::make_pair((unsigned)0, (const TargetRegisterClass *)0); + return std::make_pair(0U, nullptr); RC = TRI->getRegClass(Mips::MSACtrlRegClassID); return std::make_pair(Reg, RC); } if (!R.second) - return std::make_pair((unsigned)0, (const TargetRegisterClass*)0); + return std::make_pair(0U, nullptr); if (Prefix == "$f") { // Parse $f0-$f31. // If the size of FP registers is 64-bit or Reg is an even number, select @@ -3032,7 +2985,7 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const if (VT == MVT::i64 && isGP64bit()) return std::make_pair(0U, &Mips::GPR64RegClass); // This will generate an error message - return std::make_pair(0u, static_cast(0)); + return std::make_pair(0U, nullptr); case 'f': // FPU or MSA register if (VT == MVT::v16i8) return std::make_pair(0U, &Mips::MSA128BRegClass); @@ -3062,7 +3015,7 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const case 'x': // register suitable for indirect jump // Fixme: Not triggering the use of both hi and low // This will generate an error message - return std::make_pair(0u, static_cast(0)); + return std::make_pair(0U, nullptr); } } @@ -3081,7 +3034,7 @@ void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector&Ops, SelectionDAG &DAG) const { - SDValue Result(0, 0); + SDValue Result; // Only support length 1 constraints for now. 
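   // (Length-1 constraints handled here include immediate constraints such
   // as "I", a signed 16-bit immediate, and "K", an unsigned 16-bit
   // immediate; multi-character constraints are rejected up front for now.)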
if (Constraint.length() > 1) return; @@ -3265,7 +3218,7 @@ static bool originalTypeIsF128(const Type *Ty, const SDNode *CallNode) { MipsTargetLowering::MipsCC::SpecialCallingConvType MipsTargetLowering::getSpecialCallingConv(SDValue Callee) const { MipsCC::SpecialCallingConvType SpecialCallingConv = - MipsCC::NoSpecialCallingConv;; + MipsCC::NoSpecialCallingConv; if (Subtarget->inMips16HardFloat()) { if (GlobalAddressSDNode *G = dyn_cast(Callee)) { llvm::StringRef Sym = G->getGlobal()->getName(); @@ -3321,7 +3274,7 @@ analyzeCallOperands(const SmallVectorImpl &Args, dbgs() << "Call operand #" << I << " has unhandled type " << EVT(ArgVT).getEVTString(); #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } } } @@ -3344,7 +3297,7 @@ analyzeFormalArguments(const SmallVectorImpl &Args, continue; } - MVT RegVT = getRegVT(ArgVT, FuncArg->getType(), 0, IsSoftFloat); + MVT RegVT = getRegVT(ArgVT, FuncArg->getType(), nullptr, IsSoftFloat); if (!FixedFn(I, ArgVT, RegVT, CCValAssign::Full, ArgFlags, CCInfo)) continue; @@ -3353,7 +3306,7 @@ analyzeFormalArguments(const SmallVectorImpl &Args, dbgs() << "Formal Arg #" << I << " has unhandled type " << EVT(ArgVT).getEVTString(); #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } } @@ -3378,7 +3331,7 @@ analyzeReturn(const SmallVectorImpl &RetVals, bool IsSoftFloat, dbgs() << "Call result #" << I << " has unhandled type " << EVT(VT).getEVTString() << '\n'; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } } } @@ -3392,7 +3345,7 @@ analyzeCallResult(const SmallVectorImpl &Ins, bool IsSoftFloat, void MipsTargetLowering::MipsCC:: analyzeReturn(const SmallVectorImpl &Outs, bool IsSoftFloat, const Type *RetTy) const { - analyzeReturn(Outs, IsSoftFloat, 0, RetTy); + analyzeReturn(Outs, IsSoftFloat, nullptr, RetTy); } void MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT, @@ -3426,7 +3379,7 @@ unsigned MipsTargetLowering::MipsCC::reservedArgArea() const { return (IsO32 && (CallConv != CallingConv::Fast)) ? 16 : 0; } -const uint16_t *MipsTargetLowering::MipsCC::intArgRegs() const { +const MCPhysReg *MipsTargetLowering::MipsCC::intArgRegs() const { return IsO32 ? O32IntRegs : Mips64IntRegs; } @@ -3443,7 +3396,7 @@ llvm::CCAssignFn *MipsTargetLowering::MipsCC::varArgFn() const { return IsO32 ? (IsFP64 ? CC_MipsO32_FP64 : CC_MipsO32_FP32) : CC_MipsN_VarArg; } -const uint16_t *MipsTargetLowering::MipsCC::shadowRegs() const { +const MCPhysReg *MipsTargetLowering::MipsCC::shadowRegs() const { return IsO32 ? O32IntRegs : Mips64DPRegs; } @@ -3451,7 +3404,7 @@ void MipsTargetLowering::MipsCC::allocateRegs(ByValArgInfo &ByVal, unsigned ByValSize, unsigned Align) { unsigned RegSize = regSize(), NumIntArgRegs = numIntArgRegs(); - const uint16_t *IntArgRegs = intArgRegs(), *ShadowRegs = shadowRegs(); + const MCPhysReg *IntArgRegs = intArgRegs(), *ShadowRegs = shadowRegs(); assert(!(ByValSize % RegSize) && !(Align % RegSize) && "Byval argument's size and alignment should be a multiple of" "RegSize."); @@ -3536,21 +3489,22 @@ passByValArg(SDValue Chain, SDLoc DL, MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg, const MipsCC &CC, const ByValArgInfo &ByVal, const ISD::ArgFlagsTy &Flags, bool isLittle) const { - unsigned ByValSize = Flags.getByValSize(); - unsigned Offset = 0; // Offset in # of bytes from the beginning of struct. 
- unsigned RegSize = CC.regSize(); - unsigned Alignment = std::min(Flags.getByValAlign(), RegSize); - EVT PtrTy = getPointerTy(), RegTy = MVT::getIntegerVT(RegSize * 8); + unsigned ByValSizeInBytes = Flags.getByValSize(); + unsigned OffsetInBytes = 0; // From beginning of struct + unsigned RegSizeInBytes = CC.regSize(); + unsigned Alignment = std::min(Flags.getByValAlign(), RegSizeInBytes); + EVT PtrTy = getPointerTy(), RegTy = MVT::getIntegerVT(RegSizeInBytes * 8); if (ByVal.NumRegs) { - const uint16_t *ArgRegs = CC.intArgRegs(); - bool LeftoverBytes = (ByVal.NumRegs * RegSize > ByValSize); + const MCPhysReg *ArgRegs = CC.intArgRegs(); + bool LeftoverBytes = (ByVal.NumRegs * RegSizeInBytes > ByValSizeInBytes); unsigned I = 0; // Copy words to registers. - for (; I < ByVal.NumRegs - LeftoverBytes; ++I, Offset += RegSize) { + for (; I < ByVal.NumRegs - LeftoverBytes; + ++I, OffsetInBytes += RegSizeInBytes) { SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg, - DAG.getConstant(Offset, PtrTy)); + DAG.getConstant(OffsetInBytes, PtrTy)); SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr, MachinePointerInfo(), false, false, false, Alignment); @@ -3560,38 +3514,38 @@ passByValArg(SDValue Chain, SDLoc DL, } // Return if the struct has been fully copied. - if (ByValSize == Offset) + if (ByValSizeInBytes == OffsetInBytes) return; // Copy the remainder of the byval argument with sub-word loads and shifts. if (LeftoverBytes) { - assert((ByValSize > Offset) && (ByValSize < Offset + RegSize) && - "Size of the remainder should be smaller than RegSize."); + assert((ByValSizeInBytes > OffsetInBytes) && + (ByValSizeInBytes < OffsetInBytes + RegSizeInBytes) && + "Size of the remainder should be smaller than RegSizeInBytes."); SDValue Val; - for (unsigned LoadSize = RegSize / 2, TotalSizeLoaded = 0; - Offset < ByValSize; LoadSize /= 2) { - unsigned RemSize = ByValSize - Offset; + for (unsigned LoadSizeInBytes = RegSizeInBytes / 2, TotalBytesLoaded = 0; + OffsetInBytes < ByValSizeInBytes; LoadSizeInBytes /= 2) { + unsigned RemainingSizeInBytes = ByValSizeInBytes - OffsetInBytes; - if (RemSize < LoadSize) + if (RemainingSizeInBytes < LoadSizeInBytes) continue; // Load subword. SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg, - DAG.getConstant(Offset, PtrTy)); - SDValue LoadVal = - DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, - MachinePointerInfo(), MVT::getIntegerVT(LoadSize * 8), - false, false, Alignment); + DAG.getConstant(OffsetInBytes, PtrTy)); + SDValue LoadVal = DAG.getExtLoad( + ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, MachinePointerInfo(), + MVT::getIntegerVT(LoadSizeInBytes * 8), false, false, Alignment); MemOpChains.push_back(LoadVal.getValue(1)); // Shift the loaded value. unsigned Shamt; if (isLittle) - Shamt = TotalSizeLoaded; + Shamt = TotalBytesLoaded * 8; else - Shamt = (RegSize - (TotalSizeLoaded + LoadSize)) * 8; + Shamt = (RegSizeInBytes - (TotalBytesLoaded + LoadSizeInBytes)) * 8; SDValue Shift = DAG.getNode(ISD::SHL, DL, RegTy, LoadVal, DAG.getConstant(Shamt, MVT::i32)); @@ -3601,9 +3555,9 @@ passByValArg(SDValue Chain, SDLoc DL, else Val = Shift; - Offset += LoadSize; - TotalSizeLoaded += LoadSize; - Alignment = std::min(Alignment, LoadSize); + OffsetInBytes += LoadSizeInBytes; + TotalBytesLoaded += LoadSizeInBytes; + Alignment = std::min(Alignment, LoadSizeInBytes); } unsigned ArgReg = ArgRegs[ByVal.FirstIdx + I]; @@ -3613,14 +3567,14 @@ passByValArg(SDValue Chain, SDLoc DL, } // Copy remainder of byval arg to it with memcpy. 
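The renamed loop above copies the sub-register-size tail of a byval argument with progressively smaller zero-extending loads, shifts each piece into position, and ORs it into one register. The functional change hiding in the rename is the little-endian shift amount: TotalBytesLoaded * 8, a bit count, where the old code shifted by the bare byte count. A plain-C++ model of the loop (a sketch, not the SelectionDAG code) that can be run for both byte orders:

#include <cstdint>
#include <cstdio>

// Assemble SizeInBytes leftover bytes of Src into one register-sized value,
// halving the load size whenever the remainder is too small, as the DAG
// code above does with ZEXTLOAD + SHL + OR.
static uint64_t loadLeftover(const uint8_t *Src, unsigned SizeInBytes,
                             unsigned RegSizeInBytes, bool IsLittle) {
  uint64_t Val = 0;
  unsigned OffsetInBytes = 0, TotalBytesLoaded = 0;
  for (unsigned LoadSizeInBytes = RegSizeInBytes / 2;
       OffsetInBytes < SizeInBytes; LoadSizeInBytes /= 2) {
    if (SizeInBytes - OffsetInBytes < LoadSizeInBytes)
      continue; // remainder too small; retry with the next smaller load
    uint64_t Load = 0; // zero-extending sub-word load, in target byte order
    for (unsigned I = 0; I != LoadSizeInBytes; ++I) {
      unsigned ByteShift = IsLittle ? 8 * I : 8 * (LoadSizeInBytes - 1 - I);
      Load |= (uint64_t)Src[OffsetInBytes + I] << ByteShift;
    }
    // The fix from the hunk above: on little-endian targets the shift
    // amount is a bit count (TotalBytesLoaded * 8), not a byte count.
    unsigned Shamt =
        IsLittle ? TotalBytesLoaded * 8
                 : (RegSizeInBytes - (TotalBytesLoaded + LoadSizeInBytes)) * 8;
    Val |= Load << Shamt;
    OffsetInBytes += LoadSizeInBytes;
    TotalBytesLoaded += LoadSizeInBytes;
  }
  return Val;
}

int main() {
  const uint8_t Bytes[3] = {0x11, 0x22, 0x33}; // 3 leftover bytes, 4-byte reg
  std::printf("LE: 0x%06llx\n",
              (unsigned long long)loadLeftover(Bytes, 3, 4, true));  // 0x332211
  std::printf("BE: 0x%08llx\n",
              (unsigned long long)loadLeftover(Bytes, 3, 4, false)); // 0x11223300
  return 0;
}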
- unsigned MemCpySize = ByValSize - Offset; + unsigned MemCpySize = ByValSizeInBytes - OffsetInBytes; SDValue Src = DAG.getNode(ISD::ADD, DL, PtrTy, Arg, - DAG.getConstant(Offset, PtrTy)); + DAG.getConstant(OffsetInBytes, PtrTy)); SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr, DAG.getIntPtrConstant(ByVal.Address)); Chain = DAG.getMemcpy(Chain, DL, Dst, Src, DAG.getConstant(MemCpySize, PtrTy), Alignment, /*isVolatile=*/false, /*AlwaysInline=*/false, - MachinePointerInfo(0), MachinePointerInfo(0)); + MachinePointerInfo(), MachinePointerInfo()); MemOpChains.push_back(Chain); } @@ -3628,7 +3582,7 @@ void MipsTargetLowering::writeVarArgRegs(std::vector &OutChains, const MipsCC &CC, SDValue Chain, SDLoc DL, SelectionDAG &DAG) const { unsigned NumRegs = CC.numIntArgRegs(); - const uint16_t *ArgRegs = CC.intArgRegs(); + const MCPhysReg *ArgRegs = CC.intArgRegs(); const CCState &CCInfo = CC.getCCInfo(); unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs, NumRegs); unsigned RegSize = CC.regSize(); @@ -3662,7 +3616,7 @@ void MipsTargetLowering::writeVarArgRegs(std::vector &OutChains, SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy()); SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff, MachinePointerInfo(), false, false, 0); - cast(Store.getNode())->getMemOperand()->setValue(0); + cast(Store.getNode())->getMemOperand()->setValue((Value*)nullptr); OutChains.push_back(Store); } } diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 35dd396..4ac33bf 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -218,32 +218,38 @@ namespace llvm { static const MipsTargetLowering *create(MipsTargetMachine &TM); - virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; } + /// createFastISel - This method returns a target specific FastISel object, + /// or null if the target does not support "fast" ISel. + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const override; - virtual void LowerOperationWrapper(SDNode *N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const; + MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } + + void LowerOperationWrapper(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const override; /// LowerOperation - Provide custom lowering hooks for some operations. - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; /// ReplaceNodeResults - Replace the results of node with an illegal result /// type with new values built out of custom code. /// - virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, - SelectionDAG &DAG) const; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, + SelectionDAG &DAG) const override; /// getTargetNodeName - This method returns the name of a target specific // DAG node. 
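The writeVarArgRegs change spells the null as setValue((Value*)nullptr) rather than a bare nullptr. The cast is presumably there to pick one overload when setValue accepts more than one pointer type; the stand-in below assumes a const Value * and a const PseudoSourceValue * overload, which is what the cast suggests, not a confirmed signature:

#include <cstdio>

struct Value {};             // stand-ins for the LLVM IR types
struct PseudoSourceValue {};

struct MemOperandLike {
  void setValue(const Value *) { std::printf("IR value cleared\n"); }
  void setValue(const PseudoSourceValue *) { std::printf("PSV cleared\n"); }
};

int main() {
  MemOperandLike MO;
  // MO.setValue(nullptr);        // would not compile: ambiguous overload
  MO.setValue((Value *)nullptr);  // the cast selects the Value* overload
  return 0;
}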
- virtual const char *getTargetNodeName(unsigned Opcode) const; + const char *getTargetNodeName(unsigned Opcode) const override; /// getSetCCResultType - get the ISD::SETCC result ValueType - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const; + EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - virtual MachineBasicBlock * - EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const; + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const override; struct LTStr { bool operator()(const char *S1, const char *S2) const { @@ -382,7 +388,7 @@ namespace llvm { unsigned reservedArgArea() const; /// Return pointer to array of integer argument registers. - const uint16_t *intArgRegs() const; + const MCPhysReg *intArgRegs() const; typedef SmallVectorImpl::const_iterator byval_iterator; byval_iterator byval_begin() const { return ByValArgs.begin(); } @@ -403,7 +409,7 @@ namespace llvm { /// Return the function that analyzes variable argument list functions. llvm::CCAssignFn *varArgFn() const; - const uint16_t *shadowRegs() const; + const MCPhysReg *shadowRegs() const; void allocateRegs(ByValArgInfo &ByVal, unsigned ByValSize, unsigned Align); @@ -523,41 +529,39 @@ namespace llvm { void writeVarArgRegs(std::vector &OutChains, const MipsCC &CC, SDValue Chain, SDLoc DL, SelectionDAG &DAG) const; - virtual SDValue + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const; + SmallVectorImpl &InVals) const override; SDValue passArgOnStack(SDValue StackPtr, unsigned Offset, SDValue Chain, SDValue Arg, SDLoc DL, bool IsTailCall, SelectionDAG &DAG) const; - virtual SDValue - LowerCall(TargetLowering::CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const; + SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const override; - virtual bool - CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, - bool isVarArg, - const SmallVectorImpl &Outs, - LLVMContext &Context) const; - - virtual SDValue - LowerReturn(SDValue Chain, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc dl, SelectionDAG &DAG) const; + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl &Outs, + LLVMContext &Context) const override; + + SDValue LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + SDLoc dl, SelectionDAG &DAG) const override; // Inline asm support - ConstraintType getConstraintType(const std::string &Constraint) const; + ConstraintType + getConstraintType(const std::string &Constraint) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. ConstraintWeight getSingleConstraintMatchWeight( - AsmOperandInfo &info, const char *constraint) const; + AsmOperandInfo &info, const char *constraint) const override; /// This function parses registers that appear in inline-asm constraints. /// It returns pair (0, 0) on failure. 
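The header hunks in this file systematically trade `virtual` for C++11 `override` on every method that reimplements a TargetLowering hook. The payoff is compile-time checking; a minimal sketch of the error class this catches, with Base standing in for the real base class:

struct Base {
  virtual const char *getTargetNodeName(unsigned Opcode) const {
    return Opcode ? "node" : "entry";
  }
  virtual ~Base() {}
};

struct Derived : Base {
  // Compiles only while it exactly matches a base-class virtual. If the
  // base signature later changes, this line becomes a hard error instead
  // of silently declaring a brand-new virtual that is never called.
  const char *getTargetNodeName(unsigned Opcode) const override {
    return Opcode ? "MipsISD::node" : "MipsISD::entry";
  }
};

int main() {
  Derived D;
  const Base &B = D;
  return B.getTargetNodeName(1)[0] == 'M' ? 0 : 1;
}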
@@ -566,33 +570,33 @@ namespace llvm { std::pair getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const; + MVT VT) const override; /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. If hasMemory is /// true it means one of the asm constraint of the inline asm instruction /// being processed is 'm'. - virtual void LowerAsmOperandForConstraint(SDValue Op, - std::string &Constraint, - std::vector &Ops, - SelectionDAG &DAG) const; + void LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector &Ops, + SelectionDAG &DAG) const override; - virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const; + bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; - virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; - virtual EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, - bool MemcpyStrSrc, - MachineFunction &MF) const; + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const override; /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. - virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - virtual unsigned getJumpTableEncoding() const; + unsigned getJumpTableEncoding() const override; MachineBasicBlock *emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode, bool Nand = false) const; @@ -608,6 +612,11 @@ namespace llvm { /// Create MipsTargetLowering objects. const MipsTargetLowering *createMips16TargetLowering(MipsTargetMachine &TM); const MipsTargetLowering *createMipsSETargetLowering(MipsTargetMachine &TM); + + namespace Mips { + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo); + } } #endif // MipsISELLOWERING_H diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index 4b5a73e..32cda3b 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -66,6 +66,16 @@ def IsSingleFloat : Predicate<"Subtarget.isSingleFloat()">, def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">, AssemblerPredicate<"!FeatureSingleFloat">; +//===----------------------------------------------------------------------===// +// Mips FGR size adjectives. +// They are mutually exclusive. +//===----------------------------------------------------------------------===// + +class FGR_32 { list FGRPredicates = [NotFP64bit]; } +class FGR_64 { list FGRPredicates = [IsFP64bit]; } + +//===----------------------------------------------------------------------===// + // FP immediate patterns. 
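The fpimm0 and fpimm0neg PatLeafs defined just below match exactly +0.0 and -0.0 via isExactlyValue; a floating-point equality test could not make that distinction, because IEEE 754 defines -0.0 == +0.0 as true. A stand-alone illustration using a bitwise check (a sketch, not the APFloat API):

#include <cstdint>
#include <cstdio>
#include <cstring>

// A bitwise comparison can tell +0.0 from -0.0; operator== cannot.
static bool isExactlyPosZero(double D) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));
  return Bits == 0; // -0.0 would have the sign bit set
}

int main() {
  std::printf("-0.0 == +0.0 : %d\n", -0.0 == 0.0);            // 1
  std::printf("exactly +0.0 : %d\n", isExactlyPosZero(-0.0)); // 0
  std::printf("exactly +0.0 : %d\n", isExactlyPosZero(+0.0)); // 1
  return 0;
}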
def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); @@ -100,10 +110,10 @@ class ADDS_FT { def _D32 : MMRel, ADDS_FT, - Requires<[NotFP64bit, HasStdEnc]>; + AdditionalRequires<[NotFP64bit]>; def _D64 : ADDS_FT, - Requires<[IsFP64bit, HasStdEnc]> { + AdditionalRequires<[IsFP64bit]> { string DecoderNamespace = "Mips64"; } } @@ -117,18 +127,18 @@ class ABSS_FT { def _D32 : MMRel, ABSS_FT, - Requires<[NotFP64bit, HasStdEnc]>; + AdditionalRequires<[NotFP64bit]>; def _D64 : ABSS_FT, - Requires<[IsFP64bit, HasStdEnc]> { + AdditionalRequires<[IsFP64bit]> { string DecoderNamespace = "Mips64"; } } multiclass ROUND_M { def _D32 : MMRel, ABSS_FT, - Requires<[NotFP64bit, HasStdEnc]>; + AdditionalRequires<[NotFP64bit]>; def _D64 : ABSS_FT, - Requires<[IsFP64bit, HasStdEnc]> { + AdditionalRequires<[IsFP64bit]> { let DecoderNamespace = "Mips64"; } } @@ -241,77 +251,75 @@ multiclass C_COND_M fmt, defm S : C_COND_M<"s", FGR32Opnd, 16, II_C_CC_S>; defm D32 : C_COND_M<"d", AFGR64Opnd, 17, II_C_CC_D>, - Requires<[NotFP64bit, HasStdEnc]>; + AdditionalRequires<[NotFP64bit]>; let DecoderNamespace = "Mips64" in defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, - Requires<[IsFP64bit, HasStdEnc]>; + AdditionalRequires<[IsFP64bit]>; //===----------------------------------------------------------------------===// // Floating Point Instructions //===----------------------------------------------------------------------===// def ROUND_W_S : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, - ABSS_FM<0xc, 16>; + ABSS_FM<0xc, 16>, ISA_MIPS2; def TRUNC_W_S : MMRel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>, - ABSS_FM<0xd, 16>; + ABSS_FM<0xd, 16>, ISA_MIPS2; def CEIL_W_S : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, - ABSS_FM<0xe, 16>; + ABSS_FM<0xe, 16>, ISA_MIPS2; def FLOOR_W_S : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>, - ABSS_FM<0xf, 16>; + ABSS_FM<0xf, 16>, ISA_MIPS2; def CVT_W_S : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>, ABSS_FM<0x24, 16>; -defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>; -defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>; -defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>; -defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>; +defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2; +defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2; +defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2; +defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2; defm CVT_W : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>; -let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in { +let DecoderNamespace = "Mips64" in { def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>, - ABSS_FM<0x8, 16>; + ABSS_FM<0x8, 16>, FGR_64; def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>, - ABSS_FM<0x8, 17>; + ABSS_FM<0x8, 17>, FGR_64; def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, II_TRUNC>, - ABSS_FM<0x9, 16>; + ABSS_FM<0x9, 16>, FGR_64; def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, II_TRUNC>, - ABSS_FM<0x9, 17>; + ABSS_FM<0x9, 17>, FGR_64; def CEIL_L_S : ABSS_FT<"ceil.l.s", FGR64Opnd, FGR32Opnd, II_CEIL>, - ABSS_FM<0xa, 16>; + ABSS_FM<0xa, 16>, FGR_64; def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64Opnd, FGR64Opnd, II_CEIL>, - ABSS_FM<0xa, 17>; + ABSS_FM<0xa, 17>, FGR_64; def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64Opnd, FGR32Opnd, II_FLOOR>, - 
ABSS_FM<0xb, 16>; + ABSS_FM<0xb, 16>, FGR_64; def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, II_FLOOR>, - ABSS_FM<0xb, 17>; + ABSS_FM<0xb, 17>, FGR_64; } def CVT_S_W : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>, ABSS_FM<0x20, 20>; def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>, - ABSS_FM<0x25, 16>; + ABSS_FM<0x25, 16>, INSN_MIPS3_32R2; def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>, - ABSS_FM<0x25, 17>; - -let Predicates = [NotFP64bit, HasStdEnc] in { - def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>, - ABSS_FM<0x20, 17>; - def CVT_D32_W : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>, - ABSS_FM<0x21, 20>; - def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>, - ABSS_FM<0x21, 16>; -} + ABSS_FM<0x25, 17>, INSN_MIPS3_32R2; + +def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>, + ABSS_FM<0x20, 17>, FGR_32; +def CVT_D32_W : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>, + ABSS_FM<0x21, 20>, FGR_32; +def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>, + ABSS_FM<0x21, 16>, FGR_32; -let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in { +let DecoderNamespace = "Mips64" in { def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>, - ABSS_FM<0x20, 17>; + ABSS_FM<0x20, 17>, FGR_64; def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>, - ABSS_FM<0x20, 21>; + ABSS_FM<0x20, 21>, FGR_64; def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>, - ABSS_FM<0x21, 20>; + ABSS_FM<0x21, 20>, FGR_64; def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>, - ABSS_FM<0x21, 16>; + ABSS_FM<0x21, 16>, FGR_64; def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64Opnd, FGR64Opnd, II_CVT>, - ABSS_FM<0x21, 21>; + ABSS_FM<0x21, 21>, FGR_64; } let isPseudo = 1, isCodeGenOnly = 1 in { @@ -322,18 +330,16 @@ let isPseudo = 1, isCodeGenOnly = 1 in { def PseudoCVT_D64_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, II_CVT>; } -let Predicates = [NoNaNsFPMath, HasStdEnc] in { - def FABS_S : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>, - ABSS_FM<0x5, 16>; - def FNEG_S : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>, - ABSS_FM<0x7, 16>; - defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>; - defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>; -} +def FABS_S : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>, + ABSS_FM<0x5, 16>; +def FNEG_S : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>, + ABSS_FM<0x7, 16>; +defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>; +defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>; def FSQRT_S : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, fsqrt>, - ABSS_FM<0x4, 16>; -defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>; + ABSS_FM<0x4, 16>, ISA_MIPS2; +defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2; // The odd-numbered registers are only referenced when doing loads, // stores, and moves between floating-point and integer registers. 
@@ -348,76 +354,92 @@ def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1, def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1, bitconvert>, MFC1_FM<4>; def MFHC1 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, II_MFHC1>, - MFC1_FM<3>; + MFC1_FM<3>, ISA_MIPS32R2; def MTHC1 : MMRel, MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, II_MTHC1>, - MFC1_FM<7>; + MFC1_FM<7>, ISA_MIPS32R2; def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, II_DMFC1, - bitconvert>, MFC1_FM<1>; + bitconvert>, MFC1_FM<1>, ISA_MIPS3; def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, II_DMTC1, - bitconvert>, MFC1_FM<5>; + bitconvert>, MFC1_FM<5>, ISA_MIPS3; def FMOV_S : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>, ABSS_FM<0x6, 16>; def FMOV_D32 : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>, - ABSS_FM<0x6, 17>, Requires<[NotFP64bit, HasStdEnc]>; + ABSS_FM<0x6, 17>, AdditionalRequires<[NotFP64bit]>; def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>, - ABSS_FM<0x6, 17>, Requires<[IsFP64bit, HasStdEnc]> { + ABSS_FM<0x6, 17>, AdditionalRequires<[IsFP64bit]> { let DecoderNamespace = "Mips64"; } /// Floating Point Memory Instructions -let Predicates = [HasStdEnc] in { - def LWC1 : MMRel, LW_FT<"lwc1", FGR32Opnd, II_LWC1, load>, LW_FM<0x31>; - def SWC1 : MMRel, SW_FT<"swc1", FGR32Opnd, II_SWC1, store>, LW_FM<0x39>; -} - -let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in { - def LDC164 : LW_FT<"ldc1", FGR64Opnd, II_LDC1, load>, LW_FM<0x35>; - def SDC164 : SW_FT<"sdc1", FGR64Opnd, II_SDC1, store>, LW_FM<0x3d>; -} - -let Predicates = [NotFP64bit, HasStdEnc] in { - def LDC1 : MMRel, LW_FT<"ldc1", AFGR64Opnd, II_LDC1, load>, LW_FM<0x35>; - def SDC1 : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>, LW_FM<0x3d>; -} - -/// Cop2 Memory Instructions -let Predicates = [HasStdEnc] in { - def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>; - def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>; - def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>; - def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>; -} +def LWC1 : MMRel, LW_FT<"lwc1", FGR32Opnd, II_LWC1, load>, LW_FM<0x31>; +def SWC1 : MMRel, SW_FT<"swc1", FGR32Opnd, II_SWC1, store>, LW_FM<0x39>; + +let DecoderNamespace = "Mips64" in { + def LDC164 : LW_FT<"ldc1", FGR64Opnd, II_LDC1, load>, LW_FM<0x35>, ISA_MIPS2, + FGR_64; + def SDC164 : SW_FT<"sdc1", FGR64Opnd, II_SDC1, store>, LW_FM<0x3d>, ISA_MIPS2, + FGR_64; +} + +def LDC1 : MMRel, LW_FT<"ldc1", AFGR64Opnd, II_LDC1, load>, LW_FM<0x35>, + ISA_MIPS2, FGR_32; +def SDC1 : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>, LW_FM<0x3d>, + ISA_MIPS2, FGR_32; + +// Cop2 Memory Instructions +// FIXME: These aren't really FPU instructions and as such don't belong in this +// file +def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>; +def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>; +def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>, ISA_MIPS2; +def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>, ISA_MIPS2; + +// Cop3 Memory Instructions +// FIXME: These aren't really FPU instructions and as such don't belong in this +// file +def LWC3 : LW_FT<"lwc3", COP3Opnd, NoItinerary, load>, LW_FM<0x33>; +def SWC3 : SW_FT<"swc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3b>; +def LDC3 : LW_FT<"ldc3", COP3Opnd, NoItinerary, load>, LW_FM<0x37>, ISA_MIPS2; +def SDC3 : SW_FT<"sdc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3f>, ISA_MIPS2; // Indexed 
loads and stores. // Base register + offset register addressing mode (indicated by "x" in the // instruction mnemonic) is disallowed under NaCl. -let Predicates = [HasFPIdx, HasStdEnc, IsNotNaCl] in { - def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>; - def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>; +let AdditionalPredicates = [IsNotNaCl] in { + def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>, + INSN_MIPS4_32R2; + def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>, + INSN_MIPS4_32R2; } -let Predicates = [HasFPIdx, NotFP64bit, HasStdEnc, NotInMicroMips, - IsNotNaCl] in { - def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>; - def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>; +let AdditionalPredicates = [NotInMicroMips, IsNotNaCl] in { + def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>, + INSN_MIPS4_32R2, FGR_32; + def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>, + INSN_MIPS4_32R2, FGR_32; } -let Predicates = [HasFPIdx, IsFP64bit, HasStdEnc], - DecoderNamespace="Mips64" in { - def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>; - def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>; +let DecoderNamespace="Mips64" in { + def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>, + INSN_MIPS4_32R2, FGR_64; + def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>, + INSN_MIPS4_32R2, FGR_64; } // Load/store doubleword indexed unaligned. -let Predicates = [NotFP64bit, HasStdEnc, IsNotNaCl] in { - def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>; - def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>; +let AdditionalPredicates = [IsNotNaCl] in { + def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>, + INSN_MIPS5_32R2, FGR_32; + def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>, + INSN_MIPS5_32R2, FGR_32; } -let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace="Mips64" in { - def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>; - def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>; +let DecoderNamespace="Mips64" in { + def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>, + INSN_MIPS5_32R2, FGR_64; + def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>, + INSN_MIPS5_32R2, FGR_64; } /// Floating-point Aritmetic @@ -434,47 +456,43 @@ def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>, ADDS_FM<0x01, 16>; defm FSUB : ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>; -let Predicates = [HasMips32r2, HasStdEnc] in { - def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>, - MADDS_FM<4, 0>; - def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>, - MADDS_FM<5, 0>; -} +def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>, + MADDS_FM<4, 0>, ISA_MIPS32R2; +def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>, + MADDS_FM<5, 0>, ISA_MIPS32R2; -let Predicates = [HasMips32r2, NoNaNsFPMath, HasStdEnc] in { +let AdditionalPredicates = [NoNaNsFPMath] in { def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>, - MADDS_FM<6, 0>; + MADDS_FM<6, 0>, ISA_MIPS32R2; def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>, - MADDS_FM<7, 0>; + MADDS_FM<7, 0>, ISA_MIPS32R2; } -let Predicates 
= [HasMips32r2, NotFP64bit, HasStdEnc] in { - def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>, - MADDS_FM<4, 1>; - def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>, - MADDS_FM<5, 1>; -} +def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>, + MADDS_FM<4, 1>, ISA_MIPS32R2, FGR_32; +def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>, + MADDS_FM<5, 1>, ISA_MIPS32R2, FGR_32; -let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath, HasStdEnc] in { +let AdditionalPredicates = [NoNaNsFPMath] in { def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>, - MADDS_FM<6, 1>; + MADDS_FM<6, 1>, ISA_MIPS32R2, FGR_32; def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>, - MADDS_FM<7, 1>; + MADDS_FM<7, 1>, ISA_MIPS32R2, FGR_32; } -let Predicates = [HasMips32r2, IsFP64bit, HasStdEnc], isCodeGenOnly=1 in { +let isCodeGenOnly=1 in { def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>, - MADDS_FM<4, 1>; + MADDS_FM<4, 1>, ISA_MIPS32R2, FGR_64; def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>, - MADDS_FM<5, 1>; + MADDS_FM<5, 1>, ISA_MIPS32R2, FGR_64; } -let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath, HasStdEnc], +let AdditionalPredicates = [NoNaNsFPMath], isCodeGenOnly=1 in { def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>, - MADDS_FM<6, 1>; + MADDS_FM<6, 1>, ISA_MIPS32R2, FGR_64; def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>, - MADDS_FM<7, 1>; + MADDS_FM<7, 1>, ISA_MIPS32R2, FGR_64; } //===----------------------------------------------------------------------===// @@ -515,10 +533,10 @@ def MIPS_FCOND_NGT : PatLeaf<(i32 15)>; /// Floating Point Compare def FCMP_S32 : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM<16>; def FCMP_D32 : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>, - Requires<[NotFP64bit, HasStdEnc]>; + AdditionalRequires<[NotFP64bit]>; let DecoderNamespace = "Mips64" in def FCMP_D64 : CEQS_FT<"d", FGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>, - Requires<[IsFP64bit, HasStdEnc]>; + AdditionalRequires<[IsFP64bit]>; //===----------------------------------------------------------------------===// // Floating Point Pseudo-Instructions @@ -531,9 +549,9 @@ class BuildPairF64Base : [(set RO:$dst, (MipsBuildPairF64 GPR32Opnd:$lo, GPR32Opnd:$hi))]>; def BuildPairF64 : BuildPairF64Base, - Requires<[NotFP64bit, HasStdEnc]>; + AdditionalRequires<[NotFP64bit]>; def BuildPairF64_64 : BuildPairF64Base, - Requires<[IsFP64bit, HasStdEnc]>; + AdditionalRequires<[IsFP64bit]>; // This pseudo instr gets expanded into 2 mfc1 instrs after register // allocation. @@ -544,15 +562,15 @@ class ExtractElementF64Base : [(set GPR32Opnd:$dst, (MipsExtractElementF64 RO:$src, imm:$n))]>; def ExtractElementF64 : ExtractElementF64Base, - Requires<[NotFP64bit, HasStdEnc]>; + AdditionalRequires<[NotFP64bit]>; def ExtractElementF64_64 : ExtractElementF64Base, - Requires<[IsFP64bit, HasStdEnc]>; + AdditionalRequires<[IsFP64bit]>; //===----------------------------------------------------------------------===// // InstAliases. 
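BuildPairF64Base just above models an f64 produced from two 32-bit GPR halves ($lo, $hi), and ExtractElementF64Base the reverse move. A plain-C++ model of the bit-level assembly (a host-side sketch only; the real pseudos expand to mtc1/mthc1 or register-pair writes after register allocation):

#include <cstdint>
#include <cstdio>
#include <cstring>

// An f64 assembled from two 32-bit halves: hi word in the upper 32 bits,
// lo word in the lower 32 bits.
static double buildPairF64(uint32_t Lo, uint32_t Hi) {
  uint64_t Bits = ((uint64_t)Hi << 32) | Lo;
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D;
}

int main() {
  // IEEE 754 encoding of 1.0 is 0x3FF0000000000000:
  // hi word 0x3FF00000, lo word 0.
  std::printf("%.1f\n", buildPairF64(0x00000000, 0x3FF00000)); // 1.0
  return 0;
}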
//===----------------------------------------------------------------------===// -def : InstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>; -def : InstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>; +def : MipsInstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>; +def : MipsInstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>; //===----------------------------------------------------------------------===// // Floating Point Patterns @@ -565,55 +583,45 @@ def : MipsPat<(f32 (sint_to_fp GPR32Opnd:$src)), def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src), (TRUNC_W_S FGR32Opnd:$src)>; -let Predicates = [NotFP64bit, HasStdEnc] in { - def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)), - (PseudoCVT_D32_W GPR32Opnd:$src)>; - def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src), - (TRUNC_W_D32 AFGR64Opnd:$src)>; - def : MipsPat<(f32 (fround AFGR64Opnd:$src)), - (CVT_S_D32 AFGR64Opnd:$src)>; - def : MipsPat<(f64 (fextend FGR32Opnd:$src)), - (CVT_D32_S FGR32Opnd:$src)>; -} +def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)), + (PseudoCVT_D32_W GPR32Opnd:$src)>, FGR_32; +def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src), + (TRUNC_W_D32 AFGR64Opnd:$src)>, FGR_32; +def : MipsPat<(f32 (fround AFGR64Opnd:$src)), + (CVT_S_D32 AFGR64Opnd:$src)>, FGR_32; +def : MipsPat<(f64 (fextend FGR32Opnd:$src)), + (CVT_D32_S FGR32Opnd:$src)>, FGR_32; + +def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>, FGR_64; +def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>, FGR_64; + +def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)), + (PseudoCVT_D64_W GPR32Opnd:$src)>, FGR_64; +def : MipsPat<(f32 (sint_to_fp GPR64Opnd:$src)), + (EXTRACT_SUBREG (PseudoCVT_S_L GPR64Opnd:$src), sub_lo)>, FGR_64; +def : MipsPat<(f64 (sint_to_fp GPR64Opnd:$src)), + (PseudoCVT_D64_L GPR64Opnd:$src)>, FGR_64; + +def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src), + (TRUNC_W_D64 FGR64Opnd:$src)>, FGR_64; +def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src), + (TRUNC_L_S FGR32Opnd:$src)>, FGR_64; +def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src), + (TRUNC_L_D64 FGR64Opnd:$src)>, FGR_64; -let Predicates = [IsFP64bit, HasStdEnc] in { - def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>; - def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>; - - def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)), - (PseudoCVT_D64_W GPR32Opnd:$src)>; - def : MipsPat<(f32 (sint_to_fp GPR64Opnd:$src)), - (EXTRACT_SUBREG (PseudoCVT_S_L GPR64Opnd:$src), sub_lo)>; - def : MipsPat<(f64 (sint_to_fp GPR64Opnd:$src)), - (PseudoCVT_D64_L GPR64Opnd:$src)>; - - def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src), - (TRUNC_W_D64 FGR64Opnd:$src)>; - def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src), - (TRUNC_L_S FGR32Opnd:$src)>; - def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src), - (TRUNC_L_D64 FGR64Opnd:$src)>; - - def : MipsPat<(f32 (fround FGR64Opnd:$src)), - (CVT_S_D64 FGR64Opnd:$src)>; - def : MipsPat<(f64 (fextend FGR32Opnd:$src)), - (CVT_D64_S FGR32Opnd:$src)>; -} +def : MipsPat<(f32 (fround FGR64Opnd:$src)), + (CVT_S_D64 FGR64Opnd:$src)>, FGR_64; +def : MipsPat<(f64 (fextend FGR32Opnd:$src)), + (CVT_D64_S FGR32Opnd:$src)>, FGR_64; // Patterns for loads/stores with a reg+imm operand. 
let AddedComplexity = 40 in { - let Predicates = [HasStdEnc] in { - def : LoadRegImmPat; - def : StoreRegImmPat; - } + def : LoadRegImmPat; + def : StoreRegImmPat; - let Predicates = [IsFP64bit, HasStdEnc] in { - def : LoadRegImmPat; - def : StoreRegImmPat; - } + def : LoadRegImmPat, FGR_64; + def : StoreRegImmPat, FGR_64; - let Predicates = [NotFP64bit, HasStdEnc] in { - def : LoadRegImmPat; - def : StoreRegImmPat; - } + def : LoadRegImmPat, FGR_32; + def : StoreRegImmPat, FGR_32; } diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index e4405ab..0377eab 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -93,8 +93,8 @@ class MipsInst pattern, // Mips32/64 Instruction Format class InstSE pattern, InstrItinClass itin, Format f, string opstr = ""> : - MipsInst { - let Predicates = [HasStdEnc]; + MipsInst, PredicateControl { + let EncodingPredicates = [HasStdEnc]; string BaseOpcode = opstr; string Arch; } @@ -109,9 +109,9 @@ class MipsPseudo pattern, // Mips32/64 Pseudo Instruction Format class PseudoSE pattern, - InstrItinClass itin = IIPseudo>: - MipsPseudo { - let Predicates = [HasStdEnc]; + InstrItinClass itin = IIPseudo> : + MipsPseudo, PredicateControl { + let EncodingPredicates = [HasStdEnc]; } // Pseudo-instructions for alternate assembly syntax (never used by codegen). @@ -545,6 +545,20 @@ class SEQ_FM funct> : StdArch { let Inst{5-0} = funct; } +class SEQI_FM funct> : StdArch { + bits<5> rs; + bits<5> rt; + bits<10> imm10; + + bits<32> Inst; + + let Inst{31-26} = 0x1c; + let Inst{25-21} = rs; + let Inst{20-16} = rt; + let Inst{15-6} = imm10; + let Inst{5-0} = funct; +} + //===----------------------------------------------------------------------===// // System calls format //===----------------------------------------------------------------------===// @@ -829,3 +843,12 @@ class BARRIER_FM op> : StdArch { let Inst{10-6} = op; // Operation let Inst{5-0} = 0; // SLL } + +class COP0_TLB_FM op> : StdArch { + bits<32> Inst; + + let Inst{31-26} = 0x10; // COP0 + let Inst{25} = 1; // CO + let Inst{24-6} = 0; + let Inst{5-0} = op; // Operation +} diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp index 0ebad05..d6da6c6 100644 --- a/lib/Target/Mips/MipsInstrInfo.cpp +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -22,11 +22,11 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + #define GET_INSTRINFO_CTOR_DTOR #include "MipsGenInstrInfo.inc" -using namespace llvm; - // Pin the vtable to this file. void MipsInstrInfo::anchor() {} @@ -195,7 +195,7 @@ AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, if (I == REnd || !isUnpredicatedTerminator(&*I)) { // This block ends with no branches (it just falls through to its succ). // Leave TBB/FBB null. - TBB = FBB = NULL; + TBB = FBB = nullptr; return BT_NoBranch; } @@ -209,7 +209,7 @@ AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, // Get the second to last instruction in the block. unsigned SecondLastOpc = 0; - MachineInstr *SecondLastInst = NULL; + MachineInstr *SecondLastInst = nullptr; if (++I != REnd) { SecondLastInst = &*I; diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h index d9ac961..742193f 100644 --- a/lib/Target/Mips/MipsInstrInfo.h +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -9,6 +9,10 @@ // // This file contains the Mips implementation of the TargetInstrInfo class. 
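The new COP0_TLB_FM format class in MipsInstrFormats.td encodes the privileged TLB operations: the COP0 major opcode in bits 31-26, the CO bit in bit 25, zeros in bits 24-6, and the function field in bits 5-0. Mirroring those field assignments in plain C++ reproduces the encodings for the four defs that use it (function codes taken from the defs above; the helper itself is illustrative):

#include <cstdint>
#include <cstdio>

// Mirrors COP0_TLB_FM: Inst{31-26} = 0x10 (COP0), Inst{25} = 1 (CO),
// Inst{24-6} = 0, Inst{5-0} = op.
static uint32_t encodeCop0Tlb(unsigned Op) {
  return (0x10u << 26) | (1u << 25) | (Op & 0x3Fu);
}

int main() {
  std::printf("tlbr  = 0x%08x\n", encodeCop0Tlb(0x01)); // 0x42000001
  std::printf("tlbwi = 0x%08x\n", encodeCop0Tlb(0x02)); // 0x42000002
  std::printf("tlbwr = 0x%08x\n", encodeCop0Tlb(0x06)); // 0x42000006
  std::printf("tlbp  = 0x%08x\n", encodeCop0Tlb(0x08)); // 0x42000008
  return 0;
}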
// +// FIXME: We need to override TargetInstrInfo::getInlineAsmLength method in +// order for MipsLongBranch pass to work correctly when the code has inline +// assembly. The returned value doesn't have to be the asm instruction's exact +// size in bytes; MipsLongBranch only expects it to be the correct upper bound. //===----------------------------------------------------------------------===// #ifndef MIPSINSTRUCTIONINFO_H @@ -47,20 +51,20 @@ public: static const MipsInstrInfo *create(MipsTargetMachine &TM); /// Branch Analysis - virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify) const; + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify) const override; - virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; - virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl &Cond, - DebugLoc DL) const; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl &Cond, + DebugLoc DL) const override; - virtual - bool ReverseBranchCondition(SmallVectorImpl &Cond) const; + bool + ReverseBranchCondition(SmallVectorImpl &Cond) const override; BranchType AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, @@ -69,8 +73,8 @@ public: SmallVectorImpl &BranchInstrs) const; /// Insert nop instruction when hazard condition is found - virtual void insertNoop(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const; + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should @@ -83,19 +87,19 @@ public: /// Return the number of bytes of code the specified instruction may be. unsigned GetInstSizeInBytes(const MachineInstr *MI) const; - virtual void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override { storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, TRI, 0); } - virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override { loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, TRI, 0); } diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 07c37d8..0d3cb75 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -146,26 +146,40 @@ def MipsSDR : SDNode<"MipsISD::SDR", SDTStore, //===----------------------------------------------------------------------===// // Mips Instruction Predicate Definitions. 
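The FIXME at the top of MipsInstrInfo.h only asks for a correct upper bound on inline-asm size, never an exact byte count. One conventional way to get such a bound is to count statement separators and multiply by a worst-case per-statement size. The helper below is a sketch under that assumption; its name, the separator set, and the premise that MaxInstSizeInBytes is chosen large enough to cover assembler macro expansion are all assumptions, not the LLVM API:

#include <cstdio>

// Count asm statements by separators and multiply by the largest possible
// per-statement expansion. Over-counting is fine for MipsLongBranch.
static unsigned inlineAsmLengthUpperBound(const char *AsmStr,
                                          unsigned MaxInstSizeInBytes) {
  unsigned Insts = 0;
  bool InStmt = false;
  for (const char *P = AsmStr; *P; ++P) {
    if (*P == '\n' || *P == ';') {
      Insts += InStmt;
      InStmt = false;
    } else if (*P != ' ' && *P != '\t') {
      InStmt = true;
    }
  }
  Insts += InStmt;
  return Insts * MaxInstSizeInBytes;
}

int main() {
  // Two statements; with a 4-byte worst case the bound is 8 bytes.
  std::printf("%u\n", inlineAsmLengthUpperBound("addu $1, $2, $3\nnop", 4));
  return 0;
}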
//===----------------------------------------------------------------------===// -def HasSEInReg : Predicate<"Subtarget.hasSEInReg()">, - AssemblerPredicate<"FeatureSEInReg">; -def HasBitCount : Predicate<"Subtarget.hasBitCount()">, - AssemblerPredicate<"FeatureBitCount">; -def HasSwap : Predicate<"Subtarget.hasSwap()">, - AssemblerPredicate<"FeatureSwap">; -def HasCondMov : Predicate<"Subtarget.hasCondMov()">, - AssemblerPredicate<"FeatureCondMov">; -def HasFPIdx : Predicate<"Subtarget.hasFPIdx()">, - AssemblerPredicate<"FeatureFPIdx">; +def HasMips2 : Predicate<"Subtarget.hasMips2()">, + AssemblerPredicate<"FeatureMips2">; +def HasMips3_32 : Predicate<"Subtarget.hasMips3_32()">, + AssemblerPredicate<"FeatureMips3_32">; +def HasMips3_32r2 : Predicate<"Subtarget.hasMips3_32r2()">, + AssemblerPredicate<"FeatureMips3_32r2">; +def HasMips3 : Predicate<"Subtarget.hasMips3()">, + AssemblerPredicate<"FeatureMips3">; +def HasMips4_32 : Predicate<"Subtarget.hasMips4_32()">, + AssemblerPredicate<"FeatureMips4_32">; +def HasMips4_32r2 : Predicate<"Subtarget.hasMips4_32r2()">, + AssemblerPredicate<"FeatureMips4_32r2">; +def HasMips5_32r2 : Predicate<"Subtarget.hasMips5_32r2()">, + AssemblerPredicate<"FeatureMips5_32r2">; def HasMips32 : Predicate<"Subtarget.hasMips32()">, AssemblerPredicate<"FeatureMips32">; def HasMips32r2 : Predicate<"Subtarget.hasMips32r2()">, AssemblerPredicate<"FeatureMips32r2">; +def HasMips32r6 : Predicate<"Subtarget.hasMips32r6()">, + AssemblerPredicate<"FeatureMips32r6">; +def NotMips32r6 : Predicate<"!Subtarget.hasMips32r6()">, + AssemblerPredicate<"!FeatureMips32r6">; +def IsGP64bit : Predicate<"Subtarget.isGP64bit()">, + AssemblerPredicate<"FeatureGP64Bit">; +def IsGP32bit : Predicate<"!Subtarget.isGP64bit()">, + AssemblerPredicate<"!FeatureGP64Bit">; def HasMips64 : Predicate<"Subtarget.hasMips64()">, AssemblerPredicate<"FeatureMips64">; -def NotMips64 : Predicate<"!Subtarget.hasMips64()">, - AssemblerPredicate<"!FeatureMips64">; def HasMips64r2 : Predicate<"Subtarget.hasMips64r2()">, AssemblerPredicate<"FeatureMips64r2">; +def HasMips64r6 : Predicate<"Subtarget.hasMips64r6()">, + AssemblerPredicate<"FeatureMips64r6">; +def NotMips64r6 : Predicate<"!Subtarget.hasMips64r6()">, + AssemblerPredicate<"!FeatureMips64r6">; def IsN64 : Predicate<"Subtarget.isABI_N64()">, AssemblerPredicate<"FeatureN64">; def InMips16Mode : Predicate<"Subtarget.inMips16Mode()">, @@ -176,8 +190,7 @@ def RelocStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">, AssemblerPredicate<"FeatureMips32">; def RelocPIC : Predicate<"TM.getRelocationModel() == Reloc::PIC_">, AssemblerPredicate<"FeatureMips32">; -def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">, - AssemblerPredicate<"FeatureMips32">; +def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">; def HasStdEnc : Predicate<"Subtarget.hasStandardEncoding()">, AssemblerPredicate<"!FeatureMips16">; def NotDSP : Predicate<"!Subtarget.hasDSP()">; @@ -189,9 +202,65 @@ def IsLE : Predicate<"Subtarget.isLittle()">; def IsBE : Predicate<"!Subtarget.isLittle()">; def IsNotNaCl : Predicate<"!Subtarget.isTargetNaCl()">; -class MipsPat : Pat { - let Predicates = [HasStdEnc]; +//===----------------------------------------------------------------------===// +// Mips GPR size adjectives. +// They are mutually exclusive. 
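The GPR_32/GPR_64 classes defined just below, like the FGR and ISA adjectives elsewhere in this patch, each populate one slice of PredicateControl (GPRPredicates, FGRPredicates, InsnPredicates, EncodingPredicates, AdditionalPredicates; all five names appear in this diff). The sketch below assumes PredicateControl simply concatenates the slices into the final predicate list, which is what would let orthogonal adjectives be mixed freely on one instruction:

#include <cstdio>
#include <initializer_list>
#include <string>
#include <vector>

// Assumed combination rule: the final list is the concatenation of the
// five orthogonal slices.
struct PredicateControlLike {
  std::vector<std::string> EncodingPredicates, GPRPredicates, FGRPredicates,
      InsnPredicates, AdditionalPredicates;
  std::vector<std::string> predicates() const {
    std::vector<std::string> All;
    for (const auto *Slice :
         {&EncodingPredicates, &GPRPredicates, &FGRPredicates,
          &InsnPredicates, &AdditionalPredicates})
      All.insert(All.end(), Slice->begin(), Slice->end());
    return All;
  }
};

int main() {
  PredicateControlLike LDC164;               // ldc1, 64-bit FGR variant
  LDC164.EncodingPredicates = {"HasStdEnc"}; // from InstSE
  LDC164.InsnPredicates = {"HasMips2"};      // ISA_MIPS2
  LDC164.FGRPredicates = {"IsFP64bit"};      // FGR_64
  for (const std::string &P : LDC164.predicates())
    std::printf("%s\n", P.c_str());
  return 0;
}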
+//===----------------------------------------------------------------------===// + +class GPR_32 { list GPRPredicates = [IsGP32bit]; } +class GPR_64 { list GPRPredicates = [IsGP64bit]; } + +//===----------------------------------------------------------------------===// +// Mips ISA/ASE membership and instruction group membership adjectives. +// They are mutually exclusive. +//===----------------------------------------------------------------------===// + +// FIXME: I'd prefer to use additive predicates to build the instruction sets +// but we are short on assembler feature bits at the moment. Using a +// subtractive predicate will hopefully keep us under the 32 predicate +// limit long enough to develop an alternative way to handle P1||P2 +// predicates. +class ISA_MIPS1_NOT_32R6_64R6 { + list InsnPredicates = [NotMips32r6, NotMips64r6]; +} +class ISA_MIPS2 { list InsnPredicates = [HasMips2]; } +class ISA_MIPS2_NOT_32R6_64R6 { + list InsnPredicates = [HasMips2, NotMips32r6, NotMips64r6]; +} +class ISA_MIPS3 { list InsnPredicates = [HasMips3]; } +class ISA_MIPS3_NOT_32R6_64R6 { + list InsnPredicates = [HasMips3, NotMips32r6, NotMips64r6]; } +class ISA_MIPS32 { list InsnPredicates = [HasMips32]; } +class ISA_MIPS32R2 { list InsnPredicates = [HasMips32r2]; } +class ISA_MIPS64 { list InsnPredicates = [HasMips64]; } +class ISA_MIPS64R2 { list InsnPredicates = [HasMips64r2]; } +class ISA_MIPS32R6 { list InsnPredicates = [HasMips32r6]; } +class ISA_MIPS64R6 { list InsnPredicates = [HasMips64r6]; } + +// The portions of MIPS-III that were also added to MIPS32 +class INSN_MIPS3_32 { list InsnPredicates = [HasMips3_32]; } + +// The portions of MIPS-III that were also added to MIPS32 +class INSN_MIPS3_32R2 { list InsnPredicates = [HasMips3_32r2]; } + +// The portions of MIPS-IV that were also added to MIPS32 +class INSN_MIPS4_32 { list InsnPredicates = [HasMips4_32]; } + +// The portions of MIPS-IV that were also added to MIPS32R2 +class INSN_MIPS4_32R2 { list InsnPredicates = [HasMips4_32r2]; } + +// The portions of MIPS-V that were also added to MIPS32R2 +class INSN_MIPS5_32R2 { list InsnPredicates = [HasMips5_32r2]; } + +//===----------------------------------------------------------------------===// + +class MipsPat : Pat, PredicateControl { + let EncodingPredicates = [HasStdEnc]; +} + +class MipsInstAlias : + InstAlias, PredicateControl; class IsCommutable { bit isCommutable = 1; @@ -265,6 +334,11 @@ def simm16 : Operand { let DecoderMethod= "DecodeSimm16"; } +def simm19_lsl2 : Operand { + let EncoderMethod = "getSimm19Lsl2Encoding"; + let DecoderMethod = "DecodeSimm19Lsl2"; +} + def simm20 : Operand { } @@ -284,6 +358,14 @@ def uimmz : Operand { } // Unsigned Operand +def uimm2 : Operand { + let PrintMethod = "printUnsignedImm"; +} + +def uimm3 : Operand { + let PrintMethod = "printUnsignedImm"; +} + def uimm5 : Operand { let PrintMethod = "printUnsignedImm"; } @@ -314,6 +396,10 @@ def InvertedImOperand : Operand { let ParserMatchClass = MipsInvertedImmoperand; } +def InvertedImOperand64 : Operand { + let ParserMatchClass = MipsInvertedImmoperand; +} + class mem_generic : Operand { let PrintMethod = "printMemOperand"; let MIOperandInfo = (ops ptr_rc, simm16); @@ -478,7 +564,9 @@ class shift_rotate_imm : InstSE<(outs RO:$rd), (ins RO:$rt, ImmOpnd:$shamt), !strconcat(opstr, "\t$rd, $rt, $shamt"), - [(set RO:$rd, (OpNode RO:$rt, PF:$shamt))], itin, FrmR, opstr>; + [(set RO:$rd, (OpNode RO:$rt, PF:$shamt))], itin, FrmR, opstr> { + let TwoOperandAliasConstraint = "$rt = $rd"; +} class 
shift_rotate_reg: @@ -590,7 +678,7 @@ class UncondBranch : let isTerminator = 1; let isBarrier = 1; let hasDelaySlot = 1; - let Predicates = [RelocPIC, HasStdEnc]; + let AdditionalPredicates = [RelocPIC]; let Defs = [AT]; } @@ -779,27 +867,22 @@ class EffectiveAddress : // Count Leading Ones/Zeros in Word class CountLeading0: InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"), - [(set RO:$rd, (ctlz RO:$rs))], II_CLZ, FrmR, opstr>, - Requires<[HasBitCount, HasStdEnc]>; + [(set RO:$rd, (ctlz RO:$rs))], II_CLZ, FrmR, opstr>; class CountLeading1: InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"), - [(set RO:$rd, (ctlz (not RO:$rs)))], II_CLO, FrmR, opstr>, - Requires<[HasBitCount, HasStdEnc]>; + [(set RO:$rd, (ctlz (not RO:$rs)))], II_CLO, FrmR, opstr>; // Sign Extend in Register. class SignExtInReg : InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), - [(set RO:$rd, (sext_inreg RO:$rt, vt))], itin, FrmR, opstr> { - let Predicates = [HasSEInReg, HasStdEnc]; -} + [(set RO:$rd, (sext_inreg RO:$rt, vt))], itin, FrmR, opstr>; // Subword Swap class SubwordSwap: InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [], NoItinerary, FrmR, opstr> { - let Predicates = [HasSwap, HasStdEnc]; let neverHasSideEffects = 1; } @@ -814,17 +897,14 @@ class ExtBase { - let Predicates = [HasMips32r2, HasStdEnc]; -} + FrmR, opstr>, ISA_MIPS32R2; class InsBase: InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ins:$size, RO:$src), !strconcat(opstr, " $rt, $rs, $pos, $size"), [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size, RO:$src))], - NoItinerary, FrmR, opstr> { - let Predicates = [HasMips32r2, HasStdEnc]; + NoItinerary, FrmR, opstr>, ISA_MIPS32R2 { let Constraints = "$src = $rt"; } @@ -915,6 +995,18 @@ let isPseudo = 1, isCodeGenOnly = 1 in { def STORE_ACC64 : Store<"", ACC64>; } +// We need these two pseudo instructions to avoid offset calculation for long +// branches. See the comment in file MipsLongBranch.cpp for detailed +// explanation. 
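The two LONG_BRANCH pseudos defined just below expand to lui of %hi($tgt - $baltgt) and addiu of %lo($tgt - $baltgt). Because addiu sign-extends its 16-bit immediate, %hi must absorb a carry whenever bit 15 of the offset is set; with that adjustment the lui/addiu pair reconstructs the 32-bit offset exactly. A runnable check of the arithmetic (the sample offset is illustrative):

#include <cstdint>
#include <cstdio>

// %hi with the addiu carry, and %lo as a sign-extended 16-bit value.
static uint32_t hi16(uint32_t X) {
  return ((X >> 16) + ((X >> 15) & 1)) & 0xFFFF;
}
static int32_t lo16(uint32_t X) { return (int16_t)(X & 0xFFFF); }

int main() {
  uint32_t Off = 0x00018FFC; // e.g. $tgt - $baltgt for one long branch
  // lui %hi ; addiu %lo reproduces the offset:
  uint32_t Rebuilt = (hi16(Off) << 16) + (uint32_t)lo16(Off);
  std::printf("off=0x%08x hi=0x%04x lo=%d rebuilt=0x%08x\n",
              Off, hi16(Off), lo16(Off), Rebuilt);
  return 0;
}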
+ +// Expands to: lui $dst, %hi($tgt - $baltgt) +def LONG_BRANCH_LUi : PseudoSE<(outs GPR32Opnd:$dst), + (ins brtarget:$tgt, brtarget:$baltgt), []>; + +// Expands to: addiu $dst, $src, %lo($tgt - $baltgt) +def LONG_BRANCH_ADDiu : PseudoSE<(outs GPR32Opnd:$dst), + (ins GPR32Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>; + //===----------------------------------------------------------------------===// // Instruction definition //===----------------------------------------------------------------------===// @@ -926,7 +1018,8 @@ let isPseudo = 1, isCodeGenOnly = 1 in { def ADDiu : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd, II_ADDIU, immSExt16, add>, ADDI_FM<0x9>, IsAsCheapAsAMove; -def ADDi : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd>, ADDI_FM<0x8>; +def ADDi : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd>, ADDI_FM<0x8>, + ISA_MIPS1_NOT_32R6_64R6; def SLTi : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>, SLTI_FM<0xa>; def SLTiu : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>, @@ -949,7 +1042,7 @@ def SUBu : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>, ADD_FM<0, 0x23>; let Defs = [HI0, LO0] in def MUL : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>, - ADD_FM<0x1c, 2>; + ADD_FM<0x1c, 2>, ISA_MIPS32; def ADD : MMRel, ArithLogicR<"add", GPR32Opnd>, ADD_FM<0, 0x20>; def SUB : MMRel, ArithLogicR<"sub", GPR32Opnd>, ADD_FM<0, 0x22>; def SLT : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>; @@ -977,12 +1070,11 @@ def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV, sra>, SRLV_FM<7, 0>; // Rotate Instructions -let Predicates = [HasMips32r2, HasStdEnc] in { - def ROTR : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR, rotr, - immZExt5>, SRA_FM<2, 1>; - def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV, rotr>, - SRLV_FM<6, 1>; -} +def ROTR : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR, rotr, + immZExt5>, + SRA_FM<2, 1>, ISA_MIPS32R2; +def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV, rotr>, + SRLV_FM<6, 1>, ISA_MIPS32R2; /// Load and Store Instructions /// aligned @@ -999,11 +1091,16 @@ def SH : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>; def SW : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>; /// load/store left/right -let Predicates = [NotInMicroMips] in { -def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, II_LWL>, LW_FM<0x22>; -def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, II_LWR>, LW_FM<0x26>; -def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, II_SWL>, LW_FM<0x2a>; -def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>; +let EncodingPredicates = [], // FIXME: Lack of HasStdEnc is probably a bug + AdditionalPredicates = [NotInMicroMips] in { +def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, II_LWL>, LW_FM<0x22>, + ISA_MIPS1_NOT_32R6_64R6; +def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, II_LWR>, LW_FM<0x26>, + ISA_MIPS1_NOT_32R6_64R6; +def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, II_SWL>, LW_FM<0x2a>, + ISA_MIPS1_NOT_32R6_64R6; +def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>, + ISA_MIPS1_NOT_32R6_64R6; } def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM; @@ -1014,34 +1111,41 @@ def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>; def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>; def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>; -def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>; -def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM<0x8>; -def TGEIU : 
MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM<0x9>; -def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM<0xa>; -def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM<0xb>; -def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>; +def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>, + ISA_MIPS2_NOT_32R6_64R6; +def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM<0x8>, + ISA_MIPS2_NOT_32R6_64R6; +def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM<0x9>, + ISA_MIPS2_NOT_32R6_64R6; +def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM<0xa>, + ISA_MIPS2_NOT_32R6_64R6; +def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM<0xb>, + ISA_MIPS2_NOT_32R6_64R6; +def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>, + ISA_MIPS2_NOT_32R6_64R6; def BREAK : MMRel, BRK_FT<"break">, BRK_FM<0xd>; def SYSCALL : MMRel, SYS_FT<"syscall">, SYS_FM<0xc>; def TRAP : TrapBase; -def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>; -def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>; +def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>, INSN_MIPS3_32; +def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>, ISA_MIPS32; -def EI : MMRel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>; -def DI : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>; +def EI : MMRel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>, ISA_MIPS32R2; +def DI : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>, ISA_MIPS32R2; -let Predicates = [NotInMicroMips] in { +let EncodingPredicates = [], // FIXME: Lack of HasStdEnc is probably a bug + AdditionalPredicates = [NotInMicroMips] in { def WAIT : WAIT_FT<"wait">, WAIT_FM; /// Load-linked, Store-conditional -def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>; -def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>; +def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, ISA_MIPS2; +def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, ISA_MIPS2; } /// Jump and Branch Instructions def J : MMRel, JumpFJ, FJ<2>, - Requires<[RelocStatic, HasStdEnc]>, IsBranch; + AdditionalRequires<[RelocStatic]>, IsBranch; def JR : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>; def BEQ : MMRel, CBranch<"beq", brtarget, seteq, GPR32Opnd>, BEQ_FM<4>; def BNE : MMRel, CBranch<"bne", brtarget, setne, GPR32Opnd>, BEQ_FM<5>; @@ -1056,7 +1160,7 @@ def BLTZ : MMRel, CBranchZero<"bltz", brtarget, setlt, GPR32Opnd>, def B : UncondBranch; def JAL : MMRel, JumpLink<"jal", calltarget>, FJ<3>; -let Predicates = [NotInMicroMips, HasStdEnc] in { +let AdditionalPredicates = [NotInMicroMips] in { def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM; def JALRPseudo : JumpLinkRegPseudo; } @@ -1102,21 +1206,24 @@ def UDIV : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>, def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>; def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>; -let Predicates = [NotInMicroMips] in { +let EncodingPredicates = [], // FIXME: Lack of HasStdEnc is probably a bug + AdditionalPredicates = [NotInMicroMips] in { def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>; def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>; } /// Sign Ext In Register Instructions. 
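TEQI and its siblings above pick up the subtractive ISA_MIPS2_NOT_32R6_64R6 class, per the earlier FIXME about the 32 assembler feature-bit budget: membership is phrased as "in MIPS II and not removed by MIPS32r6/MIPS64r6" instead of one additive bit per instruction set. A bitmask sketch of that test (the feature names reuse the patch's; the bit layout is illustrative):

#include <cstdio>

// Illustrative feature bits; only the shape of the test matters.
enum : unsigned {
  FeatureMips2 = 1u << 0,
  FeatureMips32r6 = 1u << 1,
  FeatureMips64r6 = 1u << 2,
};

// ISA_MIPS2_NOT_32R6_64R6 as a subtractive test: requires the base ISA
// plus the absence of the revisions that removed the instruction.
static bool isaMips2Not32r6_64r6(unsigned Bits) {
  return (Bits & FeatureMips2) &&
         !(Bits & (FeatureMips32r6 | FeatureMips64r6));
}

int main() {
  unsigned Mips32r2 = FeatureMips2;                   // older ISAs folded in
  unsigned Mips32r6 = FeatureMips2 | FeatureMips32r6; // r6 removes teqi etc.
  std::printf("teqi on mips32r2: %d\n", isaMips2Not32r6_64r6(Mips32r2)); // 1
  std::printf("teqi on mips32r6: %d\n", isaMips2Not32r6_64r6(Mips32r6)); // 0
  return 0;
}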
-def SEB : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>, SEB_FM<0x10, 0x20>; -def SEH : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>, SEB_FM<0x18, 0x20>; +def SEB : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>, + SEB_FM<0x10, 0x20>, ISA_MIPS32R2; +def SEH : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>, + SEB_FM<0x18, 0x20>, ISA_MIPS32R2; /// Count Leading -def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>; -def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>; +def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>, ISA_MIPS32; +def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>, ISA_MIPS32; /// Word Swap Bytes Within Halfwords -def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>; +def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>, ISA_MIPS32R2; /// No operation. def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>; @@ -1128,12 +1235,12 @@ def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>; def LEA_ADDiu : MMRel, EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>; // MADD*/MSUB* -def MADD : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>; -def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>; -def MSUB : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>; -def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>; +def MADD : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>, ISA_MIPS32; +def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>, ISA_MIPS32; +def MSUB : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>, ISA_MIPS32; +def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>, ISA_MIPS32; -let Predicates = [HasStdEnc, NotDSP] in { +let AdditionalPredicates = [NotDSP] in { def PseudoMULT : MultDivPseudo; def PseudoMULTu : MultDivPseudo; def PseudoMFHI : PseudoMFLOHI; @@ -1156,8 +1263,8 @@ def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, EXT_FM<0>; def INS : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM<4>; /// Move Control Registers From/To CPU Registers -def MFC0 : MFC3OP<"mfc0", GPR32Opnd>, MFC3OP_FM<0x10, 0>; -def MTC0 : MFC3OP<"mtc0", GPR32Opnd>, MFC3OP_FM<0x10, 4>; +def MFC0 : MFC3OP<"mfc0", GPR32Opnd>, MFC3OP_FM<0x10, 0>, ISA_MIPS32; +def MTC0 : MFC3OP<"mtc0", GPR32Opnd>, MFC3OP_FM<0x10, 4>, ISA_MIPS32; def MFC2 : MFC3OP<"mfc2", GPR32Opnd>, MFC3OP_FM<0x12, 0>; def MTC2 : MFC3OP<"mtc2", GPR32Opnd>, MFC3OP_FM<0x12, 4>; @@ -1165,67 +1272,94 @@ class Barrier : InstSE<(outs), (ins), asmstr, [], NoItinerary, FrmOther>; def SSNOP : Barrier<"ssnop">, BARRIER_FM<1>; def EHB : Barrier<"ehb">, BARRIER_FM<3>; -def PAUSE : Barrier<"pause">, BARRIER_FM<5>, Requires<[HasMips32r2]>; +def PAUSE : Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2; + +class TLB : InstSE<(outs), (ins), asmstr, [], NoItinerary, + FrmOther>; +def TLBP : TLB<"tlbp">, COP0_TLB_FM<0x08>; +def TLBR : TLB<"tlbr">, COP0_TLB_FM<0x01>; +def TLBWI : TLB<"tlbwi">, COP0_TLB_FM<0x02>; +def TLBWR : TLB<"tlbwr">, COP0_TLB_FM<0x06>; //===----------------------------------------------------------------------===// // Instruction aliases //===----------------------------------------------------------------------===// -def : InstAlias<"move $dst, $src", - (ADDu GPR32Opnd:$dst, GPR32Opnd:$src,ZERO), 1>, - Requires<[NotMips64, NotInMicroMips]>; -def : InstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>; -def : InstAlias<"addu $rs, $rt, $imm", - (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>; -def : 
InstAlias<"add $rs, $rt, $imm", - (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>; -def : InstAlias<"and $rs, $rt, $imm", - (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>; -def : InstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>; +def : MipsInstAlias<"move $dst, $src", + (ADDu GPR32Opnd:$dst, GPR32Opnd:$src,ZERO), 1>, + GPR_32 { + let AdditionalPredicates = [NotInMicroMips]; +} +def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>; +def : MipsInstAlias<"addu $rs, $rt, $imm", + (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>; +def : MipsInstAlias<"add $rs, $rt, $imm", + (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>; +def : MipsInstAlias<"and $rs, $rt, $imm", + (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>; +def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>; let Predicates = [NotInMicroMips] in { -def : InstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>; -} -def : InstAlias<"jal $rs", (JALR RA, GPR32Opnd:$rs), 0>; -def : InstAlias<"jal $rd,$rs", (JALR GPR32Opnd:$rd, GPR32Opnd:$rs), 0>; -def : InstAlias<"not $rt, $rs", - (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>; -def : InstAlias<"neg $rt, $rs", - (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>; -def : InstAlias<"negu $rt, $rs", - (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>; -def : InstAlias<"slt $rs, $rt, $imm", - (SLTi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>; -def : InstAlias<"xor $rs, $rt, $imm", - (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>; -def : InstAlias<"or $rs, $rt, $imm", - (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>; -def : InstAlias<"nop", (SLL ZERO, ZERO, 0), 1>; -def : InstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>; -def : InstAlias<"mtc0 $rt, $rd", (MTC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>; -def : InstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>; -def : InstAlias<"mtc2 $rt, $rd", (MTC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>; -def : InstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>; -def : InstAlias<"bnez $rs,$offset", - (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>; -def : InstAlias<"beqz $rs,$offset", - (BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>; -def : InstAlias<"syscall", (SYSCALL 0), 1>; - -def : InstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>; -def : InstAlias<"break", (BREAK 0, 0), 1>; -def : InstAlias<"ei", (EI ZERO), 1>; -def : InstAlias<"di", (DI ZERO), 1>; - -def : InstAlias<"teq $rs, $rt", (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; -def : InstAlias<"tge $rs, $rt", (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; -def : InstAlias<"tgeu $rs, $rt", (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; -def : InstAlias<"tlt $rs, $rt", (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; -def : InstAlias<"tltu $rs, $rt", (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; -def : InstAlias<"tne $rs, $rt", (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; -def : InstAlias<"sub, $rd, $rs, $imm", - (ADDi GPR32Opnd:$rd, GPR32Opnd:$rs, InvertedImOperand:$imm)>; -def : InstAlias<"subu, $rd, $rs, $imm", - (ADDiu GPR32Opnd:$rd, GPR32Opnd:$rs, InvertedImOperand:$imm)>; - +def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>; +} +def : MipsInstAlias<"jal $rs", (JALR RA, GPR32Opnd:$rs), 0>; +def : MipsInstAlias<"jal $rd,$rs", (JALR GPR32Opnd:$rd, GPR32Opnd:$rs), 0>; +def : MipsInstAlias<"not $rt, $rs", + (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>; +def : MipsInstAlias<"neg $rt, $rs", + (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>; +def : MipsInstAlias<"negu $rt", + (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 0>; +def : 
MipsInstAlias<"negu $rt, $rs", + (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>; +def : MipsInstAlias<"slt $rs, $rt, $imm", + (SLTi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>; +def : MipsInstAlias<"sltu $rt, $rs, $imm", + (SLTiu GPR32Opnd:$rt, GPR32Opnd:$rs, simm16:$imm), 0>; +def : MipsInstAlias<"xor $rs, $rt, $imm", + (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>; +def : MipsInstAlias<"or $rs, $rt, $imm", + (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>; +def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>; +def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>; +def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>; +def : MipsInstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>; +def : MipsInstAlias<"mtc2 $rt, $rd", (MTC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>; +def : MipsInstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>; +def : MipsInstAlias<"bnez $rs,$offset", + (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>; +def : MipsInstAlias<"beqz $rs,$offset", + (BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>; +def : MipsInstAlias<"syscall", (SYSCALL 0), 1>; + +def : MipsInstAlias<"break", (BREAK 0, 0), 1>; +def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>; +def : MipsInstAlias<"ei", (EI ZERO), 1>; +def : MipsInstAlias<"di", (DI ZERO), 1>; + +def : MipsInstAlias<"teq $rs, $rt", (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tge $rs, $rt", (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tgeu $rs, $rt", (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), + 1>; +def : MipsInstAlias<"tlt $rs, $rt", (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tltu $rs, $rt", (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), + 1>; +def : MipsInstAlias<"tne $rs, $rt", (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"sll $rd, $rt, $rs", + (SLLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>; +def : MipsInstAlias<"sub, $rd, $rs, $imm", + (ADDi GPR32Opnd:$rd, GPR32Opnd:$rs, + InvertedImOperand:$imm), 0>; +def : MipsInstAlias<"sub $rs, $imm", + (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, InvertedImOperand:$imm), + 0>; +def : MipsInstAlias<"subu, $rd, $rs, $imm", + (ADDiu GPR32Opnd:$rd, GPR32Opnd:$rs, + InvertedImOperand:$imm), 0>; +def : MipsInstAlias<"subu $rs, $imm", (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, + InvertedImOperand:$imm), 0>; +def : MipsInstAlias<"sra $rd, $rt, $rs", + (SRAV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>; +def : MipsInstAlias<"srl $rd, $rt, $rs", + (SRLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>; //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions //===----------------------------------------------------------------------===// @@ -1271,7 +1405,7 @@ def : MipsPat<(i32 imm:$imm), // Carry MipsPatterns def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs), (SUBu GPR32:$lhs, GPR32:$rhs)>; -let Predicates = [HasStdEnc, NotDSP] in { +let AdditionalPredicates = [NotDSP] in { def : MipsPat<(addc GPR32:$lhs, GPR32:$rhs), (ADDu GPR32:$lhs, GPR32:$rhs)>; def : MipsPat<(addc GPR32:$src, immSExt16:$imm), @@ -1340,14 +1474,11 @@ def : MipsPat<(not GPR32:$in), (NOR GPR32Opnd:$in, ZERO)>; // extended loads -let Predicates = [HasStdEnc] in { - def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>; - def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>; - def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>; -} +def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>; +def : 
MipsPat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>; +def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>; // peepholes -let Predicates = [HasStdEnc] in def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>; // brcond patterns @@ -1441,11 +1572,9 @@ def : MipsPat<(bswap GPR32:$rt), (ROTR (WSBH GPR32:$rt), 16)>; // Load halfword/word patterns. let AddedComplexity = 40 in { - let Predicates = [HasStdEnc] in { - def : LoadRegImmPat; - def : LoadRegImmPat; - def : LoadRegImmPat; - } + def : LoadRegImmPat; + def : LoadRegImmPat; + def : LoadRegImmPat; } //===----------------------------------------------------------------------===// @@ -1456,6 +1585,9 @@ include "MipsInstrFPU.td" include "Mips64InstrInfo.td" include "MipsCondMov.td" +include "Mips32r6InstrInfo.td" +include "Mips64r6InstrInfo.td" + // // Mips16 diff --git a/lib/Target/Mips/MipsJITInfo.cpp b/lib/Target/Mips/MipsJITInfo.cpp index d76cb1d..2072488 100644 --- a/lib/Target/Mips/MipsJITInfo.cpp +++ b/lib/Target/Mips/MipsJITInfo.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "jit" #include "MipsJITInfo.h" #include "MipsInstrInfo.h" #include "MipsRelocations.h" @@ -25,6 +24,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "jit" + void MipsJITInfo::replaceMachineCodeForFunction(void *Old, void *New) { unsigned NewAddr = (intptr_t)New; diff --git a/lib/Target/Mips/MipsJITInfo.h b/lib/Target/Mips/MipsJITInfo.h index ecda310..c9dfd83 100644 --- a/lib/Target/Mips/MipsJITInfo.h +++ b/lib/Target/Mips/MipsJITInfo.h @@ -37,26 +37,26 @@ class MipsJITInfo : public TargetJITInfo { /// overwriting OLD with a branch to NEW. This is used for self-modifying /// code. /// - virtual void replaceMachineCodeForFunction(void *Old, void *New); + void replaceMachineCodeForFunction(void *Old, void *New) override; // getStubLayout - Returns the size and alignment of the largest call stub // on Mips. - virtual StubLayout getStubLayout(); + StubLayout getStubLayout() override; /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a /// small native function that simply calls the function at the specified /// address. - virtual void *emitFunctionStub(const Function *F, void *Fn, - JITCodeEmitter &JCE); + void *emitFunctionStub(const Function *F, void *Fn, + JITCodeEmitter &JCE) override; /// getLazyResolverFunction - Expose the lazy resolver to the JIT. - virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn); + LazyResolverFn getLazyResolverFunction(JITCompilerFn) override; /// relocate - Before the JIT can run a block of code that has been emitted, /// it must rewrite the code to contain the actual addresses of any /// referenced global symbols. - virtual void relocate(void *Function, MachineRelocation *MR, - unsigned NumRelocs, unsigned char *GOTBase); + void relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char *GOTBase) override; /// Initialize - Initialize internal stage for the function being JITted. void Initialize(const MachineFunction &MF, bool isPIC, diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index 2b6a874..acfe76e 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -10,14 +10,9 @@ // This pass expands a branch or jump instruction into a long branch if its // offset is too large to fit into its immediate field. // -// FIXME: -// 1. Fix pc-region jump instructions which cross 256MB segment boundaries. -// 2. 
If program has inline assembly statements whose size cannot be -// determined accurately, load branch target addresses from the GOT. +// FIXME: Fix pc-region jump instructions which cross 256MB segment boundaries. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mips-long-branch" - #include "Mips.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "MipsTargetMachine.h" @@ -33,6 +28,8 @@ using namespace llvm; +#define DEBUG_TYPE "mips-long-branch" + STATISTIC(LongBranches, "Number of long branches."); static cl::opt SkipLongBranch( @@ -56,7 +53,7 @@ namespace { bool HasLongBranch; MachineInstr *Br; - MBBInfo() : Size(0), HasLongBranch(false), Br(0) {} + MBBInfo() : Size(0), HasLongBranch(false), Br(nullptr) {} }; class MipsLongBranch : public MachineFunctionPass { @@ -67,13 +64,13 @@ namespace { : MachineFunctionPass(ID), TM(tm), IsPIC(TM.getRelocationModel() == Reloc::PIC_), ABI(TM.getSubtarget().getTargetABI()), - LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 13 : 9)) {} + LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 10 : 9)) {} - virtual const char *getPassName() const { + const char *getPassName() const override { return "Mips Long Branch"; } - bool runOnMachineFunction(MachineFunction &F); + bool runOnMachineFunction(MachineFunction &F) override; private: void splitMBB(MachineBasicBlock *MBB); @@ -111,7 +108,7 @@ static MachineBasicBlock *getTargetMBB(const MachineInstr &Br) { } assert(false && "This instruction does not have an MBB operand."); - return 0; + return nullptr; } // Traverse the list of instructions backwards until a non-debug instruction is @@ -267,20 +264,14 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { LongBrMBB->addSuccessor(BalTgtMBB); BalTgtMBB->addSuccessor(TgtMBB); - int64_t TgtAddress = MBBInfos[TgtMBB->getNumber()].Address; - unsigned BalTgtMBBSize = 5; - int64_t Offset = TgtAddress - (I.Address + I.Size - BalTgtMBBSize * 4); - int64_t Lo = SignExtend64<16>(Offset & 0xffff); - int64_t Hi = SignExtend64<16>(((Offset + 0x8000) >> 16) & 0xffff); - if (ABI != MipsSubtarget::N64) { // $longbr: // addiu $sp, $sp, -8 // sw $ra, 0($sp) - // bal $baltgt // lui $at, %hi($tgt - $baltgt) - // $baltgt: + // bal $baltgt // addiu $at, $at, %lo($tgt - $baltgt) + // $baltgt: // addu $at, $ra, $at // lw $ra, 0($sp) // jr $at @@ -295,14 +286,31 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SW)).addReg(Mips::RA) .addReg(Mips::SP).addImm(0); + // LUi and ADDiu instructions create 32-bit offset of the target basic + // block from the target of BAL instruction. We cannot use immediate + // value for this offset because it cannot be determined accurately when + // the program has inline assembly statements. We therefore use the + // relocation expressions %hi($tgt-$baltgt) and %lo($tgt-$baltgt) which + // are resolved during the fixup, so the values will always be correct. + // + // Since we cannot create %hi($tgt-$baltgt) and %lo($tgt-$baltgt) + // expressions at this point (it is possible only at the MC layer), + // we replace LUi and ADDiu with pseudo instructions + // LONG_BRANCH_LUi and LONG_BRANCH_ADDiu, and add both basic + // blocks as operands to these instructions. When lowering these pseudo + // instructions to LUi and ADDiu in the MC layer, we will create + // %hi($tgt-$baltgt) and %lo($tgt-$baltgt) expressions and add them as + // operands to lowered instructions. 
+ + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi), Mips::AT) + .addMBB(TgtMBB).addMBB(BalTgtMBB); MIBundleBuilder(*LongBrMBB, Pos) .append(BuildMI(*MF, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB)) - .append(BuildMI(*MF, DL, TII->get(Mips::LUi), Mips::AT).addImm(Hi)); + .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT) + .addReg(Mips::AT).addMBB(TgtMBB).addMBB(BalTgtMBB)); Pos = BalTgtMBB->begin(); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::AT) - .addReg(Mips::AT).addImm(Lo); BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDu), Mips::AT) .addReg(Mips::RA).addReg(Mips::AT); BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA) @@ -316,14 +324,11 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { // $longbr: // daddiu $sp, $sp, -16 // sd $ra, 0($sp) - // lui64 $at, %highest($tgt - $baltgt) - // daddiu $at, $at, %higher($tgt - $baltgt) + // daddiu $at, $zero, %hi($tgt - $baltgt) // dsll $at, $at, 16 - // daddiu $at, $at, %hi($tgt - $baltgt) // bal $baltgt - // dsll $at, $at, 16 - // $baltgt: // daddiu $at, $at, %lo($tgt - $baltgt) + // $baltgt: // daddu $at, $ra, $at // ld $ra, 0($sp) // jr64 $at @@ -331,9 +336,20 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { // $fallthrough: // - int64_t Higher = SignExtend64<16>(((Offset + 0x80008000) >> 32) & 0xffff); - int64_t Highest = - SignExtend64<16>(((Offset + 0x800080008000LL) >> 48) & 0xffff); + // We assume the branch is within-function, and that offset is within + // +/- 2GB. High 32 bits will therefore always be zero. + + // Note that this will work even if the offset is negative, because + // of the +1 modification that's added in that case. For example, if the + // offset is -1MB (0xFFFFFFFFFFF00000), the computation for %higher is + // + // 0xFFFFFFFFFFF00000 + 0x80008000 = 0x000000007FF08000 + // + // and the bits [47:32] are zero. For %highest + // + // 0xFFFFFFFFFFF00000 + 0x800080008000 = 0x000080007FF08000 + // + // and the bits [63:48] are zero. 
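[Editorial sketch, not part of the patch: the carry-adjusted decomposition described in the two comments above is easy to verify in isolation. The standalone C++ program below recomputes the %lo/%hi/%higher/%highest pieces for the -1MB example from the comment, checks the two worked sums, and reassembles the offset; signExtend16 is a local stand-in for the SignExtend64<16> helper used elsewhere in this file.]

    #include <cassert>
    #include <cstdint>

    // Stand-in for LLVM's SignExtend64<16>: low 16 bits as a signed value.
    static int64_t signExtend16(uint64_t V) { return (int16_t)(V & 0xffff); }

    int main() {
      // The -1MB offset used in the comment above.
      uint64_t Offset = 0xFFFFFFFFFFF00000ULL;

      // The two worked sums from the comment.
      assert(Offset + 0x80008000ULL == 0x000000007FF08000ULL);     // bits [47:32] zero
      assert(Offset + 0x800080008000ULL == 0x000080007FF08000ULL); // bits [63:48] zero

      // Carry-adjusted 16-bit pieces, as the %lo/%hi/%higher/%highest fixups
      // compute them.
      int64_t Lo      = signExtend16(Offset);
      int64_t Hi      = signExtend16((Offset + 0x8000) >> 16);
      int64_t Higher  = signExtend16((Offset + 0x80008000ULL) >> 32);
      int64_t Highest = signExtend16((Offset + 0x800080008000ULL) >> 48);

      // For this offset (and any realistic in-function branch distance) the
      // top two pieces vanish, which is why the new N64 sequence only
      // materializes %hi and %lo.
      assert(Higher == 0 && Highest == 0);

      // Reassembling all four pieces recovers the original offset exactly.
      uint64_t Reassembled = ((uint64_t)Highest << 48) + ((uint64_t)Higher << 32) +
                             ((uint64_t)Hi << 16) + (uint64_t)Lo;
      assert(Reassembled == Offset);
      return 0;
    }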
Pos = LongBrMBB->begin(); @@ -341,24 +357,21 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { .addReg(Mips::SP_64).addImm(-16); BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD)).addReg(Mips::RA_64) .addReg(Mips::SP_64).addImm(0); - BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LUi64), Mips::AT_64) - .addImm(Highest); - BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64) - .addReg(Mips::AT_64).addImm(Higher); + BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu), + Mips::AT_64).addReg(Mips::ZERO_64) + .addMBB(TgtMBB, MipsII::MO_ABS_HI).addMBB(BalTgtMBB); BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64) .addReg(Mips::AT_64).addImm(16); - BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64) - .addReg(Mips::AT_64).addImm(Hi); MIBundleBuilder(*LongBrMBB, Pos) .append(BuildMI(*MF, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB)) - .append(BuildMI(*MF, DL, TII->get(Mips::DSLL), Mips::AT_64) - .addReg(Mips::AT_64).addImm(16)); + .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_DADDiu), + Mips::AT_64).addReg(Mips::AT_64) + .addMBB(TgtMBB, MipsII::MO_ABS_LO) + .addMBB(BalTgtMBB)); Pos = BalTgtMBB->begin(); - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64) - .addReg(Mips::AT_64).addImm(Lo); BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDu), Mips::AT_64) .addReg(Mips::RA_64).addReg(Mips::AT_64); BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LD), Mips::RA_64) @@ -370,8 +383,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { .addReg(Mips::SP_64).addImm(16)); } - assert(BalTgtMBBSize == BalTgtMBB->size()); - assert(LongBrMBB->size() + BalTgtMBBSize == LongBranchSeqSize); + assert(LongBrMBB->size() + BalTgtMBB->size() == LongBranchSeqSize); } else { // $longbr: // j $tgt diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp index 7c9a9ed..821392e 100644 --- a/lib/Target/Mips/MipsMCInstLower.cpp +++ b/lib/Target/Mips/MipsMCInstLower.cpp @@ -151,7 +151,75 @@ MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO, return MCOperand(); } +MCOperand MipsMCInstLower::createSub(MachineBasicBlock *BB1, + MachineBasicBlock *BB2, + MCSymbolRefExpr::VariantKind Kind) const { + const MCSymbolRefExpr *Sym1 = MCSymbolRefExpr::Create(BB1->getSymbol(), *Ctx); + const MCSymbolRefExpr *Sym2 = MCSymbolRefExpr::Create(BB2->getSymbol(), *Ctx); + const MCBinaryExpr *Sub = MCBinaryExpr::CreateSub(Sym1, Sym2, *Ctx); + + return MCOperand::CreateExpr(MipsMCExpr::Create(Kind, Sub, *Ctx)); +} + +void MipsMCInstLower:: +lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(Mips::LUi); + + // Lower register operand. + OutMI.addOperand(LowerOperand(MI->getOperand(0))); + + // Create %hi($tgt-$baltgt). + OutMI.addOperand(createSub(MI->getOperand(1).getMBB(), + MI->getOperand(2).getMBB(), + MCSymbolRefExpr::VK_Mips_ABS_HI)); +} + +void MipsMCInstLower:: +lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI, int Opcode, + MCSymbolRefExpr::VariantKind Kind) const { + OutMI.setOpcode(Opcode); + + // Lower two register operands. + for (unsigned I = 0, E = 2; I != E; ++I) { + const MachineOperand &MO = MI->getOperand(I); + OutMI.addOperand(LowerOperand(MO)); + } + + // Create %lo($tgt-$baltgt) or %hi($tgt-$baltgt). 
+ OutMI.addOperand(createSub(MI->getOperand(2).getMBB(), + MI->getOperand(3).getMBB(), Kind)); +} + +bool MipsMCInstLower::lowerLongBranch(const MachineInstr *MI, + MCInst &OutMI) const { + switch (MI->getOpcode()) { + default: + return false; + case Mips::LONG_BRANCH_LUi: + lowerLongBranchLUi(MI, OutMI); + return true; + case Mips::LONG_BRANCH_ADDiu: + lowerLongBranchADDiu(MI, OutMI, Mips::ADDiu, + MCSymbolRefExpr::VK_Mips_ABS_LO); + return true; + case Mips::LONG_BRANCH_DADDiu: + unsigned TargetFlags = MI->getOperand(2).getTargetFlags(); + if (TargetFlags == MipsII::MO_ABS_HI) + lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu, + MCSymbolRefExpr::VK_Mips_ABS_HI); + else if (TargetFlags == MipsII::MO_ABS_LO) + lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu, + MCSymbolRefExpr::VK_Mips_ABS_LO); + else + report_fatal_error("Unexpected flags for LONG_BRANCH_DADDiu"); + return true; + } +} + void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { + if (lowerLongBranch(MI, OutMI)) + return; + OutMI.setOpcode(MI->getOpcode()); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h index 4570bd9..269190f 100644 --- a/lib/Target/Mips/MipsMCInstLower.h +++ b/lib/Target/Mips/MipsMCInstLower.h @@ -9,6 +9,7 @@ #ifndef MIPSMCINSTLOWER_H #define MIPSMCINSTLOWER_H +#include "MCTargetDesc/MipsMCExpr.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/Support/Compiler.h" @@ -36,6 +37,13 @@ public: private: MCOperand LowerSymbolOperand(const MachineOperand &MO, MachineOperandType MOTy, unsigned Offset) const; + MCOperand createSub(MachineBasicBlock *BB1, MachineBasicBlock *BB2, + MCSymbolRefExpr::VariantKind Kind) const; + void lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const; + void lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI, + int Opcode, + MCSymbolRefExpr::VariantKind Kind) const; + bool lowerLongBranch(const MachineInstr *MI, MCInst &OutMI) const; }; } diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td index 5722c6c..285bb14 100644 --- a/lib/Target/Mips/MipsMSAInstrInfo.td +++ b/lib/Target/Mips/MipsMSAInstrInfo.td @@ -65,10 +65,6 @@ def MipsVExtractZExt : SDNode<"MipsISD::VEXTRACT_ZEXT_ELT", // Operands -def uimm2 : Operand { - let PrintMethod = "printUnsignedImm"; -} - // The immediate of an LSA instruction needs special handling // as the encoded value should be subtracted by one. 
def uimm2LSAAsmOperand : AsmOperandClass { @@ -84,10 +80,6 @@ def LSAImm : Operand { let ParserMatchClass = uimm2LSAAsmOperand; } -def uimm3 : Operand { - let PrintMethod = "printUnsignedImm8"; -} - def uimm4 : Operand { let PrintMethod = "printUnsignedImm8"; } @@ -1505,6 +1497,15 @@ class MSA_INSERT_PSEUDO_BASE : + MSAPseudo<(outs ROWD:$wd), (ins ROWD:$wd_in, GPR32Opnd:$n, ROFS:$fs), + [(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs, + GPR32Opnd:$n))]> { + bit usesCustomInserter = 1; + string Constraints = "$wd = $wd_in"; +} + class MSA_INSVE_DESC_BASE { @@ -2300,11 +2301,25 @@ class INSERT_W_DESC : MSA_INSERT_DESC_BASE<"insert.w", vinsert_v4i32, class INSERT_D_DESC : MSA_INSERT_DESC_BASE<"insert.d", vinsert_v2i64, MSA128DOpnd, GPR64Opnd>; +class INSERT_B_VIDX_PSEUDO_DESC : + MSA_INSERT_VIDX_PSEUDO_BASE; +class INSERT_H_VIDX_PSEUDO_DESC : + MSA_INSERT_VIDX_PSEUDO_BASE; +class INSERT_W_VIDX_PSEUDO_DESC : + MSA_INSERT_VIDX_PSEUDO_BASE; +class INSERT_D_VIDX_PSEUDO_DESC : + MSA_INSERT_VIDX_PSEUDO_BASE; + class INSERT_FW_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE; class INSERT_FD_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE; +class INSERT_FW_VIDX_PSEUDO_DESC : + MSA_INSERT_VIDX_PSEUDO_BASE; +class INSERT_FD_VIDX_PSEUDO_DESC : + MSA_INSERT_VIDX_PSEUDO_BASE; + class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, MSA128BOpnd>; class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, @@ -3214,6 +3229,13 @@ let DecoderMethod = "DecodeINSVE_DF" in { def INSERT_FW_PSEUDO : INSERT_FW_PSEUDO_DESC; def INSERT_FD_PSEUDO : INSERT_FD_PSEUDO_DESC; +def INSERT_B_VIDX_PSEUDO : INSERT_B_VIDX_PSEUDO_DESC; +def INSERT_H_VIDX_PSEUDO : INSERT_H_VIDX_PSEUDO_DESC; +def INSERT_W_VIDX_PSEUDO : INSERT_W_VIDX_PSEUDO_DESC; +def INSERT_D_VIDX_PSEUDO : INSERT_D_VIDX_PSEUDO_DESC; +def INSERT_FW_VIDX_PSEUDO : INSERT_FW_VIDX_PSEUDO_DESC; +def INSERT_FD_VIDX_PSEUDO : INSERT_FD_VIDX_PSEUDO_DESC; + def LD_B: LD_B_ENC, LD_B_DESC; def LD_H: LD_H_ENC, LD_H_DESC; def LD_W: LD_W_ENC, LD_W_DESC; @@ -3731,3 +3753,55 @@ def SZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE; def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE; + +// Vector extraction with variable index +def : MSAPat<(i32 (vextract_sext_i8 v16i8:$ws, i32:$idx)), + (SRA (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_B v16i8:$ws, + i32:$idx), + sub_lo)), + GPR32), (i32 24))>; +def : MSAPat<(i32 (vextract_sext_i16 v8i16:$ws, i32:$idx)), + (SRA (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_H v8i16:$ws, + i32:$idx), + sub_lo)), + GPR32), (i32 16))>; +def : MSAPat<(i32 (vextract_sext_i32 v4i32:$ws, i32:$idx)), + (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_W v4i32:$ws, + i32:$idx), + sub_lo)), + GPR32)>; +def : MSAPat<(i64 (vextract_sext_i64 v2i64:$ws, i32:$idx)), + (COPY_TO_REGCLASS (i64 (EXTRACT_SUBREG (SPLAT_D v2i64:$ws, + i32:$idx), + sub_64)), + GPR64), [HasMSA, IsGP64bit]>; + +def : MSAPat<(i32 (vextract_zext_i8 v16i8:$ws, i32:$idx)), + (SRL (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_B v16i8:$ws, + i32:$idx), + sub_lo)), + GPR32), (i32 24))>; +def : MSAPat<(i32 (vextract_zext_i16 v8i16:$ws, i32:$idx)), + (SRL (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_H v8i16:$ws, + i32:$idx), + sub_lo)), + GPR32), (i32 16))>; +def : MSAPat<(i32 (vextract_zext_i32 v4i32:$ws, i32:$idx)), + (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_W v4i32:$ws, + i32:$idx), + sub_lo)), + GPR32)>; +def : MSAPat<(i64 (vextract_zext_i64 v2i64:$ws, i32:$idx)), + (COPY_TO_REGCLASS (i64 (EXTRACT_SUBREG (SPLAT_D v2i64:$ws, + i32:$idx), + sub_64)), + GPR64), [HasMSA, IsGP64bit]>; + +def : MSAPat<(f32 
(vector_extract v4f32:$ws, i32:$idx)), + (f32 (EXTRACT_SUBREG (SPLAT_W v4f32:$ws, + i32:$idx), + sub_lo))>; +def : MSAPat<(f64 (vector_extract v2f64:$ws, i32:$idx)), + (f64 (EXTRACT_SUBREG (SPLAT_D v2f64:$ws, + i32:$idx), + sub_64))>; diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp index dedf802..e30302e 100644 --- a/lib/Target/Mips/MipsMachineFunction.cpp +++ b/lib/Target/Mips/MipsMachineFunction.cpp @@ -27,7 +27,7 @@ FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true), MipsCallEntry::MipsCallEntry(const StringRef &N) { #ifndef NDEBUG Name = N; - Val = 0; + Val = nullptr; #endif } @@ -65,9 +65,8 @@ MipsFunctionInfo::~MipsFunctionInfo() { ++I) delete I->getValue(); - for (ValueMap::iterator - I = GlobalCallEntries.begin(), E = GlobalCallEntries.end(); I != E; ++I) - delete I->second; + for (const auto &Entry : GlobalCallEntries) + delete Entry.second; } bool MipsFunctionInfo::globalBaseRegSet() const { diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h index 3e14c8c..e9101cc 100644 --- a/lib/Target/Mips/MipsMachineFunction.h +++ b/lib/Target/Mips/MipsMachineFunction.h @@ -37,12 +37,12 @@ class MipsCallEntry : public PseudoSourceValue { public: explicit MipsCallEntry(const StringRef &N); explicit MipsCallEntry(const GlobalValue *V); - virtual bool isConstant(const MachineFrameInfo *) const; - virtual bool isAliased(const MachineFrameInfo *) const; - virtual bool mayAlias(const MachineFrameInfo *) const; + bool isConstant(const MachineFrameInfo *) const override; + bool isAliased(const MachineFrameInfo *) const override; + bool mayAlias(const MachineFrameInfo *) const override; private: - virtual void printCustom(raw_ostream &O) const; + void printCustom(raw_ostream &O) const override; #ifndef NDEBUG std::string Name; const GlobalValue *Val; diff --git a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp index c6abf17..03c76ea 100644 --- a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp @@ -14,6 +14,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#define DEBUG_TYPE "mips-isel" + namespace llvm { bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { diff --git a/lib/Target/Mips/MipsModuleISelDAGToDAG.h b/lib/Target/Mips/MipsModuleISelDAGToDAG.h index fda35ae..a96862a 100644 --- a/lib/Target/Mips/MipsModuleISelDAGToDAG.h +++ b/lib/Target/Mips/MipsModuleISelDAGToDAG.h @@ -41,15 +41,11 @@ public: TM(TM_), Subtarget(TM.getSubtarget()) {} // Pass Name - virtual const char *getPassName() const { + const char *getPassName() const override { return "MIPS DAG->DAG Pattern Instruction Selection"; } - virtual bool runOnMachineFunction(MachineFunction &MF); - - virtual SDNode *Select(SDNode *N) { - llvm_unreachable("unexpected"); - } + bool runOnMachineFunction(MachineFunction &MF) override; protected: /// Keep a pointer to the MipsSubtarget around so that we can make the right diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp index db270f3..c234049 100644 --- a/lib/Target/Mips/MipsOptimizePICCall.cpp +++ b/lib/Target/Mips/MipsOptimizePICCall.cpp @@ -12,8 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "optimize-mips-pic-call" - #include "Mips.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "MipsMachineFunction.h" @@ -25,6 +23,8 @@ using namespace llvm; +#define 
DEBUG_TYPE "optimize-mips-pic-call" + static cl::opt LoadTargetFromGOT("mips-load-target-from-got", cl::init(true), cl::desc("Load target address from GOT"), @@ -35,11 +35,13 @@ static cl::opt EraseGPOpnd("mips-erase-gp-opnd", cl::Hidden); namespace { +typedef PointerUnion ValueType; + typedef std::pair CntRegP; typedef RecyclingAllocator > + ScopedHashTableVal > AllocatorTy; -typedef ScopedHashTable, +typedef ScopedHashTable, AllocatorTy> ScopedHTType; class MBBInfo { @@ -59,11 +61,11 @@ class OptimizePICCall : public MachineFunctionPass { public: OptimizePICCall(TargetMachine &tm) : MachineFunctionPass(ID) {} - virtual const char *getPassName() const { return "Mips OptimizePICCall"; } + const char *getPassName() const override { return "Mips OptimizePICCall"; } - bool runOnMachineFunction(MachineFunction &F); + bool runOnMachineFunction(MachineFunction &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -78,18 +80,18 @@ private: /// and the underlying object in Reg and Val respectively, if the function's /// address can be resolved lazily. bool isCallViaRegister(MachineInstr &MI, unsigned &Reg, - const Value *&Val) const; + ValueType &Val) const; /// \brief Return the number of instructions that dominate the current /// instruction and load the function address from object Entry. - unsigned getCount(const Value *Entry); + unsigned getCount(ValueType Entry); /// \brief Return the destination virtual register of the last instruction /// that loads from object Entry. - unsigned getReg(const Value *Entry); + unsigned getReg(ValueType Entry); /// \brief Update ScopedHT. - void incCntAndSetReg(const Value *Entry, unsigned Reg); + void incCntAndSetReg(ValueType Entry, unsigned Reg); ScopedHTType ScopedHT; static char ID; @@ -101,13 +103,13 @@ char OptimizePICCall::ID = 0; /// Return the first MachineOperand of MI if it is a used virtual register. static MachineOperand *getCallTargetRegOpnd(MachineInstr &MI) { if (MI.getNumOperands() == 0) - return 0; + return nullptr; MachineOperand &MO = MI.getOperand(0); if (!MO.isReg() || !MO.isUse() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) - return 0; + return nullptr; return &MO; } @@ -153,10 +155,10 @@ static void eraseGPOpnd(MachineInstr &MI) { } } - llvm_unreachable(0); + llvm_unreachable(nullptr); } -MBBInfo::MBBInfo(MachineDomTreeNode *N) : Node(N), HTScope(0) {} +MBBInfo::MBBInfo(MachineDomTreeNode *N) : Node(N), HTScope(nullptr) {} const MachineDomTreeNode *MBBInfo::getNode() const { return Node; } @@ -210,7 +212,7 @@ bool OptimizePICCall::visitNode(MBBInfo &MBBI) { for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ++I) { unsigned Reg; - const Value *Entry; + ValueType Entry; // Skip instructions that are not call instructions via registers. if (!isCallViaRegister(*I, Reg, Entry)) @@ -242,7 +244,7 @@ bool OptimizePICCall::visitNode(MBBInfo &MBBI) { } bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg, - const Value *&Val) const { + ValueType &Val) const { if (!MI.isCall()) return false; @@ -254,7 +256,7 @@ bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg, // Get the instruction that loads the function address from the GOT. 
Reg = MO->getReg(); - Val = 0; + Val = (Value*)nullptr; MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); MachineInstr *DefMI = MRI.getVRegDef(Reg); @@ -273,20 +275,22 @@ bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg, // Return the underlying object for the GOT entry in Val. assert(DefMI->hasOneMemOperand()); Val = (*DefMI->memoperands_begin())->getValue(); + if (!Val) + Val = (*DefMI->memoperands_begin())->getPseudoValue(); return true; } -unsigned OptimizePICCall::getCount(const Value *Entry) { +unsigned OptimizePICCall::getCount(ValueType Entry) { return ScopedHT.lookup(Entry).first; } -unsigned OptimizePICCall::getReg(const Value *Entry) { +unsigned OptimizePICCall::getReg(ValueType Entry) { unsigned Reg = ScopedHT.lookup(Entry).second; assert(Reg); return Reg; } -void OptimizePICCall::incCntAndSetReg(const Value *Entry, unsigned Reg) { +void OptimizePICCall::incCntAndSetReg(ValueType Entry, unsigned Reg) { CntRegP P = ScopedHT.lookup(Entry); ScopedHT.insert(Entry, std::make_pair(P.first + 1, Reg)); } diff --git a/lib/Target/Mips/MipsOs16.cpp b/lib/Target/Mips/MipsOs16.cpp index fe60841..7aae964 100644 --- a/lib/Target/Mips/MipsOs16.cpp +++ b/lib/Target/Mips/MipsOs16.cpp @@ -11,13 +11,14 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mips-os16" #include "MipsOs16.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#define DEBUG_TYPE "mips-os16" + static cl::opt Mips32FunctionMask( "mips32-function-mask", diff --git a/lib/Target/Mips/MipsOs16.h b/lib/Target/Mips/MipsOs16.h index 21beef8..55e5a81 100644 --- a/lib/Target/Mips/MipsOs16.h +++ b/lib/Target/Mips/MipsOs16.h @@ -34,11 +34,11 @@ public: } - virtual const char *getPassName() const { + const char *getPassName() const override { return "MIPS Os16 Optimization"; } - virtual bool runOnModule(Module &M); + bool runOnModule(Module &M) override; }; diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index d7fc93b..83d25ab 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -11,8 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mips-reg-info" - #include "MipsRegisterInfo.h" #include "Mips.h" #include "MipsAnalyzeImmediate.h" @@ -37,11 +35,13 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +using namespace llvm; + +#define DEBUG_TYPE "mips-reg-info" + #define GET_REGINFO_TARGET_DESC #include "MipsGenRegisterInfo.inc" -using namespace llvm; - MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST) : MipsGenRegisterInfo(Mips::RA), Subtarget(ST) {} @@ -79,8 +79,8 @@ MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, //===----------------------------------------------------------------------===// /// Mips Callee Saved Registers -const uint16_t* MipsRegisterInfo:: -getCalleeSavedRegs(const MachineFunction *MF) const { +const MCPhysReg * +MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (Subtarget.isSingleFloat()) return CSR_SingleFloatOnly_SaveList; @@ -119,11 +119,11 @@ const uint32_t *MipsRegisterInfo::getMips16RetHelperMask() { BitVector MipsRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { - static const uint16_t ReservedGPR32[] = { + static const MCPhysReg ReservedGPR32[] = { Mips::ZERO, Mips::K0, Mips::K1, 
Mips::SP }; - static const uint16_t ReservedGPR64[] = { + static const MCPhysReg ReservedGPR64[] = { Mips::ZERO_64, Mips::K0_64, Mips::K1_64, Mips::SP_64 }; diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h index 0450c6f..b34496f 100644 --- a/lib/Target/Mips/MipsRegisterInfo.h +++ b/lib/Target/Mips/MipsRegisterInfo.h @@ -43,30 +43,31 @@ public: /// Code Generation virtual methods... const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const; + unsigned Kind) const override; unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const; - const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const; - const uint32_t *getCallPreservedMask(CallingConv::ID) const; + MachineFunction &MF) const override; + const MCPhysReg * + getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; + const uint32_t *getCallPreservedMask(CallingConv::ID) const override; static const uint32_t *getMips16RetHelperMask(); - BitVector getReservedRegs(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; - virtual bool requiresRegisterScavenging(const MachineFunction &MF) const; + bool requiresRegisterScavenging(const MachineFunction &MF) const override; - virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const; + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override; /// Stack Frame Processing Methods void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS = NULL) const; + RegScavenger *RS = nullptr) const override; void processFunctionBeforeFrameFinalized(MachineFunction &MF, - RegScavenger *RS = NULL) const; + RegScavenger *RS = nullptr) const; /// Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const; + unsigned getFrameRegister(const MachineFunction &MF) const override; /// \brief Return GPR register class. virtual const TargetRegisterClass *intRegClass(unsigned Size) const = 0; diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index 834e6c5..875a596 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -205,6 +205,10 @@ let Namespace = "Mips" in { foreach I = 0-31 in def COP2#I : MipsReg<#I, ""#I>; + // COP3 registers. + foreach I = 0-31 in + def COP3#I : MipsReg<#I, ""#I>; + // PC register def PC : Register<"pc">; @@ -387,6 +391,10 @@ def DSPCC : RegisterClass<"Mips", [v4i8, v2i16], 32, (add DSPCCond)>; def COP2 : RegisterClass<"Mips", [i32], 32, (sequence "COP2%u", 0, 31)>, Unallocatable; +// Coprocessor 3 registers. 
+def COP3 : RegisterClass<"Mips", [i32], 32, (sequence "COP3%u", 0, 31)>, + Unallocatable; + // Octeon multiplier and product registers def OCTEON_MPL : RegisterClass<"Mips", [i64], 64, (add MPL0, MPL1, MPL2)>, Unallocatable; @@ -484,6 +492,10 @@ def COP2AsmOperand : MipsAsmRegOperand { let Name = "COP2AsmReg"; } +def COP3AsmOperand : MipsAsmRegOperand { + let Name = "COP3AsmReg"; +} + def HWRegsOpnd : RegisterOperand { let ParserMatchClass = HWRegsAsmOperand; } @@ -524,6 +536,10 @@ def COP2Opnd : RegisterOperand { let ParserMatchClass = COP2AsmOperand; } +def COP3Opnd : RegisterOperand { + let ParserMatchClass = COP3AsmOperand; +} + def MSA128BOpnd : RegisterOperand { let ParserMatchClass = MSA128AsmOperand; } diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp index 0343a47..6ad5821 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -375,7 +375,8 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const { // if framepointer enabled, set it to point to the stack pointer. if (hasFP(MF)) { // Insert instruction "move $fp, $sp" at this location. - BuildMI(MBB, MBBI, dl, TII.get(ADDu), FP).addReg(SP).addReg(ZERO); + BuildMI(MBB, MBBI, dl, TII.get(ADDu), FP).addReg(SP).addReg(ZERO) + .setMIFlag(MachineInstr::FrameSetup); // emit ".cfi_def_cfa_register $fp" unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister( diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h index 8fa9e46..5d2801f 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.h +++ b/lib/Target/Mips/MipsSEFrameLowering.h @@ -25,22 +25,22 @@ public: /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. 
- void emitPrologue(MachineFunction &MF) const; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + void emitPrologue(MachineFunction &MF) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo *TRI) const override; - bool hasReservedCallFrame(const MachineFunction &MF) const; + bool hasReservedCallFrame(const MachineFunction &MF) const override; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const; + RegScavenger *RS) const override; unsigned ehDataReg(unsigned I) const; }; diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 5b20a6c..d5385be 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mips-isel" #include "MipsSEISelDAGToDAG.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "Mips.h" @@ -35,6 +34,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "mips-isel" + bool MipsSEDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { if (Subtarget.inMips16Mode()) return false; @@ -412,7 +413,7 @@ bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const { BuildVectorSDNode *Node = dyn_cast(N); - if (Node == NULL) + if (!Node) return false; APInt SplatValue, SplatUndef; @@ -813,16 +814,16 @@ std::pair MipsSEDAGToDAGISel::selectNode(SDNode *Node) { EVT ViaVecTy; if (!Subtarget.hasMSA() || !BVN->getValueType(0).is128BitVector()) - return std::make_pair(false, (SDNode*)NULL); + return std::make_pair(false, nullptr); if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, 8, !Subtarget.isLittle())) - return std::make_pair(false, (SDNode*)NULL); + return std::make_pair(false, nullptr); switch (SplatBitSize) { default: - return std::make_pair(false, (SDNode*)NULL); + return std::make_pair(false, nullptr); case 8: LdiOp = Mips::LDI_B; ViaVecTy = MVT::v16i8; @@ -842,7 +843,7 @@ std::pair MipsSEDAGToDAGISel::selectNode(SDNode *Node) { } if (!SplatValue.isSignedIntN(10)) - return std::make_pair(false, (SDNode*)NULL); + return std::make_pair(false, nullptr); SDValue Imm = CurDAG->getTargetConstant(SplatValue, ViaVecTy.getVectorElementType()); @@ -868,7 +869,7 @@ std::pair MipsSEDAGToDAGISel::selectNode(SDNode *Node) { } - return std::make_pair(false, (SDNode*)NULL); + return std::make_pair(false, nullptr); } FunctionPass *llvm::createMipsSEISelDag(MipsTargetMachine &TM) { diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h index ba84a6d..57328d2 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.h +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -25,7 +25,7 @@ public: private: - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; void addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI, MachineFunction &MF); @@ -44,66 +44,66 @@ private: bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset, unsigned OffsetBits) const; - virtual bool 
selectAddrRegImm(SDValue Addr, SDValue &Base, - SDValue &Offset) const; + bool selectAddrRegImm(SDValue Addr, SDValue &Base, + SDValue &Offset) const override; - virtual bool selectAddrRegReg(SDValue Addr, SDValue &Base, - SDValue &Offset) const; + bool selectAddrRegReg(SDValue Addr, SDValue &Base, + SDValue &Offset) const override; - virtual bool selectAddrDefault(SDValue Addr, SDValue &Base, - SDValue &Offset) const; + bool selectAddrDefault(SDValue Addr, SDValue &Base, + SDValue &Offset) const override; - virtual bool selectIntAddr(SDValue Addr, SDValue &Base, - SDValue &Offset) const; + bool selectIntAddr(SDValue Addr, SDValue &Base, + SDValue &Offset) const override; - virtual bool selectAddrRegImm10(SDValue Addr, SDValue &Base, - SDValue &Offset) const; + bool selectAddrRegImm10(SDValue Addr, SDValue &Base, + SDValue &Offset) const; - virtual bool selectAddrRegImm12(SDValue Addr, SDValue &Base, - SDValue &Offset) const; + bool selectAddrRegImm12(SDValue Addr, SDValue &Base, + SDValue &Offset) const; - virtual bool selectIntAddrMM(SDValue Addr, SDValue &Base, - SDValue &Offset) const; + bool selectIntAddrMM(SDValue Addr, SDValue &Base, + SDValue &Offset) const override; - virtual bool selectIntAddrMSA(SDValue Addr, SDValue &Base, - SDValue &Offset) const; + bool selectIntAddrMSA(SDValue Addr, SDValue &Base, + SDValue &Offset) const override; /// \brief Select constant vector splats. - virtual bool selectVSplat(SDNode *N, APInt &Imm) const; + bool selectVSplat(SDNode *N, APInt &Imm) const override; /// \brief Select constant vector splats whose value fits in a given integer. - virtual bool selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed, + bool selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed, unsigned ImmBitSize) const; /// \brief Select constant vector splats whose value fits in a uimm1. - virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const; + bool selectVSplatUimm1(SDValue N, SDValue &Imm) const override; /// \brief Select constant vector splats whose value fits in a uimm2. - virtual bool selectVSplatUimm2(SDValue N, SDValue &Imm) const; + bool selectVSplatUimm2(SDValue N, SDValue &Imm) const override; /// \brief Select constant vector splats whose value fits in a uimm3. - virtual bool selectVSplatUimm3(SDValue N, SDValue &Imm) const; + bool selectVSplatUimm3(SDValue N, SDValue &Imm) const override; /// \brief Select constant vector splats whose value fits in a uimm4. - virtual bool selectVSplatUimm4(SDValue N, SDValue &Imm) const; + bool selectVSplatUimm4(SDValue N, SDValue &Imm) const override; /// \brief Select constant vector splats whose value fits in a uimm5. - virtual bool selectVSplatUimm5(SDValue N, SDValue &Imm) const; + bool selectVSplatUimm5(SDValue N, SDValue &Imm) const override; /// \brief Select constant vector splats whose value fits in a uimm6. - virtual bool selectVSplatUimm6(SDValue N, SDValue &Imm) const; + bool selectVSplatUimm6(SDValue N, SDValue &Imm) const override; /// \brief Select constant vector splats whose value fits in a uimm8. - virtual bool selectVSplatUimm8(SDValue N, SDValue &Imm) const; + bool selectVSplatUimm8(SDValue N, SDValue &Imm) const override; /// \brief Select constant vector splats whose value fits in a simm5. - virtual bool selectVSplatSimm5(SDValue N, SDValue &Imm) const; + bool selectVSplatSimm5(SDValue N, SDValue &Imm) const override; /// \brief Select constant vector splats whose value is a power of 2. 
- virtual bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const; + bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const override; /// \brief Select constant vector splats whose value is the inverse of a /// power of 2. - virtual bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const; + bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const override; /// \brief Select constant vector splats whose value is a run of set bits /// ending at the most significant bit - virtual bool selectVSplatMaskL(SDValue N, SDValue &Imm) const; + bool selectVSplatMaskL(SDValue N, SDValue &Imm) const override; /// \brief Select constant vector splats whose value is a run of set bits /// starting at bit zero. - virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const; + bool selectVSplatMaskR(SDValue N, SDValue &Imm) const override; - virtual std::pair selectNode(SDNode *Node); + std::pair selectNode(SDNode *Node) override; - virtual void processFunctionAfterISel(MachineFunction &MF); + void processFunctionAfterISel(MachineFunction &MF) override; // Insert instructions to initialize the global base register in the // first MBB of the function. diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index 0dac0b7..969d730 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -10,7 +10,6 @@ // Subclass of MipsTargetLowering specialized for mips32/64. // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mips-isel" #include "MipsSEISelLowering.h" #include "MipsRegisterInfo.h" #include "MipsTargetMachine.h" @@ -24,6 +23,8 @@ using namespace llvm; +#define DEBUG_TYPE "mips-isel" + static cl::opt EnableMipsTailCalls("enable-mips-tail-calls", cl::Hidden, cl::desc("MIPS: Enable tail calls."), cl::init(false)); @@ -119,10 +120,10 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM) if (Subtarget->hasCnMips()) setOperationAction(ISD::MUL, MVT::i64, Legal); - else if (hasMips64()) + else if (isGP64bit()) setOperationAction(ISD::MUL, MVT::i64, Custom); - if (hasMips64()) { + if (isGP64bit()) { setOperationAction(ISD::MULHS, MVT::i64, Custom); setOperationAction(ISD::MULHU, MVT::i64, Custom); } @@ -253,6 +254,16 @@ MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const { MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy; + if (Subtarget->systemSupportsUnalignedAccess()) { + // MIPS32r6/MIPS64r6 is required to support unaligned access. It's + // implementation defined whether this is handled by hardware, software, or + // a hybrid of the two but it's expected that most implementations will + // handle the majority of cases in hardware. 
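[Editorial sketch, not part of the patch: the comment above says MIPS32r6/MIPS64r6 must accept unaligned accesses, so the hook succeeds unconditionally there. The mock below is a simplified, self-contained stand-in for that logic; the IsMips32r6OrMips64r6 flag is illustrative, replacing the real Subtarget->systemSupportsUnalignedAccess() query.]

    #include <cstdio>

    // Simplified stand-in for the target hook this hunk extends: on r6 the
    // query unconditionally succeeds and reports the access as fast.
    static bool allowsUnalignedMemoryAccesses(bool IsMips32r6OrMips64r6,
                                              bool *Fast) {
      if (IsMips32r6OrMips64r6) {
        if (Fast)
          *Fast = true;
        return true;
      }
      return false; // pre-R6: only i32/i64 qualify in the real hook
    }

    int main() {
      bool Fast = false;
      if (allowsUnalignedMemoryAccesses(/*IsMips32r6OrMips64r6=*/true, &Fast))
        std::printf("emit the plain misaligned access (fast=%d)\n", Fast);
      else
        std::printf("expand instead, e.g. into lwl/lwr pairs\n");
      return 0;
    }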
+ if (Fast) + *Fast = true; + return true; + } + switch (SVT) { case MVT::i64: case MVT::i32: @@ -487,7 +498,8 @@ static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG, Log2 == ExtendTySize) { SDValue Ops[] = { Op0->getOperand(0), Op0->getOperand(1), Op0Op2 }; DAG.MorphNodeTo(Op0.getNode(), MipsISD::VEXTRACT_ZEXT_ELT, - Op0->getVTList(), Ops, Op0->getNumOperands()); + Op0->getVTList(), + makeArrayRef(Ops, Op0->getNumOperands())); return Op0; } } @@ -507,7 +519,7 @@ static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG, static bool isVSplat(SDValue N, APInt &Imm, bool IsLittleEndian) { BuildVectorSDNode *Node = dyn_cast(N.getNode()); - if (Node == NULL) + if (!Node) return false; APInt SplatValue, SplatUndef; @@ -831,7 +843,8 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, SDValue Ops[] = { Op0Op0->getOperand(0), Op0Op0->getOperand(1), Op0Op0->getOperand(2) }; DAG.MorphNodeTo(Op0Op0.getNode(), MipsISD::VEXTRACT_SEXT_ELT, - Op0Op0->getVTList(), Ops, Op0Op0->getNumOperands()); + Op0Op0->getVTList(), + makeArrayRef(Ops, Op0Op0->getNumOperands())); return Op0Op0; } } @@ -1051,6 +1064,18 @@ MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return emitINSERT_FW(MI, BB); case Mips::INSERT_FD_PSEUDO: return emitINSERT_FD(MI, BB); + case Mips::INSERT_B_VIDX_PSEUDO: + return emitINSERT_DF_VIDX(MI, BB, 1, false); + case Mips::INSERT_H_VIDX_PSEUDO: + return emitINSERT_DF_VIDX(MI, BB, 2, false); + case Mips::INSERT_W_VIDX_PSEUDO: + return emitINSERT_DF_VIDX(MI, BB, 4, false); + case Mips::INSERT_D_VIDX_PSEUDO: + return emitINSERT_DF_VIDX(MI, BB, 8, false); + case Mips::INSERT_FW_VIDX_PSEUDO: + return emitINSERT_DF_VIDX(MI, BB, 4, true); + case Mips::INSERT_FD_VIDX_PSEUDO: + return emitINSERT_DF_VIDX(MI, BB, 8, true); case Mips::FILL_FW_PSEUDO: return emitFILL_FW(MI, BB); case Mips::FILL_FD_PSEUDO: @@ -1117,7 +1142,7 @@ SDValue MipsSETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue BP = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, Lo, Hi); SDValue Ops[2] = {BP, Hi.getValue(1)}; - return DAG.getMergeValues(Ops, 2, DL); + return DAG.getMergeValues(Ops, DL); } SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const { @@ -1168,7 +1193,7 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc, return HasLo ? Lo : Hi; SDValue Vals[] = { Lo, Hi }; - return DAG.getMergeValues(Vals, 2, DL); + return DAG.getMergeValues(Vals, DL); } @@ -1235,7 +1260,7 @@ static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) { ResTys.push_back((*I == MVT::i64) ? MVT::Untyped : *I); // Create node. - SDValue Val = DAG.getNode(Opc, DL, ResTys, &Ops[0], Ops.size()); + SDValue Val = DAG.getNode(Opc, DL, ResTys, Ops); SDValue Out = (ResTys[0] == MVT::Untyped) ? 
extractLOHI(Val, DL, DAG) : Val; if (!HasChainIn) @@ -1243,7 +1268,7 @@ static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) { assert(Val->getValueType(1) == MVT::Other); SDValue Vals[] = { Out, SDValue(Val.getNode(), 1) }; - return DAG.getMergeValues(Vals, 2, DL); + return DAG.getMergeValues(Vals, DL); } // Lower an MSA copy intrinsic into the specified SelectionDAG node @@ -1280,8 +1305,8 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) { SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB }; - SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, Ops, - ViaVecTy.getVectorNumElements()); + SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, + makeArrayRef(Ops, ViaVecTy.getVectorNumElements())); if (ViaVecTy != ResVecTy) Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result); @@ -1320,8 +1345,8 @@ static SDValue getBuildVectorSplat(EVT VecTy, SDValue SplatValue, SplatValueA, SplatValueB, SplatValueA, SplatValueB, SplatValueA, SplatValueB, SplatValueA, SplatValueB }; - SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, Ops, - ViaVecTy.getVectorNumElements()); + SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, + makeArrayRef(Ops, ViaVecTy.getVectorNumElements())); if (VecTy != ViaVecTy) Result = DAG.getNode(ISD::BITCAST, DL, VecTy, Result); @@ -1355,7 +1380,7 @@ static SDValue lowerMSABinaryBitImmIntr(SDValue Op, SelectionDAG &DAG, } } - if (Exp2Imm.getNode() == NULL) { + if (!Exp2Imm.getNode()) { // We couldnt constant fold, do a vector shift instead // Extend i32 to i64 if necessary. Sign or zero extend doesn't matter since @@ -1735,7 +1760,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, // If ResTy is v2i64 then the type legalizer will break this node down into // an equivalent v4i32. - return DAG.getNode(ISD::BUILD_VECTOR, DL, ResTy, &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, DL, ResTy, Ops); } case Intrinsic::mips_fexp2_w: case Intrinsic::mips_fexp2_d: { @@ -2560,8 +2585,7 @@ static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy, ++I) Ops.push_back(DAG.getTargetConstant(*I, MaskEltTy)); - SDValue MaskVec = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecTy, &Ops[0], - Ops.size()); + SDValue MaskVec = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecTy, Ops); if (Using1stVec && Using2ndVec) { Op0 = Op->getOperand(0); @@ -2885,6 +2909,131 @@ MipsSETargetLowering::emitINSERT_FD(MachineInstr *MI, return BB; } +// Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction. +// +// For integer: +// (INSERT_([BHWD]|F[WD])_PSEUDO $wd, $wd_in, $n, $rs) +// => +// (SLL $lanetmp1, $lane, +// (SUBREG_TO_REG $wt, $fs, ) +// (SLL $lanetmp1, $lane, getParent()->getRegInfo(); + DebugLoc DL = MI->getDebugLoc(); + unsigned Wd = MI->getOperand(0).getReg(); + unsigned SrcVecReg = MI->getOperand(1).getReg(); + unsigned LaneReg = MI->getOperand(2).getReg(); + unsigned SrcValReg = MI->getOperand(3).getReg(); + + const TargetRegisterClass *VecRC = nullptr; + const TargetRegisterClass *GPRRC = isGP64bit() ? 
&Mips::GPR64RegClass + : &Mips::GPR32RegClass; + unsigned EltLog2Size; + unsigned InsertOp = 0; + unsigned InsveOp = 0; + switch (EltSizeInBytes) { + default: + llvm_unreachable("Unexpected size"); + case 1: + EltLog2Size = 0; + InsertOp = Mips::INSERT_B; + InsveOp = Mips::INSVE_B; + VecRC = &Mips::MSA128BRegClass; + break; + case 2: + EltLog2Size = 1; + InsertOp = Mips::INSERT_H; + InsveOp = Mips::INSVE_H; + VecRC = &Mips::MSA128HRegClass; + break; + case 4: + EltLog2Size = 2; + InsertOp = Mips::INSERT_W; + InsveOp = Mips::INSVE_W; + VecRC = &Mips::MSA128WRegClass; + break; + case 8: + EltLog2Size = 3; + InsertOp = Mips::INSERT_D; + InsveOp = Mips::INSVE_D; + VecRC = &Mips::MSA128DRegClass; + break; + } + + if (IsFP) { + unsigned Wt = RegInfo.createVirtualRegister(VecRC); + BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt) + .addImm(0) + .addReg(SrcValReg) + .addImm(EltSizeInBytes == 8 ? Mips::sub_64 : Mips::sub_lo); + SrcValReg = Wt; + } + + // Convert the lane index into a byte index + if (EltSizeInBytes != 1) { + unsigned LaneTmp1 = RegInfo.createVirtualRegister(GPRRC); + BuildMI(*BB, MI, DL, TII->get(Mips::SLL), LaneTmp1) + .addReg(LaneReg) + .addImm(EltLog2Size); + LaneReg = LaneTmp1; + } + + // Rotate bytes around so that the desired lane is element zero + unsigned WdTmp1 = RegInfo.createVirtualRegister(VecRC); + BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), WdTmp1) + .addReg(SrcVecReg) + .addReg(SrcVecReg) + .addReg(LaneReg); + + unsigned WdTmp2 = RegInfo.createVirtualRegister(VecRC); + if (IsFP) { + // Use insve.df to insert to element zero + BuildMI(*BB, MI, DL, TII->get(InsveOp), WdTmp2) + .addReg(WdTmp1) + .addImm(0) + .addReg(SrcValReg) + .addImm(0); + } else { + // Use insert.df to insert to element zero + BuildMI(*BB, MI, DL, TII->get(InsertOp), WdTmp2) + .addReg(WdTmp1) + .addReg(SrcValReg) + .addImm(0); + } + + // Rotate elements the rest of the way for a full rotation. + // sld.df interprets $rt modulo the number of columns so we only need to negate + // the lane index to do this. + unsigned LaneTmp2 = RegInfo.createVirtualRegister(GPRRC); + BuildMI(*BB, MI, DL, TII->get(Mips::SUB), LaneTmp2) + .addReg(Mips::ZERO) + .addReg(LaneReg); + BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), Wd) + .addReg(WdTmp2) + .addReg(WdTmp2) + .addReg(LaneTmp2); + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + // Emit the FILL_FW pseudo instruction 
// // fill_fw_pseudo $wd, $fs diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h index 079fbf6..03a20ef 100644 --- a/lib/Target/Mips/MipsSEISelLowering.h +++ b/lib/Target/Mips/MipsSEISelLowering.h @@ -30,22 +30,23 @@ namespace llvm { void addMSAFloatType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC); - virtual bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS = 0, - bool *Fast = 0) const override; + bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS = 0, + bool *Fast = nullptr) const override; - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - virtual MachineBasicBlock * - EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const; + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const override; - virtual bool isShuffleMaskLegal(const SmallVectorImpl &Mask, - EVT VT) const { + bool isShuffleMaskLegal(const SmallVectorImpl &Mask, + EVT VT) const override { return false; } - virtual const TargetRegisterClass *getRepRegClassFor(MVT VT) const { + const TargetRegisterClass *getRepRegClassFor(MVT VT) const override { if (VT == MVT::Untyped) return Subtarget->hasDSP() ? &Mips::ACC64DSPRegClass : &Mips::ACC64RegClass; @@ -54,16 +55,16 @@ namespace llvm { } private: - virtual bool - isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo, - unsigned NextStackOffset, - const MipsFunctionInfo& FI) const; + bool isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo, + unsigned NextStackOffset, + const MipsFunctionInfo& FI) const override; - virtual void + void getOpndList(SmallVectorImpl &Ops, std::deque< std::pair > &RegsToPass, bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage, - CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const; + CallLoweringInfo &CLI, SDValue Callee, + SDValue Chain) const override; SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const; @@ -97,6 +98,11 @@ namespace llvm { /// \brief Emit the INSERT_FD pseudo instruction MachineBasicBlock *emitINSERT_FD(MachineInstr *MI, MachineBasicBlock *BB) const; + /// \brief Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction + MachineBasicBlock *emitINSERT_DF_VIDX(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned EltSizeInBytes, + bool IsFP) const; /// \brief Emit the FILL_FW pseudo instruction MachineBasicBlock *emitFILL_FW(MachineInstr *MI, MachineBasicBlock *BB) const; diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp index 094ee29..f6f364f 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -368,7 +368,7 @@ void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount, if (isInt<16>(Amount))// addi sp, sp, amount BuildMI(MBB, I, DL, get(ADDiu), SP).addReg(SP).addImm(Amount); else { // Expand immediate that doesn't fit in 16-bit. 
- unsigned Reg = loadImmediate(Amount, MBB, I, DL, 0); + unsigned Reg = loadImmediate(Amount, MBB, I, DL, nullptr); BuildMI(MBB, I, DL, get(ADDu), SP).addReg(SP).addReg(Reg, RegState::Kill); } } diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h index 6d2dd90..aa68552 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.h +++ b/lib/Target/Mips/MipsSEInstrInfo.h @@ -26,46 +26,46 @@ class MipsSEInstrInfo : public MipsInstrInfo { public: explicit MipsSEInstrInfo(MipsTargetMachine &TM); - virtual const MipsRegisterInfo &getRegisterInfo() const; + const MipsRegisterInfo &getRegisterInfo() const override; /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has /// any side effects other than loading from the stack slot. - virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const; + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; /// isStoreToStackSlot - If the specified machine instruction is a direct /// store to a stack slot, return the virtual or physical register number of /// the source reg along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has /// any side effects other than storing to the stack slot. - virtual unsigned isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const; + unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; - virtual void storeRegToStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, - int64_t Offset) const; + void storeRegToStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI, + int64_t Offset) const override; - virtual void loadRegFromStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, - int64_t Offset) const; + void loadRegFromStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI, + int64_t Offset) const override; - virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - virtual unsigned getOppositeBranchOpc(unsigned Opc) const; + unsigned getOppositeBranchOpc(unsigned Opc) const override; /// Adjust SP by Amount bytes. 
void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, @@ -79,7 +79,7 @@ public: unsigned *NewImm) const; private: - virtual unsigned getAnalyzableBrOpc(unsigned Opc) const; + unsigned getAnalyzableBrOpc(unsigned Opc) const override; void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned Opc) const; diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp index 2ac082f..0af1a6b 100644 --- a/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -39,6 +39,8 @@ using namespace llvm; +#define DEBUG_TYPE "mips-reg-info" + MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &ST) : MipsRegisterInfo(ST) {} @@ -187,7 +189,7 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, *static_cast( MBB.getParent()->getTarget().getInstrInfo()); unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL, - OffsetBitSize == 16 ? &NewImm : NULL); + OffsetBitSize == 16 ? &NewImm : nullptr); BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(FrameReg) .addReg(Reg, RegState::Kill); diff --git a/lib/Target/Mips/MipsSERegisterInfo.h b/lib/Target/Mips/MipsSERegisterInfo.h index 76cdd9d..f2f3a7e 100644 --- a/lib/Target/Mips/MipsSERegisterInfo.h +++ b/lib/Target/Mips/MipsSERegisterInfo.h @@ -24,16 +24,16 @@ class MipsSERegisterInfo : public MipsRegisterInfo { public: MipsSERegisterInfo(const MipsSubtarget &Subtarget); - bool requiresRegisterScavenging(const MachineFunction &MF) const; + bool requiresRegisterScavenging(const MachineFunction &MF) const override; - bool requiresFrameIndexScavenging(const MachineFunction &MF) const; + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; - virtual const TargetRegisterClass *intRegClass(unsigned Size) const; + const TargetRegisterClass *intRegClass(unsigned Size) const override; private: - virtual void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo, - int FrameIndex, uint64_t StackSize, - int64_t SPOffset) const; + void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo, + int FrameIndex, uint64_t StackSize, + int64_t SPOffset) const override; }; } // end namespace llvm diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.cpp b/lib/Target/Mips/MipsSelectionDAGInfo.cpp index e4d70fc..0d4398e 100644 --- a/lib/Target/Mips/MipsSelectionDAGInfo.cpp +++ b/lib/Target/Mips/MipsSelectionDAGInfo.cpp @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mips-selectiondag-info" #include "MipsTargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "mips-selectiondag-info" + MipsSelectionDAGInfo::MipsSelectionDAGInfo(const MipsTargetMachine &TM) : TargetSelectionDAGInfo(TM) { } diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index 143b945..74ec064 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -11,8 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mips-subtarget" - #include "MipsMachineFunction.h" #include "Mips.h" #include "MipsRegisterInfo.h" @@ -25,13 +23,14 @@ #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "mips-subtarget" + #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "MipsGenSubtargetInfo.inc" - -using namespace llvm; - // FIXME: Maybe this should be on by default when Mips16 is specified // 
static cl::opt Mixed16_32( @@ -77,17 +76,16 @@ void MipsSubtarget::anchor() { } MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, const std::string &FS, bool little, - Reloc::Model _RM, MipsTargetMachine *_TM) : - MipsGenSubtargetInfo(TT, CPU, FS), - MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little), - IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false), - HasCnMips(false), IsLinux(true), HasSEInReg(false), HasCondMov(false), - HasSwap(false), HasBitCount(false), HasFPIdx(false), - InMips16Mode(false), InMips16HardFloat(Mips16HardFloat), - InMicroMipsMode(false), HasDSP(false), HasDSPR2(false), - AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), HasMSA(false), - RM(_RM), OverrideMode(NoOverride), TM(_TM), TargetTriple(TT) -{ + Reloc::Model _RM, MipsTargetMachine *_TM) + : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(Mips32), + MipsABI(UnknownABI), IsLittle(little), IsSingleFloat(false), + IsFP64bit(false), IsNaN2008bit(false), IsGP64bit(false), HasVFPU(false), + HasCnMips(false), IsLinux(true), HasMips3_32(false), HasMips3_32r2(false), + HasMips4_32(false), HasMips4_32r2(false), HasMips5_32r2(false), + InMips16Mode(false), InMips16HardFloat(Mips16HardFloat), + InMicroMipsMode(false), HasDSP(false), HasDSPR2(false), + AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), HasMSA(false), + RM(_RM), OverrideMode(NoOverride), TM(_TM), TargetTriple(TT) { std::string CPUName = CPU; CPUName = selectMipsCPU(TT, CPUName); @@ -109,6 +107,19 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, // Initialize scheduling itinerary for the specified CPU. InstrItins = getInstrItineraryForCPU(CPUName); + // Don't even attempt to generate code for MIPS-I, MIPS-II, MIPS-III, and + // MIPS-V. They have not been tested and currently exist for the integrated + // assembler only. + if (MipsArchVersion == Mips1) + report_fatal_error("Code generation for MIPS-I is not implemented", false); + if (MipsArchVersion == Mips2) + report_fatal_error("Code generation for MIPS-II is not implemented", false); + if (MipsArchVersion == Mips3) + report_fatal_error("Code generation for MIPS-III is not implemented", + false); + if (MipsArchVersion == Mips5) + report_fatal_error("Code generation for MIPS-V is not implemented", false); + // Assert exactly one ABI was chosen. assert(MipsABI != UnknownABI); assert((((getFeatureBits() & Mips::FeatureO32) != 0) + @@ -126,15 +137,23 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, "See -mattr=+fp64.", false); + if (hasMips32r6()) { + StringRef ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6"; + + assert(isFP64bit()); + assert(isNaN2008()); + if (hasDSP()) + report_fatal_error(ISA + " is not compatible with the DSP ASE", false); + } + // Is the target system Linux ? if (TT.find("linux") == std::string::npos) IsLinux = false; // Set UseSmallSection. + // TODO: Investigate the IsLinux check. I suspect it's really checking for + // bare-metal. 
UseSmallSection = !IsLinux && (RM == Reloc::Static); - // set some subtarget specific features - if (inMips16Mode()) - HasBitCount=false; } bool diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index 2166b93..373f481 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -37,7 +37,10 @@ public: }; protected: - enum MipsArchEnum { Mips32, Mips32r2, Mips4, Mips64, Mips64r2 }; + enum MipsArchEnum { + Mips1, Mips2, Mips32, Mips32r2, Mips32r6, Mips3, Mips4, Mips5, Mips64, + Mips64r2, Mips64r6 + }; // Mips architecture version MipsArchEnum MipsArchVersion; @@ -56,6 +59,9 @@ protected: // IsFP64bit - The target processor has 64-bit floating point registers. bool IsFP64bit; + // IsNan2008 - IEEE 754-2008 NaN encoding. + bool IsNaN2008bit; + // IsFP64bit - General-purpose registers are 64 bits wide bool IsGP64bit; @@ -73,20 +79,20 @@ protected: /// Features related to the presence of specific instructions. - // HasSEInReg - SEB and SEH (signext in register) instructions. - bool HasSEInReg; + // HasMips3_32 - The subset of MIPS-III instructions added to MIPS32 + bool HasMips3_32; - // HasCondMov - Conditional mov (MOVZ, MOVN) instructions. - bool HasCondMov; + // HasMips3_32r2 - The subset of MIPS-III instructions added to MIPS32r2 + bool HasMips3_32r2; - // HasSwap - Byte and half swap instructions. - bool HasSwap; + // HasMips4_32 - Has the subset of MIPS-IV present in MIPS32 + bool HasMips4_32; - // HasBitCount - Count leading '1' and '0' bits. - bool HasBitCount; + // HasMips4_32r2 - Has the subset of MIPS-IV present in MIPS32r2 + bool HasMips4_32r2; - // HasFPIdx -- Floating point indexed load/store instructions. - bool HasFPIdx; + // HasMips5_32r2 - Has the subset of MIPS-V present in MIPS32r2 + bool HasMips5_32r2; // InMips16 -- can process Mips16 instructions bool InMips16Mode; @@ -127,9 +133,9 @@ protected: Triple TargetTriple; public: - virtual bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, - AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const; + bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, + AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const override; /// Only O32 and EABI supported right now. bool isABI_EABI() const { return MipsABI == EABI; } @@ -148,16 +154,24 @@ public: /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + bool hasMips2() const { return MipsArchVersion >= Mips2; } + bool hasMips3() const { return MipsArchVersion >= Mips3; } + bool hasMips4_32() const { return HasMips4_32; } + bool hasMips4_32r2() const { return HasMips4_32r2; } bool hasMips32() const { return MipsArchVersion >= Mips32; } bool hasMips32r2() const { return MipsArchVersion == Mips32r2 || MipsArchVersion == Mips64r2; } + bool hasMips32r6() const { return MipsArchVersion == Mips32r6 || + MipsArchVersion == Mips64r6; } bool hasMips64() const { return MipsArchVersion >= Mips64; } bool hasMips64r2() const { return MipsArchVersion == Mips64r2; } + bool hasMips64r6() const { return MipsArchVersion == Mips64r6; } bool hasCnMips() const { return HasCnMips; } bool isLittle() const { return IsLittle; } bool isFP64bit() const { return IsFP64bit; } + bool isNaN2008() const { return IsNaN2008bit; } bool isNotFP64bit() const { return !IsFP64bit; } bool isGP64bit() const { return IsGP64bit; } bool isGP32bit() const { return !IsGP64bit; } @@ -197,11 +211,6 @@ public: } /// Features related to the presence of specific instructions. 
-  bool hasSEInReg() const { return HasSEInReg; }
-  bool hasCondMov() const { return HasCondMov; }
-  bool hasSwap() const { return HasSwap; }
-  bool hasBitCount() const { return HasBitCount; }
-  bool hasFPIdx() const { return HasFPIdx; }
   bool hasExtractInsert() const { return !inMips16Mode() && hasMips32r2(); }

   const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
@@ -213,10 +222,9 @@ public:
   bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
   bool isNotTargetNaCl() const { return !TargetTriple.isOSNaCl(); }

-// for now constant islands are on for the whole compilation unit but we only
-// really use them if in addition we are in mips16 mode
-//
-static bool useConstantIslands();
+  // for now constant islands are on for the whole compilation unit but we only
+  // really use them if in addition we are in mips16 mode
+  static bool useConstantIslands();

   unsigned stackAlignment() const { return hasMips64() ? 16 : 8; }
@@ -226,7 +234,12 @@ static bool useConstantIslands();
   /// \brief Reset the subtarget for the Mips target.
   void resetSubtarget(MachineFunction *MF);
-
+  /// Does the system support unaligned memory access.
+  ///
+  /// MIPS32r6/MIPS64r6 require full unaligned access support but do not
+  /// specify which component of the system provides it. Hardware, software,
+  /// and hybrid implementations are all valid.
+  bool systemSupportsUnalignedAccess() const { return hasMips32r6(); }
 };
 } // End llvm namespace
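The new systemSupportsUnalignedAccess() hook deserves a small illustration. A standalone mock (hypothetical names, editorial illustration only; the patch does not show the caller) of how a lowering predicate can rely on it: when the ISA guarantees that unaligned accesses work somewhere in the system, codegen may emit them and report them as fast.

    #include <cassert>

    struct SubtargetMock {
      bool HasMips32r6;
      // MIPS32r6/MIPS64r6 require the system (hardware, a trap handler, or a
      // hybrid) to make unaligned accesses work, so codegen may emit them.
      bool systemSupportsUnalignedAccess() const { return HasMips32r6; }
    };

    // Mirrors the shape of allowsUnalignedMemoryAccesses: report "fast"
    // whenever the ISA guarantees support, otherwise stay conservative.
    static bool allowsUnaligned(const SubtargetMock &ST, bool *Fast) {
      if (ST.systemSupportsUnalignedAccess()) {
        if (Fast)
          *Fast = true;
        return true;
      }
      return false;
    }

    int main() {
      bool Fast = false;
      assert(allowsUnaligned(SubtargetMock{true}, &Fast) && Fast);
      assert(!allowsUnaligned(SubtargetMock{false}, nullptr));
      return 0;
    }

diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index e9053c8..984c58e 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -35,7 +35,7 @@
 #include "llvm/Transforms/Scalar.h"

 using namespace llvm;
-
+#define DEBUG_TYPE "mips"

 extern "C" void LLVMInitializeMipsTarget() {
   // Register the target.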
@@ -171,12 +171,12 @@ public: return *getMipsTargetMachine().getSubtargetImpl(); } - virtual void addIRPasses(); - virtual bool addInstSelector(); - virtual void addMachineSSAOptimization(); - virtual bool addPreEmitPass(); + void addIRPasses() override; + bool addInstSelector() override; + void addMachineSSAOptimization() override; + bool addPreEmitPass() override; - virtual bool addPreRegAlloc(); + bool addPreRegAlloc() override; }; } // namespace diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index 5a9a11d..a5aa39b 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -20,7 +20,6 @@ #include "MipsJITInfo.h" #include "MipsSelectionDAGInfo.h" #include "MipsSubtarget.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/DataLayout.h" @@ -34,15 +33,15 @@ class MipsRegisterInfo; class MipsTargetMachine : public LLVMTargetMachine { MipsSubtarget Subtarget; const DataLayout DL; // Calculates type size & alignment - OwningPtr InstrInfo; - OwningPtr FrameLowering; - OwningPtr TLInfo; - OwningPtr InstrInfo16; - OwningPtr FrameLowering16; - OwningPtr TLInfo16; - OwningPtr InstrInfoSE; - OwningPtr FrameLoweringSE; - OwningPtr TLInfoSE; + std::unique_ptr InstrInfo; + std::unique_ptr FrameLowering; + std::unique_ptr TLInfo; + std::unique_ptr InstrInfo16; + std::unique_ptr FrameLowering16; + std::unique_ptr TLInfo16; + std::unique_ptr InstrInfoSE; + std::unique_ptr FrameLoweringSE; + std::unique_ptr TLInfoSE; MipsSelectionDAGInfo TSInfo; const InstrItineraryData &InstrItins; MipsJITInfo JITInfo; @@ -56,39 +55,38 @@ public: virtual ~MipsTargetMachine() {} - virtual void addAnalysisPasses(PassManagerBase &PM); + void addAnalysisPasses(PassManagerBase &PM) override; - virtual const MipsInstrInfo *getInstrInfo() const + const MipsInstrInfo *getInstrInfo() const override { return InstrInfo.get(); } - virtual const TargetFrameLowering *getFrameLowering() const + const TargetFrameLowering *getFrameLowering() const override { return FrameLowering.get(); } - virtual const MipsSubtarget *getSubtargetImpl() const + const MipsSubtarget *getSubtargetImpl() const override { return &Subtarget; } - virtual const DataLayout *getDataLayout() const + const DataLayout *getDataLayout() const override { return &DL;} - virtual const InstrItineraryData *getInstrItineraryData() const { - return Subtarget.inMips16Mode() ? 0 : &InstrItins; + const InstrItineraryData *getInstrItineraryData() const override { + return Subtarget.inMips16Mode() ? 
nullptr : &InstrItins; } - virtual MipsJITInfo *getJITInfo() - { return &JITInfo; } + MipsJITInfo *getJITInfo() override { return &JITInfo; } - virtual const MipsRegisterInfo *getRegisterInfo() const { + const MipsRegisterInfo *getRegisterInfo() const override { return &InstrInfo->getRegisterInfo(); } - virtual const MipsTargetLowering *getTargetLowering() const { + const MipsTargetLowering *getTargetLowering() const override { return TLInfo.get(); } - virtual const MipsSelectionDAGInfo* getSelectionDAGInfo() const { + const MipsSelectionDAGInfo* getSelectionDAGInfo() const override { return &TSInfo; } // Pass Pipeline Configuration - virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); - virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE); + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override; // Set helper classes void setHelperClassesMips16(); diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h index 5f4b74b..4ad37ac 100644 --- a/lib/Target/Mips/MipsTargetStreamer.h +++ b/lib/Target/Mips/MipsTargetStreamer.h @@ -34,6 +34,8 @@ public: virtual void emitDirectiveEnt(const MCSymbol &Symbol) = 0; virtual void emitDirectiveAbiCalls() = 0; + virtual void emitDirectiveNaN2008() = 0; + virtual void emitDirectiveNaNLegacy() = 0; virtual void emitDirectiveOptionPic0() = 0; virtual void emitDirectiveOptionPic2() = 0; virtual void emitFrame(unsigned StackReg, unsigned StackSize, @@ -45,6 +47,11 @@ public: virtual void emitDirectiveSetMips64() = 0; virtual void emitDirectiveSetMips64R2() = 0; virtual void emitDirectiveSetDsp() = 0; + + // PIC support + virtual void emitDirectiveCpload(unsigned RegNo) = 0; + virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, + const MCSymbol &Sym, bool IsReg) = 0; }; // This part is for ascii assembly output @@ -53,32 +60,39 @@ class MipsTargetAsmStreamer : public MipsTargetStreamer { public: MipsTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); - virtual void emitDirectiveSetMicroMips(); - virtual void emitDirectiveSetNoMicroMips(); - virtual void emitDirectiveSetMips16(); - virtual void emitDirectiveSetNoMips16(); - - virtual void emitDirectiveSetReorder(); - virtual void emitDirectiveSetNoReorder(); - virtual void emitDirectiveSetMacro(); - virtual void emitDirectiveSetNoMacro(); - virtual void emitDirectiveSetAt(); - virtual void emitDirectiveSetNoAt(); - virtual void emitDirectiveEnd(StringRef Name); - - virtual void emitDirectiveEnt(const MCSymbol &Symbol); - virtual void emitDirectiveAbiCalls(); - virtual void emitDirectiveOptionPic0(); - virtual void emitDirectiveOptionPic2(); - virtual void emitFrame(unsigned StackReg, unsigned StackSize, - unsigned ReturnReg); - virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff); - virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff); - - virtual void emitDirectiveSetMips32R2(); - virtual void emitDirectiveSetMips64(); - virtual void emitDirectiveSetMips64R2(); - virtual void emitDirectiveSetDsp(); + void emitDirectiveSetMicroMips() override; + void emitDirectiveSetNoMicroMips() override; + void emitDirectiveSetMips16() override; + void emitDirectiveSetNoMips16() override; + + void emitDirectiveSetReorder() override; + void emitDirectiveSetNoReorder() override; + void emitDirectiveSetMacro() override; + void emitDirectiveSetNoMacro() override; + void emitDirectiveSetAt() override; + void emitDirectiveSetNoAt() 
override; + void emitDirectiveEnd(StringRef Name) override; + + void emitDirectiveEnt(const MCSymbol &Symbol) override; + void emitDirectiveAbiCalls() override; + void emitDirectiveNaN2008() override; + void emitDirectiveNaNLegacy() override; + void emitDirectiveOptionPic0() override; + void emitDirectiveOptionPic2() override; + void emitFrame(unsigned StackReg, unsigned StackSize, + unsigned ReturnReg) override; + void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) override; + void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override; + + void emitDirectiveSetMips32R2() override; + void emitDirectiveSetMips64() override; + void emitDirectiveSetMips64R2() override; + void emitDirectiveSetDsp() override; + + // PIC support + virtual void emitDirectiveCpload(unsigned RegNo); + void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, + const MCSymbol &Sym, bool IsReg) override; }; // This part is for ELF object output @@ -92,36 +106,48 @@ public: MCELFStreamer &getStreamer(); MipsTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI); - virtual void emitLabel(MCSymbol *Symbol) override; - virtual void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override; + void emitLabel(MCSymbol *Symbol) override; + void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override; void finish() override; - virtual void emitDirectiveSetMicroMips(); - virtual void emitDirectiveSetNoMicroMips(); - virtual void emitDirectiveSetMips16(); - virtual void emitDirectiveSetNoMips16(); - - virtual void emitDirectiveSetReorder(); - virtual void emitDirectiveSetNoReorder(); - virtual void emitDirectiveSetMacro(); - virtual void emitDirectiveSetNoMacro(); - virtual void emitDirectiveSetAt(); - virtual void emitDirectiveSetNoAt(); - virtual void emitDirectiveEnd(StringRef Name); - - virtual void emitDirectiveEnt(const MCSymbol &Symbol); - virtual void emitDirectiveAbiCalls(); - virtual void emitDirectiveOptionPic0(); - virtual void emitDirectiveOptionPic2(); - virtual void emitFrame(unsigned StackReg, unsigned StackSize, - unsigned ReturnReg); - virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff); - virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff); - - virtual void emitDirectiveSetMips32R2(); - virtual void emitDirectiveSetMips64(); - virtual void emitDirectiveSetMips64R2(); - virtual void emitDirectiveSetDsp(); + void emitDirectiveSetMicroMips() override; + void emitDirectiveSetNoMicroMips() override; + void emitDirectiveSetMips16() override; + void emitDirectiveSetNoMips16() override; + + void emitDirectiveSetReorder() override; + void emitDirectiveSetNoReorder() override; + void emitDirectiveSetMacro() override; + void emitDirectiveSetNoMacro() override; + void emitDirectiveSetAt() override; + void emitDirectiveSetNoAt() override; + void emitDirectiveEnd(StringRef Name) override; + + void emitDirectiveEnt(const MCSymbol &Symbol) override; + void emitDirectiveAbiCalls() override; + void emitDirectiveNaN2008() override; + void emitDirectiveNaNLegacy() override; + void emitDirectiveOptionPic0() override; + void emitDirectiveOptionPic2() override; + void emitFrame(unsigned StackReg, unsigned StackSize, + unsigned ReturnReg) override; + void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) override; + void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override; + + void emitDirectiveSetMips32R2() override; + void emitDirectiveSetMips64() override; + void emitDirectiveSetMips64R2() override; + void emitDirectiveSetDsp() override; + + // PIC 
support + virtual void emitDirectiveCpload(unsigned RegNo); + void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, + const MCSymbol &Sym, bool IsReg) override; + +protected: + bool isO32() const { return STI.getFeatureBits() & Mips::FeatureO32; } + bool isN32() const { return STI.getFeatureBits() & Mips::FeatureN32; } + bool isN64() const { return STI.getFeatureBits() & Mips::FeatureN64; } }; } #endif diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt index 029118a..4e35b18 100644 --- a/lib/Target/NVPTX/CMakeLists.txt +++ b/lib/Target/NVPTX/CMakeLists.txt @@ -9,6 +9,7 @@ tablegen(LLVM NVPTXGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(NVPTXCommonTableGen) set(NVPTXCodeGen_sources + NVPTXFavorNonGenericAddrSpaces.cpp NVPTXFrameLowering.cpp NVPTXInstrInfo.cpp NVPTXISelDAGToDAG.cpp @@ -26,6 +27,8 @@ set(NVPTXCodeGen_sources NVPTXAssignValidGlobalNames.cpp NVPTXPrologEpilogPass.cpp NVPTXMCExpr.cpp + NVPTXReplaceImageHandles.cpp + NVPTXImageOptimizer.cpp ) add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources}) diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp index cf165be..9618896 100644 --- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp +++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "InstPrinter/NVPTXInstPrinter.h" #include "MCTargetDesc/NVPTXBaseInfo.h" #include "NVPTX.h" @@ -25,6 +24,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "asm-printer" + #include "NVPTXGenAsmWriter.inc" diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h index 93029ae..1fb3c57 100644 --- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h +++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h @@ -27,8 +27,8 @@ public: NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; - virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot); + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) override; // Autogenerated by tblgen. 
void printInstruction(const MCInst *MI, raw_ostream &O); @@ -37,15 +37,15 @@ public: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printLdStCode(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier = 0); + raw_ostream &O, const char *Modifier = nullptr); void printMemOperand(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier = 0); + raw_ostream &O, const char *Modifier = nullptr); void printProtoIdent(const MCInst *MI, int OpNum, - raw_ostream &O, const char *Modifier = 0); + raw_ostream &O, const char *Modifier = nullptr); }; } diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h index edf4a80..ddb122f 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h @@ -43,14 +43,16 @@ enum PropertyAnnotation { PROPERTY_ISSAMPLER, PROPERTY_ISREADONLY_IMAGE_PARAM, PROPERTY_ISWRITEONLY_IMAGE_PARAM, + PROPERTY_ISREADWRITE_IMAGE_PARAM, PROPERTY_ISKERNEL_FUNCTION, PROPERTY_ALIGN, + PROPERTY_MANAGED, // last property PROPERTY_LAST }; -const unsigned AnnotationNameLen = 8; // length of each annotation name +const unsigned AnnotationNameLen = 9; // length of each annotation name const char PropertyAnnotationNames[PROPERTY_LAST + 1][AnnotationNameLen + 1] = { "maxntidx", // PROPERTY_MAXNTID_X "maxntidy", // PROPERTY_MAXNTID_Y @@ -64,8 +66,10 @@ const char PropertyAnnotationNames[PROPERTY_LAST + 1][AnnotationNameLen + 1] = { "sampler", // PROPERTY_ISSAMPLER "rdoimage", // PROPERTY_ISREADONLY_IMAGE_PARAM "wroimage", // PROPERTY_ISWRITEONLY_IMAGE_PARAM + "rdwrimage", // PROPERTY_ISREADWRITE_IMAGE_PARAM "kernel", // PROPERTY_ISKERNEL_FUNCTION "align", // PROPERTY_ALIGN + "managed", // PROPERTY_MANAGED // last property "proplast", // PROPERTY_LAST diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp index 3cf6e4b..158ca90 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp @@ -20,6 +20,8 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + #define GET_INSTRINFO_MC_DESC #include "NVPTXGenInstrInfo.inc" @@ -29,8 +31,6 @@ #define GET_REGINFO_MC_DESC #include "NVPTXGenRegisterInfo.inc" -using namespace llvm; - static MCInstrInfo *createNVPTXMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitNVPTXMCInstrInfo(X); @@ -66,7 +66,7 @@ static MCInstPrinter *createNVPTXMCInstPrinter(const Target &T, const MCSubtargetInfo &STI) { if (SyntaxVariant == 0) return new NVPTXInstPrinter(MAI, MII, MRI, STI); - return 0; + return nullptr; } // Force static initialization. 
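One detail worth spelling out from the NVPTXBaseInfo.h hunk above: AnnotationNameLen bounds the longest entry in PropertyAnnotationNames (each row of the 2-D table holds AnnotationNameLen characters plus a terminating NUL), and the new "rdwrimage" name is nine characters long, which is what forces the bump from 8 to 9. A standalone check (editorial illustration, mirroring only a few representative entries):

    #include <cassert>
    #include <cstring>

    int main() {
      const unsigned AnnotationNameLen = 9; // was 8 before "rdwrimage"
      // A few representative rows of PropertyAnnotationNames.
      const char *Names[] = {"maxntidx", "wroimage", "rdwrimage", "managed",
                             "proplast"};
      for (const char *N : Names)
        assert(std::strlen(N) <= AnnotationNameLen);
      assert(std::strlen("rdwrimage") == 9); // the entry that forced the bump
      return 0;
    }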
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h index 8cbdd47..e74c808 100644 --- a/lib/Target/NVPTX/NVPTX.h +++ b/lib/Target/NVPTX/NVPTX.h @@ -63,9 +63,12 @@ FunctionPass * createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel); ModulePass *createNVPTXAssignValidGlobalNamesPass(); ModulePass *createGenericToNVVMPass(); +FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass(); ModulePass *createNVVMReflectPass(); ModulePass *createNVVMReflectPass(const StringMap& Mapping); MachineFunctionPass *createNVPTXPrologEpilogPass(); +MachineFunctionPass *createNVPTXReplaceImageHandlesPass(); +FunctionPass *createNVPTXImageOptimizerPass(); bool isImageOrSamplerVal(const Value *, const Module *); diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/lib/Target/NVPTX/NVPTXAllocaHoisting.h index 22404b7..5b61068 100644 --- a/lib/Target/NVPTX/NVPTXAllocaHoisting.h +++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.h @@ -30,17 +30,17 @@ public: static char ID; // Pass ID NVPTXAllocaHoisting() : FunctionPass(ID) {} - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addPreserved("stack-protector"); AU.addPreserved(); } - virtual const char *getPassName() const { + const char *getPassName() const override { return "NVPTX specific alloca hoisting"; } - virtual bool runOnFunction(Function &function); + bool runOnFunction(Function &function) override; }; extern FunctionPass *createAllocaHoisting(); diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 97e2cc6..4ec575f 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -17,6 +17,7 @@ #include "MCTargetDesc/NVPTXMCAsmInfo.h" #include "NVPTX.h" #include "NVPTXInstrInfo.h" +#include "NVPTXMachineFunctionInfo.h" #include "NVPTXMCExpr.h" #include "NVPTXRegisterInfo.h" #include "NVPTXTargetMachine.h" @@ -131,7 +132,7 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) { return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx); const ConstantExpr *CE = dyn_cast(CV); - if (CE == 0) + if (!CE) llvm_unreachable("Unknown constant value to lower!"); switch (CE->getOpcode()) { @@ -149,9 +150,24 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) { raw_string_ostream OS(S); OS << "Unsupported expression in static initializer: "; CE->printAsOperand(OS, /*PrintType=*/ false, - !AP.MF ? 0 : AP.MF->getFunction()->getParent()); + !AP.MF ? nullptr : AP.MF->getFunction()->getParent()); report_fatal_error(OS.str()); } + case Instruction::AddrSpaceCast: { + // Strip any addrspace(1)->addrspace(0) addrspace casts. These will be + // handled by the generic() logic in the MCExpr printer + PointerType *DstTy = cast(CE->getType()); + PointerType *SrcTy = cast(CE->getOperand(0)->getType()); + if (SrcTy->getAddressSpace() == 1 && DstTy->getAddressSpace() == 0) { + return LowerConstant(cast(CE->getOperand(0)), AP); + } + std::string S; + raw_string_ostream OS(S); + OS << "Unsupported expression in static initializer: "; + CE->printAsOperand(OS, /*PrintType=*/ false, + !AP.MF ? 
nullptr : AP.MF->getFunction()->getParent()); + report_fatal_error(OS.str()); + } case Instruction::GetElementPtr: { const DataLayout &TD = *AP.TM.getDataLayout(); // Generate a symbolic expression for the byte address @@ -310,13 +326,279 @@ void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitToStreamer(OutStreamer, Inst); } +// Handle symbol backtracking for targets that do not support image handles +bool NVPTXAsmPrinter::lowerImageHandleOperand(const MachineInstr *MI, + unsigned OpNo, MCOperand &MCOp) { + const MachineOperand &MO = MI->getOperand(OpNo); + + switch (MI->getOpcode()) { + default: return false; + case NVPTX::TEX_1D_F32_I32: + case NVPTX::TEX_1D_F32_F32: + case NVPTX::TEX_1D_F32_F32_LEVEL: + case NVPTX::TEX_1D_F32_F32_GRAD: + case NVPTX::TEX_1D_I32_I32: + case NVPTX::TEX_1D_I32_F32: + case NVPTX::TEX_1D_I32_F32_LEVEL: + case NVPTX::TEX_1D_I32_F32_GRAD: + case NVPTX::TEX_1D_ARRAY_F32_I32: + case NVPTX::TEX_1D_ARRAY_F32_F32: + case NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL: + case NVPTX::TEX_1D_ARRAY_F32_F32_GRAD: + case NVPTX::TEX_1D_ARRAY_I32_I32: + case NVPTX::TEX_1D_ARRAY_I32_F32: + case NVPTX::TEX_1D_ARRAY_I32_F32_LEVEL: + case NVPTX::TEX_1D_ARRAY_I32_F32_GRAD: + case NVPTX::TEX_2D_F32_I32: + case NVPTX::TEX_2D_F32_F32: + case NVPTX::TEX_2D_F32_F32_LEVEL: + case NVPTX::TEX_2D_F32_F32_GRAD: + case NVPTX::TEX_2D_I32_I32: + case NVPTX::TEX_2D_I32_F32: + case NVPTX::TEX_2D_I32_F32_LEVEL: + case NVPTX::TEX_2D_I32_F32_GRAD: + case NVPTX::TEX_2D_ARRAY_F32_I32: + case NVPTX::TEX_2D_ARRAY_F32_F32: + case NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL: + case NVPTX::TEX_2D_ARRAY_F32_F32_GRAD: + case NVPTX::TEX_2D_ARRAY_I32_I32: + case NVPTX::TEX_2D_ARRAY_I32_F32: + case NVPTX::TEX_2D_ARRAY_I32_F32_LEVEL: + case NVPTX::TEX_2D_ARRAY_I32_F32_GRAD: + case NVPTX::TEX_3D_F32_I32: + case NVPTX::TEX_3D_F32_F32: + case NVPTX::TEX_3D_F32_F32_LEVEL: + case NVPTX::TEX_3D_F32_F32_GRAD: + case NVPTX::TEX_3D_I32_I32: + case NVPTX::TEX_3D_I32_F32: + case NVPTX::TEX_3D_I32_F32_LEVEL: + case NVPTX::TEX_3D_I32_F32_GRAD: + { + // This is a texture fetch, so operand 4 is a texref and operand 5 is + // a samplerref + if (OpNo == 4) { + lowerImageHandleSymbol(MO.getImm(), MCOp); + return true; + } + if (OpNo == 5) { + lowerImageHandleSymbol(MO.getImm(), MCOp); + return true; + } + + return false; + } + case NVPTX::SULD_1D_I8_TRAP: + case NVPTX::SULD_1D_I16_TRAP: + case NVPTX::SULD_1D_I32_TRAP: + case NVPTX::SULD_1D_ARRAY_I8_TRAP: + case NVPTX::SULD_1D_ARRAY_I16_TRAP: + case NVPTX::SULD_1D_ARRAY_I32_TRAP: + case NVPTX::SULD_2D_I8_TRAP: + case NVPTX::SULD_2D_I16_TRAP: + case NVPTX::SULD_2D_I32_TRAP: + case NVPTX::SULD_2D_ARRAY_I8_TRAP: + case NVPTX::SULD_2D_ARRAY_I16_TRAP: + case NVPTX::SULD_2D_ARRAY_I32_TRAP: + case NVPTX::SULD_3D_I8_TRAP: + case NVPTX::SULD_3D_I16_TRAP: + case NVPTX::SULD_3D_I32_TRAP: { + // This is a V1 surface load, so operand 1 is a surfref + if (OpNo == 1) { + lowerImageHandleSymbol(MO.getImm(), MCOp); + return true; + } + + return false; + } + case NVPTX::SULD_1D_V2I8_TRAP: + case NVPTX::SULD_1D_V2I16_TRAP: + case NVPTX::SULD_1D_V2I32_TRAP: + case NVPTX::SULD_1D_ARRAY_V2I8_TRAP: + case NVPTX::SULD_1D_ARRAY_V2I16_TRAP: + case NVPTX::SULD_1D_ARRAY_V2I32_TRAP: + case NVPTX::SULD_2D_V2I8_TRAP: + case NVPTX::SULD_2D_V2I16_TRAP: + case NVPTX::SULD_2D_V2I32_TRAP: + case NVPTX::SULD_2D_ARRAY_V2I8_TRAP: + case NVPTX::SULD_2D_ARRAY_V2I16_TRAP: + case NVPTX::SULD_2D_ARRAY_V2I32_TRAP: + case NVPTX::SULD_3D_V2I8_TRAP: + case NVPTX::SULD_3D_V2I16_TRAP: + case NVPTX::SULD_3D_V2I32_TRAP: { + // This is 
a V2 surface load, so operand 2 is a surfref + if (OpNo == 2) { + lowerImageHandleSymbol(MO.getImm(), MCOp); + return true; + } + + return false; + } + case NVPTX::SULD_1D_V4I8_TRAP: + case NVPTX::SULD_1D_V4I16_TRAP: + case NVPTX::SULD_1D_V4I32_TRAP: + case NVPTX::SULD_1D_ARRAY_V4I8_TRAP: + case NVPTX::SULD_1D_ARRAY_V4I16_TRAP: + case NVPTX::SULD_1D_ARRAY_V4I32_TRAP: + case NVPTX::SULD_2D_V4I8_TRAP: + case NVPTX::SULD_2D_V4I16_TRAP: + case NVPTX::SULD_2D_V4I32_TRAP: + case NVPTX::SULD_2D_ARRAY_V4I8_TRAP: + case NVPTX::SULD_2D_ARRAY_V4I16_TRAP: + case NVPTX::SULD_2D_ARRAY_V4I32_TRAP: + case NVPTX::SULD_3D_V4I8_TRAP: + case NVPTX::SULD_3D_V4I16_TRAP: + case NVPTX::SULD_3D_V4I32_TRAP: { + // This is a V4 surface load, so operand 4 is a surfref + if (OpNo == 4) { + lowerImageHandleSymbol(MO.getImm(), MCOp); + return true; + } + + return false; + } + case NVPTX::SUST_B_1D_B8_TRAP: + case NVPTX::SUST_B_1D_B16_TRAP: + case NVPTX::SUST_B_1D_B32_TRAP: + case NVPTX::SUST_B_1D_V2B8_TRAP: + case NVPTX::SUST_B_1D_V2B16_TRAP: + case NVPTX::SUST_B_1D_V2B32_TRAP: + case NVPTX::SUST_B_1D_V4B8_TRAP: + case NVPTX::SUST_B_1D_V4B16_TRAP: + case NVPTX::SUST_B_1D_V4B32_TRAP: + case NVPTX::SUST_B_1D_ARRAY_B8_TRAP: + case NVPTX::SUST_B_1D_ARRAY_B16_TRAP: + case NVPTX::SUST_B_1D_ARRAY_B32_TRAP: + case NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP: + case NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP: + case NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP: + case NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP: + case NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP: + case NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP: + case NVPTX::SUST_B_2D_B8_TRAP: + case NVPTX::SUST_B_2D_B16_TRAP: + case NVPTX::SUST_B_2D_B32_TRAP: + case NVPTX::SUST_B_2D_V2B8_TRAP: + case NVPTX::SUST_B_2D_V2B16_TRAP: + case NVPTX::SUST_B_2D_V2B32_TRAP: + case NVPTX::SUST_B_2D_V4B8_TRAP: + case NVPTX::SUST_B_2D_V4B16_TRAP: + case NVPTX::SUST_B_2D_V4B32_TRAP: + case NVPTX::SUST_B_2D_ARRAY_B8_TRAP: + case NVPTX::SUST_B_2D_ARRAY_B16_TRAP: + case NVPTX::SUST_B_2D_ARRAY_B32_TRAP: + case NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP: + case NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP: + case NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP: + case NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP: + case NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP: + case NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP: + case NVPTX::SUST_B_3D_B8_TRAP: + case NVPTX::SUST_B_3D_B16_TRAP: + case NVPTX::SUST_B_3D_B32_TRAP: + case NVPTX::SUST_B_3D_V2B8_TRAP: + case NVPTX::SUST_B_3D_V2B16_TRAP: + case NVPTX::SUST_B_3D_V2B32_TRAP: + case NVPTX::SUST_B_3D_V4B8_TRAP: + case NVPTX::SUST_B_3D_V4B16_TRAP: + case NVPTX::SUST_B_3D_V4B32_TRAP: + case NVPTX::SUST_P_1D_B8_TRAP: + case NVPTX::SUST_P_1D_B16_TRAP: + case NVPTX::SUST_P_1D_B32_TRAP: + case NVPTX::SUST_P_1D_V2B8_TRAP: + case NVPTX::SUST_P_1D_V2B16_TRAP: + case NVPTX::SUST_P_1D_V2B32_TRAP: + case NVPTX::SUST_P_1D_V4B8_TRAP: + case NVPTX::SUST_P_1D_V4B16_TRAP: + case NVPTX::SUST_P_1D_V4B32_TRAP: + case NVPTX::SUST_P_1D_ARRAY_B8_TRAP: + case NVPTX::SUST_P_1D_ARRAY_B16_TRAP: + case NVPTX::SUST_P_1D_ARRAY_B32_TRAP: + case NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP: + case NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP: + case NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP: + case NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP: + case NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP: + case NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP: + case NVPTX::SUST_P_2D_B8_TRAP: + case NVPTX::SUST_P_2D_B16_TRAP: + case NVPTX::SUST_P_2D_B32_TRAP: + case NVPTX::SUST_P_2D_V2B8_TRAP: + case NVPTX::SUST_P_2D_V2B16_TRAP: + case NVPTX::SUST_P_2D_V2B32_TRAP: + case NVPTX::SUST_P_2D_V4B8_TRAP: + case NVPTX::SUST_P_2D_V4B16_TRAP: + case NVPTX::SUST_P_2D_V4B32_TRAP: + case 
NVPTX::SUST_P_2D_ARRAY_B8_TRAP: + case NVPTX::SUST_P_2D_ARRAY_B16_TRAP: + case NVPTX::SUST_P_2D_ARRAY_B32_TRAP: + case NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP: + case NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP: + case NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP: + case NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP: + case NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP: + case NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP: + case NVPTX::SUST_P_3D_B8_TRAP: + case NVPTX::SUST_P_3D_B16_TRAP: + case NVPTX::SUST_P_3D_B32_TRAP: + case NVPTX::SUST_P_3D_V2B8_TRAP: + case NVPTX::SUST_P_3D_V2B16_TRAP: + case NVPTX::SUST_P_3D_V2B32_TRAP: + case NVPTX::SUST_P_3D_V4B8_TRAP: + case NVPTX::SUST_P_3D_V4B16_TRAP: + case NVPTX::SUST_P_3D_V4B32_TRAP: { + // This is a surface store, so operand 0 is a surfref + if (OpNo == 0) { + lowerImageHandleSymbol(MO.getImm(), MCOp); + return true; + } + + return false; + } + case NVPTX::TXQ_CHANNEL_ORDER: + case NVPTX::TXQ_CHANNEL_DATA_TYPE: + case NVPTX::TXQ_WIDTH: + case NVPTX::TXQ_HEIGHT: + case NVPTX::TXQ_DEPTH: + case NVPTX::TXQ_ARRAY_SIZE: + case NVPTX::TXQ_NUM_SAMPLES: + case NVPTX::TXQ_NUM_MIPMAP_LEVELS: + case NVPTX::SUQ_CHANNEL_ORDER: + case NVPTX::SUQ_CHANNEL_DATA_TYPE: + case NVPTX::SUQ_WIDTH: + case NVPTX::SUQ_HEIGHT: + case NVPTX::SUQ_DEPTH: + case NVPTX::SUQ_ARRAY_SIZE: { + // This is a query, so operand 1 is a surfref/texref + if (OpNo == 1) { + lowerImageHandleSymbol(MO.getImm(), MCOp); + return true; + } + + return false; + } + } +} + +void NVPTXAsmPrinter::lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp) { + // Ewwww + TargetMachine &TM = const_cast(MF->getTarget()); + NVPTXTargetMachine &nvTM = static_cast(TM); + const NVPTXMachineFunctionInfo *MFI = MF->getInfo(); + const char *Sym = MFI->getImageHandleSymbol(Index); + std::string *SymNamePtr = + nvTM.getManagedStrPool()->getManagedString(Sym); + MCOp = GetSymbolRef(OutContext.GetOrCreateSymbol( + StringRef(SymNamePtr->c_str()))); +} + void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) { OutMI.setOpcode(MI->getOpcode()); + const NVPTXSubtarget &ST = TM.getSubtarget(); // Special: Do not mangle symbol operand of CALL_PROTOTYPE if (MI->getOpcode() == NVPTX::CALL_PROTOTYPE) { const MachineOperand &MO = MI->getOperand(0); - OutMI.addOperand(GetSymbolRef(MO, + OutMI.addOperand(GetSymbolRef( OutContext.GetOrCreateSymbol(Twine(MO.getSymbolName())))); return; } @@ -325,6 +607,13 @@ void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) { const MachineOperand &MO = MI->getOperand(i); MCOperand MCOp; + if (!ST.hasImageHandles()) { + if (lowerImageHandleOperand(MI, i, MCOp)) { + OutMI.addOperand(MCOp); + continue; + } + } + if (lowerOperand(MO, MCOp)) OutMI.addOperand(MCOp); } @@ -345,10 +634,10 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO, MO.getMBB()->getSymbol(), OutContext)); break; case MachineOperand::MO_ExternalSymbol: - MCOp = GetSymbolRef(MO, GetExternalSymbolSymbol(MO.getSymbolName())); + MCOp = GetSymbolRef(GetExternalSymbolSymbol(MO.getSymbolName())); break; case MachineOperand::MO_GlobalAddress: - MCOp = GetSymbolRef(MO, getSymbol(MO.getGlobal())); + MCOp = GetSymbolRef(getSymbol(MO.getGlobal())); break; case MachineOperand::MO_FPImmediate: { const ConstantFP *Cnt = MO.getFPImm(); @@ -407,8 +696,7 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) { } } -MCOperand NVPTXAsmPrinter::GetSymbolRef(const MachineOperand &MO, - const MCSymbol *Symbol) { +MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) { const MCExpr *Expr; Expr = MCSymbolRefExpr::Create(Symbol, 
MCSymbolRefExpr::VK_None, OutContext); @@ -750,7 +1038,7 @@ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED) return false; - const Function *oneFunc = 0; + const Function *oneFunc = nullptr; bool flag = usedInOneFunc(gv, oneFunc); if (flag == false) @@ -1010,6 +1298,8 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) { for (i = 0; i < n; i++) global_list.insert(global_list.end(), gv_array[i]); + clearAnnotationCache(&M); + delete[] gv_array; return ret; @@ -1105,10 +1395,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, if (llvm::isSampler(*GVar)) { O << ".global .samplerref " << llvm::getSamplerName(*GVar); - const Constant *Initializer = NULL; + const Constant *Initializer = nullptr; if (GVar->hasInitializer()) Initializer = GVar->getInitializer(); - const ConstantInt *CI = NULL; + const ConstantInt *CI = nullptr; if (Initializer) CI = dyn_cast(Initializer); if (CI) { @@ -1175,7 +1465,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, return; } - const Function *demotedFunc = 0; + const Function *demotedFunc = nullptr; if (!processDemoted && canDemoteGlobalVar(GVar, demotedFunc)) { O << "// " << GVar->getName().str() << " has been demoted\n"; if (localDecls.find(demotedFunc) != localDecls.end()) @@ -1347,7 +1637,7 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const { return "u32"; } llvm_unreachable("unexpected type"); - return NULL; + return nullptr; } void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, @@ -1495,19 +1785,33 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { first = false; // Handle image/sampler parameters - if (llvm::isSampler(*I) || llvm::isImage(*I)) { - if (llvm::isImage(*I)) { - std::string sname = I->getName(); - if (llvm::isImageWriteOnly(*I)) - O << "\t.param .surfref " << *getSymbol(F) << "_param_" - << paramIndex; - else // Default image is read_only - O << "\t.param .texref " << *getSymbol(F) << "_param_" - << paramIndex; - } else // Should be llvm::isSampler(*I) - O << "\t.param .samplerref " << *getSymbol(F) << "_param_" - << paramIndex; - continue; + if (isKernelFunction(*F)) { + if (isSampler(*I) || isImage(*I)) { + if (isImage(*I)) { + std::string sname = I->getName(); + if (isImageWriteOnly(*I) || isImageReadWrite(*I)) { + if (nvptxSubtarget.hasImageHandles()) + O << "\t.param .u64 .ptr .surfref "; + else + O << "\t.param .surfref "; + O << *CurrentFnSym << "_param_" << paramIndex; + } + else { // Default image is read_only + if (nvptxSubtarget.hasImageHandles()) + O << "\t.param .u64 .ptr .texref "; + else + O << "\t.param .texref "; + O << *CurrentFnSym << "_param_" << paramIndex; + } + } else { + if (nvptxSubtarget.hasImageHandles()) + O << "\t.param .u64 .ptr .samplerref "; + else + O << "\t.param .samplerref "; + O << *CurrentFnSym << "_param_" << paramIndex; + } + continue; + } } if (PAL.hasAttribute(paramIndex + 1, Attribute::ByVal) == false) { @@ -1752,13 +2056,35 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) { return; } if (const GlobalValue *GVar = dyn_cast(CPV)) { - O << *getSymbol(GVar); + PointerType *PTy = dyn_cast(GVar->getType()); + bool IsNonGenericPointer = false; + if (PTy && PTy->getAddressSpace() != 0) { + IsNonGenericPointer = true; + } + if (EmitGeneric && !isa(CPV) && !IsNonGenericPointer) { + O << "generic("; + O << *getSymbol(GVar); + O << ")"; + } else { + O << 
*getSymbol(GVar); + } return; } if (const ConstantExpr *Cexpr = dyn_cast(CPV)) { const Value *v = Cexpr->stripPointerCasts(); + PointerType *PTy = dyn_cast(Cexpr->getType()); + bool IsNonGenericPointer = false; + if (PTy && PTy->getAddressSpace() != 0) { + IsNonGenericPointer = true; + } if (const GlobalValue *GVar = dyn_cast(v)) { - O << *getSymbol(GVar); + if (EmitGeneric && !isa(v) && !IsNonGenericPointer) { + O << "generic("; + O << *getSymbol(GVar); + O << ")"; + } else { + O << *getSymbol(GVar); + } return; } else { O << *LowerConstant(CPV, *this); @@ -2121,7 +2447,7 @@ void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) { } LineReader *NVPTXAsmPrinter::getReader(std::string filename) { - if (reader == NULL) { + if (!reader) { reader = new LineReader(filename); } diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index 7162420..a9f9bdd 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -96,6 +96,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { unsigned curpos; raw_ostream &O; NVPTXAsmPrinter &AP; + bool EmitGeneric; public: AggBuffer(unsigned _size, raw_ostream &_O, NVPTXAsmPrinter &_AP) @@ -104,6 +105,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { size = _size; curpos = 0; numSymbols = 0; + EmitGeneric = AP.EmitGeneric; } ~AggBuffer() { delete[] buffer; } unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) { @@ -155,7 +157,18 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { const Value *v = Symbols[nSym]; if (const GlobalValue *GVar = dyn_cast(v)) { MCSymbol *Name = AP.getSymbol(GVar); - O << *Name; + PointerType *PTy = dyn_cast(GVar->getType()); + bool IsNonGenericPointer = false; + if (PTy && PTy->getAddressSpace() != 0) { + IsNonGenericPointer = true; + } + if (EmitGeneric && !isa(v) && !IsNonGenericPointer) { + O << "generic("; + O << *Name; + O << ")"; + } else { + O << *Name; + } } else if (const ConstantExpr *Cexpr = dyn_cast(v)) { O << *nvptx::LowerConstant(Cexpr, AP); } else @@ -176,31 +189,31 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { friend class AggBuffer; - virtual void emitSrcInText(StringRef filename, unsigned line); + void emitSrcInText(StringRef filename, unsigned line); private: - virtual const char *getPassName() const { return "NVPTX Assembly Printer"; } + const char *getPassName() const override { return "NVPTX Assembly Printer"; } const Function *F; std::string CurrentFnName; - void EmitFunctionEntryLabel(); - void EmitFunctionBodyStart(); - void EmitFunctionBodyEnd(); - void emitImplicitDef(const MachineInstr *MI) const; + void EmitFunctionEntryLabel() override; + void EmitFunctionBodyStart() override; + void EmitFunctionBodyEnd() override; + void emitImplicitDef(const MachineInstr *MI) const override; - void EmitInstruction(const MachineInstr *); + void EmitInstruction(const MachineInstr *) override; void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI); bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp); - MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol); + MCOperand GetSymbolRef(const MCSymbol *Symbol); unsigned encodeVirtualRegister(unsigned Reg); - void EmitAlignment(unsigned NumBits, const GlobalValue *GV = 0) const {} + void EmitAlignment(unsigned NumBits, const GlobalValue *GV = nullptr) const {} void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier, raw_ostream &O); void printMemOperand(const 
MachineInstr *MI, int opNum, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const; void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O, bool = false); @@ -221,15 +234,15 @@ private: void printReturnValStr(const MachineFunction &MF, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &); + raw_ostream &) override; void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O, - const char *Modifier = 0); + const char *Modifier = nullptr); bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &); + raw_ostream &) override; protected: - bool doInitialization(Module &M); - bool doFinalization(Module &M); + bool doInitialization(Module &M) override; + bool doFinalization(Module &M) override; private: std::string CurrentBankselLabelInBasicBlock; @@ -274,14 +287,33 @@ private: static const char *getRegisterName(unsigned RegNo); void emitDemotedVars(const Function *, raw_ostream &); + bool lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo, + MCOperand &MCOp); + void lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp); + LineReader *reader; LineReader *getReader(std::string); + + // Used to control the need to emit .generic() in the initializer of + // module scope variables. + // Although ptx supports the hybrid mode like the following, + // .global .u32 a; + // .global .u32 b; + // .global .u32 addr[] = {a, generic(b)} + // we have difficulty representing the difference in the NVVM IR. + // + // Since the address value should always be generic in CUDA C and always + // be specific in OpenCL, we use this simple control here. + // + bool EmitGeneric; + public: NVPTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) : AsmPrinter(TM, Streamer), nvptxSubtarget(TM.getSubtarget()) { CurrentBankselLabelInBasicBlock = ""; - reader = NULL; + reader = nullptr; + EmitGeneric = (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA); } ~NVPTXAsmPrinter() { diff --git a/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp index 158c482..962b123 100644 --- a/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp +++ b/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp @@ -33,7 +33,7 @@ public: static char ID; NVPTXAssignValidGlobalNames() : ModulePass(ID) {} - virtual bool runOnModule(Module &M); + bool runOnModule(Module &M) override; /// \brief Clean up the name to remove symbols invalid in PTX. std::string cleanUpName(StringRef Name); diff --git a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp new file mode 100644 index 0000000..f3a095d --- /dev/null +++ b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp @@ -0,0 +1,195 @@ +//===-- NVPTXFavorNonGenericAddrSpace.cpp - ---------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// When a load/store accesses the generic address space, checks whether the +// address is casted from a non-generic address space. If so, remove this +// addrspacecast because accessing non-generic address spaces is typically +// faster. 
+// Besides seeking addrspacecasts, this optimization also traces into the
+// base pointer of a GEP.
+//
+// For instance, the code below loads a float from an array allocated in
+// addrspace(3).
+//
+//   %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+//   %1 = gep [10 x float]* %0, i64 0, i64 %i
+//   %2 = load float* %1 ; emits ld.f32
+//
+// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast
+// and the GEP to expose more optimization opportunities to function
+// optimizeMemoryInstruction. The intermediate code looks like:
+//
+//   %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+//   %1 = addrspacecast float addrspace(3)* %0 to float*
+//   %2 = load float* %1 ; still emits ld.f32, but will be optimized shortly
+//
+// Then, function optimizeMemoryInstruction detects a load from
+// addrspacecast'ed generic pointers, and folds the load and the
+// addrspacecast into a load from the original address space. The final
+// code looks like:
+//
+//   %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+//   %2 = load float addrspace(3)* %0 ; emits ld.shared.f32
+//
+// This pass may remove an addrspacecast in a different BB. Therefore, we
+// implement it as a FunctionPass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+// An option to disable this optimization. It is enabled by default.
+static cl::opt<bool> DisableFavorNonGeneric(
+  "disable-nvptx-favor-non-generic",
+  cl::init(false),
+  cl::desc("Do not convert generic address space usage "
+           "to non-generic address space usage"),
+  cl::Hidden);
+
+namespace {
+/// \brief NVPTXFavorNonGenericAddrSpaces
+class NVPTXFavorNonGenericAddrSpaces : public FunctionPass {
+public:
+  static char ID;
+  NVPTXFavorNonGenericAddrSpaces() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  /// Optimizes load/store instructions. Idx is the index of the pointer
+  /// operand (0 for load, and 1 for store). Returns true if it changes
+  /// anything.
+  bool optimizeMemoryInstruction(Instruction *I, unsigned Idx);
+  /// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep X,
+  /// indices)". This reordering exposes to optimizeMemoryInstruction more
+  /// optimization opportunities on loads and stores. Returns true if it
+  /// changes the program.
+  bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP);
+};
+}
+
+char NVPTXFavorNonGenericAddrSpaces::ID = 0;
+
+namespace llvm {
+void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
+}
+INITIALIZE_PASS(NVPTXFavorNonGenericAddrSpaces, "nvptx-favor-non-generic",
+                "Remove unnecessary non-generic-to-generic addrspacecasts",
+                false, false)
+
+// Decides whether removing Cast is valid and beneficial. Cast can be an
+// instruction or a constant expression.
+static bool IsEliminableAddrSpaceCast(Operator *Cast) {
+  // Returns false if not even an addrspacecast.
+  if (Cast->getOpcode() != Instruction::AddrSpaceCast)
+    return false;
+
+  Value *Src = Cast->getOperand(0);
+  PointerType *SrcTy = cast<PointerType>(Src->getType());
+  PointerType *DestTy = cast<PointerType>(Cast->getType());
+  // TODO: For now, we only handle the case where the addrspacecast only
+  // changes the address space but not the type. If the type also changes, we
+  // could still get rid of the addrspacecast by adding an extra bitcast, but
+  // we rarely see such scenarios.
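+  //
+  // For illustration only (hypothetical IR, not part of this patch): a
+  // type-changing cast such as
+  //   %p = addrspacecast i32 addrspace(3)* @x to float*
+  // could be handled by bitcasting within the original address space first,
+  //   %q = bitcast i32 addrspace(3)* @x to float addrspace(3)*
+  // after which the access can be rewritten to use addrspace(3) directly.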
+  if (SrcTy->getElementType() != DestTy->getElementType())
+    return false;
+
+  // Checks whether the addrspacecast is from a non-generic address space to
+  // the generic address space.
+  return (SrcTy->getAddressSpace() != AddressSpace::ADDRESS_SPACE_GENERIC &&
+          DestTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC);
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(
+    GEPOperator *GEP) {
+  Operator *Cast = dyn_cast<Operator>(GEP->getPointerOperand());
+  if (!Cast)
+    return false;
+
+  if (!IsEliminableAddrSpaceCast(Cast))
+    return false;
+
+  SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end());
+  if (Instruction *GEPI = dyn_cast<Instruction>(GEP)) {
+    // %1 = gep (addrspacecast X), indices
+    // =>
+    // %0 = gep X, indices
+    // %1 = addrspacecast %0
+    GetElementPtrInst *NewGEPI = GetElementPtrInst::Create(Cast->getOperand(0),
+                                                           Indices,
+                                                           GEP->getName(),
+                                                           GEPI);
+    NewGEPI->setIsInBounds(GEP->isInBounds());
+    GEP->replaceAllUsesWith(
+        new AddrSpaceCastInst(NewGEPI, GEP->getType(), "", GEPI));
+  } else {
+    // GEP is a constant expression.
+    Constant *NewGEPCE = ConstantExpr::getGetElementPtr(
+        cast<Constant>(Cast->getOperand(0)),
+        Indices,
+        GEP->isInBounds());
+    GEP->replaceAllUsesWith(
+        ConstantExpr::getAddrSpaceCast(NewGEPCE, GEP->getType()));
+  }
+
+  return true;
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
+                                                               unsigned Idx) {
+  // If the pointer operand is a GEP, hoist the addrspacecast if any from the
+  // GEP to expose more optimization opportunities.
+  if (GEPOperator *GEP = dyn_cast<GEPOperator>(MI->getOperand(Idx))) {
+    hoistAddrSpaceCastFromGEP(GEP);
+  }
+
+  // load/store (addrspacecast X) => load/store X if shortcutting the
+  // addrspacecast is valid and can improve performance.
+  //
+  // e.g.,
+  //   %1 = addrspacecast float addrspace(3)* %0 to float*
+  //   %2 = load float* %1
+  // ->
+  //   %2 = load float addrspace(3)* %0
+  //
+  // Note: the addrspacecast can also be a constant expression.
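+  // For example (hypothetical IR), the cast may appear as a constant
+  // expression folded into the load:
+  //   %v = load float* addrspacecast (float addrspace(3)* @s to float*)
+  // The Operator-based check below matches both forms.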
+  if (Operator *Cast = dyn_cast<Operator>(MI->getOperand(Idx))) {
+    if (IsEliminableAddrSpaceCast(Cast)) {
+      MI->setOperand(Idx, Cast->getOperand(0));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
+  if (DisableFavorNonGeneric)
+    return false;
+
+  bool Changed = false;
+  for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
+    for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
+      if (isa<LoadInst>(I)) {
+        // V = load P
+        Changed |= optimizeMemoryInstruction(I, 0);
+      } else if (isa<StoreInst>(I)) {
+        // store V, P
+        Changed |= optimizeMemoryInstruction(I, 1);
+      }
+    }
+  }
+  return Changed;
+}
+
+FunctionPass *llvm::createNVPTXFavorNonGenericAddrSpacesPass() {
+  return new NVPTXFavorNonGenericAddrSpaces();
+}
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h
index 819f1dd..2ae6d72 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -28,13 +28,13 @@ public:
       : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0),
         tm(_tm), is64bit(_is64bit) {}
 
-  virtual bool hasFP(const MachineFunction &MF) const;
-  virtual void emitPrologue(MachineFunction &MF) const;
-  virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  bool hasFP(const MachineFunction &MF) const override;
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
-                                     MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const override;
 };
 
 } // End llvm namespace
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 45f0734..023dd5e 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -40,10 +40,9 @@ public:
 
   GenericToNVVM() : ModulePass(ID) {}
 
-  virtual bool runOnModule(Module &M);
+  bool runOnModule(Module &M) override;
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {}
 
 private:
   Value *getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV,
@@ -88,7 +87,8 @@ bool GenericToNVVM::runOnModule(Module &M) {
         !GV->getName().startswith("llvm.")) {
       GlobalVariable *NewGV = new GlobalVariable(
           M, GV->getType()->getElementType(), GV->isConstant(),
-          GV->getLinkage(), GV->hasInitializer() ? GV->getInitializer() : NULL,
+          GV->getLinkage(),
+          GV->hasInitializer() ? GV->getInitializer() : nullptr,
           "", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL);
       NewGV->copyAttributesFrom(GV);
       GVMap[GV] = NewGV;
@@ -162,7 +162,7 @@ Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
                                       GlobalVariable *GV, IRBuilder<> &Builder) {
   PointerType *GVType = GV->getType();
-  Value *CVTA = NULL;
+  Value *CVTA = nullptr;
 
   // See if the address space conversion requires the operand to be bitcast
   // to i8 addrspace(n)* first.
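As a rough end-to-end sketch of the new pass (hypothetical module, assuming it is run through opt under the "nvptx-favor-non-generic" name registered above):

  @a = internal addrspace(3) global [10 x float] zeroinitializer

  define float @f(i64 %i) {
    %p = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
    %q = getelementptr [10 x float]* %p, i64 0, i64 %i
    %v = load float* %q        ; before: generic ld.f32
    ret float %v
  }

After the pass, the GEP is hoisted above the cast and the cast is folded away, so the load reads through a float addrspace(3)* pointer and emits ld.shared.f32.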
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index bd08d2d..cd30880 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -20,11 +20,10 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetIntrinsicInfo.h" -#undef DEBUG_TYPE -#define DEBUG_TYPE "nvptx-isel" - using namespace llvm; +#define DEBUG_TYPE "nvptx-isel" + static cl::opt FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" @@ -120,10 +119,10 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { if (N->isMachineOpcode()) { N->setNodeId(-1); - return NULL; // Already selected. + return nullptr; // Already selected. } - SDNode *ResNode = NULL; + SDNode *ResNode = nullptr; switch (N->getOpcode()) { case ISD::LOAD: ResNode = SelectLoad(N); @@ -162,6 +161,98 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { case NVPTXISD::StoreParamU32: ResNode = SelectStoreParam(N); break; + case ISD::INTRINSIC_WO_CHAIN: + ResNode = SelectIntrinsicNoChain(N); + break; + case NVPTXISD::Tex1DFloatI32: + case NVPTXISD::Tex1DFloatFloat: + case NVPTXISD::Tex1DFloatFloatLevel: + case NVPTXISD::Tex1DFloatFloatGrad: + case NVPTXISD::Tex1DI32I32: + case NVPTXISD::Tex1DI32Float: + case NVPTXISD::Tex1DI32FloatLevel: + case NVPTXISD::Tex1DI32FloatGrad: + case NVPTXISD::Tex1DArrayFloatI32: + case NVPTXISD::Tex1DArrayFloatFloat: + case NVPTXISD::Tex1DArrayFloatFloatLevel: + case NVPTXISD::Tex1DArrayFloatFloatGrad: + case NVPTXISD::Tex1DArrayI32I32: + case NVPTXISD::Tex1DArrayI32Float: + case NVPTXISD::Tex1DArrayI32FloatLevel: + case NVPTXISD::Tex1DArrayI32FloatGrad: + case NVPTXISD::Tex2DFloatI32: + case NVPTXISD::Tex2DFloatFloat: + case NVPTXISD::Tex2DFloatFloatLevel: + case NVPTXISD::Tex2DFloatFloatGrad: + case NVPTXISD::Tex2DI32I32: + case NVPTXISD::Tex2DI32Float: + case NVPTXISD::Tex2DI32FloatLevel: + case NVPTXISD::Tex2DI32FloatGrad: + case NVPTXISD::Tex2DArrayFloatI32: + case NVPTXISD::Tex2DArrayFloatFloat: + case NVPTXISD::Tex2DArrayFloatFloatLevel: + case NVPTXISD::Tex2DArrayFloatFloatGrad: + case NVPTXISD::Tex2DArrayI32I32: + case NVPTXISD::Tex2DArrayI32Float: + case NVPTXISD::Tex2DArrayI32FloatLevel: + case NVPTXISD::Tex2DArrayI32FloatGrad: + case NVPTXISD::Tex3DFloatI32: + case NVPTXISD::Tex3DFloatFloat: + case NVPTXISD::Tex3DFloatFloatLevel: + case NVPTXISD::Tex3DFloatFloatGrad: + case NVPTXISD::Tex3DI32I32: + case NVPTXISD::Tex3DI32Float: + case NVPTXISD::Tex3DI32FloatLevel: + case NVPTXISD::Tex3DI32FloatGrad: + ResNode = SelectTextureIntrinsic(N); + break; + case NVPTXISD::Suld1DI8Trap: + case NVPTXISD::Suld1DI16Trap: + case NVPTXISD::Suld1DI32Trap: + case NVPTXISD::Suld1DV2I8Trap: + case NVPTXISD::Suld1DV2I16Trap: + case NVPTXISD::Suld1DV2I32Trap: + case NVPTXISD::Suld1DV4I8Trap: + case NVPTXISD::Suld1DV4I16Trap: + case NVPTXISD::Suld1DV4I32Trap: + case NVPTXISD::Suld1DArrayI8Trap: + case NVPTXISD::Suld1DArrayI16Trap: + case NVPTXISD::Suld1DArrayI32Trap: + case NVPTXISD::Suld1DArrayV2I8Trap: + case NVPTXISD::Suld1DArrayV2I16Trap: + case NVPTXISD::Suld1DArrayV2I32Trap: + case NVPTXISD::Suld1DArrayV4I8Trap: + case NVPTXISD::Suld1DArrayV4I16Trap: + case NVPTXISD::Suld1DArrayV4I32Trap: + case NVPTXISD::Suld2DI8Trap: + case NVPTXISD::Suld2DI16Trap: + case NVPTXISD::Suld2DI32Trap: + case NVPTXISD::Suld2DV2I8Trap: + case NVPTXISD::Suld2DV2I16Trap: + case NVPTXISD::Suld2DV2I32Trap: + case NVPTXISD::Suld2DV4I8Trap: + case NVPTXISD::Suld2DV4I16Trap: + case NVPTXISD::Suld2DV4I32Trap: + 
case NVPTXISD::Suld2DArrayI8Trap: + case NVPTXISD::Suld2DArrayI16Trap: + case NVPTXISD::Suld2DArrayI32Trap: + case NVPTXISD::Suld2DArrayV2I8Trap: + case NVPTXISD::Suld2DArrayV2I16Trap: + case NVPTXISD::Suld2DArrayV2I32Trap: + case NVPTXISD::Suld2DArrayV4I8Trap: + case NVPTXISD::Suld2DArrayV4I16Trap: + case NVPTXISD::Suld2DArrayV4I32Trap: + case NVPTXISD::Suld3DI8Trap: + case NVPTXISD::Suld3DI16Trap: + case NVPTXISD::Suld3DI32Trap: + case NVPTXISD::Suld3DV2I8Trap: + case NVPTXISD::Suld3DV2I16Trap: + case NVPTXISD::Suld3DV2I32Trap: + case NVPTXISD::Suld3DV4I8Trap: + case NVPTXISD::Suld3DV4I16Trap: + case NVPTXISD::Suld3DV4I32Trap: + ResNode = SelectSurfaceIntrinsic(N); + break; case ISD::ADDRSPACECAST: ResNode = SelectAddrSpaceCast(N); break; @@ -175,7 +266,7 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { static unsigned int getCodeAddrSpace(MemSDNode *N, const NVPTXSubtarget &Subtarget) { - const Value *Src = N->getSrcValue(); + const Value *Src = N->getMemOperand()->getValue(); if (!Src) return NVPTX::PTXLdStInstCode::GENERIC; @@ -194,6 +285,24 @@ static unsigned int getCodeAddrSpace(MemSDNode *N, return NVPTX::PTXLdStInstCode::GENERIC; } +SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) { + unsigned IID = cast(N->getOperand(0))->getZExtValue(); + switch (IID) { + default: + return nullptr; + case Intrinsic::nvvm_texsurf_handle_internal: + return SelectTexSurfHandle(N); + } +} + +SDNode *NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) { + // Op 0 is the intrinsic ID + SDValue Wrapper = N->getOperand(1); + SDValue GlobalVal = Wrapper.getOperand(0); + return CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N), MVT::i64, + GlobalVal); +} + SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { SDValue Src = N->getOperand(0); AddrSpaceCastSDNode *CastN = cast(N); @@ -258,14 +367,14 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { SDLoc dl(N); LoadSDNode *LD = cast(N); EVT LoadedVT = LD->getMemoryVT(); - SDNode *NVPTXLD = NULL; + SDNode *NVPTXLD = nullptr; // do not support pre/post inc/dec if (LD->isIndexed()) - return NULL; + return nullptr; if (!LoadedVT.isSimple()) - return NULL; + return nullptr; // Address Space Setting unsigned int codeAddrSpace = getCodeAddrSpace(LD, Subtarget); @@ -288,7 +397,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { else if (num == 4) vecType = NVPTX::PTXLdStInstCode::V4; else - return NULL; + return nullptr; } // Type Setting: fromType + fromTypeWidth @@ -337,7 +446,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_avar; break; default: - return NULL; + return nullptr; } SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace), getI32Imm(vecType), getI32Imm(fromType), @@ -366,7 +475,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_asi; break; default: - return NULL; + return nullptr; } SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace), getI32Imm(vecType), getI32Imm(fromType), @@ -396,7 +505,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_ari_64; break; default: - return NULL; + return nullptr; } } else { switch (TargetVT) { @@ -419,7 +528,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_ari; break; default: - return NULL; + return nullptr; } } SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace), @@ -448,7 +557,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_areg_64; break; default: - return NULL; + return nullptr; } } else { switch 
(TargetVT) { @@ -471,7 +580,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { Opcode = NVPTX::LD_f64_areg; break; default: - return NULL; + return nullptr; } } SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace), @@ -480,7 +589,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops); } - if (NVPTXLD != NULL) { + if (NVPTXLD) { MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); MemRefs0[0] = cast(N)->getMemOperand(); cast(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1); @@ -501,7 +610,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { EVT LoadedVT = MemSD->getMemoryVT(); if (!LoadedVT.isSimple()) - return NULL; + return nullptr; // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget); @@ -547,7 +656,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { VecType = NVPTX::PTXLdStInstCode::V4; break; default: - return NULL; + return nullptr; } EVT EltVT = N->getValueType(0); @@ -555,11 +664,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_avar; break; @@ -583,7 +692,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_avar; break; @@ -609,11 +718,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_asi; break; @@ -637,7 +746,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_asi; break; @@ -664,11 +773,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { if (Subtarget.is64Bit()) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_ari_64; break; @@ -692,7 +801,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_ari_64; break; @@ -711,11 +820,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_ari; break; @@ -739,7 +848,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_ari; break; @@ -766,11 +875,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { if (Subtarget.is64Bit()) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LoadV2: switch 
(EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_areg_64; break; @@ -794,7 +903,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_areg_64; break; @@ -813,11 +922,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LoadV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v2_areg; break; @@ -841,7 +950,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { case NVPTXISD::LoadV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::LDV_i8_v4_areg; break; @@ -887,11 +996,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { if (SelectDirectAddr(Op1, Addr)) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar; break; @@ -915,7 +1024,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar; break; @@ -939,7 +1048,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar; break; @@ -957,7 +1066,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar; break; @@ -975,19 +1084,18 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { } SDValue Ops[] = { Addr, Chain }; - LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), - ArrayRef(Ops, 2)); + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops); } else if (Subtarget.is64Bit() ? 
SelectADDRri64(Op1.getNode(), Op1, Base, Offset) : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { if (Subtarget.is64Bit()) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64; break; @@ -1011,7 +1119,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64; break; @@ -1035,7 +1143,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64; break; @@ -1053,7 +1161,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64; break; @@ -1072,11 +1180,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32; break; @@ -1100,7 +1208,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32; break; @@ -1124,7 +1232,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32; break; @@ -1142,7 +1250,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32; break; @@ -1162,17 +1270,16 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { SDValue Ops[] = { Base, Offset, Chain }; - LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), - ArrayRef(Ops, 3)); + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops); } else { if (Subtarget.is64Bit()) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64; break; @@ -1196,7 +1303,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64; break; @@ -1220,7 +1327,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64; break; @@ -1238,7 +1345,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64; break; @@ -1257,11 +1364,11 @@ SDNode 
*NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32; break; @@ -1285,7 +1392,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32; break; @@ -1309,7 +1416,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32; break; @@ -1327,7 +1434,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { case NVPTXISD::LDUV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32; break; @@ -1346,8 +1453,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { } SDValue Ops[] = { Op1, Chain }; - LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), - ArrayRef(Ops, 2)); + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops); } MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); @@ -1361,14 +1467,14 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { SDLoc dl(N); StoreSDNode *ST = cast(N); EVT StoreVT = ST->getMemoryVT(); - SDNode *NVPTXST = NULL; + SDNode *NVPTXST = nullptr; // do not support pre/post inc/dec if (ST->isIndexed()) - return NULL; + return nullptr; if (!StoreVT.isSimple()) - return NULL; + return nullptr; // Address Space Setting unsigned int codeAddrSpace = getCodeAddrSpace(ST, Subtarget); @@ -1391,7 +1497,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { else if (num == 4) vecType = NVPTX::PTXLdStInstCode::V4; else - return NULL; + return nullptr; } // Type Setting: toType + toTypeWidth @@ -1435,7 +1541,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_avar; break; default: - return NULL; + return nullptr; } SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace), getI32Imm(vecType), getI32Imm(toType), @@ -1464,7 +1570,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_asi; break; default: - return NULL; + return nullptr; } SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace), getI32Imm(vecType), getI32Imm(toType), @@ -1494,7 +1600,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_ari_64; break; default: - return NULL; + return nullptr; } } else { switch (SourceVT) { @@ -1517,7 +1623,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_ari; break; default: - return NULL; + return nullptr; } } SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace), @@ -1546,7 +1652,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_areg_64; break; default: - return NULL; + return nullptr; } } else { switch (SourceVT) { @@ -1569,7 +1675,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { Opcode = NVPTX::ST_f64_areg; break; default: - return NULL; + return nullptr; } } SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace), @@ -1578,7 +1684,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) { NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops); } - if (NVPTXST != NULL) { + if 
(NVPTXST) { MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); MemRefs0[0] = cast(N)->getMemOperand(); cast(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1); @@ -1645,7 +1751,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { N2 = N->getOperand(5); break; default: - return NULL; + return nullptr; } StOps.push_back(getI32Imm(IsVolatile)); @@ -1657,11 +1763,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { if (SelectDirectAddr(N2, Addr)) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v2_avar; break; @@ -1685,7 +1791,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v4_avar; break; @@ -1707,11 +1813,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { : SelectADDRsi(N2.getNode(), N2, Base, Offset)) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v2_asi; break; @@ -1735,7 +1841,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v4_asi; break; @@ -1759,11 +1865,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { if (Subtarget.is64Bit()) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v2_ari_64; break; @@ -1787,7 +1893,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v4_ari_64; break; @@ -1806,11 +1912,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v2_ari; break; @@ -1834,7 +1940,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v4_ari; break; @@ -1857,11 +1963,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { if (Subtarget.is64Bit()) { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v2_areg_64; break; @@ -1885,7 +1991,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v4_areg_64; break; @@ -1904,11 +2010,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { } else { switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreV2: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = 
NVPTX::STV_i8_v2_areg; break; @@ -1932,7 +2038,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { case NVPTXISD::StoreV4: switch (EltVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i8: Opcode = NVPTX::STV_i8_v4_areg; break; @@ -1973,7 +2079,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { unsigned VecSize; switch (Node->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::LoadParam: VecSize = 1; break; @@ -1992,11 +2098,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { switch (VecSize) { default: - return NULL; + return nullptr; case 1: switch (MemVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opc = NVPTX::LoadParamMemI8; break; @@ -2023,7 +2129,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { case 2: switch (MemVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opc = NVPTX::LoadParamMemV2I8; break; @@ -2050,7 +2156,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { case 4: switch (MemVT.getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opc = NVPTX::LoadParamMemV4I8; break; @@ -2077,7 +2183,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue); } else { EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue }; - VTs = CurDAG->getVTList(&EVTs[0], array_lengthof(EVTs)); + VTs = CurDAG->getVTList(EVTs); } unsigned OffsetVal = cast(Offset)->getZExtValue(); @@ -2103,7 +2209,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { unsigned NumElts = 1; switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreRetval: NumElts = 1; break; @@ -2128,11 +2234,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { unsigned Opcode = 0; switch (NumElts) { default: - return NULL; + return nullptr; case 1: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opcode = NVPTX::StoreRetvalI8; break; @@ -2159,7 +2265,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { case 2: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opcode = NVPTX::StoreRetvalV2I8; break; @@ -2186,7 +2292,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { case 4: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opcode = NVPTX::StoreRetvalV4I8; break; @@ -2229,7 +2335,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { unsigned NumElts = 1; switch (N->getOpcode()) { default: - return NULL; + return nullptr; case NVPTXISD::StoreParamU32: case NVPTXISD::StoreParamS32: case NVPTXISD::StoreParam: @@ -2260,11 +2366,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { default: switch (NumElts) { default: - return NULL; + return nullptr; case 1: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opcode = NVPTX::StoreParamI8; break; @@ -2291,7 +2397,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { case 2: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return NULL; + return nullptr; case MVT::i1: Opcode = NVPTX::StoreParamV2I8; break; @@ -2318,7 +2424,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { case 4: switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { default: - return NULL; + 
return nullptr; case MVT::i1: Opcode = NVPTX::StoreParamV4I8; break; @@ -2371,6 +2477,488 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { return Ret; } +SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) { + SDValue Chain = N->getOperand(0); + SDValue TexRef = N->getOperand(1); + SDValue SampRef = N->getOperand(2); + SDNode *Ret = nullptr; + unsigned Opc = 0; + SmallVector Ops; + + switch (N->getOpcode()) { + default: return nullptr; + case NVPTXISD::Tex1DFloatI32: + Opc = NVPTX::TEX_1D_F32_I32; + break; + case NVPTXISD::Tex1DFloatFloat: + Opc = NVPTX::TEX_1D_F32_F32; + break; + case NVPTXISD::Tex1DFloatFloatLevel: + Opc = NVPTX::TEX_1D_F32_F32_LEVEL; + break; + case NVPTXISD::Tex1DFloatFloatGrad: + Opc = NVPTX::TEX_1D_F32_F32_GRAD; + break; + case NVPTXISD::Tex1DI32I32: + Opc = NVPTX::TEX_1D_I32_I32; + break; + case NVPTXISD::Tex1DI32Float: + Opc = NVPTX::TEX_1D_I32_F32; + break; + case NVPTXISD::Tex1DI32FloatLevel: + Opc = NVPTX::TEX_1D_I32_F32_LEVEL; + break; + case NVPTXISD::Tex1DI32FloatGrad: + Opc = NVPTX::TEX_1D_I32_F32_GRAD; + break; + case NVPTXISD::Tex1DArrayFloatI32: + Opc = NVPTX::TEX_1D_ARRAY_F32_I32; + break; + case NVPTXISD::Tex1DArrayFloatFloat: + Opc = NVPTX::TEX_1D_ARRAY_F32_F32; + break; + case NVPTXISD::Tex1DArrayFloatFloatLevel: + Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL; + break; + case NVPTXISD::Tex1DArrayFloatFloatGrad: + Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD; + break; + case NVPTXISD::Tex1DArrayI32I32: + Opc = NVPTX::TEX_1D_ARRAY_I32_I32; + break; + case NVPTXISD::Tex1DArrayI32Float: + Opc = NVPTX::TEX_1D_ARRAY_I32_F32; + break; + case NVPTXISD::Tex1DArrayI32FloatLevel: + Opc = NVPTX::TEX_1D_ARRAY_I32_F32_LEVEL; + break; + case NVPTXISD::Tex1DArrayI32FloatGrad: + Opc = NVPTX::TEX_1D_ARRAY_I32_F32_GRAD; + break; + case NVPTXISD::Tex2DFloatI32: + Opc = NVPTX::TEX_2D_F32_I32; + break; + case NVPTXISD::Tex2DFloatFloat: + Opc = NVPTX::TEX_2D_F32_F32; + break; + case NVPTXISD::Tex2DFloatFloatLevel: + Opc = NVPTX::TEX_2D_F32_F32_LEVEL; + break; + case NVPTXISD::Tex2DFloatFloatGrad: + Opc = NVPTX::TEX_2D_F32_F32_GRAD; + break; + case NVPTXISD::Tex2DI32I32: + Opc = NVPTX::TEX_2D_I32_I32; + break; + case NVPTXISD::Tex2DI32Float: + Opc = NVPTX::TEX_2D_I32_F32; + break; + case NVPTXISD::Tex2DI32FloatLevel: + Opc = NVPTX::TEX_2D_I32_F32_LEVEL; + break; + case NVPTXISD::Tex2DI32FloatGrad: + Opc = NVPTX::TEX_2D_I32_F32_GRAD; + break; + case NVPTXISD::Tex2DArrayFloatI32: + Opc = NVPTX::TEX_2D_ARRAY_F32_I32; + break; + case NVPTXISD::Tex2DArrayFloatFloat: + Opc = NVPTX::TEX_2D_ARRAY_F32_F32; + break; + case NVPTXISD::Tex2DArrayFloatFloatLevel: + Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL; + break; + case NVPTXISD::Tex2DArrayFloatFloatGrad: + Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD; + break; + case NVPTXISD::Tex2DArrayI32I32: + Opc = NVPTX::TEX_2D_ARRAY_I32_I32; + break; + case NVPTXISD::Tex2DArrayI32Float: + Opc = NVPTX::TEX_2D_ARRAY_I32_F32; + break; + case NVPTXISD::Tex2DArrayI32FloatLevel: + Opc = NVPTX::TEX_2D_ARRAY_I32_F32_LEVEL; + break; + case NVPTXISD::Tex2DArrayI32FloatGrad: + Opc = NVPTX::TEX_2D_ARRAY_I32_F32_GRAD; + break; + case NVPTXISD::Tex3DFloatI32: + Opc = NVPTX::TEX_3D_F32_I32; + break; + case NVPTXISD::Tex3DFloatFloat: + Opc = NVPTX::TEX_3D_F32_F32; + break; + case NVPTXISD::Tex3DFloatFloatLevel: + Opc = NVPTX::TEX_3D_F32_F32_LEVEL; + break; + case NVPTXISD::Tex3DFloatFloatGrad: + Opc = NVPTX::TEX_3D_F32_F32_GRAD; + break; + case NVPTXISD::Tex3DI32I32: + Opc = NVPTX::TEX_3D_I32_I32; + break; + case NVPTXISD::Tex3DI32Float: + Opc = 
NVPTX::TEX_3D_I32_F32; + break; + case NVPTXISD::Tex3DI32FloatLevel: + Opc = NVPTX::TEX_3D_I32_F32_LEVEL; + break; + case NVPTXISD::Tex3DI32FloatGrad: + Opc = NVPTX::TEX_3D_I32_F32_GRAD; + break; + } + + Ops.push_back(TexRef); + Ops.push_back(SampRef); + + // Copy over indices + for (unsigned i = 3; i < N->getNumOperands(); ++i) { + Ops.push_back(N->getOperand(i)); + } + + Ops.push_back(Chain); + Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops); + return Ret; +} + +SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) { + SDValue Chain = N->getOperand(0); + SDValue TexHandle = N->getOperand(1); + SDNode *Ret = nullptr; + unsigned Opc = 0; + SmallVector Ops; + switch (N->getOpcode()) { + default: return nullptr; + case NVPTXISD::Suld1DI8Trap: + Opc = NVPTX::SULD_1D_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI16Trap: + Opc = NVPTX::SULD_1D_I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DI32Trap: + Opc = NVPTX::SULD_1D_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I8Trap: + Opc = NVPTX::SULD_1D_V2I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I16Trap: + Opc = NVPTX::SULD_1D_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV2I32Trap: + Opc = NVPTX::SULD_1D_V2I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I8Trap: + Opc = NVPTX::SULD_1D_V4I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I16Trap: + Opc = NVPTX::SULD_1D_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DV4I32Trap: + Opc = NVPTX::SULD_1D_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI8Trap: + Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI16Trap: + Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayI32Trap: + Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I8Trap: + Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I16Trap: + Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV2I32Trap: + Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I8Trap: + Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP; + 
Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I16Trap: + Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld1DArrayV4I32Trap: + Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI8Trap: + Opc = NVPTX::SULD_2D_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI16Trap: + Opc = NVPTX::SULD_2D_I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DI32Trap: + Opc = NVPTX::SULD_2D_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I8Trap: + Opc = NVPTX::SULD_2D_V2I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I16Trap: + Opc = NVPTX::SULD_2D_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV2I32Trap: + Opc = NVPTX::SULD_2D_V2I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I8Trap: + Opc = NVPTX::SULD_2D_V4I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I16Trap: + Opc = NVPTX::SULD_2D_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DV4I32Trap: + Opc = NVPTX::SULD_2D_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI8Trap: + Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI16Trap: + Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayI32Trap: + Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I8Trap: + Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV2I16Trap: + Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case 
NVPTXISD::Suld2DArrayV2I32Trap: + Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I8Trap: + Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I16Trap: + Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld2DArrayV4I32Trap: + Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI8Trap: + Opc = NVPTX::SULD_3D_I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI16Trap: + Opc = NVPTX::SULD_3D_I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DI32Trap: + Opc = NVPTX::SULD_3D_I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I8Trap: + Opc = NVPTX::SULD_3D_V2I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I16Trap: + Opc = NVPTX::SULD_3D_V2I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV2I32Trap: + Opc = NVPTX::SULD_3D_V2I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I8Trap: + Opc = NVPTX::SULD_3D_V4I8_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I16Trap: + Opc = NVPTX::SULD_3D_V4I16_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + case NVPTXISD::Suld3DV4I32Trap: + Opc = NVPTX::SULD_3D_V4I32_TRAP; + Ops.push_back(TexHandle); + Ops.push_back(N->getOperand(2)); + Ops.push_back(N->getOperand(3)); + Ops.push_back(N->getOperand(4)); + Ops.push_back(Chain); + break; + } + Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops); + return Ret; +} + // SelectDirectAddr - Match a direct address for DAG. // A direct address could be a globaladdress or externalsymbol. 
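 // For example (sketch): a TargetGlobalAddress or TargetExternalSymbol node,
 // possibly wrapped in NVPTXISD::Wrapper, is a direct address and is returned
 // in Address as-is; an address computed at runtime is not.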
 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
@@ -2464,14 +3052,18 @@ bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
                                                  unsigned int spN) const {
-  const Value *Src = NULL;
+  const Value *Src = nullptr;
   // Even though MemIntrinsicSDNode is a subclass of MemSDNode,
   // the classof() for MemSDNode does not include MemIntrinsicSDNode
   // (See SelectionDAGNodes.h). So we need to check for both.
   if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
-    Src = mN->getSrcValue();
+    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
+      return true;
+    Src = mN->getMemOperand()->getValue();
   } else if (MemSDNode *mN = dyn_cast<MemIntrinsicSDNode>(N)) {
-    Src = mN->getSrcValue();
+    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
+      return true;
+    Src = mN->getMemOperand()->getValue();
   }
   if (!Src)
     return false;
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 93ad169..11f92e7 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -11,8 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "nvptx-isel"
-
 #include "NVPTX.h"
 #include "NVPTXISelLowering.h"
 #include "NVPTXRegisterInfo.h"
@@ -46,19 +44,22 @@ public:
                     CodeGenOpt::Level OptLevel);
 
   // Pass Name
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "NVPTX DAG->DAG Pattern Instruction Selection";
   }
 
   const NVPTXSubtarget &Subtarget;
 
-  virtual bool SelectInlineAsmMemoryOperand(
-      const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps);
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    char ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
 private:
   // Include the pieces autogenerated from the target description.
 #include "NVPTXGenDAGISel.inc"
 
-  SDNode *Select(SDNode *N);
+  SDNode *Select(SDNode *N) override;
+  SDNode *SelectIntrinsicNoChain(SDNode *N);
+  SDNode *SelectTexSurfHandle(SDNode *N);
   SDNode *SelectLoad(SDNode *N);
   SDNode *SelectLoadVector(SDNode *N);
   SDNode *SelectLDGLDUVector(SDNode *N);
@@ -68,6 +69,8 @@ private:
   SDNode *SelectStoreRetval(SDNode *N);
   SDNode *SelectStoreParam(SDNode *N);
   SDNode *SelectAddrSpaceCast(SDNode *N);
+  SDNode *SelectTextureIntrinsic(SDNode *N);
+  SDNode *SelectSurfaceIntrinsic(SDNode *N);
 
   inline SDValue getI32Imm(unsigned Imm) {
     return CurDAG->getTargetConstant(Imm, MVT::i32);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8e25a65..b0943be 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -75,7 +75,7 @@ static bool IsPTXVectorType(MVT VT) {
 /// LowerCall, and LowerReturn.
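 /// For example (illustrative, assuming the usual layout of a {i32, float}
 /// struct): ValueVTs would receive {MVT::i32, MVT::f32}, and, if Offsets is
 /// non-null, it would receive {0, 4} relative to StartingOffset.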
 static void ComputePTXValueVTs(const TargetLowering &TLI, Type *Ty,
                                SmallVectorImpl<EVT> &ValueVTs,
-                               SmallVectorImpl<uint64_t> *Offsets = 0,
+                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                                uint64_t StartingOffset = 0) {
   SmallVector<EVT, 16> TempVTs;
   SmallVector<uint64_t, 16> TempOffsets;
@@ -245,7 +245,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
   default:
-    return 0;
+    return nullptr;
   case NVPTXISD::CALL:
     return "NVPTXISD::CALL";
   case NVPTXISD::RET_FLAG:
@@ -328,6 +328,116 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "NVPTXISD::StoreV2";
   case NVPTXISD::StoreV4:
     return "NVPTXISD::StoreV4";
+  case NVPTXISD::Tex1DFloatI32:        return "NVPTXISD::Tex1DFloatI32";
+  case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
+  case NVPTXISD::Tex1DFloatFloatLevel:
+    return "NVPTXISD::Tex1DFloatFloatLevel";
+  case NVPTXISD::Tex1DFloatFloatGrad:
+    return "NVPTXISD::Tex1DFloatFloatGrad";
+  case NVPTXISD::Tex1DI32I32:          return "NVPTXISD::Tex1DI32I32";
+  case NVPTXISD::Tex1DI32Float:        return "NVPTXISD::Tex1DI32Float";
+  case NVPTXISD::Tex1DI32FloatLevel:
+    return "NVPTXISD::Tex1DI32FloatLevel";
+  case NVPTXISD::Tex1DI32FloatGrad:
+    return "NVPTXISD::Tex1DI32FloatGrad";
+  case NVPTXISD::Tex1DArrayFloatI32:   return "NVPTXISD::Tex1DArrayFloatI32";
+  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
+  case NVPTXISD::Tex1DArrayFloatFloatLevel:
+    return "NVPTXISD::Tex1DArrayFloatFloatLevel";
+  case NVPTXISD::Tex1DArrayFloatFloatGrad:
+    return "NVPTXISD::Tex1DArrayFloatFloatGrad";
+  case NVPTXISD::Tex1DArrayI32I32:     return "NVPTXISD::Tex1DArrayI32I32";
+  case NVPTXISD::Tex1DArrayI32Float:   return "NVPTXISD::Tex1DArrayI32Float";
+  case NVPTXISD::Tex1DArrayI32FloatLevel:
+    return "NVPTXISD::Tex1DArrayI32FloatLevel";
+  case NVPTXISD::Tex1DArrayI32FloatGrad:
+    return "NVPTXISD::Tex1DArrayI32FloatGrad";
+  case NVPTXISD::Tex2DFloatI32:        return "NVPTXISD::Tex2DFloatI32";
+  case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
+  case NVPTXISD::Tex2DFloatFloatLevel:
+    return "NVPTXISD::Tex2DFloatFloatLevel";
+  case NVPTXISD::Tex2DFloatFloatGrad:
+    return "NVPTXISD::Tex2DFloatFloatGrad";
+  case NVPTXISD::Tex2DI32I32:          return "NVPTXISD::Tex2DI32I32";
+  case NVPTXISD::Tex2DI32Float:        return "NVPTXISD::Tex2DI32Float";
+  case NVPTXISD::Tex2DI32FloatLevel:
+    return "NVPTXISD::Tex2DI32FloatLevel";
+  case NVPTXISD::Tex2DI32FloatGrad:
+    return "NVPTXISD::Tex2DI32FloatGrad";
+  case NVPTXISD::Tex2DArrayFloatI32:   return "NVPTXISD::Tex2DArrayFloatI32";
+  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
+  case NVPTXISD::Tex2DArrayFloatFloatLevel:
+    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
+  case NVPTXISD::Tex2DArrayFloatFloatGrad:
+    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
+  case NVPTXISD::Tex2DArrayI32I32:     return "NVPTXISD::Tex2DArrayI32I32";
+  case NVPTXISD::Tex2DArrayI32Float:   return "NVPTXISD::Tex2DArrayI32Float";
+  case NVPTXISD::Tex2DArrayI32FloatLevel:
+    return "NVPTXISD::Tex2DArrayI32FloatLevel";
+  case NVPTXISD::Tex2DArrayI32FloatGrad:
+    return "NVPTXISD::Tex2DArrayI32FloatGrad";
+  case NVPTXISD::Tex3DFloatI32:        return "NVPTXISD::Tex3DFloatI32";
+  case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
+  case NVPTXISD::Tex3DFloatFloatLevel:
+    return "NVPTXISD::Tex3DFloatFloatLevel";
+  case NVPTXISD::Tex3DFloatFloatGrad:
+    return "NVPTXISD::Tex3DFloatFloatGrad";
+  case NVPTXISD::Tex3DI32I32:          return "NVPTXISD::Tex3DI32I32";
+
case NVPTXISD::Tex3DI32Float: return "NVPTXISD::Tex3DI32Float"; + case NVPTXISD::Tex3DI32FloatLevel: + return "NVPTXISD::Tex3DI32FloatLevel"; + case NVPTXISD::Tex3DI32FloatGrad: + return "NVPTXISD::Tex3DI32FloatGrad"; + + case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; + case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; + case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; + case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; + case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; + case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; + case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; + case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; + case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; + + case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; + case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; + case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; + case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; + case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; + case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; + case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; + case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; + case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; + + case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; + case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; + case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; + case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; + case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; + case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; + case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; + case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; + case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; + + case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; + case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; + case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; + case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; + case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; + case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; + case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; + case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; + case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; + + case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; + case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; + case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; + case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; + case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; + case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap"; + case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; + case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; + case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; } } @@ -526,7 +636,7 @@ SDValue 
NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &isTailCall = CLI.IsTailCall; - ArgListTy &Args = CLI.Args; + ArgListTy &Args = CLI.getArgs(); Type *retTy = CLI.RetTy; ImmutableCallSite *CS = CLI.CS; @@ -575,7 +685,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32), InFlag }; Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps, 5); + DeclareParamOps); InFlag = Chain.getValue(1); unsigned curOffset = 0; for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { @@ -599,7 +709,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(curOffset, MVT::i32), StVal, InFlag }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, - CopyParamVTs, &CopyParamOps[0], 5, + CopyParamVTs, CopyParamOps, elemtype, MachinePointerInfo()); InFlag = Chain.getValue(1); curOffset += sz / 8; @@ -621,7 +731,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32), InFlag }; Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps, 5); + DeclareParamOps); InFlag = Chain.getValue(1); unsigned NumElts = ObjectVT.getVectorNumElements(); EVT EltVT = ObjectVT.getVectorElementType(); @@ -644,7 +754,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(0, MVT::i32), Elt, InFlag }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, - CopyParamVTs, &CopyParamOps[0], 5, + CopyParamVTs, CopyParamOps, MemVT, MachinePointerInfo()); InFlag = Chain.getValue(1); } else if (NumElts == 2) { @@ -661,7 +771,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(0, MVT::i32), Elt0, Elt1, InFlag }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl, - CopyParamVTs, &CopyParamOps[0], 6, + CopyParamVTs, CopyParamOps, MemVT, MachinePointerInfo()); InFlag = Chain.getValue(1); } else { @@ -735,9 +845,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Ops.push_back(InFlag); SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, &Ops[0], - Ops.size(), MemVT, - MachinePointerInfo()); + Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops, + MemVT, MachinePointerInfo()); InFlag = Chain.getValue(1); curOffset += PerStoreOffset; } @@ -762,7 +871,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(sz, MVT::i32), DAG.getConstant(0, MVT::i32), InFlag }; Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, - DeclareParamOps, 5); + DeclareParamOps); InFlag = Chain.getValue(1); SDValue OutV = OutVals[OIdx]; if (needExtend) { @@ -781,7 +890,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, opcode = NVPTXISD::StoreParamU32; else if (Outs[OIdx].Flags.isSExt()) opcode = NVPTXISD::StoreParamS32; - Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps, 5, + Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps, VT, MachinePointerInfo()); InFlag = Chain.getValue(1); @@ -806,7 +915,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag }; Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps, 
5); + DeclareParamOps); InFlag = Chain.getValue(1); unsigned curOffset = 0; for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { @@ -834,7 +943,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(curOffset, MVT::i32), theVal, InFlag }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, - CopyParamOps, 5, elemtype, + CopyParamOps, elemtype, MachinePointerInfo()); InFlag = Chain.getValue(1); @@ -865,7 +974,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(resultsz, MVT::i32), DAG.getConstant(0, MVT::i32), InFlag }; Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, - DeclareRetOps, 5); + DeclareRetOps); InFlag = Chain.getValue(1); } else { retAlignment = getArgumentAlignment(Callee, CS, retTy, 0); @@ -875,7 +984,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(resultsz / 8, MVT::i32), DAG.getConstant(0, MVT::i32), InFlag }; Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, - DeclareRetOps, 5); + DeclareRetOps); InFlag = Chain.getValue(1); } } @@ -895,7 +1004,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue ProtoOps[] = { Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag, }; - Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, &ProtoOps[0], 3); + Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); InFlag = Chain.getValue(1); } // Op to just print "call" @@ -904,20 +1013,20 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, MVT::i32), InFlag }; Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall), - dl, PrintCallVTs, PrintCallOps, 3); + dl, PrintCallVTs, PrintCallOps); InFlag = Chain.getValue(1); // Ops to print out the function name SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue CallVoidOps[] = { Chain, Callee, InFlag }; - Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps, 3); + Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); InFlag = Chain.getValue(1); // Ops to print out the param list SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue CallArgBeginOps[] = { Chain, InFlag }; Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, - CallArgBeginOps, 2); + CallArgBeginOps); InFlag = Chain.getValue(1); for (unsigned i = 0, e = paramCount; i != e; ++i) { @@ -929,21 +1038,20 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue CallArgOps[] = { Chain, DAG.getConstant(1, MVT::i32), DAG.getConstant(i, MVT::i32), InFlag }; - Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps, 4); + Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); InFlag = Chain.getValue(1); } SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue CallArgEndOps[] = { Chain, DAG.getConstant(Func ? 
1 : 0, MVT::i32), InFlag }; - Chain = - DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps, 3); + Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); InFlag = Chain.getValue(1); if (!Func) { SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue PrototypeOps[] = { Chain, DAG.getConstant(uniqueCallSite, MVT::i32), InFlag }; - Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps, 3); + Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); InFlag = Chain.getValue(1); } @@ -962,7 +1070,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (NumElts == 1) { // Just a simple load - std::vector LoadRetVTs; + SmallVector LoadRetVTs; if (needTruncate) { // If loading i1 result, generate // load i16 @@ -972,15 +1080,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, LoadRetVTs.push_back(EltVT); LoadRetVTs.push_back(MVT::Other); LoadRetVTs.push_back(MVT::Glue); - std::vector LoadRetOps; + SmallVector LoadRetOps; LoadRetOps.push_back(Chain); LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); LoadRetOps.push_back(DAG.getConstant(0, MVT::i32)); LoadRetOps.push_back(InFlag); SDValue retval = DAG.getMemIntrinsicNode( NVPTXISD::LoadParam, dl, - DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0], - LoadRetOps.size(), EltVT, MachinePointerInfo()); + DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo()); Chain = retval.getValue(1); InFlag = retval.getValue(2); SDValue Ret0 = retval; @@ -989,7 +1096,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InVals.push_back(Ret0); } else if (NumElts == 2) { // LoadV2 - std::vector LoadRetVTs; + SmallVector LoadRetVTs; if (needTruncate) { // If loading i1 result, generate // load i16 @@ -1002,15 +1109,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } LoadRetVTs.push_back(MVT::Other); LoadRetVTs.push_back(MVT::Glue); - std::vector LoadRetOps; + SmallVector LoadRetOps; LoadRetOps.push_back(Chain); LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); LoadRetOps.push_back(DAG.getConstant(0, MVT::i32)); LoadRetOps.push_back(InFlag); SDValue retval = DAG.getMemIntrinsicNode( NVPTXISD::LoadParamV2, dl, - DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0], - LoadRetOps.size(), EltVT, MachinePointerInfo()); + DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo()); Chain = retval.getValue(2); InFlag = retval.getValue(3); SDValue Ret0 = retval.getValue(0); @@ -1054,8 +1160,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, LoadRetOps.push_back(DAG.getConstant(Ofst, MVT::i32)); LoadRetOps.push_back(InFlag); SDValue retval = DAG.getMemIntrinsicNode( - Opc, dl, DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), - &LoadRetOps[0], LoadRetOps.size(), EltVT, MachinePointerInfo()); + Opc, dl, DAG.getVTList(LoadRetVTs), + LoadRetOps, EltVT, MachinePointerInfo()); if (VecSize == 2) { Chain = retval.getValue(2); InFlag = retval.getValue(3); @@ -1110,8 +1216,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, LoadRetOps.push_back(InFlag); SDValue retval = DAG.getMemIntrinsicNode( NVPTXISD::LoadParam, dl, - DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0], - LoadRetOps.size(), TheLoadType, MachinePointerInfo()); + DAG.getVTList(LoadRetVTs), LoadRetOps, + TheLoadType, MachinePointerInfo()); Chain = retval.getValue(1); InFlag = 
retval.getValue(2); SDValue Ret0 = retval.getValue(0); @@ -1153,8 +1259,7 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { DAG.getIntPtrConstant(j))); } } - return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), &Ops[0], - Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops); } SDValue @@ -1209,7 +1314,7 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() // in LegalizeDAG.cpp which also uses MergeValues. SDValue Ops[] = { result, LD->getChain() }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { @@ -1297,7 +1402,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { MemSDNode *MemSD = cast(N); SDValue NewSt = DAG.getMemIntrinsicNode( - Opcode, DL, DAG.getVTList(MVT::Other), &Ops[0], Ops.size(), + Opcode, DL, DAG.getVTList(MVT::Other), Ops, MemSD->getMemoryVT(), MemSD->getMemOperand()); //return DCI.CombineTo(N, NewSt, true); @@ -1429,7 +1534,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( if (isImageOrSamplerVal( theArgs[i], (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent() - : 0))) { + : nullptr))) { assert(isKernel && "Only kernels can have image/sampler params"); InVals.push_back(DAG.getConstant(i + 1, MVT::i32)); continue; @@ -1683,8 +1788,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( //} if (!OutChains.empty()) - DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &OutChains[0], - OutChains.size())); + DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); return Chain; } @@ -1726,7 +1830,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, - DAG.getVTList(MVT::Other), &Ops[0], 3, + DAG.getVTList(MVT::Other), Ops, EltVT, MachinePointerInfo()); } else if (NumElts == 2) { @@ -1742,7 +1846,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal0, StoreVal1 }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl, - DAG.getVTList(MVT::Other), &Ops[0], 4, + DAG.getVTList(MVT::Other), Ops, EltVT, MachinePointerInfo()); } else { // V4 stores @@ -1814,8 +1918,8 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size()); Chain = - DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), &Ops[0], - Ops.size(), EltVT, MachinePointerInfo()); + DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops, + EltVT, MachinePointerInfo()); Offset += PerStoreOffset; } } @@ -1852,8 +1956,8 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue Ops[] = { Chain, DAG.getConstant(SizeSoFar, MVT::i32), TmpVal }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, - DAG.getVTList(MVT::Other), &Ops[0], - 3, TheStoreType, + DAG.getVTList(MVT::Other), Ops, + TheStoreType, MachinePointerInfo()); if(TheValType.isVector()) SizeSoFar += @@ -1891,6 +1995,195 @@ bool NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const { return false; } +static unsigned getOpcForTextureInstr(unsigned Intrinsic) { + switch 
(Intrinsic) { + default: + return 0; + + case Intrinsic::nvvm_tex_1d_v4f32_i32: + return NVPTXISD::Tex1DFloatI32; + case Intrinsic::nvvm_tex_1d_v4f32_f32: + return NVPTXISD::Tex1DFloatFloat; + case Intrinsic::nvvm_tex_1d_level_v4f32_f32: + return NVPTXISD::Tex1DFloatFloatLevel; + case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: + return NVPTXISD::Tex1DFloatFloatGrad; + case Intrinsic::nvvm_tex_1d_v4i32_i32: + return NVPTXISD::Tex1DI32I32; + case Intrinsic::nvvm_tex_1d_v4i32_f32: + return NVPTXISD::Tex1DI32Float; + case Intrinsic::nvvm_tex_1d_level_v4i32_f32: + return NVPTXISD::Tex1DI32FloatLevel; + case Intrinsic::nvvm_tex_1d_grad_v4i32_f32: + return NVPTXISD::Tex1DI32FloatGrad; + + case Intrinsic::nvvm_tex_1d_array_v4f32_i32: + return NVPTXISD::Tex1DArrayFloatI32; + case Intrinsic::nvvm_tex_1d_array_v4f32_f32: + return NVPTXISD::Tex1DArrayFloatFloat; + case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: + return NVPTXISD::Tex1DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: + return NVPTXISD::Tex1DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_1d_array_v4i32_i32: + return NVPTXISD::Tex1DArrayI32I32; + case Intrinsic::nvvm_tex_1d_array_v4i32_f32: + return NVPTXISD::Tex1DArrayI32Float; + case Intrinsic::nvvm_tex_1d_array_level_v4i32_f32: + return NVPTXISD::Tex1DArrayI32FloatLevel; + case Intrinsic::nvvm_tex_1d_array_grad_v4i32_f32: + return NVPTXISD::Tex1DArrayI32FloatGrad; + + case Intrinsic::nvvm_tex_2d_v4f32_i32: + return NVPTXISD::Tex2DFloatI32; + case Intrinsic::nvvm_tex_2d_v4f32_f32: + return NVPTXISD::Tex2DFloatFloat; + case Intrinsic::nvvm_tex_2d_level_v4f32_f32: + return NVPTXISD::Tex2DFloatFloatLevel; + case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: + return NVPTXISD::Tex2DFloatFloatGrad; + case Intrinsic::nvvm_tex_2d_v4i32_i32: + return NVPTXISD::Tex2DI32I32; + case Intrinsic::nvvm_tex_2d_v4i32_f32: + return NVPTXISD::Tex2DI32Float; + case Intrinsic::nvvm_tex_2d_level_v4i32_f32: + return NVPTXISD::Tex2DI32FloatLevel; + case Intrinsic::nvvm_tex_2d_grad_v4i32_f32: + return NVPTXISD::Tex2DI32FloatGrad; + + case Intrinsic::nvvm_tex_2d_array_v4f32_i32: + return NVPTXISD::Tex2DArrayFloatI32; + case Intrinsic::nvvm_tex_2d_array_v4f32_f32: + return NVPTXISD::Tex2DArrayFloatFloat; + case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: + return NVPTXISD::Tex2DArrayFloatFloatLevel; + case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: + return NVPTXISD::Tex2DArrayFloatFloatGrad; + case Intrinsic::nvvm_tex_2d_array_v4i32_i32: + return NVPTXISD::Tex2DArrayI32I32; + case Intrinsic::nvvm_tex_2d_array_v4i32_f32: + return NVPTXISD::Tex2DArrayI32Float; + case Intrinsic::nvvm_tex_2d_array_level_v4i32_f32: + return NVPTXISD::Tex2DArrayI32FloatLevel; + case Intrinsic::nvvm_tex_2d_array_grad_v4i32_f32: + return NVPTXISD::Tex2DArrayI32FloatGrad; + + case Intrinsic::nvvm_tex_3d_v4f32_i32: + return NVPTXISD::Tex3DFloatI32; + case Intrinsic::nvvm_tex_3d_v4f32_f32: + return NVPTXISD::Tex3DFloatFloat; + case Intrinsic::nvvm_tex_3d_level_v4f32_f32: + return NVPTXISD::Tex3DFloatFloatLevel; + case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: + return NVPTXISD::Tex3DFloatFloatGrad; + case Intrinsic::nvvm_tex_3d_v4i32_i32: + return NVPTXISD::Tex3DI32I32; + case Intrinsic::nvvm_tex_3d_v4i32_f32: + return NVPTXISD::Tex3DI32Float; + case Intrinsic::nvvm_tex_3d_level_v4i32_f32: + return NVPTXISD::Tex3DI32FloatLevel; + case Intrinsic::nvvm_tex_3d_grad_v4i32_f32: + return NVPTXISD::Tex3DI32FloatGrad; + } +} + +static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { + switch (Intrinsic) { + default: + 
return 0; + case Intrinsic::nvvm_suld_1d_i8_trap: + return NVPTXISD::Suld1DI8Trap; + case Intrinsic::nvvm_suld_1d_i16_trap: + return NVPTXISD::Suld1DI16Trap; + case Intrinsic::nvvm_suld_1d_i32_trap: + return NVPTXISD::Suld1DI32Trap; + case Intrinsic::nvvm_suld_1d_v2i8_trap: + return NVPTXISD::Suld1DV2I8Trap; + case Intrinsic::nvvm_suld_1d_v2i16_trap: + return NVPTXISD::Suld1DV2I16Trap; + case Intrinsic::nvvm_suld_1d_v2i32_trap: + return NVPTXISD::Suld1DV2I32Trap; + case Intrinsic::nvvm_suld_1d_v4i8_trap: + return NVPTXISD::Suld1DV4I8Trap; + case Intrinsic::nvvm_suld_1d_v4i16_trap: + return NVPTXISD::Suld1DV4I16Trap; + case Intrinsic::nvvm_suld_1d_v4i32_trap: + return NVPTXISD::Suld1DV4I32Trap; + case Intrinsic::nvvm_suld_1d_array_i8_trap: + return NVPTXISD::Suld1DArrayI8Trap; + case Intrinsic::nvvm_suld_1d_array_i16_trap: + return NVPTXISD::Suld1DArrayI16Trap; + case Intrinsic::nvvm_suld_1d_array_i32_trap: + return NVPTXISD::Suld1DArrayI32Trap; + case Intrinsic::nvvm_suld_1d_array_v2i8_trap: + return NVPTXISD::Suld1DArrayV2I8Trap; + case Intrinsic::nvvm_suld_1d_array_v2i16_trap: + return NVPTXISD::Suld1DArrayV2I16Trap; + case Intrinsic::nvvm_suld_1d_array_v2i32_trap: + return NVPTXISD::Suld1DArrayV2I32Trap; + case Intrinsic::nvvm_suld_1d_array_v4i8_trap: + return NVPTXISD::Suld1DArrayV4I8Trap; + case Intrinsic::nvvm_suld_1d_array_v4i16_trap: + return NVPTXISD::Suld1DArrayV4I16Trap; + case Intrinsic::nvvm_suld_1d_array_v4i32_trap: + return NVPTXISD::Suld1DArrayV4I32Trap; + case Intrinsic::nvvm_suld_2d_i8_trap: + return NVPTXISD::Suld2DI8Trap; + case Intrinsic::nvvm_suld_2d_i16_trap: + return NVPTXISD::Suld2DI16Trap; + case Intrinsic::nvvm_suld_2d_i32_trap: + return NVPTXISD::Suld2DI32Trap; + case Intrinsic::nvvm_suld_2d_v2i8_trap: + return NVPTXISD::Suld2DV2I8Trap; + case Intrinsic::nvvm_suld_2d_v2i16_trap: + return NVPTXISD::Suld2DV2I16Trap; + case Intrinsic::nvvm_suld_2d_v2i32_trap: + return NVPTXISD::Suld2DV2I32Trap; + case Intrinsic::nvvm_suld_2d_v4i8_trap: + return NVPTXISD::Suld2DV4I8Trap; + case Intrinsic::nvvm_suld_2d_v4i16_trap: + return NVPTXISD::Suld2DV4I16Trap; + case Intrinsic::nvvm_suld_2d_v4i32_trap: + return NVPTXISD::Suld2DV4I32Trap; + case Intrinsic::nvvm_suld_2d_array_i8_trap: + return NVPTXISD::Suld2DArrayI8Trap; + case Intrinsic::nvvm_suld_2d_array_i16_trap: + return NVPTXISD::Suld2DArrayI16Trap; + case Intrinsic::nvvm_suld_2d_array_i32_trap: + return NVPTXISD::Suld2DArrayI32Trap; + case Intrinsic::nvvm_suld_2d_array_v2i8_trap: + return NVPTXISD::Suld2DArrayV2I8Trap; + case Intrinsic::nvvm_suld_2d_array_v2i16_trap: + return NVPTXISD::Suld2DArrayV2I16Trap; + case Intrinsic::nvvm_suld_2d_array_v2i32_trap: + return NVPTXISD::Suld2DArrayV2I32Trap; + case Intrinsic::nvvm_suld_2d_array_v4i8_trap: + return NVPTXISD::Suld2DArrayV4I8Trap; + case Intrinsic::nvvm_suld_2d_array_v4i16_trap: + return NVPTXISD::Suld2DArrayV4I16Trap; + case Intrinsic::nvvm_suld_2d_array_v4i32_trap: + return NVPTXISD::Suld2DArrayV4I32Trap; + case Intrinsic::nvvm_suld_3d_i8_trap: + return NVPTXISD::Suld3DI8Trap; + case Intrinsic::nvvm_suld_3d_i16_trap: + return NVPTXISD::Suld3DI16Trap; + case Intrinsic::nvvm_suld_3d_i32_trap: + return NVPTXISD::Suld3DI32Trap; + case Intrinsic::nvvm_suld_3d_v2i8_trap: + return NVPTXISD::Suld3DV2I8Trap; + case Intrinsic::nvvm_suld_3d_v2i16_trap: + return NVPTXISD::Suld3DV2I16Trap; + case Intrinsic::nvvm_suld_3d_v2i32_trap: + return NVPTXISD::Suld3DV2I32Trap; + case Intrinsic::nvvm_suld_3d_v4i8_trap: + return NVPTXISD::Suld3DV4I8Trap; + case 
Intrinsic::nvvm_suld_3d_v4i16_trap: + return NVPTXISD::Suld3DV4I16Trap; + case Intrinsic::nvvm_suld_3d_v4i32_trap: + return NVPTXISD::Suld3DV4I32Trap; + } +} + // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as // TgtMemIntrinsic // because we need the information that is only available in the "Value" type @@ -1944,6 +2237,142 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.align = 0; return true; + case Intrinsic::nvvm_tex_1d_v4f32_i32: + case Intrinsic::nvvm_tex_1d_v4f32_f32: + case Intrinsic::nvvm_tex_1d_level_v4f32_f32: + case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_1d_array_v4f32_i32: + case Intrinsic::nvvm_tex_1d_array_v4f32_f32: + case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_2d_v4f32_i32: + case Intrinsic::nvvm_tex_2d_v4f32_f32: + case Intrinsic::nvvm_tex_2d_level_v4f32_f32: + case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: + case Intrinsic::nvvm_tex_2d_array_v4f32_i32: + case Intrinsic::nvvm_tex_2d_array_v4f32_f32: + case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: + case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: + case Intrinsic::nvvm_tex_3d_v4f32_i32: + case Intrinsic::nvvm_tex_3d_v4f32_f32: + case Intrinsic::nvvm_tex_3d_level_v4f32_f32: + case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: { + Info.opc = getOpcForTextureInstr(Intrinsic); + Info.memVT = MVT::f32; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + } + case Intrinsic::nvvm_tex_1d_v4i32_i32: + case Intrinsic::nvvm_tex_1d_v4i32_f32: + case Intrinsic::nvvm_tex_1d_level_v4i32_f32: + case Intrinsic::nvvm_tex_1d_grad_v4i32_f32: + case Intrinsic::nvvm_tex_1d_array_v4i32_i32: + case Intrinsic::nvvm_tex_1d_array_v4i32_f32: + case Intrinsic::nvvm_tex_1d_array_level_v4i32_f32: + case Intrinsic::nvvm_tex_1d_array_grad_v4i32_f32: + case Intrinsic::nvvm_tex_2d_v4i32_i32: + case Intrinsic::nvvm_tex_2d_v4i32_f32: + case Intrinsic::nvvm_tex_2d_level_v4i32_f32: + case Intrinsic::nvvm_tex_2d_grad_v4i32_f32: + case Intrinsic::nvvm_tex_2d_array_v4i32_i32: + case Intrinsic::nvvm_tex_2d_array_v4i32_f32: + case Intrinsic::nvvm_tex_2d_array_level_v4i32_f32: + case Intrinsic::nvvm_tex_2d_array_grad_v4i32_f32: + case Intrinsic::nvvm_tex_3d_v4i32_i32: + case Intrinsic::nvvm_tex_3d_v4i32_f32: + case Intrinsic::nvvm_tex_3d_level_v4i32_f32: + case Intrinsic::nvvm_tex_3d_grad_v4i32_f32: { + Info.opc = getOpcForTextureInstr(Intrinsic); + Info.memVT = MVT::i32; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + } + case Intrinsic::nvvm_suld_1d_i8_trap: + case Intrinsic::nvvm_suld_1d_v2i8_trap: + case Intrinsic::nvvm_suld_1d_v4i8_trap: + case Intrinsic::nvvm_suld_1d_array_i8_trap: + case Intrinsic::nvvm_suld_1d_array_v2i8_trap: + case Intrinsic::nvvm_suld_1d_array_v4i8_trap: + case Intrinsic::nvvm_suld_2d_i8_trap: + case Intrinsic::nvvm_suld_2d_v2i8_trap: + case Intrinsic::nvvm_suld_2d_v4i8_trap: + case Intrinsic::nvvm_suld_2d_array_i8_trap: + case Intrinsic::nvvm_suld_2d_array_v2i8_trap: + case Intrinsic::nvvm_suld_2d_array_v4i8_trap: + case Intrinsic::nvvm_suld_3d_i8_trap: + case Intrinsic::nvvm_suld_3d_v2i8_trap: + case Intrinsic::nvvm_suld_3d_v4i8_trap: { + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i8; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + 
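      // For context (a simplified sketch, not part of the patch): when
      // getTgtMemIntrinsic() returns true, SelectionDAGBuilder's
      // visitTargetIntrinsic() builds a MemIntrinsicSDNode from the fields
      // filled in here, roughly:
      //   DAG.getMemIntrinsicNode(Info.opc, dl, VTs, Ops, Info.memVT,
      //                           MachinePointerInfo(Info.ptrVal), Info.align,
      //                           Info.vol, Info.readMem, Info.writeMem);
      // The null ptrVal used for texture/surface handles simply means the
      // access cannot be tied back to an IR pointer for alias analysis.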
Info.align = 16; + return true; + } + case Intrinsic::nvvm_suld_1d_i16_trap: + case Intrinsic::nvvm_suld_1d_v2i16_trap: + case Intrinsic::nvvm_suld_1d_v4i16_trap: + case Intrinsic::nvvm_suld_1d_array_i16_trap: + case Intrinsic::nvvm_suld_1d_array_v2i16_trap: + case Intrinsic::nvvm_suld_1d_array_v4i16_trap: + case Intrinsic::nvvm_suld_2d_i16_trap: + case Intrinsic::nvvm_suld_2d_v2i16_trap: + case Intrinsic::nvvm_suld_2d_v4i16_trap: + case Intrinsic::nvvm_suld_2d_array_i16_trap: + case Intrinsic::nvvm_suld_2d_array_v2i16_trap: + case Intrinsic::nvvm_suld_2d_array_v4i16_trap: + case Intrinsic::nvvm_suld_3d_i16_trap: + case Intrinsic::nvvm_suld_3d_v2i16_trap: + case Intrinsic::nvvm_suld_3d_v4i16_trap: { + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i16; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + } + case Intrinsic::nvvm_suld_1d_i32_trap: + case Intrinsic::nvvm_suld_1d_v2i32_trap: + case Intrinsic::nvvm_suld_1d_v4i32_trap: + case Intrinsic::nvvm_suld_1d_array_i32_trap: + case Intrinsic::nvvm_suld_1d_array_v2i32_trap: + case Intrinsic::nvvm_suld_1d_array_v4i32_trap: + case Intrinsic::nvvm_suld_2d_i32_trap: + case Intrinsic::nvvm_suld_2d_v2i32_trap: + case Intrinsic::nvvm_suld_2d_v4i32_trap: + case Intrinsic::nvvm_suld_2d_array_i32_trap: + case Intrinsic::nvvm_suld_2d_array_v2i32_trap: + case Intrinsic::nvvm_suld_2d_array_v4i32_trap: + case Intrinsic::nvvm_suld_3d_i32_trap: + case Intrinsic::nvvm_suld_3d_v2i32_trap: + case Intrinsic::nvvm_suld_3d_v4i32_trap: { + Info.opc = getOpcForSurfaceInstr(Intrinsic); + Info.memVT = MVT::i32; + Info.ptrVal = nullptr; + Info.offset = 0; + Info.vol = 0; + Info.readMem = true; + Info.writeMem = false; + Info.align = 16; + return true; + } + } return false; } @@ -2094,7 +2523,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, case 4: { Opcode = NVPTXISD::LoadV4; EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; - LdResVTs = DAG.getVTList(ListVTs, 5); + LdResVTs = DAG.getVTList(ListVTs); break; } } @@ -2111,8 +2540,8 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, // pass along the extension information OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType())); - SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, &OtherOps[0], - OtherOps.size(), LD->getMemoryVT(), + SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, + LD->getMemoryVT(), LD->getMemOperand()); SmallVector ScalarRes; @@ -2126,8 +2555,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SDValue LoadChain = NewLD.getValue(NumElts); - SDValue BuildVec = - DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, &ScalarRes[0], NumElts); + SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes); Results.push_back(BuildVec); Results.push_back(LoadChain); @@ -2207,7 +2635,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, break; } EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; - LdResVTs = DAG.getVTList(ListVTs, 5); + LdResVTs = DAG.getVTList(ListVTs); break; } } @@ -2224,9 +2652,9 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, MemIntrinsicSDNode *MemSD = cast(N); - SDValue NewLD = DAG.getMemIntrinsicNode( - Opcode, DL, LdResVTs, &OtherOps[0], OtherOps.size(), - MemSD->getMemoryVT(), MemSD->getMemOperand()); + SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, + MemSD->getMemoryVT(), + MemSD->getMemOperand()); 
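// The recurring mechanical change in this hunk and the ones above is the
// SelectionDAG API migration from pointer/length pairs to ArrayRef. A
// minimal before/after sketch (the opcode and operand names here are
// hypothetical, not from the patch):
//
//   SDValue Ops[] = { Chain, Ptr, InFlag };
//   // old:  SDValue N = DAG.getNode(Opc, dl, VTs, &Ops[0], 3);
//   SDValue N = DAG.getNode(Opc, dl, VTs, Ops); // ArrayRef<SDValue> deduced
//
// getVTList(), getMergeValues() and getMemIntrinsicNode() gained matching
// ArrayRef overloads, which is why the explicit element counts disappear
// throughout this file.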
SmallVector ScalarRes; @@ -2241,7 +2669,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SDValue LoadChain = NewLD.getValue(NumElts); SDValue BuildVec = - DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, &ScalarRes[0], NumElts); + DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes); Results.push_back(BuildVec); Results.push_back(LoadChain); @@ -2263,8 +2691,8 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, // We make sure the memory type is i8, which will be used during isel // to select the proper instruction. SDValue NewLD = - DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, &Ops[0], - Ops.size(), MVT::i8, MemSD->getMemOperand()); + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops, + MVT::i8, MemSD->getMemOperand()); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, NewLD.getValue(0))); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h index c1e8c21..7bad8a2 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -70,7 +70,100 @@ enum NodeType { StoreParamU32, // to zext and store a <32bit value, not used currently StoreRetval, StoreRetvalV2, - StoreRetvalV4 + StoreRetvalV4, + + // Texture intrinsics + Tex1DFloatI32, + Tex1DFloatFloat, + Tex1DFloatFloatLevel, + Tex1DFloatFloatGrad, + Tex1DI32I32, + Tex1DI32Float, + Tex1DI32FloatLevel, + Tex1DI32FloatGrad, + Tex1DArrayFloatI32, + Tex1DArrayFloatFloat, + Tex1DArrayFloatFloatLevel, + Tex1DArrayFloatFloatGrad, + Tex1DArrayI32I32, + Tex1DArrayI32Float, + Tex1DArrayI32FloatLevel, + Tex1DArrayI32FloatGrad, + Tex2DFloatI32, + Tex2DFloatFloat, + Tex2DFloatFloatLevel, + Tex2DFloatFloatGrad, + Tex2DI32I32, + Tex2DI32Float, + Tex2DI32FloatLevel, + Tex2DI32FloatGrad, + Tex2DArrayFloatI32, + Tex2DArrayFloatFloat, + Tex2DArrayFloatFloatLevel, + Tex2DArrayFloatFloatGrad, + Tex2DArrayI32I32, + Tex2DArrayI32Float, + Tex2DArrayI32FloatLevel, + Tex2DArrayI32FloatGrad, + Tex3DFloatI32, + Tex3DFloatFloat, + Tex3DFloatFloatLevel, + Tex3DFloatFloatGrad, + Tex3DI32I32, + Tex3DI32Float, + Tex3DI32FloatLevel, + Tex3DI32FloatGrad, + + // Surface intrinsics + Suld1DI8Trap, + Suld1DI16Trap, + Suld1DI32Trap, + Suld1DV2I8Trap, + Suld1DV2I16Trap, + Suld1DV2I32Trap, + Suld1DV4I8Trap, + Suld1DV4I16Trap, + Suld1DV4I32Trap, + + Suld1DArrayI8Trap, + Suld1DArrayI16Trap, + Suld1DArrayI32Trap, + Suld1DArrayV2I8Trap, + Suld1DArrayV2I16Trap, + Suld1DArrayV2I32Trap, + Suld1DArrayV4I8Trap, + Suld1DArrayV4I16Trap, + Suld1DArrayV4I32Trap, + + Suld2DI8Trap, + Suld2DI16Trap, + Suld2DI32Trap, + Suld2DV2I8Trap, + Suld2DV2I16Trap, + Suld2DV2I32Trap, + Suld2DV4I8Trap, + Suld2DV4I16Trap, + Suld2DV4I32Trap, + + Suld2DArrayI8Trap, + Suld2DArrayI16Trap, + Suld2DArrayI32Trap, + Suld2DArrayV2I8Trap, + Suld2DArrayV2I16Trap, + Suld2DArrayV2I32Trap, + Suld2DArrayV4I8Trap, + Suld2DArrayV4I16Trap, + Suld2DArrayV4I32Trap, + + Suld3DI8Trap, + Suld3DI16Trap, + Suld3DI32Trap, + Suld3DV2I8Trap, + Suld3DV2I16Trap, + Suld3DV2I32Trap, + Suld3DV4I8Trap, + Suld3DV4I16Trap, + Suld3DV4I32Trap }; } @@ -80,68 +173,70 @@ enum NodeType { class NVPTXTargetLowering : public TargetLowering { public: explicit NVPTXTargetLowering(NVPTXTargetMachine &TM); - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(const GlobalValue *GV, int64_t Offset, SelectionDAG &DAG) const; - virtual const 
char *getTargetNodeName(unsigned Opcode) const; + const char *getTargetNodeName(unsigned Opcode) const override; bool isTypeSupportedInIntrinsic(MVT VT) const; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, - unsigned Intrinsic) const; + unsigned Intrinsic) const override; /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type /// Used to guide target specific optimizations, like loop strength /// reduction (LoopStrengthReduce.cpp) and memory optimization for /// address mode (CodeGenPrepare.cpp) - virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const; + bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; /// getFunctionAlignment - Return the Log2 alignment of this function. - virtual unsigned getFunctionAlignment(const Function *F) const; + unsigned getFunctionAlignment(const Function *F) const; - virtual EVT getSetCCResultType(LLVMContext &, EVT VT) const { + EVT getSetCCResultType(LLVMContext &, EVT VT) const override { if (VT.isVector()) return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); return MVT::i1; } - ConstraintType getConstraintType(const std::string &Constraint) const; + ConstraintType + getConstraintType(const std::string &Constraint) const override; std::pair - getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const; + getRegForInlineAsmConstraint(const std::string &Constraint, + MVT VT) const override; - virtual SDValue LowerFormalArguments( + SDValue LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const; + SmallVectorImpl &InVals) const override; - virtual SDValue - LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const; + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const override; std::string getPrototype(Type *, const ArgListTy &, const SmallVectorImpl &, unsigned retAlignment, const ImmutableCallSite *CS) const; - virtual SDValue + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, SDLoc dl, - SelectionDAG &DAG) const; + SelectionDAG &DAG) const override; - virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, - std::vector &Ops, - SelectionDAG &DAG) const; + void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, + std::vector &Ops, + SelectionDAG &DAG) const override; NVPTXTargetMachine *nvTM; // PTX always uses 32-bit shift amounts - virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; } + MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } - virtual bool shouldSplitVectorType(EVT VT) const override; + bool shouldSplitVectorType(EVT VT) const override; private: const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here @@ -160,8 +255,8 @@ private: SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const; - virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, - SelectionDAG &DAG) const; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, + SelectionDAG &DAG) const override; unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS, Type *Ty, unsigned Idx) const; diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp new file mode 
100644
index 0000000..397f4bc
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -0,0 +1,178 @@
+//===-- NVPTXImageOptimizer.cpp - Image optimization pass -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR-level optimizations of image access code,
+// including:
+//
+// 1. Eliminate istypep intrinsics when image access qualifier is known
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/ConstantFolding.h"
+
+using namespace llvm;
+
+namespace {
+class NVPTXImageOptimizer : public FunctionPass {
+private:
+  static char ID;
+  SmallVector<Instruction*, 4> InstrToDelete;
+
+public:
+  NVPTXImageOptimizer();
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  bool replaceIsTypePSampler(Instruction &I);
+  bool replaceIsTypePSurface(Instruction &I);
+  bool replaceIsTypePTexture(Instruction &I);
+  Value *cleanupValue(Value *V);
+  void replaceWith(Instruction *From, ConstantInt *To);
+};
+}
+
+char NVPTXImageOptimizer::ID = 0;
+
+NVPTXImageOptimizer::NVPTXImageOptimizer()
+  : FunctionPass(ID) {}
+
+bool NVPTXImageOptimizer::runOnFunction(Function &F) {
+  bool Changed = false;
+  InstrToDelete.clear();
+
+  // Look for call instructions in the function
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;
+       ++BI) {
+    for (BasicBlock::iterator I = (*BI).begin(), E = (*BI).end();
+         I != E; ++I) {
+      Instruction &Instr = *I;
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        Function *CalledF = CI->getCalledFunction();
+        if (CalledF && CalledF->isIntrinsic()) {
+          // This is an intrinsic function call, check if it's an istypep
+          switch (CalledF->getIntrinsicID()) {
+          default: break;
+          case Intrinsic::nvvm_istypep_sampler:
+            Changed |= replaceIsTypePSampler(Instr);
+            break;
+          case Intrinsic::nvvm_istypep_surface:
+            Changed |= replaceIsTypePSurface(Instr);
+            break;
+          case Intrinsic::nvvm_istypep_texture:
+            Changed |= replaceIsTypePTexture(Instr);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // Delete any istypep instances we replaced in the IR
+  for (unsigned i = 0, e = InstrToDelete.size(); i != e; ++i)
+    InstrToDelete[i]->eraseFromParent();
+
+  return Changed;
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePSampler(Instruction &I) {
+  Value *TexHandle = cleanupValue(I.getOperand(0));
+  if (isSampler(*TexHandle)) {
+    // This is an OpenCL sampler, so it must be a samplerref
+    replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+    return true;
+  } else if (isImageWriteOnly(*TexHandle) ||
+             isImageReadWrite(*TexHandle) ||
+             isImageReadOnly(*TexHandle)) {
+    // This is an OpenCL image, so it cannot be a samplerref
+    replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+    return true;
+  } else {
+    // The image type is unknown, so we cannot eliminate the intrinsic
+    return false;
+  }
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePSurface(Instruction &I) {
+  Value *TexHandle = cleanupValue(I.getOperand(0));
+  if (isImageReadWrite(*TexHandle) ||
+      isImageWriteOnly(*TexHandle)) {
+    // This is an OpenCL read-write/write-only image, so it must be a surfref
+    replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+    return true;
+  } else if (isImageReadOnly(*TexHandle) ||
+             isSampler(*TexHandle)) {
+    // This is an OpenCL read-only image or sampler, so it cannot be
+    // a surfref
+    replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+    return true;
+  } else {
+    // The image type is unknown, so we cannot eliminate the intrinsic
+    return false;
+  }
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePTexture(Instruction &I) {
+  Value *TexHandle = cleanupValue(I.getOperand(0));
+  if (isImageReadOnly(*TexHandle)) {
+    // This is an OpenCL read-only image, so it must be a texref
+    replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+    return true;
+  } else if (isImageWriteOnly(*TexHandle) ||
+             isImageReadWrite(*TexHandle) ||
+             isSampler(*TexHandle)) {
+    // This is an OpenCL read-write/write-only image or a sampler, so it
+    // cannot be a texref
+    replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+    return true;
+  } else {
+    // The image type is unknown, so we cannot eliminate the intrinsic
+    return false;
+  }
+}
+
+void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) {
+  // We implement "poor man's DCE" here to make sure any code that is no longer
+  // live is actually unreachable and can be trivially eliminated by the
+  // unreachable block elimination pass.
+  for (CallInst::use_iterator UI = From->use_begin(), UE = From->use_end();
+       UI != UE; ++UI) {
+    if (BranchInst *BI = dyn_cast<BranchInst>(*UI)) {
+      if (BI->isUnconditional()) continue;
+      BasicBlock *Dest;
+      if (To->isZero())
+        // Get false block
+        Dest = BI->getSuccessor(1);
+      else
+        // Get true block
+        Dest = BI->getSuccessor(0);
+      BranchInst::Create(Dest, BI);
+      InstrToDelete.push_back(BI);
+    }
+  }
+  From->replaceAllUsesWith(To);
+  InstrToDelete.push_back(From);
+}
+
+Value *NVPTXImageOptimizer::cleanupValue(Value *V) {
+  if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) {
+    return cleanupValue(EVI->getAggregateOperand());
+  }
+  return V;
+}
+
+FunctionPass *llvm::createNVPTXImageOptimizerPass() {
+  return new NVPTXImageOptimizer();
+}
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 86ddd38..cdc8088 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -14,8 +14,6 @@
 #include "NVPTX.h"
 #include "NVPTXInstrInfo.h"
 #include "NVPTXTargetMachine.h"
-#define GET_INSTRINFO_CTOR_DTOR
-#include "NVPTXGenInstrInfo.inc"
 #include "llvm/IR/Function.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -24,6 +22,9 @@
 
 using namespace llvm;
 
+#define GET_INSTRINFO_CTOR_DTOR
+#include "NVPTXGenInstrInfo.inc"
+
 // Pin the vtable to this file.
 void NVPTXInstrInfo::anchor() {}
 
@@ -256,7 +257,7 @@ unsigned NVPTXInstrInfo::InsertBranch(
          "NVPTX branch conditions have two components!");
 
   // One-way branch.
-  if (FBB == 0) {
+  if (!FBB) {
     if (Cond.empty()) // Unconditional branch
       BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(TBB);
     else // Conditional branch
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index 600fc5c..88a9e45 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -30,7 +30,7 @@ class NVPTXInstrInfo : public NVPTXGenInstrInfo {
 public:
   explicit NVPTXInstrInfo(NVPTXTargetMachine &TM);
 
-  virtual const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
+  const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
 
   /* The following virtual functions are used in register allocation.
* They are not implemented because the existing interface and the logic @@ -50,9 +50,9 @@ public: * const TargetRegisterClass *RC) const; */ - virtual void copyPhysReg( + void copyPhysReg( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, bool KillSrc) const; + unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; virtual bool isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DestReg) const; bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const; @@ -61,13 +61,13 @@ public: virtual bool CanTailMerge(const MachineInstr *MI) const; // Branch analysis. - virtual bool AnalyzeBranch( + bool AnalyzeBranch( MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, bool AllowModify) const; - virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; - virtual unsigned InsertBranch( + SmallVectorImpl &Cond, bool AllowModify) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned InsertBranch( MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl &Cond, DebugLoc DL) const; + const SmallVectorImpl &Cond, DebugLoc DL) const override; unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const { return MI.getOperand(2).getImm(); } diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 14049b1..5e228fc 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1666,6 +1666,9 @@ def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen (MoveParam texternalsym:$src)))), (nvvm_move_ptr32 texternalsym:$src)>; +def texsurf_handles + : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src), + "mov.u64 \t$result, $src;", []>; //----------------------------------- // Compiler Error Warn @@ -1686,6 +1689,1826 @@ def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a), [(int_nvvm_compiler_error Int64Regs:$a)]>; +//----------------------------------- +// Texture Intrinsics +//----------------------------------- + +// NOTE: For Fermi support, any new texture/surface/sampler intrinsics must be +// also defined in NVPTXReplaceImageHandles.cpp + + +// Texture fetch instructions using handles +def TEX_1D_F32_I32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x), + "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];", + []>; +def TEX_1D_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x), + "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];", + []>; +def TEX_1D_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$lod), + "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x\\}], $lod;", + []>; +def TEX_1D_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; +def TEX_1D_I32_I32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x), + "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, 
[$t, $s, \\{$x\\}];", + []>; +def TEX_1D_I32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x), + "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];", + []>; +def TEX_1D_I32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x\\}], $lod;", + []>; +def TEX_1D_I32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; + +def TEX_1D_ARRAY_F32_I32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}];", + []>; +def TEX_1D_ARRAY_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x), + "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}];", + []>; +def TEX_1D_ARRAY_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}], $lod;", + []>; +def TEX_1D_ARRAY_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; +def TEX_1D_ARRAY_I32_I32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}];", + []>; +def TEX_1D_ARRAY_I32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x), + "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}];", + []>; +def TEX_1D_ARRAY_I32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$lod), + "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}], $lod;", + []>; +def TEX_1D_ARRAY_I32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$gradx, Float32Regs:$grady), + "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};", + []>; + +def TEX_2D_F32_I32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TEX_2D_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, 
Float32Regs:$y), + "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TEX_2D_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$lod), + "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}], $lod;", + []>; +def TEX_2D_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; +def TEX_2D_I32_I32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TEX_2D_I32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y), + "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}];", + []>; +def TEX_2D_I32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$lod), + "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}], $lod;", + []>; +def TEX_2D_I32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; + +def TEX_2D_ARRAY_F32_I32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$y), + "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_2D_ARRAY_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y), + "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_2D_ARRAY_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$lod), + "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;", + []>; +def TEX_2D_ARRAY_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; +def TEX_2D_ARRAY_I32_I32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$y), + "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + 
"[$t, $s, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_2D_ARRAY_I32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y), + "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}];", + []>; +def TEX_2D_ARRAY_I32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, Float32Regs:$lod), + "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;", + []>; +def TEX_2D_ARRAY_I32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x, + Float32Regs:$y, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$grady0, Float32Regs:$grady1), + "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, " + "\\{$grady0, $grady1\\};", + []>; + +def TEX_3D_F32_I32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$z), + "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_3D_F32_F32 + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z), + "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_3D_F32_F32_LEVEL + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, Float32Regs:$lod), + "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_3D_F32_F32_GRAD + : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g, + Float32Regs:$b, Float32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, + Float32Regs:$gradx0, Float32Regs:$gradx1, + Float32Regs:$gradx2, Float32Regs:$grady0, + Float32Regs:$grady1, Float32Regs:$grady2), + "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], " + "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, " + "\\{$grady0, $grady1, $grady2, $grady2\\};", + []>; +def TEX_3D_I32_I32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$z), + "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_3D_I32_F32 + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z), + "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}];", + []>; +def TEX_3D_I32_F32_LEVEL + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, Float32Regs:$lod), + "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;", + []>; +def TEX_3D_I32_F32_GRAD + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y, + Float32Regs:$z, + Float32Regs:$gradx0, 
Float32Regs:$gradx1, + Float32Regs:$gradx2, Float32Regs:$grady0, + Float32Regs:$grady1, Float32Regs:$grady2), + "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, " + "[$t, $s, \\{$x, $y, $z, $z\\}], " + "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, " + "\\{$grady0, $grady1, $grady2, $grady2\\};", + []>; + + +// Surface load instructions +def SULD_1D_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b8.trap \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b16.trap \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.b32.trap \\{$r\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; +def SULD_1D_V4I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x), + "suld.b.1d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];", + []>; + +def SULD_1D_ARRAY_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b8.trap \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b16.trap \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.b32.trap \\{$r\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V4I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b8.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b16.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; +def SULD_1D_ARRAY_V4I32_TRAP + : 
NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x), + "suld.b.a1d.v4.b32.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x\\}];", + []>; + +def SULD_2D_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b8.trap \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b16.trap \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.b32.trap \\{$r\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; +def SULD_2D_V4I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y), + "suld.b.2d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];", + []>; + +def SULD_2D_ARRAY_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b8.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b16.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.b32.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b8.trap \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b16.trap \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v2.b32.trap \\{$r, $g\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b8.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, 
Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b16.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; +def SULD_2D_ARRAY_V4I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y), + "suld.b.a2d.v4.b32.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$l, $x, $y, $y\\}];", + []>; + +def SULD_3D_I8_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b8.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I16_TRAP + : NVPTXInst<(outs Int16Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b16.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_I32_TRAP + : NVPTXInst<(outs Int32Regs:$r), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.b32.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V2I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V4I8_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b8.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V4I16_TRAP + : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b16.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; +def SULD_3D_V4I32_TRAP + : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z), + "suld.b.3d.v4.b32.trap \\{$r, $g, $b, $a\\}, " + "[$s, \\{$x, $y, $z, $z\\}];", + []>; + + +//----------------------------------- +// Texture Query Intrinsics +//----------------------------------- +def TXQ_CHANNEL_ORDER + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.channel_order.b32 \t$d, [$a];", + []>; +def TXQ_CHANNEL_DATA_TYPE + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.channel_data_type.b32 \t$d, [$a];", + []>; +def TXQ_WIDTH + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.width.b32 \t$d, [$a];", + []>; +def TXQ_HEIGHT + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.height.b32 \t$d, [$a];", + []>; +def TXQ_DEPTH + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.depth.b32 \t$d, [$a];", + []>; +def TXQ_ARRAY_SIZE + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.array_size.b32 \t$d, [$a];", + []>; +def TXQ_NUM_SAMPLES + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.num_samples.b32 \t$d, [$a];", + []>; +def TXQ_NUM_MIPMAP_LEVELS + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq.num_mipmap_levels.b32 \t$d, [$a];", + []>; + +def : Pat<(int_nvvm_txq_channel_order Int64Regs:$a), + 
(TXQ_CHANNEL_ORDER Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_channel_data_type Int64Regs:$a), + (TXQ_CHANNEL_DATA_TYPE Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_width Int64Regs:$a), + (TXQ_WIDTH Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_height Int64Regs:$a), + (TXQ_HEIGHT Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_depth Int64Regs:$a), + (TXQ_DEPTH Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_array_size Int64Regs:$a), + (TXQ_ARRAY_SIZE Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_num_samples Int64Regs:$a), + (TXQ_NUM_SAMPLES Int64Regs:$a)>; +def : Pat<(int_nvvm_txq_num_mipmap_levels Int64Regs:$a), + (TXQ_NUM_MIPMAP_LEVELS Int64Regs:$a)>; + + +//----------------------------------- +// Surface Query Intrinsics +//----------------------------------- +def SUQ_CHANNEL_ORDER + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.channel_order.b32 \t$d, [$a];", + []>; +def SUQ_CHANNEL_DATA_TYPE + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.channel_data_type.b32 \t$d, [$a];", + []>; +def SUQ_WIDTH + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.width.b32 \t$d, [$a];", + []>; +def SUQ_HEIGHT + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.height.b32 \t$d, [$a];", + []>; +def SUQ_DEPTH + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.depth.b32 \t$d, [$a];", + []>; +def SUQ_ARRAY_SIZE + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq.array_size.b32 \t$d, [$a];", + []>; + +def : Pat<(int_nvvm_suq_channel_order Int64Regs:$a), + (SUQ_CHANNEL_ORDER Int64Regs:$a)>; +def : Pat<(int_nvvm_suq_channel_data_type Int64Regs:$a), + (SUQ_CHANNEL_DATA_TYPE Int64Regs:$a)>; +def : Pat<(int_nvvm_suq_width Int64Regs:$a), + (SUQ_WIDTH Int64Regs:$a)>; +def : Pat<(int_nvvm_suq_height Int64Regs:$a), + (SUQ_HEIGHT Int64Regs:$a)>; +def : Pat<(int_nvvm_suq_depth Int64Regs:$a), + (SUQ_DEPTH Int64Regs:$a)>; +def : Pat<(int_nvvm_suq_array_size Int64Regs:$a), + (SUQ_ARRAY_SIZE Int64Regs:$a)>; + + +//===- Handle Query -------------------------------------------------------===// + +// TODO: These intrinsics are not yet finalized, pending PTX ISA design work +def ISTYPEP_SAMPLER + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.samplerref \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_istypep_sampler Int64Regs:$a))]>; +def ISTYPEP_SURFACE + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.surfref \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_istypep_surface Int64Regs:$a))]>; +def ISTYPEP_TEXTURE + : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.texref \t$d, $a;", + [(set Int1Regs:$d, (int_nvvm_istypep_texture Int64Regs:$a))]>; + +//===- Surface Stores -----------------------------------------------------===// + +// Unformatted + +def SUST_B_1D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.b.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.b.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + "sust.b.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_B_1D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.b.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.b.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def 
SUST_B_1D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + "sust.b.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.b.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.b.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + "sust.b.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_1D_ARRAY_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.b.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.b.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r), + "sust.b.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_B_1D_ARRAY_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.b.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.b.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g), + "sust.b.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_B_1D_ARRAY_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_ARRAY_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_1D_ARRAY_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_2D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.b.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.b.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + "sust.b.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.b.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, 
Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.b.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + "sust.b.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_B_2D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_2D_ARRAY_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.b.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.b.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r), + "sust.b.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_B_2D_ARRAY_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.b.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.b.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g), + "sust.b.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_2D_ARRAY_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_ARRAY_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_2D_ARRAY_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_B_3D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.b.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.b.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], 
\\{$r\\};", + []>; +def SUST_B_3D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + "sust.b.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_B_3D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.b.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.b.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + "sust.b.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_B_3D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_3D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.b.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_B_3D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.b.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + +// Formatted + +def SUST_P_1D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.p.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_P_1D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + "sust.p.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_P_1D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + "sust.p.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};", + []>; +def SUST_P_1D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.p.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_P_1D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + "sust.p.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_P_1D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + "sust.p.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};", + []>; +def SUST_P_1D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.p.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_P_1D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g, + Int16Regs:$b, Int16Regs:$a), + "sust.p.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; +def SUST_P_1D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g, + Int32Regs:$b, Int32Regs:$a), + "sust.p.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", + []>; + + +def SUST_P_1D_ARRAY_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.p.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_P_1D_ARRAY_B16_TRAP + : 
NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r), + "sust.p.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_P_1D_ARRAY_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r), + "sust.p.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};", + []>; +def SUST_P_1D_ARRAY_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.p.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_P_1D_ARRAY_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g), + "sust.p.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_P_1D_ARRAY_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g), + "sust.p.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", + []>; +def SUST_P_1D_ARRAY_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_1D_ARRAY_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_1D_ARRAY_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.p.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_P_2D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.p.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_P_2D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + "sust.p.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_P_2D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + "sust.p.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};", + []>; +def SUST_P_2D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.p.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_P_2D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g), + "sust.p.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_P_2D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + "sust.p.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};", + []>; +def SUST_P_2D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_2D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, + Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_2D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.p.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def 
SUST_P_2D_ARRAY_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.p.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_P_2D_ARRAY_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r), + "sust.p.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_P_2D_ARRAY_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r), + "sust.p.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", + []>; +def SUST_P_2D_ARRAY_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.p.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_P_2D_ARRAY_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + "sust.p.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_P_2D_ARRAY_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g), + "sust.p.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g\\};", + []>; +def SUST_P_2D_ARRAY_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_2D_ARRAY_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_2D_ARRAY_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.p.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +def SUST_P_3D_B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.p.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_P_3D_B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + "sust.p.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_P_3D_B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + "sust.p.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", + []>; +def SUST_P_3D_V2B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.p.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_P_3D_V2B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + "sust.p.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_P_3D_V2B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + "sust.p.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g\\};", + []>; +def SUST_P_3D_V4B8_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, 
Int16Regs:$b, Int16Regs:$a), + "sust.p.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_3D_V4B16_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + "sust.p.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; +def SUST_P_3D_V4B32_TRAP + : NVPTXInst<(outs), + (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + "sust.p.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], " + "\\{$r, $g, $b, $a\\};", + []>; + + +// Surface store instruction patterns +// I'm not sure why we can't just include these in the instruction definitions, +// but TableGen complains of type errors :( + +def : Pat<(int_nvvm_sust_b_1d_i8_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i16_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + (SUST_B_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i8_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i16_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v2i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_B_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i8_trap + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i16_trap + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_v4i32_trap + Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_1d_array_i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_B_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), + (SUST_B_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_B_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, 
Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v2i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_B_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_1d_array_v4i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_2d_i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_B_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v2i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), + (SUST_B_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_v4i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_2d_array_i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_ARRAY_B8_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_B_2D_ARRAY_B16_TRAP Int64Regs:$s, + Int32Regs:$l, 
Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_B_2D_ARRAY_B32_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v2i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + (SUST_B_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_ARRAY_V4B8_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_2D_ARRAY_V4B16_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_2d_array_v4i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_b_3d_i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_B_3D_B8_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_B_3D_B16_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + (SUST_B_3D_B32_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_3D_V2B8_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_B_3D_V2B16_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v2i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + (SUST_B_3D_V2B32_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + 
(SUST_B_3D_V4B8_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_B_3D_V4B16_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_b_3d_v4i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_B_3D_V4B32_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + + +def : Pat<(int_nvvm_sust_p_1d_i8_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_P_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_1d_i16_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), + (SUST_P_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_1d_i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), + (SUST_P_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_1d_v2i8_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_P_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_1d_v2i16_trap + Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_P_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_1d_v2i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_P_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_1d_v4i8_trap + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_1d_v4i16_trap + Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_1d_v4i32_trap + Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_P_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_p_1d_array_i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_P_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_1d_array_i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), + (SUST_P_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_1d_array_i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), + (SUST_P_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_1d_array_v2i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_P_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_1d_array_v2i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), + (SUST_P_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g)>; + +def : 
Pat<(int_nvvm_sust_p_1d_array_v2i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), + (SUST_P_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_1d_array_v4i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_1d_array_v4i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_1d_array_v4i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_P_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_p_2d_i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_P_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_2d_i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_P_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_2d_i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_P_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_2d_v2i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_P_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_2d_v2i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), + (SUST_P_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_2d_v2i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), + (SUST_P_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_2d_v4i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_2d_v4i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_2d_v4i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_P_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_p_2d_array_i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_P_2D_ARRAY_B8_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_2d_array_i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), + (SUST_P_2D_ARRAY_B16_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r)>; + +def : 
Pat<(int_nvvm_sust_p_2d_array_i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), + (SUST_P_2D_ARRAY_B32_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_2d_array_v2i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_P_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_2d_array_v2i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g), + (SUST_P_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_2d_array_v2i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, + Int32Regs:$g), + (SUST_P_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_2d_array_v4i8_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_2D_ARRAY_V4B8_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_2d_array_v4i16_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_2D_ARRAY_V4B16_TRAP Int64Regs:$s, + Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; + +def : Pat<(int_nvvm_sust_p_2d_array_v4i32_trap + Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), + (SUST_P_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, + Int32Regs:$x, Int32Regs:$y, + Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; + + + +def : Pat<(int_nvvm_sust_p_3d_i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_P_3D_B8_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_3d_i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r), + (SUST_P_3D_B16_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_3d_i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r), + (SUST_P_3D_B32_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r)>; + +def : Pat<(int_nvvm_sust_p_3d_v2i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_P_3D_V2B8_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_3d_v2i16_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g), + (SUST_P_3D_V2B16_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_3d_v2i32_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g), + (SUST_P_3D_V2B32_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int32Regs:$r, Int32Regs:$g)>; + +def : Pat<(int_nvvm_sust_p_3d_v4i8_trap + Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, + Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), + (SUST_P_3D_V4B8_TRAP Int64Regs:$s, + Int32Regs:$x, Int32Regs:$y, 
Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i16_trap
+             Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+             Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_3D_V4B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
+             Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+             Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_3D_V4B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
 //===-- Old PTX Back-end Intrinsics ---------------------------------------===//
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
index c9aa87d..5ec1fc9 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
@@ -27,17 +27,17 @@ struct NVPTXLowerAggrCopies : public FunctionPass {
 
   NVPTXLowerAggrCopies() : FunctionPass(ID) {}
 
-  void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<DataLayoutPass>();
     AU.addPreserved("stack-protector");
     AU.addPreserved<MachineFunctionAnalysis>();
   }
 
-  virtual bool runOnFunction(Function &F);
+  bool runOnFunction(Function &F) override;
 
   static const unsigned MaxAggrCopySize = 128;
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "Lower aggregate copies/intrinsics into loops";
   }
 };
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.cpp b/lib/Target/NVPTX/NVPTXMCExpr.cpp
index ca24764..137248b 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.cpp
+++ b/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -7,13 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "nvptx-mcexpr"
 #include "NVPTXMCExpr.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "nvptx-mcexpr"
+
 const NVPTXFloatMCExpr*
 NVPTXFloatMCExpr::Create(VariantKind Kind, APFloat Flt, MCContext &Ctx) {
   return new (Ctx) NVPTXFloatMCExpr(Kind, Flt);
 }
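A note on the DEBUG_TYPE churn in the hunk above (and in the NVPTXRegisterInfo.cpp and NVPTXPrologEpilogPass.cpp hunks below): these follow the tree-wide LLVM 3.5 cleanup that defines DEBUG_TYPE only after the #include block, so the macro cannot collide with headers that define or use DEBUG_TYPE themselves. A minimal sketch of the resulting file layout (the pass name and function below are invented for illustration, not part of the patch):

    #include "llvm/Support/Debug.h"        // headers first; may use DEBUG_TYPE
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    #define DEBUG_TYPE "my-pass"           // defined only after all includes

    static void traceSomething() {
      // Printed only under -debug-only=my-pass in asserts-enabled builds.
      DEBUG(dbgs() << "tracing " << DEBUG_TYPE << "\n");
    }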
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h
index 0efb231..0ee018c 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.h
+++ b/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -61,18 +61,18 @@ public:
   /// @}
 
-  void PrintImpl(raw_ostream &OS) const;
+  void PrintImpl(raw_ostream &OS) const override;
   bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const {
+                                 const MCAsmLayout *Layout) const override {
     return false;
   }
-  void AddValueSymbols(MCAssembler *) const {};
-  const MCSection *FindAssociatedSection() const {
-    return NULL;
+  void AddValueSymbols(MCAssembler *) const override {};
+  const MCSection *FindAssociatedSection() const override {
+    return nullptr;
   }
 
   // There are no TLS NVPTXMCExprs at the moment.
-  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
 
   static bool classof(const MCExpr *E) {
     return E->getKind() == MCExpr::Target;
diff --git a/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
new file mode 100644
index 0000000..67fb390
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
@@ -0,0 +1,46 @@
+//===-- NVPTXMachineFunctionInfo.h - NVPTX-specific Function Info --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class is attached to a MachineFunction instance and tracks target-
+// dependent information
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+class NVPTXMachineFunctionInfo : public MachineFunctionInfo {
+private:
+  /// Stores a mapping from index to symbol name for removing image handles
+  /// on Fermi.
+  SmallVector<std::string, 8> ImageHandleList;
+
+public:
+  NVPTXMachineFunctionInfo(MachineFunction &MF) {}
+
+  /// Returns the index for the symbol \p Symbol. If the symbol was previously
+  /// added, the same index is returned. Otherwise, the symbol is added and the
+  /// new index is returned.
+  unsigned getImageHandleSymbolIndex(const char *Symbol) {
+    // Is the symbol already present?
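+    // (A linear scan is adequate here: the list holds one entry per distinct
+    // image handle referenced by the function, so it stays small.)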
+    for (unsigned i = 0, e = ImageHandleList.size(); i != e; ++i)
+      if (ImageHandleList[i] == std::string(Symbol))
+        return i;
+    // Nope, insert it
+    ImageHandleList.push_back(Symbol);
+    return ImageHandleList.size()-1;
+  }
+
+  /// Returns the symbol name at the given index.
+  const char *getImageHandleSymbol(unsigned Idx) const {
+    assert(ImageHandleList.size() > Idx && "Bad index");
+    return ImageHandleList[Idx].c_str();
+  }
+};
+}
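For orientation, a sketch of the round trip the new class provides (not part of the patch; the function and the "surf_A" name are invented): a caller interns a handle's symbol name once and carries the small integer index around, and the index is mapped back to the name when the handle is later rewritten (see NVPTXReplaceImageHandles below).

    // Sketch only; assumes the header above is on the include path.
    #include "NVPTXMachineFunctionInfo.h"
    #include <cassert>
    #include <string>

    static unsigned internHandle(llvm::NVPTXMachineFunctionInfo &MFI) {
      unsigned Idx = MFI.getImageHandleSymbolIndex("surf_A");
      // Interning the same name again yields the same index...
      assert(Idx == MFI.getImageHandleSymbolIndex("surf_A"));
      // ...and the index maps back to the original symbol name.
      assert(MFI.getImageHandleSymbol(Idx) == std::string("surf_A"));
      return Idx;
    }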
-  getCalleeSavedRegs(const MachineFunction *MF = 0) const;
-
-  // NVPTX callee saved register classes
-  virtual const TargetRegisterClass *const *
-  getCalleeSavedRegClasses(const MachineFunction *MF) const;
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
 
-  virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
 
-  virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
-                                   unsigned FIOperandNum,
-                                   RegScavenger *RS = NULL) const;
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
 
-  virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-  virtual unsigned getFrameRegister(const MachineFunction &MF) const;
-  virtual unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
 
   ManagedStringPool *getStrPool() const {
     return const_cast<ManagedStringPool *>(&ManagedStrPool);
diff --git a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
new file mode 100644
index 0000000..afd53a6
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -0,0 +1,357 @@
+//===-- NVPTXReplaceImageHandles.cpp - Replace image handles for Fermi ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// On Fermi, image handles are not supported. To work around this, we traverse
+// the machine code and replace image handles with concrete symbols. For this
+// to work reliably, inlining of all function calls must be performed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseSet.h"
+
+using namespace llvm;
+
+namespace {
+class NVPTXReplaceImageHandles : public MachineFunctionPass {
+private:
+  static char ID;
+  DenseSet<MachineInstr *> InstrsToRemove;
+
+public:
+  NVPTXReplaceImageHandles();
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+private:
+  bool processInstr(MachineInstr &MI);
+  void replaceImageHandle(MachineOperand &Op, MachineFunction &MF);
+};
+}
+
+char NVPTXReplaceImageHandles::ID = 0;
+
+NVPTXReplaceImageHandles::NVPTXReplaceImageHandles()
+  : MachineFunctionPass(ID) {}
+
+bool NVPTXReplaceImageHandles::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+  InstrsToRemove.clear();
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+       ++BI) {
+    for (MachineBasicBlock::iterator I = (*BI).begin(), E = (*BI).end();
+         I != E; ++I) {
+      MachineInstr &MI = *I;
+      Changed |= processInstr(MI);
+    }
+  }
+
+  // Now clean up any handle-access instructions
+  // This is needed in debug mode when code cleanup passes are not executed,
+  // but we need the handle access to be eliminated because they are not
+  // valid instructions when image handles are disabled.
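(A note on the sweep that follows: doomed instructions are only recorded in InstrsToRemove during the traversal and erased afterwards, because deleting a MachineInstr while the basic-block iterators above are still walking it would invalidate them. The same collect-then-erase pattern in self-contained standard C++, with toy types standing in for MachineBasicBlock and MachineInstr, purely illustrative:)

    #include <iterator>
    #include <list>
    #include <unordered_set>

    struct Instr { int Opcode; };

    // Phase 1 decides without mutating the sequence being traversed;
    // phase 2 erases everything that was marked.
    void collectThenErase(std::list<Instr> &Block) {
      std::unordered_set<const Instr *> ToRemove;
      for (const Instr &I : Block)
        if (I.Opcode == 0)        // stand-in for "handle-access instruction"
          ToRemove.insert(&I);    // defer the erase

      for (auto It = Block.begin(); It != Block.end();)
        It = ToRemove.count(&*It) ? Block.erase(It) : std::next(It);
    }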
+  for (DenseSet<MachineInstr *>::iterator I = InstrsToRemove.begin(),
+       E = InstrsToRemove.end(); I != E; ++I) {
+    (*I)->eraseFromParent();
+  }
+
+  return Changed;
+}
+
+bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) {
+  MachineFunction &MF = *MI.getParent()->getParent();
+  // Check if we have a surface/texture instruction
+  switch (MI.getOpcode()) {
+  default: return false;
+  case NVPTX::TEX_1D_F32_I32:
+  case NVPTX::TEX_1D_F32_F32:
+  case NVPTX::TEX_1D_F32_F32_LEVEL:
+  case NVPTX::TEX_1D_F32_F32_GRAD:
+  case NVPTX::TEX_1D_I32_I32:
+  case NVPTX::TEX_1D_I32_F32:
+  case NVPTX::TEX_1D_I32_F32_LEVEL:
+  case NVPTX::TEX_1D_I32_F32_GRAD:
+  case NVPTX::TEX_1D_ARRAY_F32_I32:
+  case NVPTX::TEX_1D_ARRAY_F32_F32:
+  case NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL:
+  case NVPTX::TEX_1D_ARRAY_F32_F32_GRAD:
+  case NVPTX::TEX_1D_ARRAY_I32_I32:
+  case NVPTX::TEX_1D_ARRAY_I32_F32:
+  case NVPTX::TEX_1D_ARRAY_I32_F32_LEVEL:
+  case NVPTX::TEX_1D_ARRAY_I32_F32_GRAD:
+  case NVPTX::TEX_2D_F32_I32:
+  case NVPTX::TEX_2D_F32_F32:
+  case NVPTX::TEX_2D_F32_F32_LEVEL:
+  case NVPTX::TEX_2D_F32_F32_GRAD:
+  case NVPTX::TEX_2D_I32_I32:
+  case NVPTX::TEX_2D_I32_F32:
+  case NVPTX::TEX_2D_I32_F32_LEVEL:
+  case NVPTX::TEX_2D_I32_F32_GRAD:
+  case NVPTX::TEX_2D_ARRAY_F32_I32:
+  case NVPTX::TEX_2D_ARRAY_F32_F32:
+  case NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL:
+  case NVPTX::TEX_2D_ARRAY_F32_F32_GRAD:
+  case NVPTX::TEX_2D_ARRAY_I32_I32:
+  case NVPTX::TEX_2D_ARRAY_I32_F32:
+  case NVPTX::TEX_2D_ARRAY_I32_F32_LEVEL:
+  case NVPTX::TEX_2D_ARRAY_I32_F32_GRAD:
+  case NVPTX::TEX_3D_F32_I32:
+  case NVPTX::TEX_3D_F32_F32:
+  case NVPTX::TEX_3D_F32_F32_LEVEL:
+  case NVPTX::TEX_3D_F32_F32_GRAD:
+  case NVPTX::TEX_3D_I32_I32:
+  case NVPTX::TEX_3D_I32_F32:
+  case NVPTX::TEX_3D_I32_F32_LEVEL:
+  case NVPTX::TEX_3D_I32_F32_GRAD: {
+    // This is a texture fetch, so operand 4 is a texref and operand 5 is
+    // a samplerref
+    MachineOperand &TexHandle = MI.getOperand(4);
+    MachineOperand &SampHandle = MI.getOperand(5);
+
+    replaceImageHandle(TexHandle, MF);
+    replaceImageHandle(SampHandle, MF);
+
+    return true;
+  }
+  case NVPTX::SULD_1D_I8_TRAP:
+  case NVPTX::SULD_1D_I16_TRAP:
+  case NVPTX::SULD_1D_I32_TRAP:
+  case NVPTX::SULD_1D_ARRAY_I8_TRAP:
+  case NVPTX::SULD_1D_ARRAY_I16_TRAP:
+  case NVPTX::SULD_1D_ARRAY_I32_TRAP:
+  case NVPTX::SULD_2D_I8_TRAP:
+  case NVPTX::SULD_2D_I16_TRAP:
+  case NVPTX::SULD_2D_I32_TRAP:
+  case NVPTX::SULD_2D_ARRAY_I8_TRAP:
+  case NVPTX::SULD_2D_ARRAY_I16_TRAP:
+  case NVPTX::SULD_2D_ARRAY_I32_TRAP:
+  case NVPTX::SULD_3D_I8_TRAP:
+  case NVPTX::SULD_3D_I16_TRAP:
+  case NVPTX::SULD_3D_I32_TRAP: {
+    // This is a V1 surface load, so operand 1 is a surfref
+    MachineOperand &SurfHandle = MI.getOperand(1);
+
+    replaceImageHandle(SurfHandle, MF);
+
+    return true;
+  }
+  case NVPTX::SULD_1D_V2I8_TRAP:
+  case NVPTX::SULD_1D_V2I16_TRAP:
+  case NVPTX::SULD_1D_V2I32_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V2I8_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V2I16_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V2I32_TRAP:
+  case NVPTX::SULD_2D_V2I8_TRAP:
+  case NVPTX::SULD_2D_V2I16_TRAP:
+  case NVPTX::SULD_2D_V2I32_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V2I8_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V2I16_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V2I32_TRAP:
+  case NVPTX::SULD_3D_V2I8_TRAP:
+  case NVPTX::SULD_3D_V2I16_TRAP:
+  case NVPTX::SULD_3D_V2I32_TRAP: {
+    // This is a V2 surface load, so operand 2 is a surfref
+    MachineOperand &SurfHandle = MI.getOperand(2);
+
+    replaceImageHandle(SurfHandle, MF);
+
+    return true;
+  }
+  case NVPTX::SULD_1D_V4I8_TRAP:
+  case NVPTX::SULD_1D_V4I16_TRAP:
+  case NVPTX::SULD_1D_V4I32_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V4I8_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V4I16_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V4I32_TRAP:
+  case NVPTX::SULD_2D_V4I8_TRAP:
+  case NVPTX::SULD_2D_V4I16_TRAP:
+  case NVPTX::SULD_2D_V4I32_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V4I8_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V4I16_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V4I32_TRAP:
+  case NVPTX::SULD_3D_V4I8_TRAP:
+  case NVPTX::SULD_3D_V4I16_TRAP:
+  case NVPTX::SULD_3D_V4I32_TRAP: {
+    // This is a V4 surface load, so operand 4 is a surfref
+    MachineOperand &SurfHandle = MI.getOperand(4);
+
+    replaceImageHandle(SurfHandle, MF);
+
+    return true;
+  }
+  case NVPTX::SUST_B_1D_B8_TRAP:
+  case NVPTX::SUST_B_1D_B16_TRAP:
+  case NVPTX::SUST_B_1D_B32_TRAP:
+  case NVPTX::SUST_B_1D_V2B8_TRAP:
+  case NVPTX::SUST_B_1D_V2B16_TRAP:
+  case NVPTX::SUST_B_1D_V2B32_TRAP:
+  case NVPTX::SUST_B_1D_V4B8_TRAP:
+  case NVPTX::SUST_B_1D_V4B16_TRAP:
+  case NVPTX::SUST_B_1D_V4B32_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_B8_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_B16_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_B32_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP:
+  case NVPTX::SUST_B_2D_B8_TRAP:
+  case NVPTX::SUST_B_2D_B16_TRAP:
+  case NVPTX::SUST_B_2D_B32_TRAP:
+  case NVPTX::SUST_B_2D_V2B8_TRAP:
+  case NVPTX::SUST_B_2D_V2B16_TRAP:
+  case NVPTX::SUST_B_2D_V2B32_TRAP:
+  case NVPTX::SUST_B_2D_V4B8_TRAP:
+  case NVPTX::SUST_B_2D_V4B16_TRAP:
+  case NVPTX::SUST_B_2D_V4B32_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_B8_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_B16_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_B32_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP:
+  case NVPTX::SUST_B_3D_B8_TRAP:
+  case NVPTX::SUST_B_3D_B16_TRAP:
+  case NVPTX::SUST_B_3D_B32_TRAP:
+  case NVPTX::SUST_B_3D_V2B8_TRAP:
+  case NVPTX::SUST_B_3D_V2B16_TRAP:
+  case NVPTX::SUST_B_3D_V2B32_TRAP:
+  case NVPTX::SUST_B_3D_V4B8_TRAP:
+  case NVPTX::SUST_B_3D_V4B16_TRAP:
+  case NVPTX::SUST_B_3D_V4B32_TRAP:
+  case NVPTX::SUST_P_1D_B8_TRAP:
+  case NVPTX::SUST_P_1D_B16_TRAP:
+  case NVPTX::SUST_P_1D_B32_TRAP:
+  case NVPTX::SUST_P_1D_V2B8_TRAP:
+  case NVPTX::SUST_P_1D_V2B16_TRAP:
+  case NVPTX::SUST_P_1D_V2B32_TRAP:
+  case NVPTX::SUST_P_1D_V4B8_TRAP:
+  case NVPTX::SUST_P_1D_V4B16_TRAP:
+  case NVPTX::SUST_P_1D_V4B32_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_B8_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_B16_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_B32_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP:
+  case NVPTX::SUST_P_2D_B8_TRAP:
+  case NVPTX::SUST_P_2D_B16_TRAP:
+  case NVPTX::SUST_P_2D_B32_TRAP:
+  case NVPTX::SUST_P_2D_V2B8_TRAP:
+  case NVPTX::SUST_P_2D_V2B16_TRAP:
+  case NVPTX::SUST_P_2D_V2B32_TRAP:
+  case NVPTX::SUST_P_2D_V4B8_TRAP:
+  case NVPTX::SUST_P_2D_V4B16_TRAP:
+  case NVPTX::SUST_P_2D_V4B32_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_B8_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_B16_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_B32_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP:
+  case NVPTX::SUST_P_3D_B8_TRAP:
+  case NVPTX::SUST_P_3D_B16_TRAP:
+  case NVPTX::SUST_P_3D_B32_TRAP:
+  case NVPTX::SUST_P_3D_V2B8_TRAP:
+  case NVPTX::SUST_P_3D_V2B16_TRAP:
+  case NVPTX::SUST_P_3D_V2B32_TRAP:
+  case NVPTX::SUST_P_3D_V4B8_TRAP:
+  case NVPTX::SUST_P_3D_V4B16_TRAP:
+  case NVPTX::SUST_P_3D_V4B32_TRAP: {
+    // This is a surface store, so operand 0 is a surfref
+    MachineOperand &SurfHandle = MI.getOperand(0);
+
+    replaceImageHandle(SurfHandle, MF);
+
+    return true;
+  }
+  case NVPTX::TXQ_CHANNEL_ORDER:
+  case NVPTX::TXQ_CHANNEL_DATA_TYPE:
+  case NVPTX::TXQ_WIDTH:
+  case NVPTX::TXQ_HEIGHT:
+  case NVPTX::TXQ_DEPTH:
+  case NVPTX::TXQ_ARRAY_SIZE:
+  case NVPTX::TXQ_NUM_SAMPLES:
+  case NVPTX::TXQ_NUM_MIPMAP_LEVELS:
+  case NVPTX::SUQ_CHANNEL_ORDER:
+  case NVPTX::SUQ_CHANNEL_DATA_TYPE:
+  case NVPTX::SUQ_WIDTH:
+  case NVPTX::SUQ_HEIGHT:
+  case NVPTX::SUQ_DEPTH:
+  case NVPTX::SUQ_ARRAY_SIZE: {
+    // This is a query, so operand 1 is a surfref/texref
+    MachineOperand &Handle = MI.getOperand(1);
+
+    replaceImageHandle(Handle, MF);
+
+    return true;
+  }
+  }
+}
+
+void NVPTXReplaceImageHandles::
+replaceImageHandle(MachineOperand &Op, MachineFunction &MF) {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  NVPTXMachineFunctionInfo *MFI = MF.getInfo<NVPTXMachineFunctionInfo>();
+  // Which instruction defines the handle?
+  MachineInstr *MI = MRI.getVRegDef(Op.getReg());
+  assert(MI && "No def for image handle vreg?");
+  MachineInstr &TexHandleDef = *MI;
+
+  switch (TexHandleDef.getOpcode()) {
+  case NVPTX::LD_i64_avar: {
+    // The handle is a parameter value being loaded, replace with the
+    // parameter symbol
+    assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!");
+    StringRef Sym = TexHandleDef.getOperand(6).getSymbolName();
+    std::string ParamBaseName = MF.getName();
+    ParamBaseName += "_param_";
+    assert(Sym.startswith(ParamBaseName) && "Invalid symbol reference");
+    unsigned Param = atoi(Sym.data()+ParamBaseName.size());
+    std::string NewSym;
+    raw_string_ostream NewSymStr(NewSym);
+    NewSymStr << MF.getFunction()->getName() << "_param_" << Param;
+    Op.ChangeToImmediate(
+      MFI->getImageHandleSymbolIndex(NewSymStr.str().c_str()));
+    InstrsToRemove.insert(&TexHandleDef);
+    break;
+  }
+  case NVPTX::texsurf_handles: {
+    // The handle is a global variable, replace with the global variable name
+    assert(TexHandleDef.getOperand(1).isGlobal() && "Load is not a global!");
+    const GlobalValue *GV = TexHandleDef.getOperand(1).getGlobal();
+    assert(GV->hasName() && "Global sampler must be named!");
+    Op.ChangeToImmediate(MFI->getImageHandleSymbolIndex(GV->getName().data()));
+    InstrsToRemove.insert(&TexHandleDef);
+    break;
+  }
+  default:
+    llvm_unreachable("Unknown instruction operating on handle");
+  }
+}
+
+MachineFunctionPass *llvm::createNVPTXReplaceImageHandlesPass() {
+  return new NVPTXReplaceImageHandles();
+}
diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h
index f8a692e..aa0436b 100644
--- a/lib/Target/NVPTX/NVPTXSection.h
+++ b/lib/Target/NVPTX/NVPTXSection.h
@@ -31,16 +31,16 @@ public:
 
   /// Override this as NVPTX has its own way of printing switching
   /// to a section.
-  virtual void PrintSwitchToSection(const MCAsmInfo &MAI,
-                                    raw_ostream &OS,
-                                    const MCExpr *Subsection) const {}
+  void PrintSwitchToSection(const MCAsmInfo &MAI,
+                            raw_ostream &OS,
+                            const MCExpr *Subsection) const override {}
 
   /// Base address of PTX sections is zero.
-  virtual bool isBaseAddressKnownZero() const { return true; }
-  virtual bool UseCodeAlign() const { return false; }
-  virtual bool isVirtualSection() const { return false; }
-  virtual std::string getLabelBeginName() const { return ""; }
-  virtual std::string getLabelEndName() const { return ""; }
+  bool isBaseAddressKnownZero() const override { return true; }
+  bool UseCodeAlign() const override { return false; }
+  bool isVirtualSection() const override { return false; }
+  std::string getLabelBeginName() const override { return ""; }
+  std::string getLabelEndName() const override { return ""; }
 };
 
 } // end namespace llvm
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 9771a17..8c7df52 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -12,14 +12,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "NVPTXSubtarget.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-subtarget"
+
 #define GET_SUBTARGETINFO_ENUM
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
 #include "NVPTXGenSubtargetInfo.inc"
 
-using namespace llvm;
-
-
 // Pin the vtable to this file.
 void NVPTXSubtarget::anchor() {}
 
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index f99bebd..581e5ed 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -16,12 +16,11 @@
 
 #include "NVPTX.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
 
 #define GET_SUBTARGETINFO_HEADER
 #include "NVPTXGenSubtargetInfo.inc"
 
-#include <string>
-
 namespace llvm {
 
 class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
@@ -65,6 +64,10 @@ public:
   inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); }
   inline bool hasROT64() const { return SmVersion >= 20; }
 
+  bool hasImageHandles() const {
+    // Currently disabled
+    return false;
+  }
   bool is64Bit() const { return Is64Bit; }
 
   unsigned int getSmVersion() const { return SmVersion; }
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 7d7d793..26a4f84 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -16,7 +16,6 @@
 #include "NVPTX.h"
 #include "NVPTXAllocaHoisting.h"
 #include "NVPTXLowerAggrCopies.h"
-#include "llvm/ADT/OwningPtr.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
@@ -50,6 +49,7 @@ namespace llvm {
 void initializeNVVMReflectPass(PassRegistry&);
 void initializeGenericToNVVMPass(PassRegistry&);
 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
+void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
 }
 
 extern "C" void LLVMInitializeNVPTXTarget() {
@@ -62,6 +62,8 @@ extern "C" void LLVMInitializeNVPTXTarget() {
   initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
   initializeGenericToNVVMPass(*PassRegistry::getPassRegistry());
   initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry());
+  initializeNVPTXFavorNonGenericAddrSpacesPass(
+      *PassRegistry::getPassRegistry());
 }
 
 static std::string computeDataLayout(const NVPTXSubtarget &ST) {
@@ -113,14 +115,14 @@ public:
     return getTM<NVPTXTargetMachine>();
   }
 
-  virtual void addIRPasses();
-  virtual bool addInstSelector();
-  virtual bool addPreRegAlloc();
-  virtual bool addPostRegAlloc();
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  bool addPreRegAlloc() override;
+  bool addPostRegAlloc() override;
 
-  virtual FunctionPass *createTargetRegisterAllocator(bool) override;
-  virtual void addFastRegAlloc(FunctionPass *RegAllocPass);
-  virtual void addOptimizedRegAlloc(FunctionPass *RegAllocPass);
+  FunctionPass *createTargetRegisterAllocator(bool) override;
+  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
+  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
 };
 } // end anonymous namespace
 
@@ -140,15 +142,42 @@ void NVPTXPassConfig::addIRPasses() {
   disablePass(&BranchFolderPassID);
   disablePass(&TailDuplicateID);
 
+  addPass(createNVPTXImageOptimizerPass());
   TargetPassConfig::addIRPasses();
   addPass(createNVPTXAssignValidGlobalNamesPass());
   addPass(createGenericToNVVMPass());
+  addPass(createNVPTXFavorNonGenericAddrSpacesPass());
+  addPass(createSeparateConstOffsetFromGEPPass());
+  // The SeparateConstOffsetFromGEP pass creates variadic bases that can be used
+  // by multiple GEPs. Run GVN or EarlyCSE to really reuse them. GVN generates
+  // significantly better code than EarlyCSE for some of our benchmarks.
+  if (getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createGVNPass());
+  else
+    addPass(createEarlyCSEPass());
+  // Both FavorNonGenericAddrSpaces and SeparateConstOffsetFromGEP may leave
+  // some dead code. We could remove dead code in an ad-hoc manner, but that
+  // requires manual work and might be error-prone.
+  //
+  // The FavorNonGenericAddrSpaces pass shortcuts unnecessary addrspacecasts,
+  // and leaves them unused.
+  //
+  // SeparateConstOffsetFromGEP rebuilds a new index from the old index, and the
+  // old index and some of its intermediate results may become unused.
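(The comments above describe a small decision tree over the optimization level, and they are what motivates the dead-code-elimination pass queued on the very next line. A toy, self-contained rendering of that wiring, with pass names as plain strings standing in for the createGVNPass/createEarlyCSEPass/createDeadCodeEliminationPass calls of the real code:)

    #include <string>
    #include <vector>

    // Mirror of the hunk above: pick the heavier redundancy elimination
    // only at the aggressive level, then always clean up the dead values
    // that FavorNonGenericAddrSpaces / SeparateConstOffsetFromGEP leave.
    void buildNVPTXIRPasses(std::vector<std::string> &Pipeline,
                            bool Aggressive) {
      Pipeline.push_back("separate-const-offset-from-gep");
      Pipeline.push_back(Aggressive ? "gvn" : "early-cse");
      Pipeline.push_back("dce");   // sweep the leftovers in one shot
    }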
+  addPass(createDeadCodeEliminationPass());
 }
 
 bool NVPTXPassConfig::addInstSelector() {
+  const NVPTXSubtarget &ST =
+      getTM<NVPTXTargetMachine>().getSubtarget<NVPTXSubtarget>();
+
   addPass(createLowerAggrCopies());
   addPass(createAllocaHoisting());
   addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
+
+  if (!ST.hasImageHandles())
+    addPass(createNVPTXReplaceImageHandlesPass());
+
   return false;
 }
 
@@ -159,7 +188,7 @@ bool NVPTXPassConfig::addPostRegAlloc() {
 }
 
 FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
-  return 0; // No reg alloc
+  return nullptr; // No reg alloc
 }
 
 void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index 5fbcf73..2db7c18 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -51,22 +51,22 @@ public:
                      const TargetOptions &Options, Reloc::Model RM,
                      CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit);
 
-  virtual const TargetFrameLowering *getFrameLowering() const {
+  const TargetFrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
-  virtual const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; }
-  virtual const DataLayout *getDataLayout() const { return &DL; }
-  virtual const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget; }
+  const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const DataLayout *getDataLayout() const override { return &DL; }
+  const NVPTXSubtarget *getSubtargetImpl() const override { return &Subtarget; }
 
-  virtual const NVPTXRegisterInfo *getRegisterInfo() const {
+  const NVPTXRegisterInfo *getRegisterInfo() const override {
     return &(InstrInfo.getRegisterInfo());
   }
 
-  virtual NVPTXTargetLowering *getTargetLowering() const {
+  NVPTXTargetLowering *getTargetLowering() const override {
     return const_cast<NVPTXTargetLowering *>(&TLInfo);
   }
 
-  virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const {
+  const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
   }
 
@@ -79,17 +79,17 @@ public:
     return const_cast<ManagedStringPool *>(&ManagedStrPool);
   }
 
-  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
   // Emission of machine code through JITCodeEmitter is not supported.
-  virtual bool addPassesToEmitMachineCode(PassManagerBase &, JITCodeEmitter &,
-                                          bool = true) {
+  bool addPassesToEmitMachineCode(PassManagerBase &, JITCodeEmitter &,
+                                  bool = true) override {
     return true;
   }
 
   // Emission of machine code through MCJIT is not supported.
-  virtual bool addPassesToEmitMC(PassManagerBase &, MCContext *&, raw_ostream &,
-                                 bool = true) {
+  bool addPassesToEmitMC(PassManagerBase &, MCContext *&, raw_ostream &,
+                         bool = true) override {
     return true;
   }
 
diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index 2a7281e..0b438c5 100644
--- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -22,26 +22,26 @@ class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
 public:
   NVPTXTargetObjectFile() {
-    TextSection = 0;
-    DataSection = 0;
-    BSSSection = 0;
-    ReadOnlySection = 0;
+    TextSection = nullptr;
+    DataSection = nullptr;
+    BSSSection = nullptr;
+    ReadOnlySection = nullptr;
 
-    StaticCtorSection = 0;
-    StaticDtorSection = 0;
-    LSDASection = 0;
-    EHFrameSection = 0;
-    DwarfAbbrevSection = 0;
-    DwarfInfoSection = 0;
-    DwarfLineSection = 0;
-    DwarfFrameSection = 0;
-    DwarfPubTypesSection = 0;
-    DwarfDebugInlineSection = 0;
-    DwarfStrSection = 0;
-    DwarfLocSection = 0;
-    DwarfARangesSection = 0;
-    DwarfRangesSection = 0;
-    DwarfMacroInfoSection = 0;
+    StaticCtorSection = nullptr;
+    StaticDtorSection = nullptr;
+    LSDASection = nullptr;
+    EHFrameSection = nullptr;
+    DwarfAbbrevSection = nullptr;
+    DwarfInfoSection = nullptr;
+    DwarfLineSection = nullptr;
+    DwarfFrameSection = nullptr;
+    DwarfPubTypesSection = nullptr;
+    DwarfDebugInlineSection = nullptr;
+    DwarfStrSection = nullptr;
+    DwarfLocSection = nullptr;
+    DwarfARangesSection = nullptr;
+    DwarfRangesSection = nullptr;
+    DwarfMacroInfoSection = nullptr;
   }
 
   virtual ~NVPTXTargetObjectFile();
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
index 60a5173..a9fd190b 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -22,9 +22,9 @@
 #include
 #include
 #include
-//#include
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/IR/InstIterator.h"
+#include "llvm/Support/MutexGuard.h"
 
 using namespace llvm;
 
@@ -33,8 +33,15 @@ typedef std::map<std::string, std::vector<unsigned> > key_val_pair_t;
 typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t;
 typedef std::map<const Module *, global_val_annot_t> per_module_annot_t;
 
 ManagedStatic<per_module_annot_t> annotationCache;
+static sys::Mutex Lock;
+
+void llvm::clearAnnotationCache(const llvm::Module *Mod) {
+  MutexGuard Guard(Lock);
+  annotationCache->erase(Mod);
+}
 
 static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
+  MutexGuard Guard(Lock);
   assert(md && "Invalid mdnode for annotation");
   assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
   // start index = 1, to skip the global variable key
@@ -60,6 +67,7 @@ static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
 }
 
 static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
+  MutexGuard Guard(Lock);
   NamedMDNode *NMD = m->getNamedMetadata(llvm::NamedMDForAnnotations);
   if (!NMD)
     return;
@@ -92,6 +100,7 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
 
 bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
                                  unsigned &retval) {
+  MutexGuard Guard(Lock);
   const Module *m = gv->getParent();
   if ((*annotationCache).find(m) == (*annotationCache).end())
     cacheAnnotationFromMD(m, gv);
@@ -105,6 +114,7 @@ bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
 
 bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, std::string prop,
                                  std::vector<unsigned> &retval) {
+  MutexGuard Guard(Lock);
   const Module *m = gv->getParent();
   if ((*annotationCache).find(m) == (*annotationCache).end())
     cacheAnnotationFromMD(m, gv);
@@ -195,8 +205,37 @@ bool llvm::isImageWriteOnly(const llvm::Value &val) {
   return false;
 }
 
+bool llvm::isImageReadWrite(const llvm::Value &val) {
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(func,
+                                    llvm::PropertyAnnotationNames[
+                                        llvm::PROPERTY_ISREADWRITE_IMAGE_PARAM],
+                                    annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
 bool llvm::isImage(const llvm::Value &val) {
-  return llvm::isImageReadOnly(val) || llvm::isImageWriteOnly(val);
+  return llvm::isImageReadOnly(val) || llvm::isImageWriteOnly(val) ||
+         llvm::isImageReadWrite(val);
+}
+
+bool llvm::isManaged(const llvm::Value &val) {
+  if(const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if(llvm::findOneNVVMAnnotation(gv,
+                          llvm::PropertyAnnotationNames[llvm::PROPERTY_MANAGED],
+                          annot)) {
+      assert((annot == 1) && "Unexpected annotation on a managed symbol");
+      return true;
+    }
+  }
+  return false;
 }
 
 std::string llvm::getTextureName(const llvm::Value &val) {
@@ -354,12 +393,12 @@ llvm::skipPointerTransfer(const Value *V, bool ignore_GEP_indices) {
 const Value *
 llvm::skipPointerTransfer(const Value *V, std::set<const Value *> &processed) {
   if (processed.find(V) != processed.end())
-    return NULL;
+    return nullptr;
   processed.insert(V);
 
   const Value *V2 = V->stripPointerCasts();
   if (V2 != V && processed.find(V2) != processed.end())
-    return NULL;
+    return nullptr;
   processed.insert(V2);
 
   V = V2;
@@ -375,20 +414,20 @@ llvm::skipPointerTransfer(const Value *V, std::set<const Value *> &processed) {
       continue;
     } else if (const PHINode *PN = dyn_cast<PHINode>(V)) {
       if (V != V2 && processed.find(V) != processed.end())
-        return NULL;
+        return nullptr;
       processed.insert(PN);
-      const Value *common = 0;
+      const Value *common = nullptr;
       for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
         const Value *pv = PN->getIncomingValue(i);
         const Value *base = skipPointerTransfer(pv, processed);
         if (base) {
-          if (common == 0)
+          if (!common)
             common = base;
           else if (common != base)
             return PN;
         }
       }
-      if (common == 0)
+      if (!common)
        return PN;
       V = common;
     }
@@ -406,7 +445,7 @@ BasicBlock *llvm::getParentBlock(Value *v) {
   if (Instruction *I = dyn_cast<Instruction>(v))
     return I->getParent();
 
-  return 0;
+  return nullptr;
 }
 
 Function *llvm::getParentFunction(Value *v) {
@@ -419,13 +458,13 @@ Function *llvm::getParentFunction(Value *v) {
   if (BasicBlock *B = dyn_cast<BasicBlock>(v))
     return B->getParent();
 
-  return 0;
+  return nullptr;
 }
 
 // Dump a block by name
 void llvm::dumpBlock(Value *v, char *blockName) {
   Function *F = getParentFunction(v);
-  if (F == 0)
+  if (!F)
     return;
 
   for (Function::iterator it = F->begin(), ie = F->end(); it != ie; ++it) {
@@ -440,8 +479,8 @@ void llvm::dumpBlock(Value *v, char *blockName) {
 // Find an instruction by name
 Instruction *llvm::getInst(Value *base, char *instName) {
   Function *F = getParentFunction(base);
-  if (F == 0)
-    return 0;
+  if (!F)
+    return nullptr;
 
   for (inst_iterator it = inst_begin(F), ie = inst_end(F); it != ie; ++it) {
     Instruction *I = &*it;
@@ -450,7 +489,7 @@ Instruction *llvm::getInst(Value *base, char *instName) {
     }
   }
 
-  return 0;
+  return nullptr;
 }
 
 // Dump an instruction by name
diff --git a/lib/Target/NVPTX/NVPTXUtilities.h b/lib/Target/NVPTX/NVPTXUtilities.h
index a208004..446bfa1 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/lib/Target/NVPTX/NVPTXUtilities.h
@@ -28,6 +28,8 @@ namespace llvm {
 #define NVCL_IMAGE2D_READONLY_FUNCNAME "__is_image2D_readonly"
 #define NVCL_IMAGE3D_READONLY_FUNCNAME "__is_image3D_readonly"
 
+void clearAnnotationCache(const llvm::Module *);
+
 bool findOneNVVMAnnotation(const llvm::GlobalValue *, std::string, unsigned &);
 bool findAllNVVMAnnotation(const llvm::GlobalValue *, std::string,
                            std::vector<unsigned> &);
@@ -38,6 +40,8 @@ bool isSampler(const llvm::Value &);
 bool isImage(const llvm::Value &);
 bool isImageReadOnly(const llvm::Value &);
 bool isImageWriteOnly(const llvm::Value &);
+bool isImageReadWrite(const llvm::Value &);
+bool isManaged(const llvm::Value &);
 
 std::string getTextureName(const llvm::Value &);
 std::string getSurfaceName(const llvm::Value &);
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index 8b5444a..cb8bd72 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -38,6 +38,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "nvptx-reflect"
+
 namespace llvm { void initializeNVVMReflectPass(PassRegistry &); }
 
 namespace {
@@ -49,13 +51,13 @@ private:
 
 public:
   static char ID;
-  NVVMReflect() : ModulePass(ID), ReflectFunction(0) {
+  NVVMReflect() : ModulePass(ID), ReflectFunction(nullptr) {
     initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
     VarMap.clear();
   }
 
-  NVVMReflect(const StringMap<int> &Mapping)
-      : ModulePass(ID), ReflectFunction(0) {
+  NVVMReflect(const StringMap<int> &Mapping)
+      : ModulePass(ID), ReflectFunction(nullptr) {
     initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
     for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end();
          I != E; ++I) {
@@ -63,8 +65,10 @@ public:
     }
   }
 
-  void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); }
-  virtual bool runOnModule(Module &);
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+  }
+  bool runOnModule(Module &) override;
 
   void setVarMap();
 };
@@ -126,7 +130,7 @@ bool NVVMReflect::runOnModule(Module &M) {
   // If reflect function is not used, then there will be
   // no entry in the module.
-  if (ReflectFunction == 0)
+  if (!ReflectFunction)
     return false;
 
   // Validate _reflect function
diff --git a/lib/Target/PowerPC/AsmParser/LLVMBuild.txt b/lib/Target/PowerPC/AsmParser/LLVMBuild.txt
index 02ebf1d..801f27b 100644
--- a/lib/Target/PowerPC/AsmParser/LLVMBuild.txt
+++ b/lib/Target/PowerPC/AsmParser/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/PowerPC/AsmParser/LLVMBuild.txt --------------*- Conf -*--===;
+;===- ./lib/Target/PowerPC/AsmParser/LLVMBuild.txt -------------*- Conf -*--===;
 ;
 ;                     The LLVM Compiler Infrastructure
 ;
@@ -19,5 +19,5 @@ type = Library
 name = PowerPCAsmParser
 parent = PowerPC
-required_libraries = PowerPCDesc PowerPCInfo MC MCParser Support
+required_libraries = MC MCParser PowerPCDesc PowerPCInfo Support
 add_to_library_groups = PowerPC
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 8bb91cf..3ac037d 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -230,7 +230,7 @@ class PPCAsmParser : public MCTargetAsmParser {
 
   bool MatchRegisterName(const AsmToken &Tok,
                          unsigned &RegNo, int64_t &IntVal);
 
-  virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
 
   const MCExpr *ExtractModifierFromExpr(const MCExpr *E,
                                         PPCMCExpr::VariantKind &Variant);
@@ -248,7 +248,7 @@ class PPCAsmParser : public MCTargetAsmParser {
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                SmallVectorImpl<MCParsedAsmOperand *> &Operands,
                                MCStreamer &Out, unsigned &ErrorInfo,
-                               bool MatchingInlineAsm);
+                               bool MatchingInlineAsm) override;
 
   void ProcessInstruction(MCInst &Inst,
                           const SmallVectorImpl<MCParsedAsmOperand *> &Ops);
@@ -264,7 +264,8 @@ class PPCAsmParser : public MCTargetAsmParser {
 
 public:
   PPCAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
-               const MCInstrInfo &_MII)
+               const MCInstrInfo &_MII,
+               const MCTargetOptions &Options)
     : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(_MII) {
     // Check for 64-bit vs. 32-bit pointer mode.
     Triple TheTriple(STI.getTargetTriple());
@@ -275,17 +276,18 @@ public:
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
   }
 
-  virtual bool ParseInstruction(ParseInstructionInfo &Info,
-                                StringRef Name, SMLoc NameLoc,
-                                SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+  bool ParseInstruction(ParseInstructionInfo &Info,
+                        StringRef Name, SMLoc NameLoc,
+                        SmallVectorImpl<MCParsedAsmOperand *> &Operands) override;
 
-  virtual bool ParseDirective(AsmToken DirectiveID);
+  bool ParseDirective(AsmToken DirectiveID) override;
 
-  unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, unsigned Kind);
+  unsigned validateTargetOperandClass(MCParsedAsmOperand *Op,
+                                      unsigned Kind) override;
 
-  virtual const MCExpr *applyModifierToExpr(const MCExpr *E,
-                                            MCSymbolRefExpr::VariantKind,
-                                            MCContext &Ctx);
+  const MCExpr *applyModifierToExpr(const MCExpr *E,
+                                    MCSymbolRefExpr::VariantKind,
+                                    MCContext &Ctx) override;
 };
 
 /// PPCOperand - Instances of this class represent a parsed PowerPC machine
@@ -350,10 +352,10 @@ public:
   }
 
   /// getStartLoc - Get the location of the first token of this operand.
-  SMLoc getStartLoc() const { return StartLoc; }
+  SMLoc getStartLoc() const override { return StartLoc; }
 
   /// getEndLoc - Get the location of the last token of this operand.
-  SMLoc getEndLoc() const { return EndLoc; }
+  SMLoc getEndLoc() const override { return EndLoc; }
 
   /// isPPC64 - True if this operand is for an instruction in 64-bit mode.
   bool isPPC64() const { return IsPPC64; }
@@ -378,7 +380,7 @@ public:
     return TLSReg.Sym;
   }
 
-  unsigned getReg() const {
+  unsigned getReg() const override {
     assert(isRegNumber() && "Invalid access!");
     return (unsigned) Imm.Val;
   }
@@ -403,8 +405,8 @@ public:
     return 7 - countTrailingZeros(Imm.Val);
   }
 
-  bool isToken() const { return Kind == Token; }
-  bool isImm() const { return Kind == Immediate || Kind == Expression; }
+  bool isToken() const override { return Kind == Token; }
+  bool isImm() const override { return Kind == Immediate || Kind == Expression; }
   bool isU2Imm() const { return Kind == Immediate && isUInt<2>(getImm()); }
   bool isU5Imm() const { return Kind == Immediate && isUInt<5>(getImm()); }
   bool isS5Imm() const { return Kind == Immediate && isInt<5>(getImm()); }
@@ -437,8 +439,8 @@ public:
            && isUInt<5>(getImm())); }
   bool isCRBitMask() const { return Kind == Immediate && isUInt<8>(getImm()) &&
                              isPowerOf2_32(getImm()); }
-  bool isMem() const { return false; }
-  bool isReg() const { return false; }
+  bool isMem() const override { return false; }
+  bool isReg() const override { return false; }
 
   void addRegOperands(MCInst &Inst, unsigned N) const {
     llvm_unreachable("addRegOperands");
@@ -544,7 +546,7 @@ public:
     return StringRef(Tok.Data, Tok.Length);
   }
 
-  virtual void print(raw_ostream &OS) const;
+  void print(raw_ostream &OS) const override;
 
   static PPCOperand *CreateToken(StringRef Str, SMLoc S, bool IsPPC64) {
     PPCOperand *Op = new PPCOperand(Token);
@@ -1021,7 +1023,7 @@ ExtractModifierFromExpr(const MCExpr *E,
   switch (E->getKind()) {
   case MCExpr::Target:
   case MCExpr::Constant:
-    return 0;
+    return nullptr;
 
   case MCExpr::SymbolRef: {
     const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
@@ -1049,7 +1051,7 @@ ExtractModifierFromExpr(const MCExpr *E,
       Variant = PPCMCExpr::VK_PPC_HIGHESTA;
       break;
     default:
-      return 0;
+      return nullptr;
     }
 
     return MCSymbolRefExpr::Create(&SRE->getSymbol(), Context);
   }
 
@@ -1059,7 +1061,7 @@ ExtractModifierFromExpr(const MCExpr *E,
     const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
     const MCExpr *Sub = ExtractModifierFromExpr(UE->getSubExpr(), Variant);
     if (!Sub)
-      return 0;
+      return nullptr;
     return MCUnaryExpr::Create(UE->getOpcode(), Sub, Context);
   }
 
@@ -1070,7 +1072,7 @@ ExtractModifierFromExpr(const MCExpr *E,
     const MCExpr *RHS = ExtractModifierFromExpr(BE->getRHS(), RHSVariant);
 
     if (!LHS && !RHS)
-      return 0;
+      return nullptr;
 
     if (!LHS) LHS = BE->getLHS();
     if (!RHS) RHS = BE->getRHS();
@@ -1082,7 +1084,7 @@ ExtractModifierFromExpr(const MCExpr *E,
     else if (LHSVariant == RHSVariant)
       Variant = LHSVariant;
     else
-      return 0;
+      return nullptr;
 
     return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, Context);
   }
@@ -1593,6 +1595,6 @@ PPCAsmParser::applyModifierToExpr(const MCExpr *E,
   case MCSymbolRefExpr::VK_PPC_HIGHESTA:
     return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHESTA, E, false, Ctx);
   default:
-    return 0;
+    return nullptr;
   }
 }
diff --git a/lib/Target/PowerPC/Disassembler/LLVMBuild.txt b/lib/Target/PowerPC/Disassembler/LLVMBuild.txt
index 7f29040..c1011ff 100644
--- a/lib/Target/PowerPC/Disassembler/LLVMBuild.txt
+++ b/lib/Target/PowerPC/Disassembler/LLVMBuild.txt
@@ -19,5 +19,5 @@ type = Library
 name = PowerPCDisassembler
 parent = PowerPC
-required_libraries = MC Support PowerPCDesc PowerPCInfo
+required_libraries = MC PowerPCDesc PowerPCInfo Support
 add_to_library_groups = PowerPC
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index c4a7544..a2305a9 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -17,13 +17,15 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "ppc-disassembler"
+
 typedef MCDisassembler::DecodeStatus DecodeStatus;
 
 namespace {
 class PPCDisassembler : public MCDisassembler {
 public:
-  PPCDisassembler(const MCSubtargetInfo &STI)
-    : MCDisassembler(STI) {}
+  PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+    : MCDisassembler(STI, Ctx) {}
   virtual ~PPCDisassembler() {}
 
   // Override MCDisassembler.
@@ -37,8 +39,9 @@ public:
 } // end anonymous namespace
 
 static MCDisassembler *createPPCDisassembler(const Target &T,
-                                             const MCSubtargetInfo &STI) {
-  return new PPCDisassembler(STI);
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  return new PPCDisassembler(STI, Ctx);
 }
 
 extern "C" void LLVMInitializePowerPCDisassembler() {
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index dc54b52..7279b09 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "PPCInstPrinter.h"
 #include "MCTargetDesc/PPCMCTargetDesc.h"
 #include "MCTargetDesc/PPCPredicates.h"
@@ -23,6 +22,8 @@
 #include "llvm/Target/TargetOpcodes.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 // FIXME: Once the integrated assembler supports full register names, tie this
 // to the verbose-asm setting.
 static cl::opt<bool>
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index 4d1df78..211a628 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -31,8 +31,8 @@ public:
     return IsDarwin;
   }
 
-  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
-  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
 
   // Autogenerated by tblgen.
   void printInstruction(const MCInst *MI, raw_ostream &O);
@@ -41,7 +41,7 @@ public:
 
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printPredicateOperand(const MCInst *MI, unsigned OpNo,
-                             raw_ostream &O, const char *Modifier = 0);
+                             raw_ostream &O, const char *Modifier = nullptr);
 
   void printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index f7309bb..12584be 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -77,9 +77,11 @@ public:
   PPCAsmBackend(const Target &T, bool isLittle) : MCAsmBackend(), TheTarget(T),
                                                   IsLittleEndian(isLittle) {}
 
-  unsigned getNumFixupKinds() const { return PPC::NumTargetFixupKinds; }
+  unsigned getNumFixupKinds() const override {
+    return PPC::NumTargetFixupKinds;
+  }
 
-  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
     const static MCFixupKindInfo InfosBE[PPC::NumTargetFixupKinds] = {
       // name                    offset  bits  flags
       { "fixup_ppc_br24", 6, 24, MCFixupKindInfo::FKF_IsPCRel },
@@ -110,7 +112,7 @@ public:
   }
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const {
+                  uint64_t Value, bool IsPCRel) const override {
     Value = adjustFixupValue(Fixup.getKind(), Value);
     if (!Value) return;           // Doesn't change encoding.
 
@@ -126,7 +128,7 @@ public:
     }
   }
 
-  bool mayNeedRelaxation(const MCInst &Inst) const {
+  bool mayNeedRelaxation(const MCInst &Inst) const override {
     // FIXME.
     return false;
  }
 
   bool fixupNeedsRelaxation(const MCFixup &Fixup,
                             uint64_t Value,
                             const MCRelaxableFragment *DF,
-                            const MCAsmLayout &Layout) const {
+                            const MCAsmLayout &Layout) const override {
     // FIXME.
     llvm_unreachable("relaxInstruction() unimplemented");
   }
 
-  void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
     // FIXME.
     llvm_unreachable("relaxInstruction() unimplemented");
   }
 
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
     uint64_t NumNops = Count / 4;
     for (uint64_t i = 0; i != NumNops; ++i)
       OW->Write32(0x60000000);
@@ -180,7 +182,7 @@ namespace {
   public:
     DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, false) { }
 
-    MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
       bool is64 = getPointerSize() == 8;
       return createPPCMachObjectWriter(
           OS,
@@ -197,7 +199,7 @@ namespace {
       PPCAsmBackend(T, IsLittleEndian), OSABI(OSABI) { }
 
-    MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
       bool is64 = getPointerSize() == 8;
       return createPPCELFObjectWriter(OS, is64, isLittleEndian(), OSABI);
     }
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index d19f6a0..cd3b4f4 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -41,11 +41,12 @@ PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI)
 PPCELFObjectWriter::~PPCELFObjectWriter() {
 }
 
-static MCSymbolRefExpr::VariantKind getAccessVariant(const MCFixup &Fixup) {
+static MCSymbolRefExpr::VariantKind getAccessVariant(const MCValue &Target,
+                                                     const MCFixup &Fixup) {
   const MCExpr *Expr = Fixup.getValue();
 
   if (Expr->getKind() != MCExpr::Target)
-    return Fixup.getAccessVariant();
+    return Target.getAccessVariant();
 
   switch (cast<PPCMCExpr>(Expr)->getKind()) {
   case PPCMCExpr::VK_PPC_None:
@@ -72,7 +73,7 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
                                                const MCFixup &Fixup,
                                                bool IsPCRel) const {
-  MCSymbolRefExpr::VariantKind Modifier = getAccessVariant(Fixup);
+  MCSymbolRefExpr::VariantKind Modifier = getAccessVariant(Target, Fixup);
 
   // determine the type of the relocation
   unsigned Type;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 18609e1..b95a2ac 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -28,7 +28,7 @@ PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) {
     ExceptionsType = ExceptionHandling::DwarfCFI;
 
   if (!is64Bit)
-    Data64bitsDirective = 0; // We can't emit a 64-bit unit in PPC32 mode.
+    Data64bitsDirective = nullptr; // We can't emit a 64-bit unit in PPC32 mode.
 
   AssemblerDialect = 1;           // New-Style mnemonics.
   SupportsDebugInformation= true; // Debug information.
@@ -71,7 +71,7 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit, const Triple& T) {
   ExceptionsType = ExceptionHandling::DwarfCFI;
 
   ZeroDirective = "\t.space\t";
-  Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
+  Data64bitsDirective = is64Bit ? "\t.quad\t" : nullptr;
   AssemblerDialect = 1;           // New-Style mnemonics.
 
   if (T.getOS() == llvm::Triple::FreeBSD ||
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index cee2cb7..754330b 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -21,13 +21,13 @@ namespace llvm {
 class Triple;
 
 class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin {
-  virtual void anchor();
+  void anchor() override;
 public:
   explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple&);
 };
 
 class PPCLinuxMCAsmInfo : public MCAsmInfoELF {
-  virtual void anchor();
+  void anchor() override;
 public:
   explicit PPCLinuxMCAsmInfo(bool is64Bit, const Triple&);
 };
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index b259c5d..a4983ad 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mccodeemitter"
 #include "MCTargetDesc/PPCMCTargetDesc.h"
 #include "MCTargetDesc/PPCFixupKinds.h"
 #include "llvm/ADT/Statistic.h"
@@ -26,6 +25,8 @@
 #include "llvm/Target/TargetOpcodes.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "mccodeemitter"
+
 STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
 
 namespace {
@@ -88,7 +89,7 @@ public:
                              const MCSubtargetInfo &STI) const;
 
   void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                          SmallVectorImpl<MCFixup> &Fixups,
-                         const MCSubtargetInfo &STI) const {
+                         const MCSubtargetInfo &STI) const override {
     // For fast-isel, a float COPY_TO_REGCLASS can survive this long.
     // It's just a nop to keep the register classes happy, so don't
     // generate anything.
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index c181e03..10d068d 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -7,7 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "ppcmcexpr"
 #include "PPCMCExpr.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
@@ -15,6 +14,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "ppcmcexpr"
+
 const PPCMCExpr*
 PPCMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
                   bool isDarwin, MCContext &Ctx) {
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index 5fc7918..3421b91 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -76,16 +76,16 @@ public:
 
   /// @}
 
-  void PrintImpl(raw_ostream &OS) const;
+  void PrintImpl(raw_ostream &OS) const override;
   bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const;
-  void AddValueSymbols(MCAssembler *) const;
-  const MCSection *FindAssociatedSection() const {
+                                 const MCAsmLayout *Layout) const override;
+  void AddValueSymbols(MCAssembler *) const override;
+  const MCSection *FindAssociatedSection() const override {
     return getSubExpr()->FindAssociatedSection();
   }
 
   // There are no TLS PPCMCExprs at the moment.
-  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
 
   static bool classof(const MCExpr *E) {
     return E->getKind() == MCExpr::Target;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 105c511..7057797 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -26,6 +26,8 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_MC_DESC
 #include "PPCGenInstrInfo.inc"
 
@@ -35,8 +37,6 @@
 #define GET_REGINFO_MC_DESC
 #include "PPCGenRegisterInfo.inc"
 
-using namespace llvm;
-
 // Pin the vtable to this file.
 PPCTargetStreamer::~PPCTargetStreamer() {}
 PPCTargetStreamer::PPCTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
@@ -80,7 +80,7 @@ static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
   // Initial state of the frame pointer is R1.
   unsigned Reg = isPPC64 ? PPC::X1 : PPC::R1;
   MCCFIInstruction Inst =
-      MCCFIInstruction::createDefCfa(0, MRI.getDwarfRegNum(Reg, true), 0);
+      MCCFIInstruction::createDefCfa(nullptr, MRI.getDwarfRegNum(Reg, true), 0);
   MAI->addInitialFrameState(Inst);
 
   return MAI;
@@ -115,14 +115,14 @@ class PPCTargetAsmStreamer : public PPCTargetStreamer {
 public:
   PPCTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS)
       : PPCTargetStreamer(S), OS(OS) {}
-  virtual void emitTCEntry(const MCSymbol &S) {
+  void emitTCEntry(const MCSymbol &S) override {
     OS << "\t.tc ";
     OS << S.getName();
     OS << "[TC],";
     OS << S.getName();
     OS << '\n';
   }
-  virtual void emitMachine(StringRef CPU) {
+  void emitMachine(StringRef CPU) override {
     OS << "\t.machine " << CPU << '\n';
   }
 };
@@ -130,11 +130,11 @@ public:
 class PPCTargetELFStreamer : public PPCTargetStreamer {
 public:
   PPCTargetELFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
-  virtual void emitTCEntry(const MCSymbol &S) {
+  void emitTCEntry(const MCSymbol &S) override {
     // Creates a R_PPC64_TOC relocation
     Streamer.EmitSymbolValue(&S, 8);
   }
-  virtual void emitMachine(StringRef CPU) {
+  void emitMachine(StringRef CPU) override {
     // FIXME: Is there anything to do in here or does this directive only
     // limit the parser?
   }
@@ -143,10 +143,10 @@ public:
 class PPCTargetMachOStreamer : public PPCTargetStreamer {
 public:
   PPCTargetMachOStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
-  virtual void emitTCEntry(const MCSymbol &S) {
+  void emitTCEntry(const MCSymbol &S) override {
     llvm_unreachable("Unknown pseudo-op: .tc");
   }
-  virtual void emitMachine(StringRef CPU) {
+  void emitMachine(StringRef CPU) override {
     // FIXME: We should update the CPUType, CPUSubType in the Object file if
     // the new values are different from the defaults.
   }
 };
 
@@ -175,13 +175,12 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
 
 static MCStreamer *
 createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
-                    bool isVerboseAsm, bool useCFI, bool useDwarfDirectory,
+                    bool isVerboseAsm, bool useDwarfDirectory,
                     MCInstPrinter *InstPrint, MCCodeEmitter *CE,
                     MCAsmBackend *TAB, bool ShowInst) {
-  MCStreamer *S =
-      llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory,
-                              InstPrint, CE, TAB, ShowInst);
+  MCStreamer *S = llvm::createAsmStreamer(
+      Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
   new PPCTargetAsmStreamer(*S, OS);
   return S;
 }
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index bbafe2e..cff27ba 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -44,7 +44,7 @@ public:
   void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
                         const MCAsmLayout &Layout, const MCFragment *Fragment,
                         const MCFixup &Fixup, MCValue Target,
-                        uint64_t &FixedValue) {
+                        uint64_t &FixedValue) override {
     if (Writer->is64Bit()) {
       report_fatal_error("Relocation emission for MachO/PPC64 unimplemented.");
     } else
@@ -206,7 +206,7 @@ bool PPCMachObjectWriter::RecordScatteredRelocation(
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
-  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+  const MCSymbolData *A_SD = &Asm.getSymbolData(*A);
 
   if (!A_SD->getFragment())
     report_fatal_error("symbol '" + A->getName() +
@@ -219,7 +219,7 @@ bool PPCMachObjectWriter::RecordScatteredRelocation(
   uint32_t Value2 = 0;
 
   if (const MCSymbolRefExpr *B = Target.getSymB()) {
-    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+    const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
 
     if (!B_SD->getFragment())
       report_fatal_error("symbol '" + B->getSymbol().getName() +
@@ -324,7 +324,7 @@ void PPCMachObjectWriter::RecordPPCRelocation(
   // this doesn't seem right for RIT_PPC_BR24
 
   // Get the symbol data, if any.
- MCSymbolData *SD = 0; + const MCSymbolData *SD = nullptr; if (Target.getSymA()) SD = &Asm.getSymbolData(Target.getSymA()->getSymbol()); diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 9ce8ea9..e89fb2d 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -16,7 +16,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asmprinter" #include "PPC.h" #include "InstPrinter/PPCInstPrinter.h" #include "MCTargetDesc/PPCMCExpr.h" @@ -59,6 +58,8 @@ #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; +#define DEBUG_TYPE "asmprinter" + namespace { class PPCAsmPrinter : public AsmPrinter { protected: @@ -70,22 +71,22 @@ namespace { : AsmPrinter(TM, Streamer), Subtarget(TM.getSubtarget()), TOCLabelID(0) {} - virtual const char *getPassName() const { + const char *getPassName() const override { return "PowerPC Assembly Printer"; } MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym); - virtual void EmitInstruction(const MachineInstr *MI); + void EmitInstruction(const MachineInstr *MI) override; void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O); + raw_ostream &O) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O); + raw_ostream &O) override; }; /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux @@ -94,15 +95,15 @@ namespace { explicit PPCLinuxAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) : PPCAsmPrinter(TM, Streamer) {} - virtual const char *getPassName() const { + const char *getPassName() const override { return "Linux PPC Assembly Printer"; } - bool doFinalization(Module &M); + bool doFinalization(Module &M) override; - virtual void EmitFunctionEntryLabel(); + void EmitFunctionEntryLabel() override; - void EmitFunctionBodyEnd(); + void EmitFunctionBodyEnd() override; }; /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac @@ -112,12 +113,12 @@ namespace { explicit PPCDarwinAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) : PPCAsmPrinter(TM, Streamer) {} - virtual const char *getPassName() const { + const char *getPassName() const override { return "Darwin PPC Assembly Printer"; } - bool doFinalization(Module &M); - void EmitStartOfAsmFile(Module &M); + bool doFinalization(Module &M) override; + void EmitStartOfAsmFile(Module &M) override; void EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs); }; @@ -180,7 +181,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, MachineModuleInfoImpl::StubValueTy &StubSym = MMI->getObjFileInfo() .getGVStubEntry(SymToPrint); - if (StubSym.getPointer() == 0) + if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); } else if (GV->isDeclaration() || GV->hasCommonLinkage() || @@ -190,7 +191,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, MachineModuleInfoImpl::StubValueTy &StubSym = MMI->getObjFileInfo(). 
getHiddenGVStubEntry(SymToPrint); - if (StubSym.getPointer() == 0) + if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); } else { @@ -207,7 +208,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, } default: - O << ""; + O << ""; return; } } @@ -288,9 +289,9 @@ MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) { MCSymbol *&TOCEntry = TOC[Sym]; // To avoid name clash check if the name already exists. - while (TOCEntry == 0) { + while (!TOCEntry) { if (OutContext.LookupSymbol(Twine(DL->getPrivateGlobalPrefix()) + - "C" + Twine(TOCLabelID++)) == 0) { + "C" + Twine(TOCLabelID++)) == nullptr) { TOCEntry = GetTempSymbol("C", TOCLabelID); } } @@ -342,7 +343,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { // Map symbol -> label of TOC entry assert(MO.isGlobal() || MO.isCPI() || MO.isJTI()); - MCSymbol *MOSymbol = 0; + MCSymbol *MOSymbol = nullptr; if (MO.isGlobal()) MOSymbol = getSymbol(MO.getGlobal()); else if (MO.isCPI()) @@ -372,23 +373,19 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { const MachineOperand &MO = MI->getOperand(2); assert((MO.isGlobal() || MO.isCPI() || MO.isJTI()) && "Invalid operand for ADDIStocHA!"); - MCSymbol *MOSymbol = 0; + MCSymbol *MOSymbol = nullptr; bool IsExternal = false; bool IsFunction = false; bool IsCommon = false; bool IsAvailExt = false; if (MO.isGlobal()) { - const GlobalValue *GValue = MO.getGlobal(); - const GlobalAlias *GAlias = dyn_cast(GValue); - const GlobalValue *RealGValue = - GAlias ? GAlias->getAliasedGlobal() : GValue; - MOSymbol = getSymbol(RealGValue); - const GlobalVariable *GVar = dyn_cast(RealGValue); - IsExternal = GVar && !GVar->hasInitializer(); - IsCommon = GVar && RealGValue->hasCommonLinkage(); - IsFunction = !GVar; - IsAvailExt = GVar && RealGValue->hasAvailableExternallyLinkage(); + const GlobalValue *GV = MO.getGlobal(); + MOSymbol = getSymbol(GV); + IsExternal = GV->isDeclaration(); + IsCommon = GV->hasCommonLinkage(); + IsFunction = GV->getType()->getElementType()->isFunctionTy(); + IsAvailExt = GV->hasAvailableExternallyLinkage(); } else if (MO.isCPI()) MOSymbol = GetCPISymbol(MO.getIndex()); else if (MO.isJTI()) @@ -416,7 +413,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { const MachineOperand &MO = MI->getOperand(1); assert((MO.isGlobal() || MO.isJTI() || MO.isCPI()) && "Invalid operand for LDtocL!"); - MCSymbol *MOSymbol = 0; + MCSymbol *MOSymbol = nullptr; if (MO.isJTI()) MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex())); @@ -427,14 +424,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } else if (MO.isGlobal()) { const GlobalValue *GValue = MO.getGlobal(); - const GlobalAlias *GAlias = dyn_cast(GValue); - const GlobalValue *RealGValue = - GAlias ? 
GAlias->getAliasedGlobal() : GValue; - MOSymbol = getSymbol(RealGValue); - const GlobalVariable *GVar = dyn_cast(RealGValue); - - if (!GVar || !GVar->hasInitializer() || RealGValue->hasCommonLinkage() || - RealGValue->hasAvailableExternallyLinkage() || + MOSymbol = getSymbol(GValue); + if (GValue->isDeclaration() || GValue->hasCommonLinkage() || + GValue->hasAvailableExternallyLinkage() || TM.getCodeModel() == CodeModel::Large) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); } @@ -456,19 +448,15 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { TmpInst.setOpcode(PPC::ADDI8); const MachineOperand &MO = MI->getOperand(2); assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL"); - MCSymbol *MOSymbol = 0; + MCSymbol *MOSymbol = nullptr; bool IsExternal = false; bool IsFunction = false; if (MO.isGlobal()) { - const GlobalValue *GValue = MO.getGlobal(); - const GlobalAlias *GAlias = dyn_cast(GValue); - const GlobalValue *RealGValue = - GAlias ? GAlias->getAliasedGlobal() : GValue; - MOSymbol = getSymbol(RealGValue); - const GlobalVariable *GVar = dyn_cast(RealGValue); - IsExternal = GVar && !GVar->hasInitializer(); - IsFunction = !GVar; + const GlobalValue *GV = MO.getGlobal(); + MOSymbol = getSymbol(GV); + IsExternal = GV->isDeclaration(); + IsFunction = GV->getType()->getElementType()->isFunctionTy(); } else if (MO.isCPI()) MOSymbol = GetCPISymbol(MO.getIndex()); diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp index 9276211..ee90671 100644 --- a/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -15,7 +15,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "ppc-branch-select" #include "PPC.h" #include "MCTargetDesc/PPCPredicates.h" #include "PPCInstrBuilder.h" @@ -26,6 +25,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "ppc-branch-select" + STATISTIC(NumExpanded, "Number of branches expanded to long format"); namespace llvm { @@ -42,9 +43,9 @@ namespace { /// BlockSizes - The sizes of the basic blocks in the function. 
    std::vector<unsigned> BlockSizes;

-    virtual bool runOnMachineFunction(MachineFunction &Fn);
+    bool runOnMachineFunction(MachineFunction &Fn) override;

-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "PowerPC Branch Selector";
     }
   };
@@ -112,7 +113,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
       unsigned MBBStartOffset = 0;
       for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
            I != E; ++I) {
-        MachineBasicBlock *Dest = 0;
+        MachineBasicBlock *Dest = nullptr;
         if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm())
           Dest = I->getOperand(2).getMBB();
         else if ((I->getOpcode() == PPC::BC || I->getOpcode() == PPC::BCn) &&
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 9c5db50..ec1e34d 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -23,8 +23,6 @@
 //
 //===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "ctrloops"
-
 #include "llvm/Transforms/Scalar.h"
 #include "PPC.h"
 #include "PPCTargetMachine.h"
@@ -61,6 +59,8 @@

 using namespace llvm;

+#define DEBUG_TYPE "ctrloops"
+
 #ifndef NDEBUG
 static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
 #endif
@@ -84,16 +84,16 @@ namespace {
   public:
     static char ID;

-    PPCCTRLoops() : FunctionPass(ID), TM(0) {
+    PPCCTRLoops() : FunctionPass(ID), TM(nullptr) {
       initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
     }
     PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
       initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
     }

-    virtual bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;

-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<LoopInfo>();
       AU.addPreserved<LoopInfo>();
       AU.addRequired<DominatorTreeWrapperPass>();
@@ -128,12 +128,12 @@ namespace {
       initializePPCCTRLoopsVerifyPass(*PassRegistry::getPassRegistry());
     }

-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<MachineDominatorTree>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }

-    virtual bool runOnMachineFunction(MachineFunction &MF);
+    bool runOnMachineFunction(MachineFunction &MF) override;

   private:
     MachineDominatorTree *MDT;
@@ -172,7 +172,7 @@ bool PPCCTRLoops::runOnFunction(Function &F) {
   SE = &getAnalysis<ScalarEvolution>();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  DL = DLP ? &DLP->getDataLayout() : 0;
+  DL = DLP ? &DLP->getDataLayout() : nullptr;
   LibInfo = getAnalysisIfAvailable<TargetLibraryInfo>();

   bool MadeChange = false;
@@ -370,6 +370,14 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
                   J->getOpcode() == Instruction::URem ||
                   J->getOpcode() == Instruction::SRem)) {
         return true;
+      } else if (TT.isArch32Bit() &&
+                 isLargeIntegerTy(false, J->getType()->getScalarType()) &&
+                 (J->getOpcode() == Instruction::Shl ||
+                  J->getOpcode() == Instruction::AShr ||
+                  J->getOpcode() == Instruction::LShr)) {
+        // Only on PPC32, for 128-bit integers (specifically not 64-bit
+        // integers), these might be runtime calls.
+        return true;
       } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
         // On PowerPC, indirect jumps use the counter register.
         return true;
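The new mightUseCTR case just above is the one substantive change in PPCCTRLoops.cpp: a loop may only be converted to use the counter register if nothing in its body can clobber CTR, and on 32-bit PPC a shift of an integer wider than 64 bits can be expanded into a runtime library call, which has to be treated like any other call. A standalone sketch of the shape of that test; the enum and helper names are stand-ins, not LLVM API:

    enum class Opcode { Shl, AShr, LShr, Add, Mul };

    // Stand-in for the patch's isLargeIntegerTy(false, Ty) check: "large"
    // means wider than the 64-bit shifts the backend can expand inline.
    static bool isLargeIntegerTy(unsigned Bits) { return Bits > 64; }

    // True if the operation might become a library call on 32-bit PPC and
    // must therefore be assumed to clobber the counter register.
    static bool mightBeRuntimeCall(bool IsArch32Bit, Opcode Op, unsigned Bits) {
      if (!IsArch32Bit || !isLargeIntegerTy(Bits))
        return false;
      return Op == Opcode::Shl || Op == Opcode::AShr || Op == Opcode::LShr;
    }

    int main() {
      // A 128-bit shift on PPC32 is conservatively treated as a possible
      // call; the same shift on a 64-bit target is not.
      return (mightBeRuntimeCall(true, Opcode::Shl, 128) &&
              !mightBeRuntimeCall(false, Opcode::Shl, 128)) ? 0 : 1;
    }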
@@ -424,9 +432,9 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
   SmallVector<BasicBlock*, 4> ExitingBlocks;
   L->getExitingBlocks(ExitingBlocks);

-  BasicBlock *CountedExitBlock = 0;
-  const SCEV *ExitCount = 0;
-  BranchInst *CountedExitBranch = 0;
+  BasicBlock *CountedExitBlock = nullptr;
+  const SCEV *ExitCount = nullptr;
+  BranchInst *CountedExitBranch = nullptr;
   for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
        IE = ExitingBlocks.end(); I != IE; ++I) {
     const SCEV *EC = SE->getExitCount(L, *I);
diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp
index 84fc888..0875523 100644
--- a/lib/Target/PowerPC/PPCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp
@@ -32,7 +32,7 @@ namespace {
     JITCodeEmitter &MCE;
     MachineModuleInfo *MMI;

-    void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<MachineModuleInfo>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
@@ -73,11 +73,13 @@ namespace {
     unsigned getTLSRegEncoding(const MachineInstr &MI, unsigned OpNo) const;
     unsigned getTLSCallEncoding(const MachineInstr &MI, unsigned OpNo) const;

-    const char *getPassName() const { return "PowerPC Machine Code Emitter"; }
+    const char *getPassName() const override {
+      return "PowerPC Machine Code Emitter";
+    }

     /// runOnMachineFunction - emits the given MachineFunction to memory
     ///
-    bool runOnMachineFunction(MachineFunction &MF);
+    bool runOnMachineFunction(MachineFunction &MF) override;

     /// emitBasicBlock - emits the given MachineBasicBlock to memory
     ///
@@ -102,7 +104,7 @@ bool PPCCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
   MMI = &getAnalysis<MachineModuleInfo>();
   MCE.setModuleInfo(MMI);
   do {
-    MovePCtoLROffset = 0;
+    MovePCtoLROffset = nullptr;
     MCE.startFunction(MF);
     for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
       emitBasicBlock(*BB);
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index dd45683..ed3cb4d 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -13,7 +13,6 @@
 //
 //===----------------------------------------------------------------------===//

-#define DEBUG_TYPE "ppcfastisel"
 #include "PPC.h"
 #include "MCTargetDesc/PPCPredicates.h"
 #include "PPCISelLowering.h"
@@ -58,6 +57,8 @@
 //===----------------------------------------------------------------------===//
 using namespace llvm;

+#define DEBUG_TYPE "ppcfastisel"
+
 namespace {

 typedef struct Address {
@@ -85,7 +86,7 @@ class PPCFastISel final : public FastISel {
     const TargetMachine &TM;
     const TargetInstrInfo &TII;
     const TargetLowering &TLI;
-    const PPCSubtarget &PPCSubTarget;
+    const PPCSubtarget *PPCSubTarget;
     LLVMContext *Context;

   public:
@@ -95,31 +96,29 @@ class PPCFastISel final : public FastISel {
       TM(FuncInfo.MF->getTarget()),
       TII(*TM.getInstrInfo()),
       TLI(*TM.getTargetLowering()),
-      PPCSubTarget(
-       *((static_cast<const PPCTargetMachine *>(&TM))->getSubtargetImpl())
-      ),
+      PPCSubTarget(&TM.getSubtarget<PPCSubtarget>()),
       Context(&FuncInfo.Fn->getContext()) { }

   // Backend specific FastISel code.
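The PPCFastISel hunk above turns PPCSubTarget from a reference member into a pointer, and PPCDAGToDAGISel gets the same treatment later in this patch, where runOnMachineFunction re-queries the subtarget before each function is selected. The point is that a pointer member can be reseated after construction while a reference cannot. A minimal sketch of the pattern, with invented type names:

    struct Subtarget { bool HasVSX; };

    struct Machine {
      Subtarget ST;
      const Subtarget *getSubtargetImpl() const { return &ST; }
    };

    struct Selector {
      const Machine &TM;
      const Subtarget *ST;   // was: const Subtarget &ST;
      explicit Selector(const Machine &M) : TM(M), ST(M.getSubtargetImpl()) {}

      void beginFunction() {
        // A pointer can be refreshed per function; a reference is bound
        // once, at construction, and can never point anywhere else.
        ST = TM.getSubtargetImpl();
      }
      bool useVSX() const { return ST->HasVSX; }
    };

    int main() {
      Machine M{{true}};
      Selector S(M);
      S.beginFunction();
      return S.useVSX() ? 0 : 1;
    }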
private: - virtual bool TargetSelectInstruction(const Instruction *I); - virtual unsigned TargetMaterializeConstant(const Constant *C); - virtual unsigned TargetMaterializeAlloca(const AllocaInst *AI); - virtual bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, - const LoadInst *LI); - virtual bool FastLowerArguments(); - virtual unsigned FastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm); - virtual unsigned FastEmitInst_ri(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill, - uint64_t Imm); - virtual unsigned FastEmitInst_r(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill); - virtual unsigned FastEmitInst_rr(unsigned MachineInstOpcode, - const TargetRegisterClass *RC, - unsigned Op0, bool Op0IsKill, - unsigned Op1, bool Op1IsKill); + bool TargetSelectInstruction(const Instruction *I) override; + unsigned TargetMaterializeConstant(const Constant *C) override; + unsigned TargetMaterializeAlloca(const AllocaInst *AI) override; + bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) override; + bool FastLowerArguments() override; + unsigned FastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm) override; + unsigned FastEmitInst_ri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + uint64_t Imm); + unsigned FastEmitInst_r(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill); + unsigned FastEmitInst_rr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill); // Instruction selection routines. private: @@ -282,7 +281,7 @@ bool PPCFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) { // Given a value Obj, create an Address object Addr that represents its // address. Return false if we can't handle it. bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) { - const User *U = NULL; + const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; if (const Instruction *I = dyn_cast(Obj)) { // Don't walk into other basic blocks unless the object is an alloca from @@ -556,7 +555,7 @@ bool PPCFastISel::SelectLoad(const Instruction *I) { // to constrain RA from using R0/X0 when this is not legal. unsigned AssignedReg = FuncInfo.ValueMap[I]; const TargetRegisterClass *RC = - AssignedReg ? MRI.getRegClass(AssignedReg) : 0; + AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr; unsigned ResultReg = 0; if (!PPCEmitLoad(VT, ResultReg, Addr, RC)) @@ -739,7 +738,7 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2, return false; MVT SrcVT = SrcEVT.getSimpleVT(); - if (SrcVT == MVT::i1 && PPCSubTarget.useCRBits()) + if (SrcVT == MVT::i1 && PPCSubTarget->useCRBits()) return false; // See if operand 2 is an immediate encodeable in the compare. @@ -900,7 +899,7 @@ unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg, if (!IsSigned) { LoadOpc = PPC::LFIWZX; Addr.Offset = 4; - } else if (PPCSubTarget.hasLFIWAX()) { + } else if (PPCSubTarget->hasLFIWAX()) { LoadOpc = PPC::LFIWAX; Addr.Offset = 4; } @@ -941,7 +940,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) { // We can only lower an unsigned convert if we have the newer // floating-point conversion operations. 
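This hunk is a compact summary of the C++11 cleanup running through the whole rebase: methods that genuinely override a base-class virtual (TargetSelectInstruction, FastEmit_i, and so on) trade virtual for override, while the FastEmitInst_ri/_r/_rr helpers lose virtual and gain nothing, which suggests they only shadow non-virtual base helpers rather than override anything. A compilable miniature of both rules plus the 0-to-nullptr change; the class names are illustrative:

    struct Base {
      virtual ~Base() {}
      virtual const char *name() const { return "base"; }
      unsigned helper(unsigned X) const { return X + 1; }  // non-virtual
    };

    struct Derived : Base {
      // 'override' makes the compiler verify that a matching base-class
      // virtual exists; a typo'd name or signature is now a hard error
      // instead of a silent brand-new virtual.
      const char *name() const override { return "derived"; }

      // Writing 'override' here would not compile: Base::helper is not
      // virtual, so this declaration shadows it (cf. FastEmitInst_ri).
      unsigned helper(unsigned X) const { return X + 2; }
    };

    int main() {
      Derived D;
      const Base *P = nullptr;   // was: const Base *P = 0;
      P = &D;
      // Virtual dispatch reaches Derived::name; the non-virtual helper
      // is resolved statically through the Base pointer.
      return (P->name()[0] == 'd' && P->helper(1) == 2) ? 0 : 1;
    }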
- if (!IsSigned && !PPCSubTarget.hasFPCVT()) + if (!IsSigned && !PPCSubTarget->hasFPCVT()) return false; // FIXME: For now we require the newer floating-point conversion operations @@ -949,7 +948,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) { // to single-precision float. Otherwise we have to generate a lot of // fiddly code to avoid double rounding. If necessary, the fiddly code // can be found in PPCTargetLowering::LowerINT_TO_FP(). - if (DstVT == MVT::f32 && !PPCSubTarget.hasFPCVT()) + if (DstVT == MVT::f32 && !PPCSubTarget->hasFPCVT()) return false; // Extend the input if necessary. @@ -1012,7 +1011,7 @@ unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT, // to determine the required register class. unsigned AssignedReg = FuncInfo.ValueMap[I]; const TargetRegisterClass *RC = - AssignedReg ? MRI.getRegClass(AssignedReg) : 0; + AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr; unsigned ResultReg = 0; if (!PPCEmitLoad(VT, ResultReg, Addr, RC, !IsSigned)) @@ -1064,7 +1063,7 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) { if (IsSigned) Opc = PPC::FCTIWZ; else - Opc = PPCSubTarget.hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ; + Opc = PPCSubTarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ; else Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ; @@ -1863,7 +1862,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { if (!GVar) { // If GV is an alias, use the aliasee for determining thread-locality. if (const GlobalAlias *GA = dyn_cast(GV)) - GVar = dyn_cast_or_null(GA->getAliasedGlobal()); + GVar = dyn_cast_or_null(GA->getAliasee()); } // FIXME: We don't yet handle the complexity of TLS. @@ -2001,7 +2000,7 @@ unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm, unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT) { // If we're using CR bit registers for i1 values, handle that as a special // case first. - if (VT == MVT::i1 && PPCSubTarget.useCRBits()) { + if (VT == MVT::i1 && PPCSubTarget->useCRBits()) { const ConstantInt *CI = cast(C); unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -2149,7 +2148,7 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, unsigned ResultReg = MI->getOperand(0).getReg(); - if (!PPCEmitLoad(VT, ResultReg, Addr, 0, IsZExt)) + if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt)) return false; MI->eraseFromParent(); @@ -2175,7 +2174,7 @@ unsigned PPCFastISel::FastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) { // If we're using CR bit registers for i1 values, handle that as a special // case first. - if (VT == MVT::i1 && PPCSubTarget.useCRBits()) { + if (VT == MVT::i1 && PPCSubTarget->useCRBits()) { unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Imm == 0 ? PPC::CRUNSET : PPC::CRSET), ImmReg); @@ -2261,6 +2260,6 @@ namespace llvm { if (Subtarget->isPPC64() && Subtarget->isSVR4ABI()) return new PPCFastISel(FuncInfo, LibInfo); - return 0; + return nullptr; } } diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index d8f491f..e294156 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -222,7 +222,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF, if (!DisableRedZone && (Subtarget.isPPC64() || // 32-bit SVR4, no stack- !Subtarget.isSVR4ABI() || // allocated locals. 
- FrameSize == 0) && + FrameSize == 0) && FrameSize <= 224 && // Fits in red zone. !MFI->hasVarSizedObjects() && // No dynamic alloca. !MFI->adjustsStack() && // No calls. @@ -281,8 +281,8 @@ bool PPCFrameLowering::needsFP(const MachineFunction &MF) const { // Naked functions have no stack frame pushed, so we don't have a frame // pointer. - if (MF.getFunction()->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::Naked)) + if (MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::Naked)) return false; return MF.getTarget().Options.DisableFramePointerElim(MF) || @@ -426,7 +426,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { assert(FPIndex && "No Frame Pointer Save Slot!"); FPOffset = FFI->getObjectOffset(FPIndex); } else { - FPOffset = PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI); + FPOffset = + PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI); } } @@ -562,13 +563,14 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { assert(NegFrameSize); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, NegFrameSize)); - BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); if (HasFP) { unsigned Reg = MRI->getDwarfRegNum(FPReg, true); CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, Reg, FPOffset)); - BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } @@ -576,7 +578,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { unsigned Reg = MRI->getDwarfRegNum(BPReg, true); CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, Reg, BPOffset)); - BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } @@ -584,7 +586,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { unsigned Reg = MRI->getDwarfRegNum(LRReg, true); CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, Reg, LROffset)); - BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } } @@ -601,7 +603,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); - BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } } @@ -629,7 +631,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) { unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( nullptr, MRI->getDwarfRegNum(PPC::CR2, true), 8)); - BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); continue; } @@ -637,7 +639,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx()); unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( nullptr, MRI->getDwarfRegNum(Reg, true), Offset)); - BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } } 
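All of the prologue CFI emission above now uses TargetOpcode::CFI_INSTRUCTION rather than a PPC-specific PPC::CFI_INSTRUCTION: the pseudo is target-independent in this LLVM revision, so generic code can recognize it in any backend's output. The recurring pattern is: register an MCCFIInstruction with MachineModuleInfo, then attach the returned index to a CFI_INSTRUCTION pseudo. Sketched below with stand-in types, not the real LLVM classes:

    #include <vector>

    // Stand-ins for MCCFIInstruction / MachineModuleInfo / BuildMI.
    struct CFIInstruction { int Kind; int Reg; int Offset; };

    struct ModuleInfo {
      std::vector<CFIInstruction> FrameInsts;
      unsigned addFrameInst(const CFIInstruction &I) {
        FrameInsts.push_back(I);
        return static_cast<unsigned>(FrameInsts.size() - 1);
      }
    };

    struct MachineInstrStub { unsigned Opcode; unsigned CFIIndex; };

    enum { CFI_INSTRUCTION = 1 };  // one target-independent pseudo opcode

    // Record a "def_cfa_offset"-style directive, then emit the pseudo
    // instruction that carries its index through codegen.
    static MachineInstrStub emitDefCfaOffset(ModuleInfo &MMI, int Offset) {
      unsigned Index = MMI.addFrameInst({/*Kind=*/0, /*Reg=*/-1, Offset});
      return {CFI_INSTRUCTION, Index};
    }

    int main() {
      ModuleInfo MMI;
      MachineInstrStub MI = emitDefCfaOffset(MMI, -128);
      return (MI.Opcode == CFI_INSTRUCTION && MI.CFIIndex == 0) ? 0 : 1;
    }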
@@ -712,7 +714,8 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, assert(FPIndex && "No Frame Pointer Save Slot!"); FPOffset = FFI->getObjectOffset(FPIndex); } else { - FPOffset = PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI); + FPOffset = + PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI); } } @@ -930,9 +933,9 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, MFI->CreateFixedObject(-1 * TCSPDelta, TCSPDelta, true); } - // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the + // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the // function uses CR 2, 3, or 4. - if (!isPPC64 && !isDarwinABI && + if (!isPPC64 && !isDarwinABI && (MRI.isPhysRegUsed(PPC::CR2) || MRI.isPhysRegUsed(PPC::CR3) || MRI.isPhysRegUsed(PPC::CR4))) { @@ -1106,10 +1109,10 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, unsigned Reg = CSI[i].getReg(); if ((Subtarget.isSVR4ABI() && Reg == PPC::CR2) - // Leave Darwin logic as-is. - || (!Subtarget.isSVR4ABI() && - (PPC::CRBITRCRegClass.contains(Reg) || - PPC::CRRCRegClass.contains(Reg)))) { + // Leave Darwin logic as-is. + || (!Subtarget.isSVR4ABI() && + (PPC::CRBITRCRegClass.contains(Reg) || + PPC::CRRCRegClass.contains(Reg)))) { int FI = CSI[i].getFrameIdx(); FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); @@ -1190,11 +1193,11 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, } } -bool +bool PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const { // Currently, this function only handles SVR4 32- and 64-bit ABIs. // Return false otherwise to maintain pre-existing behavior. @@ -1207,7 +1210,7 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, DebugLoc DL; bool CRSpilled = false; MachineInstrBuilder CRMIB; - + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); // Only Darwin actually uses the VRSAVE register, but it can still appear @@ -1237,21 +1240,21 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, CRSpilled = true; FuncInfo->setSpillsCR(); - // 32-bit: FP-relative. Note that we made sure CR2-CR4 all have - // the same frame index in PPCRegisterInfo::hasReservedSpillSlot. - CRMIB = BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::R12) + // 32-bit: FP-relative. Note that we made sure CR2-CR4 all have + // the same frame index in PPCRegisterInfo::hasReservedSpillSlot. 
+ CRMIB = BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::R12) .addReg(Reg, RegState::ImplicitKill); - MBB.insert(MI, CRMIB); - MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW)) - .addReg(PPC::R12, - getKillRegState(true)), - CSI[i].getFrameIdx())); + MBB.insert(MI, CRMIB); + MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW)) + .addReg(PPC::R12, + getKillRegState(true)), + CSI[i].getFrameIdx())); } } else { const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.storeRegToStackSlot(MBB, MI, Reg, true, - CSI[i].getFrameIdx(), RC, TRI); + CSI[i].getFrameIdx(), RC, TRI); } } return true; @@ -1260,8 +1263,8 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, static void restoreCRs(bool isPPC64, bool is31, bool CR2Spilled, bool CR3Spilled, bool CR4Spilled, - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector &CSI, unsigned CSIIndex) { + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector &CSI, unsigned CSIIndex) { MachineFunction *MF = MBB.getParent(); const PPCInstrInfo &TII = @@ -1275,12 +1278,12 @@ restoreCRs(bool isPPC64, bool is31, else { // 32-bit: FP-relative MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::LWZ), - PPC::R12), - CSI[CSIIndex].getFrameIdx())); + PPC::R12), + CSI[CSIIndex].getFrameIdx())); RestoreOp = PPC::MTOCRF; MoveReg = PPC::R12; } - + if (CR2Spilled) MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR2) .addReg(MoveReg, getKillRegState(!CR3Spilled && !CR4Spilled))); @@ -1335,11 +1338,11 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MBB.erase(I); } -bool +bool PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const { // Currently, this function only handles SVR4 32- and 64-bit ABIs. // Return false otherwise to maintain pre-existing behavior. @@ -1387,20 +1390,20 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, // When we first encounter a non-CR register after seeing at // least one CR register, restore all spilled CRs together. if ((CR2Spilled || CR3Spilled || CR4Spilled) - && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) { + && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) { bool is31 = needsFP(*MF); restoreCRs(Subtarget.isPPC64(), is31, CR2Spilled, CR3Spilled, CR4Spilled, - MBB, I, CSI, CSIIndex); - CR2Spilled = CR3Spilled = CR4Spilled = false; + MBB, I, CSI, CSIIndex); + CR2Spilled = CR3Spilled = CR4Spilled = false; } // Default behavior for non-CR saves. const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), - RC, TRI); + RC, TRI); assert(I != MBB.begin() && - "loadRegFromStackSlot didn't insert any code!"); + "loadRegFromStackSlot didn't insert any code!"); } // Insert in reverse order. @@ -1409,16 +1412,15 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, else { I = BeforeI; ++I; - } + } } // If we haven't yet spilled the CRs, do so now. 
if (CR2Spilled || CR3Spilled || CR4Spilled) { - bool is31 = needsFP(*MF); + bool is31 = needsFP(*MF); restoreCRs(Subtarget.isPPC64(), is31, CR2Spilled, CR3Spilled, CR4Spilled, - MBB, I, CSI, CSIIndex); + MBB, I, CSI, CSIIndex); } return true; } - diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h index 7aab37e..94e9b67 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.h +++ b/lib/Target/PowerPC/PPCFrameLowering.h @@ -38,37 +38,37 @@ public: /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. - void emitPrologue(MachineFunction &MF) const; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + void emitPrologue(MachineFunction &MF) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - bool hasFP(const MachineFunction &MF) const; + bool hasFP(const MachineFunction &MF) const override; bool needsFP(const MachineFunction &MF) const; void replaceFPWithRealFP(MachineFunction &MF) const; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = NULL) const; + RegScavenger *RS = nullptr) const override; void processFunctionBeforeFrameFinalized(MachineFunction &MF, - RegScavenger *RS = NULL) const; + RegScavenger *RS = nullptr) const override; void addScavengingSpillSlot(MachineFunction &MF, RegScavenger *RS) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo *TRI) const override; void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const; + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const override; /// targetHandlesStackFrameRounding - Returns true if the target is /// responsible for rounding up the stack frame (probably at emitPrologue /// time). - bool targetHandlesStackFrameRounding() const { return true; } + bool targetHandlesStackFrameRounding() const override { return true; } /// getReturnSaveOffset - Return the previous frame offset to save the /// return address. @@ -141,7 +141,7 @@ public: // With the SVR4 ABI, callee-saved registers have fixed offsets on the stack. const SpillSlot * - getCalleeSavedSpillSlots(unsigned &NumEntries) const { + getCalleeSavedSpillSlots(unsigned &NumEntries) const override { if (Subtarget.isDarwinABI()) { NumEntries = 1; if (Subtarget.isPPC64()) { @@ -156,7 +156,7 @@ public: // Early exit if not using the SVR4 ABI. 
if (!Subtarget.isSVR4ABI()) { NumEntries = 0; - return 0; + return nullptr; } // Note that the offsets here overlap, but this is fixed up in diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp index 37c85b3..7ca706b 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp +++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "pre-RA-sched" #include "PPCHazardRecognizers.h" #include "PPC.h" #include "PPCInstrInfo.h" @@ -22,6 +21,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "pre-RA-sched" + bool PPCDispatchGroupSBHazardRecognizer::isLoadAfterStore(SUnit *SU) { // FIXME: Move this. if (isBCTRAfterSet(SU)) @@ -226,7 +227,7 @@ void PPCDispatchGroupSBHazardRecognizer::EmitNoop() { CurGroup.clear(); CurSlots = CurBranches = 0; } else { - CurGroup.push_back(0); + CurGroup.push_back(nullptr); ++CurSlots; } } diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h index 6b7fe41..cf4332c 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.h +++ b/lib/Target/PowerPC/PPCHazardRecognizers.h @@ -37,14 +37,14 @@ public: ScoreboardHazardRecognizer(ItinData, DAG_), DAG(DAG_), CurSlots(0), CurBranches(0) {} - virtual HazardType getHazardType(SUnit *SU, int Stalls); - virtual bool ShouldPreferAnother(SUnit* SU); - virtual unsigned PreEmitNoops(SUnit *SU); - virtual void EmitInstruction(SUnit *SU); - virtual void AdvanceCycle(); - virtual void RecedeCycle(); - virtual void Reset(); - virtual void EmitNoop(); + HazardType getHazardType(SUnit *SU, int Stalls) override; + bool ShouldPreferAnother(SUnit* SU) override; + unsigned PreEmitNoops(SUnit *SU) override; + void EmitInstruction(SUnit *SU) override; + void AdvanceCycle() override; + void RecedeCycle() override; + void Reset() override; + void EmitNoop() override; }; /// PPCHazardRecognizer970 - This class defines a finite state automata that @@ -76,10 +76,10 @@ class PPCHazardRecognizer970 : public ScheduleHazardRecognizer { public: PPCHazardRecognizer970(const TargetMachine &TM); - virtual HazardType getHazardType(SUnit *SU, int Stalls); - virtual void EmitInstruction(SUnit *SU); - virtual void AdvanceCycle(); - virtual void Reset(); + virtual HazardType getHazardType(SUnit *SU, int Stalls) override; + virtual void EmitInstruction(SUnit *SU) override; + virtual void AdvanceCycle() override; + virtual void Reset() override; private: /// EndDispatchGroup - Called when we are finishing a new dispatch group. diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 3bbc839..251e8b6 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "ppc-codegen" #include "PPC.h" #include "MCTargetDesc/PPCPredicates.h" #include "PPCTargetMachine.h" @@ -35,6 +34,8 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; +#define DEBUG_TYPE "ppc-codegen" + // FIXME: Remove this once the bug has been fixed! 
cl::opt ANDIGlueBug("expose-ppc-andi-glue-bug", cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden); @@ -50,29 +51,31 @@ namespace { /// class PPCDAGToDAGISel : public SelectionDAGISel { const PPCTargetMachine &TM; - const PPCTargetLowering &PPCLowering; - const PPCSubtarget &PPCSubTarget; + const PPCTargetLowering *PPCLowering; + const PPCSubtarget *PPCSubTarget; unsigned GlobalBaseReg; public: explicit PPCDAGToDAGISel(PPCTargetMachine &tm) : SelectionDAGISel(tm), TM(tm), - PPCLowering(*TM.getTargetLowering()), - PPCSubTarget(*TM.getSubtargetImpl()) { + PPCLowering(TM.getTargetLowering()), + PPCSubTarget(TM.getSubtargetImpl()) { initializePPCDAGToDAGISelPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { // Make sure we re-emit a set of the global base reg if necessary GlobalBaseReg = 0; + PPCLowering = TM.getTargetLowering(); + PPCSubTarget = TM.getSubtargetImpl(); SelectionDAGISel::runOnMachineFunction(MF); - if (!PPCSubTarget.isSVR4ABI()) + if (!PPCSubTarget->isSVR4ABI()) InsertVRSaveCode(MF); return true; } - virtual void PostprocessISelDAG(); + void PostprocessISelDAG() override; /// getI32Imm - Return a target constant with the specified value, of type /// i32. @@ -88,7 +91,7 @@ namespace { /// getSmallIPtrImm - Return a target constant of pointer type. inline SDValue getSmallIPtrImm(unsigned Imm) { - return CurDAG->getTargetConstant(Imm, PPCLowering.getPointerTy()); + return CurDAG->getTargetConstant(Imm, PPCLowering->getPointerTy()); } /// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s @@ -109,7 +112,7 @@ namespace { // Select - Convert the specified operand from a target-independent to a // target-specific node if it hasn't already been changed. - SDNode *Select(SDNode *N); + SDNode *Select(SDNode *N) override; SDNode *SelectBitfieldInsert(SDNode *N); @@ -121,7 +124,7 @@ namespace { /// a base register plus a signed 16-bit displacement [r+imm]. bool SelectAddrImm(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG, false); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false); } /// SelectAddrImmOffs - Return true if the operand is valid for a preinc @@ -141,20 +144,20 @@ namespace { /// represented as an indexed [r+r] operation. Returns false if it can /// be represented by [r+imm], which are preferred. bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) { - return PPCLowering.SelectAddressRegReg(N, Base, Index, *CurDAG); + return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG); } /// SelectAddrIdxOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) { - return PPCLowering.SelectAddressRegRegOnly(N, Base, Index, *CurDAG); + return PPCLowering->SelectAddressRegRegOnly(N, Base, Index, *CurDAG); } /// SelectAddrImmX4 - Returns true if the address N can be represented by /// a base register plus a signed 16-bit displacement that is a multiple of 4. /// Suitable for use by STD and friends. bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG, true); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true); } // Select an address into a single register. @@ -168,16 +171,16 @@ namespace { /// a register. 
The case of adding a (possibly relocatable) constant to a /// register can be improved, but it is wrong to substitute Reg+Reg for /// Reg in an asm, because the load or store opcode would have to change. - virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, - std::vector &OutOps) { + bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector &OutOps) override { OutOps.push_back(Op); return false; } void InsertVRSaveCode(MachineFunction &MF); - virtual const char *getPassName() const { + const char *getPassName() const override { return "PowerPC DAG->DAG Pattern Instruction Selection"; } @@ -188,7 +191,7 @@ private: SDNode *SelectSETCC(SDNode *N); void PeepholePPC64(); - void PeepholdCROps(); + void PeepholeCROps(); bool AllUsersSelectZero(SDNode *N); void SwapAllSelectUsers(SDNode *N); @@ -271,7 +274,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { MachineBasicBlock::iterator MBBI = FirstMBB.begin(); DebugLoc dl; - if (PPCLowering.getPointerTy() == MVT::i32) { + if (PPCLowering->getPointerTy() == MVT::i32) { GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::GPRC_NOR0RegClass); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR)); BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); @@ -282,7 +285,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { } } return CurDAG->getRegister(GlobalBaseReg, - PPCLowering.getPointerTy()).getNode(); + PPCLowering->getPointerTy()).getNode(); } /// isIntS16Immediate - This method tests to see if the node is either a 32-bit @@ -414,8 +417,8 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) { SDLoc dl(N); APInt LKZ, LKO, RKZ, RKO; - CurDAG->ComputeMaskedBits(Op0, LKZ, LKO); - CurDAG->ComputeMaskedBits(Op1, RKZ, RKO); + CurDAG->computeKnownBits(Op0, LKZ, LKO); + CurDAG->computeKnownBits(Op1, RKZ, RKO); unsigned TargetMask = LKZ.getZExtValue(); unsigned InsertMask = RKZ.getZExtValue(); @@ -458,11 +461,18 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) { SH = (Op1Opc == ISD::SHL) ? Value : 32 - Value; } if (Op1Opc == ISD::AND) { + // The AND mask might not be a constant, and we need to make sure that + // if we're going to fold the masking with the insert, all bits not + // know to be zero in the mask are known to be one. + APInt MKZ, MKO; + CurDAG->computeKnownBits(Op1.getOperand(1), MKZ, MKO); + bool CanFoldMask = InsertMask == MKO.getZExtValue(); + unsigned SHOpc = Op1.getOperand(0).getOpcode(); - if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) && + if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) && CanFoldMask && isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) { - // Note that Value must be in range here (less than 32) because - // otherwise there would not be any bits set in InsertMask. + // Note that Value must be in range here (less than 32) because + // otherwise there would not be any bits set in InsertMask. Op1 = Op1.getOperand(0).getOperand(0); SH = (SHOpc == ISD::SHL) ? Value : 32 - Value; } @@ -474,7 +484,7 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) { return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops); } } - return 0; + return nullptr; } /// SelectCC - Select a comparison of the specified values with the specified @@ -572,7 +582,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, Opc = PPC::FCMPUS; } else { assert(LHS.getValueType() == MVT::f64 && "Unknown vt!"); - Opc = PPCSubTarget.hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD; + Opc = PPCSubTarget->hasVSX() ? 
PPC::XSCMPUDP : PPC::FCMPUD; } return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0); } @@ -738,7 +748,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy(); bool isPPC64 = (PtrVT == MVT::i64); - if (!PPCSubTarget.useCRBits() && + if (!PPCSubTarget->useCRBits() && isInt32Immediate(N->getOperand(1), Imm)) { // We can codegen setcc op, imm very efficiently compared to a brcond. // Check for those cases here. @@ -750,7 +760,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { case ISD::SETEQ: { Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0); SDValue Ops[] = { Op, getI32Imm(27), getI32Imm(5), getI32Imm(31) }; - return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); } case ISD::SETNE: { if (isPPC64) break; @@ -762,14 +772,14 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { } case ISD::SETLT: { SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) }; - return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); } case ISD::SETGT: { SDValue T = SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0); T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0); SDValue Ops[] = { T, getI32Imm(1), getI32Imm(31), getI32Imm(31) }; - return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); } } } else if (Imm == ~0U) { // setcc op, -1 @@ -799,7 +809,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD, Op), 0); SDValue Ops[] = { AN, getI32Imm(1), getI32Imm(31), getI32Imm(31) }; - return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); } case ISD::SETGT: { SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) }; @@ -820,7 +830,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { if (LHS.getValueType().isVector()) { EVT VecVT = LHS.getValueType(); MVT::SimpleValueType VT = VecVT.getSimpleVT().SimpleTy; - unsigned int VCmpInst = getVCmpInst(VT, CC, PPCSubTarget.hasVSX()); + unsigned int VCmpInst = getVCmpInst(VT, CC, PPCSubTarget->hasVSX()); switch (CC) { case ISD::SETEQ: @@ -831,7 +841,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { case ISD::SETONE: case ISD::SETUNE: { SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0); - return CurDAG->SelectNodeTo(N, PPCSubTarget.hasVSX() ? PPC::XXLNOR : + return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR, VecVT, VCmp, VCmp); } @@ -853,9 +863,9 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS); } else { SDValue VCmpGT(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0); - unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget.hasVSX()); + unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget->hasVSX()); SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0); - return CurDAG->SelectNodeTo(N, PPCSubTarget.hasVSX() ? PPC::XXLOR : + return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? 
PPC::XXLOR : PPC::VOR, VecVT, VCmpGT, VCmpEQ); } @@ -864,9 +874,9 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { case ISD::SETOLE: case ISD::SETULE: { SDValue VCmpLE(CurDAG->getMachineNode(VCmpInst, dl, VecVT, RHS, LHS), 0); - unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget.hasVSX()); + unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget->hasVSX()); SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0); - return CurDAG->SelectNodeTo(N, PPCSubTarget.hasVSX() ? PPC::XXLOR : + return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLOR : PPC::VOR, VecVT, VCmpLE, VCmpEQ); } @@ -875,8 +885,8 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { } } - if (PPCSubTarget.useCRBits()) - return 0; + if (PPCSubTarget->useCRBits()) + return nullptr; bool Inv; unsigned Idx = getCRIdxForSetCC(CC, Inv); @@ -886,7 +896,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { // Force the ccreg into CR7. SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32); - SDValue InFlag(0, 0); // Null incoming flag value. + SDValue InFlag(nullptr, 0); // Null incoming flag value. CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg, InFlag).getValue(1); @@ -896,7 +906,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { SDValue Ops[] = { IntCR, getI32Imm((32-(3-Idx)) & 31), getI32Imm(31), getI32Imm(31) }; if (!Inv) - return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); // Get the specified bit. SDValue Tmp = @@ -911,7 +921,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDLoc dl(N); if (N->isMachineOpcode()) { N->setNodeId(-1); - return NULL; // Already selected. + return nullptr; // Already selected. } switch (N->getOpcode()) { @@ -1093,7 +1103,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Base = LD->getBasePtr(); SDValue Ops[] = { Offset, Base, Chain }; return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0), - PPCLowering.getPointerTy(), + PPCLowering->getPointerTy(), MVT::Other, Ops); } else { unsigned Opcode; @@ -1128,7 +1138,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Base = LD->getBasePtr(); SDValue Ops[] = { Base, Offset, Chain }; return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0), - PPCLowering.getPointerTy(), + PPCLowering->getPointerTy(), MVT::Other, Ops); } } @@ -1143,7 +1153,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) { SDValue Val = N->getOperand(0).getOperand(0); SDValue Ops[] = { Val, getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) }; - return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); } // If this is just a masked value where the input is not handled above, and // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm @@ -1152,7 +1162,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { N->getOperand(0).getOpcode() != ISD::ROTL) { SDValue Val = N->getOperand(0); SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB), getI32Imm(ME) }; - return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); } // If this is a 64-bit zero-extension mask, emit rldicl. 
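Throughout this file, SelectNodeTo calls drop their trailing element count (Ops, 4 becomes plain Ops) because the callee now takes an ArrayRef-style argument that deduces the length from the C array itself, so the count and the array can no longer drift apart. The mechanism, shown with a minimal ArrayRef look-alike:

    #include <cstddef>

    // Minimal look-alike of llvm::ArrayRef: a pointer plus a length, with
    // a constructor template that deduces N from a C array.
    template <typename T> class ArrayRefLite {
      const T *Data;
      std::size_t Length;
    public:
      template <std::size_t N>
      ArrayRefLite(const T (&Arr)[N]) : Data(Arr), Length(N) {}
      std::size_t size() const { return Length; }
      const T &operator[](std::size_t I) const { return Data[I]; }
    };

    static int selectNodeTo(ArrayRefLite<int> Ops) {
      int Sum = 0;
      for (std::size_t I = 0, E = Ops.size(); I != E; ++I)
        Sum += Ops[I];
      return Sum;
    }

    int main() {
      int Ops[] = {1, 2, 3, 4};
      // No trailing ", 4": the wrapper already knows the size, so passing
      // a stale or wrong count is no longer possible.
      return selectNodeTo(Ops) == 10 ? 0 : 1;
    }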
if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) && @@ -1174,12 +1184,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { } SDValue Ops[] = { Val, getI32Imm(SH), getI32Imm(MB) }; - return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops, 3); + return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops); } // AND X, 0 -> 0, not "rlwinm 32". if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) { ReplaceUses(SDValue(N, 0), N->getOperand(1)); - return NULL; + return nullptr; } // ISD::OR doesn't get all the bitfield insertion fun. // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) is a bitfield insert @@ -1212,7 +1222,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { isRotateAndMask(N, Imm, true, SH, MB, ME)) { SDValue Ops[] = { N->getOperand(0).getOperand(0), getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) }; - return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); } // Other cases are autogenerated. @@ -1224,7 +1234,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { isRotateAndMask(N, Imm, true, SH, MB, ME)) { SDValue Ops[] = { N->getOperand(0).getOperand(0), getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) }; - return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); } // Other cases are autogenerated. @@ -1259,7 +1269,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { bool isPPC64 = (PtrVT == MVT::i64); // If this is a select of i1 operands, we'll pattern match it. - if (PPCSubTarget.useCRBits() && + if (PPCSubTarget->useCRBits() && N->getOperand(0).getValueType() == MVT::i1) break; @@ -1327,17 +1337,17 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Ops[] = { CCReg, N->getOperand(2), N->getOperand(3), getI32Imm(BROpc) }; - return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops, 4); + return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops); } case ISD::VSELECT: - if (PPCSubTarget.hasVSX()) { + if (PPCSubTarget->hasVSX()) { SDValue Ops[] = { N->getOperand(2), N->getOperand(1), N->getOperand(0) }; - return CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops, 3); + return CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops); } break; case ISD::VECTOR_SHUFFLE: - if (PPCSubTarget.hasVSX() && (N->getValueType(0) == MVT::v2f64 || + if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 || N->getValueType(0) == MVT::v2i64)) { ShuffleVectorSDNode *SVN = cast(N); @@ -1364,23 +1374,23 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue Chain = LD->getChain(); SDValue Ops[] = { Base, Offset, Chain }; return CurDAG->SelectNodeTo(N, PPC::LXVDSX, - N->getValueType(0), Ops, 3); + N->getValueType(0), Ops); } } SDValue Ops[] = { Op1, Op2, DMV }; - return CurDAG->SelectNodeTo(N, PPC::XXPERMDI, N->getValueType(0), Ops, 3); + return CurDAG->SelectNodeTo(N, PPC::XXPERMDI, N->getValueType(0), Ops); } break; case PPCISD::BDNZ: case PPCISD::BDZ: { - bool IsPPC64 = PPCSubTarget.isPPC64(); + bool IsPPC64 = PPCSubTarget->isPPC64(); SDValue Ops[] = { N->getOperand(1), N->getOperand(0) }; return CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ ? (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : (IsPPC64 ? PPC::BDZ8 : PPC::BDZ), - MVT::Other, Ops, 2); + MVT::Other, Ops); } case PPCISD::COND_BRANCH: { // Op #0 is the Chain. 
@@ -1393,7 +1403,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { getI32Imm(cast(N->getOperand(1))->getZExtValue()); SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3), N->getOperand(0), N->getOperand(4) }; - return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 5); + return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops); } case ISD::BR_CC: { ISD::CondCode CC = cast(N->getOperand(1))->get(); @@ -1422,7 +1432,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl); SDValue Ops[] = { getI32Imm(PCC), CondCode, N->getOperand(4), N->getOperand(0) }; - return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 4); + return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops); } case ISD::BRIND: { // FIXME: Should custom lower this. @@ -1435,7 +1445,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain); } case PPCISD::TOC_ENTRY: { - assert (PPCSubTarget.isPPC64() && "Only supported for 64-bit ABI"); + assert (PPCSubTarget->isPPC64() && "Only supported for 64-bit ABI"); // For medium and large code model, we generate two instructions as // described below. Otherwise we allow SelectCodeCommon to handle this, @@ -1462,18 +1472,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { if (GlobalAddressSDNode *G = dyn_cast(GA)) { const GlobalValue *GValue = G->getGlobal(); - const GlobalAlias *GAlias = dyn_cast(GValue); - const GlobalValue *RealGValue = - GAlias ? GAlias->getAliasedGlobal() : GValue; - const GlobalVariable *GVar = dyn_cast(RealGValue); - assert((GVar || isa(RealGValue)) && - "Unexpected global value subclass!"); - - // An external variable is one without an initializer. For these, - // for variables with common linkage, and for Functions, generate - // the LDtocL form. - if (!GVar || !GVar->hasInitializer() || RealGValue->hasCommonLinkage() || - RealGValue->hasAvailableExternallyLinkage()) + if (GValue->isDeclaration() || GValue->hasCommonLinkage() || + GValue->hasAvailableExternallyLinkage()) return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA, SDValue(Tmp, 0)); } @@ -1566,7 +1566,7 @@ void PPCDAGToDAGISel::PostprocessISelDAG() { return; PeepholePPC64(); - PeepholdCROps(); + PeepholeCROps(); } // Check if all users of this node will become isel where the second operand @@ -1576,7 +1576,7 @@ void PPCDAGToDAGISel::PostprocessISelDAG() { // containing zero. bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) { // If we're not using isel, then this does not matter. - if (!PPCSubTarget.hasISEL()) + if (!PPCSubTarget->hasISEL()) return false; for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); @@ -1637,7 +1637,7 @@ void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) { } } -void PPCDAGToDAGISel::PeepholdCROps() { +void PPCDAGToDAGISel::PeepholeCROps() { bool IsModified; do { IsModified = false; @@ -2038,7 +2038,7 @@ void PPCDAGToDAGISel::PeepholdCROps() { void PPCDAGToDAGISel::PeepholePPC64() { // These optimizations are currently supported only for 64-bit SVR4. 
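The TOC_ENTRY lowering above, like the ADDIStocHA/LDtocL/ADDItocL cases earlier in the patch, leans on the PowerPC convention of splitting an offset into a high-adjusted (@ha) and a low (@l) half: @ha is biased by 0x8000 so that adding the sign-extended @l part to @ha shifted left by 16 reconstructs the original value exactly. A self-contained check of that arithmetic:

    #include <cstdint>
    #include <initializer_list>

    // High-adjusted (@ha) and low (@l) halves of a 32-bit offset, as used
    // by addis/addi (or addis/ld) pairs on PowerPC.
    static int32_t ha(int32_t X) { return (X + 0x8000) >> 16; }
    static int32_t lo(int32_t X) { return static_cast<int16_t>(X & 0xFFFF); }

    int main() {
      // For any offset, (ha(X) << 16) + lo(X) must equal X, even when the
      // low half turns negative after sign extension (e.g. X = 0x8000).
      for (int32_t X : {0, 1, 0x7FFF, 0x8000, 0x12345678, -4, -0x8000}) {
        if ((ha(X) << 16) + lo(X) != X)
          return 1;
      }
      return 0;
    }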
- if (PPCSubTarget.isDarwin() || !PPCSubTarget.isPPC64()) + if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64()) return; SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode()); @@ -2196,8 +2196,8 @@ FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) { static void initializePassOnce(PassRegistry &Registry) { const char *Name = "PowerPC DAG->DAG Pattern Instruction Selection"; - PassInfo *PI = new PassInfo(Name, "ppc-codegen", &SelectionDAGISel::ID, 0, - false, false); + PassInfo *PI = new PassInfo(Name, "ppc-codegen", &SelectionDAGISel::ID, + nullptr, false, false); Registry.registerPass(*PI, true); } diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 32ac1dc..cf4c9e6 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -18,6 +18,7 @@ #include "PPCTargetMachine.h" #include "PPCTargetObjectFile.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -459,6 +460,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); @@ -758,7 +760,7 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const { const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { - default: return 0; + default: return nullptr; case PPCISD::FSEL: return "PPCISD::FSEL"; case PPCISD::FCFID: return "PPCISD::FCFID"; case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ"; @@ -929,7 +931,7 @@ bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, /// amount, otherwise return -1. int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) { if (N->getValueType(0) != MVT::v16i8) - return false; + return -1; ShuffleVectorSDNode *SVOp = cast(N); @@ -1019,7 +1021,7 @@ unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize) { /// the constant being splatted. The ByteSize field indicates the number of /// bytes of each element [124] -> [bhw]. SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { - SDValue OpVal(0, 0); + SDValue OpVal(nullptr, 0); // If ByteSize of the splat is bigger than the element size of the // build_vector, then we have a case where we are checking for a splat where @@ -1038,7 +1040,7 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { if (!isa(N->getOperand(i))) return SDValue(); - if (UniquedVals[i&(Multiple-1)].getNode() == 0) + if (!UniquedVals[i&(Multiple-1)].getNode()) UniquedVals[i&(Multiple-1)] = N->getOperand(i); else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i)) return SDValue(); // no match. @@ -1053,21 +1055,21 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { bool LeadingZero = true; bool LeadingOnes = true; for (unsigned i = 0; i != Multiple-1; ++i) { - if (UniquedVals[i].getNode() == 0) continue; // Must have been undefs. + if (!UniquedVals[i].getNode()) continue; // Must have been undefs. LeadingZero &= cast(UniquedVals[i])->isNullValue(); LeadingOnes &= cast(UniquedVals[i])->isAllOnesValue(); } // Finally, check the least significant entry. 
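One genuine bug fix hides among the mechanical edits above: PPC::isVSLDOIShuffleMask returns an int shift amount with -1 as its failure sentinel, yet the early-out for a wrong vector type said return false, which converts to 0, a perfectly valid shift amount, so callers could mistake failure for success. The patch changes it to return -1. The trap in miniature:

    // Returns the shift amount, or -1 on failure.
    static int shiftAmountOrFail(bool TypeIsOK, int Amount) {
      if (!TypeIsOK)
        return false;  // BUG: 'false' converts to 0, a *valid* amount.
      return Amount;
    }

    static int shiftAmountOrFailFixed(bool TypeIsOK, int Amount) {
      if (!TypeIsOK)
        return -1;     // Fixed: an unambiguous failure sentinel.
      return Amount;
    }

    int main() {
      // Buggy version: failure is indistinguishable from "shift by 0".
      bool BugLooksValid = shiftAmountOrFail(false, 7) >= 0;
      bool FixReportsFailure = shiftAmountOrFailFixed(false, 7) < 0;
      return (BugLooksValid && FixReportsFailure) ? 0 : 1;
    }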
if (LeadingZero) { - if (UniquedVals[Multiple-1].getNode() == 0) + if (!UniquedVals[Multiple-1].getNode()) return DAG.getTargetConstant(0, MVT::i32); // 0,0,0,undef int Val = cast(UniquedVals[Multiple-1])->getZExtValue(); if (Val < 16) return DAG.getTargetConstant(Val, MVT::i32); // 0,0,0,4 -> vspltisw(4) } if (LeadingOnes) { - if (UniquedVals[Multiple-1].getNode() == 0) + if (!UniquedVals[Multiple-1].getNode()) return DAG.getTargetConstant(~0U, MVT::i32); // -1,-1,-1,undef int Val =cast(UniquedVals[Multiple-1])->getSExtValue(); if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2) @@ -1080,13 +1082,13 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { // Check to see if this buildvec has a single non-undef value in its elements. for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue; - if (OpVal.getNode() == 0) + if (!OpVal.getNode()) OpVal = N->getOperand(i); else if (OpVal != N->getOperand(i)) return SDValue(); } - if (OpVal.getNode() == 0) return SDValue(); // All UNDEF: use implicit def. + if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def. unsigned ValSizeInBytes = EltSize; uint64_t Value = 0; @@ -1135,7 +1137,7 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { /// sign extension from a 16-bit value. If so, this returns true and the /// immediate. static bool isIntS16Immediate(SDNode *N, short &Imm) { - if (N->getOpcode() != ISD::Constant) + if (!isa(N)) return false; Imm = (short)cast(N)->getZExtValue(); @@ -1174,12 +1176,12 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, // disjoint. APInt LHSKnownZero, LHSKnownOne; APInt RHSKnownZero, RHSKnownOne; - DAG.ComputeMaskedBits(N.getOperand(0), - LHSKnownZero, LHSKnownOne); + DAG.computeKnownBits(N.getOperand(0), + LHSKnownZero, LHSKnownOne); if (LHSKnownZero.getBoolValue()) { - DAG.ComputeMaskedBits(N.getOperand(1), - RHSKnownZero, RHSKnownOne); + DAG.computeKnownBits(N.getOperand(1), + RHSKnownZero, RHSKnownOne); // If all of the bits are known zero on the LHS or RHS, the add won't // carry. if (~(LHSKnownZero | RHSKnownZero) == 0) { @@ -1279,7 +1281,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. APInt LHSKnownZero, LHSKnownOne; - DAG.ComputeMaskedBits(N.getOperand(0), LHSKnownZero, LHSKnownOne); + DAG.computeKnownBits(N.getOperand(0), LHSKnownZero, LHSKnownOne); if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { // If all of the bits are known zero on the LHS or RHS, the add won't @@ -1439,7 +1441,8 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, /// GetLabelAccessInfo - Return true if we should reference labels using a /// PICBase, set the HiOpFlags and LoOpFlags to the target MO flags. 
static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags, - unsigned &LoOpFlags, const GlobalValue *GV = 0) { + unsigned &LoOpFlags, + const GlobalValue *GV = nullptr) { HiOpFlags = PPCII::MO_HA; LoOpFlags = PPCII::MO_LO; @@ -1885,17 +1888,12 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, Entry.Node = Nest; Args.push_back(Entry); // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) - TargetLowering::CallLoweringInfo CLI(Chain, - Type::getVoidTy(*DAG.getContext()), - false, false, false, false, 0, - CallingConv::C, - /*isTailCall=*/false, - /*doesNotRet=*/false, - /*isReturnValueUsed=*/true, - DAG.getExternalSymbol("__trampoline_setup", PtrVT), - Args, DAG, dl); - std::pair CallResult = LowerCallTo(CLI); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("__trampoline_setup", PtrVT), &Args, 0); + std::pair CallResult = LowerCallTo(CLI); return CallResult.second; } @@ -2016,7 +2014,7 @@ bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { - static const uint16_t ArgRegs[] = { + static const MCPhysReg ArgRegs[] = { PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; @@ -2043,7 +2041,7 @@ bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { - static const uint16_t ArgRegs[] = { + static const MCPhysReg ArgRegs[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8 }; @@ -2067,8 +2065,8 @@ bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, /// GetFPR - Get the set of FP registers that should be allocated for arguments, /// on Darwin. -static const uint16_t *GetFPR() { - static const uint16_t FPR[] = { +static const MCPhysReg *GetFPR() { + static const MCPhysReg FPR[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13 }; @@ -2265,13 +2263,13 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. if (isVarArg) { - static const uint16_t GPArgRegs[] = { + static const MCPhysReg GPArgRegs[] = { PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; const unsigned NumGPArgRegs = array_lengthof(GPArgRegs); - static const uint16_t FPArgRegs[] = { + static const MCPhysReg FPArgRegs[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8 }; @@ -2333,8 +2331,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( } if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, - MVT::Other, &MemOps[0], MemOps.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return Chain; } @@ -2405,18 +2402,18 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( // Area that is at least reserved in caller of this function. 
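The LowerINIT_TRAMPOLINE hunk above replaces a twelve-argument CallLoweringInfo constructor with chained setters. A minimal sketch of that builder style under invented names (CallInfoSketch is not the real class); each setter returns *this, so one call description reads as a single expression:

    #include <string>
    #include <utility>

    // Stand-in fields; the real object carries chains, types, and arg lists.
    struct CallInfoSketch {
      std::string DebugLoc, Chain, Callee;
      CallInfoSketch &setDebugLoc(std::string L) { DebugLoc = std::move(L); return *this; }
      CallInfoSketch &setChain(std::string C) { Chain = std::move(C); return *this; }
      CallInfoSketch &setCallee(std::string C) { Callee = std::move(C); return *this; }
    };

    int main() {
      CallInfoSketch CLI;
      CLI.setDebugLoc("dl").setChain("chain").setCallee("__trampoline_setup");
      return CLI.Callee.empty() ? 1 : 0;
    }

Unlike the old positional constructor, unset fields simply keep their defaults, and adding a new parameter later does not disturb existing call sites.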
unsigned MinReservedArea = ArgOffset; - static const uint16_t GPR[] = { + static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; - static const uint16_t *FPR = GetFPR(); + static const MCPhysReg *FPR = GetFPR(); - static const uint16_t VR[] = { + static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; - static const uint16_t VSRH[] = { + static const MCPhysReg VSRH[] = { PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8, PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13 }; @@ -2683,8 +2680,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( } if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, - MVT::Other, &MemOps[0], MemOps.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return Chain; } @@ -2714,18 +2710,18 @@ PPCTargetLowering::LowerFormalArguments_Darwin( // Area that is at least reserved in caller of this function. unsigned MinReservedArea = ArgOffset; - static const uint16_t GPR_32[] = { // 32-bit registers. + static const MCPhysReg GPR_32[] = { // 32-bit registers. PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; - static const uint16_t GPR_64[] = { // 64-bit registers. + static const MCPhysReg GPR_64[] = { // 64-bit registers. PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; - static const uint16_t *FPR = GetFPR(); + static const MCPhysReg *FPR = GetFPR(); - static const uint16_t VR[] = { + static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; @@ -2736,7 +2732,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - const uint16_t *GPR = isPPC64 ? GPR_64 : GPR_32; + const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; // In 32-bit non-varargs functions, the stack space for vectors is after the // stack space for non-vectors. We do not use this space unless we have @@ -3039,8 +3035,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( } if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, - MVT::Other, &MemOps[0], MemOps.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return Chain; } @@ -3174,12 +3169,12 @@ PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, /// 32-bit value is representable in the immediate field of a BxA instruction. static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) { ConstantSDNode *C = dyn_cast(Op); - if (!C) return 0; + if (!C) return nullptr; int Addr = C->getZExtValue(); if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero. SignExtend32<26>(Addr) != Addr) - return 0; // Top 6 bits have to be sext of immediate. + return nullptr; // Top 6 bits have to be sext of immediate. 
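On the uint16_t to MCPhysReg conversion running through the argument-register tables above: the storage width is unchanged, but the named alias documents that these arrays hold physical register numbers. A sketch with placeholder values rather than real PPC register encodings:

    #include <cstddef>
    #include <cstdint>

    // Same 16-bit width as before; the typedef carries the intent.
    typedef uint16_t MCPhysReg;

    // Stand-in for llvm::array_lengthof.
    template <typename T, size_t N>
    constexpr size_t array_lengthof(T (&)[N]) { return N; }

    // Placeholder register numbers, not actual PPC encodings.
    static const MCPhysReg GPR[] = { 3, 4, 5, 6, 7, 8, 9, 10 };

    int main() {
      return array_lengthof(GPR) == 8 ? 0 : 1;
    }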
return DAG.getConstant((int)C->getZExtValue() >> 2, DAG.getTargetLoweringInfo().getPointerTy()).getNode(); @@ -3315,8 +3310,8 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, SDLoc dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), - false, false, MachinePointerInfo(0), - MachinePointerInfo(0)); + false, false, MachinePointerInfo(), + MachinePointerInfo()); } /// LowerMemOpCallTo - Store the argument to the stack or remember it in case of @@ -3361,8 +3356,7 @@ void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain, StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments, MemOpChains2, dl); if (!MemOpChains2.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains2[0], MemOpChains2.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreFPAndRetAddr(DAG, MF, Chain, LROp, FPOp, SPDiff, @@ -3476,8 +3470,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, // Load the address of the function entry point from the function // descriptor. SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other, MVT::Glue); - SDValue LoadFuncPtr = DAG.getNode(PPCISD::LOAD, dl, VTs, MTCTROps, - InFlag.getNode() ? 3 : 2); + SDValue LoadFuncPtr = DAG.getNode(PPCISD::LOAD, dl, VTs, + makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2)); Chain = LoadFuncPtr.getValue(1); InFlag = LoadFuncPtr.getValue(2); @@ -3513,8 +3507,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, MTCTROps[2] = InFlag; } - Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, MTCTROps, - 2 + (InFlag.getNode() != 0)); + Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, + makeArrayRef(MTCTROps, InFlag.getNode() ? 
3 : 2)); InFlag = Chain.getValue(1); NodeTys.clear(); @@ -3522,7 +3516,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, NodeTys.push_back(MVT::Glue); Ops.push_back(Chain); CallOpc = PPCISD::BCTRL; - Callee.setNode(0); + Callee.setNode(nullptr); // Add use of X11 (holding environment pointer) if (isSVR4ABI && isPPC64) Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); @@ -3650,7 +3644,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, isa(Callee)) && "Expecting an global address, external symbol, absolute value or register"); - return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, &Ops[0], Ops.size()); + return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops); } // Add a NOP immediately after the branch instruction when using the 64-bit @@ -3683,7 +3677,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, } } - Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size()); + Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); if (needsTOCRestore) { @@ -3720,6 +3714,10 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, Ins, DAG); + if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall()) + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); + if (PPCSubTarget.isSVR4ABI()) { if (PPCSubTarget.isPPC64()) return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, @@ -3800,7 +3798,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, errs() << "Call operand #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << "\n"; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } } } else { @@ -3921,8 +3919,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, } if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. @@ -3940,7 +3937,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, SDValue Ops[] = { Chain, InFlag }; Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, - dl, VTs, Ops, InFlag.getNode() ? 2 : 1); + dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 
2 : 1)); InFlag = Chain.getValue(1); } @@ -4044,17 +4041,17 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true); unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - static const uint16_t GPR[] = { + static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; - static const uint16_t *FPR = GetFPR(); + static const MCPhysReg *FPR = GetFPR(); - static const uint16_t VR[] = { + static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; - static const uint16_t VSRH[] = { + static const MCPhysReg VSRH[] = { PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8, PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13 }; @@ -4333,8 +4330,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, } if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Check if this is an indirect call (MTCTR/BCTRL). // See PrepareCall() for more information about calls through function @@ -4448,17 +4444,17 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true); unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - static const uint16_t GPR_32[] = { // 32-bit registers. + static const MCPhysReg GPR_32[] = { // 32-bit registers. PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; - static const uint16_t GPR_64[] = { // 64-bit registers. + static const MCPhysReg GPR_64[] = { // 64-bit registers. PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; - static const uint16_t *FPR = GetFPR(); + static const MCPhysReg *FPR = GetFPR(); - static const uint16_t VR[] = { + static const MCPhysReg VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; @@ -4466,7 +4462,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, const unsigned NumFPRs = 13; const unsigned NumVRs = array_lengthof(VR); - const uint16_t *GPR = isPPC64 ? GPR_64 : GPR_32; + const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; SmallVector, 8> RegsToPass; SmallVector TailCallArguments; @@ -4696,8 +4692,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, } if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // On Darwin, R12 must contain the address of an indirect callee. This does // not mean the MTCTR instruction must use R12; it's easier to model this as @@ -4785,8 +4780,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, - &RetOps[0], RetOps.size()); + return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps); } SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, @@ -4889,7 +4883,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, // Build a DYNALLOC node. 
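The TokenFactor and RET_FLAG hunks above drop every "&Ops[0], Ops.size()" pair in favor of a single range argument. A rough sketch of why an ArrayRef-style parameter enables that; array_ref below is a hand-written stand-in for llvm::ArrayRef, not the real class:

    #include <cstddef>
    #include <vector>

    // A non-owning view that binds to arrays and vectors alike, so callers
    // never spell out (pointer, count) pairs by hand.
    template <typename T> struct array_ref {
      const T *Data;
      size_t Len;
      array_ref(const std::vector<T> &V) : Data(V.data()), Len(V.size()) {}
      template <size_t N> array_ref(const T (&A)[N]) : Data(A), Len(N) {}
      size_t size() const { return Len; }
    };

    static size_t countOps(array_ref<int> Ops) { return Ops.size(); }

    int main() {
      int Fixed[] = {1, 2, 3};               // fixed operand array
      std::vector<int> Dynamic{4, 5};        // dynamically built operand list
      return (countOps(Fixed) == 3 && countOps(Dynamic) == 2) ? 0 : 1;
    }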
SDValue Ops[3] = { Chain, NegSize, FPSIdx }; SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other); - return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops, 3); + return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops); } SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, @@ -4925,7 +4919,7 @@ SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD); SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { @@ -5097,8 +5091,7 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4); SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr }; Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, - DAG.getVTList(MVT::Other), Ops, array_lengthof(Ops), - MVT::i32, MMO); + DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO); } else Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI, false, false, 0); @@ -5225,7 +5218,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::LFIWZX : PPCISD::LFIWAX, dl, DAG.getVTList(MVT::f64, MVT::Other), - Ops, 2, MVT::i32, MMO); + Ops, MVT::i32, MMO); } else { assert(PPCSubTarget.isPPC64() && "i32->FP without LFIWAX supported only on PPC64"); @@ -5279,14 +5272,13 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); EVT VT = Op.getValueType(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - SDValue MFFSreg, InFlag; // Save FP Control Word to register EVT NodeTys[] = { MVT::f64, // return register MVT::Glue // unused in this context }; - SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, &InFlag, 0); + SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None); // Save FP register to stack slot int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); @@ -5345,7 +5337,7 @@ SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt); SDValue OutOps[] = { OutLo, OutHi }; - return DAG.getMergeValues(OutOps, 2, dl); + return DAG.getMergeValues(OutOps, dl); } SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { @@ -5374,7 +5366,7 @@ SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt); SDValue OutOps[] = { OutLo, OutHi }; - return DAG.getMergeValues(OutOps, 2, dl); + return DAG.getMergeValues(OutOps, dl); } SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { @@ -5403,7 +5395,7 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, AmtVT), Tmp4, Tmp6, ISD::SETLE); SDValue OutOps[] = { OutLo, OutHi }; - return DAG.getMergeValues(OutOps, 2, dl); + return DAG.getMergeValues(OutOps, dl); } //===----------------------------------------------------------------------===// @@ -5432,8 +5424,7 @@ static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT, SDValue Elt = DAG.getConstant(Val, MVT::i32); SmallVector Ops; Ops.assign(CanonicalVT.getVectorNumElements(), Elt); - SDValue Res = 
DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, - &Ops[0], Ops.size()); + SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, Ops); return DAG.getNode(ISD::BITCAST, dl, ReqVT, Res); } @@ -5492,7 +5483,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); - assert(BVN != 0 && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); + assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); // Check if this is a splat of a constant value. APInt APSplatBits, APSplatUndef; @@ -5540,10 +5531,14 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // we convert to a pseudo that will be expanded later into one of // the above forms. SDValue Elt = DAG.getConstant(SextVal, MVT::i32); - EVT VT = Op.getValueType(); - int Size = VT == MVT::v16i8 ? 1 : (VT == MVT::v8i16 ? 2 : 4); - SDValue EltSize = DAG.getConstant(Size, MVT::i32); - return DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); + EVT VT = (SplatSize == 1 ? MVT::v16i8 : + (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32)); + SDValue EltSize = DAG.getConstant(SplatSize, MVT::i32); + SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); + if (VT == Op.getValueType()) + return RetVal; + else + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal); } // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is @@ -5838,7 +5833,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, } SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, - &ResultMask[0], ResultMask.size()); + ResultMask); return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V1, V2, VPermMask); } @@ -5913,7 +5908,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(CompareOpc, MVT::i32) }; EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue }; - SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3); + SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); // Now that we have the comparison, emit a copy from the CR to a GPR. // This is flagged to the above dot comparison. @@ -7232,8 +7227,8 @@ static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base, return true; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - const GlobalValue *GV1 = NULL; - const GlobalValue *GV2 = NULL; + const GlobalValue *GV1 = nullptr; + const GlobalValue *GV2 = nullptr; int64_t Offset1 = 0; int64_t Offset2 = 0; bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1); @@ -7360,8 +7355,8 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, // that the high bits are equal. APInt Op1Zero, Op1One; APInt Op2Zero, Op2One; - DAG.ComputeMaskedBits(N->getOperand(0), Op1Zero, Op1One); - DAG.ComputeMaskedBits(N->getOperand(1), Op2Zero, Op2One); + DAG.computeKnownBits(N->getOperand(0), Op1Zero, Op1One); + DAG.computeKnownBits(N->getOperand(1), Op2Zero, Op2One); // We don't really care about what is known about the first bit (if // anything), so clear it in all masks prior to comparing them. @@ -7579,8 +7574,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]); DAG.ReplaceAllUsesOfValueWith(PromOp, - DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, - Ops.data(), Ops.size())); + DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops)); } // Now we're left with the initial truncation itself. 
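The LowerBUILD_VECTOR change above now derives the VADD_SPLAT pseudo's operand type from the splat element size (1, 2, or 4 bytes) and emits a bitcast only when that differs from the requested type. A sketch of the size-to-type mapping, with an illustrative enum standing in for MVT:

    // Illustrative stand-in for the MVT vector types involved.
    enum SimpleVT { v16i8, v8i16, v4i32 };

    static SimpleVT typeForSplatSize(unsigned SplatSize) {
      return SplatSize == 1 ? v16i8 : (SplatSize == 2 ? v8i16 : v4i32);
    }

    int main() {
      SimpleVT RequestedVT = v4i32;
      SimpleVT VT = typeForSplatSize(2);   // pseudo operates on v8i16
      bool NeedBitcast = (VT != RequestedVT);
      return NeedBitcast ? 0 : 1;          // here: bitcast v8i16 -> v4i32
    }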
@@ -7816,8 +7810,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, } DAG.ReplaceAllUsesOfValueWith(PromOp, - DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), - Ops.data(), Ops.size())); + DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops)); } // Now we're left with the initial extension itself. @@ -7883,7 +7876,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (N->getOperand(1).getOpcode() == ISD::FSQRT) { SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0), DCI); - if (RV.getNode() != 0) { + if (RV.getNode()) { DCI.AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, dl, N->getValueType(0), N->getOperand(0), RV); @@ -7893,7 +7886,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0), DCI); - if (RV.getNode() != 0) { + if (RV.getNode()) { DCI.AddToWorklist(RV.getNode()); RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N->getOperand(1)), N->getValueType(0), RV); @@ -7906,7 +7899,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0), DCI); - if (RV.getNode() != 0) { + if (RV.getNode()) { DCI.AddToWorklist(RV.getNode()); RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N->getOperand(1)), N->getValueType(0), RV, @@ -7918,7 +7911,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } SDValue RV = DAGCombineFastRecip(N->getOperand(1), DCI); - if (RV.getNode() != 0) { + if (RV.getNode()) { DCI.AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, dl, N->getValueType(0), N->getOperand(0), RV); @@ -7933,10 +7926,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the // reciprocal sqrt. SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(0), DCI); - if (RV.getNode() != 0) { + if (RV.getNode()) { DCI.AddToWorklist(RV.getNode()); RV = DAGCombineFastRecip(RV, DCI); - if (RV.getNode() != 0) { + if (RV.getNode()) { // Unfortunately, RV is now NaN if the input was exactly 0. Select out // this case and force the answer to 0. @@ -8014,7 +8007,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, }; Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, - DAG.getVTList(MVT::Other), Ops, array_lengthof(Ops), + DAG.getVTList(MVT::Other), Ops, cast(N)->getMemoryVT(), cast(N)->getMemOperand()); DCI.AddToWorklist(Val.getNode()); @@ -8041,8 +8034,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, }; return DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other), - Ops, array_lengthof(Ops), - cast(N)->getMemoryVT(), + Ops, cast(N)->getMemoryVT(), cast(N)->getMemOperand()); } break; @@ -8167,7 +8159,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, Ops.push_back(*O); } - DAG.UpdateNodeOperands(User, Ops.data(), Ops.size()); + DAG.UpdateNodeOperands(User, Ops); } return SDValue(N, 0); @@ -8220,7 +8212,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DAG.getMemIntrinsicNode(PPCISD::LBRX, dl, DAG.getVTList(N->getValueType(0) == MVT::i64 ? MVT::i64 : MVT::i32, MVT::Other), - Ops, 3, LD->getMemoryVT(), LD->getMemOperand()); + Ops, LD->getMemoryVT(), LD->getMemOperand()); // If this is an i16 load, insert the truncate. SDValue ResVal = BSLoad; @@ -8250,7 +8242,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, !N->getOperand(2).hasOneUse()) { // Scan all of the users of the LHS, looking for VCMPo's that match. 
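The FDIV/FSQRT combines above (DAGCombineFastRecip and DAGCombineFastRecipFSQRT) rest on hardware reciprocal estimates refined by Newton-Raphson iteration, r' = r * (2 - a * r) for r approximating 1/a. A scalar sketch of the refinement; the real combines build the same dataflow out of FMUL/FSUB nodes over SDValues:

    #include <cmath>
    #include <cstdio>

    // One Newton-Raphson step for r ~= 1/a; the relative error squares on
    // each step, so a crude estimate converges in a handful of iterations.
    static double refineRecip(double A, double R) { return R * (2.0 - A * R); }

    int main() {
      double A = 3.0, R = 0.3;             // crude initial estimate of 1/3
      for (int i = 0; i < 4; ++i)
        R = refineRecip(A, R);
      std::printf("%.12f\n", R);           // converges toward 0.333333333333
      return std::fabs(R * A - 1.0) < 1e-9 ? 0 : 1;
    }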
-  SDNode *VCMPoNode = 0;
+  SDNode *VCMPoNode = nullptr;

   SDNode *LHSN = N->getOperand(0).getNode();
   for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
@@ -8271,9 +8263,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       // Look at the (necessarily single) use of the flag value.  If it has a
       // chain, this transformation is more complex.  Note that multiple things
       // could use the value result, which we should ignore.
-      SDNode *FlagUser = 0;
+      SDNode *FlagUser = nullptr;
       for (SDNode::use_iterator UI = VCMPoNode->use_begin();
-           FlagUser == 0; ++UI) {
+           FlagUser == nullptr; ++UI) {
         assert(UI != VCMPoNode->use_end() && "Didn't find user!");
         SDNode *User = *UI;
         for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
@@ -8378,7 +8370,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       DAG.getConstant(CompareOpc, MVT::i32)
     };
     EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
-    SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3);
+    SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);

     // Unpack the result based on how the target uses it.
     PPC::Predicate CompOpc;
@@ -8414,11 +8406,11 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
 // Inline Assembly Support
 //===----------------------------------------------------------------------===//

-void PPCTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
-                                                       APInt &KnownZero,
-                                                       APInt &KnownOne,
-                                                       const SelectionDAG &DAG,
-                                                       unsigned Depth) const {
+void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                      APInt &KnownZero,
+                                                      APInt &KnownOne,
+                                                      const SelectionDAG &DAG,
+                                                      unsigned Depth) const {
   KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
   switch (Op.getOpcode()) {
   default: break;
@@ -8493,7 +8485,7 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
   Value *CallOperandVal = info.CallOperandVal;
   // If we don't have a value, we can't do a match,
   // but allow it at the lowest weight.
-  if (CallOperandVal == NULL)
+  if (!CallOperandVal)
     return CW_Default;
   Type *type = CallOperandVal->getType();

@@ -8599,7 +8591,7 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                      std::string &Constraint,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
-  SDValue Result(0,0);
+  SDValue Result;

   // Only support length 1 constraints.
   if (Constraint.length() > 1) return;
@@ -8766,6 +8758,30 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
   return FrameAddr;
 }

+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned PPCTargetLowering::getRegisterByName(const char* RegName,
+                                              EVT VT) const {
+  bool isPPC64 = PPCSubTarget.isPPC64();
+  bool isDarwinABI = PPCSubTarget.isDarwinABI();
+
+  if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) ||
+      (!isPPC64 && VT != MVT::i32))
+    report_fatal_error("Invalid register global variable type");
+
+  bool is64Bit = isPPC64 && VT == MVT::i64;
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                   .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
+                   .Case("r2", isDarwinABI ? 0 : (is64Bit ? PPC::X2 : PPC::R2))
+                   .Case("r13", (!isPPC64 && isDarwinABI) ? 0 :
+                                  (is64Bit ? PPC::X13 : PPC::R13))
+                   .Default(0);
+
+  if (Reg)
+    return Reg;
+  report_fatal_error("Invalid register name global variable");
+}
+
 bool
 PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // The PowerPC target isn't yet aware of offsets.
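The new getRegisterByName above leans on StringSwitch, a chain of Case calls that latches the first match and falls back to Default. A toy reimplementation to show the shape; StringSwitchSketch is not llvm::StringSwitch:

    #include <cassert>
    #include <cstring>

    // Each Case records its value only if the string matches and nothing
    // matched earlier; Default supplies the fallback.
    class StringSwitchSketch {
      const char *Str;
      unsigned Result = 0;
      bool Matched = false;
    public:
      explicit StringSwitchSketch(const char *S) : Str(S) {}
      StringSwitchSketch &Case(const char *S, unsigned V) {
        if (!Matched && std::strcmp(Str, S) == 0) { Result = V; Matched = true; }
        return *this;
      }
      unsigned Default(unsigned V) const { return Matched ? Result : V; }
    };

    int main() {
      unsigned Reg =
          StringSwitchSketch("r13").Case("r1", 1).Case("r13", 13).Default(0);
      assert(Reg == 13);
      return 0;
    }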
@@ -8795,6 +8811,42 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, } } +/// \brief Returns true if it is beneficial to convert a load of a constant +/// to just the constant itself. +bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0 || BitSize > 64) + return false; + return true; +} + +bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + return NumBits1 == 64 && NumBits2 == 32; +} + +bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { + if (!VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + return NumBits1 == 64 && NumBits2 == 32; +} + +bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const { + return isInt<16>(Imm) || isUInt<16>(Imm); +} + +bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const { + return isInt<16>(Imm) || isUInt<16>(Imm); +} + bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, unsigned, bool *Fast) const { diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index da6d4dc..080ef5d 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -351,20 +351,20 @@ namespace llvm { /// getTargetNodeName() - This method returns the name of a target specific /// DAG node. - virtual const char *getTargetNodeName(unsigned Opcode) const; + const char *getTargetNodeName(unsigned Opcode) const override; - virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; } + MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } /// getSetCCResultType - Return the ISD::SETCC ValueType - virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const; + EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; /// getPreIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if the node's address /// can be legally represented as pre-indexed load / store address. - virtual bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, - SDValue &Offset, - ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const; + bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; /// SelectAddressRegReg - Given the specified addressed, check to see if it /// can be represented as an indexed [r+r] operation. Returns false if it @@ -384,29 +384,31 @@ namespace llvm { bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG) const; - Sched::Preference getSchedulingPreference(SDNode *N) const; + Sched::Preference getSchedulingPreference(SDNode *N) const override; /// LowerOperation - Provide custom lowering hooks for some operations. /// - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; /// ReplaceNodeResults - Replace the results of node with an illegal result /// type with new values built out of custom code. 
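The isLegalICmpImmediate and isLegalAddImmediate hooks added above both test isInt<16>(Imm) || isUInt<16>(Imm), matching the 16-bit signed and unsigned immediate fields of the PPC compare and add instruction forms. A self-contained sketch of those range checks:

    #include <cstdint>

    // Stand-ins for llvm::isInt<N>/isUInt<N>: does X fit in an N-bit signed
    // (respectively unsigned) immediate field? Only N < 64 is handled here.
    template <unsigned N> constexpr bool isIntN(int64_t X) {
      return X >= -(INT64_C(1) << (N - 1)) && X < (INT64_C(1) << (N - 1));
    }
    template <unsigned N> constexpr bool isUIntN(int64_t X) {
      return X >= 0 && X < (INT64_C(1) << N);
    }

    static bool isLegalCmpImm(int64_t Imm) {
      return isIntN<16>(Imm) || isUIntN<16>(Imm);
    }

    int main() {
      return (isLegalCmpImm(-32768) && isLegalCmpImm(65535) &&
              !isLegalCmpImm(65536))
                 ? 0 : 1;
    }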
/// - virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, - SelectionDAG &DAG) const; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, + SelectionDAG &DAG) const override; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - virtual void computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth = 0) const; + unsigned getRegisterByName(const char* RegName, EVT VT) const override; - virtual MachineBasicBlock * + void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *MBB) const; + MachineBasicBlock *MBB) const override; MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *MBB, bool is64Bit, unsigned BinOpcode) const; @@ -420,34 +422,58 @@ namespace llvm { MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const; - ConstraintType getConstraintType(const std::string &Constraint) const; + ConstraintType + getConstraintType(const std::string &Constraint) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. ConstraintWeight getSingleConstraintMatchWeight( - AsmOperandInfo &info, const char *constraint) const; + AsmOperandInfo &info, const char *constraint) const override; std::pair getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const; + MVT VT) const override; /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. This is the actual /// alignment, not its logarithm. - unsigned getByValTypeAlignment(Type *Ty) const; + unsigned getByValTypeAlignment(Type *Ty) const override; /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. - virtual void LowerAsmOperandForConstraint(SDValue Op, - std::string &Constraint, - std::vector &Ops, - SelectionDAG &DAG) const; + void LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector &Ops, + SelectionDAG &DAG) const override; /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. - virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const; + bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; + + /// isLegalICmpImmediate - Return true if the specified immediate is legal + /// icmp immediate, that is the target has icmp instructions which can + /// compare a register against the immediate without having to materialize + /// the immediate into a register. + bool isLegalICmpImmediate(int64_t Imm) const override; + + /// isLegalAddImmediate - Return true if the specified immediate is legal + /// add immediate, that is the target has add instructions which can + /// add a register and the immediate without having to materialize + /// the immediate into a register. + bool isLegalAddImmediate(int64_t Imm) const override; - virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; + /// isTruncateFree - Return true if it's free to truncate a value of + /// type Ty1 to type Ty2. e.g. 
On PPC it's free to truncate a i64 value in + /// register X1 to i32 by referencing its sub-register R1. + bool isTruncateFree(Type *Ty1, Type *Ty2) const override; + bool isTruncateFree(EVT VT1, EVT VT2) const override; + + /// \brief Returns true if it is beneficial to convert a load of a constant + /// to just the constant itself. + bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; + + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; /// getOptimalMemOpType - Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove @@ -460,32 +486,32 @@ namespace llvm { /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. - virtual EVT + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const; + MachineFunction &MF) const override; /// Is unaligned memory access allowed for the given type, and is it fast /// relative to software emulation. - virtual bool allowsUnalignedMemoryAccesses(EVT VT, - unsigned AddrSpace, - bool *Fast = 0) const; + bool allowsUnalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + bool *Fast = nullptr) const override; /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be /// expanded to FMAs when this method returns true, otherwise fmuladd is /// expanded to fmul + fadd. - virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const; + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; // Should we expand the build vector with shuffles? - virtual bool + bool shouldExpandBuildVectorWithShuffles(EVT VT, - unsigned DefinedValues) const; + unsigned DefinedValues) const override; /// createFastISel - This method returns a target-specific FastISel object, /// or null if the target does not support "fast" instruction selection. 
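Most of the header churn in this file and the ones below trades a leading "virtual" for a trailing C++11 "override". The payoff, sketched with a toy hierarchy: if the base signature ever drifts, the derived declaration becomes a compile error instead of silently turning into an unrelated virtual:

    struct Base {
      virtual const char *name(unsigned Opcode) const { return "base"; }
      virtual ~Base() = default;
    };

    struct Derived : Base {
      const char *name(unsigned Opcode) const override { return "derived"; }
      // If Base ever changed to name(int), this definition would now fail
      // to compile instead of quietly becoming a new, unrelated virtual.
    };

    int main() {
      Derived D;
      Base &B = D;
      return B.name(0)[0] == 'd' ? 0 : 1;   // dispatches to Derived::name
    }

This is exactly the class of bug the rebase is guarding against: dozens of TargetLowering and TargetInstrInfo hooks changed signatures upstream, and override turns any missed update into a build failure.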
- virtual FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, - const TargetLibraryInfo *LibInfo) const; + FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) const override; private: SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const; @@ -559,29 +585,29 @@ namespace llvm { const SmallVectorImpl &Ins, SmallVectorImpl &InVals) const; - virtual SDValue + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const; + SmallVectorImpl &InVals) const override; - virtual SDValue + SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const; + SmallVectorImpl &InVals) const override; - virtual bool + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, - LLVMContext &Context) const; + LLVMContext &Context) const override; - virtual SDValue + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, - SDLoc dl, SelectionDAG &DAG) const; + SDLoc dl, SelectionDAG &DAG) const override; SDValue extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG, diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index 2fd4a3e..f3c2eab 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -223,7 +223,7 @@ class VX2_Int_Ty2 xo, string opc, Intrinsic IntID, ValueType OutTy, //===----------------------------------------------------------------------===// // Instruction Definitions. -def HasAltivec : Predicate<"PPCSubTarget.hasAltivec()">; +def HasAltivec : Predicate<"PPCSubTarget->hasAltivec()">; let Predicates = [HasAltivec] in { let isCodeGenOnly = 1 in { diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 939bbdc..fd72384 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -35,12 +35,14 @@ #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "ppc-instr-info" + #define GET_INSTRMAP_INFO #define GET_INSTRINFO_CTOR_DTOR #include "PPCGenInstrInfo.inc" -using namespace llvm; - static cl:: opt DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden, cl::desc("Disable analysis for CTR loops")); @@ -230,7 +232,7 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { // Cannot commute if it has a non-zero rotate count. if (MI->getOperand(3).getImm() != 0) - return 0; + return nullptr; // If we have a zero rotate count, we have: // M = mask(MB,ME) @@ -539,7 +541,7 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); // One-way branch. - if (FBB == 0) { + if (!FBB) { if (Cond.empty()) // Unconditional branch BuildMI(&MBB, DL, get(PPC::B)).addMBB(TBB); else if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8) @@ -1399,10 +1401,10 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr, // There are two possible candidates which can be changed to set CR[01]. // One is MI, the other is a SUB instruction. // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1). - MachineInstr *Sub = NULL; + MachineInstr *Sub = nullptr; if (SrcReg2 != 0) // MI is not a candidate for CMPrr. 
- MI = NULL; + MI = nullptr; // FIXME: Conservatively refuse to convert an instruction which isn't in the // same BB as the comparison. This is to allow the check below to avoid calls // (and other explicit clobbers); instead we should really check for these @@ -1810,10 +1812,15 @@ protected: } public: - virtual bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { + TM = static_cast(&MF.getTarget()); + // If we don't have VSX then go ahead and return without doing + // anything. + if (!TM->getSubtargetImpl()->hasVSX()) + return false; + LIS = &getAnalysis(); - TM = static_cast(&MF.getTarget()); TII = TM->getInstrInfo(); bool Changed = false; @@ -1830,7 +1837,7 @@ public: return Changed; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addPreserved(); AU.addRequired(); @@ -1962,8 +1969,11 @@ protected: } public: - virtual bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { TM = static_cast(&MF.getTarget()); + // If we don't have VSX on the subtarget, don't do anything. + if (!TM->getSubtargetImpl()->hasVSX()) + return false; TII = TM->getInstrInfo(); bool Changed = false; @@ -1977,7 +1987,7 @@ public: return Changed; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -2036,8 +2046,11 @@ protected: } public: - virtual bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { TM = static_cast(&MF.getTarget()); + // If we don't have VSX don't bother doing anything here. + if (!TM->getSubtargetImpl()->hasVSX()) + return false; TII = TM->getInstrInfo(); bool Changed = false; @@ -2051,7 +2064,7 @@ public: return Changed; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -2193,7 +2206,7 @@ protected: } public: - virtual bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { TM = static_cast(&MF.getTarget()); TII = TM->getInstrInfo(); @@ -2213,7 +2226,7 @@ public: return Changed; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); } }; diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index 3c8117c..d9db3e1 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -86,151 +86,148 @@ public: /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). 
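The runOnMachineFunction hunks above add an early subtarget query so the VSX-specific passes report "no change" and touch no analyses when the feature is absent. A sketch of that bail-out shape; the types are stand-ins for the machine-IR layer, not LLVM's:

    // Minimal stand-ins for the subtarget and machine function.
    struct SubtargetSketch {
      bool HasVSX;
      bool hasVSX() const { return HasVSX; }
    };
    struct MachineFunctionSketch { SubtargetSketch ST; };

    static bool runOnMachineFunction(MachineFunctionSketch &MF) {
      if (!MF.ST.hasVSX())
        return false;   // nothing to do; report the function unmodified
      // ... VSX copy/FMA rewriting would go here ...
      return true;
    }

    int main() {
      MachineFunctionSketch MF{{false}};
      return runOnMachineFunction(MF) ? 1 : 0;   // expect "unmodified"
    }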
/// - virtual const PPCRegisterInfo &getRegisterInfo() const { return RI; } + const PPCRegisterInfo &getRegisterInfo() const { return RI; } ScheduleHazardRecognizer * CreateTargetHazardRecognizer(const TargetMachine *TM, - const ScheduleDAG *DAG) const; + const ScheduleDAG *DAG) const override; ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, - const ScheduleDAG *DAG) const; + const ScheduleDAG *DAG) const override; - virtual int getOperandLatency(const InstrItineraryData *ItinData, const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx) const; - virtual + const MachineInstr *UseMI, + unsigned UseIdx) const override; int getOperandLatency(const InstrItineraryData *ItinData, SDNode *DefNode, unsigned DefIdx, - SDNode *UseNode, unsigned UseIdx) const { + SDNode *UseNode, unsigned UseIdx) const override { return PPCGenInstrInfo::getOperandLatency(ItinData, DefNode, DefIdx, UseNode, UseIdx); } bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, - unsigned &SubIdx) const; + unsigned &SubIdx) const override; unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const; + int &FrameIndex) const override; unsigned isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const; + int &FrameIndex) const override; // commuteInstruction - We can commute rlwimi instructions, but only if the // rotate amt is zero. We also have to munge the immediates a bit. - virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const; + MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override; - virtual bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const; + bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const override; - virtual void insertNoop(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const; + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; // Branch analysis. - virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify) const; - virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; - virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl &Cond, - DebugLoc DL) const; + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl &Cond, + DebugLoc DL) const override; // Select analysis. 
- virtual bool canInsertSelect(const MachineBasicBlock&, - const SmallVectorImpl &Cond, - unsigned, unsigned, int&, int&, int&) const; - virtual void insertSelect(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DstReg, - const SmallVectorImpl &Cond, - unsigned TrueReg, unsigned FalseReg) const; - - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const; - - virtual void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; - - virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; - - virtual - bool ReverseBranchCondition(SmallVectorImpl &Cond) const; - - virtual bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, - unsigned Reg, MachineRegisterInfo *MRI) const; + bool canInsertSelect(const MachineBasicBlock&, + const SmallVectorImpl &Cond, + unsigned, unsigned, int&, int&, int&) const override; + void insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DstReg, + const SmallVectorImpl &Cond, + unsigned TrueReg, unsigned FalseReg) const override; + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + bool + ReverseBranchCondition(SmallVectorImpl &Cond) const override; + + bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const override; // If conversion by predication (only supported by some branch instructions). // All of the profitability checks always return true; it is always // profitable to use the predicated branches. 
- virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const { + bool isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCycles, unsigned ExtraPredCycles, + const BranchProbability &Probability) const override { return true; } - virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB, - unsigned NumT, unsigned ExtraT, - MachineBasicBlock &FMBB, - unsigned NumF, unsigned ExtraF, - const BranchProbability &Probability) const; + bool isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumT, unsigned ExtraT, + MachineBasicBlock &FMBB, + unsigned NumF, unsigned ExtraF, + const BranchProbability &Probability) const override; - virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, - unsigned NumCycles, - const BranchProbability - &Probability) const { + bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumCycles, + const BranchProbability + &Probability) const override { return true; } - virtual bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, - MachineBasicBlock &FMBB) const { + bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const override { return false; } // Predication support. - bool isPredicated(const MachineInstr *MI) const; + bool isPredicated(const MachineInstr *MI) const override; - virtual bool isUnpredicatedTerminator(const MachineInstr *MI) const; + bool isUnpredicatedTerminator(const MachineInstr *MI) const override; - virtual bool PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl &Pred) const; + const SmallVectorImpl &Pred) const override; - virtual bool SubsumesPredicate(const SmallVectorImpl &Pred1, - const SmallVectorImpl &Pred2) const; + const SmallVectorImpl &Pred2) const override; - virtual bool DefinesPredicate(MachineInstr *MI, - std::vector &Pred) const; + bool DefinesPredicate(MachineInstr *MI, + std::vector &Pred) const override; - virtual bool isPredicable(MachineInstr *MI) const; + bool isPredicable(MachineInstr *MI) const override; // Comparison optimization. - virtual bool analyzeCompare(const MachineInstr *MI, - unsigned &SrcReg, unsigned &SrcReg2, - int &Mask, int &Value) const; + bool analyzeCompare(const MachineInstr *MI, + unsigned &SrcReg, unsigned &SrcReg2, + int &Mask, int &Value) const override; - virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, - unsigned SrcReg, unsigned SrcReg2, - int Mask, int Value, - const MachineRegisterInfo *MRI) const; + bool optimizeCompareInstr(MachineInstr *CmpInstr, + unsigned SrcReg, unsigned SrcReg2, + int Mask, int Value, + const MachineRegisterInfo *MRI) const override; /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. /// - virtual unsigned GetInstSizeInBytes(const MachineInstr *MI) const; + unsigned GetInstSizeInBytes(const MachineInstr *MI) const; }; } diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 1d984ab..e421f8e 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -610,10 +610,10 @@ def iaddroff : ComplexPattern; //===----------------------------------------------------------------------===// // PowerPC Instruction Predicate Definitions. 
-def In32BitMode : Predicate<"!PPCSubTarget.isPPC64()">; -def In64BitMode : Predicate<"PPCSubTarget.isPPC64()">; -def IsBookE : Predicate<"PPCSubTarget.isBookE()">; -def IsNotBookE : Predicate<"!PPCSubTarget.isBookE()">; +def In32BitMode : Predicate<"!PPCSubTarget->isPPC64()">; +def In64BitMode : Predicate<"PPCSubTarget->isPPC64()">; +def IsBookE : Predicate<"PPCSubTarget->isBookE()">; +def IsNotBookE : Predicate<"!PPCSubTarget->isBookE()">; //===----------------------------------------------------------------------===// // PowerPC Multiclass Definitions. diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 9cc919e..49bcc48 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -39,7 +39,7 @@ multiclass XX3Form_Rcr opcode, bits<7> xo, dag OOL, dag IOL, } } -def HasVSX : Predicate<"PPCSubTarget.hasVSX()">; +def HasVSX : Predicate<"PPCSubTarget->hasVSX()">; let Predicates = [HasVSX] in { let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. let neverHasSideEffects = 1 in { // VSX instructions don't have side effects. diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp index 227919c..7bbc71b 100644 --- a/lib/Target/PowerPC/PPCJITInfo.cpp +++ b/lib/Target/PowerPC/PPCJITInfo.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "jit" #include "PPCJITInfo.h" #include "PPCRelocations.h" #include "PPCTargetMachine.h" @@ -22,6 +21,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "jit" + static TargetJITInfo::JITCompilerFn JITCompilerFunction; #define BUILD_ADDIS(RD,RS,IMM16) \ diff --git a/lib/Target/PowerPC/PPCJITInfo.h b/lib/Target/PowerPC/PPCJITInfo.h index 46d4a08..0693e3e 100644 --- a/lib/Target/PowerPC/PPCJITInfo.h +++ b/lib/Target/PowerPC/PPCJITInfo.h @@ -30,19 +30,19 @@ namespace llvm { is64Bit = tmIs64Bit; } - virtual StubLayout getStubLayout(); - virtual void *emitFunctionStub(const Function* F, void *Fn, - JITCodeEmitter &JCE); - virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn); - virtual void relocate(void *Function, MachineRelocation *MR, - unsigned NumRelocs, unsigned char* GOTBase); - + StubLayout getStubLayout() override; + void *emitFunctionStub(const Function* F, void *Fn, + JITCodeEmitter &JCE) override; + LazyResolverFn getLazyResolverFunction(JITCompilerFn) override; + void relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase) override; + /// replaceMachineCodeForFunction - Make it so that calling the function /// whose machine code is at OLD turns into a call to NEW, perhaps by /// overwriting OLD with a branch to NEW. This is used for self-modifying /// code. /// - virtual void replaceMachineCodeForFunction(void *Old, void *New); + void replaceMachineCodeForFunction(void *Old, void *New) override; }; } diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp index 029bb8a..f8e84a5 100644 --- a/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -96,7 +96,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){ (MO.getTargetFlags() & PPCII::MO_NLP_HIDDEN_FLAG) ? 
MachO.getHiddenGVStubEntry(Sym) : MachO.getGVStubEntry(Sym); - if (StubSym.getPointer() == 0) { + if (!StubSym.getPointer()) { assert(MO.isGlobal() && "Extern symbol not handled yet"); StubSym = MachineModuleInfoImpl:: StubValueTy(AP.getSymbol(MO.getGlobal()), diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 4ff282e..e333b51 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "reginfo" #include "PPCRegisterInfo.h" #include "PPC.h" #include "PPCFrameLowering.h" @@ -42,11 +41,13 @@ #include "llvm/Target/TargetOptions.h" #include +using namespace llvm; + +#define DEBUG_TYPE "reginfo" + #define GET_REGINFO_TARGET_DESC #include "PPCGenRegisterInfo.inc" -using namespace llvm; - static cl::opt EnableBasePointer("ppc-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); @@ -96,7 +97,7 @@ PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) return &PPC::GPRCRegClass; } -const uint16_t* +const MCPhysReg* PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (Subtarget.isDarwinABI()) return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ? diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index c3e54b4..13a35f6 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -34,36 +34,37 @@ public: /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. - virtual const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const; + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const override; unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const; + MachineFunction &MF) const override; const TargetRegisterClass* - getLargestLegalSuperClass(const TargetRegisterClass *RC) const; + getLargestLegalSuperClass(const TargetRegisterClass *RC) const override; /// Code Generation virtual methods... - const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const; - const uint32_t *getCallPreservedMask(CallingConv::ID CC) const; + const MCPhysReg * + getCalleeSavedRegs(const MachineFunction* MF =nullptr) const override; + const uint32_t *getCallPreservedMask(CallingConv::ID CC) const override; const uint32_t *getNoPreservedMask() const; - BitVector getReservedRegs(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; /// We require the register scavenger. 
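The PPCJITInfo.cpp and PPCRegisterInfo.cpp hunks above move the DEBUG_TYPE definition below the #include block, so no header is preprocessed with a stray per-file DEBUG_TYPE in scope. A stand-alone analogy of the ordering; this DEBUG macro is a simplified mock of LLVM's, not the real one:

    #include <cstdio>   // headers come first, unaffected by the macro below

    #define DEBUG_TYPE "jit"

    #ifndef NDEBUG
    #define DEBUG(X) do { std::fprintf(stderr, "[%s] ", DEBUG_TYPE); X; } while (0)
    #else
    #define DEBUG(X) do { } while (0)
    #endif

    int main() {
      DEBUG(std::fprintf(stderr, "resolving stub\n"));
      return 0;
    }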
- bool requiresRegisterScavenging(const MachineFunction &MF) const { + bool requiresRegisterScavenging(const MachineFunction &MF) const override { return true; } - bool requiresFrameIndexScavenging(const MachineFunction &MF) const { + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override { return true; } - bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override { return true; } - virtual bool requiresVirtualBaseRegisters(const MachineFunction &MF) const { + bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override { return true; } @@ -82,28 +83,29 @@ public: unsigned FrameIndex) const; bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, - int &FrameIdx) const; + int &FrameIdx) const override; void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS = NULL) const; + RegScavenger *RS = nullptr) const override; // Support for virtual base registers. - bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const; + bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg, int FrameIdx, - int64_t Offset) const; + int64_t Offset) const override; void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, - int64_t Offset) const; - bool isFrameOffsetLegal(const MachineInstr *MI, int64_t Offset) const; + int64_t Offset) const override; + bool isFrameOffsetLegal(const MachineInstr *MI, + int64_t Offset) const override; // Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const; + unsigned getFrameRegister(const MachineFunction &MF) const override; // Base pointer (stack realignment) support. unsigned getBaseRegister(const MachineFunction &MF) const; bool hasBasePointer(const MachineFunction &MF) const; bool canRealignStack(const MachineFunction &MF) const; - bool needsStackRealignment(const MachineFunction &MF) const; + bool needsStackRealignment(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td index e11f7d4..b3d145b 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/lib/Target/PowerPC/PPCRegisterInfo.td @@ -188,6 +188,13 @@ def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74, 74]>; def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>; } +// The full condition-code register. This is not modeled fully, but defined +// here primarily, for compatibility with gcc, to allow the inline asm "cc" +// clobber specification to work. 
+def CC : PPCReg<"cc">, DwarfRegAlias<CR0> { + let Aliases = [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7]; +} + // Link register def LR : SPR<8, "lr">, DwarfRegNum<[-2, 65]>; //let Aliases = [LR] in @@ -300,3 +307,8 @@ def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>; def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> { let CopyCost = -1; } + +def CCRC : RegisterClass<"PPC", [i32], 32, (add CC)> { + let isAllocatable = 0; +} + diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp index d4258b4..f742f72 100644 --- a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp +++ b/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "powerpc-selectiondag-info" #include "PPCTargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "powerpc-selectiondag-info" + PPCSelectionDAGInfo::PPCSelectionDAGInfo(const PPCTargetMachine &TM) : TargetSelectionDAGInfo(TM) { } diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index b07abe4..ea9daee 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -24,31 +24,21 @@ #include "llvm/Target/TargetMachine.h" #include +using namespace llvm; + +#define DEBUG_TYPE "ppc-subtarget" + #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "PPCGenSubtargetInfo.inc" -using namespace llvm; - PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, const std::string &FS, bool is64Bit, CodeGenOpt::Level OptLevel) - : PPCGenSubtargetInfo(TT, CPU, FS) , IsPPC64(is64Bit) , TargetTriple(TT) { + : PPCGenSubtargetInfo(TT, CPU, FS), IsPPC64(is64Bit), TargetTriple(TT), + OptLevel(OptLevel) { initializeEnvironment(); - - std::string FullFS = FS; - - // At -O2 and above, track CR bits as individual registers. - if (OptLevel >= CodeGenOpt::Default) { - if (!FullFS.empty()) - FullFS = "+crbits," + FullFS; - else - FullFS = "+crbits"; - } - - resetSubtargetFeatures(CPU, FullFS); + resetSubtargetFeatures(CPU, FS); } /// SetJITMode - This is called to inform the subtarget info that we are @@ -138,6 +128,14 @@ void PPCSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { FullFS = "+64bit"; } + // At -O2 and above, track CR bits as individual registers. + if (OptLevel >= CodeGenOpt::Default) { + if (!FullFS.empty()) + FullFS = "+crbits," + FullFS; + else + FullFS = "+crbits"; + } + // Parse features string. ParseSubtargetFeatures(CPUName, FullFS); diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index 87e012e..ee43fd5 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -99,6 +99,9 @@ protected: /// TargetTriple - What processor and OS we're targeting. Triple TargetTriple; + /// OptLevel - What default optimization level we're emitting code for. + CodeGenOpt::Level OptLevel; + public: /// This constructor initializes the data members to match that /// of the specified triple. @@ -129,7 +132,7 @@ public: const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } /// \brief Reset the features for the PowerPC target. 
- virtual void resetSubtargetFeatures(const MachineFunction *MF); + void resetSubtargetFeatures(const MachineFunction *MF) override; private: void initializeEnvironment(); void resetSubtargetFeatures(StringRef CPU, StringRef FS); @@ -200,15 +203,17 @@ public: /// enablePostRAScheduler - True at 'More' optimization. bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const; + RegClassVector& CriticalPathRCs) const override; + + bool enableEarlyIfConversion() const override { return hasISEL(); } // Scheduling customization. - bool enableMachineScheduler() const; + bool enableMachineScheduler() const override; void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, MachineInstr *end, - unsigned NumRegionInstrs) const; - bool useAA() const; + unsigned NumRegionInstrs) const override; + bool useAA() const override; }; } // End llvm namespace diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index e7438f3..2323add 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -127,12 +127,12 @@ public: return *getPPCTargetMachine().getSubtargetImpl(); } - virtual bool addPreISel(); - virtual bool addILPOpts(); - virtual bool addInstSelector(); - virtual bool addPreRegAlloc(); - virtual bool addPreSched2(); - virtual bool addPreEmitPass(); + bool addPreISel() override; + bool addILPOpts() override; + bool addInstSelector() override; + bool addPreRegAlloc() override; + bool addPreSched2() override; + bool addPreEmitPass() override; }; } // namespace @@ -148,12 +148,8 @@ bool PPCPassConfig::addPreISel() { } bool PPCPassConfig::addILPOpts() { - if (getPPCSubtarget().hasISEL()) { - addPass(&EarlyIfConverterID); - return true; - } - - return false; + addPass(&EarlyIfConverterID); + return true; } bool PPCPassConfig::addInstSelector() { @@ -165,25 +161,19 @@ bool PPCPassConfig::addInstSelector() { addPass(createPPCCTRLoopsVerify()); #endif - if (getPPCSubtarget().hasVSX()) - addPass(createPPCVSXCopyPass()); - + addPass(createPPCVSXCopyPass()); return false; } bool PPCPassConfig::addPreRegAlloc() { - if (getPPCSubtarget().hasVSX()) { - initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry()); - insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID, - &PPCVSXFMAMutateID); - } - + initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry()); + insertPass(VSXFMAMutateEarly ? 
&RegisterCoalescerID : &MachineSchedulerID, + &PPCVSXFMAMutateID); return false; } bool PPCPassConfig::addPreSched2() { - if (getPPCSubtarget().hasVSX()) - addPass(createPPCVSXCopyCleanupPass()); + addPass(createPPCVSXCopyCleanupPass()); if (getOptLevel() != CodeGenOpt::None) addPass(&IfConverterID); diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 606ccb3..9e92494 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -43,34 +43,34 @@ public: Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool is64Bit); - virtual const PPCInstrInfo *getInstrInfo() const { return &InstrInfo; } - virtual const PPCFrameLowering *getFrameLowering() const { + const PPCInstrInfo *getInstrInfo() const override { return &InstrInfo; } + const PPCFrameLowering *getFrameLowering() const override { return &FrameLowering; } - virtual PPCJITInfo *getJITInfo() { return &JITInfo; } - virtual const PPCTargetLowering *getTargetLowering() const { + PPCJITInfo *getJITInfo() override { return &JITInfo; } + const PPCTargetLowering *getTargetLowering() const override { return &TLInfo; } - virtual const PPCSelectionDAGInfo* getSelectionDAGInfo() const { + const PPCSelectionDAGInfo* getSelectionDAGInfo() const override { return &TSInfo; } - virtual const PPCRegisterInfo *getRegisterInfo() const { + const PPCRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); } - virtual const DataLayout *getDataLayout() const { return &DL; } - virtual const PPCSubtarget *getSubtargetImpl() const { return &Subtarget; } - virtual const InstrItineraryData *getInstrItineraryData() const { + const DataLayout *getDataLayout() const override { return &DL; } + const PPCSubtarget *getSubtargetImpl() const override { return &Subtarget; } + const InstrItineraryData *getInstrItineraryData() const override { return &InstrItins; } // Pass Pipeline Configuration - virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); - virtual bool addCodeEmitter(PassManagerBase &PM, - JITCodeEmitter &JCE); + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + bool addCodeEmitter(PassManagerBase &PM, + JITCodeEmitter &JCE) override; /// \brief Register PPC analysis passes with a pass manager. - virtual void addAnalysisPasses(PassManagerBase &PM); + void addAnalysisPasses(PassManagerBase &PM) override; }; /// PPC32TargetMachine - PowerPC 32-bit target machine. 
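The addILPOpts/addPreRegAlloc/addPreSched2 hunks above drop their hasISEL()/hasVSX() guards: early if-conversion is now gated by the enableEarlyIfConversion() hook added in PPCSubtarget.h, and the VSX passes are scheduled unconditionally, presumably bailing out per function when VSX is unavailable (an inference from the shape of the change, not something shown in this excerpt). A minimal sketch of that self-gating pattern, with hypothetical names against the 3.5-era getSubtarget<> API:

#include "PPCSubtarget.h"  // assumed available in-tree
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

namespace {
// Hypothetical, simplified pass: always added to the pipeline, decides
// per machine function whether it has anything to do.
class VSXCopySketch : public llvm::MachineFunctionPass {
public:
  static char ID;
  VSXCopySketch() : llvm::MachineFunctionPass(ID) {}

  bool runOnMachineFunction(llvm::MachineFunction &MF) override {
    // The feature check moves from pipeline construction to pass execution.
    const llvm::PPCSubtarget &ST =
        MF.getTarget().getSubtarget<llvm::PPCSubtarget>();
    if (!ST.hasVSX())
      return false; // nothing to do for this function
    // ... rewrite copies between VSX and non-VSX register classes ...
    return true;
  }
};
char VSXCopySketch::ID = 0;
} // namespace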
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 2f4d5c1..007901b 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -14,17 +14,22 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "ppctti" #include "PPC.h" #include "PPCTargetMachine.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" using namespace llvm; +#define DEBUG_TYPE "ppctti" + +static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting", +cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden); + // Declare the pass initialization routine locally as target-specific passes -// don't havve a target-wide initialization entry point, and so we rely on the +// don't have a target-wide initialization entry point, and so we rely on the // pass constructor initialization. namespace llvm { void initializePPCTTIPass(PassRegistry &); @@ -33,21 +38,16 @@ void initializePPCTTIPass(PassRegistry &); namespace { class PPCTTI final : public ImmutablePass, public TargetTransformInfo { - const PPCTargetMachine *TM; const PPCSubtarget *ST; const PPCTargetLowering *TLI; - /// Estimate the overhead of scalarizing an instruction. Insert and Extract - /// are set if the result needs to be inserted and/or extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; - public: - PPCTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { + PPCTTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) { llvm_unreachable("This pass cannot be directly constructed"); } PPCTTI(const PPCTargetMachine *TM) - : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), + : ImmutablePass(ID), ST(TM->getSubtargetImpl()), TLI(TM->getTargetLowering()) { initializePPCTTIPass(*PassRegistry::getPassRegistry()); } @@ -72,6 +72,13 @@ public: /// \name Scalar TTI Implementations /// @{ + unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; + + unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) const override; + unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) const override; + virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override; virtual void getUnrollingPreferences( @@ -128,6 +135,142 @@ PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const { return PSK_Software; } +unsigned PPCTTI::getIntImmCost(const APInt &Imm, Type *Ty) const { + if (DisablePPCConstHoist) + return TargetTransformInfo::getIntImmCost(Imm, Ty); + + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return ~0U; + + if (Imm == 0) + return TCC_Free; + + if (Imm.getBitWidth() <= 64) { + if (isInt<16>(Imm.getSExtValue())) + return TCC_Basic; + + if (isInt<32>(Imm.getSExtValue())) { + // A constant that can be materialized using lis. 
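+ // (Illustrative, assumed values: 0x00120000 has a clear low halfword,
+ // so a single "lis" builds it and the cost below is TCC_Basic, while
+ // 0x00123456 also needs an "ori" for the low bits and so falls through
+ // to 2 * TCC_Basic.)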
+ if ((Imm.getZExtValue() & 0xFFFF) == 0) + return TCC_Basic; + + return 2 * TCC_Basic; + } + } + + return 4 * TCC_Basic; +} + +unsigned PPCTTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) const { + if (DisablePPCConstHoist) + return TargetTransformInfo::getIntImmCost(IID, Idx, Imm, Ty); + + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return ~0U; + + switch (IID) { + default: return TCC_Free; + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue())) + return TCC_Free; + break; + } + return PPCTTI::getIntImmCost(Imm, Ty); +} + +unsigned PPCTTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) const { + if (DisablePPCConstHoist) + return TargetTransformInfo::getIntImmCost(Opcode, Idx, Imm, Ty); + + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return ~0U; + + unsigned ImmIdx = ~0U; + bool ShiftedFree = false, RunFree = false, UnsignedFree = false, + ZeroFree = false; + switch (Opcode) { + default: return TCC_Free; + case Instruction::GetElementPtr: + // Always hoist the base address of a GetElementPtr. This prevents the + // creation of new constants for every base constant that gets constant + // folded with the offset. + if (Idx == 0) + return 2 * TCC_Basic; + return TCC_Free; + case Instruction::And: + RunFree = true; // (for the rotate-and-mask instructions) + // Fallthrough... + case Instruction::Add: + case Instruction::Or: + case Instruction::Xor: + ShiftedFree = true; + // Fallthrough... + case Instruction::Sub: + case Instruction::Mul: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + ImmIdx = 1; + break; + case Instruction::ICmp: + UnsignedFree = true; + ImmIdx = 1; + // Fallthrough... (zero comparisons can use record-form instructions) + case Instruction::Select: + ZeroFree = true; + break; + case Instruction::PHI: + case Instruction::Call: + case Instruction::Ret: + case Instruction::Load: + case Instruction::Store: + break; + } + + if (ZeroFree && Imm == 0) + return TCC_Free; + + if (Idx == ImmIdx && Imm.getBitWidth() <= 64) { + if (isInt<16>(Imm.getSExtValue())) + return TCC_Free; + + if (RunFree) { + if (Imm.getBitWidth() <= 32 && + (isShiftedMask_32(Imm.getZExtValue()) || + isShiftedMask_32(~Imm.getZExtValue()))) + return TCC_Free; + + + if (ST->isPPC64() && + (isShiftedMask_64(Imm.getZExtValue()) || + isShiftedMask_64(~Imm.getZExtValue()))) + return TCC_Free; + } + + if (UnsignedFree && isUInt<16>(Imm.getZExtValue())) + return TCC_Free; + + if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0) + return TCC_Free; + } + + return PPCTTI::getIntImmCost(Imm, Ty); +} + void PPCTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const { if (ST->getDarwinDirective() == PPC::DIR_A2) { // The A2 is in-order with a deep pipeline, and concatenation unrolling @@ -220,7 +363,9 @@ unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val, // experimentally as a minimum needed to prevent unprofitable // vectorization for the paq8p benchmark. It may need to be // raised further if other unprofitable cases remain. 
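 // (Under the new values below, extracts add a penalty of 2 and inserts
 // add 2 + 7 = 9, replacing the flat 12. The asymmetry reflects that
 // element insertion needs a store plus reload, a load-hit-store round
 // trip, while extraction is cheaper. This is an illustrative reading;
 // the full function body lies outside this hunk.)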
- unsigned LHSPenalty = 12; + unsigned LHSPenalty = 2; + if (ISD == ISD::INSERT_VECTOR_ELT) + LHSPenalty += 7; // Vector element insert/extract with Altivec is very expensive, // because they require store and reload with the attendant @@ -244,14 +389,32 @@ unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); - // FIXME: Update this for VSX loads/stores that support unaligned access. + // VSX loads/stores support unaligned access. + if (ST->hasVSX()) { + if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64) + return Cost; + } + + bool UnalignedAltivec = + Src->isVectorTy() && + Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() && + LT.second.getSizeInBits() == 128 && + Opcode == Instruction::Load; // PPC in general does not support unaligned loads and stores. They'll need // to be decomposed based on the alignment factor. unsigned SrcBytes = LT.second.getStoreSize(); - if (SrcBytes && Alignment && Alignment < SrcBytes) + if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) { Cost += LT.first*(SrcBytes/Alignment-1); + // For a vector type, there is also scalarization overhead (only for + // stores, loads are expanded using the vector-load + permutation sequence, + // which is much less expensive). + if (Src->isVectorTy() && Opcode == Instruction::Store) + for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i) + Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i); + } + return Cost; } diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 3e1848b..949fdfb 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -37,11 +37,15 @@ FunctionPass *createAMDGPUCFGStructurizerPass(); // SI Passes FunctionPass *createSITypeRewriter(); FunctionPass *createSIAnnotateControlFlowPass(); +FunctionPass *createSILowerI1CopiesPass(); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); +void initializeSILowerI1CopiesPass(PassRegistry &); +extern char &SILowerI1CopiesID; + // Passes common to R600 and SI Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); @@ -76,8 +80,8 @@ enum AddressSpaces { GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). CONSTANT_ADDRESS = 2, ///< Address space for constant memory LOCAL_ADDRESS = 3, ///< Address space for local memory. - REGION_ADDRESS = 4, ///< Address space for region memory. - ADDRESS_NONE = 5, ///< Address space for unknown memory. + FLAT_ADDRESS = 4, ///< Address space for flat memory. + REGION_ADDRESS = 5, ///< Address space for region memory. PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) @@ -102,7 +106,8 @@ enum AddressSpaces { CONSTANT_BUFFER_13 = 21, CONSTANT_BUFFER_14 = 22, CONSTANT_BUFFER_15 = 23, - LAST_ADDRESS = 24 + ADDRESS_NONE = 24, ///< Address space for unknown memory. 
+ LAST_ADDRESS = ADDRESS_NONE }; } // namespace AMDGPUAS diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index d1e2cf5..2edc115 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -120,6 +120,17 @@ def AMDGPU : Target { let InstructionSet = AMDGPUInstrInfo; } +//===----------------------------------------------------------------------===// +// Predicate helper class +//===----------------------------------------------------------------------===// + +class PredicateControl { + Predicate SubtargetPredicate; + list<Predicate> OtherPredicates = []; + list<Predicate> Predicates = !listconcat([SubtargetPredicate], + OtherPredicates); +} + // Include AMDGPU TD files include "R600Schedule.td" include "SISchedule.td" diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index b166c45..170f479 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -64,7 +64,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { - findNumUsedRegistersSI(MF, KernelInfo.NumSGPR, KernelInfo.NumVGPR); + getSIProgramInfo(KernelInfo, MF); EmitProgramInfoSI(MF, KernelInfo); } else { EmitProgramInfoR600(MF); @@ -84,8 +84,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { SectionKind::getReadOnly()); OutStreamer.SwitchSection(CommentSection); - if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { + if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { OutStreamer.emitRawComment(" Kernel info:", false); + OutStreamer.emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), + false); OutStreamer.emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), false); OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), 
+ CodeSize += MI.getDesc().Size; + unsigned numOperands = MI.getNumOperands(); for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { MachineOperand &MO = MI.getOperand(op_idx); @@ -274,13 +279,9 @@ void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF, if (VCCUsed) MaxSGPR += 2; - NumSGPR = MaxSGPR; - NumVGPR = MaxVGPR; -} - -void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &Out, - MachineFunction &MF) const { - findNumUsedRegistersSI(MF, Out.NumSGPR, Out.NumVGPR); + ProgInfo.CodeLen = CodeSize; + ProgInfo.NumSGPR = MaxSGPR; + ProgInfo.NumVGPR = MaxVGPR; } void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h index a2b8337..71adc9a 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -24,7 +24,12 @@ namespace llvm { class AMDGPUAsmPrinter : public AsmPrinter { private: struct SIProgramInfo { - SIProgramInfo() : NumSGPR(0), NumVGPR(0) {} + SIProgramInfo() : + CodeLen(0), + NumSGPR(0), + NumVGPR(0) {} + + uint64_t CodeLen; unsigned NumSGPR; unsigned NumVGPR; }; @@ -42,14 +47,14 @@ private: public: explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer); - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "AMDGPU Assembly Printer"; } /// Implemented in AMDGPUMCInstLower.cpp - virtual void EmitInstruction(const MachineInstr *MI); + void EmitInstruction(const MachineInstr *MI) override; protected: bool DisasmEnabled; diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td index 65cdb24..5f8ad8c 100644 --- a/lib/Target/R600/AMDGPUCallingConv.td +++ b/lib/Target/R600/AMDGPUCallingConv.td @@ -20,7 +20,7 @@ def CC_SI : CallingConv<[ CCIfInReg>>, CCIfInReg -#include +#include "llvm/IR/Function.h" using namespace llvm; @@ -43,11 +39,12 @@ public: AMDGPUDAGToDAGISel(TargetMachine &TM); virtual ~AMDGPUDAGToDAGISel(); - SDNode *Select(SDNode *N); - virtual const char *getPassName() const; - virtual void PostprocessISelDAG(); + SDNode *Select(SDNode *N) override; + const char *getPassName() const override; + void PostprocessISelDAG() override; private: + bool isInlineImmediate(SDNode *N) const; inline SDValue getSmallIPtrImm(unsigned Imm); bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, const R600InstrInfo *TII); @@ -58,11 +55,9 @@ private: bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); - SDValue SimplifyI24(SDValue &Op); - bool SelectI24(SDValue Addr, SDValue &Op); - bool SelectU24(SDValue Addr, SDValue &Op); static bool checkType(const Value *ptr, unsigned int addrspace); + static bool checkPrivateAddress(const MachineMemOperand *Op); static bool isGlobalStore(const StoreSDNode *N); static bool isPrivateStore(const StoreSDNode *N); @@ -77,10 +72,15 @@ private: bool isLocalLoad(const LoadSDNode *N) const; bool isRegionLoad(const LoadSDNode *N) const; + /// \returns True if the current basic block being selected is at control + /// flow depth 0. Meaning that the current block dominates the + // exit block. 
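+ /// (Assumed rationale, not stated in the patch: scalar ALU instructions
+ /// ignore the per-lane EXEC mask, so carry-producing S_ADD/S_ADDC pairs
+ /// are only selected where every lane is known to execute, i.e. at
+ /// depth 0; see the ISD::ADD case below.)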
+ bool isCFDepth0() const; + const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); - bool SelectGlobalValueVariableOffset(SDValue Addr, - SDValue &BaseReg, SDValue& Offset); + bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, + SDValue& Offset); bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); @@ -91,8 +91,7 @@ private: /// \brief This pass converts a legalized DAG into an AMDGPU-specific /// DAG, ready for instruction scheduling. -FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM - ) { +FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) { return new AMDGPUDAGToDAGISel(TM); } @@ -103,32 +102,39 @@ AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM) AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { } +bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const { + const SITargetLowering *TL + = static_cast<const SITargetLowering *>(getTargetLowering()); + return TL->analyzeImmediate(N) == 0; +} + /// \brief Determine the register class for \p OpNo /// \returns The register class of the virtual register that will be used for /// the given operand number \p OpNo or NULL if the register class cannot be /// determined. const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, unsigned OpNo) const { - if (!N->isMachineOpcode()) { - return NULL; - } + if (!N->isMachineOpcode()) + return nullptr; + switch (N->getMachineOpcode()) { default: { const MCInstrDesc &Desc = TM.getInstrInfo()->get(N->getMachineOpcode()); unsigned OpIdx = Desc.getNumDefs() + OpNo; if (OpIdx >= Desc.getNumOperands()) - return NULL; + return nullptr; int RegClass = Desc.OpInfo[OpIdx].RegClass; - if (RegClass == -1) { - return NULL; - } + if (RegClass == -1) + return nullptr; + return TM.getRegisterInfo()->getRegClass(RegClass); } case AMDGPU::REG_SEQUENCE: { - const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass( - cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()); - unsigned SubRegIdx = - dyn_cast<ConstantSDNode>(N->getOperand(OpNo + 1))->getZExtValue(); + unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(RCID); + + SDValue SubRegOp = N->getOperand(OpNo + 1); + unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue(); return TM.getRegisterInfo()->getSubClassWithSubReg(SuperRC, SubRegIdx); } } @@ -139,7 +145,7 @@ SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) { } bool AMDGPUDAGToDAGISel::SelectADDRParam( - SDValue Addr, SDValue& R1, SDValue& R2) { + SDValue Addr, SDValue& R1, SDValue& R2) { if (Addr.getOpcode() == ISD::FrameIndex) { if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { @@ -196,15 +202,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { N->setNodeId(-1); - return NULL; // Already selected. + return nullptr; // Already selected. } + + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); switch (Opc) { default: break; // We are selecting i64 ADD here instead of custom lowering it during // DAG legalization, so we can fold some i64 ADDs used for address // calculation into the LOAD and STORE instructions. 
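 // (Illustrative expansion with assumed registers: the i64 add below is
 // split into a low 32-bit add that defines a carry and a high
 // add-with-carry that consumes it, roughly
 //   s_add_i32  d.lo, a.lo, b.lo   ; writes SCC
 //   s_addc_u32 d.hi, a.hi, b.hi   ; reads SCC
 // with V_ADD_I32_e32/V_ADDC_U32_e32 used instead when isCFDepth0() is
 // false, since VCC-based carries are tracked per lane.)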
case ISD::ADD: { - const AMDGPUSubtarget &ST = TM.getSubtarget(); if (N->getValueType(0) != MVT::i64 || ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) break; @@ -232,12 +239,13 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { AddLoArgs.push_back(SDValue(Lo0, 0)); AddLoArgs.push_back(SDValue(Lo1, 0)); - SDNode *AddLo = CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, - VTList, AddLoArgs); + SDNode *AddLo = CurDAG->getMachineNode( + isCFDepth0() ? AMDGPU::S_ADD_I32 : AMDGPU::V_ADD_I32_e32, + DL, VTList, AddLoArgs); SDValue Carry = SDValue(AddLo, 1); - SDNode *AddHi = CurDAG->getMachineNode(AMDGPU::S_ADDC_U32, DL, - MVT::i32, SDValue(Hi0, 0), - SDValue(Hi1, 0), Carry); + SDNode *AddHi = CurDAG->getMachineNode( + isCFDepth0() ? AMDGPU::S_ADDC_U32 : AMDGPU::V_ADDC_U32_e32, + DL, MVT::i32, SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); SDValue Args[5] = { CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), @@ -246,11 +254,10 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { SDValue(AddHi,0), Sub1, }; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args, 5); + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); } case ISD::BUILD_VECTOR: { unsigned RegClassID; - const AMDGPUSubtarget &ST = TM.getSubtarget(); const AMDGPURegisterInfo *TRI = static_cast(TM.getRegisterInfo()); const SIRegisterInfo *SIRI = @@ -316,7 +323,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { // 16 = Max Num Vector Elements // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) // 1 = Vector Register Class - SDValue RegSeqArgs[16 * 2 + 1]; + SmallVector RegSeqArgs(N->getNumOperands() * 2 + 1); RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32); bool IsRegSeq = true; @@ -333,11 +340,10 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { if (!IsRegSeq) break; return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), - RegSeqArgs, 2 * N->getNumOperands() + 1); + RegSeqArgs); } case ISD::BUILD_PAIR: { SDValue RC, SubReg0, SubReg1; - const AMDGPUSubtarget &ST = TM.getSubtarget(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { break; } @@ -346,7 +352,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32); SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32); } else if (N->getValueType(0) == MVT::i64) { - RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32); + RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32); SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32); SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32); } else { @@ -357,8 +363,37 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SDLoc(N), N->getValueType(0), Ops); } - case AMDGPUISD::REGISTER_LOAD: { + + case ISD::Constant: + case ISD::ConstantFP: { const AMDGPUSubtarget &ST = TM.getSubtarget(); + if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) + break; + + uint64_t Imm; + if (ConstantFPSDNode *FP = dyn_cast(N)) + Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue(); + else { + ConstantSDNode *C = cast(N); + Imm = C->getZExtValue(); + } + + SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32, + CurDAG->getConstant(Imm & 0xFFFFFFFF, MVT::i32)); + SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32, + CurDAG->getConstant(Imm >> 32, MVT::i32)); + const SDValue Ops[] 
= { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), + SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32), + SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32) + }; + + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SDLoc(N), + N->getValueType(0), Ops); + } + + case AMDGPUISD::REGISTER_LOAD: { if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) break; SDValue Addr, Offset; @@ -375,7 +410,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { Ops); } case AMDGPUISD::REGISTER_STORE: { - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) break; SDValue Addr, Offset; @@ -391,42 +425,95 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { CurDAG->getVTList(MVT::Other), Ops); } + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + // There is a scalar version available, but unlike the vector version which + // has a separate operand for the offset and width, the scalar version packs + // the width and offset into a single operand. Try to move to the scalar + // version if the offsets are constant, so that we can try to keep extended + // loads of kernel arguments in SGPRs. + + // TODO: Technically we could try to pattern match scalar bitshifts of + // dynamic values, but it's probably not useful. + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!Offset) + break; + + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); + if (!Width) + break; + + bool Signed = Opc == AMDGPUISD::BFE_I32; + + // Transformation function: pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + + uint32_t OffsetVal = Offset->getZExtValue(); + uint32_t WidthVal = Width->getZExtValue(); + + uint32_t PackedVal = OffsetVal | WidthVal << 16; + + SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32); + return CurDAG->getMachineNode(Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, + SDLoc(N), + MVT::i32, + N->getOperand(0), + PackedOffsetWidth); + + } } return SelectCode(N); } -bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) { - if (!ptr) { +bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { + assert(AS != 0 && "Use checkPrivateAddress instead."); + if (!Ptr) return false; - } - Type *ptrType = ptr->getType(); - return dyn_cast(ptrType)->getAddressSpace() == addrspace; + + return Ptr->getType()->getPointerAddressSpace() == AS; +} + +bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) { + if (Op->getPseudoValue()) + return true; + + if (PointerType *PT = dyn_cast(Op->getValue()->getType())) + return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + + return false; } bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { - return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); } bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { - return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)); + const Value *MemVal = N->getMemOperand()->getValue(); + return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::REGION_ADDRESS)); } bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { - return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); } bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { - return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); } bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const { - if (CbId == -1) { - return checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS); - } - return checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_BUFFER_0 + CbId); + const Value *MemVal = N->getMemOperand()->getValue(); + if (CbId == -1) + return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS); + + return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId); } bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { @@ -437,27 +524,26 @@ bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { return true; } } - return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); } bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const { - return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS); } bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { - return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); } bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { - return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); } bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { MachineMemOperand *MMO = N->getMemOperand(); - if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) { + if (checkPrivateAddress(N->getMemOperand())) { if (MMO) { - const Value *V = MMO->getValue(); - 
const PseudoSourceValue *PSV = dyn_cast(V); + const PseudoSourceValue *PSV = MMO->getPseudoValue(); if (PSV && PSV == PseudoSourceValue::getConstantPool()) { return true; } @@ -467,24 +553,34 @@ bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { } bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const { - if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) { + if (checkPrivateAddress(N->getMemOperand())) { // Check to make sure we are not a constant pool load or a constant load // that is marked as a private load if (isCPLoad(N) || isConstantLoad(N, -1)) { return false; } } - if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS) - && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) { + + const Value *MemVal = N->getMemOperand()->getValue(); + if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) && + !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) && + !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) && + !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)){ return true; } return false; } +bool AMDGPUDAGToDAGISel::isCFDepth0() const { + // FIXME: Figure out a way to use DominatorTree analysis here. + const BasicBlock *CurBlock = FuncInfo->MBB->getBasicBlock(); + const Function *Fn = FuncInfo->Fn; + return &Fn->front() == CurBlock || &Fn->back() == CurBlock; +} + + const char *AMDGPUDAGToDAGISel::getPassName() const { return "AMDGPU DAG->DAG Pattern Instruction Selection"; } @@ -499,7 +595,7 @@ const char *AMDGPUDAGToDAGISel::getPassName() const { //===----------------------------------------------------------------------===// bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, - SDValue& IntPtr) { + SDValue& IntPtr) { if (ConstantSDNode *Cst = dyn_cast(Addr)) { IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true); return true; @@ -509,7 +605,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, SDValue& BaseReg, SDValue &Offset) { - if (!dyn_cast(Addr)) { + if (!isa(Addr)) { BaseReg = Addr; Offset = CurDAG->getIntPtrConstant(0, true); return true; @@ -519,7 +615,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset) { - ConstantSDNode * IMMOffset; + ConstantSDNode *IMMOffset; if (Addr.getOpcode() == ISD::ADD && (IMMOffset = dyn_cast(Addr.getOperand(1))) @@ -563,52 +659,9 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, return true; } -SDValue AMDGPUDAGToDAGISel::SimplifyI24(SDValue &Op) { - APInt Demanded = APInt(32, 0x00FFFFFF); - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(*CurDAG, true, true); - const TargetLowering *TLI = getTargetLowering(); - if (TLI->SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) { - CurDAG->ReplaceAllUsesWith(Op, TLO.New); - CurDAG->RepositionNode(Op.getNode(), TLO.New.getNode()); - return SimplifyI24(TLO.New); - } else { - return Op; - } -} - -bool AMDGPUDAGToDAGISel::SelectI24(SDValue Op, SDValue &I24) { - - assert(Op.getValueType() == MVT::i32); - - if (CurDAG->ComputeNumSignBits(Op) == 9) { - I24 = 
SimplifyI24(Op); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectU24(SDValue Op, SDValue &U24) { - APInt KnownZero; - APInt KnownOne; - CurDAG->ComputeMaskedBits(Op, KnownZero, KnownOne); - - assert (Op.getValueType() == MVT::i32); - - // ANY_EXTEND and EXTLOAD operations can only be done on types smaller than - // i32. These smaller types are legal to use with the i24 instructions. - if ((KnownZero & APInt(KnownZero.getBitWidth(), 0xFF000000)) == 0xFF000000 || - Op.getOpcode() == ISD::ANY_EXTEND || - ISD::isEXTLoad(Op.getNode())) { - U24 = SimplifyI24(Op); - return true; - } - return false; -} - void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = - (*(const AMDGPUTargetLowering*)getTargetLowering()); + *static_cast(getTargetLowering()); bool IsModified = false; do { IsModified = false; diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 183725c..6c443ea 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -28,8 +28,50 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" using namespace llvm; + +namespace { + +/// Diagnostic information for unimplemented or unsupported feature reporting. +class DiagnosticInfoUnsupported : public DiagnosticInfo { +private: + const Twine &Description; + const Function &Fn; + + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, + DiagnosticSeverity Severity = DS_Error) + : DiagnosticInfo(getKindID(), Severity), + Description(Desc), + Fn(Fn) { } + + const Function &getFunction() const { return Fn; } + const Twine &getDescription() const { return Description; } + + void print(DiagnosticPrinter &DP) const override { + DP << "unsupported " << getDescription() << " in " << Fn.getName(); + } + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +int DiagnosticInfoUnsupported::KindID = 0; +} + + static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { @@ -88,6 +130,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::f64, Promote); AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); + setOperationAction(ISD::STORE, MVT::v2f64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); + // Custom lowering of vector stores is required for local address space // stores. setOperationAction(ISD::STORE, MVT::v4i32, Custom); @@ -103,6 +148,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : // handle 64-bit stores. 
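 // (Assumed reading of the two additions below: there is no native i64
 // truncating store to i16 or i8 here, so Expand legalizes each one into
 // an explicit truncate followed by a narrower store.)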
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i8, Expand); setTruncStoreAction(MVT::i64, MVT::i1, Expand); setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); @@ -126,6 +173,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::LOAD, MVT::f64, Promote); AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); + setOperationAction(ISD::LOAD, MVT::v2f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); @@ -152,15 +202,19 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::BR_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::FNEG, MVT::v2f32, Expand); setOperationAction(ISD::FNEG, MVT::v4f32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::SUB, MVT::i64, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Custom); + setOperationAction(ISD::UDIVREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::VSELECT, MVT::v2f32, Expand); setOperationAction(ISD::VSELECT, MVT::v4f32, Expand); @@ -168,10 +222,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : static const MVT::SimpleValueType IntTypes[] = { MVT::v2i32, MVT::v4i32 }; - const size_t NumIntTypes = array_lengthof(IntTypes); - for (unsigned int x = 0; x < NumIntTypes; ++x) { - MVT::SimpleValueType VT = IntTypes[x]; + for (MVT VT : IntTypes) { //Expand the following operations for the current type by default setOperationAction(ISD::ADD, VT, Expand); setOperationAction(ISD::AND, VT, Expand); @@ -195,12 +247,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : static const MVT::SimpleValueType FloatTypes[] = { MVT::v2f32, MVT::v4f32 }; - const size_t NumFloatTypes = array_lengthof(FloatTypes); - for (unsigned int x = 0; x < NumFloatTypes; ++x) { - MVT::SimpleValueType VT = FloatTypes[x]; + for (MVT VT : FloatTypes) { setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FADD, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); @@ -208,25 +259,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::FMUL, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSUB, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); } - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); - 
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SELECT_CC); } //===----------------------------------------------------------------------===// @@ -325,6 +364,25 @@ SDValue AMDGPUTargetLowering::LowerReturn( // Target specific lowering //===---------------------------------------------------------------------===// +SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const { + SDValue Callee = CLI.Callee; + SelectionDAG &DAG = CLI.DAG; + + const Function &Fn = *DAG.getMachineFunction().getFunction(); + + StringRef FuncName(""); + + if (const ExternalSymbolSDNode *G = dyn_cast(Callee)) + FuncName = G->getSymbol(); + else if (const GlobalAddressSDNode *G = dyn_cast(Callee)) + FuncName = G->getGlobal()->getName(); + + DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName); + DAG.getContext()->diagnose(NoCalls); + return SDValue(); +} + SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -361,12 +419,111 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do // nothing here and let the illegal result integer be handled normally. return; + case ISD::UDIV: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(UDIVREM); + break; + } + case ISD::UREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), + N->getOperand(0), N->getOperand(1)); + Results.push_back(UDIVREM.getValue(1)); + break; + } + case ISD::UDIVREM: { + SDValue Op = SDValue(N, 0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + SDValue one = DAG.getConstant(1, HalfVT); + SDValue zero = DAG.getConstant(0, HalfVT); + + //HiLo split + SDValue LHS = N->getOperand(0); + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); + + SDValue RHS = N->getOperand(1); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + + // Get Speculative values + SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); + SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); + + SDValue REM_Hi = zero; + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); + + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); + SDValue DIV_Lo = zero; + + const unsigned halfBitWidth = HalfVT.getSizeInBits(); + + for (unsigned i = 0; i < halfBitWidth; ++i) { + SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT); + // Get Value of high bit + SDValue HBit; + if (halfBitWidth == 32 && Subtarget->hasBFE()) { + HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one); + } else { + HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); + } + + SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo, + 
DAG.getConstant(halfBitWidth - 1, HalfVT)); + REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one); + REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry); + + REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one); + REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit); + + + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); + + SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT); + SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE); + + DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); + + // Update REM + + SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); + + REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE); + REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero); + REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one); + } + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi); + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); + Results.push_back(DIV); + Results.push_back(REM); + break; + } default: return; } } +// FIXME: This implements accesses to initialized globals in the constant +// address space by copying them to private and accessing that. It does not +// properly handle illegal types or vectors. The private vector loads are not +// scalarized, and the illegal scalars hit an assertion. This technique will not +// work well with large initializers, and this should eventually be +// removed. Initialized globals should be placed into a data section that the +// runtime will load into a buffer before the kernel is executed. Uses of the +// global need to be replaced with a pointer loaded from an implicit kernel +// argument into this buffer holding the copy of the data, which will remove the +// need for any of this. 
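+// (Schematic illustration with hypothetical names: given
+//   @lut = internal addrspace(2) global [2 x i32] [i32 7, i32 9]
+// the lowering below emits one store per element into the private copy,
+//   store i32 7, copy+0   ; element 0, offset 0 * sizeof(i32)
+//   store i32 9, copy+4   ; element 1, offset 1 * sizeof(i32)
+// and joins the per-element chains with an ISD::TokenFactor node.)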
SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, const SDValue &InitPtr, @@ -380,29 +537,60 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr, MachinePointerInfo(UndefValue::get(PtrTy)), false, false, TD->getPrefTypeAlignment(CI->getType())); - } else if (const ConstantFP *CFP = dyn_cast(Init)) { + } + + if (const ConstantFP *CFP = dyn_cast(Init)) { EVT VT = EVT::getEVT(CFP->getType()); PointerType *PtrTy = PointerType::get(CFP->getType(), 0); return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr, MachinePointerInfo(UndefValue::get(PtrTy)), false, false, TD->getPrefTypeAlignment(CFP->getType())); - } else if (Init->getType()->isAggregateType()) { + } + + Type *InitTy = Init->getType(); + if (StructType *ST = dyn_cast(InitTy)) { + const StructLayout *SL = TD->getStructLayout(ST); + EVT PtrVT = InitPtr.getValueType(); - unsigned NumElements = Init->getType()->getArrayNumElements(); + SmallVector Chains; + + for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { + SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); + + Constant *Elt = Init->getAggregateElement(I); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); + } + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if (SequentialType *SeqTy = dyn_cast(InitTy)) { + EVT PtrVT = InitPtr.getValueType(); + + unsigned NumElements; + if (ArrayType *AT = dyn_cast(SeqTy)) + NumElements = AT->getNumElements(); + else if (VectorType *VT = dyn_cast(SeqTy)) + NumElements = VT->getNumElements(); + else + llvm_unreachable("Unexpected type"); + + unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType()); SmallVector Chains; for (unsigned i = 0; i < NumElements; ++i) { - SDValue Offset = DAG.getConstant(i * TD->getTypeAllocSize( - Init->getType()->getArrayElementType()), PtrVT); + SDValue Offset = DAG.getConstant(i * EltSize, PtrVT); SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); - Chains.push_back(LowerConstantInitializer(Init->getAggregateElement(i), - GV, Ptr, Chain, DAG)); + + Constant *Elt = Init->getAggregateElement(i); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); } - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0], - Chains.size()); - } else { - Init->dump(); - llvm_unreachable("Unhandled constant initializer"); + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } + + Init->dump(); + llvm_unreachable("Unhandled constant initializer"); } SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, @@ -440,7 +628,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, unsigned Size = TD->getTypeAllocSize(EltType); unsigned Alignment = TD->getPrefTypeAlignment(EltType); - const GlobalVariable *Var = dyn_cast(GV); + const GlobalVariable *Var = cast(GV); const Constant *Init = Var->getInitializer(); int FI = FrameInfo->CreateStackObject(Size, Alignment, false); SDValue InitPtr = DAG.getFrameIndex(FI, @@ -461,7 +649,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { Ops.push_back((*I)->getOperand(i)); } - DAG.UpdateNodeOperands(*I, &Ops[0], Ops.size()); + DAG.UpdateNodeOperands(*I, Ops); } return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), 
getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); @@ -469,44 +657,28 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, } } -void AMDGPUTargetLowering::ExtractVectorElements(SDValue Op, SelectionDAG &DAG, - SmallVectorImpl &Args, - unsigned Start, - unsigned Count) const { - EVT VT = Op.getValueType(); - for (unsigned i = Start, e = Start + Count; i != e; ++i) { - Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op), - VT.getVectorElementType(), - Op, DAG.getConstant(i, MVT::i32))); - } -} - SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { SmallVector Args; SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); - ExtractVectorElements(A, DAG, Args, 0, - A.getValueType().getVectorNumElements()); - ExtractVectorElements(B, DAG, Args, 0, - B.getValueType().getVectorNumElements()); + DAG.ExtractVectorElements(A, Args); + DAG.ExtractVectorElements(B, Args); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), - &Args[0], Args.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); } SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { SmallVector Args; - EVT VT = Op.getValueType(); unsigned Start = cast(Op.getOperand(1))->getZExtValue(); - ExtractVectorElements(Op.getOperand(0), DAG, Args, Start, - VT.getVectorNumElements()); + EVT VT = Op.getValueType(); + DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, + VT.getVectorNumElements()); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), - &Args[0], Args.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); } SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, @@ -560,6 +732,22 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1), Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umul24: + return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_imul24: + return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_umad24: + return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_imad24: + return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case AMDGPUIntrinsic::AMDGPU_bfe_i32: return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1), @@ -590,8 +778,7 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, ///IABS(a) = SMAX(sub(0, a), a) SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, - SelectionDAG &DAG) const { - + SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), @@ -603,7 +790,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, /// Linear Interpolation /// LRP(a, b, c) = muladd(a, b, (1 - a) * c) SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, @@ -617,16 +804,16 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, } /// \brief Generate Min/Max node -SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); +SDValue 
AMDGPUTargetLowering::CombineMinMax(SDNode *N, + SelectionDAG &DAG) const { + SDLoc DL(N); + EVT VT = N->getValueType(0); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue True = Op.getOperand(2); - SDValue False = Op.getOperand(3); - SDValue CC = Op.getOperand(4); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue True = N->getOperand(2); + SDValue False = N->getOperand(3); + SDValue CC = N->getOperand(4); if (VT != MVT::f32 || !((LHS == True && RHS == False) || (LHS == False && RHS == True))) { @@ -654,10 +841,8 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, case ISD::SETOLT: case ISD::SETLE: case ISD::SETLT: { - if (LHS == True) - return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); - else - return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS); + unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX; + return DAG.getNode(Opc, DL, VT, LHS, RHS); } case ISD::SETGT: case ISD::SETGE: @@ -665,15 +850,13 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, case ISD::SETOGE: case ISD::SETUGT: case ISD::SETOGT: { - if (LHS == True) - return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS); - else - return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); + unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN; + return DAG.getNode(Opc, DL, VT, LHS, RHS); } case ISD::SETCC_INVALID: llvm_unreachable("Invalid setcc condcode!"); } - return Op; + return SDValue(); } SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, @@ -695,8 +878,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op, MemEltVT, Load->isVolatile(), Load->isNonTemporal(), Load->getAlignment())); } - return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), - Loads.data(), Loads.size()); + return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads); } SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, @@ -713,32 +895,46 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, } SDLoc DL(Op); - const SDValue &Value = Store->getValue(); + SDValue Value = Store->getValue(); EVT VT = Value.getValueType(); - const SDValue &Ptr = Store->getBasePtr(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Ptr = Store->getBasePtr(); EVT MemEltVT = MemVT.getVectorElementType(); unsigned MemEltBits = MemEltVT.getSizeInBits(); unsigned MemNumElements = MemVT.getVectorNumElements(); - EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); - SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, PackedVT); + unsigned PackedSize = MemVT.getStoreSizeInBits(); + SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32); + + assert(Value.getValueType().getScalarSizeInBits() >= 32); SDValue PackedValue; for (unsigned i = 0; i < MemNumElements; ++i) { - EVT ElemVT = VT.getVectorElementType(); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, DAG.getConstant(i, MVT::i32)); - Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT); - Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask); - SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT); - Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift); + Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); + Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg + + SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32); + Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift); + if (i == 0) { PackedValue = Elt; } else { - PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt); + PackedValue = 
DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); } } + + if (PackedSize < 32) { + EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); + return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, + Store->getMemOperand()->getPointerInfo(), + PackedVT, + Store->isNonTemporal(), Store->isVolatile(), + Store->getAlignment()); + } + return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, - MachinePointerInfo(Store->getMemOperand()->getValue()), + Store->getMemOperand()->getPointerInfo(), Store->isVolatile(), Store->isNonTemporal(), Store->getAlignment()); } @@ -766,7 +962,7 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, MemEltVT, Store->isVolatile(), Store->isNonTemporal(), Store->getAlignment())); } - return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts); + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains); } SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { @@ -788,9 +984,24 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32); } + if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { + assert(VT == MVT::i1 && "Only i1 non-extloads expected"); + // FIXME: Copied from PPC + // First, load into 32 bits, then truncate to 1 bit. + + SDValue Chain = Load->getChain(); + SDValue BasePtr = Load->getBasePtr(); + MachineMemOperand *MMO = Load->getMemOperand(); + + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, + BasePtr, MVT::i8, MMO); + return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD); + } + // Lower loads from constant address space global variables if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - isa<GlobalVariable>(GetUnderlyingObject(Load->getPointerInfo().V))) { + isa<GlobalVariable>( + GetUnderlyingObject(Load->getMemOperand()->getValue()))) { SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL, getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); @@ -887,15 +1098,13 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Num = Op.getOperand(0); SDValue Den = Op.getOperand(1); - SmallVector Results; - // RCP = URECIP(Den) = 2^32 / Den + e // e is rounding error. SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); @@ -985,10 +1194,11 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem) Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), Remainder_A_Den, Rem, ISD::SETEQ); - SDValue Ops[2]; - Ops[0] = Div; - Ops[1] = Rem; - return DAG.getMergeValues(Ops, 2, DL); + SDValue Ops[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Ops, DL); } SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, @@ -1029,81 +1239,197 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, MVT VT = Op.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); - unsigned SrcBits = ExtraVT.getScalarType().getSizeInBits(); - unsigned DestBits = ScalarVT.getSizeInBits(); - unsigned BitsDiff = DestBits - SrcBits; - - if (!Subtarget->hasBFE()) - return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG); + if (!VT.isVector()) + return SDValue(); SDValue Src = Op.getOperand(0); - if (VT.isVector()) { - SDLoc DL(Op); - // Need to scalarize this, and revisit each of the scalars later. - // TODO: Don't scalarize on Evergreen?
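To make the MergeVectorStore packing above concrete: each element of a sub-32-bit vector store is zero-extended to i32, masked down to MemEltBits, shifted by its element index, and OR'd into a single packed word, which is then emitted as one (possibly truncating) store. A scalar model for the v4i8 case (packV4I8 is an illustrative name, not in the patch):

#include <cstdint>

// Packed |= (Elt[i] & 0xff) << (8 * i); element 0 lands in the low byte.
static uint32_t packV4I8(const uint8_t Elt[4]) {
  uint32_t Packed = 0;
  for (unsigned i = 0; i < 4; ++i)
    Packed |= (uint32_t(Elt[i]) & 0xff) << (8 * i);
  return Packed;
}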
- unsigned NElts = VT.getVectorNumElements(); - SmallVector<SDValue, 8> Args; - ExtractVectorElements(Src, DAG, Args, 0, NElts); + SDLoc DL(Op); - SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); - for (unsigned I = 0; I < NElts; ++I) - Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); + // TODO: Don't scalarize on Evergreen? + unsigned NElts = VT.getVectorNumElements(); + SmallVector<SDValue, 8> Args; + DAG.ExtractVectorElements(Src, Args, 0, NElts); - return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args.data(), Args.size()); - } + SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); + for (unsigned I = 0; I < NElts; ++I) + Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); - if (SrcBits == 32) { - SDLoc DL(Op); + return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args); +} - // If the source is 32-bits, this is really half of a 2-register pair, and - // we need to discard the unused half of the pair. - SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src); - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, TruncSrc); - } +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// - unsigned NElts = VT.isVector() ? VT.getVectorNumElements() : 1; +static bool isU24(SDValue Op, SelectionDAG &DAG) { + APInt KnownZero, KnownOne; + EVT VT = Op.getValueType(); + DAG.computeKnownBits(Op, KnownZero, KnownOne); - // TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it - // might not be worth the effort, and will need to expand to shifts when - // fixing SGPR copies. - if (SrcBits < 32 && DestBits <= 32) { - SDLoc DL(Op); - MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts); - - if (DestBits != 32) - Src = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Src); - - // FIXME: This should use TargetConstant, but that hits assertions for - // Evergreen. - SDValue Ext = DAG.getNode(AMDGPUISD::BFE_I32, DL, ExtVT, - Op.getOperand(0), // Operand - DAG.getConstant(0, ExtVT), // Offset - DAG.getConstant(SrcBits, ExtVT)); // Width - - // Truncate to the original type if necessary. - if (ScalarVT == MVT::i32) - return Ext; - return DAG.getNode(ISD::TRUNCATE, DL, VT, Ext); - } + return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; +} - // For small types, extend to 32-bits first. - if (SrcBits < 32) { - SDLoc DL(Op); - MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts); +static bool isI24(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); - SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, ExtVT, Src); - SDValue Ext32 = DAG.getNode(AMDGPUISD::BFE_I32, - DL, - ExtVT, - TruncSrc, // Operand - DAG.getConstant(0, ExtVT), // Offset - DAG.getConstant(SrcBits, ExtVT)); // Width + // In order for this to be a signed 24-bit value, bit 23 must + // be a sign bit. return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated // as unsigned 24-bit values.
+ (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; +} + +static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { + + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = Op.getValueType(); + + APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, true, true); + if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); +} - return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Ext32); +template +static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, + uint32_t Offset, uint32_t Width) { + if (Width + Offset < 32) { + IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width); + return DAG.getConstant(Result, MVT::i32); } - // For everything else, use the standard bitshift expansion. - return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG); + return DAG.getConstant(Src0 >> Offset, MVT::i32); +} + +SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + switch(N->getOpcode()) { + default: break; + case ISD::MUL: { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mul; + + // FIXME: Add support for 24-bit multiply with 64-bit output on SI. + if (VT.isVector() || VT.getSizeInBits() > 32) + break; + + if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); + } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); + } else { + break; + } + + // We need to use sext even for MUL_U24, because MUL_U24 is used + // for signed multiply of 8 and 16-bit types. + SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT); + + return Reg; + } + case AMDGPUISD::MUL_I24: + case AMDGPUISD::MUL_U24: { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + simplifyI24(N0, DCI); + simplifyI24(N1, DCI); + return SDValue(); + } + case ISD::SELECT_CC: { + return CombineMinMax(N, DAG); + } + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + assert(!N->getValueType(0).isVector() && + "Vector handling of BFE not implemented"); + ConstantSDNode *Width = dyn_cast(N->getOperand(2)); + if (!Width) + break; + + uint32_t WidthVal = Width->getZExtValue() & 0x1f; + if (WidthVal == 0) + return DAG.getConstant(0, MVT::i32); + + ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); + if (!Offset) + break; + + SDValue BitsFrom = N->getOperand(0); + uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; + + bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; + + if (OffsetVal == 0) { + // This is already sign / zero extended, so try to fold away extra BFEs. + unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); + + unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); + if (OpSignBits >= SignBits) + return BitsFrom; + + EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); + if (Signed) { + // This is a sign_extend_inreg. Replace it to take advantage of existing + // DAG Combines. If not eliminated, we will match back to BFE during + // selection. 
+ + // TODO: The sext_inreg of extended types is not handled here, although we + // could handle them in a single BFE. + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, + DAG.getValueType(SmallVT)); + } + + return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); + } + + if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) { + if (Signed) { + return constantFoldBFE<int32_t>(DAG, + Val->getSExtValue(), + OffsetVal, + WidthVal); + } + + return constantFoldBFE<uint32_t>(DAG, + Val->getZExtValue(), + OffsetVal, + WidthVal); + } + + APInt Demanded = APInt::getBitsSet(32, + OffsetVal, + OffsetVal + WidthVal); + + if ((OffsetVal + WidthVal) >= 32) { + SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32); + return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, + BitsFrom, ShiftVal); + } + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || + TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + + break; + } + } + return SDValue(); } //===----------------------------------------------------------------------===// @@ -1181,7 +1507,7 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { - default: return 0; + default: return nullptr; // AMDIL DAG nodes NODE_NAME_CASE(CALL); NODE_NAME_CASE(UMUL); @@ -1202,6 +1528,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) + NODE_NAME_CASE(MUL_U24) + NODE_NAME_CASE(MUL_I24) + NODE_NAME_CASE(MAD_U24) + NODE_NAME_CASE(MAD_I24) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DOT4) NODE_NAME_CASE(EXPORT) @@ -1219,22 +1549,22 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { } } -static void computeMaskedBitsForMinMax(const SDValue Op0, - const SDValue Op1, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) { +static void computeKnownBitsForMinMax(const SDValue Op0, + const SDValue Op1, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) { APInt Op0Zero, Op0One; APInt Op1Zero, Op1One; - DAG.ComputeMaskedBits(Op0, Op0Zero, Op0One, Depth); - DAG.ComputeMaskedBits(Op1, Op1Zero, Op1One, Depth); + DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth); + DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth); KnownZero = Op0Zero & Op1Zero; KnownOne = Op0One & Op1One; } -void AMDGPUTargetLowering::computeMaskedBitsForTargetNode( +void AMDGPUTargetLowering::computeKnownBitsForTargetNode( const SDValue Op, APInt &KnownZero, APInt &KnownOne, @@ -1242,8 +1572,14 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode( unsigned Depth) const { KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. + + APInt KnownZero2; + APInt KnownOne2; unsigned Opc = Op.getOpcode(); + switch (Opc) { + default: + break; case ISD::INTRINSIC_WO_CHAIN: { // FIXME: The intrinsic should just use the node.
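The constantFoldBFE fold used above can be read as two shifts: move the field up against bit 31, then shift it back down, arithmetically for BFE_I32 (sign-extending) and logically for BFE_U32 (zero-extending). A scalar sketch under those assumptions (bfe_i32/bfe_u32 are illustrative names; the Width + Offset >= 32 case falls back to a single shift by Offset, as in the combine above):

#include <cstdint>

// Assumes 0 < Width and Width + Offset < 32, mirroring the first branch of
// constantFoldBFE. The int32_t cast and arithmetic right shift rely on
// two's-complement behavior, as mainstream compilers provide.
static int32_t bfe_i32(uint32_t Src, uint32_t Offset, uint32_t Width) {
  uint32_t Hi = Src << (32 - Offset - Width); // field now at the top
  return (int32_t)Hi >> (32 - Width);         // arithmetic shift sign-extends
}

static uint32_t bfe_u32(uint32_t Src, uint32_t Offset, uint32_t Width) {
  return (Src << (32 - Offset - Width)) >> (32 - Width); // logical shift
}

// e.g. bfe_i32(0x80, 0, 8) == -128 while bfe_u32(0x80, 0, 8) == 128.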
switch (cast(Op.getOperand(0))->getZExtValue()) { @@ -1251,8 +1587,8 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode( case AMDGPUIntrinsic::AMDGPU_umax: case AMDGPUIntrinsic::AMDGPU_imin: case AMDGPUIntrinsic::AMDGPU_umin: - computeMaskedBitsForMinMax(Op.getOperand(1), Op.getOperand(2), - KnownZero, KnownOne, DAG, Depth); + computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), + KnownZero, KnownOne, DAG, Depth); break; default: break; @@ -1264,10 +1600,62 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode( case AMDGPUISD::UMAX: case AMDGPUISD::SMIN: case AMDGPUISD::UMIN: - computeMaskedBitsForMinMax(Op.getOperand(0), Op.getOperand(1), - KnownZero, KnownOne, DAG, Depth); + computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1), + KnownZero, KnownOne, DAG, Depth); break; - default: + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + ConstantSDNode *CWidth = dyn_cast(Op.getOperand(2)); + if (!CWidth) + return; + + unsigned BitWidth = 32; + uint32_t Width = CWidth->getZExtValue() & 0x1f; + if (Width == 0) { + KnownZero = APInt::getAllOnesValue(BitWidth); + KnownOne = APInt::getNullValue(BitWidth); + return; + } + + // FIXME: This could do a lot more. If offset is 0, should be the same as + // sign_extend_inreg implementation, but that involves duplicating it. + if (Opc == AMDGPUISD::BFE_I32) + KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width); + else + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); + break; } + } +} + +unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &DAG, + unsigned Depth) const { + switch (Op.getOpcode()) { + case AMDGPUISD::BFE_I32: { + ConstantSDNode *Width = dyn_cast(Op.getOperand(2)); + if (!Width) + return 1; + + unsigned SignBits = 32 - Width->getZExtValue() + 1; + ConstantSDNode *Offset = dyn_cast(Op.getOperand(1)); + if (!Offset || !Offset->isNullValue()) + return SignBits; + + // TODO: Could probably figure something out with non-0 offsets. + unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); + return std::max(SignBits, Op0SignBits); + } + + case AMDGPUISD::BFE_U32: { + ConstantSDNode *Width = dyn_cast(Op.getOperand(2)); + return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; + } + + default: + return 1; + } } diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index a019616..d5d821d 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -29,9 +29,6 @@ protected: const AMDGPUSubtarget *Subtarget; private: - void ExtractVectorElements(SDValue Op, SelectionDAG &DAG, - SmallVectorImpl &Args, - unsigned Start, unsigned Count) const; SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, const SDValue &InitPtr, SDValue Chain, @@ -44,7 +41,7 @@ private: /// of the same bitwidth. SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const; /// \brief Split a vector store into multiple scalar stores. - /// \returns The resulting chain. + /// \returns The resulting chain. 
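/// For example, a v4i32 store becomes four scalar stores at
/// BasePtr + i * EltSize whose chains are joined with a TokenFactor;
/// roughly:
///   for (i = 0 .. NumElts - 1)
///     Chains[i] = getStore(extract_elt(Value, i), BasePtr + i * EltSize)
///   return TokenFactor(Chains)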
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; @@ -83,62 +80,67 @@ protected: public: AMDGPUTargetLowering(TargetMachine &TM); - virtual bool isFAbsFree(EVT VT) const override; - virtual bool isFNegFree(EVT VT) const override; - virtual bool isTruncateFree(EVT Src, EVT Dest) const override; - virtual bool isTruncateFree(Type *Src, Type *Dest) const override; - - virtual bool isZExtFree(Type *Src, Type *Dest) const override; - virtual bool isZExtFree(EVT Src, EVT Dest) const override; - - virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; - - virtual MVT getVectorIdxTy() const override; - virtual bool isLoadBitCastBeneficial(EVT, EVT) const override; - virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc DL, SelectionDAG &DAG) const; - virtual SDValue LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const { - CLI.Callee.dump(); - llvm_unreachable("Undefined function"); - } + bool isFAbsFree(EVT VT) const override; + bool isFNegFree(EVT VT) const override; + bool isTruncateFree(EVT Src, EVT Dest) const override; + bool isTruncateFree(Type *Src, Type *Dest) const override; + + bool isZExtFree(Type *Src, Type *Dest) const override; + bool isZExtFree(EVT Src, EVT Dest) const override; + + bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; - virtual void ReplaceNodeResults(SDNode * N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const override; + MVT getVectorIdxTy() const override; + bool isLoadBitCastBeneficial(EVT, EVT) const override; + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + SDLoc DL, SelectionDAG &DAG) const override; + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const override; + + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + void ReplaceNodeResults(SDNode * N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const override; SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const; - virtual const char* getTargetNodeName(unsigned Opcode) const; + SDValue CombineMinMax(SDNode *N, SelectionDAG &DAG) const; + const char* getTargetNodeName(unsigned Opcode) const override; - virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const { + virtual SDNode *PostISelFolding(MachineSDNode *N, + SelectionDAG &DAG) const { return N; } /// \brief Determine which of the bits specified in \p Mask are known to be /// either zero or one and return them in the \p KnownZero and \p KnownOne /// bitsets. 
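/// For the AMDGPU min/max nodes this is the intersection of what is known
/// about the two operands: a bit is known zero (or one) in the result only
/// if it is known zero (or one) in both inputs.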
- virtual void computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth = 0) const override; + void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + virtual unsigned ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; // Functions defined in AMDILISelLowering.cpp public: - virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, unsigned Intrinsic) const; + bool getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, unsigned Intrinsic) const override; /// We want to mark f32/f64 floating point values as legal. - bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; /// We don't want to shrink f64/f32 constants. - bool ShouldShrinkFPConstant(EVT VT) const; + bool ShouldShrinkFPConstant(EVT VT) const override; + + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; private: void InitAMDILLowering(); @@ -158,7 +160,6 @@ private: SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; }; namespace AMDGPUISD { @@ -188,6 +189,10 @@ enum { BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. + MUL_U24, + MUL_I24, + MAD_U24, + MAD_I24, TEXTURE_FETCH, EXPORT, CONST_ADDRESS, diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp index e32dd9f..1c3361a 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.cpp +++ b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ -20,14 +20,13 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +using namespace llvm; + #define GET_INSTRINFO_CTOR_DTOR #define GET_INSTRINFO_NAMED_OPS #define GET_INSTRMAP_INFO #include "AMDGPUGenInstrInfo.inc" -using namespace llvm; - - // Pin the vtable to this file. void AMDGPUInstrInfo::anchor() {} @@ -85,7 +84,7 @@ AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const { // TODO: Implement this function - return NULL; + return nullptr; } bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter, MachineBasicBlock &MBB) const { @@ -176,7 +175,7 @@ AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, const SmallVectorImpl &Ops, int FrameIndex) const { // TODO: Implement this function - return 0; + return nullptr; } MachineInstr* AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, @@ -184,7 +183,7 @@ AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, const SmallVectorImpl &Ops, MachineInstr *LoadMI) const { // TODO: Implement this function - return 0; + return nullptr; } bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, @@ -356,3 +355,14 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3); } } + +// Wrapper for Tablegen'd function. enum Subtarget is not defined in any +// header files, so we need to wrap it in a function that takes unsigned +// instead. 
+namespace llvm { +namespace AMDGPU { +int getMCOpcode(uint16_t Opcode, unsigned Gen) { + return getMCOpcode(Opcode); +} +} +} diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h index 426910c..74baf6b 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.h +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -52,14 +52,15 @@ public: virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const; + unsigned &DstReg, unsigned &SubIdx) const override; - unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const; + int &FrameIndex) const override; bool hasLoadFromStackSlot(const MachineInstr *MI, const MachineMemOperand *&MMO, - int &FrameIndex) const; + int &FrameIndex) const override; unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, int &FrameIndex) const; @@ -70,7 +71,7 @@ public: MachineInstr * convertToThreeAddress(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const; + LiveVariables *LV) const override; virtual void copyPhysReg(MachineBasicBlock &MBB, @@ -78,61 +79,62 @@ public: unsigned DestReg, unsigned SrcReg, bool KillSrc) const = 0; - virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - virtual void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; - virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; protected: MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl &Ops, - int FrameIndex) const; + int FrameIndex) const override; MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl &Ops, - MachineInstr *LoadMI) const; + MachineInstr *LoadMI) const override; /// \returns the smallest register index that will be accessed by an indirect /// read or write or -1 if indirect addressing is not used by this program. - virtual int getIndirectIndexBegin(const MachineFunction &MF) const; + int getIndirectIndexBegin(const MachineFunction &MF) const; /// \returns the largest register index that will be accessed by an indirect /// read or write or -1 if indirect addressing is not used by this program. 
- virtual int getIndirectIndexEnd(const MachineFunction &MF) const; + int getIndirectIndexEnd(const MachineFunction &MF) const; public: bool canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const; + const SmallVectorImpl<unsigned> &Ops) const override; bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, - unsigned Reg, bool UnfoldLoad, bool UnfoldStore, - SmallVectorImpl<MachineInstr *> &NewMIs) const; + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl<MachineInstr *> &NewMIs) const override; bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, - SmallVectorImpl<SDNode *> &NewNodes) const; + SmallVectorImpl<SDNode *> &NewNodes) const override; unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, - bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex = 0) const; + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex = nullptr) const override; bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, - unsigned NumLoads) const; + unsigned NumLoads) const override; - bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; void insertNoop(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const; - bool isPredicated(const MachineInstr *MI) const; + MachineBasicBlock::iterator MI) const override; + bool isPredicated(const MachineInstr *MI) const override; bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const; + const SmallVectorImpl<MachineOperand> &Pred2) const override; bool DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const; - bool isPredicable(MachineInstr *MI) const; - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; + std::vector<MachineOperand> &Pred) const override; + bool isPredicable(MachineInstr *MI) const override; + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; // Helper functions that check the opcode for status information bool isLoadInst(llvm::MachineInstr *MI) const; @@ -186,8 +188,7 @@ public: /// \brief Convert the AMDIL MachineInstr to a supported ISA /// MachineInstr - virtual void convertToISA(MachineInstr & MI, MachineFunction &MF, - DebugLoc DL) const; + void convertToISA(MachineInstr & MI, MachineFunction &MF, DebugLoc DL) const; /// \brief Build a MOV instruction. virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB, diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index 69d8059..f96dbb4 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -92,3 +92,18 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; +// Signed and unsigned 24-bit multiply. The highest 8 bits are ignored when +// performing the multiply. The result is a 32-bit value.
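// In scalar terms (illustrative only, with both operands taken as i32):
//   MUL_U24(a, b) = (a & 0xffffff) * (b & 0xffffff)
//   MUL_I24(a, b) = sext24(a) * sext24(b), where sext24 sign-extends bit 23
// The MAD_U24/MAD_I24 forms add a third operand to the product.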
+def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, + [SDNPCommutative] +>; +def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp, + [SDNPCommutative] +>; + +def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp, + [] +>; +def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, + [] +>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index 505fc81..80bdf5b 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -37,6 +37,18 @@ class AMDGPUShaderInst pattern> def InstFlag : OperandWithDefaultOps ; def ADDRIndirect : ComplexPattern; +def u32imm : Operand { + let PrintMethod = "printU32ImmOperand"; +} + +def u16imm : Operand { + let PrintMethod = "printU16ImmOperand"; +} + +def u8imm : Operand { + let PrintMethod = "printU8ImmOperand"; +} + //===----------------------------------------------------------------------===// // PatLeafs for floating-point comparisons //===----------------------------------------------------------------------===// @@ -253,9 +265,6 @@ def FP_ONE : PatLeaf < [{return N->isExactlyValue(1.0);}] >; -def U24 : ComplexPattern; -def I24 : ComplexPattern; - let isCodeGenOnly = 1, isPseudo = 1 in { let usesCustomInserter = 1 in { @@ -414,6 +423,40 @@ class UMUL24Pattern : Pat < >; */ +class IMad24Pat : Pat < + (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +class UMad24Pat : Pat < + (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +multiclass Expand24IBitOps { + def _expand_imad24 : Pat < + (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_imul24 : Pat < + (AMDGPUmul_i24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + +multiclass Expand24UBitOps { + def _expand_umad24 : Pat < + (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_umul24 : Pat < + (AMDGPUmul_u24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + include "R600Instructions.td" include "R700Instructions.td" include "EvergreenInstructions.td" diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td index c6521d0..9ad5e72 100644 --- a/lib/Target/R600/AMDGPUIntrinsics.td +++ b/lib/Target/R600/AMDGPUIntrinsics.td @@ -49,6 +49,10 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp index 2c9909f..b759495 100644 --- 
a/lib/Target/R600/AMDGPUMCInstLower.cpp +++ b/lib/Target/R600/AMDGPUMCInstLower.cpp @@ -17,6 +17,7 @@ #include "AMDGPUAsmPrinter.h" #include "InstPrinter/AMDGPUInstPrinter.h" #include "R600InstrInfo.h" +#include "SIInstrInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/Constants.h" @@ -31,16 +32,30 @@ using namespace llvm; -AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx): - Ctx(ctx) +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st): + Ctx(ctx), ST(st) { } +enum AMDGPUMCInstLower::SISubtarget +AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned) const { + return AMDGPUMCInstLower::SI; +} + +unsigned AMDGPUMCInstLower::getMCOpcode(unsigned MIOpcode) const { + + int MCOpcode = AMDGPU::getMCOpcode(MIOpcode, + AMDGPUSubtargetToSISubtarget(ST.getGeneration())); + if (MCOpcode == -1) + MCOpcode = MIOpcode; + + return MCOpcode; +} + void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { - OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumExplicitOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + OutMI.setOpcode(getMCOpcode(MI->getOpcode())); + for (const MachineOperand &MO : MI->explicit_operands()) { MCOperand MCOp; switch (MO.getType()) { default: @@ -67,7 +82,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { } void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { - AMDGPUMCInstLower MCInstLowering(OutContext); + AMDGPUMCInstLower MCInstLowering(OutContext, + MF->getTarget().getSubtarget()); #ifdef _DEBUG StringRef Err; diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h index d7d538e..2b7f1e3 100644 --- a/lib/Target/R600/AMDGPUMCInstLower.h +++ b/lib/Target/R600/AMDGPUMCInstLower.h @@ -13,16 +13,30 @@ namespace llvm { +class AMDGPUSubtarget; class MCInst; class MCContext; class MachineInstr; class AMDGPUMCInstLower { + // This must be kept in sync with the SISubtarget class in SIInstrInfo.td + enum SISubtarget { + SI = 0 + }; + MCContext &Ctx; + const AMDGPUSubtarget &ST; + + /// Convert a member of the AMDGPUSubtarget::Generation enum to the + /// SISubtarget enum. + enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) const; + + /// Get the MC opcode for this MachineInstr. + unsigned getMCOpcode(unsigned MIOpcode) const; public: - AMDGPUMCInstLower(MCContext &ctx); + AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST); /// \brief Lower a MachineInstr to an MCInst void lower(const MachineInstr *MI, MCInst &OutMI) const; diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp index 8fbec4e..19927fa 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.cpp +++ b/lib/Target/R600/AMDGPURegisterInfo.cpp @@ -27,10 +27,10 @@ AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm) // they are not supported at this time. 
//===----------------------------------------------------------------------===// -const uint16_t AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; +const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; -const uint16_t* AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) - const { +const MCPhysReg* +AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return &CalleeSavedReg; } @@ -54,7 +54,7 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { AMDGPU::sub15 }; - assert (Channel < array_lengthof(SubRegs)); + assert(Channel < array_lengthof(SubRegs)); return SubRegs[Channel]; } diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h index 688e1a0..a7cba0d 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.h +++ b/lib/Target/R600/AMDGPURegisterInfo.h @@ -30,11 +30,11 @@ class TargetInstrInfo; struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { TargetMachine &TM; - static const uint16_t CalleeSavedReg; + static const MCPhysReg CalleeSavedReg; AMDGPURegisterInfo(TargetMachine &tm); - virtual BitVector getReservedRegs(const MachineFunction &MF) const { + BitVector getReservedRegs(const MachineFunction &MF) const override { assert(!"Unimplemented"); return BitVector(); } @@ -43,11 +43,11 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { /// \returns The ISA reg class that is equivalent to \p RC. virtual const TargetRegisterClass * getISARegClass( const TargetRegisterClass * RC) const { - assert(!"Unimplemented"); return NULL; + assert(!"Unimplemented"); return nullptr; } virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { - assert(!"Unimplemented"); return NULL; + assert(!"Unimplemented"); return nullptr; } virtual unsigned getHWRegIndex(unsigned Reg) const { @@ -58,11 +58,11 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) unsigned getSubRegFromChannel(unsigned Channel) const; - const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const; + const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const; - unsigned getFrameRegister(const MachineFunction &MF) const; + RegScavenger *RS) const override; + unsigned getFrameRegister(const MachineFunction &MF) const override; unsigned getIndirectSubReg(unsigned IndirectIndex) const; diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index e77ab5e..f3b9932 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -16,6 +16,8 @@ using namespace llvm; +#define DEBUG_TYPE "amdgpu-subtarget" + #define GET_SUBTARGETINFO_ENUM #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR @@ -28,9 +30,6 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : // Default card StringRef GPU = CPU; Is64bit = false; - DefaultSize[0] = 64; - DefaultSize[1] = 1; - DefaultSize[2] = 1; HasVertexCache = false; TexVTXClauseSize = 0; Gen = AMDGPUSubtarget::R600; @@ -106,14 +105,6 @@ bool AMDGPUSubtarget::isTargetELF() const { return false; } -size_t -AMDGPUSubtarget::getDefaultSize(uint32_t dim) const { - if (dim > 2) { - return 1; - } else { - return DefaultSize[dim]; - } -} std::string AMDGPUSubtarget::getDeviceName() const { diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 8874d14..1b041d6 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -38,7 +38,6 @@ public: }; private: - size_t DefaultSize[3]; std::string DevName; bool Is64bit; bool Is32on64bit; @@ -60,7 +59,7 @@ public: AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS); const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } - virtual void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); bool is64bit() const; bool hasVertexCache() const; @@ -77,20 +76,28 @@ public: return hasBFE(); } + bool hasMulU24() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasMulI24() const { + return (getGeneration() >= SOUTHERN_ISLANDS || + hasCaymanISA()); + } + bool IsIRStructurizerEnabled() const; bool isIfCvtEnabled() const; unsigned getWavefrontSize() const; unsigned getStackEntrySize() const; bool hasCFAluBug() const; - virtual bool enableMachineScheduler() const { + bool enableMachineScheduler() const override { return getGeneration() <= NORTHERN_ISLANDS; } // Helper functions to simplify if statements bool isTargetELF() const; std::string getDeviceName() const; - virtual size_t getDefaultSize(uint32_t dim) const; bool dumpCode() const { return DumpCode; } bool r600ALUEncoding() const { return R600ALUInst; } diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index b11fce3..174fdca 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -42,7 +42,7 @@ extern "C" void LLVMInitializeR600Target() { } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, new R600SchedStrategy()); + return new ScheduleDAGMILive(C, make_unique()); } static MachineSchedRegistry @@ -54,7 +54,7 @@ static std::string computeDataLayout(const AMDGPUSubtarget 
&ST) { if (ST.is64bit()) { // 32-bit private, local, and region pointers. 64-bit global and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64"; + Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; } Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" @@ -103,20 +103,20 @@ public: return getTM(); } - virtual ScheduleDAGInstrs * - createMachineScheduler(MachineSchedContext *C) const { + ScheduleDAGInstrs * + createMachineScheduler(MachineSchedContext *C) const override { const AMDGPUSubtarget &ST = TM->getSubtarget(); if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) return createR600MachineScheduler(C); - return 0; + return nullptr; } - virtual bool addPreISel(); - virtual bool addInstSelector(); - virtual bool addPreRegAlloc(); - virtual bool addPostRegAlloc(); - virtual bool addPreSched2(); - virtual bool addPreEmitPass(); + bool addPreISel() override; + bool addInstSelector() override; + bool addPreRegAlloc() override; + bool addPostRegAlloc() override; + bool addPreSched2() override; + bool addPreEmitPass() override; }; } // End of anonymous namespace @@ -154,6 +154,7 @@ AMDGPUPassConfig::addPreISel() { bool AMDGPUPassConfig::addInstSelector() { addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + addPass(createSILowerI1CopiesPass()); return false; } diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h index f942614..1287e13 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -20,7 +20,6 @@ #include "AMDGPUSubtarget.h" #include "AMDILIntrinsicInfo.h" #include "R600ISelLowering.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/IR/DataLayout.h" namespace llvm { @@ -31,8 +30,8 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { const DataLayout Layout; AMDGPUFrameLowering FrameLowering; AMDGPUIntrinsicInfo IntrinsicInfo; - OwningPtr InstrInfo; - OwningPtr TLInfo; + std::unique_ptr InstrInfo; + std::unique_ptr TLInfo; const InstrItineraryData *InstrItins; public: @@ -40,30 +39,32 @@ public: StringRef CPU, TargetOptions Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~AMDGPUTargetMachine(); - virtual const AMDGPUFrameLowering *getFrameLowering() const { + const AMDGPUFrameLowering *getFrameLowering() const override { return &FrameLowering; } - virtual const AMDGPUIntrinsicInfo *getIntrinsicInfo() const { + const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { return &IntrinsicInfo; } - virtual const AMDGPUInstrInfo *getInstrInfo() const { + const AMDGPUInstrInfo *getInstrInfo() const override { return InstrInfo.get(); } - virtual const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; } - virtual const AMDGPURegisterInfo *getRegisterInfo() const { + const AMDGPUSubtarget *getSubtargetImpl() const override { + return &Subtarget; + } + const AMDGPURegisterInfo *getRegisterInfo() const override { return &InstrInfo->getRegisterInfo(); } - virtual AMDGPUTargetLowering *getTargetLowering() const { + AMDGPUTargetLowering *getTargetLowering() const override { return TLInfo.get(); } - virtual const InstrItineraryData *getInstrItineraryData() const { + const InstrItineraryData *getInstrItineraryData() const override { return InstrItins; } - virtual const DataLayout *getDataLayout() const { return &Layout; } - virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); + const DataLayout *getDataLayout() const override { return &Layout; } + TargetPassConfig *createPassConfig(PassManagerBase &PM) 
override; /// \brief Register R600 analysis passes with a pass manager. - virtual void addAnalysisPasses(PassManagerBase &PM); + void addAnalysisPasses(PassManagerBase &PM) override; }; } // End namespace llvm diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp index 51225eb..ea78f43 100644 --- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp @@ -15,7 +15,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "AMDGPUtti" #include "AMDGPU.h" #include "AMDGPUTargetMachine.h" #include "llvm/Analysis/LoopInfo.h" @@ -26,6 +25,8 @@ #include "llvm/Target/TargetLowering.h" using namespace llvm; +#define DEBUG_TYPE "AMDGPUtti" + // Declare the pass initialization routine locally as target-specific passes // don't have a target-wide initialization entry point, and so we rely on the // pass constructor initialization. @@ -45,7 +46,7 @@ class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo { unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; public: - AMDGPUTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { + AMDGPUTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { llvm_unreachable("This pass cannot be directly constructed"); } @@ -55,9 +56,9 @@ public: initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry()); } - virtual void initializePass() override { pushTTIStack(this); } + void initializePass() override { pushTTIStack(this); } - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + void getAnalysisUsage(AnalysisUsage &AU) const override { TargetTransformInfo::getAnalysisUsage(AU); } @@ -65,15 +66,16 @@ public: static char ID; /// Provide necessary pointer adjustments for the two base classes. - virtual void *getAdjustedAnalysisPointer(const void *ID) override { + void *getAdjustedAnalysisPointer(const void *ID) override { if (ID == &TargetTransformInfo::ID) return (TargetTransformInfo *)this; return this; } - virtual bool hasBranchDivergence() const override; + bool hasBranchDivergence() const override; - virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const; + void getUnrollingPreferences(Loop *L, + UnrollingPreferences &UP) const override; /// @} }; @@ -109,11 +111,11 @@ void AMDGPUTTI::getUnrollingPreferences(Loop *L, // require us to use indirect addressing, which is slow and prone to // compiler bugs. If this loop does an address calculation on an // alloca ptr, then we want to use a higher than normal loop unroll - // threshold. This will give SROA a better chance to eliminate these - // allocas. - // - // Don't use the maximum allowed value here as it will make some - // programs way too big. + // threshold. This will give SROA a better chance to eliminate these + // allocas. + // + // Don't use the maximum allowed value here as it will make some + // programs way too big. 
UP.Threshold = 500; } } diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp index 21ca560..f3a0391 100644 --- a/lib/Target/R600/AMDILCFGStructurizer.cpp +++ b/lib/Target/R600/AMDILCFGStructurizer.cpp @@ -8,8 +8,6 @@ /// \file //==-----------------------------------------------------------------------===// -#define DEBUG_TYPE "structcfg" - #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "R600InstrInfo.h" @@ -34,6 +32,8 @@ using namespace llvm; +#define DEBUG_TYPE "structcfg" + #define DEFAULT_VEC_SLOTS 8 // TODO: move-begin. @@ -135,15 +135,15 @@ public: static char ID; AMDGPUCFGStructurizer() : - MachineFunctionPass(ID), TII(NULL), TRI(NULL) { + MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) { initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); } - const char *getPassName() const { + const char *getPassName() const override { return "AMDGPU Control Flow Graph structurizer Pass"; } - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved(); AU.addRequired(); AU.addRequired(); @@ -159,7 +159,7 @@ public: /// sure all loops have an exit block bool prepare(); - bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); TRI = &TII->getRegisterInfo(); DEBUG(MF.dump();); @@ -168,7 +168,7 @@ public: MLI = &getAnalysis<MachineLoopInfo>(); DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); MDT = &getAnalysis<MachineDominatorTree>(); - DEBUG(MDT->print(dbgs(), (const llvm::Module*)0);); + DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr);); PDT = &getAnalysis<MachinePostDominatorTree>(); DEBUG(PDT->print(dbgs());); prepare(); @@ -334,7 +334,7 @@ protected: MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I); void recordSccnum(MachineBasicBlock *MBB, int SCCNum); void retireBlock(MachineBasicBlock *MBB); - void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = NULL); + void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr); MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&); /// This is a workaround for findNearestCommonDominator not being available @@ -361,7 +361,7 @@ MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) const { LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep); if (It == LLInfoMap.end()) - return NULL; + return nullptr; return (*It).second; } @@ -632,7 +632,7 @@ MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( MachineInstr *MI = &*It; if (MI && (isCondBranch(MI) || isUncondBranch(MI))) return MI; - return NULL; + return nullptr; } MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( @@ -648,7 +648,7 @@ break; } } - return NULL; + return nullptr; } MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { @@ -658,7 +658,7 @@ if (instr->getOpcode() == AMDGPU::RETURN) return instr; } - return NULL; + return nullptr; } MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { @@ -668,7 +668,7 @@ if (MI->getOpcode() == AMDGPU::CONTINUE) return MI; } - return NULL; + return nullptr; } bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { @@ -819,7 +819,7 @@ bool AMDGPUCFGStructurizer::run() {
SmallVectorImpl::const_iterator SccBeginIter = It; - MachineBasicBlock *SccBeginMBB = NULL; + MachineBasicBlock *SccBeginMBB = nullptr; int SccNumBlk = 0; // The number of active blocks, init to a // maximum possible number. int SccNumIter; // Number of iteration in this SCC. @@ -874,7 +874,7 @@ bool AMDGPUCFGStructurizer::run() { } if (ContNextScc) - SccBeginMBB = NULL; + SccBeginMBB = nullptr; } //while, "one iteration" over the function. MachineBasicBlock *EntryMBB = @@ -933,7 +933,7 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { MachineBasicBlock *MBB; for (scc_iterator It = scc_begin(MF); !It.isAtEnd(); ++It, ++SccNum) { - std::vector &SccNext = *It; + const std::vector &SccNext = *It; for (std::vector::const_iterator blockIter = SccNext.begin(), blockEnd = SccNext.end(); blockIter != blockEnd; ++blockIter) { @@ -1026,7 +1026,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) { // Triangle pattern, false is empty LandBlk = FalseMBB; - FalseMBB = NULL; + FalseMBB = nullptr; } else if (FalseMBB->succ_size() == 1 && *FalseMBB->succ_begin() == TrueMBB) { // Triangle pattern, true is empty @@ -1034,7 +1034,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { std::swap(TrueMBB, FalseMBB); reversePredicateSetter(MBB->end()); LandBlk = FalseMBB; - FalseMBB = NULL; + FalseMBB = nullptr; } else if (FalseMBB->succ_size() == 1 && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) { LandBlk = *FalseMBB->succ_begin(); @@ -1075,13 +1075,11 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { int AMDGPUCFGStructurizer::loopendPatternMatch() { std::vector NestedLoops; - for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end(); - It != E; ++It) { - df_iterator LpIt = df_begin(*It), - LpE = df_end(*It); - for (; LpIt != LpE; ++LpIt) - NestedLoops.push_back(*LpIt); - } + for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end(); It != E; + ++It) + for (MachineLoop *ML : depth_first(*It)) + NestedLoops.push_back(ML); + if (NestedLoops.size() == 0) return 0; @@ -1244,7 +1242,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, DEBUG( dbgs() << " not working\n"; ); - DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : NULL; + DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr; } // walk down the postDomTree return Num; @@ -1723,11 +1721,11 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); if (!LoopHeader || !LoopLatch) - return NULL; + return nullptr; MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch); // Is LoopRep an infinite loop ? 
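Most of this file's churn is the mechanical NULL/0 to nullptr conversion. The payoff is more than style: nullptr has its own type, so it can never be silently taken as an integer. A self-contained illustration (not from the patch itself):

    #include <iostream>

    void f(int)   { std::cout << "f(int)\n"; }
    void f(int *) { std::cout << "f(int*)\n"; }

    int main() {
      f(0);       // literal 0 is an int first: picks f(int)
      f(nullptr); // unambiguously picks f(int*)
      // f(NULL) picks f(int) or is ambiguous, depending on how the
      // platform defines NULL; nullptr removes that trap.
    }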
if (!BranchMI || !isUncondBranch(BranchMI)) - return NULL; + return nullptr; MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); FuncRep->push_back(DummyExitBlk); //insert to function @@ -1860,7 +1858,7 @@ AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, return findNearestCommonPostDom(MBB1, *MBB2->succ_begin()); if (!Node1 || !Node2) - return NULL; + return nullptr; Node1 = Node1->getIDom(); while (Node1) { @@ -1869,7 +1867,7 @@ AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, Node1 = Node1->getIDom(); } - return NULL; + return nullptr; } MachineBasicBlock * diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp index 0761ff4..7cea803 100644 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ b/lib/Target/R600/AMDILISelLowering.cpp @@ -39,61 +39,55 @@ using namespace llvm; // TargetLowering Class Implementation Begins //===----------------------------------------------------------------------===// void AMDGPUTargetLowering::InitAMDILLowering() { - static const int types[] = { - (int)MVT::i8, - (int)MVT::i16, - (int)MVT::i32, - (int)MVT::f32, - (int)MVT::f64, - (int)MVT::i64, - (int)MVT::v2i8, - (int)MVT::v4i8, - (int)MVT::v2i16, - (int)MVT::v4i16, - (int)MVT::v4f32, - (int)MVT::v4i32, - (int)MVT::v2f32, - (int)MVT::v2i32, - (int)MVT::v2f64, - (int)MVT::v2i64 + static const MVT::SimpleValueType types[] = { + MVT::i8, + MVT::i16, + MVT::i32, + MVT::f32, + MVT::f64, + MVT::i64, + MVT::v2i8, + MVT::v4i8, + MVT::v2i16, + MVT::v4i16, + MVT::v4f32, + MVT::v4i32, + MVT::v2f32, + MVT::v2i32, + MVT::v2f64, + MVT::v2i64 }; - static const int IntTypes[] = { - (int)MVT::i8, - (int)MVT::i16, - (int)MVT::i32, - (int)MVT::i64 + static const MVT::SimpleValueType IntTypes[] = { + MVT::i8, + MVT::i16, + MVT::i32, + MVT::i64 }; - static const int FloatTypes[] = { - (int)MVT::f32, - (int)MVT::f64 + static const MVT::SimpleValueType FloatTypes[] = { + MVT::f32, + MVT::f64 }; - static const int VectorTypes[] = { - (int)MVT::v2i8, - (int)MVT::v4i8, - (int)MVT::v2i16, - (int)MVT::v4i16, - (int)MVT::v4f32, - (int)MVT::v4i32, - (int)MVT::v2f32, - (int)MVT::v2i32, - (int)MVT::v2f64, - (int)MVT::v2i64 + static const MVT::SimpleValueType VectorTypes[] = { + MVT::v2i8, + MVT::v4i8, + MVT::v2i16, + MVT::v4i16, + MVT::v4f32, + MVT::v4i32, + MVT::v2f32, + MVT::v2i32, + MVT::v2f64, + MVT::v2i64 }; - const size_t NumTypes = array_lengthof(types); - const size_t NumFloatTypes = array_lengthof(FloatTypes); - const size_t NumIntTypes = array_lengthof(IntTypes); - const size_t NumVectorTypes = array_lengthof(VectorTypes); const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget(); // These are the current register classes that are // supported - for (unsigned int x = 0; x < NumTypes; ++x) { - MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x]; - + for (MVT VT : types) { setOperationAction(ISD::SUBE, VT, Expand); setOperationAction(ISD::SUBC, VT, Expand); setOperationAction(ISD::ADDE, VT, Expand); @@ -109,9 +103,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() { setOperationAction(ISD::SDIV, VT, Custom); } } - for (unsigned int x = 0; x < NumFloatTypes; ++x) { - MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x]; - + for (MVT VT : FloatTypes) { // IL does not have these operations for floating point types setOperationAction(ISD::FP_ROUND_INREG, VT, Expand); setOperationAction(ISD::SETOLT, VT, Expand); @@ -124,9 +116,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() { setOperationAction(ISD::SETULE, VT, 
Expand); } - for (unsigned int x = 0; x < NumIntTypes; ++x) { - MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x]; - + for (MVT VT : IntTypes) { // GPU also does not have divrem function for signed or unsigned setOperationAction(ISD::SDIVREM, VT, Expand); @@ -142,9 +132,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() { setOperationAction(ISD::CTLZ, VT, Expand); } - for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) { - MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii]; - + for (MVT VT : VectorTypes) { setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); diff --git a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDILIntrinsicInfo.cpp index 762ee39..fab4a3b 100644 --- a/lib/Target/R600/AMDILIntrinsicInfo.cpp +++ b/lib/Target/R600/AMDILIntrinsicInfo.cpp @@ -38,7 +38,7 @@ AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys, }; if (IntrID < Intrinsic::num_intrinsics) { - return 0; + return nullptr; } assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && "Invalid intrinsic ID"); diff --git a/lib/Target/R600/AMDILIntrinsicInfo.h b/lib/Target/R600/AMDILIntrinsicInfo.h index 35559e2..924275a 100644 --- a/lib/Target/R600/AMDILIntrinsicInfo.h +++ b/lib/Target/R600/AMDILIntrinsicInfo.h @@ -34,13 +34,13 @@ enum ID { class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { public: AMDGPUIntrinsicInfo(TargetMachine *tm); - std::string getName(unsigned int IntrId, Type **Tys = 0, - unsigned int numTys = 0) const; - unsigned int lookupName(const char *Name, unsigned int Len) const; - bool isOverloaded(unsigned int IID) const; + std::string getName(unsigned int IntrId, Type **Tys = nullptr, + unsigned int numTys = 0) const override; + unsigned int lookupName(const char *Name, unsigned int Len) const override; + bool isOverloaded(unsigned int IID) const override; Function *getDeclaration(Module *M, unsigned int ID, - Type **Tys = 0, - unsigned int numTys = 0) const; + Type **Tys = nullptr, + unsigned int numTys = 0) const override; }; } // end namespace llvm diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td index 658deb5..4a3e02e 100644 --- a/lib/Target/R600/AMDILIntrinsics.td +++ b/lib/Target/R600/AMDILIntrinsics.td @@ -92,10 +92,6 @@ let TargetPrefix = "AMDIL", isTarget = 1 in { BinaryIntInt; def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">, BinaryIntInt; - def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">, - BinaryIntInt; - def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">, - BinaryIntInt; def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">, BinaryIntInt; def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">, diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 93a5117..3c6fa5a 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -45,6 +45,7 @@ add_llvm_target(R600CodeGen SIInstrInfo.cpp SIISelLowering.cpp SILowerControlFlow.cpp + SILowerI1Copies.cpp SIMachineFunctionInfo.cpp SIRegisterInfo.cpp SITypeRewriter.cpp diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td index acd7bde..2630345 100644 --- a/lib/Target/R600/CaymanInstructions.td +++ b/lib/Target/R600/CaymanInstructions.td @@ -21,12 +21,14 @@ def isCayman : Predicate<"Subtarget.hasCaymanISA()">; let Predicates = [isCayman] in { def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24", - [(set i32:$dst, (add (mul I24:$src0, I24:$src1), 
i32:$src2))], VecALU + [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU >; def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24", - [(set i32:$dst, (mul I24:$src0, I24:$src1))], VecALU + [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU >; +def : IMad24Pat; + let isVector = 1 in { def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>; @@ -47,6 +49,7 @@ def COS_cm : COS_Common<0x8E>; def : POW_Common ; defm DIV_cm : DIV_Common; +defm : Expand24UBitOps; // RECIP_UINT emulation for Cayman // The multiplication scales from [0,1] to the unsigned integer range diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td index 6430ca6..2065441 100644 --- a/lib/Target/R600/EvergreenInstructions.td +++ b/lib/Target/R600/EvergreenInstructions.td @@ -75,6 +75,8 @@ def COS_eg : COS_Common<0x8E>; def : POW_Common ; def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; +defm : Expand24IBitOps; + //===----------------------------------------------------------------------===// // Memory read/write instructions //===----------------------------------------------------------------------===// @@ -273,7 +275,7 @@ def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", VecALU >; -def BFE_INT_eg : R600_3OP <0x4, "BFE_INT", +def BFE_INT_eg : R600_3OP <0x5, "BFE_INT", [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))], VecALU >; @@ -286,6 +288,13 @@ def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", VecALU >; +def : Pat<(i32 (sext_inreg i32:$src, i1)), + (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>; +def : Pat<(i32 (sext_inreg i32:$src, i8)), + (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>; +def : Pat<(i32 (sext_inreg i32:$src, i16)), + (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; + defm : BFIPatterns ; def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", @@ -294,8 +303,11 @@ def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", >; def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24", - [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))], VecALU + [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))], VecALU >; + +def : UMad24Pat; + def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; def : ROTRPattern ; def MULADD_eg : MULADD_Common<0x14>; @@ -309,7 +321,7 @@ def CNDGE_eg : CNDGE_Common<0x1B>; def MUL_LIT_eg : MUL_LIT_Common<0x1F>; def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", - [(set i32:$dst, (mul U24:$src0, U24:$src1))], VecALU + [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU >; def DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index 7105879..11ae091 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -12,6 +12,8 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; @@ -23,6 +25,21 @@ void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, printAnnotation(OS, Annot); } +void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xff); +} + +void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff); +} + 
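The new immediate printers all follow one pattern: mask the 64-bit immediate stored in the MCOperand down to the operand's width, then format it as hex. The masking can be sanity-checked in plain C++ with no MC types involved:

    #include <cstdint>
    #include <cstdio>

    int main() {
      int64_t Imm = -1; // immediates are stored sign-extended
      std::printf("%#llx\n", (unsigned long long)(Imm & 0xff));       // 0xff
      std::printf("%#llx\n", (unsigned long long)(Imm & 0xffff));     // 0xffff
      std::printf("%#llx\n", (unsigned long long)(Imm & 0xffffffff)); // 0xffffffff
    }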
+void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); +} + void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { switch (reg) { case AMDGPU::VCC: @@ -41,43 +58,78 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) { break; } - // It's seems there's no way to use SIRegisterInfo here, and dealing with the - // giant enum of all the different shifted sets of registers is pretty - // unmanagable, so parse the name and reformat it to be prettier. - StringRef Name(getRegisterName(reg)); - - std::pair Split = Name.split('_'); - StringRef SubRegName = Split.first; - StringRef Rest = Split.second; + char Type; + unsigned NumRegs; + + if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 1; + } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 1; + } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 4; + } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 4; + } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 3; + } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 8; + } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 8; + } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 16; + } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 16; + } else { + O << getRegisterName(reg); + return; + } - if (SubRegName.size() <= 4) { // Must at least be as long as "SGPR"/"VGPR". - O << Name; + // The low 8 bits of the encoding value are the register index, for both + // VGPRs and SGPRs.
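Once the register-class walk has produced a type letter and a register count, the printed name is pure arithmetic on the encoding, as the code that follows shows. A hypothetical helper mirroring the output format (a sketch, not the printer's actual code):

    #include <iostream>
    #include <string>

    // One register prints as "v5"; an N-register tuple as "s[first:last]".
    // The low 8 bits of the encoding hold the register index.
    std::string regName(char Type, unsigned Encoding, unsigned NumRegs) {
      unsigned Idx = Encoding & 0xff;
      if (NumRegs == 1)
        return std::string(1, Type) + std::to_string(Idx);
      return std::string(1, Type) + '[' + std::to_string(Idx) + ':' +
             std::to_string(Idx + NumRegs - 1) + ']';
    }

    int main() {
      std::cout << regName('v', 5, 1) << "\n"; // v5
      std::cout << regName('s', 0, 4) << "\n"; // s[0:3]
    }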
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + if (NumRegs == 1) { + O << Type << RegIdx; return; } - unsigned RegIndex; - StringRef RegIndexStr = SubRegName.drop_front(4); + O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; +} - if (RegIndexStr.getAsInteger(10, RegIndex)) { - O << Name; +void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) { + int32_t SImm = static_cast(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; return; } - if (SubRegName.front() == 'V') - O << 'v'; - else if (SubRegName.front() == 'S') - O << 's'; - else { - O << Name; + if (Imm == FloatToBits(1.0f) || + Imm == FloatToBits(-1.0f) || + Imm == FloatToBits(0.5f) || + Imm == FloatToBits(-0.5f) || + Imm == FloatToBits(2.0f) || + Imm == FloatToBits(-2.0f) || + Imm == FloatToBits(4.0f) || + Imm == FloatToBits(-4.0f)) { + O << BitsToFloat(Imm); return; } - if (Rest.empty()) // Only 1 32-bit register - O << RegIndex; - else { - unsigned NumReg = Rest.count('_') + 2; - O << '[' << RegIndex << ':' << (RegIndex + NumReg - 1) << ']'; - } + O << formatHex(static_cast(Imm)); } void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, @@ -95,7 +147,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, break; } } else if (Op.isImm()) { - O << Op.getImm(); + printImmediate(Op.getImm(), O); } else if (Op.isFPImm()) { O << Op.getFPImm(); } else if (Op.isExpr()) { @@ -106,6 +158,18 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned InputModifiers = MI->getOperand(OpNo).getImm(); + if (InputModifiers & 0x1) + O << "-"; + if (InputModifiers & 0x2) + O << "|"; + printOperand(MI, OpNo + 1, O); + if (InputModifiers & 0x2) + O << "|"; +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNum).getImm(); diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h index 1d24680..6ca7170 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -29,13 +29,18 @@ public: void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); - virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; private: - static void printRegOperand(unsigned RegNo, raw_ostream &O); - static void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printRegOperand(unsigned RegNo, raw_ostream &O); + void printImmediate(uint32_t Imm, raw_ostream &O); + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); - static void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm, StringRef Default = ""); static void printAbs(const MCInst *MI, unsigned OpNo, 
raw_ostream &O); diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp index a6bb59f..489cec7 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -23,8 +23,8 @@ namespace { class AMDGPUMCObjectWriter : public MCObjectWriter { public: AMDGPUMCObjectWriter(raw_ostream &OS) : MCObjectWriter(OS, true) { } - virtual void ExecutePostLayoutBinding(MCAssembler &Asm, - const MCAsmLayout &Layout) { + void ExecutePostLayoutBinding(MCAssembler &Asm, + const MCAsmLayout &Layout) override { //XXX: Implement if necessary. } void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout, @@ -34,7 +34,7 @@ public: assert(!"Not implemented"); } - virtual void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout); + void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; }; @@ -43,19 +43,19 @@ public: AMDGPUAsmBackend(const Target &T) : MCAsmBackend() {} - virtual unsigned getNumFixupKinds() const { return 0; }; - virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const; - virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const { + unsigned getNumFixupKinds() const override { return 0; }; + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { return false; } - virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const { + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { assert(!"Not implemented"); } - virtual bool mayNeedRelaxation(const MCInst &Inst) const { return false; } - virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const { + bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override { return true; } }; @@ -88,7 +88,7 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { public: ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { } - MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { return createAMDGPUELFObjectWriter(OS); } }; diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp index aee9bd1..78bbe0a 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -35,7 +35,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() { Data16bitsDirective = ".short\t"; Data32bitsDirective = ".long\t"; Data64bitsDirective = ".quad\t"; - GPRel32Directive = 0; + GPRel32Directive = nullptr; SunStyleELFSectionSwitchSyntax = true; UsesELFSectionDirectiveForBSS = true; @@ -58,5 +58,5 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() { const MCSection* AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const { - return 0; + return nullptr; } diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h index 22afd63..59aebec 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -22,7 +22,7 @@ class StringRef; class AMDGPUMCAsmInfo : 
public MCAsmInfo { public: explicit AMDGPUMCAsmInfo(StringRef &TT); - const MCSection* getNonexecutableStackSection(MCContext &CTX) const; + const MCSection* getNonexecutableStackSection(MCContext &CTX) const override; }; } // namespace llvm #endif // AMDGPUMCASMINFO_H diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 6592b0e..38a2956 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -24,6 +24,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + #define GET_INSTRINFO_MC_DESC #include "AMDGPUGenInstrInfo.inc" @@ -33,8 +35,6 @@ #define GET_REGINFO_MC_DESC #include "AMDGPUGenRegisterInfo.inc" -using namespace llvm; - static MCInstrInfo *createAMDGPUMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitAMDGPUMCInstrInfo(X); diff --git a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt index b1beab0..74b8ca0 100644 --- a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; +;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt -------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -19,5 +19,5 @@ type = Library name = R600Desc parent = R600 -required_libraries = R600AsmPrinter R600Info MC +required_libraries = MC R600AsmPrinter R600Info Support add_to_library_groups = R600 diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 286c7d1..5e7cefe 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -41,14 +41,14 @@ public: : MCII(mcii), MRI(mri) { } /// \brief Encode the instruction and write it to the OS. - virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + void EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI) const override; /// \returns the encoding for an MCOperand. - virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; private: void EmitByte(unsigned int byte, raw_ostream &OS) const; diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp index f42e978..ee02111 100644 --- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp @@ -54,14 +54,14 @@ public: ~SIMCCodeEmitter() { } /// \brief Encode the instruction and write it to the OS. - virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + void EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI) const override; /// \returns the encoding for an MCOperand. 
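The same virtual-to-override conversion repeats across these MC classes. What it buys is a compile-time guarantee that the signature still matches the base class; a tiny standalone demonstration:

    struct Base {
      virtual void encode(int) const {}
      virtual ~Base() = default;
    };

    struct Derived : Base {
      void encode(int) const override {} // OK: genuinely overrides
      // void encode(long) const override {}  // would not compile:
      // 'override' rejects anything that fails to override, where a
      // plain 'virtual' would silently introduce a new overload.
    };

    int main() {}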
- virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; }; } // End anonymous namespace diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index fde4481..ce17d7c 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -106,3 +106,5 @@ def : Proc<"kabini", SI_Itin, [FeatureSeaIslands]>; def : Proc<"kaveri", SI_Itin, [FeatureSeaIslands]>; def : Proc<"hawaii", SI_Itin, [FeatureSeaIslands]>; + +def : Proc<"mullins", SI_Itin, [FeatureSeaIslands]>; diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp index 3d9015c..92bf0df 100644 --- a/lib/Target/R600/R600ClauseMergePass.cpp +++ b/lib/Target/R600/R600ClauseMergePass.cpp @@ -13,7 +13,6 @@ /// It needs to be called after IfCvt for best results. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "r600mergeclause" #include "AMDGPU.h" #include "R600Defines.h" #include "R600InstrInfo.h" @@ -27,6 +26,8 @@ using namespace llvm; +#define DEBUG_TYPE "r600mergeclause" + namespace { static bool isCFAlu(const MachineInstr *MI) { @@ -62,9 +63,9 @@ private: public: R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const; + const char *getPassName() const override; }; char R600ClauseMergePass::ID = 0; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index f74bef3..d255e96 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -12,7 +12,6 @@ /// computing their address on the fly ; it also sets STACK_SIZE info. 
//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "r600cf" #include "llvm/Support/Debug.h" #include "AMDGPU.h" #include "R600Defines.h" @@ -26,6 +25,8 @@ using namespace llvm; +#define DEBUG_TYPE "r600cf" + namespace { struct CFStack { @@ -468,13 +469,13 @@ private: public: R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID), - TII (0), TRI(0), + TII (nullptr), TRI(nullptr), ST(tm.getSubtarget()) { const AMDGPUSubtarget &ST = tm.getSubtarget(); MaxFetchInst = ST.getTexVTXClauseSize(); } - virtual bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { TII=static_cast(MF.getTarget().getInstrInfo()); TRI=static_cast(MF.getTarget().getRegisterInfo()); R600MachineFunctionInfo *MFI = MF.getInfo(); @@ -501,13 +502,13 @@ public: DEBUG(dbgs() << CfCount << ":"; I->dump();); FetchClauses.push_back(MakeFetchClause(MBB, I)); CfCount++; - LastAlu.back() = 0; + LastAlu.back() = nullptr; continue; } MachineBasicBlock::iterator MI = I; if (MI->getOpcode() != AMDGPU::ENDIF) - LastAlu.back() = 0; + LastAlu.back() = nullptr; if (MI->getOpcode() == AMDGPU::CF_ALU) LastAlu.back() = MI; I++; @@ -558,7 +559,7 @@ public: break; } case AMDGPU::IF_PREDICATE_SET: { - LastAlu.push_back(0); + LastAlu.push_back(nullptr); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP)) .addImm(0) @@ -665,7 +666,7 @@ public: return false; } - const char *getPassName() const { + const char *getPassName() const override { return "R600 Control Flow Finalizer Pass"; } }; diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp index 5bd793a..38afebe 100644 --- a/lib/Target/R600/R600EmitClauseMarkers.cpp +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -291,12 +291,12 @@ private: public: static char ID; - R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(0), Address(0) { + R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) { initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry()); } - virtual bool runOnMachineFunction(MachineFunction &MF) { + bool runOnMachineFunction(MachineFunction &MF) override { TII = static_cast(MF.getTarget().getInstrInfo()); for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); @@ -315,7 +315,7 @@ public: return false; } - const char *getPassName() const { + const char *getPassName() const override { return "R600 Emit Clause Markers Pass"; } }; diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp index ca1189d..732b06d 100644 --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp @@ -38,11 +38,11 @@ private: public: R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), - TII(0) { } + TII(nullptr) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { + const char *getPassName() const override { return "R600 Expand special instructions pass"; } }; diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 6405a82..d6c6830 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -82,9 +82,31 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT, MVT::i32, Expand); setOperationAction(ISD::SELECT, MVT::f32, Expand); 
setOperationAction(ISD::SELECT, MVT::v2i32, Expand); - setOperationAction(ISD::SELECT, MVT::v2f32, Expand); setOperationAction(ISD::SELECT, MVT::v4i32, Expand); - setOperationAction(ISD::SELECT, MVT::v4f32, Expand); + + // Expand sign extension of vectors + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); + + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); + + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); + // Legalize loads and stores to the private address space. setOperationAction(ISD::LOAD, MVT::i32, Custom); @@ -117,6 +139,11 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + // These should be replaced by UDIVREM, but it does not happen automatically + // during Type Legalization + setOperationAction(ISD::UDIV, MVT::i64, Custom); + setOperationAction(ISD::UREM, MVT::i64, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setBooleanContents(ZeroOrNegativeOneBooleanContent); @@ -538,8 +565,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(2, MVT::i32), // SWZ_Z DAG.getConstant(3, MVT::i32) // SWZ_W }; - return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), - Args, 8); + return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args); } // default for switch(IntrinsicID) @@ -689,7 +715,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const Op.getOperand(9), Op.getOperand(10) }; - return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19); + return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); } case AMDGPUIntrinsic::AMDGPU_dp4: { SDValue Args[8] = { @@ -710,7 +736,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), DAG.getConstant(3, MVT::i32)) }; - return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8); + return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); } case Intrinsic::r600_read_ngroups_x: @@ -960,13 +986,6 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); } - - // Possible Min/Max pattern - SDValue MinMax = LowerMinMax(Op, DAG); - if (MinMax.getNode()) { - return MinMax; - } - // If we make it this far it means we have no native instructions to handle // this SELECT_CC, so we must lower it.
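For targets without BFE, the new SIGN_EXTEND_INREG actions fall back to Expand, whose standard lowering is a shift-left/arithmetic-shift-right pair (LowerLOAD builds the same Shl/Sra pair for private sextloads below). The identity, checked standalone (this sketch assumes two's-complement int32_t, which LLVM also assumes):

    #include <cstdint>
    #include <iostream>

    // sign_extend_inreg(x, i8) on a 32-bit value: shl by 24, then ashr by 24.
    int32_t sextInReg8(int32_t X) {
      return (int32_t)((uint32_t)X << 24) >> 24;
    }

    int main() {
      std::cout << sextInReg8(0x000000FF) << "\n"; // -1
      std::cout << sextInReg8(0x0000007F) << "\n"; // 127
    }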
SDValue HWTrue, HWFalse; @@ -1088,10 +1107,10 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(0, MVT::i32), Mask }; - SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4); + SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src); SDValue Args[3] = { Chain, Input, DWordAddr }; return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, - Op->getVTList(), Args, 3, MemVT, + Op->getVTList(), Args, MemVT, StoreNode->getMemOperand()); } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && Value.getValueType().bitsGE(MVT::i32)) { @@ -1131,7 +1150,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (ValueVT.isVector()) { unsigned NumElemVT = ValueVT.getVectorNumElements(); EVT ElemVT = ValueVT.getVectorElementType(); - SDValue Stores[4]; + SmallVector Stores(NumElemVT); assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " "vector width in load"); @@ -1148,7 +1167,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { Chain, Elem, Ptr, DAG.getTargetConstant(Channel, MVT::i32)); } - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT); + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); } else { if (ValueVT == MVT::i8) { Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); @@ -1212,10 +1231,11 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG); if (Ret.getNode()) { - SDValue Ops[2]; - Ops[0] = Ret; - Ops[1] = Chain; - return DAG.getMergeValues(Ops, 2, DL); + SDValue Ops[2] = { + Ret, + Chain + }; + return DAG.getMergeValues(Ops, DL); } @@ -1224,7 +1244,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const SplitVectorLoad(Op, DAG), Chain }; - return DAG.getMergeValues(MergedValues, 2, DL); + return DAG.getMergeValues(MergedValues, DL); } int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); @@ -1232,8 +1252,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { SDValue Result; - if (isa(LoadNode->getSrcValue()) || - isa(LoadNode->getSrcValue()) || + if (isa(LoadNode->getMemOperand()->getValue()) || + isa(LoadNode->getMemOperand()->getValue()) || isa(Ptr)) { SDValue Slots[4]; for (unsigned i = 0; i < 4; i++) { @@ -1252,7 +1272,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const NewVT = VT; NumElements = VT.getVectorNumElements(); } - Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements); + Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, + makeArrayRef(Slots, NumElements)); } else { // non-constant ptr can't be folded, keeps it as a v4f32 load Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, @@ -1268,10 +1289,10 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const } SDValue MergedValues[2] = { - Result, - Chain + Result, + Chain }; - return DAG.getMergeValues(MergedValues, 2, DL); + return DAG.getMergeValues(MergedValues, DL); } // For most operations returning SDValue() will result in the node being @@ -1295,7 +1316,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount); SDValue MergedValues[2] = { Sra, Chain }; - return DAG.getMergeValues(MergedValues, 2, DL); + return 
DAG.getMergeValues(MergedValues, DL); } if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { @@ -1332,7 +1353,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const Loads[i] = DAG.getUNDEF(ElemVT); } EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); - LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4); + LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); } else { LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, Chain, Ptr, @@ -1340,11 +1361,12 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const Op.getOperand(2)); } - SDValue Ops[2]; - Ops[0] = LoweredLoad; - Ops[1] = Chain; + SDValue Ops[2] = { + LoweredLoad, + Chain + }; - return DAG.getMergeValues(Ops, 2, DL); + return DAG.getMergeValues(Ops, DL); } /// XXX Only kernel functions are supported, so we can assume for now that @@ -1365,8 +1387,7 @@ SDValue R600TargetLowering::LowerFormalArguments( SmallVector LocalIns; - getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, - LocalIns); + getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns); AnalyzeFormalArguments(CCInfo, LocalIns); @@ -1392,32 +1413,38 @@ SDValue R600TargetLowering::LowerFormalArguments( // The first 36 bytes of the input buffer contains information about // thread group and global sizes. - SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, + + // FIXME: This should really check the extload type, but the handling of + // extload vector parameters seems to be broken. + //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + ISD::LoadExtType Ext = ISD::SEXTLOAD; + SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain, DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32), MachinePointerInfo(UndefValue::get(PtrTy)), MemVT, false, false, 4); + + // 4 is the preferred alignment for the CONSTANT memory space.
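LowerFormalArguments reads kernel arguments out of the constant buffer past a 36-byte header of thread-group and global-size data, so each argument loads from 36 + VA.getLocMemOffset(). A toy version of the offset arithmetic (the argument offsets here are invented for illustration; the real ones come from AnalyzeFormalArguments):

    #include <iostream>

    int main() {
      const unsigned HeaderBytes = 36;         // dispatch info in bytes 0..35
      const unsigned ArgOffsets[] = {0, 4, 8}; // hypothetical i32 arguments
      for (unsigned Off : ArgOffsets)
        std::cout << "extload at byte " << HeaderBytes + Off << "\n";
      // prints 36, 40, 44; each load uses the 4-byte CONSTANT alignment
    }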
InVals.push_back(Arg); } return Chain; } EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { - if (!VT.isVector()) return MVT::i32; + if (!VT.isVector()) + return MVT::i32; return VT.changeVectorElementTypeToInteger(); } -static SDValue -CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry, - DenseMap &RemapSwizzle) { +static SDValue CompactSwizzlableVector( + SelectionDAG &DAG, SDValue VectorEntry, + DenseMap &RemapSwizzle) { assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); assert(RemapSwizzle.empty()); SDValue NewBldVec[4] = { - VectorEntry.getOperand(0), - VectorEntry.getOperand(1), - VectorEntry.getOperand(2), - VectorEntry.getOperand(3) + VectorEntry.getOperand(0), + VectorEntry.getOperand(1), + VectorEntry.getOperand(2), + VectorEntry.getOperand(3) }; for (unsigned i = 0; i < 4; i++) { @@ -1448,7 +1475,7 @@ CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry, } return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec, 4); + VectorEntry.getValueType(), NewBldVec); } static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, @@ -1486,7 +1513,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, } return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec, 4); + VectorEntry.getValueType(), NewBldVec); } @@ -1524,6 +1551,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { + default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) case ISD::FP_ROUND: { SDValue Arg = N->getOperand(0); @@ -1613,8 +1641,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } // Return the new vector - return DAG.getNode(ISD::BUILD_VECTOR, dl, - VT, &Ops[0], Ops.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } // Extract_vec (Build_vector) generated by custom lowering @@ -1638,6 +1665,11 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } case ISD::SELECT_CC: { + // Try common optimizations + SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + if (Ret.getNode()) + return Ret; + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> // selectcc x, y, a, b, inv(cc) // @@ -1697,7 +1729,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, }; SDLoc DL(N); NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG); - return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8); + return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs); } case AMDGPUISD::TEXTURE_FETCH: { SDValue Arg = N->getOperand(1); @@ -1727,10 +1759,11 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, }; NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG); return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(), - NewArgs, 19); + NewArgs); } } - return SDValue(); + + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } static bool @@ -1779,8 +1812,7 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) }; std::vector Consts; - for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) { - int OtherSrcIdx = SrcIndices[i]; + for (int OtherSrcIdx : SrcIndices) { int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx); if (OtherSrcIdx < 0 || OtherSelIdx < 0) continue; @@ -1791,14 +1823,14 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue 
&Src, SDValue &Neg, if (RegisterSDNode *Reg = dyn_cast(ParentNode->getOperand(OtherSrcIdx))) { if (Reg->getReg() == AMDGPU::ALU_CONST) { - ConstantSDNode *Cst = dyn_cast( - ParentNode->getOperand(OtherSelIdx)); + ConstantSDNode *Cst + = cast(ParentNode->getOperand(OtherSelIdx)); Consts.push_back(Cst->getZExtValue()); } } } - ConstantSDNode *Cst = dyn_cast(CstOffset); + ConstantSDNode *Cst = cast(CstOffset); Consts.push_back(Cst->getZExtValue()); if (!TII->fitsConstReadLimitations(Consts)) { return false; diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index 22ef728..a8a464f 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -24,21 +24,21 @@ class R600InstrInfo; class R600TargetLowering : public AMDGPUTargetLowering { public: R600TargetLowering(TargetMachine &TM); - virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock * BB) const; - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; - virtual void ReplaceNodeResults(SDNode * N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const override; - virtual SDValue LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const; - virtual EVT getSetCCResultType(LLVMContext &, EVT VT) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock * BB) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + void ReplaceNodeResults(SDNode * N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const override; + SDValue LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const override; + EVT getSetCCResultType(LLVMContext &, EVT VT) const override; private: unsigned Gen; /// Each OpenCL kernel has nine implicit parameters that are stored in the @@ -66,7 +66,7 @@ private: void getStackAddress(unsigned StackWidth, unsigned ElemIdx, unsigned &Channel, unsigned &PtrIncr) const; bool isZero(SDValue Op) const; - virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; }; } // End namespace llvm; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 0281dd0..b0d9ae3 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -23,11 +23,11 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +using namespace llvm; + #define GET_INSTRINFO_CTOR_DTOR #include "AMDGPUGenDFAPacketizer.inc" -using namespace llvm; - R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm) : AMDGPUInstrInfo(tm), RI(tm), @@ -677,7 +677,7 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB, return MI; } - return NULL; + return nullptr; } static @@ -797,7 +797,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, DebugLoc DL) const { assert(TBB && "InsertBranch must not be told to insert a fallthrough"); - if (FBB == 0) { + if (!FBB) { if (Cond.empty()) { BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB); return 1; diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index 
d5ff4de..b5304a0 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -50,13 +50,13 @@ namespace llvm { explicit R600InstrInfo(AMDGPUTargetMachine &tm); - const R600RegisterInfo &getRegisterInfo() const; - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const; + const R600RegisterInfo &getRegisterInfo() const override; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const; + MachineBasicBlock::iterator MBBI) const override; bool isTrig(const MachineInstr &MI) const; bool isPlaceHolderOpcode(unsigned opcode) const; @@ -142,79 +142,79 @@ namespace llvm { /// instruction slots within an instruction group. bool isVector(const MachineInstr &MI) const; - virtual unsigned getIEQOpcode() const; - virtual bool isMov(unsigned Opcode) const; + unsigned getIEQOpcode() const override; + bool isMov(unsigned Opcode) const override; DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM, - const ScheduleDAG *DAG) const; + const ScheduleDAG *DAG) const override; - bool ReverseBranchCondition(SmallVectorImpl &Cond) const; + bool ReverseBranchCondition(SmallVectorImpl &Cond) const override; bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, bool AllowModify) const; + SmallVectorImpl &Cond, bool AllowModify) const override; - unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl &Cond, DebugLoc DL) const; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl &Cond, DebugLoc DL) const override; - unsigned RemoveBranch(MachineBasicBlock &MBB) const; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; - bool isPredicated(const MachineInstr *MI) const; + bool isPredicated(const MachineInstr *MI) const override; - bool isPredicable(MachineInstr *MI) const; + bool isPredicable(MachineInstr *MI) const override; bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) const; + const BranchProbability &Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const ; + const BranchProbability &Probability) const override ; bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB, unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const; + const BranchProbability &Probability) const override; bool DefinesPredicate(MachineInstr *MI, - std::vector &Pred) const; + std::vector &Pred) const override; bool SubsumesPredicate(const SmallVectorImpl &Pred1, - const SmallVectorImpl &Pred2) const; + const SmallVectorImpl &Pred2) const override; bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, - MachineBasicBlock &FMBB) const; + MachineBasicBlock &FMBB) const override; bool PredicateInstruction(MachineInstr *MI, - const SmallVectorImpl &Pred) const; + const SmallVectorImpl &Pred) const override; - unsigned int getPredicationCost(const MachineInstr *) const; + unsigned int getPredicationCost(const MachineInstr *) const override; unsigned 
int getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr *MI, - unsigned *PredCost = 0) const; + unsigned *PredCost = nullptr) const override; - virtual int getInstrLatency(const InstrItineraryData *ItinData, - SDNode *Node) const { return 1;} + int getInstrLatency(const InstrItineraryData *ItinData, + SDNode *Node) const override { return 1;} /// \brief Reserve the registers that may be accessed using indirect addressing. void reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const; - virtual unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const; + unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const override; - virtual const TargetRegisterClass *getIndirectAddrRegClass() const; + const TargetRegisterClass *getIndirectAddrRegClass() const override; - virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const; + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const override; - virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const; + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const override; unsigned getMaxAlusPerClause() const; @@ -244,7 +244,7 @@ namespace llvm { MachineInstr *buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const; + unsigned DstReg, unsigned SrcReg) const override; /// \brief Get the index of Op in the MachineInstr.
/// diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index d2075c0..590fde2 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1625,6 +1625,12 @@ def : DwordAddrPat ; } // End isR600toCayman Predicate +let Predicates = [isR600] in { +// Intrinsic patterns +defm : Expand24IBitOps; +defm : Expand24UBitOps; +} // End isR600 + def getLDSNoRetOp : InstrMapping { let FilterClass = "R600_LDS_1A1D"; let RowFields = ["BaseOp"]; diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h index c1bec0a..b0ae22e 100644 --- a/lib/Target/R600/R600MachineFunctionInfo.h +++ b/lib/Target/R600/R600MachineFunctionInfo.h @@ -21,7 +21,7 @@ namespace llvm { class R600MachineFunctionInfo : public AMDGPUMachineFunction { - virtual void anchor(); + void anchor() override; public: R600MachineFunctionInfo(const MachineFunction &MF); SmallVector LiveOuts; diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index d3ffb50..d1655d1 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -12,8 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "misched" - #include "R600MachineScheduler.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -23,6 +21,8 @@ using namespace llvm; +#define DEBUG_TYPE "misched" + void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); DAG = static_cast(dag); @@ -56,7 +56,7 @@ unsigned getWFCountLimitedByGPR(unsigned GPRCount) { } SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { - SUnit *SU = 0; + SUnit *SU = nullptr; NextInstKind = IDOther; IsTopNode = false; @@ -316,7 +316,7 @@ int R600SchedStrategy::getInstKind(SUnit* SU) { SUnit *R600SchedStrategy::PopInst(std::vector &Q, bool AnyALU) { if (Q.empty()) - return NULL; + return nullptr; for (std::vector::reverse_iterator It = Q.rbegin(), E = Q.rend(); It != E; ++It) { SUnit *SU = *It; @@ -331,7 +331,7 @@ SUnit *R600SchedStrategy::PopInst(std::vector &Q, bool AnyALU) { InstructionsGroupCandidate.pop_back(); } } - return NULL; + return nullptr; } void R600SchedStrategy::LoadAlu() { @@ -448,11 +448,11 @@ SUnit* R600SchedStrategy::pickAlu() { } PrepareNextSlot(); } - return NULL; + return nullptr; } SUnit* R600SchedStrategy::pickOther(int QID) { - SUnit *SU = 0; + SUnit *SU = nullptr; std::vector &AQ = Available[QID]; if (AQ.empty()) { diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h index b909ff7..fd475af 100644 --- a/lib/Target/R600/R600MachineScheduler.h +++ b/lib/Target/R600/R600MachineScheduler.h @@ -68,17 +68,16 @@ class R600SchedStrategy : public MachineSchedStrategy { public: R600SchedStrategy() : - DAG(0), TII(0), TRI(0), MRI(0) { + DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) { } - virtual ~R600SchedStrategy() { - } + virtual ~R600SchedStrategy() {} - virtual void initialize(ScheduleDAGMI *dag); - virtual SUnit *pickNode(bool &IsTopNode); - virtual void schedNode(SUnit *SU, bool IsTopNode); - virtual void releaseTopNode(SUnit *SU); - virtual void releaseBottomNode(SUnit *SU); + void initialize(ScheduleDAGMI *dag) override; + SUnit *pickNode(bool &IsTopNode) override; + void schedNode(SUnit *SU, bool IsTopNode) override; + void releaseTopNode(SUnit *SU) override; + void 
releaseBottomNode(SUnit *SU) override; private: std::vector InstructionsGroupCandidate; diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp index 767e5e3..2314136 100644 --- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp @@ -27,7 +27,6 @@ /// to reduce MOV count. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "vec-merger" #include "llvm/Support/Debug.h" #include "AMDGPU.h" #include "R600InstrInfo.h" @@ -42,6 +41,8 @@ using namespace llvm; +#define DEBUG_TYPE "vec-merger" + namespace { static bool @@ -107,9 +108,9 @@ private: public: static char ID; R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID), - TII(0) { } + TII(nullptr) { } - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired(); AU.addPreserved(); @@ -118,11 +119,11 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } - const char *getPassName() const { + const char *getPassName() const override { return "R600 Vector Registers Merge Pass"; } - bool runOnMachineFunction(MachineFunction &Fn); + bool runOnMachineFunction(MachineFunction &Fn) override; }; char R600VectorRegMerger::ID = 0; diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp index b7b7610..c2f6c03 100644 --- a/lib/Target/R600/R600Packetizer.cpp +++ b/lib/Target/R600/R600Packetizer.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "packets" #include "llvm/Support/Debug.h" #include "AMDGPU.h" #include "R600InstrInfo.h" @@ -28,6 +27,8 @@ using namespace llvm; +#define DEBUG_TYPE "packets" + namespace { class R600Packetizer : public MachineFunctionPass { @@ -36,7 +37,7 @@ public: static char ID; R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired(); AU.addPreserved(); @@ -45,11 +46,11 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } - const char *getPassName() const { + const char *getPassName() const override { return "R600 Packetizer"; } - bool runOnMachineFunction(MachineFunction &Fn); + bool runOnMachineFunction(MachineFunction &Fn) override; }; char R600Packetizer::ID = 0; @@ -155,18 +156,19 @@ public: } // initPacketizerState - initialize some internal flags. - void initPacketizerState() { + void initPacketizerState() override { ConsideredInstUsesAlreadyWrittenVectorElement = false; } // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB) { + bool ignorePseudoInstruction(MachineInstr *MI, + MachineBasicBlock *MBB) override { return false; } // isSoloInstruction - return true if instruction MI can not be packetized // with any other instruction, which means that MI itself is a packet. - bool isSoloInstruction(MachineInstr *MI) { + bool isSoloInstruction(MachineInstr *MI) override { if (TII->isVector(*MI)) return true; if (!TII->isALUInstr(MI->getOpcode())) @@ -182,7 +184,7 @@ public: // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ // together. 
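The packetizer hooks that follow decide VLIW bundling, and the core legality test is slot-based: two ALU instructions cannot land in the same issue slot of a packet. A schematic stand-alone model (the Slot enum is invented for illustration; the real hook also checks register and const-read limits):

    #include <cstddef>
    #include <vector>

    enum Slot { X, Y, Z, W, Trans };

    // Two instructions may share a packet only if their issue slots differ.
    bool legalToPacketizeTogether(Slot A, Slot B) { return A != B; }

    bool packetOK(const std::vector<Slot> &Packet) {
      for (std::size_t I = 0; I < Packet.size(); ++I)
        for (std::size_t J = I + 1; J < Packet.size(); ++J)
          if (!legalToPacketizeTogether(Packet[I], Packet[J]))
            return false;
      return true;
    }

    int main() { return packetOK({X, Y, Z, W}) ? 0 : 1; }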
- bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { + bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override { MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); if (getSlot(MII) == getSlot(MIJ)) ConsideredInstUsesAlreadyWrittenVectorElement = true; @@ -219,7 +221,9 @@ public: // isLegalToPruneDependencies - Is it legal to prune dependece between SUI // and SUJ. - bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {return false;} + bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { + return false; + } void setIsLastBit(MachineInstr *MI, unsigned Bit) const { unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); @@ -288,7 +292,7 @@ public: return true; } - MachineBasicBlock::iterator addToPacket(MachineInstr *MI) { + MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override { MachineBasicBlock::iterator FirstInBundle = CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front(); const DenseMap &PV = diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h index c74c49e..52e1a4b 100644 --- a/lib/Target/R600/R600RegisterInfo.h +++ b/lib/Target/R600/R600RegisterInfo.h @@ -28,27 +28,28 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { R600RegisterInfo(AMDGPUTargetMachine &tm); - virtual BitVector getReservedRegs(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; /// \param RC is an AMDIL reg class. /// /// \returns the R600 reg class that is equivalent to \p RC. - virtual const TargetRegisterClass *getISARegClass( - const TargetRegisterClass *RC) const; + const TargetRegisterClass *getISARegClass( + const TargetRegisterClass *RC) const override; /// \brief get the HW encoding for a register's channel. unsigned getHWRegChan(unsigned reg) const; - virtual unsigned getHWRegIndex(unsigned Reg) const; + unsigned getHWRegIndex(unsigned Reg) const override; /// \brief get the register class of the specified type to use in the /// CFGStructurizer - virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const; + const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; - virtual const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const; + const RegClassWeight & + getRegClassWeight(const TargetRegisterClass *RC) const override; // \returns true if \p Reg can be defined in one ALU caluse and used in another. 
- virtual bool isPhysRegLiveAcrossClauses(unsigned Reg) const; + bool isPhysRegLiveAcrossClauses(unsigned Reg) const; }; } // End namespace llvm diff --git a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp index 9d24404..419ec8b 100644 --- a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp +++ b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp @@ -209,7 +209,7 @@ public: FunctionPass(ID) { } - virtual bool doInitialization(Module &M) { + bool doInitialization(Module &M) override { LLVMContext &Ctx = M.getContext(); Mod = &M; FloatType = Type::getFloatTy(Ctx); @@ -245,16 +245,16 @@ public: return false; } - virtual bool runOnFunction(Function &F) { + bool runOnFunction(Function &F) override { visit(F); return false; } - virtual const char *getPassName() const { + const char *getPassName() const override { return "R600 Texture Intrinsics Replacer"; } - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { } void visitCallInst(CallInst &I) { diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp index f9214a8..d6e4451 100644 --- a/lib/Target/R600/SIAnnotateControlFlow.cpp +++ b/lib/Target/R600/SIAnnotateControlFlow.cpp @@ -12,8 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "si-annotate-control-flow" - #include "AMDGPU.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/IR/Constants.h" @@ -26,6 +24,8 @@ using namespace llvm; +#define DEBUG_TYPE "si-annotate-control-flow" + namespace { // Complex types used in this pass @@ -91,15 +91,15 @@ public: SIAnnotateControlFlow(): FunctionPass(ID) { } - virtual bool doInitialization(Module &M); + bool doInitialization(Module &M) override; - virtual bool runOnFunction(Function &F); + bool runOnFunction(Function &F) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "SI annotate control flow"; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addPreserved(); FunctionPass::getAnalysisUsage(AU); @@ -118,7 +118,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { Void = Type::getVoidTy(Context); Boolean = Type::getInt1Ty(Context); Int64 = Type::getInt64Ty(Context); - ReturnStruct = StructType::get(Boolean, Int64, (Type *)0); + ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr); BoolTrue = ConstantInt::getTrue(Context); BoolFalse = ConstantInt::getFalse(Context); @@ -126,25 +126,25 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { Int64Zero = ConstantInt::get(Int64, 0); If = M.getOrInsertFunction( - IfIntrinsic, ReturnStruct, Boolean, (Type *)0); + IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr); Else = M.getOrInsertFunction( - ElseIntrinsic, ReturnStruct, Int64, (Type *)0); + ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr); Break = M.getOrInsertFunction( - BreakIntrinsic, Int64, Int64, (Type *)0); + BreakIntrinsic, Int64, Int64, (Type *)nullptr); IfBreak = M.getOrInsertFunction( - IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)0); + IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr); ElseBreak = M.getOrInsertFunction( - ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)0); + ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr); Loop = M.getOrInsertFunction( - LoopIntrinsic, Boolean, Int64, (Type *)0); + LoopIntrinsic, Boolean, Int64, 
(Type *)nullptr); EndCf = M.getOrInsertFunction( - EndCfIntrinsic, Void, Int64, (Type *)0); + EndCfIntrinsic, Void, Int64, (Type *)nullptr); return false; } diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp index 402f1f4..5f71453 100644 --- a/lib/Target/R600/SIFixSGPRCopies.cpp +++ b/lib/Target/R600/SIFixSGPRCopies.cpp @@ -65,7 +65,6 @@ /// ultimately led to the creation of an illegal COPY. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sgpr-copies" #include "AMDGPU.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -77,6 +76,8 @@ using namespace llvm; +#define DEBUG_TYPE "sgpr-copies" + namespace { class SIFixSGPRCopies : public MachineFunctionPass { @@ -97,9 +98,9 @@ private: public: SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { + const char *getPassName() const override { return "SI Fix SGPR copies"; } @@ -184,7 +185,8 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, const TargetRegisterClass *SrcRC; if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - DstRC == &AMDGPU::M0RegRegClass) + DstRC == &AMDGPU::M0RegRegClass || + MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) return false; SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg); @@ -256,6 +258,19 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { TII->moveToVALU(MI); break; } + case AMDGPU::INSERT_SUBREG: { + const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; + DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); + Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); + Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); + if (TRI->isSGPRClass(DstRC) && + (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { + DEBUG(dbgs() << " Fixing INSERT_SUBREG:\n"); + DEBUG(MI.print(dbgs())); + TII->moveToVALU(MI); + } + break; + } } } } diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 0b55411..c9e247c 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -29,22 +29,21 @@ using namespace llvm; SITargetLowering::SITargetLowering(TargetMachine &TM) : AMDGPUTargetLowering(TM) { - addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass); + addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); + addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); - addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass); + addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); - addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass); - addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass); + addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); - addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v8i32, 
&AMDGPU::VReg_256RegClass); addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); @@ -78,8 +77,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::ADDC, MVT::i32, Legal); setOperationAction(ISD::ADDE, MVT::i32, Legal); - setOperationAction(ISD::BITCAST, MVT::i128, Legal); - // We need to custom lower vector stores from local memory setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); @@ -99,10 +96,11 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::STORE, MVT::i1, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::i64, Custom); - setOperationAction(ISD::STORE, MVT::i128, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setOperationAction(ISD::SELECT, MVT::f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); @@ -119,6 +117,22 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); @@ -126,39 +140,48 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand); setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + setTruncStoreAction(MVT::i32, MVT::i8, Custom); setTruncStoreAction(MVT::i32, MVT::i16, Custom); setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::i64, MVT::i32, Expand); - 
setTruncStoreAction(MVT::i128, MVT::i64, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + // These should use UDIVREM, so set them to expand + setOperationAction(ISD::UDIV, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. MVT VecTypes[] = { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32 }; - const size_t NumVecTypes = array_lengthof(VecTypes); - for (unsigned Type = 0; Type < NumVecTypes; ++Type) { + for (MVT VT : VecTypes) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch(Op) { case ISD::LOAD: @@ -172,7 +195,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : case ISD::EXTRACT_SUBVECTOR: break; default: - setOperationAction(Op, VecTypes[Type], Expand); + setOperationAction(Op, VT, Expand); break; } } @@ -189,6 +212,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction(ISD::FRINT, MVT::f64, Legal); } setTargetDAGCombine(ISD::SELECT_CC); @@ -204,10 +228,40 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace, bool *IsFast) const { + if (IsFast) + *IsFast = false; + // XXX: This depends on the address space and also we may want to revist // the alignment values we specify in the DataLayout. + + // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, + // which isn't a simple VT. if (!VT.isSimple() || VT == MVT::Other) return false; + + // XXX - CI changes say "Support for unaligned memory accesses" but I don't + // see what for specifically. The wording everywhere else seems to be the + // same. + + // 3.6.4 - Operations using pairs of VGPRs (for example: double-floats) have + // no alignment restrictions. + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { + // Using any pair of GPRs should be the same as any other pair. + if (IsFast) + *IsFast = true; + return VT.bitsGE(MVT::i64); + } + + // XXX - The only mention I see of this in the ISA manual is for LDS direct + // reads the "byte address and must be dword aligned". Is it also true for the + // normal loads and stores? + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) + return false; + + // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the + // byte-address are ignored, thus forcing Dword alignment. 
+ if (IsFast) + *IsFast = true; return VT.bitsGT(MVT::i32); } @@ -224,7 +278,7 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, SDValue Chain, - unsigned Offset) const { + unsigned Offset, bool Signed) const { MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), AMDGPUAS::CONSTANT_ADDRESS); @@ -232,7 +286,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64); SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(Offset, MVT::i64)); - return DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, Ptr, + return DAG.getExtLoad(Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, VT, Chain, Ptr, MachinePointerInfo(UndefValue::get(PtrTy)), MemVT, false, false, MemVT.getSizeInBits() >> 3); @@ -340,7 +394,8 @@ SDValue SITargetLowering::LowerFormalArguments( // The first 36 bytes of the input buffer contains information about // thread group and global sizes. SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), - 36 + VA.getLocMemOffset()); + 36 + VA.getLocMemOffset(), + Ins[i].Flags.isSExt()); InVals.push_back(Arg); continue; } @@ -381,8 +436,7 @@ SDValue SITargetLowering::LowerFormalArguments( for (unsigned j = 0; j != NumElements; ++j) Regs.push_back(DAG.getUNDEF(VT)); - InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, - Regs.data(), Regs.size())); + InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); continue; } @@ -395,15 +449,15 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; + const SIInstrInfo *TII = + static_cast(getTargetMachine().getInstrInfo()); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; case AMDGPU::SI_ADDR64_RSRC: { - const SIInstrInfo *TII = - static_cast(getTargetMachine().getInstrInfo()); - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); unsigned SuperReg = MI->getOperand(0).getReg(); unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); @@ -428,9 +482,7 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MI->eraseFromParent(); break; } - case AMDGPU::V_SUB_F64: { - const SIInstrInfo *TII = - static_cast(getTargetMachine().getInstrInfo()); + case AMDGPU::V_SUB_F64: BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) @@ -442,11 +494,9 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( .addImm(2); /* NEG */ MI->eraseFromParent(); break; - } + case AMDGPU::SI_RegisterStorePseudo: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const SIInstrInfo *TII = - static_cast(getTargetMachine().getInstrInfo()); unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); MachineInstrBuilder MIB = BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), @@ -455,6 +505,50 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MIB.addOperand(MI->getOperand(i)); MI->eraseFromParent(); + break; + } + case AMDGPU::FABS_SI: { + MachineRegisterInfo &MRI = 
BB->getParent()->getRegInfo(); + const SIInstrInfo *TII = + static_cast(getTargetMachine().getInstrInfo()); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), + Reg) + .addImm(0x7fffffff); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_AND_B32_e32), + MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addReg(Reg); + MI->eraseFromParent(); + break; + } + case AMDGPU::FNEG_SI: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const SIInstrInfo *TII = + static_cast(getTargetMachine().getInstrInfo()); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), + Reg) + .addImm(0x80000000); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_XOR_B32_e32), + MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addReg(Reg); + MI->eraseFromParent(); + break; + } + case AMDGPU::FCLAMP_SI: { + const SIInstrInfo *TII = + static_cast(getTargetMachine().getInstrInfo()); + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F32_e64), + MI->getOperand(0).getReg()) + .addImm(0) // SRC0 modifiers + .addOperand(MI->getOperand(1)) + .addImm(0) // SRC1 modifiers + .addImm(0) // SRC1 + .addImm(1) // CLAMP + .addImm(0); // OMOD + MI->eraseFromParent(); } } return BB; @@ -510,7 +604,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { SplitVectorLoad(Op, DAG), Load->getChain() }; - return DAG.getMergeValues(MergedValues, 2, SDLoc(Op)); + return DAG.getMergeValues(MergedValues, SDLoc(Op)); } else { return LowerLOAD(Op, DAG); } @@ -533,23 +627,23 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (IntrinsicID) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case Intrinsic::r600_read_ngroups_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false); case Intrinsic::r600_read_ngroups_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false); case Intrinsic::r600_read_ngroups_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false); case Intrinsic::r600_read_global_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false); case Intrinsic::r600_read_global_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false); case Intrinsic::r600_read_global_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false); case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24, false); case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false); case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, 
&AMDGPU::SReg_32RegClass, AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT); @@ -570,7 +664,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { AMDGPU::VGPR2, VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops [] = { - ResourceDescriptorToi128(Op.getOperand(1), DAG), + Op.getOperand(1), Op.getOperand(2) }; @@ -579,7 +673,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, VT.getSizeInBits() / 8, 4); return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, - Op->getVTList(), Ops, 2, VT, MMO); + Op->getVTList(), Ops, VT, MMO); } case AMDGPUIntrinsic::SI_sample: return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); @@ -591,7 +685,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, - ResourceDescriptorToi128(Op.getOperand(1), DAG), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } @@ -606,7 +700,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue Ops [] = { Chain, - ResourceDescriptorToi128(Op.getOperand(2), DAG), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), @@ -627,8 +721,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { MachineMemOperand::MOStore, VT.getSizeInBits() / 8, 4); return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, - sizeof(Ops)/sizeof(Ops[0]), VT, MMO); + Op->getVTList(), Ops, VT, MMO); } default: break; @@ -650,7 +743,7 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) { if (I->getOpcode() == Opcode) return *I; } - return 0; + return nullptr; } /// This transforms the control flow intrinsics to get the branch destination as @@ -662,7 +755,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SDNode *Intr = BRCOND.getOperand(1).getNode(); SDValue Target = BRCOND.getOperand(2); - SDNode *BR = 0; + SDNode *BR = nullptr; if (Intr->getOpcode() == ISD::SETCC) { // As long as we negate the condition everything is fine @@ -695,7 +788,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, // build the new intrinsic call SDNode *Result = DAG.getNode( Res.size() > 1 ? 
ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, - DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode(); + DAG.getVTList(Res), Ops).getNode(); if (BR) { // Give the branch instruction our target @@ -703,7 +796,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, BR->getOperand(0), BRCOND.getOperand(2) }; - DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2); + DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops); } SDValue Chain = SDValue(Result, Result->getNumValues() - 1); @@ -739,7 +832,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { MergedValues[1] = Load->getChain(); if (Ret.getNode()) { MergedValues[0] = Ret; - return DAG.getMergeValues(MergedValues, 2, DL); + return DAG.getMergeValues(MergedValues, DL); } if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { @@ -770,30 +863,16 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { } MergedValues[0] = Ret; - return DAG.getMergeValues(MergedValues, 2, DL); + return DAG.getMergeValues(MergedValues, DL); } -SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op, - SelectionDAG &DAG) const { - - if (Op.getValueType() == MVT::i128) { - return Op; - } - - assert(Op.getOpcode() == ISD::UNDEF); - - return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128, - DAG.getConstant(0, MVT::i64), - DAG.getConstant(0, MVT::i64)); -} - SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, SelectionDAG &DAG) const { return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2), - ResourceDescriptorToi128(Op.getOperand(3), DAG), + Op.getOperand(3), Op.getOperand(4)); } @@ -833,12 +912,6 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc DL(Op); - // Possible Min/Max pattern - SDValue MinMax = LowerMinMax(Op, DAG); - if (MinMax.getNode()) { - return MinMax; - } - SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC); return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False); } @@ -948,8 +1021,12 @@ SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op, return SDValue(); } - return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), - DAG.getConstant(0, MVT::i32)); + SDValue Src = Op.getOperand(0); + if (Src.getValueType() != MVT::i32) + Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src); + + SDValue Zero = DAG.getConstant(0, MVT::i32); + return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Src, Zero); } //===----------------------------------------------------------------------===// @@ -963,7 +1040,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, EVT VT = N->getValueType(0); switch (N->getOpcode()) { - default: break; + default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); case ISD::SELECT_CC: { ConstantSDNode *True, *False; // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc) @@ -982,7 +1059,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, SDValue Arg0 = N->getOperand(0); SDValue Arg1 = N->getOperand(1); SDValue CC = N->getOperand(2); - ConstantSDNode * C = NULL; + ConstantSDNode * C = nullptr; ISD::CondCode CCOp = dyn_cast(CC)->get(); // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) @@ -998,7 +1075,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, break; } } - return SDValue(); + + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } /// \brief Test if RegClass is one of the VSrc classes @@ -1029,9 +1107,11 @@ int32_t SITargetLowering::analyzeImmediate(const SDNode *N) 
const { return -1; } Imm.I = Node->getSExtValue(); - } else if (const ConstantFPSDNode *Node = dyn_cast(N)) + } else if (const ConstantFPSDNode *Node = dyn_cast(N)) { + if (N->getValueType(0) != MVT::f32) + return -1; Imm.F = Node->getValueAPF().convertToFloat(); - else + } else return -1; // It isn't an immediate if ((Imm.I >= -16 && Imm.I <= 64) || @@ -1051,7 +1131,7 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, MachineSDNode *Mov = dyn_cast(Operand); const SIInstrInfo *TII = static_cast(getTargetMachine().getInstrInfo()); - if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode())) + if (!Mov || !TII->isMov(Mov->getMachineOpcode())) return false; const SDValue &Op = Mov->getOperand(0); @@ -1098,7 +1178,7 @@ const TargetRegisterClass *SITargetLowering::getRegClassForNode( } return TRI.getPhysRegClass(Reg); } - default: return NULL; + default: return nullptr; } } const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode()); @@ -1202,17 +1282,17 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, // Commuted opcode if available int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1; - const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev); + const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev); assert(!DescRev || DescRev->getNumDefs() == NumDefs); assert(!DescRev || DescRev->getNumOperands() == NumOps); // e64 version if available, -1 otherwise int OpcodeE64 = AMDGPU::getVOPe64(Opcode); - const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64); + const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64); + int InputModifiers[3] = {0}; assert(!DescE64 || DescE64->getNumDefs() == NumDefs); - assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4)); int32_t Immediate = Desc->getSize() == 4 ? 
0 : -1; bool HaveVSrc = false, HaveSSrc = false; @@ -1279,17 +1359,18 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, fitsRegClass(DAG, Ops[1], OtherRegClass))) { // Swap commutable operands - SDValue Tmp = Ops[1]; - Ops[1] = Ops[0]; - Ops[0] = Tmp; + std::swap(Ops[0], Ops[1]); Desc = DescRev; - DescRev = 0; + DescRev = nullptr; continue; } } - if (DescE64 && !Immediate) { + if (Immediate) + continue; + + if (DescE64) { // Test if it makes sense to switch to e64 encoding unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass; @@ -1305,14 +1386,46 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, Immediate = -1; Promote2e64 = true; Desc = DescE64; - DescE64 = 0; + DescE64 = nullptr; } } + + if (!DescE64 && !Promote2e64) + continue; + if (!Operand.isMachineOpcode()) + continue; + if (Operand.getMachineOpcode() == AMDGPU::FNEG_SI) { + Ops.pop_back(); + Ops.push_back(Operand.getOperand(0)); + InputModifiers[i] = 1; + Promote2e64 = true; + if (!DescE64) + continue; + Desc = DescE64; + DescE64 = 0; + } + else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) { + Ops.pop_back(); + Ops.push_back(Operand.getOperand(0)); + InputModifiers[i] = 2; + Promote2e64 = true; + if (!DescE64) + continue; + Desc = DescE64; + DescE64 = 0; + } } if (Promote2e64) { + std::vector OldOps(Ops); + Ops.clear(); + for (unsigned i = 0; i < OldOps.size(); ++i) { + // src_modifier + Ops.push_back(DAG.getTargetConstant(InputModifiers[i], MVT::i32)); + Ops.push_back(OldOps[i]); + } // Add the modifier flags while promoting - for (unsigned i = 0; i < 4; ++i) + for (unsigned i = 0; i < 2; ++i) Ops.push_back(DAG.getTargetConstant(0, MVT::i32)); } @@ -1390,7 +1503,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32)); for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) Ops.push_back(Node->getOperand(i)); - Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); + Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); // If we only got one lane, replace it with a copy // (if NewDmask has only one bit set...) 
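Note on the FABS_SI/FNEG_SI expansion added to EmitInstrWithCustomInserter above: floating-point abs and neg are reduced to integer bit operations, V_AND_B32 with 0x7fffffff to clear the IEEE-754 sign bit and V_XOR_B32 with 0x80000000 to flip it, which in turn lets the foldOperands changes fold these instructions into VOP3 source modifiers. A minimal host-side sketch of the same bit trick, assuming IEEE-754 binary32 floats; the helper names are illustrative and not part of this patch:

  #include <cassert>
  #include <cmath>
  #include <cstdint>
  #include <cstring>

  // Reinterpret a float's bits as a 32-bit integer and back (IEEE-754 binary32).
  static uint32_t bitsOf(float F) {
    uint32_t I;
    std::memcpy(&I, &F, sizeof(I));
    return I;
  }

  static float floatOf(uint32_t I) {
    float F;
    std::memcpy(&F, &I, sizeof(F));
    return F;
  }

  // fabs as a bit operation: clear the sign bit, as the V_AND_B32_e32 with
  // 0x7fffffff emitted for FABS_SI does.
  static float fabsViaBits(float F) { return floatOf(bitsOf(F) & 0x7fffffffu); }

  // fneg as a bit operation: flip the sign bit, as the V_XOR_B32_e32 with
  // 0x80000000 emitted for FNEG_SI does.
  static float fnegViaBits(float F) { return floatOf(bitsOf(F) ^ 0x80000000u); }

  int main() {
    assert(fabsViaBits(-2.5f) == 2.5f);
    assert(fnegViaBits(2.5f) == -2.5f);
    assert(!std::signbit(fnegViaBits(-0.0f))); // -0.0 becomes +0.0
    return 0;
  }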
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index ca73f53..c6eaa81 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -22,7 +22,7 @@ namespace llvm { class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, - SDValue Chain, unsigned Offset) const; + SDValue Chain, unsigned Offset, bool Signed) const; SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; @@ -33,7 +33,6 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue ResourceDescriptorToi128(SDValue Op, SelectionDAG &DAG) const; bool foldImm(SDValue &Operand, int32_t &Immediate, bool &ScalarSlotUsed) const; const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG, @@ -49,32 +48,33 @@ class SITargetLowering : public AMDGPUTargetLowering { public: SITargetLowering(TargetMachine &tm); - bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, bool *IsFast) const; - virtual bool shouldSplitVectorType(EVT VT) const override; + bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, + bool *IsFast) const override; + bool shouldSplitVectorType(EVT VT) const override; - virtual bool shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const override; + bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const; + SmallVectorImpl &InVals) const override; - virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, - MachineBasicBlock * BB) const; - virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const; - virtual MVT getScalarShiftAmountTy(EVT VT) const; - virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const; - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; - virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const; - virtual void AdjustInstrPostInstrSelection(MachineInstr *MI, - SDNode *Node) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, + MachineBasicBlock * BB) const override; + EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + MVT getScalarShiftAmountTy(EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; + void AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const override; int32_t analyzeImmediate(const SDNode *N) const; SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const; + unsigned Reg, EVT VT) const override; }; } // End namespace llvm diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 695ec40..a17fed7 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -97,13 +97,13 @@ private: public: SIInsertWaits(TargetMachine &tm) : MachineFunctionPass(ID), - TII(0), - TRI(0), + 
TII(nullptr), + TRI(nullptr), ExpInstrTypesSeen(0) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { + const char *getPassName() const override { return "SI insert wait instructions"; } diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index aa2c22c..168eff2 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// class InstSI pattern> : - AMDGPUInst { + AMDGPUInst, PredicateControl { field bits<1> VM_CNT = 0; field bits<1> EXP_CNT = 0; @@ -210,16 +210,19 @@ class VOP3 op, dag outs, dag ins, string asm, list pattern> : Enc64 { bits<8> dst; + bits<2> src0_modifiers; bits<9> src0; + bits<2> src1_modifiers; bits<9> src1; + bits<2> src2_modifiers; bits<9> src2; - bits<3> abs; bits<1> clamp; bits<2> omod; - bits<3> neg; let Inst{7-0} = dst; - let Inst{10-8} = abs; + let Inst{8} = src0_modifiers{1}; + let Inst{9} = src1_modifiers{1}; + let Inst{10} = src2_modifiers{1}; let Inst{11} = clamp; let Inst{25-17} = op; let Inst{31-26} = 0x34; //encoding @@ -227,7 +230,9 @@ class VOP3 op, dag outs, dag ins, string asm, list pattern> : let Inst{49-41} = src1; let Inst{58-50} = src2; let Inst{60-59} = omod; - let Inst{63-61} = neg; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; let mayLoad = 0; let mayStore = 0; @@ -240,12 +245,14 @@ class VOP3b op, dag outs, dag ins, string asm, list pattern> : Enc64 { bits<8> dst; + bits<2> src0_modifiers; bits<9> src0; + bits<2> src1_modifiers; bits<9> src1; + bits<2> src2_modifiers; bits<9> src2; bits<7> sdst; bits<2> omod; - bits<3> neg; let Inst{7-0} = dst; let Inst{14-8} = sdst; @@ -255,7 +262,9 @@ class VOP3b op, dag outs, dag ins, string asm, list pattern> : let Inst{49-41} = src1; let Inst{58-50} = src2; let Inst{60-59} = omod; - let Inst{63-61} = neg; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; let mayLoad = 0; let mayStore = 0; diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index ab2fe09..4a9e346 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -187,27 +187,45 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned KillFlag = isKill ? RegState::Kill : 0; + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) { - unsigned Lane = MFI->SpillTracker.getNextLane(MRI); - BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), - MFI->SpillTracker.LaneVGPR) + unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent()); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), MFI->SpillTracker.LaneVGPR) .addReg(SrcReg, KillFlag) .addImm(Lane); + MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane); + } else if (RI.isSGPRClass(RC)) { + // We are only allowed to create one new instruction when spilling + // registers, so we need to use pseudo instruction for vector + // registers. + // + // Reserve a spot in the spill tracker for each sub-register of + // the vector register. 
+ unsigned NumSubRegs = RC->getSize() / 4; + unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent(), + NumSubRegs); MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, - Lane); - } else { - for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) { - unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), get(AMDGPU::COPY), SubReg) - .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); - storeRegToStackSlot(MBB, MI, SubReg, isKill, FrameIndex + i, - &AMDGPU::SReg_32RegClass, TRI); + FirstLane); + + unsigned Opcode; + switch (RC->getSize() * 8) { + case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; + default: llvm_unreachable("Cannot spill register class"); } + + BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR) + .addReg(SrcReg) + .addImm(FrameIndex); + } else { + llvm_unreachable("VGPR spilling not supported"); } } @@ -216,30 +234,125 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo(); DebugLoc DL = MBB.findDebugLoc(MI); if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) { - SIMachineFunctionInfo::SpilledReg Spill = + SIMachineFunctionInfo::SpilledReg Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); assert(Spill.VGPR); BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg) .addReg(Spill.VGPR) .addImm(Spill.Lane); + insertNOPs(MI, 3); + } else if (RI.isSGPRClass(RC)){ + unsigned Opcode; + switch(RC->getSize() * 8) { + case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; + default: llvm_unreachable("Cannot spill register class"); + } + + SIMachineFunctionInfo::SpilledReg Spill = + MFI->SpillTracker.getSpilledReg(FrameIndex); + + BuildMI(MBB, MI, DL, get(Opcode), DestReg) + .addReg(Spill.VGPR) + .addImm(FrameIndex); + insertNOPs(MI, 3); } else { - for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) { - unsigned Flags = RegState::Define; - if (i == 0) { - Flags |= RegState::Undef; - } - unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - loadRegFromStackSlot(MBB, MI, SubReg, FrameIndex + i, - &AMDGPU::SReg_32RegClass, TRI); - BuildMI(MBB, MI, DL, get(AMDGPU::COPY)) - .addReg(DestReg, Flags, RI.getSubRegFromChannel(i)) - .addReg(SubReg); + llvm_unreachable("VGPR spilling not supported"); + } +} + +static unsigned getNumSubRegsForSpillOp(unsigned Op) { + + switch (Op) { + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S512_RESTORE: + return 16; + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S256_RESTORE: + return 8; + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S128_RESTORE: + return 4; + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S64_RESTORE: + return 2; + default: llvm_unreachable("Invalid spill opcode"); + } +} + +void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, + int Count) const { + while (Count > 0) { + int Arg; + if (Count >= 8) + Arg = 7; + else + Arg = Count - 1; + Count -= 8; + BuildMI(*MI->getParent(), MI, 
MI->getDebugLoc(), get(AMDGPU::S_NOP)) + .addImm(Arg); + } +} + +bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + SIMachineFunctionInfo *MFI = + MI->getParent()->getParent()->getInfo(); + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MBB.findDebugLoc(MI); + switch (MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + + // SGPR register spill + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S64_SAVE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned FrameIndex = MI->getOperand(2).getImm(); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + SIMachineFunctionInfo::SpilledReg Spill; + unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(1).getReg(), + &AMDGPU::SGPR_32RegClass, i); + Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), + MI->getOperand(0).getReg()) + .addReg(SubReg) + .addImm(Spill.Lane + i); + } + MI->eraseFromParent(); + break; + } + + // SGPR register restore + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S64_RESTORE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + SIMachineFunctionInfo::SpilledReg Spill; + unsigned FrameIndex = MI->getOperand(2).getImm(); + unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), SubReg) + .addReg(MI->getOperand(1).getReg()) + .addImm(Spill.Lane + i); } + MI->eraseFromParent(); + break; } + } + return true; } MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, @@ -247,18 +360,18 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg()) - return 0; + return nullptr; // Cannot commute VOP2 if src0 is SGPR. if (isVOP2(MI->getOpcode()) && MI->getOperand(1).isReg() && RI.isSGPRClass(MRI.getRegClass(MI->getOperand(1).getReg()))) - return 0; + return nullptr; if (!MI->getOperand(2).isReg()) { // XXX: Commute instructions with FPImm operands if (NewMI || MI->getOperand(2).isFPImm() || (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { - return 0; + return nullptr; } // XXX: Commute VOP3 instructions with abs and neg set. @@ -267,7 +380,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, AMDGPU::OpName::abs)).getImm() || MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::neg)).getImm())) - return 0; + return nullptr; unsigned Reg = MI->getOperand(1).getReg(); unsigned SubReg = MI->getOperand(1).getSubReg(); @@ -516,6 +629,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; case AMDGPU::COPY: return AMDGPU::COPY; case AMDGPU::PHI: return AMDGPU::PHI; + case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; case AMDGPU::S_MOV_B32: return MI.getOperand(1).isReg() ? 
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; @@ -536,6 +650,23 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; + case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; + case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; + case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; + case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; + case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; + case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; + case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; + case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; + case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; + case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; } } @@ -559,6 +690,8 @@ bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { switch (MI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::REG_SEQUENCE: + case AMDGPU::PHI: + case AMDGPU::INSERT_SUBREG: return RI.hasVGPRs(getOpRegClass(MI, 0)); default: return RI.hasVGPRs(getOpRegClass(MI, OpNo)); @@ -737,11 +870,12 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } } - // Legalize REG_SEQUENCE + // Legalize REG_SEQUENCE and PHI // The register class of the operands much be the same type as the register // class of the output. - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { - const TargetRegisterClass *RC = NULL, *SRC = NULL, *VRC = NULL; + if (MI->getOpcode() == AMDGPU::REG_SEQUENCE || + MI->getOpcode() == AMDGPU::PHI) { + const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { if (!MI->getOperand(i).isReg() || !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) @@ -774,13 +908,40 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) continue; unsigned DstReg = MRI.createVirtualRegister(RC); - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + MachineBasicBlock *InsertBB; + MachineBasicBlock::iterator Insert; + if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { + InsertBB = MI->getParent(); + Insert = MI; + } else { + // MI is a PHI instruction. 
+ InsertBB = MI->getOperand(i + 1).getMBB(); + Insert = InsertBB->getFirstTerminator(); + } + BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) .addOperand(MI->getOperand(i)); MI->getOperand(i).setReg(DstReg); } } + // Legalize INSERT_SUBREG + // src0 must have the same register class as dst + if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned Src0 = MI->getOperand(1).getReg(); + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); + const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); + if (DstRC != Src0RC) { + MachineBasicBlock &MBB = *MI->getParent(); + unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0) + .addReg(Src0); + MI->getOperand(1).setReg(NewSrc0); + } + return; + } + // Legalize MUBUF* instructions // FIXME: If we start using the non-addr64 instructions for compute, we // may need to legalize them here. @@ -886,6 +1047,72 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } } +void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { + MachineBasicBlock *MBB = MI->getParent(); + switch (MI->getOpcode()) { + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_SGPR: + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_SGPR: + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: + unsigned NewOpcode = getVALUOp(*MI); + unsigned RegOffset; + unsigned ImmOffset; + + if (MI->getOperand(2).isReg()) { + RegOffset = MI->getOperand(2).getReg(); + ImmOffset = 0; + } else { + assert(MI->getOperand(2).isImm()); + // SMRD instructions take a dword offset and MUBUF instructions + // take a byte offset. + ImmOffset = MI->getOperand(2).getImm() << 2; + RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + if (isUInt<12>(ImmOffset)) { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(0); + } else { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(ImmOffset); + ImmOffset = 0; + } + } + + unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + unsigned DWord0 = RegOffset; + unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) + .addImm(0); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) + .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) + .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) + .addReg(DWord0) + .addImm(AMDGPU::sub0) + .addReg(DWord1) + .addImm(AMDGPU::sub1) + .addReg(DWord2) + .addImm(AMDGPU::sub2) + .addReg(DWord3) + .addImm(AMDGPU::sub3); + MI->setDesc(get(NewOpcode)); + if (MI->getOperand(2).isReg()) { + MI->getOperand(2).setReg(MI->getOperand(1).getReg()); + } else { + MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false); + } + MI->getOperand(1).setReg(SRsrc); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + } +} + void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { SmallVector Worklist; Worklist.push_back(&TopInst); @@ -895,8 +1122,16 @@ MachineBasicBlock 
*MBB = Inst->getParent(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Opcode = Inst->getOpcode(); + unsigned NewOpcode = getVALUOp(*Inst); + // Handle some special cases - switch(Inst->getOpcode()) { + switch (Opcode) { + default: + if (isSMRD(Inst->getOpcode())) { + moveSMRDToVALU(Inst, MRI); + } + break; case AMDGPU::S_MOV_B64: { DebugLoc DL = Inst->getDebugLoc(); @@ -947,7 +1182,6 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { llvm_unreachable("Moving this op to VALU not implemented"); } - unsigned NewOpcode = getVALUOp(*Inst); if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { // We cannot move this instruction to the VALU, so we should try to // legalize its operands instead. @@ -968,27 +1202,52 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { Inst->RemoveOperand(i); } - // Add the implict and explicit register definitions. - if (NewDesc.ImplicitUses) { - for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { - unsigned Reg = NewDesc.ImplicitUses[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); - } + if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { + // We are converting these to a BFE, so we need to add the missing + // operands for the size and offset. + unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; + Inst->addOperand(Inst->getOperand(1)); + Inst->getOperand(1).ChangeToImmediate(0); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(Size)); + + // XXX - Other pointless operands. There are 4, but it seems you only need + // 3 to not hit an assertion later in MCInstLower. + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); } - if (NewDesc.ImplicitDefs) { - for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { - unsigned Reg = NewDesc.ImplicitDefs[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); - } + addDescImplicitUseDef(NewDesc, Inst); + + if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { + const MachineOperand &OffsetWidthOp = Inst->getOperand(2); + // If we need to move this to VGPRs, we need to unpack the second operand + // back into the 2 separate ones for bit offset and width. + assert(OffsetWidthOp.isImm() && + "Scalar BFE is only implemented for constant width and offset"); + uint32_t Imm = OffsetWidthOp.getImm(); + + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + + Inst->RemoveOperand(2); // Remove old immediate. + Inst->addOperand(Inst->getOperand(1)); + Inst->getOperand(1).ChangeToImmediate(0); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(Offset)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(BitWidth)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); } - legalizeOperands(Inst); - // Update the destination register class. 
+ const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); - switch (Inst->getOpcode()) { + switch (Opcode) { // For target instructions, getOpRegClass just returns the virtual // register class associated with the operand, so we need to find an // equivalent VGPR register class in order to move the instruction to the @@ -996,6 +1255,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { case AMDGPU::COPY: case AMDGPU::PHI: case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: if (RI.hasVGPRs(NewDstRC)) continue; NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); @@ -1010,6 +1270,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); + // Legalize the operands + legalizeOperands(Inst); + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), E = MRI.use_end(); I != E; ++I) { MachineInstr &UseMI = *I->getParent(); @@ -1097,6 +1360,24 @@ void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl &Worklist, Worklist.push_back(HiHalf); } +void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, + MachineInstr *Inst) const { + // Add the implicit and explicit register definitions. + if (NewDesc.ImplicitUses) { + for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { + unsigned Reg = NewDesc.ImplicitUses[i]; + Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); + } + } + + if (NewDesc.ImplicitDefs) { + for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { + unsigned Reg = NewDesc.ImplicitDefs[i]; + Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); + } + } +} + MachineInstrBuilder SIInstrInfo::buildIndirectWrite( MachineBasicBlock *MBB, MachineBasicBlock::iterator I, diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index c537038..7b31a81 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -47,49 +47,52 @@ private: void splitScalar64BitOp(SmallVectorImpl & Worklist, MachineInstr *Inst, unsigned Opcode) const; + void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; public: explicit SIInstrInfo(AMDGPUTargetMachine &tm); - const SIRegisterInfo &getRegisterInfo() const { + const SIRegisterInfo &getRegisterInfo() const override { return RI; } - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo *TRI) const override; void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + const TargetRegisterInfo *TRI) const override; + + virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; unsigned commuteOpcode(unsigned Opcode) const; - virtual MachineInstr *commuteInstruction(MachineInstr *MI, - bool NewMI=false) const; + MachineInstr *commuteInstruction(MachineInstr *MI, + bool NewMI=false) const override; bool isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA = 0) const; + AliasAnalysis *AA = nullptr) const; - virtual unsigned 
-  virtual unsigned getIEQOpcode() const {
+  unsigned getIEQOpcode() const override {
     llvm_unreachable("Unimplemented");
   }
 
   MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
-                              unsigned DstReg, unsigned SrcReg) const;
-  virtual bool isMov(unsigned Opcode) const;
+                              unsigned DstReg, unsigned SrcReg) const override;
+  bool isMov(unsigned Opcode) const override;
 
-  virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
+
   bool isDS(uint16_t Opcode) const;
   int isMIMG(uint16_t Opcode) const;
   int isSMRD(uint16_t Opcode) const;
@@ -101,8 +104,8 @@ public:
   bool isInlineConstant(const MachineOperand &MO) const;
   bool isLiteralConstant(const MachineOperand &MO) const;
 
-  virtual bool verifyInstruction(const MachineInstr *MI,
-                                 StringRef &ErrInfo) const;
+  bool verifyInstruction(const MachineInstr *MI,
+                         StringRef &ErrInfo) const override;
 
   bool isSALUInstr(const MachineInstr &MI) const;
   static unsigned getVALUOp(const MachineInstr &MI);
@@ -136,32 +139,36 @@ public:
   /// create new instruction and insert them before \p MI.
   void legalizeOperands(MachineInstr *MI) const;
 
+  void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const;
+
   /// \brief Replace this instruction's opcode with the equivalent VALU
   /// opcode.  This function will also move the users of \p MI to the
   /// VALU if necessary.
   void moveToVALU(MachineInstr &MI) const;
 
-  virtual unsigned calculateIndirectAddress(unsigned RegIndex,
-                                            unsigned Channel) const;
+  unsigned calculateIndirectAddress(unsigned RegIndex,
+                                    unsigned Channel) const override;
 
-  virtual const TargetRegisterClass *getIndirectAddrRegClass() const;
+  const TargetRegisterClass *getIndirectAddrRegClass() const override;
 
-  virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
-                                                 MachineBasicBlock::iterator I,
-                                                 unsigned ValueReg,
-                                                 unsigned Address,
-                                                 unsigned OffsetReg) const;
+  MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned ValueReg,
+                                         unsigned Address,
+                                         unsigned OffsetReg) const override;
 
-  virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
-                                                MachineBasicBlock::iterator I,
-                                                unsigned ValueReg,
-                                                unsigned Address,
-                                                unsigned OffsetReg) const;
+  MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned ValueReg,
+                                        unsigned Address,
+                                        unsigned OffsetReg) const override;
 
   void reserveIndirectRegisters(BitVector &Reserved,
                                 const MachineFunction &MF) const;
 
   void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
               unsigned SavReg, unsigned IndexReg) const;
+
+  void insertNOPs(MachineBasicBlock::iterator MI, int Count) const;
 };
 
 namespace AMDGPU {
@@ -169,6 +176,7 @@ namespace AMDGPU {
   int getVOPe64(uint16_t Opcode);
   int getCommuteRev(uint16_t Opcode);
   int getCommuteOrig(uint16_t Opcode);
+  int getMCOpcode(uint16_t Opcode, unsigned Gen);
 
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
 
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index e05ab65..2242e6d 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -7,23 +7,25 @@
 //
 //===----------------------------------------------------------------------===//
 
+// Except for the NONE field, this must be kept in sync with the SISubtarget enum
+// in AMDGPUMCInstLower.h
+def SISubtarget {
+  int NONE = -1;
+  int SI = 0;
+}
+
 //===----------------------------------------------------------------------===//
 // SI DAG
Nodes //===----------------------------------------------------------------------===// -// SMRD takes a 64bit memory address and can only add an 32bit offset -def SIadd64bit32bit : SDNode<"ISD::ADD", - SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]> ->; - def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT", - SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, i128>, SDTCisVT<2, i32>]>, + SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>, [SDNPMayLoad, SDNPMemOperand] >; def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTypeProfile<0, 13, - [SDTCisVT<0, i128>, // rsrc(SGPR) + [SDTCisVT<0, v4i32>, // rsrc(SGPR) SDTCisVT<1, iAny>, // vdata(VGPR) SDTCisVT<2, i32>, // num_channels(imm) SDTCisVT<3, i32>, // vaddr(VGPR) @@ -41,13 +43,13 @@ def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", >; def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT", - SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, i128>, SDTCisVT<2, i16>, + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>, SDTCisVT<3, i32>]> >; class SDSample : SDNode , SDTCisVT<2, v32i8>, - SDTCisVT<3, i128>, SDTCisVT<4, i32>]> + SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> >; def SIsample : SDSample<"AMDGPUISD::SAMPLE">; @@ -111,14 +113,17 @@ def IMM16bit : PatLeaf <(imm), [{return isUInt<16>(N->getZExtValue());}] >; +def IMM32bit : PatLeaf <(imm), + [{return isUInt<32>(N->getZExtValue());}] +>; + def mubuf_vaddr_offset : PatFrag< (ops node:$ptr, node:$offset, node:$imm_offset), (add (add node:$ptr, node:$offset), node:$imm_offset) >; class InlineImm : PatLeaf <(vt imm), [{ - return - (*(const SITargetLowering *)getTargetLowering()).analyzeImmediate(N) == 0; + return isInlineImmediate(N); }]>; class SGPRImm : PatLeaf : PatLeaf; def FRAMEri32 : Operand { - let MIOperandInfo = (ops SReg_32:$ptr, i32imm:$index); + let MIOperandInfo = (ops i32:$ptr, i32imm:$index); } //===----------------------------------------------------------------------===// @@ -197,15 +202,17 @@ class SOP2_SHIFT_64 op, string opName, list pattern> : SOP2 < opName#" $dst, $src0, $src1", pattern >; -class SOPC_32 op, string opName, list pattern> : SOPC < - op, (outs SCCReg:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern ->; -class SOPC_64 op, string opName, list pattern> : SOPC < - op, (outs SCCReg:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), - opName#" $dst, $src0, $src1", pattern ->; +class SOPC_Helper op, RegisterClass rc, ValueType vt, + string opName, PatLeaf cond> : SOPC < + op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), + opName#" $dst, $src0, $src1", []>; + +class SOPC_32 op, string opName, PatLeaf cond = COND_NULL> + : SOPC_Helper; + +class SOPC_64 op, string opName, PatLeaf cond = COND_NULL> + : SOPC_Helper; class SOPK_32 op, string opName, list pattern> : SOPK < op, (outs SReg_32:$dst), (ins i16imm:$src0), @@ -221,7 +228,7 @@ multiclass SMRD_Helper op, string asm, RegisterClass baseClass, RegisterClass dstClass> { def _IMM : SMRD < op, 1, (outs dstClass:$dst), - (ins baseClass:$sbase, i32imm:$offset), + (ins baseClass:$sbase, u32imm:$offset), asm#" $dst, $sbase, $offset", [] >; @@ -245,6 +252,28 @@ class VOP2_REV { bit IsOrig = isOrig; } +class SIMCInstr { + string PseudoInstr = pseudo; + int Subtarget = subtarget; +} + +multiclass VOP3_m op, dag outs, dag ins, string asm, list pattern, + string opName> { + + def "" : InstSI , VOP , + SIMCInstr { + let isPseudo = 1; + } + + def _si : VOP3 , SIMCInstr; + +} + +// This 
must always be right before the operand being input modified. +def InputMods : OperandWithDefaultOps { + let PrintMethod = "printOperandAndMods"; +} + multiclass VOP1_Helper op, RegisterClass drc, RegisterClass src, string opName, list pattern> { @@ -256,10 +285,8 @@ multiclass VOP1_Helper op, RegisterClass drc, RegisterClass src, def _e64 : VOP3 < {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, (outs drc:$dst), - (ins src:$src0, - i32imm:$abs, i32imm:$clamp, - i32imm:$omod, i32imm:$neg), - opName#"_e64 $dst, $src0, $abs, $clamp, $omod, $neg", [] + (ins InputMods:$src0_modifiers, src:$src0, i32imm:$clamp, i32imm:$omod), + opName#"_e64 $dst, $src0_modifiers, $clamp, $omod", [] >, VOP { let src1 = SIOperand.ZERO; let src2 = SIOperand.ZERO; @@ -288,10 +315,10 @@ multiclass VOP2_Helper op, RegisterClass vrc, RegisterClass arc, def _e64 : VOP3 < {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, (outs vrc:$dst), - (ins arc:$src0, arc:$src1, - i32imm:$abs, i32imm:$clamp, - i32imm:$omod, i32imm:$neg), - opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", [] + (ins InputMods:$src0_modifiers, arc:$src0, + InputMods:$src1_modifiers, arc:$src1, + i32imm:$clamp, i32imm:$omod), + opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [] >, VOP , VOP2_REV { let src2 = SIOperand.ZERO; } @@ -316,10 +343,10 @@ multiclass VOP2b_32 op, string opName, list pattern, def _e64 : VOP3b < {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, (outs VReg_32:$dst), - (ins VSrc_32:$src0, VSrc_32:$src1, - i32imm:$abs, i32imm:$clamp, - i32imm:$omod, i32imm:$neg), - opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", [] + (ins InputMods: $src0_modifiers, VSrc_32:$src0, + InputMods:$src1_modifiers, VSrc_32:$src1, + i32imm:$clamp, i32imm:$omod), + opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", [] >, VOP , VOP2_REV { let src2 = SIOperand.ZERO; /* the VOP2 variant puts the carry out into VCC, the VOP3 variant @@ -340,15 +367,16 @@ multiclass VOPC_Helper op, RegisterClass vrc, RegisterClass arc, def _e64 : VOP3 < {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, (outs SReg_64:$dst), - (ins arc:$src0, arc:$src1, - InstFlag:$abs, InstFlag:$clamp, - InstFlag:$omod, InstFlag:$neg), - opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", + (ins InputMods:$src0_modifiers, arc:$src0, + InputMods:$src1_modifiers, arc:$src1, + InstFlag:$clamp, InstFlag:$omod), + opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", !if(!eq(!cast(cond), "COND_NULL"), [], [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))] ) >, VOP { let src2 = SIOperand.ZERO; + let src2_modifiers = 0; } } @@ -360,12 +388,13 @@ multiclass VOPC_64 op, string opName, ValueType vt = untyped, PatLeaf cond = COND_NULL> : VOPC_Helper ; -class VOP3_32 op, string opName, list pattern> : VOP3 < +multiclass VOP3_32 op, string opName, list pattern> : VOP3_m < op, (outs VReg_32:$dst), - (ins VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2, - InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), - opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern ->, VOP ; + (ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers, + VSrc_32:$src1, InputMods:$src2_modifiers, VSrc_32:$src2, + InstFlag:$clamp, InstFlag:$omod), + opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, $omod", pattern, opName +>; class VOP3_64_Shift op, string opName, list pattern> : VOP3 < op, (outs VReg_64:$dst), @@ -374,10 +403,9 @@ class VOP3_64_Shift op, string 
opName, list pattern> : VOP3 < >, VOP { let src2 = SIOperand.ZERO; - let abs = 0; + let src0_modifiers = 0; let clamp = 0; let omod = 0; - let neg = 0; } class VOP3_64 op, string opName, list pattern> : VOP3 < @@ -403,7 +431,7 @@ class DS_1A op, dag outs, dag ins, string asm, list pat> : class DS_Load_Helper op, string asm, RegisterClass regClass> : DS_1A < op, (outs regClass:$vdst), - (ins i1imm:$gds, VReg_32:$addr, i16imm:$offset), + (ins i1imm:$gds, VReg_32:$addr, u16imm:$offset), asm#" $vdst, $addr, $offset, [M0]", []> { let data0 = 0; @@ -415,7 +443,7 @@ class DS_Load_Helper op, string asm, RegisterClass regClass> : DS_1A < class DS_Load2_Helper op, string asm, RegisterClass regClass> : DS < op, (outs regClass:$vdst), - (ins i1imm:$gds, VReg_32:$addr, i8imm:$offset0, i8imm:$offset1), + (ins i1imm:$gds, VReg_32:$addr, u8imm:$offset0, u8imm:$offset1), asm#" $gds, $vdst, $addr, $offset0, $offset1, [M0]", []> { let data0 = 0; @@ -427,7 +455,7 @@ class DS_Load2_Helper op, string asm, RegisterClass regClass> : DS < class DS_Store_Helper op, string asm, RegisterClass regClass> : DS_1A < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, i16imm:$offset), + (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u16imm:$offset), asm#" $addr, $data0, $offset [M0]", []> { let data1 = 0; @@ -439,7 +467,7 @@ class DS_Store_Helper op, string asm, RegisterClass regClass> : DS_1A < class DS_Store2_Helper op, string asm, RegisterClass regClass> : DS_1A < op, (outs), - (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, i8imm:$offset0, i8imm:$offset1), + (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u8imm:$offset0, u8imm:$offset1), asm#" $addr, $data0, $data1, $offset0, $offset1 [M0]", []> { let mayStore = 1; @@ -450,7 +478,7 @@ class DS_Store2_Helper op, string asm, RegisterClass regClass> : DS_1A class DS_1A1D_RET op, string asm, RegisterClass rc> : DS_1A < op, (outs rc:$vdst), - (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, i16imm:$offset), + (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, u16imm:$offset), asm#" $vdst, $addr, $data0, $offset, [M0]", []> { @@ -462,7 +490,7 @@ class DS_1A1D_RET op, string asm, RegisterClass rc> : DS_1A < class MTBUF_Store_Helper op, string asm, RegisterClass regClass> : MTBUF < op, (outs), - (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, + (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," @@ -481,7 +509,7 @@ multiclass MUBUF_Load_Helper op, string asm, RegisterClass regClass> { let offen = 0, idxen = 0 in { def _OFFSET : MUBUF ; } @@ -497,7 +525,7 @@ multiclass MUBUF_Load_Helper op, string asm, RegisterClass regClass> { let offen = 0, idxen = 1 in { def _IDXEN : MUBUF ; } @@ -513,7 +541,7 @@ multiclass MUBUF_Load_Helper op, string asm, RegisterClass regClass> { let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in { def _ADDR64 : MUBUF ; } } @@ -521,7 +549,7 @@ multiclass MUBUF_Load_Helper op, string asm, RegisterClass regClass> { class MUBUF_Store_Helper op, string name, RegisterClass vdataClass> : MUBUF { @@ -542,7 +570,7 @@ class MUBUF_Store_Helper op, string name, RegisterClass vdataClass> : class MTBUF_Load_Helper op, string asm, RegisterClass regClass> : MTBUF < op, (outs regClass:$dst), - (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, + (ins 
u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," @@ -677,4 +705,12 @@ def isDS : InstrMapping { let ValueCols = [["8"]]; } +def getMCOpcode : InstrMapping { + let FilterClass = "SIMCInstr"; + let RowFields = ["PseudoInstr"]; + let ColFields = ["Subtarget"]; + let KeyCol = [!cast(SISubtarget.NONE)]; + let ValueCols = [[!cast(SISubtarget.SI)]]; +} + include "SIInstructions.td" diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 5232139..500fa78 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -32,9 +32,56 @@ def isSI : Predicate<"Subtarget.getGeneration() " def isCI : Predicate<"Subtarget.getGeneration() " ">= AMDGPUSubtarget::SEA_ISLANDS">; +def isCFDepth0 : Predicate<"isCFDepth0()">; + def WAIT_FLAG : InstFlag<"printWaitFlag">; -let Predicates = [isSI] in { +let SubtargetPredicate = isSI in { +let OtherPredicates = [isCFDepth0] in { + +//===----------------------------------------------------------------------===// +// SMRD Instructions +//===----------------------------------------------------------------------===// + +let mayLoad = 1 in { + +// We are using the SGPR_32 and not the SReg_32 register class for 32-bit +// SMRD instructions, because the SGPR_32 register class does not include M0 +// and writing to M0 from an SMRD instruction will hang the GPU. +defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>; +defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>; +defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>; + +defm S_BUFFER_LOAD_DWORD : SMRD_Helper < + 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32 +>; + +defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < + 0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64 +>; + +defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < + 0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128 +>; + +defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < + 0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256 +>; + +defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < + 0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512 +>; + +} // mayLoad = 1 + +//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>; +//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; + +//===----------------------------------------------------------------------===// +// SOP1 Instructions +//===----------------------------------------------------------------------===// let neverHasSideEffects = 1 in { @@ -45,7 +92,10 @@ def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>; def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>; } // End isMoveImm = 1 -def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>; +def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", + [(set i32:$dst, (not i32:$src0))] +>; + def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>; def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>; def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>; @@ -65,8 +115,13 @@ def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>; //def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>; def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>; //def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>; -//def 
S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", []>; -//def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", []>; +def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", + [(set i32:$dst, (sext_inreg i32:$src0, i8))] +>; +def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", + [(set i32:$dst, (sext_inreg i32:$src0, i16))] +>; + ////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>; ////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>; ////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>; @@ -99,6 +154,150 @@ def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>; def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>; def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>; def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>; + +//===----------------------------------------------------------------------===// +// SOP2 Instructions +//===----------------------------------------------------------------------===// + +let Defs = [SCC] in { // Carry out goes to SCC +let isCommutable = 1 in { +def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>; +def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", + [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] +>; +} // End isCommutable = 1 + +def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>; +def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", + [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] +>; + +let Uses = [SCC] in { // Carry in comes from SCC +let isCommutable = 1 in { +def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", + [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; +} // End isCommutable = 1 + +def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", + [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; +} // End Uses = [SCC] +} // End Defs = [SCC] + +def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", + [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))] +>; +def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", + [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))] +>; +def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", + [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))] +>; +def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", + [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))] +>; + +def S_CSELECT_B32 : SOP2 < + 0x0000000a, (outs SReg_32:$dst), + (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32", + [] +>; + +def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; + +def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", + [(set i32:$dst, (and i32:$src0, i32:$src1))] +>; + +def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", + [(set i64:$dst, (and i64:$src0, i64:$src1))] +>; + +def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", + [(set i32:$dst, (or i32:$src0, i32:$src1))] +>; + +def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", + [(set i64:$dst, (or i64:$src0, i64:$src1))] +>; + +def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", + [(set i32:$dst, (xor i32:$src0, i32:$src1))] +>; + +def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", + [(set i64:$dst, (xor i64:$src0, i64:$src1))] +>; +def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>; +def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>; +def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>; +def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>; +def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>; +def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>; +def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>; +def S_NOR_B64 : 
SOP2_64 <0x0000001b, "S_NOR_B64", []>; +def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>; +def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>; + +// Use added complexity so these patterns are preferred to the VALU patterns. +let AddedComplexity = 1 in { + +def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", + [(set i32:$dst, (shl i32:$src0, i32:$src1))] +>; +def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64", + [(set i64:$dst, (shl i64:$src0, i32:$src1))] +>; +def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", + [(set i32:$dst, (srl i32:$src0, i32:$src1))] +>; +def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64", + [(set i64:$dst, (srl i64:$src0, i32:$src1))] +>; +def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", + [(set i32:$dst, (sra i32:$src0, i32:$src1))] +>; +def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64", + [(set i64:$dst, (sra i64:$src0, i32:$src1))] +>; + +} // End AddedComplexity = 1 + +def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>; +def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>; +def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>; +def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>; +def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>; +def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>; +def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>; +//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>; +def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; + +//===----------------------------------------------------------------------===// +// SOPC Instructions +//===----------------------------------------------------------------------===// + +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32">; +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32">; +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32">; +def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32">; +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32">; +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32">; +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32">; +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32">; +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32">; +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32">; +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32">; +def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32">; +////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>; +////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>; +////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>; +////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>; +//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>; + +//===----------------------------------------------------------------------===// +// SOPK Instructions +//===----------------------------------------------------------------------===// + def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>; def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>; @@ -147,6 +346,108 @@ def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>; //def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>; //def EXP : EXP_ <0x00000000, "EXP", []>; +} // End let OtherPredicates = [isCFDepth0] + +//===----------------------------------------------------------------------===// +// SOPP Instructions +//===----------------------------------------------------------------------===// + +def S_NOP : SOPP <0x00000000, (ins i16imm:$SIMM16), "S_NOP $SIMM16", []>; + +let isTerminator 
= 1 in { + +def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", + [(IL_retflag)]> { + let SIMM16 = 0; + let isBarrier = 1; + let hasCtrlDep = 1; +} + +let isBranch = 1 in { +def S_BRANCH : SOPP < + 0x00000002, (ins brtarget:$target), "S_BRANCH $target", + [(br bb:$target)]> { + let isBarrier = 1; +} + +let DisableEncoding = "$scc" in { +def S_CBRANCH_SCC0 : SOPP < + 0x00000004, (ins brtarget:$target, SCCReg:$scc), + "S_CBRANCH_SCC0 $target", [] +>; +def S_CBRANCH_SCC1 : SOPP < + 0x00000005, (ins brtarget:$target, SCCReg:$scc), + "S_CBRANCH_SCC1 $target", + [] +>; +} // End DisableEncoding = "$scc" + +def S_CBRANCH_VCCZ : SOPP < + 0x00000006, (ins brtarget:$target, VCCReg:$vcc), + "S_CBRANCH_VCCZ $target", + [] +>; +def S_CBRANCH_VCCNZ : SOPP < + 0x00000007, (ins brtarget:$target, VCCReg:$vcc), + "S_CBRANCH_VCCNZ $target", + [] +>; + +let DisableEncoding = "$exec" in { +def S_CBRANCH_EXECZ : SOPP < + 0x00000008, (ins brtarget:$target, EXECReg:$exec), + "S_CBRANCH_EXECZ $target", + [] +>; +def S_CBRANCH_EXECNZ : SOPP < + 0x00000009, (ins brtarget:$target, EXECReg:$exec), + "S_CBRANCH_EXECNZ $target", + [] +>; +} // End DisableEncoding = "$exec" + + +} // End isBranch = 1 +} // End isTerminator = 1 + +let hasSideEffects = 1 in { +def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER", + [(int_AMDGPU_barrier_local)] +> { + let SIMM16 = 0; + let isBarrier = 1; + let hasCtrlDep = 1; + let mayLoad = 1; + let mayStore = 1; +} + +def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16", + [] +>; +//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>; +//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>; +//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>; + +let Uses = [EXEC] in { + def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16", + [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)] + > { + let DisableEncoding = "$m0"; + } +} // End Uses = [EXEC] + +//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>; +//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>; +//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>; +//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>; +//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>; +//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; +} // End hasSideEffects + +//===----------------------------------------------------------------------===// +// VOPC Instructions +//===----------------------------------------------------------------------===// + let isCompare = 1 in { defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">; @@ -403,6 +704,10 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">; } // End isCompare = 1 +//===----------------------------------------------------------------------===// +// DS Instructions +//===----------------------------------------------------------------------===// + def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>; def DS_SUB_U32_RTN : DS_1A1D_RET <0x21, "DS_SUB_U32_RTN", VReg_32>; def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>; @@ -427,6 +732,9 @@ def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "DS_READ2_B64", VReg_128>; // TODO: DS_READ2ST64_B32, DS_READ2ST64_B64, // DS_WRITE2ST64_B32, DS_WRITE2ST64_B64 +//===----------------------------------------------------------------------===// +// MUBUF Instructions +//===----------------------------------------------------------------------===// //def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>; //def 
BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>; @@ -499,6 +807,11 @@ def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < //def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>; //def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>; //def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>; + +//===----------------------------------------------------------------------===// +// MTBUF Instructions +//===----------------------------------------------------------------------===// + //def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>; //def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>; //def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>; @@ -508,41 +821,10 @@ def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "TBUFFER_STORE_FOR def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>; def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>; -let mayLoad = 1 in { - -// We are using the SGPR_32 and not the SReg_32 register class for 32-bit -// SMRD instructions, because the SGPR_32 register class does not include M0 -// and writing to M0 from an SMRD instruction will hang the GPU. -defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>; -defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>; - -defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32 ->; - -defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < - 0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64 ->; - -defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < - 0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128 ->; - -defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < - 0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256 ->; - -defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < - 0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512 ->; - -} // mayLoad = 1 +//===----------------------------------------------------------------------===// +// MIMG Instructions +//===----------------------------------------------------------------------===// -//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>; -//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "IMAGE_LOAD">; defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "IMAGE_LOAD_MIP">; //def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>; @@ -638,8 +920,12 @@ defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">; //def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>; //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>; -//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>; +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>; let neverHasSideEffects = 1, isMoveImm = 1 in { defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>; @@ -691,8 +977,13 @@ defm V_CVT_F64_F32 : 
VOP1_64_32 <0x00000010, "V_CVT_F64_F32", //defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>; //defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>; //defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>; -//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>; -//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>; +defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64", + [(set i32:$dst, (fp_to_uint f64:$src0))] +>; +defm V_CVT_F64_U32 : VOP1_64_32 <0x00000016, "V_CVT_F64_U32", + [(set f64:$dst, (uint_to_fp i32:$src0))] +>; + defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", [(set f32:$dst, (AMDGPUfract f32:$src0))] >; @@ -752,131 +1043,48 @@ defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>; //defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>; defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>; //def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>; -defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>; -defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>; -defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>; - -def V_INTERP_P1_F32 : VINTRP < - 0x00000000, - (outs VReg_32:$dst), - (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "V_INTERP_P1_F32 $dst, $i, $attr_chan, $attr, [$m0]", - []> { - let DisableEncoding = "$m0"; -} - -def V_INTERP_P2_F32 : VINTRP < - 0x00000001, - (outs VReg_32:$dst), - (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "V_INTERP_P2_F32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]", - []> { - - let Constraints = "$src0 = $dst"; - let DisableEncoding = "$src0,$m0"; - -} - -def V_INTERP_MOV_F32 : VINTRP < - 0x00000002, - (outs VReg_32:$dst), - (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "V_INTERP_MOV_F32 $dst, $src0, $attr_chan, $attr, [$m0]", - []> { - let DisableEncoding = "$m0"; -} - -//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>; - -let isTerminator = 1 in { - -def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", - [(IL_retflag)]> { - let SIMM16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; -} - -let isBranch = 1 in { -def S_BRANCH : SOPP < - 0x00000002, (ins brtarget:$target), "S_BRANCH $target", - [(br bb:$target)]> { - let isBarrier = 1; -} - -let DisableEncoding = "$scc" in { -def S_CBRANCH_SCC0 : SOPP < - 0x00000004, (ins brtarget:$target, SCCReg:$scc), - "S_CBRANCH_SCC0 $target", [] ->; -def S_CBRANCH_SCC1 : SOPP < - 0x00000005, (ins brtarget:$target, SCCReg:$scc), - "S_CBRANCH_SCC1 $target", - [] ->; -} // End DisableEncoding = "$scc" - -def S_CBRANCH_VCCZ : SOPP < - 0x00000006, (ins brtarget:$target, VCCReg:$vcc), - "S_CBRANCH_VCCZ $target", - [] ->; -def S_CBRANCH_VCCNZ : SOPP < - 0x00000007, (ins brtarget:$target, VCCReg:$vcc), - "S_CBRANCH_VCCNZ $target", - [] ->; - -let DisableEncoding = "$exec" in { -def S_CBRANCH_EXECZ : SOPP < - 0x00000008, (ins brtarget:$target, EXECReg:$exec), - "S_CBRANCH_EXECZ $target", - [] ->; -def S_CBRANCH_EXECNZ : SOPP < - 0x00000009, (ins brtarget:$target, EXECReg:$exec), - "S_CBRANCH_EXECNZ $target", - [] ->; -} // End DisableEncoding = "$exec" +defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>; +defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>; +defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>; -} // End isBranch = 1 -} // End isTerminator = 1 +//===----------------------------------------------------------------------===// +// VINTRP 
Instructions +//===----------------------------------------------------------------------===// -let hasSideEffects = 1 in { -def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER", - [(int_AMDGPU_barrier_local)] -> { - let SIMM16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; - let mayLoad = 1; - let mayStore = 1; +def V_INTERP_P1_F32 : VINTRP < + 0x00000000, + (outs VReg_32:$dst), + (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), + "V_INTERP_P1_F32 $dst, $i, $attr_chan, $attr, [$m0]", + []> { + let DisableEncoding = "$m0"; } -def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16", - [] ->; -//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>; -//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>; -//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>; +def V_INTERP_P2_F32 : VINTRP < + 0x00000001, + (outs VReg_32:$dst), + (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), + "V_INTERP_P2_F32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]", + []> { -let Uses = [EXEC] in { - def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16", - [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)] - > { - let DisableEncoding = "$m0"; - } -} // End Uses = [EXEC] + let Constraints = "$src0 = $dst"; + let DisableEncoding = "$src0,$m0"; -//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>; -//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>; -//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>; -//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>; -//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>; -//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; -} // End hasSideEffects +} + +def V_INTERP_MOV_F32 : VINTRP < + 0x00000002, + (outs VReg_32:$dst), + (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), + "V_INTERP_MOV_F32 $dst, $src0, $attr_chan, $attr, [$m0]", + []> { + let DisableEncoding = "$m0"; +} + +//===----------------------------------------------------------------------===// +// VOP2 Instructions +//===----------------------------------------------------------------------===// def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst), (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc), @@ -891,18 +1099,11 @@ def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst), InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))] ->; - -//f32 pattern for V_CNDMASK_B32_e64 -def : Pat < - (f32 (select i1:$src2, f32:$src1, f32:$src0)), - (V_CNDMASK_B32_e64 $src0, $src1, $src2) ->; - -def : Pat < - (i32 (trunc i64:$val)), - (EXTRACT_SUBREG $val, sub0) ->; +> { + let src0_modifiers = 0; + let src1_modifiers = 0; + let src2_modifiers = 0; +} def V_READLANE_B32 : VOP2 < 0x00000001, @@ -946,11 +1147,11 @@ defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", - [(set i32:$dst, (mul I24:$src0, I24:$src1))] + [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))] >; //defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>; defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", - [(set i32:$dst, (mul U24:$src0, U24:$src1))] + [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))] >; //defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>; @@ -965,27 +1166,43 @@ defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32", defm V_MIN_F32 : 
VOP2_32 <0x0000000f, "V_MIN_F32", []>; defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>; -defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>; -defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>; -defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>; -defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>; +defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", + [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]>; +defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", + [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]>; +defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", + [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]>; +defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", + [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]>; + +defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", + [(set i32:$dst, (srl i32:$src0, i32:$src1))] +>; -defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>; defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">; -defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>; +defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", + [(set i32:$dst, (sra i32:$src0, i32:$src1))] +>; defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">; let hasPostISelHook = 1 in { -defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>; +defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", + [(set i32:$dst, (shl i32:$src0, i32:$src1))] +>; } defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">; -defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", []>; -defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", []>; -defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", []>; +defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", + [(set i32:$dst, (and i32:$src0, i32:$src1))]>; +defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", + [(set i32:$dst, (or i32:$src0, i32:$src1))] +>; +defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", + [(set i32:$dst, (xor i32:$src0, i32:$src1))] +>; } // End isCommutable = 1 @@ -1001,14 +1218,18 @@ defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>; let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. 
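
The V_ADD_I32/V_SUB_I32 and V_ADDC_U32/V_SUBB_U32 selection patterns added in the hunk that follows wire add/sub and adde/sube through the VCC carry bit, which is what allows a 64-bit add to be legalized as a low-half add plus a high-half add-with-carry. A scalar model of that decomposition (a sketch only; the real lowering operates on virtual registers):

    #include <cstdint>

    // Model of a 64-bit add split the way the adde pattern enables it:
    // the low halves add and produce a carry (VCC), and the high halves
    // add with that carry consumed.
    static uint64_t add64Split(uint32_t Lo0, uint32_t Hi0,
                               uint32_t Lo1, uint32_t Hi1) {
      uint32_t Lo = Lo0 + Lo1;           // V_ADD_I32: carry-out -> VCC
      uint32_t Carry = Lo < Lo0 ? 1 : 0; // unsigned overflow of the low half
      uint32_t Hi = Hi0 + Hi1 + Carry;   // V_ADDC_U32: carry-in <- VCC
      return ((uint64_t)Hi << 32) | Lo;
    }
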
-defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", [], VSrc_32>; -defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", [], VSrc_32>; +defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", + [(set i32:$dst, (add i32:$src0, i32:$src1))], VSrc_32>; +defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", + [(set i32:$dst, (sub i32:$src0, i32:$src1))], VSrc_32>; defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], VSrc_32, "V_SUB_I32">; let Uses = [VCC] in { // Carry-in comes from VCC -defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", [], VReg_32>; -defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", [], VReg_32>; +defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", + [(set i32:$dst, (adde i32:$src0, i32:$src1))], VReg_32>; +defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", + [(set i32:$dst, (sube i32:$src0, i32:$src1))], VReg_32>; defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], VReg_32, "V_SUBB_U32">; } // End Uses = [VCC] @@ -1023,274 +1244,127 @@ defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32", >; ////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>; ////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>; -def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32", []>; -def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32", []>; -def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32", []>; -def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32", []>; -def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32", []>; -def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32", []>; -def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32", []>; -def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32", []>; -def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32", []>; -def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32", []>; -def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32", []>; -def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32", []>; -////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>; -////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>; -////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>; -////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>; -//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>; + +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// let neverHasSideEffects = 1 in { -def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>; -def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>; -def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", - [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))] +defm V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>; +defm V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", + [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))] +>; +defm V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", + [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))] >; -def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", - [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))] +defm V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", + [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))] >; } // End neverHasSideEffects -def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>; -def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>; -def V_CUBETC_F32 : VOP3_32 <0x00000146, 
"V_CUBETC_F32", []>; -def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>; + +defm V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>; +defm V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>; +defm V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>; +defm V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>; let neverHasSideEffects = 1, mayLoad = 0, mayStore = 0 in { -def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", +defm V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))]>; -def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", +defm V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))]>; } -def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", +defm V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))]>; -defm : BFIPatterns ; -def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", +defm V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))] >; -def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", - [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] ->; -//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>; -def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>; -def : ROTRPattern ; - -def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>; -def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; -////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>; -////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>; -////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>; -////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>; -////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>; -////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>; -////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>; -////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>; -////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>; -//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>; -//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>; -//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>; -def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; -////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>; -def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; -def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>; - -def V_LSHL_B64 : VOP3_64_Shift <0x00000161, "V_LSHL_B64", - [(set i64:$dst, (shl i64:$src0, i32:$src1))] ->; -def V_LSHR_B64 : VOP3_64_Shift <0x00000162, "V_LSHR_B64", - [(set i64:$dst, (srl i64:$src0, i32:$src1))] ->; -def V_ASHR_I64 : VOP3_64_Shift <0x00000163, "V_ASHR_I64", - [(set i64:$dst, (sra i64:$src0, i32:$src1))] ->; - -let isCommutable = 1 in { - -def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>; -def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>; -def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>; -def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>; - -} // isCommutable = 1 - -def : Pat < - (fadd f64:$src0, f64:$src1), - (V_ADD_F64 $src0, $src1, (i64 0)) ->; - -def : Pat < - (fmul f64:$src0, f64:$src1), - (V_MUL_F64 $src0, $src1, (i64 0)) ->; - -def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>; - -let isCommutable = 1 in { - -def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; -def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>; -def V_MUL_LO_I32 : VOP3_32 <0x0000016b, 
"V_MUL_LO_I32", []>; -def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; - -} // isCommutable = 1 - -def : Pat < - (mul i32:$src0, i32:$src1), - (V_MUL_LO_I32 $src0, $src1, (i32 0)) ->; - -def : Pat < - (mulhu i32:$src0, i32:$src1), - (V_MUL_HI_U32 $src0, $src1, (i32 0)) ->; - -def : Pat < - (mulhs i32:$src0, i32:$src1), - (V_MUL_HI_I32 $src0, $src1, (i32 0)) ->; - -def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; -def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; -def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; -def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>; -//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>; -//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>; -//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>; -def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>; - -let Defs = [SCC] in { // Carry out goes to SCC -let isCommutable = 1 in { -def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>; -def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", - [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] ->; -} // End isCommutable = 1 - -def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>; -def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", - [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] ->; - -let Uses = [SCC] in { // Carry in comes from SCC -let isCommutable = 1 in { -def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", - [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; -} // End isCommutable = 1 - -def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", - [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; -} // End Uses = [SCC] -} // End Defs = [SCC] - -def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", - [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))] ->; -def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", - [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))] ->; -def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", - [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))] ->; -def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", - [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))] ->; - -def S_CSELECT_B32 : SOP2 < - 0x0000000a, (outs SReg_32:$dst), - (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32", - [] ->; - -def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; - -def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", - [(set i32:$dst, (and i32:$src0, i32:$src1))] ->; - -def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", - [(set i64:$dst, (and i64:$src0, i64:$src1))] ->; - -def : Pat < - (i1 (and i1:$src0, i1:$src1)), - (S_AND_B64 $src0, $src1) ->; - -def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", - [(set i32:$dst, (or i32:$src0, i32:$src1))] ->; - -def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", - [(set i64:$dst, (or i64:$src0, i64:$src1))] ->; - -def : Pat < - (i1 (or i1:$src0, i1:$src1)), - (S_OR_B64 $src0, $src1) ->; - -def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", - [(set i32:$dst, (xor i32:$src0, i32:$src1))] ->; - -def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", - [(set i1:$dst, (xor i1:$src0, i1:$src1))] ->; -def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>; -def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>; -def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>; -def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>; -def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>; -def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>; -def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>; 
-def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>; -def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>; -def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>; +def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", + [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] +>; +//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>; +defm V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>; -// Use added complexity so these patterns are preferred to the VALU patterns. -let AddedComplexity = 1 in { +defm V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>; +defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; +////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>; +////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>; +////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>; +////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>; +////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>; +////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>; +////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>; +////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>; +////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>; +//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>; +//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>; +//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>; +defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; +////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>; +defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; +def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>; -def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", - [(set i32:$dst, (shl i32:$src0, i32:$src1))] ->; -def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64", +def V_LSHL_B64 : VOP3_64_Shift <0x00000161, "V_LSHL_B64", [(set i64:$dst, (shl i64:$src0, i32:$src1))] >; -def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", - [(set i32:$dst, (srl i32:$src0, i32:$src1))] ->; -def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64", +def V_LSHR_B64 : VOP3_64_Shift <0x00000162, "V_LSHR_B64", [(set i64:$dst, (srl i64:$src0, i32:$src1))] >; -def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", - [(set i32:$dst, (sra i32:$src0, i32:$src1))] ->; -def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64", +def V_ASHR_I64 : VOP3_64_Shift <0x00000163, "V_ASHR_I64", [(set i64:$dst, (sra i64:$src0, i32:$src1))] >; -} // End AddedComplexity = 1 +let isCommutable = 1 in { -def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>; -def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>; -def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>; -def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>; -def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>; -def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>; -def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>; -//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>; -def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; +def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>; +def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>; +def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>; +def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>; + +} // isCommutable = 1 + +def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>; + +let isCommutable = 1 in { + +defm V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; +defm V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>; +defm V_MUL_LO_I32 : 
VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; +defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; + +} // isCommutable = 1 + +defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; +def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; +defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; +def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>; +//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>; +//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>; +//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>; +def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>; + +//===----------------------------------------------------------------------===// +// Pseudo Instructions +//===----------------------------------------------------------------------===// let isCodeGenOnly = 1, isPseudo = 1 in { -def LOAD_CONST : AMDGPUShaderInst < - (outs GPRF32:$dst), - (ins i32imm:$src), - "LOAD_CONST $dst, $src", - [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))] +def V_MOV_I1 : InstSI < + (outs VReg_1:$dst), + (ins i1imm:$src), + "", [(set i1:$dst, (imm:$src))] +>; + +def V_AND_I1 : InstSI < + (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", + [(set i1:$dst, (and i1:$src0, i1:$src1))] +>; + +def V_OR_I1 : InstSI < + (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", + [(set i1:$dst, (or i1:$src0, i1:$src1))] >; // SI pseudo instructions. These are used by the CFG structurizer pass @@ -1301,19 +1375,19 @@ let mayLoad = 1, mayStore = 1, hasSideEffects = 1, let isBranch = 1, isTerminator = 1 in { -def SI_IF : InstSI < +def SI_IF: InstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), - "SI_IF $dst, $vcc, $target", + "", [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))] >; def SI_ELSE : InstSI < (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target), - "SI_ELSE $dst, $src, $target", - [(set i64:$dst, (int_SI_else i64:$src, bb:$target))]> { - + "", + [(set i64:$dst, (int_SI_else i64:$src, bb:$target))] +> { let Constraints = "$src = $dst"; } @@ -1370,7 +1444,7 @@ let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { let UseNamedOperandTable = 1 in { -def SI_RegisterLoad : AMDGPUShaderInst < +def SI_RegisterLoad : InstSI < (outs VReg_32:$dst, SReg_64:$temp), (ins FRAMEri32:$addr, i32imm:$chan), "", [] @@ -1379,7 +1453,7 @@ def SI_RegisterLoad : AMDGPUShaderInst < let mayLoad = 1; } -class SIRegStore : AMDGPUShaderInst < +class SIRegStore : InstSI < outs, (ins VReg_32:$val, FRAMEri32:$addr, i32imm:$chan), "", [] @@ -1439,8 +1513,33 @@ def V_SUB_F64 : InstSI < } // end usesCustomInserter +multiclass SI_SPILL_SGPR { + + def _SAVE : InstSI < + (outs VReg_32:$dst), + (ins sgpr_class:$src, i32imm:$frame_idx), + "", [] + >; + + def _RESTORE : InstSI < + (outs sgpr_class:$dst), + (ins VReg_32:$src, i32imm:$frame_idx), + "", [] + >; + +} + +defm SI_SPILL_S64 : SI_SPILL_SGPR ; +defm SI_SPILL_S128 : SI_SPILL_SGPR ; +defm SI_SPILL_S256 : SI_SPILL_SGPR ; +defm SI_SPILL_S512 : SI_SPILL_SGPR ; + } // end IsCodeGenOnly, isPseudo +} // end SubtargetPredicate = SI + +let Predicates = [isSI] in { + def : Pat< (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), (V_CNDMASK_B32_e64 $src2, $src1, (V_CMP_GT_F32_e64 0, $src0)) @@ -1453,7 +1552,7 @@ def : Pat < /* int_SI_vs_load_input */ def : Pat< - (SIload_input i128:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr), + (SIload_input v4i32:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr), (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0) >; @@ -1470,40 
+1569,116 @@ def : Pat < (V_SUB_F64 $src0, $src1) >; +//===----------------------------------------------------------------------===// +// SMRD Patterns +//===----------------------------------------------------------------------===// + +multiclass SMRD_Pattern { + + // 1. Offset as 8bit DWORD immediate + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), + (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) + >; + + // 2. Offset loaded in a 32bit SGPR + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), + (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + >; + + // 3. No offset at all + def : Pat < + (constant_load i64:$sbase), + (vt (Instr_IMM $sbase, 0)) + >; +} + +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; + +// 1. Offset as 8bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) +>; + +// 2. Offset loaded in a 32bit SGPR +def : Pat < + (SIload_constant v4i32:$sbase, imm:$offset), + (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) +>; + +//===----------------------------------------------------------------------===// +// SOP2 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i1 (xor i1:$src0, i1:$src1)), + (S_XOR_B64 $src0, $src1) +>; + +//===----------------------------------------------------------------------===// +// VOP2 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (or i64:$src0, i64:$src1), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub0), + (EXTRACT_SUBREG i64:$src1, sub0)), sub0), + (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub1), + (EXTRACT_SUBREG i64:$src1, sub1)), sub1) +>; + +class SextInReg : Pat < + (sext_inreg i32:$src0, vt), + (V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0)) +>; + +def : SextInReg ; +def : SextInReg ; + /********** ======================= **********/ /********** Image sampling patterns **********/ /********** ======================= **********/ /* SIsample for simple 1D texture lookup */ def : Pat < - (SIsample i32:$addr, v32i8:$rsrc, i128:$sampler, imm), + (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SamplePattern : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, imm), + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm), (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SampleRectPattern : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_RECT), + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT), (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SampleArrayPattern : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_ARRAY), + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SampleShadowPattern : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW), + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SampleShadowArrayPattern : Pat < - (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW_ARRAY),
(opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; @@ -1692,8 +1867,6 @@ def : BitConvert ; def : BitConvert ; def : BitConvert ; -def : BitConvert ; -def : BitConvert ; def : BitConvert ; def : BitConvert ; @@ -1711,10 +1884,18 @@ def : BitConvert ; /********** Src & Dst modifiers **********/ /********** =================== **********/ +def FCLAMP_SI : AMDGPUShaderInst < + (outs VReg_32:$dst), + (ins VSrc_32:$src0), + "FCLAMP_SI $dst, $src0", + [] +> { + let usesCustomInserter = 1; +} + def : Pat < (int_AMDIL_clamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)), - (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */), - 0 /* ABS */, 1 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */) + (FCLAMP_SI f32:$src) >; /********** ================================ **********/ @@ -1733,14 +1914,32 @@ def : Pat < (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */ >; +def FABS_SI : AMDGPUShaderInst < + (outs VReg_32:$dst), + (ins VSrc_32:$src0), + "FABS_SI $dst, $src0", + [] +> { + let usesCustomInserter = 1; +} + def : Pat < (fabs f32:$src), - (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) /* Clear sign bit */ + (FABS_SI f32:$src) >; +def FNEG_SI : AMDGPUShaderInst < + (outs VReg_32:$dst), + (ins VSrc_32:$src0), + "FNEG_SI $dst, $src0", + [] +> { + let usesCustomInserter = 1; +} + def : Pat < (fneg f32:$src), - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Toggle sign bit */ + (FNEG_SI f32:$src) >; /********** ================== **********/ @@ -1768,30 +1967,10 @@ def : Pat < >; def : Pat < - (i1 imm:$imm), - (S_MOV_B64 imm:$imm) ->; - -def : Pat < (i64 InlineImm:$imm), (S_MOV_B64 InlineImm:$imm) >; -// i64 immediates aren't supported in hardware, split it into two 32bit values -def : Pat < - (i64 imm:$imm), - (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (S_MOV_B32 (i32 (LO32 imm:$imm))), sub0), - (S_MOV_B32 (i32 (HI32 imm:$imm))), sub1) ->; - -def : Pat < - (f64 fpimm:$imm), - (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (V_MOV_B32_e32 (f32 (LO32f fpimm:$imm))), sub0), - (V_MOV_B32_e32 (f32 (HI32f fpimm:$imm))), sub1) ->; - /********** ===================== **********/ /********** Interpolation Patterns **********/ /********** ===================== **********/ @@ -1875,21 +2054,9 @@ class Ext32Pat : Pat < def : Ext32Pat ; def : Ext32Pat ; -// 1. Offset as 8bit DWORD immediate -def : Pat < - (SIload_constant i128:$sbase, IMM8bitDWORD:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) ->; - -// 2. Offset loaded in an 32bit SGPR -def : Pat < - (SIload_constant i128:$sbase, imm:$offset), - (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) ->; - -// 3.
Offset in an 32Bit VGPR +// Offset in a 32Bit VGPR def : Pat < - (SIload_constant i128:$sbase, i32:$voff), + (SIload_constant v4i32:$sbase, i32:$voff), (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0) >; @@ -1904,18 +2071,44 @@ def : Pat < def : Pat < (int_SI_tid), (V_MBCNT_HI_U32_B32_e32 0xffffffff, - (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0, 0, 0)) + (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0)) >; -/********** ================== **********/ -/********** VOP3 Patterns **********/ -/********** ================== **********/ +//===----------------------------------------------------------------------===// +// VOP3 Patterns +//===----------------------------------------------------------------------===// + +def : IMad24Pat; +def : UMad24Pat; + +def : Pat < + (fadd f64:$src0, f64:$src1), + (V_ADD_F64 $src0, $src1, (i64 0)) +>; + +def : Pat < + (fmul f64:$src0, f64:$src1), + (V_MUL_F64 $src0, $src1, (i64 0)) +>; + +def : Pat < + (mul i32:$src0, i32:$src1), + (V_MUL_LO_I32 $src0, $src1, (i32 0)) +>; + +def : Pat < + (mulhu i32:$src0, i32:$src1), + (V_MUL_HI_U32 $src0, $src1, (i32 0)) +>; def : Pat < - (f32 (fadd (fmul f32:$src0, f32:$src1), f32:$src2)), - (V_MAD_F32 $src0, $src1, $src2) + (mulhs i32:$src0, i32:$src1), + (V_MUL_HI_I32 $src0, $src1, (i32 0)) >; +defm : BFIPatterns ; +def : ROTRPattern ; + /********** ======================= **********/ /********** Load/Store Patterns **********/ /********** ======================= **********/ @@ -1962,41 +2155,6 @@ def : Pat <(atomic_load_add_local i32:$ptr, i32:$val), def : Pat <(atomic_load_sub_local i32:$ptr, i32:$val), (DS_SUB_U32_RTN 0, $ptr, $val, 0)>; -/********** ================== **********/ -/********** SMRD Patterns **********/ -/********** ================== **********/ - -multiclass SMRD_Pattern { - - // 1. Offset as 8bit DWORD immediate - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), - (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) - >; - - // 2. Offset loaded in an 32bit SGPR - def : Pat < - (constant_load (SIadd64bit32bit i64:$sbase, imm:$offset)), - (vt (Instr_SGPR $sbase, (S_MOV_B32 imm:$offset))) - >; - - // 3.
No offset at all - def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; -} - -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; - //===----------------------------------------------------------------------===// // MUBUF Patterns //===----------------------------------------------------------------------===// @@ -2083,7 +2241,7 @@ multiclass MUBUF_Load_Dword { def : Pat < - (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset, + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm:$offset, 0, 0, imm:$glc, imm:$slc, imm:$tfe)), (offset $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc), @@ -2091,7 +2249,7 @@ multiclass MUBUF_Load_Dword ; def : Pat < - (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset, + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm, 1, 0, imm:$glc, imm:$slc, imm:$tfe)), (offen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc), @@ -2099,7 +2257,7 @@ multiclass MUBUF_Load_Dword ; def : Pat < - (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset, + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm:$offset, 0, 1, imm:$glc, imm:$slc, imm:$tfe)), (idxen $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc), @@ -2107,7 +2265,7 @@ multiclass MUBUF_Load_Dword ; def : Pat < - (vt (int_SI_buffer_load_dword i128:$rsrc, v2i32:$vaddr, i32:$soffset, + (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, imm, 1, 1, imm:$glc, imm:$slc, imm:$tfe)), (bothen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc), @@ -2128,7 +2286,7 @@ defm : MUBUF_Load_Dword : Pat< - (SItbuffer_store i128:$rsrc, vt:$vdata, num_channels, i32:$vaddr, + (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, i32:$soffset, imm:$inst_offset, imm:$dfmt, imm:$nfmt, imm:$offen, imm:$idxen, imm:$glc, imm:$slc, imm:$tfe), @@ -2156,12 +2314,13 @@ defm V_CEIL_F64 : VOP1_64 <0x00000018, "V_CEIL_F64", defm V_FLOOR_F64 : VOP1_64 <0x0000001A, "V_FLOOR_F64", [(set f64:$dst, (ffloor f64:$src0))] >; +defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64", + [(set f64:$dst, (frint f64:$src0))] +>; -defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64", []>; - -def V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>; -def V_MQSAD_U16_U8 : VOP3_32 <0x000000172, "V_MQSAD_U16_U8", []>; -def V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>; +defm V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>; +defm V_MQSAD_U16_U8 : VOP3_32 <0x000000172, "V_MQSAD_U16_U8", []>; +defm V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>; def V_MAD_U64_U32 : VOP3_64 <0x00000176, "V_MAD_U64_U32", []>; // XXX - Does this set VCC? @@ -2248,17 +2407,43 @@ def : Pat< >; //===----------------------------------------------------------------------===// -// Miscellaneous Patterns +// Conversion Patterns //===----------------------------------------------------------------------===// +def : Pat<(i32 (sext_inreg i32:$src, i1)), + (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16 + +// TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it +// might not be worth the effort, and will need to expand to shifts when +// fixing SGPR copies. 
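For reference, the S_BFE_I32 immediates used by the sext_inreg patterns here pack the bit-field offset into the low bits and the field width into the bits from 16 up, which is where the 65536 values (0 | 1 << 16: width 1 at offset 0) come from. A minimal sketch of that packing, with a hypothetical helper name:

#include <cassert>
#include <cstdint>

// Pack an S_BFE-style bit-field immediate: offset in the low bits, width
// shifted up by 16, matching the "0 | 1 << 16" comments in the patterns.
static uint32_t packBFEImm(uint32_t Offset, uint32_t Width) {
  return Offset | (Width << 16);
}

int main() {
  // (sext_inreg $src, i1) selects to (S_BFE_I32 $src, 65536):
  // a one-bit field at offset zero, sign-extended.
  assert(packBFEImm(0, 1) == 65536);
  return 0;
}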
+ +// Handle sext_inreg in i64 +def : Pat < + (i64 (sext_inreg i64:$src, i1)), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (S_BFE_I32 (EXTRACT_SUBREG i64:$src, sub0), 65536), sub0), // 0 | 1 << 16 + (S_MOV_B32 -1), sub1) +>; + def : Pat < - (i64 (trunc i128:$x)), + (i64 (sext_inreg i64:$src, i8)), (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (i32 (EXTRACT_SUBREG $x, sub0)), sub0), - (i32 (EXTRACT_SUBREG $x, sub1)), sub1) + (S_SEXT_I32_I8 (EXTRACT_SUBREG i64:$src, sub0)), sub0), + (S_MOV_B32 -1), sub1) >; def : Pat < + (i64 (sext_inreg i64:$src, i16)), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (S_SEXT_I32_I16 (EXTRACT_SUBREG i64:$src, sub0)), sub0), + (S_MOV_B32 -1), sub1) +>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Patterns +//===----------------------------------------------------------------------===// + +def : Pat < (i32 (trunc i64:$a)), (EXTRACT_SUBREG $a, sub0) >; diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index c2f8696..6601f2a 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -67,7 +67,7 @@ private: static const unsigned SkipThreshold = 12; static char ID; - const TargetRegisterInfo *TRI; + const SIRegisterInfo *TRI; const SIInstrInfo *TII; bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); @@ -92,11 +92,11 @@ private: public: SILowerControlFlowPass(TargetMachine &tm) : - MachineFunctionPass(ID), TRI(0), TII(0) { } + MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { } - virtual bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const { + const char *getPassName() const override { return "SI Lower control flow instructions"; } @@ -427,7 +427,7 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { TII = static_cast(MF.getTarget().getInstrInfo()); - TRI = MF.getTarget().getRegisterInfo(); + TRI = static_cast(MF.getTarget().getRegisterInfo()); SIMachineFunctionInfo *MFI = MF.getInfo(); bool HaveKill = false; diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp new file mode 100644 index 0000000..738c90b --- /dev/null +++ b/lib/Target/R600/SILowerI1Copies.cpp @@ -0,0 +1,148 @@ +//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// i1 values are usually inserted by the CFG Structurize pass and they are +/// unique in that they can be copied from VALU to SALU registers. +/// This is not possible for any other value type. Since there are no +/// MOV instructions for i1, we need to use V_CMP_* and V_CNDMASK to move the i1.
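///
/// As an illustrative sketch (virtual register names invented here, not
/// taken from this patch), the two copy directions handled below become:
///
///   %flag:VReg_1  = COPY %cond:SReg_64
///     --> %flag = V_CNDMASK_B32_e64 0, -1, %cond   ; -1 in active lanes
///   %cond:SReg_64 = COPY %flag:VReg_1
///     --> %cond = V_CMP_NE_I32_e64 0, %flag        ; back to a wave mask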
+/// +//===----------------------------------------------------------------------===// +// + +#define DEBUG_TYPE "si-i1-copies" +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +namespace { + +class SILowerI1Copies : public MachineFunctionPass { +public: + static char ID; + +public: + SILowerI1Copies() : MachineFunctionPass(ID) { + initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnMachineFunction(MachineFunction &MF) override; + + virtual const char *getPassName() const override { + return "SI Lower i1 Copies"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) + +char SILowerI1Copies::ID = 0; + +char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID; + +FunctionPass *llvm::createSILowerI1CopiesPass() { + return new SILowerI1Copies(); +} + +bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = static_cast( + MF.getTarget().getInstrInfo()); + const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + std::vector I1Defs; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + if (MI.getOpcode() == AMDGPU::V_MOV_I1) { + I1Defs.push_back(MI.getOperand(0).getReg()); + MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32)); + continue; + } + + if (MI.getOpcode() == AMDGPU::V_AND_I1) { + I1Defs.push_back(MI.getOperand(0).getReg()); + MI.setDesc(TII->get(AMDGPU::V_AND_B32_e32)); + continue; + } + + if (MI.getOpcode() == AMDGPU::V_OR_I1) { + I1Defs.push_back(MI.getOperand(0).getReg()); + MI.setDesc(TII->get(AMDGPU::V_OR_B32_e32)); + continue; + } + + if (MI.getOpcode() != AMDGPU::COPY || + !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) || + !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg())) + continue; + + + const TargetRegisterClass *DstRC = + MRI.getRegClass(MI.getOperand(0).getReg()); + const TargetRegisterClass *SrcRC = + MRI.getRegClass(MI.getOperand(1).getReg()); + + if (DstRC == &AMDGPU::VReg_1RegClass && + TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) { + I1Defs.push_back(MI.getOperand(0).getReg()); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64)) + .addOperand(MI.getOperand(0)) + .addImm(0) + .addImm(-1) + .addOperand(MI.getOperand(1)) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0); + MI.eraseFromParent(); + } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && + SrcRC == &AMDGPU::VReg_1RegClass) { + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64)) + .addOperand(MI.getOperand(0)) +
.addImm(0) + .addOperand(MI.getOperand(1)) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0); + MI.eraseFromParent(); + } + } + } + + for (unsigned Reg : I1Defs) + MRI.setRegClass(Reg, &AMDGPU::VReg_32RegClass); + + return false; +} diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp index ea04346..af60995 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.cpp +++ b/lib/Target/R600/SIMachineFunctionInfo.cpp @@ -10,8 +10,11 @@ #include "SIMachineFunctionInfo.h" +#include "SIInstrInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" #define MAX_LANES 64 @@ -26,21 +29,57 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) PSInputAddr(0), SpillTracker() { } -static unsigned createLaneVGPR(MachineRegisterInfo &MRI) { - return MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); +static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) { + unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + + // We need to add this register as live out for the function, in order to + // have the live range calculated directly. + // + // When register spilling begins, we have already calculated the live + // intervals for all the registers.  Since we are spilling SGPRs to + // VGPRs, we need to update the Lane VGPR's live interval every time we + // spill or restore a register. + // + // Unfortunately, there is no good way to update the live interval as + // the TargetInstrInfo callbacks for spilling and restoring don't give + // us access to the live interval information. + // + // We are lucky, though, because the InlineSpiller calls + // LiveRangeEdit::calculateRegClassAndHint() which iterates through + // all the new registers that have been created when restoring a register + // and calls LiveIntervals::getInterval(), which creates and computes + // the live interval for the newly created register.  However, once this + // live interval is created, it doesn't change and since we usually reuse + // the Lane VGPR multiple times, this means any uses after the first aren't + // added to the live interval. + // + // To work around this, we add Lane VGPRs to the function's live-out list, + // so that we can guarantee its live range will cover all of its uses.
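  // The loop below implements that workaround: it attaches the new VGPR to
  // the program's final S_ENDPGM as an implicit use (register name
  // illustrative), e.g. "S_ENDPGM implicit %laneVGPR", which keeps the live
  // range open through the end of the program.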
+ + for (MachineBasicBlock &MBB : *MF) { + if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) { + MBB.back().addOperand(*MF, MachineOperand::CreateReg(VGPR, false, true)); + return VGPR; + } + } + MF->getFunction()->getContext().emitError( + "Could not find S_ENDPGM instruction."); + return VGPR; } -unsigned SIMachineFunctionInfo::RegSpillTracker::getNextLane(MachineRegisterInfo &MRI) { +unsigned SIMachineFunctionInfo::RegSpillTracker::reserveLanes( + MachineRegisterInfo &MRI, MachineFunction *MF, unsigned NumRegs) { + unsigned StartLane = CurrentLane; + CurrentLane += NumRegs; if (!LaneVGPR) { - LaneVGPR = createLaneVGPR(MRI); + LaneVGPR = createLaneVGPR(MRI, MF); } else { - CurrentLane++; - if (CurrentLane == MAX_LANES) { - CurrentLane = 0; - LaneVGPR = createLaneVGPR(MRI) + if (CurrentLane >= MAX_LANES) { + StartLane = CurrentLane = 0; + LaneVGPR = createLaneVGPR(MRI, MF); } } - return CurrentLane; + return StartLane; } void SIMachineFunctionInfo::RegSpillTracker::addSpilledReg(unsigned FrameIndex, diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h index 8dc82a0..96e619b 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.h +++ b/lib/Target/R600/SIMachineFunctionInfo.h @@ -25,7 +25,7 @@ class MachineRegisterInfo; /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo : public AMDGPUMachineFunction { - virtual void anchor(); + void anchor() override; public: struct SpilledReg { @@ -43,7 +43,12 @@ public: public: unsigned LaneVGPR; RegSpillTracker() : CurrentLane(0), SpilledRegisters(), LaneVGPR(0) { } - unsigned getNextLane(MachineRegisterInfo &MRI); + /// \p NumRegs The number of consecutive registers that need to be spilled. + /// This function will ensure that all registers are stored in + /// the same VGPR. + /// \returns The lane to be used for storing the first register.
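  /// Illustrative example (not spelled out in this patch): one SReg_128
  /// spill covers four 32-bit registers, so reserveLanes(MRI, MF, 4) hands
  /// back a start lane N with lanes N..N+3 of the current lane VGPR
  /// reserved; once the 64 lanes of a VGPR are exhausted, a fresh lane
  /// VGPR is created.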
+ unsigned reserveLanes(MachineRegisterInfo &MRI, MachineFunction *MF, + unsigned NumRegs = 1); void addSpilledReg(unsigned FrameIndex, unsigned Reg, int Lane = -1); const SpilledReg& getSpilledReg(unsigned FrameIndex); bool programSpillsRegisters() { return !SpilledRegisters.empty(); } diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index 6cef195..c72d549 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -71,13 +71,12 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { &AMDGPU::SReg_256RegClass }; - for (unsigned i = 0, e = sizeof(BaseClasses) / - sizeof(const TargetRegisterClass*); i != e; ++i) { - if (BaseClasses[i]->contains(Reg)) { - return BaseClasses[i]; + for (const TargetRegisterClass *BaseClass : BaseClasses) { + if (BaseClass->contains(Reg)) { + return BaseClass; } } - return NULL; + return nullptr; } bool SIRegisterInfo::isSGPRClass(const TargetRegisterClass *RC) const { @@ -113,7 +112,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) { return &AMDGPU::VReg_512RegClass; } - return NULL; + return nullptr; } const TargetRegisterClass *SIRegisterInfo::getSubRegClass( @@ -129,3 +128,10 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( return &AMDGPU::VGPR_32RegClass; } } + +unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, + const TargetRegisterClass *SubRC, + unsigned Channel) const { + unsigned Index = getHWRegIndex(Reg); + return SubRC->getRegister(Index + Channel); +} diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h index 8148f7f..36b4fcd 100644 --- a/lib/Target/R600/SIRegisterInfo.h +++ b/lib/Target/R600/SIRegisterInfo.h @@ -27,22 +27,22 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { SIRegisterInfo(AMDGPUTargetMachine &tm); - virtual BitVector getReservedRegs(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; - virtual unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const override; /// \param RC is an AMDIL reg class. /// /// \returns the SI register class that is equivalent to \p RC. - virtual const TargetRegisterClass * - getISARegClass(const TargetRegisterClass *RC) const; + const TargetRegisterClass * + getISARegClass(const TargetRegisterClass *RC) const override; /// \brief get the register class of the specified type to use in the /// CFGStructurizer - virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const; + const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; - virtual unsigned getHWRegIndex(unsigned Reg) const; + unsigned getHWRegIndex(unsigned Reg) const override; /// \brief Return the 'base' register class for this register. /// e.g. SGPR0 => SReg_32, VGPR => VReg_32 SGPR0_SGPR1 -> SReg_32, etc. @@ -63,6 +63,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { /// be returned. const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, unsigned SubIdx) const; + + /// \p Channel This is the register channel (e.g. a value from 0-16), not the + /// SubReg index. + /// \returns The sub-register of Reg that is in Channel. 
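  /// Illustrative example (assuming getHWRegIndex(SGPR4_SGPR5) is 4, which
  /// this patch does not spell out): getPhysRegSubReg(SGPR4_SGPR5,
  /// &SGPR_32RegClass, 1) resolves to SGPR5 via getRegister(4 + 1).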
+ unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, + unsigned Channel) const; }; } // End namespace llvm diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td index 65cf311..f1f01de 100644 --- a/lib/Target/R600/SIRegisterInfo.td +++ b/lib/Target/R600/SIRegisterInfo.td @@ -168,7 +168,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64, (add SGPR_64Regs, VCCReg, EXECReg) >; -def SReg_128 : RegisterClass<"AMDGPU", [i128, v4i32], 128, (add SGPR_128)>; +def SReg_128 : RegisterClass<"AMDGPU", [v4i32], 128, (add SGPR_128)>; def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>; @@ -183,14 +183,16 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> { let Size = 96; } -def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, i128], 128, (add VGPR_128)>; +def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>; def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>; def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; +def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>; + //===----------------------------------------------------------------------===// -// [SV]Src_* register classes, can have either an immediate or an register +// [SV]Src_(32|64) register classes, can have either an immediate or a register //===----------------------------------------------------------------------===// def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>; @@ -201,3 +203,9 @@ def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>; def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; +//===----------------------------------------------------------------------===// +// SGPR and VGPR register classes +//===----------------------------------------------------------------------===// + +def VSrc_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, + (add VReg_128, SReg_128)>; diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp index 9bf2caf..a0b6907 100644 --- a/lib/Target/R600/SITypeRewriter.cpp +++ b/lib/Target/R600/SITypeRewriter.cpp @@ -35,13 +35,13 @@ class SITypeRewriter : public FunctionPass, static char ID; Module *Mod; Type *v16i8; - Type *i128; + Type *v4i32; public: SITypeRewriter() : FunctionPass(ID) { } - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); - virtual const char *getPassName() const { + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { return "SI Type Rewriter"; } void visitLoadInst(LoadInst &I); @@ -56,7 +56,7 @@ char SITypeRewriter::ID = 0; bool SITypeRewriter::doInitialization(Module &M) { Mod = &M; v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); - i128 = Type::getIntNTy(M.getContext(), 128); + v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); return false; } @@ -84,7 +84,8 @@ void SITypeRewriter::visitLoadInst(LoadInst &I) { Type *ElemTy = PtrTy->getPointerElementType(); IRBuilder<> Builder(&I); if (ElemTy == v16i8) { - Value *BitCast = Builder.CreateBitCast(Ptr, Type::getIntNPtrTy(I.getContext(), 128, 2)); + Value *BitCast = Builder.CreateBitCast(Ptr, + PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); LoadInst *Load = Builder.CreateLoad(BitCast); SmallVector , 8> MD; I.getAllMetadataOtherThanDebugLoc(MD); @@ -99,6 +100,7 @@ void
SITypeRewriter::visitLoadInst(LoadInst &I) { void SITypeRewriter::visitCallInst(CallInst &I) { IRBuilder<> Builder(&I); + SmallVector Args; SmallVector Types; bool NeedToReplace = false; @@ -107,10 +109,10 @@ void SITypeRewriter::visitCallInst(CallInst &I) { for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { Value *Arg = I.getArgOperand(i); if (Arg->getType() == v16i8) { - Args.push_back(Builder.CreateBitCast(Arg, i128)); - Types.push_back(i128); + Args.push_back(Builder.CreateBitCast(Arg, v4i32)); + Types.push_back(v4i32); NeedToReplace = true; - Name = Name + ".i128"; + Name = Name + ".v4i32"; } else if (Arg->getType()->isVectorTy() && Arg->getType()->getVectorNumElements() == 1 && Arg->getType()->getVectorElementType() == @@ -144,12 +146,12 @@ void SITypeRewriter::visitCallInst(CallInst &I) { void SITypeRewriter::visitBitCast(BitCastInst &I) { IRBuilder<> Builder(&I); - if (I.getDestTy() != i128) { + if (I.getDestTy() != v4i32) { return; } if (BitCastInst *Op = dyn_cast(I.getOperand(0))) { - if (Op->getSrcTy() == i128) { + if (Op->getSrcTy() == v4i32) { I.replaceAllUsesWith(Op->getOperand(0)); I.eraseFromParent(); } diff --git a/lib/Target/Sparc/AsmParser/LLVMBuild.txt b/lib/Target/Sparc/AsmParser/LLVMBuild.txt index c3ddf5a..08fdc9d 100644 --- a/lib/Target/Sparc/AsmParser/LLVMBuild.txt +++ b/lib/Target/Sparc/AsmParser/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = SparcAsmParser parent = Sparc -required_libraries = MC MCParser Support SparcDesc SparcInfo +required_libraries = MC MCParser SparcDesc SparcInfo Support add_to_library_groups = Sparc diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 2ff6cdd..da88820 100644 --- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -49,15 +49,15 @@ class SparcAsmParser : public MCTargetAsmParser { bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SmallVectorImpl &Operands, MCStreamer &Out, unsigned &ErrorInfo, - bool MatchingInlineAsm); - bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); + bool MatchingInlineAsm) override; + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, - SmallVectorImpl &Operands); - bool ParseDirective(AsmToken DirectiveID); + SmallVectorImpl &Operands) override; + bool ParseDirective(AsmToken DirectiveID) override; - virtual unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, - unsigned Kind); + unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, + unsigned Kind) override; // Custom parse functions for Sparc specific operands. OperandMatchResultTy @@ -83,7 +83,8 @@ class SparcAsmParser : public MCTargetAsmParser { bool is64Bit() const { return STI.getTargetTriple().startswith("sparcv9"); } public: SparcAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, - const MCInstrInfo &MII) + const MCInstrInfo &MII, + const MCTargetOptions &Options) : MCTargetAsmParser(), STI(sti), Parser(parser) { // Initialize the set of available features. 
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); @@ -181,10 +182,10 @@ private: struct MemOp Mem; }; public: - bool isToken() const { return Kind == k_Token; } - bool isReg() const { return Kind == k_Register; } - bool isImm() const { return Kind == k_Immediate; } - bool isMem() const { return isMEMrr() || isMEMri(); } + bool isToken() const override { return Kind == k_Token; } + bool isReg() const override { return Kind == k_Register; } + bool isImm() const override { return Kind == k_Immediate; } + bool isMem() const override { return isMEMrr() || isMEMri(); } bool isMEMrr() const { return Kind == k_MemoryReg; } bool isMEMri() const { return Kind == k_MemoryImm; } @@ -203,7 +204,7 @@ public: return StringRef(Tok.Data, Tok.Length); } - unsigned getReg() const { + unsigned getReg() const override { assert((Kind == k_Register) && "Invalid access!"); return Reg.RegNum; } @@ -229,22 +230,22 @@ public: } /// getStartLoc - Get the location of the first token of this operand. - SMLoc getStartLoc() const { + SMLoc getStartLoc() const override { return StartLoc; } /// getEndLoc - Get the location of the last token of this operand. - SMLoc getEndLoc() const { + SMLoc getEndLoc() const override { return EndLoc; } - virtual void print(raw_ostream &OS) const { + void print(raw_ostream &OS) const override { switch (Kind) { case k_Token: OS << "Token: " << getToken() << "\n"; break; case k_Register: OS << "Reg: #" << getReg() << "\n"; break; case k_Immediate: OS << "Imm: " << getImm() << "\n"; break; case k_MemoryReg: OS << "Mem: " << getMemBase() << "+" << getMemOffsetReg() << "\n"; break; - case k_MemoryImm: assert(getMemOff() != 0); + case k_MemoryImm: assert(getMemOff() != nullptr); OS << "Mem: " << getMemBase() << "+" << *getMemOff() << "\n"; break; @@ -264,7 +265,7 @@ public: void addExpr(MCInst &Inst, const MCExpr *Expr) const{ // Add as immediate when possible. Null MCExpr = 0. 
- if (Expr == 0) + if (!Expr) Inst.addOperand(MCOperand::CreateImm(0)); else if (const MCConstantExpr *CE = dyn_cast(Expr)) Inst.addOperand(MCOperand::CreateImm(CE->getValue())); @@ -323,7 +324,7 @@ public: assert(Op->Reg.Kind == rk_FloatReg); unsigned regIdx = Reg - Sparc::F0; if (regIdx % 2 || regIdx > 31) - return 0; + return nullptr; Op->Reg.RegNum = DoubleRegs[regIdx / 2]; Op->Reg.Kind = rk_DoubleReg; return Op; @@ -337,13 +338,13 @@ public: case rk_FloatReg: regIdx = Reg - Sparc::F0; if (regIdx % 4 || regIdx > 31) - return 0; + return nullptr; Reg = QuadFPRegs[regIdx / 4]; break; case rk_DoubleReg: regIdx = Reg - Sparc::D0; if (regIdx % 2 || regIdx > 31) - return 0; + return nullptr; Reg = QuadFPRegs[regIdx / 2]; break; } @@ -357,7 +358,7 @@ public: Op->Kind = k_MemoryReg; Op->Mem.Base = Base; Op->Mem.OffsetReg = offsetReg; - Op->Mem.Off = 0; + Op->Mem.Off = nullptr; return Op; } @@ -564,7 +565,7 @@ parseMEMOperand(SmallVectorImpl &Operands) case AsmToken::Comma: case AsmToken::RBrac: case AsmToken::EndOfStatement: - Operands.push_back(SparcOperand::CreateMEMri(BaseReg, 0, S, E)); + Operands.push_back(SparcOperand::CreateMEMri(BaseReg, nullptr, S, E)); return MatchOperand_Success; case AsmToken:: Plus: @@ -574,7 +575,7 @@ parseMEMOperand(SmallVectorImpl &Operands) break; } - SparcOperand *Offset = 0; + SparcOperand *Offset = nullptr; OperandMatchResultTy ResTy = parseSparcAsmOperand(Offset); if (ResTy != MatchOperand_Success || !Offset) return MatchOperand_NoMatch; @@ -636,7 +637,7 @@ parseOperand(SmallVectorImpl &Operands, return MatchOperand_Success; } - SparcOperand *Op = 0; + SparcOperand *Op = nullptr; ResTy = parseSparcAsmOperand(Op, (Mnemonic == "call")); if (ResTy != MatchOperand_Success || !Op) @@ -656,7 +657,7 @@ SparcAsmParser::parseSparcAsmOperand(SparcOperand *&Op, bool isCall) SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); const MCExpr *EVal; - Op = 0; + Op = nullptr; switch (getLexer().getKind()) { default: break; diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp index 88fba39..f3441ff 100644 --- a/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/lib/Target/Sparc/DelaySlotFiller.cpp @@ -12,7 +12,6 @@ // NOP is placed. 
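// By way of illustration (generic SPARC, not taken from this patch): the
// instruction after a branch or call executes before the transfer happens,
// so a filled slot such as
//     call foo
//     mov 3, %o0      ! does useful work in the delay slot
// beats the fallback of placing a nop there.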
//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "delay-slot-filler" #include "Sparc.h" #include "SparcSubtarget.h" #include "llvm/ADT/SmallSet.h" @@ -27,6 +26,8 @@ using namespace llvm; +#define DEBUG_TYPE "delay-slot-filler" + STATISTIC(FilledSlots, "Number of delay slots filled"); static cl::opt DisableDelaySlotFiller( @@ -49,12 +50,12 @@ namespace { Subtarget(&TM.getSubtarget()) { } - virtual const char *getPassName() const { + const char *getPassName() const override { return "SPARC Delay Slot Filler"; } bool runOnMachineBasicBlock(MachineBasicBlock &MBB); - bool runOnMachineFunction(MachineFunction &F) { + bool runOnMachineFunction(MachineFunction &F) override { bool Changed = false; // This pass invalidates liveness information when it reorders diff --git a/lib/Target/Sparc/Disassembler/LLVMBuild.txt b/lib/Target/Sparc/Disassembler/LLVMBuild.txt index e7387cd..c27398f 100644 --- a/lib/Target/Sparc/Disassembler/LLVMBuild.txt +++ b/lib/Target/Sparc/Disassembler/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = SparcDisassembler parent = Sparc -required_libraries = MC Support SparcInfo +required_libraries = MC SparcInfo Support add_to_library_groups = Sparc diff --git a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp index 5cd99d6..4df0990 100644 --- a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp +++ b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp @@ -11,8 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sparc-disassembler" - #include "Sparc.h" #include "SparcRegisterInfo.h" #include "SparcSubtarget.h" @@ -23,6 +21,8 @@ using namespace llvm; +#define DEBUG_TYPE "sparc-disassembler" + typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { @@ -32,22 +32,18 @@ class SparcDisassembler : public MCDisassembler { public: /// Constructor - Initializes the disassembler. /// - SparcDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info) : - MCDisassembler(STI), RegInfo(Info) + SparcDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : + MCDisassembler(STI, Ctx) {} virtual ~SparcDisassembler() {} - const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); } - /// getInstruction - See MCDisassembler. 
- virtual DecodeStatus getInstruction(MCInst &instr, - uint64_t &size, - const MemoryObject ®ion, - uint64_t address, - raw_ostream &vStream, - raw_ostream &cStream) const; -private: - OwningPtr RegInfo; + DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject ®ion, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const override; }; } @@ -58,8 +54,9 @@ namespace llvm { static MCDisassembler *createSparcDisassembler( const Target &T, - const MCSubtargetInfo &STI) { - return new SparcDisassembler(STI, T.createMCRegInfo("")); + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new SparcDisassembler(STI, Ctx); } diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp index fabc125..261fb38 100644 --- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp +++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp @@ -11,15 +11,17 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "SparcInstPrinter.h" #include "Sparc.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "asm-printer" + // The generated AsmMatcher SparcGenAsmWriter uses "Sparc" as the target // namespace. But SPARC backend uses "SP" as its namespace. namespace llvm { diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h index 45ee6c0..8fe4075 100644 --- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h +++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h @@ -30,19 +30,21 @@ public: const MCSubtargetInfo &sti) : MCInstPrinter(MAI, MII, MRI), STI(sti) {} - virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; - virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; bool printSparcAliasInstr(const MCInst *MI, raw_ostream &OS); bool isV9() const; // Autogenerated by tblgen. 
void printInstruction(const MCInst *MI, raw_ostream &O); bool printAliasInstr(const MCInst *MI, raw_ostream &O); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); void printOperand(const MCInst *MI, int opNum, raw_ostream &OS); void printMemOperand(const MCInst *MI, int opNum, raw_ostream &OS, - const char *Modifier = 0); + const char *Modifier = nullptr); void printCCOperand(const MCInst *MI, int opNum, raw_ostream &OS); bool printGetPCX(const MCInst *MI, unsigned OpNo, raw_ostream &OS); diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index 39c9996..7d517b6 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -102,11 +102,11 @@ namespace { public: SparcAsmBackend(const Target &T) : MCAsmBackend(), TheTarget(T) {} - unsigned getNumFixupKinds() const { + unsigned getNumFixupKinds() const override { return Sparc::NumTargetFixupKinds; } - const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const { + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { const static MCFixupKindInfo Infos[Sparc::NumTargetFixupKinds] = { // name offset bits flags { "fixup_sparc_call30", 2, 30, MCFixupKindInfo::FKF_IsPCRel }, @@ -184,7 +184,7 @@ namespace { } } - bool mayNeedRelaxation(const MCInst &Inst) const { + bool mayNeedRelaxation(const MCInst &Inst) const override { // FIXME. return false; } @@ -194,17 +194,17 @@ namespace { bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const { + const MCAsmLayout &Layout) const override { // FIXME. assert(0 && "fixupNeedsRelaxation() unimplemented"); return false; } - void relaxInstruction(const MCInst &Inst, MCInst &Res) const { + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { // FIXME. assert(0 && "relaxInstruction() unimplemented"); } - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const { + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override { // Cannot emit NOP with size not multiple of 32 bits. if (Count % 4 != 0) return false; @@ -229,7 +229,7 @@ namespace { SparcAsmBackend(T), OSType(OSType) { } void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const { + uint64_t Value, bool IsPCRel) const override { Value = adjustFixupValue(Fixup.getKind(), Value); if (!Value) return; // Doesn't change encoding. @@ -244,7 +244,7 @@ namespace { } - MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(OSType); return createSparcELFObjectWriter(OS, is64Bit(), OSABI); } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp index ef5f8ce..6875fc6 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp @@ -32,7 +32,7 @@ SparcELFMCAsmInfo::SparcELFMCAsmInfo(StringRef TT) { Data16bitsDirective = "\t.half\t"; Data32bitsDirective = "\t.word\t"; // .xword is only supported by V9. - Data64bitsDirective = (isV9) ? "\t.xword\t" : 0; + Data64bitsDirective = (isV9) ? 
"\t.xword\t" : nullptr; ZeroDirective = "\t.skip\t"; CommentString = "!"; HasLEB128 = true; diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h index d53d09d..e126b68 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h @@ -20,15 +20,15 @@ namespace llvm { class StringRef; class SparcELFMCAsmInfo : public MCAsmInfoELF { - virtual void anchor(); + void anchor() override; public: explicit SparcELFMCAsmInfo(StringRef TT); - virtual const MCExpr* getExprForPersonalitySymbol(const MCSymbol *Sym, - unsigned Encoding, - MCStreamer &Streamer) const; - virtual const MCExpr* getExprForFDESymbol(const MCSymbol *Sym, - unsigned Encoding, - MCStreamer &Streamer) const; + const MCExpr* + getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, + MCStreamer &Streamer) const override; + const MCExpr* getExprForFDESymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const override; }; diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp index 310fbd9..b19ad7b 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mccodeemitter" #include "SparcMCExpr.h" #include "MCTargetDesc/SparcFixupKinds.h" #include "SparcMCTargetDesc.h" @@ -26,6 +25,8 @@ using namespace llvm; +#define DEBUG_TYPE "mccodeemitter" + STATISTIC(MCNumEmitted, "Number of MC instructions emitted"); namespace { @@ -41,7 +42,7 @@ public: void EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI) const override; // getBinaryCodeForInstr - TableGen'erated function for getting the // binary encoding for an instruction. 
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp index e6b2aca..ae57fdc 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sparcmcexpr" #include "SparcMCExpr.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -23,6 +22,8 @@ using namespace llvm; +#define DEBUG_TYPE "sparcmcexpr" + const SparcMCExpr* SparcMCExpr::Create(VariantKind Kind, const MCExpr *Expr, MCContext &Ctx) { diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h index be6526e..78dd945 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h @@ -85,15 +85,15 @@ public: Sparc::Fixups getFixupKind() const { return getFixupKind(Kind); } /// @} - void PrintImpl(raw_ostream &OS) const; + void PrintImpl(raw_ostream &OS) const override; bool EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const; - void AddValueSymbols(MCAssembler *) const; - const MCSection *FindAssociatedSection() const { + const MCAsmLayout *Layout) const override; + void AddValueSymbols(MCAssembler *) const override; + const MCSection *FindAssociatedSection() const override { return getSubExpr()->FindAssociatedSection(); } - void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const; + void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; static bool classof(const MCExpr *E) { return E->getKind() == MCExpr::Target; diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp index c69af56..571017d 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp @@ -22,6 +22,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + #define GET_INSTRINFO_MC_DESC #include "SparcGenInstrInfo.inc" @@ -31,14 +33,11 @@ #define GET_REGINFO_MC_DESC #include "SparcGenRegisterInfo.inc" -using namespace llvm; - - static MCAsmInfo *createSparcMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) { MCAsmInfo *MAI = new SparcELFMCAsmInfo(TT); unsigned Reg = MRI.getDwarfRegNum(SP::O6, true); - MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0); + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0); MAI->addInitialFrameState(Inst); return MAI; } @@ -47,7 +46,7 @@ static MCAsmInfo *createSparcV9MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) { MCAsmInfo *MAI = new SparcELFMCAsmInfo(TT); unsigned Reg = MRI.getDwarfRegNum(SP::O6, true); - MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 2047); + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 2047); MAI->addInitialFrameState(Inst); return MAI; } @@ -136,13 +135,12 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, static MCStreamer * createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useCFI, bool useDwarfDirectory, + bool isVerboseAsm, bool useDwarfDirectory, MCInstPrinter *InstPrint, MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst) { - MCStreamer *S = - llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory, - InstPrint, CE, TAB, ShowInst); + MCStreamer *S = llvm::createAsmStreamer( + Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, 
TAB, ShowInst); new SparcTargetAsmStreamer(*S, OS); return S; } diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp index 50506a6..1b7330e 100644 --- a/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "Sparc.h" #include "InstPrinter/SparcInstPrinter.h" #include "MCTargetDesc/SparcMCExpr.h" @@ -35,6 +34,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "asm-printer" + namespace { class SparcAsmPrinter : public AsmPrinter { SparcTargetStreamer &getTargetStreamer() { @@ -45,18 +46,18 @@ namespace { explicit SparcAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) : AsmPrinter(TM, Streamer) {} - virtual const char *getPassName() const { + const char *getPassName() const override { return "Sparc Assembly Printer"; } void printOperand(const MachineInstr *MI, int opNum, raw_ostream &OS); void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &OS, - const char *Modifier = 0); + const char *Modifier = nullptr); void printCCOperand(const MachineInstr *MI, int opNum, raw_ostream &OS); - virtual void EmitFunctionBodyStart(); - virtual void EmitInstruction(const MachineInstr *MI); - virtual void EmitEndOfAsmFile(Module &M); + void EmitFunctionBodyStart() override; + void EmitInstruction(const MachineInstr *MI) override; + void EmitEndOfAsmFile(Module &M) override; static const char *getRegisterName(unsigned RegNo) { return SparcInstPrinter::getRegisterName(RegNo); @@ -64,10 +65,10 @@ namespace { bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O); + raw_ostream &O) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O); + raw_ostream &O) override; void LowerGETPCXAndEmitMCInsts(const MachineInstr *MI, const MCSubtargetInfo &STI); diff --git a/lib/Target/Sparc/SparcCodeEmitter.cpp b/lib/Target/Sparc/SparcCodeEmitter.cpp index 4f8d477..247da2a 100644 --- a/lib/Target/Sparc/SparcCodeEmitter.cpp +++ b/lib/Target/Sparc/SparcCodeEmitter.cpp @@ -12,7 +12,6 @@ // //===---------------------------------------------------------------------===// -#define DEBUG_TYPE "jit" #include "Sparc.h" #include "MCTargetDesc/SparcMCExpr.h" #include "SparcRelocations.h" @@ -25,6 +24,8 @@ using namespace llvm; +#define DEBUG_TYPE "jit" + STATISTIC(NumEmitted, "Number of machine instructions emitted"); namespace { @@ -39,7 +40,7 @@ class SparcCodeEmitter : public MachineFunctionPass { const std::vector *MCPEs; bool IsPIC; - void getAnalysisUsage(AnalysisUsage &AU) const { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired (); MachineFunctionPass::getAnalysisUsage(AU); } @@ -48,13 +49,13 @@ class SparcCodeEmitter : public MachineFunctionPass { public: SparcCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce) - : MachineFunctionPass(ID), JTI(0), II(0), TD(0), - TM(tm), MCE(mce), MCPEs(0), + : MachineFunctionPass(ID), JTI(nullptr), II(nullptr), TD(nullptr), + TM(tm), MCE(mce), MCPEs(nullptr), IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} - bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "Sparc Machine Code Emitter"; } diff --git 
a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp index d96a4c0..a37da94 100644 --- a/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/lib/Target/Sparc/SparcFrameLowering.cpp @@ -109,18 +109,21 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF) const { // Emit ".cfi_def_cfa_register 30". unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, regFP)); - BuildMI(MBB, MBBI, dl, TII.get(SP::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); // Emit ".cfi_window_save". CFIIndex = MMI.addFrameInst(MCCFIInstruction::createWindowSave(nullptr)); - BuildMI(MBB, MBBI, dl, TII.get(SP::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); unsigned regInRA = MRI->getDwarfRegNum(SP::I7, true); unsigned regOutRA = MRI->getDwarfRegNum(SP::O7, true); // Emit ".cfi_register 15, 31". CFIIndex = MMI.addFrameInst( MCCFIInstruction::createRegister(nullptr, regOutRA, regInRA)); - BuildMI(MBB, MBBI, dl, TII.get(SP::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); } void SparcFrameLowering:: diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h index 072fde3..bda7b7c 100644 --- a/lib/Target/Sparc/SparcFrameLowering.h +++ b/lib/Target/Sparc/SparcFrameLowering.h @@ -31,17 +31,18 @@ public: /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. - void emitPrologue(MachineFunction &MF) const; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + void emitPrologue(MachineFunction &MF) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; + void + eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; - bool hasReservedCallFrame(const MachineFunction &MF) const; - bool hasFP(const MachineFunction &MF) const; + bool hasReservedCallFrame(const MachineFunction &MF) const override; + bool hasFP(const MachineFunction &MF) const override; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = NULL) const; + RegScavenger *RS = nullptr) const override; private: // Remap input registers to output registers for leaf procedure. diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp index b012bfd..2fade27 100644 --- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp +++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp @@ -41,7 +41,7 @@ public: TM(tm) { } - SDNode *Select(SDNode *N); + SDNode *Select(SDNode *N) override; // Complex Pattern Selectors. bool SelectADDRrr(SDValue N, SDValue &R1, SDValue &R2); @@ -49,11 +49,11 @@ public: /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. 
- virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, - std::vector &OutOps); + bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector &OutOps) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "SPARC DAG->DAG Pattern Instruction Selection"; } @@ -143,7 +143,7 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) { SDLoc dl(N); if (N->isMachineOpcode()) { N->setNodeId(-1); - return NULL; // Already selected. + return nullptr; // Already selected. } switch (N->getOpcode()) { diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 8e720ee..ef61466 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -53,7 +53,7 @@ static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { - static const uint16_t RegList[] = { + static const MCPhysReg RegList[] = { SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5 }; // Try to get first reg. @@ -235,8 +235,7 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, - &RetOps[0], RetOps.size()); + return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps); } // Lower return values for the 64-bit ABI. @@ -315,8 +314,7 @@ SparcTargetLowering::LowerReturn_64(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, - &RetOps[0], RetOps.size()); + return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps); } SDValue SparcTargetLowering:: @@ -357,10 +355,13 @@ LowerFormalArguments_32(SDValue Chain, const unsigned StackOffset = 92; - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + unsigned InIdx = 0; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i, ++InIdx) { CCValAssign &VA = ArgLocs[i]; - if (i == 0 && Ins[i].Flags.isSRet()) { + if (Ins[InIdx].Flags.isSRet()) { + if (InIdx != 0) + report_fatal_error("sparc only supports sret on the first parameter"); // Get SRet from [%fp+64]. int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, 64, true); SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32); @@ -493,11 +494,11 @@ LowerFormalArguments_32(SDValue Chain, // Store remaining ArgRegs to the stack if this is a varargs function. 
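// An aside on the mechanical rewrite running through all of these headers:
// C++11 'override' replaces the redundant 'virtual' on overriding
// declarations, so any signature drift from the base class becomes a compile
// error instead of a silent non-override. A minimal sketch of the pattern
// (Base/Derived are illustrative names, not from this patch):
struct Base {
  virtual ~Base() {}
  virtual const char *getPassName() const { return "base"; }
};
struct Derived : Base {
  // A dropped 'const' or a changed parameter type now fails to compile.
  const char *getPassName() const override { return "derived"; }
};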
if (isVarArg) { - static const uint16_t ArgRegs[] = { + static const MCPhysReg ArgRegs[] = { SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5 }; unsigned NumAllocated = CCInfo.getFirstUnallocated(ArgRegs, 6); - const uint16_t *CurArgReg = ArgRegs+NumAllocated, *ArgRegEnd = ArgRegs+6; + const MCPhysReg *CurArgReg = ArgRegs+NumAllocated, *ArgRegEnd = ArgRegs+6; unsigned ArgOffset = CCInfo.getNextStackOffset(); if (NumAllocated == 6) ArgOffset += StackOffset; @@ -528,8 +529,7 @@ LowerFormalArguments_32(SDValue Chain, if (!OutChains.empty()) { OutChains.push_back(Chain); - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &OutChains[0], OutChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } } @@ -644,8 +644,7 @@ LowerFormalArguments_64(SDValue Chain, } if (!OutChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - &OutChains[0], OutChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains); return Chain; } @@ -663,7 +662,7 @@ static bool hasReturnsTwiceAttr(SelectionDAG &DAG, SDValue Callee, if (CS) return CS->hasFnAttr(Attribute::ReturnsTwice); - const Function *CalleeFn = 0; + const Function *CalleeFn = nullptr; if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { CalleeFn = dyn_cast<Function>(G->getGlobal()); } else if (ExternalSymbolSDNode *E = @@ -877,8 +876,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, // Emit all stores, make sure they occur before any copies into physregs. if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token // chain and flag operands which copy the outgoing args into registers. @@ -927,7 +925,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, if (InFlag.getNode()) Ops.push_back(InFlag); - Chain = DAG.getNode(SPISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); + Chain = DAG.getNode(SPISD::CALL, dl, NodeTys, Ops); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, true), @@ -961,9 +959,9 @@ static bool isFP128ABICall(const char *CalleeName) "_Q_sqrt", "_Q_neg", "_Q_itoq", "_Q_stoq", "_Q_dtoq", "_Q_utoq", "_Q_lltoq", "_Q_ulltoq", - 0 + nullptr }; - for (const char * const *I = ABICalls; *I != 0; ++I) + for (const char * const *I = ABICalls; *I != nullptr; ++I) if (strcmp(CalleeName, *I) == 0) return true; return false; @@ -972,7 +970,7 @@ unsigned SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const { - const Function *CalleeFn = 0; + const Function *CalleeFn = nullptr; if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { CalleeFn = dyn_cast<Function>(G->getGlobal()); } else if (ExternalSymbolSDNode *E = @@ -1194,8 +1192,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, // Emit all stores, make sure they occur before the call. if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); // Build a sequence of CopyToReg nodes glued together with token chain and // glue operands which copy the outgoing args into registers. The InGlue is @@ -1245,7 +1242,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, // Now the call itself.
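// The '&Ops[0], Ops.size()' pairs above collapse to a bare 'Ops' because
// SelectionDAG::getNode now takes an ArrayRef<SDValue>, to which a
// SmallVector converts implicitly. A minimal sketch of the new call shape
// (joinChains is an illustrative helper, not from this patch):
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
static SDValue joinChains(SelectionDAG &DAG, SDLoc DL,
                          ArrayRef<SDValue> Chains) {
  // Old form: DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
  //                       &Chains[0], Chains.size());
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}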
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(SPISD::CALL, DL, NodeTys, &Ops[0], Ops.size()); + Chain = DAG.getNode(SPISD::CALL, DL, NodeTys, Ops); InGlue = Chain.getValue(1); // Revert the stack pointer immediately after the call. @@ -1263,7 +1260,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, // Set inreg flag manually for codegen generated library calls that // return float. - if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && CLI.CS == 0) + if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && CLI.CS == nullptr) CLI.Ins[0].Flags.setInReg(); RVInfo.AnalyzeCallResult(CLI.Ins, RetCC_Sparc64); @@ -1677,7 +1674,7 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM) const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { - default: return 0; + default: return nullptr; case SPISD::CMPICC: return "SPISD::CMPICC"; case SPISD::CMPFCC: return "SPISD::CMPFCC"; case SPISD::BRICC: return "SPISD::BRICC"; @@ -1711,7 +1708,7 @@ EVT SparcTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { /// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to /// be zero. Op is expected to be a target specific node. Used by DAG /// combiner. -void SparcTargetLowering::computeMaskedBitsForTargetNode +void SparcTargetLowering::computeKnownBitsForTargetNode (const SDValue Op, APInt &KnownZero, APInt &KnownOne, @@ -1725,10 +1722,8 @@ void SparcTargetLowering::computeMaskedBitsForTargetNode case SPISD::SELECT_ICC: case SPISD::SELECT_XCC: case SPISD::SELECT_FCC: - DAG.ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); - DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + DAG.computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); + DAG.computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); // Only known if known in both the LHS and RHS. KnownOne &= KnownOne2; @@ -1914,7 +1909,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op, assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); Ops.push_back(InFlag); - Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, &Ops[0], Ops.size()); + Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(1, true), DAG.getIntPtrConstant(0, true), InFlag, DL); @@ -2033,13 +2028,10 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG, for (unsigned i = 0, e = numArgs; i != e; ++i) { Chain = LowerF128_LibCallArg(Chain, Args, Op.getOperand(i), SDLoc(Op), DAG); } - TargetLowering:: - CallLoweringInfo CLI(Chain, - RetTyABI, - false, false, false, false, - 0, CallingConv::C, - false, false, true, - Callee, Args, DAG, SDLoc(Op)); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(SDLoc(Op)).setChain(Chain) + .setCallee(CallingConv::C, RetTyABI, Callee, &Args, 0); + std::pair CallInfo = LowerCallTo(CLI); // chain is in second result. 
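// The many-argument CallLoweringInfo constructor used above is gone; callers
// now default-construct it from the DAG and chain setters onto it. A minimal
// sketch of the idiom, mirroring the rewritten F128 libcall code (emitLibCall
// is an illustrative helper, not part of the patch):
#include "llvm/Target/TargetLowering.h"
using namespace llvm;
static std::pair<SDValue, SDValue>
emitLibCall(const TargetLowering &TLI, SelectionDAG &DAG, SDLoc DL,
            SDValue Chain, Type *RetTy, SDValue Callee,
            TargetLowering::ArgListTy &Args) {
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(DL).setChain(Chain)
     .setCallee(CallingConv::C, RetTy, Callee, &Args, 0);
  // First element is the call result, second the output chain.
  return TLI.LowerCallTo(CLI);
}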
@@ -2065,7 +2057,7 @@ SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS, SDLoc DL, SelectionDAG &DAG) const { - const char *LibCall = 0; + const char *LibCall = nullptr; bool is64Bit = Subtarget->is64Bit(); switch(SPCC) { default: llvm_unreachable("Unhandled conditional code!"); @@ -2092,13 +2084,9 @@ SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS, Chain = LowerF128_LibCallArg(Chain, Args, LHS, DL, DAG); Chain = LowerF128_LibCallArg(Chain, Args, RHS, DL, DAG); - TargetLowering:: - CallLoweringInfo CLI(Chain, - RetTy, - false, false, false, false, - 0, CallingConv::C, - false, false, true, - Callee, Args, DAG, DL); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL).setChain(Chain) + .setCallee(CallingConv::C, RetTy, Callee, &Args, 0); std::pair CallInfo = LowerCallTo(CLI); @@ -2174,7 +2162,7 @@ LowerF128_FPEXTEND(SDValue Op, SelectionDAG &DAG, TLI.getLibcallName(RTLIB::FPEXT_F32_F128), 1); llvm_unreachable("fpextend with non-float operand!"); - return SDValue(0, 0); + return SDValue(); } static SDValue @@ -2192,7 +2180,7 @@ LowerF128_FPROUND(SDValue Op, SelectionDAG &DAG, TLI.getLibcallName(RTLIB::FPROUND_F128_F32), 1); llvm_unreachable("fpround to non-float!"); - return SDValue(0, 0); + return SDValue(); } static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG, @@ -2213,7 +2201,7 @@ static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG, // Expand if the resulting type is illegal. if (!TLI.isTypeLegal(VT)) - return SDValue(0, 0); + return SDValue(); // Otherwise, Convert the fp value to integer in an FP register. if (VT == MVT::i32) @@ -2244,7 +2232,7 @@ static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG, // Expand if the operand type is illegal. if (!TLI.isTypeLegal(OpVT)) - return SDValue(0, 0); + return SDValue(); // Otherwise, Convert the int value to FP in an FP register. SDValue Tmp = DAG.getNode(ISD::BITCAST, dl, floatVT, Op.getOperand(0)); @@ -2262,7 +2250,7 @@ static SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG, // quad floating point instructions and the resulting type is legal. if (Op.getOperand(0).getValueType() != MVT::f128 || (hasHardQuad && TLI.isTypeLegal(VT))) - return SDValue(0, 0); + return SDValue(); assert(VT == MVT::i32 || VT == MVT::i64); @@ -2283,7 +2271,7 @@ static SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG, // Expand if it does not involve f128 or the target has support for // quad floating point instructions and the operand type is legal. if (Op.getValueType() != MVT::f128 || (hasHardQuad && TLI.isTypeLegal(OpVT))) - return SDValue(0, 0); + return SDValue(); return TLI.LowerF128Op(Op, DAG, TLI.getLibcallName(OpVT == MVT::i32 @@ -2428,7 +2416,7 @@ static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG, SDValue NewVal = DAG.getNode(ISD::ADD, dl, VT, NewSP, DAG.getConstant(regSpillArea, VT)); SDValue Ops[2] = { NewVal, Chain }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } @@ -2597,10 +2585,9 @@ static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG) SubRegOdd); SDValue OutChains[2] = { SDValue(Hi64.getNode(), 1), SDValue(Lo64.getNode(), 1) }; - SDValue OutChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &OutChains[0], 2); + SDValue OutChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); SDValue Ops[2] = {SDValue(InFP128,0), OutChain}; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } // Lower a f128 store into two f64 stores. 
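// 'return SDValue(0, 0);' becomes 'return SDValue();' in these hunks: the
// default constructor already yields the null node/result pair that the
// lowering hooks use to mean "fall back to default expansion". Sketch
// (lowerIfLegal is an illustrative helper, not from this patch):
#include "llvm/Target/TargetLowering.h"
using namespace llvm;
static SDValue lowerIfLegal(SDValue Op, const TargetLowering &TLI) {
  if (!TLI.isTypeLegal(Op.getValueType()))
    return SDValue(); // null result: let common code expand the node
  return Op;
}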
@@ -2644,8 +2631,7 @@ static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) { LoPtr, MachinePointerInfo(), false, false, alignment); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &OutChains[0], 2); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) { @@ -2726,7 +2712,7 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { SDValue Dst = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, Lo); SDValue Ops[2] = { Dst, Carry }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } // Custom lower UMULO/SMULO for SPARC. This code is similar to ExpandNode() @@ -2773,7 +2759,7 @@ static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG, DAG.DeleteNode(MulResult.getNode()); SDValue Ops[2] = { BottomHalf, TopHalf } ; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) { @@ -3092,7 +3078,7 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info, Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. - if (CallOperandVal == NULL) + if (!CallOperandVal) return CW_Default; // Look at the constraint type. @@ -3117,7 +3103,7 @@ LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const { - SDValue Result(0, 0); + SDValue Result(nullptr, 0); // Only support length 1 constraints for now. if (Constraint.length() > 1) diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h index f7b45d0..a24cc82 100644 --- a/lib/Target/Sparc/SparcISelLowering.h +++ b/lib/Target/Sparc/SparcISelLowering.h @@ -55,47 +55,47 @@ namespace llvm { const SparcSubtarget *Subtarget; public: SparcTargetLowering(TargetMachine &TM); - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - /// computeMaskedBitsForTargetNode - Determine which of the bits specified + /// computeKnownBitsForTargetNode - Determine which of the bits specified /// in Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. 
- virtual void computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth = 0) const; + void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; - virtual MachineBasicBlock * + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *MBB) const; + MachineBasicBlock *MBB) const override; - virtual const char *getTargetNodeName(unsigned Opcode) const; + const char *getTargetNodeName(unsigned Opcode) const override; - ConstraintType getConstraintType(const std::string &Constraint) const; + ConstraintType getConstraintType(const std::string &Constraint) const override; ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, - const char *constraint) const; + const char *constraint) const override; void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector &Ops, - SelectionDAG &DAG) const; + SelectionDAG &DAG) const override; std::pair - getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const; + getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const override; - virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; - virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; } + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } /// getSetCCResultType - Return the ISD::SETCC ValueType - virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const; + EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; - virtual SDValue + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const; + SmallVectorImpl &InVals) const override; SDValue LowerFormalArguments_32(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -109,20 +109,20 @@ namespace llvm { SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const; - virtual SDValue + SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const; + SmallVectorImpl &InVals) const override; SDValue LowerCall_32(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const; SDValue LowerCall_64(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const; - virtual SDValue + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, - SDLoc dl, SelectionDAG &DAG) const; + SDLoc dl, SelectionDAG &DAG) const override; SDValue LowerReturn_32(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl &Outs, @@ -156,15 +156,15 @@ namespace llvm { SDLoc DL, SelectionDAG &DAG) const; - bool ShouldShrinkFPConstant(EVT VT) const { + bool ShouldShrinkFPConstant(EVT VT) const override { // Do not shrink FP constpool if VT == MVT::f128. // (ldd, call _Q_fdtoq) is more expensive than two ldds. 
return VT != MVT::f128; } - virtual void ReplaceNodeResults(SDNode *N, + void ReplaceNodeResults(SDNode *N, SmallVectorImpl& Results, - SelectionDAG &DAG) const; + SelectionDAG &DAG) const override; MachineBasicBlock *expandSelectCC(MachineInstr *MI, MachineBasicBlock *BB, unsigned BROpcode) const; diff --git a/lib/Target/Sparc/SparcInstr64Bit.td b/lib/Target/Sparc/SparcInstr64Bit.td index a34ce26..54d8240 100644 --- a/lib/Target/Sparc/SparcInstr64Bit.td +++ b/lib/Target/Sparc/SparcInstr64Bit.td @@ -359,9 +359,9 @@ multiclass BranchOnReg cond, string OpcStr> { multiclass bpr_alias { def : InstAlias; + (NAPT I64Regs:$rs1, bprtarget16:$imm16), 0>; def : InstAlias; + (APT I64Regs:$rs1, bprtarget16:$imm16), 0>; } defm BPZ : BranchOnReg<0b001, "brz">; diff --git a/lib/Target/Sparc/SparcInstrAliases.td b/lib/Target/Sparc/SparcInstrAliases.td index 33c2aa1..d36f67b 100644 --- a/lib/Target/Sparc/SparcInstrAliases.td +++ b/lib/Target/Sparc/SparcInstrAliases.td @@ -281,12 +281,12 @@ defm : fp_cond_alias<"o", 0b1111>; // Instruction aliases for JMPL. // jmp addr -> jmpl addr, %g0 -def : InstAlias<"jmp $addr", (JMPLrr G0, MEMrr:$addr)>; -def : InstAlias<"jmp $addr", (JMPLri G0, MEMri:$addr)>; +def : InstAlias<"jmp $addr", (JMPLrr G0, MEMrr:$addr), 0>; +def : InstAlias<"jmp $addr", (JMPLri G0, MEMri:$addr), 0>; // call addr -> jmpl addr, %o7 -def : InstAlias<"call $addr", (JMPLrr O7, MEMrr:$addr)>; -def : InstAlias<"call $addr", (JMPLri O7, MEMri:$addr)>; +def : InstAlias<"call $addr", (JMPLrr O7, MEMrr:$addr), 0>; +def : InstAlias<"call $addr", (JMPLri O7, MEMri:$addr), 0>; // retl -> RETL 8 def : InstAlias<"retl", (RETL 8)>; diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp index abf6c17..8b2e6bc 100644 --- a/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/lib/Target/Sparc/SparcInstrInfo.cpp @@ -24,11 +24,10 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" -#define GET_INSTRINFO_CTOR_DTOR -#include "SparcGenInstrInfo.inc" - using namespace llvm; +#define GET_INSTRINFO_CTOR_DTOR +#include "SparcGenInstrInfo.inc" // Pin the vtable to this file. 
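// Every file in this patch moves '#define DEBUG_TYPE' from above the
// #includes to after them (and after 'using namespace llvm'): headers may now
// use DEBUG_TYPE themselves, so a definition that precedes the includes would
// leak into them. The resulting preamble, sketched on a made-up file:
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "asm-printer" // after the last #include

static void traceSomething() {
  // Printed only under -debug-only=asm-printer in asserts builds.
  DEBUG(dbgs() << "hello from " DEBUG_TYPE "\n");
}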
void SparcInstrInfo::anchor() {} @@ -162,10 +161,10 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, std::next(I)->eraseFromParent(); Cond.clear(); - FBB = 0; + FBB = nullptr; if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { - TBB = 0; + TBB = nullptr; I->eraseFromParent(); I = MBB.end(); UnCondBrIter = MBB.end(); @@ -285,7 +284,7 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, bool KillSrc) const { unsigned numSubRegs = 0; unsigned movOpc = 0; - const unsigned *subRegIdx = 0; + const unsigned *subRegIdx = nullptr; const unsigned DFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd }; const unsigned QFP_DFP_SubRegsIdx[] = { SP::sub_even64, SP::sub_odd64 }; @@ -329,11 +328,11 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else llvm_unreachable("Impossible reg-to-reg copy"); - if (numSubRegs == 0 || subRegIdx == 0 || movOpc == 0) + if (numSubRegs == 0 || subRegIdx == nullptr || movOpc == 0) return; const TargetRegisterInfo *TRI = &getRegisterInfo(); - MachineInstr *MovMI = 0; + MachineInstr *MovMI = nullptr; for (unsigned i = 0; i != numSubRegs; ++i) { unsigned Dst = TRI->getSubReg(DestReg, subRegIdx[i]); diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h index a86cbcb..3a1472e 100644 --- a/lib/Target/Sparc/SparcInstrInfo.h +++ b/lib/Target/Sparc/SparcInstrInfo.h @@ -45,52 +45,52 @@ public: /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// - virtual const SparcRegisterInfo &getRegisterInfo() const { return RI; } + const SparcRegisterInfo &getRegisterInfo() const { return RI; } /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has /// any side effects other than loading from the stack slot. - virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const; + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; /// isStoreToStackSlot - If the specified machine instruction is a direct /// store to a stack slot, return the virtual or physical register number of /// the source reg along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has /// any side effects other than storing to the stack slot. 
- virtual unsigned isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const; - - virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify = false) const ; - - virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; - - virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl<MachineOperand> &Cond, - DebugLoc DL) const; - - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const; - - virtual void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; - - virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify = false) const override ; + + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const override; + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; unsigned getGlobalBaseReg(MachineFunction *MF) const; }; diff --git a/lib/Target/Sparc/SparcJITInfo.cpp b/lib/Target/Sparc/SparcJITInfo.cpp index 959d12f..c775e9e 100644 --- a/lib/Target/Sparc/SparcJITInfo.cpp +++ b/lib/Target/Sparc/SparcJITInfo.cpp @@ -10,7 +10,6 @@ // This file implements the JIT interfaces for the Sparc target. // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "jit" #include "SparcJITInfo.h" #include "Sparc.h" #include "SparcRelocations.h" @@ -20,6 +19,8 @@ using namespace llvm; +#define DEBUG_TYPE "jit" + /// JITCompilerFunction - This contains the address of the JIT function used to /// compile a function lazily. static TargetJITInfo::JITCompilerFn JITCompilerFunction; diff --git a/lib/Target/Sparc/SparcJITInfo.h b/lib/Target/Sparc/SparcJITInfo.h index 9c6e488..ff1b43a 100644 --- a/lib/Target/Sparc/SparcJITInfo.h +++ b/lib/Target/Sparc/SparcJITInfo.h @@ -34,27 +34,27 @@ class SparcJITInfo : public TargetJITInfo { /// overwriting OLD with a branch to NEW. This is used for self-modifying /// code. /// - virtual void replaceMachineCodeForFunction(void *Old, void *New); + void replaceMachineCodeForFunction(void *Old, void *New) override; // getStubLayout - Returns the size and alignment of the largest call stub // on Sparc.
- virtual StubLayout getStubLayout(); + StubLayout getStubLayout() override; /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a /// small native function that simply calls the function at the specified /// address. - virtual void *emitFunctionStub(const Function *F, void *Fn, - JITCodeEmitter &JCE); + void *emitFunctionStub(const Function *F, void *Fn, + JITCodeEmitter &JCE) override; /// getLazyResolverFunction - Expose the lazy resolver to the JIT. - virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn); + LazyResolverFn getLazyResolverFunction(JITCompilerFn) override; /// relocate - Before the JIT can run a block of code that has been emitted, /// it must rewrite the code to contain the actual addresses of any /// referenced global symbols. - virtual void relocate(void *Function, MachineRelocation *MR, - unsigned NumRelocs, unsigned char *GOTBase); + void relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char *GOTBase) override; /// Initialize - Initialize internal stage for the function being JITted. void Initialize(const MachineFunction &MF, bool isPIC) { diff --git a/lib/Target/Sparc/SparcMCInstLower.cpp b/lib/Target/Sparc/SparcMCInstLower.cpp index 737e378..9e94d2c 100644 --- a/lib/Target/Sparc/SparcMCInstLower.cpp +++ b/lib/Target/Sparc/SparcMCInstLower.cpp @@ -34,7 +34,7 @@ static MCOperand LowerSymbolOperand(const MachineInstr *MI, SparcMCExpr::VariantKind Kind = (SparcMCExpr::VariantKind)MO.getTargetFlags(); - const MCSymbol *Symbol = 0; + const MCSymbol *Symbol = nullptr; switch(MO.getType()) { default: llvm_unreachable("Unknown type in LowerSymbolOperand"); diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp index f222382..dc1ec7c 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -25,11 +25,11 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetInstrInfo.h" +using namespace llvm; + #define GET_REGINFO_TARGET_DESC #include "SparcGenRegisterInfo.inc" -using namespace llvm; - static cl::opt ReserveAppRegisters("sparc-reserve-app-registers", cl::Hidden, cl::init(false), cl::desc("Reserve application registers (%g2-%g4)")); @@ -38,8 +38,8 @@ SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st) : SparcGenRegisterInfo(SP::O7), Subtarget(st) { } -const uint16_t* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) - const { +const MCPhysReg* +SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_SaveList; } diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h index 00b5a98..77f879a 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.h +++ b/lib/Target/Sparc/SparcRegisterInfo.h @@ -31,25 +31,26 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo { SparcRegisterInfo(SparcSubtarget &st); /// Code Generation virtual methods... 
- const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const; - const uint32_t* getCallPreservedMask(CallingConv::ID CC) const; + const MCPhysReg * + getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; + const uint32_t* getCallPreservedMask(CallingConv::ID CC) const override; const uint32_t* getRTCallPreservedMask(CallingConv::ID CC) const; - BitVector getReservedRegs(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const; + unsigned Kind) const override; void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS = NULL) const; + RegScavenger *RS = nullptr) const override; void processFunctionBeforeFrameFinalized(MachineFunction &MF, - RegScavenger *RS = NULL) const; + RegScavenger *RS = nullptr) const; // Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const; + unsigned getFrameRegister(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp index 190c575..eb36d29 100644 --- a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp +++ b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sparc-selectiondag-info" #include "SparcTargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "sparc-selectiondag-info" + SparcSelectionDAGInfo::SparcSelectionDAGInfo(const SparcTargetMachine &TM) : TargetSelectionDAGInfo(TM) { } diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp index 6fc9d56..e38fb02 100644 --- a/lib/Target/Sparc/SparcSubtarget.cpp +++ b/lib/Target/Sparc/SparcSubtarget.cpp @@ -16,12 +16,14 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +#define DEBUG_TYPE "sparc-subtarget" + #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "SparcGenSubtargetInfo.inc" -using namespace llvm; - void SparcSubtarget::anchor() { } SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU, diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 83f3474..2469d93 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -77,8 +77,8 @@ public: return getTM<SparcTargetMachine>(); } - virtual bool addInstSelector(); - virtual bool addPreEmitPass(); + bool addInstSelector() override; + bool addPreEmitPass() override; }; } // namespace diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h index 8c9bcd3..7d04338 100644 --- a/lib/Target/Sparc/SparcTargetMachine.h +++ b/lib/Target/Sparc/SparcTargetMachine.h @@ -40,28 +40,28 @@ public: Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool is64bit); - virtual const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; } - virtual const TargetFrameLowering *getFrameLowering() const { + const SparcInstrInfo *getInstrInfo() const override { return &InstrInfo; } + const TargetFrameLowering *getFrameLowering() const override { return &FrameLowering; } - virtual const SparcSubtarget *getSubtargetImpl() const{ return &Subtarget; } - virtual const SparcRegisterInfo *getRegisterInfo() const { + const SparcSubtarget *getSubtargetImpl() const
override { return &Subtarget; } + const SparcRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); } - virtual const SparcTargetLowering* getTargetLowering() const { + const SparcTargetLowering* getTargetLowering() const override { return &TLInfo; } - virtual const SparcSelectionDAGInfo* getSelectionDAGInfo() const { + const SparcSelectionDAGInfo* getSelectionDAGInfo() const override { return &TSInfo; } - virtual SparcJITInfo *getJITInfo() { + SparcJITInfo *getJITInfo() override { return &JITInfo; } - virtual const DataLayout *getDataLayout() const { return &DL; } + const DataLayout *getDataLayout() const override { return &DL; } // Pass Pipeline Configuration - virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); - virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE); + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override; }; /// SparcV8TargetMachine - Sparc 32-bit target machine diff --git a/lib/Target/Sparc/SparcTargetObjectFile.cpp b/lib/Target/Sparc/SparcTargetObjectFile.cpp index f1630e0..32b2240 100644 --- a/lib/Target/Sparc/SparcTargetObjectFile.cpp +++ b/lib/Target/Sparc/SparcTargetObjectFile.cpp @@ -28,7 +28,7 @@ const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference( // Add information about the stub reference to ELFMMI so that the stub // gets emitted by the asmprinter. MachineModuleInfoImpl::StubValueTy &StubSym = ELFMMI.getGVStubEntry(SSym); - if (StubSym.getPointer() == 0) { + if (!StubSym.getPointer()) { MCSymbol *Sym = TM.getSymbol(GV, Mang); StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage()); } diff --git a/lib/Target/Sparc/SparcTargetStreamer.h b/lib/Target/Sparc/SparcTargetStreamer.h index 503ebd9..3767d8e 100644 --- a/lib/Target/Sparc/SparcTargetStreamer.h +++ b/lib/Target/Sparc/SparcTargetStreamer.h @@ -31,8 +31,8 @@ class SparcTargetAsmStreamer : public SparcTargetStreamer { public: SparcTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); - virtual void emitSparcRegisterIgnore(unsigned reg); - virtual void emitSparcRegisterScratch(unsigned reg); + void emitSparcRegisterIgnore(unsigned reg) override; + void emitSparcRegisterScratch(unsigned reg) override; }; @@ -41,8 +41,8 @@ class SparcTargetELFStreamer : public SparcTargetStreamer { public: SparcTargetELFStreamer(MCStreamer &S); MCELFStreamer &getStreamer(); - virtual void emitSparcRegisterIgnore(unsigned reg) {} - virtual void emitSparcRegisterScratch(unsigned reg) {} + void emitSparcRegisterIgnore(unsigned reg) override {} + void emitSparcRegisterScratch(unsigned reg) override {} }; } // end namespace llvm diff --git a/lib/Target/SystemZ/AsmParser/LLVMBuild.txt b/lib/Target/SystemZ/AsmParser/LLVMBuild.txt index 0b97e71..602898e 100644 --- a/lib/Target/SystemZ/AsmParser/LLVMBuild.txt +++ b/lib/Target/SystemZ/AsmParser/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = SystemZAsmParser parent = SystemZ -required_libraries = SystemZDesc SystemZInfo MC MCParser Support +required_libraries = MC MCParser Support SystemZDesc SystemZInfo add_to_library_groups = SystemZ diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index a3dd4b6..71de64f 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -110,7 +110,7 @@ private: void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediates when 
possible. Null MCExpr = 0. - if (Expr == 0) + if (!Expr) Inst.addOperand(MCOperand::CreateImm(0)); else if (auto *CE = dyn_cast<MCConstantExpr>(Expr)) Inst.addOperand(MCOperand::CreateImm(CE->getValue())); @@ -208,7 +208,7 @@ public: return (Kind == KindMem && Mem.RegKind == RegKind && (MemKind == BDXMem || !Mem.Index) && (MemKind == BDLMem) == (Mem.Length != nullptr)); } bool isMemDisp12(RegisterKind RegKind, MemoryKind MemKind) const { return isMem(RegKind, MemKind) && inRange(Mem.Disp, 0, 0xfff); @@ -331,7 +331,8 @@ private: public: SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, - const MCInstrInfo &MII) + const MCInstrInfo &MII, + const MCTargetOptions &Options) : MCTargetAsmParser(), STI(sti), Parser(parser) { MCAsmParserExtension::Initialize(Parser); @@ -526,7 +527,7 @@ bool SystemZAsmParser::parseAddress(unsigned &Base, const MCExpr *&Disp, // Parse the optional base and index. Index = 0; Base = 0; - Length = 0; + Length = nullptr; if (getLexer().is(AsmToken::LParen)) { Parser.Lex(); @@ -758,7 +759,7 @@ parseAccessReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { return MatchOperand_NoMatch; Register Reg; - if (parseRegister(Reg, RegAccess, 0)) + if (parseRegister(Reg, RegAccess, nullptr)) return MatchOperand_ParseFail; Operands.push_back(SystemZOperand::createAccessReg(Reg.Num, diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp index 59a1fe9..2350776 100644 --- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp +++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp @@ -17,13 +17,15 @@ using namespace llvm; +#define DEBUG_TYPE "systemz-disassembler" + typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { class SystemZDisassembler : public MCDisassembler { public: - SystemZDisassembler(const MCSubtargetInfo &STI) - : MCDisassembler(STI) {} + SystemZDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) + : MCDisassembler(STI, Ctx) {} virtual ~SystemZDisassembler() {} // Override MCDisassembler.
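// MCDisassembler subclasses now receive an MCContext at construction, so each
// target's factory function grows a third parameter and forwards it, as the
// next hunk shows for SystemZ. The shape of the change, on an invented
// target (MyDisassembler and the decode stub are illustrative only):
#include "llvm/MC/MCDisassembler.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
namespace {
class MyDisassembler : public MCDisassembler {
public:
  MyDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
      : MCDisassembler(STI, Ctx) {} // Ctx is the new parameter
  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
                              const MemoryObject &Region, uint64_t Address,
                              raw_ostream &VStream,
                              raw_ostream &CStream) const override {
    return Fail; // decoding elided in this sketch
  }
};
} // end anonymous namespace
static MCDisassembler *createMyDisassembler(const Target &T,
                                            const MCSubtargetInfo &STI,
                                            MCContext &Ctx) {
  return new MyDisassembler(STI, Ctx);
}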
@@ -35,8 +37,9 @@ public: } // end anonymous namespace static MCDisassembler *createSystemZDisassembler(const Target &T, - const MCSubtargetInfo &STI) { - return new SystemZDisassembler(STI); + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new SystemZDisassembler(STI, Ctx); } extern "C" void LLVMInitializeSystemZDisassembler() { diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp index e1e64d3..d2ba9b6 100644 --- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp +++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" - #include "SystemZInstPrinter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInstrInfo.h" @@ -16,6 +14,8 @@ using namespace llvm; +#define DEBUG_TYPE "asm-printer" + #include "SystemZGenAsmWriter.inc" void SystemZInstPrinter::printAddress(unsigned Base, int64_t Disp, diff --git a/lib/Target/SystemZ/MCTargetDesc/LLVMBuild.txt b/lib/Target/SystemZ/MCTargetDesc/LLVMBuild.txt index cbdb59c..dabd214 100644 --- a/lib/Target/SystemZ/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/SystemZ/MCTargetDesc/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = SystemZDesc parent = SystemZ -required_libraries = MC SystemZAsmPrinter SystemZInfo Support +required_libraries = MC Support SystemZAsmPrinter SystemZInfo add_to_library_groups = SystemZ diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp index df50863..27b4bd8 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mccodeemitter" #include "MCTargetDesc/SystemZMCTargetDesc.h" #include "MCTargetDesc/SystemZMCFixups.h" #include "llvm/MC/MCCodeEmitter.h" @@ -21,6 +20,8 @@ using namespace llvm; +#define DEBUG_TYPE "mccodeemitter" + namespace { class SystemZMCCodeEmitter : public MCCodeEmitter { const MCInstrInfo &MCII; diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp index 54c6987..c6a1816 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp @@ -82,7 +82,7 @@ static unsigned getPLTReloc(unsigned Kind) { unsigned SystemZObjectWriter::GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - MCSymbolRefExpr::VariantKind Modifier = Fixup.getAccessVariant(); + MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); unsigned Kind = Fixup.getKind(); switch (Modifier) { case MCSymbolRefExpr::VK_None: diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index 8d1bac9..cc94869 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -16,6 +16,8 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + #define GET_INSTRINFO_MC_DESC #include "SystemZGenInstrInfo.inc" @@ -25,8 +27,6 @@ #define GET_REGINFO_MC_DESC #include "SystemZGenRegisterInfo.inc" -using namespace llvm; - const unsigned SystemZMC::GR32Regs[16] = { SystemZ::R0L, SystemZ::R1L, SystemZ::R2L, 
SystemZ::R3L, SystemZ::R4L, SystemZ::R5L, SystemZ::R6L, SystemZ::R7L, @@ -98,7 +98,8 @@ static MCAsmInfo *createSystemZMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) { MCAsmInfo *MAI = new SystemZMCAsmInfo(TT); MCCFIInstruction Inst = - MCCFIInstruction::createDefCfa(0, MRI.getDwarfRegNum(SystemZ::R15D, true), + MCCFIInstruction::createDefCfa(nullptr, + MRI.getDwarfRegNum(SystemZ::R15D, true), SystemZMC::CFAOffsetFromInitialSP); MAI->addInitialFrameState(Inst); return MAI; diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp index fdf80a9..dc210d6 100644 --- a/lib/Target/SystemZ/SystemZElimCompare.cpp +++ b/lib/Target/SystemZ/SystemZElimCompare.cpp @@ -13,8 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "systemz-elim-compare" - #include "SystemZTargetMachine.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -28,6 +26,8 @@ using namespace llvm; +#define DEBUG_TYPE "systemz-elim-compare" + STATISTIC(BranchOnCounts, "Number of branch-on-count instructions"); STATISTIC(EliminatedComparisons, "Number of eliminated comparisons"); STATISTIC(FusedComparisons, "Number of fused compare-and-branch instructions"); @@ -64,14 +64,14 @@ class SystemZElimCompare : public MachineFunctionPass { public: static char ID; SystemZElimCompare(const SystemZTargetMachine &tm) - : MachineFunctionPass(ID), TII(0), TRI(0) {} + : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {} const char *getPassName() const override { return "SystemZ Comparison Elimination"; } bool processBlock(MachineBasicBlock &MBB); - bool runOnMachineFunction(MachineFunction &F); + bool runOnMachineFunction(MachineFunction &F) override; private: Reference getRegReferences(MachineInstr *MI, unsigned Reg); diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp index c856955..65f3caf 100644 --- a/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -93,7 +93,7 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // save and restore the stack pointer at the same time, via STMG and LMG. // This allows the deallocation to be done by the LMG, rather than needing // a separate %r15 addition. - const uint16_t *CSRegs = TRI->getCalleeSavedRegs(&MF); + const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); for (unsigned I = 0; CSRegs[I]; ++I) { unsigned Reg = CSRegs[I]; if (SystemZ::GR64BitRegClass.contains(Reg) && MRI.isPhysRegUsed(Reg)) { diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index f46eb16..24f7584 100644 --- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -19,6 +19,8 @@ using namespace llvm; +#define DEBUG_TYPE "systemz-isel" + namespace { // Used to build addressing modes. 
struct SystemZAddressingMode { @@ -72,14 +74,14 @@ struct SystemZAddressingMode { errs() << "SystemZAddressingMode " << this << '\n'; errs() << " Base "; - if (Base.getNode() != 0) + if (Base.getNode()) Base.getNode()->dump(); else errs() << "null\n"; if (hasIndexField()) { errs() << " Index "; - if (Index.getNode() != 0) + if (Index.getNode()) Index.getNode()->dump(); else errs() << "null\n"; @@ -663,7 +665,7 @@ bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op, uint64_t Used = allOnes(Op.getValueType().getSizeInBits()); if (Used != (AndMask | InsertMask)) { APInt KnownZero, KnownOne; - CurDAG->ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne); + CurDAG->computeKnownBits(Op.getOperand(0), KnownZero, KnownOne); if (Used != (AndMask | InsertMask | KnownZero.getZExtValue())) return false; } @@ -712,7 +714,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const { // been removed from the mask. See if adding them back in makes the // mask suitable. APInt KnownZero, KnownOne; - CurDAG->ComputeMaskedBits(Input, KnownZero, KnownOne); + CurDAG->computeKnownBits(Input, KnownZero, KnownOne); Mask |= KnownZero.getZExtValue(); if (!refineRxSBGMask(RxSBG, Mask)) return false; @@ -736,7 +738,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const { // been removed from the mask. See if adding them back in makes the // mask suitable. APInt KnownZero, KnownOne; - CurDAG->ComputeMaskedBits(Input, KnownZero, KnownOne); + CurDAG->computeKnownBits(Input, KnownZero, KnownOne); Mask &= ~KnownOne.getZExtValue(); if (!refineRxSBGMask(RxSBG, Mask)) return false; @@ -867,12 +869,12 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) { if (RISBG.Input.getOpcode() != ISD::ANY_EXTEND) Count += 1; if (Count == 0) - return 0; + return nullptr; if (Count == 1) { // Prefer to use normal shift instructions over RISBG, since they can handle // all cases and are sometimes shorter. if (N->getOpcode() != ISD::AND) - return 0; + return nullptr; // Prefer register extensions like LLC over RISBG. Also prefer to start // out with normal ANDs if one instruction would be enough. We can convert @@ -889,7 +891,7 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) { N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), NewMask); return SelectCode(N); } - return 0; + return nullptr; } } @@ -927,7 +929,7 @@ SDNode *SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) { // Do nothing if neither operand is suitable. if (Count[0] == 0 && Count[1] == 0) - return 0; + return nullptr; // Pick the deepest second operand. unsigned I = Count[0] > Count[1] ? 0 : 1; @@ -937,7 +939,7 @@ SDNode *SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) { if (Opcode == SystemZ::ROSBG && (RxSBG[I].Mask & 0xff) == 0) if (auto *Load = dyn_cast(Op0.getNode())) if (Load->getMemoryVT() == MVT::i8) - return 0; + return nullptr; // See whether we can avoid an AND in the first operand by converting // ROSBG to RISBG. @@ -986,8 +988,8 @@ bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store, return true; // Otherwise we need to check whether there's an alias. 
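// 'getSrcValue()' gives way to 'getMemOperand()->getValue()' in the next
// hunk: the IR Value is now read off the node's MachineMemOperand rather than
// a deprecated accessor. Sketch (sameUnderlyingValue is an illustrative
// helper, not from this patch):
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;
static bool sameUnderlyingValue(LoadSDNode *Load, StoreSDNode *Store) {
  const Value *V1 = Load->getMemOperand()->getValue();
  const Value *V2 = Store->getMemOperand()->getValue();
  return V1 && V2 && V1 == V2; // null means "unknown", so treat as different
}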
- const Value *V1 = Load->getSrcValue(); - const Value *V2 = Store->getSrcValue(); + const Value *V1 = Load->getMemOperand()->getValue(); + const Value *V2 = Store->getMemOperand()->getValue(); if (!V1 || !V2) return false; @@ -1037,11 +1039,11 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) { if (Node->isMachineOpcode()) { DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); Node->setNodeId(-1); - return 0; + return nullptr; } unsigned Opcode = Node->getOpcode(); - SDNode *ResNode = 0; + SDNode *ResNode = nullptr; switch (Opcode) { case ISD::OR: if (Node->getOperand(1).getOpcode() != ISD::Constant) @@ -1114,7 +1116,7 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) { ResNode = SelectCode(Node); DEBUG(errs() << "=> "; - if (ResNode == NULL || ResNode == Node) + if (ResNode == nullptr || ResNode == Node) Node->dump(CurDAG); else ResNode->dump(CurDAG); diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 714b6c9..6fe1fb9 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -11,8 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "systemz-lower" - #include "SystemZISelLowering.h" #include "SystemZCallingConv.h" #include "SystemZConstantPoolValue.h" @@ -26,6 +24,8 @@ using namespace llvm; +#define DEBUG_TYPE "systemz-lower" + namespace { // Represents a sequence for extracting a 0/1 value from an IPM result: // (((X ^ XORValue) + AddValue) >> Bit) @@ -424,7 +424,7 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info, Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. - if (CallOperandVal == NULL) + if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. @@ -492,7 +492,7 @@ parseRegisterNumber(const std::string &Constraint, if (Index < 16 && Map[Index]) return std::make_pair(Map[Index], RC); } - return std::make_pair(0u, static_cast(0)); + return std::make_pair(0U, nullptr); } std::pair SystemZTargetLowering:: @@ -772,8 +772,8 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, } // Join the stores, which are independent of one another. Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - &MemOps[NumFixedFPRs], - SystemZ::NumArgFPRs - NumFixedFPRs); + makeArrayRef(&MemOps[NumFixedFPRs], + SystemZ::NumArgFPRs-NumFixedFPRs)); } } @@ -875,8 +875,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Join the stores, which are independent of one another. if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); // Accept direct calls by converting symbolic call addresses to the // associated Target* opcodes. Force %r1 to be used for indirect @@ -919,8 +918,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Emit the call. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); if (IsTailCall) - return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, &Ops[0], Ops.size()); - Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, &Ops[0], Ops.size()); + return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, Ops); + Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, Ops); Glue = Chain.getValue(1); // Mark the end of the call, which is glued to the call itself. 
@@ -996,8 +995,7 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, if (Glue.getNode()) RetOps.push_back(Glue); - return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, - RetOps.data(), RetOps.size()); + return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, RetOps); } SDValue SystemZTargetLowering:: @@ -1489,7 +1487,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, Comparison &C) { // Check whether the nonconstant input is an AND with a constant mask. Comparison NewC(C); uint64_t MaskVal; - ConstantSDNode *Mask = 0; + ConstantSDNode *Mask = nullptr; if (C.Op0.getOpcode() == ISD::AND) { NewC.Op0 = C.Op0.getOperand(0); NewC.Op1 = C.Op0.getOperand(1); @@ -1779,7 +1777,7 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op, Ops.push_back(Glue); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); - return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, &Ops[0], Ops.size()); + return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops); } SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node, @@ -1971,7 +1969,7 @@ SDValue SystemZTargetLowering::lowerVASTART(SDValue Op, false, false, 0); Offset += 8; } - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps, NumFields); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op, @@ -2012,7 +2010,7 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust); SDValue Ops[2] = { Result, Chain }; - return DAG.getMergeValues(Ops, 2, DL); + return DAG.getMergeValues(Ops, DL); } SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op, @@ -2054,7 +2052,7 @@ SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op, SDValue NegSum = DAG.getNode(ISD::ADD, DL, VT, NegLLTimesRH, NegLHTimesRL); Ops[1] = DAG.getNode(ISD::SUB, DL, VT, Ops[1], NegSum); } - return DAG.getMergeValues(Ops, 2, DL); + return DAG.getMergeValues(Ops, DL); } SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op, @@ -2073,7 +2071,7 @@ SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op, // low half first, so the results are in reverse order. lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64, Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); - return DAG.getMergeValues(Ops, 2, DL); + return DAG.getMergeValues(Ops, DL); } SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op, @@ -2100,7 +2098,7 @@ SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op, SDValue Ops[2]; lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, Opcode, Op0, Op1, Ops[1], Ops[0]); - return DAG.getMergeValues(Ops, 2, DL); + return DAG.getMergeValues(Ops, DL); } SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op, @@ -2118,7 +2116,7 @@ SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op, else lowerGR128Binary(DAG, DL, VT, SystemZ::ZEXT128_64, SystemZISD::UDIVREM64, Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); - return DAG.getMergeValues(Ops, 2, DL); + return DAG.getMergeValues(Ops, DL); } SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const { @@ -2127,8 +2125,8 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const { // Get the known-zero masks for each operand. 
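// ComputeMaskedBits is renamed computeKnownBits throughout (same signature,
// clearer name: the bits are "known", nothing is masked), which is what the
// known-zero computation just below relies on. Sketch (upperHalfKnownZero is
// an illustrative helper):
#include "llvm/CodeGen/SelectionDAG.h"
#include <cassert>
using namespace llvm;
static bool upperHalfKnownZero(SelectionDAG &DAG, SDValue V) {
  assert(V.getValueType() == MVT::i64 && "sketch assumes a 64-bit value");
  APInt KnownZero, KnownOne;
  DAG.computeKnownBits(V, KnownZero, KnownOne);
  // True when the top 32 bits are provably zero.
  return KnownZero.countLeadingOnes() >= 32;
}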
SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) }; APInt KnownZero[2], KnownOne[2]; - DAG.ComputeMaskedBits(Ops[0], KnownZero[0], KnownOne[0]); - DAG.ComputeMaskedBits(Ops[1], KnownZero[1], KnownOne[1]); + DAG.computeKnownBits(Ops[0], KnownZero[0], KnownOne[0]); + DAG.computeKnownBits(Ops[1], KnownZero[1], KnownOne[1]); // See if the upper 32 bits of one operand and the lower 32 bits of the // other are known zero. They are the low and high operands respectively. @@ -2259,7 +2257,6 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op, SDValue Ops[] = { ChainIn, AlignedAddr, Src2, BitShift, NegBitShift, DAG.getConstant(BitSize, WideVT) }; SDValue AtomicOp = DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, - array_lengthof(Ops), NarrowVT, MMO); // Rotate the result of the final CS so that the field is in the lower @@ -2269,7 +2266,7 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op, SDValue Result = DAG.getNode(ISD::ROTL, DL, WideVT, AtomicOp, ResultShift); SDValue RetOps[2] = { Result, AtomicOp.getValue(1) }; - return DAG.getMergeValues(RetOps, 2, DL); + return DAG.getMergeValues(RetOps, DL); } // Op is an ATOMIC_LOAD_SUB operation. Lower 8- and 16-bit operations @@ -2351,8 +2348,7 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op, SDValue Ops[] = { ChainIn, AlignedAddr, CmpVal, SwapVal, BitShift, NegBitShift, DAG.getConstant(BitSize, WideVT) }; SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAPW, DL, - VTList, Ops, array_lengthof(Ops), - NarrowVT, MMO); + VTList, Ops, NarrowVT, MMO); return AtomicOp; } @@ -2388,7 +2384,7 @@ SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op, Op.getOperand(1) }; return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, SDLoc(Op), - Node->getVTList(), Ops, array_lengthof(Ops), + Node->getVTList(), Ops, Node->getMemoryVT(), Node->getMemOperand()); } @@ -2517,7 +2513,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(ATOMIC_CMP_SWAPW); OPCODE(PREFETCH); } - return NULL; + return nullptr; #undef OPCODE } @@ -3116,7 +3112,7 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI, // When generating more than one CLC, all but the last will need to // branch to the end when a difference is found. MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ? - splitBlockAfter(MI, MBB) : 0); + splitBlockAfter(MI, MBB) : nullptr); // Check for the loop form, in which operand 5 is the trip count. if (MI->getNumExplicitOperands() > 5) { diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index 50badf8..add675a 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -516,7 +516,7 @@ class InstSS op, dag outs, dag ins, string asmstr, list pattern> // // Binary: // One register output operand and two input operands. The first -// input operand is always a register and he second may be a register, +// input operand is always a register and the second may be a register, // immediate or memory. 
// // Shift: diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index e20834c..6a18b2d 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -17,12 +17,12 @@ #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +using namespace llvm; + #define GET_INSTRINFO_CTOR_DTOR #define GET_INSTRMAP_INFO #include "SystemZGenInstrInfo.inc" -using namespace llvm; - // Return a mask with Count low bits set. static uint64_t allOnes(unsigned int Count) { return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1; @@ -284,11 +284,11 @@ bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, std::next(I)->eraseFromParent(); Cond.clear(); - FBB = 0; + FBB = nullptr; // Delete the JMP if it's equivalent to a fall-through. if (MBB.isLayoutSuccessor(Branch.Target->getMBB())) { - TBB = 0; + TBB = nullptr; I->eraseFromParent(); I = MBB.end(); continue; @@ -418,7 +418,7 @@ bool SystemZInstrInfo::analyzeCompare(const MachineInstr *MI, static MachineInstr *getDef(unsigned Reg, const MachineRegisterInfo *MRI) { if (TargetRegisterInfo::isPhysicalRegister(Reg)) - return 0; + return nullptr; return MRI->getUniqueVRegDef(Reg); } @@ -442,7 +442,7 @@ static void eraseIfDead(MachineInstr *MI, const MachineRegisterInfo *MRI) { static bool removeIPMBasedCompare(MachineInstr *Compare, unsigned SrcReg, const MachineRegisterInfo *MRI, const TargetRegisterInfo *TRI) { - MachineInstr *LGFR = 0; + MachineInstr *LGFR = nullptr; MachineInstr *RLL = getDef(SrcReg, MRI); if (RLL && RLL->getOpcode() == SystemZ::LGFR) { LGFR = RLL; @@ -542,7 +542,7 @@ PredicateInstruction(MachineInstr *MI, MI->setDesc(get(CondOpcode)); MachineInstrBuilder(*MI->getParent()->getParent(), MI) .addImm(CCValid).addImm(CCMask) - .addReg(SystemZ::CC, RegState::Implicit);; + .addReg(SystemZ::CC, RegState::Implicit); return true; } } @@ -740,7 +740,7 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return finishConvertToThreeAddress(MI, MIB, LV); } } - return 0; + return nullptr; } MachineInstr * @@ -761,12 +761,12 @@ SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, .addFrameIndex(FrameIndex).addImm(0) .addImm(MI->getOperand(2).getImm()); } - return 0; + return nullptr; } // All other cases require a single operand. if (Ops.size() != 1) - return 0; + return nullptr; unsigned OpNum = Ops[0]; assert(Size == MF.getRegInfo() @@ -858,14 +858,14 @@ SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, } } - return 0; + return nullptr; } MachineInstr * SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI, const SmallVectorImpl &Ops, MachineInstr* LoadMI) const { - return 0; + return nullptr; } bool diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h index 55f80af..09aee5d 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/lib/Target/SystemZ/SystemZInstrInfo.h @@ -229,7 +229,7 @@ public: // BRANCH exists, return the opcode for the latter, otherwise return 0. // MI, if nonnull, is the compare instruction. unsigned getCompareAndBranch(unsigned Opcode, - const MachineInstr *MI = 0) const; + const MachineInstr *MI = nullptr) const; // Emit code before MBBI in MI to move immediate value Value into // physical register Reg. 
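The 0 -> nullptr replacements running through these SystemZ hunks are the tree-wide C++11 modernization. The payoff is more than style: nullptr has its own type (std::nullptr_t), so it cannot silently select an integer overload the way a literal 0 can. A self-contained illustration, not taken from the patch:

    void take(int) {}
    void take(char *) {}

    int main() {
      take(0);       // literal 0 is an int: overload resolution picks take(int)
      take(nullptr); // std::nullptr_t: unambiguously take(char *)
    }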
diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp index 1b88d06..8081334 100644 --- a/lib/Target/SystemZ/SystemZLongBranch.cpp +++ b/lib/Target/SystemZ/SystemZLongBranch.cpp @@ -53,8 +53,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "systemz-long-branch" - #include "SystemZTargetMachine.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -68,6 +66,8 @@ using namespace llvm; +#define DEBUG_TYPE "systemz-long-branch" + STATISTIC(LongBranches, "Number of long branches."); namespace { @@ -111,7 +111,8 @@ struct TerminatorInfo { // otherwise it is zero. unsigned ExtraRelaxSize; - TerminatorInfo() : Branch(0), Size(0), TargetBlock(0), ExtraRelaxSize(0) {} + TerminatorInfo() : Branch(nullptr), Size(0), TargetBlock(0), + ExtraRelaxSize(0) {} }; // Used to keep track of the current position while iterating over the blocks. @@ -131,13 +132,13 @@ class SystemZLongBranch : public MachineFunctionPass { public: static char ID; SystemZLongBranch(const SystemZTargetMachine &tm) - : MachineFunctionPass(ID), TII(0) {} + : MachineFunctionPass(ID), TII(nullptr) {} const char *getPassName() const override { return "SystemZ Long Branch"; } - bool runOnMachineFunction(MachineFunction &F); + bool runOnMachineFunction(MachineFunction &F) override; private: void skipNonTerminators(BlockPosition &Position, MBBInfo &Block); @@ -424,7 +425,7 @@ void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) { Terminator.Size += Terminator.ExtraRelaxSize; Terminator.ExtraRelaxSize = 0; - Terminator.Branch = 0; + Terminator.Branch = nullptr; ++LongBranches; } diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp index 1ac4e32..a04d703 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -12,17 +12,17 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +using namespace llvm; + #define GET_REGINFO_TARGET_DESC #include "SystemZGenRegisterInfo.inc" -using namespace llvm; - SystemZRegisterInfo::SystemZRegisterInfo(SystemZTargetMachine &tm) : SystemZGenRegisterInfo(SystemZ::R14D), TM(tm) {} -const uint16_t* +const MCPhysReg* SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - static const uint16_t CalleeSavedRegs[] = { + static const MCPhysReg CalleeSavedRegs[] = { SystemZ::R6D, SystemZ::R7D, SystemZ::R8D, SystemZ::R9D, SystemZ::R10D, SystemZ::R11D, SystemZ::R12D, SystemZ::R13D, SystemZ::R14D, SystemZ::R15D, diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h index 4ad8048..e236f71 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.h +++ b/lib/Target/SystemZ/SystemZRegisterInfo.h @@ -49,7 +49,7 @@ public: bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override { return true; } - const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; BitVector getReservedRegs(const MachineFunction &MF) const override; void eliminateFrameIndex(MachineBasicBlock::iterator MI, diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index 7635bdc..97abee3 100644 --- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -11,12 +11,13 @@ // 
//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "systemz-selectiondag-info" #include "SystemZTargetMachine.h" #include "llvm/CodeGen/SelectionDAG.h" using namespace llvm; +#define DEBUG_TYPE "systemz-selectiondag-info" + SystemZSelectionDAGInfo:: SystemZSelectionDAGInfo(const SystemZTargetMachine &TM) : TargetSelectionDAGInfo(TM) { @@ -230,7 +231,7 @@ EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain, Ops.push_back(DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, MVT::i32)); Ops.push_back(Glue); VTs = DAG.getVTList(PtrVT, MVT::Glue); - End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, &Ops[0], Ops.size()); + End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops); return std::make_pair(End, Chain); } diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp index 9350779..aad899c 100644 --- a/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -13,13 +13,13 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "systemz-shorten-inst" - #include "SystemZTargetMachine.h" #include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; +#define DEBUG_TYPE "systemz-shorten-inst" + namespace { class SystemZShortenInst : public MachineFunctionPass { public: @@ -31,7 +31,7 @@ public: } bool processBlock(MachineBasicBlock &MBB); - bool runOnMachineFunction(MachineFunction &F); + bool runOnMachineFunction(MachineFunction &F) override; private: bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther, @@ -53,7 +53,7 @@ FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) { } SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm) - : MachineFunctionPass(ID), TII(0), LowGPRs(), HighGPRs() { + : MachineFunctionPass(ID), TII(nullptr), LowGPRs(), HighGPRs() { // Set up LowGPRs and HighGPRs. for (unsigned I = 0; I < 16; ++I) { LowGPRs[SystemZMC::GR32Regs[I]] |= 1 << I; diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index 33d7e06..a011157 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -12,12 +12,14 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/Support/Host.h" +using namespace llvm; + +#define DEBUG_TYPE "systemz-subtarget" + #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "SystemZGenSubtargetInfo.inc" -using namespace llvm; - // Pin the vtabel to this file. 
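The DEBUG_TYPE moves in the SystemZ files above all follow the same new convention: the macro is defined only after every #include (and after `using namespace llvm`), so no header can observe, or collide with, a file-local DEBUG_TYPE. Roughly, each pass preamble now reads as follows (a sketch modeled on the SystemZLongBranch hunk):

    #include "SystemZTargetMachine.h"
    #include "llvm/ADT/Statistic.h"

    using namespace llvm;

    // Defined after all includes; the STATISTIC/DEBUG macros below expand
    // against this file-local value.
    #define DEBUG_TYPE "systemz-long-branch"

    STATISTIC(LongBranches, "Number of long branches.");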
void SystemZSubtarget::anchor() {} diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp index 627786d..d277f82 100644 --- a/lib/Target/Target.cpp +++ b/lib/Target/Target.cpp @@ -24,14 +24,6 @@ using namespace llvm; -inline DataLayout *unwrap(LLVMTargetDataRef P) { - return reinterpret_cast(P); -} - -inline LLVMTargetDataRef wrap(const DataLayout *P) { - return reinterpret_cast(const_cast(P)); -} - inline TargetLibraryInfo *unwrap(LLVMTargetLibraryInfoRef P) { return reinterpret_cast(P); } diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp index 50b1e31..39e0459 100644 --- a/lib/Target/TargetLoweringObjectFile.cpp +++ b/lib/Target/TargetLoweringObjectFile.cpp @@ -46,7 +46,7 @@ void TargetLoweringObjectFile::Initialize(MCContext &ctx, InitMCObjectFileInfo(TM.getTargetTriple(), TM.getRelocationModel(), TM.getCodeModel(), *Ctx); } - + TargetLoweringObjectFile::~TargetLoweringObjectFile() { } @@ -62,7 +62,7 @@ static bool isSuitableForBSS(const GlobalVariable *GV, bool NoZerosInBSS) { return false; // If the global has an explicit section specified, don't put it in BSS. - if (!GV->getSection().empty()) + if (GV->hasSection()) return false; // If -nozero-initialized-in-bss is specified, don't ever use BSS. @@ -138,7 +138,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, // Early exit - functions should be always in text sections. const GlobalVariable *GVar = dyn_cast(GV); - if (GVar == 0) + if (!GVar) return SectionKind::getText(); // Handle thread-local data first. @@ -284,10 +284,10 @@ TargetLoweringObjectFile::SelectSectionForGlobal(const GlobalValue *GV, if (Kind.isText()) return getTextSection(); - if (Kind.isBSS() && BSSSection != 0) + if (Kind.isBSS() && BSSSection != nullptr) return BSSSection; - if (Kind.isReadOnly() && ReadOnlySection != 0) + if (Kind.isReadOnly() && ReadOnlySection != nullptr) return ReadOnlySection; return getDataSection(); @@ -298,7 +298,7 @@ TargetLoweringObjectFile::SelectSectionForGlobal(const GlobalValue *GV, /// should be placed in. const MCSection * TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind) const { - if (Kind.isReadOnly() && ReadOnlySection != 0) + if (Kind.isReadOnly() && ReadOnlySection != nullptr) return ReadOnlySection; return DataSection; diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index fe3c870..8365f64 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/SectionKind.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLowering.h" @@ -28,24 +29,6 @@ using namespace llvm; //--------------------------------------------------------------------------- -// Command-line options that tend to be useful on more than one back-end. 
-// - -namespace llvm { - bool HasDivModLibcall; - bool AsmVerbosityDefault(false); -} - -static cl::opt -DataSections("fdata-sections", - cl::desc("Emit data into separate sections"), - cl::init(false)); -static cl::opt -FunctionSections("ffunction-sections", - cl::desc("Emit functions into separate sections"), - cl::init(false)); - -//--------------------------------------------------------------------------- // TargetMachine Class // @@ -53,12 +36,7 @@ TargetMachine::TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options) : TheTarget(T), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS), - CodeGenInfo(0), AsmInfo(0), - MCRelaxAll(false), - MCNoExecStack(false), - MCSaveTempLabels(false), - MCUseCFI(true), - MCUseDwarfDirectory(false), + CodeGenInfo(nullptr), AsmInfo(nullptr), RequireStructuredCFG(false), Options(Options) { } @@ -89,6 +67,8 @@ void TargetMachine::resetTargetOptions(const MachineFunction *MF) const { RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math"); RESET_OPTION(UseSoftFloat, "use-soft-float"); RESET_OPTION(DisableTailCalls, "disable-tail-calls"); + + TO.MCOptions.SanitizeAddress = F->hasFnAttribute(Attribute::SanitizeAddress); } /// getRelocationModel - Returns the code generation relocation model. The @@ -126,19 +106,13 @@ static TLSModel::Model getSelectedTLSModel(const GlobalVariable *Var) { } TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const { - // If GV is an alias then use the aliasee for determining - // thread-localness. - if (const GlobalAlias *GA = dyn_cast(GV)) - GV = GA->getAliasedGlobal(); - const GlobalVariable *Var = cast(GV); - - bool isLocal = Var->hasLocalLinkage(); - bool isDeclaration = Var->isDeclaration(); + bool isLocal = GV->hasLocalLinkage(); + bool isDeclaration = GV->isDeclaration(); bool isPIC = getRelocationModel() == Reloc::PIC_; bool isPIE = Options.PositionIndependentExecutable; // FIXME: what should we do for protected and internal visibility? // For variables, is internal different from hidden? - bool isHidden = Var->hasHiddenVisibility(); + bool isHidden = GV->hasHiddenVisibility(); TLSModel::Model Model; if (isPIC && !isPIE) { @@ -153,10 +127,13 @@ TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const { Model = TLSModel::InitialExec; } - // If the user specified a more specific model, use that. - TLSModel::Model SelectedModel = getSelectedTLSModel(Var); - if (SelectedModel > Model) - return SelectedModel; + const GlobalVariable *Var = dyn_cast(GV); + if (Var) { + // If the user specified a more specific model, use that. 
+ TLSModel::Model SelectedModel = getSelectedTLSModel(Var); + if (SelectedModel > Model) + return SelectedModel; + } return Model; } @@ -174,28 +151,28 @@ void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const { CodeGenInfo->setOptLevel(Level); } -bool TargetMachine::getAsmVerbosityDefault() { - return AsmVerbosityDefault; +bool TargetMachine::getAsmVerbosityDefault() const { + return Options.MCOptions.AsmVerbose; } void TargetMachine::setAsmVerbosityDefault(bool V) { - AsmVerbosityDefault = V; + Options.MCOptions.AsmVerbose = V; } -bool TargetMachine::getFunctionSections() { - return FunctionSections; +bool TargetMachine::getFunctionSections() const { + return Options.FunctionSections; } -bool TargetMachine::getDataSections() { - return DataSections; +bool TargetMachine::getDataSections() const { + return Options.DataSections; } void TargetMachine::setFunctionSections(bool V) { - FunctionSections = V; + Options.FunctionSections = V; } void TargetMachine::setDataSections(bool V) { - DataSections = V; + Options.DataSections = V; } void TargetMachine::getNameWithPrefix(SmallVectorImpl &Name, diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp index a2829d4..20923c9 100644 --- a/lib/Target/TargetMachineC.cpp +++ b/lib/Target/TargetMachineC.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/Module.h" #include "llvm/PassManager.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" @@ -29,23 +30,6 @@ using namespace llvm; -inline DataLayout *unwrap(LLVMTargetDataRef P) { - return reinterpret_cast(P); -} - -inline LLVMTargetDataRef wrap(const DataLayout *P) { - return reinterpret_cast(const_cast(P)); -} - -inline TargetLibraryInfo *unwrap(LLVMTargetLibraryInfoRef P) { - return reinterpret_cast(P); -} - -inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfo *P) { - TargetLibraryInfo *X = const_cast(P); - return reinterpret_cast(X); -} - inline TargetMachine *unwrap(LLVMTargetMachineRef P) { return reinterpret_cast(P); } @@ -62,7 +46,7 @@ inline LLVMTargetRef wrap(const Target * P) { LLVMTargetRef LLVMGetFirstTarget() { if(TargetRegistry::begin() == TargetRegistry::end()) { - return NULL; + return nullptr; } const Target* target = &*TargetRegistry::begin(); @@ -80,7 +64,7 @@ LLVMTargetRef LLVMGetTargetFromName(const char *Name) { return wrap(&*IT); } - return NULL; + return nullptr; } LLVMBool LLVMGetTargetFromTriple(const char* TripleStr, LLVMTargetRef *T, diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp index df8948f..3ca13da 100644 --- a/lib/Target/TargetSubtargetInfo.cpp +++ b/lib/Target/TargetSubtargetInfo.cpp @@ -24,11 +24,12 @@ TargetSubtargetInfo::TargetSubtargetInfo() {} TargetSubtargetInfo::~TargetSubtargetInfo() {} // Temporary option to compare overall performance change when moving from the -// SD scheduler to the MachineScheduler pass pipeline. It should be removed -// before 3.4. The normal way to enable/disable the MachineScheduling pass -// itself is by using -enable-misched. For targets that already use MI sched -// (via MySubTarget::enableMachineScheduler()) -misched-bench=false negates the -// subtarget hook. +// SD scheduler to the MachineScheduler pass pipeline. This is convenient for +// benchmarking during the transition from SD to MI scheduling. Once armv7 makes +// the switch, it should go away. 
The normal way to enable/disable the +// MachineScheduling pass itself is by using -enable-misched. For targets that +// already use MI sched (via MySubTarget::enableMachineScheduler()) +// -misched-bench=false negates the subtarget hook. static cl::opt BenchMachineSched("misched-bench", cl::Hidden, cl::desc("Migrate from the target's default SD scheduler to MI scheduler")); diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk index 73031de..0d0a9ca 100644 --- a/lib/Target/X86/Android.mk +++ b/lib/Target/X86/Android.mk @@ -12,7 +12,6 @@ x86_codegen_TBLGEN_TABLES := \ x86_codegen_SRC_FILES := \ X86AsmPrinter.cpp \ - X86COFFMachineModuleInfo.cpp \ X86CodeEmitter.cpp \ X86FastISel.cpp \ X86FixupLEAs.cpp \ diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index db29228..f3e6b3f 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -11,21 +11,25 @@ #include "X86AsmInstrumentation.h" #include "X86Operand.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Function.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetOptions.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compiler.h" -#include "llvm/MC/MCParser/MCParsedAsmOperand.h" namespace llvm { namespace { -static cl::opt ClAsanInstrumentInlineAssembly( - "asan-instrument-inline-assembly", cl::desc("instrument inline assembly"), - cl::Hidden, cl::init(false)); +static cl::opt ClAsanInstrumentAssembly( + "asan-instrument-assembly", + cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden, + cl::init(false)); bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP || Reg == X86::SP; @@ -38,14 +42,14 @@ std::string FuncName(unsigned AccessSize, bool IsWrite) { class X86AddressSanitizer : public X86AsmInstrumentation { public: - X86AddressSanitizer(MCSubtargetInfo &sti) : STI(sti) {} + X86AddressSanitizer(const MCSubtargetInfo &STI) : STI(STI) {} virtual ~X86AddressSanitizer() {} // X86AsmInstrumentation implementation: virtual void InstrumentInstruction( const MCInst &Inst, SmallVectorImpl &Operands, - MCContext &Ctx, MCStreamer &Out) override { - InstrumentMOV(Inst, Operands, Ctx, Out); + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) override { + InstrumentMOV(Inst, Operands, Ctx, MII, Out); } // Should be implemented differently in x86_32 and x86_64 subclasses. @@ -57,13 +61,13 @@ public: bool IsWrite, MCContext &Ctx, MCStreamer &Out); void InstrumentMOV(const MCInst &Inst, SmallVectorImpl &Operands, - MCContext &Ctx, MCStreamer &Out); + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); void EmitInstruction(MCStreamer &Out, const MCInst &Inst) { Out.EmitInstruction(Inst, STI); } protected: - MCSubtargetInfo &STI; + const MCSubtargetInfo &STI; }; void X86AddressSanitizer::InstrumentMemOperand( @@ -83,68 +87,53 @@ void X86AddressSanitizer::InstrumentMemOperand( void X86AddressSanitizer::InstrumentMOV( const MCInst &Inst, SmallVectorImpl &Operands, - MCContext &Ctx, MCStreamer &Out) { + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) { // Access size in bytes. 
unsigned AccessSize = 0; - unsigned long OpIx = Operands.size(); + switch (Inst.getOpcode()) { case X86::MOV8mi: case X86::MOV8mr: - AccessSize = 1; - OpIx = 2; - break; case X86::MOV8rm: AccessSize = 1; - OpIx = 1; break; case X86::MOV16mi: case X86::MOV16mr: - AccessSize = 2; - OpIx = 2; - break; case X86::MOV16rm: AccessSize = 2; - OpIx = 1; break; case X86::MOV32mi: case X86::MOV32mr: - AccessSize = 4; - OpIx = 2; - break; case X86::MOV32rm: AccessSize = 4; - OpIx = 1; break; case X86::MOV64mi32: case X86::MOV64mr: - AccessSize = 8; - OpIx = 2; - break; case X86::MOV64rm: AccessSize = 8; - OpIx = 1; break; case X86::MOVAPDmr: case X86::MOVAPSmr: - AccessSize = 16; - OpIx = 2; - break; case X86::MOVAPDrm: case X86::MOVAPSrm: AccessSize = 16; - OpIx = 1; break; - } - if (OpIx >= Operands.size()) + default: return; + } - const bool IsWrite = (OpIx != 1); - InstrumentMemOperand(Operands[OpIx], AccessSize, IsWrite, Ctx, Out); + const bool IsWrite = MII.get(Inst.getOpcode()).mayStore(); + for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) { + MCParsedAsmOperand *Op = Operands[Ix]; + if (Op && Op->isMem()) + InstrumentMemOperand(Op, AccessSize, IsWrite, Ctx, Out); + } } class X86AddressSanitizer32 : public X86AddressSanitizer { public: - X86AddressSanitizer32(MCSubtargetInfo &sti) : X86AddressSanitizer(sti) {} + X86AddressSanitizer32(const MCSubtargetInfo &STI) + : X86AddressSanitizer(STI) {} virtual ~X86AddressSanitizer32() {} virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize, @@ -172,14 +161,14 @@ void X86AddressSanitizer32::InstrumentMemOperandImpl( MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx); EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FuncExpr)); } - EmitInstruction(Out, MCInstBuilder(X86::ADD32ri).addReg(X86::ESP) - .addReg(X86::ESP).addImm(4)); + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); } class X86AddressSanitizer64 : public X86AddressSanitizer { public: - X86AddressSanitizer64(MCSubtargetInfo &sti) : X86AddressSanitizer(sti) {} + X86AddressSanitizer64(const MCSubtargetInfo &STI) + : X86AddressSanitizer(STI) {} virtual ~X86AddressSanitizer64() {} virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize, @@ -187,13 +176,26 @@ public: MCStreamer &Out) override; }; -void X86AddressSanitizer64::InstrumentMemOperandImpl( - X86Operand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) { +void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand *Op, + unsigned AccessSize, + bool IsWrite, + MCContext &Ctx, + MCStreamer &Out) { // FIXME: emit .cfi directives for correct stack unwinding. - // Set %rsp below current red zone (128 bytes wide) - EmitInstruction(Out, MCInstBuilder(X86::SUB64ri32).addReg(X86::RSP) - .addReg(X86::RSP).addImm(128)); + + // Set %rsp below current red zone (128 bytes wide) using LEA instruction to + // preserve flags. 
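    // A sketch of the rationale (this comment is editorial, not from the
    // patch): SUB and ADD write EFLAGS, and code injected into user
    // assembly must not clobber flags the surrounding instructions may
    // still test. LEA performs the same pointer arithmetic without
    // touching EFLAGS, so the emitted
    //     leaq -128(%rsp), %rsp
    // drops %rsp below the 128-byte red zone flag-free, and the matching
    //     leaq 128(%rsp), %rsp
    // at the end of the instrumented sequence restores it.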
+ { + MCInst Inst; + Inst.setOpcode(X86::LEA64r); + Inst.addOperand(MCOperand::CreateReg(X86::RSP)); + + const MCExpr *Disp = MCConstantExpr::Create(-128, Ctx); + std::unique_ptr Op( + X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RDI)); { MCInst Inst; @@ -210,8 +212,19 @@ void X86AddressSanitizer64::InstrumentMemOperandImpl( EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FuncExpr)); } EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RDI)); - EmitInstruction(Out, MCInstBuilder(X86::ADD64ri32).addReg(X86::RSP) - .addReg(X86::RSP).addImm(128)); + + // Restore old %rsp value. + { + MCInst Inst; + Inst.setOpcode(X86::LEA64r); + Inst.addOperand(MCOperand::CreateReg(X86::RSP)); + + const MCExpr *Disp = MCConstantExpr::Create(128, Ctx); + std::unique_ptr Op( + X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } } } // End anonymous namespace @@ -221,10 +234,15 @@ X86AsmInstrumentation::~X86AsmInstrumentation() {} void X86AsmInstrumentation::InstrumentInstruction( const MCInst &Inst, SmallVectorImpl &Operands, - MCContext &Ctx, MCStreamer &Out) {} - -X86AsmInstrumentation *CreateX86AsmInstrumentation(MCSubtargetInfo &STI) { - if (ClAsanInstrumentInlineAssembly) { + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) {} + +X86AsmInstrumentation * +CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, + const MCContext &Ctx, const MCSubtargetInfo &STI) { + Triple T(STI.getTargetTriple()); + const bool hasCompilerRTSupport = T.isOSLinux(); + if (ClAsanInstrumentAssembly && hasCompilerRTSupport && + MCOptions.SanitizeAddress) { if ((STI.getFeatureBits() & X86::Mode32Bit) != 0) return new X86AddressSanitizer32(STI); if ((STI.getFeatureBits() & X86::Mode64Bit) != 0) diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h index c783a78..0369b14 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -16,13 +16,17 @@ namespace llvm { class MCContext; class MCInst; +class MCInstrInfo; class MCParsedAsmOperand; class MCStreamer; class MCSubtargetInfo; +class MCTargetOptions; class X86AsmInstrumentation; -X86AsmInstrumentation *CreateX86AsmInstrumentation(MCSubtargetInfo &STI); +X86AsmInstrumentation * +CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, + const MCContext &Ctx, const MCSubtargetInfo &STI); class X86AsmInstrumentation { public: @@ -32,15 +36,18 @@ public: // instruction is sent to Out. 
virtual void InstrumentInstruction( const MCInst &Inst, SmallVectorImpl &Operands, - MCContext &Ctx, MCStreamer &Out); + MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out); protected: friend X86AsmInstrumentation * - CreateX86AsmInstrumentation(MCSubtargetInfo &STI); + CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, + const MCContext &Ctx, const MCSubtargetInfo &STI); X86AsmInstrumentation(); }; -} // End llvm namespace +} // End llvm namespace -#endif // X86_ASM_INSTRUMENTATION_H +#endif // X86_ASM_INSTRUMENTATION_H diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 9eddc74..d3e695e 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -20,6 +20,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" @@ -55,6 +56,7 @@ static const char OpPrecedence[] = { class X86AsmParser : public MCTargetAsmParser { MCSubtargetInfo &STI; MCAsmParser &Parser; + const MCInstrInfo &MII; ParseInstructionInfo *InstInfo; std::unique_ptr Instrumentation; private: @@ -257,7 +259,7 @@ private: public: IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) : State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0), - Scale(1), Imm(imm), Sym(0), StopOnLBrac(stoponlbrac), + Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac), AddImmPrefix(addimmprefix) { Info.clear(); } unsigned getBaseReg() { return BaseReg; } @@ -618,7 +620,7 @@ private: X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) { Error(Loc, Msg); - return 0; + return nullptr; } X86Operand *DefaultMemSIOperand(SMLoc Loc); @@ -710,13 +712,17 @@ private: public: X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, - const MCInstrInfo &MII) - : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) { + const MCInstrInfo &mii, + const MCTargetOptions &Options) + : MCTargetAsmParser(), STI(sti), Parser(parser), MII(mii), + InstInfo(nullptr) { // Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); - Instrumentation.reset(CreateX86AsmInstrumentation(STI)); + Instrumentation.reset( + CreateX86AsmInstrumentation(Options, Parser.getContext(), STI)); } + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; bool @@ -1173,9 +1179,9 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, // expression. IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true); if (ParseIntelExpression(SM, End)) - return 0; + return nullptr; - const MCExpr *Disp = 0; + const MCExpr *Disp = nullptr; if (const MCExpr *Sym = SM.getSym()) { // A symbolic displacement. Disp = Sym; @@ -1199,7 +1205,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, if (Tok.getString().find('.') != StringRef::npos) { const MCExpr *NewDisp; if (ParseIntelDotOperator(Disp, NewDisp)) - return 0; + return nullptr; End = Tok.getEndLoc(); Parser.Lex(); // Eat the field. 
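Threading MCTargetOptions and the MCContext into the parser, as the X86AsmParser constructor hunk above does, lets CreateX86AsmInstrumentation gate the AddressSanitizer machinery on three conditions instead of a single cl::opt. Condensed from the X86AsmInstrumentation.cpp hunks (the final fall-through return is assumed from context; it is not visible in the diff):

    X86AsmInstrumentation *
    CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
                                const MCContext &Ctx,
                                const MCSubtargetInfo &STI) {
      Triple T(STI.getTargetTriple());
      const bool hasCompilerRTSupport = T.isOSLinux();
      // All three must hold: the -asan-instrument-assembly flag, an OS with
      // compiler-rt support, and a function built with sanitize_address.
      if (ClAsanInstrumentAssembly && hasCompilerRTSupport &&
          MCOptions.SanitizeAddress) {
        if ((STI.getFeatureBits() & X86::Mode32Bit) != 0)
          return new X86AddressSanitizer32(STI);
        if ((STI.getFeatureBits() & X86::Mode64Bit) != 0)
          return new X86AddressSanitizer64(STI);
      }
      return new X86AsmInstrumentation(); // assumed no-op fallback
    }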
@@ -1220,7 +1226,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, StringRef ErrMsg; if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) { Error(StartInBrac, ErrMsg); - return 0; + return nullptr; } return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start, End, Size); @@ -1237,7 +1243,7 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, InlineAsmIdentifierInfo &Info, bool IsUnevaluatedOperand, SMLoc &End) { assert (isParsingInlineAsm() && "Expected to be parsing inline assembly."); - Val = 0; + Val = nullptr; StringRef LineBuf(Identifier.data()); SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand); @@ -1309,7 +1315,7 @@ X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, StringRef Identifier = Tok.getString(); if (ParseIntelIdentifier(Val, Identifier, Info, /*Unevaluated=*/false, End)) - return 0; + return nullptr; return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0, /*Scale=*/1, Start, End, Size, Identifier, Info); } @@ -1337,7 +1343,7 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start, StringRef Identifier = Tok.getString(); if (ParseIntelIdentifier(Val, Identifier, Info, /*Unevaluated=*/false, End)) - return 0; + return nullptr; if (!getLexer().is(AsmToken::LBrac)) return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0, @@ -1349,19 +1355,19 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start, IntelExprStateMachine SM(/*ImmDisp=*/0, /*StopOnLBrac=*/true, /*AddImmPrefix=*/false); if (ParseIntelExpression(SM, End)) - return 0; + return nullptr; if (SM.getSym()) { Error(Start, "cannot use more than one symbol in memory operand"); - return 0; + return nullptr; } if (SM.getBaseReg()) { Error(Start, "cannot use base register with variable reference"); - return 0; + return nullptr; } if (SM.getIndexReg()) { Error(Start, "cannot use index register with variable reference"); - return 0; + return nullptr; } const MCExpr *Disp = MCConstantExpr::Create(SM.getImm(), getContext()); @@ -1430,7 +1436,7 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() { StringRef Identifier = Tok.getString(); if (ParseIntelIdentifier(Val, Identifier, Info, /*Unevaluated=*/false, End)) - return 0; + return nullptr; // Don't emit the offset operator. InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7)); @@ -1461,13 +1467,13 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) { SMLoc TypeLoc = Tok.getLoc(); Parser.Lex(); // Eat operator. - const MCExpr *Val = 0; + const MCExpr *Val = nullptr; InlineAsmIdentifierInfo Info; SMLoc Start = Tok.getLoc(), End; StringRef Identifier = Tok.getString(); if (ParseIntelIdentifier(Val, Identifier, Info, /*Unevaluated=*/true, End)) - return 0; + return nullptr; if (!Info.OpDecl) return ErrorOperand(Start, "unable to lookup expression"); @@ -1522,7 +1528,7 @@ X86Operand *X86AsmParser::ParseIntelOperand() { IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true, /*AddImmPrefix=*/false); if (ParseIntelExpression(SM, End)) - return 0; + return nullptr; int64_t Imm = SM.getImm(); if (isParsingInlineAsm()) { @@ -1580,11 +1586,11 @@ X86Operand *X86AsmParser::ParseATTOperand() { // Read the register. 
unsigned RegNo; SMLoc Start, End; - if (ParseRegister(RegNo, Start, End)) return 0; + if (ParseRegister(RegNo, Start, End)) return nullptr; if (RegNo == X86::EIZ || RegNo == X86::RIZ) { Error(Start, "%eiz and %riz can only be used as index registers", SMRange(Start, End)); - return 0; + return nullptr; } // If this is a segment register followed by a ':', then this is the start @@ -1601,7 +1607,7 @@ X86Operand *X86AsmParser::ParseATTOperand() { Parser.Lex(); const MCExpr *Val; if (getParser().parseExpression(Val, End)) - return 0; + return nullptr; return X86Operand::CreateImm(Val, Start, End); } } @@ -1630,7 +1636,7 @@ X86AsmParser::HandleAVX512Operand(SmallVectorImpl &Operands StringSwitch(getLexer().getTok().getIdentifier()) .Case("to8", "{1to8}") .Case("to16", "{1to16}") - .Default(0); + .Default(nullptr); if (!BroadcastPrimitive) return !ErrorAndEatStatement(getLexer().getLoc(), "Invalid memory broadcast primitive."); @@ -1685,7 +1691,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext()); if (getLexer().isNot(AsmToken::LParen)) { SMLoc ExprEnd; - if (getParser().parseExpression(Disp, ExprEnd)) return 0; + if (getParser().parseExpression(Disp, ExprEnd)) return nullptr; // After parsing the base expression we could either have a parenthesized // memory address or not. If not, return now. If so, eat the (. @@ -1712,7 +1718,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { // It must be an parenthesized expression, parse it now. if (getParser().parseParenExpression(Disp, ExprEnd)) - return 0; + return nullptr; // After parsing the base expression we could either have a parenthesized // memory address or not. If not, return now. If so, eat the (. @@ -1736,11 +1742,11 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { if (getLexer().is(AsmToken::Percent)) { SMLoc StartLoc, EndLoc; BaseLoc = Parser.getTok().getLoc(); - if (ParseRegister(BaseReg, StartLoc, EndLoc)) return 0; + if (ParseRegister(BaseReg, StartLoc, EndLoc)) return nullptr; if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) { Error(StartLoc, "eiz and riz can only be used as index registers", SMRange(StartLoc, EndLoc)); - return 0; + return nullptr; } } @@ -1756,7 +1762,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this. if (getLexer().is(AsmToken::Percent)) { SMLoc L; - if (ParseRegister(IndexReg, L, L)) return 0; + if (ParseRegister(IndexReg, L, L)) return nullptr; if (getLexer().isNot(AsmToken::RParen)) { // Parse the scale amount: @@ -1764,7 +1770,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { if (getLexer().isNot(AsmToken::Comma)) { Error(Parser.getTok().getLoc(), "expected comma in scale expression"); - return 0; + return nullptr; } Parser.Lex(); // Eat the comma. @@ -1774,18 +1780,18 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { int64_t ScaleVal; if (getParser().parseAbsoluteExpression(ScaleVal)){ Error(Loc, "expected scale expression"); - return 0; + return nullptr; } // Validate the scale amount. 
if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && ScaleVal != 1) { Error(Loc, "scale factor in 16-bit address must be 1"); - return 0; + return nullptr; } if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){ Error(Loc, "scale factor in address must be 1, 2, 4 or 8"); - return 0; + return nullptr; } Scale = (unsigned)ScaleVal; } @@ -1797,7 +1803,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { int64_t Value; if (getParser().parseAbsoluteExpression(Value)) - return 0; + return nullptr; if (Value != 1) Warning(Loc, "scale factor without index register is ignored"); @@ -1808,7 +1814,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { // Ok, we've eaten the memory operand, verify we have a ')' and eat it too. if (getLexer().isNot(AsmToken::RParen)) { Error(Parser.getTok().getLoc(), "unexpected token in memory operand"); - return 0; + return nullptr; } SMLoc MemEnd = Parser.getTok().getEndLoc(); Parser.Lex(); // Eat the ')'. @@ -1821,18 +1827,18 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { BaseReg != X86::SI && BaseReg != X86::DI)) && BaseReg != X86::DX) { Error(BaseLoc, "invalid 16-bit base register"); - return 0; + return nullptr; } if (BaseReg == 0 && X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) { Error(IndexLoc, "16-bit memory operand may not include only index register"); - return 0; + return nullptr; } StringRef ErrMsg; if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) { Error(BaseLoc, ErrMsg); - return 0; + return nullptr; } return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, @@ -1851,7 +1857,7 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, PatchedName = PatchedName.substr(0, Name.size()-1); // FIXME: Hack to recognize cmp{ss,sd,ps,pd}. 
- const MCExpr *ExtraImmOp = 0; + const MCExpr *ExtraImmOp = nullptr; if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && (PatchedName.endswith("ss") || PatchedName.endswith("sd") || PatchedName.endswith("ps") || PatchedName.endswith("pd"))) { @@ -2070,8 +2076,10 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, (Name == "smov" || Name == "smovb" || Name == "smovw" || Name == "smovl" || Name == "smovd" || Name == "smovq"))) { if (Operands.size() == 1) { - if (Name == "movsd") + if (Name == "movsd") { + delete Operands.back(); Operands.back() = X86Operand::CreateToken("movsl", NameLoc); + } if (isParsingIntelSyntax()) { Operands.push_back(DefaultMemDIOperand(NameLoc)); Operands.push_back(DefaultMemSIOperand(NameLoc)); @@ -2253,7 +2261,8 @@ static const char *getSubtargetFeatureName(unsigned Val); void X86AsmParser::EmitInstruction( MCInst &Inst, SmallVectorImpl &Operands, MCStreamer &Out) { - Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), Out); + Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), MII, + Out); Out.EmitInstruction(Inst, STI); } @@ -2291,7 +2300,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, .Case("fstsw", "fnstsw") .Case("fstsww", "fnstsw") .Case("fclex", "fnclex") - .Default(0); + .Default(nullptr); assert(Repl && "Unknown wait-prefixed instruction"); delete Operands[0]; Operands[0] = X86Operand::CreateToken(Repl, IDLoc); diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index 45fe2a9..de3be38 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -422,7 +422,7 @@ struct X86Operand : public MCParsedAsmOperand { bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(), StringRef SymName = StringRef(), - void *OpDecl = 0) { + void *OpDecl = nullptr) { X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc); Res->Reg.RegNo = RegNo; Res->AddressOf = AddressOf; @@ -441,7 +441,7 @@ struct X86Operand : public MCParsedAsmOperand { /// Create an absolute memory operand. static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(), - void *OpDecl = 0) { + void *OpDecl = nullptr) { X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; @@ -461,7 +461,7 @@ struct X86Operand : public MCParsedAsmOperand { unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(), - void *OpDecl = 0) { + void *OpDecl = nullptr) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. 
assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 206b651..c54fbc1 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -14,7 +14,6 @@ add_public_tablegen_target(X86CommonTableGen) set(sources X86AsmPrinter.cpp - X86COFFMachineModuleInfo.cpp X86CodeEmitter.cpp X86FastISel.cpp X86FloatingPoint.cpp diff --git a/lib/Target/X86/Disassembler/Android.mk b/lib/Target/X86/Disassembler/Android.mk index 3984266..0b3b8a5 100644 --- a/lib/Target/X86/Disassembler/Android.mk +++ b/lib/Target/X86/Disassembler/Android.mk @@ -8,7 +8,8 @@ x86_disassembler_TBLGEN_TABLES := \ x86_disassembler_SRC_FILES := \ X86Disassembler.cpp \ - X86DisassemblerDecoder.c + X86DisassemblerDecoder.cpp + # For the device # ===================================================== diff --git a/lib/Target/X86/Disassembler/CMakeLists.txt b/lib/Target/X86/Disassembler/CMakeLists.txt index deed115..4370282 100644 --- a/lib/Target/X86/Disassembler/CMakeLists.txt +++ b/lib/Target/X86/Disassembler/CMakeLists.txt @@ -1,4 +1,4 @@ add_llvm_library(LLVMX86Disassembler X86Disassembler.cpp - X86DisassemblerDecoder.c + X86DisassemblerDecoder.cpp ) diff --git a/lib/Target/X86/Disassembler/Makefile b/lib/Target/X86/Disassembler/Makefile index 8669fd8..51e7b82 100644 --- a/lib/Target/X86/Disassembler/Makefile +++ b/lib/Target/X86/Disassembler/Makefile @@ -10,7 +10,9 @@ LEVEL = ../../../.. LIBRARYNAME = LLVMX86Disassembler -# Hack: we need to include 'main' x86 target directory to grab private headers +# Hack: we need to include 'main' x86 target directory to grab private headers. CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. include $(LEVEL)/Makefile.common + +.PHONY: $(PROJ_SRC_DIR)/X86DisassemblerDecoder.c diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index d5759cd..c366725 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -27,6 +27,11 @@ #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +using namespace llvm; +using namespace llvm::X86Disassembler; + +#define DEBUG_TYPE "x86-disassembler" + #define GET_REGINFO_ENUM #include "X86GenRegisterInfo.inc" #define GET_INSTRINFO_ENUM @@ -34,21 +39,18 @@ #define GET_SUBTARGETINFO_ENUM #include "X86GenSubtargetInfo.inc" -using namespace llvm; -using namespace llvm::X86Disassembler; - -void x86DisassemblerDebug(const char *file, - unsigned line, - const char *s) { +void llvm::X86Disassembler::Debug(const char *file, unsigned line, + const char *s) { dbgs() << file << ":" << line << ": " << s; } -const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii) { +const char *llvm::X86Disassembler::GetInstrName(unsigned Opcode, + const void *mii) { const MCInstrInfo *MII = static_cast(mii); return MII->getName(Opcode); } -#define debug(s) DEBUG(x86DisassemblerDebug(__FILE__, __LINE__, s)); +#define debug(s) DEBUG(Debug(__FILE__, __LINE__, s)); namespace llvm { @@ -74,9 +76,11 @@ static bool translateInstruction(MCInst &target, InternalInstruction &source, const MCDisassembler *Dis); -X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI, - const MCInstrInfo *MII) - : MCDisassembler(STI), MII(MII) { +X86GenericDisassembler::X86GenericDisassembler( + const MCSubtargetInfo &STI, + MCContext &Ctx, + std::unique_ptr MII) + : MCDisassembler(STI, Ctx), MII(std::move(MII)) { switch 
(STI.getFeatureBits() & (X86::Mode16Bit | X86::Mode32Bit | X86::Mode64Bit)) { case X86::Mode16Bit: @@ -93,10 +97,6 @@ X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI, } } -X86GenericDisassembler::~X86GenericDisassembler() { - delete MII; -} - /// regionReader - a callback function that wraps the readByte method from /// MemoryObject. /// @@ -140,14 +140,14 @@ X86GenericDisassembler::getInstruction(MCInst &instr, dlog_t loggerFn = logger; if (&vStream == &nulls()) - loggerFn = 0; // Disable logging completely if it's going to nulls(). + loggerFn = nullptr; // Disable logging completely if it's going to nulls(). int ret = decodeInstruction(&internalInstr, regionReader, (const void*)®ion, loggerFn, (void*)&vStream, - (const void*)MII, + (const void*)MII.get(), address, fMode); @@ -319,7 +319,7 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, } // By default sign-extend all X86 immediates based on their encoding. else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 || - type == TYPE_IMM64) { + type == TYPE_IMM64 || type == TYPE_IMMv) { uint32_t Opcode = mcInst.getOpcode(); switch (operand.encoding) { default: @@ -787,13 +787,11 @@ static bool translateInstruction(MCInst &mcInst, mcInst.setOpcode(X86::XACQUIRE_PREFIX); } - int index; - insn.numImmediatesTranslated = 0; - for (index = 0; index < X86_MAX_OPERANDS; ++index) { - if (insn.operands[index].encoding != ENCODING_NONE) { - if (translateOperand(mcInst, insn.operands[index], insn, Dis)) { + for (const auto &Op : insn.operands) { + if (Op.encoding != ENCODING_NONE) { + if (translateOperand(mcInst, Op, insn, Dis)) { return true; } } @@ -803,9 +801,10 @@ static bool translateInstruction(MCInst &mcInst, } static MCDisassembler *createX86Disassembler(const Target &T, - const MCSubtargetInfo &STI) { - return new X86Disassembler::X86GenericDisassembler(STI, - T.createMCInstrInfo()); + const MCSubtargetInfo &STI, + MCContext &Ctx) { + std::unique_ptr MII(T.createMCInstrInfo()); + return new X86Disassembler::X86GenericDisassembler(STI, Ctx, std::move(MII)); } extern "C" void LLVMInitializeX86Disassembler() { diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h index 4e6e297..4dc7c29 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.h +++ b/lib/Target/X86/Disassembler/X86Disassembler.h @@ -74,17 +74,7 @@ #ifndef X86DISASSEMBLER_H #define X86DISASSEMBLER_H -#define INSTRUCTION_SPECIFIER_FIELDS \ - uint16_t operands; - -#define INSTRUCTION_IDS \ - uint16_t instructionIDs; - #include "X86DisassemblerDecoderCommon.h" - -#undef INSTRUCTION_SPECIFIER_FIELDS -#undef INSTRUCTION_IDS - #include "llvm/MC/MCDisassembler.h" namespace llvm { @@ -101,13 +91,12 @@ namespace X86Disassembler { /// All each platform class should have to do is subclass the constructor, and /// provide a different disassemblerMode value. class X86GenericDisassembler : public MCDisassembler { - const MCInstrInfo *MII; + std::unique_ptr MII; public: /// Constructor - Initializes the disassembler. /// - X86GenericDisassembler(const MCSubtargetInfo &STI, const MCInstrInfo *MII); -private: - ~X86GenericDisassembler(); + X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, + std::unique_ptr MII); public: /// getInstruction - See MCDisassembler. 
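The disassembler hunks above swap the raw `const MCInstrInfo *MII` member, and the hand-written destructor that deleted it, for std::unique_ptr ownership handed in at construction. Reduced to its essentials (a sketch, with the template arguments written out in full):

    class X86GenericDisassembler : public MCDisassembler {
      // unique_ptr frees the instruction tables automatically; the explicit
      // ~X86GenericDisassembler that called delete is gone.
      std::unique_ptr<const MCInstrInfo> MII;
    public:
      X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
                             std::unique_ptr<const MCInstrInfo> MII)
          : MCDisassembler(STI, Ctx), MII(std::move(MII)) {}
    };

    static MCDisassembler *createX86Disassembler(const Target &T,
                                                 const MCSubtargetInfo &STI,
                                                 MCContext &Ctx) {
      std::unique_ptr<const MCInstrInfo> MII(T.createMCInstrInfo());
      return new X86Disassembler::X86GenericDisassembler(STI, Ctx,
                                                         std::move(MII));
    }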
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c deleted file mode 100644 index 0801c96..0000000 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c +++ /dev/null @@ -1,1821 +0,0 @@ -/*===-- X86DisassemblerDecoder.c - Disassembler decoder ------------*- C -*-===* - * - * The LLVM Compiler Infrastructure - * - * This file is distributed under the University of Illinois Open Source - * License. See LICENSE.TXT for details. - * - *===----------------------------------------------------------------------===* - * - * This file is part of the X86 Disassembler. - * It contains the implementation of the instruction decoder. - * Documentation for the disassembler can be found in X86Disassembler.h. - * - *===----------------------------------------------------------------------===*/ - -#include /* for va_*() */ -#include /* for vsnprintf() */ -#include /* for exit() */ -#include /* for memset() */ - -#include "X86DisassemblerDecoder.h" - -#include "X86GenDisassemblerTables.inc" - -#define TRUE 1 -#define FALSE 0 - -#ifndef NDEBUG -#define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0) -#else -#define debug(s) do { } while (0) -#endif - - -/* - * contextForAttrs - Client for the instruction context table. Takes a set of - * attributes and returns the appropriate decode context. - * - * @param attrMask - Attributes, from the enumeration attributeBits. - * @return - The InstructionContext to use when looking up an - * an instruction with these attributes. - */ -static InstructionContext contextForAttrs(uint16_t attrMask) { - return CONTEXTS_SYM[attrMask]; -} - -/* - * modRMRequired - Reads the appropriate instruction table to determine whether - * the ModR/M byte is required to decode a particular instruction. - * - * @param type - The opcode type (i.e., how many bytes it has). - * @param insnContext - The context for the instruction, as returned by - * contextForAttrs. - * @param opcode - The last byte of the instruction's opcode, not counting - * ModR/M extensions and escapes. - * @return - TRUE if the ModR/M byte is required, FALSE otherwise. - */ -static int modRMRequired(OpcodeType type, - InstructionContext insnContext, - uint16_t opcode) { - const struct ContextDecision* decision = 0; - - switch (type) { - case ONEBYTE: - decision = &ONEBYTE_SYM; - break; - case TWOBYTE: - decision = &TWOBYTE_SYM; - break; - case THREEBYTE_38: - decision = &THREEBYTE38_SYM; - break; - case THREEBYTE_3A: - decision = &THREEBYTE3A_SYM; - break; - case XOP8_MAP: - decision = &XOP8_MAP_SYM; - break; - case XOP9_MAP: - decision = &XOP9_MAP_SYM; - break; - case XOPA_MAP: - decision = &XOPA_MAP_SYM; - break; - } - - return decision->opcodeDecisions[insnContext].modRMDecisions[opcode]. - modrm_type != MODRM_ONEENTRY; -} - -/* - * decode - Reads the appropriate instruction table to obtain the unique ID of - * an instruction. - * - * @param type - See modRMRequired(). - * @param insnContext - See modRMRequired(). - * @param opcode - See modRMRequired(). - * @param modRM - The ModR/M byte if required, or any value if not. - * @return - The UID of the instruction, or 0 on failure. 
- */ -static InstrUID decode(OpcodeType type, - InstructionContext insnContext, - uint8_t opcode, - uint8_t modRM) { - const struct ModRMDecision* dec = 0; - - switch (type) { - case ONEBYTE: - dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - case TWOBYTE: - dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - case THREEBYTE_38: - dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - case THREEBYTE_3A: - dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - case XOP8_MAP: - dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - case XOP9_MAP: - dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - case XOPA_MAP: - dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - } - - switch (dec->modrm_type) { - default: - debug("Corrupt table! Unknown modrm_type"); - return 0; - case MODRM_ONEENTRY: - return modRMTable[dec->instructionIDs]; - case MODRM_SPLITRM: - if (modFromModRM(modRM) == 0x3) - return modRMTable[dec->instructionIDs+1]; - return modRMTable[dec->instructionIDs]; - case MODRM_SPLITREG: - if (modFromModRM(modRM) == 0x3) - return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8]; - return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; - case MODRM_SPLITMISC: - if (modFromModRM(modRM) == 0x3) - return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8]; - return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; - case MODRM_FULL: - return modRMTable[dec->instructionIDs+modRM]; - } -} - -/* - * specifierForUID - Given a UID, returns the name and operand specification for - * that instruction. - * - * @param uid - The unique ID for the instruction. This should be returned by - * decode(); specifierForUID will not check bounds. - * @return - A pointer to the specification for that instruction. - */ -static const struct InstructionSpecifier *specifierForUID(InstrUID uid) { - return &INSTRUCTIONS_SYM[uid]; -} - -/* - * consumeByte - Uses the reader function provided by the user to consume one - * byte from the instruction's memory and advance the cursor. - * - * @param insn - The instruction with the reader function to use. The cursor - * for this instruction is advanced. - * @param byte - A pointer to a pre-allocated memory buffer to be populated - * with the data read. - * @return - 0 if the read was successful; nonzero otherwise. - */ -static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) { - int ret = insn->reader(insn->readerArg, byte, insn->readerCursor); - - if (!ret) - ++(insn->readerCursor); - - return ret; -} - -/* - * lookAtByte - Like consumeByte, but does not advance the cursor. - * - * @param insn - See consumeByte(). - * @param byte - See consumeByte(). - * @return - See consumeByte(). 
- */ -static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) { - return insn->reader(insn->readerArg, byte, insn->readerCursor); -} - -static void unconsumeByte(struct InternalInstruction* insn) { - insn->readerCursor--; -} - -#define CONSUME_FUNC(name, type) \ - static int name(struct InternalInstruction* insn, type* ptr) { \ - type combined = 0; \ - unsigned offset; \ - for (offset = 0; offset < sizeof(type); ++offset) { \ - uint8_t byte; \ - int ret = insn->reader(insn->readerArg, \ - &byte, \ - insn->readerCursor + offset); \ - if (ret) \ - return ret; \ - combined = combined | ((uint64_t)byte << (offset * 8)); \ - } \ - *ptr = combined; \ - insn->readerCursor += sizeof(type); \ - return 0; \ - } - -/* - * consume* - Use the reader function provided by the user to consume data - * values of various sizes from the instruction's memory and advance the - * cursor appropriately. These readers perform endian conversion. - * - * @param insn - See consumeByte(). - * @param ptr - A pointer to a pre-allocated memory of appropriate size to - * be populated with the data read. - * @return - See consumeByte(). - */ -CONSUME_FUNC(consumeInt8, int8_t) -CONSUME_FUNC(consumeInt16, int16_t) -CONSUME_FUNC(consumeInt32, int32_t) -CONSUME_FUNC(consumeUInt16, uint16_t) -CONSUME_FUNC(consumeUInt32, uint32_t) -CONSUME_FUNC(consumeUInt64, uint64_t) - -/* - * dbgprintf - Uses the logging function provided by the user to log a single - * message, typically without a carriage-return. - * - * @param insn - The instruction containing the logging function. - * @param format - See printf(). - * @param ... - See printf(). - */ -static void dbgprintf(struct InternalInstruction* insn, - const char* format, - ...) { - char buffer[256]; - va_list ap; - - if (!insn->dlog) - return; - - va_start(ap, format); - (void)vsnprintf(buffer, sizeof(buffer), format, ap); - va_end(ap); - - insn->dlog(insn->dlogArg, buffer); - - return; -} - -/* - * setPrefixPresent - Marks that a particular prefix is present at a particular - * location. - * - * @param insn - The instruction to be marked as having the prefix. - * @param prefix - The prefix that is present. - * @param location - The location where the prefix is located (in the address - * space of the instruction's reader). - */ -static void setPrefixPresent(struct InternalInstruction* insn, - uint8_t prefix, - uint64_t location) -{ - insn->prefixPresent[prefix] = 1; - insn->prefixLocations[prefix] = location; -} - -/* - * isPrefixAtLocation - Queries an instruction to determine whether a prefix is - * present at a given location. - * - * @param insn - The instruction to be queried. - * @param prefix - The prefix. - * @param location - The location to query. - * @return - Whether the prefix is at that location. - */ -static BOOL isPrefixAtLocation(struct InternalInstruction* insn, - uint8_t prefix, - uint64_t location) -{ - if (insn->prefixPresent[prefix] == 1 && - insn->prefixLocations[prefix] == location) - return TRUE; - else - return FALSE; -} - -/* - * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the - * instruction as having them. Also sets the instruction's default operand, - * address, and other relevant data sizes to report operands correctly. - * - * @param insn - The instruction whose prefixes are to be read. - * @return - 0 if the instruction could be read until the end of the prefix - * bytes, and no prefixes conflicted; nonzero otherwise. 
- */ -static int readPrefixes(struct InternalInstruction* insn) { - BOOL isPrefix = TRUE; - BOOL prefixGroups[4] = { FALSE }; - uint64_t prefixLocation; - uint8_t byte = 0; - uint8_t nextByte; - - BOOL hasAdSize = FALSE; - BOOL hasOpSize = FALSE; - - dbgprintf(insn, "readPrefixes()"); - - while (isPrefix) { - prefixLocation = insn->readerCursor; - - /* If we fail reading prefixes, just stop here and let the opcode reader deal with it */ - if (consumeByte(insn, &byte)) - break; - - /* - * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then - * break and let it be disassembled as a normal "instruction". - */ - if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) - break; - - if (insn->readerCursor - 1 == insn->startLocation - && (byte == 0xf2 || byte == 0xf3) - && !lookAtByte(insn, &nextByte)) - { - /* - * If the byte is 0xf2 or 0xf3, and any of the following conditions are - * met: - * - it is followed by a LOCK (0xf0) prefix - * - it is followed by an xchg instruction - * then it should be disassembled as a xacquire/xrelease not repne/rep. - */ - if ((byte == 0xf2 || byte == 0xf3) && - ((nextByte == 0xf0) | - ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) - insn->xAcquireRelease = TRUE; - /* - * Also if the byte is 0xf3, and the following condition is met: - * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or - * "mov mem, imm" (opcode 0xc6/0xc7) instructions. - * then it should be disassembled as an xrelease not rep. - */ - if (byte == 0xf3 && - (nextByte == 0x88 || nextByte == 0x89 || - nextByte == 0xc6 || nextByte == 0xc7)) - insn->xAcquireRelease = TRUE; - if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) { - if (consumeByte(insn, &nextByte)) - return -1; - if (lookAtByte(insn, &nextByte)) - return -1; - unconsumeByte(insn); - } - if (nextByte != 0x0f && nextByte != 0x90) - break; - } - - switch (byte) { - case 0xf0: /* LOCK */ - case 0xf2: /* REPNE/REPNZ */ - case 0xf3: /* REP or REPE/REPZ */ - if (prefixGroups[0]) - dbgprintf(insn, "Redundant Group 1 prefix"); - prefixGroups[0] = TRUE; - setPrefixPresent(insn, byte, prefixLocation); - break; - case 0x2e: /* CS segment override -OR- Branch not taken */ - case 0x36: /* SS segment override -OR- Branch taken */ - case 0x3e: /* DS segment override */ - case 0x26: /* ES segment override */ - case 0x64: /* FS segment override */ - case 0x65: /* GS segment override */ - switch (byte) { - case 0x2e: - insn->segmentOverride = SEG_OVERRIDE_CS; - break; - case 0x36: - insn->segmentOverride = SEG_OVERRIDE_SS; - break; - case 0x3e: - insn->segmentOverride = SEG_OVERRIDE_DS; - break; - case 0x26: - insn->segmentOverride = SEG_OVERRIDE_ES; - break; - case 0x64: - insn->segmentOverride = SEG_OVERRIDE_FS; - break; - case 0x65: - insn->segmentOverride = SEG_OVERRIDE_GS; - break; - default: - debug("Unhandled override"); - return -1; - } - if (prefixGroups[1]) - dbgprintf(insn, "Redundant Group 2 prefix"); - prefixGroups[1] = TRUE; - setPrefixPresent(insn, byte, prefixLocation); - break; - case 0x66: /* Operand-size override */ - if (prefixGroups[2]) - dbgprintf(insn, "Redundant Group 3 prefix"); - prefixGroups[2] = TRUE; - hasOpSize = TRUE; - setPrefixPresent(insn, byte, prefixLocation); - break; - case 0x67: /* Address-size override */ - if (prefixGroups[3]) - dbgprintf(insn, "Redundant Group 4 prefix"); - prefixGroups[3] = TRUE; - hasAdSize = TRUE; - setPrefixPresent(insn, byte, prefixLocation); - break; - default: /* Not a prefix byte */ - isPrefix = FALSE; - break; - } - - if 
(isPrefix) - dbgprintf(insn, "Found prefix 0x%hhx", byte); - } - - insn->vectorExtensionType = TYPE_NO_VEX_XOP; - - if (byte == 0x62) { - uint8_t byte1, byte2; - - if (consumeByte(insn, &byte1)) { - dbgprintf(insn, "Couldn't read second byte of EVEX prefix"); - return -1; - } - - if (lookAtByte(insn, &byte2)) { - dbgprintf(insn, "Couldn't read third byte of EVEX prefix"); - return -1; - } - - if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) && - ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) { - insn->vectorExtensionType = TYPE_EVEX; - } - else { - unconsumeByte(insn); /* unconsume byte1 */ - unconsumeByte(insn); /* unconsume byte */ - insn->necessaryPrefixLocation = insn->readerCursor - 2; - } - - if (insn->vectorExtensionType == TYPE_EVEX) { - insn->vectorExtensionPrefix[0] = byte; - insn->vectorExtensionPrefix[1] = byte1; - if (consumeByte(insn, &insn->vectorExtensionPrefix[2])) { - dbgprintf(insn, "Couldn't read third byte of EVEX prefix"); - return -1; - } - if (consumeByte(insn, &insn->vectorExtensionPrefix[3])) { - dbgprintf(insn, "Couldn't read fourth byte of EVEX prefix"); - return -1; - } - - /* We simulate the REX prefix for simplicity's sake */ - if (insn->mode == MODE_64BIT) { - insn->rexPrefix = 0x40 - | (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3) - | (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2) - | (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1) - | (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0); - } - - dbgprintf(insn, "Found EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx", - insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], - insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]); - } - } - else if (byte == 0xc4) { - uint8_t byte1; - - if (lookAtByte(insn, &byte1)) { - dbgprintf(insn, "Couldn't read second byte of VEX"); - return -1; - } - - if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { - insn->vectorExtensionType = TYPE_VEX_3B; - insn->necessaryPrefixLocation = insn->readerCursor - 1; - } - else { - unconsumeByte(insn); - insn->necessaryPrefixLocation = insn->readerCursor - 1; - } - - if (insn->vectorExtensionType == TYPE_VEX_3B) { - insn->vectorExtensionPrefix[0] = byte; - consumeByte(insn, &insn->vectorExtensionPrefix[1]); - consumeByte(insn, &insn->vectorExtensionPrefix[2]); - - /* We simulate the REX prefix for simplicity's sake */ - - if (insn->mode == MODE_64BIT) { - insn->rexPrefix = 0x40 - | (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3) - | (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2) - | (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1) - | (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0); - } - - dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", - insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], - insn->vectorExtensionPrefix[2]); - } - } - else if (byte == 0xc5) { - uint8_t byte1; - - if (lookAtByte(insn, &byte1)) { - dbgprintf(insn, "Couldn't read second byte of VEX"); - return -1; - } - - if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { - insn->vectorExtensionType = TYPE_VEX_2B; - } - else { - unconsumeByte(insn); - } - - if (insn->vectorExtensionType == TYPE_VEX_2B) { - insn->vectorExtensionPrefix[0] = byte; - consumeByte(insn, &insn->vectorExtensionPrefix[1]); - - if (insn->mode == MODE_64BIT) { - insn->rexPrefix = 0x40 - | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2); - } - - switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) - { - default: - break; - case VEX_PREFIX_66: - hasOpSize = TRUE; - break; - } - - dbgprintf(insn, "Found 
VEX prefix 0x%hhx 0x%hhx", - insn->vectorExtensionPrefix[0], - insn->vectorExtensionPrefix[1]); - } - } - else if (byte == 0x8f) { - uint8_t byte1; - - if (lookAtByte(insn, &byte1)) { - dbgprintf(insn, "Couldn't read second byte of XOP"); - return -1; - } - - if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. */ - insn->vectorExtensionType = TYPE_XOP; - insn->necessaryPrefixLocation = insn->readerCursor - 1; - } - else { - unconsumeByte(insn); - insn->necessaryPrefixLocation = insn->readerCursor - 1; - } - - if (insn->vectorExtensionType == TYPE_XOP) { - insn->vectorExtensionPrefix[0] = byte; - consumeByte(insn, &insn->vectorExtensionPrefix[1]); - consumeByte(insn, &insn->vectorExtensionPrefix[2]); - - /* We simulate the REX prefix for simplicity's sake */ - - if (insn->mode == MODE_64BIT) { - insn->rexPrefix = 0x40 - | (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3) - | (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2) - | (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1) - | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0); - } - - switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) - { - default: - break; - case VEX_PREFIX_66: - hasOpSize = TRUE; - break; - } - - dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx", - insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], - insn->vectorExtensionPrefix[2]); - } - } - else { - if (insn->mode == MODE_64BIT) { - if ((byte & 0xf0) == 0x40) { - uint8_t opcodeByte; - - if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { - dbgprintf(insn, "Redundant REX prefix"); - return -1; - } - - insn->rexPrefix = byte; - insn->necessaryPrefixLocation = insn->readerCursor - 2; - - dbgprintf(insn, "Found REX prefix 0x%hhx", byte); - } else { - unconsumeByte(insn); - insn->necessaryPrefixLocation = insn->readerCursor - 1; - } - } else { - unconsumeByte(insn); - insn->necessaryPrefixLocation = insn->readerCursor - 1; - } - } - - if (insn->mode == MODE_16BIT) { - insn->registerSize = (hasOpSize ? 4 : 2); - insn->addressSize = (hasAdSize ? 4 : 2); - insn->displacementSize = (hasAdSize ? 4 : 2); - insn->immediateSize = (hasOpSize ? 4 : 2); - } else if (insn->mode == MODE_32BIT) { - insn->registerSize = (hasOpSize ? 2 : 4); - insn->addressSize = (hasAdSize ? 2 : 4); - insn->displacementSize = (hasAdSize ? 2 : 4); - insn->immediateSize = (hasOpSize ? 2 : 4); - } else if (insn->mode == MODE_64BIT) { - if (insn->rexPrefix && wFromREX(insn->rexPrefix)) { - insn->registerSize = 8; - insn->addressSize = (hasAdSize ? 4 : 8); - insn->displacementSize = 4; - insn->immediateSize = 4; - } else if (insn->rexPrefix) { - insn->registerSize = (hasOpSize ? 2 : 4); - insn->addressSize = (hasAdSize ? 4 : 8); - insn->displacementSize = (hasOpSize ? 2 : 4); - insn->immediateSize = (hasOpSize ? 2 : 4); - } else { - insn->registerSize = (hasOpSize ? 2 : 4); - insn->addressSize = (hasAdSize ? 4 : 8); - insn->displacementSize = (hasOpSize ? 2 : 4); - insn->immediateSize = (hasOpSize ? 2 : 4); - } - } - - return 0; -} - -/* - * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of - * extended or escape opcodes). - * - * @param insn - The instruction whose opcode is to be read. - * @return - 0 if the opcode could be read successfully; nonzero otherwise. 
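Before moving on, a hand-worked reading of readPrefixes()'s size table above (not generated output): in MODE_32BIT the byte sequence 66 b8 34 12 (mov ax, 0x1234) carries an operand-size override, so after readPrefixes():

    /* hasOpSize flips the 32-bit defaults:
     *   insn->registerSize == 2, insn->immediateSize == 2
     * no 0x67 prefix, so the mode's defaults remain:
     *   insn->addressSize == 4, insn->displacementSize == 4 */

Without the 0x66 prefix, b8 78 56 34 12 would decode as mov eax, 0x12345678 with all four sizes equal to 4.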
- */ -static int readOpcode(struct InternalInstruction* insn) { - /* Determine the length of the primary opcode */ - - uint8_t current; - - dbgprintf(insn, "readOpcode()"); - - insn->opcodeType = ONEBYTE; - - if (insn->vectorExtensionType == TYPE_EVEX) - { - switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) { - default: - dbgprintf(insn, "Unhandled mm field for instruction (0x%hhx)", - mmFromEVEX2of4(insn->vectorExtensionPrefix[1])); - return -1; - case VEX_LOB_0F: - insn->opcodeType = TWOBYTE; - return consumeByte(insn, &insn->opcode); - case VEX_LOB_0F38: - insn->opcodeType = THREEBYTE_38; - return consumeByte(insn, &insn->opcode); - case VEX_LOB_0F3A: - insn->opcodeType = THREEBYTE_3A; - return consumeByte(insn, &insn->opcode); - } - } - else if (insn->vectorExtensionType == TYPE_VEX_3B) { - switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) { - default: - dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", - mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])); - return -1; - case VEX_LOB_0F: - insn->opcodeType = TWOBYTE; - return consumeByte(insn, &insn->opcode); - case VEX_LOB_0F38: - insn->opcodeType = THREEBYTE_38; - return consumeByte(insn, &insn->opcode); - case VEX_LOB_0F3A: - insn->opcodeType = THREEBYTE_3A; - return consumeByte(insn, &insn->opcode); - } - } - else if (insn->vectorExtensionType == TYPE_VEX_2B) { - insn->opcodeType = TWOBYTE; - return consumeByte(insn, &insn->opcode); - } - else if (insn->vectorExtensionType == TYPE_XOP) { - switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) { - default: - dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", - mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])); - return -1; - case XOP_MAP_SELECT_8: - insn->opcodeType = XOP8_MAP; - return consumeByte(insn, &insn->opcode); - case XOP_MAP_SELECT_9: - insn->opcodeType = XOP9_MAP; - return consumeByte(insn, &insn->opcode); - case XOP_MAP_SELECT_A: - insn->opcodeType = XOPA_MAP; - return consumeByte(insn, &insn->opcode); - } - } - - if (consumeByte(insn, &current)) - return -1; - - if (current == 0x0f) { - dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current); - - if (consumeByte(insn, &current)) - return -1; - - if (current == 0x38) { - dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); - - if (consumeByte(insn, &current)) - return -1; - - insn->opcodeType = THREEBYTE_38; - } else if (current == 0x3a) { - dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); - - if (consumeByte(insn, &current)) - return -1; - - insn->opcodeType = THREEBYTE_3A; - } else { - dbgprintf(insn, "Didn't find a three-byte escape prefix"); - - insn->opcodeType = TWOBYTE; - } - } - - /* - * At this point we have consumed the full opcode. - * Anything we consume from here on must be unconsumed. - */ - - insn->opcode = current; - - return 0; -} - -static int readModRM(struct InternalInstruction* insn); - -/* - * getIDWithAttrMask - Determines the ID of an instruction, consuming - * the ModR/M byte as appropriate for extended and escape opcodes, - * and using a supplied attribute mask. - * - * @param instructionID - A pointer whose target is filled in with the ID of the - * instruction. - * @param insn - The instruction whose ID is to be determined. - * @param attrMask - The attribute mask to search. - * @return - 0 if the ModR/M could be read when needed or was not - * needed; nonzero otherwise.
- */ -static int getIDWithAttrMask(uint16_t* instructionID, - struct InternalInstruction* insn, - uint16_t attrMask) { - BOOL hasModRMExtension; - - uint16_t instructionClass; - - instructionClass = contextForAttrs(attrMask); - - hasModRMExtension = modRMRequired(insn->opcodeType, - instructionClass, - insn->opcode); - - if (hasModRMExtension) { - if (readModRM(insn)) - return -1; - - *instructionID = decode(insn->opcodeType, - instructionClass, - insn->opcode, - insn->modRM); - } else { - *instructionID = decode(insn->opcodeType, - instructionClass, - insn->opcode, - 0); - } - - return 0; -} - -/* - * is16BitEquivalent - Determines whether two instruction names refer to - * equivalent instructions but one is 16-bit whereas the other is not. - * - * @param orig - The instruction that is not 16-bit - * @param equiv - The instruction that is 16-bit - */ -static BOOL is16BitEquivalent(const char* orig, const char* equiv) { - off_t i; - - for (i = 0;; i++) { - if (orig[i] == '\0' && equiv[i] == '\0') - return TRUE; - if (orig[i] == '\0' || equiv[i] == '\0') - return FALSE; - if (orig[i] != equiv[i]) { - if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W') - continue; - if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1') - continue; - if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6') - continue; - return FALSE; - } - } -} - -/* - * getID - Determines the ID of an instruction, consuming the ModR/M byte as - * appropriate for extended and escape opcodes. Determines the attributes and - * context for the instruction before doing so. - * - * @param insn - The instruction whose ID is to be determined. - * @return - 0 if the ModR/M could be read when needed or was not needed; - * nonzero otherwise. - */ -static int getID(struct InternalInstruction* insn, const void *miiArg) { - uint16_t attrMask; - uint16_t instructionID; - - dbgprintf(insn, "getID()"); - - attrMask = ATTR_NONE; - - if (insn->mode == MODE_64BIT) - attrMask |= ATTR_64BIT; - - if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) { - attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? 
ATTR_EVEX : ATTR_VEX; - - if (insn->vectorExtensionType == TYPE_EVEX) { - switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) { - case VEX_PREFIX_66: - attrMask |= ATTR_OPSIZE; - break; - case VEX_PREFIX_F3: - attrMask |= ATTR_XS; - break; - case VEX_PREFIX_F2: - attrMask |= ATTR_XD; - break; - } - - if (zFromEVEX4of4(insn->vectorExtensionPrefix[3])) - attrMask |= ATTR_EVEXKZ; - if (bFromEVEX4of4(insn->vectorExtensionPrefix[3])) - attrMask |= ATTR_EVEXB; - if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3])) - attrMask |= ATTR_EVEXK; - if (lFromEVEX4of4(insn->vectorExtensionPrefix[3])) - attrMask |= ATTR_EVEXL; - if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3])) - attrMask |= ATTR_EVEXL2; - } - else if (insn->vectorExtensionType == TYPE_VEX_3B) { - switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) { - case VEX_PREFIX_66: - attrMask |= ATTR_OPSIZE; - break; - case VEX_PREFIX_F3: - attrMask |= ATTR_XS; - break; - case VEX_PREFIX_F2: - attrMask |= ATTR_XD; - break; - } - - if (lFromVEX3of3(insn->vectorExtensionPrefix[2])) - attrMask |= ATTR_VEXL; - } - else if (insn->vectorExtensionType == TYPE_VEX_2B) { - switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { - case VEX_PREFIX_66: - attrMask |= ATTR_OPSIZE; - break; - case VEX_PREFIX_F3: - attrMask |= ATTR_XS; - break; - case VEX_PREFIX_F2: - attrMask |= ATTR_XD; - break; - } - - if (lFromVEX2of2(insn->vectorExtensionPrefix[1])) - attrMask |= ATTR_VEXL; - } - else if (insn->vectorExtensionType == TYPE_XOP) { - switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { - case VEX_PREFIX_66: - attrMask |= ATTR_OPSIZE; - break; - case VEX_PREFIX_F3: - attrMask |= ATTR_XS; - break; - case VEX_PREFIX_F2: - attrMask |= ATTR_XD; - break; - } - - if (lFromXOP3of3(insn->vectorExtensionPrefix[2])) - attrMask |= ATTR_VEXL; - } - else { - return -1; - } - } - else { - if (insn->mode != MODE_16BIT && isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) - attrMask |= ATTR_OPSIZE; - else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation)) - attrMask |= ATTR_ADSIZE; - else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) - attrMask |= ATTR_XS; - else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) - attrMask |= ATTR_XD; - } - - if (insn->rexPrefix & 0x08) - attrMask |= ATTR_REXW; - - if (getIDWithAttrMask(&instructionID, insn, attrMask)) - return -1; - - /* - * JCXZ/JECXZ need special handling for 16-bit mode because the meaning - * of the AdSize prefix is inverted w.r.t. 32-bit mode. - */ - if (insn->mode == MODE_16BIT && insn->opcode == 0xE3) { - const struct InstructionSpecifier *spec; - spec = specifierForUID(instructionID); - - /* - * Check for Ii8PCRel instructions. We could alternatively do a - * string-compare on the names, but this is probably cheaper. - */ - if (x86OperandSets[spec->operands][0].type == TYPE_REL8) { - attrMask ^= ATTR_ADSIZE; - if (getIDWithAttrMask(&instructionID, insn, attrMask)) - return -1; - } - } - - /* The following clauses compensate for limitations of the tables. */ - - if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) && - !(attrMask & ATTR_OPSIZE)) { - /* - * The instruction tables make no distinction between instructions that - * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a - * particular spot (i.e., many MMX operations). In general we're - * conservative, but in the specific case where OpSize is present but not - * in the right place we check if there's a 16-bit operation. 
- */ - - const struct InstructionSpecifier *spec; - uint16_t instructionIDWithOpsize; - const char *specName, *specWithOpSizeName; - - spec = specifierForUID(instructionID); - - if (getIDWithAttrMask(&instructionIDWithOpsize, - insn, - attrMask | ATTR_OPSIZE)) { - /* - * ModRM required with OpSize but not present; give up and return version - * without OpSize set - */ - - insn->instructionID = instructionID; - insn->spec = spec; - return 0; - } - - specName = x86DisassemblerGetInstrName(instructionID, miiArg); - specWithOpSizeName = - x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg); - - if (is16BitEquivalent(specName, specWithOpSizeName) && - (insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) { - insn->instructionID = instructionIDWithOpsize; - insn->spec = specifierForUID(instructionIDWithOpsize); - } else { - insn->instructionID = instructionID; - insn->spec = spec; - } - return 0; - } - - if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 && - insn->rexPrefix & 0x01) { - /* - * NOOP shouldn't decode as NOOP if REX.b is set. Instead - * it should decode as XCHG %r8, %eax. - */ - - const struct InstructionSpecifier *spec; - uint16_t instructionIDWithNewOpcode; - const struct InstructionSpecifier *specWithNewOpcode; - - spec = specifierForUID(instructionID); - - /* Borrow opcode from one of the other XCHGar opcodes */ - insn->opcode = 0x91; - - if (getIDWithAttrMask(&instructionIDWithNewOpcode, - insn, - attrMask)) { - insn->opcode = 0x90; - - insn->instructionID = instructionID; - insn->spec = spec; - return 0; - } - - specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode); - - /* Change back */ - insn->opcode = 0x90; - - insn->instructionID = instructionIDWithNewOpcode; - insn->spec = specWithNewOpcode; - - return 0; - } - - insn->instructionID = instructionID; - insn->spec = specifierForUID(insn->instructionID); - - return 0; -} - -/* - * readSIB - Consumes the SIB byte to determine addressing information for an - * instruction. - * - * @param insn - The instruction whose SIB byte is to be read. - * @return - 0 if the SIB byte was successfully read; nonzero otherwise. 
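A hand-worked example of the field extraction readSIB() below performs, assuming 32-bit addressing with no REX prefix so the extended index/base bits are clear:

    /* SIB = 0xd8 = 0b11011000:
     *   scaleFromSIB -> 3, so sibScale = 8
     *   indexFromSIB -> 3, so sibIndex = sibIndexBase + 3 (EBX)
     *   baseFromSIB  -> 0, so sibBase  = sibBaseBase + 0 (EAX)
     * i.e. the effective address is EAX + EBX*8, plus any displacement. */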
- */ -static int readSIB(struct InternalInstruction* insn) { - SIBIndex sibIndexBase = 0; - SIBBase sibBaseBase = 0; - uint8_t index, base; - - dbgprintf(insn, "readSIB()"); - - if (insn->consumedSIB) - return 0; - - insn->consumedSIB = TRUE; - - switch (insn->addressSize) { - case 2: - dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode"); - return -1; - case 4: - sibIndexBase = SIB_INDEX_EAX; - sibBaseBase = SIB_BASE_EAX; - break; - case 8: - sibIndexBase = SIB_INDEX_RAX; - sibBaseBase = SIB_BASE_RAX; - break; - } - - if (consumeByte(insn, &insn->sib)) - return -1; - - index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3); - if (insn->vectorExtensionType == TYPE_EVEX) - index |= v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4; - - switch (index) { - case 0x4: - insn->sibIndex = SIB_INDEX_NONE; - break; - default: - insn->sibIndex = (SIBIndex)(sibIndexBase + index); - if (insn->sibIndex == SIB_INDEX_sib || - insn->sibIndex == SIB_INDEX_sib64) - insn->sibIndex = SIB_INDEX_NONE; - break; - } - - switch (scaleFromSIB(insn->sib)) { - case 0: - insn->sibScale = 1; - break; - case 1: - insn->sibScale = 2; - break; - case 2: - insn->sibScale = 4; - break; - case 3: - insn->sibScale = 8; - break; - } - - base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3); - - switch (base) { - case 0x5: - case 0xd: - switch (modFromModRM(insn->modRM)) { - case 0x0: - insn->eaDisplacement = EA_DISP_32; - insn->sibBase = SIB_BASE_NONE; - break; - case 0x1: - insn->eaDisplacement = EA_DISP_8; - insn->sibBase = (SIBBase)(sibBaseBase + base); - break; - case 0x2: - insn->eaDisplacement = EA_DISP_32; - insn->sibBase = (SIBBase)(sibBaseBase + base); - break; - case 0x3: - debug("Cannot have Mod = 0b11 and a SIB byte"); - return -1; - } - break; - default: - insn->sibBase = (SIBBase)(sibBaseBase + base); - break; - } - - return 0; -} - -/* - * readDisplacement - Consumes the displacement of an instruction. - * - * @param insn - The instruction whose displacement is to be read. - * @return - 0 if the displacement byte was successfully read; nonzero - * otherwise. - */ -static int readDisplacement(struct InternalInstruction* insn) { - int8_t d8; - int16_t d16; - int32_t d32; - - dbgprintf(insn, "readDisplacement()"); - - if (insn->consumedDisplacement) - return 0; - - insn->consumedDisplacement = TRUE; - insn->displacementOffset = insn->readerCursor - insn->startLocation; - - switch (insn->eaDisplacement) { - case EA_DISP_NONE: - insn->consumedDisplacement = FALSE; - break; - case EA_DISP_8: - if (consumeInt8(insn, &d8)) - return -1; - insn->displacement = d8; - break; - case EA_DISP_16: - if (consumeInt16(insn, &d16)) - return -1; - insn->displacement = d16; - break; - case EA_DISP_32: - if (consumeInt32(insn, &d32)) - return -1; - insn->displacement = d32; - break; - } - - insn->consumedDisplacement = TRUE; - return 0; -} - -/* - * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and - * displacement) for an instruction and interprets it. - * - * @param insn - The instruction whose addressing information is to be read. - * @return - 0 if the information was successfully read; nonzero otherwise. 
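For instance, a hand-worked case for readModRM() below (32-bit addressing, no REX, and assuming the EA_BASE enumerators follow the hardware register numbering, as the + rm arithmetic requires):

    /* modRM = 0x45 = 0b01000101: mod = 1, reg = 0, rm = 5
     *   insn->reg    = insn->regBase + 0 (EAX, before any fixupReg() correction)
     *   insn->eaBase = insn->eaBaseBase + 5 (EBP) with an 8-bit displacement,
     * i.e. the memory operand [ebp + disp8]. */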
- */ -static int readModRM(struct InternalInstruction* insn) { - uint8_t mod, rm, reg; - - dbgprintf(insn, "readModRM()"); - - if (insn->consumedModRM) - return 0; - - if (consumeByte(insn, &insn->modRM)) - return -1; - insn->consumedModRM = TRUE; - - mod = modFromModRM(insn->modRM); - rm = rmFromModRM(insn->modRM); - reg = regFromModRM(insn->modRM); - - /* - * This goes by insn->registerSize to pick the correct register, which messes - * up if we're using (say) XMM or 8-bit register operands. That gets fixed in - * fixupReg(). - */ - switch (insn->registerSize) { - case 2: - insn->regBase = MODRM_REG_AX; - insn->eaRegBase = EA_REG_AX; - break; - case 4: - insn->regBase = MODRM_REG_EAX; - insn->eaRegBase = EA_REG_EAX; - break; - case 8: - insn->regBase = MODRM_REG_RAX; - insn->eaRegBase = EA_REG_RAX; - break; - } - - reg |= rFromREX(insn->rexPrefix) << 3; - rm |= bFromREX(insn->rexPrefix) << 3; - if (insn->vectorExtensionType == TYPE_EVEX) { - reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4; - rm |= xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4; - } - - insn->reg = (Reg)(insn->regBase + reg); - - switch (insn->addressSize) { - case 2: - insn->eaBaseBase = EA_BASE_BX_SI; - - switch (mod) { - case 0x0: - if (rm == 0x6) { - insn->eaBase = EA_BASE_NONE; - insn->eaDisplacement = EA_DISP_16; - if (readDisplacement(insn)) - return -1; - } else { - insn->eaBase = (EABase)(insn->eaBaseBase + rm); - insn->eaDisplacement = EA_DISP_NONE; - } - break; - case 0x1: - insn->eaBase = (EABase)(insn->eaBaseBase + rm); - insn->eaDisplacement = EA_DISP_8; - insn->displacementSize = 1; - if (readDisplacement(insn)) - return -1; - break; - case 0x2: - insn->eaBase = (EABase)(insn->eaBaseBase + rm); - insn->eaDisplacement = EA_DISP_16; - if (readDisplacement(insn)) - return -1; - break; - case 0x3: - insn->eaBase = (EABase)(insn->eaRegBase + rm); - if (readDisplacement(insn)) - return -1; - break; - } - break; - case 4: - case 8: - insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX); - - switch (mod) { - case 0x0: - insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */ - switch (rm) { - case 0x14: - case 0x4: - case 0xc: /* in case REXW.b is set */ - insn->eaBase = (insn->addressSize == 4 ? - EA_BASE_sib : EA_BASE_sib64); - if (readSIB(insn) || readDisplacement(insn)) - return -1; - break; - case 0x5: - insn->eaBase = EA_BASE_NONE; - insn->eaDisplacement = EA_DISP_32; - if (readDisplacement(insn)) - return -1; - break; - default: - insn->eaBase = (EABase)(insn->eaBaseBase + rm); - break; - } - break; - case 0x1: - insn->displacementSize = 1; - /* FALLTHROUGH */ - case 0x2: - insn->eaDisplacement = (mod == 0x1 ? 
EA_DISP_8 : EA_DISP_32); - switch (rm) { - case 0x14: - case 0x4: - case 0xc: /* in case REXW.b is set */ - insn->eaBase = EA_BASE_sib; - if (readSIB(insn) || readDisplacement(insn)) - return -1; - break; - default: - insn->eaBase = (EABase)(insn->eaBaseBase + rm); - if (readDisplacement(insn)) - return -1; - break; - } - break; - case 0x3: - insn->eaDisplacement = EA_DISP_NONE; - insn->eaBase = (EABase)(insn->eaRegBase + rm); - break; - } - break; - } /* switch (insn->addressSize) */ - - return 0; -} - -#define GENERIC_FIXUP_FUNC(name, base, prefix) \ - static uint8_t name(struct InternalInstruction *insn, \ - OperandType type, \ - uint8_t index, \ - uint8_t *valid) { \ - *valid = 1; \ - switch (type) { \ - default: \ - debug("Unhandled register type"); \ - *valid = 0; \ - return 0; \ - case TYPE_Rv: \ - return base + index; \ - case TYPE_R8: \ - if (insn->rexPrefix && \ - index >= 4 && index <= 7) { \ - return prefix##_SPL + (index - 4); \ - } else { \ - return prefix##_AL + index; \ - } \ - case TYPE_R16: \ - return prefix##_AX + index; \ - case TYPE_R32: \ - return prefix##_EAX + index; \ - case TYPE_R64: \ - return prefix##_RAX + index; \ - case TYPE_XMM512: \ - return prefix##_ZMM0 + index; \ - case TYPE_XMM256: \ - return prefix##_YMM0 + index; \ - case TYPE_XMM128: \ - case TYPE_XMM64: \ - case TYPE_XMM32: \ - case TYPE_XMM: \ - return prefix##_XMM0 + index; \ - case TYPE_VK1: \ - case TYPE_VK8: \ - case TYPE_VK16: \ - return prefix##_K0 + index; \ - case TYPE_MM64: \ - case TYPE_MM32: \ - case TYPE_MM: \ - if (index > 7) \ - *valid = 0; \ - return prefix##_MM0 + index; \ - case TYPE_SEGMENTREG: \ - if (index > 5) \ - *valid = 0; \ - return prefix##_ES + index; \ - case TYPE_DEBUGREG: \ - if (index > 7) \ - *valid = 0; \ - return prefix##_DR0 + index; \ - case TYPE_CONTROLREG: \ - if (index > 8) \ - *valid = 0; \ - return prefix##_CR0 + index; \ - } \ - } - -/* - * fixup*Value - Consults an operand type to determine the meaning of the - * reg or R/M field. If the operand is an XMM operand, for example, the - * operand would be XMM0 instead of AX, as readModRM() would otherwise - * misinterpret it. - * - * @param insn - The instruction containing the operand. - * @param type - The operand type. - * @param index - The existing value of the field as reported by readModRM(). - * @param valid - The address of a uint8_t. The target is set to 1 if the - * field is valid for the register class; 0 if not. - * @return - The proper value. - */ -GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG) -GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG) - -/* - * fixupReg - Consults an operand specifier to determine which of the - * fixup*Value functions to use in correcting readModRM()'s interpretation. - * - * @param insn - See fixup*Value(). - * @param op - The operand specifier. - * @return - 0 if fixup was successful; -1 if the register returned was - * invalid for its class.
- */ -static int fixupReg(struct InternalInstruction *insn, - const struct OperandSpecifier *op) { - uint8_t valid; - - dbgprintf(insn, "fixupReg()"); - - switch ((OperandEncoding)op->encoding) { - default: - debug("Expected a REG or R/M encoding in fixupReg"); - return -1; - case ENCODING_VVVV: - insn->vvvv = (Reg)fixupRegValue(insn, - (OperandType)op->type, - insn->vvvv, - &valid); - if (!valid) - return -1; - break; - case ENCODING_REG: - insn->reg = (Reg)fixupRegValue(insn, - (OperandType)op->type, - insn->reg - insn->regBase, - &valid); - if (!valid) - return -1; - break; - case ENCODING_RM: - if (insn->eaBase >= insn->eaRegBase) { - insn->eaBase = (EABase)fixupRMValue(insn, - (OperandType)op->type, - insn->eaBase - insn->eaRegBase, - &valid); - if (!valid) - return -1; - } - break; - } - - return 0; -} - -/* - * readOpcodeRegister - Reads an operand from the opcode field of an - * instruction and interprets it appropriately given the operand width. - * Handles AddRegFrm instructions. - * - * @param insn - the instruction whose opcode field is to be read. - * @param size - The width (in bytes) of the register being specified. - * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means - * RAX. - * @return - 0 on success; nonzero otherwise. - */ -static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) { - dbgprintf(insn, "readOpcodeRegister()"); - - if (size == 0) - size = insn->registerSize; - - switch (size) { - case 1: - insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) - | (insn->opcode & 7))); - if (insn->rexPrefix && - insn->opcodeRegister >= MODRM_REG_AL + 0x4 && - insn->opcodeRegister < MODRM_REG_AL + 0x8) { - insn->opcodeRegister = (Reg)(MODRM_REG_SPL - + (insn->opcodeRegister - MODRM_REG_AL - 4)); - } - - break; - case 2: - insn->opcodeRegister = (Reg)(MODRM_REG_AX - + ((bFromREX(insn->rexPrefix) << 3) - | (insn->opcode & 7))); - break; - case 4: - insn->opcodeRegister = (Reg)(MODRM_REG_EAX - + ((bFromREX(insn->rexPrefix) << 3) - | (insn->opcode & 7))); - break; - case 8: - insn->opcodeRegister = (Reg)(MODRM_REG_RAX - + ((bFromREX(insn->rexPrefix) << 3) - | (insn->opcode & 7))); - break; - } - - return 0; -} - -/* - * readImmediate - Consumes an immediate operand from an instruction, given the - * desired operand size. - * - * @param insn - The instruction whose operand is to be read. - * @param size - The width (in bytes) of the operand. - * @return - 0 if the immediate was successfully consumed; nonzero - * otherwise. 
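A quick sanity check of the arithmetic in readOpcodeRegister() above (hand-worked, not program output):

    /* bytes "41 b8 ..." in 64-bit mode: REX = 0x41 sets REX.B, opcode = 0xb8.
     * With size == 4: opcodeRegister = MODRM_REG_EAX + ((1 << 3) | (0xb8 & 7))
     *                                = MODRM_REG_EAX + 8, i.e. R8D, not EAX. */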
- */ -static int readImmediate(struct InternalInstruction* insn, uint8_t size) { - uint8_t imm8; - uint16_t imm16; - uint32_t imm32; - uint64_t imm64; - - dbgprintf(insn, "readImmediate()"); - - if (insn->numImmediatesConsumed == 2) { - debug("Already consumed two immediates"); - return -1; - } - - if (size == 0) - size = insn->immediateSize; - else - insn->immediateSize = size; - insn->immediateOffset = insn->readerCursor - insn->startLocation; - - switch (size) { - case 1: - if (consumeByte(insn, &imm8)) - return -1; - insn->immediates[insn->numImmediatesConsumed] = imm8; - break; - case 2: - if (consumeUInt16(insn, &imm16)) - return -1; - insn->immediates[insn->numImmediatesConsumed] = imm16; - break; - case 4: - if (consumeUInt32(insn, &imm32)) - return -1; - insn->immediates[insn->numImmediatesConsumed] = imm32; - break; - case 8: - if (consumeUInt64(insn, &imm64)) - return -1; - insn->immediates[insn->numImmediatesConsumed] = imm64; - break; - } - - insn->numImmediatesConsumed++; - - return 0; -} - -/* - * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix. - * - * @param insn - The instruction whose operand is to be read. - * @return - 0 if the vvvv was successfully consumed; nonzero - * otherwise. - */ -static int readVVVV(struct InternalInstruction* insn) { - dbgprintf(insn, "readVVVV()"); - - if (insn->vectorExtensionType == TYPE_EVEX) - insn->vvvv = vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]); - else if (insn->vectorExtensionType == TYPE_VEX_3B) - insn->vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]); - else if (insn->vectorExtensionType == TYPE_VEX_2B) - insn->vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]); - else if (insn->vectorExtensionType == TYPE_XOP) - insn->vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]); - else - return -1; - - if (insn->mode != MODE_64BIT) - insn->vvvv &= 0x7; - - return 0; -} - -/* - * readMaskRegister - Reads a mask register from the opcode field of an - * instruction. - * - * @param insn - The instruction whose opcode field is to be read. - * @return - 0 on success; nonzero otherwise. - */ -static int readMaskRegister(struct InternalInstruction* insn) { - dbgprintf(insn, "readMaskRegister()"); - - if (insn->vectorExtensionType != TYPE_EVEX) - return -1; - - insn->writemask = aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]); - return 0; -} - -/* - * readOperands - Consults the specifier for an instruction and consumes all - * operands for that instruction, interpreting them as it goes. - * - * @param insn - The instruction whose operands are to be read and interpreted. - * @return - 0 if all operands could be read; nonzero otherwise. - */ -static int readOperands(struct InternalInstruction* insn) { - int index; - int hasVVVV, needVVVV; - int sawRegImm = 0; - - dbgprintf(insn, "readOperands()"); - - /* If a non-zero vvvv is specified, we need to make sure one of the operands - uses it.
*/ - hasVVVV = !readVVVV(insn); - needVVVV = hasVVVV && (insn->vvvv != 0); - - for (index = 0; index < X86_MAX_OPERANDS; ++index) { - switch (x86OperandSets[insn->spec->operands][index].encoding) { - case ENCODING_NONE: - case ENCODING_SI: - case ENCODING_DI: - break; - case ENCODING_REG: - case ENCODING_RM: - if (readModRM(insn)) - return -1; - if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index])) - return -1; - break; - case ENCODING_CB: - case ENCODING_CW: - case ENCODING_CD: - case ENCODING_CP: - case ENCODING_CO: - case ENCODING_CT: - dbgprintf(insn, "We currently don't handle code-offset encodings"); - return -1; - case ENCODING_IB: - if (sawRegImm) { - /* Saw a register immediate so don't read again and instead split the - previous immediate. FIXME: This is a hack. */ - insn->immediates[insn->numImmediatesConsumed] = - insn->immediates[insn->numImmediatesConsumed - 1] & 0xf; - ++insn->numImmediatesConsumed; - break; - } - if (readImmediate(insn, 1)) - return -1; - if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM3 && - insn->immediates[insn->numImmediatesConsumed - 1] > 7) - return -1; - if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM5 && - insn->immediates[insn->numImmediatesConsumed - 1] > 31) - return -1; - if (x86OperandSets[insn->spec->operands][index].type == TYPE_XMM128 || - x86OperandSets[insn->spec->operands][index].type == TYPE_XMM256) - sawRegImm = 1; - break; - case ENCODING_IW: - if (readImmediate(insn, 2)) - return -1; - break; - case ENCODING_ID: - if (readImmediate(insn, 4)) - return -1; - break; - case ENCODING_IO: - if (readImmediate(insn, 8)) - return -1; - break; - case ENCODING_Iv: - if (readImmediate(insn, insn->immediateSize)) - return -1; - break; - case ENCODING_Ia: - if (readImmediate(insn, insn->addressSize)) - return -1; - break; - case ENCODING_RB: - if (readOpcodeRegister(insn, 1)) - return -1; - break; - case ENCODING_RW: - if (readOpcodeRegister(insn, 2)) - return -1; - break; - case ENCODING_RD: - if (readOpcodeRegister(insn, 4)) - return -1; - break; - case ENCODING_RO: - if (readOpcodeRegister(insn, 8)) - return -1; - break; - case ENCODING_Rv: - if (readOpcodeRegister(insn, 0)) - return -1; - break; - case ENCODING_FP: - break; - case ENCODING_VVVV: - needVVVV = 0; /* Mark that we have found a VVVV operand. */ - if (!hasVVVV) - return -1; - if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index])) - return -1; - break; - case ENCODING_WRITEMASK: - if (readMaskRegister(insn)) - return -1; - break; - case ENCODING_DUP: - break; - default: - dbgprintf(insn, "Encountered an operand with an unknown encoding."); - return -1; - } - } - - /* If we didn't find an ENCODING_VVVV operand but a non-zero vvvv is present, fail */ - if (needVVVV) return -1; - - return 0; -} - -/* - * decodeInstruction - Reads and interprets a full instruction provided by the - * user. - * - * @param insn - A pointer to the instruction to be populated. Must be - * pre-allocated. - * @param reader - The function to be used to read the instruction's bytes. - * @param readerArg - A generic argument to be passed to the reader to store - * any internal state. - * @param logger - If non-NULL, the function to be used to write log messages - * and warnings. - * @param loggerArg - A generic argument to be passed to the logger to store - * any internal state. - * @param startLoc - The address (in the reader's address space) of the first - * byte in the instruction.
- * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to - * decode the instruction in. - * @return - 0 if the instruction's memory could be read; nonzero if - * not. - */ -int decodeInstruction(struct InternalInstruction* insn, - byteReader_t reader, - const void* readerArg, - dlog_t logger, - void* loggerArg, - const void* miiArg, - uint64_t startLoc, - DisassemblerMode mode) { - memset(insn, 0, sizeof(struct InternalInstruction)); - - insn->reader = reader; - insn->readerArg = readerArg; - insn->dlog = logger; - insn->dlogArg = loggerArg; - insn->startLocation = startLoc; - insn->readerCursor = startLoc; - insn->mode = mode; - insn->numImmediatesConsumed = 0; - - if (readPrefixes(insn) || - readOpcode(insn) || - getID(insn, miiArg) || - insn->instructionID == 0 || - readOperands(insn)) - return -1; - - insn->operands = &x86OperandSets[insn->spec->operands][0]; - - insn->length = insn->readerCursor - insn->startLocation; - - dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu", - startLoc, insn->readerCursor, insn->length); - - if (insn->length > 15) - dbgprintf(insn, "Instruction exceeds 15-byte limit"); - - return 0; -} diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp new file mode 100644 index 0000000..804606d --- /dev/null +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -0,0 +1,1838 @@ +//===-- X86DisassemblerDecoder.c - Disassembler decoder -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the X86 Disassembler. +// It contains the implementation of the instruction decoder. +// Documentation for the disassembler can be found in X86Disassembler.h. +// +//===----------------------------------------------------------------------===// + +#include <cstdarg> /* for va_*() */ +#include <cstdio> /* for vsnprintf() */ +#include <cstdlib> /* for exit() */ +#include <cstring> /* for memset() */ + +#include "X86DisassemblerDecoder.h" + +using namespace llvm::X86Disassembler; + +/// Specifies whether a ModR/M byte is needed and (if so) which +/// instruction each possible value of the ModR/M byte corresponds to. Once +/// this information is known, we have narrowed down to a single instruction. +struct ModRMDecision { + uint8_t modrm_type; + uint16_t instructionIDs; +}; + +/// Specifies which set of ModR/M->instruction tables to look at +/// given a particular opcode. +struct OpcodeDecision { + ModRMDecision modRMDecisions[256]; +}; + +/// Specifies which opcode->instruction tables to look at given +/// a particular context (set of attributes). Since there are many possible +/// contexts, the decoder first uses CONTEXTS_SYM to determine which context +/// applies given a specific set of attributes. Hence there are only IC_max +/// entries in this table, rather than 2^(ATTR_max). +struct ContextDecision { + OpcodeDecision opcodeDecisions[IC_max]; +}; + +#include "X86GenDisassemblerTables.inc" + +#ifndef NDEBUG +#define debug(s) do { Debug(__FILE__, __LINE__, s); } while (0) +#else +#define debug(s) do { } while (0) +#endif + + +/* + * contextForAttrs - Client for the instruction context table. Takes a set of + * attributes and returns the appropriate decode context. + * + * @param attrMask - Attributes, from the enumeration attributeBits.
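Tying the interface together, a caller might drive the decoder roughly as follows. This sketch reuses the hypothetical BufferRegion/regionReader from earlier; only decodeInstruction's signature and the MODE_32BIT constant come from the surrounding code, and error handling is elided.

    uint8_t code[] = { 0x66, 0xb8, 0x34, 0x12 };        /* mov ax, 0x1234 */
    BufferRegion region = { code, 0x1000, sizeof(code) };
    InternalInstruction insn;

    if (decodeInstruction(&insn, regionReader, &region,
                          /*logger=*/nullptr, /*loggerArg=*/nullptr,
                          /*miiArg=*/nullptr, /*startLoc=*/0x1000,
                          MODE_32BIT) == 0) {
      /* insn.instructionID, insn.spec and insn.length are now populated. */
    }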
+ * @return - The InstructionContext to use when looking up + * an instruction with these attributes. + */ +static InstructionContext contextForAttrs(uint16_t attrMask) { + return static_cast<InstructionContext>(CONTEXTS_SYM[attrMask]); +} + +/* + * modRMRequired - Reads the appropriate instruction table to determine whether + * the ModR/M byte is required to decode a particular instruction. + * + * @param type - The opcode type (i.e., how many bytes it has). + * @param insnContext - The context for the instruction, as returned by + * contextForAttrs. + * @param opcode - The last byte of the instruction's opcode, not counting + * ModR/M extensions and escapes. + * @return - true if the ModR/M byte is required, false otherwise. + */ +static int modRMRequired(OpcodeType type, + InstructionContext insnContext, + uint16_t opcode) { + const struct ContextDecision* decision = nullptr; + + switch (type) { + case ONEBYTE: + decision = &ONEBYTE_SYM; + break; + case TWOBYTE: + decision = &TWOBYTE_SYM; + break; + case THREEBYTE_38: + decision = &THREEBYTE38_SYM; + break; + case THREEBYTE_3A: + decision = &THREEBYTE3A_SYM; + break; + case XOP8_MAP: + decision = &XOP8_MAP_SYM; + break; + case XOP9_MAP: + decision = &XOP9_MAP_SYM; + break; + case XOPA_MAP: + decision = &XOPA_MAP_SYM; + break; + } + + return decision->opcodeDecisions[insnContext].modRMDecisions[opcode]. + modrm_type != MODRM_ONEENTRY; +} + +/* + * decode - Reads the appropriate instruction table to obtain the unique ID of + * an instruction. + * + * @param type - See modRMRequired(). + * @param insnContext - See modRMRequired(). + * @param opcode - See modRMRequired(). + * @param modRM - The ModR/M byte if required, or any value if not. + * @return - The UID of the instruction, or 0 on failure. + */ +static InstrUID decode(OpcodeType type, + InstructionContext insnContext, + uint8_t opcode, + uint8_t modRM) { + const struct ModRMDecision* dec = nullptr; + + switch (type) { + case ONEBYTE: + dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case TWOBYTE: + dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_38: + dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_3A: + dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case XOP8_MAP: + dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case XOP9_MAP: + dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case XOPA_MAP: + dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + } + + switch (dec->modrm_type) { + default: + debug("Corrupt table! Unknown modrm_type"); + return 0; + case MODRM_ONEENTRY: + return modRMTable[dec->instructionIDs]; + case MODRM_SPLITRM: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs+1]; + return modRMTable[dec->instructionIDs]; + case MODRM_SPLITREG: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8]; + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; + case MODRM_SPLITMISC: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8]; + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; + case MODRM_FULL: + return modRMTable[dec->instructionIDs+modRM]; + } +} + +/* + * specifierForUID - Given a UID, returns the name and operand specification for + * that instruction.
+ * + * @param uid - The unique ID for the instruction. This should be returned by + * decode(); specifierForUID will not check bounds. + * @return - A pointer to the specification for that instruction. + */ +static const struct InstructionSpecifier *specifierForUID(InstrUID uid) { + return &INSTRUCTIONS_SYM[uid]; +} + +/* + * consumeByte - Uses the reader function provided by the user to consume one + * byte from the instruction's memory and advance the cursor. + * + * @param insn - The instruction with the reader function to use. The cursor + * for this instruction is advanced. + * @param byte - A pointer to a pre-allocated memory buffer to be populated + * with the data read. + * @return - 0 if the read was successful; nonzero otherwise. + */ +static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) { + int ret = insn->reader(insn->readerArg, byte, insn->readerCursor); + + if (!ret) + ++(insn->readerCursor); + + return ret; +} + +/* + * lookAtByte - Like consumeByte, but does not advance the cursor. + * + * @param insn - See consumeByte(). + * @param byte - See consumeByte(). + * @return - See consumeByte(). + */ +static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) { + return insn->reader(insn->readerArg, byte, insn->readerCursor); +} + +static void unconsumeByte(struct InternalInstruction* insn) { + insn->readerCursor--; +} + +#define CONSUME_FUNC(name, type) \ + static int name(struct InternalInstruction* insn, type* ptr) { \ + type combined = 0; \ + unsigned offset; \ + for (offset = 0; offset < sizeof(type); ++offset) { \ + uint8_t byte; \ + int ret = insn->reader(insn->readerArg, \ + &byte, \ + insn->readerCursor + offset); \ + if (ret) \ + return ret; \ + combined = combined | ((uint64_t)byte << (offset * 8)); \ + } \ + *ptr = combined; \ + insn->readerCursor += sizeof(type); \ + return 0; \ + } + +/* + * consume* - Use the reader function provided by the user to consume data + * values of various sizes from the instruction's memory and advance the + * cursor appropriately. These readers perform endian conversion. + * + * @param insn - See consumeByte(). + * @param ptr - A pointer to a pre-allocated memory of appropriate size to + * be populated with the data read. + * @return - See consumeByte(). + */ +CONSUME_FUNC(consumeInt8, int8_t) +CONSUME_FUNC(consumeInt16, int16_t) +CONSUME_FUNC(consumeInt32, int32_t) +CONSUME_FUNC(consumeUInt16, uint16_t) +CONSUME_FUNC(consumeUInt32, uint32_t) +CONSUME_FUNC(consumeUInt64, uint64_t) + +/* + * dbgprintf - Uses the logging function provided by the user to log a single + * message, typically without a carriage-return. + * + * @param insn - The instruction containing the logging function. + * @param format - See printf(). + * @param ... - See printf(). + */ +static void dbgprintf(struct InternalInstruction* insn, + const char* format, + ...) { + char buffer[256]; + va_list ap; + + if (!insn->dlog) + return; + + va_start(ap, format); + (void)vsnprintf(buffer, sizeof(buffer), format, ap); + va_end(ap); + + insn->dlog(insn->dlogArg, buffer); + + return; +} + +/* + * setPrefixPresent - Marks that a particular prefix is present at a particular + * location. + * + * @param insn - The instruction to be marked as having the prefix. + * @param prefix - The prefix that is present. + * @param location - The location where the prefix is located (in the address + * space of the instruction's reader). 
+ */ +static void setPrefixPresent(struct InternalInstruction* insn, + uint8_t prefix, + uint64_t location) +{ + insn->prefixPresent[prefix] = 1; + insn->prefixLocations[prefix] = location; +} + +/* + * isPrefixAtLocation - Queries an instruction to determine whether a prefix is + * present at a given location. + * + * @param insn - The instruction to be queried. + * @param prefix - The prefix. + * @param location - The location to query. + * @return - Whether the prefix is at that location. + */ +static bool isPrefixAtLocation(struct InternalInstruction* insn, + uint8_t prefix, + uint64_t location) +{ + if (insn->prefixPresent[prefix] == 1 && + insn->prefixLocations[prefix] == location) + return true; + else + return false; +} + +/* + * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the + * instruction as having them. Also sets the instruction's default operand, + * address, and other relevant data sizes to report operands correctly. + * + * @param insn - The instruction whose prefixes are to be read. + * @return - 0 if the instruction could be read until the end of the prefix + * bytes, and no prefixes conflicted; nonzero otherwise. + */ +static int readPrefixes(struct InternalInstruction* insn) { + bool isPrefix = true; + bool prefixGroups[4] = { false }; + uint64_t prefixLocation; + uint8_t byte = 0; + uint8_t nextByte; + + bool hasAdSize = false; + bool hasOpSize = false; + + dbgprintf(insn, "readPrefixes()"); + + while (isPrefix) { + prefixLocation = insn->readerCursor; + + /* If we fail reading prefixes, just stop here and let the opcode reader deal with it */ + if (consumeByte(insn, &byte)) + break; + + /* + * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then + * break and let it be disassembled as a normal "instruction". + */ + if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) + break; + + if (insn->readerCursor - 1 == insn->startLocation + && (byte == 0xf2 || byte == 0xf3) + && !lookAtByte(insn, &nextByte)) + { + /* + * If the byte is 0xf2 or 0xf3, and any of the following conditions are + * met: + * - it is followed by a LOCK (0xf0) prefix + * - it is followed by an xchg instruction + * then it should be disassembled as a xacquire/xrelease not repne/rep. + */ + if ((byte == 0xf2 || byte == 0xf3) && + ((nextByte == 0xf0) | + ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) + insn->xAcquireRelease = true; + /* + * Also if the byte is 0xf3, and the following condition is met: + * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or + * "mov mem, imm" (opcode 0xc6/0xc7) instructions. + * then it should be disassembled as an xrelease not rep. 
+ */ + if (byte == 0xf3 && + (nextByte == 0x88 || nextByte == 0x89 || + nextByte == 0xc6 || nextByte == 0xc7)) + insn->xAcquireRelease = true; + if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) { + if (consumeByte(insn, &nextByte)) + return -1; + if (lookAtByte(insn, &nextByte)) + return -1; + unconsumeByte(insn); + } + if (nextByte != 0x0f && nextByte != 0x90) + break; + } + + switch (byte) { + case 0xf0: /* LOCK */ + case 0xf2: /* REPNE/REPNZ */ + case 0xf3: /* REP or REPE/REPZ */ + if (prefixGroups[0]) + dbgprintf(insn, "Redundant Group 1 prefix"); + prefixGroups[0] = true; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x2e: /* CS segment override -OR- Branch not taken */ + case 0x36: /* SS segment override -OR- Branch taken */ + case 0x3e: /* DS segment override */ + case 0x26: /* ES segment override */ + case 0x64: /* FS segment override */ + case 0x65: /* GS segment override */ + switch (byte) { + case 0x2e: + insn->segmentOverride = SEG_OVERRIDE_CS; + break; + case 0x36: + insn->segmentOverride = SEG_OVERRIDE_SS; + break; + case 0x3e: + insn->segmentOverride = SEG_OVERRIDE_DS; + break; + case 0x26: + insn->segmentOverride = SEG_OVERRIDE_ES; + break; + case 0x64: + insn->segmentOverride = SEG_OVERRIDE_FS; + break; + case 0x65: + insn->segmentOverride = SEG_OVERRIDE_GS; + break; + default: + debug("Unhandled override"); + return -1; + } + if (prefixGroups[1]) + dbgprintf(insn, "Redundant Group 2 prefix"); + prefixGroups[1] = true; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x66: /* Operand-size override */ + if (prefixGroups[2]) + dbgprintf(insn, "Redundant Group 3 prefix"); + prefixGroups[2] = true; + hasOpSize = true; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x67: /* Address-size override */ + if (prefixGroups[3]) + dbgprintf(insn, "Redundant Group 4 prefix"); + prefixGroups[3] = true; + hasAdSize = true; + setPrefixPresent(insn, byte, prefixLocation); + break; + default: /* Not a prefix byte */ + isPrefix = false; + break; + } + + if (isPrefix) + dbgprintf(insn, "Found prefix 0x%hhx", byte); + } + + insn->vectorExtensionType = TYPE_NO_VEX_XOP; + + if (byte == 0x62) { + uint8_t byte1, byte2; + + if (consumeByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of EVEX prefix"); + return -1; + } + + if (lookAtByte(insn, &byte2)) { + dbgprintf(insn, "Couldn't read third byte of EVEX prefix"); + return -1; + } + + if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) && + ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) { + insn->vectorExtensionType = TYPE_EVEX; + } + else { + unconsumeByte(insn); /* unconsume byte1 */ + unconsumeByte(insn); /* unconsume byte */ + insn->necessaryPrefixLocation = insn->readerCursor - 2; + } + + if (insn->vectorExtensionType == TYPE_EVEX) { + insn->vectorExtensionPrefix[0] = byte; + insn->vectorExtensionPrefix[1] = byte1; + if (consumeByte(insn, &insn->vectorExtensionPrefix[2])) { + dbgprintf(insn, "Couldn't read third byte of EVEX prefix"); + return -1; + } + if (consumeByte(insn, &insn->vectorExtensionPrefix[3])) { + dbgprintf(insn, "Couldn't read fourth byte of EVEX prefix"); + return -1; + } + + /* We simulate the REX prefix for simplicity's sake */ + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3) + | (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2) + | (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1) + | (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0); + } + + dbgprintf(insn, "Found 
EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], + insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]); + } + } + else if (byte == 0xc4) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of VEX"); + return -1; + } + + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { + insn->vectorExtensionType = TYPE_VEX_3B; + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + + if (insn->vectorExtensionType == TYPE_VEX_3B) { + insn->vectorExtensionPrefix[0] = byte; + consumeByte(insn, &insn->vectorExtensionPrefix[1]); + consumeByte(insn, &insn->vectorExtensionPrefix[2]); + + /* We simulate the REX prefix for simplicity's sake */ + + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3) + | (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2) + | (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1) + | (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0); + } + + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], + insn->vectorExtensionPrefix[2]); + } + } + else if (byte == 0xc5) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of VEX"); + return -1; + } + + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { + insn->vectorExtensionType = TYPE_VEX_2B; + } + else { + unconsumeByte(insn); + } + + if (insn->vectorExtensionType == TYPE_VEX_2B) { + insn->vectorExtensionPrefix[0] = byte; + consumeByte(insn, &insn->vectorExtensionPrefix[1]); + + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2); + } + + switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) + { + default: + break; + case VEX_PREFIX_66: + hasOpSize = true; + break; + } + + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], + insn->vectorExtensionPrefix[1]); + } + } + else if (byte == 0x8f) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of XOP"); + return -1; + } + + if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. 
*/ + insn->vectorExtensionType = TYPE_XOP; + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + + if (insn->vectorExtensionType == TYPE_XOP) { + insn->vectorExtensionPrefix[0] = byte; + consumeByte(insn, &insn->vectorExtensionPrefix[1]); + consumeByte(insn, &insn->vectorExtensionPrefix[2]); + + /* We simulate the REX prefix for simplicity's sake */ + + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3) + | (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2) + | (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1) + | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0); + } + + switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) + { + default: + break; + case VEX_PREFIX_66: + hasOpSize = true; + break; + } + + dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], + insn->vectorExtensionPrefix[2]); + } + } + else { + if (insn->mode == MODE_64BIT) { + if ((byte & 0xf0) == 0x40) { + uint8_t opcodeByte; + + if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { + dbgprintf(insn, "Redundant REX prefix"); + return -1; + } + + insn->rexPrefix = byte; + insn->necessaryPrefixLocation = insn->readerCursor - 2; + + dbgprintf(insn, "Found REX prefix 0x%hhx", byte); + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + } + + if (insn->mode == MODE_16BIT) { + insn->registerSize = (hasOpSize ? 4 : 2); + insn->addressSize = (hasAdSize ? 4 : 2); + insn->displacementSize = (hasAdSize ? 4 : 2); + insn->immediateSize = (hasOpSize ? 4 : 2); + } else if (insn->mode == MODE_32BIT) { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 2 : 4); + insn->displacementSize = (hasAdSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 2 : 4); + } else if (insn->mode == MODE_64BIT) { + if (insn->rexPrefix && wFromREX(insn->rexPrefix)) { + insn->registerSize = 8; + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = 4; + insn->immediateSize = 4; + } else if (insn->rexPrefix) { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = (hasOpSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 2 : 4); + } else { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = (hasOpSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 2 : 4); + } + } + + return 0; +} + +/* + * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of + * extended or escape opcodes). + * + * @param insn - The instruction whose opcode is to be read. + * @return - 0 if the opcode could be read successfully; nonzero otherwise. 
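+ *
+ * Worked example (editorial illustration, not part of the original patch):
+ * the byte sequence 0f ae ... yields opcodeType TWOBYTE with opcode 0xae,
+ * while 0f 38 f0 ... consumes both escape bytes and yields THREEBYTE_38
+ * with opcode 0xf0; a lone 90 stays ONEBYTE with opcode 0x90.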
+ */ +static int readOpcode(struct InternalInstruction* insn) { + /* Determine the length of the primary opcode */ + + uint8_t current; + + dbgprintf(insn, "readOpcode()"); + + insn->opcodeType = ONEBYTE; + + if (insn->vectorExtensionType == TYPE_EVEX) + { + switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) { + default: + dbgprintf(insn, "Unhandled mm field for instruction (0x%hhx)", + mmFromEVEX2of4(insn->vectorExtensionPrefix[1])); + return -1; + case VEX_LOB_0F: + insn->opcodeType = TWOBYTE; + return consumeByte(insn, &insn->opcode); + case VEX_LOB_0F38: + insn->opcodeType = THREEBYTE_38; + return consumeByte(insn, &insn->opcode); + case VEX_LOB_0F3A: + insn->opcodeType = THREEBYTE_3A; + return consumeByte(insn, &insn->opcode); + } + } + else if (insn->vectorExtensionType == TYPE_VEX_3B) { + switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) { + default: + dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", + mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])); + return -1; + case VEX_LOB_0F: + insn->opcodeType = TWOBYTE; + return consumeByte(insn, &insn->opcode); + case VEX_LOB_0F38: + insn->opcodeType = THREEBYTE_38; + return consumeByte(insn, &insn->opcode); + case VEX_LOB_0F3A: + insn->opcodeType = THREEBYTE_3A; + return consumeByte(insn, &insn->opcode); + } + } + else if (insn->vectorExtensionType == TYPE_VEX_2B) { + insn->opcodeType = TWOBYTE; + return consumeByte(insn, &insn->opcode); + } + else if (insn->vectorExtensionType == TYPE_XOP) { + switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) { + default: + dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", + mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])); + return -1; + case XOP_MAP_SELECT_8: + insn->opcodeType = XOP8_MAP; + return consumeByte(insn, &insn->opcode); + case XOP_MAP_SELECT_9: + insn->opcodeType = XOP9_MAP; + return consumeByte(insn, &insn->opcode); + case XOP_MAP_SELECT_A: + insn->opcodeType = XOPA_MAP; + return consumeByte(insn, &insn->opcode); + } + } + + if (consumeByte(insn, &current)) + return -1; + + if (current == 0x0f) { + dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current); + + if (consumeByte(insn, &current)) + return -1; + + if (current == 0x38) { + dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); + + if (consumeByte(insn, &current)) + return -1; + + insn->opcodeType = THREEBYTE_38; + } else if (current == 0x3a) { + dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); + + if (consumeByte(insn, &current)) + return -1; + + insn->opcodeType = THREEBYTE_3A; + } else { + dbgprintf(insn, "Didn't find a three-byte escape prefix"); + + insn->opcodeType = TWOBYTE; + } + } + + /* + * At this point we have consumed the full opcode. + * Anything we consume from here on must be unconsumed. + */ + + insn->opcode = current; + + return 0; +} + +static int readModRM(struct InternalInstruction* insn); + +/* + * getIDWithAttrMask - Determines the ID of an instruction, consuming + * the ModR/M byte as appropriate for extended and escape opcodes, + * and using a supplied attribute mask. + * + * @param instructionID - A pointer whose target is filled in with the ID of the + * instruction. + * @param insn - The instruction whose ID is to be determined. + * @param attrMask - The attribute mask to search. + * @return - 0 if the ModR/M could be read when needed or was not + * needed; nonzero otherwise. 
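+ *
+ * Sketch of the lookup (editorial; the exact table layout is generated by
+ * tblgen and is an assumption here): contextForAttrs(attrMask) selects an
+ * InstructionContext, modRMRequired() says whether the ModR/M byte takes
+ * part in the decision, and decode() then indexes the generated tables,
+ * roughly tables[opcodeType][context][opcode][modRM], to produce the ID.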
+ */ +static int getIDWithAttrMask(uint16_t* instructionID, + struct InternalInstruction* insn, + uint16_t attrMask) { + bool hasModRMExtension; + + InstructionContext instructionClass = contextForAttrs(attrMask); + + hasModRMExtension = modRMRequired(insn->opcodeType, + instructionClass, + insn->opcode); + + if (hasModRMExtension) { + if (readModRM(insn)) + return -1; + + *instructionID = decode(insn->opcodeType, + instructionClass, + insn->opcode, + insn->modRM); + } else { + *instructionID = decode(insn->opcodeType, + instructionClass, + insn->opcode, + 0); + } + + return 0; +} + +/* + * is16BitEquivalent - Determines whether two instruction names refer to + * equivalent instructions but one is 16-bit whereas the other is not. + * + * @param orig - The instruction that is not 16-bit + * @param equiv - The instruction that is 16-bit + */ +static bool is16BitEquivalent(const char* orig, const char* equiv) { + off_t i; + + for (i = 0;; i++) { + if (orig[i] == '\0' && equiv[i] == '\0') + return true; + if (orig[i] == '\0' || equiv[i] == '\0') + return false; + if (orig[i] != equiv[i]) { + if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W') + continue; + if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1') + continue; + if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6') + continue; + return false; + } + } +} + +/* + * getID - Determines the ID of an instruction, consuming the ModR/M byte as + * appropriate for extended and escape opcodes. Determines the attributes and + * context for the instruction before doing so. + * + * @param insn - The instruction whose ID is to be determined. + * @return - 0 if the ModR/M could be read when needed or was not needed; + * nonzero otherwise. + */ +static int getID(struct InternalInstruction* insn, const void *miiArg) { + uint16_t attrMask; + uint16_t instructionID; + + dbgprintf(insn, "getID()"); + + attrMask = ATTR_NONE; + + if (insn->mode == MODE_64BIT) + attrMask |= ATTR_64BIT; + + if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) { + attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? 
ATTR_EVEX : ATTR_VEX; + + if (insn->vectorExtensionType == TYPE_EVEX) { + switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (zFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXKZ; + if (bFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXB; + if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXK; + if (lFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXL; + if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXL2; + } + else if (insn->vectorExtensionType == TYPE_VEX_3B) { + switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX3of3(insn->vectorExtensionPrefix[2])) + attrMask |= ATTR_VEXL; + } + else if (insn->vectorExtensionType == TYPE_VEX_2B) { + switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX2of2(insn->vectorExtensionPrefix[1])) + attrMask |= ATTR_VEXL; + } + else if (insn->vectorExtensionType == TYPE_XOP) { + switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromXOP3of3(insn->vectorExtensionPrefix[2])) + attrMask |= ATTR_VEXL; + } + else { + return -1; + } + } + else { + if (insn->mode != MODE_16BIT && isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) + attrMask |= ATTR_OPSIZE; + else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation)) + attrMask |= ATTR_ADSIZE; + else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) + attrMask |= ATTR_XS; + else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) + attrMask |= ATTR_XD; + } + + if (insn->rexPrefix & 0x08) + attrMask |= ATTR_REXW; + + if (getIDWithAttrMask(&instructionID, insn, attrMask)) + return -1; + + /* + * JCXZ/JECXZ need special handling for 16-bit mode because the meaning + * of the AdSize prefix is inverted w.r.t. 32-bit mode. + */ + if (insn->mode == MODE_16BIT && insn->opcode == 0xE3) { + const struct InstructionSpecifier *spec; + spec = specifierForUID(instructionID); + + /* + * Check for Ii8PCRel instructions. We could alternatively do a + * string-compare on the names, but this is probably cheaper. + */ + if (x86OperandSets[spec->operands][0].type == TYPE_REL8) { + attrMask ^= ATTR_ADSIZE; + if (getIDWithAttrMask(&instructionID, insn, attrMask)) + return -1; + } + } + + /* The following clauses compensate for limitations of the tables. */ + + if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) && + !(attrMask & ATTR_OPSIZE)) { + /* + * The instruction tables make no distinction between instructions that + * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a + * particular spot (i.e., many MMX operations). In general we're + * conservative, but in the specific case where OpSize is present but not + * in the right place we check if there's a 16-bit operation. 
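+ *
+ * Example (editorial illustration; the instruction names are assumed): in
+ * 32-bit mode, 66 01 c8 is a 16-bit ADD. The plain lookup may yield a name
+ * like ADD32rr while the OpSize lookup yields ADD16rr, and
+ * is16BitEquivalent() accepts the pair because its character rules pair
+ * '3' with '1' and '2' with '6', i.e. the "32" and "16" in the names.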
+ */ + + const struct InstructionSpecifier *spec; + uint16_t instructionIDWithOpsize; + const char *specName, *specWithOpSizeName; + + spec = specifierForUID(instructionID); + + if (getIDWithAttrMask(&instructionIDWithOpsize, + insn, + attrMask | ATTR_OPSIZE)) { + /* + * ModRM required with OpSize but not present; give up and return version + * without OpSize set + */ + + insn->instructionID = instructionID; + insn->spec = spec; + return 0; + } + + specName = GetInstrName(instructionID, miiArg); + specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg); + + if (is16BitEquivalent(specName, specWithOpSizeName) && + (insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) { + insn->instructionID = instructionIDWithOpsize; + insn->spec = specifierForUID(instructionIDWithOpsize); + } else { + insn->instructionID = instructionID; + insn->spec = spec; + } + return 0; + } + + if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 && + insn->rexPrefix & 0x01) { + /* + * NOOP shouldn't decode as NOOP if REX.b is set. Instead + * it should decode as XCHG %r8, %eax. + */ + + const struct InstructionSpecifier *spec; + uint16_t instructionIDWithNewOpcode; + const struct InstructionSpecifier *specWithNewOpcode; + + spec = specifierForUID(instructionID); + + /* Borrow opcode from one of the other XCHGar opcodes */ + insn->opcode = 0x91; + + if (getIDWithAttrMask(&instructionIDWithNewOpcode, + insn, + attrMask)) { + insn->opcode = 0x90; + + insn->instructionID = instructionID; + insn->spec = spec; + return 0; + } + + specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode); + + /* Change back */ + insn->opcode = 0x90; + + insn->instructionID = instructionIDWithNewOpcode; + insn->spec = specWithNewOpcode; + + return 0; + } + + insn->instructionID = instructionID; + insn->spec = specifierForUID(insn->instructionID); + + return 0; +} + +/* + * readSIB - Consumes the SIB byte to determine addressing information for an + * instruction. + * + * @param insn - The instruction whose SIB byte is to be read. + * @return - 0 if the SIB byte was successfully read; nonzero otherwise. 
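+ *
+ * Worked example (editorial illustration): with 32-bit addressing, SIB
+ * byte 0x4c has scale field 1 (factor 2), index field 1 (ECX) and base
+ * field 4 (ESP), i.e. the effective address [esp + ecx*2]. Base fields
+ * 0x5/0xd are special-cased on the ModR/M mod bits, using a displacement
+ * with no base register when mod == 0.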
+ */ +static int readSIB(struct InternalInstruction* insn) { + SIBIndex sibIndexBase = SIB_INDEX_NONE; + SIBBase sibBaseBase = SIB_BASE_NONE; + uint8_t index, base; + + dbgprintf(insn, "readSIB()"); + + if (insn->consumedSIB) + return 0; + + insn->consumedSIB = true; + + switch (insn->addressSize) { + case 2: + dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode"); + return -1; + case 4: + sibIndexBase = SIB_INDEX_EAX; + sibBaseBase = SIB_BASE_EAX; + break; + case 8: + sibIndexBase = SIB_INDEX_RAX; + sibBaseBase = SIB_BASE_RAX; + break; + } + + if (consumeByte(insn, &insn->sib)) + return -1; + + index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3); + if (insn->vectorExtensionType == TYPE_EVEX) + index |= v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4; + + switch (index) { + case 0x4: + insn->sibIndex = SIB_INDEX_NONE; + break; + default: + insn->sibIndex = (SIBIndex)(sibIndexBase + index); + if (insn->sibIndex == SIB_INDEX_sib || + insn->sibIndex == SIB_INDEX_sib64) + insn->sibIndex = SIB_INDEX_NONE; + break; + } + + switch (scaleFromSIB(insn->sib)) { + case 0: + insn->sibScale = 1; + break; + case 1: + insn->sibScale = 2; + break; + case 2: + insn->sibScale = 4; + break; + case 3: + insn->sibScale = 8; + break; + } + + base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3); + + switch (base) { + case 0x5: + case 0xd: + switch (modFromModRM(insn->modRM)) { + case 0x0: + insn->eaDisplacement = EA_DISP_32; + insn->sibBase = SIB_BASE_NONE; + break; + case 0x1: + insn->eaDisplacement = EA_DISP_8; + insn->sibBase = (SIBBase)(sibBaseBase + base); + break; + case 0x2: + insn->eaDisplacement = EA_DISP_32; + insn->sibBase = (SIBBase)(sibBaseBase + base); + break; + case 0x3: + debug("Cannot have Mod = 0b11 and a SIB byte"); + return -1; + } + break; + default: + insn->sibBase = (SIBBase)(sibBaseBase + base); + break; + } + + return 0; +} + +/* + * readDisplacement - Consumes the displacement of an instruction. + * + * @param insn - The instruction whose displacement is to be read. + * @return - 0 if the displacement byte was successfully read; nonzero + * otherwise. + */ +static int readDisplacement(struct InternalInstruction* insn) { + int8_t d8; + int16_t d16; + int32_t d32; + + dbgprintf(insn, "readDisplacement()"); + + if (insn->consumedDisplacement) + return 0; + + insn->consumedDisplacement = true; + insn->displacementOffset = insn->readerCursor - insn->startLocation; + + switch (insn->eaDisplacement) { + case EA_DISP_NONE: + insn->consumedDisplacement = false; + break; + case EA_DISP_8: + if (consumeInt8(insn, &d8)) + return -1; + insn->displacement = d8; + break; + case EA_DISP_16: + if (consumeInt16(insn, &d16)) + return -1; + insn->displacement = d16; + break; + case EA_DISP_32: + if (consumeInt32(insn, &d32)) + return -1; + insn->displacement = d32; + break; + } + + insn->consumedDisplacement = true; + return 0; +} + +/* + * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and + * displacement) for an instruction and interprets it. + * + * @param insn - The instruction whose addressing information is to be read. + * @return - 0 if the information was successfully read; nonzero otherwise. 
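+ *
+ * Worked example (editorial illustration): ModR/M 0x54 splits into
+ * mod = 01, reg = 010, rm = 100. With 32-bit addressing, rm = 4 means a
+ * SIB byte follows, and mod = 01 requests an 8-bit displacement, so both
+ * readSIB() and readDisplacement() run for this instruction.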
+ */ +static int readModRM(struct InternalInstruction* insn) { + uint8_t mod, rm, reg; + + dbgprintf(insn, "readModRM()"); + + if (insn->consumedModRM) + return 0; + + if (consumeByte(insn, &insn->modRM)) + return -1; + insn->consumedModRM = true; + + mod = modFromModRM(insn->modRM); + rm = rmFromModRM(insn->modRM); + reg = regFromModRM(insn->modRM); + + /* + * This goes by insn->registerSize to pick the correct register, which messes + * up if we're using (say) XMM or 8-bit register operands. That gets fixed in + * fixupReg(). + */ + switch (insn->registerSize) { + case 2: + insn->regBase = MODRM_REG_AX; + insn->eaRegBase = EA_REG_AX; + break; + case 4: + insn->regBase = MODRM_REG_EAX; + insn->eaRegBase = EA_REG_EAX; + break; + case 8: + insn->regBase = MODRM_REG_RAX; + insn->eaRegBase = EA_REG_RAX; + break; + } + + reg |= rFromREX(insn->rexPrefix) << 3; + rm |= bFromREX(insn->rexPrefix) << 3; + if (insn->vectorExtensionType == TYPE_EVEX) { + reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4; + rm |= xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4; + } + + insn->reg = (Reg)(insn->regBase + reg); + + switch (insn->addressSize) { + case 2: + insn->eaBaseBase = EA_BASE_BX_SI; + + switch (mod) { + case 0x0: + if (rm == 0x6) { + insn->eaBase = EA_BASE_NONE; + insn->eaDisplacement = EA_DISP_16; + if (readDisplacement(insn)) + return -1; + } else { + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_NONE; + } + break; + case 0x1: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_8; + insn->displacementSize = 1; + if (readDisplacement(insn)) + return -1; + break; + case 0x2: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_16; + if (readDisplacement(insn)) + return -1; + break; + case 0x3: + insn->eaBase = (EABase)(insn->eaRegBase + rm); + if (readDisplacement(insn)) + return -1; + break; + } + break; + case 4: + case 8: + insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX); + + switch (mod) { + case 0x0: + insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */ + switch (rm) { + case 0x14: + case 0x4: + case 0xc: /* in case REXW.b is set */ + insn->eaBase = (insn->addressSize == 4 ? + EA_BASE_sib : EA_BASE_sib64); + if (readSIB(insn) || readDisplacement(insn)) + return -1; + break; + case 0x5: + insn->eaBase = EA_BASE_NONE; + insn->eaDisplacement = EA_DISP_32; + if (readDisplacement(insn)) + return -1; + break; + default: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + break; + } + break; + case 0x1: + insn->displacementSize = 1; + /* FALLTHROUGH */ + case 0x2: + insn->eaDisplacement = (mod == 0x1 ? 
EA_DISP_8 : EA_DISP_32); + switch (rm) { + case 0x14: + case 0x4: + case 0xc: /* in case REXW.b is set */ + insn->eaBase = EA_BASE_sib; + if (readSIB(insn) || readDisplacement(insn)) + return -1; + break; + default: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + if (readDisplacement(insn)) + return -1; + break; + } + break; + case 0x3: + insn->eaDisplacement = EA_DISP_NONE; + insn->eaBase = (EABase)(insn->eaRegBase + rm); + break; + } + break; + } /* switch (insn->addressSize) */ + + return 0; +} + +#define GENERIC_FIXUP_FUNC(name, base, prefix) \ + static uint8_t name(struct InternalInstruction *insn, \ + OperandType type, \ + uint8_t index, \ + uint8_t *valid) { \ + *valid = 1; \ + switch (type) { \ + default: \ + debug("Unhandled register type"); \ + *valid = 0; \ + return 0; \ + case TYPE_Rv: \ + return base + index; \ + case TYPE_R8: \ + if (insn->rexPrefix && \ + index >= 4 && index <= 7) { \ + return prefix##_SPL + (index - 4); \ + } else { \ + return prefix##_AL + index; \ + } \ + case TYPE_R16: \ + return prefix##_AX + index; \ + case TYPE_R32: \ + return prefix##_EAX + index; \ + case TYPE_R64: \ + return prefix##_RAX + index; \ + case TYPE_XMM512: \ + return prefix##_ZMM0 + index; \ + case TYPE_XMM256: \ + return prefix##_YMM0 + index; \ + case TYPE_XMM128: \ + case TYPE_XMM64: \ + case TYPE_XMM32: \ + case TYPE_XMM: \ + return prefix##_XMM0 + index; \ + case TYPE_VK1: \ + case TYPE_VK8: \ + case TYPE_VK16: \ + return prefix##_K0 + index; \ + case TYPE_MM64: \ + case TYPE_MM32: \ + case TYPE_MM: \ + if (index > 7) \ + *valid = 0; \ + return prefix##_MM0 + index; \ + case TYPE_SEGMENTREG: \ + if (index > 5) \ + *valid = 0; \ + return prefix##_ES + index; \ + case TYPE_DEBUGREG: \ + if (index > 7) \ + *valid = 0; \ + return prefix##_DR0 + index; \ + case TYPE_CONTROLREG: \ + if (index > 8) \ + *valid = 0; \ + return prefix##_CR0 + index; \ + } \ + } + +/* + * fixup*Value - Consults an operand type to determine the meaning of the + * reg or R/M field. If the operand is an XMM operand, for example, it should + * be interpreted as XMM0 instead of AX, which is what readModRM() would + * otherwise report. + * + * @param insn - The instruction containing the operand. + * @param type - The operand type. + * @param index - The existing value of the field as reported by readModRM(). + * @param valid - The address of a uint8_t. The target is set to 1 if the + * field is valid for the register class; 0 if not. + * @return - The proper value. + */ +GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG) +GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG) + +/* + * fixupReg - Consults an operand specifier to determine which of the + * fixup*Value functions to use in correcting readModRM()'s interpretation. + * + * @param insn - See fixup*Value(). + * @param op - The operand specifier. + * @return - 0 if fixup was successful; -1 if the register returned was + * invalid for its class. 
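+ *
+ * Example (editorial illustration): for a TYPE_R8 operand with index 4,
+ * the fixup returns SPL when any REX prefix is present but AH when none
+ * is, mirroring how a REX prefix remaps the 8-bit encodings 4-7 from
+ * AH..BH to SPL..DIL in hardware.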
+ */ +static int fixupReg(struct InternalInstruction *insn, + const struct OperandSpecifier *op) { + uint8_t valid; + + dbgprintf(insn, "fixupReg()"); + + switch ((OperandEncoding)op->encoding) { + default: + debug("Expected a REG or R/M encoding in fixupReg"); + return -1; + case ENCODING_VVVV: + insn->vvvv = (Reg)fixupRegValue(insn, + (OperandType)op->type, + insn->vvvv, + &valid); + if (!valid) + return -1; + break; + case ENCODING_REG: + insn->reg = (Reg)fixupRegValue(insn, + (OperandType)op->type, + insn->reg - insn->regBase, + &valid); + if (!valid) + return -1; + break; + case ENCODING_RM: + if (insn->eaBase >= insn->eaRegBase) { + insn->eaBase = (EABase)fixupRMValue(insn, + (OperandType)op->type, + insn->eaBase - insn->eaRegBase, + &valid); + if (!valid) + return -1; + } + break; + } + + return 0; +} + +/* + * readOpcodeRegister - Reads an operand from the opcode field of an + * instruction and interprets it appropriately given the operand width. + * Handles AddRegFrm instructions. + * + * @param insn - the instruction whose opcode field is to be read. + * @param size - The width (in bytes) of the register being specified. + * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means + * RAX. + * @return - 0 on success; nonzero otherwise. + */ +static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) { + dbgprintf(insn, "readOpcodeRegister()"); + + if (size == 0) + size = insn->registerSize; + + switch (size) { + case 1: + insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) + | (insn->opcode & 7))); + if (insn->rexPrefix && + insn->opcodeRegister >= MODRM_REG_AL + 0x4 && + insn->opcodeRegister < MODRM_REG_AL + 0x8) { + insn->opcodeRegister = (Reg)(MODRM_REG_SPL + + (insn->opcodeRegister - MODRM_REG_AL - 4)); + } + + break; + case 2: + insn->opcodeRegister = (Reg)(MODRM_REG_AX + + ((bFromREX(insn->rexPrefix) << 3) + | (insn->opcode & 7))); + break; + case 4: + insn->opcodeRegister = (Reg)(MODRM_REG_EAX + + ((bFromREX(insn->rexPrefix) << 3) + | (insn->opcode & 7))); + break; + case 8: + insn->opcodeRegister = (Reg)(MODRM_REG_RAX + + ((bFromREX(insn->rexPrefix) << 3) + | (insn->opcode & 7))); + break; + } + + return 0; +} + +/* + * readImmediate - Consumes an immediate operand from an instruction, given the + * desired operand size. + * + * @param insn - The instruction whose operand is to be read. + * @param size - The width (in bytes) of the operand. + * @return - 0 if the immediate was successfully consumed; nonzero + * otherwise. 
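+ *
+ * Example (editorial illustration): decoding 68 78 56 34 12 (push imm32)
+ * in 32-bit mode requests a 4-byte immediate here, so insn->immediates[0]
+ * receives 0x12345678 and insn->numImmediatesConsumed becomes 1.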
+ */ +static int readImmediate(struct InternalInstruction* insn, uint8_t size) { + uint8_t imm8; + uint16_t imm16; + uint32_t imm32; + uint64_t imm64; + + dbgprintf(insn, "readImmediate()"); + + if (insn->numImmediatesConsumed == 2) { + debug("Already consumed two immediates"); + return -1; + } + + if (size == 0) + size = insn->immediateSize; + else + insn->immediateSize = size; + insn->immediateOffset = insn->readerCursor - insn->startLocation; + + switch (size) { + case 1: + if (consumeByte(insn, &imm8)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm8; + break; + case 2: + if (consumeUInt16(insn, &imm16)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm16; + break; + case 4: + if (consumeUInt32(insn, &imm32)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm32; + break; + case 8: + if (consumeUInt64(insn, &imm64)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm64; + break; + } + + insn->numImmediatesConsumed++; + + return 0; +} + +/* + * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix. + * + * @param insn - The instruction whose operand is to be read. + * @return - 0 if the vvvv was successfully consumed; nonzero + * otherwise. + */ +static int readVVVV(struct InternalInstruction* insn) { + dbgprintf(insn, "readVVVV()"); + + int vvvv; + if (insn->vectorExtensionType == TYPE_EVEX) + vvvv = vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]); + else if (insn->vectorExtensionType == TYPE_VEX_3B) + vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]); + else if (insn->vectorExtensionType == TYPE_VEX_2B) + vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]); + else if (insn->vectorExtensionType == TYPE_XOP) + vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]); + else + return -1; + + if (insn->mode != MODE_64BIT) + vvvv &= 0x7; + + insn->vvvv = static_cast<Reg>(vvvv); + return 0; +} + +/* + * readMaskRegister - Reads a mask register from the opcode field of an + * instruction. + * + * @param insn - The instruction whose opcode field is to be read. + * @return - 0 on success; nonzero otherwise. + */ +static int readMaskRegister(struct InternalInstruction* insn) { + dbgprintf(insn, "readMaskRegister()"); + + if (insn->vectorExtensionType != TYPE_EVEX) + return -1; + + insn->writemask = + static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3])); + return 0; +} + +/* + * readOperands - Consults the specifier for an instruction and consumes all + * operands for that instruction, interpreting them as it goes. + * + * @param insn - The instruction whose operands are to be read and interpreted. + * @return - 0 if all operands could be read; nonzero otherwise. + */ +static int readOperands(struct InternalInstruction* insn) { + int hasVVVV, needVVVV; + int sawRegImm = 0; + + dbgprintf(insn, "readOperands()"); + + /* If non-zero vvvv specified, need to make sure one of the operands + uses it. 
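+ For example (editorial illustration), a three-operand AVX instruction
+ such as VANDPS encodes its second source register in VEX.vvvv; if vvvv
+ names a register but no ENCODING_VVVV operand consumes it, the needVVVV
+ check at the end of this function rejects the encoding.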
*/ + hasVVVV = !readVVVV(insn); + needVVVV = hasVVVV && (insn->vvvv != 0); + + for (const auto &Op : x86OperandSets[insn->spec->operands]) { + switch (Op.encoding) { + case ENCODING_NONE: + case ENCODING_SI: + case ENCODING_DI: + break; + case ENCODING_REG: + case ENCODING_RM: + if (readModRM(insn)) + return -1; + if (fixupReg(insn, &Op)) + return -1; + break; + case ENCODING_CB: + case ENCODING_CW: + case ENCODING_CD: + case ENCODING_CP: + case ENCODING_CO: + case ENCODING_CT: + dbgprintf(insn, "We currently don't handle code-offset encodings"); + return -1; + case ENCODING_IB: + if (sawRegImm) { + /* Saw a register immediate so don't read again and instead split the + previous immediate. FIXME: This is a hack. */ + insn->immediates[insn->numImmediatesConsumed] = + insn->immediates[insn->numImmediatesConsumed - 1] & 0xf; + ++insn->numImmediatesConsumed; + break; + } + if (readImmediate(insn, 1)) + return -1; + if (Op.type == TYPE_IMM3 && + insn->immediates[insn->numImmediatesConsumed - 1] > 7) + return -1; + if (Op.type == TYPE_IMM5 && + insn->immediates[insn->numImmediatesConsumed - 1] > 31) + return -1; + if (Op.type == TYPE_XMM128 || + Op.type == TYPE_XMM256) + sawRegImm = 1; + break; + case ENCODING_IW: + if (readImmediate(insn, 2)) + return -1; + break; + case ENCODING_ID: + if (readImmediate(insn, 4)) + return -1; + break; + case ENCODING_IO: + if (readImmediate(insn, 8)) + return -1; + break; + case ENCODING_Iv: + if (readImmediate(insn, insn->immediateSize)) + return -1; + break; + case ENCODING_Ia: + if (readImmediate(insn, insn->addressSize)) + return -1; + break; + case ENCODING_RB: + if (readOpcodeRegister(insn, 1)) + return -1; + break; + case ENCODING_RW: + if (readOpcodeRegister(insn, 2)) + return -1; + break; + case ENCODING_RD: + if (readOpcodeRegister(insn, 4)) + return -1; + break; + case ENCODING_RO: + if (readOpcodeRegister(insn, 8)) + return -1; + break; + case ENCODING_Rv: + if (readOpcodeRegister(insn, 0)) + return -1; + break; + case ENCODING_FP: + break; + case ENCODING_VVVV: + needVVVV = 0; /* Mark that we have found a VVVV operand. */ + if (!hasVVVV) + return -1; + if (fixupReg(insn, &Op)) + return -1; + break; + case ENCODING_WRITEMASK: + if (readMaskRegister(insn)) + return -1; + break; + case ENCODING_DUP: + break; + default: + dbgprintf(insn, "Encountered an operand with an unknown encoding."); + return -1; + } + } + + /* If we didn't find an ENCODING_VVVV operand, but non-zero vvvv present, fail */ + if (needVVVV) return -1; + + return 0; +} + +/* + * decodeInstruction - Reads and interprets a full instruction provided by the + * user. + * + * @param insn - A pointer to the instruction to be populated. Must be + * pre-allocated. + * @param reader - The function to be used to read the instruction's bytes. + * @param readerArg - A generic argument to be passed to the reader to store + * any internal state. + * @param logger - If non-NULL, the function to be used to write log messages + * and warnings. + * @param loggerArg - A generic argument to be passed to the logger to store + * any internal state. + * @param startLoc - The address (in the reader's address space) of the first + * byte in the instruction. + * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to + * decode the instruction in. + * @return - 0 if the instruction's memory could be read; nonzero if + * not. 
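+ *
+ * Minimal usage sketch (editorial; Buf and bufferReader are hypothetical
+ * caller-side helpers, not LLVM API, and mii stands for whatever the
+ * caller passes through to GetInstrName):
+ *
+ *   struct Buf { const uint8_t *bytes; uint64_t base; uint64_t size; };
+ *   static int bufferReader(const void *arg, uint8_t *byte, uint64_t addr) {
+ *     const struct Buf *b = (const struct Buf *)arg;
+ *     if (addr - b->base >= b->size)
+ *       return -1;                        /* out of bounds */
+ *     *byte = b->bytes[addr - b->base];
+ *     return 0;
+ *   }
+ *
+ *   struct InternalInstruction insn;
+ *   if (!decodeInstruction(&insn, bufferReader, &buf, nullptr, nullptr,
+ *                          mii, buf.base, MODE_64BIT))
+ *     ; /* insn.instructionID, insn.operands, insn.length are now valid */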
+ */ +int llvm::X86Disassembler::decodeInstruction( + struct InternalInstruction *insn, byteReader_t reader, + const void *readerArg, dlog_t logger, void *loggerArg, const void *miiArg, + uint64_t startLoc, DisassemblerMode mode) { + memset(insn, 0, sizeof(struct InternalInstruction)); + + insn->reader = reader; + insn->readerArg = readerArg; + insn->dlog = logger; + insn->dlogArg = loggerArg; + insn->startLocation = startLoc; + insn->readerCursor = startLoc; + insn->mode = mode; + insn->numImmediatesConsumed = 0; + + if (readPrefixes(insn) || + readOpcode(insn) || + getID(insn, miiArg) || + insn->instructionID == 0 || + readOperands(insn)) + return -1; + + insn->operands = x86OperandSets[insn->spec->operands]; + + insn->length = insn->readerCursor - insn->startLocation; + + dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu", + startLoc, insn->readerCursor, insn->length); + + if (insn->length > 15) + dbgprintf(insn, "Instruction exceeds 15-byte limit"); + + return 0; +} diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index ac3b39d..8c45402 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -1,39 +1,28 @@ -/*===-- X86DisassemblerDecoderInternal.h - Disassembler decoder ---*- C -*-===* - * - * The LLVM Compiler Infrastructure - * - * This file is distributed under the University of Illinois Open Source - * License. See LICENSE.TXT for details. - * - *===----------------------------------------------------------------------===* - * - * This file is part of the X86 Disassembler. - * It contains the public interface of the instruction decoder. - * Documentation for the disassembler can be found in X86Disassembler.h. - * - *===----------------------------------------------------------------------===*/ +//===-- X86DisassemblerDecoderInternal.h - Disassembler decoder -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the X86 Disassembler. +// It contains the public interface of the instruction decoder. +// Documentation for the disassembler can be found in X86Disassembler.h. +// +//===----------------------------------------------------------------------===// #ifndef X86DISASSEMBLERDECODER_H #define X86DISASSEMBLERDECODER_H -#ifdef __cplusplus -extern "C" { -#endif - -#define INSTRUCTION_SPECIFIER_FIELDS \ - uint16_t operands; - -#define INSTRUCTION_IDS \ - uint16_t instructionIDs; - #include "X86DisassemblerDecoderCommon.h" +#include "llvm/ADT/ArrayRef.h" -#undef INSTRUCTION_SPECIFIER_FIELDS -#undef INSTRUCTION_IDS +namespace llvm { +namespace X86Disassembler { -/* - * Accessor functions for various fields of an Intel instruction - */ +// Accessor functions for various fields of an Intel instruction #define modFromModRM(modRM) (((modRM) & 0xc0) >> 6) #define regFromModRM(modRM) (((modRM) & 0x38) >> 3) #define rmFromModRM(modRM) ((modRM) & 0x7) @@ -83,10 +72,7 @@ extern "C" { #define lFromXOP3of3(xop) (((xop) & 0x4) >> 2) #define ppFromXOP3of3(xop) ((xop) & 0x3) -/* - * These enums represent Intel registers for use by the decoder. - */ - +// These enums represent Intel registers for use by the decoder. 
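+//
+// (Editorial note) These register lists are X-macros: each consumer defines
+// ENTRY before expanding them. For instance, the Reg enum further down uses
+//   #define ENTRY(x) MODRM_REG_##x,
+// so REGS_8BIT expands to MODRM_REG_AL, MODRM_REG_CL, and so on.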
#define REGS_8BIT \ ENTRY(AL) \ ENTRY(CL) \ @@ -392,13 +378,11 @@ extern "C" { REGS_CONTROL \ ENTRY(RIP) -/* - * EABase - All possible values of the base field for effective-address - * computations, a.k.a. the Mod and R/M fields of the ModR/M byte. We - * distinguish between bases (EA_BASE_*) and registers that just happen to be - * referred to when Mod == 0b11 (EA_REG_*). - */ -typedef enum { +/// \brief All possible values of the base field for effective-address +/// computations, a.k.a. the Mod and R/M fields of the ModR/M byte. +/// We distinguish between bases (EA_BASE_*) and registers that just happen +/// to be referred to when Mod == 0b11 (EA_REG_*). +enum EABase { EA_BASE_NONE, #define ENTRY(x) EA_BASE_##x, ALL_EA_BASES @@ -407,15 +391,13 @@ typedef enum { ALL_REGS #undef ENTRY EA_max -} EABase; - -/* - * SIBIndex - All possible values of the SIB index field. - * Borrows entries from ALL_EA_BASES with the special case that - * sib is synonymous with NONE. - * Vector SIB: index can be XMM or YMM. - */ -typedef enum { +}; + +/// \brief All possible values of the SIB index field. +/// borrows entries from ALL_EA_BASES with the special case that +/// sib is synonymous with NONE. +/// Vector SIB: index can be XMM or YMM. +enum SIBIndex { SIB_INDEX_NONE, #define ENTRY(x) SIB_INDEX_##x, ALL_EA_BASES @@ -424,23 +406,18 @@ typedef enum { REGS_ZMM #undef ENTRY SIB_INDEX_max -} SIBIndex; +}; -/* - * SIBBase - All possible values of the SIB base field. - */ -typedef enum { +/// \brief All possible values of the SIB base field. +enum SIBBase { SIB_BASE_NONE, #define ENTRY(x) SIB_BASE_##x, ALL_SIB_BASES #undef ENTRY SIB_BASE_max -} SIBBase; +}; -/* - * EADisplacement - Possible displacement types for effective-address - * computations. - */ +/// \brief Possible displacement types for effective-address computations. typedef enum { EA_DISP_NONE, EA_DISP_8, @@ -448,20 +425,16 @@ typedef enum { EA_DISP_32 } EADisplacement; -/* - * Reg - All possible values of the reg field in the ModR/M byte. - */ -typedef enum { +/// \brief All possible values of the reg field in the ModR/M byte. +enum Reg { #define ENTRY(x) MODRM_REG_##x, ALL_REGS #undef ENTRY MODRM_REG_max -} Reg; +}; -/* - * SegmentOverride - All possible segment overrides. - */ -typedef enum { +/// \brief All possible segment overrides. +enum SegmentOverride { SEG_OVERRIDE_NONE, SEG_OVERRIDE_CS, SEG_OVERRIDE_SS, @@ -470,235 +443,220 @@ typedef enum { SEG_OVERRIDE_FS, SEG_OVERRIDE_GS, SEG_OVERRIDE_max -} SegmentOverride; - -/* - * VEXLeadingOpcodeByte - Possible values for the VEX.m-mmmm field - */ +}; -typedef enum { +/// \brief Possible values for the VEX.m-mmmm field +enum VEXLeadingOpcodeByte { VEX_LOB_0F = 0x1, VEX_LOB_0F38 = 0x2, VEX_LOB_0F3A = 0x3 -} VEXLeadingOpcodeByte; +}; -typedef enum { +enum XOPMapSelect { XOP_MAP_SELECT_8 = 0x8, XOP_MAP_SELECT_9 = 0x9, XOP_MAP_SELECT_A = 0xA -} XOPMapSelect; - -/* - * VEXPrefixCode - Possible values for the VEX.pp/EVEX.pp field - */ +}; -typedef enum { +/// \brief Possible values for the VEX.pp/EVEX.pp field +enum VEXPrefixCode { VEX_PREFIX_NONE = 0x0, VEX_PREFIX_66 = 0x1, VEX_PREFIX_F3 = 0x2, VEX_PREFIX_F2 = 0x3 -} VEXPrefixCode; +}; -typedef enum { +enum VectorExtensionType { TYPE_NO_VEX_XOP = 0x0, TYPE_VEX_2B = 0x1, TYPE_VEX_3B = 0x2, TYPE_EVEX = 0x3, TYPE_XOP = 0x4 -} VectorExtensionType; - -typedef uint8_t BOOL; - -/* - * byteReader_t - Type for the byte reader that the consumer must provide to - * the decoder. Reads a single byte from the instruction's address space. 
- * @param arg - A baton that the consumer can associate with any internal - * state that it needs. - * @param byte - A pointer to a single byte in memory that should be set to - * contain the value at address. - * @param address - The address in the instruction's address space that should - * be read from. - * @return - -1 if the byte cannot be read for any reason; 0 otherwise. - */ -typedef int (*byteReader_t)(const void* arg, uint8_t* byte, uint64_t address); - -/* - * dlog_t - Type for the logging function that the consumer can provide to - * get debugging output from the decoder. - * @param arg - A baton that the consumer can associate with any internal - * state that it needs. - * @param log - A string that contains the message. Will be reused after - * the logger returns. - */ -typedef void (*dlog_t)(void* arg, const char *log); - -/* - * The x86 internal instruction, which is produced by the decoder. - */ +}; + +/// \brief Type for the byte reader that the consumer must provide to +/// the decoder. Reads a single byte from the instruction's address space. +/// \param arg A baton that the consumer can associate with any internal +/// state that it needs. +/// \param byte A pointer to a single byte in memory that should be set to +/// contain the value at address. +/// \param address The address in the instruction's address space that should +/// be read from. +/// \return -1 if the byte cannot be read for any reason; 0 otherwise. +typedef int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address); + +/// \brief Type for the logging function that the consumer can provide to +/// get debugging output from the decoder. +/// \param arg A baton that the consumer can associate with any internal +/// state that it needs. +/// \param log A string that contains the message. Will be reused after +/// the logger returns. +typedef void (*dlog_t)(void *arg, const char *log); + +/// The specification for how to extract and interpret a full instruction and +/// its operands. +struct InstructionSpecifier { + uint16_t operands; +}; + +/// The x86 internal instruction, which is produced by the decoder. 
struct InternalInstruction { - /* Reader interface (C) */ + // Reader interface (C) byteReader_t reader; - /* Opaque value passed to the reader */ + // Opaque value passed to the reader const void* readerArg; - /* The address of the next byte to read via the reader */ + // The address of the next byte to read via the reader uint64_t readerCursor; - /* Logger interface (C) */ + // Logger interface (C) dlog_t dlog; - /* Opaque value passed to the logger */ + // Opaque value passed to the logger void* dlogArg; - /* General instruction information */ + // General instruction information - /* The mode to disassemble for (64-bit, protected, real) */ + // The mode to disassemble for (64-bit, protected, real) DisassemblerMode mode; - /* The start of the instruction, usable with the reader */ + // The start of the instruction, usable with the reader uint64_t startLocation; - /* The length of the instruction, in bytes */ + // The length of the instruction, in bytes size_t length; - /* Prefix state */ + // Prefix state - /* 1 if the prefix byte corresponding to the entry is present; 0 if not */ + // 1 if the prefix byte corresponding to the entry is present; 0 if not uint8_t prefixPresent[0x100]; - /* contains the location (for use with the reader) of the prefix byte */ + // contains the location (for use with the reader) of the prefix byte uint64_t prefixLocations[0x100]; - /* The value of the vector extension prefix(EVEX/VEX/XOP), if present */ + // The value of the vector extension prefix(EVEX/VEX/XOP), if present uint8_t vectorExtensionPrefix[4]; - /* The type of the vector extension prefix */ + // The type of the vector extension prefix VectorExtensionType vectorExtensionType; - /* The value of the REX prefix, if present */ + // The value of the REX prefix, if present uint8_t rexPrefix; - /* The location where a mandatory prefix would have to be (i.e., right before - the opcode, or right before the REX prefix if one is present) */ + // The location where a mandatory prefix would have to be (i.e., right before + // the opcode, or right before the REX prefix if one is present). uint64_t necessaryPrefixLocation; - /* The segment override type */ + // The segment override type SegmentOverride segmentOverride; - /* 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease */ - BOOL xAcquireRelease; + // 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease + bool xAcquireRelease; - /* Sizes of various critical pieces of data, in bytes */ + // Sizes of various critical pieces of data, in bytes uint8_t registerSize; uint8_t addressSize; uint8_t displacementSize; uint8_t immediateSize; - /* Offsets from the start of the instruction to the pieces of data, which is - needed to find relocation entries for adding symbolic operands */ + // Offsets from the start of the instruction to the pieces of data, which is + // needed to find relocation entries for adding symbolic operands. 
uint8_t displacementOffset; uint8_t immediateOffset; - /* opcode state */ + // opcode state - /* The last byte of the opcode, not counting any ModR/M extension */ + // The last byte of the opcode, not counting any ModR/M extension uint8_t opcode; - /* The ModR/M byte of the instruction, if it is an opcode extension */ + // The ModR/M byte of the instruction, if it is an opcode extension uint8_t modRMExtension; - /* decode state */ + // decode state - /* The type of opcode, used for indexing into the array of decode tables */ + // The type of opcode, used for indexing into the array of decode tables OpcodeType opcodeType; - /* The instruction ID, extracted from the decode table */ + // The instruction ID, extracted from the decode table uint16_t instructionID; - /* The specifier for the instruction, from the instruction info table */ - const struct InstructionSpecifier *spec; + // The specifier for the instruction, from the instruction info table + const InstructionSpecifier *spec; - /* state for additional bytes, consumed during operand decode. Pattern: - consumed___ indicates that the byte was already consumed and does not - need to be consumed again */ + // state for additional bytes, consumed during operand decode. Pattern: + // consumed___ indicates that the byte was already consumed and does not + // need to be consumed again. - /* The VEX.vvvv field, which contains a third register operand for some AVX - instructions */ + // The VEX.vvvv field, which contains a third register operand for some AVX + // instructions. Reg vvvv; - /* The writemask for AVX-512 instructions which is contained in EVEX.aaa */ + // The writemask for AVX-512 instructions which is contained in EVEX.aaa Reg writemask; - /* The ModR/M byte, which contains most register operands and some portion of - all memory operands */ - BOOL consumedModRM; + // The ModR/M byte, which contains most register operands and some portion of + // all memory operands. + bool consumedModRM; uint8_t modRM; - /* The SIB byte, used for more complex 32- or 64-bit memory operands */ - BOOL consumedSIB; + // The SIB byte, used for more complex 32- or 64-bit memory operands + bool consumedSIB; uint8_t sib; - /* The displacement, used for memory operands */ - BOOL consumedDisplacement; + // The displacement, used for memory operands + bool consumedDisplacement; int32_t displacement; - /* Immediates. There can be two in some cases */ + // Immediates. There can be two in some cases uint8_t numImmediatesConsumed; uint8_t numImmediatesTranslated; uint64_t immediates[2]; - /* A register or immediate operand encoded into the opcode */ + // A register or immediate operand encoded into the opcode Reg opcodeRegister; - /* Portions of the ModR/M byte */ + // Portions of the ModR/M byte - /* These fields determine the allowable values for the ModR/M fields, which - depend on operand and address widths */ + // These fields determine the allowable values for the ModR/M fields, which + // depend on operand and address widths. EABase eaBaseBase; EABase eaRegBase; Reg regBase; - /* The Mod and R/M fields can encode a base for an effective address, or a - register. These are separated into two fields here */ + // The Mod and R/M fields can encode a base for an effective address, or a + // register. These are separated into two fields here. 
EABase eaBase; EADisplacement eaDisplacement; - /* The reg field always encodes a register */ + // The reg field always encodes a register Reg reg; - /* SIB state */ + // SIB state SIBIndex sibIndex; uint8_t sibScale; SIBBase sibBase; - const struct OperandSpecifier *operands; + ArrayRef<const OperandSpecifier> operands; }; -/* decodeInstruction - Decode one instruction and store the decoding results in - * a buffer provided by the consumer. - * @param insn - The buffer to store the instruction in. Allocated by the - * consumer. - * @param reader - The byteReader_t for the bytes to be read. - * @param readerArg - An argument to pass to the reader for storing context - * specific to the consumer. May be NULL. - * @param logger - The dlog_t to be used in printing status messages from the - * disassembler. May be NULL. - * @param loggerArg - An argument to pass to the logger for storing context - * specific to the logger. May be NULL. - * @param startLoc - The address (in the reader's address space) of the first - * byte in the instruction. - * @param mode - The mode (16-bit, 32-bit, 64-bit) to decode in. - * @return - Nonzero if there was an error during decode, 0 otherwise. - */ -int decodeInstruction(struct InternalInstruction* insn, +/// \brief Decode one instruction and store the decoding results in +/// a buffer provided by the consumer. +/// \param insn The buffer to store the instruction in. Allocated by the +/// consumer. +/// \param reader The byteReader_t for the bytes to be read. +/// \param readerArg An argument to pass to the reader for storing context +/// specific to the consumer. May be NULL. +/// \param logger The dlog_t to be used in printing status messages from the +/// disassembler. May be NULL. +/// \param loggerArg An argument to pass to the logger for storing context +/// specific to the logger. May be NULL. +/// \param startLoc The address (in the reader's address space) of the first +/// byte in the instruction. +/// \param mode The mode (16-bit, 32-bit, 64-bit) to decode in. +/// \return Nonzero if there was an error during decode, 0 otherwise. +int decodeInstruction(InternalInstruction *insn, byteReader_t reader, - const void* readerArg, + const void *readerArg, dlog_t logger, - void* loggerArg, - const void* miiArg, + void *loggerArg, + const void *miiArg, uint64_t startLoc, DisassemblerMode mode); -/* x86DisassemblerDebug - C-accessible function for printing a message to - * debugs() - * @param file - The name of the file printing the debug message. - * @param line - The line number that printed the debug message. - * @param s - The message to print. - */ +/// \brief Print a message to debugs() +/// \param file The name of the file printing the debug message. +/// \param line The line number that printed the debug message. +/// \param s The message to print. 
+void Debug(const char *file, unsigned line, const char *s); -void x86DisassemblerDebug(const char *file, - unsigned line, - const char *s); +const char *GetInstrName(unsigned Opcode, const void *mii); -const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii); - -#ifdef __cplusplus -} -#endif +} // namespace X86Disassembler +} // namespace llvm #endif diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index 523ae99..f59e0b6 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -1,29 +1,27 @@ -/*===-- X86DisassemblerDecoderCommon.h - Disassembler decoder -----*- C -*-===* - * - * The LLVM Compiler Infrastructure - * - * This file is distributed under the University of Illinois Open Source - * License. See LICENSE.TXT for details. - * - *===----------------------------------------------------------------------===* - * - * This file is part of the X86 Disassembler. - * It contains common definitions used by both the disassembler and the table - * generator. - * Documentation for the disassembler can be found in X86Disassembler.h. - * - *===----------------------------------------------------------------------===*/ - -/* - * This header file provides those definitions that need to be shared between - * the decoder and the table generator in a C-friendly manner. - */ +//===-- X86DisassemblerDecoderCommon.h - Disassembler decoder ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the X86 Disassembler. +// It contains common definitions used by both the disassembler and the table +// generator. +// Documentation for the disassembler can be found in X86Disassembler.h. +// +//===----------------------------------------------------------------------===// #ifndef X86DISASSEMBLERDECODERCOMMON_H #define X86DISASSEMBLERDECODERCOMMON_H #include "llvm/Support/DataTypes.h" +namespace llvm { +namespace X86Disassembler { + #define INSTRUCTIONS_SYM x86DisassemblerInstrSpecifiers #define CONTEXTS_SYM x86DisassemblerContexts #define ONEBYTE_SYM x86DisassemblerOneByteOpcodes @@ -44,11 +42,9 @@ #define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes" #define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes" -/* - * Attributes of an instruction that must be known before the opcode can be - * processed correctly. Most of these indicate the presence of particular - * prefixes, but ATTR_64BIT is simply an attribute of the decoding context. - */ +// Attributes of an instruction that must be known before the opcode can be +// processed correctly. Most of these indicate the presence of particular +// prefixes, but ATTR_64BIT is simply an attribute of the decoding context. #define ATTRIBUTE_BITS \ ENUM_ENTRY(ATTR_NONE, 0x00) \ ENUM_ENTRY(ATTR_64BIT, (0x1 << 0)) \ @@ -73,13 +69,11 @@ enum attributeBits { }; #undef ENUM_ENTRY -/* - * Combinations of the above attributes that are relevant to instruction - * decode. Although other combinations are possible, they can be reduced to - * these without affecting the ultimately decoded instruction. - */ +// Combinations of the above attributes that are relevant to instruction +// decode. 
Although other combinations are possible, they can be reduced to +// these without affecting the ultimately decoded instruction. -/* Class name Rank Rationale for rank assignment */ +// Class name Rank Rationale for rank assignment #define INSTRUCTION_CONTEXTS \ ENUM_ENTRY(IC, 0, "says nothing about the instruction") \ ENUM_ENTRY(IC_64BIT, 1, "says the instruction applies in " \ @@ -274,17 +268,15 @@ enum attributeBits { ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize") #define ENUM_ENTRY(n, r, d) n, -typedef enum { +enum InstructionContext { INSTRUCTION_CONTEXTS IC_max -} InstructionContext; +}; #undef ENUM_ENTRY -/* - * Opcode types, which determine which decode table to use, both in the Intel - * manual and also for the decoder. - */ -typedef enum { +// Opcode types, which determine which decode table to use, both in the Intel +// manual and also for the decoder. +enum OpcodeType { ONEBYTE = 0, TWOBYTE = 1, THREEBYTE_38 = 2, @@ -292,39 +284,33 @@ typedef enum { XOP8_MAP = 4, XOP9_MAP = 5, XOPA_MAP = 6 -} OpcodeType; - -/* - * The following structs are used for the hierarchical decode table. After - * determining the instruction's class (i.e., which IC_* constant applies to - * it), the decoder reads the opcode. Some instructions require specific - * values of the ModR/M byte, so the ModR/M byte indexes into the final table. - * - * If a ModR/M byte is not required, "required" is left unset, and the values - * for each instructionID are identical. - */ +}; +// The following structs are used for the hierarchical decode table. After +// determining the instruction's class (i.e., which IC_* constant applies to +// it), the decoder reads the opcode. Some instructions require specific +// values of the ModR/M byte, so the ModR/M byte indexes into the final table. +// +// If a ModR/M byte is not required, "required" is left unset, and the values +// for each instructionID are identical. typedef uint16_t InstrUID; -/* - * ModRMDecisionType - describes the type of ModR/M decision, allowing the - * consumer to determine the number of entries in it. - * - * MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded - * instruction is the same. - * MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode - * corresponds to one instruction; otherwise, it corresponds to - * a different instruction. - * MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte - * divided by 8 is used to select instruction; otherwise, each - * value of the ModR/M byte could correspond to a different - * instruction. - * MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This - corresponds to instructions that use reg field as opcode - * MODRM_FULL - Potentially, each value of the ModR/M byte could correspond - * to a different instruction. - */ - +// ModRMDecisionType - describes the type of ModR/M decision, allowing the +// consumer to determine the number of entries in it. +// +// MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded +// instruction is the same. +// MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode +// corresponds to one instruction; otherwise, it corresponds to +// a different instruction. +// MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte +// divided by 8 is used to select instruction; otherwise, each +// value of the ModR/M byte could correspond to a different +// instruction. 
+// MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This +// corresponds to instructions that use reg field as opcode +// MODRM_FULL - Potentially, each value of the ModR/M byte could correspond +// to a different instruction. #define MODRMTYPES \ ENUM_ENTRY(MODRM_ONEENTRY) \ ENUM_ENTRY(MODRM_SPLITRM) \ @@ -333,47 +319,13 @@ typedef uint16_t InstrUID; ENUM_ENTRY(MODRM_FULL) #define ENUM_ENTRY(n) n, -typedef enum { +enum ModRMDecisionType { MODRMTYPES MODRM_max -} ModRMDecisionType; -#undef ENUM_ENTRY - -/* - * ModRMDecision - Specifies whether a ModR/M byte is needed and (if so) which - * instruction each possible value of the ModR/M byte corresponds to. Once - * this information is known, we have narrowed down to a single instruction. - */ -struct ModRMDecision { - uint8_t modrm_type; - - /* The macro below must be defined wherever this file is included. */ - INSTRUCTION_IDS -}; - -/* - * OpcodeDecision - Specifies which set of ModR/M->instruction tables to look at - * given a particular opcode. - */ -struct OpcodeDecision { - struct ModRMDecision modRMDecisions[256]; -}; - -/* - * ContextDecision - Specifies which opcode->instruction tables to look at given - * a particular context (set of attributes). Since there are many possible - * contexts, the decoder first uses CONTEXTS_SYM to determine which context - * applies given a specific set of attributes. Hence there are only IC_max - * entries in this table, rather than 2^(ATTR_max). - */ -struct ContextDecision { - struct OpcodeDecision opcodeDecisions[IC_max]; }; +#undef ENUM_ENTRY -/* - * Physical encodings of instruction operands. - */ - +// Physical encodings of instruction operands. #define ENCODINGS \ ENUM_ENTRY(ENCODING_NONE, "") \ ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \ @@ -408,16 +360,13 @@ struct ContextDecision { ENUM_ENTRY(ENCODING_DI, "Destination index; encoded in prefixes") #define ENUM_ENTRY(n, d) n, - typedef enum { - ENCODINGS - ENCODING_max - } OperandEncoding; +enum OperandEncoding { + ENCODINGS + ENCODING_max +}; #undef ENUM_ENTRY -/* - * Semantic interpretations of instruction operands. - */ - +// Semantic interpretations of instruction operands. #define TYPES \ ENUM_ENTRY(TYPE_NONE, "") \ ENUM_ENTRY(TYPE_REL8, "1-byte immediate address") \ @@ -508,56 +457,42 @@ struct ContextDecision { ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state") #define ENUM_ENTRY(n, d) n, -typedef enum { +enum OperandType { TYPES TYPE_max -} OperandType; +}; #undef ENUM_ENTRY -/* - * OperandSpecifier - The specification for how to extract and interpret one - * operand. - */ +/// \brief The specification for how to extract and interpret one operand. struct OperandSpecifier { uint8_t encoding; uint8_t type; }; -/* - * Indicates where the opcode modifier (if any) is to be found. Extended - * opcodes with AddRegFrm have the opcode modifier in the ModR/M byte. - */ - +// Indicates where the opcode modifier (if any) is to be found. Extended +// opcodes with AddRegFrm have the opcode modifier in the ModR/M byte. #define MODIFIER_TYPES \ ENUM_ENTRY(MODIFIER_NONE) #define ENUM_ENTRY(n) n, -typedef enum { +enum ModifierType { MODIFIER_TYPES MODIFIER_max -} ModifierType; +}; #undef ENUM_ENTRY -#define X86_MAX_OPERANDS 5 - -/* - * The specification for how to extract and interpret a full instruction and - * its operands. - */ -struct InstructionSpecifier { - /* The macro below must be defined wherever this file is included. 
*/ - INSTRUCTION_SPECIFIER_FIELDS -}; +static const unsigned X86_MAX_OPERANDS = 5; -/* - * Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode - * are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode, - * respectively. - */ -typedef enum { +/// Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode +/// are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode, +/// respectively. +enum DisassemblerMode { MODE_16BIT, MODE_32BIT, MODE_64BIT -} DisassemblerMode; +}; + +} // namespace X86Disassembler +} // namespace llvm #endif diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index eea0a76..b45b118 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" @@ -28,6 +27,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "asm-printer" + // Include the auto-generated portion of the assembly writer. #define PRINT_ALIAS_INSTR #include "X86GenAsmWriter.inc" diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index f34e633..531183b 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -32,6 +32,8 @@ public: // Autogenerated by tblgen, returns true if we successfully printed an // alias. bool printAliasInstr(const MCInst *MI, raw_ostream &OS); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); // Autogenerated by tblgen. void printInstruction(const MCInst *MI, raw_ostream &OS); diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index db61fb0..baf6507 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -32,7 +32,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, const char *(*getRegName)(unsigned)) { // If this is a shuffle operation, the switch should fill in this state. SmallVector ShuffleMask; - const char *DestName = 0, *Src1Name = 0, *Src2Name = 0; + const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr; switch (MI->getOpcode()) { case X86::INSERTPSrr: @@ -492,7 +492,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // If this was a shuffle operation, print the shuffle mask. if (!ShuffleMask.empty()) { - if (DestName == 0) DestName = Src1Name; + if (!DestName) DestName = Src1Name; OS << (DestName ? 
DestName : "mem") << " = "; // If the two sources are the same, canonicalize the input elements to be diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 1c95d37..1c8466b 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "X86IntelInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" @@ -25,6 +24,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "asm-printer" + #include "X86GenAsmWriter1.inc" void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { diff --git a/lib/Target/X86/MCTargetDesc/Android.mk b/lib/Target/X86/MCTargetDesc/Android.mk index ee37c27..a3c9bc8 100644 --- a/lib/Target/X86/MCTargetDesc/Android.mk +++ b/lib/Target/X86/MCTargetDesc/Android.mk @@ -14,7 +14,8 @@ x86_mc_desc_SRC_FILES := \ X86MCCodeEmitter.cpp \ X86MachORelocationInfo.cpp \ X86MachObjectWriter.cpp \ - X86WinCOFFObjectWriter.cpp + X86WinCOFFObjectWriter.cpp \ + X86WinCOFFStreamer.cpp # For the host # ===================================================== diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt index 3f5a0e2..129c28d 100644 --- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt @@ -5,6 +5,7 @@ add_llvm_library(LLVMX86Desc X86MCCodeEmitter.cpp X86MachObjectWriter.cpp X86ELFObjectWriter.cpp + X86WinCOFFStreamer.cpp X86WinCOFFObjectWriter.cpp X86MachORelocationInfo.cpp X86ELFRelocationInfo.cpp diff --git a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt index 9e1d29c..146d111 100644 --- a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = X86Desc parent = X86 -required_libraries = MC Support X86AsmPrinter X86Info +required_libraries = MC Object Support X86AsmPrinter X86Info add_to_library_groups = X86 diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 23763f7..bf30a8e 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -37,23 +37,29 @@ MCDisableArithRelaxation("mc-x86-disable-arith-relaxation", static unsigned getFixupKindLog2Size(unsigned Kind) { switch (Kind) { - default: llvm_unreachable("invalid fixup kind!"); + default: + llvm_unreachable("invalid fixup kind!"); case FK_PCRel_1: case FK_SecRel_1: - case FK_Data_1: return 0; + case FK_Data_1: + return 0; case FK_PCRel_2: case FK_SecRel_2: - case FK_Data_2: return 1; + case FK_Data_2: + return 1; case FK_PCRel_4: case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_movq_load: case X86::reloc_signed_4byte: case X86::reloc_global_offset_table: case FK_SecRel_4: - case FK_Data_4: return 2; + case FK_Data_4: + return 2; case FK_PCRel_8: case FK_SecRel_8: - case FK_Data_8: return 3; + case FK_Data_8: + case X86::reloc_global_offset_table8: + return 3; } } diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 38fab15..6aeb1f2 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -643,6 +643,10 @@ namespace X86II { /// counted as one operand. 
/// inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) { + bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; + bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); + switch (TSFlags & X86II::FormMask) { default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!"); case X86II::Pseudo: @@ -660,9 +664,6 @@ namespace X86II { case X86II::MRMDestMem: return 0; case X86II::MRMSrcMem: { - bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; - bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; - bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); unsigned FirstMemOp = 1; if (HasVEX_4V) ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). @@ -690,6 +691,8 @@ namespace X86II { unsigned FirstMemOp = 0; if (HasVEX_4V) ++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV). + if (HasEVEX_K) + ++FirstMemOp;// Skip the mask register return FirstMemOp; } case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index c44d88d..3fdec87 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -43,7 +43,7 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, bool IsPCRel) const { // determine the type of the relocation - MCSymbolRefExpr::VariantKind Modifier = Fixup.getAccessVariant(); + MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); unsigned Type; if (getEMachine() == ELF::EM_X86_64) { if (IsPCRel) { @@ -98,6 +98,12 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, } else { switch ((unsigned)Fixup.getKind()) { default: llvm_unreachable("invalid fixup kind!"); + case X86::reloc_global_offset_table8: + Type = ELF::R_X86_64_GOTPC64; + break; + case X86::reloc_global_offset_table: + Type = ELF::R_X86_64_GOTPC32; + break; case FK_Data_8: switch (Modifier) { default: diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp index 4fa519c..b679316 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp @@ -39,7 +39,7 @@ public: if (Sym->isVariable() == false) Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx)); - const MCExpr *Expr = 0; + const MCExpr *Expr = nullptr; // If hasAddend is true, then we need to add Addend (r_addend) to Expr. bool hasAddend = false; diff --git a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h index f2e34cb..09396b7 100644 --- a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h +++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -23,6 +23,7 @@ enum Fixups { reloc_global_offset_table, // 32-bit, relative to the start // of the instruction. Used only // for _GLOBAL_OFFSET_TABLE_. + reloc_global_offset_table8, // 64-bit variant. 
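Together with the ELF writer hunk above and the EmitImmediate change that follows, the new fixup gives a size-dependent mapping for _GLOBAL_OFFSET_TABLE_ references. A minimal sketch of the selection (gotFixupForSize is a hypothetical helper, shown only to summarize the two hunks):

  static MCFixupKind gotFixupForSize(unsigned Size) {
    assert((Size == 4 || Size == 8) && "GOT reference must be 4 or 8 bytes");
    // 4-byte references relocate as R_X86_64_GOTPC32, 8-byte ones as R_X86_64_GOTPC64.
    return MCFixupKind(Size == 8 ? X86::reloc_global_offset_table8
                                 : X86::reloc_global_offset_table);
  }
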
// Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 6561804..39480ea 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -51,7 +51,7 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { TextAlignFillValue = 0x90; if (!is64Bit) - Data64bitsDirective = 0; // we can't emit a 64-bit unit + Data64bitsDirective = nullptr; // we can't emit a 64-bit unit // Use ## as a comment string so that .s files generated by llvm can go // through the GCC preprocessor without causing an error. This is needed @@ -115,7 +115,7 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { // into two .words. if ((T.getOS() == Triple::OpenBSD || T.getOS() == Triple::Bitrig) && T.getArch() == Triple::x86) - Data64bitsDirective = 0; + Data64bitsDirective = nullptr; // Always enable the integrated assembler by default. // Clang also enabled it when the OS is Solaris but that is redundant here. @@ -157,8 +157,10 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { void X86MCAsmInfoGNUCOFF::anchor() { } X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { - if (Triple.getArch() == Triple::x86_64) + if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; + PointerSize = 8; + } AssemblerDialect = AsmWriterFlavor; diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index e6fb037..2152b21 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mccodeemitter" #include "MCTargetDesc/X86MCTargetDesc.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86FixupKinds.h" @@ -27,6 +26,8 @@ using namespace llvm; +#define DEBUG_TYPE "mccodeemitter" + namespace { class X86MCCodeEmitter : public MCCodeEmitter { X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION; @@ -285,7 +286,7 @@ enum GlobalOffsetTableExprKind { }; static GlobalOffsetTableExprKind StartsWithGlobalOffsetTable(const MCExpr *Expr) { - const MCExpr *RHS = 0; + const MCExpr *RHS = nullptr; if (Expr->getKind() == MCExpr::Binary) { const MCBinaryExpr *BE = static_cast(Expr); Expr = BE->getLHS(); @@ -316,7 +317,7 @@ void X86MCCodeEmitter:: EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS, SmallVectorImpl &Fixups, int ImmOffset) const { - const MCExpr *Expr = NULL; + const MCExpr *Expr = nullptr; if (DispOp.isImm()) { // If this is a simple integer displacement that doesn't require a // relocation, emit it now. @@ -339,7 +340,13 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, if (Kind != GOT_None) { assert(ImmOffset == 0); - FixupKind = MCFixupKind(X86::reloc_global_offset_table); + if (Size == 8) { + FixupKind = MCFixupKind(X86::reloc_global_offset_table8); + } else { + assert(Size == 4); + FixupKind = MCFixupKind(X86::reloc_global_offset_table); + } + if (Kind == GOT_Normal) ImmOffset = CurByte; } else if (Expr->getKind() == MCExpr::SymbolRef) { @@ -1421,6 +1428,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM6r: case X86II::MRM7r: { if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). 
++CurOp; + if (HasEVEX_K) // Skip writemask + ++CurOp; EmitByte(BaseOpcode, CurByte, OS); uint64_t Form = TSFlags & X86II::FormMask; EmitRegModRMByte(MI.getOperand(CurOp++), @@ -1436,6 +1445,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM6m: case X86II::MRM7m: { if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). ++CurOp; + if (HasEVEX_K) // Skip writemask + ++CurOp; EmitByte(BaseOpcode, CurByte, OS); uint64_t Form = TSFlags & X86II::FormMask; EmitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 0 : Form-X86II::MRM0m, diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 09fdb9c..e63036c 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -27,6 +27,12 @@ #include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" +#if _MSC_VER +#include +#endif + +using namespace llvm; + #define GET_REGINFO_MC_DESC #include "X86GenRegisterInfo.inc" @@ -36,13 +42,6 @@ #define GET_SUBTARGETINFO_MC_DESC #include "X86GenSubtargetInfo.inc" -#if _MSC_VER -#include -#endif - -using namespace llvm; - - std::string X86_MC::ParseX86Triple(StringRef TT) { Triple TheTriple(TT); std::string FS; @@ -230,14 +229,8 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU, } std::string CPUName = CPU; - if (CPUName.empty()) { -#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\ - || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) - CPUName = sys::getHostCPUName(); -#else + if (CPUName.empty()) CPUName = "generic"; -#endif - } MCSubtargetInfo *X = new MCSubtargetInfo(); InitX86MCSubtargetInfo(X, TT, CPUName, ArchFS); @@ -294,13 +287,13 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) { // Initial state of the frame pointer is esp+stackGrowth. unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP; MCCFIInstruction Inst = MCCFIInstruction::createDefCfa( - 0, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth); + nullptr, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth); MAI->addInitialFrameState(Inst); // Add return address to move list unsigned InstPtr = is64Bit ? 
X86::RIP : X86::EIP; MCCFIInstruction Inst2 = MCCFIInstruction::createOffset( - 0, MRI.getDwarfRegNum(InstPtr, true), stackGrowth); + nullptr, MRI.getDwarfRegNum(InstPtr, true), stackGrowth); MAI->addInitialFrameState(Inst2); return MAI; @@ -365,13 +358,16 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, bool NoExecStack) { Triple TheTriple(TT); - if (TheTriple.isOSBinFormatMachO()) + switch (TheTriple.getObjectFormat()) { + default: llvm_unreachable("unsupported object format"); + case Triple::MachO: return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll); - - if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF()) - return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll); - - return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack); + case Triple::COFF: + assert(TheTriple.isOSWindows() && "only Windows COFF is supported"); + return createX86WinCOFFStreamer(Ctx, MAB, _Emitter, _OS, RelaxAll); + case Triple::ELF: + return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack); + } } static MCInstPrinter *createX86MCInstPrinter(const Target &T, @@ -384,7 +380,7 @@ static MCInstPrinter *createX86MCInstPrinter(const Target &T, return new X86ATTInstPrinter(MAI, MII, MRI); if (SyntaxVariant == 1) return new X86IntelInstPrinter(MAI, MII, MRI); - return 0; + return nullptr; } static MCRelocationInfo *createX86MCRelocationInfo(StringRef TT, diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 41ae435..8fe40fd 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -26,6 +26,7 @@ class MCObjectWriter; class MCRegisterInfo; class MCSubtargetInfo; class MCRelocationInfo; +class MCStreamer; class Target; class StringRef; class raw_ostream; @@ -84,6 +85,14 @@ MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef TT, StringRef CPU); +/// createX86WinCOFFStreamer - Construct an X86 Windows COFF machine code +/// streamer which will generate PE/COFF format object files. +/// +/// Takes ownership of \p AB and \p CE. +MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, + MCCodeEmitter *CE, raw_ostream &OS, + bool RelaxAll); + /// createX86MachObjectWriter - Construct an X86 Mach-O object writer. MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS, bool Is64Bit, diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp index f2023e3..3b81d53 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp @@ -40,7 +40,7 @@ public: // FIXME: check that the value is actually the same. 
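The pattern this FIXME refers to appears in both relocation-info hunks in this patch (the ELF one earlier and the Mach-O one here): a temporary symbol gets its resolved address attached exactly once. As a sketch, under the assumption that bindSymbolOnce is a hypothetical helper and not part of the patch:

  static void bindSymbolOnce(MCSymbol *Sym, uint64_t SymAddr, MCContext &Ctx) {
    // Attach the resolved address only if no value has been bound yet.
    if (!Sym->isVariable())
      Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
  }
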
if (Sym->isVariable() == false) Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx)); - const MCExpr *Expr = 0; + const MCExpr *Expr = nullptr; switch(RelType) { case X86_64_RELOC_TLV: diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 1a35ced..ead3338 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -146,13 +146,13 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, const MCSymbol *A = &Target.getSymA()->getSymbol(); if (A->isTemporary()) A = &A->AliasedSymbol(); - MCSymbolData &A_SD = Asm.getSymbolData(*A); + const MCSymbolData &A_SD = Asm.getSymbolData(*A); const MCSymbolData *A_Base = Asm.getAtom(&A_SD); const MCSymbol *B = &Target.getSymB()->getSymbol(); if (B->isTemporary()) B = &B->AliasedSymbol(); - MCSymbolData &B_SD = Asm.getSymbolData(*B); + const MCSymbolData &B_SD = Asm.getSymbolData(*B); const MCSymbolData *B_Base = Asm.getAtom(&B_SD); // Neither symbol can be modified. @@ -186,9 +186,9 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, false); Value += Writer->getSymbolAddress(&A_SD, Layout) - - (A_Base == NULL ? 0 : Writer->getSymbolAddress(A_Base, Layout)); + (!A_Base ? 0 : Writer->getSymbolAddress(A_Base, Layout)); Value -= Writer->getSymbolAddress(&B_SD, Layout) - - (B_Base == NULL ? 0 : Writer->getSymbolAddress(B_Base, Layout)); + (!B_Base ? 0 : Writer->getSymbolAddress(B_Base, Layout)); if (A_Base) { Index = A_Base->getIndex(); @@ -220,7 +220,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, Type = MachO::X86_64_RELOC_SUBTRACTOR; } else { const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); - MCSymbolData &SD = Asm.getSymbolData(*Symbol); + const MCSymbolData &SD = Asm.getSymbolData(*Symbol); const MCSymbolData *Base = Asm.getAtom(&SD); // Relocations inside debug sections always use local relocations when @@ -231,7 +231,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, const MCSectionMachO &Section = static_cast( Fragment->getParent()->getSection()); if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) - Base = 0; + Base = nullptr; } // x86_64 almost always uses external relocations, except when there is no @@ -369,7 +369,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, // See . const MCSymbol *A = &Target.getSymA()->getSymbol(); - MCSymbolData *A_SD = &Asm.getSymbolData(*A); + const MCSymbolData *A_SD = &Asm.getSymbolData(*A); if (!A_SD->getFragment()) report_fatal_error("symbol '" + A->getName() + @@ -382,7 +382,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, uint32_t Value2 = 0; if (const MCSymbolRefExpr *B = Target.getSymB()) { - MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol()); + const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol()); if (!B_SD->getFragment()) report_fatal_error("symbol '" + B->getSymbol().getName() + @@ -465,7 +465,7 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer, unsigned IsPCRel = 0; // Get the symbol data. 
- MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol()); + const MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol()); unsigned Index = SD_A->getIndex(); // We're only going to have a second symbol in pic mode and it'll be a @@ -476,7 +476,8 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer, // If this is a subtraction then we're pcrel. uint32_t FixupAddress = Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset(); - MCSymbolData *SD_B = &Asm.getSymbolData(Target.getSymB()->getSymbol()); + const MCSymbolData *SD_B = + &Asm.getSymbolData(Target.getSymB()->getSymbol()); IsPCRel = 1; FixedValue = (FixupAddress - Writer->getSymbolAddress(SD_B, Layout) + Target.getConstant()); @@ -524,7 +525,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, } // Get the symbol data, if any. - MCSymbolData *SD = 0; + const MCSymbolData *SD = nullptr; if (Target.getSymA()) SD = &Asm.getSymbolData(Target.getSymA()->getSymbol()); diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index ffc9e8d..40af822 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -23,10 +23,8 @@ namespace llvm { namespace { class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { - const bool Is64Bit; - public: - X86WinCOFFObjectWriter(bool Is64Bit_); + X86WinCOFFObjectWriter(bool Is64Bit); virtual ~X86WinCOFFObjectWriter(); unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, @@ -34,10 +32,9 @@ namespace { }; } -X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit_) - : MCWinCOFFObjectTargetWriter(Is64Bit_ ? COFF::IMAGE_FILE_MACHINE_AMD64 : - COFF::IMAGE_FILE_MACHINE_I386), - Is64Bit(Is64Bit_) {} +X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit) + : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64 + : COFF::IMAGE_FILE_MACHINE_I386) {} X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {} @@ -49,29 +46,46 @@ unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target, MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); - switch (FixupKind) { - case FK_PCRel_4: - case X86::reloc_riprel_4byte: - case X86::reloc_riprel_4byte_movq_load: - return Is64Bit ? COFF::IMAGE_REL_AMD64_REL32 : COFF::IMAGE_REL_I386_REL32; - case FK_Data_4: - case X86::reloc_signed_4byte: - if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32) - return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32NB : - COFF::IMAGE_REL_I386_DIR32NB; - return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32 : COFF::IMAGE_REL_I386_DIR32; - case FK_Data_8: - if (Is64Bit) + if (getMachine() == COFF::IMAGE_FILE_MACHINE_AMD64) { + switch (FixupKind) { + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return COFF::IMAGE_REL_AMD64_REL32; + case FK_Data_4: + case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32) + return COFF::IMAGE_REL_AMD64_ADDR32NB; + return COFF::IMAGE_REL_AMD64_ADDR32; + case FK_Data_8: return COFF::IMAGE_REL_AMD64_ADDR64; - llvm_unreachable("unsupported relocation type"); - case FK_SecRel_2: - return Is64Bit ? COFF::IMAGE_REL_AMD64_SECTION - : COFF::IMAGE_REL_I386_SECTION; - case FK_SecRel_4: - return Is64Bit ? 
COFF::IMAGE_REL_AMD64_SECREL : COFF::IMAGE_REL_I386_SECREL; - default: - llvm_unreachable("unsupported relocation type"); - } + case FK_SecRel_2: + return COFF::IMAGE_REL_AMD64_SECTION; + case FK_SecRel_4: + return COFF::IMAGE_REL_AMD64_SECREL; + default: + llvm_unreachable("unsupported relocation type"); + } + } else if (getMachine() == COFF::IMAGE_FILE_MACHINE_I386) { + switch (FixupKind) { + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return COFF::IMAGE_REL_I386_REL32; + case FK_Data_4: + case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32) + return COFF::IMAGE_REL_I386_DIR32NB; + return COFF::IMAGE_REL_I386_DIR32; + case FK_SecRel_2: + return COFF::IMAGE_REL_I386_SECTION; + case FK_SecRel_4: + return COFF::IMAGE_REL_I386_SECREL; + default: + llvm_unreachable("unsupported relocation type"); + } + } else + llvm_unreachable("Unsupported COFF machine type."); } MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_ostream &OS, diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp new file mode 100644 index 0000000..c62fd0a --- /dev/null +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -0,0 +1,51 @@ +//===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "X86MCTargetDesc.h" +#include "llvm/MC/MCWinCOFFStreamer.h" + +using namespace llvm; + +namespace { +class X86WinCOFFStreamer : public MCWinCOFFStreamer { +public: + X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE, + raw_ostream &OS) + : MCWinCOFFStreamer(C, AB, *CE, OS) { } + + void EmitWin64EHHandlerData() override; + void FinishImpl() override; +}; + +void X86WinCOFFStreamer::EmitWin64EHHandlerData() { + MCStreamer::EmitWin64EHHandlerData(); + + // We have to emit the unwind info now, because this directive + // actually switches to the .xdata section! + MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentW64UnwindInfo()); +} + +void X86WinCOFFStreamer::FinishImpl() { + EmitFrames(nullptr); + EmitW64Tables(); + + MCWinCOFFStreamer::FinishImpl(); +} +} + +namespace llvm { +MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, + MCCodeEmitter *CE, raw_ostream &OS, + bool RelaxAll) { + X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS); + S->getAssembler().setRelaxAll(RelaxAll); + return S; +} +} + diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 18e6845..64e8ea8 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -30,9 +30,9 @@ class X86TargetMachine; FunctionPass *createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel); -/// createGlobalBaseRegPass - This pass initializes a global base +/// createX86GlobalBaseRegPass - This pass initializes a global base /// register for PIC on x86-32. 
-FunctionPass* createGlobalBaseRegPass(); +FunctionPass* createX86GlobalBaseRegPass(); /// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses /// to local-dynamic TLS variables so that the TLS base address for the module diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 78edcf0..6912b57 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -166,6 +166,8 @@ def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect", "Call register indirect">; def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", "LEA instruction needs inputs at AG stage">; +def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", + "LEA instruction with certain arguments is slow">; //===----------------------------------------------------------------------===// // X86 processors supported. @@ -195,8 +197,7 @@ def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>; def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; def : Proc<"pentium4", [FeatureSSE2]>; def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; -def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem, - FeatureFastUAMem]>; + // Intel Core Duo. def : ProcessorModel<"yonah", SandyBridgeModel, [FeatureSSE3, FeatureSlowBTMem]>; @@ -227,6 +228,7 @@ def : ProcessorModel<"slm", SLMModel, [ProcIntelSLM, FeaturePCLMUL, FeatureAES, FeatureCallRegIndirect, FeaturePRFCHW, + FeatureSlowLEA, FeatureSlowBTMem, FeatureFastUAMem]>; // "Arrandale" along with corei3 and corei5 def : ProcessorModel<"corei7", SandyBridgeModel, @@ -329,6 +331,13 @@ def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeaturePOPCNT, FeatureBMI, FeatureTBM, FeatureFMA, FeatureFSGSBase]>; +// Excavator +def : Proc<"bdver4", [FeatureAVX2, FeatureXOP, FeatureFMA4, + FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, + FeaturePCLMUL, FeatureF16C, FeatureLZCNT, + FeaturePOPCNT, FeatureBMI, FeatureBMI2, + FeatureTBM, FeatureFMA, FeatureFSGSBase]>; + def : Proc<"geode", [Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureMMX]>; @@ -336,6 +345,20 @@ def : Proc<"winchip2", [Feature3DNow]>; def : Proc<"c3", [Feature3DNow]>; def : Proc<"c3-2", [FeatureSSE1]>; +// We also provide a generic 64-bit specific x86 processor model which tries to +// be good for modern chips without enabling instruction set encodings past the +// basic SSE2 and 64-bit ones. It disables slow things from any mainstream and +// modern 64-bit x86 chip, and enables features that are generally beneficial. +// +// We currently use the Sandy Bridge model as the default scheduling model as +// we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which +// covers a huge swath of x86 processors. If there are specific scheduling +// knobs which need to be tuned differently for AMD chips, we might consider +// forming a common base for them. 
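The FeatureSlowLEA bit defined above feeds straight into pass gating: TableGen materializes it as a bool on X86Subtarget with a slowLEA() accessor, which the X86FixupLEAs change later in this patch queries. A sketch of the gate as a free function, mirroring that hunk:

  static bool wantsLEAFixups(const MachineFunction &MF) {
    const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
    // Run only where LEA needs inputs at the AG stage early (Atom) or is
    // slow with certain arguments (Silvermont).
    return ST.LEAusesAG() || ST.slowLEA();
  }
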
+def : ProcessorModel<"x86-64", SandyBridgeModel, + [FeatureSSE2, Feature64Bit, FeatureSlowBTMem, + FeatureFastUAMem]>; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index fb66acc..1dca568 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -15,7 +15,6 @@ #include "X86AsmPrinter.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" -#include "X86COFFMachineModuleInfo.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "llvm/ADT/SmallString.h" @@ -102,7 +101,7 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); MachineModuleInfoImpl::StubValueTy &StubSym = P.MMI->getObjFileInfo().getGVStubEntry(Sym); - if (StubSym.getPointer() == 0) + if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage()); } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){ @@ -110,14 +109,14 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, MachineModuleInfoImpl::StubValueTy &StubSym = P.MMI->getObjFileInfo().getHiddenGVStubEntry( Sym); - if (StubSym.getPointer() == 0) + if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage()); } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) { MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$stub"); MachineModuleInfoImpl::StubValueTy &StubSym = P.MMI->getObjFileInfo().getFnStubEntry(Sym); - if (StubSym.getPointer() == 0) + if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage()); } @@ -174,7 +173,7 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, unsigned OpNo, raw_ostream &O, - const char *Modifier = 0, unsigned AsmVariant = 0); + const char *Modifier = nullptr, unsigned AsmVariant = 0); /// printPCRelImm - This is used to print an immediate value that ends up /// being encoded as a pc-relative value. 
These print slightly differently, for @@ -232,7 +231,7 @@ static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI, unsigned Op, raw_ostream &O, - const char *Modifier = NULL) { + const char *Modifier = nullptr) { const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); @@ -284,7 +283,7 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI, static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI, unsigned Op, raw_ostream &O, - const char *Modifier = NULL) { + const char *Modifier = nullptr) { assert(isMem(MI, Op) && "Invalid memory reference!"); const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg); if (Segment.getReg()) { @@ -296,7 +295,7 @@ static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI, static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI, unsigned Op, raw_ostream &O, - const char *Modifier = NULL, + const char *Modifier = nullptr, unsigned AsmVariant = 1) { const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); @@ -464,7 +463,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, } } - printOperand(*this, MI, OpNo, O, /*Modifier*/ 0, AsmVariant); + printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant); return false; } @@ -527,6 +526,55 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { } } +static void +emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel, + MachineModuleInfoImpl::StubValueTy &MCSym) { + // L_foo$stub: + OutStreamer.EmitLabel(StubLabel); + // .indirect_symbol _foo + OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol); + + if (MCSym.getInt()) + // External to current translation unit. + OutStreamer.EmitIntValue(0, 4/*size*/); + else + // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info + // pointers need to be indirect and pc-rel. We accomplish this by + // using NLPs; however, sometimes the types are local to the file. + // We need to fill in the value for the NLP in those cases. 
+ OutStreamer.EmitValue( + MCSymbolRefExpr::Create(MCSym.getPointer(), OutStreamer.getContext()), + 4 /*size*/); +} + +void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) { + SmallString<128> Directive; + raw_svector_ostream OS(Directive); + StringRef Name = Sym->getName(); + + if (Subtarget->isTargetKnownWindowsMSVC()) + OS << " /EXPORT:"; + else + OS << " -export:"; + + if ((Subtarget->isTargetWindowsGNU() || Subtarget->isTargetWindowsCygwin()) && + (Name[0] == getDataLayout().getGlobalPrefix())) + Name = Name.drop_front(); + + OS << Name; + + if (IsData) { + if (Subtarget->isTargetKnownWindowsMSVC()) + OS << ",DATA"; + else + OS << ",data"; + } + + OS.flush(); + OutStreamer.EmitBytes(Directive); +} void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { if (Subtarget->isTargetMacho()) { @@ -547,11 +595,11 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { 5, SectionKind::getMetadata()); OutStreamer.SwitchSection(TheSection); - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + for (const auto &Stub : Stubs) { // L_foo$stub: - OutStreamer.EmitLabel(Stubs[i].first); + OutStreamer.EmitLabel(Stub.first); // .indirect_symbol _foo - OutStreamer.EmitSymbolAttribute(Stubs[i].second.getPointer(), + OutStreamer.EmitSymbolAttribute(Stub.second.getPointer(), MCSA_IndirectSymbol); // hlt; hlt; hlt; hlt; hlt hlt = 0xf4. const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4"; @@ -571,44 +619,24 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { SectionKind::getMetadata()); OutStreamer.SwitchSection(TheSection); - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - // L_foo$non_lazy_ptr: - OutStreamer.EmitLabel(Stubs[i].first); - // .indirect_symbol _foo - MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second; - OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), - MCSA_IndirectSymbol); - // .long 0 - if (MCSym.getInt()) - // External to current translation unit. - OutStreamer.EmitIntValue(0, 4/*size*/); - else - // Internal to current translation unit. - // - // When we place the LSDA into the TEXT section, the type info - // pointers need to be indirect and pc-rel. We accomplish this by - // using NLPs. However, sometimes the types are local to the file. So - // we need to fill in the value for the NLP in those cases. 
- OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), - OutContext), 4/*size*/); - } + for (auto &Stub : Stubs) + emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second); + Stubs.clear(); OutStreamer.AddBlankLine(); } Stubs = MMIMacho.GetHiddenGVStubList(); if (!Stubs.empty()) { - OutStreamer.SwitchSection(getObjFileLowering().getDataSection()); - EmitAlignment(2); - - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - // L_foo$non_lazy_ptr: - OutStreamer.EmitLabel(Stubs[i].first); - // .long _foo - OutStreamer.EmitValue(MCSymbolRefExpr:: - Create(Stubs[i].second.getPointer(), - OutContext), 4/*size*/); - } + const MCSection *TheSection = + OutContext.getMachOSection("__IMPORT", "__pointers", + MachO::S_NON_LAZY_SYMBOL_POINTERS, + SectionKind::getMetadata()); + OutStreamer.SwitchSection(TheSection); + + for (auto &Stub : Stubs) + emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second); + Stubs.clear(); OutStreamer.AddBlankLine(); } @@ -630,46 +658,25 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } if (Subtarget->isTargetCOFF()) { - X86COFFMachineModuleInfo &COFFMMI = - MMI->getObjFileInfo(); - - // Emit type information for external functions - typedef X86COFFMachineModuleInfo::externals_iterator externals_iterator; - for (externals_iterator I = COFFMMI.externals_begin(), - E = COFFMMI.externals_end(); - I != E; ++I) { - OutStreamer.BeginCOFFSymbolDef(CurrentFnSym); - OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_EXTERNAL); - OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION - << COFF::SCT_COMPLEX_TYPE_SHIFT); - OutStreamer.EndCOFFSymbolDef(); - } - // Necessary for dllexport support std::vector DLLExportedFns, DLLExportedGlobals; - for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) - if (I->hasDLLExportStorageClass()) - DLLExportedFns.push_back(getSymbol(I)); + for (const auto &Function : M) + if (Function.hasDLLExportStorageClass()) + DLLExportedFns.push_back(getSymbol(&Function)); - for (Module::const_global_iterator I = M.global_begin(), - E = M.global_end(); I != E; ++I) - if (I->hasDLLExportStorageClass()) - DLLExportedGlobals.push_back(getSymbol(I)); + for (const auto &Global : M.globals()) + if (Global.hasDLLExportStorageClass()) + DLLExportedGlobals.push_back(getSymbol(&Global)); - for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end(); - I != E; ++I) { - const GlobalValue *GV = I; - if (!GV->hasDLLExportStorageClass()) + for (const auto &Alias : M.aliases()) { + if (!Alias.hasDLLExportStorageClass()) continue; - while (const GlobalAlias *A = dyn_cast(GV)) - GV = A->getAliasedGlobal(); - - if (isa(GV)) - DLLExportedFns.push_back(getSymbol(I)); - else if (isa(GV)) - DLLExportedGlobals.push_back(getSymbol(I)); + if (Alias.getType()->getElementType()->isFunctionTy()) + DLLExportedFns.push_back(getSymbol(&Alias)); + else + DLLExportedGlobals.push_back(getSymbol(&Alias)); } // Output linker support code for dllexported globals on windows. 
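The per-symbol loops being deleted below are what the new GenerateExportDirective helper, added earlier in this file's diff, folds into one place. The directive text differs by toolchain; a sketch of the string construction (exportDirective is a hypothetical helper, assuming the MSVC vs. GNU split shown in that hunk):

  static std::string exportDirective(bool IsMSVC, llvm::StringRef Name, bool IsData) {
    std::string Directive = IsMSVC ? " /EXPORT:" : " -export:";
    Directive += Name; // on MinGW/Cygwin the global prefix is dropped first
    if (IsData)
      Directive += IsMSVC ? ",DATA" : ",data"; // mark data exports for the linker
    return Directive;
  }
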
@@ -678,28 +685,11 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { static_cast(getObjFileLowering()); OutStreamer.SwitchSection(TLOFCOFF.getDrectveSection()); - SmallString<128> name; - for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i) { - if (Subtarget->isTargetKnownWindowsMSVC()) - name = " /EXPORT:"; - else - name = " -export:"; - name += DLLExportedGlobals[i]->getName(); - if (Subtarget->isTargetKnownWindowsMSVC()) - name += ",DATA"; - else - name += ",data"; - OutStreamer.EmitBytes(name); - } - for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i) { - if (Subtarget->isTargetKnownWindowsMSVC()) - name = " /EXPORT:"; - else - name = " -export:"; - name += DLLExportedFns[i]->getName(); - OutStreamer.EmitBytes(name); - } + for (auto & Symbol : DLLExportedGlobals) + GenerateExportDirective(Symbol, /*IsData=*/true); + for (auto & Symbol : DLLExportedFns) + GenerateExportDirective(Symbol, /*IsData=*/false); } } @@ -715,9 +705,9 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); const DataLayout *TD = TM.getDataLayout(); - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - OutStreamer.EmitLabel(Stubs[i].first); - OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), + for (const auto &Stub : Stubs) { + OutStreamer.EmitLabel(Stub.first); + OutStreamer.EmitSymbolValue(Stub.second.getPointer(), TD->getPointerSize()); } Stubs.clear(); diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index 3308cc2..e4eef5d 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -16,13 +16,15 @@ #include "llvm/Target/TargetMachine.h" namespace llvm { - class MCStreamer; +class MCSymbol; class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { const X86Subtarget *Subtarget; StackMaps SM; + void GenerateExportDirective(const MCSymbol *Sym, bool IsData); + public: explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) : AsmPrinter(TM, Streamer), SM(*this) { diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.cpp b/lib/Target/X86/X86COFFMachineModuleInfo.cpp deleted file mode 100644 index 6a6125b..0000000 --- a/lib/Target/X86/X86COFFMachineModuleInfo.cpp +++ /dev/null @@ -1,19 +0,0 @@ -//===-- X86COFFMachineModuleInfo.cpp - X86 COFF MMI Impl ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This is an MMI implementation for X86 COFF (windows) targets. -// -//===----------------------------------------------------------------------===// - -#include "X86COFFMachineModuleInfo.h" -using namespace llvm; - - -X86COFFMachineModuleInfo::~X86COFFMachineModuleInfo() { -} diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.h b/lib/Target/X86/X86COFFMachineModuleInfo.h deleted file mode 100644 index 0dfeb42..0000000 --- a/lib/Target/X86/X86COFFMachineModuleInfo.h +++ /dev/null @@ -1,46 +0,0 @@ -//===-- X86coffmachinemoduleinfo.h - X86 COFF MMI Impl ----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This is an MMI implementation for X86 COFF (windows) targets. 
-// -//===----------------------------------------------------------------------===// - -#ifndef X86COFF_MACHINEMODULEINFO_H -#define X86COFF_MACHINEMODULEINFO_H - -#include "X86MachineFunctionInfo.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/CodeGen/MachineModuleInfo.h" - -namespace llvm { - class X86MachineFunctionInfo; - class DataLayout; - -/// X86COFFMachineModuleInfo - This is a MachineModuleInfoImpl implementation -/// for X86 COFF targets. -class X86COFFMachineModuleInfo : public MachineModuleInfoImpl { - DenseSet Externals; -public: - X86COFFMachineModuleInfo(const MachineModuleInfo &) {} - virtual ~X86COFFMachineModuleInfo(); - - void addExternalFunction(MCSymbol* Symbol) { - Externals.insert(Symbol); - } - - typedef DenseSet::const_iterator externals_iterator; - externals_iterator externals_begin() const { return Externals.begin(); } - externals_iterator externals_end() const { return Externals.end(); } -}; - - - -} // end namespace llvm - -#endif diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h index 040da35..e76f9fd 100644 --- a/lib/Target/X86/X86CallingConv.h +++ b/lib/Target/X86/X86CallingConv.h @@ -29,33 +29,6 @@ inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, return false; } -inline bool CC_X86_CDeclMethod_SRet(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State) { - // Swap the order of the first two parameters if the first parameter is sret. - if (ArgFlags.isSRet()) { - assert(ValNo == 0); - assert(ValVT == MVT::i32); - State.AllocateStack(8, 4); - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, 4, LocVT, LocInfo)); - - // Indicate that we need to swap the order of the first and second - // parameters by "allocating" register zero. There are no register - // parameters with cdecl methods, so we can use this to communicate to the - // next call. - State.AllocateReg(1); - return true; - } else if (ValNo == 1 && State.isAllocated(1)) { - assert(ValVT == MVT::i32 && "non-i32-sized this param unsupported"); - // Stack was already allocated while processing sret. - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, 0, LocVT, LocInfo)); - return true; - } - - // All other args use the C calling convention. - return false; -} - } // End llvm namespace #endif diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 1cfd827..0824d4e 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -485,15 +485,6 @@ def CC_X86_32_ThisCall_Win : CallingConv<[ CCDelegateTo ]>; -def CC_X86_CDeclMethod : CallingConv<[ - // Promote i8/i16 arguments to i32. 
- CCIfType<[i8, i16], CCPromoteToType<i32>>, - - CCCustom<"CC_X86_CDeclMethod_SRet">, - - CCDelegateTo<CC_X86_32_Common> -]>; - def CC_X86_32_ThisCall : CallingConv<[ CCIfSubtarget<"isTargetCygMing()", CCDelegateTo<CC_X86_32_ThisCall_Mingw>>, CCDelegateTo<CC_X86_32_ThisCall_Win> @@ -583,7 +574,6 @@ def CC_Intel_OCL_BI : CallingConv<[ def CC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, - CCIfCC<"CallingConv::X86_CDeclMethod", CCDelegateTo<CC_X86_CDeclMethod>>, CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>, diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index f6c4c2e..76718d0 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-emitter" #include "X86.h" #include "X86InstrInfo.h" #include "X86JITInfo.h" @@ -36,6 +35,8 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; +#define DEBUG_TYPE "x86-emitter" + STATISTIC(NumEmitted, "Number of machine instructions emitted"); namespace { @@ -52,7 +53,7 @@ namespace { public: static char ID; explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce) - : MachineFunctionPass(ID), II(0), TD(0), TM(tm), + : MachineFunctionPass(ID), II(nullptr), TD(nullptr), TM(tm), MCE(mce), PICBaseOffset(0), Is64BitMode(false), IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} @@ -450,7 +451,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, intptr_t PCAdj) { const MachineOperand &Op3 = MI.getOperand(Op+3); int DispVal = 0; - const MachineOperand *DispForReloc = 0; + const MachineOperand *DispForReloc = nullptr; // Figure out what sort of displacement we have to handle here. if (Op3.isGlobal()) { @@ -1475,7 +1476,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, #ifndef NDEBUG dbgs() << "Cannot encode all operands of: " << MI << "\n"; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } MCE.processDebugLoc(MI.getDebugLoc(), false); diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 1aab1ea..56bcfa3 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -183,7 +183,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &ResultReg) { // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC = nullptr; switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: @@ -363,7 +363,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { // it works...). if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) if (const GlobalVariable *GVar = - dyn_cast_or_null<GlobalVariable>(GA->getAliasedGlobal())) + dyn_cast_or_null<GlobalVariable>(GA->getAliasee())) if (GVar->isThreadLocal()) return false; @@ -406,7 +406,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { } else { // Issue load from stub. unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC = nullptr; X86AddressMode StubAM; StubAM.Base.Reg = AM.Base.Reg; StubAM.GV = GV; @@ -441,7 +441,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { // Now construct the final address. Note that the Disp, Scale, // and Index values may already be set here.
AM.Base.Reg = LoadReg; - AM.GV = 0; + AM.GV = nullptr; return true; } } @@ -467,7 +467,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { SmallVector GEPs; redo_gep: - const User *U = NULL; + const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; if (const Instruction *I = dyn_cast(V)) { // Don't walk into other basic blocks; it's possible we haven't @@ -626,7 +626,7 @@ redo_gep: /// X86SelectCallAddress - Attempt to fill in an address from the given value. /// bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { - const User *U = NULL; + const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; const Instruction *I = dyn_cast(V); // Record if the value is defined in the same basic block. @@ -1247,7 +1247,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { bool X86FastISel::X86SelectShift(const Instruction *I) { unsigned CReg = 0, OpReg = 0; - const TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC = nullptr; if (I->getType()->isIntegerTy(8)) { CReg = X86::CL; RC = &X86::GR8RegClass; @@ -1487,7 +1487,7 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) { if (!Subtarget->hasCMov()) return false; unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC = nullptr; if (VT == MVT::i16) { Opc = X86::CMOVE16rr; RC = &X86::GR16RegClass; @@ -1821,10 +1821,10 @@ bool X86FastISel::FastLowerArguments() { } } - static const uint16_t GPR32ArgRegs[] = { + static const MCPhysReg GPR32ArgRegs[] = { X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D }; - static const uint16_t GPR64ArgRegs[] = { + static const MCPhysReg GPR64ArgRegs[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9 }; @@ -1865,7 +1865,7 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (cast(I)->isTailCall()) return false; - return DoSelectCall(I, 0); + return DoSelectCall(I, nullptr); } static unsigned computeBytesPoppedByCallee(const X86Subtarget &Subtarget, @@ -1936,8 +1936,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (!X86SelectCallAddress(Callee, CalleeAM)) return false; unsigned CalleeOp = 0; - const GlobalValue *GV = 0; - if (CalleeAM.GV != 0) { + const GlobalValue *GV = nullptr; + if (CalleeAM.GV != nullptr) { GV = CalleeAM.GV; } else if (CalleeAM.Base.Reg != 0) { CalleeOp = CalleeAM.Base.Reg; @@ -2163,7 +2163,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (Subtarget->is64Bit() && isVarArg && !isWin64) { // Count the number of XMM registers allocated. - static const uint16_t XMMArgRegs[] = { + static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; @@ -2387,7 +2387,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC = nullptr; switch (VT.SimpleTy) { default: return 0; case MVT::i8: @@ -2437,7 +2437,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { // If the expression is just a basereg, then we're done, otherwise we need // to emit an LEA. 
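The fast path referred to here is easy to restate: a materialized address is only reusable as-is when nothing but the base register is populated. As a sketch (isPlainBaseReg is a hypothetical predicate summarizing the condition that follows):

  static bool isPlainBaseReg(const X86AddressMode &AM) {
    return AM.BaseType == X86AddressMode::RegBase &&
           AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr;
  }
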
if (AM.BaseType == X86AddressMode::RegBase && - AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0) + AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) return AM.Base.Reg; Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; @@ -2510,7 +2510,7 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { // Get opcode and regclass for the given zero. unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; + const TargetRegisterClass *RC = nullptr; switch (VT.SimpleTy) { default: return 0; case MVT::f32: @@ -2558,7 +2558,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, MachineInstr *Result = XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment); - if (Result == 0) return false; + if (!Result) return false; FuncInfo.MBB->insert(FuncInfo.InsertPt, Result); MI->eraseFromParent(); diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index c2c234b..6c5b86f 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-fixup-LEAs" #include "X86.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" @@ -28,6 +27,8 @@ #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; +#define DEBUG_TYPE "x86-fixup-LEAs" + STATISTIC(NumLEAs, "Number of LEA instructions created"); namespace { @@ -56,6 +57,11 @@ namespace { void processInstruction(MachineBasicBlock::iterator& I, MachineFunction::iterator MFI); + /// \brief Given a LEA instruction which is unprofitable + /// on Silvermont try to replace it with an equivalent ADD instruction + void processInstructionForSLM(MachineBasicBlock::iterator& I, + MachineFunction::iterator MFI); + /// \brief Determine if an instruction references a machine register /// and, if so, whether it reads or writes the register. RegUsageState usesRegister(MachineOperand& p, @@ -85,7 +91,7 @@ namespace { private: MachineFunction *MF; const TargetMachine *TM; - const TargetInstrInfo *TII; // Machine instruction info. + const X86InstrInfo *TII; // Machine instruction info. }; char FixupLEAPass::ID = 0; @@ -97,7 +103,7 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, MachineInstr* MI = MBBI; MachineInstr* NewMI; switch (MI->getOpcode()) { - case X86::MOV32rr: + case X86::MOV32rr: case X86::MOV64rr: { const MachineOperand& Src = MI->getOperand(1); const MachineOperand& Dest = MI->getOperand(0); @@ -123,7 +129,7 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, if (!MI->getOperand(2).isImm()) { // convertToThreeAddress will call getImm() // which requires isImm() to be true - return 0; + return nullptr; } break; case X86::ADD16rr: @@ -132,10 +138,10 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, // if src1 != src2, then convertToThreeAddress will // need to create a Virtual register, which we cannot do // after register allocation. 
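The processInstructionForSLM routine declared above rewrites LEAs that are unprofitable on Silvermont into ADDs. A free-standing sketch of the legality conditions it checks (simplified types, not the MachineInstr API): the destination must alias one of the sources, the scale must be at most 1, and EFLAGS must be safe to clobber, since ADD defines flags while LEA does not.

struct LeaOperands {
  unsigned Dst, Base, Index;   // register numbers, 0 == no register
  int Scale;
  long Disp;
};

bool canRewriteAsAdd(const LeaOperands &L, bool EFlagsDead) {
  if (!EFlagsDead)             // ADD writes EFLAGS, LEA does not
    return false;
  if (L.Scale > 1)             // ADD cannot scale the index
    return false;
  // ADD is two-address: the result must overwrite one of its inputs.
  return (L.Base  != 0 && L.Base  == L.Dst) ||
         (L.Index != 0 && L.Index == L.Dst);
}

// lea eax, [eax + ecx]      -> add eax, ecx
// lea eax, [eax + ecx + 8]  -> add eax, ecx ; add eax, 8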
- return 0; + return nullptr; } } - return TII->convertToThreeAddress(MFI, MBBI, 0); + return TII->convertToThreeAddress(MFI, MBBI, nullptr); } FunctionPass *llvm::createX86FixupLEAs() { @@ -143,9 +149,12 @@ FunctionPass *llvm::createX86FixupLEAs() { } bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { - MF = &Func; - TM = &MF->getTarget(); - TII = TM->getInstrInfo(); + TM = &Func.getTarget(); + const X86Subtarget &ST = TM->getSubtarget(); + if (!ST.LEAusesAG() && !ST.slowLEA()) + return false; + + TII = static_cast(TM->getInstrInfo()); DEBUG(dbgs() << "Start X86FixupLEAs\n";); // Process all basic blocks. @@ -211,7 +220,7 @@ MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p, InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst); Found = getPreviousInstr(CurInst, MFI); } - return 0; + return nullptr; } void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I, @@ -242,9 +251,9 @@ void FixupLEAPass::seekLEAFixup(MachineOperand& p, MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI); if (NewMI) { ++NumLEAs; - DEBUG(dbgs() << "Candidate to replace:"; MBI->dump();); + DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump();); // now to replace with an equivalent LEA... - DEBUG(dbgs() << "Replaced by: "; NewMI->dump();); + DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump();); MFI->erase(MBI); MachineBasicBlock::iterator J = static_cast (NewMI); @@ -253,10 +262,80 @@ void FixupLEAPass::seekLEAFixup(MachineOperand& p, } } +void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { + MachineInstr *MI = I; + const int opcode = MI->getOpcode(); + if (opcode != X86::LEA16r && opcode != X86::LEA32r && opcode != X86::LEA64r && + opcode != X86::LEA64_32r) + return; + if (MI->getOperand(5).getReg() != 0 || !MI->getOperand(4).isImm() || + !TII->isSafeToClobberEFLAGS(*MFI, I)) + return; + const unsigned DstR = MI->getOperand(0).getReg(); + const unsigned SrcR1 = MI->getOperand(1).getReg(); + const unsigned SrcR2 = MI->getOperand(3).getReg(); + if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR)) + return; + if (MI->getOperand(2).getImm() > 1) + return; + int addrr_opcode, addri_opcode; + switch (opcode) { + case X86::LEA16r: + addrr_opcode = X86::ADD16rr; + addri_opcode = X86::ADD16ri; + break; + case X86::LEA32r: + addrr_opcode = X86::ADD32rr; + addri_opcode = X86::ADD32ri; + break; + case X86::LEA64_32r: + case X86::LEA64r: + addrr_opcode = X86::ADD64rr; + addri_opcode = X86::ADD64ri32; + break; + default: + assert(false && "Unexpected LEA instruction"); + } + DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump();); + DEBUG(dbgs() << "FixLEA: Replaced by: ";); + MachineInstr *NewMI = 0; + const MachineOperand &Dst = MI->getOperand(0); + // Make ADD instruction for two registers writing to LEA's destination + if (SrcR1 != 0 && SrcR2 != 0) { + const MachineOperand &Src1 = MI->getOperand(SrcR1 == DstR ? 1 : 3); + const MachineOperand &Src2 = MI->getOperand(SrcR1 == DstR ? 3 : 1); + NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addrr_opcode)) + .addOperand(Dst) + .addOperand(Src1) + .addOperand(Src2); + MFI->insert(I, NewMI); + DEBUG(NewMI->dump();); + } + // Make ADD instruction for immediate + if (MI->getOperand(4).getImm() != 0) { + const MachineOperand &SrcR = MI->getOperand(SrcR1 == DstR ? 
1 : 3); + NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addri_opcode)) + .addOperand(Dst) + .addOperand(SrcR) + .addImm(MI->getOperand(4).getImm()); + MFI->insert(I, NewMI); + DEBUG(NewMI->dump();); + } + if (NewMI) { + MFI->erase(I); + I = static_cast(NewMI); + } +} + bool FixupLEAPass::processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI) { - for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) - processInstruction(I, MFI); + for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) { + if (TM->getSubtarget().isSLM()) + processInstructionForSLM(I, MFI); + else + processInstruction(I, MFI); + } return false; } diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 7955ade..c8a3ab3 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -23,7 +23,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-codegen" #include "X86.h" #include "X86InstrInfo.h" #include "llvm/ADT/DepthFirstIterator.h" @@ -45,6 +44,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "x86-codegen" + STATISTIC(NumFXCH, "Number of fxch instructions inserted"); STATISTIC(NumFP , "Number of floating point instructions"); @@ -430,7 +431,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { if (FPInstClass == X86II::NotFP) continue; // Efficiently ignore non-fp insts! - MachineInstr *PrevMI = 0; + MachineInstr *PrevMI = nullptr; if (I != BB.begin()) PrevMI = std::prev(I); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index f0ad4d1..4c1374f 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -182,7 +182,7 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, } } - MachineInstr *MI = NULL; + MachineInstr *MI = nullptr; if (UseLEA) { MI = addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), @@ -204,7 +204,7 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, /// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator. static void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, uint64_t *NumBytes = NULL) { + unsigned StackPtr, uint64_t *NumBytes = nullptr) { if (MBBI == MBB.begin()) return; MachineBasicBlock::iterator PI = std::prev(MBBI); @@ -225,11 +225,12 @@ void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, } } -/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower iterator. +/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower +/// iterator. static void mergeSPUpdatesDown(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, uint64_t *NumBytes = NULL) { + unsigned StackPtr, uint64_t *NumBytes = nullptr) { // FIXME: THIS ISN'T RUN!!! return; @@ -257,19 +258,19 @@ void mergeSPUpdatesDown(MachineBasicBlock &MBB, } /// mergeSPUpdates - Checks the instruction before/after the passed -/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and the -/// stack adjustment is returned as a positive value for ADD/LEA and a negative for -/// SUB. +/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and +/// the stack adjustment is returned as a positive value for ADD/LEA and a +/// negative for SUB. 
static int mergeSPUpdates(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, - bool doMergeWithPrevious) { + MachineBasicBlock::iterator &MBBI, unsigned StackPtr, + bool doMergeWithPrevious) { if ((doMergeWithPrevious && MBBI == MBB.begin()) || (!doMergeWithPrevious && MBBI == MBB.end())) return 0; MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI; - MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : std::next(MBBI); + MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr + : std::next(MBBI); unsigned Opc = PI->getOpcode(); int Offset = 0; @@ -366,8 +367,10 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); unsigned CFIIndex = - MMI.addFrameInst(MCCFIInstruction::createOffset(0, DwarfReg, Offset)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, + Offset)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); } } @@ -446,7 +449,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { !MFI->adjustsStack() && // No calls. !IsWin64 && // Win64 has no Red Zone !usesTheStack(MF) && // Don't push and pop. - !MF.getTarget().Options.EnableSegmentedStacks) { // Regular stack + !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); @@ -511,15 +514,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Define the current CFA rule to use the provided offset. assert(StackSize); unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaOffset(0, 2 * stackGrowth)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); // Change the rule for the FramePtr to be an "offset" rule. unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(0, DwarfFramePtr, 2 * stackGrowth)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + MCCFIInstruction::createOffset(nullptr, + DwarfFramePtr, 2 * stackGrowth)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } @@ -534,8 +538,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Define the current CFA to use the EBP/RBP register. unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaRegister(0, DwarfFramePtr)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } @@ -564,7 +568,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { assert(StackSize); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); StackOffset += stackGrowth; } @@ -698,9 +702,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Define the current CFA rule to use the provided offset. 
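mergeSPUpdates, reindented above, folds a neighbouring stack-pointer adjustment into the one being emitted. A simplified free-standing model (not the LLVM signature) of the sign convention its comment describes: ADD/LEA adjustments count positive, SUB negative, and the merged instruction is erased.

#include <cstddef>
#include <vector>

struct SPInst { enum Kind { AddSP, SubSP, Other } K; long Imm; };

// Fold the instruction before position I into a single offset, if it is a
// stack-pointer ADD or SUB; returns 0 when there is nothing to merge.
long mergeSPUpdateBefore(std::vector<SPInst> &Block, std::size_t I) {
  if (I == 0)
    return 0;
  const SPInst Prev = Block[I - 1];
  long Offset = 0;
  if (Prev.K == SPInst::AddSP)      Offset = +Prev.Imm;  // ADD/LEA: positive
  else if (Prev.K == SPInst::SubSP) Offset = -Prev.Imm;  // SUB: negative
  else
    return 0;
  Block.erase(Block.begin() + static_cast<std::ptrdiff_t>(I - 1));
  return Offset;
}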
assert(StackSize); unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaOffset(0, -StackSize + stackGrowth)); + MCCFIInstruction::createDefCfaOffset(nullptr, + -StackSize + stackGrowth)); - BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)) + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } @@ -905,7 +910,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } } -int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { +int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { const X86RegisterInfo *RegInfo = static_cast(MF.getTarget().getRegisterInfo()); const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -1170,6 +1176,15 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { !STI.isTargetWin32() && !STI.isTargetWin64() && !STI.isTargetFreeBSD()) report_fatal_error("Segmented stacks not supported on this platform."); + // Eventually StackSize will be calculated by a link-time pass; which will + // also decide whether checking code needs to be injected into this particular + // prologue. + StackSize = MFI->getStackSize(); + + // Do not generate a prologue for functions with a stack of size zero + if (StackSize == 0) + return; + MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock(); X86MachineFunctionInfo *X86FI = MF.getInfo(); @@ -1194,11 +1209,6 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MF.push_front(allocMBB); MF.push_front(checkMBB); - // Eventually StackSize will be calculated by a link-time pass; which will - // also decide whether checking code needs to be injected into this particular - // prologue. - StackSize = MFI->getStackSize(); - // When the frame size is less than 256 we just compare the stack // boundary directly to the value of the stack pointer, per gcc. bool CompareStackPointer = StackSize < kSplitStackAvailable; @@ -1256,22 +1266,23 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else if (STI.isTargetDarwin()) { - // TlsOffset doesn't fit into a mod r/m byte so we need an extra register + // TlsOffset doesn't fit into a mod r/m byte so we need an extra register. unsigned ScratchReg2; bool SaveScratch2; if (CompareStackPointer) { - // The primary scratch register is available for holding the TLS offset + // The primary scratch register is available for holding the TLS offset. ScratchReg2 = GetScratchRegister(Is64Bit, MF, true); SaveScratch2 = false; } else { // Need to use a second register to hold the TLS offset ScratchReg2 = GetScratchRegister(Is64Bit, MF, false); - // Unfortunately, with fastcc the second scratch register may hold an arg + // Unfortunately, with fastcc the second scratch register may hold an + // argument. SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2); } - // If Scratch2 is live-in then it needs to be saved + // If Scratch2 is live-in then it needs to be saved. 
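The hunk above hoists the StackSize computation so that the zero-size early return happens before any new basic blocks are created. A hedged sketch of the resulting control flow in adjustForSegmentedStacks (placeholder signature, details elided):

void adjustForSegmentedStacks(unsigned StackSize) {
  // Computed up front now; a link-time pass may eventually take this over.
  if (StackSize == 0)
    return;               // empty frame: emit no check or alloc blocks

  const unsigned kSplitStackAvailable = 256;
  // For small frames, compare the stack limit against SP directly (per
  // gcc); larger frames compare against SP - StackSize via a scratch reg.
  bool CompareStackPointer = StackSize < kSplitStackAvailable;
  (void)CompareStackPointer;
  // ... create checkMBB/allocMBB and emit the limit test ...
}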
assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) && "Scratch register is live-in and not saved"); @@ -1348,14 +1359,14 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { /// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf) /// /// CheckStack: -/// temp0 = sp - MaxStack -/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart +/// temp0 = sp - MaxStack +/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart /// OldStart: -/// ... +/// ... /// IncStack: -/// call inc_stack # doubles the stack space -/// temp0 = sp - MaxStack -/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart +/// call inc_stack # doubles the stack space +/// temp0 = sp - MaxStack +/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { const X86InstrInfo &TII = *TM.getInstrInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -1514,7 +1525,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; - MachineInstr *New = 0; + MachineInstr *New = nullptr; if (Opcode == TII.getCallFrameSetupOpcode()) { New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr) diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index f0db8cb..208bb8b 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -47,7 +47,7 @@ public: void adjustForHiPEPrologue(MachineFunction &MF) const override; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = NULL) const override; + RegScavenger *RS = nullptr) const override; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 3e45adb..74386d3 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-isel" #include "X86.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" @@ -36,6 +35,8 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; +#define DEBUG_TYPE "x86-isel" + STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); //===----------------------------------------------------------------------===// @@ -70,17 +71,18 @@ namespace { X86ISelAddressMode() : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), - Segment(), GV(0), CP(0), BlockAddr(0), ES(0), JT(-1), Align(0), - SymbolFlags(X86II::MO_NO_FLAG) { + Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr), + JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) { } bool hasSymbolicDisplacement() const { - return GV != 0 || CP != 0 || ES != 0 || JT != -1 || BlockAddr != 0; + return GV != nullptr || CP != nullptr || ES != nullptr || + JT != -1 || BlockAddr != nullptr; } bool hasBaseOrIndexReg() const { return BaseType == FrameIndexBase || - IndexReg.getNode() != 0 || Base_Reg.getNode() != 0; + IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; } /// isRIPRelative - Return true if this addressing mode is already RIP @@ -102,14 +104,14 @@ namespace { void dump() { dbgs() << "X86ISelAddressMode " << this << '\n'; dbgs() << "Base_Reg "; - if (Base_Reg.getNode() != 0) + if 
(Base_Reg.getNode()) Base_Reg.getNode()->dump(); else dbgs() << "nul"; dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n' << " Scale" << Scale << '\n' << "IndexReg "; - if (IndexReg.getNode() != 0) + if (IndexReg.getNode()) IndexReg.getNode()->dump(); else dbgs() << "nul"; @@ -160,6 +162,13 @@ namespace { return "X86 DAG->DAG Instruction Selection"; } + bool runOnMachineFunction(MachineFunction &MF) override { + // Reset the subtarget each time through. + Subtarget = &TM.getSubtarget(); + SelectionDAGISel::runOnMachineFunction(MF); + return true; + } + void EmitFunctionEntryCode() override; bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; @@ -374,14 +383,13 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, else Ops.push_back(Chain.getOperand(i)); SDValue NewChain = - CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), - MVT::Other, &Ops[0], Ops.size()); + CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops); Ops.clear(); Ops.push_back(NewChain); } for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i) Ops.push_back(OrigChain.getOperand(i)); - CurDAG->UpdateNodeOperands(OrigChain.getNode(), &Ops[0], Ops.size()); + CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops); CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), Load.getOperand(1), Load.getOperand(2)); @@ -390,7 +398,7 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, Ops.push_back(SDValue(Load.getNode(), 1)); for (unsigned i = 1, e = NumOps; i != e; ++i) Ops.push_back(Call.getOperand(i)); - CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], NumOps); + CurDAG->UpdateNodeOperands(Call.getNode(), Ops); } /// isCalleeLoad - Return true if call address is a load and it can be @@ -612,7 +620,7 @@ bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ // gs:0 (or fs:0 on X86-64) contains its own address. // For more information see http://people.redhat.com/drepper/tls.pdf if (ConstantSDNode *C = dyn_cast(Address)) - if (C->getSExtValue() == 0 && AM.Segment.getNode() == 0 && + if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr && Subtarget->isTargetLinux()) switch (N->getPointerInfo().getAddrSpace()) { case 256: @@ -733,7 +741,7 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { // a smaller encoding and avoids a scaled-index. 
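The new runOnMachineFunction override above re-reads the subtarget every time the selector runs, instead of caching it once at construction, so the pass stays correct if the target is reconfigured between functions. A minimal stand-alone model of the pattern (all types are placeholders):

struct X86SubtargetLike { bool Is64Bit; };

struct MachineFunctionLike {
  X86SubtargetLike ST;
  const X86SubtargetLike &getSubtarget() const { return ST; }
};

class DAGISelLike {
  const X86SubtargetLike *Subtarget = nullptr;
public:
  bool runOnMachineFunction(MachineFunctionLike &MF) {
    Subtarget = &MF.getSubtarget();  // reset the cache each time through
    // ... run instruction selection using Subtarget ...
    return true;
  }
};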
if (AM.Scale == 2 && AM.BaseType == X86ISelAddressMode::RegBase && - AM.Base_Reg.getNode() == 0) { + AM.Base_Reg.getNode() == nullptr) { AM.Base_Reg = AM.IndexReg; AM.Scale = 1; } @@ -745,8 +753,8 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { Subtarget->is64Bit() && AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase && - AM.Base_Reg.getNode() == 0 && - AM.IndexReg.getNode() == 0 && + AM.Base_Reg.getNode() == nullptr && + AM.IndexReg.getNode() == nullptr && AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); @@ -926,7 +934,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, APInt MaskedHighBits = APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ); APInt KnownZero, KnownOne; - DAG.ComputeMaskedBits(X, KnownZero, KnownOne); + DAG.computeKnownBits(X, KnownZero, KnownOne); if (MaskedHighBits != KnownZero) return true; // We've identified a pattern that can be transformed into a single shift @@ -1009,7 +1017,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, case ISD::FrameIndex: if (AM.BaseType == X86ISelAddressMode::RegBase && - AM.Base_Reg.getNode() == 0 && + AM.Base_Reg.getNode() == nullptr && (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { AM.BaseType = X86ISelAddressMode::FrameIndexBase; AM.Base_FrameIndex = cast(N)->getIndex(); @@ -1018,7 +1026,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, break; case ISD::SHL: - if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) + if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; if (ConstantSDNode @@ -1052,7 +1060,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, case ISD::SRL: { // Scale must not be used already. - if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break; + if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; SDValue And = N.getOperand(0); if (And.getOpcode() != ISD::AND) break; @@ -1086,8 +1094,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, case X86ISD::MUL_IMM: // X*[3,5,9] -> X+X*[2,4,8] if (AM.BaseType == X86ISelAddressMode::RegBase && - AM.Base_Reg.getNode() == 0 && - AM.IndexReg.getNode() == 0) { + AM.Base_Reg.getNode() == nullptr && + AM.IndexReg.getNode() == nullptr) { if (ConstantSDNode *CN = dyn_cast(N.getNode()->getOperand(1))) if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || @@ -1237,7 +1245,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // with a constant to enable use of the scaled offset field. // Scale must not be used already. - if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break; + if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; SDValue Shift = N.getOperand(0); if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break; @@ -1276,7 +1284,7 @@ bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) { // Is the base register already occupied? if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { // If so, check to see if the scale index register is set. 
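The MUL_IMM case above folds X*3, X*5 and X*9 into base-plus-scaled-index form so a single LEA computes the product. The identity it relies on, checked by a small standalone program: X*K == X + (K-1)*X, where K-1 is a legal LEA scale (2, 4 or 8).

#include <cassert>

unsigned mulViaLea(unsigned X, unsigned K) {
  // Models lea r, [X + (K-1)*X]; valid only for K in {3, 5, 9}.
  assert(K == 3 || K == 5 || K == 9);
  return X + (K - 1) * X;
}

int main() {
  for (unsigned X = 0; X < 1000; ++X) {
    assert(mulViaLea(X, 3) == 3 * X);
    assert(mulViaLea(X, 5) == 5 * X);
    assert(mulViaLea(X, 9) == 9 * X);
  }
  return 0;
}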
- if (AM.IndexReg.getNode() == 0) { + if (!AM.IndexReg.getNode()) { AM.IndexReg = N; AM.Scale = 1; return false; @@ -1567,7 +1575,7 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) - return NULL; + return nullptr; MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast(Node)->getMemOperand(); const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain}; @@ -1756,7 +1764,7 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { if (Node->hasAnyUseOfValue(0)) - return 0; + return nullptr; SDLoc dl(Node); @@ -1768,13 +1776,13 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { SDValue Val = Node->getOperand(2); SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) - return 0; + return nullptr; // Which index into the table. enum AtomicOpc Op; switch (Node->getOpcode()) { default: - return 0; + return nullptr; case ISD::ATOMIC_LOAD_OR: Op = OR; break; @@ -1795,7 +1803,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { unsigned Opc = 0; switch (NVT.SimpleTy) { - default: return 0; + default: return nullptr; case MVT::i8: if (isCN) Opc = AtomicOpcTbl[Op][ConstantI8]; @@ -1847,7 +1855,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { } cast(Ret)->setMemRefs(MemOp, MemOp + 1); SDValue RetVals[] = { Undef, Ret }; - return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); + return CurDAG->getMergeValues(RetVals, dl).getNode(); } /// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has @@ -1990,7 +1998,7 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, // Make a new TokenFactor with all the other input chains except // for the load. InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), - MVT::Other, &ChainOps[0], ChainOps.size()); + MVT::Other, ChainOps); } if (!ChainCheck) return false; @@ -2027,7 +2035,7 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { SDValue VMask = Node->getOperand(5); ConstantSDNode *Scale = dyn_cast(Node->getOperand(6)); if (!Scale) - return 0; + return nullptr; SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(), MVT::Other); @@ -2058,7 +2066,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if (Node->isMachineOpcode()) { DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); Node->setNodeId(-1); - return NULL; // Already selected. + return nullptr; // Already selected. } switch (Opcode) { @@ -2108,7 +2116,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDNode *RetVal = SelectGather(Node, Opc); if (RetVal) // We already called ReplaceUses inside SelectGather. - return NULL; + return nullptr; break; } } @@ -2259,7 +2267,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1)); ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2)); - return NULL; + return nullptr; } case ISD::SMUL_LOHI: @@ -2386,7 +2394,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } // Copy the low half of the result, if it is needed. 
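Several call sites in this file shrink because the SelectionDAG entry points now take an ArrayRef instead of a (pointer, count) pair, e.g. getMergeValues(RetVals, dl) above. A simplified stand-in for the idiom (not the llvm::ArrayRef implementation) showing why callers can drop the explicit &Ops[0], Ops.size() arguments:

#include <cstddef>
#include <vector>

template <typename T> class ArrayRefLike {
  const T *Data;
  std::size_t Len;
public:
  ArrayRefLike(const std::vector<T> &V) : Data(V.data()), Len(V.size()) {}
  template <std::size_t N>
  ArrayRefLike(const T (&A)[N]) : Data(A), Len(N) {}
  std::size_t size() const { return Len; }
  const T &operator[](std::size_t I) const { return Data[I]; }
};

// One signature serves vectors and C arrays alike.
int sum(ArrayRefLike<int> Ops) {
  int S = 0;
  for (std::size_t I = 0; I != Ops.size(); ++I)
    S += Ops[I];
  return S;
}

int main() {
  std::vector<int> V{1, 2, 3};
  int A[] = {4, 5};
  return sum(V) + sum(A) == 15 ? 0 : 1;
}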
if (!SDValue(Node, 0).use_empty()) { - if (ResLo.getNode() == 0) { + if (!ResLo.getNode()) { assert(LoReg && "Register for low half is not defined!"); ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT, InFlag); @@ -2397,7 +2405,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } // Copy the high half of the result, if it is needed. if (!SDValue(Node, 1).use_empty()) { - if (ResHi.getNode() == 0) { + if (!ResHi.getNode()) { assert(HiReg && "Register for high half is not defined!"); ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT, InFlag); @@ -2407,7 +2415,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); } - return NULL; + return nullptr; } case ISD::SDIVREM: @@ -2575,7 +2583,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { ReplaceUses(SDValue(Node, 1), Result); DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } - return NULL; + return nullptr; } case X86ISD::CMP: @@ -2632,7 +2640,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // one, do not call ReplaceAllUsesWith. ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), SDValue(NewNode, 0)); - return NULL; + return nullptr; } // For example, "testl %eax, $2048" to "testb %ah, $8". @@ -2669,7 +2677,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // one, do not call ReplaceAllUsesWith. ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), SDValue(NewNode, 0)); - return NULL; + return nullptr; } // For example, "testl %eax, $32776" to "testw %ax, $32776". @@ -2691,7 +2699,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // one, do not call ReplaceAllUsesWith. ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), SDValue(NewNode, 0)); - return NULL; + return nullptr; } // For example, "testq %rax, $268468232" to "testl %eax, $268468232". @@ -2713,7 +2721,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // one, do not call ReplaceAllUsesWith. ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), SDValue(NewNode, 0)); - return NULL; + return nullptr; } } break; @@ -2740,7 +2748,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue StoredVal = StoreNode->getOperand(1); unsigned Opc = StoredVal->getOpcode(); - LoadSDNode *LoadNode = 0; + LoadSDNode *LoadNode = nullptr; SDValue InputChain; if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG, LoadNode, InputChain)) @@ -2772,7 +2780,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDNode *ResNode = SelectCode(Node); DEBUG(dbgs() << "=> "; - if (ResNode == NULL || ResNode == Node) + if (ResNode == nullptr || ResNode == Node) Node->dump(CurDAG); else ResNode->dump(CurDAG); @@ -2790,7 +2798,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, case 'v': // not offsetable ?? 
default: return true; case 'm': // memory - if (!SelectAddr(0, Op, Op0, Op1, Op2, Op3, Op4)) + if (!SelectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) return true; break; } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2a35061..cbaf44e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-isel" #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86CallingConv.h" @@ -23,6 +22,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/VariadicFunction.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -52,6 +52,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "x86-isel" + STATISTIC(NumTailCalls, "Number of tail calls"); // Forward declarations. @@ -84,7 +86,8 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk); + makeArrayRef(Vec->op_begin()+NormalizedIdxVal, + ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, @@ -265,10 +268,10 @@ void X86TargetLowering::resetOperationActions() { // The _ftol2 runtime function has an unusual calling conv, which // is modeled by a special pseudo-instruction. - setLibcallName(RTLIB::FPTOUINT_F64_I64, 0); - setLibcallName(RTLIB::FPTOUINT_F32_I64, 0); - setLibcallName(RTLIB::FPTOUINT_F64_I32, 0); - setLibcallName(RTLIB::FPTOUINT_F32_I32, 0); + setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr); + setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr); + setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr); + setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr); } if (Subtarget->isTargetDarwin()) { @@ -635,15 +638,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - if (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? - MVT::i64 : MVT::i32, Custom); - else if (TM.Options.EnableSegmentedStacks) - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? - MVT::i64 : MVT::i32, Custom); - else - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? - MVT::i64 : MVT::i32, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + MVT::i64 : MVT::i32, Custom); if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. 
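The collapsed setOperationAction above makes DYNAMIC_STACKALLOC unconditionally Custom; the old three-way Expand/Custom split moves into the lowering hook, which can consult the function as well as the target. A hedged sketch of that dispatch (names are placeholders, not the X86 lowering code):

enum class AllocaLowering {
  PlainSPAdjust,   // the old Expand behaviour: just move the stack pointer
  WindowsProbe,    // Windows, non-MachO: probe pages via the stack-probe helper
  SegmentedStack   // split-stack functions: call the runtime to grow the stack
};

AllocaLowering classifyDynAlloca(bool IsWindowsNonMachO, bool SplitStack) {
  if (SplitStack)
    return AllocaLowering::SegmentedStack;
  if (IsWindowsNonMachO)
    return AllocaLowering::WindowsProbe;
  return AllocaLowering::PlainSPAdjust;
}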
@@ -832,7 +828,9 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); @@ -944,6 +942,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::ADD, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); + setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); + setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); + setOperationAction(ISD::MULHU, MVT::v8i16, Legal); + setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v16i8, Legal); setOperationAction(ISD::SUB, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v4i32, Legal); @@ -1036,6 +1038,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); + + setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); + setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); + setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) { @@ -1064,11 +1070,14 @@ void X86TargetLowering::resetOperationActions() { // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); - setOperationAction(ISD::VSELECT, MVT::v2i64, Legal); + setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); + setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); + setOperationAction(ISD::VSELECT, MVT::v4i32, Custom); + setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); + setOperationAction(ISD::VSELECT, MVT::v8i16, Custom); + // There is no BLENDI for byte vectors. We don't need to custom lower + // some vselects for now. setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); - setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); // i8 and i16 vectors are custom , because the source register and source // source memory operand types are not the same width. 
f32 vectors are @@ -1111,9 +1120,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SHL, MVT::v4i32, Custom); setOperationAction(ISD::SRA, MVT::v4i32, Custom); - - setOperationAction(ISD::SDIV, MVT::v8i16, Custom); - setOperationAction(ISD::SDIV, MVT::v4i32, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) { @@ -1178,8 +1184,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SRA, MVT::v16i16, Custom); setOperationAction(ISD::SRA, MVT::v32i8, Custom); - setOperationAction(ISD::SDIV, MVT::v16i16, Custom); - setOperationAction(ISD::SETCC, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v16i16, Custom); setOperationAction(ISD::SETCC, MVT::v8i32, Custom); @@ -1189,10 +1193,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); - setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); - setOperationAction(ISD::VSELECT, MVT::v4i64, Legal); - setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); + setOperationAction(ISD::VSELECT, MVT::v4f64, Custom); + setOperationAction(ISD::VSELECT, MVT::v4i64, Custom); + setOperationAction(ISD::VSELECT, MVT::v8i32, Custom); + setOperationAction(ISD::VSELECT, MVT::v8f32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); @@ -1232,9 +1236,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::MUL, MVT::v16i16, Legal); // Don't lower v32i8 because there is no 128-bit byte mul - setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); + setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); + setOperationAction(ISD::MULHU, MVT::v16i16, Legal); + setOperationAction(ISD::MULHS, MVT::v16i16, Legal); - setOperationAction(ISD::SDIV, MVT::v8i32, Custom); + setOperationAction(ISD::VSELECT, MVT::v16i16, Custom); + setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); @@ -1343,7 +1351,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FNEG, MVT::v8f64, Custom); setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); - setOperationAction(ISD::SDIV, MVT::v16i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); @@ -1358,9 +1365,11 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); @@ -1392,6 +1401,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, 
MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); @@ -1474,6 +1485,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + if (!Subtarget->is64Bit()) + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. @@ -1498,9 +1511,9 @@ void X86TargetLowering::resetOperationActions() { if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. - setLibcallName(RTLIB::SHL_I128, 0); - setLibcallName(RTLIB::SRL_I128, 0); - setLibcallName(RTLIB::SRA_I128, 0); + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); } // Combine sin / cos into one node or libcall if possible. @@ -1516,6 +1529,15 @@ void X86TargetLowering::resetOperationActions() { } } + if (Subtarget->isTargetWin64()) { + setOperationAction(ISD::SDIV, MVT::i128, Custom); + setOperationAction(ISD::UDIV, MVT::i128, Custom); + setOperationAction(ISD::SREM, MVT::i128, Custom); + setOperationAction(ISD::UREM, MVT::i128, Custom); + setOperationAction(ISD::SDIVREM, MVT::i128, Custom); + setOperationAction(ISD::UDIVREM, MVT::i128, Custom); + } + // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); @@ -1540,6 +1562,7 @@ void X86TargetLowering::resetOperationActions() { setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); @@ -1738,7 +1761,7 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, // FIXME: Why this routine is here? Move to RegInfo! std::pair X86TargetLowering::findRepresentativeClass(MVT VT) const{ - const TargetRegisterClass *RRC = 0; + const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: @@ -1806,8 +1829,8 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, return CCInfo.CheckReturn(Outs, RetCC_X86); } -const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { - static const uint16_t ScratchRegs[] = { X86::R11, 0 }; +const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { + static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; return ScratchRegs; } @@ -1930,8 +1953,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(X86ISD::RET_FLAG, dl, - MVT::Other, &RetOps[0], RetOps.size()); + return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { @@ -2285,22 +2307,25 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, InVals.push_back(ArgValue); } - // The x86-64 ABIs require that for returning structs by value we copy - // the sret argument into %rax/%eax (depending on ABI) for the return. - // Win32 requires us to put the sret argument to %eax as well. 
- // Save the argument into a virtual register so that we can access it - // from the return points. - if (MF.getFunction()->hasStructRetAttr() && - (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { - X86MachineFunctionInfo *FuncInfo = MF.getInfo(); - unsigned Reg = FuncInfo->getSRetReturnReg(); - if (!Reg) { - MVT PtrTy = getPointerTy(); - Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); - FuncInfo->setSRetReturnReg(Reg); + if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) { + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + // The x86-64 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // Win32 requires us to put the sret argument to %eax as well. + // Save the argument into a virtual register so that we can access it + // from the return points. + if (Ins[i].Flags.isSRet()) { + unsigned Reg = FuncInfo->getSRetReturnReg(); + if (!Reg) { + MVT PtrTy = getPointerTy(); + Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); + FuncInfo->setSRetReturnReg(Reg); + } + SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); + break; + } } - SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); } unsigned StackSize = CCInfo.getNextStackOffset(); @@ -2320,17 +2345,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; // FIXME: We should really autogenerate these arrays - static const uint16_t GPR64ArgRegsWin64[] = { + static const MCPhysReg GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; - static const uint16_t GPR64ArgRegs64Bit[] = { + static const MCPhysReg GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; - static const uint16_t XMMArgRegs64Bit[] = { + static const MCPhysReg XMMArgRegs64Bit[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; - const uint16_t *GPR64ArgRegs; + const MCPhysReg *GPR64ArgRegs; unsigned NumXMMRegs = 0; if (IsWin64) { @@ -2424,13 +2449,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SaveXMMOps.push_back(Val); } MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, - MVT::Other, - &SaveXMMOps[0], SaveXMMOps.size())); + MVT::Other, SaveXMMOps)); } if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOps[0], MemOps.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } } @@ -2497,10 +2520,10 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). -static SDValue -EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, - SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, - unsigned SlotSize, int FPDiff, SDLoc dl) { +static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, + SDValue Chain, SDValue RetAddrFrIdx, + EVT PtrVT, unsigned SlotSize, + int FPDiff, SDLoc dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. 
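The rewritten block above stops assuming the sret pointer is the first formal argument (the old InVals[0]) and instead scans the formals for the one whose flags carry isSRet(), since other arguments can precede it. A simplified model of the scan (stand-in types):

#include <vector>

struct FormalArg {
  bool IsSRet;     // set when this argument carries the sret attribute
  unsigned VReg;   // virtual register holding the incoming value
};

// Returns the vreg to copy into %eax/%rax at returns, or 0 if none.
unsigned findSRetReturnReg(const std::vector<FormalArg> &Ins) {
  for (const FormalArg &A : Ins)
    if (A.IsSRet)
      return A.VReg;
  return 0;
}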
@@ -2537,7 +2560,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (MF.getTarget().Options.DisableTailCalls) isTailCall = false; - if (isTailCall) { + bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); + if (IsMustTail) { + // Force this to be a tail call. The verifier rules are enough to ensure + // that we can lower this successfully without moving the return address + // around. + isTailCall = true; + } else if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, @@ -2578,7 +2607,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; - if (isTailCall && !IsSibcall) { + if (isTailCall && !IsSibcall && !IsMustTail) { // Lower arguments at fp - stackoffset + fpdiff. X86MachineFunctionInfo *X86Info = MF.getInfo(); unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); @@ -2683,7 +2712,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); - if (StackPtr.getNode() == 0) + if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, @@ -2692,8 +2721,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); if (Subtarget->isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT @@ -2730,7 +2758,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. - static const uint16_t XMMArgRegs[] = { + static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; @@ -2742,8 +2770,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(NumXMMRegs, MVT::i8))); } - // For tail calls lower the arguments to the 'real' stack slot. - if (isTailCall) { + // For tail calls lower the arguments to the 'real' stack slots. Sibcalls + // don't need this because the eligibility check rejects calls that require + // shuffling arguments passed in memory. + if (!IsSibcall && isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments are stored to the stack, because the // outgoing stack slots may alias the incoming argument stack slots, and @@ -2755,45 +2785,45 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector MemOpChains2; SDValue FIN; int FI = 0; - if (getTargetMachine().Options.GuaranteedTailCallOpt) { - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - if (VA.isRegLoc()) - continue; - assert(VA.isMemLoc()); - SDValue Arg = OutVals[i]; - ISD::ArgFlagsTy Flags = Outs[i].Flags; - // Create frame index. - int32_t Offset = VA.getLocMemOffset()+FPDiff; - uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; - FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); - FIN = DAG.getFrameIndex(FI, getPointerTy()); - - if (Flags.isByVal()) { - // Copy relative to framepointer. 
- SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); - if (StackPtr.getNode() == 0) - StackPtr = DAG.getCopyFromReg(Chain, dl, - RegInfo->getStackRegister(), - getPointerTy()); - Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); - - MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, - ArgChain, - Flags, DAG, dl)); - } else { - // Store relative to framepointer. - MemOpChains2.push_back( - DAG.getStore(ArgChain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); - } + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (VA.isRegLoc()) + continue; + assert(VA.isMemLoc()); + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + // Skip inalloca arguments. They don't require any work. + if (Flags.isInAlloca()) + continue; + // Create frame index. + int32_t Offset = VA.getLocMemOffset()+FPDiff; + uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; + FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + FIN = DAG.getFrameIndex(FI, getPointerTy()); + + if (Flags.isByVal()) { + // Copy relative to framepointer. + SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); + if (!StackPtr.getNode()) + StackPtr = DAG.getCopyFromReg(Chain, dl, + RegInfo->getStackRegister(), + getPointerTy()); + Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); + + MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, + ArgChain, + Flags, DAG, dl)); + } else { + // Store relative to framepointer. + MemOpChains2.push_back( + DAG.getStore(ArgChain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(FI), + false, false, 0)); } } if (!MemOpChains2.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains2[0], MemOpChains2.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, @@ -2930,10 +2960,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. - return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); + return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); } - Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); + Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. @@ -3927,6 +3957,29 @@ static bool isMOVLHPSMask(ArrayRef Mask, MVT VT) { return true; } +/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to INSERTPS. +/// i. e: If all but one element come from the same vector. +static bool isINSERTPSMask(ArrayRef Mask, MVT VT) { + // TODO: Deal with AVX's VINSERTPS + if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32)) + return false; + + unsigned CorrectPosV1 = 0; + unsigned CorrectPosV2 = 0; + for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) + if (Mask[i] == i) + ++CorrectPosV1; + else if (Mask[i] == i + 4) + ++CorrectPosV2; + + if (CorrectPosV1 == 3 || CorrectPosV2 == 3) + // We have 3 elements from one vector, and one from another. + return true; + + return false; +} + // // Some special combinations that can be optimized. 
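A standalone model of the new isINSERTPSMask predicate above: a 4-lane shuffle is INSERTPS-shaped when exactly three lanes are identity picks from one source (Mask[i] == i for V1, Mask[i] == i+4 for V2) and the remaining lane is inserted from the other.

#include <array>
#include <cassert>

bool isInsertPSShapedMask(const std::array<int, 4> &Mask) {
  unsigned FromV1InPlace = 0, FromV2InPlace = 0;
  for (int I = 0; I != 4; ++I) {
    if (Mask[I] == I)          ++FromV1InPlace;  // lane from V1, in place
    else if (Mask[I] == I + 4) ++FromV2InPlace;  // lane from V2, in place
  }
  // Three elements from one vector, one (inserted) from the other.
  return FromV1InPlace == 3 || FromV2InPlace == 3;
}

int main() {
  assert(isInsertPSShapedMask({0, 1, 2, 7}));   // insert V2[3] into lane 3
  assert(!isInsertPSShapedMask({4, 5, 6, 7}));  // plain copy of V2, not an insert
  return 0;
}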
// @@ -4146,6 +4199,29 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef Mask, MVT VT, bool HasInt256) { return true; } +// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or +// (src1[0], src0[1]), manipulation with 256-bit sub-vectors +static bool isINSERT64x4Mask(ArrayRef Mask, MVT VT, unsigned int *Imm) { + if (!VT.is512BitVector()) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfSize = NumElts/2; + if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) { + if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) { + *Imm = 1; + return true; + } + } + if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) { + if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) { + *Imm = 0; + return true; + } + } + return false; +} + /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSS, /// MOVSD, and MOVD, i.e. setting the lowest element. @@ -4624,11 +4700,17 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 256); } +/// isZero - Returns true if Elt is a constant integer zero +static bool isZero(SDValue V) { + ConstantSDNode *C = dyn_cast(V); + return C && C->isNullValue(); +} + /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. bool X86::isZeroNode(SDValue Elt) { - if (ConstantSDNode *CN = dyn_cast(Elt)) - return CN->isNullValue(); + if (isZero(Elt)) + return true; if (ConstantFPSDNode *CFP = dyn_cast(Elt)) return CFP->getValueAPF().isPosZero(); return false; @@ -4677,7 +4759,7 @@ static bool ShouldXformToMOVHLPS(ArrayRef Mask, MVT VT) { /// isScalarLoadToVector - Returns true if the node is a scalar load that /// is promoted to a vector. It also returns the LoadSDNode by reference if /// required. -static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { +static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) { if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) return false; N = N->getOperand(0).getNode(); @@ -4803,28 +4885,24 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, if (Subtarget->hasInt256()) { // AVX2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, - array_lengthof(Ops)); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // 256-bit logic and arithmetic instructions in AVX are all // floating-point, no support for integer ops. Emit fp zeroed vectors. 
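A free-standing model of the isINSERT64x4Mask test introduced above (simplified mask representation, undef lanes encoded as -1): the 512-bit result must be one source's low 256 bits with the other source's low 256 bits inserted as the other half, and *Imm records which half was replaced.

#include <vector>

// True when Mask[Pos..Pos+Size) equals Low, Low+1, ..., allowing -1 (undef).
static bool isSequentialOrUndef(const std::vector<int> &Mask, unsigned Pos,
                                unsigned Size, int Low) {
  for (unsigned I = 0; I != Size; ++I)
    if (Mask[Pos + I] >= 0 && Mask[Pos + I] != Low + (int)I)
      return false;
  return true;
}

bool isInsert64x4Mask(const std::vector<int> &Mask, unsigned *Imm) {
  const unsigned NumElts = (unsigned)Mask.size();
  const unsigned Half = NumElts / 2;
  // Low half from V1, high half from V2's low half.
  if (isSequentialOrUndef(Mask, 0, Half, 0) &&
      isSequentialOrUndef(Mask, Half, Half, (int)NumElts)) {
    *Imm = 1;
    return true;
  }
  // Low half from V2's low half, high half from V1's high half.
  if (isSequentialOrUndef(Mask, 0, Half, (int)NumElts) &&
      isSequentialOrUndef(Mask, Half, Half, (int)Half)) {
    *Imm = 0;
    return true;
  }
  return false;
}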
SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, - array_lengthof(Ops)); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); } } else if (VT.is512BitVector()) { // AVX-512 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.getScalarType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type"); SDValue Cst = DAG.getTargetConstant(0, MVT::i1); - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, - Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, - Ops, VT.getVectorNumElements()); + SmallVector Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } else llvm_unreachable("Unexpected vector type"); @@ -4844,8 +4922,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, if (VT.is256BitVector()) { if (HasInt256) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, - array_lengthof(Ops)); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // AVX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); @@ -5307,7 +5384,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, return SDValue(); SDLoc dl(Op); - SDValue V(0, 0); + SDValue V; bool First = true; for (unsigned i = 0; i < 16; ++i) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; @@ -5320,7 +5397,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, } if ((i & 1) != 0) { - SDValue ThisElt(0, 0), LastElt(0, 0); + SDValue ThisElt, LastElt; bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; if (LastIsNonZero) { LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, @@ -5355,7 +5432,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, return SDValue(); SDLoc dl(Op); - SDValue V(0, 0); + SDValue V; bool First = true; for (unsigned i = 0; i < 8; ++i) { bool isNonZero = (NonZeros & (1 << i)) != 0; @@ -5376,6 +5453,79 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, return V; } +/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. 
+static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems, + unsigned NonZeros, unsigned NumNonZero, + unsigned NumZero, SelectionDAG &DAG, + const X86Subtarget *Subtarget, + const TargetLowering &TLI) { + // We know there's at least one non-zero element + unsigned FirstNonZeroIdx = 0; + SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx); + while (FirstNonZero.getOpcode() == ISD::UNDEF || + X86::isZeroNode(FirstNonZero)) { + ++FirstNonZeroIdx; + FirstNonZero = Op->getOperand(FirstNonZeroIdx); + } + + if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa(FirstNonZero.getOperand(1))) + return SDValue(); + + SDValue V = FirstNonZero.getOperand(0); + MVT VVT = V.getSimpleValueType(); + if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32)) + return SDValue(); + + unsigned FirstNonZeroDst = + cast(FirstNonZero.getOperand(1))->getZExtValue(); + unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx; + unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx; + unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst; + + for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) { + SDValue Elem = Op.getOperand(Idx); + if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem)) + continue; + + // TODO: What else can be here? Deal with it. + if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // TODO: Some optimizations are still possible here + // ex: Getting one element from a vector, and the rest from another. + if (Elem.getOperand(0) != V) + return SDValue(); + + unsigned Dst = cast(Elem.getOperand(1))->getZExtValue(); + if (Dst == Idx) + ++CorrectIdx; + else if (IncorrectIdx == -1U) { + IncorrectIdx = Idx; + IncorrectDst = Dst; + } else + // There was already one element with an incorrect index. + // We can't optimize this case to an insertps. + return SDValue(); + } + + if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) { + SDLoc dl(Op); + EVT VT = Op.getSimpleValueType(); + unsigned ElementMoveMask = 0; + if (IncorrectIdx == -1U) + ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4; + else + ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4; + + SDValue InsertpsMask = + DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf)); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask); + } + + return SDValue(); +} + /// getVShift - Return a vector logical shift node. /// static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, @@ -5480,7 +5630,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, EVT EltVT = VT.getVectorElementType(); unsigned NumElems = Elts.size(); - LoadSDNode *LDBase = NULL; + LoadSDNode *LDBase = nullptr; unsigned LastLoadedElt = -1U; // For each element in the initializer, see if we've found a load or an undef. 
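The insertps immediate built above follows the instruction's imm8 layout: source lane in bits 7:6, destination lane in bits 5:4, zero mask in bits 3:0. A tiny encoder as illustration (the helper name is hypothetical):

static unsigned encodeInsertPSImm(unsigned SrcLane, unsigned DstLane,
                                  unsigned ZeroMask) {
  return SrcLane << 6 | DstLane << 4 | (ZeroMask & 0xF);
}
// LowerBuildVectorv4x32 passes ZeroMask = ~NonZeros & 0xf, so lanes that
// held a zero constant are re-zeroed by the instruction itself. Copying
// lane 2 into lane 0 while zeroing lanes 1 and 3, for example, encodes as
// encodeInsertPSImm(2, 0, 0xA) == 0x8A.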
@@ -5545,8 +5695,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, - array_lengthof(Ops), MVT::i64, + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64, LDBase->getPointerInfo(), LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, @@ -5661,7 +5810,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, unsigned ScalarSize = CVT.getSizeInBits(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { - const Constant *C = 0; + const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast(Ld)) C = CI->getConstantIntValue(); else if (ConstantFPSDNode *CF = dyn_cast(Ld)) @@ -5706,6 +5855,41 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, return SDValue(); } +/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real +/// underlying vector and index. +/// +/// Modifies \p ExtractedFromVec to the real vector and returns the real +/// index. +static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, + SDValue ExtIdx) { + int Idx = cast(ExtIdx)->getZExtValue(); + if (!isa(ExtractedFromVec)) + return Idx; + + // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already + // lowered this: + // (extract_vector_elt (v8f32 %vreg1), Constant<6>) + // to: + // (extract_vector_elt (vector_shuffle<2,u,u,u> + // (extract_subvector (v8f32 %vreg0), Constant<4>), + // undef) + // Constant<0>) + // In this case the vector is the extract_subvector expression and the index + // is 2, as specified by the shuffle. + ShuffleVectorSDNode *SVOp = cast(ExtractedFromVec); + SDValue ShuffleVec = SVOp->getOperand(0); + MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); + assert(ShuffleVecVT.getVectorElementType() == + ExtractedFromVec.getSimpleValueType().getVectorElementType()); + + int ShuffleIdx = SVOp->getMaskElt(Idx); + if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { + ExtractedFromVec = ShuffleVec; + return ShuffleIdx; + } + return Idx; +} + static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); @@ -5739,34 +5923,32 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); + // Quit if non-constant index. + if (!isa(ExtIdx)) + return SDValue(); + int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); // Quit if extracted from vector of different type. if (ExtractedFromVec.getValueType() != VT) return SDValue(); - // Quit if non-constant index. - if (!isa(ExtIdx)) - return SDValue(); - - if (VecIn1.getNode() == 0) + if (!VecIn1.getNode()) VecIn1 = ExtractedFromVec; else if (VecIn1 != ExtractedFromVec) { - if (VecIn2.getNode() == 0) + if (!VecIn2.getNode()) VecIn2 = ExtractedFromVec; else if (VecIn2 != ExtractedFromVec) // Quit if more than 2 vectors to shuffle return SDValue(); } - unsigned Idx = cast(ExtIdx)->getZExtValue(); - if (ExtractedFromVec == VecIn1) Mask[i] = Idx; else if (ExtractedFromVec == VecIn2) Mask[i] = Idx + NumElems; } - if (VecIn1.getNode() == 0) + if (!VecIn1.getNode()) return SDValue(); VecIn2 = VecIn2.getNode() ? 
VecIn2 : DAG.getUNDEF(VT); @@ -5791,24 +5973,22 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); if (ISD::isBuildVectorAllZeros(Op.getNode())) { SDValue Cst = DAG.getTargetConstant(0, MVT::i1); - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, - Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, - Ops, VT.getVectorNumElements()); + SmallVector Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } if (ISD::isBuildVectorAllOnes(Op.getNode())) { SDValue Cst = DAG.getTargetConstant(1, MVT::i1); - SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, - Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, - Ops, VT.getVectorNumElements()); + SmallVector Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } bool AllContants = true; uint64_t Immediate = 0; int NonConstIdx = -1; bool IsSplat = true; + unsigned NumNonConsts = 0; + unsigned NumConsts = 0; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.getOpcode() == ISD::UNDEF) @@ -5816,9 +5996,13 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { if (!isa(In)) { AllContants = false; NonConstIdx = idx; + NumNonConsts++; } - else if (cast(In)->getZExtValue()) + else { + NumConsts++; + if (cast(In)->getZExtValue()) Immediate |= (1ULL << idx); + } if (In != Op.getOperand(0)) IsSplat = false; } @@ -5830,6 +6014,19 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { DAG.getIntPtrConstant(0)); } + if (NumNonConsts == 1 && NonConstIdx != 0) { + SDValue DstVec; + if (NumConsts) { + SDValue VecAsImm = DAG.getConstant(Immediate, + MVT::getIntegerVT(VT.getSizeInBits())); + DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm); + } + else + DstVec = DAG.getUNDEF(VT); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, + Op.getOperand(NonConstIdx), + DAG.getIntPtrConstant(NonConstIdx)); + } if (!IsSplat && (NonConstIdx != 0)) llvm_unreachable("Unsupported BUILD_VECTOR operation"); MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8; @@ -6043,9 +6240,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); // Build both the lower and upper subvector. - SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); - SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], - NumElems/2); + SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, + makeArrayRef(&V[0], NumElems/2)); + SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, + makeArrayRef(&V[NumElems / 2], NumElems/2)); // Recreate the wider vector with the lower and upper part. if (VT.is256BitVector()) @@ -6078,6 +6276,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (V.getNode()) return V; } + // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS + if (EVTBits == 32 && NumElems == 4) { + SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero, + NumZero, DAG, Subtarget, *this); + if (V.getNode()) + return V; + } + // If element VT is == 32 bits, turn it into a number of shuffles. 
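The new single-non-constant path in LowerBUILD_VECTORvXi1 packs the constant lanes into an integer, bitcasts that to the mask type, and only then inserts the lone variable lane. The packing is bit-per-lane, as in this sketch for v8i1:

#include <cstdint>

static uint8_t packConstantLanes(const bool Lanes[8]) {
  uint8_t Imm = 0;
  for (unsigned i = 0; i != 8; ++i)
    if (Lanes[i])
      Imm |= uint8_t(1u << i);   // lane i -> bit i
  return Imm;
}
// <1,0,1,0,0,1,0,x> with x the only non-constant lane packs to 0x25; the
// lowering bitcasts i8 0x25 to v8i1 and then INSERT_VECTOR_ELTs x at lane 7.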
SmallVector V(NumElems); if (NumElems == 4 && NumZero > 0) { @@ -6332,8 +6538,7 @@ static SDValue getPSHUFB(ArrayRef MaskVals, SDValue V1, SDLoc &dl, if (ShufVT != VT) V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1); return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1, - DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, - PshufbMask.data(), PshufbMask.size())); + DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask)); } // v8i16 shuffles - Prefer shuffles in the following order: @@ -6516,7 +6721,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { ShuffleVectorSDNode *SVOp = cast(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, NewV.getOperand(0), @@ -6540,7 +6745,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { ShuffleVectorSDNode *SVOp = cast(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, NewV.getOperand(0), @@ -6635,7 +6840,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::v16i8, &pshufbMask[0], 16)); + MVT::v16i8, pshufbMask)); // As PSHUFB will zero elements with negative indices, it's safe to ignore // the 2nd operand if it's undefined or zero. @@ -6653,7 +6858,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::v16i8, &pshufbMask[0], 16)); + MVT::v16i8, pshufbMask)); return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); } @@ -6771,6 +6976,9 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, unsigned Scale; switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected!"); + case MVT::v2i64: + case MVT::v2f64: + return SDValue(SVOp, 0); case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break; case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break; case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break; @@ -6805,7 +7013,7 @@ static SDValue getVZextMovL(MVT VT, MVT OpVT, SDValue SrcOp, SelectionDAG &DAG, const X86Subtarget *Subtarget, SDLoc dl) { if (VT == MVT::v2f64 || VT == MVT::v4f32) { - LoadSDNode *LD = NULL; + LoadSDNode *LD = nullptr; if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) LD = dyn_cast(SrcOp); if (!LD) { @@ -6924,8 +7132,7 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } // Construct the output using a BUILD_VECTOR. - Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0], - SVOps.size()); + Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps); } else if (InputUsed[0] < 0) { // No input vectors were used! The result is undefined. 
Output[l] = DAG.getUNDEF(NVT); @@ -7207,6 +7414,93 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) { getShuffleSHUFImmediate(SVOp), DAG); } +static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index, + SelectionDAG &DAG) { + SDLoc dl(Load); + MVT VT = Load->getSimpleValueType(0); + MVT EVT = VT.getVectorElementType(); + SDValue Addr = Load->getOperand(1); + SDValue NewAddr = DAG.getNode( + ISD::ADD, dl, Addr.getSimpleValueType(), Addr, + DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType())); + + SDValue NewLoad = + DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, + DAG.getMachineFunction().getMachineMemOperand( + Load->getMemOperand(), 0, EVT.getStoreSize())); + return NewLoad; +} + +// It is only safe to call this function if isINSERTPSMask is true for +// this shufflevector mask. +static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, + SelectionDAG &DAG) { + // Generate an insertps instruction when inserting an f32 from memory onto a + // v4f32 or when copying a member from one v4f32 to another. + // We also use it for transferring i32 from one register to another, + // since it simply copies the same bits. + // If we're transferring an i32 from memory to a specific element in a + // register, we output a generic DAG that will match the PINSRD + // instruction. + MVT VT = SVOp->getSimpleValueType(0); + MVT EVT = VT.getVectorElementType(); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + auto Mask = SVOp->getMask(); + assert((VT == MVT::v4f32 || VT == MVT::v4i32) && + "unsupported vector type for insertps/pinsrd"); + + int FromV1 = std::count_if(Mask.begin(), Mask.end(), + [](const int &i) { return i < 4; }); + + SDValue From; + SDValue To; + unsigned DestIndex; + if (FromV1 == 1) { + From = V1; + To = V2; + DestIndex = std::find_if(Mask.begin(), Mask.end(), + [](const int &i) { return i < 4; }) - + Mask.begin(); + } else { + From = V2; + To = V1; + DestIndex = std::find_if(Mask.begin(), Mask.end(), + [](const int &i) { return i >= 4; }) - + Mask.begin(); + } + + if (MayFoldLoad(From)) { + // Trivial case, when From comes from a load and is only used by the + // shuffle. Make it use insertps from the vector that we need from that + // load. + SDValue NewLoad = + NarrowVectorLoadToElement(cast(From), DestIndex, DAG); + if (!NewLoad.getNode()) + return SDValue(); + + if (EVT == MVT::f32) { + // Create this as a scalar to vector to match the instruction pattern. + SDValue LoadScalarToVector = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad); + SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector, + InsertpsMask); + } else { // EVT == MVT::i32 + // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT + // instruction, to match the PINSRD instruction, which loads an i32 to a + // certain vector element. + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad, + DAG.getConstant(DestIndex, MVT::i32)); + } + } + + // Vector-element-to-vector + unsigned SrcIndex = Mask[DestIndex] % 4; + SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask); +} + // Reduce a vector shuffle to zext. 
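NarrowVectorLoadToElement above reloads just the lane insertps needs, offsetting the original address by the element's store size. In scalar terms, for f32 elements:

static const float *laneAddress(const float *VecAddr, unsigned Lane) {
  return VecAddr + Lane;   // byte offset = Lane * sizeof(float) = Lane * 4
}
// e.g. lane 2 of a v4f32 at address A is reloaded from A + 8 as a plain
// f32, with the MachineMemOperand narrowed to 4 bytes.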
static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { @@ -7295,9 +7589,8 @@ static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); } -static SDValue -NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(Op); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -7322,31 +7615,29 @@ NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // If the shuffle can be profitably rewritten as a narrower shuffle, then // do it! - if (VT == MVT::v8i16 || VT == MVT::v16i8 || - VT == MVT::v16i16 || VT == MVT::v32i8) { + if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 || + VT == MVT::v32i8) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); - } else if ((VT == MVT::v4i32 || - (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { + } else if (VT.is128BitVector() && Subtarget->hasSSE2()) { // FIXME: Figure out a cleaner way to do this. - // Try to make use of movq to zero out the top part. if (ISD::isBuildVectorAllZeros(V2.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { MVT NewVT = NewOp.getSimpleValueType(); if (isCommutedMOVLMask(cast(NewOp)->getMask(), NewVT, true, false)) - return getVZextMovL(VT, NewVT, NewOp.getOperand(0), - DAG, Subtarget, dl); + return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget, + dl); } } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { MVT NewVT = NewOp.getSimpleValueType(); if (isMOVLMask(cast(NewOp)->getMask(), NewVT)) - return getVZextMovL(VT, NewVT, NewOp.getOperand(1), - DAG, Subtarget, dl); + return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget, + dl); } } } @@ -7609,6 +7900,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { getShuffleSHUFImmediate(SVOp), DAG); } + unsigned Idx; + if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx)) + return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl), + Idx*(NumElems/2), DAG, dl); + // Handle VPERM2F128/VPERM2I128 permutations if (isVPERM2X128Mask(M, VT, HasFp256)) return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, @@ -7618,6 +7914,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (BlendOp.getNode()) return BlendOp; + if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT)) + return getINSERTPS(SVOp, dl, DAG); + unsigned Imm8; if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8)) return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG); @@ -7631,8 +7930,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT)); } - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, - &permclMask[0], NumElems); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask); if (V2IsUndef) // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 return DAG.getNode(X86ISD::VPERMV, dl, VT, @@ -7684,6 +7982,109 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +// This function assumes its argument is a BUILD_VECTOR of constants or +// undef SDNodes. 
i.e.: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is + true. +static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, + unsigned &MaskValue) { + MaskValue = 0; + unsigned NumElems = BuildVector->getNumOperands(); + // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. + unsigned NumLanes = (NumElems - 1) / 8 + 1; + unsigned NumElemsInLane = NumElems / NumLanes; + + // Blend for v16i16 should be symmetric for both lanes. + for (unsigned i = 0; i < NumElemsInLane; ++i) { + SDValue EltCond = BuildVector->getOperand(i); + SDValue SndLaneEltCond = + (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond; + + int Lane1Cond = -1, Lane2Cond = -1; + if (isa(EltCond)) + Lane1Cond = !isZero(EltCond); + if (isa(SndLaneEltCond)) + Lane2Cond = !isZero(SndLaneEltCond); + + if (Lane1Cond == Lane2Cond || Lane2Cond < 0) + // Lane1Cond != 0 means we want the first argument. + // Lane1Cond == 0 means we want the second argument. + // The encoding of this argument is 0 for the first argument, 1 + // for the second. Therefore, invert the condition. + MaskValue |= !Lane1Cond << i; + else if (Lane1Cond < 0) + MaskValue |= !Lane2Cond << i; + else + return false; + } + return true; +} + +// Try to lower a vselect node into a simple blend instruction. +static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Cond = Op.getOperand(0); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + + // There is no blend with immediate in AVX-512. + if (VT.is512BitVector()) + return SDValue(); + + if (!Subtarget->hasSSE41() || EltVT == MVT::i8) + return SDValue(); + if (!Subtarget->hasInt256() && VT == MVT::v16i16) + return SDValue(); + + if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) + return SDValue(); + + // Check the mask for BLEND and build the value. + unsigned MaskValue = 0; + if (!BUILD_VECTORtoBlendMask(cast(Cond), MaskValue)) + return SDValue(); + + // Convert i32 vectors to floating point if it is not AVX2. + // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. + MVT BlendVT = VT; + if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { + BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), + NumElems); + LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS); + RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS); + } + + SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS, + DAG.getConstant(MaskValue, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Ret); +} + +SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { + SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG); + if (BlendOp.getNode()) + return BlendOp; + + // Some types for vselect were previously set to Expand, not Legal or + // Custom. Return an empty SDValue so we fall through to Expand, after + // the Custom lowering phase. + MVT VT = Op.getSimpleValueType(); + switch (VT.SimpleTy) { + default: + break; + case MVT::v8i16: + case MVT::v16i16: + return SDValue(); + } + + // We couldn't create a "Blend with immediate" node. + // This node should still be legal, but we'll have to emit a blendv* + // instruction.
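BUILD_VECTORtoBlendMask inverts each condition lane because a set bit in the BLENDI immediate selects the second source. A sketch of the per-lane rule, assuming a fully-constant 4-lane condition:

static unsigned blendMaskFromCond(const bool CondLanes[], unsigned N) {
  unsigned Mask = 0;
  for (unsigned i = 0; i != N; ++i)
    if (!CondLanes[i])      // cond true -> first operand -> bit stays 0
      Mask |= 1u << i;
  return Mask;
}
// vselect <true,false,false,true>, LHS, RHS encodes as immediate 0b0110;
// for v16i16 the same 8-bit mask must describe both 128-bit lanes, which
// is why the loop above checks the two lanes pairwise.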
+ return Op; +} + static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -7946,10 +8347,47 @@ static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { return SDValue(); } +/// Insert one bit to mask vector, like v16i1 or v8i1. +/// AVX-512 feature. +SDValue +X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue Elt = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + MVT VecVT = Vec.getSimpleValueType(); + + if (!isa(Idx)) { + // Non constant index. Extend source and destination, + // insert element and then truncate the result. + MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); + MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32); + SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, + DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), + DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); + return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); + } + + unsigned IdxVal = cast(Idx)->getZExtValue(); + SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); + if (Vec.getOpcode() == ISD::UNDEF) + return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + DAG.getConstant(IdxVal, MVT::i8)); + const TargetRegisterClass* rc = getRegClassFor(VecVT); + unsigned MaxSift = rc->getSize()*8 - 1; + EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + DAG.getConstant(MaxSift, MVT::i8)); + EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec, + DAG.getConstant(MaxSift - IdxVal, MVT::i8)); + return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); +} SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); + + if (EltVT == MVT::i1) + return InsertBitToMaskVector(Op, DAG); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); @@ -8294,10 +8732,10 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; - Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops)); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } else { SDValue Ops[] = { Chain, TGA }; - Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops)); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 
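The shift pair in InsertBitToMaskVector isolates the new bit at its target position before OR-ing it into the mask. A scalar walk-through for v16i1 (MaxSift = 15), which, like the DAG code above, assumes the destination lane is currently zero:

#include <cstdint>

static uint16_t insertMaskBit(uint16_t Mask, bool Bit, unsigned Idx) {
  uint16_t EltInVec = uint16_t(Bit);            // SCALAR_TO_VECTOR: bit 0
  EltInVec = uint16_t(EltInVec << 15);          // VSHLI: bit 15, rest cleared
  EltInVec = uint16_t(EltInVec >> (15 - Idx));  // VSRLI: bit Idx, rest zero
  return Mask | EltInVec;                       // OR into the existing mask
}
// insertMaskBit(0x0003, true, 5) == 0x0023.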
@@ -8325,7 +8763,7 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { - return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, + return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSGD); } @@ -8342,7 +8780,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SDValue Base; if (is64Bit) { - Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX, + Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSLD, /*LocalDynamic=*/true); } else { SDValue InFlag; @@ -8481,7 +8919,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Args[] = { Chain, Offset }; - Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); + Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -8507,10 +8945,6 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // Windows 64bit: gs:0x58 // Windows 32bit: fs:__tls_array - // If GV is an alias then use the aliasee for determining - // thread-localness. - if (const GlobalAlias *GA = dyn_cast(GV)) - GV = GA->getAliasedGlobal(); SDLoc dl(GA); SDValue Chain = DAG.getEntryNode(); @@ -8609,15 +9043,15 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; if (Op.getOpcode() == ISD::SHL_PARTS) { - Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); - Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } else { - Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); - Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); + return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, @@ -8680,8 +9114,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; SDValue Result = DAG.getMemIntrinsicNode(useSSE ? 
X86ISD::FILD_FLAG : X86ISD::FILD, DL, - Tys, Ops, array_lengthof(Ops), - SrcVT, MMO); + Tys, Ops, SrcVT, MMO); if (useSSE) { Chain = Result.getValue(1); @@ -8704,8 +9137,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, - Ops, array_lengthof(Ops), - Op.getValueType(), MMO); + Ops, Op.getValueType(), MMO); Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(SSFI), false, false, false, 0); @@ -8900,7 +9332,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, - array_lengthof(Ops), MVT::i64, MMO); + MVT::i64, MMO); APInt FF(32, 0x5F800000ULL); @@ -8993,8 +9425,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOLoad, MemSize, MemSize); - Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, - array_lengthof(Ops), DstTy, MMO); + Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); @@ -9008,8 +9439,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - Ops, array_lengthof(Ops), DstTy, - MMO); + Ops, DstTy, MMO); return std::make_pair(FIST, StackSlot); } else { SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, @@ -9021,8 +9451,8 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, MVT::i32, eax.getValue(2)); SDValue Ops[] = { eax, edx }; SDValue pair = IsReplace - ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops)) - : DAG.getMergeValues(Ops, array_lengthof(Ops), DL); + ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops) + : DAG.getMergeValues(Ops, DL); return std::make_pair(pair, SDValue()); } } @@ -9217,8 +9647,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { for (unsigned j = 0; j < 8; ++j) pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); } - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, - &pshufbMask[0], 32); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask); In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In); @@ -9284,7 +9713,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. - if (FIST.getNode() == 0) return Op; + if (!FIST.getNode()) return Op; if (StackSlot.getNode()) // Load the result. @@ -9581,12 +10010,29 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, VecIns.back(), VecIns.back()); } +/// \brief return true if \c Op has a use that doesn't just read flags. 
+static bool hasNonFlagsUse(SDValue Op) { + for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; + ++UI) { + SDNode *User = *UI; + unsigned UOpNo = UI.getOperandNo(); + if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { + // Look past truncate. + UOpNo = User->use_begin().getOperandNo(); + User = *User->use_begin(); + } + + if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && + !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) + return true; + } + return false; +} + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. -SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, +SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { - SDLoc dl(Op); - if (Op.getValueType() == MVT::i1) // KORTEST instruction should be selected return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, @@ -9687,31 +10133,35 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, Opcode = X86ISD::ADD; NumOperands = 2; break; - case ISD::AND: { - // If the primary and result isn't used, don't bother using X86ISD::AND, - // because a TEST instruction will be better. - bool NonFlagUse = false; - for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = Op.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - unsigned UOpNo = UI.getOperandNo(); - if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { - // Look pass truncate. - UOpNo = User->use_begin().getOperandNo(); - User = *User->use_begin(); - } - - if (User->getOpcode() != ISD::BRCOND && - User->getOpcode() != ISD::SETCC && - !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) { - NonFlagUse = true; + case ISD::SHL: + case ISD::SRL: + // If we have a constant logical shift that's only used in a comparison + // against zero, turn it into an equivalent AND. This allows turning it into + // a TEST instruction later. + if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && + isa(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + unsigned ShAmt = Op->getConstantOperandVal(1); + if (ShAmt >= BitWidth) // Avoid undefined shifts. break; - } + APInt Mask = ArithOp.getOpcode() == ISD::SRL + ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) + : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); + if (!Mask.isSignedIntN(32)) // Avoid large immediates. + break; + SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), + DAG.getConstant(Mask, VT)); + DAG.ReplaceAllUsesWith(Op, New); + Op = New; } + break; - if (!NonFlagUse) + case ISD::AND: + // If the primary and result isn't used, don't bother using X86ISD::AND, + // because a TEST instruction will be better. + if (!hasNonFlagsUse(Op)) break; - } // FALL THROUGH case ISD::SUB: case ISD::OR: @@ -9794,7 +10244,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, for (unsigned i = 0; i != NumOperands; ++i) Ops.push_back(Op.getOperand(i)); - SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); + SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesWith(Op, New); return SDValue(New.getNode(), 1); } @@ -9802,11 +10252,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent.
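The new ISD::SHL/ISD::SRL case turns a shift that only feeds a compare-with-zero into an AND, which later selects as a TEST with an immediate. The equivalence for the SRL side, in 32 bits:

#include <cstdint>

static bool srlIsZero(uint32_t X, unsigned ShAmt) {
  uint32_t Mask = ~0u << ShAmt;   // APInt::getHighBitsSet(32, 32 - ShAmt)
  return (X & Mask) == 0;         // same truth value as (X >> ShAmt) == 0
}
// (x >> 3) == 0 becomes (x & 0xFFFFFFF8) == 0, i.e. `testl $-8, %eax`;
// the SHL side uses the low-bits mask instead, and masks that don't fit a
// signed 32-bit immediate are rejected above.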
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, - SelectionDAG &DAG) const { - SDLoc dl(Op0); + SDLoc dl, SelectionDAG &DAG) const { if (ConstantSDNode *C = dyn_cast(Op1)) { if (C->getAPIntValue() == 0) - return EmitTest(Op0, X86CC, DAG); + return EmitTest(Op0, X86CC, dl, DAG); if (Op0.getValueType() == MVT::i1) llvm_unreachable("Unexpected comparison operation for MVT::i1 operands"); @@ -9888,7 +10337,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { APInt Zeros, Ones; - DAG.ComputeMaskedBits(Op0, Zeros, Ones); + DAG.computeKnownBits(Op0, Zeros, Ones); if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) return SDValue(); } @@ -10054,7 +10503,7 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, /// \brief Try to turn a VSETULT into a VSETULE by modifying its second /// operand \p Op1. If non-trivial (for example because it's not constant) /// return an empty value. -static SDValue ChangeVSETULTtoVSETULE(SDValue Op1, SelectionDAG &DAG) +static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) { BuildVectorSDNode *BV = dyn_cast(Op1.getNode()); if (!BV) @@ -10078,8 +10527,7 @@ static SDValue ChangeVSETULTtoVSETULE(SDValue Op1, SelectionDAG &DAG) ULTOp1.push_back(DAG.getConstant(Val - 1, EVT)); } - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op1), VT, ULTOp1.data(), - ULTOp1.size()); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1); } static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, @@ -10204,7 +10652,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // Only do this pre-AVX since vpcmp* is no longer destructive. if (Subtarget->hasAVX()) break; - SDValue ULEOp1 = ChangeVSETULTtoVSETULE(Op1, DAG); + SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG); if (ULEOp1.getNode()) { Op1 = ULEOp1; Subus = true; Invert = false; Swap = false; @@ -10383,7 +10831,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (X86CC == X86::COND_INVALID) return SDValue(); - SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); + SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, MVT::i8), EFLAGS); @@ -10418,11 +10866,6 @@ static bool isX86LogicalCmp(SDValue Op) { return false; } -static bool isZero(SDValue V) { - ConstantSDNode *C = dyn_cast(V); - return C && C->isNullValue(); -} - static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) return false; @@ -10517,7 +10960,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Res = DAG.getNOT(DL, Res, Res.getValueType()); ConstantSDNode *N2C = dyn_cast(Op2); - if (N2C == 0 || !N2C->isNullValue()) + if (!N2C || !N2C->isNullValue()) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } @@ -10606,7 +11049,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (addTest) { CC = DAG.getConstant(X86::COND_NE, MVT::i8); - Cond = EmitTest(Cond, X86::COND_NE, DAG); + Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); } // a < b ? -1 : 0 -> RES = ~setcc_carry @@ -10646,7 +11089,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // condition is true. 
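ChangeVSETULTtoVSETULE relies on x <u C being the same as x <=u C - 1 for C > 0 (a zero constant cannot be rewritten, since C - 1 would wrap), and an unsigned <= then maps onto saturating subtraction, which is what the Subus flag above arranges. One v8i16 lane in scalar form:

#include <cstdint>

static bool uleViaPsubus(uint16_t X, uint16_t C) {
  uint16_t Diff = X > C ? uint16_t(X - C) : 0;  // psubus: max(X - C, 0)
  return Diff == 0;                             // X <=u C
}
// setult x, <4,4,...> is rewritten to setule x, <3,3,...>, i.e.
// (x -us 3) == 0 per lane.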
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); SDValue Ops[] = { Op2, Op1, CC, Cond }; - return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); } static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) { @@ -11027,7 +11470,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { CC = DAG.getConstant(X86::COND_NE, MVT::i8); - Cond = EmitTest(Cond, X86::COND_NE, DAG); + Cond = EmitTest(Cond, X86::COND_NE, dl, DAG); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), @@ -11042,13 +11485,50 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { - assert((Subtarget->isOSWindows() || - getTargetMachine().Options.EnableSegmentedStacks) && - "This should be used only on Windows targets or when segmented stacks " - "are being used"); - assert(!Subtarget->isTargetMacho() && "Not implemented"); + MachineFunction &MF = DAG.getMachineFunction(); + bool SplitStack = MF.shouldSplitStack(); + bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) || + SplitStack; SDLoc dl(Op); + if (!Lower) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDNode* Node = Op.getNode(); + + unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); + assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" + " not tell us which reg is the stack pointer!"); + EVT VT = Node->getValueType(0); + SDValue Tmp1 = SDValue(Node, 0); + SDValue Tmp2 = SDValue(Node, 1); + SDValue Tmp3 = Node->getOperand(2); + SDValue Chain = Tmp1.getOperand(0); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true), + SDLoc(Node)); + + SDValue Size = Tmp2.getOperand(1); + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); + Chain = SP.getValue(1); + unsigned Align = cast(Tmp3)->getZExtValue(); + const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); + unsigned StackAlign = TFI.getStackAlignment(); + Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + if (Align > StackAlign) + Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, + DAG.getConstant(-(uint64_t)Align, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain + + Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true), + DAG.getIntPtrConstant(0, true), SDValue(), + SDLoc(Node)); + + SDValue Ops[2] = { Tmp1, Tmp2 }; + return DAG.getMergeValues(Ops, dl); + } + // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); @@ -11058,8 +11538,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, bool Is64Bit = Subtarget->is64Bit(); EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; - if (getTargetMachine().Options.EnableSegmentedStacks) { - MachineFunction &MF = DAG.getMachineFunction(); + if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { @@ -11081,7 +11560,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); SDValue Ops1[2] = { Value, Chain }; - return DAG.getMergeValues(Ops1, 2, dl); + return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; unsigned Reg = (Subtarget->is64Bit() ? 
X86::RAX : X86::EAX); @@ -11105,7 +11584,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, } SDValue Ops1[2] = { SP, Chain }; - return DAG.getMergeValues(Ops1, 2, dl); + return DAG.getMergeValues(Ops1, dl); } } @@ -11166,8 +11645,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(SV, 16), false, false, 0); MemOps.push_back(Store); - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - &MemOps[0], MemOps.size()); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { @@ -11221,8 +11699,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { InstOps.push_back(DAG.getConstant(Align, MVT::i32)); SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, - VTs, &InstOps[0], InstOps.size(), - MVT::i64, + VTs, InstOps, MVT::i64, MachinePointerInfo(SV), /*Align=*/0, /*Volatile=*/false, @@ -11262,6 +11739,10 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, SelectionDAG &DAG) { MVT ElementType = VT.getVectorElementType(); + // Fold this packed shift into its first operand if ShiftAmt is 0. + if (ShiftAmt == 0) + return SrcOp; + // Check for ShiftAmt >= element width if (ShiftAmt >= ElementType.getSizeInBits()) { if (Opc == X86ISD::VSRAI) @@ -11282,7 +11763,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, ConstantSDNode *ND; switch(Opc) { - default: llvm_unreachable(0); + default: llvm_unreachable(nullptr); case X86ISD::VSHLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); @@ -11321,7 +11802,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, break; } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElts); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); } return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8)); @@ -11353,7 +11834,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, ShOps[0] = ShAmt; ShOps[1] = DAG.getConstant(0, MVT::i32); ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps); // The return type has to be a 128-bit type with the same element // type as the input type. 
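The generic expansion added for the non-Windows, non-split-stack DYNAMIC_STACKALLOC case allocates by decrementing the stack pointer and, when the requested alignment exceeds the target's stack alignment, rounding the result down. As plain arithmetic:

#include <cstdint>

static uint64_t allocaTop(uint64_t SP, uint64_t Size, uint64_t Align) {
  uint64_t P = SP - Size;          // stacks grow down
  return P & -(uint64_t)Align;     // round down to the requested alignment
}
// allocaTop(0x7fffffe8, 0x30, 32) == 0x7fffffa0; the masking is only
// applied under the `if (Align > StackAlign)` guard above.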
@@ -11476,6 +11957,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse41_pmuldq: + case Intrinsic::x86_avx2_pmul_dq: + return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pmulhu_w: + case Intrinsic::x86_avx2_pmulhu_w: + return DAG.getNode(ISD::MULHU, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::x86_sse2_pmulh_w: + case Intrinsic::x86_avx2_pmulh_w: + return DAG.getNode(ISD::MULHS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + // SSE2/AVX2 sub with unsigned saturation intrinsics case Intrinsic::x86_sse2_psubus_b: case Intrinsic::x86_sse2_psubus_w: @@ -11927,7 +12423,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { } SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); + SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, MVT::i8), SDValue(PCMP.getNode(), 1)); @@ -11944,7 +12440,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); + return DAG.getNode(Opcode, dl, VTs, NewOps); } case Intrinsic::x86_fma_vfmadd_ps: case Intrinsic::x86_fma_vfmadd_pd: @@ -12042,27 +12538,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { } static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, - SDValue Base, SDValue Index, - SDValue ScaleOp, SDValue Chain, - const X86Subtarget * Subtarget) { - SDLoc dl(Op); - ConstantSDNode *C = dyn_cast(ScaleOp); - assert(C && "Invalid scale type"); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); - SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); - EVT MaskVT = MVT::getVectorVT(MVT::i1, - Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg = DAG.getConstant(~0, MaskVT); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); - SDValue Disp = DAG.getTargetConstant(0, MVT::i32); - SDValue Segment = DAG.getRegister(0, MVT::i32); - SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); -} - -static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget * Subtarget) { @@ -12072,7 +12547,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); EVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); + else + MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); SDVTList VTs = DAG.getVTList(Op.getValueType(), 
MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -12081,12 +12561,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; - return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl); + return DAG.getMergeValues(RetOps, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, - SDValue Src, SDValue Base, SDValue Index, - SDValue ScaleOp, SDValue Chain) { + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); ConstantSDNode *C = dyn_cast(ScaleOp); assert(C && "Invalid scale type"); @@ -12095,52 +12575,218 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Segment = DAG.getRegister(0, MVT::i32); EVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg = DAG.getConstant(~0, MaskVT); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); + else + MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); return SDValue(Res, 1); } -static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, - SDValue Src, SDValue Mask, SDValue Base, - SDValue Index, SDValue ScaleOp, SDValue Chain) { +static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Mask, SDValue Base, SDValue Index, + SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); ConstantSDNode *C = dyn_cast(ScaleOp); assert(C && "Invalid scale type"); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); SDValue Disp = DAG.getTargetConstant(0, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - EVT MaskVT = MVT::getVectorVT(MVT::i1, - Index.getSimpleValueType().getVectorNumElements()); - SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); - SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); - SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; - SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); - return SDValue(Res, 1); + EVT MaskVT = + MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); + else + MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); + //SDVTList VTs = DAG.getVTList(MVT::Other); + SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); + return SDValue(Res, 0); +} + +// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that +// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is +// also used to custom lower READCYCLECOUNTER nodes. 
+static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, + SelectionDAG &DAG, const X86Subtarget *Subtarget, + SmallVectorImpl &Results) { + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); + SDValue LO, HI; + + // The processor's time-stamp counter (a 64-bit MSR) is stored into the + // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR + // and the EAX register is loaded with the low-order 32 bits. + if (Subtarget->is64Bit()) { + LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, + LO.getValue(2)); + } else { + LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, + LO.getValue(2)); + } + SDValue Chain = HI.getValue(1); + + if (Opcode == X86ISD::RDTSCP_DAG) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + + // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into + // the ECX register. Add 'ecx' explicitly to the chain. + SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, + HI.getValue(2)); + // Explicitly store the content of ECX at the location passed in input + // to the 'rdtscp' intrinsic. + Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), + MachinePointerInfo(), false, false, 0); + } + + if (Subtarget->is64Bit()) { + // The EDX register is loaded with the high-order 32 bits of the MSR, and + // the EAX register is loaded with the low-order 32 bits. + SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, + DAG.getConstant(32, MVT::i8)); + Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); + Results.push_back(Chain); + return; + } + + // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
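On 64-bit targets the two counter halves come back in RAX/RDX and are merged with a shift and an OR; the 32-bit path uses the BUILD_PAIR below instead. Numerically:

#include <cstdint>

static uint64_t mergeTSC(uint32_t Lo, uint32_t Hi) {
  return (uint64_t(Hi) << 32) | Lo;   // EDX:EAX -> one 64-bit counter
}
// e.g. EDX = 0x00000002, EAX = 0x9ABCDEF0 gives 0x000000029ABCDEF0.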
+ SDValue Ops[] = { LO, HI }; + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); + Results.push_back(Pair); + Results.push_back(Chain); +} + +static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SmallVector Results; + SDLoc DL(Op); + getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); + return DAG.getMergeValues(Results, DL); +} + +enum IntrinsicType { + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST +}; + +struct IntrinsicData { + IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1) + :Type(IType), Opc0(IOpc0), Opc1(IOpc1) {} + IntrinsicType Type; + unsigned Opc0; + unsigned Opc1; +}; + +std::map < unsigned, IntrinsicData> IntrMap; +static void InitIntinsicsMap() { + static bool Initialized = false; + if (Initialized) + return; + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512, + IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512, + IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpd_512, + IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512, + IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512, + IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512, + IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512, + IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512, + IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512, + IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0))); + + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512, + IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512, + IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512, + IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512, + IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512, + IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512, + IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512, + IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512, + IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0))); + + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512, + IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm, + X86::VGATHERPF1QPSm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512, + IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm, + X86::VGATHERPF1QPDm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512, + IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm, + X86::VGATHERPF1DPDm))); + IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512, + IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm, + X86::VGATHERPF1DPSm))); + 
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512,
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm,
+                                              X86::VSCATTERPF1QPSm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512,
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm,
+                                              X86::VSCATTERPF1QPDm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512,
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm,
+                                              X86::VSCATTERPF1DPDm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512,
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm,
+                                              X86::VSCATTERPF1DPSm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16,
+                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32,
+                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64,
+                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16,
+                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32,
+                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64,
+                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_xtest,
+                                IntrinsicData(XTEST, X86ISD::XTEST, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc,
+                                IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp,
+                                IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0)));
+  Initialized = true;
+}

 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
-  SDLoc dl(Op);
+  InitIntrinsicsMap();
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-  switch (IntNo) {
-  default: return SDValue();    // Don't custom lower most intrinsics.
+  std::map<unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo);
+  if (itr == IntrMap.end())
+    return SDValue();

-  // RDRAND/RDSEED intrinsics.
-  case Intrinsic::x86_rdrand_16:
-  case Intrinsic::x86_rdrand_32:
-  case Intrinsic::x86_rdrand_64:
-  case Intrinsic::x86_rdseed_16:
-  case Intrinsic::x86_rdseed_32:
-  case Intrinsic::x86_rdseed_64: {
-    unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 ||
-                       IntNo == Intrinsic::x86_rdseed_32 ||
-                       IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED :
-                                                            X86ISD::RDRAND;
+  SDLoc dl(Op);
+  IntrinsicData Intr = itr->second;
+  switch (Intr.Type) {
+  case RDSEED:
+  case RDRAND: {
     // Emit the node with the right value type.
     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
-    SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0));
+    SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0));

     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
     // Otherwise return the value from Rand, which is always 0, casted to i32.
@@ -12150,152 +12796,55 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                       SDValue(Result.getNode(), 1) };
     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
-                                  Ops, array_lengthof(Ops));
+                                  Ops);

     // Return { result, isValid, chain }.
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } - //int_gather(index, base, scale); - case Intrinsic::x86_avx512_gather_qpd_512: - case Intrinsic::x86_avx512_gather_qps_512: - case Intrinsic::x86_avx512_gather_dpd_512: - case Intrinsic::x86_avx512_gather_qpi_512: - case Intrinsic::x86_avx512_gather_qpq_512: - case Intrinsic::x86_avx512_gather_dpq_512: - case Intrinsic::x86_avx512_gather_dps_512: - case Intrinsic::x86_avx512_gather_dpi_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break; - case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break; - case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break; - case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break; - case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break; - case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break; - case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break; - case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break; - } - SDValue Chain = Op.getOperand(0); - SDValue Index = Op.getOperand(2); - SDValue Base = Op.getOperand(3); - SDValue Scale = Op.getOperand(4); - return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget); - } - //int_gather_mask(v1, mask, index, base, scale); - case Intrinsic::x86_avx512_gather_qps_mask_512: - case Intrinsic::x86_avx512_gather_qpd_mask_512: - case Intrinsic::x86_avx512_gather_dpd_mask_512: - case Intrinsic::x86_avx512_gather_dps_mask_512: - case Intrinsic::x86_avx512_gather_qpi_mask_512: - case Intrinsic::x86_avx512_gather_qpq_mask_512: - case Intrinsic::x86_avx512_gather_dpi_mask_512: - case Intrinsic::x86_avx512_gather_dpq_mask_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
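(Aside: source-level form that reaches the RDSEED/RDRAND case above; needs
a compiler and CPU with RDRAND support, e.g. building with -mrdrnd:)

    #include <immintrin.h>
    // _rdrand64_step returns 1 when CF was set (the value is valid) and
    // 0 otherwise -- the flag the CMOV in the lowering materializes.
    int tryRandom64(unsigned long long *Out) {
      return _rdrand64_step(Out);
    }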
- case Intrinsic::x86_avx512_gather_qps_mask_512: - Opc = X86::VGATHERQPSZrm; break; - case Intrinsic::x86_avx512_gather_qpd_mask_512: - Opc = X86::VGATHERQPDZrm; break; - case Intrinsic::x86_avx512_gather_dpd_mask_512: - Opc = X86::VGATHERDPDZrm; break; - case Intrinsic::x86_avx512_gather_dps_mask_512: - Opc = X86::VGATHERDPSZrm; break; - case Intrinsic::x86_avx512_gather_qpi_mask_512: - Opc = X86::VPGATHERQDZrm; break; - case Intrinsic::x86_avx512_gather_qpq_mask_512: - Opc = X86::VPGATHERQQZrm; break; - case Intrinsic::x86_avx512_gather_dpi_mask_512: - Opc = X86::VPGATHERDDZrm; break; - case Intrinsic::x86_avx512_gather_dpq_mask_512: - Opc = X86::VPGATHERDQZrm; break; - } + case GATHER: { + //gather(v1, mask, index, base, scale); SDValue Chain = Op.getOperand(0); SDValue Src = Op.getOperand(2); - SDValue Mask = Op.getOperand(3); + SDValue Base = Op.getOperand(3); SDValue Index = Op.getOperand(4); - SDValue Base = Op.getOperand(5); + SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain, + return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } - //int_scatter(base, index, v1, scale); - case Intrinsic::x86_avx512_scatter_qpd_512: - case Intrinsic::x86_avx512_scatter_qps_512: - case Intrinsic::x86_avx512_scatter_dpd_512: - case Intrinsic::x86_avx512_scatter_qpi_512: - case Intrinsic::x86_avx512_scatter_qpq_512: - case Intrinsic::x86_avx512_scatter_dpq_512: - case Intrinsic::x86_avx512_scatter_dps_512: - case Intrinsic::x86_avx512_scatter_dpi_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_avx512_scatter_qpd_512: - Opc = X86::VSCATTERQPDZmr; break; - case Intrinsic::x86_avx512_scatter_qps_512: - Opc = X86::VSCATTERQPSZmr; break; - case Intrinsic::x86_avx512_scatter_dpd_512: - Opc = X86::VSCATTERDPDZmr; break; - case Intrinsic::x86_avx512_scatter_dps_512: - Opc = X86::VSCATTERDPSZmr; break; - case Intrinsic::x86_avx512_scatter_qpi_512: - Opc = X86::VPSCATTERQDZmr; break; - case Intrinsic::x86_avx512_scatter_qpq_512: - Opc = X86::VPSCATTERQQZmr; break; - case Intrinsic::x86_avx512_scatter_dpq_512: - Opc = X86::VPSCATTERDQZmr; break; - case Intrinsic::x86_avx512_scatter_dpi_512: - Opc = X86::VPSCATTERDDZmr; break; - } - SDValue Chain = Op.getOperand(0); - SDValue Base = Op.getOperand(2); - SDValue Index = Op.getOperand(3); - SDValue Src = Op.getOperand(4); - SDValue Scale = Op.getOperand(5); - return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain); - } - //int_scatter_mask(base, mask, index, v1, scale); - case Intrinsic::x86_avx512_scatter_qps_mask_512: - case Intrinsic::x86_avx512_scatter_qpd_mask_512: - case Intrinsic::x86_avx512_scatter_dpd_mask_512: - case Intrinsic::x86_avx512_scatter_dps_mask_512: - case Intrinsic::x86_avx512_scatter_qpi_mask_512: - case Intrinsic::x86_avx512_scatter_qpq_mask_512: - case Intrinsic::x86_avx512_scatter_dpi_mask_512: - case Intrinsic::x86_avx512_scatter_dpq_mask_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
-    case Intrinsic::x86_avx512_scatter_qpd_mask_512:
-      Opc = X86::VSCATTERQPDZmr; break;
-    case Intrinsic::x86_avx512_scatter_qps_mask_512:
-      Opc = X86::VSCATTERQPSZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpd_mask_512:
-      Opc = X86::VSCATTERDPDZmr; break;
-    case Intrinsic::x86_avx512_scatter_dps_mask_512:
-      Opc = X86::VSCATTERDPSZmr; break;
-    case Intrinsic::x86_avx512_scatter_qpi_mask_512:
-      Opc = X86::VPSCATTERQDZmr; break;
-    case Intrinsic::x86_avx512_scatter_qpq_mask_512:
-      Opc = X86::VPSCATTERQQZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpq_mask_512:
-      Opc = X86::VPSCATTERDQZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpi_mask_512:
-      Opc = X86::VPSCATTERDDZmr; break;
-    }
+  case SCATTER: {
+    // scatter(base, mask, index, v1, scale);
     SDValue Chain = Op.getOperand(0);
     SDValue Base  = Op.getOperand(2);
     SDValue Mask  = Op.getOperand(3);
     SDValue Index = Op.getOperand(4);
     SDValue Src   = Op.getOperand(5);
     SDValue Scale = Op.getOperand(6);
-    return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+    return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
+                          Chain);
+  }
+  case PREFETCH: {
+    SDValue Hint = Op.getOperand(6);
+    unsigned HintVal;
+    if (!isa<ConstantSDNode>(Hint) ||
+        (HintVal = cast<ConstantSDNode>(Hint)->getZExtValue()) > 1)
+      llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
+    unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0);
+    SDValue Chain = Op.getOperand(0);
+    SDValue Mask  = Op.getOperand(2);
+    SDValue Index = Op.getOperand(3);
+    SDValue Base  = Op.getOperand(4);
+    SDValue Scale = Op.getOperand(5);
+    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
+  }
+  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
+  case RDTSC: {
+    SmallVector<SDValue, 2> Results;
+    getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget,
+                            Results);
+    return DAG.getMergeValues(Results, dl);
+  }
   // XTEST intrinsics.
-  case Intrinsic::x86_xtest: {
+  case XTEST: {
     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
     SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
@@ -12306,6 +12855,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                        Ret, SDValue(InTrans.getNode(), 1));
   }
   }
+  llvm_unreachable("Unknown Intrinsic Type");
 }

 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
@@ -12358,6 +12908,19 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   return FrameAddr;
 }

+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned X86TargetLowering::getRegisterByName(const char* RegName, + EVT VT) const { + unsigned Reg = StringSwitch(RegName) + .Case("esp", X86::ESP) + .Case("rsp", X86::RSP) + .Default(0); + if (Reg) + return Reg; + report_fatal_error("Invalid register name global variable"); +} + SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { const X86RegisterInfo *RegInfo = @@ -12477,7 +13040,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, MachinePointerInfo(TrmpAddr, 22), false, false, 0); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } else { const Function *Func = cast(cast(Op.getOperand(5))->getValue()); @@ -12557,7 +13120,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, MachinePointerInfo(TrmpAddr, 6), false, false, 1); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } } @@ -12600,8 +13163,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), - Ops, array_lengthof(Ops), MVT::i16, - MMO); + Ops, MVT::i16, MMO); // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, @@ -12654,7 +13216,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(X86::COND_E, MVT::i8), Op.getValue(1) }; - Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); + Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); // Finally xor with NumBits-1. Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); @@ -12706,7 +13268,7 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(X86::COND_E, MVT::i8), Op.getValue(1) }; - return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit @@ -12824,59 +13386,104 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } -static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - MVT EltTy = VT.getVectorElementType(); - unsigned NumElts = VT.getVectorNumElements(); - SDValue N0 = Op.getOperand(0); +SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetWin64() && "Unexpected target"); + EVT VT = Op.getValueType(); + assert(VT.isInteger() && VT.getSizeInBits() == 128 && + "Unexpected return type for lowering"); + + RTLIB::Libcall LC; + bool isSigned; + switch (Op->getOpcode()) { + default: llvm_unreachable("Unexpected request for libcall!"); + case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; + case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; + case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; + case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; + case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; + case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; + } + SDLoc dl(Op); + SDValue InChain = DAG.getEntryNode(); - // Lower sdiv X, pow2-const. 
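(Aside: source-level trigger for the LowerWin64_i128OP routine introduced
above; with GCC/Clang-style __int128 support, the callee selected through
RTLIB::SDIV_I128 is typically compiler-rt's __divti3, and the lowering
spills both i128 arguments to 16-byte-aligned stack slots and passes them
by pointer:)

    __int128 quotient(__int128 A, __int128 B) {
      return A / B;  // no native 128-bit divide on x86-64; becomes a libcall
    }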
- BuildVectorSDNode *C = dyn_cast(Op.getOperand(1)); - if (!C) - return SDValue(); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { + EVT ArgVT = Op->getOperand(i).getValueType(); + assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && + "Unexpected argument type for lowering"); + SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); + Entry.Node = StackPtr; + InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), + false, false, 16); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Entry.Ty = PointerType::get(ArgTy,0); + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + } + + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + getPointerTy()); - APInt SplatValue, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, - HasAnyUndefs) || - EltTy.getSizeInBits() < SplatBitSize) - return SDValue(); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(InChain) + .setCallee(getLibcallCallingConv(LC), + static_cast(MVT::v2i64).getTypeForEVT(*DAG.getContext()), + Callee, &Args, 0) + .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); + + std::pair CallInfo = LowerCallTo(CLI); + return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first); +} + +static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); + EVT VT = Op0.getValueType(); + SDLoc dl(Op); - if ((SplatValue != 0) && - (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { - unsigned Lg2 = SplatValue.countTrailingZeros(); - // Splat the sign bit. - SmallVector Sz(NumElts, - DAG.getConstant(EltTy.getSizeInBits() - 1, - EltTy)); - SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0], - NumElts)); - // Add (N0 < 0) ? abs2 - 1 : 0; - SmallVector Amt(NumElts, - DAG.getConstant(EltTy.getSizeInBits() - Lg2, - EltTy)); - SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0], - NumElts)); - SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL); - SmallVector Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy)); - SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0], - NumElts)); - - // If we're dividing by a positive value, we're done. Otherwise, we must - // negate the result. - if (SplatValue.isNonNegative()) - return SRA; - - SmallVector V(NumElts, DAG.getConstant(0, EltTy)); - SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts); - return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA); + assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || + (VT == MVT::v8i32 && Subtarget->hasInt256())); + + // Get the high parts. + const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8}; + SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); + SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); + + // Emit two multiplies, one for the lower 2 ints and one for the higher 2 + // ints. + MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64; + bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; + unsigned Opcode = + (!IsSigned || !Subtarget->hasSSE41()) ? 
X86ISD::PMULUDQ : X86ISD::PMULDQ; + SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); + SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1)); + + // Shuffle it back into the right order. + const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15}; + SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); + const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14}; + SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + + // If we have a signed multiply but no PMULDQ fix up the high parts of a + // unsigned multiply. + if (IsSigned && !Subtarget->hasSSE41()) { + SDValue ShAmt = + DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT)); + SDValue T1 = DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); + SDValue T2 = DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0); + + SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); + Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); } - return SDValue(); + + return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows); } static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, @@ -12920,7 +13527,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. @@ -12933,7 +13540,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRA) { if (ShiftAmt == 7) { @@ -12946,7 +13553,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SmallVector V(16, DAG.getConstant(128 >> ShiftAmt, MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; @@ -12966,7 +13573,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. 
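(Aside: the signed fixup in LowerMUL_LOHI above uses a standard identity:
for 32-bit lanes, smulhi(a,b) = umulhi(a,b) - (a<0 ? b : 0) - (b<0 ? a : 0),
i.e. Highs - ((a >> 31) & b) - ((b >> 31) & a) in wraparound arithmetic.
A scalar check, assuming arithmetic right shift as ISD::SRA provides:)

    #include <cstdint>
    int32_t smulhiViaUmulhi(int32_t A, int32_t B) {
      uint32_t UA = (uint32_t)A, UB = (uint32_t)B;
      uint32_t UHi = (uint32_t)(((uint64_t)UA * UB) >> 32);
      uint32_t T1 = (uint32_t)(A >> 31) & UB;  // A < 0 ? B : 0
      uint32_t T2 = (uint32_t)(B >> 31) & UA;  // B < 0 ? A : 0
      return (int32_t)(UHi - (T1 + T2));       // high 32 bits of A * B
    }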
@@ -12979,7 +13586,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRA) { if (ShiftAmt == 7) { @@ -12992,7 +13599,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SmallVector V(32, DAG.getConstant(128 >> ShiftAmt, MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; @@ -13014,7 +13621,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt = 0; for (unsigned i = 0; i != Ratio; ++i) { ConstantSDNode *C = dyn_cast(Amt.getOperand(i)); - if (C == 0) + if (!C) return SDValue(); // 6 == Log2(64) ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); @@ -13025,7 +13632,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, for (unsigned j = 0; j != Ratio; ++j) { ConstantSDNode *C = dyn_cast(Amt.getOperand(i + j)); - if (C == 0) + if (!C) return SDValue(); // 6 == Log2(64) ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); @@ -13107,7 +13714,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, BaseShAmt = InVec.getOperand(1); } } - if (BaseShAmt.getNode() == 0) + if (!BaseShAmt.getNode()) BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt, DAG.getIntPtrConstant(0)); } @@ -13260,7 +13867,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, } Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT)); } - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElems); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); return DAG.getNode(ISD::MUL, dl, VT, R, BV); } @@ -13274,6 +13881,79 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return DAG.getNode(ISD::MUL, dl, VT, Op, R); } + // If possible, lower this shift as a sequence of two shifts by + // constant plus a MOVSS/MOVSD instead of scalarizing it. + // Example: + // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) + // + // Could be rewritten as: + // (v4i32 (MOVSS (srl A, ), (srl A, ))) + // + // The advantage is that the two shifts from the example would be + // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing + // the vector shift into four scalar shifts plus four pairs of vector + // insert/extract. + if ((VT == MVT::v8i16 || VT == MVT::v4i32) && + ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + unsigned TargetOpcode = X86ISD::MOVSS; + bool CanBeSimplified; + // The splat value for the first packed shift (the 'X' from the example). + SDValue Amt1 = Amt->getOperand(0); + // The splat value for the second packed shift (the 'Y' from the example). + SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : + Amt->getOperand(2); + + // See if it is possible to replace this node with a sequence of + // two shifts followed by a MOVSS/MOVSD + if (VT == MVT::v4i32) { + // Check if it is legal to use a MOVSS. + CanBeSimplified = Amt2 == Amt->getOperand(2) && + Amt2 == Amt->getOperand(3); + if (!CanBeSimplified) { + // Otherwise, check if we can still simplify this node using a MOVSD. 
+ CanBeSimplified = Amt1 == Amt->getOperand(1) && + Amt->getOperand(2) == Amt->getOperand(3); + TargetOpcode = X86ISD::MOVSD; + Amt2 = Amt->getOperand(2); + } + } else { + // Do similar checks for the case where the machine value type + // is MVT::v8i16. + CanBeSimplified = Amt1 == Amt->getOperand(1); + for (unsigned i=3; i != 8 && CanBeSimplified; ++i) + CanBeSimplified = Amt2 == Amt->getOperand(i); + + if (!CanBeSimplified) { + TargetOpcode = X86ISD::MOVSD; + CanBeSimplified = true; + Amt2 = Amt->getOperand(4); + for (unsigned i=0; i != 4 && CanBeSimplified; ++i) + CanBeSimplified = Amt1 == Amt->getOperand(i); + for (unsigned j=4; j != 8 && CanBeSimplified; ++j) + CanBeSimplified = Amt2 == Amt->getOperand(j); + } + } + + if (CanBeSimplified && isa(Amt1) && + isa(Amt2)) { + // Replace this node with two shifts followed by a MOVSS/MOVSD. + EVT CastVT = MVT::v4i32; + SDValue Splat1 = + DAG.getConstant(cast(Amt1)->getAPIntValue(), VT); + SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); + SDValue Splat2 = + DAG.getConstant(cast(Amt2)->getAPIntValue(), VT); + SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); + if (TargetOpcode == X86ISD::MOVSD) + CastVT = MVT::v2i64; + SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1); + SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2); + SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2, + BitCast1, DAG); + return DAG.getNode(ISD::BITCAST, dl, VT, Result); + } + } + if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); @@ -13351,10 +14031,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, for (unsigned i = NumElems/2; i != NumElems; ++i) Amt2Csts.push_back(Amt->getOperand(i)); - Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, - &Amt1Csts[0], NumElems/2); - Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, - &Amt2Csts[0], NumElems/2); + Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); + Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); } else { // Variable shift amount Amt1 = Extract128BitVector(Amt, 0, DAG, dl); @@ -13585,35 +14263,47 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, - Ops, array_lengthof(Ops), T, MMO); + Ops, T, MMO); SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); return cpOut; } -static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - assert(Subtarget->is64Bit() && "Result not type legalized?"); - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue TheChain = Op.getOperand(0); - SDLoc dl(Op); - SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); - SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); - SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, - rax.getValue(2)); - SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, - DAG.getConstant(32, MVT::i8)); - SDValue Ops[] = { - DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), - rdx.getValue(1) - }; - return DAG.getMergeValues(Ops, array_lengthof(Ops), dl); -} - static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); + 
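(Aside: a concrete instance of the two-shifts-plus-MOVSS rewrite above,
written with SSE intrinsics; shift counts 2 and 5 are arbitrary. MOVSS
takes lane 0 from its second operand and lanes 1-3 from its first, so this
computes (v4i32 (srl A, <2, 5, 5, 5>)):)

    #include <immintrin.h>
    __m128i srlMixed(__m128i A) {
      __m128 Hi = _mm_castsi128_ps(_mm_srli_epi32(A, 5));  // <5,5,5,5>
      __m128 Lo = _mm_castsi128_ps(_mm_srli_epi32(A, 2));  // <2,2,2,2>
      return _mm_castps_si128(_mm_move_ss(Hi, Lo));
    }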
+ if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) { + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + if (DstVT != MVT::f64) + // This conversion needs to be expanded. + return SDValue(); + + SDValue InVec = Op->getOperand(0); + SDLoc dl(Op); + unsigned NumElts = SrcVT.getVectorNumElements(); + EVT SVT = SrcVT.getVectorElementType(); + + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + SmallVector Elts; + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, + DAG.getIntPtrConstant(i))); + + // Explicitly mark the extra elements as Undef. + SDValue Undef = DAG.getUNDEF(SVT); + for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i) + Elts.push_back(Undef); + + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts); + SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, + DAG.getIntPtrConstant(0)); + } + assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && Subtarget->hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || @@ -13641,8 +14331,7 @@ static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { cast(Node)->getMemoryVT(), Node->getOperand(0), Node->getOperand(1), negOp, - cast(Node)->getSrcValue(), - cast(Node)->getAlignment(), + cast(Node)->getMemOperand(), cast(Node)->getOrdering(), cast(Node)->getSynchScope()); } @@ -13730,12 +14419,11 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, Type *RetTy = isF64 ? (Type*)StructType::get(ArgTy, ArgTy, NULL) : (Type*)VectorType::get(ArgTy, 4); - TargetLowering:: - CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, - false, false, false, false, 0, - CallingConv::C, /*isTaillCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed*/true, - Callee, Args, DAG, dl); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) + .setCallee(CallingConv::C, RetTy, Callee, &Args, 0); + std::pair CallResult = TLI.LowerCallTo(CLI); if (isF64) @@ -13764,6 +14452,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); @@ -13815,6 +14504,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); + case ISD::UMUL_LOHI: + case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); @@ -13832,7 +14523,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); - case ISD::SDIV: return LowerSDIV(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); } } @@ 
-13875,10 +14565,10 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl&Results, SDValue Ops[] = { Chain, In1, In2L, In2H }; SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); SDValue Result = - DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64, + DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64, cast(Node)->getMemOperand()); SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF)); Results.push_back(Result.getValue(2)); } @@ -13899,6 +14589,16 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SUBE: // We don't want to expand or promote these. return; + case ISD::SDIV: + case ISD::UDIV: + case ISD::SREM: + case ISD::UREM: + case ISD::SDIVREM: + case ISD::UDIVREM: { + SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); + Results.push_back(V); + return; + } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; @@ -13909,10 +14609,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, std::pair Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; - if (FIST.getNode() != 0) { + if (FIST.getNode()) { EVT VT = N->getValueType(0); // Return a load from the stack slot. - if (StackSlot.getNode() != 0) + if (StackSlot.getNode()) Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo(), false, false, false, 0)); @@ -13945,20 +14645,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(V); return; } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default : llvm_unreachable("Do not know how to custom type " + "legalize this intrinsic operation!"); + case Intrinsic::x86_rdtsc: + return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); + case Intrinsic::x86_rdtscp: + return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, + Results); + } + } case ISD::READCYCLECOUNTER: { - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue TheChain = N->getOperand(0); - SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); - SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, - rd.getValue(1)); - SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, - eax.getValue(2)); - // Use a buildpair to merge the two 32-bit values into a 64-bit one. - SDValue Ops[] = { eax, edx }; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, - array_lengthof(Ops))); - Results.push_back(edx.getValue(1)); - return; + return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); } case ISD::ATOMIC_CMP_SWAP: { EVT T = N->getValueType(0); @@ -13994,8 +14696,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, MachineMemOperand *MMO = cast(N)->getMemOperand(); unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; - SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, - Ops, array_lengthof(Ops), T, MMO); + SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); @@ -14003,7 +14704,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Regs64bit ? 
X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; - Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); Results.push_back(cpOutH.getValue(1)); return; } @@ -14058,14 +14759,39 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc); return; } - case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_LOAD: { ReplaceATOMIC_LOAD(N, Results, DAG); + return; + } + case ISD::BITCAST: { + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + EVT DstVT = N->getValueType(0); + EVT SrcVT = N->getOperand(0)->getValueType(0); + + if (SrcVT != MVT::f64 || + (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) + return; + + unsigned NumElts = DstVT.getVectorNumElements(); + EVT SVT = DstVT.getVectorElementType(); + EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); + SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + MVT::v2f64, N->getOperand(0)); + SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded); + + SmallVector Elts; + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, + ToVecInt, DAG.getIntPtrConstant(i))); + + Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts)); + } } } const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { - default: return NULL; + default: return nullptr; case X86ISD::BSF: return "X86ISD::BSF"; case X86ISD::BSR: return "X86ISD::BSR"; case X86ISD::SHLD: return "X86ISD::SHLD"; @@ -14176,7 +14902,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::OR: return "X86ISD::OR"; case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; - case X86ISD::BZHI: return "X86ISD::BZHI"; case X86ISD::BEXTR: return "X86ISD::BEXTR"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; @@ -14203,6 +14928,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; + case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; @@ -14210,6 +14936,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; + case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; @@ -14240,7 +14967,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, Reloc::Model R = getTargetMachine().getRelocationModel(); // X86 allows a sign-extended 32-bit immediate field as a displacement. - if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) + if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) return false; if (AM.BaseGV) { @@ -14418,7 +15145,23 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, if (VT.getSizeInBits() == 64) return false; - // FIXME: pshufb, blends, shifts. 
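(Aside: on little-endian x86 the BITCAST expansions above and in
LowerBITCAST implement the same re-interpretation a raw copy would; a
scalar model of the f64 -> v2i32 direction:)

    #include <cstdint>
    #include <cstring>
    void f64ToV2i32(double D, uint32_t Out[2]) {
      std::memcpy(Out, &D, sizeof D);  // DAG: f64 -> v2f64 lane 0 -> v4i32
    }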
+ // If this is a single-input shuffle with no 128 bit lane crossings we can + // lower it into pshufb. + if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) || + (SVT.is256BitVector() && Subtarget->hasInt256())) { + bool isLegal = true; + for (unsigned I = 0, E = M.size(); I != E; ++I) { + if (M[I] >= (int)SVT.getVectorNumElements() || + ShuffleCrosses128bitLane(SVT, I, M[I])) { + isLegal = false; + break; + } + } + if (isLegal) + return true; + } + + // FIXME: blends, shifts. return (SVT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isMOVLMask(M, SVT) || @@ -15366,7 +16109,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( OffsetDestReg = 0; // unused OverflowDestReg = DestReg; - offsetMBB = NULL; + offsetMBB = nullptr; overflowMBB = thisMBB; endMBB = thisMBB; } else { @@ -15736,7 +16479,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineFunction *MF = BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); - assert(getTargetMachine().Options.EnableSegmentedStacks); + assert(MF->shouldSplitStack()); unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; @@ -16509,11 +17252,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // X86 Optimization Hooks //===----------------------------------------------------------------------===// -void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) const { +void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { unsigned BitWidth = KnownZero.getBitWidth(); unsigned Opc = Op.getOpcode(); assert((Opc >= ISD::BUILTIN_OP_END || @@ -16576,8 +17319,10 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, } } -unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, - unsigned Depth) const { +unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &, + unsigned Depth) const { // SETCC_CARRY sets the dest to ~0 for true or 0 for false. if (Op.getOpcode() == X86ISD::SETCC_CARRY) return Op.getValueType().getScalarType().getSizeInBits(); @@ -16679,7 +17424,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, - array_lengthof(Ops), Ld->getMemoryVT(), Ld->getPointerInfo(), Ld->getAlignment(), @@ -17036,6 +17780,51 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, return std::make_pair(Opc, NeedSplit); } +static SDValue +TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc dl(N); + SDValue Cond = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + + if (Cond.getOpcode() == ISD::SIGN_EXTEND) { + SDValue CondSrc = Cond->getOperand(0); + if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG) + Cond = CondSrc->getOperand(0); + } + + MVT VT = N->getSimpleValueType(0); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + // There is no blend with immediate in AVX-512. 
+ if (VT.is512BitVector()) + return SDValue(); + + if (!Subtarget->hasSSE41() || EltVT == MVT::i8) + return SDValue(); + if (!Subtarget->hasInt256() && VT == MVT::v16i16) + return SDValue(); + + if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) + return SDValue(); + + unsigned MaskValue = 0; + if (!BUILD_VECTORtoBlendMask(cast(Cond), MaskValue)) + return SDValue(); + + SmallVector ShuffleMask(NumElems, -1); + for (unsigned i = 0; i < NumElems; ++i) { + // Be sure we emit undef where we can. + if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF) + ShuffleMask[i] = -1; + else + ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1); + } + + return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]); +} + /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT /// nodes. static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, @@ -17378,7 +18167,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Another special case: If C was a sign bit, the sub has been // canonicalized into a xor. - // FIXME: Would it be better to use ComputeMaskedBits to determine whether + // FIXME: Would it be better to use computeKnownBits to determine whether // it's safe to decanonicalize the xor? // x s< 0 ? x^C : 0 --> subus x, C if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && @@ -17544,7 +18333,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // depend on the highest bit in each word. Try to use SimplifyDemandedBits // to simplify previous instructions. if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && - !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) { + !DCI.isBeforeLegalize() && + // We explicitly check against v8i16 and v16i16 because, although + // they're marked as Custom, they might only be legal when Cond is a + // build_vector of constants. This will be taken care in a later + // condition. + (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 && + VT != MVT::v8i16)) { unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); // Don't optimize vector selects that map to mask-registers. @@ -17571,6 +18366,23 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, DCI.CommitTargetLoweringOpt(TLO); } + // We should generate an X86ISD::BLENDI from a vselect if its argument + // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of + // constants. This specific pattern gets generated when we split a + // selector for a 512 bit vector in a machine without AVX512 (but with + // 256-bit vectors), during legalization: + // + // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) + // + // Iff we find this pattern and the build_vectors are built from + // constants, we translate the vselect into a shuffle_vector that we + // know will be matched by LowerVECTOR_SHUFFLEtoBlend. + if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) { + SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); + if (Shuffle.getNode()) + return Shuffle; + } + return SDValue(); } @@ -17605,7 +18417,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { SDValue Op2 = Cmp.getOperand(1); SDValue SetCC; - const ConstantSDNode* C = 0; + const ConstantSDNode* C = nullptr; bool needOppositeCond = (CC == X86::COND_E); bool checkAgainstTrue = false; // Is it a comparison against 1? 
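(Aside: a worked instance of the mask-to-shuffle mapping in
TransformVSELECTtoBlendVECTOR_SHUFFLE above:)

    #include <cstdio>
    int main() {
      unsigned NumElems = 4, MaskValue = 0x5;  // blend bits 0b0101
      for (unsigned i = 0; i != NumElems; ++i)
        std::printf("%u ", i + NumElems * ((MaskValue >> i) & 1));
      // prints "4 1 6 3": lanes with a set bit take the RHS element
      // (shuffle indices >= NumElems), the rest stay on the LHS.
    }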
@@ -17740,8 +18552,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { SDValue Ops[] = { FalseOp, TrueOp, DAG.getConstant(CC, MVT::i8), Flags }; - return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), - Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); } // If this is a select between two integer constants, try to do some @@ -17856,7 +18667,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, // the DCI.xxxx conditions are provided to postpone the optimization as // late as possible. - ConstantSDNode *CmpAgainst = 0; + ConstantSDNode *CmpAgainst = nullptr; if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && (CmpAgainst = dyn_cast(Cond.getOperand(1))) && !isa(Cond.getOperand(0))) { @@ -17871,8 +18682,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, CmpAgainst == dyn_cast(TrueOp)) { SDValue Ops[] = { FalseOp, Cond.getOperand(0), DAG.getConstant(CC, MVT::i8), Cond }; - return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops, - array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops); } } } @@ -17880,6 +18690,106 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: return SDValue(); + // SSE/AVX/AVX2 blend intrinsics. + case Intrinsic::x86_avx2_pblendvb: + case Intrinsic::x86_avx2_pblendw: + case Intrinsic::x86_avx2_pblendd_128: + case Intrinsic::x86_avx2_pblendd_256: + // Don't try to simplify this intrinsic if we don't have AVX2. + if (!Subtarget->hasAVX2()) + return SDValue(); + // FALL-THROUGH + case Intrinsic::x86_avx_blend_pd_256: + case Intrinsic::x86_avx_blend_ps_256: + case Intrinsic::x86_avx_blendv_pd_256: + case Intrinsic::x86_avx_blendv_ps_256: + // Don't try to simplify this intrinsic if we don't have AVX. + if (!Subtarget->hasAVX()) + return SDValue(); + // FALL-THROUGH + case Intrinsic::x86_sse41_pblendw: + case Intrinsic::x86_sse41_blendpd: + case Intrinsic::x86_sse41_blendps: + case Intrinsic::x86_sse41_blendvps: + case Intrinsic::x86_sse41_blendvpd: + case Intrinsic::x86_sse41_pblendvb: { + SDValue Op0 = N->getOperand(1); + SDValue Op1 = N->getOperand(2); + SDValue Mask = N->getOperand(3); + + // Don't try to simplify this intrinsic if we don't have SSE4.1. + if (!Subtarget->hasSSE41()) + return SDValue(); + + // fold (blend A, A, Mask) -> A + if (Op0 == Op1) + return Op0; + // fold (blend A, B, allZeros) -> A + if (ISD::isBuildVectorAllZeros(Mask.getNode())) + return Op0; + // fold (blend A, B, allOnes) -> B + if (ISD::isBuildVectorAllOnes(Mask.getNode())) + return Op1; + + // Simplify the case where the mask is a constant i32 value. + if (ConstantSDNode *C = dyn_cast(Mask)) { + if (C->isNullValue()) + return Op0; + if (C->isAllOnesValue()) + return Op1; + } + } + + // Packed SSE2/AVX2 arithmetic shift immediate intrinsics. 
+ case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psra_d: { + SDValue Op0 = N->getOperand(1); + SDValue Op1 = N->getOperand(2); + EVT VT = Op0.getValueType(); + assert(VT.isVector() && "Expected a vector type!"); + + if (isa(Op1)) + Op1 = Op1.getOperand(0); + + if (!isa(Op1)) + return SDValue(); + + EVT SVT = VT.getVectorElementType(); + unsigned SVTBits = SVT.getSizeInBits(); + + ConstantSDNode *CND = cast(Op1); + const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue()); + uint64_t ShAmt = C.getZExtValue(); + + // Don't try to convert this shift into a ISD::SRA if the shift + // count is bigger than or equal to the element size. + if (ShAmt >= SVTBits) + return SDValue(); + + // Trivial case: if the shift count is zero, then fold this + // into the first operand. + if (ShAmt == 0) + return Op0; + + // Replace this packed shift intrinsic with a target independent + // shift dag node. + SDValue Splat = DAG.getConstant(C, VT); + return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat); + } + } +} + /// PerformMulCombine - Optimize a single multiply with constant into two /// in order to implement it with two cheaper instructions, e.g. /// LEA + SHL, LEA + LEA. @@ -18223,7 +19133,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), N1->getOperand(0)); SmallVector C(WideVT.getVectorNumElements(), N1); - N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size()); + N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); } else if (RHSTrunc) { N1 = N1->getOperand(0); } @@ -18260,40 +19170,13 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (R.getNode()) return R; - // Create BEXTR and BZHI instructions - // BZHI is X & ((1 << Y) - 1) + // Create BEXTR instructions // BEXTR is ((X >> imm) & (2**size-1)) if (VT == MVT::i32 || VT == MVT::i64) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); - if (Subtarget->hasBMI2()) { - // Check for (and (add (shl 1, Y), -1), X) - if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::SHL) { - SDValue N001 = N00.getOperand(1); - assert(N001.getValueType() == MVT::i8 && "unexpected type"); - ConstantSDNode *C = dyn_cast(N00.getOperand(0)); - if (C && C->getZExtValue() == 1) - return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001); - } - } - - // Check for (and X, (add (shl 1, Y), -1)) - if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) { - SDValue N10 = N1.getOperand(0); - if (N10.getOpcode() == ISD::SHL) { - SDValue N101 = N10.getOperand(1); - assert(N101.getValueType() == MVT::i8 && "unexpected type"); - ConstantSDNode *C = dyn_cast(N10.getOperand(0)); - if (C && C->getZExtValue() == 1) - return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101); - } - } - } - // Check for BEXTR. 
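(Aside: the combine above rewrites a constant-count packed arithmetic
shift intrinsic into a generic ISD::SRA with a splat amount, where it can
fold with neighboring target-independent nodes. Source-level form that
produces such a node:)

    #include <immintrin.h>
    __m128i sra3(__m128i V) {
      return _mm_srai_epi32(V, 3);  // becomes (sra V, <3,3,3,3>) here
    }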
if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { @@ -18533,8 +19416,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { SDValue Ops[] = { N0.getOperand(0), Neg, DAG.getConstant(X86::COND_GE, MVT::i8), SDValue(Neg.getNode(), 1) }; - return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), - Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); } return SDValue(); } @@ -18691,8 +19573,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); } - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], - Chains.size()); + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. @@ -18867,8 +19748,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, Chains.push_back(Ch); } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], - Chains.size()); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } // Turn load->store of MMX types into GPR load/stores. This avoids clobbering @@ -18891,7 +19771,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, !cast(St->getValue())->isVolatile() && St->getChain().hasOneUse() && !St->isVolatile()) { SDNode* LdVal = St->getValue().getNode(); - LoadSDNode *Ld = 0; + LoadSDNode *Ld = nullptr; int TokenFactorIndex = -1; SmallVector Ops; SDNode* ChainVal = St->getChain().getNode(); @@ -18934,8 +19814,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue NewChain = NewLd.getValue(1); if (TokenFactorIndex != -1) { Ops.push_back(NewChain); - NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], - Ops.size()); + NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); } return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), St->getPointerInfo(), @@ -18962,8 +19841,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, if (TokenFactorIndex != -1) { Ops.push_back(LoLd); Ops.push_back(HiLd); - NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], - Ops.size()); + NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); } LoAddr = St->getBasePtr(); @@ -19432,6 +20310,33 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc dl(N); + MVT VT = N->getOperand(1)->getSimpleValueType(0); + assert((VT == MVT::v4f32 || VT == MVT::v4i32) && + "X86insertps is only defined for v4x32"); + + SDValue Ld = N->getOperand(1); + if (MayFoldLoad(Ld)) { + // Extract the countS bits from the immediate so we can get the proper + // address when narrowing the vector load to a specific element. + // When the second source op is a memory address, interps doesn't use + // countS and just gets an f32 from that address. + unsigned DestIndex = + cast(N->getOperand(2))->getZExtValue() >> 6; + Ld = NarrowVectorLoadToElement(cast(Ld), DestIndex, DAG); + } else + return SDValue(); + + // Create this as a scalar to vector to match the instruction pattern. + SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld); + // countS bits are ignored when loading from memory on insertps, which + // means we don't need to explicitly set them to 0. 
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0), + LoadScalarToVector, N->getOperand(2)); +} + // Helper function of PerformSETCCCombine. It is to materialize "setb reg" // as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. @@ -19711,7 +20616,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); - case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); + case ISD::SIGN_EXTEND_INREG: + return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget); case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); @@ -19732,6 +20638,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); + case ISD::INTRINSIC_WO_CHAIN: + return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget); + case X86ISD::INSERTPS: + return PerformINSERTPSCombine(N, DAG, Subtarget); } return SDValue(); @@ -20006,7 +20916,7 @@ TargetLowering::ConstraintWeight Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. - if (CallOperandVal == NULL) + if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. @@ -20124,7 +21034,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector&Ops, SelectionDAG &DAG) const { - SDValue Result(0, 0); + SDValue Result; // Only support length 1 constraints for now. if (Constraint.length() > 1) return; @@ -20207,7 +21117,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, // If we are in non-pic codegen mode, we allow the address of a global (with // an optional displacement) to be used with 'i'. - GlobalAddressSDNode *GA = 0; + GlobalAddressSDNode *GA = nullptr; int64_t Offset = 0; // Match either (GA), (GA+C), (GA+C1+C2), etc. @@ -20363,7 +21273,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); // Not found as a standard register? - if (Res.second == 0) { + if (!Res.second) { // Map st(0) -> st(7) -> ST0 if (Constraint.size() == 7 && Constraint[0] == '{' && tolower(Constraint[1]) == 's' && @@ -20488,3 +21398,30 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, return Res; } + +int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, + Type *Ty) const { + // Scaling factors are not free at all. + // An indexed folded instruction, i.e., inst (reg1, reg2, scale), + // will take 2 allocations in the out of order engine instead of 1 + // for plain addressing mode, i.e. inst (reg1). + // E.g., + // vaddps (%rsi,%drx), %ymm0, %ymm1 + // Requires two allocations (one for the load, one for the computation) + // whereas: + // vaddps (%rsi), %ymm0, %ymm1 + // Requires just 1 allocation, i.e., freeing allocations for other operations + // and having less micro operations to execute. 
+ // + // For some X86 architectures, this is even worse because for instance for + // stores, the complex addressing mode forces the instruction to use the + // "load" ports instead of the dedicated "store" port. + // E.g., on Haswell: + // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. + // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. + if (isLegalAddressingMode(AM, Ty)) + // Scale represents reg2 * scale, thus account for 1 + // as soon as we use a second register. + return AM.Scale != 0; + return -1; +} diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 0f0d17b..9f51b53 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -83,6 +83,9 @@ namespace llvm { /// readcyclecounter RDTSC_DAG, + /// X86 Read Time-Stamp Counter and Processor ID. + RDTSCP_DAG, + /// X86 compare and logical compare instructions. CMP, COMI, UCOMI, @@ -291,7 +294,6 @@ namespace llvm { ADD, SUB, ADC, SBB, SMUL, INC, DEC, OR, XOR, AND, - BZHI, // BZHI - Zero high bits BEXTR, // BEXTR - Bit field extract UMUL, // LOW, HI, FLAGS = umul LHS, RHS @@ -345,6 +347,8 @@ namespace llvm { // PMULUDQ - Vector multiply packed unsigned doubleword integers PMULUDQ, + // PMULUDQ - Vector multiply packed signed doubleword integers + PMULDQ, // FMA nodes FMADD, @@ -614,18 +618,19 @@ namespace llvm { /// getSetCCResultType - Return the value type to use for ISD::SETCC. EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; - /// computeMaskedBitsForTargetNode - Determine which of the bits specified + /// computeKnownBitsForTargetNode - Determine which of the bits specified /// in Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. - void computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth = 0) const override; + void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; // ComputeNumSignBitsForTargetNode - Determine the number of bits in the // operation that are sign bits. unsigned ComputeNumSignBitsForTargetNode(SDValue Op, + const SelectionDAG &DAG, unsigned Depth) const override; bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA, @@ -679,6 +684,12 @@ namespace llvm { /// the immediate into a register. bool isLegalAddImmediate(int64_t Imm) const override; + /// \brief Return the cost of the scaling factor used in the addressing + /// mode represented by AM for this target, for a load/store + /// of the specified type. + /// If the AM is supported, the return value must be >= 0. + /// If the AM is not supported, it returns a negative value. + int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override; bool isVectorShiftByScalarCheap(Type *Ty) const override; @@ -771,10 +782,12 @@ namespace llvm { Type *Ty) const override; /// Intel processors have a unified instruction and data cache - const char * getClearCacheBuiltinName() const { - return 0; // nothing to do, move along. + const char * getClearCacheBuiltinName() const override { + return nullptr; // nothing to do, move along. } + unsigned getRegisterByName(const char* RegName, EVT VT) const override; + /// createFastISel - This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. 
FastISel *createFastISel(FunctionLoweringInfo &funcInfo, @@ -871,8 +884,11 @@ namespace llvm { SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const; + SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; @@ -908,6 +924,7 @@ namespace llvm { SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFormalArguments(SDValue Chain, @@ -936,7 +953,7 @@ namespace llvm { const SmallVectorImpl &Outs, LLVMContext &Context) const override; - const uint16_t *getScratchRegisters(CallingConv::ID CC) const override; + const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; /// Utility function to emit atomic-load-arith operations (and, or, xor, /// nand, max, min, umax, umin). It takes the corresponding instruction to @@ -987,11 +1004,12 @@ namespace llvm { /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent, for use with the given x86 condition code. - SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const; + SDValue EmitTest(SDValue Op0, unsigned X86CC, SDLoc dl, + SelectionDAG &DAG) const; /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent, for use with the given x86 condition code. - SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, + SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const; /// Convert a comparison if required by the subtarget. 
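The getScalingFactorCost hook added above encodes a simple cost rule: with a legal addressing mode, a scaled index (a second register) costs one extra allocation in the out-of-order engine, a plain base register costs nothing extra, and an unsupported mode is reported as a negative value. Below is a minimal standalone C++ sketch of that rule, for illustration only; the AddrMode struct and the "legal" parameter here are simplified stand-ins for llvm::TargetLowering::AddrMode and isLegalAddressingMode, not the real API.

  #include <cassert>

  // Simplified stand-in for TargetLowering::AddrMode (an assumption, not the
  // LLVM definition): Scale == 0 means no index register is used.
  struct AddrMode {
    long long BaseOffs = 0; // displacement
    bool HasBaseReg = false;
    long long Scale = 0;
  };

  // Mirrors the rule above: cost 0 for inst (reg1), cost 1 for
  // inst (reg1, reg2, scale), and -1 when the mode is not supported.
  static int scalingFactorCost(const AddrMode &AM, bool legal) {
    if (!legal)
      return -1;
    return AM.Scale != 0; // account for 1 as soon as a second register is used
  }

  int main() {
    AddrMode plain;                    // vaddps (%rsi), %ymm0, %ymm1
    plain.HasBaseReg = true;
    AddrMode indexed;                  // vaddps (%rsi,%rdx), %ymm0, %ymm1
    indexed.HasBaseReg = true;
    indexed.Scale = 1;
    assert(scalingFactorCost(plain, true) == 0);
    assert(scalingFactorCost(indexed, true) == 1);
    return 0;
  }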
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 2c5edf6..37bcc52 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -209,12 +209,12 @@ def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1), def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - [(set VR128X:$dst, (X86insrtps VR128X:$src1, VR128X:$src2, imm:$src3))]>, + [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, EVEX_4V; def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - [(set VR128X:$dst, (X86insrtps VR128X:$src1, + [(set VR128X:$dst, (X86insertps VR128X:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>; @@ -621,6 +621,22 @@ defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps", VR512, memopv16f32, i512me X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd", VR512, memopv8f64, i512mem, X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + +def : Pat<(v16f32 (int_x86_avx512_mask_vpermt_ps_512 (v16i32 VR512:$idx), + (v16f32 VR512:$src1), (v16f32 VR512:$src2), (i16 -1))), + (VPERMT2PSrr VR512:$src1, VR512:$idx, VR512:$src2)>; + +def : Pat<(v16i32 (int_x86_avx512_mask_vpermt_d_512 (v16i32 VR512:$idx), + (v16i32 VR512:$src1), (v16i32 VR512:$src2), (i16 -1))), + (VPERMT2Drr VR512:$src1, VR512:$idx, VR512:$src2)>; + +def : Pat<(v8f64 (int_x86_avx512_mask_vpermt_pd_512 (v8i64 VR512:$idx), + (v8f64 VR512:$src1), (v8f64 VR512:$src2), (i8 -1))), + (VPERMT2PDrr VR512:$src1, VR512:$idx, VR512:$src2)>; + +def : Pat<(v8i64 (int_x86_avx512_mask_vpermt_q_512 (v8i64 VR512:$idx), + (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))), + (VPERMT2Qrr VR512:$src1, VR512:$idx, VR512:$src2)>; //===----------------------------------------------------------------------===// // AVX-512 - BLEND using mask // @@ -984,6 +1000,10 @@ let Predicates = [HasAVX512] in { (EXTRACT_SUBREG (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_16bit)>; + def : Pat<(v16i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK16)>; + def : Pat<(v8i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK8)>; } // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
let Predicates = [HasAVX512] in { @@ -1356,6 +1376,23 @@ defm VMOVDQU64: avx512_load<0x6F, VR512, VK8WM, i512mem, load, "vmovdqu64", SSEPackedInt, v8i64>, XS, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; +def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr, + (v16i32 immAllZerosV), GR16:$mask)), + (VMOVDQU32rmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; + +def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr, + (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)), + (VMOVDQU64rmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; + +def: Pat<(int_x86_avx512_mask_storeu_d_512 addr:$ptr, (v16i32 VR512:$src), + GR16:$mask), + (VMOVDQU32mrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), + VR512:$src)>; +def: Pat<(int_x86_avx512_mask_storeu_q_512 addr:$ptr, (v8i64 VR512:$src), + GR8:$mask), + (VMOVDQU64mrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), + VR512:$src)>; + let AddedComplexity = 20 in { def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src), (bc_v8i64 (v16i32 immAllZerosV)))), @@ -3112,6 +3149,17 @@ def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; +def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), + (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr + (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))), + (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), + (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr + (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src), (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)), @@ -3715,7 +3763,7 @@ defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512, EVEX_CD8<32, CD8VF>; def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1), - imm:$src2, (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), + imm:$src2, (v16f32 VR512:$src1), (i16 -1), FROUND_CURRENT)), (VRNDSCALEPSZr VR512:$src1, imm:$src2)>; @@ -3725,7 +3773,7 @@ defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512, VEX_W, EVEX_CD8<64, CD8VF>; def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1), - imm:$src2, (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), + imm:$src2, (v8f64 VR512:$src1), (i8 -1), FROUND_CURRENT)), (VRNDSCALEPDZr VR512:$src1, imm:$src2)>; @@ -3807,7 +3855,13 @@ multiclass avx512_trunc_sat opc, string OpcodeStr, !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"), []>, EVEX; - def krr : AVX512XS8I, EVEX, EVEX_K; + + def rrkz : AVX512XS8I opc, string OpcodeStr, def mr : AVX512XS8I, EVEX; + + def mrk : AVX512XS8I, EVEX, EVEX_K; + } defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; @@ -3855,60 +3915,86 @@ def : Pat<(v16i8 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr VR512:$src)>; def : Pat<(v8i32 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQDrr VR512:$src)>; def : Pat<(v16i8 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDBkrr VK16WM:$mask, VR512:$src)>; + (VPMOVDBrrkz VK16WM:$mask, VR512:$src)>; def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDWkrr VK16WM:$mask, VR512:$src)>; + (VPMOVDWrrkz VK16WM:$mask, VR512:$src)>; def : Pat<(v8i16 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQWkrr 
VK8WM:$mask, VR512:$src)>; + (VPMOVQWrrkz VK8WM:$mask, VR512:$src)>; def : Pat<(v8i32 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQDkrr VK8WM:$mask, VR512:$src)>; + (VPMOVQDrrkz VK8WM:$mask, VR512:$src)>; -multiclass avx512_extend opc, string OpcodeStr, RegisterClass DstRC, - RegisterClass SrcRC, SDNode OpNode, PatFrag mem_frag, - X86MemOperand x86memop, ValueType OpVT, ValueType InVT> { +multiclass avx512_extend opc, string OpcodeStr, RegisterClass KRC, + RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode, + PatFrag mem_frag, X86MemOperand x86memop, + ValueType OpVT, ValueType InVT> { def rr : AVX5128I, EVEX; - def rm : AVX5128I, EVEX, EVEX_K; + + def rrkz : AVX5128I, EVEX, EVEX_KZ; + + let mayLoad = 1 in { + def rm : AVX5128I, EVEX; + + def rmk : AVX5128I, + EVEX, EVEX_K; + + def rmkz : AVX5128I, + EVEX, EVEX_KZ; + } } -defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VR512, VR128X, X86vzext, +defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VK16WM, VR512, VR128X, X86vzext, memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VR512, VR128X, X86vzext, +defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VK8WM, VR512, VR128X, X86vzext, memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VR512, VR256X, X86vzext, +defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VK16WM, VR512, VR256X, X86vzext, memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VR512, VR128X, X86vzext, +defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VK8WM, VR512, VR128X, X86vzext, memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VR512, VR256X, X86vzext, +defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VK8WM, VR512, VR256X, X86vzext, memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512, EVEX_CD8<32, CD8VH>; - -defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VR512, VR128X, X86vsext, + +defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VK16WM, VR512, VR128X, X86vsext, memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VR512, VR128X, X86vsext, +defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VK8WM, VR512, VR128X, X86vsext, memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VR512, VR256X, X86vsext, +defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VK16WM, VR512, VR256X, X86vsext, memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VR512, VR128X, X86vsext, +defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VK8WM, VR512, VR128X, X86vsext, memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VR512, VR256X, X86vsext, +defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VK8WM, VR512, VR256X, X86vsext, memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512, EVEX_CD8<32, CD8VH>; @@ -3984,6 +4070,62 @@ defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>, defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; +// prefetch +multiclass avx512_gather_scatter_prefetch opc, Format F, string OpcodeStr, + RegisterClass KRC, X86MemOperand memop> { + let Predicates = [HasPFI], hasSideEffects = 1 in + def m : AVX5128I, 
EVEX, EVEX_K; +} + +defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", + VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd", + VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd", + VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd", + VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; //===----------------------------------------------------------------------===// // VSHUFPS - VSHUFPD Operations @@ -4200,3 +4342,19 @@ def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1, GR8:$mask), (VPCONFLICTQrrk VR512:$src1, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; + +def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; +def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; +def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; + +def : Pat<(store VK1:$src, addr:$dst), + (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK16))>; + +def truncstorei1 : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i1; +}]>; + +def : Pat<(truncstorei1 GR8:$src, addr:$dst), + (MOV8mr addr:$dst, GR8:$src)>; + diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h index aaef4a4..e421f8c 100644 --- a/lib/Target/X86/X86InstrBuilder.h +++ b/lib/Target/X86/X86InstrBuilder.h @@ -52,7 +52,8 @@ struct X86AddressMode { unsigned GVOpFlags; X86AddressMode() - : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0), GVOpFlags(0) { + : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(nullptr), + GVOpFlags(0) { Base.Reg = 
0; } diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 401849f..34d8fb9 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -1187,9 +1187,9 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue()); APInt KnownZero0, KnownOne0; - CurDAG->ComputeMaskedBits(N->getOperand(0), KnownZero0, KnownOne0, 0); + CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0); APInt KnownZero1, KnownOne1; - CurDAG->ComputeMaskedBits(N->getOperand(1), KnownZero1, KnownOne1, 0); + CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0); return (~KnownZero0 & ~KnownZero1) == 0; }]>; diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index df6c9da..c0a6864 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -19,8 +19,9 @@ let Constraints = "$src1 = $dst" in { multiclass fma3p_rm opc, string OpcodeStr, PatFrag MemFrag128, PatFrag MemFrag256, ValueType OpVT128, ValueType OpVT256, + bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0, SDPatternOperator Op = null_frag> { - let usesCustomInserter = 1 in + let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in def r : FMA3 opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, VR128:$src3)))]>; - let mayLoad = 1 in + let mayLoad = 1, isCommutable = IsMVariantCommutable in def m : FMA3 opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, (MemFrag128 addr:$src3))))]>; - let usesCustomInserter = 1 in + let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in def rY : FMA3 opc, string OpcodeStr, [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1, VR256:$src3)))]>, VEX_L; - let mayLoad = 1 in + let mayLoad = 1, isCommutable = IsMVariantCommutable in def mY : FMA3 opc132, bits<8> opc213, bits<8> opc231, string OpcodeStr, string PackTy, PatFrag MemFrag128, PatFrag MemFrag256, SDNode Op, ValueType OpTy128, ValueType OpTy256> { - let isCommutable = 1 in + // For 213, both the register and memory variant are commutable. + // Indeed, the commutable operands are 1 and 2 and both live in registers + // for both variants. defm r213 : fma3p_rm; + MemFrag128, MemFrag256, OpTy128, OpTy256, + /* IsRVariantCommutable */ 1, + /* IsMVariantCommutable */ 1, + Op>; let neverHasSideEffects = 1 in { defm r132 : fma3p_rm; - let isCommutable = 1 in + // For 231, only the register variant is commutable. + // For the memory variant the folded operand must be in 3. Thus, + // in that case, it cannot be swapped with 2. 
defm r231 : fma3p_rm; + MemFrag128, MemFrag256, OpTy128, OpTy256, + /* IsRVariantCommutable */ 1, + /* IsMVariantCommutable */ 0>; } // neverHasSideEffects = 1 } @@ -119,8 +129,9 @@ let ExeDomain = SSEPackedDouble in { let Constraints = "$src1 = $dst" in { multiclass fma3s_rm opc, string OpcodeStr, X86MemOperand x86memop, RegisterClass RC, ValueType OpVT, PatFrag mem_frag, + bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0, SDPatternOperator OpNode = null_frag> { - let usesCustomInserter = 1 in + let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in def r : FMA3 opc, string OpcodeStr, X86MemOperand x86memop, [(set RC:$dst, (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>; - let mayLoad = 1 in + let mayLoad = 1, isCommutable = IsMVariantCommutable in def m : FMA3 opc132, bits<8> opc213, bits<8> opc231, let neverHasSideEffects = 1 in { defm r132 : fma3s_rm; - let isCommutable = 1 in + // See the other defm of r231 for the explanation regarding the + // commutable flags. defm r231 : fma3s_rm; + x86memop, RC, OpVT, mem_frag, + /* IsRVariantCommutable */ 1, + /* IsMVariantCommutable */ 0>; } -let isCommutable = 1 in +// See the other defm of r213 for the explanation regarding the +// commutable flags. defm r213 : fma3s_rm; + x86memop, RC, OpVT, mem_frag, + /* IsRVariantCommutable */ 1, + /* IsMVariantCommutable */ 1, + OpNode>; } multiclass fma3s opc132, bits<8> opc213, bits<8> opc231, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 486e5a9..1582f43 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -81,7 +81,7 @@ def X86pinsrb : SDNode<"X86ISD::PINSRB", def X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; -def X86insrtps : SDNode<"X86ISD::INSERTPS", +def X86insertps : SDNode<"X86ISD::INSERTPS", SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>, SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>; def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", @@ -175,6 +175,9 @@ def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>; def X86pmuludq : SDNode<"X86ISD::PMULUDQ", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>]>>; +def X86pmuldq : SDNode<"X86ISD::PMULDQ", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<1,2>]>>; // Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get // translated into one of the target nodes below during lowering. diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 6450f2a..6993577 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -36,11 +36,13 @@ #include "llvm/Target/TargetOptions.h" #include +using namespace llvm; + +#define DEBUG_TYPE "x86-instr-info" + #define GET_INSTRINFO_CTOR_DTOR #include "X86GenInstrInfo.inc" -using namespace llvm; - static cl::opt NoFusing("disable-spill-fusing", cl::desc("Disable fusing of spill code into instructions")); @@ -1511,12 +1513,14 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, /// operand and follow operands form a reference to the stack frame. 
bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op, int &FrameIndex) const { - if (MI->getOperand(Op).isFI() && MI->getOperand(Op+1).isImm() && - MI->getOperand(Op+2).isReg() && MI->getOperand(Op+3).isImm() && - MI->getOperand(Op+1).getImm() == 1 && - MI->getOperand(Op+2).getReg() == 0 && - MI->getOperand(Op+3).getImm() == 0) { - FrameIndex = MI->getOperand(Op).getIndex(); + if (MI->getOperand(Op+X86::AddrBaseReg).isFI() && + MI->getOperand(Op+X86::AddrScaleAmt).isImm() && + MI->getOperand(Op+X86::AddrIndexReg).isReg() && + MI->getOperand(Op+X86::AddrDisp).isImm() && + MI->getOperand(Op+X86::AddrScaleAmt).getImm() == 1 && + MI->getOperand(Op+X86::AddrIndexReg).getReg() == 0 && + MI->getOperand(Op+X86::AddrDisp).getImm() == 0) { + FrameIndex = MI->getOperand(Op+X86::AddrBaseReg).getIndex(); return true; } return false; @@ -1680,15 +1684,16 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::FsMOVAPSrm: case X86::FsMOVAPDrm: { // Loads from constant pools are trivially rematerializable. - if (MI->getOperand(1).isReg() && - MI->getOperand(2).isImm() && - MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 && + if (MI->getOperand(1+X86::AddrBaseReg).isReg() && + MI->getOperand(1+X86::AddrScaleAmt).isImm() && + MI->getOperand(1+X86::AddrIndexReg).isReg() && + MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 && MI->isInvariantLoad(AA)) { - unsigned BaseReg = MI->getOperand(1).getReg(); + unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg(); if (BaseReg == 0 || BaseReg == X86::RIP) return true; // Allow re-materialization of PIC load. - if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal()) + if (!ReMatPICStubLoad && MI->getOperand(1+X86::AddrDisp).isGlobal()) return false; const MachineFunction &MF = *MI->getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -1699,13 +1704,14 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::LEA32r: case X86::LEA64r: { - if (MI->getOperand(2).isImm() && - MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 && - !MI->getOperand(4).isReg()) { + if (MI->getOperand(1+X86::AddrScaleAmt).isImm() && + MI->getOperand(1+X86::AddrIndexReg).isReg() && + MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 && + !MI->getOperand(1+X86::AddrDisp).isReg()) { // lea fi#, lea GV, etc. are all rematerializable. - if (!MI->getOperand(1).isReg()) + if (!MI->getOperand(1+X86::AddrBaseReg).isReg()) return true; - unsigned BaseReg = MI->getOperand(1).getReg(); + unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg(); if (BaseReg == 0) return true; // Allow re-materialization of lea PICBase + x. @@ -1722,12 +1728,8 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, return true; } -/// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction that -/// would clobber the EFLAGS condition register. Note the result may be -/// conservative. If it cannot definitely determine the safety after visiting -/// a few instructions in each direction it assumes it's not safe. 
-static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { +bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { MachineBasicBlock::iterator E = MBB.end(); // For compile time consideration, if we are not able to determine the @@ -1998,7 +2000,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, unsigned Src2 = MI->getOperand(2).getReg(); bool isKill2 = MI->getOperand(2).isKill(); unsigned leaInReg2 = 0; - MachineInstr *InsMI2 = 0; + MachineInstr *InsMI2 = nullptr; if (Src == Src2) { // ADD16rr %reg1028, %reg1028 // just a single insert_subreg. @@ -2062,14 +2064,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // convert them to equivalent lea if the condition code register def's // are dead! if (hasLiveCondCodeDef(MI)) - return 0; + return nullptr; MachineFunction &MF = *MI->getParent()->getParent(); // All instructions input are two-addr instructions. Get the known operands. const MachineOperand &Dest = MI->getOperand(0); const MachineOperand &Src = MI->getOperand(1); - MachineInstr *NewMI = NULL; + MachineInstr *NewMI = nullptr; // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When // we have better subtarget support, enable the 16-bit LEA generation here. // 16-bit LEA is also slow on Core2. @@ -2080,11 +2082,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, switch (MIOpc) { case X86::SHUFPSrri: { assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!"); - if (!TM.getSubtarget().hasSSE2()) return 0; + if (!TM.getSubtarget().hasSSE2()) return nullptr; unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); - if (B != C) return 0; + if (B != C) return nullptr; unsigned M = MI->getOperand(3).getImm(); NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri)) .addOperand(Dest).addOperand(Src).addImm(M); @@ -2092,11 +2094,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } case X86::SHUFPDrri: { assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!"); - if (!TM.getSubtarget().hasSSE2()) return 0; + if (!TM.getSubtarget().hasSSE2()) return nullptr; unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); - if (B != C) return 0; + if (B != C) return nullptr; unsigned M = MI->getOperand(3).getImm(); // Convert to PSHUFD mask. @@ -2109,13 +2111,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::SHL64ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); - if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; // LEA can't handle RSP. if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && !MF.getRegInfo().constrainRegClass(Src.getReg(), &X86::GR64_NOSPRegClass)) - return 0; + return nullptr; NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) .addOperand(Dest) @@ -2125,7 +2127,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::SHL32ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); - if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; unsigned Opc = is64Bit ? 
X86::LEA64_32r : X86::LEA32r; @@ -2135,7 +2137,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest) @@ -2151,10 +2153,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::SHL16ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); - if (!isTruncatedShiftCountForLEA(ShAmt)) return 0; + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : nullptr; NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) .addOperand(Dest) .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); @@ -2163,7 +2165,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, default: { switch (MIOpc) { - default: return 0; + default: return nullptr; case X86::INC64r: case X86::INC32r: case X86::INC64_32r: { @@ -2175,7 +2177,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest) @@ -2189,7 +2191,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::INC16r: case X86::INC64_16r: if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) .addOperand(Dest).addOperand(Src), 1); @@ -2206,7 +2209,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest) @@ -2221,7 +2224,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::DEC16r: case X86::DEC64_16r: if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) .addOperand(Dest).addOperand(Src), -1); @@ -2242,7 +2246,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; const MachineOperand &Src2 = MI->getOperand(2); bool isKill2, isUndef2; @@ -2250,7 +2254,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, SrcReg2, isKill2, isUndef2, ImplicitOp2)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest); @@ -2272,7 +2276,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD16rr: case X86::ADD16rr_DB: { if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Src2 = MI->getOperand(2).getReg(); bool isKill2 = MI->getOperand(2).isKill(); @@ -2311,7 +2316,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, isUndef, ImplicitOp)) - return 0; + return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) .addOperand(Dest) @@ -2327,7 +2332,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD16ri_DB: case X86::ADD16ri8_DB: if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) .addOperand(Dest).addOperand(Src), @@ -2337,7 +2343,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } } - if (!NewMI) return 0; + if (!NewMI) return nullptr; if (LV) { // Update live variables if (Src.isKill()) @@ -2789,11 +2795,11 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, std::next(I)->eraseFromParent(); Cond.clear(); - FBB = 0; + FBB = nullptr; // Delete the JMP if it's equivalent to a fall-through. if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { - TBB = 0; + TBB = nullptr; I->eraseFromParent(); I = MBB.end(); UnCondBrIter = MBB.end(); @@ -3549,6 +3555,26 @@ inline static bool isDefConvertible(MachineInstr *MI) { } } +/// isUseDefConvertible - check whether the use can be converted +/// to remove a comparison against zero. 
+static X86::CondCode isUseDefConvertible(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return X86::COND_INVALID; + case X86::LZCNT16rr: case X86::LZCNT16rm: + case X86::LZCNT32rr: case X86::LZCNT32rm: + case X86::LZCNT64rr: case X86::LZCNT64rm: + return X86::COND_B; + case X86::POPCNT16rr:case X86::POPCNT16rm: + case X86::POPCNT32rr:case X86::POPCNT32rm: + case X86::POPCNT64rr:case X86::POPCNT64rm: + return X86::COND_E; + case X86::TZCNT16rr: case X86::TZCNT16rm: + case X86::TZCNT32rr: case X86::TZCNT32rm: + case X86::TZCNT64rr: case X86::TZCNT64rm: + return X86::COND_B; + } +} + /// optimizeCompareInstr - Check if there exists an earlier instruction that /// operates on the same source operands and sets flags in the same way as /// Compare; remove Compare if possible. @@ -3615,13 +3641,38 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // If we are comparing against zero, check whether we can use MI to update // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize. bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0); - if (IsCmpZero && (MI->getParent() != CmpInstr->getParent() || - !isDefConvertible(MI))) + if (IsCmpZero && MI->getParent() != CmpInstr->getParent()) return false; + // If we have a use of the source register between the def and our compare + // instruction we can eliminate the compare iff the use sets EFLAGS in the + // right way. + bool ShouldUpdateCC = false; + X86::CondCode NewCC = X86::COND_INVALID; + if (IsCmpZero && !isDefConvertible(MI)) { + // Scan forward from the def until we hit the use we're looking for or the + // compare instruction. + for (MachineBasicBlock::iterator J = MI;; ++J) { + // Do we have a convertible instruction? + NewCC = isUseDefConvertible(J); + if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() && + J->getOperand(1).getReg() == SrcReg) { + assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!"); + ShouldUpdateCC = true; // Update CC later on. + // This is not a def of SrcReg, but still a def of EFLAGS. Keep going + // with the new def. + MI = Def = J; + break; + } + + if (J == I) + return false; + } + } + // We are searching for an earlier instruction that can make CmpInstr // redundant and that instruction will be saved in Sub. - MachineInstr *Sub = NULL; + MachineInstr *Sub = nullptr; const TargetRegisterInfo *TRI = &getRegisterInfo(); // We iterate backward, starting from the instruction before CmpInstr and @@ -3634,7 +3685,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, RE = CmpInstr->getParent() == MI->getParent() ? MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ : CmpInstr->getParent()->rend(); - MachineInstr *Movr0Inst = 0; + MachineInstr *Movr0Inst = nullptr; for (; RI != RE; ++RI) { MachineInstr *Instr = &*RI; // Check whether CmpInstr can be made redundant by the current instruction. @@ -3716,13 +3767,28 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // CF and OF are used, we can't perform this optimization. return false; } + + // If we're updating the condition code check if we have to reverse the + // condition. + if (ShouldUpdateCC) + switch (OldCC) { + default: + return false; + case X86::COND_E: + break; + case X86::COND_NE: + NewCC = GetOppositeBranchCondition(NewCC); + break; + } } else if (IsSwapped) { // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
// We swap the condition code and synthesize the new opcode. - X86::CondCode NewCC = getSwappedCondition(OldCC); + NewCC = getSwappedCondition(OldCC); if (NewCC == X86::COND_INVALID) return false; + } + if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) { // Synthesize the new opcode. bool HasMemoryOperand = Instr.hasOneMemOperand(); unsigned NewOpc; @@ -3809,19 +3875,19 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, MachineInstr *&DefMI) const { if (FoldAsLoadDefReg == 0) - return 0; + return nullptr; // To be conservative, if there exists another load, clear the load candidate. if (MI->mayLoad()) { FoldAsLoadDefReg = 0; - return 0; + return nullptr; } // Check whether we can move DefMI here. DefMI = MRI->getVRegDef(FoldAsLoadDefReg); assert(DefMI); bool SawStore = false; - if (!DefMI->isSafeToMove(this, 0, SawStore)) - return 0; + if (!DefMI->isSafeToMove(this, nullptr, SawStore)) + return nullptr; // We try to commute MI if possible. unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1; @@ -3838,12 +3904,12 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, continue; // Do not fold if we have a subreg use or a def or multiple uses. if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) - return 0; + return nullptr; SrcOperandId = i; FoundSrcOperand = true; } - if (!FoundSrcOperand) return 0; + if (!FoundSrcOperand) return nullptr; // Check whether we can fold the def into SrcOperandId. SmallVector Ops; @@ -3857,22 +3923,22 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, if (Idx == 1) { // MI was changed but it didn't help, commute it back! commuteInstruction(MI, false); - return 0; + return nullptr; } // Check whether we can commute MI and enable folding. if (MI->isCommutable()) { MachineInstr *NewMI = commuteInstruction(MI, false); // Unable to commute. - if (!NewMI) return 0; + if (!NewMI) return nullptr; if (NewMI != MI) { // New instruction. It doesn't need to be kept. NewMI->eraseFromParent(); - return 0; + return nullptr; } } } - return 0; + return nullptr; } /// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr @@ -4007,7 +4073,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, unsigned i, const SmallVectorImpl &MOs, unsigned Size, unsigned Align) const { - const DenseMap > *OpcodeTablePtr = 0; + const DenseMap > *OpcodeTablePtr = nullptr; bool isCallRegIndirect = TM.getSubtarget().callRegIndirect(); bool isTwoAddrFold = false; @@ -4015,7 +4082,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // when X86Subtarget is Atom. if (isCallRegIndirect && (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) { - return NULL; + return nullptr; } unsigned NumOps = MI->getDesc().getNumOperands(); @@ -4026,9 +4093,9 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. if (MI->getOpcode() == X86::ADD32ri && MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) - return NULL; + return nullptr; - MachineInstr *NewMI = NULL; + MachineInstr *NewMI = nullptr; // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. 
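The compare-elimination change above relies on one architectural fact: LZCNT and TZCNT set CF exactly when their input is zero, and POPCNT sets ZF exactly when its input is zero, which is why isUseDefConvertible maps them to COND_B and COND_E and the following test/cmp against zero adds no information. A small self-contained C++20 sanity check of that equivalence (plain scalar code standing in for the instructions, not the LLVM hook itself):

  #include <bit>
  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t samples[] = {0, 1, 2, 0x80, 0xdeadbeefULL, ~0ULL};
    for (uint64_t x : samples) {
      // ZF after popcnt: the result is zero iff the input is zero.
      assert((std::popcount(x) == 0) == (x == 0));
      // CF after lzcnt/tzcnt: the count equals the bit width iff the input
      // is zero.
      assert((std::countl_zero(x) == 64) == (x == 0));
      assert((std::countr_zero(x) == 64) == (x == 0));
    }
    return 0;
  }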
@@ -4063,7 +4130,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, unsigned Opcode = I->second.first; unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; if (Align < MinAlign) - return NULL; + return nullptr; bool NarrowToMOV32rm = false; if (Size) { unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize(); @@ -4071,12 +4138,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) - return NULL; + return nullptr; // If this is a 64-bit load, but the spill slot is 32, then we can do // a 32-bit load which is implicitly zero-extended. This likely is due // to liveintervalanalysis remat'ing a load from stack slot. if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg()) - return NULL; + return nullptr; Opcode = X86::MOV32rm; NarrowToMOV32rm = true; } @@ -4105,7 +4172,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // No fusion if (PrintFailedFusing && !MI->isCopy()) dbgs() << "We failed to fuse operand " << i << " in " << *MI; - return NULL; + return nullptr; } /// hasPartialRegUpdate - Return true for all instructions that only update @@ -4270,14 +4337,14 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl &Ops, int FrameIndex) const { // Check switch flag - if (NoFusing) return NULL; + if (NoFusing) return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls if (!MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && hasPartialRegUpdate(MI->getOpcode())) - return 0; + return nullptr; const MachineFrameInfo *MFI = MF.getFrameInfo(); unsigned Size = MFI->getObjectSize(FrameIndex); @@ -4290,7 +4357,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, unsigned NewOpc = 0; unsigned RCSize = 0; switch (MI->getOpcode()) { - default: return NULL; + default: return nullptr; case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break; case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break; case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break; @@ -4299,12 +4366,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. if (Size < RCSize) - return NULL; + return nullptr; // Change to CMPXXri r, 0 first. MI->setDesc(get(NewOpc)); MI->getOperand(1).ChangeToImmediate(0); } else if (Ops.size() != 1) - return NULL; + return nullptr; SmallVector MOs; MOs.push_back(MachineOperand::CreateFI(FrameIndex)); @@ -4322,14 +4389,14 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex); // Check switch flag - if (NoFusing) return NULL; + if (NoFusing) return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls if (!MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && hasPartialRegUpdate(MI->getOpcode())) - return 0; + return nullptr; // Determine the alignment of the load. 
unsigned Alignment = 0; @@ -4352,12 +4419,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = 4; break; default: - return 0; + return nullptr; } if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; switch (MI->getOpcode()) { - default: return NULL; + default: return nullptr; case X86::TEST8rr: NewOpc = X86::CMP8ri; break; case X86::TEST16rr: NewOpc = X86::CMP16ri8; break; case X86::TEST32rr: NewOpc = X86::CMP32ri8; break; @@ -4367,12 +4434,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MI->setDesc(get(NewOpc)); MI->getOperand(1).ChangeToImmediate(0); } else if (Ops.size() != 1) - return NULL; + return nullptr; // Make sure the subregisters match. // Otherwise we risk changing the size of the load. if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg()) - return NULL; + return nullptr; SmallVector MOs; switch (LoadMI->getOpcode()) { @@ -4388,7 +4455,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Medium and large mode can't fold loads this way. if (TM.getCodeModel() != CodeModel::Small && TM.getCodeModel() != CodeModel::Kernel) - return NULL; + return nullptr; // x86-32 PIC requires a PIC base register for constant pools. unsigned PICBase = 0; @@ -4400,7 +4467,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // This doesn't work for several reasons. // 1. GlobalBaseReg may have been spilled. // 2. It may not be live at MI. - return NULL; + return nullptr; } // Create a constant-pool entry. @@ -4436,14 +4503,14 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, > 4) // These instructions only load 32 bits, we can't fold them if the // destination register is wider than 32 bits (4 bytes). - return NULL; + return nullptr; if ((LoadMI->getOpcode() == X86::MOVSDrm || LoadMI->getOpcode() == X86::VMOVSDrm) && MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize() > 8) // These instructions only load 64 bits, we can't fold them if the // destination register is wider than 64 bits (8 bytes). - return NULL; + return nullptr; // Folding a normal load. Just copy the load's address operands. for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) @@ -4489,7 +4556,8 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. - const DenseMap > *OpcodeTablePtr = 0; + const DenseMap > *OpcodeTablePtr = nullptr; if (isTwoAddr && NumOps >= 2 && OpNum < 2) { OpcodeTablePtr = &RegOp2MemOpTable2Addr; } else if (OpNum == 0) { // If operand 0 @@ -4671,7 +4739,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, AddrOps.push_back(Chain); // Emit the load instruction. 
- SDNode *Load = 0; + SDNode *Load = nullptr; if (FoldedLoad) { EVT VT = *RC->vt_begin(); std::pair VTs; - const TargetRegisterClass *DstRC = 0; + const TargetRegisterClass *DstRC = nullptr; if (MCID.getNumDefs() > 0) { DstRC = getRegClass(MCID, 0, &RI, MF); VTs.push_back(*DstRC->vt_begin()); @@ -5190,14 +5258,14 @@ static const uint16_t *lookup(unsigned opcode, unsigned domain) { for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) if (ReplaceableInstrs[i][domain-1] == opcode) return ReplaceableInstrs[i]; - return 0; + return nullptr; } static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) if (ReplaceableInstrsAVX2[i][domain-1] == opcode) return ReplaceableInstrsAVX2[i]; - return 0; + return nullptr; } std::pair @@ -5327,8 +5395,10 @@ namespace { const X86TargetMachine *TM = static_cast<const X86TargetMachine *>(&MF.getTarget()); - assert(!TM->getSubtarget<X86Subtarget>().is64Bit() && - "X86-64 PIC uses RIP relative addressing"); + // Don't do anything if this is 64-bit as 64-bit PIC + // uses RIP relative addressing. + if (TM->getSubtarget<X86Subtarget>().is64Bit()) + return false; // Only emit a global base reg in PIC mode. if (TM->getRelocationModel() != Reloc::PIC_) @@ -5383,7 +5453,7 @@ namespace { char CGBR::ID = 0; FunctionPass* -llvm::createGlobalBaseRegPass() { return new CGBR(); } +llvm::createX86GlobalBaseRegPass() { return new CGBR(); } namespace { struct LDTLSCleanup : public MachineFunctionPass { diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 156291e..5f34915 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -325,7 +325,7 @@ public: /// value. unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex = 0) const override; + unsigned *LoadRegIndex = nullptr) const override; /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler /// to determine if two loads are loading from the same base address. It @@ -359,6 +359,13 @@ public: /// instruction that defines the specified register class. bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; + /// isSafeToClobberEFLAGS - Return true if it's safe to insert an instruction that /// would clobber the EFLAGS condition register. Note the result may be /// conservative. If it cannot definitely determine the safety after visiting /// a few instructions in each direction it assumes it's not safe.
+ bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + static bool isX86_64ExtendedReg(const MachineOperand &MO) { if (!MO.isReg()) return false; return X86II::isX86_64ExtendedReg(MO.getReg()); diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 8edf873..0d97669 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -206,6 +206,8 @@ def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr, def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; @@ -249,7 +251,6 @@ def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags, def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags, [SDNPCommutative]>; -def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntShiftOp>; def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>; def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; @@ -2001,6 +2002,46 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in { (implicit EFLAGS)]>, XS; } +let Predicates = [HasLZCNT] in { + def : Pat<(X86cmov (ctlz GR16:$src), (i16 16), (X86_COND_E), + (X86cmp GR16:$src, (i16 0))), + (LZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E), + (X86cmp GR32:$src, (i32 0))), + (LZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E), + (X86cmp GR64:$src, (i64 0))), + (LZCNT64rr GR64:$src)>; + def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E), + (X86cmp GR16:$src, (i16 0))), + (LZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E), + (X86cmp GR32:$src, (i32 0))), + (LZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E), + (X86cmp GR64:$src, (i64 0))), + (LZCNT64rr GR64:$src)>; + + def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E), + (X86cmp (loadi16 addr:$src), (i16 0))), + (LZCNT16rm addr:$src)>; + def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E), + (X86cmp (loadi32 addr:$src), (i32 0))), + (LZCNT32rm addr:$src)>; + def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E), + (X86cmp (loadi64 addr:$src), (i64 0))), + (LZCNT64rm addr:$src)>; + def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), (X86_COND_E), + (X86cmp (loadi16 addr:$src), (i16 0))), + (LZCNT16rm addr:$src)>; + def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E), + (X86cmp (loadi32 addr:$src), (i32 0))), + (LZCNT32rm addr:$src)>; + def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E), + (X86cmp (loadi64 addr:$src), (i64 0))), + (LZCNT64rm addr:$src)>; +} + //===----------------------------------------------------------------------===// // BMI Instructions // @@ -2077,6 +2118,47 @@ let Predicates = [HasBMI] in { (BLSI64rr GR64:$src)>; } +let Predicates = [HasBMI] in { + def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E), + (X86cmp GR16:$src, (i16 0))), + (TZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E), + (X86cmp GR32:$src, (i32 0))), + (TZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E), + (X86cmp GR64:$src, (i64 0))), + (TZCNT64rr GR64:$src)>; + def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E), + (X86cmp GR16:$src, (i16 0))), + 
(TZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E), + (X86cmp GR32:$src, (i32 0))), + (TZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E), + (X86cmp GR64:$src, (i64 0))), + (TZCNT64rr GR64:$src)>; + + def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), (X86_COND_E), + (X86cmp (loadi16 addr:$src), (i16 0))), + (TZCNT16rm addr:$src)>; + def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E), + (X86cmp (loadi32 addr:$src), (i32 0))), + (TZCNT32rm addr:$src)>; + def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E), + (X86cmp (loadi64 addr:$src), (i64 0))), + (TZCNT64rm addr:$src)>; + def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E), + (X86cmp (loadi16 addr:$src), (i16 0))), + (TZCNT16rm addr:$src)>; + def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E), + (X86cmp (loadi32 addr:$src), (i32 0))), + (TZCNT32rm addr:$src)>; + def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E), + (X86cmp (loadi64 addr:$src), (i64 0))), + (TZCNT64rm addr:$src)>; +} + + multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC, X86MemOperand x86memop, Intrinsic Int, PatFrag ld_frag> { @@ -2104,18 +2186,38 @@ let Predicates = [HasBMI2], Defs = [EFLAGS] in { int_x86_bmi_bzhi_64, loadi64>, VEX_W; } -def : Pat<(X86bzhi GR32:$src1, GR8:$src2), - (BZHI32rr GR32:$src1, - (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; -def : Pat<(X86bzhi (loadi32 addr:$src1), GR8:$src2), - (BZHI32rm addr:$src1, - (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; -def : Pat<(X86bzhi GR64:$src1, GR8:$src2), - (BZHI64rr GR64:$src1, - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; -def : Pat<(X86bzhi (loadi64 addr:$src1), GR8:$src2), - (BZHI64rm addr:$src1, - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + +def CountTrailingOnes : SDNodeXForm<imm, [{ + return getI8Imm(CountTrailingOnes_64(N->getZExtValue())); +}]>; + +def BZHIMask : ImmLeaf<i64, [{ + return isMask_64(Imm) && (CountTrailingOnes_64(Imm) > 32); +}]>; + +let Predicates = [HasBMI2] in { + def : Pat<(and GR64:$src, BZHIMask:$mask), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>; + + def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)), + (BZHI32rr GR32:$src, + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + def : Pat<(and (loadi32 addr:$src), (add (shl 1, GR8:$lz), -1)), + (BZHI32rm addr:$src, + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + def : Pat<(and GR64:$src, (add (shl 1, GR8:$lz), -1)), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; +} // HasBMI2 let Predicates = [HasBMI] in { def : Pat<(X86bextr GR32:$src1, GR32:$src2), @@ -2617,21 +2719,21 @@ def : InstAlias<"fnstsw" , (FNSTSW16r)>; // lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but // this is compatible with what GAS does.
-def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; -def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; -def : InstAlias<"lcall *$dst", (FARCALL32m opaque48mem:$dst)>, Requires<[Not16BitMode]>; -def : InstAlias<"ljmp *$dst", (FARJMP32m opaque48mem:$dst)>, Requires<[Not16BitMode]>; -def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; -def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; -def : InstAlias<"lcall *$dst", (FARCALL16m opaque32mem:$dst)>, Requires<[In16BitMode]>; -def : InstAlias<"ljmp *$dst", (FARJMP16m opaque32mem:$dst)>, Requires<[In16BitMode]>; - -def : InstAlias<"call *$dst", (CALL64m i16mem:$dst)>, Requires<[In64BitMode]>; -def : InstAlias<"jmp *$dst", (JMP64m i16mem:$dst)>, Requires<[In64BitMode]>; -def : InstAlias<"call *$dst", (CALL32m i16mem:$dst)>, Requires<[In32BitMode]>; -def : InstAlias<"jmp *$dst", (JMP32m i16mem:$dst)>, Requires<[In32BitMode]>; -def : InstAlias<"call *$dst", (CALL16m i16mem:$dst)>, Requires<[In16BitMode]>; -def : InstAlias<"jmp *$dst", (JMP16m i16mem:$dst)>, Requires<[In16BitMode]>; +def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall *$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp *$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall *$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp *$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; + +def : InstAlias<"call *$dst", (CALL64m i16mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"jmp *$dst", (JMP64m i16mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"call *$dst", (CALL32m i16mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"jmp *$dst", (JMP32m i16mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"call *$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"jmp *$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; // "imul , B" is an alias for "imul , B, B". @@ -2664,11 +2766,11 @@ def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; // Force mov without a suffix with a segment and mem to prefer the 'l' form of // the move. All segment/mem forms are equivalent, this has the shortest // encoding. -def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem)>; -def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg)>; +def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; +def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; // Match 'movq , ' as an alias for movabsq. -def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm)>; +def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm), 0>; // Match 'movq GR64, MMX' as an alias for movd. 
def : InstAlias<"movq $src, $dst", @@ -2705,7 +2807,7 @@ def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>; // 'sldt ' can be encoded with either sldtw or sldtq with the same // effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity // errors, since its encoding is the most compact. -def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>; +def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>; // shld/shrd op,op -> shld op, op, CL def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>; @@ -2751,19 +2853,29 @@ defm : ShiftRotateByOneAlias<"ror", "ROR">; FIXME */ // test: We accept "testX , " and "testX , " as synonyms. -def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", (TEST8rm GR8 :$val, i8mem :$mem)>; -def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", (TEST16rm GR16:$val, i16mem:$mem)>; -def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", (TEST32rm GR32:$val, i32mem:$mem)>; -def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", (TEST64rm GR64:$val, i64mem:$mem)>; +def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", + (TEST8rm GR8 :$val, i8mem :$mem), 0>; +def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", + (TEST16rm GR16:$val, i16mem:$mem), 0>; +def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", + (TEST32rm GR32:$val, i32mem:$mem), 0>; +def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", + (TEST64rm GR64:$val, i64mem:$mem), 0>; // xchg: We accept "xchgX , " and "xchgX , " as synonyms. -def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", (XCHG8rm GR8 :$val, i8mem :$mem)>; -def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", (XCHG16rm GR16:$val, i16mem:$mem)>; -def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", (XCHG32rm GR32:$val, i32mem:$mem)>; -def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", (XCHG64rm GR64:$val, i64mem:$mem)>; +def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", + (XCHG8rm GR8 :$val, i8mem :$mem), 0>; +def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", + (XCHG16rm GR16:$val, i16mem:$mem), 0>; +def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", + (XCHG32rm GR32:$val, i32mem:$mem), 0>; +def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", + (XCHG64rm GR64:$val, i64mem:$mem), 0>; // xchg: We accept "xchgX , %eax" and "xchgX %eax, " as synonyms. 
-def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src)>; -def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src)>, Requires<[Not64BitMode]>; -def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>; -def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src)>; +def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", + (XCHG32ar GR32:$src), 0>, Requires<[Not64BitMode]>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", + (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 050ee39..ecf80a1 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -254,6 +254,11 @@ let neverHasSideEffects = 1 in def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), "movq\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOVQ_RR>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { +def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src), + "movq\t{$src, $dst|$dst, $src}", [], + IIC_MMX_MOVQ_RR>; +} } // SchedRW let SchedRW = [WriteLoad] in { @@ -262,11 +267,12 @@ def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (load_mmx addr:$src))], IIC_MMX_MOVQ_RM>; +} // SchedRW +let SchedRW = [WriteStore] in def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movq\t{$src, $dst|$dst, $src}", [(store (x86mmx VR64:$src), addr:$dst)], IIC_MMX_MOVQ_RM>; -} // SchedRW let SchedRW = [WriteMove] in { def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index f2f3967..1eb0485 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -1561,9 +1561,9 @@ defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, let Predicates = [UseAVX] in { def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>; + (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>; def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", - (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>; + (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>; def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; @@ -1627,9 +1627,9 @@ def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>; def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}", - (CVTSI2SSrm FR64:$dst, i32mem:$src)>; + (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>; def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", - (CVTSI2SDrm FR64:$dst, i32mem:$src)>; + (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>; // Conversion Instructions Intrinsics - Match intrinsics which expect MM // and/or XMM operand(s). 
@@ -2005,7 +2005,7 @@ def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // XMM only def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", - (VCVTPD2DQrr VR128:$dst, VR128:$src)>; + (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>; def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "vcvtpd2dqx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2024,7 +2024,7 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}", - (VCVTPD2DQYrr VR128:$dst, VR256:$src)>; + (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>; } def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), @@ -2127,7 +2127,7 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // XMM only def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", - (VCVTTPD2DQrr VR128:$dst, VR128:$src)>; + (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>; def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttpd2dqx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq @@ -2146,7 +2146,7 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", - (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>; + (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; let Predicates = [HasAVX] in { def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), @@ -2252,7 +2252,7 @@ def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // XMM only def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", - (VCVTPD2PSrr VR128:$dst, VR128:$src)>; + (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>; def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2psx\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2271,7 +2271,7 @@ def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}", - (VCVTPD2PSYrr VR128:$dst, VR256:$src)>; + (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>; def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", @@ -2973,6 +2973,19 @@ defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>; let isCommutable = 0 in defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>; +// AVX1 requires type coercions in order to fold loads directly into logical +// operations. 
+let Predicates = [HasAVX1Only] in { + def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))), + (VANDPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))), + (VORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))), + (VXORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))), + (VANDNPSYrm VR256:$src1, addr:$src2)>; +} + //===----------------------------------------------------------------------===// // SSE 1 & 2 - Arithmetic Instructions //===----------------------------------------------------------------------===// @@ -3144,23 +3157,23 @@ let Predicates = [UseSSE2] in { let Predicates = [UseSSE41] in { // If the subtarget has SSE4.1 but not AVX, the vector insert - // instruction is lowered into a X86insrtps rather than a X86Movss. + // instruction is lowered into a X86insertps rather than a X86Movss. // When selecting SSE scalar single-precision fp arithmetic instructions, - // make sure that we correctly match the X86insrtps. + // make sure that we correctly match the X86insertps. - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -3186,19 +3199,19 @@ let Predicates = [HasAVX] in { (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + def : Pat<(v4f32 
(X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+                       (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                       FR32:$src))), (iPTR 0))),
+            (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
@@ -4068,6 +4081,10 @@ defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                              SSE_INTALUQ_ITINS_P, 1>;
 defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                              SSE_INTMUL_ITINS_P, 1>;
+defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
+                             SSE_INTMUL_ITINS_P, 1>;
+defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
+                             SSE_INTMUL_ITINS_P, 1>;
 defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
                              SSE_INTALU_ITINS_P, 0>;
 defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
@@ -4102,10 +4119,6 @@ defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
                                  int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
 defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
                                  int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
-defm PMULHUW : PDI_binop_all_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w,
-                                 int_x86_avx2_pmulhu_w, SSE_INTMUL_ITINS_P, 1>;
-defm PMULHW  : PDI_binop_all_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w,
-                                 int_x86_avx2_pmulh_w, SSE_INTMUL_ITINS_P, 1>;
 defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
                                  int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
 defm PAVGB   : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
@@ -6515,7 +6528,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
           !strconcat(asm,
                      "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
-        (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
+        (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
       Sched<[WriteFShuffle]>;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, f32mem:$src2, i8imm:$src3),
@@ -6524,7 +6537,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
           !strconcat(asm,
                      "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
-        (X86insrtps VR128:$src1,
+        (X86insertps VR128:$src1,
                    (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                     imm:$src3))], itins.rm>,
       Sched<[WriteFShuffleLd, ReadAfterLd]>;
@@ -6537,6 +6550,29 @@ let ExeDomain = SSEPackedSingle in {
   defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
 }
 
+let Predicates = [UseSSE41] in {
+  // If we're inserting an element from a load or a null pshuf of a load,
+  // fold the load into the insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
+                   (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
+                   imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
+                   (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
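+// Folding is safe here because the memory form of (v)insertps always inserts
+// the 32-bit scalar it loads, so a shuffle that merely repositions or
+// replicates that loaded scalar is redundant.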
+let Predicates = [UseAVX] in {
+  // If we're inserting an element from a vbroadcast of a load, fold the
+  // load into the X86insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                  (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                  (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Round Instructions
 //===----------------------------------------------------------------------===//
@@ -6990,6 +7026,31 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 
+/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
+/// types.
+multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+                           PatFrag memop_frag, X86MemOperand x86memop,
+                           OpndItins itins,
+                           bit IsCommutable = 0, bit Is2Addr = 1> {
+  let isCommutable = IsCommutable in
+  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+       Sched<[itins.Sched]>;
+  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
+                                     (bitconvert (memop_frag addr:$src2)))))]>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
 let Predicates = [HasAVX] in {
   let isCommutable = 0 in
   defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
@@ -7018,8 +7079,9 @@ let Predicates = [HasAVX] in {
   defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
                                   loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                   VEX_4V;
-  defm VPMULDQ   : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq,
-                                      0, DEFAULT_ITINS_VECIMULSCHED>, VEX_4V;
+  defm VPMULDQ   : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
+                                   VR128, loadv2i64, i128mem,
+                                   SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
 }
 
 let Predicates = [HasAVX2] in {
@@ -7051,9 +7113,9 @@ let Predicates = [HasAVX2] in {
   defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
                                   loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                   VEX_4V, VEX_L;
-  defm VPMULDQ   : SS41I_binop_rm_int_y<0x28, "vpmuldq",
-                                        int_x86_avx2_pmul_dq, WriteVecIMul>,
-                                        VEX_4V, VEX_L;
+  defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
+                                  VR256, loadv4i64, i256mem,
+                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -7076,8 +7138,9 @@ let Constraints = "$src1 = $dst" in {
                                  memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
                                  memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
-  defm PMULDQ   : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq,
-                                     1, SSE_INTMUL_ITINS_P>;
+  defm PMULDQ   : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
+                                  VR128, memopv2i64, i128mem,
+                                  SSE_INTMUL_ITINS_P, 1>;
 }
 
 let Predicates = [HasAVX] in {
@@ -7394,6 +7457,7 @@ let Predicates = [UseSSE41] in {
 }
 
+let SchedRW = [WriteLoad] in {
 let Predicates = [HasAVX] in
 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                         "vmovntdqa\t{$src, $dst|$dst, $src}",
@@ -7407,6 +7471,7 @@ def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "movntdqa\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
+} // SchedRW
 
 //===----------------------------------------------------------------------===//
 // SSE4.2 - Compare Instructions
@@ -7831,18 +7896,20 @@ def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
 
 multiclass pclmul_alias<string asm, int immop> {
   def : InstAlias<!strconcat("pclmul", asm, "dq\t{$src, $dst|$dst, $src}"),
-                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>;
+                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;
 
   def : InstAlias<!strconcat("pclmul", asm, "dq\t{$src, $dst|$dst, $src}"),
-                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>;
+                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;
 
   def : InstAlias<!strconcat("vpclmul", asm,
                   "dq\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>;
+                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
+                  0>;
 
   def : InstAlias<!strconcat("vpclmul", asm,
                   "dq\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>;
+                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
+                  0>;
 }
 defm : pclmul_alias<"hqhq", 0x11>;
 defm : pclmul_alias<"hqlq", 0x01>;
@@ -8291,6 +8358,12 @@ let Predicates = [HasF16C] in {
   defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
   defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
   defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
+
+  // Pattern match vcvtph2ps of a scalar i64 load.
+  def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
+            (VCVTPH2PSrm addr:$src)>;
+  def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
+            (VCVTPH2PSrm addr:$src)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 9d3aa1c..b5595cb 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -19,7 +19,7 @@ let Defs = [RAX, RDX] in
                   TB;
 
 let Defs = [RAX, RCX, RDX] in
-  def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB;
+  def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB;
 
 // CPU flow control instructions
 
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index e99f2d9..e969ef2 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "jit"
 #include "X86JITInfo.h"
 #include "X86Relocations.h"
 #include "X86Subtarget.h"
@@ -24,6 +23,8 @@
 #include <cstdlib>
 using namespace llvm;
 
+#define DEBUG_TYPE "jit"
+
 // Determine the platform we're running on
 #if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64)
 # define X86_64_JIT
@@ -427,9 +428,14 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
   TsanIgnoreWritesEnd();
 
 #if defined (X86_32_JIT) && !defined (_MSC_VER)
+#if defined(__SSE__)
+  // SSE Callback should be called for SSE-enabled LLVM.
+  return X86CompilationCallback_SSE;
+#else
   if (Subtarget->hasSSE1())
     return X86CompilationCallback_SSE;
 #endif
+#endif
 
   return X86CompilationCallback;
 }
@@ -437,7 +443,7 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
 X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) {
   Subtarget = &TM.getSubtarget<X86Subtarget>();
   useGOT = 0;
-  TLSOffset = 0;
+  TLSOffset = nullptr;
 }
 
 void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 6d7f3cb..0190080 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -120,7 +120,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
   case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
     MachineModuleInfoImpl::StubValueTy &StubSym =
       getMachOMMI().getGVStubEntry(Sym);
-    if (StubSym.getPointer() == 0) {
+    if (!StubSym.getPointer()) {
       assert(MO.isGlobal() && "Extern symbol not handled yet");
       StubSym =
         MachineModuleInfoImpl::
@@ -132,7 +132,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
   case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: {
     MachineModuleInfoImpl::StubValueTy &StubSym =
       getMachOMMI().getHiddenGVStubEntry(Sym);
-    if (StubSym.getPointer() == 0) {
+    if (!StubSym.getPointer()) {
       assert(MO.isGlobal() && "Extern symbol not handled yet");
       StubSym =
         MachineModuleInfoImpl::
@@ -168,7 +168,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
                                              MCSymbol *Sym) const {
   // FIXME: We would like an efficient form for this, so we don't have to do a
   // lot of extra uniquing.
-  const MCExpr *Expr = 0;
+  const MCExpr *Expr = nullptr;
   MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
 
   switch (MO.getTargetFlags()) {
@@ -223,7 +223,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
     break;
   }
 
-  if (Expr == 0)
+  if (!Expr)
     Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
 
   if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 746d0d6..6639875 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -15,9 +15,9 @@
 #include <algorithm>
 
-#define DEBUG_TYPE "x86-pad-short-functions"
 #include "X86.h"
 #include "X86InstrInfo.h"
+#include "X86Subtarget.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -30,6 +30,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "x86-pad-short-functions"
+
 STATISTIC(NumBBsPadded, "Number of basic blocks padded");
 
 namespace {
@@ -49,7 +51,7 @@ namespace {
   struct PadShortFunc : public MachineFunctionPass {
     static char ID;
     PadShortFunc() : MachineFunctionPass(ID)
-                   , Threshold(4), TM(0), TII(0) {}
+                   , Threshold(4), TM(nullptr), TII(nullptr) {}
 
     bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -100,6 +102,9 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
   }
 
   TM = &MF.getTarget();
+  if (!TM->getSubtarget<X86Subtarget>().padShortFunctions())
+    return false;
+
   TII = TM->getInstrInfo();
 
   // Search through basic blocks and mark the ones that have early returns
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 85aa9b5..a83e1e4 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -38,11 +38,11 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
+using namespace llvm;
+
 #define GET_REGINFO_TARGET_DESC
 #include "X86GenRegisterInfo.inc"
 
-using namespace llvm;
-
 cl::opt<bool>
 ForceStackAlign("force-align-stack",
                 cl::desc("Force align the stack to the minimum alignment"
@@ -129,7 +129,7 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
   if (!Is64Bit && SubIdx == X86::sub_8bit) {
     A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi);
     if (!A)
-      return 0;
+      return nullptr;
   }
   return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx);
 }
@@ -231,7 +231,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   }
 }
 
-const uint16_t *
+const MCPhysReg *
 X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
   bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 6a71113..2289d91 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -100,7 +100,7 @@ public:
   /// getCalleeSavedRegs - Return a null-terminated list of all of the
   /// callee-save registers on this target.
-  const uint16_t *
+  const MCPhysReg *
   getCalleeSavedRegs(const MachineFunction* MF) const override;
   const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
   const uint32_t *getNoPreservedMask() const;
@@ -122,7 +122,7 @@ public:
   void eliminateFrameIndex(MachineBasicBlock::iterator MI,
                            int SPAdj, unsigned FIOperandNum,
-                           RegScavenger *RS = NULL) const override;
+                           RegScavenger *RS = nullptr) const override;
 
   // Debug information queries.
   unsigned getFrameRegister(const MachineFunction &MF) const override;
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index f5b51ee..6966d61 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -20,6 +20,9 @@ def HaswellModel : SchedMachineModel {
   let LoadLatency = 4;
   let MispredictPenalty = 16;
 
+  // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+  let LoopMicroOpBufferSize = 50;
+
   // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
   // the scheduler to assign a default model to unrecognized opcodes.
   let CompleteModel = 0;
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index a58859a..83f0534 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -21,6 +21,9 @@ def SandyBridgeModel : SchedMachineModel {
   let LoadLatency = 4;
   let MispredictPenalty = 16;
 
+  // Based on the LSD (loop-stream detector) queue size.
+  let LoopMicroOpBufferSize = 28;
+
   // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
   // the scheduler to assign a default model to unrecognized opcodes.
   let CompleteModel = 0;
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index ba72f29..3256ee7 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -535,5 +535,9 @@ def AtomModel : SchedMachineModel {
   let LoadLatency = 3; // Expected cycles, may be overridden by OperandCycles.
   let HighLatency = 30;// Expected, may be overridden by OperandCycles.
 
+  // On the Atom, the throughput for taken branches is 2 cycles. For small
+  // simple loops, expand by a small factor to hide the backedge cost.
+ let LoopMicroOpBufferSize = 10; + let Itineraries = AtomItineraries; } diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td index 6c2a304..823d101 100644 --- a/lib/Target/X86/X86ScheduleSLM.td +++ b/lib/Target/X86/X86ScheduleSLM.td @@ -1,4 +1,4 @@ -//===- X86ScheduleSLM.td - X86 Atom Scheduling Definitions -*- tablegen -*-==// +//=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -7,662 +7,225 @@ // //===----------------------------------------------------------------------===// // -// This file defines the itinerary class data for the Intel Atom -// (Silvermont) processor. +// This file defines the machine model for Intel Silvermont to support +// instruction scheduling and other instruction cost heuristics. // //===----------------------------------------------------------------------===// -def IEC_RSV0 : FuncUnit; -def IEC_RSV1 : FuncUnit; -def FPC_RSV0 : FuncUnit; -def FPC_RSV1 : FuncUnit; -def MEC_RSV : FuncUnit; - - - - - - - - - - - - - - -def SLMItineraries : ProcessorItineraries< - [ IEC_RSV0, IEC_RSV1, FPC_RSV0, FPC_RSV1, MEC_RSV ], - [], [ - // [InstrStage] - // [InstrStage, InstrStage] - // [InstrStage] - // [InstrStage,InstrStage] - // - // Default is 1 cycle, IEC_RSV0 or IEC_RSV1 - //InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - // mul - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<3, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - // imul by al, ax, eax, rax - InstrItinData] >, - InstrItinData, - InstrStage<6, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<6, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - // imul reg by reg|mem - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<3, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData] >, - // imul reg = reg/mem * imm - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData, - InstrStage<3, [MEC_RSV]>] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - // idiv - min latency - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - // div - min latency - InstrItinData] >, - InstrItinData, - InstrStage<25, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - // neg/not/inc/dec - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - // add/sub/and/or/xor/adc/sbc/cmp/test - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - // adc/sbb - InstrItinData] >, - InstrItinData, - InstrStage<2, [MEC_RSV]>] >, - // shift/rotate - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - // shift double - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - // cmov - InstrItinData, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData] >, - // set - 
InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - // jcc - InstrItinData] >, - // jcxz/jecxz/jrcxz - InstrItinData] >, - // jmp rel - InstrItinData] >, - // jmp indirect - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - // jmp far - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - // loop/loope/loopne - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - // call - all but reg/imm - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - //ret - InstrItinData] >, - InstrItinData] >, - //sign extension movs - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - //zero extension movs - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - - InstrItinData] >, - InstrItinData] >, - - // SSE binary operations - // arithmetic fp scalar - InstrItinData] >, - InstrItinData, - InstrStage<3, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<3, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<13, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<13, [MEC_RSV]>] >, - - InstrItinData] >, - InstrItinData] >, - - InstrItinData] >, - InstrItinData, - InstrStage<6, [MEC_RSV]>] >, - - // arithmetic fp parallel - InstrItinData] >, - InstrItinData, - InstrStage<3, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<27, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<27, [MEC_RSV]>] >, - - // bitwise parallel - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - - // arithmetic int parallel - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - - // multiply int parallel - InstrItinData] >, - InstrItinData, - InstrStage<5, [MEC_RSV]>] >, - - // shift parallel - InstrItinData] >, - InstrItinData, - InstrStage<2, [MEC_RSV]>] >, - InstrItinData] >, - - InstrItinData] >, - - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - - InstrItinData] >, - - InstrItinData] >, - InstrItinData, - InstrStage<26, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<13, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<26, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<13, [MEC_RSV]>] >, - - InstrItinData] >, - InstrItinData, - InstrStage<9, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - - InstrItinData] >, - InstrItinData] >, - - InstrItinData] >, - InstrItinData] >, - - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - - InstrItinData] >, - - InstrItinData] >, - - InstrItinData] >, - 
InstrItinData, - InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >, - InstrItinData] >, - - InstrItinData] >, - - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - - InstrItinData] >, - InstrItinData, - InstrStage<6, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<9, [MEC_RSV]>] >, - InstrItinData, - InstrStage<9, [MEC_RSV]>] >, - InstrItinData, - InstrStage<9, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<5, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - - // conversions - // to/from PD ... - InstrItinData] >, - InstrItinData, - InstrStage<5, [MEC_RSV]>] >, - // to/from PS except to/from PD and PS2PI - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - - // MMX MOVs - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - // other MMX - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - // conversions - // from/to PD - InstrItinData] >, - InstrItinData] >, - // from/to PI - InstrItinData] >, - InstrItinData] >, - - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - - InstrItinData] >, - InstrItinData] >, - - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, +def SLMModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and SLM can decode 2 + // instructions per cycle. + let IssueWidth = 2; + let MicroOpBufferSize = 32; // Based on the reorder buffer. + let LoadLatency = 3; + let MispredictPenalty = 10; + + // For small loops, expand by a small factor to hide the backedge cost. + let LoopMicroOpBufferSize = 10; + + // FIXME: SSE4 is unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. + let CompleteModel = 0; +} - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, +let SchedModel = SLMModel in { + +// Silvermont has 5 reservation stations for micro-ops + +def IEC_RSV0 : ProcResource<1>; +def IEC_RSV1 : ProcResource<1>; +def FPC_RSV0 : ProcResource<1> { let BufferSize = 1; } +def FPC_RSV1 : ProcResource<1> { let BufferSize = 1; } +def MEC_RSV : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def IEC_RSV01 : ProcResGroup<[IEC_RSV0, IEC_RSV1]>; +def FPC_RSV01 : ProcResGroup<[FPC_RSV0, FPC_RSV1]>; + +def SMDivider : ProcResource<1>; +def SMFPMultiplier : ProcResource<1>; +def SMFPDivider : ProcResource<1>; + +// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3 +// cycles after the memory operand. 
+def : ReadAdvance; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass SMWriteResPair { + // Register variant is using a single cycle on ExePort. + def : WriteRes { let Latency = Lat; } + + // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the + // latency. + def : WriteRes { + let Latency = !add(Lat, 3); + } +} - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, +// A folded store needs a cycle on MEC_RSV for the store data, but it does not +// need an extra port cycle to recompute the address. +def : WriteRes; + +def : WriteRes; +def : WriteRes { let Latency = 3; } +def : WriteRes; +def : WriteRes; + +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes; + +// This is quite rough, latency depends on the dividend. +def : WriteRes { + let Latency = 25; + let ResourceCycles = [1, 25]; +} +def : WriteRes { + let Latency = 29; + let ResourceCycles = [1, 1, 25]; +} - // System instructions - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, +// Scalar and vector floating point. +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; + +// This is quite rough, latency depends on precision +def : WriteRes { + let Latency = 5; + let ResourceCycles = [1, 2]; +} +def : WriteRes { + let Latency = 8; + let ResourceCycles = [1, 1, 2]; +} - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, +def : WriteRes { + let Latency = 34; + let ResourceCycles = [1, 34]; +} +def : WriteRes { + let Latency = 37; + let ResourceCycles = [1, 1, 34]; +} - InstrItinData] >, - InstrItinData] >, - // worst case for mov REG_CRx - InstrItinData] >, - InstrItinData] >, +// Vector integer operations. +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; + +// String instructions. 
+// Packed Compare Implicit Length Strings, Return Mask +def : WriteRes { + let Latency = 13; + let ResourceCycles = [13]; +} +def : WriteRes { + let Latency = 13; + let ResourceCycles = [13, 1]; +} - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - // LAR - InstrItinData] >, - InstrItinData] >, - // LSL - InstrItinData] >, - InstrItinData] >, +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes { + let Latency = 17; + let ResourceCycles = [17]; +} +def : WriteRes { + let Latency = 17; + let ResourceCycles = [17, 1]; +} - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - // push control register, segment registers - InstrItinData] >, - InstrItinData] >, - // pop control register, segment registers - InstrItinData] >, - InstrItinData] >, - // VERR, VERW - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - // WRMSR, RDMSR - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - // SMSW, LMSW - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes { + let Latency = 17; + let ResourceCycles = [17]; +} +def : WriteRes { + let Latency = 17; + let ResourceCycles = [17, 1]; +} - InstrItinData] >, - InstrItinData] >, +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes { + let Latency = 21; + let ResourceCycles = [21]; +} +def : WriteRes { + let Latency = 21; + let ResourceCycles = [21, 1]; +} - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, +// AES Instructions. +def : WriteRes { + let Latency = 8; + let ResourceCycles = [5]; +} +def : WriteRes { + let Latency = 8; + let ResourceCycles = [5, 1]; +} - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, +def : WriteRes { + let Latency = 8; + let ResourceCycles = [5]; +} +def : WriteRes { + let Latency = 8; + let ResourceCycles = [5, 1]; +} - InstrItinData] >, - InstrItinData, - InstrStage<10, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<5, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<5, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<6, [MEC_RSV]>] >, - InstrItinData, - InstrStage<6, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData, - InstrStage<10, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<3, [MEC_RSV]>] >, - 
InstrItinData] >, - InstrItinData, - InstrStage<12, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<15, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<1, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<11, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<5, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<4, [MEC_RSV]>] >, - InstrItinData] >, - InstrItinData, - InstrStage<10, [MEC_RSV]>] >, +def : WriteRes { + let Latency = 8; + let ResourceCycles = [5]; +} +def : WriteRes { + let Latency = 8; + let ResourceCycles = [5, 1]; +} - InstrItinData] > - ]>; +// Carry-less multiplication instructions. +def : WriteRes { + let Latency = 10; + let ResourceCycles = [10]; +} +def : WriteRes { + let Latency = 10; + let ResourceCycles = [10, 1]; +} -// Silvermont machine model. -def SLMModel : SchedMachineModel { - let IssueWidth = 2; // Allows 2 instructions per scheduling group. - let MinLatency = 1; // InstrStage cycles overrides MinLatency. - // OperandCycles may be used for expected latency. - let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles. - let HighLatency = 30;// Expected, may be overriden by OperandCycles. - let Itineraries = SLMItineraries; -} +def : WriteRes { let Latency = 100; } +def : WriteRes { let Latency = 100; } +def : WriteRes; +def : WriteRes; + +// AVX is not supported on that architecture, but we should define the basic +// scheduling resources anyway. +def : WriteRes; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +defm : SMWriteResPair; +} // SchedModel diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index b9c620f..744890d 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -11,12 +11,13 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-selectiondag-info" #include "X86TargetMachine.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/DerivedTypes.h" using namespace llvm; +#define DEBUG_TYPE "x86-selectiondag-info" + X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) : TargetSelectionDAGInfo(TM), Subtarget(&TM.getSubtarget()), @@ -50,7 +51,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, ConstantSDNode *V = dyn_cast(Src); if (const char *bzeroEntry = V && - V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { + V->isNullValue() ? 
Subtarget->getBZeroEntry() : nullptr) { EVT IntPtr = TLI.getPointerTy(); Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; @@ -60,15 +61,14 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); - TargetLowering:: - CallLoweringInfo CLI(Chain, Type::getVoidTy(*DAG.getContext()), - false, false, false, false, - 0, CallingConv::C, /*isTailCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed=*/false, - DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, - DAG, dl); - std::pair CallResult = - TLI.LowerCallTo(CLI); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0) + .setDiscardResult(); + + std::pair CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } @@ -77,7 +77,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, } uint64_t SizeVal = ConstantSize->getZExtValue(); - SDValue InFlag(0, 0); + SDValue InFlag; EVT AVT; SDValue Count; ConstantSDNode *ValC = dyn_cast(Src); @@ -139,7 +139,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; - Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); + Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); if (TwoRepStos) { InFlag = Chain.getValue(1); @@ -153,7 +153,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, InFlag = Chain.getValue(1); Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; - Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); + Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); } else if (BytesLeft) { // Handle the last 1 - 7 bytes. unsigned Offset = SizeVal - BytesLeft; @@ -225,7 +225,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Count = DAG.getIntPtrConstant(CountVal); unsigned BytesLeft = SizeVal % UBytes; - SDValue InFlag(0, 0); + SDValue InFlag; Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? 
X86::RCX : X86::ECX, Count, InFlag); @@ -241,8 +241,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; - SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops, - array_lengthof(Ops)); + SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops); SmallVector Results; Results.push_back(RepMovs); @@ -263,6 +262,5 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SrcPtrInfo.getWithOffset(Offset))); } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &Results[0], Results.size()); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results); } diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 207d0ba..989e0d6 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -11,12 +11,12 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "subtarget" #include "X86Subtarget.h" #include "X86InstrInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Host.h" @@ -24,15 +24,24 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#if defined(_MSC_VER) +#include +#endif + +using namespace llvm; + +#define DEBUG_TYPE "subtarget" + #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "X86GenSubtargetInfo.inc" -using namespace llvm; +// Temporary option to control early if-conversion for x86 while adding machine +// models. +static cl::opt +X86EarlyIfConv("x86-early-ifcvt", cl::Hidden, + cl::desc("Enable early if-conversion on X86")); -#if defined(_MSC_VER) -#include -#endif /// ClassifyBlockAddressReference - Classify a blockaddress reference for the /// current subtarget according to how we should reference it in a non-pcrel @@ -153,7 +162,7 @@ const char *X86Subtarget::getBZeroEntry() const { !getTargetTriple().isMacOSXVersionLT(10, 6)) return "__bzero"; - return 0; + return nullptr; } bool X86Subtarget::hasSinCos() const { @@ -173,251 +182,16 @@ bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { return isTargetELF() || TM.getRelocationModel() == Reloc::Static; } -static bool OSHasAVXSupport() { -#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\ - || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) -#if defined(__GNUC__) - // Check xgetbv; this uses a .byte sequence instead of the instruction - // directly because older assemblers do not include support for xgetbv and - // there is no easy way to conditionally compile based on the assembler used. 
- int rEAX, rEDX; - __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0)); -#elif defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK) - unsigned long long rEAX = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); -#else - int rEAX = 0; // Ensures we return false -#endif - return (rEAX & 6) == 6; -#else - return false; -#endif -} - -void X86Subtarget::AutoDetectSubtargetFeatures() { - unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0; - unsigned MaxLevel; - union { - unsigned u[3]; - char c[12]; - } text; - - if (X86_MC::GetCpuIDAndInfo(0, &MaxLevel, text.u+0, text.u+2, text.u+1) || - MaxLevel < 1) - return; - - X86_MC::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX); - - if ((EDX >> 15) & 1) { HasCMov = true; ToggleFeature(X86::FeatureCMOV); } - if ((EDX >> 23) & 1) { X86SSELevel = MMX; ToggleFeature(X86::FeatureMMX); } - if ((EDX >> 25) & 1) { X86SSELevel = SSE1; ToggleFeature(X86::FeatureSSE1); } - if ((EDX >> 26) & 1) { X86SSELevel = SSE2; ToggleFeature(X86::FeatureSSE2); } - if (ECX & 0x1) { X86SSELevel = SSE3; ToggleFeature(X86::FeatureSSE3); } - if ((ECX >> 9) & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);} - if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);} - if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);} - if (((ECX >> 27) & 1) && ((ECX >> 28) & 1) && OSHasAVXSupport()) { - X86SSELevel = AVX; ToggleFeature(X86::FeatureAVX); - } - - bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0; - bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0; - - if ((ECX >> 1) & 0x1) { - HasPCLMUL = true; - ToggleFeature(X86::FeaturePCLMUL); - } - if ((ECX >> 12) & 0x1) { - HasFMA = true; - ToggleFeature(X86::FeatureFMA); - } - if (IsIntel && ((ECX >> 22) & 0x1)) { - HasMOVBE = true; - ToggleFeature(X86::FeatureMOVBE); - } - if ((ECX >> 23) & 0x1) { - HasPOPCNT = true; - ToggleFeature(X86::FeaturePOPCNT); - } - if ((ECX >> 25) & 0x1) { - HasAES = true; - ToggleFeature(X86::FeatureAES); - } - if ((ECX >> 29) & 0x1) { - HasF16C = true; - ToggleFeature(X86::FeatureF16C); - } - if (IsIntel && ((ECX >> 30) & 0x1)) { - HasRDRAND = true; - ToggleFeature(X86::FeatureRDRAND); - } - - if ((ECX >> 13) & 0x1) { - HasCmpxchg16b = true; - ToggleFeature(X86::FeatureCMPXCHG16B); - } - - if (IsIntel || IsAMD) { - // Determine if bit test memory instructions are slow. - unsigned Family = 0; - unsigned Model = 0; - X86_MC::DetectFamilyModel(EAX, Family, Model); - if (IsAMD || (Family == 6 && Model >= 13)) { - IsBTMemSlow = true; - ToggleFeature(X86::FeatureSlowBTMem); - } - - // Determine if SHLD/SHRD instructions have higher latency then the - // equivalent series of shifts/or instructions. - // FIXME: Add Intel's processors that have SHLD instructions with very - // poor latency. - if (IsAMD) { - IsSHLDSlow = true; - ToggleFeature(X86::FeatureSlowSHLD); - } - - // If it's an Intel chip since Nehalem and not an Atom chip, unaligned - // memory access is fast. We hard code model numbers here because they - // aren't strictly increasing for Intel chips it seems. 
- if (IsIntel && - ((Family == 6 && Model == 0x1E) || // Nehalem: Clarksfield, Lynnfield, - // Jasper Froest - (Family == 6 && Model == 0x1A) || // Nehalem: Bloomfield, Nehalem-EP - (Family == 6 && Model == 0x2E) || // Nehalem: Nehalem-EX - (Family == 6 && Model == 0x25) || // Westmere: Arrandale, Clarksdale - (Family == 6 && Model == 0x2C) || // Westmere: Gulftown, Westmere-EP - (Family == 6 && Model == 0x2F) || // Westmere: Westmere-EX - (Family == 6 && Model == 0x2A) || // SandyBridge - (Family == 6 && Model == 0x2D) || // SandyBridge: SandyBridge-E* - (Family == 6 && Model == 0x3A) || // IvyBridge - (Family == 6 && Model == 0x3E) || // IvyBridge EP - (Family == 6 && Model == 0x3C) || // Haswell - (Family == 6 && Model == 0x3F) || // ... - (Family == 6 && Model == 0x45) || // ... - (Family == 6 && Model == 0x46))) { // ... - IsUAMemFast = true; - ToggleFeature(X86::FeatureFastUAMem); - } - - // Set processor type. Currently only Atom or Silvermont (SLM) is detected. - if (Family == 6 && - (Model == 28 || Model == 38 || Model == 39 || - Model == 53 || Model == 54)) { - X86ProcFamily = IntelAtom; - - UseLeaForSP = true; - ToggleFeature(X86::FeatureLeaForSP); - } - else if (Family == 6 && - (Model == 55 || Model == 74 || Model == 77)) { - X86ProcFamily = IntelSLM; - } - - unsigned MaxExtLevel; - X86_MC::GetCpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX); - - if (MaxExtLevel >= 0x80000001) { - X86_MC::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); - if ((EDX >> 29) & 0x1) { - HasX86_64 = true; - ToggleFeature(X86::Feature64Bit); - } - if ((ECX >> 5) & 0x1) { - HasLZCNT = true; - ToggleFeature(X86::FeatureLZCNT); - } - if (IsIntel && ((ECX >> 8) & 0x1)) { - HasPRFCHW = true; - ToggleFeature(X86::FeaturePRFCHW); - } - if (IsAMD) { - if ((ECX >> 6) & 0x1) { - HasSSE4A = true; - ToggleFeature(X86::FeatureSSE4A); - } - if ((ECX >> 11) & 0x1) { - HasXOP = true; - ToggleFeature(X86::FeatureXOP); - } - if ((ECX >> 16) & 0x1) { - HasFMA4 = true; - ToggleFeature(X86::FeatureFMA4); - } - } - } - } - - if (MaxLevel >= 7) { - if (!X86_MC::GetCpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX)) { - if (IsIntel && (EBX & 0x1)) { - HasFSGSBase = true; - ToggleFeature(X86::FeatureFSGSBase); - } - if ((EBX >> 3) & 0x1) { - HasBMI = true; - ToggleFeature(X86::FeatureBMI); - } - if ((EBX >> 4) & 0x1) { - HasHLE = true; - ToggleFeature(X86::FeatureHLE); - } - if (IsIntel && ((EBX >> 5) & 0x1)) { - X86SSELevel = AVX2; - ToggleFeature(X86::FeatureAVX2); - } - if (IsIntel && ((EBX >> 8) & 0x1)) { - HasBMI2 = true; - ToggleFeature(X86::FeatureBMI2); - } - if (IsIntel && ((EBX >> 11) & 0x1)) { - HasRTM = true; - ToggleFeature(X86::FeatureRTM); - } - if (IsIntel && ((EBX >> 16) & 0x1)) { - X86SSELevel = AVX512F; - ToggleFeature(X86::FeatureAVX512); - } - if (IsIntel && ((EBX >> 18) & 0x1)) { - HasRDSEED = true; - ToggleFeature(X86::FeatureRDSEED); - } - if (IsIntel && ((EBX >> 19) & 0x1)) { - HasADX = true; - ToggleFeature(X86::FeatureADX); - } - if (IsIntel && ((EBX >> 26) & 0x1)) { - HasPFI = true; - ToggleFeature(X86::FeaturePFI); - } - if (IsIntel && ((EBX >> 27) & 0x1)) { - HasERI = true; - ToggleFeature(X86::FeatureERI); - } - if (IsIntel && ((EBX >> 28) & 0x1)) { - HasCDI = true; - ToggleFeature(X86::FeatureCDI); - } - if (IsIntel && ((EBX >> 29) & 0x1)) { - HasSHA = true; - ToggleFeature(X86::FeatureSHA); - } - } - if (IsAMD && ((ECX >> 21) & 0x1)) { - HasTBM = true; - ToggleFeature(X86::FeatureTBM); - } - } -} - void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) { AttributeSet 
FnAttrs = MF->getFunction()->getAttributes(); - Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex, - "target-cpu"); - Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex, - "target-features"); + Attribute CPUAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu"); + Attribute FSAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features"); std::string CPU = - !CPUAttr.hasAttribute(Attribute::None) ?CPUAttr.getValueAsString() : ""; + !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString() : ""; std::string FS = - !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : ""; + !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : ""; if (!FS.empty()) { initializeEnvironment(); resetSubtargetFeatures(CPU, FS); @@ -426,54 +200,23 @@ void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) { void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { std::string CPUName = CPU; - if (!FS.empty() || !CPU.empty()) { - if (CPUName.empty()) { -#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\ - || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) - CPUName = sys::getHostCPUName(); -#else - CPUName = "generic"; -#endif - } - - // Make sure 64-bit features are available in 64-bit mode. (But make sure - // SSE2 can be turned off explicitly.) - std::string FullFS = FS; - if (In64BitMode) { - if (!FullFS.empty()) - FullFS = "+64bit,+sse2," + FullFS; - else - FullFS = "+64bit,+sse2"; - } - - // If feature string is not empty, parse features string. - ParseSubtargetFeatures(CPUName, FullFS); - } else { - if (CPUName.empty()) { -#if defined (__x86_64__) || defined(__i386__) - CPUName = sys::getHostCPUName(); -#else - CPUName = "generic"; -#endif - } - // Otherwise, use CPUID to auto-detect feature set. - AutoDetectSubtargetFeatures(); - - // Make sure 64-bit features are available in 64-bit mode. - if (In64BitMode) { - if (!HasX86_64) { HasX86_64 = true; ToggleFeature(X86::Feature64Bit); } - if (!HasCMov) { HasCMov = true; ToggleFeature(X86::FeatureCMOV); } - - if (X86SSELevel < SSE2) { - X86SSELevel = SSE2; - ToggleFeature(X86::FeatureSSE1); - ToggleFeature(X86::FeatureSSE2); - } - } + if (CPUName.empty()) + CPUName = "generic"; + + // Make sure 64-bit features are available in 64-bit mode. (But make sure + // SSE2 can be turned off explicitly.) + std::string FullFS = FS; + if (In64BitMode) { + if (!FullFS.empty()) + FullFS = "+64bit,+sse2," + FullFS; + else + FullFS = "+64bit,+sse2"; } - // CPUName may have been set by the CPU detection code. Make sure the - // new MCSchedModel is used. + // If feature string is not empty, parse features string. + ParseSubtargetFeatures(CPUName, FullFS); + + // Make sure the right MCSchedModel is used. InitCPUSchedModel(CPUName); if (X86ProcFamily == IntelAtom || X86ProcFamily == IntelSLM) @@ -547,33 +290,36 @@ void X86Subtarget::initializeEnvironment() { PadShortFunctions = false; CallRegIndirect = false; LEAUsesAG = false; + SlowLEA = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? 
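// [Editor's note] A tiny standalone illustration of the feature-string logic
// that the rewritten resetSubtargetFeatures() above now applies
// unconditionally. "+64bit,+sse2" is prepended, not appended, so
// user-supplied entries still win (SubtargetFeatures applies flags left to
// right), which is what keeps "-sse2" able to turn SSE2 back off in 64-bit
// mode, as the hunk's comment promises.
#include <string>
static std::string composeFullFS(bool In64BitMode, const std::string &FS) {
  if (!In64BitMode)
    return FS;
  // Result is handed to ParseSubtargetFeatures(CPUName, FullFS).
  return FS.empty() ? "+64bit,+sse2" : "+64bit,+sse2," + FS;
}
// composeFullFS(true, "-sse2") == "+64bit,+sse2,-sse2": SSE2 ends up off.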
MaxInlineSizeThreshold = 128; } X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, - unsigned StackAlignOverride) - : X86GenSubtargetInfo(TT, CPU, FS) - , X86ProcFamily(Others) - , PICStyle(PICStyles::None) - , TargetTriple(TT) - , StackAlignOverride(StackAlignOverride) - , In64BitMode(TargetTriple.getArch() == Triple::x86_64) - , In32BitMode(TargetTriple.getArch() == Triple::x86 && - TargetTriple.getEnvironment() != Triple::CODE16) - , In16BitMode(TargetTriple.getArch() == Triple::x86 && - TargetTriple.getEnvironment() == Triple::CODE16) { + const std::string &FS, unsigned StackAlignOverride) + : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others), + PICStyle(PICStyles::None), TargetTriple(TT), + StackAlignOverride(StackAlignOverride), + In64BitMode(TargetTriple.getArch() == Triple::x86_64), + In32BitMode(TargetTriple.getArch() == Triple::x86 && + TargetTriple.getEnvironment() != Triple::CODE16), + In16BitMode(TargetTriple.getArch() == Triple::x86 && + TargetTriple.getEnvironment() == Triple::CODE16) { initializeEnvironment(); resetSubtargetFeatures(CPU, FS); } -bool X86Subtarget::enablePostRAScheduler( - CodeGenOpt::Level OptLevel, - TargetSubtargetInfo::AntiDepBreakMode& Mode, - RegClassVector& CriticalPathRCs) const { +bool +X86Subtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel, + TargetSubtargetInfo::AntiDepBreakMode &Mode, + RegClassVector &CriticalPathRCs) const { Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; CriticalPathRCs.clear(); return PostRAScheduler && OptLevel >= CodeGenOpt::Default; } + +bool +X86Subtarget::enableEarlyIfConversion() const { + return hasCMov() && X86EarlyIfConv; +} diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 52986b9..703559a 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -178,6 +178,9 @@ protected: /// address generation (AG) time. bool LEAUsesAG; + /// SlowLEA - True if the LEA instruction with certain arguments is slow + bool SlowLEA; + /// Processor has AVX-512 PreFetch Instructions bool HasPFI; @@ -235,10 +238,6 @@ public: /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID - /// instruction. - void AutoDetectSubtargetFeatures(); - /// \brief Reset the features for the X86 target. void resetSubtargetFeatures(const MachineFunction *MF) override; private: @@ -319,11 +318,13 @@ public: bool padShortFunctions() const { return PadShortFunctions; } bool callRegIndirect() const { return CallRegIndirect; } bool LEAusesAG() const { return LEAUsesAG; } + bool slowLEA() const { return SlowLEA; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } bool isAtom() const { return X86ProcFamily == IntelAtom; } + bool isSLM() const { return X86ProcFamily == IntelSLM; } const Triple &getTargetTriple() const { return TargetTriple; } @@ -429,6 +430,8 @@ public: bool postRAScheduler() const { return PostRAScheduler; } + bool enableEarlyIfConversion() const override; + /// getInstrItins = Return the instruction itineraries based on the /// subtarget selection. 
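// [Editor's note] How the relocated "x86-early-ifcvt" option is consumed, as a
// hedged sketch: the generic EarlyIfConverter pass is now always scheduled
// (see addILPOpts in X86TargetMachine.cpp below) and asks the subtarget
// whether to do any work. Note that the cl::opt declaration above lost its
// template argument to text extraction; in the source it is cl::opt<bool>.
#include "llvm/Support/CommandLine.h"
static llvm::cl::opt<bool> EarlyIfConvSketch(
    "x86-early-ifcvt-sketch", llvm::cl::Hidden,
    llvm::cl::desc("Illustrative stand-in for x86-early-ifcvt"));

// Mirrors the X86Subtarget::enableEarlyIfConversion() override added above:
// the subtarget predicate, not the pass pipeline, is what the flag now gates.
static bool enableEarlyIfConversionSketch(bool HasCMov) {
  return HasCMov && EarlyIfConvSketch;
}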
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 6f09ccf..93760ef 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -108,6 +108,13 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, if (Options.FloatABIType == FloatABI::Default) this->Options.FloatABIType = FloatABI::Hard; + // Windows stack unwinder gets confused when execution flow "falls through" + // after a call to 'noreturn' function. + // To prevent that, we emit a trap for 'unreachable' IR instructions. + // (which on X86, happens to be the 'ud2' instruction) + if (Subtarget.isTargetWin64()) + this->Options.TrapUnreachable = true; + initAsmInfo(); } @@ -119,12 +126,6 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, cl::desc("Minimize AVX to SSE transition penalty"), cl::init(true)); -// Temporary option to control early if-conversion for x86 while adding machine -// models. -static cl::opt -X86EarlyIfConv("x86-early-ifcvt", cl::Hidden, - cl::desc("Enable early if-conversion on X86")); - //===----------------------------------------------------------------------===// // X86 Analysis Pass Setup //===----------------------------------------------------------------------===// @@ -177,19 +178,14 @@ bool X86PassConfig::addInstSelector() { if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None) addPass(createCleanupLocalDynamicTLSPass()); - // For 32-bit, prepend instructions to set the "global base reg" for PIC. - if (!getX86Subtarget().is64Bit()) - addPass(createGlobalBaseRegPass()); + addPass(createX86GlobalBaseRegPass()); return false; } bool X86PassConfig::addILPOpts() { - if (X86EarlyIfConv && getX86Subtarget().hasCMov()) { - addPass(&EarlyIfConverterID); - return true; - } - return false; + addPass(&EarlyIfConverterID); + return true; } bool X86PassConfig::addPreRegAlloc() { @@ -208,18 +204,13 @@ bool X86PassConfig::addPreEmitPass() { ShouldPrint = true; } - if (getX86Subtarget().hasAVX() && UseVZeroUpper) { + if (UseVZeroUpper) { addPass(createX86IssueVZeroUpperPass()); ShouldPrint = true; } - if (getOptLevel() != CodeGenOpt::None && - getX86Subtarget().padShortFunctions()) { + if (getOptLevel() != CodeGenOpt::None) { addPass(createX86PadShortFunctions()); - ShouldPrint = true; - } - if (getOptLevel() != CodeGenOpt::None && - getX86Subtarget().LEAusesAG()){ addPass(createX86FixupLEAs()); ShouldPrint = true; } diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 0a88e98..8157085 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -26,7 +26,7 @@ const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference( // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which // is an indirect pc-relative reference. - if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { + if ((Encoding & DW_EH_PE_indirect) && (Encoding & DW_EH_PE_pcrel)) { const MCSymbol *Sym = TM.getSymbol(GV, Mang); const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext()); @@ -62,7 +62,7 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( // operation. const SubOperator *Sub = dyn_cast(CE); if (!Sub) - return 0; + return nullptr; // Symbols must first be numbers before we can subtract them, we need to see a // ptrtoint on both subtraction operands. 
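// [Editor's note] Source-level illustration of the Win64 TrapUnreachable
// change above. A block ending in a call to a noreturn function lowers to the
// IR terminator 'unreachable'; with Options.TrapUnreachable set, x86 emits a
// trailing 'ud2' so execution cannot fall through past the call, keeping the
// Windows unwinder inside the function. Names here are illustrative only.
extern "C" [[noreturn]] void fatal_error();
extern "C" int always_fatal(void) {
  fatal_error(); // noreturn: the terminator after this call is 'unreachable'
}                // Win64 codegen now places 'ud2' here instead of nothing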
@@ -71,13 +71,13 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( const PtrToIntOperator *SubRHS = dyn_cast(Sub->getOperand(1)); if (!SubLHS || !SubRHS) - return 0; + return nullptr; // Our symbols should exist in address space zero, cowardly no-op if // otherwise. if (SubLHS->getPointerAddressSpace() != 0 || SubRHS->getPointerAddressSpace() != 0) - return 0; + return nullptr; // Both ptrtoint instructions must wrap global variables: // - Only global variables are eligible for image relative relocations. @@ -87,7 +87,7 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( const GlobalVariable *GVRHS = dyn_cast(SubRHS->getPointerOperand()); if (!GVLHS || !GVRHS) - return 0; + return nullptr; // We expect __ImageBase to be a global variable without a section, externally // defined. @@ -96,11 +96,11 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( if (GVRHS->isThreadLocal() || GVRHS->getName() != "__ImageBase" || !GVRHS->hasExternalLinkage() || GVRHS->hasInitializer() || GVRHS->hasSection()) - return 0; + return nullptr; // An image-relative, thread-local, symbol makes no sense. if (GVLHS->isThreadLocal()) - return 0; + return nullptr; return MCSymbolRefExpr::Create(TM.getSymbol(GVLHS, Mang), MCSymbolRefExpr::VK_COFF_IMGREL32, diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index c04964d..91b9d40 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -14,37 +14,24 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86tti" #include "X86.h" #include "X86TargetMachine.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" using namespace llvm; +#define DEBUG_TYPE "x86tti" + // Declare the pass initialization routine locally as target-specific passes -// don't havve a target-wide initialization entry point, and so we rely on the +// don't have a target-wide initialization entry point, and so we rely on the // pass constructor initialization. 
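// [Editor's note] The IR shape getExecutableRelativeSymbol() accepts, as a
// hedged matcher sketch. It also restores the angle-bracket text the dyn_cast
// calls above lost to extraction: dyn_cast<SubOperator>,
// dyn_cast<PtrToIntOperator>, dyn_cast<GlobalVariable>. The accepted pattern
//   sub (ptrtoint @GV), (ptrtoint @__ImageBase)
// becomes a 32-bit image-relative fixup (VK_COFF_IMGREL32).
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Operator.h"
using namespace llvm;

static bool looksImageRelative(const ConstantExpr *CE) {
  const auto *Sub = dyn_cast<SubOperator>(CE);
  if (!Sub)
    return false;
  const auto *L = dyn_cast<PtrToIntOperator>(Sub->getOperand(0));
  const auto *R = dyn_cast<PtrToIntOperator>(Sub->getOperand(1));
  if (!L || !R)
    return false; // need ptrtoint on both subtraction operands
  // Only global variables are eligible for image-relative relocations.
  return isa<GlobalVariable>(L->getPointerOperand()) &&
         isa<GlobalVariable>(R->getPointerOperand());
}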
namespace llvm { void initializeX86TTIPass(PassRegistry &); } -static cl::opt -UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true), - cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden); -static cl::opt -PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0), - cl::desc("Threshold for X86 partial unrolling"), cl::Hidden); -static cl::opt -PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2), - cl::desc("Threshold for taken branches in X86 partial unrolling"), - cl::Hidden); - namespace { class X86TTI final : public ImmutablePass, public TargetTransformInfo { @@ -56,7 +43,7 @@ class X86TTI final : public ImmutablePass, public TargetTransformInfo { unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; public: - X86TTI() : ImmutablePass(ID), ST(0), TLI(0) { + X86TTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) { llvm_unreachable("This pass cannot be directly constructed"); } @@ -87,8 +74,6 @@ public: /// \name Scalar TTI Implementations /// @{ PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override; - void getUnrollingPreferences(Loop *L, - UnrollingPreferences &UP) const override; /// @} @@ -153,93 +138,6 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const { return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software; } -void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const { - if (!UsePartialUnrolling) - return; - // According to the Intel 64 and IA-32 Architectures Optimization Reference - // Manual, Intel Core models and later have a loop stream detector - // (and associated uop queue) that can benefit from partial unrolling. - // The relevant requirements are: - // - The loop must have no more than 4 (8 for Nehalem and later) branches - // taken, and none of them may be calls. - // - The loop can have no more than 18 (28 for Nehalem and later) uops. - - // According to the Software Optimization Guide for AMD Family 15h Processors, - // models 30h-4fh (Steamroller and later) have a loop predictor and loop - // buffer which can benefit from partial unrolling. - // The relevant requirements are: - // - The loop must have fewer than 16 branches - // - The loop must have less than 40 uops in all executed loop branches - - unsigned MaxBranches, MaxOps; - if (PartialUnrollingThreshold.getNumOccurrences() > 0) { - MaxBranches = PartialUnrollingMaxBranches; - MaxOps = PartialUnrollingThreshold; - } else if (ST->isAtom()) { - // On the Atom, the throughput for taken branches is 2 cycles. For small - // simple loops, expand by a small factor to hide the backedge cost. - MaxBranches = 2; - MaxOps = 10; - } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) { - MaxBranches = 16; - MaxOps = 40; - } else if (ST->hasFMA4() /* Any other recent AMD */) { - return; - } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) { - MaxBranches = 8; - MaxOps = 28; - } else if (ST->hasSSSE3() /* Intel Core */) { - MaxBranches = 4; - MaxOps = 18; - } else { - return; - } - - // Scan the loop: don't unroll loops with calls, and count the potential - // number of taken branches (this is somewhat conservative because we're - // counting all block transitions as potential branches while in reality some - // of these will become implicit via block placement). 
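// [Editor's note] Condensed restatement of the per-target budgets in the
// getUnrollingPreferences() heuristic being deleted here, so the rationale in
// its comments survives the deletion (values from the comments above; an
// illustration, not replacement code):
struct UnrollBudget { unsigned MaxBranches, MaxOps; };
static const UnrollBudget AtomBudget = {2, 10};         // taken branch = 2 cy
static const UnrollBudget CoreBudget = {4, 18};         // loop stream detector
static const UnrollBudget NehalemBudget = {8, 28};      // larger uop queue
static const UnrollBudget SteamrollerBudget = {16, 40}; // AMD loop buffer
// The unroll factor was then capped so the unrolled taken-branch count still
// fits the buffer: UP.MaxCount = (MaxBranches - 1) / (MaxDepth - 1).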
- unsigned MaxDepth = 0; - for (df_iterator DI = df_begin(L->getHeader()), - DE = df_end(L->getHeader()); DI != DE;) { - if (!L->contains(*DI)) { - DI.skipChildren(); - continue; - } - - MaxDepth = std::max(MaxDepth, DI.getPathLength()); - if (MaxDepth > MaxBranches) - return; - - for (BasicBlock::iterator I = DI->begin(), IE = DI->end(); I != IE; ++I) - if (isa(I) || isa(I)) { - ImmutableCallSite CS(I); - if (const Function *F = CS.getCalledFunction()) { - if (!isLoweredToCall(F)) - continue; - } - - return; - } - - ++DI; - } - - // Enable runtime and partial unrolling up to the specified size. - UP.Partial = UP.Runtime = true; - UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps; - - // Set the maximum count based on the loop depth. The maximum number of - // branches taken in a loop (including the backedge) is equal to the maximum - // loop depth (the DFS path length from the loop header to any block in the - // loop). When the loop is unrolled, this depth (except for the backedge - // itself) is multiplied by the unrolling factor. This new unrolled depth - // must be less than the target-specific maximum branch count (which limits - // the number of taken branches in the uop buffer). - if (MaxDepth > 1) - UP.MaxCount = (MaxBranches-1)/(MaxDepth-1); -} - unsigned X86TTI::getNumberOfRegisters(bool Vector) const { if (Vector && !ST->hasSSE1()) return 0; @@ -283,6 +181,21 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + static const CostTblEntry + AVX2UniformConstCostTable[] = { + { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence + { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence + { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence + { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence + }; + + if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && + ST->hasAVX2()) { + int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second); + if (Idx != -1) + return LT.first * AVX2UniformConstCostTable[Idx].Cost; + } + static const CostTblEntry AVX2CostTable[] = { // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to // customize them to detect the cases where shift amount is a scalar one. @@ -350,10 +263,19 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. { ISD::SRA, MVT::v8i16, 1 }, // psraw. { ISD::SRA, MVT::v4i32, 1 }, // psrad. + + { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence + { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence + { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence + { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence }; if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && ST->hasSSE2()) { + // pmuldq sequence. + if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) + return LT.first * 15; + int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second); if (Idx != -1) return LT.first * SSE2UniformConstCostTable[Idx].Cost; @@ -893,6 +815,13 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const { if (BitSize == 0) return ~0U; + // Never hoist constants larger than 128bit, because this might lead to + // incorrect code generation or assertions in codegen. + // Fixme: Create a cost model for types larger than i128 once the codegen + // issues have been fixed. 
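// [Editor's note] Illustration of the guard added just below: constants wider
// than 128 bits report TCC_Free so constant hoisting leaves them alone,
// sidestepping the i256+ codegen asserts the comment mentions. Sketch only;
// TCC_Free and the real entry point live in TargetTransformInfo.
#include "llvm/ADT/APInt.h"
static bool hoistableImmWidth(const llvm::APInt &Imm) {
  unsigned BitSize = Imm.getBitWidth();
  // BitSize == 0 has no cost model; > 128 is deliberately ignored (TCC_Free).
  return BitSize != 0 && BitSize <= 128;
}
// hoistableImmWidth(llvm::APInt(256, 1)) == false: an i256 constant stays put.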
+ if (BitSize > 128) + return TCC_Free; + if (Imm == 0) return TCC_Free; @@ -908,8 +837,10 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); + // There is no cost model for constants with a bit size of 0. Return TCC_Free + // here, so that constant hoisting will ignore this constant. if (BitSize == 0) - return ~0U; + return TCC_Free; unsigned ImmIdx = ~0U; switch (Opcode) { @@ -931,15 +862,19 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: case Instruction::And: case Instruction::Or: case Instruction::Xor: case Instruction::ICmp: ImmIdx = 1; break; + // Always return TCC_Free for the shift value of a shift instruction. + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + if (Idx == 1) + return TCC_Free; + break; case Instruction::Trunc: case Instruction::ZExt: case Instruction::SExt: @@ -966,8 +901,10 @@ unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); + // There is no cost model for constants with a bit size of 0. Return TCC_Free + // here, so that constant hoisting will ignore this constant. if (BitSize == 0) - return ~0U; + return TCC_Free; switch (IID) { default: return TCC_Free; diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index d4341b9..0bb5f99 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-vzeroupper" #include "X86.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" @@ -28,6 +27,8 @@ #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; +#define DEBUG_TYPE "x86-vzeroupper" + STATISTIC(NumVZU, "Number of vzeroupper instructions inserted"); namespace { @@ -246,7 +247,8 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { /// runOnMachineFunction - Loop over all of the basic blocks, inserting /// vzero upper instructions before function calls. bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { - if (MF.getTarget().getSubtarget().hasAVX512()) + const X86Subtarget &ST = MF.getTarget().getSubtarget(); + if (!ST.hasAVX() || ST.hasAVX512()) return false; TII = MF.getTarget().getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); diff --git a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp index 9c20abd..7fef796 100644 --- a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp +++ b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp @@ -14,6 +14,7 @@ #include "XCore.h" #include "XCoreRegisterInfo.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler.h" #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" @@ -23,16 +24,17 @@ using namespace llvm; +#define DEBUG_TYPE "xcore-disassembler" + typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { /// \brief A disassembler class for XCore. 
class XCoreDisassembler : public MCDisassembler { - OwningPtr RegInfo; public: - XCoreDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info) : - MCDisassembler(STI), RegInfo(Info) {} + XCoreDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : + MCDisassembler(STI, Ctx) {} /// \brief See MCDisassembler. virtual DecodeStatus getInstruction(MCInst &instr, @@ -40,9 +42,8 @@ public: const MemoryObject ®ion, uint64_t address, raw_ostream &vStream, - raw_ostream &cStream) const; + raw_ostream &cStream) const override; - const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); } }; } @@ -81,7 +82,8 @@ static bool readInstruction32(const MemoryObject ®ion, static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) { const XCoreDisassembler *Dis = static_cast(D); - return *(Dis->getRegInfo()->getRegClass(RC).begin() + RegNo); + const MCRegisterInfo *RegInfo = Dis->getContext().getRegisterInfo(); + return *(RegInfo->getRegClass(RC).begin() + RegNo); } static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, @@ -788,8 +790,9 @@ namespace llvm { } static MCDisassembler *createXCoreDisassembler(const Target &T, - const MCSubtargetInfo &STI) { - return new XCoreDisassembler(STI, T.createMCRegInfo("")); + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new XCoreDisassembler(STI, Ctx); } extern "C" void LLVMInitializeXCoreDisassembler() { diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp index 9ae8c0d..215fe89 100644 --- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp +++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "XCoreInstPrinter.h" #include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCExpr.h" @@ -22,6 +21,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "asm-printer" + #include "XCoreGenAsmWriter.inc" void XCoreInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h index 772c515..98e7c98 100644 --- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h +++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h @@ -31,8 +31,8 @@ public: void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); - virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; - virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; private: void printInlineJT(const MCInst *MI, int opNum, raw_ostream &O); void printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O); diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp index f788c59..5665911 100644 --- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp +++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp @@ -17,7 +17,7 @@ XCoreMCAsmInfo::XCoreMCAsmInfo(StringRef TT) { SupportsDebugInformation = true; Data16bitsDirective = "\t.short\t"; Data32bitsDirective = "\t.long\t"; - Data64bitsDirective = 0; + Data64bitsDirective = nullptr; ZeroDirective = "\t.space\t"; CommentString = "#"; diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h 
b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h index e53c96b..da2689a 100644 --- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h +++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h @@ -21,7 +21,7 @@ namespace llvm { class Target; class XCoreMCAsmInfo : public MCAsmInfoELF { - virtual void anchor(); + void anchor() override; public: explicit XCoreMCAsmInfo(StringRef TT); }; diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp index 439d0ab..d54e94f 100644 --- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp +++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp @@ -23,6 +23,8 @@ #include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + #define GET_INSTRINFO_MC_DESC #include "XCoreGenInstrInfo.inc" @@ -32,8 +34,6 @@ #define GET_REGINFO_MC_DESC #include "XCoreGenRegisterInfo.inc" -using namespace llvm; - static MCInstrInfo *createXCoreMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitXCoreMCInstrInfo(X); @@ -58,7 +58,7 @@ static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI, MCAsmInfo *MAI = new XCoreMCAsmInfo(TT); // Initial state of the frame pointer is SP. - MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, XCore::SP, 0); + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, XCore::SP, 0); MAI->addInitialFrameState(Inst); return MAI; @@ -128,12 +128,11 @@ void XCoreTargetAsmStreamer::emitCCBottomFunction(StringRef Name) { static MCStreamer * createXCoreMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, - bool isVerboseAsm, bool useCFI, bool useDwarfDirectory, + bool isVerboseAsm, bool useDwarfDirectory, MCInstPrinter *InstPrint, MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst) { - MCStreamer *S = - llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory, - InstPrint, CE, TAB, ShowInst); + MCStreamer *S = llvm::createAsmStreamer( + Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst); new XCoreTargetAsmStreamer(*S, OS); return S; } diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp index 21acedf..e98d4f9 100644 --- a/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asm-printer" #include "XCore.h" #include "InstPrinter/XCoreInstPrinter.h" #include "XCoreInstrInfo.h" @@ -47,6 +46,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "asm-printer" + namespace { class XCoreAsmPrinter : public AsmPrinter { const XCoreSubtarget &Subtarget; @@ -58,7 +59,7 @@ namespace { : AsmPrinter(TM, Streamer), Subtarget(TM.getSubtarget()), MCInstLowering(*this) {} - virtual const char *getPassName() const { + const char *getPassName() const override { return "XCore Assembly Printer"; } @@ -70,18 +71,18 @@ namespace { void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O); + raw_ostream &O) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) override; void emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV); - virtual void EmitGlobalVariable(const GlobalVariable *GV); + void EmitGlobalVariable(const GlobalVariable *GV) override; - void EmitFunctionEntryLabel(); - void EmitInstruction(const 
MachineInstr *MI); - void EmitFunctionBodyStart(); - void EmitFunctionBodyEnd(); + void EmitFunctionEntryLabel() override; + void EmitInstruction(const MachineInstr *MI) override; + void EmitFunctionBodyStart() override; + void EmitFunctionBodyEnd() override; }; } // end of anonymous namespace diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp index 954fddf..5499aba 100644 --- a/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/lib/Target/XCore/XCoreFrameLowering.cpp @@ -64,7 +64,8 @@ static void EmitDefCfaRegister(MachineBasicBlock &MBB, MachineModuleInfo *MMI, unsigned DRegNum) { unsigned CFIIndex = MMI->addFrameInst( MCCFIInstruction::createDefCfaRegister(nullptr, DRegNum)); - BuildMI(MBB, MBBI, dl, TII.get(XCore::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); } static void EmitDefCfaOffset(MachineBasicBlock &MBB, @@ -73,7 +74,8 @@ static void EmitDefCfaOffset(MachineBasicBlock &MBB, MachineModuleInfo *MMI, int Offset) { unsigned CFIIndex = MMI->addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -Offset)); - BuildMI(MBB, MBBI, dl, TII.get(XCore::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); } static void EmitCfiOffset(MachineBasicBlock &MBB, @@ -82,7 +84,8 @@ static void EmitCfiOffset(MachineBasicBlock &MBB, unsigned DRegNum, int Offset) { unsigned CFIIndex = MMI->addFrameInst( MCCFIInstruction::createOffset(nullptr, DRegNum, Offset)); - BuildMI(MBB, MBBI, dl, TII.get(XCore::CFI_INSTRUCTION)).addCFIIndex(CFIIndex); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); } /// The SP register is moved in steps of 'MaxImmU16' towards the bottom of the @@ -113,7 +116,8 @@ static void IfNeededExtSP(MachineBasicBlock &MBB, /// IfNeededLDAWSP emits the necessary LDAWSP instructions to move the SP only /// as far as to make 'OffsetFromTop' reachable using an LDAWSP_lru6. /// \param OffsetFromTop the spill offset from the top of the frame. -/// \param [in,out] RemainingAdj the current SP offset from the top of the frame. +/// \param [in,out] RemainingAdj the current SP offset from the top of the +/// frame. static void IfNeededLDAWSP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc dl, const TargetInstrInfo &TII, int OffsetFromTop, @@ -346,7 +350,8 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF, RemainingAdj /= 4; if (RetOpcode == XCore::EH_RETURN) { - // 'Restore' the exception info the unwinder has placed into the stack slots. + // 'Restore' the exception info the unwinder has placed into the stack + // slots. 
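// [Editor's note] The three XCoreFrameLowering emitters above converge on one
// pattern: CFI pseudo-instructions are target-independent, so they are built
// with TargetOpcode::CFI_INSTRUCTION rather than a per-target XCore:: opcode.
// Distilled sketch under that assumption (MMI, TII, and dl stand in for the
// callers' locals):
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
using namespace llvm;

static void emitDefCfaOffsetSketch(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   DebugLoc dl, const TargetInstrInfo &TII,
                                   MachineModuleInfo *MMI, int Offset) {
  // Register the CFI directive with the module, then reference it by index.
  unsigned CFIIndex = MMI->addFrameInst(
      MCCFIInstruction::createDefCfaOffset(nullptr, -Offset));
  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
      .addCFIIndex(CFIIndex);
}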
SmallVector SpillList; GetEHSpillList(SpillList, MFI, XFI, MF.getTarget().getTargetLowering()); RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList); @@ -495,7 +500,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, errs() << "eliminateCallFramePseudoInstr size too big: " << Amount << "\n"; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } MachineInstr *New; @@ -514,7 +519,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MBB.insert(I, New); } } - + MBB.erase(I); } diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h index 6cd90c9..e4f806a 100644 --- a/lib/Target/XCore/XCoreFrameLowering.h +++ b/lib/Target/XCore/XCoreFrameLowering.h @@ -27,29 +27,30 @@ namespace llvm { /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. - void emitPrologue(MachineFunction &MF) const; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + void emitPrologue(MachineFunction &MF) const override; + void emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const override; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const; + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const override; bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const; + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const override; void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; - bool hasFP(const MachineFunction &MF) const; + bool hasFP(const MachineFunction &MF) const override; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = NULL) const; + RegScavenger *RS = nullptr) const override; void processFunctionBeforeFrameFinalized(MachineFunction &MF, - RegScavenger *RS = NULL) const; + RegScavenger *RS = nullptr) const override; //! 
Stack slot size (4 bytes) static int stackSlotSize() { diff --git a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp index c18eff9..30c7b59 100644 --- a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp +++ b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp @@ -26,9 +26,9 @@ namespace { static char ID; XCoreFTAOElim() : MachineFunctionPass(ID) {} - virtual bool runOnMachineFunction(MachineFunction &Fn); + bool runOnMachineFunction(MachineFunction &Fn) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "XCore FRAME_TO_ARGS_OFFSET Elimination"; } }; diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp index 5b0fcfa..86bc6f2 100644 --- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp +++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp @@ -44,7 +44,7 @@ namespace { : SelectionDAGISel(TM, OptLevel), Subtarget(*TM.getSubtargetImpl()) { } - SDNode *Select(SDNode *N); + SDNode *Select(SDNode *N) override; SDNode *SelectBRIND(SDNode *N); /// getI32Imm - Return a target constant with the specified value, of type @@ -70,7 +70,7 @@ namespace { bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, std::vector &OutOps) override; - virtual const char *getPassName() const { + const char *getPassName() const override { return "XCore DAG->DAG Pattern Instruction Selection"; } @@ -89,14 +89,14 @@ FunctionPass *llvm::createXCoreISelDag(XCoreTargetMachine &TM, bool XCoreDAGToDAGISel::SelectADDRspii(SDValue Addr, SDValue &Base, SDValue &Offset) { - FrameIndexSDNode *FIN = 0; + FrameIndexSDNode *FIN = nullptr; if ((FIN = dyn_cast(Addr))) { Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); Offset = CurDAG->getTargetConstant(0, MVT::i32); return true; } if (Addr.getOpcode() == ISD::ADD) { - ConstantSDNode *CN = 0; + ConstantSDNode *CN = nullptr; if ((FIN = dyn_cast(Addr.getOperand(0))) && (CN = dyn_cast(Addr.getOperand(1))) && (CN->getSExtValue() % 4 == 0 && CN->getSExtValue() >= 0)) { @@ -227,8 +227,7 @@ replaceInChain(SelectionDAG *CurDAG, SDValue Chain, SDValue Old, SDValue New) } if (!found) return SDValue(); - return CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, - &Ops[0], Ops.size()); + return CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, Ops); } SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) { @@ -237,10 +236,10 @@ SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) { SDValue Chain = N->getOperand(0); SDValue Addr = N->getOperand(1); if (Addr->getOpcode() != ISD::INTRINSIC_W_CHAIN) - return 0; + return nullptr; unsigned IntNo = cast(Addr->getOperand(1))->getZExtValue(); if (IntNo != Intrinsic::xcore_checkevent) - return 0; + return nullptr; SDValue nextAddr = Addr->getOperand(2); SDValue CheckEventChainOut(Addr.getNode(), 1); if (!CheckEventChainOut.use_empty()) { @@ -252,7 +251,7 @@ SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) { SDValue NewChain = replaceInChain(CurDAG, Chain, CheckEventChainOut, CheckEventChainIn); if (!NewChain.getNode()) - return 0; + return nullptr; Chain = NewChain; } // Enable events on the thread using setsr 1 and then disable them immediately diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 1b74013..9d78586 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -11,8 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "xcore-lower" - 
#include "XCoreISelLowering.h" #include "XCore.h" #include "XCoreMachineFunctionInfo.h" @@ -41,6 +39,8 @@ using namespace llvm; +#define DEBUG_TYPE "xcore-lower" + const char *XCoreTargetLowering:: getTargetNodeName(unsigned Opcode) const { @@ -64,7 +64,7 @@ getTargetNodeName(unsigned Opcode) const case XCoreISD::FRAME_TO_ARGS_OFFSET : return "XCoreISD::FRAME_TO_ARGS_OFFSET"; case XCoreISD::EH_RETURN : return "XCoreISD::EH_RETURN"; case XCoreISD::MEMBARRIER : return "XCoreISD::MEMBARRIER"; - default : return NULL; + default : return nullptr; } } @@ -268,21 +268,19 @@ LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const Op.getOperand(1)); } -SDValue XCoreTargetLowering:: -getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV, - SelectionDAG &DAG) const -{ +SDValue XCoreTargetLowering::getGlobalAddressWrapper(SDValue GA, + const GlobalValue *GV, + SelectionDAG &DAG) const { // FIXME there is no actual debug info here SDLoc dl(GA); const GlobalValue *UnderlyingGV = GV; // If GV is an alias then use the aliasee to determine the wrapper type if (const GlobalAlias *GA = dyn_cast(GV)) - UnderlyingGV = GA->getAliasedGlobal(); + UnderlyingGV = GA->getAliasee(); if (const GlobalVariable *GVar = dyn_cast(UnderlyingGV)) { - if ( ( GVar->isConstant() && - UnderlyingGV->isLocalLinkage(GV->getLinkage()) ) - || ( GVar->hasSection() && - StringRef(GVar->getSection()).startswith(".cp.") ) ) + if ((GVar->isConstant() && GV->hasLocalLinkage()) || + (GVar->hasSection() && + StringRef(GVar->getSection()).startswith(".cp."))) return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA); return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA); } @@ -428,13 +426,13 @@ lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain, SDValue Base, Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Low.getValue(1), High.getValue(1)); SDValue Ops[] = { Result, Chain }; - return DAG.getMergeValues(Ops, 2, DL); + return DAG.getMergeValues(Ops, DL); } static bool isWordAligned(SDValue Value, SelectionDAG &DAG) { APInt KnownZero, KnownOne; - DAG.ComputeMaskedBits(Value, KnownZero, KnownOne); + DAG.computeKnownBits(Value, KnownZero, KnownOne); return KnownZero.countTrailingOnes() >= 2; } @@ -494,7 +492,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Low.getValue(1), High.getValue(1)); SDValue Ops[] = { Result, Chain }; - return DAG.getMergeValues(Ops, 2, DL); + return DAG.getMergeValues(Ops, DL); } // Lower to a call to __misaligned_load(BasePtr). 
@@ -506,17 +504,15 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { Entry.Node = BasePtr; Args.push_back(Entry); - TargetLowering::CallLoweringInfo CLI(Chain, IntPtrTy, false, false, - false, false, 0, CallingConv::C, /*isTailCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed=*/true, - DAG.getExternalSymbol("__misaligned_load", getPointerTy()), - Args, DAG, DL); - std::pair CallResult = LowerCallTo(CLI); - - SDValue Ops[] = - { CallResult.first, CallResult.second }; + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL).setChain(Chain) + .setCallee(CallingConv::C, IntPtrTy, + DAG.getExternalSymbol("__misaligned_load", getPointerTy()), + &Args, 0); - return DAG.getMergeValues(Ops, 2, DL); + std::pair CallResult = LowerCallTo(CLI); + SDValue Ops[] = { CallResult.first, CallResult.second }; + return DAG.getMergeValues(Ops, DL); } SDValue XCoreTargetLowering:: @@ -568,14 +564,13 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const Entry.Node = Value; Args.push_back(Entry); - TargetLowering::CallLoweringInfo CLI(Chain, - Type::getVoidTy(*DAG.getContext()), false, false, - false, false, 0, CallingConv::C, /*isTailCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed=*/true, - DAG.getExternalSymbol("__misaligned_store", getPointerTy()), - Args, DAG, dl); - std::pair CallResult = LowerCallTo(CLI); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("__misaligned_store", getPointerTy()), + &Args, 0); + std::pair CallResult = LowerCallTo(CLI); return CallResult.second; } @@ -593,7 +588,7 @@ LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const LHS, RHS); SDValue Lo(Hi.getNode(), 1); SDValue Ops[] = { Lo, Hi }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } SDValue XCoreTargetLowering:: @@ -610,7 +605,7 @@ LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const Zero, Zero); SDValue Lo(Hi.getNode(), 1); SDValue Ops[] = { Lo, Hi }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } /// isADDADDMUL - Return whether Op is in a form that is equivalent to @@ -741,7 +736,7 @@ ExpandADDSUB(SDNode *N, SelectionDAG &DAG) const if (N->getOpcode() == ISD::ADD) { SDValue Result = TryExpandADDWithMul(N, DAG); - if (Result.getNode() != 0) + if (Result.getNode()) return Result; } @@ -886,7 +881,7 @@ LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { DAG.getCopyToReg(Chain, dl, HandlerReg, Handler) }; - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 2); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); return DAG.getNode(XCoreISD::EH_RETURN, dl, MVT::Other, Chain, DAG.getRegister(StackReg, MVT::i32), @@ -952,7 +947,7 @@ LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(TrmpAddr, 16), false, false, 0); - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 5); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } SDValue XCoreTargetLowering:: @@ -967,7 +962,7 @@ LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { Op.getOperand(1), Op.getOperand(2) , Op.getOperand(3)); SDValue Crc(Data.getNode(), 1); SDValue Results[] = { Crc, Data }; - return DAG.getMergeValues(Results, 2, DL); + return DAG.getMergeValues(Results, DL); } return SDValue(); } @@ -1111,7 +1106,7 @@ LowerCallResult(SDValue Chain, SDValue InFlag, unsigned index = ResultMemLocs[i].second; SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); SDValue Ops[] = 
{ Chain, DAG.getConstant(offset / 4, MVT::i32) }; - SDValue load = DAG.getNode(XCoreISD::LDWSP, dl, VTs, Ops, 2); + SDValue load = DAG.getNode(XCoreISD::LDWSP, dl, VTs, Ops); InVals[index] = load; MemOpChains.push_back(load.getValue(1)); } @@ -1119,8 +1114,7 @@ LowerCallResult(SDValue Chain, SDValue InFlag, // Transform all loads nodes into one single node because // all load nodes are independent of each other. if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); return Chain; } @@ -1204,8 +1198,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, // Transform all store nodes into one single node because // all store nodes are independent of each other. if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token // chain and flag operands which copy the outgoing args into registers. @@ -1244,7 +1237,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee, if (InFlag.getNode()) Ops.push_back(InFlag); - Chain = DAG.getNode(XCoreISD::BL, dl, NodeTys, &Ops[0], Ops.size()); + Chain = DAG.getNode(XCoreISD::BL, dl, NodeTys, Ops); InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. @@ -1347,7 +1340,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, errs() << "LowerFormalArguments Unhandled argument type: " << RegVT.getSimpleVT().SimpleTy << "\n"; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } case MVT::i32: unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass); @@ -1384,7 +1377,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, // 1b. CopyFromReg vararg registers. if (isVarArg) { // Argument registers - static const uint16_t ArgRegs[] = { + static const MCPhysReg ArgRegs[] = { XCore::R0, XCore::R1, XCore::R2, XCore::R3 }; XCoreFunctionInfo *XFI = MF.getInfo(); @@ -1422,8 +1415,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, // 2. chain CopyFromReg nodes into a TokenFactor. if (!CFRegNode.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &CFRegNode[0], - CFRegNode.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, CFRegNode); // 3. Memcpy 'byVal' args & push final InVals. // Aggregates passed "byVal" need to be copied by the callee. @@ -1452,8 +1444,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, // 4, chain mem ops nodes into a TokenFactor. if (!MemOps.empty()) { MemOps.push_back(Chain); - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOps[0], - MemOps.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } return Chain; @@ -1535,8 +1526,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain, // Transform all store nodes into one single node because // all stores are independent of each other. if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - &MemOpChains[0], MemOpChains.size()); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Now handle return values copied to registers. 
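// [Editor's note] One sketch for the mechanical change repeated throughout
// these lowering hunks: SelectionDAG::getNode and getMergeValues now take
// ArrayRef<SDValue>, so "&Ops[0], Ops.size()" and literal counts like
// "(Ops, 2, dl)" disappear; fixed arrays and SmallVectors convert implicitly.
// Assumes LLVM 3.5-era headers; dl and the operands stand in for the callers'
// locals.
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue joinChainsSketch(SelectionDAG &DAG, SDLoc dl,
                                SmallVectorImpl<SDValue> &MemOpChains) {
  // Before: DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
  //                     &MemOpChains[0], MemOpChains.size());
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
}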
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { @@ -1558,8 +1548,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other, - &RetOps[0], RetOps.size()); + return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other, RetOps); } //===----------------------------------------------------------------------===// @@ -1696,7 +1685,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, SDValue Result = DAG.getNode(ISD::AND, dl, VT, N2, DAG.getConstant(1, VT)); SDValue Ops[] = { Result, Carry }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } // fold (ladd x, 0, y) -> 0, add x, y iff carry is unused and y has only the @@ -1705,12 +1694,12 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, APInt KnownZero, KnownOne; APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), VT.getSizeInBits() - 1); - DAG.ComputeMaskedBits(N2, KnownZero, KnownOne); + DAG.computeKnownBits(N2, KnownZero, KnownOne); if ((KnownZero & Mask) == Mask) { SDValue Carry = DAG.getConstant(0, VT); SDValue Result = DAG.getNode(ISD::ADD, dl, VT, N0, N2); SDValue Ops[] = { Result, Carry }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } } } @@ -1728,13 +1717,13 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, APInt KnownZero, KnownOne; APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), VT.getSizeInBits() - 1); - DAG.ComputeMaskedBits(N2, KnownZero, KnownOne); + DAG.computeKnownBits(N2, KnownZero, KnownOne); if ((KnownZero & Mask) == Mask) { SDValue Borrow = N2; SDValue Result = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, VT), N2); SDValue Ops[] = { Result, Borrow }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } } @@ -1744,12 +1733,12 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, APInt KnownZero, KnownOne; APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), VT.getSizeInBits() - 1); - DAG.ComputeMaskedBits(N2, KnownZero, KnownOne); + DAG.computeKnownBits(N2, KnownZero, KnownOne); if ((KnownZero & Mask) == Mask) { SDValue Borrow = DAG.getConstant(0, VT); SDValue Result = DAG.getNode(ISD::SUB, dl, VT, N0, N2); SDValue Ops[] = { Result, Borrow }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } } } @@ -1775,14 +1764,14 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, if (N->hasNUsesOfValue(0, 0)) { SDValue Lo = DAG.getNode(ISD::ADD, dl, VT, N2, N3); SDValue Ops[] = { Lo, Lo }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } // Otherwise fold to ladd(a, b, 0) SDValue Result = DAG.getNode(XCoreISD::LADD, dl, DAG.getVTList(VT, VT), N2, N3, N1); SDValue Carry(Result.getNode(), 1); SDValue Ops[] = { Carry, Result }; - return DAG.getMergeValues(Ops, 2, dl); + return DAG.getMergeValues(Ops, dl); } } break; @@ -1866,11 +1855,11 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } -void XCoreTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) const { +void XCoreTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0); switch (Op.getOpcode()) { default: break; diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h index 
65e2bad..d28715b 100644 --- a/lib/Target/XCore/XCoreISelLowering.h +++ b/lib/Target/XCore/XCoreISelLowering.h @@ -97,31 +97,30 @@ namespace llvm { explicit XCoreTargetLowering(XCoreTargetMachine &TM); using TargetLowering::isZExtFree; - virtual bool isZExtFree(SDValue Val, EVT VT2) const; + bool isZExtFree(SDValue Val, EVT VT2) const override; - virtual unsigned getJumpTableEncoding() const; - virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; } + unsigned getJumpTableEncoding() const override; + MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; } /// LowerOperation - Provide custom lowering hooks for some operations. - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; /// ReplaceNodeResults - Replace the results of node with an illegal result /// type with new values built out of custom code. /// - virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, - SelectionDAG &DAG) const; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, + SelectionDAG &DAG) const override; /// getTargetNodeName - This method returns the name of a target specific // DAG node. - virtual const char *getTargetNodeName(unsigned Opcode) const; + const char *getTargetNodeName(unsigned Opcode) const override; - virtual MachineBasicBlock * + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *MBB) const; + MachineBasicBlock *MBB) const override; - virtual bool isLegalAddressingMode(const AddrMode &AM, - Type *Ty) const; + bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; private: const XCoreTargetMachine &TM; @@ -176,44 +175,44 @@ namespace llvm { // Inline asm support std::pair getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const; + MVT VT) const override; // Expand specifics SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const; SDValue ExpandADDSUB(SDNode *Op, SelectionDAG &DAG) const; - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - virtual void computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth = 0) const; + void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; - virtual SDValue + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc dl, SelectionDAG &DAG, - SmallVectorImpl &InVals) const; + SmallVectorImpl &InVals) const override; - virtual SDValue + SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const; + SmallVectorImpl &InVals) const override; - virtual SDValue + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, - SDLoc dl, SelectionDAG &DAG) const; + SDLoc dl, SelectionDAG &DAG) const override; - virtual bool + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &ArgsFlags, - LLVMContext &Context) const; + LLVMContext &Context) const override; }; } diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp index cea3bbf..984f0cd 100644 --- a/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/lib/Target/XCore/XCoreInstrInfo.cpp 
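// [Editor's note] The XCoreISelLowering.h rewrite above is the same mechanical
// 'virtual' -> 'override' migration applied to every XCore header in this
// patch. Minimal illustration of what it buys: a signature that drifts from
// the base class becomes a hard error instead of a silent non-override.
struct BaseSketch {
  virtual const char *getTargetNodeName(unsigned Opcode) const;
  virtual ~BaseSketch();
};
struct DerivedSketch : BaseSketch {
  // 'override' makes the compiler verify this really overrides the base
  // declaration; dropping the trailing 'const' would now fail to compile.
  const char *getTargetNodeName(unsigned Opcode) const override;
};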
@@ -26,6 +26,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + #define GET_INSTRINFO_CTOR_DTOR #include "XCoreGenInstrInfo.inc" @@ -41,9 +43,6 @@ namespace XCore { } } -using namespace llvm; - - // Pin the vtable to this file. void XCoreInstrInfo::anchor() {} @@ -289,7 +288,7 @@ XCoreInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB, assert((Cond.size() == 2 || Cond.size() == 0) && "Unexpected number of components!"); - if (FBB == 0) { // One way branch. + if (!FBB) { // One way branch. if (Cond.empty()) { // Unconditional branch BuildMI(&MBB, DL, get(XCore::BRFU_lu6)).addMBB(TBB); @@ -428,13 +427,21 @@ static inline bool isImmU16(unsigned val) { return val < (1 << 16); } +static bool isImmMskBitp(unsigned val) { + if (!isMask_32(val)) { + return false; + } + int N = Log2_32(val) + 1; + return (N >= 1 && N <= 8) || N == 16 || N == 24 || N == 32; +} + MachineBasicBlock::iterator XCoreInstrInfo::loadImmediate( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Reg, uint64_t Value) const { DebugLoc dl; if (MI != MBB.end()) dl = MI->getDebugLoc(); - if (isMask_32(Value)) { + if (isImmMskBitp(Value)) { int N = Log2_32(Value) + 1; return BuildMI(MBB, MI, dl, get(XCore::MKMSK_rus), Reg).addImm(N); } diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h index 48c9cb5..e0be96b 100644 --- a/lib/Target/XCore/XCoreInstrInfo.h +++ b/lib/Target/XCore/XCoreInstrInfo.h @@ -32,55 +32,55 @@ public: /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// - virtual const TargetRegisterInfo &getRegisterInfo() const { return RI; } + const TargetRegisterInfo &getRegisterInfo() const { return RI; } /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has /// any side effects other than loading from the stack slot. - virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const; - + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + /// isStoreToStackSlot - If the specified machine instruction is a direct /// store to a stack slot, return the virtual or physical register number of /// the source reg along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has /// any side effects other than storing to the stack slot. 
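
// Note on the loadImmediate hunk above: a bare isMask_32 check accepts any
// 2^N-1 constant, but the MKMSK_rus "bitp" immediate can only encode mask
// widths 1..8, 16, 24 and 32, which is exactly what the new isImmMskBitp
// predicate enforces using the same MathExtras helpers:
#include "llvm/Support/MathExtras.h"

static bool isImmMskBitp(unsigned val) {
  if (!llvm::isMask_32(val))
    return false;
  unsigned N = llvm::Log2_32(val) + 1;
  return (N >= 1 && N <= 8) || N == 16 || N == 24 || N == 32;
}
// e.g. 0xff (width 8) and 0xffff (width 16) still produce MKMSK, while
// 0x1ff (width 9) now falls through to the other immediate expansions
// instead of being emitted as an unencodable MKMSK.
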
- virtual unsigned isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const; - - virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify) const; - - virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl &Cond, - DebugLoc DL) const; - - virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; - - virtual void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const; - - virtual void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; - - virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; - - virtual bool ReverseBranchCondition( - SmallVectorImpl &Cond) const; + unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify) const override; + + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl &Cond, + DebugLoc DL) const override; + + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + bool ReverseBranchCondition( + SmallVectorImpl &Cond) const override; // Emit code before MBBI to load immediate value into physical register Reg. // Returns an iterator to the new instruction. diff --git a/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/lib/Target/XCore/XCoreLowerThreadLocal.cpp index b398c2d..ac3bae5 100644 --- a/lib/Target/XCore/XCoreLowerThreadLocal.cpp +++ b/lib/Target/XCore/XCoreLowerThreadLocal.cpp @@ -48,7 +48,7 @@ namespace { bool lowerGlobal(GlobalVariable *GV); - bool runOnModule(Module &M); + bool runOnModule(Module &M) override; }; } @@ -189,13 +189,14 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) { // Create replacement global. 
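
// Note: XCoreInstrInfo.h above, like most headers in this patch, swaps
// 'virtual' for 'override' on reimplemented hooks. Besides documenting
// intent, 'override' turns any future signature drift in the base class
// into a compile error instead of a silently hidden virtual. A
// self-contained illustration (names are illustrative):
namespace {
struct BaseInfoDemo {
  virtual unsigned RemoveBranch(int &Count) const { return 0; }
  virtual ~BaseInfoDemo() {}
};
struct DerivedInfoDemo : BaseInfoDemo {
  // Stops compiling, rather than declaring a new unrelated virtual, if
  // BaseInfoDemo::RemoveBranch ever changes its parameter list.
  unsigned RemoveBranch(int &Count) const override { return 1; }
};
} // namespace
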
ArrayType *NewType = createLoweredType(GV->getType()->getElementType()); - Constant *NewInitializer = 0; + Constant *NewInitializer = nullptr; if (GV->hasInitializer()) NewInitializer = createLoweredInitializer(NewType, GV->getInitializer()); GlobalVariable *NewGV = new GlobalVariable(*M, NewType, GV->isConstant(), GV->getLinkage(), - NewInitializer, "", 0, GlobalVariable::NotThreadLocal, + NewInitializer, "", nullptr, + GlobalVariable::NotThreadLocal, GV->getType()->getAddressSpace(), GV->isExternallyInitialized()); diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp index d85d717..316c82c 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.cpp +++ b/lib/Target/XCore/XCoreRegisterInfo.cpp @@ -33,11 +33,13 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +using namespace llvm; + +#define DEBUG_TYPE "xcore-reg-info" + #define GET_REGINFO_TARGET_DESC #include "XCoreGenRegisterInfo.inc" -using namespace llvm; - XCoreRegisterInfo::XCoreRegisterInfo() : XCoreGenRegisterInfo(XCore::LR) { } @@ -205,16 +207,16 @@ bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF) { MF.getFunction()->needsUnwindTableEntry(); } -const uint16_t* XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) +const MCPhysReg* XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { // The callee saved registers LR & FP are explicitly handled during // emitPrologue & emitEpilogue and related functions. - static const uint16_t CalleeSavedRegs[] = { + static const MCPhysReg CalleeSavedRegs[] = { XCore::R4, XCore::R5, XCore::R6, XCore::R7, XCore::R8, XCore::R9, XCore::R10, 0 }; - static const uint16_t CalleeSavedRegsFP[] = { + static const MCPhysReg CalleeSavedRegsFP[] = { XCore::R4, XCore::R5, XCore::R6, XCore::R7, XCore::R8, XCore::R9, 0 diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h index 36ba7b4..aa617a0 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.h +++ b/lib/Target/XCore/XCoreRegisterInfo.h @@ -29,22 +29,23 @@ public: /// Code Generation virtual methods... - const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const; + const MCPhysReg * + getCalleeSavedRegs(const MachineFunction *MF =nullptr) const override; - BitVector getReservedRegs(const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; - bool requiresRegisterScavenging(const MachineFunction &MF) const; + bool requiresRegisterScavenging(const MachineFunction &MF) const override; - bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const; + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override; - bool useFPForScavengingIndex(const MachineFunction &MF) const; + bool useFPForScavengingIndex(const MachineFunction &MF) const override; void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS = NULL) const; + RegScavenger *RS = nullptr) const override; // Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const; + unsigned getFrameRegister(const MachineFunction &MF) const override; //! 
Return whether to emit frame moves static bool needsFrameMoves(const MachineFunction &MF); diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp index 68ede6a..5a6bbe7 100644 --- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp +++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "xcore-selectiondag-info" #include "XCoreTargetMachine.h" using namespace llvm; +#define DEBUG_TYPE "xcore-selectiondag-info" + XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const XCoreTargetMachine &TM) : TargetSelectionDAGInfo(TM) { } @@ -41,13 +42,15 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); - TargetLowering::CallLoweringInfo - CLI(Chain, Type::getVoidTy(*DAG.getContext()), false, false, false, false, - 0, TLI.getLibcallCallingConv(RTLIB::MEMCPY), /*isTailCall=*/false, - /*doesNotRet=*/false, /*isReturnValueUsed=*/false, - DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy()), Args, DAG, dl); - std::pair CallResult = - TLI.LowerCallTo(CLI); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY), + Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy()), + &Args, 0) + .setDiscardResult(); + + std::pair CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.h b/lib/Target/XCore/XCoreSelectionDAGInfo.h index 31704f3..ea6af98 100644 --- a/lib/Target/XCore/XCoreSelectionDAGInfo.h +++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h @@ -25,14 +25,14 @@ public: explicit XCoreSelectionDAGInfo(const XCoreTargetMachine &TM); ~XCoreSelectionDAGInfo(); - virtual SDValue + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Op1, SDValue Op2, SDValue Op3, unsigned Align, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, - MachinePointerInfo SrcPtrInfo) const; + MachinePointerInfo SrcPtrInfo) const override; }; } diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp index 8cfb770..89ea03a 100644 --- a/lib/Target/XCore/XCoreSubtarget.cpp +++ b/lib/Target/XCore/XCoreSubtarget.cpp @@ -15,12 +15,14 @@ #include "XCore.h" #include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +#define DEBUG_TYPE "xcore-subtarget" + #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "XCoreGenSubtargetInfo.inc" -using namespace llvm; - void XCoreSubtarget::anchor() { } XCoreSubtarget::XCoreSubtarget(const std::string &TT, diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp index 781a87b..0fb21c5 100644 --- a/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/lib/Target/XCore/XCoreTargetMachine.cpp @@ -46,9 +46,9 @@ public: return getTM(); } - virtual bool addPreISel(); - virtual bool addInstSelector(); - virtual bool addPreEmitPass(); + bool addPreISel() override; + bool addInstSelector() override; + bool addPreEmitPass() override; }; } // namespace diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h index a19a677..a57ca55 100644 --- a/lib/Target/XCore/XCoreTargetMachine.h +++ b/lib/Target/XCore/XCoreTargetMachine.h @@ -37,28 +37,28 @@ public: Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); - virtual 
const XCoreInstrInfo *getInstrInfo() const { return &InstrInfo; } - virtual const XCoreFrameLowering *getFrameLowering() const { + const XCoreInstrInfo *getInstrInfo() const override { return &InstrInfo; } + const XCoreFrameLowering *getFrameLowering() const override { return &FrameLowering; } - virtual const XCoreSubtarget *getSubtargetImpl() const { return &Subtarget; } - virtual const XCoreTargetLowering *getTargetLowering() const { + const XCoreSubtarget *getSubtargetImpl() const override { return &Subtarget; } + const XCoreTargetLowering *getTargetLowering() const override { return &TLInfo; } - virtual const XCoreSelectionDAGInfo* getSelectionDAGInfo() const { + const XCoreSelectionDAGInfo* getSelectionDAGInfo() const override { return &TSInfo; } - virtual const TargetRegisterInfo *getRegisterInfo() const { + const TargetRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); } - virtual const DataLayout *getDataLayout() const { return &DL; } + const DataLayout *getDataLayout() const override { return &DL; } // Pass Pipeline Configuration - virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - virtual void addAnalysisPasses(PassManagerBase &PM); + void addAnalysisPasses(PassManagerBase &PM) override; }; } // end namespace llvm diff --git a/lib/Target/XCore/XCoreTargetObjectFile.h b/lib/Target/XCore/XCoreTargetObjectFile.h index 733e6d3..34d756e 100644 --- a/lib/Target/XCore/XCoreTargetObjectFile.h +++ b/lib/Target/XCore/XCoreTargetObjectFile.h @@ -22,7 +22,7 @@ static const unsigned CodeModelLargeSize = 256; const MCSection *ReadOnlySectionLarge; const MCSection *DataRelROSectionLarge; public: - void Initialize(MCContext &Ctx, const TargetMachine &TM); + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; const MCSection * getExplicitSectionGlobal(const GlobalValue *GV, diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.cpp b/lib/Target/XCore/XCoreTargetTransformInfo.cpp index 313d18d..80d193d 100644 --- a/lib/Target/XCore/XCoreTargetTransformInfo.cpp +++ b/lib/Target/XCore/XCoreTargetTransformInfo.cpp @@ -14,7 +14,6 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "xcoretti" #include "XCore.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/Debug.h" @@ -22,8 +21,10 @@ #include "llvm/Target/TargetLowering.h" using namespace llvm; +#define DEBUG_TYPE "xcoretti" + // Declare the pass initialization routine locally as target-specific passes -// don't havve a target-wide initialization entry point, and so we rely on the +// don't have a target-wide initialization entry point, and so we rely on the // pass constructor initialization. 
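
// Note on the XCoreSelectionDAGInfo.cpp hunk above: LLVM 3.5 retires the
// long positional CallLoweringInfo constructor in favor of chained
// setters. Rewritten with the template arguments that this diff rendering
// dropped (the wrapper function is illustrative):
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Type.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

static SDValue lowerMemcpy4(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
                            const TargetLowering &TLI,
                            TargetLowering::ArgListTy &Args) {
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain)
     .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
                Type::getVoidTy(*DAG.getContext()),
                DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy()),
                &Args, 0)
     .setDiscardResult();
  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
  return CallResult.second; // the chain; the call's result is discarded
}
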
namespace llvm { void initializeXCoreTTIPass(PassRegistry &); diff --git a/lib/Transforms/Hello/Hello.cpp b/lib/Transforms/Hello/Hello.cpp index c514c49..29b9bb8 100644 --- a/lib/Transforms/Hello/Hello.cpp +++ b/lib/Transforms/Hello/Hello.cpp @@ -12,13 +12,14 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hello" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Function.h" #include "llvm/Pass.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "hello" + STATISTIC(HelloCounter, "Counts number of functions greeted"); namespace { diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index 48d3fba..377fa15 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -29,7 +29,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "argpromotion" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Statistic.h" @@ -49,6 +48,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "argpromotion" + STATISTIC(NumArgumentsPromoted , "Number of pointer arguments promoted"); STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted"); STATISTIC(NumByValArgsPromoted , "Number of byval arguments promoted"); @@ -123,14 +124,14 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { Function *F = CGN->getFunction(); // Make sure that it is local to this module. - if (!F || !F->hasLocalLinkage()) return 0; + if (!F || !F->hasLocalLinkage()) return nullptr; // First check: see if there are any pointer arguments! If not, quick exit. SmallVector PointerArgs; for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) if (I->getType()->isPointerTy()) PointerArgs.push_back(I); - if (PointerArgs.empty()) return 0; + if (PointerArgs.empty()) return nullptr; // Second check: make sure that all callers are direct callers. We can't // transform functions that have indirect callers. Also see if the function @@ -139,7 +140,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { for (Use &U : F->uses()) { CallSite CS(U.getUser()); // Must be a direct call. - if (CS.getInstruction() == 0 || !CS.isCallee(&U)) return 0; + if (CS.getInstruction() == nullptr || !CS.isCallee(&U)) return nullptr; if (CS.getInstruction()->getParent()->getParent() == F) isSelfRecursive = true; @@ -207,7 +208,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { // No promotable pointer arguments. if (ArgsToPromote.empty() && ByValArgsToTransform.empty()) - return 0; + return nullptr; return DoPromotion(F, ArgsToPromote, ByValArgsToTransform); } @@ -660,7 +661,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, Type *AgTy = cast(I->getType())->getElementType(); StructType *STy = cast(AgTy); Value *Idxs[2] = { - ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), 0 }; + ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr }; for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); Value *Idx = GetElementPtrInst::Create(*AI, Idxs, @@ -788,10 +789,10 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // Just add all the struct element types. 
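
// Note: the Hello.cpp hunk above (and the DEBUG_TYPE moves throughout this
// patch) follow the new 3.5 convention that DEBUG_TYPE is defined only
// after all #include lines, so the macro cannot leak into, or collide
// with, headers that also use the debug machinery. The resulting file
// skeleton, as a sketch with an illustrative pass name:
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "demo-pass" // after every #include, never before

STATISTIC(DemoCounter, "Counts how often the demo fires");
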
Type *AgTy = cast(I->getType())->getElementType(); - Value *TheAlloca = new AllocaInst(AgTy, 0, "", InsertPt); + Value *TheAlloca = new AllocaInst(AgTy, nullptr, "", InsertPt); StructType *STy = cast(AgTy); Value *Idxs[2] = { - ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), 0 }; + ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr }; for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp index 5c3acea..23be081 100644 --- a/lib/Transforms/IPO/ConstantMerge.cpp +++ b/lib/Transforms/IPO/ConstantMerge.cpp @@ -17,7 +17,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "constmerge" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerIntPair.h" @@ -31,6 +30,8 @@ #include "llvm/Pass.h" using namespace llvm; +#define DEBUG_TYPE "constmerge" + STATISTIC(NumMerged, "Number of global constants merged"); namespace { @@ -66,7 +67,7 @@ ModulePass *llvm::createConstantMergePass() { return new ConstantMerge(); } /// Find values that are marked as llvm.used. static void FindUsedValues(GlobalVariable *LLVMUsed, SmallPtrSet &UsedValues) { - if (LLVMUsed == 0) return; + if (!LLVMUsed) return; ConstantArray *Inits = cast(LLVMUsed->getInitializer()); for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) { @@ -103,7 +104,7 @@ unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const { bool ConstantMerge::runOnModule(Module &M) { DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; // Find all the globals that are marked "used". These cannot be merged. SmallPtrSet UsedGlobals; @@ -161,7 +162,7 @@ bool ConstantMerge::runOnModule(Module &M) { // If this is the first constant we find or if the old one is local, // replace with the current one. If the current is externally visible // it cannot be replace, but can be the canonical constant we merge with. - if (Slot == 0 || IsBetterCanonical(*GV, *Slot)) + if (!Slot || IsBetterCanonical(*GV, *Slot)) Slot = GV; } diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 1aba3df..284b896 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -17,7 +17,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "deadargelim" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" @@ -38,8 +37,11 @@ #include "llvm/Support/raw_ostream.h" #include #include +#include using namespace llvm; +#define DEBUG_TYPE "deadargelim" + STATISTIC(NumArgumentsEliminated, "Number of unread args removed"); STATISTIC(NumRetValsEliminated , "Number of unused return values removed"); STATISTIC(NumArgumentsReplacedWithUndef, @@ -764,7 +766,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { // Find out the new return value. 
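
// Note: the 0/NULL -> nullptr sweep in these IPO hunks is mechanical, but
// it pays off most at call sites such as AllocaInst, where the literal is
// an optional Value* array size rather than a number. A sketch assuming
// the 3.5 IR headers (the function name is illustrative):
#include "llvm/IR/Instructions.h"
using namespace llvm;

static AllocaInst *createScalarSlot(Type *Ty, Instruction *InsertPt) {
  // nullptr = no array size, i.e. a single element; '0' read as a count.
  return new AllocaInst(Ty, /*ArraySize=*/nullptr, "slot", InsertPt);
}
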
Type *RetTy = FTy->getReturnType(); - Type *NRetTy = NULL; + Type *NRetTy = nullptr; unsigned RetCount = NumRetVals(F); // -1 means unused, other numbers are the new index @@ -1050,7 +1052,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { Value *RetVal; if (NFTy->getReturnType()->isVoidTy()) { - RetVal = 0; + RetVal = nullptr; } else { assert (RetTy->isStructTy()); // The original return value was a struct, insert diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp index 4211f12..40ec9fa 100644 --- a/lib/Transforms/IPO/ExtractGV.cpp +++ b/lib/Transforms/IPO/ExtractGV.cpp @@ -27,11 +27,10 @@ using namespace llvm; /// the split module remain valid. static void makeVisible(GlobalValue &GV, bool Delete) { bool Local = GV.hasLocalLinkage(); - if (Local) - GV.setVisibility(GlobalValue::HiddenVisibility); - if (Local || Delete) { GV.setLinkage(GlobalValue::ExternalLinkage); + if (Local) + GV.setVisibility(GlobalValue::HiddenVisibility); return; } @@ -95,7 +94,7 @@ namespace { makeVisible(*I, Delete); if (Delete) - I->setInitializer(0); + I->setInitializer(nullptr); } // Visit the Functions. @@ -134,7 +133,7 @@ namespace { } else { Declaration = new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage, - 0, CurI->getName()); + nullptr, CurI->getName()); } CurI->replaceAllUsesWith(Declaration); diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index b716718..fed8839 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -18,7 +18,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "functionattrs" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SetVector.h" @@ -35,6 +34,8 @@ #include "llvm/Target/TargetLibraryInfo.h" using namespace llvm; +#define DEBUG_TYPE "functionattrs" + STATISTIC(NumReadNone, "Number of functions marked readnone"); STATISTIC(NumReadOnly, "Number of functions marked readonly"); STATISTIC(NumNoCapture, "Number of arguments marked nocapture"); @@ -46,7 +47,7 @@ STATISTIC(NumAnnotated, "Number of attributes added to library functions"); namespace { struct FunctionAttrs : public CallGraphSCCPass { static char ID; // Pass identification, replacement for typeid - FunctionAttrs() : CallGraphSCCPass(ID), AA(0) { + FunctionAttrs() : CallGraphSCCPass(ID), AA(nullptr) { initializeFunctionAttrsPass(*PassRegistry::getPassRegistry()); } @@ -160,7 +161,7 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) { for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { Function *F = (*I)->getFunction(); - if (F == 0) + if (!F) // External node - may write memory. Just give up. return false; @@ -319,7 +320,7 @@ namespace { ArgumentGraphNode SyntheticRoot; public: - ArgumentGraph() { SyntheticRoot.Definition = 0; } + ArgumentGraph() { SyntheticRoot.Definition = nullptr; } typedef SmallVectorImpl::iterator iterator; @@ -521,7 +522,7 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { Function *F = (*I)->getFunction(); - if (F == 0) + if (!F) // External node - only a problem for arguments that we pass to it. continue; @@ -600,7 +601,7 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { // captures. 
for (scc_iterator I = scc_begin(&AG); !I.isAtEnd(); ++I) { - std::vector &ArgumentSCC = *I; + const std::vector &ArgumentSCC = *I; if (ArgumentSCC.size() == 1) { if (!ArgumentSCC[0]->Definition) continue; // synthetic root node @@ -616,8 +617,8 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { } bool SCCCaptured = false; - for (std::vector::iterator I = ArgumentSCC.begin(), - E = ArgumentSCC.end(); I != E && !SCCCaptured; ++I) { + for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end(); + I != E && !SCCCaptured; ++I) { ArgumentGraphNode *Node = *I; if (Node->Uses.empty()) { if (!Node->Definition->hasNoCaptureAttr()) @@ -629,13 +630,12 @@ bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) { SmallPtrSet ArgumentSCCNodes; // Fill ArgumentSCCNodes with the elements of the ArgumentSCC. Used for // quickly looking up whether a given Argument is in this ArgumentSCC. - for (std::vector::iterator I = ArgumentSCC.begin(), - E = ArgumentSCC.end(); I != E; ++I) { + for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end(); I != E; ++I) { ArgumentSCCNodes.insert((*I)->Definition); } - for (std::vector::iterator I = ArgumentSCC.begin(), - E = ArgumentSCC.end(); I != E && !SCCCaptured; ++I) { + for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end(); + I != E && !SCCCaptured; ++I) { ArgumentGraphNode *N = *I; for (SmallVectorImpl::iterator UI = N->Uses.begin(), UE = N->Uses.end(); UI != UE; ++UI) { @@ -775,7 +775,7 @@ bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) { for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { Function *F = (*I)->getFunction(); - if (F == 0) + if (!F) // External node - skip it; return false; @@ -1668,7 +1668,7 @@ bool FunctionAttrs::annotateLibraryCalls(const CallGraphSCC &SCC) { for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { Function *F = (*I)->getFunction(); - if (F != 0 && F->isDeclaration()) + if (F && F->isDeclaration()) MadeChange |= inferPrototypeAttributes(*F); } diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index 0c081f1..9decddc 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -15,15 +15,18 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "globaldce" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Pass.h" using namespace llvm; +#define DEBUG_TYPE "globaldce" + STATISTIC(NumAliases , "Number of global aliases removed"); STATISTIC(NumFunctions, "Number of functions removed"); STATISTIC(NumVariables, "Number of global variables removed"); @@ -53,6 +56,15 @@ namespace { }; } +/// Returns true if F contains only a single "ret" instruction. +static bool isEmptyFunction(Function *F) { + BasicBlock &Entry = F->getEntryBlock(); + if (Entry.size() != 1 || !isa(Entry.front())) + return false; + ReturnInst &RI = cast(Entry.front()); + return RI.getReturnValue() == NULL; +} + char GlobalDCE::ID = 0; INITIALIZE_PASS(GlobalDCE, "globaldce", "Dead Global Elimination", false, false) @@ -61,7 +73,10 @@ ModulePass *llvm::createGlobalDCEPass() { return new GlobalDCE(); } bool GlobalDCE::runOnModule(Module &M) { bool Changed = false; - + + // Remove empty functions from the global ctors list. 
+ Changed |= optimizeGlobalCtorsList(M, isEmptyFunction); + // Loop over the module, adding globals which are obviously necessary. for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { Changed |= RemoveUnusedGlobalValue(*I); @@ -99,7 +114,7 @@ bool GlobalDCE::runOnModule(Module &M) { I != E; ++I) if (!AliveGlobals.count(I)) { DeadGlobalVars.push_back(I); // Keep track of dead globals - I->setInitializer(0); + I->setInitializer(nullptr); } // The second pass drops the bodies of functions which are dead... @@ -117,7 +132,7 @@ bool GlobalDCE::runOnModule(Module &M) { ++I) if (!AliveGlobals.count(I)) { DeadAliases.push_back(I); - I->setAliasee(0); + I->setAliasee(nullptr); } if (!DeadFunctions.empty()) { diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 1a510cf..ae80c43 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "globalopt" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" @@ -39,11 +38,15 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/GlobalStatus.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include +#include using namespace llvm; +#define DEBUG_TYPE "globalopt" + STATISTIC(NumMarked , "Number of globals marked constant"); STATISTIC(NumUnnamed , "Number of globals marked unnamed_addr"); STATISTIC(NumSRA , "Number of aggregate globals broken into scalars"); @@ -74,11 +77,9 @@ namespace { bool runOnModule(Module &M) override; private: - GlobalVariable *FindGlobalCtors(Module &M); bool OptimizeFunctions(Module &M); bool OptimizeGlobalVars(Module &M); bool OptimizeGlobalAliases(Module &M); - bool OptimizeGlobalCtorsList(GlobalVariable *&GCL); bool ProcessGlobal(GlobalVariable *GV,Module::global_iterator &GVI); bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI, const GlobalStatus &GS); @@ -294,7 +295,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, Changed = true; } else if (ConstantExpr *CE = dyn_cast(U)) { if (CE->getOpcode() == Instruction::GetElementPtr) { - Constant *SubInit = 0; + Constant *SubInit = nullptr; if (Init) SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE); Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, TLI); @@ -302,7 +303,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, CE->getType()->isPointerTy()) || CE->getOpcode() == Instruction::AddrSpaceCast) { // Pointer cast, delete any stores and memsets to the global. - Changed |= CleanupConstantGlobalUsers(CE, 0, DL, TLI); + Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, TLI); } if (CE->use_empty()) { @@ -313,7 +314,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, // Do not transform "gepinst (gep constexpr (GV))" here, because forming // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold // and will invalidate our notion of what Init is. - Constant *SubInit = 0; + Constant *SubInit = nullptr; if (!isa(GEP->getOperand(0))) { ConstantExpr *CE = dyn_cast_or_null(ConstantFoldInstruction(GEP, DL, TLI)); @@ -370,7 +371,7 @@ static bool isSafeSROAElementUse(Value *V) { // Otherwise, it must be a GEP. 
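
// Note: the GlobalDCE and GlobalOpt changes around this point are two
// halves of one refactor. The llvm.global_ctors parsing and rewriting that
// GlobalOpt carried privately (FindGlobalCtors above; ParseGlobalCtors and
// InstallGlobalCtors removed further below) now lives in
// Transforms/Utils/CtorUtils behind one entry point that takes a "should
// this ctor be removed?" callback. A sketch of the new usage, inlining
// GlobalDCE's empty-function test into the callback:
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/CtorUtils.h"
using namespace llvm;

static bool dropEmptyCtors(Module &M) {
  return optimizeGlobalCtorsList(M, [](Function *F) {
    if (F->empty())
      return false; // a declaration cannot be proven empty
    BasicBlock &Entry = F->getEntryBlock();
    return Entry.size() == 1 && isa<ReturnInst>(Entry.front()) &&
           cast<ReturnInst>(Entry.front()).getReturnValue() == nullptr;
  });
}
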
GetElementPtrInst *GEPI = dyn_cast(I); - if (GEPI == 0) return false; + if (!GEPI) return false; if (GEPI->getNumOperands() < 3 || !isa(GEPI->getOperand(1)) || !cast(GEPI->getOperand(1))->isNullValue()) @@ -470,7 +471,7 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) { static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // Make sure this global only has simple uses that we can SRA. if (!GlobalUsersSafeToSRA(GV)) - return 0; + return nullptr; assert(GV->hasLocalLinkage() && !GV->isConstant()); Constant *Init = GV->getInitializer(); @@ -514,7 +515,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { NumElements = cast(STy)->getNumElements(); if (NumElements > 16 && GV->hasNUsesOrMore(16)) - return 0; // It's not worth it. + return nullptr; // It's not worth it. NewGlobals.reserve(NumElements); uint64_t EltSize = DL.getTypeAllocSize(STy->getElementType()); @@ -541,7 +542,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { } if (NewGlobals.empty()) - return 0; + return nullptr; DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV); @@ -603,7 +604,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { if (FirstGlobal == i) ++FirstGlobal; } - return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : 0; + return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : nullptr; } /// AllUsesOfValueWillTrapIfNull - Return true if all users of the specified @@ -785,7 +786,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, Changed |= CleanupPointerRootUsers(GV, TLI); } else { Changed = true; - CleanupConstantGlobalUsers(GV, 0, DL, TLI); + CleanupConstantGlobalUsers(GV, nullptr, DL, TLI); } if (GV->use_empty()) { DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n"); @@ -847,7 +848,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, // If there are bitcast users of the malloc (which is typical, usually we have // a malloc + bitcast) then replace them with uses of the new global. Update // other users to use the global as well. - BitCastInst *TheBC = 0; + BitCastInst *TheBC = nullptr; while (!CI->use_empty()) { Instruction *User = cast(CI->user_back()); if (BitCastInst *BCI = dyn_cast(User)) { @@ -858,7 +859,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, BCI->setOperand(0, NewGV); } } else { - if (TheBC == 0) + if (!TheBC) TheBC = new BitCastInst(NewGV, CI->getType(), "newgv", CI); User->replaceUsesOfWith(CI, TheBC); } @@ -1169,10 +1170,13 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo, } else if (PHINode *PN = dyn_cast(V)) { // PN's type is pointer to struct. Make a new PHI of pointer to struct // field. 
- StructType *ST = cast(PN->getType()->getPointerElementType()); + PointerType *PTy = cast(PN->getType()); + StructType *ST = cast(PTy->getElementType()); + + unsigned AS = PTy->getAddressSpace(); PHINode *NewPN = - PHINode::Create(PointerType::getUnqual(ST->getElementType(FieldNo)), + PHINode::Create(PointerType::get(ST->getElementType(FieldNo), AS), PN->getNumIncomingValues(), PN->getName()+".f"+Twine(FieldNo), PN); Result = NewPN; @@ -1284,9 +1288,10 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, std::vector FieldGlobals; std::vector FieldMallocs; + unsigned AS = GV->getType()->getPointerAddressSpace(); for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){ Type *FieldTy = STy->getElementType(FieldNo); - PointerType *PFieldTy = PointerType::getUnqual(FieldTy); + PointerType *PFieldTy = PointerType::get(FieldTy, AS); GlobalVariable *NGV = new GlobalVariable(*GV->getParent(), @@ -1302,7 +1307,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, Type *IntPtrTy = DL->getIntPtrType(CI->getType()); Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy, ConstantInt::get(IntPtrTy, TypeSize), - NElems, 0, + NElems, nullptr, CI->getName() + ".f" + Twine(FieldNo)); FieldMallocs.push_back(NMI); new StoreInst(NMI, NGV, CI); @@ -1535,7 +1540,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements()); Instruction *Malloc = CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, AllocSize, NumElements, - 0, CI->getName()); + nullptr, CI->getName()); Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI); CI->replaceAllUsesWith(Cast); CI->eraseFromParent(); @@ -1750,7 +1755,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, ->getEntryBlock().begin()); Type *ElemTy = GV->getType()->getElementType(); // FIXME: Pass Global's alignment when globals have alignment - AllocaInst *Alloca = new AllocaInst(ElemTy, NULL, GV->getName(), &FirstI); + AllocaInst *Alloca = new AllocaInst(ElemTy, nullptr, + GV->getName(), &FirstI); if (!isa(GV->getInitializer())) new StoreInst(GV->getInitializer(), Alloca, &FirstI); @@ -1957,116 +1963,6 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) { return Changed; } -/// FindGlobalCtors - Find the llvm.global_ctors list, verifying that all -/// initializers have an init priority of 65535. -GlobalVariable *GlobalOpt::FindGlobalCtors(Module &M) { - GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); - if (GV == 0) return 0; - - // Verify that the initializer is simple enough for us to handle. We are - // only allowed to optimize the initializer if it is unique. - if (!GV->hasUniqueInitializer()) return 0; - - if (isa(GV->getInitializer())) - return GV; - ConstantArray *CA = cast(GV->getInitializer()); - - for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) { - if (isa(*i)) - continue; - ConstantStruct *CS = cast(*i); - if (isa(CS->getOperand(1))) - continue; - - // Must have a function or null ptr. - if (!isa(CS->getOperand(1))) - return 0; - - // Init priority must be standard. - ConstantInt *CI = cast(CS->getOperand(0)); - if (CI->getZExtValue() != 65535) - return 0; - } - - return GV; -} - -/// ParseGlobalCtors - Given a llvm.global_ctors list that we can understand, -/// return a list of the functions and null terminator as a vector. 
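
// Note on the GetHeapSROAValue hunk above: the rewritten PHI no longer
// assumes address space 0 via PointerType::getUnqual; it reuses the
// address space of the pointer being split. With the template arguments
// that this diff rendering dropped restored (the wrapper is illustrative):
#include "llvm/ADT/Twine.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static PHINode *makeFieldPHI(PHINode *PN, unsigned FieldNo) {
  PointerType *PTy = cast<PointerType>(PN->getType());
  StructType *ST = cast<StructType>(PTy->getElementType());
  unsigned AS = PTy->getAddressSpace(); // preserved rather than forced to 0
  return PHINode::Create(PointerType::get(ST->getElementType(FieldNo), AS),
                         PN->getNumIncomingValues(),
                         PN->getName() + ".f" + Twine(FieldNo), PN);
}
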
-static std::vector ParseGlobalCtors(GlobalVariable *GV) { - if (GV->getInitializer()->isNullValue()) - return std::vector(); - ConstantArray *CA = cast(GV->getInitializer()); - std::vector Result; - Result.reserve(CA->getNumOperands()); - for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) { - ConstantStruct *CS = cast(*i); - Result.push_back(dyn_cast(CS->getOperand(1))); - } - return Result; -} - -/// InstallGlobalCtors - Given a specified llvm.global_ctors list, install the -/// specified array, returning the new global to use. -static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL, - const std::vector &Ctors) { - // If we made a change, reassemble the initializer list. - Constant *CSVals[2]; - CSVals[0] = ConstantInt::get(Type::getInt32Ty(GCL->getContext()), 65535); - CSVals[1] = 0; - - StructType *StructTy = - cast(GCL->getType()->getElementType()->getArrayElementType()); - - // Create the new init list. - std::vector CAList; - for (unsigned i = 0, e = Ctors.size(); i != e; ++i) { - if (Ctors[i]) { - CSVals[1] = Ctors[i]; - } else { - Type *FTy = FunctionType::get(Type::getVoidTy(GCL->getContext()), - false); - PointerType *PFTy = PointerType::getUnqual(FTy); - CSVals[1] = Constant::getNullValue(PFTy); - CSVals[0] = ConstantInt::get(Type::getInt32Ty(GCL->getContext()), - 0x7fffffff); - } - CAList.push_back(ConstantStruct::get(StructTy, CSVals)); - } - - // Create the array initializer. - Constant *CA = ConstantArray::get(ArrayType::get(StructTy, - CAList.size()), CAList); - - // If we didn't change the number of elements, don't create a new GV. - if (CA->getType() == GCL->getInitializer()->getType()) { - GCL->setInitializer(CA); - return GCL; - } - - // Create the new global and insert it next to the existing list. - GlobalVariable *NGV = new GlobalVariable(CA->getType(), GCL->isConstant(), - GCL->getLinkage(), CA, "", - GCL->getThreadLocalMode()); - GCL->getParent()->getGlobalList().insert(GCL, NGV); - NGV->takeName(GCL); - - // Nuke the old list, replacing any uses with the new one. - if (!GCL->use_empty()) { - Constant *V = NGV; - if (V->getType() != GCL->getType()) - V = ConstantExpr::getBitCast(V, GCL->getType()); - GCL->replaceAllUsesWith(V); - } - GCL->eraseFromParent(); - - if (Ctors.size()) - return NGV; - else - return 0; -} - - static inline bool isSimpleEnoughValueToCommit(Constant *C, SmallPtrSet &SimpleConstants, @@ -2271,22 +2167,16 @@ class Evaluator { public: Evaluator(const DataLayout *DL, const TargetLibraryInfo *TLI) : DL(DL), TLI(TLI) { - ValueStack.push_back(new DenseMap); + ValueStack.emplace_back(); } ~Evaluator() { - DeleteContainerPointers(ValueStack); - while (!AllocaTmps.empty()) { - GlobalVariable *Tmp = AllocaTmps.back(); - AllocaTmps.pop_back(); - + for (auto &Tmp : AllocaTmps) // If there are still users of the alloca, the program is doing something // silly, e.g. storing the address of the alloca somewhere and using it // later. Since this is undefined, we'll just make it be null. 
if (!Tmp->use_empty()) Tmp->replaceAllUsesWith(Constant::getNullValue(Tmp->getType())); - delete Tmp; - } } /// EvaluateFunction - Evaluate a call to function F, returning true if @@ -2302,13 +2192,13 @@ public: Constant *getVal(Value *V) { if (Constant *CV = dyn_cast(V)) return CV; - Constant *R = ValueStack.back()->lookup(V); + Constant *R = ValueStack.back().lookup(V); assert(R && "Reference to an uncomputed value!"); return R; } void setVal(Value *V, Constant *C) { - ValueStack.back()->operator[](V) = C; + ValueStack.back()[V] = C; } const DenseMap &getMutatedMemory() const { @@ -2323,9 +2213,9 @@ private: Constant *ComputeLoadResult(Constant *P); /// ValueStack - As we compute SSA register values, we store their contents - /// here. The back of the vector contains the current function and the stack + /// here. The back of the deque contains the current function and the stack /// contains the values in the calling frames. - SmallVector*, 4> ValueStack; + std::deque> ValueStack; /// CallStack - This is used to detect recursion. In pathological situations /// we could hit exponential behavior, but at least there is nothing @@ -2340,7 +2230,7 @@ private: /// AllocaTmps - To 'execute' an alloca, we create a temporary global variable /// to represent its body. This vector is needed so we can delete the /// temporary globals when we are done. - SmallVector AllocaTmps; + SmallVector, 32> AllocaTmps; /// Invariants - These global variables have been marked invariant by the /// static constructor. @@ -2369,7 +2259,7 @@ Constant *Evaluator::ComputeLoadResult(Constant *P) { if (GlobalVariable *GV = dyn_cast(P)) { if (GV->hasDefinitiveInitializer()) return GV->getInitializer(); - return 0; + return nullptr; } // Handle a constantexpr getelementptr. @@ -2381,7 +2271,7 @@ Constant *Evaluator::ComputeLoadResult(Constant *P) { return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE); } - return 0; // don't know how to evaluate. + return nullptr; // don't know how to evaluate. } /// EvaluateBlock - Evaluate all instructions in block BB, returning true if @@ -2391,7 +2281,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB) { // This is the main evaluation loop. while (1) { - Constant *InstResult = 0; + Constant *InstResult = nullptr; DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n"); @@ -2517,7 +2407,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, "folding: " << *Ptr << "\n"); } InstResult = ComputeLoadResult(Ptr); - if (InstResult == 0) { + if (!InstResult) { DEBUG(dbgs() << "Failed to compute load result. Can not evaluate load." "\n"); return false; // Could not evaluate load. @@ -2530,11 +2420,10 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, return false; // Cannot handle array allocs. } Type *Ty = AI->getType()->getElementType(); - AllocaTmps.push_back(new GlobalVariable(Ty, false, - GlobalValue::InternalLinkage, - UndefValue::get(Ty), - AI->getName())); - InstResult = AllocaTmps.back(); + AllocaTmps.push_back( + make_unique(Ty, false, GlobalValue::InternalLinkage, + UndefValue::get(Ty), AI->getName())); + InstResult = AllocaTmps.back().get(); DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n"); } else if (isa(CurInst) || isa(CurInst)) { CallSite CS(CurInst); @@ -2636,17 +2525,17 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, return false; } - Constant *RetVal = 0; + Constant *RetVal = nullptr; // Execute the call, if successful, use the return value. 
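
// Note: the Evaluator hunks above and below replace two manually managed
// containers with owning ones. The per-call-frame SSA maps become a
// std::deque<DenseMap<Value*, Constant*>> (emplace_back/pop_back instead
// of new/delete, and a deque never relocates live frames as it grows), and
// AllocaTmps becomes a SmallVector of std::unique_ptr so the temporary
// globals are destroyed automatically. The same pattern reduced to a
// self-contained sketch (the struct is illustrative):
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h" // llvm::make_unique in 3.5
#include "llvm/ADT/SmallVector.h"
#include <deque>
#include <memory>

struct OwnedEvalState {
  std::deque<llvm::DenseMap<int *, int>> ValueStack;
  llvm::SmallVector<std::unique_ptr<int>, 4> Temps;

  void enterFrame() { ValueStack.emplace_back(); }
  void leaveFrame() { ValueStack.pop_back(); } // was: delete pop_back_val()
  int *makeTemp(int V) {
    Temps.push_back(llvm::make_unique<int>(V)); // freed with this object
    return Temps.back().get();
  }
};
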
- ValueStack.push_back(new DenseMap); + ValueStack.emplace_back(); if (!EvaluateFunction(Callee, RetVal, Formals)) { DEBUG(dbgs() << "Failed to evaluate function.\n"); return false; } - delete ValueStack.pop_back_val(); + ValueStack.pop_back(); InstResult = RetVal; - if (InstResult != NULL) { + if (InstResult) { DEBUG(dbgs() << "Successfully evaluated function. Result: " << InstResult << "\n\n"); } else { @@ -2678,7 +2567,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, else return false; // Cannot determine. } else if (isa(CurInst)) { - NextBB = 0; + NextBB = nullptr; } else { // invoke, unwind, resume, unreachable. DEBUG(dbgs() << "Can not handle terminator."); @@ -2743,13 +2632,13 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, BasicBlock::iterator CurInst = CurBB->begin(); while (1) { - BasicBlock *NextBB = 0; // Initialized to avoid compiler warnings. + BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings. DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n"); if (!EvaluateBlock(CurInst, NextBB)) return false; - if (NextBB == 0) { + if (!NextBB) { // Successfully running until there's no next block means that we found // the return. Fill it the return value and pop the call stack. ReturnInst *RI = cast(CurBB->getTerminator()); @@ -2768,7 +2657,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, // Okay, we have never been in this block before. Check to see if there // are any PHI nodes. If so, evaluate them with information about where // we came from. - PHINode *PN = 0; + PHINode *PN = nullptr; for (CurInst = NextBB->begin(); (PN = dyn_cast(CurInst)); ++CurInst) setVal(PN, getVal(PN->getIncomingValueForBlock(CurBB))); @@ -2789,6 +2678,8 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout *DL, SmallVector()); if (EvalSuccess) { + ++NumCtorsEvaluated; + // We succeeded at evaluation: commit the result. DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '" << F->getName() << "' to " << Eval.getMutatedMemory().size() @@ -2806,46 +2697,6 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout *DL, return EvalSuccess; } -/// OptimizeGlobalCtorsList - Simplify and evaluation global ctors if possible. -/// Return true if anything changed. -bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) { - std::vector Ctors = ParseGlobalCtors(GCL); - bool MadeChange = false; - if (Ctors.empty()) return false; - - // Loop over global ctors, optimizing them when we can. - for (unsigned i = 0; i != Ctors.size(); ++i) { - Function *F = Ctors[i]; - // Found a null terminator in the middle of the list, prune off the rest of - // the list. - if (F == 0) { - if (i != Ctors.size()-1) { - Ctors.resize(i+1); - MadeChange = true; - } - break; - } - DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n"); - - // We cannot simplify external ctor functions. - if (F->empty()) continue; - - // If we can evaluate the ctor at compile time, do. 
- if (EvaluateStaticConstructor(F, DL, TLI)) { - Ctors.erase(Ctors.begin()+i); - MadeChange = true; - --i; - ++NumCtorsEvaluated; - continue; - } - } - - if (!MadeChange) return false; - - GCL = InstallGlobalCtors(GCL, Ctors); - return true; -} - static int compareNames(Constant *const *A, Constant *const *B) { return (*A)->getName().compare((*B)->getName()); } @@ -3010,7 +2861,7 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { if (!hasUsesToReplace(*J, Used, RenameTarget)) continue; - J->replaceAllUsesWith(Aliasee); + J->replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J->getType())); ++NumAliasesResolved; Changed = true; @@ -3042,12 +2893,12 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc::cxa_atexit)) - return 0; + return nullptr; Function *Fn = M.getFunction(TLI->getName(LibFunc::cxa_atexit)); if (!Fn) - return 0; + return nullptr; FunctionType *FTy = Fn->getFunctionType(); @@ -3058,7 +2909,7 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { !FTy->getParamType(0)->isPointerTy() || !FTy->getParamType(1)->isPointerTy() || !FTy->getParamType(2)->isPointerTy()) - return 0; + return nullptr; return Fn; } @@ -3160,12 +3011,9 @@ bool GlobalOpt::runOnModule(Module &M) { bool Changed = false; DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis(); - // Try to find the llvm.globalctors list. - GlobalVariable *GlobalCtors = FindGlobalCtors(M); - bool LocalChange = true; while (LocalChange) { LocalChange = false; @@ -3174,8 +3022,9 @@ bool GlobalOpt::runOnModule(Module &M) { LocalChange |= OptimizeFunctions(M); // Optimize global_ctors list. - if (GlobalCtors) - LocalChange |= OptimizeGlobalCtorsList(GlobalCtors); + LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) { + return EvaluateStaticConstructor(F, DL, TLI); + }); // Optimize non-address-taken globals. LocalChange |= OptimizeGlobalVars(M); diff --git a/lib/Transforms/IPO/IPConstantPropagation.cpp b/lib/Transforms/IPO/IPConstantPropagation.cpp index 8684796..af541d1 100644 --- a/lib/Transforms/IPO/IPConstantPropagation.cpp +++ b/lib/Transforms/IPO/IPConstantPropagation.cpp @@ -15,7 +15,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "ipconstprop" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -27,6 +26,8 @@ #include "llvm/Pass.h" using namespace llvm; +#define DEBUG_TYPE "ipconstprop" + STATISTIC(NumArgumentsProped, "Number of args turned into constants"); STATISTIC(NumReturnValProped, "Number of return values turned into constants"); @@ -112,7 +113,7 @@ bool IPCP::PropagateConstantsIntoArguments(Function &F) { continue; Constant *C = dyn_cast(*AI); - if (C && ArgumentConstants[i].first == 0) { + if (C && ArgumentConstants[i].first == nullptr) { ArgumentConstants[i].first = C; // First constant seen. } else if (C && ArgumentConstants[i].first == C) { // Still the constant value we think it is. @@ -139,7 +140,7 @@ bool IPCP::PropagateConstantsIntoArguments(Function &F) { continue; Value *V = ArgumentConstants[i].first; - if (V == 0) V = UndefValue::get(AI->getType()); + if (!V) V = UndefValue::get(AI->getType()); AI->replaceAllUsesWith(V); ++NumArgumentsProped; MadeChange = true; @@ -209,7 +210,7 @@ bool IPCP::PropagateConstantReturn(Function &F) { } // Different or no known return value? 
Don't propagate this return // value. - RetVals[i] = 0; + RetVals[i] = nullptr; // All values non-constant? Stop looking. if (++NumNonConstant == RetVals.size()) return false; @@ -235,7 +236,7 @@ bool IPCP::PropagateConstantReturn(Function &F) { MadeChange = true; - if (STy == 0) { + if (!STy) { Value* New = RetVals[0]; if (Argument *A = dyn_cast(New)) // Was an argument returned? Then find the corresponding argument in diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp index 6cf3040..624cb90 100644 --- a/lib/Transforms/IPO/InlineAlways.cpp +++ b/lib/Transforms/IPO/InlineAlways.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "inline" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/CallGraph.h" @@ -28,6 +27,8 @@ using namespace llvm; +#define DEBUG_TYPE "inline" + namespace { /// \brief Inliner pass which only handles "always inline" functions. @@ -36,12 +37,13 @@ class AlwaysInliner : public Inliner { public: // Use extremely low threshold. - AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true), ICA(0) { + AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true), + ICA(nullptr) { initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); } AlwaysInliner(bool InsertLifetime) - : Inliner(ID, -2000000000, InsertLifetime), ICA(0) { + : Inliner(ID, -2000000000, InsertLifetime), ICA(nullptr) { initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); } @@ -93,8 +95,7 @@ InlineCost AlwaysInliner::getInlineCost(CallSite CS) { // that are viable for inlining. FIXME: We shouldn't even get here for // declarations. if (Callee && !Callee->isDeclaration() && - Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::AlwaysInline) && + CS.hasFnAttr(Attribute::AlwaysInline) && ICA->isInlineViable(*Callee)) return InlineCost::getAlways(); diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp index 7141064..d189756 100644 --- a/lib/Transforms/IPO/InlineSimple.cpp +++ b/lib/Transforms/IPO/InlineSimple.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "inline" #include "llvm/Transforms/IPO.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" @@ -26,6 +25,8 @@ using namespace llvm; +#define DEBUG_TYPE "inline" + namespace { /// \brief Actual inliner pass implementation. 
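
// Note on the InlineAlways.cpp hunk above: the explicit lookup in the
// callee's attribute list becomes CS.hasFnAttr(Attribute::AlwaysInline).
// hasFnAttr consults the call instruction's own attributes first and then
// falls back to the called function, so an alwaysinline marking on either
// one is honored. A sketch (the function name is illustrative):
#include "llvm/IR/CallSite.h"
using namespace llvm;

static bool isAlwaysInlineSite(CallSite CS) {
  // True whether the attribute sits on the call site or on the callee.
  return CS.hasFnAttr(Attribute::AlwaysInline);
}
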
@@ -37,12 +38,12 @@ class SimpleInliner : public Inliner { InlineCostAnalysis *ICA; public: - SimpleInliner() : Inliner(ID), ICA(0) { + SimpleInliner() : Inliner(ID), ICA(nullptr) { initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); } SimpleInliner(int Threshold) - : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(0) { + : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(nullptr) { initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); } diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index e97fb83..9087ab2 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "inline" #include "llvm/Transforms/IPO/InlinerPass.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -21,6 +20,7 @@ #include "llvm/Analysis/InlineCost.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" @@ -32,6 +32,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "inline" + STATISTIC(NumInlined, "Number of functions inlined"); STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined"); STATISTIC(NumDeleted, "Number of functions deleted because all callers found"); @@ -183,7 +185,7 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, // canonicalized to be an allocation *of* an array), or allocations whose // type is not itself an array (because we're afraid of pessimizing SRoA). ArrayType *ATy = dyn_cast(AI->getAllocatedType()); - if (ATy == 0 || AI->isArrayAllocation()) + if (!ATy || AI->isArrayAllocation()) continue; // Get the list of all available allocas for this array type. @@ -239,7 +241,7 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, AI->eraseFromParent(); MergedAwayAlloca = true; ++NumMergedAllocas; - IFI.StaticAllocas[AllocaNo] = 0; + IFI.StaticAllocas[AllocaNo] = nullptr; break; } @@ -288,12 +290,24 @@ unsigned Inliner::getInlineThreshold(CallSite CS) const { bool ColdCallee = Callee && !Callee->isDeclaration() && Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::Cold); - if (ColdCallee && ColdThreshold < thres) + // Command line argument for InlineLimit will override the default + // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold, + // do not use the default cold threshold even if it is smaller. + if ((InlineLimit.getNumOccurrences() == 0 || + ColdThreshold.getNumOccurrences() > 0) && ColdCallee && + ColdThreshold < thres) thres = ColdThreshold; return thres; } +static void emitAnalysis(CallSite CS, const Twine &Msg) { + Function *Caller = CS.getCaller(); + LLVMContext &Ctx = Caller->getContext(); + DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); + emitOptimizationRemarkAnalysis(Ctx, DEBUG_TYPE, *Caller, DLoc, Msg); +} + /// shouldInline - Return true if the inliner should attempt to inline /// at the given CallSite. 
bool Inliner::shouldInline(CallSite CS) { @@ -302,12 +316,16 @@ bool Inliner::shouldInline(CallSite CS) { if (IC.isAlways()) { DEBUG(dbgs() << " Inlining: cost=always" << ", Call: " << *CS.getInstruction() << "\n"); + emitAnalysis(CS, Twine(CS.getCalledFunction()->getName()) + + " should always be inlined (cost=always)"); return true; } if (IC.isNever()) { DEBUG(dbgs() << " NOT Inlining: cost=never" << ", Call: " << *CS.getInstruction() << "\n"); + emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() + + " should never be inlined (cost=never)")); return false; } @@ -316,6 +334,10 @@ bool Inliner::shouldInline(CallSite CS) { DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost() << ", thres=" << (IC.getCostDelta() + IC.getCost()) << ", Call: " << *CS.getInstruction() << "\n"); + emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() + + " too costly to inline (cost=") + + Twine(IC.getCost()) + ", threshold=" + + Twine(IC.getCostDelta() + IC.getCost()) + ")"); return false; } @@ -383,6 +405,11 @@ bool Inliner::shouldInline(CallSite CS) { DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction() << " Cost = " << IC.getCost() << ", outer Cost = " << TotalSecondaryCost << '\n'); + emitAnalysis( + CS, Twine("Not inlining. Cost of inlining " + + CS.getCalledFunction()->getName() + + " increases the cost of inlining " + + CS.getCaller()->getName() + " in other contexts")); return false; } } @@ -390,6 +417,10 @@ bool Inliner::shouldInline(CallSite CS) { DEBUG(dbgs() << " Inlining: cost=" << IC.getCost() << ", thres=" << (IC.getCostDelta() + IC.getCost()) << ", Call: " << *CS.getInstruction() << '\n'); + emitAnalysis( + CS, CS.getCalledFunction()->getName() + Twine(" can be inlined into ") + + CS.getCaller()->getName() + " with cost=" + Twine(IC.getCost()) + + " (threshold=" + Twine(IC.getCostDelta() + IC.getCost()) + ")"); return true; } @@ -410,7 +441,7 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID, bool Inliner::runOnSCC(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis().getCallGraph(); DataLayoutPass *DLP = getAnalysisIfAvailable(); - const DataLayout *DL = DLP ? &DLP->getDataLayout() : 0; + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; const TargetLibraryInfo *TLI = getAnalysisIfAvailable(); SmallPtrSet SCCFunctions; @@ -499,7 +530,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { ++NumCallsDeleted; } else { // We can only inline direct calls to non-declarations. - if (Callee == 0 || Callee->isDeclaration()) continue; + if (!Callee || Callee->isDeclaration()) continue; // If this call site was obtained by inlining another function, verify // that the include path for the function did not include the callee @@ -511,18 +542,37 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory)) continue; - + LLVMContext &CallerCtx = Caller->getContext(); + + // Get DebugLoc to report. CS will be invalid after Inliner. + DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); + // If the policy determines that we should inline this function, // try to do so. - if (!shouldInline(CS)) + if (!shouldInline(CS)) { + emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc, + Twine(Callee->getName() + + " will not be inlined into " + + Caller->getName())); continue; + } // Attempt to inline the function. 
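The emitOptimizationRemark* calls in these hunks route through the LLVMContext diagnostic machinery, so a front end can observe the inliner's decisions by installing a handler. A minimal sketch against the DiagnosticInfo API of this revision (the handler function itself is hypothetical):

  #include "llvm/IR/DiagnosticInfo.h"
  #include "llvm/IR/DiagnosticPrinter.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  static void printInlineRemarks(const DiagnosticInfo &DI, void *) {
    switch (DI.getKind()) {
    case DK_OptimizationRemark:
    case DK_OptimizationRemarkMissed:
    case DK_OptimizationRemarkAnalysis: {
      DiagnosticPrinterRawOStream DP(errs());
      DI.print(DP); // e.g. "bar inlined into foo"
      errs() << "\n";
      break;
    }
    default:
      break;
    }
  }
  // Installed once per context: Ctx.setDiagnosticHandler(printInlineRemarks);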
if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas, - InlineHistoryID, InsertLifetime, DL)) + InlineHistoryID, InsertLifetime, DL)) { + emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc, + Twine(Callee->getName() + + " will not be inlined into " + + Caller->getName())); continue; + } ++NumInlined; - + + // Report the inline decision. + emitOptimizationRemark( + CallerCtx, DEBUG_TYPE, *Caller, DLoc, + Twine(Callee->getName() + " inlined into " + Caller->getName())); + // If inlining this function gave us any new call sites, throw them // onto our worklist to process. They are useful inline candidates. if (!InlineInfo.InlinedCalls.empty()) { diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp index c1fe01c..c970a1a 100644 --- a/lib/Transforms/IPO/Internalize.cpp +++ b/lib/Transforms/IPO/Internalize.cpp @@ -19,7 +19,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "internalize" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -35,6 +34,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "internalize" + STATISTIC(NumAliases , "Number of aliases internalized"); STATISTIC(NumFunctions, "Number of functions internalized"); STATISTIC(NumGlobals , "Number of global vars internalized"); @@ -131,8 +132,8 @@ static bool shouldInternalize(const GlobalValue &GV, bool InternalizePass::runOnModule(Module &M) { CallGraphWrapperPass *CGPass = getAnalysisIfAvailable(); - CallGraph *CG = CGPass ? &CGPass->getCallGraph() : 0; - CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0; + CallGraph *CG = CGPass ? &CGPass->getCallGraph() : nullptr; + CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : nullptr; bool Changed = false; SmallPtrSet Used; @@ -158,6 +159,7 @@ bool InternalizePass::runOnModule(Module &M) { if (!shouldInternalize(*I, ExternalNames)) continue; + I->setVisibility(GlobalValue::DefaultVisibility); I->setLinkage(GlobalValue::InternalLinkage); if (ExternalNode) @@ -194,6 +196,7 @@ bool InternalizePass::runOnModule(Module &M) { if (!shouldInternalize(*I, ExternalNames)) continue; + I->setVisibility(GlobalValue::DefaultVisibility); I->setLinkage(GlobalValue::InternalLinkage); Changed = true; ++NumGlobals; @@ -206,6 +209,7 @@ bool InternalizePass::runOnModule(Module &M) { if (!shouldInternalize(*I, ExternalNames)) continue; + I->setVisibility(GlobalValue::DefaultVisibility); I->setLinkage(GlobalValue::InternalLinkage); Changed = true; ++NumAliases; diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp index 464aa99..20414aa 100644 --- a/lib/Transforms/IPO/LoopExtractor.cpp +++ b/lib/Transforms/IPO/LoopExtractor.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-extract" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopPass.h" @@ -30,6 +29,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "loop-extract" + STATISTIC(NumExtracted, "Number of loops extracted"); namespace { @@ -136,7 +137,7 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { if (NumLoops == 0) return Changed; --NumLoops; CodeExtractor Extractor(DT, *L); - if (Extractor.extractCodeRegion() != 0) { + if (Extractor.extractCodeRegion() != nullptr) { Changed = true; // After extraction, the loop is replaced by a function call, so // we shouldn't try to run any more 
loop passes on it. @@ -241,7 +242,7 @@ void BlockExtractorPass::SplitLandingPadPreds(Function *F) { if (!Split) continue; SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", 0, NewBBs); + SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", nullptr, NewBBs); } } diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 8555d2c..c3a2b12 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -43,7 +43,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mergefunc" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/FoldingSet.h" @@ -67,6 +66,8 @@ #include <vector> using namespace llvm; +#define DEBUG_TYPE "mergefunc" + STATISTIC(NumFunctionsMerged, "Number of functions merged"); STATISTIC(NumThunksWritten, "Number of thunks generated"); STATISTIC(NumAliasesWritten, "Number of aliases generated"); @@ -120,12 +121,12 @@ public: void release() { assert(Func && "Attempted to release function twice, or release empty/tombstone!"); - Func = NULL; + Func = nullptr; } private: explicit ComparableFunction(unsigned Hash) - : Func(NULL), Hash(Hash), DL(NULL) {} + : Func(nullptr), Hash(Hash), DL(nullptr) {} AssertingVH<Function> Func; unsigned Hash; @@ -175,19 +176,181 @@ private: /// Test whether two basic blocks have equivalent behaviour. bool compare(const BasicBlock *BB1, const BasicBlock *BB2); + /// Constants comparison. + /// It is analogous to a lexicographical comparison between hypothetical + /// numbers of the following format: + /// <bitcastability-trait><raw-bits-comparison-result> + /// + /// 1. Bitcastability. + /// Check whether L's type can be losslessly bitcast to R's type. + /// At this stage, if a lossless bitcast is not possible, the method + /// returns -1 or 1, thus also defining which type is greater in the + /// context of bitcastability. + /// Stage 0: If types are equal in terms of cmpTypes, then we can go straight + /// to the contents comparison. + /// If types differ, remember the types comparison result and check + /// whether we can still bitcast the types. + /// Stage 1: Types that satisfy isFirstClassType are always greater than + /// others. + /// Stage 2: A vector is greater than a non-vector. + /// If both types are vectors, then the vector with the greater bitwidth is + /// greater. + /// If both types are vectors with the same bitwidth, then the types + /// are bitcastable, and we can skip the other stages and go to the contents + /// comparison. + /// Stage 3: Pointer types are greater than non-pointers. If both types are + /// pointers of the same address space, go to the contents comparison. + /// Different address spaces: the pointer with the greater address space is + /// greater. + /// Stage 4: The types are neither vectors nor pointers, and they differ. + /// We don't know how to bitcast them, so we had better not do it, + /// and we return the types comparison result (so it determines the + /// relationship among constants we don't know how to bitcast). + /// + /// For clarity, let's see how the set of constants could look + /// on a single dimension axis: + /// + /// [NFCT], [FCT, "others"], [FCT, pointers], [FCT, vectors] + /// Where: NFCT - Not a FirstClassType + /// FCT - FirstClassType + /// + /// 2. Compare raw contents. + /// This stage ignores types and only compares the bits of L and R. + /// Returns 0 if L and R have equivalent contents, + /// and -1 or 1 if the values are different. + /// Pretty trivial: + /// 2.1. If the contents are numbers, compare the numbers.
+ /// Ints with greater bitwidth are greater. Ints with the same bitwidth + /// are compared by their contents. + /// 2.2. "And so on". To avoid discrepancies with these comments, + /// it is perhaps better to read the implementation itself. + /// 3. And again about the overall picture. Let's look back at how the + /// ordered set of constants will look: + /// [NFCT], [FCT, "others"], [FCT, pointers], [FCT, vectors] + /// + /// Now look at what could be inside [FCT, "others"], for example: + /// [FCT, "others"] = + /// [ + /// [double 0.1], [double 1.23], + /// [i32 1], [i32 2], + /// { double 1.0 }, ; StructTyID, NumElements = 1 + /// { i32 1 }, ; StructTyID, NumElements = 1 + /// { double 1, i32 1 }, ; StructTyID, NumElements = 2 + /// { i32 1, double 1 } ; StructTyID, NumElements = 2 + /// ] + /// + /// Let's explain the order. Floating-point numbers are less than integers + /// simply because of the cmpType terms: FloatTyID < IntegerTyID. + /// Floats (with the same fltSemantics) are sorted according to their value. + /// Then come the integers, which, like the floats, are easily sorted + /// among themselves. + /// Structures are grouped at the tail, again because of their + /// TypeID: StructTyID > IntegerTyID > FloatTyID. + /// Structures with a greater number of elements are greater. Structures + /// whose greater elements come first are greater. + /// The same logic applies to vectors, arrays and other possible complex + /// types. + /// + /// Bitcastable constants. + /// Let's assume that some constant belongs to a group of + /// "so-called-equal" values with different types, and at the same time + /// belongs to another group of constants with equal types + /// and "really" equal values. + /// + /// Now let's prove that this is impossible: + /// + /// If constant A with type TyA is bitcastable to B with type TyB, then: + /// 1. All constants with types equal to TyA are bitcastable to B. Since + /// those should be vectors (if TyA is a vector), pointers + /// (if TyA is a pointer), or else (if TyA equals TyB), those types must + /// be equal to TyB. + /// 2. All constants with non-equal but bitcastable types to TyA are + /// bitcastable to B. + /// Once again, just because we allow this only for vectors and pointers. + /// This statement can be expanded as follows: + /// 2.1. All vectors with the same bitwidth as vector A have the same bitwidth + /// as vector B, and are thus bitcastable to B as well. + /// 2.2. All pointers of the same address space, no matter what they point to, + /// are bitcastable. So if C is a pointer, it can be bitcast to A and to B. + /// So any constant equal or bitcastable to A is equal or bitcastable to B. + /// QED. + /// + /// In other words, for pointers and vectors, we ignore the top-level type and + /// look at their particular properties (bitwidth for vectors, and + /// address space for pointers). + /// If these properties are equal, compare their contents. + int cmpConstants(const Constant *L, const Constant *R); + /// Assign or look up previously assigned numbers for the two values, and /// return whether the numbers are equal. Numbers are assigned in the order /// visited. - bool enumerate(const Value *V1, const Value *V2); + /// Comparison order: + /// Stage 0: A value that is the function itself is always greater than + /// others. If the left and right values are both references to their + /// own functions, then they are equal. + /// Stage 1: Constants are greater than non-constants.
+ /// If both the left and right values are constants, then the result of + /// cmpConstants is used as the cmpValues result. + /// Stage 2: InlineAsm instances are greater than others. If both the left + /// and right values are InlineAsm instances, the InlineAsm* pointers are + /// cast to integers and compared as numbers. + /// Stage 3: In all other cases we compare the order in which we meet these + /// values in their functions. If the right value was met first during + /// scanning, then the left value is greater. + /// In other words, we compare serial numbers; for more details + /// see the comments for sn_mapL and sn_mapR. + int cmpValues(const Value *L, const Value *R); + + bool enumerate(const Value *V1, const Value *V2) { + return cmpValues(V1, V2) == 0; + } /// Compare two Instructions for equivalence, similar to /// Instruction::isSameOperationAs but with modifications to the type /// comparison. + /// Stages are listed in "most significant stage first" order: + /// at each stage below, we compare some left and right operation parts. + /// If the parts are not equal, we assign the parts comparison result to the + /// operation comparison result and exit from the method. + /// Otherwise we proceed to the next stage. + /// Stages: + /// 1. Operation opcodes. Compared as numbers. + /// 2. Number of operands. + /// 3. Operation types. Compared with the cmpType method. + /// 4. Compare operation subclass optional data as a stream of bytes: + /// just convert it to integers and call cmpNumbers. + /// 5. Compare operand types with cmpType, in most-significant-operand-first + /// order. + /// 6. Last stage. Check operations for some specific attributes. + /// For example, for a Load it would be: + /// 6.1. Load: volatile (as a boolean flag) + /// 6.2. Load: alignment (as integer numbers) + /// 6.3. Load: synch-scope (as integer numbers) + /// At this stage it is better to read the code, since it is no more than + /// 10-15 lines per particular instruction, and may change over time. + int cmpOperation(const Instruction *L, const Instruction *R) const; + bool isEquivalentOperation(const Instruction *I1, - const Instruction *I2) const; + const Instruction *I2) const { + return cmpOperation(I1, I2) == 0; + } /// Compare two GEPs for equivalent pointer arithmetic. + /// Parts to be compared at each comparison stage, + /// most significant stage first: + /// 1. Address space. As numbers. + /// 2. Constant offset (if the "DataLayout *DL" field is not null, + /// using the GEPOperator::accumulateConstantOffset method). + /// 3. Pointer operand type (using the cmpType method). + /// 4. Number of operands. + /// 5. Compare operands, using the cmpValues method. + int cmpGEP(const GEPOperator *GEPL, const GEPOperator *GEPR); + int cmpGEP(const GetElementPtrInst *GEPL, const GetElementPtrInst *GEPR) { + return cmpGEP(cast<GEPOperator>(GEPL), cast<GEPOperator>(GEPR)); + } + + bool isEquivalentGEP(const GEPOperator *GEP1, const GEPOperator *GEP2) { + return cmpGEP(GEP1, GEP2) == 0; + } bool isEquivalentGEP(const GetElementPtrInst *GEP1, const GetElementPtrInst *GEP2) { return isEquivalentGEP(cast<GEPOperator>(GEP1), cast<GEPOperator>(GEP2)); @@ -241,13 +404,50 @@ private: int cmpNumbers(uint64_t L, uint64_t R) const; + int cmpAPInt(const APInt &L, const APInt &R) const; + int cmpAPFloat(const APFloat &L, const APFloat &R) const; + int cmpStrings(StringRef L, StringRef R) const; + int cmpAttrs(const AttributeSet L, const AttributeSet R) const; + // The two functions undergoing comparison.
const Function *F1, *F2; const DataLayout *DL; - DenseMap<const Value *, const Value *> id_map; - DenseSet<const Value *> seen_values; + /// Assign serial numbers to values from the left function, and to values + /// from the right function. + /// Explanation: + /// When comparing functions we need to compare the values we meet on the + /// left and right sides. + /// It's easy to sort things out for external values: it just has to be + /// the same value on the left and on the right. + /// But for local values (those introduced inside the function body) + /// we have to ensure they were introduced at exactly the same place, + /// and play the same role. + /// So let's assign a serial number to each value the first time we meet it. + /// Values met at the same place will get the same serial numbers. + /// At this point it is worth explaining a few things about values assigned + /// to BBs and about other possible implementations (see below). + /// + /// 1. Safety of BB reordering. + /// It's safe to change the order of BasicBlocks in a function. + /// The relationship with other functions and the serial numbering will not + /// change in this case. + /// As follows from FunctionComparator::compare(), we do a CFG walk: we start + /// from the entry, and then take each terminator. So it doesn't matter how + /// the BBs are in fact ordered in the function. And since cmpValues is + /// called during this walk, the numbering depends only on how the BBs are + /// located inside the CFG. + /// So the answer is: yes, we will get the same numbering. + /// + /// 2. Impossibility of using the dominance properties of values. + /// If we compare two instruction operands, where the first is a use of local + /// variable AL from function FL, and the second is a use of local variable + /// AR from FR, we could compare their origins and check whether they are + /// defined at the same place. + /// But we are still not able to compare operands of PHI nodes, since those + /// could be operands from further BBs we haven't scanned yet. + /// So it's impossible to use dominance properties in general. + DenseMap<const Value*, int> sn_mapL, sn_mapR; }; } @@ -258,6 +458,206 @@ int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const { return 0; } +int FunctionComparator::cmpAPInt(const APInt &L, const APInt &R) const { + if (int Res = cmpNumbers(L.getBitWidth(), R.getBitWidth())) + return Res; + if (L.ugt(R)) return 1; + if (R.ugt(L)) return -1; + return 0; +} + +int FunctionComparator::cmpAPFloat(const APFloat &L, const APFloat &R) const { + if (int Res = cmpNumbers((uint64_t)&L.getSemantics(), + (uint64_t)&R.getSemantics())) + return Res; + return cmpAPInt(L.bitcastToAPInt(), R.bitcastToAPInt()); +} + +int FunctionComparator::cmpStrings(StringRef L, StringRef R) const { + // Prevent heavy comparison, compare sizes first. + if (int Res = cmpNumbers(L.size(), R.size())) + return Res; + + // Compare strings lexicographically only when it is necessary: only when + // strings are equal in size. + return L.compare(R); +} + +int FunctionComparator::cmpAttrs(const AttributeSet L, + const AttributeSet R) const { + if (int Res = cmpNumbers(L.getNumSlots(), R.getNumSlots())) + return Res; + + for (unsigned i = 0, e = L.getNumSlots(); i != e; ++i) { + AttributeSet::iterator LI = L.begin(i), LE = L.end(i), RI = R.begin(i), + RE = R.end(i); + for (; LI != LE && RI != RE; ++LI, ++RI) { + Attribute LA = *LI; + Attribute RA = *RI; + if (LA < RA) + return -1; + if (RA < LA) + return 1; + } + if (LI != LE) + return 1; + if (RI != RE) + return -1; + } + return 0; +} + +/// Constants comparison: +/// 1.
Check whether the type of the L constant can be losslessly bitcast to the R +/// type. +/// 2. Compare constant contents. +/// For more details see the declaration comments. +int FunctionComparator::cmpConstants(const Constant *L, const Constant *R) { + + Type *TyL = L->getType(); + Type *TyR = R->getType(); + + // Check whether the types are bitcastable. This part is just the re-factored + // Type::canLosslesslyBitCastTo method, but instead of returning true/false, + // we also encode in the result which type is "less" for us. + int TypesRes = cmpType(TyL, TyR); + if (TypesRes != 0) { + // Types are different, but check whether we can bitcast them. + if (!TyL->isFirstClassType()) { + if (TyR->isFirstClassType()) + return -1; + // Neither TyL nor TyR is a value of first class type. Return the result + // of comparing the types. + return TypesRes; + } + if (!TyR->isFirstClassType()) { + if (TyL->isFirstClassType()) + return 1; + return TypesRes; + } + + // Vector -> Vector conversions are always lossless if the two vector types + // have the same size, otherwise not. + unsigned TyLWidth = 0; + unsigned TyRWidth = 0; + + if (const VectorType *VecTyL = dyn_cast<VectorType>(TyL)) + TyLWidth = VecTyL->getBitWidth(); + if (const VectorType *VecTyR = dyn_cast<VectorType>(TyR)) + TyRWidth = VecTyR->getBitWidth(); + + if (TyLWidth != TyRWidth) + return cmpNumbers(TyLWidth, TyRWidth); + + // Zero bit-width means neither TyL nor TyR are vectors. + if (!TyLWidth) { + PointerType *PTyL = dyn_cast<PointerType>(TyL); + PointerType *PTyR = dyn_cast<PointerType>(TyR); + if (PTyL && PTyR) { + unsigned AddrSpaceL = PTyL->getAddressSpace(); + unsigned AddrSpaceR = PTyR->getAddressSpace(); + if (int Res = cmpNumbers(AddrSpaceL, AddrSpaceR)) + return Res; + } + if (PTyL) + return 1; + if (PTyR) + return -1; + + // TyL and TyR aren't vectors or pointers. We don't know how to + // bitcast them. + return TypesRes; + } + } + + // OK, the types are bitcastable, now check the constant contents.
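Every cmp* routine in this file follows the same three-way cascade idiom: compare the most significant part first, and let the first non-zero result decide. A self-contained sketch of the pattern (plain C++, hypothetical struct):

  #include <cstdint>

  static int cmpNumbers(uint64_t L, uint64_t R) {
    return L < R ? -1 : (L > R ? 1 : 0);
  }

  struct Key { uint64_t Kind, Size, Bits; };

  static int cmpKeys(const Key &L, const Key &R) {
    if (int Res = cmpNumbers(L.Kind, R.Kind)) // most significant first
      return Res;
    if (int Res = cmpNumbers(L.Size, R.Size))
      return Res;
    return cmpNumbers(L.Bits, R.Bits);        // least significant last
  }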
+ + if (L->isNullValue() && R->isNullValue()) + return TypesRes; + if (L->isNullValue() && !R->isNullValue()) + return 1; + if (!L->isNullValue() && R->isNullValue()) + return -1; + + if (int Res = cmpNumbers(L->getValueID(), R->getValueID())) + return Res; + + switch (L->getValueID()) { + case Value::UndefValueVal: return TypesRes; + case Value::ConstantIntVal: { + const APInt &LInt = cast<ConstantInt>(L)->getValue(); + const APInt &RInt = cast<ConstantInt>(R)->getValue(); + return cmpAPInt(LInt, RInt); + } + case Value::ConstantFPVal: { + const APFloat &LAPF = cast<ConstantFP>(L)->getValueAPF(); + const APFloat &RAPF = cast<ConstantFP>(R)->getValueAPF(); + return cmpAPFloat(LAPF, RAPF); + } + case Value::ConstantArrayVal: { + const ConstantArray *LA = cast<ConstantArray>(L); + const ConstantArray *RA = cast<ConstantArray>(R); + uint64_t NumElementsL = cast<ArrayType>(TyL)->getNumElements(); + uint64_t NumElementsR = cast<ArrayType>(TyR)->getNumElements(); + if (int Res = cmpNumbers(NumElementsL, NumElementsR)) + return Res; + for (uint64_t i = 0; i < NumElementsL; ++i) { + if (int Res = cmpConstants(cast<Constant>(LA->getOperand(i)), + cast<Constant>(RA->getOperand(i)))) + return Res; + } + return 0; + } + case Value::ConstantStructVal: { + const ConstantStruct *LS = cast<ConstantStruct>(L); + const ConstantStruct *RS = cast<ConstantStruct>(R); + unsigned NumElementsL = cast<StructType>(TyL)->getNumElements(); + unsigned NumElementsR = cast<StructType>(TyR)->getNumElements(); + if (int Res = cmpNumbers(NumElementsL, NumElementsR)) + return Res; + for (unsigned i = 0; i != NumElementsL; ++i) { + if (int Res = cmpConstants(cast<Constant>(LS->getOperand(i)), + cast<Constant>(RS->getOperand(i)))) + return Res; + } + return 0; + } + case Value::ConstantVectorVal: { + const ConstantVector *LV = cast<ConstantVector>(L); + const ConstantVector *RV = cast<ConstantVector>(R); + unsigned NumElementsL = cast<VectorType>(TyL)->getNumElements(); + unsigned NumElementsR = cast<VectorType>(TyR)->getNumElements(); + if (int Res = cmpNumbers(NumElementsL, NumElementsR)) + return Res; + for (uint64_t i = 0; i < NumElementsL; ++i) { + if (int Res = cmpConstants(cast<Constant>(LV->getOperand(i)), + cast<Constant>(RV->getOperand(i)))) + return Res; + } + return 0; + } + case Value::ConstantExprVal: { + const ConstantExpr *LE = cast<ConstantExpr>(L); + const ConstantExpr *RE = cast<ConstantExpr>(R); + unsigned NumOperandsL = LE->getNumOperands(); + unsigned NumOperandsR = RE->getNumOperands(); + if (int Res = cmpNumbers(NumOperandsL, NumOperandsR)) + return Res; + for (unsigned i = 0; i < NumOperandsL; ++i) { + if (int Res = cmpConstants(cast<Constant>(LE->getOperand(i)), + cast<Constant>(RE->getOperand(i)))) + return Res; + } + return 0; + } + case Value::FunctionVal: + case Value::GlobalVariableVal: + case Value::GlobalAliasVal: + default: // Unknown constant, cast L and R pointers to numbers and compare. + return cmpNumbers((uint64_t)L, (uint64_t)R); + } +} + /// cmpType - compares two types, /// defines total ordering among the types set. /// See method declaration comments for more details. @@ -350,143 +750,209 @@ int FunctionComparator::cmpType(Type *TyL, Type *TyR) const { // Determine whether the two operations are the same except that pointer-to-A // and pointer-to-B are equivalent. This should be kept in sync with // Instruction::isSameOperationAs. -bool FunctionComparator::isEquivalentOperation(const Instruction *I1, - const Instruction *I2) const { +// Read method declaration comments for more details. +int FunctionComparator::cmpOperation(const Instruction *L, + const Instruction *R) const { // Differences from Instruction::isSameOperationAs: // * replace type comparison with calls to isEquivalentType.
// * we test for I->hasSameSubclassOptionalData (nuw/nsw/tail) at the top // * because of the above, we don't test for the tail bit on calls later on - if (I1->getOpcode() != I2->getOpcode() || - I1->getNumOperands() != I2->getNumOperands() || - !isEquivalentType(I1->getType(), I2->getType()) || - !I1->hasSameSubclassOptionalData(I2)) - return false; + if (int Res = cmpNumbers(L->getOpcode(), R->getOpcode())) + return Res; + + if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands())) + return Res; + + if (int Res = cmpType(L->getType(), R->getType())) + return Res; + + if (int Res = cmpNumbers(L->getRawSubclassOptionalData(), + R->getRawSubclassOptionalData())) + return Res; // We have two instructions of identical opcode and #operands. Check to see // if all operands are the same type - for (unsigned i = 0, e = I1->getNumOperands(); i != e; ++i) - if (!isEquivalentType(I1->getOperand(i)->getType(), - I2->getOperand(i)->getType())) - return false; + for (unsigned i = 0, e = L->getNumOperands(); i != e; ++i) { + if (int Res = + cmpType(L->getOperand(i)->getType(), R->getOperand(i)->getType())) + return Res; + } // Check special state that is a part of some instructions. - if (const LoadInst *LI = dyn_cast(I1)) - return LI->isVolatile() == cast(I2)->isVolatile() && - LI->getAlignment() == cast(I2)->getAlignment() && - LI->getOrdering() == cast(I2)->getOrdering() && - LI->getSynchScope() == cast(I2)->getSynchScope(); - if (const StoreInst *SI = dyn_cast(I1)) - return SI->isVolatile() == cast(I2)->isVolatile() && - SI->getAlignment() == cast(I2)->getAlignment() && - SI->getOrdering() == cast(I2)->getOrdering() && - SI->getSynchScope() == cast(I2)->getSynchScope(); - if (const CmpInst *CI = dyn_cast(I1)) - return CI->getPredicate() == cast(I2)->getPredicate(); - if (const CallInst *CI = dyn_cast(I1)) - return CI->getCallingConv() == cast(I2)->getCallingConv() && - CI->getAttributes() == cast(I2)->getAttributes(); - if (const InvokeInst *CI = dyn_cast(I1)) - return CI->getCallingConv() == cast(I2)->getCallingConv() && - CI->getAttributes() == cast(I2)->getAttributes(); - if (const InsertValueInst *IVI = dyn_cast(I1)) - return IVI->getIndices() == cast(I2)->getIndices(); - if (const ExtractValueInst *EVI = dyn_cast(I1)) - return EVI->getIndices() == cast(I2)->getIndices(); - if (const FenceInst *FI = dyn_cast(I1)) - return FI->getOrdering() == cast(I2)->getOrdering() && - FI->getSynchScope() == cast(I2)->getSynchScope(); - if (const AtomicCmpXchgInst *CXI = dyn_cast(I1)) - return CXI->isVolatile() == cast(I2)->isVolatile() && - CXI->getSuccessOrdering() == - cast(I2)->getSuccessOrdering() && - CXI->getFailureOrdering() == - cast(I2)->getFailureOrdering() && - CXI->getSynchScope() == cast(I2)->getSynchScope(); - if (const AtomicRMWInst *RMWI = dyn_cast(I1)) - return RMWI->getOperation() == cast(I2)->getOperation() && - RMWI->isVolatile() == cast(I2)->isVolatile() && - RMWI->getOrdering() == cast(I2)->getOrdering() && - RMWI->getSynchScope() == cast(I2)->getSynchScope(); + if (const LoadInst *LI = dyn_cast(L)) { + if (int Res = cmpNumbers(LI->isVolatile(), cast(R)->isVolatile())) + return Res; + if (int Res = + cmpNumbers(LI->getAlignment(), cast(R)->getAlignment())) + return Res; + if (int Res = + cmpNumbers(LI->getOrdering(), cast(R)->getOrdering())) + return Res; + return cmpNumbers(LI->getSynchScope(), cast(R)->getSynchScope()); + } + if (const StoreInst *SI = dyn_cast(L)) { + if (int Res = + cmpNumbers(SI->isVolatile(), cast(R)->isVolatile())) + return Res; + if (int Res = + 
cmpNumbers(SI->getAlignment(), cast(R)->getAlignment())) + return Res; + if (int Res = + cmpNumbers(SI->getOrdering(), cast(R)->getOrdering())) + return Res; + return cmpNumbers(SI->getSynchScope(), cast(R)->getSynchScope()); + } + if (const CmpInst *CI = dyn_cast(L)) + return cmpNumbers(CI->getPredicate(), cast(R)->getPredicate()); + if (const CallInst *CI = dyn_cast(L)) { + if (int Res = cmpNumbers(CI->getCallingConv(), + cast(R)->getCallingConv())) + return Res; + return cmpAttrs(CI->getAttributes(), cast(R)->getAttributes()); + } + if (const InvokeInst *CI = dyn_cast(L)) { + if (int Res = cmpNumbers(CI->getCallingConv(), + cast(R)->getCallingConv())) + return Res; + return cmpAttrs(CI->getAttributes(), cast(R)->getAttributes()); + } + if (const InsertValueInst *IVI = dyn_cast(L)) { + ArrayRef LIndices = IVI->getIndices(); + ArrayRef RIndices = cast(R)->getIndices(); + if (int Res = cmpNumbers(LIndices.size(), RIndices.size())) + return Res; + for (size_t i = 0, e = LIndices.size(); i != e; ++i) { + if (int Res = cmpNumbers(LIndices[i], RIndices[i])) + return Res; + } + } + if (const ExtractValueInst *EVI = dyn_cast(L)) { + ArrayRef LIndices = EVI->getIndices(); + ArrayRef RIndices = cast(R)->getIndices(); + if (int Res = cmpNumbers(LIndices.size(), RIndices.size())) + return Res; + for (size_t i = 0, e = LIndices.size(); i != e; ++i) { + if (int Res = cmpNumbers(LIndices[i], RIndices[i])) + return Res; + } + } + if (const FenceInst *FI = dyn_cast(L)) { + if (int Res = + cmpNumbers(FI->getOrdering(), cast(R)->getOrdering())) + return Res; + return cmpNumbers(FI->getSynchScope(), cast(R)->getSynchScope()); + } - return true; + if (const AtomicCmpXchgInst *CXI = dyn_cast(L)) { + if (int Res = cmpNumbers(CXI->isVolatile(), + cast(R)->isVolatile())) + return Res; + if (int Res = cmpNumbers(CXI->getSuccessOrdering(), + cast(R)->getSuccessOrdering())) + return Res; + if (int Res = cmpNumbers(CXI->getFailureOrdering(), + cast(R)->getFailureOrdering())) + return Res; + return cmpNumbers(CXI->getSynchScope(), + cast(R)->getSynchScope()); + } + if (const AtomicRMWInst *RMWI = dyn_cast(L)) { + if (int Res = cmpNumbers(RMWI->getOperation(), + cast(R)->getOperation())) + return Res; + if (int Res = cmpNumbers(RMWI->isVolatile(), + cast(R)->isVolatile())) + return Res; + if (int Res = cmpNumbers(RMWI->getOrdering(), + cast(R)->getOrdering())) + return Res; + return cmpNumbers(RMWI->getSynchScope(), + cast(R)->getSynchScope()); + } + return 0; } // Determine whether two GEP operations perform the same underlying arithmetic. -bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1, - const GEPOperator *GEP2) { - unsigned AS = GEP1->getPointerAddressSpace(); - if (AS != GEP2->getPointerAddressSpace()) - return false; +// Read method declaration comments for more details. +int FunctionComparator::cmpGEP(const GEPOperator *GEPL, + const GEPOperator *GEPR) { + + unsigned int ASL = GEPL->getPointerAddressSpace(); + unsigned int ASR = GEPR->getPointerAddressSpace(); + if (int Res = cmpNumbers(ASL, ASR)) + return Res; + + // When we have target data, we can reduce the GEP down to the value in bytes + // added to the address. if (DL) { - // When we have target data, we can reduce the GEP down to the value in bytes - // added to the address. - unsigned BitWidth = DL ? 
DL->getPointerSizeInBits(AS) : 1; - APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0); - if (GEP1->accumulateConstantOffset(*DL, Offset1) && - GEP2->accumulateConstantOffset(*DL, Offset2)) { - return Offset1 == Offset2; - } + unsigned BitWidth = DL->getPointerSizeInBits(ASL); + APInt OffsetL(BitWidth, 0), OffsetR(BitWidth, 0); + if (GEPL->accumulateConstantOffset(*DL, OffsetL) && + GEPR->accumulateConstantOffset(*DL, OffsetR)) + return cmpAPInt(OffsetL, OffsetR); } - if (GEP1->getPointerOperand()->getType() != - GEP2->getPointerOperand()->getType()) - return false; + if (int Res = cmpNumbers((uint64_t)GEPL->getPointerOperand()->getType(), + (uint64_t)GEPR->getPointerOperand()->getType())) + return Res; - if (GEP1->getNumOperands() != GEP2->getNumOperands()) - return false; + if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands())) + return Res; - for (unsigned i = 0, e = GEP1->getNumOperands(); i != e; ++i) { - if (!enumerate(GEP1->getOperand(i), GEP2->getOperand(i))) - return false; + for (unsigned i = 0, e = GEPL->getNumOperands(); i != e; ++i) { + if (int Res = cmpValues(GEPL->getOperand(i), GEPR->getOperand(i))) + return Res; } - return true; + return 0; } -// Compare two values used by the two functions under pair-wise comparison. If -// this is the first time the values are seen, they're added to the mapping so -// that we will detect mismatches on next use. -bool FunctionComparator::enumerate(const Value *V1, const Value *V2) { - // Check for function @f1 referring to itself and function @f2 referring to - // itself, or referring to each other, or both referring to either of them. - // They're all equivalent if the two functions are otherwise equivalent. - if (V1 == F1 && V2 == F2) - return true; - if (V1 == F2 && V2 == F1) - return true; +/// Compare two values used by the two functions under pair-wise comparison. If +/// this is the first time the values are seen, they're added to the mapping so +/// that we will detect mismatches on next use. +/// See comments in declaration for more details. +int FunctionComparator::cmpValues(const Value *L, const Value *R) { + // Catch self-reference case. + if (L == F1) { + if (R == F2) + return 0; + return -1; + } + if (R == F2) { + if (L == F1) + return 0; + return 1; + } - if (const Constant *C1 = dyn_cast(V1)) { - if (V1 == V2) return true; - const Constant *C2 = dyn_cast(V2); - if (!C2) return false; - // TODO: constant expressions with GEP or references to F1 or F2. - if (C1->isNullValue() && C2->isNullValue() && - isEquivalentType(C1->getType(), C2->getType())) - return true; - // Try bitcasting C2 to C1's type. If the bitcast is legal and returns C1 - // then they must have equal bit patterns. - return C1->getType()->canLosslesslyBitCastTo(C2->getType()) && - C1 == ConstantExpr::getBitCast(const_cast(C2), C1->getType()); - } - - if (isa(V1) || isa(V2)) - return V1 == V2; - - // Check that V1 maps to V2. If we find a value that V1 maps to then we simply - // check whether it's equal to V2. When there is no mapping then we need to - // ensure that V2 isn't already equivalent to something else. For this - // purpose, we track the V2 values in a set. 
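The cmpValues implementation that follows replaces this id_map/seen_values scheme with the two per-side serial-number maps. The key property is that the map insert assigns the next number only on first sight and returns the already-assigned entry afterwards, so two locals compare equal exactly when each was first met at the same position in its function. A self-contained sketch of the idea (plain C++; the helper name is hypothetical):

  #include <map>

  static int serialNumber(std::map<const void *, int> &SN, const void *V) {
    // First visit: V receives the next number (the current map size).
    // Later visits: the original number is returned unchanged.
    return SN.insert(std::make_pair(V, (int)SN.size())).first->second;
  }
  // cmpValues(L, R) then reduces to comparing
  // serialNumber(sn_mapL, L) against serialNumber(sn_mapR, R).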
- - const Value *&map_elem = id_map[V1]; - if (map_elem) - return map_elem == V2; - if (!seen_values.insert(V2).second) - return false; - map_elem = V2; - return true; -} + const Constant *ConstL = dyn_cast(L); + const Constant *ConstR = dyn_cast(R); + if (ConstL && ConstR) { + if (L == R) + return 0; + return cmpConstants(ConstL, ConstR); + } + + if (ConstL) + return 1; + if (ConstR) + return -1; + + const InlineAsm *InlineAsmL = dyn_cast(L); + const InlineAsm *InlineAsmR = dyn_cast(R); + + if (InlineAsmL && InlineAsmR) + return cmpNumbers((uint64_t)L, (uint64_t)R); + if (InlineAsmL) + return 1; + if (InlineAsmR) + return -1; + + auto LeftSN = sn_mapL.insert(std::make_pair(L, sn_mapL.size())), + RightSN = sn_mapR.insert(std::make_pair(R, sn_mapR.size())); + return cmpNumbers(LeftSN.first->second, RightSN.first->second); +} // Test whether two basic blocks have equivalent behaviour. bool FunctionComparator::compare(const BasicBlock *BB1, const BasicBlock *BB2) { BasicBlock::const_iterator F1I = BB1->begin(), F1E = BB1->end(); @@ -535,6 +1001,9 @@ bool FunctionComparator::compare() { // We need to recheck everything, but check the things that weren't included // in the hash first. + sn_mapL.clear(); + sn_mapR.clear(); + if (F1->getAttributes() != F2->getAttributes()) return false; @@ -683,7 +1152,7 @@ ModulePass *llvm::createMergeFunctionsPass() { bool MergeFunctions::runOnModule(Module &M) { bool Changed = false; DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) @@ -783,8 +1252,23 @@ void MergeFunctions::writeThunkOrAlias(Function *F, Function *G) { // Helper for writeThunk, // Selects proper bitcast operation, // but a bit simpler then CastInst::getCastOpcode. -static Value* createCast(IRBuilder &Builder, Value *V, Type *DestTy) { +static Value *createCast(IRBuilder &Builder, Value *V, Type *DestTy) { Type *SrcTy = V->getType(); + if (SrcTy->isStructTy()) { + assert(DestTy->isStructTy()); + assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements()); + Value *Result = UndefValue::get(DestTy); + for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) { + Value *Element = createCast( + Builder, Builder.CreateExtractValue(V, ArrayRef(I)), + DestTy->getStructElementType(I)); + + Result = + Builder.CreateInsertValue(Result, Element, ArrayRef(I)); + } + return Result; + } + assert(!DestTy->isStructTy()); if (SrcTy->isIntegerTy() && DestTy->isPointerTy()) return Builder.CreateIntToPtr(V, DestTy); else if (SrcTy->isPointerTy() && DestTy->isIntegerTy()) @@ -843,9 +1327,9 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { // Replace G with an alias to F and delete G. 
void MergeFunctions::writeAlias(Function *F, Function *G) { - Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType()); - GlobalAlias *GA = new GlobalAlias(G->getType(), G->getLinkage(), "", - BitcastF, G->getParent()); + PointerType *PTy = G->getType(); + auto *GA = GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(), + G->getLinkage(), "", F); F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); GA->takeName(G); GA->setVisibility(G->getVisibility()); diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index ac88aee..76d6dfa 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "partialinlining" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/CFG.h" @@ -24,6 +23,8 @@ #include "llvm/Transforms/Utils/CodeExtractor.h" using namespace llvm; +#define DEBUG_TYPE "partialinlining" + STATISTIC(NumPartialInlined, "Number of functions partially inlined"); namespace { @@ -52,10 +53,10 @@ Function* PartialInliner::unswitchFunction(Function* F) { BasicBlock* entryBlock = F->begin(); BranchInst *BR = dyn_cast(entryBlock->getTerminator()); if (!BR || BR->isUnconditional()) - return 0; + return nullptr; - BasicBlock* returnBlock = 0; - BasicBlock* nonReturnBlock = 0; + BasicBlock* returnBlock = nullptr; + BasicBlock* nonReturnBlock = nullptr; unsigned returnCount = 0; for (succ_iterator SI = succ_begin(entryBlock), SE = succ_end(entryBlock); SI != SE; ++SI) @@ -66,7 +67,7 @@ Function* PartialInliner::unswitchFunction(Function* F) { nonReturnBlock = *SI; if (returnCount != 1) - return 0; + return nullptr; // Clone the function, so that we can hack away on it. ValueToValueMapTy VMap; diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 4a28b34..38e1b8e 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -56,8 +56,9 @@ RunLoopRerolling("reroll-loops", cl::Hidden, PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; - LibraryInfo = 0; - Inliner = 0; + LibraryInfo = nullptr; + Inliner = nullptr; + DisableTailCalls = false; DisableUnitAtATime = false; DisableUnrollLoops = false; BBVectorize = RunBBVectorization; @@ -128,7 +129,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { if (OptLevel == 0) { if (Inliner) { MPM.add(Inliner); - Inliner = 0; + Inliner = nullptr; } // FIXME: This is a HACK! 
The inliner pass above implicitly creates a CGSCC @@ -156,6 +157,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createDeadArgEliminationPass()); // Dead argument elimination MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE + addExtensionsToPM(EP_Peephole, MPM); MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE } @@ -164,7 +166,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createPruneEHPass()); // Remove dead EH info if (Inliner) { MPM.add(Inliner); - Inliner = 0; + Inliner = nullptr; } if (!DisableUnitAtATime) MPM.add(createFunctionAttrsPass()); // Set readonly/readnone attrs @@ -182,8 +184,10 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Combine silly seq's + addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createTailCallEliminationPass()); // Eliminate tail calls + if (!DisableTailCalls) + MPM.add(createTailCallEliminationPass()); // Eliminate tail calls MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions MPM.add(createLoopRotatePass()); // Rotate Loop @@ -206,6 +210,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { // Run instcombine after redundancy elimination to exploit opportunities // opened up by them. MPM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, MPM); MPM.add(createJumpThreadingPass()); // Thread jumps MPM.add(createCorrelatedValuePropagationPass()); MPM.add(createDeadStoreEliminationPass()); // Delete dead stores @@ -220,6 +225,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { if (BBVectorize) { MPM.add(createBBVectorizePass()); MPM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, MPM); if (OptLevel > 1 && UseGVNAfterVectorization) MPM.add(createGVNPass()); // Remove redundancies else @@ -233,6 +239,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Clean up after everything. + addExtensionsToPM(EP_Peephole, MPM); // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC // pass manager that we are specifically trying to avoid. To prevent this @@ -245,6 +252,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { // as function calls, so that we can only pass them when the vectorizer // changed the code. MPM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, MPM); MPM.add(createCFGSimplificationPass()); if (!DisableUnrollLoops) @@ -297,6 +305,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, // function pointers. When this happens, we often have to resolve varargs // calls, etc, so let instcombine do this. PM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, PM); // Inline small functions if (RunInliner) @@ -315,6 +324,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, // The IPO passes may leave cruft around. Clean up after them. 
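The addExtensionsToPM(EP_Peephole, ...) calls threaded through these hunks give front ends a hook that runs each time the pipeline has just performed an instcombine-style cleanup. A sketch of registering such an extension (the callback and the pass it would add are hypothetical):

  #include "llvm/Transforms/IPO/PassManagerBuilder.h"
  using namespace llvm;

  static void addMyPeephole(const PassManagerBuilder &Builder,
                            PassManagerBase &PM) {
    // PM.add(createMyPeepholePass()); // hypothetical target-specific pass
  }
  // PassManagerBuilder PMB;
  // PMB.addExtension(PassManagerBuilder::EP_Peephole, addMyPeephole);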
PM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass()); // Break up allocas @@ -334,11 +344,17 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, // Nuke dead stores. PM.add(createDeadStoreEliminationPass()); - // More loops are countable try to vectorize them. + // More loops are countable; try to optimize them. + PM.add(createIndVarSimplifyPass()); + PM.add(createLoopDeletionPass()); PM.add(createLoopVectorizePass(true, true)); + // More scalar chains could be vectorized due to more alias information + PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + // Cleanup and simplify the code after the scalar optimizations. PM.add(createInstructionCombiningPass()); + addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass()); diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index c61ec5e..b2c4a09 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "prune-eh" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -30,6 +29,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "prune-eh" + STATISTIC(NumRemoved, "Number of invokes removed"); STATISTIC(NumUnreach, "Number of noreturn calls optimized"); @@ -85,7 +86,7 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) { for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); (!SCCMightUnwind || !SCCMightReturn) && I != E; ++I) { Function *F = (*I)->getFunction(); - if (F == 0) { + if (!F) { SCCMightUnwind = true; SCCMightReturn = true; } else if (F->isDeclaration() || F->mayBeOverridden()) { diff --git a/lib/Transforms/IPO/StripDeadPrototypes.cpp b/lib/Transforms/IPO/StripDeadPrototypes.cpp index 1c6532d..956991a 100644 --- a/lib/Transforms/IPO/StripDeadPrototypes.cpp +++ b/lib/Transforms/IPO/StripDeadPrototypes.cpp @@ -14,13 +14,14 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "strip-dead-prototypes" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" using namespace llvm; +#define DEBUG_TYPE "strip-dead-prototypes" + STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed"); namespace { diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index 6d0be8f..1abbccc 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -192,7 +192,7 @@ static void StripTypeNames(Module &M, bool PreserveDbgInfo) { /// Find values that are marked as llvm.used. 
static void findUsedValues(GlobalVariable *LLVMUsed, SmallPtrSet &UsedValues) { - if (LLVMUsed == 0) return; + if (!LLVMUsed) return; UsedValues.insert(LLVMUsed); ConstantArray *Inits = cast(LLVMUsed->getInitializer()); diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 822e146..e04b1be 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -20,34 +20,38 @@ #include "llvm/Pass.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#define DEBUG_TYPE "instcombine" + namespace llvm { - class CallSite; - class DataLayout; - class TargetLibraryInfo; - class DbgDeclareInst; - class MemIntrinsic; - class MemSetInst; +class CallSite; +class DataLayout; +class TargetLibraryInfo; +class DbgDeclareInst; +class MemIntrinsic; +class MemSetInst; /// SelectPatternFlavor - We can match a variety of different patterns for /// select operations. enum SelectPatternFlavor { SPF_UNKNOWN = 0, - SPF_SMIN, SPF_UMIN, - SPF_SMAX, SPF_UMAX - //SPF_ABS - TODO. + SPF_SMIN, + SPF_UMIN, + SPF_SMAX, + SPF_UMAX + // SPF_ABS - TODO. }; /// getComplexity: Assign a complexity or rank value to LLVM Values... /// 0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst static inline unsigned getComplexity(Value *V) { if (isa(V)) { - if (BinaryOperator::isNeg(V) || - BinaryOperator::isFNeg(V) || + if (BinaryOperator::isNeg(V) || BinaryOperator::isFNeg(V) || BinaryOperator::isNot(V)) return 3; return 4; } - if (isa(V)) return 3; + if (isa(V)) + return 3; return isa(V) ? (isa(V) ? 0 : 1) : 2; } @@ -60,18 +64,18 @@ static inline Constant *SubOne(Constant *C) { return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1)); } - /// InstCombineIRInserter - This is an IRBuilder insertion helper that works /// just like the normal insertion helper, but also adds any new instructions /// to the instcombine worklist. class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter : public IRBuilderDefaultInserter { InstCombineWorklist &Worklist; + public: InstCombineIRInserter(InstCombineWorklist &WL) : Worklist(WL) {} - void InsertHelper(Instruction *I, const Twine &Name, - BasicBlock *BB, BasicBlock::iterator InsertPt) const { + void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB, + BasicBlock::iterator InsertPt) const { IRBuilderDefaultInserter::InsertHelper(I, Name, BB, InsertPt); Worklist.Add(I); } @@ -79,13 +83,14 @@ public: /// InstCombiner - The -instcombine pass. class LLVM_LIBRARY_VISIBILITY InstCombiner - : public FunctionPass, - public InstVisitor { + : public FunctionPass, + public InstVisitor { const DataLayout *DL; TargetLibraryInfo *TLI; bool MadeIRChange; LibCallSimplifier *Simplifier; bool MinimizeSize; + public: /// Worklist - All of the instructions that need to be simplified. 
InstCombineWorklist Worklist; @@ -96,7 +101,7 @@ public: BuilderTy *Builder; static char ID; // Pass identification, replacement for typeid - InstCombiner() : FunctionPass(ID), DL(0), Builder(0) { + InstCombiner() : FunctionPass(ID), DL(nullptr), Builder(nullptr) { MinimizeSize = false; initializeInstCombinerPass(*PassRegistry::getPassRegistry()); } @@ -144,9 +149,9 @@ public: Instruction *visitAnd(BinaryOperator &I); Value *FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS); Value *FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS); - Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, - Value *A, Value *B, Value *C); - Instruction *visitOr (BinaryOperator &I); + Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, Value *A, + Value *B, Value *C); + Instruction *visitOr(BinaryOperator &I); Instruction *visitXor(BinaryOperator &I); Instruction *visitShl(BinaryOperator &I); Instruction *visitAShr(BinaryOperator &I); @@ -156,12 +161,11 @@ public: Constant *RHSC); Instruction *FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI, - ConstantInt *AndCst = 0); + ConstantInt *AndCst = nullptr); Instruction *visitFCmpInst(FCmpInst &I); Instruction *visitICmpInst(ICmpInst &I); Instruction *visitICmpInstWithCastAndCast(ICmpInst &ICI); - Instruction *visitICmpInstWithInstAndIntCst(ICmpInst &ICI, - Instruction *LHS, + Instruction *visitICmpInstWithInstAndIntCst(ICmpInst &ICI, Instruction *LHS, ConstantInt *RHS); Instruction *FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, ConstantInt *DivRHS); @@ -171,7 +175,7 @@ public: ICmpInst::Predicate Pred); Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, Instruction &I); - Instruction *FoldShiftByConstant(Value *Op0, ConstantInt *Op1, + Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1, BinaryOperator &I); Instruction *commonCastTransforms(CastInst &CI); Instruction *commonPointerCastTransforms(CastInst &CI); @@ -188,9 +192,8 @@ public: Instruction *visitIntToPtr(IntToPtrInst &CI); Instruction *visitBitCast(BitCastInst &CI); Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI); - Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI, - Instruction *FI); - Instruction *FoldSelectIntoOp(SelectInst &SI, Value*, Value*); + Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI); + Instruction *FoldSelectIntoOp(SelectInst &SI, Value *, Value *); Instruction *FoldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1, Value *A, Value *B, Instruction &Outer, SelectPatternFlavor SPF2, Value *C); @@ -209,6 +212,7 @@ public: Instruction *visitStoreInst(StoreInst &SI); Instruction *visitBranchInst(BranchInst &BI); Instruction *visitSwitchInst(SwitchInst &SI); + Instruction *visitInsertValueInst(InsertValueInst &IV); Instruction *visitInsertElementInst(InsertElementInst &IE); Instruction *visitExtractElementInst(ExtractElementInst &EI); Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI); @@ -216,21 +220,21 @@ public: Instruction *visitLandingPadInst(LandingPadInst &LI); // visitInstruction - Specify what to return for unhandled instructions... 
- Instruction *visitInstruction(Instruction &I) { return 0; } + Instruction *visitInstruction(Instruction &I) { return nullptr; } private: bool ShouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; - Value *dyn_castFNegVal(Value *V, bool NoSignedZero=false) const; + Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const; Type *FindElementAtOffset(Type *PtrTy, int64_t Offset, - SmallVectorImpl &NewIndices); + SmallVectorImpl &NewIndices); Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI); /// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually /// results in any code being generated and is interesting to optimize out. If /// the cast can be eliminated by some other simple transformation, we prefer /// to do the simplification first. - bool ShouldOptimizeCast(Instruction::CastOps opcode,const Value *V, + bool ShouldOptimizeCast(Instruction::CastOps opcode, const Value *V, Type *Ty); Instruction *visitCallSite(CallSite CS); @@ -251,10 +255,10 @@ public: // in the program. Add the new instruction to the worklist. // Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) { - assert(New && New->getParent() == 0 && + assert(New && !New->getParent() && "New instruction already inserted into a basic block!"); BasicBlock *BB = Old.getParent(); - BB->getInstList().insert(&Old, New); // Insert inst + BB->getInstList().insert(&Old, New); // Insert inst Worklist.Add(New); return New; } @@ -274,7 +278,7 @@ public: // modified. // Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) { - Worklist.AddUsersToWorkList(I); // Add all modified instrs to worklist. + Worklist.AddUsersToWorkList(I); // Add all modified instrs to worklist. // If we are replacing the instruction with itself, this must be in a // segment of unreachable code, so just clobber the instruction. @@ -306,12 +310,12 @@ public: Worklist.Remove(&I); I.eraseFromParent(); MadeIRChange = true; - return 0; // Don't do anything with FI + return nullptr; // Don't do anything with FI } - void ComputeMaskedBits(Value *V, APInt &KnownZero, - APInt &KnownOne, unsigned Depth = 0) const { - return llvm::ComputeMaskedBits(V, KnownZero, KnownOne, DL, Depth); + void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne, + unsigned Depth = 0) const { + return llvm::computeKnownBits(V, KnownZero, KnownOne, DL, Depth); } bool MaskedValueIsZero(Value *V, const APInt &Mask, @@ -323,7 +327,6 @@ public: } private: - /// SimplifyAssociativeOrCommutative - This performs a few simplifications for /// operators which are associative or commutative. bool SimplifyAssociativeOrCommutative(BinaryOperator &I); @@ -337,12 +340,10 @@ private: /// SimplifyDemandedUseBits - Attempts to replace V with a simpler value /// based on the demanded bits. - Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, - APInt& KnownZero, APInt& KnownOne, - unsigned Depth); - bool SimplifyDemandedBits(Use &U, APInt DemandedMask, - APInt& KnownZero, APInt& KnownOne, - unsigned Depth=0); + Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero, + APInt &KnownOne, unsigned Depth); + bool SimplifyDemandedBits(Use &U, APInt DemandedMask, APInt &KnownZero, + APInt &KnownOne, unsigned Depth = 0); /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded /// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence. 
Value *SimplifyShrShlDemandedBits(Instruction *Lsr, Instruction *Sftl, @@ -355,7 +356,9 @@ private: bool SimplifyDemandedInstructionBits(Instruction &Inst); Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, - APInt& UndefElts, unsigned Depth = 0); + APInt &UndefElts, unsigned Depth = 0); + + Value *SimplifyVectorOp(BinaryOperator &Inst); // FoldOpIntoPhi - Given a binary operator, cast instruction, or select // which has a PHI node as operand #0, see if we can fold the instruction @@ -372,21 +375,19 @@ private: Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN); Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN); - Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS, ConstantInt *AndRHS, BinaryOperator &TheAnd); Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask, bool isSub, Instruction &I); - Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, - bool isSigned, bool Inside); + Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, bool isSigned, + bool Inside); Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI); Instruction *MatchBSwap(BinaryOperator &I); bool SimplifyStoreAtEndOfBlock(StoreInst &SI); Instruction *SimplifyMemTransfer(MemIntrinsic *MI); Instruction *SimplifyMemSet(MemSetInst *MI); - Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned); /// Descale - Return a value X such that Val = X * Scale, or null if none. If @@ -394,8 +395,8 @@ private: Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap); }; - - } // end namespace llvm. +#undef DEBUG_TYPE + #endif diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 97910c7..c37a9cf 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -20,6 +20,8 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + namespace { /// Class representing coefficient of floating-point addend. 
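The hunks above move "#define DEBUG_TYPE" out of the shared header (with an #undef at its end) and into each .cpp after the includes, so that -debug-only=instcombine filters apply per translation unit. A hypothetical stand-in for llvm/Support/Debug.h, only to show the shape of the convention:

    #include <cstdio>

    #define DEBUG_TYPE "instcombine"
    // Illustrative macro, not LLVM's: tags debug output with DEBUG_TYPE.
    #define DEBUG(X) do { std::printf("[" DEBUG_TYPE "] "); X; } while (0)

    int main() {
      DEBUG(std::printf("folded an add\n"));  // "[instcombine] folded an add"
      return 0;
    }
    #undef DEBUG_TYPE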
@@ -112,12 +114,12 @@ namespace { /// class FAddend { public: - FAddend() { Val = 0; } + FAddend() { Val = nullptr; } Value *getSymVal (void) const { return Val; } const FAddendCoef &getCoef(void) const { return Coeff; } - bool isConstant() const { return Val == 0; } + bool isConstant() const { return Val == nullptr; } bool isZero() const { return Coeff.isZero(); } void set(short Coefficient, Value *V) { Coeff.set(Coefficient), Val = V; } @@ -154,7 +156,7 @@ namespace { /// class FAddCombine { public: - FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(0) {} + FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(nullptr) {} Value *simplify(Instruction *FAdd); private: @@ -348,8 +350,8 @@ Value *FAddendCoef::getValue(Type *Ty) const { // unsigned FAddend::drillValueDownOneStep (Value *Val, FAddend &Addend0, FAddend &Addend1) { - Instruction *I = 0; - if (Val == 0 || !(I = dyn_cast(Val))) + Instruction *I = nullptr; + if (!Val || !(I = dyn_cast(Val))) return 0; unsigned Opcode = I->getOpcode(); @@ -359,16 +361,16 @@ unsigned FAddend::drillValueDownOneStep Value *Opnd0 = I->getOperand(0); Value *Opnd1 = I->getOperand(1); if ((C0 = dyn_cast(Opnd0)) && C0->isZero()) - Opnd0 = 0; + Opnd0 = nullptr; if ((C1 = dyn_cast(Opnd1)) && C1->isZero()) - Opnd1 = 0; + Opnd1 = nullptr; if (Opnd0) { if (!C0) Addend0.set(1, Opnd0); else - Addend0.set(C0, 0); + Addend0.set(C0, nullptr); } if (Opnd1) { @@ -376,7 +378,7 @@ unsigned FAddend::drillValueDownOneStep if (!C1) Addend.set(1, Opnd1); else - Addend.set(C1, 0); + Addend.set(C1, nullptr); if (Opcode == Instruction::FSub) Addend.negate(); } @@ -385,7 +387,7 @@ unsigned FAddend::drillValueDownOneStep return Opnd0 && Opnd1 ? 2 : 1; // Both operands are zero. Weird! - Addend0.set(APFloat(C0->getValueAPF().getSemantics()), 0); + Addend0.set(APFloat(C0->getValueAPF().getSemantics()), nullptr); return 1; } @@ -443,13 +445,13 @@ Value *FAddCombine::performFactorization(Instruction *I) { Instruction *I1 = dyn_cast(I->getOperand(1)); if (!I0 || !I1 || I0->getOpcode() != I1->getOpcode()) - return 0; + return nullptr; bool isMpy = false; if (I0->getOpcode() == Instruction::FMul) isMpy = true; else if (I0->getOpcode() != Instruction::FDiv) - return 0; + return nullptr; Value *Opnd0_0 = I0->getOperand(0); Value *Opnd0_1 = I0->getOperand(1); @@ -461,8 +463,8 @@ Value *FAddCombine::performFactorization(Instruction *I) { // (x*y) +/- (x*z) x y z // (y/x) +/- (z/x) x y z // - Value *Factor = 0; - Value *AddSub0 = 0, *AddSub1 = 0; + Value *Factor = nullptr; + Value *AddSub0 = nullptr, *AddSub1 = nullptr; if (isMpy) { if (Opnd0_0 == Opnd1_0 || Opnd0_0 == Opnd1_1) @@ -481,7 +483,7 @@ Value *FAddCombine::performFactorization(Instruction *I) { } if (!Factor) - return 0; + return nullptr; FastMathFlags Flags; Flags.setUnsafeAlgebra(); @@ -495,7 +497,7 @@ Value *FAddCombine::performFactorization(Instruction *I) { if (ConstantFP *CFP = dyn_cast(NewAddSub)) { const APFloat &F = CFP->getValueAPF(); if (!F.isNormal()) - return 0; + return nullptr; } else if (Instruction *II = dyn_cast(NewAddSub)) II->setFastMathFlags(Flags); @@ -517,7 +519,7 @@ Value *FAddCombine::simplify(Instruction *I) { // Currently we are not able to handle vector type. if (I->getType()->isVectorTy()) - return 0; + return nullptr; assert((I->getOpcode() == Instruction::FAdd || I->getOpcode() == Instruction::FSub) && "Expect add/sub"); @@ -568,7 +570,7 @@ Value *FAddCombine::simplify(Instruction *I) { // been optimized into "I = Y - X" in the previous steps. 
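performFactorization, whose failure paths above now return nullptr, rewrites (x*y) +/- (x*z) as x*(y +/- z), and the analogous fdiv form, under the unsafe-algebra flags it sets. A sketch of the identities with operands chosen so both sides are exact in double:

    #include <cassert>

    int main() {
      double x = 4.0, y = 0.5, z = 0.25;     // powers of two keep this exact
      assert(x * y + x * z == x * (y + z));  // (x*y) + (x*z) -> x * (y+z)
      assert(y / x + z / x == (y + z) / x);  // (y/x) + (z/x) -> (y+z) / x
      return 0;
    }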
// const FAddendCoef &CE = Opnd0.getCoef(); - return CE.isOne() ? Opnd0.getSymVal() : 0; + return CE.isOne() ? Opnd0.getSymVal() : nullptr; } // step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1] @@ -614,7 +616,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { // constant close to supper-expr(s) will potentially reveal some optimization // opportunities in super-expr(s). // - const FAddend *ConstAdd = 0; + const FAddend *ConstAdd = nullptr; // Simplified addends are placed . AddendVect SimpVect; @@ -647,7 +649,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { if (T && T->getSymVal() == Val) { // Set null such that next iteration of the outer loop will not process // this addend again. - Addends[SameSymIdx] = 0; + Addends[SameSymIdx] = nullptr; SimpVect.push_back(T); } } @@ -661,7 +663,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { // Pop all addends being folded and push the resulting folded addend. SimpVect.resize(StartIdx); - if (Val != 0) { + if (Val) { if (!R.isZero()) { SimpVect.push_back(&R); } @@ -698,7 +700,7 @@ Value *FAddCombine::createNaryFAdd // unsigned InstrNeeded = calcInstrNumber(Opnds); if (InstrNeeded > InstrQuota) - return 0; + return nullptr; initCreateInstNum(); @@ -710,7 +712,7 @@ Value *FAddCombine::createNaryFAdd // N-ary addition has at most two instructions, and we don't need to worry // about tree-height when constructing the N-ary addition. - Value *LastVal = 0; + Value *LastVal = nullptr; bool LastValNeedNeg = false; // Iterate the addends, creating fadd/fsub using adjacent two addends. @@ -870,10 +872,10 @@ Value *FAddCombine::createAddendVal // static inline Value *dyn_castFoldableMul(Value *V, Constant *&CST) { if (!V->hasOneUse() || !V->getType()->isIntOrIntVectorTy()) - return 0; + return nullptr; Instruction *I = dyn_cast(V); - if (I == 0) return 0; + if (!I) return nullptr; if (I->getOpcode() == Instruction::Mul) if ((CST = dyn_cast(I->getOperand(1)))) @@ -884,7 +886,7 @@ static inline Value *dyn_castFoldableMul(Value *V, Constant *&CST) { CST = ConstantExpr::getShl(ConstantInt::get(V->getType(), 1), CST); return I->getOperand(0); } - return 0; + return nullptr; } @@ -918,6 +920,9 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), DL)) return ReplaceInstUsesWith(I, V); @@ -942,7 +947,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (ZI->getSrcTy()->isIntegerTy(1)) return SelectInst::Create(ZI->getOperand(0), AddOne(CI), CI); - Value *XorLHS = 0; ConstantInt *XorRHS = 0; + Value *XorLHS = nullptr; ConstantInt *XorRHS = nullptr; if (match(LHS, m_Xor(m_Value(XorLHS), m_ConstantInt(XorRHS)))) { uint32_t TySizeBits = I.getType()->getScalarSizeInBits(); const APInt &RHSVal = CI->getValue(); @@ -974,7 +979,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { IntegerType *IT = cast(I.getType()); APInt LHSKnownOne(IT->getBitWidth(), 0); APInt LHSKnownZero(IT->getBitWidth(), 0); - ComputeMaskedBits(XorLHS, LHSKnownZero, LHSKnownOne); + computeKnownBits(XorLHS, LHSKnownZero, LHSKnownOne); if ((XorRHS->getValue() | LHSKnownZero).isAllOnesValue()) return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI), XorLHS); @@ -1042,11 +1047,11 @@ Instruction 
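dyn_castFoldableMul above treats "shl x, C" as a multiply: the constant it hands back for the shift case is 1 shifted left by C. A scalar check of that equivalence, with ranges kept small so nothing overflows:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x = 0; x < 256; ++x)
        for (uint32_t c = 0; c < 8; ++c)
          assert((x << c) == x * (1u << c));  // shl == mul by power of two
      return 0;
    }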
*InstCombiner::visitAdd(BinaryOperator &I) { if (IntegerType *IT = dyn_cast(I.getType())) { APInt LHSKnownOne(IT->getBitWidth(), 0); APInt LHSKnownZero(IT->getBitWidth(), 0); - ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne); + computeKnownBits(LHS, LHSKnownZero, LHSKnownOne); if (LHSKnownZero != 0) { APInt RHSKnownOne(IT->getBitWidth(), 0); APInt RHSKnownZero(IT->getBitWidth(), 0); - ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); + computeKnownBits(RHS, RHSKnownZero, RHSKnownOne); // No bits in common -> bitwise or. if ((LHSKnownZero|RHSKnownZero).isAllOnesValue()) @@ -1174,7 +1179,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // Check for (x & y) + (x ^ y) { - Value *A = 0, *B = 0; + Value *A = nullptr, *B = nullptr; if (match(RHS, m_Xor(m_Value(A), m_Value(B))) && (match(LHS, m_And(m_Specific(A), m_Specific(B))) || match(LHS, m_And(m_Specific(B), m_Specific(A))))) @@ -1186,13 +1191,16 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { return BinaryOperator::CreateOr(A, B); } - return Changed ? &I : 0; + return Changed ? &I : nullptr; } Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), DL)) return ReplaceInstUsesWith(I, V); @@ -1266,7 +1274,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { if (match(LHS, m_Select(m_Value(C1), m_Value(A1), m_Value(B1))) && match(RHS, m_Select(m_Value(C2), m_Value(A2), m_Value(B2)))) { if (C1 == C2) { - Constant *Z1=0, *Z2=0; + Constant *Z1=nullptr, *Z2=nullptr; Value *A, *B, *C=C1; if (match(A1, m_AnyZero()) && match(B2, m_AnyZero())) { Z1 = dyn_cast(A1); A = A2; @@ -1290,7 +1298,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { return ReplaceInstUsesWith(I, V); } - return Changed ? &I : 0; + return Changed ? &I : nullptr; } @@ -1305,7 +1313,7 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize // this. bool Swapped = false; - GEPOperator *GEP1 = 0, *GEP2 = 0; + GEPOperator *GEP1 = nullptr, *GEP2 = nullptr; // For now we require one side to be the base pointer "A" or a constant // GEP derived from it. @@ -1343,9 +1351,9 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, // Avoid duplicating the arithmetic if GEP2 has non-constant indices and // multiple users. - if (GEP1 == 0 || - (GEP2 != 0 && !GEP2->hasAllConstantIndices() && !GEP2->hasOneUse())) - return 0; + if (!GEP1 || + (GEP2 && !GEP2->hasAllConstantIndices() && !GEP2->hasOneUse())) + return nullptr; // Emit the offset of the GEP and an intptr_t. 
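Both visitAdd folds above rest on carry-free addition: operands with no common set bits add like OR, and (x & y) is always disjoint from (x ^ y), so their sum is exactly x | y. Exhaustive 8-bit check:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x = 0; x < 256; ++x)
        for (uint32_t y = 0; y < 256; ++y) {
          assert(((x & y) & (x ^ y)) == 0);        // disjoint bits
          assert(((x & y) + (x ^ y)) == (x | y));  // so add == or
        }
      return 0;
    }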
Value *Result = EmitGEPOffset(GEP1); @@ -1368,6 +1376,9 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, Instruction *InstCombiner::visitSub(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifySubInst(Op0, Op1, I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), DL)) return ReplaceInstUsesWith(I, V); @@ -1393,7 +1404,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { if (Constant *C = dyn_cast(Op0)) { // C - ~X == X + (1+C) - Value *X = 0; + Value *X = nullptr; if (match(Op1, m_Not(m_Value(X)))) return BinaryOperator::CreateAdd(X, AddOne(C)); @@ -1451,9 +1462,9 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { } if (Op1->hasOneUse()) { - Value *X = 0, *Y = 0, *Z = 0; - Constant *C = 0; - Constant *CI = 0; + Value *X = nullptr, *Y = nullptr, *Z = nullptr; + Constant *C = nullptr; + Constant *CI = nullptr; // (X - (Y - Z)) --> (X + (Z - Y)). if (match(Op1, m_Sub(m_Value(Y), m_Value(Z)))) @@ -1532,12 +1543,15 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return ReplaceInstUsesWith(I, Res); } - return 0; + return nullptr; } Instruction *InstCombiner::visitFSub(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), DL)) return ReplaceInstUsesWith(I, V); @@ -1574,5 +1588,5 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) { return ReplaceInstUsesWith(I, V); } - return 0; + return nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 2c1bfc7..4f5d65a 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -20,6 +20,8 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + /// isFreeToInvert - Return true if the specified value is free to invert (apply /// ~ to). This happens in cases where the ~ can be eliminated. static inline bool isFreeToInvert(Value *V) { @@ -50,7 +52,7 @@ static inline Value *dyn_castNotVal(Value *V) { // Constants can be considered to be not'ed values... if (ConstantInt *C = dyn_cast(V)) return ConstantInt::get(C->getType(), ~C->getValue()); - return 0; + return nullptr; } /// getFCmpCode - Similar to getICmpCode but for FCmpInst. This encodes a fcmp @@ -123,7 +125,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, ConstantInt *AndRHS, BinaryOperator &TheAnd) { Value *X = Op->getOperand(0); - Constant *Together = 0; + Constant *Together = nullptr; if (!Op->isShift()) Together = ConstantExpr::getAnd(AndRHS, OpRHS); @@ -250,7 +252,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, } break; } - return 0; + return nullptr; } /// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise @@ -332,12 +334,12 @@ Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS, Instruction &I) { Instruction *LHSI = dyn_cast(LHS); if (!LHSI || LHSI->getNumOperands() != 2 || - !isa(LHSI->getOperand(1))) return 0; + !isa(LHSI->getOperand(1))) return nullptr; ConstantInt *N = cast(LHSI->getOperand(1)); switch (LHSI->getOpcode()) { - default: return 0; + default: return nullptr; case Instruction::And: if (ConstantExpr::getAnd(N, Mask) == Mask) { // If the AndRHS is a power of two minus one (0+1+), this is simple. 
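The "C - ~X == X + (1+C)" rewrite in visitSub above is pure two's complement: ~x is -x - 1, so C - ~x is C + x + 1. A quick scalar check (the constant 0x2A is illustrative):

    #include <cassert>

    int main() {
      const int C = 0x2A;
      for (int x = 0; x < 256; ++x)
        assert(C - ~x == x + (1 + C));  // since ~x == -x - 1
      return 0;
    }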
@@ -357,7 +359,7 @@ Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS, break; } } - return 0; + return nullptr; case Instruction::Or: case Instruction::Xor: // If the AndRHS is a power of two minus one (0+1+), and N&Mask == 0 @@ -365,7 +367,7 @@ Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS, Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth() && ConstantExpr::getAnd(N, Mask)->isNullValue()) break; - return 0; + return nullptr; } if (isSub) @@ -418,12 +420,12 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, ConstantInt *BCst = dyn_cast(B); ConstantInt *CCst = dyn_cast(C); bool icmp_eq = (SCC == ICmpInst::ICMP_EQ); - bool icmp_abit = (ACst != 0 && !ACst->isZero() && + bool icmp_abit = (ACst && !ACst->isZero() && ACst->getValue().isPowerOf2()); - bool icmp_bbit = (BCst != 0 && !BCst->isZero() && + bool icmp_bbit = (BCst && !BCst->isZero() && BCst->getValue().isPowerOf2()); unsigned result = 0; - if (CCst != 0 && CCst->isZero()) { + if (CCst && CCst->isZero()) { // if C is zero, then both A and B qualify as mask result |= (icmp_eq ? (FoldMskICmp_Mask_AllZeroes | FoldMskICmp_Mask_AllZeroes | @@ -455,7 +457,7 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, FoldMskICmp_AMask_NotMixed) : (FoldMskICmp_Mask_AllZeroes | FoldMskICmp_AMask_Mixed)); - } else if (ACst != 0 && CCst != 0 && + } else if (ACst && CCst && ConstantExpr::getAnd(ACst, CCst) == CCst) { result |= (icmp_eq ? FoldMskICmp_AMask_Mixed : FoldMskICmp_AMask_NotMixed); @@ -470,7 +472,7 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, FoldMskICmp_BMask_NotMixed) : (FoldMskICmp_Mask_AllZeroes | FoldMskICmp_BMask_Mixed)); - } else if (BCst != 0 && CCst != 0 && + } else if (BCst && CCst && ConstantExpr::getAnd(BCst, CCst) == CCst) { result |= (icmp_eq ? FoldMskICmp_BMask_Mixed : FoldMskICmp_BMask_NotMixed); @@ -570,12 +572,12 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, Value *L11,*L12,*L21,*L22; // Check whether the icmp can be decomposed into a bit test. if (decomposeBitTestICmp(LHS, LHSCC, L11, L12, L2)) { - L21 = L22 = L1 = 0; + L21 = L22 = L1 = nullptr; } else { // Look for ANDs in the LHS icmp. if (!L1->getType()->isIntegerTy()) { // You can icmp pointers, for example. They really aren't masks. - L11 = L12 = 0; + L11 = L12 = nullptr; } else if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) { // Any icmp can be viewed as being trivially masked; if it allows us to // remove one, it's worth it. @@ -585,7 +587,7 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, if (!L2->getType()->isIntegerTy()) { // You can icmp pointers, for example. They really aren't masks. 
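getTypeOfMaskedICmp, patched above, only classifies a constant as a single-bit mask when it is a nonzero power of two (the isPowerOf2 calls). The standard scalar form of that test, for reference:

    #include <cassert>
    #include <cstdint>

    // Nonzero, and no bit survives clearing the lowest set bit.
    bool isPowerOf2(uint32_t x) { return x != 0 && (x & (x - 1)) == 0; }

    int main() {
      assert(isPowerOf2(1) && isPowerOf2(64));
      assert(!isPowerOf2(0) && !isPowerOf2(12));
      return 0;
    }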
- L21 = L22 = 0; + L21 = L22 = nullptr; } else if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) { L21 = L2; L22 = Constant::getAllOnesValue(L2->getType()); @@ -608,7 +610,7 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, } else { return 0; } - E = R2; R1 = 0; ok = true; + E = R2; R1 = nullptr; ok = true; } else if (R1->getType()->isIntegerTy()) { if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) { // As before, model no mask as a trivial mask if it'll let us do an @@ -665,11 +667,11 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, /// into a single (icmp(A & X) ==/!= Y) static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, llvm::InstCombiner::BuilderTy* Builder) { - Value *A = 0, *B = 0, *C = 0, *D = 0, *E = 0; + Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); unsigned mask = foldLogOpOfMaskedICmpsHelper(A, B, C, D, E, LHS, RHS, LHSCC, RHSCC); - if (mask == 0) return 0; + if (mask == 0) return nullptr; assert(ICmpInst::isEquality(LHSCC) && ICmpInst::isEquality(RHSCC) && "foldLogOpOfMaskedICmpsHelper must return an equality predicate."); @@ -722,9 +724,9 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, // their actual values. This isn't strictly, necessary, just a "handle the // easy cases for now" decision. ConstantInt *BCst = dyn_cast(B); - if (BCst == 0) return 0; + if (!BCst) return nullptr; ConstantInt *DCst = dyn_cast(D); - if (DCst == 0) return 0; + if (!DCst) return nullptr; if (mask & (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_BMask_NotAllOnes)) { // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and @@ -763,11 +765,11 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, // (icmp ne (A & B), B) & (icmp eq (A & D), D) // with B and D, having a single bit set ConstantInt *CCst = dyn_cast(C); - if (CCst == 0) return 0; + if (!CCst) return nullptr; if (LHSCC != NEWCC) CCst = dyn_cast( ConstantExpr::getXor(BCst, CCst) ); ConstantInt *ECst = dyn_cast(E); - if (ECst == 0) return 0; + if (!ECst) return nullptr; if (RHSCC != NEWCC) ECst = dyn_cast( ConstantExpr::getXor(DCst, ECst) ); ConstantInt* MCst = dyn_cast( @@ -776,13 +778,13 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, // if there is a conflict we should actually return a false for the // whole construct if (!MCst->isZero()) - return 0; + return nullptr; Value *newOr1 = Builder->CreateOr(B, D); Value *newOr2 = ConstantExpr::getOr(CCst, ECst); Value *newAnd = Builder->CreateAnd(A, newOr1); return Builder->CreateICmp(NEWCC, newAnd, newOr2); } - return 0; + return nullptr; } /// FoldAndOfICmps - Fold (icmp)&(icmp) if possible. 
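foldLogOpOfMaskedICmps, whose nullptr early-outs appear above, merges two masked equality tests into one: (a & B) == C && (a & D) == E becomes (a & (B|D)) == (C|E) once the conflict check passes. A scalar model for one compatible choice (disjoint masks here; the real code also guards overlapping ones):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t B = 0x0F, C = 0x05, D = 0xF0, E = 0x30;
      for (uint32_t a = 0; a < 256; ++a) {
        bool separate = ((a & B) == C) && ((a & D) == E);
        bool merged   = (a & (B | D)) == (C | E);
        assert(separate == merged);
      }
      return 0;
    }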
@@ -811,7 +813,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0); ConstantInt *LHSCst = dyn_cast(LHS->getOperand(1)); ConstantInt *RHSCst = dyn_cast(RHS->getOperand(1)); - if (LHSCst == 0 || RHSCst == 0) return 0; + if (!LHSCst || !RHSCst) return nullptr; if (LHSCst == RHSCst && LHSCC == RHSCC) { // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C) @@ -835,7 +837,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { if (LHSCC == ICmpInst::ICMP_EQ && LHSCC == RHSCC && LHS->hasOneUse() && RHS->hasOneUse()) { Value *V; - ConstantInt *AndCst, *SmallCst = 0, *BigCst = 0; + ConstantInt *AndCst, *SmallCst = nullptr, *BigCst = nullptr; // (trunc x) == C1 & (and x, CA) == C2 // (and x, CA) == C2 & (trunc x) == C1 @@ -866,14 +868,14 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // From here on, we only handle: // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler. - if (Val != Val2) return 0; + if (Val != Val2) return nullptr; // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere. if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE || RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE || LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE || RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE) - return 0; + return nullptr; // Make a constant range that's the intersection of the two icmp ranges. // If the intersection is empty, we know that the result is false. @@ -887,7 +889,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // We can't fold (ugt x, C) & (sgt x, C2). if (!PredicatesFoldable(LHSCC, RHSCC)) - return 0; + return nullptr; // Ensure that the larger constant is on the RHS. bool ShouldSwap; @@ -1016,7 +1018,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { break; } - return 0; + return nullptr; } /// FoldAndOfFCmps - Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of @@ -1026,7 +1028,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (LHS->getPredicate() == FCmpInst::FCMP_ORD && RHS->getPredicate() == FCmpInst::FCMP_ORD) { if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType()) - return 0; + return nullptr; // (fcmp ord x, c) & (fcmp ord y, c) -> (fcmp ord x, y) if (ConstantFP *LHSC = dyn_cast(LHS->getOperand(1))) @@ -1043,7 +1045,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (isa(LHS->getOperand(1)) && isa(RHS->getOperand(1))) return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0)); - return 0; + return nullptr; } Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1); @@ -1096,7 +1098,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { } } - return 0; + return nullptr; } @@ -1104,6 +1106,9 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyAndInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); @@ -1198,7 +1203,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { // If this is an integer truncation, and if the source is an 'and' with // immediate, transform it. This frequently occurs for bitfield accesses. 
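The first FoldAndOfICmps fold above, (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C), is only sound when C is a power of two: unsigned "below 2^k" means no bit at position k or higher, and OR cannot manufacture such a bit. Exhaustive 6-bit check:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t C = 16;  // a power of two; C = 3 would break the fold
      for (uint32_t a = 0; a < 64; ++a)
        for (uint32_t b = 0; b < 64; ++b)
          assert(((a < C) && (b < C)) == ((a | b) < C));
      return 0;
    }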
{ - Value *X = 0; ConstantInt *YC = 0; + Value *X = nullptr; ConstantInt *YC = nullptr; if (match(Op0, m_Trunc(m_And(m_Value(X), m_ConstantInt(YC))))) { // Change: and (trunc (and X, YC) to T), C2 // into : and (trunc X to T), trunc(YC) & C2 @@ -1231,7 +1236,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } { - Value *A = 0, *B = 0, *C = 0, *D = 0; + Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; // (A|B) & ~(A&B) -> A^B if (match(Op0, m_Or(m_Value(A), m_Value(B))) && match(Op1, m_Not(m_And(m_Value(C), m_Value(D)))) && @@ -1339,7 +1344,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } { - Value *X = 0; + Value *X = nullptr; bool OpsSwapped = false; // Canonicalize SExt or Not to the LHS if (match(Op1, m_SExt(m_Value())) || @@ -1366,7 +1371,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { std::swap(Op0, Op1); } - return Changed ? &I : 0; + return Changed ? &I : nullptr; } /// CollectBSwapParts - Analyze the specified subexpression and see if it is @@ -1498,7 +1503,7 @@ Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { if (!ITy || ITy->getBitWidth() % 16 || // ByteMask only allows up to 32-byte values. ITy->getBitWidth() > 32*8) - return 0; // Can only bswap pairs of bytes. Can't do vectors. + return nullptr; // Can only bswap pairs of bytes. Can't do vectors. /// ByteValues - For each byte of the result, we keep track of which value /// defines each byte. @@ -1508,16 +1513,16 @@ Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { // Try to find all the pieces corresponding to the bswap. uint32_t ByteMask = ~0U >> (32-ByteValues.size()); if (CollectBSwapParts(&I, 0, ByteMask, ByteValues)) - return 0; + return nullptr; // Check to see if all of the bytes come from the same value. Value *V = ByteValues[0]; - if (V == 0) return 0; // Didn't find a byte? Must be zero. + if (!V) return nullptr; // Didn't find a byte? Must be zero. // Check to make sure that all of the bytes come from the same value. for (unsigned i = 1, e = ByteValues.size(); i != e; ++i) if (ByteValues[i] != V) - return 0; + return nullptr; Module *M = I.getParent()->getParent()->getParent(); Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy); return CallInst::Create(F, V); @@ -1529,10 +1534,10 @@ Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { static Instruction *MatchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D) { // If A is not a select of -1/0, this cannot match. - Value *Cond = 0; + Value *Cond = nullptr; if (!match(A, m_SExt(m_Value(Cond))) || !Cond->getType()->isIntegerTy(1)) - return 0; + return nullptr; // ((cond?-1:0)&C) | (B&(cond?0:-1)) -> cond ? C : B. if (match(D, m_Not(m_SExt(m_Specific(Cond))))) @@ -1545,7 +1550,7 @@ static Instruction *MatchSelectFromAndOr(Value *A, Value *B, return SelectInst::Create(Cond, C, D); if (match(B, m_SExt(m_Not(m_Specific(Cond))))) return SelectInst::Create(Cond, C, D); - return 0; + return nullptr; } /// FoldOrOfICmps - Fold (icmp)|(icmp) if possible. 
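The "(A|B) & ~(A&B) -> A^B" fold above is the textbook characterization of XOR: set in at least one operand, but not in both. Exhaustive 8-bit check:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t a = 0; a < 256; ++a)
        for (uint32_t b = 0; b < 256; ++b)
          assert(((a | b) & ~(a & b)) == (a ^ b));
      return 0;
    }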
@@ -1566,8 +1571,8 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { LAnd->getOpcode() == Instruction::And && RAnd->getOpcode() == Instruction::And) { - Value *Mask = 0; - Value *Masked = 0; + Value *Mask = nullptr; + Value *Masked = nullptr; if (LAnd->getOperand(0) == RAnd->getOperand(0) && isKnownToBeAPowerOfTwo(LAnd->getOperand(1)) && isKnownToBeAPowerOfTwo(RAnd->getOperand(1))) { @@ -1608,7 +1613,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { if (LHS->hasOneUse() || RHS->hasOneUse()) { // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1) // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1) - Value *A = 0, *B = 0; + Value *A = nullptr, *B = nullptr; if (LHSCC == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero()) { B = Val; if (RHSCC == ICmpInst::ICMP_ULT && Val == RHS->getOperand(1)) @@ -1632,7 +1637,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { } // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2). - if (LHSCst == 0 || RHSCst == 0) return 0; + if (!LHSCst || !RHSCst) return nullptr; if (LHSCst == RHSCst && LHSCC == RHSCC) { // (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0) @@ -1653,18 +1658,18 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // From here on, we only handle: // (icmp1 A, C1) | (icmp2 A, C2) --> something simpler. - if (Val != Val2) return 0; + if (Val != Val2) return nullptr; // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere. if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE || RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE || LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE || RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE) - return 0; + return nullptr; // We can't fold (ugt x, C) | (sgt x, C2). if (!PredicatesFoldable(LHSCC, RHSCC)) - return 0; + return nullptr; // Ensure that the larger constant is on the RHS. bool ShouldSwap; @@ -1809,7 +1814,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { } break; } - return 0; + return nullptr; } /// FoldOrOfFCmps - Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of @@ -1837,7 +1842,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { isa(RHS->getOperand(1))) return Builder->CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0)); - return 0; + return nullptr; } Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1); @@ -1869,7 +1874,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { return getFCmpValue(Op0Ordered, Op0Pred|Op1Pred, Op0LHS, Op0RHS, Builder); } } - return 0; + return nullptr; } /// FoldOrWithConstants - This helper function folds: @@ -1884,27 +1889,30 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op, Value *A, Value *B, Value *C) { ConstantInt *CI1 = dyn_cast(C); - if (!CI1) return 0; + if (!CI1) return nullptr; - Value *V1 = 0; - ConstantInt *CI2 = 0; - if (!match(Op, m_And(m_Value(V1), m_ConstantInt(CI2)))) return 0; + Value *V1 = nullptr; + ConstantInt *CI2 = nullptr; + if (!match(Op, m_And(m_Value(V1), m_ConstantInt(CI2)))) return nullptr; APInt Xor = CI1->getValue() ^ CI2->getValue(); - if (!Xor.isAllOnesValue()) return 0; + if (!Xor.isAllOnesValue()) return nullptr; if (V1 == A || V1 == B) { Value *NewOp = Builder->CreateAnd((V1 == A) ? 
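MatchSelectFromAndOr, patched above, recognizes the branchless select idiom: with m = cond ? -1 : 0 (the sext of an i1), the expression (m & c) | (~m & b) selects c when cond holds and b otherwise. A scalar sketch:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int cond = 0; cond <= 1; ++cond) {
        const uint32_t b = 0x1234, c = 0xBEEF;
        uint32_t m = cond ? 0xFFFFFFFFu : 0u;  // sext of the i1 condition
        assert(((m & c) | (~m & b)) == (cond ? c : b));
      }
      return 0;
    }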
B : A, CI1); return BinaryOperator::CreateOr(NewOp, V1); } - return 0; + return nullptr; } Instruction *InstCombiner::visitOr(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyOrInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); @@ -1918,7 +1926,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { return &I; if (ConstantInt *RHS = dyn_cast(Op1)) { - ConstantInt *C1 = 0; Value *X = 0; + ConstantInt *C1 = nullptr; Value *X = nullptr; // (X & C1) | C2 --> (X | C2) & (C1|C2) // iff (C1 & C2) == 0. if (match(Op0, m_And(m_Value(X), m_ConstantInt(C1))) && @@ -1949,8 +1957,8 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { return NV; } - Value *A = 0, *B = 0; - ConstantInt *C1 = 0, *C2 = 0; + Value *A = nullptr, *B = nullptr; + ConstantInt *C1 = nullptr, *C2 = nullptr; // (A | B) | C and A | (B | C) -> bswap if possible. // (A >> B) | (C << D) and (A << B) | (B >> C) -> bswap if possible. @@ -1981,10 +1989,10 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { } // (A & C)|(B & D) - Value *C = 0, *D = 0; + Value *C = nullptr, *D = nullptr; if (match(Op0, m_And(m_Value(A), m_Value(C))) && match(Op1, m_And(m_Value(B), m_Value(D)))) { - Value *V1 = 0, *V2 = 0; + Value *V1 = nullptr, *V2 = nullptr; C1 = dyn_cast(C); C2 = dyn_cast(D); if (C1 && C2) { // (A & C1)|(B & C2) @@ -2028,7 +2036,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2) // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0. - ConstantInt *C3 = 0, *C4 = 0; + ConstantInt *C3 = nullptr, *C4 = nullptr; if (match(A, m_Or(m_Value(V1), m_ConstantInt(C3))) && (C3->getValue() & ~C1->getValue()) == 0 && match(B, m_Or(m_Specific(V1), m_ConstantInt(C4))) && @@ -2220,7 +2228,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { // Since this OR statement hasn't been optimized further yet, we hope // that this transformation will allow the new ORs to be optimized. { - Value *X = 0, *Y = 0; + Value *X = nullptr, *Y = nullptr; if (Op0->hasOneUse() && Op1->hasOneUse() && match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) && match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) { @@ -2230,13 +2238,16 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { } } - return Changed ? &I : 0; + return Changed ? &I : nullptr; } Instruction *InstCombiner::visitXor(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyXorInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); @@ -2494,5 +2505,5 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { } } - return Changed ? &I : 0; + return Changed ? 
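The "(X & C1) | C2 --> (X | C2) & (C1|C2)" rewrite in visitOr above carries the side condition (C1 & C2) == 0: with disjoint masks, pulling the OR inside cannot disturb the bits the AND keeps. Exhaustive 16-bit check for one such pair:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t C1 = 0x0F0F, C2 = 0x3000;  // disjoint: C1 & C2 == 0
      for (uint32_t x = 0; x < (1u << 16); ++x)
        assert(((x & C1) | C2) == ((x | C2) & (C1 | C2)));
      return 0;
    }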
&I : nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 0bc3ac7..d4b583b 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -22,6 +22,8 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + STATISTIC(NumSimplified, "Number of library calls simplified"); /// getPromotedType - Return the specified type promoted as it would be to pass @@ -70,7 +72,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with // load/store. ConstantInt *MemOpLength = dyn_cast(MI->getArgOperand(2)); - if (MemOpLength == 0) return 0; + if (!MemOpLength) return nullptr; // Source and destination pointer types are always "i8*" for intrinsic. See // if the size is something we can handle with a single primitive load/store. @@ -80,7 +82,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { assert(Size && "0-sized memory transferring should be removed already."); if (Size > 8 || (Size&(Size-1))) - return 0; // If not 1/2/4/8 bytes, exit. + return nullptr; // If not 1/2/4/8 bytes, exit. // Use an integer load+store unless we can find something better. unsigned SrcAddrSp = @@ -99,7 +101,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { // dest address will be promotable. See if we can find a better type than the // integer datatype. Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts(); - MDNode *CopyMD = 0; + MDNode *CopyMD = nullptr; if (StrippedDest != MI->getArgOperand(0)) { Type *SrcETy = cast(StrippedDest->getType()) ->getElementType(); @@ -163,7 +165,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { ConstantInt *LenC = dyn_cast(MI->getLength()); ConstantInt *FillC = dyn_cast(MI->getValue()); if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8)) - return 0; + return nullptr; uint64_t Len = LenC->getLimitedValue(); Alignment = MI->getAlignment(); assert(Len && "0-sized memory setting should be removed already."); @@ -191,7 +193,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { return MI; } - return 0; + return nullptr; } /// visitCallInst - CallInst simplification. This mostly only handles folding @@ -233,7 +235,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // No other transformations apply to volatile transfers. 
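SimplifyMemTransfer above turns a memcpy whose constant length is 1, 2, 4, or 8 (everything else is rejected by the "Size > 8 || (Size&(Size-1))" test) into a single integer load plus store. The scalar picture for Size == 4:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint8_t src[4] = {1, 2, 3, 4}, dst[4] = {0};
      uint32_t tmp;                        // the single i32 the copy becomes
      std::memcpy(&tmp, src, sizeof tmp);  // one 4-byte load
      std::memcpy(dst, &tmp, sizeof tmp);  // one 4-byte store
      assert(std::memcmp(src, dst, 4) == 0);
      return 0;
    }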
if (MI->isVolatile()) - return 0; + return nullptr; // If we have a memmove and the source operation is a constant global, // then the source and dest pointers can't alias, so we can change this @@ -276,11 +278,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { uint64_t Size; if (getObjectSize(II->getArgOperand(0), Size, DL, TLI)) return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size)); - return 0; + return nullptr; } case Intrinsic::bswap: { Value *IIOperand = II->getArgOperand(0); - Value *X = 0; + Value *X = nullptr; // bswap(bswap(x)) -> x if (match(IIOperand, m_BSwap(m_Value(X)))) @@ -320,7 +322,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { uint32_t BitWidth = IT->getBitWidth(); APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(II->getArgOperand(0), KnownZero, KnownOne); + computeKnownBits(II->getArgOperand(0), KnownZero, KnownOne); unsigned TrailingZeros = KnownOne.countTrailingZeros(); APInt Mask(APInt::getLowBitsSet(BitWidth, TrailingZeros)); if ((Mask & KnownZero) == Mask) @@ -338,7 +340,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { uint32_t BitWidth = IT->getBitWidth(); APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(II->getArgOperand(0), KnownZero, KnownOne); + computeKnownBits(II->getArgOperand(0), KnownZero, KnownOne); unsigned LeadingZeros = KnownOne.countLeadingZeros(); APInt Mask(APInt::getHighBitsSet(BitWidth, LeadingZeros)); if ((Mask & KnownZero) == Mask) @@ -353,14 +355,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { uint32_t BitWidth = IT->getBitWidth(); APInt LHSKnownZero(BitWidth, 0); APInt LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne); + computeKnownBits(LHS, LHSKnownZero, LHSKnownOne); bool LHSKnownNegative = LHSKnownOne[BitWidth - 1]; bool LHSKnownPositive = LHSKnownZero[BitWidth - 1]; if (LHSKnownNegative || LHSKnownPositive) { APInt RHSKnownZero(BitWidth, 0); APInt RHSKnownOne(BitWidth, 0); - ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); + computeKnownBits(RHS, RHSKnownZero, RHSKnownOne); bool RHSKnownNegative = RHSKnownOne[BitWidth - 1]; bool RHSKnownPositive = RHSKnownZero[BitWidth - 1]; if (LHSKnownNegative && RHSKnownNegative) { @@ -447,10 +449,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { APInt LHSKnownZero(BitWidth, 0); APInt LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne); + computeKnownBits(LHS, LHSKnownZero, LHSKnownOne); APInt RHSKnownZero(BitWidth, 0); APInt RHSKnownOne(BitWidth, 0); - ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); + computeKnownBits(RHS, RHSKnownZero, RHSKnownOne); // Get the largest possible values for each operand. APInt LHSMax = ~LHSKnownZero; @@ -554,6 +556,79 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + // Constant fold << Ci. + // FIXME: We don't handle _dq because it's a shift of an i128, but is + // represented in the IR as <2 x i64>. A per element shift is wrong. 
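The "bswap(bswap(x)) -> x" fold above works because byte swapping is an involution. A portable 32-bit model (no compiler builtin assumed):

    #include <cassert>
    #include <cstdint>

    uint32_t bswap32(uint32_t x) {
      return (x >> 24) | ((x >> 8) & 0x0000FF00u) |
             ((x << 8) & 0x00FF0000u) | (x << 24);
    }

    int main() {
      assert(bswap32(0x11223344u) == 0x44332211u);
      assert(bswap32(bswap32(0xDEADBEEFu)) == 0xDEADBEEFu);  // involution
      return 0;
    }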
+ case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: { + // Simplify if count is constant. To 0 if >= BitWidth, + // otherwise to shl/lshr. + auto CDV = dyn_cast(II->getArgOperand(1)); + auto CInt = dyn_cast(II->getArgOperand(1)); + if (!CDV && !CInt) + break; + ConstantInt *Count; + if (CDV) + Count = cast(CDV->getElementAsConstant(0)); + else + Count = CInt; + + auto Vec = II->getArgOperand(0); + auto VT = cast(Vec->getType()); + if (Count->getZExtValue() > + VT->getElementType()->getPrimitiveSizeInBits() - 1) + return ReplaceInstUsesWith( + CI, ConstantAggregateZero::get(Vec->getType())); + + bool isPackedShiftLeft = true; + switch (II->getIntrinsicID()) { + default : break; + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: isPackedShiftLeft = false; break; + } + + unsigned VWidth = VT->getNumElements(); + // Get a constant vector of the same type as the first operand. + auto VTCI = ConstantInt::get(VT->getElementType(), Count->getZExtValue()); + if (isPackedShiftLeft) + return BinaryOperator::CreateShl(Vec, + Builder->CreateVectorSplat(VWidth, VTCI)); + + return BinaryOperator::CreateLShr(Vec, + Builder->CreateVectorSplat(VWidth, VTCI)); + } case Intrinsic::x86_sse41_pmovsxbw: case Intrinsic::x86_sse41_pmovsxwd: @@ -576,6 +651,153 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::x86_sse4a_insertqi: { + // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top + // ones undef + // TODO: eventually we should lower this intrinsic to IR + if (auto CIWidth = dyn_cast(II->getArgOperand(2))) { + if (auto CIStart = dyn_cast(II->getArgOperand(3))) { + if (CIWidth->equalsInt(64) && CIStart->isZero()) { + Value *Vec = II->getArgOperand(1); + Value *Undef = UndefValue::get(Vec->getType()); + const uint32_t Mask[] = { 0, 2 }; + return ReplaceInstUsesWith( + CI, + Builder->CreateShuffleVector( + Vec, Undef, ConstantDataVector::get( + II->getContext(), ArrayRef(Mask)))); + + } else if (auto Source = + dyn_cast(II->getArgOperand(0))) { + if (Source->hasOneUse() && + Source->getArgOperand(1) == II->getArgOperand(1)) { + // If the source of the insert has only one use and it's another + // insert (and they're both inserting from the same vector), try to + // bundle both together. 
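The constant-folding block above encodes the x86 packed-shift rule: a count of element-width or more yields all-zero lanes (not undefined behavior, unlike an oversized C++ shift), and anything smaller lowers to an ordinary shl/lshr by a splat. One psrli.d lane, modeled in scalar form:

    #include <cassert>
    #include <cstdint>

    uint32_t psrli_lane(uint32_t v, uint32_t count) {
      return count > 31 ? 0u : v >> count;  // hardware saturates to zero
    }

    int main() {
      assert(psrli_lane(0xFFFFFFFFu, 32) == 0);  // oversized count -> 0
      assert(psrli_lane(0x80u, 3) == 0x10u);     // small count -> plain lshr
      return 0;
    }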
+ auto CISourceWidth = + dyn_cast(Source->getArgOperand(2)); + auto CISourceStart = + dyn_cast(Source->getArgOperand(3)); + if (CISourceStart && CISourceWidth) { + unsigned Start = CIStart->getZExtValue(); + unsigned Width = CIWidth->getZExtValue(); + unsigned End = Start + Width; + unsigned SourceStart = CISourceStart->getZExtValue(); + unsigned SourceWidth = CISourceWidth->getZExtValue(); + unsigned SourceEnd = SourceStart + SourceWidth; + unsigned NewStart, NewWidth; + bool ShouldReplace = false; + if (Start <= SourceStart && SourceStart <= End) { + NewStart = Start; + NewWidth = std::max(End, SourceEnd) - NewStart; + ShouldReplace = true; + } else if (SourceStart <= Start && Start <= SourceEnd) { + NewStart = SourceStart; + NewWidth = std::max(SourceEnd, End) - NewStart; + ShouldReplace = true; + } + + if (ShouldReplace) { + Constant *ConstantWidth = ConstantInt::get( + II->getArgOperand(2)->getType(), NewWidth, false); + Constant *ConstantStart = ConstantInt::get( + II->getArgOperand(3)->getType(), NewStart, false); + Value *Args[4] = { Source->getArgOperand(0), + II->getArgOperand(1), ConstantWidth, + ConstantStart }; + Module *M = CI.getParent()->getParent()->getParent(); + Value *F = + Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args)); + } + } + } + } + } + } + break; + } + + case Intrinsic::x86_sse41_pblendvb: + case Intrinsic::x86_sse41_blendvps: + case Intrinsic::x86_sse41_blendvpd: + case Intrinsic::x86_avx_blendv_ps_256: + case Intrinsic::x86_avx_blendv_pd_256: + case Intrinsic::x86_avx2_pblendvb: { + // Convert blendv* to vector selects if the mask is constant. + // This optimization is convoluted because the intrinsic is defined as + // getting a vector of floats or doubles for the ps and pd versions. + // FIXME: That should be changed. + Value *Mask = II->getArgOperand(2); + if (auto C = dyn_cast(Mask)) { + auto Tyi1 = Builder->getInt1Ty(); + auto SelectorType = cast(Mask->getType()); + auto EltTy = SelectorType->getElementType(); + unsigned Size = SelectorType->getNumElements(); + unsigned BitWidth = + EltTy->isFloatTy() + ? 32 + : (EltTy->isDoubleTy() ? 64 : EltTy->getIntegerBitWidth()); + assert((BitWidth == 64 || BitWidth == 32 || BitWidth == 8) && + "Wrong arguments for variable blend intrinsic"); + SmallVector Selectors; + for (unsigned I = 0; I < Size; ++I) { + // The intrinsics only read the top bit + uint64_t Selector; + if (BitWidth == 8) + Selector = C->getElementAsInteger(I); + else + Selector = C->getElementAsAPFloat(I).bitcastToAPInt().getZExtValue(); + Selectors.push_back(ConstantInt::get(Tyi1, Selector >> (BitWidth - 1))); + } + auto NewSelector = ConstantVector::get(Selectors); + return SelectInst::Create(NewSelector, II->getArgOperand(1), + II->getArgOperand(0), "blendv"); + } else { + break; + } + } + + case Intrinsic::x86_avx_vpermilvar_ps: + case Intrinsic::x86_avx_vpermilvar_ps_256: + case Intrinsic::x86_avx_vpermilvar_pd: + case Intrinsic::x86_avx_vpermilvar_pd_256: { + // Convert vpermil* to shufflevector if the mask is constant. + Value *V = II->getArgOperand(1); + unsigned Size = cast(V->getType())->getNumElements(); + assert(Size == 8 || Size == 4 || Size == 2); + uint32_t Indexes[8]; + if (auto C = dyn_cast(V)) { + // The intrinsics only read one or two bits, clear the rest. 
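The insertqi merge above unions two bit windows [Start, Start+Width) when they touch or overlap; this standalone sketch replays its NewStart/NewWidth arithmetic for one overlapping pair:

    #include <cassert>
    #include <algorithm>

    int main() {
      unsigned Start = 8, Width = 16, SourceStart = 16, SourceWidth = 16;
      unsigned End = Start + Width, SourceEnd = SourceStart + SourceWidth;
      unsigned NewStart = 0, NewWidth = 0;
      bool ShouldReplace = false;
      if (Start <= SourceStart && SourceStart <= End) {
        NewStart = Start;
        NewWidth = std::max(End, SourceEnd) - NewStart;
        ShouldReplace = true;
      } else if (SourceStart <= Start && Start <= SourceEnd) {
        NewStart = SourceStart;
        NewWidth = std::max(SourceEnd, End) - NewStart;
        ShouldReplace = true;
      }
      assert(ShouldReplace && NewStart == 8 && NewWidth == 24);  // [8, 32)
      return 0;
    }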
+ for (unsigned I = 0; I < Size; ++I) { + uint32_t Index = C->getElementAsInteger(I) & 0x3; + if (II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd || + II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256) + Index >>= 1; + Indexes[I] = Index; + } + } else if (isa(V)) { + for (unsigned I = 0; I < Size; ++I) + Indexes[I] = 0; + } else { + break; + } + // The _256 variants are a bit trickier since the mask bits always index + // into the corresponding 128 half. In order to convert to a generic + // shuffle, we have to make that explicit. + if (II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_ps_256 || + II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256) { + for (unsigned I = Size / 2; I < Size; ++I) + Indexes[I] += Size / 2; + } + auto NewC = + ConstantDataVector::get(V->getContext(), makeArrayRef(Indexes, Size)); + auto V1 = II->getArgOperand(0); + auto V2 = UndefValue::get(V1->getType()); + auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC); + return ReplaceInstUsesWith(CI, Shuffle); + } + case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. if (Constant *Mask = dyn_cast(II->getArgOperand(2))) { @@ -586,8 +808,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { bool AllEltsOk = true; for (unsigned i = 0; i != 16; ++i) { Constant *Elt = Mask->getAggregateElement(i); - if (Elt == 0 || - !(isa(Elt) || isa(Elt))) { + if (!Elt || !(isa(Elt) || isa(Elt))) { AllEltsOk = false; break; } @@ -612,7 +833,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { cast(Mask->getAggregateElement(i))->getZExtValue(); Idx &= 31; // Match the hardware behavior. - if (ExtractedElts[Idx] == 0) { + if (!ExtractedElts[Idx]) { ExtractedElts[Idx] = Builder->CreateExtractElement(Idx < 16 ? Op0 : Op1, Builder->getInt32(Idx&15)); @@ -655,8 +876,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::arm_neon_vmulls: case Intrinsic::arm_neon_vmullu: - case Intrinsic::arm64_neon_smull: - case Intrinsic::arm64_neon_umull: { + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: { Value *Arg0 = II->getArgOperand(0); Value *Arg1 = II->getArgOperand(1); @@ -667,7 +888,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Check for constant LHS & RHS - in this case we just simplify. bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu || - II->getIntrinsicID() == Intrinsic::arm64_neon_umull); + II->getIntrinsicID() == Intrinsic::aarch64_neon_umull); VectorType *NewVT = cast(II->getType()); if (Constant *CV0 = dyn_cast(Arg0)) { if (Constant *CV1 = dyn_cast(Arg1)) { @@ -776,14 +997,14 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS, // mempcpy_chk, memmove_chk, memset_chk, strcpy_chk, stpcpy_chk, strncpy_chk, // strcat_chk and strncat_chk. Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const DataLayout *DL) { - if (CI->getCalledFunction() == 0) return 0; + if (!CI->getCalledFunction()) return nullptr; if (Value *With = Simplifier->optimizeCall(CI)) { ++NumSimplified; return CI->use_empty() ? 
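The vpermilvar lowering above reduces each selector lane to the bits the instruction actually reads (two for ps; pd reads bit 1, hence the extra shift) and rebases upper-half lanes of the 256-bit forms onto their own half. A scalar model of the ps.256 index computation:

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned Size = 8;  // vpermilvar.ps.256: eight f32 lanes
      uint32_t Mask[Size] = {3, 2, 1, 0, 3, 2, 1, 0};
      uint32_t Indexes[Size];
      for (unsigned I = 0; I < Size; ++I)
        Indexes[I] = Mask[I] & 0x3;     // only two bits are read
      for (unsigned I = Size / 2; I < Size; ++I)
        Indexes[I] += Size / 2;         // upper 128-bit half stays upper
      assert(Indexes[0] == 3 && Indexes[4] == 7);
      return 0;
    }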
CI : ReplaceInstUsesWith(*CI, With); } - return 0; + return nullptr; } static IntrinsicInst *FindInitTrampolineFromAlloca(Value *TrampMem) { @@ -792,35 +1013,35 @@ static IntrinsicInst *FindInitTrampolineFromAlloca(Value *TrampMem) { Value *Underlying = TrampMem->stripPointerCasts(); if (Underlying != TrampMem && (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem)) - return 0; + return nullptr; if (!isa(Underlying)) - return 0; + return nullptr; - IntrinsicInst *InitTrampoline = 0; + IntrinsicInst *InitTrampoline = nullptr; for (User *U : TrampMem->users()) { IntrinsicInst *II = dyn_cast(U); if (!II) - return 0; + return nullptr; if (II->getIntrinsicID() == Intrinsic::init_trampoline) { if (InitTrampoline) // More than one init_trampoline writes to this value. Give up. - return 0; + return nullptr; InitTrampoline = II; continue; } if (II->getIntrinsicID() == Intrinsic::adjust_trampoline) // Allow any number of calls to adjust.trampoline. continue; - return 0; + return nullptr; } // No call to init.trampoline found. if (!InitTrampoline) - return 0; + return nullptr; // Check that the alloca is being used in the expected way. if (InitTrampoline->getOperand(0) != TrampMem) - return 0; + return nullptr; return InitTrampoline; } @@ -837,9 +1058,9 @@ static IntrinsicInst *FindInitTrampolineFromBB(IntrinsicInst *AdjustTramp, II->getOperand(0) == TrampMem) return II; if (Inst->mayWriteToMemory()) - return 0; + return nullptr; } - return 0; + return nullptr; } // Given a call to llvm.adjust.trampoline, find and return the corresponding @@ -851,7 +1072,7 @@ static IntrinsicInst *FindInitTrampoline(Value *Callee) { IntrinsicInst *AdjustTramp = dyn_cast(Callee); if (!AdjustTramp || AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline) - return 0; + return nullptr; Value *TrampMem = AdjustTramp->getOperand(0); @@ -859,7 +1080,7 @@ static IntrinsicInst *FindInitTrampoline(Value *Callee) { return IT; if (IntrinsicInst *IT = FindInitTrampolineFromBB(AdjustTramp, TrampMem)) return IT; - return 0; + return nullptr; } // visitCallSite - Improvements for call and invoke instructions. @@ -874,7 +1095,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // arguments of the call/invoke. Value *Callee = CS.getCalledValue(); if (!isa(Callee) && transformConstExprCastCall(CS)) - return 0; + return nullptr; if (Function *CalleeF = dyn_cast(Callee)) // If the call and callee calling conventions don't match, this call must @@ -899,7 +1120,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // change the callee to a null pointer. cast(OldCall)->setCalledFunction( Constant::getNullValue(CalleeF->getType())); - return 0; + return nullptr; } if (isa(Callee) || isa(Callee)) { @@ -911,7 +1132,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { if (isa(CS.getInstruction())) { // Can't remove an invoke because we cannot change the CFG. - return 0; + return nullptr; } // This instruction is not reachable, just remove it. We insert a store to @@ -959,7 +1180,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { if (I) return EraseInstFromFunction(*I); } - return Changed ? CS.getInstruction() : 0; + return Changed ? 
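FindInitTrampolineFromAlloca above accepts the memory only if every user is either an adjust.trampoline or exactly one init.trampoline. The control shape, abstracted onto hypothetical stand-in types (nothing here is LLVM's API):

    #include <cassert>
    #include <vector>

    enum class Kind { Init, Adjust, Other };

    const Kind *findSingleInit(const std::vector<Kind> &Users) {
      const Kind *Init = nullptr;
      for (const Kind &U : Users) {
        if (U == Kind::Init) {
          if (Init) return nullptr;  // a second writer: give up
          Init = &U;
        } else if (U != Kind::Adjust) {
          return nullptr;            // unexpected user: give up
        }
      }
      return Init;                   // may still be nullptr: none found
    }

    int main() {
      std::vector<Kind> ok = {Kind::Adjust, Kind::Init};
      std::vector<Kind> dup = {Kind::Init, Kind::Init};
      assert(findSingleInit(ok) && !findSingleInit(dup));
      return 0;
    }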
CS.getInstruction() : nullptr; } // transformConstExprCastCall - If the callee is a constexpr cast of a function, @@ -968,7 +1189,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { bool InstCombiner::transformConstExprCastCall(CallSite CS) { Function *Callee = dyn_cast(CS.getCalledValue()->stripPointerCasts()); - if (Callee == 0) + if (!Callee) return false; Instruction *Caller = CS.getInstruction(); const AttributeSet &CallerPAL = CS.getAttributes(); @@ -1044,7 +1265,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { CallerPAL.getParamAttributes(i + 1).hasAttribute(i + 1, Attribute::ByVal)) { PointerType *ParamPTy = dyn_cast(ParamTy); - if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || DL == 0) + if (!ParamPTy || !ParamPTy->getElementType()->isSized() || !DL) return false; Type *CurElTy = ActTy->getPointerElementType(); @@ -1235,7 +1456,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, // If the call already has the 'nest' attribute somewhere then give up - // otherwise 'nest' would occur twice after splicing in the chain. if (Attrs.hasAttrSomewhere(Attribute::Nest)) - return 0; + return nullptr; assert(Tramp && "transformCallThroughTrampoline called with incorrect CallSite."); @@ -1247,7 +1468,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, const AttributeSet &NestAttrs = NestF->getAttributes(); if (!NestAttrs.isEmpty()) { unsigned NestIdx = 1; - Type *NestTy = 0; + Type *NestTy = nullptr; AttributeSet NestAttr; // Look for a parameter marked with the 'nest' attribute. diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index c2b862a..356803a 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -19,6 +19,8 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + /// DecomposeSimpleLinearExpr - Analyze 'Val', seeing if it is a simple linear /// expression. If so, decompose it, returning some value X, such that Val is /// X*Scale+Offset. @@ -79,7 +81,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI) { // This requires DataLayout to get the alloca alignment and size information. - if (!DL) return 0; + if (!DL) return nullptr; PointerType *PTy = cast(CI.getType()); @@ -89,26 +91,26 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, // Get the type really allocated and the type casted to. Type *AllocElTy = AI.getAllocatedType(); Type *CastElTy = PTy->getElementType(); - if (!AllocElTy->isSized() || !CastElTy->isSized()) return 0; + if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr; unsigned AllocElTyAlign = DL->getABITypeAlignment(AllocElTy); unsigned CastElTyAlign = DL->getABITypeAlignment(CastElTy); - if (CastElTyAlign < AllocElTyAlign) return 0; + if (CastElTyAlign < AllocElTyAlign) return nullptr; // If the allocation has multiple uses, only promote it if we are strictly // increasing the alignment of the resultant allocation. If we keep it the // same, we open the door to infinite loops of various kinds. 
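DecomposeSimpleLinearExpr, at the top of InstCombineCasts.cpp above, reports Val as X*Scale + Offset. A sketch of that round-trip invariant for one input shape, with illustrative values: Val = (n << 2) + 12 should decompose to X = n, Scale = 4, Offset = 12.

    #include <cassert>

    int main() {
      unsigned n = 7;
      unsigned Val = (n << 2) + 12;       // the expression being decomposed
      unsigned Scale = 4, Offset = 12;    // what the decomposition reports
      assert(Val == n * Scale + Offset);  // recomposition must round-trip
      return 0;
    }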
- if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return 0; + if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr; uint64_t AllocElTySize = DL->getTypeAllocSize(AllocElTy); uint64_t CastElTySize = DL->getTypeAllocSize(CastElTy); - if (CastElTySize == 0 || AllocElTySize == 0) return 0; + if (CastElTySize == 0 || AllocElTySize == 0) return nullptr; // If the allocation has multiple uses, only promote it if we're not // shrinking the amount of memory being allocated. uint64_t AllocElTyStoreSize = DL->getTypeStoreSize(AllocElTy); uint64_t CastElTyStoreSize = DL->getTypeStoreSize(CastElTy); - if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return 0; + if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr; // See if we can satisfy the modulus by pulling a scale out of the array // size argument. @@ -120,10 +122,10 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, // If we can now satisfy the modulus, by using a non-1 scale, we really can // do the xform. if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 || - (AllocElTySize*ArrayOffset ) % CastElTySize != 0) return 0; + (AllocElTySize*ArrayOffset ) % CastElTySize != 0) return nullptr; unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize; - Value *Amt = 0; + Value *Amt = nullptr; if (Scale == 1) { Amt = NumElements; } else { @@ -141,6 +143,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt); New->setAlignment(AI.getAlignment()); New->takeName(&AI); + New->setUsedWithInAlloca(AI.isUsedWithInAlloca()); // If the allocation has multiple real uses, insert a cast and change all // things that used it to use the new cast. This will also hack on CI, but it @@ -169,7 +172,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, // Otherwise, it must be an instruction. Instruction *I = cast(V); - Instruction *Res = 0; + Instruction *Res = nullptr; unsigned Opc = I->getOpcode(); switch (Opc) { case Instruction::Add: @@ -245,11 +248,11 @@ isEliminableCastPair( Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode()); Instruction::CastOps secondOp = Instruction::CastOps(opcode); Type *SrcIntPtrTy = DL && SrcTy->isPtrOrPtrVectorTy() ? - DL->getIntPtrType(SrcTy) : 0; + DL->getIntPtrType(SrcTy) : nullptr; Type *MidIntPtrTy = DL && MidTy->isPtrOrPtrVectorTy() ? - DL->getIntPtrType(MidTy) : 0; + DL->getIntPtrType(MidTy) : nullptr; Type *DstIntPtrTy = DL && DstTy->isPtrOrPtrVectorTy() ? - DL->getIntPtrType(DstTy) : 0; + DL->getIntPtrType(DstTy) : nullptr; unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy, DstTy, SrcIntPtrTy, MidIntPtrTy, DstIntPtrTy); @@ -318,7 +321,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { return NV; } - return 0; + return nullptr; } /// CanEvaluateTruncated - Return true if we can evaluate the specified @@ -470,7 +473,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { } // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion. 
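PromoteCastOfAllocation's size checks above guarantee the rewritten alloca covers the same bytes: the transform is refused unless the byte total divides evenly by the new element size. A sketch simplified to a constant element count (the real code pulls the scale out of a possibly non-constant array-size expression):

    #include <cassert>

    int main() {
      unsigned AllocElTySize = 4, CastElTySize = 8, NumElements = 4;
      assert((AllocElTySize * NumElements) % CastElTySize == 0);
      unsigned NewCount = AllocElTySize * NumElements / CastElTySize;
      assert(NewCount == 2);  // alloca [4 x i32] becomes alloca [2 x i64]
      return 0;
    }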
- Value *A = 0; ConstantInt *Cst = 0; + Value *A = nullptr; ConstantInt *Cst = nullptr; if (Src->hasOneUse() && match(Src, m_LShr(m_ZExt(m_Value(A)), m_ConstantInt(Cst)))) { // We have three types to worry about here, the type of A, the source of @@ -502,7 +505,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { ConstantExpr::getTrunc(Cst, CI.getType())); } - return 0; + return nullptr; } /// transformZExtICmp - Transform (zext icmp) to bitwise / integer operations @@ -550,7 +553,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, // If Op1C some other power of two, convert: uint32_t BitWidth = Op1C->getType()->getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - ComputeMaskedBits(ICI->getOperand(0), KnownZero, KnownOne); + computeKnownBits(ICI->getOperand(0), KnownZero, KnownOne); APInt KnownZeroMask(~KnownZero); if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? @@ -598,8 +601,8 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, APInt KnownZeroLHS(BitWidth, 0), KnownOneLHS(BitWidth, 0); APInt KnownZeroRHS(BitWidth, 0), KnownOneRHS(BitWidth, 0); - ComputeMaskedBits(LHS, KnownZeroLHS, KnownOneLHS); - ComputeMaskedBits(RHS, KnownZeroRHS, KnownOneRHS); + computeKnownBits(LHS, KnownZeroLHS, KnownOneLHS); + computeKnownBits(RHS, KnownZeroRHS, KnownOneRHS); if (KnownZeroLHS == KnownZeroRHS && KnownOneLHS == KnownOneRHS) { APInt KnownBits = KnownZeroLHS | KnownOneLHS; @@ -627,7 +630,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, } } - return 0; + return nullptr; } /// CanEvaluateZExtd - Determine if the specified value can be computed in the @@ -758,7 +761,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { // If this zero extend is only used by a truncate, let the truncate be // eliminated before we try to optimize this zext. if (CI.hasOneUse() && isa(CI.user_back())) - return 0; + return nullptr; // If one of the common conversion will work, do it. if (Instruction *Result = commonCastTransforms(CI)) @@ -883,7 +886,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { return BinaryOperator::CreateXor(New, ConstantInt::get(CI.getType(), 1)); } - return 0; + return nullptr; } /// transformSExtICmp - Transform (sext icmp) to bitwise / integer operations @@ -918,7 +921,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){ unsigned BitWidth = Op1C->getType()->getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - ComputeMaskedBits(Op0, KnownZero, KnownOne); + computeKnownBits(Op0, KnownZero, KnownOne); APInt KnownZeroMask(~KnownZero); if (KnownZeroMask.isPowerOf2()) { @@ -967,7 +970,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { } } - return 0; + return nullptr; } /// CanEvaluateSExtd - Return true if we can take the specified value @@ -1039,7 +1042,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { // If this sign extend is only used by a truncate, let the truncate be // eliminated before we try to optimize this sext. if (CI.hasOneUse() && isa(CI.user_back())) - return 0; + return nullptr; if (Instruction *I = commonCastTransforms(CI)) return I; @@ -1107,9 +1110,9 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { // into: // %a = shl i32 %i, 30 // %d = ashr i32 %a, 30 - Value *A = 0; + Value *A = nullptr; // TODO: Eventually this could be subsumed by EvaluateInDifferentType. 
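The shl/ashr pair in the comment above is a sign extension of the low bits in disguise. A small check of the identity for a 2-bit payload in i32 (shift amount 30), assuming two's complement behavior on the narrowing cast:

    #include <cassert>
    #include <cstdint>

    // Arithmetic right shift of the shifted-left value replicates bit 1 of x
    // into bits 2..31, i.e. sign-extends the low two bits.
    int32_t signExtendLow2(int32_t x) {
      return (int32_t)((uint32_t)x << 30) >> 30;
    }

    int main() {
      assert(signExtendLow2(1) == 1);   // low bits 01 -> +1
      assert(signExtendLow2(2) == -2);  // low bits 10 -> -2
      assert(signExtendLow2(7) == -1);  // low bits 11 -> -1 (bit 2 ignored)
    }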
- ConstantInt *BA = 0, *CA = 0; + ConstantInt *BA = nullptr, *CA = nullptr; if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_ConstantInt(BA)), m_ConstantInt(CA))) && BA == CA && A->getType() == CI.getType()) { @@ -1121,7 +1124,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { return BinaryOperator::CreateAShr(A, ShAmtV); } - return 0; + return nullptr; } @@ -1133,7 +1136,7 @@ static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo); if (!losesInfo) return ConstantFP::get(CFP->getContext(), F); - return 0; + return nullptr; } /// LookThroughFPExtensions - If this is an fp extension instruction, look @@ -1345,7 +1348,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { } } - return 0; + return nullptr; } Instruction *InstCombiner::visitFPExt(CastInst &CI) { @@ -1354,7 +1357,7 @@ Instruction *InstCombiner::visitFPExt(CastInst &CI) { Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) { Instruction *OpI = dyn_cast(FI.getOperand(0)); - if (OpI == 0) + if (!OpI) return commonCastTransforms(FI); // fptoui(uitofp(X)) --> X @@ -1374,7 +1377,7 @@ Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) { Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) { Instruction *OpI = dyn_cast(FI.getOperand(0)); - if (OpI == 0) + if (!OpI) return commonCastTransforms(FI); // fptosi(sitofp(X)) --> X @@ -1421,7 +1424,7 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { if (Instruction *I = commonCastTransforms(CI)) return I; - return 0; + return nullptr; } /// @brief Implement the transforms for cast of pointer (bitcast/ptrtoint) @@ -1520,7 +1523,7 @@ static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, // there yet. if (SrcTy->getElementType()->getPrimitiveSizeInBits() != DestTy->getElementType()->getPrimitiveSizeInBits()) - return 0; + return nullptr; SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements()); InVal = IC.Builder->CreateBitCast(InVal, SrcTy); @@ -1598,7 +1601,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, ElementIndex = Elements.size() - ElementIndex - 1; // Fail if multiple elements are inserted into this slot. - if (Elements[ElementIndex] != 0) + if (Elements[ElementIndex]) return false; Elements[ElementIndex] = V; @@ -1638,7 +1641,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, if (!V->hasOneUse()) return false; Instruction *I = dyn_cast(V); - if (I == 0) return false; + if (!I) return false; switch (I->getOpcode()) { default: return false; // Unhandled case. case Instruction::BitCast: @@ -1659,7 +1662,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, case Instruction::Shl: { // Must be shifting by a constant that is a multiple of the element size. ConstantInt *CI = dyn_cast(I->getOperand(1)); - if (CI == 0) return false; + if (!CI) return false; Shift += CI->getZExtValue(); if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false; return CollectInsertionElements(I->getOperand(0), Shift, @@ -1687,7 +1690,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, InstCombiner &IC) { // We need to know the target byte order to perform this optimization. 
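The byte-order dependence just noted is easy to see at the C++ level: on a little-endian target, the pieces of an i32 assembled by shifts land in vector-element order. A hedged sketch (assumes a little-endian host; illustrative, not the patch's code):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint16_t elts[2] = {0x1234, 0xBEEF};
      // An i32 assembled from element-sized pieces, as the pass recognizes.
      uint32_t assembled = (uint32_t)elts[0] | ((uint32_t)elts[1] << 16);

      uint16_t back[2];
      std::memcpy(back, &assembled, sizeof back); // the "bitcast"
      assert(back[0] == elts[0] && back[1] == elts[1]); // little-endian only
    }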
- if (!IC.getDataLayout()) return 0; + if (!IC.getDataLayout()) return nullptr; VectorType *DestVecTy = cast(CI.getType()); Value *IntInput = CI.getOperand(0); @@ -1695,14 +1698,14 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, SmallVector Elements(DestVecTy->getNumElements()); if (!CollectInsertionElements(IntInput, 0, Elements, DestVecTy->getElementType(), IC)) - return 0; + return nullptr; // If we succeeded, we know that all of the element are specified by Elements // or are zero if Elements has a null entry. Recast this as a set of // insertions. Value *Result = Constant::getNullValue(CI.getType()); for (unsigned i = 0, e = Elements.size(); i != e; ++i) { - if (Elements[i] == 0) continue; // Unset element. + if (!Elements[i]) continue; // Unset element. Result = IC.Builder->CreateInsertElement(Result, Elements[i], IC.Builder->getInt32(i)); @@ -1716,14 +1719,14 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, /// bitcast. The various long double bitcasts can't get in here. static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ // We need to know the target byte order to perform this optimization. - if (!IC.getDataLayout()) return 0; + if (!IC.getDataLayout()) return nullptr; Value *Src = CI.getOperand(0); Type *DestTy = CI.getType(); // If this is a bitcast from int to float, check to see if the int is an // extraction from a vector. - Value *VecInput = 0; + Value *VecInput = nullptr; // bitcast(trunc(bitcast(somevector))) if (match(Src, m_Trunc(m_BitCast(m_Value(VecInput)))) && isa(VecInput->getType())) { @@ -1747,7 +1750,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ } // bitcast(trunc(lshr(bitcast(somevector), cst)) - ConstantInt *ShAmt = 0; + ConstantInt *ShAmt = nullptr; if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)), m_ConstantInt(ShAmt)))) && isa(VecInput->getType())) { @@ -1769,7 +1772,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); } } - return 0; + return nullptr; } Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 8c0ad52..02e8bf1 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -24,6 +24,8 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + static ConstantInt *getOne(Constant *C) { return ConstantInt::get(cast(C->getType()), 1); } @@ -218,15 +220,15 @@ Instruction *InstCombiner:: FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI, ConstantInt *AndCst) { // We need TD information to know the pointer size unless this is inbounds. - if (!GEP->isInBounds() && DL == 0) - return 0; + if (!GEP->isInBounds() && !DL) + return nullptr; Constant *Init = GV->getInitializer(); if (!isa(Init) && !isa(Init)) - return 0; + return nullptr; uint64_t ArrayElementCount = Init->getType()->getArrayNumElements(); - if (ArrayElementCount > 1024) return 0; // Don't blow up on huge arrays. + if (ArrayElementCount > 1024) return nullptr; // Don't blow up on huge arrays. // There are many forms of this optimization we can handle, for now, just do // the simple index into a single-dimensional array. 
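The end state this function builds toward (further down) is a bit test against a magic constant: each bit of the constant records the compare result for one array element. A self-contained illustration with an invented 8-entry table:

    #include <cassert>
    #include <cstdint>

    static const int table[8] = {7, 42, 9, 42, 0, 1, 42, 3};

    // Bits 1, 3 and 6 are set: exactly the indices where table[i] == 42.
    static const uint32_t magic = (1u << 1) | (1u << 3) | (1u << 6);

    // "table[i] == 42" folded to "((magic >> i) & 1) != 0".
    bool eq42(unsigned i) { return ((magic >> i) & 1) != 0; }

    int main() {
      for (unsigned i = 0; i != 8; ++i)
        assert(eq42(i) == (table[i] == 42));
    }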
@@ -236,7 +238,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, !isa(GEP->getOperand(1)) || !cast(GEP->getOperand(1))->isZero() || isa(GEP->getOperand(2))) - return 0; + return nullptr; // Check that indices after the variable are constants and in-range for the // type they index. Collect the indices. This is typically for arrays of @@ -246,18 +248,18 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, Type *EltTy = Init->getType()->getArrayElementType(); for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) { ConstantInt *Idx = dyn_cast(GEP->getOperand(i)); - if (Idx == 0) return 0; // Variable index. + if (!Idx) return nullptr; // Variable index. uint64_t IdxVal = Idx->getZExtValue(); - if ((unsigned)IdxVal != IdxVal) return 0; // Too large array index. + if ((unsigned)IdxVal != IdxVal) return nullptr; // Too large array index. if (StructType *STy = dyn_cast(EltTy)) EltTy = STy->getElementType(IdxVal); else if (ArrayType *ATy = dyn_cast(EltTy)) { - if (IdxVal >= ATy->getNumElements()) return 0; + if (IdxVal >= ATy->getNumElements()) return nullptr; EltTy = ATy->getElementType(); } else { - return 0; // Unknown type. + return nullptr; // Unknown type. } LaterIndices.push_back(IdxVal); @@ -296,7 +298,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, Constant *CompareRHS = cast(ICI.getOperand(1)); for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) { Constant *Elt = Init->getAggregateElement(i); - if (Elt == 0) return 0; + if (!Elt) return nullptr; // If this is indexing an array of structures, get the structure element. if (!LaterIndices.empty()) @@ -321,7 +323,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // If we can't compute the result for any of the elements, we have to give // up evaluating the entire conditional. - if (!isa(C)) return 0; + if (!isa(C)) return nullptr; // Otherwise, we know if the comparison is true or false for this element, // update our state machines. @@ -375,7 +377,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, if ((i & 8) == 0 && i >= 64 && SecondTrueElement == Overdefined && SecondFalseElement == Overdefined && TrueRangeEnd == Overdefined && FalseRangeEnd == Overdefined) - return 0; + return nullptr; } // Now that we've scanned the entire array, emit our new comparison(s). We @@ -467,7 +469,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // of this load, replace it with computation that does: // ((magic_cst >> i) & 1) != 0 { - Type *Ty = 0; + Type *Ty = nullptr; // Look for an appropriate type: // - The type of Idx if the magic fits @@ -480,7 +482,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, else if (ArrayElementCount <= 32) Ty = Type::getInt32Ty(Init->getContext()); - if (Ty != 0) { + if (Ty) { Value *V = Builder->CreateIntCast(Idx, Ty, false); V = Builder->CreateLShr(ConstantInt::get(Ty, MagicBitvector), V); V = Builder->CreateAnd(ConstantInt::get(Ty, 1), V); @@ -488,7 +490,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, } } - return 0; + return nullptr; } @@ -533,7 +535,7 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { // If there are no variable indices, we must have a constant offset, just // evaluate it the general way. - if (i == e) return 0; + if (i == e) return nullptr; Value *VariableIdx = GEP->getOperand(i); // Determine the scale factor of the variable element. 
For example, this is
@@ -543,7 +545,7 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
   // Verify that there are no other variable indices. If so, emit the hard way.
   for (++i, ++GTI; i != e; ++i, ++GTI) {
     ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
-    if (!CI) return 0;
+    if (!CI) return nullptr;

     // Compute the aggregate offset of constant indices.
     if (CI->isZero()) continue;
@@ -587,7 +589,7 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
   // multiple of the variable scale.
   int64_t NewOffs = Offset / (int64_t)VariableScale;
   if (Offset != NewOffs*(int64_t)VariableScale)
-    return 0;
+    return nullptr;

   // Okay, we can do this evaluation. Start by converting the index to intptr.
   if (VariableIdx->getType() != IntPtrTy)
@@ -608,7 +610,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
   // e.g. "&foo[0] (RHS))
@@ -623,7 +625,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
       Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, *this);

       // If not, synthesize the offset the hard way.
-      if (Offset == 0)
+      if (!Offset)
         Offset = EmitGEPOffset(GEPLHS);
       return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
                           Constant::getNullValue(Offset->getType()));
@@ -661,7 +663,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
       // Otherwise, the base pointers are different and the indices are
       // different, bail out.
-      return 0;
+      return nullptr;
     }

     // If one of the GEPs has all zero indices, recurse.
@@ -729,7 +731,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
       return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
     }
   }
-  return 0;
+  return nullptr;
 }

 /// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X".
@@ -812,11 +814,11 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
   // if it finds it.
   bool DivIsSigned = DivI->getOpcode() == Instruction::SDiv;
   if (!ICI.isEquality() && DivIsSigned != ICI.isSigned())
-    return 0;
+    return nullptr;
   if (DivRHS->isZero())
-    return 0; // The ProdOV computation fails on divide by zero.
+    return nullptr; // The ProdOV computation fails on divide by zero.
   if (DivIsSigned && DivRHS->isAllOnesValue())
-    return 0; // The overflow computation also screws up here
+    return nullptr; // The overflow computation also screws up here
   if (DivRHS->isOne()) {
     // This eliminates some funny cases with INT_MIN.
     ICI.setOperand(0, DivI->getOperand(0));  // X/1 == X.
@@ -850,7 +852,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
   // overflow variable is set to 0 if its corresponding bound variable is valid,
   // -1 if overflowed off the bottom end, or +1 if overflowed off the top end.
   int LoOverflow = 0, HiOverflow = 0;
-  Constant *LoBound = 0, *HiBound = 0;
+  Constant *LoBound = nullptr, *HiBound = nullptr;

   if (!DivIsSigned) {  // udiv
     // e.g. X/5 op 3  --> [15, 20)
@@ -890,7 +892,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
       HiBound = cast<Constant>(ConstantExpr::getNeg(RangeSize));
       if (HiBound == DivRHS) {     // -INTMIN = INTMIN
         HiOverflow = 1;            // [INTMIN+1, overflow)
-        HiBound = 0;               // e.g. X/INTMIN = 0 --> X > INTMIN
+        HiBound = nullptr;         // e.g. X/INTMIN = 0 --> X > INTMIN
       }
     } else if (CmpRHSV.isStrictlyPositive()) {   // (X / neg) op pos
       // e.g.
X/-5 op 3 --> [-19, -14) @@ -964,20 +966,20 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, uint32_t TypeBits = CmpRHSV.getBitWidth(); uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits); if (ShAmtVal >= TypeBits || ShAmtVal == 0) - return 0; + return nullptr; if (!ICI.isEquality()) { // If we have an unsigned comparison and an ashr, we can't simplify this. // Similarly for signed comparisons with lshr. if (ICI.isSigned() != (Shr->getOpcode() == Instruction::AShr)) - return 0; + return nullptr; // Otherwise, all lshr and most exact ashr's are equivalent to a udiv/sdiv // by a power of 2. Since we already have logic to simplify these, // transform to div and then simplify the resultant comparison. if (Shr->getOpcode() == Instruction::AShr && (!Shr->isExact() || ShAmtVal == TypeBits - 1)) - return 0; + return nullptr; // Revisit the shift (to delete it). Worklist.Add(Shr); @@ -994,7 +996,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, // If the builder folded the binop, just return it. BinaryOperator *TheDiv = dyn_cast(Tmp); - if (TheDiv == 0) + if (!TheDiv) return &ICI; // Otherwise, fold this div/compare. @@ -1037,7 +1039,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, Mask, Shr->getName()+".mask"); return new ICmpInst(ICI.getPredicate(), And, ShiftedCmpRHS); } - return 0; + return nullptr; } @@ -1056,7 +1058,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, unsigned DstBits = LHSI->getType()->getPrimitiveSizeInBits(), SrcBits = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits(); APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0); - ComputeMaskedBits(LHSI->getOperand(0), KnownZero, KnownOne); + computeKnownBits(LHSI->getOperand(0), KnownZero, KnownOne); // If all the high bits are known, we can do this xform. if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) { @@ -1181,10 +1183,10 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // access. BinaryOperator *Shift = dyn_cast(LHSI->getOperand(0)); if (Shift && !Shift->isShift()) - Shift = 0; + Shift = nullptr; ConstantInt *ShAmt; - ShAmt = Shift ? dyn_cast(Shift->getOperand(1)) : 0; + ShAmt = Shift ? dyn_cast(Shift->getOperand(1)) : nullptr; // This seemingly simple opportunity to fold away a shift turns out to // be rather complicated. See PR17827 @@ -1777,7 +1779,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, } } } - return 0; + return nullptr; } /// visitICmpInstWithCastAndCast - Handle icmp (cast x to y), (cast/cst). @@ -1794,7 +1796,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { // integer type is the same size as the pointer type. if (DL && LHSCI->getOpcode() == Instruction::PtrToInt && DL->getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) { - Value *RHSOp = 0; + Value *RHSOp = nullptr; if (Constant *RHSC = dyn_cast(ICI.getOperand(1))) { RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy); } else if (PtrToIntInst *RHSC = dyn_cast(ICI.getOperand(1))) { @@ -1812,7 +1814,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { // Enforce this. 
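Stepping back to the FoldICmpDivCst ranges above: the udiv case rewrites an equality against a quotient as a half-open range test on the dividend. A short exhaustive check of the "X/5 op 3 --> [15, 20)" example:

    #include <cassert>

    int main() {
      for (unsigned x = 0; x < 64; ++x)
        assert((x / 5 == 3) == (x >= 15 && x < 20));
    }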
if (LHSCI->getOpcode() != Instruction::ZExt && LHSCI->getOpcode() != Instruction::SExt) - return 0; + return nullptr; bool isSignedExt = LHSCI->getOpcode() == Instruction::SExt; bool isSignedCmp = ICI.isSigned(); @@ -1821,12 +1823,12 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { // Not an extension from the same type? RHSCIOp = CI->getOperand(0); if (RHSCIOp->getType() != LHSCIOp->getType()) - return 0; + return nullptr; // If the signedness of the two casts doesn't agree (i.e. one is a sext // and the other is a zext), then we can't handle this. if (CI->getOpcode() != LHSCI->getOpcode()) - return 0; + return nullptr; // Deal with equality cases early. if (ICI.isEquality()) @@ -1844,7 +1846,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { // If we aren't dealing with a constant on the RHS, exit early ConstantInt *CI = dyn_cast(ICI.getOperand(1)); if (!CI) - return 0; + return nullptr; // Compute the constant that would happen if we truncated to SrcTy then // reextended to DestTy. @@ -1873,7 +1875,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { // by SimplifyICmpInst, so only deal with the tricky case. if (isSignedCmp || !isSignedExt) - return 0; + return nullptr; // Evaluate the comparison for LT (we invert for GT below). LE and GE cases // should have been folded away previously and not enter in here. @@ -1909,12 +1911,12 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // In order to eliminate the add-with-constant, the compare can be its only // use. Instruction *AddWithCst = cast(I.getOperand(0)); - if (!AddWithCst->hasOneUse()) return 0; + if (!AddWithCst->hasOneUse()) return nullptr; // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow. - if (!CI2->getValue().isPowerOf2()) return 0; + if (!CI2->getValue().isPowerOf2()) return nullptr; unsigned NewWidth = CI2->getValue().countTrailingZeros(); - if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31) return 0; + if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31) return nullptr; // The width of the new add formed is 1 more than the bias. ++NewWidth; @@ -1922,7 +1924,7 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // Check to see that CI1 is an all-ones value with NewWidth bits. if (CI1->getBitWidth() == NewWidth || CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth)) - return 0; + return nullptr; // This is only really a signed overflow check if the inputs have been // sign-extended; check for that condition. For example, if CI2 is 2^31 and @@ -1930,7 +1932,7 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1; if (IC.ComputeNumSignBits(A) < NeededSignBits || IC.ComputeNumSignBits(B) < NeededSignBits) - return 0; + return nullptr; // In order to replace the original add with a narrower // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant @@ -1946,8 +1948,8 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // original add had another add which was then immediately truncated, we // could still do the transformation. 
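The source-level idiom this function matches, spelled out for i8 (CI2 = 2^7, CI1 = 2^8-1): the sum is computed in a wide type, biased by 128, and range-checked unsigned. A spot-check of the equivalence, with the GCC/Clang builtin __builtin_add_overflow standing in for llvm.sadd.with.overflow:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int a = -128; a <= 127; ++a)
        for (int b = -128; b <= 127; ++b) {
          int wide = a + b; // sext to i32, then add
          // "(unsigned)(wide + 128) > 255" is the matched overflow test.
          bool idiom = (unsigned)(wide + 128) > 255u;
          int8_t narrow;
          assert(idiom == __builtin_add_overflow((int8_t)a, (int8_t)b, &narrow));
        }
    }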
       TruncInst *TI = dyn_cast<TruncInst>(U);
-      if (TI == 0 ||
-          TI->getType()->getPrimitiveSizeInBits() > NewWidth) return 0;
+      if (!TI || TI->getType()->getPrimitiveSizeInBits() > NewWidth)
+        return nullptr;
   }

   // If the pattern matches, truncate the inputs to the narrower type and
@@ -1983,11 +1985,11 @@ static Instruction *ProcessUAddIdiom(Instruction &I, Value *OrigAddV,
                                      InstCombiner &IC) {
   // Don't bother doing this transformation for pointers, don't do it for
   // vectors.
-  if (!isa<IntegerType>(OrigAddV->getType())) return 0;
+  if (!isa<IntegerType>(OrigAddV->getType())) return nullptr;

   // If the add is a constant expr, then we don't bother transforming it.
   Instruction *OrigAdd = dyn_cast<Instruction>(OrigAddV);
-  if (OrigAdd == 0) return 0;
+  if (!OrigAdd) return nullptr;

   Value *LHS = OrigAdd->getOperand(0), *RHS = OrigAdd->getOperand(1);

@@ -2008,6 +2010,236 @@ static Instruction *ProcessUAddIdiom(Instruction &I, Value *OrigAddV,
   return ExtractValueInst::Create(Call, 1, "uadd.overflow");
 }

+/// \brief Recognize and process idiom involving test for multiplication
+/// overflow.
+///
+/// The caller has matched a pattern of the form:
+///   I = cmp u (mul(zext A, zext B)), V
+/// The function checks if this is a test for overflow and if so replaces
+/// multiplication with call to 'mul.with.overflow' intrinsic.
+///
+/// \param I Compare instruction.
+/// \param MulVal Result of 'mul' instruction. It is one of the arguments of
+///        the compare instruction. Must be of integer type.
+/// \param OtherVal The other argument of compare instruction.
+/// \returns Instruction which must replace the compare instruction, NULL if no
+///          replacement required.
+static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal,
+                                         Value *OtherVal, InstCombiner &IC) {
+  assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal);
+  assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal);
+  assert(isa<IntegerType>(MulVal->getType()));
+  Instruction *MulInstr = cast<Instruction>(MulVal);
+  assert(MulInstr->getOpcode() == Instruction::Mul);
+
+  Instruction *LHS = cast<Instruction>(MulInstr->getOperand(0)),
+              *RHS = cast<Instruction>(MulInstr->getOperand(1));
+  assert(LHS->getOpcode() == Instruction::ZExt);
+  assert(RHS->getOpcode() == Instruction::ZExt);
+  Value *A = LHS->getOperand(0), *B = RHS->getOperand(0);
+
+  // Calculate type and width of the result produced by mul.with.overflow.
+  Type *TyA = A->getType(), *TyB = B->getType();
+  unsigned WidthA = TyA->getPrimitiveSizeInBits(),
+           WidthB = TyB->getPrimitiveSizeInBits();
+  unsigned MulWidth;
+  Type *MulType;
+  if (WidthB > WidthA) {
+    MulWidth = WidthB;
+    MulType = TyB;
+  } else {
+    MulWidth = WidthA;
+    MulType = TyA;
+  }
+
+  // In order to replace the original mul with a narrower mul.with.overflow,
+  // all uses must ignore upper bits of the product. The number of used low
+  // bits must not be greater than the width of mul.with.overflow.
+  if (MulVal->hasNUsesOrMore(2))
+    for (User *U : MulVal->users()) {
+      if (U == &I)
+        continue;
+      if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
+        // Check if truncation ignores bits above MulWidth.
+        unsigned TruncWidth = TI->getType()->getPrimitiveSizeInBits();
+        if (TruncWidth > MulWidth)
+          return nullptr;
+      } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
+        // Check if AND ignores bits above MulWidth.
+        if (BO->getOpcode() != Instruction::And)
+          return nullptr;
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+          const APInt &CVal = CI->getValue();
+          if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth)
+            return nullptr;
+        }
+      } else {
+        // Other uses prohibit this transformation.
+ return nullptr; + } + } + + // Recognize patterns + switch (I.getPredicate()) { + case ICmpInst::ICMP_EQ: + case ICmpInst::ICMP_NE: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp eq/neq mulval, zext trunc mulval + if (ZExtInst *Zext = dyn_cast(OtherVal)) + if (Zext->hasOneUse()) { + Value *ZextArg = Zext->getOperand(0); + if (TruncInst *Trunc = dyn_cast(ZextArg)) + if (Trunc->getType()->getPrimitiveSizeInBits() == MulWidth) + break; //Recognized + } + + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp eq/neq mulval, and(mulval, mask), mask selects low MulWidth bits. + ConstantInt *CI; + Value *ValToMask; + if (match(OtherVal, m_And(m_Value(ValToMask), m_ConstantInt(CI)))) { + if (ValToMask != MulVal) + return nullptr; + const APInt &CVal = CI->getValue() + 1; + if (CVal.isPowerOf2()) { + unsigned MaskWidth = CVal.logBase2(); + if (MaskWidth == MulWidth) + break; // Recognized + } + } + return nullptr; + + case ICmpInst::ICMP_UGT: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp ugt mulval, max + if (ConstantInt *CI = dyn_cast(OtherVal)) { + APInt MaxVal = APInt::getMaxValue(MulWidth); + MaxVal = MaxVal.zext(CI->getBitWidth()); + if (MaxVal.eq(CI->getValue())) + break; // Recognized + } + return nullptr; + + case ICmpInst::ICMP_UGE: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp uge mulval, max+1 + if (ConstantInt *CI = dyn_cast(OtherVal)) { + APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth); + if (MaxVal.eq(CI->getValue())) + break; // Recognized + } + return nullptr; + + case ICmpInst::ICMP_ULE: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp ule mulval, max + if (ConstantInt *CI = dyn_cast(OtherVal)) { + APInt MaxVal = APInt::getMaxValue(MulWidth); + MaxVal = MaxVal.zext(CI->getBitWidth()); + if (MaxVal.eq(CI->getValue())) + break; // Recognized + } + return nullptr; + + case ICmpInst::ICMP_ULT: + // Recognize pattern: + // mulval = mul(zext A, zext B) + // cmp ule mulval, max + 1 + if (ConstantInt *CI = dyn_cast(OtherVal)) { + APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth); + if (MaxVal.eq(CI->getValue())) + break; // Recognized + } + return nullptr; + + default: + return nullptr; + } + + InstCombiner::BuilderTy *Builder = IC.Builder; + Builder->SetInsertPoint(MulInstr); + Module *M = I.getParent()->getParent()->getParent(); + + // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B) + Value *MulA = A, *MulB = B; + if (WidthA < MulWidth) + MulA = Builder->CreateZExt(A, MulType); + if (WidthB < MulWidth) + MulB = Builder->CreateZExt(B, MulType); + Value *F = + Intrinsic::getDeclaration(M, Intrinsic::umul_with_overflow, MulType); + CallInst *Call = Builder->CreateCall2(F, MulA, MulB, "umul"); + IC.Worklist.Add(MulInstr); + + // If there are uses of mul result other than the comparison, we know that + // they are truncation or binary AND. Change them to use result of + // mul.with.overflow and adjust properly mask/size. 
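For the ICMP_UGT case recognized in the switch above, the whole pattern at the C++ level: multiply zero-extended operands in a double-width type and compare against the narrow maximum. Checked against the GCC/Clang builtin __builtin_mul_overflow, standing in for llvm.umul.with.overflow:

    #include <cassert>
    #include <cstdint>

    // mul(zext a, zext b) compared ugt against the 32-bit max.
    bool idiom(uint32_t a, uint32_t b) {
      return (uint64_t)a * (uint64_t)b > 0xFFFFFFFFull;
    }

    int main() {
      uint32_t vals[] = {0, 1, 0xFFFF, 0x10000, 0xFFFFFFFF};
      for (uint32_t a : vals)
        for (uint32_t b : vals) {
          uint32_t prod;
          assert(idiom(a, b) == __builtin_mul_overflow(a, b, &prod));
        }
    }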
+ if (MulVal->hasNUsesOrMore(2)) { + Value *Mul = Builder->CreateExtractValue(Call, 0, "umul.value"); + for (User *U : MulVal->users()) { + if (U == &I || U == OtherVal) + continue; + if (TruncInst *TI = dyn_cast(U)) { + if (TI->getType()->getPrimitiveSizeInBits() == MulWidth) + IC.ReplaceInstUsesWith(*TI, Mul); + else + TI->setOperand(0, Mul); + } else if (BinaryOperator *BO = dyn_cast(U)) { + assert(BO->getOpcode() == Instruction::And); + // Replace (mul & mask) --> zext (mul.with.overflow & short_mask) + ConstantInt *CI = cast(BO->getOperand(1)); + APInt ShortMask = CI->getValue().trunc(MulWidth); + Value *ShortAnd = Builder->CreateAnd(Mul, ShortMask); + Instruction *Zext = + cast(Builder->CreateZExt(ShortAnd, BO->getType())); + IC.Worklist.Add(Zext); + IC.ReplaceInstUsesWith(*BO, Zext); + } else { + llvm_unreachable("Unexpected Binary operation"); + } + IC.Worklist.Add(cast(U)); + } + } + if (isa(OtherVal)) + IC.Worklist.Add(cast(OtherVal)); + + // The original icmp gets replaced with the overflow value, maybe inverted + // depending on predicate. + bool Inverse = false; + switch (I.getPredicate()) { + case ICmpInst::ICMP_NE: + break; + case ICmpInst::ICMP_EQ: + Inverse = true; + break; + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_UGE: + if (I.getOperand(0) == MulVal) + break; + Inverse = true; + break; + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_ULE: + if (I.getOperand(1) == MulVal) + break; + Inverse = true; + break; + default: + llvm_unreachable("Unexpected predicate"); + } + if (Inverse) { + Value *Res = Builder->CreateExtractValue(Call, 1); + return BinaryOperator::CreateNot(Res); + } + + return ExtractValueInst::Create(Call, 1); +} + // DemandedBitsLHSMask - When performing a comparison against a constant, // it is possible that not all the bits in the LHS are demanded. This helper // method computes the mask that IS demanded. @@ -2178,7 +2410,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // See if we are doing a comparison with a constant. if (ConstantInt *CI = dyn_cast(Op1)) { - Value *A = 0, *B = 0; + Value *A = nullptr, *B = nullptr; // Match the following pattern, which is a common idiom when writing // overflow-safe integer arithmetic function. The source performs an @@ -2293,15 +2525,15 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { APInt Op0KnownZeroInverted = ~Op0KnownZero; if (~Op1KnownZero == 0 && Op0KnownZeroInverted.isPowerOf2()) { // If the LHS is an AND with the same constant, look through it. - Value *LHS = 0; - ConstantInt *LHSC = 0; + Value *LHS = nullptr; + ConstantInt *LHSC = nullptr; if (!match(Op0, m_And(m_Value(LHS), m_ConstantInt(LHSC))) || LHSC->getValue() != Op0KnownZeroInverted) LHS = Op0; // If the LHS is 1 << x, and we know the result is a power of 2 like 8, // then turn "((1 << x)&8) == 0" into "x != 3". - Value *X = 0; + Value *X = nullptr; if (match(LHS, m_Shl(m_One(), m_Value(X)))) { unsigned CmpVal = Op0KnownZeroInverted.countTrailingZeros(); return new ICmpInst(ICmpInst::ICMP_NE, X, @@ -2330,15 +2562,15 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { APInt Op0KnownZeroInverted = ~Op0KnownZero; if (~Op1KnownZero == 0 && Op0KnownZeroInverted.isPowerOf2()) { // If the LHS is an AND with the same constant, look through it. 
- Value *LHS = 0; - ConstantInt *LHSC = 0; + Value *LHS = nullptr; + ConstantInt *LHSC = nullptr; if (!match(Op0, m_And(m_Value(LHS), m_ConstantInt(LHSC))) || LHSC->getValue() != Op0KnownZeroInverted) LHS = Op0; // If the LHS is 1 << x, and we know the result is a power of 2 like 8, // then turn "((1 << x)&8) != 0" into "x == 3". - Value *X = 0; + Value *X = nullptr; if (match(LHS, m_Shl(m_One(), m_Value(X)))) { unsigned CmpVal = Op0KnownZeroInverted.countTrailingZeros(); return new ICmpInst(ICmpInst::ICMP_EQ, X, @@ -2470,7 +2702,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (SelectInst *SI = dyn_cast(*I.user_begin())) if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) || (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1)) - return 0; + return nullptr; // See if we are doing a comparison between a constant and an instruction that // can be folded into the comparison. @@ -2506,7 +2738,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // If either operand of the select is a constant, we can fold the // comparison into the select arms, which will cause one to be // constant folded and the select turned into a bitwise or. - Value *Op1 = 0, *Op2 = 0; + Value *Op1 = nullptr, *Op2 = nullptr; if (Constant *C = dyn_cast(LHSI->getOperand(1))) Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC); if (Constant *C = dyn_cast(LHSI->getOperand(2))) @@ -2618,7 +2850,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // Analyze the case when either Op0 or Op1 is an add instruction. // Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null). - Value *A = 0, *B = 0, *C = 0, *D = 0; + Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; if (BO0 && BO0->getOpcode() == Instruction::Add) A = BO0->getOperand(0), B = BO0->getOperand(1); if (BO1 && BO1->getOpcode() == Instruction::Add) @@ -2713,7 +2945,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // Analyze the case when either Op0 or Op1 is a sub instruction. // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null). - A = 0; B = 0; C = 0; D = 0; + A = nullptr; B = nullptr; C = nullptr; D = nullptr; if (BO0 && BO0->getOpcode() == Instruction::Sub) A = BO0->getOperand(0), B = BO0->getOperand(1); if (BO1 && BO1->getOpcode() == Instruction::Sub) @@ -2739,7 +2971,17 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { BO0->hasOneUse() && BO1->hasOneUse()) return new ICmpInst(Pred, D, B); - BinaryOperator *SRem = NULL; + // icmp (0-X) < cst --> x > -cst + if (NoOp0WrapProblem && ICmpInst::isSigned(Pred)) { + Value *X; + if (match(BO0, m_Neg(m_Value(X)))) + if (ConstantInt *RHSC = dyn_cast(Op1)) + if (!RHSC->isMinValue(/*isSigned=*/true)) + return new ICmpInst(I.getSwappedPredicate(), X, + ConstantExpr::getNeg(RHSC)); + } + + BinaryOperator *SRem = nullptr; // icmp (srem X, Y), Y if (BO0 && BO0->getOpcode() == Instruction::SRem && Op1 == BO0->getOperand(1)) @@ -2877,6 +3119,16 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { (Op0 == A || Op0 == B)) if (Instruction *R = ProcessUAddIdiom(I, Op1, *this)) return R; + + // (zext a) * (zext b) --> llvm.umul.with.overflow. 
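The "((1 << x) & 8) == 0 into x != 3" fold above, checked exhaustively for the shift amounts where the left side is well defined (the known-bits analysis supplies the power-of-two structure the fold depends on):

    #include <cassert>

    int main() {
      for (unsigned x = 0; x < 8; ++x)
        assert((((1u << x) & 8u) == 0) == (x != 3));
    }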
+ if (match(Op0, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) { + if (Instruction *R = ProcessUMulZExtIdiom(I, Op0, Op1, *this)) + return R; + } + if (match(Op1, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) { + if (Instruction *R = ProcessUMulZExtIdiom(I, Op1, Op0, *this)) + return R; + } } if (I.isEquality()) { @@ -2918,7 +3170,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // (X&Z) == (Y&Z) -> (X^Y) & Z == 0 if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) && match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) { - Value *X = 0, *Y = 0, *Z = 0; + Value *X = nullptr, *Y = nullptr, *Z = nullptr; if (A == C) { X = B; Y = D; Z = A; @@ -3009,7 +3261,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X) return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate()); } - return Changed ? &I : 0; + return Changed ? &I : nullptr; } /// FoldFCmp_IntToFP_Cst - Fold fcmp ([us]itofp x, cst) if possible. @@ -3017,13 +3269,13 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, Instruction *LHSI, Constant *RHSC) { - if (!isa(RHSC)) return 0; + if (!isa(RHSC)) return nullptr; const APFloat &RHS = cast(RHSC)->getValueAPF(); // Get the width of the mantissa. We don't want to hack on conversions that // might lose information from the integer, e.g. "i64 -> float" int MantissaWidth = LHSI->getType()->getFPMantissaWidth(); - if (MantissaWidth == -1) return 0; // Unknown. + if (MantissaWidth == -1) return nullptr; // Unknown. // Check to see that the input is converted from an integer type that is small // enough that preserves all bits. TODO: check here for "known" sign bits. @@ -3037,7 +3289,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // If the conversion would lose info, don't hack on this. if ((int)InputSize > MantissaWidth) - return 0; + return nullptr; // Otherwise, we can potentially simplify the comparison. We know that it // will always come through as an integer value and we know the constant is @@ -3383,5 +3635,5 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) { return new FCmpInst(I.getPredicate(), LHSExt->getOperand(0), RHSExt->getOperand(0)); - return Changed ? &I : 0; + return Changed ? 
&I : nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index dcc8b0f..66d0938 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -20,6 +20,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "instcombine" + STATISTIC(NumDeadStore, "Number of dead stores eliminated"); STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global"); @@ -29,10 +31,13 @@ STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global"); static bool pointsToConstantGlobal(Value *V) { if (GlobalVariable *GV = dyn_cast(V)) return GV->isConstant(); - if (ConstantExpr *CE = dyn_cast(V)) + + if (ConstantExpr *CE = dyn_cast(V)) { if (CE->getOpcode() == Instruction::BitCast || + CE->getOpcode() == Instruction::AddrSpaceCast || CE->getOpcode() == Instruction::GetElementPtr) return pointsToConstantGlobal(CE->getOperand(0)); + } return false; } @@ -60,9 +65,9 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, continue; } - if (BitCastInst *BCI = dyn_cast(I)) { + if (isa(I) || isa(I)) { // If uses of the bitcast are ok, we are ok. - if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, ToDelete, IsOffset)) + if (!isOnlyCopiedFromConstantGlobal(I, TheCopy, ToDelete, IsOffset)) return false; continue; } @@ -112,7 +117,7 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, // If this is isn't our memcpy/memmove, reject it as something we can't // handle. MemTransferInst *MI = dyn_cast(I); - if (MI == 0) + if (!MI) return false; // If the transfer is using the alloca as a source of the transfer, then @@ -148,10 +153,10 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, static MemTransferInst * isOnlyCopiedFromConstantGlobal(AllocaInst *AI, SmallVectorImpl &ToDelete) { - MemTransferInst *TheCopy = 0; + MemTransferInst *TheCopy = nullptr; if (isOnlyCopiedFromConstantGlobal(AI, TheCopy, ToDelete)) return TheCopy; - return 0; + return nullptr; } Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { @@ -172,7 +177,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { if (const ConstantInt *C = dyn_cast(AI.getArraySize())) { Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); - AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName()); + AllocaInst *New = Builder->CreateAlloca(NewTy, nullptr, AI.getName()); New->setAlignment(AI.getAlignment()); // Scan to the end of the allocation instructions, to skip over a block of @@ -295,7 +300,7 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, // If the address spaces don't match, don't eliminate the cast. if (DestTy->getAddressSpace() != SrcTy->getAddressSpace()) - return 0; + return nullptr; Type *SrcPTy = SrcTy->getElementType(); @@ -346,7 +351,7 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, } } } - return 0; + return nullptr; } Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { @@ -373,7 +378,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // None of the following transforms are legal for volatile/atomic loads. // FIXME: Some of it is okay for atomic loads; needs refactoring. 
- if (!LI.isSimple()) return 0; + if (!LI.isSimple()) return nullptr; // Do really simple store-to-load forwarding and load CSE, to catch cases // where there are several consecutive memory accesses to the same location, @@ -455,7 +460,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { } } } - return 0; + return nullptr; } /// InstCombineStoreToCast - Fold store V, (cast P) -> store (cast V), P @@ -467,12 +472,12 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { Type *DestPTy = cast(CI->getType())->getElementType(); PointerType *SrcTy = dyn_cast(CastOp->getType()); - if (SrcTy == 0) return 0; + if (!SrcTy) return nullptr; Type *SrcPTy = SrcTy->getElementType(); if (!DestPTy->isIntegerTy() && !DestPTy->isPointerTy()) - return 0; + return nullptr; /// NewGEPIndices - If SrcPTy is an aggregate type, we can emit a "noop gep" /// to its first element. This allows us to handle things like: @@ -506,20 +511,20 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { } if (!SrcPTy->isIntegerTy() && !SrcPTy->isPointerTy()) - return 0; + return nullptr; // If the pointers point into different address spaces don't do the // transformation. if (SrcTy->getAddressSpace() != cast(CI->getType())->getAddressSpace()) - return 0; + return nullptr; // If the pointers point to values of different sizes don't do the // transformation. if (!IC.getDataLayout() || IC.getDataLayout()->getTypeSizeInBits(SrcPTy) != IC.getDataLayout()->getTypeSizeInBits(DestPTy)) - return 0; + return nullptr; // If the pointers point to pointers to different address spaces don't do the // transformation. It is not safe to introduce an addrspacecast instruction in @@ -527,7 +532,7 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { // cast. if (SrcPTy->isPointerTy() && DestPTy->isPointerTy() && SrcPTy->getPointerAddressSpace() != DestPTy->getPointerAddressSpace()) - return 0; + return nullptr; // Okay, we are casting from one integer or pointer type to another of // the same size. Instead of casting the pointer before @@ -607,7 +612,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { // Don't hack volatile/atomic stores. // FIXME: Some bits are legal for atomic stores; needs refactoring. - if (!SI.isSimple()) return 0; + if (!SI.isSimple()) return nullptr; // If the RHS is an alloca with a single use, zapify the store, making the // alloca dead. @@ -674,7 +679,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (Instruction *U = dyn_cast(Val)) Worklist.Add(U); // Dropped a use. } - return 0; // Do not modify these! + return nullptr; // Do not modify these! } // store undef, Ptr -> noop @@ -703,9 +708,9 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (BranchInst *BI = dyn_cast(BBI)) if (BI->isUnconditional()) if (SimplifyStoreAtEndOfBlock(SI)) - return 0; // xform done! + return nullptr; // xform done! - return 0; + return nullptr; } /// SimplifyStoreAtEndOfBlock - Turn things like: @@ -728,7 +733,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { // the other predecessor. pred_iterator PI = pred_begin(DestBB); BasicBlock *P = *PI; - BasicBlock *OtherBB = 0; + BasicBlock *OtherBB = nullptr; if (P != StoreBB) OtherBB = P; @@ -758,7 +763,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { // If the other block ends in an unconditional branch, check for the 'if then // else' case. there is an instruction before the branch. 
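The shape SimplifyStoreAtEndOfBlock handles, at the C level (illustrative functions, not the patch's code): a store in each arm of an if/then/else becomes one store of the merged value in the join block.

    #include <cassert>

    void before(bool c, int *p, int x, int y) {
      if (c) *p = x; else *p = y; // one store per predecessor block
    }
    void after(bool c, int *p, int x, int y) {
      *p = c ? x : y;             // single store fed by a phi of x and y
    }

    int main() {
      int a = 0, b = 0;
      before(true, &a, 1, 2);  after(true, &b, 1, 2);  assert(a == b);
      before(false, &a, 1, 2); after(false, &b, 1, 2); assert(a == b);
    }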
- StoreInst *OtherStore = 0; + StoreInst *OtherStore = nullptr; if (OtherBr->isUnconditional()) { --BBI; // Skip over debugging info. diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 71fbb6c..9996ebc 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -19,6 +19,8 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + /// simplifyValueKnownNonZero - The specific integer value is used in a context /// where it is known to be non-zero. If this allows us to simplify the @@ -27,13 +29,13 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) { // If V has multiple uses, then we would have to do more analysis to determine // if this is safe. For example, the use could be in dynamically unreached // code. - if (!V->hasOneUse()) return 0; + if (!V->hasOneUse()) return nullptr; bool MadeChange = false; // ((1 << A) >>u B) --> (1 << (A-B)) // Because V cannot be zero, we know that B is less than A. - Value *A = 0, *B = 0, *PowerOf2 = 0; + Value *A = nullptr, *B = nullptr, *PowerOf2 = nullptr; if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(PowerOf2), m_Value(A))), m_Value(B))) && // The "1" can be any value known to be a power of 2. @@ -68,7 +70,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) { // If V is a phi node, we can call this on each of its operands. // "select cond, X, 0" can simplify to "X". - return MadeChange ? V : 0; + return MadeChange ? V : nullptr; } @@ -107,7 +109,7 @@ static Constant *getLogBase2Vector(ConstantDataVector *CV) { for (unsigned I = 0, E = CV->getNumElements(); I != E; ++I) { Constant *Elt = CV->getElementAsConstant(I); if (!match(Elt, m_APInt(IVal)) || !IVal->isPowerOf2()) - return 0; + return nullptr; Elts.push_back(ConstantInt::get(Elt->getType(), IVal->logBase2())); } @@ -118,6 +120,9 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyMulInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); @@ -139,7 +144,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { return BinaryOperator::CreateMul(NewOp, ConstantExpr::getShl(C1, C2)); if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) { - Constant *NewCst = 0; + Constant *NewCst = nullptr; if (match(C1, m_APInt(IVal)) && IVal->isPowerOf2()) // Replace X*(2^C) with X << C, where C is either a scalar or a splat. NewCst = ConstantInt::get(NewOp->getType(), IVal->logBase2()); @@ -165,10 +170,10 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { const APInt & Val = CI->getValue(); const APInt &PosVal = Val.abs(); if (Val.isNegative() && PosVal.isPowerOf2()) { - Value *X = 0, *Y = 0; + Value *X = nullptr, *Y = nullptr; if (Op0->hasOneUse()) { ConstantInt *C1; - Value *Sub = 0; + Value *Sub = nullptr; if (match(Op0, m_Sub(m_Value(Y), m_Value(X)))) Sub = Builder->CreateSub(X, Y, "suba"); else if (match(Op0, m_Add(m_Value(Y), m_ConstantInt(C1)))) @@ -268,7 +273,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { // -2 is "-1 << 1" so it is all bits set except the low one. 
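The "((1 << A) >>u B) --> (1 << (A-B))" rewrite in simplifyValueKnownNonZero above relies on non-zeroness of the value forcing B <= A; within that range the identity holds unconditionally, as a small exhaustive check shows:

    #include <cassert>

    int main() {
      for (unsigned a = 0; a < 8; ++a)
        for (unsigned b = 0; b <= a; ++b) // b <= a: the non-zero cases
          assert(((1u << a) >> b) == (1u << (a - b)));
    }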
APInt Negative2(I.getType()->getPrimitiveSizeInBits(), (uint64_t)-2, true); - Value *BoolCast = 0, *OtherOp = 0; + Value *BoolCast = nullptr, *OtherOp = nullptr; if (MaskedValueIsZero(Op0, Negative2)) BoolCast = Op0, OtherOp = Op1; else if (MaskedValueIsZero(Op1, Negative2)) @@ -281,7 +286,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { } } - return Changed ? &I : 0; + return Changed ? &I : nullptr; } // @@ -384,7 +389,7 @@ Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, Constant *C, Constant *C0 = dyn_cast(Opnd0); Constant *C1 = dyn_cast(Opnd1); - BinaryOperator *R = 0; + BinaryOperator *R = nullptr; // (X * C0) * C => X * (C0*C) if (FMulOrDiv->getOpcode() == Instruction::FMul) { @@ -426,6 +431,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (isa(Op0)) std::swap(Op0, Op1); @@ -483,7 +491,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { Value *M1 = ConstantExpr::getFMul(C1, C); Value *M0 = isNormalFp(cast(M1)) ? foldFMulConst(cast(Opnd0), C, &I) : - 0; + nullptr; if (M0 && M1) { if (Swap && FAddSub->getOpcode() == Instruction::FSub) std::swap(M0, M1); @@ -503,8 +511,8 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { // Under unsafe algebra do: // X * log2(0.5*Y) = X*log2(Y) - X if (I.hasUnsafeAlgebra()) { - Value *OpX = NULL; - Value *OpY = NULL; + Value *OpX = nullptr; + Value *OpY = nullptr; IntrinsicInst *Log2; detectLog2OfHalf(Op0, OpY, Log2); if (OpY) { @@ -567,7 +575,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { Value *Opnd0_0, *Opnd0_1; if (Opnd0->hasOneUse() && match(Opnd0, m_FMul(m_Value(Opnd0_0), m_Value(Opnd0_1)))) { - Value *Y = 0; + Value *Y = nullptr; if (Opnd0_0 == Opnd1 && Opnd0_1 != Opnd1) Y = Opnd0_1; else if (Opnd0_1 == Opnd1 && Opnd0_0 != Opnd1) @@ -621,7 +629,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { break; } - return Changed ? &I : 0; + return Changed ? &I : nullptr; } /// SimplifyDivRemOfSelect - Try to fold a divide or remainder of a select @@ -682,12 +690,12 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { // If we past the instruction, quit looking for it. if (&*BBI == SI) - SI = 0; + SI = nullptr; if (&*BBI == SelectCond) - SelectCond = 0; + SelectCond = nullptr; // If we ran out of things to eliminate, break out of the loop. - if (SelectCond == 0 && SI == 0) + if (!SelectCond && !SI) break; } @@ -719,7 +727,7 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { if (Instruction::BinaryOps(LHS->getOpcode()) == I.getOpcode()) if (ConstantInt *LHSRHS = dyn_cast(LHS->getOperand(1))) { if (MultiplyOverflows(RHS, LHSRHS, - I.getOpcode()==Instruction::SDiv)) + I.getOpcode() == Instruction::SDiv)) return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); return BinaryOperator::Create(I.getOpcode(), LHS->getOperand(0), ConstantExpr::getMul(RHS, LHSRHS)); @@ -735,12 +743,31 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { } } + if (ConstantInt *One = dyn_cast(Op0)) { + if (One->isOne() && !I.getType()->isIntegerTy(1)) { + bool isSigned = I.getOpcode() == Instruction::SDiv; + if (isSigned) { + // If Op1 is 0 then it's undefined behaviour, if Op1 is 1 then the + // result is one, if Op1 is -1 then the result is minus one, otherwise + // it's zero. 
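The branchless select the new sdiv code below emits: for x in {1, -1} the unsigned compare (x+1) <u 3 is true and the select yields x, otherwise it yields 0, matching 1/x for every non-zero x. A spot-check (avoiding x = INT_MAX, where x+1 would overflow in C++; the IR add wraps harmlessly):

    #include <cassert>

    int folded(int x) { return (unsigned)(x + 1) < 3u ? x : 0; }

    int main() {
      int vals[] = {1, -1, 2, -2, 7, -42};
      for (int x : vals)
        assert(1 / x == folded(x));
    }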
+ Value *Inc = Builder->CreateAdd(Op1, One); + Value *Cmp = Builder->CreateICmpULT( + Inc, ConstantInt::get(I.getType(), 3)); + return SelectInst::Create(Cmp, Op1, ConstantInt::get(I.getType(), 0)); + } else { + // If Op1 is 0 then it's undefined behaviour. If Op1 is 1 then the + // result is one, otherwise it's zero. + return new ZExtInst(Builder->CreateICmpEQ(Op1, One), I.getType()); + } + } + } + // See if we can fold away this div instruction. if (SimplifyDemandedInstructionBits(I)) return &I; // (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y - Value *X = 0, *Z = 0; + Value *X = nullptr, *Z = nullptr; if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) { // (X - Z) / Y; Y = Op1 bool isSigned = I.getOpcode() == Instruction::SDiv; if ((isSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) || @@ -748,7 +775,7 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { return BinaryOperator::Create(I.getOpcode(), X, Op1); } - return 0; + return nullptr; } /// dyn_castZExtVal - Checks if V is a zext or constant that can @@ -761,7 +788,7 @@ static Value *dyn_castZExtVal(Value *V, Type *Ty) { if (C->getValue().getActiveBits() <= cast(Ty)->getBitWidth()) return ConstantExpr::getTrunc(C, Ty); } - return 0; + return nullptr; } namespace { @@ -786,7 +813,7 @@ struct UDivFoldAction { }; UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand) - : FoldAction(FA), OperandToFold(InputOperand), FoldResult(0) {} + : FoldAction(FA), OperandToFold(InputOperand), FoldResult(nullptr) {} UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand, size_t SLHS) : FoldAction(FA), OperandToFold(InputOperand), SelectLHSIdx(SLHS) {} }; @@ -865,7 +892,8 @@ static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I, if (SelectInst *SI = dyn_cast(Op1)) if (size_t LHSIdx = visitUDivOperand(Op0, SI->getOperand(1), I, Actions)) if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions)) { - Actions.push_back(UDivFoldAction((FoldUDivOperandCb)0, Op1, LHSIdx-1)); + Actions.push_back(UDivFoldAction((FoldUDivOperandCb)nullptr, Op1, + LHSIdx-1)); return Actions.size(); } @@ -875,6 +903,9 @@ static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I, Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyUDivInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); @@ -928,12 +959,15 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { return Inst; } - return 0; + return nullptr; } Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifySDivInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); @@ -983,7 +1017,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { } } - return 0; + return nullptr; } /// CvtFDivConstToReciprocal tries to convert X/C into X*1/C if C not a special @@ -997,7 +1031,7 @@ static Instruction *CvtFDivConstToReciprocal(Value *Dividend, Constant *Divisor, bool AllowReciprocal) { if (!isa(Divisor)) // TODO: handle vectors. 
- return 0; + return nullptr; const APFloat &FpVal = cast(Divisor)->getValueAPF(); APFloat Reciprocal(FpVal.getSemantics()); @@ -1010,7 +1044,7 @@ static Instruction *CvtFDivConstToReciprocal(Value *Dividend, } if (!Cvt) - return 0; + return nullptr; ConstantFP *R; R = ConstantFP::get(Dividend->getType()->getContext(), Reciprocal); @@ -1020,6 +1054,9 @@ static Instruction *CvtFDivConstToReciprocal(Value *Dividend, Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyFDivInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); @@ -1037,10 +1074,10 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { return R; if (AllowReassociate) { - Constant *C1 = 0; + Constant *C1 = nullptr; Constant *C2 = Op1C; Value *X; - Instruction *Res = 0; + Instruction *Res = nullptr; if (match(Op0, m_FMul(m_Value(X), m_Constant(C1)))) { // (X*C1)/C2 => X * (C1/C2) @@ -1071,12 +1108,12 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { return T; } - return 0; + return nullptr; } if (AllowReassociate && isa(Op0)) { Constant *C1 = cast(Op0), *C2; - Constant *Fold = 0; + Constant *Fold = nullptr; Value *X; bool CreateDiv = true; @@ -1098,13 +1135,13 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { R->setFastMathFlags(I.getFastMathFlags()); return R; } - return 0; + return nullptr; } if (AllowReassociate) { Value *X, *Y; - Value *NewInst = 0; - Instruction *SimpR = 0; + Value *NewInst = nullptr; + Instruction *SimpR = nullptr; if (Op0->hasOneUse() && match(Op0, m_FDiv(m_Value(X), m_Value(Y)))) { // (X/Y) / Z => X / (Y*Z) @@ -1140,7 +1177,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { } } - return 0; + return nullptr; } /// This function implements the transforms common to both integer remainder @@ -1176,12 +1213,15 @@ Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) { } } - return 0; + return nullptr; } Instruction *InstCombiner::visitURem(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyURemInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); @@ -1208,12 +1248,15 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { return ReplaceInstUsesWith(I, Ext); } - return 0; + return nullptr; } Instruction *InstCombiner::visitSRem(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifySRemInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); @@ -1250,7 +1293,7 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) { bool hasMissing = false; for (unsigned i = 0; i != VWidth; ++i) { Constant *Elt = C->getAggregateElement(i); - if (Elt == 0) { + if (!Elt) { hasMissing = true; break; } @@ -1279,12 +1322,15 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) { } } - return 0; + return nullptr; } Instruction *InstCombiner::visitFRem(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyFRemInst(Op0, Op1, DL)) return ReplaceInstUsesWith(I, V); @@ -1292,5 +1338,5 @@ Instruction *InstCombiner::visitFRem(BinaryOperator &I) { if (isa(Op1) && SimplifyDivRemOfSelect(I)) return &I; - return 0; + return nullptr; } diff --git 
a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp index 0ab657a..46f7b8a 100644 --- a/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -18,6 +18,8 @@ #include "llvm/IR/DataLayout.h" using namespace llvm; +#define DEBUG_TYPE "instcombine" + /// FoldPHIArgBinOpIntoPHI - If we have something like phi [add (a,b), add(a,c)] /// and if a/b/c and the add's all have a single use, turn this into a phi /// and a single binop. @@ -48,12 +50,12 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { // types. I->getOperand(0)->getType() != LHSType || I->getOperand(1)->getType() != RHSType) - return 0; + return nullptr; // If they are CmpInst instructions, check their predicates if (CmpInst *CI = dyn_cast(I)) if (CI->getPredicate() != cast(FirstInst)->getPredicate()) - return 0; + return nullptr; if (isNUW) isNUW = cast(I)->hasNoUnsignedWrap(); @@ -63,8 +65,8 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { isExact = cast(I)->isExact(); // Keep track of which operand needs a phi node. - if (I->getOperand(0) != LHSVal) LHSVal = 0; - if (I->getOperand(1) != RHSVal) RHSVal = 0; + if (I->getOperand(0) != LHSVal) LHSVal = nullptr; + if (I->getOperand(1) != RHSVal) RHSVal = nullptr; } // If both LHS and RHS would need a PHI, don't do this transformation, @@ -72,14 +74,14 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { // which leads to higher register pressure. This is especially // bad when the PHIs are in the header of a loop. if (!LHSVal && !RHSVal) - return 0; + return nullptr; // Otherwise, this is safe to transform! Value *InLHS = FirstInst->getOperand(0); Value *InRHS = FirstInst->getOperand(1); - PHINode *NewLHS = 0, *NewRHS = 0; - if (LHSVal == 0) { + PHINode *NewLHS = nullptr, *NewRHS = nullptr; + if (!LHSVal) { NewLHS = PHINode::Create(LHSType, PN.getNumIncomingValues(), FirstInst->getOperand(0)->getName() + ".pn"); NewLHS->addIncoming(InLHS, PN.getIncomingBlock(0)); @@ -87,7 +89,7 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { LHSVal = NewLHS; } - if (RHSVal == 0) { + if (!RHSVal) { NewRHS = PHINode::Create(RHSType, PN.getNumIncomingValues(), FirstInst->getOperand(1)->getName() + ".pn"); NewRHS->addIncoming(InRHS, PN.getIncomingBlock(0)); @@ -148,7 +150,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { GetElementPtrInst *GEP= dyn_cast(PN.getIncomingValue(i)); if (!GEP || !GEP->hasOneUse() || GEP->getType() != FirstInst->getType() || GEP->getNumOperands() != FirstInst->getNumOperands()) - return 0; + return nullptr; AllInBounds &= GEP->isInBounds(); @@ -170,19 +172,19 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { // for struct indices, which must always be constant. if (isa(FirstInst->getOperand(op)) || isa(GEP->getOperand(op))) - return 0; + return nullptr; if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType()) - return 0; + return nullptr; // If we already needed a PHI for an earlier operand, and another operand // also requires a PHI, we'd be introducing more PHIs than we're // eliminating, which increases register pressure on entry to the PHI's // block. if (NeededPhi) - return 0; + return nullptr; - FixedOperands[op] = 0; // Needs a PHI. + FixedOperands[op] = nullptr; // Needs a PHI. 
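FoldPHIArgBinOpIntoPHI, described in the comment above, pulls a binop with a shared operand through a phi: phi [add(a,b), add(a,c)] becomes add(a, phi[b,c]). A scalar C++ analog of the rewrite; the function names here are invented for the sketch:

#include <cassert>

int original(bool cond, int a, int b, int c) { return cond ? (a + b) : (a + c); }
int folded(bool cond, int a, int b, int c)   { return a + (cond ? b : c); }

int main() {
  // One add instead of two, plus a narrower "phi" (the conditional).
  for (int cond = 0; cond <= 1; ++cond)
    assert(original(cond != 0, 7, 3, 9) == folded(cond != 0, 7, 3, 9));
}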
NeededPhi = true; } } @@ -194,7 +196,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { // load up into the predecessors so that we have a load of a gep of an alloca, // which can usually all be folded into the load. if (AllBasePointersAreAllocas) - return 0; + return nullptr; // Otherwise, this is safe to transform. Insert PHI nodes for each operand // that is variable. @@ -288,7 +290,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { // FIXME: This is overconservative; this transform is allowed in some cases // for atomic operations. if (FirstLI->isAtomic()) - return 0; + return nullptr; // When processing loads, we need to propagate two bits of information to the // sunk load: whether it is volatile, and what its alignment is. We currently @@ -303,20 +305,20 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { // load and the PHI. if (FirstLI->getParent() != PN.getIncomingBlock(0) || !isSafeAndProfitableToSinkLoad(FirstLI)) - return 0; + return nullptr; // If the PHI is of volatile loads and the load block has multiple // successors, sinking it would remove a load of the volatile value from // the path through the other successor. if (isVolatile && FirstLI->getParent()->getTerminator()->getNumSuccessors() != 1) - return 0; + return nullptr; // Check to see if all arguments are the same operation. for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { LoadInst *LI = dyn_cast(PN.getIncomingValue(i)); if (!LI || !LI->hasOneUse()) - return 0; + return nullptr; // We can't sink the load if the loaded value could be modified between // the load and the PHI. @@ -324,12 +326,12 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { LI->getParent() != PN.getIncomingBlock(i) || LI->getPointerAddressSpace() != LoadAddrSpace || !isSafeAndProfitableToSinkLoad(LI)) - return 0; + return nullptr; // If some of the loads have an alignment specified but not all of them, // we can't do the transformation. if ((LoadAlignment != 0) != (LI->getAlignment() != 0)) - return 0; + return nullptr; LoadAlignment = std::min(LoadAlignment, LI->getAlignment()); @@ -338,7 +340,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { // the path through the other successor. if (isVolatile && LI->getParent()->getTerminator()->getNumSuccessors() != 1) - return 0; + return nullptr; } // Okay, they are all the same operation. Create a new PHI node of the @@ -354,7 +356,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { Value *NewInVal = cast(PN.getIncomingValue(i))->getOperand(0); if (NewInVal != InVal) - InVal = 0; + InVal = nullptr; NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i)); } @@ -398,8 +400,8 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { // If all input operands to the phi are the same instruction (e.g. a cast from // the same type or "+42") we can pull the operation through the PHI, reducing // code size and simplifying code. - Constant *ConstantOp = 0; - Type *CastSrcTy = 0; + Constant *ConstantOp = nullptr; + Type *CastSrcTy = nullptr; bool isNUW = false, isNSW = false, isExact = false; if (isa(FirstInst)) { @@ -409,13 +411,13 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { // the code by turning an i32 into an i1293. 
if (PN.getType()->isIntegerTy() && CastSrcTy->isIntegerTy()) { if (!ShouldChangeType(PN.getType(), CastSrcTy)) - return 0; + return nullptr; } } else if (isa(FirstInst) || isa(FirstInst)) { // Can fold binop, compare or shift here if the RHS is a constant, // otherwise call FoldPHIArgBinOpIntoPHI. ConstantOp = dyn_cast(FirstInst->getOperand(1)); - if (ConstantOp == 0) + if (!ConstantOp) return FoldPHIArgBinOpIntoPHI(PN); if (OverflowingBinaryOperator *BO = @@ -426,19 +428,19 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { dyn_cast(FirstInst)) isExact = PEO->isExact(); } else { - return 0; // Cannot fold this operation. + return nullptr; // Cannot fold this operation. } // Check to see if all arguments are the same operation. for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { Instruction *I = dyn_cast(PN.getIncomingValue(i)); - if (I == 0 || !I->hasOneUse() || !I->isSameOperationAs(FirstInst)) - return 0; + if (!I || !I->hasOneUse() || !I->isSameOperationAs(FirstInst)) + return nullptr; if (CastSrcTy) { if (I->getOperand(0)->getType() != CastSrcTy) - return 0; // Cast operation must match. + return nullptr; // Cast operation must match. } else if (I->getOperand(1) != ConstantOp) { - return 0; + return nullptr; } if (isNUW) @@ -462,7 +464,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { Value *NewInVal = cast(PN.getIncomingValue(i))->getOperand(0); if (NewInVal != InVal) - InVal = 0; + InVal = nullptr; NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i)); } @@ -587,10 +589,10 @@ namespace llvm { template<> struct DenseMapInfo { static inline LoweredPHIRecord getEmptyKey() { - return LoweredPHIRecord(0, 0); + return LoweredPHIRecord(nullptr, 0); } static inline LoweredPHIRecord getTombstoneKey() { - return LoweredPHIRecord(0, 1); + return LoweredPHIRecord(nullptr, 1); } static unsigned getHashValue(const LoweredPHIRecord &Val) { return DenseMapInfo::getHashValue(Val.PN) ^ (Val.Shift>>3) ^ @@ -637,14 +639,14 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { // bail out. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { InvokeInst *II = dyn_cast(PN->getIncomingValue(i)); - if (II == 0) continue; + if (!II) continue; if (II->getParent() != PN->getIncomingBlock(i)) continue; // If we have a phi, and if it's directly in the predecessor, then we have // a critical edge where we need to put the truncate. Since we can't // split the edge in instcombine, we have to bail out. - return 0; + return nullptr; } for (User *U : PN->users()) { @@ -667,7 +669,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { if (UserI->getOpcode() != Instruction::LShr || !UserI->hasOneUse() || !isa(UserI->user_back()) || !isa(UserI->getOperand(1))) - return 0; + return nullptr; unsigned Shift = cast(UserI->getOperand(1))->getZExtValue(); PHIUsers.push_back(PHIUsageRecord(PHIId, Shift, UserI->user_back())); @@ -705,7 +707,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { // If we've already lowered a user like this, reuse the previously lowered // value. - if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == 0) { + if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == nullptr) { // Otherwise, Create the new PHI node for this user. 
EltPHI = PHINode::Create(Ty, PN->getNumIncomingValues(), @@ -894,5 +896,5 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { if (Instruction *Res = SliceUpIllegalIntegerPHI(PN)) return Res; - return 0; + return nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index e74d912..9a41e4b 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -18,16 +18,18 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + /// MatchSelectPattern - Pattern match integer [SU]MIN, [SU]MAX, and ABS idioms, /// returning the kind and providing the out parameter results if we /// successfully match. static SelectPatternFlavor MatchSelectPattern(Value *V, Value *&LHS, Value *&RHS) { SelectInst *SI = dyn_cast(V); - if (SI == 0) return SPF_UNKNOWN; + if (!SI) return SPF_UNKNOWN; ICmpInst *ICI = dyn_cast(SI->getCondition()); - if (ICI == 0) return SPF_UNKNOWN; + if (!ICI) return SPF_UNKNOWN; LHS = ICI->getOperand(0); RHS = ICI->getOperand(1); @@ -129,15 +131,15 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, if (TI->isCast()) { Type *FIOpndTy = FI->getOperand(0)->getType(); if (TI->getOperand(0)->getType() != FIOpndTy) - return 0; + return nullptr; // The select condition may be a vector. We may only change the operand // type if the vector width remains the same (and matches the condition). Type *CondTy = SI.getCondition()->getType(); if (CondTy->isVectorTy() && (!FIOpndTy->isVectorTy() || CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements())) - return 0; + return nullptr; } else { - return 0; // unknown unary op. + return nullptr; // unknown unary op. } // Fold this by inserting a select from the input values. @@ -149,7 +151,7 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, // Only handle binary operators here. if (!isa(TI)) - return 0; + return nullptr; // Figure out if the operations have any operands in common. Value *MatchOp, *OtherOpT, *OtherOpF; @@ -165,7 +167,7 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, OtherOpF = FI->getOperand(0); MatchIsOpZero = false; } else if (!TI->isCommutative()) { - return 0; + return nullptr; } else if (TI->getOperand(0) == FI->getOperand(1)) { MatchOp = TI->getOperand(0); OtherOpT = TI->getOperand(1); @@ -177,7 +179,7 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, OtherOpF = FI->getOperand(1); MatchIsOpZero = true; } else { - return 0; + return nullptr; } // If we reach here, they do have operations in common. @@ -282,7 +284,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, } } - return 0; + return nullptr; } /// SimplifyWithOpReplaced - See if V simplifies when its operand Op is @@ -296,7 +298,7 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, Instruction *I = dyn_cast(V); if (!I) - return 0; + return nullptr; // If this is a binary operator, try to simplify it with the replaced op. 
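MatchSelectPattern, defined above, classifies compare-plus-select idioms as min, max, or abs. The shapes it recognizes are ordinary source-level expressions; a standalone check with std::min, std::max, and std::abs standing in for the SPF_* flavors:

#include <algorithm>
#include <cstdlib>
#include <cassert>

int main() {
  for (int a = -5; a <= 5; ++a)
    for (int b = -5; b <= 5; ++b) {
      assert((a < b ? a : b) == std::min(a, b));  // SPF_SMIN shape
      assert((a > b ? a : b) == std::max(a, b));  // SPF_SMAX shape
    }
  for (int a = -5; a <= 5; ++a)
    assert((a < 0 ? -a : a) == std::abs(a));      // SPF_ABS shape
}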
if (BinaryOperator *B = dyn_cast(I)) { @@ -347,7 +349,7 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, } } - return 0; + return nullptr; } /// foldSelectICmpAndOr - We want to turn: @@ -368,18 +370,18 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, InstCombiner::BuilderTy *Builder) { const ICmpInst *IC = dyn_cast(SI.getCondition()); if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy()) - return 0; + return nullptr; Value *CmpLHS = IC->getOperand(0); Value *CmpRHS = IC->getOperand(1); if (!match(CmpRHS, m_Zero())) - return 0; + return nullptr; Value *X; const APInt *C1; if (!match(CmpLHS, m_And(m_Value(X), m_Power2(C1)))) - return 0; + return nullptr; const APInt *C2; bool OrOnTrueVal = false; @@ -388,7 +390,7 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, OrOnTrueVal = match(TrueVal, m_Or(m_Specific(FalseVal), m_Power2(C2))); if (!OrOnFalseVal && !OrOnTrueVal) - return 0; + return nullptr; Value *V = CmpLHS; Value *Y = OrOnFalseVal ? TrueVal : FalseVal; @@ -527,7 +529,7 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, if (IntegerType *Ty = dyn_cast(CmpLHS->getType())) { if (TrueVal->getType() == Ty) { if (ConstantInt *Cmp = dyn_cast(CmpRHS)) { - ConstantInt *C1 = NULL, *C2 = NULL; + ConstantInt *C1 = nullptr, *C2 = nullptr; if (Pred == ICmpInst::ICMP_SGT && Cmp->isAllOnesValue()) { C1 = dyn_cast(TrueVal); C2 = dyn_cast(FalseVal); @@ -586,7 +588,7 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, if (Value *V = foldSelectICmpAndOr(SI, TrueVal, FalseVal, Builder)) return ReplaceInstUsesWith(SI, V); - return Changed ? &SI : 0; + return Changed ? &SI : nullptr; } @@ -606,7 +608,7 @@ static bool CanSelectOperandBeMappingIntoPredBlock(const Value *V, // If the value is a non-instruction value like a constant or argument, it // can always be mapped. const Instruction *I = dyn_cast(V); - if (I == 0) return true; + if (!I) return true; // If V is a PHI node defined in the same block as the condition PHI, we can // map the arguments. 
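foldSelectICmpAndOr, introduced above, replaces the branchy form (x & C1) == 0 ? y : (y | C2), with C1 and C2 powers of two, by shifting the tested bit into the C2 position and or'ing it in. A brute-force check of that equivalence for one concrete choice of constants (C1 = 2 and C2 = 16, so the bit moves left by 3; the constants are just an example):

#include <cassert>

int main() {
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned y = 0; y < 256; ++y) {
      unsigned sel        = ((x & 2u) == 0) ? y : (y | 16u);
      unsigned branchless = y | ((x & 2u) << 3);
      assert(sel == branchless);
    }
}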
@@ -649,11 +651,35 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner, return ReplaceInstUsesWith(Outer, C); } - // TODO: MIN(MIN(A, 23), 97) - return 0; + if (SPF1 == SPF2) { + if (ConstantInt *CB = dyn_cast(B)) { + if (ConstantInt *CC = dyn_cast(C)) { + APInt ACB = CB->getValue(); + APInt ACC = CC->getValue(); + + // MIN(MIN(A, 23), 97) -> MIN(A, 23) + // MAX(MAX(A, 97), 23) -> MAX(A, 97) + if ((SPF1 == SPF_UMIN && ACB.ule(ACC)) || + (SPF1 == SPF_SMIN && ACB.sle(ACC)) || + (SPF1 == SPF_UMAX && ACB.uge(ACC)) || + (SPF1 == SPF_SMAX && ACB.sge(ACC))) + return ReplaceInstUsesWith(Outer, Inner); + + // MIN(MIN(A, 97), 23) -> MIN(A, 23) + // MAX(MAX(A, 23), 97) -> MAX(A, 97) + if ((SPF1 == SPF_UMIN && ACB.ugt(ACC)) || + (SPF1 == SPF_SMIN && ACB.sgt(ACC)) || + (SPF1 == SPF_UMAX && ACB.ult(ACC)) || + (SPF1 == SPF_SMAX && ACB.slt(ACC))) { + Outer.replaceUsesOfWith(Inner, A); + return &Outer; + } + } + } + } + return nullptr; } - /// foldSelectICmpAnd - If one of the constants is zero (we know they can't /// both be) and we have an icmp instruction with zero, and we have an 'and' /// with the non-constant value and a power of two we can turn the select @@ -663,27 +689,27 @@ static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal, InstCombiner::BuilderTy *Builder) { const ICmpInst *IC = dyn_cast(SI.getCondition()); if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy()) - return 0; + return nullptr; if (!match(IC->getOperand(1), m_Zero())) - return 0; + return nullptr; ConstantInt *AndRHS; Value *LHS = IC->getOperand(0); if (!match(LHS, m_And(m_Value(), m_ConstantInt(AndRHS)))) - return 0; + return nullptr; // If both select arms are non-zero see if we have a select of the form // 'x ? 2^n + C : C'. Then we can offset both arms by C, use the logic // for 'x ? 2^n : 0' and fix the thing up at the end. - ConstantInt *Offset = 0; + ConstantInt *Offset = nullptr; if (!TrueVal->isZero() && !FalseVal->isZero()) { if ((TrueVal->getValue() - FalseVal->getValue()).isPowerOf2()) Offset = FalseVal; else if ((FalseVal->getValue() - TrueVal->getValue()).isPowerOf2()) Offset = TrueVal; else - return 0; + return nullptr; // Adjust TrueVal and FalseVal to the offset. TrueVal = ConstantInt::get(Builder->getContext(), @@ -696,7 +722,7 @@ static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal, if (!AndRHS->getValue().isPowerOf2() || (!TrueVal->getValue().isPowerOf2() && !FalseVal->getValue().isPowerOf2())) - return 0; + return nullptr; // Determine which shift is needed to transform result of the 'and' into the // desired result. @@ -708,7 +734,7 @@ static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal, // or a trunc of the 'and'. The trunc case requires that all of the truncated // bits are zero, we can figure that out by looking at the 'and' mask. 
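The new FoldSPFofSPF cases above distinguish an outer bound that is redundant from one that tightens the constant, as spelled out in the MIN/MAX comments. Both directions are easy to confirm directly, with std::min and std::max standing in for the matched select patterns:

#include <algorithm>
#include <cassert>

int main() {
  for (int a = -200; a <= 200; ++a) {
    assert(std::min(std::min(a, 23), 97) == std::min(a, 23)); // outer is a no-op
    assert(std::min(std::min(a, 97), 23) == std::min(a, 23)); // outer tightens
    assert(std::max(std::max(a, 97), 23) == std::max(a, 97)); // outer is a no-op
    assert(std::max(std::max(a, 23), 97) == std::max(a, 97)); // outer tightens
  }
}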
if (AndZeros >= ValC->getBitWidth()) - return 0; + return nullptr; Value *V = Builder->CreateZExtOrTrunc(LHS, SI.getType()); if (ValZeros > AndZeros) @@ -866,7 +892,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *TI = dyn_cast(TrueVal)) if (Instruction *FI = dyn_cast(FalseVal)) if (TI->hasOneUse() && FI->hasOneUse()) { - Instruction *AddOp = 0, *SubOp = 0; + Instruction *AddOp = nullptr, *SubOp = nullptr; // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z)) if (TI->getOpcode() == FI->getOpcode()) @@ -888,7 +914,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { } if (AddOp) { - Value *OtherAddOp = 0; + Value *OtherAddOp = nullptr; if (SubOp->getOperand(0) == AddOp->getOperand(0)) { OtherAddOp = AddOp->getOperand(1); } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) { @@ -969,7 +995,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (SelectInst *TrueSI = dyn_cast(TrueVal)) { if (TrueSI->getCondition() == CondVal) { if (SI.getTrueValue() == TrueSI->getTrueValue()) - return 0; + return nullptr; SI.setOperand(1, TrueSI->getTrueValue()); return &SI; } @@ -977,7 +1003,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (SelectInst *FalseSI = dyn_cast(FalseVal)) { if (FalseSI->getCondition() == CondVal) { if (SI.getFalseValue() == FalseSI->getFalseValue()) - return 0; + return nullptr; SI.setOperand(2, FalseSI->getFalseValue()); return &SI; } @@ -1005,5 +1031,5 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { } } - return 0; + return nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 8273dfd..cc6665c 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -19,6 +19,8 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { assert(I.getOperand(1)->getType() == I.getOperand(0)->getType()); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -33,7 +35,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { if (Instruction *R = FoldOpIntoSelect(I, SI)) return R; - if (ConstantInt *CUI = dyn_cast(Op1)) + if (Constant *CUI = dyn_cast(Op1)) if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I)) return Res; @@ -50,7 +52,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { return &I; } - return 0; + return nullptr; } /// CanEvaluateShifted - See if we can compute the specified value, but shifted @@ -78,7 +80,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, // if the needed bits are already zero in the input. This allows us to reuse // the value which means that we don't care if the shift has multiple uses. // TODO: Handle opposite shift by exact value. - ConstantInt *CI = 0; + ConstantInt *CI = nullptr; if ((isLeftShift && match(I, m_LShr(m_Value(), m_ConstantInt(CI)))) || (!isLeftShift && match(I, m_Shl(m_Value(), m_ConstantInt(CI))))) { if (CI->getZExtValue() == NumBits) { @@ -115,7 +117,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, case Instruction::Shl: { // We can often fold the shift into shifts-by-a-constant. CI = dyn_cast(I->getOperand(1)); - if (CI == 0) return false; + if (!CI) return false; // We can always fold shl(c1)+shl(c2) -> shl(c1+c2). 
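Two bit identities underpin the CanEvaluateShifted reasoning above: an opposite shift pair by the same amount is just a mask, and same-direction shifts add up as long as the combined amount stays below the bit width. A standalone check of both:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned c = 1; c < 32; ++c)
    for (uint32_t x = 0; x < 4096; ++x)
      assert(((x >> c) << c) == (x & (~0u << c)));   // lshr then shl is a mask

  for (unsigned c1 = 0; c1 < 32; ++c1)
    for (unsigned c2 = 0; c1 + c2 < 32; ++c2)        // shl(c1)+shl(c2) -> shl(c1+c2)
      assert(((UINT32_C(0x9e3779b9) << c1) << c2) ==
             (UINT32_C(0x9e3779b9) << (c1 + c2)));
}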
if (isLeftShift) return true; @@ -139,7 +141,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, case Instruction::LShr: { // We can often fold the shift into shifts-by-a-constant. CI = dyn_cast(I->getOperand(1)); - if (CI == 0) return false; + if (!CI) return false; // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2). if (!isLeftShift) return true; @@ -309,37 +311,38 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, -Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, +Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1, BinaryOperator &I) { bool isLeftShift = I.getOpcode() == Instruction::Shl; + ConstantInt *COp1 = nullptr; + if (ConstantDataVector *CV = dyn_cast(Op1)) + COp1 = dyn_cast_or_null(CV->getSplatValue()); + else if (ConstantVector *CV = dyn_cast(Op1)) + COp1 = dyn_cast_or_null(CV->getSplatValue()); + else + COp1 = dyn_cast(Op1); + + if (!COp1) + return nullptr; // See if we can propagate this shift into the input, this covers the trivial // cast of lshr(shl(x,c1),c2) as well as other more complex cases. if (I.getOpcode() != Instruction::AShr && - CanEvaluateShifted(Op0, Op1->getZExtValue(), isLeftShift, *this)) { + CanEvaluateShifted(Op0, COp1->getZExtValue(), isLeftShift, *this)) { DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression" " to eliminate shift:\n IN: " << *Op0 << "\n SH: " << I <<"\n"); return ReplaceInstUsesWith(I, - GetShiftedValue(Op0, Op1->getZExtValue(), isLeftShift, *this)); + GetShiftedValue(Op0, COp1->getZExtValue(), isLeftShift, *this)); } - // See if we can simplify any instructions used by the instruction whose sole // purpose is to compute bits we don't care about. uint32_t TypeBits = Op0->getType()->getScalarSizeInBits(); - // shl i32 X, 32 = 0 and srl i8 Y, 9 = 0, ... just don't eliminate - // a signed shift. - // - if (Op1->uge(TypeBits)) { - if (I.getOpcode() != Instruction::AShr) - return ReplaceInstUsesWith(I, Constant::getNullValue(Op0->getType())); - // ashr i32 X, 32 --> ashr i32 X, 31 - I.setOperand(1, ConstantInt::get(I.getType(), TypeBits-1)); - return &I; - } + assert(!COp1->uge(TypeBits) && + "Shift over the type width should have been removed already"); // ((X*C1) << C2) == (X * (C1 << C2)) if (BinaryOperator *BO = dyn_cast(Op0)) @@ -367,7 +370,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, if (TrOp && I.isLogicalShift() && TrOp->isShift() && isa(TrOp->getOperand(1))) { // Okay, we'll do this xform. Make the shift of shift. - Constant *ShAmt = ConstantExpr::getZExt(Op1, TrOp->getType()); + Constant *ShAmt = ConstantExpr::getZExt(COp1, TrOp->getType()); // (shift2 (shift1 & 0x00FF), c2) Value *NSh = Builder->CreateBinOp(I.getOpcode(), TrOp, ShAmt,I.getName()); @@ -384,10 +387,10 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, // shift. We know that it is a logical shift by a constant, so adjust the // mask as appropriate. 
if (I.getOpcode() == Instruction::Shl) - MaskV <<= Op1->getZExtValue(); + MaskV <<= COp1->getZExtValue(); else { assert(I.getOpcode() == Instruction::LShr && "Unknown logical shift"); - MaskV = MaskV.lshr(Op1->getZExtValue()); + MaskV = MaskV.lshr(COp1->getZExtValue()); } // shift1 & 0x00FF @@ -421,9 +424,13 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, // (X + (Y << C)) Value *X = Builder->CreateBinOp(Op0BO->getOpcode(), YS, V1, Op0BO->getOperand(1)->getName()); - uint32_t Op1Val = Op1->getLimitedValue(TypeBits); - return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(), - APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val))); + uint32_t Op1Val = COp1->getLimitedValue(TypeBits); + + APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val); + Constant *Mask = ConstantInt::get(I.getContext(), Bits); + if (VectorType *VT = dyn_cast(X->getType())) + Mask = ConstantVector::getSplat(VT->getNumElements(), Mask); + return BinaryOperator::CreateAnd(X, Mask); } // Turn (Y + ((X >> C) & CC)) << C -> ((X & (CC << C)) + (Y << C)) @@ -453,9 +460,13 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, // (X + (Y << C)) Value *X = Builder->CreateBinOp(Op0BO->getOpcode(), V1, YS, Op0BO->getOperand(0)->getName()); - uint32_t Op1Val = Op1->getLimitedValue(TypeBits); - return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(), - APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val))); + uint32_t Op1Val = COp1->getLimitedValue(TypeBits); + + APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val); + Constant *Mask = ConstantInt::get(I.getContext(), Bits); + if (VectorType *VT = dyn_cast(X->getType())) + Mask = ConstantVector::getSplat(VT->getNumElements(), Mask); + return BinaryOperator::CreateAnd(X, Mask); } // Turn (((X >> C)&CC) + Y) << C -> (X + (Y << C)) & (CC << C) @@ -523,7 +534,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, // Find out if this is a shift of a shift by a constant. BinaryOperator *ShiftOp = dyn_cast(Op0); if (ShiftOp && !ShiftOp->isShift()) - ShiftOp = 0; + ShiftOp = nullptr; if (ShiftOp && isa(ShiftOp->getOperand(1))) { @@ -541,9 +552,9 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, ConstantInt *ShiftAmt1C = cast(ShiftOp->getOperand(1)); uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits); - uint32_t ShiftAmt2 = Op1->getLimitedValue(TypeBits); + uint32_t ShiftAmt2 = COp1->getLimitedValue(TypeBits); assert(ShiftAmt2 != 0 && "Should have been simplified earlier"); - if (ShiftAmt1 == 0) return 0; // Will be simplified in the future. + if (ShiftAmt1 == 0) return nullptr; // Will be simplified in the future. 
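The ((X >> C) + Y) << C rewrite above relies on Y << C having no low bits, so masking the reassociated sum strips exactly the low bits that the original right shift discarded. The identity holds in wrap-around unsigned arithmetic; a brute-force sample:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned c = 1; c < 32; ++c)
    for (uint32_t x = 0; x < 1000; x += 3)
      for (uint32_t y = 0; y < 1000; y += 7)
        assert((((x >> c) + y) << c) ==
               ((x + (y << c)) & (~0u << c)));  // the high-bits mask built above
}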
Value *X = ShiftOp->getOperand(0); IntegerType *Ty = cast(I.getType()); @@ -671,10 +682,13 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, } } } - return 0; + return nullptr; } Instruction *InstCombiner::visitShl(BinaryOperator &I) { + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), DL)) @@ -709,10 +723,13 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { match(I.getOperand(1), m_Constant(C2))) return BinaryOperator::CreateShl(ConstantExpr::getShl(C1, C2), A); - return 0; + return nullptr; } Instruction *InstCombiner::visitLShr(BinaryOperator &I) { + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), DL)) return ReplaceInstUsesWith(I, V); @@ -749,10 +766,13 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { } } - return 0; + return nullptr; } Instruction *InstCombiner::visitAShr(BinaryOperator &I) { + if (Value *V = SimplifyVectorOp(I)) + return ReplaceInstUsesWith(I, V); + if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), DL)) return ReplaceInstUsesWith(I, V); @@ -805,6 +825,5 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) { if (NumSignBits == Op0->getType()->getScalarSizeInBits()) return ReplaceInstUsesWith(I, Op0); - return 0; + return nullptr; } - diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index a47b709..1b42d3d 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// - #include "InstCombine.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" @@ -21,6 +20,8 @@ using namespace llvm; using namespace llvm::PatternMatch; +#define DEBUG_TYPE "instcombine" + /// ShrinkDemandedConstant - Check to see if the specified operand of the /// specified instruction is a constant integer. If so, check to see if there /// are any bits set in the constant that are not demanded. If so, shrink the @@ -57,7 +58,7 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) { Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, KnownZero, KnownOne, 0); - if (V == 0) return false; + if (!V) return false; if (V == &Inst) return true; ReplaceInstUsesWith(Inst, V); return true; @@ -71,7 +72,7 @@ bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask, unsigned Depth) { Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, KnownZero, KnownOne, Depth); - if (NewVal == 0) return false; + if (!NewVal) return false; U = NewVal; return true; } @@ -101,7 +102,7 @@ bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask, Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne, unsigned Depth) { - assert(V != 0 && "Null pointer of Value???"); + assert(V != nullptr && "Null pointer of Value???"); assert(Depth <= 6 && "Limit Search Depth"); uint32_t BitWidth = DemandedMask.getBitWidth(); Type *VTy = V->getType(); @@ -118,33 +119,33 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // We know all of the bits for a constant! 
KnownOne = CI->getValue() & DemandedMask; KnownZero = ~KnownOne & DemandedMask; - return 0; + return nullptr; } if (isa(V)) { // We know all of the bits for a constant! KnownOne.clearAllBits(); KnownZero = DemandedMask; - return 0; + return nullptr; } KnownZero.clearAllBits(); KnownOne.clearAllBits(); if (DemandedMask == 0) { // Not demanding any bits from V. if (isa(V)) - return 0; + return nullptr; return UndefValue::get(VTy); } if (Depth == 6) // Limit search depth. - return 0; + return nullptr; APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); Instruction *I = dyn_cast(V); if (!I) { - ComputeMaskedBits(V, KnownZero, KnownOne, Depth); - return 0; // Only analyze instructions. + computeKnownBits(V, KnownZero, KnownOne, Depth); + return nullptr; // Only analyze instructions. } // If there are multiple uses of this value and we aren't at the root, then @@ -157,8 +158,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // this instruction has a simpler value in that context. if (I->getOpcode() == Instruction::And) { // If either the LHS or the RHS are Zero, the result is zero. - ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); - ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); + computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); + computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If all of the demanded bits are known 1 on one side, return the other. // These bits cannot contribute to the result of the 'and' in this @@ -179,8 +180,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // only bits from X or Y are demanded. // If either the LHS or the RHS are One, the result is One. - ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); - ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); + computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); + computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If all of the demanded bits are known zero on one side, return the // other. These bits cannot contribute to the result of the 'or' in this @@ -204,8 +205,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // We can simplify (X^Y) -> X or Y in the user's context if we know that // only bits from X or Y are demanded. - ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); - ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); + computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); + computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If all of the demanded bits are known zero on one side, return the // other. @@ -216,8 +217,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, } // Compute the KnownZero/KnownOne bits to simplify things downstream. - ComputeMaskedBits(I, KnownZero, KnownOne, Depth); - return 0; + computeKnownBits(I, KnownZero, KnownOne, Depth); + return nullptr; } // If this is the root being simplified, allow it to have multiple uses, @@ -229,7 +230,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, switch (I->getOpcode()) { default: - ComputeMaskedBits(I, KnownZero, KnownOne, Depth); + computeKnownBits(I, KnownZero, KnownOne, Depth); break; case Instruction::And: // If either the LHS or the RHS are Zero, the result is zero. 
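The And case above returns the other operand when every demanded bit is known to be 1 on one side. In mask terms: with demanded bits m and a right-hand side y that covers m, (x & y) agrees with x on every bit of m. A standalone check:

#include <cassert>

int main() {
  const unsigned m = 0x0F;                 // the demanded bits
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned y = 0; y < 256; ++y) {
      if ((y & m) != m) continue;          // y known-1 on all demanded bits
      assert(((x & y) & m) == (x & m));    // the and contributes nothing
    }
}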
@@ -409,20 +410,20 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, } case Instruction::BitCast: if (!I->getOperand(0)->getType()->isIntOrIntVectorTy()) - return 0; // vector->int or fp->int? + return nullptr; // vector->int or fp->int? if (VectorType *DstVTy = dyn_cast(I->getType())) { if (VectorType *SrcVTy = dyn_cast(I->getOperand(0)->getType())) { if (DstVTy->getNumElements() != SrcVTy->getNumElements()) // Don't touch a bitcast between vectors of different element counts. - return 0; + return nullptr; } else // Don't touch a scalar-to-vector bitcast. - return 0; + return nullptr; } else if (I->getOperand(0)->getType()->isVectorTy()) // Don't touch a vector-to-scalar bitcast. - return 0; + return nullptr; if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero, KnownOne, Depth+1)) @@ -578,9 +579,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, return I; } - // Otherwise just hand the sub off to ComputeMaskedBits to fill in + // Otherwise just hand the sub off to computeKnownBits to fill in // the known zeros and ones. - ComputeMaskedBits(V, KnownZero, KnownOne, Depth); + computeKnownBits(V, KnownZero, KnownOne, Depth); // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known // zero. @@ -751,7 +752,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // remainder is zero. if (DemandedMask.isNegative() && KnownZero.isNonNegative()) { APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); + computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If it's known zero, our sign bit is also zero. if (LHSKnownZero.isNegative()) KnownZero.setBit(KnownZero.getBitWidth() - 1); @@ -810,10 +811,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, } case Intrinsic::x86_sse42_crc32_64_64: KnownZero = APInt::getHighBitsSet(64, 32); - return 0; + return nullptr; } } - ComputeMaskedBits(V, KnownZero, KnownOne, Depth); + computeKnownBits(V, KnownZero, KnownOne, Depth); break; } @@ -821,7 +822,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // constant. if ((DemandedMask & (KnownZero|KnownOne)) == DemandedMask) return Constant::getIntegerValue(VTy, KnownOne); - return 0; + return nullptr; } /// Helper routine of SimplifyDemandedUseBits. It tries to simplify @@ -847,13 +848,13 @@ Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShlOp1 = cast(Shl->getOperand(1))->getValue(); const APInt &ShrOp1 = cast(Shr->getOperand(1))->getValue(); if (!ShlOp1 || !ShrOp1) - return 0; // Noop. + return nullptr; // Noop. Value *VarX = Shr->getOperand(0); Type *Ty = VarX->getType(); unsigned BitWidth = Ty->getIntegerBitWidth(); if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth)) - return 0; // Undef. + return nullptr; // Undef. 
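SimplifyShrShlDemandedBits, whose preconditions appear above, targets lshr(shl(x, c), c) style pairs: shifting left and then logically right by the same amount only clears the high c bits, i.e. it acts as a mask, so the pair can be dropped when those bits are not demanded. Checking the underlying identity:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned c = 1; c < 32; ++c)
    for (uint32_t x = 0; x < 100000; x += 101)
      assert(((x << c) >> c) == (x & (~0u >> c)));
}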
unsigned ShlAmt = ShlOp1.getZExtValue(); unsigned ShrAmt = ShrOp1.getZExtValue(); @@ -882,7 +883,7 @@ Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr, return VarX; if (!Shr->hasOneUse()) - return 0; + return nullptr; BinaryOperator *New; if (ShrAmt < ShlAmt) { @@ -902,7 +903,7 @@ Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr, return InsertNewInstWith(New, *Shl); } - return 0; + return nullptr; } /// SimplifyDemandedVectorElts - The specified value produces a vector with @@ -923,7 +924,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, if (isa(V)) { // If the entire vector is undefined, just return this info. UndefElts = EltMask; - return 0; + return nullptr; } if (DemandedElts == 0) { // If nothing is demanded, provide undef. @@ -938,7 +939,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // Check if this is identity. If so, return 0 since we are not simplifying // anything. if (DemandedElts.isAllOnesValue()) - return 0; + return nullptr; Type *EltTy = cast(V->getType())->getElementType(); Constant *Undef = UndefValue::get(EltTy); @@ -952,7 +953,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, } Constant *Elt = C->getAggregateElement(i); - if (Elt == 0) return 0; + if (!Elt) return nullptr; if (isa(Elt)) { // Already undef. Elts.push_back(Undef); @@ -964,12 +965,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // If we changed the constant, return it. Constant *NewCV = ConstantVector::get(Elts); - return NewCV != C ? NewCV : 0; + return NewCV != C ? NewCV : nullptr; } // Limit search depth. if (Depth == 10) - return 0; + return nullptr; // If multiple users are using the root value, proceed with // simplification conservatively assuming that all elements @@ -980,14 +981,14 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // the main instcombine process. if (Depth != 0) // TODO: Just compute the UndefElts information recursively. - return 0; + return nullptr; // Conservatively assume that all elements are needed. DemandedElts = EltMask; } Instruction *I = dyn_cast(V); - if (!I) return 0; // Only analyze instructions. + if (!I) return nullptr; // Only analyze instructions. bool MadeChange = false; APInt UndefElts2(VWidth, 0); @@ -999,7 +1000,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // If this is a variable index, we don't know which element it overwrites. // demand exactly the same input as we produce. ConstantInt *Idx = dyn_cast(I->getOperand(2)); - if (Idx == 0) { + if (!Idx) { // Note that we can't propagate undef elt info, because we don't know // which elt is getting updated. TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, @@ -1281,5 +1282,5 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, break; } } - return MadeChange ? I : 0; + return MadeChange ? I : nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 521dc9c..8c5e202 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -17,6 +17,8 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "instcombine" + /// CheapToScalarize - Return true if the value is cheaper to scalarize than it /// is to leave as a vector operation. 
isConstant indicates whether we're /// extracting one known element. If false we're extracting a variable index. @@ -73,7 +75,7 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) { if (InsertElementInst *III = dyn_cast(V)) { // If this is an insert to a variable element, we don't know what it is. if (!isa(III->getOperand(2))) - return 0; + return nullptr; unsigned IIElt = cast(III->getOperand(2))->getZExtValue(); // If this is an insert to the element we are looking for, return the @@ -97,14 +99,14 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) { } // Extract a value from a vector add operation with a constant zero. - Value *Val = 0; Constant *Con = 0; + Value *Val = nullptr; Constant *Con = nullptr; if (match(V, m_Add(m_Value(Val), m_Constant(Con)))) { if (Con->getAggregateElement(EltNo)->isNullValue()) return FindScalarElement(Val, EltNo); } // Otherwise, we don't know. - return 0; + return nullptr; } // If we have a PHI node with a vector type that has only 2 uses: feed @@ -113,7 +115,7 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) { Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // Verify that the PHI node has exactly 2 uses. Otherwise return NULL. if (!PN->hasNUses(2)) - return NULL; + return nullptr; // If so, it's known at this point that one operand is PHI and the other is // an extractelement node. Find the PHI user that is not the extractelement @@ -128,7 +130,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // otherwise return NULL. if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) || !(isa(PHIUser)) || !CheapToScalarize(PHIUser, true)) - return NULL; + return nullptr; // Create a scalar PHI node that will replace the vector PHI node // just before the current PHI node. @@ -318,7 +320,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { } } } - return 0; + return nullptr; } /// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns @@ -440,10 +442,10 @@ static ShuffleOps CollectShuffleElements(Value *V, // Either the extracted from or inserted into vector must be RHSVec, // otherwise we'd end up with a shuffle of three inputs. - if (EI->getOperand(0) == PermittedRHS || PermittedRHS == 0) { + if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) { Value *RHS = EI->getOperand(0); ShuffleOps LR = CollectShuffleElements(VecOp, Mask, RHS); - assert(LR.second == 0 || LR.second == RHS); + assert(LR.second == nullptr || LR.second == RHS); if (LR.first->getType() != RHS->getType()) { // We tried our best, but we can't find anything compatible with RHS @@ -488,6 +490,41 @@ static ShuffleOps CollectShuffleElements(Value *V, return std::make_pair(V, nullptr); } +/// Try to find redundant insertvalue instructions, like the following ones: +/// %0 = insertvalue { i8, i32 } undef, i8 %x, 0 +/// %1 = insertvalue { i8, i32 } %0, i8 %y, 0 +/// Here the second instruction inserts values at the same indices, as the +/// first one, making the first one redundant. 
+/// It should be transformed to:
+/// %0 = insertvalue { i8, i32 } undef, i8 %y, 0
+Instruction *InstCombiner::visitInsertValueInst(InsertValueInst &I) {
+  bool IsRedundant = false;
+  ArrayRef<unsigned> FirstIndices = I.getIndices();
+
+  // If there is a chain of insertvalue instructions (each of them except the
+  // last one has only one use and it's another insertvalue insn from this
+  // chain), check if any of the 'children' uses the same indices as the first
+  // instruction. In this case, the first one is redundant.
+  Value *V = &I;
+  unsigned Depth = 0;
+  while (V->hasOneUse() && Depth < 10) {
+    User *U = V->user_back();
+    auto UserInsInst = dyn_cast<InsertValueInst>(U);
+    if (!UserInsInst || U->getOperand(0) != V)
+      break;
+    if (UserInsInst->getIndices() == FirstIndices) {
+      IsRedundant = true;
+      break;
+    }
+    V = UserInsInst;
+    Depth++;
+  }
+
+  if (IsRedundant)
+    return ReplaceInstUsesWith(I, I.getOperand(0));
+  return nullptr;
+}
+
 Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
   Value *VecOp = IE.getOperand(0);
   Value *ScalarOp = IE.getOperand(1);
@@ -523,13 +560,14 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
   // (and any insertelements it points to), into one big shuffle.
   if (!IE.hasOneUse() || !isa<ShuffleVectorInst>(IE.user_back())) {
     SmallVector<Constant*, 16> Mask;
-    ShuffleOps LR = CollectShuffleElements(&IE, Mask, 0);
+    ShuffleOps LR = CollectShuffleElements(&IE, Mask, nullptr);
 
     // The proposed shuffle may be trivial, in which case we shouldn't
     // perform the combine.
     if (LR.first != &IE && LR.second != &IE) {
       // We now have a shuffle of LHS, RHS, Mask.
-      if (LR.second == 0) LR.second = UndefValue::get(LR.first->getType());
+      if (LR.second == nullptr)
+        LR.second = UndefValue::get(LR.first->getType());
       return new ShuffleVectorInst(LR.first, LR.second,
                                    ConstantVector::get(Mask));
     }
@@ -546,7 +584,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
     return &IE;
   }
 
-  return 0;
+  return nullptr;
 }
 
 /// Return true if we can evaluate the specified expression tree if the vector
@@ -801,6 +839,20 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
   llvm_unreachable("failed to reorder elements of vector instruction!");
 }
 
+static void RecognizeIdentityMask(const SmallVectorImpl<int> &Mask,
+                                  bool &isLHSID, bool &isRHSID) {
+  isLHSID = isRHSID = true;
+
+  for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+    if (Mask[i] < 0) continue;  // Ignore undef values.
+    // Is this an identity shuffle of the LHS value?
+    isLHSID &= (Mask[i] == (int)i);
+
+    // Is this an identity shuffle of the RHS value?
+    isRHSID &= (Mask[i]-e == i);
+  }
+}
+
 Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   Value *LHS = SVI.getOperand(0);
   Value *RHS = SVI.getOperand(1);
@@ -864,16 +916,8 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
 
   if (VWidth == LHSWidth) {
     // Analyze the shuffle, are the LHS or RHS and identity shuffles?
-    bool isLHSID = true, isRHSID = true;
-
-    for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
-      if (Mask[i] < 0) continue;  // Ignore undef values.
-      // Is this an identity shuffle of the LHS value?
-      isLHSID &= (Mask[i] == (int)i);
-
-      // Is this an identity shuffle of the RHS value?
-      isRHSID &= (Mask[i]-e == i);
-    }
+    bool isLHSID, isRHSID;
+    RecognizeIdentityMask(Mask, isLHSID, isRHSID);
 
     // Eliminate identity shuffles.
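The redundant-insertvalue fold added above has a direct source-level analog: two stores to the same aggregate field make the first one dead. Illustrative only; the struct and the function names are invented for the sketch:

#include <cassert>

struct Pair { char a; int b; };

Pair twoWrites(char x, char y) { Pair p{0, 0}; p.a = x; p.a = y; return p; }
Pair oneWrite(char y)          { Pair p{0, 0}; p.a = y;          return p; }

int main() {
  // The first write of 'x' is overwritten before anyone reads it, exactly
  // like the first insertvalue at the same indices in the IR example above.
  Pair p1 = twoWrites('x', 'y'), p2 = oneWrite('y');
  assert(p1.a == p2.a && p1.b == p2.b);
}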
if (isLHSID) return ReplaceInstUsesWith(SVI, LHS); @@ -932,16 +976,16 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { ShuffleVectorInst* RHSShuffle = dyn_cast(RHS); if (LHSShuffle) if (!isa(LHSShuffle->getOperand(1)) && !isa(RHS)) - LHSShuffle = NULL; + LHSShuffle = nullptr; if (RHSShuffle) if (!isa(RHSShuffle->getOperand(1))) - RHSShuffle = NULL; + RHSShuffle = nullptr; if (!LHSShuffle && !RHSShuffle) - return MadeChange ? &SVI : 0; + return MadeChange ? &SVI : nullptr; - Value* LHSOp0 = NULL; - Value* LHSOp1 = NULL; - Value* RHSOp0 = NULL; + Value* LHSOp0 = nullptr; + Value* LHSOp1 = nullptr; + Value* RHSOp0 = nullptr; unsigned LHSOp0Width = 0; unsigned RHSOp0Width = 0; if (LHSShuffle) { @@ -973,11 +1017,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // case 4 if (LHSOp0 == RHSOp0) { newLHS = LHSOp0; - newRHS = NULL; + newRHS = nullptr; } if (newLHS == LHS && newRHS == RHS) - return MadeChange ? &SVI : 0; + return MadeChange ? &SVI : nullptr; SmallVector LHSMask; SmallVector RHSMask; @@ -1037,7 +1081,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // If newRHS == newLHS, we want to remap any references from newRHS to // newLHS so that we can properly identify splats that may occur due to // obfuscation across the two vectors. - if (eltMask >= 0 && newRHS != NULL && newLHS != newRHS) + if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS) eltMask += newLHSWidth; } @@ -1063,10 +1107,17 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Elts.push_back(ConstantInt::get(Int32Ty, newMask[i])); } } - if (newRHS == NULL) + if (!newRHS) newRHS = UndefValue::get(newLHS->getType()); return new ShuffleVectorInst(newLHS, newRHS, ConstantVector::get(Elts)); } - return MadeChange ? &SVI : 0; + // If the result mask is an identity, replace uses of this instruction with + // corresponding argument. + bool isLHSID, isRHSID; + RecognizeIdentityMask(newMask, isLHSID, isRHSID); + if (isLHSID && VWidth == LHSOp0Width) return ReplaceInstUsesWith(SVI, newLHS); + if (isRHSID && VWidth == RHSOp0Width) return ReplaceInstUsesWith(SVI, newRHS); + + return MadeChange ? &SVI : nullptr; } diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h index 8c780b5..1ab7db3 100644 --- a/lib/Transforms/InstCombine/InstCombineWorklist.h +++ b/lib/Transforms/InstCombine/InstCombineWorklist.h @@ -10,7 +10,6 @@ #ifndef INSTCOMBINE_WORKLIST_H #define INSTCOMBINE_WORKLIST_H -#define DEBUG_TYPE "instcombine" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Instruction.h" @@ -18,6 +17,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#define DEBUG_TYPE "instcombine" + namespace llvm { /// InstCombineWorklist - This is the worklist management logic for @@ -68,7 +69,7 @@ public: if (It == WorklistMap.end()) return; // Not in worklist. // Don't bother moving everything down, just null out the slot. - Worklist[It->second] = 0; + Worklist[It->second] = nullptr; WorklistMap.erase(It); } @@ -101,4 +102,6 @@ public: } // end namespace llvm. 
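The shuffle-of-shuffle merging above builds one mask that does the work of two. For the single-input case the rule is plain index composition: applying m1 and then m2 equals applying m1[m2[i]] once. A small sketch with arrays standing in for vectors:

#include <array>
#include <cassert>

int main() {
  std::array<int, 4> v  = {10, 20, 30, 40};
  std::array<int, 4> m1 = {3, 2, 1, 0}, m2 = {1, 1, 0, 2};

  std::array<int, 4> t, twice, once;
  for (int i = 0; i < 4; ++i) t[i]     = v[m1[i]];      // shuffle(v, m1)
  for (int i = 0; i < 4; ++i) twice[i] = t[m2[i]];      // then shuffle by m2
  for (int i = 0; i < 4; ++i) once[i]  = v[m1[m2[i]]];  // one composed mask
  assert(once == twice);
}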
+#undef DEBUG_TYPE + #endif diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 0cab81b..4c36887 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -33,7 +33,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "instcombine" #include "llvm/Transforms/Scalar.h" #include "InstCombine.h" #include "llvm-c/Initialization.h" @@ -58,6 +57,8 @@ using namespace llvm; using namespace llvm::PatternMatch; +#define DEBUG_TYPE "instcombine" + STATISTIC(NumCombined , "Number of insts combined"); STATISTIC(NumConstProp, "Number of constant folds"); STATISTIC(NumDeadInst , "Number of dead inst eliminated"); @@ -512,7 +513,7 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { } } - return 0; + return nullptr; } // dyn_castNegVal - Given a 'sub' instruction, return the RHS of the instruction @@ -530,7 +531,7 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const { if (C->getType()->getElementType()->isIntegerTy()) return ConstantExpr::getNeg(C); - return 0; + return nullptr; } // dyn_castFNegVal - Given a 'fsub' instruction, return the RHS of the @@ -549,7 +550,7 @@ Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const { if (C->getType()->getElementType()->isFloatingPointTy()) return ConstantExpr::getFNeg(C); - return 0; + return nullptr; } static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO, @@ -595,13 +596,13 @@ static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO, // not have a second operand. Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { // Don't modify shared select instructions - if (!SI->hasOneUse()) return 0; + if (!SI->hasOneUse()) return nullptr; Value *TV = SI->getOperand(1); Value *FV = SI->getOperand(2); if (isa(TV) || isa(FV)) { // Bool selects with constant operands can be folded to logical ops. - if (SI->getType()->isIntegerTy(1)) return 0; + if (SI->getType()->isIntegerTy(1)) return nullptr; // If it's a bitcast involving vectors, make sure it has the same number of // elements on both sides. @@ -610,10 +611,10 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { VectorType *SrcTy = dyn_cast(BC->getSrcTy()); // Verify that either both or neither are vectors. - if ((SrcTy == NULL) != (DestTy == NULL)) return 0; + if ((SrcTy == nullptr) != (DestTy == nullptr)) return nullptr; // If vectors, verify that they have the same number of elements. if (SrcTy && SrcTy->getNumElements() != DestTy->getNumElements()) - return 0; + return nullptr; } Value *SelectTrueVal = FoldOperationIntoSelectOperand(Op, TV, this); @@ -622,7 +623,7 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { return SelectInst::Create(SI->getCondition(), SelectTrueVal, SelectFalseVal); } - return 0; + return nullptr; } @@ -634,7 +635,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { PHINode *PN = cast(I.getOperand(0)); unsigned NumPHIValues = PN->getNumIncomingValues(); if (NumPHIValues == 0) - return 0; + return nullptr; // We normally only transform phis with a single use. 
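FoldOpIntoSelect above distributes an operation over both arms of a select, which pays off when the arms are constants that then fold away. The scalar equivalence it relies on, checked directly:

#include <cassert>

int main() {
  for (int t = -8; t <= 8; ++t)
    for (int f = -8; f <= 8; ++f)
      for (int cond = 0; cond <= 1; ++cond)
        assert(((cond ? t : f) * 3) == (cond ? t * 3 : f * 3));
}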
However, if a PHI has // multiple uses and they are all the same operation, we can fold *all* of the @@ -644,7 +645,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { for (User *U : PN->users()) { Instruction *UI = cast(U); if (UI != &I && !I.isIdenticalTo(UI)) - return 0; + return nullptr; } // Otherwise, we can replace *all* users with the new PHI we form. } @@ -654,14 +655,14 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { // remember the BB it is in. If there is more than one or if *it* is a PHI, // bail out. We don't do arbitrary constant expressions here because moving // their computation can be expensive without a cost model. - BasicBlock *NonConstBB = 0; + BasicBlock *NonConstBB = nullptr; for (unsigned i = 0; i != NumPHIValues; ++i) { Value *InVal = PN->getIncomingValue(i); if (isa(InVal) && !isa(InVal)) continue; - if (isa(InVal)) return 0; // Itself a phi. - if (NonConstBB) return 0; // More than one non-const value. + if (isa(InVal)) return nullptr; // Itself a phi. + if (NonConstBB) return nullptr; // More than one non-const value. NonConstBB = PN->getIncomingBlock(i); @@ -669,22 +670,22 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { // insert a computation after it without breaking the edge. if (InvokeInst *II = dyn_cast(InVal)) if (II->getParent() == NonConstBB) - return 0; + return nullptr; // If the incoming non-constant value is in I's block, we will remove one // instruction, but insert another equivalent one, leading to infinite // instcombine. if (NonConstBB == I.getParent()) - return 0; + return nullptr; } // If there is exactly one non-constant value, we can insert a copy of the // operation in that block. However, if this is a critical edge, we would be // inserting the computation one some other paths (e.g. inside a loop). Only // do this if the pred block is unconditionally branching into the phi block. - if (NonConstBB != 0) { + if (NonConstBB != nullptr) { BranchInst *BI = dyn_cast(NonConstBB->getTerminator()); - if (!BI || !BI->isUnconditional()) return 0; + if (!BI || !BI->isUnconditional()) return nullptr; } // Okay, we can do the transformation: create the new PHI node. @@ -708,7 +709,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { BasicBlock *ThisBB = PN->getIncomingBlock(i); Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB); Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB); - Value *InV = 0; + Value *InV = nullptr; // Beware of ConstantExpr: it may eventually evaluate to getNullValue, // even if currently isNullValue gives false. 
Constant *InC = dyn_cast(PN->getIncomingValue(i)); @@ -722,7 +723,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { } else if (CmpInst *CI = dyn_cast(&I)) { Constant *C = cast(I.getOperand(1)); for (unsigned i = 0; i != NumPHIValues; ++i) { - Value *InV = 0; + Value *InV = nullptr; if (Constant *InC = dyn_cast(PN->getIncomingValue(i))) InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C); else if (isa(CI)) @@ -736,7 +737,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { } else if (I.getNumOperands() == 2) { Constant *C = cast(I.getOperand(1)); for (unsigned i = 0; i != NumPHIValues; ++i) { - Value *InV = 0; + Value *InV = nullptr; if (Constant *InC = dyn_cast(PN->getIncomingValue(i))) InV = ConstantExpr::get(I.getOpcode(), InC, C); else @@ -776,11 +777,11 @@ Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset, assert(PtrTy->isPtrOrPtrVectorTy()); if (!DL) - return 0; + return nullptr; Type *Ty = PtrTy->getPointerElementType(); if (!Ty->isSized()) - return 0; + return nullptr; // Start with the index over the outer type. Note that the type size // might be zero (even if the offset isn't zero) if the indexed type @@ -806,7 +807,7 @@ Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset, while (Offset) { // Indexing into tail padding between struct/array elements. if (uint64_t(Offset*8) >= DL->getTypeSizeInBits(Ty)) - return 0; + return nullptr; if (StructType *STy = dyn_cast(Ty)) { const StructLayout *SL = DL->getStructLayout(STy); @@ -827,7 +828,7 @@ Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset, Ty = AT->getElementType(); } else { // Otherwise, we can't index into the middle of this atomic type, bail. - return 0; + return nullptr; } } @@ -859,7 +860,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // If Scale is zero then it does not divide Val. if (Scale.isMinValue()) - return 0; + return nullptr; // Look through chains of multiplications, searching for a constant that is // divisible by Scale. For example, descaling X*(Y*(Z*4)) by a factor of 4 @@ -902,7 +903,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { APInt::sdivrem(CI->getValue(), Scale, Quotient, Remainder); if (!Remainder.isMinValue()) // Not divisible by Scale. - return 0; + return nullptr; // Replace with the quotient in the parent. Op = ConstantInt::get(CI->getType(), Quotient); NoSignedWrap = true; @@ -915,7 +916,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // Multiplication. NoSignedWrap = BO->hasNoSignedWrap(); if (RequireNoSignedWrap && !NoSignedWrap) - return 0; + return nullptr; // There are three cases for multiplication: multiplication by exactly // the scale, multiplication by a constant different to the scale, and @@ -934,7 +935,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // Otherwise drill down into the constant. if (!Op->hasOneUse()) - return 0; + return nullptr; Parent = std::make_pair(BO, 1); continue; @@ -943,7 +944,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // Multiplication by something else. Drill down into the left-hand side // since that's where the reassociate pass puts the good stuff. if (!Op->hasOneUse()) - return 0; + return nullptr; Parent = std::make_pair(BO, 0); continue; @@ -954,7 +955,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // Multiplication by a power of 2. 
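FindElementAtOffset, shown above, walks a DataLayout struct layout to turn a byte offset into indices. Done by hand against the host ABI it is just "the last field starting at or before the offset". The struct and the helper name here are invented for the sketch, and the offsets come from offsetof rather than from LLVM:

#include <cassert>
#include <cstddef>

struct S { char a; int b; double c; };

int fieldIndexAt(size_t off) {
  const size_t offs[] = {offsetof(S, a), offsetof(S, b), offsetof(S, c)};
  int idx = 0;
  for (int i = 0; i < 3; ++i)
    if (offs[i] <= off) idx = i;   // mirrors getElementContainingOffset
  return idx;
}

int main() {
  assert(fieldIndexAt(offsetof(S, b)) == 1);
  assert(fieldIndexAt(offsetof(S, c)) == 2);
}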
NoSignedWrap = BO->hasNoSignedWrap(); if (RequireNoSignedWrap && !NoSignedWrap) - return 0; + return nullptr; Value *LHS = BO->getOperand(0); int32_t Amt = cast(BO->getOperand(1))-> @@ -968,7 +969,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { break; } if (Amt < logScale || !Op->hasOneUse()) - return 0; + return nullptr; // Multiplication by more than the scale. Reduce the multiplying amount // by the scale in the parent. @@ -979,7 +980,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { } if (!Op->hasOneUse()) - return 0; + return nullptr; if (CastInst *Cast = dyn_cast(Op)) { if (Cast->getOpcode() == Instruction::SExt) { @@ -993,7 +994,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // Scale and the multiplication Y * SmallScale should not overflow. if (SmallScale.sext(Scale.getBitWidth()) != Scale) // SmallScale does not sign-extend to Scale. - return 0; + return nullptr; assert(SmallScale.exactLogBase2() == logScale); // Require that Y * SmallScale must not overflow. RequireNoSignedWrap = true; @@ -1012,7 +1013,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // trunc (Y * sext Scale) does not, so nsw flags need to be cleared // from this point up in the expression (see later). if (RequireNoSignedWrap) - return 0; + return nullptr; // Drill down through the cast. unsigned LargeSize = Cast->getSrcTy()->getPrimitiveSizeInBits(); @@ -1026,7 +1027,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { } // Unsupported expression, bail out. - return 0; + return nullptr; } // We know that we can successfully descale, so from here on we can safely @@ -1082,6 +1083,101 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { } while (1); } +/// \brief Creates node of binary operation with the same attributes as the +/// specified one but with other operands. +static Value *CreateBinOpAsGiven(BinaryOperator &Inst, Value *LHS, Value *RHS, + InstCombiner::BuilderTy *B) { + Value *BORes = B->CreateBinOp(Inst.getOpcode(), LHS, RHS); + if (BinaryOperator *NewBO = dyn_cast(BORes)) { + if (isa(NewBO)) { + NewBO->setHasNoSignedWrap(Inst.hasNoSignedWrap()); + NewBO->setHasNoUnsignedWrap(Inst.hasNoUnsignedWrap()); + } + if (isa(NewBO)) + NewBO->setIsExact(Inst.isExact()); + } + return BORes; +} + +/// \brief Makes transformation of binary operation specific for vector types. +/// \param Inst Binary operator to transform. +/// \return Pointer to node that must replace the original binary operator, or +/// null pointer if no transformation was made. 
+Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) { + if (!Inst.getType()->isVectorTy()) return nullptr; + + unsigned VWidth = cast(Inst.getType())->getNumElements(); + Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1); + assert(cast(LHS->getType())->getNumElements() == VWidth); + assert(cast(RHS->getType())->getNumElements() == VWidth); + + // If both arguments of binary operation are shuffles, which use the same + // mask and shuffle within a single vector, it is worthwhile to move the + // shuffle after binary operation: + // Op(shuffle(v1, m), shuffle(v2, m)) -> shuffle(Op(v1, v2), m) + if (isa(LHS) && isa(RHS)) { + ShuffleVectorInst *LShuf = cast(LHS); + ShuffleVectorInst *RShuf = cast(RHS); + if (isa(LShuf->getOperand(1)) && + isa(RShuf->getOperand(1)) && + LShuf->getOperand(0)->getType() == RShuf->getOperand(0)->getType() && + LShuf->getMask() == RShuf->getMask()) { + Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0), + RShuf->getOperand(0), Builder); + Value *Res = Builder->CreateShuffleVector(NewBO, + UndefValue::get(NewBO->getType()), LShuf->getMask()); + return Res; + } + } + + // If one argument is a shuffle within one vector, the other is a constant, + // try moving the shuffle after the binary operation. + ShuffleVectorInst *Shuffle = nullptr; + Constant *C1 = nullptr; + if (isa(LHS)) Shuffle = cast(LHS); + if (isa(RHS)) Shuffle = cast(RHS); + if (isa(LHS)) C1 = cast(LHS); + if (isa(RHS)) C1 = cast(RHS); + if (Shuffle && C1 && isa(Shuffle->getOperand(1)) && + Shuffle->getType() == Shuffle->getOperand(0)->getType()) { + SmallVector ShMask = Shuffle->getShuffleMask(); + // Find constant C2 that has property: + // shuffle(C2, ShMask) = C1 + // If such constant does not exist (example: ShMask=<0,0> and C1=<1,2>) + // reorder is not possible. + SmallVector C2M(VWidth, + UndefValue::get(C1->getType()->getScalarType())); + bool MayChange = true; + for (unsigned I = 0; I < VWidth; ++I) { + if (ShMask[I] >= 0) { + assert(ShMask[I] < (int)VWidth); + if (!isa(C2M[ShMask[I]])) { + MayChange = false; + break; + } + C2M[ShMask[I]] = C1->getAggregateElement(I); + } + } + if (MayChange) { + Constant *C2 = ConstantVector::get(C2M); + Value *NewLHS, *NewRHS; + if (isa(LHS)) { + NewLHS = C2; + NewRHS = Shuffle->getOperand(0); + } else { + NewLHS = Shuffle->getOperand(0); + NewRHS = C2; + } + Value *NewBO = CreateBinOpAsGiven(Inst, NewLHS, NewRHS, Builder); + Value *Res = Builder->CreateShuffleVector(NewBO, + UndefValue::get(Inst.getType()), Shuffle->getMask()); + return Res; + } + } + + return nullptr; +} + Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector Ops(GEP.op_begin(), GEP.op_end()); @@ -1130,7 +1226,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // if (GEPOperator *Src = dyn_cast(PtrOp)) { if (!shouldMergeGEPs(*cast(&GEP), *Src)) - return 0; + return nullptr; // Note that if our source is a gep chain itself then we wait for that // chain to be resolved before we perform this transformation. This @@ -1138,7 +1234,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (GEPOperator *SrcGEP = dyn_cast(Src->getOperand(0))) if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP)) - return 0; // Wait until our source is folded to completion. + return nullptr; // Wait until our source is folded to completion. SmallVector Indices; @@ -1166,7 +1262,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // intptr_t). 
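
// The constant-reorder search in SimplifyVectorOp above, modeled on
// plain vectors: element I of shuffle(C2, ShMask) is C2[ShMask[I]]
// (-1 standing in for an undef lane), so we need a C2 with
// shuffle(C2, ShMask) == C1. As in the code, the search bails as soon
// as two result lanes demand the same source slot -- e.g. ShMask=<0,0>
// with C1=<1,2> has no solution.
#include <cstdio>
#include <optional>
#include <vector>

std::optional<std::vector<int>> findC2(const std::vector<int> &C1,
                                       const std::vector<int> &ShMask) {
  std::vector<std::optional<int>> C2(C1.size()); // starts all-undef
  for (size_t I = 0; I < C1.size(); ++I) {
    if (ShMask[I] < 0) continue;                 // undef lane: no demand
    std::optional<int> &Slot = C2[(size_t)ShMask[I]];
    if (Slot) return std::nullopt;               // slot already taken: bail
    Slot = C1[I];
  }
  std::vector<int> Out;
  for (auto &V : C2) Out.push_back(V.value_or(0)); // undef lanes: anything
  return Out;
}

int main() {
  auto OK = findC2({10, 20}, {1, 0});  // C2 = <20, 10>
  auto Bad = findC2({1, 2}, {0, 0});   // the impossible case from the comment
  std::printf("%s %s\n", OK ? "found" : "-", Bad ? "found" : "impossible");
}
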
Just avoid transforming this until the input has been // normalized. if (SO1->getType() != GO1->getType()) - return 0; + return nullptr; Sum = Builder->CreateAdd(SO1, GO1, PtrOp->getName()+".sum"); } @@ -1216,7 +1312,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // We do not handle pointer-vector geps here. if (!StrippedPtrTy) - return 0; + return nullptr; if (StrippedPtr != PtrOp) { bool HasZeroPointerIndex = false; @@ -1241,7 +1337,15 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { GetElementPtrInst *Res = GetElementPtrInst::Create(StrippedPtr, Idx, GEP.getName()); Res->setIsInBounds(GEP.isInBounds()); - return Res; + if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) + return Res; + // Insert Res, and create an addrspacecast. + // e.g., + // GEP (addrspacecast i8 addrspace(1)* X to [0 x i8]*), i32 0, ... + // -> + // %0 = GEP i8 addrspace(1)* X, ... + // addrspacecast i8 addrspace(1)* %0 to i8* + return new AddrSpaceCastInst(Builder->Insert(Res), GEP.getType()); } if (ArrayType *XATy = @@ -1253,8 +1357,24 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // to an array of the same type as the destination pointer // array. Because the array type is never stepped over (there // is a leading zero) we can fold the cast into this GEP. - GEP.setOperand(0, StrippedPtr); - return &GEP; + if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) { + GEP.setOperand(0, StrippedPtr); + return &GEP; + } + // Cannot replace the base pointer directly because StrippedPtr's + // address space is different. Instead, create a new GEP followed by + // an addrspacecast. + // e.g., + // GEP (addrspacecast [10 x i8] addrspace(1)* X to [0 x i8]*), + // i32 0, ... + // -> + // %0 = GEP [10 x i8] addrspace(1)* X, ... + // addrspacecast i8 addrspace(1)* %0 to i8* + SmallVector Idx(GEP.idx_begin(), GEP.idx_end()); + Value *NewGEP = GEP.isInBounds() ? + Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) : + Builder->CreateGEP(StrippedPtr, Idx, GEP.getName()); + return new AddrSpaceCastInst(NewGEP, GEP.getType()); } } } @@ -1360,7 +1480,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } if (!DL) - return 0; + return nullptr; /// See if we can simplify: /// X = bitcast A* to B* @@ -1412,7 +1532,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } } - return 0; + return nullptr; } static bool @@ -1527,7 +1647,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { } return EraseInstFromFunction(MI); } - return 0; + return nullptr; } /// \brief Move the call to free before a NULL test. @@ -1556,30 +1676,30 @@ tryToMoveFreeBeforeNullTest(CallInst &FI) { // would duplicate the call to free in each predecessor and it may // not be profitable even for code size. if (!PredBB) - return 0; + return nullptr; // Validate constraint #2: Does this block contains only the call to // free and an unconditional branch? // FIXME: We could check if we can speculate everything in the // predecessor block if (FreeInstrBB->size() != 2) - return 0; + return nullptr; BasicBlock *SuccBB; if (!match(FreeInstrBB->getTerminator(), m_UnconditionalBr(SuccBB))) - return 0; + return nullptr; // Validate the rest of constraint #1 by matching on the pred branch. 
TerminatorInst *TI = PredBB->getTerminator(); BasicBlock *TrueBB, *FalseBB; ICmpInst::Predicate Pred; if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Op), m_Zero()), TrueBB, FalseBB))) - return 0; + return nullptr; if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE) - return 0; + return nullptr; // Validate constraint #3: Ensure the null case just falls through. if (SuccBB != (Pred == ICmpInst::ICMP_EQ ? TrueBB : FalseBB)) - return 0; + return nullptr; assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? FalseBB : TrueBB) && "Broken CFG: missing edge from predecessor to successor"); @@ -1614,14 +1734,14 @@ Instruction *InstCombiner::visitFree(CallInst &FI) { if (Instruction *I = tryToMoveFreeBeforeNullTest(FI)) return I; - return 0; + return nullptr; } Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { // Change br (not X), label True, label False to: br X, label False, True - Value *X = 0; + Value *X = nullptr; BasicBlock *TrueDest; BasicBlock *FalseDest; if (match(&BI, m_Br(m_Not(m_Value(X)), TrueDest, FalseDest)) && @@ -1664,7 +1784,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { return &BI; } - return 0; + return nullptr; } Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { @@ -1688,7 +1808,7 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { return &SI; } } - return 0; + return nullptr; } Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { @@ -1705,7 +1825,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // first index return ExtractValueInst::Create(C2, EV.getIndices().slice(1)); } - return 0; // Can't handle other constants + return nullptr; // Can't handle other constants } if (InsertValueInst *IV = dyn_cast(Agg)) { @@ -1838,7 +1958,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // and if again single-use then via load (gep (gep)) to load (gep). // However, double extracts from e.g. function arguments or return values // aren't handled yet. - return 0; + return nullptr; } enum Personality_Type { @@ -2177,7 +2297,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { return &LI; } - return 0; + return nullptr; } @@ -2270,7 +2390,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end(); i != e; ++i) { ConstantExpr *CE = dyn_cast(i); - if (CE == 0) continue; + if (CE == nullptr) continue; Constant*& FoldRes = FoldedConstants[CE]; if (!FoldRes) @@ -2374,7 +2494,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { while (!Worklist.isEmpty()) { Instruction *I = Worklist.RemoveOne(); - if (I == 0) continue; // skip null values. + if (I == nullptr) continue; // skip null values. // Check to see if we can DCE the instruction. if (isInstructionTriviallyDead(I, TLI)) { @@ -2516,7 +2636,7 @@ bool InstCombiner::runOnFunction(Function &F) { return false; DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis(); // Minimizing size? 
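
// What tryToMoveFreeBeforeNullTest above buys, in source terms: since
// free(NULL) is well defined (a no-op) in C, the guarded idiom can be
// rewritten to an unconditional call once the three block-shape
// constraints hold. A compilable illustration:
#include <cstdlib>

void guarded(void *p) {
  if (p)        // icmp eq/ne p, null (constraint #1)
    free(p);    // block holding only the call and a branch (constraint #2)
}               // null case falls through (constraint #3)

void hoisted(void *p) {
  free(p);      // the transformed form: branch removed
}

int main() {
  guarded(malloc(16));
  hoisted(malloc(16));
  hoisted(nullptr); // fine: free(NULL) does nothing
  return 0;
}
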
MinimizeSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, @@ -2543,7 +2663,7 @@ bool InstCombiner::runOnFunction(Function &F) { while (DoOneIteration(F, Iteration++)) EverMadeChange = true; - Builder = 0; + Builder = nullptr; return EverMadeChange; } diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index bbfa4c5..95fca75 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -13,8 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "asan" - #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -53,8 +51,11 @@ using namespace llvm; +#define DEBUG_TYPE "asan" + static const uint64_t kDefaultShadowScale = 3; static const uint64_t kDefaultShadowOffset32 = 1ULL << 29; +static const uint64_t kIOSShadowOffset32 = 1ULL << 30; static const uint64_t kDefaultShadowOffset64 = 1ULL << 44; static const uint64_t kSmallX86_64ShadowOffset = 0x7FFF8000; // < 2G. static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41; @@ -79,6 +80,7 @@ static const char *const kAsanUnregisterGlobalsName = static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; static const char *const kAsanInitName = "__asan_init_v3"; +static const char *const kAsanCovModuleInitName = "__sanitizer_cov_module_init"; static const char *const kAsanCovName = "__sanitizer_cov"; static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp"; static const char *const kAsanPtrSub = "__sanitizer_ptr_sub"; @@ -135,10 +137,12 @@ static cl::opt ClGlobals("asan-globals", static cl::opt ClCoverage("asan-coverage", cl::desc("ASan coverage. 0: none, 1: entry block, 2: all blocks"), cl::Hidden, cl::init(false)); +static cl::opt ClCoverageBlockThreshold("asan-coverage-block-threshold", + cl::desc("Add coverage instrumentation only to the entry block if there " + "are more than this number of blocks."), + cl::Hidden, cl::init(1500)); static cl::opt ClInitializers("asan-initialization-order", cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(false)); -static cl::opt ClMemIntrin("asan-memintrin", - cl::desc("Handle memset/memcpy/memmove"), cl::Hidden, cl::init(true)); static cl::opt ClInvalidPointerPairs("asan-detect-invalid-pointer-pair", cl::desc("Instrument <, <=, >, >=, - with pointer operands"), cl::Hidden, cl::init(false)); @@ -148,6 +152,16 @@ static cl::opt ClRealignStack("asan-realign-stack", static cl::opt ClBlacklistFile("asan-blacklist", cl::desc("File containing the list of objects to ignore " "during instrumentation"), cl::Hidden); +static cl::opt ClInstrumentationWithCallsThreshold( + "asan-instrumentation-with-call-threshold", + cl::desc("If the function being instrumented contains more than " + "this number of memory accesses, use callbacks instead of " + "inline checks (-1 means never use callbacks)."), + cl::Hidden, cl::init(7000)); +static cl::opt ClMemoryAccessCallbackPrefix( + "asan-memory-access-callback-prefix", + cl::desc("Prefix for memory access callbacks"), cl::Hidden, + cl::init("__asan_")); // This is an experimental feature that will allow to choose between // instrumented and non-instrumented code at link-time. 
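
// The shadow constants above plug into ASan's address mapping,
// Shadow = (Addr >> Scale) + Offset (see memToShadow later in this
// hunk). A sketch with the 32-bit offsets, including the iOS one this
// patch introduces:
#include <cstdint>
#include <cstdio>

uint64_t memToShadow(uint64_t Addr, uint64_t Offset, uint64_t Scale = 3) {
  return (Addr >> Scale) + Offset; // scale 3: 1 shadow byte per 8 app bytes
}

int main() {
  const uint64_t kDefault32 = 1ULL << 29; // kDefaultShadowOffset32
  const uint64_t kIOS32 = 1ULL << 30;     // kIOSShadowOffset32 (new here)
  uint64_t A = 0x12345678;
  std::printf("default: %#llx\n", (unsigned long long)memToShadow(A, kDefault32));
  std::printf("iOS:     %#llx\n", (unsigned long long)memToShadow(A, kIOS32));
}
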
@@ -238,7 +252,7 @@ struct ShadowMapping { static ShadowMapping getShadowMapping(const Module &M, int LongSize) { llvm::Triple TargetTriple(M.getTargetTriple()); bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android; - // bool IsMacOSX = TargetTriple.getOS() == llvm::Triple::MacOSX; + bool IsIOS = TargetTriple.getOS() == llvm::Triple::IOS; bool IsFreeBSD = TargetTriple.getOS() == llvm::Triple::FreeBSD; bool IsLinux = TargetTriple.getOS() == llvm::Triple::Linux; bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64 || @@ -256,6 +270,8 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize) { Mapping.Offset = kMIPS32_ShadowOffset32; else if (IsFreeBSD) Mapping.Offset = kFreeBSD_ShadowOffset32; + else if (IsIOS) + Mapping.Offset = kIOSShadowOffset32; else Mapping.Offset = kDefaultShadowOffset32; } else { // LongSize == 64 @@ -303,20 +319,17 @@ struct AddressSanitizer : public FunctionPass { const char *getPassName() const override { return "AddressSanitizerFunctionPass"; } - void instrumentMop(Instruction *I); + void instrumentMop(Instruction *I, bool UseCalls); void instrumentPointerComparisonOrSubtraction(Instruction *I); void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize, bool IsWrite, - Value *SizeArgument); + Value *SizeArgument, bool UseCalls); Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, Value *ShadowValue, uint32_t TypeSize); Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr, bool IsWrite, size_t AccessSizeIndex, Value *SizeArgument); - bool instrumentMemIntrinsic(MemIntrinsic *MI); - void instrumentMemIntrinsicParam(Instruction *OrigIns, Value *Addr, - Value *Size, - Instruction *InsertBefore, bool IsWrite); + void instrumentMemIntrinsic(MemIntrinsic *MI); Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); bool runOnFunction(Function &F) override; bool maybeInsertAsanInitAtFunctionEntry(Function &F); @@ -349,8 +362,11 @@ struct AddressSanitizer : public FunctionPass { std::unique_ptr BL; // This array is indexed by AccessIsWrite and log2(AccessSize). Function *AsanErrorCallback[2][kNumberOfAccessSizes]; + Function *AsanMemoryAccessCallback[2][kNumberOfAccessSizes]; // This array is indexed by AccessIsWrite. - Function *AsanErrorCallbackSized[2]; + Function *AsanErrorCallbackSized[2], + *AsanMemoryAccessCallbackSized[2]; + Function *AsanMemmove, *AsanMemcpy, *AsanMemset; InlineAsm *EmptyAsm; SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals; @@ -393,6 +409,7 @@ class AddressSanitizerModule : public ModulePass { Function *AsanUnpoisonGlobals; Function *AsanRegisterGlobals; Function *AsanUnregisterGlobals; + Function *AsanCovModuleInit; }; // Stack poisoning does not play well with exception handling. @@ -443,11 +460,9 @@ struct FunctionStackPoisoner : public InstVisitor { bool runOnFunction() { if (!ClStack) return false; // Collect alloca, ret, lifetime instructions etc. 
- for (df_iterator DI = df_begin(&F.getEntryBlock()), - DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) { - BasicBlock *BB = *DI; + for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB); - } + if (AllocaVec.empty()) return false; initializeCallbacks(*F.getParent()); @@ -590,72 +605,54 @@ Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { return IRB.CreateAdd(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset)); } -void AddressSanitizer::instrumentMemIntrinsicParam( - Instruction *OrigIns, - Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite) { - IRBuilder<> IRB(InsertBefore); - if (Size->getType() != IntptrTy) - Size = IRB.CreateIntCast(Size, IntptrTy, false); - // Check the first byte. - instrumentAddress(OrigIns, InsertBefore, Addr, 8, IsWrite, Size); - // Check the last byte. - IRB.SetInsertPoint(InsertBefore); - Value *SizeMinusOne = IRB.CreateSub(Size, ConstantInt::get(IntptrTy, 1)); - Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); - Value *AddrLast = IRB.CreateAdd(AddrLong, SizeMinusOne); - instrumentAddress(OrigIns, InsertBefore, AddrLast, 8, IsWrite, Size); -} - // Instrument memset/memmove/memcpy -bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { - Value *Dst = MI->getDest(); - MemTransferInst *MemTran = dyn_cast(MI); - Value *Src = MemTran ? MemTran->getSource() : 0; - Value *Length = MI->getLength(); - - Constant *ConstLength = dyn_cast(Length); - Instruction *InsertBefore = MI; - if (ConstLength) { - if (ConstLength->isNullValue()) return false; - } else { - // The size is not a constant so it could be zero -- check at run-time. - IRBuilder<> IRB(InsertBefore); - - Value *Cmp = IRB.CreateICmpNE(Length, - Constant::getNullValue(Length->getType())); - InsertBefore = SplitBlockAndInsertIfThen(Cmp, InsertBefore, false); +void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { + IRBuilder<> IRB(MI); + if (isa(MI)) { + IRB.CreateCall3( + isa(MI) ? AsanMemmove : AsanMemcpy, + IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)); + } else if (isa(MI)) { + IRB.CreateCall3( + AsanMemset, + IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), + IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false), + IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)); } - - instrumentMemIntrinsicParam(MI, Dst, Length, InsertBefore, true); - if (Src) - instrumentMemIntrinsicParam(MI, Src, Length, InsertBefore, false); - return true; + MI->eraseFromParent(); } // If I is an interesting memory access, return the PointerOperand -// and set IsWrite. Otherwise return NULL. -static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) { +// and set IsWrite/Alignment. Otherwise return NULL. 
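
// The rewritten instrumentMemIntrinsic above stops expanding first/last
// byte checks inline and instead swaps the intrinsic for a runtime call
// (__asan_memmove/__asan_memcpy/__asan_memset, named from
// ClMemoryAccessCallbackPrefix). A mock of the wrapper's shape --
// my_asan_memcpy is a placeholder name, and the real runtime checks the
// whole shadow range rather than this stub:
#include <cstdio>
#include <cstring>

static void checkRange(const void *P, size_t N) {
  (void)P; (void)N; // real runtime: verify shadow of [P, P+N), report on error
}

extern "C" void *my_asan_memcpy(void *Dst, const void *Src, size_t N) {
  checkRange(Src, N); // read side
  checkRange(Dst, N); // write side
  return memcpy(Dst, Src, N);
}

int main() {
  char A[8] = "hello", B[8];
  my_asan_memcpy(B, A, sizeof A);
  std::printf("%s\n", B);
}
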
+static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite, + unsigned *Alignment) { if (LoadInst *LI = dyn_cast(I)) { - if (!ClInstrumentReads) return NULL; + if (!ClInstrumentReads) return nullptr; *IsWrite = false; + *Alignment = LI->getAlignment(); return LI->getPointerOperand(); } if (StoreInst *SI = dyn_cast(I)) { - if (!ClInstrumentWrites) return NULL; + if (!ClInstrumentWrites) return nullptr; *IsWrite = true; + *Alignment = SI->getAlignment(); return SI->getPointerOperand(); } if (AtomicRMWInst *RMW = dyn_cast(I)) { - if (!ClInstrumentAtomics) return NULL; + if (!ClInstrumentAtomics) return nullptr; *IsWrite = true; + *Alignment = 0; return RMW->getPointerOperand(); } if (AtomicCmpXchgInst *XCHG = dyn_cast(I)) { - if (!ClInstrumentAtomics) return NULL; + if (!ClInstrumentAtomics) return nullptr; *IsWrite = true; + *Alignment = 0; return XCHG->getPointerOperand(); } - return NULL; + return nullptr; } static bool isPointerOperand(Value *V) { @@ -700,9 +697,10 @@ AddressSanitizer::instrumentPointerComparisonOrSubtraction(Instruction *I) { IRB.CreateCall2(F, Param[0], Param[1]); } -void AddressSanitizer::instrumentMop(Instruction *I) { +void AddressSanitizer::instrumentMop(Instruction *I, bool UseCalls) { bool IsWrite = false; - Value *Addr = isInterestingMemoryAccess(I, &IsWrite); + unsigned Alignment = 0; + Value *Addr = isInterestingMemoryAccess(I, &IsWrite, &Alignment); assert(Addr); if (ClOpt && ClOptGlobals) { if (GlobalVariable *G = dyn_cast(Addr)) { @@ -737,22 +735,29 @@ void AddressSanitizer::instrumentMop(Instruction *I) { else NumInstrumentedReads++; - // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check. - if (TypeSize == 8 || TypeSize == 16 || - TypeSize == 32 || TypeSize == 64 || TypeSize == 128) - return instrumentAddress(I, I, Addr, TypeSize, IsWrite, 0); - // Instrument unusual size (but still multiple of 8). + unsigned Granularity = 1 << Mapping.Scale; + // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check + // if the data is properly aligned. + if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 || + TypeSize == 128) && + (Alignment >= Granularity || Alignment == 0 || Alignment >= TypeSize / 8)) + return instrumentAddress(I, I, Addr, TypeSize, IsWrite, nullptr, UseCalls); + // Instrument unusual size or unusual alignment. // We can not do it with a single check, so we do 1-byte check for the first // and the last bytes. We call __asan_report_*_n(addr, real_size) to be able // to report the actual access size. 
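
// The alignment-aware guard in instrumentMop above, as a standalone
// predicate: one shadow check suffices for power-of-two sizes up to 128
// bits, but only when the access cannot straddle a shadow granule, i.e.
// the alignment is at least the granularity (1 << Mapping.Scale),
// unknown (0), or at least the access size itself.
#include <cstdio>

bool singleCheckOK(unsigned TypeSize, unsigned Alignment,
                   unsigned Granularity = 8) {
  bool PowerOfTwoSize = TypeSize == 8 || TypeSize == 16 || TypeSize == 32 ||
                        TypeSize == 64 || TypeSize == 128;
  return PowerOfTwoSize && (Alignment >= Granularity || Alignment == 0 ||
                            Alignment >= TypeSize / 8);
}

int main() {
  std::printf("%d\n", singleCheckOK(64, 8)); // 1: aligned 8-byte access
  std::printf("%d\n", singleCheckOK(64, 4)); // 0: may straddle two granules
  std::printf("%d\n", singleCheckOK(40, 8)); // 0: unusual size, two checks
}
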
IRBuilder<> IRB(I); - Value *LastByte = IRB.CreateIntToPtr( - IRB.CreateAdd(IRB.CreatePointerCast(Addr, IntptrTy), - ConstantInt::get(IntptrTy, TypeSize / 8 - 1)), - OrigPtrTy); Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8); - instrumentAddress(I, I, Addr, 8, IsWrite, Size); - instrumentAddress(I, I, LastByte, 8, IsWrite, Size); + Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); + if (UseCalls) { + IRB.CreateCall2(AsanMemoryAccessCallbackSized[IsWrite], AddrLong, Size); + } else { + Value *LastByte = IRB.CreateIntToPtr( + IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)), + OrigPtrTy); + instrumentAddress(I, I, Addr, 8, IsWrite, Size, false); + instrumentAddress(I, I, LastByte, 8, IsWrite, Size, false); + } } // Validate the result of Module::getOrInsertFunction called for an interface @@ -800,11 +805,18 @@ Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, } void AddressSanitizer::instrumentAddress(Instruction *OrigIns, - Instruction *InsertBefore, - Value *Addr, uint32_t TypeSize, - bool IsWrite, Value *SizeArgument) { + Instruction *InsertBefore, Value *Addr, + uint32_t TypeSize, bool IsWrite, + Value *SizeArgument, bool UseCalls) { IRBuilder<> IRB(InsertBefore); Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); + size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize); + + if (UseCalls) { + IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][AccessSizeIndex], + AddrLong); + return; + } Type *ShadowTy = IntegerType::get( *C, std::max(8U, TypeSize >> Mapping.Scale)); @@ -815,9 +827,8 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy)); Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal); - size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize); size_t Granularity = 1 << Mapping.Scale; - TerminatorInst *CrashTerm = 0; + TerminatorInst *CrashTerm = nullptr; if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) { TerminatorInst *CheckTerm = @@ -842,8 +853,29 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, void AddressSanitizerModule::createInitializerPoisonCalls( Module &M, GlobalValue *ModuleName) { - // We do all of our poisoning and unpoisoning within _GLOBAL__I_a. - Function *GlobalInit = M.getFunction("_GLOBAL__I_a"); + // We do all of our poisoning and unpoisoning within a global constructor. + // These are called _GLOBAL__(sub_)?I_.*. + // TODO: Consider looking through the functions in + // M.getGlobalVariable("llvm.global_ctors") instead of using this stringly + // typed approach. + Function *GlobalInit = nullptr; + for (auto &F : M.getFunctionList()) { + StringRef FName = F.getName(); + + const char kGlobalPrefix[] = "_GLOBAL__"; + if (!FName.startswith(kGlobalPrefix)) + continue; + FName = FName.substr(strlen(kGlobalPrefix)); + + const char kOptionalSub[] = "sub_"; + if (FName.startswith(kOptionalSub)) + FName = FName.substr(strlen(kOptionalSub)); + + if (FName.startswith("I_")) { + GlobalInit = &F; + break; + } + } // If that function is not present, this TU contains no globals, or they have // all been optimized away if (!GlobalInit) @@ -858,7 +890,7 @@ void AddressSanitizerModule::createInitializerPoisonCalls( // Add calls to unpoison all globals before each return instruction. 
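
// The loop in createInitializerPoisonCalls above replaces a hard-coded
// lookup of "_GLOBAL__I_a" with a scan for any _GLOBAL__(sub_)?I_*
// constructor. The same matching, over plain strings:
#include <cstdio>
#include <string>

bool isGlobalInitName(std::string Name) {
  const std::string kGlobalPrefix = "_GLOBAL__";
  if (Name.compare(0, kGlobalPrefix.size(), kGlobalPrefix) != 0)
    return false;
  Name = Name.substr(kGlobalPrefix.size());
  const std::string kOptionalSub = "sub_";
  if (Name.compare(0, kOptionalSub.size(), kOptionalSub) == 0)
    Name = Name.substr(kOptionalSub.size());
  return Name.compare(0, 2, "I_") == 0;
}

int main() {
  std::printf("%d\n", isGlobalInitName("_GLOBAL__I_a"));       // 1
  std::printf("%d\n", isGlobalInitName("_GLOBAL__sub_I_foo")); // 1
  std::printf("%d\n", isGlobalInitName("_GLOBAL__D_a"));       // 0
}
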
for (Function::iterator I = GlobalInit->begin(), E = GlobalInit->end(); - I != E; ++I) { + I != E; ++I) { if (ReturnInst *RI = dyn_cast(I->getTerminator())) { CallInst::Create(AsanUnpoisonGlobals, "", RI); } @@ -902,8 +934,8 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { // Ignore the globals from the __OBJC section. The ObjC runtime assumes // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to // them. - if ((Section.find("__OBJC,") == 0) || - (Section.find("__DATA, __objc_") == 0)) { + if (Section.startswith("__OBJC,") || + Section.startswith("__DATA, __objc_")) { DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n"); return false; } @@ -915,16 +947,26 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { // is placed into __DATA,__cfstring // Therefore there's no point in placing redzones into __DATA,__cfstring. // Moreover, it causes the linker to crash on OS X 10.7 - if (Section.find("__DATA,__cfstring") == 0) { + if (Section.startswith("__DATA,__cfstring")) { DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n"); return false; } // The linker merges the contents of cstring_literals and removes the // trailing zeroes. - if (Section.find("__TEXT,__cstring,cstring_literals") == 0) { + if (Section.startswith("__TEXT,__cstring,cstring_literals")) { DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n"); return false; } + + // Callbacks put into the CRT initializer/terminator sections + // should not be instrumented. + // See https://code.google.com/p/address-sanitizer/issues/detail?id=305 + // and http://msdn.microsoft.com/en-US/en-en/library/bb918180(v=vs.120).aspx + if (Section.startswith(".CRT")) { + DEBUG(dbgs() << "Ignoring a global initializer callback: " << *G << "\n"); + return false; + } + // Globals from llvm.metadata aren't emitted, do not instrument them. if (Section == "llvm.metadata") return false; } @@ -950,6 +992,10 @@ void AddressSanitizerModule::initializeCallbacks(Module &M) { kAsanUnregisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage); + AsanCovModuleInit = checkInterfaceFunction(M.getOrInsertFunction( + kAsanCovModuleInitName, + IRB.getVoidTy(), IntptrTy, NULL)); + AsanCovModuleInit->setLinkage(Function::ExternalLinkage); } // This function replaces all global variables with new variables that have @@ -980,6 +1026,14 @@ bool AddressSanitizerModule::runOnModule(Module &M) { GlobalsToChange.push_back(G); } + Function *CtorFunc = M.getFunction(kAsanModuleCtorName); + assert(CtorFunc); + IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator()); + + Function *CovFunc = M.getFunction(kAsanCovName); + int nCov = CovFunc ? CovFunc->getNumUses() : 0; + IRB.CreateCall(AsanCovModuleInit, ConstantInt::get(IntptrTy, nCov)); + size_t n = GlobalsToChange.size(); if (n == 0) return false; @@ -996,10 +1050,6 @@ bool AddressSanitizerModule::runOnModule(Module &M) { IntptrTy, IntptrTy, NULL); SmallVector Initializers(n); - Function *CtorFunc = M.getFunction(kAsanModuleCtorName); - assert(CtorFunc); - IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator()); - bool HasDynamicallyInitializedGlobals = false; // We shouldn't merge same module names, as this string serves as unique @@ -1110,12 +1160,16 @@ void AddressSanitizer::initializeCallbacks(Module &M) { for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; AccessSizeIndex++) { // IsWrite and TypeSize are encoded in the function name. 
- std::string FunctionName = std::string(kAsanReportErrorTemplate) + + std::string Suffix = (AccessIsWrite ? "store" : "load") + itostr(1 << AccessSizeIndex); - // If we are merging crash callbacks, they have two parameters. AsanErrorCallback[AccessIsWrite][AccessSizeIndex] = - checkInterfaceFunction(M.getOrInsertFunction( - FunctionName, IRB.getVoidTy(), IntptrTy, NULL)); + checkInterfaceFunction( + M.getOrInsertFunction(kAsanReportErrorTemplate + Suffix, + IRB.getVoidTy(), IntptrTy, NULL)); + AsanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] = + checkInterfaceFunction( + M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + Suffix, + IRB.getVoidTy(), IntptrTy, NULL)); } } AsanErrorCallbackSized[0] = checkInterfaceFunction(M.getOrInsertFunction( @@ -1123,8 +1177,25 @@ void AddressSanitizer::initializeCallbacks(Module &M) { AsanErrorCallbackSized[1] = checkInterfaceFunction(M.getOrInsertFunction( kAsanReportStoreN, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); - AsanHandleNoReturnFunc = checkInterfaceFunction(M.getOrInsertFunction( - kAsanHandleNoReturnName, IRB.getVoidTy(), NULL)); + AsanMemoryAccessCallbackSized[0] = checkInterfaceFunction( + M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + "loadN", + IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); + AsanMemoryAccessCallbackSized[1] = checkInterfaceFunction( + M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + "storeN", + IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); + + AsanMemmove = checkInterfaceFunction(M.getOrInsertFunction( + ClMemoryAccessCallbackPrefix + "memmove", IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, NULL)); + AsanMemcpy = checkInterfaceFunction(M.getOrInsertFunction( + ClMemoryAccessCallbackPrefix + "memcpy", IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, NULL)); + AsanMemset = checkInterfaceFunction(M.getOrInsertFunction( + ClMemoryAccessCallbackPrefix + "memset", IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy, NULL)); + + AsanHandleNoReturnFunc = checkInterfaceFunction( + M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy(), NULL)); AsanCovFunction = checkInterfaceFunction(M.getOrInsertFunction( kAsanCovName, IRB.getVoidTy(), NULL)); AsanPtrCmpFunction = checkInterfaceFunction(M.getOrInsertFunction( @@ -1142,7 +1213,7 @@ bool AddressSanitizer::doInitialization(Module &M) { // Initialize the private fields. No one has accessed them before. DataLayoutPass *DLP = getAnalysisIfAvailable(); if (!DLP) - return false; + report_fatal_error("data layout missing"); DL = &DLP->getDataLayout(); BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); @@ -1241,7 +1312,8 @@ bool AddressSanitizer::InjectCoverage(Function &F, const ArrayRef AllBlocks) { if (!ClCoverage) return false; - if (ClCoverage == 1) { + if (ClCoverage == 1 || + (unsigned)ClCoverageBlockThreshold < AllBlocks.size()) { InjectCoverageAtBlock(F, F.getEntryBlock()); } else { for (size_t i = 0, n = AllBlocks.size(); i < n; i++) @@ -1275,6 +1347,7 @@ bool AddressSanitizer::runOnFunction(Function &F) { SmallVector PointerComparisonsOrSubtracts; int NumAllocas = 0; bool IsWrite; + unsigned Alignment; // Fill the set of memory operations to instrument. 
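
// Two pieces of the new callback machinery above, sketched together:
// the shared "store8"/"load4" suffix from which both the report and
// the access callbacks are named, and the threshold test (used in
// runOnFunction below) that switches a whole function over to
// callbacks. Five access sizes (1..16 bytes) are assumed here.
#include <cstdio>
#include <string>

std::string suffix(bool IsWrite, unsigned AccessSizeIndex) {
  return (IsWrite ? "store" : "load") + std::to_string(1u << AccessSizeIndex);
}

bool useCalls(long NumAccesses, long Threshold = 7000) {
  // -asan-instrumentation-with-call-threshold; -1 disables callbacks
  return Threshold >= 0 && NumAccesses > Threshold;
}

int main() {
  for (unsigned I = 0; I < 5; ++I)
    std::printf("__asan_report_%s  __asan_%s\n",
                suffix(true, I).c_str(), suffix(false, I).c_str());
  std::printf("useCalls(7001)=%d useCalls(7000)=%d\n",
              useCalls(7001), useCalls(7000)); // 1 0
}
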
for (Function::iterator FI = F.begin(), FE = F.end(); @@ -1285,7 +1358,7 @@ bool AddressSanitizer::runOnFunction(Function &F) { for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { if (LooksLikeCodeInBug11395(BI)) return false; - if (Value *Addr = isInterestingMemoryAccess(BI, &IsWrite)) { + if (Value *Addr = isInterestingMemoryAccess(BI, &IsWrite, &Alignment)) { if (ClOpt && ClOptSameTemp) { if (!TempsToInstrument.insert(Addr)) continue; // We've seen this temp in the current BB. @@ -1294,7 +1367,7 @@ bool AddressSanitizer::runOnFunction(Function &F) { isInterestingPointerComparisonOrSubtraction(BI)) { PointerComparisonsOrSubtracts.push_back(BI); continue; - } else if (isa(BI) && ClMemIntrin) { + } else if (isa(BI)) { // ok, take it. } else { if (isa(BI)) @@ -1315,7 +1388,7 @@ bool AddressSanitizer::runOnFunction(Function &F) { } } - Function *UninstrumentedDuplicate = 0; + Function *UninstrumentedDuplicate = nullptr; bool LikelyToInstrument = !NoReturnCalls.empty() || !ToInstrument.empty() || (NumAllocas > 0); if (ClKeepUninstrumented && LikelyToInstrument) { @@ -1326,14 +1399,19 @@ bool AddressSanitizer::runOnFunction(Function &F) { F.getParent()->getFunctionList().push_back(UninstrumentedDuplicate); } + bool UseCalls = false; + if (ClInstrumentationWithCallsThreshold >= 0 && + ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold) + UseCalls = true; + // Instrument. int NumInstrumented = 0; for (size_t i = 0, n = ToInstrument.size(); i != n; i++) { Instruction *Inst = ToInstrument[i]; if (ClDebugMin < 0 || ClDebugMax < 0 || (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) { - if (isInterestingMemoryAccess(Inst, &IsWrite)) - instrumentMop(Inst); + if (isInterestingMemoryAccess(Inst, &IsWrite, &Alignment)) + instrumentMop(Inst, UseCalls); else instrumentMemIntrinsic(cast(Inst)); } @@ -1464,12 +1542,23 @@ void FunctionStackPoisoner::SetShadowToStackAfterReturnInlined( } } +static DebugLoc getFunctionEntryDebugLocation(Function &F) { + BasicBlock::iterator I = F.getEntryBlock().begin(), + E = F.getEntryBlock().end(); + for (; I != E; ++I) + if (!isa(I)) + break; + return I->getDebugLoc(); +} + void FunctionStackPoisoner::poisonStack() { int StackMallocIdx = -1; + DebugLoc EntryDebugLocation = getFunctionEntryDebugLocation(F); assert(AllocaVec.size() > 0); Instruction *InsBefore = AllocaVec[0]; IRBuilder<> IRB(InsBefore); + IRB.SetCurrentDebugLocation(EntryDebugLocation); SmallVector SVD; SVD.reserve(AllocaVec.size()); @@ -1493,6 +1582,7 @@ void FunctionStackPoisoner::poisonStack() { Type *ByteArrayTy = ArrayType::get(IRB.getInt8Ty(), LocalStackSize); AllocaInst *MyAlloca = new AllocaInst(ByteArrayTy, "MyAlloca", InsBefore); + MyAlloca->setDebugLoc(EntryDebugLocation); assert((ClRealignStack & (ClRealignStack - 1)) == 0); size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack); MyAlloca->setAlignment(FrameAlignment); @@ -1513,11 +1603,13 @@ void FunctionStackPoisoner::poisonStack() { Instruction *Term = SplitBlockAndInsertIfThen(Cmp, InsBefore, false); BasicBlock *CmpBlock = cast(Cmp)->getParent(); IRBuilder<> IRBIf(Term); + IRBIf.SetCurrentDebugLocation(EntryDebugLocation); LocalStackBase = IRBIf.CreateCall2( AsanStackMallocFunc[StackMallocIdx], ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase); BasicBlock *SetBlock = cast(LocalStackBase)->getParent(); IRB.SetInsertPoint(InsBefore); + IRB.SetCurrentDebugLocation(EntryDebugLocation); PHINode *Phi = IRB.CreatePHI(IntptrTy, 2); Phi->addIncoming(OrigStackBase, 
CmpBlock); Phi->addIncoming(LocalStackBase, SetBlock); @@ -1654,7 +1746,7 @@ void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size, AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) { if (AllocaInst *AI = dyn_cast(V)) // We're intested only in allocas we can handle. - return isInterestingAlloca(*AI) ? AI : 0; + return isInterestingAlloca(*AI) ? AI : nullptr; // See if we've already calculated (or started to calculate) alloca for a // given value. AllocaForValueMapTy::iterator I = AllocaForValue.find(V); @@ -1662,8 +1754,8 @@ AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) { return I->second; // Store 0 while we're calculating alloca for value V to avoid // infinite recursion if the value references itself. - AllocaForValue[V] = 0; - AllocaInst *Res = 0; + AllocaForValue[V] = nullptr; + AllocaInst *Res = nullptr; if (CastInst *CI = dyn_cast(V)) Res = findAllocaForValue(CI->getOperand(0)); else if (PHINode *PN = dyn_cast(V)) { @@ -1673,12 +1765,12 @@ AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) { if (IncValue == PN) continue; AllocaInst *IncValueAI = findAllocaForValue(IncValue); // AI for incoming values should exist and should all be equal. - if (IncValueAI == 0 || (Res != 0 && IncValueAI != Res)) - return 0; + if (IncValueAI == nullptr || (Res != nullptr && IncValueAI != Res)) + return nullptr; Res = IncValueAI; } } - if (Res != 0) + if (Res) AllocaForValue[V] = Res; return Res; } diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp index 505fb83..9a5cea8 100644 --- a/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "bounds-checking" #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryBuiltins.h" @@ -28,6 +27,8 @@ #include "llvm/Target/TargetLibraryInfo.h" using namespace llvm; +#define DEBUG_TYPE "bounds-checking" + static cl::opt SingleTrapBB("bounds-checking-single-trap", cl::desc("Use one trap block per function")); @@ -61,7 +62,7 @@ namespace { BasicBlock *TrapBB; BasicBlock *getTrapBB(); - void emitBranchToTrap(Value *Cmp = 0); + void emitBranchToTrap(Value *Cmp = nullptr); bool instrument(Value *Ptr, Value *Val); }; } @@ -103,7 +104,7 @@ void BoundsChecking::emitBranchToTrap(Value *Cmp) { if (!C->getZExtValue()) return; else - Cmp = 0; // unconditional branch + Cmp = nullptr; // unconditional branch } ++ChecksAdded; @@ -167,7 +168,7 @@ bool BoundsChecking::runOnFunction(Function &F) { DL = &getAnalysis().getDataLayout(); TLI = &getAnalysis(); - TrapBB = 0; + TrapBB = nullptr; BuilderTy TheBuilder(F.getContext(), TargetFolder(DL)); Builder = &TheBuilder; ObjectSizeOffsetEvaluator TheObjSizeEval(DL, TLI, F.getContext(), diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index df1549d..7f468f7 100644 --- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -211,7 +211,8 @@ class DataFlowSanitizer : public ModulePass { public: DataFlowSanitizer(StringRef ABIListFile = StringRef(), - void *(*getArgTLS)() = 0, void *(*getRetValTLS)() = 0); + void *(*getArgTLS)() = nullptr, + void *(*getRetValTLS)() = nullptr); static char ID; bool doInitialization(Module &M) override; bool runOnModule(Module &M) override; 
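
// findAllocaForValue above memoizes results and stores a null sentinel
// *before* recursing, so a PHI that (transitively) references itself
// terminates instead of recursing forever. The same pattern on a toy
// graph, where "root" plays the role of the underlying alloca:
#include <cstdio>
#include <unordered_map>
#include <vector>

struct Node {
  Node *Root = nullptr;      // non-null: this node *is* a root
  std::vector<Node *> Preds; // otherwise: PHI-like incoming values
};

std::unordered_map<Node *, Node *> Memo;

Node *findRoot(Node *V) {
  if (V->Root) return V;
  auto I = Memo.find(V);
  if (I != Memo.end()) return I->second; // cached (possibly the sentinel)
  Memo[V] = nullptr;                     // sentinel: breaks cycles
  Node *Res = nullptr;
  for (Node *In : V->Preds) {
    if (In == V) continue;
    Node *R = findRoot(In);
    if (!R || (Res && R != Res)) return nullptr; // roots must agree, as above
    Res = R;
  }
  if (Res) Memo[V] = Res;
  return Res;
}

int main() {
  Node A; A.Root = &A;
  Node P;               // PHI over {A, itself}
  P.Preds = {&A, &P};
  std::printf("%s\n", findRoot(&P) == &A ? "found" : "none"); // found
}
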
@@ -233,8 +234,8 @@ struct DFSanFunction { DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI) : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()), - IsNativeABI(IsNativeABI), ArgTLSPtr(0), RetvalTLSPtr(0), - LabelReturnAlloca(0) {} + IsNativeABI(IsNativeABI), ArgTLSPtr(nullptr), RetvalTLSPtr(nullptr), + LabelReturnAlloca(nullptr) {} Value *getArgTLSPtr(); Value *getArgTLS(unsigned Index, Instruction *Pos); Value *getRetvalTLS(); @@ -303,7 +304,7 @@ FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) { ArgTypes.push_back(ShadowPtrTy); Type *RetType = T->getReturnType(); if (!RetType->isVoidTy()) - RetType = StructType::get(RetType, ShadowTy, (Type *)0); + RetType = StructType::get(RetType, ShadowTy, (Type *)nullptr); return FunctionType::get(RetType, ArgTypes, T->isVarArg()); } @@ -345,7 +346,7 @@ FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) { bool DataFlowSanitizer::doInitialization(Module &M) { DataLayoutPass *DLP = getAnalysisIfAvailable(); if (!DLP) - return false; + report_fatal_error("data layout missing"); DL = &DLP->getDataLayout(); Mod = &M; @@ -373,18 +374,20 @@ bool DataFlowSanitizer::doInitialization(Module &M) { if (GetArgTLSPtr) { Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); - ArgTLS = 0; + ArgTLS = nullptr; GetArgTLS = ConstantExpr::getIntToPtr( ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)), PointerType::getUnqual( - FunctionType::get(PointerType::getUnqual(ArgTLSTy), (Type *)0))); + FunctionType::get(PointerType::getUnqual(ArgTLSTy), + (Type *)nullptr))); } if (GetRetvalTLSPtr) { - RetvalTLS = 0; + RetvalTLS = nullptr; GetRetvalTLS = ConstantExpr::getIntToPtr( ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)), PointerType::getUnqual( - FunctionType::get(PointerType::getUnqual(ShadowTy), (Type *)0))); + FunctionType::get(PointerType::getUnqual(ShadowTy), + (Type *)nullptr))); } ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000); @@ -554,7 +557,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { ++i; // Don't stop on weak. We assume people aren't playing games with the // instrumentedness of overridden weak aliases. - if (Function *F = dyn_cast(GA->getAliasedGlobal())) { + if (Function *F = dyn_cast(GA->getAliasee())) { bool GAInst = isInstrumented(GA), FInst = isInstrumented(F); if (GAInst && FInst) { addGlobalNamePrefix(GA); @@ -629,7 +632,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) { // function... yet. } else if (FT->isVarArg()) { UnwrappedFnMap[&F] = &F; - *i = 0; + *i = nullptr; } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) { // Build a wrapper function for F. The wrapper simply calls F, and is // added to FnsToInstrument so that any instrumentation according to its @@ -680,9 +683,8 @@ bool DataFlowSanitizer::runOnModule(Module &M) { // DFSanVisitor may create new basic blocks, which confuses df_iterator. // Build a copy of the list before iterating over it. 
- llvm::SmallVector BBList; - std::copy(df_begin(&(*i)->getEntryBlock()), df_end(&(*i)->getEntryBlock()), - std::back_inserter(BBList)); + llvm::SmallVector BBList( + depth_first(&(*i)->getEntryBlock())); for (llvm::SmallVector::iterator i = BBList.begin(), e = BBList.end(); @@ -1313,7 +1315,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) { } } - Instruction *Next = 0; + Instruction *Next = nullptr; if (!CS.getType()->isVoidTy()) { if (InvokeInst *II = dyn_cast(CS.getInstruction())) { if (II->getNormalDest()->getSinglePredecessor()) { diff --git a/lib/Transforms/Instrumentation/DebugIR.cpp b/lib/Transforms/Instrumentation/DebugIR.cpp index 069886e..18bda1a 100644 --- a/lib/Transforms/Instrumentation/DebugIR.cpp +++ b/lib/Transforms/Instrumentation/DebugIR.cpp @@ -16,8 +16,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "debug-ir" - #include "llvm/IR/ValueMap.h" #include "DebugIR.h" #include "llvm/IR/AssemblyAnnotationWriter.h" @@ -42,6 +40,8 @@ using namespace llvm; +#define DEBUG_TYPE "debug-ir" + namespace { /// Builds a map of Value* to line numbers on which the Value appears in a @@ -118,7 +118,7 @@ public: void visitInstruction(Instruction &I) { if (I.getMetadata(LLVMContext::MD_dbg)) - I.setMetadata(LLVMContext::MD_dbg, 0); + I.setMetadata(LLVMContext::MD_dbg, nullptr); } void run(Module *M) { @@ -168,11 +168,11 @@ class DIUpdater : public InstVisitor { public: DIUpdater(Module &M, StringRef Filename = StringRef(), - StringRef Directory = StringRef(), const Module *DisplayM = 0, - const ValueToValueMapTy *VMap = 0) + StringRef Directory = StringRef(), const Module *DisplayM = nullptr, + const ValueToValueMapTy *VMap = nullptr) : Builder(M), Layout(&M), LineTable(DisplayM ? DisplayM : &M), VMap(VMap), - Finder(), Filename(Filename), Directory(Directory), FileNode(0), - LexicalBlockFileNode(0), CUNode(0) { + Finder(), Filename(Filename), Directory(Directory), FileNode(nullptr), + LexicalBlockFileNode(nullptr), CUNode(nullptr) { Finder.processModule(M); visit(&M); } @@ -184,7 +184,7 @@ public: report_fatal_error("DebugIR pass supports only a signle compile unit per " "Module."); createCompileUnit(Finder.compile_unit_count() == 1 ? - (MDNode*)*Finder.compile_units().begin() : 0); + (MDNode*)*Finder.compile_units().begin() : nullptr); } void visitFunction(Function &F) { @@ -232,7 +232,7 @@ public: /// If a ValueToValueMap is provided, use it to get the real instruction as /// the line table was generated on a clone of the module on which we are /// operating. - Value *RealInst = 0; + Value *RealInst = nullptr; if (VMap) RealInst = VMap->lookup(&I); @@ -256,7 +256,7 @@ public: NewLoc = DebugLoc::get(Line, Col, Loc.getScope(RealInst->getContext()), Loc.getInlinedAt(RealInst->getContext())); else if (MDNode *scope = findScope(&I)) - NewLoc = DebugLoc::get(Line, Col, scope, 0); + NewLoc = DebugLoc::get(Line, Col, scope, nullptr); else { DEBUG(dbgs() << "WARNING: no valid scope for instruction " << &I << ". no DebugLoc will be present." @@ -334,7 +334,7 @@ private: } DEBUG(dbgs() << "unable to find DISubprogram node for function " << F->getName().str() << "\n"); - return 0; + return nullptr; } /// Sets Line to the line number on which V appears and returns true. If a @@ -366,7 +366,7 @@ private: TypeNodeIter i = TypeDescriptors.find(T); if (i != TypeDescriptors.end()) return i->second; - return 0; + return nullptr; } /// Returns a DebugInfo type from an LLVM type T. 
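
// Why the depth_first() result is copied into a SmallVector above:
// visiting may create new basic blocks, which would invalidate a live
// graph iterator. The generic form of the idiom -- snapshot first,
// then mutate while iterating the copy:
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Worklist = {1, 2, 3};
  std::vector<int> Snapshot(Worklist.begin(), Worklist.end());
  for (int V : Snapshot)      // iterate the copy...
    if (V == 2)
      Worklist.push_back(99); // ...so growing the original is safe
  for (int V : Worklist) std::printf("%d ", V);
  std::printf("\n");          // prints: 1 2 3 99
}
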
@@ -375,12 +375,12 @@ private: if (N) return DIDerivedType(N); else if (T->isVoidTy()) - return DIDerivedType(0); + return DIDerivedType(nullptr); else if (T->isStructTy()) { N = Builder.createStructType( DIScope(LexicalBlockFileNode), T->getStructName(), DIFile(FileNode), 0, Layout.getTypeSizeInBits(T), Layout.getABITypeAlignment(T), 0, - DIType(0), DIArray(0)); // filled in later + DIType(nullptr), DIArray(nullptr)); // filled in later // N is added to the map (early) so that element search below can find it, // so as to avoid infinite recursion for structs that contain pointers to @@ -535,7 +535,7 @@ void DebugIR::writeDebugBitcode(const Module *M, int *fd) { Out.reset(new raw_fd_ostream(*fd, true)); } - M->print(*Out, 0); + M->print(*Out, nullptr); Out->close(); } diff --git a/lib/Transforms/Instrumentation/DebugIR.h b/lib/Transforms/Instrumentation/DebugIR.h index 3f57da5..02831ed 100644 --- a/lib/Transforms/Instrumentation/DebugIR.h +++ b/lib/Transforms/Instrumentation/DebugIR.h @@ -90,7 +90,7 @@ private: /// Write M to disk, optionally passing in an fd to an open file which is /// closed by this function after writing. If no fd is specified, a new file /// is opened, written, and closed. - void writeDebugBitcode(const llvm::Module *M, int *fd = 0); + void writeDebugBitcode(const llvm::Module *M, int *fd = nullptr); }; } // llvm namespace diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index bd00ec8..8330a9b 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -14,8 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "insert-gcov-profiling" - #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Hashing.h" @@ -39,10 +37,13 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include +#include #include #include using namespace llvm; +#define DEBUG_TYPE "insert-gcov-profiling" + static cl::opt DefaultGCOVVersion("default-gcov-version", cl::init("402*"), cl::Hidden, cl::ValueRequired); @@ -77,9 +78,6 @@ namespace { "GCOVProfiler asked to do nothing?"); init(); } - ~GCOVProfiler() { - DeleteContainerPointers(Funcs); - } const char *getPassName() const override { return "GCOV Profiler"; } @@ -141,7 +139,7 @@ namespace { Module *M; LLVMContext *Ctx; - SmallVector Funcs; + SmallVector, 16> Funcs; }; } @@ -449,6 +447,21 @@ bool GCOVProfiler::runOnModule(Module &M) { return false; } +static bool functionHasLines(Function *F) { + // Check whether this function actually has any source lines. Not only + // do these waste space, they also can crash gcov. + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); + I != IE; ++I) { + const DebugLoc &Loc = I->getDebugLoc(); + if (Loc.isUnknown()) continue; + if (Loc.getLine() != 0) + return true; + } + } + return false; +} + void GCOVProfiler::emitProfileNotes() { NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); if (!CU_Nodes) return; @@ -474,6 +487,7 @@ void GCOVProfiler::emitProfileNotes() { Function *F = SP.getFunction(); if (!F) continue; + if (!functionHasLines(F)) continue; // gcov expects every function to start with an entry block that has a // single successor, so split the entry block to make sure of that. 
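
// The GCOVProfiler change above swaps a raw-pointer container plus a
// manual DeleteContainerPointers() destructor for owning smart
// pointers; destruction then needs no hand-written cleanup at all.
// A sketch of the pattern (GCOVFunctionLike is a stand-in type):
#include <cstdio>
#include <memory>
#include <vector>

struct GCOVFunctionLike {
  explicit GCOVFunctionLike(int Id) : Id(Id) {}
  ~GCOVFunctionLike() { std::printf("destroyed %d\n", Id); }
  int Id;
};

int main() {
  std::vector<std::unique_ptr<GCOVFunctionLike>> Funcs;
  Funcs.push_back(std::make_unique<GCOVFunctionLike>(1));
  Funcs.push_back(std::make_unique<GCOVFunctionLike>(2));
  // vector goes out of scope: both elements destroyed automatically
}
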
@@ -483,19 +497,19 @@ void GCOVProfiler::emitProfileNotes() { ++It; EntryBlock.splitBasicBlock(It); - GCOVFunction *Func = - new GCOVFunction(SP, &out, i, Options.UseCfgChecksum); - Funcs.push_back(Func); + Funcs.push_back( + make_unique(SP, &out, i, Options.UseCfgChecksum)); + GCOVFunction &Func = *Funcs.back(); for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - GCOVBlock &Block = Func->getBlock(BB); + GCOVBlock &Block = Func.getBlock(BB); TerminatorInst *TI = BB->getTerminator(); if (int successors = TI->getNumSuccessors()) { for (int i = 0; i != successors; ++i) { - Block.addEdge(Func->getBlock(TI->getSuccessor(i))); + Block.addEdge(Func.getBlock(TI->getSuccessor(i))); } } else if (isa(TI)) { - Block.addEdge(Func->getReturnBlock()); + Block.addEdge(Func.getReturnBlock()); } uint32_t Line = 0; @@ -511,7 +525,7 @@ void GCOVProfiler::emitProfileNotes() { Lines.addLine(Loc.getLine()); } } - EdgeDestinations += Func->getEdgeDestinations(); + EdgeDestinations += Func.getEdgeDestinations(); } FileChecksums.push_back(hash_value(EdgeDestinations)); @@ -519,9 +533,7 @@ void GCOVProfiler::emitProfileNotes() { out.write(ReversedVersion, 4); out.write(reinterpret_cast(&FileChecksums.back()), 4); - for (SmallVectorImpl::iterator I = Funcs.begin(), - E = Funcs.end(); I != E; ++I) { - GCOVFunction *Func = *I; + for (auto &Func : Funcs) { Func->setCfgChecksum(FileChecksums.back()); Func->writeOut(); } @@ -549,6 +561,7 @@ bool GCOVProfiler::emitProfileArcs() { continue; Function *F = SP.getFunction(); if (!F) continue; + if (!functionHasLines(F)) continue; if (!Result) Result = true; unsigned Edges = 0; for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index ec1a195..b8e632e 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -93,12 +93,11 @@ //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "msan" - #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -122,6 +121,8 @@ using namespace llvm; +#define DEBUG_TYPE "msan" + static const uint64_t kShadowMask32 = 1ULL << 31; static const uint64_t kShadowMask64 = 1ULL << 46; static const uint64_t kOriginOffset32 = 1ULL << 30; @@ -129,6 +130,9 @@ static const uint64_t kOriginOffset64 = 1ULL << 45; static const unsigned kMinOriginAlignment = 4; static const unsigned kShadowTLSAlignment = 8; +// Accesses sizes are powers of two: 1, 2, 4, 8. +static const size_t kNumberOfAccessSizes = 4; + /// \brief Track origins of uninitialized values. /// /// Adds a section to MemorySanitizer report that points to the allocation @@ -178,6 +182,14 @@ static cl::opt ClBlacklistFile("msan-blacklist", cl::desc("File containing the list of functions where MemorySanitizer " "should not report bugs"), cl::Hidden); +static cl::opt ClInstrumentationWithCallThreshold( + "msan-instrumentation-with-call-threshold", + cl::desc( + "If the function being instrumented requires more than " + "this number of checks and origin stores, use callbacks instead of " + "inline checks (-1 means never use callbacks)."), + cl::Hidden, cl::init(3500)); + // Experimental. 
Wraps all indirect calls in the instrumented code with // a call to the given function. This is needed to assist the dynamic // helper tool (MSanDR) to regain control on transition between instrumented and @@ -203,8 +215,8 @@ class MemorySanitizer : public FunctionPass { StringRef BlacklistFile = StringRef()) : FunctionPass(ID), TrackOrigins(std::max(TrackOrigins, (int)ClTrackOrigins)), - DL(0), - WarningFn(0), + DL(nullptr), + WarningFn(nullptr), BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile : BlacklistFile), WrapIndirectCalls(!ClWrapIndirectCalls.empty()) {} const char *getPassName() const override { return "MemorySanitizer"; } @@ -245,6 +257,10 @@ class MemorySanitizer : public FunctionPass { /// \brief The run-time callback to print a warning. Value *WarningFn; + // These arrays are indexed by log2(AccessSize). + Value *MaybeWarningFn[kNumberOfAccessSizes]; + Value *MaybeStoreOriginFn[kNumberOfAccessSizes]; + /// \brief Run-time helper that generates a new origin value for a stack /// allocation. Value *MsanSetAllocaOrigin4Fn; @@ -321,6 +337,20 @@ void MemorySanitizer::initializeCallbacks(Module &M) { : "__msan_warning_noreturn"; WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), NULL); + for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; + AccessSizeIndex++) { + unsigned AccessSize = 1 << AccessSizeIndex; + std::string FunctionName = "__msan_maybe_warning_" + itostr(AccessSize); + MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction( + FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), + IRB.getInt32Ty(), NULL); + + FunctionName = "__msan_maybe_store_origin_" + itostr(AccessSize); + MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction( + FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), + IRB.getInt8PtrTy(), IRB.getInt32Ty(), NULL); + } + MsanSetAllocaOrigin4Fn = M.getOrInsertFunction( "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, IRB.getInt8PtrTy(), IntptrTy, NULL); @@ -341,31 +371,32 @@ void MemorySanitizer::initializeCallbacks(Module &M) { // Create globals. 
   RetvalTLS = new GlobalVariable(
       M, ArrayType::get(IRB.getInt64Ty(), 8), false,
-      GlobalVariable::ExternalLinkage, 0, "__msan_retval_tls", 0,
+      GlobalVariable::ExternalLinkage, nullptr, "__msan_retval_tls", nullptr,
       GlobalVariable::InitialExecTLSModel);
   RetvalOriginTLS = new GlobalVariable(
-      M, OriginTy, false, GlobalVariable::ExternalLinkage, 0,
-      "__msan_retval_origin_tls", 0, GlobalVariable::InitialExecTLSModel);
+      M, OriginTy, false, GlobalVariable::ExternalLinkage, nullptr,
+      "__msan_retval_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel);
 
   ParamTLS = new GlobalVariable(
       M, ArrayType::get(IRB.getInt64Ty(), 1000), false,
-      GlobalVariable::ExternalLinkage, 0, "__msan_param_tls", 0,
+      GlobalVariable::ExternalLinkage, nullptr, "__msan_param_tls", nullptr,
       GlobalVariable::InitialExecTLSModel);
   ParamOriginTLS = new GlobalVariable(
       M, ArrayType::get(OriginTy, 1000), false, GlobalVariable::ExternalLinkage,
-      0, "__msan_param_origin_tls", 0, GlobalVariable::InitialExecTLSModel);
+      nullptr, "__msan_param_origin_tls", nullptr,
+      GlobalVariable::InitialExecTLSModel);
 
   VAArgTLS = new GlobalVariable(
       M, ArrayType::get(IRB.getInt64Ty(), 1000), false,
-      GlobalVariable::ExternalLinkage, 0, "__msan_va_arg_tls", 0,
+      GlobalVariable::ExternalLinkage, nullptr, "__msan_va_arg_tls", nullptr,
       GlobalVariable::InitialExecTLSModel);
   VAArgOverflowSizeTLS = new GlobalVariable(
-      M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, 0,
-      "__msan_va_arg_overflow_size_tls", 0,
+      M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, nullptr,
+      "__msan_va_arg_overflow_size_tls", nullptr,
       GlobalVariable::InitialExecTLSModel);
   OriginTLS = new GlobalVariable(
-      M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, 0,
-      "__msan_origin_tls", 0, GlobalVariable::InitialExecTLSModel);
+      M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, nullptr,
+      "__msan_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel);
 
   // We insert an empty inline asm after __msan_report* to avoid callback merge.
   EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
@@ -379,14 +410,14 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
                             ClWrapIndirectCalls, AnyFunctionPtrTy,
                             AnyFunctionPtrTy, NULL);
   }
 
-  if (ClWrapIndirectCallsFast) {
+  if (WrapIndirectCalls && ClWrapIndirectCallsFast) {
     MsandrModuleStart = new GlobalVariable(
         M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage,
-        0, "__executable_start");
+        nullptr, "__executable_start");
     MsandrModuleStart->setVisibility(GlobalVariable::HiddenVisibility);
     MsandrModuleEnd = new GlobalVariable(
         M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage,
-        0, "_end");
+        nullptr, "_end");
     MsandrModuleEnd->setVisibility(GlobalVariable::HiddenVisibility);
   }
 }
@@ -397,7 +428,7 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
 bool MemorySanitizer::doInitialization(Module &M) {
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   if (!DLP)
-    return false;
+    report_fatal_error("data layout missing");
   DL = &DLP->getDataLayout();
 
   BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
@@ -474,6 +505,11 @@
 VarArgHelper* CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
                                  MemorySanitizerVisitor &Visitor);
 
+unsigned TypeSizeToSizeIndex(unsigned TypeSize) {
+  if (TypeSize <= 8) return 0;
+  return Log2_32_Ceil(TypeSize / 8);
+}
+
 /// This class does all the work for a given function. Store and Load
 /// instructions store and load corresponding shadow and origin
 /// values.
Most instructions propagate shadow from arguments to their @@ -529,9 +565,42 @@ struct MemorySanitizerVisitor : public InstVisitor { return IRB.CreateCall(MS.MsanChainOriginFn, V); } - void materializeStores() { + void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin, + unsigned Alignment, bool AsCall) { + if (isa(Shadow->getType())) { + IRB.CreateAlignedStore(updateOrigin(Origin, IRB), getOriginPtr(Addr, IRB), + Alignment); + } else { + Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); + // TODO(eugenis): handle non-zero constant shadow by inserting an + // unconditional check (can not simply fail compilation as this could + // be in the dead code). + if (isa(ConvertedShadow)) return; + unsigned TypeSizeInBits = + MS.DL->getTypeSizeInBits(ConvertedShadow->getType()); + unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); + if (AsCall && SizeIndex < kNumberOfAccessSizes) { + Value *Fn = MS.MaybeStoreOriginFn[SizeIndex]; + Value *ConvertedShadow2 = IRB.CreateZExt( + ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); + IRB.CreateCall3(Fn, ConvertedShadow2, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + updateOrigin(Origin, IRB)); + } else { + Value *Cmp = IRB.CreateICmpNE( + ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); + Instruction *CheckTerm = SplitBlockAndInsertIfThen( + Cmp, IRB.GetInsertPoint(), false, MS.OriginStoreWeights); + IRBuilder<> IRBNew(CheckTerm); + IRBNew.CreateAlignedStore(updateOrigin(Origin, IRBNew), + getOriginPtr(Addr, IRBNew), Alignment); + } + } + } + + void materializeStores(bool InstrumentWithCalls) { for (size_t i = 0, n = StoreList.size(); i < n; i++) { - StoreInst& I = *dyn_cast(StoreList[i]); + StoreInst &I = *dyn_cast(StoreList[i]); IRBuilder<> IRB(&I); Value *Val = I.getValueOperand(); @@ -540,53 +609,41 @@ struct MemorySanitizerVisitor : public InstVisitor { Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB); StoreInst *NewSI = - IRB.CreateAlignedStore(Shadow, ShadowPtr, I.getAlignment()); + IRB.CreateAlignedStore(Shadow, ShadowPtr, I.getAlignment()); DEBUG(dbgs() << " STORE: " << *NewSI << "\n"); (void)NewSI; - if (ClCheckAccessAddress) - insertShadowCheck(Addr, &I); + if (ClCheckAccessAddress) insertShadowCheck(Addr, &I); - if (I.isAtomic()) - I.setOrdering(addReleaseOrdering(I.getOrdering())); + if (I.isAtomic()) I.setOrdering(addReleaseOrdering(I.getOrdering())); if (MS.TrackOrigins) { unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment()); - if (isa(Shadow->getType())) { - IRB.CreateAlignedStore(updateOrigin(getOrigin(Val), IRB), - getOriginPtr(Addr, IRB), Alignment); - } else { - Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); - - // TODO(eugenis): handle non-zero constant shadow by inserting an - // unconditional check (can not simply fail compilation as this could - // be in the dead code). 
- if (isa(ConvertedShadow)) - continue; - - Value *Cmp = IRB.CreateICmpNE(ConvertedShadow, - getCleanShadow(ConvertedShadow), "_mscmp"); - Instruction *CheckTerm = - SplitBlockAndInsertIfThen(Cmp, &I, false, MS.OriginStoreWeights); - IRBuilder<> IRBNew(CheckTerm); - IRBNew.CreateAlignedStore(updateOrigin(getOrigin(Val), IRBNew), - getOriginPtr(Addr, IRBNew), Alignment); - } + storeOrigin(IRB, Addr, Shadow, getOrigin(Val), Alignment, + InstrumentWithCalls); } } } - void materializeChecks() { - for (size_t i = 0, n = InstrumentationList.size(); i < n; i++) { - Value *Shadow = InstrumentationList[i].Shadow; - Instruction *OrigIns = InstrumentationList[i].OrigIns; - IRBuilder<> IRB(OrigIns); - DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n"); - Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); - DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n"); - // See the comment in materializeStores(). - if (isa(ConvertedShadow)) - continue; + void materializeOneCheck(Instruction *OrigIns, Value *Shadow, Value *Origin, + bool AsCall) { + IRBuilder<> IRB(OrigIns); + DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n"); + Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); + DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n"); + // See the comment in materializeStores(). + if (isa(ConvertedShadow)) return; + unsigned TypeSizeInBits = + MS.DL->getTypeSizeInBits(ConvertedShadow->getType()); + unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); + if (AsCall && SizeIndex < kNumberOfAccessSizes) { + Value *Fn = MS.MaybeWarningFn[SizeIndex]; + Value *ConvertedShadow2 = + IRB.CreateZExt(ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); + IRB.CreateCall2(Fn, ConvertedShadow2, MS.TrackOrigins && Origin + ? Origin + : (Value *)IRB.getInt32(0)); + } else { Value *Cmp = IRB.CreateICmpNE(ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); Instruction *CheckTerm = SplitBlockAndInsertIfThen( @@ -595,14 +652,22 @@ struct MemorySanitizerVisitor : public InstVisitor { IRB.SetInsertPoint(CheckTerm); if (MS.TrackOrigins) { - Value *Origin = InstrumentationList[i].Origin; - IRB.CreateStore(Origin ? (Value*)Origin : (Value*)IRB.getInt32(0), + IRB.CreateStore(Origin ? (Value *)Origin : (Value *)IRB.getInt32(0), MS.OriginTLS); } IRB.CreateCall(MS.WarningFn); IRB.CreateCall(MS.EmptyAsm); DEBUG(dbgs() << " CHECK: " << *Cmp << "\n"); } + } + + void materializeChecks(bool InstrumentWithCalls) { + for (size_t i = 0, n = InstrumentationList.size(); i < n; i++) { + Instruction *OrigIns = InstrumentationList[i].OrigIns; + Value *Shadow = InstrumentationList[i].Shadow; + Value *Origin = InstrumentationList[i].Origin; + materializeOneCheck(OrigIns, Shadow, Origin, InstrumentWithCalls); + } DEBUG(dbgs() << "DONE:\n" << F); } @@ -662,17 +727,15 @@ struct MemorySanitizerVisitor : public InstVisitor { // Iterate all BBs in depth-first order and create shadow instructions // for all instructions (where applicable). // For PHI nodes we create dummy shadow PHIs which will be finalized later. - for (df_iterator DI = df_begin(&F.getEntryBlock()), - DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) { - BasicBlock *BB = *DI; + for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB); - } + // Finalize PHI nodes. for (size_t i = 0, n = ShadowPHINodes.size(); i < n; i++) { PHINode *PN = ShadowPHINodes[i]; PHINode *PNS = cast(getShadow(PN)); - PHINode *PNO = MS.TrackOrigins ? cast(getOrigin(PN)) : 0; + PHINode *PNO = MS.TrackOrigins ? 
cast(getOrigin(PN)) : nullptr; size_t NumValues = PN->getNumIncomingValues(); for (size_t v = 0; v < NumValues; v++) { PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v)); @@ -683,12 +746,16 @@ struct MemorySanitizerVisitor : public InstVisitor { VAHelper->finalizeInstrumentation(); + bool InstrumentWithCalls = ClInstrumentationWithCallThreshold >= 0 && + InstrumentationList.size() + StoreList.size() > + (unsigned)ClInstrumentationWithCallThreshold; + // Delayed instrumentation of StoreInst. // This may add new checks to be inserted later. - materializeStores(); + materializeStores(InstrumentWithCalls); // Insert shadow value checks. - materializeChecks(); + materializeChecks(InstrumentWithCalls); // Wrap indirect calls. materializeIndirectCalls(); @@ -704,7 +771,7 @@ struct MemorySanitizerVisitor : public InstVisitor { /// \brief Compute the shadow type that corresponds to a given Type. Type *getShadowTy(Type *OrigTy) { if (!OrigTy->isSized()) { - return 0; + return nullptr; } // For integer type, shadow is the same as the original type. // This may return weird-sized types like i1. @@ -784,7 +851,7 @@ struct MemorySanitizerVisitor : public InstVisitor { /// \brief Compute the origin address for a given function argument. Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB, int ArgOffset) { - if (!MS.TrackOrigins) return 0; + if (!MS.TrackOrigins) return nullptr; Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy); Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0), @@ -825,7 +892,7 @@ struct MemorySanitizerVisitor : public InstVisitor { Constant *getCleanShadow(Value *V) { Type *ShadowTy = getShadowTy(V); if (!ShadowTy) - return 0; + return nullptr; return Constant::getNullValue(ShadowTy); } @@ -845,7 +912,7 @@ struct MemorySanitizerVisitor : public InstVisitor { Constant *getPoisonedShadow(Value *V) { Type *ShadowTy = getShadowTy(V); if (!ShadowTy) - return 0; + return nullptr; return getPoisonedShadow(ShadowTy); } @@ -936,7 +1003,7 @@ struct MemorySanitizerVisitor : public InstVisitor { /// \brief Get the origin for a value. Value *getOrigin(Value *V) { - if (!MS.TrackOrigins) return 0; + if (!MS.TrackOrigins) return nullptr; if (isa(V) || isa(V)) { Value *Origin = OriginMap[V]; if (!Origin) { @@ -1234,7 +1301,7 @@ struct MemorySanitizerVisitor : public InstVisitor { public: Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB) : - Shadow(0), Origin(0), IRB(IRB), MSV(MSV) {} + Shadow(nullptr), Origin(nullptr), IRB(IRB), MSV(MSV) {} /// \brief Add a pair of shadow and origin values to the mix. Combiner &Add(Value *OpShadow, Value *OpOrigin) { @@ -1265,7 +1332,7 @@ struct MemorySanitizerVisitor : public InstVisitor { /// \brief Add an application value to the mix. Combiner &Add(Value *V) { Value *OpShadow = MSV->getShadow(V); - Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : 0; + Value *OpOrigin = MSV->MS.TrackOrigins ? 
MSV->getOrigin(V) : nullptr; return Add(OpShadow, OpOrigin); } @@ -1480,7 +1547,7 @@ struct MemorySanitizerVisitor : public InstVisitor { void handleSignedRelationalComparison(ICmpInst &I) { Constant *constOp0 = dyn_cast(I.getOperand(0)); Constant *constOp1 = dyn_cast(I.getOperand(1)); - Value* op = NULL; + Value* op = nullptr; CmpInst::Predicate pre = I.getPredicate(); if (constOp0 && constOp0->isNullValue() && (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE)) { @@ -1789,7 +1856,7 @@ struct MemorySanitizerVisitor : public InstVisitor { break; case 1: ConvertOp = I.getArgOperand(0); - CopyOp = NULL; + CopyOp = nullptr; break; default: llvm_unreachable("Cvt intrinsic with unsupported number of arguments."); @@ -1803,7 +1870,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // FIXME: consider propagating shadow of ConvertOp, at least in the case of // int->any conversion. Value *ConvertShadow = getShadow(ConvertOp); - Value *AggShadow = 0; + Value *AggShadow = nullptr; if (ConvertOp->getType()->isVectorTy()) { AggShadow = IRB.CreateExtractElement( ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), 0)); @@ -2055,7 +2122,7 @@ struct MemorySanitizerVisitor : public InstVisitor { continue; } unsigned Size = 0; - Value *Store = 0; + Value *Store = nullptr; // Compute the Shadow for arg even if it is ByVal, because // in that case getShadow() will copy the actual arg shadow to // __msan_param_tls. @@ -2080,7 +2147,7 @@ struct MemorySanitizerVisitor : public InstVisitor { IRB.CreateStore(getOrigin(A), getOriginPtrForArgument(A, IRB, ArgOffset)); (void)Store; - assert(Size != 0 && Store != 0); + assert(Size != 0 && Store != nullptr); DEBUG(dbgs() << " Param:" << *Store << "\n"); ArgOffset += DataLayout::RoundUpAlignment(Size, 8); } @@ -2098,7 +2165,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // Until we have full dynamic coverage, make sure the retval shadow is 0. Value *Base = getShadowPtrForRetval(&I, IRBBefore); IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment); - Instruction *NextInsn = 0; + Instruction *NextInsn = nullptr; if (CS.isCall()) { NextInsn = I.getNextNode(); } else { @@ -2318,7 +2385,8 @@ struct VarArgAMD64Helper : public VarArgHelper { VarArgAMD64Helper(Function &F, MemorySanitizer &MS, MemorySanitizerVisitor &MSV) - : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(0), VAArgOverflowSize(0) { } + : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr), + VAArgOverflowSize(nullptr) {} enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory }; diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 5ffb17c..8fe9bca 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -19,8 +19,6 @@ // The rest is handled by the run-time library. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "tsan" - #include "llvm/Transforms/Instrumentation.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" @@ -46,6 +44,8 @@ using namespace llvm; +#define DEBUG_TYPE "tsan" + static cl::opt ClBlacklistFile("tsan-blacklist", cl::desc("Blacklist file"), cl::Hidden); static cl::opt ClInstrumentMemoryAccesses( @@ -78,7 +78,7 @@ namespace { struct ThreadSanitizer : public FunctionPass { ThreadSanitizer(StringRef BlacklistFile = StringRef()) : FunctionPass(ID), - DL(0), + DL(nullptr), BlacklistFile(BlacklistFile.empty() ? 
ClBlacklistFile : BlacklistFile) { } const char *getPassName() const override; @@ -174,8 +174,8 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { for (int op = AtomicRMWInst::FIRST_BINOP; op <= AtomicRMWInst::LAST_BINOP; ++op) { - TsanAtomicRMW[op][i] = NULL; - const char *NamePart = NULL; + TsanAtomicRMW[op][i] = nullptr; + const char *NamePart = nullptr; if (op == AtomicRMWInst::Xchg) NamePart = "_exchange"; else if (op == AtomicRMWInst::Add) @@ -226,7 +226,7 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { bool ThreadSanitizer::doInitialization(Module &M) { DataLayoutPass *DLP = getAnalysisIfAvailable(); if (!DLP) - return false; + report_fatal_error("data layout missing"); DL = &DLP->getDataLayout(); BL.reset(SpecialCaseList::createOrDie(BlacklistFile)); @@ -518,7 +518,7 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) { if (Idx < 0) return false; Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx]; - if (F == NULL) + if (!F) return false; const size_t ByteSize = 1 << Idx; const size_t BitSize = ByteSize * 8; diff --git a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h index 4eac39d..4098428 100644 --- a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h +++ b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h @@ -43,34 +43,34 @@ public: EPT_RetainAutoreleaseRV }; - ARCRuntimeEntryPoints() : TheModule(0), - AutoreleaseRV(0), - Release(0), - Retain(0), - RetainBlock(0), - Autorelease(0), - StoreStrong(0), - RetainRV(0), - RetainAutorelease(0), - RetainAutoreleaseRV(0) { } + ARCRuntimeEntryPoints() : TheModule(nullptr), + AutoreleaseRV(nullptr), + Release(nullptr), + Retain(nullptr), + RetainBlock(nullptr), + Autorelease(nullptr), + StoreStrong(nullptr), + RetainRV(nullptr), + RetainAutorelease(nullptr), + RetainAutoreleaseRV(nullptr) { } ~ARCRuntimeEntryPoints() { } void Initialize(Module *M) { TheModule = M; - AutoreleaseRV = 0; - Release = 0; - Retain = 0; - RetainBlock = 0; - Autorelease = 0; - StoreStrong = 0; - RetainRV = 0; - RetainAutorelease = 0; - RetainAutoreleaseRV = 0; + AutoreleaseRV = nullptr; + Release = nullptr; + Retain = nullptr; + RetainBlock = nullptr; + Autorelease = nullptr; + StoreStrong = nullptr; + RetainRV = nullptr; + RetainAutorelease = nullptr; + RetainAutoreleaseRV = nullptr; } Constant *get(const EntryPointType entry) { - assert(TheModule != 0 && "Not initialized."); + assert(TheModule != nullptr && "Not initialized."); switch (entry) { case EPT_AutoreleaseRV: diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index 8780359..08c8842 100644 --- a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -20,7 +20,6 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "objc-arc-dependency" #include "ObjCARC.h" #include "DependencyAnalysis.h" #include "ProvenanceAnalysis.h" @@ -29,6 +28,8 @@ using namespace llvm; using namespace llvm::objcarc; +#define DEBUG_TYPE "objc-arc-dependency" + /// Test whether the given instruction can result in a reference count /// modification (positive or negative) for the pointer's object. bool @@ -223,7 +224,7 @@ llvm::objcarc::FindDependencies(DependenceKind Flavor, pred_iterator PI(LocalStartBB), PE(LocalStartBB, false); if (PI == PE) // If we've reached the function entry, produce a null dependence. - DependingInsts.insert(0); + DependingInsts.insert(nullptr); else // Add the predecessors to the worklist. 
do { diff --git a/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp index cb7e4da..1a25391 100644 --- a/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp @@ -24,7 +24,6 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "objc-arc-ap-elim" #include "ObjCARC.h" #include "llvm/ADT/STLExtras.h" #include "llvm/IR/Constants.h" @@ -34,6 +33,8 @@ using namespace llvm; using namespace llvm::objcarc; +#define DEBUG_TYPE "objc-arc-ap-elim" + namespace { /// \brief Autorelease pool elimination. class ObjCARCAPElim : public ModulePass { @@ -93,7 +94,7 @@ bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) { bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { bool Changed = false; - Instruction *Push = 0; + Instruction *Push = nullptr; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { Instruction *Inst = I++; switch (GetBasicInstructionClass(Inst)) { @@ -112,11 +113,11 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { Inst->eraseFromParent(); Push->eraseFromParent(); } - Push = 0; + Push = nullptr; break; case IC_CallOrUser: if (MayAutorelease(ImmutableCallSite(Inst))) - Push = 0; + Push = nullptr; break; default: break; @@ -154,8 +155,8 @@ bool ObjCARCAPElim::runOnModule(Module &M) { for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end(); OI != OE; ++OI) { Value *Op = *OI; - // llvm.global_ctors is an array of pairs where the second members - // are constructor functions. + // llvm.global_ctors is an array of three-field structs where the second + // members are constructor functions. Function *F = dyn_cast(cast(Op)->getOperand(1)); // If the user used a constructor function with the wrong signature and // it got bitcasted or whatever, look the other way. diff --git a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp index d18667b..2c09e70 100644 --- a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp @@ -20,7 +20,6 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "objc-arc-aa" #include "ObjCARC.h" #include "ObjCARCAliasAnalysis.h" #include "llvm/IR/Instruction.h" @@ -28,6 +27,8 @@ #include "llvm/PassAnalysisSupport.h" #include "llvm/PassSupport.h" +#define DEBUG_TYPE "objc-arc-aa" + namespace llvm { class Function; class Value; diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp index 3da5a0e..f48d53d 100644 --- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -26,7 +26,6 @@ // TODO: ObjCARCContract could insert PHI nodes when uses aren't // dominated by single calls. -#define DEBUG_TYPE "objc-arc-contract" #include "ObjCARC.h" #include "ARCRuntimeEntryPoints.h" #include "DependencyAnalysis.h" @@ -40,6 +39,8 @@ using namespace llvm; using namespace llvm::objcarc; +#define DEBUG_TYPE "objc-arc-contract" + STATISTIC(NumPeeps, "Number of calls peephole-optimized"); STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed"); @@ -157,7 +158,7 @@ ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, // Check that there are no instructions between the retain and the autorelease // (such as an autorelease_pop) which may change the count. 
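For context on the FindDependencies calls used below and in the DependencyAnalysis.cpp hunk above: the search walks backwards through predecessor blocks until it finds an instruction that can affect the reference count, recording a null dependence if it falls off the function entry. A rough standalone sketch of that worklist shape (simplified types, illustration only):

    #include <set>
    #include <vector>

    struct Block { std::vector<Block *> Preds; };

    // Depending collects results; nullptr marks the "reached function entry"
    // case, mirroring DependingInsts.insert(nullptr) above.
    static void walkPreds(Block *Start, std::set<Block *> &Visited,
                          std::set<Block *> &Depending) {
      std::vector<Block *> Worklist{Start};
      while (!Worklist.empty()) {
        Block *BB = Worklist.back();
        Worklist.pop_back();
        if (BB->Preds.empty()) { // function entry: null dependence
          Depending.insert(nullptr);
          continue;
        }
        for (Block *Pred : BB->Preds)
          if (Visited.insert(Pred).second) // enqueue each block once
            Worklist.push_back(Pred);
      }
    }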
- CallInst *Retain = 0; + CallInst *Retain = nullptr; if (Class == IC_AutoreleaseRV) FindDependencies(RetainAutoreleaseRVDep, Arg, Autorelease->getParent(), Autorelease, @@ -218,7 +219,7 @@ void ObjCARCContract::ContractRelease(Instruction *Release, BasicBlock::iterator I = Load, End = BB->end(); ++I; AliasAnalysis::Location Loc = AA->getLocation(Load); - StoreInst *Store = 0; + StoreInst *Store = nullptr; bool SawRelease = false; for (; !Store || !SawRelease; ++I) { if (I == End) @@ -300,7 +301,7 @@ bool ObjCARCContract::doInitialization(Module &M) { EP.Initialize(&M); // Initialize RetainRVMarker. - RetainRVMarker = 0; + RetainRVMarker = nullptr; if (NamedMDNode *NMD = M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker")) if (NMD->getNumOperands() == 1) { diff --git a/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/lib/Transforms/ObjCARC/ObjCARCExpand.cpp index 8bec699..bf9fcbb 100644 --- a/lib/Transforms/ObjCARC/ObjCARCExpand.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCExpand.cpp @@ -23,8 +23,6 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "objc-arc-expand" - #include "ObjCARC.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Function.h" @@ -40,6 +38,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#define DEBUG_TYPE "objc-arc-expand" + namespace llvm { class Module; } diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index eed3cb2..dd4dd50 100644 --- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -24,7 +24,6 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "objc-arc-opts" #include "ObjCARC.h" #include "ARCRuntimeEntryPoints.h" #include "DependencyAnalysis.h" @@ -44,6 +43,8 @@ using namespace llvm; using namespace llvm::objcarc; +#define DEBUG_TYPE "objc-arc-opts" + /// \defgroup MiscUtils Miscellaneous utilities that are not ARC specific. /// @{ @@ -156,7 +157,7 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { return FindSingleUseIdentifiedObject( cast(Arg)->getArgOperand(0)); if (!IsObjCIdentifiedObject(Arg)) - return 0; + return nullptr; return Arg; } @@ -165,12 +166,12 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { if (IsObjCIdentifiedObject(Arg)) { for (const User *U : Arg->users()) if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg) - return 0; + return nullptr; return Arg; } - return 0; + return nullptr; } /// This is a wrapper around getUnderlyingObjCPtr along the lines of @@ -373,7 +374,7 @@ namespace { bool CFGHazardAfflicted; RRInfo() : - KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0), + KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(nullptr), CFGHazardAfflicted(false) {} void clear(); @@ -388,7 +389,7 @@ namespace { void RRInfo::clear() { KnownSafe = false; IsTailCallRelease = false; - ReleaseMetadata = 0; + ReleaseMetadata = nullptr; Calls.clear(); ReverseInsertPts.clear(); CFGHazardAfflicted = false; @@ -397,7 +398,7 @@ void RRInfo::clear() { bool RRInfo::Merge(const RRInfo &Other) { // Conservatively merge the ReleaseMetadata information. if (ReleaseMetadata != Other.ReleaseMetadata) - ReleaseMetadata = 0; + ReleaseMetadata = nullptr; // Conservatively merge the boolean state. 
KnownSafe &= Other.KnownSafe; @@ -456,7 +457,7 @@ namespace { } bool IsTrackingImpreciseReleases() const { - return RRI.ReleaseMetadata != 0; + return RRI.ReleaseMetadata != nullptr; } const MDNode *GetReleaseMetadata() const { @@ -818,7 +819,7 @@ ARCAnnotationTargetIdentifier("objc-arc-annotation-target-identifier", /// arc annotation processor tool. If the function is an static MDString *AppendMDNodeToSourcePtr(unsigned NodeId, Value *Ptr) { - MDString *Hash = 0; + MDString *Hash = nullptr; // If pointer is a result of an instruction and it does not have a source // MDNode it, attach a new MDNode onto it. If pointer is a result of @@ -880,7 +881,7 @@ static void AppendMDNodeToInstForPtr(unsigned NodeId, MDString *PtrSourceMDNodeID, Sequence OldSeq, Sequence NewSeq) { - MDNode *Node = 0; + MDNode *Node = nullptr; Value *tmp[3] = {PtrSourceMDNodeID, SequenceToMDString(Inst->getContext(), OldSeq), @@ -916,7 +917,7 @@ static void GenerateARCBBEntranceAnnotation(const char *Name, BasicBlock *BB, Value *PtrName; StringRef Tmp = Ptr->getName(); - if (0 == (PtrName = M->getGlobalVariable(Tmp, true))) { + if (nullptr == (PtrName = M->getGlobalVariable(Tmp, true))) { Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp, Tmp + "_STR"); PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, @@ -925,7 +926,7 @@ static void GenerateARCBBEntranceAnnotation(const char *Name, BasicBlock *BB, Value *S; std::string SeqStr = SequenceToString(Seq); - if (0 == (S = M->getGlobalVariable(SeqStr, true))) { + if (nullptr == (S = M->getGlobalVariable(SeqStr, true))) { Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr, SeqStr + "_STR"); S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, @@ -959,7 +960,7 @@ static void GenerateARCBBTerminatorAnnotation(const char *Name, BasicBlock *BB, Value *PtrName; StringRef Tmp = Ptr->getName(); - if (0 == (PtrName = M->getGlobalVariable(Tmp, true))) { + if (nullptr == (PtrName = M->getGlobalVariable(Tmp, true))) { Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp, Tmp + "_STR"); PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, @@ -968,7 +969,7 @@ static void GenerateARCBBTerminatorAnnotation(const char *Name, BasicBlock *BB, Value *S; std::string SeqStr = SequenceToString(Seq); - if (0 == (S = M->getGlobalVariable(SeqStr, true))) { + if (nullptr == (S = M->getGlobalVariable(SeqStr, true))) { Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr, SeqStr + "_STR"); S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, @@ -1718,7 +1719,7 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, BBState &MyStates) { bool NestingDetected = false; InstructionClass Class = GetInstructionClass(Inst); - const Value *Arg = 0; + const Value *Arg = nullptr; DEBUG(dbgs() << "Class: " << Class << "\n"); @@ -1974,7 +1975,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, BBState &MyStates) { bool NestingDetected = false; InstructionClass Class = GetInstructionClass(Inst); - const Value *Arg = 0; + const Value *Arg = nullptr; switch (Class) { case IC_RetainBlock: @@ -2026,7 +2027,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, switch (OldSeq) { case S_Retain: case S_CanRelease: - if (OldSeq == S_Retain || ReleaseMetadata != 0) + if (OldSeq == S_Retain || ReleaseMetadata != nullptr) S.ClearReverseInsertPts(); // FALL THROUGH case S_Use: @@ -2432,7 +2433,7 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap } else { if (ReleasesToMove.ReleaseMetadata != 
NewRetainReleaseRRI.ReleaseMetadata) - ReleasesToMove.ReleaseMetadata = 0; + ReleasesToMove.ReleaseMetadata = nullptr; if (ReleasesToMove.IsTailCallRelease != NewRetainReleaseRRI.IsTailCallRelease) ReleasesToMove.IsTailCallRelease = false; @@ -2884,7 +2885,7 @@ FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB, FindDependencies(CanChangeRetainCount, Arg, BB, Autorelease, DepInsts, Visited, PA); if (DepInsts.size() != 1) - return 0; + return nullptr; CallInst *Retain = dyn_cast_or_null(*DepInsts.begin()); @@ -2893,7 +2894,7 @@ FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB, if (!Retain || !IsRetain(GetBasicInstructionClass(Retain)) || GetObjCArg(Retain) != Arg) { - return 0; + return nullptr; } return Retain; @@ -2911,17 +2912,17 @@ FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB, FindDependencies(NeedsPositiveRetainCount, Arg, BB, Ret, DepInsts, V, PA); if (DepInsts.size() != 1) - return 0; + return nullptr; CallInst *Autorelease = dyn_cast_or_null(*DepInsts.begin()); if (!Autorelease) - return 0; + return nullptr; InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease); if (!IsAutorelease(AutoreleaseClass)) - return 0; + return nullptr; if (GetObjCArg(Autorelease) != Arg) - return 0; + return nullptr; return Autorelease; } diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index fa8b598..1a3a4aa 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "adce" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" @@ -28,6 +27,8 @@ #include "llvm/Pass.h" using namespace llvm; +#define DEBUG_TYPE "adce" + STATISTIC(NumRemoved, "Number of instructions removed"); namespace { diff --git a/lib/Transforms/Scalar/Android.mk b/lib/Transforms/Scalar/Android.mk index 3894f93..079cc86 100644 --- a/lib/Transforms/Scalar/Android.mk +++ b/lib/Transforms/Scalar/Android.mk @@ -32,6 +32,7 @@ transforms_scalar_SRC_FILES := \ Scalar.cpp \ Scalarizer.cpp \ ScalarReplAggregates.cpp \ + SeparateConstOffsetFromGEP.cpp \ SimplifyCFGPass.cpp \ Sink.cpp \ StructurizeCFG.cpp \ @@ -60,11 +61,6 @@ include $(CLEAR_VARS) LOCAL_SRC_FILES := $(transforms_scalar_SRC_FILES) LOCAL_MODULE:= libLLVMScalarOpts -# Override the default optimization level to work around a SIGSEGV -# on x86 target builds for SROA.cpp. 
-# Bug: 8047767 -LOCAL_CFLAGS_x86 += -O1 - LOCAL_MODULE_TAGS := optional include $(LLVM_DEVICE_BUILD_MK) diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 27434c1..3ad1488 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -5,19 +5,19 @@ add_llvm_library(LLVMScalarOpts CorrelatedValuePropagation.cpp DCE.cpp DeadStoreElimination.cpp - Scalarizer.cpp EarlyCSE.cpp - GlobalMerge.cpp + FlattenCFGPass.cpp GVN.cpp + GlobalMerge.cpp IndVarSimplify.cpp JumpThreading.cpp LICM.cpp LoopDeletion.cpp LoopIdiomRecognize.cpp LoopInstSimplify.cpp + LoopRerollPass.cpp LoopRotation.cpp LoopStrengthReduce.cpp - LoopRerollPass.cpp LoopUnrollPass.cpp LoopUnswitch.cpp LowerAtomic.cpp @@ -25,13 +25,14 @@ add_llvm_library(LLVMScalarOpts PartiallyInlineLibCalls.cpp Reassociate.cpp Reg2Mem.cpp - SampleProfile.cpp SCCP.cpp SROA.cpp + SampleProfile.cpp Scalar.cpp ScalarReplAggregates.cpp + Scalarizer.cpp + SeparateConstOffsetFromGEP.cpp SimplifyCFGPass.cpp - FlattenCFGPass.cpp Sink.cpp StructurizeCFG.cpp TailRecursionElimination.cpp diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index 57a1521..763d02b 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -33,7 +33,6 @@ // %0 = load i64* inttoptr (i64 big_constant to i64*) //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "consthoist" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -44,9 +43,12 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include using namespace llvm; +#define DEBUG_TYPE "consthoist" + STATISTIC(NumConstantsHoisted, "Number of constants hoisted"); STATISTIC(NumConstantsRebased, "Number of constants rebased"); @@ -117,7 +119,8 @@ class ConstantHoisting : public FunctionPass { SmallVector ConstantVec; public: static char ID; // Pass identification, replacement for typeid - ConstantHoisting() : FunctionPass(ID), TTI(0), DT(0), Entry(0) { + ConstantHoisting() : FunctionPass(ID), TTI(nullptr), DT(nullptr), + Entry(nullptr) { initializeConstantHoistingPass(*PassRegistry::getPassRegistry()); } @@ -206,7 +209,16 @@ bool ConstantHoisting::runOnFunction(Function &Fn) { /// \brief Find the constant materialization insertion point. Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst, unsigned Idx) const { - // The simple and common case. + // If the operand is a cast instruction, then we have to materialize the + // constant before the cast instruction. + if (Idx != ~0U) { + Value *Opnd = Inst->getOperand(Idx); + if (auto CastInst = dyn_cast(Opnd)) + if (CastInst->isCast()) + return CastInst; + } + + // The simple and common case. This also includes constant expressions. if (!isa(Inst) && !isa(Inst)) return Inst; @@ -228,7 +240,7 @@ findConstantInsertionPoint(const ConstantInfo &ConstInfo) const { SmallPtrSet BBs; for (auto const &RCI : ConstInfo.RebasedConstants) for (auto const &U : RCI.Uses) - BBs.insert(U.Inst->getParent()); + BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent()); if (BBs.count(Entry)) return &Entry->front(); @@ -487,8 +499,8 @@ void ConstantHoisting::emitBaseConstants(Instruction *Base, Constant *Offset, ClonedCastInst->insertAfter(CastInst); // Use the same debug location as the original cast instruction. 
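The ConstantHoisting hunks above and below adjust where a hoisted constant is materialized; the transformation itself rewrites each expensive constant Ci as one materialized Base plus a cheap offset Ci - Base. A toy sketch of that rebasing arithmetic (hypothetical types, not the pass's real data structures):

    #include <cstdint>
    #include <vector>

    struct RebasedSet {
      uint64_t Base;                // materialized once
      std::vector<int64_t> Offsets; // one cheap add per original constant
    };

    // Assumes Consts is non-empty; any element could serve as the base.
    static RebasedSet rebase(const std::vector<uint64_t> &Consts) {
      RebasedSet R{Consts.front(), {}};
      for (uint64_t C : Consts)
        R.Offsets.push_back(static_cast<int64_t>(C - R.Base));
      return R;
    }

The win is that each add folds into an addressing mode or costs a single instruction, while the expensive materialization happens once.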
ClonedCastInst->setDebugLoc(CastInst->getDebugLoc()); - DEBUG(dbgs() << "Clone instruction: " << *ClonedCastInst << '\n' - << "To : " << *CastInst << '\n'); + DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n' + << "To : " << *ClonedCastInst << '\n'); } DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n'); diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index 7045b36..dd51ce1 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -18,7 +18,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "constprop" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" @@ -31,6 +30,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "constprop" + STATISTIC(NumInstKilled, "Number of instructions killed"); namespace { @@ -68,7 +69,7 @@ bool ConstantPropagation::runOnFunction(Function &F) { } bool Changed = false; DataLayoutPass *DLP = getAnalysisIfAvailable(); - const DataLayout *DL = DLP ? &DLP->getDataLayout() : 0; + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; TargetLibraryInfo *TLI = &getAnalysis(); while (!WorkList.empty()) { diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 0490767..0829462 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "correlated-value-propagation" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -26,6 +25,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "correlated-value-propagation" + STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumSelects, "Number of selects propagated"); STATISTIC(NumMemAccess, "Number of memory access targets propagated"); @@ -138,7 +139,7 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) { } bool CorrelatedValuePropagation::processMemAccess(Instruction *I) { - Value *Pointer = 0; + Value *Pointer = nullptr; if (LoadInst *L = dyn_cast(I)) Pointer = L->getPointerOperand(); else diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp index 8377fd9..99fac75 100644 --- a/lib/Transforms/Scalar/DCE.cpp +++ b/lib/Transforms/Scalar/DCE.cpp @@ -16,7 +16,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "dce" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/InstIterator.h" @@ -26,6 +25,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "dce" + STATISTIC(DIEEliminated, "Number of insts removed by DIE pass"); STATISTIC(DCEEliminated, "Number of insts removed"); diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index f54c00d..3af8ee7 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -15,7 +15,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "dse" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -38,6 +37,8 @@ #include "llvm/Transforms/Utils/Local.h" using 
namespace llvm; +#define DEBUG_TYPE "dse" + STATISTIC(NumFastStores, "Number of stores deleted"); STATISTIC(NumFastOther , "Number of other instrs removed"); @@ -49,7 +50,7 @@ namespace { const TargetLibraryInfo *TLI; static char ID; // Pass identification, replacement for typeid - DSE() : FunctionPass(ID), AA(0), MD(0), DT(0) { + DSE() : FunctionPass(ID), AA(nullptr), MD(nullptr), DT(nullptr) { initializeDSEPass(*PassRegistry::getPassRegistry()); } @@ -69,7 +70,7 @@ namespace { if (DT->isReachableFromEntry(I)) Changed |= runOnBasicBlock(*I); - AA = 0; MD = 0; DT = 0; + AA = nullptr; MD = nullptr; DT = nullptr; return Changed; } @@ -111,9 +112,9 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } /// If ValueSet is non-null, remove any deleted instructions from it as well. /// static void DeleteDeadInstruction(Instruction *I, - MemoryDependenceAnalysis &MD, - const TargetLibraryInfo *TLI, - SmallSetVector *ValueSet = 0) { + MemoryDependenceAnalysis &MD, + const TargetLibraryInfo *TLI, + SmallSetVector *ValueSet = nullptr) { SmallVector NowDeadInsts; NowDeadInsts.push_back(I); @@ -131,7 +132,7 @@ static void DeleteDeadInstruction(Instruction *I, for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { Value *Op = DeadInst->getOperand(op); - DeadInst->setOperand(op, 0); + DeadInst->setOperand(op, nullptr); // If this operand just became dead, add it to the NowDeadInsts list. if (!Op->use_empty()) continue; @@ -203,13 +204,13 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // If we don't have target data around, an unknown size in Location means // that we should use the size of the pointee type. This isn't valid for // memset/memcpy, which writes more than an i8. - if (Loc.Size == AliasAnalysis::UnknownSize && DL == 0) + if (Loc.Size == AliasAnalysis::UnknownSize && DL == nullptr) return AliasAnalysis::Location(); return Loc; } IntrinsicInst *II = dyn_cast(Inst); - if (II == 0) return AliasAnalysis::Location(); + if (!II) return AliasAnalysis::Location(); switch (II->getIntrinsicID()) { default: return AliasAnalysis::Location(); // Unhandled intrinsic. @@ -217,7 +218,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // If we don't have target data around, an unknown size in Location means // that we should use the size of the pointee type. This isn't valid for // init.trampoline, which writes more than an i8. - if (DL == 0) return AliasAnalysis::Location(); + if (!DL) return AliasAnalysis::Location(); // FIXME: We don't know the size of the trampoline, so we can't really // handle it here. @@ -359,7 +360,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // If we have no DataLayout information around, then the size of the store // is inferrable from the pointee type. If they are the same type, then // we know that the store is safe. - if (DL == 0 && Later.Ptr->getType() == Earlier.Ptr->getType()) + if (DL == nullptr && Later.Ptr->getType() == Earlier.Ptr->getType()) return OverwriteComplete; return OverwriteUnknown; @@ -373,7 +374,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // Otherwise, we have to have size information, and the later store has to be // larger than the earlier one. 
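Once sizes and offsets are known, the isOverwrite logic that continues below reduces to an interval-cover test. A self-contained sketch of the core check (the function name is mine, not the pass's):

    #include <cstdint>

    // True when the later store [LaterOff, LaterOff + LaterSize) completely
    // covers the earlier store [EarlierOff, EarlierOff + EarlierSize).
    static bool completelyOverwrites(int64_t LaterOff, uint64_t LaterSize,
                                     int64_t EarlierOff, uint64_t EarlierSize) {
      return LaterOff <= EarlierOff &&
             LaterOff + static_cast<int64_t>(LaterSize) >=
                 EarlierOff + static_cast<int64_t>(EarlierSize);
    }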
if (Later.Size == AliasAnalysis::UnknownSize || - Earlier.Size == AliasAnalysis::UnknownSize || DL == 0) + Earlier.Size == AliasAnalysis::UnknownSize || DL == nullptr) return OverwriteUnknown; // Check to see if the later store is to the entire object (either a global, @@ -461,7 +462,7 @@ static bool isPossibleSelfRead(Instruction *Inst, // Self reads can only happen for instructions that read memory. Get the // location read. AliasAnalysis::Location InstReadLoc = getLocForRead(Inst, AA); - if (InstReadLoc.Ptr == 0) return false; // Not a reading instruction. + if (!InstReadLoc.Ptr) return false; // Not a reading instruction. // If the read and written loc obviously don't alias, it isn't a read. if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false; @@ -528,7 +529,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { DeleteDeadInstruction(SI, *MD, TLI); - if (NextInst == 0) // Next instruction deleted. + if (!NextInst) // Next instruction deleted. BBI = BB.begin(); else if (BBI != BB.begin()) // Revisit this instruction if possible. --BBI; @@ -543,7 +544,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA); // If we didn't get a useful location, fail. - if (Loc.Ptr == 0) + if (!Loc.Ptr) continue; while (InstDep.isDef() || InstDep.isClobber()) { @@ -557,7 +558,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { Instruction *DepWrite = InstDep.getInst(); AliasAnalysis::Location DepLoc = getLocForWrite(DepWrite, *AA); // If we didn't get a useful location, or if it isn't a size, bail out. - if (DepLoc.Ptr == 0) + if (!DepLoc.Ptr) break; // If we find a write that is a) removable (i.e., non-volatile), b) is diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index af2c3d1..735f5c1 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "early-cse" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" @@ -29,6 +28,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "early-cse" + STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd"); STATISTIC(NumCSE, "Number of instructions CSE'd"); STATISTIC(NumCSELoad, "Number of load instructions CSE'd"); @@ -207,7 +208,7 @@ namespace { return false; CallInst *CI = dyn_cast(Inst); - if (CI == 0 || !CI->onlyReadsMemory()) + if (!CI || !CI->onlyReadsMemory()) return false; return true; } @@ -405,14 +406,14 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // have invalidated the live-out memory values of our parent value. For now, // just be conservative and invalidate memory if this block has multiple // predecessors. - if (BB->getSinglePredecessor() == 0) + if (!BB->getSinglePredecessor()) ++CurrentGeneration; /// LastStore - Keep track of the last non-volatile store that we saw... for /// as long as there in no instruction that reads memory. If we see a store /// to the same location, we delete the dead store. This zaps trivial dead /// stores which can occur in bitfield code among other things. - StoreInst *LastStore = 0; + StoreInst *LastStore = nullptr; bool Changed = false; @@ -462,7 +463,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (LoadInst *LI = dyn_cast(Inst)) { // Ignore volatile loads. 
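The generation counter threaded through this EarlyCSE code is the pass's entire memory model: every event that might write memory bumps CurrentGeneration, and a cached load or call result is reusable only if its recorded generation still matches. A minimal sketch of that scheme (simplified types, illustration only):

    #include <cstdint>
    #include <map>
    #include <utility>

    struct AvailableLoads {
      uint64_t CurrentGeneration = 0;
      std::map<const void *, std::pair<int, uint64_t>> Table; // addr -> (value, gen)

      void invalidate() { ++CurrentGeneration; } // any may-write event
      void record(const void *Addr, int V) {
        Table[Addr] = {V, CurrentGeneration};
      }
      bool lookup(const void *Addr, int &V) const {
        auto It = Table.find(Addr);
        if (It == Table.end() || It->second.second != CurrentGeneration)
          return false; // stale: some write intervened
        V = It->second.first;
        return true;
      }
    };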
if (!LI->isSimple()) { - LastStore = 0; + LastStore = nullptr; continue; } @@ -470,7 +471,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // generation, replace this instruction. std::pair InVal = AvailableLoads->lookup(Inst->getOperand(0)); - if (InVal.first != 0 && InVal.second == CurrentGeneration) { + if (InVal.first != nullptr && InVal.second == CurrentGeneration) { DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst << " to: " << *InVal.first << '\n'); if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); @@ -483,20 +484,20 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // Otherwise, remember that we have this instruction. AvailableLoads->insert(Inst->getOperand(0), std::pair(Inst, CurrentGeneration)); - LastStore = 0; + LastStore = nullptr; continue; } // If this instruction may read from memory, forget LastStore. if (Inst->mayReadFromMemory()) - LastStore = 0; + LastStore = nullptr; // If this is a read-only call, process it. if (CallValue::canHandle(Inst)) { // If we have an available version of this call, and if it is the right // generation, replace this instruction. std::pair InVal = AvailableCalls->lookup(Inst); - if (InVal.first != 0 && InVal.second == CurrentGeneration) { + if (InVal.first != nullptr && InVal.second == CurrentGeneration) { DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst << " to: " << *InVal.first << '\n'); if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); @@ -528,7 +529,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LastStore->eraseFromParent(); Changed = true; ++NumDSE; - LastStore = 0; + LastStore = nullptr; continue; } @@ -558,7 +559,7 @@ bool EarlyCSE::runOnFunction(Function &F) { std::vector nodesToProcess; DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? 
&DLP->getDataLayout() : nullptr; TLI = &getAnalysis(); DT = &getAnalysis().getDomTree(); diff --git a/lib/Transforms/Scalar/FlattenCFGPass.cpp b/lib/Transforms/Scalar/FlattenCFGPass.cpp index e7f2564..0430c18 100644 --- a/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "flattencfg" #include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/CFG.h" @@ -19,6 +18,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "flattencfg" + namespace { struct FlattenCFGPass : public FunctionPass { static char ID; // Pass identification, replacement for typeid diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 33c387c..6d07ddd 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -15,11 +15,11 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "gvn" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -50,6 +50,8 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "gvn" + STATISTIC(NumGVNInstr, "Number of instructions deleted"); STATISTIC(NumGVNLoad, "Number of loads deleted"); STATISTIC(NumGVNPRE, "Number of instructions PRE'd"); @@ -213,13 +215,13 @@ Expression ValueTable::create_cmp_expression(unsigned Opcode, } Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) { - assert(EI != 0 && "Not an ExtractValueInst?"); + assert(EI && "Not an ExtractValueInst?"); Expression e; e.type = EI->getType(); e.opcode = 0; IntrinsicInst *I = dyn_cast(EI->getAggregateOperand()); - if (I != 0 && EI->getNumIndices() == 1 && *EI->idx_begin() == 0 ) { + if (I != nullptr && EI->getNumIndices() == 1 && *EI->idx_begin() == 0 ) { // EI might be an extract from one of our recognised intrinsics. If it // is we'll synthesize a semantically equivalent expression instead on // an extract value expression. @@ -327,7 +329,7 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { const MemoryDependenceAnalysis::NonLocalDepInfo &deps = MD->getNonLocalCallDependency(CallSite(C)); // FIXME: Move the checking logic to MemDep! - CallInst* cdep = 0; + CallInst* cdep = nullptr; // Check to see if we have a single dominating call instruction that is // identical to C. @@ -338,8 +340,8 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { // We don't handle non-definitions. If we already have a call, reject // instruction dependencies. 
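The lookup_or_add machinery in this file assigns structurally identical expressions identical numbers, which is what turns the redundancy tests around here into table lookups. A minimal analogue for binary expressions (illustration only; the real ValueTable hashes a richer Expression record):

    #include <cstdint>
    #include <map>
    #include <tuple>

    struct MiniValueTable {
      std::map<std::tuple<unsigned, uint32_t, uint32_t>, uint32_t> Table;
      uint32_t NextNum = 1;

      // Same (opcode, lhs, rhs) triple -> same value number.
      uint32_t lookupOrAdd(unsigned Opcode, uint32_t L, uint32_t R) {
        auto Key = std::make_tuple(Opcode, L, R);
        auto It = Table.find(Key);
        if (It != Table.end())
          return It->second;
        return Table[Key] = NextNum++;
      }
    };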
- if (!I->getResult().isDef() || cdep != 0) { - cdep = 0; + if (!I->getResult().isDef() || cdep != nullptr) { + cdep = nullptr; break; } @@ -350,7 +352,7 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { continue; } - cdep = 0; + cdep = nullptr; break; } @@ -551,7 +553,7 @@ namespace { static AvailableValueInBlock getUndef(BasicBlock *BB) { AvailableValueInBlock Res; Res.BB = BB; - Res.Val.setPointer(0); + Res.Val.setPointer(nullptr); Res.Val.setInt(UndefVal); Res.Offset = 0; return Res; @@ -611,7 +613,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid explicit GVN(bool noloads = false) - : FunctionPass(ID), NoLoads(noloads), MD(0) { + : FunctionPass(ID), NoLoads(noloads), MD(nullptr) { initializeGVNPass(*PassRegistry::getPassRegistry()); } @@ -649,7 +651,7 @@ namespace { /// removeFromLeaderTable - Scan the list of values corresponding to a given /// value number, and remove the given instruction if encountered. void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) { - LeaderTableEntry* Prev = 0; + LeaderTableEntry* Prev = nullptr; LeaderTableEntry* Curr = &LeaderTable[N]; while (Curr->Val != I || Curr->BB != BB) { @@ -661,8 +663,8 @@ namespace { Prev->Next = Curr->Next; } else { if (!Curr->Next) { - Curr->Val = 0; - Curr->BB = 0; + Curr->Val = nullptr; + Curr->BB = nullptr; } else { LeaderTableEntry* Next = Curr->Next; Curr->Val = Next->Val; @@ -855,7 +857,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Instruction *InsertPt, const DataLayout &DL) { if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL)) - return 0; + return nullptr; // If this is already the right type, just return it. Type *StoredValTy = StoredVal->getType(); @@ -1060,7 +1062,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, const DataLayout &DL) { // If the mem operation is a non-constant size, we can't handle it. ConstantInt *SizeCst = dyn_cast(MI->getLength()); - if (SizeCst == 0) return -1; + if (!SizeCst) return -1; uint64_t MemSizeInBits = SizeCst->getZExtValue()*8; // If this is memset, we just need to see if the offset is valid in the size @@ -1075,10 +1077,10 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, MemTransferInst *MTI = cast(MI); Constant *Src = dyn_cast(MTI->getSource()); - if (Src == 0) return -1; + if (!Src) return -1; GlobalVariable *GV = dyn_cast(GetUnderlyingObject(Src, &DL)); - if (GV == 0 || !GV->isConstant()) return -1; + if (!GV || !GV->isConstant()) return -1; // See if the access is within the bounds of the transfer. int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, @@ -1420,8 +1422,7 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, // If this is a clobber and L is the first instruction in its block, then // we have the first instruction in the entry block. if (DepLI != LI && Address && DL) { - int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(), - LI->getPointerOperand(), + int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, *DL); if (Offset != -1) { @@ -1469,8 +1470,8 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, if (S->getValueOperand()->getType() != LI->getType()) { // If the stored value is larger or equal to the loaded value, we can // reuse it. 
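What CanCoerceMustAliasedValueToLoad permits, informally: a store of a wider value can satisfy a narrower load of the same address by reinterpreting the low-order bytes. A sketch for the 64-to-32-bit case (assumes a little-endian host; big-endian would need a shift first):

    #include <cstdint>
    #include <cstring>

    // Serve a 32-bit load from a previously stored 64-bit value.
    static uint32_t coerceLoad32(uint64_t Stored) {
      uint32_t Loaded;
      std::memcpy(&Loaded, &Stored, sizeof(Loaded)); // low half on little-endian
      return Loaded;
    }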
- if (DL == 0 || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(), - LI->getType(), *DL)) { + if (!DL || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(), + LI->getType(), *DL)) { UnavailableBlocks.push_back(DepBB); continue; } @@ -1486,7 +1487,7 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, if (LD->getType() != LI->getType()) { // If the stored value is larger or equal to the loaded value, we can // reuse it. - if (DL == 0 || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*DL)){ + if (!DL || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*DL)) { UnavailableBlocks.push_back(DepBB); continue; } @@ -1539,7 +1540,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Check to see how many predecessors have the loaded value fully // available. - DenseMap PredLoads; + MapVector PredLoads; DenseMap FullyAvailableBlocks; for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) FullyAvailableBlocks[ValuesPerBlock[i].BB] = true; @@ -1553,7 +1554,6 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { continue; } - PredLoads[Pred] = 0; if (Pred->getTerminator()->getNumSuccessors() != 1) { if (isa(Pred->getTerminator())) { @@ -1570,11 +1570,14 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, } CriticalEdgePred.push_back(Pred); + } else { + // Only add the predecessors that will not be split for now. + PredLoads[Pred] = nullptr; } } // Decide whether PRE is profitable for this load. - unsigned NumUnavailablePreds = PredLoads.size(); + unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size(); assert(NumUnavailablePreds != 0 && "Fully available value should already be eliminated!"); @@ -1586,12 +1589,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, return false; // Split critical edges, and update the unavailable predecessors accordingly. - for (SmallVectorImpl::iterator I = CriticalEdgePred.begin(), - E = CriticalEdgePred.end(); I != E; I++) { - BasicBlock *OrigPred = *I; + for (BasicBlock *OrigPred : CriticalEdgePred) { BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB); - PredLoads.erase(OrigPred); - PredLoads[NewPred] = 0; + assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!"); + PredLoads[NewPred] = nullptr; DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->" << LoadBB->getName() << '\n'); } @@ -1599,9 +1600,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Check if the load can safely be moved to all the unavailable predecessors. bool CanDoPRE = true; SmallVector NewInsts; - for (DenseMap::iterator I = PredLoads.begin(), - E = PredLoads.end(); I != E; ++I) { - BasicBlock *UnavailablePred = I->first; + for (auto &PredLoad : PredLoads) { + BasicBlock *UnavailablePred = PredLoad.first; // Do PHI translation to get its value in the predecessor if necessary. The // returned pointer (if non-null) is guaranteed to dominate UnavailablePred. @@ -1610,20 +1610,20 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // the load on the pred (?!?), so we can insert code to materialize the // pointer if it is not available. 
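The DenseMap-to-MapVector change for PredLoads in this function is about determinism: hashed iteration order depends on pointer values, so PRE could emit its loads in a different order from run to run. A container that iterates in insertion order removes that instability; a minimal analogue (illustration only, not LLVM's MapVector):

    #include <cstddef>
    #include <map>
    #include <utility>
    #include <vector>

    template <typename K, typename V> struct OrderedMap {
      std::map<K, std::size_t> Index;
      std::vector<std::pair<K, V>> Data; // iteration follows insertion order

      V &operator[](const K &Key) {
        auto It = Index.find(Key);
        if (It != Index.end())
          return Data[It->second].second;
        Index.emplace(Key, Data.size());
        Data.emplace_back(Key, V());
        return Data.back().second;
      }
    };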
PHITransAddr Address(LI->getPointerOperand(), DL); - Value *LoadPtr = 0; + Value *LoadPtr = nullptr; LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT, NewInsts); // If we couldn't find or insert a computation of this phi translated value, // we fail PRE. - if (LoadPtr == 0) { + if (!LoadPtr) { DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: " << *LI->getPointerOperand() << "\n"); CanDoPRE = false; break; } - I->second = LoadPtr; + PredLoad.second = LoadPtr; } if (!CanDoPRE) { @@ -1632,8 +1632,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (MD) MD->removeInstruction(I); I->eraseFromParent(); } - // HINT:Don't revert the edge-splitting as following transformation may - // also need to split these critial edges. + // HINT: Don't revert the edge-splitting as following transformation may + // also need to split these critical edges. return !CriticalEdgePred.empty(); } @@ -1654,10 +1654,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, VN.lookup_or_add(NewInsts[i]); } - for (DenseMap::iterator I = PredLoads.begin(), - E = PredLoads.end(); I != E; ++I) { - BasicBlock *UnavailablePred = I->first; - Value *LoadPtr = I->second; + for (const auto &PredLoad : PredLoads) { + BasicBlock *UnavailablePred = PredLoad.first; + Value *LoadPtr = PredLoad.second; Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false, LI->getAlignment(), @@ -1776,7 +1775,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { MDNode *ReplMD = Metadata[i].second; switch(Kind) { default: - ReplInst->setMetadata(Kind, NULL); // Remove unknown metadata + ReplInst->setMetadata(Kind, nullptr); // Remove unknown metadata break; case LLVMContext::MD_dbg: llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg"); @@ -1832,7 +1831,7 @@ bool GVN::processLoad(LoadInst *L) { // a common base + constant offset, and if the previous store (or memset) // completely covers this load. This sort of thing can happen in bitfield // access code. - Value *AvailVal = 0; + Value *AvailVal = nullptr; if (StoreInst *DepSI = dyn_cast(Dep.getInst())) { int Offset = AnalyzeLoadFromClobberingStore(L->getType(), L->getPointerOperand(), @@ -1920,7 +1919,7 @@ bool GVN::processLoad(LoadInst *L) { if (DL) { StoredVal = CoerceAvailableValueToLoadType(StoredVal, L->getType(), L, *DL); - if (StoredVal == 0) + if (!StoredVal) return false; DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal @@ -1949,7 +1948,7 @@ bool GVN::processLoad(LoadInst *L) { if (DL) { AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), L, *DL); - if (AvailableVal == 0) + if (!AvailableVal) return false; DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal @@ -1999,9 +1998,9 @@ bool GVN::processLoad(LoadInst *L) { // a few comparisons of DFS numbers. 
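A small but load-bearing detail of the range-for rewrite above: the element must be bound by reference, as in for (auto &PredLoad : PredLoads), for PredLoad.second = LoadPtr to write into the map rather than into a temporary copy. In isolation:

    #include <utility>
    #include <vector>

    void fillAll(std::vector<std::pair<int, int>> &M) {
      for (auto &KV : M)    // by reference: the write lands in the container
        KV.second = 42;
      // for (auto KV : M) KV.second = 42;  // by value: mutates a copy, no effect
    }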
Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) { LeaderTableEntry Vals = LeaderTable[num]; - if (!Vals.Val) return 0; + if (!Vals.Val) return nullptr; - Value *Val = 0; + Value *Val = nullptr; if (DT->dominates(Vals.BB, BB)) { Val = Vals.Val; if (isa(Val)) return Val; @@ -2052,7 +2051,7 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, const BasicBlock *Src = E.getStart(); assert((!Pred || Pred == Src) && "No edge between these basic blocks!"); (void)Src; - return Pred != 0; + return Pred != nullptr; } /// propagateEquality - The given values are known to be equal in every block @@ -2296,7 +2295,7 @@ bool GVN::processInstruction(Instruction *I) { // Perform fast-path value-number based elimination of values inherited from // dominators. Value *repl = findLeader(I->getParent(), Num); - if (repl == 0) { + if (!repl) { // Failure, just remember this instance for future use. addToLeaderTable(Num, I, I->getParent()); return false; @@ -2319,7 +2318,7 @@ bool GVN::runOnFunction(Function& F) { MD = &getAnalysis(); DT = &getAnalysis().getDomTree(); DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis(); VN.setAliasAnalysis(&getAnalysis()); VN.setMemDep(MD); @@ -2421,10 +2420,7 @@ bool GVN::processBlock(BasicBlock *BB) { bool GVN::performPRE(Function &F) { bool Changed = false; SmallVector, 8> predMap; - for (df_iterator DI = df_begin(&F.getEntryBlock()), - DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) { - BasicBlock *CurrentBlock = *DI; - + for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) { // Nothing to PRE in the entry block. if (CurrentBlock == &F.getEntryBlock()) continue; @@ -2464,7 +2460,7 @@ bool GVN::performPRE(Function &F) { // more complicated to get right. unsigned NumWith = 0; unsigned NumWithout = 0; - BasicBlock *PREPred = 0; + BasicBlock *PREPred = nullptr; predMap.clear(); for (pred_iterator PI = pred_begin(CurrentBlock), @@ -2482,8 +2478,8 @@ bool GVN::performPRE(Function &F) { } Value* predV = findLeader(P, ValNo); - if (predV == 0) { - predMap.push_back(std::make_pair(static_cast(0), P)); + if (!predV) { + predMap.push_back(std::make_pair(static_cast(nullptr), P)); PREPred = P; ++NumWithout; } else if (predV == CurInst) { @@ -2637,9 +2633,8 @@ bool GVN::iterateOnFunction(Function &F) { // std::vector BBVect; BBVect.reserve(256); - for (df_iterator DI = df_begin(DT->getRootNode()), - DE = df_end(DT->getRootNode()); DI != DE; ++DI) - BBVect.push_back(DI->getBlock()); + for (DomTreeNode *x : depth_first(DT->getRootNode())) + BBVect.push_back(x->getBlock()); for (std::vector::iterator I = BBVect.begin(), E = BBVect.end(); I != E; I++) diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp index 8ffd64b..990d067 100644 --- a/lib/Transforms/Scalar/GlobalMerge.cpp +++ b/lib/Transforms/Scalar/GlobalMerge.cpp @@ -51,7 +51,6 @@ // note that we saved 2 registers here almostly "for free". 
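The depth_first(...) loops above lean on a tiny adaptor that wraps a begin/end iterator pair so range-based for can consume it; depth_first(G) is essentially make_range(df_begin(G), df_end(G)). A minimal sketch of that adaptor (the real ones live in llvm/ADT/iterator_range.h and llvm/ADT/DepthFirstIterator.h):

    template <typename IteratorT> class iterator_range {
      IteratorT Begin, End;
    public:
      iterator_range(IteratorT B, IteratorT E) : Begin(B), End(E) {}
      IteratorT begin() const { return Begin; }
      IteratorT end() const { return End; }
    };

    template <typename IteratorT>
    iterator_range<IteratorT> make_range(IteratorT B, IteratorT E) {
      return iterator_range<IteratorT>(B, E);
    }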
// ===---------------------------------------------------------------------===// -#define DEBUG_TYPE "global-merge" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -70,6 +69,8 @@ #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; +#define DEBUG_TYPE "global-merge" + cl::opt EnableGlobalMerge("global-merge", cl::Hidden, cl::desc("Enable global merge pass"), @@ -107,7 +108,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid. - explicit GlobalMerge(const TargetMachine *TM = 0) + explicit GlobalMerge(const TargetMachine *TM = nullptr) : FunctionPass(ID), TM(TM) { initializeGlobalMergePass(*PassRegistry::getPassRegistry()); } @@ -173,7 +174,8 @@ bool GlobalMerge::doMerge(SmallVectorImpl &Globals, GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst, GlobalValue::InternalLinkage, MergedInit, "_MergedGlobals", - 0, GlobalVariable::NotThreadLocal, + nullptr, + GlobalVariable::NotThreadLocal, AddrSpace); for (size_t k = i; k < j; ++k) { Constant *Idx[2] = { diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 7537632..e83a5c4 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -24,7 +24,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "indvars" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" @@ -50,6 +49,8 @@ #include "llvm/Transforms/Utils/SimplifyIndVar.h" using namespace llvm; +#define DEBUG_TYPE "indvars" + STATISTIC(NumWidened , "Number of indvars widened"); STATISTIC(NumReplaced , "Number of exit values replaced"); STATISTIC(NumLFTR , "Number of loop exit tests replaced"); @@ -79,8 +80,8 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - IndVarSimplify() : LoopPass(ID), LI(0), SE(0), DT(0), DL(0), - Changed(false) { + IndVarSimplify() : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), + DL(nullptr), Changed(false) { initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); } @@ -196,7 +197,7 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def, if (!PHI) return User; - Instruction *InsertPt = 0; + Instruction *InsertPt = nullptr; for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) { if (PHI->getIncomingValue(i) != Def) continue; @@ -257,13 +258,13 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // an add or increment value can not be represented by an integer. BinaryOperator *Incr = dyn_cast(PN->getIncomingValue(BackEdge)); - if (Incr == 0 || Incr->getOpcode() != Instruction::FAdd) return; + if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return; // If this is not an add of the PHI with a constantfp, or if the constant fp // is not an integer, bail out. 
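In source-level terms, the merge performed by GlobalMerge::doMerge amounts to the rewrite below: several small globals become fields of one _MergedGlobals object, so a single materialized base address plus constant offsets serves all of them, which is where the register savings counted above come from. A hand-written equivalent, for illustration only:

    // Before: each global may cost its own address materialization.
    //   static int Foo, Bar, Baz;

    // After (conceptually): one base address, constant field offsets.
    static struct {
      int Foo, Bar, Baz;
    } _MergedGlobals;

    int sum() {
      return _MergedGlobals.Foo + _MergedGlobals.Bar + _MergedGlobals.Baz;
    }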
ConstantFP *IncValueVal = dyn_cast(Incr->getOperand(1)); int64_t IncValue; - if (IncValueVal == 0 || Incr->getOperand(0) != PN || + if (IncValueVal == nullptr || Incr->getOperand(0) != PN || !ConvertToSInt(IncValueVal->getValueAPF(), IncValue)) return; @@ -280,7 +281,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { FCmpInst *Compare = dyn_cast(U1); if (!Compare) Compare = dyn_cast(U2); - if (Compare == 0 || !Compare->hasOneUse() || + if (!Compare || !Compare->hasOneUse() || !isa(Compare->user_back())) return; @@ -301,7 +302,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // transform it. ConstantFP *ExitValueVal = dyn_cast(Compare->getOperand(1)); int64_t ExitValue; - if (ExitValueVal == 0 || + if (ExitValueVal == nullptr || !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue)) return; @@ -651,7 +652,8 @@ namespace { Type *WidestNativeType; // Widest integer type created [sz]ext bool IsSigned; // Was an sext user seen before a zext? - WideIVInfo() : NarrowIV(0), WidestNativeType(0), IsSigned(false) {} + WideIVInfo() : NarrowIV(nullptr), WidestNativeType(nullptr), + IsSigned(false) {} }; } @@ -693,7 +695,7 @@ struct NarrowIVDefUse { Instruction *NarrowUse; Instruction *WideDef; - NarrowIVDefUse(): NarrowDef(0), NarrowUse(0), WideDef(0) {} + NarrowIVDefUse(): NarrowDef(nullptr), NarrowUse(nullptr), WideDef(nullptr) {} NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD): NarrowDef(ND), NarrowUse(NU), WideDef(WD) {} @@ -736,9 +738,9 @@ public: L(LI->getLoopFor(OrigPhi->getParent())), SE(SEv), DT(DTree), - WidePhi(0), - WideInc(0), - WideIncExpr(0), + WidePhi(nullptr), + WideInc(nullptr), + WideIncExpr(nullptr), DeadInsts(DI) { assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV"); } @@ -793,7 +795,7 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { unsigned Opcode = DU.NarrowUse->getOpcode(); switch (Opcode) { default: - return 0; + return nullptr; case Instruction::Add: case Instruction::Mul: case Instruction::UDiv: @@ -838,14 +840,14 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { // Handle the common case of add if (DU.NarrowUse->getOpcode() != Instruction::Add) - return 0; + return nullptr; // One operand (NarrowDef) has already been extended to WideDef. Now determine // if extending the other will lead to a recurrence. unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0; assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU"); - const SCEV *ExtendOperExpr = 0; + const SCEV *ExtendOperExpr = nullptr; const OverflowingBinaryOperator *OBO = cast(DU.NarrowUse); if (IsSigned && OBO->hasNoSignedWrap()) @@ -855,7 +857,7 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { ExtendOperExpr = SE->getZeroExtendExpr( SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType); else - return 0; + return nullptr; // When creating this AddExpr, don't apply the current operations NSW or NUW // flags. This instruction may be guarded by control flow that the no-wrap @@ -866,7 +868,7 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr)); if (!AddRec || AddRec->getLoop() != L) - return 0; + return nullptr; return AddRec; } @@ -877,14 +879,14 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { /// recurrence. Otherwise return NULL. 
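The floating-point IV handling above targets loops of the first shape below, where the start value, the FAdd step, and the exit value are all integers that merely happen to be spelled as floats, and rewrites them into the integer form that the rest of the IV optimizations understand:

    // Recognized: FAdd of the phi with an integral constant, compared
    // against an integral exit value.
    void before(float *A) {
      for (float F = 0.0f; F != 100.0f; F += 1.0f)
        A[(int)F] = 0.0f;
    }

    // Rewritten equivalent with an integer induction variable.
    void after(float *A) {
      for (int I = 0; I != 100; ++I)
        A[I] = 0.0f;
    }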
const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { if (!SE->isSCEVable(NarrowUse->getType())) - return 0; + return nullptr; const SCEV *NarrowExpr = SE->getSCEV(NarrowUse); if (SE->getTypeSizeInBits(NarrowExpr->getType()) >= SE->getTypeSizeInBits(WideType)) { // NarrowUse implicitly widens its operand. e.g. a gep with a narrow // index. So don't follow this use. - return 0; + return nullptr; } const SCEV *WideExpr = IsSigned ? @@ -892,7 +894,7 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { SE->getZeroExtendExpr(NarrowExpr, WideType); const SCEVAddRecExpr *AddRec = dyn_cast(WideExpr); if (!AddRec || AddRec->getLoop() != L) - return 0; + return nullptr; return AddRec; } @@ -930,7 +932,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi << " to " << *WidePhi << "\n"); } - return 0; + return nullptr; } } // Our raison d'etre! Eliminate sign and zero extension. @@ -968,7 +970,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // push the uses of WideDef here. // No further widening is needed. The deceased [sz]ext had done it for us. - return 0; + return nullptr; } // Does this user itself evaluate to a recurrence after widening? @@ -981,7 +983,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. truncateIVUse(DU, DT); - return 0; + return nullptr; } // Assume block terminators cannot evaluate to a recurrence. We can't to // insert a Trunc after a terminator if there happens to be a critical edge. @@ -990,14 +992,14 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // Reuse the IV increment that SCEVExpander created as long as it dominates // NarrowUse. - Instruction *WideUse = 0; + Instruction *WideUse = nullptr; if (WideAddRec == WideIncExpr && Rewriter.hoistIVInc(WideInc, DU.NarrowUse)) WideUse = WideInc; else { WideUse = CloneIVUser(DU); if (!WideUse) - return 0; + return nullptr; } // Evaluation of WideAddRec ensured that the narrow expression could be // extended outside the loop without overflow. This suggests that the wide use @@ -1008,7 +1010,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec << "\n"); DeadInsts.push_back(WideUse); - return 0; + return nullptr; } // Returning WideUse pushes it on the worklist. @@ -1043,7 +1045,7 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Is this phi an induction variable? const SCEVAddRecExpr *AddRec = dyn_cast(SE->getSCEV(OrigPhi)); if (!AddRec) - return NULL; + return nullptr; // Widen the induction variable expression. const SCEV *WideIVExpr = IsSigned ? @@ -1056,7 +1058,7 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Can the IV be extended outside the loop without overflow? AddRec = dyn_cast(WideIVExpr); if (!AddRec || AddRec->getLoop() != L) - return NULL; + return nullptr; // An AddRec must have loop-invariant operands. 
Since this AddRec is // materialized by a loop header phi, the expression cannot have any post-loop @@ -1282,7 +1284,7 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) { static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { Instruction *IncI = dyn_cast(IncV); if (!IncI) - return 0; + return nullptr; switch (IncI->getOpcode()) { case Instruction::Add: @@ -1293,17 +1295,17 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { if (IncI->getNumOperands() == 2) break; default: - return 0; + return nullptr; } PHINode *Phi = dyn_cast(IncI->getOperand(0)); if (Phi && Phi->getParent() == L->getHeader()) { if (isLoopInvariant(IncI->getOperand(1), L, DT)) return Phi; - return 0; + return nullptr; } if (IncI->getOpcode() == Instruction::GetElementPtr) - return 0; + return nullptr; // Allow add/sub to be commuted. Phi = dyn_cast(IncI->getOperand(1)); @@ -1311,7 +1313,7 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { if (isLoopInvariant(IncI->getOperand(0), L, DT)) return Phi; } - return 0; + return nullptr; } /// Return the compare guarding the loop latch, or NULL for unrecognized tests. @@ -1321,7 +1323,7 @@ static ICmpInst *getLoopTest(Loop *L) { BasicBlock *LatchBlock = L->getLoopLatch(); // Don't bother with LFTR if the loop is not properly simplified. if (!LatchBlock) - return 0; + return nullptr; BranchInst *BI = dyn_cast(L->getExitingBlock()->getTerminator()); assert(BI && "expected exit branch"); @@ -1446,8 +1448,8 @@ FindLoopCounter(Loop *L, const SCEV *BECount, cast(L->getExitingBlock()->getTerminator())->getCondition(); // Loop over all of the PHI nodes, looking for a simple counter. - PHINode *BestPhi = 0; - const SCEV *BestInit = 0; + PHINode *BestPhi = nullptr; + const SCEV *BestInit = nullptr; BasicBlock *LatchBlock = L->getLoopLatch(); assert(LatchBlock && "needsLFTR should guarantee a loop latch"); @@ -1571,7 +1573,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, // IVInit integer and IVCount pointer would only occur if a canonical IV // were generated on top of case #2, which is not expected. - const SCEV *IVLimit = 0; + const SCEV *IVLimit = nullptr; // For unit stride, IVCount = Start + BECount with 2's complement overflow. // For non-zero Start, compute IVCount here. if (AR->getStart()->isZero()) @@ -1813,7 +1815,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { SE = &getAnalysis(); DT = &getAnalysis().getDomTree(); DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? 
&DLP->getDataLayout() : nullptr; TLI = getAnalysisIfAvailable(); DeadInsts.clear(); diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 067deb7..230a381 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "jump-threading" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -38,6 +37,8 @@ #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; +#define DEBUG_TYPE "jump-threading" + STATISTIC(NumThreads, "Number of jumps threaded"); STATISTIC(NumFolds, "Number of terminators folded"); STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi"); @@ -153,7 +154,7 @@ bool JumpThreading::runOnFunction(Function &F) { DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis(); LVI = &getAnalysis(); @@ -308,7 +309,7 @@ void JumpThreading::FindLoopHeaders(Function &F) { /// Returns null if Val is null or not an appropriate constant. static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) { if (!Val) - return 0; + return nullptr; // Undef is "known" enough. if (UndefValue *U = dyn_cast(Val)) @@ -352,7 +353,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // If V is a non-instruction value, or an instruction in a different block, // then it can't be derived from a PHI. Instruction *I = dyn_cast(V); - if (I == 0 || I->getParent() != BB) { + if (!I || I->getParent() != BB) { // Okay, if this is a live-in value, see if it has a known value at the end // of any of our predecessors. @@ -495,7 +496,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB); Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, DL); - if (Res == 0) { + if (!Res) { if (!isa(RHS)) continue; @@ -581,7 +582,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, // Either operand will do, so be sure to pick the one that's a known // constant. // FIXME: Do this more cleverly if both values are known constants? - KnownCond = (TrueVal != 0); + KnownCond = (TrueVal != nullptr); } // See if the select has a known constant value for this predecessor. @@ -737,7 +738,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { Instruction *CondInst = dyn_cast(Condition); // All the rest of our checks depend on the condition being an instruction. - if (CondInst == 0) { + if (!CondInst) { // FIXME: Unify this with code below. if (ProcessThreadableEdges(Condition, BB, Preference)) return true; @@ -890,7 +891,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { SmallPtrSet PredsScanned; typedef SmallVector, 8> AvailablePredsTy; AvailablePredsTy AvailablePreds; - BasicBlock *OneUnavailablePred = 0; + BasicBlock *OneUnavailablePred = nullptr; // If we got here, the loaded value is transparent through to the start of the // block. Check to see if it is available in any of the predecessor blocks. @@ -904,16 +905,16 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan the predecessor to see if the value is available in the pred. 
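getKnownConstant and ComputeValueKnownInPredecessors feed the core jump-threading rewrite: when a branch condition is a known constant along each incoming edge, every predecessor can be sent straight to its final destination and the re-test disappears. A source-level picture of the transformation:

    // Before threading: both paths merge, then re-test a value that is
    // a known constant on each incoming edge.
    int before(bool C) {
      int V = C ? 1 : 0;
      if (V != 0)      // V is 1 from one edge, 0 from the other
        return 10;
      return 20;
    }

    // After threading (conceptually): each predecessor jumps directly.
    int after(bool C) { return C ? 10 : 20; }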
BBIt = PredBB->end(); - MDNode *ThisTBAATag = 0; + MDNode *ThisTBAATag = nullptr; Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6, - 0, &ThisTBAATag); + nullptr, &ThisTBAATag); if (!PredAvailable) { OneUnavailablePred = PredBB; continue; } // If tbaa tags disagree or are not present, forget about them. - if (TBAATag != ThisTBAATag) TBAATag = 0; + if (TBAATag != ThisTBAATag) TBAATag = nullptr; // If so, this load is partially redundant. Remember this info so that we // can create a PHI node. @@ -929,7 +930,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // predecessor, we want to insert a merge block for those common predecessors. // This ensures that we only have to insert one reload, thus not increasing // code size. - BasicBlock *UnavailablePred = 0; + BasicBlock *UnavailablePred = nullptr; // If there is exactly one predecessor where the value is unavailable, the // already computed 'OneUnavailablePred' block is it. If it ends in an @@ -996,7 +997,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { BasicBlock *P = *PI; AvailablePredsTy::iterator I = std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(), - std::make_pair(P, (Value*)0)); + std::make_pair(P, (Value*)nullptr)); assert(I != AvailablePreds.end() && I->first == P && "Didn't find entry for predecessor!"); @@ -1103,7 +1104,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, SmallPtrSet SeenPreds; SmallVector, 16> PredToDestList; - BasicBlock *OnlyDest = 0; + BasicBlock *OnlyDest = nullptr; BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL; for (unsigned i = 0, e = PredValues.size(); i != e; ++i) { @@ -1120,7 +1121,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, BasicBlock *DestBB; if (isa(Val)) - DestBB = 0; + DestBB = nullptr; else if (BranchInst *BI = dyn_cast(BB->getTerminator())) DestBB = BI->getSuccessor(cast(Val)->isZero()); else if (SwitchInst *SI = dyn_cast(BB->getTerminator())) { @@ -1171,7 +1172,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, // If the threadable edges are branching on an undefined value, we get to pick // the destination that these predecessors should get to. - if (MostPopularDest == 0) + if (!MostPopularDest) MostPopularDest = BB->getTerminator()-> getSuccessor(GetBestDestForJumpOnUndef(BB)); @@ -1273,7 +1274,7 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { } // Determine which value to split on, true, false, or undef if neither. - ConstantInt *SplitVal = 0; + ConstantInt *SplitVal = nullptr; if (NumTrue > NumFalse) SplitVal = ConstantInt::getTrue(BB->getContext()); else if (NumTrue != 0 || NumFalse != 0) @@ -1294,7 +1295,7 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { // help us. However, we can just replace the LHS or RHS with the constant. if (BlocksToFoldInto.size() == cast(BB->front()).getNumIncomingValues()) { - if (SplitVal == 0) { + if (!SplitVal) { // If all preds provide undef, just nuke the xor, because it is undef too. BO->replaceAllUsesWith(UndefValue::get(BO->getType())); BO->eraseFromParent(); @@ -1531,7 +1532,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, // can just clone the bits from BB into the end of the new PredBB. 
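SimplifyPartiallyRedundantLoad handles the shape below: the loaded value is already available on one predecessor, so a reload is inserted only on the predecessor where it is missing and the two values meet in a PHI. Sketched at the source level:

    // Before: *P is re-loaded even on the path where it was just loaded.
    int before(int *P, bool C) {
      int X = 0;
      if (C)
        X = *P + 1;     // *P available on this predecessor
      return X + *P;    // partially redundant load
    }

    // After load PRE (conceptually): reload only where it was missing.
    int after(int *P, bool C) {
      int T, X = 0;
      if (C) { T = *P; X = T + 1; }
      else   { T = *P; }          // reload inserted in this predecessor
      return X + T;               // PHI of the two available values
    }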
BranchInst *OldPredBranch = dyn_cast(PredBB->getTerminator()); - if (OldPredBranch == 0 || !OldPredBranch->isUnconditional()) { + if (!OldPredBranch || !OldPredBranch->isUnconditional()) { PredBB = SplitEdge(PredBB, BB, this); OldPredBranch = cast(PredBB->getTerminator()); } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index b69f2dc..0a8d16f 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -30,7 +30,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "licm" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -60,6 +59,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "licm" + STATISTIC(NumSunk , "Number of instructions sunk out of loop"); STATISTIC(NumHoisted , "Number of instructions hoisted out of loop"); STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk"); @@ -223,7 +224,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { DT = &getAnalysis().getDomTree(); DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; TLI = &getAnalysis(); assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); @@ -315,8 +316,8 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { "Parent loop not left in LCSSA form after LICM!"); // Clear out loops state information for the next iteration - CurLoop = 0; - Preheader = 0; + CurLoop = nullptr; + Preheader = nullptr; // If this loop is nested inside of another one, save the alias information // for when we process the outer loop. @@ -334,7 +335,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { /// iteration. /// void LICM::SinkRegion(DomTreeNode *N) { - assert(N != 0 && "Null dominator tree node?"); + assert(N != nullptr && "Null dominator tree node?"); BasicBlock *BB = N->getBlock(); // If this subregion is not in the top level loop at all, exit. @@ -381,7 +382,7 @@ void LICM::SinkRegion(DomTreeNode *N) { /// before uses, allowing us to hoist a loop body in one pass without iteration. /// void LICM::HoistRegion(DomTreeNode *N) { - assert(N != 0 && "Null dominator tree node?"); + assert(N != nullptr && "Null dominator tree node?"); BasicBlock *BB = N->getBlock(); // If this subregion is not in the top level loop at all, exit. @@ -774,7 +775,7 @@ void LICM::PromoteAliasSet(AliasSet &AS, // We start with an alignment of one and try to find instructions that allow // us to prove better alignment. unsigned Alignment = 1; - MDNode *TBAATag = 0; + MDNode *TBAATag = nullptr; // Check that all of the pointers in the alias set have the same type. 
We // cannot (yet) promote a memory location that is loaded and stored in diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 9a520c8..5ab686a 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-delete" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -23,6 +22,8 @@ #include "llvm/IR/Dominators.h" using namespace llvm; +#define DEBUG_TYPE "loop-delete" + STATISTIC(NumDeleted, "Number of loops deleted"); namespace { diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index e5e8b84..26a83df 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -41,7 +41,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-idiom" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -61,6 +60,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "loop-idiom" + STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); @@ -114,7 +115,7 @@ namespace { Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const; /// Return true iff the idiom is detected in the loop. and 1) \p CntInst - /// is set to the instruction counting the pupulation bit. 2) \p CntPhi + /// is set to the instruction counting the population bit. 2) \p CntPhi /// is set to the corresponding phi node. 3) \p Var is set to the value /// whose population bits are being counted. bool detectIdiom @@ -138,7 +139,7 @@ namespace { static char ID; explicit LoopIdiomRecognize() : LoopPass(ID) { initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); - DL = 0; DT = 0; SE = 0; TLI = 0; TTI = 0; + DL = nullptr; DT = nullptr; SE = nullptr; TLI = nullptr; TTI = nullptr; } bool runOnLoop(Loop *L, LPPassManager &LPM) override; @@ -182,7 +183,7 @@ namespace { if (DL) return DL; DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; return DL; } @@ -247,7 +248,7 @@ static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE, for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { Value *Op = DeadInst->getOperand(op); - DeadInst->setOperand(op, 0); + DeadInst->setOperand(op, nullptr); // If this operand just became dead, add it to the NowDeadInsts list. if (!Op->use_empty()) continue; @@ -292,9 +293,9 @@ bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) { if (BasicBlock *BB = PreHead->getSinglePredecessor()) { BranchInst *Br = getBranch(BB); - return Br && Br->isConditional() ? BB : 0; + return Br && Br->isConditional() ? 
BB : nullptr; } - return 0; + return nullptr; } //===----------------------------------------------------------------------===// @@ -304,7 +305,7 @@ BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) { //===----------------------------------------------------------------------===// NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR): - LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(0) { + LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(nullptr) { } bool NclPopcountRecognize::preliminaryScreen() { @@ -341,25 +342,25 @@ bool NclPopcountRecognize::preliminaryScreen() { return true; } -Value *NclPopcountRecognize::matchCondition (BranchInst *Br, - BasicBlock *LoopEntry) const { +Value *NclPopcountRecognize::matchCondition(BranchInst *Br, + BasicBlock *LoopEntry) const { if (!Br || !Br->isConditional()) - return 0; + return nullptr; ICmpInst *Cond = dyn_cast(Br->getCondition()); if (!Cond) - return 0; + return nullptr; ConstantInt *CmpZero = dyn_cast(Cond->getOperand(1)); if (!CmpZero || !CmpZero->isZero()) - return 0; + return nullptr; ICmpInst::Predicate Pred = Cond->getPredicate(); if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) || (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry)) return Cond->getOperand(0); - return 0; + return nullptr; } bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, @@ -390,9 +391,9 @@ bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, Value *VarX1, *VarX0; PHINode *PhiX, *CountPhi; - DefX2 = CountInst = 0; - VarX1 = VarX0 = 0; - PhiX = CountPhi = 0; + DefX2 = CountInst = nullptr; + VarX1 = VarX0 = nullptr; + PhiX = CountPhi = nullptr; LoopEntry = *(CurLoop->block_begin()); // step 1: Check if the loop-back branch is in desirable form. @@ -439,7 +440,7 @@ bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 { - CountInst = NULL; + CountInst = nullptr; for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(), IterE = LoopEntry->end(); Iter != IterE; Iter++) { Instruction *Inst = Iter; @@ -744,7 +745,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, // If processing the store invalidated our iterator, start over from the // top of the block. - if (InstPtr == 0) + if (!InstPtr) I = BB->begin(); continue; } @@ -757,7 +758,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, // If processing the memset invalidated our iterator, start over from the // top of the block. - if (InstPtr == 0) + if (!InstPtr) I = BB->begin(); continue; } @@ -784,7 +785,7 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { // random store we can't handle. const SCEVAddRecExpr *StoreEv = dyn_cast(SE->getSCEV(StorePtr)); - if (StoreEv == 0 || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) + if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) return false; // Check to see if the stride matches the size of the store. If so, then we @@ -792,7 +793,7 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { unsigned StoreSize = (unsigned)SizeInBits >> 3; const SCEVConstant *Stride = dyn_cast(StoreEv->getOperand(1)); - if (Stride == 0 || StoreSize != Stride->getValue()->getValue()) { + if (!Stride || StoreSize != Stride->getValue()->getValue()) { // TODO: Could also handle negative stride here someday, that will require // the validity check in mayLoopAccessLocation to be updated though. 
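The idiom detectIdiom is matching is Kernighan's population-count loop: each iteration clears the lowest set bit while a counter advances, with CntInst/CntPhi tracking the counter and Var the value being counted. Once matched, the whole loop collapses to a single ctpop; in source terms (the builtin spelling below is the GCC/Clang one, shown for illustration):

    // Recognized countable loop: one iteration per set bit.
    int popcountLoop(unsigned X) {
      int Cnt = 0;
      while (X) {
        ++Cnt;        // cnt2 = cnt1 + 1
        X &= X - 1;   // x2 = x1 & (x1 - 1): clears the lowest set bit
      }
      return Cnt;
    }

    // What the rewritten loop computes.
    int popcountIntrinsic(unsigned X) { return __builtin_popcount(X); }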
// Enable this to print exact negative strides. @@ -841,7 +842,7 @@ processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { // loop, which indicates a strided store. If we have something else, it's a // random store we can't handle. const SCEVAddRecExpr *Ev = dyn_cast(SE->getSCEV(Pointer)); - if (Ev == 0 || Ev->getLoop() != CurLoop || !Ev->isAffine()) + if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine()) return false; // Reject memsets that are so large that they overflow an unsigned. @@ -855,7 +856,7 @@ processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { // TODO: Could also handle negative stride here someday, that will require the // validity check in mayLoopAccessLocation to be updated though. - if (Stride == 0 || MSI->getLength() != Stride->getValue()) + if (!Stride || MSI->getLength() != Stride->getValue()) return false; return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, @@ -908,23 +909,23 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout &DL) { // array. We could theoretically do a store to an alloca or something, but // that doesn't seem worthwhile. Constant *C = dyn_cast(V); - if (C == 0) return 0; + if (!C) return nullptr; // Only handle simple values that are a power of two bytes in size. uint64_t Size = DL.getTypeSizeInBits(V->getType()); if (Size == 0 || (Size & 7) || (Size & (Size-1))) - return 0; + return nullptr; // Don't care enough about darwin/ppc to implement this. if (DL.isBigEndian()) - return 0; + return nullptr; // Convert to size in bytes. Size /= 8; // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see // if the top and bottom are the same (e.g. for vectors and large integers). - if (Size > 16) return 0; + if (Size > 16) return nullptr; // If the constant is exactly 16 bytes, just use it. if (Size == 16) return C; @@ -949,7 +950,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // are stored. A store of i32 0x01020304 can never be turned into a memset, // but it can be turned into memset_pattern if the target supports it. Value *SplatValue = isBytewiseValue(StoredVal); - Constant *PatternValue = 0; + Constant *PatternValue = nullptr; unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); @@ -960,13 +961,13 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // promote the memset. CurLoop->isLoopInvariant(SplatValue)) { // Keep and use SplatValue. - PatternValue = 0; + PatternValue = nullptr; } else if (DestAS == 0 && TLI->has(LibFunc::memset_pattern16) && (PatternValue = getMemSetPatternValue(StoredVal, *DL))) { // Don't create memset_pattern16s with address spaces. // It looks like we can use PatternValue! - SplatValue = 0; + SplatValue = nullptr; } else { // Otherwise, this isn't an idiom we can transform. For example, we can't // do anything with a 3-byte store. @@ -1033,7 +1034,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, Int8PtrTy, Int8PtrTy, IntPtr, - (void*)0); + (void*)nullptr); // Otherwise we should form a memset_pattern16. PatternValue is known to be // an constant array of 16-bytes. Plop the value into a mergable global. 
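processLoopStridedStore fires when an affine, unit-stride store writes a value that is either a splat of one byte (plain memset) or, on targets providing memset_pattern16, any constant pattern of at most 16 bytes. The simplest instance and its replacement:

    #include <cstring>

    // Recognized: unit-stride store of a byte-splattable value.
    void zeroLoop(char *P, unsigned N) {
      for (unsigned I = 0; I != N; ++I)
        P[I] = 0;
    }

    // What the loop becomes once the trip count is computable.
    void zeroMemset(char *P, unsigned N) {
      std::memset(P, 0, N);
    }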
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index 263ba93..ab1a939 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-instsimplify" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" @@ -26,6 +25,8 @@ #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +#define DEBUG_TYPE "loop-instsimplify" + STATISTIC(NumSimplified, "Number of redundant instructions simplified"); namespace { @@ -70,10 +71,10 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable(); - DominatorTree *DT = DTWP ? &DTWP->getDomTree() : 0; + DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; LoopInfo *LI = &getAnalysis(); DataLayoutPass *DLP = getAnalysisIfAvailable(); - const DataLayout *DL = DLP ? &DLP->getDataLayout() : 0; + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; const TargetLibraryInfo *TLI = &getAnalysis(); SmallVector ExitBlocks; @@ -126,7 +127,15 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { ++NumSimplified; } } - LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + bool res = RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + if (res) { + // RecursivelyDeleteTriviallyDeadInstruction can remove + // more than one instruction, so simply incrementing the + // iterator does not work. When instructions get deleted + // re-iterate instead. + BI = BB->begin(); BE = BB->end(); + LocalChanged |= res; + } if (IsSubloopHeader && !isa(I)) break; diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp index 81c1e42..8b5e036 100644 --- a/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-reroll" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" @@ -36,6 +35,8 @@ using namespace llvm; +#define DEBUG_TYPE "loop-reroll" + STATISTIC(NumRerolledLoops, "Number of rerolled loops"); static cl::opt @@ -945,7 +946,7 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, bool InReduction = Reductions.isPairInSame(J1, J2); if (!(InReduction && J1->isAssociative())) { - bool Swapped = false, SomeOpMatched = false;; + bool Swapped = false, SomeOpMatched = false; for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) { Value *Op2 = J2->getOperand(j); @@ -1133,7 +1134,7 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { SE = &getAnalysis(); TLI = &getAnalysis(); DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? 
&DLP->getDataLayout() : nullptr; DT = &getAnalysis().getDomTree(); BasicBlock *Header = L->getHeader(); diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index fde6bac..2ce5831 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-rotate" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CodeMetrics.h" @@ -24,6 +23,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" @@ -31,7 +31,11 @@ #include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; -#define MAX_HEADER_SIZE 16 +#define DEBUG_TYPE "loop-rotate" + +static cl::opt +DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden, + cl::desc("The default maximum header size for automatic loop rotation")); STATISTIC(NumRotated, "Number of loops rotated"); namespace { @@ -39,8 +43,12 @@ namespace { class LoopRotate : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopRotate() : LoopPass(ID) { + LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { initializeLoopRotatePass(*PassRegistry::getPassRegistry()); + if (SpecifiedMaxHeaderSize == -1) + MaxHeaderSize = DefaultRotationThreshold; + else + MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); } // LCSSA form makes instruction renaming easier. @@ -61,6 +69,7 @@ namespace { bool rotateLoop(Loop *L, bool SimplifiedLatch); private: + unsigned MaxHeaderSize; LoopInfo *LI; const TargetTransformInfo *TTI; }; @@ -74,7 +83,9 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) -Pass *llvm::createLoopRotatePass() { return new LoopRotate(); } +Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { + return new LoopRotate(MaxHeaderSize); +} /// Rotate Loop L as many times as possible. Return true if /// the loop is rotated at least once. @@ -82,6 +93,9 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { if (skipOptnoneFunction(L)) return false; + // Save the loop metadata. + MDNode *LoopMD = L->getLoopID(); + LI = &getAnalysis(); TTI = &getAnalysis(); @@ -96,6 +110,12 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { MadeChange = true; SimplifiedLatch = false; } + + // Restore the loop metadata. + // NB! We presume LoopRotation DOESN'T ADD its own metadata. + if ((MadeChange || SimplifiedLatch) && LoopMD) + L->setLoopID(LoopMD); + return MadeChange; } @@ -281,7 +301,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { BasicBlock *OrigLatch = L->getLoopLatch(); BranchInst *BI = dyn_cast(OrigHeader->getTerminator()); - if (BI == 0 || BI->isUnconditional()) + if (!BI || BI->isUnconditional()) return false; // If the loop header is not one of the loop exiting blocks then @@ -292,7 +312,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop latch already contains a branch that leaves the loop then the // loop is already rotated. 
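The MaxHeaderSize knob introduced above gates the rotation sketched below, which duplicates the header's exit test into the preheader so that the loop becomes bottom-tested; the larger the header, the more code that duplication copies, hence the size cap enforced in rotateLoop just after this sketch:

    // Before rotation: the exit test lives in the header and runs on
    // every entry into the loop body.
    void before(int N) {
      int I = 0;
      while (I < N)
        ++I;
    }

    // After rotation: one guarded copy of the test, then a bottom-tested loop.
    void after(int N) {
      int I = 0;
      if (I < N) {
        do
          ++I;
        while (I < N);   // the latch now exits the loop
      }
    }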
- if (OrigLatch == 0) + if (!OrigLatch) return false; // Rotate if either the loop latch does *not* exit the loop, or if the loop @@ -310,7 +330,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { << " instructions: "; L->dump()); return false; } - if (Metrics.NumInsts > MAX_HEADER_SIZE) + if (Metrics.NumInsts > MaxHeaderSize) return false; } @@ -319,7 +339,7 @@ // If the loop could not be converted to canonical form, it must have an // indirectbr in it, just give up. - if (OrigPreheader == 0) + if (!OrigPreheader) return false; // Anything ScalarEvolution may know about this loop or the PHI nodes diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 272a16d..914b56a 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -53,7 +53,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-reduce" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Hashing.h" @@ -78,6 +77,8 @@ #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "loop-reduce" + /// MaxIVUsers is an arbitrary threshold that provides an early opportunity to /// bail out. This threshold is far beyond the number of users that LSR can /// conceivably solve, so it should not affect generated code, but catches the @@ -237,7 +238,15 @@ struct Formula { int64_t Scale; /// BaseRegs - The list of "base" registers for this use. When this is - /// non-empty, + /// non-empty. The canonical representation of a formula is + /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and + /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty(). + /// #1 enforces that the scaled register is always used when at least two + /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2. + /// #2 enforces that 1 * reg is reg. + /// This invariant can be temporarily broken while building a formula. + /// However, every formula inserted into the LSRInstance must be in canonical + /// form. SmallVector<const SCEV *, 4> BaseRegs; /// ScaledReg - The 'scaled' register for this use. This should be non-null @@ -250,12 +259,18 @@ struct Formula { int64_t UnfoldedOffset; Formula() - : BaseGV(0), BaseOffset(0), HasBaseReg(false), Scale(0), ScaledReg(0), - UnfoldedOffset(0) {} + : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0), + ScaledReg(nullptr), UnfoldedOffset(0) {} void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); - unsigned getNumRegs() const; + bool isCanonical() const; + + void Canonicalize(); + + bool Unscale(); + + size_t getNumRegs() const; Type *getType() const; void DeleteBaseReg(const SCEV *&S); @@ -345,12 +360,58 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { BaseRegs.push_back(Sum); HasBaseReg = true; } + Canonicalize(); +} + +/// \brief Check whether or not this formula satisfies the canonical +/// representation. +/// \see Formula::BaseRegs. +bool Formula::isCanonical() const { + if (ScaledReg) + return Scale != 1 || !BaseRegs.empty(); + return BaseRegs.size() <= 1; +} + +/// \brief Helper method to morph a formula into its canonical representation. +/// \see Formula::BaseRegs. +/// Every formula having more than one base register must use the ScaledReg +/// field. Otherwise, we would have to do special cases everywhere in LSR +/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ... 
+/// On the other hand, 1*reg should be canonicalized into reg. +void Formula::Canonicalize() { + if (isCanonical()) + return; + // So far we did not need this case. This is easy to implement but it is + // useless to maintain dead code. Besides, it could hurt compile time. + assert(!BaseRegs.empty() && "1*reg => reg, should not be needed."); + // Keep the invariant sum in BaseRegs and one of the variant sums in ScaledReg. + ScaledReg = BaseRegs.back(); + BaseRegs.pop_back(); + Scale = 1; + size_t BaseRegsSize = BaseRegs.size(); + size_t Try = 0; + // If ScaledReg is an invariant, try to find a variant expression. + while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg)) + std::swap(ScaledReg, BaseRegs[Try++]); +} + +/// \brief Get rid of the scale in the formula. +/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2. +/// \return true if it was possible to get rid of the scale, false otherwise. +/// \note After this operation the formula may not be in the canonical form. +bool Formula::Unscale() { + if (Scale != 1) + return false; + Scale = 0; + BaseRegs.push_back(ScaledReg); + ScaledReg = nullptr; + return true; } /// getNumRegs - Return the total number of register operands used by this /// formula. This does not include register uses implied by non-constant /// addrec strides. -unsigned Formula::getNumRegs() const { +size_t Formula::getNumRegs() const { return !!ScaledReg + BaseRegs.size(); } @@ -360,7 +421,7 @@ Type *Formula::getType() const { return !BaseRegs.empty() ? BaseRegs.front()->getType() : ScaledReg ? ScaledReg->getType() : BaseGV ? BaseGV->getType() : - 0; + nullptr; } /// DeleteBaseReg - Delete the given base reg from the BaseRegs list. @@ -487,11 +548,11 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, // Check for a division of a constant by a constant. if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) { if (!RC) - return 0; + return nullptr; const APInt &LA = C->getValue()->getValue(); const APInt &RA = RC->getValue()->getValue(); if (LA.srem(RA) != 0) - return 0; + return nullptr; return SE.getConstant(LA.sdiv(RA)); } @@ -500,16 +561,16 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) { const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE, IgnoreSignificantBits); - if (!Step) return 0; + if (!Step) return nullptr; const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE, IgnoreSignificantBits); - if (!Start) return 0; + if (!Start) return nullptr; // FlagNW is independent of the start value, step direction, and is // preserved with smaller magnitude steps. // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap); } - return 0; + return nullptr; } // Distribute the sdiv over add operands, if the add doesn't overflow. @@ -520,12 +581,12 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, I != E; ++I) { const SCEV *Op = getExactSDiv(*I, RHS, SE, IgnoreSignificantBits); - if (!Op) return 0; + if (!Op) return nullptr; Ops.push_back(Op); } return SE.getAddExpr(Ops); } - return 0; + return nullptr; } // Check for a multiply operand that we can pull RHS out of. @@ -544,13 +605,13 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, } Ops.push_back(S); } - return Found ? SE.getMulExpr(Ops) : 0; + return Found ? SE.getMulExpr(Ops) : nullptr; } - return 0; + return nullptr; } // Otherwise we don't know. 
- return 0; + return nullptr; } /// ExtractImmediate - If S involves the addition of a constant integer value, @@ -604,7 +665,7 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { SCEV::FlagAnyWrap); return Result; } - return 0; + return nullptr; } /// isAddressUse - Returns true if the specified instruction is using the @@ -755,12 +816,12 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl &DeadInsts) { Value *V = DeadInsts.pop_back_val(); Instruction *I = dyn_cast_or_null(V); - if (I == 0 || !isInstructionTriviallyDead(I)) + if (!I || !isInstructionTriviallyDead(I)) continue; for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) if (Instruction *U = dyn_cast(*OI)) { - *OI = 0; + *OI = nullptr; if (U->use_empty()) DeadInsts.push_back(U); } @@ -775,9 +836,18 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl &DeadInsts) { namespace { class LSRUse; } -// Check if it is legal to fold 2 base registers. -static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, - const Formula &F); + +/// \brief Check if the addressing mode defined by \p F is completely +/// folded in \p LU at isel time. +/// This includes address-mode folding and special icmp tricks. +/// This function returns true if \p LU can accommodate what \p F +/// defines and up to 1 base + 1 scaled + offset. +/// In other words, if \p F has several base registers, this function may +/// still return true. Therefore, users still need to account for +/// additional base registers and/or unfolded offsets to derive an +/// accurate cost model. +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F); // Get the cost of the scaling factor used in F for LU. static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F); @@ -828,7 +898,7 @@ public: const SmallVectorImpl &Offsets, ScalarEvolution &SE, DominatorTree &DT, const LSRUse &LU, - SmallPtrSet *LoserRegs = 0); + SmallPtrSet *LoserRegs = nullptr); void print(raw_ostream &OS) const; void dump() const; @@ -921,6 +991,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, ScalarEvolution &SE, DominatorTree &DT, const LSRUse &LU, SmallPtrSet *LoserRegs) { + assert(F.isCanonical() && "Cost is accurate only for canonical formula"); // Tally up the registers. if (const SCEV *ScaledReg = F.ScaledReg) { if (VisitedRegs.count(ScaledReg)) { @@ -944,11 +1015,13 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, } // Determine how many (unfolded) adds we'll need inside the loop. - size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); + size_t NumBaseParts = F.getNumRegs(); if (NumBaseParts > 1) // Do not count the base and a possible second register if the target // allows to fold 2 registers. - NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F)); + NumBaseAdds += + NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F))); + NumBaseAdds += (F.UnfoldedOffset != 0); // Accumulate non-free scaling amounts. ScaleCost += getScalingFactorCost(TTI, LU, F); @@ -1047,7 +1120,8 @@ struct LSRFixup { } LSRFixup::LSRFixup() - : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {} + : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)), + Offset(0) {} /// isUseFullyOutsideLoop - Test whether this fixup always uses its /// value outside of the given loop. 
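A concrete reading of the canonical-form rules documented above: reg1 + reg2 must be stored as BaseRegs = {reg1}, ScaledReg = reg2, Scale = 1 (rule 1), while a lone reg stays in BaseRegs with ScaledReg null (rule 2, since 1*reg is just reg). A toy model of the check, mirroring Formula::isCanonical with stand-in types:

    #include <cassert>
    #include <vector>

    struct ToyFormula {
      std::vector<const char *> BaseRegs; // stand-ins for the SCEV pointers
      const char *ScaledReg = nullptr;
      long Scale = 0;

      bool isCanonical() const {
        if (ScaledReg)                            // 1*reg alone is not canonical;
          return Scale != 1 || !BaseRegs.empty(); // it should be a base reg
        return BaseRegs.size() <= 1;              // two base regs need ScaledReg
      }
    };

    int main() {
      ToyFormula F;
      F.BaseRegs = {"reg1"};
      F.ScaledReg = "reg2";
      F.Scale = 1;              // canonical spelling of reg1 + reg2
      assert(F.isCanonical());
    }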
@@ -1183,7 +1257,7 @@ public: MaxOffset(INT64_MIN), AllFixupsOutsideLoop(true), RigidFormula(false), - WidestFixupType(0) {} + WidestFixupType(nullptr) {} bool HasFormulaWithSameRegs(const Formula &F) const; bool InsertFormula(const Formula &F); @@ -1208,7 +1282,10 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. +/// The formula must be in canonical form. bool LSRUse::InsertFormula(const Formula &F) { + assert(F.isCanonical() && "Invalid canonical representation"); + if (!Formulae.empty() && RigidFormula) return false; @@ -1234,6 +1311,8 @@ bool LSRUse::InsertFormula(const Formula &F) { // Record registers now being used by this use. Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); + if (F.ScaledReg) + Regs.insert(F.ScaledReg); return true; } @@ -1300,12 +1379,10 @@ void LSRUse::dump() const { } #endif -/// isLegalUse - Test whether the use described by AM is "legal", meaning it can -/// be completely folded into the user instruction at isel time. This includes -/// address-mode folding and special icmp tricks. -static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind, - Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, - bool HasBaseReg, int64_t Scale) { +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + LSRUse::KindType Kind, Type *AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, + bool HasBaseReg, int64_t Scale) { switch (Kind) { case LSRUse::Address: return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); @@ -1356,10 +1433,11 @@ static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind, llvm_unreachable("Invalid LSRUse Kind!"); } -static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, - int64_t Scale) { +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + int64_t MinOffset, int64_t MaxOffset, + LSRUse::KindType Kind, Type *AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, + bool HasBaseReg, int64_t Scale) { // Check for overflow. if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) != (MinOffset > 0)) return false; @@ -1370,9 +1448,41 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, return false; MaxOffset = (uint64_t)BaseOffset + MaxOffset; - return isLegalUse(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg, - Scale) && - isLegalUse(TTI, Kind, AccessTy, BaseGV, MaxOffset, HasBaseReg, Scale); + return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset, + HasBaseReg, Scale) && + isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset, + HasBaseReg, Scale); +} + +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + int64_t MinOffset, int64_t MaxOffset, + LSRUse::KindType Kind, Type *AccessTy, + const Formula &F) { + // For the purpose of isAMCompletelyFolded either having a canonical formula + // or a scale not equal to zero is correct. + // Problems may arise from non-canonical formulae having a scale == 0. + // Strictly speaking it would be best to just rely on canonical formulae. + // However, when we generate the scaled formulae, we first check that the + // scaling factor is profitable before computing the actual ScaledReg for + // compile time's sake. 
+ assert((F.isCanonical() || F.Scale != 0)); + return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, + F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); +} + +/// isLegalUse - Test whether we know how to expand the current formula. +static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, + int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, + GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, + int64_t Scale) { + // We know how to expand completely foldable formulae. + return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, + BaseOffset, HasBaseReg, Scale) || + // Or formulae that use a base register produced by a sum of base + // registers. + (Scale == 1 && + isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, + BaseGV, BaseOffset, true, 0)); } static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, @@ -1382,36 +1492,23 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, F.BaseOffset, F.HasBaseReg, F.Scale); } -static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, - const Formula &F) { - // If F is used as an Addressing Mode, it may fold one Base plus one - // scaled register. If the scaled register is nil, do as if another - // element of the base regs is a 1-scaled register. - // This is possible if BaseRegs has at least 2 registers. - - // If this is not an address calculation, this is not an addressing mode - // use. - if (LU.Kind != LSRUse::Address) - return false; - - // F is already scaled. - if (F.Scale != 0) - return false; - - // We need to keep one register for the base and one to scale. - if (F.BaseRegs.size() < 2) - return false; - - return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, - F.BaseGV, F.BaseOffset, F.HasBaseReg, 1); - } +static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F) { + return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg, + F.Scale); +} static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F) { if (!F.Scale) return 0; - assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, - LU.AccessTy, F) && "Illegal formula in use."); + + // If the use is not completely folded in that instruction, we will have to + // pay an extra cost only for scale != 1. + if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, F)) + return F.Scale != 1; switch (LU.Kind) { case LSRUse::Address: { @@ -1430,12 +1527,10 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, return std::max(ScaleCostMinOffset, ScaleCostMaxOffset); } case LSRUse::ICmpZero: - // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg. - // Therefore, return 0 in case F.Scale == -1. - return F.Scale != -1; - case LSRUse::Basic: case LSRUse::Special: + // The use is completely folded, i.e., everything is folded into the + // instruction. return 0; } @@ -1460,7 +1555,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, HasBaseReg = true; } - return isLegalUse(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); + return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset, + HasBaseReg, Scale); } static bool isAlwaysFoldable(const TargetTransformInfo &TTI, @@ -1485,8 +1581,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, // base and a scale. int64_t Scale = Kind == LSRUse::ICmpZero ? 
-1 : 1; - return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, - BaseOffset, HasBaseReg, Scale); + return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, + BaseOffset, HasBaseReg, Scale); } namespace { @@ -1515,7 +1611,7 @@ struct IVChain { SmallVector Incs; const SCEV *ExprBase; - IVChain() : ExprBase(0) {} + IVChain() : ExprBase(nullptr) {} IVChain(const IVInc &Head, const SCEV *Base) : Incs(1, Head), ExprBase(Base) {} @@ -1642,8 +1738,19 @@ class LSRInstance { void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base, unsigned Depth = 0); + + void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, unsigned Depth, + size_t Idx, bool IsScaledReg = false); void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base); + void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, size_t Idx, + bool IsScaledReg = false); void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); + void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, + const SmallVectorImpl &Worklist, + size_t Idx, bool IsScaledReg = false); void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base); void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base); @@ -1721,7 +1828,7 @@ void LSRInstance::OptimizeShadowIV() { IVUsers::const_iterator CandidateUI = UI; ++UI; Instruction *ShadowUse = CandidateUI->getUser(); - Type *DestTy = 0; + Type *DestTy = nullptr; bool IsSigned = false; /* If shadow use is an int->float cast then insert a second IV @@ -1783,7 +1890,7 @@ void LSRInstance::OptimizeShadowIV() { continue; /* Initialize new IV, double d = 0.0 in above example. */ - ConstantInt *C = 0; + ConstantInt *C = nullptr; if (Incr->getOperand(0) == PH) C = dyn_cast(Incr->getOperand(1)); else if (Incr->getOperand(1) == PH) @@ -1905,7 +2012,7 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { // for ICMP_ULE here because the comparison would be with zero, which // isn't interesting. CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; - const SCEVNAryExpr *Max = 0; + const SCEVNAryExpr *Max = nullptr; if (const SCEVSMaxExpr *S = dyn_cast(BackedgeTakenCount)) { Pred = ICmpInst::ICMP_SLE; Max = S; @@ -1948,7 +2055,7 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { // Check the right operand of the select, and remember it, as it will // be used in the new comparison instruction. - Value *NewRHS = 0; + Value *NewRHS = nullptr; if (ICmpInst::isTrueWhenEqual(Pred)) { // Look for n+1, and grab n. if (AddOperator *BO = dyn_cast(Sel->getOperand(1))) @@ -2018,7 +2125,7 @@ LSRInstance::OptimizeLoopTermCond() { continue; // Search IVUsesByStride to find Cond's IVUse if there is one. - IVStrideUse *CondUse = 0; + IVStrideUse *CondUse = nullptr; ICmpInst *Cond = cast(TermBr->getCondition()); if (!FindIVUserForCond(Cond, CondUse)) continue; @@ -2071,12 +2178,12 @@ LSRInstance::OptimizeLoopTermCond() { // Check for possible scaled-address reuse.
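The shadow-IV rewrite patched above is easiest to picture at the source level. A hand-written illustration (not compiler output):

  // Before: an int-to-float conversion executes on every iteration.
  void scaleBefore(float *A, int N) {
    for (int I = 0; I < N; ++I)
      A[I] = (float)I * 0.5f;
  }

  // After: a "shadow" floating-point IV advances in lock step with I,
  // eliminating the per-iteration cast.
  void scaleAfter(float *A, int N) {
    float FI = 0.0f;
    for (int I = 0; I < N; ++I, FI += 1.0f)
      A[I] = FI * 0.5f;
  }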
Type *AccessTy = getAccessType(UI->getUser()); int64_t Scale = C->getSExtValue(); - if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0, + if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, /*BaseOffset=*/ 0, /*HasBaseReg=*/ false, Scale)) goto decline_post_inc; Scale = -Scale; - if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0, + if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, /*BaseOffset=*/ 0, /*HasBaseReg=*/ false, Scale)) goto decline_post_inc; @@ -2146,23 +2253,25 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, // the uses will have all its uses outside the loop, for example. if (LU.Kind != Kind) return false; + + // Check for a mismatched access type, and fall back conservatively as needed. + // TODO: Be less conservative when the type is similar and can use the same + // addressing modes. + if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) + NewAccessTy = Type::getVoidTy(AccessTy->getContext()); + // Conservatively assume HasBaseReg is true for now. if (NewOffset < LU.MinOffset) { - if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, LU.MaxOffset - NewOffset, HasBaseReg)) return false; NewMinOffset = NewOffset; } else if (NewOffset > LU.MaxOffset) { - if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, NewOffset - LU.MinOffset, HasBaseReg)) return false; NewMaxOffset = NewOffset; } - // Check for a mismatched access type, and fall back conservatively as needed. - // TODO: Be less conservative when the type is similar and can use the same - // addressing modes. - if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) - NewAccessTy = Type::getVoidTy(AccessTy->getContext()); // Update the use. LU.MinOffset = NewMinOffset; @@ -2183,7 +2292,7 @@ LSRInstance::getUse(const SCEV *&Expr, int64_t Offset = ExtractImmediate(Expr, SE); // Basic uses can't accept any offset, for example. - if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, + if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr, Offset, /*HasBaseReg=*/ true)) { Expr = Copy; Offset = 0; @@ -2267,7 +2376,7 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, } // Nothing looked good. - return 0; + return nullptr; } void LSRInstance::CollectInterestingTypesAndFactors() { @@ -2385,7 +2494,7 @@ static const SCEV *getExprBase(const SCEV *S) { default: // including scUnknown. return S; case scConstant: - return 0; + return nullptr; case scTruncate: return getExprBase(cast(S)->getOperand()); case scZeroExtend: @@ -2476,7 +2585,7 @@ isProfitableChain(IVChain &Chain, SmallPtrSet &Users, && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) { --cost; } - const SCEV *LastIncExpr = 0; + const SCEV *LastIncExpr = nullptr; unsigned NumConstIncrements = 0; unsigned NumVarIncrements = 0; unsigned NumReusedIncrements = 0; @@ -2535,7 +2644,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, // Visit all existing chains. Check if its IVOper can be computed as a // profitable loop invariant increment from the last link in the Chain.
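An IV chain, whose collection begins here, reuses the previous increment instead of rederiving each address from the canonical induction variable. Hand-written illustration (assumes N is a multiple of 4):

  // Before chaining: four address computations are all based on I.
  long sumBefore(const int *A, long N) {
    long S = 0;
    for (long I = 0; I < N; I += 4)
      S += A[I] + A[I + 1] + A[I + 2] + A[I + 3];
    return S;
  }

  // After chaining: one pointer increment feeds the whole group.
  long sumAfter(const int *A, long N) {
    long S = 0;
    for (const int *P = A, *E = A + N; P != E; P += 4)
      S += P[0] + P[1] + P[2] + P[3];
    return S;
  }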
unsigned ChainIdx = 0, NChains = IVChainVec.size(); - const SCEV *LastIncExpr = 0; + const SCEV *LastIncExpr = nullptr; for (; ChainIdx < NChains; ++ChainIdx) { IVChain &Chain = IVChainVec[ChainIdx]; @@ -2755,7 +2864,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, int64_t IncOffset = IncConst->getValue()->getSExtValue(); if (!isAlwaysFoldable(TTI, LSRUse::Address, - getAccessType(UserInst), /*BaseGV=*/ 0, + getAccessType(UserInst), /*BaseGV=*/ nullptr, IncOffset, /*HasBaseReg=*/ false)) return false; @@ -2773,7 +2882,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, // findIVOperand returns IVOpEnd if it can no longer find a valid IV user. User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(), IVOpEnd, L, SE); - Value *IVSrc = 0; + Value *IVSrc = nullptr; while (IVOpIter != IVOpEnd) { IVSrc = getWideOperand(*IVOpIter); @@ -2800,7 +2909,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n"); Type *IVTy = IVSrc->getType(); Type *IntTy = SE.getEffectiveSCEVType(IVTy); - const SCEV *LeftOverExpr = 0; + const SCEV *LeftOverExpr = nullptr; for (IVChain::const_iterator IncI = Chain.begin(), IncE = Chain.end(); IncI != IncE; ++IncI) { @@ -2831,7 +2940,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, TTI)) { assert(IVTy == IVOper->getType() && "inconsistent IV increment type"); IVSrc = IVOper; - LeftOverExpr = 0; + LeftOverExpr = nullptr; } } Type *OperTy = IncI->IVOperand->getType(); @@ -2886,7 +2995,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.PostIncLoops = UI->getPostIncLoops(); LSRUse::KindType Kind = LSRUse::Basic; - Type *AccessTy = 0; + Type *AccessTy = nullptr; if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { Kind = LSRUse::Address; AccessTy = getAccessType(LF.UserInst); @@ -2917,7 +3026,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) { // S is normalized, so normalize N before folding it into S // to keep the result normalized. - N = TransformForPostIncUse(Normalize, N, CI, 0, + N = TransformForPostIncUse(Normalize, N, CI, nullptr, LF.PostIncLoops, SE, DT); Kind = LSRUse::ICmpZero; S = SE.getMinusSCEV(N, S); @@ -2992,6 +3101,9 @@ void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { + // Do not insert a formula that we will not be able to expand. + assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) && + "Formula is illegal"); if (!LU.InsertFormula(F)) return false; @@ -3068,7 +3180,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LSRFixup &LF = getNewFixup(); LF.UserInst = const_cast(UserInst); LF.OperandValToReplace = U; - std::pair P = getUse(S, LSRUse::Basic, 0); + std::pair P = getUse(S, LSRUse::Basic, nullptr); LF.LUIdx = P.first; LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; @@ -3107,7 +3219,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, if (Remainder) Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); } - return 0; + return nullptr; } else if (const SCEVAddRecExpr *AR = dyn_cast(S)) { // Split a non-zero base out of an addrec.
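The addrec split named in the comment above rests on a simple identity: {Start,+,Step} evaluated on iteration K equals Start plus {0,+,Step} on iteration K. A self-checking sketch:

  #include <cassert>
  #include <cstdint>

  // Value of the recurrence {Start,+,Step} on iteration K.
  static int64_t addrec(int64_t Start, int64_t Step, int64_t K) {
    return Start + K * Step;
  }

  int main() {
    for (int64_t K = 0; K < 8; ++K)
      assert(addrec(5, 3, K) == 5 + addrec(0, 3, K)); // base split out
  }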
if (AR->getStart()->isZero()) @@ -3119,7 +3231,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, // does not pertain to this loop. if (Remainder && (AR->getLoop() == L || !isa(Remainder))) { Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); - Remainder = 0; + Remainder = nullptr; } if (Remainder != AR->getStart()) { if (!Remainder) @@ -3141,90 +3253,110 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1); if (Remainder) Ops.push_back(SE.getMulExpr(C, Remainder)); - return 0; + return nullptr; } } return S; } -/// GenerateReassociations - Split out subexpressions from adds and the bases of -/// addrecs. -void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, - Formula Base, - unsigned Depth) { - // Arbitrarily cap recursion to protect compile time. - if (Depth >= 3) return; - - for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { - const SCEV *BaseReg = Base.BaseRegs[i]; +/// \brief Helper function for LSRInstance::GenerateReassociations. +void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, + unsigned Depth, size_t Idx, + bool IsScaledReg) { + const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; + SmallVector AddOps; + const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE); + if (Remainder) + AddOps.push_back(Remainder); + + if (AddOps.size() == 1) + return; - SmallVector AddOps; - const SCEV *Remainder = CollectSubexprs(BaseReg, 0, AddOps, L, SE); - if (Remainder) - AddOps.push_back(Remainder); + for (SmallVectorImpl::const_iterator J = AddOps.begin(), + JE = AddOps.end(); + J != JE; ++J) { - if (AddOps.size() == 1) continue; + // Loop-variant "unknown" values are uninteresting; we won't be able to + // do anything meaningful with them. + if (isa(*J) && !SE.isLoopInvariant(*J, L)) + continue; - for (SmallVectorImpl::const_iterator J = AddOps.begin(), - JE = AddOps.end(); J != JE; ++J) { + // Don't pull a constant into a register if the constant could be folded + // into an immediate field. + if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, *J, Base.getNumRegs() > 1)) + continue; - // Loop-variant "unknown" values are uninteresting; we won't be able to - // do anything meaningful with them. - if (isa(*J) && !SE.isLoopInvariant(*J, L)) - continue; + // Collect all operands except *J. + SmallVector InnerAddOps( + ((const SmallVector &)AddOps).begin(), J); + InnerAddOps.append(std::next(J), + ((const SmallVector &)AddOps).end()); + + // Don't leave just a constant behind in a register if the constant could + // be folded into an immediate field. + if (InnerAddOps.size() == 1 && + isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1)) + continue; - // Don't pull a constant into a register if the constant could be folded - // into an immediate field. - if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, - LU.AccessTy, *J, Base.getNumRegs() > 1)) - continue; + const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); + if (InnerSum->isZero()) + continue; + Formula F = Base; - // Collect all operands except *J. - SmallVector InnerAddOps( - ((const SmallVector &)AddOps).begin(), J); - InnerAddOps.append(std::next(J), - ((const SmallVector &)AddOps).end()); - - // Don't leave just a constant behind in a register if the constant could - // be folded into an immediate field. 
- if (InnerAddOps.size() == 1 && - isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, - LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1)) - continue; + // Add the remaining pieces of the add back into the new formula. + const SCEVConstant *InnerSumSC = dyn_cast(InnerSum); + if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + InnerSumSC->getValue()->getZExtValue())) { + F.UnfoldedOffset = + (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue(); + if (IsScaledReg) + F.ScaledReg = nullptr; + else + F.BaseRegs.erase(F.BaseRegs.begin() + Idx); + } else if (IsScaledReg) + F.ScaledReg = InnerSum; + else + F.BaseRegs[Idx] = InnerSum; + + // Add J as its own register, or an unfolded immediate. + const SCEVConstant *SC = dyn_cast(*J); + if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + SC->getValue()->getZExtValue())) + F.UnfoldedOffset = + (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue(); + else + F.BaseRegs.push_back(*J); + // We may have changed the number of register in base regs, adjust the + // formula accordingly. + F.Canonicalize(); + + if (InsertFormula(LU, LUIdx, F)) + // If that formula hadn't been seen before, recurse to find more like + // it. + GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1); + } +} - const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); - if (InnerSum->isZero()) - continue; - Formula F = Base; +/// GenerateReassociations - Split out subexpressions from adds and the bases of +/// addrecs. +void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, + Formula Base, unsigned Depth) { + assert(Base.isCanonical() && "Input must be in the canonical form"); + // Arbitrarily cap recursion to protect compile time. + if (Depth >= 3) + return; - // Add the remaining pieces of the add back into the new formula. - const SCEVConstant *InnerSumSC = dyn_cast(InnerSum); - if (InnerSumSC && - SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && - TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + - InnerSumSC->getValue()->getZExtValue())) { - F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + - InnerSumSC->getValue()->getZExtValue(); - F.BaseRegs.erase(F.BaseRegs.begin() + i); - } else - F.BaseRegs[i] = InnerSum; - - // Add J as its own register, or an unfolded immediate. - const SCEVConstant *SC = dyn_cast(*J); - if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && - TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + - SC->getValue()->getZExtValue())) - F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + - SC->getValue()->getZExtValue(); - else - F.BaseRegs.push_back(*J); + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i); - if (InsertFormula(LU, LUIdx, F)) - // If that formula hadn't been seen before, recurse to find more like - // it. - GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1); - } - } + if (Base.Scale == 1) + GenerateReassociationsImpl(LU, LUIdx, Base, Depth, + /* Idx */ -1, /* IsScaledReg */ true); } /// GenerateCombinations - Generate a formula consisting of all of the @@ -3232,8 +3364,12 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base) { // This method is only interesting on a plurality of registers. 
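Unscale, called at the top of the next hunk, is the inverse of canonicalization. A sketch with the same simplified Formula shape as earlier; only a scale of exactly 1 can be flattened back into the base-register list:

  #include <cstdint>
  #include <vector>

  struct Formula {
    std::vector<const void *> BaseRegs;
    const void *ScaledReg = nullptr;
    int64_t Scale = 0;

    // reg1 + 1*reg2 ==> reg1 + reg2. Returns false when the scale is
    // meaningful (e.g. 2*reg) and cannot be folded away.
    bool Unscale() {
      if (Scale != 1)
        return false;
      Scale = 0;
      BaseRegs.push_back(ScaledReg);
      ScaledReg = nullptr;
      return true;
    }
  };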
- if (Base.BaseRegs.size() <= 1) return; + if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1) + return; + // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before + // processing the formula. + Base.Unscale(); Formula F = Base; F.BaseRegs.clear(); SmallVector Ops; @@ -3253,29 +3389,87 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // rather than proceed with zero in a register. if (!Sum->isZero()) { F.BaseRegs.push_back(Sum); + F.Canonicalize(); (void)InsertFormula(LU, LUIdx, F); } } } +/// \brief Helper function for LSRInstance::GenerateSymbolicOffsets. +void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, + const Formula &Base, size_t Idx, + bool IsScaledReg) { + const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; + GlobalValue *GV = ExtractSymbol(G, SE); + if (G->isZero() || !GV) + return; + Formula F = Base; + F.BaseGV = GV; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) + return; + if (IsScaledReg) + F.ScaledReg = G; + else + F.BaseRegs[Idx] = G; + (void)InsertFormula(LU, LUIdx, F); +} + /// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets. void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base) { // We can't add a symbolic offset if the address already contains one. if (Base.BaseGV) return; - for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { - const SCEV *G = Base.BaseRegs[i]; - GlobalValue *GV = ExtractSymbol(G, SE); - if (G->isZero() || !GV) - continue; + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i); + if (Base.Scale == 1) + GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1, + /* IsScaledReg */ true); +} + +/// \brief Helper function for LSRInstance::GenerateConstantOffsets. +void LSRInstance::GenerateConstantOffsetsImpl( + LSRUse &LU, unsigned LUIdx, const Formula &Base, + const SmallVectorImpl &Worklist, size_t Idx, bool IsScaledReg) { + const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; + for (SmallVectorImpl::const_iterator I = Worklist.begin(), + E = Worklist.end(); + I != E; ++I) { Formula F = Base; - F.BaseGV = GV; - if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) - continue; - F.BaseRegs[i] = G; - (void)InsertFormula(LU, LUIdx, F); + F.BaseOffset = (uint64_t)Base.BaseOffset - *I; + if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, + LU.AccessTy, F)) { + // Add the offset to the base register. + const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); + // If it cancelled out, drop the base register, otherwise update it. + if (NewG->isZero()) { + if (IsScaledReg) { + F.Scale = 0; + F.ScaledReg = nullptr; + } else + F.DeleteBaseReg(F.BaseRegs[Idx]); + F.Canonicalize(); + } else if (IsScaledReg) + F.ScaledReg = NewG; + else + F.BaseRegs[Idx] = NewG; + + (void)InsertFormula(LU, LUIdx, F); + } } + + int64_t Imm = ExtractImmediate(G, SE); + if (G->isZero() || Imm == 0) + return; + Formula F = Base; + F.BaseOffset = (uint64_t)F.BaseOffset + Imm; + if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) + return; + if (IsScaledReg) + F.ScaledReg = G; + else + F.BaseRegs[Idx] = G; + (void)InsertFormula(LU, LUIdx, F); } /// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets. 
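The offset motion performed by GenerateConstantOffsetsImpl above never changes the address being computed, only where the constant lives; when the register side cancels to zero, the register is dropped instead. The identity, as a self-checking sketch:

  #include <cassert>
  #include <cstdint>

  int main() {
    const int64_t Reg = 1000, BaseOffset = 48;
    for (int64_t I : {-16, 16, 48})
      // Move I out of the immediate and into the register.
      assert((BaseOffset - I) + (Reg + I) == BaseOffset + Reg);
  }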
@@ -3288,38 +3482,11 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, if (LU.MaxOffset != LU.MinOffset) Worklist.push_back(LU.MaxOffset); - for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { - const SCEV *G = Base.BaseRegs[i]; - - for (SmallVectorImpl::const_iterator I = Worklist.begin(), - E = Worklist.end(); I != E; ++I) { - Formula F = Base; - F.BaseOffset = (uint64_t)Base.BaseOffset - *I; - if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, - LU.AccessTy, F)) { - // Add the offset to the base register. - const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); - // If it cancelled out, drop the base register, otherwise update it. - if (NewG->isZero()) { - std::swap(F.BaseRegs[i], F.BaseRegs.back()); - F.BaseRegs.pop_back(); - } else - F.BaseRegs[i] = NewG; - - (void)InsertFormula(LU, LUIdx, F); - } - } - - int64_t Imm = ExtractImmediate(G, SE); - if (G->isZero() || Imm == 0) - continue; - Formula F = Base; - F.BaseOffset = (uint64_t)F.BaseOffset + Imm; - if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) - continue; - F.BaseRegs[i] = G; - (void)InsertFormula(LU, LUIdx, F); - } + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i); + if (Base.Scale == 1) + GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1, + /* IsScaledReg */ true); } /// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up @@ -3419,7 +3586,11 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { if (!IntTy) return; // If this Formula already has a scaled register, we can't add another one. - if (Base.Scale != 0) return; + // Try to unscale the formula to generate a better scale. + if (Base.Scale != 0 && !Base.Unscale()) + return; + + assert(Base.Scale == 0 && "Unscale did not do its job!"); // Check each interesting stride. for (SmallSetVector::const_iterator @@ -3460,6 +3631,11 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { Formula F = Base; F.ScaledReg = Quotient; F.DeleteBaseReg(F.BaseRegs[i]); + // The canonical representation of 1*reg is reg, which is already in + // Base. In that case, do not try to insert the formula; it will be + // rejected anyway. + if (F.Scale == 1 && F.BaseRegs.empty()) + continue; (void)InsertFormula(LU, LUIdx, F); } } @@ -3624,7 +3800,12 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // TODO: Use a more targeted data structure. for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) { - const Formula &F = LU.Formulae[L]; + Formula F = LU.Formulae[L]; + // FIXME: The code for the scaled and unscaled registers looks + // very similar but slightly different. Investigate if they + // could be merged. That way, we would not have to unscale the + // Formula. + F.Unscale(); // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; @@ -3650,6 +3831,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { continue; // OK, looks good. + NewF.Canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); } else { // Use the immediate in a base register. @@ -3683,6 +3865,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { goto skip_formula; // OK, looks good.
+ NewF.Canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); break; skip_formula:; @@ -3936,7 +4119,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { for (SmallVectorImpl::const_iterator I = LU.Formulae.begin(), E = LU.Formulae.end(); I != E; ++I) { const Formula &F = *I; - if (F.BaseOffset == 0 || F.Scale != 0) + if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1)) continue; LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU); @@ -4033,7 +4216,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { // Pick the register which is used by the most LSRUses, which is likely // to be a good reuse register candidate. - const SCEV *Best = 0; + const SCEV *Best = nullptr; unsigned BestNum = 0; for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end(); I != E; ++I) { @@ -4130,19 +4313,22 @@ void LSRInstance::SolveRecurse(SmallVectorImpl &Solution, E = LU.Formulae.end(); I != E; ++I) { const Formula &F = *I; - // Ignore formulae which do not use any of the required registers. - bool SatisfiedReqReg = true; + // Ignore formulae which may not be ideal in terms of register reuse of + // ReqRegs. The formula should use all required registers before + // introducing new ones. + int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size()); for (SmallSetVector::const_iterator J = ReqRegs.begin(), JE = ReqRegs.end(); J != JE; ++J) { const SCEV *Reg = *J; - if ((!F.ScaledReg || F.ScaledReg != Reg) && - std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) == + if ((F.ScaledReg && F.ScaledReg == Reg) || + std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) != F.BaseRegs.end()) { - SatisfiedReqReg = false; - break; + --NumReqRegsToFind; + if (NumReqRegsToFind == 0) + break; } } - if (!SatisfiedReqReg) { + if (NumReqRegsToFind != 0) { // If none of the formulae satisfied the required registers, then we could // clear ReqRegs and try again. Currently, we simply give up in this case. continue; @@ -4240,7 +4426,7 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, } bool AllDominate = true; - Instruction *BetterPos = 0; + Instruction *BetterPos = nullptr; Instruction *Tentative = IDom->getTerminator(); for (SmallVectorImpl::const_iterator I = Inputs.begin(), E = Inputs.end(); I != E; ++I) { @@ -4379,11 +4565,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, LF.UserInst, LF.OperandValToReplace, Loops, SE, DT); - Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP))); + Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, IP))); } // Expand the ScaledReg portion. - Value *ICmpScaledV = 0; + Value *ICmpScaledV = nullptr; if (F.Scale != 0) { const SCEV *ScaledS = F.ScaledReg; @@ -4394,25 +4580,34 @@ Value *LSRInstance::Expand(const LSRFixup &LF, Loops, SE, DT); if (LU.Kind == LSRUse::ICmpZero) { - // An interesting way of "folding" with an icmp is to use a negated - // scale, which we'll implement by inserting it into the other operand - // of the icmp. - assert(F.Scale == -1 && - "The only scale supported by ICmpZero uses is -1!"); - ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP); + // Expand ScaleReg as if it was part of the base regs. + if (F.Scale == 1) + Ops.push_back( + SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP))); + else { + // An interesting way of "folding" with an icmp is to use a negated + // scale, which we'll implement by inserting it into the other operand + // of the icmp. 
+ assert(F.Scale == -1 && + "The only scale supported by ICmpZero uses is -1!"); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, IP); + } } else { // Otherwise just expand the scaled register and an explicit scale, // which is expected to be matched as part of the address. // Flush the operand list to suppress SCEVExpander hoisting address modes. - if (!Ops.empty() && LU.Kind == LSRUse::Address) { + // Unless the addressing mode will not be folded. + if (!Ops.empty() && LU.Kind == LSRUse::Address && + isAMCompletelyFolded(TTI, LU, F)) { Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } - ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); - ScaledS = SE.getMulExpr(ScaledS, - SE.getConstant(ScaledS->getType(), F.Scale)); + ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP)); + if (F.Scale != 1) + ScaledS = + SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); Ops.push_back(ScaledS); } } @@ -4490,7 +4685,9 @@ Value *LSRInstance::Expand(const LSRFixup &LF, } CI->setOperand(1, ICmpScaledV); } else { - assert(F.Scale == 0 && + // A scale of 1 means that the scale has been expanded as part of the + // base regs. + assert((F.Scale == 0 || F.Scale == 1) && "ICmp does not support folding a global value and " "a scale at the same time!"); Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), @@ -4531,7 +4728,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN, Loop *PNLoop = LI.getLoopFor(Parent); if (!PNLoop || Parent != PNLoop->getHeader()) { // Split the critical edge. - BasicBlock *NewBB = 0; + BasicBlock *NewBB = nullptr; if (!Parent->isLandingPad()) { NewBB = SplitCriticalEdge(BB, Parent, P, /*MergeIdenticalEdges=*/true, @@ -4560,7 +4757,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN, } std::pair::iterator, bool> Pair = - Inserted.insert(std::make_pair(BB, static_cast(0))); + Inserted.insert(std::make_pair(BB, static_cast(nullptr))); if (!Pair.second) PN->setIncomingValue(i, Pair.first->second); else { @@ -4670,7 +4867,7 @@ LSRInstance::LSRInstance(Loop *L, Pass *P) DT(P->getAnalysis().getDomTree()), LI(P->getAnalysis()), TTI(P->getAnalysis()), L(L), Changed(false), - IVIncInsertPos(0) { + IVIncInsertPos(nullptr) { // If LoopSimplify form is not available, stay out of trouble. if (!L->isLoopSimplifyForm()) return; diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index ecd350b..fc28fd2 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -12,7 +12,6 @@ // counts of loops easily. 
//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-unroll" #include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopPass.h" @@ -29,6 +28,8 @@ using namespace llvm; +#define DEBUG_TYPE "loop-unroll" + static cl::opt UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden, cl::desc("The cut-off point for automatic loop unrolling")); @@ -237,9 +238,12 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; } uint64_t Size = (uint64_t)LoopSize*Count; - if (TripCount != 1 && Size > Threshold) { - DEBUG(dbgs() << " Too large to fully unroll with count: " << Count - << " because size: " << Size << ">" << Threshold << "\n"); + if (TripCount != 1 && + (Size > Threshold || (Count != TripCount && Size > PartialThreshold))) { + if (Size > Threshold) + DEBUG(dbgs() << " Too large to fully unroll with count: " << Count + << " because size: " << Size << ">" << Threshold << "\n"); + bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; if (!AllowPartial && !(Runtime && TripCount == 0)) { DEBUG(dbgs() << " will not try to unroll partially because " diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 5954f4a..977c53a 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -26,7 +26,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-unswitch" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" @@ -53,6 +52,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "loop-unswitch" + STATISTIC(NumBranches, "Number of branches unswitched"); STATISTIC(NumSwitches, "Number of switches unswitched"); STATISTIC(NumSelects , "Number of selects unswitched"); @@ -96,7 +97,7 @@ namespace { public: LUAnalysisCache() : - CurLoopInstructions(0), CurrentLoopProperties(0), + CurLoopInstructions(nullptr), CurrentLoopProperties(nullptr), MaxSize(Threshold) {} @@ -151,8 +152,8 @@ namespace { static char ID; // Pass ID, replacement for typeid explicit LoopUnswitch(bool Os = false) : LoopPass(ID), OptimizeForSize(Os), redoLoop(false), - currentLoop(0), DT(0), loopHeader(0), - loopPreheader(0) { + currentLoop(nullptr), DT(nullptr), loopHeader(nullptr), + loopPreheader(nullptr) { initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); } @@ -180,15 +181,6 @@ namespace { BranchesInfo.forgetLoop(currentLoop); } - /// RemoveLoopFromWorklist - If the specified loop is on the loop worklist, - /// remove it. 
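The new unrolling guard above separates the full-unroll budget from the partial-unroll budget. Its decision logic, extracted into a standalone predicate (names simplified from the pass's locals):

  #include <cstdint>

  // True when the unrolled body would be too big: a full unroll is sized
  // against Threshold; a partial unroll (Count != TripCount) must also fit
  // within PartialThreshold.
  static bool tooLargeToUnroll(uint64_t LoopSize, uint64_t Count,
                               uint64_t TripCount, uint64_t Threshold,
                               uint64_t PartialThreshold) {
    uint64_t Size = LoopSize * Count;
    return TripCount != 1 &&
           (Size > Threshold ||
            (Count != TripCount && Size > PartialThreshold));
  }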
- void RemoveLoopFromWorklist(Loop *L) { - std::vector::iterator I = std::find(LoopProcessWorklist.begin(), - LoopProcessWorklist.end(), L); - if (I != LoopProcessWorklist.end()) - LoopProcessWorklist.erase(I); - } - void initLoopData() { loopHeader = currentLoop->getHeader(); loopPreheader = currentLoop->getLoopPreheader(); @@ -212,9 +204,8 @@ namespace { Instruction *InsertPt); void SimplifyCode(std::vector &Worklist, Loop *L); - void RemoveLoopFromHierarchy(Loop *L); - bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0, - BasicBlock **LoopExit = 0); + bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = nullptr, + BasicBlock **LoopExit = nullptr); }; } @@ -283,8 +274,8 @@ void LUAnalysisCache::forgetLoop(const Loop *L) { LoopsProperties.erase(LIt); } - CurrentLoopProperties = 0; - CurLoopInstructions = 0; + CurrentLoopProperties = nullptr; + CurLoopInstructions = nullptr; } // Mark case value as unswitched. @@ -355,10 +346,10 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { // We can never unswitch on vector conditions. if (Cond->getType()->isVectorTy()) - return 0; + return nullptr; // Constants should be folded, not unswitched on! - if (isa(Cond)) return 0; + if (isa(Cond)) return nullptr; // TODO: Handle: br (VARIANT|INVARIANT). @@ -378,7 +369,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { return RHS; } - return 0; + return nullptr; } bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { @@ -389,7 +380,7 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { LPM = &LPM_Ref; DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable(); - DT = DTWP ? &DTWP->getDomTree() : 0; + DT = DTWP ? &DTWP->getDomTree() : nullptr; currentLoop = L; Function *F = currentLoop->getHeader()->getParent(); bool Changed = false; @@ -461,7 +452,7 @@ bool LoopUnswitch::processCurrentLoop() { // Find a value to unswitch on: // FIXME: this should choose the most expensive case! // FIXME: scan for a case with a non-critical edge? - Constant *UnswitchVal = 0; + Constant *UnswitchVal = nullptr; // Do not process the same value again and again. // At this point we have some cases already unswitched and @@ -518,7 +509,7 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, if (!L->contains(BB)) { // Otherwise, this is a loop exit; this is fine so long as this is the // first exit. - if (ExitBB != 0) return false; + if (ExitBB) return false; ExitBB = BB; return true; } @@ -545,10 +536,10 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { std::set Visited; Visited.insert(L->getHeader()); // Branches to header make infinite loops. - BasicBlock *ExitBB = 0; + BasicBlock *ExitBB = nullptr; if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited)) return ExitBB; - return 0; + return nullptr; } /// IsTrivialUnswitchCondition - Check to see if this unswitch condition is @@ -569,7 +560,7 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, TerminatorInst *HeaderTerm = Header->getTerminator(); LLVMContext &Context = Header->getContext(); - BasicBlock *LoopExitBB = 0; + BasicBlock *LoopExitBB = nullptr; if (BranchInst *BI = dyn_cast(HeaderTerm)) { // If the header block doesn't end with a conditional branch on Cond, we // can't handle it. @@ -639,8 +630,8 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, /// unswitch the loop, reprocess the pieces, then return true.
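What unswitching does, in source form (hand-written illustration): the loop-invariant test is hoisted and the loop is cloned, one copy per branch direction, trading code size for straight-line bodies.

  // Before: the invariant test runs on every iteration.
  void stepBefore(int *A, int N, bool Flag) {
    for (int I = 0; I < N; ++I)
      A[I] += Flag ? 1 : -1;
  }

  // After unswitching: the branch runs once outside the loop.
  void stepAfter(int *A, int N, bool Flag) {
    if (Flag)
      for (int I = 0; I < N; ++I) A[I] += 1;
    else
      for (int I = 0; I < N; ++I) A[I] -= 1;
  }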
bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { Function *F = loopHeader->getParent(); - Constant *CondVal = 0; - BasicBlock *ExitBlock = 0; + Constant *CondVal = nullptr; + BasicBlock *ExitBlock = nullptr; if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) { // If the condition is trivial, always unswitch. There is no code growth @@ -948,17 +939,6 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V, ++NumSimplify; } -/// RemoveLoopFromHierarchy - We have discovered that the specified loop has -/// become unwrapped, either because the backedge was deleted, or because the -/// edge into the header was removed. If the edge into the header from the -/// latch block was removed, the loop is unwrapped but subloops are still alive, -/// so they just reparent loops. If the loops are actually dead, they will be -/// removed later. -void LoopUnswitch::RemoveLoopFromHierarchy(Loop *L) { - LPM->deleteLoopFromQueue(L); - RemoveLoopFromWorklist(L); -} - // RewriteLoopBodyWithConditionConstant - We know either that the value LIC has // the value specified by Val in the specified loop, or we know it does NOT have // that value. Rewrite any uses of LIC or of properties correlated to it. @@ -1020,7 +1000,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // If we know that LIC is not Val, use this info to simplify code. SwitchInst *SI = dyn_cast(UI); - if (SI == 0 || !isa(Val)) continue; + if (!SI || !isa(Val)) continue; SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast(Val)); // Default case is live for multiple values. diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 7c0a623..4251ac4 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loweratomic" #include "llvm/Transforms/Scalar.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -20,6 +19,8 @@ #include "llvm/Pass.h" using namespace llvm; +#define DEBUG_TYPE "loweratomic" + static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { IRBuilder<> Builder(CXI->getParent(), CXI); Value *Ptr = CXI->getPointerOperand(); @@ -42,7 +43,7 @@ static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) { Value *Val = RMWI->getValOperand(); LoadInst *Orig = Builder.CreateLoad(Ptr); - Value *Res = NULL; + Value *Res = nullptr; switch (RMWI->getOperation()) { default: llvm_unreachable("Unexpected RMW operation"); diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 2603c96..b6bc792 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "memcpyopt" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -33,6 +32,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "memcpyopt" + STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted"); STATISTIC(NumMemSetInfer, "Number of memsets inferred"); STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); @@ -49,7 +50,7 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, int64_t Offset = 0; for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { ConstantInt *OpC = dyn_cast(GEP->getOperand(i)); - if (OpC 
== 0) + if (!OpC) return VariableIdxFound = true; if (OpC->isZero()) continue; // No offset. @@ -89,12 +90,12 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, // If one pointer is a GEP and the other isn't, then see if the GEP is a // constant offset from the base, as in "P" and "gep P, 1". - if (GEP1 && GEP2 == 0 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { + if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, TD); return !VariableIdxFound; } - if (GEP2 && GEP1 == 0 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { + if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD); return !VariableIdxFound; } @@ -317,9 +318,9 @@ namespace { static char ID; // Pass identification, replacement for typeid MemCpyOpt() : FunctionPass(ID) { initializeMemCpyOptPass(*PassRegistry::getPassRegistry()); - MD = 0; - TLI = 0; - DL = 0; + MD = nullptr; + TLI = nullptr; + DL = nullptr; } bool runOnFunction(Function &F) override; @@ -373,7 +374,7 @@ INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", /// attempts to merge them together into a memcpy/memset. Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Value *StartPtr, Value *ByteVal) { - if (DL == 0) return 0; + if (!DL) return nullptr; // Okay, so we now have a single store that can be splatable. Scan to find // all subsequent stores of the same value to offset from the same pointer. @@ -426,7 +427,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // If we have no ranges, then we just had a single store with nothing that // could be merged in. This is a very common case of course. if (Ranges.empty()) - return 0; + return nullptr; // If we had at least one store that could be merged in, add the starting // store as well. We try to avoid this unless there is at least something @@ -440,7 +441,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // Now that we have full information about ranges, loop over the ranges and // emit memset's for anything big enough to be worthwhile. - Instruction *AMemSet = 0; + Instruction *AMemSet = nullptr; for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); I != E; ++I) { const MemsetRange &Range = *I; @@ -491,7 +492,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; - if (DL == 0) return false; + if (!DL) return false; // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than @@ -500,7 +501,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { MemDepResult ldep = MD->getDependency(LI); - CallInst *C = 0; + CallInst *C = nullptr; if (ldep.isClobber() && !isa(ldep.getInst())) C = dyn_cast(ldep.getInst()); @@ -512,7 +513,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { for (BasicBlock::iterator I = --BasicBlock::iterator(SI), E = C; I != E; --I) { if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) { - C = 0; + C = nullptr; break; } } @@ -603,7 +604,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; // Check that all of src is copied to dest. 
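The call-slot forwarding whose legality checks appear above lets a producer write its output in place rather than staging it in a temporary. Hand-written illustration; produce is a hypothetical helper that fills its out-parameter completely:

  #include <cstring>

  struct Buf { char Data[64]; };

  static void produce(Buf *Out) { std::memset(Out->Data, 7, sizeof Out->Data); }

  void copyBefore(Buf *Dst) {
    Buf Tmp;
    produce(&Tmp);
    std::memcpy(Dst, &Tmp, sizeof Tmp); // the copy the pass elides
  }

  void copyAfter(Buf *Dst) {
    produce(Dst); // all of src was copied to dest, so write dest directly
  }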
- if (DL == 0) return false; + if (!DL) return false; ConstantInt *srcArraySize = dyn_cast(srcAlloca->getArraySize()); if (!srcArraySize) return false; @@ -846,7 +847,7 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { // The optimizations after this point require the memcpy size. ConstantInt *CopySize = dyn_cast(M->getLength()); - if (CopySize == 0) return false; + if (!CopySize) return false; // There are three possible optimizations we can do for memcpy: // a) memcpy-memcpy xform which exposes redundancy for DSE. @@ -929,7 +930,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { /// processByValArgument - This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { - if (DL == 0) return false; + if (!DL) return false; // Find out what feeds this byval argument. Value *ByValArg = CS.getArgument(ArgNo); @@ -946,13 +947,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // a memcpy, see if we can byval from the source of the memcpy instead of the // result. MemCpyInst *MDep = dyn_cast(DepInfo.getInst()); - if (MDep == 0 || MDep->isVolatile() || + if (!MDep || MDep->isVolatile() || ByValArg->stripPointerCasts() != MDep->getDest()) return false; // The length of the memcpy must be larger than or equal to the size of the byval. ConstantInt *C1 = dyn_cast(MDep->getLength()); - if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize) + if (!C1 || C1->getValue().getZExtValue() < ByValSize) return false; // Get the alignment of the byval. If the call doesn't specify the alignment, @@ -1043,7 +1044,7 @@ bool MemCpyOpt::runOnFunction(Function &F) { bool MadeChange = false; MD = &getAnalysis(); DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ?
&DLP->getDataLayout() : nullptr; TLI = &getAnalysis(); // If we don't have at least memset and memcpy, there is little point of doing @@ -1058,6 +1059,6 @@ bool MemCpyOpt::runOnFunction(Function &F) { MadeChange = true; } - MD = 0; + MD = nullptr; return MadeChange; } diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 2f19935..7cce89e 100644 --- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "partially-inline-libcalls" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Intrinsics.h" @@ -25,6 +24,8 @@ using namespace llvm; +#define DEBUG_TYPE "partially-inline-libcalls" + namespace { class PartiallyInlineLibCalls : public FunctionPass { public: diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index b6b4d97..986d6a4 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -20,7 +20,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "reassociate" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" @@ -42,6 +41,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "reassociate" + STATISTIC(NumChanged, "Number of insts reassociated"); STATISTIC(NumAnnihil, "Number of expr tree annihilated"); STATISTIC(NumFactor , "Number of multiplies factored"); @@ -122,14 +123,14 @@ namespace { public: XorOpnd(Value *V); - bool isInvalid() const { return SymbolicPart == 0; } + bool isInvalid() const { return SymbolicPart == nullptr; } bool isOrExpr() const { return isOr; } Value *getValue() const { return OrigVal; } Value *getSymbolicPart() const { return SymbolicPart; } unsigned getSymbolicRank() const { return SymbolicRank; } const APInt &getConstPart() const { return ConstPart; } - void Invalidate() { SymbolicPart = OrigVal = 0; } + void Invalidate() { SymbolicPart = OrigVal = nullptr; } void setSymbolicRank(unsigned R) { SymbolicRank = R; } // Sort the XorOpnd-Pointer in ascending order of symbolic-value-rank. @@ -236,7 +237,7 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { if (V->hasOneUse() && isa(V) && cast(V)->getOpcode() == Opcode) return cast(V); - return 0; + return nullptr; } static bool isUnmovableInstruction(Instruction *I) { @@ -284,7 +285,7 @@ void Reassociate::BuildRankMap(Function &F) { unsigned Reassociate::getRank(Value *V) { Instruction *I = dyn_cast(V); - if (I == 0) { + if (!I) { if (isa(V)) return ValueRankMap[V]; // Function argument. return 0; // Otherwise it's a global or constant, rank 0. } @@ -705,7 +706,7 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, // ExpressionChanged - Non-null if the rewritten expression differs from the // original in some non-trivial way, requiring the clearing of optional flags. // Flags are cleared from the operator in ExpressionChanged up to I inclusive. 
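The rank machinery maintained here (constants rank 0, as getRank above shows; arguments and instructions rank progressively higher) exists to expose folds hidden by the source tree shape:

  // Before: the constants are separated by the parenthesization.
  int rankBefore(int X, int Y) { return (X + 4) + (Y + 3); }

  // After sorting the linearized operand list {X, Y, 4, 3} by rank, the
  // low-rank constants sit together and fold.
  int rankAfter(int X, int Y) { return (X + Y) + 7; }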
- BinaryOperator *ExpressionChanged = 0; + BinaryOperator *ExpressionChanged = nullptr; for (unsigned i = 0; ; ++i) { // The last operation (which comes earliest in the IR) is special as both // operands will come from Ops, rather than just one with the other being @@ -995,7 +996,7 @@ static Value *EmitAddTreeOfValues(Instruction *I, /// remove Factor from the tree and return the new tree. Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { BinaryOperator *BO = isReassociableOp(V, Instruction::Mul); - if (!BO) return 0; + if (!BO) return nullptr; SmallVector Tree; MadeChange |= LinearizeExprTree(BO, Tree); @@ -1029,7 +1030,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { if (!FoundFactor) { // Make sure to restore the operands to the expression tree. RewriteExprTree(BO, Factors); - return 0; + return nullptr; } BasicBlock::iterator InsertPt = BO; ++InsertPt; @@ -1114,7 +1115,7 @@ static Value *OptimizeAndOrXor(unsigned Opcode, ++NumAnnihil; } } - return 0; + return nullptr; } /// Helper function of CombineXorOpnd(). It creates a bitwise-and @@ -1135,7 +1136,7 @@ static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, } return Opnd; } - return 0; + return nullptr; } // Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd" @@ -1261,7 +1262,7 @@ Value *Reassociate::OptimizeXor(Instruction *I, return V; if (Ops.size() == 1) - return 0; + return nullptr; SmallVector Opnds; SmallVector OpndPtrs; @@ -1294,7 +1295,7 @@ Value *Reassociate::OptimizeXor(Instruction *I, std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(), XorOpnd::PtrSortFunctor()); // Step 3: Combine adjacent operands - XorOpnd *PrevOpnd = 0; + XorOpnd *PrevOpnd = nullptr; bool Changed = false; for (unsigned i = 0, e = Opnds.size(); i < e; i++) { XorOpnd *CurrOpnd = OpndPtrs[i]; @@ -1328,7 +1329,7 @@ Value *Reassociate::OptimizeXor(Instruction *I, PrevOpnd = CurrOpnd; } else { CurrOpnd->Invalidate(); - PrevOpnd = 0; + PrevOpnd = nullptr; } Changed = true; } @@ -1358,7 +1359,7 @@ Value *Reassociate::OptimizeXor(Instruction *I, } } - return 0; + return nullptr; } /// OptimizeAdd - Optimize a series of operands to an 'add' instruction. This @@ -1445,7 +1446,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I, // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4) // where they are actually the same multiply. unsigned MaxOcc = 0; - Value *MaxOccVal = 0; + Value *MaxOccVal = nullptr; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul); if (!BOp) @@ -1543,7 +1544,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I, Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2)); } - return 0; + return nullptr; } /// \brief Build up a vector of value/power pairs factoring a product. @@ -1688,14 +1689,14 @@ Value *Reassociate::OptimizeMul(BinaryOperator *I, // We can only optimize the multiplies when there is a chain of more than // three, such that a balanced tree might require fewer total multiplies. if (Ops.size() < 4) - return 0; + return nullptr; // Try to turn linear trees of multiplies without other uses of the // intermediate stages into minimal multiply DAGs with perfect sub-expression // re-use. SmallVector Factors; if (!collectMultiplyFactors(Ops, Factors)) - return 0; // All distinct factors, so nothing left for us to do. + return nullptr; // All distinct factors, so nothing left for us to do.
IRBuilder<> Builder(I); Value *V = buildMinimalMultiplyDAG(Builder, Factors); @@ -1704,14 +1705,14 @@ Value *Reassociate::OptimizeMul(BinaryOperator *I, ValueEntry NewEntry = ValueEntry(getRank(V), V); Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry); - return 0; + return nullptr; } Value *Reassociate::OptimizeExpression(BinaryOperator *I, SmallVectorImpl &Ops) { // Now that we have the linearized expression tree, try to optimize it. // Start by folding any constants that we found. - Constant *Cst = 0; + Constant *Cst = nullptr; unsigned Opcode = I->getOpcode(); while (!Ops.empty() && isa(Ops.back().Op)) { Constant *C = cast(Ops.pop_back_val().Op); @@ -1761,7 +1762,7 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, if (Ops.size() != NumOps) return OptimizeExpression(I, Ops); - return 0; + return nullptr; } /// EraseInst - Zap the given instruction, adding interesting operands to the diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index d9809ce..b6023e2 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -16,7 +16,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "reg2mem" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/BasicBlock.h" @@ -30,6 +29,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "reg2mem" + STATISTIC(NumRegsDemoted, "Number of registers demoted"); STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted"); diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index b8f10e9..feeb231 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -17,7 +17,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sccp" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -42,6 +41,8 @@ #include using namespace llvm; +#define DEBUG_TYPE "sccp" + STATISTIC(NumInstRemoved, "Number of instructions removed"); STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable"); @@ -81,7 +82,7 @@ class LatticeVal { } public: - LatticeVal() : Val(0, undefined) {} + LatticeVal() : Val(nullptr, undefined) {} bool isUndefined() const { return getLatticeValue() == undefined; } bool isConstant() const { @@ -133,7 +134,7 @@ public: ConstantInt *getConstantInt() const { if (isConstant()) return dyn_cast(getConstant()); - return 0; + return nullptr; } void markForcedConstant(Constant *V) { @@ -403,7 +404,7 @@ private: if (Constant *C = dyn_cast(V)) { Constant *Elt = C->getAggregateElement(i); - if (Elt == 0) + if (!Elt) LV.markOverdefined(); // Unknown sort of constant. else if (isa(Elt)) ; // Undef values remain undefined. @@ -522,7 +523,7 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, LatticeVal BCValue = getValueState(BI->getCondition()); ConstantInt *CI = BCValue.getConstantInt(); - if (CI == 0) { + if (!CI) { // Overdefined condition variables, and branches on unfoldable constant // conditions, mean the branch could go either way. if (!BCValue.isUndefined()) @@ -549,7 +550,7 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); - if (CI == 0) { // Overdefined or undefined condition? + if (!CI) { // Overdefined or undefined condition? // All destinations are executable! 
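LatticeVal, manipulated throughout this hunk, implements a three-level lattice; state only ever moves downward, which bounds how often SCCP revisits an instruction. A minimal model with an int payload standing in for Constant*:

  enum class State { Undefined, Constant, Overdefined };

  struct LatticeVal {
    State S = State::Undefined;
    int C = 0; // payload, meaningful only in the Constant state

    // Merge a newly discovered constant; transitions are one-way:
    // Undefined -> Constant -> Overdefined.
    void mergeConstant(int V) {
      if (S == State::Undefined) {
        S = State::Constant;
        C = V;
      } else if (S == State::Constant && C != V) {
        S = State::Overdefined;
      }
    }
  };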
if (!SCValue.isUndefined()) Succs.assign(TI.getNumSuccessors(), true); @@ -594,7 +595,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { // Overdefined condition variables mean the branch could go either way, // undef conditions mean that neither edge is feasible yet. ConstantInt *CI = BCValue.getConstantInt(); - if (CI == 0) + if (!CI) return !BCValue.isUndefined(); // Constant condition variables mean the branch can only go a single way. @@ -612,7 +613,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); - if (CI == 0) + if (!CI) return !SCValue.isUndefined(); return SI->findCaseValue(CI).getCaseSuccessor() == To; @@ -626,7 +627,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { #ifndef NDEBUG dbgs() << "Unknown terminator instruction: " << *TI << '\n'; #endif - llvm_unreachable(0); + llvm_unreachable(nullptr); } // visit Implementations - Something changed in this instruction, either an @@ -667,7 +668,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // constant. If they are constant and don't agree, the PHI is overdefined. // If there are no executable operands, the PHI remains undefined. // - Constant *OperandVal = 0; + Constant *OperandVal = nullptr; for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { LatticeVal IV = getValueState(PN.getIncomingValue(i)); if (IV.isUndefined()) continue; // Doesn't influence PHI node. @@ -678,7 +679,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) { if (IV.isOverdefined()) // PHI node becomes overdefined! return markOverdefined(&PN); - if (OperandVal == 0) { // Grab the first value. + if (!OperandVal) { // Grab the first value. OperandVal = IV.getConstant(); continue; } @@ -774,7 +775,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) { void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) { StructType *STy = dyn_cast(IVI.getType()); - if (STy == 0) + if (!STy) return markOverdefined(&IVI); // If this has more than one index, we can't handle it, drive all results to @@ -862,7 +863,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { // If this is an AND or OR with 0 or -1, it doesn't matter that the other // operand is overdefined. if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Or) { - LatticeVal *NonOverdefVal = 0; + LatticeVal *NonOverdefVal = nullptr; if (!V1State.isOverdefined()) NonOverdefVal = &V1State; else if (!V2State.isOverdefined()) @@ -1081,7 +1082,7 @@ void SCCPSolver::visitCallSite(CallSite CS) { // The common case is that we aren't tracking the callee, either because we // are not doing interprocedural analysis or the callee is indirect, or is // external. Handle these cases first. - if (F == 0 || F->isDeclaration()) { + if (!F || F->isDeclaration()) { CallOverdefined: // Void return and not tracking callee, just bail. if (I->getType()->isVoidTy()) return; @@ -1555,7 +1556,7 @@ bool SCCP::runOnFunction(Function &F) { DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); const DataLayoutPass *DLP = getAnalysisIfAvailable(); - const DataLayout *DL = DLP ? &DLP->getDataLayout() : 0; + const DataLayout *DL = DLP ? 
&DLP->getDataLayout() : nullptr; const TargetLibraryInfo *TLI = &getAnalysis(); SCCPSolver Solver(DL, TLI); @@ -1684,7 +1685,7 @@ static bool AddressIsTaken(const GlobalValue *GV) { bool IPSCCP::runOnModule(Module &M) { DataLayoutPass *DLP = getAnalysisIfAvailable(); - const DataLayout *DL = DLP ? &DLP->getDataLayout() : 0; + const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr; const TargetLibraryInfo *TLI = &getAnalysis(); SCCPSolver Solver(DL, TLI); diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index ed5e618..04bf4f8 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -23,7 +23,6 @@ /// //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sroa" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -64,6 +63,8 @@ using namespace llvm; +#define DEBUG_TYPE "sroa" + STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed"); STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca"); @@ -159,8 +160,8 @@ public: Use *getUse() const { return UseAndIsSplittable.getPointer(); } - bool isDead() const { return getUse() == 0; } - void kill() { UseAndIsSplittable.setPointer(0); } + bool isDead() const { return getUse() == nullptr; } + void kill() { UseAndIsSplittable.setPointer(nullptr); } /// \brief Support for ordering ranges. /// @@ -320,7 +321,7 @@ static Value *foldSelectInst(SelectInst &SI) { if (SI.getOperand(1) == SI.getOperand(2)) return SI.getOperand(1); - return 0; + return nullptr; } /// \brief Builder for the alloca slices. @@ -642,7 +643,7 @@ private: Uses.push_back(std::make_pair(I, cast(U))); } while (!Uses.empty()); - return 0; + return nullptr; } void visitPHINode(PHINode &PN) { @@ -724,7 +725,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) AI(AI), #endif - PointerEscapingInstr(0) { + PointerEscapingInstr(nullptr) { SliceBuilder PB(DL, AI, *this); SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); if (PtrI.isEscaped() || PtrI.isAborted()) { @@ -873,7 +874,7 @@ public: for (SmallVectorImpl::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; - Value *Arg = 0; + Value *Arg = nullptr; if (StoreInst *SI = dyn_cast(Inst)) { // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. @@ -969,7 +970,7 @@ class SROA : public FunctionPass { public: SROA(bool RequiresDomTree = true) : FunctionPass(ID), RequiresDomTree(RequiresDomTree), - C(0), DL(0), DT(0) { + C(nullptr), DL(nullptr), DT(nullptr) { initializeSROAPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -1011,9 +1012,9 @@ INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", static Type *findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E, uint64_t EndOffset) { - Type *Ty = 0; + Type *Ty = nullptr; bool TyIsCommon = true; - IntegerType *ITy = 0; + IntegerType *ITy = nullptr; // Note that we need to look at *every* alloca slice's Use to ensure we // always get consistent results regardless of the order of slices. 
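The findCommonType scan that continues in the next hunk is easiest to see on a concrete slice set. A hypothetical example, not from this patch: suppose one slice user loads the alloca as float while another stores an i32 at the same offsets (typed-pointer IR of this era):

    %f = load float* %p.0      ; UserTy = float -> candidate common type
    store i32 %v, i32* %p.1    ; UserTy = i32   -> disagrees, TyIsCommon goes false

No common type survives, but the i32 is retained in ITy as the fallback integer type, so the partition can still be rewritten with integer loads and stores.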
@@ -1024,7 +1025,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B, if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset) continue; - Type *UserTy = 0; + Type *UserTy = nullptr; if (LoadInst *LI = dyn_cast(U->getUser())) { UserTy = LI->getType(); } else if (StoreInst *SI = dyn_cast(U->getUser())) { @@ -1074,7 +1075,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B, /// FIXME: This should be hoisted into a generic utility, likely in /// Transforms/Util/Local.h static bool isSafePHIToSpeculate(PHINode &PN, - const DataLayout *DL = 0) { + const DataLayout *DL = nullptr) { // For now, we can only do this promotion if the load is in the same block // as the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. @@ -1084,7 +1085,7 @@ static bool isSafePHIToSpeculate(PHINode &PN, bool HaveLoad = false; for (User *U : PN.users()) { LoadInst *LI = dyn_cast(U); - if (LI == 0 || !LI->isSimple()) + if (!LI || !LI->isSimple()) return false; // For now we only allow loads in the same block as the PHI. This is @@ -1191,7 +1192,8 @@ static void speculatePHINodeLoads(PHINode &PN) { /// /// We can do this to a select if its only uses are loads and if the operand /// to the select can be loaded unconditionally. -static bool isSafeSelectToSpeculate(SelectInst &SI, const DataLayout *DL = 0) { +static bool isSafeSelectToSpeculate(SelectInst &SI, + const DataLayout *DL = nullptr) { Value *TValue = SI.getTrueValue(); Value *FValue = SI.getFalseValue(); bool TDerefable = TValue->isDereferenceablePointer(); @@ -1199,7 +1201,7 @@ static bool isSafeSelectToSpeculate(SelectInst &SI, const DataLayout *DL = 0) { for (User *U : SI.users()) { LoadInst *LI = dyn_cast(U); - if (LI == 0 || !LI->isSimple()) + if (!LI || !LI->isSimple()) return false; // Both operands to the select need to be dereferencable, either @@ -1332,19 +1334,21 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, // We can't recurse through pointer types. if (Ty->isPointerTy()) - return 0; + return nullptr; // We try to analyze GEPs over vectors here, but note that these GEPs are // extremely poorly defined currently. The long-term goal is to remove GEPing // over a vector from the IR completely. if (VectorType *VecTy = dyn_cast(Ty)) { unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType()); - if (ElementSizeInBits % 8) - return 0; // GEPs over non-multiple of 8 size vector elements are invalid. + if (ElementSizeInBits % 8 != 0) { + // GEPs over non-multiple of 8 size vector elements are invalid. 
+ return nullptr; + } APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(VecTy->getNumElements())) - return 0; + return nullptr; Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(), @@ -1356,7 +1360,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(ArrTy->getNumElements())) - return 0; + return nullptr; Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); @@ -1366,17 +1370,17 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, StructType *STy = dyn_cast(Ty); if (!STy) - return 0; + return nullptr; const StructLayout *SL = DL.getStructLayout(STy); uint64_t StructOffset = Offset.getZExtValue(); if (StructOffset >= SL->getSizeInBytes()) - return 0; + return nullptr; unsigned Index = SL->getElementContainingOffset(StructOffset); Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); Type *ElementTy = STy->getElementType(Index); if (Offset.uge(DL.getTypeAllocSize(ElementTy))) - return 0; // The offset points into alignment padding. + return nullptr; // The offset points into alignment padding. Indices.push_back(IRB.getInt32(Index)); return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, @@ -1402,14 +1406,14 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, // Don't consider any GEPs through an i8* as natural unless the TargetTy is // an i8. if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8)) - return 0; + return nullptr; Type *ElementTy = Ty->getElementType(); if (!ElementTy->isSized()) - return 0; // We can't GEP through an unsized element. + return nullptr; // We can't GEP through an unsized element. APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); if (ElementSize == 0) - return 0; // Zero-length arrays can't help us build a natural GEP. + return nullptr; // Zero-length arrays can't help us build a natural GEP. APInt NumSkippedElements = Offset.sdiv(ElementSize); Offset -= NumSkippedElements * ElementSize; @@ -1445,11 +1449,11 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, // We may end up computing an offset pointer that has the wrong type. If we // never are able to compute one directly that has the correct type, we'll // fall back to it, so keep it around here. - Value *OffsetPtr = 0; + Value *OffsetPtr = nullptr; // Remember any i8 pointer we come across to re-use if we need to do a raw // byte offset. - Value *Int8Ptr = 0; + Value *Int8Ptr = nullptr; APInt Int8PtrOffset(Offset.getBitWidth(), 0); Type *TargetTy = PointerTy->getPointerElementType(); @@ -2043,14 +2047,14 @@ public: NewAllocaBeginOffset(NewAllocaBeginOffset), NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAI.getAllocatedType()), - VecTy(IsVectorPromotable ? cast(NewAllocaTy) : 0), - ElementTy(VecTy ? VecTy->getElementType() : 0), + VecTy(IsVectorPromotable ? cast(NewAllocaTy) : nullptr), + ElementTy(VecTy ? VecTy->getElementType() : nullptr), ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), IntTy(IsIntegerPromotable ? 
Type::getIntNTy( NewAI.getContext(), DL.getTypeSizeInBits(NewAI.getAllocatedType())) - : 0), + : nullptr), BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers), IRB(NewAI.getContext(), ConstantFolder()) { @@ -2144,7 +2148,7 @@ private: /// /// You can optionally pass a type to this routine and if that type's ABI /// alignment is itself suitable, this will return zero. - unsigned getSliceAlign(Type *Ty = 0) { + unsigned getSliceAlign(Type *Ty = nullptr) { unsigned NewAIAlign = NewAI.getAlignment(); if (!NewAIAlign) NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); @@ -2594,7 +2598,7 @@ private: unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0; unsigned NumElements = EndIndex - BeginIndex; IntegerType *SubIntTy - = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; + = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : nullptr; // Reset the other pointer type to match the register type we're going to // use, but using the address space of the original other pointer. @@ -2992,22 +2996,22 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, return stripAggregateTypeWrapping(DL, Ty); if (Offset > DL.getTypeAllocSize(Ty) || (DL.getTypeAllocSize(Ty) - Offset) < Size) - return 0; + return nullptr; if (SequentialType *SeqTy = dyn_cast(Ty)) { // We can't partition pointers... if (SeqTy->isPointerTy()) - return 0; + return nullptr; Type *ElementTy = SeqTy->getElementType(); uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); uint64_t NumSkippedElements = Offset / ElementSize; if (ArrayType *ArrTy = dyn_cast(SeqTy)) { if (NumSkippedElements >= ArrTy->getNumElements()) - return 0; + return nullptr; } else if (VectorType *VecTy = dyn_cast(SeqTy)) { if (NumSkippedElements >= VecTy->getNumElements()) - return 0; + return nullptr; } Offset -= NumSkippedElements * ElementSize; @@ -3015,7 +3019,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, if (Offset > 0 || Size < ElementSize) { // Bail if the partition ends in a different array element. if ((Offset + Size) > ElementSize) - return 0; + return nullptr; // Recurse through the element type trying to peel off offset bytes. return getTypePartition(DL, ElementTy, Offset, Size); } @@ -3026,20 +3030,20 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, assert(Size > ElementSize); uint64_t NumElements = Size / ElementSize; if (NumElements * ElementSize != Size) - return 0; + return nullptr; return ArrayType::get(ElementTy, NumElements); } StructType *STy = dyn_cast(Ty); if (!STy) - return 0; + return nullptr; const StructLayout *SL = DL.getStructLayout(STy); if (Offset >= SL->getSizeInBytes()) - return 0; + return nullptr; uint64_t EndOffset = Offset + Size; if (EndOffset > SL->getSizeInBytes()) - return 0; + return nullptr; unsigned Index = SL->getElementContainingOffset(Offset); Offset -= SL->getElementOffset(Index); @@ -3047,12 +3051,12 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, Type *ElementTy = STy->getElementType(Index); uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); if (Offset >= ElementSize) - return 0; // The offset points into alignment padding. + return nullptr; // The offset points into alignment padding. // See if any partition must be contained by the element. 
if (Offset > 0 || Size < ElementSize) { if ((Offset + Size) > ElementSize) - return 0; + return nullptr; return getTypePartition(DL, ElementTy, Offset, Size); } assert(Offset == 0); @@ -3065,14 +3069,14 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, if (EndOffset < SL->getSizeInBytes()) { unsigned EndIndex = SL->getElementContainingOffset(EndOffset); if (Index == EndIndex) - return 0; // Within a single element and its padding. + return nullptr; // Within a single element and its padding. // Don't try to form "natural" types if the elements don't line up with the // expected size. // FIXME: We could potentially recurse down through the last element in the // sub-struct to find a natural end point. if (SL->getElementOffset(EndIndex) != EndOffset) - return 0; + return nullptr; assert(Index < EndIndex); EE = STy->element_begin() + EndIndex; @@ -3083,7 +3087,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, STy->isPacked()); const StructLayout *SubSL = DL.getStructLayout(SubTy); if (Size != SubSL->getSizeInBytes()) - return 0; // The sub-struct doesn't have quite the size needed. + return nullptr; // The sub-struct doesn't have quite the size needed. return SubTy; } @@ -3108,7 +3112,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. - Type *SliceTy = 0; + Type *SliceTy = nullptr; if (Type *CommonUseTy = findCommonType(B, E, EndOffset)) if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize) SliceTy = CommonUseTy; @@ -3155,7 +3159,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, // the alloca's alignment unconstrained. if (Alignment <= DL->getABITypeAlignment(SliceTy)) Alignment = 0; - NewAI = new AllocaInst(SliceTy, 0, Alignment, + NewAI = new AllocaInst(SliceTy, nullptr, Alignment, AI.getName() + ".sroa." + Twine(B - S.begin()), &AI); ++NumNewAllocas; } @@ -3494,7 +3498,7 @@ void SROA::deleteDeadInstructions(SmallPtrSet &DeletedAllocas) { for (Use &Operand : I->operands()) if (Instruction *U = dyn_cast(Operand)) { // Zero out the operand and see if it becomes trivially dead. - Operand = 0; + Operand = nullptr; if (isInstructionTriviallyDead(U)) DeadInsts.insert(U); } @@ -3612,7 +3616,7 @@ bool SROA::runOnFunction(Function &F) { DL = &DLP->getDataLayout(); DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable(); - DT = DTWP ? &DTWP->getDomTree() : 0; + DT = DTWP ? &DTWP->getDomTree() : nullptr; BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp index 20d6daa..8e557aa 100644 --- a/lib/Transforms/Scalar/SampleProfile.cpp +++ b/lib/Transforms/Scalar/SampleProfile.cpp @@ -22,8 +22,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "sample-profile" - #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" @@ -54,6 +52,8 @@ using namespace llvm; +#define DEBUG_TYPE "sample-profile" + // Command line option to specify the file to read samples from. This is // mainly used for debugging. 
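A pattern repeated in every file this patch touches (Reg2Mem, SCCP, SROA, SampleProfile here, Scalarizer, SimplifyCFG, and Sink below): the DEBUG_TYPE definition moves from above the #include block to below it. A minimal sketch of the new layout, using the reg2mem instance shown earlier:

    #include "llvm/Transforms/Scalar.h"  // headers compile without our DEBUG_TYPE
    using namespace llvm;

    #define DEBUG_TYPE "reg2mem"         // defined only after all includes

    // DEBUG() output is still selected per pass with -debug-only=reg2mem

The likely motivation is macro hygiene: with the define placed after the includes, no header can ever observe, or collide with, a translation unit's debug tag.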
static cl::opt SampleProfileFile( @@ -120,8 +120,8 @@ typedef DenseMap> BlockEdgeMap; class SampleFunctionProfile { public: SampleFunctionProfile() - : TotalSamples(0), TotalHeadSamples(0), HeaderLineno(0), DT(0), PDT(0), - LI(0), Ctx(0) {} + : TotalSamples(0), TotalHeadSamples(0), HeaderLineno(0), DT(nullptr), + PDT(nullptr), LI(nullptr), Ctx(nullptr) {} unsigned getFunctionLoc(Function &F); bool emitAnnotations(Function &F, DominatorTree *DomTree, @@ -315,7 +315,7 @@ protected: /// \brief Name of the profile file to load. StringRef Filename; - /// \brief Flag indicating whether the profile input loaded succesfully. + /// \brief Flag indicating whether the profile input loaded successfully. bool ProfileIsValid; }; } diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index e950eba..f8f828c 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -64,6 +64,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeStructurizeCFGPass(Registry); initializeSinkingPass(Registry); initializeTailCallElimPass(Registry); + initializeSeparateConstOffsetFromGEPPass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { @@ -181,6 +182,7 @@ void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) { void LLVMAddVerifierPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createVerifierPass()); + // FIXME: should this also add createDebugInfoVerifierPass()? } void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) { diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index e7b5ab2..58192fc 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -19,7 +19,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "scalarrepl" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" @@ -52,6 +51,8 @@ #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; +#define DEBUG_TYPE "scalarrepl" + STATISTIC(NumReplaced, "Number of allocas broken up"); STATISTIC(NumPromoted, "Number of allocas promoted"); STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion"); @@ -304,7 +305,7 @@ public: explicit ConvertToScalarInfo(unsigned Size, const DataLayout &DL, unsigned SLT) : AllocaSize(Size), DL(DL), ScalarLoadThreshold(SLT), IsNotTrivial(false), - ScalarKind(Unknown), VectorTy(0), HadNonMemTransferAccess(false), + ScalarKind(Unknown), VectorTy(nullptr), HadNonMemTransferAccess(false), HadDynamicAccess(false) { } AllocaInst *TryConvert(AllocaInst *AI); @@ -332,8 +333,8 @@ private: AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // If we can't convert this scalar, or if mem2reg can trivially do it, bail // out. - if (!CanConvertToScalar(AI, 0, 0) || !IsNotTrivial) - return 0; + if (!CanConvertToScalar(AI, 0, nullptr) || !IsNotTrivial) + return nullptr; // If an alloca has only memset / memcpy uses, it may still have an Unknown // ScalarKind. Treat it as an Integer below. @@ -361,23 +362,24 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // Do not convert to scalar integer if the alloca size exceeds the // scalar load threshold. 
if (BitWidth > ScalarLoadThreshold) - return 0; + return nullptr; if ((ScalarKind == ImplicitVector || ScalarKind == Integer) && !HadNonMemTransferAccess && !DL.fitsInLegalInteger(BitWidth)) - return 0; + return nullptr; // Dynamic accesses on integers aren't yet supported. They need us to shift // by a dynamic amount which could be difficult to work out as we might not // know whether to use a left or right shift. if (ScalarKind == Integer && HadDynamicAccess) - return 0; + return nullptr; DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n"); // Create and insert the integer alloca. NewTy = IntegerType::get(AI->getContext(), BitWidth); } - AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin()); - ConvertUsesToScalar(AI, NewAI, 0, 0); + AllocaInst *NewAI = new AllocaInst(NewTy, nullptr, "", + AI->getParent()->begin()); + ConvertUsesToScalar(AI, NewAI, 0, nullptr); return NewAI; } @@ -508,7 +510,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, // Compute the offset that this GEP adds to the pointer. SmallVector Indices(GEP->op_begin()+1, GEP->op_end()); - Value *GEPNonConstantIdx = 0; + Value *GEPNonConstantIdx = nullptr; if (!GEP->hasAllConstantIndices()) { if (!isa(PtrTy->getElementType())) return false; @@ -564,7 +566,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, if (NonConstantIdx) return false; ConstantInt *Len = dyn_cast(MTI->getLength()); - if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0) + if (!Len || Len->getZExtValue() != AllocaSize || Offset != 0) return false; IsNotTrivial = true; // Can't be mem2reg'd. @@ -608,7 +610,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, if (GetElementPtrInst *GEP = dyn_cast(User)) { // Compute the offset that this GEP adds to the pointer. 
SmallVector Indices(GEP->op_begin()+1, GEP->op_end()); - Value* GEPNonConstantIdx = 0; + Value* GEPNonConstantIdx = nullptr; if (!GEP->hasAllConstantIndices()) { assert(!NonConstantIdx && "Dynamic GEP reading from dynamic GEP unsupported"); @@ -671,7 +673,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); Value *New = ConvertScalar_InsertValue( ConstantInt::get(User->getContext(), APVal), - Old, Offset, 0, Builder); + Old, Offset, nullptr, Builder); Builder.CreateStore(New, NewAI); // If the load we just inserted is now dead, then the memset overwrote @@ -809,7 +811,7 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), Offset+Layout.getElementOffsetInBits(i), - 0, Builder); + nullptr, Builder); Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; @@ -822,7 +824,8 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, Value *Res = UndefValue::get(AT); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), - Offset+i*EltSize, 0, Builder); + Offset+i*EltSize, nullptr, + Builder); Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; @@ -938,7 +941,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+Layout.getElementOffsetInBits(i), - 0, Builder); + nullptr, Builder); } return Old; } @@ -949,7 +952,8 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType()); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); - Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, 0, Builder); + Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, nullptr, + Builder); } return Old; } @@ -1024,7 +1028,7 @@ bool SROA::runOnFunction(Function &F) { return false; DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; bool Changed = performPromotion(F); @@ -1054,7 +1058,7 @@ class AllocaPromoter : public LoadAndStorePromoter { public: AllocaPromoter(const SmallVectorImpl &Insts, SSAUpdater &S, DIBuilder *DB) - : LoadAndStorePromoter(Insts, S), AI(0), DIB(DB) {} + : LoadAndStorePromoter(Insts, S), AI(nullptr), DIB(DB) {} void run(AllocaInst *AI, const SmallVectorImpl &Insts) { // Remember which alloca we're promoting (for isInstInList). @@ -1100,7 +1104,7 @@ public: for (SmallVectorImpl::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; - Value *Arg = NULL; + Value *Arg = nullptr; if (StoreInst *SI = dyn_cast(Inst)) { // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. @@ -1143,7 +1147,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) { for (User *U : SI->users()) { LoadInst *LI = dyn_cast(U); - if (LI == 0 || !LI->isSimple()) return false; + if (!LI || !LI->isSimple()) return false; // Both operands to the select need to be dereferencable, either absolutely // (e.g. allocas) or at this point because we can see other accesses to it. 
@@ -1183,7 +1187,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) { unsigned MaxAlign = 0; for (User *U : PN->users()) { LoadInst *LI = dyn_cast(U); - if (LI == 0 || !LI->isSimple()) return false; + if (!LI || !LI->isSimple()) return false; // For now we only allow loads in the same block as the PHI. This is a // common case that happens when instcombine merges two loads through a PHI. @@ -1380,7 +1384,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); LoadInst *&Load = InsertedLoads[Pred]; - if (Load == 0) { + if (!Load) { Load = new LoadInst(PN->getIncomingValue(i), PN->getName() + "." + Pred->getName(), Pred->getTerminator()); @@ -1400,7 +1404,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) { bool SROA::performPromotion(Function &F) { std::vector Allocas; - DominatorTree *DT = 0; + DominatorTree *DT = nullptr; if (HasDomTree) DT = &getAnalysis().getDomTree(); @@ -1537,7 +1541,7 @@ void SROA::DoScalarReplacement(AllocaInst *AI, if (StructType *ST = dyn_cast(AI->getAllocatedType())) { ElementAllocas.reserve(ST->getNumContainedTypes()); for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) { - AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0, + AllocaInst *NA = new AllocaInst(ST->getContainedType(i), nullptr, AI->getAlignment(), AI->getName() + "." + Twine(i), AI); ElementAllocas.push_back(NA); @@ -1548,7 +1552,7 @@ void SROA::DoScalarReplacement(AllocaInst *AI, ElementAllocas.reserve(AT->getNumElements()); Type *ElTy = AT->getElementType(); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { - AllocaInst *NA = new AllocaInst(ElTy, 0, AI->getAlignment(), + AllocaInst *NA = new AllocaInst(ElTy, nullptr, AI->getAlignment(), AI->getName() + "." + Twine(i), AI); ElementAllocas.push_back(NA); WorkList.push_back(NA); // Add to worklist for recursive processing @@ -1577,7 +1581,7 @@ void SROA::DeleteDeadInstructions() { // Zero out the operand and see if it becomes trivially dead. // (But, don't add allocas to the dead instruction list -- they are // already on the worklist and will be deleted separately.) - *OI = 0; + *OI = nullptr; if (isInstructionTriviallyDead(U) && !isa(U)) DeadInsts.push_back(U); } @@ -1604,12 +1608,10 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, isSafeForScalarRepl(GEPI, GEPOffset, Info); } else if (MemIntrinsic *MI = dyn_cast(User)) { ConstantInt *Length = dyn_cast(MI->getLength()); - if (Length == 0) - return MarkUnsafe(Info, User); - if (Length->isNegative()) + if (!Length || Length->isNegative()) return MarkUnsafe(Info, User); - isSafeMemAccess(Offset, Length->getZExtValue(), 0, + isSafeMemAccess(Offset, Length->getZExtValue(), nullptr, U.getOperandNo() == 0, Info, MI, true /*AllowWholeAccess*/); } else if (LoadInst *LI = dyn_cast(User)) { @@ -1744,12 +1746,12 @@ static bool isHomogeneousAggregate(Type *T, unsigned &NumElts, Type *&EltTy) { if (ArrayType *AT = dyn_cast(T)) { NumElts = AT->getNumElements(); - EltTy = (NumElts == 0 ? 0 : AT->getElementType()); + EltTy = (NumElts == 0 ? nullptr : AT->getElementType()); return true; } if (StructType *ST = dyn_cast(T)) { NumElts = ST->getNumContainedTypes(); - EltTy = (NumElts == 0 ? 0 : ST->getContainedType(0)); + EltTy = (NumElts == 0 ? 
nullptr : ST->getContainedType(0)); for (unsigned n = 1; n < NumElts; ++n) { if (ST->getContainedType(n) != EltTy) return false; @@ -2038,7 +2040,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, // In this case, it must be the last GEP operand which is dynamic so keep that // aside until we've found the constant GEP offset then add it back in at the // end. - Value* NonConstantIdx = 0; + Value* NonConstantIdx = nullptr; if (!GEPI->hasAllConstantIndices()) NonConstantIdx = Indices.pop_back_val(); Offset += DL->getIndexedOffset(GEPI->getPointerOperandType(), Indices); @@ -2108,7 +2110,8 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, if (NewOffset) { // Splice the first element and index 'NewOffset' bytes in. SROA will // split the alloca again later. - Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy()); + unsigned AS = AI->getType()->getAddressSpace(); + Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy(AS)); V = Builder.CreateGEP(V, Builder.getInt64(NewOffset)); IdxTy = NewElts[Idx]->getAllocatedType(); @@ -2155,7 +2158,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // appropriate type. The "Other" pointer is the pointer that goes to memory // that doesn't have anything to do with the alloca that we are promoting. For // memset, this Value* stays null. - Value *OtherPtr = 0; + Value *OtherPtr = nullptr; unsigned MemAlignment = MI->getAlignment(); if (MemTransferInst *MTI = dyn_cast(MI)) { // memmove/memcopy if (Inst == MTI->getRawDest()) @@ -2207,7 +2210,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // If this is a memcpy/memmove, emit a GEP of the other element address. - Value *OtherElt = 0; + Value *OtherElt = nullptr; unsigned OtherEltAlign = MemAlignment; if (OtherPtr) { @@ -2449,7 +2452,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, // There are two forms here: AI could be an array or struct. Both cases // have different ways to compute the element offset. - const StructLayout *Layout = 0; + const StructLayout *Layout = nullptr; uint64_t ArrayEltBitOffset = 0; if (StructType *EltSTy = dyn_cast(AllocaEltTy)) { Layout = DL->getStructLayout(EltSTy); diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp index 006375c..7a73f11 100644 --- a/lib/Transforms/Scalar/Scalarizer.cpp +++ b/lib/Transforms/Scalar/Scalarizer.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "scalarizer" #include "llvm/ADT/STLExtras.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" @@ -25,6 +24,8 @@ using namespace llvm; +#define DEBUG_TYPE "scalarizer" + namespace { // Used to store the scattered form of a vector. typedef SmallVector ValueVector; @@ -48,7 +49,7 @@ public: // insert them before BBI in BB. If Cache is nonnull, use it to cache // the results. Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, - ValueVector *cachePtr = 0); + ValueVector *cachePtr = nullptr); // Return component I, creating a new Value for it if necessary. Value *operator[](unsigned I); @@ -101,7 +102,7 @@ struct BinarySplitter { // Information about a load or store that we're scalarizing. 
struct VectorLayout { - VectorLayout() : VecTy(0), ElemTy(0), VecAlign(0), ElemSize(0) {} + VectorLayout() : VecTy(nullptr), ElemTy(nullptr), VecAlign(0), ElemSize(0) {} // Return the alignment of element I. uint64_t getElemAlign(unsigned I) { @@ -186,9 +187,9 @@ Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, Ty = PtrTy->getElementType(); Size = Ty->getVectorNumElements(); if (!CachePtr) - Tmp.resize(Size, 0); + Tmp.resize(Size, nullptr); else if (CachePtr->empty()) - CachePtr->resize(Size, 0); + CachePtr->resize(Size, nullptr); else assert(Size == CachePtr->size() && "Inconsistent vector sizes"); } @@ -241,7 +242,7 @@ bool Scalarizer::doInitialization(Module &M) { bool Scalarizer::runOnFunction(Function &F) { DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) { BasicBlock *BB = BBI; for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) { diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp new file mode 100644 index 0000000..b8529e1 --- /dev/null +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -0,0 +1,623 @@ +//===-- SeparateConstOffsetFromGEP.cpp - ------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Loop unrolling may create many similar GEPs for array accesses. +// e.g., a 2-level loop +// +// float a[32][32]; // global variable +// +// for (int i = 0; i < 2; ++i) { +// for (int j = 0; j < 2; ++j) { +// ... +// ... = a[x + i][y + j]; +// ... +// } +// } +// +// will probably be unrolled to: +// +// gep %a, 0, %x, %y; load +// gep %a, 0, %x, %y + 1; load +// gep %a, 0, %x + 1, %y; load +// gep %a, 0, %x + 1, %y + 1; load +// +// LLVM's GVN does not use partial redundancy elimination yet, and is thus +// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs +// significant slowdown in targets with limited addressing modes. For instance, +// because the PTX target does not support the reg+reg addressing mode, the +// NVPTX backend emits PTX code that literally computes the pointer address of +// each GEP, wasting tons of registers. It emits the following PTX for the +// first load and similar PTX for other loads. +// +// mov.u32 %r1, %x; +// mov.u32 %r2, %y; +// mul.wide.u32 %rl2, %r1, 128; +// mov.u64 %rl3, a; +// add.s64 %rl4, %rl3, %rl2; +// mul.wide.u32 %rl5, %r2, 4; +// add.s64 %rl6, %rl4, %rl5; +// ld.global.f32 %f1, [%rl6]; +// +// To reduce the register pressure, the optimization implemented in this file +// merges the common part of a group of GEPs, so we can compute each pointer +// address by adding a simple offset to the common part, saving many registers. +// +// It works by splitting each GEP into a variadic base and a constant offset. +// The variadic base can be computed once and reused by multiple GEPs, and the +// constant offsets can be nicely folded into the reg+immediate addressing mode +// (supported by most targets) without using any extra register. 
+// +// For instance, we transform the four GEPs and four loads in the above example +// into: +// +// base = gep a, 0, x, y +// load base +// load base + 1 * sizeof(float) +// load base + 32 * sizeof(float) +// load base + 33 * sizeof(float) +// +// Given the transformed IR, a backend that supports the reg+immediate +// addressing mode can easily fold the pointer arithmetic into the loads. For +// example, the NVPTX backend can easily fold the pointer arithmetic into the +// ld.global.f32 instructions, and the resultant PTX uses far fewer registers. +// +// mov.u32 %r1, %tid.x; +// mov.u32 %r2, %tid.y; +// mul.wide.u32 %rl2, %r1, 128; +// mov.u64 %rl3, a; +// add.s64 %rl4, %rl3, %rl2; +// mul.wide.u32 %rl5, %r2, 4; +// add.s64 %rl6, %rl4, %rl5; +// ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX +// ld.global.f32 %f2, [%rl6+4]; // much better +// ld.global.f32 %f3, [%rl6+128]; // much better +// ld.global.f32 %f4, [%rl6+132]; // much better +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; + +static cl::opt<bool> DisableSeparateConstOffsetFromGEP( + "disable-separate-const-offset-from-gep", cl::init(false), + cl::desc("Do not separate the constant offset from a GEP instruction"), + cl::Hidden); + +namespace { + +/// \brief A helper class for separating a constant offset from a GEP index. +/// +/// In real programs, a GEP index may be more complicated than a simple addition +/// of something and a constant integer which can be trivially split. For +/// example, to split ((a << 3) | 5) + b, we need to search deeper for the +/// constant offset, so that we can separate the index into (a << 3) + b and 5. +/// +/// Therefore, this class looks into the expression that computes a given GEP +/// index, and tries to find a constant integer that can be hoisted to the +/// outermost level of the expression as an addition. Not every constant in an +/// expression can jump out; e.g., we cannot transform (b * (a + 5)) to (b * a + +/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), though in that case +/// -instcombine has probably already optimized (3 * (a + 5)) to (3 * a + 15). +class ConstantOffsetExtractor { + public: + /// Extracts a constant offset from the given GEP index. It outputs the + /// numeric value of the extracted constant offset (0 if it fails), and a + /// new index representing the remainder (equal to the original index minus + /// the constant offset). + /// \p Idx The given GEP index + /// \p NewIdx The new index to replace + /// \p DL The data layout of the module + /// \p IP Calculating the new index requires new instructions. IP indicates + /// where to insert them (typically right before the GEP). + static int64_t Extract(Value *Idx, Value *&NewIdx, const DataLayout *DL, + Instruction *IP); + /// Looks for a constant offset without extracting it. The meaning of the + /// arguments and the return value are the same as Extract.
+ static int64_t Find(Value *Idx, const DataLayout *DL); + + private: + ConstantOffsetExtractor(const DataLayout *Layout, Instruction *InsertionPt) + : DL(Layout), IP(InsertionPt) {} + /// Searches the expression that computes V for a constant offset. If the + /// search succeeds, updates UserChain as a path from V to the constant + /// offset. + int64_t find(Value *V); + /// A helper function to look into both operands of a binary operator U. + /// \p IsSub Whether U is a sub operator. If so, we need to negate the + /// constant offset at some point. + int64_t findInEitherOperand(User *U, bool IsSub); + /// After finding the constant offset and how it is reached from the GEP + /// index, we build a new index which is a clone of the old one except the + /// constant offset is removed. For example, given (a + (b + 5)) and knowing + /// the constant offset is 5, this function returns (a + b). + /// + /// We cannot simply change the constant to zero because the expression that + /// computes the index or its intermediate result may be used by others. + Value *rebuildWithoutConstantOffset(); + // A helper function for rebuildWithoutConstantOffset that rebuilds the direct + // user (U) of the constant offset (C). + Value *rebuildLeafWithoutConstantOffset(User *U, Value *C); + /// Returns a clone of U with the first occurrence of From replaced by To. + Value *cloneAndReplace(User *U, Value *From, Value *To); + + /// Returns true if LHS and RHS have no bits in common, i.e., LHS & RHS == 0. + bool NoCommonBits(Value *LHS, Value *RHS) const; + /// Computes which bits are known to be one or zero. + /// \p KnownOne Mask of all bits that are known to be one. + /// \p KnownZero Mask of all bits that are known to be zero. + void ComputeKnownBits(Value *V, APInt &KnownOne, APInt &KnownZero) const; + /// Finds the first use of Used in U. Returns -1 if not found. + static unsigned FindFirstUse(User *U, Value *Used); + /// Returns whether OPC (sext or zext) can be distributed to the operands of + /// BO. e.g., sext can be distributed to the operands of an "add nsw" because + /// sext (add nsw a, b) == add nsw (sext a), (sext b). + static bool Distributable(unsigned OPC, BinaryOperator *BO); + + /// The path from the constant offset to the old GEP index. For example, if + /// the GEP index is "a * b + (c + 5)", then after running find, UserChain[0] + /// will be the constant 5, UserChain[1] will be the subexpression "c + 5", + /// and UserChain[2] will be the entire expression "a * b + (c + 5)". + /// + /// This path helps rebuildWithoutConstantOffset rebuild the new GEP index. + SmallVector UserChain; + /// The data layout of the module. Used in ComputeKnownBits. + const DataLayout *DL; + Instruction *IP; /// Insertion position of cloned instructions. +}; + +/// \brief A pass that tries to split every GEP in the function into a variadic +/// base and a constant offset. It is a FunctionPass because searching for the +/// constant offset may inspect other basic blocks. +class SeparateConstOffsetFromGEP : public FunctionPass { + public: + static char ID; + SeparateConstOffsetFromGEP() : FunctionPass(ID) { + initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DataLayoutPass>(); + AU.addRequired<TargetTransformInfo>(); + } + bool runOnFunction(Function &F) override; + + private: + /// Tries to split the given GEP into a variadic base and a constant offset, + /// and returns true if the splitting succeeds.
+ bool splitGEP(GetElementPtrInst *GEP); + /// Finds the constant offset within each index, and accumulates them. This + /// function only inspects the GEP without changing it. The output + /// NeedsExtraction indicates whether we can extract a non-zero constant + /// offset from any index. + int64_t accumulateByteOffset(GetElementPtrInst *GEP, const DataLayout *DL, + bool &NeedsExtraction); +}; +} // anonymous namespace + +char SeparateConstOffsetFromGEP::ID = 0; +INITIALIZE_PASS_BEGIN( + SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", + "Split GEPs to a variadic base and a constant offset for better CSE", false, + false) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(DataLayoutPass) +INITIALIZE_PASS_END( + SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", + "Split GEPs to a variadic base and a constant offset for better CSE", false, + false) + +FunctionPass *llvm::createSeparateConstOffsetFromGEPPass() { + return new SeparateConstOffsetFromGEP(); +} + +bool ConstantOffsetExtractor::Distributable(unsigned OPC, BinaryOperator *BO) { + assert(OPC == Instruction::SExt || OPC == Instruction::ZExt); + + // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B) + // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B) + if (BO->getOpcode() == Instruction::Add || + BO->getOpcode() == Instruction::Sub) { + return (OPC == Instruction::SExt && BO->hasNoSignedWrap()) || + (OPC == Instruction::ZExt && BO->hasNoUnsignedWrap()); + } + + // sext/zext (and/or/xor A, B) == and/or/xor (sext/zext A), (sext/zext B) + // -instcombine also leverages this invariant to do the reverse + // transformation to reduce integer casts. + return BO->getOpcode() == Instruction::And || + BO->getOpcode() == Instruction::Or || + BO->getOpcode() == Instruction::Xor; +} + +int64_t ConstantOffsetExtractor::findInEitherOperand(User *U, bool IsSub) { + assert(U->getNumOperands() == 2); + int64_t ConstantOffset = find(U->getOperand(0)); + // If we found a constant offset in the left operand, stop and return that. + // This shortcut might cause us to miss opportunities to combine the + // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9. + // However, such cases are probably already handled by -instcombine, + // given this pass runs after the standard optimizations. + if (ConstantOffset != 0) return ConstantOffset; + ConstantOffset = find(U->getOperand(1)); + // If U is a sub operator, negate the constant offset found in the right + // operand. + return IsSub ? -ConstantOffset : ConstantOffset; +} + +int64_t ConstantOffsetExtractor::find(Value *V) { + // TODO(jingyue): We can even trace into integer/pointer casts, such as + // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only + // integers because it gives good enough results for our benchmarks. + assert(V->getType()->isIntegerTy()); + + User *U = dyn_cast<User>(V); + // We cannot do much with Values that are not a User, such as BasicBlock and + // MDNode. + if (U == nullptr) return 0; + + int64_t ConstantOffset = 0; + if (ConstantInt *CI = dyn_cast<ConstantInt>(U)) { + // Hooray, we found it! + ConstantOffset = CI->getSExtValue(); + } else if (Operator *O = dyn_cast<Operator>(U)) { + // The GEP index may be more complicated than a simple addition of a + // variable and a constant. Therefore, we trace into subexpressions for more + // hoisting opportunities.
+ switch (O->getOpcode()) { + case Instruction::Add: { + ConstantOffset = findInEitherOperand(U, false); + break; + } + case Instruction::Sub: { + ConstantOffset = findInEitherOperand(U, true); + break; + } + case Instruction::Or: { + // If LHS and RHS don't have common bits, (LHS | RHS) is equivalent to + // (LHS + RHS). + if (NoCommonBits(U->getOperand(0), U->getOperand(1))) + ConstantOffset = findInEitherOperand(U, false); + break; + } + case Instruction::SExt: + case Instruction::ZExt: { + // We trace into sext/zext if the operator can be distributed to its + // operand. e.g., we can trace into "sext (add nsw a, 5)" and + // extract constant 5, because + // sext (add nsw a, 5) == add nsw (sext a), 5 + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U->getOperand(0))) { + if (Distributable(O->getOpcode(), BO)) + ConstantOffset = find(U->getOperand(0)); + } + break; + } + } + } + // If we found a non-zero constant offset, add it to the path for future + // transformation (rebuildWithoutConstantOffset). Zero is a valid constant + // offset, but doesn't help this optimization. + if (ConstantOffset != 0) + UserChain.push_back(U); + return ConstantOffset; +} + +unsigned ConstantOffsetExtractor::FindFirstUse(User *U, Value *Used) { + for (unsigned I = 0, E = U->getNumOperands(); I < E; ++I) { + if (U->getOperand(I) == Used) + return I; + } + return -1; +} + +Value *ConstantOffsetExtractor::cloneAndReplace(User *U, Value *From, + Value *To) { + // Finds in U the first use of From. It is safe to ignore future occurrences + // of From, because findInEitherOperand similarly stops searching the right + // operand when the first operand has a non-zero constant offset. + unsigned OpNo = FindFirstUse(U, From); + assert(OpNo != (unsigned)-1 && "UserChain wasn't built correctly"); + + // ConstantOffsetExtractor::find only follows Operators (i.e., Instructions + // and ConstantExprs). Therefore, U is either an Instruction or a + // ConstantExpr. + if (Instruction *I = dyn_cast<Instruction>(U)) { + Instruction *Clone = I->clone(); + Clone->setOperand(OpNo, To); + Clone->insertBefore(IP); + return Clone; + } + // cast<Constant>(To) is safe because a ConstantExpr only uses Constants. + return cast<ConstantExpr>(U) + ->getWithOperandReplaced(OpNo, cast<Constant>(To)); +} + +Value *ConstantOffsetExtractor::rebuildLeafWithoutConstantOffset(User *U, + Value *C) { + assert(U->getNumOperands() <= 2 && + "We didn't trace into any operator with more than 2 operands"); + // If U has only one operand which is the constant offset, removing the + // constant offset leaves U as a null value. + if (U->getNumOperands() == 1) + return Constant::getNullValue(U->getType()); + + // U->getNumOperands() == 2 + unsigned OpNo = FindFirstUse(U, C); // U->getOperand(OpNo) == C + assert(OpNo < 2 && "UserChain wasn't built correctly"); + Value *TheOther = U->getOperand(1 - OpNo); // The other operand of U + // If U = C - X, removing C makes U = -X; otherwise U will simply be X. + if (!isa<SubOperator>(U) || OpNo == 1) + return TheOther; + if (isa<ConstantExpr>(U)) + return ConstantExpr::getNeg(cast<Constant>(TheOther)); + return BinaryOperator::CreateNeg(TheOther, "", IP); +} + +Value *ConstantOffsetExtractor::rebuildWithoutConstantOffset() { + assert(UserChain.size() > 0 && "you at least found a constant, right?"); + // Start with the constant and go up through UserChain, each time building a + // clone of the subexpression but with the constant removed. + // e.g., to build a clone of (a + (b + (c + 5))) but with the 5 removed, we + // first build c, then (b + c), and finally (a + (b + c)).
+ // + // Fast path: if the GEP index is a constant, simply returns 0. + if (UserChain.size() == 1) + return ConstantInt::get(UserChain[0]->getType(), 0); + + Value *Remainder = + rebuildLeafWithoutConstantOffset(UserChain[1], UserChain[0]); + for (size_t I = 2; I < UserChain.size(); ++I) + Remainder = cloneAndReplace(UserChain[I], UserChain[I - 1], Remainder); + return Remainder; +} + +int64_t ConstantOffsetExtractor::Extract(Value *Idx, Value *&NewIdx, + const DataLayout *DL, + Instruction *IP) { + ConstantOffsetExtractor Extractor(DL, IP); + // Find a non-zero constant offset first. + int64_t ConstantOffset = Extractor.find(Idx); + if (ConstantOffset == 0) + return 0; + // Then rebuild a new index with the constant removed. + NewIdx = Extractor.rebuildWithoutConstantOffset(); + return ConstantOffset; +} + +int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL) { + return ConstantOffsetExtractor(DL, nullptr).find(Idx); +} + +void ConstantOffsetExtractor::ComputeKnownBits(Value *V, APInt &KnownOne, + APInt &KnownZero) const { + IntegerType *IT = cast(V->getType()); + KnownOne = APInt(IT->getBitWidth(), 0); + KnownZero = APInt(IT->getBitWidth(), 0); + llvm::computeKnownBits(V, KnownZero, KnownOne, DL, 0); +} + +bool ConstantOffsetExtractor::NoCommonBits(Value *LHS, Value *RHS) const { + assert(LHS->getType() == RHS->getType() && + "LHS and RHS should have the same type"); + APInt LHSKnownOne, LHSKnownZero, RHSKnownOne, RHSKnownZero; + ComputeKnownBits(LHS, LHSKnownOne, LHSKnownZero); + ComputeKnownBits(RHS, RHSKnownOne, RHSKnownZero); + return (LHSKnownZero | RHSKnownZero).isAllOnesValue(); +} + +int64_t SeparateConstOffsetFromGEP::accumulateByteOffset( + GetElementPtrInst *GEP, const DataLayout *DL, bool &NeedsExtraction) { + NeedsExtraction = false; + int64_t AccumulativeByteOffset = 0; + gep_type_iterator GTI = gep_type_begin(*GEP); + for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { + if (isa(*GTI)) { + // Tries to extract a constant offset from this GEP index. + int64_t ConstantOffset = + ConstantOffsetExtractor::Find(GEP->getOperand(I), DL); + if (ConstantOffset != 0) { + NeedsExtraction = true; + // A GEP may have multiple indices. We accumulate the extracted + // constant offset to a byte offset, and later offset the remainder of + // the original GEP with this byte offset. + AccumulativeByteOffset += + ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType()); + } + } + } + return AccumulativeByteOffset; +} + +bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { + // Skip vector GEPs. + if (GEP->getType()->isVectorTy()) + return false; + + // The backend can already nicely handle the case where all indices are + // constant. + if (GEP->hasAllConstantIndices()) + return false; + + bool Changed = false; + + // Shortcuts integer casts. Eliminating these explicit casts can make + // subsequent optimizations more obvious: ConstantOffsetExtractor needn't + // trace into these casts. + if (GEP->isInBounds()) { + // Doing this to inbounds GEPs is safe because their indices are guaranteed + // to be non-negative and in bounds. 
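Concretely, the cast-stripping loop that follows rewrites an index expression like this hypothetical IR (typed-pointer syntax of this era)

    %i64 = sext i32 %i to i64
    %p = getelementptr inbounds [32 x float]* %a, i64 0, i64 %i64

into

    %p = getelementptr inbounds [32 x float]* %a, i64 0, i32 %i

GEP extends narrower indices to pointer width by itself, and an inbounds index is guaranteed non-negative, so dropping the explicit sext/zext cannot change the computed address, while handing ConstantOffsetExtractor a bare add/sub to look through.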
+ gep_type_iterator GTI = gep_type_begin(*GEP); + for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { + if (isa<SequentialType>(*GTI)) { + if (Operator *O = dyn_cast<Operator>(GEP->getOperand(I))) { + if (O->getOpcode() == Instruction::SExt || + O->getOpcode() == Instruction::ZExt) { + GEP->setOperand(I, O->getOperand(0)); + Changed = true; + } + } + } + } + } + + const DataLayout *DL = &getAnalysis<DataLayoutPass>().getDataLayout(); + bool NeedsExtraction; + int64_t AccumulativeByteOffset = + accumulateByteOffset(GEP, DL, NeedsExtraction); + + if (!NeedsExtraction) + return Changed; + // Before really splitting the GEP, check whether the backend supports the + // addressing mode we are about to produce. If not, this splitting probably + // won't be beneficial. + TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); + if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(), + /*BaseGV=*/nullptr, AccumulativeByteOffset, + /*HasBaseReg=*/true, /*Scale=*/0)) { + return Changed; + } + + // Remove the constant offset in each GEP index. The resultant GEP computes + // the variadic base. + gep_type_iterator GTI = gep_type_begin(*GEP); + for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { + if (isa<SequentialType>(*GTI)) { + Value *NewIdx = nullptr; + // Tries to extract a constant offset from this GEP index. + int64_t ConstantOffset = + ConstantOffsetExtractor::Extract(GEP->getOperand(I), NewIdx, DL, GEP); + if (ConstantOffset != 0) { + assert(NewIdx != nullptr && + "ConstantOffset != 0 implies NewIdx is set"); + GEP->setOperand(I, NewIdx); + // Clear the inbounds attribute because the new index may be out of + // bounds. e.g., + // + // b = add i64 a, 5 + // addr = gep inbounds float* p, i64 b + // + // is transformed to: + // + // addr2 = gep float* p, i64 a + // addr = gep float* addr2, i64 5 + // + // If a is -4, although the old index b is in bounds, the new index a is + // out of bounds. http://llvm.org/docs/LangRef.html#id181 says "if the + // inbounds keyword is not present, the offsets are added to the base + // address with silently-wrapping two's complement arithmetic". + // Therefore, the final code will be semantically equivalent. + // + // TODO(jingyue): do some range analysis to keep as many inbounds as + // possible. GEPs with inbounds are more friendly to alias analysis. + GEP->setIsInBounds(false); + Changed = true; + } + } + } + + // Offsets the base with the accumulative byte offset. + // + // %gep ; the base + // ... %gep ... + // + // => add the offset + // + // %gep2 ; clone of %gep + // %new.gep = gep %gep2, <offset> + // %gep ; will be removed + // ... %gep ... + // + // => replace all uses of %gep with %new.gep and remove %gep + // + // %gep2 ; clone of %gep + // %new.gep = gep %gep2, <offset> + // ... %new.gep ... + // + // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an + // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep): + // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the + // type of %gep. + // + // %gep2 ; clone of %gep + // %0 = bitcast %gep2 to i8* + // %uglygep = gep %0, <offset> + // %new.gep = bitcast %uglygep to <type of %gep> + // ... %new.gep ... + Instruction *NewGEP = GEP->clone(); + NewGEP->insertBefore(GEP); + + Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); + uint64_t ElementTypeSizeOfGEP = + DL->getTypeAllocSize(GEP->getType()->getElementType()); + if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) { + // Very likely. As long as %gep is naturally aligned, the byte offset we + // extracted should be a multiple of sizeof(*%gep).
+    // Per ANSI C standard, signed / unsigned = unsigned. Therefore, we
+    // cast ElementTypeSizeOfGEP to signed.
+    int64_t Index =
+        AccumulativeByteOffset / static_cast<int64_t>(ElementTypeSizeOfGEP);
+    NewGEP = GetElementPtrInst::Create(
+        NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP);
+  } else {
+    // Unlikely but possible. For example,
+    //
+    // #pragma pack(1)
+    // struct S {
+    //   int a[3];
+    //   int64 b[8];
+    // };
+    // #pragma pack()
+    //
+    // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
+    // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
+    // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
+    // sizeof(int64).
+    //
+    // Emit an uglygep in this case.
+    Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
+                                       GEP->getPointerAddressSpace());
+    NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
+    NewGEP = GetElementPtrInst::Create(
+        NewGEP, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true),
+        "uglygep", GEP);
+    if (GEP->getType() != I8PtrTy)
+      NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
+  }
+
+  GEP->replaceAllUsesWith(NewGEP);
+  GEP->eraseFromParent();
+
+  return true;
+}
+
+bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
+  if (DisableSeparateConstOffsetFromGEP)
+    return false;
+
+  bool Changed = false;
+  for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
+    for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) {
+      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) {
+        Changed |= splitGEP(GEP);
+      }
+      // No need to split GEP ConstantExprs because all their indices are
+      // constant already.
+    }
+  }
+  return Changed;
+}
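[Editor's note: the unlikely branch above is easy to check against the comment's own numbers. A small host-side C++ program (not part of the patch) reproducing the packed-struct arithmetic:

    #include <cassert>
    #include <cstdint>

    #pragma pack(1)
    struct S {
      int32_t a[3]; // 12 bytes
      int64_t b[8]; // 64 bytes; sizeof(S) == 76 when packed
    };
    #pragma pack()

    int main() {
      // Rewriting &s[i + 1].b[j + 3] as &s[i].b[j] leaves a constant byte
      // offset of sizeof(S) + 3 * sizeof(int64_t) = 76 + 24 = 100.
      int64_t AccumulativeByteOffset = sizeof(S) + 3 * sizeof(int64_t);
      assert(AccumulativeByteOffset == 100);
      // 100 is not a multiple of the element size (8), so a typed index
      // cannot express it and the pass emits an i8* "uglygep" instead.
      assert(AccumulativeByteOffset % int64_t(sizeof(int64_t)) != 0);
      return 0;
    }
]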
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index ceae5a7..5d5606b 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -21,7 +21,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "simplifycfg"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -38,6 +37,8 @@
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "simplifycfg"
+
 STATISTIC(NumSimpl, "Number of blocks simplified");
 
 namespace {
@@ -71,7 +72,7 @@ FunctionPass *llvm::createCFGSimplificationPass() {
 static bool mergeEmptyReturnBlocks(Function &F) {
   bool Changed = false;
 
-  BasicBlock *RetBlock = 0;
+  BasicBlock *RetBlock = nullptr;
 
   // Scan all the blocks in the function, looking for empty return blocks.
   for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) {
@@ -79,7 +80,7 @@ static bool mergeEmptyReturnBlocks(Function &F) {
 
     // Only look at return blocks.
     ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
-    if (Ret == 0) continue;
+    if (!Ret) continue;
 
     // Only look at the block if it is empty or the only other thing in it is a
     // single PHI node that is the operand to the return.
@@ -98,7 +99,7 @@ static bool mergeEmptyReturnBlocks(Function &F) {
     }
 
     // If this is the first returning block, remember it and keep going.
-    if (RetBlock == 0) {
+    if (!RetBlock) {
       RetBlock = &BB;
       continue;
     }
@@ -119,7 +120,7 @@ static bool mergeEmptyReturnBlocks(Function &F) {
     // If the canonical return block has no PHI node, create one now.
     PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin());
-    if (RetBlockPHI == 0) {
+    if (!RetBlockPHI) {
       Value *InVal = cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0);
       pred_iterator PB = pred_begin(RetBlock), PE = pred_end(RetBlock);
       RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(),
@@ -173,7 +174,7 @@ bool CFGSimplifyPass::runOnFunction(Function &F) {
   const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  const DataLayout *DL = DLP ? &DLP->getDataLayout() : 0;
+  const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
   bool EverChanged = removeUnreachableBlocks(F);
   EverChanged |= mergeEmptyReturnBlocks(F);
   EverChanged |= iterativelySimplifyCFG(F, TTI, DL);
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 4107374..482c33a 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "sink"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -25,6 +24,8 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "sink"
+
 STATISTIC(NumSunk, "Number of instructions sunk");
 STATISTIC(NumSinkIter, "Number of sinking iterations");
 
@@ -203,7 +204,7 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst,
     // Don't sink instructions into a loop.
     Loop *succ = LI->getLoopFor(SuccToSinkTo);
     Loop *cur = LI->getLoopFor(Inst->getParent());
-    if (succ != 0 && succ != cur)
+    if (succ != nullptr && succ != cur)
       return false;
   }
@@ -237,14 +238,14 @@ bool Sinking::SinkInstruction(Instruction *Inst,
 
   // SuccToSinkTo - This is the successor to sink this instruction to, once we
   // decide.
-  BasicBlock *SuccToSinkTo = 0;
+  BasicBlock *SuccToSinkTo = nullptr;
 
   // Instructions can only be sunk if all their uses are in blocks
   // dominated by one of the successors.
   // Look at all the postdominators and see if we can sink it in one.
   DomTreeNode *DTN = DT->getNode(Inst->getParent());
   for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end();
-      I != E && SuccToSinkTo == 0; ++I) {
+      I != E && SuccToSinkTo == nullptr; ++I) {
     BasicBlock *Candidate = (*I)->getBlock();
     if ((*I)->getIDom()->getBlock() == Inst->getParent() &&
         IsAcceptableTarget(Inst, Candidate))
@@ -254,13 +255,13 @@ bool Sinking::SinkInstruction(Instruction *Inst,
   // If no suitable postdominator was found, look at all the successors and
   // decide which one we should sink to, if any.
   for (succ_iterator I = succ_begin(Inst->getParent()),
-      E = succ_end(Inst->getParent()); I != E && SuccToSinkTo == 0; ++I) {
+      E = succ_end(Inst->getParent()); I != E && !SuccToSinkTo; ++I) {
     if (IsAcceptableTarget(Inst, *I))
       SuccToSinkTo = *I;
   }
 
   // If we couldn't find a block to sink to, ignore this instruction.
-  if (SuccToSinkTo == 0)
+  if (!SuccToSinkTo)
     return false;
 
   DEBUG(dbgs() << "Sink" << *Inst << " (";
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index 8fd2268..7b77ae1 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -7,7 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "structurizecfg"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SCCIterator.h"
@@ -21,6 +20,8 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
+#define DEBUG_TYPE "structurizecfg"
+
 namespace {
 
 // Definition of the complex types used in this pass.
@@ -64,14 +65,14 @@ public:
   /// \brief Start a new query
   NearestCommonDominator(DominatorTree *DomTree) {
     DT = DomTree;
-    Result = 0;
+    Result = nullptr;
   }
 
   /// \brief Add BB to the resulting dominator
   void addBlock(BasicBlock *BB, bool Remember = true) {
     DomTreeNode *Node = DT->getNode(BB);
 
-    if (Result == 0) {
+    if (!Result) {
       unsigned Numbering = 0;
       for (;Node;Node = Node->getIDom())
         IndexMap[Node] = ++Numbering;
@@ -279,7 +280,7 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
 void StructurizeCFG::orderNodes() {
   scc_iterator<Region *> I = scc_begin(ParentRegion);
   for (Order.clear(); !I.isAtEnd(); ++I) {
-    std::vector<RegionNode *> &Nodes = *I;
+    const std::vector<RegionNode *> &Nodes = *I;
     Order.append(Nodes.begin(), Nodes.end());
   }
 }
@@ -453,10 +454,7 @@ void StructurizeCFG::insertConditions(bool Loops) {
   Value *Default = Loops ? BoolTrue : BoolFalse;
   SSAUpdater PhiInserter;
 
-  for (BranchVector::iterator I = Conds.begin(),
-       E = Conds.end(); I != E; ++I) {
-
-    BranchInst *Term = *I;
+  for (BranchInst *Term : Conds) {
     assert(Term->isConditional());
 
     BasicBlock *Parent = Term->getParent();
@@ -472,7 +470,7 @@ void StructurizeCFG::insertConditions(bool Loops) {
     NearestCommonDominator Dominator(DT);
     Dominator.addBlock(Parent, false);
 
-    Value *ParentValue = 0;
+    Value *ParentValue = nullptr;
     for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
          PI != PE; ++PI) {
@@ -591,7 +589,7 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
   if (Node->isSubRegion()) {
     Region *SubRegion = Node->getNodeAs<Region>();
     BasicBlock *OldExit = SubRegion->getExit();
-    BasicBlock *Dominator = 0;
+    BasicBlock *Dominator = nullptr;
 
     // Find all the edges from the sub region to the exit
     for (pred_iterator I = pred_begin(OldExit), E = pred_end(OldExit);
@@ -678,7 +676,8 @@ BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
 
 /// \brief Set the previous node
 void StructurizeCFG::setPrevNode(BasicBlock *BB) {
-  PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) : 0;
+  PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB)
+                                        : nullptr;
 }
 
 /// \brief Does BB dominate all the predicates of Node ?
@@ -699,7 +698,7 @@ bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
   bool Dominated = false;
 
   // Regionentry is always true
-  if (PrevNode == 0)
+  if (!PrevNode)
     return true;
 
   for (BBPredicates::iterator I = Preds.begin(), E = Preds.end();
@@ -806,11 +805,11 @@ void StructurizeCFG::createFlow() {
   Conditions.clear();
   LoopConds.clear();
 
-  PrevNode = 0;
+  PrevNode = nullptr;
   Visited.clear();
 
   while (!Order.empty()) {
-    handleLoops(EntryDominatesExit, 0);
+    handleLoops(EntryDominatesExit, nullptr);
   }
 
   if (PrevNode)
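[Editor's note: the NearestCommonDominator helper in the StructurizeCFG hunk above answers "which block dominates all of these?" by numbering the first block's idom chain and then walking each later chain until it hits a numbered node. A sketch of the same trick on simplified types (hypothetical `Node`, not the real DomTreeNode API):

    #include <map>

    // Simplified dominator-tree node: each node knows its immediate
    // dominator (nullptr at the root).
    struct Node { Node *IDom; };

    Node *nearestCommonDominator(Node *A, Node *B) {
      // Number A's idom chain from 1 upward...
      std::map<Node *, unsigned> IndexMap;
      unsigned Numbering = 0;
      for (Node *N = A; N; N = N->IDom)
        IndexMap[N] = ++Numbering;
      // ...then the first numbered node on B's chain dominates both.
      for (Node *N = B; N; N = N->IDom)
        if (IndexMap.count(N))
          return N;
      return nullptr; // disjoint trees; cannot happen within one CFG
    }
]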
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 6d02777..05b9892 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -50,12 +50,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "tailcallelim"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/Loads.h"
@@ -64,6 +64,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -76,6 +77,8 @@
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "tailcallelim"
+
 STATISTIC(NumEliminated, "Number of tail calls removed");
 STATISTIC(NumRetDuped,   "Number of return duplicated");
 STATISTIC(NumAccumAdded, "Number of accumulators introduced");
@@ -94,6 +97,9 @@ namespace {
     bool runOnFunction(Function &F) override;
 
   private:
+    bool runTRE(Function &F);
+    bool markTails(Function &F, bool &AllCallsAreTailCalls);
+
     CallInst *FindTRECandidate(Instruction *I,
                                bool CannotTailCallElimCallsMarkedTail);
     bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
@@ -131,55 +137,255 @@ void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<TargetTransformInfo>();
 }
 
-/// CanTRE - Scan the specified basic block for alloca instructions.
-/// If it contains any that are variable-sized or not in the entry block,
-/// returns false.
-static bool CanTRE(AllocaInst *AI) {
-  // Because of PR962, we don't TRE allocas outside the entry block.
-
-  // If this alloca is in the body of the function, or if it is a variable
-  // sized allocation, we cannot tail call eliminate calls marked 'tail'
-  // with this mechanism.
-  BasicBlock *BB = AI->getParent();
-  return BB == &BB->getParent()->getEntryBlock() &&
-         isa<ConstantInt>(AI->getArraySize());
+/// \brief Scan the specified function for alloca instructions.
+/// If it contains any dynamic allocas, returns false.
+static bool CanTRE(Function &F) {
+  // Because of PR962, we don't TRE dynamic allocas.
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
+        if (!AI->isStaticAlloca())
+          return false;
+      }
+    }
+  }
+
+  return true;
 }
 
-namespace {
-struct AllocaCaptureTracker : public CaptureTracker {
-  AllocaCaptureTracker() : Captured(false) {}
+bool TailCallElim::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
 
-  void tooManyUses() override { Captured = true; }
+  bool AllCallsAreTailCalls = false;
+  bool Modified = markTails(F, AllCallsAreTailCalls);
+  if (AllCallsAreTailCalls)
+    Modified |= runTRE(F);
+  return Modified;
+}
 
-  bool shouldExplore(const Use *U) override {
-    Value *V = U->getUser();
-    if (isa<CallInst>(V) || isa<InvokeInst>(V))
-      UsesAlloca.insert(V);
-    return true;
+namespace {
+struct AllocaDerivedValueTracker {
+  // Start at a root value and walk its use-def chain to mark calls that use
+  // the value or a derived value in AllocaUsers, and places where it may
+  // escape in EscapePoints.
+  void walk(Value *Root) {
+    SmallVector<Use *, 32> Worklist;
+    SmallPtrSet<Use *, 32> Visited;
+
+    auto AddUsesToWorklist = [&](Value *V) {
+      for (auto &U : V->uses()) {
+        if (!Visited.insert(&U))
+          continue;
+        Worklist.push_back(&U);
+      }
+    };
+
+    AddUsesToWorklist(Root);
+
+    while (!Worklist.empty()) {
+      Use *U = Worklist.pop_back_val();
+      Instruction *I = cast<Instruction>(U->getUser());
+
+      switch (I->getOpcode()) {
+      case Instruction::Call:
+      case Instruction::Invoke: {
+        CallSite CS(I);
+        bool IsNocapture = !CS.isCallee(U) &&
+                           CS.doesNotCapture(CS.getArgumentNo(U));
+        callUsesLocalStack(CS, IsNocapture);
+        if (IsNocapture) {
+          // If the alloca-derived argument is passed in as nocapture, then it
+          // can't propagate to the call's return. That would be capturing.
+          continue;
+        }
+        break;
+      }
+      case Instruction::Load: {
+        // The result of a load is not alloca-derived (unless an alloca has
+        // otherwise escaped, but this is a local analysis).
+        continue;
+      }
+      case Instruction::Store: {
+        if (U->getOperandNo() == 0)
+          EscapePoints.insert(I);
+        continue;  // Stores have no users to analyze.
+      }
+      case Instruction::BitCast:
+      case Instruction::GetElementPtr:
+      case Instruction::PHI:
+      case Instruction::Select:
+      case Instruction::AddrSpaceCast:
+        break;
+      default:
+        EscapePoints.insert(I);
+        break;
+      }
+
+      AddUsesToWorklist(I);
+    }
   }
 
-  bool captured(const Use *U) override {
-    if (isa<ReturnInst>(U->getUser()))
-      return false;
-    Captured = true;
-    return true;
+  void callUsesLocalStack(CallSite CS, bool IsNocapture) {
+    // Add it to the list of alloca users. If it's already there, skip further
+    // processing.
+    if (!AllocaUsers.insert(CS.getInstruction()))
+      return;
+
+    // If it's nocapture then it can't capture the alloca.
+    if (IsNocapture)
+      return;
+
+    // If it can write to memory, it can leak the alloca value.
+    if (!CS.onlyReadsMemory())
+      EscapePoints.insert(CS.getInstruction());
   }
 
-  bool Captured;
-  SmallPtrSet<const Value *, 16> UsesAlloca;
+  SmallPtrSet<Instruction *, 32> AllocaUsers;
+  SmallPtrSet<Instruction *, 32> EscapePoints;
 };
-} // end anonymous namespace
+}
 
-bool TailCallElim::runOnFunction(Function &F) {
-  if (skipOptnoneFunction(F))
+bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) {
+  if (F.callsFunctionThatReturnsTwice())
     return false;
+  AllCallsAreTailCalls = true;
+
+  // The local stack holds all alloca instructions and all byval arguments.
+  AllocaDerivedValueTracker Tracker;
+  for (Argument &Arg : F.args()) {
+    if (Arg.hasByValAttr())
+      Tracker.walk(&Arg);
+  }
+  for (auto &BB : F) {
+    for (auto &I : BB)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+        Tracker.walk(AI);
+  }
+
+  bool Modified = false;
+
+  // Track whether a block is reachable after an alloca has escaped. Blocks
+  // that contain the escaping instruction will be marked as being visited
+  // without an escaped alloca, since that is how the block began.
+  enum VisitType {
+    UNVISITED,
+    UNESCAPED,
+    ESCAPED
+  };
+  DenseMap<BasicBlock *, VisitType> Visited;
+
+  // We propagate the fact that an alloca has escaped from block to successor.
+  // Visit the blocks that are propagating the escapedness first. To do this,
+  // we maintain two worklists.
+  SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;
+
+  // We may enter a block and visit it thinking that no alloca has escaped
+  // yet, then see an escape point and go back around a loop edge and come
+  // back to the same block twice. Because of this, we defer setting tail on
+  // calls when we first encounter them in a block. Every entry in this list
+  // does not statically use an alloca via use-def chain analysis, but may
+  // find an alloca through other means if the block turns out to be
+  // reachable after an escape point.
+  SmallVector<CallInst *, 32> DeferredTails;
+
+  BasicBlock *BB = &F.getEntryBlock();
+  VisitType Escaped = UNESCAPED;
+  do {
+    for (auto &I : *BB) {
+      if (Tracker.EscapePoints.count(&I))
+        Escaped = ESCAPED;
+
+      CallInst *CI = dyn_cast<CallInst>(&I);
+      if (!CI || CI->isTailCall())
+        continue;
+
+      if (CI->doesNotAccessMemory()) {
+        // A call to a readnone function whose arguments are all things
+        // computed outside this function can be marked tail. Even if you
+        // stored the alloca address into a global, a readnone function
+        // can't load the global anyhow.
+        //
+        // Note that this runs whether we know an alloca has escaped or not.
+        // If it has, then we can't trust Tracker.AllocaUsers to be accurate.
+        bool SafeToTail = true;
+        for (auto &Arg : CI->arg_operands()) {
+          if (isa<Constant>(Arg.getUser()))
+            continue;
+          if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
+            if (!A->hasByValAttr())
+              continue;
+          SafeToTail = false;
+          break;
+        }
+        if (SafeToTail) {
+          emitOptimizationRemark(
+              F.getContext(), "tailcallelim", F, CI->getDebugLoc(),
+              "marked this readnone call a tail call candidate");
+          CI->setTailCall();
+          Modified = true;
+          continue;
+        }
+      }
+
+      if (Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {
+        DeferredTails.push_back(CI);
+      } else {
+        AllCallsAreTailCalls = false;
+      }
+    }
+
+    for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) {
+      auto &State = Visited[SuccBB];
+      if (State < Escaped) {
+        State = Escaped;
+        if (State == ESCAPED)
+          WorklistEscaped.push_back(SuccBB);
+        else
+          WorklistUnescaped.push_back(SuccBB);
+      }
+    }
+
+    if (!WorklistEscaped.empty()) {
+      BB = WorklistEscaped.pop_back_val();
+      Escaped = ESCAPED;
+    } else {
+      BB = nullptr;
+      while (!WorklistUnescaped.empty()) {
+        auto *NextBB = WorklistUnescaped.pop_back_val();
+        if (Visited[NextBB] == UNESCAPED) {
+          BB = NextBB;
+          Escaped = UNESCAPED;
+          break;
+        }
+      }
+    }
+  } while (BB);
+
+  for (CallInst *CI : DeferredTails) {
+    if (Visited[CI->getParent()] != ESCAPED) {
+      // If the escape point was part way through the block, calls after the
+      // escape point wouldn't have been put into DeferredTails.
+      emitOptimizationRemark(F.getContext(), "tailcallelim", F,
+                             CI->getDebugLoc(),
+                             "marked this call a tail call candidate");
+      CI->setTailCall();
+      Modified = true;
+    } else {
+      AllCallsAreTailCalls = false;
+    }
+  }
+
+  return Modified;
+}
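[Editor's note: the do/while loop above is a small forward dataflow: block states only move up the lattice UNVISITED < UNESCAPED < ESCAPED, and escaped blocks are drained first so unescaped work is not redone. A self-contained toy version of the propagation (invented `Block` type; the real code walks LLVM basic blocks):

    #include <cassert>
    #include <map>
    #include <vector>

    // Toy CFG block: successors plus a flag saying an alloca escapes here.
    struct Block {
      std::vector<Block *> Succs;
      bool HasEscapePoint = false;
    };

    enum VisitType { UNVISITED, UNESCAPED, ESCAPED };

    // Monotone forward propagation: a successor's state only ever increases,
    // so each block enters a worklist at most twice and the loop terminates
    // even on cyclic graphs.
    std::map<Block *, VisitType> propagateEscapes(Block *Entry) {
      std::map<Block *, VisitType> Visited;
      std::vector<Block *> WorklistEscaped, WorklistUnescaped;
      Block *BB = Entry;
      VisitType Escaped = UNESCAPED;
      do {
        if (BB->HasEscapePoint)
          Escaped = ESCAPED;
        for (Block *Succ : BB->Succs) {
          VisitType &State = Visited[Succ];
          if (State < Escaped) {
            State = Escaped;
            (State == ESCAPED ? WorklistEscaped : WorklistUnescaped)
                .push_back(Succ);
          }
        }
        if (!WorklistEscaped.empty()) {
          BB = WorklistEscaped.back();
          WorklistEscaped.pop_back();
          Escaped = ESCAPED;
        } else {
          BB = nullptr;
          while (!WorklistUnescaped.empty()) {
            Block *Next = WorklistUnescaped.back();
            WorklistUnescaped.pop_back();
            if (Visited[Next] == UNESCAPED) { // may have been upgraded since
              BB = Next;
              Escaped = UNESCAPED;
              break;
            }
          }
        }
      } while (BB);
      return Visited;
    }

    int main() {
      Block A, B, C; // A -> B -> C, with an escape point inside B
      A.Succs = {&B};
      B.Succs = {&C};
      B.HasEscapePoint = true;
      auto V = propagateEscapes(&A);
      assert(V[&B] == UNESCAPED && V[&C] == ESCAPED);
      return 0;
    }
]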
+bool TailCallElim::runTRE(Function &F) {
   // If this function is a varargs function, we won't be able to PHI the args
   // right, so don't even try to convert it...
   if (F.getFunctionType()->isVarArg()) return false;
 
   TTI = &getAnalysis<TargetTransformInfo>();
-  BasicBlock *OldEntry = 0;
+  BasicBlock *OldEntry = nullptr;
   bool TailCallsAreMarkedTail = false;
   SmallVector<PHINode *, 8> ArgumentPHIs;
   bool MadeChange = false;
@@ -188,39 +394,23 @@ bool TailCallElim::runOnFunction(Function &F) {
   // marked with the 'tail' attribute, because doing so would cause the stack
   // size to increase (real TRE would deallocate variable sized allocas, TRE
   // doesn't).
-  bool CanTRETailMarkedCall = true;
-
-  // Find calls that can be marked tail.
-  AllocaCaptureTracker ACT;
-  for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) {
-    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-      if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
-        CanTRETailMarkedCall &= CanTRE(AI);
-        PointerMayBeCaptured(AI, &ACT);
-        // If any allocas are captured, exit.
-        if (ACT.Captured)
-          return false;
-      }
-    }
-  }
+  bool CanTRETailMarkedCall = CanTRE(F);
 
-  // Second pass, change any tail recursive calls to loops.
+  // Change any tail recursive calls to loops.
   //
   // FIXME: The code generator produces really bad code when an 'escaping
   // alloca' is changed from being a static alloca to being a dynamic alloca.
   // Until this is resolved, disable this transformation if that would ever
   // happen.  This bug is PR962.
-  if (ACT.UsesAlloca.empty()) {
-    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-      if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
-        bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
-                                            ArgumentPHIs, !CanTRETailMarkedCall);
-        if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
-          Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
-                                            TailCallsAreMarkedTail, ArgumentPHIs,
-                                            !CanTRETailMarkedCall);
-        MadeChange |= Change;
-      }
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
+      bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+                                          ArgumentPHIs, !CanTRETailMarkedCall);
+      if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
+        Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
+                                          TailCallsAreMarkedTail, ArgumentPHIs,
+                                          !CanTRETailMarkedCall);
+      MadeChange |= Change;
     }
   }
 
@@ -229,34 +419,13 @@ bool TailCallElim::runOnFunction(Function &F) {
   // with themselves.  Check to see if we did and clean up our mess if so.  This
   // occurs when a function passes an argument straight through to its tail
   // call.
-  if (!ArgumentPHIs.empty()) {
-    for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) {
-      PHINode *PN = ArgumentPHIs[i];
-
-      // If the PHI Node is a dynamic constant, replace it with the value it is.
-      if (Value *PNV = SimplifyInstruction(PN)) {
-        PN->replaceAllUsesWith(PNV);
-        PN->eraseFromParent();
-      }
-    }
-  }
+  for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) {
+    PHINode *PN = ArgumentPHIs[i];
 
-  // At this point, we know that the function does not have any captured
-  // allocas. If additionally the function does not call setjmp, mark all calls
-  // in the function that do not access stack memory with the tail keyword. This
-  // implies ensuring that there does not exist any path from a call that takes
-  // in an alloca but does not capture it and the call which we wish to mark
-  // with "tail".
-  if (!F.callsFunctionThatReturnsTwice()) {
-    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-        if (CallInst *CI = dyn_cast<CallInst>(I)) {
-          if (!ACT.UsesAlloca.count(CI)) {
-            CI->setTailCall();
-            MadeChange = true;
-          }
-        }
-      }
-    }
+    // If the PHI Node is a dynamic constant, replace it with the value it is.
+    if (Value *PNV = SimplifyInstruction(PN)) {
+      PN->replaceAllUsesWith(PNV);
+      PN->eraseFromParent();
+    }
   }
 
@@ -343,11 +512,11 @@ static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) {
 //
 static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
   Function *F = CI->getParent()->getParent();
-  Value *ReturnedValue = 0;
+  Value *ReturnedValue = nullptr;
 
   for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) {
     ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator());
-    if (RI == 0 || RI == IgnoreRI) continue;
+    if (RI == nullptr || RI == IgnoreRI) continue;
 
     // We can only perform this transformation if the value returned is
     // evaluatable at the start of the initial invocation of the function,
@@ -355,10 +524,10 @@ static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
     //
     Value *RetOp = RI->getOperand(0);
     if (!isDynamicConstant(RetOp, CI, RI))
-      return 0;
+      return nullptr;
 
     if (ReturnedValue && RetOp != ReturnedValue)
-      return 0;     // Cannot transform if differing values are returned.
+      return nullptr;     // Cannot transform if differing values are returned.
     ReturnedValue = RetOp;
   }
   return ReturnedValue;
@@ -370,18 +539,18 @@ static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
 ///
 Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I,
                                                       CallInst *CI) {
-  if (!I->isAssociative() || !I->isCommutative()) return 0;
+  if (!I->isAssociative() || !I->isCommutative()) return nullptr;
   assert(I->getNumOperands() == 2 &&
          "Associative/commutative operations should have 2 args!");
 
   // Exactly one operand should be the result of the call instruction.
   if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
       (I->getOperand(0) != CI && I->getOperand(1) != CI))
-    return 0;
+    return nullptr;
 
   // The only user of this instruction we allow is a single return instruction.
   if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
-    return 0;
+    return nullptr;
 
   // Ok, now we have to check all of the other return instructions in this
   // function.  If they return non-constants or differing values, then we cannot
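[Editor's note: CanTransformAccumulatorRecursion recognizes the classic "one associative, commutative op applied to the recursive result" shape. In scalar terms (plain C++, not from the patch), the rewrite it enables is:

    // Before: the multiply happens after the recursive call returns,
    // so the call is not in tail position.
    unsigned facRec(unsigned N) {
      if (N <= 1)
        return 1;
      return N * facRec(N - 1);
    }

    // After: an accumulator carries the pending multiplies and the
    // recursion becomes a loop; no stack growth remains.
    unsigned facLoop(unsigned N) {
      unsigned Accumulator = 1; // identity of the accumulated operation
      for (; N > 1; --N)
        Accumulator *= N;
      return Accumulator;
    }
]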
@@ -402,11 +571,11 @@ TailCallElim::FindTRECandidate(Instruction *TI,
   Function *F = BB->getParent();
 
   if (&BB->front() == TI) // Make sure there is something before the terminator.
-    return 0;
+    return nullptr;
 
   // Scan backwards from the return, checking to see if there is a tail call in
   // this block.  If so, set CI to it.
-  CallInst *CI = 0;
+  CallInst *CI = nullptr;
   BasicBlock::iterator BBI = TI;
   while (true) {
     CI = dyn_cast<CallInst>(BBI);
@@ -414,14 +583,14 @@ TailCallElim::FindTRECandidate(Instruction *TI,
       break;
 
     if (BBI == BB->begin())
-      return 0;          // Didn't find a potential tail call.
+      return nullptr;          // Didn't find a potential tail call.
     --BBI;
   }
 
   // If this call is marked as a tail call, and if there are dynamic allocas in
   // the function, we cannot perform this optimization.
   if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
-    return 0;
+    return nullptr;
 
   // As a special case, detect code like this:
   //   double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
@@ -441,7 +610,7 @@ TailCallElim::FindTRECandidate(Instruction *TI,
     for (; I != E && FI != FE; ++I, ++FI)
       if (*I != &*FI) break;
     if (I == E && FI == FE)
-      return 0;
+      return nullptr;
   }
 
   return CI;
@@ -462,8 +631,8 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
   // which is different to the constant returned by other return instructions
   // (which is recorded in AccumulatorRecursionEliminationInitVal).  This is a
   // special case of accumulator recursion, the operation being "return C".
-  Value *AccumulatorRecursionEliminationInitVal = 0;
-  Instruction *AccumulatorRecursionInstr = 0;
+  Value *AccumulatorRecursionEliminationInitVal = nullptr;
+  Instruction *AccumulatorRecursionInstr = nullptr;
 
   // Ok, we found a potential tail call.  We can currently only transform the
   // tail call if all of the instructions between the call and the return are
@@ -493,8 +662,8 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
   // accumulator recursion variable eliminated.
   if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI &&
       !isa<UndefValue>(Ret->getReturnValue()) &&
-      AccumulatorRecursionEliminationInitVal == 0 &&
-      !getCommonReturnValue(0, CI)) {
+      AccumulatorRecursionEliminationInitVal == nullptr &&
+      !getCommonReturnValue(nullptr, CI)) {
     // One case remains that we are able to handle: the current return
     // instruction returns a constant, and all other return instructions
     // return a different constant.
@@ -510,9 +679,12 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
   BasicBlock *BB = Ret->getParent();
   Function *F = BB->getParent();
 
+  emitOptimizationRemark(F->getContext(), "tailcallelim", *F, CI->getDebugLoc(),
+                         "transforming tail recursion to loop");
+
   // OK! We can transform this tail call.  If this is the first one found,
   // create the new entry block, allowing us to branch back to the old entry.
-  if (OldEntry == 0) {
+  if (!OldEntry) {
     OldEntry = &F->getEntryBlock();
     BasicBlock *NewEntry = BasicBlock::Create(F->getContext(), "", F, OldEntry);
     NewEntry->takeName(OldEntry);
diff --git a/lib/Transforms/Utils/AddDiscriminators.cpp b/lib/Transforms/Utils/AddDiscriminators.cpp
index f42635e..196ac79 100644
--- a/lib/Transforms/Utils/AddDiscriminators.cpp
+++ b/lib/Transforms/Utils/AddDiscriminators.cpp
@@ -52,8 +52,6 @@
 // http://wiki.dwarfstd.org/index.php?title=Path_Discriminators
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "add-discriminators"
-
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -69,6 +67,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "add-discriminators"
+
 namespace {
   struct AddDiscriminators : public FunctionPass {
     static char ID; // Pass identification, replacement for typeid
@@ -99,7 +99,7 @@ FunctionPass *llvm::createAddDiscriminatorsPass() {
 static bool hasDebugInfo(const Function &F) {
   NamedMDNode *CUNodes = F.getParent()->getNamedMetadata("llvm.dbg.cu");
-  return CUNodes != 0;
+  return CUNodes != nullptr;
 }
 
 /// \brief Assign DWARF discriminators.
@@ -154,10 +154,15 @@ static bool hasDebugInfo(const Function &F) {
 ///   file and line location as I2. This new lexical block will have a
 ///   different discriminator number than I1.
 bool AddDiscriminators::runOnFunction(Function &F) {
-  // No need to do anything if there is no debug info for this function.
   // If the function has debug information, but the user has disabled
   // discriminators, do nothing.
-  if (!hasDebugInfo(F) || NoDiscriminators) return false;
+  // Similarly, if the function has no debug info, do nothing.
+  // Finally, if this module is built with dwarf versions earlier than 4,
+  // do nothing (discriminator support is a DWARF 4 feature).
+  if (NoDiscriminators ||
+      !hasDebugInfo(F) ||
+      F.getParent()->getDwarfVersion() < 4)
+    return false;
 
   bool Changed = false;
   Module *M = F.getParent();
diff --git a/lib/Transforms/Utils/Android.mk b/lib/Transforms/Utils/Android.mk
index ab4d8a8..cbd8dd0 100644
--- a/lib/Transforms/Utils/Android.mk
+++ b/lib/Transforms/Utils/Android.mk
@@ -11,6 +11,7 @@ transforms_utils_SRC_FILES := \
   CloneModule.cpp \
   CmpInstAnalysis.cpp \
   CodeExtractor.cpp \
+  CtorUtils.cpp \
   DemoteRegToStack.cpp \
   GlobalStatus.cpp \
   InlineFunction.cpp \
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index b3cd5ce..80b7e22 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -68,8 +68,8 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) {
 void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P) {
   if (!isa<PHINode>(BB->begin())) return;
 
-  AliasAnalysis *AA = 0;
-  MemoryDependenceAnalysis *MemDep = 0;
+  AliasAnalysis *AA = nullptr;
+  MemoryDependenceAnalysis *MemDep = nullptr;
   if (P) {
     AA = P->getAnalysisIfAvailable<AliasAnalysis>();
     MemDep = P->getAnalysisIfAvailable<MemoryDependenceAnalysis>();
@@ -130,7 +130,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) {
   BasicBlock *OnlySucc = BB;
   for (; SI != SE; ++SI)
     if (*SI != OnlySucc) {
-      OnlySucc = 0;     // There are multiple distinct successors!
+      OnlySucc = nullptr; // There are multiple distinct successors!
       break;
     }
 
@@ -217,7 +217,7 @@ void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL,
 ///
 void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL,
                                BasicBlock::iterator &BI, Instruction *I) {
-  assert(I->getParent() == 0 &&
+  assert(I->getParent() == nullptr &&
          "ReplaceInstWithInst: Instruction already inserted into basic block!");
 
   // Insert the new instruction into the basic block...
@@ -254,7 +254,7 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
       // If the successor only has a single pred, split the top of the successor
       // block.
       assert(SP == BB && "CFG broken");
-      SP = NULL;
+      SP = nullptr;
       return SplitBlock(Succ, Succ->begin(), P);
     }
@@ -310,7 +310,7 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
   if (!P) return;
 
   LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>();
-  Loop *L = LI ? LI->getLoopFor(OldBB) : 0;
+  Loop *L = LI ? LI->getLoopFor(OldBB) : nullptr;
 
   // If we need to preserve loop analyses, collect some information about how
   // this split will affect loops.
@@ -351,7 +351,7 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
   // loop). To find this, examine each of the predecessors and determine which
   // loops enclose them, and select the most-nested loop which contains the
   // loop containing the block being split.
-  Loop *InnermostPredLoop = 0;
+  Loop *InnermostPredLoop = nullptr;
   for (ArrayRef<BasicBlock *>::iterator
          i = Preds.begin(), e = Preds.end(); i != e; ++i) {
     BasicBlock *Pred = *i;
@@ -384,51 +384,68 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
                            ArrayRef<BasicBlock *> Preds, BranchInst *BI,
                            Pass *P, bool HasLoopExit) {
   // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB.
-  AliasAnalysis *AA = P ? P->getAnalysisIfAvailable<AliasAnalysis>() : 0;
+  AliasAnalysis *AA = P ? P->getAnalysisIfAvailable<AliasAnalysis>() : nullptr;
+  SmallPtrSet<BasicBlock *, 16> PredSet(Preds.begin(), Preds.end());
   for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) {
     PHINode *PN = cast<PHINode>(I++);
 
     // Check to see if all of the values coming in are the same.  If so, we
     // don't need to create a new PHI node, unless it's needed for LCSSA.
-    Value *InVal = 0;
+    Value *InVal = nullptr;
     if (!HasLoopExit) {
       InVal = PN->getIncomingValueForBlock(Preds[0]);
-      for (unsigned i = 1, e = Preds.size(); i != e; ++i)
-        if (InVal != PN->getIncomingValueForBlock(Preds[i])) {
-          InVal = 0;
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+        if (!PredSet.count(PN->getIncomingBlock(i)))
+          continue;
+        if (!InVal)
+          InVal = PN->getIncomingValue(i);
+        else if (InVal != PN->getIncomingValue(i)) {
+          InVal = nullptr;
           break;
         }
+      }
     }
 
     if (InVal) {
       // If all incoming values for the new PHI would be the same, just don't
       // make a new PHI.  Instead, just remove the incoming values from the old
       // PHI.
-      for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
-        // Explicitly check the BB index here to handle duplicates in Preds.
-        int Idx = PN->getBasicBlockIndex(Preds[i]);
-        if (Idx >= 0)
-          PN->removeIncomingValue(Idx, false);
-      }
-    } else {
-      // If the values coming into the block are not the same, we need a PHI.
-      // Create the new PHI node, insert it into NewBB at the end of the block
-      PHINode *NewPHI =
-        PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI);
-      if (AA) AA->copyValue(PN, NewPHI);
 
-      // Move all of the PHI values for 'Preds' to the new PHI.
-      for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
-        Value *V = PN->removeIncomingValue(Preds[i], false);
-        NewPHI->addIncoming(V, Preds[i]);
-      }
+      // NOTE! This loop walks backwards for a reason! First off, this
+      // minimizes the cost of removal if we end up removing a large number of
+      // values, and second off, this ensures that the indices for the incoming
+      // values aren't invalidated when we remove one.
+      for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i)
+        if (PredSet.count(PN->getIncomingBlock(i)))
+          PN->removeIncomingValue(i, false);
+
+      // Add an incoming value to the PHI node in the loop for the preheader
+      // edge.
+      PN->addIncoming(InVal, NewBB);
+      continue;
+    }
 
-      InVal = NewPHI;
+    // If the values coming into the block are not the same, we need a new
+    // PHI.
+    // Create the new PHI node, insert it into NewBB at the end of the block
+    PHINode *NewPHI =
+        PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI);
+    if (AA)
+      AA->copyValue(PN, NewPHI);
+
+    // NOTE! This loop walks backwards for a reason! First off, this minimizes
+    // the cost of removal if we end up removing a large number of values, and
+    // second off, this ensures that the indices for the incoming values aren't
+    // invalidated when we remove one.
+    for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i) {
+      BasicBlock *IncomingBB = PN->getIncomingBlock(i);
+      if (PredSet.count(IncomingBB)) {
+        Value *V = PN->removeIncomingValue(i, false);
+        NewPHI->addIncoming(V, IncomingBB);
+      }
     }
 
-    // Add an incoming value to the PHI node in the loop for the preheader
-    // edge.
-    PN->addIncoming(InVal, NewBB);
+    PN->addIncoming(NewPHI, NewBB);
   }
 }
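[Editor's note: both NOTE comments above guard the same pitfall: removing incoming values by index while iterating forward would shift the not-yet-visited entries. A tiny C++ demonstration of why the backwards walk is safe (std::vector stands in for the PHI's operand list):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      std::vector<int> V = {1, 2, 2, 3};
      // Walk backwards: erasing at i only shifts elements past i, and those
      // have already been visited, so no pending index is invalidated.
      for (int64_t i = V.size() - 1; i >= 0; --i)
        if (V[i] == 2)
          V.erase(V.begin() + i);
      assert((V == std::vector<int>{1, 3}));
      return 0;
    }
]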
@@ -542,7 +559,7 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
     e = pred_end(OrigBB);
   }
 
-  BasicBlock *NewBB2 = 0;
+  BasicBlock *NewBB2 = nullptr;
   if (!NewBB2Preds.empty()) {
     // Create another basic block for the rest of OrigBB's predecessors.
     NewBB2 = BasicBlock::Create(OrigBB->getContext(),
@@ -607,7 +624,7 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
   for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end();
        i != e; ++i) {
     Value *V = *i;
-    Instruction *NewBC = 0;
+    Instruction *NewBC = nullptr;
     if (BitCastInst *BCI = dyn_cast<BitCastInst>(V)) {
       // Return value might be bitcasted. Clone and insert it before the
       // return instruction.
@@ -724,32 +741,32 @@ void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,
 Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
                             BasicBlock *&IfFalse) {
   PHINode *SomePHI = dyn_cast<PHINode>(BB->begin());
-  BasicBlock *Pred1 = NULL;
-  BasicBlock *Pred2 = NULL;
+  BasicBlock *Pred1 = nullptr;
+  BasicBlock *Pred2 = nullptr;
 
   if (SomePHI) {
     if (SomePHI->getNumIncomingValues() != 2)
-      return NULL;
+      return nullptr;
     Pred1 = SomePHI->getIncomingBlock(0);
     Pred2 = SomePHI->getIncomingBlock(1);
   } else {
     pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
     if (PI == PE) // No predecessor
-      return NULL;
+      return nullptr;
     Pred1 = *PI++;
     if (PI == PE) // Only one predecessor
-      return NULL;
+      return nullptr;
     Pred2 = *PI++;
     if (PI != PE) // More than two predecessors
-      return NULL;
+      return nullptr;
   }
 
   // We can only handle branches.  Other control flow will be lowered to
   // branches if possible anyway.
   BranchInst *Pred1Br = dyn_cast<BranchInst>(Pred1->getTerminator());
   BranchInst *Pred2Br = dyn_cast<BranchInst>(Pred2->getTerminator());
-  if (Pred1Br == 0 || Pred2Br == 0)
-    return 0;
+  if (!Pred1Br || !Pred2Br)
+    return nullptr;
 
   // Eliminate code duplication by ensuring that Pred1Br is conditional if
   // either are.
@@ -759,7 +776,7 @@ Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
     // required anyway, we stand no chance of eliminating it, so the xform is
     // probably not profitable.
     if (Pred1Br->isConditional())
-      return 0;
+      return nullptr;
 
     std::swap(Pred1, Pred2);
     std::swap(Pred1Br, Pred2Br);
@@ -769,8 +786,8 @@ Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
     // The only thing we have to watch out for here is to make sure that Pred2
     // doesn't have incoming edges from other blocks.  If it does, the condition
     // doesn't dominate BB.
-    if (Pred2->getSinglePredecessor() == 0)
-      return 0;
+    if (!Pred2->getSinglePredecessor())
+      return nullptr;
 
     // If we found a conditional branch predecessor, make sure that it branches
     // to BB and Pred2Br.  If it doesn't, this isn't an "if statement".
@@ -785,7 +802,7 @@ Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
     } else {
       // We know that one arm of the conditional goes to BB, so the other must
       // go somewhere unrelated, and this must not be an "if statement".
-      return 0;
+      return nullptr;
     }
 
     return Pred1Br->getCondition();
@@ -795,12 +812,12 @@ Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
   // BB.  Don't panic!  If both blocks only have a single (identical)
   // predecessor, and THAT is a conditional branch, then we're all ok!
   BasicBlock *CommonPred = Pred1->getSinglePredecessor();
-  if (CommonPred == 0 || CommonPred != Pred2->getSinglePredecessor())
-    return 0;
+  if (CommonPred == nullptr || CommonPred != Pred2->getSinglePredecessor())
+    return nullptr;
 
   // Otherwise, if this is a conditional branch, then we can use it!
   BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator());
-  if (BI == 0) return 0;
+  if (!BI) return nullptr;
 
   assert(BI->isConditional() && "Two successors but not conditional?");
   if (BI->getSuccessor(0) == Pred1) {
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 76ebb9f..80bd516 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -15,7 +15,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "break-crit-edges"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -30,6 +29,8 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "break-crit-edges"
+
 STATISTIC(NumBroken, "Number of blocks inserted");
 
 namespace {
@@ -141,7 +142,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
                                     Pass *P, bool MergeIdenticalEdges,
                                     bool DontDeleteUselessPhis,
                                     bool SplitLandingPads) {
-  if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return 0;
+  if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return nullptr;
 
   assert(!isa<IndirectBrInst>(TI) &&
         "Cannot split critical edge from IndirectBrInst");
@@ -151,7 +152,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
 
   // Splitting the critical edge to a landing pad block is non-trivial. Don't do
   // it in this generic function.
-  if (DestBB->isLandingPad()) return 0;
+  if (DestBB->isLandingPad()) return nullptr;
 
   // Create a new basic block, linking it into the CFG.
   BasicBlock *NewBB = BasicBlock::Create(TI->getContext(),
@@ -207,15 +208,15 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
 
   // If we don't have a pass object, we can't update anything...
-  if (P == 0) return NewBB;
+  if (!P) return NewBB;
 
   DominatorTreeWrapperPass *DTWP =
       P->getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-  DominatorTree *DT = DTWP ? &DTWP->getDomTree() : 0;
+  DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
   LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>();
 
   // If we have nothing to update, just return.
-  if (DT == 0 && LI == 0)
+  if (!DT && !LI)
     return NewBB;
 
   // Now update analysis information.  Since the only predecessor of NewBB is
@@ -251,7 +252,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
   //
   if (TINode) {       // Don't break unreachable code!
     DomTreeNode *NewBBNode = DT->addNewBlock(NewBB, TIBB);
-    DomTreeNode *DestBBNode = 0;
+    DomTreeNode *DestBBNode = nullptr;
 
     // If NewBBDominatesDestBB hasn't been computed yet, do so with DT.
     if (!OtherPreds.empty()) {
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index 82384a1..be00b69 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -27,7 +27,8 @@ using namespace llvm;
 
 /// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*.
 Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) {
-  return B.CreateBitCast(V, B.getInt8PtrTy(), "cstr");
+  unsigned AS = V->getType()->getPointerAddressSpace();
+  return B.CreateBitCast(V, B.getInt8PtrTy(AS), "cstr");
 }
 
 /// EmitStrLen - Emit a call to the strlen function to the builder, for the
@@ -35,7 +36,7 @@ Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) {
 Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD,
                         const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::strlen))
-    return 0;
+    return nullptr;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS[2];
@@ -64,7 +65,7 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD,
 Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
                          const DataLayout *TD, const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::strnlen))
-    return 0;
+    return nullptr;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS[2];
@@ -94,7 +95,7 @@ Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
 Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
                         const DataLayout *TD, const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::strchr))
-    return 0;
+    return nullptr;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
@@ -120,7 +121,7 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len,
                          IRBuilder<> &B, const DataLayout *TD,
                          const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::strncmp))
-    return 0;
+    return nullptr;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS[3];
@@ -153,7 +154,7 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
                         const DataLayout *TD, const TargetLibraryInfo *TLI,
                         StringRef Name) {
   if (!TLI->has(LibFunc::strcpy))
-    return 0;
+    return nullptr;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS[2];
@@ -177,7 +178,7 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len,
                          IRBuilder<> &B, const DataLayout *TD,
                          const TargetLibraryInfo *TLI, StringRef Name) {
   if (!TLI->has(LibFunc::strncpy))
-    return 0;
+    return nullptr;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS[2];
@@ -204,7 +205,7 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
                            IRBuilder<> &B, const DataLayout *TD,
                            const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::memcpy_chk))
-    return 0;
+    return nullptr;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS;
@@ -232,7 +233,7 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val,
                         Value *Len, IRBuilder<> &B, const DataLayout *TD,
                         const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::memchr))
-    return 0;
+    return nullptr;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS;
@@ -260,7 +261,7 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
                         Value *Len, IRBuilder<> &B, const DataLayout *TD,
                         const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::memcmp))
-    return 0;
+    return nullptr;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS[3];
@@ -347,7 +348,7 @@ Value *llvm::EmitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
 Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const DataLayout *TD,
                          const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::putchar))
-    return 0;
+    return nullptr;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(),
@@ -369,7
 +370,7 @@ Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const DataLayout *TD,
 Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD,
                       const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::puts))
-    return 0;
+    return nullptr;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS[2];
@@ -393,7 +394,7 @@ Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD,
 Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
                        const DataLayout *TD, const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::fputc))
-    return 0;
+    return nullptr;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS[2];
@@ -426,7 +427,7 @@ Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
 Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
                        const DataLayout *TD, const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::fputs))
-    return 0;
+    return nullptr;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS[3];
@@ -459,7 +460,7 @@ Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
                         IRBuilder<> &B, const DataLayout *TD,
                         const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::fwrite))
-    return 0;
+    return nullptr;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS[3];
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
index 1f517d0..f2d5e07 100644
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -15,7 +15,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "bypass-slow-division"
 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/IR/Function.h"
@@ -24,6 +23,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "bypass-slow-division"
+
 namespace {
   struct DivOpInfo {
     bool SignedOp;
@@ -53,11 +54,11 @@ namespace llvm {
     }
 
     static DivOpInfo getEmptyKey() {
-      return DivOpInfo(false, 0, 0);
+      return DivOpInfo(false, nullptr, nullptr);
    }
 
     static DivOpInfo getTombstoneKey() {
-      return DivOpInfo(true, 0, 0);
+      return DivOpInfo(true, nullptr, nullptr);
     }
 
     static unsigned getHashValue(const DivOpInfo &Val) {
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index dac2090..e10ca90 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_library(LLVMTransformUtils
   BreakCriticalEdges.cpp
   BuildLibCalls.cpp
   BypassSlowDivision.cpp
+  CtorUtils.cpp
   CloneFunction.cpp
   CloneModule.cpp
   CmpInstAnalysis.cpp
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index a199086..5c8f20d 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -159,7 +159,7 @@ static MDNode* FindSubprogram(const Function *F, DebugInfoFinder &Finder) {
   for (DISubprogram Subprogram : Finder.subprograms()) {
     if (Subprogram.describes(F)) return Subprogram;
   }
-  return NULL;
+  return nullptr;
 }
 
 // Add an operand to an existing MDNode. The new operand will be added at the
@@ -359,7 +359,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
       // If the condition was a known constant in the callee...
       ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
       // Or is a known constant in the caller...
-      if (Cond == 0) {
+      if (!Cond) {
         Value *V = VMap[BI->getCondition()];
         Cond = dyn_cast_or_null<ConstantInt>(V);
       }
@@ -375,7 +375,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
     } else if (const SwitchInst *SI = dyn_cast<SwitchInst>(OldTI)) {
       // If switching on a value known constant in the caller.
       ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
-      if (Cond == 0) { // Or known constant after constant prop in the callee...
+      if (!Cond) { // Or known constant after constant prop in the callee...
         Value *V = VMap[SI->getCondition()];
         Cond = dyn_cast_or_null<ConstantInt>(V);
       }
@@ -454,7 +454,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
        BI != BE; ++BI) {
     Value *V = VMap[BI];
     BasicBlock *NewBB = cast_or_null<BasicBlock>(V);
-    if (NewBB == 0) continue;  // Dead block.
+    if (!NewBB) continue;  // Dead block.
 
     // Add the new block to the new function.
     NewFunc->getBasicBlockList().push_back(NewBB);
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index 64df089..eb67db1 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -47,8 +47,8 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
     GlobalVariable *GV = new GlobalVariable(*New,
                                             I->getType()->getElementType(),
                                             I->isConstant(), I->getLinkage(),
-                                            (Constant*) 0, I->getName(),
-                                            (GlobalVariable*) 0,
+                                            (Constant*) nullptr, I->getName(),
+                                            (GlobalVariable*) nullptr,
                                             I->getThreadLocalMode(),
                                             I->getType()->getAddressSpace());
     GV->copyAttributesFrom(I);
@@ -67,8 +67,10 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
   // Loop over the aliases in the module
   for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
        I != E; ++I) {
-    GlobalAlias *GA = new GlobalAlias(I->getType(), I->getLinkage(),
-                                      I->getName(), NULL, New);
+    auto *PTy = cast<PointerType>(I->getType());
+    auto *GA =
+        GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
+                            I->getLinkage(), I->getName(), New);
     GA->copyAttributesFrom(I);
     VMap[I] = GA;
   }
@@ -105,8 +107,8 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
   for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
        I != E; ++I) {
     GlobalAlias *GA = cast<GlobalAlias>(VMap[I]);
-    if (const Constant *C = I->getAliasee())
-      GA->setAliasee(MapValue(C, VMap));
+    if (const GlobalObject *C = I->getAliasee())
+      GA->setAliasee(cast<GlobalObject>(MapValue(C, VMap)));
   }
 
   // And named metadata....
diff --git a/lib/Transforms/Utils/CmpInstAnalysis.cpp b/lib/Transforms/Utils/CmpInstAnalysis.cpp
index 8fa412a..3b15a0a 100644
--- a/lib/Transforms/Utils/CmpInstAnalysis.cpp
+++ b/lib/Transforms/Utils/CmpInstAnalysis.cpp
@@ -84,7 +84,7 @@ Value *llvm::getICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
       case 7: // True.
         return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
   }
-  return NULL;
+  return nullptr;
 }
 
 /// PredicatesFoldable - Return true if both predicates match sign or if at
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index b814842..e70a7d6 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -38,6 +38,8 @@
 #include <set>
 using namespace llvm;
 
+#define DEBUG_TYPE "code-extractor"
+
 // Provide a command-line option to aggregate function arguments into a struct
 // for functions produced by the code extractor.  This is useful when converting
 // extracted functions to pthread-based code, as only one argument (void*) can
@@ -118,7 +120,7 @@ buildExtractionBlockSet(const RegionNode &RN) {
 }
 
 CodeExtractor::CodeExtractor(BasicBlock *BB, bool AggregateArgs)
-  : DT(0), AggregateArgs(AggregateArgs||AggregateArgsOpt),
+  : DT(nullptr), AggregateArgs(AggregateArgs||AggregateArgsOpt),
     Blocks(buildExtractionBlockSet(BB)), NumExitBlocks(~0U) {}
 
 CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
@@ -410,7 +412,7 @@ static BasicBlock* FindPhiPredForUseInBlock(Value* Used, BasicBlock* BB) {
       return P->getIncomingBlock(U);
   }
 
-  return 0;
+  return nullptr;
 }
 
 /// emitCallAndSwitchStatement - This method sets up the caller side by adding
@@ -438,14 +440,14 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
       StructValues.push_back(*i);
     } else {
       AllocaInst *alloca =
-        new AllocaInst((*i)->getType(), 0, (*i)->getName()+".loc",
+        new AllocaInst((*i)->getType(), nullptr, (*i)->getName()+".loc",
                        codeReplacer->getParent()->begin()->begin());
       ReloadOutputs.push_back(alloca);
       params.push_back(alloca);
     }
   }
 
-  AllocaInst *Struct = 0;
+  AllocaInst *Struct = nullptr;
   if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
     std::vector<Type *> ArgTypes;
     for (ValueSet::iterator v = StructValues.begin(),
@@ -455,7 +457,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
     // Allocate a struct at the beginning of this function
     Type *StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
     Struct =
-      new AllocaInst(StructArgTy, 0, "structArg",
+      new AllocaInst(StructArgTy, nullptr, "structArg",
                      codeReplacer->getParent()->begin()->begin());
     params.push_back(Struct);
 
@@ -484,7 +486,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
 
   // Reload the outputs passed in by reference
   for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
-    Value *Output = 0;
+    Value *Output = nullptr;
     if (AggregateArgs) {
       Value *Idx[2];
       Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
@@ -537,7 +539,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
                                        newFunction);
     unsigned SuccNum = switchVal++;
 
-    Value *brVal = 0;
+    Value *brVal = nullptr;
     switch (NumExitBlocks) {
     case 0:
     case 1: break;  // No value needed.
@@ -633,7 +635,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
 
   // Check if the function should return a value
   if (OldFnRetTy->isVoidTy()) {
-    ReturnInst::Create(Context, 0, TheSwitch);  // Return void
+    ReturnInst::Create(Context, nullptr, TheSwitch);  // Return void
   } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) {
     // return what we have
     ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch);
@@ -685,7 +687,7 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) {
 
 Function *CodeExtractor::extractCodeRegion() {
   if (!isEligible())
-    return 0;
+    return nullptr;
 
   ValueSet inputs, outputs;
diff --git a/lib/Transforms/Utils/CtorUtils.cpp b/lib/Transforms/Utils/CtorUtils.cpp
new file mode 100644
index 0000000..a359424
--- /dev/null
+++ b/lib/Transforms/Utils/CtorUtils.cpp
@@ -0,0 +1,183 @@
+//===- CtorUtils.cpp - Helpers for working with global_ctors ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines functions that are used to process llvm.global_ctors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "ctor_utils"
+
+namespace llvm {
+
+namespace {
+/// Given a specified llvm.global_ctors list, install the
+/// specified array.
+void installGlobalCtors(GlobalVariable *GCL,
+                        const std::vector<Function *> &Ctors) {
+  // If we made a change, reassemble the initializer list.
+  Constant *CSVals[3];
+
+  StructType *StructTy =
+      cast<StructType>(GCL->getType()->getElementType()->getArrayElementType());
+
+  // Create the new init list.
+  std::vector<Constant *> CAList;
+  for (Function *F : Ctors) {
+    Type *Int32Ty = Type::getInt32Ty(GCL->getContext());
+    if (F) {
+      CSVals[0] = ConstantInt::get(Int32Ty, 65535);
+      CSVals[1] = F;
+    } else {
+      CSVals[0] = ConstantInt::get(Int32Ty, 0x7fffffff);
+      CSVals[1] = Constant::getNullValue(StructTy->getElementType(1));
+    }
+    // FIXME: Only allow the 3-field form in LLVM 4.0.
+    size_t NumElts = StructTy->getNumElements();
+    if (NumElts > 2)
+      CSVals[2] = Constant::getNullValue(StructTy->getElementType(2));
+    CAList.push_back(
+        ConstantStruct::get(StructTy, makeArrayRef(CSVals, NumElts)));
+  }
+
+  // Create the array initializer.
+  Constant *CA =
+      ConstantArray::get(ArrayType::get(StructTy, CAList.size()), CAList);
+
+  // If we didn't change the number of elements, don't create a new GV.
+  if (CA->getType() == GCL->getInitializer()->getType()) {
+    GCL->setInitializer(CA);
+    return;
+  }
+
+  // Create the new global and insert it next to the existing list.
+  GlobalVariable *NGV =
+      new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(),
+                         CA, "", GCL->getThreadLocalMode());
+  GCL->getParent()->getGlobalList().insert(GCL, NGV);
+  NGV->takeName(GCL);
+
+  // Nuke the old list, replacing any uses with the new one.
+  if (!GCL->use_empty()) {
+    Constant *V = NGV;
+    if (V->getType() != GCL->getType())
+      V = ConstantExpr::getBitCast(V, GCL->getType());
+    GCL->replaceAllUsesWith(V);
+  }
+  GCL->eraseFromParent();
+}
+
+/// Given a llvm.global_ctors list that we can understand,
+/// return a list of the functions and null terminator as a vector.
+std::vector<Function *> parseGlobalCtors(GlobalVariable *GV) {
+  if (GV->getInitializer()->isNullValue())
+    return std::vector<Function *>();
+  ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+  std::vector<Function *> Result;
+  Result.reserve(CA->getNumOperands());
+  for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) {
+    ConstantStruct *CS = cast<ConstantStruct>(*i);
+    Result.push_back(dyn_cast<Function>(CS->getOperand(1)));
+  }
+  return Result;
+}
+
+/// Find the llvm.global_ctors list, verifying that all initializers have an
+/// init priority of 65535.
+GlobalVariable *findGlobalCtors(Module &M) {
+  GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+  if (!GV)
+    return nullptr;
+
+  // Verify that the initializer is simple enough for us to handle. We are
+  // only allowed to optimize the initializer if it is unique.
+ if (!GV->hasUniqueInitializer()) + return nullptr; + + if (isa<ConstantAggregateZero>(GV->getInitializer())) + return GV; + ConstantArray *CA = cast<ConstantArray>(GV->getInitializer()); + + for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) { + if (isa<ConstantAggregateZero>(*i)) + continue; + ConstantStruct *CS = cast<ConstantStruct>(*i); + if (isa<ConstantPointerNull>(CS->getOperand(1))) + continue; + + // Must have a function or null ptr. + if (!isa<Function>(CS->getOperand(1))) + return nullptr; + + // Init priority must be standard. + ConstantInt *CI = cast<ConstantInt>(CS->getOperand(0)); + if (CI->getZExtValue() != 65535) + return nullptr; + } + + return GV; +} +} // namespace + +/// Call "ShouldRemove" for every entry in M's global_ctor list and remove the +/// entries for which it returns true. Return true if anything changed. +bool optimizeGlobalCtorsList(Module &M, + function_ref<bool(Function *)> ShouldRemove) { + GlobalVariable *GlobalCtors = findGlobalCtors(M); + if (!GlobalCtors) + return false; + + std::vector<Function *> Ctors = parseGlobalCtors(GlobalCtors); + if (Ctors.empty()) + return false; + + bool MadeChange = false; + + // Loop over global ctors, optimizing them when we can. + for (unsigned i = 0; i != Ctors.size(); ++i) { + Function *F = Ctors[i]; + // Found a null terminator in the middle of the list, prune off the rest of + // the list. + if (!F) { + if (i != Ctors.size() - 1) { + Ctors.resize(i + 1); + MadeChange = true; + } + break; + } + DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n"); + + // We cannot simplify external ctor functions. + if (F->empty()) + continue; + + // If we can evaluate the ctor at compile time, do. + if (ShouldRemove(F)) { + Ctors.erase(Ctors.begin() + i); + MadeChange = true; + --i; + continue; + } + } + + if (!MadeChange) + return false; + + installGlobalCtors(GlobalCtors, Ctors); + return true; +} + +} // End llvm namespace diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp index ac6926f..9972b22 100644 --- a/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -25,17 +25,17 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, Instruction *AllocaPoint) { if (I.use_empty()) { I.eraseFromParent(); - return 0; + return nullptr; } // Create a stack slot to hold the value. AllocaInst *Slot; if (AllocaPoint) { - Slot = new AllocaInst(I.getType(), 0, + Slot = new AllocaInst(I.getType(), nullptr, I.getName()+".reg2mem", AllocaPoint); } else { Function *F = I.getParent()->getParent(); - Slot = new AllocaInst(I.getType(), 0, I.getName()+".reg2mem", + Slot = new AllocaInst(I.getType(), nullptr, I.getName()+".reg2mem", F->getEntryBlock().begin()); } @@ -56,7 +56,7 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) if (PN->getIncomingValue(i) == &I) { Value *&V = Loads[PN->getIncomingBlock(i)]; - if (V == 0) { + if (!V) { // Insert the load into the predecessor block V = new LoadInst(Slot, I.getName()+".reload", VolatileLoads, PN->getIncomingBlock(i)->getTerminator()); @@ -110,17 +110,17 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) { if (P->use_empty()) { P->eraseFromParent(); - return 0; + return nullptr; } // Create a stack slot to hold the value.
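A comment-style sketch of the resulting code may help at this point (illustrative IR, continuing the comment above):

    // For %p = phi i32 [ %a, %bb1 ], [ %b, %bb2 ], the demotion below
    // creates %p.reg2mem, stores %a at the end of %bb1 and %b at the end
    // of %bb2, then replaces %p with a load from %p.reg2mem inserted
    // after the PHIs of its block.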
AllocaInst *Slot; if (AllocaPoint) { - Slot = new AllocaInst(P->getType(), 0, + Slot = new AllocaInst(P->getType(), nullptr, P->getName()+".reg2mem", AllocaPoint); } else { Function *F = P->getParent()->getParent(); - Slot = new AllocaInst(P->getType(), 0, P->getName()+".reg2mem", + Slot = new AllocaInst(P->getType(), nullptr, P->getName()+".reg2mem", F->getEntryBlock().begin()); } diff --git a/lib/Transforms/Utils/FlattenCFG.cpp b/lib/Transforms/Utils/FlattenCFG.cpp index 39c80f8..51ead40 100644 --- a/lib/Transforms/Utils/FlattenCFG.cpp +++ b/lib/Transforms/Utils/FlattenCFG.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "flattencfg" #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -22,16 +21,19 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; +#define DEBUG_TYPE "flattencfg" + namespace { class FlattenCFGOpt { AliasAnalysis *AA; /// \brief Use parallel-and or parallel-or to generate conditions for /// conditional branches. - bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = 0); + bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, + Pass *P = nullptr); /// \brief If \param BB is the merge block of an if-region, attempt to merge /// the if-region with an adjacent if-region upstream if two if-regions /// contain identical instructions. - bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = 0); + bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = nullptr); /// \brief Compare a pair of blocks: \p Block1 and \p Block2, which /// are from two if-regions whose entry blocks are \p Head1 and \p /// Head2. \returns true if \p Block1 and \p Block2 contain identical @@ -126,9 +128,9 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, if (PHI) return false; // For simplicity, avoid cases containing PHI nodes. - BasicBlock *LastCondBlock = NULL; - BasicBlock *FirstCondBlock = NULL; - BasicBlock *UnCondBlock = NULL; + BasicBlock *LastCondBlock = nullptr; + BasicBlock *FirstCondBlock = nullptr; + BasicBlock *UnCondBlock = nullptr; int Idx = -1; // Check predecessors of \param BB. 
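Since the hunk above stops mid-routine, here is a compact C++ analogue of what FlattenParallelAndOr aims for (a sketch at the source level; the pass itself rewrites the CFG directly):

    // Two nested conditional branches (two blocks, two branches)...
    int before(bool a, bool b) { if (a) if (b) return 1; return 0; }

    // ...collapse into one block that computes the combined predicate;
    // '&' evaluates both operands, so no short-circuit edges remain.
    int after(bool a, bool b) { return (a & b) ? 1 : 0; }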
diff --git a/lib/Transforms/Utils/GlobalStatus.cpp b/lib/Transforms/Utils/GlobalStatus.cpp index e9ebc45..12057e4 100644 --- a/lib/Transforms/Utils/GlobalStatus.cpp +++ b/lib/Transforms/Utils/GlobalStatus.cpp @@ -61,7 +61,7 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, } else if (const Instruction *I = dyn_cast<Instruction>(UR)) { if (!GS.HasMultipleAccessingFunctions) { const Function *F = I->getParent()->getParent(); - if (GS.AccessingFunction == 0) + if (!GS.AccessingFunction) GS.AccessingFunction = F; else if (GS.AccessingFunction != F) GS.HasMultipleAccessingFunctions = true; @@ -176,6 +176,6 @@ bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) { GlobalStatus::GlobalStatus() : IsCompared(false), IsLoaded(false), StoredType(NotStored), - StoredOnceValue(0), AccessingFunction(0), + StoredOnceValue(nullptr), AccessingFunction(nullptr), HasMultipleAccessingFunctions(false), HasNonInstructionUser(false), Ordering(NotAtomic) {} diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 86def3e..e01d0c3 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -51,8 +52,8 @@ namespace { public: InvokeInliningInfo(InvokeInst *II) - : OuterResumeDest(II->getUnwindDest()), InnerResumeDest(0), - CallerLPad(0), InnerEHValuesPHI(0) { + : OuterResumeDest(II->getUnwindDest()), InnerResumeDest(nullptr), + CallerLPad(nullptr), InnerEHValuesPHI(nullptr) { // If there are PHI nodes in the unwind destination block, we need to keep // track of which values came into them from the invoke before removing // the edge from this block. @@ -289,13 +290,13 @@ static void UpdateCallGraphAfterInlining(CallSite CS, ValueToValueMapTy::iterator VMI = VMap.find(OrigCall); // Only copy the edge if the call was inlined! - if (VMI == VMap.end() || VMI->second == 0) + if (VMI == VMap.end() || VMI->second == nullptr) continue; // If the call was inlined, but then constant folded, there is no edge to // add. Check for this case. Instruction *NewCall = dyn_cast<Instruction>(VMI->second); - if (NewCall == 0) continue; + if (!NewCall) continue; // Remember that this call site got inlined for the client of // InlineFunction. @@ -306,7 +307,7 @@ static void UpdateCallGraphAfterInlining(CallSite CS, // happens, set the callee of the new call site to a more precise // destination. This can also happen if the call graph node of the caller // was just unnecessarily imprecise. - if (I->second->getFunction() == 0) + if (!I->second->getFunction()) if (Function *F = CallSite(NewCall).getCalledFunction()) { // Indirect call site resolved to direct call.
CallerNode->addCalledFunction(CallSite(NewCall), CG[F]); @@ -322,13 +323,44 @@ static void UpdateCallGraphAfterInlining(CallSite CS, CallerNode->removeCallEdgeFor(CS); } +static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M, + BasicBlock *InsertBlock, + InlineFunctionInfo &IFI) { + LLVMContext &Context = Src->getContext(); + Type *VoidPtrTy = Type::getInt8PtrTy(Context); + Type *AggTy = cast<PointerType>(Src->getType())->getElementType(); + Type *Tys[3] = { VoidPtrTy, VoidPtrTy, Type::getInt64Ty(Context) }; + Function *MemCpyFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy, Tys); + IRBuilder<> builder(InsertBlock->begin()); + Value *DstCast = builder.CreateBitCast(Dst, VoidPtrTy, "tmp"); + Value *SrcCast = builder.CreateBitCast(Src, VoidPtrTy, "tmp"); + + Value *Size; + if (IFI.DL == nullptr) + Size = ConstantExpr::getSizeOf(AggTy); + else + Size = ConstantInt::get(Type::getInt64Ty(Context), + IFI.DL->getTypeStoreSize(AggTy)); + + // Always generate a memcpy of alignment 1 here because we don't know + // the alignment of the src pointer. Other optimizations can infer + // better alignment. + Value *CallArgs[] = { + DstCast, SrcCast, Size, + ConstantInt::get(Type::getInt32Ty(Context), 1), + ConstantInt::getFalse(Context) // isVolatile + }; + builder.CreateCall(MemCpyFn, CallArgs); +} + /// HandleByValArgument - When inlining a call site that has a byval argument, /// we have to make the implicit memcpy explicit by adding it. static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, const Function *CalledFunc, InlineFunctionInfo &IFI, unsigned ByValAlignment) { - Type *AggTy = cast<PointerType>(Arg->getType())->getElementType(); + PointerType *ArgTy = cast<PointerType>(Arg->getType()); + Type *AggTy = ArgTy->getElementType(); // If the called function is readonly, then it could not mutate the caller's // copy of the byval'd memory. In this case, it is safe to elide the copy and @@ -349,11 +381,7 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, // Otherwise, we have to make a memcpy to get a safe alignment. This is bad // for code quality, but rarely happens and is required for correctness. } - - LLVMContext &Context = Arg->getContext(); - Type *VoidPtrTy = Type::getInt8PtrTy(Context); - // Create the alloca. If we have DataLayout, use nice alignment. unsigned Align = 1; if (IFI.DL) @@ -366,32 +394,9 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, Function *Caller = TheCall->getParent()->getParent(); - Value *NewAlloca = new AllocaInst(AggTy, 0, Align, Arg->getName(), + Value *NewAlloca = new AllocaInst(AggTy, nullptr, Align, Arg->getName(), &*Caller->begin()->begin()); - // Emit a memcpy. - Type *Tys[3] = {VoidPtrTy, VoidPtrTy, Type::getInt64Ty(Context)}; - Function *MemCpyFn = Intrinsic::getDeclaration(Caller->getParent(), - Intrinsic::memcpy, - Tys); - Value *DestCast = new BitCastInst(NewAlloca, VoidPtrTy, "tmp", TheCall); - Value *SrcCast = new BitCastInst(Arg, VoidPtrTy, "tmp", TheCall); - - Value *Size; - if (IFI.DL == 0) - Size = ConstantExpr::getSizeOf(AggTy); - else - Size = ConstantInt::get(Type::getInt64Ty(Context), - IFI.DL->getTypeStoreSize(AggTy)); - - // Always generate a memcpy of alignment 1 here because we don't know - // the alignment of the src pointer. Other optimizations can infer - // better alignment.
- Value *CallArgs[] = { - DestCast, SrcCast, Size, - ConstantInt::get(Type::getInt32Ty(Context), 1), - ConstantInt::getFalse(Context) // isVolatile - }; - IRBuilder<>(TheCall).CreateCall(MemCpyFn, CallArgs); + IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca)); // Uses of the argument in the function should use our new alloca // instead. @@ -417,8 +422,10 @@ static bool isUsedByLifetimeMarker(Value *V) { // hasLifetimeMarkers - Check whether the given alloca already has // lifetime.start or lifetime.end intrinsics. static bool hasLifetimeMarkers(AllocaInst *AI) { - Type *Int8PtrTy = Type::getInt8PtrTy(AI->getType()->getContext()); - if (AI->getType() == Int8PtrTy) + Type *Ty = AI->getType(); + Type *Int8PtrTy = Type::getInt8PtrTy(Ty->getContext(), + Ty->getPointerAddressSpace()); + if (Ty == Int8PtrTy) return isUsedByLifetimeMarker(AI); // Do a scan to find all the casts to i8*. @@ -472,6 +479,33 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, } } +/// Returns a musttail call instruction if one immediately precedes the given +/// return instruction with an optional bitcast instruction between them. +static CallInst *getPrecedingMustTailCall(ReturnInst *RI) { + Instruction *Prev = RI->getPrevNode(); + if (!Prev) + return nullptr; + + if (Value *RV = RI->getReturnValue()) { + if (RV != Prev) + return nullptr; + + // Look through the optional bitcast. + if (auto *BI = dyn_cast<BitCastInst>(Prev)) { + RV = BI->getOperand(0); + Prev = BI->getPrevNode(); + if (!Prev || RV != Prev) + return nullptr; + } + } + + if (auto *CI = dyn_cast<CallInst>(Prev)) { + if (CI->isMustTailCall()) + return CI; + } + return nullptr; +} + /// InlineFunction - This function inlines the called function into the basic /// block of the caller. This returns false if it is not possible to inline /// this call. The program is still in a well defined state if this occurs @@ -491,15 +525,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, IFI.reset(); const Function *CalledFunc = CS.getCalledFunction(); - if (CalledFunc == 0 || // Can't inline external function or indirect + if (!CalledFunc || // Can't inline external function or indirect CalledFunc->isDeclaration() || // call, or call to a vararg function! CalledFunc->getFunctionType()->isVarArg()) return false; - // If the call to the callee is not a tail call, we must clear the 'tail' - // flags on any calls that we inline. - bool MustClearTailCallFlags = - !(isa<CallInst>(TheCall) && cast<CallInst>(TheCall)->isTailCall()); - // If the call to the callee cannot throw, set the 'nounwind' flag on any // calls that we inline. bool MarkNoUnwind = CS.doesNotThrow(); @@ -519,7 +548,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } // Get the personality function from the callee if it contains a landing pad. - Value *CalleePersonality = 0; + Value *CalleePersonality = nullptr; for (Function::const_iterator I = CalledFunc->begin(), E = CalledFunc->end(); I != E; ++I) if (const InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) { @@ -562,6 +591,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, { // Scope to destroy VMap after cloning. ValueToValueMapTy VMap; + // Keep a list of pair (dst, src) to emit byval initializations.
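The (dst, src) pairs this comment refers to (the container is declared just below) are later materialized by HandleByValArgumentInit above; stripped of the inliner plumbing, that emission reduces to roughly the following sketch, using IRBuilder's CreateMemCpy wrapper instead of spelling out the intrinsic (names illustrative):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Copy Size bytes of the caller's byval aggregate into the new
    // alloca, before the inlined body runs.
    static void emitByValCopy(Value *Dst, Value *Src, uint64_t Size,
                              Instruction *InsertPt) {
      IRBuilder<> B(InsertPt);
      // Alignment 1 because the source alignment is unknown here; later
      // passes may infer a better one.
      B.CreateMemCpy(Dst, Src, Size, /*Align=*/1, /*isVolatile=*/false);
    }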
+ SmallVector<std::pair<Value*, Value*>, 4> ByValInit; assert(CalledFunc->arg_size() == CS.arg_size() && "No varargs calls can be inlined!"); @@ -581,11 +612,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, if (CS.isByValArgument(ArgNo)) { ActualArg = HandleByValArgument(ActualArg, TheCall, CalledFunc, IFI, CalledFunc->getParamAlignment(ArgNo+1)); - - // Calls that we inline may use the new alloca, so we need to clear - // their 'tail' flags if HandleByValArgument introduced a new alloca and - // the callee has calls. - MustClearTailCallFlags |= ActualArg != *AI; + if (ActualArg != *AI) + ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI)); } VMap[I] = ActualArg; @@ -602,6 +630,11 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Remember the first block that is newly cloned over. FirstNewBlock = LastBlock; ++FirstNewBlock; + // Inject byval arguments initialization. + for (std::pair<Value*, Value*> &Init : ByValInit) + HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(), + FirstNewBlock, IFI); + // Update the callgraph if requested. if (IFI.CG) UpdateCallGraphAfterInlining(CS, FirstNewBlock, VMap, IFI); @@ -619,7 +652,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, for (BasicBlock::iterator I = FirstNewBlock->begin(), E = FirstNewBlock->end(); I != E; ) { AllocaInst *AI = dyn_cast<AllocaInst>(I++); - if (AI == 0) continue; + if (!AI) continue; // If the alloca is now dead, remove it. This often occurs due to code // specialization. @@ -651,6 +684,45 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } } + bool InlinedMustTailCalls = false; + if (InlinedFunctionInfo.ContainsCalls) { + CallInst::TailCallKind CallSiteTailKind = CallInst::TCK_None; + if (CallInst *CI = dyn_cast<CallInst>(TheCall)) + CallSiteTailKind = CI->getTailCallKind(); + + for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; + ++BB) { + for (Instruction &I : *BB) { + CallInst *CI = dyn_cast<CallInst>(&I); + if (!CI) + continue; + + // We need to reduce the strength of any inlined tail calls. For + // musttail, we have to avoid introducing potential unbounded stack + // growth. For example, if functions 'f' and 'g' are mutually recursive + // with musttail, we can inline 'g' into 'f' so long as we preserve + // musttail on the cloned call to 'f'. If either the inlined call site + // or the cloned call site is *not* musttail, the program already has + // one frame of stack growth, so it's safe to remove musttail. Here is + // a table of example transformations: + // + // f -> musttail g -> musttail f ==> f -> musttail f + // f -> musttail g -> tail f ==> f -> tail f + // f -> g -> musttail f ==> f -> f + // f -> g -> tail f ==> f -> f + CallInst::TailCallKind ChildTCK = CI->getTailCallKind(); + ChildTCK = std::min(CallSiteTailKind, ChildTCK); + CI->setTailCallKind(ChildTCK); + InlinedMustTailCalls |= CI->isMustTailCall(); + + // Calls inlined through a 'nounwind' call site should be marked + // 'nounwind'. + if (MarkNoUnwind) + CI->setDoesNotThrow(); + } + } + } + // Leave lifetime markers for the static alloca's, scoping them to the // function we just inlined. if (InsertLifetime && !IFI.StaticAllocas.empty()) { @@ -664,7 +736,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, continue; // Try to determine the size of the allocation.
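Before the allocation-size logic resumes below, one aside: the std::min in the loop above works because the TailCallKind enumerators are ordered TCK_None < TCK_Tail < TCK_MustTail, so the minimum is the weaker marking. As a standalone sketch (function name illustrative):

    #include "llvm/IR/Instructions.h"
    #include <algorithm>
    using namespace llvm;

    // Meet of two tail-call kinds: the cloned call keeps the weaker
    // marker, matching the transformation table above.
    static CallInst::TailCallKind
    meetTailCallKind(CallInst::TailCallKind CallSiteKind,
                     CallInst::TailCallKind ChildKind) {
      return std::min(CallSiteKind, ChildKind);
    }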
- ConstantInt *AllocaSize = 0; + ConstantInt *AllocaSize = nullptr; if (ConstantInt *AIArraySize = dyn_cast<ConstantInt>(AI->getArraySize())) { if (IFI.DL) { @@ -683,9 +755,12 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, } builder.CreateLifetimeStart(AI, AllocaSize); - for (unsigned ri = 0, re = Returns.size(); ri != re; ++ri) { - IRBuilder<> builder(Returns[ri]); - builder.CreateLifetimeEnd(AI, AllocaSize); + for (ReturnInst *RI : Returns) { + // Don't insert llvm.lifetime.end calls between a musttail call and a + // return. The return kills all local allocas. + if (InlinedMustTailCalls && getPrecedingMustTailCall(RI)) + continue; + IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize); } } } @@ -704,33 +779,56 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Insert a call to llvm.stackrestore before any return instructions in the // inlined function. - for (unsigned i = 0, e = Returns.size(); i != e; ++i) { - IRBuilder<>(Returns[i]).CreateCall(StackRestore, SavedPtr); + for (ReturnInst *RI : Returns) { + // Don't insert llvm.stackrestore calls between a musttail call and a + // return. The return will restore the stack pointer. + if (InlinedMustTailCalls && getPrecedingMustTailCall(RI)) + continue; + IRBuilder<>(RI).CreateCall(StackRestore, SavedPtr); } } - // If we are inlining tail call instruction through a call site that isn't - // marked 'tail', we must remove the tail marker for any calls in the inlined - // code. Also, calls inlined through a 'nounwind' call site should be marked - // 'nounwind'. - if (InlinedFunctionInfo.ContainsCalls && - (MustClearTailCallFlags || MarkNoUnwind)) { - for (Function::iterator BB = FirstNewBlock, E = Caller->end(); - BB != E; ++BB) - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (CallInst *CI = dyn_cast<CallInst>(I)) { - if (MustClearTailCallFlags) - CI->setTailCall(false); - if (MarkNoUnwind) - CI->setDoesNotThrow(); - } - } - // If we are inlining for an invoke instruction, we must make sure to rewrite // any call instructions into invoke instructions. if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) HandleInlinedInvoke(II, FirstNewBlock, InlinedFunctionInfo); + // Handle any inlined musttail call sites. In order for a new call site to be + // musttail, the source of the clone and the inlined call site must have been + // musttail. Therefore it's safe to return without merging control into the + // phi below. + if (InlinedMustTailCalls) { + // Check if we need to bitcast the result of any musttail calls. + Type *NewRetTy = Caller->getReturnType(); + bool NeedBitCast = !TheCall->use_empty() && TheCall->getType() != NewRetTy; + + // Handle the returns preceded by musttail calls separately. + SmallVector<ReturnInst *, 8> NormalReturns; + for (ReturnInst *RI : Returns) { + CallInst *ReturnedMustTail = getPrecedingMustTailCall(RI); + if (!ReturnedMustTail) { + NormalReturns.push_back(RI); + continue; + } + if (!NeedBitCast) + continue; + + // Delete the old return and any preceding bitcast. + BasicBlock *CurBB = RI->getParent(); + auto *OldCast = dyn_cast_or_null<BitCastInst>(RI->getReturnValue()); + RI->eraseFromParent(); + if (OldCast) + OldCast->eraseFromParent(); + + // Insert a new bitcast and return with the right type. + IRBuilder<> Builder(CurBB); + Builder.CreateRet(Builder.CreateBitCast(ReturnedMustTail, NewRetTy)); + } + + // Leave behind the normal returns so we can merge control flow.
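A comment-form sketch of the shapes involved (illustrative IR; the pointer types are arbitrary):

    // getPrecedingMustTailCall matches exactly:
    //   %v = musttail call i8* @g(...)
    //   ret i8* %v
    // or, with the single optional bitcast:
    //   %v = musttail call i8* @g(...)
    //   %c = bitcast i8* %v to i32*
    //   ret i32* %c
    //
    // When NeedBitCast is set, the loop above re-emits the pair against
    // the caller's return type:
    //   %v = musttail call i8* @g(...)
    //   %c = bitcast i8* %v to i32*     ; newly built
    //   ret i32* %c                     ; newly built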
+ std::swap(Returns, NormalReturns); + } + // If we cloned in _exactly one_ basic block, and if that block ends in a // return instruction, we splice the body of the inlined callee directly into // the calling basic block. @@ -774,7 +872,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // "starter" and "ender" blocks. How we accomplish this depends on whether // this is an invoke instruction or a call instruction. BasicBlock *AfterCallBB; - BranchInst *CreatedBranchToNormalDest = NULL; + BranchInst *CreatedBranchToNormalDest = nullptr; if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) { // Add an unconditional branch to make this look like the CallInst case... @@ -813,7 +911,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // any users of the original call/invoke instruction. Type *RTy = CalledFunc->getReturnType(); - PHINode *PHI = 0; + PHINode *PHI = nullptr; if (Returns.size() > 1) { // The PHI node should go at the front of the new basic block to merge all // possible incoming values. @@ -886,6 +984,11 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Since we are now done with the Call/Invoke, we can delete it. TheCall->eraseFromParent(); + // If we inlined any musttail calls and the original return is now + // unreachable, delete it. It can only contain a bitcast and ret. + if (InlinedMustTailCalls && pred_begin(AfterCallBB) == pred_end(AfterCallBB)) + AfterCallBB->eraseFromParent(); + // We should always be able to fold the entry block of the function into the // single predecessor of the block... assert(cast<BranchInst>(Br)->isUnconditional() && "splitBasicBlock broken!"); diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp index e73a543..9f91eeb 100644 --- a/lib/Transforms/Utils/IntegerDivision.cpp +++ b/lib/Transforms/Utils/IntegerDivision.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "integer-division" #include "llvm/Transforms/Utils/IntegerDivision.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -24,6 +23,8 @@ using namespace llvm; +#define DEBUG_TYPE "integer-division" + /// Generate code to compute the remainder of two signed integers. Returns the /// remainder, which will have the sign of the dividend. Builder's insert point /// should be pointing where the caller wants code generated, e.g. at the srem diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp index d538175..51a3d9c 100644 --- a/lib/Transforms/Utils/LCSSA.cpp +++ b/lib/Transforms/Utils/LCSSA.cpp @@ -27,7 +27,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "lcssa" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" @@ -44,6 +43,8 @@ #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; +#define DEBUG_TYPE "lcssa" + STATISTIC(NumLCSSA, "Number of live out of a loop variables"); /// Return true if the specified block is in the list.
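For readers new to this file, the invariant LCSSA maintains is easiest to see schematically (illustrative IR): every value defined in a loop and used outside it is funneled through a single-entry PHI in the exit block, so later loop passes only have to update that one node.

    // loop:                          loop:
    //   %x = add i32 %i, 1             %x = add i32 %i, 1
    //   br i1 %c, label %loop,  ==>    br i1 %c, label %loop, label %exit
    //        label %exit
    // exit:                          exit:
    //   use %x                         %x.lcssa = phi i32 [ %x, %loop ]
    //                                  use %x.lcssa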
@@ -267,8 +268,6 @@ struct LCSSA : public FunctionPass { } private: - bool processLoop(Loop &L); - void verifyAnalysis() const override; }; } diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 9d0be8b..aedd787 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -43,6 +43,8 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define DEBUG_TYPE "local" + STATISTIC(NumRemoved, "Number of unreachable basic blocks removed"); //===----------------------------------------------------------------------===// @@ -159,7 +161,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, // Otherwise, check to see if the switch only branches to one destination. // We do this by resetting "TheOnlyDest" to null when we find two non-equal // destinations. - if (i.getCaseSuccessor() != TheOnlyDest) TheOnlyDest = 0; + if (i.getCaseSuccessor() != TheOnlyDest) TheOnlyDest = nullptr; } if (CI && !TheOnlyDest) { @@ -180,7 +182,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, // Found case matching a constant operand? BasicBlock *Succ = SI->getSuccessor(i); if (Succ == TheOnlyDest) - TheOnlyDest = 0; // Don't modify the first branch to TheOnlyDest + TheOnlyDest = nullptr; // Don't modify the first branch to TheOnlyDest else Succ->removePredecessor(BB); } @@ -233,7 +235,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) { if (IBI->getDestination(i) == TheOnlyDest) - TheOnlyDest = 0; + TheOnlyDest = nullptr; else IBI->getDestination(i)->removePredecessor(IBI->getParent()); } @@ -331,7 +333,7 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V, // dead as we go. for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { Value *OpV = I->getOperand(i); - I->setOperand(i, 0); + I->setOperand(i, nullptr); if (!OpV->use_empty()) continue; @@ -894,24 +896,26 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align, return PrefAlign; } - if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) { + if (auto *GO = dyn_cast<GlobalObject>(V)) { // If there is a large requested alignment and we can, bump up the alignment // of the global. - if (GV->isDeclaration()) return Align; + if (GO->isDeclaration()) + return Align; // If the memory we set aside for the global may not be the memory used by // the final program then it is impossible for us to reliably enforce the // preferred alignment. - if (GV->isWeakForLinker()) return Align; + if (GO->isWeakForLinker()) + return Align; - if (GV->getAlignment() >= PrefAlign) - return GV->getAlignment(); + if (GO->getAlignment() >= PrefAlign) + return GO->getAlignment(); // We can only increase the alignment of the global if it has no alignment // specified or if it is not assigned a section. If it is assigned a // section, the global could be densely packed with other objects in the // section, increasing the alignment could cause padding issues. - if (!GV->hasSection() || GV->getAlignment() == 0) - GV->setAlignment(PrefAlign); - return GV->getAlignment(); + if (!GO->hasSection() || GO->getAlignment() == 0) + GO->setAlignment(PrefAlign); + return GO->getAlignment(); } return Align; @@ -928,7 +932,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, unsigned BitWidth = DL ?
DL->getPointerTypeSizeInBits(V->getType()) : 64; APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - ComputeMaskedBits(V, KnownZero, KnownOne, DL); + computeKnownBits(V, KnownZero, KnownOne, DL); unsigned TrailZ = KnownZero.countTrailingOnes(); // Avoid trouble with ridiculously large TrailZ values, such as @@ -981,10 +985,10 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, if (LdStHasDebugValue(DIVar, SI)) return true; - Instruction *DbgVal = NULL; + Instruction *DbgVal = nullptr; // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. - Argument *ExtendedArg = NULL; + Argument *ExtendedArg = nullptr; if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) ExtendedArg = dyn_cast<Argument>(ZExt->getOperand(0)); if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) @@ -993,14 +997,7 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, DbgVal = Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, SI); else DbgVal = Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, SI); - - // Propagate any debug metadata from the store onto the dbg.value. - DebugLoc SIDL = SI->getDebugLoc(); - if (!SIDL.isUnknown()) - DbgVal->setDebugLoc(SIDL); - // Otherwise propagate debug metadata from dbg.declare. - else - DbgVal->setDebugLoc(DDI->getDebugLoc()); + DbgVal->setDebugLoc(DDI->getDebugLoc()); return true; } @@ -1020,17 +1017,16 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, Instruction *DbgVal = Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0, DIVar, LI); - - // Propagate any debug metadata from the store onto the dbg.value. - DebugLoc LIDL = LI->getDebugLoc(); - if (!LIDL.isUnknown()) - DbgVal->setDebugLoc(LIDL); - // Otherwise propagate debug metadata from dbg.declare. - else - DbgVal->setDebugLoc(DDI->getDebugLoc()); + DbgVal->setDebugLoc(DDI->getDebugLoc()); return true; } +/// Determine whether this alloca is either a VLA or an array. +static bool isArray(AllocaInst *AI) { + return AI->isArrayAllocation() || + AI->getType()->getElementType()->isArrayTy(); +} + /// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into appropriate set /// of llvm.dbg.value intrinsics. bool llvm::LowerDbgDeclare(Function &F) { @@ -1049,20 +1045,26 @@ bool llvm::LowerDbgDeclare(Function &F) { AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress()); // If this is an alloca for a scalar variable, insert a dbg.value // at each load and store to the alloca and erase the dbg.declare. - if (AI && !AI->isArrayAllocation()) { - - // We only remove the dbg.declare intrinsic if all uses are - // converted to dbg.value intrinsics. - bool RemoveDDI = true; + // The dbg.values allow tracking a variable even if it is not + // stored on the stack, while the dbg.declare can only describe + // the stack slot (and at a lexical-scope granularity). Later + // passes will attempt to elide the stack slot. + if (AI && !isArray(AI)) { for (User *U : AI->users()) if (StoreInst *SI = dyn_cast<StoreInst>(U)) ConvertDebugDeclareToDebugValue(DDI, SI, DIB); else if (LoadInst *LI = dyn_cast<LoadInst>(U)) ConvertDebugDeclareToDebugValue(DDI, LI, DIB); - else - RemoveDDI = false; - if (RemoveDDI) - DDI->eraseFromParent(); + else if (CallInst *CI = dyn_cast<CallInst>(U)) { + // This is a call by-value or some other instruction that + // takes a pointer to the variable. Insert a *value* + // intrinsic that describes the alloca.
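Continuing the comment with a concrete picture (illustrative, in this release's old-style debug metadata):

    // e.g. the declare for the variable's slot
    //   call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !10)
    // becomes, at this call site, a value intrinsic carrying the address:
    //   call void @llvm.dbg.value(metadata !{i32* %x.addr}, i64 0, metadata !10)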
+ auto DbgVal = + DIB.insertDbgValueIntrinsic(AI, 0, + DIVariable(DDI->getVariable()), CI); + DbgVal->setDebugLoc(DDI->getDebugLoc()); + } + DDI->eraseFromParent(); } } return true; @@ -1076,7 +1078,7 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) return DDI; - return 0; + return nullptr; } bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index 47083ea..f7787da 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -37,7 +37,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-simplify" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetOperations.h" @@ -63,6 +62,8 @@ #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; +#define DEBUG_TYPE "loop-simplify" + STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted"); STATISTIC(NumNested , "Number of nested loops split out"); @@ -85,7 +86,7 @@ static void placeSplitBlockCarefully(BasicBlock *NewBB, // Figure out *which* outside block to put this after. Prefer an outside // block that neighbors a BB actually in the loop. - BasicBlock *FoundBB = 0; + BasicBlock *FoundBB = nullptr; for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { Function::iterator BBI = SplitPreds[i]; if (++BBI != NewBB->getParent()->end() && @@ -119,7 +120,7 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { // If the loop is branched to from an indirect branch, we won't // be able to fully transform the loop, because it prohibits // edge splitting. - if (isa<IndirectBrInst>(P->getTerminator())) return 0; + if (isa<IndirectBrInst>(P->getTerminator())) return nullptr; // Keep track of it. OutsideBlocks.push_back(P); @@ -160,14 +161,14 @@ static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit, Pass *PP) { BasicBlock *P = *I; if (L->contains(P)) { // Don't do this if the loop is exited via an indirect branch. - if (isa<IndirectBrInst>(P->getTerminator())) return 0; + if (isa<IndirectBrInst>(P->getTerminator())) return nullptr; LoopBlocks.push_back(P); } } assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?"); - BasicBlock *NewExitBB = 0; + BasicBlock *NewExitBB = nullptr; if (Exit->isLandingPad()) { SmallVector<BasicBlock*, 2> NewBBs; @@ -211,7 +212,7 @@ static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) { PHINode *PN = cast<PHINode>(I); ++I; - if (Value *V = SimplifyInstruction(PN, 0, 0, DT)) { + if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT)) { // This is a degenerate PHI already, don't modify it! PN->replaceAllUsesWith(V); if (AA) AA->deleteValue(PN); @@ -226,7 +227,7 @@ static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA, // We found something tasty to remove. return PN; } - return 0; + return nullptr; } /// \brief If this loop has multiple backedges, try to pull one of them out into @@ -253,14 +254,14 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, LoopInfo *LI, ScalarEvolution *SE, Pass *PP) { // Don't try to separate loops without a preheader. if (!Preheader) - return 0; + return nullptr; // The header is not a landing pad; preheader insertion should ensure this.
assert(!L->getHeader()->isLandingPad() && "Can't insert backedge to landing pad"); PHINode *PN = findPHIToPartitionLoops(L, AA, DT); - if (PN == 0) return 0; // No known way to partition. + if (!PN) return nullptr; // No known way to partition. // Pull out all predecessors that have varying values in the loop. This // handles the case when a PHI node has multiple instances of itself as @@ -271,7 +272,7 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader, !L->contains(PN->getIncomingBlock(i))) { // We can't split indirectbr edges. if (isa<IndirectBrInst>(PN->getIncomingBlock(i)->getTerminator())) - return 0; + return nullptr; OuterLoopPreds.push_back(PN->getIncomingBlock(i)); } } @@ -362,7 +363,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, // Unique backedge insertion currently depends on having a preheader. if (!Preheader) - return 0; + return nullptr; // The header is not a landing pad; preheader insertion should ensure this. assert(!Header->isLandingPad() && "Can't insert backedge to landing pad"); @@ -374,7 +375,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, // Indirectbr edges cannot be split, so we must fail if we find one. if (isa<IndirectBrInst>(P->getTerminator())) - return 0; + return nullptr; if (P != Preheader) BackedgeBlocks.push_back(P); } @@ -403,7 +404,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, // preheader over to the new PHI node. unsigned PreheaderIdx = ~0U; bool HasUniqueIncomingValue = true; - Value *UniqueValue = 0; + Value *UniqueValue = nullptr; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *IBB = PN->getIncomingBlock(i); Value *IV = PN->getIncomingValue(i); @@ -412,7 +413,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, } else { NewPN->addIncoming(IV, IBB); if (HasUniqueIncomingValue) { - if (UniqueValue == 0) + if (!UniqueValue) UniqueValue = IV; else if (UniqueValue != IV) HasUniqueIncomingValue = false; @@ -609,7 +610,7 @@ ReprocessLoop: PHINode *PN; for (BasicBlock::iterator I = L->getHeader()->begin(); (PN = dyn_cast<PHINode>(I++)); ) - if (Value *V = SimplifyInstruction(PN, 0, 0, DT)) { + if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT)) { if (AA) AA->deleteValue(PN); if (SE) SE->forgetValue(PN); PN->replaceAllUsesWith(V); @@ -653,7 +654,8 @@ ReprocessLoop: if (Inst == CI) continue; if (!L->makeLoopInvariant(Inst, AnyInvariant, - Preheader ? Preheader->getTerminator() : 0)) { + Preheader ? Preheader->getTerminator() + : nullptr)) { AllInvariant = false; break; } @@ -761,12 +763,6 @@ namespace { /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees. void verifyAnalysis() const override; - - private: - bool ProcessLoop(Loop *L); - BasicBlock *RewriteLoopExitBlock(Loop *L, BasicBlock *Exit); - Loop *SeparateNestedLoop(Loop *L, BasicBlock *Preheader); - BasicBlock *InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader); }; } @@ -782,7 +778,7 @@ INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", char &llvm::LoopSimplifyID = LoopSimplify::ID; Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } -/// runOnLoop - Run down all loops in the CFG (recursively, but we could do +/// runOnFunction - Run down all loops in the CFG (recursively, but we could do /// it in any convenient order) inserting preheaders...
/// bool LoopSimplify::runOnFunction(Function &F) { diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index d2dfc20..d953e30 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -16,7 +16,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-unroll" #include "llvm/Transforms/Utils/UnrollLoop.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -25,6 +24,8 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -34,6 +35,8 @@ #include "llvm/Transforms/Utils/SimplifyIndVar.h" using namespace llvm; +#define DEBUG_TYPE "loop-unroll" + // TODO: Should these be here or in LoopUnroll? STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled"); STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)"); @@ -68,10 +71,10 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, // pred, and if there is only one distinct successor of the predecessor, and // if there are no PHI nodes. BasicBlock *OnlyPred = BB->getSinglePredecessor(); - if (!OnlyPred) return 0; + if (!OnlyPred) return nullptr; if (OnlyPred->getTerminator()->getNumSuccessors() != 1) - return 0; + return nullptr; DEBUG(dbgs() << "Merging: " << *BB << "into: " << *OnlyPred); @@ -227,20 +230,33 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, (unsigned)GreatestCommonDivisor64(Count, TripMultiple); } + // Report the unrolling decision. + DebugLoc LoopLoc = L->getStartLoc(); + Function *F = Header->getParent(); + LLVMContext &Ctx = F->getContext(); + if (CompletelyUnroll) { DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName() << " with trip count " << TripCount << "!\n"); + emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc, + Twine("completely unrolled loop with ") + + Twine(TripCount) + " iterations"); } else { DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by " << Count); + Twine DiagMsg("unrolled loop by a factor of " + Twine(Count)); if (TripMultiple == 0 || BreakoutTrip != TripMultiple) { DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip); + DiagMsg.concat(" with a breakout at trip " + Twine(BreakoutTrip)); } else if (TripMultiple != 1) { DEBUG(dbgs() << " with " << TripMultiple << " trips per branch"); + DiagMsg.concat(" with " + Twine(TripMultiple) + " trips per branch"); } else if (RuntimeTripCount) { DEBUG(dbgs() << " with run-time trip count"); + DiagMsg.concat(" with run-time trip count"); } DEBUG(dbgs() << "!\n"); + emitOptimizationRemark(Ctx, DEBUG_TYPE, *F, LoopLoc, DiagMsg); } bool ContinueOnTrue = L->contains(BI->getSuccessor(0)); @@ -411,7 +427,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, } } - DominatorTree *DT = 0; + DominatorTree *DT = nullptr; if (PP) { // FIXME: Reconstruct dom info, because it is not preserved properly. // Incrementally updating domtree after loop unrolling would be easy. @@ -458,7 +474,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, Loop *OuterL = L->getParentLoop(); // Remove the loop from the LoopPassManager if it's completely removed. 
- if (CompletelyUnroll && LPM != NULL) + if (CompletelyUnroll && LPM != nullptr) LPM->deleteLoopFromQueue(L); // If we have a pass and a DominatorTree we should re-simplify impacted loops @@ -470,7 +486,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, OuterL = L; if (OuterL) { ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>(); - simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ 0, SE); + simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE); formLCSSARecursively(*OuterL, *DT, SE); } } diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index d801d5f..5bef091 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -21,7 +21,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "loop-unroll" #include "llvm/Transforms/Utils/UnrollLoop.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopIterator.h" @@ -37,6 +36,8 @@ using namespace llvm; +#define DEBUG_TYPE "loop-unroll" + STATISTIC(NumRuntimeUnrolled, "Number of loops unrolled with run-time trip counts"); @@ -58,7 +59,7 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, BasicBlock *OrigPH, BasicBlock *NewPH, ValueToValueMapTy &LVMap, Pass *P) { BasicBlock *Latch = L->getLoopLatch(); - assert(Latch != 0 && "Loop must have a latch"); + assert(Latch && "Loop must have a latch"); // Create a PHI node for each outgoing value from the original loop // (which means it is an outgoing value from the prolog code too). @@ -110,7 +111,7 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, TripCount, ConstantInt::get(TripCount->getType(), Count)); BasicBlock *Exit = L->getUniqueExitBlock(); - assert(Exit != 0 && "Loop must have a single exit block only"); + assert(Exit && "Loop must have a single exit block only"); // Split the exit to maintain loop canonicalization guarantees SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit)); if (!Exit->isLandingPad()) { @@ -232,7 +233,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, // Make sure the loop is in canonical form, and there is a single // exit block only. - if (!L->isLoopSimplifyForm() || L->getUniqueExitBlock() == 0) + if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock()) return false; // Use Scalar Evolution to compute the trip count.
This allows more @@ -240,7 +241,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, if (!LPM) return false; ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); - if (SE == 0) + if (!SE) return false; // Only unroll loops with a computable trip count and the trip count needs @@ -301,7 +302,7 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, ValueToValueMapTy LVMap; Function *F = Header->getParent(); // These variables are used to update the CFG links in each iteration - BasicBlock *CompareBB = 0; + BasicBlock *CompareBB = nullptr; BasicBlock *LastLoopBB = PH; // Get an ordered list of blocks in the loop to help with the ordering of the // cloned blocks in the prolog code diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp index 3e61289..ff89e74 100644 --- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "lower-expect-intrinsic" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/BasicBlock.h" @@ -29,6 +28,8 @@ using namespace llvm; +#define DEBUG_TYPE "lower-expect-intrinsic" + STATISTIC(IfHandled, "Number of 'expect' intrinsic instructions handled"); static cl::opt<uint32_t> diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp index b1f758e..66d57b0 100644 --- a/lib/Transforms/Utils/LowerInvoke.cpp +++ b/lib/Transforms/Utils/LowerInvoke.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "lowerinvoke" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -25,6 +24,8 @@ #include "llvm/Support/CommandLine.h" using namespace llvm; +#define DEBUG_TYPE "lowerinvoke" + STATISTIC(NumInvokes, "Number of invokes replaced"); namespace { diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index 6fb7410..9ef694c 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -27,6 +27,8 @@ #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "lower-switch" + namespace { /// LowerSwitch Pass - Replace all SwitchInst instructions with chained branch /// instructions. @@ -51,7 +53,8 @@ namespace { Constant* High; BasicBlock* BB; - CaseRange(Constant *low = 0, Constant *high = 0, BasicBlock *bb = 0) : + CaseRange(Constant *low = nullptr, Constant *high = nullptr, + BasicBlock *bb = nullptr) : Low(low), High(high), BB(bb) { } }; @@ -182,7 +185,7 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, F->getBasicBlockList().insert(++FI, NewLeaf); // Emit comparison - ICmpInst* Comp = NULL; + ICmpInst* Comp = nullptr; if (Leaf.Low == Leaf.High) { // Make the seteq instruction...
Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_EQ, Val, diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp index a188ac5..189caa7 100644 --- a/lib/Transforms/Utils/Mem2Reg.cpp +++ b/lib/Transforms/Utils/Mem2Reg.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mem2reg" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Dominators.h" @@ -22,6 +21,8 @@ #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" using namespace llvm; +#define DEBUG_TYPE "mem2reg" + STATISTIC(NumPromoted, "Number of alloca's promoted"); namespace { diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp index ff6e6f9..d9dbbca 100644 --- a/lib/Transforms/Utils/ModuleUtils.cpp +++ b/lib/Transforms/Utils/ModuleUtils.cpp @@ -24,16 +24,16 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F, int Priority) { IRBuilder<> IRB(M.getContext()); FunctionType *FnTy = FunctionType::get(IRB.getVoidTy(), false); - StructType *Ty = StructType::get( - IRB.getInt32Ty(), PointerType::getUnqual(FnTy), NULL); - - Constant *RuntimeCtorInit = ConstantStruct::get( - Ty, IRB.getInt32(Priority), F, NULL); // Get the current set of static global constructors and add the new ctor // to the list. SmallVector<Constant *, 16> CurrentCtors; - if (GlobalVariable * GVCtor = M.getNamedGlobal(Array)) { + StructType *EltTy; + if (GlobalVariable *GVCtor = M.getNamedGlobal(Array)) { + // If there is a global_ctors array, use the existing struct type, which can + // have 2 or 3 fields. + ArrayType *ATy = cast<ArrayType>(GVCtor->getType()->getElementType()); + EltTy = cast<StructType>(ATy->getElementType()); if (Constant *Init = GVCtor->getInitializer()) { unsigned n = Init->getNumOperands(); CurrentCtors.reserve(n + 1); @@ -41,13 +41,26 @@ static void appendToGlobalArray(const char *Array, CurrentCtors.push_back(cast<Constant>(Init->getOperand(i))); } GVCtor->eraseFromParent(); + } else { + // Use a simple two-field struct if there isn't one already. + EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy), + nullptr); } + // Build a 2 or 3 field global_ctor entry. We don't take a comdat key. + Constant *CSVals[3]; + CSVals[0] = IRB.getInt32(Priority); + CSVals[1] = F; + // FIXME: Drop support for the two element form in LLVM 4.0. + if (EltTy->getNumElements() >= 3) + CSVals[2] = llvm::Constant::getNullValue(IRB.getInt8PtrTy()); + Constant *RuntimeCtorInit = + ConstantStruct::get(EltTy, makeArrayRef(CSVals, EltTy->getNumElements())); + CurrentCtors.push_back(RuntimeCtorInit); // Create a new initializer.
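For reference, the two entry layouts this code must coexist with look as follows in IR (a sketch; @f and the priority value are illustrative):

    // { i32, void ()* }        ; legacy two-field entry
    // { i32, void ()*, i8* }   ; three-field entry, i8* is the comdat key
    //
    // @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }]
    //   [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* null }]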
- ArrayType *AT = ArrayType::get(RuntimeCtorInit->getType(), - CurrentCtors.size()); + ArrayType *AT = ArrayType::get(EltTy, CurrentCtors.size()); Constant *NewInit = ConstantArray::get(AT, CurrentCtors); // Create the new global variable and replace all uses of diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 25fab89..06d73fe 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -25,7 +25,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "mem2reg" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -51,6 +50,8 @@ #include <algorithm> using namespace llvm; +#define DEBUG_TYPE "mem2reg" + STATISTIC(NumLocalPromoted, "Number of alloca's promoted within one block"); STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store"); STATISTIC(NumDeadAlloca, "Number of dead alloca's removed"); @@ -59,6 +60,7 @@ STATISTIC(NumPHIInsert, "Number of PHI nodes inserted"); bool llvm::isAllocaPromotable(const AllocaInst *AI) { // FIXME: If the memory unit is of pointer or integer type, we can permit // assignments to subsections of the memory unit. + unsigned AS = AI->getType()->getAddressSpace(); // Only allow direct and non-volatile loads and stores... for (const User *U : AI->users()) { @@ -79,12 +81,12 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { II->getIntrinsicID() != Intrinsic::lifetime_end) return false; } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { - if (BCI->getType() != Type::getInt8PtrTy(U->getContext())) + if (BCI->getType() != Type::getInt8PtrTy(U->getContext(), AS)) return false; if (!onlyUsedByLifetimeMarkers(BCI)) return false; } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) { - if (GEPI->getType() != Type::getInt8PtrTy(U->getContext())) + if (GEPI->getType() != Type::getInt8PtrTy(U->getContext(), AS)) return false; if (!GEPI->hasAllZeroIndices()) return false; @@ -114,11 +116,11 @@ struct AllocaInfo { void clear() { DefiningBlocks.clear(); UsingBlocks.clear(); - OnlyStore = 0; - OnlyBlock = 0; + OnlyStore = nullptr; + OnlyBlock = nullptr; OnlyUsedInOneBlock = true; - AllocaPointerVal = 0; - DbgDeclare = 0; + AllocaPointerVal = nullptr; + DbgDeclare = nullptr; } /// Scan the uses of the specified alloca, filling in the AllocaInfo used @@ -146,7 +148,7 @@ struct AllocaInfo { } if (OnlyUsedInOneBlock) { - if (OnlyBlock == 0) + if (!OnlyBlock) OnlyBlock = User->getParent(); else if (OnlyBlock != User->getParent()) OnlyUsedInOneBlock = false; @@ -162,7 +164,7 @@ class RenamePassData { public: typedef std::vector<Value *> ValVector; - RenamePassData() : BB(NULL), Pred(NULL), Values() {} + RenamePassData() : BB(nullptr), Pred(nullptr), Values() {} RenamePassData(BasicBlock *B, BasicBlock *P, const ValVector &V) : BB(B), Pred(P), Values(V) {} BasicBlock *BB; @@ -471,7 +473,8 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, // Find the nearest store that has a lower index than this load.
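A worked example of the search below (values illustrative, continuing the comment above):

    // With stores recorded at instruction indices {2, 9, 17} and a load
    // at index 11, std::lower_bound over the (index, store) pairs lands
    // on 17; the entry before it, 9, is the reaching store. Landing on
    // begin() instead means no store precedes the load, i.e. the load
    // reads the uninitialized alloca.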
StoresByIndexTy::iterator I = std::lower_bound(StoresByIndex.begin(), StoresByIndex.end(), - std::make_pair(LoadIdx, static_cast<StoreInst *>(0)), + std::make_pair(LoadIdx, + static_cast<StoreInst *>(nullptr)), less_first()); if (I == StoresByIndex.begin()) @@ -632,7 +635,7 @@ void PromoteMem2Reg::run() { // and inserting the phi nodes we marked as necessary // std::vector<RenamePassData> RenamePassWorkList; - RenamePassWorkList.push_back(RenamePassData(F.begin(), 0, Values)); + RenamePassWorkList.push_back(RenamePassData(F.begin(), nullptr, Values)); do { RenamePassData RPD; RPD.swap(RenamePassWorkList.back()); @@ -682,7 +685,7 @@ void PromoteMem2Reg::run() { PHINode *PN = I->second; // If this PHI node merges one value and/or undefs, get the value. - if (Value *V = SimplifyInstruction(PN, 0, 0, &DT)) { + if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, &DT)) { if (AST && PN->getType()->isPointerTy()) AST->deleteValue(PN); PN->replaceAllUsesWith(V); @@ -990,7 +993,7 @@ NextIteration: // Get the next phi node. ++PNI; APN = dyn_cast<PHINode>(PNI); - if (APN == 0) + if (!APN) break; // Verify that it is missing entries. If not, it is not being inserted diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index 28f5c44..3fcb789 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "ssaupdater" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/TinyPtrVector.h" @@ -28,20 +27,22 @@ using namespace llvm; +#define DEBUG_TYPE "ssaupdater" + typedef DenseMap<BasicBlock*, Value*> AvailableValsTy; static AvailableValsTy &getAvailableVals(void *AV) { return *static_cast<AvailableValsTy*>(AV); } SSAUpdater::SSAUpdater(SmallVectorImpl<PHINode*> *NewPHI) - : AV(0), ProtoType(0), ProtoName(), InsertedPHIs(NewPHI) {} + : AV(nullptr), ProtoType(nullptr), ProtoName(), InsertedPHIs(NewPHI) {} SSAUpdater::~SSAUpdater() { delete static_cast<AvailableValsTy*>(AV); } void SSAUpdater::Initialize(Type *Ty, StringRef Name) { - if (AV == 0) + if (!AV) AV = new AvailableValsTy(); else getAvailableVals(AV).clear(); @@ -54,7 +55,7 @@ bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const { } void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) { - assert(ProtoType != 0 && "Need to initialize SSAUpdater"); + assert(ProtoType && "Need to initialize SSAUpdater"); assert(ProtoType == V->getType() && "All rewritten values must have the same type"); getAvailableVals(AV)[BB] = V; @@ -90,7 +91,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // Otherwise, we have the hard case. Get the live-in values for each // predecessor. SmallVector<std::pair<BasicBlock*, Value*>, 8> PredValues; - Value *SingularValue = 0; + Value *SingularValue = nullptr; // We can get our predecessor info by walking the pred_iterator list, but it // is relatively slow.
If we already have PHI nodes in this block, walk one @@ -105,7 +106,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { if (i == 0) SingularValue = PredVal; else if (PredVal != SingularValue) - SingularValue = 0; + SingularValue = nullptr; } } else { bool isFirstPred = true; @@ -119,7 +120,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { SingularValue = PredVal; isFirstPred = false; } else if (PredVal != SingularValue) - SingularValue = 0; + SingularValue = nullptr; } } @@ -128,7 +129,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { return UndefValue::get(ProtoType); // Otherwise, if all the merged values are the same, just use it. - if (SingularValue != 0) + if (SingularValue) return SingularValue; // Otherwise, we do need a PHI: check to see if we already have one available @@ -291,7 +292,7 @@ public: PHINode *PHI = ValueIsPHI(Val, Updater); if (PHI && PHI->getNumIncomingValues() == 0) return PHI; - return 0; + return nullptr; } /// GetPHIValue - For the specified PHI instruction, return the value @@ -401,7 +402,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const { // the order of these instructions in the block. If the first use in the // block is a load, then it uses the live in value. The last store defines // the live out value. We handle this by doing a linear scan of the block. - Value *StoredValue = 0; + Value *StoredValue = nullptr; for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) { if (LoadInst *L = dyn_cast<LoadInst>(II)) { // If this is a load from an unrelated pointer, ignore it. diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 1e88587..150dbdd 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "simplifycfg" #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" @@ -50,6 +49,8 @@ using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "simplifycfg" + static cl::opt<unsigned> PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(1), cl::desc("Control the amount of phi node folding to perform (default = 1)")); @@ -212,6 +213,7 @@ static unsigned ComputeSpeculationCost(const User *I) { if (!cast<GetElementPtrInst>(I)->hasAllConstantIndices()) return UINT_MAX; return 1; + case Instruction::ExtractValue: case Instruction::Load: case Instruction::Add: case Instruction::Sub: @@ -272,12 +274,12 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // branch to BB, then it must be in the 'conditional' part of the "if // statement". If not, it definitely dominates the region. BranchInst *BI = dyn_cast<BranchInst>(PBB->getTerminator()); - if (BI == 0 || BI->isConditional() || BI->getSuccessor(0) != BB) + if (!BI || BI->isConditional() || BI->getSuccessor(0) != BB) return true; // If we aren't allowing aggressive promotion anymore, then don't consider // instructions in the 'if region'. - if (AggressiveInsts == 0) return false; + if (!AggressiveInsts) return false; // If we have seen this instruction before, don't count it again.
if (AggressiveInsts->count(I)) return true; @@ -332,7 +334,7 @@ static ConstantInt *GetConstantInt(Value *V, const DataLayout *DL) { return cast (ConstantExpr::getIntegerCast(CI, PtrTy, /*isSigned=*/false)); } - return 0; + return nullptr; } /// GatherConstantCompares - Given a potentially 'or'd or 'and'd together @@ -343,7 +345,7 @@ static Value * GatherConstantCompares(Value *V, std::vector &Vals, Value *&Extra, const DataLayout *DL, bool isEQ, unsigned &UsedICmps) { Instruction *I = dyn_cast(V); - if (I == 0) return 0; + if (!I) return nullptr; // If this is an icmp against a constant, handle this as one of the cases. if (ICmpInst *ICI = dyn_cast(I)) { @@ -390,19 +392,19 @@ GatherConstantCompares(Value *V, std::vector &Vals, Value *&Extra, // If there are a ton of values, we don't want to make a ginormous switch. if (Span.getSetSize().ugt(8) || Span.isEmptySet()) - return 0; + return nullptr; for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp) Vals.push_back(ConstantInt::get(V->getContext(), Tmp)); UsedICmps++; return hasAdd ? RHSVal : I->getOperand(0); } - return 0; + return nullptr; } // Otherwise, we can only handle an | or &, depending on isEQ. if (I->getOpcode() != (isEQ ? Instruction::Or : Instruction::And)) - return 0; + return nullptr; unsigned NumValsBeforeLHS = Vals.size(); unsigned UsedICmpsBeforeLHS = UsedICmps; @@ -420,19 +422,19 @@ GatherConstantCompares(Value *V, std::vector &Vals, Value *&Extra, // The RHS of the or/and can't be folded in and we haven't used "Extra" yet, // set it and return success. - if (Extra == 0 || Extra == I->getOperand(1)) { + if (Extra == nullptr || Extra == I->getOperand(1)) { Extra = I->getOperand(1); return LHS; } Vals.resize(NumValsBeforeLHS); UsedICmps = UsedICmpsBeforeLHS; - return 0; + return nullptr; } // If the LHS can't be folded in, but Extra is available and RHS can, try to // use LHS as Extra. - if (Extra == 0 || Extra == I->getOperand(0)) { + if (Extra == nullptr || Extra == I->getOperand(0)) { Value *OldExtra = Extra; Extra = I->getOperand(0); if (Value *RHS = GatherConstantCompares(I->getOperand(1), Vals, Extra, DL, @@ -442,11 +444,11 @@ GatherConstantCompares(Value *V, std::vector &Vals, Value *&Extra, Extra = OldExtra; } - return 0; + return nullptr; } static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) { - Instruction *Cond = 0; + Instruction *Cond = nullptr; if (SwitchInst *SI = dyn_cast(TI)) { Cond = dyn_cast(SI->getCondition()); } else if (BranchInst *BI = dyn_cast(TI)) { @@ -463,7 +465,7 @@ static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) { /// isValueEqualityComparison - Return true if the specified terminator checks /// to see if a value is equal to constant integer value. Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { - Value *CV = 0; + Value *CV = nullptr; if (SwitchInst *SI = dyn_cast(TI)) { // Do not permit merging of large switch instructions into their // predecessors unless there is only one predecessor. @@ -653,11 +655,11 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, // Otherwise, TI's block must correspond to some matched value. Find out // which value (or set of values) this is. - ConstantInt *TIV = 0; + ConstantInt *TIV = nullptr; BasicBlock *TIBB = TI->getParent(); for (unsigned i = 0, e = PredCases.size(); i != e; ++i) if (PredCases[i].Dest == TIBB) { - if (TIV != 0) + if (TIV) return false; // Cannot handle multiple values coming to this block. 
TIV = PredCases[i].Value; } @@ -665,7 +667,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, // Okay, we found the one constant that our value can be if we get into TI's // BB. Find out which successor will unconditionally be branched to. - BasicBlock *TheRealDest = 0; + BasicBlock *TheRealDest = nullptr; for (unsigned i = 0, e = ThisCases.size(); i != e; ++i) if (ThisCases[i].Value == TIV) { TheRealDest = ThisCases[i].Dest; @@ -673,7 +675,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, } // If not handled by any explicit cases, it is handled by the default case. - if (TheRealDest == 0) TheRealDest = ThisDef; + if (!TheRealDest) TheRealDest = ThisDef; // Remove PHI node entries for dead edges. BasicBlock *CheckEdge = TheRealDest; @@ -681,7 +683,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, if (*SI != CheckEdge) (*SI)->removePredecessor(TIBB); else - CheckEdge = 0; + CheckEdge = nullptr; // Insert the new branch. Instruction *NI = Builder.CreateBr(TheRealDest); @@ -950,10 +952,10 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // Okay, last check. If BB is still a successor of PSI, then we must // have an infinite loop case. If so, add an infinitely looping block // to handle the case to preserve the behavior of the code. - BasicBlock *InfLoopBlock = 0; + BasicBlock *InfLoopBlock = nullptr; for (unsigned i = 0, e = NewSI->getNumSuccessors(); i != e; ++i) if (NewSI->getSuccessor(i) == BB) { - if (InfLoopBlock == 0) { + if (!InfLoopBlock) { // Insert it at the end of the function, because it's either code, // or it won't matter if it's hot. :) InfLoopBlock = BasicBlock::Create(BB->getContext(), @@ -1099,7 +1101,7 @@ HoistTerminator: // These values do not agree. Insert a select instruction before NT // that determines the right value. SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)]; - if (SI == 0) + if (!SI) SI = cast (Builder.CreateSelect(BI->getCondition(), BB1V, BB2V, BB1V->getName()+"."+BB2V->getName())); @@ -1144,7 +1146,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // Gather the PHI nodes in BBEnd. std::map > MapValueFromBB1ToBB2; - Instruction *FirstNonPhiInBBEnd = 0; + Instruction *FirstNonPhiInBBEnd = nullptr; for (BasicBlock::iterator I = BBEnd->begin(), E = BBEnd->end(); I != E; ++I) { if (PHINode *PN = dyn_cast(I)) { @@ -1222,7 +1224,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // The operands should be either the same or they need to be generated // with a PHI node after sinking. We only handle the case where there is // a single pair of different operands. - Value *DifferentOp1 = 0, *DifferentOp2 = 0; + Value *DifferentOp1 = nullptr, *DifferentOp2 = nullptr; unsigned Op1Idx = 0; for (unsigned I = 0, E = I1->getNumOperands(); I != E; ++I) { if (I1->getOperand(I) == I2->getOperand(I)) @@ -1318,11 +1320,11 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, BasicBlock *StoreBB, BasicBlock *EndBB) { StoreInst *StoreToHoist = dyn_cast(I); if (!StoreToHoist) - return 0; + return nullptr; // Volatile or atomic. if (!StoreToHoist->isSimple()) - return 0; + return nullptr; Value *StorePtr = StoreToHoist->getPointerOperand(); @@ -1334,7 +1336,7 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, // Could be calling an instruction that effects memory like free(). 
if (CurI->mayHaveSideEffects() && !isa(CurI)) - return 0; + return nullptr; StoreInst *SI = dyn_cast(CurI); // Found the previous store make sure it stores to the same location. @@ -1342,10 +1344,10 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, // Found the previous store, return its value operand. return SI->getValueOperand(); else if (SI) - return 0; // Unknown store. + return nullptr; // Unknown store. } - return 0; + return nullptr; } /// \brief Speculate a conditional basic block flattening the CFG. @@ -1411,8 +1413,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { SmallDenseMap SinkCandidateUseCounts; unsigned SpeculationCost = 0; - Value *SpeculatedStoreValue = 0; - StoreInst *SpeculatedStore = 0; + Value *SpeculatedStoreValue = nullptr; + StoreInst *SpeculatedStore = nullptr; for (BasicBlock::iterator BBI = ThenBB->begin(), BBE = std::prev(ThenBB->end()); BBI != BBE; ++BBI) { @@ -1620,7 +1622,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *DL) { // constants. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { ConstantInt *CB = dyn_cast(PN->getIncomingValue(i)); - if (CB == 0 || !CB->getType()->isIntegerTy(1)) continue; + if (!CB || !CB->getType()->isIntegerTy(1)) continue; // Okay, we now know that all edges from PredBB should be revectored to // branch to RealDest. @@ -1745,7 +1747,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) { // If we folded the first phi, PN dangles at this point. Refresh it. If // we ran out of PHIs then we simplified them all. PN = dyn_cast(BB->begin()); - if (PN == 0) return true; + if (!PN) return true; // Don't fold i1 branches on PHIs which contain binary operators. These can // often be turned into switches and other things. @@ -1759,11 +1761,11 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) { // instructions in the predecessor blocks can be promoted as well. If // not, we won't be able to get rid of the control flow, so it's not // worth promoting to select instructions. - BasicBlock *DomBlock = 0; + BasicBlock *DomBlock = nullptr; BasicBlock *IfBlock1 = PN->getIncomingBlock(0); BasicBlock *IfBlock2 = PN->getIncomingBlock(1); if (cast(IfBlock1->getTerminator())->isConditional()) { - IfBlock1 = 0; + IfBlock1 = nullptr; } else { DomBlock = *pred_begin(IfBlock1); for (BasicBlock::iterator I = IfBlock1->begin();!isa(I);++I) @@ -1776,7 +1778,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL) { } if (cast(IfBlock2->getTerminator())->isConditional()) { - IfBlock2 = 0; + IfBlock2 = nullptr; } else { DomBlock = *pred_begin(IfBlock2); for (BasicBlock::iterator I = IfBlock2->begin();!isa(I);++I) @@ -1959,7 +1961,7 @@ static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) { bool llvm::FoldBranchToCommonDest(BranchInst *BI) { BasicBlock *BB = BI->getParent(); - Instruction *Cond = 0; + Instruction *Cond = nullptr; if (BI->isConditional()) Cond = dyn_cast(BI->getCondition()); else { @@ -1985,12 +1987,12 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { } } - if (Cond == 0) + if (!Cond) return false; } - if (Cond == 0 || (!isa(Cond) && !isa(Cond)) || - Cond->getParent() != BB || !Cond->hasOneUse()) + if (!Cond || (!isa(Cond) && !isa(Cond)) || + Cond->getParent() != BB || !Cond->hasOneUse()) return false; // Only allow this if the condition is a simple instruction that can be @@ -2005,7 +2007,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // that feeds the branch. 
We later ensure that any values that _it_ uses // were also live in the predecessor, so that we don't unnecessarily create // register pressure or inhibit out-of-order execution. - Instruction *BonusInst = 0; + Instruction *BonusInst = nullptr; if (&*FrontIt != Cond && FrontIt->hasOneUse() && FrontIt->user_back() == Cond && isSafeToSpeculativelyExecute(FrontIt)) { @@ -2040,7 +2042,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Finally, don't infinitely unroll conditional loops. BasicBlock *TrueDest = BI->getSuccessor(0); - BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : 0; + BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : nullptr; if (TrueDest == BB || FalseDest == BB) return false; @@ -2052,7 +2054,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // the common successor, verify that the same value flows in from both // blocks. SmallVector PHIs; - if (PBI == 0 || PBI->isUnconditional() || + if (!PBI || PBI->isUnconditional() || (BI->isConditional() && !SafeToMergeTerminators(BI, PBI)) || (!BI->isConditional() && @@ -2142,7 +2144,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { } // If we have a bonus inst, clone it into the predecessor block. - Instruction *NewBonus = 0; + Instruction *NewBonus = nullptr; if (BonusInst) { NewBonus = BonusInst->clone(); @@ -2218,14 +2220,14 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { MDBuilder(BI->getContext()). createBranchWeights(MDWeights)); } else - PBI->setMetadata(LLVMContext::MD_prof, NULL); + PBI->setMetadata(LLVMContext::MD_prof, nullptr); } else { // Update PHI nodes in the common successors. for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { ConstantInt *PBI_C = cast( PHIs[i]->getIncomingValueForBlock(PBI->getParent())); assert(PBI_C->getType()->isIntegerTy(1)); - Instruction *MergedCond = 0; + Instruction *MergedCond = nullptr; if (PBI->getSuccessor(0) == TrueDest) { // Create (PBI_Cond and PBI_C) or (!PBI_Cond and BI_Value) // PBI_C is true: PBI_Cond or (!PBI_Cond and BI_Value) @@ -2498,16 +2500,16 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, // If TrueBB and FalseBB are equal, only try to preserve one copy of that // successor. BasicBlock *KeepEdge1 = TrueBB; - BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : 0; + BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr; // Then remove the rest. for (unsigned I = 0, E = OldTerm->getNumSuccessors(); I != E; ++I) { BasicBlock *Succ = OldTerm->getSuccessor(I); // Make sure only to keep exactly one copy of each edge. if (Succ == KeepEdge1) - KeepEdge1 = 0; + KeepEdge1 = nullptr; else if (Succ == KeepEdge2) - KeepEdge2 = 0; + KeepEdge2 = nullptr; else Succ->removePredecessor(OldTerm->getParent()); } @@ -2516,7 +2518,7 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, Builder.SetCurrentDebugLocation(OldTerm->getDebugLoc()); // Insert an appropriate new terminator. - if ((KeepEdge1 == 0) && (KeepEdge2 == 0)) { + if (!KeepEdge1 && !KeepEdge2) { if (TrueBB == FalseBB) // We were only looking for one successor, and it was present. // Create an unconditional branch to it. @@ -2538,7 +2540,7 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, // One of the selected values was a successor, but the other wasn't. // Insert an unconditional branch to the one that was found; // the edge to the one that wasn't must be unreachable. - if (KeepEdge1 == 0) + if (!KeepEdge1) // Only TrueBB was found. 
Builder.CreateBr(TrueBB); else @@ -2639,7 +2641,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( // 'V' and this block is the default case for the switch. In this case we can // fold the compared value into the switch to simplify things. BasicBlock *Pred = BB->getSinglePredecessor(); - if (Pred == 0 || !isa(Pred->getTerminator())) return false; + if (!Pred || !isa(Pred->getTerminator())) return false; SwitchInst *SI = cast(Pred->getTerminator()); if (SI->getCondition() != V) @@ -2681,7 +2683,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( // the block. BasicBlock *SuccBlock = BB->getTerminator()->getSuccessor(0); PHINode *PHIUse = dyn_cast(ICI->user_back()); - if (PHIUse == 0 || PHIUse != &SuccBlock->front() || + if (PHIUse == nullptr || PHIUse != &SuccBlock->front() || isa(++BasicBlock::iterator(PHIUse))) return false; @@ -2733,16 +2735,16 @@ static bool TryToSimplifyUncondBranchWithICmpInIt( static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *DL, IRBuilder<> &Builder) { Instruction *Cond = dyn_cast(BI->getCondition()); - if (Cond == 0) return false; + if (!Cond) return false; // Change br (X == 0 | X == 1), T, F into a switch instruction. // If this is a bunch of seteq's or'd together, or if it's a bunch of // 'setne's and'ed together, collect them. - Value *CompVal = 0; + Value *CompVal = nullptr; std::vector Values; bool TrueWhenEqual = true; - Value *ExtraCase = 0; + Value *ExtraCase = nullptr; unsigned UsedICmps = 0; if (Cond->getOpcode() == Instruction::Or) { @@ -2755,7 +2757,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *DL, } // If we didn't have a multiply compared value, fail. - if (CompVal == 0) return false; + if (!CompVal) return false; // Avoid turning single icmps into a switch. if (UsedICmps <= 1) @@ -3050,7 +3052,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { // Find the most popular block. unsigned MaxPop = 0; unsigned MaxIndex = 0; - BasicBlock *MaxBlock = 0; + BasicBlock *MaxBlock = nullptr; for (std::map >::iterator I = Popularity.begin(), E = Popularity.end(); I != E; ++I) { if (I->second.first > MaxPop || @@ -3188,7 +3190,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI) { Value *Cond = SI->getCondition(); unsigned Bits = Cond->getType()->getIntegerBitWidth(); APInt KnownZero(Bits, 0), KnownOne(Bits, 0); - ComputeMaskedBits(Cond, KnownZero, KnownOne); + computeKnownBits(Cond, KnownZero, KnownOne); // Gather dead cases. SmallVector DeadCases; @@ -3241,13 +3243,13 @@ static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue, BasicBlock *BB, int *PhiIndex) { if (BB->getFirstNonPHIOrDbg() != BB->getTerminator()) - return NULL; // BB must be empty to be a candidate for simplification. + return nullptr; // BB must be empty to be a candidate for simplification. if (!BB->getSinglePredecessor()) - return NULL; // BB must be dominated by the switch. + return nullptr; // BB must be dominated by the switch. BranchInst *Branch = dyn_cast(BB->getTerminator()); if (!Branch || !Branch->isUnconditional()) - return NULL; // Terminator must be unconditional branch. + return nullptr; // Terminator must be unconditional branch. 
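The EliminateDeadSwitchCases hunk above also picks up the ValueTracking rename from ComputeMaskedBits to computeKnownBits: the pass asks which bits of the switch condition are provably zero or one, and any case value that contradicts a known bit can never be taken. A rough standalone model of that test, using plain integers in place of APInt (a sketch, not the pass's actual code):

    #include <cstdint>
    #include <cstdio>

    // A case label is dead if it has a 1 where a 0 is known,
    // or a 0 where a 1 is known.
    static bool caseIsDead(uint64_t caseVal, uint64_t knownZero, uint64_t knownOne) {
      return (caseVal & knownZero) != 0 || (~caseVal & knownOne) != 0;
    }

    int main() {
      // Condition known to have its low bit set, e.g. switch (x | 1).
      uint64_t knownZero = 0, knownOne = 1;
      for (uint64_t c = 0; c < 4; ++c)
        std::printf("case %llu: %s\n", (unsigned long long)c,
                    caseIsDead(c, knownZero, knownOne) ? "dead" : "kept");
      return 0;  // prints: 0 dead, 1 kept, 2 dead, 3 kept
    }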
BasicBlock *Succ = Branch->getSuccessor(0); @@ -3263,7 +3265,7 @@ static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue, return PHI; } - return NULL; + return nullptr; } /// ForwardSwitchConditionToPHI - Try to forward the condition of a switch @@ -3336,12 +3338,12 @@ ConstantFold(Instruction *I, if (SelectInst *Select = dyn_cast(I)) { Constant *A = LookupConstant(Select->getCondition(), ConstantPool); if (!A) - return 0; + return nullptr; if (A->isAllOnesValue()) return LookupConstant(Select->getTrueValue(), ConstantPool); if (A->isNullValue()) return LookupConstant(Select->getFalseValue(), ConstantPool); - return 0; + return nullptr; } SmallVector COps; @@ -3349,7 +3351,7 @@ ConstantFold(Instruction *I, if (Constant *A = LookupConstant(I->getOperand(N), ConstantPool)) COps.push_back(A); else - return 0; + return nullptr; } if (CmpInst *Cmp = dyn_cast(I)) @@ -3492,7 +3494,8 @@ SwitchLookupTable::SwitchLookupTable(Module &M, const SmallVectorImpl >& Values, Constant *DefaultValue, const DataLayout *DL) - : SingleValue(0), BitMap(0), BitMapElementTy(0), Array(0) { + : SingleValue(nullptr), BitMap(nullptr), BitMapElementTy(nullptr), + Array(nullptr) { assert(Values.size() && "Can't build lookup table without values!"); assert(TableSize >= Values.size() && "Can't fit values in table!"); @@ -3513,7 +3516,7 @@ SwitchLookupTable::SwitchLookupTable(Module &M, TableContents[Idx] = CaseRes; if (CaseRes != SingleValue) - SingleValue = 0; + SingleValue = nullptr; } // Fill in any holes in the table with the default result. @@ -3526,7 +3529,7 @@ SwitchLookupTable::SwitchLookupTable(Module &M, } if (DefaultValue != SingleValue) - SingleValue = 0; + SingleValue = nullptr; } // If each element in the table contains the same value, we only need to store @@ -3696,7 +3699,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, ConstantInt *MinCaseVal = CI.getCaseValue(); ConstantInt *MaxCaseVal = CI.getCaseValue(); - BasicBlock *CommonDest = 0; + BasicBlock *CommonDest = nullptr; typedef SmallVector, 4> ResultListTy; SmallDenseMap ResultLists; SmallDenseMap DefaultResults; @@ -3741,8 +3744,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, SmallVector, 4> DefaultResultsList; bool HasDefaultResults = false; if (TableHasHoles) { - HasDefaultResults = GetCaseResults(SI, 0, SI->getDefaultDest(), &CommonDest, - DefaultResultsList, DL); + HasDefaultResults = GetCaseResults(SI, nullptr, SI->getDefaultDest(), + &CommonDest, DefaultResultsList, DL); } bool NeedMask = (TableHasHoles && !HasDefaultResults); if (NeedMask) { @@ -4038,8 +4041,8 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { // from BI. We know that the condbr dominates the two blocks, so see if // there is any identical code in the "then" and "else" blocks. If so, we // can hoist it up to the branching block. 
-  if (BI->getSuccessor(0)->getSinglePredecessor() != 0) {
-    if (BI->getSuccessor(1)->getSinglePredecessor() != 0) {
+  if (BI->getSuccessor(0)->getSinglePredecessor()) {
+    if (BI->getSuccessor(1)->getSinglePredecessor()) {
       if (HoistThenElseCodeToIf(BI))
         return SimplifyCFG(BB, TTI, DL) | true;
     } else {
@@ -4051,7 +4054,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
       if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0)))
         return SimplifyCFG(BB, TTI, DL) | true;
     }
-  } else if (BI->getSuccessor(1)->getSinglePredecessor() != 0) {
+  } else if (BI->getSuccessor(1)->getSinglePredecessor()) {
     // If Successor #0 has multiple preds, we may be able to conditionally
     // execute Successor #1 if it branches to successor #0.
     TerminatorInst *Succ1TI = BI->getSuccessor(1)->getTerminator();
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index 30f56be..b284e6f 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -13,8 +13,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "indvars"
-
 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
@@ -34,6 +32,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "indvars"
+
 STATISTIC(NumElimIdentity, "Number of IV identities eliminated");
 STATISTIC(NumElimOperand, "Number of IV operands folded into a use");
 STATISTIC(NumElimRem , "Number of IV remainder operations eliminated");
@@ -56,14 +56,14 @@ namespace {
   public:
     SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LPPassManager *LPM,
-                   SmallVectorImpl<WeakVH> &Dead, IVUsers *IVU = NULL) :
+                   SmallVectorImpl<WeakVH> &Dead, IVUsers *IVU = nullptr) :
       L(Loop),
       LI(LPM->getAnalysisIfAvailable<LoopInfo>()),
       SE(SE),
       DeadInsts(Dead),
       Changed(false) {
       DataLayoutPass *DLP = LPM->getAnalysisIfAvailable<DataLayoutPass>();
-      DL = DLP ? &DLP->getDataLayout() : 0;
+      DL = DLP ? &DLP->getDataLayout() : nullptr;
       assert(LI && "IV simplification requires LoopInfo");
     }
@@ -72,7 +72,7 @@ namespace {
     /// Iteratively perform simplification on a worklist of users of the
     /// specified induction variable. This is the top-level driver that applies
     /// all simplifications to users of an IV.
-    void simplifyUsers(PHINode *CurrIV, IVVisitor *V = NULL);
+    void simplifyUsers(PHINode *CurrIV, IVVisitor *V = nullptr);
 
     Value *foldIVUser(Instruction *UseInst, Instruction *IVOperand);
 
@@ -95,25 +95,25 @@ namespace {
 /// be folded (in case more folding opportunities have been exposed).
 /// Otherwise return null.
 Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) {
-  Value *IVSrc = 0;
-  unsigned OperIdx = 0;
-  const SCEV *FoldedExpr = 0;
+  Value *IVSrc = nullptr;
+  unsigned OperIdx = 0;
+  const SCEV *FoldedExpr = nullptr;
   switch (UseInst->getOpcode()) {
   default:
-    return 0;
+    return nullptr;
   case Instruction::UDiv:
   case Instruction::LShr:
     // We're only interested in the case where we know something about
     // the numerator and have a constant denominator.
     if (IVOperand != UseInst->getOperand(OperIdx) ||
         !isa<ConstantInt>(UseInst->getOperand(1)))
-      return 0;
+      return nullptr;
 
     // Attempt to fold a binary operator with constant operand.
     // e.g. ((I + 1) >> 2) => I >> 2
     if (!isa<BinaryOperator>(IVOperand) ||
         !isa<ConstantInt>(IVOperand->getOperand(1)))
-      return 0;
+      return nullptr;
 
     IVSrc = IVOperand->getOperand(0);
     // IVSrc must be the (SCEVable) IV, since the other operand is const.
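The SimplifyIndVar.cpp hunk above shows the other pattern that repeats across this patch (ssaupdater, simplifycfg, instsimplify): #define DEBUG_TYPE moves from above the #include block to after it. Defined before the includes, the macro leaks into every included header and can collide with a header's own DEBUG_TYPE; defined after them, it tags only this file's DEBUG output. The shape of the change, sketched on a hypothetical pass file rather than the real one:

    // Before the rebase, passes did this, exposing the macro to every header:
    //     #define DEBUG_TYPE "mypass"
    //     #include "llvm/..."
    // After, headers come first and the macro covers only this file:

    #include <cstdio>  // stand-in for the pass's real includes

    #define DEBUG_TYPE "mypass"

    int main() {
      std::printf("[%s] would tag DEBUG() output in a real pass\n", DEBUG_TYPE);
      return 0;
    }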
@@ -124,7 +124,7 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)
     // Get a constant for the divisor. See createSCEV.
     uint32_t BitWidth = cast<IntegerType>(UseInst->getType())->getBitWidth();
     if (D->getValue().uge(BitWidth))
-      return 0;
+      return nullptr;
 
     D = ConstantInt::get(UseInst->getContext(),
                          APInt::getOneBitSet(BitWidth, D->getZExtValue()));
@@ -133,11 +133,11 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)
   }
   // We have something that might fold its operand. Compare SCEVs.
   if (!SE->isSCEVable(UseInst->getType()))
-    return 0;
+    return nullptr;
 
   // Bypass the operand if SCEV can prove it has no effect.
   if (SE->getSCEV(UseInst) != FoldedExpr)
-    return 0;
+    return nullptr;
 
   DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand
         << " -> " << *UseInst << '\n');
@@ -283,8 +283,8 @@ Instruction *SimplifyIndvar::splitOverflowIntrinsic(Instruction *IVUser,
     return IVUser;
 
   // Find a branch guarded by the overflow check.
-  BranchInst *Branch = 0;
-  Instruction *AddVal = 0;
+  BranchInst *Branch = nullptr;
+  Instruction *AddVal = nullptr;
   for (User *U : II->users()) {
     if (ExtractValueInst *ExtractInst = dyn_cast<ExtractValueInst>(U)) {
       if (ExtractInst->getNumIndices() != 1)
diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp
index bbd65f1..33b3637 100644
--- a/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -14,7 +14,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "instsimplify"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -29,6 +28,8 @@
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "instsimplify"
+
 STATISTIC(NumSimplified, "Number of redundant instructions removed");
 
 namespace {
@@ -47,17 +48,18 @@ namespace {
     bool runOnFunction(Function &F) override {
       const DominatorTreeWrapperPass *DTWP =
          getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-      const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : 0;
+      const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
       DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-      const DataLayout *DL = DLP ? &DLP->getDataLayout() : 0;
+      const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
       const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
       SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
       bool Changed = false;
 
       do {
-        for (df_iterator<BasicBlock *> DI = df_begin(&F.getEntryBlock()),
-             DE = df_end(&F.getEntryBlock()); DI != DE; ++DI)
-          for (BasicBlock::iterator BI = DI->begin(), BE = DI->end(); BI != BE;) {
+        for (BasicBlock *BB : depth_first(&F.getEntryBlock()))
+          // Here be subtlety: the instruction iterator is advanced before the
+          // loop body runs because the body may delete the instruction it is
+          // visiting, so a range-for loop over the block won't work here.
+          for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
            Instruction *I = BI++;
            // The first time through the loop ToSimplify is empty and we try to
            // simplify all instructions. On later iterations ToSimplify is not
@@ -74,7 +76,15 @@ namespace {
              ++NumSimplified;
              Changed = true;
            }
-            Changed |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+            bool res = RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+            if (res) {
+              // RecursivelyDeleteTriviallyDeadInstructions can remove more
+              // than one instruction, so simply incrementing the iterator
+              // does not work. When instructions get deleted, re-iterate
+              // instead.
+ BI = BB->begin(); BE = BB->end(); + Changed |= res; + } } // Place the list of instructions to simplify on the next loop iteration diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index b5bc391..3b61bb5 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/Triple.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -75,7 +76,7 @@ public: // We never change the calling convention. if (!ignoreCallingConv() && CI->getCallingConv() != llvm::CallingConv::C) - return NULL; + return nullptr; return callOptimizer(CI->getCalledFunction(), CI, B); } @@ -186,14 +187,14 @@ struct MemCpyChkOpt : public InstFortifiedLibCallOptimization { !FT->getParamType(1)->isPointerTy() || FT->getParamType(2) != DL->getIntPtrType(Context) || FT->getParamType(3) != DL->getIntPtrType(Context)) - return 0; + return nullptr; if (isFoldable(3, 2, false)) { B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), 1); return CI->getArgOperand(0); } - return 0; + return nullptr; } }; @@ -210,14 +211,14 @@ struct MemMoveChkOpt : public InstFortifiedLibCallOptimization { !FT->getParamType(1)->isPointerTy() || FT->getParamType(2) != DL->getIntPtrType(Context) || FT->getParamType(3) != DL->getIntPtrType(Context)) - return 0; + return nullptr; if (isFoldable(3, 2, false)) { B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), 1); return CI->getArgOperand(0); } - return 0; + return nullptr; } }; @@ -234,7 +235,7 @@ struct MemSetChkOpt : public InstFortifiedLibCallOptimization { !FT->getParamType(1)->isIntegerTy() || FT->getParamType(2) != DL->getIntPtrType(Context) || FT->getParamType(3) != DL->getIntPtrType(Context)) - return 0; + return nullptr; if (isFoldable(3, 2, false)) { Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), @@ -242,7 +243,7 @@ struct MemSetChkOpt : public InstFortifiedLibCallOptimization { B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); return CI->getArgOperand(0); } - return 0; + return nullptr; } }; @@ -260,7 +261,7 @@ struct StrCpyChkOpt : public InstFortifiedLibCallOptimization { FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != Type::getInt8PtrTy(Context) || FT->getParamType(2) != DL->getIntPtrType(Context)) - return 0; + return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // __strcpy_chk(x,x) -> x @@ -277,10 +278,10 @@ struct StrCpyChkOpt : public InstFortifiedLibCallOptimization { } else { // Maybe we can stil fold __strcpy_chk to __memcpy_chk. uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; + if (Len == 0) return nullptr; // This optimization require DataLayout. 
- if (!DL) return 0; + if (!DL) return nullptr; Value *Ret = EmitMemCpyChk(Dst, Src, @@ -288,7 +289,7 @@ struct StrCpyChkOpt : public InstFortifiedLibCallOptimization { CI->getArgOperand(2), B, DL, TLI); return Ret; } - return 0; + return nullptr; } }; @@ -306,12 +307,12 @@ struct StpCpyChkOpt : public InstFortifiedLibCallOptimization { FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != Type::getInt8PtrTy(Context) || FT->getParamType(2) != DL->getIntPtrType(FT->getParamType(0))) - return 0; + return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) Value *StrLen = EmitStrLen(Src, B, DL, TLI); - return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0; + return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr; } // If a) we don't have any length information, or b) we know this will @@ -325,10 +326,10 @@ struct StpCpyChkOpt : public InstFortifiedLibCallOptimization { } else { // Maybe we can stil fold __stpcpy_chk to __memcpy_chk. uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; + if (Len == 0) return nullptr; // This optimization require DataLayout. - if (!DL) return 0; + if (!DL) return nullptr; Type *PT = FT->getParamType(0); Value *LenV = ConstantInt::get(DL->getIntPtrType(PT), Len); @@ -336,10 +337,10 @@ struct StpCpyChkOpt : public InstFortifiedLibCallOptimization { ConstantInt::get(DL->getIntPtrType(PT), Len - 1)); if (!EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, DL, TLI)) - return 0; + return nullptr; return DstEnd; } - return 0; + return nullptr; } }; @@ -357,7 +358,7 @@ struct StrNCpyChkOpt : public InstFortifiedLibCallOptimization { FT->getParamType(0) != Type::getInt8PtrTy(Context) || !FT->getParamType(2)->isIntegerTy() || FT->getParamType(3) != DL->getIntPtrType(Context)) - return 0; + return nullptr; if (isFoldable(3, 2, false)) { Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), @@ -365,7 +366,7 @@ struct StrNCpyChkOpt : public InstFortifiedLibCallOptimization { Name.substr(2, 7)); return Ret; } - return 0; + return nullptr; } }; @@ -382,7 +383,7 @@ struct StrCatOpt : public LibCallOptimization { FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || FT->getParamType(1) != FT->getReturnType()) - return 0; + return nullptr; // Extract some information from the instruction Value *Dst = CI->getArgOperand(0); @@ -390,7 +391,7 @@ struct StrCatOpt : public LibCallOptimization { // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; + if (Len == 0) return nullptr; --Len; // Unbias length. // Handle the simple, do-nothing case: strcat(x, "") -> x @@ -398,7 +399,7 @@ struct StrCatOpt : public LibCallOptimization { return Dst; // These optimizations require DataLayout. - if (!DL) return 0; + if (!DL) return nullptr; return emitStrLenMemCpy(Src, Dst, Len, B); } @@ -409,7 +410,7 @@ struct StrCatOpt : public LibCallOptimization { // memory is to be moved to. We just generate a call to strlen. 
Value *DstLen = EmitStrLen(Dst, B, DL, TLI); if (!DstLen) - return 0; + return nullptr; // Now that we have the destination's length, we must index into the // destination's pointer to get the actual memcpy destination (end of @@ -434,7 +435,7 @@ struct StrNCatOpt : public StrCatOpt { FT->getParamType(0) != FT->getReturnType() || FT->getParamType(1) != FT->getReturnType() || !FT->getParamType(2)->isIntegerTy()) - return 0; + return nullptr; // Extract some information from the instruction Value *Dst = CI->getArgOperand(0); @@ -445,11 +446,11 @@ struct StrNCatOpt : public StrCatOpt { if (ConstantInt *LengthArg = dyn_cast(CI->getArgOperand(2))) Len = LengthArg->getZExtValue(); else - return 0; + return nullptr; // See if we can get the length of the input string. uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) return 0; + if (SrcLen == 0) return nullptr; --SrcLen; // Unbias length. // Handle the simple, do-nothing cases: @@ -458,10 +459,10 @@ struct StrNCatOpt : public StrCatOpt { if (SrcLen == 0 || Len == 0) return Dst; // These optimizations require DataLayout. - if (!DL) return 0; + if (!DL) return nullptr; // We don't optimize this case - if (Len < SrcLen) return 0; + if (Len < SrcLen) return nullptr; // strncat(x, s, c) -> strcat(x, s) // s is constant so the strcat can be optimized further @@ -478,20 +479,20 @@ struct StrChrOpt : public LibCallOptimization { FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || !FT->getParamType(1)->isIntegerTy(32)) - return 0; + return nullptr; Value *SrcStr = CI->getArgOperand(0); // If the second operand is non-constant, see if we can compute the length // of the input string and turn this into memchr. ConstantInt *CharC = dyn_cast(CI->getArgOperand(1)); - if (CharC == 0) { + if (!CharC) { // These optimizations require DataLayout. - if (!DL) return 0; + if (!DL) return nullptr; uint64_t Len = GetStringLength(SrcStr); if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32. - return 0; + return nullptr; return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. ConstantInt::get(DL->getIntPtrType(*Context), Len), @@ -504,7 +505,7 @@ struct StrChrOpt : public LibCallOptimization { if (!getConstantStringInfo(SrcStr, Str)) { if (DL && CharC->isZero()) // strchr(p, 0) -> p + strlen(p) return B.CreateGEP(SrcStr, EmitStrLen(SrcStr, B, DL, TLI), "strchr"); - return 0; + return nullptr; } // Compute the offset, make sure to handle the case when we're searching for @@ -528,21 +529,21 @@ struct StrRChrOpt : public LibCallOptimization { FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || !FT->getParamType(1)->isIntegerTy(32)) - return 0; + return nullptr; Value *SrcStr = CI->getArgOperand(0); ConstantInt *CharC = dyn_cast(CI->getArgOperand(1)); // Cannot fold anything if we're not looking for a constant. if (!CharC) - return 0; + return nullptr; StringRef Str; if (!getConstantStringInfo(SrcStr, Str)) { // strrchr(s, 0) -> strchr(s, 0) if (DL && CharC->isZero()) return EmitStrChr(SrcStr, '\0', B, DL, TLI); - return 0; + return nullptr; } // Compute the offset. 
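The density of the 0 to nullptr churn in SimplifyLibCalls.cpp comes from one convention: every callOptimizer returns the replacement Value on success and a null pointer to decline, so nearly every early-out is a null return. A toy model of that contract with the LLVM types swapped for plain C++ (hypothetical names, illustration only):

    #include <cstdio>
    #include <string>

    struct Call { std::string callee, arg; };  // stand-in for CallInst

    // Return the folded result, or nullptr to leave the call alone.
    static const char *optimizeStrlen(const Call &c) {
      if (c.callee != "strlen") return nullptr; // decline: not our libcall
      if (c.arg.empty()) return "0";            // strlen("") folds to 0
      return nullptr;                           // decline: length unknown
    }

    int main() {
      Call c{"strlen", ""};
      if (const char *folded = optimizeStrlen(c))
        std::printf("folded to %s\n", folded);
      else
        std::printf("left alone\n");
      return 0;
    }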
@@ -565,7 +566,7 @@ struct StrCmpOpt : public LibCallOptimization { !FT->getReturnType()->isIntegerTy(32) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != B.getInt8PtrTy()) - return 0; + return nullptr; Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); if (Str1P == Str2P) // strcmp(x,x) -> 0 @@ -591,14 +592,14 @@ struct StrCmpOpt : public LibCallOptimization { uint64_t Len2 = GetStringLength(Str2P); if (Len1 && Len2) { // These optimizations require DataLayout. - if (!DL) return 0; + if (!DL) return nullptr; return EmitMemCmp(Str1P, Str2P, ConstantInt::get(DL->getIntPtrType(*Context), std::min(Len1, Len2)), B, DL, TLI); } - return 0; + return nullptr; } }; @@ -612,7 +613,7 @@ struct StrNCmpOpt : public LibCallOptimization { FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != B.getInt8PtrTy() || !FT->getParamType(2)->isIntegerTy()) - return 0; + return nullptr; Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); if (Str1P == Str2P) // strncmp(x,x,n) -> 0 @@ -623,7 +624,7 @@ struct StrNCmpOpt : public LibCallOptimization { if (ConstantInt *LengthArg = dyn_cast(CI->getArgOperand(2))) Length = LengthArg->getZExtValue(); else - return 0; + return nullptr; if (Length == 0) // strncmp(x,y,0) -> 0 return ConstantInt::get(CI->getType(), 0); @@ -649,7 +650,7 @@ struct StrNCmpOpt : public LibCallOptimization { if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - return 0; + return nullptr; } }; @@ -662,18 +663,18 @@ struct StrCpyOpt : public LibCallOptimization { FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != B.getInt8PtrTy()) - return 0; + return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // strcpy(x,x) -> x return Src; // These optimizations require DataLayout. - if (!DL) return 0; + if (!DL) return nullptr; // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; + if (Len == 0) return nullptr; // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. @@ -692,20 +693,20 @@ struct StpCpyOpt: public LibCallOptimization { FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != B.getInt8PtrTy()) - return 0; + return nullptr; // These optimizations require DataLayout. - if (!DL) return 0; + if (!DL) return nullptr; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) Value *StrLen = EmitStrLen(Src, B, DL, TLI); - return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0; + return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr; } // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; + if (Len == 0) return nullptr; Type *PT = FT->getParamType(0); Value *LenV = ConstantInt::get(DL->getIntPtrType(PT), Len); @@ -728,7 +729,7 @@ struct StrNCpyOpt : public LibCallOptimization { FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != B.getInt8PtrTy() || !FT->getParamType(2)->isIntegerTy()) - return 0; + return nullptr; Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); @@ -736,7 +737,7 @@ struct StrNCpyOpt : public LibCallOptimization { // See if we can get the length of the input string. 
uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) return 0; + if (SrcLen == 0) return nullptr; --SrcLen; if (SrcLen == 0) { @@ -749,15 +750,15 @@ struct StrNCpyOpt : public LibCallOptimization { if (ConstantInt *LengthArg = dyn_cast(LenOp)) Len = LengthArg->getZExtValue(); else - return 0; + return nullptr; if (Len == 0) return Dst; // strncpy(x, y, 0) -> x // These optimizations require DataLayout. - if (!DL) return 0; + if (!DL) return nullptr; // Let strncpy handle the zero padding - if (Len > SrcLen+1) return 0; + if (Len > SrcLen+1) return nullptr; Type *PT = FT->getParamType(0); // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] @@ -776,7 +777,7 @@ struct StrLenOpt : public LibCallOptimization { if (FT->getNumParams() != 1 || FT->getParamType(0) != B.getInt8PtrTy() || !FT->getReturnType()->isIntegerTy()) - return 0; + return nullptr; Value *Src = CI->getArgOperand(0); @@ -784,11 +785,26 @@ struct StrLenOpt : public LibCallOptimization { if (uint64_t Len = GetStringLength(Src)) return ConstantInt::get(CI->getType(), Len-1); + // strlen(x?"foo":"bars") --> x ? 3 : 4 + if (SelectInst *SI = dyn_cast(Src)) { + uint64_t LenTrue = GetStringLength(SI->getTrueValue()); + uint64_t LenFalse = GetStringLength(SI->getFalseValue()); + if (LenTrue && LenFalse) { + emitOptimizationRemark(*Context, "simplify-libcalls", *Caller, + SI->getDebugLoc(), + "folded strlen(select) to select of constants"); + return B.CreateSelect(SI->getCondition(), + ConstantInt::get(CI->getType(), LenTrue-1), + ConstantInt::get(CI->getType(), LenFalse-1)); + } + } + // strlen(x) != 0 --> *x != 0 // strlen(x) == 0 --> *x == 0 if (isOnlyUsedInZeroEqualityComparison(CI)) return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType()); - return 0; + + return nullptr; } }; @@ -800,7 +816,7 @@ struct StrPBrkOpt : public LibCallOptimization { FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || FT->getReturnType() != FT->getParamType(0)) - return 0; + return nullptr; StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); @@ -824,7 +840,7 @@ struct StrPBrkOpt : public LibCallOptimization { if (DL && HasS2 && S2.size() == 1) return EmitStrChr(CI->getArgOperand(0), S2[0], B, DL, TLI); - return 0; + return nullptr; } }; @@ -835,7 +851,7 @@ struct StrToOpt : public LibCallOptimization { if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy()) - return 0; + return nullptr; Value *EndPtr = CI->getArgOperand(1); if (isa(EndPtr)) { @@ -844,7 +860,7 @@ struct StrToOpt : public LibCallOptimization { CI->addAttribute(1, Attribute::NoCapture); } - return 0; + return nullptr; } }; @@ -856,7 +872,7 @@ struct StrSpnOpt : public LibCallOptimization { FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || !FT->getReturnType()->isIntegerTy()) - return 0; + return nullptr; StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); @@ -874,7 +890,7 @@ struct StrSpnOpt : public LibCallOptimization { return ConstantInt::get(CI->getType(), Pos); } - return 0; + return nullptr; } }; @@ -886,7 +902,7 @@ struct StrCSpnOpt : public LibCallOptimization { FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || !FT->getReturnType()->isIntegerTy()) - return 0; + return nullptr; StringRef S1, S2; bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); @@ -907,7 +923,7 @@ struct StrCSpnOpt : public 
LibCallOptimization { if (DL && HasS2 && S2.empty()) return EmitStrLen(CI->getArgOperand(0), B, DL, TLI); - return 0; + return nullptr; } }; @@ -919,7 +935,7 @@ struct StrStrOpt : public LibCallOptimization { !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isPointerTy()) - return 0; + return nullptr; // fold strstr(x, x) -> x. if (CI->getArgOperand(0) == CI->getArgOperand(1)) @@ -929,11 +945,11 @@ struct StrStrOpt : public LibCallOptimization { if (DL && isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, DL, TLI); if (!StrLen) - return 0; + return nullptr; Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1), StrLen, B, DL, TLI); if (!StrNCmp) - return 0; + return nullptr; for (auto UI = CI->user_begin(), UE = CI->user_end(); UI != UE;) { ICmpInst *Old = cast(*UI++); Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp, @@ -969,9 +985,9 @@ struct StrStrOpt : public LibCallOptimization { // fold strstr(x, "y") -> strchr(x, 'y'). if (HasStr2 && ToFindStr.size() == 1) { Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, DL, TLI); - return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0; + return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : nullptr; } - return 0; + return nullptr; } }; @@ -982,7 +998,7 @@ struct MemCmpOpt : public LibCallOptimization { if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy(32)) - return 0; + return nullptr; Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); @@ -991,7 +1007,7 @@ struct MemCmpOpt : public LibCallOptimization { // Make sure we have a constant length. ConstantInt *LenC = dyn_cast(CI->getArgOperand(2)); - if (!LenC) return 0; + if (!LenC) return nullptr; uint64_t Len = LenC->getZExtValue(); if (Len == 0) // memcmp(s1,s2,0) -> 0 @@ -1012,7 +1028,7 @@ struct MemCmpOpt : public LibCallOptimization { getConstantStringInfo(RHS, RHSStr)) { // Make sure we're not reading out-of-bounds memory. if (Len > LHSStr.size() || Len > RHSStr.size()) - return 0; + return nullptr; // Fold the memcmp and normalize the result. This way we get consistent // results across multiple platforms. uint64_t Ret = 0; @@ -1024,7 +1040,7 @@ struct MemCmpOpt : public LibCallOptimization { return ConstantInt::get(CI->getType(), Ret); } - return 0; + return nullptr; } }; @@ -1032,14 +1048,14 @@ struct MemCpyOpt : public LibCallOptimization { Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) override { // These optimizations require DataLayout. - if (!DL) return 0; + if (!DL) return nullptr; FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || FT->getParamType(2) != DL->getIntPtrType(*Context)) - return 0; + return nullptr; // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), @@ -1052,14 +1068,14 @@ struct MemMoveOpt : public LibCallOptimization { Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) override { // These optimizations require DataLayout. 
- if (!DL) return 0; + if (!DL) return nullptr; FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || FT->getParamType(2) != DL->getIntPtrType(*Context)) - return 0; + return nullptr; // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), @@ -1072,14 +1088,14 @@ struct MemSetOpt : public LibCallOptimization { Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) override { // These optimizations require DataLayout. - if (!DL) return 0; + if (!DL) return nullptr; FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy() || FT->getParamType(2) != DL->getIntPtrType(FT->getParamType(0))) - return 0; + return nullptr; // memset(p, v, n) -> llvm.memset(p, v, n, 1) Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); @@ -1103,21 +1119,21 @@ struct UnaryDoubleFPOpt : public LibCallOptimization { FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || !FT->getParamType(0)->isDoubleTy()) - return 0; + return nullptr; if (CheckRetType) { // Check if all the uses for function like 'sin' are converted to float. for (User *U : CI->users()) { FPTruncInst *Cast = dyn_cast(U); - if (Cast == 0 || !Cast->getType()->isFloatTy()) - return 0; + if (!Cast || !Cast->getType()->isFloatTy()) + return nullptr; } } // If this is something like 'floor((double)floatval)', convert to floorf. FPExtInst *Cast = dyn_cast(CI->getArgOperand(0)); - if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy()) - return 0; + if (!Cast || !Cast->getOperand(0)->getType()->isFloatTy()) + return nullptr; // floor((double)floatval) -> (double)floorf(floatval) Value *V = Cast->getOperand(0); @@ -1138,15 +1154,15 @@ struct BinaryDoubleFPOpt : public LibCallOptimization { if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || !FT->getParamType(0)->isFloatingPointTy()) - return 0; + return nullptr; if (CheckRetType) { // Check if all the uses for function like 'fmin/fmax' are converted to // float. for (User *U : CI->users()) { FPTruncInst *Cast = dyn_cast(U); - if (Cast == 0 || !Cast->getType()->isFloatTy()) - return 0; + if (!Cast || !Cast->getType()->isFloatTy()) + return nullptr; } } @@ -1154,13 +1170,13 @@ struct BinaryDoubleFPOpt : public LibCallOptimization { // we convert it to fminf. 
FPExtInst *Cast1 = dyn_cast(CI->getArgOperand(0)); FPExtInst *Cast2 = dyn_cast(CI->getArgOperand(1)); - if (Cast1 == 0 || !Cast1->getOperand(0)->getType()->isFloatTy() || - Cast2 == 0 || !Cast2->getOperand(0)->getType()->isFloatTy()) - return 0; + if (!Cast1 || !Cast1->getOperand(0)->getType()->isFloatTy() || + !Cast2 || !Cast2->getOperand(0)->getType()->isFloatTy()) + return nullptr; // fmin((double)floatval1, (double)floatval2) // -> (double)fmin(floatval1, floatval2) - Value *V = NULL; + Value *V = nullptr; Value *V1 = Cast1->getOperand(0); Value *V2 = Cast2->getOperand(0); V = EmitBinaryFloatFnCall(V1, V2, Callee->getName(), B, @@ -1180,7 +1196,7 @@ struct CosOpt : public UnsafeFPLibCallOptimization { CosOpt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {} Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) override { - Value *Ret = NULL; + Value *Ret = nullptr; if (UnsafeFPShrink && Callee->getName() == "cos" && TLI->has(LibFunc::cosf)) { UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); @@ -1208,7 +1224,7 @@ struct PowOpt : public UnsafeFPLibCallOptimization { PowOpt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {} Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) override { - Value *Ret = NULL; + Value *Ret = nullptr; if (UnsafeFPShrink && Callee->getName() == "pow" && TLI->has(LibFunc::powf)) { UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); @@ -1242,7 +1258,7 @@ struct PowOpt : public UnsafeFPLibCallOptimization { } ConstantFP *Op2C = dyn_cast(Op2); - if (Op2C == 0) return Ret; + if (!Op2C) return Ret; if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0 return ConstantFP::get(CI->getType(), 1.0); @@ -1275,7 +1291,7 @@ struct PowOpt : public UnsafeFPLibCallOptimization { if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip"); - return 0; + return nullptr; } }; @@ -1283,7 +1299,7 @@ struct Exp2Opt : public UnsafeFPLibCallOptimization { Exp2Opt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {} Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) override { - Value *Ret = NULL; + Value *Ret = nullptr; if (UnsafeFPShrink && Callee->getName() == "exp2" && TLI->has(LibFunc::exp2f)) { UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); @@ -1307,7 +1323,7 @@ struct Exp2Opt : public UnsafeFPLibCallOptimization { LdExp = LibFunc::ldexp; if (TLI->has(LdExp)) { - Value *LdExpArg = 0; + Value *LdExpArg = nullptr; if (SIToFPInst *OpC = dyn_cast(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); @@ -1344,7 +1360,7 @@ struct SinCosPiOpt : public LibCallOptimization { // Make sure the prototype is as expected, otherwise the rest of the // function is probably invalid and likely to abort. if (!isTrigLibCall(CI)) - return 0; + return nullptr; Value *Arg = CI->getArgOperand(0); SmallVector SinCalls; @@ -1362,7 +1378,7 @@ struct SinCosPiOpt : public LibCallOptimization { // It's only worthwhile if both sinpi and cospi are actually used. 
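UnaryDoubleFPOpt and BinaryDoubleFPOpt, above, shrink a double-precision libcall to its float twin when the argument is an fpext from float and every use is immediately truncated back to float. At the source level the rewrite corresponds to the following equivalence (a sketch of the semantics, not of the pass itself):

    #include <cmath>
    #include <cstdio>

    int main() {
      float f = 2.75f;

      // Shape of the IR before the fold: extend to double, call the
      // double version, truncate the result straight back to float.
      float before = (float)floor((double)f);

      // Shape after the fold: call the float version directly.
      float after = floorf(f);

      std::printf("%f %f\n", before, after); // same value, fewer conversions
      return 0;
    }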
if (SinCosCalls.empty() && (SinCalls.empty() || CosCalls.empty())) - return 0; + return nullptr; Value *Sin, *Cos, *SinCos; insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, @@ -1372,7 +1388,7 @@ struct SinCosPiOpt : public LibCallOptimization { replaceTrigInsts(CosCalls, Cos); replaceTrigInsts(SinCosCalls, SinCos); - return 0; + return nullptr; } bool isTrigLibCall(CallInst *CI) { @@ -1498,7 +1514,7 @@ struct FFSOpt : public LibCallOptimization { if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy(32) || !FT->getParamType(0)->isIntegerTy()) - return 0; + return nullptr; Value *Op = CI->getArgOperand(0); @@ -1531,7 +1547,7 @@ struct AbsOpt : public LibCallOptimization { // We require integer(integer) where the types agree. if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || FT->getParamType(0) != FT->getReturnType()) - return 0; + return nullptr; // abs(x) -> x >s -1 ? x : -x Value *Op = CI->getArgOperand(0); @@ -1549,7 +1565,7 @@ struct IsDigitOpt : public LibCallOptimization { // We require integer(i32) if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || !FT->getParamType(0)->isIntegerTy(32)) - return 0; + return nullptr; // isdigit(c) -> (c-'0') getArgOperand(0); @@ -1566,7 +1582,7 @@ struct IsAsciiOpt : public LibCallOptimization { // We require integer(i32) if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || !FT->getParamType(0)->isIntegerTy(32)) - return 0; + return nullptr; // isascii(c) -> c getArgOperand(0); @@ -1582,7 +1598,7 @@ struct ToAsciiOpt : public LibCallOptimization { // We require i32(i32) if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isIntegerTy(32)) - return 0; + return nullptr; // toascii(c) -> c & 0x7f return B.CreateAnd(CI->getArgOperand(0), @@ -1612,7 +1628,7 @@ struct ErrorReportingOpt : public LibCallOptimization { CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold); } - return 0; + return nullptr; } protected: @@ -1649,7 +1665,7 @@ struct PrintFOpt : public LibCallOptimization { // Check for a fixed format string. StringRef FormatStr; if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr)) - return 0; + return nullptr; // Empty format string -> noop. if (FormatStr.empty()) // Tolerate printf's declared void. @@ -1660,7 +1676,7 @@ struct PrintFOpt : public LibCallOptimization { // is used, in general the printf return value is not compatible with either // putchar() or puts(). if (!CI->use_empty()) - return 0; + return nullptr; // printf("x") -> putchar('x'), even for '%'. if (FormatStr.size() == 1) { @@ -1697,7 +1713,7 @@ struct PrintFOpt : public LibCallOptimization { CI->getArgOperand(1)->getType()->isPointerTy()) { return EmitPutS(CI->getArgOperand(1), B, DL, TLI); } - return 0; + return nullptr; } Value *callOptimizer(Function *Callee, CallInst *CI, @@ -1707,7 +1723,7 @@ struct PrintFOpt : public LibCallOptimization { if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy())) - return 0; + return nullptr; if (Value *V = optimizeFixedFormatString(Callee, CI, B)) { return V; @@ -1724,7 +1740,7 @@ struct PrintFOpt : public LibCallOptimization { B.Insert(New); return New; } - return 0; + return nullptr; } }; @@ -1734,7 +1750,7 @@ struct SPrintFOpt : public LibCallOptimization { // Check for a fixed format string. 
StringRef FormatStr; if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) - return 0; + return nullptr; // If we just have a format string (nothing else crazy) transform it. if (CI->getNumArgOperands() == 2) { @@ -1742,10 +1758,10 @@ struct SPrintFOpt : public LibCallOptimization { // %% -> % in the future if we cared. for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) if (FormatStr[i] == '%') - return 0; // we found a format specifier, bail out. + return nullptr; // we found a format specifier, bail out. // These optimizations require DataLayout. - if (!DL) return 0; + if (!DL) return nullptr; // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), @@ -1758,12 +1774,12 @@ struct SPrintFOpt : public LibCallOptimization { // and have an extra operand. if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumArgOperands() < 3) - return 0; + return nullptr; // Decode the second character of the format string. if (FormatStr[1] == 'c') { // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 - if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; + if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return nullptr; Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char"); Value *Ptr = CastToCStr(CI->getArgOperand(0), B); B.CreateStore(V, Ptr); @@ -1775,14 +1791,14 @@ struct SPrintFOpt : public LibCallOptimization { if (FormatStr[1] == 's') { // These optimizations require DataLayout. - if (!DL) return 0; + if (!DL) return nullptr; // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1) - if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0; + if (!CI->getArgOperand(2)->getType()->isPointerTy()) return nullptr; Value *Len = EmitStrLen(CI->getArgOperand(2), B, DL, TLI); if (!Len) - return 0; + return nullptr; Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc"); @@ -1791,7 +1807,7 @@ struct SPrintFOpt : public LibCallOptimization { // The sprintf result is the unincremented number of bytes in the string. return B.CreateIntCast(Len, CI->getType(), false); } - return 0; + return nullptr; } Value *callOptimizer(Function *Callee, CallInst *CI, @@ -1801,7 +1817,7 @@ struct SPrintFOpt : public LibCallOptimization { if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy()) - return 0; + return nullptr; if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) { return V; @@ -1818,7 +1834,7 @@ struct SPrintFOpt : public LibCallOptimization { B.Insert(New); return New; } - return 0; + return nullptr; } }; @@ -1831,22 +1847,22 @@ struct FPrintFOpt : public LibCallOptimization { // All the optimizations depend on the format string. StringRef FormatStr; if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) - return 0; + return nullptr; // Do not do any of the following transformations if the fprintf return // value is used, in general the fprintf return value is not compatible // with fwrite(), fputc() or fputs(). if (!CI->use_empty()) - return 0; + return nullptr; // fprintf(F, "foo") --> fwrite("foo", 3, 1, F) if (CI->getNumArgOperands() == 2) { for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) if (FormatStr[i] == '%') // Could handle %% -> % if we cared. - return 0; // We found a format specifier. + return nullptr; // We found a format specifier. // These optimizations require DataLayout. 
- if (!DL) return 0; + if (!DL) return nullptr; return EmitFWrite(CI->getArgOperand(1), ConstantInt::get(DL->getIntPtrType(*Context), @@ -1858,22 +1874,22 @@ struct FPrintFOpt : public LibCallOptimization { // and have an extra operand. if (FormatStr.size() != 2 || FormatStr[0] != '%' || CI->getNumArgOperands() < 3) - return 0; + return nullptr; // Decode the second character of the format string. if (FormatStr[1] == 'c') { // fprintf(F, "%c", chr) --> fputc(chr, F) - if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; + if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return nullptr; return EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, DL, TLI); } if (FormatStr[1] == 's') { // fprintf(F, "%s", str) --> fputs(str, F) if (!CI->getArgOperand(2)->getType()->isPointerTy()) - return 0; + return nullptr; return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, DL, TLI); } - return 0; + return nullptr; } Value *callOptimizer(Function *Callee, CallInst *CI, @@ -1883,7 +1899,7 @@ struct FPrintFOpt : public LibCallOptimization { if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy()) - return 0; + return nullptr; if (Value *V = optimizeFixedFormatString(Callee, CI, B)) { return V; @@ -1900,7 +1916,7 @@ struct FPrintFOpt : public LibCallOptimization { B.Insert(New); return New; } - return 0; + return nullptr; } }; @@ -1917,12 +1933,12 @@ struct FWriteOpt : public LibCallOptimization { !FT->getParamType(2)->isIntegerTy() || !FT->getParamType(3)->isPointerTy() || !FT->getReturnType()->isIntegerTy()) - return 0; + return nullptr; // Get the element size and count. ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); - if (!SizeC || !CountC) return 0; + if (!SizeC || !CountC) return nullptr; uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue(); // If this is writing zero records, remove the call (it's a noop). @@ -1934,10 +1950,10 @@ struct FWriteOpt : public LibCallOptimization { if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F) Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char"); Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, DL, TLI); - return NewCI ? ConstantInt::get(CI->getType(), 1) : 0; + return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr; } - return 0; + return nullptr; } }; @@ -1948,18 +1964,18 @@ struct FPutsOpt : public LibCallOptimization { (void) ER.callOptimizer(Callee, CI, B); // These optimizations require DataLayout. - if (!DL) return 0; + if (!DL) return nullptr; // Require two pointers. Also, we can't optimize if return value is used. FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !CI->use_empty()) - return 0; + return nullptr; // fputs(s,F) --> fwrite(s,1,strlen(s),F) uint64_t Len = GetStringLength(CI->getArgOperand(0)); - if (!Len) return 0; + if (!Len) return nullptr; // Known to have no uses (see above). return EmitFWrite(CI->getArgOperand(0), ConstantInt::get(DL->getIntPtrType(*Context), Len-1), @@ -1975,12 +1991,12 @@ struct PutsOpt : public LibCallOptimization { if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy())) - return 0; + return nullptr; // Check for a constant string.
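The printf-family folds in the hunks above are easiest to read at the source level. A minimal illustration, not part of the patch, assuming constant format strings, unused return values, and DataLayout where the code requires it:

#include <cstdio>
#include <cstring>

void demo(FILE *F, char *Dst, const char *S, int C) {
  printf("x");          // -> putchar('x'), even for "%"
  fprintf(F, "%c", C);  // -> fputc(C, F)
  fprintf(F, "%s", S);  // -> fputs(S, F)
  sprintf(Dst, "hi");   // -> llvm.memcpy(Dst, "hi", strlen("hi")+1, 1)
  fputs(S, F);          // -> fwrite(S, 1, strlen(S), F)
  fwrite(S, 0, 1, F);   // -> deleted: writing zero records is a no-op
}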
StringRef Str; if (!getConstantStringInfo(CI->getArgOperand(0), Str)) - return 0; + return nullptr; if (Str.empty() && CI->use_empty()) { // puts("") -> putchar('\n') @@ -1989,7 +2005,7 @@ struct PutsOpt : public LibCallOptimization { return B.CreateIntCast(Res, CI->getType(), true); } - return 0; + return nullptr; } }; @@ -2100,7 +2116,7 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) { case Intrinsic::exp2: return &Exp2; default: - return 0; + return nullptr; } } @@ -2210,7 +2226,7 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) { case LibFunc::trunc: if (hasFloatVersion(FuncName)) return &UnaryDoubleFP; - return 0; + return nullptr; case LibFunc::acos: case LibFunc::acosh: case LibFunc::asin: @@ -2234,16 +2250,16 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) { case LibFunc::tanh: if (UnsafeFPShrink && hasFloatVersion(FuncName)) return &UnsafeUnaryDoubleFP; - return 0; + return nullptr; case LibFunc::fmin: case LibFunc::fmax: if (hasFloatVersion(FuncName)) return &BinaryDoubleFP; - return 0; + return nullptr; case LibFunc::memcpy_chk: return &MemCpyChk; default: - return 0; + return nullptr; } } @@ -2263,7 +2279,7 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) { return &StrNCpyChk; } - return 0; + return nullptr; } @@ -2273,7 +2289,7 @@ Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) { IRBuilder<> Builder(CI); return LCO->optimizeCall(CI, DL, TLI, LCS, Builder); } - return 0; + return nullptr; } LibCallSimplifier::LibCallSimplifier(const DataLayout *DL, @@ -2287,7 +2303,7 @@ LibCallSimplifier::~LibCallSimplifier() { } Value *LibCallSimplifier::optimizeCall(CallInst *CI) { - if (CI->isNoBuiltin()) return 0; + if (CI->isNoBuiltin()) return nullptr; return Impl->optimizeCall(CI); } diff --git a/lib/Transforms/Utils/SpecialCaseList.cpp b/lib/Transforms/Utils/SpecialCaseList.cpp index c318560..2c6fcd1 100644 --- a/lib/Transforms/Utils/SpecialCaseList.cpp +++ b/lib/Transforms/Utils/SpecialCaseList.cpp @@ -41,7 +41,7 @@ struct SpecialCaseList::Entry { StringSet<> Strings; Regex *RegEx; - Entry() : RegEx(0) {} + Entry() : RegEx(nullptr) {} bool match(StringRef Query) const { return Strings.count(Query) || (RegEx && RegEx->match(Query)); @@ -57,7 +57,7 @@ SpecialCaseList *SpecialCaseList::create( std::unique_ptr<MemoryBuffer> File; if (error_code EC = MemoryBuffer::getFile(Path, File)) { Error = (Twine("Can't open file '") + Path + "': " + EC.message()).str(); - return 0; + return nullptr; } return create(File.get(), Error); } @@ -66,7 +66,7 @@ SpecialCaseList *SpecialCaseList::create( const MemoryBuffer *MB, std::string &Error) { std::unique_ptr<SpecialCaseList> SCL(new SpecialCaseList()); if (!SCL->parse(MB, Error)) - return 0; + return nullptr; return SCL.release(); } diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp index 560f581..0c2fc0a 100644 --- a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp +++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -59,7 +59,7 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) { // Then unreachable blocks. if (UnreachableBlocks.empty()) { - UnreachableBlock = 0; + UnreachableBlock = nullptr; } else if (UnreachableBlocks.size() == 1) { UnreachableBlock = UnreachableBlocks.front(); } else { @@ -77,7 +77,7 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) { // Now handle return blocks.
if (ReturningBlocks.empty()) { - ReturnBlock = 0; + ReturnBlock = nullptr; return false; // No blocks return } else if (ReturningBlocks.size() == 1) { ReturnBlock = ReturningBlocks.front(); // Already has a single return block @@ -91,9 +91,9 @@ bool UnifyFunctionExitNodes::runOnFunction(Function &F) { BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), "UnifiedReturnBlock", &F); - PHINode *PN = 0; + PHINode *PN = nullptr; if (F.getReturnType()->isVoidTy()) { - ReturnInst::Create(F.getContext(), NULL, NewRetBlock); + ReturnInst::Create(F.getContext(), nullptr, NewRetBlock); } else { // If the function doesn't return void... add a PHI node to the block... PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(), diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index 457fc80..0f20e6d 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -71,12 +71,12 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // Check all operands to see if any need to be remapped. for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) { Value *OP = MD->getOperand(i); - if (OP == 0) continue; + if (!OP) continue; Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper, Materializer); // Use identity map if Mapped_Op is null and we can ignore missing // entries. if (Mapped_OP == OP || - (Mapped_OP == 0 && (Flags & RF_IgnoreMissingEntries))) + (Mapped_OP == nullptr && (Flags & RF_IgnoreMissingEntries))) continue; // Ok, at least one operand needs remapping. @@ -84,13 +84,13 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, Elts.reserve(MD->getNumOperands()); for (i = 0; i != e; ++i) { Value *Op = MD->getOperand(i); - if (Op == 0) - Elts.push_back(0); + if (!Op) + Elts.push_back(nullptr); else { Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper, Materializer); // Use identity map if Mapped_Op is null and we can ignore missing // entries. - if (Mapped_Op == 0 && (Flags & RF_IgnoreMissingEntries)) + if (Mapped_Op == nullptr && (Flags & RF_IgnoreMissingEntries)) Mapped_Op = Op; Elts.push_back(Mapped_Op); } @@ -112,8 +112,8 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // Okay, this either must be a constant (which may or may not be mappable) or // is something that is not in the mapping table. Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V)); - if (C == 0) - return 0; + if (!C) + return nullptr; if (BlockAddress *BA = dyn_cast<BlockAddress>(C)) { Function *F = @@ -126,7 +126,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // Otherwise, we have some other constant to remap. Start by checking to see // if all operands have an identity remapping. unsigned OpNo = 0, NumOperands = C->getNumOperands(); - Value *Mapped = 0; + Value *Mapped = nullptr; for (; OpNo != NumOperands; ++OpNo) { Value *Op = C->getOperand(OpNo); Mapped = MapValue(Op, VM, Flags, TypeMapper, Materializer); @@ -187,7 +187,7 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) { Value *V = MapValue(*op, VMap, Flags, TypeMapper, Materializer); // If we aren't ignoring missing entries, assert that something happened.
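Most of this patch is the mechanical 0/NULL-to-nullptr modernization seen above. A freestanding reminder of why the C++11 keyword is preferred (illustrative, not from the tree): nullptr has its own type, std::nullptr_t, and so participates cleanly in overload resolution, while 0 is an int first.

void take(int) {}     // selected for a literal 0
void take(char *) {}  // selected for nullptr

int main() {
  take(0);        // calls take(int): 0 is an integer literal
  take(nullptr);  // unambiguously calls take(char *)
  // take(NULL);  // depending on how NULL is defined, this picks take(int)
  //              // or fails to compile as ambiguous; nullptr avoids both
}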
- if (V != 0) + if (V) *op = V; else assert((Flags & RF_IgnoreMissingEntries) && @@ -199,7 +199,7 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *V = MapValue(PN->getIncomingBlock(i), VMap, Flags); // If we aren't ignoring missing entries, assert that something happened. - if (V != 0) + if (V) PN->setIncomingBlock(i, cast(V)); else assert((Flags & RF_IgnoreMissingEntries) && diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 71350e7..28ec83b 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -15,7 +15,6 @@ //===----------------------------------------------------------------------===// #define BBV_NAME "bb-vectorize" -#define DEBUG_TYPE BBV_NAME #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -50,6 +49,8 @@ #include using namespace llvm; +#define DEBUG_TYPE BBV_NAME + static cl::opt IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), cl::Hidden, cl::desc("Ignore target information")); @@ -122,6 +123,10 @@ NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize floating-point math intrinsics")); static cl::opt + NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize BitManipulation intrinsics")); + +static cl::opt NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); @@ -202,8 +207,8 @@ namespace { DT = &P->getAnalysis().getDomTree(); SE = &P->getAnalysis(); DataLayoutPass *DLP = P->getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; - TTI = IgnoreTargetInfo ? 0 : &P->getAnalysis(); + DL = DLP ? &DLP->getDataLayout() : nullptr; + TTI = IgnoreTargetInfo ? nullptr : &P->getAnalysis(); } typedef std::pair ValuePair; @@ -279,7 +284,7 @@ namespace { bool trackUsesOfI(DenseSet &Users, AliasSetTracker &WriteSet, Instruction *I, Instruction *J, bool UpdateUsers = true, - DenseSet *LoadMoveSetPairs = 0); + DenseSet *LoadMoveSetPairs = nullptr); void computePairsConnectedTo( DenseMap > &CandidatePairs, @@ -292,8 +297,8 @@ namespace { bool pairsConflict(ValuePair P, ValuePair Q, DenseSet &PairableInstUsers, DenseMap > - *PairableInstUserMap = 0, - DenseSet *PairableInstUserPairSet = 0); + *PairableInstUserMap = nullptr, + DenseSet *PairableInstUserPairSet = nullptr); bool pairWillFormCycle(ValuePair P, DenseMap > &PairableInstUsers, @@ -438,8 +443,8 @@ namespace { DT = &getAnalysis().getDomTree(); SE = &getAnalysis(); DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; - TTI = IgnoreTargetInfo ? 0 : &getAnalysis(); + DL = DLP ? &DLP->getDataLayout() : nullptr; + TTI = IgnoreTargetInfo ? 
nullptr : &getAnalysis(); return vectorizeBB(BB); } @@ -674,7 +679,20 @@ namespace { case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::pow: + case Intrinsic::round: + case Intrinsic::copysign: + case Intrinsic::ceil: + case Intrinsic::nearbyint: + case Intrinsic::rint: + case Intrinsic::trunc: + case Intrinsic::floor: + case Intrinsic::fabs: return Config.VectorizeMath; + case Intrinsic::bswap: + case Intrinsic::ctpop: + case Intrinsic::ctlz: + case Intrinsic::cttz: + return Config.VectorizeBitManipulations; case Intrinsic::fma: case Intrinsic::fmuladd: return Config.VectorizeFMA; @@ -878,7 +896,7 @@ namespace { } // We can't vectorize memory operations without target data - if (DL == 0 && IsSimpleLoadStore) + if (!DL && IsSimpleLoadStore) return false; Type *T1, *T2; @@ -915,7 +933,7 @@ namespace { if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) return false; - if ((!Config.VectorizePointers || DL == 0) && + if ((!Config.VectorizePointers || !DL) && (T1->getScalarType()->isPointerTy() || T2->getScalarType()->isPointerTy())) return false; @@ -1049,7 +1067,7 @@ namespace { (isa(JOp) || isa(JOp))) { Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; Constant *SplatValue = cast(IOp)->getSplatValue(); - if (SplatValue != NULL && + if (SplatValue != nullptr && SplatValue == cast(JOp)->getSplatValue()) Op2VK = TargetTransformInfo::OK_UniformConstantValue; } @@ -1079,13 +1097,14 @@ namespace { CostSavings = ICost + JCost - VCost; } - // The powi intrinsic is special because only the first argument is - // vectorized, the second arguments must be equal. + // The powi,ctlz,cttz intrinsics are special because only the first + // argument is vectorized, the second arguments must be equal. CallInst *CI = dyn_cast(I); Function *FI; if (CI && (FI = CI->getCalledFunction())) { Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID(); - if (IID == Intrinsic::powi) { + if (IID == Intrinsic::powi || IID == Intrinsic::ctlz || + IID == Intrinsic::cttz) { Value *A1I = CI->getArgOperand(1), *A1J = cast(J)->getArgOperand(1); const SCEV *A1ISCEV = SE->getSCEV(A1I), @@ -1109,7 +1128,8 @@ namespace { assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && "Intrinsic argument counts differ"); for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - if (IID == Intrinsic::powi && i == 1) + if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || + IID == Intrinsic::cttz) && i == 1) Tys.push_back(CI->getArgOperand(i)->getType()); else Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), @@ -1665,8 +1685,9 @@ namespace { C2->first.second == C->first.first || C2->first.second == C->first.second || pairsConflict(C2->first, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0, - UseCycleCheck ? &PairableInstUserPairSet : 0)) { + UseCycleCheck ? &PairableInstUserMap : nullptr, + UseCycleCheck ? &PairableInstUserPairSet + : nullptr)) { if (C2->second >= C->second) { CanAdd = false; break; @@ -1686,8 +1707,9 @@ namespace { T->second == C->first.first || T->second == C->first.second || pairsConflict(*T, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0, - UseCycleCheck ? &PairableInstUserPairSet : 0)) { + UseCycleCheck ? &PairableInstUserMap : nullptr, + UseCycleCheck ? &PairableInstUserPairSet + : nullptr)) { CanAdd = false; break; } @@ -1704,8 +1726,9 @@ namespace { C2->first.second == C->first.first || C2->first.second == C->first.second || pairsConflict(C2->first, C->first, PairableInstUsers, - UseCycleCheck ? 
&PairableInstUserMap : 0, - UseCycleCheck ? &PairableInstUserPairSet : 0)) { + UseCycleCheck ? &PairableInstUserMap : nullptr, + UseCycleCheck ? &PairableInstUserPairSet + : nullptr)) { CanAdd = false; break; } @@ -1720,8 +1743,9 @@ namespace { ChosenPairs.begin(), E2 = ChosenPairs.end(); C2 != E2; ++C2) { if (pairsConflict(*C2, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0, - UseCycleCheck ? &PairableInstUserPairSet : 0)) { + UseCycleCheck ? &PairableInstUserMap : nullptr, + UseCycleCheck ? &PairableInstUserPairSet + : nullptr)) { CanAdd = false; break; } @@ -1802,8 +1826,8 @@ namespace { for (DenseMap::iterator C = ChosenPairs.begin(), E = ChosenPairs.end(); C != E; ++C) { if (pairsConflict(*C, IJ, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0, - UseCycleCheck ? &PairableInstUserPairSet : 0)) { + UseCycleCheck ? &PairableInstUserMap : nullptr, + UseCycleCheck ? &PairableInstUserPairSet : nullptr)) { DoesConflict = true; break; } @@ -2373,7 +2397,7 @@ namespace { } while ((LIENext = dyn_cast(LIENext->getOperand(0)))); - LIENext = 0; + LIENext = nullptr; Value *LIEPrev = UndefValue::get(ArgTypeH); for (unsigned i = 0; i < numElemL; ++i) { if (isa(VectElemts[i])) continue; @@ -2441,14 +2465,14 @@ namespace { if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) { // We can have at most two unique vector inputs. bool CanUseInputs = true; - Value *I1, *I2 = 0; + Value *I1, *I2 = nullptr; if (LEE) { I1 = LEE->getOperand(0); } else { I1 = LSV->getOperand(0); I2 = LSV->getOperand(1); if (I2 == I1 || isa(I2)) - I2 = 0; + I2 = nullptr; } if (HEE) { @@ -2764,10 +2788,11 @@ namespace { ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType); continue; - } else if (IID == Intrinsic::powi && o == 1) { - // The second argument of powi is a single integer and we've already - // checked that both arguments are equal. As a result, we just keep - // I's second argument. + } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || + IID == Intrinsic::cttz) && o == 1) { + // The second argument of powi/ctlz/cttz is a single integer/constant + // and we've already checked that both arguments are equal. + // As a result, we just keep I's second argument. 
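To make the special-casing above concrete: when BBVectorize fuses a pair of powi/ctlz/cttz calls, only operand 0 is widened to a vector; operand 1 (the i1 is_zero_undef flag for ctlz/cttz, the exponent for powi) stays scalar, which is why both members of the pair must agree on it. A hedged sketch against the LLVM C++ API of this era; fuseCtlzPair and its parameters are illustrative names, not code from the patch:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Emit <2 x i32> llvm.ctlz.v2i32(<2 x i32> %v, i1 %flag): the vector type
// replaces the data operand's type, but the i1 flag is passed through
// unchanged, exactly as ReplacedOperands[o] = I->getOperand(o) does above.
static Value *fuseCtlzPair(Module &M, IRBuilder<> &B, Value *V2xI32,
                           Value *Flag) {
  Type *VecTy = VectorType::get(B.getInt32Ty(), 2);
  Function *Ctlz = Intrinsic::getDeclaration(&M, Intrinsic::ctlz, VecTy);
  return B.CreateCall2(Ctlz, V2xI32, Flag);
}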
ReplacedOperands[o] = I->getOperand(o); continue; } @@ -2952,7 +2977,7 @@ namespace { switch (Kind) { default: - K->setMetadata(Kind, 0); // Remove unknown metadata + K->setMetadata(Kind, nullptr); // Remove unknown metadata break; case LLVMContext::MD_tbaa: K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD)); @@ -3123,7 +3148,7 @@ namespace { // Instruction insertion point: Instruction *InsertionPt = K; - Instruction *K1 = 0, *K2 = 0; + Instruction *K1 = nullptr, *K2 = nullptr; replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); // The use dag of the first original instruction must be moved to after @@ -3213,6 +3238,7 @@ VectorizeConfig::VectorizeConfig() { VectorizePointers = !::NoPointers; VectorizeCasts = !::NoCasts; VectorizeMath = !::NoMath; + VectorizeBitManipulations = !::NoBitManipulation; VectorizeFMA = !::NoFMA; VectorizeSelect = !::NoSelect; VectorizeCmp = !::NoCmp; diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 9a98c44..34d8a10 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -42,9 +42,6 @@ // //===----------------------------------------------------------------------===// -#define LV_NAME "loop-vectorize" -#define DEBUG_TYPE LV_NAME - #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/EquivalenceClasses.h" @@ -54,6 +51,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfo.h" @@ -67,7 +65,9 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -85,16 +85,23 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/VectorUtils.h" #include #include +#include using namespace llvm; using namespace llvm::PatternMatch; +#define LV_NAME "loop-vectorize" +#define DEBUG_TYPE LV_NAME + +STATISTIC(LoopsVectorized, "Number of loops vectorized"); +STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); + static cl::opt VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect.")); @@ -223,8 +230,9 @@ public: const TargetLibraryInfo *TLI, unsigned VecWidth, unsigned UnrollFactor) : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI), - VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0), - OldInduction(0), WidenMap(UnrollFactor), Legal(0) {} + VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), + Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), + Legal(nullptr) {} // Perform the actual loop widening (vectorization). void vectorize(LoopVectorizationLegality *L) { @@ -469,6 +477,24 @@ static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { B.SetCurrentDebugLocation(DebugLoc()); } +#ifndef NDEBUG +/// \return string containing a file name and a line # for the given loop. 
+static std::string getDebugLocString(const Loop *L) { + std::string Result; + if (L) { + raw_string_ostream OS(Result); + const DebugLoc LoopDbgLoc = L->getStartLoc(); + if (!LoopDbgLoc.isUnknown()) + LoopDbgLoc.print(L->getHeader()->getContext(), OS); + else + // Just print the module name. + OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); + OS.flush(); + } + return Result; +} +#endif + /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the @@ -491,8 +517,8 @@ public: LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL, DominatorTree *DT, TargetLibraryInfo *TLI) : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL), - DT(DT), TLI(TLI), Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false), - MaxSafeDepDistBytes(-1U) {} + DT(DT), TLI(TLI), Induction(nullptr), WidestIndTy(nullptr), + HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -530,7 +556,7 @@ public: /// This struct holds information about reduction variables. struct ReductionDescriptor { - ReductionDescriptor() : StartValue(0), LoopExitInstr(0), + ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr), Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K, @@ -602,7 +628,7 @@ public: /// A struct for saving information about induction variables. struct InductionInfo { InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} - InductionInfo() : StartValue(0), IK(IK_NoInduction) {} + InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {} /// Start value. TrackingVH StartValue; /// Induction kind. @@ -789,7 +815,8 @@ public: /// then this vectorization factor will be selected if vectorization is /// possible. VectorizationFactor selectVectorizationFactor(bool OptForSize, - unsigned UserVF); + unsigned UserVF, + bool ForceVectorization); /// \return The size (in bits) of the widest type in the code that /// needs to be vectorized. We ignore values that remain scalar such as @@ -856,35 +883,32 @@ private: /// Utility class for getting and setting loop vectorizer hints in the form /// of loop metadata. -struct LoopVectorizeHints { - /// Vectorization width. - unsigned Width; - /// Vectorization unroll factor. - unsigned Unroll; - /// Vectorization forced (-1 not selected, 0 force disabled, 1 force enabled) - int Force; +class LoopVectorizeHints { +public: + enum ForceKind { + FK_Undefined = -1, ///< Not selected. + FK_Disabled = 0, ///< Forcing disabled. + FK_Enabled = 1, ///< Forcing enabled. + }; LoopVectorizeHints(const Loop *L, bool DisableUnrolling) - : Width(VectorizationFactor) - , Unroll(DisableUnrolling ? 1 : VectorizationUnroll) - , Force(-1) - , LoopID(L->getLoopID()) { + : Width(VectorizationFactor), + Unroll(DisableUnrolling), + Force(FK_Undefined), + LoopID(L->getLoopID()) { getHints(L); - // The command line options override any loop metadata except for when - // width == 1 which is used to indicate the loop is already vectorized. - if (VectorizationFactor.getNumOccurrences() > 0 && Width != 1) - Width = VectorizationFactor; + // force-vector-unroll overrides DisableUnrolling. 
if (VectorizationUnroll.getNumOccurrences() > 0) Unroll = VectorizationUnroll; - DEBUG(if (DisableUnrolling && Unroll == 1) - dbgs() << "LV: Unrolling disabled by the pass manager\n"); + DEBUG(if (DisableUnrolling && Unroll == 1) dbgs() + << "LV: Unrolling disabled by the pass manager\n"); } /// Return the loop vectorizer metadata prefix. static StringRef Prefix() { return "llvm.vectorizer."; } - MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) { + MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) const { SmallVector Vals; Vals.push_back(MDString::get(Context, Name)); Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V)); @@ -918,9 +942,12 @@ struct LoopVectorizeHints { LoopID = NewLoopID; } -private: - MDNode *LoopID; + unsigned getWidth() const { return Width; } + unsigned getUnroll() const { return Unroll; } + enum ForceKind getForce() const { return Force; } + MDNode *getLoopID() const { return LoopID; } +private: /// Find hints specified in the loop metadata. void getHints(const Loop *L) { if (!LoopID) @@ -931,7 +958,7 @@ private: assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - const MDString *S = 0; + const MDString *S = nullptr; SmallVector Args; // The expected hint is either a MDString or a MDNode with the first @@ -980,13 +1007,23 @@ private: DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n"); } else if (Hint == "enable") { if (C->getBitWidth() == 1) - Force = Val; + Force = Val == 1 ? LoopVectorizeHints::FK_Enabled + : LoopVectorizeHints::FK_Disabled; else DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n"); } else { DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n'); } } + + /// Vectorization width. + unsigned Width; + /// Vectorization unroll factor. + unsigned Unroll; + /// Vectorization forced + enum ForceKind Force; + + MDNode *LoopID; }; static void addInnerLoop(Loop &L, SmallVectorImpl &V) { @@ -1024,7 +1061,7 @@ struct LoopVectorize : public FunctionPass { bool runOnFunction(Function &F) override { SE = &getAnalysis(); DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; LI = &getAnalysis(); TTI = &getAnalysis(); DT = &getAnalysis().getDomTree(); @@ -1041,8 +1078,9 @@ struct LoopVectorize : public FunctionPass { if (!TTI->getNumberOfRegisters(true)) return false; - if (DL == NULL) { - DEBUG(dbgs() << "LV: Not vectorizing: Missing data layout\n"); + if (!DL) { + DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName() + << ": Missing data layout\n"); return false; } @@ -1054,6 +1092,8 @@ struct LoopVectorize : public FunctionPass { for (Loop *L : *LI) addInnerLoop(*L, Worklist); + LoopsAnalyzed += Worklist.size(); + // Now walk the identified inner loops. 
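The refactored LoopVectorizeHints consumes per-loop "llvm.vectorizer.*" metadata (Prefix() plus "width", "unroll" and "enable"), which getHints() parses into Width, Unroll and Force. At the source level such metadata typically originates from clang's loop pragma; a sketch of that path, assuming a clang of this period:

// The pragma below is lowered to loop metadata roughly of the form
//   !{!"llvm.vectorizer.width", i32 4}
//   !{!"llvm.vectorizer.enable", i1 true}
// which getHints() above turns into Width = 4 and Force = FK_Enabled.
void saxpy(int N, float A, float *X, float *Y) {
#pragma clang loop vectorize(enable) vectorize_width(4)
  for (int I = 0; I < N; ++I)
    Y[I] = A * X[I] + Y[I];
}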
bool Changed = false; while (!Worklist.empty()) @@ -1065,26 +1105,56 @@ struct LoopVectorize : public FunctionPass { bool processLoop(Loop *L) { assert(L->empty() && "Only process inner loops."); - DEBUG(dbgs() << "LV: Checking a loop in \"" << - L->getHeader()->getParent()->getName() << "\"\n"); + +#ifndef NDEBUG + const std::string DebugLocStr = getDebugLocString(L); +#endif /* NDEBUG */ + + DEBUG(dbgs() << "\nLV: Checking a loop in \"" + << L->getHeader()->getParent()->getName() << "\" from " + << DebugLocStr << "\n"); LoopVectorizeHints Hints(L, DisableUnrolling); - if (Hints.Force == 0) { + DEBUG(dbgs() << "LV: Loop hints:" + << " force=" + << (Hints.getForce() == LoopVectorizeHints::FK_Disabled + ? "disabled" + : (Hints.getForce() == LoopVectorizeHints::FK_Enabled + ? "enabled" + : "?")) << " width=" << Hints.getWidth() + << " unroll=" << Hints.getUnroll() << "\n"); + + if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) { DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); return false; } - if (!AlwaysVectorize && Hints.Force != 1) { + if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) { DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); return false; } - if (Hints.Width == 1 && Hints.Unroll == 1) { + if (Hints.getWidth() == 1 && Hints.getUnroll() == 1) { DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); return false; } + // Check the loop for a trip count threshold: + // do not vectorize loops with a tiny trip count. + BasicBlock *Latch = L->getLoopLatch(); + const unsigned TC = SE->getSmallConstantTripCount(L, Latch); + if (TC > 0u && TC < TinyTripCountVectorThreshold) { + DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " + << "This loop is not worth vectorizing."); + if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) + DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); + else { + DEBUG(dbgs() << "\n"); + return false; + } + } + // Check if it is legal to vectorize the loop. LoopVectorizationLegality LVL(L, SE, DL, DT, TLI); if (!LVL.canVectorize()) { @@ -1098,8 +1168,8 @@ struct LoopVectorize : public FunctionPass { // Check the function attributes to find out if this function should be // optimized for size. Function *F = L->getHeader()->getParent(); - bool OptForSize = - Hints.Force != 1 && F->hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && + F->hasFnAttribute(Attribute::OptimizeForSize); // Compute the weighted frequency of this loop being executed and see if it // is less than 20% of the function entry baseline frequency. Note that we @@ -1108,7 +1178,8 @@ struct LoopVectorize : public FunctionPass { // exactly what block frequency models. if (LoopVectorizeWithBlockFrequency) { BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader()); - if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq) + if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && + LoopEntryFreq < ColdEntryFreq) OptForSize = true; } @@ -1123,14 +1194,17 @@ struct LoopVectorize : public FunctionPass { } // Select the optimal vectorization factor. - LoopVectorizationCostModel::VectorizationFactor VF; - VF = CM.selectVectorizationFactor(OptForSize, Hints.Width); + const LoopVectorizationCostModel::VectorizationFactor VF = + CM.selectVectorizationFactor(OptForSize, Hints.getWidth(), + Hints.getForce() == + LoopVectorizeHints::FK_Enabled); + // Select the unroll factor. 
- unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width, - VF.Cost); + const unsigned UF = + CM.selectUnrollFactor(OptForSize, Hints.getUnroll(), VF.Width, VF.Cost); - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<< - F->getParent()->getModuleIdentifier() << '\n'); + DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " + << DebugLocStr << '\n'); DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n'); if (VF.Width == 1) { @@ -1138,6 +1212,13 @@ struct LoopVectorize : public FunctionPass { if (UF == 1) return false; DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n"); + + // Report the unrolling decision. + emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + Twine("unrolled with interleaving factor " + + Twine(UF) + + " (vectorization not beneficial)")); + // We decided not to vectorize, but we may want to unroll. InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF); Unroller.vectorize(&LVL); @@ -1145,6 +1226,13 @@ struct LoopVectorize : public FunctionPass { // If we decided that it is *legal* to vectorize the loop then do it. InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); LB.vectorize(&LVL); + ++LoopsVectorized; + + // Report the vectorization decision. + emitOptimizationRemark( + F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) + + ", unrolling interleave factor: " + Twine(UF) + ")"); } // Mark the loop as already vectorized to avoid vectorizing again. @@ -1188,7 +1276,7 @@ static Value *stripIntegerCast(Value *V) { /// \p Ptr. static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE, ValueToValueMap &PtrToStride, - Value *Ptr, Value *OrigPtr = 0) { + Value *Ptr, Value *OrigPtr = nullptr) { const SCEV *OrigSCEV = SE->getSCEV(Ptr); @@ -1355,7 +1443,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // We can emit wide load/stores only if the last non-zero index is the // induction variable. - const SCEV *Last = 0; + const SCEV *Last = nullptr; if (!Strides.count(Gep)) Last = SE->getSCEV(Gep->getOperand(InductionOperand)); else { @@ -1604,17 +1692,17 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *UndefVec = IsVoidRetTy ? 0 : + Value *UndefVec = IsVoidRetTy ? nullptr : UndefValue::get(VectorType::get(Instr->getType(), VF)); // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); Instruction *InsertPt = Builder.GetInsertPoint(); BasicBlock *IfBlock = Builder.GetInsertBlock(); - BasicBlock *CondBlock = 0; + BasicBlock *CondBlock = nullptr; VectorParts Cond; - Loop *VectorLp = 0; + Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); @@ -1630,7 +1718,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic for (unsigned Width = 0; Width < VF; ++Width) { // Start if-block. 
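The two emitOptimizationRemark() calls added above surface the vectorize/interleave decisions as real diagnostics (consumable by a driver remark flag) rather than DEBUG-only output. A minimal sketch of the API as used at those call sites; reportDecision is an illustrative helper, not part of the patch:

#include "llvm/ADT/Twine.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"

using namespace llvm;

// Mirrors the call sites above: remarks are keyed by the pass name
// (DEBUG_TYPE, here "loop-vectorize") and attached to the loop's DebugLoc.
static void reportDecision(Function &F, const DebugLoc &Loc, unsigned VF,
                           unsigned UF) {
  emitOptimizationRemark(F.getContext(), "loop-vectorize", F, Loc,
                         Twine("vectorized loop (vectorization factor: ") +
                             Twine(VF) + ", unrolling interleave factor: " +
                             Twine(UF) + ")");
}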
- Value *Cmp = 0; + Value *Cmp = nullptr; if (IfPredicateStore) { Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); @@ -1681,21 +1769,21 @@ static Instruction *getFirstInst(Instruction *FirstInst, Value *V, if (FirstInst) return FirstInst; if (Instruction *I = dyn_cast(V)) - return I->getParent() == Loc->getParent() ? I : 0; - return 0; + return I->getParent() == Loc->getParent() ? I : nullptr; + return nullptr; } std::pair InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { - Instruction *tnullptr = 0; + Instruction *tnullptr = nullptr; if (!Legal->mustCheckStrides()) return std::pair(tnullptr, tnullptr); IRBuilder<> ChkBuilder(Loc); // Emit checks. - Value *Check = 0; - Instruction *FirstInst = 0; + Value *Check = nullptr; + Instruction *FirstInst = nullptr; for (SmallPtrSet::iterator SI = Legal->strides_begin(), SE = Legal->strides_end(); SI != SE; ++SI) { @@ -1727,7 +1815,7 @@ InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = Legal->getRuntimePointerCheck(); - Instruction *tnullptr = 0; + Instruction *tnullptr = nullptr; if (!PtrRtCheck->Need) return std::pair(tnullptr, tnullptr); @@ -1737,7 +1825,7 @@ InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { LLVMContext &Ctx = Loc->getContext(); SCEVExpander Exp(*SE, "induction"); - Instruction *FirstInst = 0; + Instruction *FirstInst = nullptr; for (unsigned i = 0; i < NumPointers; ++i) { Value *Ptr = PtrRtCheck->Pointers[i]; @@ -1764,7 +1852,7 @@ InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { IRBuilder<> ChkBuilder(Loc); // Our instructions might fold to a constant. - Value *MemoryRuntimeCheck = 0; + Value *MemoryRuntimeCheck = nullptr; for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i+1; j < NumPointers; ++j) { // No need to check if two readonly pointers intersect. @@ -2028,7 +2116,7 @@ void InnerLoopVectorizer::createEmptyLoop() { // start value. // This variable saves the new starting index for the scalar loop. - PHINode *ResumeIndex = 0; + PHINode *ResumeIndex = nullptr; LoopVectorizationLegality::InductionList::iterator I, E; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); // Set builder to point to last bypass block. @@ -2044,9 +2132,9 @@ void InnerLoopVectorizer::createEmptyLoop() { // truncated version for the scalar loop. PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? 
PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", - MiddleBlock->getTerminator()) : 0; + MiddleBlock->getTerminator()) : nullptr; - Value *EndValue = 0; + Value *EndValue = nullptr; switch (II.IK) { case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); @@ -2209,148 +2297,6 @@ LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { } } -static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I, - Intrinsic::ID ValidIntrinsicID) { - if (I.getNumArgOperands() != 1 || - !I.getArgOperand(0)->getType()->isFloatingPointTy() || - I.getType() != I.getArgOperand(0)->getType() || - !I.onlyReadsMemory()) - return Intrinsic::not_intrinsic; - - return ValidIntrinsicID; -} - -static Intrinsic::ID checkBinaryFloatSignature(const CallInst &I, - Intrinsic::ID ValidIntrinsicID) { - if (I.getNumArgOperands() != 2 || - !I.getArgOperand(0)->getType()->isFloatingPointTy() || - !I.getArgOperand(1)->getType()->isFloatingPointTy() || - I.getType() != I.getArgOperand(0)->getType() || - I.getType() != I.getArgOperand(1)->getType() || - !I.onlyReadsMemory()) - return Intrinsic::not_intrinsic; - - return ValidIntrinsicID; -} - - -static Intrinsic::ID -getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { - // If we have an intrinsic call, check if it is trivially vectorizable. - if (IntrinsicInst *II = dyn_cast(CI)) { - switch (II->getIntrinsicID()) { - case Intrinsic::sqrt: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::log: - case Intrinsic::log10: - case Intrinsic::log2: - case Intrinsic::fabs: - case Intrinsic::copysign: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::round: - case Intrinsic::pow: - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - return II->getIntrinsicID(); - default: - return Intrinsic::not_intrinsic; - } - } - - if (!TLI) - return Intrinsic::not_intrinsic; - - LibFunc::Func Func; - Function *F = CI->getCalledFunction(); - // We're going to make assumptions on the semantics of the functions, check - // that the target knows that it's available in this environment and it does - // not have local linkage. - if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func)) - return Intrinsic::not_intrinsic; - - // Otherwise check if we have a call to a function that can be turned into a - // vector intrinsic. 
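The checkUnaryFloatSignature/checkBinaryFloatSignature/getIntrinsicIDForCall helpers deleted here are evidently not lost: judging by the new "llvm/Transforms/Utils/VectorUtils.h" includes in this file and in SLPVectorizer.cpp, they move to a shared header so all three vectorizers use one libcall-to-intrinsic mapping. One subtlety worth noting is the onlyReadsMemory() guard in the removed code; an illustrative example, assuming the usual errno semantics of libm:

#include <cmath>

// When math errno is disabled (e.g. -fno-math-errno), sinf is a readonly
// call, passes the onlyReadsMemory() test, and maps to llvm.sin so loops
// over lane() can be vectorized; if sinf may write errno it stays an
// opaque libcall and is left scalar.
float lane(float X) { return sinf(X); }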
- switch (Func) { - default: - break; - case LibFunc::sin: - case LibFunc::sinf: - case LibFunc::sinl: - return checkUnaryFloatSignature(*CI, Intrinsic::sin); - case LibFunc::cos: - case LibFunc::cosf: - case LibFunc::cosl: - return checkUnaryFloatSignature(*CI, Intrinsic::cos); - case LibFunc::exp: - case LibFunc::expf: - case LibFunc::expl: - return checkUnaryFloatSignature(*CI, Intrinsic::exp); - case LibFunc::exp2: - case LibFunc::exp2f: - case LibFunc::exp2l: - return checkUnaryFloatSignature(*CI, Intrinsic::exp2); - case LibFunc::log: - case LibFunc::logf: - case LibFunc::logl: - return checkUnaryFloatSignature(*CI, Intrinsic::log); - case LibFunc::log10: - case LibFunc::log10f: - case LibFunc::log10l: - return checkUnaryFloatSignature(*CI, Intrinsic::log10); - case LibFunc::log2: - case LibFunc::log2f: - case LibFunc::log2l: - return checkUnaryFloatSignature(*CI, Intrinsic::log2); - case LibFunc::fabs: - case LibFunc::fabsf: - case LibFunc::fabsl: - return checkUnaryFloatSignature(*CI, Intrinsic::fabs); - case LibFunc::copysign: - case LibFunc::copysignf: - case LibFunc::copysignl: - return checkBinaryFloatSignature(*CI, Intrinsic::copysign); - case LibFunc::floor: - case LibFunc::floorf: - case LibFunc::floorl: - return checkUnaryFloatSignature(*CI, Intrinsic::floor); - case LibFunc::ceil: - case LibFunc::ceilf: - case LibFunc::ceill: - return checkUnaryFloatSignature(*CI, Intrinsic::ceil); - case LibFunc::trunc: - case LibFunc::truncf: - case LibFunc::truncl: - return checkUnaryFloatSignature(*CI, Intrinsic::trunc); - case LibFunc::rint: - case LibFunc::rintf: - case LibFunc::rintl: - return checkUnaryFloatSignature(*CI, Intrinsic::rint); - case LibFunc::nearbyint: - case LibFunc::nearbyintf: - case LibFunc::nearbyintl: - return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint); - case LibFunc::round: - case LibFunc::roundf: - case LibFunc::roundl: - return checkUnaryFloatSignature(*CI, Intrinsic::round); - case LibFunc::pow: - case LibFunc::powf: - case LibFunc::powl: - return checkBinaryFloatSignature(*CI, Intrinsic::pow); - } - - return Intrinsic::not_intrinsic; -} - /// This function translates the reduction kind to an LLVM binary operator. static unsigned getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { @@ -2651,7 +2597,7 @@ void InnerLoopVectorizer::vectorizeLoop() { assert(isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!"); Value *TmpVec = ReducedPartRdx; - SmallVector ShuffleMask(VF, 0); + SmallVector ShuffleMask(VF, nullptr); for (unsigned i = VF; i != 1; i >>= 1) { // Move the upper half of the vector to the lower half. for (unsigned j = 0; j != i/2; ++j) @@ -3049,7 +2995,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); for (unsigned Part = 0; Part < UF; ++Part) { - Value *C = 0; + Value *C = nullptr; if (FCmp) C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); else @@ -3275,15 +3221,6 @@ bool LoopVectorizationLegality::canVectorize() { return false; } - // Do not loop-vectorize loops with a tiny trip count. - BasicBlock *Latch = TheLoop->getLoopLatch(); - unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); - if (TC > 0u && TC < TinyTripCountVectorThreshold) { - DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << - "This loop is not worth vectorizing.\n"); - return false; - } - // Check if we can vectorize the instructions and CFG in this loop. 
if (!canVectorizeInstrs()) { DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); @@ -3536,14 +3473,14 @@ static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, ///\brief Look for a cast use of the passed value. static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) { - Value *UniqueCast = 0; + Value *UniqueCast = nullptr; for (User *U : Ptr->users()) { CastInst *CI = dyn_cast(U); if (CI && CI->getType() == Ty) { if (!UniqueCast) UniqueCast = CI; else - return 0; + return nullptr; } } return UniqueCast; @@ -3556,7 +3493,7 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, const DataLayout *DL, Loop *Lp) { const PointerType *PtrTy = dyn_cast(Ptr->getType()); if (!PtrTy || PtrTy->isAggregateType()) - return 0; + return nullptr; // Try to remove a gep instruction to make the pointer (actually index at this // point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the @@ -3576,11 +3513,11 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, const SCEVAddRecExpr *S = dyn_cast(V); if (!S) - return 0; + return nullptr; V = S->getStepRecurrence(*SE); if (!V) - return 0; + return nullptr; // Strip off the size of access multiplication if we are still analyzing the // pointer. @@ -3588,24 +3525,24 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, DL->getTypeAllocSize(PtrTy->getElementType()); if (const SCEVMulExpr *M = dyn_cast(V)) { if (M->getOperand(0)->getSCEVType() != scConstant) - return 0; + return nullptr; const APInt &APStepVal = cast(M->getOperand(0))->getValue()->getValue(); // Huge step value - give up. if (APStepVal.getBitWidth() > 64) - return 0; + return nullptr; int64_t StepVal = APStepVal.getSExtValue(); if (PtrAccessSize != StepVal) - return 0; + return nullptr; V = M->getOperand(1); } } // Strip off casts. - Type *StripedOffRecurrenceCast = 0; + Type *StripedOffRecurrenceCast = nullptr; if (const SCEVCastExpr *C = dyn_cast(V)) { StripedOffRecurrenceCast = C->getType(); V = C->getOperand(); @@ -3614,11 +3551,11 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, // Look for the loop invariant symbolic value. const SCEVUnknown *U = dyn_cast(V); if (!U) - return 0; + return nullptr; Value *Stride = U->getValue(); if (!Lp->isLoopInvariant(Stride)) - return 0; + return nullptr; // If we have stripped off the recurrence cast we have to make sure that we // return the value that is used in this loop so that we can replace it later. @@ -3629,7 +3566,7 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, } void LoopVectorizationLegality::collectStridedAcccess(Value *MemAccess) { - Value *Ptr = 0; + Value *Ptr = nullptr; if (LoadInst *LI = dyn_cast(MemAccess)) Ptr = LI->getPointerOperand(); else if (StoreInst *SI = dyn_cast(MemAccess)) @@ -4628,7 +4565,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // We only allow for a single reduction value to be used outside the loop. // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). - Instruction *ExitInstruction = 0; + Instruction *ExitInstruction = nullptr; // Indicates that we found a reduction operation in our scan. bool FoundReduxOp = false; @@ -4642,7 +4579,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // the number of instruction we saw from the recognized min/max pattern, // to make sure we only see exactly the two instructions. 
unsigned NumCmpSelectPatternInst = 0; - ReductionInstDesc ReduxDesc(false, 0); + ReductionInstDesc ReduxDesc(false, nullptr); SmallPtrSet VisitedInsts; SmallVector Worklist; @@ -4725,7 +4662,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // being used. In this case the user uses the value of the previous // iteration, in which case we would loose "VF-1" iterations of the // reduction operation if we vectorize. - if (ExitInstruction != 0 || Cur == Phi) + if (ExitInstruction != nullptr || Cur == Phi) return false; // The instruction used by an outside user must be the last instruction @@ -4741,7 +4678,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // Process instructions only once (termination). Each reduction cycle // value must only be used once, except by phi nodes and min/max // reductions which are represented as a cmp followed by a select. - ReductionInstDesc IgnoredVal(false, 0); + ReductionInstDesc IgnoredVal(false, nullptr); if (VisitedInsts.insert(UI)) { if (isa(UI)) PHIs.push_back(UI); @@ -4795,8 +4732,8 @@ LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, assert((isa(I) || isa(I) || isa(I)) && "Expect a select instruction"); - Instruction *Cmp = 0; - SelectInst *Select = 0; + Instruction *Cmp = nullptr; + SelectInst *Select = nullptr; // We must handle the select(cmp()) as a single instruction. Advance to the // select. @@ -4982,7 +4919,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, - unsigned UserVF) { + unsigned UserVF, + bool ForceVectorization) { // Width 1 means no vectorize VectorizationFactor Factor = { 1U, 0U }; if (OptForSize && Legal->getRuntimePointerCheck()->Need) { @@ -5052,8 +4990,18 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, } float Cost = expectedCost(1); +#ifndef NDEBUG + const float ScalarCost = Cost; +#endif /* NDEBUG */ unsigned Width = 1; - DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n"); + DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); + + // Ignore scalar width, because the user explicitly wants vectorization. + if (ForceVectorization && VF > 1) { + Width = 2; + Cost = expectedCost(Width) / (float)Width; + } + for (unsigned i=2; i <= VF; i*=2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of @@ -5067,7 +5015,10 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, } } - DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); + DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() + << "LV: Vectorization seems to be not beneficial, " + << "but was forced by a user.\n"); + DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n"); Factor.Width = Width; Factor.Cost = Width * Cost; return Factor; @@ -5516,7 +5467,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { Op2VK = TargetTransformInfo::OK_UniformConstantValue; else if (isa(Op2) || isa(Op2)) { Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; - if (cast(Op2)->getSplatValue() != NULL) + if (cast(Op2)->getSplatValue() != nullptr) Op2VK = TargetTransformInfo::OK_UniformConstantValue; } @@ -5730,17 +5681,17 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, // Does this instruction return a value ? 
bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Value *UndefVec = IsVoidRetTy ? 0 : + Value *UndefVec = IsVoidRetTy ? nullptr : UndefValue::get(Instr->getType()); // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); Instruction *InsertPt = Builder.GetInsertPoint(); BasicBlock *IfBlock = Builder.GetInsertBlock(); - BasicBlock *CondBlock = 0; + BasicBlock *CondBlock = nullptr; VectorParts Cond; - Loop *VectorLp = 0; + Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); @@ -5755,7 +5706,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, // For each scalar that we create: // Start an "if (pred) a[i] = ..." block. - Value *Cmp = 0; + Value *Cmp = nullptr; if (IfPredicateStore) { if (Cond[Part]->getType()->isVectorTy()) Cond[Part] = diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index ee32227..e13ba95 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15,9 +15,6 @@ // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. // //===----------------------------------------------------------------------===// -#define SV_NAME "slp-vectorizer" -#define DEBUG_TYPE "SLP" - #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" @@ -34,6 +31,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/NoFolder.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/Verifier.h" @@ -41,11 +39,15 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/VectorUtils.h" #include #include using namespace llvm; +#define SV_NAME "slp-vectorizer" +#define DEBUG_TYPE "SLP" + static cl::opt SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " @@ -72,8 +74,6 @@ struct BlockNumbering { BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {} - BlockNumbering() : BB(0), Valid(false) {} - void numberInstructions() { unsigned Loc = 0; InstrIdx.clear(); @@ -120,15 +120,15 @@ private: static BasicBlock *getSameBlock(ArrayRef VL) { Instruction *I0 = dyn_cast(VL[0]); if (!I0) - return 0; + return nullptr; BasicBlock *BB = I0->getParent(); for (int i = 1, e = VL.size(); i < e; i++) { Instruction *I = dyn_cast(VL[i]); if (!I) - return 0; + return nullptr; if (BB != I->getParent()) - return 0; + return nullptr; } return BB; } @@ -180,7 +180,7 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef VL) { switch (Kind) { default: - MD = 0; // Remove unknown metadata + MD = nullptr; // Remove unknown metadata break; case LLVMContext::MD_tbaa: MD = MDNode::getMostGenericTBAA(MD, IMD); @@ -201,7 +201,7 @@ static Type* getSameType(ArrayRef VL) { Type *Ty = VL[0]->getType(); for (int i = 1, e = VL.size(); i < e; i++) if (VL[i]->getType() != Ty) - return 0; + return nullptr; return Ty; } @@ -345,17 +345,10 @@ public: typedef SmallVector StoreList; BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl, - TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li, - DominatorTree *Dt) : - F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt), - Builder(Se->getContext()) { - // Setup the block numbering utility for all of the 
blocks in the - // function. - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { - BasicBlock *BB = it; - BlocksNumbers[BB] = BlockNumbering(BB); - } - } + TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, + LoopInfo *Li, DominatorTree *Dt) + : F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), + Builder(Se->getContext()) {} /// \brief Vectorize the tree that starts with the elements in \p VL. /// Returns the vectorized root. @@ -365,13 +358,13 @@ public: /// A negative number means that this is profitable. int getTreeCost(); - /// Construct a vectorizable tree that starts at \p Roots and is possibly - /// used by a reduction of \p RdxOps. - void buildTree(ArrayRef Roots, ValueSet *RdxOps = 0); + /// Construct a vectorizable tree that starts at \p Roots, ignoring users for + /// the purpose of scheduling and extraction in the \p UserIgnoreLst. + void buildTree(ArrayRef Roots, + ArrayRef UserIgnoreLst = None); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { - RdxOps = 0; VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); @@ -446,7 +439,7 @@ private: bool isFullyVectorizableTinyTree(); struct TreeEntry { - TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0), + TreeEntry() : Scalars(), VectorizedValue(nullptr), LastScalarIndex(0), NeedToGather(0) {} /// \returns true if the scalars in VL are equal to this entry. @@ -527,14 +520,22 @@ private: /// Numbers instructions in different blocks. DenseMap BlocksNumbers; - /// Reduction operators. - ValueSet *RdxOps; + /// \brief Get the corresponding instruction numbering list for a given + /// BasicBlock. The list is allocated lazily. + BlockNumbering &getBlockNumbering(BasicBlock *BB) { + auto I = BlocksNumbers.insert(std::make_pair(BB, BlockNumbering(BB))); + return I.first->second; + } + + /// List of users to ignore during scheduling and that don't need extracting. + ArrayRef UserIgnoreList; // Analysis and block reference. Function *F; ScalarEvolution *SE; const DataLayout *DL; TargetTransformInfo *TTI; + TargetLibraryInfo *TLI; AliasAnalysis *AA; LoopInfo *LI; DominatorTree *DT; @@ -542,9 +543,10 @@ private: IRBuilder<> Builder; }; -void BoUpSLP::buildTree(ArrayRef Roots, ValueSet *Rdx) { +void BoUpSLP::buildTree(ArrayRef Roots, + ArrayRef UserIgnoreLst) { deleteTree(); - RdxOps = Rdx; + UserIgnoreList = UserIgnoreLst; if (!getSameType(Roots)) return; buildTree_rec(Roots, 0); @@ -576,8 +578,9 @@ void BoUpSLP::buildTree(ArrayRef Roots, ValueSet *Rdx) { if (!UserInst) continue; - // Ignore uses that are part of the reduction. - if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end()) + // Ignore users in the user ignore list. + if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != + UserIgnoreList.end()) continue; DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << @@ -708,12 +711,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { continue; } - // This user is part of the reduction. - if (RdxOps && RdxOps->count(UI)) + // Ignore users in the user ignore list. + if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UI) != + UserIgnoreList.end()) continue; // Make sure that we can schedule this unknown user. 
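The old reduction-only RdxOps ValueSet becomes a generic ArrayRef<Value *> UserIgnoreList, so any caller can name users that need neither scheduling constraints nor extractelement fix-ups. An in-file sketch of the new call shape (BoUpSLP is local to SLPVectorizer.cpp; tryVectorizeWithIgnores is a hypothetical caller):

// A reduction caller passes its own operation list: those users consume
// the vectorized value internally, so nothing is extracted for them.
static bool tryVectorizeWithIgnores(BoUpSLP &R, ArrayRef<Value *> Roots,
                                    ArrayRef<Value *> IgnoredUsers) {
  R.buildTree(Roots, IgnoredUsers);
  int Cost = R.getTreeCost(); // per the comment above, negative = profitable
  if (Cost >= 0)
    return false;
  R.vectorizeTree();
  return true;
}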
- BlockNumbering &BN = BlocksNumbers[BB]; + BlockNumbering &BN = getBlockNumbering(BB); int UserIndex = BN.getIndex(UI); if (UserIndex < MyLastIndex) { @@ -948,32 +952,36 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } case Instruction::Call: { // Check if the calls are all to the same vectorizable intrinsic. - IntrinsicInst *II = dyn_cast(VL[0]); - if (II==NULL) { + CallInst *CI = cast(VL[0]); + // Check if this is an Intrinsic call or something that can be + // represented by an intrinsic call + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); + if (!isTriviallyVectorizable(ID)) { newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } - Function *Int = II->getCalledFunction(); + Function *Int = CI->getCalledFunction(); for (unsigned i = 1, e = VL.size(); i != e; ++i) { - IntrinsicInst *II2 = dyn_cast(VL[i]); - if (!II2 || II2->getCalledFunction() != Int) { + CallInst *CI2 = dyn_cast(VL[i]); + if (!CI2 || CI2->getCalledFunction() != Int || + getIntrinsicIDForCall(CI2, TLI) != ID) { newTreeEntry(VL, false); - DEBUG(dbgs() << "SLP: mismatched calls:" << *II << "!=" << *VL[i] + DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; } } newTreeEntry(VL, true); - for (unsigned i = 0, e = II->getNumArgOperands(); i != e; ++i) { + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. for (unsigned j = 0; j < VL.size(); ++j) { - IntrinsicInst *II2 = dyn_cast(VL[j]); - Operands.push_back(II2->getArgOperand(i)); + CallInst *CI2 = dyn_cast(VL[j]); + Operands.push_back(CI2->getArgOperand(i)); } buildTree_rec(Operands, Depth + 1); } @@ -1090,7 +1098,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. - ConstantInt *CInt = NULL; + ConstantInt *CInt = nullptr; for (unsigned i = 0; i < VL.size(); ++i) { const Instruction *I = cast(VL[i]); if (!isa(I->getOperand(1))) { @@ -1129,12 +1137,11 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } case Instruction::Call: { CallInst *CI = cast(VL0); - IntrinsicInst *II = cast(CI); - Intrinsic::ID ID = II->getIntrinsicID(); + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. 
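The call-cost hunk resumes below. As a toy model of the comparison it sets up — all the numbers here are invented; only the sign convention is taken from the code (getTreeCost() documents that a negative number means profitable):

#include <cstdio>

int main() {
  int VF = 4;             // lanes in the bundle
  int OneScalarCall = 10; // invented cost of one scalar call
  int VecCallCost = 12;   // invented cost of the single vector call
  int ScalarCallCost = VF * OneScalarCall; // the VF calls being replaced
  int Delta = VecCallCost - ScalarCallCost;
  // Negative means vectorizing wins, the same convention as the
  // "SLP: Call cost" debug output in the hunk below.
  std::printf("call cost %d (%d-%d)\n", Delta, VecCallCost, ScalarCallCost);
}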
SmallVector ScalarTys, VecTys; - for (unsigned op = 0, opc = II->getNumArgOperands(); op!= opc; ++op) { + for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) { ScalarTys.push_back(CI->getArgOperand(op)->getType()); VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(), VecTy->getNumElements())); @@ -1147,7 +1154,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost << " (" << VecCallCost << "-" << ScalarCallCost << ")" - << " for " << *II << "\n"); + << " for " << *CI << "\n"); return VecCallCost - ScalarCallCost; } @@ -1244,7 +1251,7 @@ Value *BoUpSLP::getPointerOperand(Value *I) { return LI->getPointerOperand(); if (StoreInst *SI = dyn_cast(I)) return SI->getPointerOperand(); - return 0; + return nullptr; } unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { @@ -1318,13 +1325,13 @@ Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { if (!A.Ptr || !B.Ptr || AA->alias(A, B)) return I; } - return 0; + return nullptr; } int BoUpSLP::getLastIndex(ArrayRef VL) { BasicBlock *BB = cast(VL[0])->getParent(); - assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); - BlockNumbering &BN = BlocksNumbers[BB]; + assert(BB == getSameBlock(VL) && "Invalid block"); + BlockNumbering &BN = getBlockNumbering(BB); int MaxIdx = BN.getIndex(BB->getFirstNonPHI()); for (unsigned i = 0, e = VL.size(); i < e; ++i) @@ -1334,8 +1341,8 @@ int BoUpSLP::getLastIndex(ArrayRef VL) { Instruction *BoUpSLP::getLastInstruction(ArrayRef VL) { BasicBlock *BB = cast(VL[0])->getParent(); - assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); - BlockNumbering &BN = BlocksNumbers[BB]; + assert(BB == getSameBlock(VL) && "Invalid block"); + BlockNumbering &BN = getBlockNumbering(BB); int MaxIdx = BN.getIndex(cast(VL[0])); for (unsigned i = 1, e = VL.size(); i < e; ++i) @@ -1394,7 +1401,7 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef VL) const { if (En->isSame(VL) && En->VectorizedValue) return En->VectorizedValue; } - return 0; + return nullptr; } Value *BoUpSLP::vectorizeTree(ArrayRef VL) { @@ -1615,6 +1622,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { VecTy->getPointerTo(AS)); unsigned Alignment = LI->getAlignment(); LI = Builder.CreateLoad(VecPtr); + if (!Alignment) + Alignment = DL->getABITypeAlignment(LI->getPointerOperand()->getType()); LI->setAlignment(Alignment); E->VectorizedValue = LI; return propagateMetadata(LI, E->Scalars); @@ -1634,13 +1643,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo(AS)); StoreInst *S = Builder.CreateStore(VecValue, VecPtr); + if (!Alignment) + Alignment = DL->getABITypeAlignment(SI->getPointerOperand()->getType()); S->setAlignment(Alignment); E->VectorizedValue = S; return propagateMetadata(S, E->Scalars); } case Instruction::Call: { CallInst *CI = cast(VL0); - setInsertPointAfterBundle(E->Scalars); std::vector OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { @@ -1656,8 +1666,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Module *M = F->getParent(); - IntrinsicInst *II = cast(CI); - Intrinsic::ID ID = II->getIntrinsicID(); + Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; Function *CF = Intrinsic::getDeclaration(M, ID, Tys); Value *V = Builder.CreateCall(CF, OpVecs); @@ -1667,7 +1676,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { default: 
llvm_unreachable("unknown inst"); } - return 0; + return nullptr; } Value *BoUpSLP::vectorizeTree() { @@ -1746,8 +1755,9 @@ Value *BoUpSLP::vectorizeTree() { DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); assert((ScalarToTreeEntry.count(U) || - // It is legal to replace the reduction users by undef. - (RdxOps && RdxOps->count(U))) && + // It is legal to replace users in the ignorelist by undef. + (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != + UserIgnoreList.end())) && "Replacing out-of-tree value with undef"); } #endif @@ -1759,9 +1769,9 @@ Value *BoUpSLP::vectorizeTree() { } } - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { - BlocksNumbers[it].forget(); - } + for (auto &BN : BlocksNumbers) + BN.second.forget(); + Builder.ClearInsertionPoint(); return VectorizableTree[0].VectorizedValue; @@ -1802,11 +1812,19 @@ void BoUpSLP::optimizeGatherSequence() { Insert->moveBefore(PreHeader->getTerminator()); } + // Make a list of all reachable blocks in our CSE queue. + SmallVector CSEWorkList; + CSEWorkList.reserve(CSEBlocks.size()); + for (BasicBlock *BB : CSEBlocks) + if (DomTreeNode *N = DT->getNode(BB)) { + assert(DT->isReachableFromEntry(N)); + CSEWorkList.push_back(N); + } + // Sort blocks by domination. This ensures we visit a block after all blocks // dominating it are visited. - SmallVector CSEWorkList(CSEBlocks.begin(), CSEBlocks.end()); std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), - [this](const BasicBlock *A, const BasicBlock *B) { + [this](const DomTreeNode *A, const DomTreeNode *B) { return DT->properlyDominates(A, B); }); @@ -1814,12 +1832,10 @@ void BoUpSLP::optimizeGatherSequence() { // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallVector Visited; - for (SmallVectorImpl::iterator I = CSEWorkList.begin(), - E = CSEWorkList.end(); - I != E; ++I) { + for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && "Worklist not sorted properly!"); - BasicBlock *BB = *I; + BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = it++; @@ -1835,7 +1851,7 @@ void BoUpSLP::optimizeGatherSequence() { DT->dominates((*v)->getParent(), In->getParent())) { In->replaceAllUsesWith(*v); In->eraseFromParent(); - In = 0; + In = nullptr; break; } } @@ -1864,6 +1880,7 @@ struct SLPVectorizer : public FunctionPass { ScalarEvolution *SE; const DataLayout *DL; TargetTransformInfo *TTI; + TargetLibraryInfo *TLI; AliasAnalysis *AA; LoopInfo *LI; DominatorTree *DT; @@ -1874,8 +1891,9 @@ struct SLPVectorizer : public FunctionPass { SE = &getAnalysis(); DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : 0; + DL = DLP ? &DLP->getDataLayout() : nullptr; TTI = &getAnalysis(); + TLI = getAnalysisIfAvailable(); AA = &getAnalysis(); LI = &getAnalysis(); DT = &getAnalysis().getDomTree(); @@ -1900,8 +1918,8 @@ struct SLPVectorizer : public FunctionPass { DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); // Use the bottom up slp vectorizer to construct chains that start with - // he store instructions. - BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT); + // store instructions. + BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT); // Scan the blocks in the function in post order. 
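The loop that follows walks the function with po_iterator, so a block is visited only after its successors (back edges aside). A self-contained sketch of the same post-order discipline, using plain adjacency lists instead of LLVM's CFG types:

#include <cstdio>
#include <vector>

static void postOrder(int BB, const std::vector<std::vector<int>> &Succs,
                      std::vector<bool> &Seen, std::vector<int> &Out) {
  Seen[BB] = true;
  for (int S : Succs[BB])
    if (!Seen[S])
      postOrder(S, Succs, Seen, Out);
  Out.push_back(BB); // emit after all successors
}

int main() {
  // entry(0) -> {1, 2}, 1 -> {3}, 2 -> {3}, 3 -> {}
  std::vector<std::vector<int>> Succs = {{1, 2}, {3}, {3}, {}};
  std::vector<bool> Seen(Succs.size(), false);
  std::vector<int> Out;
  postOrder(0, Succs, Seen, Out);
  for (int BB : Out)
    std::printf("%d ", BB); // prints: 3 1 2 0
  std::puts("");
}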
   for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
@@ -1951,8 +1969,11 @@ private:
   bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
 
   /// \brief Try to vectorize a list of operands.
+  /// \param BuildVector A list of users to ignore for the purpose of
+  ///                    scheduling and that don't need extracting.
   /// \returns true if a value was vectorized.
-  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R);
+  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+                          ArrayRef<Value *> BuildVector = None);
 
   /// \brief Try to vectorize a chain that may start at the operands of \V;
   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
@@ -2106,7 +2127,7 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
     // Check that the pointer points to scalars.
     Type *Ty = SI->getValueOperand()->getType();
     if (Ty->isAggregateType() || Ty->isVectorTy())
-      return 0;
+      continue;
 
     // Find the base pointer.
     Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
@@ -2125,7 +2146,8 @@ bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
   return tryToVectorizeList(VL, R);
 }
 
-bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
+bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+                                       ArrayRef<Value *> BuildVector) {
   if (VL.size() < 2)
     return false;
 
@@ -2153,7 +2175,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
 
   bool Changed = false;
 
-  // Keep track of values that were delete by vectorizing in the loop below.
+  // Keep track of values that were deleted by vectorizing in the loop below.
   SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
 
   for (unsigned i = 0, e = VL.size(); i < e; ++i) {
@@ -2175,13 +2197,38 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
                  << "\n");
     ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
 
-    R.buildTree(Ops);
+    ArrayRef<Value *> BuildVectorSlice;
+    if (!BuildVector.empty())
+      BuildVectorSlice = BuildVector.slice(i, OpsWidth);
+
+    R.buildTree(Ops, BuildVectorSlice);
     int Cost = R.getTreeCost();
 
     if (Cost < -SLPCostThreshold) {
       DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
-      R.vectorizeTree();
-
+      Value *VectorizedRoot = R.vectorizeTree();
+
+      // Reconstruct the build vector by extracting the vectorized root. This
+      // way we handle the case where some elements of the vector are undefined.
+      // (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
+      if (!BuildVectorSlice.empty()) {
+        // The insert point is the last build vector instruction. The vectorized
+        // root will precede it. This guarantees that we get an instruction. The
+        // vectorized tree could have been constant folded.
+        Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
+        unsigned VecIdx = 0;
+        for (auto &V : BuildVectorSlice) {
+          IRBuilder<true, NoFolder> Builder(
+              ++BasicBlock::iterator(InsertAfter));
+          InsertElementInst *IE = cast<InsertElementInst>(V);
+          Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
+              VectorizedRoot, Builder.getInt32(VecIdx++)));
+          IE->setOperand(1, Extract);
+          IE->removeFromParent();
+          IE->insertAfter(Extract);
+          InsertAfter = IE;
+        }
+      }
       // Move to the next bundle.
i += VF - 1; Changed = true; @@ -2290,7 +2337,7 @@ static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, /// *p = /// class HorizontalReduction { - SmallPtrSet ReductionOps; + SmallVector ReductionOps; SmallVector ReducedVals; BinaryOperator *ReductionRoot; @@ -2308,7 +2355,7 @@ class HorizontalReduction { public: HorizontalReduction() - : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0), + : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0), ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {} /// \brief Try to find a reduction tree. @@ -2323,10 +2370,10 @@ public: // In such a case start looking for a tree rooted in the first '+'. if (Phi) { if (B->getOperand(0) == Phi) { - Phi = 0; + Phi = nullptr; B = dyn_cast(B->getOperand(1)); } else if (B->getOperand(1) == Phi) { - Phi = 0; + Phi = nullptr; B = dyn_cast(B->getOperand(0)); } } @@ -2384,7 +2431,7 @@ public: // We need to be able to reassociate the adds. if (!TreeN->isAssociative()) return false; - ReductionOps.insert(TreeN); + ReductionOps.push_back(TreeN); } // Retract. Stack.pop_back(); @@ -2412,7 +2459,7 @@ public: if (NumReducedVals < ReduxWidth) return false; - Value *VectorizedTree = 0; + Value *VectorizedTree = nullptr; IRBuilder<> Builder(ReductionRoot); FastMathFlags Unsafe; Unsafe.setUnsafeAlgebra(); @@ -2421,7 +2468,7 @@ public: for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { ArrayRef ValsToReduce(&ReducedVals[i], ReduxWidth); - V.buildTree(ValsToReduce, &ReductionOps); + V.buildTree(ValsToReduce, ReductionOps); // Estimate cost. int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]); @@ -2455,13 +2502,13 @@ public: } // Update users. if (ReductionPHI) { - assert(ReductionRoot != NULL && "Need a reduction operation"); + assert(ReductionRoot && "Need a reduction operation"); ReductionRoot->setOperand(0, VectorizedTree); ReductionRoot->setOperand(1, ReductionPHI); } else ReductionRoot->replaceAllUsesWith(VectorizedTree); } - return VectorizedTree != 0; + return VectorizedTree != nullptr; } private: @@ -2540,13 +2587,16 @@ private: /// /// Returns true if it matches /// -static bool findBuildVector(InsertElementInst *IE, - SmallVectorImpl &Ops) { - if (!isa(IE->getOperand(0))) +static bool findBuildVector(InsertElementInst *FirstInsertElem, + SmallVectorImpl &BuildVector, + SmallVectorImpl &BuildVectorOpds) { + if (!isa(FirstInsertElem->getOperand(0))) return false; + InsertElementInst *IE = FirstInsertElem; while (true) { - Ops.push_back(IE->getOperand(1)); + BuildVector.push_back(IE); + BuildVectorOpds.push_back(IE->getOperand(1)); if (IE->use_empty()) return false; @@ -2641,7 +2691,8 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { Value *Rdx = (P->getIncomingBlock(0) == BB ? (P->getIncomingValue(0)) - : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0)); + : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) + : nullptr)); // Check if this is a Binary Operator. 
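Before the reduction-matching code continues below, a stripped-down model of what HorizontalReduction gathers from an associative '+' tree. Expr is a made-up stand-in for LLVM instructions; the real matchAssociativeReduction() additionally checks opcodes, associativity, and the surrounding loop structure:

#include <cstdio>
#include <vector>

struct Expr {
  bool IsAdd;
  int Leaf;    // valid when !IsAdd
  Expr *L, *R; // valid when IsAdd
};

// Interior '+' nodes become ReductionOps (replaceable by the shuffled
// reduction); the leaves become ReducedVals, the bundle handed to buildTree.
static void collect(Expr *E, std::vector<Expr *> &ReductionOps,
                    std::vector<int> &ReducedVals) {
  if (!E->IsAdd) {
    ReducedVals.push_back(E->Leaf);
    return;
  }
  ReductionOps.push_back(E);
  collect(E->L, ReductionOps, ReducedVals);
  collect(E->R, ReductionOps, ReducedVals);
}

int main() {
  // ((a + b) + (c + d)) with leaves 1..4
  Expr A{false, 1, nullptr, nullptr}, B{false, 2, nullptr, nullptr};
  Expr C{false, 3, nullptr, nullptr}, D{false, 4, nullptr, nullptr};
  Expr AB{true, 0, &A, &B}, CD{true, 0, &C, &D}, Root{true, 0, &AB, &CD};
  std::vector<Expr *> Ops;
  std::vector<int> Vals;
  collect(&Root, Ops, Vals);
  std::printf("%zu reduction ops, %zu reduced values\n", Ops.size(),
              Vals.size()); // 3 reduction ops, 4 reduced values
}

This also shows why the patch can replace the ValueSet of reduction operators with a plain list: the collected ops are exactly the users that scheduling should ignore.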
BinaryOperator *BI = dyn_cast_or_null(Rdx); if (!BI) @@ -2680,7 +2731,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (BinaryOperator *BinOp = dyn_cast(SI->getValueOperand())) { HorizontalReduction HorRdx; - if (((HorRdx.matchAssociativeReduction(0, BinOp, DL) && + if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) && HorRdx.tryToReduce(R, TTI)) || tryToVectorize(BinOp, R))) { Changed = true; @@ -2716,12 +2767,16 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } // Try to vectorize trees that start at insertelement instructions. - if (InsertElementInst *IE = dyn_cast(it)) { - SmallVector Ops; - if (!findBuildVector(IE, Ops)) + if (InsertElementInst *FirstInsertElem = dyn_cast(it)) { + SmallVector BuildVector; + SmallVector BuildVectorOpds; + if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds)) continue; - if (tryToVectorizeList(Ops, R)) { + // Vectorize starting with the build vector operands ignoring the + // BuildVector instructions for the purpose of scheduling and user + // extraction. + if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) { Changed = true; it = BB->begin(); e = BB->end(); diff --git a/shared_llvm.mk b/shared_llvm.mk index caefb64..ff62966 100644 --- a/shared_llvm.mk +++ b/shared_llvm.mk @@ -63,14 +63,15 @@ llvm_post_static_libraries := \ libLLVMAsmParser \ libLLVMOption \ libLLVMSupport \ - libLLVMVectorize + libLLVMVectorize \ llvm_host_static_libraries := \ libLLVMExecutionEngine \ libLLVMMCDisassembler \ libLLVMRuntimeDyld \ libLLVMJIT \ - libLLVMMCJIT + libLLVMMCJIT \ + libLLVMProfileData ifeq (true,$(FORCE_BUILD_LLVM_COMPONENTS)) # HOST LLVM shared library build @@ -135,4 +136,3 @@ include $(BUILD_SHARED_LIBRARY) endif endif # don't build in unbundled branches - diff --git a/test/Analysis/BlockFrequencyInfo/bad_input.ll b/test/Analysis/BlockFrequencyInfo/bad_input.ll new file mode 100644 index 0000000..bcdc1e6 --- /dev/null +++ b/test/Analysis/BlockFrequencyInfo/bad_input.ll @@ -0,0 +1,50 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s + +declare void @g(i32 %x) + +; CHECK-LABEL: Printing analysis {{.*}} for function 'branch_weight_0': +; CHECK-NEXT: block-frequency-info: branch_weight_0 +define void @branch_weight_0(i32 %a) { +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +entry: + br label %for.body + +; Check that we get 1,4 instead of 0,3. +; CHECK-NEXT: for.body: float = 4.0, +for.body: + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + call void @g(i32 %i) + %inc = add i32 %i, 1 + %cmp = icmp ugt i32 %inc, %a + br i1 %cmp, label %for.end, label %for.body, !prof !0 + +; CHECK-NEXT: for.end: float = 1.0, int = [[ENTRY]] +for.end: + ret void +} + +!0 = metadata !{metadata !"branch_weights", i32 0, i32 3} + +; CHECK-LABEL: Printing analysis {{.*}} for function 'infinite_loop' +; CHECK-NEXT: block-frequency-info: infinite_loop +define void @infinite_loop(i1 %x) { +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +entry: + br i1 %x, label %for.body, label %for.end, !prof !1 + +; Check that the loop scale maxes out at 4096, giving 2048 here. +; CHECK-NEXT: for.body: float = 2048.0, +for.body: + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + call void @g(i32 %i) + %inc = add i32 %i, 1 + br label %for.body + +; Check that the exit weight is half of entry, since half is lost in the +; infinite loop above. 
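A quick arithmetic check of the two CHECK values in infinite_loop, using only the facts stated in the test's comments (the 1:1 branch weights in !1 and the 4096 loop-scale cap); a sketch, not -block-freq's actual fixed-point code:

#include <cstdio>

int main() {
  double enterLoop = 1.0 / (1 + 1); // !1 = branch_weights 1:1 out of entry
  double loopScaleCap = 4096.0;     // the documented 2^12 ceiling
  std::printf("for.body = %.1f\n", enterLoop * loopScaleCap); // 2048.0
  std::printf("for.end  = %.1f\n", 1.0 - enterLoop);          // 0.5
}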
+; CHECK-NEXT: for.end: float = 0.5, +for.end: + ret void +} + +!1 = metadata !{metadata !"branch_weights", i32 1, i32 1} diff --git a/test/Analysis/BlockFrequencyInfo/basic.ll b/test/Analysis/BlockFrequencyInfo/basic.ll index ce29fb5..006e6ab 100644 --- a/test/Analysis/BlockFrequencyInfo/basic.ll +++ b/test/Analysis/BlockFrequencyInfo/basic.ll @@ -1,13 +1,14 @@ ; RUN: opt < %s -analyze -block-freq | FileCheck %s define i32 @test1(i32 %i, i32* %a) { -; CHECK: Printing analysis {{.*}} for function 'test1' -; CHECK: entry = 1.0 +; CHECK-LABEL: Printing analysis {{.*}} for function 'test1': +; CHECK-NEXT: block-frequency-info: test1 +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] entry: br label %body ; Loop backedges are weighted and thus their bodies have a greater frequency. -; CHECK: body = 32.0 +; CHECK-NEXT: body: float = 32.0, body: %iv = phi i32 [ 0, %entry ], [ %next, %body ] %base = phi i32 [ 0, %entry ], [ %sum, %body ] @@ -18,29 +19,29 @@ body: %exitcond = icmp eq i32 %next, %i br i1 %exitcond, label %exit, label %body -; CHECK: exit = 1.0 +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] exit: ret i32 %sum } define i32 @test2(i32 %i, i32 %a, i32 %b) { -; CHECK: Printing analysis {{.*}} for function 'test2' -; CHECK: entry = 1.0 +; CHECK-LABEL: Printing analysis {{.*}} for function 'test2': +; CHECK-NEXT: block-frequency-info: test2 +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] entry: %cond = icmp ult i32 %i, 42 br i1 %cond, label %then, label %else, !prof !0 ; The 'then' branch is predicted more likely via branch weight metadata. -; CHECK: then = 0.94116 +; CHECK-NEXT: then: float = 0.9411{{[0-9]*}}, then: br label %exit -; CHECK: else = 0.05877 +; CHECK-NEXT: else: float = 0.05882{{[0-9]*}}, else: br label %exit -; FIXME: It may be a bug that we don't sum back to 1.0. -; CHECK: exit = 0.99993 +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] exit: %result = phi i32 [ %a, %then ], [ %b, %else ] ret i32 %result @@ -49,37 +50,37 @@ exit: !0 = metadata !{metadata !"branch_weights", i32 64, i32 4} define i32 @test3(i32 %i, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { -; CHECK: Printing analysis {{.*}} for function 'test3' -; CHECK: entry = 1.0 +; CHECK-LABEL: Printing analysis {{.*}} for function 'test3': +; CHECK-NEXT: block-frequency-info: test3 +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] entry: switch i32 %i, label %case_a [ i32 1, label %case_b i32 2, label %case_c i32 3, label %case_d i32 4, label %case_e ], !prof !1 -; CHECK: case_a = 0.04998 +; CHECK-NEXT: case_a: float = 0.05, case_a: br label %exit -; CHECK: case_b = 0.04998 +; CHECK-NEXT: case_b: float = 0.05, case_b: br label %exit ; The 'case_c' branch is predicted more likely via branch weight metadata. -; CHECK: case_c = 0.79998 +; CHECK-NEXT: case_c: float = 0.8, case_c: br label %exit -; CHECK: case_d = 0.04998 +; CHECK-NEXT: case_d: float = 0.05, case_d: br label %exit -; CHECK: case_e = 0.04998 +; CHECK-NEXT: case_e: float = 0.05, case_e: br label %exit -; FIXME: It may be a bug that we don't sum back to 1.0. -; CHECK: exit = 0.99993 +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] exit: %result = phi i32 [ %a, %case_a ], [ %b, %case_b ], @@ -91,44 +92,50 @@ exit: !1 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 64, i32 4, i32 4} -; CHECK: Printing analysis {{.*}} for function 'nested_loops' -; CHECK: entry = 1.0 -; This test doesn't seem to be assigning sensible frequencies to nested loops. 
define void @nested_loops(i32 %a) { +; CHECK-LABEL: Printing analysis {{.*}} for function 'nested_loops': +; CHECK-NEXT: block-frequency-info: nested_loops +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] entry: br label %for.cond1.preheader +; CHECK-NEXT: for.cond1.preheader: float = 4001.0, for.cond1.preheader: %x.024 = phi i32 [ 0, %entry ], [ %inc12, %for.inc11 ] br label %for.cond4.preheader +; CHECK-NEXT: for.cond4.preheader: float = 16008001.0, for.cond4.preheader: %y.023 = phi i32 [ 0, %for.cond1.preheader ], [ %inc9, %for.inc8 ] %add = add i32 %y.023, %x.024 br label %for.body6 +; CHECK-NEXT: for.body6: float = 64048012001.0, for.body6: %z.022 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ] %add7 = add i32 %add, %z.022 - tail call void @g(i32 %add7) #2 + tail call void @g(i32 %add7) %inc = add i32 %z.022, 1 %cmp5 = icmp ugt i32 %inc, %a br i1 %cmp5, label %for.inc8, label %for.body6, !prof !2 +; CHECK-NEXT: for.inc8: float = 16008001.0, for.inc8: %inc9 = add i32 %y.023, 1 %cmp2 = icmp ugt i32 %inc9, %a br i1 %cmp2, label %for.inc11, label %for.cond4.preheader, !prof !2 +; CHECK-NEXT: for.inc11: float = 4001.0, for.inc11: %inc12 = add i32 %x.024, 1 %cmp = icmp ugt i32 %inc12, %a br i1 %cmp, label %for.end13, label %for.cond1.preheader, !prof !2 +; CHECK-NEXT: for.end13: float = 1.0, int = [[ENTRY]] for.end13: ret void } -declare void @g(i32) #1 +declare void @g(i32) !2 = metadata !{metadata !"branch_weights", i32 1, i32 4000} diff --git a/test/Analysis/BlockFrequencyInfo/double_backedge.ll b/test/Analysis/BlockFrequencyInfo/double_backedge.ll new file mode 100644 index 0000000..df8217c --- /dev/null +++ b/test/Analysis/BlockFrequencyInfo/double_backedge.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s + +define void @double_backedge(i1 %x) { +; CHECK-LABEL: Printing analysis {{.*}} for function 'double_backedge': +; CHECK-NEXT: block-frequency-info: double_backedge +entry: +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] + br label %loop + +loop: +; CHECK-NEXT: loop: float = 10.0, + br i1 %x, label %exit, label %loop.1, !prof !0 + +loop.1: +; CHECK-NEXT: loop.1: float = 9.0, + br i1 %x, label %loop, label %loop.2, !prof !1 + +loop.2: +; CHECK-NEXT: loop.2: float = 5.0, + br label %loop + +exit: +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] + ret void +} +!0 = metadata !{metadata !"branch_weights", i32 1, i32 9} +!1 = metadata !{metadata !"branch_weights", i32 4, i32 5} diff --git a/test/Analysis/BlockFrequencyInfo/double_exit.ll b/test/Analysis/BlockFrequencyInfo/double_exit.ll new file mode 100644 index 0000000..75f664d --- /dev/null +++ b/test/Analysis/BlockFrequencyInfo/double_exit.ll @@ -0,0 +1,165 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s + +; CHECK-LABEL: Printing analysis {{.*}} for function 'double_exit': +; CHECK-NEXT: block-frequency-info: double_exit +define i32 @double_exit(i32 %N) { +; Mass = 1 +; Frequency = 1 +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +entry: + br label %outer + +; Mass = 1 +; Backedge mass = 1/3, exit mass = 2/3 +; Loop scale = 3/2 +; Pseudo-edges = exit +; Pseudo-mass = 1 +; Frequency = 1*3/2*1 = 3/2 +; CHECK-NEXT: outer: float = 1.5, +outer: + %I.0 = phi i32 [ 0, %entry ], [ %inc6, %outer.inc ] + %Return.0 = phi i32 [ 0, %entry ], [ %Return.1, %outer.inc ] + %cmp = icmp slt i32 %I.0, %N + br i1 %cmp, label %inner, label %exit, !prof !2 ; 2:1 + +; Mass = 1 +; Backedge mass = 3/5, exit mass = 2/5 +; Loop scale = 5/2 +; Pseudo-edges = outer.inc @ 1/5, exit @ 1/5 +; 
Pseudo-mass = 2/3 +; Frequency = 3/2*1*5/2*2/3 = 5/2 +; CHECK-NEXT: inner: float = 2.5, +inner: + %Return.1 = phi i32 [ %Return.0, %outer ], [ %call4, %inner.inc ] + %J.0 = phi i32 [ %I.0, %outer ], [ %inc, %inner.inc ] + %cmp2 = icmp slt i32 %J.0, %N + br i1 %cmp2, label %inner.body, label %outer.inc, !prof !1 ; 4:1 + +; Mass = 4/5 +; Frequency = 5/2*4/5 = 2 +; CHECK-NEXT: inner.body: float = 2.0, +inner.body: + %call = call i32 @c2(i32 %I.0, i32 %J.0) + %tobool = icmp ne i32 %call, 0 + br i1 %tobool, label %exit, label %inner.inc, !prof !0 ; 3:1 + +; Mass = 3/5 +; Frequency = 5/2*3/5 = 3/2 +; CHECK-NEXT: inner.inc: float = 1.5, +inner.inc: + %call4 = call i32 @logic2(i32 %Return.1, i32 %I.0, i32 %J.0) + %inc = add nsw i32 %J.0, 1 + br label %inner + +; Mass = 1/3 +; Frequency = 3/2*1/3 = 1/2 +; CHECK-NEXT: outer.inc: float = 0.5, +outer.inc: + %inc6 = add nsw i32 %I.0, 1 + br label %outer + +; Mass = 1 +; Frequency = 1 +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] +exit: + %Return.2 = phi i32 [ %Return.1, %inner.body ], [ %Return.0, %outer ] + ret i32 %Return.2 +} + +!0 = metadata !{metadata !"branch_weights", i32 1, i32 3} +!1 = metadata !{metadata !"branch_weights", i32 4, i32 1} +!2 = metadata !{metadata !"branch_weights", i32 2, i32 1} + +declare i32 @c2(i32, i32) +declare i32 @logic2(i32, i32, i32) + +; CHECK-LABEL: Printing analysis {{.*}} for function 'double_exit_in_loop': +; CHECK-NEXT: block-frequency-info: double_exit_in_loop +define i32 @double_exit_in_loop(i32 %N) { +; Mass = 1 +; Frequency = 1 +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +entry: + br label %outer + +; Mass = 1 +; Backedge mass = 1/2, exit mass = 1/2 +; Loop scale = 2 +; Pseudo-edges = exit +; Pseudo-mass = 1 +; Frequency = 1*2*1 = 2 +; CHECK-NEXT: outer: float = 2.0, +outer: + %I.0 = phi i32 [ 0, %entry ], [ %inc12, %outer.inc ] + %Return.0 = phi i32 [ 0, %entry ], [ %Return.3, %outer.inc ] + %cmp = icmp slt i32 %I.0, %N + br i1 %cmp, label %middle, label %exit, !prof !3 ; 1:1 + +; Mass = 1 +; Backedge mass = 1/3, exit mass = 2/3 +; Loop scale = 3/2 +; Pseudo-edges = outer.inc +; Pseudo-mass = 1/2 +; Frequency = 2*1*3/2*1/2 = 3/2 +; CHECK-NEXT: middle: float = 1.5, +middle: + %J.0 = phi i32 [ %I.0, %outer ], [ %inc9, %middle.inc ] + %Return.1 = phi i32 [ %Return.0, %outer ], [ %Return.2, %middle.inc ] + %cmp2 = icmp slt i32 %J.0, %N + br i1 %cmp2, label %inner, label %outer.inc, !prof !2 ; 2:1 + +; Mass = 1 +; Backedge mass = 3/5, exit mass = 2/5 +; Loop scale = 5/2 +; Pseudo-edges = middle.inc @ 1/5, outer.inc @ 1/5 +; Pseudo-mass = 2/3 +; Frequency = 3/2*1*5/2*2/3 = 5/2 +; CHECK-NEXT: inner: float = 2.5, +inner: + %Return.2 = phi i32 [ %Return.1, %middle ], [ %call7, %inner.inc ] + %K.0 = phi i32 [ %J.0, %middle ], [ %inc, %inner.inc ] + %cmp5 = icmp slt i32 %K.0, %N + br i1 %cmp5, label %inner.body, label %middle.inc, !prof !1 ; 4:1 + +; Mass = 4/5 +; Frequency = 5/2*4/5 = 2 +; CHECK-NEXT: inner.body: float = 2.0, +inner.body: + %call = call i32 @c3(i32 %I.0, i32 %J.0, i32 %K.0) + %tobool = icmp ne i32 %call, 0 + br i1 %tobool, label %outer.inc, label %inner.inc, !prof !0 ; 3:1 + +; Mass = 3/5 +; Frequency = 5/2*3/5 = 3/2 +; CHECK-NEXT: inner.inc: float = 1.5, +inner.inc: + %call7 = call i32 @logic3(i32 %Return.2, i32 %I.0, i32 %J.0, i32 %K.0) + %inc = add nsw i32 %K.0, 1 + br label %inner + +; Mass = 1/3 +; Frequency = 3/2*1/3 = 1/2 +; CHECK-NEXT: middle.inc: float = 0.5, +middle.inc: + %inc9 = add nsw i32 %J.0, 1 + br label %middle + +; Mass = 1/2 +; Frequency = 2*1/2 = 1 +; 
CHECK-NEXT: outer.inc: float = 1.0,
+outer.inc:
+  %Return.3 = phi i32 [ %Return.2, %inner.body ], [ %Return.1, %middle ]
+  %inc12 = add nsw i32 %I.0, 1
+  br label %outer
+
+; Mass = 1
+; Frequency = 1
+; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
+exit:
+  ret i32 %Return.0
+}
+
+!3 = metadata !{metadata !"branch_weights", i32 1, i32 1}
+
+declare i32 @c3(i32, i32, i32)
+declare i32 @logic3(i32, i32, i32, i32)
diff --git a/test/Analysis/BlockFrequencyInfo/irreducible.ll b/test/Analysis/BlockFrequencyInfo/irreducible.ll
new file mode 100644
index 0000000..af4ad15
--- /dev/null
+++ b/test/Analysis/BlockFrequencyInfo/irreducible.ll
@@ -0,0 +1,421 @@
+; RUN: opt < %s -analyze -block-freq | FileCheck %s
+
+; A loop with multiple exits isn't irreducible. It should be handled
+; correctly.
+;
+; CHECK-LABEL: Printing analysis {{.*}} for function 'multiexit':
+; CHECK-NEXT: block-frequency-info: multiexit
+define void @multiexit(i1 %x) {
+; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
+entry:
+  br label %loop.1
+
+; CHECK-NEXT: loop.1: float = 2.0,
+loop.1:
+  br i1 %x, label %exit.1, label %loop.2, !prof !0
+
+; CHECK-NEXT: loop.2: float = 1.75,
+loop.2:
+  br i1 %x, label %exit.2, label %loop.1, !prof !1
+
+; CHECK-NEXT: exit.1: float = 0.25,
+exit.1:
+  br label %return
+
+; CHECK-NEXT: exit.2: float = 0.75,
+exit.2:
+  br label %return
+
+; CHECK-NEXT: return: float = 1.0, int = [[ENTRY]]
+return:
+  ret void
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 1, i32 7}
+!1 = metadata !{metadata !"branch_weights", i32 3, i32 4}
+
+; Irreducible control flow
+; ========================
+;
+; LoopInfo defines a loop as a non-trivial SCC dominated by a single block,
+; called the header. A given loop, L, can have sub-loops, which are loops
+; within the subgraph of L that excludes the header.
+;
+; In addition to loops, -block-freq has limited support for irreducible SCCs,
+; which are SCCs with multiple entry blocks. Irreducible SCCs are discovered
+; on the fly, and modelled as loops with multiple headers.
+;
+; The headers of irreducible sub-SCCs consist of their entry blocks and all
+; nodes that are targets of a backedge within them (excluding backedges within
+; true sub-loops).
+;
+; -block-freq is currently designed to act like a block is inserted that
+; intercepts all the edges to the headers. All backedges and entries point to
+; this block. Its successors are the headers, which split the frequency
+; evenly.
+;
+; There are a number of testcases below. Only the first two have detailed
+; explanations.
+;
+; Testcase #1
+; ===========
+;
+; In this case c1 and c2 should have frequencies of 15/7 and 13/7,
+; respectively. To calculate this, consider assigning 1.0 to entry, and
+; distributing frequency iteratively (to infinity). At the first iteration,
+; entry gives 3/4 to c1 and 1/4 to c2. At every step after, c1 and c2 give 3/4
+; of what they have to each other. Somehow, all of it comes out to exit.
+;
+; c1 = 3/4 + 1/4*3/4 + 3/4*3^2/4^2 + 1/4*3^3/4^3 + 3/4*3^4/4^4 + ...
+; c2 = 1/4 + 3/4*3/4 + 1/4*3^2/4^2 + 3/4*3^3/4^3 + 1/4*3^4/4^4 + ...
+;
+; Simplify by splitting up the odd and even terms of the series and taking out
+; factors so that the infinite series matches:
+;
+; c1 = 3/4 *(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
+;    + 3/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
+; c2 = 1/4 *(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
+;    + 9/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
+;
+; c1 = 15/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
+; c2 = 13/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
+;
+; Since this geometric series sums to 16/7:
+;
+; c1 = 15/7
+; c2 = 13/7
+;
+; If we treat c1 and c2 as members of the same loop, the exit frequency of the
+; loop as a whole is 1/4, so the loop scale should be 4. Summing c1 and c2
+; gives 28/7, or 4.0, which is nice confirmation of the math above.
+;
+; -block-freq currently treats the two nodes as equals.
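A standalone numeric check of the derivation above, separate from the test file itself: push 1.0 of mass out of entry with the 3:1 weights from !2, let c1 and c2 forward 3/4 of each arrival to each other and 1/4 to exit, and accumulate the arrivals. A minimal C++ sketch (names invented):

#include <cstdio>

int main() {
  double toC1 = 0.75, toC2 = 0.25; // entry's 3:1 split
  double c1 = 0, c2 = 0, exitMass = 0;
  for (int i = 0; i < 200; ++i) { // far past double-precision convergence
    c1 += toC1;
    c2 += toC2;
    exitMass += 0.25 * (toC1 + toC2); // each block leaks 1/4 to exit
    double nextC1 = 0.75 * toC2;      // c2 -> c1 edge
    double nextC2 = 0.75 * toC1;      // c1 -> c2 edge
    toC1 = nextC1;
    toC2 = nextC2;
  }
  std::printf("c1 = %.6f (15/7 = %.6f)\n", c1, 15.0 / 7.0);
  std::printf("c2 = %.6f (13/7 = %.6f)\n", c2, 13.0 / 7.0);
  std::printf("exit = %.6f\n", exitMass); // all mass drains out: 1.000000
}

The totals converge on 15/7 = 2.142857 and 13/7 = 1.857143, the "correct" answers quoted in the multientry comments below; -block-freq itself reports 2.0 for both because it treats the two headers as equals.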
+define void @multientry(i1 %x) {
+; CHECK-LABEL: Printing analysis {{.*}} for function 'multientry':
+; CHECK-NEXT: block-frequency-info: multientry
+entry:
+; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
+  br i1 %x, label %c1, label %c2, !prof !2
+
+c1:
+; CHECK-NEXT: c1: float = 2.0,
+; The "correct" answer is: float = 2.142857{{[0-9]*}},
+  br i1 %x, label %c2, label %exit, !prof !2
+
+c2:
+; CHECK-NEXT: c2: float = 2.0,
+; The "correct" answer is: float = 1.857142{{[0-9]*}},
+  br i1 %x, label %c1, label %exit, !prof !2
+
+exit:
+; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
+  ret void
+}
+
+!2 = metadata !{metadata !"branch_weights", i32 3, i32 1}
+
+; Testcase #2
+; ===========
+;
+; In this case c1 and c2 should be treated as equals in a single loop. The
+; exit frequency is 1/3, so the scaling factor for the loop should be 3.0. The
+; loop is entered 2/3 of the time, and c1 and c2 split the total loop frequency
+; evenly (1/2), so they should each have frequencies of 1.0 (3.0*2/3*1/2).
+; Another way of computing this result is by assigning 1.0 to entry and showing
+; that c1 and c2 should accumulate frequencies of:
+;
+; 1/3 + 2/9 + 4/27 + 8/81 + ...
+; 2^0/3^1 + 2^1/3^2 + 2^2/3^3 + 2^3/3^4 + ...
+;
+; At the first step, c1 and c2 each get 1/3 of the entry. At each subsequent
+; step, c1 and c2 each get 1/3 of what's left in c1 and c2 combined. This
+; infinite series sums to 1.
+;
+; Since the current algorithm *always* assumes entry blocks are equal,
+; -block-freq gets the right answers here.
+define void @crossloops(i2 %x) {
+; CHECK-LABEL: Printing analysis {{.*}} for function 'crossloops':
+; CHECK-NEXT: block-frequency-info: crossloops
+entry:
+; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
+  switch i2 %x, label %exit [ i2 1, label %c1
+                              i2 2, label %c2 ], !prof !3
+
+c1:
+; CHECK-NEXT: c1: float = 1.0,
+  switch i2 %x, label %exit [ i2 1, label %c1
+                              i2 2, label %c2 ], !prof !3
+
+c2:
+; CHECK-NEXT: c2: float = 1.0,
+  switch i2 %x, label %exit [ i2 1, label %c1
+                              i2 2, label %c2 ], !prof !3
+
+exit:
+; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
+  ret void
+}
+
+!3 = metadata !{metadata !"branch_weights", i32 2, i32 2, i32 2}
+
+; A true loop with irreducible control flow inside.
+define void @loop_around_irreducible(i1 %x) {
+; CHECK-LABEL: Printing analysis {{.*}} for function 'loop_around_irreducible':
+; CHECK-NEXT: block-frequency-info: loop_around_irreducible
+entry:
+; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
+  br label %loop
+
+loop:
+; CHECK-NEXT: loop: float = 4.0, int = [[HEAD:[0-9]+]]
+  br i1 %x, label %left, label %right, !prof !4
+
+left:
+; CHECK-NEXT: left: float = 8.0,
+  br i1 %x, label %right, label %loop.end, !prof !5
+
+right:
+; CHECK-NEXT: right: float = 8.0,
+  br i1 %x, label %left, label %loop.end, !prof !5
+
+loop.end:
+; CHECK-NEXT: loop.end: float = 4.0, int = [[HEAD]]
+  br i1 %x, label %loop, label %exit, !prof !5
+
+exit:
+; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
+  ret void
+}
+!4 = metadata !{metadata !"branch_weights", i32 1, i32 1}
+!5 = metadata !{metadata !"branch_weights", i32 3, i32 1}
+
+; Two unrelated irreducible SCCs.
+define void @two_sccs(i1 %x) { +; CHECK-LABEL: Printing analysis {{.*}} for function 'two_sccs': +; CHECK-NEXT: block-frequency-info: two_sccs +entry: +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] + br i1 %x, label %a, label %b, !prof !6 + +a: +; CHECK-NEXT: a: float = 0.75, + br i1 %x, label %a.left, label %a.right, !prof !7 + +a.left: +; CHECK-NEXT: a.left: float = 1.5, + br i1 %x, label %a.right, label %exit, !prof !6 + +a.right: +; CHECK-NEXT: a.right: float = 1.5, + br i1 %x, label %a.left, label %exit, !prof !6 + +b: +; CHECK-NEXT: b: float = 0.25, + br i1 %x, label %b.left, label %b.right, !prof !7 + +b.left: +; CHECK-NEXT: b.left: float = 0.625, + br i1 %x, label %b.right, label %exit, !prof !8 + +b.right: +; CHECK-NEXT: b.right: float = 0.625, + br i1 %x, label %b.left, label %exit, !prof !8 + +exit: +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] + ret void +} +!6 = metadata !{metadata !"branch_weights", i32 3, i32 1} +!7 = metadata !{metadata !"branch_weights", i32 1, i32 1} +!8 = metadata !{metadata !"branch_weights", i32 4, i32 1} + +; A true loop inside irreducible control flow. +define void @loop_inside_irreducible(i1 %x) { +; CHECK-LABEL: Printing analysis {{.*}} for function 'loop_inside_irreducible': +; CHECK-NEXT: block-frequency-info: loop_inside_irreducible +entry: +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] + br i1 %x, label %left, label %right, !prof !9 + +left: +; CHECK-NEXT: left: float = 2.0, + br i1 %x, label %right, label %exit, !prof !10 + +right: +; CHECK-NEXT: right: float = 2.0, int = [[RIGHT:[0-9]+]] + br label %loop + +loop: +; CHECK-NEXT: loop: float = 6.0, + br i1 %x, label %loop, label %right.end, !prof !11 + +right.end: +; CHECK-NEXT: right.end: float = 2.0, int = [[RIGHT]] + br i1 %x, label %left, label %exit, !prof !10 + +exit: +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] + ret void +} +!9 = metadata !{metadata !"branch_weights", i32 1, i32 1} +!10 = metadata !{metadata !"branch_weights", i32 3, i32 1} +!11 = metadata !{metadata !"branch_weights", i32 2, i32 1} + +; Irreducible control flow in a branch that's in a true loop. +define void @loop_around_branch_with_irreducible(i1 %x) { +; CHECK-LABEL: Printing analysis {{.*}} for function 'loop_around_branch_with_irreducible': +; CHECK-NEXT: block-frequency-info: loop_around_branch_with_irreducible +entry: +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] + br label %loop + +loop: +; CHECK-NEXT: loop: float = 2.0, int = [[LOOP:[0-9]+]] + br i1 %x, label %normal, label %irreducible.entry, !prof !12 + +normal: +; CHECK-NEXT: normal: float = 1.5, + br label %loop.end + +irreducible.entry: +; CHECK-NEXT: irreducible.entry: float = 0.5, int = [[IRREDUCIBLE:[0-9]+]] + br i1 %x, label %left, label %right, !prof !13 + +left: +; CHECK-NEXT: left: float = 1.0, + br i1 %x, label %right, label %irreducible.exit, !prof !12 + +right: +; CHECK-NEXT: right: float = 1.0, + br i1 %x, label %left, label %irreducible.exit, !prof !12 + +irreducible.exit: +; CHECK-NEXT: irreducible.exit: float = 0.5, int = [[IRREDUCIBLE]] + br label %loop.end + +loop.end: +; CHECK-NEXT: loop.end: float = 2.0, int = [[LOOP]] + br i1 %x, label %loop, label %exit, !prof !13 + +exit: +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] + ret void +} +!12 = metadata !{metadata !"branch_weights", i32 3, i32 1} +!13 = metadata !{metadata !"branch_weights", i32 1, i32 1} + +; Irreducible control flow between two true loops. 
+define void @loop_around_branch_with_irreducible_around_loop(i1 %x) {
+; CHECK-LABEL: Printing analysis {{.*}} for function 'loop_around_branch_with_irreducible_around_loop':
+; CHECK-NEXT: block-frequency-info: loop_around_branch_with_irreducible_around_loop
+entry:
+; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
+  br label %loop
+
+loop:
+; CHECK-NEXT: loop: float = 3.0, int = [[LOOP:[0-9]+]]
+  br i1 %x, label %normal, label %irreducible, !prof !14
+
+normal:
+; CHECK-NEXT: normal: float = 2.0,
+  br label %loop.end
+
+irreducible:
+; CHECK-NEXT: irreducible: float = 1.0,
+  br i1 %x, label %left, label %right, !prof !15
+
+left:
+; CHECK-NEXT: left: float = 2.0,
+  br i1 %x, label %right, label %loop.end, !prof !16
+
+right:
+; CHECK-NEXT: right: float = 2.0, int = [[RIGHT:[0-9]+]]
+  br label %right.loop
+
+right.loop:
+; CHECK-NEXT: right.loop: float = 10.0,
+  br i1 %x, label %right.loop, label %right.end, !prof !17
+
+right.end:
+; CHECK-NEXT: right.end: float = 2.0, int = [[RIGHT]]
+  br i1 %x, label %left, label %loop.end, !prof !16
+
+loop.end:
+; CHECK-NEXT: loop.end: float = 3.0, int = [[LOOP]]
+  br i1 %x, label %loop, label %exit, !prof !14
+
+exit:
+; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
+  ret void
+}
+!14 = metadata !{metadata !"branch_weights", i32 2, i32 1}
+!15 = metadata !{metadata !"branch_weights", i32 1, i32 1}
+!16 = metadata !{metadata !"branch_weights", i32 3, i32 1}
+!17 = metadata !{metadata !"branch_weights", i32 4, i32 1}
+
+; An irreducible SCC with a non-header.
+define void @nonheader(i1 %x) {
+; CHECK-LABEL: Printing analysis {{.*}} for function 'nonheader':
+; CHECK-NEXT: block-frequency-info: nonheader
+entry:
+; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
+  br i1 %x, label %left, label %right, !prof !18
+
+left:
+; CHECK-NEXT: left: float = 1.0,
+  br i1 %x, label %bottom, label %exit, !prof !19
+
+right:
+; CHECK-NEXT: right: float = 1.0,
+  br i1 %x, label %bottom, label %exit, !prof !20
+
+bottom:
+; CHECK-NEXT: bottom: float = 1.0,
+  br i1 %x, label %left, label %right, !prof !18
+
+exit:
+; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
+  ret void
+}
+!18 = metadata !{metadata !"branch_weights", i32 1, i32 1}
+!19 = metadata !{metadata !"branch_weights", i32 1, i32 3}
+!20 = metadata !{metadata !"branch_weights", i32 3, i32 1}
+
+; An irreducible SCC with an irreducible sub-SCC. In the current version of
+; -block-freq, this means an extra header.
+;
+; This testcase uses non-trivial branch weights. The CHECK statements here
+; will start to fail if we change -block-freq to be more accurate. Currently,
+; we expect left, right and top to be treated as equal headers.
+define void @nonentry_header(i1 %x, i2 %y) { +; CHECK-LABEL: Printing analysis {{.*}} for function 'nonentry_header': +; CHECK-NEXT: block-frequency-info: nonentry_header +entry: +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] + br i1 %x, label %left, label %right, !prof !21 + +left: +; CHECK-NEXT: left: float = 3.0, + br i1 %x, label %top, label %bottom, !prof !22 + +right: +; CHECK-NEXT: right: float = 3.0, + br i1 %x, label %top, label %bottom, !prof !22 + +top: +; CHECK-NEXT: top: float = 3.0, + switch i2 %y, label %exit [ i2 0, label %left + i2 1, label %right + i2 2, label %bottom ], !prof !23 + +bottom: +; CHECK-NEXT: bottom: float = 4.5, + br label %top + +exit: +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] + ret void +} +!21 = metadata !{metadata !"branch_weights", i32 2, i32 1} +!22 = metadata !{metadata !"branch_weights", i32 1, i32 1} +!23 = metadata !{metadata !"branch_weights", i32 8, i32 1, i32 3, i32 12} diff --git a/test/Analysis/BlockFrequencyInfo/loop_with_branch.ll b/test/Analysis/BlockFrequencyInfo/loop_with_branch.ll new file mode 100644 index 0000000..9d27b6b --- /dev/null +++ b/test/Analysis/BlockFrequencyInfo/loop_with_branch.ll @@ -0,0 +1,44 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s + +; CHECK-LABEL: Printing analysis {{.*}} for function 'loop_with_branch': +; CHECK-NEXT: block-frequency-info: loop_with_branch +define void @loop_with_branch(i32 %a) { +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +entry: + %skip_loop = call i1 @foo0(i32 %a) + br i1 %skip_loop, label %skip, label %header, !prof !0 + +; CHECK-NEXT: skip: float = 0.25, +skip: + br label %exit + +; CHECK-NEXT: header: float = 4.5, +header: + %i = phi i32 [ 0, %entry ], [ %i.next, %back ] + %i.next = add i32 %i, 1 + %choose = call i2 @foo1(i32 %i) + switch i2 %choose, label %exit [ i2 0, label %left + i2 1, label %right ], !prof !1 + +; CHECK-NEXT: left: float = 1.5, +left: + br label %back + +; CHECK-NEXT: right: float = 2.25, +right: + br label %back + +; CHECK-NEXT: back: float = 3.75, +back: + br label %header + +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] +exit: + ret void +} + +declare i1 @foo0(i32) +declare i2 @foo1(i32) + +!0 = metadata !{metadata !"branch_weights", i32 1, i32 3} +!1 = metadata !{metadata !"branch_weights", i32 1, i32 2, i32 3} diff --git a/test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll b/test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll new file mode 100644 index 0000000..d93ffce --- /dev/null +++ b/test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll @@ -0,0 +1,59 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s + +; CHECK-LABEL: Printing analysis {{.*}} for function 'nested_loop_with_branches' +; CHECK-NEXT: block-frequency-info: nested_loop_with_branches +define void @nested_loop_with_branches(i32 %a) { +; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]] +entry: + %v0 = call i1 @foo0(i32 %a) + br i1 %v0, label %exit, label %outer, !prof !0 + +; CHECK-NEXT: outer: float = 12.0, +outer: + %i = phi i32 [ 0, %entry ], [ %i.next, %inner.end ], [ %i.next, %no_inner ] + %i.next = add i32 %i, 1 + %do_inner = call i1 @foo1(i32 %i) + br i1 %do_inner, label %no_inner, label %inner, !prof !0 + +; CHECK-NEXT: inner: float = 36.0, +inner: + %j = phi i32 [ 0, %outer ], [ %j.next, %inner.end ] + %side = call i1 @foo3(i32 %j) + br i1 %side, label %left, label %right, !prof !0 + +; CHECK-NEXT: left: float = 9.0, +left: + %v4 = call i1 @foo4(i32 %j) + br label %inner.end + +; CHECK-NEXT: right: float 
= 27.0, +right: + %v5 = call i1 @foo5(i32 %j) + br label %inner.end + +; CHECK-NEXT: inner.end: float = 36.0, +inner.end: + %stay_inner = phi i1 [ %v4, %left ], [ %v5, %right ] + %j.next = add i32 %j, 1 + br i1 %stay_inner, label %inner, label %outer, !prof !1 + +; CHECK-NEXT: no_inner: float = 3.0, +no_inner: + %continue = call i1 @foo6(i32 %i) + br i1 %continue, label %outer, label %exit, !prof !1 + +; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]] +exit: + ret void +} + +declare i1 @foo0(i32) +declare i1 @foo1(i32) +declare i1 @foo2(i32) +declare i1 @foo3(i32) +declare i1 @foo4(i32) +declare i1 @foo5(i32) +declare i1 @foo6(i32) + +!0 = metadata !{metadata !"branch_weights", i32 1, i32 3} +!1 = metadata !{metadata !"branch_weights", i32 3, i32 1} diff --git a/test/Analysis/BranchProbabilityInfo/loop.ll b/test/Analysis/BranchProbabilityInfo/loop.ll index b648cbb..40f1111 100644 --- a/test/Analysis/BranchProbabilityInfo/loop.ll +++ b/test/Analysis/BranchProbabilityInfo/loop.ll @@ -15,7 +15,7 @@ do.body: %i.0 = phi i32 [ 0, %entry ], [ %inc3, %do.end ] call void @g1() br label %do.body1 -; CHECK: edge do.body -> do.body1 probability is 124 / 124 = 100% +; CHECK: edge do.body -> do.body1 probability is 16 / 16 = 100% do.body1: %j.0 = phi i32 [ 0, %do.body ], [ %inc, %do.body1 ] @@ -55,8 +55,8 @@ for.body: %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc5, %for.end ] call void @g1() br i1 %cmp27, label %for.body3, label %for.end -; CHECK: edge for.body -> for.body3 probability is 62 / 124 = 50% -; CHECK: edge for.body -> for.end probability is 62 / 124 = 50% +; CHECK: edge for.body -> for.body3 probability is 20 / 32 = 62.5% +; CHECK: edge for.body -> for.end probability is 12 / 32 = 37.5% for.body3: %j.08 = phi i32 [ %inc, %for.body3 ], [ 0, %for.body ] @@ -91,8 +91,8 @@ do.body: %0 = load i32* %c, align 4 %cmp = icmp slt i32 %0, 42 br i1 %cmp, label %do.body1, label %if.end -; CHECK: edge do.body -> do.body1 probability is 62 / 124 = 50% -; CHECK: edge do.body -> if.end probability is 62 / 124 = 50% +; CHECK: edge do.body -> do.body1 probability is 16 / 32 = 50% +; CHECK: edge do.body -> if.end probability is 16 / 32 = 50% do.body1: %j.0 = phi i32 [ %inc, %do.body1 ], [ 0, %do.body ] @@ -165,7 +165,7 @@ do.body: %i.0 = phi i32 [ 0, %entry ], [ %inc4, %do.end ] call void @g1() br label %do.body1 -; CHECK: edge do.body -> do.body1 probability is 124 / 124 = 100% +; CHECK: edge do.body -> do.body1 probability is 16 / 16 = 100% do.body1: %j.0 = phi i32 [ 0, %do.body ], [ %inc, %if.end ] @@ -209,7 +209,7 @@ do.body: %i.0 = phi i32 [ 0, %entry ], [ %inc4, %do.end ] call void @g1() br label %do.body1 -; CHECK: edge do.body -> do.body1 probability is 124 / 124 = 100% +; CHECK: edge do.body -> do.body1 probability is 16 / 16 = 100% do.body1: %j.0 = phi i32 [ 0, %do.body ], [ %inc, %do.cond ] @@ -261,14 +261,14 @@ for.body: %0 = load i32* %c, align 4 %cmp1 = icmp eq i32 %0, %i.011 br i1 %cmp1, label %for.inc5, label %if.end -; CHECK: edge for.body -> for.inc5 probability is 62 / 124 = 50% -; CHECK: edge for.body -> if.end probability is 62 / 124 = 50% +; CHECK: edge for.body -> for.inc5 probability is 16 / 32 = 50% +; CHECK: edge for.body -> if.end probability is 16 / 32 = 50% if.end: call void @g1() br i1 %cmp38, label %for.body4, label %for.end -; CHECK: edge if.end -> for.body4 probability is 62 / 124 = 50% -; CHECK: edge if.end -> for.end probability is 62 / 124 = 50% +; CHECK: edge if.end -> for.body4 probability is 20 / 32 = 62.5% +; CHECK: edge if.end -> for.end probability is 12 / 32 = 37.5% 
for.body4: %j.09 = phi i32 [ %inc, %for.body4 ], [ 0, %if.end ] @@ -282,7 +282,7 @@ for.body4: for.end: call void @g3() br label %for.inc5 -; CHECK: edge for.end -> for.inc5 probability is 124 / 124 = 100% +; CHECK: edge for.end -> for.inc5 probability is 16 / 16 = 100% for.inc5: %inc6 = add nsw i32 %i.011, 1 @@ -314,35 +314,35 @@ for.body: %i.019 = phi i32 [ 0, %for.body.lr.ph ], [ %inc14, %for.end ] call void @g1() br i1 %cmp216, label %for.body3, label %for.end -; CHECK: edge for.body -> for.body3 probability is 62 / 124 = 50% -; CHECK: edge for.body -> for.end probability is 62 / 124 = 50% +; CHECK: edge for.body -> for.body3 probability is 20 / 32 = 62.5% +; CHECK: edge for.body -> for.end probability is 12 / 32 = 37.5% for.body3: %j.017 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] %0 = load i32* %c, align 4 %cmp4 = icmp eq i32 %0, %j.017 br i1 %cmp4, label %for.inc, label %if.end -; CHECK: edge for.body3 -> for.inc probability is 62 / 124 = 50% -; CHECK: edge for.body3 -> if.end probability is 62 / 124 = 50% +; CHECK: edge for.body3 -> for.inc probability is 16 / 32 = 50% +; CHECK: edge for.body3 -> if.end probability is 16 / 32 = 50% if.end: %1 = load i32* %arrayidx5, align 4 %cmp6 = icmp eq i32 %1, %j.017 br i1 %cmp6, label %for.inc, label %if.end8 -; CHECK: edge if.end -> for.inc probability is 62 / 124 = 50% -; CHECK: edge if.end -> if.end8 probability is 62 / 124 = 50% +; CHECK: edge if.end -> for.inc probability is 16 / 32 = 50% +; CHECK: edge if.end -> if.end8 probability is 16 / 32 = 50% if.end8: %2 = load i32* %arrayidx9, align 4 %cmp10 = icmp eq i32 %2, %j.017 br i1 %cmp10, label %for.inc, label %if.end12 -; CHECK: edge if.end8 -> for.inc probability is 62 / 124 = 50% -; CHECK: edge if.end8 -> if.end12 probability is 62 / 124 = 50% +; CHECK: edge if.end8 -> for.inc probability is 16 / 32 = 50% +; CHECK: edge if.end8 -> if.end12 probability is 16 / 32 = 50% if.end12: call void @g2() br label %for.inc -; CHECK: edge if.end12 -> for.inc probability is 124 / 124 = 100% +; CHECK: edge if.end12 -> for.inc probability is 16 / 16 = 100% for.inc: %inc = add nsw i32 %j.017, 1 diff --git a/test/Analysis/BranchProbabilityInfo/pr18705.ll b/test/Analysis/BranchProbabilityInfo/pr18705.ll new file mode 100644 index 0000000..9f239b4 --- /dev/null +++ b/test/Analysis/BranchProbabilityInfo/pr18705.ll @@ -0,0 +1,58 @@ +; RUN: opt < %s -analyze -branch-prob | FileCheck %s + +; Since neither of while.body's out-edges is an exit or a back edge, +; calcLoopBranchHeuristics should return early without setting the weights. +; calcFloatingPointHeuristics, which is run later, sets the weights. 
+; +; CHECK: edge while.body -> if.then probability is 20 / 32 = 62.5% +; CHECK: edge while.body -> if.else probability is 12 / 32 = 37.5% + +define void @foo1(i32 %n, i32* nocapture %b, i32* nocapture %c, i32* nocapture %d, float* nocapture readonly %f0, float* nocapture readonly %f1) { +entry: + %tobool8 = icmp eq i32 %n, 0 + br i1 %tobool8, label %while.end, label %while.body.lr.ph + +while.body.lr.ph: + %0 = sext i32 %n to i64 + br label %while.body + +while.body: + %indvars.iv = phi i64 [ %0, %while.body.lr.ph ], [ %indvars.iv.next, %if.end ] + %b.addr.011 = phi i32* [ %b, %while.body.lr.ph ], [ %b.addr.1, %if.end ] + %d.addr.010 = phi i32* [ %d, %while.body.lr.ph ], [ %incdec.ptr4, %if.end ] + %c.addr.09 = phi i32* [ %c, %while.body.lr.ph ], [ %c.addr.1, %if.end ] + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %arrayidx = getelementptr inbounds float* %f0, i64 %indvars.iv.next + %1 = load float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float* %f1, i64 %indvars.iv.next + %2 = load float* %arrayidx2, align 4 + %cmp = fcmp une float %1, %2 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %incdec.ptr = getelementptr inbounds i32* %b.addr.011, i64 1 + %3 = load i32* %b.addr.011, align 4 + %add = add nsw i32 %3, 12 + store i32 %add, i32* %b.addr.011, align 4 + br label %if.end + +if.else: + %incdec.ptr3 = getelementptr inbounds i32* %c.addr.09, i64 1 + %4 = load i32* %c.addr.09, align 4 + %sub = add nsw i32 %4, -13 + store i32 %sub, i32* %c.addr.09, align 4 + br label %if.end + +if.end: + %c.addr.1 = phi i32* [ %c.addr.09, %if.then ], [ %incdec.ptr3, %if.else ] + %b.addr.1 = phi i32* [ %incdec.ptr, %if.then ], [ %b.addr.011, %if.else ] + %incdec.ptr4 = getelementptr inbounds i32* %d.addr.010, i64 1 + store i32 14, i32* %d.addr.010, align 4 + %5 = trunc i64 %indvars.iv.next to i32 + %tobool = icmp eq i32 %5, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: + ret void +} + diff --git a/test/Analysis/CostModel/AArch64/lit.local.cfg b/test/Analysis/CostModel/AArch64/lit.local.cfg new file mode 100644 index 0000000..c420349 --- /dev/null +++ b/test/Analysis/CostModel/AArch64/lit.local.cfg @@ -0,0 +1,3 @@ +targets = set(config.root.targets_to_build.split()) +if not 'AArch64' in targets: + config.unsupported = True diff --git a/test/Analysis/CostModel/AArch64/select.ll b/test/Analysis/CostModel/AArch64/select.ll new file mode 100644 index 0000000..216dc5d --- /dev/null +++ b/test/Analysis/CostModel/AArch64/select.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" + +; CHECK-LABEL: select +define void @select() { + ; Scalar values + ; CHECK: cost of 1 {{.*}} select + %v1 = select i1 undef, i8 undef, i8 undef + ; CHECK: cost of 1 {{.*}} select + %v2 = select i1 undef, i16 undef, i16 undef + ; CHECK: cost of 1 {{.*}} select + %v3 = select i1 undef, i32 undef, i32 undef + ; CHECK: cost of 1 {{.*}} select + %v4 = select i1 undef, i64 undef, i64 undef + ; CHECK: cost of 1 {{.*}} select + %v5 = select i1 undef, float undef, float undef + ; CHECK: cost of 1 {{.*}} select + %v6 = select i1 undef, double undef, double undef + + ; Vector values - check for vectors that have a high cost because they end up + ; scalarized. 
+  ; CHECK: cost of 320 {{.*}} select
+  %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
+
+  ; CHECK: cost of 160 {{.*}} select
+  %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
+  ; CHECK: cost of 320 {{.*}} select
+  %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
+
+  ; CHECK: cost of 80 {{.*}} select
+  %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+  ; CHECK: cost of 160 {{.*}} select
+  %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
+  ; CHECK: cost of 320 {{.*}} select
+  %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+
+  ret void
+}
diff --git a/test/Analysis/CostModel/AArch64/store.ll b/test/Analysis/CostModel/AArch64/store.ll
new file mode 100644
index 0000000..0c9883c
--- /dev/null
+++ b/test/Analysis/CostModel/AArch64/store.ll
@@ -0,0 +1,22 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+; CHECK-LABEL: store
+define void @store() {
+  ; Stores of <2 x i64> should be expensive because we don't split them
+  ; and unaligned 16b stores have bad performance.
+  ; CHECK: cost of 12 {{.*}} store
+  store <2 x i64> undef, <2 x i64> * undef
+
+  ; We scalarize the loads/stores because there is no vector register name for
+  ; these types (they get extended to v.4h/v.2s).
+  ; CHECK: cost of 16 {{.*}} store
+  store <2 x i8> undef, <2 x i8> * undef
+  ; CHECK: cost of 64 {{.*}} store
+  store <4 x i8> undef, <4 x i8> * undef
+  ; CHECK: cost of 16 {{.*}} load
+  load <2 x i8> * undef
+  ; CHECK: cost of 64 {{.*}} load
+  load <4 x i8> * undef
+
+  ret void
+}
diff --git a/test/Analysis/CostModel/ARM64/lit.local.cfg b/test/Analysis/CostModel/ARM64/lit.local.cfg
deleted file mode 100644
index 84ac981..0000000
--- a/test/Analysis/CostModel/ARM64/lit.local.cfg
+++ /dev/null
@@ -1,3 +0,0 @@
-targets = set(config.root.targets_to_build.split())
-if not 'ARM64' in targets:
-    config.unsupported = True
diff --git a/test/Analysis/CostModel/ARM64/select.ll b/test/Analysis/CostModel/ARM64/select.ll
deleted file mode 100644
index 216dc5d..0000000
--- a/test/Analysis/CostModel/ARM64/select.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
-
-; CHECK-LABEL: select
-define void @select() {
-  ; Scalar values
-  ; CHECK: cost of 1 {{.*}} select
-  %v1 = select i1 undef, i8 undef, i8 undef
-  ; CHECK: cost of 1 {{.*}} select
-  %v2 = select i1 undef, i16 undef, i16 undef
-  ; CHECK: cost of 1 {{.*}} select
-  %v3 = select i1 undef, i32 undef, i32 undef
-  ; CHECK: cost of 1 {{.*}} select
-  %v4 = select i1 undef, i64 undef, i64 undef
-  ; CHECK: cost of 1 {{.*}} select
-  %v5 = select i1 undef, float undef, float undef
-  ; CHECK: cost of 1 {{.*}} select
-  %v6 = select i1 undef, double undef, double undef
-
-  ; Vector values - check for vectors that have a high cost because they end up
-  ; scalarized.
- ; CHECK: cost of 320 {{.*}} select - %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef - - ; CHECK: cost of 160 {{.*}} select - %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef - ; CHECK: cost of 320 {{.*}} select - %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef - - ; CHECK: cost of 80 {{.*}} select - %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef - ; CHECK: cost of 160 {{.*}} select - %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef - ; CHECK: cost of 320 {{.*}} select - %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef - - ret void -} diff --git a/test/Analysis/CostModel/ARM64/store.ll b/test/Analysis/CostModel/ARM64/store.ll deleted file mode 100644 index 0c9883c..0000000 --- a/test/Analysis/CostModel/ARM64/store.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" -; CHECK-LABEL: store -define void @store() { - ; Stores of <2 x i64> should be expensive because we don't split them and - ; and unaligned 16b stores have bad performance. - ; CHECK: cost of 12 {{.*}} store - store <2 x i64> undef, <2 x i64> * undef - - ; We scalarize the loads/stores because there is no vector register name for - ; these types (they get extended to v.4h/v.2s). - ; CHECK: cost of 16 {{.*}} store - store <2 x i8> undef, <2 x i8> * undef - ; CHECK: cost of 64 {{.*}} store - store <4 x i8> undef, <4 x i8> * undef - ; CHECK: cost of 16 {{.*}} load - load <2 x i8> * undef - ; CHECK: cost of 64 {{.*}} load - load <4 x i8> * undef - - ret void -} diff --git a/test/Analysis/CostModel/PowerPC/ext.ll b/test/Analysis/CostModel/PowerPC/ext.ll index daaa8f5..7d6a14e 100644 --- a/test/Analysis/CostModel/PowerPC/ext.ll +++ b/test/Analysis/CostModel/PowerPC/ext.ll @@ -13,7 +13,7 @@ define void @exts() { ; CHECK: cost of 1 {{.*}} sext %v3 = sext <4 x i16> undef to <4 x i32> - ; CHECK: cost of 216 {{.*}} sext + ; CHECK: cost of 112 {{.*}} sext %v4 = sext <8 x i16> undef to <8 x i32> ret void diff --git a/test/Analysis/CostModel/PowerPC/insert_extract.ll b/test/Analysis/CostModel/PowerPC/insert_extract.ll index f51963d..8dc0031 100644 --- a/test/Analysis/CostModel/PowerPC/insert_extract.ll +++ b/test/Analysis/CostModel/PowerPC/insert_extract.ll @@ -3,13 +3,13 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "powerpc64-unknown-linux-gnu" define i32 @insert(i32 %arg) { - ; CHECK: cost of 13 {{.*}} insertelement + ; CHECK: cost of 10 {{.*}} insertelement %x = insertelement <4 x i32> undef, i32 %arg, i32 0 ret i32 undef } define i32 @extract(<4 x i32> %arg) { - ; CHECK: cost of 13 {{.*}} extractelement + ; CHECK: cost of 3 {{.*}} extractelement %x = extractelement <4 x i32> %arg, i32 0 ret i32 %x } diff --git a/test/Analysis/CostModel/PowerPC/load_store.ll b/test/Analysis/CostModel/PowerPC/load_store.ll index 8145a1d..368f0a7 100644 --- a/test/Analysis/CostModel/PowerPC/load_store.ll +++ b/test/Analysis/CostModel/PowerPC/load_store.ll @@ -31,9 +31,15 @@ define i32 @loads(i32 %arg) { ; FIXME: There actually are sub-vector Altivec loads, and so we could handle ; this with a small expense, but we don't currently. 
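; Editorial sketch (an assumption, not part of the patch): lacking a cheap
; sub-vector load in the model, the under-aligned <4 x i16> load is priced as
; if it were scalarized into four lane loads plus inserts, roughly:
;   %e0 = load i16* %p, align 2
;   %v0 = insertelement <4 x i16> undef, i16 %e0, i32 0
;   ... and likewise for lanes 1 through 3 ...
; (%p is illustrative). That is why it checks in at cost 48 below, while the
; aligned, natively-sized <4 x i32> load costs 1.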
- ; CHECK: cost of 60 {{.*}} load + ; CHECK: cost of 48 {{.*}} load load <4 x i16>* undef, align 2 + ; CHECK: cost of 1 {{.*}} load + load <4 x i32>* undef, align 4 + + ; CHECK: cost of 46 {{.*}} load + load <3 x float>* undef, align 1 + ret i32 undef } diff --git a/test/Analysis/CostModel/X86/intrinsic-cost.ll b/test/Analysis/CostModel/X86/intrinsic-cost.ll index 8eeee81..3b27b52 100644 --- a/test/Analysis/CostModel/X86/intrinsic-cost.ll +++ b/test/Analysis/CostModel/X86/intrinsic-cost.ll @@ -58,3 +58,31 @@ for.end: ; preds = %vector.body } declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) nounwind readnone + +define void @test3(float* nocapture %f, <4 x float> %b, <4 x float> %c) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float* %f, i64 %index + %1 = bitcast float* %0 to <4 x float>* + %wide.load = load <4 x float>* %1, align 4 + %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c) + store <4 x float> %2, <4 x float>* %1, align 4 + %index.next = add i64 %index, 4 + %3 = icmp eq i64 %index.next, 1024 + br i1 %3, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; CORE2: Printing analysis 'Cost Model Analysis' for function 'test3': +; CORE2: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c) + +; COREI7: Printing analysis 'Cost Model Analysis' for function 'test3': +; COREI7: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c) + +} + +declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone diff --git a/test/Analysis/CostModel/X86/vdiv-cost.ll b/test/Analysis/CostModel/X86/vdiv-cost.ll new file mode 100644 index 0000000..c8e4557 --- /dev/null +++ b/test/Analysis/CostModel/X86/vdiv-cost.ll @@ -0,0 +1,92 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 + +define <4 x i32> @test1(<4 x i32> %a) { + %div = udiv <4 x i32> %a, + ret <4 x i32> %div + +; CHECK: 'Cost Model Analysis' for function 'test1': +; SSE2: Found an estimated cost of 15 for instruction: %div +; AVX2: Found an estimated cost of 15 for instruction: %div +} + +define <8 x i32> @test2(<8 x i32> %a) { + %div = udiv <8 x i32> %a, + ret <8 x i32> %div + +; CHECK: 'Cost Model Analysis' for function 'test2': +; SSE2: Found an estimated cost of 30 for instruction: %div +; AVX2: Found an estimated cost of 15 for instruction: %div +} + +define <8 x i16> @test3(<8 x i16> %a) { + %div = udiv <8 x i16> %a, + ret <8 x i16> %div + +; CHECK: 'Cost Model Analysis' for function 'test3': +; SSE2: Found an estimated cost of 6 for instruction: %div +; AVX2: Found an estimated cost of 6 for instruction: %div +} + +define <16 x i16> @test4(<16 x i16> %a) { + %div = udiv <16 x i16> %a, + ret <16 x i16> %div + +; CHECK: 'Cost Model Analysis' for function 'test4': +; SSE2: Found an estimated cost of 12 for instruction: %div +; AVX2: Found an estimated cost of 6 for instruction: %div +} + +define <8 x i16> @test5(<8 x i16> %a) 
{ + %div = sdiv <8 x i16> %a, + ret <8 x i16> %div + +; CHECK: 'Cost Model Analysis' for function 'test5': +; SSE2: Found an estimated cost of 6 for instruction: %div +; AVX2: Found an estimated cost of 6 for instruction: %div +} + +define <16 x i16> @test6(<16 x i16> %a) { + %div = sdiv <16 x i16> %a, + ret <16 x i16> %div + +; CHECK: 'Cost Model Analysis' for function 'test6': +; SSE2: Found an estimated cost of 12 for instruction: %div +; AVX2: Found an estimated cost of 6 for instruction: %div +} + +define <16 x i8> @test7(<16 x i8> %a) { + %div = sdiv <16 x i8> %a, + ret <16 x i8> %div + +; CHECK: 'Cost Model Analysis' for function 'test7': +; SSE2: Found an estimated cost of 320 for instruction: %div +; AVX2: Found an estimated cost of 320 for instruction: %div +} + +define <4 x i32> @test8(<4 x i32> %a) { + %div = sdiv <4 x i32> %a, + ret <4 x i32> %div + +; CHECK: 'Cost Model Analysis' for function 'test8': +; SSE2: Found an estimated cost of 19 for instruction: %div +; AVX2: Found an estimated cost of 15 for instruction: %div +} + +define <8 x i32> @test9(<8 x i32> %a) { + %div = sdiv <8 x i32> %a, + ret <8 x i32> %div + +; CHECK: 'Cost Model Analysis' for function 'test9': +; SSE2: Found an estimated cost of 38 for instruction: %div +; AVX2: Found an estimated cost of 15 for instruction: %div +} + +define <8 x i32> @test10(<8 x i32> %a) { + %div = sdiv <8 x i32> %a, + ret <8 x i32> %div + +; CHECK: 'Cost Model Analysis' for function 'test10': +; SSE2: Found an estimated cost of 160 for instruction: %div +; AVX2: Found an estimated cost of 160 for instruction: %div +} diff --git a/test/Analysis/CostModel/X86/vselect-cost.ll b/test/Analysis/CostModel/X86/vselect-cost.ll new file mode 100644 index 0000000..2416777 --- /dev/null +++ b/test/Analysis/CostModel/X86/vselect-cost.ll @@ -0,0 +1,126 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 + + +; Verify the cost of vector select instructions. + +; SSE41 added blend instructions with an immediate for <2 x double> and +; <4 x float>. Integers of the same size should also use those instructions. 
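+; Editorial background note (hedged): on plain SSE2 a vector select is
+; expected to lower to a three-instruction and/andn/or masking sequence,
+; whereas the SSE4.1 blend family (blendps/blendpd with an immediate mask for
+; constant conditions, blendvps/blendvpd/pblendvb otherwise) handles it in a
+; single instruction, which is consistent with the SSE2 costs of 3 to 32
+; dropping to 1 under SSE41/AVX/AVX2 in the checks below.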
+ +define <2 x i64> @test_2i64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_2i64': +; SSE2: Cost Model: {{.*}} 4 for instruction: %sel = select <2 x i1> +; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1> +; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1> +; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1> + %sel = select <2 x i1> , <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %sel +} + +define <2 x double> @test_2double(<2 x double> %a, <2 x double> %b) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_2double': +; SSE2: Cost Model: {{.*}} 3 for instruction: %sel = select <2 x i1> +; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1> +; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1> +; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <2 x i1> + %sel = select <2 x i1> , <2 x double> %a, <2 x double> %b + ret <2 x double> %sel +} + +define <4 x i32> @test_4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4i32': +; SSE2: Cost Model: {{.*}} 8 for instruction: %sel = select <4 x i1> +; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1> +; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1> +; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1> + %sel = select <4 x i1> , <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %sel +} + +define <4 x float> @test_4float(<4 x float> %a, <4 x float> %b) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4float': +; SSE2: Cost Model: {{.*}} 7 for instruction: %sel = select <4 x i1> +; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1> +; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1> +; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1> + %sel = select <4 x i1> , <4 x float> %a, <4 x float> %b + ret <4 x float> %sel +} + +define <16 x i8> @test_16i8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_16i8': +; SSE2: Cost Model: {{.*}} 32 for instruction: %sel = select <16 x i1> +; SSE41: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1> +; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1> +; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1> + %sel = select <16 x i1> , <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %sel +} + +; AVX added blend instructions with an immediate for <4 x double> and +; <8 x float>. Integers of the same size should also use those instructions. 
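+; Editorial background note (hedged): AVX provides 256-bit forms of these
+; blends (vblendps/vblendpd and vblendvps/vblendvpd), so a 256-bit select can
+; stay a single instruction, while a 128-bit-only target is assumed to split
+; the operation into two halves; that matches the SSE41 cost of 2 versus the
+; AVX/AVX2 cost of 1 checked below.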
+define <4 x i64> @test_4i64(<4 x i64> %a, <4 x i64> %b) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4i64': +; SSE2: Cost Model: {{.*}} 8 for instruction: %sel = select <4 x i1> +; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <4 x i1> +; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1> +; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1> + %sel = select <4 x i1> , <4 x i64> %a, <4 x i64> %b + ret <4 x i64> %sel +} + +define <4 x double> @test_4double(<4 x double> %a, <4 x double> %b) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_4double': +; SSE2: Cost Model: {{.*}} 6 for instruction: %sel = select <4 x i1> +; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <4 x i1> +; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1> +; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <4 x i1> + %sel = select <4 x i1> , <4 x double> %a, <4 x double> %b + ret <4 x double> %sel +} + +define <8 x i32> @test_8i32(<8 x i32> %a, <8 x i32> %b) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_8i32': +; SSE2: Cost Model: {{.*}} 16 for instruction: %sel = select <8 x i1> +; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <8 x i1> +; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1> +; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1> + %sel = select <8 x i1> , <8 x i32> %a, <8 x i32> %b + ret <8 x i32> %sel +} + +define <8 x float> @test_8float(<8 x float> %a, <8 x float> %b) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_8float': +; SSE2: Cost Model: {{.*}} 14 for instruction: %sel = select <8 x i1> +; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <8 x i1> +; AVX: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1> +; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <8 x i1> + %sel = select <8 x i1> , <8 x float> %a, <8 x float> %b + ret <8 x float> %sel +} + +; AVX2 +define <16 x i16> @test_16i16(<16 x i16> %a, <16 x i16> %b) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_16i16': +; SSE2: Cost Model: {{.*}} 32 for instruction: %sel = select <16 x i1> +; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <16 x i1> +;;; FIXME: This AVX cost is obviously wrong. We shouldn't be scalarizing. +; AVX: Cost Model: {{.*}} 32 for instruction: %sel = select <16 x i1> +; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <16 x i1> + %sel = select <16 x i1> , <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %sel +} + +define <32 x i8> @test_32i8(<32 x i8> %a, <32 x i8> %b) { +; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_32i8': +; SSE2: Cost Model: {{.*}} 64 for instruction: %sel = select <32 x i1> +; SSE41: Cost Model: {{.*}} 2 for instruction: %sel = select <32 x i1> +;;; FIXME: This AVX cost is obviously wrong. We shouldn't be scalarizing. +; AVX: Cost Model: {{.*}} 64 for instruction: %sel = select <32 x i1> +; AVX2: Cost Model: {{.*}} 1 for instruction: %sel = select <32 x i1> + %sel = select <32 x i1> , <32 x i8> %a, <32 x i8> %b + ret <32 x i8> %sel +} + diff --git a/test/Analysis/Delinearization/a.ll b/test/Analysis/Delinearization/a.ll index 9308749..efebcc4 100644 --- a/test/Analysis/Delinearization/a.ll +++ b/test/Analysis/Delinearization/a.ll @@ -12,17 +12,6 @@ ; CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(i32) bytes.
; CHECK: ArrayRef[{3,+,2}<%for.i>][{-4,+,3}<%for.j>][{7,+,5}<%for.k>] -; AddRec: {{(8 + ((4 + (12 * %m)) * %o) + %A),+,(8 * %m * %o)}<%for.i>,+,(12 * %o)}<%for.j> -; CHECK: Base offset: %A -; CHECK: ArrayDecl[UnknownSize][%o] with elements of sizeof(i32) bytes. -; CHECK: ArrayRef[{(1 + (3 * %m)),+,(2 * %m)}<%for.i>][{2,+,(3 * %o)}<%for.j>] - -; AddRec: {(8 + ((-8 + (24 * %m)) * %o) + %A),+,(8 * %m * %o)}<%for.i> -; CHECK: Base offset: %A -; CHECK: ArrayDecl[UnknownSize] with elements of 2 bytes. -; CHECK: ArrayRef[{((1 + ((-1 + (3 * %m)) * %o)) * sizeof(i32)),+,(%m * %o * sizeof(i32))}<%for.i>] - -; Function Attrs: nounwind uwtable define void @foo(i64 %n, i64 %m, i64 %o, i32* nocapture %A) #0 { entry: %cmp32 = icmp sgt i64 %n, 0 diff --git a/test/Analysis/Delinearization/gcd_multiply_expr.ll b/test/Analysis/Delinearization/gcd_multiply_expr.ll new file mode 100644 index 0000000..f962f6d --- /dev/null +++ b/test/Analysis/Delinearization/gcd_multiply_expr.ll @@ -0,0 +1,153 @@ +; RUN: opt < %s -basicaa -da -analyze -delinearize +; +; a, b, c, d, g, h; +; char *f; +; static fn1(p1) { +; char *e = p1; +; for (; d;) { +; a = 0; +; for (;; ++a) +; for (; b; ++b) +; c = e[b + a]; +; } +; } +; +; fn2() { +; for (;;) +; fn1(&f[g * h]); +; } + +@g = common global i32 0, align 4 +@h = common global i32 0, align 4 +@f = common global i8* null, align 4 +@a = common global i32 0, align 4 +@b = common global i32 0, align 4 +@c = common global i32 0, align 4 +@d = common global i32 0, align 4 + +define i32 @fn2() { +entry: + %.pr = load i32* @d, align 4 + %phitmp = icmp eq i32 %.pr, 0 + br label %for.cond + +for.cond: + %0 = phi i1 [ true, %for.cond ], [ %phitmp, %entry ] + br i1 %0, label %for.cond, label %for.cond2thread-pre-split.preheader.i + +for.cond2thread-pre-split.preheader.i: + %1 = load i32* @g, align 4 + %2 = load i32* @h, align 4 + %mul = mul nsw i32 %2, %1 + %3 = load i8** @f, align 4 + %.pr.pre.i = load i32* @b, align 4 + br label %for.cond2thread-pre-split.i + +for.cond2thread-pre-split.i: + %.pr.i = phi i32 [ 0, %for.inc5.i ], [ %.pr.pre.i, %for.cond2thread-pre-split.preheader.i ] + %storemerge.i = phi i32 [ %inc6.i, %for.inc5.i ], [ 0, %for.cond2thread-pre-split.preheader.i ] + store i32 %storemerge.i, i32* @a, align 4 + %tobool31.i = icmp eq i32 %.pr.i, 0 + br i1 %tobool31.i, label %for.inc5.i, label %for.body4.preheader.i + +for.body4.preheader.i: + %4 = icmp slt i32 %.pr.i, -7 + %add.i = add i32 %storemerge.i, %mul + br i1 %4, label %for.body4.i.preheader, label %for.body4.ur.i.preheader + +for.body4.i.preheader: + %5 = sub i32 -8, %.pr.i + %6 = lshr i32 %5, 3 + %7 = mul i32 %6, 8 + br label %for.body4.i + +for.body4.i: + %8 = phi i32 [ %inc.7.i, %for.body4.i ], [ %.pr.i, %for.body4.i.preheader ] + %arrayidx.sum1 = add i32 %add.i, %8 + %arrayidx.i = getelementptr inbounds i8* %3, i32 %arrayidx.sum1 + %9 = load i8* %arrayidx.i, align 1 + %conv.i = sext i8 %9 to i32 + store i32 %conv.i, i32* @c, align 4 + %inc.i = add nsw i32 %8, 1 + store i32 %inc.i, i32* @b, align 4 + %arrayidx.sum2 = add i32 %add.i, %inc.i + %arrayidx.1.i = getelementptr inbounds i8* %3, i32 %arrayidx.sum2 + %10 = load i8* %arrayidx.1.i, align 1 + %conv.1.i = sext i8 %10 to i32 + store i32 %conv.1.i, i32* @c, align 4 + %inc.1.i = add nsw i32 %8, 2 + store i32 %inc.1.i, i32* @b, align 4 + %arrayidx.sum3 = add i32 %add.i, %inc.1.i + %arrayidx.2.i = getelementptr inbounds i8* %3, i32 %arrayidx.sum3 + %11 = load i8* %arrayidx.2.i, align 1 + %conv.2.i = sext i8 %11 to i32 + store i32 %conv.2.i, i32* @c, align 4 + %inc.2.i 
= add nsw i32 %8, 3 + store i32 %inc.2.i, i32* @b, align 4 + %arrayidx.sum4 = add i32 %add.i, %inc.2.i + %arrayidx.3.i = getelementptr inbounds i8* %3, i32 %arrayidx.sum4 + %12 = load i8* %arrayidx.3.i, align 1 + %conv.3.i = sext i8 %12 to i32 + store i32 %conv.3.i, i32* @c, align 4 + %inc.3.i = add nsw i32 %8, 4 + store i32 %inc.3.i, i32* @b, align 4 + %arrayidx.sum5 = add i32 %add.i, %inc.3.i + %arrayidx.4.i = getelementptr inbounds i8* %3, i32 %arrayidx.sum5 + %13 = load i8* %arrayidx.4.i, align 1 + %conv.4.i = sext i8 %13 to i32 + store i32 %conv.4.i, i32* @c, align 4 + %inc.4.i = add nsw i32 %8, 5 + store i32 %inc.4.i, i32* @b, align 4 + %arrayidx.sum6 = add i32 %add.i, %inc.4.i + %arrayidx.5.i = getelementptr inbounds i8* %3, i32 %arrayidx.sum6 + %14 = load i8* %arrayidx.5.i, align 1 + %conv.5.i = sext i8 %14 to i32 + store i32 %conv.5.i, i32* @c, align 4 + %inc.5.i = add nsw i32 %8, 6 + store i32 %inc.5.i, i32* @b, align 4 + %arrayidx.sum7 = add i32 %add.i, %inc.5.i + %arrayidx.6.i = getelementptr inbounds i8* %3, i32 %arrayidx.sum7 + %15 = load i8* %arrayidx.6.i, align 1 + %conv.6.i = sext i8 %15 to i32 + store i32 %conv.6.i, i32* @c, align 4 + %inc.6.i = add nsw i32 %8, 7 + store i32 %inc.6.i, i32* @b, align 4 + %arrayidx.sum8 = add i32 %add.i, %inc.6.i + %arrayidx.7.i = getelementptr inbounds i8* %3, i32 %arrayidx.sum8 + %16 = load i8* %arrayidx.7.i, align 1 + %conv.7.i = sext i8 %16 to i32 + store i32 %conv.7.i, i32* @c, align 4 + %inc.7.i = add nsw i32 %8, 8 + store i32 %inc.7.i, i32* @b, align 4 + %tobool3.7.i = icmp sgt i32 %inc.7.i, -8 + br i1 %tobool3.7.i, label %for.inc5.loopexit.ur-lcssa.i, label %for.body4.i + +for.inc5.loopexit.ur-lcssa.i: + %17 = add i32 %.pr.i, 8 + %18 = add i32 %17, %7 + %19 = icmp eq i32 %18, 0 + br i1 %19, label %for.inc5.i, label %for.body4.ur.i.preheader + +for.body4.ur.i.preheader: + %.ph = phi i32 [ %18, %for.inc5.loopexit.ur-lcssa.i ], [ %.pr.i, %for.body4.preheader.i ] + br label %for.body4.ur.i + +for.body4.ur.i: + %20 = phi i32 [ %inc.ur.i, %for.body4.ur.i ], [ %.ph, %for.body4.ur.i.preheader ] + %arrayidx.sum = add i32 %add.i, %20 + %arrayidx.ur.i = getelementptr inbounds i8* %3, i32 %arrayidx.sum + %21 = load i8* %arrayidx.ur.i, align 1 + %conv.ur.i = sext i8 %21 to i32 + store i32 %conv.ur.i, i32* @c, align 4 + %inc.ur.i = add nsw i32 %20, 1 + store i32 %inc.ur.i, i32* @b, align 4 + %tobool3.ur.i = icmp eq i32 %inc.ur.i, 0 + br i1 %tobool3.ur.i, label %for.inc5.i.loopexit, label %for.body4.ur.i + +for.inc5.i.loopexit: + br label %for.inc5.i + +for.inc5.i: + %inc6.i = add nsw i32 %storemerge.i, 1 + br label %for.cond2thread-pre-split.i +} diff --git a/test/Analysis/Delinearization/himeno_1.ll b/test/Analysis/Delinearization/himeno_1.ll index 9458bd2..c94ca7a 100644 --- a/test/Analysis/Delinearization/himeno_1.ll +++ b/test/Analysis/Delinearization/himeno_1.ll @@ -31,16 +31,6 @@ ; CHECK: ArrayDecl[UnknownSize][(sext i32 %a.cols to i64)][(sext i32 %a.deps to i64)] with elements of sizeof(float) bytes. ; CHECK: ArrayRef[{1,+,1}<%for.i>][{1,+,1}<%for.j>][{1,+,1}<%for.k>] -; AddRec: {{(-4 + (4 * (sext i32 (-1 + %p.deps) to i64)) + (4 * (sext i32 %a.deps to i64) * (1 + (sext i32 %a.cols to i64))) + %a.base),+,(4 * (sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i>,+,(4 * (sext i32 %a.deps to i64))}<%for.j> -; CHECK: Base offset: %a.base -; CHECK: ArrayDecl[UnknownSize][(sext i32 %a.deps to i64)] with elements of sizeof(float) bytes. 
-; CHECK: ArrayRef[{(1 + (sext i32 %a.cols to i64)),+,(sext i32 %a.cols to i64)}<%for.i>][{(-1 + (sext i32 (-1 + %p.deps) to i64)),+,(sext i32 %a.deps to i64)}<%for.j>] - -; AddRec: {(-4 + (4 * (sext i32 (-1 + %p.deps) to i64)) + ((sext i32 %a.deps to i64) * (-4 + (4 * (sext i32 (-1 + %p.cols) to i64)) + (4 * (sext i32 %a.cols to i64)))) + %a.base),+,(4 * (sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i> -; CHECK: Base offset: %a.base -; CHECK: ArrayDecl[UnknownSize] with elements of sizeof(float) bytes. -; CHECK: ArrayRef[{(-1 + (sext i32 (-1 + %p.deps) to i64) + ((sext i32 %a.deps to i64) * (-1 + (sext i32 (-1 + %p.cols) to i64) + (sext i32 %a.cols to i64)))),+,((sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i>] - %struct.Mat = type { float*, i32, i32, i32, i32 } define void @jacobi(i32 %nn, %struct.Mat* nocapture %a, %struct.Mat* nocapture %p) nounwind uwtable { diff --git a/test/Analysis/Delinearization/himeno_2.ll b/test/Analysis/Delinearization/himeno_2.ll index a290066..c256384 100644 --- a/test/Analysis/Delinearization/himeno_2.ll +++ b/test/Analysis/Delinearization/himeno_2.ll @@ -31,16 +31,6 @@ ; CHECK: ArrayDecl[UnknownSize][(sext i32 %a.cols to i64)][(sext i32 %a.deps to i64)] with elements of sizeof(float) bytes. ; CHECK: ArrayRef[{1,+,1}<%for.i>][{1,+,1}<%for.j>][{1,+,1}<%for.k>] -; AddRec: {{(-4 + (4 * (sext i32 (-1 + %p.deps) to i64)) + (4 * (sext i32 %a.deps to i64) * (1 + (sext i32 %a.cols to i64))) + %a.base),+,(4 * (sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i>,+,(4 * (sext i32 %a.deps to i64))}<%for.j> -; CHECK: Base offset: %a.base -; CHECK: ArrayDecl[UnknownSize][(sext i32 %a.deps to i64)] with elements of sizeof(float) bytes. -; CHECK: ArrayRef[{(1 + (sext i32 %a.cols to i64)),+,(sext i32 %a.cols to i64)}<%for.i>][{(-1 + (sext i32 (-1 + %p.deps) to i64)),+,(sext i32 %a.deps to i64)}<%for.j>] - -; AddRec: {(-4 + (4 * (sext i32 (-1 + %p.deps) to i64)) + ((sext i32 %a.deps to i64) * (-4 + (4 * (sext i32 (-1 + %p.cols) to i64)) + (4 * (sext i32 %a.cols to i64)))) + %a.base),+,(4 * (sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i> -; CHECK: Base offset: %a.base -; CHECK: ArrayDecl[UnknownSize] with elements of sizeof(float) bytes. -; CHECK: ArrayRef[{(-1 + (sext i32 (-1 + %p.deps) to i64) + ((sext i32 %a.deps to i64) * (-1 + (sext i32 (-1 + %p.cols) to i64) + (sext i32 %a.cols to i64)))),+,((sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i>] - %struct.Mat = type { float*, i32, i32, i32, i32 } define void @jacobi(i32 %nn, %struct.Mat* nocapture %a, %struct.Mat* nocapture %p) nounwind uwtable { diff --git a/test/Analysis/Delinearization/iv_times_constant_in_subscript.ll b/test/Analysis/Delinearization/iv_times_constant_in_subscript.ll new file mode 100644 index 0000000..01a4b96 --- /dev/null +++ b/test/Analysis/Delinearization/iv_times_constant_in_subscript.ll @@ -0,0 +1,45 @@ +; RUN: opt < %s -analyze -delinearize | FileCheck %s + +; Derived from the following code: +; +; void foo(long n, long m, long b, double A[n][m]) { +; for (long i = 0; i < n; i++) +; for (long j = 0; j < m; j++) +; A[2*i+b][2*j] = 1.0; +; } + +; AddRec: {{((%m * %b * sizeof(double)) + %A),+,(2 * %m * sizeof(double))}<%for.i>,+,(2 * sizeof(double))}<%for.j> +; CHECK: Base offset: %A +; CHECK: ArrayDecl[UnknownSize][%m] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{%b,+,2}<%for.i>][{0,+,2}<%for.j>] + + +define void @foo(i64 %n, i64 %m, i64 %b, double* %A) { +entry: + br label %for.i + +for.i: + %i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ] + %outerdim = mul nsw i64 %i, 2 + %outerdim2 = add nsw i64 %outerdim, %b + %tmp = mul nsw i64 %outerdim2, %m + br label %for.j + +for.j: + %j = phi i64 [ 0, %for.i ], [ %j.inc, %for.j ] + %prodj = mul i64 %j, 2 + %vlaarrayidx.sum = add i64 %prodj, %tmp + %arrayidx = getelementptr inbounds double* %A, i64 %vlaarrayidx.sum + store double 1.0, double* %arrayidx + %j.inc = add nsw i64 %j, 1 + %j.exitcond = icmp eq i64 %j.inc, %m + br i1 %j.exitcond, label %for.i.inc, label %for.j + +for.i.inc: + %i.inc = add nsw i64 %i, 1 + %i.exitcond = icmp eq i64 %i.inc, %n + br i1 %i.exitcond, label %end, label %for.i + +end: + ret void +} diff --git a/test/Analysis/Delinearization/lit.local.cfg b/test/Analysis/Delinearization/lit.local.cfg index 19eebc0..c6106e4 100644 --- a/test/Analysis/Delinearization/lit.local.cfg +++ b/test/Analysis/Delinearization/lit.local.cfg @@ -1 +1 @@ -config.suffixes = ['.ll', '.c', '.cpp'] +config.suffixes = ['.ll'] diff --git a/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll b/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll index 82cab16..ae80ebc 100644 --- a/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll +++ b/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll @@ -13,16 +13,6 @@ ; CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(double) bytes. ; CHECK: ArrayRef[{3,+,1}<%for.i>][{-4,+,1}<%for.j>][{7,+,1}<%for.k>] -; AddRec: {{(48 + ((-24 + (24 * %m)) * %o) + %A),+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j> -; CHECK: Base offset: %A -; CHECK: ArrayDecl[UnknownSize][%o] with elements of sizeof(double) bytes. -; CHECK: ArrayRef[{(-3 + (3 * %m)),+,%m}<%for.i>][{6,+,%o}<%for.j>] - -; AddRec: {(48 + ((-32 + (32 * %m)) * %o) + %A),+,(8 * %m * %o)}<%for.i> -; CHECK: Base offset: %A -; CHECK: ArrayDecl[UnknownSize] with elements of sizeof(double) bytes. -; CHECK: ArrayRef[{(6 + ((-4 + (4 * %m)) * %o)),+,(%m * %o)}<%for.i>] - define void @foo(i64 %n, i64 %m, i64 %o, double* %A) { entry: br label %for.i diff --git a/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_nts_3d.ll b/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_nts_3d.ll index a1e779f..75080da 100644 --- a/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_nts_3d.ll +++ b/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_nts_3d.ll @@ -13,16 +13,6 @@ ; CHECK: ArrayDecl[UnknownSize][%m][(%o + %p)] with elements of sizeof(double) bytes. ; CHECK: ArrayRef[{3,+,1}<%for.cond4.preheader.lr.ph.us>][{-4,+,1}<%for.body6.lr.ph.us.us>][{7,+,1}<%for.body6.us.us>] -; AddRec: {{(48 + (8 * %o) + (8 * (-4 + (3 * %m)) * (%o + %p)) + %A),+,(8 * (%o + %p) * %m)}<%for.cond4.preheader.lr.ph.us>,+,(8 * (%o + %p))}<%for.body6.lr.ph.us.us> -; CHECK: Base offset: %A -; CHECK: ArrayDecl[UnknownSize][(%o + %p)] with elements of sizeof(double) bytes. -; CHECK: ArrayRef[{(-4 + (3 * %m)),+,%m}<%for.cond4.preheader.lr.ph.us>][{(6 + %o),+,(%o + %p)}<%for.body6.lr.ph.us.us>] - -; AddRec: {(48 + (8 * %o) + ((-40 + (32 * %m)) * (%o + %p)) + %A),+,(8 * (%o + %p) * %m)}<%for.cond4.preheader.lr.ph.us> -; CHECK: Base offset: %A -; CHECK: ArrayDecl[UnknownSize] with elements of sizeof(double) bytes. 
-; CHECK: ArrayRef[{(6 + ((-5 + (4 * %m)) * (%o + %p)) + %o),+,((%o + %p) * %m)}<%for.cond4.preheader.lr.ph.us>] - define void @foo(i64 %n, i64 %m, i64 %o, i64 %p, double* nocapture %A) nounwind uwtable { entry: %add = add nsw i64 %p, %o diff --git a/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll b/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll index a52a4c9..e921444 100644 --- a/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll +++ b/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll @@ -13,16 +13,6 @@ ; CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(double) bytes. ; CHECK: ArrayRef[{%p,+,1}<%for.i>][{%q,+,1}<%for.j>][{%r,+,1}<%for.k>] -; AddRec: {{(-8 + (8 * ((((%m * %p) + %q) * %o) + %r)) + (8 * %o) + %A),+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j> -; CHECK: Base offset: %A -; CHECK: ArrayDecl[UnknownSize][%o] with elements of sizeof(double) bytes. -; CHECK: ArrayRef[{(1 + (%m * %p) + %q),+,%m}<%for.i>][{(-1 + %r),+,%o}<%for.j>] - -; AddRec: {(-8 + (8 * ((((%m * %p) + %q) * %o) + %r)) + (8 * %m * %o) + %A),+,(8 * %m * %o)}<%for.i> -; CHECK: Base offset: %A -; CHECK: ArrayDecl[UnknownSize] with elements of sizeof(double) bytes. -; CHECK: ArrayRef[{(-1 + ((((1 + %p) * %m) + %q) * %o) + %r),+,(%m * %o)}<%for.i>] - define void @foo(i64 %n, i64 %m, i64 %o, double* %A, i64 %p, i64 %q, i64 %r) { entry: br label %for.i diff --git a/test/Analysis/Delinearization/multidim_only_ivs_2d.ll b/test/Analysis/Delinearization/multidim_only_ivs_2d.ll index d68a158..48bec08 100644 --- a/test/Analysis/Delinearization/multidim_only_ivs_2d.ll +++ b/test/Analysis/Delinearization/multidim_only_ivs_2d.ll @@ -13,11 +13,6 @@ ; CHECK: ArrayDecl[UnknownSize][%m] with elements of sizeof(double) bytes. ; CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>] -; AddRec: {(-8 + (8 * %m) + %A),+,(8 * %m)}<%for.i> -; CHECK: Base offset: %A -; CHECK: ArrayDecl[UnknownSize] with elements of sizeof(double) bytes. -; CHECK: ArrayRef[{(-1 + %m),+,%m}<%for.i>] - define void @foo(i64 %n, i64 %m, double* %A) { entry: br label %for.i diff --git a/test/Analysis/Delinearization/multidim_only_ivs_2d_nested.ll b/test/Analysis/Delinearization/multidim_only_ivs_2d_nested.ll index 7207420..810188f 100644 --- a/test/Analysis/Delinearization/multidim_only_ivs_2d_nested.ll +++ b/test/Analysis/Delinearization/multidim_only_ivs_2d_nested.ll @@ -1,4 +1,6 @@ ; RUN: opt < %s -analyze -delinearize | FileCheck %s +; XFAIL: * +; We do not recognize variable size arrays anymore. ; extern void bar(long n, long m, double A[n][m]); ; diff --git a/test/Analysis/Delinearization/multidim_only_ivs_3d.ll b/test/Analysis/Delinearization/multidim_only_ivs_3d.ll index 24f9583..aad0f09 100644 --- a/test/Analysis/Delinearization/multidim_only_ivs_3d.ll +++ b/test/Analysis/Delinearization/multidim_only_ivs_3d.ll @@ -13,16 +13,6 @@ ; CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(double) bytes.
-; CHECK: ArrayRef[{(-1 + (%m * %o)),+,(%m * %o)}<%for.i>] - define void @foo(i64 %n, i64 %m, i64 %o, double* %A) { entry: br label %for.i diff --git a/test/Analysis/Delinearization/multidim_only_ivs_3d_cast.ll b/test/Analysis/Delinearization/multidim_only_ivs_3d_cast.ll index e151610..9e406d1 100644 --- a/test/Analysis/Delinearization/multidim_only_ivs_3d_cast.ll +++ b/test/Analysis/Delinearization/multidim_only_ivs_3d_cast.ll @@ -12,16 +12,6 @@ ; CHECK: ArrayDecl[UnknownSize][(zext i32 %m to i64)][(zext i32 %o to i64)] with elements of 8 bytes. ; CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>] -; AddRec: {{((8 * (zext i32 (-1 + %o) to i64)) + %A),+,(8 * (zext i32 %m to i64) * (zext i32 %o to i64))}<%for.i>,+,(8 * (zext i32 %o to i64))}<%for.j> -; CHECK: Base offset: %A -; CHECK: ArrayDecl[UnknownSize][((zext i32 %m to i64) * (zext i32 %o to i64))] with elements of 8 bytes. -; CHECK: ArrayRef[{0,+,1}<%for.i>][{(zext i32 (-1 + %o) to i64),+,(zext i32 %o to i64)}<%for.j>] - -; AddRec: {((8 * (zext i32 (-1 + %o) to i64)) + (8 * (zext i32 (-1 + %m) to i64) * (zext i32 %o to i64)) + %A),+,(8 * (zext i32 %m to i64) * (zext i32 %o to i64))}<%for.i> -; CHECK: Base offset: %A -; CHECK: ArrayDecl[UnknownSize] with elements of 8 bytes. -; CHECK: ArrayRef[{((zext i32 (-1 + %o) to i64) + ((zext i32 (-1 + %m) to i64) * (zext i32 %o to i64))),+,((zext i32 %m to i64) * (zext i32 %o to i64))}<%for.i>] - target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/Analysis/Delinearization/multidim_two_accesses_different_delinearization.ll b/test/Analysis/Delinearization/multidim_two_accesses_different_delinearization.ll new file mode 100644 index 0000000..6a98507 --- /dev/null +++ b/test/Analysis/Delinearization/multidim_two_accesses_different_delinearization.ll @@ -0,0 +1,43 @@ +; RUN: opt -basicaa -da -analyze -da-delinearize < %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Derived from the following code: +; +; void foo(long n, long m, double *A) { +; for (long i = 0; i < n; i++) +; for (long j = 0; j < m; j++) { +; *(A + i * n + j) = 1.0; +; *(A + j * m + i) = 2.0; +; } +; } + +define void @foo(i64 %n, i64 %m, double* %A) { +entry: + br label %for.i + +for.i: + %i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ] + br label %for.j + +for.j: + %j = phi i64 [ 0, %for.i ], [ %j.inc, %for.j ] + %tmp = mul nsw i64 %i, %m + %vlaarrayidx.sum = add i64 %j, %tmp + %arrayidx = getelementptr inbounds double* %A, i64 %vlaarrayidx.sum + store double 1.0, double* %arrayidx + %tmp1 = mul nsw i64 %j, %n + %vlaarrayidx.sum1 = add i64 %i, %tmp1 + %arrayidx1 = getelementptr inbounds double* %A, i64 %vlaarrayidx.sum1 + store double 1.0, double* %arrayidx1 + %j.inc = add nsw i64 %j, 1 + %j.exitcond = icmp eq i64 %j.inc, %m + br i1 %j.exitcond, label %for.i.inc, label %for.j + +for.i.inc: + %i.inc = add nsw i64 %i, 1 + %i.exitcond = icmp eq i64 %i.inc, %n + br i1 %i.exitcond, label %end, label %for.i + +end: + ret void +} diff --git a/test/Analysis/Delinearization/undef.ll b/test/Analysis/Delinearization/undef.ll new file mode 100644 index 0000000..8ee64e3 --- /dev/null +++ b/test/Analysis/Delinearization/undef.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -analyze -delinearize +target
datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @foo(double* %Ey) { +entry: + br i1 undef, label %for.cond55.preheader, label %for.end324 + +for.cond55.preheader: + %iz.069 = phi i64 [ %inc323, %for.inc322 ], [ 0, %entry ] + br i1 undef, label %for.cond58.preheader, label %for.inc322 + +for.cond58.preheader: + %iy.067 = phi i64 [ %inc320, %for.end ], [ 0, %for.cond55.preheader ] + br i1 undef, label %for.body60, label %for.end + +for.body60: + %ix.062 = phi i64 [ %inc, %for.body60 ], [ 0, %for.cond58.preheader ] + %0 = mul i64 %iz.069, undef + %tmp5 = add i64 %iy.067, %0 + %tmp6 = mul i64 %tmp5, undef + %arrayidx69.sum = add i64 undef, %tmp6 + %arrayidx70 = getelementptr inbounds double* %Ey, i64 %arrayidx69.sum + %1 = load double* %arrayidx70, align 8 + %inc = add nsw i64 %ix.062, 1 + br i1 false, label %for.body60, label %for.end + +for.end: + %inc320 = add nsw i64 %iy.067, 1 + br i1 undef, label %for.cond58.preheader, label %for.inc322 + +for.inc322: + %inc323 = add nsw i64 %iz.069, 1 + br i1 undef, label %for.cond55.preheader, label %for.end324 + +for.end324: + ret void +} diff --git a/test/Analysis/DependenceAnalysis/Banerjee.ll b/test/Analysis/DependenceAnalysis/Banerjee.ll index 5c17064..883a06d 100644 --- a/test/Analysis/DependenceAnalysis/Banerjee.ll +++ b/test/Analysis/DependenceAnalysis/Banerjee.ll @@ -24,7 +24,7 @@ entry: ; DELIN: 'Dependence Analysis' for function 'banerjee0': ; DELIN: da analyze - none! -; DELIN: da analyze - consistent flow [0 1]! +; DELIN: da analyze - flow [<= <>]! ; DELIN: da analyze - confused! ; DELIN: da analyze - none! ; DELIN: da analyze - confused! @@ -83,10 +83,10 @@ entry: ; CHECK: da analyze - output [* *]! ; DELIN: 'Dependence Analysis' for function 'banerjee1': -; DELIN: da analyze - none -; DELIN: da analyze - consistent flow [0 1]! +; DELIN: da analyze - output [* *]! +; DELIN: da analyze - flow [* <>]! ; DELIN: da analyze - confused! -; DELIN: da analyze - none +; DELIN: da analyze - input [* *]! ; DELIN: da analyze - confused! ; DELIN: da analyze - output [* *]! @@ -218,7 +218,7 @@ entry: ; DELIN: 'Dependence Analysis' for function 'banerjee3': ; DELIN: da analyze - none! -; DELIN: da analyze - consistent flow [-9 -9]! +; DELIN: da analyze - flow [> >]! ; DELIN: da analyze - confused! ; DELIN: da analyze - none! ; DELIN: da analyze - confused! @@ -336,7 +336,7 @@ entry: ; DELIN: 'Dependence Analysis' for function 'banerjee5': ; DELIN: da analyze - none! -; DELIN: da analyze - consistent flow [9 9]! +; DELIN: da analyze - flow [< <]! ; DELIN: da analyze - confused! ; DELIN: da analyze - none! ; DELIN: da analyze - confused! @@ -395,7 +395,7 @@ entry: ; DELIN: 'Dependence Analysis' for function 'banerjee6': ; DELIN: da analyze - none! -; DELIN: da analyze - consistent flow [0 -9]! +; DELIN: da analyze - flow [=> <>]! ; DELIN: da analyze - confused! ; DELIN: da analyze - none! ; DELIN: da analyze - confused! @@ -454,7 +454,7 @@ entry: ; DELIN: 'Dependence Analysis' for function 'banerjee7': ; DELIN: da analyze - none! -; DELIN: da analyze - consistent flow [-1 0]! +; DELIN: da analyze - flow [> <=]! ; DELIN: da analyze - confused! ; DELIN: da analyze - none! ; DELIN: da analyze - confused! @@ -513,7 +513,7 @@ entry: ; DELIN: 'Dependence Analysis' for function 'banerjee8': ; DELIN: da analyze - none! -; DELIN: da analyze - consistent flow [-1 -1]! +; DELIN: da analyze - flow [> <>]! ; DELIN: da analyze - confused! ; DELIN: da analyze - none! ; DELIN: da analyze - confused! 
@@ -571,7 +571,7 @@ entry: ; CHECK: da analyze - none! ; DELIN: 'Dependence Analysis' for function 'banerjee9': -; DELIN: da analyze - none! +; DELIN: da analyze - output [* *]! ; DELIN: da analyze - flow [<= =|<]! ; DELIN: da analyze - confused! ; DELIN: da analyze - none! @@ -750,7 +750,7 @@ entry: ; DELIN: 'Dependence Analysis' for function 'banerjee12': ; DELIN: da analyze - none! -; DELIN: da analyze - consistent flow [0 -11]! +; DELIN: da analyze - flow [= <>]! ; DELIN: da analyze - confused! ; DELIN: da analyze - none! ; DELIN: da analyze - confused! diff --git a/test/Analysis/DependenceAnalysis/GCD.ll b/test/Analysis/DependenceAnalysis/GCD.ll index 7efa8b5..7eca18e 100644 --- a/test/Analysis/DependenceAnalysis/GCD.ll +++ b/test/Analysis/DependenceAnalysis/GCD.ll @@ -24,10 +24,10 @@ entry: ; CHECK: da analyze - none! ; DELIN: 'Dependence Analysis' for function 'gcd0' -; DELIN: da analyze - none! +; DELIN: da analyze - output [* *]! ; DELIN: da analyze - flow [=> *|<]! ; DELIN: da analyze - confused! -; DELIN: da analyze - none! +; DELIN: da analyze - input [* *]! ; DELIN: da analyze - confused! ; DELIN: da analyze - none! @@ -85,10 +85,10 @@ entry: ; CHECK: da analyze - none! ; DELIN: 'Dependence Analysis' for function 'gcd1' -; DELIN: da analyze - none! +; DELIN: da analyze - output [* *]! ; DELIN: da analyze - none! ; DELIN: da analyze - confused! -; DELIN: da analyze - none! +; DELIN: da analyze - input [* *]! ; DELIN: da analyze - confused! ; DELIN: da analyze - none! @@ -147,10 +147,10 @@ entry: ; CHECK: da analyze - none! ; DELIN: 'Dependence Analysis' for function 'gcd2' -; DELIN: da analyze - none! +; DELIN: da analyze - output [* *]! ; DELIN: da analyze - none! ; DELIN: da analyze - confused! -; DELIN: da analyze - none! +; DELIN: da analyze - input [* *]! ; DELIN: da analyze - confused! ; DELIN: da analyze - none! @@ -410,10 +410,10 @@ entry: ; CHECK: da analyze - output [* *]! ; DELIN: 'Dependence Analysis' for function 'gcd6' -; DELIN: da analyze - none! +; DELIN: da analyze - output [* *]! ; DELIN: da analyze - none! ; DELIN: da analyze - confused! -; DELIN: da analyze - none! +; DELIN: da analyze - input [* *]! ; DELIN: da analyze - confused! ; DELIN: da analyze - output [* *]! diff --git a/test/Analysis/LazyCallGraph/basic.ll b/test/Analysis/LazyCallGraph/basic.ll index ebadb75..b8108d9 100644 --- a/test/Analysis/LazyCallGraph/basic.ll +++ b/test/Analysis/LazyCallGraph/basic.ll @@ -124,3 +124,53 @@ define void @test2() { load i8** bitcast (void ()** @h to i8**) ret void } + +; Verify the SCCs formed. 
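+; Editorial aside (illustrative, not part of the test): each block below is
+; one strongly connected component of the call graph, and every SCC here is a
+; single function. For contrast, a hypothetical mutually-recursive pair:
+;   define void @even() {
+;     call void @odd()
+;     ret void
+;   }
+;   define void @odd() {
+;     call void @even()
+;     ret void
+;   }
+; would be expected to print as one "SCC with 2 functions:" block (format
+; assumed by analogy with the single-function lines).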
+; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: f7 +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: f6 +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: f5 +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: f4 +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: f3 +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: f2 +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: f1 +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: test2 +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: f12 +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: f11 +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: f10 +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: f9 +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: f8 +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: test1 +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: f +; +; CHECK-LABEL: SCC with 1 functions: +; CHECK-NEXT: test0 diff --git a/test/Analysis/ScalarEvolution/max-trip-count.ll b/test/Analysis/ScalarEvolution/max-trip-count.ll index 0cdbdf5..31f06a4 100644 --- a/test/Analysis/ScalarEvolution/max-trip-count.ll +++ b/test/Analysis/ScalarEvolution/max-trip-count.ll @@ -98,3 +98,112 @@ for.end: ; preds = %for.cond.for.end_cr ; CHECK: Determining loop execution counts for: @test ; CHECK-NEXT: backedge-taken count is ; CHECK-NEXT: max backedge-taken count is -1 + +; PR19799: Indvars miscompile due to an incorrect max backedge taken count from SCEV. +; CHECK-LABEL: @pr19799 +; CHECK: Loop %for.body.i: Unpredictable backedge-taken count. +; CHECK: Loop %for.body.i: max backedge-taken count is 1 +@a = common global i32 0, align 4 + +define i32 @pr19799() { +entry: + store i32 -1, i32* @a, align 4 + br label %for.body.i + +for.body.i: ; preds = %for.cond.i, %entry + %storemerge1.i = phi i32 [ -1, %entry ], [ %add.i.i, %for.cond.i ] + %tobool.i = icmp eq i32 %storemerge1.i, 0 + %add.i.i = add nsw i32 %storemerge1.i, 2 + br i1 %tobool.i, label %bar.exit, label %for.cond.i + +for.cond.i: ; preds = %for.body.i + store i32 %add.i.i, i32* @a, align 4 + %cmp.i = icmp slt i32 %storemerge1.i, 0 + br i1 %cmp.i, label %for.body.i, label %bar.exit + +bar.exit: ; preds = %for.cond.i, %for.body.i + ret i32 0 +} + +; PR18886: Indvars miscompile due to an incorrect max backedge taken count from SCEV. +; CHECK-LABEL: @pr18886 +; CHECK: Loop %for.body: Unpredictable backedge-taken count. +; CHECK: Loop %for.body: max backedge-taken count is 3 +@aa = global i64 0, align 8 + +define i32 @pr18886() { +entry: + store i64 -21, i64* @aa, align 8 + br label %for.body + +for.body: + %storemerge1 = phi i64 [ -21, %entry ], [ %add, %for.cond ] + %tobool = icmp eq i64 %storemerge1, 0 + %add = add nsw i64 %storemerge1, 8 + br i1 %tobool, label %return, label %for.cond + +for.cond: + store i64 %add, i64* @aa, align 8 + %cmp = icmp slt i64 %add, 9 + br i1 %cmp, label %for.body, label %return + +return: + %retval.0 = phi i32 [ 1, %for.body ], [ 0, %for.cond ] + ret i32 %retval.0 +} + +; Here we have a must-exit loop latch that is not computable and a +; may-exit early exit that can only have one non-exiting iteration +; before the check is forever skipped. +; +; CHECK-LABEL: @cannot_compute_mustexit +; CHECK: Loop %for.body.i: Unpredictable backedge-taken count. +; CHECK: Loop %for.body.i: Unpredictable max backedge-taken count. 
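+; Editorial reading of the IR below (hedged): %storemerge1.i starts at -1 and
+; steps by +2 with nsw, so it only ever holds odd values and the
+; "icmp eq ..., 0" early exit can never fire; the one exit that can actually
+; trigger is the latch, whose condition depends on a volatile load of @b that
+; SCEV cannot predict. Hence even the max backedge-taken count stays
+; unpredictable.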
+@b = common global i32 0, align 4 + +define i32 @cannot_compute_mustexit() { +entry: + store i32 -1, i32* @a, align 4 + br label %for.body.i + +for.body.i: ; preds = %for.cond.i, %entry + %storemerge1.i = phi i32 [ -1, %entry ], [ %add.i.i, %for.cond.i ] + %tobool.i = icmp eq i32 %storemerge1.i, 0 + %add.i.i = add nsw i32 %storemerge1.i, 2 + br i1 %tobool.i, label %bar.exit, label %for.cond.i + +for.cond.i: ; preds = %for.body.i + store i32 %add.i.i, i32* @a, align 4 + %ld = load volatile i32* @b + %cmp.i = icmp ne i32 %ld, 0 + br i1 %cmp.i, label %for.body.i, label %bar.exit + +bar.exit: ; preds = %for.cond.i, %for.body.i + ret i32 0 +} + +; This loop has two must-exits, both of which dominate the latch. The +; MaxBECount should be the minimum of them. +; +; CHECK-LABEL: @two_mustexit +; CHECK: Loop %for.body.i: Unpredictable backedge-taken count. +; CHECK: Loop %for.body.i: max backedge-taken count is 1 +define i32 @two_mustexit() { +entry: + store i32 -1, i32* @a, align 4 + br label %for.body.i + +for.body.i: ; preds = %for.cond.i, %entry + %storemerge1.i = phi i32 [ -1, %entry ], [ %add.i.i, %for.cond.i ] + %tobool.i = icmp sgt i32 %storemerge1.i, 0 + %add.i.i = add nsw i32 %storemerge1.i, 2 + br i1 %tobool.i, label %bar.exit, label %for.cond.i + +for.cond.i: ; preds = %for.body.i + store i32 %add.i.i, i32* @a, align 4 + %cmp.i = icmp slt i32 %storemerge1.i, 3 + br i1 %cmp.i, label %for.body.i, label %bar.exit + +bar.exit: ; preds = %for.cond.i, %for.body.i + ret i32 0 +} diff --git a/test/Assembler/2009-04-25-AliasGEP.ll b/test/Assembler/2009-04-25-AliasGEP.ll deleted file mode 100644 index 6d07208..0000000 --- a/test/Assembler/2009-04-25-AliasGEP.ll +++ /dev/null @@ -1,8 +0,0 @@ -; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis -; PR4066 -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" -target triple = "i386-apple-darwin9" - %struct.i2c_device_id = type { } -@w83l785ts_id = internal constant [0 x %struct.i2c_device_id] zeroinitializer, align 1 ; <[0 x %struct.i2c_device_id]*> [#uses=1] - -@__mod_i2c_device_table = alias getelementptr ([0 x %struct.i2c_device_id]* @w83l785ts_id, i32 0, i32 0) ; <%struct.i2c_device_id*> [#uses=0] diff --git a/test/Assembler/addrspacecast-alias.ll b/test/Assembler/addrspacecast-alias.ll index 6623a25..052a141 100644 --- a/test/Assembler/addrspacecast-alias.ll +++ b/test/Assembler/addrspacecast-alias.ll @@ -1,6 +1,7 @@ -; RUN: llvm-as -disable-output %s +; RUN: llvm-as < %s | llvm-dis | FileCheck %s ; Test that global aliases are allowed to be constant addrspacecast @i = internal addrspace(1) global i8 42 -@ia = alias internal i8 addrspace(2)* addrspacecast (i8 addrspace(1)* @i to i8 addrspace(2)*) +@ia = alias internal addrspace(2) i8 addrspace(3)*, i8 addrspace(1)* @i +; CHECK: @ia = alias internal addrspace(2) i8 addrspace(3)*, i8 addrspace(1)* @i diff --git a/test/Assembler/alias-addrspace.ll b/test/Assembler/alias-addrspace.ll new file mode 100644 index 0000000..6d378e4 --- /dev/null +++ b/test/Assembler/alias-addrspace.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as %s 2>&1 | FileCheck %s + +@foo = global i32 42 +@bar = alias internal addrspace(1) i32* @foo + +CHECK: error: A type is required if addrspace is given diff --git a/test/Assembler/alias-redefinition.ll b/test/Assembler/alias-redefinition.ll new file mode 100644 index 0000000..19ad85b --- /dev/null +++ b/test/Assembler/alias-redefinition.ll @@ -0,0 +1,7 @@ +; RUN: not llvm-as %s 2>&1 | FileCheck %s + 
+; CHECK: error: redefinition of global named '@bar' + +@foo = global i32 0 +@bar = alias i32* @foo +@bar = alias i32* @foo diff --git a/test/Assembler/alias-to-alias.ll b/test/Assembler/alias-to-alias.ll new file mode 100644 index 0000000..1ea99bb --- /dev/null +++ b/test/Assembler/alias-to-alias.ll @@ -0,0 +1,5 @@ +; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s +; CHECK: Alias must point to function or variable + +@b1 = alias i32* @c1 +@c1 = alias i32* @b1 diff --git a/test/Assembler/alias-to-alias2.ll b/test/Assembler/alias-to-alias2.ll new file mode 100644 index 0000000..a8a0196 --- /dev/null +++ b/test/Assembler/alias-to-alias2.ll @@ -0,0 +1,7 @@ +; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s +; CHECK: error: Alias is pointed by alias b1 + +@g = global i32 42 + +@b1 = alias i32* @c1 +@c1 = alias i32* @g diff --git a/test/Assembler/alias-type.ll b/test/Assembler/alias-type.ll new file mode 100644 index 0000000..ead3e95 --- /dev/null +++ b/test/Assembler/alias-type.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as %s 2>&1 | FileCheck %s + +@foo = global i32 42 +@bar = alias i32 @foo + +CHECK: error: An alias must have pointer type diff --git a/test/Assembler/half-constprop.ll b/test/Assembler/half-constprop.ll index 03ccdda..9e24f72 100644 --- a/test/Assembler/half-constprop.ll +++ b/test/Assembler/half-constprop.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | opt -O3 | llvm-dis | FileCheck %s +; RUN: opt < %s -O3 -S | FileCheck %s ; Testing half constant propagation. define half @abc() nounwind { diff --git a/test/Assembler/half-conv.ll b/test/Assembler/half-conv.ll index bf9ae57..70a6b86 100644 --- a/test/Assembler/half-conv.ll +++ b/test/Assembler/half-conv.ll @@ -1,4 +1,4 @@ -; RUN: llvm-as < %s | opt -O3 | llvm-dis | FileCheck %s +; RUN: opt < %s -O3 -S | FileCheck %s ; Testing half to float conversion. 
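; Editorial note (assumed rationale for the RUN-line change here and in
; half-constprop.ll): opt reads textual IR directly and -S emits textual IR,
; so piping through llvm-as and llvm-dis was redundant.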
define float @abc() nounwind { diff --git a/test/Assembler/internal-hidden-alias.ll b/test/Assembler/internal-hidden-alias.ll new file mode 100644 index 0000000..660514b --- /dev/null +++ b/test/Assembler/internal-hidden-alias.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +@global = global i32 0 + +@alias = hidden alias internal i32* @global +; CHECK: symbol with local linkage must have default visibility diff --git a/test/Assembler/internal-hidden-function.ll b/test/Assembler/internal-hidden-function.ll new file mode 100644 index 0000000..193ed7c --- /dev/null +++ b/test/Assembler/internal-hidden-function.ll @@ -0,0 +1,7 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +define internal hidden void @function() { +; CHECK: symbol with local linkage must have default visibility +entry: + ret void +} diff --git a/test/Assembler/internal-hidden-variable.ll b/test/Assembler/internal-hidden-variable.ll new file mode 100644 index 0000000..eddd067 --- /dev/null +++ b/test/Assembler/internal-hidden-variable.ll @@ -0,0 +1,4 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +@var = internal hidden global i32 0 +; CHECK: symbol with local linkage must have default visibility diff --git a/test/Assembler/internal-protected-alias.ll b/test/Assembler/internal-protected-alias.ll new file mode 100644 index 0000000..d785826 --- /dev/null +++ b/test/Assembler/internal-protected-alias.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +@global = global i32 0 + +@alias = protected alias internal i32* @global +; CHECK: symbol with local linkage must have default visibility diff --git a/test/Assembler/internal-protected-function.ll b/test/Assembler/internal-protected-function.ll new file mode 100644 index 0000000..944cb75 --- /dev/null +++ b/test/Assembler/internal-protected-function.ll @@ -0,0 +1,7 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +define internal protected void @function() { +; CHECK: symbol with local linkage must have default visibility +entry: + ret void +} diff --git a/test/Assembler/internal-protected-variable.ll b/test/Assembler/internal-protected-variable.ll new file mode 100644 index 0000000..df02275 --- /dev/null +++ b/test/Assembler/internal-protected-variable.ll @@ -0,0 +1,4 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +@var = internal protected global i32 0 +; CHECK: symbol with local linkage must have default visibility diff --git a/test/Assembler/private-hidden-alias.ll b/test/Assembler/private-hidden-alias.ll new file mode 100644 index 0000000..58be92a --- /dev/null +++ b/test/Assembler/private-hidden-alias.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +@global = global i32 0 + +@alias = hidden alias private i32* @global +; CHECK: symbol with local linkage must have default visibility diff --git a/test/Assembler/private-hidden-function.ll b/test/Assembler/private-hidden-function.ll new file mode 100644 index 0000000..dd73f04 --- /dev/null +++ b/test/Assembler/private-hidden-function.ll @@ -0,0 +1,7 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +define private hidden void @function() { +; CHECK: symbol with local linkage must have default visibility +entry: + ret void +} diff --git a/test/Assembler/private-hidden-variable.ll b/test/Assembler/private-hidden-variable.ll new file mode 100644 index 0000000..ce6bfa9 --- /dev/null +++ b/test/Assembler/private-hidden-variable.ll @@ -0,0 +1,4 @@ +; RUN: not llvm-as < %s -o 
/dev/null 2>&1 | FileCheck %s + +@var = private hidden global i32 0 +; CHECK: symbol with local linkage must have default visibility diff --git a/test/Assembler/private-protected-alias.ll b/test/Assembler/private-protected-alias.ll new file mode 100644 index 0000000..a72c248 --- /dev/null +++ b/test/Assembler/private-protected-alias.ll @@ -0,0 +1,6 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +@global = global i32 0 + +@alias = protected alias private i32* @global +; CHECK: symbol with local linkage must have default visibility diff --git a/test/Assembler/private-protected-function.ll b/test/Assembler/private-protected-function.ll new file mode 100644 index 0000000..5dbb420 --- /dev/null +++ b/test/Assembler/private-protected-function.ll @@ -0,0 +1,7 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +define private protected void @function() { +; CHECK: symbol with local linkage must have default visibility +entry: + ret void +} diff --git a/test/Assembler/private-protected-variable.ll b/test/Assembler/private-protected-variable.ll new file mode 100644 index 0000000..c4458f5 --- /dev/null +++ b/test/Assembler/private-protected-variable.ll @@ -0,0 +1,4 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +@var = private protected global i32 0 +; CHECK: symbol with local linkage must have default visibility diff --git a/test/Bitcode/attributes.ll b/test/Bitcode/attributes.ll index 545f1cb..02e1bb1 100644 --- a/test/Bitcode/attributes.ll +++ b/test/Bitcode/attributes.ll @@ -218,6 +218,11 @@ define void @f36(i8* inalloca) { ret void } +define nonnull i8* @f37(i8* nonnull %a) { +; CHECK: define nonnull i8* @f37(i8* nonnull %a) { + ret i8* %a +} + ; CHECK: attributes #0 = { noreturn } ; CHECK: attributes #1 = { nounwind } ; CHECK: attributes #2 = { readnone } diff --git a/test/Bitcode/deprecated-linker_private-linker_private_weak.ll b/test/Bitcode/deprecated-linker_private-linker_private_weak.ll new file mode 100644 index 0000000..12a527c --- /dev/null +++ b/test/Bitcode/deprecated-linker_private-linker_private_weak.ll @@ -0,0 +1,17 @@ +; RUN: llvm-as -o - %s | llvm-dis | FileCheck %s +; RUN: llvm-as -o /dev/null %s 2>&1 | FileCheck %s -check-prefix CHECK-WARNINGS + +@.linker_private = linker_private unnamed_addr constant [15 x i8] c"linker_private\00", align 64 +@.linker_private_weak = linker_private_weak unnamed_addr constant [20 x i8] c"linker_private_weak\00", align 64 + +; CHECK: @.linker_private = private unnamed_addr constant [15 x i8] c"linker_private\00", align 64 +; CHECK: @.linker_private_weak = private unnamed_addr constant [20 x i8] c"linker_private_weak\00", align 64 + +; CHECK-WARNINGS: warning: '.linker_private' is deprecated, treating as PrivateLinkage +; CHECK-WARNINGS: @.linker_private = linker_private unnamed_addr constant [15 x i8] c"linker_private\00", align 64 +; CHECK-WARNINGS: ^ + +; CHECK-WARNINGS: warning: '.linker_private_weak' is deprecated, treating as PrivateLinkage +; CHECK-WARNINGS: @.linker_private_weak = linker_private_weak unnamed_addr constant [20 x i8] c"linker_private_weak\00", align 64 +; CHECK-WARNINGS: ^ + diff --git a/test/Bitcode/local-linkage-default-visibility.3.4.ll b/test/Bitcode/local-linkage-default-visibility.3.4.ll new file mode 100644 index 0000000..45a7b12 --- /dev/null +++ b/test/Bitcode/local-linkage-default-visibility.3.4.ll @@ -0,0 +1,79 @@ +; RUN: llvm-dis < %s.bc | FileCheck %s + +; local-linkage-default-visibility.3.4.ll.bc was generated by passing this file +; to llvm-as-3.4. 
The test checks that LLVM upgrades visibility of symbols +; with local linkage to default visibility. + +@default.internal.var = internal global i32 0 +; CHECK: @default.internal.var = internal global i32 0 + +@hidden.internal.var = internal hidden global i32 0 +; CHECK: @hidden.internal.var = internal global i32 0 + +@protected.internal.var = internal protected global i32 0 +; CHECK: @protected.internal.var = internal global i32 0 + +@default.private.var = private global i32 0 +; CHECK: @default.private.var = private global i32 0 + +@hidden.private.var = private hidden global i32 0 +; CHECK: @hidden.private.var = private global i32 0 + +@protected.private.var = private protected global i32 0 +; CHECK: @protected.private.var = private global i32 0 + +@global = global i32 0 + +@default.internal.alias = alias internal i32* @global +; CHECK: @default.internal.alias = alias internal i32* @global + +@hidden.internal.alias = hidden alias internal i32* @global +; CHECK: @hidden.internal.alias = alias internal i32* @global + +@protected.internal.alias = protected alias internal i32* @global +; CHECK: @protected.internal.alias = alias internal i32* @global + +@default.private.alias = alias private i32* @global +; CHECK: @default.private.alias = alias private i32* @global + +@hidden.private.alias = hidden alias private i32* @global +; CHECK: @hidden.private.alias = alias private i32* @global + +@protected.private.alias = protected alias private i32* @global +; CHECK: @protected.private.alias = alias private i32* @global + +define internal void @default.internal() { +; CHECK: define internal void @default.internal +entry: + ret void +} + +define internal hidden void @hidden.internal() { +; CHECK: define internal void @hidden.internal +entry: + ret void +} + +define internal protected void @protected.internal() { +; CHECK: define internal void @protected.internal +entry: + ret void +} + +define private void @default.private() { +; CHECK: define private void @default.private +entry: + ret void +} + +define private hidden void @hidden.private() { +; CHECK: define private void @hidden.private +entry: + ret void +} + +define private protected void @protected.private() { +; CHECK: define private void @protected.private +entry: + ret void +} diff --git a/test/Bitcode/local-linkage-default-visibility.3.4.ll.bc b/test/Bitcode/local-linkage-default-visibility.3.4.ll.bc new file mode 100644 index 0000000..6e49f7e Binary files /dev/null and b/test/Bitcode/local-linkage-default-visibility.3.4.ll.bc differ diff --git a/test/Bitcode/old-aliases.ll b/test/Bitcode/old-aliases.ll new file mode 100644 index 0000000..4ef47c0 --- /dev/null +++ b/test/Bitcode/old-aliases.ll @@ -0,0 +1,22 @@ +; RUN: llvm-dis < %s.bc | FileCheck %s + +; old-aliases.bc consists of this file assembled with an old llvm-as (3.5 trunk) +; from when aliases contained a ConstantExpr. 
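+; (Regeneration sketch, assuming an llvm-as from before the alias change is +; available: "llvm-as old-aliases.ll -o old-aliases.ll.bc"; a current llvm-as +; no longer accepts the ConstantExpr aliasees used below.) 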
+ +@v1 = global i32 0 +; CHECK: @v1 = global i32 0 + +@v2 = global [1 x i32] zeroinitializer +; CHECK: @v2 = global [1 x i32] zeroinitializer + +@v3 = alias bitcast (i32* @v1 to i16*) +; CHECK: @v3 = alias i16, i32* @v1 + +@v4 = alias getelementptr ([1 x i32]* @v2, i32 0, i32 0) +; CHECK: @v4 = alias i32, [1 x i32]* @v2 + +@v5 = alias i32 addrspace(2)* addrspacecast (i32 addrspace(0)* @v1 to i32 addrspace(2)*) +; CHECK: @v5 = alias addrspace(2) i32, i32* @v1 + +@v6 = alias i16* @v3 +; CHECK: @v6 = alias i16, i32* @v1 diff --git a/test/Bitcode/old-aliases.ll.bc b/test/Bitcode/old-aliases.ll.bc new file mode 100644 index 0000000..1f157b2 Binary files /dev/null and b/test/Bitcode/old-aliases.ll.bc differ diff --git a/test/Bitcode/tailcall.ll b/test/Bitcode/tailcall.ll new file mode 100644 index 0000000..765b470 --- /dev/null +++ b/test/Bitcode/tailcall.ll @@ -0,0 +1,17 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +; Check that musttail and tail roundtrip. + +declare cc8191 void @t1_callee() +define cc8191 void @t1() { +; CHECK: tail call cc8191 void @t1_callee() + tail call cc8191 void @t1_callee() + ret void +} + +declare cc8191 void @t2_callee() +define cc8191 void @t2() { +; CHECK: musttail call cc8191 void @t2_callee() + musttail call cc8191 void @t2_callee() + ret void +} diff --git a/test/Bitcode/upgrade-global-ctors.ll b/test/Bitcode/upgrade-global-ctors.ll new file mode 100644 index 0000000..bd253a8 --- /dev/null +++ b/test/Bitcode/upgrade-global-ctors.ll @@ -0,0 +1,3 @@ +; RUN: llvm-dis < %s.bc| FileCheck %s + +; CHECK: @llvm.global_ctors = appending global [0 x { i32, void ()*, i8* }] zeroinitializer diff --git a/test/Bitcode/upgrade-global-ctors.ll.bc b/test/Bitcode/upgrade-global-ctors.ll.bc new file mode 100644 index 0000000..927fd91 Binary files /dev/null and b/test/Bitcode/upgrade-global-ctors.ll.bc differ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 827cd76..3e08a16 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -37,6 +37,7 @@ set(LLVM_TEST_DEPENDS llvm-mc llvm-mcmarkup llvm-nm + llvm-size llvm-objdump llvm-profdata llvm-readobj diff --git a/test/CodeGen/AArch64/128bit_load_store.ll b/test/CodeGen/AArch64/128bit_load_store.ll index 502fd70..a6f0776 100644 --- a/test/CodeGen/AArch64/128bit_load_store.ll +++ b/test/CodeGen/AArch64/128bit_load_store.ll @@ -1,7 +1,7 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s --check-prefix=CHECK define void @test_store_f128(fp128* %ptr, fp128 %val) #0 { -; CHECK: test_store_f128 +; CHECK-LABEL: test_store_f128 ; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] entry: store fp128 %val, fp128* %ptr, align 16 @@ -9,7 +9,7 @@ entry: } define fp128 @test_load_f128(fp128* readonly %ptr) #2 { -; CHECK: test_load_f128 +; CHECK-LABEL: test_load_f128 ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}] entry: %0 = load fp128* %ptr, align 16 @@ -17,9 +17,9 @@ entry: } define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 { -; CHECK: test_vstrq_p128 -; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #8] -; CHECK-NEXT: str {{x[0-9]+}}, [{{x[0-9]+}}] +; CHECK-LABEL: test_vstrq_p128 +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}] + entry: %0 = bitcast i128* %ptr to fp128* %1 = bitcast i128 %val to fp128 @@ -28,9 +28,9 @@ entry: } define i128 @test_vldrq_p128(i128* readonly %ptr) #2 { -; CHECK: test_vldrq_p128 -; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}] -; CHECK-NEXT: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #8] 
+; CHECK-LABEL: test_vldrq_p128 +; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}] + entry: %0 = bitcast i128* %ptr to fp128* %1 = load fp128* %0, align 16 @@ -39,7 +39,7 @@ entry: } define void @test_ld_st_p128(i128* nocapture %ptr) #0 { -; CHECK: test_ld_st_p128 +; CHECK-LABEL: test_ld_st_p128 ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}] ; CHECK-NEXT: str {{q[0-9]+}}, [{{x[0-9]+}}, #16] entry: diff --git a/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll b/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll new file mode 100644 index 0000000..c932253 --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll @@ -0,0 +1,69 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s +; arm64 has a separate copy as aarch64-neon-v1i1-setcc.ll + +; This file tests the DAG node like "v1i1 SETCC v1i64, v1i64". As the v1i1 type +; is illegal in the AArch64 backend, the legalizer tries to scalarize this node. +; As the v1i64 operands of SETCC are legal types, they will not be scalarized. +; Currently the type legalizer will have an assertion failure as it assumes all +; operands of SETCC have been legalized. +; FIXME: If the algorithm of type scalarization is improved and can legalize +; "v1i1 SETCC" correctly, these test cases are not needed. + +define i64 @test_sext_extr_cmp_0(<1 x i64> %v1, <1 x i64> %v2) { +; CHECK-LABEL: test_sext_extr_cmp_0: +; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}} + %1 = icmp sge <1 x i64> %v1, %v2 + %2 = extractelement <1 x i1> %1, i32 0 + %vget_lane = sext i1 %2 to i64 + ret i64 %vget_lane +} + +define i64 @test_sext_extr_cmp_1(<1 x double> %v1, <1 x double> %v2) { +; CHECK-LABEL: test_sext_extr_cmp_1: +; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}} + %1 = fcmp oeq <1 x double> %v1, %v2 + %2 = extractelement <1 x i1> %1, i32 0 + %vget_lane = sext i1 %2 to i64 + ret i64 %vget_lane +} + +define <1 x i64> @test_select_v1i1_0(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) { +; CHECK-LABEL: test_select_v1i1_0: +; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %1 = icmp eq <1 x i64> %v1, %v2 + %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3 + ret <1 x i64> %res +} + +define <1 x i64> @test_select_v1i1_1(<1 x double> %v1, <1 x double> %v2, <1 x i64> %v3) { +; CHECK-LABEL: test_select_v1i1_1: +; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %1 = fcmp oeq <1 x double> %v1, %v2 + %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3 + ret <1 x i64> %res +} + +define <1 x double> @test_select_v1i1_2(<1 x i64> %v1, <1 x i64> %v2, <1 x double> %v3) { +; CHECK-LABEL: test_select_v1i1_2: +; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %1 = icmp eq <1 x i64> %v1, %v2 + %res = select <1 x i1> %1, <1 x double> zeroinitializer, <1 x double> %v3 + ret <1 x double> %res +} + +define i32 @test_br_extr_cmp(<1 x i64> %v1, <1 x i64> %v2) { +; CHECK-LABEL: test_br_extr_cmp: +; CHECK: cmp x{{[0-9]+}}, x{{[0-9]+}} + %1 = icmp eq <1 x i64> %v1, %v2 + %2 = extractelement <1 x i1> %1, i32 0 + br i1 %2, label %if.end, label %if.then + +if.then: + ret i32 0; + +if.end: + ret i32 1; +} diff --git a/test/CodeGen/AArch64/adc.ll b/test/CodeGen/AArch64/adc.ll index 29637d3..892573b 100644 --- a/test/CodeGen/AArch64/adc.ll +++ b/test/CodeGen/AArch64/adc.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs < %s 
-mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s +; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-apple-ios7.0 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s +; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s define i128 @test_simple(i128 %a, i128 %b, i128 %c) { ; CHECK-LABEL: test_simple: diff --git a/test/CodeGen/AArch64/addsub-shifted.ll b/test/CodeGen/AArch64/addsub-shifted.ll index 269c1e8..0a93edd 100644 --- a/test/CodeGen/AArch64/addsub-shifted.ll +++ b/test/CodeGen/AArch64/addsub-shifted.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs %s -o - -mtriple=arm64-apple-ios7.0 | FileCheck %s @var32 = global i32 0 @var64 = global i64 0 @@ -35,7 +35,7 @@ define void @test_lsl_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) { %shift4a = shl i32 %lhs4a, 15 %val4a = sub i32 0, %shift4a store volatile i32 %val4a, i32* @var32 -; CHECK: sub {{w[0-9]+}}, wzr, {{w[0-9]+}}, lsl #15 +; CHECK: neg {{w[0-9]+}}, {{w[0-9]+}}, lsl #15 %rhs5 = load volatile i64* @var64 %shift5 = shl i64 %rhs5, 18 @@ -66,7 +66,7 @@ define void @test_lsl_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) { %shift8a = shl i64 %lhs8a, 60 %val8a = sub i64 0, %shift8a store volatile i64 %val8a, i64* @var64 -; CHECK: sub {{x[0-9]+}}, xzr, {{x[0-9]+}}, lsl #60 +; CHECK: neg {{x[0-9]+}}, {{x[0-9]+}}, lsl #60 ret void ; CHECK: ret @@ -99,7 +99,7 @@ define void @test_lsr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) { %shift4a = lshr i32 %lhs32, 15 %val4a = sub i32 0, %shift4a store volatile i32 %val4a, i32* @var32 -; CHECK: sub {{w[0-9]+}}, wzr, {{w[0-9]+}}, lsr #15 +; CHECK: neg {{w[0-9]+}}, {{w[0-9]+}}, lsr #15 %shift5 = lshr i64 %rhs64, 18 %val5 = add i64 %lhs64, %shift5 @@ -125,7 +125,7 @@ define void @test_lsr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) { %shift8a = lshr i64 %lhs64, 45 %val8a = sub i64 0, %shift8a store volatile i64 %val8a, i64* @var64 -; CHECK: sub {{x[0-9]+}}, xzr, {{x[0-9]+}}, lsr #45 +; CHECK: neg {{x[0-9]+}}, {{x[0-9]+}}, lsr #45 ret void ; CHECK: ret @@ -158,7 +158,7 @@ define void @test_asr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) { %shift4a = ashr i32 %lhs32, 15 %val4a = sub i32 0, %shift4a store volatile i32 %val4a, i32* @var32 -; CHECK: sub {{w[0-9]+}}, wzr, {{w[0-9]+}}, asr #15 +; CHECK: neg {{w[0-9]+}}, {{w[0-9]+}}, asr #15 %shift5 = ashr i64 %rhs64, 18 %val5 = add i64 %lhs64, %shift5 @@ -184,7 +184,7 @@ define void @test_asr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) { %shift8a = ashr i64 %lhs64, 45 %val8a = sub i64 0, %shift8a store volatile i64 %val8a, i64* @var64 -; CHECK: sub {{x[0-9]+}}, xzr, {{x[0-9]+}}, asr #45 +; CHECK: neg {{x[0-9]+}}, {{x[0-9]+}}, asr #45 ret void ; CHECK: ret @@ -245,7 +245,7 @@ define i32 @test_cmn(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) { br i1 %tst1, label %t2, label %end ; Important that this isn't lowered to a cmn instruction because if %rhs32 == ; 0 then the results will differ. 
-; CHECK: sub [[RHS:w[0-9]+]], wzr, {{w[0-9]+}}, lsl #13 +; CHECK: neg [[RHS:w[0-9]+]], {{w[0-9]+}}, lsl #13 ; CHECK: cmp {{w[0-9]+}}, [[RHS]] t2: @@ -268,7 +268,7 @@ t4: %tst4 = icmp slt i64 %lhs64, %val4 br i1 %tst4, label %t5, label %end ; Again, it's important that cmn isn't used here in case %rhs64 == 0. -; CHECK: sub [[RHS:x[0-9]+]], xzr, {{x[0-9]+}}, lsl #43 +; CHECK: neg [[RHS:x[0-9]+]], {{x[0-9]+}}, lsl #43 ; CHECK: cmp {{x[0-9]+}}, [[RHS]] t5: diff --git a/test/CodeGen/AArch64/addsub.ll b/test/CodeGen/AArch64/addsub.ll index 4d46d04..b85fdbb 100644 --- a/test/CodeGen/AArch64/addsub.ll +++ b/test/CodeGen/AArch64/addsub.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-linux-gnu | FileCheck %s ; Note that this should be refactored (for efficiency if nothing else) ; when the PCS is implemented so we don't have to worry about the @@ -28,12 +28,12 @@ define void @add_small() { define void @add_med() { ; CHECK-LABEL: add_med: -; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #3567, lsl #12 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{#3567, lsl #12|#14610432}} %val32 = load i32* @var_i32 %newval32 = add i32 %val32, 14610432 ; =0xdef000 store i32 %newval32, i32* @var_i32 -; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #4095, lsl #12 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{#4095, lsl #12|#16773120}} %val64 = load i64* @var_i64 %newval64 = add i64 %val64, 16773120 ; =0xfff000 store i64 %newval64, i64* @var_i64 @@ -62,12 +62,12 @@ define void @sub_small() { define void @sub_med() { ; CHECK-LABEL: sub_med: -; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #3567, lsl #12 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{#3567, lsl #12|#14610432}} %val32 = load i32* @var_i32 %newval32 = sub i32 %val32, 14610432 ; =0xdef000 store i32 %newval32, i32* @var_i32 -; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, #4095, lsl #12 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{#4095, lsl #12|#16773120}} %val64 = load i64* @var_i64 %newval64 = sub i64 %val64, 16773120 ; =0xfff000 store i64 %newval64, i64* @var_i64 @@ -80,13 +80,13 @@ define void @testing() { %val = load i32* @var_i32 ; CHECK: cmp {{w[0-9]+}}, #4095 -; CHECK: b.ne .LBB4_6 +; CHECK: b.ne [[RET:.?LBB[0-9]+_[0-9]+]] %cmp_pos_small = icmp ne i32 %val, 4095 br i1 %cmp_pos_small, label %ret, label %test2 test2: -; CHECK: cmp {{w[0-9]+}}, #3567, lsl #12 -; CHECK: b.lo .LBB4_6 +; CHECK: cmp {{w[0-9]+}}, {{#3567, lsl #12|#14610432}} +; CHECK: b.lo [[RET]] %newval2 = add i32 %val, 1 store i32 %newval2, i32* @var_i32 %cmp_pos_big = icmp ult i32 %val, 14610432 @@ -94,7 +94,7 @@ test2: test3: ; CHECK: cmp {{w[0-9]+}}, #123 -; CHECK: b.lt .LBB4_6 +; CHECK: b.lt [[RET]] %newval3 = add i32 %val, 2 store i32 %newval3, i32* @var_i32 %cmp_pos_slt = icmp slt i32 %val, 123 @@ -102,7 +102,7 @@ test3: test4: ; CHECK: cmp {{w[0-9]+}}, #321 -; CHECK: b.gt .LBB4_6 +; CHECK: b.gt [[RET]] %newval4 = add i32 %val, 3 store i32 %newval4, i32* @var_i32 %cmp_pos_sgt = icmp sgt i32 %val, 321 @@ -110,7 +110,7 @@ test4: test5: ; CHECK: cmn {{w[0-9]+}}, #444 -; CHECK: b.gt .LBB4_6 +; CHECK: b.gt [[RET]] %newval5 = add i32 %val, 4 store i32 %newval5, i32* @var_i32 %cmp_neg_uge = icmp sgt i32 %val, -444 diff --git a/test/CodeGen/AArch64/addsub_ext.ll b/test/CodeGen/AArch64/addsub_ext.ll index f0e11c6..a2266b1 100644 --- a/test/CodeGen/AArch64/addsub_ext.ll +++ b/test/CodeGen/AArch64/addsub_ext.ll @@ -1,11 +1,11 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc 
-verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu | FileCheck %s @var8 = global i8 0 @var16 = global i16 0 @var32 = global i32 0 @var64 = global i64 0 -define void @addsub_i8rhs() { +define void @addsub_i8rhs() minsize { ; CHECK-LABEL: addsub_i8rhs: %val8_tmp = load i8* @var8 %lhs32 = load i32* @var32 @@ -80,7 +80,7 @@ end: ret void } -define void @addsub_i16rhs() { +define void @addsub_i16rhs() minsize { ; CHECK-LABEL: addsub_i16rhs: %val16_tmp = load i16* @var16 %lhs32 = load i32* @var32 @@ -158,7 +158,7 @@ end: ; N.b. we could probably check more here ("add w2, w3, w1, uxtw" for ; example), but the remaining instructions are probably not idiomatic ; in the face of "add/sub (shifted register)" so I don't intend to. -define void @addsub_i32rhs() { +define void @addsub_i32rhs() minsize { ; CHECK-LABEL: addsub_i32rhs: %val32_tmp = load i32* @var32 %lhs64 = load i64* @var64 diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll index 1d3c0a0..f93efbc 100644 --- a/test/CodeGen/AArch64/alloca.ll +++ b/test/CodeGen/AArch64/alloca.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP %s +; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s declare void @use_addr(i8*) @@ -8,23 +8,22 @@ define void @test_simple_alloca(i64 %n) { %buf = alloca i8, i64 %n ; Make sure we align the stack change to 16 bytes: -; CHECK-DAG: add [[SPDELTA:x[0-9]+]], x0, #15 -; CHECK-DAG: and x0, [[SPDELTA]], #0xfffffffffffffff0 +; CHECK: {{mov|add}} x29 +; CHECK: mov [[TMP:x[0-9]+]], sp +; CHECK: add [[SPDELTA_TMP:x[0-9]+]], x0, #15 +; CHECK: and [[SPDELTA:x[0-9]+]], [[SPDELTA_TMP]], #0xfffffffffffffff0 ; Make sure we change SP. It would be surprising if anything but x0 were used ; for the final sp, but it could be if it was then moved into x0. -; CHECK-DAG: mov [[TMP:x[0-9]+]], sp -; CHECK-DAG: sub x0, [[TMP]], [[SPDELTA]] -; CHECK: mov sp, x0 +; CHECK: sub [[NEWSP:x[0-9]+]], [[TMP]], [[SPDELTA]] +; CHECK: mov sp, [[NEWSP]] call void @use_addr(i8* %buf) ; CHECK: bl use_addr ret void ; Make sure epilogue restores sp from fp -; CHECK: sub sp, x29, #16 -; CHECK: ldp x29, x30, [sp, #16] -; CHECK: add sp, sp, #32 +; CHECK: {{sub|mov}} sp, x29 ; CHECK: ret } @@ -32,57 +31,70 @@ declare void @use_addr_loc(i8*, i64*) define i64 @test_alloca_with_local(i64 %n) { ; CHECK-LABEL: test_alloca_with_local: -; CHECK: sub sp, sp, #32 -; CHECK: stp x29, x30, [sp, #16] +; CHECK-DAG: sub sp, sp, [[LOCAL_STACK:#[0-9]+]] +; CHECK-DAG: {{mov|add}} x29, sp %loc = alloca i64 %buf = alloca i8, i64 %n ; Make sure we align the stack change to 16 bytes: -; CHECK-DAG: add [[SPDELTA:x[0-9]+]], x0, #15 -; CHECK-DAG: and x0, [[SPDELTA]], #0xfffffffffffffff0 +; CHECK: mov [[TMP:x[0-9]+]], sp +; CHECK: add [[SPDELTA_TMP:x[0-9]+]], x0, #15 +; CHECK: and [[SPDELTA:x[0-9]+]], [[SPDELTA_TMP]], #0xfffffffffffffff0 ; Make sure we change SP. It would be surprising if anything but x0 were used ; for the final sp, but it could be if it was then moved into x0. 
-; CHECK-DAG: mov [[TMP:x[0-9]+]], sp -; CHECK-DAG: sub x0, [[TMP]], [[SPDELTA]] -; CHECK: mov sp, x0 +; CHECK: sub [[NEWSP:x[0-9]+]], [[TMP]], [[SPDELTA]] +; CHECK: mov sp, [[NEWSP]] - ; Obviously suboptimal code here, but it to get &local in x1 -; CHECK: sub [[TMP:x[0-9]+]], x29, [[LOC_FROM_FP:#[0-9]+]] -; CHECK: add x1, [[TMP]], #0 +; CHECK: sub {{x[0-9]+}}, x29, #[[LOC_FROM_FP:[0-9]+]] call void @use_addr_loc(i8* %buf, i64* %loc) ; CHECK: bl use_addr %val = load i64* %loc -; CHECK: sub x[[TMP:[0-9]+]], x29, [[LOC_FROM_FP]] -; CHECK: ldr x0, [x[[TMP]]] + +; CHECK: ldur x0, [x29, #-[[LOC_FROM_FP]]] ret i64 %val ; Make sure epilogue restores sp from fp -; CHECK: sub sp, x29, #16 -; CHECK: ldp x29, x30, [sp, #16] -; CHECK: add sp, sp, #32 +; CHECK: {{sub|mov}} sp, x29 ; CHECK: ret } define void @test_variadic_alloca(i64 %n, ...) { -; CHECK: test_variadic_alloca: - -; CHECK: sub sp, sp, #208 -; CHECK: stp x29, x30, [sp, #192] -; CHECK: add x29, sp, #192 -; CHECK: sub [[TMP:x[0-9]+]], x29, #192 -; CHECK: add x8, [[TMP]], #0 -; CHECK-FP: str q7, [x8, #112] +; CHECK-LABEL: test_variadic_alloca: + ; [...] -; CHECK-FP: str q1, [x8, #16] -; CHECK-NOFP: sub sp, sp, #80 -; CHECK-NOFP: stp x29, x30, [sp, #64] -; CHECK-NOFP: add x29, sp, #64 -; CHECK-NOFP: sub [[TMP:x[0-9]+]], x29, #64 -; CHECK-NOFP: add x8, [[TMP]], #0 + +; CHECK-NOFP-AARCH64: sub sp, sp, #80 +; CHECK-NOFP-AARCH64: stp x29, x30, [sp, #64] +; CHECK-NOFP-AARCH64: add x29, sp, #64 +; CHECK-NOFP-AARCH64: sub [[TMP:x[0-9]+]], x29, #64 +; CHECK-NOFP-AARCH64: add x8, [[TMP]], #0 + + +; CHECK: stp x29, x30, [sp, #-16]! +; CHECK: mov x29, sp +; CHECK: sub sp, sp, #192 +; CHECK: stp q6, q7, [x29, #-96] +; [...] +; CHECK: stp q0, q1, [x29, #-192] + +; CHECK: stp x6, x7, [x29, #-16] +; [...] +; CHECK: stp x2, x3, [x29, #-48] + +; CHECK-NOFP-ARM64: stp x29, x30, [sp, #-16]! +; CHECK-NOFP-ARM64: mov x29, sp +; CHECK-NOFP-ARM64: sub sp, sp, #64 +; CHECK-NOFP-ARM64: stp x6, x7, [x29, #-16] +; [...] +; CHECK-NOFP-ARM64: stp x4, x5, [x29, #-32] +; [...] +; CHECK-NOFP-ARM64: stp x2, x3, [x29, #-48] +; [...] +; CHECK-NOFP-ARM64: mov x8, sp %addr = alloca i8, i64 %n @@ -90,23 +102,24 @@ define void @test_variadic_alloca(i64 %n, ...) { ; CHECK: bl use_addr ret void -; CHECK: sub sp, x29, #192 -; CHECK: ldp x29, x30, [sp, #192] -; CHECK: add sp, sp, #208 -; CHECK-NOFP: sub sp, x29, #64 -; CHECK-NOFP: ldp x29, x30, [sp, #64] -; CHECK-NOFP: add sp, sp, #80 +; CHECK-NOFP-AARCH64: sub sp, x29, #64 +; CHECK-NOFP-AARCH64: ldp x29, x30, [sp, #64] +; CHECK-NOFP-AARCH64: add sp, sp, #80 + +; CHECK-NOFP-ARM64: mov sp, x29 +; CHECK-NOFP-ARM64: ldp x29, x30, [sp], #16 } define void @test_alloca_large_frame(i64 %n) { ; CHECK-LABEL: test_alloca_large_frame: -; CHECK: sub sp, sp, #496 -; CHECK: stp x29, x30, [sp, #480] -; CHECK: add x29, sp, #480 -; CHECK: sub sp, sp, #48 -; CHECK: sub sp, sp, #1953, lsl #12 + +; CHECK: stp x20, x19, [sp, #-32]! 
+; CHECK: stp x29, x30, [sp, #16] +; CHECK: add x29, sp, #16 +; CHECK: sub sp, sp, #1953, lsl #12 +; CHECK: sub sp, sp, #512 %addr1 = alloca i8, i64 %n %addr2 = alloca i64, i64 1000000 @@ -114,9 +127,10 @@ define void @test_alloca_large_frame(i64 %n) { call void @use_addr_loc(i8* %addr1, i64* %addr2) ret void -; CHECK: sub sp, x29, #480 -; CHECK: ldp x29, x30, [sp, #480] -; CHECK: add sp, sp, #496 + +; CHECK: sub sp, x29, #16 +; CHECK: ldp x29, x30, [sp, #16] +; CHECK: ldp x20, x19, [sp], #32 } declare i8* @llvm.stacksave() @@ -124,7 +138,6 @@ declare void @llvm.stackrestore(i8*) define void @test_scoped_alloca(i64 %n) { ; CHECK-LABEL: test_scoped_alloca: -; CHECK: sub sp, sp, #32 %sp = call i8* @llvm.stacksave() ; CHECK: mov [[SAVED_SP:x[0-9]+]], sp diff --git a/test/CodeGen/AArch64/analyze-branch.ll b/test/CodeGen/AArch64/analyze-branch.ll index 36bc2e0..6616b27 100644 --- a/test/CodeGen/AArch64/analyze-branch.ll +++ b/test/CodeGen/AArch64/analyze-branch.ll @@ -168,7 +168,7 @@ define void @test_TBZ_fallthrough_nottaken(i64 %in) nounwind { %tst = icmp eq i64 %bit, 0 br i1 %tst, label %true, label %false, !prof !1 -; CHECK: tbz {{x[0-9]+}}, #15, [[TRUE:.LBB[0-9]+_[0-9]+]] +; CHECK: tbz {{[wx][0-9]+}}, #15, [[TRUE:.LBB[0-9]+_[0-9]+]] ; CHECK-NEXT: // BB# ; CHECK-NEXT: bl test_false @@ -213,7 +213,7 @@ define void @test_TBNZ_fallthrough_nottaken(i64 %in) nounwind { %tst = icmp ne i64 %bit, 0 br i1 %tst, label %true, label %false, !prof !1 -; CHECK: tbnz {{x[0-9]+}}, #15, [[TRUE:.LBB[0-9]+_[0-9]+]] +; CHECK: tbnz {{[wx][0-9]+}}, #15, [[TRUE:.LBB[0-9]+_[0-9]+]] ; CHECK-NEXT: // BB# ; CHECK-NEXT: bl test_false diff --git a/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll b/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll new file mode 100644 index 0000000..6fb7c3f --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll @@ -0,0 +1,47 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin + +; Can't copy or spill / restore CPSR. 
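+; (CPSR/NZCV has no spill slot or register-copy pattern, so the allocator is +; expected to rematerialize or sink the flag-producing compares next to their +; users rather than trying to preserve the flags across the branches below.) 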
+; rdar://9105206 + +define fastcc void @t() ssp align 2 { +entry: + br i1 undef, label %bb3.i, label %bb2.i + +bb2.i: ; preds = %entry + br label %bb3.i + +bb3.i: ; preds = %bb2.i, %entry + br i1 undef, label %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71, label %bb.i69 + +bb.i69: ; preds = %bb3.i + br label %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71 + +_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71: ; preds = %bb.i69, %bb3.i + %0 = select i1 undef, float 0.000000e+00, float undef + %1 = fdiv float %0, undef + %2 = fcmp ult float %1, 0xBF847AE140000000 + %storemerge9 = select i1 %2, float %1, float 0.000000e+00 + store float %storemerge9, float* undef, align 4 + br i1 undef, label %bb42, label %bb47 + +bb42: ; preds = %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71 + br i1 undef, label %bb46, label %bb53 + +bb46: ; preds = %bb42 + br label %bb48 + +bb47: ; preds = %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71 + br label %bb48 + +bb48: ; preds = %bb47, %bb46 + br i1 undef, label %bb1.i14, label %bb.i13 + +bb.i13: ; preds = %bb48 + br label %bb1.i14 + +bb1.i14: ; preds = %bb.i13, %bb48 + br label %bb53 + +bb53: ; preds = %bb1.i14, %bb42 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll new file mode 100644 index 0000000..2b083d8 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin + +; rdar://9146594 + +define void @drt_vsprintf() nounwind ssp { +entry: + %do_tab_convert = alloca i32, align 4 + br i1 undef, label %if.then24, label %if.else295, !dbg !13 + +if.then24: ; preds = %entry + unreachable + +if.else295: ; preds = %entry + call void @llvm.dbg.declare(metadata !{i32* %do_tab_convert}, metadata !16), !dbg !18 + store i32 0, i32* %do_tab_convert, align 4, !dbg !19 + unreachable +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +!llvm.dbg.gv = !{!0} +!llvm.dbg.sp = !{!1, !7, !10, !11, !12} + +!0 = metadata !{i32 589876, i32 0, metadata !1, metadata !"vsplive", metadata !"vsplive", metadata !"", metadata !2, i32 617, metadata !6, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] +!1 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"drt_vsprintf", metadata !"drt_vsprintf", metadata !"", i32 616, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!2 = metadata !{i32 589865, metadata !20} ; [ DW_TAG_file_type ] +!3 = metadata !{i32 589841, metadata !20, i32 12, metadata !"clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] +!4 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !5, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!5 = metadata !{metadata !6} +!6 = metadata !{i32 589860, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] +!7 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"putc_mem", metadata !"putc_mem", metadata !"", i32 30, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!8 = metadata !{i32 589845, metadata !20, metadata !2, 
metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !9, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!9 = metadata !{null} +!10 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_double", metadata !"print_double", metadata !"", i32 203, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!11 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_number", metadata !"print_number", metadata !"", i32 75, metadata !4, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!12 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"get_flags", metadata !"get_flags", metadata !"", i32 508, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!13 = metadata !{i32 653, i32 5, metadata !14, null} +!14 = metadata !{i32 589835, metadata !20, metadata !15, i32 652, i32 35, i32 2} ; [ DW_TAG_lexical_block ] +!15 = metadata !{i32 589835, metadata !20, metadata !1, i32 616, i32 1, i32 0} ; [ DW_TAG_lexical_block ] +!16 = metadata !{i32 590080, metadata !17, metadata !"do_tab_convert", metadata !2, i32 853, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ] +!17 = metadata !{i32 589835, metadata !20, metadata !14, i32 850, i32 12, i32 33} ; [ DW_TAG_lexical_block ] +!18 = metadata !{i32 853, i32 11, metadata !17, null} +!19 = metadata !{i32 853, i32 29, metadata !17, null} +!20 = metadata !{metadata !"print.i", metadata !"/Volumes/Ebi/echeng/radars/r9146594"} +!21 = metadata !{i32 0} diff --git a/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll b/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll new file mode 100644 index 0000000..6f0ec34 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s -march=arm64 | FileCheck %s +define void @foo(i64 %val) { +; CHECK: foo +; The stack frame store is not 64-bit aligned. Make sure we use an +; instruction that can handle that. +; CHECK: stur x0, [sp, #20] + %a = alloca [49 x i32], align 4 + %p32 = getelementptr inbounds [49 x i32]* %a, i64 0, i64 2 + %p = bitcast i32* %p32 to i64* + store i64 %val, i64* %p, align 8 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll b/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll new file mode 100644 index 0000000..88232fc --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -mtriple=arm64-apple-iOS5.0 + +; CPSR is not allocatable so fast allocatable wouldn't mark them killed. 
+; rdar://9313272 + +define hidden void @t() nounwind { +entry: + %cmp = icmp eq i32* null, undef + %frombool = zext i1 %cmp to i8 + store i8 %frombool, i8* undef, align 1 + %tmp4 = load i8* undef, align 1 + %tobool = trunc i8 %tmp4 to i1 + br i1 %tobool, label %land.lhs.true, label %if.end + +land.lhs.true: ; preds = %entry + unreachable + +if.end: ; preds = %entry + br i1 undef, label %land.lhs.true14, label %if.end33 + +land.lhs.true14: ; preds = %if.end + unreachable + +if.end33: ; preds = %if.end + unreachable +} diff --git a/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll new file mode 100644 index 0000000..8f99bc3 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s + +; Can't fold the increment by 1<<12 into a post-increment load +; rdar://10301335 + +@test_data = common global i32 0, align 4 + +define void @t() nounwind ssp { +; CHECK-LABEL: t: +entry: + br label %for.body + +for.body: +; CHECK: for.body +; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}] +; CHECK: add x[[REG:[0-9]+]], +; CHECK: x[[REG]], #1, lsl #12 + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 12 + %add = add nsw i64 %0, 34628173824 + %1 = inttoptr i64 %add to i32* + %2 = load volatile i32* %1, align 4096 + store volatile i32 %2, i32* @test_data, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 200 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll b/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll new file mode 100644 index 0000000..d47dbb2 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll @@ -0,0 +1,40 @@ +; RUN: llc < %s -march=arm64 + +; The target lowering for integer comparisons was replacing some DAG nodes +; during operation legalization, which resulted in dangling pointers, +; cycles in DAGs, and eventually crashes. This is the testcase for +; one of those crashes. 
(rdar://10653656) + +define void @test(i1 zeroext %IsArrow) nounwind ssp align 2 { +entry: + br i1 undef, label %return, label %lor.lhs.false + +lor.lhs.false: + br i1 undef, label %return, label %if.end + +if.end: + %tmp.i = load i64* undef, align 8 + %and.i.i.i = and i64 %tmp.i, -16 + br i1 %IsArrow, label %if.else_crit_edge, label %if.end32 + +if.else_crit_edge: + br i1 undef, label %if.end32, label %return + +if.end32: + %0 = icmp ult i32 undef, 3 + %1 = zext i64 %tmp.i to i320 + %.pn.v = select i1 %0, i320 128, i320 64 + %.pn = shl i320 %1, %.pn.v + %ins346392 = or i320 %.pn, 0 + store i320 %ins346392, i320* undef, align 8 + br i1 undef, label %sw.bb.i.i, label %exit + +sw.bb.i.i: + unreachable + +exit: + unreachable + +return: + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll b/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll new file mode 100644 index 0000000..a4d37e4 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -march=arm64 | FileCheck %s + +define i32 @foo(<4 x i32> %a, i32 %n) nounwind { +; CHECK-LABEL: foo: +; CHECK: fmov w0, s0 +; CHECK-NEXT: ret + %b = bitcast <4 x i32> %a to i128 + %c = trunc i128 %b to i32 + ret i32 %c +} + +define i64 @bar(<2 x i64> %a, i64 %n) nounwind { +; CHECK-LABEL: bar: +; CHECK: fmov x0, d0 +; CHECK-NEXT: ret + %b = bitcast <2 x i64> %a to i128 + %c = trunc i128 %b to i64 + ret i64 %c +} + diff --git a/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll b/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll new file mode 100644 index 0000000..d59b0d0 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -march arm64 -mcpu=cyclone | FileCheck %s +; + +@b = private unnamed_addr constant [3 x i32] [i32 1768775988, i32 1685481784, i32 1836253201], align 4 + +; The important thing for this test is that we need an unaligned load of `l_b' +; ("ldr w2, [x1, #8]" in this case). + +; CHECK: adrp x[[PAGE:[0-9]+]], {{l_b@PAGE|.Lb}} +; CHECK: add x[[ADDR:[0-9]+]], x[[PAGE]], {{l_b@PAGEOFF|:lo12:.Lb}} +; CHECK-NEXT: ldr [[VAL:w[0-9]+]], [x[[ADDR]], #8] +; CHECK-NEXT: str [[VAL]], [x0, #8] +; CHECK-NEXT: ldr [[VAL2:x[0-9]+]], [x[[ADDR]]] +; CHECK-NEXT: str [[VAL2]], [x0] + +define void @foo(i8* %a) { + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast ([3 x i32]* @b to i8*), i64 12, i32 4, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind diff --git a/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll new file mode 100644 index 0000000..d1840d3 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll @@ -0,0 +1,22 @@ +; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic < %s | FileCheck %s --check-prefix=CHECK-LINUX +; + +define hidden void @t() optsize ssp { +entry: + store i64 zext (i32 ptrtoint (i64 (i32)* @x to i32) to i64), i64* undef, align 8 +; CHECK: adrp x{{[0-9]+}}, _x@GOTPAGE +; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, _x@GOTPAGEOFF] +; CHECK-NEXT: and x{{[0-9]+}}, x{{[0-9]+}}, #0xffffffff +; CHECK-NEXT: str x{{[0-9]+}}, [x{{[0-9]+}}] + unreachable +} + +declare i64 @x(i32) optsize + +; Worth checking the Linux code is sensible too: only way to access +; the GOT is via a 64-bit load. Just loading wN is unacceptable +; (there's no ELF relocation to do that). 
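+; (The expected sequence is the usual two-instruction GOT access, e.g. +; "adrp x8, :got:x" followed by "ldr x8, [x8, :got_lo12:x]"; the load must +; use an x register because ELF64 GOT entries are 8 bytes.) 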
+ +; CHECK-LINUX: adrp {{x[0-9]+}}, :got:x +; CHECK-LINUX: ldr {{x[0-9]+}}, [{{x[0-9]+}}, :got_lo12:x] diff --git a/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll new file mode 100644 index 0000000..4b037db --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll @@ -0,0 +1,50 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios -verify-machineinstrs | FileCheck %s + +; LdStOpt bug created illegal instruction: +; %D1, %D2 = LDPSi %X0, 1 +; rdar://11512047 + +%0 = type opaque +%struct.CGRect = type { %struct.CGPoint, %struct.CGSize } +%struct.CGPoint = type { double, double } +%struct.CGSize = type { double, double } + +@"OBJC_IVAR_$_UIScreen._bounds" = external hidden global i64, section "__DATA, __objc_ivar", align 8 + +define hidden %struct.CGRect @t(%0* nocapture %self, i8* nocapture %_cmd) nounwind readonly optsize ssp { +entry: +; CHECK-LABEL: t: +; CHECK: ldp d{{[0-9]+}}, d{{[0-9]+}} + %ivar = load i64* @"OBJC_IVAR_$_UIScreen._bounds", align 8, !invariant.load !4 + %0 = bitcast %0* %self to i8* + %add.ptr = getelementptr inbounds i8* %0, i64 %ivar + %add.ptr10.0 = bitcast i8* %add.ptr to double* + %tmp11 = load double* %add.ptr10.0, align 8 + %add.ptr.sum = add i64 %ivar, 8 + %add.ptr10.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum + %1 = bitcast i8* %add.ptr10.1 to double* + %tmp12 = load double* %1, align 8 + %add.ptr.sum17 = add i64 %ivar, 16 + %add.ptr4.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum17 + %add.ptr4.1.0 = bitcast i8* %add.ptr4.1 to double* + %tmp = load double* %add.ptr4.1.0, align 8 + %add.ptr4.1.sum = add i64 %ivar, 24 + %add.ptr4.1.1 = getelementptr inbounds i8* %0, i64 %add.ptr4.1.sum + %2 = bitcast i8* %add.ptr4.1.1 to double* + %tmp5 = load double* %2, align 8 + %insert14 = insertvalue %struct.CGPoint undef, double %tmp11, 0 + %insert16 = insertvalue %struct.CGPoint %insert14, double %tmp12, 1 + %insert = insertvalue %struct.CGRect undef, %struct.CGPoint %insert16, 0 + %insert7 = insertvalue %struct.CGSize undef, double %tmp, 0 + %insert9 = insertvalue %struct.CGSize %insert7, double %tmp5, 1 + %insert3 = insertvalue %struct.CGRect %insert, %struct.CGSize %insert9, 1 + ret %struct.CGRect %insert3 +} + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2} +!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0} +!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"} +!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0} +!4 = metadata !{} diff --git a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll new file mode 100644 index 0000000..168e921 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll @@ -0,0 +1,67 @@ +; RUN: llc -march=arm64 -O0 < %s | FileCheck %s +; RUN: llc -march=arm64 -O3 < %s | FileCheck %s + +@.str = private unnamed_addr constant [9 x i8] c"%lf %lu\0A\00", align 1 +@.str1 = private unnamed_addr constant [8 x i8] c"%lf %u\0A\00", align 1 +@.str2 = private unnamed_addr constant [8 x i8] c"%f %lu\0A\00", align 1 +@.str3 = private unnamed_addr constant [7 x i8] c"%f %u\0A\00", align 1 + +define void @testDouble(double %d) ssp { +; CHECK-LABEL: testDouble: +; CHECK: fcvtzu x{{[0-9]+}}, d{{[0-9]+}} +; CHECK: fcvtzu w{{[0-9]+}}, d{{[0-9]+}} +entry: + %d.addr = alloca double, align 8 + store double %d, double* %d.addr, align 8 + %0 = load double* 
%d.addr, align 8 + %1 = load double* %d.addr, align 8 + %conv = fptoui double %1 to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv) + %2 = load double* %d.addr, align 8 + %3 = load double* %d.addr, align 8 + %conv1 = fptoui double %3 to i32 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1) + ret void +} + +declare i32 @printf(i8*, ...) + +define void @testFloat(float %f) ssp { +; CHECK-LABEL: testFloat: +; CHECK: fcvtzu x{{[0-9]+}}, s{{[0-9]+}} +; CHECK: fcvtzu w{{[0-9]+}}, s{{[0-9]+}} +entry: + %f.addr = alloca float, align 4 + store float %f, float* %f.addr, align 4 + %0 = load float* %f.addr, align 4 + %conv = fpext float %0 to double + %1 = load float* %f.addr, align 4 + %conv1 = fptoui float %1 to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1) + %2 = load float* %f.addr, align 4 + %conv2 = fpext float %2 to double + %3 = load float* %f.addr, align 4 + %conv3 = fptoui float %3 to i32 + %call4 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3) + ret void +} + +define i32 @main(i32 %argc, i8** %argv) ssp { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + store i32 0, i32* %retval + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + call void @testDouble(double 1.159198e+01) + call void @testFloat(float 0x40272F1800000000) + ret i32 0 +} + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2} +!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0} +!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"} +!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0} diff --git a/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll b/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll new file mode 100644 index 0000000..55ecfb5 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios +; rdar://11849816 + +@shlib_path_substitutions = external hidden unnamed_addr global i8**, align 8 + +declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readnone + +declare noalias i8* @xmalloc(i64) optsize + +declare i64 @strlen(i8* nocapture) nounwind readonly optsize + +declare i8* @__strcpy_chk(i8*, i8*, i64) nounwind optsize + +declare i8* @__strcat_chk(i8*, i8*, i64) nounwind optsize + +declare noalias i8* @xstrdup(i8*) optsize + +define i8* @dyld_fix_path(i8* %path) nounwind optsize ssp { +entry: + br i1 undef, label %if.end56, label %for.cond + +for.cond: ; preds = %entry + br i1 undef, label %for.cond10, label %for.body + +for.body: ; preds = %for.cond + unreachable + +for.cond10: ; preds = %for.cond + br i1 undef, label %if.end56, label %for.body14 + +for.body14: ; preds = %for.cond10 + %call22 = tail call i64 @strlen(i8* undef) nounwind optsize + %sext = shl i64 %call22, 32 + %conv30 = ashr exact i64 %sext, 32 + %add29 = sub i64 0, %conv30 + %sub = add i64 %add29, 0 + %add31 = shl i64 %sub, 32 + %sext59 = add i64 %add31, 4294967296 + %conv33 = ashr exact i64 %sext59, 32 + %call34 = tail call noalias i8* @xmalloc(i64 %conv33) nounwind optsize + br i1 undef, label 
%cond.false45, label %cond.true43 + +cond.true43: ; preds = %for.body14 + unreachable + +cond.false45: ; preds = %for.body14 + %add.ptr = getelementptr inbounds i8* %path, i64 %conv30 + unreachable + +if.end56: ; preds = %for.cond10, %entry + ret i8* null +} + +declare i32 @strncmp(i8* nocapture, i8* nocapture, i64) nounwind readonly optsize + +declare i8* @strcpy(i8*, i8* nocapture) nounwind diff --git a/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll b/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll new file mode 100644 index 0000000..e2c43d9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -fp-contract=fast | FileCheck %s --check-prefix=FAST + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +;FAST-LABEL: _Z9example25v: +;FAST: fcmgt.4s +;FAST: ret + +;CHECK-LABEL: _Z9example25v: +;CHECK: fcmgt.4s +;CHECK: ret + +define <4 x i32> @_Z9example25v( <4 x float> %N0, <4 x float> %N1) { + %A = fcmp olt <4 x float> %N0, %N1 + %B = zext <4 x i1> %A to <4 x i32> + ret <4 x i32> %B +} diff --git a/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll b/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll new file mode 100644 index 0000000..9451124 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -march=arm64 +; Make sure we are not crashing on this test. + +define void @autogen_SD13158() { +entry: + %B26 = frem float 0.000000e+00, undef + br i1 undef, label %CF, label %CF77 + +CF: ; preds = %CF, %CF76 + store float %B26, float* undef + br i1 undef, label %CF, label %CF77 + +CF77: ; preds = %CF + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll b/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll new file mode 100644 index 0000000..404027b --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -march=arm64 + +; Make sure we are not crashing on this test. 
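+; (There are deliberately no CHECK lines: the RUN line does not pipe into +; FileCheck, so the test passes as long as llc compiles these llvm-stress-style +; reductions, which mix vector ashr with sitofp/uitofp to <4 x double> and a +; <2 x double> fptosi, without crashing.) 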
+define void @autogen_SD12881() { +BB: + %B17 = ashr <4 x i32> zeroinitializer, zeroinitializer + br label %CF + +CF: ; preds = %CF83, %CF, %BB + br i1 undef, label %CF, label %CF83 + +CF83: ; preds = %CF + %FC70 = sitofp <4 x i32> %B17 to <4 x double> + br label %CF +} + + +define void @autogen_SD12881_2() { +BB: + %B17 = ashr <4 x i32> zeroinitializer, zeroinitializer + br label %CF + +CF: ; preds = %CF83, %CF, %BB + br i1 undef, label %CF, label %CF83 + +CF83: ; preds = %CF + %FC70 = uitofp <4 x i32> %B17 to <4 x double> + br label %CF +} + +define void @_Z12my_example2bv() nounwind noinline ssp { +entry: + %0 = fptosi <2 x double> undef to <2 x i32> + store <2 x i32> %0, <2 x i32>* undef, align 8 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll b/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll new file mode 100644 index 0000000..a350ba1 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll @@ -0,0 +1,11 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple + +;CHECK-LABEL: Shuff: +;CHECK: tbl.8b +;CHECK: ret +define <8 x i8 > @Shuff(<8 x i8> %in, <8 x i8>* %out) nounwind ssp { + %value = shufflevector <8 x i8> %in, <8 x i8> zeroinitializer, <8 x i32> + ret <8 x i8> %value +} + + diff --git a/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll b/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll new file mode 100644 index 0000000..a73b707 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=arm64 + +; This test case tests an infinite loop bug in DAG combiner. +; It just keeps doing the following two replacements endlessly: +; (1) Replacing.3 0x2c509f0: v4i32 = any_extend 0x2c4cd08 [ORD=4] +; With: 0x2c4d128: v4i32 = sign_extend 0x2c4cd08 [ORD=4] +; +; (2) Replacing.2 0x2c4d128: v4i32 = sign_extend 0x2c4cd08 [ORD=4] +; With: 0x2c509f0: v4i32 = any_extend 0x2c4cd08 [ORD=4] +; As the (2) optimization from SIGN_EXTEND to ANY_EXTEND is meant +; to replace unused bits with undefined bits, we remove +; the (1) optimization (it doesn't make sense to replace undefined bits +; with signed bits). + +define <4 x i32> @infiniteLoop(<4 x i32> %in0, <4 x i16> %in1) { +entry: + %cmp.i = icmp sge <4 x i16> %in1, + %sext.i = sext <4 x i1> %cmp.i to <4 x i32> + %mul.i = mul <4 x i32> %in0, %sext.i + %sext = shl <4 x i32> %mul.i, + %vmovl.i.i = ashr <4 x i32> %sext, + ret <4 x i32> %vmovl.i.i +} \ No newline at end of file diff --git a/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll b/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll new file mode 100644 index 0000000..3949b85 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -verify-machineinstrs -march=arm64 | FileCheck %s + +; Check if sqshl/uqshl with constant shift amount can be selected. 
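+; (sqshl/uqshl saturate the shifted value; a constant shift amount in [0, 63] +; fits the scalar immediate encoding, e.g. "sqshl d0, d0, #36", so no extra +; register move for the amount should be needed.) 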
+define i64 @test_vqshld_s64_i(i64 %a) { +; CHECK-LABEL: test_vqshld_s64_i: +; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #36 + %1 = tail call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 36) + ret i64 %1 +} + +define i64 @test_vqshld_u64_i(i64 %a) { +; CHECK-LABEL: test_vqshld_u64_i: +; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #36 + %1 = tail call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 36) + ret i64 %1 +} + +declare i64 @llvm.aarch64.neon.uqshl.i64(i64, i64) +declare i64 @llvm.aarch64.neon.sqshl.i64(i64, i64) diff --git a/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll b/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll new file mode 100644 index 0000000..1b2d543 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -O0 -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +; The following test cases test shufflevector with masks that begin with UNDEF elements. +define <8 x i16> @test_vext_undef_traverse(<8 x i16> %in) { +;CHECK-LABEL: test_vext_undef_traverse: +;CHECK: {{ext.16b.*v0, #4}} + %vext = shufflevector <8 x i16> , <8 x i16> %in, <8 x i32> + ret <8 x i16> %vext +} + +define <8 x i16> @test_vext_undef_traverse2(<8 x i16> %in) { +;CHECK-LABEL: test_vext_undef_traverse2: +;CHECK: {{ext.16b.*v0, #6}} + %vext = shufflevector <8 x i16> %in, <8 x i16> , <8 x i32> + ret <8 x i16> %vext +} + +define <8 x i8> @test_vext_undef_traverse3(<8 x i8> %in) { +;CHECK-LABEL: test_vext_undef_traverse3: +;CHECK: {{ext.8b.*v0, #6}} + %vext = shufflevector <8 x i8> %in, <8 x i8> , <8 x i32> + ret <8 x i8> %vext +} diff --git a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll new file mode 100644 index 0000000..c4597d5 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll @@ -0,0 +1,67 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s -check-prefix=GENERIC + +define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: bar: +; CHECK: add.2d v[[REG:[0-9]+]], v0, v1 +; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1 +; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1 +; GENERIC-LABEL: bar: +; GENERIC: add v[[REG:[0-9]+]].2d, v0.2d, v1.2d +; GENERIC: add d[[REG3:[0-9]+]], d[[REG]], d1 +; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1 + %add = add <2 x i64> %a, %b + %vgetq_lane = extractelement <2 x i64> %add, i32 0 + %vgetq_lane2 = extractelement <2 x i64> %b, i32 0 + %add3 = add i64 %vgetq_lane, %vgetq_lane2 + %sub = sub i64 %vgetq_lane, %vgetq_lane2 + %vecinit = insertelement <2 x i64> undef, i64 %add3, i32 0 + %vecinit8 = insertelement <2 x i64> %vecinit, i64 %sub, i32 1 + ret <2 x i64> %vecinit8 +} + +define double @subdd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: subdd_su64: +; CHECK: sub d0, d1, d0 +; CHECK-NEXT: ret +; GENERIC-LABEL: subdd_su64: +; GENERIC: sub d0, d1, d0 +; GENERIC-NEXT: ret + %vecext = extractelement <2 x i64> %a, i32 0 + %vecext1 = extractelement <2 x i64> %b, i32 0 + %sub.i = sub nsw i64 %vecext1, %vecext + %retval = bitcast i64 %sub.i to double + ret double %retval +} + +define double @vaddd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vaddd_su64: +; CHECK: add d0, d1, d0 +; CHECK-NEXT: ret +; GENERIC-LABEL: vaddd_su64: +; GENERIC: add d0, d1, d0 +; GENERIC-NEXT: ret + %vecext = extractelement <2 x i64> %a, i32 0 + 
%vecext1 = extractelement <2 x i64> %b, i32 0 + %add.i = add nsw i64 %vecext1, %vecext + %retval = bitcast i64 %add.i to double + ret double %retval +} + +; sub MI doesn't access dsub register. +define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: add_sub_su64: +; CHECK: add d0, d1, d0 +; CHECK: sub d0, {{d[0-9]+}}, d0 +; CHECK-NEXT: ret +; GENERIC-LABEL: add_sub_su64: +; GENERIC: add d0, d1, d0 +; GENERIC: sub d0, {{d[0-9]+}}, d0 +; GENERIC-NEXT: ret + %vecext = extractelement <2 x i64> %a, i32 0 + %vecext1 = extractelement <2 x i64> %b, i32 0 + %add.i = add i64 %vecext1, %vecext + %sub.i = sub i64 0, %add.i + %retval = bitcast i64 %sub.i to double + ret double %retval +} diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll new file mode 100644 index 0000000..b713f0d --- /dev/null +++ b/test/CodeGen/AArch64/arm64-aapcs.ll @@ -0,0 +1,103 @@ +; RUN: llc -mtriple=arm64-linux-gnu -enable-misched=false < %s | FileCheck %s + +@var = global i32 0, align 4 + +define i128 @test_i128_align(i32, i128 %arg, i32 %after) { + store i32 %after, i32* @var, align 4 +; CHECK: str w4, [{{x[0-9]+}}, :lo12:var] + + ret i128 %arg +; CHECK: mov x0, x2 +; CHECK: mov x1, x3 +} + +@var64 = global i64 0, align 8 + + ; Check stack slots are 64-bit at all times. +define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short, + i32 %int, i64 %long) { + ; Part of last store. Blasted scheduler. +; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32] + + %ext_bool = zext i1 %bool to i64 + store volatile i64 %ext_bool, i64* @var64, align 8 +; CHECK: ldrb w[[EXT:[0-9]+]], [sp] +; CHECK: and x[[EXTED:[0-9]+]], x[[EXT]], #0x1 +; CHECK: str x[[EXTED]], [{{x[0-9]+}}, :lo12:var64] + + %ext_char = zext i8 %char to i64 + store volatile i64 %ext_char, i64* @var64, align 8 +; CHECK: ldrb w[[EXT:[0-9]+]], [sp, #8] +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_short = zext i16 %short to i64 + store volatile i64 %ext_short, i64* @var64, align 8 +; CHECK: ldrh w[[EXT:[0-9]+]], [sp, #16] +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_int = zext i32 %int to i64 + store volatile i64 %ext_int, i64* @var64, align 8 +; CHECK: ldr{{b?}} w[[EXT:[0-9]+]], [sp, #24] +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] + + store volatile i64 %long, i64* @var64, align 8 +; CHECK: str [[LONG]], [{{x[0-9]+}}, :lo12:var64] + + ret void +} + +; Make sure the callee does extensions (in the absence of zext/sext +; keyword on args) while we're here. + +define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) { + %ext_bool = zext i1 %bool to i64 + store volatile i64 %ext_bool, i64* @var64 +; CHECK: and [[EXT:x[0-9]+]], x0, #0x1 +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_char = sext i8 %char to i64 + store volatile i64 %ext_char, i64* @var64 +; CHECK: sxtb [[EXT:x[0-9]+]], w1 +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_short = zext i16 %short to i64 + store volatile i64 %ext_short, i64* @var64 +; CHECK: and [[EXT:x[0-9]+]], x2, #0xffff +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_int = zext i32 %int to i64 + store volatile i64 %ext_int, i64* @var64 +; CHECK: ubfx [[EXT:x[0-9]+]], x3, #0, #32 +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + ret void +} + +declare void @variadic(i32 %a, ...) + + ; Under AAPCS variadic functions have the same calling convention as + ; others. The extra arguments should go in registers rather than on the stack. 
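 ; (For contrast: under the Darwin arm64 ABI anonymous arguments always go on ; the stack; under AAPCS, as checked below, the i64 lands in w1 and the ; double in d0.) 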
+define void @test_variadic() { + call void(i32, ...)* @variadic(i32 0, i64 1, double 2.0) +; CHECK: fmov d0, #2.0 +; CHECK: orr w1, wzr, #0x1 +; CHECK: bl variadic + ret void +} + +; We weren't marking x7 as used after deciding that the i128 didn't fit into +; registers and putting the first half on the stack, so the *second* half went +; into x7. Yuck! +define i128 @test_i128_shadow([7 x i64] %x0_x6, i128 %sp) { +; CHECK-LABEL: test_i128_shadow: +; CHECK: ldp x0, x1, [sp] + + ret i128 %sp +} + +; This test is to check if fp128 can be correctly handled on the stack. +define fp128 @test_fp128([8 x float] %arg0, fp128 %arg1) { +; CHECK-LABEL: test_fp128: +; CHECK: ldr {{q[0-9]+}}, [sp] + ret fp128 %arg1 +} diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll new file mode 100644 index 0000000..92db392 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll @@ -0,0 +1,191 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s +target triple = "arm64-apple-ios7.0.0" + +; rdar://13625505 +; Here we have 9 fixed integer arguments; the 9th argument is on the stack, and the +; varargs start right after it at 8-byte alignment. +define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp { +; CHECK-LABEL: fn9: +; 9th fixed argument +; CHECK: ldr {{w[0-9]+}}, [sp, #64] +; CHECK: add [[ARGS:x[0-9]+]], sp, #72 +; CHECK: add {{x[0-9]+}}, [[ARGS]], #8 +; First vararg +; CHECK: ldr {{w[0-9]+}}, [sp, #72] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8 +; Second vararg +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8 +; Third vararg +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %args = alloca i8*, align 8 + %a10 = alloca i32, align 4 + %a11 = alloca i32, align 4 + %a12 = alloca i32, align 4 + store i32 %a1, i32* %1, align 4 + store i32 %a2, i32* %2, align 4 + store i32 %a3, i32* %3, align 4 + store i32 %a4, i32* %4, align 4 + store i32 %a5, i32* %5, align 4 + store i32 %a6, i32* %6, align 4 + store i32 %a7, i32* %7, align 4 + store i32 %a8, i32* %8, align 4 + store i32 %a9, i32* %9, align 4 + %10 = bitcast i8** %args to i8* + call void @llvm.va_start(i8* %10) + %11 = va_arg i8** %args, i32 + store i32 %11, i32* %a10, align 4 + %12 = va_arg i8** %args, i32 + store i32 %12, i32* %a11, align 4 + %13 = va_arg i8** %args, i32 + store i32 %13, i32* %a12, align 4 + ret void +} + +declare void @llvm.va_start(i8*) nounwind + +define i32 @main() nounwind ssp { +; CHECK-LABEL: main: +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: str {{x[0-9]+}}, [sp, #8] +; CHECK: str {{w[0-9]+}}, [sp] + %a1 = alloca i32, align 4 + %a2 = alloca i32, align 4 + %a3 = alloca i32, align 4 + %a4 = alloca i32, align 4 + %a5 = alloca i32, align 4 + %a6 = alloca i32, align 4 + %a7 = alloca i32, align 4 + %a8 = alloca i32, align 4 + %a9 = alloca i32, align 4 + %a10 = alloca i32, align 4 + %a11 = alloca i32, align 4 + %a12 = alloca i32, align 4 + store i32 1, i32* %a1, align 4 + store i32 2, i32* %a2, align 4 + store i32 3, i32* %a3, align 4 + store i32 4, i32* %a4, align 4 + store i32 5, i32* %a5, align 4 + store i32 6, i32* %a6, align 4 + store i32 7, i32* %a7, align 4 + store i32 8, i32* %a8, align 4 + store i32 9, i32* %a9, align 4 + 
store i32 10, i32* %a10, align 4 + store i32 11, i32* %a11, align 4 + store i32 12, i32* %a12, align 4 + %1 = load i32* %a1, align 4 + %2 = load i32* %a2, align 4 + %3 = load i32* %a3, align 4 + %4 = load i32* %a4, align 4 + %5 = load i32* %a5, align 4 + %6 = load i32* %a6, align 4 + %7 = load i32* %a7, align 4 + %8 = load i32* %a8, align 4 + %9 = load i32* %a9, align 4 + %10 = load i32* %a10, align 4 + %11 = load i32* %a11, align 4 + %12 = load i32* %a12, align 4 + call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...)* @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12) + ret i32 0 +} + +;rdar://13668483 +@.str = private unnamed_addr constant [4 x i8] c"fmt\00", align 1 +define void @foo(i8* %fmt, ...) nounwind { +entry: +; CHECK-LABEL: foo: +; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8 +; CHECK: ldr {{w[0-9]+}}, [sp, #48] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15 +; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0 +; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]] + %fmt.addr = alloca i8*, align 8 + %args = alloca i8*, align 8 + %vc = alloca i32, align 4 + %vv = alloca <4 x i32>, align 16 + store i8* %fmt, i8** %fmt.addr, align 8 + %args1 = bitcast i8** %args to i8* + call void @llvm.va_start(i8* %args1) + %0 = va_arg i8** %args, i32 + store i32 %0, i32* %vc, align 4 + %1 = va_arg i8** %args, <4 x i32> + store <4 x i32> %1, <4 x i32>* %vv, align 16 + ret void +} + +define void @bar(i32 %x, <4 x i32> %y) nounwind { +entry: +; CHECK-LABEL: bar: +; CHECK: str {{q[0-9]+}}, [sp, #16] +; CHECK: str {{x[0-9]+}}, [sp] + %x.addr = alloca i32, align 4 + %y.addr = alloca <4 x i32>, align 16 + store i32 %x, i32* %x.addr, align 4 + store <4 x i32> %y, <4 x i32>* %y.addr, align 16 + %0 = load i32* %x.addr, align 4 + %1 = load <4 x i32>* %y.addr, align 16 + call void (i8*, ...)* @foo(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1) + ret void +} + +; rdar://13668927 +; When passing 16-byte aligned small structs as vararg, make sure the caller +; side is 16-byte aligned on stack. +%struct.s41 = type { i32, i16, i32, i16 } +define void @foo2(i8* %fmt, ...) 
nounwind { +entry: +; CHECK-LABEL: foo2: +; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8 +; CHECK: ldr {{w[0-9]+}}, [sp, #48] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15 +; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0 +; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]] + %fmt.addr = alloca i8*, align 8 + %args = alloca i8*, align 8 + %vc = alloca i32, align 4 + %vs = alloca %struct.s41, align 16 + store i8* %fmt, i8** %fmt.addr, align 8 + %args1 = bitcast i8** %args to i8* + call void @llvm.va_start(i8* %args1) + %0 = va_arg i8** %args, i32 + store i32 %0, i32* %vc, align 4 + %ap.cur = load i8** %args + %1 = getelementptr i8* %ap.cur, i32 15 + %2 = ptrtoint i8* %1 to i64 + %3 = and i64 %2, -16 + %ap.align = inttoptr i64 %3 to i8* + %ap.next = getelementptr i8* %ap.align, i32 16 + store i8* %ap.next, i8** %args + %4 = bitcast i8* %ap.align to %struct.s41* + %5 = bitcast %struct.s41* %vs to i8* + %6 = bitcast %struct.s41* %4 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* %6, i64 16, i32 16, i1 false) + ret void +} +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind + +define void @bar2(i32 %x, i128 %s41.coerce) nounwind { +entry: +; CHECK-LABEL: bar2: +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: str {{x[0-9]+}}, [sp] + %x.addr = alloca i32, align 4 + %s41 = alloca %struct.s41, align 16 + store i32 %x, i32* %x.addr, align 4 + %0 = bitcast %struct.s41* %s41 to i128* + store i128 %s41.coerce, i128* %0, align 1 + %1 = load i32* %x.addr, align 4 + %2 = bitcast %struct.s41* %s41 to i128* + %3 = load i128* %2, align 1 + call void (i8*, ...)* @foo2(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3) + ret void +} diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll new file mode 100644 index 0000000..e2de434 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-abi.ll @@ -0,0 +1,238 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s +; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s +target triple = "arm64-apple-darwin" + +; rdar://9932559 +define i64 @i8i16callee(i64 %a1, i64 %a2, i64 %a3, i8 signext %a4, i16 signext %a5, i64 %a6, i64 %a7, i64 %a8, i8 signext %b1, i16 signext %b2, i8 signext %b3, i8 signext %b4) nounwind readnone noinline { +entry: +; CHECK-LABEL: i8i16callee: +; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5. +; They are i8, i16, i8 and i8. +; CHECK: ldrsb {{w[0-9]+}}, [sp, #5] +; CHECK: ldrsh {{w[0-9]+}}, [sp, #2] +; CHECK: ldrsb {{w[0-9]+}}, [sp] +; CHECK: ldrsb {{w[0-9]+}}, [sp, #4] +; FAST-LABEL: i8i16callee: +; FAST: ldrb {{w[0-9]+}}, [sp, #5] +; FAST: ldrb {{w[0-9]+}}, [sp, #4] +; FAST: ldrh {{w[0-9]+}}, [sp, #2] +; FAST: ldrb {{w[0-9]+}}, [sp] + %conv = sext i8 %a4 to i64 + %conv3 = sext i16 %a5 to i64 + %conv8 = sext i8 %b1 to i64 + %conv9 = sext i16 %b2 to i64 + %conv11 = sext i8 %b3 to i64 + %conv13 = sext i8 %b4 to i64 + %add10 = add i64 %a2, %a1 + %add12 = add i64 %add10, %a3 + %add14 = add i64 %add12, %conv + %add = add i64 %add14, %conv3 + %add1 = add i64 %add, %a6 + %add2 = add i64 %add1, %a7 + %add4 = add i64 %add2, %a8 + %add5 = add i64 %add4, %conv8 + %add6 = add i64 %add5, %conv9 + %add7 = add i64 %add6, %conv11 + %add15 = add i64 %add7, %conv13 + %sext = shl i64 %add15, 32 + %conv17 = ashr exact i64 %sext, 32 + ret i64 %conv17 +} + +define i32 @i8i16caller() nounwind readnone { +entry: +; CHECK: i8i16caller +; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5. 
+; They are i8, i16, i8 and i8. +; CHECK: strb {{w[0-9]+}}, [sp, #5] +; CHECK: strb {{w[0-9]+}}, [sp, #4] +; CHECK: strh {{w[0-9]+}}, [sp, #2] +; CHECK: strb {{w[0-9]+}}, [sp] +; CHECK: bl +; FAST: i8i16caller +; FAST: strb {{w[0-9]+}}, [sp] +; FAST: strh {{w[0-9]+}}, [sp, #2] +; FAST: strb {{w[0-9]+}}, [sp, #4] +; FAST: strb {{w[0-9]+}}, [sp, #5] +; FAST: bl + %call = tail call i64 @i8i16callee(i64 0, i64 1, i64 2, i8 signext 3, i16 signext 4, i64 5, i64 6, i64 7, i8 signext 97, i16 signext 98, i8 signext 99, i8 signext 100) + %conv = trunc i64 %call to i32 + ret i32 %conv +} + +; rdar://12651543 +define double @circle_center([2 x float] %a) nounwind ssp { + %call = tail call double @ext([2 x float] %a) nounwind +; CHECK: circle_center +; CHECK: bl + ret double %call +} +declare double @ext([2 x float]) + +; rdar://12656141 +; 16-byte vector should be aligned at 16-byte when passing on stack. +; A double argument will be passed on stack, so vecotr should be at sp+16. +define double @fixed_4i(<4 x i32>* nocapture %in) nounwind { +entry: +; CHECK: fixed_4i +; CHECK: str [[REG_1:q[0-9]+]], [sp, #16] +; FAST: fixed_4i +; FAST: sub sp, sp, #64 +; FAST: mov x[[ADDR:[0-9]+]], sp +; FAST: str [[REG_1:q[0-9]+]], [x[[ADDR]], #16] + %0 = load <4 x i32>* %in, align 16 + %call = tail call double @args_vec_4i(double 3.000000e+00, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, double 3.000000e+00, <4 x i32> %0, i8 signext 3) + ret double %call +} +declare double @args_vec_4i(double, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, double, <4 x i32>, i8 signext) + +; rdar://12695237 +; d8 at sp, i in register w0. +@g_d = common global double 0.000000e+00, align 8 +define void @test1(float %f1, double %d1, double %d2, double %d3, double %d4, + double %d5, double %d6, double %d7, double %d8, i32 %i) nounwind ssp { +entry: +; CHECK: test1 +; CHECK: ldr [[REG_1:d[0-9]+]], [sp] +; CHECK: scvtf [[REG_2:s[0-9]+]], w0 +; CHECK: fadd s0, [[REG_2]], s0 + %conv = sitofp i32 %i to float + %add = fadd float %conv, %f1 + %conv1 = fpext float %add to double + %add2 = fadd double %conv1, %d7 + %add3 = fadd double %add2, %d8 + store double %add3, double* @g_d, align 8 + ret void +} + +; i9 at sp, d1 in register s0. +define void @test2(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, float %d1) nounwind ssp { +entry: +; CHECK: test2 +; CHECK: scvtf [[REG_2:s[0-9]+]], w0 +; CHECK: fadd s0, [[REG_2]], s0 +; CHECK: ldr [[REG_1:s[0-9]+]], [sp] + %conv = sitofp i32 %i1 to float + %add = fadd float %conv, %d1 + %conv1 = fpext float %add to double + %conv2 = sitofp i32 %i8 to double + %add3 = fadd double %conv2, %conv1 + %conv4 = sitofp i32 %i9 to double + %add5 = fadd double %conv4, %add3 + store double %add5, double* @g_d, align 8 + ret void +} + +; rdar://12648441 +; Check alignment on stack for v64, f64, i64, f32, i32. 
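+; Darwin packs stack arguments to their natural alignment rather than
+; rounding every slot up to 8 bytes; the offsets checked below (an f32 or
+; i32 at [sp], with the next wider argument at its naturally aligned
+; offset) all follow from that rule.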
+define double @test3(<2 x i32>* nocapture %in) nounwind { +entry: +; CHECK: test3 +; CHECK: str [[REG_1:d[0-9]+]], [sp, #8] +; FAST: test3 +; FAST: sub sp, sp, #32 +; FAST: mov x[[ADDR:[0-9]+]], sp +; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8] + %0 = load <2 x i32>* %in, align 8 + %call = tail call double @args_vec_2i(double 3.000000e+00, <2 x i32> %0, + <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, + <2 x i32> %0, float 3.000000e+00, <2 x i32> %0, i8 signext 3) + ret double %call +} +declare double @args_vec_2i(double, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, + <2 x i32>, <2 x i32>, <2 x i32>, float, <2 x i32>, i8 signext) + +define double @test4(double* nocapture %in) nounwind { +entry: +; CHECK: test4 +; CHECK: str [[REG_1:d[0-9]+]], [sp, #8] +; CHECK: str [[REG_2:w[0-9]+]], [sp] +; CHECK: orr w0, wzr, #0x3 + %0 = load double* %in, align 8 + %call = tail call double @args_f64(double 3.000000e+00, double %0, double %0, + double %0, double %0, double %0, double %0, double %0, + float 3.000000e+00, double %0, i8 signext 3) + ret double %call +} +declare double @args_f64(double, double, double, double, double, double, double, + double, float, double, i8 signext) + +define i64 @test5(i64* nocapture %in) nounwind { +entry: +; CHECK: test5 +; CHECK: strb [[REG_3:w[0-9]+]], [sp, #16] +; CHECK: str [[REG_1:x[0-9]+]], [sp, #8] +; CHECK: str [[REG_2:w[0-9]+]], [sp] + %0 = load i64* %in, align 8 + %call = tail call i64 @args_i64(i64 3, i64 %0, i64 %0, i64 %0, i64 %0, i64 %0, + i64 %0, i64 %0, i32 3, i64 %0, i8 signext 3) + ret i64 %call +} +declare i64 @args_i64(i64, i64, i64, i64, i64, i64, i64, i64, i32, i64, + i8 signext) + +define i32 @test6(float* nocapture %in) nounwind { +entry: +; CHECK: test6 +; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8] +; CHECK: str [[REG_1:s[0-9]+]], [sp, #4] +; CHECK: strh [[REG_3:w[0-9]+]], [sp] + %0 = load float* %in, align 4 + %call = tail call i32 @args_f32(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, + float 6.0, float 7.0, float 8.0, i16 signext 3, float %0, + i8 signext 3) + ret i32 %call +} +declare i32 @args_f32(i32, i32, i32, i32, i32, i32, i32, i32, + float, float, float, float, float, float, float, float, + i16 signext, float, i8 signext) + +define i32 @test7(i32* nocapture %in) nounwind { +entry: +; CHECK: test7 +; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8] +; CHECK: str [[REG_1:w[0-9]+]], [sp, #4] +; CHECK: strh [[REG_3:w[0-9]+]], [sp] + %0 = load i32* %in, align 4 + %call = tail call i32 @args_i32(i32 3, i32 %0, i32 %0, i32 %0, i32 %0, i32 %0, + i32 %0, i32 %0, i16 signext 3, i32 %0, i8 signext 4) + ret i32 %call +} +declare i32 @args_i32(i32, i32, i32, i32, i32, i32, i32, i32, i16 signext, i32, + i8 signext) + +define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind { +entry: +; CHECK: test8 +; CHECK: strb {{w[0-9]+}}, [sp, #3] +; CHECK: strb wzr, [sp, #2] +; CHECK: strb {{w[0-9]+}}, [sp, #1] +; CHECK: strb wzr, [sp] +; CHECK: bl +; FAST: test8 +; FAST: strb {{w[0-9]+}}, [sp] +; FAST: strb {{w[0-9]+}}, [sp, #1] +; FAST: strb {{w[0-9]+}}, [sp, #2] +; FAST: strb {{w[0-9]+}}, [sp, #3] +; FAST: bl + tail call void @args_i1(i1 zeroext false, i1 zeroext true, i1 zeroext false, + i1 zeroext true, i1 zeroext false, i1 zeroext true, + i1 zeroext false, i1 zeroext true, i1 zeroext false, + i1 zeroext true, i1 zeroext false, i1 zeroext true) + ret i32 0 +} + +declare void @args_i1(i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext, + i1 zeroext, i1 zeroext, i1 zeroext, i1 
zeroext, + i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext) + +define i32 @i1_stack_incoming(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, + i64 %g, i64 %h, i64 %i, i1 zeroext %j) { +; CHECK-LABEL: i1_stack_incoming: +; CHECK: ldrb w0, [sp, #8] +; CHECK: ret + %v = zext i1 %j to i32 + ret i32 %v +} diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll new file mode 100644 index 0000000..44c5a07 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-abi_align.ll @@ -0,0 +1,532 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s +; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s +target triple = "arm64-apple-darwin" + +; rdar://12648441 +; Generated from arm64-arguments.c with -O2. +; Test passing structs with size < 8, < 16 and > 16 +; with alignment of 16 and without + +; Structs with size < 8 +%struct.s38 = type { i32, i16 } +; With alignment of 16, the size will be padded to multiple of 16 bytes. +%struct.s39 = type { i32, i16, [10 x i8] } +; Structs with size < 16 +%struct.s40 = type { i32, i16, i32, i16 } +%struct.s41 = type { i32, i16, i32, i16 } +; Structs with size > 16 +%struct.s42 = type { i32, i16, i32, i16, i32, i16 } +%struct.s43 = type { i32, i16, i32, i16, i32, i16, [10 x i8] } + +@g38 = common global %struct.s38 zeroinitializer, align 4 +@g38_2 = common global %struct.s38 zeroinitializer, align 4 +@g39 = common global %struct.s39 zeroinitializer, align 16 +@g39_2 = common global %struct.s39 zeroinitializer, align 16 +@g40 = common global %struct.s40 zeroinitializer, align 4 +@g40_2 = common global %struct.s40 zeroinitializer, align 4 +@g41 = common global %struct.s41 zeroinitializer, align 16 +@g41_2 = common global %struct.s41 zeroinitializer, align 16 +@g42 = common global %struct.s42 zeroinitializer, align 4 +@g42_2 = common global %struct.s42 zeroinitializer, align 4 +@g43 = common global %struct.s43 zeroinitializer, align 16 +@g43_2 = common global %struct.s43 zeroinitializer, align 16 + +; structs with size < 8 bytes, passed via i64 in x1 and x2 +define i32 @f38(i32 %i, i64 %s1.coerce, i64 %s2.coerce) #0 { +entry: +; CHECK: f38 +; CHECK: add w[[A:[0-9]+]], w1, w0 +; CHECK: add {{w[0-9]+}}, w[[A]], w2 + %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce to i32 + %s1.sroa.1.4.extract.shift = lshr i64 %s1.coerce, 32 + %s2.sroa.0.0.extract.trunc = trunc i64 %s2.coerce to i32 + %s2.sroa.1.4.extract.shift = lshr i64 %s2.coerce, 32 + %sext8 = shl nuw nsw i64 %s1.sroa.1.4.extract.shift, 16 + %sext = trunc i64 %sext8 to i32 + %conv = ashr exact i32 %sext, 16 + %sext1011 = shl nuw nsw i64 %s2.sroa.1.4.extract.shift, 16 + %sext10 = trunc i64 %sext1011 to i32 + %conv6 = ashr exact i32 %sext10, 16 + %add = add i32 %s1.sroa.0.0.extract.trunc, %i + %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller38() #1 { +entry: +; CHECK: caller38 +; CHECK: ldr x1, +; CHECK: ldr x2, + %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4 + %1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4 + %call = tail call i32 @f38(i32 3, i64 %0, i64 %1) #5 + ret i32 %call +} + +declare i32 @f38_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, i64 %s1.coerce, i64 %s2.coerce) #0 + +; structs with size < 8 bytes, passed on stack at [sp+8] and [sp+16] +; i9 at [sp] +define i32 @caller38_stack() #1 { +entry: +; CHECK: caller38_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8] +; CHECK: 
movz w[[C:[0-9]+]], #0x9
+; CHECK: str w[[C]], [sp]
+ %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4
+ %1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4
+ %call = tail call i32 @f38_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+ i32 7, i32 8, i32 9, i64 %0, i64 %1) #5
+ ret i32 %call
+}
+
+; structs with size < 8 bytes, alignment of 16
+; passed via i128 in x1 and x3
+define i32 @f39(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 {
+entry:
+; CHECK: f39
+; CHECK: add w[[A:[0-9]+]], w1, w0
+; CHECK: add {{w[0-9]+}}, w[[A]], w3
+ %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32
+ %s1.sroa.1.4.extract.shift = lshr i128 %s1.coerce, 32
+ %s2.sroa.0.0.extract.trunc = trunc i128 %s2.coerce to i32
+ %s2.sroa.1.4.extract.shift = lshr i128 %s2.coerce, 32
+ %sext8 = shl nuw nsw i128 %s1.sroa.1.4.extract.shift, 16
+ %sext = trunc i128 %sext8 to i32
+ %conv = ashr exact i32 %sext, 16
+ %sext1011 = shl nuw nsw i128 %s2.sroa.1.4.extract.shift, 16
+ %sext10 = trunc i128 %sext1011 to i32
+ %conv6 = ashr exact i32 %sext10, 16
+ %add = add i32 %s1.sroa.0.0.extract.trunc, %i
+ %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc
+ %add4 = add i32 %add3, %conv
+ %add7 = add i32 %add4, %conv6
+ ret i32 %add7
+}
+
+define i32 @caller39() #1 {
+entry:
+; CHECK: caller39
+; CHECK: ldp x1, x2,
+; CHECK: ldp x3, x4,
+ %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16
+ %1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16
+ %call = tail call i32 @f39(i32 3, i128 %0, i128 %1) #5
+ ret i32 %call
+}
+
+declare i32 @f39_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce) #0
+
+; structs with size < 8 bytes, alignment 16
+; passed on stack at [sp+16] and [sp+32]
+define i32 @caller39_stack() #1 {
+entry:
+; CHECK: caller39_stack
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: movz w[[C:[0-9]+]], #0x9
+; CHECK: str w[[C]], [sp]
+ %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16
+ %1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16
+ %call = tail call i32 @f39_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+ i32 7, i32 8, i32 9, i128 %0, i128 %1) #5
+ ret i32 %call
+}
+
+; structs with size < 16 bytes
+; passed via [2 x i64] in x1-x2 and x3-x4
+define i32 @f40(i32 %i, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0 {
+entry:
+; CHECK: f40
+; CHECK: add w[[A:[0-9]+]], w1, w0
+; CHECK: add {{w[0-9]+}}, w[[A]], w3
+ %s1.coerce.fca.0.extract = extractvalue [2 x i64] %s1.coerce, 0
+ %s2.coerce.fca.0.extract = extractvalue [2 x i64] %s2.coerce, 0
+ %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce.fca.0.extract to i32
+ %s2.sroa.0.0.extract.trunc = trunc i64 %s2.coerce.fca.0.extract to i32
+ %s1.sroa.0.4.extract.shift = lshr i64 %s1.coerce.fca.0.extract, 32
+ %sext8 = shl nuw nsw i64 %s1.sroa.0.4.extract.shift, 16
+ %sext = trunc i64 %sext8 to i32
+ %conv = ashr exact i32 %sext, 16
+ %s2.sroa.0.4.extract.shift = lshr i64 %s2.coerce.fca.0.extract, 32
+ %sext1011 = shl nuw nsw i64 %s2.sroa.0.4.extract.shift, 16
+ %sext10 = trunc i64 %sext1011 to i32
+ %conv6 = ashr exact i32 %sext10, 16
+ %add = add i32 %s1.sroa.0.0.extract.trunc, %i
+ %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc
+ %add4 = add i32 %add3, %conv
+ %add7 = add i32 %add4, %conv6
+ ret i32 %add7
+}
+
+define i32 @caller40() #1 {
+entry:
+; CHECK: caller40
+; CHECK: ldp x1, x2,
+; CHECK: ldp x3, x4,
+ %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*),
align 4 + %1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4 + %call = tail call i32 @f40(i32 3, [2 x i64] %0, [2 x i64] %1) #5 + ret i32 %call +} + +declare i32 @f40_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0 + +; structs with size < 16 bytes +; passed on stack at [sp+8] and [sp+24] +define i32 @caller40_stack() #1 { +entry: +; CHECK: caller40_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #24] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4 + %1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4 + %call = tail call i32 @f40_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, i32 9, [2 x i64] %0, [2 x i64] %1) #5 + ret i32 %call +} + +; structs with size < 16 bytes, alignment of 16 +; passed via i128 in x1 and x3 +define i32 @f41(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 { +entry: +; CHECK: f41 +; CHECK: add w[[A:[0-9]+]], w1, w0 +; CHECK: add {{w[0-9]+}}, w[[A]], w3 + %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32 + %s1.sroa.1.4.extract.shift = lshr i128 %s1.coerce, 32 + %s2.sroa.0.0.extract.trunc = trunc i128 %s2.coerce to i32 + %s2.sroa.1.4.extract.shift = lshr i128 %s2.coerce, 32 + %sext8 = shl nuw nsw i128 %s1.sroa.1.4.extract.shift, 16 + %sext = trunc i128 %sext8 to i32 + %conv = ashr exact i32 %sext, 16 + %sext1011 = shl nuw nsw i128 %s2.sroa.1.4.extract.shift, 16 + %sext10 = trunc i128 %sext1011 to i32 + %conv6 = ashr exact i32 %sext10, 16 + %add = add i32 %s1.sroa.0.0.extract.trunc, %i + %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller41() #1 { +entry: +; CHECK: caller41 +; CHECK: ldp x1, x2, +; CHECK: ldp x3, x4, + %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16 + %1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16 + %call = tail call i32 @f41(i32 3, i128 %0, i128 %1) #5 + ret i32 %call +} + +declare i32 @f41_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce) #0 + +; structs with size < 16 bytes, alignment of 16 +; passed on stack at [sp+16] and [sp+32] +define i32 @caller41_stack() #1 { +entry: +; CHECK: caller41_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16 + %1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16 + %call = tail call i32 @f41_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, i32 9, i128 %0, i128 %1) #5 + ret i32 %call +} + +; structs with size of 22 bytes, passed indirectly in x1 and x2 +define i32 @f42(i32 %i, %struct.s42* nocapture %s1, %struct.s42* nocapture %s2) #2 { +entry: +; CHECK: f42 +; CHECK: ldr w[[A:[0-9]+]], [x1] +; CHECK: ldr w[[B:[0-9]+]], [x2] +; CHECK: add w[[C:[0-9]+]], w[[A]], w0 +; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]] +; FAST: f42 +; FAST: ldr w[[A:[0-9]+]], [x1] +; FAST: ldr w[[B:[0-9]+]], [x2] +; FAST: add w[[C:[0-9]+]], w[[A]], w0 +; FAST: add {{w[0-9]+}}, w[[C]], w[[B]] + %i1 = getelementptr inbounds %struct.s42* %s1, i64 0, i32 0 + %0 = load i32* %i1, align 4, !tbaa !0 + %i2 = getelementptr inbounds %struct.s42* %s2, i64 0, i32 0 + 
%1 = load i32* %i2, align 4, !tbaa !0 + %s = getelementptr inbounds %struct.s42* %s1, i64 0, i32 1 + %2 = load i16* %s, align 2, !tbaa !3 + %conv = sext i16 %2 to i32 + %s5 = getelementptr inbounds %struct.s42* %s2, i64 0, i32 1 + %3 = load i16* %s5, align 2, !tbaa !3 + %conv6 = sext i16 %3 to i32 + %add = add i32 %0, %i + %add3 = add i32 %add, %1 + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +; For s1, we allocate a 22-byte space, pass its address via x1 +define i32 @caller42() #3 { +entry: +; CHECK: caller42 +; CHECK: str {{x[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; CHECK: str {{x[0-9]+}}, [sp, #16] +; CHECK: str {{q[0-9]+}}, [sp] +; CHECK: add x1, sp, #32 +; CHECK: mov x2, sp +; Space for s1 is allocated at sp+32 +; Space for s2 is allocated at sp + +; FAST: caller42 +; FAST: sub sp, sp, #96 +; Space for s1 is allocated at fp-24 = sp+72 +; Space for s2 is allocated at sp+48 +; FAST: sub x[[A:[0-9]+]], x29, #24 +; FAST: add x[[A:[0-9]+]], sp, #48 +; Call memcpy with size = 24 (0x18) +; FAST: orr {{x[0-9]+}}, xzr, #0x18 + %tmp = alloca %struct.s42, align 4 + %tmp1 = alloca %struct.s42, align 4 + %0 = bitcast %struct.s42* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s42* @g42 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s42* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s42* @g42_2 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %call = call i32 @f42(i32 3, %struct.s42* %tmp, %struct.s42* %tmp1) #5 + ret i32 %call +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #4 + +declare i32 @f42_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, %struct.s42* nocapture %s1, + %struct.s42* nocapture %s2) #2 + +define i32 @caller42_stack() #3 { +entry: +; CHECK: caller42_stack +; CHECK: mov x29, sp +; CHECK: sub sp, sp, #96 +; CHECK: stur {{x[0-9]+}}, [x29, #-16] +; CHECK: stur {{q[0-9]+}}, [x29, #-32] +; CHECK: str {{x[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; Space for s1 is allocated at x29-32 = sp+64 +; Space for s2 is allocated at sp+32 +; CHECK: add x[[B:[0-9]+]], sp, #32 +; CHECK: str x[[B]], [sp, #16] +; CHECK: sub x[[A:[0-9]+]], x29, #32 +; Address of s1 is passed on stack at sp+8 +; CHECK: str x[[A]], [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + +; FAST: caller42_stack +; Space for s1 is allocated at fp-24 +; Space for s2 is allocated at fp-48 +; FAST: sub x[[A:[0-9]+]], x29, #24 +; FAST: sub x[[B:[0-9]+]], x29, #48 +; Call memcpy with size = 24 (0x18) +; FAST: orr {{x[0-9]+}}, xzr, #0x18 +; FAST: str {{w[0-9]+}}, [sp] +; Address of s1 is passed on stack at sp+8 +; FAST: str {{x[0-9]+}}, [sp, #8] +; FAST: str {{x[0-9]+}}, [sp, #16] + %tmp = alloca %struct.s42, align 4 + %tmp1 = alloca %struct.s42, align 4 + %0 = bitcast %struct.s42* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s42* @g42 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s42* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s42* @g42_2 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %call = call i32 @f42_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, + i32 8, i32 9, %struct.s42* %tmp, %struct.s42* %tmp1) #5 + ret i32 %call +} + +; structs with size of 22 bytes, alignment of 16 +; passed indirectly in x1 and x2 +define i32 @f43(i32 %i, %struct.s43* 
nocapture %s1, %struct.s43* nocapture %s2) #2 { +entry: +; CHECK: f43 +; CHECK: ldr w[[A:[0-9]+]], [x1] +; CHECK: ldr w[[B:[0-9]+]], [x2] +; CHECK: add w[[C:[0-9]+]], w[[A]], w0 +; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]] +; FAST: f43 +; FAST: ldr w[[A:[0-9]+]], [x1] +; FAST: ldr w[[B:[0-9]+]], [x2] +; FAST: add w[[C:[0-9]+]], w[[A]], w0 +; FAST: add {{w[0-9]+}}, w[[C]], w[[B]] + %i1 = getelementptr inbounds %struct.s43* %s1, i64 0, i32 0 + %0 = load i32* %i1, align 4, !tbaa !0 + %i2 = getelementptr inbounds %struct.s43* %s2, i64 0, i32 0 + %1 = load i32* %i2, align 4, !tbaa !0 + %s = getelementptr inbounds %struct.s43* %s1, i64 0, i32 1 + %2 = load i16* %s, align 2, !tbaa !3 + %conv = sext i16 %2 to i32 + %s5 = getelementptr inbounds %struct.s43* %s2, i64 0, i32 1 + %3 = load i16* %s5, align 2, !tbaa !3 + %conv6 = sext i16 %3 to i32 + %add = add i32 %0, %i + %add3 = add i32 %add, %1 + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller43() #3 { +entry: +; CHECK: caller43 +; CHECK: str {{q[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; CHECK: str {{q[0-9]+}}, [sp, #16] +; CHECK: str {{q[0-9]+}}, [sp] +; CHECK: add x1, sp, #32 +; CHECK: mov x2, sp +; Space for s1 is allocated at sp+32 +; Space for s2 is allocated at sp + +; FAST: caller43 +; FAST: mov x29, sp +; Space for s1 is allocated at sp+32 +; Space for s2 is allocated at sp +; FAST: add x1, sp, #32 +; FAST: mov x2, sp +; FAST: str {{x[0-9]+}}, [sp, #32] +; FAST: str {{x[0-9]+}}, [sp, #40] +; FAST: str {{x[0-9]+}}, [sp, #48] +; FAST: str {{x[0-9]+}}, [sp, #56] +; FAST: str {{x[0-9]+}}, [sp] +; FAST: str {{x[0-9]+}}, [sp, #8] +; FAST: str {{x[0-9]+}}, [sp, #16] +; FAST: str {{x[0-9]+}}, [sp, #24] + %tmp = alloca %struct.s43, align 16 + %tmp1 = alloca %struct.s43, align 16 + %0 = bitcast %struct.s43* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s43* @g43 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s43* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s43* @g43_2 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %call = call i32 @f43(i32 3, %struct.s43* %tmp, %struct.s43* %tmp1) #5 + ret i32 %call +} + +declare i32 @f43_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, %struct.s43* nocapture %s1, + %struct.s43* nocapture %s2) #2 + +define i32 @caller43_stack() #3 { +entry: +; CHECK: caller43_stack +; CHECK: mov x29, sp +; CHECK: sub sp, sp, #96 +; CHECK: stur {{q[0-9]+}}, [x29, #-16] +; CHECK: stur {{q[0-9]+}}, [x29, #-32] +; CHECK: str {{q[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; Space for s1 is allocated at x29-32 = sp+64 +; Space for s2 is allocated at sp+32 +; CHECK: add x[[B:[0-9]+]], sp, #32 +; CHECK: str x[[B]], [sp, #16] +; CHECK: sub x[[A:[0-9]+]], x29, #32 +; Address of s1 is passed on stack at sp+8 +; CHECK: str x[[A]], [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + +; FAST: caller43_stack +; FAST: sub sp, sp, #96 +; Space for s1 is allocated at fp-32 = sp+64 +; Space for s2 is allocated at sp+32 +; FAST: sub x[[A:[0-9]+]], x29, #32 +; FAST: add x[[B:[0-9]+]], sp, #32 +; FAST: stur {{x[0-9]+}}, [x29, #-32] +; FAST: stur {{x[0-9]+}}, [x29, #-24] +; FAST: stur {{x[0-9]+}}, [x29, #-16] +; FAST: stur {{x[0-9]+}}, [x29, #-8] +; FAST: str {{x[0-9]+}}, [sp, #32] +; FAST: str {{x[0-9]+}}, [sp, #40] +; FAST: str {{x[0-9]+}}, [sp, #48] +; FAST: str {{x[0-9]+}}, [sp, #56] +; FAST: str {{w[0-9]+}}, 
[sp] +; Address of s1 is passed on stack at sp+8 +; FAST: str {{x[0-9]+}}, [sp, #8] +; FAST: str {{x[0-9]+}}, [sp, #16] + %tmp = alloca %struct.s43, align 16 + %tmp1 = alloca %struct.s43, align 16 + %0 = bitcast %struct.s43* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s43* @g43 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s43* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s43* @g43_2 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %call = call i32 @f43_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, + i32 8, i32 9, %struct.s43* %tmp, %struct.s43* %tmp1) #5 + ret i32 %call +} + +; rdar://13668927 +; Check that we don't split an i128. +declare i32 @callee_i128_split(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, + i32 %i6, i32 %i7, i128 %s1, i32 %i8) + +define i32 @i128_split() { +entry: +; CHECK: i128_split +; "i128 %0" should be on stack at [sp]. +; "i32 8" should be on stack at [sp, #16]. +; CHECK: str {{w[0-9]+}}, [sp, #16] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp] +; FAST: i128_split +; FAST: sub sp, sp, #48 +; FAST: mov x[[ADDR:[0-9]+]], sp +; FAST: str {{w[0-9]+}}, [x[[ADDR]], #16] +; Load/Store opt is disabled with -O0, so the i128 is split. +; FAST: str {{x[0-9]+}}, [x[[ADDR]], #8] +; FAST: str {{x[0-9]+}}, [x[[ADDR]]] + %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16 + %call = tail call i32 @callee_i128_split(i32 1, i32 2, i32 3, i32 4, i32 5, + i32 6, i32 7, i128 %0, i32 8) #5 + ret i32 %call +} + +declare i32 @callee_i64(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, + i32 %i6, i32 %i7, i64 %s1, i32 %i8) + +define i32 @i64_split() { +entry: +; CHECK: i64_split +; "i64 %0" should be in register x7. +; "i32 8" should be on stack at [sp]. 
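+; Unlike the i128 above, an i64 fits entirely in x7, so the last GPR is
+; used and nothing has to be split between registers and the stack.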
+; CHECK: ldr x7, [{{x[0-9]+}}]
+; CHECK: str {{w[0-9]+}}, [sp]
+; FAST: i64_split
+; FAST: ldr x7, [{{x[0-9]+}}]
+; FAST: str {{w[0-9]+}}, [sp]
+ %0 = load i64* bitcast (%struct.s41* @g41 to i64*), align 16
+ %call = tail call i32 @callee_i64(i32 1, i32 2, i32 3, i32 4, i32 5,
+ i32 6, i32 7, i64 %0, i32 8) #5
+ ret i32 %call
+}
+
+attributes #0 = { noinline nounwind readnone "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #1 = { nounwind readonly "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #2 = { noinline nounwind readonly "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #3 = { nounwind "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #4 = { nounwind }
+attributes #5 = { nobuiltin }
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"short", metadata !1}
+!4 = metadata !{i64 0, i64 4, metadata !0, i64 4, i64 2, metadata !3, i64 8, i64 4, metadata !0, i64 12, i64 2, metadata !3, i64 16, i64 4, metadata !0, i64 20, i64 2, metadata !3}
diff --git a/test/CodeGen/AArch64/arm64-addp.ll b/test/CodeGen/AArch64/arm64-addp.ll
new file mode 100644
index 0000000..3f1e5c5
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-addp.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s
+
+define double @foo(<2 x double> %a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: faddp.2d d0, v0
+; CHECK-NEXT: ret
+ %lane0.i = extractelement <2 x double> %a, i32 0
+ %lane1.i = extractelement <2 x double> %a, i32 1
+ %vpaddd.i = fadd double %lane0.i, %lane1.i
+ ret double %vpaddd.i
+}
+
+define i64 @foo0(<2 x i64> %a) nounwind {
+; CHECK-LABEL: foo0:
+; CHECK: addp.2d d0, v0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+ %lane0.i = extractelement <2 x i64> %a, i32 0
+ %lane1.i = extractelement <2 x i64> %a, i32 1
+ %vpaddd.i = add i64 %lane0.i, %lane1.i
+ ret i64 %vpaddd.i
+}
+
+define float @foo1(<2 x float> %a) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK: faddp.2s
+; CHECK-NEXT: ret
+ %lane0.i = extractelement <2 x float> %a, i32 0
+ %lane1.i = extractelement <2 x float> %a, i32 1
+ %vpaddd.i = fadd float %lane0.i, %lane1.i
+ ret float %vpaddd.i
+}
diff --git a/test/CodeGen/AArch64/arm64-addr-mode-folding.ll b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
new file mode 100644
index 0000000..08fb8c9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
@@ -0,0 +1,171 @@
+; RUN: llc -O3 -mtriple arm64-apple-ios3 %s -o - | FileCheck %s
+;
+
+@block = common global i8* null, align 8
+
+define i32 @fct(i32 %i1, i32 %i2) {
+; CHECK: @fct
+; The sign extension is used more than once, so ideally it would not be
+; folded. However, CodeGenPrepare does not share a sext across uses, so
+; the fold happens anyway.
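+; The directive below is deliberately spelled _CHECK-NOT_ so that FileCheck
+; ignores it; it can become a real CHECK-NOT once CodeGenPrepare learns to
+; share the sext across uses.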
+; _CHECK-NOT_: , sxtw]
+entry:
+ %idxprom = sext i32 %i1 to i64
+ %0 = load i8** @block, align 8
+ %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom
+ %1 = load i8* %arrayidx, align 1
+ %idxprom1 = sext i32 %i2 to i64
+ %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1
+ %2 = load i8* %arrayidx2, align 1
+ %cmp = icmp eq i8 %1, %2
+ br i1 %cmp, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %cmp7 = icmp ugt i8 %1, %2
+ %conv8 = zext i1 %cmp7 to i32
+ br label %return
+
+if.end: ; preds = %entry
+ %inc = add nsw i32 %i1, 1
+ %inc9 = add nsw i32 %i2, 1
+ %idxprom10 = sext i32 %inc to i64
+ %arrayidx11 = getelementptr inbounds i8* %0, i64 %idxprom10
+ %3 = load i8* %arrayidx11, align 1
+ %idxprom12 = sext i32 %inc9 to i64
+ %arrayidx13 = getelementptr inbounds i8* %0, i64 %idxprom12
+ %4 = load i8* %arrayidx13, align 1
+ %cmp16 = icmp eq i8 %3, %4
+ br i1 %cmp16, label %if.end23, label %if.then18
+
+if.then18: ; preds = %if.end
+ %cmp21 = icmp ugt i8 %3, %4
+ %conv22 = zext i1 %cmp21 to i32
+ br label %return
+
+if.end23: ; preds = %if.end
+ %inc24 = add nsw i32 %i1, 2
+ %inc25 = add nsw i32 %i2, 2
+ %idxprom26 = sext i32 %inc24 to i64
+ %arrayidx27 = getelementptr inbounds i8* %0, i64 %idxprom26
+ %5 = load i8* %arrayidx27, align 1
+ %idxprom28 = sext i32 %inc25 to i64
+ %arrayidx29 = getelementptr inbounds i8* %0, i64 %idxprom28
+ %6 = load i8* %arrayidx29, align 1
+ %cmp32 = icmp eq i8 %5, %6
+ br i1 %cmp32, label %return, label %if.then34
+
+if.then34: ; preds = %if.end23
+ %cmp37 = icmp ugt i8 %5, %6
+ %conv38 = zext i1 %cmp37 to i32
+ br label %return
+
+return: ; preds = %if.end23, %if.then34, %if.then18, %if.then
+ %retval.0 = phi i32 [ %conv8, %if.then ], [ %conv22, %if.then18 ], [ %conv38, %if.then34 ], [ 1, %if.end23 ]
+ ret i32 %retval.0
+}
+
+define i32 @fct1(i32 %i1, i32 %i2) optsize {
+; CHECK: @fct1
+; Addressing modes are folded when optimizing for code size.
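+; Both loads below fold the sign extension into the register-offset
+; addressing mode, [base, wN, sxtw], instead of keeping a separate sxtw.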
+; CHECK: , sxtw] +; CHECK: , sxtw] +entry: + %idxprom = sext i32 %i1 to i64 + %0 = load i8** @block, align 8 + %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom + %1 = load i8* %arrayidx, align 1 + %idxprom1 = sext i32 %i2 to i64 + %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1 + %2 = load i8* %arrayidx2, align 1 + %cmp = icmp eq i8 %1, %2 + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %cmp7 = icmp ugt i8 %1, %2 + %conv8 = zext i1 %cmp7 to i32 + br label %return + +if.end: ; preds = %entry + %inc = add nsw i32 %i1, 1 + %inc9 = add nsw i32 %i2, 1 + %idxprom10 = sext i32 %inc to i64 + %arrayidx11 = getelementptr inbounds i8* %0, i64 %idxprom10 + %3 = load i8* %arrayidx11, align 1 + %idxprom12 = sext i32 %inc9 to i64 + %arrayidx13 = getelementptr inbounds i8* %0, i64 %idxprom12 + %4 = load i8* %arrayidx13, align 1 + %cmp16 = icmp eq i8 %3, %4 + br i1 %cmp16, label %if.end23, label %if.then18 + +if.then18: ; preds = %if.end + %cmp21 = icmp ugt i8 %3, %4 + %conv22 = zext i1 %cmp21 to i32 + br label %return + +if.end23: ; preds = %if.end + %inc24 = add nsw i32 %i1, 2 + %inc25 = add nsw i32 %i2, 2 + %idxprom26 = sext i32 %inc24 to i64 + %arrayidx27 = getelementptr inbounds i8* %0, i64 %idxprom26 + %5 = load i8* %arrayidx27, align 1 + %idxprom28 = sext i32 %inc25 to i64 + %arrayidx29 = getelementptr inbounds i8* %0, i64 %idxprom28 + %6 = load i8* %arrayidx29, align 1 + %cmp32 = icmp eq i8 %5, %6 + br i1 %cmp32, label %return, label %if.then34 + +if.then34: ; preds = %if.end23 + %cmp37 = icmp ugt i8 %5, %6 + %conv38 = zext i1 %cmp37 to i32 + br label %return + +return: ; preds = %if.end23, %if.then34, %if.then18, %if.then + %retval.0 = phi i32 [ %conv8, %if.then ], [ %conv22, %if.then18 ], [ %conv38, %if.then34 ], [ 1, %if.end23 ] + ret i32 %retval.0 +} + +; CHECK: @test +; CHECK-NOT: , uxtw #2] +define i32 @test(i32* %array, i8 zeroext %c, i32 %arg) { +entry: + %conv = zext i8 %c to i32 + %add = sub i32 0, %arg + %tobool = icmp eq i32 %conv, %add + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = zext i8 %c to i64 + %arrayidx = getelementptr inbounds i32* %array, i64 %idxprom + %0 = load volatile i32* %arrayidx, align 4 + %1 = load volatile i32* %arrayidx, align 4 + %add3 = add nsw i32 %1, %0 + br label %if.end + +if.end: ; preds = %entry, %if.then + %res.0 = phi i32 [ %add3, %if.then ], [ 0, %entry ] + ret i32 %res.0 +} + + +; CHECK: @test2 +; CHECK: , uxtw #2] +; CHECK: , uxtw #2] +define i32 @test2(i32* %array, i8 zeroext %c, i32 %arg) optsize { +entry: + %conv = zext i8 %c to i32 + %add = sub i32 0, %arg + %tobool = icmp eq i32 %conv, %add + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = zext i8 %c to i64 + %arrayidx = getelementptr inbounds i32* %array, i64 %idxprom + %0 = load volatile i32* %arrayidx, align 4 + %1 = load volatile i32* %arrayidx, align 4 + %add3 = add nsw i32 %1, %0 + br label %if.end + +if.end: ; preds = %entry, %if.then + %res.0 = phi i32 [ %add3, %if.then ], [ 0, %entry ] + ret i32 %res.0 +} diff --git a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll new file mode 100644 index 0000000..1a3ca8b --- /dev/null +++ b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll @@ -0,0 +1,82 @@ +; RUN: llc -march arm64 < %s | FileCheck %s +; rdar://13452552 +; ModuleID = 'reduced_test.ll' +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" +target triple = "arm64-apple-ios3.0.0" + +@block = common global i8* null, align 8 + +define zeroext i8 @fullGtU(i32 %i1, i32 %i2) { +; CHECK: fullGtU +; CHECK: adrp [[PAGE:x[0-9]+]], _block@GOTPAGE +; CHECK: ldr [[ADDR:x[0-9]+]], {{\[}}[[PAGE]], _block@GOTPAGEOFF] +; CHECK-NEXT: ldr [[BLOCKBASE:x[0-9]+]], {{\[}}[[ADDR]]] +; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]], w0, sxtw] +; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], w1, sxtw] +; CHECK-NEXT cmp [[BLOCKVAL1]], [[BLOCKVAL2]] +; CHECK-NEXT b.ne +; Next BB +; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], w1, sxtw +; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], w0, sxtw +; CHECK-NEXT: ldrb [[LOADEDVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #1] +; CHECK-NEXT: ldrb [[LOADEDVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #1] +; CHECK-NEXT: cmp [[LOADEDVAL1]], [[LOADEDVAL2]] +; CHECK-NEXT: b.ne +; Next BB +; CHECK: ldrb [[LOADEDVAL3:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #2] +; CHECK-NEXT: ldrb [[LOADEDVAL4:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #2] +; CHECK-NEXT: cmp [[LOADEDVAL3]], [[LOADEDVAL4]] +entry: + %idxprom = sext i32 %i1 to i64 + %tmp = load i8** @block, align 8 + %arrayidx = getelementptr inbounds i8* %tmp, i64 %idxprom + %tmp1 = load i8* %arrayidx, align 1 + %idxprom1 = sext i32 %i2 to i64 + %arrayidx2 = getelementptr inbounds i8* %tmp, i64 %idxprom1 + %tmp2 = load i8* %arrayidx2, align 1 + %cmp = icmp eq i8 %tmp1, %tmp2 + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %cmp7 = icmp ugt i8 %tmp1, %tmp2 + %conv9 = zext i1 %cmp7 to i8 + br label %return + +if.end: ; preds = %entry + %inc = add nsw i32 %i1, 1 + %inc10 = add nsw i32 %i2, 1 + %idxprom11 = sext i32 %inc to i64 + %arrayidx12 = getelementptr inbounds i8* %tmp, i64 %idxprom11 + %tmp3 = load i8* %arrayidx12, align 1 + %idxprom13 = sext i32 %inc10 to i64 + %arrayidx14 = getelementptr inbounds i8* %tmp, i64 %idxprom13 + %tmp4 = load i8* %arrayidx14, align 1 + %cmp17 = icmp eq i8 %tmp3, %tmp4 + br i1 %cmp17, label %if.end25, label %if.then19 + +if.then19: ; preds = %if.end + %cmp22 = icmp ugt i8 %tmp3, %tmp4 + %conv24 = zext i1 %cmp22 to i8 + br label %return + +if.end25: ; preds = %if.end + %inc26 = add nsw i32 %i1, 2 + %inc27 = add nsw i32 %i2, 2 + %idxprom28 = sext i32 %inc26 to i64 + %arrayidx29 = getelementptr inbounds i8* %tmp, i64 %idxprom28 + %tmp5 = load i8* %arrayidx29, align 1 + %idxprom30 = sext i32 %inc27 to i64 + %arrayidx31 = getelementptr inbounds i8* %tmp, i64 %idxprom30 + %tmp6 = load i8* %arrayidx31, align 1 + %cmp34 = icmp eq i8 %tmp5, %tmp6 + br i1 %cmp34, label %return, label %if.then36 + +if.then36: ; preds = %if.end25 + %cmp39 = icmp ugt i8 %tmp5, %tmp6 + %conv41 = zext i1 %cmp39 to i8 + br label %return + +return: ; preds = %if.then36, %if.end25, %if.then19, %if.then + %retval.0 = phi i8 [ %conv9, %if.then ], [ %conv24, %if.then19 ], [ %conv41, %if.then36 ], [ 0, %if.end25 ] + ret i8 %retval.0 +} diff --git a/test/CodeGen/AArch64/arm64-addrmode.ll b/test/CodeGen/AArch64/arm64-addrmode.ll new file mode 100644 index 0000000..700fba8 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-addrmode.ll @@ -0,0 +1,72 @@ +; RUN: llc -march=arm64 < %s | FileCheck %s +; rdar://10232252 + +@object = external hidden global i64, section "__DATA, __objc_ivar", align 8 + +; base + offset (imm9) +; CHECK: @t1 +; CHECK: ldr xzr, [x{{[0-9]+}}, #8] +; CHECK: ret +define void @t1() { + %incdec.ptr = 
getelementptr inbounds i64* @object, i64 1 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + offset (> imm9) +; CHECK: @t2 +; CHECK: sub [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #264 +; CHECK: ldr xzr, [ +; CHECK: [[ADDREG]]] +; CHECK: ret +define void @t2() { + %incdec.ptr = getelementptr inbounds i64* @object, i64 -33 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + unsigned offset (> imm9 and <= imm12 * size of type in bytes) +; CHECK: @t3 +; CHECK: ldr xzr, [x{{[0-9]+}}, #32760] +; CHECK: ret +define void @t3() { + %incdec.ptr = getelementptr inbounds i64* @object, i64 4095 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + unsigned offset (> imm12 * size of type in bytes) +; CHECK: @t4 +; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #8, lsl #12 +; CHECK: ldr xzr, [ +; CHECK: [[ADDREG]]] +; CHECK: ret +define void @t4() { + %incdec.ptr = getelementptr inbounds i64* @object, i64 4096 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + reg +; CHECK: @t5 +; CHECK: ldr xzr, [x{{[0-9]+}}, x{{[0-9]+}}, lsl #3] +; CHECK: ret +define void @t5(i64 %a) { + %incdec.ptr = getelementptr inbounds i64* @object, i64 %a + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + reg + imm +; CHECK: @t6 +; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, x{{[0-9]+}}, lsl #3 +; CHECK-NEXT: add [[ADDREG]], [[ADDREG]], #8, lsl #12 +; CHECK: ldr xzr, [ +; CHECK: [[ADDREG]]] +; CHECK: ret +define void @t6(i64 %a) { + %tmp1 = getelementptr inbounds i64* @object, i64 %a + %incdec.ptr = getelementptr inbounds i64* %tmp1, i64 4096 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll b/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll new file mode 100644 index 0000000..f396bc9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=false | FileCheck %s + +; rdar://12713765 +; Make sure we are not creating stack objects that are assumed to be 64-byte +; aligned. 
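+; The <16 x float> copies below should be plain 16-byte aligned stp stores
+; relative to sp (or a plain base register), with no realignment code.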
+@T3_retval = common global <16 x float> zeroinitializer, align 16 + +define void @test(<16 x float>* noalias sret %agg.result) nounwind ssp { +entry: +; CHECK: test +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp, #32] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE:x[0-9]+]], #32] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE]]] + %retval = alloca <16 x float>, align 16 + %0 = load <16 x float>* @T3_retval, align 16 + store <16 x float> %0, <16 x float>* %retval + %1 = load <16 x float>* %retval + store <16 x float> %1, <16 x float>* %agg.result, align 16 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll new file mode 100644 index 0000000..3750f31 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=arm64 -mcpu=cyclone < %s | FileCheck %s + +; CHECK: foo +; CHECK: ldr w[[REG:[0-9]+]], [x19, #264] +; CHECK: str w[[REG]], [x19, #132] +; CHECK: ldr w{{[0-9]+}}, [x19, #264] + +define i32 @foo(i32 %a) nounwind { + %retval = alloca i32, align 4 + %a.addr = alloca i32, align 4 + %arr = alloca [32 x i32], align 4 + %i = alloca i32, align 4 + %arr2 = alloca [32 x i32], align 4 + %j = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + %tmp = load i32* %a.addr, align 4 + %tmp1 = zext i32 %tmp to i64 + %v = mul i64 4, %tmp1 + %vla = alloca i8, i64 %v, align 4 + %tmp2 = bitcast i8* %vla to i32* + %tmp3 = load i32* %a.addr, align 4 + store i32 %tmp3, i32* %i, align 4 + %tmp4 = load i32* %a.addr, align 4 + store i32 %tmp4, i32* %j, align 4 + %tmp5 = load i32* %j, align 4 + store i32 %tmp5, i32* %retval + %x = load i32* %retval + ret i32 %x +} diff --git a/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll b/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll new file mode 100644 index 0000000..4194977 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll @@ -0,0 +1,72 @@ +; RUN: llc -O1 -march=arm64 -enable-andcmp-sinking=true < %s | FileCheck %s +; ModuleID = 'and-cbz-extr-mr.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +define zeroext i1 @foo(i1 %IsEditable, i1 %isTextField, i8* %str1, i8* %str2, i8* %str3, i8* %str4, i8* %str5, i8* %str6, i8* %str7, i8* %str8, i8* %str9, i8* %str10, i8* %str11, i8* %str12, i8* %str13, i32 %int1, i8* %str14) unnamed_addr #0 align 2 { +; CHECK: _foo: +entry: + %tobool = icmp eq i8* %str14, null + br i1 %tobool, label %return, label %if.end + +; CHECK: %if.end +; CHECK: tbz +if.end: ; preds = %entry + %and.i.i.i = and i32 %int1, 4 + %tobool.i.i.i = icmp eq i32 %and.i.i.i, 0 + br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i + +land.rhs.i: ; preds = %if.end + %cmp.i.i.i = icmp eq i8* %str12, %str13 + br i1 %cmp.i.i.i, label %if.then3, label %lor.rhs.i.i.i + +lor.rhs.i.i.i: ; preds = %land.rhs.i + %cmp.i13.i.i.i = icmp eq i8* %str10, %str11 + br i1 %cmp.i13.i.i.i, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, label %if.end5 + +_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit: ; preds = %lor.rhs.i.i.i + %cmp.i.i.i.i = icmp eq i8* %str8, %str9 + br i1 %cmp.i.i.i.i, label %if.then3, label %if.end5 + +if.then3: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %land.rhs.i + %tmp11 = load i8* %str14, align 8 + %tmp12 = and i8 %tmp11, 2 + %tmp13 = 
icmp ne i8 %tmp12, 0 + br label %return + +if.end5: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %lor.rhs.i.i.i +; CHECK: %if.end5 +; CHECK: tbz + br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i19 + +land.rhs.i19: ; preds = %if.end5 + %cmp.i.i.i18 = icmp eq i8* %str6, %str7 + br i1 %cmp.i.i.i18, label %if.then7, label %lor.rhs.i.i.i23 + +lor.rhs.i.i.i23: ; preds = %land.rhs.i19 + %cmp.i13.i.i.i22 = icmp eq i8* %str3, %str4 + br i1 %cmp.i13.i.i.i22, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, label %if.end12 + +_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28: ; preds = %lor.rhs.i.i.i23 + %cmp.i.i.i.i26 = icmp eq i8* %str1, %str2 + br i1 %cmp.i.i.i.i26, label %if.then7, label %if.end12 + +if.then7: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %land.rhs.i19 + br i1 %isTextField, label %if.then9, label %if.end12 + +if.then9: ; preds = %if.then7 + %tmp23 = load i8* %str5, align 8 + %tmp24 = and i8 %tmp23, 2 + %tmp25 = icmp ne i8 %tmp24, 0 + br label %return + +if.end12: ; preds = %if.then7, %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %lor.rhs.i.i.i23, %if.end5, %if.end + %lnot = xor i1 %IsEditable, true + br label %return + +return: ; preds = %if.end12, %if.then9, %if.then3, %entry + %retval.0 = phi i1 [ %tmp13, %if.then3 ], [ %tmp25, %if.then9 ], [ %lnot, %if.end12 ], [ true, %entry ] + ret i1 %retval.0 +} + +attributes #0 = { nounwind ssp } diff --git a/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll new file mode 100644 index 0000000..34d6287 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll @@ -0,0 +1,31 @@ +; RUN: llc %s -o - | FileCheck %s +; Check that ANDS (tst) is not merged with ADD when the immediate +; is not 0. +; +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios" + +; CHECK-LABEL: tst1: +; CHECK: add [[REG:w[0-9]+]], w{{[0-9]+}}, #1 +; CHECK: tst [[REG]], #0x1 +define void @tst1() { +entry: + br i1 undef, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %result.09 = phi i32 [ %add2.result.0, %for.body ], [ 1, %entry ] + %i.08 = phi i32 [ %inc, %for.body ], [ 2, %entry ] + %and = and i32 %i.08, 1 + %cmp1 = icmp eq i32 %and, 0 + %add2.result.0 = select i1 %cmp1, i32 undef, i32 %result.09 + %inc = add nsw i32 %i.08, 1 + %cmp = icmp slt i32 %i.08, undef + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + %add2.result.0.lcssa = phi i32 [ %add2.result.0, %for.body ] + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + ret void +} diff --git a/test/CodeGen/AArch64/arm64-anyregcc-crash.ll b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll new file mode 100644 index 0000000..241cf97 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll @@ -0,0 +1,19 @@ +; RUN: not llc < %s -mtriple=arm64-apple-darwin 2>&1 | FileCheck %s +; +; Check that misuse of anyregcc results in a compile time error. 
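+; Keeping all 32 i64 values live across the anyregcc patchpoint requires
+; more registers than AArch64 can allocate, so llc is expected to fail
+; with the error below instead of emitting code.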
+
+; CHECK: LLVM ERROR: ran out of registers during register allocation
+define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8,
+ i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16,
+ i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
+ i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) {
+entry:
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32,
+ i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8,
+ i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16,
+ i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
+ i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32)
+ ret i64 %result
+}
+
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/AArch64/arm64-anyregcc.ll b/test/CodeGen/AArch64/arm64-anyregcc.ll
new file mode 100644
index 0000000..e26875d
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-anyregcc.ll
@@ -0,0 +1,363 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+; Stackmap Header: no constants - 8 callsites
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 8
+; Num LargeConstants
+; CHECK-NEXT: .long 0
+; Num Callsites
+; CHECK-NEXT: .long 8
+
+; Functions and stack size
+; CHECK-NEXT: .quad _test
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _property_access1
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _property_access2
+; CHECK-NEXT: .quad 32
+; CHECK-NEXT: .quad _property_access3
+; CHECK-NEXT: .quad 32
+; CHECK-NEXT: .quad _anyreg_test1
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _anyreg_test2
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _patchpoint_spilldef
+; CHECK-NEXT: .quad 112
+; CHECK-NEXT: .quad _patchpoint_spillargs
+; CHECK-NEXT: .quad 128
+
+
+; test
+; CHECK-LABEL: .long L{{.*}}-_test
+; CHECK-NEXT: .short 0
+; 3 locations
+; CHECK-NEXT: .short 3
+; Loc 0: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Constant 3
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 3
+define i64 @test() nounwind ssp uwtable {
+entry:
+ call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3)
+ ret i64 0
+}
+
+; property access 1 - %obj is an anyreg call argument and should therefore be in a register
+; CHECK-LABEL: .long L{{.*}}-_property_access1
+; CHECK-NEXT: .short 0
+; 2 locations
+; CHECK-NEXT: .short 2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
+entry:
+ %f = inttoptr i64 281474417671919 to i8*
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj)
+ ret i64 %ret
+}
+
+; property
access 2 - %obj is an anyreg call argument and should therefore be in a register +; CHECK-LABEL: .long L{{.*}}-_property_access2 +; CHECK-NEXT: .short 0 +; 2 locations +; CHECK-NEXT: .short 2 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @property_access2() nounwind ssp uwtable { +entry: + %obj = alloca i64, align 8 + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj) + ret i64 %ret +} + +; property access 3 - %obj is a frame index +; CHECK-LABEL: .long L{{.*}}-_property_access3 +; CHECK-NEXT: .short 0 +; 2 locations +; CHECK-NEXT: .short 2 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Direct FP - 8 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short 29 +; CHECK-NEXT: .long -8 +define i64 @property_access3() nounwind ssp uwtable { +entry: + %obj = alloca i64, align 8 + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj) + ret i64 %ret +} + +; anyreg_test1 +; CHECK-LABEL: .long L{{.*}}-_anyreg_test1 +; CHECK-NEXT: .short 0 +; 14 locations +; CHECK-NEXT: .short 14 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 2: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 3: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 4: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 5: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 6: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 7: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 8: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 9: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 10: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 11: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 12: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 13: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable { +entry: + %f = inttoptr i64 281474417671919 to i8* + %ret = call 
anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) + ret i64 %ret +} + +; anyreg_test2 +; CHECK-LABEL: .long L{{.*}}-_anyreg_test2 +; CHECK-NEXT: .short 0 +; 14 locations +; CHECK-NEXT: .short 14 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 2: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 3: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 4: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 5: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 6: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 7: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 8: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 9: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 10: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 11: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 12: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 13: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable { +entry: + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) + ret i64 %ret +} + +; Test spilling the return value of an anyregcc call. +; +; [JS] Assertion: "Folded a def to a non-store!" 
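+; (Context: the inline asm in the body below clobbers x0 through x31, so the
+; anyregcc result is live across it and must be spilled and reloaded; that is
+; the spill-fold scenario the quoted assertion refers to.)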
+;
+; CHECK-LABEL: .long L{{.*}}-_patchpoint_spilldef
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 3
+; Loc 0: Register (some register that will be spilled to the stack)
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+ tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
+ ret i64 %result
+}
+
+; Test spilling the arguments of an anyregcc call.
+;
+; [JS] AnyRegCC argument ends up being spilled
+;
+; CHECK-LABEL: .long L{{.*}}-_patchpoint_spillargs
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 5
+; Loc 0: Return value in a register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Arg0 in a Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Arg1 in a Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 3: Arg2 spilled to FP - 96
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 29
+; CHECK-NEXT: .long -96
+; Loc 4: Arg3 spilled to FP - 88
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 29
+; CHECK-NEXT: .long -88
+define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+ tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ ret i64 %result
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
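+
+; For reference, a commented sketch of the patchpoint operand layout used
+; throughout this file (the values here are illustrative, not an extra test):
+;
+;   %r = call anyregcc i64 (i64, i32, i8*, i32, ...)*
+;          @llvm.experimental.patchpoint.i64(
+;              i64 42,   ; stackmap ID, echoed in the callsite record
+;              i32 20,   ; number of instruction bytes reserved at the site
+;              i8* %f,   ; call target
+;              i32 1,    ; number of call arguments that follow
+;              i8* %obj) ; call argument, assigned to some free register
+;
+; Each location record asserted above has the form
+;   <kind : .byte> <size : .byte> <Dwarf regnum : .short> <offset : .long>
+; where the kinds seen in this file are 1 = Register, 2 = Direct (frame
+; pointer + offset; regnum 29 is FP), 3 = Indirect/spilled ([FP + offset])
+; and 4 = Constant (the offset field holds the value).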
diff --git a/test/CodeGen/AArch64/arm64-arith-saturating.ll b/test/CodeGen/AArch64/arm64-arith-saturating.ll new file mode 100644 index 0000000..78cd1fc --- /dev/null +++ b/test/CodeGen/AArch64/arm64-arith-saturating.ll @@ -0,0 +1,153 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s + +define i32 @qadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qadds: +; CHECK: sqadd s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqadd.i = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqadd.i +} + +define i64 @qaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qaddd: +; CHECK: sqadd d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqadd.i = tail call i64 @llvm.aarch64.neon.sqadd.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqadd.i +} + +define i32 @uqadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqadds: +; CHECK: uqadd s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqadd.i = tail call i32 @llvm.aarch64.neon.uqadd.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqadd.i +} + +define i64 @uqaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqaddd: +; CHECK: uqadd d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqadd.i = tail call i64 @llvm.aarch64.neon.uqadd.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqadd.i +} + +declare i64 @llvm.aarch64.neon.uqadd.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.uqadd.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) nounwind readnone + +define i32 @qsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qsubs: +; CHECK: sqsub s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqsub.i = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqsub.i +} + +define i64 @qsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qsubd: +; CHECK: sqsub d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqsub.i = tail call i64 @llvm.aarch64.neon.sqsub.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqsub.i +} + +define i32 @uqsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqsubs: +; CHECK: uqsub s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqsub.i = tail call i32 @llvm.aarch64.neon.uqsub.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqsub.i +} + +define i64 @uqsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqsubd: +; CHECK: uqsub d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqsub.i = tail call i64 @llvm.aarch64.neon.uqsub.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqsub.i +} + +declare i64 @llvm.aarch64.neon.uqsub.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.uqsub.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64) nounwind readnone +declare i32 
@llvm.aarch64.neon.sqsub.i32(i32, i32) nounwind readnone + +define i32 @qabss(<4 x i32> %b, <4 x i32> %c) nounwind readnone { +; CHECK-LABEL: qabss: +; CHECK: sqabs s0, s0 +; CHECK: ret + %vecext = extractelement <4 x i32> %b, i32 0 + %vqabs.i = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %vecext) nounwind + ret i32 %vqabs.i +} + +define i64 @qabsd(<2 x i64> %b, <2 x i64> %c) nounwind readnone { +; CHECK-LABEL: qabsd: +; CHECK: sqabs d0, d0 +; CHECK: ret + %vecext = extractelement <2 x i64> %b, i32 0 + %vqabs.i = tail call i64 @llvm.aarch64.neon.sqabs.i64(i64 %vecext) nounwind + ret i64 %vqabs.i +} + +define i32 @qnegs(<4 x i32> %b, <4 x i32> %c) nounwind readnone { +; CHECK-LABEL: qnegs: +; CHECK: sqneg s0, s0 +; CHECK: ret + %vecext = extractelement <4 x i32> %b, i32 0 + %vqneg.i = tail call i32 @llvm.aarch64.neon.sqneg.i32(i32 %vecext) nounwind + ret i32 %vqneg.i +} + +define i64 @qnegd(<2 x i64> %b, <2 x i64> %c) nounwind readnone { +; CHECK-LABEL: qnegd: +; CHECK: sqneg d0, d0 +; CHECK: ret + %vecext = extractelement <2 x i64> %b, i32 0 + %vqneg.i = tail call i64 @llvm.aarch64.neon.sqneg.i64(i64 %vecext) nounwind + ret i64 %vqneg.i +} + +declare i64 @llvm.aarch64.neon.sqneg.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.sqneg.i32(i32) nounwind readnone +declare i64 @llvm.aarch64.neon.sqabs.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.sqabs.i32(i32) nounwind readnone + + +define i32 @vqmovund(<2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vqmovund: +; CHECK: sqxtun s0, d0 + %vecext = extractelement <2 x i64> %b, i32 0 + %vqmovun.i = tail call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %vecext) nounwind + ret i32 %vqmovun.i +} + +define i32 @vqmovnd_s(<2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vqmovnd_s: +; CHECK: sqxtn s0, d0 + %vecext = extractelement <2 x i64> %b, i32 0 + %vqmovn.i = tail call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %vecext) nounwind + ret i32 %vqmovn.i +} + +define i32 @vqmovnd_u(<2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vqmovnd_u: +; CHECK: uqxtn s0, d0 + %vecext = extractelement <2 x i64> %b, i32 0 + %vqmovn.i = tail call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %vecext) nounwind + ret i32 %vqmovn.i +} + +declare i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64) nounwind readnone diff --git a/test/CodeGen/AArch64/arm64-arith.ll b/test/CodeGen/AArch64/arm64-arith.ll new file mode 100644 index 0000000..ed9b569 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-arith.ll @@ -0,0 +1,262 @@ +; RUN: llc < %s -march=arm64 -asm-verbose=false | FileCheck %s + +define i32 @t1(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t1: +; CHECK: add w0, w1, w0 +; CHECK: ret + %add = add i32 %b, %a + ret i32 %add +} + +define i32 @t2(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t2: +; CHECK: udiv w0, w0, w1 +; CHECK: ret + %udiv = udiv i32 %a, %b + ret i32 %udiv +} + +define i64 @t3(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t3: +; CHECK: udiv x0, x0, x1 +; CHECK: ret + %udiv = udiv i64 %a, %b + ret i64 %udiv +} + +define i32 @t4(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t4: +; CHECK: sdiv w0, w0, w1 +; CHECK: ret + %sdiv = sdiv i32 %a, %b + ret i32 %sdiv +} + +define i64 @t5(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t5: +; CHECK: sdiv x0, x0, x1 +; CHECK: ret + %sdiv = sdiv 
i64 %a, %b + ret i64 %sdiv +} + +define i32 @t6(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t6: +; CHECK: lsl w0, w0, w1 +; CHECK: ret + %shl = shl i32 %a, %b + ret i32 %shl +} + +define i64 @t7(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t7: +; CHECK: lsl x0, x0, x1 +; CHECK: ret + %shl = shl i64 %a, %b + ret i64 %shl +} + +define i32 @t8(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t8: +; CHECK: lsr w0, w0, w1 +; CHECK: ret + %lshr = lshr i32 %a, %b + ret i32 %lshr +} + +define i64 @t9(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t9: +; CHECK: lsr x0, x0, x1 +; CHECK: ret + %lshr = lshr i64 %a, %b + ret i64 %lshr +} + +define i32 @t10(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t10: +; CHECK: asr w0, w0, w1 +; CHECK: ret + %ashr = ashr i32 %a, %b + ret i32 %ashr +} + +define i64 @t11(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t11: +; CHECK: asr x0, x0, x1 +; CHECK: ret + %ashr = ashr i64 %a, %b + ret i64 %ashr +} + +define i32 @t12(i16 %a, i32 %x) nounwind ssp { +entry: +; CHECK-LABEL: t12: +; CHECK: add w0, w1, w0, sxth +; CHECK: ret + %c = sext i16 %a to i32 + %e = add i32 %x, %c + ret i32 %e +} + +define i32 @t13(i16 %a, i32 %x) nounwind ssp { +entry: +; CHECK-LABEL: t13: +; CHECK: add w0, w1, w0, sxth #2 +; CHECK: ret + %c = sext i16 %a to i32 + %d = shl i32 %c, 2 + %e = add i32 %x, %d + ret i32 %e +} + +define i64 @t14(i16 %a, i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t14: +; CHECK: add x0, x1, w0, uxth #3 +; CHECK: ret + %c = zext i16 %a to i64 + %d = shl i64 %c, 3 + %e = add i64 %x, %d + ret i64 %e +} + +; rdar://9160598 +define i64 @t15(i64 %a, i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t15: +; CHECK: add x0, x1, w0, uxtw +; CHECK: ret + %b = and i64 %a, 4294967295 + %c = add i64 %x, %b + ret i64 %c +} + +define i64 @t16(i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t16: +; CHECK: lsl x0, x0, #1 +; CHECK: ret + %a = shl i64 %x, 1 + ret i64 %a +} + +; rdar://9166974 +define i64 @t17(i16 %a, i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t17: +; CHECK: sxth [[REG:x[0-9]+]], w0 +; CHECK: neg x0, [[REG]], lsl #32 +; CHECK: ret + %tmp16 = sext i16 %a to i64 + %tmp17 = mul i64 %tmp16, -4294967296 + ret i64 %tmp17 +} + +define i32 @t18(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t18: +; CHECK: sdiv w0, w0, w1 +; CHECK: ret + %sdiv = call i32 @llvm.aarch64.sdiv.i32(i32 %a, i32 %b) + ret i32 %sdiv +} + +define i64 @t19(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t19: +; CHECK: sdiv x0, x0, x1 +; CHECK: ret + %sdiv = call i64 @llvm.aarch64.sdiv.i64(i64 %a, i64 %b) + ret i64 %sdiv +} + +define i32 @t20(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t20: +; CHECK: udiv w0, w0, w1 +; CHECK: ret + %udiv = call i32 @llvm.aarch64.udiv.i32(i32 %a, i32 %b) + ret i32 %udiv +} + +define i64 @t21(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t21: +; CHECK: udiv x0, x0, x1 +; CHECK: ret + %udiv = call i64 @llvm.aarch64.udiv.i64(i64 %a, i64 %b) + ret i64 %udiv +} + +declare i32 @llvm.aarch64.sdiv.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.sdiv.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.udiv.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.udiv.i64(i64, i64) nounwind readnone + +; 32-bit not. +define i32 @inv_32(i32 %x) nounwind ssp { +entry: +; CHECK: inv_32 +; CHECK: mvn w0, w0 +; CHECK: ret + %inv = xor i32 %x, -1 + ret i32 %inv +} + +; 64-bit not. 
+define i64 @inv_64(i64 %x) nounwind ssp { +entry: +; CHECK: inv_64 +; CHECK: mvn x0, x0 +; CHECK: ret + %inv = xor i64 %x, -1 + ret i64 %inv +} + +; Multiplying by a power of two plus or minus one is better done via shift +; and add/sub rather than the madd/msub instructions. The latter are 4+ cycles, +; and the former are two (total for the two instruction sequence for subtract). +define i32 @f0(i32 %a) nounwind readnone ssp { +; CHECK-LABEL: f0: +; CHECK-NEXT: add w0, w0, w0, lsl #3 +; CHECK-NEXT: ret + %res = mul i32 %a, 9 + ret i32 %res +} + +define i64 @f1(i64 %a) nounwind readnone ssp { +; CHECK-LABEL: f1: +; CHECK-NEXT: lsl x8, x0, #4 +; CHECK-NEXT: sub x0, x8, x0 +; CHECK-NEXT: ret + %res = mul i64 %a, 15 + ret i64 %res +} + +define i32 @f2(i32 %a) nounwind readnone ssp { +; CHECK-LABEL: f2: +; CHECK-NEXT: lsl w8, w0, #3 +; CHECK-NEXT: sub w0, w8, w0 +; CHECK-NEXT: ret + %res = mul nsw i32 %a, 7 + ret i32 %res +} + +define i64 @f3(i64 %a) nounwind readnone ssp { +; CHECK-LABEL: f3: +; CHECK-NEXT: add x0, x0, x0, lsl #4 +; CHECK-NEXT: ret + %res = mul nsw i64 %a, 17 + ret i64 %res +} diff --git a/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll b/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll new file mode 100644 index 0000000..0904b62 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=arm64 -aarch64-dead-def-elimination=false < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @test1() #0 { + %tmp1 = alloca i8 + %tmp2 = icmp eq i8* %tmp1, null + %tmp3 = zext i1 %tmp2 to i32 + + ret i32 %tmp3 + + ; CHECK-LABEL: test1 + ; CHECK: adds {{x[0-9]+}}, sp, #15 +} diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll new file mode 100644 index 0000000..3b43aa1 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-atomic-128.ll @@ -0,0 +1,225 @@ +; RUN: llc < %s -march=arm64 -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone | FileCheck %s + +@var = global i128 0 + +define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) { +; CHECK-LABEL: val_compare_and_swap: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[RESULTLO:x[0-9]+]], [[RESULTHI:x[0-9]+]], [x[[ADDR:[0-9]+]]] +; CHECK-DAG: eor [[MISMATCH_LO:x[0-9]+]], [[RESULTLO]], x2 +; CHECK-DAG: eor [[MISMATCH_HI:x[0-9]+]], [[RESULTHI]], x3 +; CHECK: orr [[MISMATCH:x[0-9]+]], [[MISMATCH_LO]], [[MISMATCH_HI]] +; CHECK: cbnz [[MISMATCH]], [[DONE:.LBB[0-9]+_[0-9]+]] +; CHECK: stxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x[[ADDR]]] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] +; CHECK: [[DONE]]: + %val = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire + ret i128 %val +} + +define void @fetch_and_nand(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_nand: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK-DAG: bic [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK-DAG: bic [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw nand i128* %p, i128 %bits release + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_or(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_or: 
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK-DAG: orr [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK-DAG: orr [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw or i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_add(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_add: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: adds [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK: adcs [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw add i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_sub(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_sub: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: subs [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK: sbcs [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw sub i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_min(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_min: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: cmp [[DEST_REGLO]], x2 +; CHECK: cset [[LOCMP:w[0-9]+]], ls +; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3 +; CHECK: cset [[HICMP:w[0-9]+]], le +; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq +; CHECK: cmp [[CMP]], #0 +; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne +; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw min i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_max(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_max: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: cmp [[DEST_REGLO]], x2 +; CHECK: cset [[LOCMP:w[0-9]+]], hi +; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3 +; CHECK: cset [[HICMP:w[0-9]+]], gt +; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq +; CHECK: cmp [[CMP]], #0 +; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne +; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw max i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_umin(i128* %p, i128 %bits) { +; 
CHECK-LABEL: fetch_and_umin:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp [[DEST_REGLO]], x2
+; CHECK: cset [[LOCMP:w[0-9]+]], ls
+; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3
+; CHECK: cset [[HICMP:w[0-9]+]], ls
+; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq
+; CHECK: cmp [[CMP]], #0
+; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne
+; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
+ %val = atomicrmw umin i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_umax(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_umax:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp [[DEST_REGLO]], x2
+; CHECK: cset [[LOCMP:w[0-9]+]], hi
+; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3
+; CHECK: cset [[HICMP:w[0-9]+]], hi
+; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq
+; CHECK: cmp [[CMP]], #0
+; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne
+; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
+ %val = atomicrmw umax i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define i128 @atomic_load_seq_cst(i128* %p) {
+; CHECK-LABEL: atomic_load_seq_cst:
+; CHECK-NOT: dmb
+; CHECK: ldaxp
+; CHECK-NOT: dmb
+ %r = load atomic i128* %p seq_cst, align 16
+ ret i128 %r
+}
+
+define i128 @atomic_load_relaxed(i128* %p) {
+; CHECK-LABEL: atomic_load_relaxed:
+; CHECK-NOT: dmb
+; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK-NOT: dmb
+ %r = load atomic i128* %p monotonic, align 16
+ ret i128 %r
+}
+
+
+define void @atomic_store_seq_cst(i128 %in, i128* %p) {
+; CHECK-LABEL: atomic_store_seq_cst:
+; CHECK-NOT: dmb
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp xzr, xzr, [x2]
+; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
+; CHECK-NOT: dmb
+ store atomic i128 %in, i128* %p seq_cst, align 16
+ ret void
+}
+
+define void @atomic_store_release(i128 %in, i128* %p) {
+; CHECK-LABEL: atomic_store_release:
+; CHECK-NOT: dmb
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp xzr, xzr, [x2]
+; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
+; CHECK-NOT: dmb
+ store atomic i128 %in, i128* %p release, align 16
+ ret void
+}
+
+define void @atomic_store_relaxed(i128 %in, i128* %p) {
+; CHECK-LABEL: atomic_store_relaxed:
+; CHECK-NOT: dmb
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp xzr, xzr, [x2]
+; CHECK: stxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
+; CHECK-NOT: dmb
+ store atomic i128 %in, i128* %p unordered, align 16
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll
new file mode 100644
index 0000000..aa9b284
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-atomic.ll
@@ -0,0 +1,331 @@
+; RUN: llc < %s -march=arm64 -verify-machineinstrs -mcpu=cyclone | FileCheck %s
+
+define i32 @val_compare_and_swap(i32* %p) {
+; CHECK-LABEL: 
val_compare_and_swap: +; CHECK: orr [[NEWVAL_REG:w[0-9]+]], wzr, #0x4 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr [[RESULT:w[0-9]+]], [x0] +; CHECK: cmp [[RESULT]], #7 +; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: [[LABEL2]]: + %val = cmpxchg i32* %p, i32 7, i32 4 acquire acquire + ret i32 %val +} + +define i64 @val_compare_and_swap_64(i64* %p) { +; CHECK-LABEL: val_compare_and_swap_64: +; CHECK: orr w[[NEWVAL_REG:[0-9]+]], wzr, #0x4 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxr [[RESULT:x[0-9]+]], [x0] +; CHECK: cmp [[RESULT]], #7 +; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK-NOT: stxr x[[NEWVAL_REG]], x[[NEWVAL_REG]] +; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], x[[NEWVAL_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: [[LABEL2]]: + %val = cmpxchg i64* %p, i64 7, i64 4 monotonic monotonic + ret i64 %val +} + +define i32 @fetch_and_nand(i32* %p) { +; CHECK-LABEL: fetch_and_nand: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxr w[[DEST_REG:[0-9]+]], [x0] +; CHECK: and [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], #0xfffffff8 +; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] +; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: mov x0, x[[DEST_REG]] + %val = atomicrmw nand i32* %p, i32 7 release + ret i32 %val +} + +define i64 @fetch_and_nand_64(i64* %p) { +; CHECK-LABEL: fetch_and_nand_64: +; CHECK: mov x[[ADDR:[0-9]+]], x0 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]] +; CHECK: and [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0xfffffffffffffff8 +; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] + + %val = atomicrmw nand i64* %p, i64 7 acq_rel + ret i64 %val +} + +define i32 @fetch_and_or(i32* %p) { +; CHECK-LABEL: fetch_and_or: +; CHECK: movz [[OLDVAL_REG:w[0-9]+]], #0x5 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr w[[DEST_REG:[0-9]+]], [x0] +; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], [[OLDVAL_REG]] +; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] +; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: mov x0, x[[DEST_REG]] + %val = atomicrmw or i32* %p, i32 5 seq_cst + ret i32 %val +} + +define i64 @fetch_and_or_64(i64* %p) { +; CHECK: fetch_and_or_64: +; CHECK: mov x[[ADDR:[0-9]+]], x0 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]] +; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0x7 +; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] + %val = atomicrmw or i64* %p, i64 7 monotonic + ret i64 %val +} + +define void @acquire_fence() { + fence acquire + ret void + ; CHECK-LABEL: acquire_fence: + ; CHECK: dmb ishld +} + +define void @release_fence() { + fence release + ret void + ; CHECK-LABEL: release_fence: + ; CHECK: dmb ish{{$}} +} + +define void @seq_cst_fence() { + fence seq_cst + ret void + ; CHECK-LABEL: seq_cst_fence: + ; CHECK: dmb ish{{$}} +} + +define i32 @atomic_load(i32* %p) { + %r = load atomic i32* %p seq_cst, align 4 + ret i32 %r + ; CHECK-LABEL: atomic_load: + ; CHECK: ldar +} + +define i8 @atomic_load_relaxed_8(i8* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_8: + %ptr_unsigned = getelementptr i8* %p, i32 4095 + %val_unsigned = load atomic i8* %ptr_unsigned 
monotonic, align 1 +; CHECK: ldrb {{w[0-9]+}}, [x0, #4095] + + %ptr_regoff = getelementptr i8* %p, i32 %off32 + %val_regoff = load atomic i8* %ptr_regoff unordered, align 1 + %tot1 = add i8 %val_unsigned, %val_regoff +; CHECK: ldrb {{w[0-9]+}}, [x0, w1, sxtw] + + %ptr_unscaled = getelementptr i8* %p, i32 -256 + %val_unscaled = load atomic i8* %ptr_unscaled monotonic, align 1 + %tot2 = add i8 %tot1, %val_unscaled +; CHECK: ldurb {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm) + %val_random = load atomic i8* %ptr_random unordered, align 1 + %tot3 = add i8 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: ldrb {{w[0-9]+}}, [x[[ADDR]]] + + ret i8 %tot3 +} + +define i16 @atomic_load_relaxed_16(i16* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_16: + %ptr_unsigned = getelementptr i16* %p, i32 4095 + %val_unsigned = load atomic i16* %ptr_unsigned monotonic, align 2 +; CHECK: ldrh {{w[0-9]+}}, [x0, #8190] + + %ptr_regoff = getelementptr i16* %p, i32 %off32 + %val_regoff = load atomic i16* %ptr_regoff unordered, align 2 + %tot1 = add i16 %val_unsigned, %val_regoff +; CHECK: ldrh {{w[0-9]+}}, [x0, w1, sxtw #1] + + %ptr_unscaled = getelementptr i16* %p, i32 -128 + %val_unscaled = load atomic i16* %ptr_unscaled monotonic, align 2 + %tot2 = add i16 %tot1, %val_unscaled +; CHECK: ldurh {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. ADD imm) + %val_random = load atomic i16* %ptr_random unordered, align 2 + %tot3 = add i16 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: ldrh {{w[0-9]+}}, [x[[ADDR]]] + + ret i16 %tot3 +} + +define i32 @atomic_load_relaxed_32(i32* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_32: + %ptr_unsigned = getelementptr i32* %p, i32 4095 + %val_unsigned = load atomic i32* %ptr_unsigned monotonic, align 4 +; CHECK: ldr {{w[0-9]+}}, [x0, #16380] + + %ptr_regoff = getelementptr i32* %p, i32 %off32 + %val_regoff = load atomic i32* %ptr_regoff unordered, align 4 + %tot1 = add i32 %val_unsigned, %val_regoff +; CHECK: ldr {{w[0-9]+}}, [x0, w1, sxtw #2] + + %ptr_unscaled = getelementptr i32* %p, i32 -64 + %val_unscaled = load atomic i32* %ptr_unscaled monotonic, align 4 + %tot2 = add i32 %tot1, %val_unscaled +; CHECK: ldur {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. ADD imm) + %val_random = load atomic i32* %ptr_random unordered, align 4 + %tot3 = add i32 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: ldr {{w[0-9]+}}, [x[[ADDR]]] + + ret i32 %tot3 +} + +define i64 @atomic_load_relaxed_64(i64* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_64: + %ptr_unsigned = getelementptr i64* %p, i32 4095 + %val_unsigned = load atomic i64* %ptr_unsigned monotonic, align 8 +; CHECK: ldr {{x[0-9]+}}, [x0, #32760] + + %ptr_regoff = getelementptr i64* %p, i32 %off32 + %val_regoff = load atomic i64* %ptr_regoff unordered, align 8 + %tot1 = add i64 %val_unsigned, %val_regoff +; CHECK: ldr {{x[0-9]+}}, [x0, w1, sxtw #3] + + %ptr_unscaled = getelementptr i64* %p, i32 -32 + %val_unscaled = load atomic i64* %ptr_unscaled monotonic, align 8 + %tot2 = add i64 %tot1, %val_unscaled +; CHECK: ldur {{x[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. 
ADD imm)
+ %val_random = load atomic i64* %ptr_random unordered, align 8
+ %tot3 = add i64 %tot2, %val_random
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
+; CHECK: ldr {{x[0-9]+}}, [x[[ADDR]]]
+
+ ret i64 %tot3
+}
+
+
+define void @atomic_store(i32* %p) {
+ store atomic i32 4, i32* %p seq_cst, align 4
+ ret void
+ ; CHECK-LABEL: atomic_store:
+ ; CHECK: stlr
+}
+
+define void @atomic_store_relaxed_8(i8* %p, i32 %off32, i8 %val) {
+; CHECK-LABEL: atomic_store_relaxed_8:
+ %ptr_unsigned = getelementptr i8* %p, i32 4095
+ store atomic i8 %val, i8* %ptr_unsigned monotonic, align 1
+; CHECK: strb {{w[0-9]+}}, [x0, #4095]
+
+ %ptr_regoff = getelementptr i8* %p, i32 %off32
+ store atomic i8 %val, i8* %ptr_regoff unordered, align 1
+; CHECK: strb {{w[0-9]+}}, [x0, w1, sxtw]
+
+ %ptr_unscaled = getelementptr i8* %p, i32 -256
+ store atomic i8 %val, i8* %ptr_unscaled monotonic, align 1
+; CHECK: sturb {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm)
+ store atomic i8 %val, i8* %ptr_random unordered, align 1
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
+; CHECK: strb {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret void
+}
+
+define void @atomic_store_relaxed_16(i16* %p, i32 %off32, i16 %val) {
+; CHECK-LABEL: atomic_store_relaxed_16:
+ %ptr_unsigned = getelementptr i16* %p, i32 4095
+ store atomic i16 %val, i16* %ptr_unsigned monotonic, align 2
+; CHECK: strh {{w[0-9]+}}, [x0, #8190]
+
+ %ptr_regoff = getelementptr i16* %p, i32 %off32
+ store atomic i16 %val, i16* %ptr_regoff unordered, align 2
+; CHECK: strh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+ %ptr_unscaled = getelementptr i16* %p, i32 -128
+ store atomic i16 %val, i16* %ptr_unscaled monotonic, align 2
+; CHECK: sturh {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. ADD imm)
+ store atomic i16 %val, i16* %ptr_random unordered, align 2
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
+; CHECK: strh {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret void
+}
+
+define void @atomic_store_relaxed_32(i32* %p, i32 %off32, i32 %val) {
+; CHECK-LABEL: atomic_store_relaxed_32:
+ %ptr_unsigned = getelementptr i32* %p, i32 4095
+ store atomic i32 %val, i32* %ptr_unsigned monotonic, align 4
+; CHECK: str {{w[0-9]+}}, [x0, #16380]
+
+ %ptr_regoff = getelementptr i32* %p, i32 %off32
+ store atomic i32 %val, i32* %ptr_regoff unordered, align 4
+; CHECK: str {{w[0-9]+}}, [x0, w1, sxtw #2]
+
+ %ptr_unscaled = getelementptr i32* %p, i32 -64
+ store atomic i32 %val, i32* %ptr_unscaled monotonic, align 4
+; CHECK: stur {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. ADD imm)
+ store atomic i32 %val, i32* %ptr_random unordered, align 4
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
+; CHECK: str {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret void
+}
+
+define void @atomic_store_relaxed_64(i64* %p, i32 %off32, i64 %val) {
+; CHECK-LABEL: atomic_store_relaxed_64:
+ %ptr_unsigned = getelementptr i64* %p, i32 4095
+ store atomic i64 %val, i64* %ptr_unsigned monotonic, align 8
+; CHECK: str {{x[0-9]+}}, [x0, #32760]
+
+ %ptr_regoff = getelementptr i64* %p, i32 %off32
+ store atomic i64 %val, i64* %ptr_regoff unordered, align 8
+; CHECK: str {{x[0-9]+}}, [x0, w1, sxtw #3]
+
+ %ptr_unscaled = getelementptr i64* %p, i32 -32
+ store atomic i64 %val, i64* %ptr_unscaled monotonic, align 8
+; CHECK: stur {{x[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. 
ADD imm) + store atomic i64 %val, i64* %ptr_random unordered, align 8 +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: str {{x[0-9]+}}, [x[[ADDR]]] + + ret void +} + +; rdar://11531169 +; rdar://11531308 + +%"class.X::Atomic" = type { %struct.x_atomic_t } +%struct.x_atomic_t = type { i32 } + +@counter = external hidden global %"class.X::Atomic", align 4 + +define i32 @next_id() nounwind optsize ssp align 2 { +entry: + %0 = atomicrmw add i32* getelementptr inbounds (%"class.X::Atomic"* @counter, i64 0, i32 0, i32 0), i32 1 seq_cst + %add.i = add i32 %0, 1 + %tobool = icmp eq i32 %add.i, 0 + br i1 %tobool, label %if.else, label %return + +if.else: ; preds = %entry + %1 = atomicrmw add i32* getelementptr inbounds (%"class.X::Atomic"* @counter, i64 0, i32 0, i32 0), i32 1 seq_cst + %add.i2 = add i32 %1, 1 + br label %return + +return: ; preds = %if.else, %entry + %retval.0 = phi i32 [ %add.i2, %if.else ], [ %add.i, %entry ] + ret i32 %retval.0 +} diff --git a/test/CodeGen/AArch64/arm64-basic-pic.ll b/test/CodeGen/AArch64/arm64-basic-pic.ll new file mode 100644 index 0000000..9fdb1e9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-basic-pic.ll @@ -0,0 +1,54 @@ +; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s + +@var = global i32 0 + +define i32 @get_globalvar() { +; CHECK-LABEL: get_globalvar: + + %val = load i32* @var +; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var +; CHECK: ldr x[[GOTLOC:[0-9]+]], [x[[GOTHI]], :got_lo12:var] +; CHECK: ldr w0, [x[[GOTLOC]]] + + ret i32 %val +} + +define i32* @get_globalvaraddr() { +; CHECK-LABEL: get_globalvaraddr: + + %val = load i32* @var +; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var +; CHECK: ldr x0, [x[[GOTHI]], :got_lo12:var] + + ret i32* @var +} + +@hiddenvar = hidden global i32 0 + +define i32 @get_hiddenvar() { +; CHECK-LABEL: get_hiddenvar: + + %val = load i32* @hiddenvar +; CHECK: adrp x[[HI:[0-9]+]], hiddenvar +; CHECK: ldr w0, [x[[HI]], :lo12:hiddenvar] + + ret i32 %val +} + +define i32* @get_hiddenvaraddr() { +; CHECK-LABEL: get_hiddenvaraddr: + + %val = load i32* @hiddenvar +; CHECK: adrp [[HI:x[0-9]+]], hiddenvar +; CHECK: add x0, [[HI]], :lo12:hiddenvar + + ret i32* @hiddenvar +} + +define void()* @get_func() { +; CHECK-LABEL: get_func: + + ret void()* bitcast(void()*()* @get_func to void()*) +; CHECK: adrp x[[GOTHI:[0-9]+]], :got:get_func +; CHECK: ldr x0, [x[[GOTHI]], :got_lo12:get_func] +} diff --git a/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll new file mode 100644 index 0000000..f0e968b --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll @@ -0,0 +1,1101 @@ +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O1 -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s + +; CHECK-LABEL: test_i64_f64: +define void @test_i64_f64(double* %p, i64* %q) { +; CHECK: ldr +; CHECK: str + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v1i64: +define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) { +; CHECK: ldr +; CHECK: str + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v2f32: +define void @test_i64_v2f32(<2 x float>* %p, i64* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 
v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = bitcast <2 x float> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v2i32: +define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v4i16: +define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: str + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v8i8: +define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: str + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_f64_i64: +define void @test_f64_i64(i64* %p, double* %q) { +; CHECK: ldr +; CHECK: str + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v1i64: +define void @test_f64_v1i64(<1 x i64>* %p, double* %q) { +; CHECK: ldr +; CHECK: str + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v2f32: +define void @test_f64_v2f32(<2 x float>* %p, double* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = bitcast <2 x float> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v2i32: +define void @test_f64_v2i32(<2 x i32>* %p, double* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v4i16: +define void @test_f64_v4i16(<4 x i16>* %p, double* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: str + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v8i8: +define void @test_f64_v8i8(<8 x i8>* %p, double* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: str + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_v1i64_i64: +define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) { +; CHECK: ldr +; CHECK: str + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_f64: +define void @test_v1i64_f64(double* %p, <1 x i64>* %q) { +; CHECK: ldr +; CHECK: str + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + 
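+; A worked example of the ld1/rev64/str pattern checked in these tests
+; (illustrative values, not part of any CHECK): big-endian memory holding
+; the <2 x i32> elements 0x00112233 and 0x44556677 reads back as the i64
+; 0x0011223344556677. ld1 puts element 0 into lane 0, the low 32 bits of
+; the register, but a scalar str writes the most significant byte to the
+; lowest address, so storing without correction would put 0x44 first.
+; rev64 swaps the two 32-bit lanes beforehand, making the i64 bitcast a
+; plain register reinterpretation; the same reasoning yields the rev32 and
+; rev16 seen below for other element sizes, while same-size ldr/str pairs
+; need no REV because both sides use the whole-register byte order.
+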
+; CHECK-LABEL: test_v1i64_v2f32: +define void @test_v1i64_v2f32(<2 x float>* %p, <1 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = bitcast <2 x float> %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v2i32: +define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v4i16: +define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: str + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v8i8: +define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: str + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_i64: +define void @test_v2f32_i64(i64* %p, <2 x float>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_f64: +define void @test_v2f32_f64(double* %p, <2 x float>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v1i64: +define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v2i32: +define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v4i16: +define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev32 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v8i8: +define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev32 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; 
CHECK-LABEL: test_v2i32_i64: +define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_f64: +define void @test_v2i32_f64(double* %p, <2 x i32>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v1i64: +define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v2f32: +define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = bitcast <2 x float> %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v4i16: +define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev32 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v8i8: +define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev32 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_i64: +define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_f64: +define void @test_v4i16_f64(double* %p, <4 x i16>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v1i64: +define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v2f32: +define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev32 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = bitcast <2 x float> %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v2i32: +define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x 
i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev32 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v8i8: +define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev16 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_i64: +define void @test_v8i8_i64(i64* %p, <8 x i8>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_f64: +define void @test_v8i8_f64(double* %p, <8 x i8>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v1i64: +define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v2f32: +define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev32 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = bitcast <2 x float> %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v2i32: +define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev32 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v4i16: +define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev16 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_f128_v2f64: +define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: ext +; CHECK: str + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v2i64: +define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: ext +; CHECK: str + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v4f32: +define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s 
+; CHECK: ext +; CHECK: str q + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v4i32: +define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: str + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v8i16: +define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: str + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v16i8: +define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: ext +; CHECK: str q + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_v2f64_f128: +define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) { +; CHECK: ldr +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v2i64: +define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v4f32: +define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v4i32: +define void @test_v2f64_v4i32(<4 x i32>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v8i16: +define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v16i8: +define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_f128: +define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) 
{ +; CHECK: ldr +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v2f64: +define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v4f32: +define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v4i32: +define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v8i16: +define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v16i8: +define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_f128: +define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) { +; CHECK: ldr q +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v2f64: +define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v2i64: +define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v4i32: +define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x i32>* %p + %2 = add 
<4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v8i16: +define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev32 v{{[0-9]+}}.8h +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v16i8: +define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev32 v{{[0-9]+}}.16b +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_f128: +define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v2f64: +define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v2i64: +define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v4f32: +define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v8i16: +define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev32 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v16i8: +define void @test_v4i32_v16i8(<16 x i8>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev32 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_f128: +define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: 
test_v8i16_v2f64: +define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v2i64: +define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v4f32: +define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev32 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v4i32: +define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev32 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v16i8: +define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev16 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_f128: +define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) { +; CHECK: ldr q +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v2f64: +define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v2i64: +define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v4f32: +define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev32 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v4i32: +define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev32 
v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v8i16: +define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev16 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} diff --git a/test/CodeGen/AArch64/arm64-big-endian-eh.ll b/test/CodeGen/AArch64/arm64-big-endian-eh.ll new file mode 100644 index 0000000..93e7da9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-eh.ll @@ -0,0 +1,73 @@ +; RUN: llc -mtriple arm64_be-linux-gnu -filetype obj < %s | llvm-objdump -s - | FileCheck %s + +; ARM EHABI for big endian +; This test case checks whether the CIE length record is laid out in big-endian format. +; +; This is the LLVM assembly generated from the following C++ code: +; +; extern void foo(int); +; void test(int a, int b) { +; try { +; foo(a); +; } catch (...) { +; foo(b); +; } +; } + +define void @_Z4testii(i32 %a, i32 %b) #0 { +entry: + invoke void @_Z3fooi(i32 %a) + to label %try.cont unwind label %lpad + +lpad: ; preds = %entry + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + %1 = extractvalue { i8*, i32 } %0, 0 + %2 = tail call i8* @__cxa_begin_catch(i8* %1) #2 + invoke void @_Z3fooi(i32 %b) + to label %invoke.cont2 unwind label %lpad1 + +invoke.cont2: ; preds = %lpad + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: ; preds = %entry, %invoke.cont2 + ret void + +lpad1: ; preds = %lpad + %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: ; preds = %lpad1 + resume { i8*, i32 } %3 + +terminate.lpad: ; preds = %lpad1 + %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + %5 = extractvalue { i8*, i32 } %4, 0 + tail call void @__clang_call_terminate(i8* %5) #3 + unreachable +} + +declare void @_Z3fooi(i32) #0 + +declare i32 @__gxx_personality_v0(...) + +declare i8* @__cxa_begin_catch(i8*) + +declare void @__cxa_end_catch() + +; Function Attrs: noinline noreturn nounwind +define linkonce_odr hidden void @__clang_call_terminate(i8*) #1 { + %2 = tail call i8* @__cxa_begin_catch(i8* %0) #2 + tail call void @_ZSt9terminatev() #3 + unreachable +} + +declare void @_ZSt9terminatev() + +; CHECK-LABEL: Contents of section .eh_frame: +; CHECK-NEXT: 0000 0000001c + diff --git a/test/CodeGen/AArch64/arm64-big-endian-varargs.ll b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll new file mode 100644 index 0000000..d7b26b9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll @@ -0,0 +1,58 @@ +; RUN: llc < %s | FileCheck %s + +; Vararg saving must save Q registers using the equivalent of STR/STP. + +target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128" +target triple = "arm64_be-arm-none-eabi" + +%struct.__va_list = type { i8*, i8*, i8*, i32, i32 } + +declare void @llvm.va_start(i8*) nounwind +declare void @llvm.va_end(i8*) nounwind + +define double @callee(i32 %a, ...)
{ +; CHECK: stp +; CHECK: stp +; CHECK: stp +; CHECK: stp +; CHECK: stp +; CHECK: stp +entry: + %vl = alloca %struct.__va_list, align 8 + %vl1 = bitcast %struct.__va_list* %vl to i8* + call void @llvm.va_start(i8* %vl1) + %vr_offs_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 4 + %vr_offs = load i32* %vr_offs_p, align 4 + %0 = icmp sgt i32 %vr_offs, -1 + br i1 %0, label %vaarg.on_stack, label %vaarg.maybe_reg + +vaarg.maybe_reg: ; preds = %entry + %new_reg_offs = add i32 %vr_offs, 16 + store i32 %new_reg_offs, i32* %vr_offs_p, align 4 + %inreg = icmp slt i32 %new_reg_offs, 1 + br i1 %inreg, label %vaarg.in_reg, label %vaarg.on_stack + +vaarg.in_reg: ; preds = %vaarg.maybe_reg + %reg_top_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 2 + %reg_top = load i8** %reg_top_p, align 8 + %1 = sext i32 %vr_offs to i64 + %2 = getelementptr i8* %reg_top, i64 %1 + %3 = ptrtoint i8* %2 to i64 + %align_be = add i64 %3, 8 + %4 = inttoptr i64 %align_be to i8* + br label %vaarg.end + +vaarg.on_stack: ; preds = %vaarg.maybe_reg, %entry + %stack_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 0 + %stack = load i8** %stack_p, align 8 + %new_stack = getelementptr i8* %stack, i64 8 + store i8* %new_stack, i8** %stack_p, align 8 + br label %vaarg.end + +vaarg.end: ; preds = %vaarg.on_stack, %vaarg.in_reg + %.sink = phi i8* [ %4, %vaarg.in_reg ], [ %stack, %vaarg.on_stack ] + %5 = bitcast i8* %.sink to double* + %6 = load double* %5, align 8 + call void @llvm.va_end(i8* %vl1) + ret double %6 +} diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll new file mode 100644 index 0000000..1dcccf1 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll @@ -0,0 +1,848 @@ +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -fast-isel=true -aarch64-load-store-opt=false -o - | FileCheck %s + +; CHECK-LABEL: test_i64_f64: +define i64 @test_i64_f64(double %p) { +; CHECK-NOT: rev + %1 = fadd double %p, %p + %2 = bitcast double %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v1i64: +define i64 @test_i64_v1i64(<1 x i64> %p) { +; CHECK-NOT: rev + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v2f32: +define i64 @test_i64_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v2i32: +define i64 @test_i64_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v4i16: +define i64 @test_i64_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v8i8: +define i64 @test_i64_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_f64_i64: +define double @test_f64_i64(i64 %p) { +; CHECK-NOT: rev + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v1i64: +define double @test_f64_v1i64(<1 x i64> %p) { +; CHECK-NOT: rev + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 
to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v2f32: +define double @test_f64_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v2i32: +define double @test_f64_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v4i16: +define double @test_f64_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v8i8: +define double @test_f64_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_v1i64_i64: +define <1 x i64> @test_v1i64_i64(i64 %p) { +; CHECK-NOT: rev + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_f64: +define <1 x i64> @test_v1i64_f64(double %p) { +; CHECK-NOT: rev + %1 = fadd double %p, %p + %2 = bitcast double %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_v2f32: +define <1 x i64> @test_v1i64_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_v2i32: +define <1 x i64> @test_v1i64_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_v4i16: +define <1 x i64> @test_v1i64_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_v8i8: +define <1 x i64> @test_v1i64_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v2f32_i64: +define <2 x float> @test_v2f32_i64(i64 %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_f64: +define <2 x float> @test_v2f32_f64(double %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd double %p, %p + %2 = bitcast double %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_v1i64: +define <2 x float> @test_v2f32_v1i64(<1 x i64> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_v2i32: +define <2 x float> @test_v2f32_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_v4i16: +define <2 x float> @test_v2f32_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to <2 x float> + %3 = fadd <2 x float> %2, 
%2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_v8i8: +define <2 x float> @test_v2f32_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2i32_i64: +define <2 x i32> @test_v2i32_i64(i64 %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_f64: +define <2 x i32> @test_v2i32_f64(double %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd double %p, %p + %2 = bitcast double %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_v1i64: +define <2 x i32> @test_v2i32_v1i64(<1 x i64> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_v2f32: +define <2 x i32> @test_v2i32_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_v4i16: +define <2 x i32> @test_v2i32_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_v8i8: +define <2 x i32> @test_v2i32_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v4i16_i64: +define <4 x i16> @test_v4i16_i64(i64 %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_f64: +define <4 x i16> @test_v4i16_f64(double %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = fadd double %p, %p + %2 = bitcast double %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_v1i64: +define <4 x i16> @test_v4i16_v1i64(<1 x i64> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_v2f32: +define <4 x i16> @test_v4i16_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_v2i32: +define <4 x i16> @test_v4i16_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_v8i8: +define <4 x i16> @test_v4i16_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v8i8_i64: +define <8 x i8> @test_v8i8_i64(i64 %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_f64: +define <8 x i8> 
@test_v8i8_f64(double %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = fadd double %p, %p + %2 = bitcast double %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_v1i64: +define <8 x i8> @test_v8i8_v1i64(<1 x i64> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_v2f32: +define <8 x i8> @test_v8i8_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_v2i32: +define <8 x i8> @test_v8i8_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_v4i16: +define <8 x i8> @test_v8i8_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_f128_v2f64: +define fp128 @test_f128_v2f64(<2 x double> %p) { +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v2i64: +define fp128 @test_f128_v2i64(<2 x i64> %p) { +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v4f32: +define fp128 @test_f128_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v4i32: +define fp128 @test_f128_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v8i16: +define fp128 @test_f128_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v16i8: +define fp128 @test_f128_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_v2f64_f128: +define <2 x double> @test_v2f64_f128(fp128 %p) { +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v2i64: +define <2 x double> @test_v2f64_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v4f32: +define <2 x double> @test_v2f64_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v4i32: +define <2 x double> @test_v2f64_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <2 x double> + %3 = fadd 
<2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v8i16: +define <2 x double> @test_v2f64_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v16i8: +define <2 x double> @test_v2f64_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2i64_f128: +define <2 x i64> @test_v2i64_f128(fp128 %p) { +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v2f64: +define <2 x i64> @test_v2i64_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v4f32: +define <2 x i64> @test_v2i64_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v4i32: +define <2 x i64> @test_v2i64_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v8i16: +define <2 x i64> @test_v2i64_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v16i8: +define <2 x i64> @test_v2i64_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v4f32_f128: +define <4 x float> @test_v4f32_f128(fp128 %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4f32_v2f64: +define <4 x float> @test_v4f32_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4f32_v2i64: +define <4 x float> @test_v4f32_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4f32_v4i32: +define <4 x float> @test_v4f32_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4f32_v8i16: +define <4 x float> @test_v4f32_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; 
CHECK-LABEL: test_v4f32_v16i8: +define <4 x float> @test_v4f32_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4i32_f128: +define <4 x i32> @test_v4i32_f128(fp128 %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v2f64: +define <4 x i32> @test_v4i32_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v2i64: +define <4 x i32> @test_v4i32_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v4f32: +define <4 x i32> @test_v4i32_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v8i16: +define <4 x i32> @test_v4i32_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v16i8: +define <4 x i32> @test_v4i32_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v8i16_f128: +define <8 x i16> @test_v8i16_f128(fp128 %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v2f64: +define <8 x i16> @test_v8i16_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v2i64: +define <8 x i16> @test_v8i16_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v4f32: +define <8 x i16> @test_v8i16_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v4i32: +define <8 x i16> @test_v8i16_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v16i8: +define <8 x i16> @test_v8i16_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = add 
<16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v16i8_f128: +define <16 x i8> @test_v16i8_f128(fp128 %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v2f64: +define <16 x i8> @test_v16i8_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v2i64: +define <16 x i8> @test_v16i8_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v4f32: +define <16 x i8> @test_v16i8_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v4i32: +define <16 x i8> @test_v16i8_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v8i16: +define <16 x i8> @test_v16i8_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll new file mode 100644 index 0000000..9a12b7a --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll @@ -0,0 +1,1100 @@ +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s + +; CHECK-LABEL: test_i64_f64: +declare i64 @test_i64_f64_helper(double %p) +define void @test_i64_f64(double* %p, i64* %q) { +; CHECK-NOT: rev + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call i64 @test_i64_f64_helper(double %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v1i64: +declare i64 @test_i64_v1i64_helper(<1 x i64> %p) +define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) { +; CHECK-NOT: rev + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call i64 @test_i64_v1i64_helper(<1 x i64> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v2f32: +declare i64 @test_i64_v2f32_helper(<2 x float> %p) +define void @test_i64_v2f32(<2 x float>* %p, i64* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call i64 @test_i64_v2f32_helper(<2 x float> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v2i32: +declare i64 @test_i64_v2i32_helper(<2 x i32> %p) +define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call i64 @test_i64_v2i32_helper(<2 x i32> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void 
+} + +; CHECK-LABEL: test_i64_v4i16: +declare i64 @test_i64_v4i16_helper(<4 x i16> %p) +define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call i64 @test_i64_v4i16_helper(<4 x i16> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v8i8: +declare i64 @test_i64_v8i8_helper(<8 x i8> %p) +define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call i64 @test_i64_v8i8_helper(<8 x i8> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_f64_i64: +declare double @test_f64_i64_helper(i64 %p) +define void @test_f64_i64(i64* %p, double* %q) { +; CHECK-NOT: rev + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call double @test_f64_i64_helper(i64 %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v1i64: +declare double @test_f64_v1i64_helper(<1 x i64> %p) +define void @test_f64_v1i64(<1 x i64>* %p, double* %q) { +; CHECK-NOT: rev + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call double @test_f64_v1i64_helper(<1 x i64> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v2f32: +declare double @test_f64_v2f32_helper(<2 x float> %p) +define void @test_f64_v2f32(<2 x float>* %p, double* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call double @test_f64_v2f32_helper(<2 x float> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v2i32: +declare double @test_f64_v2i32_helper(<2 x i32> %p) +define void @test_f64_v2i32(<2 x i32>* %p, double* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call double @test_f64_v2i32_helper(<2 x i32> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v4i16: +declare double @test_f64_v4i16_helper(<4 x i16> %p) +define void @test_f64_v4i16(<4 x i16>* %p, double* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call double @test_f64_v4i16_helper(<4 x i16> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v8i8: +declare double @test_f64_v8i8_helper(<8 x i8> %p) +define void @test_f64_v8i8(<8 x i8>* %p, double* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call double @test_f64_v8i8_helper(<8 x i8> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_v1i64_i64: +declare <1 x i64> @test_v1i64_i64_helper(i64 %p) +define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) { +; CHECK-NOT: rev + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <1 x i64> @test_v1i64_i64_helper(i64 %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_f64: +declare <1 x i64> @test_v1i64_f64_helper(double %p) +define void @test_v1i64_f64(double* %p, <1 x i64>* %q) { +; CHECK-NOT: rev + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <1 x i64> @test_v1i64_f64_helper(double %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v2f32: +declare <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %p) +define void @test_v1i64_v2f32(<2 x 
float>* %p, <1 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v2i32: +declare <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %p) +define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v4i16: +declare <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %p) +define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v8i8: +declare <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %p) +define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_i64: +declare <2 x float> @test_v2f32_i64_helper(i64 %p) +define void @test_v2f32_i64(i64* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <2 x float> @test_v2f32_i64_helper(i64 %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_f64: +declare <2 x float> @test_v2f32_f64_helper(double %p) +define void @test_v2f32_f64(double* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <2 x float> @test_v2f32_f64_helper(double %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v1i64: +declare <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %p) +define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v2i32: +declare <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %p) +define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v4i16: +declare <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %p) +define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v8i8: +declare <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %p) +define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b 
+; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_i64: +declare <2 x i32> @test_v2i32_i64_helper(i64 %p) +define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <2 x i32> @test_v2i32_i64_helper(i64 %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_f64: +declare <2 x i32> @test_v2i32_f64_helper(double %p) +define void @test_v2i32_f64(double* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <2 x i32> @test_v2i32_f64_helper(double %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v1i64: +declare <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %p) +define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v2f32: +declare <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %p) +define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v4i16: +declare <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %p) +define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v8i8: +declare <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %p) +define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_i64: +declare <4 x i16> @test_v4i16_i64_helper(i64 %p) +define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <4 x i16> @test_v4i16_i64_helper(i64 %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_f64: +declare <4 x i16> @test_v4i16_f64_helper(double %p) +define void @test_v4i16_f64(double* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <4 x i16> @test_v4i16_f64_helper(double %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v1i64: +declare <4 x i16> @test_v4i16_v1i64_helper(<1 x i64> %p) +define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call <4 x i16> @test_v4i16_v1i64_helper(<1 x i64> %2) 
+ %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v2f32: +declare <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %p) +define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v2i32: +declare <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %p) +define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v8i8: +declare <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %p) +define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_i64: +declare <8 x i8> @test_v8i8_i64_helper(i64 %p) +define void @test_v8i8_i64(i64* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <8 x i8> @test_v8i8_i64_helper(i64 %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_f64: +declare <8 x i8> @test_v8i8_f64_helper(double %p) +define void @test_v8i8_f64(double* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <8 x i8> @test_v8i8_f64_helper(double %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v1i64: +declare <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %p) +define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v2f32: +declare <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %p) +define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v2i32: +declare <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %p) +define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v4i16: +declare <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %p) +define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> 
%4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_f128_v2f64: +declare fp128 @test_f128_v2f64_helper(<2 x double> %p) +define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) { +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call fp128 @test_f128_v2f64_helper(<2 x double> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v2i64: +declare fp128 @test_f128_v2i64_helper(<2 x i64> %p) +define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) { +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call fp128 @test_f128_v2i64_helper(<2 x i64> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v4f32: +declare fp128 @test_f128_v4f32_helper(<4 x float> %p) +define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call fp128 @test_f128_v4f32_helper(<4 x float> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v4i32: +declare fp128 @test_f128_v4i32_helper(<4 x i32> %p) +define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call fp128 @test_f128_v4i32_helper(<4 x i32> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v8i16: +declare fp128 @test_f128_v8i16_helper(<8 x i16> %p) +define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call fp128 @test_f128_v8i16_helper(<8 x i16> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v16i8: +declare fp128 @test_f128_v16i8_helper(<16 x i8> %p) +define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call fp128 @test_f128_v16i8_helper(<16 x i8> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_v2f64_f128: +declare <2 x double> @test_v2f64_f128_helper(fp128 %p) +define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) { +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <2 x double> @test_v2f64_f128_helper(fp128 %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v2i64: +declare <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %p) +define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) { +; CHECK: ext +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v4f32: +declare <2 x double> @test_v2f64_v4f32_helper(<4 x float> %p) +define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <2 x double> @test_v2f64_v4f32_helper(<4 x float> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v4i32: +declare <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %p) +define void @test_v2f64_v4i32(<4 x i32>* %p, <2 x 
double>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v8i16: +declare <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %p) +define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v16i8: +declare <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %p) +define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_f128: +declare <2 x i64> @test_v2i64_f128_helper(fp128 %p) +define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) { +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <2 x i64> @test_v2i64_f128_helper(fp128 %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v2f64: +declare <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %p) +define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) { +; CHECK: ext +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v4f32: +declare <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %p) +define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v4i32: +declare <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %p) +define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v8i16: +declare <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %p) +define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v16i8: +declare <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %p) +define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_f128: 
+declare <4 x float> @test_v4f32_f128_helper(fp128 %p) +define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <4 x float> @test_v4f32_f128_helper(fp128 %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v2f64: +declare <4 x float> @test_v4f32_v2f64_helper(<2 x double> %p) +define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <4 x float> @test_v4f32_v2f64_helper(<2 x double> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v2i64: +declare <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %p) +define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v4i32: +declare <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %p) +define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v8i16: +declare <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %p) +define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v16i8: +declare <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %p) +define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_f128: +declare <4 x i32> @test_v4i32_f128_helper(fp128 %p) +define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <4 x i32> @test_v4i32_f128_helper(fp128 %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v2f64: +declare <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %p) +define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v2i64: +declare <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %p) +define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x i32>* %q) { +; CHECK: ext +; CHECK: rev64 
v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v4f32: +declare <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %p) +define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v8i16: +declare <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %p) +define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v16i8: +declare <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %p) +define void @test_v4i32_v16i8(<16 x i8>* %p, <4 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_f128: +declare <8 x i16> @test_v8i16_f128_helper(fp128 %p) +define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <8 x i16> @test_v8i16_f128_helper(fp128 %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v2f64: +declare <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %p) +define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v2i64: +declare <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %p) +define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v4f32: +declare <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %p) +define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v4i32: +declare <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %p) +define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call <8 x i16> 
@test_v8i16_v4i32_helper(<4 x i32> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v16i8: +declare <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %p) +define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_f128: +declare <16 x i8> @test_v16i8_f128_helper(fp128 %p) +define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <16 x i8> @test_v16i8_f128_helper(fp128 %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v2f64: +declare <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %p) +define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v2i64: +declare <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %p) +define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v4f32: +declare <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %p) +define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v4i32: +declare <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %p) +define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v8i16: +declare <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %p) +define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} diff --git a/test/CodeGen/AArch64/arm64-big-imm-offsets.ll b/test/CodeGen/AArch64/arm64-big-imm-offsets.ll new file mode 100644 index 0000000..a56df07 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-imm-offsets.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=arm64 < %s + + +; Make sure large offsets aren't mistaken for valid immediate offsets. 
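+; (Illustrative note, assuming standard AArch64 addressing modes: a 32-bit
+; str can fold at most a 12-bit unsigned offset scaled by 4, i.e. 0..16380,
+; and stur reaches only -256..255. The offset below, 25769803792 =
+; 0x600000010, is far outside both ranges, so it must first be materialized
+; in a register, roughly `movz x8, #0x10; movk x8, #0x6, lsl #32`.)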
+;
+define void @f(i32* nocapture %p) {
+entry:
+  %a = ptrtoint i32* %p to i64
+  %ao = add i64 %a, 25769803792
+  %b = inttoptr i64 %ao to i32*
+  store volatile i32 0, i32* %b, align 4
+  store volatile i32 0, i32* %b, align 4
+  ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-big-stack.ll b/test/CodeGen/AArch64/arm64-big-stack.ll
new file mode 100644
index 0000000..3f91bb3c2
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-big-stack.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-apple-macosx10"
+
+; Check that big stacks are generated correctly.
+; Currently, this is done by a sequence of sub instructions, each of which
+; can encode an immediate with a 12-bit mask and an optional left shift
+; (by 12). I.e., 16773120 (4095 << 12) is the biggest value.
+;
+; CHECK-LABEL: foo:
+; CHECK: sub sp, sp, #4095, lsl #12
+; CHECK: sub sp, sp, #4095, lsl #12
+; CHECK: sub sp, sp, #2, lsl #12
+define void @foo() nounwind ssp {
+entry:
+  %buffer = alloca [33554432 x i8], align 1
+  %arraydecay = getelementptr inbounds [33554432 x i8]* %buffer, i64 0, i64 0
+  call void @doit(i8* %arraydecay) nounwind
+  ret void
+}
+
+declare void @doit(i8*)
diff --git a/test/CodeGen/AArch64/arm64-bitfield-extract.ll b/test/CodeGen/AArch64/arm64-bitfield-extract.ll
new file mode 100644
index 0000000..112efdd
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-bitfield-extract.ll
@@ -0,0 +1,532 @@
+; RUN: opt -codegenprepare -mtriple=arm64-apple-ios -S -o - %s | FileCheck --check-prefix=OPT %s
+; RUN: llc < %s -march=arm64 | FileCheck %s
+%struct.X = type { i8, i8, [2 x i8] }
+%struct.Y = type { i32, i8 }
+%struct.Z = type { i8, i8, [2 x i8], i16 }
+%struct.A = type { i64, i8 }
+
+define void @foo(%struct.X* nocapture %x, %struct.Y* nocapture %y) nounwind optsize ssp {
+; CHECK-LABEL: foo:
+; CHECK: ubfx
+; CHECK-NOT: and
+; CHECK: ret
+
+  %tmp = bitcast %struct.X* %x to i32*
+  %tmp1 = load i32* %tmp, align 4
+  %b = getelementptr inbounds %struct.Y* %y, i64 0, i32 1
+  %bf.clear = lshr i32 %tmp1, 3
+  %bf.clear.lobit = and i32 %bf.clear, 1
+  %frombool = trunc i32 %bf.clear.lobit to i8
+  store i8 %frombool, i8* %b, align 1
+  ret void
+}
+
+define i32 @baz(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: baz:
+; CHECK: sbfx w0, w0, #0, #4
+  %tmp = trunc i64 %cav1.coerce to i32
+  %tmp1 = shl i32 %tmp, 28
+  %bf.val.sext = ashr exact i32 %tmp1, 28
+  ret i32 %bf.val.sext
+}
+
+define i32 @bar(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: sbfx w0, w0, #4, #6
+  %tmp = trunc i64 %cav1.coerce to i32
+  %cav1.sroa.0.1.insert = shl i32 %tmp, 22
+  %tmp1 = ashr i32 %cav1.sroa.0.1.insert, 26
+  ret i32 %tmp1
+}
+
+define void @fct1(%struct.Z* nocapture %x, %struct.A* nocapture %y) nounwind optsize ssp {
+; CHECK-LABEL: fct1:
+; CHECK: ubfx
+; CHECK-NOT: and
+; CHECK: ret
+
+  %tmp = bitcast %struct.Z* %x to i64*
+  %tmp1 = load i64* %tmp, align 4
+  %b = getelementptr inbounds %struct.A* %y, i64 0, i32 0
+  %bf.clear = lshr i64 %tmp1, 3
+  %bf.clear.lobit = and i64 %bf.clear, 1
+  store i64 %bf.clear.lobit, i64* %b, align 8
+  ret void
+}
+
+define i64 @fct2(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: fct2:
+; CHECK: sbfx x0, x0, #0, #36
+  %tmp = shl i64 %cav1.coerce, 28
+  %bf.val.sext = ashr exact i64 %tmp, 28
+  ret i64 %bf.val.sext
+}
+
+define i64 @fct3(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: fct3:
+; CHECK: sbfx x0, x0, #4, #38
+  %cav1.sroa.0.1.insert = shl i64 %cav1.coerce, 22
+  %tmp1 = ashr i64 %cav1.sroa.0.1.insert, 26
+  ret i64 %tmp1
+}
+
+define void @fct4(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry: +; CHECK-LABEL: fct4: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #24 +; CHECK-NEXT: str [[REG1]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -16777216 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 16777215 + %or = or i64 %and, %and1 + store i64 %or, i64* %y, align 8 + ret void +} + +define void @fct5(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct5: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3 +; CHECK-NEXT: str [[REG1]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + store i32 %or, i32* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some low bits +define void @fct6(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct6: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shr1 = lshr i32 %or, 2 + store i32 %shr1, i32* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +define void @fct7(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct7: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + store i32 %shl, i32* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some low bits +; (i64 version) +define void @fct8(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct8: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shr1 = lshr i64 %or, 2 + store i64 %shr1, i64* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +; (i64 version) +define void @fct9(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct9: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + store i64 %shl, i64* %y, align 8 + ret void +} + +; Check if we can catch bfm instruction when lsb is 0 (i.e., no lshr) +; (i32 version) +define void @fct10(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct10: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #0, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; 
CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %and1 = and i32 %x, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + store i32 %shl, i32* %y, align 8 + ret void +} + +; Check if we can catch bfm instruction when lsb is 0 (i.e., no lshr) +; (i64 version) +define void @fct11(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct11: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #0, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %and1 = and i64 %x, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + store i64 %shl, i64* %y, align 8 + ret void +} + +define zeroext i1 @fct12bis(i32 %tmp2) unnamed_addr nounwind ssp align 2 { +; CHECK-LABEL: fct12bis: +; CHECK-NOT: and +; CHECK: ubfx w0, w0, #11, #1 + %and.i.i = and i32 %tmp2, 2048 + %tobool.i.i = icmp ne i32 %and.i.i, 0 + ret i1 %tobool.i.i +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +define void @fct12(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct12: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfx [[REG2:w[0-9]+]], [[REG1]], #2, #28 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + %shr2 = lshr i32 %shl, 4 + store i32 %shr2, i32* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +; (i64 version) +define void @fct13(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct13: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfx [[REG2:x[0-9]+]], [[REG1]], #2, #60 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + %shr2 = lshr i64 %shl, 4 + store i64 %shr2, i64* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +define void @fct14(i32* nocapture %y, i32 %x, i32 %x1) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct14: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #8 +; lsr is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #4 +; CHECK-NEXT: bfxil [[REG2]], w2, #5, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG3:w[0-9]+]], [[REG2]], #2 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -256 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 255 + %or = or i32 %and, %and1 + %shl = lshr i32 %or, 4 + %and2 = and i32 %shl, -8 + %shr1 = lshr i32 %x1, 5 + %and3 = and i32 %shr1, 7 + %or1 = or i32 %and2, %and3 + %shl1 = shl i32 %or1, 2 + store i32 %shl1, i32* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +; (i64 version) +define void @fct15(i64* nocapture %y, i64 %x, i64 %x1) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct15: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #8 +; lsr is an alias of 
ubfm +; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #4 +; CHECK-NEXT: bfxil [[REG2]], x2, #5, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG3:x[0-9]+]], [[REG2]], #2 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -256 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 255 + %or = or i64 %and, %and1 + %shl = lshr i64 %or, 4 + %and2 = and i64 %shl, -8 + %shr1 = lshr i64 %x1, 5 + %and3 = and i64 %shr1, 7 + %or1 = or i64 %and2, %and3 + %shl1 = shl i64 %or1, 2 + store i64 %shl1, i64* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits and a masking operation has to be kept +define void @fct16(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct16: +; CHECK: ldr [[REG1:w[0-9]+]], +; Create the constant +; CHECK: movz [[REGCST:w[0-9]+]], #0x1a, lsl #16 +; CHECK: movk [[REGCST]], #0x8160 +; Do the masking +; CHECK: and [[REG2:w[0-9]+]], [[REG1]], [[REGCST]] +; CHECK-NEXT: bfxil [[REG2]], w1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfx [[REG3:w[0-9]+]], [[REG2]], #2, #28 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, 1737056 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + %shr2 = lshr i32 %shl, 4 + store i32 %shr2, i32* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits and a masking operation has to be kept +; (i64 version) +define void @fct17(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct17: +; CHECK: ldr [[REG1:x[0-9]+]], +; Create the constant +; CHECK: movz w[[REGCST:[0-9]+]], #0x1a, lsl #16 +; CHECK: movk w[[REGCST]], #0x8160 +; Do the masking +; CHECK: and [[REG2:x[0-9]+]], [[REG1]], x[[REGCST]] +; CHECK-NEXT: bfxil [[REG2]], x1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfx [[REG3:x[0-9]+]], [[REG2]], #2, #60 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, 1737056 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + %shr2 = lshr i64 %shl, 4 + store i64 %shr2, i64* %y, align 8 + ret void +} + +define i64 @fct18(i32 %xor72) nounwind ssp { +; CHECK-LABEL: fct18: +; CHECK: ubfx x0, x0, #9, #8 + %shr81 = lshr i32 %xor72, 9 + %conv82 = zext i32 %shr81 to i64 + %result = and i64 %conv82, 255 + ret i64 %result +} + +; Using the access to the global array to keep the instruction and control flow. 
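+; (Recap of the mapping exercised by fct18 and the tests below: an
+; `lshr %x, lsb` whose result is masked with 2^width - 1 can be selected as
+; `ubfx dst, src, #lsb, #width`; fct18 uses lsb = 9 and 255 = 2^8 - 1,
+; giving the checked `ubfx x0, x0, #9, #8`.)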
+@first_ones = external global [65536 x i8]
+
+; Function Attrs: nounwind readonly ssp
+define i32 @fct19(i64 %arg1) nounwind readonly ssp {
+; CHECK-LABEL: fct19:
+entry:
+  %x.sroa.1.0.extract.shift = lshr i64 %arg1, 16
+  %x.sroa.1.0.extract.trunc = trunc i64 %x.sroa.1.0.extract.shift to i16
+  %x.sroa.3.0.extract.shift = lshr i64 %arg1, 32
+  %x.sroa.5.0.extract.shift = lshr i64 %arg1, 48
+  %tobool = icmp eq i64 %x.sroa.5.0.extract.shift, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %x.sroa.5.0.extract.shift
+  %0 = load i8* %arrayidx3, align 1
+  %conv = zext i8 %0 to i32
+  br label %return
+
+; OPT-LABEL: if.end
+if.end:                                           ; preds = %entry
+; OPT: lshr
+; CHECK: ubfx [[REG1:x[0-9]+]], [[REG2:x[0-9]+]], #32, #16
+  %x.sroa.3.0.extract.trunc = trunc i64 %x.sroa.3.0.extract.shift to i16
+  %tobool6 = icmp eq i16 %x.sroa.3.0.extract.trunc, 0
+; CHECK: cbz
+  br i1 %tobool6, label %if.end13, label %if.then7
+
+; OPT-LABEL: if.then7
+if.then7:                                         ; preds = %if.end
+; OPT: lshr
+; "and" should be combined to "ubfm" while "ubfm" should be removed by cse.
+; So neither of them should be in the assembly code.
+; CHECK-NOT: and
+; CHECK-NOT: ubfm
+  %idxprom10 = and i64 %x.sroa.3.0.extract.shift, 65535
+  %arrayidx11 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom10
+  %1 = load i8* %arrayidx11, align 1
+  %conv12 = zext i8 %1 to i32
+  %add = add nsw i32 %conv12, 16
+  br label %return
+
+; OPT-LABEL: if.end13
+if.end13:                                         ; preds = %if.end
+; OPT: lshr
+; OPT: trunc
+; CHECK: ubfx [[REG3:x[0-9]+]], [[REG4:x[0-9]+]], #16, #16
+  %tobool16 = icmp eq i16 %x.sroa.1.0.extract.trunc, 0
+; CHECK: cbz
+  br i1 %tobool16, label %return, label %if.then17
+
+; OPT-LABEL: if.then17
+if.then17:                                        ; preds = %if.end13
+; OPT: lshr
+; "and" should be combined to "ubfm" while "ubfm" should be removed by cse.
+; So neither of them should be in the assembly code.
+; CHECK-NOT: and
+; CHECK-NOT: ubfm
+  %idxprom20 = and i64 %x.sroa.1.0.extract.shift, 65535
+  %arrayidx21 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom20
+  %2 = load i8* %arrayidx21, align 1
+  %conv22 = zext i8 %2 to i32
+  %add23 = add nsw i32 %conv22, 32
+  br label %return
+
+return:                                           ; preds = %if.end13, %if.then17, %if.then7, %if.then
+; CHECK: ret
+  %retval.0 = phi i32 [ %conv, %if.then ], [ %add, %if.then7 ], [ %add23, %if.then17 ], [ 64, %if.end13 ]
+  ret i32 %retval.0
+}
+
+; Make sure we do not assert if the immediate in the `and` is bigger than i64.
+; PR19503.
+; OPT-LABEL: @fct20
+; OPT: lshr
+; OPT-NOT: lshr
+; OPT: ret
+; CHECK-LABEL: fct20:
+; CHECK: ret
+define i80 @fct20(i128 %a, i128 %b) {
+entry:
+  %shr = lshr i128 %a, 18
+  %conv = trunc i128 %shr to i80
+  %tobool = icmp eq i128 %b, 0
+  br i1 %tobool, label %then, label %end
+then:
+  %and = and i128 %shr, 483673642326615442599424
+  %conv2 = trunc i128 %and to i80
+  br label %end
+end:
+  %conv3 = phi i80 [%conv, %entry], [%conv2, %then]
+  ret i80 %conv3
+}
+
+; Check if we can still catch UBFX when "AND" is used by SHL.
+; CHECK-LABEL: fct21: +; CHECK: ubfx +@arr = external global [8 x [64 x i64]] +define i64 @fct21(i64 %x) { +entry: + %shr = lshr i64 %x, 4 + %and = and i64 %shr, 15 + %arrayidx = getelementptr inbounds [8 x [64 x i64]]* @arr, i64 0, i64 0, i64 %and + %0 = load i64* %arrayidx, align 8 + ret i64 %0 +} + +define i16 @test_ignored_rightbits(i32 %dst, i32 %in) { +; CHECK-LABEL: test_ignored_rightbits: + + %positioned_field = shl i32 %in, 3 + %positioned_masked_field = and i32 %positioned_field, 120 + %masked_dst = and i32 %dst, 7 + %insertion = or i32 %masked_dst, %positioned_masked_field +; CHECK: {{bfm|bfi|bfxil}} + + %shl16 = shl i32 %insertion, 8 + %or18 = or i32 %shl16, %insertion + %conv19 = trunc i32 %or18 to i16 +; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #8, #7 + + ret i16 %conv19 +} diff --git a/test/CodeGen/AArch64/arm64-blockaddress.ll b/test/CodeGen/AArch64/arm64-blockaddress.ll new file mode 100644 index 0000000..ac4f19e --- /dev/null +++ b/test/CodeGen/AArch64/arm64-blockaddress.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=CHECK-LINUX +; RUN: llc < %s -mtriple=arm64-linux-gnu -code-model=large| FileCheck %s --check-prefix=CHECK-LARGE + +; rdar://9188695 + +define i64 @t() nounwind ssp { +entry: +; CHECK-LABEL: t: +; CHECK: adrp [[REG:x[0-9]+]], Ltmp1@PAGE +; CHECK: add {{x[0-9]+}}, [[REG]], Ltmp1@PAGEOFF + +; CHECK-LINUX-LABEL: t: +; CHECK-LINUX: adrp [[REG:x[0-9]+]], .Ltmp1 +; CHECK-LINUX: add {{x[0-9]+}}, [[REG]], :lo12:.Ltmp1 + +; CHECK-LARGE-LABEL: t: +; CHECK-LARGE: movz [[ADDR_REG:x[0-9]+]], #:abs_g3:[[DEST_LBL:.Ltmp[0-9]+]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g2_nc:[[DEST_LBL]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g1_nc:[[DEST_LBL]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g0_nc:[[DEST_LBL]] + + %recover = alloca i64, align 8 + store volatile i64 ptrtoint (i8* blockaddress(@t, %mylabel) to i64), i64* %recover, align 8 + br label %mylabel + +mylabel: + %tmp = load volatile i64* %recover, align 8 + ret i64 %tmp +} diff --git a/test/CodeGen/AArch64/arm64-build-vector.ll b/test/CodeGen/AArch64/arm64-build-vector.ll new file mode 100644 index 0000000..c109263 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-build-vector.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +; Check that building up a vector w/ only one non-zero lane initializes +; intelligently. +define void @one_lane(i32* nocapture %out_int, i32 %skip0) nounwind { +; CHECK-LABEL: one_lane: +; CHECK: dup.16b v[[REG:[0-9]+]], wzr +; CHECK-NEXT: ins.b v[[REG]][0], w1 +; v and q are aliases, and str is preferred against st.16b when possible +; rdar://11246289 +; CHECK: str q[[REG]], [x0] +; CHECK: ret + %conv = trunc i32 %skip0 to i8 + %vset_lane = insertelement <16 x i8> , i8 %conv, i32 0 + %tmp = bitcast i32* %out_int to <4 x i32>* + %tmp1 = bitcast <16 x i8> %vset_lane to <4 x i32> + store <4 x i32> %tmp1, <4 x i32>* %tmp, align 16 + ret void +} + +; Check that building a vector from floats doesn't insert an unnecessary +; copy for lane zero. 
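+; (Context, assuming the usual AAPCS64 convention: %a..%d arrive in s0..s3,
+; i.e. lane 0 of v0..v3, so lane 0 of the result is already sitting in v0
+; and only lanes 1-3 need an ins, as the CHECK lines below verify.)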
+define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind { +; CHECK-LABEL: foo: +; CHECK-NOT: ins.s v0[0], v0[0] +; CHECK: ins.s v0[1], v1[0] +; CHECK: ins.s v0[2], v2[0] +; CHECK: ins.s v0[3], v3[0] +; CHECK: ret + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float %b, i32 1 + %3 = insertelement <4 x float> %2, float %c, i32 2 + %4 = insertelement <4 x float> %3, float %d, i32 3 + ret <4 x float> %4 +} diff --git a/test/CodeGen/AArch64/arm64-call-tailcalls.ll b/test/CodeGen/AArch64/arm64-call-tailcalls.ll new file mode 100644 index 0000000..487c1d9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-call-tailcalls.ll @@ -0,0 +1,91 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s + +@t = weak global i32 ()* null +@x = external global i32, align 4 + +define void @t2() { +; CHECK-LABEL: t2: +; CHECK: adrp x[[GOTADDR:[0-9]+]], _t@GOTPAGE +; CHECK: ldr x[[ADDR:[0-9]+]], [x[[GOTADDR]], _t@GOTPAGEOFF] +; CHECK: ldr x[[DEST:[0-9]+]], [x[[ADDR]]] +; CHECK: br x[[DEST]] + %tmp = load i32 ()** @t + %tmp.upgrd.2 = tail call i32 %tmp() + ret void +} + +define void @t3() { +; CHECK-LABEL: t3: +; CHECK: b _t2 + tail call void @t2() + ret void +} + +define double @t4(double %a) nounwind readonly ssp { +; CHECK-LABEL: t4: +; CHECK: b _sin + %tmp = tail call double @sin(double %a) nounwind readonly + ret double %tmp +} + +define float @t5(float %a) nounwind readonly ssp { +; CHECK-LABEL: t5: +; CHECK: b _sinf + %tmp = tail call float @sinf(float %a) nounwind readonly + ret float %tmp +} + +define void @t7() nounwind { +; CHECK-LABEL: t7: +; CHECK: b _foo +; CHECK: b _bar + + br i1 undef, label %bb, label %bb1.lr.ph + +bb1.lr.ph: ; preds = %entry + tail call void @bar() nounwind + ret void + +bb: ; preds = %entry + tail call void @foo() nounwind + ret void +} + +define i32 @t8(i32 %x) nounwind ssp { +; CHECK-LABEL: t8: +; CHECK: b _a +; CHECK: b _b +; CHECK: b _c + %and = and i32 %x, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %call = tail call i32 @a(i32 %x) nounwind + br label %return + +if.end: ; preds = %entry + %and1 = and i32 %x, 2 + %tobool2 = icmp eq i32 %and1, 0 + br i1 %tobool2, label %if.end5, label %if.then3 + +if.then3: ; preds = %if.end + %call4 = tail call i32 @b(i32 %x) nounwind + br label %return + +if.end5: ; preds = %if.end + %call6 = tail call i32 @c(i32 %x) nounwind + br label %return + +return: ; preds = %if.end5, %if.then3, %if.then + %retval.0 = phi i32 [ %call, %if.then ], [ %call4, %if.then3 ], [ %call6, %if.end5 ] + ret i32 %retval.0 +} + +declare float @sinf(float) nounwind readonly +declare double @sin(double) nounwind readonly +declare void @bar() nounwind +declare void @foo() nounwind +declare i32 @a(i32) +declare i32 @b(i32) +declare i32 @c(i32) diff --git a/test/CodeGen/AArch64/arm64-cast-opt.ll b/test/CodeGen/AArch64/arm64-cast-opt.ll new file mode 100644 index 0000000..65a871d --- /dev/null +++ b/test/CodeGen/AArch64/arm64-cast-opt.ll @@ -0,0 +1,31 @@ +; RUN: llc -O3 -march=arm64 -mtriple arm64-apple-ios5.0.0 < %s | FileCheck %s +; +; Zero truncation is not necessary when the values are extended properly +; already. 
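+; (Sketch of the expected lowering, hedged: the ugt compare becomes
+; `cmp` + `cset w[[R]], hi`, and since cset already writes 0 or 1 into the
+; full register, no extra `and w[[R]], w[[R]], #0xff` is needed to satisfy
+; the zeroext return; the CHECK-NOT below enforces exactly that.)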
+ +@block = common global i8* null, align 8 + +define zeroext i8 @foo(i32 %i1, i32 %i2) { +; CHECK-LABEL: foo: +; CHECK: cset +; CHECK-NOT: and +entry: + %idxprom = sext i32 %i1 to i64 + %0 = load i8** @block, align 8 + %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom + %1 = load i8* %arrayidx, align 1 + %idxprom1 = sext i32 %i2 to i64 + %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1 + %2 = load i8* %arrayidx2, align 1 + %cmp = icmp eq i8 %1, %2 + br i1 %cmp, label %return, label %if.then + +if.then: ; preds = %entry + %cmp7 = icmp ugt i8 %1, %2 + %conv9 = zext i1 %cmp7 to i8 + br label %return + +return: ; preds = %entry, %if.then + %retval.0 = phi i8 [ %conv9, %if.then ], [ 1, %entry ] + ret i8 %retval.0 +} diff --git a/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll new file mode 100644 index 0000000..664a26c --- /dev/null +++ b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll @@ -0,0 +1,190 @@ +; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -aarch64-ccmp | FileCheck %s +target triple = "arm64-apple-ios7.0.0" + +@channelColumns = external global i64 +@channelTracks = external global i64 +@mazeRoute = external hidden unnamed_addr global i8*, align 8 +@TOP = external global i64* +@BOT = external global i64* +@netsAssign = external global i64* + +; Function from yacr2/maze.c +; The branch at the end of %if.then is driven by %cmp5 and %cmp6. +; Isel converts the and i1 into two branches, and arm64-ccmp should not convert +; it back again. %cmp6 has much higher latency than %cmp5. +; CHECK: Maze1 +; CHECK: %if.then +; CHECK: cmp x{{[0-9]+}}, #2 +; CHECK-NEXT b.cc +; CHECK: %if.then +; CHECK: cmp x{{[0-9]+}}, #2 +; CHECK-NEXT b.cc +define i32 @Maze1() nounwind ssp { +entry: + %0 = load i64* @channelColumns, align 8, !tbaa !0 + %cmp90 = icmp eq i64 %0, 0 + br i1 %cmp90, label %for.end, label %for.body + +for.body: ; preds = %for.inc, %entry + %1 = phi i64 [ %0, %entry ], [ %37, %for.inc ] + %i.092 = phi i64 [ 1, %entry ], [ %inc53, %for.inc ] + %numLeft.091 = phi i32 [ 0, %entry ], [ %numLeft.1, %for.inc ] + %2 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx = getelementptr inbounds i8* %2, i64 %i.092 + %3 = load i8* %arrayidx, align 1, !tbaa !1 + %tobool = icmp eq i8 %3, 0 + br i1 %tobool, label %for.inc, label %if.then + +if.then: ; preds = %for.body + %4 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx1 = getelementptr inbounds i64* %4, i64 %i.092 + %5 = load i64* %arrayidx1, align 8, !tbaa !0 + %6 = load i64** @netsAssign, align 8, !tbaa !3 + %arrayidx2 = getelementptr inbounds i64* %6, i64 %5 + %7 = load i64* %arrayidx2, align 8, !tbaa !0 + %8 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx3 = getelementptr inbounds i64* %8, i64 %i.092 + %9 = load i64* %arrayidx3, align 8, !tbaa !0 + %arrayidx4 = getelementptr inbounds i64* %6, i64 %9 + %10 = load i64* %arrayidx4, align 8, !tbaa !0 + %cmp5 = icmp ugt i64 %i.092, 1 + %cmp6 = icmp ugt i64 %10, 1 + %or.cond = and i1 %cmp5, %cmp6 + br i1 %or.cond, label %land.lhs.true7, label %if.else + +land.lhs.true7: ; preds = %if.then + %11 = load i64* @channelTracks, align 8, !tbaa !0 + %add = add i64 %11, 1 + %call = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 %add, i64 %10, i64 0, i64 %7, i32 -1, i32 -1) + %tobool8 = icmp eq i32 %call, 0 + br i1 %tobool8, label %land.lhs.true7.if.else_crit_edge, label %if.then9 + +land.lhs.true7.if.else_crit_edge: ; preds = %land.lhs.true7 + %.pre = load i64* @channelColumns, align 8, !tbaa !0 + br label %if.else + +if.then9: ; preds = 
%land.lhs.true7 + %12 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx10 = getelementptr inbounds i8* %12, i64 %i.092 + store i8 0, i8* %arrayidx10, align 1, !tbaa !1 + %13 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx11 = getelementptr inbounds i64* %13, i64 %i.092 + %14 = load i64* %arrayidx11, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %14) + %15 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx12 = getelementptr inbounds i64* %15, i64 %i.092 + %16 = load i64* %arrayidx12, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %16) + br label %for.inc + +if.else: ; preds = %land.lhs.true7.if.else_crit_edge, %if.then + %17 = phi i64 [ %.pre, %land.lhs.true7.if.else_crit_edge ], [ %1, %if.then ] + %cmp13 = icmp ult i64 %i.092, %17 + %or.cond89 = and i1 %cmp13, %cmp6 + br i1 %or.cond89, label %land.lhs.true16, label %if.else24 + +land.lhs.true16: ; preds = %if.else + %18 = load i64* @channelTracks, align 8, !tbaa !0 + %add17 = add i64 %18, 1 + %call18 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 %add17, i64 %10, i64 0, i64 %7, i32 1, i32 -1) + %tobool19 = icmp eq i32 %call18, 0 + br i1 %tobool19, label %if.else24, label %if.then20 + +if.then20: ; preds = %land.lhs.true16 + %19 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx21 = getelementptr inbounds i8* %19, i64 %i.092 + store i8 0, i8* %arrayidx21, align 1, !tbaa !1 + %20 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx22 = getelementptr inbounds i64* %20, i64 %i.092 + %21 = load i64* %arrayidx22, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %21) + %22 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx23 = getelementptr inbounds i64* %22, i64 %i.092 + %23 = load i64* %arrayidx23, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %23) + br label %for.inc + +if.else24: ; preds = %land.lhs.true16, %if.else + br i1 %cmp5, label %land.lhs.true26, label %if.else36 + +land.lhs.true26: ; preds = %if.else24 + %24 = load i64* @channelTracks, align 8, !tbaa !0 + %cmp27 = icmp ult i64 %7, %24 + br i1 %cmp27, label %land.lhs.true28, label %if.else36 + +land.lhs.true28: ; preds = %land.lhs.true26 + %add29 = add i64 %24, 1 + %call30 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 0, i64 %7, i64 %add29, i64 %10, i32 -1, i32 1) + %tobool31 = icmp eq i32 %call30, 0 + br i1 %tobool31, label %if.else36, label %if.then32 + +if.then32: ; preds = %land.lhs.true28 + %25 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx33 = getelementptr inbounds i8* %25, i64 %i.092 + store i8 0, i8* %arrayidx33, align 1, !tbaa !1 + %26 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx34 = getelementptr inbounds i64* %26, i64 %i.092 + %27 = load i64* %arrayidx34, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %27) + %28 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx35 = getelementptr inbounds i64* %28, i64 %i.092 + %29 = load i64* %arrayidx35, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %29) + br label %for.inc + +if.else36: ; preds = %land.lhs.true28, %land.lhs.true26, %if.else24 + %30 = load i64* @channelColumns, align 8, !tbaa !0 + %cmp37 = icmp ult i64 %i.092, %30 + br i1 %cmp37, label %land.lhs.true38, label %if.else48 + +land.lhs.true38: ; preds = %if.else36 + %31 = load i64* @channelTracks, align 8, !tbaa !0 + %cmp39 = icmp ult i64 %7, %31 + br i1 %cmp39, label %land.lhs.true40, label %if.else48 + +land.lhs.true40: ; preds = %land.lhs.true38 + %add41 = add i64 %31, 1 + %call42 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 0, i64 %7, i64 %add41, i64 %10, i32 1, i32 1) + %tobool43 = icmp eq i32 
%call42, 0 + br i1 %tobool43, label %if.else48, label %if.then44 + +if.then44: ; preds = %land.lhs.true40 + %32 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx45 = getelementptr inbounds i8* %32, i64 %i.092 + store i8 0, i8* %arrayidx45, align 1, !tbaa !1 + %33 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx46 = getelementptr inbounds i64* %33, i64 %i.092 + %34 = load i64* %arrayidx46, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %34) + %35 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx47 = getelementptr inbounds i64* %35, i64 %i.092 + %36 = load i64* %arrayidx47, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %36) + br label %for.inc + +if.else48: ; preds = %land.lhs.true40, %land.lhs.true38, %if.else36 + %inc = add nsw i32 %numLeft.091, 1 + br label %for.inc + +for.inc: ; preds = %if.else48, %if.then44, %if.then32, %if.then20, %if.then9, %for.body + %numLeft.1 = phi i32 [ %numLeft.091, %if.then9 ], [ %numLeft.091, %if.then20 ], [ %numLeft.091, %if.then32 ], [ %numLeft.091, %if.then44 ], [ %inc, %if.else48 ], [ %numLeft.091, %for.body ] + %inc53 = add i64 %i.092, 1 + %37 = load i64* @channelColumns, align 8, !tbaa !0 + %cmp = icmp ugt i64 %inc53, %37 + br i1 %cmp, label %for.end, label %for.body + +for.end: ; preds = %for.inc, %entry + %numLeft.0.lcssa = phi i32 [ 0, %entry ], [ %numLeft.1, %for.inc ] + ret i32 %numLeft.0.lcssa +} + +; Materializable +declare hidden fastcc i32 @Maze1Mech(i64, i64, i64, i64, i64, i32, i32) nounwind ssp + +; Materializable +declare hidden fastcc void @CleanNet(i64) nounwind ssp + +!0 = metadata !{metadata !"long", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"any pointer", metadata !1} diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll new file mode 100644 index 0000000..63965f9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-ccmp.ll @@ -0,0 +1,289 @@ +; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -aarch64-ccmp -aarch64-stress-ccmp | FileCheck %s +target triple = "arm64-apple-ios" + +; CHECK: single_same +; CHECK: cmp w0, #5 +; CHECK-NEXT: ccmp w1, #17, #4, ne +; CHECK-NEXT: b.ne +; CHECK: %if.then +; CHECK: bl _foo +; CHECK: %if.end +define i32 @single_same(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + %cmp1 = icmp eq i32 %b, 17 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Different condition codes for the two compares. +; CHECK: single_different +; CHECK: cmp w0, #6 +; CHECK-NEXT: ccmp w1, #17, #0, ge +; CHECK-NEXT: b.eq +; CHECK: %if.then +; CHECK: bl _foo +; CHECK: %if.end +define i32 @single_different(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp sle i32 %a, 5 + %cmp1 = icmp ne i32 %b, 17 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Second block clobbers the flags, can't convert (easily). 
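+; (Why conversion fails here, as the editor reads it: the select in
+; %lor.lhs.false lowers to its own cmp + csel, redefining NZCV between the
+; two compares of the chain, so forming a ccmp would require recomputing the
+; first compare's flags; the pass conservatively keeps both branches.)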
+; CHECK: single_flagclobber +; CHECK: cmp +; CHECK: b.eq +; CHECK: cmp +; CHECK: b.gt +define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + br i1 %cmp, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %cmp1 = icmp slt i32 %b, 7 + %mul = shl nsw i32 %b, 1 + %add = add nsw i32 %b, 1 + %cond = select i1 %cmp1, i32 %mul, i32 %add + %cmp2 = icmp slt i32 %cond, 17 + br i1 %cmp2, label %if.then, label %if.end + +if.then: ; preds = %lor.lhs.false, %entry + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: ; preds = %if.then, %lor.lhs.false + ret i32 7 +} + +; Second block clobbers the flags and ends with a tbz terminator. +; CHECK: single_flagclobber_tbz +; CHECK: cmp +; CHECK: b.eq +; CHECK: cmp +; CHECK: tbz +define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + br i1 %cmp, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %cmp1 = icmp slt i32 %b, 7 + %mul = shl nsw i32 %b, 1 + %add = add nsw i32 %b, 1 + %cond = select i1 %cmp1, i32 %mul, i32 %add + %and = and i32 %cond, 8 + %cmp2 = icmp ne i32 %and, 0 + br i1 %cmp2, label %if.then, label %if.end + +if.then: ; preds = %lor.lhs.false, %entry + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: ; preds = %if.then, %lor.lhs.false + ret i32 7 +} + +; Speculatively execute division by zero. +; The sdiv/udiv instructions do not trap when the divisor is zero, so they are +; safe to speculate. +; CHECK: speculate_division +; CHECK-NOT: cmp +; CHECK: sdiv +; CHECK: cmp +; CHECK-NEXT: ccmp +define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp sgt i32 %a, 0 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: + %div = sdiv i32 %b, %a + %cmp1 = icmp slt i32 %div, 17 + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Floating point compare. +; CHECK: single_fcmp +; CHECK: cmp +; CHECK-NOT: b. +; CHECK: fccmp {{.*}}, #8, ge +; CHECK: b.lt +define i32 @single_fcmp(i32 %a, float %b) nounwind ssp { +entry: + %cmp = icmp sgt i32 %a, 0 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: + %conv = sitofp i32 %a to float + %div = fdiv float %b, %conv + %cmp1 = fcmp oge float %div, 1.700000e+01 + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Chain multiple compares. +; CHECK: multi_different +; CHECK: cmp +; CHECK: ccmp +; CHECK: ccmp +; CHECK: b. +define void @multi_different(i32 %a, i32 %b, i32 %c) nounwind ssp { +entry: + %cmp = icmp sgt i32 %a, %b + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: + %div = sdiv i32 %b, %a + %cmp1 = icmp eq i32 %div, 5 + %cmp4 = icmp sgt i32 %div, %c + %or.cond = and i1 %cmp1, %cmp4 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret void +} + +; Convert a cbz in the head block. +; CHECK: cbz_head +; CHECK: cmp w0, #0 +; CHECK: ccmp +define i32 @cbz_head(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp ne i32 %b, 17 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Check that the immediate operand is in range. 
The ccmp instruction encodes a +; smaller range of immediates than subs/adds. +; The ccmp immediates must be in the range 0-31. +; CHECK: immediate_range +; CHECK-NOT: ccmp +define i32 @immediate_range(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + %cmp1 = icmp eq i32 %b, 32 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Convert a cbz in the second block. +; CHECK: cbz_second +; CHECK: cmp w0, #0 +; CHECK: ccmp w1, #0, #0, ne +; CHECK: b.eq +define i32 @cbz_second(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp ne i32 %b, 0 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Convert a cbnz in the second block. +; CHECK: cbnz_second +; CHECK: cmp w0, #0 +; CHECK: ccmp w1, #0, #4, ne +; CHECK: b.ne +define i32 @cbnz_second(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp eq i32 %b, 0 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} +declare i32 @foo() + +%str1 = type { %str2 } +%str2 = type { [24 x i8], i8*, i32, %str1*, i32, [4 x i8], %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, i8*, i8, i8*, %str1*, i8* } + +; Test case distilled from 126.gcc. +; The phi in sw.bb.i.i gets multiple operands for the %entry predecessor. +; CHECK: build_modify_expr +define void @build_modify_expr() nounwind ssp { +entry: + switch i32 undef, label %sw.bb.i.i [ + i32 69, label %if.end85 + i32 70, label %if.end85 + i32 71, label %if.end85 + i32 72, label %if.end85 + i32 73, label %if.end85 + i32 105, label %if.end85 + i32 106, label %if.end85 + ] + +if.end85: + ret void + +sw.bb.i.i: + %ref.tr.i.i = phi %str1* [ %0, %sw.bb.i.i ], [ undef, %entry ] + %operands.i.i = getelementptr inbounds %str1* %ref.tr.i.i, i64 0, i32 0, i32 2 + %arrayidx.i.i = bitcast i32* %operands.i.i to %str1** + %0 = load %str1** %arrayidx.i.i, align 8 + %code1.i.i.phi.trans.insert = getelementptr inbounds %str1* %0, i64 0, i32 0, i32 0, i64 16 + br label %sw.bb.i.i +} diff --git a/test/CodeGen/AArch64/arm64-clrsb.ll b/test/CodeGen/AArch64/arm64-clrsb.ll new file mode 100644 index 0000000..042e52e --- /dev/null +++ b/test/CodeGen/AArch64/arm64-clrsb.ll @@ -0,0 +1,36 @@ +; RUN: llc < %s -march=arm64 | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +; Function Attrs: nounwind readnone +declare i32 @llvm.ctlz.i32(i32, i1) #0 +declare i64 @llvm.ctlz.i64(i64, i1) #1 + +; Function Attrs: nounwind ssp +define i32 @clrsb32(i32 %x) #2 { +entry: + %shr = ashr i32 %x, 31 + %xor = xor i32 %shr, %x + %mul = shl i32 %xor, 1 + %add = or i32 %mul, 1 + %0 = tail call i32 @llvm.ctlz.i32(i32 %add, i1 false) + + ret i32 %0 +; CHECK-LABEL: clrsb32 +; CHECK: cls [[TEMP:w[0-9]+]], [[TEMP]] +} + +; Function Attrs: nounwind ssp +define i64 @clrsb64(i64 %x) #3 { +entry: + %shr = ashr i64 %x, 63 + %xor = xor i64 %shr, %x + %mul = shl nsw i64 %xor, 1 + %add = or i64 %mul, 1 + %0 = tail call i64 @llvm.ctlz.i64(i64 %add, i1 false) + + ret i64 %0 +; CHECK-LABEL: clrsb64 +; CHECK: cls [[TEMP:x[0-9]+]], [[TEMP]] +} diff --git a/test/CodeGen/AArch64/arm64-coalesce-ext.ll b/test/CodeGen/AArch64/arm64-coalesce-ext.ll 
new file mode 100644 index 0000000..9420bf3 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-coalesce-ext.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=arm64 -mtriple=arm64-apple-darwin < %s | FileCheck %s +; Check that the peephole optimizer knows about sext and zext instructions. +; CHECK: test1sext +define i32 @test1sext(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind { + %C = add i64 %A, %B + ; CHECK: add x[[SUM:[0-9]+]], x0, x1 + %D = trunc i64 %C to i32 + %E = shl i64 %C, 32 + %F = ashr i64 %E, 32 + ; CHECK: sxtw x[[EXT:[0-9]+]], w[[SUM]] + store volatile i64 %F, i64 *%P2 + ; CHECK: str x[[EXT]] + store volatile i32 %D, i32* %P + ; Reuse low bits of extended register, don't extend live range of SUM. + ; CHECK: str w[[SUM]] + ret i32 %D +} diff --git a/test/CodeGen/AArch64/arm64-code-model-large-abs.ll b/test/CodeGen/AArch64/arm64-code-model-large-abs.ll new file mode 100644 index 0000000..264da2d --- /dev/null +++ b/test/CodeGen/AArch64/arm64-code-model-large-abs.ll @@ -0,0 +1,72 @@ +; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large < %s | FileCheck %s + +@var8 = global i8 0 +@var16 = global i16 0 +@var32 = global i32 0 +@var64 = global i64 0 + +define i8* @global_addr() { +; CHECK-LABEL: global_addr: + ret i8* @var8 + ; The movz/movk calculation should end up returned directly in x0. +; CHECK: movz x0, #:abs_g3:var8 +; CHECK: movk x0, #:abs_g2_nc:var8 +; CHECK: movk x0, #:abs_g1_nc:var8 +; CHECK: movk x0, #:abs_g0_nc:var8 +; CHECK-NEXT: ret +} + +define i8 @global_i8() { +; CHECK-LABEL: global_i8: + %val = load i8* @var8 + ret i8 %val +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var8 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var8 +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var8 +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var8 +; CHECK: ldrb w0, [x[[ADDR_REG]]] +} + +define i16 @global_i16() { +; CHECK-LABEL: global_i16: + %val = load i16* @var16 + ret i16 %val +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var16 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var16 +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var16 +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var16 +; CHECK: ldrh w0, [x[[ADDR_REG]]] +} + +define i32 @global_i32() { +; CHECK-LABEL: global_i32: + %val = load i32* @var32 + ret i32 %val +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var32 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var32 +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var32 +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var32 +; CHECK: ldr w0, [x[[ADDR_REG]]] +} + +define i64 @global_i64() { +; CHECK-LABEL: global_i64: + %val = load i64* @var64 + ret i64 %val +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var64 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var64 +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var64 +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var64 +; CHECK: ldr x0, [x[[ADDR_REG]]] +} + +define <2 x i64> @constpool() { +; CHECK-LABEL: constpool: + ret <2 x i64> + +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:[[CPADDR:.LCPI[0-9]+_[0-9]+]] +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:[[CPADDR]] +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:[[CPADDR]] +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:[[CPADDR]] +; CHECK: ldr q0, [x[[ADDR_REG]]] +} diff --git a/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll b/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll new file mode 100644 index 0000000..81cee38 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll @@ -0,0 +1,37 @@ +; RUN: llc -mtriple=arm64-apple-ios -O3 -aarch64-collect-loh -aarch64-collect-loh-bb-only=true -aarch64-collect-loh-pre-collect-register=false < 
%s -o - | FileCheck %s
+; Check that the LOH analysis does not crash when the analysed chain
+; contains instructions that are filtered out.
+;
+; Before the fix for , these cases were removed
+; from the main container. Now, the deterministic container does not allow
+; removal of arbitrary values, so we have to live with garbage values.
+;
+
+%"class.H4ISP::H4ISPDevice" = type { i32 (%"class.H4ISP::H4ISPDevice"*, i32, i8*, i8*)*, i8*, i32*, %"class.H4ISP::H4ISPCameraManager"* }
+
+%"class.H4ISP::H4ISPCameraManager" = type opaque
+
+declare i32 @_ZN5H4ISP11H4ISPDevice32ISP_SelectBestMIPIFrequencyIndexEjPj(%"class.H4ISP::H4ISPDevice"*)
+
+@pH4ISPDevice = hidden global %"class.H4ISP::H4ISPDevice"* null, align 8
+
+; CHECK-LABEL: _foo:
+; CHECK: ret
+; CHECK-NOT: .loh AdrpLdrGotLdr
+define void @foo() {
+entry:
+  br label %if.then83
+if.then83:                                        ; preds = %if.end81
+  %tmp = load %"class.H4ISP::H4ISPDevice"** @pH4ISPDevice, align 8
+  %call84 = call i32 @_ZN5H4ISP11H4ISPDevice32ISP_SelectBestMIPIFrequencyIndexEjPj(%"class.H4ISP::H4ISPDevice"* %tmp) #19
+  tail call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"()
+  %tmp2 = load %"class.H4ISP::H4ISPDevice"** @pH4ISPDevice, align 8
+  tail call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x28}"()
+  %pCameraManager.i268 = getelementptr inbounds %"class.H4ISP::H4ISPDevice"* %tmp2, i64 0, i32 3
+  %tmp3 = load %"class.H4ISP::H4ISPCameraManager"** %pCameraManager.i268, align 8
+  %tobool.i269 = icmp eq %"class.H4ISP::H4ISPCameraManager"* %tmp3, null
+  br i1 %tobool.i269, label %if.then83, label %end
+end:
+  ret void
+}
+
diff --git a/test/CodeGen/AArch64/arm64-collect-loh-str.ll b/test/CodeGen/AArch64/arm64-collect-loh-str.ll
new file mode 100644
index 0000000..d7bc00e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-collect-loh-str.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+; Test case for .
+; AdrpAddStr cannot be used when the store uses the same
+; register for both the address and the value. Indeed, the related
+; optimization, if applied, may completely remove the definition or
+; at least provide a wrong one (with the offset folded
+; into the definition).
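+;
+; For illustration (hypothetical registers, not taken from the test):
+;   adrp x8, _sym@PAGE
+;   add  x8, x8, _sym@PAGEOFF
+;   str  x8, [x8]
+; folding the add into `str x8, [x8, _sym@PAGEOFF]` would delete the very
+; instruction that produces the stored value, or leave a stored value
+; without the page offset applied.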
+
+%struct.anon = type { i32*, i32** }
+
+@pptp_wan_head = internal global %struct.anon zeroinitializer, align 8
+
+; CHECK-LABEL: _pptp_wan_init
+; CHECK: ret
+; CHECK-NOT: AdrpAddStr
+define i32 @pptp_wan_init() {
+entry:
+  store i32* null, i32** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 0), align 8
+  store i32** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 0), i32*** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 1), align 8
+  ret i32 0
+}
+
+
diff --git a/test/CodeGen/AArch64/arm64-collect-loh.ll b/test/CodeGen/AArch64/arm64-collect-loh.ll
new file mode 100644
index 0000000..6d73daa
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s --check-prefix=CHECK-ELF
+
+; CHECK-ELF-NOT: .loh
+; CHECK-ELF-NOT: AdrpAdrp
+; CHECK-ELF-NOT: AdrpAdd
+; CHECK-ELF-NOT: AdrpLdrGot
+
+@a = internal unnamed_addr global i32 0, align 4
+@b = external global i32
+
+; Function Attrs: noinline nounwind ssp
+define void @foo(i32 %t) {
+entry:
+  %tmp = load i32* @a, align 4
+  %add = add nsw i32 %tmp, %t
+  store i32 %add, i32* @a, align 4
+  ret void
+}
+
+; Function Attrs: nounwind ssp
+; Testcase for . AdrpAdrp reuse is valid only when the first adrp
+; dominates the second.
+; The first adrp comes from the loading of 'a' and the second from the
+; loading of 'b'.
+; 'a' is loaded in if.then, 'b' in if.end4; if.then does not dominate if.end4.
+; CHECK-LABEL: _test
+; CHECK: ret
+; CHECK-NOT: .loh AdrpAdrp
+define i32 @test(i32 %t) {
+entry:
+  %cmp = icmp sgt i32 %t, 5
+  br i1 %cmp, label %if.then, label %if.end4
+
+if.then:                                          ; preds = %entry
+  %tmp = load i32* @a, align 4
+  %add = add nsw i32 %tmp, %t
+  %cmp1 = icmp sgt i32 %add, 12
+  br i1 %cmp1, label %if.then2, label %if.end4
+
+if.then2:                                         ; preds = %if.then
+  tail call void @foo(i32 %add)
+  %tmp1 = load i32* @a, align 4
+  br label %if.end4
+
+if.end4:                                          ; preds = %if.then2, %if.then, %entry
+  %t.addr.0 = phi i32 [ %tmp1, %if.then2 ], [ %t, %if.then ], [ %t, %entry ]
+  %tmp2 = load i32* @b, align 4
+  %add5 = add nsw i32 %tmp2, %t.addr.0
+  tail call void @foo(i32 %add5)
+  %tmp3 = load i32* @b, align 4
+  %add6 = add nsw i32 %tmp3, %t.addr.0
+  ret i32 %add6
+}
diff --git a/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll b/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll
new file mode 100644
index 0000000..f65b116
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=-neon < %s
+
+; The DAG combiner previously decided to use a vector load/store for this
+; struct copy. This probably shouldn't happen without NEON, but the most
+; important thing is that it compiles.
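+; (Assumed rationale: with -neon the copy below cannot use a single
+; q-register load/store, so it should lower to scalar accesses, e.g. a pair
+; of d-register ldr/str operations, which remain legal without NEON.)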
+
+define void @store_combine() nounwind {
+  %src = alloca { double, double }, align 8
+  %dst = alloca { double, double }, align 8
+
+  %src.realp = getelementptr inbounds { double, double }* %src, i32 0, i32 0
+  %src.real = load double* %src.realp
+  %src.imagp = getelementptr inbounds { double, double }* %src, i32 0, i32 1
+  %src.imag = load double* %src.imagp
+
+  %dst.realp = getelementptr inbounds { double, double }* %dst, i32 0, i32 0
+  %dst.imagp = getelementptr inbounds { double, double }* %dst, i32 0, i32 1
+  store double %src.real, double* %dst.realp
+  store double %src.imag, double* %dst.imagp
+  ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-complex-ret.ll b/test/CodeGen/AArch64/arm64-complex-ret.ll
new file mode 100644
index 0000000..93d50a5
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-complex-ret.ll
@@ -0,0 +1,7 @@
+; RUN: llc -march=arm64 -o - %s | FileCheck %s
+
+define { i192, i192, i21, i192 } @foo(i192) {
+; CHECK-LABEL: foo:
+; CHECK: stp xzr, xzr, [x8]
+  ret { i192, i192, i21, i192 } {i192 0, i192 1, i21 2, i192 3}
+}
diff --git a/test/CodeGen/AArch64/arm64-const-addr.ll b/test/CodeGen/AArch64/arm64-const-addr.ll
new file mode 100644
index 0000000..c55a922
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-const-addr.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple=arm64-darwin-unknown < %s | FileCheck %s
+
+%T = type { i32, i32, i32, i32 }
+
+; Test that the constant base address is materialized only once.
+define i32 @test1() nounwind {
+; CHECK-LABEL: test1
+; CHECK: movz w8, #0x40f, lsl #16
+; CHECK-NEXT: movk w8, #0xc000
+; CHECK-NEXT: ldp w9, w10, [x8, #4]
+; CHECK: ldr w8, [x8, #12]
+  %at = inttoptr i64 68141056 to %T*
+  %o1 = getelementptr %T* %at, i32 0, i32 1
+  %t1 = load i32* %o1
+  %o2 = getelementptr %T* %at, i32 0, i32 2
+  %t2 = load i32* %o2
+  %a1 = add i32 %t1, %t2
+  %o3 = getelementptr %T* %at, i32 0, i32 3
+  %t3 = load i32* %o3
+  %a2 = add i32 %a1, %t3
+  ret i32 %a2
+}
+
diff --git a/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll b/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll
new file mode 100644
index 0000000..d862b1e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+; CHECK: fptosi_1
+; CHECK: fcvtzs.2d
+; CHECK: xtn.2s
+; CHECK: ret
+define void @fptosi_1() nounwind noinline ssp {
+entry:
+  %0 = fptosi <2 x double> undef to <2 x i32>
+  store <2 x i32> %0, <2 x i32>* undef, align 8
+  ret void
+}
+
+; CHECK: fptoui_1
+; CHECK: fcvtzu.2d
+; CHECK: xtn.2s
+; CHECK: ret
+define void @fptoui_1() nounwind noinline ssp {
+entry:
+  %0 = fptoui <2 x double> undef to <2 x i32>
+  store <2 x i32> %0, <2 x i32>* undef, align 8
+  ret void
+}
+
diff --git a/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll b/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll
new file mode 100644
index 0000000..daaf1e0
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <2 x double> @f1(<2 x i32> %v) nounwind readnone {
+; CHECK-LABEL: f1:
+; CHECK: sshll.2d v0, v0, #0
+; CHECK-NEXT: scvtf.2d v0, v0
+; CHECK-NEXT: ret
+  %conv = sitofp <2 x i32> %v to <2 x double>
+  ret <2 x double> %conv
+}
+define <2 x double> @f2(<2 x i32> %v) nounwind readnone {
+; CHECK-LABEL: f2:
+; CHECK: ushll.2d v0, v0, #0
+; CHECK-NEXT: ucvtf.2d v0, v0
+; CHECK-NEXT: ret
+  %conv = uitofp <2 x i32> %v to <2 x double>
+  ret <2 x double> %conv
+}
+
+; CHECK: autogen_SD19655
+; CHECK:
scvtf +; CHECK: ret +define void @autogen_SD19655() { + %T = load <2 x i64>* undef + %F = sitofp <2 x i64> undef to <2 x float> + store <2 x float> %F, <2 x float>* undef + ret void +} + diff --git a/test/CodeGen/AArch64/arm64-copy-tuple.ll b/test/CodeGen/AArch64/arm64-copy-tuple.ll new file mode 100644 index 0000000..1803787 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-copy-tuple.ll @@ -0,0 +1,146 @@ +; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s + +; The main purpose of this test is to find out whether copyPhysReg can deal with +; the memmove-like situation arising in tuples, where an early copy can clobber +; the value needed by a later one if the tuples overlap. + +; We use dummy inline asm to force LLVM to generate a COPY between the registers +; we want by clobbering all the others. + +define void @test_D1D2_from_D0D1(i8* %addr) #0 { +; CHECK-LABEL: test_D1D2_from_D0D1: +; CHECK: mov.8b v2, v1 +; CHECK: mov.8b v1, v0 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D0D1_from_D1D2(i8* %addr) #0 { +; CHECK-LABEL: test_D0D1_from_D1D2: +; CHECK: mov.8b v0, v1 +; CHECK: mov.8b v1, v2 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D0D1_from_D31D0(i8* %addr) #0 { +; CHECK-LABEL: test_D0D1_from_D31D0: +; CHECK: mov.8b v1, v0 +; CHECK: mov.8b v0, v31 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", 
"~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D31D0_from_D0D1(i8* %addr) #0 { +; CHECK-LABEL: test_D31D0_from_D0D1: +; CHECK: mov.8b v31, v0 +; CHECK: mov.8b v0, v1 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D2D3D4_from_D0D1D2(i8* %addr) #0 { +; CHECK-LABEL: test_D2D3D4_from_D0D1D2: +; CHECK: mov.8b v4, v2 +; CHECK: mov.8b v3, v1 +; CHECK: mov.8b v2, v0 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 1 + %vec2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 2 + + tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr) + + tail call void asm sideeffect "", "~{v0},~{v1},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr) + ret void +} + +define void @test_Q0Q1Q2_from_Q1Q2Q3(i8* %addr) #0 { +; CHECK-LABEL: test_Q0Q1Q2_from_Q1Q2Q3: +; CHECK: mov.16b v0, v1 +; CHECK: mov.16b v1, v2 +; CHECK: mov.16b v2, v3 +entry: + %addr_v16i8 = bitcast i8* %addr to <16 x i8>* + %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* %addr_v16i8) + %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0 + %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1 + %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2 + tail call void asm sideeffect "", 
"~{v0},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr) + + tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr) + ret void +} + +define void @test_Q1Q2Q3Q4_from_Q30Q31Q0Q1(i8* %addr) #0 { +; CHECK-LABEL: test_Q1Q2Q3Q4_from_Q30Q31Q0Q1: +; CHECK: mov.16b v4, v1 +; CHECK: mov.16b v3, v0 +; CHECK: mov.16b v2, v31 +; CHECK: mov.16b v1, v30 + %addr_v16i8 = bitcast i8* %addr to <16 x i8>* + %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* %addr_v16i8) + %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0 + %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1 + %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2 + %vec3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 3 + + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}"() + tail call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr) + + tail call void asm sideeffect "", "~{v0},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr) + ret void +} + +declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>*) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>*) +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>*) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>*) + +declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) +declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) +declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) +declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) diff --git a/test/CodeGen/AArch64/arm64-crc32.ll b/test/CodeGen/AArch64/arm64-crc32.ll new file mode 100644 index 0000000..d3099e6 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-crc32.ll @@ -0,0 +1,71 @@ +; RUN: llc -march=arm64 -mattr=+crc -o - %s | FileCheck %s + +define i32 @test_crc32b(i32 %cur, i8 %next) { +; CHECK-LABEL: test_crc32b: +; CHECK: crc32b w0, w0, w1 + %bits = zext i8 %next to i32 + %val = call i32 @llvm.aarch64.crc32b(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32h(i32 %cur, i16 %next) { +; CHECK-LABEL: test_crc32h: +; CHECK: crc32h w0, w0, w1 + %bits = zext i16 %next to i32 + %val = call i32 @llvm.aarch64.crc32h(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32w(i32 %cur, i32 
%next) { +; CHECK-LABEL: test_crc32w: +; CHECK: crc32w w0, w0, w1 + %val = call i32 @llvm.aarch64.crc32w(i32 %cur, i32 %next) + ret i32 %val +} + +define i32 @test_crc32x(i32 %cur, i64 %next) { +; CHECK-LABEL: test_crc32x: +; CHECK: crc32x w0, w0, x1 + %val = call i32 @llvm.aarch64.crc32x(i32 %cur, i64 %next) + ret i32 %val +} + +define i32 @test_crc32cb(i32 %cur, i8 %next) { +; CHECK-LABEL: test_crc32cb: +; CHECK: crc32cb w0, w0, w1 + %bits = zext i8 %next to i32 + %val = call i32 @llvm.aarch64.crc32cb(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32ch(i32 %cur, i16 %next) { +; CHECK-LABEL: test_crc32ch: +; CHECK: crc32ch w0, w0, w1 + %bits = zext i16 %next to i32 + %val = call i32 @llvm.aarch64.crc32ch(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32cw(i32 %cur, i32 %next) { +; CHECK-LABEL: test_crc32cw: +; CHECK: crc32cw w0, w0, w1 + %val = call i32 @llvm.aarch64.crc32cw(i32 %cur, i32 %next) + ret i32 %val +} + +define i32 @test_crc32cx(i32 %cur, i64 %next) { +; CHECK-LABEL: test_crc32cx: +; CHECK: crc32cx w0, w0, x1 + %val = call i32 @llvm.aarch64.crc32cx(i32 %cur, i64 %next) + ret i32 %val +} + +declare i32 @llvm.aarch64.crc32b(i32, i32) +declare i32 @llvm.aarch64.crc32h(i32, i32) +declare i32 @llvm.aarch64.crc32w(i32, i32) +declare i32 @llvm.aarch64.crc32x(i32, i64) + +declare i32 @llvm.aarch64.crc32cb(i32, i32) +declare i32 @llvm.aarch64.crc32ch(i32, i32) +declare i32 @llvm.aarch64.crc32cw(i32, i32) +declare i32 @llvm.aarch64.crc32cx(i32, i64) diff --git a/test/CodeGen/AArch64/arm64-crypto.ll b/test/CodeGen/AArch64/arm64-crypto.ll new file mode 100644 index 0000000..2908b33 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-crypto.ll @@ -0,0 +1,135 @@ +; RUN: llc -march=arm64 -mattr=crypto -aarch64-neon-syntax=apple -o - %s | FileCheck %s + +declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key) +declare <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key) +declare <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data) +declare <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data) + +define <16 x i8> @test_aese(<16 x i8> %data, <16 x i8> %key) { +; CHECK-LABEL: test_aese: +; CHECK: aese.16b v0, v1 + %res = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key) + ret <16 x i8> %res +} + +define <16 x i8> @test_aesd(<16 x i8> %data, <16 x i8> %key) { +; CHECK-LABEL: test_aesd: +; CHECK: aesd.16b v0, v1 + %res = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key) + ret <16 x i8> %res +} + +define <16 x i8> @test_aesmc(<16 x i8> %data) { +; CHECK-LABEL: test_aesmc: +; CHECK: aesmc.16b v0, v0 + %res = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data) + ret <16 x i8> %res +} + +define <16 x i8> @test_aesimc(<16 x i8> %data) { +; CHECK-LABEL: test_aesimc: +; CHECK: aesimc.16b v0, v0 + %res = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data) + ret <16 x i8> %res +} + +declare <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare i32 @llvm.aarch64.crypto.sha1h(i32 %hash_e) +declare <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) +declare <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) + +define <4 x i32> @test_sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) 
{ +; CHECK-LABEL: test_sha1c: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1c.4s q0, [[HASH_E]], v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + ret <4 x i32> %res +} + +; Incomplete removal of unnecessary FMOV instructions in intrinsic SHA1 +define <4 x i32> @test_sha1c_in_a_row(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1c_in_a_row: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1c.4s q[[SHA1RES:[0-9]+]], [[HASH_E]], v1 +; CHECK-NOT: fmov +; CHECK: sha1c.4s q0, s[[SHA1RES]], v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + %extract = extractelement <4 x i32> %res, i32 0 + %res2 = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %extract, <4 x i32> %wk) + ret <4 x i32> %res2 +} + +define <4 x i32> @test_sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1p: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1p.4s q0, [[HASH_E]], v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1m: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1m.4s q0, [[HASH_E]], v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + ret <4 x i32> %res +} + +define i32 @test_sha1h(i32 %hash_e) { +; CHECK-LABEL: test_sha1h: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1h [[RES:s[0-9]+]], [[HASH_E]] +; CHECK: fmov w0, [[RES]] + %res = call i32 @llvm.aarch64.crypto.sha1h(i32 %hash_e) + ret i32 %res +} + +define <4 x i32> @test_sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) { +; CHECK-LABEL: test_sha1su0: +; CHECK: sha1su0.4s v0, v1, v2 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) { +; CHECK-LABEL: test_sha1su1: +; CHECK: sha1su1.4s v0, v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) +declare <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) + +define <4 x i32> @test_sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) { +; CHECK-LABEL: test_sha256h: +; CHECK: sha256h.4s q0, q1, v2 + %res = call <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) { +; CHECK-LABEL: test_sha256h2: +; CHECK: sha256h2.4s q0, q1, v2 + + %res = call <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) { +; CHECK-LABEL: test_sha256su0: +; CHECK: sha256su0.4s v0, v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) + ret <4 x i32> %res +} + +define <4 x i32> 
@test_sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) { +; CHECK-LABEL: test_sha256su1: +; CHECK: sha256su1.4s v0, v1, v2 + %res = call <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) + ret <4 x i32> %res +} diff --git a/test/CodeGen/AArch64/arm64-cse.ll b/test/CodeGen/AArch64/arm64-cse.ll new file mode 100644 index 0000000..bb14c89 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-cse.ll @@ -0,0 +1,59 @@ +; RUN: llc -O3 < %s | FileCheck %s +target triple = "arm64-apple-ios" + +; rdar://12462006 +; CSE between "icmp reg reg" and "sub reg reg". +; Both can be in the same basic block or in different basic blocks. +define i8* @t1(i8* %base, i32* nocapture %offset, i32 %size) nounwind { +entry: +; CHECK-LABEL: t1: +; CHECK: subs +; CHECK-NOT: cmp +; CHECK-NOT: sub +; CHECK: b.ge +; CHECK: sub +; CHECK: sub +; CHECK-NOT: sub +; CHECK: ret + %0 = load i32* %offset, align 4 + %cmp = icmp slt i32 %0, %size + %s = sub nsw i32 %0, %size + br i1 %cmp, label %return, label %if.end + +if.end: + %sub = sub nsw i32 %0, %size + %s2 = sub nsw i32 %s, %size + %s3 = sub nsw i32 %sub, %s2 + store i32 %s3, i32* %offset, align 4 + %add.ptr = getelementptr inbounds i8* %base, i32 %sub + br label %return + +return: + %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ] + ret i8* %retval.0 +} + +; CSE between "icmp reg imm" and "sub reg imm". +define i8* @t2(i8* %base, i32* nocapture %offset) nounwind { +entry: +; CHECK-LABEL: t2: +; CHECK: subs +; CHECK-NOT: cmp +; CHECK-NOT: sub +; CHECK: b.lt +; CHECK-NOT: sub +; CHECK: ret + %0 = load i32* %offset, align 4 + %cmp = icmp slt i32 %0, 1 + br i1 %cmp, label %return, label %if.end + +if.end: + %sub = sub nsw i32 %0, 1 + store i32 %sub, i32* %offset, align 4 + %add.ptr = getelementptr inbounds i8* %base, i32 %sub + br label %return + +return: + %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ] + ret i8* %retval.0 +} diff --git a/test/CodeGen/AArch64/arm64-csel.ll b/test/CodeGen/AArch64/arm64-csel.ll new file mode 100644 index 0000000..98eba30 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-csel.ll @@ -0,0 +1,230 @@ +; RUN: llc -O3 < %s | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64" +target triple = "arm64-unknown-unknown" + +; CHECK-LABEL: foo1 +; CHECK: cinc w{{[0-9]+}}, w{{[0-9]+}}, ne +define i32 @foo1(i32 %b, i32 %c) nounwind readnone ssp { +entry: + %not.tobool = icmp ne i32 %c, 0 + %add = zext i1 %not.tobool to i32 + %b.add = add i32 %c, %b + %add1 = add i32 %b.add, %add + ret i32 %add1 +} + +; CHECK-LABEL: foo2 +; CHECK: cneg w{{[0-9]+}}, w{{[0-9]+}}, ne +define i32 @foo2(i32 %b, i32 %c) nounwind readnone ssp { +entry: + %mul = sub i32 0, %b + %tobool = icmp eq i32 %c, 0 + %b.mul = select i1 %tobool, i32 %b, i32 %mul + %add = add nsw i32 %b.mul, %c + ret i32 %add +} + +; CHECK-LABEL: foo3 +; CHECK: cinv w{{[0-9]+}}, w{{[0-9]+}}, ne +define i32 @foo3(i32 %b, i32 %c) nounwind readnone ssp { +entry: + %not.tobool = icmp ne i32 %c, 0 + %xor = sext i1 %not.tobool to i32 + %b.xor = xor i32 %xor, %b + %add = add nsw i32 %b.xor, %c + ret i32 %add +} + +; rdar://11632325 +define i32@foo4(i32 %a) nounwind ssp { +; CHECK-LABEL: foo4 +; CHECK: cneg +; CHECK-NEXT: ret + %cmp = icmp sgt i32 %a, -1 + %neg = sub nsw i32 0, %a + %cond = select i1 %cmp, i32 %a, i32 %neg + ret i32 %cond +} + +define i32@foo5(i32 %a, i32 %b) nounwind ssp { +entry: +; CHECK-LABEL: foo5 +; CHECK: subs +; CHECK-NEXT: 
cneg
+; CHECK-NEXT: ret
+  %sub = sub nsw i32 %a, %b
+  %cmp = icmp sgt i32 %sub, -1
+  %sub3 = sub nsw i32 0, %sub
+  %cond = select i1 %cmp, i32 %sub, i32 %sub3
+  ret i32 %cond
+}
+
+; Make sure we can handle a branch instruction in optimizeCompare.
+define i32 @foo6(i32 %a, i32 %b) nounwind ssp {
+; CHECK-LABEL: foo6
+; CHECK: b
+  %sub = sub nsw i32 %a, %b
+  %cmp = icmp sgt i32 %sub, 0
+  br i1 %cmp, label %l.if, label %l.else
+
+l.if:
+  ret i32 1
+
+l.else:
+  ret i32 %sub
+}
+
+; If NZCV is used multiple times and the V flag is used, we don't remove the cmp.
+define i32 @foo7(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: foo7:
+; CHECK: sub
+; CHECK-NEXT: adds
+; CHECK-NEXT: csneg
+; CHECK-NEXT: b
+  %sub = sub nsw i32 %a, %b
+  %cmp = icmp sgt i32 %sub, -1
+  %sub3 = sub nsw i32 0, %sub
+  %cond = select i1 %cmp, i32 %sub, i32 %sub3
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %cmp2 = icmp slt i32 %sub, -1
+  %sel = select i1 %cmp2, i32 %cond, i32 %a
+  ret i32 %sel
+
+if.else:
+  ret i32 %cond
+}
+
+define i32 @foo8(i32 %v, i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: foo8:
+; CHECK: cmp w0, #0
+; CHECK: csinv w0, w1, w2, ne
+  %tobool = icmp eq i32 %v, 0
+  %neg = xor i32 -1, %b
+  %cond = select i1 %tobool, i32 %neg, i32 %a
+  ret i32 %cond
+}
+
+define i32 @foo9(i32 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo9:
+; CHECK: cmp w0, #0
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK: cinv w0, w[[REG]], eq
+  %tobool = icmp ne i32 %v, 0
+  %cond = select i1 %tobool, i32 4, i32 -5
+  ret i32 %cond
+}
+
+define i64 @foo10(i64 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo10:
+; CHECK: cmp x0, #0
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK: cinv x0, x[[REG]], eq
+  %tobool = icmp ne i64 %v, 0
+  %cond = select i1 %tobool, i64 4, i64 -5
+  ret i64 %cond
+}
+
+define i32 @foo11(i32 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo11:
+; CHECK: cmp w0, #0
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK: cneg w0, w[[REG]], eq
+  %tobool = icmp ne i32 %v, 0
+  %cond = select i1 %tobool, i32 4, i32 -4
+  ret i32 %cond
+}
+
+define i64 @foo12(i64 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo12:
+; CHECK: cmp x0, #0
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK: cneg x0, x[[REG]], eq
+  %tobool = icmp ne i64 %v, 0
+  %cond = select i1 %tobool, i64 4, i64 -4
+  ret i64 %cond
+}
+
+define i32 @foo13(i32 %v, i32 %a, i32 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo13:
+; CHECK: cmp w0, #0
+; CHECK: csneg w0, w1, w2, ne
+  %tobool = icmp eq i32 %v, 0
+  %sub = sub i32 0, %b
+  %cond = select i1 %tobool, i32 %sub, i32 %a
+  ret i32 %cond
+}
+
+define i64 @foo14(i64 %v, i64 %a, i64 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo14:
+; CHECK: cmp x0, #0
+; CHECK: csneg x0, x1, x2, ne
+  %tobool = icmp eq i64 %v, 0
+  %sub = sub i64 0, %b
+  %cond = select i1 %tobool, i64 %sub, i64 %a
+  ret i64 %cond
+}
+
+define i32 @foo15(i32 %a, i32 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo15:
+; CHECK: cmp w0, w1
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1
+; CHECK: cinc w0, w[[REG]], gt
+  %cmp = icmp sgt i32 %a, %b
+  %. = select i1 %cmp, i32 2, i32 1
+  ret i32 %.
+}
+
+define i32 @foo16(i32 %a, i32 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo16:
+; CHECK: cmp w0, w1
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1
+; CHECK: cinc w0, w[[REG]], le
+  %cmp = icmp sgt i32 %a, %b
+  %. = select i1 %cmp, i32 1, i32 2
+  ret i32 %.
+} + +define i64 @foo17(i64 %a, i64 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo17: +; CHECK: cmp x0, x1 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: cinc x0, x[[REG]], gt + %cmp = icmp sgt i64 %a, %b + %. = select i1 %cmp, i64 2, i64 1 + ret i64 %. +} + +define i64 @foo18(i64 %a, i64 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo18: +; CHECK: cmp x0, x1 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: cinc x0, x[[REG]], le + %cmp = icmp sgt i64 %a, %b + %. = select i1 %cmp, i64 1, i64 2 + ret i64 %. +} + +define i64 @foo19(i64 %a, i64 %b, i64 %c) { +entry: +; CHECK-LABEL: foo19: +; CHECK: cinc x0, x2 +; CHECK-NOT: add + %cmp = icmp ult i64 %a, %b + %inc = zext i1 %cmp to i64 + %inc.c = add i64 %inc, %c + ret i64 %inc.c +} diff --git a/test/CodeGen/AArch64/arm64-cvt.ll b/test/CodeGen/AArch64/arm64-cvt.ll new file mode 100644 index 0000000..420a8bc --- /dev/null +++ b/test/CodeGen/AArch64/arm64-cvt.ll @@ -0,0 +1,401 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +; +; Floating-point scalar convert to signed integer (to nearest with ties to away) +; +define i32 @fcvtas_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtas_1w1s: +;CHECK: fcvtas w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtas_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtas_1x1s: +;CHECK: fcvtas x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtas_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtas_1w1d: +;CHECK: fcvtas w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtas_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtas_1x1d: +;CHECK: fcvtas x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtas.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtas.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtas.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtas.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer +; +define i32 @fcvtau_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtau_1w1s: +;CHECK: fcvtau w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtau_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtau_1x1s: +;CHECK: fcvtau x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtau_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtau_1w1d: +;CHECK: fcvtau w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtau_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtau_1x1d: +;CHECK: fcvtau x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtau.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtau.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtau.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtau.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (toward -Inf) +; +define i32 @fcvtms_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtms_1w1s: +;CHECK: fcvtms w0, s0 
+;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtms_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtms_1x1s: +;CHECK: fcvtms x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtms_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtms_1w1d: +;CHECK: fcvtms w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtms_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtms_1x1d: +;CHECK: fcvtms x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtms.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtms.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtms.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtms.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (toward -Inf) +; +define i32 @fcvtmu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtmu_1w1s: +;CHECK: fcvtmu w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtmu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtmu_1x1s: +;CHECK: fcvtmu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtmu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtmu_1w1d: +;CHECK: fcvtmu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtmu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtmu_1x1d: +;CHECK: fcvtmu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (to nearest with ties to even) +; +define i32 @fcvtns_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtns_1w1s: +;CHECK: fcvtns w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtns_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtns_1x1s: +;CHECK: fcvtns x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtns_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtns_1w1d: +;CHECK: fcvtns w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtns_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtns_1x1d: +;CHECK: fcvtns x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtns.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtns.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtns.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtns.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (to nearest with ties to even) +; +define i32 @fcvtnu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtnu_1w1s: +;CHECK: fcvtnu w0, s0 +;CHECK-NEXT: ret + %tmp3 = 
call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtnu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtnu_1x1s: +;CHECK: fcvtnu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtnu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtnu_1w1d: +;CHECK: fcvtnu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtnu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtnu_1x1d: +;CHECK: fcvtnu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (toward +Inf) +; +define i32 @fcvtps_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtps_1w1s: +;CHECK: fcvtps w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtps_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtps_1x1s: +;CHECK: fcvtps x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtps_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtps_1w1d: +;CHECK: fcvtps w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtps_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtps_1x1d: +;CHECK: fcvtps x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtps.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtps.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtps.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtps.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (toward +Inf) +; +define i32 @fcvtpu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtpu_1w1s: +;CHECK: fcvtpu w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtpu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtpu_1x1s: +;CHECK: fcvtpu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtpu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtpu_1w1d: +;CHECK: fcvtpu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtpu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtpu_1x1d: +;CHECK: fcvtpu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (toward zero) +; +define i32 @fcvtzs_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtzs_1w1s: +;CHECK: fcvtzs w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %A) + ret i32 
%tmp3 +} + +define i64 @fcvtzs_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtzs_1x1s: +;CHECK: fcvtzs x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtzs_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtzs_1w1d: +;CHECK: fcvtzs w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtzs_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtzs_1x1d: +;CHECK: fcvtzs x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (toward zero) +; +define i32 @fcvtzu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtzu_1w1s: +;CHECK: fcvtzu w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtzu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtzu_1x1s: +;CHECK: fcvtzu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtzu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtzu_1w1d: +;CHECK: fcvtzu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtzu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtzu_1x1d: +;CHECK: fcvtzu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double) nounwind readnone diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll b/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll new file mode 100644 index 0000000..a45e313 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -o /dev/null +; rdar://10795250 +; DAGCombiner should converge. 
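+; That is, repeated combines of the i128 shift/trunc/and pattern below must
+; reach a fixed point instead of cycling between equivalent forms; simply
+; compiling without hanging is the whole test, hence -o /dev/null.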
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-apple-macosx10.8.0"
+
+define i64 @foo(i128 %Params.coerce, i128 %SelLocs.coerce) {
+entry:
+  %tmp = lshr i128 %Params.coerce, 61
+  %.tr38.i = trunc i128 %tmp to i64
+  %mul.i = and i64 %.tr38.i, 4294967288
+  %tmp1 = lshr i128 %SelLocs.coerce, 62
+  %.tr.i = trunc i128 %tmp1 to i64
+  %mul7.i = and i64 %.tr.i, 4294967292
+  %add.i = add i64 %mul7.i, %mul.i
+  %conv.i.i = and i64 %add.i, 4294967292
+  ret i64 %conv.i.i
+}
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
new file mode 100644
index 0000000..2cf0135
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mcpu=cyclone < %s | FileCheck %s
+
+target datalayout = "e-i64:64-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+%"struct.SU" = type { i32, %"struct.SU"*, i32*, i32, i32, %"struct.BO", i32, [5 x i8] }
+%"struct.BO" = type { %"struct.RE" }
+
+%"struct.RE" = type { i32, i32, i32, i32 }
+
+; This is a read-modify-write of some bitfields combined into an i48. It gets
+; legalized into i32 and i16 accesses. Only a single store of zero to the low
+; i32 part should be live.
+
+; CHECK-LABEL: test:
+; CHECK-NOT: ldr
+; CHECK: str wzr
+; CHECK-NOT: str
+define void @test(%"struct.SU"* nocapture %su) {
+entry:
+  %r1 = getelementptr inbounds %"struct.SU"* %su, i64 1, i32 5
+  %r2 = bitcast %"struct.BO"* %r1 to i48*
+  %r3 = load i48* %r2, align 8
+  %r4 = and i48 %r3, -4294967296
+  %r5 = or i48 0, %r4
+  store i48 %r5, i48* %r2, align 8
+
+  ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
new file mode 100644
index 0000000..2e4b658
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
@@ -0,0 +1,46 @@
+; RUN: llc -O3 < %s | FileCheck %s
+; RUN: llc -O3 -addr-sink-using-gep=1 < %s | FileCheck %s
+; Test case for a DAG combiner bug where we combined an indexed load
+; with an extension (sext, zext, or any) into a regular extended load,
+; i.e., dropping the indexed value.
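+; For illustration only (hypothetical assembly, not checked output): a
+; pre-indexed load such as
+;   ldr w8, [x9, #8]!   ; loads and writes the updated base back to x9
+; must not be rewritten into
+;   ldrsw x8, [x9, #8]  ; extension folded in, base write-back lost
+; because the second form drops the base-register update.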
+; + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios" + +%class.A = type { i64, i64 } +%class.C = type { i64 } + +; CHECK-LABEL: XX: +; CHECK: ldr +define void @XX(%class.A* %K) { +entry: + br i1 undef, label %if.then, label %lor.rhs.i + +lor.rhs.i: ; preds = %entry + %tmp = load i32* undef, align 4 + %y.i.i.i = getelementptr inbounds %class.A* %K, i64 0, i32 1 + %tmp1 = load i64* %y.i.i.i, align 8 + %U.sroa.3.8.extract.trunc.i = trunc i64 %tmp1 to i32 + %div11.i = sdiv i32 %U.sroa.3.8.extract.trunc.i, 17 + %add12.i = add nsw i32 0, %div11.i + %U.sroa.3.12.extract.shift.i = lshr i64 %tmp1, 32 + %U.sroa.3.12.extract.trunc.i = trunc i64 %U.sroa.3.12.extract.shift.i to i32 + %div15.i = sdiv i32 %U.sroa.3.12.extract.trunc.i, 13 + %add16.i = add nsw i32 %add12.i, %div15.i + %rem.i.i = srem i32 %add16.i, %tmp + %idxprom = sext i32 %rem.i.i to i64 + %arrayidx = getelementptr inbounds %class.C** undef, i64 %idxprom + %tobool533 = icmp eq %class.C* undef, null + br i1 %tobool533, label %while.end, label %while.body + +if.then: ; preds = %entry + unreachable + +while.body: ; preds = %lor.rhs.i + unreachable + +while.end: ; preds = %lor.rhs.i + %tmp3 = load %class.C** %arrayidx, align 8 + unreachable +} diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll b/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll new file mode 100644 index 0000000..0679014 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll @@ -0,0 +1,102 @@ +; RUN: llc -mtriple arm64-apple-ios -O3 -o - < %s | FileCheck %s +; + +%class.Complex = type { float, float } +%class.Complex_int = type { i32, i32 } +%class.Complex_long = type { i64, i64 } + +; CHECK-LABEL: @test +; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3 +; CHECK: ldp [[CPLX1_I:s[0-9]+]], [[CPLX1_R:s[0-9]+]], {{\[}}[[BASE]]] +; CHECK: ldp [[CPLX2_I:s[0-9]+]], [[CPLX2_R:s[0-9]+]], {{\[}}[[BASE]], #64] +; CHECK: fadd {{s[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] +; CHECK: fadd {{s[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] +; CHECK: ret +define void @test(%class.Complex* nocapture %out, i64 %out_start) { +entry: + %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start + %0 = bitcast %class.Complex* %arrayidx to i64* + %1 = load i64* %0, align 4 + %t0.sroa.0.0.extract.trunc = trunc i64 %1 to i32 + %2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float + %t0.sroa.2.0.extract.shift = lshr i64 %1, 32 + %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32 + %3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float + %add = add i64 %out_start, 8 + %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add + %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0 + %4 = load float* %i.i, align 4 + %add.i = fadd float %4, %2 + %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0 + %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1 + %5 = load float* %r.i, align 4 + %add5.i = fadd float %5, %3 + %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1 + %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>* + store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4 + ret void +} + +; CHECK-LABEL: @test_int +; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3 +; CHECK: ldp [[CPLX1_I:w[0-9]+]], [[CPLX1_R:w[0-9]+]], {{\[}}[[BASE]]] +; CHECK: ldp [[CPLX2_I:w[0-9]+]], [[CPLX2_R:w[0-9]+]], {{\[}}[[BASE]], #64] +; CHECK: 
add {{w[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] +; CHECK: add {{w[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] +; CHECK: ret +define void @test_int(%class.Complex_int* nocapture %out, i64 %out_start) { +entry: + %arrayidx = getelementptr inbounds %class.Complex_int* %out, i64 %out_start + %0 = bitcast %class.Complex_int* %arrayidx to i64* + %1 = load i64* %0, align 4 + %t0.sroa.0.0.extract.trunc = trunc i64 %1 to i32 + %2 = bitcast i32 %t0.sroa.0.0.extract.trunc to i32 + %t0.sroa.2.0.extract.shift = lshr i64 %1, 32 + %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32 + %3 = bitcast i32 %t0.sroa.2.0.extract.trunc to i32 + %add = add i64 %out_start, 8 + %arrayidx2 = getelementptr inbounds %class.Complex_int* %out, i64 %add + %i.i = getelementptr inbounds %class.Complex_int* %arrayidx2, i64 0, i32 0 + %4 = load i32* %i.i, align 4 + %add.i = add i32 %4, %2 + %retval.sroa.0.0.vec.insert.i = insertelement <2 x i32> undef, i32 %add.i, i32 0 + %r.i = getelementptr inbounds %class.Complex_int* %arrayidx2, i64 0, i32 1 + %5 = load i32* %r.i, align 4 + %add5.i = add i32 %5, %3 + %retval.sroa.0.4.vec.insert.i = insertelement <2 x i32> %retval.sroa.0.0.vec.insert.i, i32 %add5.i, i32 1 + %ref.tmp.sroa.0.0.cast = bitcast %class.Complex_int* %arrayidx to <2 x i32>* + store <2 x i32> %retval.sroa.0.4.vec.insert.i, <2 x i32>* %ref.tmp.sroa.0.0.cast, align 4 + ret void +} + +; CHECK-LABEL: @test_long +; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #4 +; CHECK: ldp [[CPLX1_I:x[0-9]+]], [[CPLX1_R:x[0-9]+]], {{\[}}[[BASE]]] +; CHECK: ldp [[CPLX2_I:x[0-9]+]], [[CPLX2_R:x[0-9]+]], {{\[}}[[BASE]], #128] +; CHECK: add {{x[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] +; CHECK: add {{x[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] +; CHECK: ret +define void @test_long(%class.Complex_long* nocapture %out, i64 %out_start) { +entry: + %arrayidx = getelementptr inbounds %class.Complex_long* %out, i64 %out_start + %0 = bitcast %class.Complex_long* %arrayidx to i128* + %1 = load i128* %0, align 4 + %t0.sroa.0.0.extract.trunc = trunc i128 %1 to i64 + %2 = bitcast i64 %t0.sroa.0.0.extract.trunc to i64 + %t0.sroa.2.0.extract.shift = lshr i128 %1, 64 + %t0.sroa.2.0.extract.trunc = trunc i128 %t0.sroa.2.0.extract.shift to i64 + %3 = bitcast i64 %t0.sroa.2.0.extract.trunc to i64 + %add = add i64 %out_start, 8 + %arrayidx2 = getelementptr inbounds %class.Complex_long* %out, i64 %add + %i.i = getelementptr inbounds %class.Complex_long* %arrayidx2, i32 0, i32 0 + %4 = load i64* %i.i, align 4 + %add.i = add i64 %4, %2 + %retval.sroa.0.0.vec.insert.i = insertelement <2 x i64> undef, i64 %add.i, i32 0 + %r.i = getelementptr inbounds %class.Complex_long* %arrayidx2, i32 0, i32 1 + %5 = load i64* %r.i, align 4 + %add5.i = add i64 %5, %3 + %retval.sroa.0.4.vec.insert.i = insertelement <2 x i64> %retval.sroa.0.0.vec.insert.i, i64 %add5.i, i32 1 + %ref.tmp.sroa.0.0.cast = bitcast %class.Complex_long* %arrayidx to <2 x i64>* + store <2 x i64> %retval.sroa.0.4.vec.insert.i, <2 x i64>* %ref.tmp.sroa.0.0.cast, align 4 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll b/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll new file mode 100644 index 0000000..9bb4b71 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=arm64 < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @test1() #0 { + %tmp1 = alloca i8 + %tmp2 = alloca i32, i32 4096 + %tmp3 = icmp eq i8* %tmp1, 
null
+  %tmp4 = zext i1 %tmp3 to i32
+
+  ret i32 %tmp4
+
+  ; CHECK-LABEL: test1
+  ; CHECK: adds [[TEMP:[a-z0-9]+]], sp, #4, lsl #12
+  ; CHECK: adds [[TEMP]], [[TEMP]], #15
+}
diff --git a/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll b/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll
new file mode 100644
index 0000000..1bbcf50
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple="arm64-apple-ios" < %s | FileCheck %s
+;
+; Check that the dead register definition pass considers implicit defs.
+; When rematerializing through truncates, the coalescer may produce instructions
+; with dead defs, but live implicit-defs of subregs:
+; E.g. %X1<def,dead> = MOVi64imm 2, %W1<imp-def>; %X1:GPR64, %W1:GPR32
+; These instructions are live, and their definitions should not be rewritten.
+;
+;
+
+define void @testcase() {
+; CHECK: testcase:
+; CHECK-NOT: orr xzr, xzr, #0x2
+
+bb1:
+  %tmp1 = tail call float @ceilf(float 2.000000e+00)
+  %tmp2 = fptoui float %tmp1 to i64
+  br i1 undef, label %bb2, label %bb3
+
+bb2:
+  tail call void @foo()
+  br label %bb3
+
+bb3:
+  %tmp3 = trunc i64 %tmp2 to i32
+  tail call void @bar(i32 %tmp3)
+  ret void
+}
+
+declare void @foo()
+declare void @bar(i32)
+declare float @ceilf(float) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-dup.ll b/test/CodeGen/AArch64/arm64-dup.ll
new file mode 100644
index 0000000..0c56b46
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dup.ll
@@ -0,0 +1,323 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+define <8 x i8> @v_dup8(i8 %A) nounwind {
+;CHECK-LABEL: v_dup8:
+;CHECK: dup.8b
+  %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
+  %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
+  %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
+  %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
+  %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
+  %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
+  %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
+  %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
+  ret <8 x i8> %tmp8
+}
+
+define <4 x i16> @v_dup16(i16 %A) nounwind {
+;CHECK-LABEL: v_dup16:
+;CHECK: dup.4h
+  %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
+  %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
+  %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
+  %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
+  ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @v_dup32(i32 %A) nounwind {
+;CHECK-LABEL: v_dup32:
+;CHECK: dup.2s
+  %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
+  %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
+  ret <2 x i32> %tmp2
+}
+
+define <2 x float> @v_dupfloat(float %A) nounwind {
+;CHECK-LABEL: v_dupfloat:
+;CHECK: dup.2s
+  %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
+  %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
+  ret <2 x float> %tmp2
+}
+
+define <16 x i8> @v_dupQ8(i8 %A) nounwind {
+;CHECK-LABEL: v_dupQ8:
+;CHECK: dup.16b
+  %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
+  %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
+  %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
+  %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
+  %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
+  %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
+  %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
+  %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
+  %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
+  %tmp10 =
insertelement <16 x i8> %tmp9, i8 %A, i32 9 + %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10 + %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11 + %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12 + %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13 + %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14 + %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15 + ret <16 x i8> %tmp16 +} + +define <8 x i16> @v_dupQ16(i16 %A) nounwind { +;CHECK-LABEL: v_dupQ16: +;CHECK: dup.8h + %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0 + %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1 + %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2 + %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3 + %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4 + %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5 + %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6 + %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7 + ret <8 x i16> %tmp8 +} + +define <4 x i32> @v_dupQ32(i32 %A) nounwind { +;CHECK-LABEL: v_dupQ32: +;CHECK: dup.4s + %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0 + %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1 + %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2 + %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3 + ret <4 x i32> %tmp4 +} + +define <4 x float> @v_dupQfloat(float %A) nounwind { +;CHECK-LABEL: v_dupQfloat: +;CHECK: dup.4s + %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0 + %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1 + %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2 + %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3 + ret <4 x float> %tmp4 +} + +; Check to make sure it works with shuffles, too. + +define <8 x i8> @v_shuffledup8(i8 %A) nounwind { +;CHECK-LABEL: v_shuffledup8: +;CHECK: dup.8b + %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0 + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer + ret <8 x i8> %tmp2 +} + +define <4 x i16> @v_shuffledup16(i16 %A) nounwind { +;CHECK-LABEL: v_shuffledup16: +;CHECK: dup.4h + %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0 + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer + ret <4 x i16> %tmp2 +} + +define <2 x i32> @v_shuffledup32(i32 %A) nounwind { +;CHECK-LABEL: v_shuffledup32: +;CHECK: dup.2s + %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0 + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer + ret <2 x i32> %tmp2 +} + +define <2 x float> @v_shuffledupfloat(float %A) nounwind { +;CHECK-LABEL: v_shuffledupfloat: +;CHECK: dup.2s + %tmp1 = insertelement <2 x float> undef, float %A, i32 0 + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer + ret <2 x float> %tmp2 +} + +define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind { +;CHECK-LABEL: v_shuffledupQ8: +;CHECK: dup.16b + %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0 + %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer + ret <16 x i8> %tmp2 +} + +define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind { +;CHECK-LABEL: v_shuffledupQ16: +;CHECK: dup.8h + %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0 + %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer + ret <8 x i16> %tmp2 +} + +define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind { +;CHECK-LABEL: v_shuffledupQ32: +;CHECK: dup.4s + %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0 + %tmp2 = shufflevector <4 x i32> 
%tmp1, <4 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %tmp2 +} + +define <4 x float> @v_shuffledupQfloat(float %A) nounwind { +;CHECK-LABEL: v_shuffledupQfloat: +;CHECK: dup.4s + %tmp1 = insertelement <4 x float> undef, float %A, i32 0 + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %tmp2 +} + +define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind { +;CHECK-LABEL: vduplane8: +;CHECK: dup.8b + %tmp1 = load <8 x i8>* %A + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind { +;CHECK-LABEL: vduplane16: +;CHECK: dup.4h + %tmp1 = load <4 x i16>* %A + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind { +;CHECK-LABEL: vduplane32: +;CHECK: dup.2s + %tmp1 = load <2 x i32>* %A + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 > + ret <2 x i32> %tmp2 +} + +define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind { +;CHECK-LABEL: vduplanefloat: +;CHECK: dup.2s + %tmp1 = load <2 x float>* %A + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 > + ret <2 x float> %tmp2 +} + +define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind { +;CHECK-LABEL: vduplaneQ8: +;CHECK: dup.16b + %tmp1 = load <8 x i8>* %A + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind { +;CHECK-LABEL: vduplaneQ16: +;CHECK: dup.8h + %tmp1 = load <4 x i16>* %A + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind { +;CHECK-LABEL: vduplaneQ32: +;CHECK: dup.4s + %tmp1 = load <2 x i32>* %A + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x i32> %tmp2 +} + +define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind { +;CHECK-LABEL: vduplaneQfloat: +;CHECK: dup.4s + %tmp1 = load <2 x float>* %A + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x float> %tmp2 +} + +define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: foo: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> + ret <2 x i64> %0 +} + +define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: bar: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> + ret <2 x i64> %0 +} + +define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: baz: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> + ret <2 x double> %0 +} + +define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: qux: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> + ret <2 x double> %0 +} + +define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone { +; CHECK-LABEL: f: +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ins.s v0[1], 
w1 +; CHECK-NEXT: ret + %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0 + %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1 + ret <2 x i32> %vecinit1 +} + +define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone { +; CHECK-LABEL: g: +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ins.s v0[1], w1 +; CHECK-NEXT: ins.s v0[2], w1 +; CHECK-NEXT: ins.s v0[3], w0 +; CHECK-NEXT: ret + %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a, i32 3 + ret <4 x i32> %vecinit3 +} + +define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone { +; CHECK-LABEL: h: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ins.d v0[1], x1 +; CHECK-NEXT: ret + %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0 + %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1 + ret <2 x i64> %vecinit1 +} + +; We used to spot this as a BUILD_VECTOR implementable by dup, but assume that +; the single value needed was of the same type as the vector. This is false if +; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16> +; BUILD_VECTOR will have an i32 as its source). In that case, the operation is +; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed. +; +; *However*, it is a dup vD.4h, vN.h[2*idx]. +define <4 x i16> @test_build_illegal(<4 x i32> %in) { +; CHECK-LABEL: test_build_illegal: +; CHECK: dup.4h v0, v0[6] + %val = extractelement <4 x i32> %in, i32 3 + %smallval = trunc i32 %val to i16 + %vec = insertelement <4x i16> undef, i16 %smallval, i32 3 + + ret <4 x i16> %vec +} + +; We used to inherit an already extract_subvectored v4i16 from +; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing +; the formation of an indexed-by-7 MLS. +define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +; CHECK-LABEL: test_high_splat: +; CHECK: mls.4h v0, v1, v2[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %mul = mul <4 x i16> %shuffle, %b + %sub = sub <4 x i16> %a, %mul + ret <4 x i16> %sub +} diff --git a/test/CodeGen/AArch64/arm64-early-ifcvt.ll b/test/CodeGen/AArch64/arm64-early-ifcvt.ll new file mode 100644 index 0000000..17d783a --- /dev/null +++ b/test/CodeGen/AArch64/arm64-early-ifcvt.ll @@ -0,0 +1,423 @@ +; RUN: llc < %s -stress-early-ifcvt | FileCheck %s +target triple = "arm64-apple-macosx" + +; CHECK: mm2 +define i32 @mm2(i32* nocapture %p, i32 %n) nounwind uwtable readonly ssp { +entry: + br label %do.body + +; CHECK: do.body +; Loop body has no branches before the backedge. 
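The test_build_illegal comment above hides the arithmetic that matters: when a register is reinterpreted at a narrower element width, lane indices scale by the width ratio. Viewed as <8 x i16>, i32 lane 3 of a <4 x i32> occupies i16 lanes 6 and 7, and on little-endian the truncated bits sit in the even lane, which is why the expected lowering is a lane-6 dup rather than a lane-3 dup. A standalone sketch of the same idiom, with an invented function name, not part of the patch:

; Sketch only: splat the truncated value of i32 lane 3; the expected
; lowering (assuming little-endian lane numbering) is "dup.4h v0, v0[6]".
define <4 x i16> @splat_trunc_lane(<4 x i32> %in) {
  %val = extractelement <4 x i32> %in, i32 3
  %lo = trunc i32 %val to i16
  %head = insertelement <4 x i16> undef, i16 %lo, i32 0
  %splat = shufflevector <4 x i16> %head, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %splat
}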
diff --git a/test/CodeGen/AArch64/arm64-early-ifcvt.ll b/test/CodeGen/AArch64/arm64-early-ifcvt.ll
new file mode 100644
index 0000000..17d783a
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-early-ifcvt.ll
@@ -0,0 +1,423 @@
+; RUN: llc < %s -stress-early-ifcvt | FileCheck %s
+target triple = "arm64-apple-macosx"
+
+; CHECK: mm2
+define i32 @mm2(i32* nocapture %p, i32 %n) nounwind uwtable readonly ssp {
+entry:
+  br label %do.body
+
+; CHECK: do.body
+; Loop body has no branches before the backedge.
+; CHECK-NOT: LBB
+do.body:
+  %max.0 = phi i32 [ 0, %entry ], [ %max.1, %do.cond ]
+  %min.0 = phi i32 [ 0, %entry ], [ %min.1, %do.cond ]
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %do.cond ]
+  %p.addr.0 = phi i32* [ %p, %entry ], [ %incdec.ptr, %do.cond ]
+  %incdec.ptr = getelementptr inbounds i32* %p.addr.0, i64 1
+  %0 = load i32* %p.addr.0, align 4
+  %cmp = icmp sgt i32 %0, %max.0
+  br i1 %cmp, label %do.cond, label %if.else
+
+if.else:
+  %cmp1 = icmp slt i32 %0, %min.0
+  %.min.0 = select i1 %cmp1, i32 %0, i32 %min.0
+  br label %do.cond
+
+do.cond:
+  %max.1 = phi i32 [ %0, %do.body ], [ %max.0, %if.else ]
+  %min.1 = phi i32 [ %min.0, %do.body ], [ %.min.0, %if.else ]
+; CHECK: cbnz
+  %dec = add i32 %n.addr.0, -1
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %do.end, label %do.body
+
+do.end:
+  %sub = sub nsw i32 %max.1, %min.1
+  ret i32 %sub
+}
+
+; CHECK-LABEL: fold_inc_true_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinc w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @fold_inc_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i32 %c, 1
+  %inc = add nsw i32 %x, 1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %y, %eq_bb ], [ %inc, %entry ]
+  ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inc_true_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinc x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @fold_inc_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i64 %c, 1
+  %inc = add nsw i64 %x, 1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %y, %eq_bb ], [ %inc, %entry ]
+  ret i64 %cond
+}
+
+; CHECK-LABEL: fold_inc_false_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinc w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @fold_inc_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i32 %c, 1
+  %inc = add nsw i32 %x, 1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %inc, %eq_bb ], [ %y, %entry ]
+  ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inc_false_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinc x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @fold_inc_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i64 %c, 1
+  %inc = add nsw i64 %x, 1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %inc, %eq_bb ], [ %y, %entry ]
+  ret i64 %cond
+}
+
+; CHECK-LABEL: fold_inv_true_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinv w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @fold_inv_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i32 %c, 1
+  %inv = xor i32 %x, -1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %y, %eq_bb ], [ %inv, %entry ]
+  ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inv_true_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinv x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @fold_inv_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i64 %c, 1
+  %inv = xor i64 %x, -1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %y, %eq_bb ], [ %inv, %entry ]
+  ret i64 %cond
+}
+
+; CHECK-LABEL: fold_inv_false_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinv w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @fold_inv_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i32 %c, 1
+  %inv = xor i32 %x, -1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %inv, %eq_bb ], [ %y, %entry ]
+  ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inv_false_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinv x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @fold_inv_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i64 %c, 1
+  %inv = xor i64 %x, -1
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %inv, %eq_bb ], [ %y, %entry ]
+  ret i64 %cond
+}
+
+; CHECK-LABEL: fold_neg_true_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csneg w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @fold_neg_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i32 %c, 1
+  %neg = sub nsw i32 0, %x
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %y, %eq_bb ], [ %neg, %entry ]
+  ret i32 %cond
+}
+
+; CHECK-LABEL: fold_neg_true_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csneg x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @fold_neg_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i64 %c, 1
+  %neg = sub nsw i64 0, %x
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %y, %eq_bb ], [ %neg, %entry ]
+  ret i64 %cond
+}
+
+; CHECK-LABEL: fold_neg_false_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csneg w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @fold_neg_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i32 %c, 1
+  %neg = sub nsw i32 0, %x
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %neg, %eq_bb ], [ %y, %entry ]
+  ret i32 %cond
+}
+
+; CHECK-LABEL: fold_neg_false_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csneg x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @fold_neg_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i64 %c, 1
+  %neg = sub nsw i64 0, %x
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %neg, %eq_bb ], [ %y, %entry ]
+  ret i64 %cond
+}
+
+; CHECK: cbnz_32
+; CHECK: {{subs.*wzr,|cmp}} w2, #0
+; CHECK-NEXT: csel w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @cbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i32 %c, 0
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ]
+  ret i32 %cond
+}
+
+; CHECK: cbnz_64
+; CHECK: {{subs.*xzr,|cmp}} x2, #0
+; CHECK-NEXT: csel x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @cbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp eq i64 %c, 0
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ]
+  ret i64 %cond
+}
+
+; CHECK: cbz_32
+; CHECK: {{subs.*wzr,|cmp}} w2, #0
+; CHECK-NEXT: csel w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @cbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %tobool = icmp ne i32 %c, 0
+  br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ]
+  ret i32 %cond
+}
+
+; CHECK: cbz_64
+; CHECK: {{subs.*xzr,|cmp}} x2, #0
+; CHECK-NEXT: csel x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @cbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %tobool = icmp ne i64 %c, 0
+  br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ]
+  ret i64 %cond
+}
+
+; CHECK: tbnz_32
+; CHECK: {{ands.*xzr,|tst}} w2, #0x80
+; CHECK-NEXT: csel w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @tbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %mask = and i32 %c, 128
+  %tobool = icmp eq i32 %mask, 0
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ]
+  ret i32 %cond
+}
+
+; CHECK: tbnz_64
+; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000
+; CHECK-NEXT: csel x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @tbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %mask = and i64 %c, 9223372036854775808
+  %tobool = icmp eq i64 %mask, 0
+  br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ]
+  ret i64 %cond
+}
+
+; CHECK: tbz_32
+; CHECK: {{ands.*xzr,|tst}} w2, #0x80
+; CHECK-NEXT: csel w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @tbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+  %mask = and i32 %c, 128
+  %tobool = icmp ne i32 %mask, 0
+  br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+  br label %done
+
+done:
+  %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ]
+  ret i32 %cond
+}
+
+; CHECK: tbz_64
+; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000
+; CHECK-NEXT: csel x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @tbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+  %mask = and i64 %c, 9223372036854775808
+  %tobool = icmp ne i64 %mask, 0
+  br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+  br label %done
+
+done:
+  %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ]
+  ret i64 %cond
+}
+
+; This function from 175.vpr folds an ADDWri into a CSINC.
+; Remember to clear the kill flag on the ADDWri.
+define i32 @get_ytrack_to_xtracks() nounwind ssp {
+entry:
+  br label %for.body
+
+for.body:
+  %x0 = load i32* undef, align 4
+  br i1 undef, label %if.then.i146, label %is_sbox.exit155
+
+if.then.i146:
+  %add8.i143 = add nsw i32 0, %x0
+  %rem.i144 = srem i32 %add8.i143, %x0
+  %add9.i145 = add i32 %rem.i144, 1
+  br label %is_sbox.exit155
+
+is_sbox.exit155:                                  ; preds = %if.then.i146, %for.body
+  %seg_offset.0.i151 = phi i32 [ %add9.i145, %if.then.i146 ], [ undef, %for.body ]
+  %idxprom15.i152 = sext i32 %seg_offset.0.i151 to i64
+  %arrayidx18.i154 = getelementptr inbounds i32* null, i64 %idxprom15.i152
+  %x1 = load i32* %arrayidx18.i154, align 4
+  br i1 undef, label %for.body51, label %for.body
+
+for.body51:                                       ; preds = %is_sbox.exit155
+  call fastcc void @get_switch_type(i32 %x1, i32 undef, i16 signext undef, i16 signext undef, i16* undef)
+  unreachable
+}
+declare fastcc void @get_switch_type(i32, i32, i16 signext, i16 signext, i16* nocapture) nounwind ssp
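The fold_* tests above all share one CFG shape: a triangle whose phi chooses between a plain value and a simple unary transform (increment, invert, negate) of another. Early if-conversion collapses the triangle into a select, and the select then maps onto AArch64's conditional-select family: csel picks between registers, while csinc, csinv and csneg fold the +1, NOT and negation into the pick. A minimal sketch (invented name, not from the patch) of the select form behind fold_inc_true_32:

; Sketch only: after if-conversion this is "cmp w2, #1" followed by
; "csinc w0, w1, w0, eq", i.e. w0 = (c == 1) ? y : x + 1.
define i32 @csinc_shape(i32 %x, i32 %y, i32 %c) {
  %tobool = icmp eq i32 %c, 1
  %inc = add nsw i32 %x, 1
  %cond = select i1 %tobool, i32 %y, i32 %inc
  ret i32 %cond
}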
diff --git a/test/CodeGen/AArch64/arm64-elf-calls.ll b/test/CodeGen/AArch64/arm64-elf-calls.ll
new file mode 100644
index 0000000..8c40203
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-elf-calls.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -filetype=obj -o - %s | llvm-objdump -triple=arm64-linux-gnu - -r | FileCheck %s --check-prefix=CHECK-OBJ
+
+declare void @callee()
+
+define void @caller() {
+  call void @callee()
+  ret void
+; CHECK-LABEL: caller:
+; CHECK: bl callee
+; CHECK-OBJ: R_AARCH64_CALL26 callee
+}
+
+define void @tail_caller() {
+  tail call void @callee()
+  ret void
+; CHECK-LABEL: tail_caller:
+; CHECK: b callee
+; CHECK-OBJ: R_AARCH64_JUMP26 callee
+}
diff --git a/test/CodeGen/AArch64/arm64-elf-constpool.ll b/test/CodeGen/AArch64/arm64-elf-constpool.ll
new file mode 100644
index 0000000..95d3343
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-elf-constpool.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -O0 -o - %s | FileCheck %s
+
+; O0 checked for fastisel purposes. It has a separate path which
+; creates a constpool entry for floating values.
+
+define double @needs_const() {
+  ret double 3.14159
+; CHECK: .LCPI0_0:
+
+; CHECK: adrp {{x[0-9]+}}, .LCPI0_0
+; CHECK: ldr d0, [{{x[0-9]+}}, :lo12:.LCPI0_0]
+}
diff --git a/test/CodeGen/AArch64/arm64-elf-globals.ll b/test/CodeGen/AArch64/arm64-elf-globals.ll
new file mode 100644
index 0000000..4ed44e7
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-elf-globals.ll
@@ -0,0 +1,115 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s -mcpu=cyclone | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST
+; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-PIC
+; RUN: llc -mtriple=arm64-linux-gnu -O0 -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST-PIC
+
+@var8 = external global i8, align 1
+@var16 = external global i16, align 2
+@var32 = external global i32, align 4
+@var64 = external global i64, align 8
+
+define i8 @test_i8(i8 %new) {
+  %val = load i8* @var8, align 1
+  store i8 %new, i8* @var8
+  ret i8 %val
+; CHECK-LABEL: test_i8:
+; CHECK: adrp x[[HIREG:[0-9]+]], var8
+; CHECK: ldrb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8]
+; CHECK: strb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8]
+
+; CHECK-PIC-LABEL: test_i8:
+; CHECK-PIC: adrp x[[HIREG:[0-9]+]], :got:var8
+; CHECK-PIC: ldr x[[VAR_ADDR:[0-9]+]], [x[[HIREG]], :got_lo12:var8]
+; CHECK-PIC: ldrb {{w[0-9]+}}, [x[[VAR_ADDR]]]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var8
+; CHECK-FAST: ldrb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8]
+
+; CHECK-FAST-PIC: adrp x[[HIREG:[0-9]+]], :got:var8
+; CHECK-FAST-PIC: ldr x[[VARADDR:[0-9]+]], [x[[HIREG]], :got_lo12:var8]
+; CHECK-FAST-PIC: ldr {{w[0-9]+}}, [x[[VARADDR]]]
+}
+
+define i16 @test_i16(i16 %new) {
+  %val = load i16* @var16, align 2
+  store i16 %new, i16* @var16
+  ret i16 %val
+; CHECK-LABEL: test_i16:
+; CHECK: adrp x[[HIREG:[0-9]+]], var16
+; CHECK: ldrh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16]
+; CHECK: strh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var16
+; CHECK-FAST: ldrh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16]
+}
+
+define i32 @test_i32(i32 %new) {
+  %val = load i32* @var32, align 4
+  store i32 %new, i32* @var32
+  ret i32 %val
+; CHECK-LABEL: test_i32:
+; CHECK: adrp x[[HIREG:[0-9]+]], var32
+; CHECK: ldr {{w[0-9]+}}, [x[[HIREG]], :lo12:var32]
+; CHECK: str {{w[0-9]+}}, [x[[HIREG]], :lo12:var32]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var32
+; CHECK-FAST: add {{x[0-9]+}}, x[[HIREG]], :lo12:var32
+}
+
+define i64 @test_i64(i64 %new) {
+  %val = load i64* @var64, align 8
+  store i64 %new, i64* @var64
+  ret i64 %val
+; CHECK-LABEL: test_i64:
+; CHECK: adrp x[[HIREG:[0-9]+]], var64
+; CHECK: ldr {{x[0-9]+}}, [x[[HIREG]], :lo12:var64]
+; CHECK: str {{x[0-9]+}}, [x[[HIREG]], :lo12:var64]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var64
+; CHECK-FAST: add {{x[0-9]+}}, x[[HIREG]], :lo12:var64
+}
+
+define i64* @test_addr() {
+  ret i64* @var64
+; CHECK-LABEL: test_addr:
+; CHECK: adrp [[HIREG:x[0-9]+]], var64
+; CHECK: add x0, [[HIREG]], :lo12:var64
+
+; CHECK-FAST: adrp [[HIREG:x[0-9]+]], var64
+; CHECK-FAST: add x0, [[HIREG]], :lo12:var64
+}
+
+@hiddenvar = hidden global i32 0, align 4
+@protectedvar = protected global i32 0, align 4
+
+define i32 @test_vis() {
+  %lhs = load i32* @hiddenvar, align 4
+  %rhs = load i32* @protectedvar, align 4
+  %ret = add i32 %lhs, %rhs
+  ret i32 %ret
+; CHECK-PIC: adrp {{x[0-9]+}}, hiddenvar
+; CHECK-PIC: ldr {{w[0-9]+}}, [{{x[0-9]+}}, :lo12:hiddenvar]
+; CHECK-PIC: adrp {{x[0-9]+}}, protectedvar
+; CHECK-PIC: ldr {{w[0-9]+}}, [{{x[0-9]+}}, :lo12:protectedvar]
+}
+
+@var_default = external global [2 x i32]
+
+define i32 @test_default_align() {
+  %addr = getelementptr [2 x i32]* @var_default, i32 0, i32 0
+  %val = load i32* %addr
+  ret i32 %val
+; CHECK-LABEL: test_default_align:
+; CHECK: adrp x[[HIREG:[0-9]+]], var_default
+; CHECK: ldr w0, [x[[HIREG]], :lo12:var_default]
+}
+
+define i64 @test_default_unaligned() {
+  %addr = bitcast [2 x i32]* @var_default to i64*
+  %val = load i64* %addr
+  ret i64 %val
+; CHECK-LABEL: test_default_unaligned:
+; CHECK: adrp [[HIREG:x[0-9]+]], var_default
+; CHECK: add x[[ADDR:[0-9]+]], [[HIREG]], :lo12:var_default
+; CHECK: ldr x0, [x[[ADDR]]]
+}
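The four RUN configurations above exercise the two ELF addressing sequences: when the symbol cannot be preempted, a direct adrp plus :lo12: offset; under -relocation-model=pic, an extra load through the GOT using :got:/:got_lo12:. A rough sketch of both lowerings for one external global, not from the patch (register choices illustrative):

@var32 = external global i32, align 4

; Sketch only. Non-PIC:                   PIC:
;   adrp x8, var32                          adrp x8, :got:var32
;   ldr  w0, [x8, :lo12:var32]              ldr  x8, [x8, :got_lo12:var32]
;                                           ldr  w0, [x8]
define i32 @read_var32() {
  %v = load i32* @var32, align 4
  ret i32 %v
}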
diff --git a/test/CodeGen/AArch64/arm64-ext.ll b/test/CodeGen/AArch64/arm64-ext.ll
new file mode 100644
index 0000000..67860de
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ext.ll
@@ -0,0 +1,118 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @test_vextd(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextd:
+;CHECK: {{ext.8b.*#3}}
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+  ret <8 x i8> %tmp3
+}
+
+define <8 x i8> @test_vextRd(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextRd:
+;CHECK: {{ext.8b.*#5}}
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4>
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @test_vextq(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextq:
+;CHECK: {{ext.16b.*3}}
+  %tmp1 = load <16 x i8>* %A
+  %tmp2 = load <16 x i8>* %B
+  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+  ret <16 x i8> %tmp3
+}
+
+define <16 x i8> @test_vextRq(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextRq:
+;CHECK: {{ext.16b.*7}}
+  %tmp1 = load <16 x i8>* %A
+  %tmp2 = load <16 x i8>* %B
+  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
+  ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @test_vextd16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: test_vextd16:
+;CHECK: {{ext.8b.*#6}}
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x i16> %tmp3
+}
+
+define <4 x i32> @test_vextq32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: test_vextq32:
+;CHECK: {{ext.16b.*12}}
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i32>* %B
+  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x i32> %tmp3
+}
+
+; Undef shuffle indices should not prevent matching to VEXT:
+
+define <8 x i8> @test_vextd_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextd_undef:
+;CHECK: {{ext.8b.*}}
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10>
+  ret <8 x i8> %tmp3
+}
+
+define <8 x i8> @test_vextd_undef2(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextd_undef2:
+;CHECK: {{ext.8b.*#6}}
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 5>
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextRq_undef:
+;CHECK: {{ext.16b.*#7}}
+  %tmp1 = load <16 x i8>* %A
+  %tmp2 = load <16 x i8>* %B
+  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 23, i32 24, i32 25, i32 26, i32 undef, i32 undef, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
+  ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @test_vextRq_undef2(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vextRq_undef2:
+;CHECK: {{ext.16b.*#10}}
+  %tmp1 = load <8 x i16>* %A
+  %vext = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 4>
+  ret <8 x i16> %vext;
+}
+
+; Tests for ReconstructShuffle function. Indices have to be carefully
+; chosen to reach lowering phase as a BUILD_VECTOR.
+
+; One vector needs vext, the other can be handled by extract_subvector
+; Also checks interleaving of sources is handled correctly.
+; Essence: a vext is used on %A and something saner than stack load/store for final result.
+define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: test_interleaved:
+;CHECK: ext.8b
+;CHECK: zip1.4h
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i16>* %B
+  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> <i32 3, i32 8, i32 5, i32 9>
+  ret <4 x i16> %tmp3
+}
+
+; An undef in the shuffle list should still be optimizable
+define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: test_undef:
+;CHECK: zip1.4h
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i16>* %B
+  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> <i32 undef, i32 8, i32 5, i32 9>
+  ret <4 x i16> %tmp3
+}
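Every mask in the file above encodes the same operation: EXT concatenates its two sources and extracts a contiguous window, so a shufflevector with consecutive indices (undefs permitted) starting at element n lowers to an EXT whose byte immediate is n times the element size, which is why test_vextd16's element offset 3 shows up as an ext.8b with #6. A standalone sketch, invented name, not from the patch:

; Sketch only: mask indices 2..9 select bytes 2..9 of the 16-byte
; concatenation A:B, i.e. "ext.8b v0, v0, v1, #2".
define <8 x i8> @ext_by_two(<8 x i8> %A, <8 x i8> %B) {
  %r = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  ret <8 x i8> %r
}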
diff --git a/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll b/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll
new file mode 100644
index 0000000..048fdb0
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <4 x float> @foo(<4 x i16> %a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: ushll.4s v0, v0, #0
+; CHECK-NEXT: ucvtf.4s v0, v0
+; CHECK-NEXT: ret
+  %vcvt.i = uitofp <4 x i16> %a to <4 x float>
+  ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @bar(<4 x i16> %a) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: sshll.4s v0, v0, #0
+; CHECK-NEXT: scvtf.4s v0, v0
+; CHECK-NEXT: ret
+  %vcvt.i = sitofp <4 x i16> %a to <4 x float>
+  ret <4 x float> %vcvt.i
+}
diff --git a/test/CodeGen/AArch64/arm64-extend.ll b/test/CodeGen/AArch64/arm64-extend.ll
new file mode 100644
index 0000000..afcaca2
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extend.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+@array = external global [0 x i32]
+
+define i64 @foo(i32 %i) {
+; CHECK: foo
+; CHECK: adrp x[[REG:[0-9]+]], _array@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _array@GOTPAGEOFF]
+; CHECK: ldrsw x0, [x[[REG1]], w0, sxtw #2]
+; CHECK: ret
+  %idxprom = sext i32 %i to i64
+  %arrayidx = getelementptr inbounds [0 x i32]* @array, i64 0, i64 %idxprom
+  %tmp1 = load i32* %arrayidx, align 4
+  %conv = sext i32 %tmp1 to i64
+  ret i64 %conv
+}
diff --git a/test/CodeGen/AArch64/arm64-extern-weak.ll b/test/CodeGen/AArch64/arm64-extern-weak.ll
new file mode 100644
index 0000000..a239403
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extern-weak.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -o - < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s
+
+declare extern_weak i32 @var()
+
+define i32()* @foo() {
+; The usual ADRP/ADD pair can't be used for a weak reference because it must
+; evaluate to 0 if the symbol is undefined. We use a litpool entry.
+  ret i32()* @var
+
+; CHECK: adrp x[[VAR:[0-9]+]], :got:var
+; CHECK: ldr x0, [x[[VAR]], :got_lo12:var]
+
+  ; In the large model, the usual relocations are absolute and can
+  ; materialise 0.
+; CHECK-LARGE: movz x0, #:abs_g3:var
+; CHECK-LARGE: movk x0, #:abs_g2_nc:var
+; CHECK-LARGE: movk x0, #:abs_g1_nc:var
+; CHECK-LARGE: movk x0, #:abs_g0_nc:var
+}
+
+
+@arr_var = extern_weak global [10 x i32]
+
+define i32* @bar() {
+  %addr = getelementptr [10 x i32]* @arr_var, i32 0, i32 5
+; CHECK: adrp x[[ARR_VAR_HI:[0-9]+]], :got:arr_var
+; CHECK: ldr [[ARR_VAR:x[0-9]+]], [x[[ARR_VAR_HI]], :got_lo12:arr_var]
+; CHECK: add x0, [[ARR_VAR]], #20
+  ret i32* %addr
+
+  ; In the large model, the usual relocations are absolute and can
+  ; materialise 0.
+; CHECK-LARGE: movz [[ARR_VAR:x[0-9]+]], #:abs_g3:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g2_nc:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g1_nc:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g0_nc:arr_var
+}
+
+@defined_weak_var = internal unnamed_addr global i32 0
+
+define i32* @wibble() {
+  ret i32* @defined_weak_var
+; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
+; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
+
+; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g0_nc:defined_weak_var
+}
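The comment in @foo above is the key invariant: adrp/add forms a PC-relative address and can never produce 0, yet an extern_weak symbol must compare equal to null when it stays undefined at link time. Hence the small code model loads the address (possibly 0) from the GOT, and the large model builds it from absolute movz/movk relocations. A sketch with an invented symbol, not from the patch:

@maybe_missing = extern_weak global i32

; Sketch only. Expected small-model lowering:
;   adrp x8, :got:maybe_missing
;   ldr  x0, [x8, :got_lo12:maybe_missing]
define i32* @addr_or_null() {
  ret i32* @maybe_missing
}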
diff --git a/test/CodeGen/AArch64/arm64-extload-knownzero.ll b/test/CodeGen/AArch64/arm64-extload-knownzero.ll
new file mode 100644
index 0000000..14e5fd3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extload-knownzero.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+; rdar://12771555
+
+define void @foo(i16* %ptr, i32 %a) nounwind {
+entry:
+; CHECK-LABEL: foo:
+  %tmp1 = icmp ult i32 %a, 100
+  br i1 %tmp1, label %bb1, label %bb2
+bb1:
+; CHECK: %bb1
+; CHECK: ldrh [[REG:w[0-9]+]]
+  %tmp2 = load i16* %ptr, align 2
+  br label %bb2
+bb2:
+; CHECK: %bb2
+; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
+; CHECK: cmp [[REG]], #23
+  %tmp3 = phi i16 [ 0, %entry ], [ %tmp2, %bb1 ]
+  %cmp = icmp ult i16 %tmp3, 24
+  br i1 %cmp, label %bb3, label %exit
+bb3:
+  call void @bar() nounwind
+  br label %exit
+exit:
+  ret void
+}
+
+declare void @bar ()
diff --git a/test/CodeGen/AArch64/arm64-extract.ll b/test/CodeGen/AArch64/arm64-extract.ll
new file mode 100644
index 0000000..0198466
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extract.ll
@@ -0,0 +1,58 @@
+; RUN: llc -aarch64-extr-generation=true -verify-machineinstrs < %s \
+; RUN: -march=arm64 | FileCheck %s
+
+define i64 @ror_i64(i64 %in) {
+; CHECK-LABEL: ror_i64:
+  %left = shl i64 %in, 19
+  %right = lshr i64 %in, 45
+  %val5 = or i64 %left, %right
+; CHECK: ror {{x[0-9]+}}, x0, #45
+  ret i64 %val5
+}
+
+define i32 @ror_i32(i32 %in) {
+; CHECK-LABEL: ror_i32:
+  %left = shl i32 %in, 9
+  %right = lshr i32 %in, 23
+  %val5 = or i32 %left, %right
+; CHECK: ror {{w[0-9]+}}, w0, #23
+  ret i32 %val5
+}
+
+define i32 @extr_i32(i32 %lhs, i32 %rhs) {
+; CHECK-LABEL: extr_i32:
+  %left = shl i32 %lhs, 6
+  %right = lshr i32 %rhs, 26
+  %val = or i32 %left, %right
+  ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+  ; something other than w0 and w1.
+; CHECK: extr {{w[0-9]+}}, w0, w1, #26
+
+  ret i32 %val
+}
+
+define i64 @extr_i64(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: extr_i64:
+  %right = lshr i64 %rhs, 40
+  %left = shl i64 %lhs, 24
+  %val = or i64 %right, %left
+  ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+  ; something other than w0 and w1.
+; CHECK: extr {{x[0-9]+}}, x0, x1, #40
+
+  ret i64 %val
+}
+
+; Regression test: a bad experimental pattern crept into git which optimised
+; this pattern to a single EXTR.
+define i32 @extr_regress(i32 %a, i32 %b) {
+; CHECK-LABEL: extr_regress:
+
+  %sh1 = shl i32 %a, 14
+  %sh2 = lshr i32 %b, 14
+  %val = or i32 %sh2, %sh1
+; CHECK-NOT: extr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, #{{[0-9]+}}
+
+  ret i32 %val
+; CHECK: ret
+}
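The ROR/EXTR tests above all match the funnel-shift idiom: or(shl(a, W - n), lshr(b, n)) takes the W-bit window starting n bits into the concatenation a:b, which is precisely EXTR, and it collapses to the ROR alias when a and b are the same value. extr_regress guards the converse: shift amounts that do not sum to the width must not be fused. A sketch, invented name, not from the patch:

; Sketch only: 16 + 48 = 64, so this is expected to lower to a single
; "extr x0, x0, x1, #48".
define i64 @funnel48(i64 %a, i64 %b) {
  %left = shl i64 %a, 16
  %right = lshr i64 %b, 48
  %val = or i64 %left, %right
  ret i64 %val
}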
diff --git a/test/CodeGen/AArch64/arm64-extract_subvector.ll b/test/CodeGen/AArch64/arm64-extract_subvector.ll
new file mode 100644
index 0000000..8b15a64
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extract_subvector.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
+
+; Extract of an upper half of a vector is an "ext.16b v0, v0, v0, #8" insn.
+
+define <8 x i8> @v8i8(<16 x i8> %a) nounwind {
+; CHECK: v8i8
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+  %ret = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i8> %ret
+}
+
+define <4 x i16> @v4i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: v4i16:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+  %ret = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x i16> %ret
+}
+
+define <2 x i32> @v2i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: v2i32:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+  %ret = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+  ret <2 x i32> %ret
+}
+
+define <1 x i64> @v1i64(<2 x i64> %a) nounwind {
+; CHECK-LABEL: v1i64:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+  %ret = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+  ret <1 x i64> %ret
+}
+
+define <2 x float> @v2f32(<4 x float> %a) nounwind {
+; CHECK-LABEL: v2f32:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+  %ret = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
+  ret <2 x float> %ret
+}
+
+define <1 x double> @v1f64(<2 x double> %a) nounwind {
+; CHECK-LABEL: v1f64:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+  %ret = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> <i32 1>
+  ret <1 x double> %ret
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
new file mode 100644
index 0000000..ebd847e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+@sortlist = common global [5001 x i32] zeroinitializer, align 16
+@sortlist2 = common global [5001 x i64] zeroinitializer, align 16
+
+; Load an address with an offset larger than LDR imm can handle
+define i32 @foo() nounwind {
+entry:
+; CHECK: @foo
+; CHECK: adrp x[[REG:[0-9]+]], _sortlist@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist@GOTPAGEOFF]
+; CHECK: movz x[[REG2:[0-9]+]], #0x4e20
+; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
+; CHECK: ldr w0, [x[[REG3]]]
+; CHECK: ret
+  %0 = load i32* getelementptr inbounds ([5001 x i32]* @sortlist, i32 0, i64 5000), align 4
+  ret i32 %0
+}
+
+define i64 @foo2() nounwind {
+entry:
+; CHECK: @foo2
+; CHECK: adrp x[[REG:[0-9]+]], _sortlist2@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist2@GOTPAGEOFF]
+; CHECK: movz x[[REG2:[0-9]+]], #0x9c40
+; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
+; CHECK: ldr x0, [x[[REG3]]]
+; CHECK: ret
+  %0 = load i64* getelementptr inbounds ([5001 x i64]* @sortlist2, i32 0, i64 5000), align 4
+  ret i64 %0
+}
+
+; Load an address with a ridiculously large offset.
+; rdar://12505553
+@pd2 = common global i8* null, align 8
+
+define signext i8 @foo3() nounwind ssp {
+entry:
+; CHECK: @foo3
+; CHECK: movz x[[REG:[0-9]+]], #0xb3a, lsl #32
+; CHECK: movk x[[REG]], #0x73ce, lsl #16
+; CHECK: movk x[[REG]], #0x2ff2
+  %0 = load i8** @pd2, align 8
+  %arrayidx = getelementptr inbounds i8* %0, i64 12345678901234
+  %1 = load i8* %arrayidx, align 1
+  ret i8 %1
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
new file mode 100644
index 0000000..1706e9e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
@@ -0,0 +1,25 @@
+; This test should cause the TargetMaterializeAlloca to be invoked
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+%struct.S1Ty = type { i64 }
+%struct.S2Ty = type { %struct.S1Ty, %struct.S1Ty }
+
+define void @takeS1(%struct.S1Ty* %V) nounwind {
+entry:
+  %V.addr = alloca %struct.S1Ty*, align 8
+  store %struct.S1Ty* %V, %struct.S1Ty** %V.addr, align 8
+  ret void
+}
+
+define void @main() nounwind {
+entry:
+; CHECK: main
+; CHECK: mov x29, sp
+; CHECK: mov x[[REG:[0-9]+]], sp
+; CHECK-NEXT: orr x[[REG1:[0-9]+]], xzr, #0x8
+; CHECK-NEXT: add x0, x[[REG]], x[[REG1]]
+  %E = alloca %struct.S2Ty, align 4
+  %B = getelementptr inbounds %struct.S2Ty* %E, i32 0, i32 1
+  call void @takeS1(%struct.S1Ty* %B)
+  ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
new file mode 100644
index 0000000..37a8295
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
@@ -0,0 +1,155 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone | FileCheck %s
+
+define void @branch1() nounwind uwtable ssp {
+  %x = alloca i32, align 4
+  store i32 0, i32* %x, align 4
+  %1 = load i32* %x, align 4
+  %2 = icmp ne i32 %1, 0
+  br i1 %2, label %3, label %4
+
+;